Tuesday, November 20, 2007

Trie in Python

Here is my humble implementation of a trie in Python, just something to get me started.
Any comments on improving the code are welcome

import re, types
from collections import defaultdict

def tokenize(s, removePunctuation = True):
if removePunctuation:
p = re.compile('[\.,!?()\[\]{}:;"\'<>/ \n\r\t]')
return [el for el in p.split(s) if el]
t = []
s = re.split('\s', s)
p = re.compile('(\W)')
for phrase in s:
words = p.split(phrase)
for word in words:
return [el for el in t if el]

class Node(object):

def __init__(self):
self.freq = 0
self.next = defaultdict(int)

class Trie(object):

def __init__(self, inFile):

self.nodes = []
self.n = 1
self.numberOfWords = 0

for line in file(inFile):
words = tokenize(line)
for w in words:
currNode = 0
for char in w:
if self.nodes[currNode].next[char] == 0:
self.nodes[currNode].next[char] = self.n
currNode = self.n
self.n += 1
currNode = self.nodes[currNode].next[char]
self.nodes[currNode].freq += 1
self.numberOfWords = len([node for node in self.nodes if node.freq != 0])

def __getitem__(self, word):
"""Return the frequency of the given word."""

if isinstance(word, types.StringTypes):
currNode = 0
for char in word:
if self.nodes[currNode].next[char] == 0:
raise AttributeError("No such word: %s" % word)
currNode = self.nodes[currNode].next[char]
return self.nodes[currNode].freq
raise TypeError()

def __len__(self):
"""Return the number of nodes in the trie."""

return self.n


mark said...

Awesome! Works great. I like the way you store the nodes in a single array. I would naively have constructed much messier nested structures. Thanks for the post!

Manoj Venkat said...
This comment has been removed by the author.