Here is my humble implementation of a trie in Python, just something to get me started.
Any comments on improving the code are welcome
import re, types
from collections import defaultdict
def tokenize(s, removePunctuation = True):
if removePunctuation:
p = re.compile('[\.,!?()\[\]{}:;"\'<>/ \n\r\t]')
return [el for el in p.split(s) if el]
else:
t = []
s = re.split('\s', s)
p = re.compile('(\W)')
for phrase in s:
words = p.split(phrase)
for word in words:
t.append(word)
return [el for el in t if el]
class Node(object):
def __init__(self):
self.freq = 0
self.next = defaultdict(int)
class Trie(object):
def __init__(self, inFile):
self.nodes = []
self.nodes.append(Node())
self.n = 1
self.numberOfWords = 0
for line in file(inFile):
words = tokenize(line)
for w in words:
currNode = 0
for char in w:
if self.nodes[currNode].next[char] == 0:
self.nodes.append(Node())
self.nodes[currNode].next[char] = self.n
currNode = self.n
self.n += 1
else:
currNode = self.nodes[currNode].next[char]
self.nodes[currNode].freq += 1
self.numberOfWords = len([node for node in self.nodes if node.freq != 0])
def __getitem__(self, word):
"""Return the frequency of the given word."""
if isinstance(word, types.StringTypes):
currNode = 0
for char in word:
if self.nodes[currNode].next[char] == 0:
raise AttributeError("No such word: %s" % word)
else:
currNode = self.nodes[currNode].next[char]
return self.nodes[currNode].freq
else:
raise TypeError()
def __len__(self):
"""Return the number of nodes in the trie."""
return self.n
2 comments:
Awesome! Works great. I like the way you store the nodes in a single array. I would naively have constructed much messier nested structures. Thanks for the post!
Post a Comment