Forked from
xw0078 / CS-6604-WebArchive
11 commits behind the upstream repository.
-
Ritesh Bansal authoredRitesh Bansal authored
Trie.py 2.99 KiB
class TrieNode:
def __init__(self):
self.children = {}
self.data = {}
self.isEndOfUrl = False
def extract(self, startTimestamp , endTimeStamp):
pCrawl = self
pCrawlCopy = TrieNode()
pCrawlCopy.isEndOfUrl = pCrawl.isEndOfUrl
for data in pCrawl.data:
if data <= endTimeStamp and data >= startTimestamp:
pCrawlCopy.data[data] = pCrawl.data[data]
for child in pCrawl.children:
pCrawlJunior = pCrawl.children[child]
pCrawlCopy.children[child] = pCrawlJunior.extract(startTimestamp, endTimeStamp)
return pCrawlCopy
class Trie:
def __init__(self):
self.root = self.getNode()
def getNode(self):
# Returns new trie node (initialized to NULLs)
return TrieNode()
def insert(self, url, timestamp, payload):
urlSplit = url.split('/')
pCrawl = self.root
# for level in urlSplit:
for i in range(1, len(urlSplit)):
# if current character is not present
level = urlSplit[i]
if len(level) == 0: continue
if pCrawl.children.__contains__(level):
pCrawl = pCrawl.children[level];
else:
pCrawl.children[level] = TrieNode()
pCrawl = pCrawl.children[level]
pCrawl.data[timestamp] = payload;
pCrawl.isEndOfUrl = True
def extract(self, startTimestamp , endTimeStamp):
# extract tree based on given timestamp
return self.root.extract(startTimestamp, endTimeStamp)
def comparison(self, tree1):
# compare two trees
from collections import deque
stack_tree2 = deque()
stack_tree1 = deque()
stack_tree2.append(self.root)
stack_tree1.append(tree1)
while (len(stack_tree2) != 0):
tree2 = stack_tree2.pop()
tree = stack_tree1.pop()
for data in tree2.data:
if tree.data.__contains__(data):
if not tree.data[data] == tree2.data[data]:
return False
else:
return False
for child in tree2.children:
if tree.children.__contains__(child):
if not stack_tree2.__contains__(child):
stack_tree2.append(tree2.children[child])
stack_tree1.append(tree.children[child])
else:
return False
if (len(stack_tree1) != 0):
return False
return True
def main():
keys = ['/spotlight/impact/2014-11-24-master/naturalists.html', '/']
# Trie object
t = Trie()
# Construct trie
for key in keys:
t.insert(key)
# Search for different keys
print("{} ---- {}".format("/spotlight/impact/2014-11-24-master/naturalists.html", [t.search("/spotlight/impact/2014-11-24-master/naturalists.html")]))
print("{} ---- {}".format("/", [t.search("/")]))
if __name__ == '__main__':
main()