diff --git a/Trie.py b/Trie.py
index 79d10aab824c0b92051761ee8e68d8f59aeec660..bd0b65927806f46414b18ae4fc362aa0714e37c9 100644
--- a/Trie.py
+++ b/Trie.py
@@ -2,6 +2,7 @@ class TrieNode:
     def __init__(self):
         self.children = {}
         self.data = {}
+        self.name = ''
         self.isEndOfUrl = False
 
     def extract(self, startTimestamp , endTimeStamp):
@@ -17,33 +18,49 @@ class TrieNode:
         return pCrawlCopy
 
 class Trie:
-
     def __init__(self):
         self.root = self.getNode()
+        self.matrixElements = {}
+        self.counter = 0
 
     def getNode(self):
        # Returns new trie node (initialized to NULLs)
        return TrieNode()
 
     def insert(self, url, timestamp, payload):
+        newNodePath = ''
         urlSplit = url.split('/')
         pCrawl = self.root
+        isnewpath = False
         # for level in urlSplit:
         for i in range(1, len(urlSplit)):
             # if current character is not present
             level = urlSplit[i]
             if len(level) == 0:
                 continue
+            if not self.matrixElements.__contains__(level):
+                self.counter = self.counter + 1
+                self.matrixElements[level] = self.counter
             if pCrawl.children.__contains__(level):
                 pCrawl = pCrawl.children[level];
             else:
+                newNodePath = newNodePath + level + '/'
                 pCrawl.children[level] = TrieNode()
                 pCrawl = pCrawl.children[level]
+                pCrawl.name = level
         pCrawl.data[timestamp] = payload;
         pCrawl.isEndOfUrl = True
+        if newNodePath != '':
+            newNodePath = newNodePath[:-1]
+            isnewpath = True
+        return (isnewpath, newNodePath)
 
     def extract(self, startTimestamp , endTimeStamp):
         # extract tree based on given timestamp
-        return self.root.extract(startTimestamp, endTimeStamp)
+        trieCopy = Trie()
+        trieCopy.counter = self.counter
+        trieCopy.matrixElements = self.matrixElements
+        trieCopy.root = self.root.extract(startTimestamp, endTimeStamp)
+        return trieCopy
 
     def comparison(self, tree1):
         # compare two trees
@@ -53,7 +70,7 @@ class Trie:
         stack_tree2.append(self.root)
         stack_tree1.append(tree1)
 
-        while (len(stack_tree2) != 0):
+        while(len(stack_tree2)!=0):
             tree2 = stack_tree2.pop()
             tree = stack_tree1.pop()
             for data in tree2.data:
@@ -70,11 +87,50 @@ class Trie:
                 stack_tree1.append(tree.children[child])
             else:
                 return False
-        if (len(stack_tree1) != 0):
+        if(len(stack_tree1)!=0):
             return False
         return True
 
+    def ancestorMatrixRec(self, node, anc, mat):
+        # base case
+        if node == None:
+            return mat
+
+        import numpy as np
+        mat = np.asarray(mat)
+
+        # Update all ancestors of current node
+        data_node = self.matrixElements[node.name]
+        for i in anc:
+            mat[self.matrixElements[i]][data_node] = 1
+
+        # Push current node onto the list of ancestors
+        anc.append(node.name)
+
+        # Traverse all children
+        for child in node.children:
+            pCrawlJunior = node.children[child]
+            mat = self.ancestorMatrixRec(pCrawlJunior, anc, mat)
+
+        # Remove current node from the list of ancestors,
+        # as all of its descendants are processed now.
+        anc.pop(-1)
+
+        return mat
+
+    # This function mainly calls ancestorMatrixRec()
+    def ancestorMatrix(self):
+        # Create an empty ancestor list
+        anc = []
+        # rows, cols = (len(self.matrixElements), len(self.matrixElements))
+        # mat = [[0] * cols] * rows
+        import numpy as np
+        mat = np.zeros((len(self.matrixElements), len(self.matrixElements)), dtype=int)
+        # Fill the ancestor matrix
+        return self.ancestorMatrixRec(self.root, anc, mat)
+
+
 def main():
     keys = ['/spotlight/impact/2014-11-24-master/naturalists.html', '/']
diff --git a/main.py b/main.py
index d3245a17d3e96131e6ee654c29bd3b72066d99fd..9952f904a4edf358e95ac64b237431c0f458962d 100644
--- a/main.py
+++ b/main.py
@@ -25,11 +25,14 @@ for i in listOfFolder:
     data.append(zz_new)
 
 data_array = np.asarray(data)
-threshold = [100]
+# threshold = [100]
+threshold = [len(data_array)]
 
-# making sitemap
+# making sitemap tree
 data_train = data_array[0:threshold[0]]
-sitemapURLS = {}
+
+# unique domains
+sitemapdomains = {}
 # sitemapURLS["/"] = "www.vt.edu"
 # sitemap = Trie()
 
@@ -38,23 +41,26 @@ for dayData in data_train:
     for i in range(len(dayDataNP)):
         # parsedurl = urlparse(dayDataNP[i][2])
         parsedurl = urlparse(os.path.splitext(dayDataNP[i][2])[0])
-        if not sitemapURLS.__contains__(parsedurl.hostname):
-            sitemapURLS[parsedurl.hostname] = Trie()
-        sitemap = sitemapURLS[parsedurl.hostname]
+        if not sitemapdomains.__contains__(parsedurl.hostname):
+            sitemapdomains[parsedurl.hostname] = Trie()
+            sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
+            sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
+        sitemap = sitemapdomains[parsedurl.hostname]
         timestamp = dayDataNP[i][1]
         payload = dayDataNP[i][9]
-        sitemap.insert(parsedurl.path, timestamp, payload)
+        isnewpath, newnodepath = sitemap.insert(parsedurl.path, timestamp, payload)
+        if isnewpath: print(newnodepath)
+        # if not(sitemapURLS.__contains__(parsedurl.path)):
+        #     sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2]
+        #     sitemap.insert(parsedurl.path, timestamp, payload)
 
-vtTree = sitemapURLS['www.vt.edu']
+vtTree = sitemapdomains['www.vt.edu']
 vtTreeCopy = vtTree.extract('20140906125541','20141215204723')
-result = vtTree.comparison(vtTreeCopy)
+result = vtTree.comparison(vtTreeCopy.root)
 print(result)
 result = vtTree.comparison(vtTree.root)
 print(result)
-print('done')
-
-
-    # if not(sitemapURLS.__contains__(parsedurl.path)):
-    #     sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2]
-    #     sitemap.insert(parsedurl.path, timestamp, payload)
+matrix = vtTree.ancestorMatrix()
+matrix = np.asarray(matrix)
+print('done')
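The sketch below is a minimal, hypothetical usage example of the API surface this patch introduces: insert() now returns an (isnewpath, newNodePath) tuple, extract() returns a Trie rather than a bare TrieNode (so comparison() is passed its .root), and ancestorMatrix() builds an ancestor matrix indexed by matrixElements. The hostname, URL key, and timestamps are taken from main.py and Trie.py's main(); the payload string is illustrative only, and the sketch assumes the patched Trie.py is importable.

import numpy as np
from Trie import Trie

sitemap = Trie()
sitemap.root.name = 'www.vt.edu'            # mirror main.py: label the root with the hostname
sitemap.matrixElements['www.vt.edu'] = 0    # mirror main.py: root takes matrix index 0

# insert() reports whether new nodes were created and which path segments were new
isnewpath, newnodepath = sitemap.insert(
    '/spotlight/impact/2014-11-24-master/naturalists.html',  # key reused from Trie.py's main()
    '20140906125541',                                         # timestamp from main.py
    'payload')                                                # payload value is illustrative
if isnewpath:
    print(newnodepath)   # 'spotlight/impact/2014-11-24-master/naturalists.html'

# extract() returns a Trie wrapper, so comparison() takes its .root, as in main.py
window = sitemap.extract('20140906125541', '20141215204723')
print(sitemap.comparison(window.root))

# ancestorMatrix() returns a square matrix over matrixElements indices
matrix = np.asarray(sitemap.ancestorMatrix())
print(matrix.shape)      # (len(sitemap.matrixElements), len(sitemap.matrixElements))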