From 32cda43509de7b8e5e98b76a1635c967470e4239 Mon Sep 17 00:00:00 2001
From: Ritesh Bansal <riteshobansal@gmail.com>
Date: Tue, 3 Dec 2019 11:45:21 -0500
Subject: [PATCH] bugs

---
Note (review): the line breaks and indentation of this patch were
reconstructed from a whitespace-collapsed copy. Hunk line counts match the
@@ headers and the diffstat, but the exact position of blank context lines
and the indentation depth of context lines are assumptions; apply with
`git apply --ignore-whitespace` if the strict apply fails.

 CNN_1hour2levelMainNew.py |  6 +++---
 Trie.py                   | 38 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/CNN_1hour2levelMainNew.py b/CNN_1hour2levelMainNew.py
index 5bebb40..825f05e 100644
--- a/CNN_1hour2levelMainNew.py
+++ b/CNN_1hour2levelMainNew.py
@@ -20,10 +20,10 @@ sitemapdomains = cnnFocusCrawl.makingSitemapTree(data_train)
 sitemapdomains = cnnFocusCrawl.testingSitemapTreeClassiyRF(sitemapdomains, data_test)
 
 edition_cnn_com = sitemapdomains['www.cnn.com']
-edition_cnn_com_Copy = edition_cnn_com.extract()
-result = edition_cnn_com.comparison(edition_cnn_com_Copy.root)
+edition_cnn_com_Copy = edition_cnn_com.extract("","")
+result = edition_cnn_com.isSame(edition_cnn_com_Copy.root)
 print(result)
-result = edition_cnn_com.comparison(edition_cnn_com.root)
+result = edition_cnn_com.isSame(edition_cnn_com.root)
 print(result)
 matrix = edition_cnn_com.ancestorMatrix()
 matrix = np.asarray(matrix)
diff --git a/Trie.py b/Trie.py
index 118f4c5..b5dbb73 100644
--- a/Trie.py
+++ b/Trie.py
@@ -27,6 +27,22 @@ class Trie:
         # Returns new trie node (initialized to NULLs)
         return TrieNode()
 
+    def isStructureChange(self, url):
+        urlSplit = url.split('/')
+        pCrawl = self.root
+        isnewpath = False
+        # for level in urlSplit:
+        for i in range(1, len(urlSplit)):
+            # if current character is not present
+            level = urlSplit[i]
+            if len(level) == 0: continue
+            if pCrawl.children.__contains__(level):
+                pCrawl = pCrawl.children[level];
+            else:
+                isnewpath = True
+                break
+        return isnewpath
+
     def insert(self, url, timestamp, payload):
         newNodePath = ''
         urlSplit = url.split('/')
@@ -54,15 +70,34 @@ class Trie:
                 isnewpath = True
         return (isnewpath,newNodePath)
 
+    def extractNodeData(self, url):
+        newNodePath = ''
+        urlSplit = url.split('/')
+        pCrawl = self.root
+        # for level in urlSplit:
+        for i in range(1, len(urlSplit)):
+            # if current character is not present
+            level = urlSplit[i]
+            if len(level) == 0: continue
+            pCrawl = pCrawl.children[level];
+        return pCrawl.data
+
+
+
     def extract(self, startTimestamp , endTimeStamp):
         # extract tree based on given timestamp
+        if startTimestamp == None or len(startTimestamp.strip())==0:
+            startTimestamp = "0"
+        if endTimeStamp == None or len(endTimeStamp.strip())==0:
+            import sys
+            endTimeStamp = str(sys.maxsize)
         trieCopy = Trie()
         trieCopy.counter = self.counter
         trieCopy.matrixElements = self.matrixElements
         trieCopy.root = self.root.extract(startTimestamp, endTimeStamp)
         return trieCopy
 
-    def comparison(self, tree1):
+    def isSame(self, tree1):
         # compare two trees
         from collections import deque
         stack_tree2 = deque()
@@ -109,7 +144,6 @@ class Trie:
         anc.append(node.name)
 
         # Traverse left and right subtrees
-
         for child in node.children:
             pCrawlJunior = node.children[child]
             mat = self.ancestorMatrixRec(pCrawlJunior, anc, mat)
-- 
GitLab