def isStructureChange(self, url):
    """Report whether ``url`` contains a path segment missing from the trie.

    ``url`` is split on '/'. The piece before the first '/' is never
    checked, and empty pieces (produced by '//' or a trailing '/') are
    skipped. The walk starts at ``self.root`` and follows ``children``
    one segment at a time; nothing is inserted or modified.

    Args:
        url: Slash-separated path or URL string.

    Returns:
        True as soon as a segment is not found among the current node's
        children, otherwise False.
    """
    pCrawl = self.root
    for level in url.split('/')[1:]:
        if not level:
            continue
        if level not in pCrawl.children:
            return True
        pCrawl = pCrawl.children[level]
    return False


def extractNodeData(self, url):
    """Return the ``data`` attribute of the trie node addressed by ``url``.

    ``url`` is split on '/'. The piece before the first '/' is never
    looked up, and empty pieces (produced by '//' or a trailing '/') are
    skipped. If no segment remains, the root node's ``data`` is returned.

    Args:
        url: Slash-separated path or URL string.

    Returns:
        The ``data`` attribute of the node reached by following the
        segments from ``self.root``.

    Raises:
        KeyError: If a segment is not among the current node's children.
    """
    pCrawl = self.root
    for level in url.split('/')[1:]:
        if not level:
            continue
        if level not in pCrawl.children:
            # Same exception type a plain lookup would raise, but with
            # enough context to tell which part of the URL is unknown.
            raise KeyError(
                "path segment %r of %r is not in the trie" % (level, url)
            )
        pCrawl = pCrawl.children[level]
    return pCrawl.data


def extract(self, startTimestamp=None, endTimeStamp=None):
    """Build a new ``Trie`` from the nodes selected by a timestamp range.

    The actual selection is delegated to ``self.root.extract``; this
    method only normalises the bounds and wraps the result.

    Args:
        startTimestamp: Lower bound as a string. ``None`` or a blank /
            whitespace-only string is replaced by ``"0"``.
        endTimeStamp: Upper bound as a string. ``None`` or a blank /
            whitespace-only string is replaced by ``str(sys.maxsize)``.

    Returns:
        A new ``Trie`` whose ``root`` is the value returned by
        ``self.root.extract(startTimestamp, endTimeStamp)``. ``counter``
        and ``matrixElements`` are assigned as-is; no copy is made here.
    """
    import sys

    if startTimestamp is None or not startTimestamp.strip():
        startTimestamp = "0"
    if endTimeStamp is None or not endTimeStamp.strip():
        endTimeStamp = str(sys.maxsize)

    trieCopy = Trie()
    trieCopy.counter = self.counter
    trieCopy.matrixElements = self.matrixElements
    trieCopy.root = self.root.extract(startTimestamp, endTimeStamp)
    return trieCopy