bugs

32cda435 · Ritesh Bansal · e14fb7fe · 32cda435 · 32cda435
Commit 32cda435 authored 5 years ago by Ritesh Bansal
--- a/CNN_1hour2levelMainNew.py
+++ b/CNN_1hour2levelMainNew.py
@@ -20,10 +20,10 @@ sitemapdomains = cnnFocusCrawl.makingSitemapTree(data_train)
 sitemapdomains = cnnFocusCrawl.testingSitemapTreeClassiyRF(sitemapdomains, data_test)
 edition_cnn_com = sitemapdomains['www.cnn.com']
-edition_cnn_com_Copy = edition_cnn_com.extract()
+edition_cnn_com_Copy = edition_cnn_com.extract("","")
-result  = edition_cnn_com.comparison(edition_cnn_com_Copy.root)
+result  = edition_cnn_com.isSame(edition_cnn_com_Copy.root)
 print(result)
-result  = edition_cnn_com.comparison(edition_cnn_com.root)
+result  = edition_cnn_com.isSame(edition_cnn_com.root)
 print(result)
 matrix = edition_cnn_com.ancestorMatrix()
 matrix = np.asarray(matrix)

--- a/Trie.py
+++ b/Trie.py
@@ -27,6 +27,22 @@ class Trie:
        # Returns new trie node (initialized to NULLs)
        return TrieNode()
+    def isStructureChange(self, url):
+        """Report whether ``url`` would introduce a new path in the trie.
+
+        The URL is split on '/'. The first piece is skipped and empty
+        pieces are ignored; the remaining pieces are followed from
+        ``self.root`` through each node's ``children`` mapping.
+
+        Returns True as soon as a piece has no matching child, and False
+        when every piece is already present. The trie is not modified.
+        """
+        node = self.root
+        for segment in url.split('/')[1:]:
+            if not segment:
+                # Empty piece (from '//' or a trailing '/'): nothing to match.
+                continue
+            if segment not in node.children:
+                # First path segment missing from the trie.
+                return True
+            node = node.children[segment]
+        return False
    def insert(self, url, timestamp, payload):
        newNodePath = ''
        urlSplit = url.split('/')
@@ -54,15 +70,34 @@ class Trie:
            isnewpath = True
        return (isnewpath,newNodePath)
+    def extractNodeData(self, url):
+        """Return the ``data`` attribute of the node that ``url`` leads to.
+
+        The URL is split on '/'. The first piece is skipped and empty
+        pieces are ignored; the remaining pieces are followed from
+        ``self.root`` through each node's ``children`` mapping. When no
+        pieces remain to follow, the root node's ``data`` is returned.
+
+        Raises:
+            KeyError: if a piece of the path has no matching child. The
+                message names both the missing piece and the full URL.
+        """
+        node = self.root
+        for segment in url.split('/')[1:]:
+            if not segment:
+                # Empty piece (from '//' or a trailing '/'): nothing to follow.
+                continue
+            try:
+                node = node.children[segment]
+            except KeyError:
+                # Same exception type as a plain lookup, but say which URL
+                # failed so the caller can tell what was being looked up.
+                raise KeyError(
+                    "path segment %r of url %r is not in the trie"
+                    % (segment, url)
+                )
+        return node.data
    def extract(self, startTimestamp, endTimeStamp):
        """Build a new Trie from this one for a given timestamp window.

        Args:
            startTimestamp: lower bound as a string. None or a blank
                string is replaced by "0".
            endTimeStamp: upper bound as a string. None or a blank string
                is replaced by ``str(sys.maxsize)``.

        Returns:
            A new Trie whose root is ``self.root.extract(start, end)``.
            ``counter`` and ``matrixElements`` are assigned from this trie
            as-is, so they refer to the same objects rather than copies.

        NOTE(review): the actual filtering happens in the root node's own
        ``extract``; how it compares timestamps is not visible here.
        """
+        # Local import: only needed for the open-ended upper bound.
+        import sys
+        if startTimestamp is None or not startTimestamp.strip():
+            startTimestamp = "0"
+        if endTimeStamp is None or not endTimeStamp.strip():
+            endTimeStamp = str(sys.maxsize)
        trieCopy = Trie()
        trieCopy.counter = self.counter
        trieCopy.matrixElements = self.matrixElements
        trieCopy.root = self.root.extract(startTimestamp, endTimeStamp)
        return trieCopy
-    def comparison(self, tree1):
+    def isSame(self, tree1):
        # compare two trees
        from collections import deque
        stack_tree2 = deque()
@@ -109,7 +144,6 @@ class Trie:
        anc.append(node.name)
        # Traverse left and right subtrees
        for child in node.children:
            pCrawlJunior = node.children[child]
            mat = self.ancestorMatrixRec(pCrawlJunior, anc, mat)