Skip to content
Snippets Groups Projects
Commit 32cda435 authored by Ritesh Bansal's avatar Ritesh Bansal
Browse files

bugs

parent e14fb7fe
No related branches found
No related tags found
No related merge requests found
...@@ -20,10 +20,10 @@ sitemapdomains = cnnFocusCrawl.makingSitemapTree(data_train) ...@@ -20,10 +20,10 @@ sitemapdomains = cnnFocusCrawl.makingSitemapTree(data_train)
sitemapdomains = cnnFocusCrawl.testingSitemapTreeClassiyRF(sitemapdomains, data_test) sitemapdomains = cnnFocusCrawl.testingSitemapTreeClassiyRF(sitemapdomains, data_test)
edition_cnn_com = sitemapdomains['www.cnn.com'] edition_cnn_com = sitemapdomains['www.cnn.com']
edition_cnn_com_Copy = edition_cnn_com.extract() edition_cnn_com_Copy = edition_cnn_com.extract("","")
result = edition_cnn_com.comparison(edition_cnn_com_Copy.root) result = edition_cnn_com.isSame(edition_cnn_com_Copy.root)
print(result) print(result)
result = edition_cnn_com.comparison(edition_cnn_com.root) result = edition_cnn_com.isSame(edition_cnn_com.root)
print(result) print(result)
matrix = edition_cnn_com.ancestorMatrix() matrix = edition_cnn_com.ancestorMatrix()
matrix = np.asarray(matrix) matrix = np.asarray(matrix)
......
...@@ -27,6 +27,22 @@ class Trie: ...@@ -27,6 +27,22 @@ class Trie:
# Returns new trie node (initialized to NULLs) # Returns new trie node (initialized to NULLs)
return TrieNode() return TrieNode()
def isStructureChange(self, url):
    """Report whether ``url`` contains a path segment missing from the trie.

    The URL is split on '/'. The first piece (whatever precedes the first
    slash) is not looked up, and empty pieces produced by doubled or
    trailing slashes are skipped. The walk starts at ``self.root`` and
    follows ``children`` one segment at a time. Nothing is inserted or
    modified.

    Args:
        url: '/'-separated path string to check.

    Returns:
        True as soon as a segment has no matching child at the current
        node; False when every segment is already present (including when
        there are no segments to check).
    """
    node = self.root
    # Slicing off index 0 mirrors the original loop that started at 1.
    for level in url.split('/')[1:]:
        if not level:
            continue
        if level not in node.children:
            # First unknown segment: the URL would add a new branch.
            return True
        node = node.children[level]
    return False
def insert(self, url, timestamp, payload): def insert(self, url, timestamp, payload):
newNodePath = '' newNodePath = ''
urlSplit = url.split('/') urlSplit = url.split('/')
...@@ -54,15 +70,34 @@ class Trie: ...@@ -54,15 +70,34 @@ class Trie:
isnewpath = True isnewpath = True
return (isnewpath,newNodePath) return (isnewpath,newNodePath)
def extractNodeData(self, url):
    """Return the ``data`` attribute of the trie node addressed by ``url``.

    The URL is split on '/'. The first piece is not looked up and empty
    pieces (from doubled or trailing slashes) are skipped. Each remaining
    segment is followed through ``children`` starting at ``self.root``.

    Args:
        url: '/'-separated path string identifying a node.

    Returns:
        The ``data`` attribute of the node reached; for a URL with no
        segments after the first piece this is the root's ``data``.

    Raises:
        KeyError: if a segment has no matching child (the lookup is a
            plain ``children[level]`` access, as in the original).
    """
    node = self.root
    # Slicing off index 0 mirrors the original loop that started at 1.
    for level in url.split('/')[1:]:
        if level:
            node = node.children[level]
    return node.data
def extract(self, startTimestamp=None, endTimeStamp=None):
    """Build a new Trie from the nodes selected by a timestamp window.

    A bound that is None, empty, or only whitespace is treated as open:
    the start falls back to "0" and the end to ``str(sys.maxsize)``. Both
    bounds are handed to ``self.root.extract`` as strings; how they are
    compared is decided there.
    # NOTE(review): node-level extract is not visible here - confirm the
    # comparison it applies to these string bounds.

    Args:
        startTimestamp: lower bound as a string, or None/blank for no
            lower bound. Defaults to None so ``extract()`` works.
        endTimeStamp: upper bound as a string, or None/blank for no
            upper bound. Defaults to None.

    Returns:
        A new Trie whose ``root`` is the result of ``self.root.extract``.
        ``counter`` and ``matrixElements`` are assigned from this trie
        without copying.
    """
    # Function-scope import: the file's import header is not guaranteed
    # to provide sys.
    import sys

    if startTimestamp is None or not startTimestamp.strip():
        startTimestamp = "0"
    if endTimeStamp is None or not endTimeStamp.strip():
        endTimeStamp = str(sys.maxsize)

    trieCopy = Trie()
    trieCopy.counter = self.counter
    trieCopy.matrixElements = self.matrixElements
    trieCopy.root = self.root.extract(startTimestamp, endTimeStamp)
    return trieCopy
def comparison(self, tree1): def isSame(self, tree1):
# compare two trees # compare two trees
from collections import deque from collections import deque
stack_tree2 = deque() stack_tree2 = deque()
...@@ -109,7 +144,6 @@ class Trie: ...@@ -109,7 +144,6 @@ class Trie:
anc.append(node.name) anc.append(node.name)
# Traverse left and right subtrees # Traverse left and right subtrees
for child in node.children: for child in node.children:
pCrawlJunior = node.children[child] pCrawlJunior = node.children[child]
mat = self.ancestorMatrixRec(pCrawlJunior, anc, mat) mat = self.ancestorMatrixRec(pCrawlJunior, anc, mat)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment