Skip to content
Snippets Groups Projects
Commit db376411 authored by Ritesh Bansal's avatar Ritesh Bansal
Browse files

Added logic for ancestor-matrix creation

parent e24fdbc9
No related branches found
No related tags found
No related merge requests found
...@@ -2,6 +2,7 @@ class TrieNode: ...@@ -2,6 +2,7 @@ class TrieNode:
def __init__(self):
    """Initialise an empty trie node."""
    # Path-segment label of this node ('' until assigned by the trie).
    self.name = ''
    # Child nodes keyed by the next URL path segment.
    self.children = {}
    # Payloads recorded at this node, keyed by timestamp.
    self.data = {}
    # True once a complete URL terminates at this node.
    self.isEndOfUrl = False
def extract(self, startTimestamp , endTimeStamp): def extract(self, startTimestamp , endTimeStamp):
...@@ -17,33 +18,49 @@ class TrieNode: ...@@ -17,33 +18,49 @@ class TrieNode:
return pCrawlCopy return pCrawlCopy
class Trie: class Trie:
def __init__(self):
    """Initialise an empty trie plus the matrix-index bookkeeping."""
    # Maps each distinct path segment to its ancestor-matrix index.
    self.matrixElements = {}
    # Highest matrix index handed out so far (0 is reserved externally).
    self.counter = 0
    # Root node; getNode() returns a fresh, empty TrieNode.
    self.root = self.getNode()
def getNode(self):
    """Return a brand-new, empty TrieNode (all fields at their defaults)."""
    return TrieNode()
def insert(self, url, timestamp, payload):
    """Insert one URL observation into the trie.

    The path is split on '/'; the leading element (empty for paths that
    start with '/') is skipped, as are empty segments from repeated
    slashes.  Every previously unseen segment is also assigned the next
    free ancestor-matrix index in self.matrixElements.

    Args:
        url: URL path, e.g. '/spotlight/impact/naturalists.html'.
        timestamp: Key under which payload is stored at the leaf node.
        payload: Value recorded at the leaf for this timestamp.

    Returns:
        Tuple (isnewpath, newNodePath): isnewpath is True when at least
        one new node was created; newNodePath is the '/'-joined run of
        newly created segments ('' when nothing was new).
    """
    newSegments = []          # segments for which fresh nodes were made
    pCrawl = self.root
    # Skip element 0: it is '' for absolute paths (original behaviour).
    for level in url.split('/')[1:]:
        if not level:
            continue
        # First sighting of this segment anywhere: give it a matrix index.
        if level not in self.matrixElements:
            self.counter += 1
            self.matrixElements[level] = self.counter
        if level in pCrawl.children:
            pCrawl = pCrawl.children[level]
        else:
            child = TrieNode()
            child.name = level
            pCrawl.children[level] = child
            pCrawl = child
            newSegments.append(level)
    pCrawl.data[timestamp] = payload
    pCrawl.isEndOfUrl = True
    return (bool(newSegments), '/'.join(newSegments))
def extract(self, startTimestamp, endTimeStamp):
    """Return a Trie restricted to [startTimestamp, endTimeStamp].

    Node filtering is delegated to TrieNode.extract on the root.
    NOTE(review): matrixElements is shared by reference with self rather
    than copied, so later mutations are visible in both tries — confirm
    this aliasing is intended.
    """
    snapshot = Trie()
    snapshot.root = self.root.extract(startTimestamp, endTimeStamp)
    snapshot.matrixElements = self.matrixElements
    snapshot.counter = self.counter
    return snapshot
def comparison(self, tree1): def comparison(self, tree1):
# compare two trees # compare two trees
...@@ -53,7 +70,7 @@ class Trie: ...@@ -53,7 +70,7 @@ class Trie:
stack_tree2.append(self.root) stack_tree2.append(self.root)
stack_tree1.append(tree1) stack_tree1.append(tree1)
while (len(stack_tree2) != 0): while(len(stack_tree2)!=0):
tree2 = stack_tree2.pop() tree2 = stack_tree2.pop()
tree = stack_tree1.pop() tree = stack_tree1.pop()
for data in tree2.data: for data in tree2.data:
...@@ -70,11 +87,50 @@ class Trie: ...@@ -70,11 +87,50 @@ class Trie:
stack_tree1.append(tree.children[child]) stack_tree1.append(tree.children[child])
else: else:
return False return False
if (len(stack_tree1) != 0): if(len(stack_tree1)!=0):
return False return False
return True return True
def ancestorMatrixRec(self, node, anc, mat):
    """Recursively fill the ancestor matrix for the subtree rooted at node.

    Sets mat[i][j] = 1 whenever the segment with matrix index i is an
    ancestor of the segment with matrix index j; indices are looked up
    in self.matrixElements by node name.

    Args:
        node: Current trie node, or None (returned untouched).
        anc: Mutable list of ancestor names on the path to node; it is
            restored to its input state before returning.
        mat: Square 0/1 matrix indexable as mat[i][j]; mutated in place.

    Returns:
        The (possibly ndarray-wrapped) matrix.
    """
    # Base case: nothing to record for a missing node.
    if node is None:
        return mat
    import numpy as np
    # No-op when mat is already an ndarray; kept so list input still works.
    mat = np.asarray(mat)
    # Every ancestor collected so far dominates the current node.
    col = self.matrixElements[node.name]
    for ancestorName in anc:
        mat[self.matrixElements[ancestorName]][col] = 1
    # The current node is an ancestor of everything below it.
    anc.append(node.name)
    for child in node.children.values():
        mat = self.ancestorMatrixRec(child, anc, mat)
    # Undo the append so siblings and callers see the original path.
    anc.pop()
    return mat
def ancestorMatrix(self):
    """Build the full ancestor matrix for this trie.

    Returns:
        An NxN int matrix (N = len(self.matrixElements)) where entry
        [i][j] == 1 iff the segment with matrix index i is an ancestor
        of the segment with matrix index j.
    """
    import numpy as np
    size = len(self.matrixElements)
    # np.zeros avoids the classic [[0]*n]*n row-aliasing pitfall.
    mat = np.zeros((size, size), dtype=int)
    # Walk the whole tree from the root with an initially empty
    # ancestor list.
    return self.ancestorMatrixRec(self.root, [], mat)
def main(): def main():
keys = ['/spotlight/impact/2014-11-24-master/naturalists.html', '/'] keys = ['/spotlight/impact/2014-11-24-master/naturalists.html', '/']
......
...@@ -25,11 +25,14 @@ for i in listOfFolder: ...@@ -25,11 +25,14 @@ for i in listOfFolder:
data.append(zz_new) data.append(zz_new)
data_array = np.asarray(data) data_array = np.asarray(data)
threshold = [100] # threshold = [100]
threshold = [len(data_array)]
# making sitemap # making sitemap tree
data_train = data_array[0:threshold[0]] data_train = data_array[0:threshold[0]]
sitemapURLS = {}
#unique domains
sitemapdomains = {}
# sitemapURLS["/"] = "www.vt.edu" # sitemapURLS["/"] = "www.vt.edu"
# sitemap = Trie() # sitemap = Trie()
...@@ -38,23 +41,26 @@ for dayData in data_train: ...@@ -38,23 +41,26 @@ for dayData in data_train:
for i in range(len(dayDataNP)): for i in range(len(dayDataNP)):
# parsedurl = urlparse(dayDataNP[i][2]) # parsedurl = urlparse(dayDataNP[i][2])
parsedurl = urlparse(os.path.splitext(dayDataNP[i][2])[0]) parsedurl = urlparse(os.path.splitext(dayDataNP[i][2])[0])
if not sitemapURLS.__contains__(parsedurl.hostname): if not sitemapdomains.__contains__(parsedurl.hostname):
sitemapURLS[parsedurl.hostname] = Trie() sitemapdomains[parsedurl.hostname] = Trie()
sitemap = sitemapURLS[parsedurl.hostname] sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
sitemap = sitemapdomains[parsedurl.hostname]
timestamp = dayDataNP[i][1] timestamp = dayDataNP[i][1]
payload = dayDataNP[i][9] payload = dayDataNP[i][9]
sitemap.insert(parsedurl.path, timestamp, payload) isnewpath, newnodepath = sitemap.insert(parsedurl.path, timestamp, payload)
if isnewpath: print(newnodepath)
# if not(sitemapURLS.__contains__(parsedurl.path)):
# sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2]
# sitemap.insert(parsedurl.path, timestamp, payload)
# Inspect the 'www.vt.edu' tree built by the ingestion loop above.
vtTree = sitemapdomains['www.vt.edu']
# Snapshot restricted to the observation window of interest.
vtTreeCopy = vtTree.extract('20140906125541', '20141215204723')
# A window-filtered copy should compare as a subtree of the full tree.
result = vtTree.comparison(vtTreeCopy.root)
print(result)
# Sanity check: a tree always matches itself.
result = vtTree.comparison(vtTree.root)
print(result)
# Ancestor matrix over every path segment seen for this domain.
matrix = vtTree.ancestorMatrix()
matrix = np.asarray(matrix)
print('done')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment