Skip to content
Snippets Groups Projects
Commit db376411 authored by Ritesh Bansal's avatar Ritesh Bansal
Browse files

added logic for matrix creation

parent e24fdbc9
No related branches found
No related tags found
No related merge requests found
......@@ -2,6 +2,7 @@ class TrieNode:
def __init__(self):
    """Create an empty trie node: no children, no payloads, unnamed."""
    # Path segment this node represents ('' until set by the inserter).
    self.name = ''
    # Child nodes keyed by path segment.
    self.children = {}
    # Crawl payloads keyed by timestamp string.
    self.data = {}
    # True once at least one full URL terminates at this node.
    self.isEndOfUrl = False
def extract(self, startTimestamp , endTimeStamp):
......@@ -17,33 +18,49 @@ class TrieNode:
return pCrawlCopy
class Trie:
def __init__(self):
    """Create an empty Trie with a fresh root node and an empty segment index."""
    self.root = self.getNode()
    # Next index to hand out for a newly seen path segment.
    self.counter = 0
    # Maps each distinct path segment name to its row/column index in the
    # ancestor matrix; filled lazily by insert().
    self.matrixElements = {}
def getNode(self):
    """Build and return a brand-new, empty TrieNode."""
    node = TrieNode()
    return node
def insert(self, url, timestamp, payload):
    """Insert *url* into the trie, storing *payload* under *timestamp*
    at the terminal node.

    Every distinct path segment is also registered in ``matrixElements``
    with the next free index (used later for the ancestor matrix).

    Returns a tuple ``(isnewpath, newNodePath)`` where ``isnewpath`` is
    True when this insert created at least one new node, and
    ``newNodePath`` is the '/'-joined chain of newly created segment
    names ('' when nothing new was created).
    """
    pCrawl = self.root
    new_segments = []  # names of nodes created by this call, in order
    # [1:] drops whatever precedes the first '/' — preserves the
    # original assumption that url is an absolute path ('/a/b' -> a, b).
    for level in url.split('/')[1:]:
        if not level:
            # Skip '' segments produced by '//' or a trailing '/'.
            continue
        # First sighting of this segment anywhere in the trie: assign it
        # the next matrix index.  Counter is pre-incremented, so indices
        # start at 1 (index 0 is assigned to the hostname by the caller —
        # see the site-map build script).
        if level not in self.matrixElements:
            self.counter += 1
            self.matrixElements[level] = self.counter
        if level in pCrawl.children:
            pCrawl = pCrawl.children[level]
        else:
            node = TrieNode()
            node.name = level
            pCrawl.children[level] = node
            pCrawl = node
            new_segments.append(level)
    pCrawl.data[timestamp] = payload
    pCrawl.isEndOfUrl = True
    if new_segments:
        return (True, '/'.join(new_segments))
    return (False, '')
def extract(self, startTimestamp, endTimeStamp):
    """Return a new Trie restricted to data within [startTimestamp, endTimeStamp].

    The node filtering itself is delegated to ``TrieNode.extract``.  The
    segment index (``matrixElements``/``counter``) is carried over so
    matrix positions stay comparable between original and extract.
    NOTE(review): ``matrixElements`` is shared by reference, not copied —
    later inserts into either trie mutate both indexes; confirm intended.
    """
    trieCopy = Trie()
    trieCopy.counter = self.counter
    trieCopy.matrixElements = self.matrixElements
    trieCopy.root = self.root.extract(startTimestamp, endTimeStamp)
    return trieCopy
def comparison(self, tree1):
# compare two trees
# Compares this trie against another tree, where *tree1* is the other
# tree's root node, by walking both in lockstep with two explicit
# stacks; returns True when structure and per-node data match.
# NOTE(review): this text is a scraped diff — the '......@@' lines below
# mark elided code, and adjacent near-duplicate lines (the two 'while'
# lines and the two 'if' lines) are the pre-/post-change versions of a
# single statement.  Do not assume the visible lines are contiguous.
......@@ -53,7 +70,7 @@ class Trie:
stack_tree2.append(self.root)
stack_tree1.append(tree1)
while (len(stack_tree2) != 0):
while(len(stack_tree2)!=0):
tree2 = stack_tree2.pop()
tree = stack_tree1.pop()
for data in tree2.data:
......@@ -70,11 +87,50 @@ class Trie:
stack_tree1.append(tree.children[child])
else:
return False
if (len(stack_tree1) != 0):
if(len(stack_tree1)!=0):
return False
return True
def ancestorMatrixRec(self, node, anc, mat):
    """Depth-first helper that fills the ancestor matrix.

    Sets ``mat[i][j] = 1`` when the segment with index *i* (per
    ``self.matrixElements``) is an ancestor of the segment with index *j*.

    *anc* is the mutable list of ancestor segment names along the current
    DFS path; it is restored to its incoming state before returning.
    Returns the (possibly converted) matrix.
    """
    # Base case: missing subtree, nothing to mark.
    if node is None:
        return mat
    import numpy as np
    # No-op when mat is already an ndarray (as built by ancestorMatrix);
    # kept so the method also accepts a plain list-of-lists.
    mat = np.asarray(mat)
    # Mark every ancestor on the current path as an ancestor of this node.
    col = self.matrixElements[node.name]
    for ancestor in anc:
        mat[self.matrixElements[ancestor]][col] = 1
    # Push this node, recurse into all children, then pop it once all of
    # its descendants have been processed.
    anc.append(node.name)
    for child in node.children.values():
        mat = self.ancestorMatrixRec(child, anc, mat)
    anc.pop()
    return mat
# Public entry point that delegates the DFS to ancestorMatrixRec().
def ancestorMatrix(self):
    """Return the NxN ancestor matrix for this trie.

    N is the number of distinct path segments registered in
    ``self.matrixElements``; entry [i][j] is 1 iff segment i is an
    ancestor of segment j.
    """
    import numpy as np
    size = len(self.matrixElements)
    mat = np.zeros((size, size), dtype=int)
    # Start the DFS at the root with an empty ancestor stack.
    return self.ancestorMatrixRec(self.root, [], mat)
def main():
keys = ['/spotlight/impact/2014-11-24-master/naturalists.html', '/']
......
......@@ -25,11 +25,14 @@ for i in listOfFolder:
data.append(zz_new)
data_array = np.asarray(data)
# threshold = [100]
# Train on every available day instead of a fixed cap of 100.
threshold = [len(data_array)]
# making sitemap tree
data_train = data_array[0:threshold[0]]
sitemapURLS = {}
# unique domains: one Trie per hostname
sitemapdomains = {}
# sitemapURLS["/"] = "www.vt.edu"
# sitemap = Trie()
......@@ -38,23 +41,26 @@ for dayData in data_train:
# Index each crawl record of the current day into the trie for its host.
for i in range(len(dayDataNP)):
    # parsedurl = urlparse(dayDataNP[i][2])
    # Strip the file extension first so '/a/b.html' and '/a/b' collapse
    # to the same trie path.
    parsedurl = urlparse(os.path.splitext(dayDataNP[i][2])[0])
    # First time we see this hostname: create its Trie, name the root
    # after the host and reserve matrix index 0 for it.
    if parsedurl.hostname not in sitemapdomains:
        sitemapdomains[parsedurl.hostname] = Trie()
        sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
        sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
    sitemap = sitemapdomains[parsedurl.hostname]
    timestamp = dayDataNP[i][1]
    payload = dayDataNP[i][9]
    isnewpath, newnodepath = sitemap.insert(parsedurl.path, timestamp, payload)
    # Log each path chain the first time it appears.
    if isnewpath: print(newnodepath)
    # if not(sitemapURLS.__contains__(parsedurl.path)):
    #     sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2]
    #     sitemap.insert(parsedurl.path, timestamp, payload)
# Compare the vt.edu tree against a time-bounded extract of itself.
vtTree = sitemapdomains['www.vt.edu']
vtTreeCopy = vtTree.extract('20140906125541', '20141215204723')
# Extract vs. original (False when the time window dropped data).
result = vtTree.comparison(vtTreeCopy.root)
print(result)
# Sanity check: a tree must always equal itself.
result = vtTree.comparison(vtTree.root)
print(result)
print('done')
# Build the segment-ancestor matrix for the vt.edu tree.
matrix = vtTree.ancestorMatrix()
matrix = np.asarray(matrix)
print('done')
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment