namanahuja / CS-6604-WebArchive

Commit db376411
authored 5 years ago by Ritesh Bansal
added logic for matrix creation
parent e24fdbc9
Showing 2 changed files with 81 additions and 19 deletions:

Trie.py   +60 −4    (60 additions, 4 deletions)
main.py   +21 −15   (21 additions, 15 deletions)
Trie.py  +60 −4  · view file @ db376411
...
@@ -2,6 +2,7 @@ class TrieNode:
    def __init__(self):
        self.children = {}
        self.data = {}
+       self.name = ''
        self.isEndOfUrl = False

    def extract(self, startTimestamp, endTimeStamp):
        ...
@@ -17,33 +18,49 @@ class TrieNode:
        return pCrawlCopy


class Trie:
    def __init__(self):
        self.root = self.getNode()
        self.matrixElements = {}
        self.counter = 0

    def getNode(self):
        # Returns new trie node (initialized to NULLs)
        return TrieNode()

    def insert(self, url, timestamp, payload):
        newNodePath = ''
        urlSplit = url.split('/')
        pCrawl = self.root
        isnewpath = False
        # for level in urlSplit:
        for i in range(1, len(urlSplit)):
            # if current character is not present
            level = urlSplit[i]
            if len(level) == 0:
                continue
            if not self.matrixElements.__contains__(level):
                self.counter = self.counter + 1
                self.matrixElements[level] = self.counter
            if pCrawl.children.__contains__(level):
                pCrawl = pCrawl.children[level]
            else:
                newNodePath = newNodePath + level + '/'
                pCrawl.children[level] = TrieNode()
                pCrawl = pCrawl.children[level]
            pCrawl.name = level
        pCrawl.data[timestamp] = payload
        pCrawl.isEndOfUrl = True
        if (newNodePath != ''):
            newNodePath = newNodePath[:-1]
            isnewpath = True
        return (isnewpath, newNodePath)

    def extract(self, startTimestamp, endTimeStamp):
        # extract tree based on given timestamp
-       return self.root.extract(startTimestamp, endTimeStamp)
+       trieCopy = Trie()
+       trieCopy.counter = self.counter
+       trieCopy.matrixElements = self.matrixElements
+       trieCopy.root = self.root.extract(startTimestamp, endTimeStamp)
+       return trieCopy

    def comparison(self, tree1):
        # compare two trees
        ...
@@ -53,7 +70,7 @@ class Trie:
        stack_tree2.append(self.root)
        stack_tree1.append(tree1)
-       while (len(stack_tree2) != 0):
+       while (len(stack_tree2) != 0):
            tree2 = stack_tree2.pop()
            tree = stack_tree1.pop()
            for data in tree2.data:
                ...
@@ -70,11 +87,50 @@ class Trie:
                    stack_tree1.append(tree.children[child])
                else:
                    return False
-       if (len(stack_tree1) != 0):
+       if (len(stack_tree1) != 0):
            return False
        return True

    def ancestorMatrixRec(self, node, anc, mat):
        # base case
        if node == None:
            return mat
        import numpy as np
        mat = np.asarray(mat)
        # Update all ancestors of current node
        data_node = self.matrixElements[node.name]
        for i in anc:
            mat[self.matrixElements[i]][data_node] = 1
        # Push data to list of ancestors
        anc.append(node.name)
        # Traverse left and right subtrees
        for child in node.children:
            pCrawlJunior = node.children[child]
            mat = self.ancestorMatrixRec(pCrawlJunior, anc, mat)
        # Remove data from the list of ancestors
        # as all descendants of it are processed now.
        anc.pop(-1)
        return mat

    # This function mainly calls ancestorMatrixRec()
    def ancestorMatrix(self):
        # Create an empty ancestor array
        anc = []
        # rows, cols = (len(self.matrixElements), len(self.matrixElements))
        # mat = [[0] * cols] * rows
        import numpy as np
        mat = np.zeros((len(self.matrixElements), len(self.matrixElements)), dtype=int)
        # Fill the ancestor matrix and return it
        return self.ancestorMatrixRec(self.root, anc, mat)


def main():
    keys = ['/spotlight/impact/2014-11-24-master/naturalists.html', '/']
    ...
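A minimal usage sketch of what this commit adds to Trie.py (the sketch itself is not part of the commit): it assumes the Trie class above is importable as "from Trie import Trie", and it seeds the root the same way main.py below does, because ancestorMatrixRec() looks the root's name up in matrixElements. The hostname, URLs, timestamps, and payload strings are made up for illustration.

    from Trie import Trie

    t = Trie()
    # Seed the root as main.py does: give it a name and matrix index 0,
    # otherwise ancestorMatrixRec() would fail on the unnamed root.
    t.root.name = 'www.vt.edu'
    t.matrixElements['www.vt.edu'] = 0

    # insert() now returns (isnewpath, newNodePath) for every capture.
    for path, ts, payload in [
        ('/spotlight/impact/naturalists.html', '20140906125541', 'payload-1'),
        ('/spotlight/news.html', '20141215204723', 'payload-2'),
    ]:
        isnewpath, newnodepath = t.insert(path, ts, payload)
        if isnewpath:
            print('new path:', newnodepath)

    # ancestorMatrix() returns an N x N 0/1 numpy array, N = len(t.matrixElements);
    # mat[i][j] == 1 when the segment with index i is an ancestor of the segment with index j.
    mat = t.ancestorMatrix()
    print(t.matrixElements)  # e.g. {'www.vt.edu': 0, 'spotlight': 1, 'impact': 2, ...}
    print(mat)               # row 0 ('www.vt.edu') is 1 for every other segment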
main.py  +21 −15  · view file @ db376411
...
@@ -25,11 +25,14 @@ for i in listOfFolder:
    data.append(zz_new)

data_array = np.asarray(data)
-threshold = [100]
+# threshold = [100]
+threshold = [len(data_array)]
-# making sitemap
+# making sitemap tree
data_train = data_array[0:threshold[0]]
sitemapURLS = {}
+#unique domains
+sitemapdomains = {}
# sitemapURLS["/"] = "www.vt.edu"
# sitemap = Trie()
...
@@ -38,23 +41,26 @@ for dayData in data_train:
    for i in range(len(dayDataNP)):
        # parsedurl = urlparse(dayDataNP[i][2])
        parsedurl = urlparse(os.path.splitext(dayDataNP[i][2])[0])
-       if not sitemapURLS.__contains__(parsedurl.hostname):
-           sitemapURLS[parsedurl.hostname] = Trie()
-       sitemap = sitemapURLS[parsedurl.hostname]
+       if not sitemapdomains.__contains__(parsedurl.hostname):
+           sitemapdomains[parsedurl.hostname] = Trie()
+           sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
+           sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
+       sitemap = sitemapdomains[parsedurl.hostname]
        timestamp = dayDataNP[i][1]
        payload = dayDataNP[i][9]
-       sitemap.insert(parsedurl.path, timestamp, payload)
+       isnewpath, newnodepath = sitemap.insert(parsedurl.path, timestamp, payload)
+       if isnewpath:
+           print(newnodepath)

# if not(sitemapURLS.__contains__(parsedurl.path)):
# sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2]
# sitemap.insert(parsedurl.path, timestamp, payload)

-vtTree = sitemapURLS['www.vt.edu']
+vtTree = sitemapdomains['www.vt.edu']
vtTreeCopy = vtTree.extract('20140906125541', '20141215204723')
-result = vtTree.comparison(vtTreeCopy)
+result = vtTree.comparison(vtTreeCopy.root)
print(result)
result = vtTree.comparison(vtTree.root)
print(result)
print('done')

# if not(sitemapURLS.__contains__(parsedurl.path)):
# sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2]
# sitemap.insert(parsedurl.path, timestamp, payload)

+matrix = vtTree.ancestorMatrix()
+matrix = np.asarray(matrix)
+print('done')
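A small follow-on sketch (not in the commit): once main.py has built matrix and vtTree for 'www.vt.edu', the integer indices can be mapped back to path segments to read the ancestor relations directly. It assumes matrix and vtTree exist exactly as computed above; index_to_segment is a hypothetical helper name.

    import numpy as np

    # Invert the segment -> index map that Trie.insert() populated.
    index_to_segment = {idx: seg for seg, idx in vtTree.matrixElements.items()}

    # matrix[i][j] == 1 means segment i is an ancestor of segment j.
    rows, cols = np.nonzero(matrix)
    for r, c in zip(rows, cols):
        print(index_to_segment[r], '->', index_to_segment[c])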