Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
CS-6604-WebArchive
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
xw0078
CS-6604-WebArchive
Commits
32cda435
Commit
32cda435
authored
5 years ago
by
Ritesh Bansal
Browse files
Options
Downloads
Patches
Plain Diff
bugs
parent
e14fb7fe
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
CNN_1hour2levelMainNew.py
+3
-3
3 additions, 3 deletions
CNN_1hour2levelMainNew.py
Trie.py
+36
-2
36 additions, 2 deletions
Trie.py
with
39 additions
and
5 deletions
CNN_1hour2levelMainNew.py
+
3
−
3
View file @
32cda435
...
@@ -20,10 +20,10 @@ sitemapdomains = cnnFocusCrawl.makingSitemapTree(data_train)
...
@@ -20,10 +20,10 @@ sitemapdomains = cnnFocusCrawl.makingSitemapTree(data_train)
sitemapdomains
=
cnnFocusCrawl
.
testingSitemapTreeClassiyRF
(
sitemapdomains
,
data_test
)
sitemapdomains
=
cnnFocusCrawl
.
testingSitemapTreeClassiyRF
(
sitemapdomains
,
data_test
)
edition_cnn_com
=
sitemapdomains
[
'
www.cnn.com
'
]
edition_cnn_com
=
sitemapdomains
[
'
www.cnn.com
'
]
edition_cnn_com_Copy
=
edition_cnn_com
.
extract
()
edition_cnn_com_Copy
=
edition_cnn_com
.
extract
(
""
,
""
)
result
=
edition_cnn_com
.
comparison
(
edition_cnn_com_Copy
.
root
)
result
=
edition_cnn_com
.
isSame
(
edition_cnn_com_Copy
.
root
)
print
(
result
)
print
(
result
)
result
=
edition_cnn_com
.
comparison
(
edition_cnn_com
.
root
)
result
=
edition_cnn_com
.
isSame
(
edition_cnn_com
.
root
)
print
(
result
)
print
(
result
)
matrix
=
edition_cnn_com
.
ancestorMatrix
()
matrix
=
edition_cnn_com
.
ancestorMatrix
()
matrix
=
np
.
asarray
(
matrix
)
matrix
=
np
.
asarray
(
matrix
)
...
...
This diff is collapsed.
Click to expand it.
Trie.py
+
36
−
2
View file @
32cda435
...
@@ -27,6 +27,22 @@ class Trie:
...
@@ -27,6 +27,22 @@ class Trie:
# Returns new trie node (initialized to NULLs)
# Returns new trie node (initialized to NULLs)
return
TrieNode
()
return
TrieNode
()
def isStructureChange(self, url):
    """Report whether ``url`` contains a path segment missing from the trie.

    The URL is split on ``/``. The first piece (index 0) is never looked
    up, and empty pieces (produced by leading, trailing or doubled
    slashes) are ignored. The remaining pieces are followed one by one
    through ``children``, starting at ``self.root``.

    Args:
        url: Slash-separated URL string.

    Returns:
        True as soon as a piece is not among the current node's children;
        False when every piece was found (including when there is nothing
        to look up).
    """
    node = self.root
    # Piece 0 is skipped, exactly as the original index loop started at 1.
    for level in url.split('/')[1:]:
        if not level:
            continue
        if level not in node.children:
            return True
        node = node.children[level]
    return False
def
insert
(
self
,
url
,
timestamp
,
payload
):
def
insert
(
self
,
url
,
timestamp
,
payload
):
newNodePath
=
''
newNodePath
=
''
urlSplit
=
url
.
split
(
'
/
'
)
urlSplit
=
url
.
split
(
'
/
'
)
...
@@ -54,15 +70,34 @@ class Trie:
...
@@ -54,15 +70,34 @@ class Trie:
isnewpath
=
True
isnewpath
=
True
return
(
isnewpath
,
newNodePath
)
return
(
isnewpath
,
newNodePath
)
def extractNodeData(self, url):
    """Return the ``data`` attribute of the trie node addressed by ``url``.

    The URL is split on ``/``. The first piece (index 0) is never looked
    up, and empty pieces are ignored. The remaining pieces are followed
    through ``children``, starting at ``self.root``.

    Args:
        url: Slash-separated URL string.

    Returns:
        The ``data`` of the node reached; the root's ``data`` when there
        is no piece to follow.

    Raises:
        KeyError: If a piece is not present among the current node's
            children.
    """
    node = self.root
    # Piece 0 is skipped, exactly as the original index loop started at 1.
    for level in url.split('/')[1:]:
        if not level:
            continue
        # Deliberately unguarded: a missing segment raises KeyError.
        node = node.children[level]
    return node.data
def extract(self, startTimestamp, endTimeStamp):
    """Build a new ``Trie`` from this one, restricted to a timestamp range.

    A bound that is ``None`` or blank is replaced by a default: ``"0"``
    for the start and ``str(sys.maxsize)`` for the end. The actual
    filtering is delegated to ``self.root.extract``.

    Args:
        startTimestamp: Lower bound as a string, or ``None``/blank.
        endTimeStamp: Upper bound as a string, or ``None``/blank.

    Returns:
        A new ``Trie`` whose ``root`` is the result of
        ``self.root.extract(startTimestamp, endTimeStamp)``.
    """
    if startTimestamp is None or not startTimestamp.strip():
        startTimestamp = "0"
    if endTimeStamp is None or not endTimeStamp.strip():
        import sys
        endTimeStamp = str(sys.maxsize)

    trieCopy = Trie()
    # counter and matrixElements are assigned, not copied, so the new trie
    # refers to the same objects as this one.
    trieCopy.counter = self.counter
    trieCopy.matrixElements = self.matrixElements
    trieCopy.root = self.root.extract(startTimestamp, endTimeStamp)
    return trieCopy
def
comparison
(
self
,
tree1
):
def
isSame
(
self
,
tree1
):
# compare two trees
# compare two trees
from
collections
import
deque
from
collections
import
deque
stack_tree2
=
deque
()
stack_tree2
=
deque
()
...
@@ -109,7 +144,6 @@ class Trie:
...
@@ -109,7 +144,6 @@ class Trie:
anc
.
append
(
node
.
name
)
anc
.
append
(
node
.
name
)
# Traverse left and right subtrees
# Traverse left and right subtrees
for
child
in
node
.
children
:
for
child
in
node
.
children
:
pCrawlJunior
=
node
.
children
[
child
]
pCrawlJunior
=
node
.
children
[
child
]
mat
=
self
.
ancestorMatrixRec
(
pCrawlJunior
,
anc
,
mat
)
mat
=
self
.
ancestorMatrixRec
(
pCrawlJunior
,
anc
,
mat
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment