Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
CS-6604-WebArchive
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
namanahuja
CS-6604-WebArchive
Commits
ad0e9c6e
Commit
ad0e9c6e
authored
5 years ago
by
Naman
Browse files
Options
Downloads
Patches
Plain Diff
latest
parent
670644ee
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
ExampleNotebooks/classifyArchives.ipynb
+533
-83
533 additions, 83 deletions
ExampleNotebooks/classifyArchives.ipynb
ExampleNotebooks/dataExtract.ipynb
+30
-9
30 additions, 9 deletions
ExampleNotebooks/dataExtract.ipynb
with
563 additions
and
92 deletions
ExampleNotebooks/classifyArchives.ipynb
+
533
−
83
View file @
ad0e9c6e
Source diff could not be displayed: it is too large. Options to address this:
view the blob
.
This diff is collapsed.
Click to expand it.
ExampleNotebooks/dataExtract.ipynb
+
30
−
9
View file @
ad0e9c6e
...
...
@@ -47,14 +47,14 @@
},
{
"cell_type": "code",
"execution_count":
2
2,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1
7
8 parquet files found\n"
"1
9
8 parquet files found\n"
]
}
],
...
...
@@ -74,7 +74,7 @@
},
{
"cell_type": "code",
"execution_count":
2
3,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
...
...
@@ -97,7 +97,7 @@
},
{
"cell_type": "code",
"execution_count":
2
4,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
...
...
@@ -106,7 +106,7 @@
},
{
"cell_type": "code",
"execution_count":
2
5,
"execution_count": 5,
"metadata": {},
"outputs": [
{
...
...
@@ -223,6 +223,7 @@
"Processing File 108\n",
"Processing File 109\n",
"Processing File 110\n",
"Processing File 111\n",
"Processing File 112\n",
"Processing File 113\n",
"Processing File 114\n",
...
...
@@ -289,7 +290,27 @@
"Processing File 175\n",
"Processing File 176\n",
"Processing File 177\n",
"Processing File 178\n"
"Processing File 178\n",
"Processing File 179\n",
"Processing File 180\n",
"Processing File 181\n",
"Processing File 182\n",
"Processing File 183\n",
"Processing File 184\n",
"Processing File 185\n",
"Processing File 186\n",
"Processing File 187\n",
"Processing File 188\n",
"Processing File 189\n",
"Processing File 190\n",
"Processing File 191\n",
"Processing File 192\n",
"Processing File 193\n",
"Processing File 194\n",
"Processing File 195\n",
"Processing File 196\n",
"Processing File 197\n",
"Processing File 198\n"
]
}
],
...
...
@@ -325,7 +346,7 @@
},
{
"cell_type": "code",
"execution_count":
2
6,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
...
...
@@ -334,7 +355,7 @@
},
{
"cell_type": "code",
"execution_count":
29
,
"execution_count":
7
,
"metadata": {},
"outputs": [],
"source": [
...
...
@@ -343,7 +364,7 @@
},
{
"cell_type": "code",
"execution_count":
31
,
"execution_count":
8
,
"metadata": {},
"outputs": [],
"source": [
...
...
%% Cell type:code id: tags:
```
python
import
os
import
pandas
as
pd
from
html_similarity
import
style_similarity
,
structural_similarity
,
similarity
from
bs4
import
BeautifulSoup
,
Doctype
from
bs4.element
import
Comment
from
collections
import
Counter
from
scipy.spatial
import
distance
from
nltk.corpus
import
stopwords
from
nltk.tokenize
import
word_tokenize
from
nltk.tokenize.treebank
import
TreebankWordDetokenizer
import
string
import
spacy
from
nltk.metrics
import
edit_distance
from
nltk.metrics
import
edit_distance
from
nltk.metrics
import
interval_distance
from
nltk
import
jaccard_distance
import
textdistance
from
sklearn.model_selection
import
train_test_split
from
sklearn.preprocessing
import
StandardScaler
from
sklearn.ensemble
import
RandomForestClassifier
from
sklearn.metrics
import
accuracy_score
import
datetime
import
fbprophet
import
gc
from
fastparquet
import
ParquetFile
import
pyarrow.parquet
as
pq
import
json
from
pyspark.sql
import
SparkSession
from
pyspark.sql
import
SQLContext
```
%% Output
ERROR:fbprophet:Importing plotly failed. Interactive plots will not work.
%% Cell type:code id: tags:
```
python
# Recursively collect every *.parquet file under the data directory and
# report how many were found.
root = '../data'
modelUrl = 'cnn.com'

parquetFiles = []
# NOTE(fix): the original loop reused `root` as the os.walk loop variable,
# clobbering the configured data-directory path; a distinct name keeps the
# setting intact. The unused `path = root.split(os.sep)` local was dropped.
for dirpath, _dirnames, filenames in os.walk(root):
    for name in filenames:
        if name.endswith(".parquet"):
            parquetFiles.append(os.path.join(dirpath, name))

print(f"{len(parquetFiles)} parquet files found")
```
%% Output
178 parquet files found
198 parquet files found
%% Cell type:code id: tags:
```
python
# Build (or reuse) a local Spark session with generous driver/executor
# memory and off-heap storage for the large parquet inputs.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.executor.memory", "70g")
    .config("spark.driver.memory", "50g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "14g")
    .appName("sampleCodeForReference")
    .config("spark.driver.cores", "12")
    .getOrCreate()
)

# NOTE(review): vectorized parquet reading is turned off — presumably to
# work around read failures on these payload columns; confirm before removing.
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

sc = spark.sparkContext
sqlContext = SQLContext(sc)
```
%% Cell type:code id: tags:
```
python
# Accumulator for the harvested snapshots: one dict per matching archive
# record, with 'payload' and 'timestamp' keys (filled by the loop below).
archiveData = []
```
%% Cell type:code id: tags:
```
python
# For each parquet file, read the first row and keep the (payload, timestamp)
# pair when it is an HTML capture of the model URL's host.
for k, parquet_path in enumerate(parquetFiles, start=1):
    try:
        frame = sqlContext.read.parquet(parquet_path)
        # FIX: the original issued four separate `file.rdd.take(1)[0]`
        # actions (one per field), triggering a Spark job each time;
        # a single take(1) fetches the row once.
        row = frame.rdd.take(1)[0]
        UriComponents = row.originalUrl.split('/')
        payload = row.payload
        mime = row.mime
        # assumes the capture timestamp follows a 4-char prefix before the
        # first '.' in the filename — TODO confirm against the crawler output
        timestamp = row.filename.split('.')[0][4:]
        print("Processing File " + str(k))
        if mime == 'text/html' and len(payload) > 1 and modelUrl in UriComponents[-1]:
            archiveData.append({'payload': payload, 'timestamp': timestamp})
    except Exception:
        # Best-effort: unreadable or empty parquet files are skipped.
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.
        pass
```
%% Output
Processing File 1
Processing File 2
Processing File 3
Processing File 4
Processing File 5
Processing File 6
Processing File 7
Processing File 8
Processing File 9
Processing File 10
Processing File 11
Processing File 12
Processing File 13
Processing File 14
Processing File 15
Processing File 16
Processing File 17
Processing File 18
Processing File 19
Processing File 20
Processing File 21
Processing File 22
Processing File 23
Processing File 24
Processing File 25
Processing File 26
Processing File 27
Processing File 28
Processing File 29
Processing File 30
Processing File 31
Processing File 32
Processing File 33
Processing File 34
Processing File 35
Processing File 36
Processing File 37
Processing File 38
Processing File 39
Processing File 40
Processing File 41
Processing File 42
Processing File 43
Processing File 44
Processing File 45
Processing File 46
Processing File 47
Processing File 48
Processing File 49
Processing File 50
Processing File 51
Processing File 52
Processing File 53
Processing File 54
Processing File 55
Processing File 56
Processing File 57
Processing File 58
Processing File 59
Processing File 60
Processing File 61
Processing File 62
Processing File 63
Processing File 64
Processing File 65
Processing File 66
Processing File 67
Processing File 68
Processing File 69
Processing File 70
Processing File 71
Processing File 72
Processing File 73
Processing File 74
Processing File 75
Processing File 76
Processing File 77
Processing File 78
Processing File 79
Processing File 80
Processing File 81
Processing File 82
Processing File 83
Processing File 84
Processing File 85
Processing File 86
Processing File 87
Processing File 88
Processing File 89
Processing File 90
Processing File 91
Processing File 92
Processing File 93
Processing File 94
Processing File 95
Processing File 96
Processing File 97
Processing File 98
Processing File 99
Processing File 100
Processing File 101
Processing File 102
Processing File 103
Processing File 104
Processing File 105
Processing File 106
Processing File 107
Processing File 108
Processing File 109
Processing File 110
Processing File 111
Processing File 112
Processing File 113
Processing File 114
Processing File 115
Processing File 116
Processing File 117
Processing File 118
Processing File 119
Processing File 120
Processing File 121
Processing File 122
Processing File 123
Processing File 124
Processing File 125
Processing File 126
Processing File 127
Processing File 128
Processing File 129
Processing File 130
Processing File 131
Processing File 132
Processing File 133
Processing File 134
Processing File 135
Processing File 136
Processing File 137
Processing File 138
Processing File 139
Processing File 140
Processing File 141
Processing File 142
Processing File 143
Processing File 144
Processing File 145
Processing File 146
Processing File 147
Processing File 148
Processing File 149
Processing File 150
Processing File 151
Processing File 152
Processing File 153
Processing File 154
Processing File 155
Processing File 156
Processing File 157
Processing File 158
Processing File 159
Processing File 160
Processing File 161
Processing File 162
Processing File 163
Processing File 164
Processing File 165
Processing File 166
Processing File 167
Processing File 168
Processing File 169
Processing File 170
Processing File 171
Processing File 172
Processing File 173
Processing File 174
Processing File 175
Processing File 176
Processing File 177
Processing File 178
Processing File 179
Processing File 180
Processing File 181
Processing File 182
Processing File 183
Processing File 184
Processing File 185
Processing File 186
Processing File 187
Processing File 188
Processing File 189
Processing File 190
Processing File 191
Processing File 192
Processing File 193
Processing File 194
Processing File 195
Processing File 196
Processing File 197
Processing File 198
%% Cell type:code id: tags:
```
python
# Order the harvested snapshots chronologically (ascending timestamp).
def _by_timestamp(record):
    return record['timestamp']

archiveData.sort(key=_by_timestamp)
```
%% Cell type:code id: tags:
```
python
# Materialise the snapshot dicts as a two-column pandas frame.
_columns = ['payload', 'timestamp']
df = pd.DataFrame(archiveData, columns=_columns)
```
%% Cell type:code id: tags:
```
python
# Persist the assembled frame for downstream notebooks (e.g. the
# classification notebook in this directory) to load without re-scanning
# the parquet files.
df.to_pickle("./archiveData.pkl")
```
%% Cell type:code id: tags:
```
python
```
%% Cell type:code id: tags:
```
python
```
%% Cell type:code id: tags:
```
python
```
%% Cell type:code id: tags:
```
python
```
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment