Compare revisions
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}
%% Cell type:code id: tags:
``` python
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
import datetime
import fbprophet
import gc
import math
import numpy as np
from fastparquet import ParquetFile
```
%% Cell type:code id: tags:
``` python
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(htmlPage):
    soup = BeautifulSoup(htmlPage, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

def split(word):
    return [char for char in word]

def filter_text(text):
    # Build a combined NLTK + spaCy stopword list, plus punctuation.
    stop_words = set(stopwords.words('english'))
    stop_words.update(split(string.punctuation))
    nlp = spacy.load('en_core_web_sm')
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    stop_words.update(spacy_stopwords)
    #stop_words.update(["\t", "\n", "\r"])
    text = text.replace("\n", "")
    text = text.replace("\r", "")
    text = text.replace("\t", "")
    word_tokens_text = word_tokenize(text)
    filtered_text = [w for w in word_tokens_text if w not in stop_words]
    filtered_text = TreebankWordDetokenizer().detokenize(filtered_text)
    return filtered_text
```
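%% Cell type:markdown id: tags:
A quick sanity check of the helpers above: a minimal sketch (my addition), assuming the NLTK `stopwords`/`punkt` data and the spaCy `en_core_web_sm` model are already downloaded.
%% Cell type:code id: tags:
``` python
sample = "<html><head><title>hidden</title></head><body><p>The quick brown fox jumps over a lazy dog!</p></body></html>"
visible = text_from_html(sample)   # <title> text is dropped, body text kept
print(filter_text(visible))        # punctuation/stopwords removed, roughly: "The quick brown fox jumps lazy dog"
```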
%% Cell type:code id: tags:
``` python
archiveData = pd.read_pickle("./archiveData.pkl")
```
%% Cell type:code id: tags:
``` python
def classifyProphet(archiveData):
    basePayload = archiveData.iloc[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    baseTimestamp = datetime.datetime.strptime(archiveData.iloc[0]['timestamp'], '%Y%m%d%H%M%S')
    lastSavedDataIndex = 0
    timeElapsed = 0
    dataset = []
    for i in range(1, len(archiveData)):
        print("Processing " + str(i))
        hasContentChanged = 0
        overallSimilarity = similarity(basePayload, archiveData.iloc[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData.iloc[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData.iloc[i]['payload'])
        timestamp = datetime.datetime.strptime(archiveData.iloc[i]['timestamp'], '%Y%m%d%H%M%S')
        archiveText = text_from_html(archiveData.iloc[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #print(overallSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity)
        if (overallSimilarity < 0.98 or cosineSimilarity < 1):
            # Any drop below the thresholds counts as a content change;
            # this snapshot becomes the new comparison baseline.
            hasContentChanged = 1
            lastSavedDataIndex = i
            basePayload = archiveData.iloc[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
            baseTimestamp = datetime.datetime.strptime(archiveData.iloc[i]['timestamp'], '%Y%m%d%H%M%S')
        data = [timestamp, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['ds', 'y'])
    m = fbprophet.Prophet()
    m.fit(df)
    print(df)
    future = m.make_future_dataframe(periods=1, freq="5H", include_history=False)
    forecast = m.predict(future)
    allfuture = m.make_future_dataframe(periods=72, freq="5H", include_history=True)
    allforecast = m.predict(allfuture)
    print(forecast)
    fig1 = m.plot(allforecast)
    fig2 = m.plot_components(allforecast)
    return forecast
```
%% Cell type:code id: tags:
``` python
classifyProphet(archiveData)
```
%% Output
Processing 1
Processing 2
...
Processing 176
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
ds y
0 2019-11-06 17:10:18 0
1 2019-11-06 19:10:18 0
2 2019-11-06 20:10:18 0
3 2019-11-07 00:10:18 0
4 2019-11-07 01:10:18 0
.. ... ..
171 2019-11-19 06:10:18 0
172 2019-11-19 07:10:18 0
173 2019-11-19 08:10:18 0
174 2019-11-19 09:10:18 0
175 2019-11-19 11:10:18 0
[176 rows x 2 columns]
ds                            2019-11-19 16:10:18
trend                                    0.031191
yhat_lower                              -0.024171
yhat_upper                               0.247586
trend_lower                              0.031191
trend_upper                              0.031191
additive_terms                           0.072070
additive_terms_lower                     0.072070
additive_terms_upper                     0.072070
daily                                    0.072070
daily_lower                              0.072070
daily_upper                              0.072070
multiplicative_terms                     0.000000
multiplicative_terms_lower               0.000000
multiplicative_terms_upper               0.000000
yhat                                     0.103261
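%% Cell type:markdown id: tags:
For readability, the one-row forecast above can be narrowed to its key columns. A small follow-up sketch (my addition, which re-runs the classifier to capture its return value; `yhat` is Prophet's expected value of the binary change flag `y`):
%% Cell type:code id: tags:
``` python
forecast = classifyProphet(archiveData)   # capture the returned one-row frame
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']])
# yhat ~ 0.10 for 2019-11-19 16:10:18, i.e. the model leans toward
# "no change" at the next 5-hour capture step.
```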
%% Cell type:code id: tags:
``` python
def classifyWindowSVM(archiveData):
    basePayload = archiveData.iloc[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    baseTimestamp = datetime.datetime.strptime(archiveData.iloc[0]['timestamp'], '%Y%m%d%H%M%S')
    lastSavedDataIndex = 0
    timeElapsed = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if (i % 10 == 0):
            print(str(i) + " Records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData.iloc[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData.iloc[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData.iloc[i]['payload'])
        timestamp = datetime.datetime.strptime(archiveData.iloc[i]['timestamp'], '%Y%m%d%H%M%S')
        baseTimestampDiff = ((timestamp - baseTimestamp).total_seconds()) / 3600.0
        archiveText = text_from_html(archiveData.iloc[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        if (overallSimilarity < 0.98 or cosineSimilarity < 1):
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData.iloc[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
            baseTimestamp = datetime.datetime.strptime(archiveData.iloc[i]['timestamp'], '%Y%m%d%H%M%S')
        data = [baseTimestampDiff, overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['timeDiff', 'similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = []
    y = []
    windowSize = 10
    for i in range(len(df.index) - windowSize - 1):
        bound = min(i + windowSize + 1, len(archiveData))
        window = df.iloc[i:bound - 1, 0].values   # ten consecutive timeDiff values
        windowLabel = df.iloc[bound, 7]           # 'changed' flag just past the window
        X.append(window)
        y.append(windowLabel)
    trainEx = math.floor(len(X) * 0.8)
    X_train = X[0:trainEx]
    y_train = y[0:trainEx]
    X_test = X[trainEx:]
    y_test = y[trainEx:]
    print("Starting SVM Classification")
    print(X_train, y_train)
    regressor = svm.SVC()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    print(y_pred)
    return np.mean(y_test == y_pred)
```
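%% Cell type:markdown id: tags:
To make the windowing explicit, here is a toy illustration (my own, not from the notebook) of how ten consecutive `timeDiff` values become one feature vector whose label is the `changed` flag one row past the window. The `min(...)` boundary clamp is ignored for brevity:
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd

toy = pd.DataFrame({'timeDiff': np.arange(15, dtype=float),
                    'changed': [False] * 14 + [True]})
windowSize = 10
X, y = [], []
for i in range(len(toy.index) - windowSize - 1):
    window = toy.iloc[i:i + windowSize, 0].values   # rows i .. i+9 of timeDiff
    label = toy.iloc[i + windowSize + 1, 1]         # 'changed' flag just past the window
    X.append(window)
    y.append(label)
print(X[0], y[0])   # [0. 1. ... 9.] labelled by row 11 -> False
```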
%% Cell type:code id: tags:
``` python
print(classifyWindowSVM(archiveData))
```
%% Output
177 datapoints found
10 Records processed
20 Records processed
30 Records processed
40 Records processed
50 Records processed
60 Records processed
70 Records processed
80 Records processed
90 Records processed
100 Records processed
110 Records processed
120 Records processed
130 Records processed
140 Records processed
150 Records processed
160 Records processed
170 Records processed
Dataframe created
Starting SVM Classification
[array([ 1., 3., 4., 8., 9., 11., 13., 14., 17., 21.]), array([ 3., 4., 8., 9., 11., 13., 14., 17., 21., 22.]), array([ 4., 8., 9., 11., 13., 14., 17., 21., 22., 24.]), array([ 8., 9., 11., 13., 14., 17., 21., 22., 24., 26.]), array([ 9., 11., 13., 14., 17., 21., 22., 24., 26., 27.]), array([11., 13., 14., 17., 21., 22., 24., 26., 27., 28.]), array([13., 14., 17., 21., 22., 24., 26., 27., 28., 29.]), array([14., 17., 21., 22., 24., 26., 27., 28., 29., 31.]), array([17., 21., 22., 24., 26., 27., 28., 29., 31., 32.]), array([21., 22., 24., 26., 27., 28., 29., 31., 32., 33.]), array([22., 24., 26., 27., 28., 29., 31., 32., 33., 34.]), array([24., 26., 27., 28., 29., 31., 32., 33., 34., 36.]), array([26., 27., 28., 29., 31., 32., 33., 34., 36., 39.]), array([27., 28., 29., 31., 32., 33., 34., 36., 39., 40.]), array([28., 29., 31., 32., 33., 34., 36., 39., 40., 41.]), array([29., 31., 32., 33., 34., 36., 39., 40., 41., 43.]), array([31., 32., 33., 34., 36., 39., 40., 41., 43., 44.]), array([32., 33., 34., 36., 39., 40., 41., 43., 44., 45.]), array([33., 34., 36., 39., 40., 41., 43., 44., 45., 48.]), array([34., 36., 39., 40., 41., 43., 44., 45., 48., 49.]), array([36., 39., 40., 41., 43., 44., 45., 48., 49., 50.]), array([39., 40., 41., 43., 44., 45., 48., 49., 50., 51.]), array([40., 41., 43., 44., 45., 48., 49., 50., 51., 52.]), array([41., 43., 44., 45., 48., 49., 50., 51., 52., 54.]), array([43., 44., 45., 48., 49., 50., 51., 52., 54., 55.]), array([44., 45., 48., 49., 50., 51., 52., 54., 55., 56.]), array([45., 48., 49., 50., 51., 52., 54., 55., 56., 57.]), array([48., 49., 50., 51., 52., 54., 55., 56., 57., 58.]), array([49., 50., 51., 52., 54., 55., 56., 57., 58., 62.]), array([50., 51., 52., 54., 55., 56., 57., 58., 62., 63.]), array([51., 52., 54., 55., 56., 57., 58., 62., 63., 64.]), array([52., 54., 55., 56., 57., 58., 62., 63., 64., 66.]), array([54., 55., 56., 57., 58., 62., 63., 64., 66., 70.]), array([55., 56., 57., 58., 62., 63., 64., 66., 70., 72.]), array([56., 57., 58., 62., 63., 64., 66., 70., 72., 73.]), array([57., 58., 62., 63., 64., 66., 70., 72., 73., 76.]), array([58., 62., 63., 64., 66., 70., 72., 73., 76., 77.]), array([62., 63., 64., 66., 70., 72., 73., 76., 77., 78.]), array([63., 64., 66., 70., 72., 73., 76., 77., 78., 80.]), array([64., 66., 70., 72., 73., 76., 77., 78., 80., 81.]), array([66., 70., 72., 73., 76., 77., 78., 80., 81., 82.]), array([70., 72., 73., 76., 77., 78., 80., 81., 82., 84.]), array([72., 73., 76., 77., 78., 80., 81., 82., 84., 85.]), array([73., 76., 77., 78., 80., 81., 82., 84., 85., 86.]), array([76., 77., 78., 80., 81., 82., 84., 85., 86., 87.]), array([77., 78., 80., 81., 82., 84., 85., 86., 87., 88.]), array([78., 80., 81., 82., 84., 85., 86., 87., 88., 89.]), array([80., 81., 82., 84., 85., 86., 87., 88., 89., 90.]), array([81., 82., 84., 85., 86., 87., 88., 89., 90., 91.]), array([82., 84., 85., 86., 87., 88., 89., 90., 91., 92.]), array([84., 85., 86., 87., 88., 89., 90., 91., 92., 96.]), array([85., 86., 87., 88., 89., 90., 91., 92., 96., 97.]), array([86., 87., 88., 89., 90., 91., 92., 96., 97., 98.]), array([87., 88., 89., 90., 91., 92., 96., 97., 98., 99.]), array([ 88., 89., 90., 91., 92., 96., 97., 98., 99., 100.]), array([ 89., 90., 91., 92., 96., 97., 98., 99., 100., 103.]), array([ 90., 91., 92., 96., 97., 98., 99., 100., 103., 106.]), array([ 91., 92., 96., 97., 98., 99., 100., 103., 106., 108.]), array([ 92., 96., 97., 98., 99., 100., 103., 106., 108., 109.]), array([ 96., 97., 98., 99., 100., 103., 106., 108., 109., 
110.]), array([ 97., 98., 99., 100., 103., 106., 108., 109., 110., 111.]), array([ 98., 99., 100., 103., 106., 108., 109., 110., 111., 112.]), array([ 99., 100., 103., 106., 108., 109., 110., 111., 112., 118.]), array([100., 103., 106., 108., 109., 110., 111., 112., 118., 120.]), array([103., 106., 108., 109., 110., 111., 112., 118., 120., 122.]), array([106., 108., 109., 110., 111., 112., 118., 120., 122., 123.]), array([108., 109., 110., 111., 112., 118., 120., 122., 123., 124.]), array([109., 110., 111., 112., 118., 120., 122., 123., 124., 126.]), array([110., 111., 112., 118., 120., 122., 123., 124., 126., 128.]), array([111., 112., 118., 120., 122., 123., 124., 126., 128., 129.]), array([112., 118., 120., 122., 123., 124., 126., 128., 129., 130.]), array([118., 120., 122., 123., 124., 126., 128., 129., 130., 132.]), array([120., 122., 123., 124., 126., 128., 129., 130., 132., 133.]), array([122., 123., 124., 126., 128., 129., 130., 132., 133., 135.]), array([123., 124., 126., 128., 129., 130., 132., 133., 135., 139.]), array([124., 126., 128., 129., 130., 132., 133., 135., 139., 142.]), array([126., 128., 129., 130., 132., 133., 135., 139., 142., 143.]), array([128., 129., 130., 132., 133., 135., 139., 142., 143., 144.]), array([129., 130., 132., 133., 135., 139., 142., 143., 144., 146.]), array([130., 132., 133., 135., 139., 142., 143., 144., 146., 150.]), array([132., 133., 135., 139., 142., 143., 144., 146., 150., 152.]), array([133., 135., 139., 142., 143., 144., 146., 150., 152., 154.]), array([135., 139., 142., 143., 144., 146., 150., 152., 154., 157.]), array([139., 142., 143., 144., 146., 150., 152., 154., 157., 158.]), array([142., 143., 144., 146., 150., 152., 154., 157., 158., 159.]), array([143., 144., 146., 150., 152., 154., 157., 158., 159., 161.]), array([144., 146., 150., 152., 154., 157., 158., 159., 161., 164.]), array([146., 150., 152., 154., 157., 158., 159., 161., 164., 165.]), array([150., 152., 154., 157., 158., 159., 161., 164., 165., 166.]), array([152., 154., 157., 158., 159., 161., 164., 165., 166., 169.]), array([154., 157., 158., 159., 161., 164., 165., 166., 169., 170.]), array([157., 158., 159., 161., 164., 165., 166., 169., 170., 171.]), array([158., 159., 161., 164., 165., 166., 169., 170., 171., 172.]), array([159., 161., 164., 165., 166., 169., 170., 171., 172., 173.]), array([161., 164., 165., 166., 169., 170., 171., 172., 173., 174.]), array([164., 165., 166., 169., 170., 171., 172., 173., 174., 176.]), array([165., 166., 169., 170., 171., 172., 173., 174., 176., 178.]), array([166., 169., 170., 171., 172., 173., 174., 176., 178., 179.]), array([169., 170., 171., 172., 173., 174., 176., 178., 179., 181.]), array([170., 171., 172., 173., 174., 176., 178., 179., 181., 184.]), array([171., 172., 173., 174., 176., 178., 179., 181., 184., 185.]), array([172., 173., 174., 176., 178., 179., 181., 184., 185., 188.]), array([173., 174., 176., 178., 179., 181., 184., 185., 188., 190.]), array([174., 176., 178., 179., 181., 184., 185., 188., 190., 192.]), array([176., 178., 179., 181., 184., 185., 188., 190., 192., 193.]), array([178., 179., 181., 184., 185., 188., 190., 192., 193., 194.]), array([179., 181., 184., 185., 188., 190., 192., 193., 194., 196.]), array([181., 184., 185., 188., 190., 192., 193., 194., 196., 197.]), array([184., 185., 188., 190., 192., 193., 194., 196., 197., 198.]), array([185., 188., 190., 192., 193., 194., 196., 197., 198., 202.]), array([188., 190., 192., 193., 194., 196., 197., 198., 202., 205.]), array([190., 192., 193., 194., 
196., 197., 198., 202., 205., 210.]), array([192., 193., 194., 196., 197., 198., 202., 205., 210., 214.]), array([193., 194., 196., 197., 198., 202., 205., 210., 214., 215.]), array([194., 196., 197., 198., 202., 205., 210., 214., 215., 2.]), array([196., 197., 198., 202., 205., 210., 214., 215., 2., 3.]), array([197., 198., 202., 205., 210., 214., 215., 2., 3., 6.]), array([198., 202., 205., 210., 214., 215., 2., 3., 6., 7.]), array([202., 205., 210., 214., 215., 2., 3., 6., 7., 8.]), array([205., 210., 214., 215., 2., 3., 6., 7., 8., 9.]), array([210., 214., 215., 2., 3., 6., 7., 8., 9., 11.]), array([214., 215., 2., 3., 6., 7., 8., 9., 11., 13.]), array([215., 2., 3., 6., 7., 8., 9., 11., 13., 14.]), array([ 2., 3., 6., 7., 8., 9., 11., 13., 14., 15.]), array([ 3., 6., 7., 8., 9., 11., 13., 14., 15., 17.]), array([ 6., 7., 8., 9., 11., 13., 14., 15., 17., 18.]), array([ 7., 8., 9., 11., 13., 14., 15., 17., 18., 19.]), array([ 8., 9., 11., 13., 14., 15., 17., 18., 19., 21.]), array([ 9., 11., 13., 14., 15., 17., 18., 19., 21., 23.]), array([11., 13., 14., 15., 17., 18., 19., 21., 23., 24.]), array([13., 14., 15., 17., 18., 19., 21., 23., 24., 26.]), array([14., 15., 17., 18., 19., 21., 23., 24., 26., 29.])] [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
[False False False False False False False False False False False False
False False False False False False False False False False False False
False False False False False False False False False]
0.9696969696969697
/home/naman/anaconda3/envs/dl/lib/python3.7/site-packages/sklearn/svm/base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
"avoid this warning.", FutureWarning)
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
import datetime
import fbprophet
```
%% Output
ERROR:fbprophet:Importing plotly failed. Interactive plots will not work.
%% Cell type:code id: tags:
``` python
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True
```
%% Cell type:code id: tags:
``` python
def text_from_html(htmlPage):
    soup = BeautifulSoup(htmlPage, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)
```
%% Cell type:code id: tags:
``` python
def split(word):
    return [char for char in word]
```
%% Cell type:code id: tags:
``` python
def filter_text(text):
    stop_words = set(stopwords.words('english'))
    stop_words.update(split(string.punctuation))
    nlp = spacy.load('en_core_web_sm')
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    stop_words.update(spacy_stopwords)
    #stop_words.update(["\t", "\n", "\r"])
    text = text.replace("\n", "")
    text = text.replace("\r", "")
    text = text.replace("\t", "")
    word_tokens_text = word_tokenize(text)
    filtered_text = [w for w in word_tokens_text if w not in stop_words]
    filtered_text = TreebankWordDetokenizer().detokenize(filtered_text)
    return filtered_text
```
%% Cell type:code id: tags:
``` python
root = 'data'
modelUrl = 'vt.edu'
parquetFiles = []
for root, dirs, files in os.walk(root):
    path = root.split(os.sep)
    for file in files:
        if file.endswith(".parquet"):
            parquetFiles.append(os.path.join(root, file))
parquetFiles.sort()
```
%% Cell type:code id: tags:
``` python
archiveData = []
for fileName in parquetFiles:
    file = pd.read_parquet(fileName, engine='pyarrow')
    numRows = len(file.index)
    for i in range(numRows):
        UriComponents = file.iloc[i].originalUrl.split('/')
        payload = file.iloc[i].payload
        mime = file.iloc[i].mime
        timestamp = file.iloc[i].timestamp
        if (mime == 'text/html' and len(payload) > 1 and modelUrl in UriComponents[-2]):
            currentData = {}
            currentData['payload'] = payload
            currentData['timestamp'] = timestamp
            archiveData.append(currentData)
archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
```
%% Cell type:code id: tags:
``` python
basePayload = archiveData[0]['payload']
basePayloadText = text_from_html(basePayload)
basePayloadFilteredText = filter_text(basePayloadText)
baseTimestamp = datetime.datetime.strptime(archiveData[0]['timestamp'], '%Y%m%d%H%M%S')
lastSavedDataIndex = 0
timeElapsed = 0
dataset = []
for i in range(1, len(archiveData)):
    hasContentChanged = 0
    overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
    styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
    structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
    timestamp = datetime.datetime.strptime(archiveData[i]['timestamp'], '%Y%m%d%H%M%S')
    archiveText = text_from_html(archiveData[i]['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    if (overallSimilarity < 0.80 or cosineSimilarity < 0.95):
        hasContentChanged = 1
        lastSavedDataIndex = i
        basePayload = archiveData[i]['payload']
        basePayloadText = archiveText
        basePayloadFilteredText = filteredArchiveText
        baseTimestamp = datetime.datetime.strptime(archiveData[i]['timestamp'], '%Y%m%d%H%M%S')
    data = [timestamp, hasContentChanged]
    dataset.append(data)
df = pd.DataFrame(dataset, columns=['ds', 'y'])
```
%% Cell type:code id: tags:
``` python
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df)
```
%% Output
ds y
0 2014-01-02 06:23:07 0
1 2014-01-02 08:24:55 0
2 2014-01-02 08:45:27 1
3 2014-01-02 09:05:47 1
4 2014-01-02 21:19:39 0
5 2014-01-06 03:31:08 0
6 2014-01-06 04:42:17 0
7 2014-01-06 04:42:18 1
8 2014-01-06 04:44:27 1
9 2014-01-06 15:31:52 0
10 2014-01-06 17:56:09 0
11 2014-01-09 19:39:44 0
12 2014-01-09 19:39:44 0
13 2014-01-09 21:59:28 0
14 2014-01-09 21:59:28 0
15 2014-01-10 01:51:01 0
16 2014-01-10 05:06:23 0
17 2014-01-10 21:27:04 0
18 2014-01-11 03:22:41 0
19 2014-01-11 05:19:24 0
20 2014-01-12 11:59:05 0
21 2014-01-12 11:59:05 0
22 2014-01-12 16:04:34 0
23 2014-01-12 16:04:34 0
24 2014-01-13 14:25:07 0
25 2014-01-13 21:00:45 0
26 2014-01-13 23:00:43 0
27 2014-01-15 00:46:07 0
28 2014-01-15 00:46:07 0
29 2014-01-15 06:07:06 0
30 2014-01-15 06:07:06 0
31 2014-01-15 08:45:41 1
32 2014-01-15 08:45:41 0
33 2014-01-16 16:57:55 1
34 2014-01-16 16:57:55 0
35 2014-01-16 19:16:27 0
36 2014-01-16 19:16:27 0
37 2014-01-17 19:18:46 0
38 2014-01-17 19:18:46 0
39 2014-01-17 21:41:44 0
40 2014-01-17 21:41:44 0
41 2014-01-18 16:42:28 0
42 2014-01-18 16:42:28 0
43 2014-01-18 16:42:29 1
44 2014-01-18 16:42:29 1
45 2014-01-19 05:57:12 1
46 2014-01-19 05:57:12 0
47 2014-01-19 07:10:17 0
48 2014-01-19 07:10:17 0
49 2014-01-19 07:54:58 0
50 2014-01-19 07:54:58 0
51 2014-01-19 16:36:26 0
52 2014-01-19 16:36:26 0
53 2014-01-19 16:36:27 1
54 2014-01-19 16:36:27 1
55 2014-01-20 04:05:00 1
56 2014-01-20 04:05:00 0
57 2014-01-20 04:37:30 0
58 2014-01-20 04:37:30 0
59 2014-01-20 16:51:04 0
60 2014-01-20 16:51:04 0
61 2014-01-20 17:47:00 0
62 2014-01-20 17:47:00 0
63 2014-01-20 17:47:01 1
64 2014-01-20 17:47:01 1
65 2014-01-24 02:45:03 1
66 2014-01-24 05:33:17 0
67 2014-01-24 05:33:32 1
68 2014-01-24 18:32:02 1
69 2014-01-24 18:58:56 0
70 2014-01-25 14:08:09 0
71 2014-01-27 02:17:39 1
72 2014-01-27 12:57:49 1
73 2014-01-27 13:34:52 0
74 2014-01-28 18:32:58 0
75 2014-01-28 18:32:58 0
76 2014-01-28 19:14:12 0
77 2014-01-28 19:14:12 0
78 2014-01-29 14:22:26 0
79 2014-01-29 14:22:26 0
80 2014-01-29 15:19:38 0
81 2014-01-29 15:19:38 0
82 2014-01-30 14:57:47 0
83 2014-01-30 14:57:47 0
84 2014-01-30 15:46:02 0
85 2014-01-30 15:46:02 0
86 2014-01-31 05:22:38 0
87 2014-01-31 05:22:38 0
88 2014-01-31 05:22:40 1
89 2014-01-31 05:54:36 1
90 2014-01-31 05:54:36 0
91 2014-01-31 05:54:56 1
92 2014-01-31 05:54:56 0
93 2014-01-31 15:39:57 1
94 2014-01-31 15:39:57 0
95 2014-01-31 15:39:57 0
96 2014-01-31 15:39:57 0
97 2014-01-31 15:56:57 0
98 2014-01-31 15:56:57 0
99 2014-01-31 16:44:07 0
100 2014-01-31 16:44:07 0
101 2014-02-01 16:35:43 0
%% Cell type:code id: tags:
``` python
m = fbprophet.Prophet()
m.fit(df)
```
%% Output
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
<fbprophet.forecaster.Prophet at 0x7f88a6454e80>
%% Cell type:code id: tags:
``` python
future = m.make_future_dataframe(periods=5, freq="5H", include_history = True)
forecast = m.predict(future)
```
%% Cell type:code id: tags:
``` python
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']])
```
%% Output
ds yhat yhat_lower yhat_upper
0 2014-01-02 06:23:07 0.301154 -0.209832 0.786637
1 2014-01-02 08:24:55 0.481716 -0.022577 0.923686
2 2014-01-02 08:45:27 0.528029 0.054056 1.025637
3 2014-01-02 09:05:47 0.568961 0.075138 1.052453
4 2014-01-02 21:19:39 -0.066840 -0.548597 0.433626
5 2014-01-06 03:31:08 0.451446 -0.004701 0.937194
6 2014-01-06 04:42:17 0.396473 -0.079246 0.919093
7 2014-01-06 04:42:18 0.396457 -0.069923 0.875722
8 2014-01-06 04:44:27 0.394399 -0.108454 0.859912
9 2014-01-06 15:31:52 0.120939 -0.365588 0.613614
10 2014-01-06 17:56:09 0.337558 -0.165478 0.803918
11 2014-01-09 19:39:44 0.115910 -0.345411 0.605042
12 2014-01-09 19:39:44 0.115910 -0.369545 0.630505
13 2014-01-09 21:59:28 -0.017147 -0.510574 0.495736
14 2014-01-09 21:59:28 -0.017147 -0.520591 0.444783
15 2014-01-10 01:51:01 0.381821 -0.088473 0.847155
16 2014-01-10 05:06:23 0.298256 -0.186433 0.798374
17 2014-01-10 21:27:04 -0.059535 -0.538476 0.457655
18 2014-01-11 03:22:41 0.375980 -0.121407 0.848701
19 2014-01-11 05:19:24 0.269404 -0.217147 0.747495
20 2014-01-12 11:59:05 0.250720 -0.240855 0.692035
21 2014-01-12 11:59:05 0.250720 -0.219700 0.750856
22 2014-01-12 16:04:34 0.181727 -0.321655 0.651186
23 2014-01-12 16:04:34 0.181727 -0.258054 0.674920
24 2014-01-13 14:25:07 0.014647 -0.483959 0.501471
25 2014-01-13 21:00:45 -0.069628 -0.585214 0.419536
26 2014-01-13 23:00:43 0.038134 -0.424401 0.545902
27 2014-01-15 00:46:07 0.086748 -0.398095 0.535729
28 2014-01-15 00:46:07 0.086748 -0.356840 0.559423
29 2014-01-15 06:07:06 0.107494 -0.363596 0.615086
30 2014-01-15 06:07:06 0.107494 -0.377719 0.592525
31 2014-01-15 08:45:41 0.374386 -0.109331 0.859783
32 2014-01-15 08:45:41 0.374386 -0.115335 0.862611
33 2014-01-16 16:57:55 0.373912 -0.091702 0.858834
34 2014-01-16 16:57:55 0.373912 -0.135959 0.888679
35 2014-01-16 19:16:27 0.207019 -0.280475 0.682437
36 2014-01-16 19:16:27 0.207019 -0.283235 0.690337
37 2014-01-17 19:18:46 0.175258 -0.300540 0.661739
38 2014-01-17 19:18:46 0.175258 -0.346439 0.662989
39 2014-01-17 21:41:44 -0.027686 -0.511957 0.455303
40 2014-01-17 21:41:44 -0.027686 -0.501050 0.486177
41 2014-01-18 16:42:28 0.235133 -0.250183 0.745811
42 2014-01-18 16:42:28 0.235133 -0.264631 0.740877
43 2014-01-18 16:42:29 0.235166 -0.318422 0.692317
44 2014-01-18 16:42:29 0.235166 -0.246304 0.690065
45 2014-01-19 05:57:12 0.202653 -0.290616 0.675664
46 2014-01-19 05:57:12 0.202653 -0.282254 0.699371
47 2014-01-19 07:10:17 0.240329 -0.236855 0.788029
48 2014-01-19 07:10:17 0.240329 -0.243340 0.734593
49 2014-01-19 07:54:58 0.323653 -0.143582 0.816076
50 2014-01-19 07:54:58 0.323653 -0.165810 0.817118
51 2014-01-19 16:36:26 0.296798 -0.141055 0.755271
52 2014-01-19 16:36:26 0.296798 -0.173162 0.764311
53 2014-01-19 16:36:27 0.296839 -0.174661 0.751170
54 2014-01-19 16:36:27 0.296839 -0.217148 0.784472
55 2014-01-20 04:05:00 0.477437 0.001442 0.984067
56 2014-01-20 04:05:00 0.477437 0.027484 1.015459
57 2014-01-20 04:37:30 0.448685 -0.067710 0.978810
58 2014-01-20 04:37:30 0.448685 -0.062217 0.962453
59 2014-01-20 16:51:04 0.369736 -0.118153 0.854328
60 2014-01-20 16:51:04 0.369736 -0.106555 0.879499
61 2014-01-20 17:47:00 0.394175 -0.108565 0.850560
62 2014-01-20 17:47:00 0.394175 -0.109186 0.893989
63 2014-01-20 17:47:01 0.394162 -0.079906 0.893696
64 2014-01-20 17:47:01 0.394162 -0.052524 0.911933
65 2014-01-24 02:45:03 0.446108 -0.023596 0.954046
66 2014-01-24 05:33:17 0.322382 -0.181396 0.813396
67 2014-01-24 05:33:32 0.322197 -0.131517 0.819520
68 2014-01-24 18:32:02 0.322079 -0.164513 0.815407
69 2014-01-24 18:58:56 0.254880 -0.251733 0.706612
70 2014-01-25 14:08:09 -0.074844 -0.543373 0.432285
71 2014-01-27 02:17:39 0.524953 0.015077 1.001371
72 2014-01-27 12:57:49 0.192673 -0.271437 0.722455
73 2014-01-27 13:34:52 0.092480 -0.418278 0.593815
74 2014-01-28 18:32:58 0.069132 -0.420431 0.562176
75 2014-01-28 18:32:58 0.069132 -0.422038 0.527794
76 2014-01-28 19:14:12 -0.037719 -0.472525 0.474678
77 2014-01-28 19:14:12 -0.037719 -0.525146 0.470822
78 2014-01-29 14:22:26 -0.040749 -0.503921 0.482987
79 2014-01-29 14:22:26 -0.040749 -0.519798 0.436592
80 2014-01-29 15:19:38 0.079868 -0.433312 0.558957
81 2014-01-29 15:19:38 0.079868 -0.398617 0.548653
82 2014-01-30 14:57:47 0.112257 -0.370978 0.596200
83 2014-01-30 14:57:47 0.112257 -0.365768 0.608632
84 2014-01-30 15:46:02 0.242557 -0.251430 0.737558
85 2014-01-30 15:46:02 0.242557 -0.205900 0.730044
86 2014-01-31 05:22:38 0.354749 -0.105646 0.855112
87 2014-01-31 05:22:38 0.354749 -0.136055 0.842609
88 2014-01-31 05:22:40 0.354721 -0.130697 0.850031
89 2014-01-31 05:54:36 0.333572 -0.146379 0.832398
90 2014-01-31 05:54:36 0.333572 -0.166266 0.834868
91 2014-01-31 05:54:56 0.333431 -0.179676 0.842891
92 2014-01-31 05:54:56 0.333431 -0.149686 0.799401
93 2014-01-31 15:39:57 0.182952 -0.312721 0.648097
94 2014-01-31 15:39:57 0.182952 -0.311386 0.684947
95 2014-01-31 15:39:57 0.182952 -0.322271 0.642993
96 2014-01-31 15:39:57 0.182952 -0.316130 0.670530
97 2014-01-31 15:56:57 0.235376 -0.258915 0.745846
98 2014-01-31 15:56:57 0.235376 -0.259331 0.730614
99 2014-01-31 16:44:07 0.361386 -0.138696 0.822562
100 2014-01-31 16:44:07 0.361386 -0.098824 0.910457
101 2014-02-01 16:35:43 0.268359 -0.223586 0.770928
102 2014-02-01 21:35:43 -0.076333 -0.605510 0.383764
103 2014-02-02 02:35:43 0.372742 -0.092220 0.843381
104 2014-02-02 07:35:43 0.330733 -0.174839 0.834838
105 2014-02-02 12:35:43 0.177590 -0.305230 0.665102
106 2014-02-02 17:35:43 0.424186 -0.076330 0.924897
%% Cell type:code id: tags:
``` python
fig1 = m.plot(forecast)
```
%% Output
%% Cell type:code id: tags:
``` python
fig2 = m.plot_components(forecast)
```
%% Output
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import datetime
import fbprophet
import gc
from fastparquet import ParquetFile
import pyarrow.parquet as pq
import json
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
```
%% Output
ERROR:fbprophet:Importing plotly failed. Interactive plots will not work.
%% Cell type:code id: tags:
``` python
root = '../data'
modelUrl = 'cnn.com'
parquetFiles = []
for root, dirs, files in os.walk(root):
    path = root.split(os.sep)
    for file in files:
        if file.endswith(".parquet"):
            parquetFiles.append(os.path.join(root, file))
print(str(len(parquetFiles)) + " parquet files found")
```
%% Output
178 parquet files found
%% Cell type:code id: tags:
``` python
spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.executor.memory", "70g") \
    .config("spark.driver.memory", "50g") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.memory.offHeap.size", "14g") \
    .appName("sampleCodeForReference") \
    .config("spark.driver.cores", "12") \
    .getOrCreate()
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")
sc = spark.sparkContext
sqlContext = SQLContext(sc)
```
%% Cell type:code id: tags:
``` python
archiveData = []
```
%% Cell type:code id: tags:
``` python
for k in range(len(parquetFiles)):
    #print("Processing File " + str(k+1))
    try:
        file = sqlContext.read.parquet(parquetFiles[k])
        # Each take(1) below is a separate Spark action on the same file.
        UriComponents = file.rdd.take(1)[0].originalUrl.split('/')
        payload = file.rdd.take(1)[0].payload
        mime = file.rdd.take(1)[0].mime
        filename = file.rdd.take(1)[0].filename
        timestamp = filename.split('.')[0][4:]
        #print(mime, UriComponents, len(payload))
        print("Processing File " + str(k+1))
        if (mime == 'text/html' and len(payload) > 1 and modelUrl in UriComponents[-1]):
            currentData = {}
            currentData['payload'] = payload
            currentData['timestamp'] = timestamp
            archiveData.append(currentData)
    except Exception:
        # Skip files that fail to read or lack the expected columns.
        pass
```
%% Output
Processing File 1
Processing File 2
...
Processing File 110
Processing File 112
...
Processing File 178
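%% Cell type:markdown id: tags:
Each `file.rdd.take(1)` in the cell above launches a separate Spark job, so every file is scanned several times just to read one row. Below is a sketch of the same per-file extraction with a single action (my refactor, assuming the same `originalUrl`, `payload`, `mime`, and `filename` columns):
%% Cell type:code id: tags:
``` python
for k in range(len(parquetFiles)):
    try:
        firstRow = sqlContext.read.parquet(parquetFiles[k]).rdd.take(1)[0]  # one Spark action
        timestamp = firstRow.filename.split('.')[0][4:]
        if (firstRow.mime == 'text/html' and len(firstRow.payload) > 1
                and modelUrl in firstRow.originalUrl.split('/')[-1]):
            archiveData.append({'payload': firstRow.payload, 'timestamp': timestamp})
    except Exception:
        pass  # skip unreadable files, as in the original loop
```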
%% Cell type:code id: tags:
``` python
archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
```
%% Cell type:code id: tags:
``` python
df = pd.DataFrame(archiveData, columns = ['payload', 'timestamp'])
```
%% Cell type:code id: tags:
``` python
df.to_pickle("./archiveData.pkl")
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
......@@ -4,6 +4,18 @@ class TrieNode:
        self.data = {}
        self.isEndOfUrl = False

    def extract(self, startTimestamp, endTimeStamp):
        pCrawl = self
        pCrawlCopy = TrieNode()
        pCrawlCopy.isEndOfUrl = pCrawl.isEndOfUrl
        for data in pCrawl.data:
            if data <= endTimeStamp and data >= startTimestamp:
                pCrawlCopy.data[data] = pCrawl.data[data]
        for child in pCrawl.children:
            pCrawlJunior = pCrawl.children[child]
            pCrawlCopy.children[child] = pCrawlJunior.extract(startTimestamp, endTimeStamp)
        return pCrawlCopy
class Trie:
    def __init__(self):
......@@ -30,15 +42,37 @@ class Trie:
        pCrawl.isEndOfUrl = True

    def extract(self, startTimestamp, endTimeStamp):
        print()
        # extract tree based on given timestamp
        # pCrawl = self.root
        # for child in pCrawl.children:
        #     print(child)
        return self.root.extract(startTimestamp, endTimeStamp)

    def comparison(self, tree1, tree2):
        print()

    def comparison(self, tree1):
        # compare two trees
        from collections import deque
        stack_tree2 = deque()
        stack_tree1 = deque()
        stack_tree2.append(self.root)
        stack_tree1.append(tree1)
        while (len(stack_tree2) != 0):
            tree2 = stack_tree2.pop()
            tree = stack_tree1.pop()
            for data in tree2.data:
                if tree.data.__contains__(data):
                    if not tree.data[data] == tree2.data[data]:
                        return False
                else:
                    return False
            for child in tree2.children:
                if tree.children.__contains__(child):
                    if not stack_tree2.__contains__(child):
                        stack_tree2.append(tree2.children[child])
                        stack_tree1.append(tree.children[child])
                else:
                    return False
        if (len(stack_tree1) != 0):
            return False
        return True

def main():
......
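A self-contained toy run of the `extract()` walk above (my sketch; it assumes `TrieNode.__init__` also creates a `children` dict, which the hunk does not show):

``` python
class TrieNode:
    def __init__(self):
        self.children = {}       # path segment -> TrieNode (assumed)
        self.data = {}           # timestamp -> payload
        self.isEndOfUrl = False

    def extract(self, startTimestamp, endTimeStamp):
        copy = TrieNode()
        copy.isEndOfUrl = self.isEndOfUrl
        for ts in self.data:
            if startTimestamp <= ts <= endTimeStamp:
                copy.data[ts] = self.data[ts]
        for seg, child in self.children.items():
            copy.children[seg] = child.extract(startTimestamp, endTimeStamp)
        return copy

root = TrieNode()
root.children['news'] = TrieNode()
root.children['news'].data = {'20140906125541': '<html>v1</html>',
                              '20150101000000': '<html>v2</html>'}
sub = root.extract('20140906125541', '20141215204723')
print(sub.children['news'].data)   # only the in-range 2014 snapshot survives
```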
#!/usr/bin/env python
# coding: utf-8
# In[2]:
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
# In[3]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

# In[4]:

def text_from_html(htmlPage):
    soup = BeautifulSoup(htmlPage, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

# In[5]:

def split(word):
    return [char for char in word]

# In[6]:

def filter_text(text):
    stop_words = set(stopwords.words('english'))
    stop_words.update(split(string.punctuation))
    nlp = spacy.load('en_core_web_sm')
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    stop_words.update(spacy_stopwords)
    #stop_words.update(["\t", "\n", "\r"])
    text = text.replace("\n", "")
    text = text.replace("\r", "")
    text = text.replace("\t", "")
    word_tokens_text = word_tokenize(text)
    filtered_text = [w for w in word_tokens_text if w not in stop_words]
    filtered_text = TreebankWordDetokenizer().detokenize(filtered_text)
    return filtered_text
# In[ ]:
# In[ ]:
# In[7]:
def classifyRF(archiveData, newRecord):
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    lastSavedDataIndex = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if (i % 100 == 0):
            print(str(i) + " Records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        if (overallSimilarity < 0.80 or cosineSimilarity < 0.95):
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    X_test = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]
    print("Starting Random Forest Classification")
    regressor = RandomForestClassifier(n_estimators=20, random_state=0)
    regressor.fit(X_train, y)
    # Scale the new record with the same scaler that was fit on the training set.
    y_pred = regressor.predict(sc.transform([X_test]))
    return y_pred
# In[ ]:
def classifySVM(archiveData, newRecord):
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    lastSavedDataIndex = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if (i % 100 == 0):
            print(str(i) + " Records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        if (overallSimilarity < 0.80 or cosineSimilarity < 0.95):
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    X_test = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]
    print("Starting SVM Classification")
    regressor = svm.SVC()
    regressor.fit(X_train, y)
    # Scale the new record with the same scaler that was fit on the training set.
    y_pred = regressor.predict(sc.transform([X_test]))
    return y_pred
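# In[ ]:
# Hypothetical usage sketch (not part of the original script). archiveData is a
# list of {'payload': <html string>, 'timestamp': 'YYYYmmddHHMMSS'} dicts and
# newRecord is one further snapshot to classify; both functions return an array
# holding a single boolean.
history = [{'payload': '<html><body class="main"><p>version one of the page</p></body></html>',
            'timestamp': '2014010%d000000' % (t + 1)} for t in range(3)]
history += [{'payload': '<html><body class="main"><p>a thoroughly rewritten body, xyz</p></body></html>',
             'timestamp': '2014010%d000000' % (t + 4)} for t in range(3)]
candidate = {'payload': history[-1]['payload'], 'timestamp': '20140107000000'}
print(classifyRF(history, candidate))   # expect [False]: identical to the latest baseline
# classifySVM needs the toy history to yield both changed and unchanged labels,
# otherwise SVC.fit() raises on a single-class y.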
%% Cell type:markdown id: tags:
# Get download link from Wayback Machine and Download Data
%% Cell type:markdown id: tags:
## List items in the collection
%% Cell type:code id: tags:
``` python
import imp
import waybackcollectiondownloader
from waybackcollectiondownloader import WaybackCollectionDownloader
imp.reload(waybackcollectiondownloader)
downloader = WaybackCollectionDownloader(collectionUrl)
downloader.ScrapeDownloadLinks("/home/xw0078/data/WaybackDownload/twitterarchiveLinks.txt")
```
%% Output
Log In Successful
End at page:
https://archive.org/details/twitterarchive?&sort=-publicdate&page=9
import requests
from bs4 import BeautifulSoup as bs
import re
import sys
import os
import datetime
import errno
class WaybackCollectionDownloader:
    collectionUrl = ""
    collectionUrlSorted = ""
    # Credentials should come from the environment or a config file rather
    # than being hardcoded; the password is redacted here.
    credential = {
        'username': 'xw0078@vt.edu',
        'password': '<redacted>'
    }
    loginLink = "https://archive.org/account/login"
    outputFile = "/home/xw0078/data/WaybackDownload/"
    downloadLinkPrefix = "https://archive.org/download/"

    def __init__(self, collectionUrl, **kwargs):
        self.collectionUrl = collectionUrl
        self.collectionUrlSorted = collectionUrl + "?&sort=-publicdate&page="

    def generateDownloadLinkFile(self, filePath):
        self.outputFile = filePath
        if os.path.isfile(self.outputFile):
            ts = datetime.datetime.now()
            tsStr = ts.strftime("%d-%b-%Y (%H:%M:%S)")
            os.rename(self.outputFile, self.outputFile.replace(".txt", tsStr + ".txt"))
        self.createInputDir(self.outputFile)
        f = open(self.outputFile, "w+")
        f.close()

    def loginValidation(self, session):
        page = session.get(self.loginLink)
        return "Log out" in page.text

    def collectionPageContentValidation(self, pageText):
        return "No results matched your criteria" not in pageText

    def ScrapeDownloadLinks(self, filePath):
        self.generateDownloadLinkFile(filePath)
        with requests.Session() as s:
            # Login
            s.get(self.loginLink)
            s.post(self.loginLink, data=self.credential)
            if not self.loginValidation(s):
                print("Bad Login")
                sys.exit()
            print("Log In Successful")
            # iterate collection pages to get all collection content
            pageNumber = 1
            while True:
                currentPageLink = self.collectionUrlSorted + str(pageNumber)
                currentPage = s.get(currentPageLink)
                if not self.collectionPageContentValidation(currentPage.text):
                    print("End at page:")
                    print(currentPageLink)
                    return
                self.parseWaybackCollectionItems(currentPage, s)
                pageNumber += 1

    def parseWaybackCollectionItems(self, page, session):
        soup = bs(page.text, 'html.parser')
        collectionList = soup.find(class_='results')
        collectionList_items = collectionList.find_all('a', href=re.compile('/details/.*'), class_=lambda x: x != 'stealth')
        for item in collectionList_items:
            itemLink = "https://archive.org" + item.get('href')
            downloadPageLink = itemLink.replace("details", "download")
            itemPage = session.get(downloadPageLink)
            self.parseWaybackItemDownloadPage(itemPage, downloadPageLink, session)

    def parseWaybackItemDownloadPage(self, page, parentLink, session):
        soup = bs(page.text, 'html.parser')
        downloadList = soup.find(class_='directory-listing-table')
        warcItems = downloadList.find_all('a', href=re.compile('.*warc.gz'))
        cdxItems = downloadList.find_all('a', href=re.compile('.*os.cdx.gz'))
        if len(warcItems) != len(cdxItems):
            print("WARC CDX item number not matching: WARC " + str(len(warcItems)) + " cdx " + str(len(cdxItems)))
            sys.exit(0)
        for warc, cdx in zip(warcItems, cdxItems):
            warcLink = parentLink + "/" + warc.get('href')
            cdxLink = parentLink + "/" + cdx.get('href')
            self.appendLineToFile(warcLink)
            self.appendLineToFile(cdxLink)

    def appendLineToFile(self, input):
        f = open(self.outputFile, "a+")
        f.write(input)
        f.write("\r\n")
        f.close()

    def createInputDir(self, input):
        if not os.path.exists(os.path.dirname(input)):
            try:
                os.makedirs(os.path.dirname(input))
            except OSError as exc:
                if exc.errno != errno.EEXIST:
                    raise
\ No newline at end of file
......@@ -45,20 +45,16 @@ for dayData in data_train:
        payload = dayDataNP[i][9]
        sitemap.insert(parsedurl.path, timestamp, payload)
        # if not(sitemapURLS.__contains__(parsedurl.path)):
        #     sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2]
        #     sitemap.insert(parsedurl.path, timestamp, payload)

vtTree = sitemapURLS['www.vt.edu']
vtTreeCopy = vtTree.extract('20140906125541', '20141215204723')
result = vtTree.comparison(vtTreeCopy)
print(result)
result = vtTree.comparison(vtTree.root)
print(result)
print('done')
        # if not(sitemapURLS.__contains__(parsedurl.path)):
        #     sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2]
        #     sitemap.insert(parsedurl.path, timestamp, payload)