%% Cell type:code id: tags:
``` python
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
import datetime
import fbprophet
import gc
import math
import numpy as np
from fastparquet import ParquetFile
```
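The NLTK calls used below (`stopwords.words`, `word_tokenize`) assume the corresponding corpora are already installed. If they are not, a minimal one-time setup sketch is:

``` python
import nltk

# One-time downloads for the corpora used by filter_text below.
nltk.download('stopwords')  # backs nltk.corpus.stopwords
nltk.download('punkt')      # tokenizer models behind word_tokenize
```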
%% Cell type:code id: tags:
``` python
def tag_visible(element):
    # Drop text nodes that live inside non-rendered tags or HTML comments.
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(htmlPage):
    # Extract only the human-visible text from an HTML payload.
    soup = BeautifulSoup(htmlPage, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

def split(word):
    return [char for char in word]

def filter_text(text):
    # Strip NLTK and spaCy stop words, punctuation, and whitespace escapes.
    # Note: loading the spaCy model on every call is slow; it could be
    # hoisted to module level.
    stop_words = set(stopwords.words('english'))
    stop_words.update(split(string.punctuation))
    nlp = spacy.load('en_core_web_sm')
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    stop_words.update(spacy_stopwords)
    #stop_words.update(["\t","\n","\r"])
    text = text.replace("\n", "")
    text = text.replace("\r", "")
    text = text.replace("\t", "")
    word_tokens_text = word_tokenize(text)
    filtered_text = [w for w in word_tokens_text if w not in stop_words]
    filtered_text = TreebankWordDetokenizer().detokenize(filtered_text)
    return filtered_text
```
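A quick smoke test for these helpers on a made-up HTML snippet (a sketch; it assumes the spaCy `en_core_web_sm` model is installed, e.g. via `python -m spacy download en_core_web_sm`):

``` python
# Hypothetical sanity check: script/title text should be dropped by
# text_from_html, and stop words/punctuation stripped by filter_text.
sample = ("<html><head><title>x</title><script>var a=1;</script></head>"
          "<body><p>Breaking news from the archive today!</p></body></html>")
print(text_from_html(sample))               # visible body text only
print(filter_text(text_from_html(sample)))  # stop words removed
```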
%% Cell type:code id: tags:
``` python
archiveData = pd.read_pickle("./archiveData.pkl")
```
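A small check of what was loaded; the pickle is produced by the collection notebook later in this diff, so the frame should have `payload` and `timestamp` columns, with timestamps as `%Y%m%d%H%M%S` strings:

``` python
print(archiveData.shape)                 # one row per capture
print(archiveData.columns.tolist())      # ['payload', 'timestamp']
print(archiveData.iloc[0]['timestamp'])  # a '%Y%m%d%H%M%S' string
```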
%% Cell type:code id: tags:
``` python
def classifyProphet(archiveData):
    # Label each snapshot as changed/unchanged relative to the last saved
    # baseline, then fit a Prophet model on the change events to forecast
    # when the page is likely to change next.
    basePayload = archiveData.iloc[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    baseTimestamp = datetime.datetime.strptime(archiveData.iloc[0]['timestamp'], '%Y%m%d%H%M%S')
    lastSavedDataIndex = 0
    timeElapsed = 0
    dataset = []
    for i in range(1, len(archiveData)):
        print("Processing " + str(i))
        hasContentChanged = 0
        overallSimilarity = similarity(basePayload, archiveData.iloc[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData.iloc[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData.iloc[i]['payload'])
        timestamp = datetime.datetime.strptime(archiveData.iloc[i]['timestamp'], '%Y%m%d%H%M%S')
        archiveText = text_from_html(archiveData.iloc[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #print(overallSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity)
        if overallSimilarity < 0.98 or cosineSimilarity < 1:
            # Content changed: this snapshot becomes the new baseline.
            hasContentChanged = 1
            lastSavedDataIndex = i
            basePayload = archiveData.iloc[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
            baseTimestamp = datetime.datetime.strptime(archiveData.iloc[i]['timestamp'], '%Y%m%d%H%M%S')
        data = [timestamp, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['ds', 'y'])
    m = fbprophet.Prophet()
    m.fit(df)
    print(df)
    future = m.make_future_dataframe(periods=1, freq="5H", include_history=False)
    forecast = m.predict(future)
    allfuture = m.make_future_dataframe(periods=72, freq="5H", include_history=True)
    allforecast = m.predict(allfuture)
    print(forecast)
    fig1 = m.plot(allforecast)
    fig2 = m.plot_components(allforecast)
    return forecast
```
%% Cell type:code id: tags:
``` python
classifyProphet(archiveData)
```
%% Output
Processing 1
Processing 2
...
Processing 176
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
                      ds  y
0    2019-11-06 17:10:18  0
1    2019-11-06 19:10:18  0
2    2019-11-06 20:10:18  0
3    2019-11-07 00:10:18  0
4    2019-11-07 01:10:18  0
..                   ...  ..
171  2019-11-19 06:10:18  0
172  2019-11-19 07:10:18  0
173  2019-11-19 08:10:18  0
174  2019-11-19 09:10:18  0
175  2019-11-19 11:10:18  0
[176 rows x 2 columns]
                   ds     trend  yhat_lower  yhat_upper  trend_lower  \
0 2019-11-19 16:10:18  0.031191   -0.024171    0.247586     0.031191
   trend_upper  additive_terms  additive_terms_lower  additive_terms_upper  \
0     0.031191         0.07207               0.07207               0.07207
     daily  daily_lower  daily_upper  multiplicative_terms  \
0  0.07207      0.07207      0.07207                   0.0
   multiplicative_terms_lower  multiplicative_terms_upper      yhat
0                         0.0                         0.0  0.103261
                   ds     trend  yhat_lower  yhat_upper  trend_lower  \
0 2019-11-19 16:10:18  0.031191   -0.024171    0.247586     0.031191
   trend_upper  additive_terms  additive_terms_lower  additive_terms_upper  \
0     0.031191         0.07207               0.07207               0.07207
     daily  daily_lower  daily_upper  multiplicative_terms  \
0  0.07207      0.07207      0.07207                   0.0
   multiplicative_terms_lower  multiplicative_terms_upper      yhat
0                         0.0                         0.0  0.103261
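One way to act on this forecast (a sketch, not part of the notebook): threshold the one-step-ahead `yhat` to decide whether the crawler should fetch the page again. This assumes the cell above assigned the return value, e.g. `forecast = classifyProphet(archiveData)`, and the 0.5 cut-off is an assumption.

``` python
# The forecast frame has a single future row (periods=1, no history).
next_step = forecast.iloc[0]
should_recrawl = next_step['yhat'] >= 0.5   # assumed threshold
print(next_step['ds'], round(next_step['yhat'], 4), should_recrawl)
```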
%% Cell type:code id: tags:
``` python
def classifyWindowSVM(archiveData):
    # Build per-snapshot similarity features, then train an SVM on sliding
    # windows of inter-snapshot time gaps to predict whether an upcoming
    # snapshot will have changed.
    basePayload = archiveData.iloc[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    baseTimestamp = datetime.datetime.strptime(archiveData.iloc[0]['timestamp'], '%Y%m%d%H%M%S')
    lastSavedDataIndex = 0
    timeElapsed = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if i % 10 == 0:
            print(str(i) + " Records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData.iloc[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData.iloc[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData.iloc[i]['payload'])
        timestamp = datetime.datetime.strptime(archiveData.iloc[i]['timestamp'], '%Y%m%d%H%M%S')
        # Hours elapsed since the last saved (baseline) snapshot.
        baseTimestampDiff = ((timestamp - baseTimestamp).total_seconds()) / 3600.0
        archiveText = text_from_html(archiveData.iloc[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        if overallSimilarity < 0.98 or cosineSimilarity < 1:
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData.iloc[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
            baseTimestamp = datetime.datetime.strptime(archiveData.iloc[i]['timestamp'], '%Y%m%d%H%M%S')
        data = [baseTimestampDiff, overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['timeDiff', 'similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = []
    y = []
    windowSize = 10
    for i in range(len(df.index) - windowSize - 1):
        bound = min(i + windowSize + 1, len(archiveData))
        # Each example is a window of 10 consecutive timeDiff values; the
        # label is the 'changed' flag at index bound, two rows past the
        # window's last element.
        window = df.iloc[i:bound - 1, 0].values
        windowLabel = df.iloc[bound, 7]
        X.append(window)
        y.append(windowLabel)
    # Chronological 80/20 split (no shuffling).
    trainEx = math.floor(len(X) * 0.8)
    X_train = X[0:trainEx]
    y_train = y[0:trainEx]
    X_test = X[trainEx:]
    y_test = y[trainEx:]
    print("Starting SVM Classification")
    print(X_train, y_train)
    regressor = svm.SVC()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    print(y_pred)
    return np.mean(y_test == y_pred)
```
%% Cell type:code id: tags:
``` python
print(classifyWindowSVM(archiveData))
```
%% Output
177 datapoints found
10 Records processed
20 Records processed
30 Records processed
40 Records processed
50 Records processed
60 Records processed
70 Records processed
80 Records processed
90 Records processed
100 Records processed
110 Records processed
120 Records processed
130 Records processed
140 Records processed
150 Records processed
160 Records processed
170 Records processed
Dataframe created
Starting SVM Classification
[array([ 1., 3., 4., 8., 9., 11., 13., 14., 17., 21.]), array([ 3., 4., 8., 9., 11., 13., 14., 17., 21., 22.]), array([ 4., 8., 9., 11., 13., 14., 17., 21., 22., 24.]), array([ 8., 9., 11., 13., 14., 17., 21., 22., 24., 26.]), array([ 9., 11., 13., 14., 17., 21., 22., 24., 26., 27.]), array([11., 13., 14., 17., 21., 22., 24., 26., 27., 28.]), array([13., 14., 17., 21., 22., 24., 26., 27., 28., 29.]), array([14., 17., 21., 22., 24., 26., 27., 28., 29., 31.]), array([17., 21., 22., 24., 26., 27., 28., 29., 31., 32.]), array([21., 22., 24., 26., 27., 28., 29., 31., 32., 33.]), array([22., 24., 26., 27., 28., 29., 31., 32., 33., 34.]), array([24., 26., 27., 28., 29., 31., 32., 33., 34., 36.]), array([26., 27., 28., 29., 31., 32., 33., 34., 36., 39.]), array([27., 28., 29., 31., 32., 33., 34., 36., 39., 40.]), array([28., 29., 31., 32., 33., 34., 36., 39., 40., 41.]), array([29., 31., 32., 33., 34., 36., 39., 40., 41., 43.]), array([31., 32., 33., 34., 36., 39., 40., 41., 43., 44.]), array([32., 33., 34., 36., 39., 40., 41., 43., 44., 45.]), array([33., 34., 36., 39., 40., 41., 43., 44., 45., 48.]), array([34., 36., 39., 40., 41., 43., 44., 45., 48., 49.]), array([36., 39., 40., 41., 43., 44., 45., 48., 49., 50.]), array([39., 40., 41., 43., 44., 45., 48., 49., 50., 51.]), array([40., 41., 43., 44., 45., 48., 49., 50., 51., 52.]), array([41., 43., 44., 45., 48., 49., 50., 51., 52., 54.]), array([43., 44., 45., 48., 49., 50., 51., 52., 54., 55.]), array([44., 45., 48., 49., 50., 51., 52., 54., 55., 56.]), array([45., 48., 49., 50., 51., 52., 54., 55., 56., 57.]), array([48., 49., 50., 51., 52., 54., 55., 56., 57., 58.]), array([49., 50., 51., 52., 54., 55., 56., 57., 58., 62.]), array([50., 51., 52., 54., 55., 56., 57., 58., 62., 63.]), array([51., 52., 54., 55., 56., 57., 58., 62., 63., 64.]), array([52., 54., 55., 56., 57., 58., 62., 63., 64., 66.]), array([54., 55., 56., 57., 58., 62., 63., 64., 66., 70.]), array([55., 56., 57., 58., 62., 63., 64., 66., 70., 72.]), array([56., 57., 58., 62., 63., 64., 66., 70., 72., 73.]), array([57., 58., 62., 63., 64., 66., 70., 72., 73., 76.]), array([58., 62., 63., 64., 66., 70., 72., 73., 76., 77.]), array([62., 63., 64., 66., 70., 72., 73., 76., 77., 78.]), array([63., 64., 66., 70., 72., 73., 76., 77., 78., 80.]), array([64., 66., 70., 72., 73., 76., 77., 78., 80., 81.]), array([66., 70., 72., 73., 76., 77., 78., 80., 81., 82.]), array([70., 72., 73., 76., 77., 78., 80., 81., 82., 84.]), array([72., 73., 76., 77., 78., 80., 81., 82., 84., 85.]), array([73., 76., 77., 78., 80., 81., 82., 84., 85., 86.]), array([76., 77., 78., 80., 81., 82., 84., 85., 86., 87.]), array([77., 78., 80., 81., 82., 84., 85., 86., 87., 88.]), array([78., 80., 81., 82., 84., 85., 86., 87., 88., 89.]), array([80., 81., 82., 84., 85., 86., 87., 88., 89., 90.]), array([81., 82., 84., 85., 86., 87., 88., 89., 90., 91.]), array([82., 84., 85., 86., 87., 88., 89., 90., 91., 92.]), array([84., 85., 86., 87., 88., 89., 90., 91., 92., 96.]), array([85., 86., 87., 88., 89., 90., 91., 92., 96., 97.]), array([86., 87., 88., 89., 90., 91., 92., 96., 97., 98.]), array([87., 88., 89., 90., 91., 92., 96., 97., 98., 99.]), array([ 88., 89., 90., 91., 92., 96., 97., 98., 99., 100.]), array([ 89., 90., 91., 92., 96., 97., 98., 99., 100., 103.]), array([ 90., 91., 92., 96., 97., 98., 99., 100., 103., 106.]), array([ 91., 92., 96., 97., 98., 99., 100., 103., 106., 108.]), array([ 92., 96., 97., 98., 99., 100., 103., 106., 108., 109.]), array([ 96., 97., 98., 99., 100., 103., 106., 108., 109., 
110.]), array([ 97., 98., 99., 100., 103., 106., 108., 109., 110., 111.]), array([ 98., 99., 100., 103., 106., 108., 109., 110., 111., 112.]), array([ 99., 100., 103., 106., 108., 109., 110., 111., 112., 118.]), array([100., 103., 106., 108., 109., 110., 111., 112., 118., 120.]), array([103., 106., 108., 109., 110., 111., 112., 118., 120., 122.]), array([106., 108., 109., 110., 111., 112., 118., 120., 122., 123.]), array([108., 109., 110., 111., 112., 118., 120., 122., 123., 124.]), array([109., 110., 111., 112., 118., 120., 122., 123., 124., 126.]), array([110., 111., 112., 118., 120., 122., 123., 124., 126., 128.]), array([111., 112., 118., 120., 122., 123., 124., 126., 128., 129.]), array([112., 118., 120., 122., 123., 124., 126., 128., 129., 130.]), array([118., 120., 122., 123., 124., 126., 128., 129., 130., 132.]), array([120., 122., 123., 124., 126., 128., 129., 130., 132., 133.]), array([122., 123., 124., 126., 128., 129., 130., 132., 133., 135.]), array([123., 124., 126., 128., 129., 130., 132., 133., 135., 139.]), array([124., 126., 128., 129., 130., 132., 133., 135., 139., 142.]), array([126., 128., 129., 130., 132., 133., 135., 139., 142., 143.]), array([128., 129., 130., 132., 133., 135., 139., 142., 143., 144.]), array([129., 130., 132., 133., 135., 139., 142., 143., 144., 146.]), array([130., 132., 133., 135., 139., 142., 143., 144., 146., 150.]), array([132., 133., 135., 139., 142., 143., 144., 146., 150., 152.]), array([133., 135., 139., 142., 143., 144., 146., 150., 152., 154.]), array([135., 139., 142., 143., 144., 146., 150., 152., 154., 157.]), array([139., 142., 143., 144., 146., 150., 152., 154., 157., 158.]), array([142., 143., 144., 146., 150., 152., 154., 157., 158., 159.]), array([143., 144., 146., 150., 152., 154., 157., 158., 159., 161.]), array([144., 146., 150., 152., 154., 157., 158., 159., 161., 164.]), array([146., 150., 152., 154., 157., 158., 159., 161., 164., 165.]), array([150., 152., 154., 157., 158., 159., 161., 164., 165., 166.]), array([152., 154., 157., 158., 159., 161., 164., 165., 166., 169.]), array([154., 157., 158., 159., 161., 164., 165., 166., 169., 170.]), array([157., 158., 159., 161., 164., 165., 166., 169., 170., 171.]), array([158., 159., 161., 164., 165., 166., 169., 170., 171., 172.]), array([159., 161., 164., 165., 166., 169., 170., 171., 172., 173.]), array([161., 164., 165., 166., 169., 170., 171., 172., 173., 174.]), array([164., 165., 166., 169., 170., 171., 172., 173., 174., 176.]), array([165., 166., 169., 170., 171., 172., 173., 174., 176., 178.]), array([166., 169., 170., 171., 172., 173., 174., 176., 178., 179.]), array([169., 170., 171., 172., 173., 174., 176., 178., 179., 181.]), array([170., 171., 172., 173., 174., 176., 178., 179., 181., 184.]), array([171., 172., 173., 174., 176., 178., 179., 181., 184., 185.]), array([172., 173., 174., 176., 178., 179., 181., 184., 185., 188.]), array([173., 174., 176., 178., 179., 181., 184., 185., 188., 190.]), array([174., 176., 178., 179., 181., 184., 185., 188., 190., 192.]), array([176., 178., 179., 181., 184., 185., 188., 190., 192., 193.]), array([178., 179., 181., 184., 185., 188., 190., 192., 193., 194.]), array([179., 181., 184., 185., 188., 190., 192., 193., 194., 196.]), array([181., 184., 185., 188., 190., 192., 193., 194., 196., 197.]), array([184., 185., 188., 190., 192., 193., 194., 196., 197., 198.]), array([185., 188., 190., 192., 193., 194., 196., 197., 198., 202.]), array([188., 190., 192., 193., 194., 196., 197., 198., 202., 205.]), array([190., 192., 193., 194., 
196., 197., 198., 202., 205., 210.]), array([192., 193., 194., 196., 197., 198., 202., 205., 210., 214.]), array([193., 194., 196., 197., 198., 202., 205., 210., 214., 215.]), array([194., 196., 197., 198., 202., 205., 210., 214., 215., 2.]), array([196., 197., 198., 202., 205., 210., 214., 215., 2., 3.]), array([197., 198., 202., 205., 210., 214., 215., 2., 3., 6.]), array([198., 202., 205., 210., 214., 215., 2., 3., 6., 7.]), array([202., 205., 210., 214., 215., 2., 3., 6., 7., 8.]), array([205., 210., 214., 215., 2., 3., 6., 7., 8., 9.]), array([210., 214., 215., 2., 3., 6., 7., 8., 9., 11.]), array([214., 215., 2., 3., 6., 7., 8., 9., 11., 13.]), array([215., 2., 3., 6., 7., 8., 9., 11., 13., 14.]), array([ 2., 3., 6., 7., 8., 9., 11., 13., 14., 15.]), array([ 3., 6., 7., 8., 9., 11., 13., 14., 15., 17.]), array([ 6., 7., 8., 9., 11., 13., 14., 15., 17., 18.]), array([ 7., 8., 9., 11., 13., 14., 15., 17., 18., 19.]), array([ 8., 9., 11., 13., 14., 15., 17., 18., 19., 21.]), array([ 9., 11., 13., 14., 15., 17., 18., 19., 21., 23.]), array([11., 13., 14., 15., 17., 18., 19., 21., 23., 24.]), array([13., 14., 15., 17., 18., 19., 21., 23., 24., 26.]), array([14., 15., 17., 18., 19., 21., 23., 24., 26., 29.])] [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
[False False False False False False False False False False False False
False False False False False False False False False False False False
False False False False False False False False False]
0.9696969696969697
/home/naman/anaconda3/envs/dl/lib/python3.7/site-packages/sklearn/svm/base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
"avoid this warning.", FutureWarning)
%% Cell type:code id: tags:
``` python
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import datetime
import fbprophet
import gc
from fastparquet import ParquetFile
import pyarrow.parquet as pq
import json
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
```
%% Output
ERROR:fbprophet:Importing plotly failed. Interactive plots will not work.
%% Cell type:code id: tags:
``` python
root = '../data'
modelUrl = 'cnn.com'
parquetFiles = []
# Collect every .parquet file under the data root.
for root, dirs, files in os.walk(root):
    path = root.split(os.sep)
    for file in files:
        if file.endswith(".parquet"):
            parquetFiles.append(os.path.join(root, file))
print(str(len(parquetFiles)) + " parquet files found")
```
%% Output
178 parquet files found
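Before the Spark pass, the schema of one file can be inspected directly with the `pyarrow.parquet` import above (a sketch; the field names are assumptions taken from how the loop below uses them):

``` python
# Expecting at least originalUrl, mime, filename, and payload fields.
print(pq.read_schema(parquetFiles[0]))
```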
%% Cell type:code id: tags:
``` python
spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.executor.memory", "70g") \
    .config("spark.driver.memory", "50g") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.memory.offHeap.size", "14g") \
    .appName("sampleCodeForReference") \
    .config("spark.driver.cores", "12") \
    .getOrCreate()
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")
sc = spark.sparkContext
sqlContext = SQLContext(sc)
```
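If Spark is heavier than needed for single-record reads, the same first-record peek could be done with fastparquet (imported earlier); a minimal sketch under that assumption:

``` python
# Read just the columns the loop below needs from one file.
pf = ParquetFile(parquetFiles[0])
first = pf.to_pandas(columns=['originalUrl', 'mime', 'filename']).iloc[0]
print(first['mime'], first['filename'])
```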
%% Cell type:code id: tags:
``` python
archiveData = []
```
%% Cell type:code id: tags:
``` python
for k in range(len(parquetFiles)):
    #print("Processing File " + str(k+1))
    try:
        file = sqlContext.read.parquet(parquetFiles[k])
        # Read the first record once instead of four separate take(1) calls.
        record = file.rdd.take(1)[0]
        UriComponents = record.originalUrl.split('/')
        payload = record.payload
        mime = record.mime
        filename = record.filename
        # The capture timestamp is embedded in the filename after a
        # 4-character prefix.
        timestamp = filename.split('.')[0][4:]
        #print(mime, UriComponents, len(payload))
        print("Processing File " + str(k+1))
        # Keep only non-empty HTML captures of the target site.
        if mime == 'text/html' and len(payload) > 1 and modelUrl in UriComponents[-1]:
            currentData = {}
            currentData['payload'] = payload
            currentData['timestamp'] = timestamp
            archiveData.append(currentData)
    except Exception:
        # Unreadable files (e.g. file 111 in the run below) are skipped.
        pass
```
%% Output
Processing File 1
Processing File 2
...
Processing File 110
Processing File 112
...
Processing File 178
%% Cell type:code id: tags:
``` python
archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
```
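Sorting on the raw string is valid here because `%Y%m%d%H%M%S` is fixed-width and zero-padded, so lexicographic and chronological order coincide; an equivalent, more explicit version would parse first:

``` python
import datetime

archiveData.sort(key=lambda x: datetime.datetime.strptime(x['timestamp'], '%Y%m%d%H%M%S'))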
%% Cell type:code id: tags:
``` python
df = pd.DataFrame(archiveData, columns = ['payload', 'timestamp'])
```
%% Cell type:code id: tags:
``` python
df.to_pickle("./archiveData.pkl")
```
#!/usr/bin/env python
# coding: utf-8
# In[2]:
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
# In[3]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True
# In[4]:
def text_from_html(htmlPage):
    soup = BeautifulSoup(htmlPage, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)
# In[5]:
def split(word):
    return [char for char in word]
# In[6]:
def filter_text(text):
    stop_words = set(stopwords.words('english'))
    stop_words.update(split(string.punctuation))
    nlp = spacy.load('en_core_web_sm')
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    stop_words.update(spacy_stopwords)
    #stop_words.update(["\t","\n","\r"])
    text = text.replace("\n", "")
    text = text.replace("\r", "")
    text = text.replace("\t", "")
    word_tokens_text = word_tokenize(text)
    filtered_text = [w for w in word_tokens_text if w not in stop_words]
    filtered_text = TreebankWordDetokenizer().detokenize(filtered_text)
    return filtered_text
# In[7]:
def classifyRF(archiveData, newRecord):
    # Train a random forest on similarity features between consecutive
    # snapshots, then predict whether newRecord represents a change.
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    lastSavedDataIndex = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if i % 100 == 0:
            print(str(i) + " Records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        if overallSimilarity < 0.80 or cosineSimilarity < 0.95:
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    X_test = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]
    # Scale the new record with the same fitted scaler (the original code
    # predicted on unscaled features, which mismatched the training data).
    X_test = sc.transform([X_test])
    print("Starting Random Forest Classification")
    classifier = RandomForestClassifier(n_estimators=20, random_state=0)
    classifier.fit(X_train, y)
    y_pred = classifier.predict(X_test)
    return y_pred
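# A hypothetical invocation of classifyRF (a sketch; the names below are
# assumptions): archive_rows is a list of {'payload': html, 'timestamp':
# '%Y%m%d%H%M%S'} dicts as built by the collection notebook, and new_record
# is one such dict for the latest capture.
#
#   prediction = classifyRF(archive_rows, new_record)
#   print(prediction)  # e.g. array([ True]) if the new capture looks changed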
# In[ ]:
def classifySVM(archiveData, newRecord):
    # Same feature pipeline as classifyRF, but with an SVM classifier.
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    lastSavedDataIndex = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if i % 100 == 0:
            print(str(i) + " Records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        if overallSimilarity < 0.80 or cosineSimilarity < 0.95:
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    X_test = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]
    # Scale the new record with the same fitted scaler, as in classifyRF.
    X_test = sc.transform([X_test])
    print("Starting SVM Classification")
    classifier = svm.SVC()
    classifier.fit(X_train, y)
    y_pred = classifier.predict(X_test)
    return y_pred