%% Cell type:code id: tags:
``` python
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
import datetime
import fbprophet
import gc
import math
import numpy as np
from fastparquet import ParquetFile
```
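The NLTK calls used below (`stopwords.words`, `word_tokenize`) assume the corresponding corpora are already installed. If they are not, a minimal one-time setup sketch is:

``` python
import nltk

# One-time downloads for the corpora used by filter_text below.
nltk.download('stopwords')  # backs nltk.corpus.stopwords
nltk.download('punkt')      # tokenizer models behind word_tokenize
```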
%% Cell type:code id: tags:
``` python
def tag_visible(element):
    # Drop text nodes that live inside non-rendered tags or HTML comments.
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(htmlPage):
    # Extract only the human-visible text from an HTML payload.
    soup = BeautifulSoup(htmlPage, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

def split(word):
    return [char for char in word]

def filter_text(text):
    # Strip NLTK and spaCy stop words, punctuation, and whitespace escapes.
    # Note: loading the spaCy model on every call is slow; it could be
    # hoisted to module level.
    stop_words = set(stopwords.words('english'))
    stop_words.update(split(string.punctuation))
    nlp = spacy.load('en_core_web_sm')
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    stop_words.update(spacy_stopwords)
    #stop_words.update(["\t","\n","\r"])
    text = text.replace("\n", "")
    text = text.replace("\r", "")
    text = text.replace("\t", "")
    word_tokens_text = word_tokenize(text)
    filtered_text = [w for w in word_tokens_text if w not in stop_words]
    filtered_text = TreebankWordDetokenizer().detokenize(filtered_text)
    return filtered_text
```
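A quick smoke test for these helpers on a made-up HTML snippet (a sketch; it assumes the spaCy `en_core_web_sm` model is installed, e.g. via `python -m spacy download en_core_web_sm`):

``` python
# Hypothetical sanity check: script/title text should be dropped by
# text_from_html, and stop words/punctuation stripped by filter_text.
sample = ("<html><head><title>x</title><script>var a=1;</script></head>"
          "<body><p>Breaking news from the archive today!</p></body></html>")
print(text_from_html(sample))               # visible body text only
print(filter_text(text_from_html(sample)))  # stop words removed
```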
%% Cell type:code id: tags:
``` python
archiveData = pd.read_pickle("./archiveData.pkl")
```
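A small check of what was loaded; the pickle is produced by the collection notebook later in this diff, so the frame should have `payload` and `timestamp` columns, with timestamps as `%Y%m%d%H%M%S` strings:

``` python
print(archiveData.shape)                 # one row per capture
print(archiveData.columns.tolist())      # ['payload', 'timestamp']
print(archiveData.iloc[0]['timestamp'])  # a '%Y%m%d%H%M%S' string
```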
%% Cell type:code id: tags:
``` python
def classifyProphet(archiveData):
    # Label each snapshot as changed/unchanged relative to the last saved
    # baseline, then fit a Prophet model on the change events to forecast
    # when the page is likely to change next.
    basePayload = archiveData.iloc[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    baseTimestamp = datetime.datetime.strptime(archiveData.iloc[0]['timestamp'], '%Y%m%d%H%M%S')
    lastSavedDataIndex = 0
    timeElapsed = 0
    dataset = []
    for i in range(1, len(archiveData)):
        print("Processing " + str(i))
        hasContentChanged = 0
        overallSimilarity = similarity(basePayload, archiveData.iloc[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData.iloc[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData.iloc[i]['payload'])
        timestamp = datetime.datetime.strptime(archiveData.iloc[i]['timestamp'], '%Y%m%d%H%M%S')
        archiveText = text_from_html(archiveData.iloc[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #print(overallSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity)
        if overallSimilarity < 0.98 or cosineSimilarity < 1:
            # Content changed: this snapshot becomes the new baseline.
            hasContentChanged = 1
            lastSavedDataIndex = i
            basePayload = archiveData.iloc[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
            baseTimestamp = datetime.datetime.strptime(archiveData.iloc[i]['timestamp'], '%Y%m%d%H%M%S')
        data = [timestamp, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['ds', 'y'])
    m = fbprophet.Prophet()
    m.fit(df)
    print(df)
    future = m.make_future_dataframe(periods=1, freq="5H", include_history=False)
    forecast = m.predict(future)
    allfuture = m.make_future_dataframe(periods=72, freq="5H", include_history=True)
    allforecast = m.predict(allfuture)
    print(forecast)
    fig1 = m.plot(allforecast)
    fig2 = m.plot_components(allforecast)
    return forecast
```
%% Cell type:code id: tags:
``` python
classifyProphet(archiveData)
```
%% Output
Processing 1
Processing 2
...
Processing 176
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
                      ds  y
0    2019-11-06 17:10:18  0
1    2019-11-06 19:10:18  0
2    2019-11-06 20:10:18  0
3    2019-11-07 00:10:18  0
4    2019-11-07 01:10:18  0
..                   ...  ..
171  2019-11-19 06:10:18  0
172  2019-11-19 07:10:18  0
173  2019-11-19 08:10:18  0
174  2019-11-19 09:10:18  0
175  2019-11-19 11:10:18  0
[176 rows x 2 columns]
                   ds     trend  yhat_lower  yhat_upper  trend_lower  \
0 2019-11-19 16:10:18  0.031191   -0.024171    0.247586     0.031191
   trend_upper  additive_terms  additive_terms_lower  additive_terms_upper  \
0     0.031191         0.07207               0.07207               0.07207
     daily  daily_lower  daily_upper  multiplicative_terms  \
0  0.07207      0.07207      0.07207                   0.0
   multiplicative_terms_lower  multiplicative_terms_upper      yhat
0                         0.0                         0.0  0.103261
                   ds     trend  yhat_lower  yhat_upper  trend_lower  \
0 2019-11-19 16:10:18  0.031191   -0.024171    0.247586     0.031191
   trend_upper  additive_terms  additive_terms_lower  additive_terms_upper  \
0     0.031191         0.07207               0.07207               0.07207
     daily  daily_lower  daily_upper  multiplicative_terms  \
0  0.07207      0.07207      0.07207                   0.0
   multiplicative_terms_lower  multiplicative_terms_upper      yhat
0                         0.0                         0.0  0.103261
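One way to act on this forecast (a sketch, not part of the notebook): threshold the one-step-ahead `yhat` to decide whether the crawler should fetch the page again. This assumes the cell above assigned the return value, e.g. `forecast = classifyProphet(archiveData)`, and the 0.5 cut-off is an assumption.

``` python
# The forecast frame has a single future row (periods=1, no history).
next_step = forecast.iloc[0]
should_recrawl = next_step['yhat'] >= 0.5   # assumed threshold
print(next_step['ds'], round(next_step['yhat'], 4), should_recrawl)
```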
%% Cell type:code id: tags:
``` python
def classifyWindowSVM(archiveData):
    # Build per-snapshot similarity features, then train an SVM on sliding
    # windows of inter-snapshot time gaps to predict whether an upcoming
    # snapshot will have changed.
    basePayload = archiveData.iloc[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    baseTimestamp = datetime.datetime.strptime(archiveData.iloc[0]['timestamp'], '%Y%m%d%H%M%S')
    lastSavedDataIndex = 0
    timeElapsed = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if i % 10 == 0:
            print(str(i) + " Records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData.iloc[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData.iloc[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData.iloc[i]['payload'])
        timestamp = datetime.datetime.strptime(archiveData.iloc[i]['timestamp'], '%Y%m%d%H%M%S')
        # Hours elapsed since the last saved (baseline) snapshot.
        baseTimestampDiff = ((timestamp - baseTimestamp).total_seconds()) / 3600.0
        archiveText = text_from_html(archiveData.iloc[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        if overallSimilarity < 0.98 or cosineSimilarity < 1:
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData.iloc[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
            baseTimestamp = datetime.datetime.strptime(archiveData.iloc[i]['timestamp'], '%Y%m%d%H%M%S')
        data = [baseTimestampDiff, overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['timeDiff', 'similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = []
    y = []
    windowSize = 10
    for i in range(len(df.index) - windowSize - 1):
        bound = min(i + windowSize + 1, len(archiveData))
        # Each example is a window of 10 consecutive timeDiff values; the
        # label is the 'changed' flag at index bound, two rows past the
        # window's last element.
        window = df.iloc[i:bound - 1, 0].values
        windowLabel = df.iloc[bound, 7]
        X.append(window)
        y.append(windowLabel)
    # Chronological 80/20 split (no shuffling).
    trainEx = math.floor(len(X) * 0.8)
    X_train = X[0:trainEx]
    y_train = y[0:trainEx]
    X_test = X[trainEx:]
    y_test = y[trainEx:]
    print("Starting SVM Classification")
    print(X_train, y_train)
    regressor = svm.SVC()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    print(y_pred)
    return np.mean(y_test == y_pred)
```
%% Cell type:code id: tags:
``` python
print(classifyWindowSVM(archiveData))
```
%% Output
177 datapoints found
10 Records processed
20 Records processed
30 Records processed
40 Records processed
50 Records processed
60 Records processed
70 Records processed
80 Records processed
90 Records processed
100 Records processed
110 Records processed
120 Records processed
130 Records processed
140 Records processed
150 Records processed
160 Records processed
170 Records processed
Dataframe created
Starting SVM Classification
[array([ 1., 3., 4., 8., 9., 11., 13., 14., 17., 21.]), array([ 3., 4., 8., 9., 11., 13., 14., 17., 21., 22.]), array([ 4., 8., 9., 11., 13., 14., 17., 21., 22., 24.]), array([ 8., 9., 11., 13., 14., 17., 21., 22., 24., 26.]), array([ 9., 11., 13., 14., 17., 21., 22., 24., 26., 27.]), array([11., 13., 14., 17., 21., 22., 24., 26., 27., 28.]), array([13., 14., 17., 21., 22., 24., 26., 27., 28., 29.]), array([14., 17., 21., 22., 24., 26., 27., 28., 29., 31.]), array([17., 21., 22., 24., 26., 27., 28., 29., 31., 32.]), array([21., 22., 24., 26., 27., 28., 29., 31., 32., 33.]), array([22., 24., 26., 27., 28., 29., 31., 32., 33., 34.]), array([24., 26., 27., 28., 29., 31., 32., 33., 34., 36.]), array([26., 27., 28., 29., 31., 32., 33., 34., 36., 39.]), array([27., 28., 29., 31., 32., 33., 34., 36., 39., 40.]), array([28., 29., 31., 32., 33., 34., 36., 39., 40., 41.]), array([29., 31., 32., 33., 34., 36., 39., 40., 41., 43.]), array([31., 32., 33., 34., 36., 39., 40., 41., 43., 44.]), array([32., 33., 34., 36., 39., 40., 41., 43., 44., 45.]), array([33., 34., 36., 39., 40., 41., 43., 44., 45., 48.]), array([34., 36., 39., 40., 41., 43., 44., 45., 48., 49.]), array([36., 39., 40., 41., 43., 44., 45., 48., 49., 50.]), array([39., 40., 41., 43., 44., 45., 48., 49., 50., 51.]), array([40., 41., 43., 44., 45., 48., 49., 50., 51., 52.]), array([41., 43., 44., 45., 48., 49., 50., 51., 52., 54.]), array([43., 44., 45., 48., 49., 50., 51., 52., 54., 55.]), array([44., 45., 48., 49., 50., 51., 52., 54., 55., 56.]), array([45., 48., 49., 50., 51., 52., 54., 55., 56., 57.]), array([48., 49., 50., 51., 52., 54., 55., 56., 57., 58.]), array([49., 50., 51., 52., 54., 55., 56., 57., 58., 62.]), array([50., 51., 52., 54., 55., 56., 57., 58., 62., 63.]), array([51., 52., 54., 55., 56., 57., 58., 62., 63., 64.]), array([52., 54., 55., 56., 57., 58., 62., 63., 64., 66.]), array([54., 55., 56., 57., 58., 62., 63., 64., 66., 70.]), array([55., 56., 57., 58., 62., 63., 64., 66., 70., 72.]), array([56., 57., 58., 62., 63., 64., 66., 70., 72., 73.]), array([57., 58., 62., 63., 64., 66., 70., 72., 73., 76.]), array([58., 62., 63., 64., 66., 70., 72., 73., 76., 77.]), array([62., 63., 64., 66., 70., 72., 73., 76., 77., 78.]), array([63., 64., 66., 70., 72., 73., 76., 77., 78., 80.]), array([64., 66., 70., 72., 73., 76., 77., 78., 80., 81.]), array([66., 70., 72., 73., 76., 77., 78., 80., 81., 82.]), array([70., 72., 73., 76., 77., 78., 80., 81., 82., 84.]), array([72., 73., 76., 77., 78., 80., 81., 82., 84., 85.]), array([73., 76., 77., 78., 80., 81., 82., 84., 85., 86.]), array([76., 77., 78., 80., 81., 82., 84., 85., 86., 87.]), array([77., 78., 80., 81., 82., 84., 85., 86., 87., 88.]), array([78., 80., 81., 82., 84., 85., 86., 87., 88., 89.]), array([80., 81., 82., 84., 85., 86., 87., 88., 89., 90.]), array([81., 82., 84., 85., 86., 87., 88., 89., 90., 91.]), array([82., 84., 85., 86., 87., 88., 89., 90., 91., 92.]), array([84., 85., 86., 87., 88., 89., 90., 91., 92., 96.]), array([85., 86., 87., 88., 89., 90., 91., 92., 96., 97.]), array([86., 87., 88., 89., 90., 91., 92., 96., 97., 98.]), array([87., 88., 89., 90., 91., 92., 96., 97., 98., 99.]), array([ 88., 89., 90., 91., 92., 96., 97., 98., 99., 100.]), array([ 89., 90., 91., 92., 96., 97., 98., 99., 100., 103.]), array([ 90., 91., 92., 96., 97., 98., 99., 100., 103., 106.]), array([ 91., 92., 96., 97., 98., 99., 100., 103., 106., 108.]), array([ 92., 96., 97., 98., 99., 100., 103., 106., 108., 109.]), array([ 96., 97., 98., 99., 100., 103., 106., 108., 109., 
110.]), array([ 97., 98., 99., 100., 103., 106., 108., 109., 110., 111.]), array([ 98., 99., 100., 103., 106., 108., 109., 110., 111., 112.]), array([ 99., 100., 103., 106., 108., 109., 110., 111., 112., 118.]), array([100., 103., 106., 108., 109., 110., 111., 112., 118., 120.]), array([103., 106., 108., 109., 110., 111., 112., 118., 120., 122.]), array([106., 108., 109., 110., 111., 112., 118., 120., 122., 123.]), array([108., 109., 110., 111., 112., 118., 120., 122., 123., 124.]), array([109., 110., 111., 112., 118., 120., 122., 123., 124., 126.]), array([110., 111., 112., 118., 120., 122., 123., 124., 126., 128.]), array([111., 112., 118., 120., 122., 123., 124., 126., 128., 129.]), array([112., 118., 120., 122., 123., 124., 126., 128., 129., 130.]), array([118., 120., 122., 123., 124., 126., 128., 129., 130., 132.]), array([120., 122., 123., 124., 126., 128., 129., 130., 132., 133.]), array([122., 123., 124., 126., 128., 129., 130., 132., 133., 135.]), array([123., 124., 126., 128., 129., 130., 132., 133., 135., 139.]), array([124., 126., 128., 129., 130., 132., 133., 135., 139., 142.]), array([126., 128., 129., 130., 132., 133., 135., 139., 142., 143.]), array([128., 129., 130., 132., 133., 135., 139., 142., 143., 144.]), array([129., 130., 132., 133., 135., 139., 142., 143., 144., 146.]), array([130., 132., 133., 135., 139., 142., 143., 144., 146., 150.]), array([132., 133., 135., 139., 142., 143., 144., 146., 150., 152.]), array([133., 135., 139., 142., 143., 144., 146., 150., 152., 154.]), array([135., 139., 142., 143., 144., 146., 150., 152., 154., 157.]), array([139., 142., 143., 144., 146., 150., 152., 154., 157., 158.]), array([142., 143., 144., 146., 150., 152., 154., 157., 158., 159.]), array([143., 144., 146., 150., 152., 154., 157., 158., 159., 161.]), array([144., 146., 150., 152., 154., 157., 158., 159., 161., 164.]), array([146., 150., 152., 154., 157., 158., 159., 161., 164., 165.]), array([150., 152., 154., 157., 158., 159., 161., 164., 165., 166.]), array([152., 154., 157., 158., 159., 161., 164., 165., 166., 169.]), array([154., 157., 158., 159., 161., 164., 165., 166., 169., 170.]), array([157., 158., 159., 161., 164., 165., 166., 169., 170., 171.]), array([158., 159., 161., 164., 165., 166., 169., 170., 171., 172.]), array([159., 161., 164., 165., 166., 169., 170., 171., 172., 173.]), array([161., 164., 165., 166., 169., 170., 171., 172., 173., 174.]), array([164., 165., 166., 169., 170., 171., 172., 173., 174., 176.]), array([165., 166., 169., 170., 171., 172., 173., 174., 176., 178.]), array([166., 169., 170., 171., 172., 173., 174., 176., 178., 179.]), array([169., 170., 171., 172., 173., 174., 176., 178., 179., 181.]), array([170., 171., 172., 173., 174., 176., 178., 179., 181., 184.]), array([171., 172., 173., 174., 176., 178., 179., 181., 184., 185.]), array([172., 173., 174., 176., 178., 179., 181., 184., 185., 188.]), array([173., 174., 176., 178., 179., 181., 184., 185., 188., 190.]), array([174., 176., 178., 179., 181., 184., 185., 188., 190., 192.]), array([176., 178., 179., 181., 184., 185., 188., 190., 192., 193.]), array([178., 179., 181., 184., 185., 188., 190., 192., 193., 194.]), array([179., 181., 184., 185., 188., 190., 192., 193., 194., 196.]), array([181., 184., 185., 188., 190., 192., 193., 194., 196., 197.]), array([184., 185., 188., 190., 192., 193., 194., 196., 197., 198.]), array([185., 188., 190., 192., 193., 194., 196., 197., 198., 202.]), array([188., 190., 192., 193., 194., 196., 197., 198., 202., 205.]), array([190., 192., 193., 194., 
196., 197., 198., 202., 205., 210.]), array([192., 193., 194., 196., 197., 198., 202., 205., 210., 214.]), array([193., 194., 196., 197., 198., 202., 205., 210., 214., 215.]), array([194., 196., 197., 198., 202., 205., 210., 214., 215., 2.]), array([196., 197., 198., 202., 205., 210., 214., 215., 2., 3.]), array([197., 198., 202., 205., 210., 214., 215., 2., 3., 6.]), array([198., 202., 205., 210., 214., 215., 2., 3., 6., 7.]), array([202., 205., 210., 214., 215., 2., 3., 6., 7., 8.]), array([205., 210., 214., 215., 2., 3., 6., 7., 8., 9.]), array([210., 214., 215., 2., 3., 6., 7., 8., 9., 11.]), array([214., 215., 2., 3., 6., 7., 8., 9., 11., 13.]), array([215., 2., 3., 6., 7., 8., 9., 11., 13., 14.]), array([ 2., 3., 6., 7., 8., 9., 11., 13., 14., 15.]), array([ 3., 6., 7., 8., 9., 11., 13., 14., 15., 17.]), array([ 6., 7., 8., 9., 11., 13., 14., 15., 17., 18.]), array([ 7., 8., 9., 11., 13., 14., 15., 17., 18., 19.]), array([ 8., 9., 11., 13., 14., 15., 17., 18., 19., 21.]), array([ 9., 11., 13., 14., 15., 17., 18., 19., 21., 23.]), array([11., 13., 14., 15., 17., 18., 19., 21., 23., 24.]), array([13., 14., 15., 17., 18., 19., 21., 23., 24., 26.]), array([14., 15., 17., 18., 19., 21., 23., 24., 26., 29.])] [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
[False False False False False False False False False False False False
False False False False False False False False False False False False
False False False False False False False False False]
0.9696969696969697
/home/naman/anaconda3/envs/dl/lib/python3.7/site-packages/sklearn/svm/base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
"avoid this warning.", FutureWarning)
%% Cell type:code id: tags:
``` python
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import datetime
import fbprophet
import gc
from fastparquet import ParquetFile
import pyarrow.parquet as pq
import json
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
```
%% Output
ERROR:fbprophet:Importing plotly failed. Interactive plots will not work.
%% Cell type:code id: tags:
``` python
root = '../data'
modelUrl = 'cnn.com'
parquetFiles = []
# Collect every .parquet file under the data root.
for root, dirs, files in os.walk(root):
    path = root.split(os.sep)
    for file in files:
        if file.endswith(".parquet"):
            parquetFiles.append(os.path.join(root, file))
print(str(len(parquetFiles)) + " parquet files found")
```
%% Output
178 parquet files found
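Before the Spark pass, the schema of one file can be inspected directly with the `pyarrow.parquet` import above (a sketch; the field names are assumptions taken from how the loop below uses them):

``` python
# Expecting at least originalUrl, mime, filename, and payload fields.
print(pq.read_schema(parquetFiles[0]))
```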
%% Cell type:code id: tags:
``` python
spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.executor.memory", "70g") \
    .config("spark.driver.memory", "50g") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.memory.offHeap.size", "14g") \
    .appName("sampleCodeForReference") \
    .config("spark.driver.cores", "12") \
    .getOrCreate()
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")
sc = spark.sparkContext
sqlContext = SQLContext(sc)
```
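If Spark is heavier than needed for single-record reads, the same first-record peek could be done with fastparquet (imported earlier); a minimal sketch under that assumption:

``` python
# Read just the columns the loop below needs from one file.
pf = ParquetFile(parquetFiles[0])
first = pf.to_pandas(columns=['originalUrl', 'mime', 'filename']).iloc[0]
print(first['mime'], first['filename'])
```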
%% Cell type:code id: tags:
``` python
archiveData = []
```
%% Cell type:code id: tags:
``` python
for k in range(len(parquetFiles)):
    #print("Processing File " + str(k+1))
    try:
        file = sqlContext.read.parquet(parquetFiles[k])
        # Read the first record once instead of four separate take(1) calls.
        record = file.rdd.take(1)[0]
        UriComponents = record.originalUrl.split('/')
        payload = record.payload
        mime = record.mime
        filename = record.filename
        # The capture timestamp is embedded in the filename after a
        # 4-character prefix.
        timestamp = filename.split('.')[0][4:]
        #print(mime, UriComponents, len(payload))
        print("Processing File " + str(k+1))
        # Keep only non-empty HTML captures of the target site.
        if mime == 'text/html' and len(payload) > 1 and modelUrl in UriComponents[-1]:
            currentData = {}
            currentData['payload'] = payload
            currentData['timestamp'] = timestamp
            archiveData.append(currentData)
    except Exception:
        # Unreadable files (e.g. file 111 in the run below) are skipped.
        pass
```
%% Output
Processing File 1
Processing File 2
...
Processing File 110
Processing File 112
...
Processing File 178
%% Cell type:code id: tags:
``` python
archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
```
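Sorting on the raw string is valid here because `%Y%m%d%H%M%S` is fixed-width and zero-padded, so lexicographic and chronological order coincide; an equivalent, more explicit version would parse first:

``` python
import datetime

archiveData.sort(key=lambda x: datetime.datetime.strptime(x['timestamp'], '%Y%m%d%H%M%S'))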
%% Cell type:code id: tags:
``` python
df = pd.DataFrame(archiveData, columns = ['payload', 'timestamp'])
```
%% Cell type:code id: tags:
``` python
df.to_pickle("./archiveData.pkl")
```
#!/usr/bin/env python
# coding: utf-8
# In[2]:
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
# In[3]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True
# In[4]:
def text_from_html(htmlPage):
    soup = BeautifulSoup(htmlPage, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)
# In[5]:
def split(word):
    return [char for char in word]
# In[6]:
def filter_text(text):
    stop_words = set(stopwords.words('english'))
    stop_words.update(split(string.punctuation))
    nlp = spacy.load('en_core_web_sm')
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    stop_words.update(spacy_stopwords)
    #stop_words.update(["\t","\n","\r"])
    text = text.replace("\n", "")
    text = text.replace("\r", "")
    text = text.replace("\t", "")
    word_tokens_text = word_tokenize(text)
    filtered_text = [w for w in word_tokens_text if w not in stop_words]
    filtered_text = TreebankWordDetokenizer().detokenize(filtered_text)
    return filtered_text
# In[7]:
def classifyRF(archiveData, newRecord):
    # Train a random forest on similarity features between consecutive
    # snapshots, then predict whether newRecord represents a change.
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    lastSavedDataIndex = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if i % 100 == 0:
            print(str(i) + " Records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        if overallSimilarity < 0.80 or cosineSimilarity < 0.95:
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    X_test = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]
    # Scale the new record with the same fitted scaler (the original code
    # predicted on unscaled features, which mismatched the training data).
    X_test = sc.transform([X_test])
    print("Starting Random Forest Classification")
    classifier = RandomForestClassifier(n_estimators=20, random_state=0)
    classifier.fit(X_train, y)
    y_pred = classifier.predict(X_test)
    return y_pred
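# A hypothetical invocation of classifyRF (a sketch; the names below are
# assumptions): archive_rows is a list of {'payload': html, 'timestamp':
# '%Y%m%d%H%M%S'} dicts as built by the collection notebook, and new_record
# is one such dict for the latest capture.
#
#   prediction = classifyRF(archive_rows, new_record)
#   print(prediction)  # e.g. array([ True]) if the new capture looks changed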
# In[ ]:
def classifySVM(archiveData, newRecord):
    # Same feature pipeline as classifyRF, but with an SVM classifier.
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    lastSavedDataIndex = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if i % 100 == 0:
            print(str(i) + " Records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        if overallSimilarity < 0.80 or cosineSimilarity < 0.95:
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    X_test = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]
    # Scale the new record with the same fitted scaler, as in classifyRF.
    X_test = sc.transform([X_test])
    print("Starting SVM Classification")
    classifier = svm.SVC()
    classifier.fit(X_train, y)
    y_pred = classifier.predict(X_test)
    return y_pred