{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}
%% Cell type:code id: tags:
``` python
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import datetime
import fbprophet
import gc
from fastparquet import ParquetFile
import pyarrow.parquet as pq
import json
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
```
%% Output
ERROR:fbprophet:Importing plotly failed. Interactive plots will not work.
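%% Cell type:markdown id: tags:
The fbprophet warning above is harmless here, since no interactive plots are drawn. The NLTK and spaCy imports, however, assume their data files are already installed; if they are not, a one-time setup along these lines should work (a sketch; the resource names `stopwords`, `punkt`, and `en_core_web_sm` are inferred from the imports and the `spacy.load` call used later).
%% Cell type:code id: tags:
``` python
import nltk
import spacy.cli

# One-time downloads for the resources the text-processing helpers rely on.
nltk.download('stopwords')            # stop word lists for nltk.corpus.stopwords
nltk.download('punkt')                # tokenizer models for word_tokenize
spacy.cli.download('en_core_web_sm')  # small English spaCy pipeline
```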
%% Cell type:code id: tags:
``` python
root = '../data'
modelUrl = 'cnn.com'
parquetFiles = []
# Walk the data directory and collect every .parquet file.
for dirpath, dirs, files in os.walk(root):
    for file in files:
        if file.endswith(".parquet"):
            parquetFiles.append(os.path.join(dirpath, file))
print(str(len(parquetFiles)) + " parquet files found")
```
%% Output
178 parquet files found
%% Cell type:code id: tags:
``` python
spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.executor.memory", "70g") \
    .config("spark.driver.memory", "50g") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.memory.offHeap.size", "14g") \
    .appName("sampleCodeForReference") \
    .config("spark.driver.cores", "12") \
    .getOrCreate()
# Disable the vectorized Parquet reader (avoids read failures on some
# nested/complex schemas).
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")
sc = spark.sparkContext
sqlContext = SQLContext(sc)
```
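%% Cell type:markdown id: tags:
Before looping over all files, it can help to confirm that one file's schema matches what the extraction loop expects (a hypothetical check; the loop below relies on the `originalUrl`, `payload`, `mime`, and `filename` columns).
%% Cell type:code id: tags:
``` python
sample = sqlContext.read.parquet(parquetFiles[0])
sample.printSchema()  # expect originalUrl, payload, mime, filename among the fields
```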
%% Cell type:code id: tags:
``` python
archiveData = []
```
%% Cell type:code id: tags:
``` python
for k in range(len(parquetFiles)):
    try:
        file = sqlContext.read.parquet(parquetFiles[k])
        # Inspect only the first record of each parquet file.
        row = file.rdd.take(1)[0]
        UriComponents = row.originalUrl.split('/')
        payload = row.payload
        mime = row.mime
        filename = row.filename
        timestamp = filename.split('.')[0][4:]
        print("Processing File " + str(k+1))
        # Keep only non-empty HTML captures whose last URL component
        # contains the model URL.
        if mime == 'text/html' and len(payload) > 1 and modelUrl in UriComponents[-1]:
            currentData = {}
            currentData['payload'] = payload
            currentData['timestamp'] = timestamp
            archiveData.append(currentData)
    except Exception:
        # Skip files that cannot be read or lack the expected fields.
        pass
```
%% Output
Processing File 1
Processing File 2
...
Processing File 110
Processing File 112
...
Processing File 178
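%% Cell type:markdown id: tags:
`Processing File 111` never appears above: that file raised inside the `try` before reaching the `print`, and the `except` skipped it. A quick count of the snapshots that survived the filter (a hypothetical follow-up cell):
%% Cell type:code id: tags:
``` python
print(str(len(archiveData)) + " cnn.com HTML snapshots retained")
```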
%% Cell type:code id: tags:
``` python
archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
```
%% Cell type:code id: tags:
``` python
df = pd.DataFrame(archiveData, columns = ['payload', 'timestamp'])
```
%% Cell type:code id: tags:
``` python
df.to_pickle("./archiveData.pkl")
```
#!/usr/bin/env python
# coding: utf-8
# In[2]:
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
# In[3]:
def tag_visible(element):
    # Exclude text inside non-rendered elements and HTML comments.
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True
# In[4]:
def text_from_html(htmlPage):
    # Extract the visible text of an HTML page.
    soup = BeautifulSoup(htmlPage, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)
# In[5]:
def split(word):
    # Split a string into a list of its characters.
    return [char for char in word]
# In[6]:
def filter_text(text):
    # Build a combined stop list: NLTK stop words, punctuation characters,
    # and spaCy's English stop words.
    stop_words = set(stopwords.words('english'))
    stop_words.update(split(string.punctuation))
    # Loading the model also makes spacy.lang.en importable below; note this
    # reloads the model on every call and could be hoisted out.
    nlp = spacy.load('en_core_web_sm')
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    stop_words.update(spacy_stopwords)
    # Remove literal escape sequences left in the payload text.
    text = text.replace("\\n", "")
    text = text.replace("\\r", "")
    text = text.replace("\\t", "")
    word_tokens_text = word_tokenize(text)
    filtered_text = [w for w in word_tokens_text if w not in stop_words]
    filtered_text = TreebankWordDetokenizer().detokenize(filtered_text)
    return filtered_text
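# In[ ]:
# A small usage sketch of the two helpers above. The sample HTML is made up
# for illustration; expected outputs are shown as comments.
sample = "<html><head><title>t</title></head><body><p>Breaking news: the markets rallied today.</p></body></html>"
visible = text_from_html(sample)
print(visible)               # Breaking news: the markets rallied today.
print(filter_text(visible))  # e.g. "Breaking news markets rallied today"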
# In[7]:
def classifyRF(archiveData, newRecord):
    # Build a labeled similarity dataset from consecutive archive snapshots,
    # then predict whether newRecord differs from the latest baseline.
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if i % 100 == 0:
            print(str(i) + " Records processed")
        hasContentChanged = False
        # Structure/style similarity on raw HTML, text similarity on filtered text.
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        # Heuristic label: a snapshot counts as changed when overall HTML
        # similarity drops below 0.80 or text cosine similarity below 0.95;
        # the changed snapshot then becomes the new baseline.
        if overallSimilarity < 0.80 or cosineSimilarity < 0.95:
            hasContentChanged = True
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        dataset.append([overallSimilarity, styleSimilarity, structuralSimilarity,
                        cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity,
                        hasContentChanged])
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity',
                                        'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    # Compute the same six features for the new record against the latest baseline.
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    X_test = [overallSimilarity, styleSimilarity, structuralSimilarity,
              cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]
    # Scale the new record's features with the scaler fitted above so they
    # match the training distribution.
    X_test = sc.transform([X_test])
    print("Starting Random Forest Classification")
    classifier = RandomForestClassifier(n_estimators=20, random_state=0)
    classifier.fit(X_train, y)
    y_pred = classifier.predict(X_test)
    return y_pred
# In[ ]:
def classifySVM(archiveData, newRecord):
    # Same feature pipeline as classifyRF, with an SVM as the classifier.
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if i % 100 == 0:
            print(str(i) + " Records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        # Same heuristic label and baseline update as in classifyRF.
        if overallSimilarity < 0.80 or cosineSimilarity < 0.95:
            hasContentChanged = True
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        dataset.append([overallSimilarity, styleSimilarity, structuralSimilarity,
                        cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity,
                        hasContentChanged])
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity',
                                        'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    X_test = [overallSimilarity, styleSimilarity, structuralSimilarity,
              cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]
    # Scale the new record's features with the fitted scaler before predicting.
    X_test = sc.transform([X_test])
    print("Starting SVM Classification")
    classifier = svm.SVC()
    classifier.fit(X_train, y)
    y_pred = classifier.predict(X_test)
    return y_pred
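# In[ ]:
# Usage sketch tying the pieces together. The pickle path matches the one
# written by the extraction notebook; `newRecord` is a hypothetical fresh
# capture with the same payload/timestamp shape as the archived entries.
archiveData = pd.read_pickle("./archiveData.pkl").to_dict('records')
newRecord = {'payload': '<html><body>latest capture</body></html>',
             'timestamp': '20200101000000'}
print("RF prediction (changed?):", classifyRF(archiveData, newRecord))
print("SVM prediction (changed?):", classifySVM(archiveData, newRecord))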