#!/usr/bin/env python
# coding: utf-8
# In[2]:
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
# In[3]:
def tag_visible(element):
    # Text nodes inside these tags (or inside HTML comments) are not rendered.
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True
# In[4]:
def text_from_html(htmlPage):
    # Extract only the human-visible text from an HTML page.
    soup = BeautifulSoup(htmlPage, 'html.parser')
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return " ".join(t.strip() for t in visible_texts)
# In[5]:
def split(word):
    # Return the individual characters of a string as a list.
    return [char for char in word]
# In[6]:
def filter_text(text):
    # Build a combined stop-word list: NLTK stop words, punctuation
    # characters, and spaCy's English stop words (imported directly above, so
    # the full en_core_web_sm model does not have to be loaded on every call).
    stop_words = set(stopwords.words('english'))
    stop_words.update(split(string.punctuation))
    stop_words.update(spacy_stopwords)
    # Strip whitespace control characters before tokenizing.
    text = text.replace("\n", "")
    text = text.replace("\r", "")
    text = text.replace("\t", "")
    word_tokens_text = word_tokenize(text)
    filtered_text = [w for w in word_tokens_text if w not in stop_words]
    filtered_text = TreebankWordDetokenizer().detokenize(filtered_text)
    return filtered_text
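# In[ ]:
# Illustrative sketch (the sentence is made up): filter_text() strips stop
# words and punctuation from the visible page text before the similarity
# measures are computed, so only content-bearing tokens remain.
print(filter_text("the page has changed and the text is new"))
# expected output along the lines of: "page changed text new"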
# In[7]:
def classifyRF(archiveData, newRecord):
    # Label historical snapshots with a threshold heuristic, train a random
    # forest on the similarity features, then predict whether newRecord
    # represents a content change relative to the latest baseline.
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    lastSavedDataIndex = 0
    dataset = []
    print(f"{len(archiveData)} datapoints found")
    for i in range(1, len(archiveData)):
        if i % 100 == 0:
            print(f"{i} records processed")
        hasContentChanged = False
        # HTML-level similarities between the baseline and this snapshot.
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        # Text-level similarities on the filtered visible text.
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        # Heuristic label: treat the snapshot as changed when either the
        # overall HTML similarity or the cosine text similarity drops, and
        # promote it to be the new baseline.
        if overallSimilarity < 0.80 or cosineSimilarity < 0.95:
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    # Build the feature vector for the new record against the latest baseline.
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    # Scale the test vector with the scaler fitted on the training data.
    X_test = sc.transform([[overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]])
    print("Starting Random Forest Classification")
    classifier = RandomForestClassifier(n_estimators=20, random_state=0)
    classifier.fit(X_train, y)
    y_pred = classifier.predict(X_test)
    return y_pred
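# In[ ]:
# Hedged usage sketch for classifyRF(). The record layout is an assumption
# inferred from the code above: each record needs a 'timestamp' (used only
# for ordering) and an HTML 'payload'. The pages below are tiny hypothetical
# examples; with such a small training set the prediction is not meaningful,
# it just demonstrates the calling convention.
page_a = '<html><body><p class="lead">Welcome to our site. Latest news and weather updates appear here every day.</p></body></html>'
page_b = '<html><body><h1 class="headline">Totally new layout</h1><p class="story">Completely different article text published today.</p></body></html>'
archive = [{'timestamp': t, 'payload': page_a if t < 3 else page_b} for t in range(1, 6)]
new_snapshot = {'timestamp': 6, 'payload': page_b}
print(classifyRF(archive, new_snapshot))  # e.g. [False] when nothing changed vs. the latest baseline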
# In[ ]:
def classifySVM(archiveData, newRecord):
    # Same pipeline as classifyRF(), but with a support vector machine as
    # the final classifier.
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    lastSavedDataIndex = 0
    dataset = []
    print(f"{len(archiveData)} datapoints found")
    for i in range(1, len(archiveData)):
        if i % 100 == 0:
            print(f"{i} records processed")
        hasContentChanged = False
        # HTML-level similarities between the baseline and this snapshot.
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        # Text-level similarities on the filtered visible text.
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        # Heuristic label: treat the snapshot as changed when either the
        # overall HTML similarity or the cosine text similarity drops, and
        # promote it to be the new baseline.
        if overallSimilarity < 0.80 or cosineSimilarity < 0.95:
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    # Build the feature vector for the new record against the latest baseline.
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    # Scale the test vector with the scaler fitted on the training data.
    X_test = sc.transform([[overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]])
    print("Starting SVM Classification")
    classifier = svm.SVC()
    classifier.fit(X_train, y)
    y_pred = classifier.predict(X_test)
    return y_pred
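# In[ ]:
# classifySVM() follows the same calling convention as classifyRF(), so the
# two models can be compared side by side on the same inputs (reusing the
# hypothetical archive from the sketch above):
print(classifySVM(archive, new_snapshot))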