Skip to content
Snippets Groups Projects
Commit d60c6ddd authored by Naman's avatar Naman
Browse files

classifiers

parent cf194f0d
No related branches found
No related tags found
3 merge requests!6Text compare,!5Text compare,!4Text compare
#!/usr/bin/env python
# coding: utf-8
# In[2]:
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
# In[3]:
def tag_visible(element):
    """Report whether a parsed text node should be kept as page text.

    Returns False when the node's parent tag name is one of the names
    listed in ``excluded_parents`` or when the node itself is a
    ``Comment``; returns True for every other node.
    """
    excluded_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check comes first, exactly as before, so the Comment test
    # is only evaluated for nodes whose parent is not excluded.
    is_excluded = (
        element.parent.name in excluded_parents
        or isinstance(element, Comment)
    )
    return not is_excluded
# In[4]:
def text_from_html(htmlPage):
    """Return the text of an HTML document as a single string.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend,
    every text node is collected, nodes rejected by ``tag_visible`` are
    dropped, and the remaining nodes are stripped of surrounding
    whitespace and joined with single spaces.

    Args:
        htmlPage: The HTML markup to parse.

    Returns:
        The space-joined text of all nodes accepted by ``tag_visible``.
    """
    soup = BeautifulSoup(htmlPage, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it selects the same text nodes.
    text_nodes = soup.find_all(string=True)
    return u" ".join(node.strip() for node in text_nodes if tag_visible(node))
# In[5]:
def split(word):
    """Return the individual items of *word* as a list (one per character for a string)."""
    return list(word)
# In[6]:
_STOP_WORDS = None  # combined stop-word set, built lazily by _get_stop_words()


def _get_stop_words():
    """Build the combined stop-word set once and return the cached copy."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Import the stop-word list straight from its submodule. The previous
        # code called spacy.load('en_core_web_sm') on every invocation only to
        # reach this constant, and never used the loaded pipeline.
        from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords

        words = set(stopwords.words('english'))
        # Each punctuation character is treated as a stop word of its own.
        words.update(string.punctuation)
        words.update(spacy_stopwords)
        _STOP_WORDS = frozenset(words)
    return _STOP_WORDS


def filter_text(text):
    """Remove stop words and punctuation tokens from *text*.

    The stop-word set is the union of NLTK's English stop words, every
    character of ``string.punctuation`` and spaCy's English stop words.
    Matching is an exact (case-sensitive) comparison against each token
    produced by ``word_tokenize``.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens re-joined by ``TreebankWordDetokenizer``.
    """
    stop_words = _get_stop_words()
    # These are the two-character sequences backslash + letter (as they
    # appear in an escaped string), not real newline/CR/tab characters.
    for escaped in ("\\n", "\\r", "\\t"):
        text = text.replace(escaped, "")
    tokens = word_tokenize(text)
    kept_tokens = [w for w in tokens if w not in stop_words]
    return TreebankWordDetokenizer().detokenize(kept_tokens)
# In[ ]:
# In[ ]:
# In[7]:
# A record is labelled "changed" when either similarity drops below its threshold.
_OVERALL_SIMILARITY_THRESHOLD = 0.80
_COSINE_SIMILARITY_THRESHOLD = 0.95

_FEATURE_COLUMNS = ['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed']


def _similarity_features(basePayload, baseFilteredText, payload):
    """Compare *payload* with the current base and return (features, filteredText).

    ``features`` is the six-element list [overall, style, structural,
    cosine, jaccard, sorensen-dice]. The first three compare the raw
    payloads; the last three compare the stop-word-filtered texts.
    ``filteredText`` is the filtered text of *payload*, returned so the
    caller can reuse it if this payload becomes the new base.
    """
    filteredText = filter_text(text_from_html(payload))
    features = [
        similarity(basePayload, payload),
        style_similarity(basePayload, payload),
        structural_similarity(basePayload, payload),
        textdistance.cosine.normalized_similarity(baseFilteredText, filteredText),
        textdistance.jaccard.normalized_similarity(baseFilteredText, filteredText),
        textdistance.sorensen_dice.normalized_similarity(baseFilteredText, filteredText),
    ]
    return features, filteredText


def _build_classifier_inputs(archiveData, newRecord):
    """Turn the archive and the new record into (X_train, y, X_test).

    The archive is sorted by 'timestamp' in place. Walking it in order,
    each record is compared with the current base payload; when it falls
    below either threshold it is labelled as changed and becomes the new
    base. The new record is compared with whatever base is current after
    the walk, and its features are scaled with the scaler fitted on the
    training features.
    """
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    baseFilteredText = filter_text(text_from_html(basePayload))
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i, record in enumerate(archiveData[1:], start=1):
        if i % 100 == 0:
            print(str(i) + " Records processed")
        payload = record['payload']
        features, filteredText = _similarity_features(basePayload, baseFilteredText, payload)
        overallSimilarity, cosineSimilarity = features[0], features[3]
        hasContentChanged = bool(
            overallSimilarity < _OVERALL_SIMILARITY_THRESHOLD
            or cosineSimilarity < _COSINE_SIMILARITY_THRESHOLD
        )
        if hasContentChanged:
            # Later records are compared against the most recent changed version.
            basePayload = payload
            baseFilteredText = filteredText
        dataset.append(features + [hasContentChanged])
    df = pd.DataFrame(dataset, columns=_FEATURE_COLUMNS)
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    newFeatures, _ = _similarity_features(basePayload, baseFilteredText, newRecord['payload'])
    # The model is trained on standardized features, so the new sample must
    # pass through the same fitted scaler; it was previously fed in unscaled.
    X_test = sc.transform([newFeatures])
    return X_train, y, X_test


def classiyRF(archiveData, newRecord):
    """Predict with a random forest whether *newRecord* differs from the archive.

    Args:
        archiveData: List of dicts, each read through its 'timestamp' and
            'payload' keys. The list is sorted by 'timestamp' in place.
        newRecord: Dict whose 'payload' is compared with the latest base
            payload derived from the archive.

    Returns:
        The output of ``RandomForestClassifier.predict`` for the single
        new sample; labels are the 'changed' booleans used for training.
    """
    X_train, y, X_test = _build_classifier_inputs(archiveData, newRecord)
    print("Starting Random Forest Classification")
    classifier = RandomForestClassifier(n_estimators=20, random_state=0)
    classifier.fit(X_train, y)
    return classifier.predict(X_test)


# In[ ]:
def classiySVM(archiveData, newRecord):
    """Predict with an SVM whether *newRecord* differs from the archive.

    Args:
        archiveData: List of dicts, each read through its 'timestamp' and
            'payload' keys. The list is sorted by 'timestamp' in place.
        newRecord: Dict whose 'payload' is compared with the latest base
            payload derived from the archive.

    Returns:
        The output of ``svm.SVC.predict`` for the single new sample;
        labels are the 'changed' booleans used for training.
    """
    X_train, y, X_test = _build_classifier_inputs(archiveData, newRecord)
    print("Starting SVM Classification")
    # NOTE(review): SVC.fit is expected to reject a training set whose labels
    # are all identical - confirm how callers handle an archive with no change.
    classifier = svm.SVC()
    classifier.fit(X_train, y)
    return classifier.predict(X_test)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment