Skip to content
Snippets Groups Projects
Commit cf194f0d authored by siddharth's avatar siddharth
Browse files

Added text comparisons

parent 9664f130
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
```
%% Cell type:code id: tags:
``` python
def tag_visible(element):
    """Tell whether a parsed text node counts as visible page text.

    A node is rejected when its parent tag is one of the names listed in
    ``hidden_parents`` or when the node itself is an HTML comment.

    Args:
        element: a text node exposing ``.parent.name`` (as produced by
            BeautifulSoup).

    Returns:
        ``False`` for rejected nodes, ``True`` otherwise.
    """
    hidden_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
    # The parent check comes first, so the comment test only runs for nodes
    # whose parent tag is not in the list.
    is_hidden = (element.parent.name in hidden_parents
                 or isinstance(element, Comment))
    return not is_hidden
```
%% Cell type:code id: tags:
``` python
def text_from_html(htmlPage):
    """Extract the visible text of an HTML document as one string.

    The page is parsed with the ``html.parser`` backend, every text node is
    tested with ``tag_visible``, and the surviving nodes are stripped of
    surrounding whitespace and joined with single spaces.

    Args:
        htmlPage: HTML markup to parse.

    Returns:
        The space-joined visible text.
    """
    parsed = BeautifulSoup(htmlPage, 'html.parser')
    pieces = []
    for node in parsed.findAll(text=True):
        if tag_visible(node):
            pieces.append(node.strip())
    return u" ".join(pieces)
```
%% Cell type:code id: tags:
``` python
def split(word):
    """Return the items of *word* as a list (one entry per character for a string)."""
    return list(word)
```
%% Cell type:code id: tags:
``` python
def filter_text(text):
    r"""Remove stop words and punctuation tokens from *text*.

    The stop list is the union of NLTK's English stop words, each single
    character of ``string.punctuation`` and spaCy's English stop words.
    Before tokenizing, the two-character sequences ``\n``, ``\r`` and ``\t``
    (a literal backslash followed by a letter) are deleted from the text.

    Args:
        text: the string to clean.

    Returns:
        The remaining tokens re-assembled into a single string with
        ``TreebankWordDetokenizer``.
    """
    # Import the stop-word set directly. The previous version called
    # spacy.load('en_core_web_sm') on every invocation and never used the
    # resulting pipeline, which is an expensive way to reach the same set.
    from spacy.lang.en.stop_words import STOP_WORDS

    stop_words = set(stopwords.words('english'))
    # Updating a set with a string adds each character individually.
    stop_words.update(string.punctuation)
    stop_words.update(STOP_WORDS)

    # NOTE(review): these patterns match an escaped backslash sequence, not a
    # real newline/tab character -- presumably the payload text carries
    # escaped sequences; confirm against the data.
    for escaped in ("\\n", "\\r", "\\t"):
        text = text.replace(escaped, "")

    kept_tokens = [w for w in word_tokenize(text) if w not in stop_words]
    return TreebankWordDetokenizer().detokenize(kept_tokens)
```
%% Cell type:code id: tags:
``` python
# Directory that is searched for archive files.
root = 'data'
# Substring that is matched against archived URLs further down.
modelUrl = 'vt.edu'

# Every *.parquet file found anywhere below `root`, in sorted path order.
# A generator expression keeps the walk variables local, so `root` is no
# longer overwritten by the directory currently being visited.
parquetFiles = sorted(
    os.path.join(dirPath, fileName)
    for dirPath, _dirNames, fileNames in os.walk(root)
    for fileName in fileNames
    if fileName.endswith(".parquet")
)
```
%% Cell type:code id: tags:
``` python
# Collect the text/html records whose URL matches `modelUrl`, ordered by
# ascending timestamp. Each entry is a dict with 'payload' and 'timestamp'.
archiveData = []
for fileName in parquetFiles:
    frame = pd.read_parquet(fileName, engine='pyarrow')
    # Walk the four needed columns in lockstep instead of rebuilding a row
    # object with .iloc four times per record.
    records = zip(frame['originalUrl'], frame['payload'],
                  frame['mime'], frame['timestamp'])
    for originalUrl, payload, mime, timestamp in records:
        UriComponents = originalUrl.split('/')
        # The payload is not parsed here: the BeautifulSoup object that was
        # built for every row was never used.
        # A record is kept when it is HTML, has a payload longer than one
        # element, and the second-to-last '/'-separated URL component
        # contains `modelUrl`.
        if mime == 'text/html' and len(payload) > 1 and modelUrl in UriComponents[-2]:
            archiveData.append({'payload': payload, 'timestamp': timestamp})
archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
```
%% Output
/home/naman/anaconda3/envs/archive/lib/python3.7/site-packages/pyarrow/pandas_compat.py:752: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.
labels, = index.labels
%% Cell type:code id: tags:
``` python
# Compare every snapshot with the current baseline and record one row of
# similarity scores per snapshot. When a snapshot is judged to have changed,
# it becomes the new baseline for the snapshots that follow.

# A snapshot counts as changed when the overall HTML similarity falls below
# the first threshold or the cosine text similarity falls below the second.
SIMILARITY_THRESHOLD = 0.80
COSINE_THRESHOLD = 0.95

if not archiveData:
    # Same exception type that indexing the empty list would raise, with a
    # message that says what is missing.
    raise IndexError("archiveData is empty; at least one snapshot is needed as the baseline")

basePayload = archiveData[0]['payload']
basePayloadText = text_from_html(basePayload)
basePayloadFilteredText = filter_text(basePayloadText)
lastSavedDataIndex = 0
dataset = []
for i, snapshot in enumerate(archiveData[1:], start=1):
    currentPayload = snapshot['payload']
    hasContentChanged = 0

    # Markup-level scores from html_similarity.
    overallSimilarity = similarity(basePayload, currentPayload)
    styleSimilarity = style_similarity(basePayload, currentPayload)
    structuralSimilarity = structural_similarity(basePayload, currentPayload)

    # Text-level scores are computed for every snapshot: each row needs
    # them, and the change test below reads cosineSimilarity.
    archiveText = text_from_html(currentPayload)
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(
        basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(
        basePayloadFilteredText, filteredArchiveText)
    editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(
        basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(
        basePayloadFilteredText, filteredArchiveText)

    if overallSimilarity < SIMILARITY_THRESHOLD or cosineSimilarity < COSINE_THRESHOLD:
        hasContentChanged = 1
        lastSavedDataIndex = i
        # Promote this snapshot to baseline.
        basePayload = currentPayload
        basePayloadText = archiveText
        basePayloadFilteredText = filteredArchiveText

    dataset.append([
        overallSimilarity, styleSimilarity, structuralSimilarity,
        cosineSimilarity, jaccardSimilarity, editDistanceSimilarity,
        sorensenDiceSimilarity, hasContentChanged,
    ])

# One label per value in each row. The third column holds the structural
# similarity, so it gets its own label instead of repeating 'styleSimilarity'.
df = pd.DataFrame(dataset, columns=[
    'similarity', 'styleSimilarity', 'structuralSimilarity',
    'cosine', 'jaccard', 'editDistance', 'sorensen', 'changed',
])
```
%% Cell type:code id: tags:
``` python
# Display the per-snapshot similarity table (a notebook cell renders the value of its last expression).
df
```
%% Output
similarity styleSimilarity styleSimilarity changed
0 1.000000 1.000000 1.000000 0
1 1.000000 1.000000 1.000000 0
2 0.381462 0.242424 0.520499 1
3 0.383244 0.242424 0.524064 1
similarity styleSimilarity styleSimilarity cosine jaccard \
0 1.000000 1.000000 1.000000 1.000000 1.000000
1 0.054290 0.067961 0.040619 0.791283 0.639430
2 0.057191 0.067961 0.046422 0.791283 0.639430
3 0.004076 0.000000 0.008152 0.000000 0.000000
4 0.002717 0.000000 0.005435 0.000000 0.000000
5 1.000000 1.000000 1.000000 1.000000 1.000000
6 0.054290 0.067961 0.040619 0.791283 0.639430
7 0.057191 0.067961 0.046422 0.791283 0.639430
8 0.522592 0.403846 0.641337 0.958929 0.921058
9 0.456482 0.403846 0.509119 0.958929 0.921058
10 0.477532 0.262626 0.692437 0.862712 0.746641
11 0.411985 0.262626 0.561345 0.862712 0.746641
12 0.529099 0.384615 0.673583 0.953176 0.908992
13 0.459666 0.384615 0.534717 0.953176 0.908992
14 0.477532 0.262626 0.692437 0.862712 0.746641
15 0.579166 0.843750 0.314582 0.451621 0.204802
16 0.233851 0.242718 0.224983 0.517600 0.268598
17 1.000000 1.000000 1.000000 1.000000 1.000000
18 0.520570 0.375000 0.666140 0.949281 0.902434
editDistance sorensen changed
0 1.000000 1.000000 0
1 0.271394 0.780064 1
2 0.271394 0.780064 1
3 0.000000 0.000000 1
4 0.000000 0.000000 1
5 1.000000 1.000000 0
6 0.271394 0.780064 1
7 0.271394 0.780064 1
8 0.729919 0.958907 1
9 0.729919 0.958907 1
10 0.716349 0.854945 1
11 0.716349 0.854945 1
12 0.744145 0.952327 1
13 0.744145 0.952327 1
14 0.716349 0.854945 1
15 0.203484 0.339976 1
16 0.255152 0.423456 1
17 1.000000 1.000000 0
18 0.744364 0.948715 1
%% Cell type:code id: tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment