Skip to content
Snippets Groups Projects
Commit 5ba0d09e authored by namanahuja's avatar namanahuja
Browse files

Delete HTML_Similarity.ipynb

parent ab26ffe0
Branches textCompare
No related tags found
No related merge requests found
%% Cell type:code id: tags:
```
# -*- coding: utf-8 -*-
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
import imgkit
```
%% Cell type:code id: tags:
```
# Load the captured records and keep the HTML payloads together with their
# timestamps. validPayloads[k] and timestamps[k] always describe the same row.
file = pd.read_parquet('data.parquet', engine='pyarrow')
numRows = len(file.index)
validPayloads = []
timestamps = []
# Walk the three columns in lockstep instead of re-indexing the frame with
# .iloc three times per row.
for payload, mime, timestamp in zip(file['payload'], file['mime'], file['timestamp']):
    # NOTE(review): an earlier comment here read "check for only vt.edu", but
    # no domain filter is applied; only the MIME type and the payload length
    # are checked. Confirm whether a vt.edu filter is still wanted.
    if mime == 'text/html' and len(payload) > 1:
        validPayloads.append(payload)
        timestamps.append(timestamp)
```
%% Cell type:code id: tags:
```
# Build the screenshot path for every kept payload. The imgkit rendering call
# is disabled, so this loop currently only leaves the last path in outFileName.
for i in range(len(validPayloads)):
    outFileName = 'captures/' + str(timestamps[i]) + '.jpg'
    #imgkit.from_string(validPayloads[i], outFileName)

# Pairwise similarity matrix: scores[i][j] holds str(similarity(...)) for
# payloads i and j, and keeps the int placeholder -1 when that comparison
# raised.
scores = [[-1 for _ in validPayloads] for _ in validPayloads]
for i, payload1 in enumerate(validPayloads):
    for j, payload2 in enumerate(validPayloads):
        try:
            score = str(similarity(payload1, payload2))
            scores[i][j] = score
        except Exception:
            # Report the failing pair and move on. Catching Exception rather
            # than using a bare except keeps KeyboardInterrupt / SystemExit
            # working, so this long-running double loop can still be stopped.
            print(i, j)
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment