Skip to content
Snippets Groups Projects
Commit ab26ffe0 authored by namanahuja's avatar namanahuja
Browse files

HTML

parent 990412c6
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
```
# -*- coding: utf-8 -*-
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
import imgkit
```
%% Cell type:code id: tags:
```
# Load the capture table and collect the rows that hold a usable HTML page.
#
# Results (kept as notebook-level names for the cells that follow):
#   validPayloads -- payloads whose MIME type is 'text/html' and whose length
#                    is greater than 1
#   timestamps    -- the timestamp of each kept payload, in the same order
file = pd.read_parquet('data.parquet', engine='pyarrow')
numRows = len(file.index)
validPayloads = []
timestamps = []
# Walk the three columns in lockstep instead of rebuilding a row object with
# file.iloc[i] three times per row.
for payload, mime, timestamp in zip(file['payload'], file['mime'], file['timestamp']):
    # NOTE: a "vt.edu only" check was planned here but is not implemented;
    # rows are filtered on MIME type and payload length alone. The previous
    # per-row BeautifulSoup parse was never read, so it has been dropped.
    if mime == 'text/html' and len(payload) > 1:
        validPayloads.append(payload)
        timestamps.append(timestamp)
```
%% Cell type:code id: tags:
```
# Work out the screenshot path for every capture. Rendering itself is
# currently disabled (see the commented-out imgkit call), so this loop only
# leaves the last path in outFileName.
for stamp in timestamps:
    outFileName = 'captures/' + str(stamp) + '.jpg'
    #imgkit.from_string(validPayloads[i], outFileName)

# Pairwise similarity matrix over all kept payloads.
# scores[i][j] is the stringified result of similarity(validPayloads[i],
# validPayloads[j]); cells whose comparison failed keep the -1 placeholder.
numPayloads = len(validPayloads)
scores = [[-1] * numPayloads for _ in range(numPayloads)]
for i, payload1 in enumerate(validPayloads):
    for j, payload2 in enumerate(validPayloads):
        try:
            score = str(similarity(payload1, payload2))
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit still stop this O(n^2) loop.
            # Report the failing pair and leave its cell at -1.
            print(i, j)
        else:
            scores[i][j] = score
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment