Skip to content
Snippets Groups Projects

HTML_Similarity

Closed namanahuja requested to merge HTMLSim into master
+ 97
0
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "HTML_Similarity.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "2zXzAV9mRb-L",
"colab_type": "code",
"colab": {}
},
"source": [
"# -*- coding: utf-8 -*-\n",
"\n",
"import pandas as pd\n",
"from html_similarity import style_similarity, structural_similarity, similarity\n",
"from bs4 import BeautifulSoup, Doctype\n",
"import imgkit"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "XgqSvWUYRwui",
"colab_type": "code",
"colab": {}
},
"source": [
"# Load the records from data.parquet and keep only the HTML payloads that\n",
"# are longer than one character.\n",
"file = pd.read_parquet('data.parquet', engine='pyarrow')\n",
"numRows = len(file.index)\n",
"\n",
"# Parallel lists: timestamps[k] is the timestamp of validPayloads[k].\n",
"validPayloads = []\n",
"timestamps = []\n",
"\n",
"# TODO: check for only vt.edu\n",
"\n",
"# Walk the three columns together instead of building a row Series with\n",
"# file.iloc[i] for every field of every row.\n",
"for payload, mime, timestamp in zip(file['payload'], file['mime'], file['timestamp']):\n",
"    # Test the MIME type and the payload type first so len() is never\n",
"    # called on a missing (None/NaN) payload.\n",
"    if mime == 'text/html' and isinstance(payload, (str, bytes)) and len(payload) > 1:\n",
"        validPayloads.append(payload)\n",
"        timestamps.append(timestamp)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Q9UNsU7gRt0Y",
"colab_type": "code",
"colab": {}
},
"source": [
"# Output paths for rendered captures. The imgkit call is currently disabled,\n",
"# so this loop only computes the file name for each payload.\n",
"for i in range(len(validPayloads)):\n",
"    outFileName = 'captures/' + str(timestamps[i]) + '.jpg'\n",
"\n",
"    #imgkit.from_string(validPayloads[i], outFileName)\n",
"\n",
"# scores[i][j] is str(similarity(validPayloads[i], validPayloads[j]));\n",
"# -1 marks a pair whose comparison raised.\n",
"numPayloads = len(validPayloads)\n",
"scores = [[-1] * numPayloads for _ in range(numPayloads)]\n",
"for i, payload1 in enumerate(validPayloads):\n",
"    for j, payload2 in enumerate(validPayloads):\n",
"        try:\n",
"            scores[i][j] = str(similarity(payload1, payload2))\n",
"        except Exception as exc:\n",
"            # Catch Exception rather than using a bare except so that\n",
"            # KeyboardInterrupt can still stop this O(n^2) loop, and report\n",
"            # why the pair failed instead of only its indices.\n",
"            print(i, j, repr(exc))"
],
"execution_count": 0,
"outputs": []
}
]
}
\ No newline at end of file
Loading