diff --git a/ExampleNotebooks/HTML_Similarity.ipynb b/ExampleNotebooks/HTML_Similarity.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..b2be1f4efae88547b269776e02b67ac602d22c18
--- /dev/null
+++ b/ExampleNotebooks/HTML_Similarity.ipynb
@@ -0,0 +1,93 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "HTML_Similarity.ipynb",
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "2zXzAV9mRb-L",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# -*- coding: utf-8 -*-\n",
+        "\n",
+        "import pandas as pd\n",
+        "from html_similarity import style_similarity, structural_similarity, similarity\n",
+        "from bs4 import BeautifulSoup, Doctype\n",
+        "import imgkit"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "XgqSvWUYRwui",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Load the records; the loop below reads the payload, mime and\n",
+        "# timestamp columns of every row.\n",
+        "file = pd.read_parquet('data.parquet', engine='pyarrow')\n",
+        "numRows = len(file.index)\n",
+        "\n",
+        "validPayloads = []\n",
+        "timestamps = []\n",
+        "# Walk the three columns in lockstep rather than calling file.iloc[i]\n",
+        "# three times per row, which builds a fresh row Series on every access.\n",
+        "for payload, mime, timestamp in zip(file['payload'], file['mime'], file['timestamp']):\n",
+        "    # TODO: check for only vt.edu\n",
+        "\n",
+        "    # Keep only text/html rows whose payload is longer than one element.\n",
+        "    if mime == 'text/html' and len(payload) > 1:\n",
+        "        validPayloads.append(payload)\n",
+        "        timestamps.append(timestamp)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Q9UNsU7gRt0Y",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "for payload, timestamp in zip(validPayloads, timestamps):\n",
+        "    outFileName = 'captures/' + str(timestamp) + '.jpg'\n",
+        "\n",
+        "    # Rendering each capture to an image is currently disabled.\n",
+        "    #imgkit.from_string(payload, outFileName)\n",
+        "\n",
+        "numPayloads = len(validPayloads)\n",
+        "# -1 marks a pair whose score was not stored because the comparison\n",
+        "# raised.\n",
+        "scores = [[-1] * numPayloads for _ in range(numPayloads)]\n",
+        "for i, payload1 in enumerate(validPayloads):\n",
+        "    for j, payload2 in enumerate(validPayloads):\n",
+        "        try:\n",
+        "            # Scores are stored in their string form.\n",
+        "            scores[i][j] = str(similarity(payload1, payload2))\n",
+        "        except Exception as exc:\n",
+        "            # Report the failing pair and the reason, then keep going.\n",
+        "            # Unlike a bare except, this lets KeyboardInterrupt stop the run.\n",
+        "            print(i, j, exc)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file