From 5ba0d09e537cc14845c1d763059c397aac0aadfb Mon Sep 17 00:00:00 2001 From: namanahuja <namanahuja@vt.edu> Date: Fri, 8 Nov 2019 17:48:31 -0500 Subject: [PATCH] Delete HTML_Similarity.ipynb --- HTML_Similarity.ipynb | 97 ------------------------------------------- 1 file changed, 97 deletions(-) delete mode 100644 HTML_Similarity.ipynb diff --git a/HTML_Similarity.ipynb b/HTML_Similarity.ipynb deleted file mode 100644 index b2be1f4..0000000 --- a/HTML_Similarity.ipynb +++ /dev/null @@ -1,97 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "HTML_Similarity.ipynb", - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "2zXzAV9mRb-L", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# -*- coding: utf-8 -*-\n", - "\n", - "import pandas as pd\n", - "from html_similarity import style_similarity, structural_similarity, similarity\n", - "from bs4 import BeautifulSoup, Doctype\n", - "import imgkit" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "XgqSvWUYRwui", - "colab_type": "code", - "colab": {} - }, - "source": [ - "file = pd.read_parquet('data.parquet', engine='pyarrow')\n", - "numRows = len(file.index)\n", - "\n", - "validPayloads = []\n", - "timestamps = []\n", - "for i in range(numRows):\n", - " payload = file.iloc[i].payload\n", - " mime = file.iloc[i].mime\n", - " timestamp = file.iloc[i].timestamp\n", - "\n", - " soup = (BeautifulSoup(payload, \"html.parser\"))\n", - "\n", - " # check for only vt.edu\n", - "\n", - " if (mime == 'text/html' and len(payload) > 1):\n", - " validPayloads.append(payload)\n", - " timestamps.append(timestamp)\n", - "\n" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Q9UNsU7gRt0Y", - "colab_type": "code", - "colab": {} - }, - "source": [ - "for i in range(len(validPayloads)):\n", - " outFileName = 'captures/' + str(timestamps[i]) + '.jpg'\n", - "\n", - " #imgkit.from_string(validPayloads[i], outFileName)\n", - "\n", - "scores = [[-1 for i in range(len(validPayloads))] for j in range(len(validPayloads))]\n", - "for i in range(len(validPayloads)):\n", - " payload1 = validPayloads[i]\n", - "\n", - " for j in range(len(validPayloads)):\n", - "\n", - " payload2 = validPayloads[j]\n", - "\n", - " try:\n", - " # print(i,j)\n", - " score = str(similarity(payload1, payload2))\n", - " scores[i][j] = score\n", - " # print(score)\n", - "\n", - " except:\n", - " print(i, j)" - ], - "execution_count": 0, - "outputs": [] - } - ] -} \ No newline at end of file -- GitLab