From 5ba0d09e537cc14845c1d763059c397aac0aadfb Mon Sep 17 00:00:00 2001
From: namanahuja <namanahuja@vt.edu>
Date: Fri, 8 Nov 2019 17:48:31 -0500
Subject: [PATCH] Delete HTML_Similarity.ipynb

---
 HTML_Similarity.ipynb | 97 -------------------------------------------
 1 file changed, 97 deletions(-)
 delete mode 100644 HTML_Similarity.ipynb

diff --git a/HTML_Similarity.ipynb b/HTML_Similarity.ipynb
deleted file mode 100644
index b2be1f4..0000000
--- a/HTML_Similarity.ipynb
+++ /dev/null
@@ -1,97 +0,0 @@
-{
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "HTML_Similarity.ipynb",
-      "provenance": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    }
-  },
-  "cells": [
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "2zXzAV9mRb-L",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "source": [
-        "# -*- coding: utf-8 -*-\n",
-        "\n",
-        "import pandas as pd\n",
-        "from html_similarity import style_similarity, structural_similarity, similarity\n",
-        "from bs4 import BeautifulSoup, Doctype\n",
-        "import imgkit"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "XgqSvWUYRwui",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "source": [
-        "file = pd.read_parquet('data.parquet', engine='pyarrow')\n",
-        "numRows = len(file.index)\n",
-        "\n",
-        "validPayloads = []\n",
-        "timestamps = []\n",
-        "for i in range(numRows):\n",
-        "    payload = file.iloc[i].payload\n",
-        "    mime = file.iloc[i].mime\n",
-        "    timestamp = file.iloc[i].timestamp\n",
-        "\n",
-        "    soup = (BeautifulSoup(payload, \"html.parser\"))\n",
-        "\n",
-        "    # check for only vt.edu\n",
-        "\n",
-        "    if (mime == 'text/html' and len(payload) > 1):\n",
-        "        validPayloads.append(payload)\n",
-        "        timestamps.append(timestamp)\n",
-        "\n"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "metadata": {
-        "id": "Q9UNsU7gRt0Y",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "source": [
-        "for i in range(len(validPayloads)):\n",
-        "    outFileName = 'captures/' + str(timestamps[i]) + '.jpg'\n",
-        "\n",
-        "    #imgkit.from_string(validPayloads[i], outFileName)\n",
-        "\n",
-        "scores = [[-1 for i in range(len(validPayloads))] for j in range(len(validPayloads))]\n",
-        "for i in range(len(validPayloads)):\n",
-        "    payload1 = validPayloads[i]\n",
-        "\n",
-        "    for j in range(len(validPayloads)):\n",
-        "\n",
-        "        payload2 = validPayloads[j]\n",
-        "\n",
-        "        try:\n",
-        "            # print(i,j)\n",
-        "            score = str(similarity(payload1, payload2))\n",
-        "            scores[i][j] = score\n",
-        "            # print(score)\n",
-        "\n",
-        "        except:\n",
-        "            print(i, j)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
-- 
GitLab