From ab26ffe0fdc7be83c601d98319e477ce6e9bf64a Mon Sep 17 00:00:00 2001
From: namanahuja <namanahuja@vt.edu>
Date: Fri, 8 Nov 2019 17:48:18 -0500
Subject: [PATCH] HTML

---
 ExampleNotebooks/HTML_Similarity.ipynb | 97 ++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 ExampleNotebooks/HTML_Similarity.ipynb

diff --git a/ExampleNotebooks/HTML_Similarity.ipynb b/ExampleNotebooks/HTML_Similarity.ipynb
new file mode 100644
index 0000000..b2be1f4
--- /dev/null
+++ b/ExampleNotebooks/HTML_Similarity.ipynb
@@ -0,0 +1,97 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "HTML_Similarity.ipynb",
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "2zXzAV9mRb-L",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# -*- coding: utf-8 -*-\n",
+        "\n",
+        "import pandas as pd\n",
+        "from html_similarity import style_similarity, structural_similarity, similarity\n",
+        "from bs4 import BeautifulSoup, Doctype\n",
+        "import imgkit"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "XgqSvWUYRwui",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Load the capture records; the loop below reads the payload, mime and timestamp columns.\n",
+        "file = pd.read_parquet('data.parquet', engine='pyarrow')\n",
+        "numRows = len(file.index)\n",
+        "\n",
+        "# validPayloads[k] and timestamps[k] always describe the same record.\n",
+        "validPayloads = []\n",
+        "timestamps = []\n",
+        "\n",
+        "# Iterate the columns in lockstep instead of calling file.iloc[i] three times per row.\n",
+        "for payload, mime, timestamp in zip(file['payload'], file['mime'], file['timestamp']):\n",
+        "    # TODO: check for only vt.edu\n",
+        "\n",
+        "    # Test the MIME type first so len() is only evaluated on HTML payloads.\n",
+        "    # (The unused BeautifulSoup parse of every payload was dropped.)\n",
+        "    if mime == 'text/html' and len(payload) > 1:\n",
+        "        validPayloads.append(payload)\n",
+        "        timestamps.append(timestamp)\n",
+        "\n"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Q9UNsU7gRt0Y",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Screenshot rendering is disabled for now; outFileName is the path each capture would use.\n",
+        "for i in range(len(validPayloads)):\n",
+        "    outFileName = 'captures/' + str(timestamps[i]) + '.jpg'\n",
+        "    #imgkit.from_string(validPayloads[i], outFileName)\n",
+        "\n",
+        "# scores[i][j] is str(similarity(validPayloads[i], validPayloads[j]));\n",
+        "# -1 is left in place for any pair whose comparison raised.\n",
+        "numPayloads = len(validPayloads)\n",
+        "scores = [[-1] * numPayloads for _ in range(numPayloads)]\n",
+        "\n",
+        "# Compare every ordered pair of payloads.\n",
+        "for i, payload1 in enumerate(validPayloads):\n",
+        "    for j, payload2 in enumerate(validPayloads):\n",
+        "        try:\n",
+        "            score = str(similarity(payload1, payload2))\n",
+        "        except Exception as err:\n",
+        "            # A bare except would also swallow KeyboardInterrupt, making this\n",
+        "            # quadratic loop impossible to interrupt; report the failing pair instead.\n",
+        "            print(i, j, repr(err))\n",
+        "        else:\n",
+        "            scores[i][j] = score"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
-- 
GitLab