From 97cadb18646f14849ea261bced69cae5ea42db2a Mon Sep 17 00:00:00 2001 From: Ritesh Bansal <riteshobansal@gmail.com> Date: Wed, 11 Dec 2019 15:48:29 -0500 Subject: [PATCH] final models --- CNN_1hour2level.py | 157 --------------------- CNN_1hour2levelMainNew.py | 85 ++++++++++-- MainNewsDataSet.py | 81 ----------- NewCNN_1hour2levelUpdated.py | 159 +++++++++++++++++++++ ParquetfileProcessor.ipynb | 161 +++++++++++++++++++++ Practice3.py | 54 -------- archiveTextClassifier.py | 261 ----------------------------------- main.py | 66 --------- 8 files changed, 392 insertions(+), 632 deletions(-) delete mode 100644 CNN_1hour2level.py delete mode 100644 MainNewsDataSet.py create mode 100644 NewCNN_1hour2levelUpdated.py create mode 100644 ParquetfileProcessor.ipynb delete mode 100644 Practice3.py delete mode 100644 archiveTextClassifier.py delete mode 100644 main.py diff --git a/CNN_1hour2level.py b/CNN_1hour2level.py deleted file mode 100644 index f3e7d37..0000000 --- a/CNN_1hour2level.py +++ /dev/null @@ -1,157 +0,0 @@ -import pandas as pd -import numpy as np -import os as os -from Trie import Trie -from urllib.parse import urlparse -from archiveTextClassifier import classiyRF, classiySVM -from ResponseParser import responseParser - -# making data -def buildDataDayWise (folderpath): - listOfFolder = os.listdir(folderpath) - data = [] - for i in listOfFolder: - if not (i.endswith('.DS_Store')): - address = folderpath + i + "/" - listOfFiles = os.listdir(address) - for f_name in listOfFiles: - if f_name.endswith('.parquet'): - addressPar = folderpath + i + "/" + f_name - dateFiles = pd.read_parquet(addressPar).to_numpy() - if (len(dateFiles) == 0): continue - zz_new = [] - for j in range(len(dateFiles)): - if dateFiles[j][4] == 'text/html' and dateFiles[j][5] == '200': - zz_new.append(dateFiles[j]) - zz_new = np.asarray(zz_new) - data.append(zz_new) - return np.asarray(data) - -def dataSplitBuildTest(data_array, threshold): - data_array = np.asarray(data_array) - data_train = data_array[0:threshold] - data_test = data_array[threshold:len(data_array)] - return data_train, data_test - -#making sitemap tree -def makingSitemapTree (data_train): - data_train = np.asarray(data_train) - #unique domains - sitemapdomains = {} - # sitemapURLS["/"] = "www.vt.edu" - # sitemap = Trie() - for dayData in data_train: - dayDataNP = np.asarray(dayData) - for i in range(len(dayDataNP)): - # parsedurl = urlparse(dayDataNP[i][2]) - url = dayDataNP[i][3] - if (url[len(url) - 1] != '/'): url = url + "/" - parsedurl = urlparse(os.path.splitext(url)[0]) - if not sitemapdomains.__contains__(parsedurl.hostname): - sitemapdomains[parsedurl.hostname] = Trie() - sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname - sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0 - sitemap = sitemapdomains[parsedurl.hostname] - timestamp = dayDataNP[i][2] - payload = dayDataNP[i][13] - payload = responseParser(payload).read().decode("utf-8") - sitemap.insert(parsedurl.path, timestamp, payload) - return sitemapdomains - -def testingSitemapTreeClassiyRF (sitemapdomains, data_test): - data_test = np.asarray(data_test) - # testing sitemap tree - for dayData in data_test: - dayDataNP = np.asarray(dayData) - for i in range(len(dayDataNP)): - # parsedurl = urlparse(dayDataNP[i][2]) - url = dayDataNP[i][3] - if (url[len(url) - 1] != '/'): url = url + "/" - parsedurl = urlparse(os.path.splitext(url)[0]) - if not sitemapdomains.__contains__(parsedurl.hostname): - sitemapdomains[parsedurl.hostname] = Trie() - 
sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname - sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0 - sitemap = sitemapdomains[parsedurl.hostname] - timestamp = dayDataNP[i][2] - payload = dayDataNP[i][13] - payload = responseParser(payload).read().decode("utf-8") - # Check for structure change - structchange = sitemap.isStructureChange(parsedurl.path) - if (structchange): - sitemap.insert(parsedurl.path, timestamp, payload) - else: - nodeData = sitemap.extractNodeData(parsedurl.path) - new_data = {} - new_data['timestamp'] = timestamp - new_data['payload'] = payload - nodeDataMpdified = [] - for key,val in nodeData.items(): - temp = {} - temp['timestamp'] = key - temp['payload'] = val - nodeDataMpdified.append(temp) - tocrawl = classiyRF(nodeDataMpdified, new_data) - # if yes, crawl - # if no, use classifier to check for to crawl or not - if(tocrawl[0]): - sitemap.insert(parsedurl.path, timestamp, payload) - return sitemapdomains - -def testingSitemapTreeClassiySVM (sitemapdomains, data_test): - data_test = np.asarray(data_test) - # testing sitemap tree - for dayData in data_test: - dayDataNP = np.asarray(dayData) - for i in range(len(dayDataNP)): - # parsedurl = urlparse(dayDataNP[i][2]) - url = dayDataNP[i][3] - if (url[len(url) - 1] != '/'): url = url + "/" - parsedurl = urlparse(os.path.splitext(url)[0]) - if not sitemapdomains.__contains__(parsedurl.hostname): - sitemapdomains[parsedurl.hostname] = Trie() - sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname - sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0 - sitemap = sitemapdomains[parsedurl.hostname] - timestamp = dayDataNP[i][2] - payload = dayDataNP[i][13] - payload = responseParser(payload).read().decode("utf-8") - # Check for structure change - structchange = sitemap.isStructureChange(parsedurl.path) - if (structchange): - sitemap.insert(parsedurl.path, timestamp, payload) - else: - nodeData = sitemap.extractNodeData(parsedurl.path) - new_data = {} - new_data['timestamp'] = timestamp - new_data['payload'] = payload - nodeDataMpdified = [] - for key,val in nodeData.items(): - temp = {} - temp['timestamp'] = key - temp['payload'] = val - nodeDataMpdified.append(temp) - tocrawl = classiySVM(nodeDataMpdified, new_data) - # if yes, crawl - # if no, use classifier to check for to crawl or not - if(tocrawl[0]): - sitemap.insert(parsedurl.path, timestamp, payload) - return sitemapdomains - -def extractSitemap(sitemapdomains, domainName): - return sitemapdomains[domainName] - -def createCopySitemap(sitemapdomains, domainName): - sitemap = sitemapdomains[domainName] - return sitemap.extract() - -def getSitemapForTimestamp(sitemapdomains, domainName, startTimestamp, endTimeStamp): - sitemap = sitemapdomains[domainName] - return sitemap.extract(startTimestamp,endTimeStamp) - -def compareTwoSiteMaps (sitemap1, sitemap2): - return sitemap1.comparison(sitemap2.root) - -def extractMatrixSiteMap (sitemapdomains, domainName): - sitemap = sitemapdomains[domainName] - return np.asarray(sitemap.ancestorMatrix()) diff --git a/CNN_1hour2levelMainNew.py b/CNN_1hour2levelMainNew.py index 825f05e..5607a13 100644 --- a/CNN_1hour2levelMainNew.py +++ b/CNN_1hour2levelMainNew.py @@ -1,23 +1,44 @@ - +import NewCNN_1hour2levelUpdated as cnnFocusCrawl +from numpy import save +from functools import cmp_to_key import numpy as np -import CNN_1hour2level as cnnFocusCrawl +import pandas as pd +from fbprophet import Prophet +import math + + # making data folderpath = 
"CNN_1hour2level/" -data_array = cnnFocusCrawl.buildDataDayWise(folderpath) -data_array = np.asarray(data_array) -print(len(data_array)) + +#uncomment when use to build data +# data_array = cnnFocusCrawl.buildDataDayWise(folderpath) + +#cnnNodeData2.pkl will be generated in above step, loading over here +archiveDataDataFrameLoad = pd.read_pickle('cnnNodeData2.pkl').values +clean_archiveDataDataFrameLoad = cnnFocusCrawl.cleanDataSet(archiveDataDataFrameLoad) +def compare(item1, item2): + if(item1[0]['timestamp']>item2[0]['timestamp']): + return +1 + elif (item1[0]['timestamp']<item2[0]['timestamp']): + return -1 + else : + return 0 + +#sorting data +clean_archiveDataDataFrameLoad.sort(key=cmp_to_key(compare)) #split data -threshold = [100] -data_train, data_test = cnnFocusCrawl.dataSplitBuildTest(data_array, threshold[0]) +threshold = [len(clean_archiveDataDataFrameLoad)] +data_train, data_test = cnnFocusCrawl.dataSplitBuildTest(clean_archiveDataDataFrameLoad, threshold[0]) + # making sitemap tree #unique domains -sitemapdomains = cnnFocusCrawl.makingSitemapTree(data_train) -# sitemapURLS["/"] = "www.vt.edu" -# sitemap = Trie() +sitemapdomains, changeNodesMatrix = cnnFocusCrawl.makingSitemapTree(data_train, 0.75) +changeNodesMatrix = np.asarray(changeNodesMatrix) +newNodesDataset = pd.DataFrame({'DS': changeNodesMatrix[:, 0], 'Y': changeNodesMatrix[:, 1]}) +newNodesPerChangeDataset = pd.DataFrame({'DS': changeNodesMatrix[:, 0], 'Y': changeNodesMatrix[:, 2]}) -# testing sitemap tree -sitemapdomains = cnnFocusCrawl.testingSitemapTreeClassiyRF(sitemapdomains, data_test) +save('data.npy', changeNodesMatrix) edition_cnn_com = sitemapdomains['www.cnn.com'] edition_cnn_com_Copy = edition_cnn_com.extract("","") @@ -27,4 +48,42 @@ result = edition_cnn_com.isSame(edition_cnn_com.root) print(result) matrix = edition_cnn_com.ancestorMatrix() matrix = np.asarray(matrix) -print('done') \ No newline at end of file +print('done') + +data = np.load('data.npy')[1:,:] + +# SVM +trainEx = math.floor(len(newNodesPerChangeDataset)*0.8) +X = [] +y = [] + +windowSize = 4 + +for i in range(len(data) - windowSize -1): + bound = min(i + windowSize, len(data)) + window = data[i:bound, 2] + windowLabel = data[bound, 4] + X.append(window) + y.append(windowLabel) + +cnnFocusCrawl.results(X,y, trainEx) + +newNodesPerChangeDataset = pd.DataFrame({'ds': data[:, 0], 'y': data[:, 3], 'floor': np.array(data[:, 5]).astype(np.float), 'cap': np.array(data[:, 6]).astype(np.float)}) +trainEx = math.floor(len(newNodesPerChangeDataset)*0.8) + +def dataSplitBuildTest(data_array, threshold): + data_train = data_array[0:threshold] + data_test = data_array[threshold:len(data_array)] + print('split done ') + return data_train, data_test + +# Nodes added Prediction +df_Nodes, data_test2 = dataSplitBuildTest(newNodesPerChangeDataset, trainEx) +m2 = Prophet(growth='logistic') +m2.fit(df_Nodes) +allfuture2 = m2.make_future_dataframe(periods=72, freq="1H", include_history=True) +allfuture2['cap'] = 0.75 +allfuture2['floor'] = 0 +allforecast2 = m2.predict(allfuture2) +fig3 = m2.plot(allforecast2) +fig4 = m2.plot_components(allforecast2) \ No newline at end of file diff --git a/MainNewsDataSet.py b/MainNewsDataSet.py deleted file mode 100644 index 52be144..0000000 --- a/MainNewsDataSet.py +++ /dev/null @@ -1,81 +0,0 @@ -import pyarrow.parquet as pq -import pandas as pd -import numpy as np -import os as os -from Trie import Trie -from urllib.parse import urlparse - -# making data -folderNews = "CNN_focuscrawls/" -listOfFolder = 
os.listdir(folderNews) -data = [] -for i in listOfFolder: - if not(i.endswith('.DS_Store')): - address = folderNews+i+"/" - listOfFiles = os.listdir(address) - for f_name in listOfFiles: - if f_name.endswith('.parquet'): - addressPar = folderNews + i + "/"+f_name - dateFiles = pd.read_parquet(addressPar).to_numpy() - if(len(dateFiles)==0) : continue - zz_new = [] - for j in range(len(dateFiles)): - if dateFiles[j][4] == 'text/html' and dateFiles[j][5] == '200': - zz_new.append(dateFiles[j]) - zz_new = np.asarray(zz_new) - data.append(zz_new) -data_array = np.asarray(data) - -# threshold = [100] -threshold = [len(data_array)] - -# making sitemap tree -data_train = data_array[0:threshold[0]] - -#unique domains -sitemapdomains = {} -# sitemapURLS["/"] = "www.vt.edu" -# sitemap = Trie() - -for dayData in data_train: - dayDataNP = np.asarray(dayData) - for i in range(len(dayDataNP)): - # parsedurl = urlparse(dayDataNP[i][2]) - parsedurl = urlparse(os.path.splitext(dayDataNP[i][3])[0]) - if not sitemapdomains.__contains__(parsedurl.hostname): - sitemapdomains[parsedurl.hostname] = Trie() - sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname - sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0 - sitemap = sitemapdomains[parsedurl.hostname] - timestamp = dayDataNP[i][2] - payload = dayDataNP[i][13] - isnewpath, newnodepath = sitemap.insert(parsedurl.path, timestamp, payload) - if isnewpath: print(newnodepath) - # if not(sitemapURLS.__contains__(parsedurl.path)): - # sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2] - # sitemap.insert(parsedurl.path, timestamp, payload) - -edition_cnn_com = sitemapdomains['edition.cnn.com'] -edition_cnn_com_Copy = edition_cnn_com.extract() -result = edition_cnn_com.comparison(edition_cnn_com_Copy.root) -print(result) -result = edition_cnn_com.comparison(edition_cnn_com.root) -print(result) - -matrix = edition_cnn_com.ancestorMatrix() -matrix = np.asarray(matrix) -print('done') - - - - - - - - - - - - - - diff --git a/NewCNN_1hour2levelUpdated.py b/NewCNN_1hour2levelUpdated.py new file mode 100644 index 0000000..4be5ac7 --- /dev/null +++ b/NewCNN_1hour2levelUpdated.py @@ -0,0 +1,159 @@ +import pandas as pd +import numpy as np +import os as os +from Trie import Trie +from urllib.parse import urlparse +from ResponseParser import responseParser +import datetime +from sklearn import svm +from sklearn.metrics import classification_report +from sklearn.metrics import confusion_matrix +from sklearn.ensemble import RandomForestClassifier + +# making data +def buildDataDayWise (folderpath): + listOfFolder = os.listdir(folderpath) + data = [] + parquentFilesAddress = [] + for i in listOfFolder: + if not (i.endswith('.DS_Store')): + address = folderpath + i + "/" + listOfFiles = os.listdir(address) + for f_name in listOfFiles: + if f_name.endswith('.parquet'): + addressPar = folderpath + i + "/" + f_name + parquentFilesAddress.append(addressPar) + parquentFilesAddress.sort(); + + # h5_file = h5py.File("cnnData.h5") + # dst = h5_file.create_dataset("myvideo", shape=(len(parquentFilesAddress),)) + + for addressPar in parquentFilesAddress: + dateFiles = pd.read_parquet(addressPar) + if (len(dateFiles) == 0): continue + zz_new = [] + tm = dateFiles.iloc[0].filename.split('.')[0].split('-')[1] + for i in range(len(dateFiles)): + if dateFiles.iloc[i].mime == 'text/html' and dateFiles.iloc[i].status == '200': + currentData = {} + currentData['timestamp'] = dateFiles.iloc[i].filename.split('.')[0].split('-')[1] + # currentData['timestamp'] = 
dateFiles.iloc[i].timestamp + currentData['originalUrl'] = dateFiles.iloc[i].originalUrl + # currentData['mime'] = dateFiles.iloc[i].mime + currentData['payload'] = dateFiles.iloc[i].payload + # currentData = [dateFiles.iloc[i].filename.split('.')[0].split('-')[1],dateFiles.iloc[i].originalUrl, dateFiles.iloc[i].payload ] + zz_new.append(currentData) + df = pd.DataFrame(zz_new) + df.to_csv('my_csv.csv', mode='a', header=False) + # h5_file.create_array('/', 'Cnn_data_for_{}'.format(tm), np.asarray(zz_new)) + data.append(zz_new) + print('Data Processed') + # h5_file.close() + return data + +def dataSplitBuildTest(data_array, threshold): + data_train = data_array[0:threshold] + data_test = data_array[threshold:len(data_array)] + print('split done ') + return data_train, data_test + +def cleanDataSet (data_array): + data = [] + for i in range(len(data_array)): + zz_new = [] + for j in range(len(data_array[i])): + if data_array[i][j] != None : + zz_new.append(data_array[i][j]) + data.append(zz_new) + return data + +#making sitemap tree +def makingSitemapTree (data_train, threshold): + changeNodesMatrix = [] + #unique domains + lastCheckpoint = 0; + intialValue = 0; + baseTimestamp = datetime.datetime.strptime(data_train[0][0]['timestamp'], '%Y%m%d%H%M%S') + if(len(data_train)>0): lastCheckpoint = len(data_train[0]) + sitemapdomains = {} + for dayDataNP in data_train: + counter = 0 + for i in range(len(dayDataNP)): + url = dayDataNP[i]['originalUrl'] + if (url[len(url) - 1] != '/'): url = url + "/" + parsedurl = urlparse(os.path.splitext(url)[0]) + if not sitemapdomains.__contains__(parsedurl.hostname): + sitemapdomains[parsedurl.hostname] = Trie() + sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname + sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0 + sitemap = sitemapdomains[parsedurl.hostname] + timestamp = dayDataNP[i]['timestamp'] + # payload = dayDataNP[i]['payload'] + payload = ''; + # payload = responseParser(payload).read().decode("utf-8") + isnewpath,newNodePath = sitemap.insert(parsedurl.path, timestamp, payload) + if(isnewpath): counter = counter+1 + intialValue = intialValue+counter + percentagechange = (intialValue/lastCheckpoint)*100 + label = 0 + if(percentagechange>=threshold): + intialValue = 0 + lastCheckpoint = len(dayDataNP) + label = 1 + dateString = str(dayDataNP[0]['timestamp']) + timestamp = datetime.datetime.strptime(dateString, '%Y%m%d%H%M%S').strftime("%Y-%m-%d %H:%M:%S") + + baseTimestampDiff = ((datetime.datetime.strptime(dateString,'%Y%m%d%H%M%S') - baseTimestamp).total_seconds())/3600.0 + change = [timestamp, baseTimestampDiff, counter, percentagechange, label, 0, 0.75] + changeNodesMatrix.append(change) + print(dateString+' sitemap done') + return [sitemapdomains, changeNodesMatrix] + +def results(X,y, trainEx): + X = np.array(X) + y = np.array(y) + X_train = X[0:trainEx, :] + y_train = np.reshape(y[0:trainEx], (len(y[0:trainEx]), 1)) + X_test = X[trainEx:, :] + y_test = np.reshape(y[trainEx:], (len(y[trainEx:]), 1)) + print("Starting SVM Classification") + regressor = svm.SVC() + regressor.fit(X_train, y_train) + y_pred = regressor.predict(X_test) + y_pred = np.reshape(np.array(y_pred), (len(y_pred), 1)) + print(y_pred) + print(classification_report(y_test, y_pred)) + print(confusion_matrix(y_test, y_pred)) + print("Accuracy: SVM --> " + str(np.mean(y_test == y_pred))) + + print("Starting RF Classification") + + clf = RandomForestClassifier() + clf.fit(X_train, y_train) + y_pred = clf.predict(X_test) + + # print(y_pred) + 
print(classification_report(y_test, y_pred)) + print(confusion_matrix(y_test, y_pred)) + RF_ = np.mean(y_test == y_pred) + print("Accuracy: RF --> " + str(np.mean(y_test == y_pred))) + +def extractSitemap(sitemapdomains, domainName): + return sitemapdomains[domainName] + +def createCopySitemap(sitemapdomains, domainName): + sitemap = sitemapdomains[domainName] + return sitemap.extract() + +def getSitemapForTimestamp(sitemapdomains, domainName, startTimestamp, endTimeStamp): + sitemap = sitemapdomains[domainName] + return sitemap.extract(startTimestamp,endTimeStamp) + +def compareTwoSiteMaps (sitemap1, sitemap2): + return sitemap1.comparison(sitemap2.root) + +def extractMatrixSiteMap (sitemapdomains, domainName): + sitemap = sitemapdomains[domainName] + return np.asarray(sitemap.ancestorMatrix()) + + diff --git a/ParquetfileProcessor.ipynb b/ParquetfileProcessor.ipynb new file mode 100644 index 0000000..e4d61d0 --- /dev/null +++ b/ParquetfileProcessor.ipynb @@ -0,0 +1,161 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "outputs": [], + "source": [ + "import os as os\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql import SQLContext\n", + "import pandas as pd\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": false + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "root = 'data'" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "parquentFilesAddress = []\n", + "for root, dirs, files in os.walk(root):\n", + " path = root.split(os.sep)\n", + " for file in files:\n", + " if file.endswith(\".parquet\"):\n", + " parquentFilesAddress.append(os.path.join(root, file))\n", + "\n", + "print(str(len(parquentFilesAddress)) + \" parquet files found\")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "spark = SparkSession.builder \\\n", + " .master(\"local[*]\")\\\n", + " .config(\"spark.executor.memory\", \"70g\")\\\n", + " .config(\"spark.driver.memory\", \"50g\")\\\n", + " .config(\"spark.memory.offHeap.enabled\", \"true\")\\\n", + " .config(\"spark.memory.offHeap.size\", \"14g\")\\\n", + " .appName(\"sampleCodeForReference\")\\\n", + " .config(\"spark.driver.cores\", \"12\")\\\n", + " .getOrCreate()\n", + "\n", + "spark.conf.set(\"spark.sql.parquet.enableVectorizedReader\",\"false\")\n", + "\n", + "sc = spark.sparkContext\n", + "\n", + "sqlContext = SQLContext(sc)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "archiveData = []\n", + "for addressPar in parquentFilesAddress:\n", + " zz_new = []\n", + " dateFiles = sqlContext.read.parquet(addressPar)\n", + " print(addressPar)\n", + " # print(dateFiles.count())\n", + " data = dateFiles.select('filename', 'originalUrl', 'mime' , 'status').collect()\n", + " for i in range(len(data)):\n", + " if data[i]['mime'] == 'text/html' and data[i]['status'] == '200':\n", + " currentData = {}\n", + " currentData['timestamp'] = data[i]['filename'].split('.')[0].split('-')[1]\n", + " currentData['originalUrl'] = data[i]['originalUrl']\n", + " zz_new.append(currentData)\n", + " print(addressPar+' Processed')\n", + " print(str(len(zz_new)))\n", + " 
archiveData.append(zz_new)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "archiveDataDataFrame = pd.DataFrame(archiveData)\n", + "archiveDataDataFrame.to_pickle('cnnNodeData.pkl')\n", + "print('Data Processed')\n", + "\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/Practice3.py b/Practice3.py deleted file mode 100644 index a7be56f..0000000 --- a/Practice3.py +++ /dev/null @@ -1,54 +0,0 @@ -import os as os -from pyspark.sql import SparkSession -from pyspark.sql import SQLContext -import pandas as pd - -folderpath = "CNN_1hour2level/" -listOfFolder = os.listdir(folderpath) -parquentFilesAddress = [] -for i in listOfFolder: - if not (i.endswith('.DS_Store')): - address = folderpath + i + "/" - listOfFiles = os.listdir(address) - for f_name in listOfFiles: - if f_name.endswith('.parquet'): - addressPar = folderpath + i + "/" + f_name - parquentFilesAddress.append(addressPar) -parquentFilesAddress.sort(); -spark = SparkSession.builder \ - .master("local[*]")\ - .config("spark.executor.memory", "70g")\ - .config("spark.driver.memory", "50g")\ - .config("spark.memory.offHeap.enabled", "true")\ - .config("spark.memory.offHeap.size", "14g")\ - .config("spark.driver.cores", "4")\ - .appName("sampleCodeForReference")\ - .getOrCreate() -spark.conf.set("spark.sql.parquet.enableVectorizedReader","false") - -sc = spark.sparkContext - -# using SQLContext to read parquet file -sqlContext = SQLContext(sc) -archiveData = [] - -# to read parquet file -for addressPar in parquentFilesAddress: - zz_new = [] - dateFiles = sqlContext.read.parquet(addressPar) - print(addressPar) - # print(dateFiles.count()) - for i in range(1,dateFiles.count()+1): - # print(i) - currentData = {} - currentData['timestamp'] = dateFiles.rdd.take(i)[0]['filename'].split('.')[0].split('-')[1] - # currentData['timestamp'] = dateFiles.iloc[i].timestamp - currentData['originalUrl'] = dateFiles.rdd.take(i)[0]['originalUrl'] - # currentData['mime'] = dateFiles.iloc[i].mime - currentData['payload'] = dateFiles.rdd.take(i)[0]['payload'] - zz_new.append(currentData) - print(addressPar+' Processed') - archiveData.append(zz_new) -archiveData = pd.DataFrame(archiveData) -archiveData.to_pickle('cnnNodeData.pkl') -print('Data Processed') diff --git a/archiveTextClassifier.py b/archiveTextClassifier.py deleted file mode 100644 index c197622..0000000 --- a/archiveTextClassifier.py +++ /dev/null @@ -1,261 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# In[2]: - - -import os -import pandas as pd -from html_similarity import style_similarity, structural_similarity, similarity -from bs4 import BeautifulSoup, Doctype -from bs4.element import Comment -from collections import Counter -from scipy.spatial import distance -from nltk.corpus import stopwords -from nltk.tokenize import word_tokenize -from 
nltk.tokenize.treebank import TreebankWordDetokenizer -import string -import spacy -from nltk.metrics import edit_distance -from nltk.metrics import edit_distance -from nltk.metrics import interval_distance -from nltk import jaccard_distance -import textdistance -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import StandardScaler -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import accuracy_score -from sklearn import svm - - -# In[3]: - - -def tag_visible(element): - if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']: - return False - if isinstance(element, Comment): - return False - return True - - -# In[4]: - - -def text_from_html(htmlPage): - soup = BeautifulSoup(htmlPage, 'html.parser') - texts = soup.findAll(text=True) - visible_texts = filter(tag_visible, texts) - return u" ".join(t.strip() for t in visible_texts) - - -# In[5]: - - -def split(word): - return [char for char in word] - - -# In[6]: - - -def filter_text(text): - stop_words = set(stopwords.words('english')) - stop_words.update(split(string.punctuation)) - nlp = spacy.load('en_core_web_sm') - spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS - stop_words.update(spacy_stopwords) - #stop_words.update(["\\t","\\n","\\r"]) - - text = text.replace("\\n", "") - text = text.replace("\\r", "") - text = text.replace("\\t", "") - - word_tokens_text = word_tokenize(text) - - filtered_text = [w for w in word_tokens_text if not w in stop_words] - - filtered_text = TreebankWordDetokenizer().detokenize(filtered_text) - - return filtered_text - - -# In[ ]: - - - - - -# In[ ]: - - - - - -# In[7]: - - -def classiyRF(archiveData, newRecord): - archiveData.sort(key=lambda x: x['timestamp'], reverse=False) - - basePayload = archiveData[0]['payload'] - basePayloadText = text_from_html(basePayload) - basePayloadFilteredText = filter_text(basePayloadText) - lastSavedDataIndex = 0 - dataset = [] - - print(str(len(archiveData)) + " datapoints found") - - for i in range(1, len(archiveData)): - if(i % 100 is 0): - print(str(i) + " Records processed") - - hasContentChanged = False - - overallSimilarity = similarity(basePayload, archiveData[i]['payload']) - styleSimilarity = style_similarity(basePayload, archiveData[i]['payload']) - structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload']) - - archiveText = text_from_html(archiveData[i]['payload']) - filteredArchiveText = filter_text(archiveText) - - cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - - if(overallSimilarity < 0.80 or cosineSimilarity < 0.95): - hasContentChanged = True - lastSavedDataIndex = i - basePayload = archiveData[i]['payload'] - basePayloadText = archiveText - basePayloadFilteredText = filteredArchiveText - - data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged] - dataset.append(data) - - - - df = pd.DataFrame(dataset, columns = ['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed']) - print("Dataframe 
created") - - X = df.iloc[:, 0:6].values - y = df.iloc[:, 6].values - - sc = StandardScaler() - X_train = sc.fit_transform(X) - - - overallSimilarity = similarity(basePayload, newRecord['payload']) - styleSimilarity = style_similarity(basePayload, newRecord['payload']) - structuralSimilarity = structural_similarity(basePayload, newRecord['payload']) - - archiveText = text_from_html(newRecord['payload']) - filteredArchiveText = filter_text(archiveText) - - cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - - X_test = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity] - - - print("Starting Random Forest Classification") - - regressor = RandomForestClassifier(n_estimators=20, random_state=0) - regressor.fit(X_train, y) - y_pred = regressor.predict([X_test]) - - return y_pred - - - - - - - - -# In[ ]: - - -def classiySVM(archiveData, newRecord): - archiveData.sort(key=lambda x: x['timestamp'], reverse=False) - - basePayload = archiveData[0]['payload'] - basePayloadText = text_from_html(basePayload) - basePayloadFilteredText = filter_text(basePayloadText) - lastSavedDataIndex = 0 - dataset = [] - - print(str(len(archiveData)) + " datapoints found") - - for i in range(1, len(archiveData)): - if(i % 100 is 0): - print(str(i) + " Records processed") - - hasContentChanged = False - - overallSimilarity = similarity(basePayload, archiveData[i]['payload']) - styleSimilarity = style_similarity(basePayload, archiveData[i]['payload']) - structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload']) - - archiveText = text_from_html(archiveData[i]['payload']) - filteredArchiveText = filter_text(archiveText) - - cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - - if(overallSimilarity < 0.80 or cosineSimilarity < 0.95): - hasContentChanged = True - lastSavedDataIndex = i - basePayload = archiveData[i]['payload'] - basePayloadText = archiveText - basePayloadFilteredText = filteredArchiveText - - data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged] - dataset.append(data) - - - - df = pd.DataFrame(dataset, columns = ['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed']) - print("Dataframe created") - - X = df.iloc[:, 0:6].values - y = df.iloc[:, 6].values - - sc = StandardScaler() - X_train = sc.fit_transform(X) - - - overallSimilarity = similarity(basePayload, newRecord['payload']) - styleSimilarity = style_similarity(basePayload, newRecord['payload']) - structuralSimilarity = structural_similarity(basePayload, newRecord['payload']) - - archiveText = 
text_from_html(newRecord['payload']) - filteredArchiveText = filter_text(archiveText) - - cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText , filteredArchiveText) - - X_test = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity] - - - print("Starting SVM Classification") - - regressor = svm.SVC() - regressor.fit(X_train, y) - y_pred = regressor.predict([X_test]) - - return y_pred - - - - - - - diff --git a/main.py b/main.py deleted file mode 100644 index 9952f90..0000000 --- a/main.py +++ /dev/null @@ -1,66 +0,0 @@ -import pyarrow.parquet as pq -import pandas as pd -import numpy as np -import os as os -from Trie import Trie -from urllib.parse import urlparse - -# making data -listOfFolder = os.listdir("vt.edu/") -data = [] -for i in listOfFolder: - if not(i.endswith('.DS_Store')): - address = "vt.edu/"+i+"/" - listOfFiles = os.listdir(address) - for f_name in listOfFiles: - if f_name.endswith('.parquet'): - addressPar = "vt.edu/" + i + "/"+f_name - dateFiles = pd.read_parquet(addressPar).to_numpy() - if(len(dateFiles)==0) : continue - zz_new = [] - for j in range(len(dateFiles)): - if dateFiles[j][3] == 'text/html' and dateFiles[j][4] == '200': - zz_new.append(dateFiles[j]) - zz_new = np.asarray(zz_new) - data.append(zz_new) -data_array = np.asarray(data) - -# threshold = [100] -threshold = [len(data_array)] - -# making sitemap tree -data_train = data_array[0:threshold[0]] - -#unique domains -sitemapdomains = {} -# sitemapURLS["/"] = "www.vt.edu" -# sitemap = Trie() - -for dayData in data_train: - dayDataNP = np.asarray(dayData) - for i in range(len(dayDataNP)): - # parsedurl = urlparse(dayDataNP[i][2]) - parsedurl = urlparse(os.path.splitext(dayDataNP[i][2])[0]) - if not sitemapdomains.__contains__(parsedurl.hostname): - sitemapdomains[parsedurl.hostname] = Trie() - sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname - sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0 - sitemap = sitemapdomains[parsedurl.hostname] - timestamp = dayDataNP[i][1] - payload = dayDataNP[i][9] - isnewpath, newnodepath = sitemap.insert(parsedurl.path, timestamp, payload) - if isnewpath: print(newnodepath) - # if not(sitemapURLS.__contains__(parsedurl.path)): - # sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2] - # sitemap.insert(parsedurl.path, timestamp, payload) - -vtTree = sitemapdomains['www.vt.edu'] -vtTreeCopy = vtTree.extract('20140906125541','20141215204723') -result = vtTree.comparison(vtTreeCopy.root) -print(result) -result = vtTree.comparison(vtTree.root) -print(result) - -matrix = vtTree.ancestorMatrix() -matrix = np.asarray(matrix) -print('done') -- GitLab
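
Reference sketches (not part of the patch): the two standalone snippets below illustrate the modelling steps this commit introduces, under stated assumptions. The first mirrors the sliding-window change classification performed by the window-building loop in CNN_1hour2levelMainNew.py together with NewCNN_1hour2levelUpdated.results(): a window of the last few per-snapshot new-node counts is used to predict whether the next snapshot is labelled a "significant change", with an SVM and a Random Forest evaluated on a chronological 80/20 split. The change-count series and the labelling rule here are synthetic stand-ins; in the patch the counts and labels come from the changeNodesMatrix produced by makingSitemapTree().

# Standalone sketch of windowed change classification (synthetic data).
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

rng = np.random.default_rng(0)
n_snapshots = 200
counts = rng.poisson(lam=20, size=n_snapshots).astype(float)  # new nodes per hourly snapshot (synthetic)
labels = (counts > 25).astype(int)                            # stand-in for the threshold label in the patch

window_size = 4
X, y = [], []
for i in range(len(counts) - window_size - 1):
    bound = i + window_size
    X.append(counts[i:bound])   # features: the last `window_size` change counts
    y.append(labels[bound])     # target: label of the snapshot that follows the window
X, y = np.array(X), np.array(y)

split = int(len(X) * 0.8)       # chronological 80/20 split, as in the patch
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

for name, clf in [("SVM", svm.SVC()), ("RF", RandomForestClassifier())]:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(name, "accuracy:", np.mean(y_pred == y_test))
    print(classification_report(y_test, y_pred, zero_division=0))
    print(confusion_matrix(y_test, y_pred))

The second sketch mirrors the logistic-growth Prophet forecast at the end of CNN_1hour2levelMainNew.py. The hourly change-rate series below is synthetic; the floor of 0, cap of 0.75, the 72-hour horizon, and the fbprophet package name are taken from the patch.

# Standalone sketch of the Prophet logistic-growth forecast (synthetic series).
import numpy as np
import pandas as pd
from fbprophet import Prophet  # pre-1.0 package name, as used in the patch

hours = pd.date_range("2019-01-01", periods=240, freq="H")
rate = np.clip(0.4 + 0.2 * np.sin(np.arange(240) * 2 * np.pi / 24), 0, 0.75)  # synthetic daily cycle
df = pd.DataFrame({"ds": hours, "y": rate, "floor": 0.0, "cap": 0.75})

m = Prophet(growth="logistic")
m.fit(df)
future = m.make_future_dataframe(periods=72, freq="1H", include_history=True)
future["floor"] = 0.0
future["cap"] = 0.75
forecast = m.predict(future)
print(forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail())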