Commit 97cadb18 authored by Ritesh Bansal
final models

parent dfe60618
Branch: siteStructure
import pandas as pd
import numpy as np
import os as os
from Trie import Trie
from urllib.parse import urlparse
from archiveTextClassifier import classiyRF, classiySVM
from ResponseParser import responseParser
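# NOTE (inferred from how rows are indexed below, not from a schema file): the
# parquet rows are consumed as positional numpy arrays where
# [2] = timestamp, [3] = originalUrl, [4] = mime, [5] = status, [13] = payload.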
# making data
def buildDataDayWise(folderpath):
    listOfFolder = os.listdir(folderpath)
    data = []
    for i in listOfFolder:
        if not i.endswith('.DS_Store'):
            address = folderpath + i + "/"
            listOfFiles = os.listdir(address)
            for f_name in listOfFiles:
                if f_name.endswith('.parquet'):
                    addressPar = folderpath + i + "/" + f_name
                    dateFiles = pd.read_parquet(addressPar).to_numpy()
                    if len(dateFiles) == 0: continue
                    zz_new = []
                    for j in range(len(dateFiles)):
                        if dateFiles[j][4] == 'text/html' and dateFiles[j][5] == '200':
                            zz_new.append(dateFiles[j])
                    zz_new = np.asarray(zz_new)
                    data.append(zz_new)
    return np.asarray(data)
def dataSplitBuildTest(data_array, threshold):
    data_array = np.asarray(data_array)
    data_train = data_array[0:threshold]
    data_test = data_array[threshold:len(data_array)]
    return data_train, data_test
# making sitemap tree
def makingSitemapTree(data_train):
    data_train = np.asarray(data_train)
    # unique domains
    sitemapdomains = {}
    # sitemapURLS["/"] = "www.vt.edu"
    # sitemap = Trie()
    for dayData in data_train:
        dayDataNP = np.asarray(dayData)
        for i in range(len(dayDataNP)):
            # parsedurl = urlparse(dayDataNP[i][2])
            url = dayDataNP[i][3]
            if url[len(url) - 1] != '/': url = url + "/"
            parsedurl = urlparse(os.path.splitext(url)[0])
            if parsedurl.hostname not in sitemapdomains:
                sitemapdomains[parsedurl.hostname] = Trie()
                sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
                sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
            sitemap = sitemapdomains[parsedurl.hostname]
            timestamp = dayDataNP[i][2]
            payload = dayDataNP[i][13]
            payload = responseParser(payload).read().decode("utf-8")
            sitemap.insert(parsedurl.path, timestamp, payload)
    return sitemapdomains
def testingSitemapTreeClassiyRF(sitemapdomains, data_test):
    data_test = np.asarray(data_test)
    # testing sitemap tree
    for dayData in data_test:
        dayDataNP = np.asarray(dayData)
        for i in range(len(dayDataNP)):
            # parsedurl = urlparse(dayDataNP[i][2])
            url = dayDataNP[i][3]
            if url[len(url) - 1] != '/': url = url + "/"
            parsedurl = urlparse(os.path.splitext(url)[0])
            if parsedurl.hostname not in sitemapdomains:
                sitemapdomains[parsedurl.hostname] = Trie()
                sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
                sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
            sitemap = sitemapdomains[parsedurl.hostname]
            timestamp = dayDataNP[i][2]
            payload = dayDataNP[i][13]
            payload = responseParser(payload).read().decode("utf-8")
            # Check for structure change: if the path is new, crawl (insert) it;
            # otherwise let the classifier decide whether to crawl it again.
            structchange = sitemap.isStructureChange(parsedurl.path)
            if structchange:
                sitemap.insert(parsedurl.path, timestamp, payload)
            else:
                nodeData = sitemap.extractNodeData(parsedurl.path)
                new_data = {}
                new_data['timestamp'] = timestamp
                new_data['payload'] = payload
                nodeDataModified = []
                for key, val in nodeData.items():
                    temp = {}
                    temp['timestamp'] = key
                    temp['payload'] = val
                    nodeDataModified.append(temp)
                tocrawl = classiyRF(nodeDataModified, new_data)
                if tocrawl[0]:
                    sitemap.insert(parsedurl.path, timestamp, payload)
    return sitemapdomains
def testingSitemapTreeClassiySVM(sitemapdomains, data_test):
    data_test = np.asarray(data_test)
    # testing sitemap tree
    for dayData in data_test:
        dayDataNP = np.asarray(dayData)
        for i in range(len(dayDataNP)):
            # parsedurl = urlparse(dayDataNP[i][2])
            url = dayDataNP[i][3]
            if url[len(url) - 1] != '/': url = url + "/"
            parsedurl = urlparse(os.path.splitext(url)[0])
            if parsedurl.hostname not in sitemapdomains:
                sitemapdomains[parsedurl.hostname] = Trie()
                sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
                sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
            sitemap = sitemapdomains[parsedurl.hostname]
            timestamp = dayDataNP[i][2]
            payload = dayDataNP[i][13]
            payload = responseParser(payload).read().decode("utf-8")
            # Check for structure change: if the path is new, crawl (insert) it;
            # otherwise let the classifier decide whether to crawl it again.
            structchange = sitemap.isStructureChange(parsedurl.path)
            if structchange:
                sitemap.insert(parsedurl.path, timestamp, payload)
            else:
                nodeData = sitemap.extractNodeData(parsedurl.path)
                new_data = {}
                new_data['timestamp'] = timestamp
                new_data['payload'] = payload
                nodeDataModified = []
                for key, val in nodeData.items():
                    temp = {}
                    temp['timestamp'] = key
                    temp['payload'] = val
                    nodeDataModified.append(temp)
                tocrawl = classiySVM(nodeDataModified, new_data)
                if tocrawl[0]:
                    sitemap.insert(parsedurl.path, timestamp, payload)
    return sitemapdomains
def extractSitemap(sitemapdomains, domainName):
    return sitemapdomains[domainName]

def createCopySitemap(sitemapdomains, domainName):
    sitemap = sitemapdomains[domainName]
    return sitemap.extract()

def getSitemapForTimestamp(sitemapdomains, domainName, startTimestamp, endTimeStamp):
    sitemap = sitemapdomains[domainName]
    return sitemap.extract(startTimestamp, endTimeStamp)

def compareTwoSiteMaps(sitemap1, sitemap2):
    return sitemap1.comparison(sitemap2.root)

def extractMatrixSiteMap(sitemapdomains, domainName):
    sitemap = sitemapdomains[domainName]
    return np.asarray(sitemap.ancestorMatrix())
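# Minimal end-to-end sketch using only the functions defined in this module
# (the folder path and domain name are examples taken from the driver script below):
#
#   data_array = buildDataDayWise("CNN_1hour2level/")
#   data_train, data_test = dataSplitBuildTest(data_array, 100)
#   sitemapdomains = makingSitemapTree(data_train)
#   sitemapdomains = testingSitemapTreeClassiyRF(sitemapdomains, data_test)
#   cnn_matrix = extractMatrixSiteMap(sitemapdomains, 'www.cnn.com')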
import NewCNN_1hour2levelUpdated as cnnFocusCrawl
from numpy import save
from functools import cmp_to_key
import numpy as np
import CNN_1hour2level as cnnFocusCrawl
import pandas as pd
from fbprophet import Prophet
import math
# making data
folderpath = "CNN_1hour2level/"
data_array = cnnFocusCrawl.buildDataDayWise(folderpath)
data_array = np.asarray(data_array)
print(len(data_array))
#uncomment when use to build data
# data_array = cnnFocusCrawl.buildDataDayWise(folderpath)
#cnnNodeData2.pkl will be generated in above step, loading over here
archiveDataDataFrameLoad = pd.read_pickle('cnnNodeData2.pkl').values
clean_archiveDataDataFrameLoad = cnnFocusCrawl.cleanDataSet(archiveDataDataFrameLoad)
def compare(item1, item2):
    if item1[0]['timestamp'] > item2[0]['timestamp']:
        return +1
    elif item1[0]['timestamp'] < item2[0]['timestamp']:
        return -1
    else:
        return 0
#sorting data
clean_archiveDataDataFrameLoad.sort(key=cmp_to_key(compare))
#split data
threshold = [100]
data_train, data_test = cnnFocusCrawl.dataSplitBuildTest(data_array, threshold[0])
threshold = [len(clean_archiveDataDataFrameLoad)]
data_train, data_test = cnnFocusCrawl.dataSplitBuildTest(clean_archiveDataDataFrameLoad, threshold[0])
# making sitemap tree
#unique domains
sitemapdomains = cnnFocusCrawl.makingSitemapTree(data_train)
# sitemapURLS["/"] = "www.vt.edu"
# sitemap = Trie()
sitemapdomains, changeNodesMatrix = cnnFocusCrawl.makingSitemapTree(data_train, 0.75)
changeNodesMatrix = np.asarray(changeNodesMatrix)
newNodesDataset = pd.DataFrame({'DS': changeNodesMatrix[:, 0], 'Y': changeNodesMatrix[:, 1]})
newNodesPerChangeDataset = pd.DataFrame({'DS': changeNodesMatrix[:, 0], 'Y': changeNodesMatrix[:, 2]})
# testing sitemap tree
sitemapdomains = cnnFocusCrawl.testingSitemapTreeClassiyRF(sitemapdomains, data_test)
save('data.npy', changeNodesMatrix)
edition_cnn_com = sitemapdomains['www.cnn.com']
edition_cnn_com_Copy = edition_cnn_com.extract("","")
result = edition_cnn_com.isSame(edition_cnn_com.root)
print(result)
matrix = edition_cnn_com.ancestorMatrix()
matrix = np.asarray(matrix)
print('done')
data = np.load('data.npy')[1:,:]
# SVM
trainEx = math.floor(len(newNodesPerChangeDataset)*0.8)
X = []
y = []
windowSize = 4
for i in range(len(data) - windowSize - 1):
    bound = min(i + windowSize, len(data))
    window = data[i:bound, 2]
    windowLabel = data[bound, 4]
    X.append(window)
    y.append(windowLabel)
cnnFocusCrawl.results(X, y, trainEx)
newNodesPerChangeDataset = pd.DataFrame({'ds': data[:, 0], 'y': data[:, 3], 'floor': np.array(data[:, 5]).astype(float), 'cap': np.array(data[:, 6]).astype(float)})
trainEx = math.floor(len(newNodesPerChangeDataset)*0.8)
def dataSplitBuildTest(data_array, threshold):
    data_train = data_array[0:threshold]
    data_test = data_array[threshold:len(data_array)]
    print('split done')
    return data_train, data_test
# Nodes added Prediction
df_Nodes, data_test2 = dataSplitBuildTest(newNodesPerChangeDataset, trainEx)
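# Prophet with growth='logistic' expects a 'cap' (and optionally a 'floor') column
# in both the history frame and the future frame, which is why they are added to
# newNodesPerChangeDataset above and to allfuture2 below.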
m2 = Prophet(growth='logistic')
m2.fit(df_Nodes)
allfuture2 = m2.make_future_dataframe(periods=72, freq="1H", include_history=True)
allfuture2['cap'] = 0.75
allfuture2['floor'] = 0
allforecast2 = m2.predict(allfuture2)
fig3 = m2.plot(allforecast2)
fig4 = m2.plot_components(allforecast2)
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import os as os
from Trie import Trie
from urllib.parse import urlparse
# making data
folderNews = "CNN_focuscrawls/"
listOfFolder = os.listdir(folderNews)
data = []
for i in listOfFolder:
    if not i.endswith('.DS_Store'):
        address = folderNews + i + "/"
        listOfFiles = os.listdir(address)
        for f_name in listOfFiles:
            if f_name.endswith('.parquet'):
                addressPar = folderNews + i + "/" + f_name
                dateFiles = pd.read_parquet(addressPar).to_numpy()
                if len(dateFiles) == 0: continue
                zz_new = []
                for j in range(len(dateFiles)):
                    if dateFiles[j][4] == 'text/html' and dateFiles[j][5] == '200':
                        zz_new.append(dateFiles[j])
                zz_new = np.asarray(zz_new)
                data.append(zz_new)
data_array = np.asarray(data)
# threshold = [100]
threshold = [len(data_array)]
# making sitemap tree
data_train = data_array[0:threshold[0]]
#unique domains
sitemapdomains = {}
# sitemapURLS["/"] = "www.vt.edu"
# sitemap = Trie()
for dayData in data_train:
    dayDataNP = np.asarray(dayData)
    for i in range(len(dayDataNP)):
        # parsedurl = urlparse(dayDataNP[i][2])
        parsedurl = urlparse(os.path.splitext(dayDataNP[i][3])[0])
        if parsedurl.hostname not in sitemapdomains:
            sitemapdomains[parsedurl.hostname] = Trie()
            sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
            sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
        sitemap = sitemapdomains[parsedurl.hostname]
        timestamp = dayDataNP[i][2]
        payload = dayDataNP[i][13]
        isnewpath, newnodepath = sitemap.insert(parsedurl.path, timestamp, payload)
        if isnewpath: print(newnodepath)
        # if not(sitemapURLS.__contains__(parsedurl.path)):
        #     sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2]
        # sitemap.insert(parsedurl.path, timestamp, payload)
edition_cnn_com = sitemapdomains['edition.cnn.com']
edition_cnn_com_Copy = edition_cnn_com.extract()
result = edition_cnn_com.comparison(edition_cnn_com_Copy.root)
print(result)
result = edition_cnn_com.comparison(edition_cnn_com.root)
print(result)
matrix = edition_cnn_com.ancestorMatrix()
matrix = np.asarray(matrix)
print('done')
import pandas as pd
import numpy as np
import os as os
from Trie import Trie
from urllib.parse import urlparse
from ResponseParser import responseParser
import datetime
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
# making data
def buildDataDayWise(folderpath):
    listOfFolder = os.listdir(folderpath)
    data = []
    parquentFilesAddress = []
    for i in listOfFolder:
        if not i.endswith('.DS_Store'):
            address = folderpath + i + "/"
            listOfFiles = os.listdir(address)
            for f_name in listOfFiles:
                if f_name.endswith('.parquet'):
                    addressPar = folderpath + i + "/" + f_name
                    parquentFilesAddress.append(addressPar)
    parquentFilesAddress.sort()
    # h5_file = h5py.File("cnnData.h5")
    # dst = h5_file.create_dataset("myvideo", shape=(len(parquentFilesAddress),))
    for addressPar in parquentFilesAddress:
        dateFiles = pd.read_parquet(addressPar)
        if len(dateFiles) == 0: continue
        zz_new = []
        tm = dateFiles.iloc[0].filename.split('.')[0].split('-')[1]
        for i in range(len(dateFiles)):
            if dateFiles.iloc[i].mime == 'text/html' and dateFiles.iloc[i].status == '200':
                currentData = {}
                currentData['timestamp'] = dateFiles.iloc[i].filename.split('.')[0].split('-')[1]
                # currentData['timestamp'] = dateFiles.iloc[i].timestamp
                currentData['originalUrl'] = dateFiles.iloc[i].originalUrl
                # currentData['mime'] = dateFiles.iloc[i].mime
                currentData['payload'] = dateFiles.iloc[i].payload
                # currentData = [dateFiles.iloc[i].filename.split('.')[0].split('-')[1], dateFiles.iloc[i].originalUrl, dateFiles.iloc[i].payload]
                zz_new.append(currentData)
        df = pd.DataFrame(zz_new)
        df.to_csv('my_csv.csv', mode='a', header=False)
        # h5_file.create_array('/', 'Cnn_data_for_{}'.format(tm), np.asarray(zz_new))
        data.append(zz_new)
    print('Data Processed')
    # h5_file.close()
    return data
def dataSplitBuildTest(data_array, threshold):
    data_train = data_array[0:threshold]
    data_test = data_array[threshold:len(data_array)]
    print('split done')
    return data_train, data_test
def cleanDataSet(data_array):
    data = []
    for i in range(len(data_array)):
        zz_new = []
        for j in range(len(data_array[i])):
            if data_array[i][j] is not None:
                zz_new.append(data_array[i][j])
        data.append(zz_new)
    return data
# making sitemap tree
def makingSitemapTree(data_train, threshold):
    changeNodesMatrix = []
    # unique domains
    lastCheckpoint = 0
    initialValue = 0
    baseTimestamp = datetime.datetime.strptime(data_train[0][0]['timestamp'], '%Y%m%d%H%M%S')
    if len(data_train) > 0: lastCheckpoint = len(data_train[0])
    sitemapdomains = {}
    for dayDataNP in data_train:
        counter = 0
        for i in range(len(dayDataNP)):
            url = dayDataNP[i]['originalUrl']
            if url[len(url) - 1] != '/': url = url + "/"
            parsedurl = urlparse(os.path.splitext(url)[0])
            if parsedurl.hostname not in sitemapdomains:
                sitemapdomains[parsedurl.hostname] = Trie()
                sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
                sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
            sitemap = sitemapdomains[parsedurl.hostname]
            timestamp = dayDataNP[i]['timestamp']
            # payload = dayDataNP[i]['payload']
            payload = ''
            # payload = responseParser(payload).read().decode("utf-8")
            isnewpath, newNodePath = sitemap.insert(parsedurl.path, timestamp, payload)
            if isnewpath: counter = counter + 1
        initialValue = initialValue + counter
        percentagechange = (initialValue / lastCheckpoint) * 100
        label = 0
        if percentagechange >= threshold:
            initialValue = 0
            lastCheckpoint = len(dayDataNP)
            label = 1
        dateString = str(dayDataNP[0]['timestamp'])
        timestamp = datetime.datetime.strptime(dateString, '%Y%m%d%H%M%S').strftime("%Y-%m-%d %H:%M:%S")
        baseTimestampDiff = ((datetime.datetime.strptime(dateString, '%Y%m%d%H%M%S') - baseTimestamp).total_seconds()) / 3600.0
        change = [timestamp, baseTimestampDiff, counter, percentagechange, label, 0, 0.75]
        changeNodesMatrix.append(change)
        print(dateString + ' sitemap done')
    return [sitemapdomains, changeNodesMatrix]
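# Each row appended to changeNodesMatrix above is, in order:
# [timestamp string "YYYY-MM-DD HH:MM:SS", hours since the first snapshot,
#  number of new sitemap nodes in this snapshot, cumulative percentage change
#  since the last checkpoint, binary label (1 when the change exceeds the
#  threshold), floor (0), cap (0.75)]; the driver script feeds these columns
#  to results() and to Prophet.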
def results(X, y, trainEx):
    X = np.array(X)
    y = np.array(y)
    X_train = X[0:trainEx, :]
    y_train = np.reshape(y[0:trainEx], (len(y[0:trainEx]), 1))
    X_test = X[trainEx:, :]
    y_test = np.reshape(y[trainEx:], (len(y[trainEx:]), 1))
    print("Starting SVM Classification")
    regressor = svm.SVC()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    y_pred = np.reshape(np.array(y_pred), (len(y_pred), 1))
    print(y_pred)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy: SVM --> " + str(np.mean(y_test == y_pred)))
    print("Starting RF Classification")
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # reshape so the element-wise comparison with y_test is one-to-one
    y_pred = np.reshape(np.array(y_pred), (len(y_pred), 1))
    # print(y_pred)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    RF_ = np.mean(y_test == y_pred)
    print("Accuracy: RF --> " + str(np.mean(y_test == y_pred)))
def extractSitemap(sitemapdomains, domainName):
    return sitemapdomains[domainName]

def createCopySitemap(sitemapdomains, domainName):
    sitemap = sitemapdomains[domainName]
    return sitemap.extract()

def getSitemapForTimestamp(sitemapdomains, domainName, startTimestamp, endTimeStamp):
    sitemap = sitemapdomains[domainName]
    return sitemap.extract(startTimestamp, endTimeStamp)

def compareTwoSiteMaps(sitemap1, sitemap2):
    return sitemap1.comparison(sitemap2.root)

def extractMatrixSiteMap(sitemapdomains, domainName):
    sitemap = sitemapdomains[domainName]
    return np.asarray(sitemap.ancestorMatrix())
%% Cell type:code id: tags:
``` python
import os as os
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
```
%% Cell type:code id: tags:
``` python
root = 'data'
```
%% Cell type:code id: tags:
``` python
parquentFilesAddress = []
for root, dirs, files in os.walk(root):
    path = root.split(os.sep)
    for file in files:
        if file.endswith(".parquet"):
            parquentFilesAddress.append(os.path.join(root, file))
print(str(len(parquentFilesAddress)) + " parquet files found")
```
%% Cell type:code id: tags:
``` python
spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.executor.memory", "70g") \
    .config("spark.driver.memory", "50g") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.memory.offHeap.size", "14g") \
    .appName("sampleCodeForReference") \
    .config("spark.driver.cores", "12") \
    .getOrCreate()
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")
sc = spark.sparkContext
sqlContext = SQLContext(sc)
```
%% Cell type:code id: tags:
``` python
archiveData = []
for addressPar in parquentFilesAddress:
    zz_new = []
    dateFiles = sqlContext.read.parquet(addressPar)
    print(addressPar)
    # print(dateFiles.count())
    data = dateFiles.select('filename', 'originalUrl', 'mime', 'status').collect()
    for i in range(len(data)):
        if data[i]['mime'] == 'text/html' and data[i]['status'] == '200':
            currentData = {}
            currentData['timestamp'] = data[i]['filename'].split('.')[0].split('-')[1]
            currentData['originalUrl'] = data[i]['originalUrl']
            zz_new.append(currentData)
    print(addressPar + ' Processed')
    print(str(len(zz_new)))
    archiveData.append(zz_new)
```
%% Cell type:code id: tags:
``` python
archiveDataDataFrame = pd.DataFrame(archiveData)
archiveDataDataFrame.to_pickle('cnnNodeData.pkl')
print('Data Processed')
```
import os as os
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
folderpath = "CNN_1hour2level/"
listOfFolder = os.listdir(folderpath)
parquentFilesAddress = []
for i in listOfFolder:
    if not i.endswith('.DS_Store'):
        address = folderpath + i + "/"
        listOfFiles = os.listdir(address)
        for f_name in listOfFiles:
            if f_name.endswith('.parquet'):
                addressPar = folderpath + i + "/" + f_name
                parquentFilesAddress.append(addressPar)
parquentFilesAddress.sort()
spark = SparkSession.builder \
    .master("local[*]") \
    .config("spark.executor.memory", "70g") \
    .config("spark.driver.memory", "50g") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.memory.offHeap.size", "14g") \
    .config("spark.driver.cores", "4") \
    .appName("sampleCodeForReference") \
    .getOrCreate()
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")
sc = spark.sparkContext
# using SQLContext to read the parquet files
sqlContext = SQLContext(sc)
archiveData = []
# read each parquet file and build one record per row
for addressPar in parquentFilesAddress:
    zz_new = []
    dateFiles = sqlContext.read.parquet(addressPar)
    print(addressPar)
    # print(dateFiles.count())
    for row in dateFiles.collect():
        currentData = {}
        currentData['timestamp'] = row['filename'].split('.')[0].split('-')[1]
        # currentData['timestamp'] = row['timestamp']
        currentData['originalUrl'] = row['originalUrl']
        # currentData['mime'] = row['mime']
        currentData['payload'] = row['payload']
        zz_new.append(currentData)
    print(addressPar + ' Processed')
    archiveData.append(zz_new)
archiveData = pd.DataFrame(archiveData)
archiveData.to_pickle('cnnNodeData.pkl')
print('Data Processed')
#!/usr/bin/env python
# coding: utf-8
# In[2]:
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
# In[3]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True
# In[4]:
def text_from_html(htmlPage):
    soup = BeautifulSoup(htmlPage, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)
# In[5]:
def split(word):
    return [char for char in word]
# In[6]:
def filter_text(text):
    stop_words = set(stopwords.words('english'))
    stop_words.update(split(string.punctuation))
    nlp = spacy.load('en_core_web_sm')
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    stop_words.update(spacy_stopwords)
    # stop_words.update(["\\t","\\n","\\r"])
    text = text.replace("\\n", "")
    text = text.replace("\\r", "")
    text = text.replace("\\t", "")
    word_tokens_text = word_tokenize(text)
    filtered_text = [w for w in word_tokens_text if w not in stop_words]
    filtered_text = TreebankWordDetokenizer().detokenize(filtered_text)
    return filtered_text
# In[ ]:
# In[ ]:
# In[7]:
def classiyRF(archiveData, newRecord):
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    lastSavedDataIndex = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if i % 100 == 0:
            print(str(i) + " Records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        # editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        if overallSimilarity < 0.80 or cosineSimilarity < 0.95:
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    # editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    X_test = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]
    print("Starting Random Forest Classification")
    regressor = RandomForestClassifier(n_estimators=20, random_state=0)
    regressor.fit(X_train, y)
    # scale the new record's features with the same scaler used for training
    y_pred = regressor.predict(sc.transform([X_test]))
    return y_pred
# In[ ]:
def classiySVM(archiveData, newRecord):
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    lastSavedDataIndex = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if i % 100 == 0:
            print(str(i) + " Records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        # editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        if overallSimilarity < 0.80 or cosineSimilarity < 0.95:
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    # editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    X_test = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]
    print("Starting SVM Classification")
    regressor = svm.SVC()
    regressor.fit(X_train, y)
    # scale the new record's features with the same scaler used for training
    y_pred = regressor.predict(sc.transform([X_test]))
    return y_pred
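# Example call (hypothetical values; the real callers are the
# testingSitemapTreeClassiyRF/SVM functions in the sitemap modules, which build
# the history list from a node's stored snapshots):
#
#   history = [
#       {'timestamp': '20200101000000', 'payload': '<html>...</html>'},
#       {'timestamp': '20200101010000', 'payload': '<html>...</html>'},
#       # ... more hourly snapshots of the same node ...
#   ]
#   new_record = {'timestamp': '20200110000000', 'payload': '<html>...</html>'}
#   to_crawl = classiySVM(history, new_record)  # e.g. array([True]) or array([False])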
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import os as os
from Trie import Trie
from urllib.parse import urlparse
# making data
listOfFolder = os.listdir("vt.edu/")
data = []
for i in listOfFolder:
    if not i.endswith('.DS_Store'):
        address = "vt.edu/" + i + "/"
        listOfFiles = os.listdir(address)
        for f_name in listOfFiles:
            if f_name.endswith('.parquet'):
                addressPar = "vt.edu/" + i + "/" + f_name
                dateFiles = pd.read_parquet(addressPar).to_numpy()
                if len(dateFiles) == 0: continue
                zz_new = []
                for j in range(len(dateFiles)):
                    if dateFiles[j][3] == 'text/html' and dateFiles[j][4] == '200':
                        zz_new.append(dateFiles[j])
                zz_new = np.asarray(zz_new)
                data.append(zz_new)
data_array = np.asarray(data)
# threshold = [100]
threshold = [len(data_array)]
# making sitemap tree
data_train = data_array[0:threshold[0]]
#unique domains
sitemapdomains = {}
# sitemapURLS["/"] = "www.vt.edu"
# sitemap = Trie()
for dayData in data_train:
    dayDataNP = np.asarray(dayData)
    for i in range(len(dayDataNP)):
        # parsedurl = urlparse(dayDataNP[i][2])
        parsedurl = urlparse(os.path.splitext(dayDataNP[i][2])[0])
        if parsedurl.hostname not in sitemapdomains:
            sitemapdomains[parsedurl.hostname] = Trie()
            sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
            sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
        sitemap = sitemapdomains[parsedurl.hostname]
        timestamp = dayDataNP[i][1]
        payload = dayDataNP[i][9]
        isnewpath, newnodepath = sitemap.insert(parsedurl.path, timestamp, payload)
        if isnewpath: print(newnodepath)
        # if not(sitemapURLS.__contains__(parsedurl.path)):
        #     sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2]
        # sitemap.insert(parsedurl.path, timestamp, payload)
vtTree = sitemapdomains['www.vt.edu']
vtTreeCopy = vtTree.extract('20140906125541','20141215204723')
result = vtTree.comparison(vtTreeCopy.root)
print(result)
result = vtTree.comparison(vtTree.root)
print(result)
matrix = vtTree.ancestorMatrix()
matrix = np.asarray(matrix)
print('done')