diff --git a/CNN_1hour2level.py b/CNN_1hour2level.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3e7d377d73cafba2e3032675f948e8150de1778
--- /dev/null
+++ b/CNN_1hour2level.py
@@ -0,0 +1,157 @@
+import pandas as pd
+import numpy as np
+import os
+from Trie import Trie
+from urllib.parse import urlparse
+from archiveTextClassifier import classiyRF, classiySVM
+from ResponseParser import responseParser
+
+# making data
+def buildDataDayWise(folderpath):
+    listOfFolder = os.listdir(folderpath)
+    data = []
+    for i in listOfFolder:
+        if not i.endswith('.DS_Store'):
+            address = folderpath + i + "/"
+            listOfFiles = os.listdir(address)
+            for f_name in listOfFiles:
+                if f_name.endswith('.parquet'):
+                    addressPar = folderpath + i + "/" + f_name
+                    dateFiles = pd.read_parquet(addressPar).to_numpy()
+                    if len(dateFiles) == 0: continue
+                    zz_new = []
+                    for j in range(len(dateFiles)):
+                        # keep only successful HTML captures
+                        if dateFiles[j][4] == 'text/html' and dateFiles[j][5] == '200':
+                            zz_new.append(dateFiles[j])
+                    zz_new = np.asarray(zz_new)
+                    data.append(zz_new)
+    return np.asarray(data)
+
+def dataSplitBuildTest(data_array, threshold):
+    data_array = np.asarray(data_array)
+    data_train = data_array[0:threshold]
+    data_test = data_array[threshold:len(data_array)]
+    return data_train, data_test
+
+# making sitemap tree
+def makingSitemapTree(data_train):
+    data_train = np.asarray(data_train)
+    # unique domains
+    sitemapdomains = {}
+    # sitemapURLS["/"] = "www.vt.edu"
+    # sitemap = Trie()
+    for dayData in data_train:
+        dayDataNP = np.asarray(dayData)
+        for i in range(len(dayDataNP)):
+            # parsedurl = urlparse(dayDataNP[i][2])
+            url = dayDataNP[i][3]
+            if not url.endswith('/'): url = url + "/"
+            parsedurl = urlparse(os.path.splitext(url)[0])
+            if parsedurl.hostname not in sitemapdomains:
+                sitemapdomains[parsedurl.hostname] = Trie()
+                sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
+                sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
+            sitemap = sitemapdomains[parsedurl.hostname]
+            timestamp = dayDataNP[i][2]
+            payload = dayDataNP[i][13]
+            payload = responseParser(payload).read().decode("utf-8")
+            sitemap.insert(parsedurl.path, timestamp, payload)
+    return sitemapdomains
+
+def testingSitemapTreeClassiyRF(sitemapdomains, data_test):
+    data_test = np.asarray(data_test)
+    # testing sitemap tree
+    for dayData in data_test:
+        dayDataNP = np.asarray(dayData)
+        for i in range(len(dayDataNP)):
+            # parsedurl = urlparse(dayDataNP[i][2])
+            url = dayDataNP[i][3]
+            if not url.endswith('/'): url = url + "/"
+            parsedurl = urlparse(os.path.splitext(url)[0])
+            if parsedurl.hostname not in sitemapdomains:
+                sitemapdomains[parsedurl.hostname] = Trie()
+                sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
+                sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
+            sitemap = sitemapdomains[parsedurl.hostname]
+            timestamp = dayDataNP[i][2]
+            payload = dayDataNP[i][13]
+            payload = responseParser(payload).read().decode("utf-8")
+            # Check for structure change: a new path is always crawled; otherwise the classifier decides
+            structchange = sitemap.isStructureChange(parsedurl.path)
+            if structchange:
+                sitemap.insert(parsedurl.path, timestamp, payload)
+            else:
+                nodeData = sitemap.extractNodeData(parsedurl.path)
+                new_data = {}
+                new_data['timestamp'] = timestamp
+                new_data['payload'] = payload
+                nodeDataModified = []
+                for key, val in nodeData.items():
+                    temp = {}
+                    temp['timestamp'] = key
+                    temp['payload'] = val
+                    nodeDataModified.append(temp)
+                tocrawl = classiyRF(nodeDataModified, new_data)
+                # tocrawl[0] is True when the classifier predicts the page changed,
+                # in which case the new snapshot is inserted (i.e. the URL is re-crawled)
+                if tocrawl[0]:
+                    sitemap.insert(parsedurl.path, timestamp, payload)
+    return sitemapdomains
+def testingSitemapTreeClassiySVM(sitemapdomains, data_test):
+    data_test = np.asarray(data_test)
+    # testing sitemap tree
+    for dayData in data_test:
+        dayDataNP = np.asarray(dayData)
+        for i in range(len(dayDataNP)):
+            # parsedurl = urlparse(dayDataNP[i][2])
+            url = dayDataNP[i][3]
+            if not url.endswith('/'): url = url + "/"
+            parsedurl = urlparse(os.path.splitext(url)[0])
+            if parsedurl.hostname not in sitemapdomains:
+                sitemapdomains[parsedurl.hostname] = Trie()
+                sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
+                sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
+            sitemap = sitemapdomains[parsedurl.hostname]
+            timestamp = dayDataNP[i][2]
+            payload = dayDataNP[i][13]
+            payload = responseParser(payload).read().decode("utf-8")
+            # Check for structure change: a new path is always crawled; otherwise the classifier decides
+            structchange = sitemap.isStructureChange(parsedurl.path)
+            if structchange:
+                sitemap.insert(parsedurl.path, timestamp, payload)
+            else:
+                nodeData = sitemap.extractNodeData(parsedurl.path)
+                new_data = {}
+                new_data['timestamp'] = timestamp
+                new_data['payload'] = payload
+                nodeDataModified = []
+                for key, val in nodeData.items():
+                    temp = {}
+                    temp['timestamp'] = key
+                    temp['payload'] = val
+                    nodeDataModified.append(temp)
+                tocrawl = classiySVM(nodeDataModified, new_data)
+                # tocrawl[0] is True when the classifier predicts the page changed,
+                # in which case the new snapshot is inserted (i.e. the URL is re-crawled)
+                if tocrawl[0]:
+                    sitemap.insert(parsedurl.path, timestamp, payload)
+    return sitemapdomains
+
+def extractSitemap(sitemapdomains, domainName):
+    return sitemapdomains[domainName]
+
+def createCopySitemap(sitemapdomains, domainName):
+    sitemap = sitemapdomains[domainName]
+    return sitemap.extract()
+
+def getSitemapForTimestamp(sitemapdomains, domainName, startTimestamp, endTimeStamp):
+    sitemap = sitemapdomains[domainName]
+    return sitemap.extract(startTimestamp, endTimeStamp)
+
+def compareTwoSiteMaps(sitemap1, sitemap2):
+    return sitemap1.comparison(sitemap2.root)
+
+def extractMatrixSiteMap(sitemapdomains, domainName):
+    sitemap = sitemapdomains[domainName]
+    return np.asarray(sitemap.ancestorMatrix())
diff --git a/CNN_1hour2levelMainNew.py b/CNN_1hour2levelMainNew.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bebb4094c3a2750019ee97cdf685620024b2289
--- /dev/null
+++ b/CNN_1hour2levelMainNew.py
@@ -0,0 +1,30 @@
+
+import numpy as np
+import CNN_1hour2level as cnnFocusCrawl
+# making data
+folderpath = "CNN_1hour2level/"
+data_array = cnnFocusCrawl.buildDataDayWise(folderpath)
+data_array = np.asarray(data_array)
+print(len(data_array))
+
+# split data
+threshold = [100]
+data_train, data_test = cnnFocusCrawl.dataSplitBuildTest(data_array, threshold[0])
+# making sitemap tree
+# unique domains
+sitemapdomains = cnnFocusCrawl.makingSitemapTree(data_train)
+# sitemapURLS["/"] = "www.vt.edu"
+# sitemap = Trie()
+
+# testing sitemap tree
+sitemapdomains = cnnFocusCrawl.testingSitemapTreeClassiyRF(sitemapdomains, data_test)
+
+edition_cnn_com = sitemapdomains['www.cnn.com']
+edition_cnn_com_Copy = edition_cnn_com.extract()
+result = edition_cnn_com.comparison(edition_cnn_com_Copy.root)
+print(result)
+result = edition_cnn_com.comparison(edition_cnn_com.root)
+print(result)
+matrix = edition_cnn_com.ancestorMatrix()
+matrix = np.asarray(matrix)
+print('done')
\ No newline at end of file
diff --git a/ResponseParser.py b/ResponseParser.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f7b45d4fb4c0e83bb7dd9d7dca0814eb2bf0bc6
--- /dev/null
+++ b/ResponseParser.py
@@ -0,0 +1,22 @@
+from http.client import HTTPResponse
+from io import BytesIO
+
+http_response_str = """HTTP/1.1 200 OK
+Date: Thu, Jul 3 15:27:54 2014
+Content-Type: text/xml; charset="utf-8"
+Connection: close
+Content-Length: 626
+
+teststring"""
+class FakeSocket:
+    def __init__(self, response_bytes):
+        self._file = BytesIO(response_bytes)
+    def makefile(self, *args, **kwargs):
+        return self._file
+
+def responseParser(http_response_str):
+    http_response_bytes = http_response_str.encode()
+    source = FakeSocket(http_response_bytes)
+    response = HTTPResponse(source)
+    response.begin()
+    return response
\ No newline at end of file
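A minimal usage sketch for responseParser, which CNN_1hour2level.py uses to decode archived payloads. It assumes a raw HTTP response string whose Content-Length matches the body (unlike the module-level sample above) and only relies on the standard-library HTTPResponse object the function returns; the raw string below is a hypothetical example.

    from ResponseParser import responseParser

    # Hypothetical, well-formed raw response: the 20-byte body matches the
    # Content-Length header, so HTTPResponse.read() can drain it completely.
    raw = ("HTTP/1.1 200 OK\r\n"
           'Content-Type: text/html; charset="utf-8"\r\n'
           "Content-Length: 20\r\n"
           "\r\n"
           "<html>payload</html>")

    resp = responseParser(raw)              # an http.client.HTTPResponse
    print(resp.status, resp.reason)         # 200 OK
    print(resp.getheader("Content-Type"))   # text/html; charset="utf-8"
    print(resp.read().decode("utf-8"))      # <html>payload</html>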
Records processed") + + hasContentChanged = False + + overallSimilarity = similarity(basePayload, archiveData[i]['payload']) + styleSimilarity = style_similarity(basePayload, archiveData[i]['payload']) + structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload']) + + archiveText = text_from_html(archiveData[i]['payload']) + filteredArchiveText = filter_text(archiveText) + + cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText , filteredArchiveText) + jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText , filteredArchiveText) + #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText , filteredArchiveText) + sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText , filteredArchiveText) + + if(overallSimilarity < 0.80 or cosineSimilarity < 0.95): + hasContentChanged = True + lastSavedDataIndex = i + basePayload = archiveData[i]['payload'] + basePayloadText = archiveText + basePayloadFilteredText = filteredArchiveText + + data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged] + dataset.append(data) + + + + df = pd.DataFrame(dataset, columns = ['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed']) + print("Dataframe created") + + X = df.iloc[:, 0:6].values + y = df.iloc[:, 6].values + + sc = StandardScaler() + X_train = sc.fit_transform(X) + + + overallSimilarity = similarity(basePayload, newRecord['payload']) + styleSimilarity = style_similarity(basePayload, newRecord['payload']) + structuralSimilarity = structural_similarity(basePayload, newRecord['payload']) + + archiveText = text_from_html(newRecord['payload']) + filteredArchiveText = filter_text(archiveText) + + cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText , filteredArchiveText) + jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText , filteredArchiveText) + #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText , filteredArchiveText) + sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText , filteredArchiveText) + + X_test = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity] + + + print("Starting Random Forest Classification") + + regressor = RandomForestClassifier(n_estimators=20, random_state=0) + regressor.fit(X_train, y) + y_pred = regressor.predict([X_test]) + + return y_pred + + + + + + + + +# In[ ]: + + +def classiySVM(archiveData, newRecord): + archiveData.sort(key=lambda x: x['timestamp'], reverse=False) + + basePayload = archiveData[0]['payload'] + basePayloadText = text_from_html(basePayload) + basePayloadFilteredText = filter_text(basePayloadText) + lastSavedDataIndex = 0 + dataset = [] + + print(str(len(archiveData)) + " datapoints found") + + for i in range(1, len(archiveData)): + if(i % 100 is 0): + print(str(i) + " Records processed") + + hasContentChanged = False + + overallSimilarity = similarity(basePayload, archiveData[i]['payload']) + styleSimilarity = style_similarity(basePayload, archiveData[i]['payload']) + structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload']) + + archiveText = text_from_html(archiveData[i]['payload']) + filteredArchiveText = 
+def classiySVM(archiveData, newRecord):
+    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
+
+    basePayload = archiveData[0]['payload']
+    basePayloadText = text_from_html(basePayload)
+    basePayloadFilteredText = filter_text(basePayloadText)
+    lastSavedDataIndex = 0
+    dataset = []
+
+    print(str(len(archiveData)) + " datapoints found")
+
+    for i in range(1, len(archiveData)):
+        if (i % 100 == 0):
+            print(str(i) + " Records processed")
+
+        hasContentChanged = False
+
+        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
+        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
+        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
+
+        archiveText = text_from_html(archiveData[i]['payload'])
+        filteredArchiveText = filter_text(archiveText)
+
+        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
+        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
+        #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
+        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
+
+        # label the snapshot as changed when it drifts far enough from the base payload
+        if (overallSimilarity < 0.80 or cosineSimilarity < 0.95):
+            hasContentChanged = True
+            lastSavedDataIndex = i
+            basePayload = archiveData[i]['payload']
+            basePayloadText = archiveText
+            basePayloadFilteredText = filteredArchiveText
+
+        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
+        dataset.append(data)
+
+    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
+    print("Dataframe created")
+
+    X = df.iloc[:, 0:6].values
+    y = df.iloc[:, 6].values
+
+    sc = StandardScaler()
+    X_train = sc.fit_transform(X)
+
+    overallSimilarity = similarity(basePayload, newRecord['payload'])
+    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
+    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
+
+    archiveText = text_from_html(newRecord['payload'])
+    filteredArchiveText = filter_text(archiveText)
+
+    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
+    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
+    #editDistanceSimilarity = textdistance.levenshtein.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
+    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
+
+    X_test = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]
+
+    print("Starting SVM Classification")
+
+    classifier = svm.SVC()
+    classifier.fit(X_train, y)
+    # scale the new record with the same scaler that was fit on the training features
+    y_pred = classifier.predict(sc.transform([X_test]))
+
+    return y_pred
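For orientation, a minimal sketch of how the two classifier entry points expect to be called, assuming the dependencies imported above are installed (html_similarity, textdistance, NLTK with the stopwords and punkt data, and spaCy with the en_core_web_sm model). The tiny HTML strings and timestamps are hypothetical stand-ins for the archived payloads that extractNodeData would normally yield; they are not part of the project's data.

    from archiveTextClassifier import classiyRF, classiySVM

    # Hypothetical snapshot history of one URL: each record carries a sortable
    # timestamp plus the raw HTML payload, mirroring the format the sitemap
    # trie hands to the classifiers.
    history = [
        {'timestamp': '20200101000000',
         'payload': '<html><body><div class="hero"><p class="story">Breaking news A</p></div></body></html>'},
        {'timestamp': '20200102000000',
         'payload': '<html><body><div class="hero"><p class="story">Breaking news A</p></div></body></html>'},
        {'timestamp': '20200103000000',
         'payload': '<html><body><table class="scores"><tr><td>1</td><td>2</td></tr></table></body></html>'},
    ]
    new_record = {'timestamp': '20200104000000',
                  'payload': '<html><body><div class="hero"><p class="story">Another fresh story C</p></div></body></html>'}

    # Each call returns a one-element prediction array; True means the page is
    # predicted to have changed, so the crawler should fetch it again.
    print(classiyRF(history, new_record))
    print(classiySVM(history, new_record))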