Commit e14fb7fe authored by Ritesh Bansal
Added integration with focused crawl and SVM/random forest classifiers

parent 51f8cc34
import pandas as pd
import numpy as np
import os
from Trie import Trie
from urllib.parse import urlparse
from archiveTextClassifier import classiyRF, classiySVM
from ResponseParser import responseParser

# making data: read every per-day parquet file and keep only records whose
# MIME-type column is 'text/html' and whose status column is '200'
def buildDataDayWise(folderpath):
    listOfFolder = os.listdir(folderpath)
    data = []
    for i in listOfFolder:
        if not i.endswith('.DS_Store'):
            address = folderpath + i + "/"
            listOfFiles = os.listdir(address)
            for f_name in listOfFiles:
                if f_name.endswith('.parquet'):
                    addressPar = folderpath + i + "/" + f_name
                    dateFiles = pd.read_parquet(addressPar).to_numpy()
                    if len(dateFiles) == 0:
                        continue
                    zz_new = []
                    for j in range(len(dateFiles)):
                        if dateFiles[j][4] == 'text/html' and dateFiles[j][5] == '200':
                            zz_new.append(dateFiles[j])
                    zz_new = np.asarray(zz_new)
                    data.append(zz_new)
    return np.asarray(data)

def dataSplitBuildTest(data_array, threshold):
    # first `threshold` days build the sitemap, the remaining days are used for testing
    data_array = np.asarray(data_array)
    data_train = data_array[0:threshold]
    data_test = data_array[threshold:]
    return data_train, data_test

# making sitemap tree: build one Trie per unique domain from the training days
def makingSitemapTree(data_train):
    data_train = np.asarray(data_train)
    sitemapdomains = {}
    for dayData in data_train:
        dayDataNP = np.asarray(dayData)
        for i in range(len(dayDataNP)):
            # column 3: URL, column 2: timestamp, column 13: raw HTTP response payload
            url = dayDataNP[i][3]
            if not url.endswith('/'):
                url = url + "/"
            parsedurl = urlparse(os.path.splitext(url)[0])
            if parsedurl.hostname not in sitemapdomains:
                sitemapdomains[parsedurl.hostname] = Trie()
                sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
                sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
            sitemap = sitemapdomains[parsedurl.hostname]
            timestamp = dayDataNP[i][2]
            payload = dayDataNP[i][13]
            payload = responseParser(payload).read().decode("utf-8")
            sitemap.insert(parsedurl.path, timestamp, payload)
    return sitemapdomains

# testing sitemap tree: replay the held-out days and decide, per URL, whether to re-crawl
def testingSitemapTreeClassiyRF(sitemapdomains, data_test):
    data_test = np.asarray(data_test)
    for dayData in data_test:
        dayDataNP = np.asarray(dayData)
        for i in range(len(dayDataNP)):
            url = dayDataNP[i][3]
            if not url.endswith('/'):
                url = url + "/"
            parsedurl = urlparse(os.path.splitext(url)[0])
            if parsedurl.hostname not in sitemapdomains:
                sitemapdomains[parsedurl.hostname] = Trie()
                sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
                sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
            sitemap = sitemapdomains[parsedurl.hostname]
            timestamp = dayDataNP[i][2]
            payload = dayDataNP[i][13]
            payload = responseParser(payload).read().decode("utf-8")
            # if the URL changes the sitemap structure, crawl it unconditionally;
            # otherwise let the random forest classifier decide whether to crawl
            structchange = sitemap.isStructureChange(parsedurl.path)
            if structchange:
                sitemap.insert(parsedurl.path, timestamp, payload)
            else:
                nodeData = sitemap.extractNodeData(parsedurl.path)
                new_data = {'timestamp': timestamp, 'payload': payload}
                nodeDataModified = []
                for key, val in nodeData.items():
                    nodeDataModified.append({'timestamp': key, 'payload': val})
                tocrawl = classiyRF(nodeDataModified, new_data)
                if tocrawl[0]:
                    sitemap.insert(parsedurl.path, timestamp, payload)
    return sitemapdomains

# same replay as above, but the crawl/no-crawl decision is made by the SVM classifier
def testingSitemapTreeClassiySVM(sitemapdomains, data_test):
    data_test = np.asarray(data_test)
    for dayData in data_test:
        dayDataNP = np.asarray(dayData)
        for i in range(len(dayDataNP)):
            url = dayDataNP[i][3]
            if not url.endswith('/'):
                url = url + "/"
            parsedurl = urlparse(os.path.splitext(url)[0])
            if parsedurl.hostname not in sitemapdomains:
                sitemapdomains[parsedurl.hostname] = Trie()
                sitemapdomains[parsedurl.hostname].root.name = parsedurl.hostname
                sitemapdomains[parsedurl.hostname].matrixElements[parsedurl.hostname] = 0
            sitemap = sitemapdomains[parsedurl.hostname]
            timestamp = dayDataNP[i][2]
            payload = dayDataNP[i][13]
            payload = responseParser(payload).read().decode("utf-8")
            # check for structure change; crawl unconditionally if the structure changed
            structchange = sitemap.isStructureChange(parsedurl.path)
            if structchange:
                sitemap.insert(parsedurl.path, timestamp, payload)
            else:
                nodeData = sitemap.extractNodeData(parsedurl.path)
                new_data = {'timestamp': timestamp, 'payload': payload}
                nodeDataModified = []
                for key, val in nodeData.items():
                    nodeDataModified.append({'timestamp': key, 'payload': val})
                tocrawl = classiySVM(nodeDataModified, new_data)
                if tocrawl[0]:
                    sitemap.insert(parsedurl.path, timestamp, payload)
    return sitemapdomains

def extractSitemap(sitemapdomains, domainName):
    return sitemapdomains[domainName]


def createCopySitemap(sitemapdomains, domainName):
    sitemap = sitemapdomains[domainName]
    return sitemap.extract()


def getSitemapForTimestamp(sitemapdomains, domainName, startTimestamp, endTimeStamp):
    sitemap = sitemapdomains[domainName]
    return sitemap.extract(startTimestamp, endTimeStamp)


def compareTwoSiteMaps(sitemap1, sitemap2):
    return sitemap1.comparison(sitemap2.root)


def extractMatrixSiteMap(sitemapdomains, domainName):
    sitemap = sitemapdomains[domainName]
    return np.asarray(sitemap.ancestorMatrix())
import numpy as np
import CNN_1hour2level as cnnFocusCrawl

# making data
folderpath = "CNN_1hour2level/"
data_array = cnnFocusCrawl.buildDataDayWise(folderpath)
data_array = np.asarray(data_array)
print(len(data_array))

# split data: the first 100 days build the sitemap, the rest are replayed as the test stream
threshold = [100]
data_train, data_test = cnnFocusCrawl.dataSplitBuildTest(data_array, threshold[0])

# making sitemap tree, one Trie per unique domain
sitemapdomains = cnnFocusCrawl.makingSitemapTree(data_train)

# testing sitemap tree with the random forest crawl/no-crawl classifier
sitemapdomains = cnnFocusCrawl.testingSitemapTreeClassiyRF(sitemapdomains, data_test)

edition_cnn_com = sitemapdomains['www.cnn.com']
edition_cnn_com_Copy = edition_cnn_com.extract()
result = edition_cnn_com.comparison(edition_cnn_com_Copy.root)
print(result)
result = edition_cnn_com.comparison(edition_cnn_com.root)
print(result)
matrix = edition_cnn_com.ancestorMatrix()
matrix = np.asarray(matrix)
print('done')
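
# A minimal follow-up sketch (not in the original script): the same inspection steps
# expressed through the helper functions exposed by CNN_1hour2level, operating on the
# sitemapdomains dict built above.
cnn_sitemap = cnnFocusCrawl.extractSitemap(sitemapdomains, 'www.cnn.com')
cnn_sitemap_copy = cnnFocusCrawl.createCopySitemap(sitemapdomains, 'www.cnn.com')
print(cnnFocusCrawl.compareTwoSiteMaps(cnn_sitemap, cnn_sitemap_copy))
print(cnnFocusCrawl.extractMatrixSiteMap(sitemapdomains, 'www.cnn.com'))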
from http.client import HTTPResponse
from io import BytesIO

# sample raw HTTP response used for testing the parser
http_response_str = """HTTP/1.1 200 OK
Date: Thu, Jul 3 15:27:54 2014
Content-Type: text/xml; charset="utf-8"
Connection: close
Content-Length: 626

teststring"""


class FakeSocket():
    # wraps raw response bytes in a file-like object so HTTPResponse can read them
    def __init__(self, response_bytes):
        self._file = BytesIO(response_bytes)

    def makefile(self, *args, **kwargs):
        return self._file


def responseParser(http_response_str):
    # parse a raw HTTP response string into an http.client.HTTPResponse object
    http_response_bytes = http_response_str.encode()
    source = FakeSocket(http_response_bytes)
    response = HTTPResponse(source)
    response.begin()
    return response
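
# A minimal usage sketch (not part of the original file): parse the sample response
# above and inspect the status line and a header as reported by http.client.
if __name__ == "__main__":
    parsed = responseParser(http_response_str)
    print(parsed.status, parsed.reason)      # 200 OK
    print(parsed.getheader("Content-Type"))  # text/xml; charset="utf-8"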
#!/usr/bin/env python
# coding: utf-8

# In[2]:

import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
from bs4.element import Comment
from collections import Counter
from scipy.spatial import distance
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
import spacy
from nltk.metrics import edit_distance
from nltk.metrics import interval_distance
from nltk import jaccard_distance
import textdistance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm

# In[3]:

def tag_visible(element):
    # drop text that lives inside non-visible elements and HTML comments
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


# In[4]:

def text_from_html(htmlPage):
    # extract the visible text of an HTML page as a single space-joined string
    soup = BeautifulSoup(htmlPage, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)


# In[5]:

def split(word):
    # split a string into a list of its characters
    return [char for char in word]

# In[6]:

def filter_text(text):
    # remove punctuation, NLTK and spaCy stopwords, and escaped whitespace from the text
    stop_words = set(stopwords.words('english'))
    stop_words.update(split(string.punctuation))
    nlp = spacy.load('en_core_web_sm')
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    stop_words.update(spacy_stopwords)
    text = text.replace("\\n", "")
    text = text.replace("\\r", "")
    text = text.replace("\\t", "")
    word_tokens_text = word_tokenize(text)
    filtered_text = [w for w in word_tokens_text if w not in stop_words]
    filtered_text = TreebankWordDetokenizer().detokenize(filtered_text)
    return filtered_text

# In[7]:

def classiyRF(archiveData, newRecord):
    # Train a random forest on the page's archived history and predict whether the
    # new record differs enough from the last saved snapshot to be crawled.
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    lastSavedDataIndex = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if i % 100 == 0:
            print(str(i) + " records processed")
        hasContentChanged = False
        # style/structure similarity on the raw HTML
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        # token-level similarity on the filtered visible text
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        # label the snapshot as changed and make it the new baseline
        if overallSimilarity < 0.80 or cosineSimilarity < 0.95:
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    # compute the same six features for the new record against the current baseline
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    X_test = sc.transform([[overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]])
    print("Starting Random Forest Classification")
    classifier = RandomForestClassifier(n_estimators=20, random_state=0)
    classifier.fit(X_train, y)
    y_pred = classifier.predict(X_test)
    return y_pred
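
# In[ ]:

# A minimal usage sketch (not part of the original notebook): classiyRF expects the
# archived history as a list of {'timestamp', 'payload'} dicts plus one new record,
# which is how testingSitemapTreeClassiyRF assembles its inputs. The HTML snippets and
# timestamps below are placeholders, and the NLTK/spaCy resources used by filter_text
# are assumed to be installed.
if __name__ == "__main__":
    page_v1 = '<html><body class="a"><p>breaking news one</p></body></html>'
    page_v2 = '<html><body class="a"><p>breaking news two</p></body></html>'
    archive = [
        {'timestamp': '20200101000000', 'payload': page_v1},
        {'timestamp': '20200102000000', 'payload': page_v2},
    ]
    new_record = {'timestamp': '20200103000000', 'payload': page_v2}
    print(classiyRF(archive, new_record))  # index 0 is the crawl/no-crawl decision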

# In[ ]:

def classiySVM(archiveData, newRecord):
    # Same feature pipeline as classiyRF, but the crawl/no-crawl decision is made by an SVM.
    archiveData.sort(key=lambda x: x['timestamp'], reverse=False)
    basePayload = archiveData[0]['payload']
    basePayloadText = text_from_html(basePayload)
    basePayloadFilteredText = filter_text(basePayloadText)
    lastSavedDataIndex = 0
    dataset = []
    print(str(len(archiveData)) + " datapoints found")
    for i in range(1, len(archiveData)):
        if i % 100 == 0:
            print(str(i) + " records processed")
        hasContentChanged = False
        overallSimilarity = similarity(basePayload, archiveData[i]['payload'])
        styleSimilarity = style_similarity(basePayload, archiveData[i]['payload'])
        structuralSimilarity = structural_similarity(basePayload, archiveData[i]['payload'])
        archiveText = text_from_html(archiveData[i]['payload'])
        filteredArchiveText = filter_text(archiveText)
        cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
        if overallSimilarity < 0.80 or cosineSimilarity < 0.95:
            hasContentChanged = True
            lastSavedDataIndex = i
            basePayload = archiveData[i]['payload']
            basePayloadText = archiveText
            basePayloadFilteredText = filteredArchiveText
        data = [overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity, hasContentChanged]
        dataset.append(data)
    df = pd.DataFrame(dataset, columns=['similarity', 'styleSimilarity', 'structureSimilarity', 'cosine', 'jaccard', 'sorensen', 'changed'])
    print("Dataframe created")
    X = df.iloc[:, 0:6].values
    y = df.iloc[:, 6].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X)
    overallSimilarity = similarity(basePayload, newRecord['payload'])
    styleSimilarity = style_similarity(basePayload, newRecord['payload'])
    structuralSimilarity = structural_similarity(basePayload, newRecord['payload'])
    archiveText = text_from_html(newRecord['payload'])
    filteredArchiveText = filter_text(archiveText)
    cosineSimilarity = textdistance.cosine.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    jaccardSimilarity = textdistance.jaccard.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    sorensenDiceSimilarity = textdistance.sorensen_dice.normalized_similarity(basePayloadFilteredText, filteredArchiveText)
    X_test = sc.transform([[overallSimilarity, styleSimilarity, structuralSimilarity, cosineSimilarity, jaccardSimilarity, sorensenDiceSimilarity]])
    print("Starting SVM Classification")
    classifier = svm.SVC()
    classifier.fit(X_train, y)
    y_pred = classifier.predict(X_test)
    return y_pred