Skip to content
Snippets Groups Projects
Commit 9664f130 authored by Naman Ahuja's avatar Naman Ahuja
Browse files

basic classifier

parent 6c107512
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import os
import pandas as pd
from html_similarity import style_similarity, structural_similarity, similarity
from bs4 import BeautifulSoup, Doctype
```
%% Cell type:code id: tags:
``` python
# Locate every Parquet file stored anywhere beneath the data directory.
root = 'data'        # directory that is searched recursively
modelUrl = 'vt.edu'  # substring used to pick out captures of the site of interest
parquetFiles = []
# A separate loop name keeps os.walk from rebinding `root` while it is
# being walked, so `root` still names the data directory afterwards.
for dirPath, _dirs, files in os.walk(root):
    for file in files:
        if file.endswith(".parquet"):
            parquetFiles.append(os.path.join(dirPath, file))
# Sort so the files are always processed in the same order.
parquetFiles.sort()
```
%% Cell type:code id: tags:
``` python
# Gather the HTML captures that belong to the model site from every Parquet
# file, then order them chronologically.
archiveData = []
for fileName in parquetFiles:
    file = pd.read_parquet(fileName, engine='pyarrow')
    # itertuples reads each row once instead of calling iloc[i] per field.
    for row in file.itertuples(index=False):
        payload = row.payload
        # Cheap filters first: only non-trivial HTML payloads are kept.
        # (The payload is no longer parsed with BeautifulSoup here: the
        # parsed tree was never used and parsing ran on every row.)
        if row.mime != 'text/html' or len(payload) <= 1:
            continue
        # The site name is looked for in the second-to-last '/'-separated
        # component of the URL; URLs with fewer than two components are
        # skipped instead of raising IndexError.
        UriComponents = row.originalUrl.split('/')
        if len(UriComponents) < 2 or modelUrl not in UriComponents[-2]:
            continue
        archiveData.append({'payload': payload, 'timestamp': row.timestamp})
# Oldest capture first.
archiveData.sort(key=lambda capture: capture['timestamp'])
```
%% Output
/home/naman/anaconda3/envs/archive/lib/python3.7/site-packages/pyarrow/pandas_compat.py:752: FutureWarning: .labels was deprecated in version 0.24.0. Use .codes instead.
labels, = index.labels
%% Cell type:code id: tags:
``` python
# Build a labelled dataset: every capture after the first is compared with
# the current baseline capture and flagged as changed when the overall
# similarity falls below the threshold.
changeThreshold = 0.80  # overall similarity below this counts as a change
lastSavedDataIndex = 0  # index of the capture currently used as baseline
# Guard against an empty archive so the cell yields an empty frame instead
# of failing on archiveData[0].
basePayload = archiveData[0]['payload'] if archiveData else None
dataset = []
for i in range(1, len(archiveData)):
    currentPayload = archiveData[i]['payload']
    overallSimilarity = similarity(basePayload, currentPayload)
    styleSimilarity = style_similarity(basePayload, currentPayload)
    structuralSimilarity = structural_similarity(basePayload, currentPayload)
    hasContentChanged = 0
    if overallSimilarity < changeThreshold:
        hasContentChanged = 1
        # NOTE(review): the original indentation was lost; the baseline is
        # assumed to advance only when a change is detected -- confirm.
        lastSavedDataIndex = i
        basePayload = currentPayload
    dataset.append(
        [overallSimilarity, styleSimilarity, structuralSimilarity, hasContentChanged]
    )
# The third column holds the structural score; it was previously labelled
# 'styleSimilarity' a second time, producing duplicate column names.
df = pd.DataFrame(
    dataset,
    columns=['similarity', 'styleSimilarity', 'structuralSimilarity', 'changed'],
)
```
%% Cell type:code id: tags:
``` python
# Display the similarity/changed table built from `dataset`.
df
```
%% Output
similarity styleSimilarity styleSimilarity changed
0 1.000000 1.000000 1.000000 0
1 1.000000 1.000000 1.000000 0
2 0.381462 0.242424 0.520499 1
3 0.383244 0.242424 0.524064 1
%% Cell type:code id: tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment