Forked from
xw0078 / CS-6604-WebArchive
15 commits behind the upstream repository.
-
Ritesh Bansal authored
main.py 1.94 KiB
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import os as os
from Trie import Trie
from urllib.parse import urlparse
# making data
# Walk every sub-directory of "vt.edu/" and load each .parquet file found
# directly inside it. From every non-empty file keep only the rows whose
# column 3 equals 'text/html' and whose column 4 equals the string '200'
# (presumably MIME type and HTTP status -- confirm against the parquet schema).
archive_root = "vt.edu/"
data = []
# os.listdir() returns entries in arbitrary order; sort so that the
# threshold slice taken from data_array always selects the same files.
for folder_name in sorted(os.listdir(archive_root)):
    address = archive_root + folder_name + "/"
    # Skip stray files such as .DS_Store: only directories can be listed.
    if folder_name.endswith('.DS_Store') or not os.path.isdir(address):
        continue
    for f_name in sorted(os.listdir(address)):
        if not f_name.endswith('.parquet'):
            continue
        dateFiles = pd.read_parquet(address + f_name).to_numpy()
        if len(dateFiles) == 0:
            continue
        kept_rows = [
            row for row in dateFiles
            if row[3] == 'text/html' and row[4] == '200'
        ]
        # A file whose rows are all filtered out still contributes an
        # (empty) entry, so there is one entry per non-empty parquet file.
        data.append(np.asarray(kept_rows))
# The per-file arrays have different row counts. np.asarray(data) would try
# to build a ragged array (a ValueError on NumPy >= 1.24) or, if all shapes
# happened to match, a 3-D array. Fill a 1-D object array explicitly so each
# element is always one file's row array.
data_array = np.empty(len(data), dtype=object)
for file_index, file_rows in enumerate(data):
    data_array[file_index] = file_rows
# Number of leading data_array entries used to build the sitemap.
threshold = [100]
# making sitemap
# Build one Trie per hostname from the first threshold[0] entries of
# data_array, then compare the 'www.vt.edu' tree against an extract of
# itself and against its own root, printing both comparison results.
data_train = data_array[0:threshold[0]]
sitemapURLS = {}
for dayData in data_train:
    dayDataNP = np.asarray(dayData)
    for row in dayDataNP:
        # Row layout as used here: column 1 -> timestamp, column 2 -> URL,
        # column 9 -> payload.
        # Parse the URL first and strip the file extension from the path
        # only. Running os.path.splitext on the whole URL would treat the
        # TLD of a path-less URL ("http://www.vt.edu") as an extension and
        # file the entry under the wrong hostname ("www.vt").
        parsedurl = urlparse(row[2])
        url_path = os.path.splitext(parsedurl.path)[0]
        hostname = parsedurl.hostname
        if hostname not in sitemapURLS:
            sitemapURLS[hostname] = Trie()
        sitemap = sitemapURLS[hostname]
        timestamp = row[1]
        payload = row[9]
        sitemap.insert(url_path, timestamp, payload)
# Raises KeyError if no processed row belonged to www.vt.edu.
vtTree = sitemapURLS['www.vt.edu']
# The two arguments look like 14-digit capture timestamps bounding the
# extract -- TODO confirm against Trie.extract.
vtTreeCopy = vtTree.extract('20140906125541','20141215204723')
result = vtTree.comparison(vtTreeCopy)
print(result)
result = vtTree.comparison(vtTree.root)
print(result)
print('done')