# Skip to content
# Snippets Groups Projects
# Forked from xw0078 / CS-6604-WebArchive
# 15 commits behind the upstream repository.
# main.py 1.94 KiB
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import os as os
from Trie import Trie
from urllib.parse import urlparse

# making data
# Load every parquet file found one level below "vt.edu/" and keep, per file,
# only the rows whose column 3 is 'text/html' and whose column 4 is the
# string '200' (presumably MIME type and HTTP status -- confirm against the
# parquet schema). Each file contributes one array of kept rows to `data`.
#
# os.listdir() returns entries in arbitrary order; sorting makes the order of
# `data` (and therefore any positional slice taken from it) reproducible.
listOfFolder = sorted(os.listdir("vt.edu/"))
data = []
for folder_name in listOfFolder:
    folder_path = os.path.join("vt.edu", folder_name)
    # Stray files next to the folders (e.g. macOS '.DS_Store') would make the
    # nested os.listdir() raise, so only descend into real directories.
    if folder_name.endswith('.DS_Store') or not os.path.isdir(folder_path):
        continue
    for f_name in sorted(os.listdir(folder_path)):
        if not f_name.endswith('.parquet'):
            continue
        rows = pd.read_parquet(os.path.join(folder_path, f_name)).to_numpy()
        if len(rows) == 0:
            continue
        kept = [
            row for row in rows
            if row[3] == 'text/html' and row[4] == '200'
        ]
        # A file with no matching row still contributes an (empty) array.
        data.append(np.asarray(kept))

# The per-file arrays have different row counts (and shape (0,) when nothing
# matched), so np.asarray(data) would have to build a ragged array, which
# NumPy >= 1.24 rejects with ValueError. Fill a 1-D object array element by
# element instead; each element is one file's 2-D array of kept rows.
data_array = np.empty(len(data), dtype=object)
for idx, file_rows in enumerate(data):
    data_array[idx] = file_rows

# Number of leading entries of data_array used to build the sitemap.
threshold = [100]

# making sitemap
# Build one Trie per hostname: every kept row of the first threshold[0]
# entries of data_array is inserted under the path of its URL (column 2),
# together with its timestamp (column 1) and payload (column 9).
data_train = data_array[0:threshold[0]]
sitemapURLS = {}

for dayData in data_train:
    for record in np.asarray(dayData):
        # Parse first, then drop the file extension from the path only.
        # Running splitext on the whole URL would treat ".edu" in
        # "http://www.vt.edu" as an extension (hostname becomes "www.vt")
        # and would cut at dots that appear inside a query or fragment.
        parsedurl = urlparse(record[2])
        path = os.path.splitext(parsedurl.path)[0]
        if parsedurl.hostname not in sitemapURLS:
            sitemapURLS[parsedurl.hostname] = Trie()
        sitemap = sitemapURLS[parsedurl.hostname]
        timestamp = record[1]
        payload = record[9]
        sitemap.insert(path, timestamp, payload)

# Compare the 'www.vt.edu' sitemap against an extract of itself and against
# its own root, printing each comparison result.
# NOTE(review): the two bounds look like 14-digit YYYYMMDDhhmmss capture
# timestamps and extract() presumably returns the part of the trie inside
# that window -- confirm against Trie.extract / Trie.comparison.
window_start = '20140906125541'
window_end = '20141215204723'

vtTree = sitemapURLS['www.vt.edu']  # raises KeyError if that host was never seen
vtTreeCopy = vtTree.extract(window_start, window_end)

# 1) full tree vs. the extracted window
result = vtTree.comparison(vtTreeCopy)
print(result)

# 2) full tree vs. its own root (read only after the first comparison ran)
result = vtTree.comparison(vtTree.root)
print(result)

print('done')



        # if not(sitemapURLS.__contains__(parsedurl.path)):
        #     sitemapURLS[parsedurl.path] = parsedurl[1]+parsedurl[2]
        #     sitemap.insert(parsedurl.path, timestamp, payload)