import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import os
from Trie import Trie
from urllib.parse import urlparse

# --- Build the per-day dataset from the crawl archive -----------------------
# Each subfolder of vt.edu/ holds .parquet files of capture records.
# Column layout inferred from usage (TODO: confirm against producer):
#   col 1 = timestamp, col 2 = URL, col 3 = MIME type,
#   col 4 = HTTP status (as string), col 9 = payload.
listOfFolder = os.listdir("vt.edu/")
data = []
for folder in listOfFolder:
    # Skip macOS Finder metadata entries.
    if folder.endswith('.DS_Store'):
        continue
    address = "vt.edu/" + folder + "/"
    for f_name in os.listdir(address):
        if not f_name.endswith('.parquet'):
            continue
        dateFiles = pd.read_parquet(address + f_name).to_numpy()
        if len(dateFiles) == 0:
            continue
        # Keep only successful HTML captures for this day.
        kept = [row for row in dateFiles
                if row[3] == 'text/html' and row[4] == '200']
        data.append(np.asarray(kept))

# Days have differing row counts, so this is a ragged collection; dtype=object
# is required — modern NumPy raises on an implicit ragged asarray().
data_array = np.asarray(data, dtype=object)

# --- Build one sitemap Trie per hostname from the first N days --------------
threshold = [100]  # number of training days (kept as a list for compatibility)
data_train = data_array[0:threshold[0]]

# hostname -> Trie of URL paths seen for that host
sitemapURLS = {}
for dayData in data_train:
    for record in np.asarray(dayData):
        # Strip the file extension before parsing so /a/b.html and /a/b
        # land on the same trie path.
        parsedurl = urlparse(os.path.splitext(record[2])[0])
        if parsedurl.hostname not in sitemapURLS:
            sitemapURLS[parsedurl.hostname] = Trie()
        timestamp = record[1]
        payload = record[9]
        sitemapURLS[parsedurl.hostname].insert(parsedurl.path, timestamp, payload)