"""Build per-domain sitemap tries from crawled vt.edu parquet snapshots.

Reads every ``*.parquet`` file under ``vt.edu/<day>/``, keeps only rows whose
MIME type is ``text/html`` and whose HTTP status is the string ``'200'``,
inserts each URL path into a per-hostname ``Trie``, then runs the
extract/comparison experiments on the ``www.vt.edu`` tree.

Positional row layout assumed from the parquet dump (TODO confirm against the
crawler's schema): [1] timestamp, [2] url, [3] mime type, [4] http status,
[9] payload.
"""

import pyarrow.parquet as pq  # kept from original; pandas' parquet engine -- presumably required at runtime
import pandas as pd
import numpy as np
import os
from Trie import Trie
from urllib.parse import urlparse

ROOT = "vt.edu"

# ---- load & filter the crawl data -----------------------------------------
data = []
for folder in os.listdir(ROOT):
    if folder.endswith('.DS_Store'):
        continue
    day_dir = os.path.join(ROOT, folder)
    for f_name in os.listdir(day_dir):
        if not f_name.endswith('.parquet'):
            continue
        rows = pd.read_parquet(os.path.join(day_dir, f_name)).to_numpy()
        if len(rows) == 0:
            continue
        # Keep only successfully fetched HTML pages (status is a string here).
        kept = [row for row in rows if row[3] == 'text/html' and row[4] == '200']
        data.append(np.asarray(kept))

# NOTE(review): per-day arrays have different lengths, so this yields a ragged
# object array -- kept to match the original behaviour.
data_array = np.asarray(data)

# Number of days to train on; originally a tunable cutoff (e.g. [100]),
# now the full data set. Kept as a one-element list for compatibility with
# the original shape of the variable.
threshold = [len(data_array)]

# ---- build one sitemap trie per hostname ----------------------------------
data_train = data_array[0:threshold[0]]

sitemapdomains = {}  # hostname -> Trie

for dayData in data_train:
    for row in np.asarray(dayData):
        # Strip the file extension before parsing so /a.html and /a collapse
        # to the same trie path.
        parsedurl = urlparse(os.path.splitext(row[2])[0])
        host = parsedurl.hostname
        if host not in sitemapdomains:
            trie = Trie()
            trie.root.name = host
            trie.matrixElements[host] = 0
            sitemapdomains[host] = trie
        sitemap = sitemapdomains[host]
        timestamp = row[1]
        payload = row[9]
        isnewpath, newnodepath = sitemap.insert(parsedurl.path, timestamp, payload)
        if isnewpath:
            # Log every path seen for the first time.
            print(newnodepath)

# ---- experiments on the main www.vt.edu tree ------------------------------
vtTree = sitemapdomains['www.vt.edu']

# Extract the subtree observed in this timestamp window, then compare it
# back against the full tree; comparing the tree with itself is a sanity check.
vtTreeCopy = vtTree.extract('20140906125541', '20141215204723')

result = vtTree.comparison(vtTreeCopy.root)
print(result)

result = vtTree.comparison(vtTree.root)
print(result)

matrix = np.asarray(vtTree.ancestorMatrix())
print('done')