import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import os as os
from Trie import Trie
from urllib.parse import urlparse

# ---------------------------------------------------------------------------
# Data loading: walk the crawl snapshots under vt.edu/<entry>/ and, for each
# parquet file, keep only rows that are successful HTML responses
# (column 3 == 'text/html' and column 4 == '200' — TODO confirm the parquet
# schema; column meanings are inferred from the indices used here).
# ---------------------------------------------------------------------------
ROOT = "vt.edu/"

data = []
for entry in os.listdir(ROOT):
    # Skip macOS Finder metadata entries.
    if entry.endswith('.DS_Store'):
        continue
    day_path = os.path.join(ROOT, entry)
    for f_name in os.listdir(day_path):
        if not f_name.endswith('.parquet'):
            continue
        rows = pd.read_parquet(os.path.join(day_path, f_name)).to_numpy()
        if len(rows) == 0:
            continue
        kept = [row for row in rows
                if row[3] == 'text/html' and row[4] == '200']
        data.append(np.asarray(kept))

# The per-file arrays have differing row counts, so the outer container is
# ragged: plain np.asarray(...) on such a list raises ValueError on
# NumPy >= 1.24, so the object dtype must be explicit.
data_array = np.asarray(data, dtype=object)

threshold = [100]  # number of leading day-arrays used as the training slice

# ---------------------------------------------------------------------------
# Sitemap construction: build one Trie per hostname and insert every crawled
# page path into its host's trie together with the row's timestamp and
# payload value (columns 1 and 9 — TODO confirm against the parquet schema).
# ---------------------------------------------------------------------------
data_train = data_array[0:threshold[0]]
sitemapURLS = {}  # hostname -> Trie of URL paths seen for that host

for dayData in data_train:
    for row in np.asarray(dayData):
        # Strip the file extension before parsing so e.g. /page.html and
        # /page map to the same trie path.
        parsedurl = urlparse(os.path.splitext(row[2])[0])
        if parsedurl.hostname not in sitemapURLS:
            sitemapURLS[parsedurl.hostname] = Trie()
        sitemap = sitemapURLS[parsedurl.hostname]
        timestamp = row[1]
        payload = row[9]
        sitemap.insert(parsedurl.path, timestamp, payload)