Skip to content
Snippets Groups Projects
Commit b749a038 authored by Ritesh Bansal's avatar Ritesh Bansal
Browse files

added logic to work with news dataset

parent db376411
No related branches found
No related tags found
No related merge requests found
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import os as os
from Trie import Trie
from urllib.parse import urlparse
# making data
# Walks folderNews/<crawl folder>/<file>.parquet and collects, for each
# parquet file, the rows that pass the content-type/status filter below.
folderNews = "CNN_focuscrawls/"
# os.listdir returns entries in arbitrary order; sort so snapshots are always
# processed in the same (name) order from run to run.
listOfFolder = sorted(os.listdir(folderNews))
data = []
for i in listOfFolder:
    address = os.path.join(folderNews, i)
    # Only descend into directories. This also guards against stray files
    # (e.g. macOS '.DS_Store') that would make the inner listdir fail.
    if i.endswith('.DS_Store') or not os.path.isdir(address):
        continue
    for f_name in sorted(os.listdir(address)):
        if not f_name.endswith('.parquet'):
            continue
        addressPar = os.path.join(address, f_name)
        dateFiles = pd.read_parquet(addressPar).to_numpy()
        if len(dateFiles) == 0:
            continue
        # Keep rows whose column 4 is 'text/html' and column 5 is '200'
        # (presumably MIME type and HTTP status -- TODO confirm against the
        # parquet schema; note the status is compared as a string).
        zz_new = np.asarray(
            [row for row in dateFiles
             if row[4] == 'text/html' and row[5] == '200']
        )
        data.append(zz_new)
# Each file contributes a different number of rows, so the collection is
# ragged. np.asarray(data) raises ValueError for ragged input on
# NumPy >= 1.24, so build a 1-D object array and fill it slot by slot; each
# element is the per-file 2-D array, which is what the later loops expect.
data_array = np.empty(len(data), dtype=object)
for idx, snapshot in enumerate(data):
    data_array[idx] = snapshot
# Number of snapshots used to build the sitemap; currently all of them
# (a smaller fixed value such as [100] can be used for quick experiments).
threshold = [len(data_array)]
# making sitemap tree
data_train = data_array[0:threshold[0]]
# unique domains: one Trie per hostname, keyed by that hostname
sitemapdomains = {}
for dayData in data_train:
    dayDataNP = np.asarray(dayData)
    for row in dayDataNP:
        # Column 3 holds the URL. Parse it BEFORE stripping the file
        # extension: running os.path.splitext on the whole URL treats the
        # last dot after the last '/' as an extension, so a path-less URL
        # like 'http://edition.cnn.com' would lose '.com' from its hostname,
        # and a dot inside the query string would hide the real extension.
        parsedurl = urlparse(row[3])
        hostname = parsedurl.hostname
        urlpath = os.path.splitext(parsedurl.path)[0]
        if hostname not in sitemapdomains:
            # First time this host is seen: create its Trie and initialise
            # the root name and the root's matrixElements entry.
            sitemapdomains[hostname] = Trie()
            sitemapdomains[hostname].root.name = hostname
            sitemapdomains[hostname].matrixElements[hostname] = 0
        sitemap = sitemapdomains[hostname]
        timestamp = row[2]
        payload = row[13]
        # insert returns (is this a new path?, the path of the new node).
        isnewpath, newnodepath = sitemap.insert(urlpath, timestamp, payload)
        if isnewpath:
            print(newnodepath)
# Compare the edition.cnn.com sitemap against an extracted trie and against
# itself, then build its ancestor matrix.
# Raises KeyError if no processed URL had the hostname 'edition.cnn.com'.
edition_cnn_com = sitemapdomains['edition.cnn.com']
# extract() returns an object exposing a .root node (presumably a copy or
# sub-trie of this sitemap -- TODO confirm in Trie.extract).
edition_cnn_com_Copy = edition_cnn_com.extract()
# Compare the sitemap with the extracted trie's root and print the result.
result = edition_cnn_com.comparison(edition_cnn_com_Copy.root)
print(result)
# Compare the sitemap with its own root (presumably a sanity baseline for
# the comparison above -- TODO confirm expected output).
result = edition_cnn_com.comparison(edition_cnn_com.root)
print(result)
# Convert whatever ancestorMatrix() returns into a NumPy array.
matrix = edition_cnn_com.ancestorMatrix()
matrix = np.asarray(matrix)
print('done')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment