Skip to content
Snippets Groups Projects
Commit fdc41e00 authored by Cole Walton's avatar Cole Walton
Browse files

Fixed the query system of WET files

parent 894ad47a
No related branches found
No related tags found
No related merge requests found
{
"folders": [
{
"path": ".."
},
{
"name": "sentiment-analysis-ai",
"path": "."
}
],
"settings": {}
}
\ No newline at end of file
import requests
from warcio import ArchiveIterator
import sys
# Force UTF-8 on stdout so the extracted page text can be printed even when
# the console's default encoding is not UTF-8.
sys.stdout.reconfigure(encoding='utf-8')

# TODO: add a web-scraping step that looks up the WARC/WET URLs for each of
# the different news and media sites instead of hard-coding a single file.
wet_url = 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-23/segments/1685224649986.95/wet/CC-MAIN-20230604125132-20230604155132-00544.warc.wet.gz'

# Records whose Content-Length is at or above this many bytes are skipped.
MAX_CONTENT_LENGTH = 5000

# stream=True lets ArchiveIterator read the gzip body incrementally instead of
# downloading the whole file first; the `with` block closes the connection
# when the script is done (or fails).
with requests.get(wet_url, stream=True) as r:
    # Fail early on an HTTP error rather than trying to parse an error page.
    r.raise_for_status()
    records = ArchiveIterator(r.raw)

    # The first record is expected to be the file-level 'warcinfo' record.
    record = next(records)
    assert record.rec_type == 'warcinfo'
    text = record.content_stream().read()
    print(text.decode('utf-8', errors='ignore'))

    # The for loop already advances the iterator; calling next() again inside
    # the body would skip every other record and could raise StopIteration.
    for record in records:
        headers = record.rec_headers
        try:
            # Header values are strings; compare numerically, not
            # lexicographically ('10000' < '5000' is True for strings).
            content_length = int(headers.get_header('Content-Length'))
        except (TypeError, ValueError):
            # Missing or malformed Content-Length: the size is unknown, skip.
            continue
        language = headers.get_header('WARC-Identified-Content-Language')
        if content_length < MAX_CONTENT_LENGTH and language == 'eng':
            text = record.content_stream().read()
            print(text.decode('utf-8', errors='ignore'))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment