Fixed the query system of WET files

fdc41e00 · Cole Walton · 894ad47a · fdc41e00 · fdc41e00
Commit fdc41e00 authored 1 year ago by Cole Walton
--- a/4664.code-workspace
+++ b/4664.code-workspace
+{
+	"folders": [
+		{
+			"path": ".."
+		},
+		{
+			"name": "sentiment-analysis-ai",
+			"path": "."
+		}
+	],
+	"settings": {}
+}
\ No newline at end of file
--- a/webscrape.py
+++ b/webscrape.py
+import requests
+from warcio import ArchiveIterator
+import sys
+sys.stdout.reconfigure(encoding='utf-8')
+## Idea: add a web-scraping step that queries the WARC URLs for each of the different news and media sites.
+
+wet_url = 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-23/segments/1685224649986.95/wet/CC-MAIN-20230604125132-20230604155132-00544.warc.wet.gz'
+r = requests.get(wet_url, stream = True)
+records = ArchiveIterator(r.raw)
+
+record = next(records)
+assert record.rec_type == 'warcinfo'
+text = record.content_stream().read()
+print(text.decode('utf-8', errors='ignore'))
+for record in records:
+    record = next(records)
+    if((record.rec_headers.get_header('Content-Length') < '5000') and record.rec_headers.get_header('WARC-Identified-Content-Language') == 'eng'):
+        text = record.content_stream().read()
+        print(text.decode('utf-8', errors='ignore')) 
+