Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • bornahokie/sentiment-analysis-ai
1 result
Show changes
Commits on Source (2)
{
"folders": [
{
"path": ".."
},
{
"name": "sentiment-analysis-ai",
"path": "."
}
],
"settings": {}
}
\ No newline at end of file
import requests
from warcio import ArchiveIterator
import sys
# TODO: could add a web-scraping step that queries all of the WARC URLs for
# each of the different news and media sites instead of one hard-coded file.
wet_url = 'https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-23/segments/1685224649986.95/wet/CC-MAIN-20230604125132-20230604155132-00544.warc.wet.gz'

# Records whose Content-Length (in bytes) is at or above this are skipped.
MAX_CONTENT_LENGTH = 5000
# Value required in the WARC-Identified-Content-Language header.
TARGET_LANGUAGE = 'eng'
# Seconds to wait for the server before giving up on the request.
REQUEST_TIMEOUT = 60


def decode_record(record):
    """Return the payload of *record* decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped rather than raising.
    """
    return record.content_stream().read().decode('utf-8', errors='ignore')


def is_short_english_record(record, max_length=MAX_CONTENT_LENGTH,
                            language=TARGET_LANGUAGE):
    """Return True if *record* is tagged as *language* and is short enough.

    Args:
        record: An object exposing ``rec_headers.get_header(name)``.
        max_length: Exclusive upper bound on the Content-Length header.
        language: Exact value required in the
            WARC-Identified-Content-Language header.

    Returns:
        True when the language header equals *language* and the numeric
        Content-Length is strictly below *max_length*; False otherwise,
        including when Content-Length is missing or not an integer.
    """
    headers = record.rec_headers
    # Exact match is kept from the original script, so a multi-language
    # value such as 'eng,fra' does not qualify.
    if headers.get_header('WARC-Identified-Content-Language') != language:
        return False
    try:
        # Header values are strings; compare numerically, because a string
        # comparison would order '10000' before '5000'.
        return int(headers.get_header('Content-Length')) < max_length
    except (TypeError, ValueError):
        return False


def iter_short_english_texts(records, max_length=MAX_CONTENT_LENGTH,
                             language=TARGET_LANGUAGE):
    """Yield the decoded text of every matching record in *records*.

    Each record is visited exactly once, and its payload is read before the
    iterator is advanced to the next record.
    """
    for record in records:
        if is_short_english_record(record, max_length, language):
            yield decode_record(record)


def main():
    """Download the WET file and print its header record and short English records.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the archive does not begin with a 'warcinfo' record.
    """
    sys.stdout.reconfigure(encoding='utf-8')
    # The context manager releases the streamed connection even on failure.
    with requests.get(wet_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        records = ArchiveIterator(response.raw)

        first_record = next(records, None)
        if first_record is None or first_record.rec_type != 'warcinfo':
            raise ValueError("expected the archive to start with a 'warcinfo' record")
        print(decode_record(first_record))

        for text in iter_short_english_texts(records):
            print(text)


if __name__ == '__main__':
    main()