Skip to content
Snippets Groups Projects
Commit 9460fc5e authored by xw0078's avatar xw0078
Browse files

archiveOrgDownload

parent ae86ddb9
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# Get download links from the Wayback Machine and download the data
%% Cell type:markdown id: tags:
## List items in the collection
%% Cell type:code id: tags:
``` python
# Notebook cell: (re)load the downloader module and scrape the download links
# of one archive.org collection into a text file.
import importlib  # `imp` is deprecated and was removed in Python 3.12

import waybackcollectiondownloader

# Reload BEFORE binding the class name: a `from ... import` executed ahead of
# the reload keeps pointing at the stale, pre-reload class object, so edits
# made to the module would not be picked up.
importlib.reload(waybackcollectiondownloader)
from waybackcollectiondownloader import WaybackCollectionDownloader

# Collection page to scrape. This value matches the URL echoed in the recorded
# cell output ("End at page: https://archive.org/details/twitterarchive?...").
collectionUrl = "https://archive.org/details/twitterarchive"

downloader = WaybackCollectionDownloader(collectionUrl)
downloader.ScrapeDownloadLinks("/home/xw0078/data/WaybackDownload/twitterarchiveLinks.txt")
```
%% Output
Log In Successful
End at page:
https://archive.org/details/twitterarchive?&sort=-publicdate&page=9
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
import requests
from bs4 import BeautifulSoup as bs
import re
import sys
import os
import datetime
import errno
class WaybackCollectionDownloader:
    """Collect WARC/CDX download links for every item of an archive.org collection.

    The downloader logs in to archive.org, walks the paginated collection
    listing (sorted by ``-publicdate``), opens the download page of each item
    and appends the WARC and CDX links it finds to a text file, one link per
    line.

    Typical use::

        downloader = WaybackCollectionDownloader(collection_url)
        downloader.ScrapeDownloadLinks("/path/to/links.txt")
    """

    # URL of the collection page, and the same URL with the sort/page query
    # prefix appended (the page number is concatenated at request time).
    collectionUrl = ""
    collectionUrlSorted = ""
    # SECURITY NOTE(review): credentials are hard-coded in source control.
    # They are kept only so existing callers keep working; rotate this
    # password and pass ``username=`` / ``password=`` to the constructor.
    credential = {
        'username': 'xw0078@vt.edu',
        'password': 'arc!sh007'
    }
    loginLink = "https://archive.org/account/login"
    # Default output location; replaced by the path given to
    # generateDownloadLinkFile / ScrapeDownloadLinks.
    outputFile = "/home/xw0078/data/WaybackDownload/"
    # Not referenced by any method of this class.
    downloadLinkPrefix = "https://archive.org/download/"

    def __init__(self, collectionUrl, **kwargs):
        """Remember the collection URL and prepare the paginated listing URL.

        Args:
            collectionUrl: URL of the archive.org collection page.
            **kwargs: optional ``username`` and ``password`` overriding the
                class-level default credentials. Other keys are ignored.
        """
        self.collectionUrl = collectionUrl
        self.collectionUrlSorted = collectionUrl + "?&sort=-publicdate&page="
        # Per-instance copy so overrides never mutate the shared class dict.
        self.credential = dict(self.credential)
        for key in ('username', 'password'):
            if kwargs.get(key) is not None:
                self.credential[key] = kwargs[key]

    def generateDownloadLinkFile(self, filePath):
        """Create an empty link file at *filePath*, backing up an existing one.

        An existing file is renamed to ``<stem><timestamp><extension>`` next
        to the original before the new, empty file is created. Missing parent
        directories are created.
        """
        self.outputFile = filePath
        if os.path.isfile(self.outputFile):
            tsStr = datetime.datetime.now().strftime("%d-%b-%Y (%H:%M:%S)")
            # Insert the timestamp in front of the real extension only.
            # str.replace(".txt", ...) would also rewrite ".txt" inside
            # directory names and do nothing for other extensions, in which
            # case the old file would be silently truncated below.
            stem, extension = os.path.splitext(self.outputFile)
            os.rename(self.outputFile, stem + tsStr + extension)
        self.createInputDir(self.outputFile)
        with open(self.outputFile, "w"):
            pass  # create / truncate only

    def loginValidation(self, session):
        """Return True when the login page fetched with *session* contains "Log out"."""
        page = session.get(self.loginLink)
        return "Log out" in page.text

    def collectionPageContenValidation(self, pageText):
        """Return True unless *pageText* contains the "no results" marker."""
        return "No results matched your criteria" not in pageText

    def ScrapeDownloadLinks(self, filePath):
        """Log in, walk every collection page and write all links to *filePath*.

        Iteration stops at the first page that reports no results; that page's
        URL is printed.

        Raises:
            SystemExit: when the login cannot be validated.
        """
        self.generateDownloadLinkFile(filePath)
        with requests.Session() as s:
            # Login
            # NOTE(review): the GET before the POST presumably primes the
            # session cookies expected by the login form - confirm.
            s.get(self.loginLink)
            s.post(self.loginLink, data=self.credential)
            if not self.loginValidation(s):
                print("Bad Login")
                # The original referenced ``sys.exit`` without calling it, so
                # execution fell through and reported a successful login.
                sys.exit(1)
            print("Log In Successful")
            # iterate collection page to get all collection content
            pageNumber = 1
            while True:
                currentPageLink = self.collectionUrlSorted + str(pageNumber)
                currentPage = s.get(currentPageLink)
                if not self.collectionPageContenValidation(currentPage.text):
                    print("End at page:")
                    print(currentPageLink)
                    return
                self.parseWaybackCollectionItems(currentPage, s)
                pageNumber += 1

    def parseWaybackCollectionItems(self, page, session):
        """Visit the download page of every item linked from a collection page.

        Args:
            page: response of one collection listing page (``.text`` is parsed).
            session: logged-in session used to fetch each item's download page.
        """
        soup = bs(page.text, 'html.parser')
        collectionList = soup.find(class_='results')
        if collectionList is None:
            # No results container on this page: nothing to visit.
            return
        collectionList_items = collectionList.find_all(
            'a',
            href=re.compile('/details/.*'),
            class_=lambda x: x != 'stealth',
        )
        for item in collectionList_items:
            itemLink = "https://archive.org" + item.get('href')
            # Swap only the path segment; a bare replace("details", ...) would
            # also rewrite item identifiers that contain the word "details".
            downloadPageLink = itemLink.replace("/details/", "/download/", 1)
            itemPage = session.get(downloadPageLink)
            self.parseWaybackItemDownloadPage(itemPage, downloadPageLink, session)

    def parseWaybackItemDownloadPage(self, page, parentLink, session):
        """Append the WARC and CDX links of one item download page to the link file.

        Links are written pairwise: each WARC link is followed by the CDX link
        at the same position in the listing.

        Args:
            page: response of the item's download page.
            parentLink: URL of that page; file hrefs are appended to it.
            session: unused here, kept for interface compatibility.

        Raises:
            SystemExit: when the number of WARC and CDX links differs.
        """
        soup = bs(page.text, 'html.parser')
        downloadList = soup.find(class_='directory-listing-table')
        if downloadList is None:
            # Without a listing table there is nothing to record; skip this
            # item instead of aborting the whole run with an AttributeError.
            print("No directory listing found at: " + parentLink)
            return
        # Dots are escaped so that only literal ".gz" style suffixes match.
        warcItems = downloadList.find_all('a', href=re.compile(r'warc\.gz'))
        cdxItems = downloadList.find_all('a', href=re.compile(r'os\.cdx\.gz'))
        if len(warcItems) != len(cdxItems):
            # The counts are ints: format them instead of concatenating
            # str + int, which raised TypeError before the message printed.
            print("WARC CDX item number not matching: WARC {} cdx {}".format(
                len(warcItems), len(cdxItems)))
            sys.exit(1)
        for warc, cdx in zip(warcItems, cdxItems):
            self.appendLineToFile(parentLink + "/" + warc.get('href'))
            self.appendLineToFile(parentLink + "/" + cdx.get('href'))

    def appendLineToFile(self, input):
        """Append *input* followed by ``\\r\\n`` to the current output file."""
        with open(self.outputFile, "a") as f:
            f.write(input)
            f.write("\r\n")

    def createInputDir(self, input):
        """Create the parent directory of the path *input* if it is missing."""
        parentDir = os.path.dirname(input)
        # A bare file name has no directory part; os.makedirs("") would raise.
        if parentDir:
            os.makedirs(parentDir, exist_ok=True)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment