archiveOrgDownload

9460fc5e · xw0078 · ae86ddb9 · 9460fc5e · 9460fc5e
Commit 9460fc5e authored 5 years ago by xw0078
--- a/getArchiveOrgCollection/GetArchiveOrgDownloadLinks.ipynb
+++ b/getArchiveOrgCollection/GetArchiveOrgDownloadLinks.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Get download link from Wayback Machine and Download Data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## List items in the collection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Log In Successful\n",
+      "End at page:\n",
+      "https://archive.org/details/twitterarchive?&sort=-publicdate&page=9\n"
+     ]
+    }
+   ],
+   "source": [
+    "import imp\n",
+    "import waybackcollectiondownloader\n",
+    "from waybackcollectiondownloader import WaybackCollectionDownloader\n",
+    "imp.reload(waybackcollectiondownloader)\n",
+    "\n",
+    "downloader = WaybackCollectionDownloader(collectionUrl)\n",
+    "downloader.ScrapeDownloadLinks(\"/home/xw0078/data/WaybackDownload/twitterarchiveLinks.txt\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
+%% Cell type:markdown id: tags:
+
+# Get download link from Wayback Machine and Download Data
+
+%% Cell type:markdown id: tags:
+
+## List items in the collection
+
+%% Cell type:code id: tags:
+
+``` python
+import imp
+import waybackcollectiondownloader
+from waybackcollectiondownloader import WaybackCollectionDownloader
+imp.reload(waybackcollectiondownloader)
+
+downloader = WaybackCollectionDownloader(collectionUrl)
+downloader.ScrapeDownloadLinks("/home/xw0078/data/WaybackDownload/twitterarchiveLinks.txt")
+```
+
+%% Output
+
+    Log In Successful
+    End at page:
+    https://archive.org/details/twitterarchive?&sort=-publicdate&page=9
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
--- a/getArchiveOrgCollection/waybackcollectiondownloader.py
+++ b/getArchiveOrgCollection/waybackcollectiondownloader.py
+import requests
+from bs4 import BeautifulSoup as bs
+import re
+import sys
+import os
+import datetime
+import errno
+
+class WaybackCollectionDownloader:
+    """Collect WARC/CDX download links for the items of an archive.org collection.
+
+    The downloader logs in to archive.org, walks the collection listing page
+    by page (sorted by ``-publicdate``), opens the ``/download/`` page of every
+    listed item and appends the links of its ``.warc.gz`` and ``.os.cdx.gz``
+    files to a text file, one link per line.
+
+    Credentials are deliberately NOT stored in the source. They are taken, in
+    this order, from the ``username`` / ``password`` keyword arguments of the
+    constructor, from the ``ARCHIVE_ORG_USERNAME`` / ``ARCHIVE_ORG_PASSWORD``
+    environment variables, or from the class-level ``credential`` dict.
+    """
+
+    collectionUrl = ""
+    collectionUrlSorted = ""
+    # Placeholder only: real values are resolved per instance in __init__.
+    credential = {
+        'username': '',
+        'password': ''
+    }
+    loginLink = "https://archive.org/account/login"
+    outputFile = "/home/xw0078/data/WaybackDownload/"
+    downloadLinkPrefix = "https://archive.org/download/"
+
+    # Compiled once; the dots are escaped so they only match literal dots.
+    _warcHrefPattern = re.compile(r'\.warc\.gz')
+    _cdxHrefPattern = re.compile(r'\.os\.cdx\.gz')
+
+    def __init__(self,collectionUrl,**kwargs):
+        """Remember the collection URL and resolve the login credentials.
+
+        Args:
+            collectionUrl: URL of the collection page, e.g.
+                ``https://archive.org/details/<collection>``.
+            **kwargs: optional ``username`` and ``password`` overriding the
+                environment variables described on the class.
+        """
+        self.collectionUrl = collectionUrl
+        self.collectionUrlSorted = collectionUrl+"?&sort=-publicdate&page="
+        classDefaults = type(self).credential
+        # Build a per-instance dict so instances never share mutable state.
+        self.credential = {
+            'username': (kwargs.get('username')
+                         or os.environ.get('ARCHIVE_ORG_USERNAME')
+                         or classDefaults['username']),
+            'password': (kwargs.get('password')
+                         or os.environ.get('ARCHIVE_ORG_PASSWORD')
+                         or classDefaults['password']),
+        }
+
+    def generateDownloadLinkFile(self,filePath):
+        """Create an empty output file at ``filePath``.
+
+        An existing file is kept as a backup: it is renamed with a timestamp
+        inserted in front of its extension before the new file is created.
+        Missing parent directories are created.
+        """
+        self.outputFile = filePath
+        if os.path.isfile(self.outputFile):
+            tsStr = datetime.datetime.now().strftime("%d-%b-%Y (%H:%M:%S)")
+            # splitext works for any extension; a plain ".txt" replace would
+            # rename the file onto itself (and then truncate it) otherwise.
+            root, ext = os.path.splitext(self.outputFile)
+            os.rename(self.outputFile, root+tsStr+ext)
+        self.createInputDir(self.outputFile)
+        with open(self.outputFile,"w+"):
+            pass
+
+    def loginValidation(self,session):
+        """Return True when the login page fetched with ``session`` contains "Log out"."""
+        page = session.get(self.loginLink)
+        return "Log out" in page.text
+
+    def collectionPageContenValidation(self,pageText):
+        """Return False when ``pageText`` reports that no results matched."""
+        return "No results matched your criteria" not in pageText
+
+    def ScrapeDownloadLinks(self,filePath):
+        """Log in and write every WARC/CDX link of the collection to ``filePath``.
+
+        Pages are requested with increasing page numbers until one contains
+        the "No results matched your criteria" marker.
+
+        Raises:
+            SystemExit: when the login cannot be validated.
+        """
+        self.generateDownloadLinkFile(filePath)
+        with requests.Session() as s:
+            # Login
+            s.get(self.loginLink)
+            s.post(self.loginLink,data = self.credential)
+            if not self.loginValidation(s):
+                print("Bad Login")
+                # sys.exit must be *called*; a bare reference does nothing
+                # and the scrape would continue without a valid login.
+                sys.exit(1)
+            print("Log In Successful")
+
+            # iterate collection page to get all collection content
+            pageNumber = 1
+            while True:
+                currentPageLink = self.collectionUrlSorted+str(pageNumber)
+                currentPage = s.get(currentPageLink)
+                if not self.collectionPageContenValidation(currentPage.text):
+                    print("End at page:")
+                    print(currentPageLink)
+                    return
+                self.parseWaybackCollectionItems(currentPage,s)
+                pageNumber+=1
+
+    def parseWaybackCollectionItems(self,page,session):
+        """Visit the download page of every item listed on a collection page.
+
+        Items are the ``/details/`` anchors inside the element with class
+        ``results`` whose own class is not ``stealth``.
+
+        Raises:
+            AttributeError: when the page has no ``results`` element.
+        """
+        soup = bs(page.text,'html.parser')
+        collectionList = soup.find(class_='results')
+        collectionList_items = collectionList.find_all('a',href=re.compile('/details/.*'),class_=lambda x: x != 'stealth')
+        for item in collectionList_items:
+            itemLink = "https://archive.org"+item.get('href')
+            # Replace only the path segment, and only once, so identifiers
+            # that happen to contain "details" are left untouched.
+            downloadPageLink = itemLink.replace("/details/","/download/",1)
+            itemPage = session.get(downloadPageLink)
+            self.parseWaybackItemDownloadPage(itemPage,downloadPageLink,session)
+
+    def parseWaybackItemDownloadPage(self,page,parentLink,session):
+        """Append the WARC and CDX links found on one item download page.
+
+        Links are written pairwise (WARC first, then CDX) and are built as
+        ``parentLink + "/" + href``. A page without a directory listing is
+        reported and skipped.
+
+        Raises:
+            SystemExit: when the number of WARC and CDX links differs.
+        """
+        soup = bs(page.text,'html.parser')
+        downloadList = soup.find(class_='directory-listing-table')
+        if downloadList is None:
+            print("No directory listing found, skipping: {}".format(parentLink))
+            return
+        warcItems = downloadList.find_all('a',href=self._warcHrefPattern)
+        cdxItems = downloadList.find_all('a',href=self._cdxHrefPattern)
+        if len(warcItems) != len(cdxItems):
+            # format() avoids the str + int TypeError of plain concatenation.
+            print("WARC CDX item number not matching: WARC {} cdx {}".format(len(warcItems),len(cdxItems)))
+            sys.exit(1)
+        for warc,cdx in zip(warcItems,cdxItems):
+            self.appendLineToFile(parentLink+"/"+warc.get('href'))
+            self.appendLineToFile(parentLink+"/"+cdx.get('href'))
+
+    def appendLineToFile(self,input):
+        """Append ``input`` followed by ``\\r\\n`` to the current output file."""
+        with open(self.outputFile,"a+") as f:
+            f.write(input)
+            f.write("\r\n")
+
+    def createInputDir(self,input):
+        """Create the parent directory of the path ``input`` if it is missing.
+
+        A bare file name (no directory part) needs no directory and is a no-op.
+        """
+        parentDir = os.path.dirname(input)
+        if parentDir:
+            os.makedirs(parentDir, exist_ok=True)
\ No newline at end of file