Commit 79467a24 authored by Tarek Shah

working version

parent a1cf42e0
{"Users": 2, "Collections": 2}
\ No newline at end of file
{"Users": 2, "Collections": 5}
\ No newline at end of file
@@ -13,6 +13,7 @@ import glob
 import summarizer_implementations.t5 as t5
 import summarizer_implementations.nltk_summarizer as nltk
 import summarizer_implementations.bert as bert
+import scraper
 
 app = Flask(__name__)
 app.config['JWT_SECRET_KEY'] = 'PI'
@@ -227,7 +228,7 @@ def v1_summarize_t5():
     glob = ""
-    if collection["type"] == "text":
+    if collection["type"] == "text" or collection["type"] == "url" or collection["type"] == "html":
         for file_path in items:
             with open(file_path, "r", encoding="utf8") as f:
                 glob += f.read()
@@ -264,7 +265,7 @@ def v1_summarize_nltk():
     # files = database.get_raw_text_files(collection_info["collection_id"])
     glob = ""
-    if collection["type"] == "text":
+    if collection["type"] == "text" or collection["type"] == "url" or collection["type"] == "html":
         for file_path in items:
             with open(file_path, "r", encoding="utf8") as f:
                 glob += f.read()
@@ -302,7 +303,7 @@ def v1_summarize_bert():
     # files = database.get_raw_text_files(collection_info["collection_id"])
     glob = ""
-    if collection["type"] == "text":
+    if collection["type"] == "text" or collection["type"] == "url" or collection["type"] == "html":
         for file_path in items:
             with open(file_path, "r", encoding="utf8") as f:
                 glob += f.read()
@@ -323,10 +324,9 @@ def v1_summarize_bert():
             "status":"failure",
         }, 500
 
-@app.route('/api/v1/upload_raw_text', methods=['POST'])
-def v1_upload_raw_text():
-    "http://127.0.0.1:5000//api/v1/upload_raw_text?collection=0"
+@app.route('/api/v1/upload_url_file', methods=['POST'])
+def v1_upload_url_file():
+    "http://127.0.0.1:5000//api/v1/upload_url_file?collection=0"
 
     collection_id = request.args.get("collection")
     if not collection_id:
@@ -336,19 +336,34 @@ def v1_upload_raw_text():
     os.makedirs(f"./backend/storage/{collection_id}",exist_ok=True)
 
-    for zip_file in request.files.keys():
-        path = f"./backend/storage/{collection_id}/{zip_file}"
-        request.files[zip_file].save(path)
+    for url_file in request.files.keys():
+        path = f"./backend/storage/{collection_id}/{url_file}"
+        request.files[url_file].save(path)
+        print(f"Saved: {path}")
 
-        with zipfile.ZipFile(path, 'r') as zip_ref:
-            zip_ref.extractall(f"./backend/storage/{collection_id}")
+        with open(path, 'r', encoding='utf-8') as file:
+            urls = [line.strip() for line in file]
+
+        for url in urls:  # iterate through the uploaded list of URLs
+            try:
+                text = scraper.scrape_url(url)
+                if text.lower().startswith('error'):  # scrape_url signals failure via an error string
+                    continue
+            except Exception as e:
+                print(f"Error: {e}")
+                continue  # disregard this URL
+
+            file_name = url.split('/')[-1].strip() + '.txt'  # name the file after the last URL segment
+            if file_name == '.txt':  # URL ended with a slash, so fall back to the previous segment
+                file_name = url.split('/')[-2].strip() + '.txt'
+            file_path = f'./backend/storage/{collection_id}/{file_name}'
+            with open(file_path, 'w', encoding='utf-8') as text_file:  # store the scraped text under the collection
+                text_file.write(text)
 
     glob_path = r'./backend/storage/' + collection_id + r'/*.txt'
     for file_path in glob.glob(glob_path):
         database.create_raw_text_file(collection_id, file_path)
 
     return {"status":"success"}, 200
 
 @app.route('/api/v1/upload_raw_html', methods=['POST'])
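A minimal client call for the new URL-list endpoint might look like the sketch below; the host, the collection id 0, and the urls.txt filename are placeholders, since the handler saves every uploaded file and reads it back as a newline-separated list of URLs.

import requests

# Hypothetical sketch: POST a newline-separated URL list to the new endpoint.
# "urls.txt" and collection=0 are placeholders, not values from the commit.
with open("urls.txt", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:5000/api/v1/upload_url_file",
        params={"collection": "0"},
        files={"urls.txt": f},
    )
print(resp.status_code, resp.json())  # expect 200 and {"status": "success"}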
@@ -368,20 +383,25 @@ def v1_upload_raw_html():
         request.files[zip_file].save(path)
 
         with zipfile.ZipFile(path, 'r') as zip_ref:
-            zip_ref.extractall(f"./backend/storage/{collection_id}")
-
-    glob_path = r'./backend/storage/' + collection_id + r'/*.html'
+            # zip_ref.extractall(f"./backend/storage/{collection_id}")
+            for file_name in zip_ref.namelist():  # loop over every entry in the zip archive
+                with zip_ref.open(file_name) as file:
+                    content = file.read()
+                text = scraper.scrape_html(content)
+                file_path = f'./backend/storage/{collection_id}/{file_name}'
+                with open(file_path, 'w', encoding='utf-8') as text_file:
+                    text_file.write(text)
+
+    ## deprecated code
+    glob_path = r'./backend/storage/' + collection_id + r'/*.txt'
     for file_path in glob.glob(glob_path):
         database.create_raw_text_file(collection_id, file_path)
+    ##
 
     return {"status":"success"}, 200
 
-@app.route('/api/v1/upload_url_file', methods=['POST'])
-def v1_upload_url_file():
-    "http://127.0.0.1:5000//api/v1/upload_url_file?collection=0"
+@app.route('/api/v1/upload_raw_text', methods=['POST'])
+def v1_upload_raw_text():
+    "http://127.0.0.1:5000//api/v1/upload_raw_text?collection=0"
 
     collection_id = request.args.get("collection")
     if not collection_id:
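The reworked HTML upload can be exercised the same way; a hedged sketch, assuming a zip of .html pages (pages.zip and collection=0 are placeholders) that the handler now scrapes to plain text instead of extracting verbatim:

import requests

# Hypothetical sketch: upload a zip of .html pages to the reworked endpoint.
# "pages.zip" and collection=0 are placeholders, not values from the commit.
with open("pages.zip", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:5000/api/v1/upload_raw_html",
        params={"collection": "0"},
        files={"pages.zip": f},
    )
print(resp.status_code, resp.json())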
@@ -391,10 +411,17 @@ def v1_upload_url_file():
     os.makedirs(f"./backend/storage/{collection_id}",exist_ok=True)
 
-    for url_file in request.files.keys():
-        path = f"./backend/storage/{collection_id}/{url_file}"
-        request.files[url_file].save(path)
-        print(f"Saved: {path}")
+    for zip_file in request.files.keys():
+        path = f"./backend/storage/{collection_id}/{zip_file}"
+        request.files[zip_file].save(path)
+
+        with zipfile.ZipFile(path, 'r') as zip_ref:
+            zip_ref.extractall(f"./backend/storage/{collection_id}")
 
     glob_path = r'./backend/storage/' + collection_id + r'/*.txt'
     for file_path in glob.glob(glob_path):
         database.create_raw_text_file(collection_id, file_path)
 
     return {"status":"success"}, 200
@@ -415,10 +442,11 @@ def __get_items(id):
     elif c_type == "url":
         glob_path = r'./backend/storage/' + id + r'/*.txt'
         for file_path in glob.glob(glob_path):
-            with open(file_path) as file:
-                items = [line.rstrip() for line in file]
+            items.append(file_path)
+            # with open(file_path) as file:
+            #     items = [line.rstrip() for line in file]
     elif c_type == "html":
-        glob_path = r'./backend/storage/' + id + r'/*.html'
+        glob_path = r'./backend/storage/' + id + r'/*.txt'
         for file_path in glob.glob(glob_path):
             items.append(file_path)
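Net effect of this hunk: because URL and HTML uploads are now scraped to .txt at upload time, all three collection types reduce to globbing .txt files from the collection's storage folder. A sketch of the equivalent logic, with the collection type passed in by the caller since the real lookup sits outside the visible diff:

import glob

def get_items_sketch(collection_id, c_type):
    # Hypothetical condensed form of the patched branches: every supported
    # type is now just a folder of scraped .txt files.
    if c_type in ("text", "url", "html"):
        return glob.glob(r'./backend/storage/' + collection_id + r'/*.txt')
    return []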
@@ -64,6 +64,28 @@ def scrape_multi_webpage(filename):
         all_text += result + "\n"
     return json.dumps({"URLs": urls, "Text": all_text})
 
+def scrape_url(link):
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
+    try:
+        resp = r.get(link, headers=headers)
+        resp.raise_for_status()
+    except r.exceptions.RequestException as exception:
+        return f"Error identified: {exception}"
+
+    soupObject = BeautifulSoup(resp.content, "html.parser")
+    text = ""
+    for paragraph in soupObject.find_all("p"):
+        text += paragraph.text + "\n"
+    return text
+
+def scrape_html(htmlText):
+    soupObject = BeautifulSoup(htmlText, 'html.parser')
+    text = ""
+    for paragraph in soupObject.find_all("p"):
+        text += paragraph.text + "\n"
+    return text
+
 # webpage = "https://www.cbsnews.com/news/enrique-marquez-san-bernardino-shooter-friend-pleads-guilty-to-supplying-weapons/"