diff --git a/__pycache__/scraper.cpython-312.pyc b/__pycache__/scraper.cpython-312.pyc
index 98f2a8fe0cb9814e7543b4291b1c61344efe2b16..f74863a53ea0e12effc8b81a5c15b218f1ef3cc2 100644
Binary files a/__pycache__/scraper.cpython-312.pyc and b/__pycache__/scraper.cpython-312.pyc differ
diff --git a/crisis_events.db b/crisis_events.db
index c0173a3a9bd6203f6fd569ab359d40f6b60276f0..20d733224bf5bbaa1bdec121901d4ff43d8fe84c 100644
Binary files a/crisis_events.db and b/crisis_events.db differ
diff --git a/crisis_events_database_metadata.json b/crisis_events_database_metadata.json
index 96f6edc7c9c9db46747cabb70150c3deb5df1add..34450736630c8dabcaab761cb7933ef8d69ebd50 100644
--- a/crisis_events_database_metadata.json
+++ b/crisis_events_database_metadata.json
@@ -1 +1 @@
-{"Users": 2, "Collections": 2}
\ No newline at end of file
+{"Users": 2, "Collections": 5}
\ No newline at end of file
diff --git a/flask_backend.py b/flask_backend.py
index 98cfa6dbbd71cdcf8a4c7f379f39c138eea29096..de564cc907f0ee630e034383c49b862f9bc8f7b2 100644
--- a/flask_backend.py
+++ b/flask_backend.py
@@ -13,6 +13,7 @@ import glob
 import summarizer_implementations.t5 as t5
 import summarizer_implementations.nltk_summarizer as nltk
 import summarizer_implementations.bert as bert
+import scraper
 
 app = Flask(__name__)
 app.config['JWT_SECRET_KEY'] = 'PI'
@@ -227,7 +228,7 @@ def v1_summarize_t5():
 
     glob = ""
 
-    if collection["type"] == "text":
+    if collection["type"] == "text" or collection["type"] == "url" or collection["type"] == "html":
         for file_path in items:
             with open(file_path, "r", encoding="utf8") as f:
                 glob += f.read()
@@ -264,7 +265,7 @@ def v1_summarize_nltk():
     # files = database.get_raw_text_files(collection_info["collection_id"])
 
     glob = ""
-    if collection["type"] == "text":
+    if collection["type"] == "text" or collection["type"] == "url" or collection["type"] == "html":
         for file_path in items:
             with open(file_path, "r", encoding="utf8") as f:
                 glob += f.read()
@@ -302,7 +303,7 @@ def v1_summarize_bert():
     # files = database.get_raw_text_files(collection_info["collection_id"])
 
     glob = ""
-    if collection["type"] == "text":
+    if collection["type"] == "text" or collection["type"] == "url" or collection["type"] == "html":
         for file_path in items:
             with open(file_path, "r", encoding="utf8") as f:
                 glob += f.read()
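The T5, NLTK, and BERT handlers in the three hunks above now read every stored .txt file whether the collection's type is "text", "url", or "html", since all three types end up as plain-text files on disk after this change. A minimal sketch of an equivalent membership test for the repeated chained comparison (a possible simplification, not part of this patch):

    # Equivalent to the chained == checks: all three collection types are
    # read the same way once their contents exist as .txt files.
    if collection["type"] in ("text", "url", "html"):
        for file_path in items:
            with open(file_path, "r", encoding="utf8") as f:
                glob += f.read()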
@@ -323,10 +324,9 @@ def v1_summarize_bert():
         "status":"failure",
     }, 500
 
-
-@app.route('/api/v1/upload_raw_text', methods=['POST'])
-def v1_upload_raw_text():
-    "http://127.0.0.1:5000//api/v1/upload_raw_text?collection=0"
+@app.route('/api/v1/upload_url_file', methods=['POST'])
+def v1_upload_url_file():
+    "http://127.0.0.1:5000//api/v1/upload_url_file?collection=0"
 
     collection_id = request.args.get("collection")
     if not collection_id:
@@ -336,19 +336,34 @@ def v1_upload_raw_text():
 
     os.makedirs(f"./backend/storage/{collection_id}",exist_ok=True)
 
-    for zip_file in request.files.keys():
-        path = f"./backend/storage/{collection_id}/{zip_file}"
-        request.files[zip_file].save(path)
+    for url_file in request.files.keys():
+        path = f"./backend/storage/{collection_id}/{url_file}"
+        request.files[url_file].save(path)
+        print(f"Saved: {path}")
 
-        with zipfile.ZipFile(path, 'r') as zip_ref:
-            zip_ref.extractall(f"./backend/storage/{collection_id}")
+        with open(path, 'r', encoding='utf-8') as file:
+            urls = [line.strip() for line in file]
+
+        for url in urls:  # iterate through the list of URLs
+            try:
+                text = scraper.scrape_url(url)
+                if text.lower().startswith('error'):  # scrape_url returned an error string
+                    continue
+            except Exception as e:
+                print(f"Error: {e}")
+                continue  # disregard this URL
+
+            file_name = url.split('/')[-1].strip() + '.txt'  # derive the file name from the last URL segment
+            if file_name == '.txt':  # URL ended with a trailing slash
+                file_name = url.split('/')[-2].strip() + '.txt'
+            file_path = f'./backend/storage/{collection_id}/{file_name}'
+            with open(file_path, 'w', encoding='utf-8') as text_file:  # write the scraped text into storage
+                text_file.write(text)
 
     glob_path = r'./backend/storage/' + collection_id + r'/*.txt'
-
     for file_path in glob.glob(glob_path):
         database.create_raw_text_file(collection_id, file_path)
-
     return {"status":"success"}, 200
 
 @app.route('/api/v1/upload_raw_html', methods=['POST'])
@@ -368,20 +383,25 @@ def v1_upload_raw_html():
         request.files[zip_file].save(path)
 
         with zipfile.ZipFile(path, 'r') as zip_ref:
-            zip_ref.extractall(f"./backend/storage/{collection_id}")
-
-    glob_path = r'./backend/storage/' + collection_id + r'/*.html'
+            #zip_ref.extractall(f"./backend/storage/{collection_id}")
+            for file_name in zip_ref.namelist():  # loop over each entry in the uploaded zip
+                with zip_ref.open(file_name) as file:
+                    content = file.read()
+                    text = scraper.scrape_html(content)
+                    file_path = f'./backend/storage/{collection_id}/{file_name}'
+                    with open(file_path, 'w', encoding='utf-8') as text_file:
+                        text_file.write(text)
 
-    ##deprecated code
+    glob_path = r'./backend/storage/' + collection_id + r'/*.txt'
     for file_path in glob.glob(glob_path):
         database.create_raw_text_file(collection_id, file_path)
-    ##
 
     return {"status":"success"}, 200
+
 
-@app.route('/api/v1/upload_url_file', methods=['POST'])
-def v1_upload_url_file():
-    "http://127.0.0.1:5000//api/v1/upload_url_file?collection=0"
+@app.route('/api/v1/upload_raw_text', methods=['POST'])
+def v1_upload_raw_text():
+    "http://127.0.0.1:5000//api/v1/upload_raw_text?collection=0"
 
     collection_id = request.args.get("collection")
     if not collection_id:
@@ -391,10 +411,17 @@ def v1_upload_url_file():
 
     os.makedirs(f"./backend/storage/{collection_id}",exist_ok=True)
 
-    for url_file in request.files.keys():
-        path = f"./backend/storage/{collection_id}/{url_file}"
-        request.files[url_file].save(path)
-        print(f"Saved: {path}")
+    for zip_file in request.files.keys():
+        path = f"./backend/storage/{collection_id}/{zip_file}"
+        request.files[zip_file].save(path)
+
+        with zipfile.ZipFile(path, 'r') as zip_ref:
+            zip_ref.extractall(f"./backend/storage/{collection_id}")
+
+    glob_path = r'./backend/storage/' + collection_id + r'/*.txt'
+
+    for file_path in glob.glob(glob_path):
+        database.create_raw_text_file(collection_id, file_path)
 
     return {"status":"success"}, 200
 
@@ -415,10 +442,11 @@ def __get_items(id):
     elif c_type == "url":
         glob_path = r'./backend/storage/' + id + r'/*.txt'
         for file_path in glob.glob(glob_path):
-            with open(file_path) as file:
-                items = [line.rstrip() for line in file]
+            items.append(file_path)
+            # with open(file_path) as file:
+            #     items = [line.rstrip() for line in file]
     elif c_type == "html":
-        glob_path = r'./backend/storage/' + id + r'/*.html'
+        glob_path = r'./backend/storage/' + id + r'/*.txt'
         for file_path in glob.glob(glob_path):
             items.append(file_path)
 
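The /api/v1/upload_url_file handler above expects a plain-text upload with one URL per line; each URL is scraped with scraper.scrape_url and written into the collection's storage directory as its own .txt file before being registered in the database. A minimal client-side sketch of that flow, assuming the Flask app is running locally on port 5000 as in the handler docstrings; the urls.txt name is only an example:

    import requests

    # The key in the files dict becomes the saved file name on the server.
    with open("urls.txt", "rb") as f:
        resp = requests.post(
            "http://127.0.0.1:5000/api/v1/upload_url_file?collection=0",
            files={"urls.txt": f},
        )
    print(resp.status_code, resp.json())  # expect 200 and {"status": "success"}
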
diff --git a/scraper.py b/scraper.py
index 0cb74c07205fa69bd5e081f57acc0add9ef5f7c4..aec3fe1b09b02c1253851ef25cbd1b86b815989c 100644
--- a/scraper.py
+++ b/scraper.py
@@ -64,6 +64,28 @@ def scrape_multi_webpage(filename):
         all_text += result + "\n"
     return json.dumps({"URLs": urls, "Text": all_text})
 
+def scrape_url(link):
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
+
+    try:
+        resp = r.get(link, headers=headers)
+        resp.raise_for_status()
+    except requests.exceptions.RequestException as exception:
+        return f"Error identified: {exception}"
+
+    soupObject = BeautifulSoup(resp.content, "html.parser")
+    text = ""
+    for paragraph in soupObject.find_all("p"):
+        text += paragraph.text + "\n"
+    return text
+
+def scrape_html(htmlText):
+    soupObject = BeautifulSoup(htmlText, 'html.parser')
+    text = ""
+    for paragraph in soupObject.find_all("p"):
+        text += paragraph.text + "\n"
+    return text
+
 # webpage = "https://www.cbsnews.com/news/enrique-marquez-san-bernardino-shooter-friend-pleads-guilty-to-supplying-weapons/"
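scrape_url and scrape_html share the same BeautifulSoup step of concatenating the text of every <p> tag; scrape_url reports request failures by returning an "Error identified: ..." string rather than raising, which is why the upload handler skips results that start with "error". A quick smoke test of both helpers, assuming scraper.py already imports requests (referenced both as r and as requests above) and BeautifulSoup at module level:

    import scraper

    # URL borrowed from the commented-out example at the bottom of scraper.py.
    page_text = scraper.scrape_url(
        "https://www.cbsnews.com/news/enrique-marquez-san-bernardino-shooter-friend-pleads-guilty-to-supplying-weapons/"
    )
    if page_text.lower().startswith("error"):
        print(page_text)          # the request failed; the message says why
    else:
        print(page_text[:300])    # first few hundred characters of extracted paragraph text

    print(scraper.scrape_html("<p>first paragraph</p><p>second paragraph</p>"))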