Commit 79467a24 authored by Tarek Shah

working version

parent a1cf42e0
{"Users": 2, "Collections": 2}
\ No newline at end of file
{"Users": 2, "Collections": 5}
\ No newline at end of file
@@ -13,6 +13,7 @@ import glob
 import summarizer_implementations.t5 as t5
 import summarizer_implementations.nltk_summarizer as nltk
 import summarizer_implementations.bert as bert
+import scraper
 
 app = Flask(__name__)
 app.config['JWT_SECRET_KEY'] = 'PI'
@@ -227,7 +228,7 @@ def v1_summarize_t5():
     glob = ""
-    if collection["type"] == "text":
+    if collection["type"] == "text" or collection["type"] == "url" or collection["type"] == "html":
         for file_path in items:
             with open(file_path, "r", encoding="utf8") as f:
                 glob += f.read()
@@ -264,7 +265,7 @@ def v1_summarize_nltk():
     # files = database.get_raw_text_files(collection_info["collection_id"])
     glob = ""
-    if collection["type"] == "text":
+    if collection["type"] == "text" or collection["type"] == "url" or collection["type"] == "html":
         for file_path in items:
             with open(file_path, "r", encoding="utf8") as f:
                 glob += f.read()
@@ -302,7 +303,7 @@ def v1_summarize_bert():
     # files = database.get_raw_text_files(collection_info["collection_id"])
     glob = ""
-    if collection["type"] == "text":
+    if collection["type"] == "text" or collection["type"] == "url" or collection["type"] == "html":
         for file_path in items:
             with open(file_path, "r", encoding="utf8") as f:
                 glob += f.read()
@@ -323,10 +324,9 @@ def v1_summarize_bert():
             "status":"failure",
         }, 500
 
-@app.route('/api/v1/upload_raw_text', methods=['POST'])
-def v1_upload_raw_text():
-    "http://127.0.0.1:5000//api/v1/upload_raw_text?collection=0"
+@app.route('/api/v1/upload_url_file', methods=['POST'])
+def v1_upload_url_file():
+    "http://127.0.0.1:5000//api/v1/upload_url_file?collection=0"
 
     collection_id = request.args.get("collection")
     if not collection_id:
@@ -336,19 +336,34 @@ def v1_upload_raw_text():
     os.makedirs(f"./backend/storage/{collection_id}",exist_ok=True)
 
-    for zip_file in request.files.keys():
-        path = f"./backend/storage/{collection_id}/{zip_file}"
-        request.files[zip_file].save(path)
+    for url_file in request.files.keys():
+        path = f"./backend/storage/{collection_id}/{url_file}"
+        request.files[url_file].save(path)
+        print(f"Saved: {path}")
 
-        with zipfile.ZipFile(path, 'r') as zip_ref:
-            zip_ref.extractall(f"./backend/storage/{collection_id}")
+        with open(path, 'r', encoding='utf-8') as file:
+            urls = [line.strip() for line in file]
+
+        for url in urls:  # iterate through the uploaded list of URLs
+            try:
+                text = scraper.scrape_url(url)
+                if text.lower().startswith('error'):  # scrape_url signals failure via an error string
+                    continue
+            except Exception as e:
+                print(f"Error: {e}")
+                continue  # disregard this URL
+
+            file_name = url.split('/')[-1].strip() + '.txt'  # name the file after the last URL segment
+            if file_name == '.txt':  # URL ended with a slash, so fall back to the previous segment
+                file_name = url.split('/')[-2].strip() + '.txt'
+            file_path = f'./backend/storage/{collection_id}/{file_name}'
+            with open(file_path, 'w', encoding='utf-8') as text_file:  # store the scraped text under the collection
+                text_file.write(text)
 
     glob_path = r'./backend/storage/' + collection_id + r'/*.txt'
     for file_path in glob.glob(glob_path):
         database.create_raw_text_file(collection_id, file_path)
 
     return {"status":"success"}, 200
 
 @app.route('/api/v1/upload_raw_html', methods=['POST'])
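A minimal client call for the new URL-list endpoint might look like the sketch below; the host, the collection id 0, and the urls.txt filename are placeholders, since the handler saves every uploaded file and reads it back as a newline-separated list of URLs.

import requests

# Hypothetical sketch: POST a newline-separated URL list to the new endpoint.
# "urls.txt" and collection=0 are placeholders, not values from the commit.
with open("urls.txt", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:5000/api/v1/upload_url_file",
        params={"collection": "0"},
        files={"urls.txt": f},
    )
print(resp.status_code, resp.json())  # expect 200 and {"status": "success"}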
@@ -368,20 +383,25 @@ def v1_upload_raw_html():
         request.files[zip_file].save(path)
 
         with zipfile.ZipFile(path, 'r') as zip_ref:
-            zip_ref.extractall(f"./backend/storage/{collection_id}")
-
-    glob_path = r'./backend/storage/' + collection_id + r'/*.html'
+            # zip_ref.extractall(f"./backend/storage/{collection_id}")
+            for file_name in zip_ref.namelist():  # loop over every entry in the zip archive
+                with zip_ref.open(file_name) as file:
+                    content = file.read()
+                text = scraper.scrape_html(content)
+                file_path = f'./backend/storage/{collection_id}/{file_name}'
+                with open(file_path, 'w', encoding='utf-8') as text_file:
+                    text_file.write(text)
+
+    ## deprecated code
+    glob_path = r'./backend/storage/' + collection_id + r'/*.txt'
     for file_path in glob.glob(glob_path):
         database.create_raw_text_file(collection_id, file_path)
+    ##
 
     return {"status":"success"}, 200
 
-@app.route('/api/v1/upload_url_file', methods=['POST'])
-def v1_upload_url_file():
-    "http://127.0.0.1:5000//api/v1/upload_url_file?collection=0"
+@app.route('/api/v1/upload_raw_text', methods=['POST'])
+def v1_upload_raw_text():
+    "http://127.0.0.1:5000//api/v1/upload_raw_text?collection=0"
 
     collection_id = request.args.get("collection")
     if not collection_id:
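The reworked HTML upload can be exercised the same way; a hedged sketch, assuming a zip of .html pages (pages.zip and collection=0 are placeholders) that the handler now scrapes to plain text instead of extracting verbatim:

import requests

# Hypothetical sketch: upload a zip of .html pages to the reworked endpoint.
# "pages.zip" and collection=0 are placeholders, not values from the commit.
with open("pages.zip", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:5000/api/v1/upload_raw_html",
        params={"collection": "0"},
        files={"pages.zip": f},
    )
print(resp.status_code, resp.json())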
@@ -391,10 +411,17 @@ def v1_upload_url_file():
     os.makedirs(f"./backend/storage/{collection_id}",exist_ok=True)
 
-    for url_file in request.files.keys():
-        path = f"./backend/storage/{collection_id}/{url_file}"
-        request.files[url_file].save(path)
-        print(f"Saved: {path}")
+    for zip_file in request.files.keys():
+        path = f"./backend/storage/{collection_id}/{zip_file}"
+        request.files[zip_file].save(path)
+
+        with zipfile.ZipFile(path, 'r') as zip_ref:
+            zip_ref.extractall(f"./backend/storage/{collection_id}")
 
     glob_path = r'./backend/storage/' + collection_id + r'/*.txt'
     for file_path in glob.glob(glob_path):
         database.create_raw_text_file(collection_id, file_path)
 
     return {"status":"success"}, 200
@@ -415,10 +442,11 @@ def __get_items(id):
     elif c_type == "url":
         glob_path = r'./backend/storage/' + id + r'/*.txt'
         for file_path in glob.glob(glob_path):
-            with open(file_path) as file:
-                items = [line.rstrip() for line in file]
+            items.append(file_path)
+            # with open(file_path) as file:
+            #     items = [line.rstrip() for line in file]
     elif c_type == "html":
-        glob_path = r'./backend/storage/' + id + r'/*.html'
+        glob_path = r'./backend/storage/' + id + r'/*.txt'
         for file_path in glob.glob(glob_path):
             items.append(file_path)
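Net effect of this hunk: because URL and HTML uploads are now scraped to .txt at upload time, all three collection types reduce to globbing .txt files from the collection's storage folder. A sketch of the equivalent logic, with the collection type passed in by the caller since the real lookup sits outside the visible diff:

import glob

def get_items_sketch(collection_id, c_type):
    # Hypothetical condensed form of the patched branches: every supported
    # type is now just a folder of scraped .txt files.
    if c_type in ("text", "url", "html"):
        return glob.glob(r'./backend/storage/' + collection_id + r'/*.txt')
    return []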
@@ -64,6 +64,28 @@ def scrape_multi_webpage(filename):
         all_text += result + "\n"
     return json.dumps({"URLs": urls, "Text": all_text})
 
+def scrape_url(link):
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
+    try:
+        resp = r.get(link, headers=headers)
+        resp.raise_for_status()
+    except r.exceptions.RequestException as exception:
+        return f"Error identified: {exception}"
+
+    soupObject = BeautifulSoup(resp.content, "html.parser")
+    text = ""
+    for paragraph in soupObject.find_all("p"):
+        text += paragraph.text + "\n"
+    return text
+
+def scrape_html(htmlText):
+    soupObject = BeautifulSoup(htmlText, 'html.parser')
+    text = ""
+    for paragraph in soupObject.find_all("p"):
+        text += paragraph.text + "\n"
+    return text
+
 # webpage = "https://www.cbsnews.com/news/enrique-marquez-san-bernardino-shooter-friend-pleads-guilty-to-supplying-weapons/"