import json

import requests as r
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify
from requests.exceptions import RequestException

app = Flask(__name__)

@app.route('/scrape_single', methods=['GET'])
def scrape_single_webpage_api():
    # Retrieve the 'link' parameter from the URL query string
    link = request.args.get('link')
    if not link:
        return jsonify({"error": "No link provided"}), 400
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        resp = r.get(link, headers=headers)
        resp.raise_for_status()
    except RequestException as exception:
        return jsonify({"error": str(exception)}), 500  # Return the error as JSON
    soup = BeautifulSoup(resp.content, "html.parser")
    text = ""
    for paragraph in soup.find_all("p"):
        text += paragraph.text + "\n"
    return jsonify({"link": link, "text": text})  # Return the response as JSON
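
# Example request (a sketch; assumes the app is running locally on Flask's
# default port 5000 — see the __main__ block at the bottom of this file):
#   curl "http://127.0.0.1:5000/scrape_single?link=https://example.com"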

# Returns a dict with the link and the paragraph text of the web page,
# or an error string if the request fails
def scrape_single_webpage(link):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        resp = r.get(link, headers=headers)
        resp.raise_for_status()
    except RequestException as exception:
        return f"Error identified: {exception}"
    soupObject = BeautifulSoup(resp.content, "html.parser")
    text = ""
    for paragraph in soupObject.find_all("p"):
        text += paragraph.text + "\n"
    return {"link": link, "text": text}
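
# Example usage (hypothetical URL; the function returns a dict on success
# and an error string on failure):
#   result = scrape_single_webpage("https://example.com")
#   if isinstance(result, dict):
#       print(result["text"])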

# Scrapes every URL listed in a file (one URL per line) and returns a JSON string
def scrape_multi_webpage(filename):
    urls = []
    all_text = ""
    with open(filename, "r") as file:
        for link in file:
            link = link.strip()
            if link:
                urls.append(link)
                result = scrape_single_webpage(link)
                if isinstance(result, dict):
                    all_text += result["text"] + "\n"
                else:
                    all_text += result + "\n"
    return json.dumps({"URLs": urls, "Text": all_text})
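
# Example usage (assumes a hypothetical 'urls.txt' containing one URL per line,
# e.g. "https://example.com" followed by "https://example.org"):
#   print(scrape_multi_webpage("urls.txt"))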

# Returns only the paragraph text of a web page (no link metadata)
def scrape_url(link):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        resp = r.get(link, headers=headers)
        resp.raise_for_status()
    except RequestException as exception:
        return f"Error identified: {exception}"
    soupObject = BeautifulSoup(resp.content, "html.parser")
    text = ""
    for paragraph in soupObject.find_all("p"):
        text += paragraph.text + "\n"
    return text
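
# Example usage (hypothetical URL):
#   print(scrape_url("https://example.com"))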

# Extracts paragraph text from a raw HTML string (no network request)
def scrape_html(htmlText):
    soupObject = BeautifulSoup(htmlText, 'html.parser')
    text = ""
    for paragraph in soupObject.find_all("p"):
        text += paragraph.text + "\n"
    return text
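
# Example usage with an inline HTML string (a quick sanity check):
#   print(scrape_html("<html><body><p>Hello</p><p>World</p></body></html>"))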

# webpage = "https://www.cbsnews.com/news/enrique-marquez-san-bernardino-shooter-friend-pleads-guilty-to-supplying-weapons/"
# print(scrape_single_webpage(webpage)['text'])
# print(scrape_multi_webpage('test.txt'))
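
# Minimal entry point so the /scrape_single route above is reachable — a sketch
# assuming local development; the debug setting is a placeholder.
if __name__ == "__main__":
    app.run(debug=True)  # Serves on http://127.0.0.1:5000 by default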