import json

import requests as r
import requests.exceptions
from bs4 import BeautifulSoup
from flask import Flask, jsonify, request
from requests.exceptions import RequestException

app = Flask(__name__)

# Browser-like User-Agent: some sites refuse the default python-requests agent.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    )
}

# Seconds before an HTTP request is abandoned; without a timeout, requests.get
# can block forever on an unresponsive host and hang the Flask worker.
REQUEST_TIMEOUT = 10


def _paragraph_text(soup):
    """Return the text of every <p> tag in *soup*, each followed by a newline.

    Central helper for the extraction loop that was previously duplicated in
    every scraping function; uses str.join instead of quadratic `+=`.
    """
    return "".join(paragraph.text + "\n" for paragraph in soup.find_all("p"))


@app.route('/scrape_single', methods=['GET'])
def scrape_single_webpage_api():
    """Flask endpoint: scrape the page named by the ?link= query parameter.

    Returns JSON {"link": ..., "text": ...} on success, or an error payload
    with status 400 (missing link) / 500 (fetch or HTTP failure).
    """
    # Retrieve the 'link' parameter from the URL query string.
    link = request.args.get('link')
    if not link:
        return jsonify({"error": "No link provided"}), 400
    try:
        resp = r.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except RequestException as exception:
        # Return the error as JSON rather than letting Flask emit a 500 page.
        return jsonify({"error": str(exception)}), 500
    soup = BeautifulSoup(resp.content, "html.parser")
    return jsonify({"link": link, "text": _paragraph_text(soup)})


def scrape_single_webpage(link):
    """Fetch *link* and return {"link": link, "text": <paragraph text>}.

    On any request failure, returns the string "Error identified: <exc>"
    instead of raising — callers distinguish success by checking for a dict.
    """
    try:
        resp = r.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except RequestException as exception:
        return f"Error identified: {exception}"
    soup = BeautifulSoup(resp.content, "html.parser")
    return {"link": link, "text": _paragraph_text(soup)}


def scrape_multi_webpage(filename):
    """Scrape every URL listed (one per line) in *filename*.

    Blank lines are skipped. Returns a JSON string
    {"URLs": [list of urls], "Text": <all scraped text concatenated>}.
    Error strings from failed fetches are appended to the text, preserving
    the best-effort behavior (one bad URL does not abort the batch).
    """
    urls = []
    chunks = []
    with open(filename, "r") as file:
        for link in file:
            link = link.strip()
            if not link:
                continue  # skip blank lines
            urls.append(link)
            result = scrape_single_webpage(link)
            # Success yields a dict; failure yields an error string.
            chunks.append(result["text"] if isinstance(result, dict) else result)
    all_text = "".join(chunk + "\n" for chunk in chunks)
    return json.dumps({"URLs": urls, "Text": all_text})


def scrape_url(link):
    """Fetch *link* and return its paragraph text as a single string.

    On any request failure, returns "Error identified: <exc>".
    """
    try:
        resp = r.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except RequestException as exception:
        return f"Error identified: {exception}"
    soup = BeautifulSoup(resp.content, "html.parser")
    return _paragraph_text(soup)


def scrape_html(htmlText):
    """Extract paragraph text from an HTML string (no network access)."""
    soup = BeautifulSoup(htmlText, 'html.parser')
    return _paragraph_text(soup)