Skip to content
Snippets Groups Projects
scraper.py 3.03 KiB
Newer Older
  • Learn to ignore specific revisions
  • Farhan Mohammed's avatar
    Farhan Mohammed committed
    import requests as r
    
    Farhan Mohammed's avatar
    Farhan Mohammed committed
    import requests.exceptions
    from flask import Flask, request, jsonify
    from requests.exceptions import RequestException
    
    Farhan Mohammed's avatar
    Farhan Mohammed committed
    from bs4 import BeautifulSoup
    
    Farhan Mohammed's avatar
    Farhan Mohammed committed
    import json
    app = Flask(__name__)
    
    Farhan Mohammed's avatar
    Farhan Mohammed committed
    @app.route('/scrape_single', methods=['GET'])
    def scrape_single_webpage_api():
        """Scrape the text of all <p> elements from a single web page.

        Query params:
            link: URL of the page to scrape (required).

        Returns:
            200 with JSON {"link": ..., "text": ...} on success,
            400 with JSON {"error": ...} if no link was provided,
            500 with JSON {"error": ...} if the HTTP request failed.
        """
        link = request.args.get('link')  # Retrieve the 'link' parameter from the URL query string
        if not link:
            return jsonify({"error": "No link provided"}), 400

        # Browser-like User-Agent so sites that reject default Python clients respond.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

        try:
            # timeout prevents a slow/unresponsive site from hanging this endpoint forever
            resp = r.get(link, headers=headers, timeout=10)
            resp.raise_for_status()
        except RequestException as exception:
            return jsonify({"error": str(exception)}), 500  # Return the error as JSON

        soup = BeautifulSoup(resp.content, "html.parser")
        # Each paragraph's text is followed by a newline, same as the original loop.
        text = "".join(paragraph.text + "\n" for paragraph in soup.find_all("p"))

        return jsonify({"link": link, "text": text})  # Return the response as JSON
    
    
    Farhan Mohammed's avatar
    Farhan Mohammed committed
    # Returns text of web page
    def scrape_single_webpage(link):
        """Fetch *link* and return its <p> text.

        Returns:
            dict {"link": link, "text": text} on success, where text is the
            concatenation of every paragraph's text, each followed by "\n".
            On a request failure, returns the error as a plain string
            (callers distinguish the cases via isinstance(result, dict)).
        """
        # Browser-like User-Agent so sites that reject default Python clients respond.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

        try:
            # timeout prevents a slow/unresponsive site from blocking the caller forever
            resp = r.get(link, headers=headers, timeout=10)
            resp.raise_for_status()
        except requests.exceptions.RequestException as exception:
            return f"Error identified: {exception}"

        soupObject = BeautifulSoup(resp.content, "html.parser")
        # Each paragraph's text is followed by a newline, same as the original loop.
        text = "".join(paragraph.text + "\n" for paragraph in soupObject.find_all("p"))
        return {"link": link, "text": text}
        
    def scrape_multi_webpage(filename):
        """Scrape every URL listed (one per line) in *filename*.

        Blank lines are skipped. For each URL, scrape_single_webpage is
        called; successful results contribute their "text", failed ones
        contribute their error string.

        Returns:
            JSON string: {"URLs": [...], "Text": "..."} where Text is the
            concatenation of each page's text, each followed by "\n".
        """
        urls = []
        chunks = []  # collect pieces and join once instead of quadratic +=
        # Explicit encoding so the URL list decodes the same on every platform.
        with open(filename, "r", encoding="utf-8") as file:
            for link in file:
                link = link.strip()
                if not link:
                    continue  # skip blank lines
                urls.append(link)
                result = scrape_single_webpage(link)
                # A dict means success; anything else is the error string.
                if isinstance(result, dict):
                    chunks.append(result["text"])
                else:
                    chunks.append(result)
        all_text = "".join(chunk + "\n" for chunk in chunks)
        return json.dumps({"URLs": urls, "Text": all_text})
    
    
    Tarek Shah's avatar
    Tarek Shah committed
    def scrape_url(link):
        """Fetch *link* and return its <p> text as one string.

        Returns:
            str: every paragraph's text, each followed by "\n"; on a request
            failure, an "Error identified: ..." message string instead.
        """
        # Browser-like User-Agent so sites that reject default Python clients respond.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

        try:
            # timeout prevents a slow/unresponsive site from blocking the caller forever
            resp = r.get(link, headers=headers, timeout=10)
            resp.raise_for_status()
        except requests.exceptions.RequestException as exception:
            return f"Error identified: {exception}"

        soupObject = BeautifulSoup(resp.content, "html.parser")
        # Each paragraph's text is followed by a newline, same as the original loop.
        return "".join(paragraph.text + "\n" for paragraph in soupObject.find_all("p"))
    
    def scrape_html(htmlText):
        """Extract paragraph text from an HTML string.

        Parses *htmlText* and returns the text of every <p> element,
        each followed by a newline, concatenated into one string.
        """
        parsed = BeautifulSoup(htmlText, 'html.parser')
        return "".join(p.text + "\n" for p in parsed.find_all("p"))
    
    
    Farhan Mohammed's avatar
    Farhan Mohammed committed
    # webpage = "https://www.cbsnews.com/news/enrique-marquez-san-bernardino-shooter-friend-pleads-guilty-to-supplying-weapons/"
    # print(scrape_single_webpage(webpage)['text'])
    #print(scrape_multi_webpage('test.txt'))