Skip to content
Snippets Groups Projects
scraper.py 3.03 KiB
Newer Older
  • Learn to ignore specific revisions
  • Farhan Mohammed's avatar
    Farhan Mohammed committed
    import requests as r
    
    Farhan Mohammed's avatar
    Farhan Mohammed committed
    import requests.exceptions
    from flask import Flask, request, jsonify
    from requests.exceptions import RequestException
    
    Farhan Mohammed's avatar
    Farhan Mohammed committed
    from bs4 import BeautifulSoup
    
    Farhan Mohammed's avatar
    Farhan Mohammed committed
    import json
    app = Flask(__name__)
    
    Farhan Mohammed's avatar
    Farhan Mohammed committed
    @app.route('/scrape_single', methods=['GET'])
    def scrape_single_webpage_api():
        """Scrape the text of all <p> elements from a single web page.

        Query params:
            link: URL of the page to scrape (required).

        Returns:
            200 with JSON {"link": ..., "text": ...} on success,
            400 with JSON {"error": ...} if no link was provided,
            500 with JSON {"error": ...} if the HTTP request failed.
        """
        link = request.args.get('link')  # Retrieve the 'link' parameter from the URL query string
        if not link:
            return jsonify({"error": "No link provided"}), 400

        # Browser-like User-Agent so sites that reject default Python clients respond.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

        try:
            # timeout prevents a slow/unresponsive site from hanging this endpoint forever
            resp = r.get(link, headers=headers, timeout=10)
            resp.raise_for_status()
        except RequestException as exception:
            return jsonify({"error": str(exception)}), 500  # Return the error as JSON

        soup = BeautifulSoup(resp.content, "html.parser")
        # Each paragraph's text is followed by a newline, same as the original loop.
        text = "".join(paragraph.text + "\n" for paragraph in soup.find_all("p"))

        return jsonify({"link": link, "text": text})  # Return the response as JSON
    
    
    Farhan Mohammed's avatar
    Farhan Mohammed committed
    # Returns text of web page
    def scrape_single_webpage(link):
        """Fetch *link* and return its <p> text.

        Returns:
            dict {"link": link, "text": text} on success, where text is the
            concatenation of every paragraph's text, each followed by "\n".
            On a request failure, returns the error as a plain string
            (callers distinguish the cases via isinstance(result, dict)).
        """
        # Browser-like User-Agent so sites that reject default Python clients respond.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

        try:
            # timeout prevents a slow/unresponsive site from blocking the caller forever
            resp = r.get(link, headers=headers, timeout=10)
            resp.raise_for_status()
        except requests.exceptions.RequestException as exception:
            return f"Error identified: {exception}"

        soupObject = BeautifulSoup(resp.content, "html.parser")
        # Each paragraph's text is followed by a newline, same as the original loop.
        text = "".join(paragraph.text + "\n" for paragraph in soupObject.find_all("p"))
        return {"link": link, "text": text}
        
    def scrape_multi_webpage(filename):
        """Scrape every URL listed (one per line) in *filename*.

        Blank lines are skipped. For each URL, scrape_single_webpage is
        called; successful results contribute their "text", failed ones
        contribute their error string.

        Returns:
            JSON string: {"URLs": [...], "Text": "..."} where Text is the
            concatenation of each page's text, each followed by "\n".
        """
        urls = []
        chunks = []  # collect pieces and join once instead of quadratic +=
        # Explicit encoding so the URL list decodes the same on every platform.
        with open(filename, "r", encoding="utf-8") as file:
            for link in file:
                link = link.strip()
                if not link:
                    continue  # skip blank lines
                urls.append(link)
                result = scrape_single_webpage(link)
                # A dict means success; anything else is the error string.
                if isinstance(result, dict):
                    chunks.append(result["text"])
                else:
                    chunks.append(result)
        all_text = "".join(chunk + "\n" for chunk in chunks)
        return json.dumps({"URLs": urls, "Text": all_text})
    
    
    Tarek Shah's avatar
    Tarek Shah committed
    def scrape_url(link):
        """Fetch *link* and return its <p> text as one string.

        Returns:
            str: every paragraph's text, each followed by "\n"; on a request
            failure, an "Error identified: ..." message string instead.
        """
        # Browser-like User-Agent so sites that reject default Python clients respond.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

        try:
            # timeout prevents a slow/unresponsive site from blocking the caller forever
            resp = r.get(link, headers=headers, timeout=10)
            resp.raise_for_status()
        except requests.exceptions.RequestException as exception:
            return f"Error identified: {exception}"

        soupObject = BeautifulSoup(resp.content, "html.parser")
        # Each paragraph's text is followed by a newline, same as the original loop.
        return "".join(paragraph.text + "\n" for paragraph in soupObject.find_all("p"))
    
    def scrape_html(htmlText):
        """Extract paragraph text from an HTML string.

        Parses *htmlText* and returns the text of every <p> element,
        each followed by a newline, concatenated into one string.
        """
        parsed = BeautifulSoup(htmlText, 'html.parser')
        return "".join(p.text + "\n" for p in parsed.find_all("p"))
    
    
    Farhan Mohammed's avatar
    Farhan Mohammed committed
    # webpage = "https://www.cbsnews.com/news/enrique-marquez-san-bernardino-shooter-friend-pleads-guilty-to-supplying-weapons/"
    # print(scrape_single_webpage(webpage)['text'])
    #print(scrape_multi_webpage('test.txt'))