# web_scraper.py
import logging
import time

import requests
from bs4 import BeautifulSoup
from parsl.app.app import python_app

from database import store_data_in_db
from parsl_config import load_parsl_config

# Logging configuration
logging.basicConfig(filename="scraping_errors.log", level=logging.ERROR)


def log_error(message):
    logging.error(message)


# Scraper function
def scraper(url, retries=3):
    for i in range(retries):
        try:
            # If the URL doesn't have a protocol, add it
            if not url.startswith("http://") and not url.startswith("https://"):
                url = "http://" + url
            response = requests.get(url, timeout=10)  # Timeout added for long requests
            response.raise_for_status()  # Check if the request was successful
            soup = BeautifulSoup(response.text, "html.parser")
            title = soup.title.string if soup.title else "No Title"
            return {"url": url, "title": title}
        except requests.RequestException as error:
            log_error(f"Attempt {i + 1} failed for {url}: {error}")
            if i < retries - 1:
                time.sleep(2**i)  # Exponential backoff before the next retry
        except Exception as error:
            log_error(f"An unexpected error occurred for {url}: {error}")
            return {"url": url, "error": str(error)}
    log_error(f"Failed after {retries} retries for {url}")
    return {"url": url, "error": "Failed after retries"}
@python_app
def parallel_parsl_scrape(url):
    return scraper(url)


def multiple_url_scraper(urls):
    futures = [parallel_parsl_scrape(url) for url in urls]
    results = [future.result() for future in futures]
    for result in results:
        print(result)
        store_data_in_db(result)
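

# store_data_in_db comes from the local `database` module, which is not shown
# on this page. It is assumed to persist one result dict per call; a minimal
# sqlite3 sketch of such a helper (names and schema are illustrative, not the
# actual module) might look like:
#
#   import sqlite3
#
#   def store_data_in_db(result):
#       with sqlite3.connect("results.db") as conn:
#           conn.execute(
#               "CREATE TABLE IF NOT EXISTS pages (url TEXT, title TEXT, error TEXT)"
#           )
#           conn.execute(
#               "INSERT INTO pages VALUES (?, ?, ?)",
#               (result.get("url"), result.get("title"), result.get("error")),
#           )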


if __name__ == "__main__":
    load_parsl_config()
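
    # load_parsl_config comes from the local `parsl_config` module (also not
    # shown). It is assumed to call parsl.load() with some Config; a minimal
    # local setup (illustrative, not the actual module) could be:
    #
    #   import parsl
    #   from parsl.config import Config
    #   from parsl.executors import HighThroughputExecutor
    #
    #   def load_parsl_config():
    #       parsl.load(Config(executors=[HighThroughputExecutor(max_workers=8)]))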

    # Website List
    urls = [
        "https://www.google.com",
        "https://www.example.com",
        "https://www.wikipedia.org",
        "https://www.github.com",
        "https://stackoverflow.com",
        "https://www.python.org",
        "https://www.bbc.com/news",
        "https://www.nytimes.com",
        "https://techcrunch.com",
        "https://www.cnn.com",
        "https://www.theverge.com",
        "https://www.nasa.gov",
        "https://developer.mozilla.org",
        "https://www.nationalgeographic.com",
        "https://www.theguardian.com/international",
        "https://www.reuters.com",
        "https://www.bloomberg.com",
        "https://www.youtube.com",
        "https://www.amazon.com",
        "https://www.ebay.com",
    ]

    multiple_url_scraper(urls)  # Calling the function with the list of URLs