web_scraper.py
# What it does:
# - Sends an HTTP request to each page (page 1, 2, 3, ...).
# - Parses the HTML to extract:
#     - the quote text
#     - the author name
#     - the tags attached to the quote
# - Writes the data to a CSV file with three columns: Quotes, Author, Tags.
# - Stops when there are no more pages or no more quotes.
import csv

import requests
from bs4 import BeautifulSoup

base_url = "http://quotes.toscrape.com/page/{}/"

# Open the output file once and stream rows into it as pages are scraped.
with open("quotes.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Quotes", "Author", "Tags"])

    page = 1
    while True:
        print(f"Scraping page {page}")
        response = requests.get(base_url.format(page), timeout=10)
        if response.status_code != 200:
            print("No more pages")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        quotes = soup.find_all("div", class_="quote")
        # quotes.toscrape.com still returns 200 past the last page, just with
        # no quote blocks, so an empty result set is the usual stop condition.
        if not quotes:
            print("No more quotes")
            break

        for quote in quotes:
            text = quote.find("span", class_="text").get_text(strip=True)
            author = quote.find("small", class_="author").get_text(strip=True)
            tags = [tag.get_text(strip=True) for tag in quote.find_all("a", class_="tag")]
            # Join the tags with commas so they stay readable in one CSV cell.
            writer.writerow([text, author, ", ".join(tags)])

        page += 1
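
# Example run (a sketch; the console lines come from the prints above, while
# the CSV row below uses hypothetical placeholder values, not real site data):
#
#   $ python web_scraper.py
#   Scraping page 1
#   Scraping page 2
#   ...
#   No more quotes
#
# quotes.csv then starts with the header row written above, followed by one
# row per quote, e.g.:
#
#   Quotes,Author,Tags
#   "<quote text>",<author name>,"tag1, tag2, tag3"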