# python-webScrappers/toscrapeBooks.py at main · sunil-dhaka/python-webScrappers · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs

# pagination ul class page > li class next
#article class product_pod

# Scrape every book listed in the books.toscrape.com catalogue into `books`.
baseURL = 'http://books.toscrape.com/'
basePageLink = baseURL + 'catalogue/'

# Seconds to wait on each request; without a timeout a stalled connection
# would hang the script indefinitely.
REQUEST_TIMEOUT = 10

# One dict per book with the keys: name, link, image, price, availability.
books = []

# A single session reuses the underlying connection across catalogue pages.
with requests.Session() as session:
    page = 1
    while True:
        print('Getting book data from page ...', page)
        currLink = basePageLink + f'page-{page}.html'
        r = session.get(currLink, timeout=REQUEST_TIMEOUT)
        # Fail loudly on an HTTP error instead of silently parsing an error page.
        r.raise_for_status()
        # Parse the raw bytes rather than r.text: requests falls back to
        # ISO-8859-1 for text responses whose Content-Type header carries no
        # charset, which garbles the pound sign in prices. Given bytes,
        # BeautifulSoup detects the encoding from the document itself.
        toscrapeSoup = bs(r.content, features='html.parser')
        for book in toscrapeSoup.find_all('article', class_='product_pod'):
            item = {
                'name': book.h3.a['title'],
                # hrefs/srcs are relative to the page they appear on, so they
                # are resolved against the page URL (this also handles '../').
                'link': urljoin(currLink, book.h3.a['href']),
                'image': urljoin(currLink, book.img['src']),
                'price': book.find('p', class_='price_color').text,
                'availability': book.find(
                    'p', class_='instock availability'
                ).text.strip(),
            }
            books.append(item)
        # Follow the pager instead of hard-coding the page count: the last
        # page is the one without an <li class="next"> element.
        if toscrapeSoup.find('li', class_='next') is None:
            break
        page += 1

print(len(books))