-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathamazonScraper.py
More file actions
70 lines (55 loc) · 1.94 KB
/
amazonScraper.py
File metadata and controls
70 lines (55 loc) · 1.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import pandas as pd
# Shared requests-html session, reused for every request (keeps cookies and
# connection pooling across pages).
s= HTMLSession()
# Starting point: Amazon search results for "monitor". The crawl loop below
# reassigns this to each "next page" link until pagination ends.
url = "https://www.amazon.com/s?k=monitor"
def page(url, num):
    """Fetch one search-results page and return it parsed.

    Uses the module-level HTMLSession ``s``. Sends a browser-like
    User-Agent plus a (non-standard) ``Path`` header carrying the page
    number. Prints the HTTP status code and the request headers for
    debugging, then returns a BeautifulSoup tree of the response body.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
        'Path': f'/s?k=monitor&page={str(num)}&qid=1684865096&ref=sr_pg_{str(num)}',
    }
    response = s.get(url, headers=request_headers)
    # Debug output: confirm we were not blocked and see what was sent.
    print(response.status_code)
    print(response.request.headers)
    return BeautifulSoup(response.text, 'lxml')
def parse(soup):
    """Extract product title/price pairs from a parsed results page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed search-results page (as returned by ``page``).

    Returns
    -------
    list[dict]
        One ``{'title': ..., 'price': ...}`` dict per search result.
        Results whose title or price node is missing yield the literal
        string ``'null'`` for both fields (keeps row count aligned with
        the number of result cards).
    """
    results = soup.find_all('div', {'data-component-type': 's-search-result'})
    products = []
    for x in results:
        try:
            product = {
                'title': x.find('span', {'class': 'a-size-medium a-color-base a-text-normal'}).text,
                'price': x.find('div', {'class': 'a-section a-spacing-none a-spacing-top-micro puis-price-instructions-style'}).find('span', {'class': 'a-offscreen'}).text,
            }
            products.append(product)
        # A missing node makes .find(...) return None, so the chained
        # attribute access raises AttributeError. Catch only that —
        # the original bare `except:` also swallowed KeyboardInterrupt
        # and SystemExit, hiding real bugs.
        except AttributeError:
            products.append({
                'title': 'null',
                'price': 'null',
            })
    print(len(products))
    return products
def nextPage():
    """Return the absolute URL of the next results page, or None.

    Reads the module-level ``soup`` (the most recently fetched page).
    Returns None when pagination has ended — i.e. the "next" button is
    disabled — or when the pagination strip / next-link anchor cannot
    be found at all (the original crashed with AttributeError in those
    cases, and its local variable shadowed the ``page()`` function).
    """
    strip = soup.find('span', {'class': 's-pagination-strip'})
    if strip is None:
        # No pagination widget on the page at all — nothing to follow.
        return None
    if strip.find('span', {'class': 's-pagination-item s-pagination-next s-pagination-disabled '}):
        # "Next" button is greyed out: last page reached.
        return None
    link = strip.find('a', {'class': 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})
    if link is None:
        return None
    return 'https://www.amazon.com' + str(link.get('href'))
# --- Script driver: crawl result pages until pagination ends, then dump CSV. ---
df = pd.DataFrame()
num = 0  # 1-based page counter (used for the Path header and logging)
while True:
    # Original used `sum = 1; num = num + sum`, shadowing the builtin `sum`.
    num += 1
    print(num)
    soup = page(url, num)          # also read by nextPage() via the global
    products = parse(soup)
    # pd.concat is the public API; DataFrame._append is private and was
    # removed in pandas 2.x.
    df = pd.concat([df, pd.DataFrame(products)], ignore_index=True)
    url = nextPage()
    if url is None:                # identity check, not `== None`
        break
    print(url)
df.to_csv('products.csv')