-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpyscraper.py
More file actions
95 lines (83 loc) · 3.13 KB
/
pyscraper.py
File metadata and controls
95 lines (83 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Program for scraping a given webpage for all of its images]
# Usage: python pyscraper.py <url> [folder]
# Libraries:
from html.parser import HTMLParser
import requests
import sys
import os
import datetime
# Classes:
class ImageParser(HTMLParser):
urlLoc = ""
# Constructor
def __init__(self, urlL):
self.urlLoc = urlL
HTMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
# Only want 'img' and 'a' tags, so that's what we'll check for
if tag != "img" and tag != "a":
return
# Make sure links point to images if we're scraping them
filetypes = ["jpeg","gif","jpg","png","bmp","svg"]
# Find the 'src' attribute, and download the image
for x in attrs:
if x[0] == "src" and tag == "img":
downloadImage(x[1])
elif x[0] == "href" and tag == "a" and x[1].split(".")[-1] in filetypes:
downloadImage(x[1])
def parsePage(self):
r = requests.get(self.urlLoc)
self.feed(r.text)
# Code starts here:
# Create a list of already downloaded files to avoid double-dipping
alreadyDownloaded = []
def downloadImage(urlLoc):
# Ensure that the url is correct
urlLoc = formatUrl(urlLoc)
# Grab the last segment of the url for the filename
# eg. "i.4cdn.org/g/1531602832712s.jpg" becomes "1531602832712s.jpg"
filename = urlLoc.split('/')[-1]
# Download the image and save to disk
path = foldername + "/" + filename
print("Downloading image " + urlLoc)
# Double check to see if the image has already been downloaded
if urlLoc in alreadyDownloaded:
print("Image already downloaded!")
return
# Download time
r = requests.get(urlLoc, stream=True)
# Make sure we're good to go
if r.status_code == 200:
# Open the file on disk and write the image stream to it
with open(path, 'wb') as f:
for chunk in r.iter_content(1024):
f.write(chunk)
# Add it to the list so we don't download the image again
alreadyDownloaded.append(urlLoc)
# Didn't work, abort the download
else:
print("Error downloading image!")
print("Expected status 200, got status " + str(r.status.code))
# Strip anything that might cause requests to panic and break
# Make sure it's in the right format so requests doesn't panic and break
def formatUrl(urlStr):
urlStr = urlStr.replace("https://","").replace("http://","")
if urlStr[0:2] == '//':
urlStr = urlStr[2:]
return "http://" + urlStr
# Double check to make sure the user is using the program correctly
if len(sys.argv) < 2:
print("Invalid arguments!")
print("Useage: python pyscraper.py <url> [folder]")
exit(1)
# Use the user provided folder name or just default to today's date
if len(sys.argv) == 3:
foldername = sys.argv[2]
else:
foldername = datetime.date.today().strftime("%B %d, %Y")
# Make sure the folder exists, if not, create it
if not os.path.exists(foldername):
os.makedirs(foldername)
# Create an ImageParser object and run the code
parser = ImageParser(sys.argv[1])
parser.parsePage()