-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathget_link_for_crawler.py
More file actions
86 lines (77 loc) · 2.23 KB
/
get_link_for_crawler.py
File metadata and controls
86 lines (77 loc) · 2.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import logging
import logging.config
import time
from urllib.parse import urlencode, urlsplit

from requests_html import HTMLSession
# dictConfig-style logging configuration: colored console output at DEBUG
# level plus a nightly-rotated file at INFO level under logs/pycrawler.log.
# NOTE: the 'console' handler class 'ColorStreamHandler.ColorStreamHandler'
# is a project-local module, not part of the standard library.
LOGGING2 = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'console': {
            'format': '[%(asctime)s] %(levelname)s::%(module)s - %(message)s',
        },
        'file': {
            # File lines also record process and thread ids for debugging.
            'format': '[%(asctime)s] %(levelname)s::(P:%(process)d T:%(thread)d)::%(module)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'ColorStreamHandler.ColorStreamHandler',
            'formatter': 'console',
            'level': 'DEBUG',
            'use_colors': True,
        },
        'file': {
            'class': 'logging.handlers.TimedRotatingFileHandler',
            'formatter': 'file',
            'level': 'INFO',
            'when': 'midnight',  # rotate once per day at midnight
            'filename': 'logs/pycrawler.log',
            'interval': 1,
            'backupCount': 0,    # keep all rotated files
            'encoding': None,
            'delay': False,
            'utc': False,
        },
    },
    'loggers': {
        'voodoo_logger': {
            'handlers': ['console', 'file'],
            # Was "'DEBUG' if True else 'INFO'" — a dead debug toggle that
            # always evaluated to 'DEBUG'; use the literal value instead.
            'level': 'DEBUG',
            'propagate': True,
        },
    }
}
# Output file: first line is "query||max_res", then one domain per line.
ADDONS_INFO_FILENAME = 'link_for_crawler.txt'
# BUG FIX: the original called logging.makeLogRecord(LOGGING2), which only
# builds a throwaway LogRecord and never installs the handlers/formatters
# declared in LOGGING2 — logging.config.dictConfig() is the API that
# actually applies a dict-based configuration.  Fall back to basicConfig
# if the config cannot be applied (e.g. the project-local
# ColorStreamHandler module or the logs/ directory is missing) so the
# script still runs with usable logging.
try:
    logging.config.dictConfig(LOGGING2)
except (ValueError, TypeError, AttributeError, ImportError, OSError):
    logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('voodoo_logger')
def link_getter(query, max_res):
    """Crawl Google result pages for *query* and collect unique domains.

    Fetches result pages in steps of 10 up to ``max_res`` results,
    extracts the network location (domain) of every external link,
    prints each newly seen domain, and finally writes the query followed
    by the collected domains to ``ADDONS_INFO_FILENAME``.

    Parameters:
        query:   search phrase (any string; escaped for the URL here).
        max_res: maximum number of results to page through; anything
                 accepted by int() (the interactive caller passes a str).

    Raises:
        ValueError: if ``max_res`` cannot be converted to int.
    """
    seen = set()      # O(1) membership test (was: set(link_a) rebuilt per link)
    domains = []      # preserves discovery order for the progress output
    # Use the stdlib to escape the query (spaces, parens, quotes, ...)
    # instead of a hand-rolled chain of str.replace calls; urlencode was
    # already imported but never used.
    query_string = urlencode({'q': query})
    max_int = int(max_res)
    # One session reused for every page; the original leaked a fresh,
    # never-closed HTMLSession per iteration.
    session = HTMLSession()
    try:
        for start in range(0, max_int, 10):
            url = f"https://www.google.com/search?{query_string}&start={start}"
            print("=" * 40)
            print('Search from page: %s' % url)
            print("Processing...")
            r = session.get(url)
            for link in r.html.absolute_links:
                # Skip Google's own links and common blog/video hosts.
                if 'google' in link or 'blogger' in link or 'youtube' in link:
                    continue
                domain = urlsplit(link).netloc
                if domain not in seen:
                    seen.add(domain)
                    domains.append(domain)
                    print(f"New domain: {domain} !!!")
            print("I'm search next >>> ")
            time.sleep(0.035)  # small delay between pages to be polite
    finally:
        session.close()
    # Persist the query and every unique domain, one per line.
    with open(ADDONS_INFO_FILENAME, 'w') as f:
        f.write(str(query) + '||' + str(max_res) + "\n")
        for domain in seen:
            print(domain)
            f.write(domain + "\n")
if __name__ == "__main__":
    # Interactive entry point: ask for the search phrase and the result
    # cap, then hand both (as raw strings) to the crawler.
    user_query = input('Каков будет Ваш запрос... ? ')
    result_limit = input('А сколько максимум будет выдано результатов ? ')
    link_getter(user_query, result_limit)