-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape_test.py
More file actions
37 lines (25 loc) · 1.01 KB
/
scrape_test.py
File metadata and controls
37 lines (25 loc) · 1.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import requests
from lxml import html
url = 'https://www.datawhatnow.com'
def get_parsed_page(url, timeout=10):
    """Fetch *url* and return its content as a parsed lxml HTML tree.

    Parameters:
        url: Address of the page to download.
        timeout: Seconds to wait for the server response. The original
            call had no timeout, so a stalled server would hang the
            script forever; a defaulted keyword keeps callers working.

    Returns:
        An lxml element tree that is easy to query via ``.xpath()``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (previously an error page would be silently parsed).
    """
    response = requests.get(url, timeout=timeout)
    # Fail fast on HTTP errors instead of parsing the error body as content.
    response.raise_for_status()
    return html.fromstring(response.content)
# Download and parse the landing page once, then reuse the tree for queries.
parsed_page = get_parsed_page(url)

# Print the website's title. `print(f"{expr}")` was a pointless f-string
# wrapper: f"{x}" is just str(x), which print() applies anyway.
print(parsed_page.xpath('//h1/a/text()'))  # ['Data, what now?']

# Print post names.
print(parsed_page.xpath('//h2/a/text()'))
# Output
# ['SimHash for question deduplication',
#  'Feature importance and why it’s important']

# Collect every post link (note the '//a' here, broader than '/a' above).
post_urls = parsed_page.xpath('//h2//a/@href')
for post_url in post_urls:
    print('Post url:', post_url)
    # Each post gets its own fetch + parse.
    parsed_post_page = get_parsed_page(post_url)
    paragraph_title_xpath = '//div[@class="entry-content"]/h3/text()'
    paragraph_titles = parsed_post_page.xpath(paragraph_title_xpath)
    # Generator expression instead of map(lambda ...): same ' \n '-prefixed
    # concatenation, same output, clearer intent.
    print(''.join(' \n ' + title for title in paragraph_titles) + '\n')