-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathFunctions_HomePageStart.py
More file actions
111 lines (83 loc) · 3.97 KB
/
Functions_HomePageStart.py
File metadata and controls
111 lines (83 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Scraper functions, which in turn are sourced in runfile.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import time
from random import randint
# Launch the Chrome driver and configure its implicit wait
def init_browser(filepath):
    """Start a Chrome instance from the driver at *filepath*.

    The browser is given a 10-second implicit wait so element lookups
    poll instead of failing immediately.
    """
    driver = webdriver.Chrome(executable_path=filepath)
    driver.implicitly_wait(10)
    return driver
# Point the browser at the funda.nl home page
def navigate_to_website(browser):
    """Load the funda.nl landing page in the supplied browser."""
    home_url = 'https://www.funda.nl'
    browser.get(home_url)
# Wait for the home-page search widgets, type the query and submit it
def enter_search_term(browser, search_term):
    """Type *search_term* into the funda.nl search bar and click search.

    Waits up to 10 s for the input field and the submit button, then
    clicks / clears / types with long random pauses between each action
    (crude rate limiting). Returns True on success, False when the
    widgets never appear.
    """
    waiter = WebDriverWait(browser, 10)
    try:
        field = waiter.until(EC.presence_of_element_located(
            (By.XPATH, "//input[@id='autocomplete-input']")))
        submit = waiter.until(EC.element_to_be_clickable(
            (By.XPATH, "//button[@class='button-primary-alternative']")))
        # Same action order as before: click, clear, type — each followed
        # by a 10-15 s random pause.
        for step in (field.click, field.clear,
                     lambda: field.send_keys(search_term)):
            step()
            time.sleep(randint(10, 15))
        submit.click()
        print("search-button has been clicked")
        time.sleep(randint(15, 20))
        return True
    except (TimeoutException, NoSuchElementException) as err:
        print(str(err))
        return False
# Scrape the resulting page and move on to the next page until hitting the predefined lastpage. All results are stored in a csv-file
def get_data(browser, lastpage, search_term):
    """Scrape funda.nl result pages 1..lastpage and dump them to a CSV.

    Parameters
    ----------
    browser : Selenium webdriver, already positioned on page 1 of the
        search results (i.e. after enter_search_term succeeded).
    lastpage : int, highest result page (inclusive) to scrape.
    search_term : str, used both in the pagination-link XPath and in the
        output filename.

    Side effects: closes the browser and writes
    "<search_term>uptopage<lastpage>.csv" (';'-separated, utf-8) to the
    working directory, then prints the resulting DataFrame.
    """
    data = []
    page = 1  # page currently displayed; page 1 is already open on entry
    while page <= lastpage:
        try:
            for item in browser.find_elements_by_css_selector("div.search-result-content"):
                listing = _parse_listing(item)
                if listing is not None:
                    data.append(listing)
            print("page extracted")
            # BUGFIX: the original started `page` at 2 and clicked the next
            # pagination link unconditionally after scraping, so the final
            # page (`lastpage`) was opened but never scraped — and with
            # lastpage=1 nothing was scraped at all. Stop paginating once
            # the last requested page has been extracted.
            if page == lastpage:
                break
            time.sleep(randint(5, 10))
            next_page = page + 1
            browser.find_element_by_xpath(
                "//a[contains(@href,'" + search_term + "') and contains(@data-pagination-page,'"
                + str(next_page) + "') and contains(@class, 'pagination-number')]").click()
            print("link to page " + str(next_page) + " has been clicked")
            page = next_page
            time.sleep(randint(5, 15))
        except (TimeoutException, NoSuchElementException):
            # No further pagination link (or page structure changed): stop.
            break
    browser.close()
    df = pd.DataFrame(data)
    df.to_csv(search_term + "uptopage" + str(lastpage) + ".csv", sep=';', encoding='utf-8')
    print(df)


def _parse_listing(item):
    """Extract one search-result card element into a dict, or None if the
    card's text does not split into the expected fields (ValueError)."""
    try:
        # Subtitle looks like "<zip-digits> <zip-letters> <city...>".
        zipcode1, zipcode2, city = item.find_element_by_css_selector(
            "small.search-result-subtitle").text.split(" ", 2)
        zipcode = zipcode1 + " " + zipcode2
        street_zipcode_city = item.find_element_by_css_selector("h3.search-result-title").text
        # NOTE: lstrip/rstrip strip *character sets* ('€', ' ', 'k', '.', ','),
        # not literal prefixes/suffixes — kept as-is since prices end in digits.
        price = item.find_element_by_css_selector("span.search-result-price").text.lstrip('€ ').rstrip(
            ' k.k,').replace('.', '')
        surface, rooms = item.find_element_by_css_selector("ul.search-result-kenmerken").text.replace('\n',
                                                                                                      '').replace(
            'm²', '').split(" ", 1)
        rooms = rooms.replace('kamer', '').replace('s', '')
        link = item.find_element_by_css_selector("div.search-result-header>a").get_attribute('href')
        return {
            "street_zipcode_city": street_zipcode_city,
            "zipcode": zipcode,
            "city": city,
            "price": price,
            "surface": surface,
            "rooms": rooms,
            "link": link,
        }
    except ValueError:
        # Card text did not match the expected layout — skip this listing.
        return None