bolig_ping.scraper
docs
module
bolig_ping.scraper
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110 | """Scraping homes available satisfying the given criteria."""
import json
import logging
import requests
from tqdm.auto import tqdm
from .data_models import Home, SearchQuery
logger = logging.getLogger(__package__)
def scrape_results(search_query: SearchQuery) -> list[Home] | None:
"""Scrape the results of a home search query.
Args:
search_query:
The search query to scrape results for.
Returns:
A list of homes that satisfy the search query, or None if no results were found.
Raises:
HTTPError:
If there was an error in the HTTP request.
"""
logger.info("Fetching results...")
# Get the results from the search query
url = search_query.get_url()
response = requests.get(url=url)
response.raise_for_status()
# Parse the response
result_dict = json.loads(response.text)
results = result_dict["cases"]
if results is None:
return None
# Get the number of pages
num_results = result_dict["totalHits"]
num_pages = num_results // len(results)
if num_results % len(results) != 0:
num_pages += 1
# Get the first page of results
homes = [get_home_from_result(result=result) for result in results]
# Scrape the remaining pages
if num_pages > 1:
with tqdm(desc="Scraping homes from boligsiden.dk", total=num_results) as pbar:
pbar.update(len(homes))
for page_idx in range(2, num_pages + 1):
url = search_query.get_url(page=page_idx)
response = requests.get(url=url)
response.raise_for_status()
result_dict = json.loads(response.text)
results = result_dict["cases"]
new_homes = [get_home_from_result(result=result) for result in results]
homes.extend(new_homes)
homes = list(set(homes))
pbar.update(len(new_homes))
# Ensure that the progress bar is at 100% at the end
pbar.n = pbar.total
return homes
def get_home_from_result(result: dict) -> Home:
"""Get a home from a result.
Args:
result:
The result to get the home from.
Returns:
The home from the result.
"""
url = f"https://boligsiden.dk/viderestilling/{result['caseID']}"
road_name = result["address"]["roadName"]
road_number = result["address"].get("houseNumber")
floor = result["address"].get("floor")
door = result["address"].get("door")
post_code = result["address"].get("zipCode")
city = result["address"]["cityName"]
address = road_name
if road_number:
address += f" {road_number}"
if floor:
floor = floor.replace("0", "st.")
address += f" {floor}"
if door:
address += f" {door}"
if post_code:
address += f" {post_code}"
if city:
address += f" {city}"
return Home(
url=url,
address=address,
price=result.get("priceCash"),
num_rooms=result.get("numberOfRooms"),
size=result.get("housingArea"),
monthly_fee=result.get("monthlyExpense"),
year=result.get("yearBuilt"),
)
|