forked from mordax7/flathunter
-
Notifications
You must be signed in to change notification settings - Fork 180
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
111 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
"""Expose crawler for VrmImmo""" | ||
import re | ||
import hashlib | ||
|
||
from bs4 import BeautifulSoup, Tag | ||
|
||
from flathunter.logging import logger | ||
from flathunter.abstract_crawler import Crawler | ||
|
||
|
||
class VrmImmo(Crawler):
    """Implementation of Crawler interface for VrmImmo.

    Turns a parsed vrm-immo.de search-result page into a list of expose
    dictionaries (see ``extract_data``).
    """

    BASE_URL = "https://vrm-immo.de"
    URL_PATTERN = re.compile(r'https://vrm-immo\.de')

    def __init__(self, config):
        """Initialise the base crawler and keep a reference to ``config``."""
        super().__init__(config)
        self.config = config

    @staticmethod
    def _div_text(item: Tag, css_class: str) -> str:
        """Return the stripped text of the first ``div`` whose class attribute
        equals ``css_class``, or an empty string when no such ``div`` exists."""
        node = item.find("div", {"class": css_class})
        if node is None:
            return ""
        return node.text.strip()

    @staticmethod
    def _image_src(item: Tag) -> str:
        """Return the ``src`` of the first ``img`` inside ``item``, or an empty
        string when there is no image or the image has no ``src`` attribute."""
        image_tag = item.find("img")
        if image_tag is None:
            return ""
        # Tag.get() avoids the KeyError that image_tag['src'] raises when the
        # attribute is absent (e.g. lazily loaded images).
        return image_tag.get("src", "")

    def extract_data(self, soup: BeautifulSoup):
        """Extracts all exposes from a provided Soup object.

        Every ``div`` with class ``item-wrap js-serp-item`` is treated as one
        listing. Listings without a title link or without an ``href`` are
        skipped, because no expose URL can be built for them.

        Returns a list of dicts with the keys ``id``, ``image``, ``url``,
        ``title``, ``rooms``, ``price``, ``size``, ``address`` and ``crawler``.
        Text fields that cannot be found are set to an empty string.
        """
        entries = []

        for item in soup.find_all("div", {"class": "item-wrap js-serp-item"}):
            link = item.find("a", {"class": "js-item-title-link ci-search-result__link"})
            url = link.get("href") if link is not None else None
            if not url:
                logger.debug("Skipping search result without title link")
                continue
            logger.debug("Analyze %s", url)

            # Fall back to the expose URL when the item carries no id
            # attribute, so the entry still gets a stable identifier.
            raw_id = item.get("id") or url
            # Reduce the SHA-256 digest to at most 16 decimal digits.
            processed_id = int(
                hashlib.sha256(raw_id.encode('utf-8')).hexdigest(), 16
            ) % 10 ** 16

            details = {
                'id': processed_id,
                'image': self._image_src(item),
                'url': self.BASE_URL + url,
                'title': link.get("title"),
                'rooms': self._div_text(item, "item__spec item-spec-rooms"),
                'price': self._div_text(item, "item__spec item-spec-price"),
                'size': self._div_text(item, "item__spec item-spec-area"),
                'address': self._div_text(item, "item__locality"),
                'crawler': self.get_name()
            }
            logger.debug(details)
            entries.append(details)

        logger.debug('Number of entries found: %d', len(entries))
        return entries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import os | ||
import unittest | ||
from functools import reduce | ||
|
||
from flathunter.crawler.vrmimmo import VrmImmo | ||
from test.utils.config import StringConfig | ||
|
||
class VrmImmoCrawlerTest(unittest.TestCase):
    """Smoke test for the VrmImmo crawler.

    NOTE(review): ``get_page`` presumably fetches TEST_URL over the network,
    so this test depends on the live site - confirm before running offline.
    """

    TEST_URL = 'https://vrm-immo.de/suchergebnisse?l=Darmstadt&r=0km&_multiselect_r=0km&a=de.darmstadt&t=apartment%3Asale%3Aliving&pf=&pt=&rf=0&rt=0&sf=&st=&s=most_recently_updated_first'
    DUMMY_CONFIG = """
verbose: true
urls:
- https://vrm-immo.de/suchergebnisse?l=Darmstadt&r=0km&_multiselect_r=0km&a=de.darmstadt&t=all%3Arental%3Aliving&pf=&pt=&rf=0&rt=0&sf=&st=
"""

    def setUp(self):
        """Build a crawler from the in-memory dummy configuration."""
        config = StringConfig(string=self.DUMMY_CONFIG)
        self.crawler = VrmImmo(config)

    def test(self):
        """Load the search page and check the first extracted expose."""
        page = self.crawler.get_page(self.TEST_URL)
        self.assertIsNotNone(page, "Should get a soup from the URL")

        exposes = self.crawler.extract_data(page)
        self.assertIsNotNone(exposes, "Should parse entries from search URL")
        self.assertTrue(len(exposes) > 0, "Should have at least one entry")

        first = exposes[0]
        self.assertTrue(first['id'] > 0, "Id should be parsed")

        expose_url = first['url']
        self.assertTrue(
            expose_url.startswith("https://vrm-immo.de"),
            "URL should start with BASE_URL",
        )
        self.assertTrue(
            expose_url.startswith("https://vrm-immo.de/immobilien"),
            "URL should be an immobilien link",
        )

        # Every descriptive field must be present (an empty string is accepted).
        for field in ('title', 'price', 'size', 'rooms', 'address', 'image'):
            self.assertIsNotNone(first[field], f"{field} should be set")