Skip to content

Commit

Permalink
Add VrmImmo Crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
mrclrchtr committed Sep 9, 2023
1 parent 0570ff0 commit 292a70b
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 2 deletions.
2 changes: 1 addition & 1 deletion config.yaml.dist
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ loop:

# List the URLs containing your filter properties below.
# Currently supported services: www.immobilienscout24.de,
# www.immowelt.de, www.wg-gesucht.de, and www.kleinanzeigen.de.
# www.immowelt.de, www.wg-gesucht.de, www.kleinanzeigen.de, meinestadt.de, and vrm-immo.de.
# List the URLs in the following format:
# urls:
# - https://www.immobilienscout24.de/Suche/...
Expand Down
4 changes: 3 additions & 1 deletion flathunter/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from flathunter.crawler.immowelt import Immowelt
from flathunter.crawler.meinestadt import MeineStadt
from flathunter.crawler.wggesucht import WgGesucht
from flathunter.crawler.vrmimmo import VrmImmo
from flathunter.crawler.subito import Subito
from flathunter.filter import Filter
from flathunter.logging import logger
Expand Down Expand Up @@ -124,7 +125,8 @@ def init_searchers(self):
Subito(self),
Immobiliare(self),
Idealista(self),
MeineStadt(self)
MeineStadt(self),
VrmImmo(self)
]

def check_deprecated(self):
Expand Down
77 changes: 77 additions & 0 deletions flathunter/crawler/vrmimmo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Expose crawler for VrmImmo"""
import re
import hashlib

from bs4 import BeautifulSoup, Tag

from flathunter.logging import logger
from flathunter.abstract_crawler import Crawler


class VrmImmo(Crawler):
    """Implementation of Crawler interface for vrm-immo.de search-result pages."""

    BASE_URL = "https://vrm-immo.de"
    URL_PATTERN = re.compile(r'https://vrm-immo\.de')

    def __init__(self, config):
        super().__init__(config)
        self.config = config

    @staticmethod
    def _item_text(item, css_class):
        """Return the stripped text of the first <div> with *css_class* in *item*.

        Returns "" when the element is missing, so callers never have to
        special-case absent listing details.
        """
        tag = item.find("div", {"class": css_class})
        return tag.text.strip() if tag is not None else ""

    def extract_data(self, soup: BeautifulSoup):
        """Extracts all exposes from a provided Soup object.

        :param soup: parsed search-result page
        :return: list of detail dicts (id, image, url, title, rooms, price,
                 size, address, crawler); one entry per result item
        """
        entries = []

        items = soup.find_all("div", {"class": "item-wrap js-serp-item"})

        for item in items:
            link = item.find("a", {"class": "js-item-title-link ci-search-result__link"})
            if link is None:
                # Result card without a title anchor cannot be linked to an
                # expose page - skip instead of crashing on None.get below.
                continue
            url = link.get("href")
            title = link.get("title")
            # Lazy %-formatting: the string is only built if DEBUG is enabled.
            logger.debug("Analyze %s", url)

            price = self._item_text(item, "item__spec item-spec-price")
            size = self._item_text(item, "item__spec item-spec-area")
            # NOTE: rooms is now stripped like the other specs; previously it
            # kept surrounding whitespace while price/size/address did not.
            rooms = self._item_text(item, "item__spec item-spec-rooms")
            address = self._item_text(item, "item__locality")

            # item.find('img') returns None when a listing has no picture; the
            # old `item.find('img')['src']` raised an uncaught TypeError there
            # (None is not subscriptable), so guard explicitly.
            image_tag = item.find('img')
            image = image_tag.get('src', "") if image_tag is not None else ""

            # Derive a stable 16-digit numeric id from the element's id
            # attribute; fall back to "" so a missing attribute cannot crash.
            raw_id = item.get("id") or ""
            processed_id = int(
                hashlib.sha256(raw_id.encode('utf-8')).hexdigest(), 16
            ) % 10 ** 16

            details = {
                'id': processed_id,
                'image': image,
                'url': self.BASE_URL + url,
                'title': title,
                'rooms': rooms,
                'price': price,
                'size': size,
                'address': address,
                'crawler': self.get_name()
            }
            logger.debug(details)
            entries.append(details)
        logger.debug('Number of entries found: %d', len(entries))
        return entries
30 changes: 30 additions & 0 deletions test/test_crawl_vrmimmo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import os
import unittest
from functools import reduce

from flathunter.crawler.vrmimmo import VrmImmo
from test.utils.config import StringConfig

class VrmImmoCrawlerTest(unittest.TestCase):
    """Live integration test: fetch a vrm-immo.de result page and parse it."""

    TEST_URL = 'https://vrm-immo.de/suchergebnisse?l=Darmstadt&r=0km&_multiselect_r=0km&a=de.darmstadt&t=apartment%3Asale%3Aliving&pf=&pt=&rf=0&rt=0&sf=&st=&s=most_recently_updated_first'
    DUMMY_CONFIG = """
verbose: true
urls:
  - https://vrm-immo.de/suchergebnisse?l=Darmstadt&r=0km&_multiselect_r=0km&a=de.darmstadt&t=all%3Arental%3Aliving&pf=&pt=&rf=0&rt=0&sf=&st=
    """

    def setUp(self):
        # A minimal in-memory config is enough to construct the crawler.
        config = StringConfig(string=self.DUMMY_CONFIG)
        self.crawler = VrmImmo(config)

    def test(self):
        # Fetch and parse the live search page, then sanity-check one entry.
        soup = self.crawler.get_page(self.TEST_URL)
        self.assertIsNotNone(soup, "Should get a soup from the URL")
        entries = self.crawler.extract_data(soup)
        self.assertIsNotNone(entries, "Should parse entries from search URL")
        self.assertTrue(len(entries) > 0, "Should have at least one entry")
        first = entries[0]
        self.assertTrue(first['id'] > 0, "Id should be parsed")
        self.assertTrue(first['url'].startswith("https://vrm-immo.de"), u"URL should start with BASE_URL")
        self.assertTrue(first['url'].startswith("https://vrm-immo.de/immobilien"),
                        u"URL should be an immobilien link")
        # Every expose field must at least be present (possibly empty string).
        for attr in ('title', 'price', 'size', 'rooms', 'address', 'image'):
            self.assertIsNotNone(first[attr], attr + " should be set")

0 comments on commit 292a70b

Please sign in to comment.