Add VrmImmo Crawler

flathunters · Sep 9, 2023 · 292a70b · 292a70b
1 parent 0570ff0
commit 292a70b
Show file tree

Hide file tree

Showing 4 changed files with 111 additions and 2 deletions.
diff --git a/config.yaml.dist b/config.yaml.dist
@@ -17,7 +17,7 @@ loop:
 
 # List the URLs containing your filter properties below.
 # Currently supported services: www.immobilienscout24.de,
-# www.immowelt.de, www.wg-gesucht.de, and www.kleinanzeigen.de.
+# www.immowelt.de, www.wg-gesucht.de, www.kleinanzeigen.de, meinestadt.de, and vrm-immo.de.
 # List the URLs in the following format:
 # urls:
 # 	- https://www.immobilienscout24.de/Suche/...

diff --git a/flathunter/config.py b/flathunter/config.py
@@ -16,6 +16,7 @@
 from flathunter.crawler.immowelt import Immowelt
 from flathunter.crawler.meinestadt import MeineStadt
 from flathunter.crawler.wggesucht import WgGesucht
+from flathunter.crawler.vrmimmo import VrmImmo
 from flathunter.crawler.subito import Subito
 from flathunter.filter import Filter
 from flathunter.logging import logger
@@ -124,7 +125,8 @@ def init_searchers(self):
             Subito(self),
             Immobiliare(self),
             Idealista(self),
-            MeineStadt(self)
+            MeineStadt(self),
+            VrmImmo(self)
         ]
 
     def check_deprecated(self):

diff --git a/flathunter/crawler/vrmimmo.py b/flathunter/crawler/vrmimmo.py
@@ -0,0 +1,77 @@
+"""Expose crawler for VrmImmo"""
+import re
+import hashlib
+
+from bs4 import BeautifulSoup, Tag
+
+from flathunter.logging import logger
+from flathunter.abstract_crawler import Crawler
+
+
+class VrmImmo(Crawler):
+    """Implementation of Crawler interface for VrmImmo"""
+
+    BASE_URL = "https://vrm-immo.de"
+    URL_PATTERN = re.compile(r'https://vrm-immo\.de')
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+    # pylint: disable=too-many-locals
+    def extract_data(self, soup: BeautifulSoup):
+        """Extracts all exposes from a provided Soup object"""
+        entries = []
+
+        items = soup.find_all("div", {"class": "item-wrap js-serp-item"})
+
+        for item in items:
+            link = item.find("a", {"class": "js-item-title-link ci-search-result__link"})
+            url = link.get("href")
+            title = link.get("title")
+            logger.debug("Analyze " + url)
+
+            try:
+                price = item.find("div", {"class": "item__spec item-spec-price"}).text
+            except (IndexError, AttributeError):
+                price = ""
+
+            try:
+                size = item.find("div", {"class": "item__spec item-spec-area"}).text
+            except (IndexError, AttributeError):
+                size = ""
+
+            try:
+                rooms = item.find("div", {"class": "item__spec item-spec-rooms"}).text
+            except (IndexError, AttributeError):
+                rooms = ""
+
+            try:
+                image = item.find('img')['src']
+            except (IndexError, AttributeError):
+                image = ""
+
+            try:
+                address = item.find("div", {"class": "item__locality"}).text
+            except (IndexError, AttributeError):
+                address = ""
+
+            processed_id = int(
+                hashlib.sha256(item.get("id").encode('utf-8')).hexdigest(), 16
+            ) % 10 ** 16
+
+            details = {
+                'id': processed_id,
+                'image': image,
+                'url': self.BASE_URL + url,
+                'title': title,
+                'rooms': rooms,
+                'price': price.strip(),
+                'size': size.strip(),
+                'address': address.strip(),
+                'crawler': self.get_name()
+            }
+            logger.debug(details)
+            entries.append(details)
+        logger.debug('Number of entries found: %d', len(entries))
+        return entries
diff --git a/test/test_crawl_vrmimmo.py b/test/test_crawl_vrmimmo.py
@@ -0,0 +1,30 @@
+import os
+import unittest
+from functools import reduce
+
+from flathunter.crawler.vrmimmo import VrmImmo
+from test.utils.config import StringConfig
+
+class VrmImmoCrawlerTest(unittest.TestCase):
+    TEST_URL = 'https://vrm-immo.de/suchergebnisse?l=Darmstadt&r=0km&_multiselect_r=0km&a=de.darmstadt&t=apartment%3Asale%3Aliving&pf=&pt=&rf=0&rt=0&sf=&st=&s=most_recently_updated_first'
+    DUMMY_CONFIG = """
+    verbose: true
+    urls:
+      - https://vrm-immo.de/suchergebnisse?l=Darmstadt&r=0km&_multiselect_r=0km&a=de.darmstadt&t=all%3Arental%3Aliving&pf=&pt=&rf=0&rt=0&sf=&st=
+        """
+
+    def setUp(self):
+        self.crawler = VrmImmo(StringConfig(string=self.DUMMY_CONFIG))
+
+    def test(self):
+        soup = self.crawler.get_page(self.TEST_URL)
+        self.assertIsNotNone(soup, "Should get a soup from the URL")
+        entries = self.crawler.extract_data(soup)
+        self.assertIsNotNone(entries, "Should parse entries from search URL")
+        self.assertTrue(len(entries) > 0, "Should have at least one entry")
+        self.assertTrue(entries[0]['id'] > 0, "Id should be parsed")
+        self.assertTrue(entries[0]['url'].startswith("https://vrm-immo.de"), u"URL should start with BASE_URL")
+        self.assertTrue(entries[0]['url'].startswith("https://vrm-immo.de/immobilien"),
+                        u"URL should be an immobilien link")
+        for attr in ['title', 'price', 'size', 'rooms', 'address', 'image']:
+            self.assertIsNotNone(entries[0][attr], attr + " should be set")