nikhilbadyal · IMXEren · Mar 30, 2024 · Mar 30, 2024 · Mar 31, 2024 · Apr 9, 2024
@@ -5,13 +5,23 @@ COPY requirements.txt .
 RUN python -m pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
 
+## Chrome dependencies
+## xvfb provides the virtual display server that the entrypoint script starts.
+## curl, gnupg2 and unzip are presumably required by the browser setup script
+## executed further below -- TODO confirm against setup_browser.sh.
+RUN apt-get update -y && \
+    apt-get install -y curl gnupg2 unzip xvfb
+
 
 # Copy entrypoint script
 COPY ./entrypoint /entrypoint
+# Strip trailing carriage returns (Windows line endings) and make the script executable
 RUN sed -i 's/\r$//g' /entrypoint && chmod +x /entrypoint
 
+# Install chrome and chromedriver
+COPY ./src/browser/setup_browser.sh /setup_chrome_webdriver.sh
+# Strip trailing carriage returns, then remove every "sudo" followed by a
+# whitespace character from the script.
+# NOTE(review): presumably because the image build already runs as root and
+# has no sudo installed -- confirm.
+RUN sed -i 's/\r$//g' /setup_chrome_webdriver.sh && \
+    sed -i 's/sudo\s//g' /setup_chrome_webdriver.sh
+RUN bash /setup_chrome_webdriver.sh
+
 # Copy application code
+# Done after the dependency and browser installation steps above.
 COPY . ${APP_HOME}
 
 # Set the default command to run the entrypoint script
-CMD ["bash","/entrypoint"]
+CMD [ "bash", "/entrypoint" ]
@@ -5,4 +5,25 @@ set -o pipefail
 set -o nounset
 
 
+## Ref: https://github.com/Ulyssedev/Rust-undetected-chromedriver/blob/7da3bea/xvfb.sh
+## Set up the Display for XVFB server,
+## to support running Chrome in headful mode
+export DISPLAY=:99
+
+## Watchdog loop: polls every 0.25s and (re)starts Xvfb on $DISPLAY whenever
+## no Xvfb process is running. It never returns, so it has to be started as
+## a background job.
+function keepUpScreen() {
+  echo "Running keepUpScreen() [Xvfb Server]"
+  while true; do
+    sleep .25
+    ## `pidof` exits non-zero when no process matches. Testing its exit
+    ## status replaces the unquoted `[ -z $(pidof Xvfb) ]`, which fails with
+    ## "too many arguments" as soon as more than one PID is printed.
+    if ! pidof Xvfb > /dev/null; then
+      ## `-screen` expects a screen number followed by WxHxD.
+      Xvfb "$DISPLAY" -screen 0 1280x1024x24 -ac +extension GLX +render -noreset &
+      ## PyVirtualDisplay is another alternative to manage
+      ## running the server from the code itself.
+    fi
+  done
+}
+
+## Starting the XVFB server in background
+keepUpScreen &
+
+## Browser tests
+# python src/browser/test.py
 python main.py
@@ -1,4 +1,5 @@
 beautifulsoup4==4.12.3
+browserforge==1.1.1
 environs==11.0.0
 gdown @ git+https://github.com/nikhilbadyal/gdown
 google-play-scraper==1.2.6
@@ -7,4 +8,5 @@ loguru==0.7.2
 pre-commit==3.7.0
 pytz==2024.1
 requests==2.31.0
+selenium-driverless==1.8.0.2
 tqdm==4.66.2
@@ -0,0 +1,6 @@
+"""Browser to source content from any site."""
+
+from src.browser.browser import Browser  # noqa: F401
+from src.browser.cookies import Cookies  # noqa: F401
+from src.browser.exceptions import JSONExtractError, PageLoadError  # noqa: F401
+from src.browser.site import Site, Source, source  # noqa: F401
@@ -0,0 +1,22 @@
+"""Browsing methods for APKMirror."""
+
+from typing import Self
+
+from src.browser.browser import Browser
+from src.browser.site import Site, Source
+
+
+class APKMirror(Site):
+    """Sample class to implement methods for APKMirror.
+
+    Every method currently only forwards to the `Site` base implementation.
+    Implementing these would do the work.
+    """
+
+    def __init__(self: Self, browser: Browser) -> None:
+        """Initialise the site with `browser` through the `Site` initialiser."""
+        super().__init__(browser)
+
+    async def get(self: Self, url: str, timeout: float) -> Source:
+        """Load `url` by delegating unchanged to `Site.get` with the same `timeout`."""
+        return await super().get(url, timeout)
+
+    async def check_if_loaded(self: Self) -> bool:
+        """Delegate the page-loaded check to `Site.check_if_loaded`."""
+        return await super().check_if_loaded()
@@ -0,0 +1,179 @@
+"""Methods to load the page in the browser using webdriver."""
+
+from typing import Self
+
+from loguru import logger
+from selenium_driverless import webdriver
+from selenium_driverless.utils.utils import find_chrome_executable
+
+
+class Browser:
+    """Convenient class to load urls in the browser opposed to HTTPRequest."""
+
+    def __init__(self: Self) -> None:
+        """Initialises the browser by setting up the dependencies and async the webdriver.
+
+        Not meant to be invoked directly instead `create()` when using outside of async context manager.
+        """
+        if not find_chrome_executable():
+            self.setup_dependencies()
+        self.options = BrowserOptions()
+        self.driver = webdriver.Chrome(options=self.options)
+
+    @classmethod
+    async def create(cls: type[Self]) -> Self:
+        """Creates the browser instance.
+
+        Exceptions are raised by the `webdriver.Chrome()` impls.
+        """
+        instance = cls()
+        await instance.driver
+        return instance
+
+    async def quit(self: Self) -> None:
+        """Cleares up the browser instance."""
+        return await self.driver.quit(clean_dirs=True)
+
+    async def get(self: Self, url: str, timeout: float = 60):  # noqa: ANN201
+        """Loads the url.
+
+        Waits for the page to load until timeout is hit.
+        Raises `PageLoadError` on load failure.
+        """
+        site = self.map_url(url)
+        return await site.get(url, timeout)
+
+    def map_url(self: Self, url: str):  # noqa: ANN201
+        """Maps the url, to their site implementation based on the pattern matching.
+
+        Returns default `Site` on no match.
+        """
+        from src.browser.apkmirror import APKMirror
+        from src.browser.site import Site
+
+        site = Site(self)
+        if "www.apkmirror.com" in url:
+            site = APKMirror(self)
+
+        return site
+
+    def setup_dependencies(self: Self) -> bool:
+        """Not implemented yet for all (linux only for now).
+
+        Setups the browser dependencies based on systems and returns the bool result.
+        """
+        import platform
+
+        system = platform.system().lower()
+        if system == "linux":
+            setup = self.setup_dependencies_on_linux()
+        elif system == "windows":
+            setup = self.setup_dependencies_on_windows()
+        elif system == "darwin":
+            setup = self.setup_dependencies_on_mac()
+        else:
+            setup = self.setup_dependencies_on_unknown(system)
+        return setup
+
+    @staticmethod
+    def setup_dependencies_on_linux() -> bool:
+        """Setups the browser dependencies on linux.
+
+        Returns the bool result.
+        """
+        import subprocess
+        from pathlib import Path
+
+        setup = False
+        setup_script = Path(__file__).parent.joinpath("setup_browser.sh").as_posix()
+        try:
+            subprocess.run(["bash", setup_script], check=True)
+            setup = True
+        except subprocess.CalledProcessError as e:
+            logger.error(f"failed to setup browser dependencies: {e!r}")
+        return setup
+
+    @staticmethod
+    def setup_dependencies_on_windows() -> bool:
+        """Not implemented yet.
+
+        Setups the browser dependencies on windows.
+
+        Returns the bool result.
+        """
+        try:
+            msg = (
+                "setup not yet implemented for Windows, kindly setup chrome and chromedriver manuallly "
+                "or write yourself a powershell script"
+            )
+            raise NotImplementedError(msg)
+        except NotImplementedError as e:
+            logger.error(f"failed to setup browser dependencies: {e!r}")
+        return False
+
+    @staticmethod
+    def setup_dependencies_on_mac() -> bool:
+        """Not implemented yet.
+
+        Setups the browser dependencies on mac.
+
+        Returns the bool result.
+        """
+        try:
+            msg = (
+                "setup not yet implemented for Mac OS, kindly setup chrome and chromedriver manually "
+                "or write yourself a zsh script"
+            )
+            raise NotImplementedError(msg)
+        except NotImplementedError as e:
+            logger.error(f"failed to setup browser dependencies: {e!r}")
+        return False
+
+    @staticmethod
+    def setup_dependencies_on_unknown(system: str) -> bool:
+        """Not implemented yet.
+
+        Setups the browser dependencies on unknown.
+
+        Returns the bool result.
+        """
+        try:
+            msg = f"unexpected system: {system}, kindly setup chrome and chromedriver manually"
+            raise NotImplementedError(msg)
+        except NotImplementedError as e:
+            logger.error(f"failed to setup browser dependencies: {e!r}")
+        return False
+
+
+class BrowserOptions:
+    """Simple class to form and return a predefined instance of chrome `ChromeOptions()`."""
+
+    def __new__(cls: type[Self]) -> webdriver.ChromeOptions:
+        """Return an instance of chrome `ChromeOptions()`."""
+        ## Ref1: https://github.com/Ulyssedev/Rust-undetected-chromedriver/blob/29222ff29fdf8bf018eb7ce668aa3ef4f9d84ab3/src/lib.rs#L107
+        ## Ref2: https://stackoverflow.com/a/59678801
+
+        cls.rand_ua()
+        options = webdriver.ChromeOptions()
+        """# options.add_argument("--headless=new")"""
+        options.add_argument("--no-sandbox")
+        options.add_argument("--disable-dev-shm-usage")
+        options.add_argument("--log-level=3")
+        options.add_argument("--disable-blink-features=AutomationControlled")
+        """# options.add_argument(f"--user-agent={cls.user_agent}")"""
+        return options
+
+    @classmethod
+    def rand_ua(cls: type[Self]) -> None:
+        """Set a random user agent."""
+        import secrets
+
+        user_agents = [
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/120.0.0.0 Safari/537.36 Config/91.2.3711.12",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/121.0.0.0 Safari/537.36",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/122.0.0.0 Safari/537.36",
+        ]
+        cls.user_agent = secrets.choice(user_agents)
@@ -0,0 +1,82 @@
+"""Methods for managing browser tab cookies."""
+
+import json
+from collections.abc import Iterator
+from http.cookiejar import CookieJar
+from pathlib import Path
+from typing import Any, Self
+
+from requests.cookies import create_cookie
+
+
+class Cookies:
+    """Represent the stored cookies from the browser."""
+
+    def __init__(self: Self) -> None:
+        self.cookies_file: Path = Path.cwd().joinpath("browser_cookies.json")
+        self.cookies: list[dict[str, Any]] = []
+        self._load_cookies_from_file()
+
+    def update_cookie(self: Self, cookie_dict: dict[str, Any]) -> None:
+        """Update the cookie list by adding another cookie."""
+        self.cookies = list(filter(lambda _cookie: not self._are_cookies_matching(cookie_dict, _cookie), self.cookies))
+        self.cookies.append(cookie_dict)
+        self.save_cookies()
+
+    def update_cookies(self: Self, cookie_list: list[dict[str, Any]]) -> None:
+        """Update the cookie list by extending another cookie list."""
+        for cookie in cookie_list:
+            self.update_cookie(cookie)
+
+    def load_to_cookie_jar(self: Self) -> CookieJar:
+        """Loads the stored cookies into the cookie jar."""
+        cookie_jar = CookieJar()
+        for _cookie in self.cookies:
+            cookie = create_cookie(
+                name=_cookie["name"],
+                value=_cookie["value"],
+                domain=_cookie["domain"],
+                path=_cookie["path"],
+                expires=_cookie["expires"],
+                secure=_cookie["secure"],
+                rest={"HttpOnly": _cookie["httpOnly"]},
+            )
+            cookie_jar.set_cookie(cookie)
+        return cookie_jar
+
+    def save_cookies(self: Self) -> None:
+        """Save the cookies to the file."""
+        self._save_cookies_to_file()
+
+    def delete_cookies(self: Self) -> None:
+        """Delete the saved cookies file."""
+        self.cookies_file.unlink(missing_ok=True)
+
+    @staticmethod
+    def _are_cookies_matching(cookie_new: dict[str, Any], cookie_old: dict[str, Any]) -> bool:
+        return (
+            cookie_new["name"] == cookie_old["name"]
+            and cookie_new["domain"] == cookie_old["domain"]
+            and cookie_new["path"] == cookie_old["path"]
+        )
+
+    def _load_cookies_from_file(self: Self) -> list[dict[str, Any]]:
+        if self.cookies_file.exists():
+            try:
+                self.cookies = json.loads(self.cookies_file.read_text())
+            except ValueError:
+                self.cookies = []
+        return self.cookies
+
+    def _save_cookies_to_file(self: Self) -> bool:
+        try:
+            with self.cookies_file.open("w") as f:
+                json.dump(self.cookies, f)
+        except Exception:  # noqa: BLE001
+            return False
+        else:
+            return True
+
+    def __iter__(self: Self) -> Iterator[dict[str, Any]]:
+        """Returns an iterator cookies."""
+        return self.cookies.__iter__()
@@ -0,0 +1,9 @@
+"""Browser impl exceptions."""
+
+
+class PageLoadError(Exception):
+    """Signal that the page load checker mechanism failed."""
+
+
+class JSONExtractError(Exception):
+    """Signal that the json extractor mechanism failed or that there is no such json."""