Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use a browser to load the page #481

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,23 @@ COPY requirements.txt .
# Upgrade pip, then install the Python requirements; --no-cache-dir keeps
# pip's download cache out of the resulting image layer.
RUN python -m pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r requirements.txt

## Chrome dependencies
## xvfb supplies the virtual X display that the entrypoint script starts;
## curl/gnupg2/unzip are presumably needed by the browser setup script
## (TODO confirm against src/browser/setup_browser.sh).
RUN apt-get update -y && \
apt-get install -y curl gnupg2 unzip xvfb


# Copy entrypoint script
# The sed call strips trailing carriage returns (CRLF line endings) before
# the script is marked executable.
COPY ./entrypoint /entrypoint
RUN sed -i 's/\r$//g' /entrypoint && chmod +x /entrypoint

# Install chrome and chromedriver
# The setup script gets the same CR stripping and has every `sudo ` prefix
# removed before it is executed with bash — presumably because the build
# already runs as root (TODO confirm).
COPY ./src/browser/setup_browser.sh /setup_chrome_webdriver.sh
RUN sed -i 's/\r$//g' /setup_chrome_webdriver.sh && \
sed -i 's/sudo\s//g' /setup_chrome_webdriver.sh
RUN bash /setup_chrome_webdriver.sh

# Copy application code
COPY . ${APP_HOME}

# Set the default command to run the entrypoint script
# NOTE(review): two CMD instructions follow; Docker only honours the last
# CMD in a Dockerfile. They look like the removed and added sides of one
# diff line — keep only one.
CMD ["bash","/entrypoint"]
CMD [ "bash", "/entrypoint" ]
21 changes: 21 additions & 0 deletions entrypoint
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,25 @@ set -o pipefail
# Treat expansion of an unset variable as an error.
set -o nounset


## Ref: https://github.com/Ulyssedev/Rust-undetected-chromedriver/blob/7da3bea/xvfb.sh
## Export the X display that the Xvfb server below is started on,
## so that Chrome can run in headful (non-headless) mode without a
## physical screen.
export DISPLAY=:99
## Watchdog loop: (re)starts the Xvfb server on $DISPLAY whenever no Xvfb
## process is running. Never returns; meant to be run as a background job.
function keepUpScreen() {
    echo "Running keepUpScreen() [Xvfb Server]"
    while true; do
        sleep .25
        ## Use pidof's exit status rather than `[ -z $(pidof Xvfb) ]`: the
        ## unquoted substitution makes `[` fail with "too many arguments"
        ## as soon as more than one PID is reported.
        if ! pidof Xvfb > /dev/null 2>&1; then
            ## `-screen` takes a screen number followed by WxHxD; the display
            ## name itself is the first positional argument.
            Xvfb "$DISPLAY" -screen 0 1280x1024x24 -ac +extension GLX +render -noreset &
            ## PyVirtualDisplay is another alternative to manage
            ## running the server from the code itself.
        fi
    done
}

## Start the Xvfb watchdog loop as a background job so the script can
## carry on to launch the application.
keepUpScreen &

## Browser tests (disabled; uncomment to run them instead of/before the app)
# python src/browser/test.py
## Run the application in the foreground.
python main.py
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
beautifulsoup4==4.12.3
browserforge==1.1.1
environs==11.0.0
gdown @ git+https://github.com/nikhilbadyal/gdown
google-play-scraper==1.2.6
Expand All @@ -7,4 +8,5 @@ loguru==0.7.2
pre-commit==3.7.0
pytz==2024.1
requests==2.31.0
selenium-driverless==1.8.0.2
tqdm==4.66.2
6 changes: 6 additions & 0 deletions src/browser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Browser to source content from any site."""

from src.browser.browser import Browser # noqa: F401
from src.browser.cookies import Cookies # noqa: F401
from src.browser.exceptions import JSONExtractError, PageLoadError # noqa: F401
from src.browser.site import Site, Source, source # noqa: F401
22 changes: 22 additions & 0 deletions src/browser/apkmirror.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Browsing methods for APKMirror."""

from typing import Self

from src.browser.browser import Browser
from src.browser.site import Site, Source


class APKMirror(Site):
    """Placeholder `Site` implementation for APKMirror.

    Every method currently defers to the base `Site` behaviour; replace the
    bodies below to add APKMirror-specific handling.
    """

    def __init__(self: Self, browser: Browser) -> None:
        """Bind this site to `browser` through the base-class initialiser."""
        super().__init__(browser)

    async def get(self: Self, url: str, timeout: float) -> Source:
        """Load `url` via the base `Site.get`, forwarding `timeout` unchanged."""
        return await super().get(url, timeout)

    async def check_if_loaded(self: Self) -> bool:
        """Return whatever the base `Site.check_if_loaded` reports."""
        return await super().check_if_loaded()
179 changes: 179 additions & 0 deletions src/browser/browser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
"""Methods to load the page in the browser using webdriver."""

from typing import Self

from loguru import logger
from selenium_driverless import webdriver
from selenium_driverless.utils.utils import find_chrome_executable


class Browser:
    """Convenience class to load urls in a real browser as opposed to a plain HTTP request.

    Use the async factory `create()` to obtain a started instance; constructing
    the class directly only prepares the driver object without awaiting it.
    """

    def __init__(self: Self) -> None:
        """Initialise the browser by setting up the dependencies and creating the webdriver.

        Not meant to be invoked directly; use `create()` instead when outside
        of an async context manager.
        """
        if not find_chrome_executable():
            # Best effort: the helpers log their own failures and the boolean
            # result is not acted upon here.
            self.setup_dependencies()
        self.options = BrowserOptions()
        self.driver = webdriver.Chrome(options=self.options)

    @classmethod
    async def create(cls: type[Self]) -> Self:
        """Create the browser instance and await its webdriver.

        Exceptions are raised by the `webdriver.Chrome()` impls.
        """
        instance = cls()
        await instance.driver
        return instance

    async def quit(self: Self) -> None:
        """Clear up the browser instance (the driver is quit with `clean_dirs=True`)."""
        return await self.driver.quit(clean_dirs=True)

    async def get(self: Self, url: str, timeout: float = 60):  # noqa: ANN201
        """Load the url through the site implementation chosen by `map_url()`.

        Waits for the page to load until timeout is hit.
        Raises `PageLoadError` on load failure.
        """
        site = self.map_url(url)
        return await site.get(url, timeout)

    def map_url(self: Self, url: str):  # noqa: ANN201
        """Map the url to its site implementation based on pattern matching.

        Returns the default `Site` on no match.
        """
        # Imported here rather than at module level; these modules import
        # `Browser` from this module themselves.
        from src.browser.apkmirror import APKMirror
        from src.browser.site import Site

        site = Site(self)
        if "www.apkmirror.com" in url:
            site = APKMirror(self)

        return site

    def setup_dependencies(self: Self) -> bool:
        """Set up the browser dependencies for the current operating system.

        Only Linux is implemented for now; every other system logs an error.
        Returns True when the setup succeeded, False otherwise.
        """
        import platform

        system = platform.system().lower()
        if system == "linux":
            return self.setup_dependencies_on_linux()
        if system == "windows":
            return self.setup_dependencies_on_windows()
        if system == "darwin":
            return self.setup_dependencies_on_mac()
        return self.setup_dependencies_on_unknown(system)

    @staticmethod
    def setup_dependencies_on_linux() -> bool:
        """Set up the browser dependencies on linux by running `setup_browser.sh`.

        Returns True when the script exits successfully, False when it exits
        with a non-zero status or cannot be launched at all.
        """
        import subprocess
        from pathlib import Path

        setup_script = Path(__file__).parent.joinpath("setup_browser.sh").as_posix()
        try:
            subprocess.run(["bash", setup_script], check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers the case where `bash` itself cannot be executed;
            # honour the bool contract instead of letting it escape.
            logger.error(f"failed to setup browser dependencies: {e!r}")
            return False
        return True

    @staticmethod
    def _report_unsupported(msg: str) -> bool:
        """Log `msg` as a `NotImplementedError` setup failure and return False."""
        # The exception object is only built for its repr in the log line;
        # there is no need to raise it just to catch it again.
        logger.error(f"failed to setup browser dependencies: {NotImplementedError(msg)!r}")
        return False

    @staticmethod
    def setup_dependencies_on_windows() -> bool:
        """Not implemented yet.

        Would set up the browser dependencies on windows; currently only logs
        an error and returns False.
        """
        return Browser._report_unsupported(
            "setup not yet implemented for Windows, kindly setup chrome and chromedriver manually "
            "or write yourself a powershell script",
        )

    @staticmethod
    def setup_dependencies_on_mac() -> bool:
        """Not implemented yet.

        Would set up the browser dependencies on mac; currently only logs an
        error and returns False.
        """
        return Browser._report_unsupported(
            "setup not yet implemented for Mac OS, kindly setup chrome and chromedriver manually "
            "or write yourself a zsh script",
        )

    @staticmethod
    def setup_dependencies_on_unknown(system: str) -> bool:
        """Not implemented yet.

        Handles any system that is not linux, windows or mac; logs an error
        naming `system` and returns False.
        """
        return Browser._report_unsupported(
            f"unexpected system: {system}, kindly setup chrome and chromedriver manually",
        )


class BrowserOptions:
    """Factory-style class: constructing it yields a predefined chrome `ChromeOptions()`."""

    def __new__(cls: type[Self]) -> webdriver.ChromeOptions:
        """Build and return the preconfigured `ChromeOptions()` instance."""
        ## Ref1: https://github.com/Ulyssedev/Rust-undetected-chromedriver/blob/29222ff29fdf8bf018eb7ce668aa3ef4f9d84ab3/src/lib.rs#L107
        ## Ref2: https://stackoverflow.com/a/59678801

        cls.rand_ua()
        chrome_options = webdriver.ChromeOptions()
        # Headless mode ("--headless=new") is currently not enabled.
        for flag in (
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--log-level=3",
            "--disable-blink-features=AutomationControlled",
        ):
            chrome_options.add_argument(flag)
        # The user agent chosen by rand_ua() is stored on the class but is
        # currently not passed to Chrome ("--user-agent" is not set).
        return chrome_options

    @classmethod
    def rand_ua(cls: type[Self]) -> None:
        """Pick one of the known user agents at random and store it on the class."""
        import secrets

        # Shared leading part of every candidate (note the trailing space).
        prefix = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        candidates = [
            prefix + "Chrome/120.0.0.0 Safari/537.36 Config/91.2.3711.12",
            prefix + "Chrome/121.0.0.0 Safari/537.36",
            prefix + "Chrome/122.0.0.0 Safari/537.36",
        ]
        cls.user_agent = secrets.choice(candidates)
82 changes: 82 additions & 0 deletions src/browser/cookies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""Methods for managing browser tab cookies."""

import json
from collections.abc import Iterator
from http.cookiejar import CookieJar
from pathlib import Path
from typing import Any, Self

from requests.cookies import create_cookie


class Cookies:
"""Represent the stored cookies from the browser."""

def __init__(self: Self) -> None:
self.cookies_file: Path = Path.cwd().joinpath("browser_cookies.json")
self.cookies: list[dict[str, Any]] = []
self._load_cookies_from_file()

def update_cookie(self: Self, cookie_dict: dict[str, Any]) -> None:
"""Update the cookie list by adding another cookie."""
self.cookies = list(filter(lambda _cookie: not self._are_cookies_matching(cookie_dict, _cookie), self.cookies))
self.cookies.append(cookie_dict)
self.save_cookies()

def update_cookies(self: Self, cookie_list: list[dict[str, Any]]) -> None:
"""Update the cookie list by extending another cookie list."""
for cookie in cookie_list:
self.update_cookie(cookie)

def load_to_cookie_jar(self: Self) -> CookieJar:
"""Loads the stored cookies into the cookie jar."""
cookie_jar = CookieJar()
for _cookie in self.cookies:
cookie = create_cookie(
name=_cookie["name"],
value=_cookie["value"],
domain=_cookie["domain"],
path=_cookie["path"],
expires=_cookie["expires"],
secure=_cookie["secure"],
rest={"HttpOnly": _cookie["httpOnly"]},
)
cookie_jar.set_cookie(cookie)
return cookie_jar

def save_cookies(self: Self) -> None:
"""Save the cookies to the file."""
self._save_cookies_to_file()

def delete_cookies(self: Self) -> None:
"""Delete the saved cookies file."""
self.cookies_file.unlink(missing_ok=True)

@staticmethod
def _are_cookies_matching(cookie_new: dict[str, Any], cookie_old: dict[str, Any]) -> bool:
return (
cookie_new["name"] == cookie_old["name"]
and cookie_new["domain"] == cookie_old["domain"]
and cookie_new["path"] == cookie_old["path"]
)

def _load_cookies_from_file(self: Self) -> list[dict[str, Any]]:
if self.cookies_file.exists():
try:
self.cookies = json.loads(self.cookies_file.read_text())
except ValueError:
self.cookies = []
return self.cookies

def _save_cookies_to_file(self: Self) -> bool:
try:
with self.cookies_file.open("w") as f:
json.dump(self.cookies, f)
except Exception: # noqa: BLE001
return False
else:
return True

def __iter__(self: Self) -> Iterator[dict[str, Any]]:
"""Returns an iterator cookies."""
return self.cookies.__iter__()
9 changes: 9 additions & 0 deletions src/browser/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Browser impl exceptions."""


class PageLoadError(Exception):
    """Raised to signal that the page-load check did not succeed."""


class JSONExtractError(Exception):
    """Raised to signal that JSON extraction failed or that no such JSON exists."""
Loading