Skip to content

Commit

Permalink
Addressed review feedback in #630
Browse files Browse the repository at this point in the history
  • Loading branch information
codders committed Sep 17, 2024
1 parent 463f3e6 commit e53e423
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 24 deletions.
22 changes: 10 additions & 12 deletions flathunter/abstract_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,41 +204,39 @@ def resolve_awsawf(self, driver):

# Intercept background network traffic via log sniffing
sleep(2)
logs_raw = driver.get_log("performance")
logs = [json.loads(lr["message"])["message"] for lr in logs_raw]

logs = [json.loads(lr["message"])["message"] for lr in driver.get_log("performance")]

def log_filter(log_):
"""Return True when a decoded performance-log entry is a JSON network response."""
return (
# Keep only "response received" events; because of the short-circuiting
# `and`, the params lookup below is evaluated only for these entries.
log_["method"] == "Network.responseReceived"
# ...whose reported MIME type contains "json"
and "json" in log_["params"]["response"]["mimeType"]
)

for log in filter(log_filter, logs):
request_id = log["params"]["requestId"]
resp_url = log["params"]["response"]["url"]
if "problem" in resp_url and "awswaf" in resp_url:
response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
response = driver.execute_cdp_cmd(
"Network.getResponseBody", {"requestId": request_id}
)
response_json = json.loads(response["body"])
iv = response_json["state"]["iv"]
context = response_json["state"]["payload"]
sitekey = response_json["key"]


sitekey = re.findall(
r"apiKey: \"(.*?)\"", driver.page_source)[0]

patternChallenge = r'src="([^"]*challenge\.js)"'
challenge_matches = re.findall(patternChallenge, driver.page_source)
challenge_matches = re.findall(r'src="([^"]*challenge\.js)"', driver.page_source)
for match in challenge_matches:
print(f'Challenge SRC Value: {match}')
logger.debug('Challenge SRC Value: %s', match)
challenge = match

patternJsApi = r'src="([^"]*jsapi\.js)"'
jsapi_matches = re.findall(patternJsApi, driver.page_source)
jsapi_matches = re.findall(r'src="([^"]*jsapi\.js)"', driver.page_source)
for match in jsapi_matches:
print(f'JsApi SRC Value: {match}')
logger.debug('JsApi SRC Value: %s', match)
jsapi = match

try:
Expand Down
24 changes: 19 additions & 5 deletions flathunter/captcha/capmonster_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,24 @@

class CapmonsterSolver(CaptchaSolver):
"""Implementation of Captcha solver for CapMonster"""


def solve_awswaf(self, sitekey: str, iv: str, context: str, challenge_script: str, captcha_script: str, page_url: str) -> AwsAwfResponse:

def solve_geetest(self, geetest: str, challenge: str, page_url: str) -> GeetestResponse:
"""Geetest captchas are not handled by this solver; always raises NotImplementedError"""
raise NotImplementedError("Geetest captcha solving is not implemented for CapMonster")

def solve_recaptcha(self, google_site_key: str, page_url: str) -> RecaptchaResponse:
    """Reject Recaptcha requests, which this solver does not handle.

    Raises:
        NotImplementedError: always. The message spells the service name
            "CapMonster", matching the spelling used in this class's docstring.
    """
    raise NotImplementedError("Recaptcha captcha solving is not implemented for CapMonster")

def solve_awswaf(
self,
sitekey: str,
iv: str,
context: str,
challenge_script: str,
captcha_script: str,
page_url: str
) -> AwsAwfResponse:
"""Solves AWS WAF Captcha"""
logger.info("Trying to solve AWS WAF.")
params = {
Expand Down Expand Up @@ -49,7 +64,6 @@ def __submit_capmonster_request(self, params: Dict[str, str]) -> str:

return response_json["taskId"]


@backoff.on_exception(**CaptchaSolver.backoff_options)
def __retrieve_capmonster_result(self, captcha_id: str):
retrieve_url = "https://api.capmonster.cloud/getTaskResult"
Expand All @@ -70,4 +84,4 @@ def __retrieve_capmonster_result(self, captcha_id: str):
sleep(5)
continue
if response_json["status"] == "ready":
return response_json["solution"]["cookies"]["aws-waf-token"]
return response_json["solution"]["cookies"]["aws-waf-token"]
12 changes: 10 additions & 2 deletions flathunter/captcha/captcha_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,16 @@ def __init__(self, api_key):
def solve_geetest(self, geetest: str, challenge: str, page_url: str) -> GeetestResponse:
"""Should be implemented in subclass"""
# Placeholder: raises NotImplementedError (with no message) unless overridden.
raise NotImplementedError()

def solve_awswaf(self, sitekey: str, iv: str, context: str, page_url: str) -> AwsAwfResponse:

def solve_awswaf(
self,
sitekey: str,
iv: str,
context: str,
challenge_script: str,
captcha_script: str,
page_url: str
) -> AwsAwfResponse:
"""Should be implemented in subclass

This placeholder raises NotImplementedError (with no message) unless overridden.

NOTE(review): the meaning of each argument is not visible in this method;
presumably they are the values a crawler extracts from an AWS WAF challenge
page - confirm against the caller.
"""
raise NotImplementedError()

Expand Down
12 changes: 12 additions & 0 deletions flathunter/captcha/imagetyperz_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
CaptchaSolver,
CaptchaUnsolvableError,
GeetestResponse,
AwsAwfResponse,
RecaptchaResponse,
)

Expand Down Expand Up @@ -58,6 +59,17 @@ def solve_recaptcha(self, google_site_key: str, page_url: str) -> RecaptchaRespo
)
return RecaptchaResponse(self.__retrieve_imagetyperz_result(captcha_id))

def solve_awswaf(
self,
sitekey: str,
iv: str,
context: str,
challenge_script: str,
captcha_script: str,
page_url: str
) -> AwsAwfResponse:
"""AWS WAF captchas are not supported by this solver yet; always raises NotImplementedError"""
raise NotImplementedError("AWS WAF captchas not supported for Imagetyperz")

@backoff.on_exception(**CaptchaSolver.backoff_options)
def __submit_imagetyperz_request(self, submit_url: str, params: Dict[str, str]) -> str:
Expand Down
13 changes: 12 additions & 1 deletion flathunter/captcha/twocaptcha_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,17 @@ def solve_recaptcha(self, google_site_key: str, page_url: str) -> RecaptchaRespo
captcha_id = self.__submit_2captcha_request(params)
return RecaptchaResponse(self.__retrieve_2captcha_result(captcha_id))

def solve_awswaf(
self,
sitekey: str,
iv: str,
context: str,
challenge_script: str,
captcha_script: str,
page_url: str
) -> AwsAwfResponse:
"""AWS WAF captchas are not supported by this solver yet; always raises NotImplementedError"""
raise NotImplementedError("AWS WAF captchas not supported for 2Captcha")

@backoff.on_exception(**CaptchaSolver.backoff_options)
def __submit_2captcha_request(self, params: Dict[str, str]) -> str:
Expand Down Expand Up @@ -89,4 +100,4 @@ def __retrieve_2captcha_result(self, captcha_id: str):
if not retrieve_response.text.startswith("OK"):
raise requests.HTTPError(response=retrieve_response)

return retrieve_response.text.split("|", 1)[1]
return retrieve_response.text.split("|", 1)[1]
2 changes: 2 additions & 0 deletions flathunter/chrome_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def get_chrome_driver(driver_arguments):
"""Configure Chrome WebDriver"""
logger.info('Initializing Chrome WebDriver for crawler...')
chrome_options = uc.ChromeOptions() # pylint: disable=no-member
if platform == "darwin":
chrome_options.add_argument("--headless")
if driver_arguments is not None:
for driver_argument in driver_arguments:
chrome_options.add_argument(driver_argument)
Expand Down
6 changes: 3 additions & 3 deletions flathunter/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def _get_imagetyperz_token(self):
def get_twocaptcha_key(self) -> str:
"""API Token for 2captcha, read from the `captcha.2captcha.api_key` config path"""
# The second argument is presumably the fallback used when the path is
# absent - confirm against `_read_yaml_path`.
return self._read_yaml_path("captcha.2captcha.api_key", "")

def get_capmonster_key(self) -> str:
"""API Token for Capmonster, read from the `captcha.capmonster.api_key` config path"""
# The second argument is presumably the fallback used when the path is
# absent - confirm against `_read_yaml_path`.
return self._read_yaml_path("captcha.capmonster.api_key", "")
Expand All @@ -316,7 +316,7 @@ def _get_captcha_solver(self) -> Optional[CaptchaSolver]:
twocaptcha_api_key = self.get_twocaptcha_key()
if twocaptcha_api_key:
return TwoCaptchaSolver(twocaptcha_api_key)

capmonster_api_key = self.get_capmonster_key()
if capmonster_api_key:
return CapmonsterSolver(capmonster_api_key)
Expand Down Expand Up @@ -409,7 +409,7 @@ def _get_imagetyperz_token(self):
def get_twocaptcha_key(self) -> str:
"""Return the currently configured 2captcha API key

Uses the value from `Env.FLATHUNTER_2CAPTCHA_KEY()` when it is truthy and
otherwise falls back to the parent class's `get_twocaptcha_key()`.
"""
return Env.FLATHUNTER_2CAPTCHA_KEY() or super().get_twocaptcha_key() # pylint: disable=no-member

def get_capmonster_key(self) -> str:
"""Return the currently configured Capmonster API key

Uses the value from `Env.FLATHUNTER_CAPMONSTER_KEY()` when it is truthy and
otherwise falls back to the parent class's `get_capmonster_key()`.
"""
return Env.FLATHUNTER_CAPMONSTER_KEY() or super().get_capmonster_key()
Expand Down
2 changes: 1 addition & 1 deletion flathunter/crawler/immobilienscout.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def get_entries_from_javascript(self):
logger.error(
"IS24 bot detection has identified our script as a bot - we've been blocked"
)
logger.info(self.get_driver_force().page_source)
logger.debug(self.get_driver_force().page_source)
return []
return self.get_entries_from_json(result_json)

Expand Down

0 comments on commit e53e423

Please sign in to comment.