From 734e7a2948262914df0ee86a71541d2d0041d3bb Mon Sep 17 00:00:00 2001 From: Nicolas Sawyer Date: Sat, 14 Sep 2024 13:25:10 -0500 Subject: [PATCH 01/10] adding logging, typing, error handling to scraper --- src/scraper/__init__.py | 102 ++++++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 35 deletions(-) diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py index fb1e11a..ef8d5b3 100644 --- a/src/scraper/__init__.py +++ b/src/scraper/__init__.py @@ -1,9 +1,7 @@ import logging import os -import re import csv import urllib.parse -import json import sys from datetime import datetime, timedelta from time import time @@ -11,50 +9,79 @@ from bs4 import BeautifulSoup from .helpers import * import importlib +from typing import Optional, Tuple class Scraper: def __init__(self): pass - def set_defaults(self, ms_wait, start_date, end_date, court_calendar_link_text, case_number): - if not ms_wait: - ms_wait = 200 - if not start_date: - start_date = '2024-07-01' - if not end_date: - end_date = '2024-07-01' - if not court_calendar_link_text: - court_calendar_link_text = "Court Calendar" - if not case_number: - case_number = None + def set_defaults(self, + ms_wait: Optional[int], + start_date: Optional[str], + end_date: Optional[str], + court_calendar_link_text: Optional[str], + case_number: Optional[str], + logger: logging.Logger + ) -> Tuple[Optional[int], + Optional[str], + Optional[str], + Optional[str], + Optional[str], + logging.Logger]: + try: # set the defaults + if not ms_wait: + ms_wait = 200 + if not start_date: + start_date = '2024-07-01' + if not end_date: + end_date = '2024-07-01' + if not court_calendar_link_text: + court_calendar_link_text = "Court Calendar" + if not case_number: + case_number = None + except Exception as e: + raise ValueError(f"Error setting the default values for the code : {e}") return ms_wait, start_date, end_date, court_calendar_link_text, case_number - def configure_logger(self): + def configure_logger(self) -> logging.Logger: # configure the logger - logger = logging.getLogger(name="pid: " + str(os.getpid())) - logging.basicConfig() - logging.root.setLevel(level="INFO") - logger.info("Scraper class initialized") + try: + logger = logging.getLogger(name="pid: " + str(os.getpid())) + logging.basicConfig() + logging.root.setLevel(level="INFO") + logger.info("Scraper class initialized") + except Exception as e: + raise ValueError(f"Error configuring the logger: {e}") return logger - def format_county(self, county): - county = county.lower() + def format_county(self, county: str) -> str: + # make the county lowercase + try: + county = county.lower() + except Exception as e: + raise ValueError(f"Error with making the county lowercase: {e}") return county - def create_session(self): - session = requests.Session() - session.verify = False - requests.packages.urllib3.disable_warnings( - requests.packages.urllib3.exceptions.InsecureRequestWarning - ) + def create_session(self, logger: logging.Logger) -> requests.sessions.Session: + try: + session = requests.Session() + session.verify = False + requests.packages.urllib3.disable_warnings( + requests.packages.urllib3.exceptions.InsecureRequestWarning + ) + except Exception as e: + raise ValueError(f"Error creating the requests session field: {e}") return session - def make_directories(self, county): + def make_directories(self, county: str, logger: logging.Logger) -> str: # make directories if not present - case_html_path = os.path.join( - os.path.dirname(__file__), "..", "..", "data", county, 
"case_html" - ) - os.makedirs(case_html_path, exist_ok=True) + try: + case_html_path = os.path.join( + os.path.dirname(__file__), "..", "..", "data", county, "case_html" + ) + os.makedirs(case_html_path, exist_ok=True) + except Exception as e: + raise OSError(f"Error making directories for the resulting case HTML: {e}") return case_html_path def get_ody_link(self, county, logger): @@ -83,7 +110,7 @@ def get_ody_link(self, county, logger): ) return base_url, odyssey_version, notes - def get_class_and_method(self, county): + def get_class_and_method(self, county, logger): # Construct the module, class, and method names module_name = county #ex: 'hays' class_name = f"Scraper{county.capitalize()}" #ex: 'ScraperHays' @@ -351,9 +378,9 @@ def scrape_multiple_cases(self, county, odyssey_version, base_url, search_url, h def scrape(self, county, judicial_officers, ms_wait, start_date, end_date, court_calendar_link_text, case_number, case_html_path): ms_wait, start_date, end_date, court_calendar_link_text, case_number = self.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number) logger = self.configure_logger() - county = self.format_county(county) - session = self.create_session() - self.make_directories(county) if not case_html_path else case_html_path + county = self.format_county(county, logger) + session = self.create_session(logger) + self.make_directories(county, logger) if not case_html_path else case_html_path base_url, odyssey_version, notes = self.get_ody_link(county, logger) main_page_html, main_soup = self.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait) search_url, search_page_html, search_soup = self.scrape_search_page(base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text) @@ -365,3 +392,8 @@ def scrape(self, county, judicial_officers, ms_wait, start_date, end_date, court SCRAPER_START_TIME = time() self.scrape_multiple_cases(odyssey_version, base_url, search_url, hidden_values, judicial_officers, judicial_officer_to_ID, case_html_path, logger, session, ms_wait, start_date, end_date) logger.info(f"\nTime to run script: {round(time() - SCRAPER_START_TIME, 2)} seconds") + +#scraper_instance = Scraper() +#logger = scraper_instance.configure_logger() +#session = scraper_instance.create_session(logger) +#print(type(session)) From f985d84d23121241f06737c7e3217eb37257276e Mon Sep 17 00:00:00 2001 From: Nicolas Sawyer Date: Sat, 14 Sep 2024 14:01:43 -0500 Subject: [PATCH 02/10] scraper: logging, typeing, error handling part 2 --- src/scraper/__init__.py | 205 +++++++++++++++++++++------------------- 1 file changed, 108 insertions(+), 97 deletions(-) diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py index ef8d5b3..fe3b675 100644 --- a/src/scraper/__init__.py +++ b/src/scraper/__init__.py @@ -9,7 +9,7 @@ from bs4 import BeautifulSoup from .helpers import * import importlib -from typing import Optional, Tuple +from typing import Optional, Tuple, Callable, Type class Scraper: def __init__(self): @@ -40,7 +40,7 @@ def set_defaults(self, if not case_number: case_number = None except Exception as e: - raise ValueError(f"Error setting the default values for the code : {e}") + raise Exception(f"Error setting the default values for the code : {e}") return ms_wait, start_date, end_date, court_calendar_link_text, case_number def configure_logger(self) -> logging.Logger: @@ -51,132 +51,143 @@ def configure_logger(self) -> logging.Logger: logging.root.setLevel(level="INFO") 
logger.info("Scraper class initialized") except Exception as e: - raise ValueError(f"Error configuring the logger: {e}") + raise Exception(f"Error configuring the logger: {e}") return logger - def format_county(self, county: str) -> str: + def format_county(self, + county: str + ) -> str: # make the county lowercase try: county = county.lower() except Exception as e: - raise ValueError(f"Error with making the county lowercase: {e}") + raise TypeError(f"Error with making the county lowercase: {e}") return county - def create_session(self, logger: logging.Logger) -> requests.sessions.Session: + # creates a session that will be used for interacting with web pages + def create_session(self, + logger: logging.Logger + ) -> requests.sessions.Session: try: session = requests.Session() session.verify = False - requests.packages.urllib3.disable_warnings( - requests.packages.urllib3.exceptions.InsecureRequestWarning - ) + requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) except Exception as e: - raise ValueError(f"Error creating the requests session field: {e}") + logger.info(f"Error creating the requests session field: {e}") + raise Exception(f"Error creating the requests session field: {e}") return session - def make_directories(self, county: str, logger: logging.Logger) -> str: - # make directories if not present + # make directories if not present + def make_directories(self, + county: str, + logger: logging.Logger + ) -> str: try: - case_html_path = os.path.join( - os.path.dirname(__file__), "..", "..", "data", county, "case_html" - ) + case_html_path = os.path.join(os.path.dirname(__file__), "..", "..", "data", county, "case_html") os.makedirs(case_html_path, exist_ok=True) except Exception as e: + logger.info(f"Error making directories for the resulting case HTML: {e}") raise OSError(f"Error making directories for the resulting case HTML: {e}") return case_html_path - def get_ody_link(self, county, logger): - # get county portal and version year information from csv file - base_url = odyssey_version = notes = None - with open( - os.path.join( - os.path.dirname(__file__), "..", "..", "resources", "texas_county_data.csv" - ), - mode="r", - ) as file_handle: - csv_file = csv.DictReader(file_handle) - for row in csv_file: - if row["county"].lower() == county.lower(): - base_url = row["portal"] - # add trailing slash if not present, otherwise urljoin breaks - if base_url[-1] != "/": - base_url += "/" - logger.info(f"{base_url} - scraping this url") - odyssey_version = int(row["version"].split(".")[0]) - notes = row["notes"] - break - if not base_url or not odyssey_version: - raise Exception( - "The required data to scrape this county is not in ./resources/texas_county_data.csv" - ) + # get county portal URL, Odyssey version, and notes from csv file + def get_ody_link(self, + county: str, + logger: logging.Logger + ) -> Tuple[str, + str, + str ]: + try: + base_url = odyssey_version = notes = None + with open( + os.path.join(os.path.dirname(__file__), "..", "..", "resources", "texas_county_data.csv"), + mode="r", + ) as file_handle: + csv_file = csv.DictReader(file_handle) + for row in csv_file: + if row["county"].lower() == county.lower(): + base_url = row["portal"] + # add trailing slash if not present, otherwise urljoin breaks + if base_url[-1] != "/": + base_url += "/" + logger.info(f"{base_url} - scraping this url") + odyssey_version = int(row["version"].split(".")[0]) + notes = row["notes"] + break + if not base_url or not odyssey_version: 
+ raise Exception("The required data to scrape this county is not in /resources/texas_county_data.csv") + except Exception as e: + logger.info(f"Error getting county-specific information from csv: {e}") + raise Exception(f"Error getting county-specific information from csv: {e}") return base_url, odyssey_version, notes - def get_class_and_method(self, county, logger): - # Construct the module, class, and method names - module_name = county #ex: 'hays' - class_name = f"Scraper{county.capitalize()}" #ex: 'ScraperHays' - method_name = f"scraper_{county}" #ex: 'scraper_hays' - - # Add the current directory to the system path - sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - + def get_class_and_method(self, + county: str, + logger: logging.Logger + ) -> Tuple[Type[object], + Callable]: try: - # Dynamically import the module - module = importlib.import_module(module_name) - - # Retrieve the class from the module - cls = getattr(module, class_name) - if cls is None: - print(f"Class '{class_name}' not found in module '{module_name}'.") - return None, None - - # Instantiate the class - instance = cls() - - # Retrieve the method with the specified name - method = getattr(instance, method_name, None) - if method is None: - print(f"Method '{method_name}' not found in class '{class_name}'.") - return instance, None - + # Construct the module, class, and method names + module_name = county #ex: 'hays' + class_name = f"Scraper{county.capitalize()}" #ex: 'ScraperHays' + method_name = f"scraper_{county}" #ex: 'scraper_hays' + # Add the current directory to the system path + sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + except Exception as e: + logger.info(f"Error formatting the module, class, and method name from county name: {e}") + raise Exception(f"Error formatting the module, class, and method name from county name: {e}") + try: + module = importlib.import_module(module_name) # Dynamically import the module + cls = getattr(module, class_name) # Retrieve the class from the module + instance = cls() # Instantiate the class + method = getattr(instance, method_name, None) # Retrieve the method with the specified name + if cls is None or method is None: + logger.info(f"Received None for either the class (str: {class_name}) or method (str: {method_name})") + raise TypeError(f"Received None for either the class (str: {class_name}) or method (str: {method_name})") return instance, method - except ModuleNotFoundError: - print(f"Module '{module_name}' not found.") - return None, None - - def scrape_main_page(self, base_url, odyssey_version, session, notes, logger, ms_wait): - # if odyssey_version < 2017, scrape main page first to get necessary data - if odyssey_version < 2017: - # some sites have a public guest login that must be used - if "PUBLICLOGIN#" in notes: - userpass = notes.split("#")[1].split("/") - - data = { - "UserName": userpass[0], - "Password": userpass[1], - "ValidateUser": "1", - "dbKeyAuth": "Justice", - "SignOn": "Sign On", - } + except Exception as e: + logger.info(f"Error dynamically importing the module, class, and method name using county name: {e}") + raise Exception(f"Error dynamically importing the module, class, and method name using county name: {e}") - response = request_page_with_retry( - session=session, - url=urllib.parse.urljoin(base_url, "login.aspx"), - logger=logger, - http_method=HTTPMethod.GET, - ms_wait=ms_wait, - data=data, - ) + def scrape_main_page(self, + base_url: str, + odyssey_version: int, + session: requests.sessions.Session, + 
notes: str, + logger: logging.Logger, + ms_wait: int + ) -> Tuple[str, BeautifulSoup]: + # some sites have a public guest login that must be used + if "PUBLICLOGIN#" in notes: + userpass = notes.split("#")[1].split("/") + data = { + "UserName": userpass[0], + "Password": userpass[1], + "ValidateUser": "1", + "dbKeyAuth": "Justice", + "SignOn": "Sign On", + } - main_page_html = request_page_with_retry( + # not sure how this is being used. response doesn't seem to be used anywhere. May remove? + response = request_page_with_retry( session=session, - url=base_url, - verification_text="ssSearchHyperlink", + url=urllib.parse.urljoin(base_url, "login.aspx"), logger=logger, http_method=HTTPMethod.GET, ms_wait=ms_wait, + data=data, ) - main_soup = BeautifulSoup(main_page_html, "html.parser") - return main_page_html, main_soup + + main_page_html = request_page_with_retry( + session=session, + url=base_url, + verification_text="ssSearchHyperlink", + logger=logger, + http_method=HTTPMethod.GET, + ms_wait=ms_wait, + ) + main_soup = BeautifulSoup(main_page_html, "html.parser") + return main_page_html, main_soup def scrape_search_page(self, base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text): # build url for court calendar From 88b40fa2b70ab082681b5208f28b7229d7a7122d Mon Sep 17 00:00:00 2001 From: Nicolas Sawyer Date: Sat, 14 Sep 2024 15:08:12 -0500 Subject: [PATCH 03/10] final adding logging, typing, error handling to scraper --- src/scraper/__init__.py | 775 ++++++++++++++++++++---------- src/scraper/scrapcode_post2017.py | 45 ++ src/tester/test_unittest.py | 12 +- 3 files changed, 566 insertions(+), 266 deletions(-) create mode 100644 src/scraper/scrapcode_post2017.py diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py index fe3b675..6128f99 100644 --- a/src/scraper/__init__.py +++ b/src/scraper/__init__.py @@ -9,94 +9,156 @@ from bs4 import BeautifulSoup from .helpers import * import importlib -from typing import Optional, Tuple, Callable, Type +from typing import Optional, Tuple, Callable, Type, List +import importlib.util class Scraper: def __init__(self): pass - def set_defaults(self, - ms_wait: Optional[int], - start_date: Optional[str], - end_date: Optional[str], - court_calendar_link_text: Optional[str], - case_number: Optional[str], - logger: logging.Logger - ) -> Tuple[Optional[int], - Optional[str], - Optional[str], - Optional[str], - Optional[str], - logging.Logger]: - try: # set the defaults - if not ms_wait: - ms_wait = 200 - if not start_date: - start_date = '2024-07-01' - if not end_date: - end_date = '2024-07-01' - if not court_calendar_link_text: - court_calendar_link_text = "Court Calendar" - if not case_number: - case_number = None - except Exception as e: - raise Exception(f"Error setting the default values for the code : {e}") + def set_defaults( + self, + ms_wait: Optional[int] = None, + start_date: Optional[str] = None, + end_date: Optional[str] = None, + court_calendar_link_text: Optional[str] = None, + case_number: Optional[str] = None + ) -> Tuple[int, str, str, str, Optional[str]]: + """ + Sets default values for the provided optional parameters. + + Args: + ms_wait (Optional[int]): Milliseconds to wait, default is 200 if not provided. + start_date (Optional[str]): Start date in YYYY-MM-DD format, default is '2024-07-01' if not provided. + end_date (Optional[str]): End date in YYYY-MM-DD format, default is '2024-07-01' if not provided. 
+ court_calendar_link_text (Optional[str]): Text for court calendar link, default is 'Court Calendar' if not provided. + case_number (Optional[str]): Case number, default is None if not provided. + + Returns: + Tuple[int, str, str, str, Optional[str]]: A tuple containing: + - `ms_wait` (int): Milliseconds to wait. + - `start_date` (str): Start date. + - `end_date` (str): End date. + - `court_calendar_link_text` (str): Text for court calendar link. + - `case_number` (Optional[str]): Case number or None. + """ + # Assign default values if parameters are not provided + ms_wait = ms_wait if ms_wait is not None else 200 + start_date = start_date if start_date is not None else '2024-07-01' + end_date = end_date if end_date is not None else '2024-07-01' + court_calendar_link_text = court_calendar_link_text if court_calendar_link_text is not None else "Court Calendar" + # case_number defaults to None if not provided + case_number = case_number + return ms_wait, start_date, end_date, court_calendar_link_text, case_number def configure_logger(self) -> logging.Logger: - # configure the logger - try: - logger = logging.getLogger(name="pid: " + str(os.getpid())) - logging.basicConfig() - logging.root.setLevel(level="INFO") - logger.info("Scraper class initialized") - except Exception as e: - raise Exception(f"Error configuring the logger: {e}") + """ + Configures and returns a logger instance for the scraper class. + + This method sets up the logger with a unique name based on the process ID, + configures the logging level to INFO, and logs an initialization message. + + Returns: + logging.Logger: Configured logger instance. + """ + # Configure the logger + logger = logging.getLogger(name=f"pid: {os.getpid()}") + + # Set up basic configuration for the logging system + logging.basicConfig(level=logging.INFO) + return logger - def format_county(self, - county: str - ) -> str: - # make the county lowercase - try: - county = county.lower() - except Exception as e: - raise TypeError(f"Error with making the county lowercase: {e}") - return county + def format_county(self, county: str) -> str: + """ + Formats the county name to lowercase. - # creates a session that will be used for interacting with web pages - def create_session(self, - logger: logging.Logger - ) -> requests.sessions.Session: - try: - session = requests.Session() - session.verify = False - requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) - except Exception as e: - logger.info(f"Error creating the requests session field: {e}") - raise Exception(f"Error creating the requests session field: {e}") + Args: + county (str): The name of the county to be formatted. + + Returns: + str: The county name in lowercase. + + Raises: + TypeError: If the provided county name is not a string. + """ + if not isinstance(county, str): + raise TypeError("The county name must be a string.") + + return county.lower() + + def create_session(self, logger: logging.Logger) -> requests.sessions.Session: + """ + Creates and configures a requests session for interacting with web pages. + + This method sets up a `requests.Session` with SSL verification disabled and suppresses + related warnings. + + Args: + logger (logging.Logger): Logger instance for logging errors. + + Returns: + requests.sessions.Session: Configured session object. 
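+
+        Example (illustrative usage; assumes a logger from configure_logger()):
+            >>> session = scraper_instance.create_session(logger)
+            >>> isinstance(session, requests.Session)
+            True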
+ """ + # Create and configure the session + session = requests.Session() + session.verify = False # Disable SSL certificate verification + requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) + return session - # make directories if not present - def make_directories(self, - county: str, - logger: logging.Logger - ) -> str: - try: - case_html_path = os.path.join(os.path.dirname(__file__), "..", "..", "data", county, "case_html") - os.makedirs(case_html_path, exist_ok=True) - except Exception as e: - logger.info(f"Error making directories for the resulting case HTML: {e}") - raise OSError(f"Error making directories for the resulting case HTML: {e}") + def make_directories(self, county: str, logger: logging.Logger) -> str: + """ + Creates necessary directories for storing case HTML files. + + This method constructs a path based on the county name and ensures that + all required directories in the path are created. If the directories already + exist, no action is taken. + + Args: + county (str): The name of the county, used to create a specific directory path. + logger (logging.Logger): Logger instance for logging errors. + + Returns: + str: The path to the created directories. + + Raises: + OSError: If there is an error creating the directories. + """ + case_html_path = os.path.join(os.path.dirname(__file__), "..", "..", "data", county, "case_html") + + # Create the directories if they do not exist + os.makedirs(case_html_path, exist_ok=True) + return case_html_path # get county portal URL, Odyssey version, and notes from csv file def get_ody_link(self, county: str, logger: logging.Logger - ) -> Tuple[str, - str, - str ]: + ) -> Tuple[str, str, str ]: + """ + Retrieves Odyssey-related information for a given county from a CSV file. + + This function reads county-specific data from a CSV file located in the `resources` directory. It searches for + the county name in the CSV file, extracts the corresponding base URL, Odyssey version, and any additional notes. + The base URL is formatted with a trailing slash if necessary. + + Args: + county (str): The name of the county for which to retrieve Odyssey information. + logger (logging.Logger): Logger instance for logging errors and information. + + Returns: + Tuple[str, str, str]: A tuple containing: + - `base_url` (str): The base URL for the county’s portal. + - `odyssey_version` (str): The major version of Odyssey associated with the county. + - `notes` (str): Additional notes related to the county. + + Raises: + Exception: If the county is not found in the CSV file or if required data is missing, an exception is raised + and logged. 
+ """ try: base_url = odyssey_version = notes = None with open( @@ -118,36 +180,61 @@ def get_ody_link(self, raise Exception("The required data to scrape this county is not in /resources/texas_county_data.csv") except Exception as e: logger.info(f"Error getting county-specific information from csv: {e}") - raise Exception(f"Error getting county-specific information from csv: {e}") + raise return base_url, odyssey_version, notes - def get_class_and_method(self, - county: str, - logger: logging.Logger - ) -> Tuple[Type[object], - Callable]: - try: - # Construct the module, class, and method names - module_name = county #ex: 'hays' - class_name = f"Scraper{county.capitalize()}" #ex: 'ScraperHays' - method_name = f"scraper_{county}" #ex: 'scraper_hays' - # Add the current directory to the system path - sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - except Exception as e: - logger.info(f"Error formatting the module, class, and method name from county name: {e}") - raise Exception(f"Error formatting the module, class, and method name from county name: {e}") + def get_class_and_method( + self, + county: str, + logger: logging.Logger + ) -> Tuple[Type[object], Callable]: + """ + Dynamically imports a module, retrieves a class, and gets a method from it based on the county name. + + Args: + county (str): The name of the county, used to construct module, class, and method names. + logger (logging.Logger): Logger instance for logging errors. + + Returns: + Tuple[Type[object], Callable]: A tuple containing the instance of the class and the method callable. + + Raises: + ImportError: If the module cannot be imported. + AttributeError: If the class or method cannot be found. + Exception: For any other unexpected errors. + """ + module_name = county + class_name = f"Scraper{county.capitalize()}" + method_name = f"scraper_{county}" + + # Add the current directory to the system path + sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + try: - module = importlib.import_module(module_name) # Dynamically import the module - cls = getattr(module, class_name) # Retrieve the class from the module - instance = cls() # Instantiate the class - method = getattr(instance, method_name, None) # Retrieve the method with the specified name - if cls is None or method is None: - logger.info(f"Received None for either the class (str: {class_name}) or method (str: {method_name})") - raise TypeError(f"Received None for either the class (str: {class_name}) or method (str: {method_name})") + # Dynamically import the module + module = importlib.import_module(module_name) + + # Retrieve the class from the module + cls = getattr(module, class_name, None) + if cls is None: + raise AttributeError(f"Class '{class_name}' not found in module '{module_name}'") + + # Instantiate the class + instance = cls() + + # Retrieve the method with the specified name + method = getattr(instance, method_name, None) + if method is None: + raise AttributeError(f"Method '{method_name}' not found in class '{class_name}'") + return instance, method + + except (FileNotFoundError, ImportError, AttributeError) as e: + logger.error(f"Error dynamically loading module or retrieving class/method: {e}") + raise except Exception as e: - logger.info(f"Error dynamically importing the module, class, and method name using county name: {e}") - raise Exception(f"Error dynamically importing the module, class, and method name using county name: {e}") + logger.error(f"Unexpected error: {e}") + raise def scrape_main_page(self, base_url: str, 
@@ -157,61 +244,130 @@ def scrape_main_page(self, logger: logging.Logger, ms_wait: int ) -> Tuple[str, BeautifulSoup]: - # some sites have a public guest login that must be used - if "PUBLICLOGIN#" in notes: - userpass = notes.split("#")[1].split("/") - data = { - "UserName": userpass[0], - "Password": userpass[1], - "ValidateUser": "1", - "dbKeyAuth": "Justice", - "SignOn": "Sign On", - } - - # not sure how this is being used. response doesn't seem to be used anywhere. May remove? - response = request_page_with_retry( + """ + Scrapes the main page of the Odyssey site, handling login if required and returning the page's HTML and parsed content. + + This function handles a special case where some sites may require a public guest login. If the `notes` parameter + contains a "PUBLICLOGIN#" identifier, it will extract the username and password from the `notes`, perform the login, + and then proceed to scrape the main page. + + Args: + base_url (str): The base URL of the main page to scrape. + odyssey_version (int): The version of Odyssey; currently not used in this function. + session (requests.sessions.Session): The `requests` session object used for making HTTP requests. + notes (str): A string containing notes that may include login credentials in the format "PUBLICLOGIN#username/password". + logger (logging.Logger): Logger instance for logging errors and debug information. + ms_wait (int): The number of milliseconds to wait between retry attempts. + + Returns: + Tuple[str, BeautifulSoup]: A tuple containing: + - `main_page_html` (str): The raw HTML content of the main page. + - `main_soup` (BeautifulSoup): A BeautifulSoup object containing the parsed HTML content. + + Raises: + Exception: If any error occurs during the HTTP requests or HTML parsing, an exception is raised and logged. + """ + try: + # some sites have a public guest login that must be used + if "PUBLICLOGIN#" in notes: + userpass = notes.split("#")[1].split("/") + data = { + "UserName": userpass[0], + "Password": userpass[1], + "ValidateUser": "1", + "dbKeyAuth": "Justice", + "SignOn": "Sign On", + } + + # not sure how this is being used. response doesn't seem to be used anywhere, but it may just be opening the page. 
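+                # (Most likely the call matters only for its side effect: it logs
+                # the shared session in, so the cookies it sets are reused by all
+                # later requests, and the response body itself can be discarded.)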
+ response = request_page_with_retry( + session=session, + url=urllib.parse.urljoin(base_url, "login.aspx"), + logger=logger, + http_method=HTTPMethod.GET, + ms_wait=ms_wait, + data=data, + ) + + main_page_html = request_page_with_retry( session=session, - url=urllib.parse.urljoin(base_url, "login.aspx"), + url=base_url, + verification_text="ssSearchHyperlink", logger=logger, http_method=HTTPMethod.GET, ms_wait=ms_wait, - data=data, ) - - main_page_html = request_page_with_retry( - session=session, - url=base_url, - verification_text="ssSearchHyperlink", - logger=logger, - http_method=HTTPMethod.GET, - ms_wait=ms_wait, - ) - main_soup = BeautifulSoup(main_page_html, "html.parser") + main_soup = BeautifulSoup(main_page_html, "html.parser") + except Exception as e: + logger.error(f"Error scraping main page for main page HTML: {e}") + raise return main_page_html, main_soup - def scrape_search_page(self, base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text): - # build url for court calendar + def scrape_search_page( + self, + base_url: str, + odyssey_version: int, + main_page_html: str, + main_soup: BeautifulSoup, + session: requests.sessions.Session, + logger: logging.Logger, + ms_wait: int, + court_calendar_link_text: str + ) -> Tuple[str, str, BeautifulSoup]: + """ + Scrapes the search page URL and data based on the main page content. + + This method extracts the search page ID from the court calendar link, constructs the URL for the search page, + and retrieves the search page HTML. Depending on the Odyssey version, it either uses the extracted URL or a + default URL. It then parses the search page HTML into a BeautifulSoup object. + + Args: + base_url (str): The base URL for constructing full URLs. + odyssey_version (int): The version of Odyssey, used to determine the correct URL and verification text. + main_page_html (str): The HTML content of the main page. + main_soup (BeautifulSoup): Parsed BeautifulSoup object of the main page HTML. + session (requests.sessions.Session): The session object for making HTTP requests. + logger (logging.Logger): Logger instance for logging errors and information. + ms_wait (int): Milliseconds to wait before making requests. + court_calendar_link_text (str): Text to search for in the court calendar link. + + Returns: + Tuple[str, str, BeautifulSoup]: A tuple containing the search page URL, search page HTML, and the BeautifulSoup object of the search page. + + Raises: + ValueError: If the court calendar link is not found on the main page. 
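+
+        Example (illustrative; mirrors how the unit tests invoke this method):
+            >>> search_url, search_page_html, search_soup = scraper_instance.scrape_search_page(
+            ...     base_url, odyssey_version, main_page_html, main_soup,
+            ...     session, logger, ms_wait, "Court Calendar")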
+ """ + # Extract the search page ID from the court calendar link search_page_id = None for link in main_soup.select("a.ssSearchHyperlink"): if court_calendar_link_text in link.text: search_page_id = link["href"].split("?ID=")[1].split("'")[0] + break # Exit loop once the link is found + if not search_page_id: write_debug_and_quit( verification_text="Court Calendar link", page_text=main_page_html, logger=logger, ) - search_url = base_url + "Search.aspx?ID=" + search_page_id + raise ValueError("Court Calendar link not found on the main page.") - # hit the search page to gather initial data + # Build the URL for the search page + search_url = f"{base_url}Search.aspx?ID={search_page_id}" + + # Determine the correct URL and verification text based on Odyssey version + if odyssey_version < 2017: + search_url = search_url + verification_text = "Court Calendar" + else: + search_url = urllib.parse.urljoin(base_url, "Home/Dashboard/26") + verification_text = "SearchCriteria.SelectedCourt" + + # Hit the search page to gather initial data search_page_html = request_page_with_retry( session=session, - url=search_url - if odyssey_version < 2017 - else urllib.parse.urljoin(base_url, "Home/Dashboard/26"), - verification_text="Court Calendar" - if odyssey_version < 2017 - else "SearchCriteria.SelectedCourt", + url=search_url, + verification_text=verification_text, http_method=HTTPMethod.GET, logger=logger, ms_wait=ms_wait, @@ -220,28 +376,68 @@ def scrape_search_page(self, base_url, odyssey_version, main_page_html, main_sou return search_url, search_page_html, search_soup - def get_hidden_values(self, odyssey_version, main_soup, search_soup, logger): - # we need these hidden values to POST a search + def get_hidden_values( + self, + odyssey_version: int, + main_soup: BeautifulSoup, + search_soup: BeautifulSoup, + logger: logging.Logger + ) -> Dict[str, str]: + """ + Extracts hidden input values and additional data from the search page. + + Args: + odyssey_version (int): The version of Odyssey to determine logic. + main_soup (BeautifulSoup): Parsed BeautifulSoup object of the main page HTML. + search_soup (BeautifulSoup): Parsed BeautifulSoup object of the search page HTML. + logger (logging.Logger): Logger instance for logging information. + + Returns: + Dict[str, str]: Dictionary of hidden input names and their values. + """ + # Extract hidden input values hidden_values = { hidden["name"]: hidden["value"] for hidden in search_soup.select('input[type="hidden"]') if hidden.has_attr("name") } - # get nodedesc and nodeid information from main page location select box + + # Get NodeDesc and NodeID information based on Odyssey version if odyssey_version < 2017: - location_option = main_soup.findAll("option")[0] - logger.info(f"location: {location_option.text}") - hidden_values.update( - {"NodeDesc": location_option.text, "NodeID": location_option["value"]} - ) + location_option = main_soup.find_all("option")[0] + logger.info(f"Location: {location_option.text}") + hidden_values.update({ + "NodeDesc": location_option.text, + "NodeID": location_option["value"] + }) else: - hidden_values["SearchCriteria.SelectedCourt"] = hidden_values[ - "Settings.DefaultLocation" - ] # TODO: Search in default court. Might need to add further logic later to loop through courts. 
+ hidden_values["SearchCriteria.SelectedCourt"] = hidden_values.get("Settings.DefaultLocation", "") + return hidden_values - def get_search_results(self, session, search_url, logger, ms_wait, hidden_values, case_number): - # POST a request for search results + def get_search_results( + self, + session: requests.sessions.Session, + search_url: str, + logger: logging.Logger, + ms_wait: int, + hidden_values: Dict[str, str], + case_number: Optional[str] + ) -> BeautifulSoup: + """ + Retrieves search results from the search page. + + Args: + session (requests.sessions.Session): The session object for making HTTP requests. + search_url (str): The URL to request search results from. + logger (logging.Logger): Logger instance for logging information. + ms_wait (int): Milliseconds to wait before making requests. + hidden_values (Dict[str, str]): Dictionary of hidden input values. + case_number (Optional[str]): Case number for searching. + + Returns: + BeautifulSoup: Parsed BeautifulSoup object of the search results page HTML. + """ results_page_html = request_page_with_retry( session=session, url=search_url, @@ -250,161 +446,220 @@ def get_search_results(self, session, search_url, logger, ms_wait, hidden_values data=create_single_case_search_form_data(hidden_values, case_number), ms_wait=ms_wait, ) - results_soup = BeautifulSoup(results_page_html, "html.parser") - return results_soup + return BeautifulSoup(results_page_html, "html.parser") + + def scrape_individual_case( + self, + base_url: str, + search_url: str, + hidden_values: Dict[str, str], + case_number: Optional[str], + case_html_path: str, + session: requests.sessions.Session, + logger: logging.Logger, + ms_wait: int + ) -> None: - def scrape_individual_case(self, base_url, search_url, hidden_values, case_number, case_html_path, session, logger, ms_wait): # Individual case search logic results_soup = self.get_search_results(session, search_url, logger, ms_wait, hidden_values, case_number) case_urls = [ base_url + anchor["href"] for anchor in results_soup.select('a[href^="CaseDetail"]') ] + logger.info(f"{len(case_urls)} entries found") - case_id = case_urls[0].split("=")[1] - logger.info(f"{case_id} - scraping case") - # make request for the case - case_html = request_page_with_retry( - session=session, - url=case_urls[0], - verification_text="Date Filed", - logger=logger, - ms_wait=ms_wait, - ) - # write html case data - logger.info(f"{len(case_html)} response string length") + + if case_urls: + case_id = case_urls[0].split("=")[1] + logger.info(f"{case_id} - scraping case") + + case_html = request_page_with_retry( + session=session, + url=case_urls[0], + verification_text="Date Filed", + logger=logger, + ms_wait=ms_wait, + ) + + logger.info(f"{len(case_html)} response string length") - with open( - os.path.join(case_html_path, f"{case_id}.html"), "w" - ) as file_handle: - file_handle.write(case_html) + with open( + os.path.join(case_html_path, f"{case_id}.html"), "w" + ) as file_handle: + file_handle.write(case_html) + else: + logger.warning("No case URLs found.") + + def scrape_jo_list( + self, + odyssey_version: int, + search_soup: BeautifulSoup, + judicial_officers: Optional[List[str]], + logger: logging.Logger + ) -> Tuple[List[str], Dict[str, str]]: + """ + Scrapes a list of judicial officers and their IDs from the search page. + + Optionally receives a list of judicial officers to scrape. 
- def scrape_jo_list(self, odyssey_version, search_soup, judicial_officers, logger): - # get a list of JOs to their IDs from the search page + Args: + odyssey_version (int): The version of Odyssey to determine the selector. + search_soup (BeautifulSoup): Parsed BeautifulSoup object of the search page HTML. + judicial_officers (Optional[List[str]]): List of specific judicial officers to use. + logger (logging.Logger): Logger instance for logging information. + + Returns: + Tuple[List[str], Dict[str, str]]: Tuple containing a list of judicial officers to use and a dictionary of judicial officers and their IDs. + """ + selector = 'select[labelname="Judicial Officer:"] > option' if odyssey_version < 2017 else 'select[id="selHSJudicialOfficer"] > option' judicial_officer_to_ID = { option.text: option["value"] - for option in search_soup.select( - 'select[labelname="Judicial Officer:"] > option' - if odyssey_version < 2017 - else 'select[id="selHSJudicialOfficer"] > option' - ) + for option in search_soup.select(selector) if option.text } - # if juidicial_officers param is not specified, use all of them + if not judicial_officers: judicial_officers = list(judicial_officer_to_ID.keys()) + logger.info(f"No judicial officers specified, so scraping all of them: {len(judicial_officers)}") + else: + logger.info(f"Judicial officers were specified, so only scraping these: {judicial_officers}") + return judicial_officers, judicial_officer_to_ID - def scrape_results_page(self, odyssey_version, base_url, search_url, hidden_values, JO_id, date_string, session, logger, ms_wait): - # POST a request for search results - results_page_html = request_page_with_retry( - session=session, - url=search_url + def scrape_results_page( + self, + odyssey_version: int, + base_url: str, + search_url: str, + hidden_values: Dict[str, str], + JO_id: str, + date_string: str, + session: requests.sessions.Session, + logger: logging.Logger, + ms_wait: int + ) -> Tuple[str, BeautifulSoup]: + """ + Scrapes the results page based on Odyssey version and search criteria. + + Args: + odyssey_version (int): The version of Odyssey to determine the URL and verification text. + base_url (str): The base URL for constructing full URLs. + search_url (str): The URL to request search results from. + hidden_values (Dict[str, str]): Dictionary of hidden input values. + JO_id (str): Judicial officer ID for searching. + date_string (str): Date string for searching. + session (requests.sessions.Session): The session object for making HTTP requests. + logger (logging.Logger): Logger instance for logging information. + ms_wait (int): Milliseconds to wait before making requests. + + Returns: + Tuple[str, BeautifulSoup]: A tuple containing the HTML of the results page and the parsed BeautifulSoup object. 
+ """ + search_url = ( + search_url if odyssey_version < 2017 - else urllib.parse.urljoin(base_url, "Hearing/SearchHearings/HearingSearch"), - verification_text="Record Count" + else urllib.parse.urljoin(base_url, "Hearing/SearchHearings/HearingSearch") + ) + + verification_text = ( + "Record Count" if odyssey_version < 2017 - else "Search Results", + else "Search Results" + ) + + results_page_html = request_page_with_retry( + session=session, + url=search_url, + verification_text=verification_text, logger=logger, - data=create_search_form_data( - date_string, JO_id, hidden_values, odyssey_version - ), + data=create_search_form_data(date_string, JO_id, hidden_values, odyssey_version), ms_wait=ms_wait, - ) + ) + results_soup = BeautifulSoup(results_page_html, "html.parser") + return results_page_html, results_soup - # Not currently in use. Should be moved to a county-specific module, class, and method when a post2017 county is included - """def scrape_case_data_post2017(self, base_url, case_html_path, session, logger, ms_wait): - # Need to POST this page to get a JSON of the search results after the initial POST - case_list_json = request_page_with_retry( - session=session, - url=urllib.parse.urljoin(base_url, "Hearing/HearingResults/Read"), - verification_text="AggregateResults", - logger=logger, - ) - case_list_json = json.loads(case_list_json) - logger.info(f"{case_list_json['Total']} cases found") - for case_json in case_list_json["Data"]: - case_id = str(case_json["CaseId"]) - logger.info(f"{case_id} scraping case") - # make request for the case - case_html = request_page_with_retry( - session=session, - url=urllib.parse.urljoin(base_url, "Case/CaseDetail"), - verification_text="Case Information", - logger=logger, - ms_wait=ms_wait, - params={ - "eid": case_json["EncryptedCaseId"], - "CaseNumber": case_json["CaseNumber"], - }, - ) - # make request for financial info - case_html += request_page_with_retry( - session=session, - url=urllib.parse.urljoin( - base_url, "Case/CaseDetail/LoadFinancialInformation" - ), - verification_text="Financial", - logger=logger, - ms_wait=ms_wait, - params={ - "caseId": case_json["CaseId"], - }, - ) - # write case html data - logger.info(f"{len(case_html)} response string length") - with open( - os.path.join(case_html_path, f"{case_id}.html"), "w" - ) as file_handle: - file_handle.write(case_html)""" - - def scrape_multiple_cases(self, county, odyssey_version, base_url, search_url, hidden_values, judicial_officers, judicial_officer_to_ID, case_html_path, logger, session, ms_wait, start_date, end_date): + def scrape_multiple_cases( + self, + county: str, + odyssey_version: int, + base_url: str, + search_url: str, + hidden_values: Dict[str, str], + judicial_officers: List[str], + judicial_officer_to_ID: Dict[str, str], + case_html_path: Optional[str], + logger: logging.Logger, + session: requests.Session, + ms_wait: int, + start_date: str, + end_date: str + ) -> None: start_date = datetime.strptime(start_date, '%Y-%m-%d').date() end_date = datetime.strptime(end_date, '%Y-%m-%d').date() - # loop through each day - for date in ( - start_date + timedelta(n) - for n in range((end_date - start_date).days + 1) - ): - date_string = datetime.strftime(date, "%m/%d/%Y") - # loop through each judicial officer + + for date in (start_date + timedelta(n) for n in range((end_date - start_date).days + 1)): + date_string = date.strftime("%m/%d/%Y") + for JO_name in judicial_officers: if JO_name not in judicial_officer_to_ID: - logger.error(f"judicial officer {JO_name} not 
found on search page. Continuing.") + logger.error(f"Judicial officer {JO_name} not found on search page. Continuing.") continue + JO_id = judicial_officer_to_ID[JO_name] logger.info(f"Searching cases on {date_string} for {JO_name}") - # scrapes the results page with the search parameters and returns the soup. it also returns the html but it's not used at this time - results_html, results_soup = self.scrape_results_page(odyssey_version, base_url, search_url, hidden_values, JO_id, date_string, session, logger, ms_wait) - # get a different scraper for each county - self.get_class_and_method(county) - # gets the county-specific scraper class and method - scraper_instance, scraper_function = self.get_class_and_method(county=county) - if scraper_instance is not None and scraper_function is not None: + + results_html, results_soup = self.scrape_results_page( + odyssey_version, base_url, search_url, hidden_values, JO_id, date_string, session, logger, ms_wait + ) + + scraper_instance, scraper_function = self.get_class_and_method(county, logger) + if scraper_instance and scraper_function: scraper_function(base_url, results_soup, case_html_path, logger, session, ms_wait) else: - print("Error: Could not obtain parser instance or function.") + logger.error("Error: Could not obtain parser instance or function.") - def scrape(self, county, judicial_officers, ms_wait, start_date, end_date, court_calendar_link_text, case_number, case_html_path): - ms_wait, start_date, end_date, court_calendar_link_text, case_number = self.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number) + def scrape( + self, + county: str, + judicial_officers: List[str], + ms_wait: int, + start_date: str, + end_date: str, + court_calendar_link_text: Optional[str], + case_number: Optional[str], + case_html_path: Optional[str] + ) -> None: + ms_wait, start_date, end_date, court_calendar_link_text, case_number = self.set_defaults( + ms_wait, start_date, end_date, court_calendar_link_text, case_number + ) + logger = self.configure_logger() county = self.format_county(county, logger) session = self.create_session(logger) - self.make_directories(county, logger) if not case_html_path else case_html_path + + if case_html_path is None: + self.make_directories(county, logger) + base_url, odyssey_version, notes = self.get_ody_link(county, logger) main_page_html, main_soup = self.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait) - search_url, search_page_html, search_soup = self.scrape_search_page(base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text) + search_url, search_page_html, search_soup = self.scrape_search_page( + base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text + ) + hidden_values = self.get_hidden_values(odyssey_version, main_soup, search_soup, logger) - if case_number: # just scrapes the one case - self.scrape_individual_case(base_url, search_url, hidden_values, case_number, case_html_path, session, logger, ms_wait) - else: # scrape a list of JOs between a start and end date - judicial_officers, judicial_officer_to_ID = self.scrape_jo_list(odyssey_version, search_soup, judicial_officers, logger) - SCRAPER_START_TIME = time() - self.scrape_multiple_cases(odyssey_version, base_url, search_url, hidden_values, judicial_officers, judicial_officer_to_ID, case_html_path, logger, session, ms_wait, start_date, end_date) - logger.info(f"\nTime to run script: {round(time() - 
SCRAPER_START_TIME, 2)} seconds") - -#scraper_instance = Scraper() -#logger = scraper_instance.configure_logger() -#session = scraper_instance.create_session(logger) -#print(type(session)) + + if case_number: + self.scrape_individual_case( + base_url, search_url, hidden_values, case_number, case_html_path, session, logger, ms_wait + ) + else: + judicial_officers, judicial_officer_to_ID = self.scrape_jo_list( + odyssey_version, search_soup, judicial_officers, logger + ) + scraper_start_time = time() + self.scrape_multiple_cases( + county, odyssey_version, base_url, search_url, hidden_values, judicial_officers, judicial_officer_to_ID, + case_html_path, logger, session, ms_wait, start_date, end_date + ) + logger.info(f"\nTime to run script: {round(time() - scraper_start_time, 2)} seconds") diff --git a/src/scraper/scrapcode_post2017.py b/src/scraper/scrapcode_post2017.py new file mode 100644 index 0000000..713d097 --- /dev/null +++ b/src/scraper/scrapcode_post2017.py @@ -0,0 +1,45 @@ +# Not currently in use. Should be moved to a county-specific module, class, and method when a post2017 county is included +"""def scrape_case_data_post2017(self, base_url, case_html_path, session, logger, ms_wait): + # Need to POST this page to get a JSON of the search results after the initial POST + case_list_json = request_page_with_retry( + session=session, + url=urllib.parse.urljoin(base_url, "Hearing/HearingResults/Read"), + verification_text="AggregateResults", + logger=logger, + ) + case_list_json = json.loads(case_list_json) + logger.info(f"{case_list_json['Total']} cases found") + for case_json in case_list_json["Data"]: + case_id = str(case_json["CaseId"]) + logger.info(f"{case_id} scraping case") + # make request for the case + case_html = request_page_with_retry( + session=session, + url=urllib.parse.urljoin(base_url, "Case/CaseDetail"), + verification_text="Case Information", + logger=logger, + ms_wait=ms_wait, + params={ + "eid": case_json["EncryptedCaseId"], + "CaseNumber": case_json["CaseNumber"], + }, + ) + # make request for financial info + case_html += request_page_with_retry( + session=session, + url=urllib.parse.urljoin( + base_url, "Case/CaseDetail/LoadFinancialInformation" + ), + verification_text="Financial", + logger=logger, + ms_wait=ms_wait, + params={ + "caseId": case_json["CaseId"], + }, + ) + # write case html data + logger.info(f"{len(case_html)} response string length") + with open( + os.path.join(case_html_path, f"{case_id}.html"), "w" + ) as file_handle: + file_handle.write(case_html)""" \ No newline at end of file diff --git a/src/tester/test_unittest.py b/src/tester/test_unittest.py index b4d73e2..1de1fbb 100644 --- a/src/tester/test_unittest.py +++ b/src/tester/test_unittest.py @@ -43,7 +43,7 @@ def test_scrape_main_page(self, scraper_instance = Scraper() logger = scraper_instance.configure_logger() ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number) - session = scraper_instance.create_session() + session = scraper_instance.create_session(logger) main_page_html, main_soup = scraper_instance.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait) self.assertIsNotNone(main_page_html, "No main page HTML came through. 
main_page_html = None.") self.assertTrue('ssSearchHyperlink' in main_page_html, "There is no 'ssSearchHyperlink' text found in this main page html.") # Note: This validation is already being done using the 'verification_text' field. @@ -72,7 +72,7 @@ def test_scrape_search_page(self, scraper_instance = Scraper() logger = scraper_instance.configure_logger() ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number) - session = scraper_instance.create_session() + session = scraper_instance.create_session(logger) search_url, search_page_html, search_soup = scraper_instance.scrape_search_page(base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text) # Verify the court calendar link self.assertIsNotNone(main_page_html, "No search url came through. search_url = None.") @@ -141,7 +141,7 @@ def test_scrape_individual_case(self, ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number) logger = scraper_instance.configure_logger() county = scraper_instance.format_county(county) - session = scraper_instance.create_session() + session = scraper_instance.create_session(logger) case_html_path = scraper_instance.make_directories(county) if not case_html_path else case_html_path base_url, odyssey_version, notes = scraper_instance.get_ody_link(county, logger) main_page_html, main_soup = scraper_instance.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait) @@ -194,7 +194,7 @@ def test_scrape_jo_list(self, ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number) logger = scraper_instance.configure_logger() county = scraper_instance.format_county(county) - session = scraper_instance.create_session() + session = scraper_instance.create_session(logger) main_page_html, main_soup = scraper_instance.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait) search_url, search_page_html, search_soup = scraper_instance.scrape_search_page(base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text) judicial_officers, judicial_officer_to_ID = scraper_instance.scrape_jo_list(odyssey_version, search_soup, judicial_officers, logger) @@ -231,7 +231,7 @@ def test_scrape_results_page(self, ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number) logger = scraper_instance.configure_logger() county = scraper_instance.format_county(county) - session = scraper_instance.create_session() + session = scraper_instance.create_session(logger) # Open the example main page HTML with open( os.path.join(os.path.dirname(__file__), "..", "..", "resources", 'test_files','hays_main_page.html'), "r", encoding='utf-8' @@ -293,9 +293,9 @@ def test_scrape_multiple_cases(self, # There are some live depency functions that have to be run before the primary code can be run. 
         scraper_instance = Scraper()
-        session = scraper_instance.create_session()
         ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
         logger = scraper_instance.configure_logger()
+        session = scraper_instance.create_session(logger)
         case_html_path = scraper_instance.make_directories(county) if not case_html_path else case_html_path
         search_url, search_page_html, search_soup = scraper_instance.scrape_search_page(base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text)
         results_html, results_soup = scraper_instance.scrape_results_page(odyssey_version, base_url, search_url, hidden_values, JO_id, date_string, session, logger, ms_wait)

From 24333642256b8e78fd34e657e5d04987966be310 Mon Sep 17 00:00:00 2001
From: Nicolas Sawyer
Date: Sat, 21 Sep 2024 18:03:28 -0500
Subject: [PATCH 04/10] lowercase JO_id

Co-authored-by: Matt Allen
---
 src/scraper/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py
index 6128f99..6d0aafc 100644
--- a/src/scraper/__init__.py
+++ b/src/scraper/__init__.py
@@ -531,7 +531,7 @@ def scrape_results_page(
         base_url: str,
         search_url: str,
         hidden_values: Dict[str, str],
-        JO_id: str,
+        jo_id: str,
         date_string: str,
         session: requests.sessions.Session,
         logger: logging.Logger,

From d639fae80570553acf9f01aae13ebfba1a081107 Mon Sep 17 00:00:00 2001
From: Nicolas Sawyer
Date: Sat, 21 Sep 2024 18:04:13 -0500
Subject: [PATCH 05/10] lowercase Dict[str, str]

Co-authored-by: Matt Allen
---
 src/scraper/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py
index 6d0aafc..98d50f9 100644
--- a/src/scraper/__init__.py
+++ b/src/scraper/__init__.py
@@ -530,7 +530,7 @@ def scrape_results_page(
         odyssey_version: int,
         base_url: str,
         search_url: str,
-        hidden_values: Dict[str, str],
+        hidden_values: dict[str, str],
         jo_id: str,
         date_string: str,
         session: requests.sessions.Session,

From f8133559bfaadff01783ca0809914db8396dc5c4 Mon Sep 17 00:00:00 2001
From: Nicolas Sawyer
Date: Sat, 21 Sep 2024 18:12:26 -0500
Subject: [PATCH 06/10] lowercasing jo_id everywhere

---
 src/scraper/__init__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py
index 5bf7860..1d89d2c 100644
--- a/src/scraper/__init__.py
+++ b/src/scraper/__init__.py
@@ -546,7 +546,7 @@ def scrape_results_page(
             base_url (str): The base URL for constructing full URLs.
             search_url (str): The URL to request search results from.
             hidden_values (Dict[str, str]): Dictionary of hidden input values.
-            JO_id (str): Judicial officer ID for searching.
+            jo_id (str): Judicial officer ID for searching.
             date_string (str): Date string for searching.
             session (requests.sessions.Session): The session object for making HTTP requests.
             logger (logging.Logger): Logger instance for logging information.
@@ -572,7 +572,7 @@ def scrape_results_page(
             url=search_url,
             verification_text=verification_text,
             logger=logger,
-            data=create_search_form_data(date_string, JO_id, hidden_values, odyssey_version),
+            data=create_search_form_data(date_string, jo_id, hidden_values, odyssey_version),
             ms_wait=ms_wait,
         )

@@ -607,11 +607,11 @@ def scrape_multiple_cases(
                 logger.error(f"Judicial officer {JO_name} not found on search page. Continuing.")
                 continue
-            JO_id = judicial_officer_to_ID[JO_name]
+            jo_id = judicial_officer_to_ID[JO_name]
             logger.info(f"Searching cases on {date_string} for {JO_name}")

             results_html, results_soup = self.scrape_results_page(
-                odyssey_version, base_url, search_url, hidden_values, JO_id, date_string, session, logger, ms_wait
+                odyssey_version, base_url, search_url, hidden_values, jo_id, date_string, session, logger, ms_wait
             )

             scraper_instance, scraper_function = self.get_class_and_method(county, logger)

From f02495acfbed1813648f018da64db5b3218bbc3e Mon Sep 17 00:00:00 2001
From: Nicolas Sawyer
Date: Sat, 21 Sep 2024 19:04:51 -0500
Subject: [PATCH 07/10] making ssl parameter and setting default true

---
 src/scraper/__init__.py     | 31 +++++++++++++++++--------------
 src/tester/test_unittest.py | 19 ++++++++++---------
 2 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py
index 1d89d2c..a8f2fa7 100644
--- a/src/scraper/__init__.py
+++ b/src/scraper/__init__.py
@@ -11,6 +11,7 @@ import importlib
 from typing import Optional, Tuple, Callable, Type, List
 import importlib.util
+import re

 class Scraper:
     """Scrape Odyssey html files into an output folder"""
@@ -19,12 +20,13 @@ def __init__(self):

     def set_defaults(
         self,
-        ms_wait: Optional[int] = None,
-        start_date: Optional[str] = None,
-        end_date: Optional[str] = None,
-        court_calendar_link_text: Optional[str] = None,
-        case_number: Optional[str] = None
-    ) -> Tuple[int, str, str, str, Optional[str]]:
+        ms_wait: int | None = None,
+        start_date: str | None = None,
+        end_date: str | None = None,
+        court_calendar_link_text: str | None = None,
+        case_number: str | None = None,
+        ssl: bool | None = None
+    ) -> Tuple[int, str, str, str, Optional[str], bool]:
         """
         Sets default values for the provided optional parameters.

@@ -50,8 +52,9 @@ def set_defaults(
         court_calendar_link_text = court_calendar_link_text if court_calendar_link_text is not None else "Court Calendar"
         # case_number defaults to None if not provided
         case_number = case_number
+        ssl = ssl if ssl is not None else True

-        return ms_wait, start_date, end_date, court_calendar_link_text, case_number
+        return ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl

     def configure_logger(self) -> logging.Logger:
         """
@@ -84,12 +87,10 @@ def format_county(self, county: str) -> str:
         Raises:
             TypeError: If the provided county name is not a string.
         """
-        if not isinstance(county, str):
-            raise TypeError("The county name must be a string.")

-        return county.lower()
+        return re.sub(r'[^\w]+', '', county.lower())

-    def create_session(self, logger: logging.Logger) -> requests.sessions.Session:
+    def create_session(self, logger: logging.Logger, ssl) -> requests.sessions.Session:
         """
         Creates and configures a requests session for interacting with web pages.

         This method sets up a `requests.Session` with SSL verification disabled and suppresses
         related warnings.

         Args:
             logger (logging.Logger): Logger instance for logging errors.

         Returns:
             requests.sessions.Session: Configured session object.
         """
         # Create and configure the session
         session = requests.Session()
-
+
+        # Optionally disable SSL certificate verification. Defaults to True unless False is passed.
+        session.verify = ssl
         requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

         return session
@@ -631,8 +634,8 @@ def scrape(
         case_number: Optional[str],
         case_html_path: Optional[str]
     ) -> None:
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = self.set_defaults(
-            ms_wait, start_date, end_date, court_calendar_link_text, case_number
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = self.set_defaults(
+            ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl
         )

         logger = self.configure_logger()
diff --git a/src/tester/test_unittest.py b/src/tester/test_unittest.py
index 6b27694..5d96011 100644
--- a/src/tester/test_unittest.py
+++ b/src/tester/test_unittest.py
@@ -44,7 +44,7 @@ def test_scrape_main_page(self,
     ):
         scraper_instance = Scraper()
         logger = scraper_instance.configure_logger()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         session = scraper_instance.create_session(logger)
         main_page_html, main_soup = scraper_instance.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait)
         self.assertIsNotNone(main_page_html, "No main page HTML came through. main_page_html = None.")
@@ -73,7 +73,7 @@ def test_scrape_search_page(self,
         # Look for the court calendar link
         scraper_instance = Scraper()
         logger = scraper_instance.configure_logger()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         session = scraper_instance.create_session(logger)
         search_url, search_page_html, search_soup = scraper_instance.scrape_search_page(base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text)
         # Verify the court calendar link
@@ -113,7 +113,7 @@ def test_get_hidden_values(self,
         #Run the function
         scraper_instance = Scraper()
         logger = scraper_instance.configure_logger()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         hidden_values = scraper_instance.get_hidden_values(odyssey_version, main_soup, search_soup, logger)
         self.assertIsNotNone(hidden_values, "No hidden values came through. hidden_values = None.")
         self.assertTrue(type(hidden_values) == dict, "The hidden values fields is not a dictionary but it needs to be.")
@@ -130,7 +130,8 @@ def test_scrape_individual_case(self,
         start_date = None,
         end_date = None,
         court_calendar_link_text = None,
-        case_html_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources", 'test_files', 'test_data', 'hays', "case_html")
+        case_html_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources", 'test_files', 'test_data', 'hays', "case_html"),
+        ssl = True
     ):
         # This starts a timer to compare the run start time to the last updated time of the resulting HTML to ensure the HTML was created after run start time
         now = datetime.now()
@@ -140,10 +141,10 @@ def test_scrape_individual_case(self,
         # Call the functions being tested. In this case, the functions being called are all of the subfunctions required and effectively replicates the shape of scrape.
         scraper_instance = Scraper()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         logger = scraper_instance.configure_logger()
         county = scraper_instance.format_county(county)
-        session = scraper_instance.create_session(logger)
+        session = scraper_instance.create_session(logger, ssl)
         case_html_path = scraper_instance.make_directories(county) if not case_html_path else case_html_path
         base_url, odyssey_version, notes = scraper_instance.get_ody_link(county, logger)
         main_page_html, main_soup = scraper_instance.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait)
@@ -193,7 +194,7 @@ def test_scrape_jo_list(self,
     ):
         # This test requires that certain dependency functions run first.
         scraper_instance = Scraper()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         logger = scraper_instance.configure_logger()
         county = scraper_instance.format_county(county)
         session = scraper_instance.create_session(logger)
@@ -230,7 +231,7 @@ def test_scrape_results_page(self,
         hidden_values = hidden_values.replace("'", "\"")
         hidden_values = json.loads(hidden_values)
         scraper_instance = Scraper()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         logger = scraper_instance.configure_logger()
         county = scraper_instance.format_county(county)
         session = scraper_instance.create_session(logger)
@@ -296,7 +297,7 @@ def test_scrape_multiple_cases(self,
         # There are some live dependency functions that have to be run before the primary code can be run.
         scraper_instance = Scraper()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
         logger = scraper_instance.configure_logger()
         session = scraper_instance.create_session(logger)
         case_html_path = scraper_instance.make_directories(county) if not case_html_path else case_html_path

From 9279ef62b31f7c29bd99b15dab4a073f822fcca8 Mon Sep 17 00:00:00 2001
From: Nicolas Sawyer
Date: Sat, 21 Sep 2024 19:36:31 -0500
Subject: [PATCH 08/10] making case_html_path an optional parameter

---
 src/scraper/__init__.py     | 21 ++++-----
 src/scraper/helpers.py      |  1 -
 src/tester/test_unittest.py | 90 ++++++++++++++++++++++---------------
 3 files changed, 64 insertions(+), 48 deletions(-)

diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py
index a8f2fa7..6774715 100644
--- a/src/scraper/__init__.py
+++ b/src/scraper/__init__.py
@@ -25,8 +25,10 @@ def set_defaults(
         end_date: str | None = None,
         court_calendar_link_text: str | None = None,
         case_number: str | None = None,
-        ssl: bool | None = None
-    ) -> Tuple[int, str, str, str, Optional[str], bool]:
+        ssl: bool | None = None,
+        county: str | None = None,
+        case_html_path: str | None = None,
+    ) -> Tuple[int, str, str, str, Optional[str], bool, str, str]:
         """
         Sets default values for the provided optional parameters.

@@ -53,8 +55,9 @@ def set_defaults(
         # case_number defaults to None if not provided
         case_number = case_number
         ssl = ssl if ssl is not None else True
-
-        return ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl
+        county = county if county is not None else 'hays'
+        case_html_path = case_html_path if case_html_path is not None else os.path.join(os.path.dirname(__file__), "..", "..", "data", county, "case_html")
+        return ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path

     def configure_logger(self) -> logging.Logger:
         """
@@ -112,7 +115,7 @@ def create_session(self, logger: logging.Logger, ssl) -> requests.sessions.Sessi

         return session

-    def make_directories(self, county: str, logger: logging.Logger) -> str:
+    def make_directories(self, county: str, logger: logging.Logger, case_html_path) -> str:
         """
         Creates necessary directories for storing case HTML files.

@@ -129,9 +132,7 @@ def make_directories(self, county: str, logger: logging.Logger, case_html_path)
         Raises:
             OSError: If there is an error creating the directories.
- """ - case_html_path = os.path.join(os.path.dirname(__file__), "..", "..", "data", county, "case_html") - + """ # Create the directories if they do not exist os.makedirs(case_html_path, exist_ok=True) @@ -634,8 +635,8 @@ def scrape( case_number: Optional[str], case_html_path: Optional[str] ) -> None: - ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = self.set_defaults( - ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl + ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path = self.set_defaults( + ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path ) logger = self.configure_logger() diff --git a/src/scraper/helpers.py b/src/scraper/helpers.py index f3d48e8..8b29c36 100644 --- a/src/scraper/helpers.py +++ b/src/scraper/helpers.py @@ -5,7 +5,6 @@ from logging import Logger from typing import Dict, Optional, Tuple, Literal from enum import Enum -from datetime import datetime, timezone, timedelta #This is called debug and quit. def write_debug_and_quit( diff --git a/src/tester/test_unittest.py b/src/tester/test_unittest.py index 5d96011..9d4fa52 100644 --- a/src/tester/test_unittest.py +++ b/src/tester/test_unittest.py @@ -1,5 +1,5 @@ -import unittest, sys, os, json, warnings, requests, logging -from datetime import datetime, timezone, timedelta +import unittest, sys, os, json, warnings, logging +from datetime import datetime, timedelta from bs4 import BeautifulSoup current_dir = os.path.dirname(os.path.abspath(__file__)) @@ -29,7 +29,7 @@ def test_scrape_get_ody_link(self, scraper_instance = Scraper() logger = scraper_instance.configure_logger() county = scraper_instance.format_county(county) - base_url = scraper_instance.get_ody_link('hays', logger) + base_url = scraper_instance.get_ody_link(county, logger) self.assertIsNotNone(base_url, "No URL found for this county.") def test_scrape_main_page(self, @@ -40,12 +40,15 @@ def test_scrape_main_page(self, start_date = None, end_date = None, court_calendar_link_text = None, - case_number = None + case_number = None, + ssl = True, + case_html_path = None, + county = 'hays' ): scraper_instance = Scraper() logger = scraper_instance.configure_logger() - ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl) - session = scraper_instance.create_session(logger) + ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path) + session = scraper_instance.create_session(logger, ssl) main_page_html, main_soup = scraper_instance.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait) self.assertIsNotNone(main_page_html, "No main page HTML came through. main_page_html = None.") self.assertTrue('ssSearchHyperlink' in main_page_html, "There is no 'ssSearchHyperlink' text found in this main page html.") # Note: This validation is already being done using the 'verification_text' field. 
@@ -62,7 +65,11 @@ def test_scrape_search_page(self,
         court_calendar_link_text = None,
         start_date = None,
         end_date = None,
-        case_number = None):
+        case_number = None,
+        ssl = True,
+        case_html_path = None,
+        county = 'hays'
+    ):
         # Open the mocked main page HTML
         with open(
             os.path.join(os.path.dirname(__file__), "..", "..", "resources", 'test_files','hays_main_page.html'), "r", encoding='utf-8'
@@ -73,8 +80,8 @@ def test_scrape_search_page(self,
         # Look for the court calendar link
         scraper_instance = Scraper()
         logger = scraper_instance.configure_logger()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
-        session = scraper_instance.create_session(logger)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path)
+        session = scraper_instance.create_session(logger, ssl)
         search_url, search_page_html, search_soup = scraper_instance.scrape_search_page(base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text)
         # Verify the court calendar link
         self.assertIsNotNone(main_page_html, "No search url came through. search_url = None.")
@@ -93,7 +100,11 @@ def test_get_hidden_values(self,
         court_calendar_link_text = None,
         start_date = None,
         end_date = None,
-        case_number = None):
+        case_number = None,
+        ssl = True,
+        case_html_path = None,
+        county = 'hays'
+    ):
         # Open the mocked main page HTML
         with open(
             os.path.join(os.path.dirname(__file__), "..", "..", "resources", 'test_files','hays_main_page.html'), "r", encoding='utf-8'
@@ -113,7 +124,7 @@ def test_get_hidden_values(self,
         #Run the function
         scraper_instance = Scraper()
         logger = scraper_instance.configure_logger()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path)
         hidden_values = scraper_instance.get_hidden_values(odyssey_version, main_soup, search_soup, logger)
         self.assertIsNotNone(hidden_values, "No hidden values came through. hidden_values = None.")
         self.assertTrue(type(hidden_values) == dict, "The hidden values fields is not a dictionary but it needs to be.")
@@ -131,7 +142,7 @@ def test_scrape_individual_case(self,
         end_date = None,
         court_calendar_link_text = None,
         case_html_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources", 'test_files', 'test_data', 'hays', "case_html"),
-        ssl = True
+        ssl = True,
     ):
         # This starts a timer to compare the run start time to the last updated time of the resulting HTML to ensure the HTML was created after run start time
         now = datetime.now()
@@ -141,7 +152,7 @@ def test_scrape_individual_case(self,
         # Call the functions being tested. In this case, the functions being called are all of the subfunctions required and effectively replicates the shape of scrape.
         scraper_instance = Scraper()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path)
         logger = scraper_instance.configure_logger()
         county = scraper_instance.format_county(county)
         session = scraper_instance.create_session(logger, ssl)
@@ -191,13 +202,15 @@ def test_scrape_jo_list(self,
         county = 'hays',
         session = None,
         logger = None,
+        ssl = True,
+        case_html_path = None,
     ):
         # This test requires that certain dependency functions run first.
         scraper_instance = Scraper()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path)
         logger = scraper_instance.configure_logger()
         county = scraper_instance.format_county(county)
-        session = scraper_instance.create_session(logger)
+        session = scraper_instance.create_session(logger, ssl)
         main_page_html, main_soup = scraper_instance.scrape_main_page(base_url, odyssey_version, session, notes, logger, ms_wait)
         search_url, search_page_html, search_soup = scraper_instance.scrape_search_page(base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text)
         judicial_officers, judicial_officer_to_ID = scraper_instance.scrape_jo_list(odyssey_version, search_soup, judicial_officers, logger)
@@ -220,7 +233,9 @@ def test_scrape_results_page(self,
         start_date = None,
         end_date = None,
         court_calendar_link_text = None,
-        case_number = None
+        case_number = None,
+        ssl = True,
+        case_html_path = None,
     ):

         # Read in the test 'hidden values' that are necessary for searching a case
@@ -231,10 +246,10 @@ def test_scrape_results_page(self,
         hidden_values = hidden_values.replace("'", "\"")
         hidden_values = json.loads(hidden_values)
         scraper_instance = Scraper()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path)
         logger = scraper_instance.configure_logger()
         county = scraper_instance.format_county(county)
-        session = scraper_instance.create_session(logger)
+        session = scraper_instance.create_session(logger, ssl)
         # Open the example main page HTML
         with open(
             os.path.join(os.path.dirname(__file__), "..", "..", "resources", 'test_files','hays_main_page.html'), "r", encoding='utf-8'
@@ -256,23 +271,24 @@ def test_scrape_results_page(self,
     #def scrape_case_data_post2017()

     @unittest.skipIf(SKIP_SLOW, "slow")
-    def test_scrape_multiple_cases(self,
-        county = 'hays',
-        odyssey_version = 2003,
-        base_url = r'http://public.co.hays.tx.us/',
-        search_url = r'https://public.co.hays.tx.us/Search.aspx?ID=900',
-        hidden_values = None,
-        judicial_officers = ['Boyer, Bruce'],
-        judicial_officer_to_ID = {'Boyer, Bruce':'39607'},
-        JO_id = '39607',
-        date_string = '07-01-2024',
-        court_calendar_link_text = None,
-        case_number = None,
-        ms_wait = 200,
-        start_date = '2024-07-01',
-        end_date = '2024-07-01',
-        case_html_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources", 'test_files', 'test_data', 'hays', "case_html")
-    ):
+    def test_scrape_multiple_cases(self,
+        county = 'hays',
+        odyssey_version = 2003,
+        base_url = r'http://public.co.hays.tx.us/',
+        search_url = r'https://public.co.hays.tx.us/Search.aspx?ID=900',
+        hidden_values = None,
+        judicial_officers = ['Boyer, Bruce'],
+        judicial_officer_to_ID = {'Boyer, Bruce':'39607'},
+        JO_id = '39607',
+        date_string = '07-01-2024',
+        court_calendar_link_text = None,
+        case_number = None,
+        ms_wait = 200,
+        start_date = '2024-07-01',
+        end_date = '2024-07-01',
+        case_html_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources", 'test_files', 'test_data', 'hays', "case_html"),
+        ssl = True,
+    ):
         # This starts a timer to compare the run start time to the last updated time of the resulting HTML to ensure the HTML was created after run start time
         now = datetime.now()
@@ -297,9 +313,9 @@ def test_scrape_multiple_cases(self,
         # There are some live dependency functions that have to be run before the primary code can be run.
         scraper_instance = Scraper()
-        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl)
+        ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path = scraper_instance.set_defaults(ms_wait, start_date, end_date, court_calendar_link_text, case_number, ssl, county, case_html_path)
         logger = scraper_instance.configure_logger()
-        session = scraper_instance.create_session(logger)
+        session = scraper_instance.create_session(logger, ssl)
         case_html_path = scraper_instance.make_directories(county) if not case_html_path else case_html_path
         search_url, search_page_html, search_soup = scraper_instance.scrape_search_page(base_url, odyssey_version, main_page_html, main_soup, session, logger, ms_wait, court_calendar_link_text)
         results_html, results_soup = scraper_instance.scrape_results_page(odyssey_version, base_url, search_url, hidden_values, JO_id, date_string, session, logger, ms_wait)

From e569d6a284bfe7c67da3f1974819919dcb0a68b2 Mon Sep 17 00:00:00 2001
From: Nicolas Sawyer
Date: Sat, 21 Sep 2024 22:21:49 -0500
Subject: [PATCH 09/10] scraper: reformatting doc strings

---
 src/scraper/__init__.py | 248 +++++++++++++++++-----------------------
 1 file changed, 108 insertions(+), 140 deletions(-)

diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py
index 6774715..c829d5a 100644
--- a/src/scraper/__init__.py
+++ b/src/scraper/__init__.py
@@ -32,21 +32,27 @@ def set_defaults(
         """
         Sets default values for the provided optional parameters.

-        Args:
-            ms_wait (Optional[int]): Milliseconds to wait, default is 200 if not provided.
-            start_date (Optional[str]): Start date in YYYY-MM-DD format, default is '2024-07-01' if not provided.
-            end_date (Optional[str]): End date in YYYY-MM-DD format, default is '2024-07-01' if not provided.
-            court_calendar_link_text (Optional[str]): Text for court calendar link, default is 'Court Calendar' if not provided.
-            case_number (Optional[str]): Case number, default is None if not provided.
-
-        Returns:
-            Tuple[int, str, str, str, Optional[str]]: A tuple containing:
-            - `ms_wait` (int): Milliseconds to wait.
-            - `start_date` (str): Start date.
-            - `end_date` (str): End date.
-            - `court_calendar_link_text` (str): Text for court calendar link.
-            - `case_number` (Optional[str]): Case number or None.
+        Defaults:
+        - `ms_wait`: 200 milliseconds if not provided.
+        - `start_date`: '2024-07-01' if not provided.
+        - `end_date`: '2024-07-01' if not provided.
+        - `court_calendar_link_text`: 'Court Calendar' if not provided.
+        - `case_number`: None if not provided.
+
+        :param ms_wait: Milliseconds to wait.
+        :param start_date: Start date in YYYY-MM-DD format.
+        :param end_date: End date in YYYY-MM-DD format.
+        :param court_calendar_link_text: Text for the court calendar link.
+        :param case_number: Case number, or None.
+
+        :returns: A tuple containing:
+            - ms_wait (int): Milliseconds to wait.
+            - start_date (str): Start date.
+            - end_date (str): End date.
+            - court_calendar_link_text (str): Text for court calendar link.
+            - case_number (Optional[str]): Case number or None.
         """
+
         # Assign default values if parameters are not provided
         ms_wait = ms_wait if ms_wait is not None else 200
         start_date = start_date if start_date is not None else '2024-07-01'
@@ -66,8 +72,7 @@ def configure_logger(self) -> logging.Logger:
         This method sets up the logger with a unique name based on the process ID,
         configures the logging level to INFO, and logs an initialization message.

-        Returns:
-            logging.Logger: Configured logger instance.
+        :returns: Configured logger instance.
         """
         # Configure the logger
         logger = logging.getLogger(name=f"pid: {os.getpid()}")
@@ -81,30 +86,22 @@ def format_county(self, county: str) -> str:
         """
         Formats the county name to lowercase.

-        Args:
-            county (str): The name of the county to be formatted.
-
-        Returns:
-            str: The county name in lowercase.
-
-        Raises:
-            TypeError: If the provided county name is not a string.
+        :param county: The name of the county to be formatted.
+        :returns: The county name in lowercase.
+        :raises TypeError: If the provided county name is not a string.
         """

         return re.sub(r'[^\w]+', '', county.lower())

     def create_session(self, logger: logging.Logger, ssl) -> requests.sessions.Session:
         """
-        Creates and configures a requests session for interacting with web pages.
-
-        This method sets up a `requests.Session` with SSL verification disabled and suppresses
+        Sets up a `requests.Session` with or without SSL verification and suppresses
         related warnings.

-        Args:
-            logger (logging.Logger): Logger instance for logging errors.
+        Defaults to enable SSL.

-        Returns:
-            requests.sessions.Session: Configured session object.
+        :param logger: Logger instance for logging errors.
+        :returns: Configured session object.
         """
         # Create and configure the session
         session = requests.Session()
@@ -123,16 +120,12 @@ def make_directories(self, county: str, logger: logging.Logger, case_html_path)
         all required directories in the path are created. If the directories already exist,
         no action is taken.

-        Args:
-            county (str): The name of the county, used to create a specific directory path.
-            logger (logging.Logger): Logger instance for logging errors.
-
-        Returns:
-            str: The path to the created directories.
+        :param county: The name of the county, used to create a specific directory path.
+        :param logger: Logger instance for logging errors.
+        :returns: The path to the created directories.
+        :raises OSError: If there is an error creating the directories.
+        """

-        Raises:
-            OSError: If there is an error creating the directories.
- """ # Create the directories if they do not exist os.makedirs(case_html_path, exist_ok=True) @@ -146,24 +139,19 @@ def get_ody_link(self, """ Retrieves Odyssey-related information for a given county from a CSV file. - This function reads county-specific data from a CSV file located in the `resources` directory. It searches for - the county name in the CSV file, extracts the corresponding base URL, Odyssey version, and any additional notes. - The base URL is formatted with a trailing slash if necessary. - - Args: - county (str): The name of the county for which to retrieve Odyssey information. - logger (logging.Logger): Logger instance for logging errors and information. - - Returns: - Tuple[str, str, str]: A tuple containing: - - `base_url` (str): The base URL for the county’s portal. - - `odyssey_version` (str): The major version of Odyssey associated with the county. - - `notes` (str): Additional notes related to the county. - - Raises: - Exception: If the county is not found in the CSV file or if required data is missing, an exception is raised - and logged. + This function reads county-specific data from a CSV file located in the `resources` directory. + It searches for the county name in the CSV file, extracts the corresponding base URL, Odyssey + version, and any additional notes. The base URL is formatted with a trailing slash if necessary. + + :param county: The name of the county for which to retrieve Odyssey information. + :param logger: Logger instance for logging errors and information. + :returns: A tuple containing: + - base_url (str): The base URL for the county’s portal. + - odyssey_version (str): The major version of Odyssey associated with the county. + - notes (str): Additional notes related to the county. + :raises Exception: If the county is not found in the CSV file or if required data is missing. """ + try: base_url = odyssey_version = notes = None with open( @@ -196,18 +184,14 @@ def get_class_and_method( """ Dynamically imports a module, retrieves a class, and gets a method from it based on the county name. - Args: - county (str): The name of the county, used to construct module, class, and method names. - logger (logging.Logger): Logger instance for logging errors. - - Returns: - Tuple[Type[object], Callable]: A tuple containing the instance of the class and the method callable. - - Raises: - ImportError: If the module cannot be imported. - AttributeError: If the class or method cannot be found. - Exception: For any other unexpected errors. + :param county: The name of the county, used to construct module, class, and method names. + :param logger: Logger instance for logging errors. + :returns: A tuple containing the instance of the class and the method callable. + :raises ImportError: If the module cannot be imported. + :raises AttributeError: If the class or method cannot be found. + :raises Exception: For any other unexpected errors. """ + module_name = county class_name = f"Scraper{county.capitalize()}" method_name = f"scraper_{county}" @@ -250,28 +234,24 @@ def scrape_main_page(self, ms_wait: int ) -> Tuple[str, BeautifulSoup]: """ - Scrapes the main page of the Odyssey site, handling login if required and returning the page's HTML and parsed content. + Scrapes the main page of the Odyssey site, handling login if required, and returns the page's HTML and parsed content. This function handles a special case where some sites may require a public guest login. 
         If the `notes` parameter contains a "PUBLICLOGIN#" identifier, it will extract the username and password
         from the `notes`, perform the login, and then proceed to scrape the main page.

-        Args:
-            base_url (str): The base URL of the main page to scrape.
-            odyssey_version (int): The version of Odyssey; currently not used in this function.
-            session (requests.sessions.Session): The `requests` session object used for making HTTP requests.
-            notes (str): A string containing notes that may include login credentials in the format "PUBLICLOGIN#username/password".
-            logger (logging.Logger): Logger instance for logging errors and debug information.
-            ms_wait (int): The number of milliseconds to wait between retry attempts.
-
-        Returns:
-            Tuple[str, BeautifulSoup]: A tuple containing:
-            - `main_page_html` (str): The raw HTML content of the main page.
-            - `main_soup` (BeautifulSoup): A BeautifulSoup object containing the parsed HTML content.
-
-        Raises:
-            Exception: If any error occurs during the HTTP requests or HTML parsing, an exception is raised and logged.
+        :param base_url: The base URL of the main page to scrape.
+        :param odyssey_version: The version of Odyssey; currently not used in this function.
+        :param session: The `requests` session object used for making HTTP requests.
+        :param notes: A string containing notes that may include login credentials in the format "PUBLICLOGIN#username/password".
+        :param logger: Logger instance for logging errors and debug information.
+        :param ms_wait: The number of milliseconds to wait between retry attempts.
+        :returns: A tuple containing:
+            - main_page_html (str): The raw HTML content of the main page.
+            - main_soup (BeautifulSoup): A BeautifulSoup object containing the parsed HTML content.
+        :raises Exception: If any error occurs during the HTTP requests or HTML parsing.
         """
+
         try:
             # some sites have a public guest login that must be used
             if "PUBLICLOGIN#" in notes:
@@ -326,22 +306,18 @@ def scrape_search_page(
         and retrieves the search page HTML. Depending on the Odyssey version, it either uses the extracted URL
         or a default URL. It then parses the search page HTML into a BeautifulSoup object.

-        Args:
-            base_url (str): The base URL for constructing full URLs.
-            odyssey_version (int): The version of Odyssey, used to determine the correct URL and verification text.
-            main_page_html (str): The HTML content of the main page.
-            main_soup (BeautifulSoup): Parsed BeautifulSoup object of the main page HTML.
-            session (requests.sessions.Session): The session object for making HTTP requests.
-            logger (logging.Logger): Logger instance for logging errors and information.
-            ms_wait (int): Milliseconds to wait before making requests.
-            court_calendar_link_text (str): Text to search for in the court calendar link.
-
-        Returns:
-            Tuple[str, str, BeautifulSoup]: A tuple containing the search page URL, search page HTML, and the BeautifulSoup object of the search page.
-
-        Raises:
-            ValueError: If the court calendar link is not found on the main page.
+        :param base_url: The base URL for constructing full URLs.
+        :param odyssey_version: The version of Odyssey, used to determine the correct URL and verification text.
+        :param main_page_html: The HTML content of the main page.
+        :param main_soup: Parsed BeautifulSoup object of the main page HTML.
+        :param session: The session object for making HTTP requests.
+        :param logger: Logger instance for logging errors and information.
+        :param ms_wait: Milliseconds to wait before making requests.
+        :param court_calendar_link_text: Text to search for in the court calendar link.
+        :returns: A tuple containing the search page URL, search page HTML, and the BeautifulSoup object of the search page.
+        :raises ValueError: If the court calendar link is not found on the main page.
         """
+
         # Extract the search page ID from the court calendar link
         search_page_id = None
         for link in main_soup.select("a.ssSearchHyperlink"):
@@ -391,15 +367,13 @@ def get_hidden_values(
         """
         Extracts hidden input values and additional data from the search page.

-        Args:
-            odyssey_version (int): The version of Odyssey to determine logic.
-            main_soup (BeautifulSoup): Parsed BeautifulSoup object of the main page HTML.
-            search_soup (BeautifulSoup): Parsed BeautifulSoup object of the search page HTML.
-            logger (logging.Logger): Logger instance for logging information.
-
-        Returns:
-            Dict[str, str]: Dictionary of hidden input names and their values.
+        :param odyssey_version: The version of Odyssey to determine logic.
+        :param main_soup: Parsed BeautifulSoup object of the main page HTML.
+        :param search_soup: Parsed BeautifulSoup object of the search page HTML.
+        :param logger: Logger instance for logging information.
+        :returns: Dictionary of hidden input names and their values.
         """
+
         # Extract hidden input values
         hidden_values = {
             hidden["name"]: hidden["value"]
@@ -432,17 +406,15 @@ def get_search_results(
         """
         Retrieves search results from the search page.

-        Args:
-            session (requests.sessions.Session): The session object for making HTTP requests.
-            search_url (str): The URL to request search results from.
-            logger (logging.Logger): Logger instance for logging information.
-            ms_wait (int): Milliseconds to wait before making requests.
-            hidden_values (Dict[str, str]): Dictionary of hidden input values.
-            case_number (Optional[str]): Case number for searching.
-
-        Returns:
-            BeautifulSoup: Parsed BeautifulSoup object of the search results page HTML.
+        :param session: The session object for making HTTP requests.
+        :param search_url: The URL to request search results from.
+        :param logger: Logger instance for logging information.
+        :param ms_wait: Milliseconds to wait before making requests.
+        :param hidden_values: Dictionary of hidden input values.
+        :param case_number: Case number for searching.
+        :returns: Parsed BeautifulSoup object of the search results page HTML.
         """
+
         results_page_html = request_page_with_retry(
             session=session,
             url=search_url,
@@ -502,19 +474,17 @@ def scrape_jo_list(
         logger: logging.Logger
     ) -> Tuple[List[str], Dict[str, str]]:
         """
-        Scrapes a list of judicial officers and their IDs from the search page.
-
-        Optionally receives a list of judicial officers to scrape.
-
-        Args:
-            odyssey_version (int): The version of Odyssey to determine the selector.
-            search_soup (BeautifulSoup): Parsed BeautifulSoup object of the search page HTML.
-            judicial_officers (Optional[List[str]]): List of specific judicial officers to use.
-            logger (logging.Logger): Logger instance for logging information.
-
-        Returns:
-            Tuple[List[str], Dict[str, str]]: Tuple containing a list of judicial officers to use and a dictionary of judicial officers and their IDs.
+        Scrapes a list of judicial officers and their IDs from the search page.

+        Optionally receives a list of judicial officers to scrape.

+        :param odyssey_version: The version of Odyssey to determine the selector.
+        :param search_soup: Parsed BeautifulSoup object of the search page HTML.
+        :param judicial_officers: List of specific judicial officers to use.
+        :param logger: Logger instance for logging information.
+        :returns: Tuple containing a list of judicial officers to use and a dictionary of judicial officers and their IDs.
         """
+
         selector = 'select[labelname="Judicial Officer:"] > option' if odyssey_version < 2017 else 'select[id="selHSJudicialOfficer"] > option'
         judicial_officer_to_ID = {
             option.text: option["value"]
@@ -545,20 +515,18 @@ def scrape_results_page(
         """
         Scrapes the results page based on Odyssey version and search criteria.

-        Args:
-            odyssey_version (int): The version of Odyssey to determine the URL and verification text.
-            base_url (str): The base URL for constructing full URLs.
-            search_url (str): The URL to request search results from.
-            hidden_values (Dict[str, str]): Dictionary of hidden input values.
-            jo_id (str): Judicial officer ID for searching.
-            date_string (str): Date string for searching.
-            session (requests.sessions.Session): The session object for making HTTP requests.
-            logger (logging.Logger): Logger instance for logging information.
-            ms_wait (int): Milliseconds to wait before making requests.
-
-        Returns:
-            Tuple[str, BeautifulSoup]: A tuple containing the HTML of the results page and the parsed BeautifulSoup object.
+        :param odyssey_version: The version of Odyssey to determine the URL and verification text.
+        :param base_url: The base URL for constructing full URLs.
+        :param search_url: The URL to request search results from.
+        :param hidden_values: Dictionary of hidden input values.
+        :param jo_id: Judicial officer ID for searching.
+        :param date_string: Date string for searching.
+        :param session: The session object for making HTTP requests.
+        :param logger: Logger instance for logging information.
+        :param ms_wait: Milliseconds to wait before making requests.
+        :returns: A tuple containing the HTML of the results page and the parsed BeautifulSoup object.
         """
+
         search_url = (
             search_url
             if odyssey_version < 2017

From b2849f1c28e0cd079b146ddfd6697f5ede860ff8 Mon Sep 17 00:00:00 2001
From: Nicolas Sawyer
Date: Sun, 22 Sep 2024 10:13:34 -0500
Subject: [PATCH 10/10] scraper cleanup: responding to code review comments

---
 src/scraper/__init__.py | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/src/scraper/__init__.py b/src/scraper/__init__.py
index c829d5a..cd0881c 100644
--- a/src/scraper/__init__.py
+++ b/src/scraper/__init__.py
@@ -154,6 +154,7 @@ def get_ody_link(self,
         try:
             base_url = odyssey_version = notes = None
+            # CSV is located in 'resources' folder
             with open(
                 os.path.join(os.path.dirname(__file__), "..", "..", "resources", "texas_county_data.csv"),
                 mode="r",
@@ -172,7 +173,7 @@ def get_ody_link(self,
             if not base_url or not odyssey_version:
                 raise Exception("The required data to scrape this county is not in /resources/texas_county_data.csv")
         except Exception as e:
-            logger.info(f"Error getting county-specific information from csv: {e}")
+            logger.exception("Error getting county-specific information from csv.")
             raise
         return base_url, odyssey_version, notes

@@ -181,6 +182,7 @@ def get_class_and_method(
         county: str,
         logger: logging.Logger
     ) -> Tuple[Type[object], Callable]:
+
         """
         Dynamically imports a module, retrieves a class, and gets a method from it based on the county name.

         :param county: The name of the county, used to construct module, class, and method names.
         :param logger: Logger instance for logging errors.
         :returns: A tuple containing the instance of the class and the method callable.
         :raises ImportError: If the module cannot be imported.
         :raises AttributeError: If the class or method cannot be found.
-        :raises Exception: For any other unexpected errors.
         """

         module_name = county
         class_name = f"Scraper{county.capitalize()}"
         method_name = f"scraper_{county}"
@@ -219,10 +220,7 @@ def get_class_and_method(
             return instance, method
         except (FileNotFoundError, ImportError, AttributeError) as e:
-            logger.error(f"Error dynamically loading module or retrieving class/method: {e}")
-            raise
-        except Exception as e:
-            logger.error(f"Unexpected error: {e}")
+            logger.exception("Error dynamically loading module or retrieving class/method.")
             raise

     def scrape_main_page(self,
@@ -264,8 +262,7 @@ def scrape_main_page(self,
                     "SignOn": "Sign On",
                 }

-                # not sure how this is being used. response doesn't seem to be used anywhere, but it may just be opening the page.
-                response = request_page_with_retry(
+                request_page_with_retry(
                     session=session,
                     url=urllib.parse.urljoin(base_url, "login.aspx"),
                     logger=logger,
@@ -284,7 +281,7 @@ def scrape_main_page(self,
             )
             main_soup = BeautifulSoup(main_page_html, "html.parser")
         except Exception as e:
-            logger.error(f"Error scraping main page for main page HTML: {e}")
+            logger.exception("Error scraping main page for main page HTML.")
             raise
         return main_page_html, main_soup

@@ -582,15 +579,12 @@ def scrape_multiple_cases(
             jo_id = judicial_officer_to_ID[JO_name]
             logger.info(f"Searching cases on {date_string} for {JO_name}")

-            results_html, results_soup = self.scrape_results_page(
+            _, results_soup = self.scrape_results_page(
                 odyssey_version, base_url, search_url, hidden_values, jo_id, date_string, session, logger, ms_wait
             )

-            scraper_instance, scraper_function = self.get_class_and_method(county, logger)
-            if scraper_instance and scraper_function:
-                scraper_function(base_url, results_soup, case_html_path, logger, session, ms_wait)
-            else:
-                logger.error("Error: Could not obtain parser instance or function.")
+            _, scraper_function = self.get_class_and_method(county, logger)
+            scraper_function(base_url, results_soup, case_html_path, logger, session, ms_wait)

     def scrape(
         self,
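# A usage sketch, not part of the patches, showing how the refactored methods
# compose as of patch 10. It assumes only the defaults baked into set_defaults
# (the 'hays' county and its data directory); adjust for other counties:
#
#     scraper_instance = Scraper()
#     (ms_wait, start_date, end_date, court_calendar_link_text,
#      case_number, ssl, county, case_html_path) = scraper_instance.set_defaults()
#     logger = scraper_instance.configure_logger()
#     session = scraper_instance.create_session(logger, ssl)
#     case_html_path = scraper_instance.make_directories(county, logger, case_html_path)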