From f5171a1153619b4ef79e058118f187e78aedee56 Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Sun, 7 Jul 2024 16:00:18 -0400 Subject: [PATCH 01/18] Update README examples w/ x-platform escape chars --- README.md | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 128f1e8..03ee5e7 100644 --- a/README.md +++ b/README.md @@ -38,31 +38,57 @@ This is a command line tool that takes a search query, queries a server, and dow ### Examples -```bash -# Display help message describing all supported arguments along with their usage, aliases and eventual default values (type q to exit) +Display help message describing all supported arguments along with their usage, aliases and eventual default values (type `q` to exit) + +```shell edgar-tool text_search --help +``` + +Basic usage (defaults to searching the last 5 years of records) -# Basic usage (defaults to searching the last 5 years of records) +```shell edgar-tool text_search John Doe +``` + +You can wrap a phrase in quotes if you want an exact match, just remember to wrap the phrase in single quotes. +This works in both POSIX-compliant shells (Linux/Bash) and Windows PowerShell environments. + +For example, the following usage will search for the exact phrase `"John Doe"` and treat `Pharmaceuticals` and +`Chemicals` as partial search parameters. + +```shell +edgar-tool text_search '"John Doe"' Pharmaceuticals Chemicals +``` -# Basic usage with a combination of exact and partial search parameters -edgar-tool text_search \"John Doe\" Pharmaceuticals Chemicals +Usage with date range and export to custom CSV file -# Usage with date range and export to custom CSV file +```shell edgar-tool text_search Tsunami Hazards --start_date "2021-01-01" --end_date "2021-12-31" --output "results.csv" +``` + +Usage with a partial set of filing forms + single forms -# Usage with a partial set of filing forms + single forms +```shell edgar-tool text_search Hurricane Damage --filing_form "registration_statements" --single_forms "['1-K', '1-SA']" +``` -# Usage specifying the location of incorporation +Usage specifying the location of incorporation + +```shell edgar-tool text_search oil --inc_in "Egypt" +``` -# More advanced usage specifying more arguments, with export to JSON +More advanced usage specifying more arguments, with export to JSON + +```shell edgar-tool text_search Volcano Monitoring --start_date "2021-01-01" --end_date "2021-12-31" --output "results.json"\ --filing_form "all_annual_quarterly_and_current_reports" --entity_id "0001030717" \ --min_wait 5.0 --max_wait 7.0 --retries 3 - -# Using aliases where supported and exporting to JSONLines +``` + +Using aliases where supported and exporting to JSONLines + +```shell edgar-tool text_search Calabarzon -s "2021-01-01" -o "results.jsonl" -f "all_annual_quarterly_and_current_reports" -r 3 -h ``` From 549273e4e79cbde1e3bf4192140adc8d920aeddf Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Tue, 9 Jul 2024 23:16:55 +0000 Subject: [PATCH 02/18] Use urllib.parse.quote to encode quote characters --- edgar_tool/{main.py => __main__.py} | 4 ++-- edgar_tool/text_search.py | 2 +- pyproject.toml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) rename edgar_tool/{main.py => __main__.py} (73%) diff --git a/edgar_tool/main.py b/edgar_tool/__main__.py similarity index 73% rename from edgar_tool/main.py rename to edgar_tool/__main__.py index 15f8d05..83874d5 100644 --- a/edgar_tool/main.py +++ b/edgar_tool/__main__.py @@ -2,9 +2,9 @@ import fire -def main_entrypoint(): +def main(): fire.Fire(SecEdgarScraperCli) if __name__ == "__main__": - main_entrypoint() + main() diff --git a/edgar_tool/text_search.py b/edgar_tool/text_search.py index 656bd7a..9bac287 100644 --- a/edgar_tool/text_search.py +++ b/edgar_tool/text_search.py @@ -218,7 +218,7 @@ def _generate_request_args( # Generate request arguments request_args = { - "q": keywords, + "q": urllib.parse.quote(keywords), "dateRange": "custom", "startdt": start_date.strftime("%Y-%m-%d"), "enddt": end_date.strftime("%Y-%m-%d"), diff --git a/pyproject.toml b/pyproject.toml index a973f33..7a016b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ keywords=["scraper", "edgar", "finance", "sec"] "Bug Tracker" = "https://github.com/bellingcat/EDGAR/issues" [tool.poetry.scripts] -edgar-tool = "edgar_tool.main:main_entrypoint" +edgar-tool = "edgar_tool.__main__:main" [tool.poetry.dependencies] python = "^3.9" @@ -36,4 +36,4 @@ black = "^24.2.0" [build-system] requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" \ No newline at end of file +build-backend = "poetry.core.masonry.api" From 53576dcbd3e011e6a87a53a3b009aa9217788977 Mon Sep 17 00:00:00 2001 From: Galen Reich <54807169+GalenReich@users.noreply.github.com> Date: Wed, 10 Jul 2024 10:23:53 +0100 Subject: [PATCH 03/18] Avoid double escaping keywords in url and handle single quotes --- edgar_tool/text_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/edgar_tool/text_search.py b/edgar_tool/text_search.py index 9bac287..52e2b37 100644 --- a/edgar_tool/text_search.py +++ b/edgar_tool/text_search.py @@ -214,11 +214,11 @@ def _generate_request_args( raise ValueError("start_date cannot be after end_date") # Join search keywords into a single string - keywords = " ".join(keywords) + keywords = " ".join([f'"{keyword}"' if " " in keyword else keyword for keyword in keywords]) # Generate request arguments request_args = { - "q": urllib.parse.quote(keywords), + "q": keywords, "dateRange": "custom", "startdt": start_date.strftime("%Y-%m-%d"), "enddt": end_date.strftime("%Y-%m-%d"), From d7f754cd7c81996ac4943422ce6281c42c2fc20b Mon Sep 17 00:00:00 2001 From: Galen Reich <54807169+GalenReich@users.noreply.github.com> Date: Wed, 10 Jul 2024 10:26:38 +0100 Subject: [PATCH 04/18] Update exact search instructions in README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fce02db..7cbb038 100644 --- a/README.md +++ b/README.md @@ -50,14 +50,14 @@ Basic usage (defaults to searching the last 5 years of records) edgar-tool text_search John Doe ``` -You can wrap a phrase in quotes if you want an exact match, just remember to wrap the phrase in single quotes. +You can wrap a phrase in quotes if you want an exact match. This works in both POSIX-compliant shells (Linux/Bash) and Windows PowerShell environments. For example, the following usage will search for the exact phrase `"John Doe"` and treat `Pharmaceuticals` and `Chemicals` as partial search parameters. ```shell -edgar-tool text_search '"John Doe"' Pharmaceuticals Chemicals +edgar-tool text_search "John Doe" Pharmaceuticals Chemicals ``` Usage with date range and export to custom CSV file From 164bbb2f1e03fd6f9e55cbc52d3e44961757a0fd Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Sun, 14 Jul 2024 00:48:24 +0000 Subject: [PATCH 05/18] feat: Add search param validator and basic tests --- edgar_tool/cli.py | 19 ++++++----- edgar_tool/page_fetcher.py | 4 ++- edgar_tool/text_search.py | 34 +++++++++++++------ edgar_tool/url_generator.py | 67 +++++++++++++++++++++++++++++++++++++ edgar_tool/utils.py | 61 +++++++++++++++++++-------------- tests/test_url_generator.py | 46 +++++++++++++++++++++++++ 6 files changed, 186 insertions(+), 45 deletions(-) create mode 100644 edgar_tool/url_generator.py create mode 100644 tests/test_url_generator.py diff --git a/edgar_tool/cli.py b/edgar_tool/cli.py index 06b8ae5..8c2593c 100644 --- a/edgar_tool/cli.py +++ b/edgar_tool/cli.py @@ -57,15 +57,16 @@ def _validate_text_search_args( ): raise ValueError( f"Filing form group must be one of: {'; '.join(TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING.keys())}" - ) + ) if single_forms: - single_list = [item for sublist in TEXT_SEARCH_CATEGORY_FORM_GROUPINGS.values() for item in - sublist] + single_list = [ + item + for sublist in TEXT_SEARCH_CATEGORY_FORM_GROUPINGS.values() + for item in sublist + ] invalid_forms = [form for form in single_forms if form not in single_list] if invalid_forms: - raise ValueError( - f"Single forms must be one or more of: {single_list}" - ) + raise ValueError(f"Single forms must be one or more of: {single_list}") class SecEdgarScraperCli: @@ -135,7 +136,9 @@ def text_search( scraper.text_search( keywords=keywords, entity_id=entity_id, - filing_form=TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING.get(filing_form), + filing_form=TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING.get( + filing_form + ), single_forms=single_forms, start_date=start_date, end_date=end_date, @@ -144,7 +147,7 @@ def text_search( retries=retries, destination=output, peo_in=peo_in, - inc_in=inc_in + inc_in=inc_in, ) @staticmethod diff --git a/edgar_tool/page_fetcher.py b/edgar_tool/page_fetcher.py index 1208c07..4892b1d 100644 --- a/edgar_tool/page_fetcher.py +++ b/edgar_tool/page_fetcher.py @@ -23,6 +23,7 @@ def fetch_page( :param stop_after_n: how many times to retry the request before failing :return: wrapper function that takes a check method and retries the request if the page load fails """ + @retry( wait=wait_fixed(uniform(min_wait_seconds, max_wait_seconds)), stop=stop_after_attempt(stop_after_n), @@ -57,5 +58,6 @@ class ResultsTableNotFoundError(Exception): class PageCheckFailedError(Exception): pass + class NoResultsFoundError(Exception): - pass \ No newline at end of file + pass diff --git a/edgar_tool/text_search.py b/edgar_tool/text_search.py index 52e2b37..110132d 100644 --- a/edgar_tool/text_search.py +++ b/edgar_tool/text_search.py @@ -11,7 +11,7 @@ fetch_page, PageCheckFailedError, ResultsTableNotFoundError, - NoResultsFoundError + NoResultsFoundError, ) from edgar_tool.constants import ( TEXT_SEARCH_BASE_URL, @@ -130,7 +130,11 @@ def _parse_row(row: Dict[str, Any]) -> Dict[str, Any]: places_of_business = _source.get("biz_locations") places_of_business = [ - f"{split[0]}, {TEXT_SEARCH_LOCATIONS_MAPPING.get(split[1])}" if len(split) == 2 else f"{split[0]}" + ( + f"{split[0]}, {TEXT_SEARCH_LOCATIONS_MAPPING.get(split[1])}" + if len(split) == 2 + else f"{split[0]}" + ) for place in places_of_business if (split := place.rsplit(", ", maxsplit=1)) ] @@ -214,7 +218,9 @@ def _generate_request_args( raise ValueError("start_date cannot be after end_date") # Join search keywords into a single string - keywords = " ".join([f'"{keyword}"' if " " in keyword else keyword for keyword in keywords]) + keywords = " ".join( + [f'"{keyword}"' if " " in keyword else keyword for keyword in keywords] + ) # Generate request arguments request_args = { @@ -226,25 +232,31 @@ def _generate_request_args( # Add optional parameters if peo_in and inc_in: - raise ValueError("use only one of peo_in or inc_in, not both") ## because SEC API doesn't support + raise ValueError( + "use only one of peo_in or inc_in, not both" + ) ## because SEC API doesn't support else: if peo_in: request_args["locationCodes"] = peo_in if inc_in: request_args["locationCodes"] = inc_in request_args["locationType"] = "incorporated" - + if entity_id: request_args["entityName"] = entity_id # Handle forms and single forms - part_filing_form = [] if filing_form is None else TEXT_SEARCH_CATEGORY_FORM_GROUPINGS[filing_form] + part_filing_form = ( + [] + if filing_form is None + else TEXT_SEARCH_CATEGORY_FORM_GROUPINGS[filing_form] + ) part_single_forms = [] if single_forms is None else single_forms # Join the filing_forms and single forms and remove duplicates forms = ",".join(list(set(part_filing_form + part_single_forms))) if forms != "": request_args["forms"] = forms - + # URL-encode the request arguments request_args = urllib.parse.urlencode(request_args) @@ -373,7 +385,9 @@ def _generate_search_requests( # If we have 10000 results, split date range in two separate requests and fetch first page again, do so until # we have a set of date ranges for which none of the requests have 10000 results if num_results == 0: - print(f"No results found for query in date range {start_date} -> {end_date}.") + print( + f"No results found for query in date range {start_date} -> {end_date}." + ) elif num_results < 10000: print( f"Less than 10000 ({num_results}) results found for range {start_date} -> {end_date}, " @@ -475,7 +489,7 @@ def text_search( print( f"Skipping search request due to an unexpected {e.__class__.__name__} for request parameters '{r}': {e}" ) - if(search_requests_results == []): + if search_requests_results == []: raise NoResultsFoundError(f"No results found for the search query") write_results_to_file( itertools.chain(*search_requests_results), @@ -518,4 +532,4 @@ def _fetch_first_page_results_number( raise NoResultsFoundError( f"\nExecution aborting due to a {e.__class__.__name__} error raised " f"while parsing number of results for first page at URL {url}: {e}" - ) from e \ No newline at end of file + ) from e diff --git a/edgar_tool/url_generator.py b/edgar_tool/url_generator.py new file mode 100644 index 0000000..259f937 --- /dev/null +++ b/edgar_tool/url_generator.py @@ -0,0 +1,67 @@ +import datetime +from typing import Literal, TypedDict +from urllib import parse + + +class SearchQueryKwargs(TypedDict, total=False): + keywords: list[str] + entity: str + filing_form: str + single_forms: list[str] + date_range_select: Literal["all", "10y", "1y", "30d", "custom"] + start_date: datetime.date + end_date: datetime.date + peo_in: str + inc_in: str + + +class _ValidSearchParams: + def __init__(self, **query_args: SearchQueryKwargs): + keywords = query_args.get("keywords") + entity = query_args.get("entity") + if not keywords and not entity: + raise ValueError( + "Invalid search arguments. You must provide keywords or an entity." + ) + + date_range_select = query_args.get("date_range_select") + start_date = query_args.get("start_date") + end_date = query_args.get("end_date") + if date_range_select == "custom" and not (start_date and end_date): + raise ValueError( + ( + "Invalid date parameters. " + "You must provide both a start and end date if searching a custom date range." + ) + ) + + self._keywords = keywords + self.entity = entity + self.filing_form = query_args.get("filing_form") + self.single_forms = query_args.get("single_forms") + self.date_range_select = date_range_select + self.start_date = start_date + self.end_date = end_date + self.peo_in = query_args.get("peo_in") + self.inc_in = query_args.get("inc_in") + + @property + def keywords(self): + return self._keywords + + @keywords.getter + def keywords(self): + """Returns the keywords to search for, wrapping exact phrases in quotes.""" + return [f'"{phrase}"' if " " in phrase else phrase for phrase in self._keywords] + + +def generate_search_url_for_kwargs(search_kwargs: SearchQueryKwargs) -> str: + base_url = "https://www.sec.gov/edgar/search/#/" + validated_params = _ValidSearchParams(**search_kwargs) + query_params = { + "q": validated_params.keywords, + } + encoded_params = parse.urlencode( + query_params, doseq=True, encoding="utf-8", quote_via=parse.quote + ) + return parse.urljoin(base=base_url, url=encoded_params, allow_fragments=False) diff --git a/edgar_tool/utils.py b/edgar_tool/utils.py index d72bd11..72ab9c1 100644 --- a/edgar_tool/utils.py +++ b/edgar_tool/utils.py @@ -36,16 +36,19 @@ def safe_get(d: Dict, *keys) -> Any: return None return d + def unpack_singleton_list(l: Optional[List]) -> Union[str, List[str]]: return l if (l is None) or (len(l) != 1) else l[0] -def invert_dict(d:dict)->dict: + +def invert_dict(d: dict) -> dict: """ - Returns an inverted dictionary such that values are keys and keys are values. - If there are duplicate values, the last occurring key-value pair will prevail. + Returns an inverted dictionary such that values are keys and keys are values. + If there are duplicate values, the last occurring key-value pair will prevail. """ return {v: k for k, v in d.items()} + def replace_ignore_case_whitespace(s, location, replacement): """ Perform a case-insensitive and whitespace-insensitive replacement of a substring in a string. @@ -59,12 +62,15 @@ def replace_ignore_case_whitespace(s, location, replacement): str: The modified string with the replacements made. """ # Create a regex pattern that ignores whitespace and is case-insensitive - location_pattern = re.compile(r'\s*'.join(re.escape(char) for char in location), re.IGNORECASE) + location_pattern = re.compile( + r"\s*".join(re.escape(char) for char in location), re.IGNORECASE + ) return location_pattern.sub(replacement, s) -def replace_substrings_in_string(s)->str: + +def replace_substrings_in_string(s) -> str: """ - Takes a string like "New York, OH" and returns a string with the full + Takes a string like "New York, OH" and returns a string with the full location names converted to codes such as "NY,OH". Returns an unmodified string if there are no full location names present. Note that matching full location names shall be case and whitespace insensitive. @@ -76,58 +82,61 @@ def replace_substrings_in_string(s)->str: str: The modified string with substrings replaced. """ locations2codes = invert_dict(TEXT_SEARCH_LOCATIONS_MAPPING) - locations2codes = {k.replace(" ", "").lower(): v for k, v in locations2codes.items()} + locations2codes = { + k.replace(" ", "").lower(): v for k, v in locations2codes.items() + } for location in locations2codes.keys(): if location in s.replace(" ", "").lower(): - s = replace_ignore_case_whitespace(s,location, locations2codes[location]) + s = replace_ignore_case_whitespace(s, location, locations2codes[location]) return s - + + def parse_location_input(location_input: str | tuple | None) -> str | None: """ Handles text search input for --peo_in or --inc_in. - This function processes the input to ensure it is in an acceptable format + This function processes the input to ensure it is in an acceptable format for location searches. Because CLI input like --peo_in "NY, OH" yields - runtime value ('NY','OH'), this function supports single or multiple locations - provided as a string or a tuple. If the input is a tuple, it converts the tuple - to a comma-separated string. It also removes any whitespace from the output + runtime value ('NY','OH'), this function supports single or multiple locations + provided as a string or a tuple. If the input is a tuple, it converts the tuple + to a comma-separated string. It also removes any whitespace from the output string to prevent errors during further processing. Also validates that all provided location codes are in the TEXT_SEARCH_LOCATIONS_MAPPING and prints the list of acceptable codes if not. If the input string is a location's full name instead of the code (i.e. 'New York' instead of 'NY'), then strings present in - TEXT_SEARCH_LOCATIONS_MAPPING.values() are mapped to an code value instead. + TEXT_SEARCH_LOCATIONS_MAPPING.values() are mapped to an code value instead. Parameters: - location_input (str | tuple | None): The input location(s) to be parsed. - It can be a single location as a string, multiple locations as a tuple + location_input (str | tuple | None): The input location(s) to be parsed. + It can be a single location as a string, multiple locations as a tuple of strings, or None. Returns: str: A string representation of the location(s) with no whitespace. Raises: - ValueError: If the input is not a string, tuple, or None, or if any location + ValueError: If the input is not a string, tuple, or None, or if any location in the input is not in the TEXT_SEARCH_LOCATIONS_MAPPING. """ if not isinstance(location_input, (str, tuple, type(None))): raise ValueError( f'peo_in and inc_in must use format like "NY" or "NY,OH,etc"' - f'and be one of {TEXT_SEARCH_LOCATIONS_MAPPING}' + f"and be one of {TEXT_SEARCH_LOCATIONS_MAPPING}" ) - if isinstance(location_input,tuple): - location_input = ','.join(location_input) - - if isinstance(location_input,str): - location_input = tuple(replace_substrings_in_string(location_input).split(',')) + if isinstance(location_input, tuple): + location_input = ",".join(location_input) + + if isinstance(location_input, str): + location_input = tuple(replace_substrings_in_string(location_input).split(",")) for value in location_input: # Eliminate issues caused by casing and whitespaces - value = value.replace(" ","").upper() + value = value.replace(" ", "").upper() if value not in TEXT_SEARCH_LOCATIONS_MAPPING.keys(): raise ValueError(f"{value} not in {TEXT_SEARCH_LOCATIONS_MAPPING}") - location_input = ','.join(location_input) + location_input = ",".join(location_input) if location_input: location_input = location_input.replace(" ", "") - + return location_input diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py new file mode 100644 index 0000000..f33ba48 --- /dev/null +++ b/tests/test_url_generator.py @@ -0,0 +1,46 @@ +""" +This file tests code responsible for generating the EDGAR search URL. + +Note that at the time of this writing, EDGAR double-encodes query search +parameters so that %-encoded characters, like a quote ("), are encoded +as %2520 in the browser's URL instead of %20. This is a quirk with the +SEC's search functionality. Local testing indicates that single-encoded +URLs (which is the norm) and double-encoded URLs produce the same +responses. + +I.e. this double-encoded URL produced on the SEC's EDGAR search page: + https://www.sec.gov/edgar/search/#/q=%2522Insider%2520trading%2520report%2522 + +is functionally equivalent to our generated URL: + https://www.sec.gov/edgar/search/#/q=%22Insider%20trading%20report%20 +""" + +from edgar_tool import url_generator + + +def test_should_correctly_generate_search_url_for_single_word(): + """Baseline test to assert that querying for a single word + produces the correct search URL""" + # GIVEN + keywords = ["10-K"] + expected_url = "https://www.sec.gov/edgar/search/#/q=10-K" + + # WHEN + actual_url = url_generator.generate_search_url_for_kwargs({"keywords": keywords}) + + # THEN + assert actual_url == expected_url + + +def test_should_correctly_generate_search_url_for_exact_phrase(): + # GIVEN + keywords = ["Insider trading report"] + expected_url = ( + "https://www.sec.gov/edgar/search/#/q=%22Insider%20trading%20report%22" + ) + + # WHEN + actual_url = url_generator.generate_search_url_for_kwargs({"keywords": keywords}) + + # THEN + assert actual_url == expected_url From 49876a21d2d5c7cb176e1bea5c10756b7bcdfbe2 Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Mon, 15 Jul 2024 23:57:12 +0000 Subject: [PATCH 06/18] test: Add invalid arg tests --- poetry.lock | 51 +++++++++++++++++++++++++++++++++++-- pyproject.toml | 1 + tests/test_url_generator.py | 11 +++++++- 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index b592e5c..79d622c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "attrs" @@ -233,6 +233,20 @@ files = [ {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"}, ] +[[package]] +name = "exceptiongroup" +version = "1.2.2" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, +] + +[package.extras] +test = ["pytest (>=6)"] + [[package]] name = "filelock" version = "3.15.4" @@ -274,6 +288,17 @@ files = [ {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, ] +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + [[package]] name = "jsonlines" version = "4.0.0" @@ -371,6 +396,28 @@ tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""} docs = ["furo (>=2024.5.6)", "sphinx-autodoc-typehints (>=2.2.1)"] testing = ["covdefaults (>=2.3)", "pytest (>=8.2.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "setuptools (>=70.1)"] +[[package]] +name = "pytest" +version = "8.2.2" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-8.2.2-py3-none-any.whl", hash = "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343"}, + {file = "pytest-8.2.2.tar.gz", hash = "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2.0" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + [[package]] name = "requests" version = "2.31.0" @@ -531,4 +578,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "641f1eda619269cfc514c3f705d18a761bce7f9cbd0ad4b8768a641d6a7356f1" +content-hash = "9f33a820003e081d830e6fe266b1c54cf20439346c3fa50f789f30958f6ebd69" diff --git a/pyproject.toml b/pyproject.toml index 4469f77..012d1c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ fire = "^0.5" jsonlines = "^4.0" requests = "^2.31" xmltodict = "^0.13" +pytest = "^8.2.2" [tool.poetry.group.dev.dependencies] black = "^24.2.0" diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py index f33ba48..b71f5a9 100644 --- a/tests/test_url_generator.py +++ b/tests/test_url_generator.py @@ -6,7 +6,7 @@ as %2520 in the browser's URL instead of %20. This is a quirk with the SEC's search functionality. Local testing indicates that single-encoded URLs (which is the norm) and double-encoded URLs produce the same -responses. +responses. I.e. this double-encoded URL produced on the SEC's EDGAR search page: https://www.sec.gov/edgar/search/#/q=%2522Insider%2520trading%2520report%2522 @@ -14,6 +14,7 @@ is functionally equivalent to our generated URL: https://www.sec.gov/edgar/search/#/q=%22Insider%20trading%20report%20 """ +import pytest from edgar_tool import url_generator @@ -44,3 +45,11 @@ def test_should_correctly_generate_search_url_for_exact_phrase(): # THEN assert actual_url == expected_url + +@pytest.mark.parametrize('args', [ + {"keywords": []}, + {"entity": []}, +]) +def test_should_raise_if_keywords_or_entity_missing(args): + with pytest.raises(ValueError, match="Invalid search arguments. You must provide keywords or an entity."): + url_generator.generate_search_url_for_kwargs(args) From c49894151fc6e71ab48f14f4cc132863b6698cb4 Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Tue, 16 Jul 2024 00:06:42 +0000 Subject: [PATCH 07/18] test: Add invalid custom date tests --- tests/test_url_generator.py | 47 +++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py index b71f5a9..92a33ef 100644 --- a/tests/test_url_generator.py +++ b/tests/test_url_generator.py @@ -14,6 +14,8 @@ is functionally equivalent to our generated URL: https://www.sec.gov/edgar/search/#/q=%22Insider%20trading%20report%20 """ + +import datetime import pytest from edgar_tool import url_generator @@ -46,10 +48,41 @@ def test_should_correctly_generate_search_url_for_exact_phrase(): # THEN assert actual_url == expected_url -@pytest.mark.parametrize('args', [ - {"keywords": []}, - {"entity": []}, -]) -def test_should_raise_if_keywords_or_entity_missing(args): - with pytest.raises(ValueError, match="Invalid search arguments. You must provide keywords or an entity."): - url_generator.generate_search_url_for_kwargs(args) + +@pytest.mark.parametrize( + "test_kwarg", + [ + {"keywords": []}, + {"entity": []}, + ], +) +def test_should_raise_if_keywords_or_entity_missing(test_kwarg): + # GIVEN + expected_error_msg = ( + "Invalid search arguments. You must provide keywords or an entity." + ) + + # WHEN / THEN + with pytest.raises(ValueError, match=expected_error_msg): + url_generator.generate_search_url_for_kwargs(test_kwarg) + + +@pytest.mark.parametrize( + "date_kwarg", + [ + {"start_date": datetime.date.today()}, + {"end_date": datetime.date.today()}, + ], +) +def test_should_raise_if_date_range_custom_but_missing_dates(date_kwarg): + # GIVEN + expected_error_msg = ( + "Invalid date parameters. " + "You must provide both a start and end date if searching a custom date range." + ) + base_kwargs = {"keywords": ["Ford Motor Co"], "date_range_select": "custom"} + test_kwargs = {**base_kwargs, **date_kwarg} + + # WHEN / THEN + with pytest.raises(ValueError, match=expected_error_msg): + url_generator.generate_search_url_for_kwargs(test_kwargs) From 0391bf6204ec236e669fe6060c760238117b4e1f Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Tue, 16 Jul 2024 00:13:02 +0000 Subject: [PATCH 08/18] test: Raises error for invalid date_range_select --- edgar_tool/url_generator.py | 7 +++++++ tests/test_url_generator.py | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/edgar_tool/url_generator.py b/edgar_tool/url_generator.py index 259f937..43626ed 100644 --- a/edgar_tool/url_generator.py +++ b/edgar_tool/url_generator.py @@ -34,6 +34,13 @@ def __init__(self, **query_args: SearchQueryKwargs): "You must provide both a start and end date if searching a custom date range." ) ) + elif date_range_select and date_range_select not in {"all", "10y", "1y", "30d"}: + raise ValueError( + ( + "Invalid date_range_select. " + 'Value must be one of "all", "10y", "1y", "30d", or "custom"' + ) + ) self._keywords = keywords self.entity = entity diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py index 92a33ef..20e9e21 100644 --- a/tests/test_url_generator.py +++ b/tests/test_url_generator.py @@ -86,3 +86,16 @@ def test_should_raise_if_date_range_custom_but_missing_dates(date_kwarg): # WHEN / THEN with pytest.raises(ValueError, match=expected_error_msg): url_generator.generate_search_url_for_kwargs(test_kwargs) + + +def test_should_raise_if_date_range_select_invalid(): + # GIVEN + expected_error_msg = ( + "Invalid date_range_select. " + 'Value must be one of "all", "10y", "1y", "30d", or "custom"' + ) + test_kwargs = {"keywords": ["Ford Motor Co"], "date_range_select": "1m"} + + # WHEN / THEN + with pytest.raises(ValueError, match=expected_error_msg): + url_generator.generate_search_url_for_kwargs(test_kwargs) From bb850ec435c72634db0554dc58b692f919b111ea Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Tue, 16 Jul 2024 00:34:27 +0000 Subject: [PATCH 09/18] test: Add tests for all date options --- edgar_tool/url_generator.py | 21 ++++++++++++++++++++- tests/test_url_generator.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/edgar_tool/url_generator.py b/edgar_tool/url_generator.py index 43626ed..8693b8f 100644 --- a/edgar_tool/url_generator.py +++ b/edgar_tool/url_generator.py @@ -34,7 +34,13 @@ def __init__(self, **query_args: SearchQueryKwargs): "You must provide both a start and end date if searching a custom date range." ) ) - elif date_range_select and date_range_select not in {"all", "10y", "1y", "30d"}: + elif date_range_select and date_range_select not in { + "all", + "10y", + "1y", + "30d", + "custom", + }: raise ValueError( ( "Invalid date_range_select. " @@ -68,6 +74,19 @@ def generate_search_url_for_kwargs(search_kwargs: SearchQueryKwargs) -> str: query_params = { "q": validated_params.keywords, } + if date_range_select := validated_params.date_range_select: + query_params.update( + { + "dateRange": date_range_select, + } + ) + if date_range_select == "custom": + query_params.update( + { + "startdt": validated_params.start_date.strftime("%Y-%m-%d"), + "enddt": validated_params.end_date.strftime("%Y-%m-%d"), + } + ) encoded_params = parse.urlencode( query_params, doseq=True, encoding="utf-8", quote_via=parse.quote ) diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py index 20e9e21..ca6045a 100644 --- a/tests/test_url_generator.py +++ b/tests/test_url_generator.py @@ -99,3 +99,34 @@ def test_should_raise_if_date_range_select_invalid(): # WHEN / THEN with pytest.raises(ValueError, match=expected_error_msg): url_generator.generate_search_url_for_kwargs(test_kwargs) + + +@pytest.mark.parametrize( + "date_kwargs,url_ending", + [ + ( + { + "date_range_select": "custom", + "start_date": datetime.date.fromisoformat("2024-07-10"), + "end_date": datetime.date.fromisoformat("2024-07-15"), + }, + "&dateRange=custom&startdt=2024-07-10&enddt=2024-07-15", + ), + ({"date_range_select": "all"}, "&dateRange=all"), + ({"date_range_select": "10y"}, "&dateRange=10y"), + ({"date_range_select": "1y"}, "&dateRange=1y"), + ({"date_range_select": "30d"}, "&dateRange=30d"), + ], +) +def test_generates_correct_url_for_date(date_kwargs, url_ending): + # GIVEN + expected_url = ( + f"https://www.sec.gov/edgar/search/#/q=%22Ford%20Motor%20Co%22{url_ending}" + ) + test_kwargs = {**{"keywords": ["Ford Motor Co"]}, **date_kwargs} + + # WHEN + actual_url = url_generator.generate_search_url_for_kwargs(test_kwargs) + + # THEN + assert actual_url == expected_url From e30880bd4c5a00c968f780103bbb2c945db9bf11 Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Sun, 28 Jul 2024 18:03:19 +0000 Subject: [PATCH 10/18] Add filing categories to url_generator --- README.md | 4 +++- edgar_tool/url_generator.py | 44 +++++++++++++++++++++++++++++++++++-- tests/test_url_generator.py | 33 +++++++++++++++++++++++++++- 3 files changed, 77 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7d6278f..426536c 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,9 @@ Usage with date range and export to custom CSV file edgar-tool text_search Tsunami Hazards --start_date "2021-01-01" --end_date "2021-12-31" --output "results.csv" ``` -# Usage with a partial set of filing forms + single forms +### Usage with a partial set of filing forms + single forms + +``` edgar-tool text_search Hurricane Damage --filing_form "registration_statements" --single_forms "['1-K', '1-SA']" ``` diff --git a/edgar_tool/url_generator.py b/edgar_tool/url_generator.py index 8693b8f..0b8a0f8 100644 --- a/edgar_tool/url_generator.py +++ b/edgar_tool/url_generator.py @@ -3,10 +3,26 @@ from urllib import parse +filing_category_to_sec_form_id = { + "all": "", + "all_except_section_16": "form-cat0", + "all_annual_quarterly_and_current_reports": "form-cat1", + "all_section_16": "form-cat2", + "beneficial_ownership_reports": "form-cat3", + "exempt_offerings": "form-cat4", + "registration_statements": "form-cat5", + "filing_review_correspondence": "form-cat6", + "sec_orders_and_notices": "form-cat7", + "proxy_materials": "form-cat8", + "tender_offers_and_going_private_tx": "form-cat9", + "trust_indentures": "form-cat10", +} + + class SearchQueryKwargs(TypedDict, total=False): keywords: list[str] entity: str - filing_form: str + filing_category: str single_forms: list[str] date_range_select: Literal["all", "10y", "1y", "30d", "custom"] start_date: datetime.date @@ -50,7 +66,7 @@ def __init__(self, **query_args: SearchQueryKwargs): self._keywords = keywords self.entity = entity - self.filing_form = query_args.get("filing_form") + self._filing_category = query_args.get("filing_category", "all") self.single_forms = query_args.get("single_forms") self.date_range_select = date_range_select self.start_date = start_date @@ -67,6 +83,28 @@ def keywords(self): """Returns the keywords to search for, wrapping exact phrases in quotes.""" return [f'"{phrase}"' if " " in phrase else phrase for phrase in self._keywords] + @property + def filing_category(self): + return self._filing_category + + @keywords.getter + def filing_category(self): + filing_category_to_sec_form_id = { + "all": "", + "all_except_section_16": "form-cat0", + "all_annual_quarterly_and_current_reports": "form-cat1", + "all_section_16": "form-cat2", + "beneficial_ownership_reports": "form-cat3", + "exempt_offerings": "form-cat4", + "registration_statements": "form-cat5", + "filing_review_correspondence": "form-cat6", + "sec_orders_and_notices": "form-cat7", + "proxy_materials": "form-cat8", + "tender_offers_and_going_private_tx": "form-cat9", + "trust_indentures": "form-cat10", + } + return filing_category_to_sec_form_id[self._filing_category] + def generate_search_url_for_kwargs(search_kwargs: SearchQueryKwargs) -> str: base_url = "https://www.sec.gov/edgar/search/#/" @@ -87,6 +125,8 @@ def generate_search_url_for_kwargs(search_kwargs: SearchQueryKwargs) -> str: "enddt": validated_params.end_date.strftime("%Y-%m-%d"), } ) + if validated_params.filing_category: + query_params["category"] = validated_params.filing_category encoded_params = parse.urlencode( query_params, doseq=True, encoding="utf-8", quote_via=parse.quote ) diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py index ca6045a..35921ed 100644 --- a/tests/test_url_generator.py +++ b/tests/test_url_generator.py @@ -118,7 +118,9 @@ def test_should_raise_if_date_range_select_invalid(): ({"date_range_select": "30d"}, "&dateRange=30d"), ], ) -def test_generates_correct_url_for_date(date_kwargs, url_ending): +def test_generates_correct_url_for_date_ranges(date_kwargs, url_ending): + """Tests that various date range options are correctly translated + into the seach URL.""" # GIVEN expected_url = ( f"https://www.sec.gov/edgar/search/#/q=%22Ford%20Motor%20Co%22{url_ending}" @@ -130,3 +132,32 @@ def test_generates_correct_url_for_date(date_kwargs, url_ending): # THEN assert actual_url == expected_url + + +@pytest.mark.parametrize( + "filing_category, url_ending", + ( + ("all", ""), + ("all_except_section_16", "&category=form-cat0"), + ("all_annual_quarterly_and_current_reports", "&category=form-cat1"), + ("all_section_16", "&category=form-cat2"), + ("beneficial_ownership_reports", "&category=form-cat3"), + ("exempt_offerings", "&category=form-cat4"), + ("registration_statements", "&category=form-cat5"), + ("filing_review_correspondence", "&category=form-cat6"), + ("sec_orders_and_notices", "&category=form-cat7"), + ("proxy_materials", "&category=form-cat8"), + ("tender_offers_and_going_private_tx", "&category=form-cat9"), + ("trust_indentures", "&category=form-cat10"), + ), +) +def test_generates_correct_url_for_filing_category(filing_category, url_ending): + # GIVEN + expected_url = f"https://www.sec.gov/edgar/search/#/q=Ignore{url_ending}" + test_kwargs = {"keywords": ["Ignore"], "filing_category": filing_category} + + # WHEN + actual_url = url_generator.generate_search_url_for_kwargs(test_kwargs) + + # THEN + assert actual_url == expected_url From eb2583ad9c192ec45bfc40c51e9fcd898e4fa7af Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Sun, 28 Jul 2024 18:59:53 +0000 Subject: [PATCH 11/18] Remove duplicate filing category to form id code --- edgar_tool/constants.py | 13 ------------- edgar_tool/url_generator.py | 16 ---------------- 2 files changed, 29 deletions(-) diff --git a/edgar_tool/constants.py b/edgar_tool/constants.py index 9b79752..92d9958 100644 --- a/edgar_tool/constants.py +++ b/edgar_tool/constants.py @@ -1,18 +1,5 @@ SUPPORTED_OUTPUT_EXTENSIONS = [".csv", ".jsonl", ".json"] TEXT_SEARCH_BASE_URL = "https://efts.sec.gov/LATEST/search-index?" -TEXT_SEARCH_FILING_CATEGORIES_MAPPING = { - "all_except_section_16": "form-cat0", - "all_annual_quarterly_and_current_reports": "form-cat1", - "all_section_16": "form-cat2", - "beneficial_ownership_reports": "form-cat3", - "exempt_offerings": "form-cat4", - "registration_statements": "form-cat5", - "filing_review_correspondence": "form-cat6", - "sec_orders_and_notices": "form-cat7", - "proxy_materials": "form-cat8", - "tender_offers_and_going_private_tx": "form-cat9", - "trust_indentures": "form-cat10", -} TEXT_SEARCH_SPLIT_BATCHES_NUMBER = 2 TEXT_SEARCH_CSV_FIELDS_NAMES = [ "root_form", diff --git a/edgar_tool/url_generator.py b/edgar_tool/url_generator.py index 0b8a0f8..903591f 100644 --- a/edgar_tool/url_generator.py +++ b/edgar_tool/url_generator.py @@ -3,22 +3,6 @@ from urllib import parse -filing_category_to_sec_form_id = { - "all": "", - "all_except_section_16": "form-cat0", - "all_annual_quarterly_and_current_reports": "form-cat1", - "all_section_16": "form-cat2", - "beneficial_ownership_reports": "form-cat3", - "exempt_offerings": "form-cat4", - "registration_statements": "form-cat5", - "filing_review_correspondence": "form-cat6", - "sec_orders_and_notices": "form-cat7", - "proxy_materials": "form-cat8", - "tender_offers_and_going_private_tx": "form-cat9", - "trust_indentures": "form-cat10", -} - - class SearchQueryKwargs(TypedDict, total=False): keywords: list[str] entity: str From 534b672a1144f4380726e8eac2c835f631db3f96 Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Wed, 7 Aug 2024 00:12:49 +0000 Subject: [PATCH 12/18] feat: Add support for single forms --- edgar_tool/url_generator.py | 19 +++++++++++--- tests/test_url_generator.py | 49 +++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/edgar_tool/url_generator.py b/edgar_tool/url_generator.py index 903591f..ef4be66 100644 --- a/edgar_tool/url_generator.py +++ b/edgar_tool/url_generator.py @@ -50,8 +50,18 @@ def __init__(self, **query_args: SearchQueryKwargs): self._keywords = keywords self.entity = entity - self._filing_category = query_args.get("filing_category", "all") - self.single_forms = query_args.get("single_forms") + + filing_category = query_args.get("filing_category", "custom") + single_forms = query_args.get("single_forms") + if filing_category != "custom" and single_forms: + raise ValueError( + "Cannot specify both filing_category and single_forms. " + "Passing single_forms automatically sets the filing_category" + " to custom. Please choose one or the other." + ) + + self._filing_category = filing_category + self.single_forms = single_forms self.date_range_select = date_range_select self.start_date = start_date self.end_date = end_date @@ -87,7 +97,7 @@ def filing_category(self): "tender_offers_and_going_private_tx": "form-cat9", "trust_indentures": "form-cat10", } - return filing_category_to_sec_form_id[self._filing_category] + return filing_category_to_sec_form_id.get(self._filing_category) def generate_search_url_for_kwargs(search_kwargs: SearchQueryKwargs) -> str: @@ -111,6 +121,9 @@ def generate_search_url_for_kwargs(search_kwargs: SearchQueryKwargs) -> str: ) if validated_params.filing_category: query_params["category"] = validated_params.filing_category + elif validated_params.single_forms: + query_params["category"] = "custom" + query_params["forms"] = validated_params.single_forms encoded_params = parse.urlencode( query_params, doseq=True, encoding="utf-8", quote_via=parse.quote ) diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py index 35921ed..5810aa3 100644 --- a/tests/test_url_generator.py +++ b/tests/test_url_generator.py @@ -161,3 +161,52 @@ def test_generates_correct_url_for_filing_category(filing_category, url_ending): # THEN assert actual_url == expected_url + + +@pytest.mark.parametrize( + "single_forms, url_ending", + ( + (["1"], "&forms=1"), + (["CORRESP"], "&forms=CORRESP"), + ( + ["F-4, PREC14A, SEC STAFF ACTION"], + "&forms=F-4%2C%20PREC14A%2C%20SEC%20STAFF%20ACTION", + ), + ), +) +def test_generates_correct_url_for_single_forms(single_forms, url_ending): + # GIVEN + expected_url = ( + f"https://www.sec.gov/edgar/search/#/q=Ignore&category=custom{url_ending}" + ) + test_kwargs = {"keywords": ["Ignore"], "single_forms": single_forms} + + # WHEN + actual_url = url_generator.generate_search_url_for_kwargs(test_kwargs) + + # THEN + assert actual_url == expected_url + + +def test_raises_an_exception_if_user_passes_both_filing_category_and_single_forms(): + """When a user filters based on single form type the filing category is automatically + set to "custom." Therefore passing a filing category when using single forms both does + not make sense and will potentially give the user confusing results if the code ignores + the passed filing category and sets it as custom. It's best to raise an error and let + the user use either a filing category or single forms. + """ + # GIVEN + test_kwargs = { + "keywords": ["Ignore"], + "single_forms": ["F-4, PREC14A, SEC STAFF ACTION"], + "filing_category": "beneficial_ownership_reports", + } + expected_error_msg = ( + "Cannot specify both filing_category and single_forms. " + "Passing single_forms automatically sets the filing_category" + " to custom. Please choose one or the other." + ) + + # WHEN / THEN + with pytest.raises(ValueError, match=expected_error_msg): + url_generator.generate_search_url_for_kwargs(test_kwargs) From ca7add3baf9f3955f93dd0e4e272efe76982bd07 Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Wed, 7 Aug 2024 00:18:58 +0000 Subject: [PATCH 13/18] Amend me! Add TODO for remaining work --- tests/test_url_generator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py index 5810aa3..dabdf75 100644 --- a/tests/test_url_generator.py +++ b/tests/test_url_generator.py @@ -210,3 +210,6 @@ def test_raises_an_exception_if_user_passes_both_filing_category_and_single_form # WHEN / THEN with pytest.raises(ValueError, match=expected_error_msg): url_generator.generate_search_url_for_kwargs(test_kwargs) + + +# TODO: Test principle executive offices in and incorporated in parameters. From d31de9a91c41199e4e45518e3d75232e04cac709 Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Sat, 17 Aug 2024 19:52:39 +0000 Subject: [PATCH 14/18] Add ISO 3166-1/2 country codes to EDGAR codes --- edgar_tool/constants.py | 114 +++++++++++++++++++++++++++++++ edgar_tool/url_generator.py | 10 ++- tests/test_url_generator.py | 131 +++++++++++++++++++++++++++++++++++- 3 files changed, 252 insertions(+), 3 deletions(-) diff --git a/edgar_tool/constants.py b/edgar_tool/constants.py index 92d9958..b570c19 100644 --- a/edgar_tool/constants.py +++ b/edgar_tool/constants.py @@ -39,6 +39,120 @@ "xbrl_files", ] +"""All mappings below are from the SEC EDGAR website's search form. +The keys are the values that the CLI uses, and the values are those +that the search form uses. All values are shown in the order they +appear in the SEC EDGAR search drop down.""" +PEO_IN_AND_INC_IN_TO_SEC_FORM_ID = { + # US States + "AL": "AL", + "AK": "AK", + "AZ": "AZ", + "AR": "AR", + "CA": "CA", + "CO": "CO", + "CT": "CT", + "DE": "DE", + "DC": "DC", + "FL": "FL", + "GA": "GA", + "HI": "HI", + "ID": "ID", + "IL": "IL", + "IN": "IN", + "IA": "IA", + "KS": "KS", + "KY": "KY", + "LA": "LA", + "ME": "ME", + "MD": "MD", + "MA": "MA", + "MI": "MI", + "MN": "MN", + "MS": "MS", + "MO": "MO", + "MT": "MT", + "NE": "NE", + "NV": "NV", + "NH": "NH", + "NJ": "NJ", + "NM": "NM", + "NY": "NY", + "NC": "NC", + "ND": "ND", + "OH": "OH", + "OK": "OK", + "OR": "OR", + "PA": "PA", + "RI": "RI", + "SC": "SC", + "SD": "SD", + "TN": "TN", + "TX": "TX", + "UT": "UT", + "VT": "VT", + "VA": "VA", + "WA": "WA", + "WV": "WV", + "WI": "WI", + "WY": "WY", + # Canadian Provinces + "AB": "A0", + "BC": "A1", + "CAN": "Z4", # Canada (Federal Level) + "MB": "A2", + "NB": "A3", + "NL": "A4", + "NS": "A5", + "ON": "A6", + "PE": "A7", + "QC": "A8", + "SK": "A9", + "YT": "B0", + # Countries + "AFG": "B2", + "ALA": "Y6", + "ALB": "B3", + "DZA": "B4", + "ASM": "B5", + "AND": "B6", + "AGO": "B7", + "AIA": "1A", + "ATA": "B8", + "ATG": "B9", + "ARG": "C1", + "ARM": "1B", + "ABW": "1C", + "AUS": "C3", + "AUT": "C4", + "AZE": "1D", + "BHS": "C5", + "BHR": "C6", + "BGD": "C7", + "BRB": "C8", + "BLR": "1F", + "BEL": "C9", + "BLZ": "D1", + "BEN": "G6", + "BMU": "D0", + "BTN": "D2", + "BOL": "D3", + "BIH": "1E", + "BWA": "B1", + "BVT": "D4", + "BRA": "D5", + "IOT": "D6", + "BRN": "D9", + "BGR": "E0", + "BFA": "X2", + "BDI": "E2", + "KHM": "E3", + "CMR": "E4", + "CPV": "E8", + "CYM": "E9", + "CAF": "F0", +} + TEXT_SEARCH_LOCATIONS_MAPPING = { "AL": "Alabama", "AK": "Alaska", diff --git a/edgar_tool/url_generator.py b/edgar_tool/url_generator.py index ef4be66..181c8bb 100644 --- a/edgar_tool/url_generator.py +++ b/edgar_tool/url_generator.py @@ -2,6 +2,8 @@ from typing import Literal, TypedDict from urllib import parse +from edgar_tool.constants import PEO_IN_AND_INC_IN_TO_SEC_FORM_ID + class SearchQueryKwargs(TypedDict, total=False): keywords: list[str] @@ -11,8 +13,8 @@ class SearchQueryKwargs(TypedDict, total=False): date_range_select: Literal["all", "10y", "1y", "30d", "custom"] start_date: datetime.date end_date: datetime.date - peo_in: str inc_in: str + peo_in: str class _ValidSearchParams: @@ -65,8 +67,8 @@ def __init__(self, **query_args: SearchQueryKwargs): self.date_range_select = date_range_select self.start_date = start_date self.end_date = end_date - self.peo_in = query_args.get("peo_in") self.inc_in = query_args.get("inc_in") + self.peo_in = query_args.get("peo_in") @property def keywords(self): @@ -124,6 +126,10 @@ def generate_search_url_for_kwargs(search_kwargs: SearchQueryKwargs) -> str: elif validated_params.single_forms: query_params["category"] = "custom" query_params["forms"] = validated_params.single_forms + if validated_params.peo_in: + query_params["locationCode"] = PEO_IN_AND_INC_IN_TO_SEC_FORM_ID[ + validated_params.peo_in + ] encoded_params = parse.urlencode( query_params, doseq=True, encoding="utf-8", quote_via=parse.quote ) diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py index dabdf75..15a828e 100644 --- a/tests/test_url_generator.py +++ b/tests/test_url_generator.py @@ -212,4 +212,133 @@ def test_raises_an_exception_if_user_passes_both_filing_category_and_single_form url_generator.generate_search_url_for_kwargs(test_kwargs) -# TODO: Test principle executive offices in and incorporated in parameters. +@pytest.mark.parametrize( + "peo_in, expected_location_code", + [ + # US States - All use 2-letter state & territory abbreviations (ISO 3166-2) + ("AL", "AL"), # Alabama + ("AK", "AK"), # Alaska + ("AZ", "AZ"), # Arizona + ("AR", "AR"), # Arkansas + ("CA", "CA"), # California + ("CO", "CO"), # Colorado + ("CT", "CT"), # Connecticut + ("DE", "DE"), # Delaware + ("DC", "DC"), # District of Columbia + ("FL", "FL"), # Florida + ("GA", "GA"), # Georgia + ("HI", "HI"), # Hawaii + ("ID", "ID"), # Idaho + ("IL", "IL"), # Illinois + ("IN", "IN"), # Indiana + ("IA", "IA"), # Iowa + ("KS", "KS"), # Kansas + ("KY", "KY"), # Kentucky + ("LA", "LA"), # Louisiana + ("ME", "ME"), # Maine + ("MD", "MD"), # Maryland + ("MA", "MA"), # Massachusetts + ("MI", "MI"), # Michigan + ("MN", "MN"), # Minnesota + ("MS", "MS"), # Mississippi + ("MO", "MO"), # Missouri + ("MT", "MT"), # Montana + ("NE", "NE"), # Nebraska + ("NV", "NV"), # Nevada + ("NH", "NH"), # New Hampshire + ("NJ", "NJ"), # New Jersey + ("NM", "NM"), # New Mexico + ("NY", "NY"), # New York + ("NC", "NC"), # North Carolina + ("ND", "ND"), # North Dakota + ("OH", "OH"), # Ohio + ("OK", "OK"), # Oklahoma + ("OR", "OR"), # Oregon + ("PA", "PA"), # Pennsylvania + ("RI", "RI"), # Rhode Island + ("SC", "SC"), # South Carolina + ("SD", "SD"), # South Dakota + ("TN", "TN"), # Tennessee + ("TX", "TX"), # Texas + ("UT", "UT"), # Utah + ("VT", "VT"), # Vermont + ("VA", "VA"), # Virginia + ("WA", "WA"), # Washington + ("WV", "WV"), # West Virginia + ("WI", "WI"), # Wisconsin + ("WY", "WY"), # Wyoming + # Canadian Provinces - P.E.O. in to use internationally approved alpha codes (ISO 3166-2) + ("AB", "A0"), # Alberta + ("BC", "A1"), # British Columbia + ("CAN", "Z4"), # Canada (Federal Level) + ("MB", "A2"), # Manitoba + ("NB", "A3"), # New Brunswick + ("NL", "A4"), # Newfoundland and Labrador + ("NS", "A5"), # Nova Scotia + ("ON", "A6"), # Ontario + ("PE", "A7"), # Prince Edward Island + ("QC", "A8"), # Quebec + ("SK", "A9"), # Saskatchewan + ("YT", "B0"), # Yukon + # Other Countries - All use internationally approved 3-letter alpha codes (ISO 3166-1) + ("AFG", "B2"), # Afghanistan + ("ALA", "Y6"), # Aland Islands + ("ALB", "B3"), # Albania + ("DZA", "B4"), # Algeria + ("ASM", "B5"), # American Samoa + ("AND", "B6"), # Andorra + ("AGO", "B7"), # Angola + ("AIA", "1A"), # Anguilla + ("ATA", "B8"), # Antarctica + ("ATG", "B9"), # Antigua and Barbuda + ("ARG", "C1"), # Argentina + ("ARM", "1B"), # Armenia + ("ABW", "1C"), # Aruba + ("AUS", "C3"), # Australia + ("AUT", "C4"), # Austria + ("AZE", "1D"), # Azerbaijan + ("BHS", "C5"), # Bahamas + ("BHR", "C6"), # Bahrain + ("BGD", "C7"), # Bangladesh + ("BRB", "C8"), # Barbados + ("BLR", "1F"), # Belarus + ("BEL", "C9"), # Belgium + ("BLZ", "D1"), # Belize + ("BEN", "G6"), # Benin + ("BMU", "D0"), # Bermuda + ("BTN", "D2"), # Bhutan + ("BOL", "D3"), # Bolivia + ("BIH", "1E"), # Bosnia and Herzegovina + ("BWA", "B1"), # Botswana + ("BVT", "D4"), # Bouvet Island + ("BRA", "D5"), # Brazil + ("IOT", "D6"), # British Indian Ocean Territory + ("BRN", "D9"), # Brunei Darussalam + ("BGR", "E0"), # Bulgaria + ("BFA", "X2"), # Burkina Faso + ("BDI", "E2"), # Burundi + ("KHM", "E3"), # Cambodia + ("CMR", "E4"), # Cameroon + ("CPV", "E8"), # Cape Verde + ("CYM", "E9"), # Cayman Islands + ("CAF", "F0"), # Central African Republic + ], +) +def test_should_correctly_generate_search_url_for_principal_executive_office_in( + peo_in, expected_location_code +): + # GIVEN + expected_url = ( + f"https://www.sec.gov/edgar/search/#/q=a&locationCode={expected_location_code}" + ) + + # WHEN + actual_url = url_generator.generate_search_url_for_kwargs( + {"keywords": ["a"], "peo_in": peo_in} + ) + + # THEN + assert actual_url == expected_url + + +# TODO: Test incorporated in parameters. From 4d4c1bac3b410e98f14811d2d941f4c8723c35e4 Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Sun, 18 Aug 2024 20:16:18 +0000 Subject: [PATCH 15/18] Add urls for remaining countries --- edgar_tool/constants.py | 203 ++++++++++++++++++++++++++++++++++++ tests/test_url_generator.py | 203 ++++++++++++++++++++++++++++++++++++ 2 files changed, 406 insertions(+) diff --git a/edgar_tool/constants.py b/edgar_tool/constants.py index b570c19..f378c7b 100644 --- a/edgar_tool/constants.py +++ b/edgar_tool/constants.py @@ -151,6 +151,209 @@ "CPV": "E8", "CYM": "E9", "CAF": "F0", + "TCD": "F2", + "CHL": "F3", + "CHN": "F4", + "CXR": "F6", + "CCK": "F7", + "COL": "F8", + "COM": "F9", + "COG": "G0", + "COD": "Y3", + "COK": "G1", + "CRI": "G2", + "CIV": "L7", + "HRV": "1M", + "CUB": "G3", + "CYP": "G4", + "CZE": "2N", + "DNK": "G7", + "DJI": "1G", + "DMA": "G9", + "DOM": "D8", + "ECU": "H1", + "EGY": "H2", + "SLV": "H3", + "GNQ": "H4", + "ERI": "1J", + "EST": "1H", + "ETH": "H5", + "FLK": "H7", + "FRO": "H6", + "FJI": "H8", + "FIN": "H9", + "FRA": "I0", + "GUF": "I3", + "PYF": "I4", + "ATF": "2C", + "GAB": "I5", + "GMB": "I6", + "GEO": "2Q", + "DEU": "2M", + "GHA": "J0", + "GIB": "J1", + "GRC": "J3", + "GRL": "J4", + "GRD": "J5", + "GLP": "J6", + "GUM": "GU", + "GTM": "J8", + "GGY": "Y7", + "GIN": "J9", + "GNB": "S0", + "GUY": "K0", + "HTI": "K1", + "HMD": "K4", + "VAT": "X4", + "HND": "K2", + "HKG": "K3", + "HUN": "K5", + "ISL": "K6", + "IND": "K7", + "IDN": "K8", + "IRN": "K9", + "IRQ": "L0", + "IRL": "L2", + "IMN": "Y8", + "ISR": "L3", + "ITA": "L6", + "JAM": "L8", + "JPN": "M0", + "JEY": "Y9", + "JOR": "M2", + "KAZ": "1P", + "KEN": "M3", + "KIR": "J2", + "PRK": "M4", + "KOR": "M5", + "KWT": "M6", + "KGZ": "1N", + "LAO": "M7", + "LVA": "1R", + "LBN": "M8", + "LSO": "M9", + "LBR": "N0", + "LBY": "N1", + "LIE": "N2", + "LTU": "1Q", + "LUX": "N4", + "MAC": "N5", + "MKD": "1U", + "MDG": "N6", + "MWI": "N7", + "MYS": "N8", + "MDV": "N9", + "MLI": "O0", + "MLT": "O1", + "MHL": "1T", + "MTQ": "O2", + "MRT": "O3", + "MUS": "O4", + "MYT": "2P", + "MEX": "O5", + "FSM": "1K", + "MDA": "1S", + "MCO": "O9", + "MNG": "P0", + "MNE": "Z5", + "MSR": "P1", + "MAR": "P2", + "MOZ": "P3", + "MMR": "E1", + "NAM": "T6", + "NRU": "P5", + "NPL": "P6", + "NLD": "P7", + "ANT": "P8", + "NCL": "1W", + "NZL": "Q2", + "NIC": "Q3", + "NER": "Q4", + "NGA": "Q5", + "NIU": "Q6", + "NFK": "Q7", + "MNP": "1V", + "NOR": "Q8", + "OMN": "P4", + "PAK": "R0", + "PLW": "1Y", + "PSE": "1X", + "PAN": "R1", + "PNG": "R2", + "PRY": "R4", + "PER": "R5", + "PHL": "R6", + "PCN": "R8", + "POL": "R9", + "PRT": "S1", + "PRI": "PR", + "QAT": "S3", + "REU": "S4", + "ROU": "S5", + "RUS": "1Z", + "RWA": "S6", + "BLM": "Z0", + "SHN": "U8", + "KNA": "U7", + "LCA": "U9", + "MAF": "Z1", + "SPM": "V0", + "VCT": "V1", + "WSM": "Y0", + "SMR": "S8", + "STP": "S9", + "SAU": "T0", + "SEN": "T1", + "SRB": "Z2", + "SYC": "T2", + "SLE": "T8", + "SGP": "U0", + "SVK": "2B", + "SVN": "2A", + "SLB": "D7", + "SOM": "U1", + "ZAF": "T3", + "SGS": "1L", + "ESP": "U3", + "LKA": "F1", + "SDN": "V2", + "SUR": "V3", + "SJM": "L9", + "SWZ": "V6", + "SWE": "V7", + "CHE": "V8", + "SYR": "V9", + "TWN": "F5", + "TJK": "2D", + "THA": "W1", + "TLS": "Z3", + "TGO": "W2", + "TKL": "W3", + "TON": "W4", + "TTO": "W5", + "TUN": "W6", + "TUR": "W8", + "TKM": "2E", + "TCA": "W7", + "TUV": "2G", + "UGA": "W9", + "UKR": "2H", + "ARE": "C0", + "GBR": "X0", + "UMI": "2J", + "URY": "X3", + "UZB": "2K", + "VUT": "2L", + "VEN": "X5", + "VNM": "Q1", + "VGB": "D8", + "VIR": "VI", + "WLF": "X8", + "ESH": "Y1", + "YEM": "T7", + "ZMB": "Y4", + "ZWE": "Y5", + "XX": "XX", } TEXT_SEARCH_LOCATIONS_MAPPING = { diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py index 15a828e..ad16902 100644 --- a/tests/test_url_generator.py +++ b/tests/test_url_generator.py @@ -322,6 +322,209 @@ def test_raises_an_exception_if_user_passes_both_filing_category_and_single_form ("CPV", "E8"), # Cape Verde ("CYM", "E9"), # Cayman Islands ("CAF", "F0"), # Central African Republic + ("TCD", "F2"), # Chad + ("CHL", "F3"), # Chile + ("CHN", "F4"), # China + ("CXR", "F6"), # Christmas Island + ("CCK", "F7"), # Cocos (Keeling) Islands + ("COL", "F8"), # Colombia + ("COM", "F9"), # Comoros + ("COG", "G0"), # Congo + ("COD", "Y3"), # Congo, Democratic Republic of the + ("COK", "G1"), # Cook Islands + ("CRI", "G2"), # Costa Rica + ("CIV", "L7"), # Cote d'Ivoire + ("HRV", "1M"), # Croatia + ("CUB", "G3"), # Cuba + ("CYP", "G4"), # Cyprus + ("CZE", "2N"), # Czech Republic + ("DNK", "G7"), # Denmark + ("DJI", "1G"), # Djibouti + ("DMA", "G9"), # Dominica + ("DOM", "D8"), # Dominican Republic + ("ECU", "H1"), # Ecuador + ("EGY", "H2"), # Egypt + ("SLV", "H3"), # El Salvador + ("GNQ", "H4"), # Equatorial Guinea + ("ERI", "1J"), # Eritrea + ("EST", "1H"), # Estonia + ("ETH", "H5"), # Ethiopia + ("FLK", "H7"), # Falkland Islands (Malvinas) + ("FRO", "H6"), # Faroe Islands + ("FJI", "H8"), # Fiji + ("FIN", "H9"), # Finland + ("FRA", "I0"), # France + ("GUF", "I3"), # French Guiana + ("PYF", "I4"), # French Polynesia + ("ATF", "2C"), # French Southern Territories + ("GAB", "I5"), # Gabon + ("GMB", "I6"), # Gambia + ("GEO", "2Q"), # Georgia + ("DEU", "2M"), # Germany + ("GHA", "J0"), # Ghana + ("GIB", "J1"), # Gibraltar + ("GRC", "J3"), # Greece + ("GRL", "J4"), # Greenland + ("GRD", "J5"), # Grenada + ("GLP", "J6"), # Guadeloupe + ("GUM", "GU"), # Guam + ("GTM", "J8"), # Guatemala + ("GGY", "Y7"), # Guernsey + ("GIN", "J9"), # Guinea + ("GNB", "S0"), # Guinea-Bissau + ("GUY", "K0"), # Guyana + ("HTI", "K1"), # Haiti + ("HMD", "K4"), # Heard Island and McDonald Islands + ("VAT", "X4"), # Holy See (Vatican City State) + ("HND", "K2"), # Honduras + ("HKG", "K3"), # Hong Kong + ("HUN", "K5"), # Hungary + ("ISL", "K6"), # Iceland + ("IND", "K7"), # India + ("IDN", "K8"), # Indonesia + ("IRN", "K9"), # Iran + ("IRQ", "L0"), # Iraq + ("IRL", "L2"), # Ireland + ("IMN", "Y8"), # Isle of Man + ("ISR", "L3"), # Israel + ("ITA", "L6"), # Italy + ("JAM", "L8"), # Jamaica + ("JPN", "M0"), # Japan + ("JEY", "Y9"), # Jersey + ("JOR", "M2"), # Jordan + ("KAZ", "1P"), # Kazakhstan + ("KEN", "M3"), # Kenya + ("KIR", "J2"), # Kiribati + ("PRK", "M4"), # Korea, Democratic People's Republic of + ("KOR", "M5"), # Korea, Republic of + ("KWT", "M6"), # Kuwait + ("KGZ", "1N"), # Kyrgyzstan + ("LAO", "M7"), # Lao People's Democratic Republic + ("LVA", "1R"), # Latvia + ("LBN", "M8"), # Lebanon + ("LSO", "M9"), # Lesotho + ("LBR", "N0"), # Liberia + ("LBY", "N1"), # Libya + ("LIE", "N2"), # Liechtenstein + ("LTU", "1Q"), # Lithuania + ("LUX", "N4"), # Luxembourg + ("MAC", "N5"), # Macao + ("MKD", "1U"), # Macedonia + ("MDG", "N6"), # Madagascar + ("MWI", "N7"), # Malawi + ("MYS", "N8"), # Malaysia + ("MDV", "N9"), # Maldives + ("MLI", "O0"), # Mali + ("MLT", "O1"), # Malta + ("MHL", "1T"), # Marshall Islands + ("MTQ", "O2"), # Martinique + ("MRT", "O3"), # Mauritania + ("MUS", "O4"), # Mauritius + ("MYT", "2P"), # Mayotte + ("MEX", "O5"), # Mexico + ("FSM", "1K"), # Micronesia, Federated States of + ("MDA", "1S"), # Moldova + ("MCO", "O9"), # Monaco + ("MNG", "P0"), # Mongolia + ("MNE", "Z5"), # Montenegro + ("MSR", "P1"), # Montserrat + ("MAR", "P2"), # Morocco + ("MOZ", "P3"), # Mozambique + ("MMR", "E1"), # Myanmar + ("NAM", "T6"), # Namibia + ("NRU", "P5"), # Nauru + ("NPL", "P6"), # Nepal + ("NLD", "P7"), # Netherlands + ("ANT", "P8"), # Netherlands Antilles + ("NCL", "1W"), # New Caledonia + ("NZL", "Q2"), # New Zealand + ("NIC", "Q3"), # Nicaragua + ("NER", "Q4"), # Niger + ("NGA", "Q5"), # Nigeria + ("NIU", "Q6"), # Niue + ("NFK", "Q7"), # Norfolk Island + ("MNP", "1V"), # Northern Mariana Islands + ("NOR", "Q8"), # Norway + ("OMN", "P4"), # Oman + ("PAK", "R0"), # Pakistan + ("PLW", "1Y"), # Palau + ("PSE", "1X"), # Palestinian Territory + ("PAN", "R1"), # Panama + ("PNG", "R2"), # Papua New Guinea + ("PRY", "R4"), # Paraguay + ("PER", "R5"), # Peru + ("PHL", "R6"), # Philippines + ("PCN", "R8"), # Pitcairn + ("POL", "R9"), # Poland + ("PRT", "S1"), # Portugal + ("PRI", "PR"), # Puerto Rico + ("QAT", "S3"), # Qatar + ("REU", "S4"), # Reunion + ("ROU", "S5"), # Romania + ("RUS", "1Z"), # Russian Federation + ("RWA", "S6"), # Rwanda + ("BLM", "Z0"), # Saint Barthelemy + ("SHN", "U8"), # Saint Helena + ("KNA", "U7"), # Saint Kitts and Nevis + ("LCA", "U9"), # Saint Lucia + ("MAF", "Z1"), # Saint Martin + ("SPM", "V0"), # Saint Pierre and Miquelon + ("VCT", "V1"), # Saint Vincent and the Grenadines + ("WSM", "Y0"), # Samoa + ("SMR", "S8"), # San Marino + ("STP", "S9"), # Sao Tome and Principe + ("SAU", "T0"), # Saudi Arabia + ("SEN", "T1"), # Senegal + ("SRB", "Z2"), # Serbia + ("SYC", "T2"), # Seychelles + ("SLE", "T8"), # Sierra Leone + ("SGP", "U0"), # Singapore + ("SVK", "2B"), # Slovakia + ("SVN", "2A"), # Slovenia + ("SLB", "D7"), # Solomon Islands + ("SOM", "U1"), # Somalia + ("ZAF", "T3"), # South Africa + ("SGS", "1L"), # South Georgia and the South Sandwich Islands + ("ESP", "U3"), # Spain + ("LKA", "F1"), # Sri Lanka + ("SDN", "V2"), # Sudan + ("SUR", "V3"), # Suriname + ("SJM", "L9"), # Svalbard and Jan Mayen + ("SWZ", "V6"), # Kingdom of Eswatini (Formerly Swaziland) + ("SWE", "V7"), # Sweden + ("CHE", "V8"), # Switzerland + ("SYR", "V9"), # Syrian Arab Republic (Syria) + ("TWN", "F5"), # Taiwan + ("TJK", "2D"), # Tajikistan + ("THA", "W1"), # Thailand + ("TLS", "Z3"), # Timor-Leste + ("TGO", "W2"), # Togo + ("TKL", "W3"), # Tokelau + ("TON", "W4"), # Tonga + ("TTO", "W5"), # Trinidad and Tobago + ("TUN", "W6"), # Tunisia + ("TUR", "W8"), # Turkey + ("TKM", "2E"), # Turkmenistan + ("TCA", "W7"), # Turks and Caicos Islands + ("TUV", "2G"), # Tuvalu + ("UGA", "W9"), # Uganda + ("UKR", "2H"), # Ukraine + ("ARE", "C0"), # United Arab Emirates + ("GBR", "X0"), # United Kingdom + ("UMI", "2J"), # United States Minor Outlying Islands + ("URY", "X3"), # Uruguay + ("UZB", "2K"), # Uzbekistan + ("VUT", "2L"), # Vanuatu + ("VEN", "X5"), # Venezuela + ("VNM", "Q1"), # Vietnam + ("VGB", "D8"), # British Virgin Islands + ("VIR", "VI"), # U.S. Virgin Islands + ("WLF", "X8"), # Wallis and Futuna + ("ESH", "Y1"), # Western Sahara + ("YEM", "T7"), # Yemen + ("ZMB", "Y4"), # Zambia + ("ZWE", "Y5"), # Zimbabwe + ("XX", "XX"), # Unknown ], ) def test_should_correctly_generate_search_url_for_principal_executive_office_in( From 41ee9bb2444fec6a2aaffcfcfe24d0c22e2dc02c Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Sun, 18 Aug 2024 21:15:34 +0000 Subject: [PATCH 16/18] Add test for invalid peo_in --- edgar_tool/url_generator.py | 15 +++++++++++++-- tests/test_url_generator.py | 14 ++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/edgar_tool/url_generator.py b/edgar_tool/url_generator.py index 181c8bb..61d3bfe 100644 --- a/edgar_tool/url_generator.py +++ b/edgar_tool/url_generator.py @@ -67,8 +67,19 @@ def __init__(self, **query_args: SearchQueryKwargs): self.date_range_select = date_range_select self.start_date = start_date self.end_date = end_date - self.inc_in = query_args.get("inc_in") - self.peo_in = query_args.get("peo_in") + + peo_in = query_args.get("peo_in") + if peo_in and peo_in not in PEO_IN_AND_INC_IN_TO_SEC_FORM_ID: + raise ValueError( + ( + "Invalid location code. " + "Please provide a valid 2-letter state abbreviation, " + "3-letter country code, or 'XX' for unknown." + ) + ) + inc_in = query_args.get("inc_in") + self.inc_in = inc_in + self.peo_in = peo_in @property def keywords(self): diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py index ad16902..77e5f45 100644 --- a/tests/test_url_generator.py +++ b/tests/test_url_generator.py @@ -544,4 +544,18 @@ def test_should_correctly_generate_search_url_for_principal_executive_office_in( assert actual_url == expected_url +def test_should_raise_exception_if_location_code_invalid(): + # GIVEN + expected_error_msg = ( + "Invalid location code. " + "Please provide a valid 2-letter state abbreviation, " + "3-letter country code, or 'XX' for unknown." + ) + test_kwargs = {"keywords": ["a"], "peo_in": "SUN"} + + # WHEN / THEN + with pytest.raises(ValueError, match=expected_error_msg): + url_generator.generate_search_url_for_kwargs(test_kwargs) + + # TODO: Test incorporated in parameters. From f3204b8b4151985d125ab31b086a139660035ec6 Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Sat, 7 Sep 2024 23:06:48 +0000 Subject: [PATCH 17/18] Add tests for incorportated in --- edgar_tool/url_generator.py | 18 ++++++++-- tests/test_url_generator.py | 67 +++++++++++++++++++++++++------------ 2 files changed, 62 insertions(+), 23 deletions(-) diff --git a/edgar_tool/url_generator.py b/edgar_tool/url_generator.py index 61d3bfe..0775782 100644 --- a/edgar_tool/url_generator.py +++ b/edgar_tool/url_generator.py @@ -69,7 +69,17 @@ def __init__(self, **query_args: SearchQueryKwargs): self.end_date = end_date peo_in = query_args.get("peo_in") - if peo_in and peo_in not in PEO_IN_AND_INC_IN_TO_SEC_FORM_ID: + inc_in = query_args.get("inc_in") + if peo_in and inc_in: + raise ValueError( + "Cannot specify both peo_in and inc_in. Please choose one or the other." + ) + if ( + peo_in + and peo_in not in PEO_IN_AND_INC_IN_TO_SEC_FORM_ID + or inc_in + and inc_in not in PEO_IN_AND_INC_IN_TO_SEC_FORM_ID + ): raise ValueError( ( "Invalid location code. " @@ -77,7 +87,6 @@ def __init__(self, **query_args: SearchQueryKwargs): "3-letter country code, or 'XX' for unknown." ) ) - inc_in = query_args.get("inc_in") self.inc_in = inc_in self.peo_in = peo_in @@ -141,6 +150,11 @@ def generate_search_url_for_kwargs(search_kwargs: SearchQueryKwargs) -> str: query_params["locationCode"] = PEO_IN_AND_INC_IN_TO_SEC_FORM_ID[ validated_params.peo_in ] + elif validated_params.inc_in: + query_params["locationType"] = "incorporated" + query_params["locationCode"] = PEO_IN_AND_INC_IN_TO_SEC_FORM_ID[ + validated_params.inc_in + ] encoded_params = parse.urlencode( query_params, doseq=True, encoding="utf-8", quote_via=parse.quote ) diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py index 77e5f45..a469dbe 100644 --- a/tests/test_url_generator.py +++ b/tests/test_url_generator.py @@ -213,7 +213,7 @@ def test_raises_an_exception_if_user_passes_both_filing_category_and_single_form @pytest.mark.parametrize( - "peo_in, expected_location_code", + "abbreviation, expected_location_code", [ # US States - All use 2-letter state & territory abbreviations (ISO 3166-2) ("AL", "AL"), # Alabama @@ -527,35 +527,60 @@ def test_raises_an_exception_if_user_passes_both_filing_category_and_single_form ("XX", "XX"), # Unknown ], ) -def test_should_correctly_generate_search_url_for_principal_executive_office_in( - peo_in, expected_location_code -): - # GIVEN - expected_url = ( - f"https://www.sec.gov/edgar/search/#/q=a&locationCode={expected_location_code}" - ) - - # WHEN - actual_url = url_generator.generate_search_url_for_kwargs( - {"keywords": ["a"], "peo_in": peo_in} - ) - - # THEN - assert actual_url == expected_url - - -def test_should_raise_exception_if_location_code_invalid(): +class TestPeoInAndIncIn: + def test_should_correctly_generate_search_url_for_peo_in( + self, abbreviation, expected_location_code + ): + # GIVEN + expected_url = f"https://www.sec.gov/edgar/search/#/q=a&locationCode={expected_location_code}" + + # WHEN + actual_url = url_generator.generate_search_url_for_kwargs( + {"keywords": ["a"], "peo_in": abbreviation} + ) + + # THEN + assert actual_url == expected_url + + def test_should_correctly_generate_search_url_for_inc_in( + self, abbreviation, expected_location_code + ): + # GIVEN + expected_url = f"https://www.sec.gov/edgar/search/#/q=a&locationType=incorporated&locationCode={expected_location_code}" + + # WHEN + actual_url = url_generator.generate_search_url_for_kwargs( + {"keywords": ["a"], "inc_in": abbreviation} + ) + + # THEN + assert actual_url == expected_url + + +@pytest.mark.parametrize("key", ["peo_in", "inc_in"]) +def test_should_raise_exception_if_location_code_invalid(key): # GIVEN expected_error_msg = ( "Invalid location code. " "Please provide a valid 2-letter state abbreviation, " "3-letter country code, or 'XX' for unknown." ) - test_kwargs = {"keywords": ["a"], "peo_in": "SUN"} + soviet_union = "SUN" + test_kwargs = {"keywords": ["a"], key: soviet_union} # WHEN / THEN with pytest.raises(ValueError, match=expected_error_msg): url_generator.generate_search_url_for_kwargs(test_kwargs) -# TODO: Test incorporated in parameters. +def test_should_raise_exception_if_both_peo_in_and_inc_in(): + # GIVEN + expected_error_msg = ( + "Cannot specify both peo_in and inc_in. Please choose one or the other." + ) + + test_kwargs = {"keywords": ["a"], "peo_in": "CA", "inc_in": "CA"} + + # WHEN / THEN + with pytest.raises(ValueError, match=expected_error_msg): + url_generator.generate_search_url_for_kwargs(test_kwargs) From c4e0163b8b7de0700b069a39eab6a6fa66ba1043 Mon Sep 17 00:00:00 2001 From: Jordan Gillard Date: Sun, 8 Sep 2024 00:06:09 +0000 Subject: [PATCH 18/18] Update url_generator to use EDGAR API URL --- edgar_tool/url_generator.py | 5 ++--- tests/test_url_generator.py | 16 +++++++--------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/edgar_tool/url_generator.py b/edgar_tool/url_generator.py index 0775782..1b47f46 100644 --- a/edgar_tool/url_generator.py +++ b/edgar_tool/url_generator.py @@ -2,7 +2,7 @@ from typing import Literal, TypedDict from urllib import parse -from edgar_tool.constants import PEO_IN_AND_INC_IN_TO_SEC_FORM_ID +from edgar_tool.constants import PEO_IN_AND_INC_IN_TO_SEC_FORM_ID, TEXT_SEARCH_BASE_URL class SearchQueryKwargs(TypedDict, total=False): @@ -123,7 +123,6 @@ def filing_category(self): def generate_search_url_for_kwargs(search_kwargs: SearchQueryKwargs) -> str: - base_url = "https://www.sec.gov/edgar/search/#/" validated_params = _ValidSearchParams(**search_kwargs) query_params = { "q": validated_params.keywords, @@ -158,4 +157,4 @@ def generate_search_url_for_kwargs(search_kwargs: SearchQueryKwargs) -> str: encoded_params = parse.urlencode( query_params, doseq=True, encoding="utf-8", quote_via=parse.quote ) - return parse.urljoin(base=base_url, url=encoded_params, allow_fragments=False) + return TEXT_SEARCH_BASE_URL + encoded_params diff --git a/tests/test_url_generator.py b/tests/test_url_generator.py index a469dbe..27efb9f 100644 --- a/tests/test_url_generator.py +++ b/tests/test_url_generator.py @@ -26,7 +26,7 @@ def test_should_correctly_generate_search_url_for_single_word(): produces the correct search URL""" # GIVEN keywords = ["10-K"] - expected_url = "https://www.sec.gov/edgar/search/#/q=10-K" + expected_url = f"https://efts.sec.gov/LATEST/search-index?q=10-K" # WHEN actual_url = url_generator.generate_search_url_for_kwargs({"keywords": keywords}) @@ -39,7 +39,7 @@ def test_should_correctly_generate_search_url_for_exact_phrase(): # GIVEN keywords = ["Insider trading report"] expected_url = ( - "https://www.sec.gov/edgar/search/#/q=%22Insider%20trading%20report%22" + "https://efts.sec.gov/LATEST/search-index?q=%22Insider%20trading%20report%22" ) # WHEN @@ -122,9 +122,7 @@ def test_generates_correct_url_for_date_ranges(date_kwargs, url_ending): """Tests that various date range options are correctly translated into the seach URL.""" # GIVEN - expected_url = ( - f"https://www.sec.gov/edgar/search/#/q=%22Ford%20Motor%20Co%22{url_ending}" - ) + expected_url = f"https://efts.sec.gov/LATEST/search-index?q=%22Ford%20Motor%20Co%22{url_ending}" test_kwargs = {**{"keywords": ["Ford Motor Co"]}, **date_kwargs} # WHEN @@ -153,7 +151,7 @@ def test_generates_correct_url_for_date_ranges(date_kwargs, url_ending): ) def test_generates_correct_url_for_filing_category(filing_category, url_ending): # GIVEN - expected_url = f"https://www.sec.gov/edgar/search/#/q=Ignore{url_ending}" + expected_url = f"https://efts.sec.gov/LATEST/search-index?q=Ignore{url_ending}" test_kwargs = {"keywords": ["Ignore"], "filing_category": filing_category} # WHEN @@ -177,7 +175,7 @@ def test_generates_correct_url_for_filing_category(filing_category, url_ending): def test_generates_correct_url_for_single_forms(single_forms, url_ending): # GIVEN expected_url = ( - f"https://www.sec.gov/edgar/search/#/q=Ignore&category=custom{url_ending}" + f"https://efts.sec.gov/LATEST/search-index?q=Ignore&category=custom{url_ending}" ) test_kwargs = {"keywords": ["Ignore"], "single_forms": single_forms} @@ -532,7 +530,7 @@ def test_should_correctly_generate_search_url_for_peo_in( self, abbreviation, expected_location_code ): # GIVEN - expected_url = f"https://www.sec.gov/edgar/search/#/q=a&locationCode={expected_location_code}" + expected_url = f"https://efts.sec.gov/LATEST/search-index?q=a&locationCode={expected_location_code}" # WHEN actual_url = url_generator.generate_search_url_for_kwargs( @@ -546,7 +544,7 @@ def test_should_correctly_generate_search_url_for_inc_in( self, abbreviation, expected_location_code ): # GIVEN - expected_url = f"https://www.sec.gov/edgar/search/#/q=a&locationType=incorporated&locationCode={expected_location_code}" + expected_url = f"https://efts.sec.gov/LATEST/search-index?q=a&locationType=incorporated&locationCode={expected_location_code}" # WHEN actual_url = url_generator.generate_search_url_for_kwargs(