From 3c71a179870e670f12640c053b6d34765f53e9b7 Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Fri, 15 Mar 2024 12:03:24 -0500 Subject: [PATCH 01/13] Fix issues flagged by bandit and flake8. --- .bandit.yml | 7 +- .flake8 | 6 +- .../scripts/populateCountiesCities/cities.py | 48 ++++++-- .../populateCountiesCities/counties.py | 32 ++++- .../scripts/populateCountiesCities/main.py | 30 +++++ backend/worker/__init__.py | 5 + backend/worker/mitmproxy_sign_requests.py | 113 ++++++++++-------- backend/worker/requirements.txt | 1 + .../worker/test_mitmproxy_sign_requests.py | 21 +++- .../worker/webscraper/webscraper/__init__.py | 5 + backend/worker/webscraper/webscraper/items.py | 11 +- .../webscraper/webscraper/middlewares.py | 73 +++++++++-- .../worker/webscraper/webscraper/pipelines.py | 29 ++++- .../worker/webscraper/webscraper/settings.py | 19 +-- .../webscraper/webscraper/spiders/__init__.py | 9 +- .../webscraper/spiders/main_spider.py | 46 ++++++- 16 files changed, 354 insertions(+), 101 deletions(-) diff --git a/.bandit.yml b/.bandit.yml index ab3cb21e..c8cf0312 100644 --- a/.bandit.yml +++ b/.bandit.yml @@ -6,8 +6,9 @@ # If `tests` is empty, all tests are considered included. tests: -# - B101 -# - B102 skips: -# - B101 # skip "assert used" check since assertions are required in pytests + - B101 # skip "assert used" check since assertions are required in pytests + +exclude: + - '**/test_*.py' diff --git a/.flake8 b/.flake8 index 92ff8268..869d863a 100644 --- a/.flake8 +++ b/.flake8 @@ -21,5 +21,9 @@ select = C,D,E,F,W,B,B950 # Also ignore flake8's warning about line breaks before binary # operators. It no longer agrees with PEP8. See, for example, here: # https://github.com/ambv/black/issues/21. Guido agrees here: -# https://github.com/python/peps/commit/c59c4376ad233a62ca4b3a6060c81368bd21e85b. +# https://github.com/python/peps/commit/c59c4376ad233a62git ca4b3a6060c81368bd21e85b. ignore = E501,W503 +# Ignore D100 and D103, which check for docstrings in modules and functions, in all test files +per-file-ignores = + # Ignore D100 and D103 in all test files + */test_*.py: D100, D103 diff --git a/backend/scripts/populateCountiesCities/cities.py b/backend/scripts/populateCountiesCities/cities.py index cd720f42..8adc3c4e 100644 --- a/backend/scripts/populateCountiesCities/cities.py +++ b/backend/scripts/populateCountiesCities/cities.py @@ -1,18 +1,44 @@ -import pandas as pd -import requests -from bs4 import BeautifulSoup -import time -import re +""" +This module contains the script for populating cities data. + +It includes functions for parsing titles, pulling cities data from Wikipedia, +and writing the data to a CSV file. +""" + +# Standard Python Libraries import json +import re +import time from urllib.parse import unquote +# Third-Party Libraries +from bs4 import BeautifulSoup +import pandas as pd +import requests + def title_parse(title): + """ + Parse the title by unquoting it. + + Args: + title (str): The title to be parsed. + + Returns: + str: The parsed title. + """ title = unquote(title) return title def pull_cities(): + """ + Process and pull cities data from Wikipedia. + + This function reads the Wikipedia US cities data from a JSON file, processes each entry, + fetches the corresponding Wikipedia page, parses the page to extract city, county, and URL information, + and writes the data to a CSV file. + """ print("Processing Cities...") with open("wikipedia_US_cities.json") as f: wikipedia_us_city_data = json.load(f) @@ -23,7 +49,10 @@ def pull_cities(): print(entry["name"]) # get the response in the form of html wikiurl = "https://en.wikipedia.org/wiki/" + entry["url"] - response = requests.get(wikiurl) + try: + response = requests.get(wikiurl, timeout=5) + except requests.exceptions.Timeout: + print("The request timed out") # parse data from the html into a beautifulsoup object soup = BeautifulSoup(response.text, "html.parser") @@ -52,7 +81,9 @@ def pull_cities(): if "," in link.get("title"): county_pieces = link.get("title").split(",") # OPEN WIKIPEDIA PAGE UP - x = requests.get("https://en.wikipedia.org/" + link.get("href")) + x = requests.get( + "https://en.wikipedia.org/" + link.get("href"), timeout=5 + ) # PULL COUNTY OR PARISH FROM WIKIPEDIA PAGE county_parish_matches = re.findall( @@ -85,7 +116,8 @@ def pull_cities(): } ) time.sleep(1) - except: + except Exception as e: + print(f"Error: {e}") pass df = pd.DataFrame(holding_pen, columns=["State", "County", "City", "URL"]) diff --git a/backend/scripts/populateCountiesCities/counties.py b/backend/scripts/populateCountiesCities/counties.py index 64df0f38..34823f8c 100644 --- a/backend/scripts/populateCountiesCities/counties.py +++ b/backend/scripts/populateCountiesCities/counties.py @@ -1,16 +1,35 @@ +""" +This module contains the script for populating counties data. + +It includes functions for pulling counties data from Wikipedia, +and writing the data to a CSV file. +""" + +# Standard Python Libraries +import re +import time + +# Third-Party Libraries +from bs4 import BeautifulSoup import pandas as pd import requests -from bs4 import BeautifulSoup -import time -import re def pull_counties(): + """ + Process and pull counties data from Wikipedia. + + This function fetches the Wikipedia page for the list of United States counties, + parses the page to extract county, state, and URL information, + and writes the data to a CSV file. + """ print("Processing Counties...") # get the response in the form of html wikiurl = "https://en.wikipedia.org/wiki/List_of_United_States_counties_and_county_equivalents" - table_class = "wikitable sortable jquery-tablesorter" - response = requests.get(wikiurl) + try: + response = requests.get(wikiurl, timeout=5) + except requests.exceptions.Timeout: + print("The request timed out") # parse data from the html into a beautifulsoup object soup = BeautifulSoup(response.text, "html.parser") @@ -24,7 +43,7 @@ def pull_counties(): try: county_pieces = link.get("title").split(", ") # OPEN WIKIPEDIA PAGE UP - x = requests.get("https://en.wikipedia.org/" + link.get("href")) + x = requests.get("https://en.wikipedia.org/" + link.get("href"), timeout=5) # PULL WEBSITE FROM WIKIPEDIA PAGE w = re.findall( @@ -43,6 +62,7 @@ def pull_counties(): } ) except Exception as e: + print(f"Error: {e}") pass time.sleep(1) diff --git a/backend/scripts/populateCountiesCities/main.py b/backend/scripts/populateCountiesCities/main.py index dc86edb1..20296ce9 100644 --- a/backend/scripts/populateCountiesCities/main.py +++ b/backend/scripts/populateCountiesCities/main.py @@ -1,3 +1,9 @@ +""" +This module contains the main script for populating counties and cities data. + +It includes commands for processing cities and counties data separately or both at once. +""" + import typer import cities import counties @@ -7,16 +13,40 @@ @app.command() def process_cities(): + """ + Process and pull cities data from Wikipedia. + + This function calls the pull_cities function from the cities module, + which reads the Wikipedia US cities data from a JSON file, processes each entry, + fetches the corresponding Wikipedia page, parses the page to extract city, county, and URL information, + and writes the data to a CSV file. + """ cities.pull_cities() @app.command() def process_counties(): + """ + Process and pull counties data from Wikipedia. + + This function calls the pull_counties function from the counties module, + which fetches the Wikipedia page for the list of United States counties, + parses the page to extract county, state, and URL information, + and writes the data to a CSV file. + """ counties.pull_counties() @app.command() def process_both(): + """ + Process and pull both cities and counties data from Wikipedia. + + This function calls both the pull_cities function from the cities module and the pull_counties function from the counties module, + which fetches the Wikipedia pages for the list of United States cities and counties, + parses the pages to extract city, county, state, and URL information, + and writes the data to CSV files. + """ counties.pull_counties() cities.pull_cities() diff --git a/backend/worker/__init__.py b/backend/worker/__init__.py index e69de29b..088f05b8 100644 --- a/backend/worker/__init__.py +++ b/backend/worker/__init__.py @@ -0,0 +1,5 @@ +""" +This package contains the worker tasks for the backend. + +It includes modules for processing data, interacting with databases, and other backend tasks. +""" diff --git a/backend/worker/mitmproxy_sign_requests.py b/backend/worker/mitmproxy_sign_requests.py index 691a5afb..4967b173 100644 --- a/backend/worker/mitmproxy_sign_requests.py +++ b/backend/worker/mitmproxy_sign_requests.py @@ -1,16 +1,35 @@ -""" -mitmproxy addon that signs requests and adds a Crossfeed-specific user agent. -""" -from mitmproxy import http, ctx +"""mitmproxy addon that signs requests and adds a Crossfeed-specific user agent.""" +from mitmproxy import http import os import requests -import json import traceback from requests_http_signature import HTTPSignatureHeaderAuth class SignRequests: + """ + A class used to sign HTTP requests and add a Crossfeed-specific user agent. + + This class is used as a mitmproxy addon. It signs the HTTP requests using the provided private key and adds a user agent to the request headers. + + Attributes: + key_id (str): The key ID used for signing the requests. + private_key (str): The private key used for signing the requests. + public_key (str): The public key used for verifying the signature. + user_agent (str): The user agent to be added to the request headers. + signature_auth (HTTPSignatureHeaderAuth): The HTTPSignatureHeaderAuth instance used for signing the requests. + """ + def __init__(self, key_id="", public_key="", private_key="", user_agent=""): + """ + Initialize the SignRequests instance. + + Args: + key_id (str, optional): The key ID used for signing the requests. Defaults to "". + public_key (str, optional): The public key used for verifying the signature. Defaults to "". + private_key (str, optional): The private key used for signing the requests. Defaults to "". + user_agent (str, optional): The user agent to be added to the request headers. Defaults to "". + """ self.key_id = key_id self.private_key = private_key self.public_key = public_key @@ -20,9 +39,30 @@ def __init__(self, key_id="", public_key="", private_key="", user_agent=""): ) def key_resolver(self, key_id, algorithm): + """ + Resolve the key for the given key_id and algorithm. + + Args: + key_id (str): The key ID used for signing the requests. + algorithm (str): The algorithm used for signing the requests. + + Returns: + bytes: The public key encoded in bytes. + """ return self.public_key.encode() def verify_signature(self, method, url, date, signature): + """ + Verify the signature of the HTTP request. + + Args: + method (str): The HTTP method of the request. + url (str): The URL of the request. + date (str): The date when the request was made. + signature (str): The signature of the request. + + This method uses the HTTPSignatureHeaderAuth's verify method to verify the signature of the request. + """ HTTPSignatureHeaderAuth.verify( requests.Request( method=url, url=url, headers={"date": date, "Signature": signature} @@ -32,6 +72,17 @@ def verify_signature(self, method, url, date, signature): ) def request(self, flow): + """ + Process the HTTP request. + + This method adds a user agent to the request headers if one is provided. If a private key is provided, it signs the request using the HTTPSignatureHeaderAuth instance. + + Args: + flow (mitmproxy.http.HTTPFlow): The HTTP request/response flow. + + Raises: + Exception: If there is an error while processing the request, an exception is raised and a 500 response is returned. + """ try: if self.user_agent: flow.request.headers["User-Agent"] = self.user_agent @@ -57,47 +108,11 @@ def request(self, flow): ) -test = os.getenv("WORKER_TEST", None) is not None - -if test: - # This is a test RSA private key and not used in any deployed environment - # file deepcode ignore HardcodedNonCryptoSecret: - private_key = """-----BEGIN RSA PRIVATE KEY----- -MIICXgIBAAKBgQDCFENGw33yGihy92pDjZQhl0C36rPJj+CvfSC8+q28hxA161QF -NUd13wuCTUcq0Qd2qsBe/2hFyc2DCJJg0h1L78+6Z4UMR7EOcpfdUE9Hf3m/hs+F -UR45uBJeDK1HSFHD8bHKD6kv8FPGfJTotc+2xjJwoYi+1hqp1fIekaxsyQIDAQAB -AoGBAJR8ZkCUvx5kzv+utdl7T5MnordT1TvoXXJGXK7ZZ+UuvMNUCdN2QPc4sBiA -QWvLw1cSKt5DsKZ8UETpYPy8pPYnnDEz2dDYiaew9+xEpubyeW2oH4Zx71wqBtOK -kqwrXa/pzdpiucRRjk6vE6YY7EBBs/g7uanVpGibOVAEsqH1AkEA7DkjVH28WDUg -f1nqvfn2Kj6CT7nIcE3jGJsZZ7zlZmBmHFDONMLUrXR/Zm3pR5m0tCmBqa5RK95u -412jt1dPIwJBANJT3v8pnkth48bQo/fKel6uEYyboRtA5/uHuHkZ6FQF7OUkGogc -mSJluOdc5t6hI1VsLn0QZEjQZMEOWr+wKSMCQQCC4kXJEsHAve77oP6HtG/IiEn7 -kpyUXRNvFsDE0czpJJBvL/aRFUJxuRK91jhjC68sA7NsKMGg5OXb5I5Jj36xAkEA -gIT7aFOYBFwGgQAQkWNKLvySgKbAZRTeLBacpHMuQdl1DfdntvAyqpAZ0lY0RKmW -G6aFKaqQfOXKCyWoUiVknQJAXrlgySFci/2ueKlIE1QqIiLSZ8V8OlpFLRnb1pzI -7U1yQXnTAEFYM560yJlzUpOb1V4cScGd365tiSMvxLOvTA== ------END RSA PRIVATE KEY-----""" - - public_key = """-----BEGIN PUBLIC KEY----- -MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDCFENGw33yGihy92pDjZQhl0C3 -6rPJj+CvfSC8+q28hxA161QFNUd13wuCTUcq0Qd2qsBe/2hFyc2DCJJg0h1L78+6 -Z4UMR7EOcpfdUE9Hf3m/hs+FUR45uBJeDK1HSFHD8bHKD6kv8FPGfJTotc+2xjJw -oYi+1hqp1fIekaxsyQIDAQAB ------END PUBLIC KEY-----""" - addons = [ - SignRequests( - key_id="crossfeed", - public_key=public_key, - private_key=private_key, - user_agent="Crossfeed test user agent", - ) - ] -else: - addons = [ - SignRequests( - key_id="crossfeed", - public_key=os.getenv("WORKER_SIGNATURE_PUBLIC_KEY", ""), - private_key=os.getenv("WORKER_SIGNATURE_PRIVATE_KEY", ""), - user_agent=os.getenv("WORKER_USER_AGENT", ""), - ) - ] +addons = [ + SignRequests( + key_id="crossfeed", + public_key=os.getenv("WORKER_SIGNATURE_PUBLIC_KEY", ""), + private_key=os.getenv("WORKER_SIGNATURE_PRIVATE_KEY", ""), + user_agent=os.getenv("WORKER_USER_AGENT", ""), + ) +] diff --git a/backend/worker/requirements.txt b/backend/worker/requirements.txt index d362d5a5..0466899d 100644 --- a/backend/worker/requirements.txt +++ b/backend/worker/requirements.txt @@ -18,6 +18,7 @@ psycopg2-binary==2.9.5 pyproject_hooks==1.0.0 pytest==7.3.0 python-dateutil==2.8.2 +python-dotenv==1.0.1 pytz==2023.3 pytz-deprecation-shim==0.1.0.post0 regex==2023.3.23 diff --git a/backend/worker/test_mitmproxy_sign_requests.py b/backend/worker/test_mitmproxy_sign_requests.py index 766ca5fd..a5289425 100644 --- a/backend/worker/test_mitmproxy_sign_requests.py +++ b/backend/worker/test_mitmproxy_sign_requests.py @@ -1,4 +1,9 @@ -from mitmproxy import exceptions +""" +This module contains tests for the SignRequests class in the mitmproxy_sign_requests module. + +It includes tests for different scenarios such as when a user agent and signature are set, and when they are not set. +""" + from mitmproxy.test import tflow from mitmproxy.test import taddons from .mitmproxy_sign_requests import SignRequests @@ -29,13 +34,18 @@ def test_user_agent_and_signature(): + """ + This function tests the SignRequests class with a user agent and signature set. + + It creates an instance of the SignRequests class with a user agent and signature, makes a request, and verifies the signature. + """ sr = SignRequests( key_id="crossfeed", public_key=public_key, private_key=private_key, user_agent="custom user agent", ) - with taddons.context() as tctx: + with taddons.context(): f = tflow.tflow() f.request.headers["User-Agent"] = "original user agent" sr.request(f) @@ -49,8 +59,13 @@ def test_user_agent_and_signature(): def test_no_user_agent_or_signature_set(): + """ + This function tests the SignRequests class without a user agent and signature set. + + It creates an instance of the SignRequests class without a user agent and signature, makes a request, and checks that no user agent, date, or signature headers are set. + """ sr = SignRequests(key_id="", public_key="", private_key="", user_agent="") - with taddons.context() as tctx: + with taddons.context(): f = tflow.tflow() sr.request(f) assert "User-Agent" not in f.request.headers diff --git a/backend/worker/webscraper/webscraper/__init__.py b/backend/worker/webscraper/webscraper/__init__.py index e69de29b..61265cd2 100644 --- a/backend/worker/webscraper/webscraper/__init__.py +++ b/backend/worker/webscraper/webscraper/__init__.py @@ -0,0 +1,5 @@ +""" +This package contains modules and classes for web scraping. + +It includes modules for making HTTP requests, parsing HTML, and extracting data. +""" diff --git a/backend/worker/webscraper/webscraper/items.py b/backend/worker/webscraper/webscraper/items.py index a581a5fc..f7a4a8d6 100644 --- a/backend/worker/webscraper/webscraper/items.py +++ b/backend/worker/webscraper/webscraper/items.py @@ -1,6 +1,7 @@ -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html +""" +This module defines the items that the web scraper will extract from web pages. -import scrapy +Each item is represented by a class, and the fields of the class represent the data that the scraper will extract. +See documentation here for information on how to create models for your items: +https://docs.scrapy.org/en/latest/topics/items.html +""" diff --git a/backend/worker/webscraper/webscraper/middlewares.py b/backend/worker/webscraper/webscraper/middlewares.py index 3efe286a..0df4e887 100644 --- a/backend/worker/webscraper/webscraper/middlewares.py +++ b/backend/worker/webscraper/webscraper/middlewares.py @@ -1,27 +1,43 @@ -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html +""" +This module defines the middlewares for the web scraper. -from scrapy import signals +Middlewares are used to process incoming responses and outgoing requests and items. This module defines two types of middlewares: Spider Middleware and Downloader Middleware. The Spider Middleware processes responses before they reach the spider and processes items and requests after they have been processed by the spider. The Downloader Middleware processes requests before they are sent to the downloader and processes responses before they reach the Spider Middleware or the spider. +See documentation here: +https://docs.scrapy.org/en/latest/topics/spider-middleware.html +""" -# useful for handling different item types with a single interface -from itemadapter import is_item, ItemAdapter +from scrapy import signals class WebscraperSpiderMiddleware: + """ + This class defines the Spider Middleware for the web scraper. + + The Spider Middleware processes responses before they reach the spider and processes items and requests after they have been processed by the spider. + """ + # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): + """ + Create spiders using Scrapy. + + Connect the spider_opened method to the spider_opened signal. + """ # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): + """ + Process each response that goes through the spider middleware and into the spider. + + Return None or raise an exception. + """ # Called for each response that goes through the spider # middleware and into the spider. @@ -29,6 +45,11 @@ def process_spider_input(self, response, spider): return None def process_spider_output(self, response, result, spider): + """ + Process the results returned from the Spider, after it has processed the response. + + Return an iterable of Request, or item objects. + """ # Called with the results returned from the Spider, after # it has processed the response. @@ -37,6 +58,11 @@ def process_spider_output(self, response, result, spider): yield i def process_spider_exception(self, response, exception, spider): + """ + Handle exceptions raised by a spider or process_spider_input() method. + + This method should return either None or an iterable of Request or item objects. + """ # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. @@ -44,6 +70,11 @@ def process_spider_exception(self, response, exception, spider): pass def process_start_requests(self, start_requests, spider): + """ + Process the start requests of the spider. + + This method works similarly to the process_spider_output() method, except that it doesn’t have a response associated. It must return only requests (not items). + """ # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. @@ -53,22 +84,39 @@ def process_start_requests(self, start_requests, spider): yield r def spider_opened(self, spider): + """Log the name of the spider when opened.""" spider.logger.info("Spider opened: %s" % spider.name) class WebscraperDownloaderMiddleware: + """ + This class defines the Downloader Middleware for the web scraper. + + The Downloader Middleware processes requests before they are sent to the downloader and processes responses before they reach the Spider Middleware or the spider. + """ + # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): + """ + Create spiders using Scrapy. + + Connect the spider_opened method to the spider_opened signal. + """ # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): + """ + Process each request that goes through the downloader middleware. + + Must either return None, a Response object, a Request object, or raise IgnoreRequest. + """ # Called for each request that goes through the downloader # middleware. @@ -81,6 +129,11 @@ def process_request(self, request, spider): return None def process_response(self, request, response, spider): + """ + Process the response returned from the downloader. + + Must either return a Response object, a Request object, or raise IgnoreRequest. + """ # Called with the response returned from the downloader. # Must either; @@ -90,6 +143,11 @@ def process_response(self, request, response, spider): return response def process_exception(self, request, exception, spider): + """ + Handle exceptions raised by a download handler or a process_request() method. + + Must either return None, a Response object, a Request object. + """ # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. @@ -100,4 +158,5 @@ def process_exception(self, request, exception, spider): pass def spider_opened(self, spider): + """Log the name of the spider when it is opened.""" spider.logger.info("Spider opened: %s" % spider.name) diff --git a/backend/worker/webscraper/webscraper/pipelines.py b/backend/worker/webscraper/webscraper/pipelines.py index c9b63243..321bbdaf 100644 --- a/backend/worker/webscraper/webscraper/pipelines.py +++ b/backend/worker/webscraper/webscraper/pipelines.py @@ -1,18 +1,39 @@ +""" +This module contains the pipeline classes for the web scraper. + +The pipelines process the items returned by the spiders. +""" + from scrapy.exceptions import DropItem import json -import os -from io import BytesIO -from datetime import datetime class ExportFilePipeline: - """Prints file contents to the console.""" + """Print file contents to the console.""" def __init__(self, print=print): + """ + Initialize the ExportFilePipeline class. + + Args: + print (function, optional): A function to print the output. Defaults to print. + """ self.urls_seen = set() self.print = print def process_item(self, item, spider=None): + """ + Process each item that goes through the pipeline. + + If the item's URL has been seen before, it raises a DropItem exception. Otherwise, it prints the item and returns it. + + Args: + item (dict): The item to process. + spider (Spider, optional): The spider that produced the item. Defaults to None. + + Returns: + dict: The processed item. + """ if item["url"] in self.urls_seen: raise DropItem("Duplicate item found with url: %s" % item["url"]) self.urls_seen.add(item["url"]) diff --git a/backend/worker/webscraper/webscraper/settings.py b/backend/worker/webscraper/webscraper/settings.py index 40d5c9c5..2fa7d676 100644 --- a/backend/worker/webscraper/webscraper/settings.py +++ b/backend/worker/webscraper/webscraper/settings.py @@ -1,11 +1,14 @@ -# Scrapy settings for webscraper project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html +""" +Scrapy settings for the webscraper project. + +This module contains the settings for the webscraper project, including configurations for the spider, +downloader middleware, item pipelines, and more. +Additional settings are in the documentation: +https://docs.scrapy.org/en/latest/topics/settings.html +https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +https://docs.scrapy.org/en/latest/topics/spider-middleware.html +""" + import logging BOT_NAME = "webscraper" diff --git a/backend/worker/webscraper/webscraper/spiders/__init__.py b/backend/worker/webscraper/webscraper/spiders/__init__.py index ebd689ac..cce0fa8b 100644 --- a/backend/worker/webscraper/webscraper/spiders/__init__.py +++ b/backend/worker/webscraper/webscraper/spiders/__init__.py @@ -1,4 +1,5 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. +""" +This package contains the spiders for the webscraper project. + +Each spider is a Python class that defines how a certain site or a group of sites will be scraped. +""" diff --git a/backend/worker/webscraper/webscraper/spiders/main_spider.py b/backend/worker/webscraper/webscraper/spiders/main_spider.py index af3b2a86..0c061076 100644 --- a/backend/worker/webscraper/webscraper/spiders/main_spider.py +++ b/backend/worker/webscraper/webscraper/spiders/main_spider.py @@ -1,26 +1,66 @@ -import scrapy +""" +This module contains the MainSpider class for the webscraper project. + +The MainSpider class is a Scrapy spider that crawls and scrapes data from the specified start URLs. +""" + from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from urllib.parse import urlparse -import hashlib -import json class MainSpider(CrawlSpider): + """ + MainSpider is a Scrapy spider that crawls and scrapes data from the specified start URLs. + + It uses the LinkExtractor to follow links and the parse_item method to process the scraped data. + """ + name = "main" rules = (Rule(LinkExtractor(), callback="parse_item", follow=True),) def __init__(self, *args, **kwargs): + """ + Initialize the MainSpider class. + + It reads the start URLs from a file and sets the allowed domains based on these URLs. + + Args: + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + """ super().__init__(*args, **kwargs) with open(self.domains_file, "r") as f: self.start_urls = f.read().split("\n") self.allowed_domains = [urlparse(url).netloc for url in self.start_urls] def parse_start_url(self, response): + """ + Parse the start URL. + + This method gets called when the spider opens the start URL. It returns the result of the parse_item method. + + Args: + response (Response): The response object for the start URL. + + Returns: + dict: The result of the parse_item method. + """ return self.parse_item(response) def parse_item(self, response): + """ + Parse the response and extract the data. + + This method gets called for each response that the spider receives. It extracts the data from the response and returns it as a dictionary. + + Args: + response (Response): The response to parse. + + Returns: + dict: The extracted data. + """ try: body_decoded = response.body.decode() except UnicodeDecodeError: From 96988efd70b892cdb8aad491f07c26acdab84f37 Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Fri, 15 Mar 2024 13:07:41 -0500 Subject: [PATCH 02/13] Fix issues flagged by isort, pyupgrade, and requirements-txt-fixer. --- backend/scripts/populateCountiesCities/main.py | 3 ++- .../scripts/populateCountiesCities/requirements.txt | 2 +- backend/worker/mitmproxy_sign_requests.py | 7 +++++-- backend/worker/requirements.txt | 10 +++++----- backend/worker/test_mitmproxy_sign_requests.py | 5 +++-- backend/worker/webscraper/webscraper/middlewares.py | 7 +++---- backend/worker/webscraper/webscraper/pipelines.py | 5 ++++- backend/worker/webscraper/webscraper/settings.py | 1 + .../webscraper/webscraper/spiders/main_spider.py | 9 ++++++--- .../webscraper/spiders/test_main_spider.py | 12 ++++++++---- .../worker/webscraper/webscraper/test_pipelines.py | 8 ++++++-- 11 files changed, 44 insertions(+), 25 deletions(-) diff --git a/backend/scripts/populateCountiesCities/main.py b/backend/scripts/populateCountiesCities/main.py index 20296ce9..baf7de5b 100644 --- a/backend/scripts/populateCountiesCities/main.py +++ b/backend/scripts/populateCountiesCities/main.py @@ -4,9 +4,10 @@ It includes commands for processing cities and counties data separately or both at once. """ -import typer +# Third-Party Libraries import cities import counties +import typer app = typer.Typer() diff --git a/backend/scripts/populateCountiesCities/requirements.txt b/backend/scripts/populateCountiesCities/requirements.txt index b3e808d7..0e6f50f1 100644 --- a/backend/scripts/populateCountiesCities/requirements.txt +++ b/backend/scripts/populateCountiesCities/requirements.txt @@ -1,4 +1,4 @@ +beautifulsoup4==4.11.2 pandas==1.5.1 requests==2.28.2 -beautifulsoup4==4.11.2 typer==0.7.0 diff --git a/backend/worker/mitmproxy_sign_requests.py b/backend/worker/mitmproxy_sign_requests.py index 4967b173..a2c55022 100644 --- a/backend/worker/mitmproxy_sign_requests.py +++ b/backend/worker/mitmproxy_sign_requests.py @@ -1,8 +1,11 @@ """mitmproxy addon that signs requests and adds a Crossfeed-specific user agent.""" -from mitmproxy import http +# Standard Python Libraries import os -import requests import traceback + +# Third-Party Libraries +from mitmproxy import http +import requests from requests_http_signature import HTTPSignatureHeaderAuth diff --git a/backend/worker/requirements.txt b/backend/worker/requirements.txt index 0466899d..72e51965 100644 --- a/backend/worker/requirements.txt +++ b/backend/worker/requirements.txt @@ -5,9 +5,10 @@ click==8.1.3 dateparser==1.1.8 dnstwist==20230509 docopt==0.6.2 +git+https://github.com/LeapBeyond/scrubadub.git@d0e12c5d922631af3532d044196b05fb1b7c8c1c +git+https://github.com/mitmproxy/mitmproxy@e0e46f4 idna==3.4 joblib==1.2.0 -git+https://github.com/mitmproxy/mitmproxy@e0e46f4 mitmproxy_wireguard==0.1.23 numpy==1.24.3 pandas==2.1.4 @@ -24,15 +25,14 @@ pytz-deprecation-shim==0.1.0.post0 regex==2023.3.23 requests==2.31.0 requests-http-signature==0.2.0 -Scrapy==2.9.0 -git+https://github.com/LeapBeyond/scrubadub.git@d0e12c5d922631af3532d044196b05fb1b7c8c1c scikit-learn==1.2.2 +Scrapy==2.9.0 +setuptools==65.5.1 six==1.16.0 threadpoolctl==3.1.0 tomli==2.0.1 trustymail @ git+https://github.com/Matthew-Grayson/trustymail@production tzdata==2023.3 tzlocal==4.3 -yarg==0.1.9 wheel==0.38.1 -setuptools==65.5.1 +yarg==0.1.9 diff --git a/backend/worker/test_mitmproxy_sign_requests.py b/backend/worker/test_mitmproxy_sign_requests.py index a5289425..2ff660e3 100644 --- a/backend/worker/test_mitmproxy_sign_requests.py +++ b/backend/worker/test_mitmproxy_sign_requests.py @@ -4,8 +4,9 @@ It includes tests for different scenarios such as when a user agent and signature are set, and when they are not set. """ -from mitmproxy.test import tflow -from mitmproxy.test import taddons +# Third-Party Libraries +from mitmproxy.test import taddons, tflow + from .mitmproxy_sign_requests import SignRequests # This is a test RSA private key and not used in any deployed environment diff --git a/backend/worker/webscraper/webscraper/middlewares.py b/backend/worker/webscraper/webscraper/middlewares.py index 0df4e887..9f755168 100644 --- a/backend/worker/webscraper/webscraper/middlewares.py +++ b/backend/worker/webscraper/webscraper/middlewares.py @@ -6,6 +6,7 @@ https://docs.scrapy.org/en/latest/topics/spider-middleware.html """ +# Third-Party Libraries from scrapy import signals @@ -54,8 +55,7 @@ def process_spider_output(self, response, result, spider): # it has processed the response. # Must return an iterable of Request, or item objects. - for i in result: - yield i + yield from result def process_spider_exception(self, response, exception, spider): """ @@ -80,8 +80,7 @@ def process_start_requests(self, start_requests, spider): # that it doesn’t have a response associated. # Must return only requests (not items). - for r in start_requests: - yield r + yield from start_requests def spider_opened(self, spider): """Log the name of the spider when opened.""" diff --git a/backend/worker/webscraper/webscraper/pipelines.py b/backend/worker/webscraper/webscraper/pipelines.py index 321bbdaf..e6cd6631 100644 --- a/backend/worker/webscraper/webscraper/pipelines.py +++ b/backend/worker/webscraper/webscraper/pipelines.py @@ -4,9 +4,12 @@ The pipelines process the items returned by the spiders. """ -from scrapy.exceptions import DropItem +# Standard Python Libraries import json +# Third-Party Libraries +from scrapy.exceptions import DropItem + class ExportFilePipeline: """Print file contents to the console.""" diff --git a/backend/worker/webscraper/webscraper/settings.py b/backend/worker/webscraper/webscraper/settings.py index 2fa7d676..91769d75 100644 --- a/backend/worker/webscraper/webscraper/settings.py +++ b/backend/worker/webscraper/webscraper/settings.py @@ -9,6 +9,7 @@ https://docs.scrapy.org/en/latest/topics/spider-middleware.html """ +# Standard Python Libraries import logging BOT_NAME = "webscraper" diff --git a/backend/worker/webscraper/webscraper/spiders/main_spider.py b/backend/worker/webscraper/webscraper/spiders/main_spider.py index 0c061076..c4221a14 100644 --- a/backend/worker/webscraper/webscraper/spiders/main_spider.py +++ b/backend/worker/webscraper/webscraper/spiders/main_spider.py @@ -4,10 +4,13 @@ The MainSpider class is a Scrapy spider that crawls and scrapes data from the specified start URLs. """ -from scrapy.spiders import CrawlSpider, Rule -from scrapy.linkextractors import LinkExtractor +# Standard Python Libraries from urllib.parse import urlparse +# Third-Party Libraries +from scrapy.linkextractors import LinkExtractor +from scrapy.spiders import CrawlSpider, Rule + class MainSpider(CrawlSpider): """ @@ -31,7 +34,7 @@ def __init__(self, *args, **kwargs): **kwargs: Arbitrary keyword arguments. """ super().__init__(*args, **kwargs) - with open(self.domains_file, "r") as f: + with open(self.domains_file) as f: self.start_urls = f.read().split("\n") self.allowed_domains = [urlparse(url).netloc for url in self.start_urls] diff --git a/backend/worker/webscraper/webscraper/spiders/test_main_spider.py b/backend/worker/webscraper/webscraper/spiders/test_main_spider.py index f7d86199..89b87dfa 100644 --- a/backend/worker/webscraper/webscraper/spiders/test_main_spider.py +++ b/backend/worker/webscraper/webscraper/spiders/test_main_spider.py @@ -1,8 +1,12 @@ +# Standard Python Libraries +import json +from tempfile import NamedTemporaryFile + +# Third-Party Libraries import pytest +from scrapy.http import Request, Response + from .main_spider import MainSpider -from scrapy.http import Response, Request -from tempfile import NamedTemporaryFile -import json SAMPLE_HEADERS = { "Server": "Apache", @@ -34,7 +38,7 @@ def test_sample_website(spider): response = Response( url="https://www.cisa.gov", request=Request(url="https://www.cisa.gov"), - body="Hello world".encode(), + body=b"Hello world", headers=SAMPLE_HEADERS, ) results = list(spider.parse_item(response)) diff --git a/backend/worker/webscraper/webscraper/test_pipelines.py b/backend/worker/webscraper/webscraper/test_pipelines.py index ecde460f..5aecb15a 100644 --- a/backend/worker/webscraper/webscraper/test_pipelines.py +++ b/backend/worker/webscraper/webscraper/test_pipelines.py @@ -1,7 +1,11 @@ +# Standard Python Libraries +from unittest.mock import MagicMock + +# Third-Party Libraries import pytest -from .pipelines import ExportFilePipeline from scrapy.exceptions import DropItem -from unittest.mock import MagicMock + +from .pipelines import ExportFilePipeline @pytest.fixture From 7a8e551f0d589ba73fd46d98d1cf54f410571b98 Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Fri, 15 Mar 2024 13:30:24 -0500 Subject: [PATCH 03/13] Add types-requests to mypy dependencies in pre-commit config. --- .pre-commit-config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 30e9e9f9..eca2b544 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -103,6 +103,8 @@ repos: rev: v1.5.1 hooks: - id: mypy + additional_dependencies: + - types-requests - repo: https://github.com/asottile/pyupgrade rev: v3.10.1 hooks: From 0fae6d5867c44728c068077a0bde2e961cb47330 Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Fri, 15 Mar 2024 14:41:59 -0500 Subject: [PATCH 04/13] Delete hard-coded keys from mitmproxy files. --- .../worker/test_mitmproxy_sign_requests.py | 32 ++++--------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/backend/worker/test_mitmproxy_sign_requests.py b/backend/worker/test_mitmproxy_sign_requests.py index 2ff660e3..b2b23bc3 100644 --- a/backend/worker/test_mitmproxy_sign_requests.py +++ b/backend/worker/test_mitmproxy_sign_requests.py @@ -4,34 +4,16 @@ It includes tests for different scenarios such as when a user agent and signature are set, and when they are not set. """ +# Standard Python Libraries +import os + # Third-Party Libraries +from dotenv import load_dotenv from mitmproxy.test import taddons, tflow from .mitmproxy_sign_requests import SignRequests -# This is a test RSA private key and not used in any deployed environment -private_key = """-----BEGIN RSA PRIVATE KEY----- -MIICXgIBAAKBgQDCFENGw33yGihy92pDjZQhl0C36rPJj+CvfSC8+q28hxA161QF -NUd13wuCTUcq0Qd2qsBe/2hFyc2DCJJg0h1L78+6Z4UMR7EOcpfdUE9Hf3m/hs+F -UR45uBJeDK1HSFHD8bHKD6kv8FPGfJTotc+2xjJwoYi+1hqp1fIekaxsyQIDAQAB -AoGBAJR8ZkCUvx5kzv+utdl7T5MnordT1TvoXXJGXK7ZZ+UuvMNUCdN2QPc4sBiA -QWvLw1cSKt5DsKZ8UETpYPy8pPYnnDEz2dDYiaew9+xEpubyeW2oH4Zx71wqBtOK -kqwrXa/pzdpiucRRjk6vE6YY7EBBs/g7uanVpGibOVAEsqH1AkEA7DkjVH28WDUg -f1nqvfn2Kj6CT7nIcE3jGJsZZ7zlZmBmHFDONMLUrXR/Zm3pR5m0tCmBqa5RK95u -412jt1dPIwJBANJT3v8pnkth48bQo/fKel6uEYyboRtA5/uHuHkZ6FQF7OUkGogc -mSJluOdc5t6hI1VsLn0QZEjQZMEOWr+wKSMCQQCC4kXJEsHAve77oP6HtG/IiEn7 -kpyUXRNvFsDE0czpJJBvL/aRFUJxuRK91jhjC68sA7NsKMGg5OXb5I5Jj36xAkEA -gIT7aFOYBFwGgQAQkWNKLvySgKbAZRTeLBacpHMuQdl1DfdntvAyqpAZ0lY0RKmW -G6aFKaqQfOXKCyWoUiVknQJAXrlgySFci/2ueKlIE1QqIiLSZ8V8OlpFLRnb1pzI -7U1yQXnTAEFYM560yJlzUpOb1V4cScGd365tiSMvxLOvTA== ------END RSA PRIVATE KEY-----""" - -public_key = """-----BEGIN PUBLIC KEY----- -MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDCFENGw33yGihy92pDjZQhl0C3 -6rPJj+CvfSC8+q28hxA161QFNUd13wuCTUcq0Qd2qsBe/2hFyc2DCJJg0h1L78+6 -Z4UMR7EOcpfdUE9Hf3m/hs+FUR45uBJeDK1HSFHD8bHKD6kv8FPGfJTotc+2xjJw -oYi+1hqp1fIekaxsyQIDAQAB ------END PUBLIC KEY-----""" +load_dotenv() def test_user_agent_and_signature(): @@ -42,8 +24,8 @@ def test_user_agent_and_signature(): """ sr = SignRequests( key_id="crossfeed", - public_key=public_key, - private_key=private_key, + public_key=os.getenv("WORKER_SIGNATURE_PUBLIC_KEY"), + private_key=os.getenv("WORKER_SIGNATURE_PRIVATE_KEY"), user_agent="custom user agent", ) with taddons.context(): From 02bde8726abd2e1fde7373bb32d3560d74636b73 Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Fri, 15 Mar 2024 14:54:35 -0500 Subject: [PATCH 05/13] Add project .python-version. --- .python-version | 1 + 1 file changed, 1 insertion(+) create mode 100644 .python-version diff --git a/.python-version b/.python-version new file mode 100644 index 00000000..e9d31ca3 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +XFD From 9c0523e056bf3502c410926080ff865ebd720331 Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Fri, 15 Mar 2024 19:00:19 -0500 Subject: [PATCH 06/13] Use dotenv to import worker keys into mitmproxy; add line breaks to docstrings in mitmproxy test. --- backend/worker/mitmproxy_sign_requests.py | 3 +++ backend/worker/test_mitmproxy_sign_requests.py | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/backend/worker/mitmproxy_sign_requests.py b/backend/worker/mitmproxy_sign_requests.py index a2c55022..c98ac6e3 100644 --- a/backend/worker/mitmproxy_sign_requests.py +++ b/backend/worker/mitmproxy_sign_requests.py @@ -4,10 +4,13 @@ import traceback # Third-Party Libraries +from dotenv import load_dotenv from mitmproxy import http import requests from requests_http_signature import HTTPSignatureHeaderAuth +load_dotenv() + class SignRequests: """ diff --git a/backend/worker/test_mitmproxy_sign_requests.py b/backend/worker/test_mitmproxy_sign_requests.py index b2b23bc3..95c63498 100644 --- a/backend/worker/test_mitmproxy_sign_requests.py +++ b/backend/worker/test_mitmproxy_sign_requests.py @@ -20,7 +20,8 @@ def test_user_agent_and_signature(): """ This function tests the SignRequests class with a user agent and signature set. - It creates an instance of the SignRequests class with a user agent and signature, makes a request, and verifies the signature. + It creates an instance of the SignRequests class with a user agent and signature, makes a request, and verifies the + signature. """ sr = SignRequests( key_id="crossfeed", @@ -45,7 +46,8 @@ def test_no_user_agent_or_signature_set(): """ This function tests the SignRequests class without a user agent and signature set. - It creates an instance of the SignRequests class without a user agent and signature, makes a request, and checks that no user agent, date, or signature headers are set. + It creates an instance of the SignRequests class without a user agent and signature, makes a request, and checks + that no user agent, date, or signature headers are set. """ sr = SignRequests(key_id="", public_key="", private_key="", user_agent="") with taddons.context(): From 73ebc4feb626806640d44db0b4f78ebd60411fe1 Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Fri, 15 Mar 2024 19:02:20 -0500 Subject: [PATCH 07/13] Update Scrapy to address security vulnerability. --- backend/worker/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/worker/requirements.txt b/backend/worker/requirements.txt index 72e51965..350a28f2 100644 --- a/backend/worker/requirements.txt +++ b/backend/worker/requirements.txt @@ -26,7 +26,7 @@ regex==2023.3.23 requests==2.31.0 requests-http-signature==0.2.0 scikit-learn==1.2.2 -Scrapy==2.9.0 +Scrapy==2.11.1 setuptools==65.5.1 six==1.16.0 threadpoolctl==3.1.0 From d306c1a6921ae9ba9324fc5ea8f0f1ed5238fc32 Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Sun, 17 Mar 2024 10:32:21 -0500 Subject: [PATCH 08/13] Load .env for python_test in backend.yml. --- .github/workflows/backend.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 8fb5ba75..ee889d26 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -111,6 +111,8 @@ jobs: uses: actions/setup-python@v5.0.0 with: python-version: '3.10' + - name: Copy .env file + run: cp dev.env.example .env - uses: actions/cache@v3 with: path: ~/.cache/pip From dac4ba74d1d06122cac998108cc8f17c75d098d3 Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Sun, 17 Mar 2024 10:40:59 -0500 Subject: [PATCH 09/13] Fix file path for Copy .env file step in backend.yml's python_test. --- .github/workflows/backend.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index ee889d26..95b0a0b9 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -112,7 +112,7 @@ jobs: with: python-version: '3.10' - name: Copy .env file - run: cp dev.env.example .env + run: cp ../dev.env.example .env - uses: actions/cache@v3 with: path: ~/.cache/pip From 49352a3edb68a4183513474bec65b535f172dc31 Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Sun, 17 Mar 2024 11:07:12 -0500 Subject: [PATCH 10/13] Fix typo in .flake8 config. --- .flake8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index 869d863a..f188ef63 100644 --- a/.flake8 +++ b/.flake8 @@ -21,7 +21,7 @@ select = C,D,E,F,W,B,B950 # Also ignore flake8's warning about line breaks before binary # operators. It no longer agrees with PEP8. See, for example, here: # https://github.com/ambv/black/issues/21. Guido agrees here: -# https://github.com/python/peps/commit/c59c4376ad233a62git ca4b3a6060c81368bd21e85b. +# https://github.com/python/peps/commit/c59c4376ad233a62ca4b3a6060c81368bd21e85b. ignore = E501,W503 # Ignore D100 and D103, which check for docstrings in modules and functions, in all test files per-file-ignores = From e20fbfddf0c6b27e1d64bc75d2332a1fef3dde77 Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Mon, 18 Mar 2024 14:40:42 -0500 Subject: [PATCH 11/13] Remove exception for test files from .flake8; write docstrings for test_pipelines.py and test_main_spider.py. --- .flake8 | 4 --- .../webscraper/spiders/test_main_spider.py | 18 +++++++++++ .../webscraper/webscraper/test_pipelines.py | 31 +++++++++++++++++++ 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/.flake8 b/.flake8 index f188ef63..92ff8268 100644 --- a/.flake8 +++ b/.flake8 @@ -23,7 +23,3 @@ select = C,D,E,F,W,B,B950 # https://github.com/ambv/black/issues/21. Guido agrees here: # https://github.com/python/peps/commit/c59c4376ad233a62ca4b3a6060c81368bd21e85b. ignore = E501,W503 -# Ignore D100 and D103, which check for docstrings in modules and functions, in all test files -per-file-ignores = - # Ignore D100 and D103 in all test files - */test_*.py: D100, D103 diff --git a/backend/worker/webscraper/webscraper/spiders/test_main_spider.py b/backend/worker/webscraper/webscraper/spiders/test_main_spider.py index 89b87dfa..44807905 100644 --- a/backend/worker/webscraper/webscraper/spiders/test_main_spider.py +++ b/backend/worker/webscraper/webscraper/spiders/test_main_spider.py @@ -1,3 +1,9 @@ +""" +This module contains tests for the MainSpider class in the main_spider module. + +It includes tests for different scenarios such as when a response from a sample website is received. +""" + # Standard Python Libraries import json from tempfile import NamedTemporaryFile @@ -30,11 +36,23 @@ @pytest.fixture def spider(): + """ + Create a MainSpider instance with a temporary domains file. + + This fixture creates a NamedTemporaryFile instance and uses its name as the domains_file parameter for the + MainSpider instance. The MainSpider instance is then returned for use in the tests. + """ with NamedTemporaryFile() as f: return MainSpider(domains_file=f.name) def test_sample_website(spider): + """ + Test the MainSpider class with a sample website response. + + This function creates a sample Response instance with a specific body and headers. It then calls the parse_item + method of the MainSpider instance (provided by the spider fixture) with the sample response and checks the results. + """ response = Response( url="https://www.cisa.gov", request=Request(url="https://www.cisa.gov"), diff --git a/backend/worker/webscraper/webscraper/test_pipelines.py b/backend/worker/webscraper/webscraper/test_pipelines.py index 5aecb15a..b7c6fadd 100644 --- a/backend/worker/webscraper/webscraper/test_pipelines.py +++ b/backend/worker/webscraper/webscraper/test_pipelines.py @@ -1,3 +1,9 @@ +""" +This module contains tests for the ExportFilePipeline class in the pipelines module. + +It includes tests for different scenarios such as processing an item and handling duplicate items. +""" + # Standard Python Libraries from unittest.mock import MagicMock @@ -10,11 +16,23 @@ @pytest.fixture def pipeline(): + """ + Create an ExportFilePipeline instance with a mocked print function. + + This fixture creates a MagicMock instance and uses it as the print parameter for the + ExportFilePipeline instance. The ExportFilePipeline instance is then returned for use in the tests. + """ return ExportFilePipeline(print=MagicMock()) @pytest.fixture def item(): + """ + Create a sample item for testing. + + This fixture creates a dictionary that represents a sample item with specific headers and other details. + The item is then returned for use in the tests. + """ return { "status": 200, "url": "https://www.cisa.gov", @@ -49,11 +67,24 @@ def item(): def test_print_item(pipeline, item): + """ + Test the process_item method of the ExportFilePipeline class with a sample item. + + This function calls the process_item method of the ExportFilePipeline instance (provided by the pipeline fixture) + with the sample item (provided by the item fixture) and checks if the print function was called. + """ pipeline.process_item(item) pipeline.print.assert_called_once() def test_discard_duplicate_items(pipeline, item): + """ + Test the process_item method of the ExportFilePipeline class with duplicate items. + + This function calls the process_item method of the ExportFilePipeline instance (provided by the pipeline fixture) + with the sample item (provided by the item fixture) twice and checks if a DropItem exception is raised the second time. + It also checks if the print function was called only once. + """ pipeline.process_item(item) pipeline.print.assert_called_once() pipeline.print.reset_mock() From bd2196eeb52c64827c835667fb9280e660e87f2b Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Mon, 18 Mar 2024 14:47:24 -0500 Subject: [PATCH 12/13] Remove exception for test files from .bandit.yml. --- .bandit.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.bandit.yml b/.bandit.yml index c8cf0312..b6b3a2c8 100644 --- a/.bandit.yml +++ b/.bandit.yml @@ -9,6 +9,3 @@ tests: skips: - B101 # skip "assert used" check since assertions are required in pytests - -exclude: - - '**/test_*.py' From 9ac8ec4c2186f7a6b69f8b861149e9ca7c2b89b3 Mon Sep 17 00:00:00 2001 From: "Grayson, Matthew" Date: Mon, 18 Mar 2024 15:01:05 -0500 Subject: [PATCH 13/13] Delete .python-version from origin; add .python-version, __pycache__, and .mypy_cache to .gitignore. --- .gitignore | 5 ++++- .python-version | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) delete mode 100644 .python-version diff --git a/.gitignore b/.gitignore index 69c09e98..1248786c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,10 @@ # Files already tracked by Git are not affected. # See: https://git-scm.com/docs/gitignore +# python +__pycache__ +.mypy_cache +.python-version # terraform .terraform @@ -50,4 +54,3 @@ minio-data infrastructure/lambdas/security_headers.zip *.hcl .iac-data - diff --git a/.python-version b/.python-version deleted file mode 100644 index e9d31ca3..00000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -XFD