From 3c71a179870e670f12640c053b6d34765f53e9b7 Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Fri, 15 Mar 2024 12:03:24 -0500
Subject: [PATCH 01/13] Fix issues flagged by bandit and flake8.

---
 .bandit.yml                                   |   7 +-
 .flake8                                       |   6 +-
 .../scripts/populateCountiesCities/cities.py  |  48 ++++++--
 .../populateCountiesCities/counties.py        |  32 ++++-
 .../scripts/populateCountiesCities/main.py    |  30 +++++
 backend/worker/__init__.py                    |   5 +
 backend/worker/mitmproxy_sign_requests.py     | 113 ++++++++++--------
 backend/worker/requirements.txt               |   1 +
 .../worker/test_mitmproxy_sign_requests.py    |  21 +++-
 .../worker/webscraper/webscraper/__init__.py  |   5 +
 backend/worker/webscraper/webscraper/items.py |  11 +-
 .../webscraper/webscraper/middlewares.py      |  73 +++++++++--
 .../worker/webscraper/webscraper/pipelines.py |  29 ++++-
 .../worker/webscraper/webscraper/settings.py  |  19 +--
 .../webscraper/webscraper/spiders/__init__.py |   9 +-
 .../webscraper/spiders/main_spider.py         |  46 ++++++-
 16 files changed, 354 insertions(+), 101 deletions(-)

diff --git a/.bandit.yml b/.bandit.yml
index ab3cb21e..c8cf0312 100644
--- a/.bandit.yml
+++ b/.bandit.yml
@@ -6,8 +6,9 @@
 # If `tests` is empty, all tests are considered included.
 
 tests:
-# - B101
-# - B102
 
 skips:
-# - B101 # skip "assert used" check since assertions are required in pytests
+  - B101  # skip "assert used" check since assertions are required in pytests
+
+exclude:
+  - '**/test_*.py'
diff --git a/.flake8 b/.flake8
index 92ff8268..869d863a 100644
--- a/.flake8
+++ b/.flake8
@@ -21,5 +21,9 @@ select = C,D,E,F,W,B,B950
 # Also ignore flake8's warning about line breaks before binary
 # operators.  It no longer agrees with PEP8.  See, for example, here:
 # https://github.com/ambv/black/issues/21. Guido agrees here:
-# https://github.com/python/peps/commit/c59c4376ad233a62ca4b3a6060c81368bd21e85b.
+# https://github.com/python/peps/commit/c59c4376ad233a62ca4b3a6060c81368bd21e85b.
 ignore = E501,W503
+# Ignore D100 and D103, which check for docstrings in modules and functions, in all test files
+per-file-ignores =
+    # Ignore D100 and D103 in all test files
+    */test_*.py: D100, D103
diff --git a/backend/scripts/populateCountiesCities/cities.py b/backend/scripts/populateCountiesCities/cities.py
index cd720f42..8adc3c4e 100644
--- a/backend/scripts/populateCountiesCities/cities.py
+++ b/backend/scripts/populateCountiesCities/cities.py
@@ -1,18 +1,44 @@
-import pandas as pd
-import requests
-from bs4 import BeautifulSoup
-import time
-import re
+"""
+This module contains the script for populating cities data.
+
+It includes functions for parsing titles, pulling cities data from Wikipedia,
+and writing the data to a CSV file.
+"""
+
+# Standard Python Libraries
 import json
+import re
+import time
 from urllib.parse import unquote
 
+# Third-Party Libraries
+from bs4 import BeautifulSoup
+import pandas as pd
+import requests
+
 
 def title_parse(title):
+    """
+    Parse the title by unquoting it.
+
+    Args:
+        title (str): The title to be parsed.
+
+    Returns:
+        str: The parsed title.
+    """
     title = unquote(title)
     return title
 
 
 def pull_cities():
+    """
+    Process and pull cities data from Wikipedia.
+
+    This function reads the Wikipedia US cities data from a JSON file, processes each entry,
+    fetches the corresponding Wikipedia page, parses the page to extract city, county, and URL information,
+    and writes the data to a CSV file.
+    """
     print("Processing Cities...")
     with open("wikipedia_US_cities.json") as f:
         wikipedia_us_city_data = json.load(f)
@@ -23,7 +49,10 @@ def pull_cities():
         print(entry["name"])
         # get the response in the form of html
         wikiurl = "https://en.wikipedia.org/wiki/" + entry["url"]
-        response = requests.get(wikiurl)
+        try:
+            response = requests.get(wikiurl, timeout=5)
+        except requests.exceptions.Timeout:
+            print("The request timed out")
 
         # parse data from the html into a beautifulsoup object
         soup = BeautifulSoup(response.text, "html.parser")
@@ -52,7 +81,9 @@ def pull_cities():
                     if "," in link.get("title"):
                         county_pieces = link.get("title").split(",")
                         # OPEN WIKIPEDIA PAGE UP
-                        x = requests.get("https://en.wikipedia.org/" + link.get("href"))
+                        x = requests.get(
+                            "https://en.wikipedia.org/" + link.get("href"), timeout=5
+                        )
 
                         # PULL COUNTY OR PARISH FROM WIKIPEDIA PAGE
                         county_parish_matches = re.findall(
@@ -85,7 +116,8 @@ def pull_cities():
                             }
                         )
                     time.sleep(1)
-                except:
+                except Exception as e:
+                    print(f"Error: {e}")
                     pass
 
         df = pd.DataFrame(holding_pen, columns=["State", "County", "City", "URL"])
diff --git a/backend/scripts/populateCountiesCities/counties.py b/backend/scripts/populateCountiesCities/counties.py
index 64df0f38..34823f8c 100644
--- a/backend/scripts/populateCountiesCities/counties.py
+++ b/backend/scripts/populateCountiesCities/counties.py
@@ -1,16 +1,35 @@
+"""
+This module contains the script for populating counties data.
+
+It includes functions for pulling counties data from Wikipedia,
+and writing the data to a CSV file.
+"""
+
+# Standard Python Libraries
+import re
+import time
+
+# Third-Party Libraries
+from bs4 import BeautifulSoup
 import pandas as pd
 import requests
-from bs4 import BeautifulSoup
-import time
-import re
 
 
 def pull_counties():
+    """
+    Process and pull counties data from Wikipedia.
+
+    This function fetches the Wikipedia page for the list of United States counties,
+    parses the page to extract county, state, and URL information,
+    and writes the data to a CSV file.
+    """
     print("Processing Counties...")
     # get the response in the form of html
     wikiurl = "https://en.wikipedia.org/wiki/List_of_United_States_counties_and_county_equivalents"
-    table_class = "wikitable sortable jquery-tablesorter"
-    response = requests.get(wikiurl)
+    try:
+        response = requests.get(wikiurl, timeout=5)
+    except requests.exceptions.Timeout:
+        print("The request timed out")
 
     # parse data from the html into a beautifulsoup object
     soup = BeautifulSoup(response.text, "html.parser")
@@ -24,7 +43,7 @@ def pull_counties():
         try:
             county_pieces = link.get("title").split(", ")
             # OPEN WIKIPEDIA PAGE UP
-            x = requests.get("https://en.wikipedia.org/" + link.get("href"))
+            x = requests.get("https://en.wikipedia.org/" + link.get("href"), timeout=5)
 
             # PULL WEBSITE FROM WIKIPEDIA PAGE
             w = re.findall(
@@ -43,6 +62,7 @@ def pull_counties():
                 }
             )
         except Exception as e:
+            print(f"Error: {e}")
             pass
 
         time.sleep(1)
diff --git a/backend/scripts/populateCountiesCities/main.py b/backend/scripts/populateCountiesCities/main.py
index dc86edb1..20296ce9 100644
--- a/backend/scripts/populateCountiesCities/main.py
+++ b/backend/scripts/populateCountiesCities/main.py
@@ -1,3 +1,9 @@
+"""
+This module contains the main script for populating counties and cities data.
+
+It includes commands for processing cities and counties data separately or both at once.
+"""
+
 import typer
 import cities
 import counties
@@ -7,16 +13,40 @@
 
 @app.command()
 def process_cities():
+    """
+    Process and pull cities data from Wikipedia.
+
+    This function calls the pull_cities function from the cities module,
+    which reads the Wikipedia US cities data from a JSON file, processes each entry,
+    fetches the corresponding Wikipedia page, parses the page to extract city, county, and URL information,
+    and writes the data to a CSV file.
+    """
     cities.pull_cities()
 
 
 @app.command()
 def process_counties():
+    """
+    Process and pull counties data from Wikipedia.
+
+    This function calls the pull_counties function from the counties module,
+    which fetches the Wikipedia page for the list of United States counties,
+    parses the page to extract county, state, and URL information,
+    and writes the data to a CSV file.
+    """
     counties.pull_counties()
 
 
 @app.command()
 def process_both():
+    """
+    Process and pull both cities and counties data from Wikipedia.
+
+    This function calls both the pull_cities function from the cities module and the pull_counties function from the counties module,
+    which fetches the Wikipedia pages for the list of United States cities and counties,
+    parses the pages to extract city, county, state, and URL information,
+    and writes the data to CSV files.
+    """
     counties.pull_counties()
     cities.pull_cities()
 
diff --git a/backend/worker/__init__.py b/backend/worker/__init__.py
index e69de29b..088f05b8 100644
--- a/backend/worker/__init__.py
+++ b/backend/worker/__init__.py
@@ -0,0 +1,5 @@
+"""
+This package contains the worker tasks for the backend.
+
+It includes modules for processing data, interacting with databases, and other backend tasks.
+"""
diff --git a/backend/worker/mitmproxy_sign_requests.py b/backend/worker/mitmproxy_sign_requests.py
index 691a5afb..4967b173 100644
--- a/backend/worker/mitmproxy_sign_requests.py
+++ b/backend/worker/mitmproxy_sign_requests.py
@@ -1,16 +1,35 @@
-"""
-mitmproxy addon that signs requests and adds a Crossfeed-specific user agent.
-"""
-from mitmproxy import http, ctx
+"""mitmproxy addon that signs requests and adds a Crossfeed-specific user agent."""
+from mitmproxy import http
 import os
 import requests
-import json
 import traceback
 from requests_http_signature import HTTPSignatureHeaderAuth
 
 
 class SignRequests:
+    """
+    A class used to sign HTTP requests and add a Crossfeed-specific user agent.
+
+    This class is used as a mitmproxy addon. It signs the HTTP requests using the provided private key and adds a user agent to the request headers.
+
+    Attributes:
+        key_id (str): The key ID used for signing the requests.
+        private_key (str): The private key used for signing the requests.
+        public_key (str): The public key used for verifying the signature.
+        user_agent (str): The user agent to be added to the request headers.
+        signature_auth (HTTPSignatureHeaderAuth): The HTTPSignatureHeaderAuth instance used for signing the requests.
+    """
+
     def __init__(self, key_id="", public_key="", private_key="", user_agent=""):
+        """
+        Initialize the SignRequests instance.
+
+        Args:
+            key_id (str, optional): The key ID used for signing the requests. Defaults to "".
+            public_key (str, optional): The public key used for verifying the signature. Defaults to "".
+            private_key (str, optional): The private key used for signing the requests. Defaults to "".
+            user_agent (str, optional): The user agent to be added to the request headers. Defaults to "".
+        """
         self.key_id = key_id
         self.private_key = private_key
         self.public_key = public_key
@@ -20,9 +39,30 @@ def __init__(self, key_id="", public_key="", private_key="", user_agent=""):
         )
 
     def key_resolver(self, key_id, algorithm):
+        """
+        Resolve the key for the given key_id and algorithm.
+
+        Args:
+            key_id (str): The key ID used for signing the requests.
+            algorithm (str): The algorithm used for signing the requests.
+
+        Returns:
+            bytes: The public key encoded in bytes.
+        """
         return self.public_key.encode()
 
     def verify_signature(self, method, url, date, signature):
+        """
+        Verify the signature of the HTTP request.
+
+        Args:
+            method (str): The HTTP method of the request.
+            url (str): The URL of the request.
+            date (str): The date when the request was made.
+            signature (str): The signature of the request.
+
+        This method uses the HTTPSignatureHeaderAuth's verify method to verify the signature of the request.
+        """
         HTTPSignatureHeaderAuth.verify(
             requests.Request(
                 method=url, url=url, headers={"date": date, "Signature": signature}
@@ -32,6 +72,17 @@ def verify_signature(self, method, url, date, signature):
         )
 
     def request(self, flow):
+        """
+        Process the HTTP request.
+
+        This method adds a user agent to the request headers if one is provided. If a private key is provided, it signs the request using the HTTPSignatureHeaderAuth instance.
+
+        Args:
+            flow (mitmproxy.http.HTTPFlow): The HTTP request/response flow.
+
+        Raises:
+            Exception: If there is an error while processing the request, an exception is raised and a 500 response is returned.
+        """
         try:
             if self.user_agent:
                 flow.request.headers["User-Agent"] = self.user_agent
@@ -57,47 +108,11 @@ def request(self, flow):
             )
 
 
-test = os.getenv("WORKER_TEST", None) is not None
-
-if test:
-    # This is a test RSA private key and not used in any deployed environment
-    # file deepcode ignore HardcodedNonCryptoSecret: <please specify a reason of ignoring this>
-    private_key = """-----BEGIN RSA PRIVATE KEY-----
-MIICXgIBAAKBgQDCFENGw33yGihy92pDjZQhl0C36rPJj+CvfSC8+q28hxA161QF
-NUd13wuCTUcq0Qd2qsBe/2hFyc2DCJJg0h1L78+6Z4UMR7EOcpfdUE9Hf3m/hs+F
-UR45uBJeDK1HSFHD8bHKD6kv8FPGfJTotc+2xjJwoYi+1hqp1fIekaxsyQIDAQAB
-AoGBAJR8ZkCUvx5kzv+utdl7T5MnordT1TvoXXJGXK7ZZ+UuvMNUCdN2QPc4sBiA
-QWvLw1cSKt5DsKZ8UETpYPy8pPYnnDEz2dDYiaew9+xEpubyeW2oH4Zx71wqBtOK
-kqwrXa/pzdpiucRRjk6vE6YY7EBBs/g7uanVpGibOVAEsqH1AkEA7DkjVH28WDUg
-f1nqvfn2Kj6CT7nIcE3jGJsZZ7zlZmBmHFDONMLUrXR/Zm3pR5m0tCmBqa5RK95u
-412jt1dPIwJBANJT3v8pnkth48bQo/fKel6uEYyboRtA5/uHuHkZ6FQF7OUkGogc
-mSJluOdc5t6hI1VsLn0QZEjQZMEOWr+wKSMCQQCC4kXJEsHAve77oP6HtG/IiEn7
-kpyUXRNvFsDE0czpJJBvL/aRFUJxuRK91jhjC68sA7NsKMGg5OXb5I5Jj36xAkEA
-gIT7aFOYBFwGgQAQkWNKLvySgKbAZRTeLBacpHMuQdl1DfdntvAyqpAZ0lY0RKmW
-G6aFKaqQfOXKCyWoUiVknQJAXrlgySFci/2ueKlIE1QqIiLSZ8V8OlpFLRnb1pzI
-7U1yQXnTAEFYM560yJlzUpOb1V4cScGd365tiSMvxLOvTA==
------END RSA PRIVATE KEY-----"""
-
-    public_key = """-----BEGIN PUBLIC KEY-----
-MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDCFENGw33yGihy92pDjZQhl0C3
-6rPJj+CvfSC8+q28hxA161QFNUd13wuCTUcq0Qd2qsBe/2hFyc2DCJJg0h1L78+6
-Z4UMR7EOcpfdUE9Hf3m/hs+FUR45uBJeDK1HSFHD8bHKD6kv8FPGfJTotc+2xjJw
-oYi+1hqp1fIekaxsyQIDAQAB
------END PUBLIC KEY-----"""
-    addons = [
-        SignRequests(
-            key_id="crossfeed",
-            public_key=public_key,
-            private_key=private_key,
-            user_agent="Crossfeed test user agent",
-        )
-    ]
-else:
-    addons = [
-        SignRequests(
-            key_id="crossfeed",
-            public_key=os.getenv("WORKER_SIGNATURE_PUBLIC_KEY", ""),
-            private_key=os.getenv("WORKER_SIGNATURE_PRIVATE_KEY", ""),
-            user_agent=os.getenv("WORKER_USER_AGENT", ""),
-        )
-    ]
+addons = [
+    SignRequests(
+        key_id="crossfeed",
+        public_key=os.getenv("WORKER_SIGNATURE_PUBLIC_KEY", ""),
+        private_key=os.getenv("WORKER_SIGNATURE_PRIVATE_KEY", ""),
+        user_agent=os.getenv("WORKER_USER_AGENT", ""),
+    )
+]
diff --git a/backend/worker/requirements.txt b/backend/worker/requirements.txt
index d362d5a5..0466899d 100644
--- a/backend/worker/requirements.txt
+++ b/backend/worker/requirements.txt
@@ -18,6 +18,7 @@ psycopg2-binary==2.9.5
 pyproject_hooks==1.0.0
 pytest==7.3.0
 python-dateutil==2.8.2
+python-dotenv==1.0.1
 pytz==2023.3
 pytz-deprecation-shim==0.1.0.post0
 regex==2023.3.23
diff --git a/backend/worker/test_mitmproxy_sign_requests.py b/backend/worker/test_mitmproxy_sign_requests.py
index 766ca5fd..a5289425 100644
--- a/backend/worker/test_mitmproxy_sign_requests.py
+++ b/backend/worker/test_mitmproxy_sign_requests.py
@@ -1,4 +1,9 @@
-from mitmproxy import exceptions
+"""
+This module contains tests for the SignRequests class in the mitmproxy_sign_requests module.
+
+It includes tests for different scenarios such as when a user agent and signature are set, and when they are not set.
+"""
+
 from mitmproxy.test import tflow
 from mitmproxy.test import taddons
 from .mitmproxy_sign_requests import SignRequests
@@ -29,13 +34,18 @@
 
 
 def test_user_agent_and_signature():
+    """
+    This function tests the SignRequests class with a user agent and signature set.
+
+    It creates an instance of the SignRequests class with a user agent and signature, makes a request, and verifies the signature.
+    """
     sr = SignRequests(
         key_id="crossfeed",
         public_key=public_key,
         private_key=private_key,
         user_agent="custom user agent",
     )
-    with taddons.context() as tctx:
+    with taddons.context():
         f = tflow.tflow()
         f.request.headers["User-Agent"] = "original user agent"
         sr.request(f)
@@ -49,8 +59,13 @@ def test_user_agent_and_signature():
 
 
 def test_no_user_agent_or_signature_set():
+    """
+    This function tests the SignRequests class without a user agent and signature set.
+
+    It creates an instance of the SignRequests class without a user agent and signature, makes a request, and checks that no user agent, date, or signature headers are set.
+    """
     sr = SignRequests(key_id="", public_key="", private_key="", user_agent="")
-    with taddons.context() as tctx:
+    with taddons.context():
         f = tflow.tflow()
         sr.request(f)
         assert "User-Agent" not in f.request.headers
diff --git a/backend/worker/webscraper/webscraper/__init__.py b/backend/worker/webscraper/webscraper/__init__.py
index e69de29b..61265cd2 100644
--- a/backend/worker/webscraper/webscraper/__init__.py
+++ b/backend/worker/webscraper/webscraper/__init__.py
@@ -0,0 +1,5 @@
+"""
+This package contains modules and classes for web scraping.
+
+It includes modules for making HTTP requests, parsing HTML, and extracting data.
+"""
diff --git a/backend/worker/webscraper/webscraper/items.py b/backend/worker/webscraper/webscraper/items.py
index a581a5fc..f7a4a8d6 100644
--- a/backend/worker/webscraper/webscraper/items.py
+++ b/backend/worker/webscraper/webscraper/items.py
@@ -1,6 +1,7 @@
-# Define here the models for your scraped items
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/items.html
+"""
+This module defines the items that the web scraper will extract from web pages.
 
-import scrapy
+Each item is represented by a class, and the fields of the class represent the data that the scraper will extract.
+See documentation here for information on how to create models for your items:
+https://docs.scrapy.org/en/latest/topics/items.html
+"""
diff --git a/backend/worker/webscraper/webscraper/middlewares.py b/backend/worker/webscraper/webscraper/middlewares.py
index 3efe286a..0df4e887 100644
--- a/backend/worker/webscraper/webscraper/middlewares.py
+++ b/backend/worker/webscraper/webscraper/middlewares.py
@@ -1,27 +1,43 @@
-# Define here the models for your spider middleware
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+"""
+This module defines the middlewares for the web scraper.
 
-from scrapy import signals
+Middlewares are used to process incoming responses and outgoing requests and items. This module defines two types of middlewares: Spider Middleware and Downloader Middleware. The Spider Middleware processes responses before they reach the spider and processes items and requests after they have been processed by the spider. The Downloader Middleware processes requests before they are sent to the downloader and processes responses before they reach the Spider Middleware or the spider.
+See documentation here:
+https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+"""
 
-# useful for handling different item types with a single interface
-from itemadapter import is_item, ItemAdapter
+from scrapy import signals
 
 
 class WebscraperSpiderMiddleware:
+    """
+    This class defines the Spider Middleware for the web scraper.
+
+    The Spider Middleware processes responses before they reach the spider and processes items and requests after they have been processed by the spider.
+    """
+
     # Not all methods need to be defined. If a method is not defined,
     # scrapy acts as if the spider middleware does not modify the
     # passed objects.
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create the spider middleware instance for Scrapy.
+
+        Connect the spider_opened method to the spider_opened signal.
+        """
         # This method is used by Scrapy to create your spiders.
         s = cls()
         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
         return s
 
     def process_spider_input(self, response, spider):
+        """
+        Process each response that goes through the spider middleware and into the spider.
+
+        Return None or raise an exception.
+        """
         # Called for each response that goes through the spider
         # middleware and into the spider.
 
@@ -29,6 +45,11 @@ def process_spider_input(self, response, spider):
         return None
 
     def process_spider_output(self, response, result, spider):
+        """
+        Process the results returned from the Spider, after it has processed the response.
+
+        Return an iterable of Request, or item objects.
+        """
         # Called with the results returned from the Spider, after
         # it has processed the response.
 
@@ -37,6 +58,11 @@ def process_spider_output(self, response, result, spider):
             yield i
 
     def process_spider_exception(self, response, exception, spider):
+        """
+        Handle exceptions raised by a spider or process_spider_input() method.
+
+        This method should return either None or an iterable of Request or item objects.
+        """
         # Called when a spider or process_spider_input() method
         # (from other spider middleware) raises an exception.
 
@@ -44,6 +70,11 @@ def process_spider_exception(self, response, exception, spider):
         pass
 
     def process_start_requests(self, start_requests, spider):
+        """
+        Process the start requests of the spider.
+
+        This method works similarly to the process_spider_output() method, except that it doesn’t have a response associated. It must return only requests (not items).
+        """
         # Called with the start requests of the spider, and works
         # similarly to the process_spider_output() method, except
         # that it doesn’t have a response associated.
@@ -53,22 +84,39 @@ def process_start_requests(self, start_requests, spider):
             yield r
 
     def spider_opened(self, spider):
+        """Log the name of the spider when opened."""
         spider.logger.info("Spider opened: %s" % spider.name)
 
 
 class WebscraperDownloaderMiddleware:
+    """
+    This class defines the Downloader Middleware for the web scraper.
+
+    The Downloader Middleware processes requests before they are sent to the downloader and processes responses before they reach the Spider Middleware or the spider.
+    """
+
     # Not all methods need to be defined. If a method is not defined,
     # scrapy acts as if the downloader middleware does not modify the
     # passed objects.
 
     @classmethod
     def from_crawler(cls, crawler):
+        """
+        Create the downloader middleware instance for Scrapy.
+
+        Connect the spider_opened method to the spider_opened signal.
+        """
         # This method is used by Scrapy to create your spiders.
         s = cls()
         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
         return s
 
     def process_request(self, request, spider):
+        """
+        Process each request that goes through the downloader middleware.
+
+        Must either return None, a Response object, a Request object, or raise IgnoreRequest.
+        """
         # Called for each request that goes through the downloader
         # middleware.
 
@@ -81,6 +129,11 @@ def process_request(self, request, spider):
         return None
 
     def process_response(self, request, response, spider):
+        """
+        Process the response returned from the downloader.
+
+        Must either return a Response object, a Request object, or raise IgnoreRequest.
+        """
         # Called with the response returned from the downloader.
 
         # Must either;
@@ -90,6 +143,11 @@ def process_response(self, request, response, spider):
         return response
 
     def process_exception(self, request, exception, spider):
+        """
+        Handle exceptions raised by a download handler or a process_request() method.
+
+        Must return either None, a Response object, or a Request object.
+        """
         # Called when a download handler or a process_request()
         # (from other downloader middleware) raises an exception.
 
@@ -100,4 +158,5 @@ def process_exception(self, request, exception, spider):
         pass
 
     def spider_opened(self, spider):
+        """Log the name of the spider when it is opened."""
         spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/backend/worker/webscraper/webscraper/pipelines.py b/backend/worker/webscraper/webscraper/pipelines.py
index c9b63243..321bbdaf 100644
--- a/backend/worker/webscraper/webscraper/pipelines.py
+++ b/backend/worker/webscraper/webscraper/pipelines.py
@@ -1,18 +1,39 @@
+"""
+This module contains the pipeline classes for the web scraper.
+
+The pipelines process the items returned by the spiders.
+"""
+
 from scrapy.exceptions import DropItem
 import json
-import os
-from io import BytesIO
-from datetime import datetime
 
 
 class ExportFilePipeline:
-    """Prints file contents to the console."""
+    """Print file contents to the console."""
 
     def __init__(self, print=print):
+        """
+        Initialize the ExportFilePipeline class.
+
+        Args:
+            print (function, optional): A function to print the output. Defaults to print.
+        """
         self.urls_seen = set()
         self.print = print
 
     def process_item(self, item, spider=None):
+        """
+        Process each item that goes through the pipeline.
+
+        If the item's URL has been seen before, it raises a DropItem exception. Otherwise, it prints the item and returns it.
+
+        Args:
+            item (dict): The item to process.
+            spider (Spider, optional): The spider that produced the item. Defaults to None.
+
+        Returns:
+            dict: The processed item.
+        """
         if item["url"] in self.urls_seen:
             raise DropItem("Duplicate item found with url: %s" % item["url"])
         self.urls_seen.add(item["url"])
diff --git a/backend/worker/webscraper/webscraper/settings.py b/backend/worker/webscraper/webscraper/settings.py
index 40d5c9c5..2fa7d676 100644
--- a/backend/worker/webscraper/webscraper/settings.py
+++ b/backend/worker/webscraper/webscraper/settings.py
@@ -1,11 +1,14 @@
-# Scrapy settings for webscraper project
-#
-# For simplicity, this file contains only settings considered important or
-# commonly used. You can find more settings consulting the documentation:
-#
-#     https://docs.scrapy.org/en/latest/topics/settings.html
-#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+"""
+Scrapy settings for the webscraper project.
+
+This module contains the settings for the webscraper project, including configurations for the spider,
+downloader middleware, item pipelines, and more.
+Additional settings are in the documentation:
+https://docs.scrapy.org/en/latest/topics/settings.html
+https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+"""
+
 import logging
 
 BOT_NAME = "webscraper"
diff --git a/backend/worker/webscraper/webscraper/spiders/__init__.py b/backend/worker/webscraper/webscraper/spiders/__init__.py
index ebd689ac..cce0fa8b 100644
--- a/backend/worker/webscraper/webscraper/spiders/__init__.py
+++ b/backend/worker/webscraper/webscraper/spiders/__init__.py
@@ -1,4 +1,5 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.
+"""
+This package contains the spiders for the webscraper project.
+
+Each spider is a Python class that defines how a certain site or a group of sites will be scraped.
+"""
diff --git a/backend/worker/webscraper/webscraper/spiders/main_spider.py b/backend/worker/webscraper/webscraper/spiders/main_spider.py
index af3b2a86..0c061076 100644
--- a/backend/worker/webscraper/webscraper/spiders/main_spider.py
+++ b/backend/worker/webscraper/webscraper/spiders/main_spider.py
@@ -1,26 +1,66 @@
-import scrapy
+"""
+This module contains the MainSpider class for the webscraper project.
+
+The MainSpider class is a Scrapy spider that crawls and scrapes data from the specified start URLs.
+"""
+
 from scrapy.spiders import CrawlSpider, Rule
 from scrapy.linkextractors import LinkExtractor
 from urllib.parse import urlparse
-import hashlib
-import json
 
 
 class MainSpider(CrawlSpider):
+    """
+    MainSpider is a Scrapy spider that crawls and scrapes data from the specified start URLs.
+
+    It uses the LinkExtractor to follow links and the parse_item method to process the scraped data.
+    """
+
     name = "main"
 
     rules = (Rule(LinkExtractor(), callback="parse_item", follow=True),)
 
     def __init__(self, *args, **kwargs):
+        """
+        Initialize the MainSpider class.
+
+        It reads the start URLs from a file and sets the allowed domains based on these URLs.
+
+        Args:
+            *args: Variable length argument list.
+            **kwargs: Arbitrary keyword arguments.
+        """
         super().__init__(*args, **kwargs)
         with open(self.domains_file, "r") as f:
             self.start_urls = f.read().split("\n")
         self.allowed_domains = [urlparse(url).netloc for url in self.start_urls]
 
     def parse_start_url(self, response):
+        """
+        Parse the start URL.
+
+        This method gets called when the spider opens the start URL. It returns the result of the parse_item method.
+
+        Args:
+            response (Response): The response object for the start URL.
+
+        Returns:
+            dict: The result of the parse_item method.
+        """
         return self.parse_item(response)
 
     def parse_item(self, response):
+        """
+        Parse the response and extract the data.
+
+        This method gets called for each response that the spider receives. It extracts the data from the response and returns it as a dictionary.
+
+        Args:
+            response (Response): The response to parse.
+
+        Returns:
+            dict: The extracted data.
+        """
         try:
             body_decoded = response.body.decode()
         except UnicodeDecodeError:

From 96988efd70b892cdb8aad491f07c26acdab84f37 Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Fri, 15 Mar 2024 13:07:41 -0500
Subject: [PATCH 02/13] Fix issues flagged by isort, pyupgrade, and
 requirements-txt-fixer.

---
 backend/scripts/populateCountiesCities/main.py       |  3 ++-
 .../scripts/populateCountiesCities/requirements.txt  |  2 +-
 backend/worker/mitmproxy_sign_requests.py            |  7 +++++--
 backend/worker/requirements.txt                      | 10 +++++-----
 backend/worker/test_mitmproxy_sign_requests.py       |  5 +++--
 backend/worker/webscraper/webscraper/middlewares.py  |  7 +++----
 backend/worker/webscraper/webscraper/pipelines.py    |  5 ++++-
 backend/worker/webscraper/webscraper/settings.py     |  1 +
 .../webscraper/webscraper/spiders/main_spider.py     |  9 ++++++---
 .../webscraper/spiders/test_main_spider.py           | 12 ++++++++----
 .../worker/webscraper/webscraper/test_pipelines.py   |  8 ++++++--
 11 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/backend/scripts/populateCountiesCities/main.py b/backend/scripts/populateCountiesCities/main.py
index 20296ce9..baf7de5b 100644
--- a/backend/scripts/populateCountiesCities/main.py
+++ b/backend/scripts/populateCountiesCities/main.py
@@ -4,9 +4,10 @@
 It includes commands for processing cities and counties data separately or both at once.
 """
 
-import typer
+# Third-Party Libraries
 import cities
 import counties
+import typer
 
 app = typer.Typer()
 
diff --git a/backend/scripts/populateCountiesCities/requirements.txt b/backend/scripts/populateCountiesCities/requirements.txt
index b3e808d7..0e6f50f1 100644
--- a/backend/scripts/populateCountiesCities/requirements.txt
+++ b/backend/scripts/populateCountiesCities/requirements.txt
@@ -1,4 +1,4 @@
+beautifulsoup4==4.11.2
 pandas==1.5.1
 requests==2.28.2
-beautifulsoup4==4.11.2
 typer==0.7.0
diff --git a/backend/worker/mitmproxy_sign_requests.py b/backend/worker/mitmproxy_sign_requests.py
index 4967b173..a2c55022 100644
--- a/backend/worker/mitmproxy_sign_requests.py
+++ b/backend/worker/mitmproxy_sign_requests.py
@@ -1,8 +1,11 @@
 """mitmproxy addon that signs requests and adds a Crossfeed-specific user agent."""
-from mitmproxy import http
+# Standard Python Libraries
 import os
-import requests
 import traceback
+
+# Third-Party Libraries
+from mitmproxy import http
+import requests
 from requests_http_signature import HTTPSignatureHeaderAuth
 
 
diff --git a/backend/worker/requirements.txt b/backend/worker/requirements.txt
index 0466899d..72e51965 100644
--- a/backend/worker/requirements.txt
+++ b/backend/worker/requirements.txt
@@ -5,9 +5,10 @@ click==8.1.3
 dateparser==1.1.8
 dnstwist==20230509
 docopt==0.6.2
+git+https://github.com/LeapBeyond/scrubadub.git@d0e12c5d922631af3532d044196b05fb1b7c8c1c
+git+https://github.com/mitmproxy/mitmproxy@e0e46f4
 idna==3.4
 joblib==1.2.0
-git+https://github.com/mitmproxy/mitmproxy@e0e46f4
 mitmproxy_wireguard==0.1.23
 numpy==1.24.3
 pandas==2.1.4
@@ -24,15 +25,14 @@ pytz-deprecation-shim==0.1.0.post0
 regex==2023.3.23
 requests==2.31.0
 requests-http-signature==0.2.0
-Scrapy==2.9.0
-git+https://github.com/LeapBeyond/scrubadub.git@d0e12c5d922631af3532d044196b05fb1b7c8c1c
 scikit-learn==1.2.2
+Scrapy==2.9.0
+setuptools==65.5.1
 six==1.16.0
 threadpoolctl==3.1.0
 tomli==2.0.1
 trustymail @ git+https://github.com/Matthew-Grayson/trustymail@production
 tzdata==2023.3
 tzlocal==4.3
-yarg==0.1.9
 wheel==0.38.1
-setuptools==65.5.1
+yarg==0.1.9
diff --git a/backend/worker/test_mitmproxy_sign_requests.py b/backend/worker/test_mitmproxy_sign_requests.py
index a5289425..2ff660e3 100644
--- a/backend/worker/test_mitmproxy_sign_requests.py
+++ b/backend/worker/test_mitmproxy_sign_requests.py
@@ -4,8 +4,9 @@
 It includes tests for different scenarios such as when a user agent and signature are set, and when they are not set.
 """
 
-from mitmproxy.test import tflow
-from mitmproxy.test import taddons
+# Third-Party Libraries
+from mitmproxy.test import taddons, tflow
+
 from .mitmproxy_sign_requests import SignRequests
 
 # This is a test RSA private key and not used in any deployed environment
diff --git a/backend/worker/webscraper/webscraper/middlewares.py b/backend/worker/webscraper/webscraper/middlewares.py
index 0df4e887..9f755168 100644
--- a/backend/worker/webscraper/webscraper/middlewares.py
+++ b/backend/worker/webscraper/webscraper/middlewares.py
@@ -6,6 +6,7 @@
 https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 """
 
+# Third-Party Libraries
 from scrapy import signals
 
 
@@ -54,8 +55,7 @@ def process_spider_output(self, response, result, spider):
         # it has processed the response.
 
         # Must return an iterable of Request, or item objects.
-        for i in result:
-            yield i
+        yield from result
 
     def process_spider_exception(self, response, exception, spider):
         """
@@ -80,8 +80,7 @@ def process_start_requests(self, start_requests, spider):
         # that it doesn’t have a response associated.
 
         # Must return only requests (not items).
-        for r in start_requests:
-            yield r
+        yield from start_requests
 
     def spider_opened(self, spider):
         """Log the name of the spider when opened."""
diff --git a/backend/worker/webscraper/webscraper/pipelines.py b/backend/worker/webscraper/webscraper/pipelines.py
index 321bbdaf..e6cd6631 100644
--- a/backend/worker/webscraper/webscraper/pipelines.py
+++ b/backend/worker/webscraper/webscraper/pipelines.py
@@ -4,9 +4,12 @@
 The pipelines process the items returned by the spiders.
 """
 
-from scrapy.exceptions import DropItem
+# Standard Python Libraries
 import json
 
+# Third-Party Libraries
+from scrapy.exceptions import DropItem
+
 
 class ExportFilePipeline:
     """Print file contents to the console."""
diff --git a/backend/worker/webscraper/webscraper/settings.py b/backend/worker/webscraper/webscraper/settings.py
index 2fa7d676..91769d75 100644
--- a/backend/worker/webscraper/webscraper/settings.py
+++ b/backend/worker/webscraper/webscraper/settings.py
@@ -9,6 +9,7 @@
 https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 """
 
+# Standard Python Libraries
 import logging
 
 BOT_NAME = "webscraper"
diff --git a/backend/worker/webscraper/webscraper/spiders/main_spider.py b/backend/worker/webscraper/webscraper/spiders/main_spider.py
index 0c061076..c4221a14 100644
--- a/backend/worker/webscraper/webscraper/spiders/main_spider.py
+++ b/backend/worker/webscraper/webscraper/spiders/main_spider.py
@@ -4,10 +4,13 @@
 The MainSpider class is a Scrapy spider that crawls and scrapes data from the specified start URLs.
 """
 
-from scrapy.spiders import CrawlSpider, Rule
-from scrapy.linkextractors import LinkExtractor
+# Standard Python Libraries
 from urllib.parse import urlparse
 
+# Third-Party Libraries
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import CrawlSpider, Rule
+
 
 class MainSpider(CrawlSpider):
     """
@@ -31,7 +34,7 @@ def __init__(self, *args, **kwargs):
             **kwargs: Arbitrary keyword arguments.
         """
         super().__init__(*args, **kwargs)
-        with open(self.domains_file, "r") as f:
+        with open(self.domains_file) as f:
             self.start_urls = f.read().split("\n")
         self.allowed_domains = [urlparse(url).netloc for url in self.start_urls]
 
diff --git a/backend/worker/webscraper/webscraper/spiders/test_main_spider.py b/backend/worker/webscraper/webscraper/spiders/test_main_spider.py
index f7d86199..89b87dfa 100644
--- a/backend/worker/webscraper/webscraper/spiders/test_main_spider.py
+++ b/backend/worker/webscraper/webscraper/spiders/test_main_spider.py
@@ -1,8 +1,12 @@
+# Standard Python Libraries
+import json
+from tempfile import NamedTemporaryFile
+
+# Third-Party Libraries
 import pytest
+from scrapy.http import Request, Response
+
 from .main_spider import MainSpider
-from scrapy.http import Response, Request
-from tempfile import NamedTemporaryFile
-import json
 
 SAMPLE_HEADERS = {
     "Server": "Apache",
@@ -34,7 +38,7 @@ def test_sample_website(spider):
     response = Response(
         url="https://www.cisa.gov",
         request=Request(url="https://www.cisa.gov"),
-        body="<body>Hello world</body>".encode(),
+        body=b"<body>Hello world</body>",
         headers=SAMPLE_HEADERS,
     )
     results = list(spider.parse_item(response))
diff --git a/backend/worker/webscraper/webscraper/test_pipelines.py b/backend/worker/webscraper/webscraper/test_pipelines.py
index ecde460f..5aecb15a 100644
--- a/backend/worker/webscraper/webscraper/test_pipelines.py
+++ b/backend/worker/webscraper/webscraper/test_pipelines.py
@@ -1,7 +1,11 @@
+# Standard Python Libraries
+from unittest.mock import MagicMock
+
+# Third-Party Libraries
 import pytest
-from .pipelines import ExportFilePipeline
 from scrapy.exceptions import DropItem
-from unittest.mock import MagicMock
+
+from .pipelines import ExportFilePipeline
 
 
 @pytest.fixture

From 7a8e551f0d589ba73fd46d98d1cf54f410571b98 Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Fri, 15 Mar 2024 13:30:24 -0500
Subject: [PATCH 03/13] Add types-requests to mypy dependencies in pre-commit
 config.

---
 .pre-commit-config.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 30e9e9f9..eca2b544 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -103,6 +103,8 @@ repos:
     rev: v1.5.1
     hooks:
       - id: mypy
+        additional_dependencies:
+          - types-requests
   - repo: https://github.com/asottile/pyupgrade
     rev: v3.10.1
     hooks:

From 0fae6d5867c44728c068077a0bde2e961cb47330 Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Fri, 15 Mar 2024 14:41:59 -0500
Subject: [PATCH 04/13] Delete hard-coded keys from mitmproxy files.

---
 .../worker/test_mitmproxy_sign_requests.py    | 32 ++++---------------
 1 file changed, 7 insertions(+), 25 deletions(-)

diff --git a/backend/worker/test_mitmproxy_sign_requests.py b/backend/worker/test_mitmproxy_sign_requests.py
index 2ff660e3..b2b23bc3 100644
--- a/backend/worker/test_mitmproxy_sign_requests.py
+++ b/backend/worker/test_mitmproxy_sign_requests.py
@@ -4,34 +4,16 @@
 It includes tests for different scenarios such as when a user agent and signature are set, and when they are not set.
 """
 
+# Standard Python Libraries
+import os
+
 # Third-Party Libraries
+from dotenv import load_dotenv
 from mitmproxy.test import taddons, tflow
 
 from .mitmproxy_sign_requests import SignRequests
 
-# This is a test RSA private key and not used in any deployed environment
-private_key = """-----BEGIN RSA PRIVATE KEY-----
-MIICXgIBAAKBgQDCFENGw33yGihy92pDjZQhl0C36rPJj+CvfSC8+q28hxA161QF
-NUd13wuCTUcq0Qd2qsBe/2hFyc2DCJJg0h1L78+6Z4UMR7EOcpfdUE9Hf3m/hs+F
-UR45uBJeDK1HSFHD8bHKD6kv8FPGfJTotc+2xjJwoYi+1hqp1fIekaxsyQIDAQAB
-AoGBAJR8ZkCUvx5kzv+utdl7T5MnordT1TvoXXJGXK7ZZ+UuvMNUCdN2QPc4sBiA
-QWvLw1cSKt5DsKZ8UETpYPy8pPYnnDEz2dDYiaew9+xEpubyeW2oH4Zx71wqBtOK
-kqwrXa/pzdpiucRRjk6vE6YY7EBBs/g7uanVpGibOVAEsqH1AkEA7DkjVH28WDUg
-f1nqvfn2Kj6CT7nIcE3jGJsZZ7zlZmBmHFDONMLUrXR/Zm3pR5m0tCmBqa5RK95u
-412jt1dPIwJBANJT3v8pnkth48bQo/fKel6uEYyboRtA5/uHuHkZ6FQF7OUkGogc
-mSJluOdc5t6hI1VsLn0QZEjQZMEOWr+wKSMCQQCC4kXJEsHAve77oP6HtG/IiEn7
-kpyUXRNvFsDE0czpJJBvL/aRFUJxuRK91jhjC68sA7NsKMGg5OXb5I5Jj36xAkEA
-gIT7aFOYBFwGgQAQkWNKLvySgKbAZRTeLBacpHMuQdl1DfdntvAyqpAZ0lY0RKmW
-G6aFKaqQfOXKCyWoUiVknQJAXrlgySFci/2ueKlIE1QqIiLSZ8V8OlpFLRnb1pzI
-7U1yQXnTAEFYM560yJlzUpOb1V4cScGd365tiSMvxLOvTA==
------END RSA PRIVATE KEY-----"""
-
-public_key = """-----BEGIN PUBLIC KEY-----
-MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDCFENGw33yGihy92pDjZQhl0C3
-6rPJj+CvfSC8+q28hxA161QFNUd13wuCTUcq0Qd2qsBe/2hFyc2DCJJg0h1L78+6
-Z4UMR7EOcpfdUE9Hf3m/hs+FUR45uBJeDK1HSFHD8bHKD6kv8FPGfJTotc+2xjJw
-oYi+1hqp1fIekaxsyQIDAQAB
------END PUBLIC KEY-----"""
+load_dotenv()
 
 
 def test_user_agent_and_signature():
@@ -42,8 +24,8 @@ def test_user_agent_and_signature():
     """
     sr = SignRequests(
         key_id="crossfeed",
-        public_key=public_key,
-        private_key=private_key,
+        public_key=os.getenv("WORKER_SIGNATURE_PUBLIC_KEY"),
+        private_key=os.getenv("WORKER_SIGNATURE_PRIVATE_KEY"),
         user_agent="custom user agent",
     )
     with taddons.context():

From 02bde8726abd2e1fde7373bb32d3560d74636b73 Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Fri, 15 Mar 2024 14:54:35 -0500
Subject: [PATCH 05/13] Add project .python-version.

---
 .python-version | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .python-version

diff --git a/.python-version b/.python-version
new file mode 100644
index 00000000..e9d31ca3
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+XFD

From 9c0523e056bf3502c410926080ff865ebd720331 Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Fri, 15 Mar 2024 19:00:19 -0500
Subject: [PATCH 06/13] Use dotenv to import worker keys into mitmproxy; add
 line breaks to docstrings in mitmproxy test.

---
 backend/worker/mitmproxy_sign_requests.py      | 3 +++
 backend/worker/test_mitmproxy_sign_requests.py | 6 ++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/backend/worker/mitmproxy_sign_requests.py b/backend/worker/mitmproxy_sign_requests.py
index a2c55022..c98ac6e3 100644
--- a/backend/worker/mitmproxy_sign_requests.py
+++ b/backend/worker/mitmproxy_sign_requests.py
@@ -4,10 +4,13 @@
 import traceback
 
 # Third-Party Libraries
+from dotenv import load_dotenv
 from mitmproxy import http
 import requests
 from requests_http_signature import HTTPSignatureHeaderAuth
 
+load_dotenv()
+
 
 class SignRequests:
     """
diff --git a/backend/worker/test_mitmproxy_sign_requests.py b/backend/worker/test_mitmproxy_sign_requests.py
index b2b23bc3..95c63498 100644
--- a/backend/worker/test_mitmproxy_sign_requests.py
+++ b/backend/worker/test_mitmproxy_sign_requests.py
@@ -20,7 +20,8 @@ def test_user_agent_and_signature():
     """
     This function tests the SignRequests class with a user agent and signature set.
 
-    It creates an instance of the SignRequests class with a user agent and signature, makes a request, and verifies the signature.
+    It creates an instance of the SignRequests class with a user agent and signature, makes a request, and verifies the
+    signature.
     """
     sr = SignRequests(
         key_id="crossfeed",
@@ -45,7 +46,8 @@ def test_no_user_agent_or_signature_set():
     """
     This function tests the SignRequests class without a user agent and signature set.
 
-    It creates an instance of the SignRequests class without a user agent and signature, makes a request, and checks that no user agent, date, or signature headers are set.
+    It creates an instance of the SignRequests class without a user agent and signature, makes a request, and checks
+    that no user agent, date, or signature headers are set.
     """
     sr = SignRequests(key_id="", public_key="", private_key="", user_agent="")
     with taddons.context():

From 73ebc4feb626806640d44db0b4f78ebd60411fe1 Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Fri, 15 Mar 2024 19:02:20 -0500
Subject: [PATCH 07/13] Update Scrapy to address security vulnerability.

---
 backend/worker/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/worker/requirements.txt b/backend/worker/requirements.txt
index 72e51965..350a28f2 100644
--- a/backend/worker/requirements.txt
+++ b/backend/worker/requirements.txt
@@ -26,7 +26,7 @@ regex==2023.3.23
 requests==2.31.0
 requests-http-signature==0.2.0
 scikit-learn==1.2.2
-Scrapy==2.9.0
+Scrapy==2.11.1
 setuptools==65.5.1
 six==1.16.0
 threadpoolctl==3.1.0

From d306c1a6921ae9ba9324fc5ea8f0f1ed5238fc32 Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Sun, 17 Mar 2024 10:32:21 -0500
Subject: [PATCH 08/13] Load .env for python_test in backend.yml.

---
 .github/workflows/backend.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
index 8fb5ba75..ee889d26 100644
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -111,6 +111,8 @@ jobs:
         uses: actions/setup-python@v5.0.0
         with:
           python-version: '3.10'
+      - name: Copy .env file
+        run: cp dev.env.example .env
       - uses: actions/cache@v3
         with:
           path: ~/.cache/pip

From dac4ba74d1d06122cac998108cc8f17c75d098d3 Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Sun, 17 Mar 2024 10:40:59 -0500
Subject: [PATCH 09/13] Fix file path for Copy .env file step in backend.yml's
 python_test.

---
 .github/workflows/backend.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
index ee889d26..95b0a0b9 100644
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -112,7 +112,7 @@ jobs:
         with:
           python-version: '3.10'
       - name: Copy .env file
-        run: cp dev.env.example .env
+        run: cp ../dev.env.example .env
       - uses: actions/cache@v3
         with:
           path: ~/.cache/pip

From 49352a3edb68a4183513474bec65b535f172dc31 Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Sun, 17 Mar 2024 11:07:12 -0500
Subject: [PATCH 10/13] Fix typo in .flake8 config.

---
 .flake8 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.flake8 b/.flake8
index 869d863a..f188ef63 100644
--- a/.flake8
+++ b/.flake8
@@ -21,7 +21,7 @@ select = C,D,E,F,W,B,B950
 # Also ignore flake8's warning about line breaks before binary
 # operators.  It no longer agrees with PEP8.  See, for example, here:
 # https://github.com/ambv/black/issues/21. Guido agrees here:
-# https://github.com/python/peps/commit/c59c4376ad233a62git ca4b3a6060c81368bd21e85b.
+# https://github.com/python/peps/commit/c59c4376ad233a62ca4b3a6060c81368bd21e85b.
 ignore = E501,W503
 # Ignore D100 and D103, which check for docstrings in modules and functions, in all test files
 per-file-ignores =

From e20fbfddf0c6b27e1d64bc75d2332a1fef3dde77 Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Mon, 18 Mar 2024 14:40:42 -0500
Subject: [PATCH 11/13] Remove exception for test files from .flake8; write
 docstrings for test_pipelines.py and test_main_spider.py.

---
 .flake8                                       |  4 ---
 .../webscraper/spiders/test_main_spider.py    | 18 +++++++++++
 .../webscraper/webscraper/test_pipelines.py   | 31 +++++++++++++++++++
 3 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/.flake8 b/.flake8
index f188ef63..92ff8268 100644
--- a/.flake8
+++ b/.flake8
@@ -23,7 +23,3 @@ select = C,D,E,F,W,B,B950
 # https://github.com/ambv/black/issues/21. Guido agrees here:
 # https://github.com/python/peps/commit/c59c4376ad233a62ca4b3a6060c81368bd21e85b.
 ignore = E501,W503
-# Ignore D100 and D103, which check for docstrings in modules and functions, in all test files
-per-file-ignores =
-    # Ignore D100 and D103 in all test files
-    */test_*.py: D100, D103
diff --git a/backend/worker/webscraper/webscraper/spiders/test_main_spider.py b/backend/worker/webscraper/webscraper/spiders/test_main_spider.py
index 89b87dfa..44807905 100644
--- a/backend/worker/webscraper/webscraper/spiders/test_main_spider.py
+++ b/backend/worker/webscraper/webscraper/spiders/test_main_spider.py
@@ -1,3 +1,9 @@
+"""
+This module contains tests for the MainSpider class in the main_spider module.
+
+It covers scenarios such as parsing a response received from a sample website.
+"""
+
 # Standard Python Libraries
 import json
 from tempfile import NamedTemporaryFile
@@ -30,11 +36,23 @@
 
 @pytest.fixture
 def spider():
+    """
+    Create a MainSpider instance with a temporary domains file.
+
+    This fixture creates a NamedTemporaryFile instance and uses its name as the domains_file parameter for the
+    MainSpider instance. The MainSpider instance is then returned for use in the tests.
+    """
     with NamedTemporaryFile() as f:
         return MainSpider(domains_file=f.name)
 
 
 def test_sample_website(spider):
+    """
+    Test the MainSpider class with a sample website response.
+
+    This function creates a sample Response instance with a specific body and headers. It then calls the parse_item
+    method of the MainSpider instance (provided by the spider fixture) with the sample response and checks the results.
+    """
     response = Response(
         url="https://www.cisa.gov",
         request=Request(url="https://www.cisa.gov"),
diff --git a/backend/worker/webscraper/webscraper/test_pipelines.py b/backend/worker/webscraper/webscraper/test_pipelines.py
index 5aecb15a..b7c6fadd 100644
--- a/backend/worker/webscraper/webscraper/test_pipelines.py
+++ b/backend/worker/webscraper/webscraper/test_pipelines.py
@@ -1,3 +1,9 @@
+"""
+This module contains tests for the ExportFilePipeline class in the pipelines module.
+
+It includes tests for different scenarios such as processing an item and handling duplicate items.
+"""
+
 # Standard Python Libraries
 from unittest.mock import MagicMock
 
@@ -10,11 +16,23 @@
 
 @pytest.fixture
 def pipeline():
+    """
+    Create an ExportFilePipeline instance with a mocked print function.
+
+    This fixture creates a MagicMock instance and uses it as the print parameter for the
+    ExportFilePipeline instance. The ExportFilePipeline instance is then returned for use in the tests.
+    """
     return ExportFilePipeline(print=MagicMock())
 
 
 @pytest.fixture
 def item():
+    """
+    Create a sample item for testing.
+
+    This fixture creates a dictionary that represents a sample item with specific headers and other details.
+    The item is then returned for use in the tests.
+    """
     return {
         "status": 200,
         "url": "https://www.cisa.gov",
@@ -49,11 +67,24 @@ def item():
 
 
 def test_print_item(pipeline, item):
+    """
+    Test the process_item method of the ExportFilePipeline class with a sample item.
+
+    This function calls the process_item method of the ExportFilePipeline instance (provided by the pipeline fixture)
+    with the sample item (provided by the item fixture) and checks if the print function was called.
+    """
     pipeline.process_item(item)
     pipeline.print.assert_called_once()
 
 
 def test_discard_duplicate_items(pipeline, item):
+    """
+    Test the process_item method of the ExportFilePipeline class with duplicate items.
+
+    This function calls the process_item method of the ExportFilePipeline instance (provided by the pipeline fixture)
+    with the sample item (provided by the item fixture) twice and checks that a DropItem exception is raised the second
+    time. It also checks that the print function was called only once.
+    """
     pipeline.process_item(item)
     pipeline.print.assert_called_once()
     pipeline.print.reset_mock()

From bd2196eeb52c64827c835667fb9280e660e87f2b Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Mon, 18 Mar 2024 14:47:24 -0500
Subject: [PATCH 12/13] Remove exception for test files from .bandit.yml.

---
 .bandit.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.bandit.yml b/.bandit.yml
index c8cf0312..b6b3a2c8 100644
--- a/.bandit.yml
+++ b/.bandit.yml
@@ -9,6 +9,3 @@ tests:
 
 skips:
   - B101  # skip "assert used" check since assertions are required in pytests
-
-exclude:
-  - '**/test_*.py'

From 9ac8ec4c2186f7a6b69f8b861149e9ca7c2b89b3 Mon Sep 17 00:00:00 2001
From: "Grayson, Matthew" <matthew.grayson@associates.cisa.dhs.gov>
Date: Mon, 18 Mar 2024 15:01:05 -0500
Subject: [PATCH 13/13] Delete .python-version from origin; add
 .python-version, __pycache__, and .mypy_cache to .gitignore.

---
 .gitignore      | 5 ++++-
 .python-version | 1 -
 2 files changed, 4 insertions(+), 2 deletions(-)
 delete mode 100644 .python-version

diff --git a/.gitignore b/.gitignore
index 69c09e98..1248786c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,10 @@
 # Files already tracked by Git are not affected.
 # See: https://git-scm.com/docs/gitignore
 
+# python
+__pycache__
+.mypy_cache
+.python-version
 
 # terraform
 .terraform
@@ -50,4 +54,3 @@ minio-data
 infrastructure/lambdas/security_headers.zip
 *.hcl
 .iac-data
-
diff --git a/.python-version b/.python-version
deleted file mode 100644
index e9d31ca3..00000000
--- a/.python-version
+++ /dev/null
@@ -1 +0,0 @@
-XFD