diff --git a/.bandit.yml b/.bandit.yml index ab3cb21e..b6b3a2c8 100644 --- a/.bandit.yml +++ b/.bandit.yml @@ -6,8 +6,6 @@ # If `tests` is empty, all tests are considered included. tests: -# - B101 -# - B102 skips: -# - B101 # skip "assert used" check since assertions are required in pytests + - B101 # skip "assert used" check since assertions are required in pytests diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 8fb5ba75..95b0a0b9 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -111,6 +111,8 @@ jobs: uses: actions/setup-python@v5.0.0 with: python-version: '3.10' + - name: Copy .env file + run: cp ../dev.env.example .env - uses: actions/cache@v3 with: path: ~/.cache/pip diff --git a/.gitignore b/.gitignore index 69c09e98..1248786c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,10 @@ # Files already tracked by Git are not affected. # See: https://git-scm.com/docs/gitignore +# python +__pycache__ +.mypy_cache +.python-version # terraform .terraform @@ -50,4 +54,3 @@ minio-data infrastructure/lambdas/security_headers.zip *.hcl .iac-data - diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 30e9e9f9..eca2b544 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -103,6 +103,8 @@ repos: rev: v1.5.1 hooks: - id: mypy + additional_dependencies: + - types-requests - repo: https://github.com/asottile/pyupgrade rev: v3.10.1 hooks: diff --git a/backend/scripts/populateCountiesCities/cities.py b/backend/scripts/populateCountiesCities/cities.py index cd720f42..8adc3c4e 100644 --- a/backend/scripts/populateCountiesCities/cities.py +++ b/backend/scripts/populateCountiesCities/cities.py @@ -1,18 +1,44 @@ -import pandas as pd -import requests -from bs4 import BeautifulSoup -import time -import re +""" +This module contains the script for populating cities data. + +It includes functions for parsing titles, pulling cities data from Wikipedia, +and writing the data to a CSV file. +""" + +# Standard Python Libraries import json +import re +import time from urllib.parse import unquote +# Third-Party Libraries +from bs4 import BeautifulSoup +import pandas as pd +import requests + def title_parse(title): + """ + Parse the title by unquoting it. + + Args: + title (str): The title to be parsed. + + Returns: + str: The parsed title. + """ title = unquote(title) return title def pull_cities(): + """ + Process and pull cities data from Wikipedia. + + This function reads the Wikipedia US cities data from a JSON file, processes each entry, + fetches the corresponding Wikipedia page, parses the page to extract city, county, and URL information, + and writes the data to a CSV file. 
+ """ print("Processing Cities...") with open("wikipedia_US_cities.json") as f: wikipedia_us_city_data = json.load(f) @@ -23,7 +49,10 @@ def pull_cities(): print(entry["name"]) # get the response in the form of html wikiurl = "https://en.wikipedia.org/wiki/" + entry["url"] - response = requests.get(wikiurl) + try: + response = requests.get(wikiurl, timeout=5) + except requests.exceptions.Timeout: + print("The request timed out") # parse data from the html into a beautifulsoup object soup = BeautifulSoup(response.text, "html.parser") @@ -52,7 +81,9 @@ def pull_cities(): if "," in link.get("title"): county_pieces = link.get("title").split(",") # OPEN WIKIPEDIA PAGE UP - x = requests.get("https://en.wikipedia.org/" + link.get("href")) + x = requests.get( + "https://en.wikipedia.org/" + link.get("href"), timeout=5 + ) # PULL COUNTY OR PARISH FROM WIKIPEDIA PAGE county_parish_matches = re.findall( @@ -85,7 +116,8 @@ def pull_cities(): } ) time.sleep(1) - except: + except Exception as e: + print(f"Error: {e}") pass df = pd.DataFrame(holding_pen, columns=["State", "County", "City", "URL"]) diff --git a/backend/scripts/populateCountiesCities/counties.py b/backend/scripts/populateCountiesCities/counties.py index 64df0f38..34823f8c 100644 --- a/backend/scripts/populateCountiesCities/counties.py +++ b/backend/scripts/populateCountiesCities/counties.py @@ -1,16 +1,35 @@ +""" +This module contains the script for populating counties data. + +It includes functions for pulling counties data from Wikipedia, +and writing the data to a CSV file. +""" + +# Standard Python Libraries +import re +import time + +# Third-Party Libraries +from bs4 import BeautifulSoup import pandas as pd import requests -from bs4 import BeautifulSoup -import time -import re def pull_counties(): + """ + Process and pull counties data from Wikipedia. + + This function fetches the Wikipedia page for the list of United States counties, + parses the page to extract county, state, and URL information, + and writes the data to a CSV file. + """ print("Processing Counties...") # get the response in the form of html wikiurl = "https://en.wikipedia.org/wiki/List_of_United_States_counties_and_county_equivalents" - table_class = "wikitable sortable jquery-tablesorter" - response = requests.get(wikiurl) + try: + response = requests.get(wikiurl, timeout=5) + except requests.exceptions.Timeout: + print("The request timed out") # parse data from the html into a beautifulsoup object soup = BeautifulSoup(response.text, "html.parser") @@ -24,7 +43,7 @@ def pull_counties(): try: county_pieces = link.get("title").split(", ") # OPEN WIKIPEDIA PAGE UP - x = requests.get("https://en.wikipedia.org/" + link.get("href")) + x = requests.get("https://en.wikipedia.org/" + link.get("href"), timeout=5) # PULL WEBSITE FROM WIKIPEDIA PAGE w = re.findall( @@ -43,6 +62,7 @@ def pull_counties(): } ) except Exception as e: + print(f"Error: {e}") pass time.sleep(1) diff --git a/backend/scripts/populateCountiesCities/main.py b/backend/scripts/populateCountiesCities/main.py index dc86edb1..baf7de5b 100644 --- a/backend/scripts/populateCountiesCities/main.py +++ b/backend/scripts/populateCountiesCities/main.py @@ -1,22 +1,53 @@ -import typer +""" +This module contains the main script for populating counties and cities data. + +It includes commands for processing cities and counties data separately or both at once. 
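One note on the new timeout handling added to cities.py and counties.py above: when requests.get raises requests.exceptions.Timeout, the except branch only prints a message, so execution continues with response unbound. A minimal sketch of a guard for that case is below; the fetch helper name is hypothetical and not part of this change.

    # Illustrative sketch only, not part of this diff: skip the entry when a
    # Wikipedia request times out instead of falling through with `response`
    # unbound.
    import requests


    def fetch(url, timeout=5):
        """Return the page text, or None if the request timed out."""
        try:
            return requests.get(url, timeout=timeout).text
        except requests.exceptions.Timeout:
            print(f"The request timed out: {url}")
            return None

    # In pull_cities() / pull_counties() one could then write, for example:
    #     html = fetch(wikiurl)
    #     if html is None:
    #         continue  # or return, depending on the surrounding loop
    #     soup = BeautifulSoup(html, "html.parser")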
+""" + +# Third-Party Libraries import cities import counties +import typer app = typer.Typer() @app.command() def process_cities(): + """ + Process and pull cities data from Wikipedia. + + This function calls the pull_cities function from the cities module, + which reads the Wikipedia US cities data from a JSON file, processes each entry, + fetches the corresponding Wikipedia page, parses the page to extract city, county, and URL information, + and writes the data to a CSV file. + """ cities.pull_cities() @app.command() def process_counties(): + """ + Process and pull counties data from Wikipedia. + + This function calls the pull_counties function from the counties module, + which fetches the Wikipedia page for the list of United States counties, + parses the page to extract county, state, and URL information, + and writes the data to a CSV file. + """ counties.pull_counties() @app.command() def process_both(): + """ + Process and pull both cities and counties data from Wikipedia. + + This function calls both the pull_cities function from the cities module and the pull_counties function from the counties module, + which fetches the Wikipedia pages for the list of United States cities and counties, + parses the pages to extract city, county, state, and URL information, + and writes the data to CSV files. + """ counties.pull_counties() cities.pull_cities() diff --git a/backend/scripts/populateCountiesCities/requirements.txt b/backend/scripts/populateCountiesCities/requirements.txt index b3e808d7..0e6f50f1 100644 --- a/backend/scripts/populateCountiesCities/requirements.txt +++ b/backend/scripts/populateCountiesCities/requirements.txt @@ -1,4 +1,4 @@ +beautifulsoup4==4.11.2 pandas==1.5.1 requests==2.28.2 -beautifulsoup4==4.11.2 typer==0.7.0 diff --git a/backend/worker/__init__.py b/backend/worker/__init__.py index e69de29b..088f05b8 100644 --- a/backend/worker/__init__.py +++ b/backend/worker/__init__.py @@ -0,0 +1,5 @@ +""" +This package contains the worker tasks for the backend. + +It includes modules for processing data, interacting with databases, and other backend tasks. +""" diff --git a/backend/worker/mitmproxy_sign_requests.py b/backend/worker/mitmproxy_sign_requests.py index 691a5afb..c98ac6e3 100644 --- a/backend/worker/mitmproxy_sign_requests.py +++ b/backend/worker/mitmproxy_sign_requests.py @@ -1,16 +1,41 @@ -""" -mitmproxy addon that signs requests and adds a Crossfeed-specific user agent. -""" -from mitmproxy import http, ctx +"""mitmproxy addon that signs requests and adds a Crossfeed-specific user agent.""" +# Standard Python Libraries import os -import requests -import json import traceback + +# Third-Party Libraries +from dotenv import load_dotenv +from mitmproxy import http +import requests from requests_http_signature import HTTPSignatureHeaderAuth +load_dotenv() + class SignRequests: + """ + A class used to sign HTTP requests and add a Crossfeed-specific user agent. + + This class is used as a mitmproxy addon. It signs the HTTP requests using the provided private key and adds a user agent to the request headers. + + Attributes: + key_id (str): The key ID used for signing the requests. + private_key (str): The private key used for signing the requests. + public_key (str): The public key used for verifying the signature. + user_agent (str): The user agent to be added to the request headers. + signature_auth (HTTPSignatureHeaderAuth): The HTTPSignatureHeaderAuth instance used for signing the requests. 
+ """ + def __init__(self, key_id="", public_key="", private_key="", user_agent=""): + """ + Initialize the SignRequests instance. + + Args: + key_id (str, optional): The key ID used for signing the requests. Defaults to "". + public_key (str, optional): The public key used for verifying the signature. Defaults to "". + private_key (str, optional): The private key used for signing the requests. Defaults to "". + user_agent (str, optional): The user agent to be added to the request headers. Defaults to "". + """ self.key_id = key_id self.private_key = private_key self.public_key = public_key @@ -20,9 +45,30 @@ def __init__(self, key_id="", public_key="", private_key="", user_agent=""): ) def key_resolver(self, key_id, algorithm): + """ + Resolve the key for the given key_id and algorithm. + + Args: + key_id (str): The key ID used for signing the requests. + algorithm (str): The algorithm used for signing the requests. + + Returns: + bytes: The public key encoded in bytes. + """ return self.public_key.encode() def verify_signature(self, method, url, date, signature): + """ + Verify the signature of the HTTP request. + + Args: + method (str): The HTTP method of the request. + url (str): The URL of the request. + date (str): The date when the request was made. + signature (str): The signature of the request. + + This method uses the HTTPSignatureHeaderAuth's verify method to verify the signature of the request. + """ HTTPSignatureHeaderAuth.verify( requests.Request( method=url, url=url, headers={"date": date, "Signature": signature} @@ -32,6 +78,17 @@ def verify_signature(self, method, url, date, signature): ) def request(self, flow): + """ + Process the HTTP request. + + This method adds a user agent to the request headers if one is provided. If a private key is provided, it signs the request using the HTTPSignatureHeaderAuth instance. + + Args: + flow (mitmproxy.http.HTTPFlow): The HTTP request/response flow. + + Raises: + Exception: If there is an error while processing the request, an exception is raised and a 500 response is returned. 
+ """ try: if self.user_agent: flow.request.headers["User-Agent"] = self.user_agent @@ -57,47 +114,11 @@ def request(self, flow): ) -test = os.getenv("WORKER_TEST", None) is not None - -if test: - # This is a test RSA private key and not used in any deployed environment - # file deepcode ignore HardcodedNonCryptoSecret: - private_key = """-----BEGIN RSA PRIVATE KEY----- -MIICXgIBAAKBgQDCFENGw33yGihy92pDjZQhl0C36rPJj+CvfSC8+q28hxA161QF -NUd13wuCTUcq0Qd2qsBe/2hFyc2DCJJg0h1L78+6Z4UMR7EOcpfdUE9Hf3m/hs+F -UR45uBJeDK1HSFHD8bHKD6kv8FPGfJTotc+2xjJwoYi+1hqp1fIekaxsyQIDAQAB -AoGBAJR8ZkCUvx5kzv+utdl7T5MnordT1TvoXXJGXK7ZZ+UuvMNUCdN2QPc4sBiA -QWvLw1cSKt5DsKZ8UETpYPy8pPYnnDEz2dDYiaew9+xEpubyeW2oH4Zx71wqBtOK -kqwrXa/pzdpiucRRjk6vE6YY7EBBs/g7uanVpGibOVAEsqH1AkEA7DkjVH28WDUg -f1nqvfn2Kj6CT7nIcE3jGJsZZ7zlZmBmHFDONMLUrXR/Zm3pR5m0tCmBqa5RK95u -412jt1dPIwJBANJT3v8pnkth48bQo/fKel6uEYyboRtA5/uHuHkZ6FQF7OUkGogc -mSJluOdc5t6hI1VsLn0QZEjQZMEOWr+wKSMCQQCC4kXJEsHAve77oP6HtG/IiEn7 -kpyUXRNvFsDE0czpJJBvL/aRFUJxuRK91jhjC68sA7NsKMGg5OXb5I5Jj36xAkEA -gIT7aFOYBFwGgQAQkWNKLvySgKbAZRTeLBacpHMuQdl1DfdntvAyqpAZ0lY0RKmW -G6aFKaqQfOXKCyWoUiVknQJAXrlgySFci/2ueKlIE1QqIiLSZ8V8OlpFLRnb1pzI -7U1yQXnTAEFYM560yJlzUpOb1V4cScGd365tiSMvxLOvTA== ------END RSA PRIVATE KEY-----""" - - public_key = """-----BEGIN PUBLIC KEY----- -MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDCFENGw33yGihy92pDjZQhl0C3 -6rPJj+CvfSC8+q28hxA161QFNUd13wuCTUcq0Qd2qsBe/2hFyc2DCJJg0h1L78+6 -Z4UMR7EOcpfdUE9Hf3m/hs+FUR45uBJeDK1HSFHD8bHKD6kv8FPGfJTotc+2xjJw -oYi+1hqp1fIekaxsyQIDAQAB ------END PUBLIC KEY-----""" - addons = [ - SignRequests( - key_id="crossfeed", - public_key=public_key, - private_key=private_key, - user_agent="Crossfeed test user agent", - ) - ] -else: - addons = [ - SignRequests( - key_id="crossfeed", - public_key=os.getenv("WORKER_SIGNATURE_PUBLIC_KEY", ""), - private_key=os.getenv("WORKER_SIGNATURE_PRIVATE_KEY", ""), - user_agent=os.getenv("WORKER_USER_AGENT", ""), - ) - ] +addons = [ + SignRequests( + key_id="crossfeed", + public_key=os.getenv("WORKER_SIGNATURE_PUBLIC_KEY", ""), + private_key=os.getenv("WORKER_SIGNATURE_PRIVATE_KEY", ""), + user_agent=os.getenv("WORKER_USER_AGENT", ""), + ) +] diff --git a/backend/worker/requirements.txt b/backend/worker/requirements.txt index d362d5a5..350a28f2 100644 --- a/backend/worker/requirements.txt +++ b/backend/worker/requirements.txt @@ -5,9 +5,10 @@ click==8.1.3 dateparser==1.1.8 dnstwist==20230509 docopt==0.6.2 +git+https://github.com/LeapBeyond/scrubadub.git@d0e12c5d922631af3532d044196b05fb1b7c8c1c +git+https://github.com/mitmproxy/mitmproxy@e0e46f4 idna==3.4 joblib==1.2.0 -git+https://github.com/mitmproxy/mitmproxy@e0e46f4 mitmproxy_wireguard==0.1.23 numpy==1.24.3 pandas==2.1.4 @@ -18,20 +19,20 @@ psycopg2-binary==2.9.5 pyproject_hooks==1.0.0 pytest==7.3.0 python-dateutil==2.8.2 +python-dotenv==1.0.1 pytz==2023.3 pytz-deprecation-shim==0.1.0.post0 regex==2023.3.23 requests==2.31.0 requests-http-signature==0.2.0 -Scrapy==2.9.0 -git+https://github.com/LeapBeyond/scrubadub.git@d0e12c5d922631af3532d044196b05fb1b7c8c1c scikit-learn==1.2.2 +Scrapy==2.11.1 +setuptools==65.5.1 six==1.16.0 threadpoolctl==3.1.0 tomli==2.0.1 trustymail @ git+https://github.com/Matthew-Grayson/trustymail@production tzdata==2023.3 tzlocal==4.3 -yarg==0.1.9 wheel==0.38.1 -setuptools==65.5.1 +yarg==0.1.9 diff --git a/backend/worker/test_mitmproxy_sign_requests.py b/backend/worker/test_mitmproxy_sign_requests.py index 766ca5fd..95c63498 100644 --- a/backend/worker/test_mitmproxy_sign_requests.py +++ 
b/backend/worker/test_mitmproxy_sign_requests.py @@ -1,41 +1,35 @@ -from mitmproxy import exceptions -from mitmproxy.test import tflow -from mitmproxy.test import taddons +""" +This module contains tests for the SignRequests class in the mitmproxy_sign_requests module. + +It includes tests for different scenarios such as when a user agent and signature are set, and when they are not set. +""" + +# Standard Python Libraries +import os + +# Third-Party Libraries +from dotenv import load_dotenv +from mitmproxy.test import taddons, tflow + from .mitmproxy_sign_requests import SignRequests -# This is a test RSA private key and not used in any deployed environment -private_key = """-----BEGIN RSA PRIVATE KEY----- -MIICXgIBAAKBgQDCFENGw33yGihy92pDjZQhl0C36rPJj+CvfSC8+q28hxA161QF -NUd13wuCTUcq0Qd2qsBe/2hFyc2DCJJg0h1L78+6Z4UMR7EOcpfdUE9Hf3m/hs+F -UR45uBJeDK1HSFHD8bHKD6kv8FPGfJTotc+2xjJwoYi+1hqp1fIekaxsyQIDAQAB -AoGBAJR8ZkCUvx5kzv+utdl7T5MnordT1TvoXXJGXK7ZZ+UuvMNUCdN2QPc4sBiA -QWvLw1cSKt5DsKZ8UETpYPy8pPYnnDEz2dDYiaew9+xEpubyeW2oH4Zx71wqBtOK -kqwrXa/pzdpiucRRjk6vE6YY7EBBs/g7uanVpGibOVAEsqH1AkEA7DkjVH28WDUg -f1nqvfn2Kj6CT7nIcE3jGJsZZ7zlZmBmHFDONMLUrXR/Zm3pR5m0tCmBqa5RK95u -412jt1dPIwJBANJT3v8pnkth48bQo/fKel6uEYyboRtA5/uHuHkZ6FQF7OUkGogc -mSJluOdc5t6hI1VsLn0QZEjQZMEOWr+wKSMCQQCC4kXJEsHAve77oP6HtG/IiEn7 -kpyUXRNvFsDE0czpJJBvL/aRFUJxuRK91jhjC68sA7NsKMGg5OXb5I5Jj36xAkEA -gIT7aFOYBFwGgQAQkWNKLvySgKbAZRTeLBacpHMuQdl1DfdntvAyqpAZ0lY0RKmW -G6aFKaqQfOXKCyWoUiVknQJAXrlgySFci/2ueKlIE1QqIiLSZ8V8OlpFLRnb1pzI -7U1yQXnTAEFYM560yJlzUpOb1V4cScGd365tiSMvxLOvTA== ------END RSA PRIVATE KEY-----""" - -public_key = """-----BEGIN PUBLIC KEY----- -MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDCFENGw33yGihy92pDjZQhl0C3 -6rPJj+CvfSC8+q28hxA161QFNUd13wuCTUcq0Qd2qsBe/2hFyc2DCJJg0h1L78+6 -Z4UMR7EOcpfdUE9Hf3m/hs+FUR45uBJeDK1HSFHD8bHKD6kv8FPGfJTotc+2xjJw -oYi+1hqp1fIekaxsyQIDAQAB ------END PUBLIC KEY-----""" +load_dotenv() def test_user_agent_and_signature(): + """ + This function tests the SignRequests class with a user agent and signature set. + + It creates an instance of the SignRequests class with a user agent and signature, makes a request, and verifies the + signature. + """ sr = SignRequests( key_id="crossfeed", - public_key=public_key, - private_key=private_key, + public_key=os.getenv("WORKER_SIGNATURE_PUBLIC_KEY"), + private_key=os.getenv("WORKER_SIGNATURE_PRIVATE_KEY"), user_agent="custom user agent", ) - with taddons.context() as tctx: + with taddons.context(): f = tflow.tflow() f.request.headers["User-Agent"] = "original user agent" sr.request(f) @@ -49,8 +43,14 @@ def test_user_agent_and_signature(): def test_no_user_agent_or_signature_set(): + """ + This function tests the SignRequests class without a user agent and signature set. + + It creates an instance of the SignRequests class without a user agent and signature, makes a request, and checks + that no user agent, date, or signature headers are set. + """ sr = SignRequests(key_id="", public_key="", private_key="", user_agent="") - with taddons.context() as tctx: + with taddons.context(): f = tflow.tflow() sr.request(f) assert "User-Agent" not in f.request.headers diff --git a/backend/worker/webscraper/webscraper/__init__.py b/backend/worker/webscraper/webscraper/__init__.py index e69de29b..61265cd2 100644 --- a/backend/worker/webscraper/webscraper/__init__.py +++ b/backend/worker/webscraper/webscraper/__init__.py @@ -0,0 +1,5 @@ +""" +This package contains modules and classes for web scraping. 
+ +It includes modules for making HTTP requests, parsing HTML, and extracting data. +""" diff --git a/backend/worker/webscraper/webscraper/items.py b/backend/worker/webscraper/webscraper/items.py index a581a5fc..f7a4a8d6 100644 --- a/backend/worker/webscraper/webscraper/items.py +++ b/backend/worker/webscraper/webscraper/items.py @@ -1,6 +1,7 @@ -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html +""" +This module defines the items that the web scraper will extract from web pages. -import scrapy +Each item is represented by a class, and the fields of the class represent the data that the scraper will extract. +See documentation here for information on how to create models for your items: +https://docs.scrapy.org/en/latest/topics/items.html +""" diff --git a/backend/worker/webscraper/webscraper/middlewares.py b/backend/worker/webscraper/webscraper/middlewares.py index 3efe286a..9f755168 100644 --- a/backend/worker/webscraper/webscraper/middlewares.py +++ b/backend/worker/webscraper/webscraper/middlewares.py @@ -1,27 +1,44 @@ -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html +""" +This module defines the middlewares for the web scraper. -from scrapy import signals +Middlewares are used to process incoming responses and outgoing requests and items. This module defines two types of middlewares: Spider Middleware and Downloader Middleware. The Spider Middleware processes responses before they reach the spider and processes items and requests after they have been processed by the spider. The Downloader Middleware processes requests before they are sent to the downloader and processes responses before they reach the Spider Middleware or the spider. +See documentation here: +https://docs.scrapy.org/en/latest/topics/spider-middleware.html +""" -# useful for handling different item types with a single interface -from itemadapter import is_item, ItemAdapter +# Third-Party Libraries +from scrapy import signals class WebscraperSpiderMiddleware: + """ + This class defines the Spider Middleware for the web scraper. + + The Spider Middleware processes responses before they reach the spider and processes items and requests after they have been processed by the spider. + """ + # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the spider middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): + """ + Create spiders using Scrapy. + + Connect the spider_opened method to the spider_opened signal. + """ # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_spider_input(self, response, spider): + """ + Process each response that goes through the spider middleware and into the spider. + + Return None or raise an exception. + """ # Called for each response that goes through the spider # middleware and into the spider. @@ -29,14 +46,23 @@ def process_spider_input(self, response, spider): return None def process_spider_output(self, response, result, spider): + """ + Process the results returned from the Spider, after it has processed the response. + + Return an iterable of Request, or item objects. + """ # Called with the results returned from the Spider, after # it has processed the response. # Must return an iterable of Request, or item objects. 
- for i in result: - yield i + yield from result def process_spider_exception(self, response, exception, spider): + """ + Handle exceptions raised by a spider or process_spider_input() method. + + This method should return either None or an iterable of Request or item objects. + """ # Called when a spider or process_spider_input() method # (from other spider middleware) raises an exception. @@ -44,31 +70,52 @@ def process_spider_exception(self, response, exception, spider): pass def process_start_requests(self, start_requests, spider): + """ + Process the start requests of the spider. + + This method works similarly to the process_spider_output() method, except that it doesn’t have a response associated. It must return only requests (not items). + """ # Called with the start requests of the spider, and works # similarly to the process_spider_output() method, except # that it doesn’t have a response associated. # Must return only requests (not items). - for r in start_requests: - yield r + yield from start_requests def spider_opened(self, spider): + """Log the name of the spider when opened.""" spider.logger.info("Spider opened: %s" % spider.name) class WebscraperDownloaderMiddleware: + """ + This class defines the Downloader Middleware for the web scraper. + + The Downloader Middleware processes requests before they are sent to the downloader and processes responses before they reach the Spider Middleware or the spider. + """ + # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): + """ + Create spiders using Scrapy. + + Connect the spider_opened method to the spider_opened signal. + """ # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): + """ + Process each request that goes through the downloader middleware. + + Must either return None, a Response object, a Request object, or raise IgnoreRequest. + """ # Called for each request that goes through the downloader # middleware. @@ -81,6 +128,11 @@ def process_request(self, request, spider): return None def process_response(self, request, response, spider): + """ + Process the response returned from the downloader. + + Must either return a Response object, a Request object, or raise IgnoreRequest. + """ # Called with the response returned from the downloader. # Must either; @@ -90,6 +142,11 @@ def process_response(self, request, response, spider): return response def process_exception(self, request, exception, spider): + """ + Handle exceptions raised by a download handler or a process_request() method. + + Must either return None, a Response object, a Request object. + """ # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. 
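For context on how the two middleware classes documented above take effect: Scrapy only runs spider and downloader middlewares that are registered in the project settings. A minimal sketch is below, assuming the stock project layout; whether this project actually enables them is not shown in this diff, and the priority values are the Scrapy template defaults, used here purely for illustration.

    # Illustrative sketch, in webscraper/settings.py (not part of this diff).
    SPIDER_MIDDLEWARES = {
        "webscraper.middlewares.WebscraperSpiderMiddleware": 543,
    }
    DOWNLOADER_MIDDLEWARES = {
        "webscraper.middlewares.WebscraperDownloaderMiddleware": 543,
    }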
@@ -100,4 +157,5 @@ def process_exception(self, request, exception, spider): pass def spider_opened(self, spider): + """Log the name of the spider when it is opened.""" spider.logger.info("Spider opened: %s" % spider.name) diff --git a/backend/worker/webscraper/webscraper/pipelines.py b/backend/worker/webscraper/webscraper/pipelines.py index c9b63243..e6cd6631 100644 --- a/backend/worker/webscraper/webscraper/pipelines.py +++ b/backend/worker/webscraper/webscraper/pipelines.py @@ -1,18 +1,42 @@ -from scrapy.exceptions import DropItem +""" +This module contains the pipeline classes for the web scraper. + +The pipelines process the items returned by the spiders. +""" + +# Standard Python Libraries import json -import os -from io import BytesIO -from datetime import datetime + +# Third-Party Libraries +from scrapy.exceptions import DropItem class ExportFilePipeline: - """Prints file contents to the console.""" + """Print file contents to the console.""" def __init__(self, print=print): + """ + Initialize the ExportFilePipeline class. + + Args: + print (function, optional): A function to print the output. Defaults to print. + """ self.urls_seen = set() self.print = print def process_item(self, item, spider=None): + """ + Process each item that goes through the pipeline. + + If the item's URL has been seen before, it raises a DropItem exception. Otherwise, it prints the item and returns it. + + Args: + item (dict): The item to process. + spider (Spider, optional): The spider that produced the item. Defaults to None. + + Returns: + dict: The processed item. + """ if item["url"] in self.urls_seen: raise DropItem("Duplicate item found with url: %s" % item["url"]) self.urls_seen.add(item["url"]) diff --git a/backend/worker/webscraper/webscraper/settings.py b/backend/worker/webscraper/webscraper/settings.py index 40d5c9c5..91769d75 100644 --- a/backend/worker/webscraper/webscraper/settings.py +++ b/backend/worker/webscraper/webscraper/settings.py @@ -1,11 +1,15 @@ -# Scrapy settings for webscraper project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html +""" +Scrapy settings for the webscraper project. + +This module contains the settings for the webscraper project, including configurations for the spider, +downloader middleware, item pipelines, and more. +Additional settings are in the documentation: +https://docs.scrapy.org/en/latest/topics/settings.html +https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +https://docs.scrapy.org/en/latest/topics/spider-middleware.html +""" + +# Standard Python Libraries import logging BOT_NAME = "webscraper" diff --git a/backend/worker/webscraper/webscraper/spiders/__init__.py b/backend/worker/webscraper/webscraper/spiders/__init__.py index ebd689ac..cce0fa8b 100644 --- a/backend/worker/webscraper/webscraper/spiders/__init__.py +++ b/backend/worker/webscraper/webscraper/spiders/__init__.py @@ -1,4 +1,5 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. +""" +This package contains the spiders for the webscraper project. + +Each spider is a Python class that defines how a certain site or a group of sites will be scraped. 
+""" diff --git a/backend/worker/webscraper/webscraper/spiders/main_spider.py b/backend/worker/webscraper/webscraper/spiders/main_spider.py index af3b2a86..c4221a14 100644 --- a/backend/worker/webscraper/webscraper/spiders/main_spider.py +++ b/backend/worker/webscraper/webscraper/spiders/main_spider.py @@ -1,26 +1,69 @@ -import scrapy -from scrapy.spiders import CrawlSpider, Rule -from scrapy.linkextractors import LinkExtractor +""" +This module contains the MainSpider class for the webscraper project. + +The MainSpider class is a Scrapy spider that crawls and scrapes data from the specified start URLs. +""" + +# Standard Python Libraries from urllib.parse import urlparse -import hashlib -import json + +# Third-Party Libraries +from scrapy.linkextractors import LinkExtractor +from scrapy.spiders import CrawlSpider, Rule class MainSpider(CrawlSpider): + """ + MainSpider is a Scrapy spider that crawls and scrapes data from the specified start URLs. + + It uses the LinkExtractor to follow links and the parse_item method to process the scraped data. + """ + name = "main" rules = (Rule(LinkExtractor(), callback="parse_item", follow=True),) def __init__(self, *args, **kwargs): + """ + Initialize the MainSpider class. + + It reads the start URLs from a file and sets the allowed domains based on these URLs. + + Args: + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + """ super().__init__(*args, **kwargs) - with open(self.domains_file, "r") as f: + with open(self.domains_file) as f: self.start_urls = f.read().split("\n") self.allowed_domains = [urlparse(url).netloc for url in self.start_urls] def parse_start_url(self, response): + """ + Parse the start URL. + + This method gets called when the spider opens the start URL. It returns the result of the parse_item method. + + Args: + response (Response): The response object for the start URL. + + Returns: + dict: The result of the parse_item method. + """ return self.parse_item(response) def parse_item(self, response): + """ + Parse the response and extract the data. + + This method gets called for each response that the spider receives. It extracts the data from the response and returns it as a dictionary. + + Args: + response (Response): The response to parse. + + Returns: + dict: The extracted data. + """ try: body_decoded = response.body.decode() except UnicodeDecodeError: diff --git a/backend/worker/webscraper/webscraper/spiders/test_main_spider.py b/backend/worker/webscraper/webscraper/spiders/test_main_spider.py index f7d86199..44807905 100644 --- a/backend/worker/webscraper/webscraper/spiders/test_main_spider.py +++ b/backend/worker/webscraper/webscraper/spiders/test_main_spider.py @@ -1,8 +1,18 @@ +""" +This module contains tests for the MainSpider class in the main_spider module. + +It includes tests for different scenarios such as when a response from a sample website is received. +""" + +# Standard Python Libraries +import json +from tempfile import NamedTemporaryFile + +# Third-Party Libraries import pytest +from scrapy.http import Request, Response + from .main_spider import MainSpider -from scrapy.http import Response, Request -from tempfile import NamedTemporaryFile -import json SAMPLE_HEADERS = { "Server": "Apache", @@ -26,15 +36,27 @@ @pytest.fixture def spider(): + """ + Create a MainSpider instance with a temporary domains file. + + This fixture creates a NamedTemporaryFile instance and uses its name as the domains_file parameter for the + MainSpider instance. 
The MainSpider instance is then returned for use in the tests. + """ with NamedTemporaryFile() as f: return MainSpider(domains_file=f.name) def test_sample_website(spider): + """ + Test the MainSpider class with a sample website response. + + This function creates a sample Response instance with a specific body and headers. It then calls the parse_item + method of the MainSpider instance (provided by the spider fixture) with the sample response and checks the results. + """ response = Response( url="https://www.cisa.gov", request=Request(url="https://www.cisa.gov"), - body="Hello world".encode(), + body=b"Hello world", headers=SAMPLE_HEADERS, ) results = list(spider.parse_item(response)) diff --git a/backend/worker/webscraper/webscraper/test_pipelines.py b/backend/worker/webscraper/webscraper/test_pipelines.py index ecde460f..b7c6fadd 100644 --- a/backend/worker/webscraper/webscraper/test_pipelines.py +++ b/backend/worker/webscraper/webscraper/test_pipelines.py @@ -1,16 +1,38 @@ +""" +This module contains tests for the ExportFilePipeline class in the pipelines module. + +It includes tests for different scenarios such as processing an item and handling duplicate items. +""" + +# Standard Python Libraries +from unittest.mock import MagicMock + +# Third-Party Libraries import pytest -from .pipelines import ExportFilePipeline from scrapy.exceptions import DropItem -from unittest.mock import MagicMock + +from .pipelines import ExportFilePipeline @pytest.fixture def pipeline(): + """ + Create an ExportFilePipeline instance with a mocked print function. + + This fixture creates a MagicMock instance and uses it as the print parameter for the + ExportFilePipeline instance. The ExportFilePipeline instance is then returned for use in the tests. + """ return ExportFilePipeline(print=MagicMock()) @pytest.fixture def item(): + """ + Create a sample item for testing. + + This fixture creates a dictionary that represents a sample item with specific headers and other details. + The item is then returned for use in the tests. + """ return { "status": 200, "url": "https://www.cisa.gov", @@ -45,11 +67,24 @@ def item(): def test_print_item(pipeline, item): + """ + Test the process_item method of the ExportFilePipeline class with a sample item. + + This function calls the process_item method of the ExportFilePipeline instance (provided by the pipeline fixture) + with the sample item (provided by the item fixture) and checks if the print function was called. + """ pipeline.process_item(item) pipeline.print.assert_called_once() def test_discard_duplicate_items(pipeline, item): + """ + Test the process_item method of the ExportFilePipeline class with duplicate items. + + This function calls the process_item method of the ExportFilePipeline instance (provided by the pipeline fixture) + with the sample item (provided by the item fixture) twice and checks if a DropItem exception is raised the second time. + It also checks if the print function was called only once. + """ pipeline.process_item(item) pipeline.print.assert_called_once() pipeline.print.reset_mock()
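Taken together, the pipeline tests above exercise a simple de-duplication contract: the first item seen for a URL is printed and passed through, and a repeat of the same URL raises DropItem. A standalone sketch of that usage follows; the absolute import path is assumed, and the item here is a minimal stand-in for the fuller fixture used by the real tests.

    # Illustrative sketch of the ExportFilePipeline de-duplication behaviour.
    from unittest.mock import MagicMock

    import pytest
    from scrapy.exceptions import DropItem

    from webscraper.pipelines import ExportFilePipeline  # import path assumed

    pipeline = ExportFilePipeline(print=MagicMock())
    item = {"status": 200, "url": "https://www.cisa.gov", "headers": {}}

    pipeline.process_item(item)      # first sighting: printed and returned
    with pytest.raises(DropItem):
        pipeline.process_item(item)  # same URL again: dropped as a duplicate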