Issue 1803: percent of zips (WIP) #1916

Open · wants to merge 96 commits into main

Commits (96)
218fa48
Create deploy_be_staging.yml (#1575)
emma-nechamkin Apr 18, 2022
f680d86
Imputing income using geographic neighbors (#1559)
emma-nechamkin Apr 27, 2022
3a96001
Adding HOLC indicator (#1579)
emma-nechamkin May 12, 2022
2e38aaa
Update backend for Puerto Rico (#1686)
switzersc-usds Jun 23, 2022
92d68ba
updating
emma-nechamkin Jul 13, 2022
f8a6567
Do not drop Guam and USVI from ETL (#1681)
switzersc-usds Jul 7, 2022
002cddf
Emma nechamkin/holc patch (#1742)
emma-nechamkin Jul 15, 2022
e98282d
updating ejscreen data, try two (#1747)
emma-nechamkin Jul 18, 2022
29419dd
Rescaling linguistic isolation (#1750)
emma-nechamkin Aug 2, 2022
daf188c
adds UST indicator (#1786)
emma-nechamkin Aug 3, 2022
bbb5bbc
Changing LHE in tiles to a boolean (#1767)
emma-nechamkin Aug 3, 2022
cac1e04
added indoor plumbing to chas
emma-nechamkin Aug 3, 2022
19d3bde
added indoor plumbing to score housing burden
emma-nechamkin Aug 3, 2022
3aa03f1
added indoor plumbing to score housing burden
emma-nechamkin Aug 3, 2022
ed9b717
first run through
emma-nechamkin Aug 3, 2022
9635ef5
Refactor DOE Energy Burden and COI to use YAML (#1796)
mattbowen-usds Aug 10, 2022
d55b7c0
Update etl_score_geo.py
emma-nechamkin Aug 11, 2022
485a9a8
Create deploy_be_staging.yml (#1575)
emma-nechamkin Apr 18, 2022
f047ca9
Imputing income using geographic neighbors (#1559)
emma-nechamkin Apr 27, 2022
1782d02
Adding HOLC indicator (#1579)
emma-nechamkin May 12, 2022
05748c9
Update backend for Puerto Rico (#1686)
switzersc-usds Jun 23, 2022
b41a287
updating
emma-nechamkin Jul 13, 2022
3071815
Do not drop Guam and USVI from ETL (#1681)
switzersc-usds Jul 7, 2022
7559cf4
Emma nechamkin/holc patch (#1742)
emma-nechamkin Jul 15, 2022
2ab24c6
updating ejscreen data, try two (#1747)
emma-nechamkin Jul 18, 2022
f6efdd4
Rescaling linguistic isolation (#1750)
emma-nechamkin Aug 2, 2022
b0a7284
adds UST indicator (#1786)
emma-nechamkin Aug 3, 2022
0d90ae5
Changing LHE in tiles to a boolean (#1767)
emma-nechamkin Aug 3, 2022
8c75190
added indoor plumbing to chas
emma-nechamkin Aug 3, 2022
15450cf
added indoor plumbing to score housing burden
emma-nechamkin Aug 3, 2022
4f6a1b5
added indoor plumbing to score housing burden
emma-nechamkin Aug 3, 2022
baa591a
first run through
emma-nechamkin Aug 3, 2022
97e1754
Refactor DOE Energy Burden and COI to use YAML (#1796)
mattbowen-usds Aug 10, 2022
94cdc47
Update etl_score_geo.py
emma-nechamkin Aug 11, 2022
dcda155
fixing rebase
emma-nechamkin Aug 11, 2022
481a2a0
updated to fix linting errors (#1818)
emma-nechamkin Aug 11, 2022
13e7908
Adding back MapComparison video
vim-usds Aug 16, 2022
d5fbb80
Add FUDS ETL (#1817)
mattbowen-usds Aug 16, 2022
d6c04b1
Disable markdown check for link
vim-usds Aug 16, 2022
9321798
Merge branch 'emma-nechamkin/release/score-narwhal' of https://github…
vim-usds Aug 16, 2022
ebac552
Adding DOT composite to travel score (#1820)
emma-nechamkin Aug 16, 2022
5e378ae
Adding first street foundation data (#1823)
emma-nechamkin Aug 17, 2022
981a36c
first run -- adding NCLD data to the ETL, but not yet to the score
emma-nechamkin Aug 17, 2022
49623e4
Add abandoned mine lands data (#1824)
mattbowen-usds Aug 17, 2022
2e05b1d
Merge branch 'emma-nechamkin/release/score-narwhal' of github.com:usd…
emma-nechamkin Aug 17, 2022
7d89d41
Adding NLCD data (#1826)
emma-nechamkin Aug 17, 2022
88dc2e5
updating to avoid conflicts
emma-nechamkin Aug 17, 2022
6e41e0d
Add donut hole calculation to score (#1828)
mattbowen-usds Aug 18, 2022
cb4866b
Adding eamlis and fuds data to legacy pollution in score (#1832)
emma-nechamkin Aug 18, 2022
3ba1c62
Update to use new FSF files (#1838)
emma-nechamkin Aug 18, 2022
1ee26bf
Quick fix to kitchen or plumbing indicator
emma-nechamkin Aug 18, 2022
d892bce
Fast flag update (#1844)
emma-nechamkin Aug 19, 2022
ad1ce2b
Tiles fix (#1845)
emma-nechamkin Aug 19, 2022
e6385c1
Update etl_score_geo.py
emma-nechamkin Aug 19, 2022
4bf7773
Issue 1827: Add demographics to tiles and download files (#1833)
lucasmbrown-usds Aug 22, 2022
6418335
Updates backend constants to N (#1854)
emma-nechamkin Aug 23, 2022
637b8c3
updated to show T/F/null vs T/F for AML and FUDS (#1866)
emma-nechamkin Aug 25, 2022
d3efcbd
fix markdown
esfoobar-usds Aug 25, 2022
e539db8
tuple type
esfoobar-usds Aug 26, 2022
1c4d3e4
Score tests (#1847)
emma-nechamkin Aug 26, 2022
b0b7ff0
just testing that the boolean is preserved on gha (#1867)
emma-nechamkin Aug 31, 2022
5201f9e
Adding tests to ensure proper calculations (#1871)
emma-nechamkin Aug 31, 2022
ccd72e2
tribal tiles fix (#1874)
esfoobar-usds Sep 1, 2022
9c0e199
Pipeline tile tests (#1864)
emma-nechamkin Sep 1, 2022
d41153d
Add tests to make sure each source makes it to the score correctly (#…
mattbowen-usds Sep 6, 2022
426328e
Updating traffic barriers to include low pop threshold (#1889)
emma-nechamkin Sep 7, 2022
fb4c484
Remove no land tracts from map (#1894)
emma-nechamkin Sep 8, 2022
6e9c44e
Issue 1831: missing life expectancy data from Maine and Wisconsin (#1…
lucasmbrown-usds Sep 10, 2022
60164c8
Removing low pop tracts from FEMA population loss (#1898)
emma-nechamkin Sep 12, 2022
4d02525
1831 Follow up (#1902)
lucasmbrown-usds Sep 15, 2022
876655d
Add tests for all non-census sources (#1899)
mattbowen-usds Sep 19, 2022
aca2261
Issue 1900: Tribal overlap with Census tracts (#1903)
lucasmbrown-usds Sep 20, 2022
f70f30d
Improve score test documentation based on Lucas's feedback (#1835) (#…
mattbowen-usds Sep 23, 2022
d8dd4cf
Cleanup source tests (#1912)
mattbowen-usds Sep 23, 2022
6e0ef33
Add tribal count notebook (#1917) (#1919)
mattbowen-usds Sep 23, 2022
9e85375
Add tribal overlap to downloads (#1907)
mattbowen-usds Sep 23, 2022
9fb9874
Issue 1910: Do not impute income for 0 population tracts (#1918)
lucasmbrown-usds Sep 26, 2022
15d946c
updating click
esfoobar-usds Sep 26, 2022
2f61900
updating click
esfoobar-usds Sep 26, 2022
48d961b
Bump just jupyterlab (#1930)
mattbowen-usds Sep 27, 2022
4da55a9
Fixing link checker (#1929)
lucasmbrown-usds Sep 27, 2022
0f0d6db
Update deps safety says are vulnerable (#1937) (#1938)
mattbowen-usds Sep 28, 2022
8e5ed5b
Add demos for island areas (#1932)
mattbowen-usds Sep 29, 2022
247db4a
Reorder download fields, add plumbing back (#1942)
mattbowen-usds Sep 29, 2022
f4adf17
refactoring tribal (#1960)
lucasmbrown-usds Sep 30, 2022
f284d75
renaming geocorr to geocorr_urban
lucasmbrown-usds Sep 21, 2022
d4d72c8
placeholder etl files
lucasmbrown-usds Sep 21, 2022
7ceab51
wip on ETL
lucasmbrown-usds Sep 21, 2022
a3ad7e0
fixing up validation
lucasmbrown-usds Sep 21, 2022
9f0918d
adding todos
lucasmbrown-usds Sep 22, 2022
ed364fb
updating to directly calculate overlay
lucasmbrown-usds Sep 28, 2022
a6ba9f6
fixing pylint error
lucasmbrown-usds Sep 28, 2022
a7a4df0
wip
lucasmbrown-usds Sep 28, 2022
f080464
renaming
lucasmbrown-usds Sep 28, 2022
bfb08e4
pynb
lucasmbrown-usds Sep 28, 2022
74bf497
updating with tract area
emma-nechamkin Oct 13, 2022
3 changes: 2 additions & 1 deletion .github/workflows/data-checks.yml
@@ -39,6 +39,7 @@ jobs:
run: poetry show -v
- name: Install dependencies
run: poetry install
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
# TODO: investigate why caching layer started failing.
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- name: Run tox
run: poetry run tox
28 changes: 17 additions & 11 deletions .github/workflows/deploy_be_staging.yml
@@ -38,6 +38,12 @@ jobs:
uses: snok/install-poetry@v1
- name: Print Poetry settings
run: poetry show -v
- name: Install GDAL/ogr2ogr
run: |
sudo add-apt-repository ppa:ubuntugis/ppa
sudo apt-get update
sudo apt-get -y install gdal-bin
ogrinfo --version
- name: Install dependencies
run: poetry add s4cmd && poetry install
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
@@ -47,12 +53,21 @@
aws-access-key-id: ${{ secrets.DATA_DEV_AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.DATA_DEV_AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1
- name: Download census geo data for later use
run: |
poetry run python3 data_pipeline/application.py pull-census-data -s aws
- name: Generate Score
run: |
poetry run python3 data_pipeline/application.py score-full-run
- name: Generate Score Post
run: |
poetry run python3 data_pipeline/application.py generate-score-post -s aws
poetry run python3 data_pipeline/application.py generate-score-post
- name: Generate Score Geo
run: |
poetry run python3 data_pipeline/application.py geo-score
- name: Run Smoketests
run: |
poetry run pytest data_pipeline/ -m smoketest
- name: Deploy Score to Geoplatform AWS
run: |
poetry run s4cmd put ./data_pipeline/data/score/csv/ s3://justice40-data/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/csv --recursive --force --API-ACL=public-read
@@ -71,12 +86,6 @@
repo-token: ${{ secrets.GITHUB_TOKEN }}
repo-token-user-login: "github-actions[bot]"
allow-repeats: false
- name: Install GDAL/ogr2ogr
run: |
sudo add-apt-repository ppa:ubuntugis/ppa
sudo apt-get update
sudo apt-get -y install gdal-bin
ogrinfo --version
- name: Set timezone for tippecanoe
uses: szenius/[email protected]
with:
@@ -94,9 +103,6 @@
mkdir -p /usr/local/bin
cp tippecanoe /usr/local/bin/tippecanoe
tippecanoe -v
- name: Generate Score Geo
run: |
poetry run python3 data_pipeline/application.py geo-score
- name: Generate Tiles
run: |
poetry run python3 data_pipeline/application.py generate-map-tiles
@@ -111,7 +117,7 @@
# Deploy to S3 for the staging URL
message: |
** Map Deployed! **
Map with Staging Backend: https://screeningtool.geoplatform.gov/en/?flags=stage_hash=${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}
Map with Staging Backend: https://screeningtool.geoplatform.gov/en?flags=stage_hash=${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}
Find tiles here: https://justice40-data.s3.amazonaws.com/data-pipeline-staging/${{env.PR_NUMBER}}/${{env.SHA_NUMBER}}/data/score/tiles
repo-token: ${{ secrets.GITHUB_TOKEN }}
repo-token-user-login: "github-actions[bot]"
3 changes: 2 additions & 1 deletion data/data-pipeline/Dockerfile
@@ -9,7 +9,8 @@ RUN apt-get update && apt-get install -y \
unzip \
wget \
python3-dev \
python3-pip
python3-pip \
gdal-bin

# tippecanoe
ENV TZ=America/Los_Angeles
56 changes: 46 additions & 10 deletions data/data-pipeline/README.md
@@ -12,11 +12,14 @@
- [2. Extract-Transform-Load (ETL) the data](#2-extract-transform-load-etl-the-data)
- [3. Combined dataset](#3-combined-dataset)
- [4. Tileset](#4-tileset)
- [5. Shapefiles](#5-shapefiles)
- [Score generation and comparison workflow](#score-generation-and-comparison-workflow)
- [Workflow Diagram](#workflow-diagram)
- [Step 0: Set up your environment](#step-0-set-up-your-environment)
- [Step 1: Run the script to download census data or download from the Justice40 S3 URL](#step-1-run-the-script-to-download-census-data-or-download-from-the-justice40-s3-url)
- [Step 2: Run the ETL script for each data source](#step-2-run-the-etl-script-for-each-data-source)
- [Table of commands](#table-of-commands)
- [ETL steps](#etl-steps)
- [Step 3: Calculate the Justice40 score experiments](#step-3-calculate-the-justice40-score-experiments)
- [Step 4: Compare the Justice40 score experiments to other indices](#step-4-compare-the-justice40-score-experiments-to-other-indices)
- [Data Sources](#data-sources)
@@ -26,21 +29,27 @@
- [MacOS](#macos)
- [Windows Users](#windows-users)
- [Setting up Poetry](#setting-up-poetry)
- [Downloading Census Block Groups GeoJSON and Generating CBG CSVs](#downloading-census-block-groups-geojson-and-generating-cbg-csvs)
- [Running tox](#running-tox)
- [The Application entrypoint](#the-application-entrypoint)
- [Downloading Census Block Groups GeoJSON and Generating CBG CSVs (not normally required)](#downloading-census-block-groups-geojson-and-generating-cbg-csvs-not-normally-required)
- [Run all ETL, score and map generation processes](#run-all-etl-score-and-map-generation-processes)
- [Run both ETL and score generation processes](#run-both-etl-and-score-generation-processes)
- [Run all ETL processes](#run-all-etl-processes)
- [Generating Map Tiles](#generating-map-tiles)
- [Serve the map locally](#serve-the-map-locally)
- [Running Jupyter notebooks](#running-jupyter-notebooks)
- [Activating variable-enabled Markdown for Jupyter notebooks](#activating-variable-enabled-markdown-for-jupyter-notebooks)
- [Miscellaneous](#miscellaneous)
- [Testing](#testing)
- [Background](#background)
- [Configuration / Fixtures](#configuration--fixtures)
- [Score and post-processing tests](#score-and-post-processing-tests)
- [Updating Pickles](#updating-pickles)
- [Future Enchancements](#future-enchancements)
- [ETL Unit Tests](#etl-unit-tests)
- [Future Enhancements](#future-enhancements)
- [Fixtures used in ETL "snapshot tests"](#fixtures-used-in-etl-snapshot-tests)
- [Other ETL Unit Tests](#other-etl-unit-tests)
- [Extract Tests](#extract-tests)
- [Transform Tests](#transform-tests)
- [Load Tests](#load-tests)
- [Smoketests](#smoketests)

<!-- /TOC -->

@@ -196,7 +205,7 @@ Here's a list of commands:

## Local development

You can run the Python code locally without Docker to develop, using Poetry. However, to generate the census data you will need the [GDAL library](https://github.com/OSGeo/gdal) installed locally. Also to generate tiles for a local map, you will need [Mapbox tippecanoe](https://github.com/mapbox/tippecanoe). Please refer to the repos for specific instructions for your OS.
You can run the Python code locally without Docker to develop, using Poetry. However, to generate the census data you will need the [GDAL library](https://github.com/OSGeo/gdal) installed locally. For score generation, you will need [libspatialindex](https://libspatialindex.org/en/latest/). And to generate tiles for a local map, you will need [Mapbox tippecanoe](https://github.com/mapbox/tippecanoe). Please refer to the repos for specific instructions for your OS.
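
On Linux, GDAL and libspatialindex are available as system packages; this PR's staging workflow installs GDAL on Ubuntu as shown below (a sketch — `libspatialindex-dev` is the usual Debian-family package name, and tippecanoe still needs to be built from source per its README):

```sh
sudo add-apt-repository ppa:ubuntugis/ppa
sudo apt-get update
sudo apt-get -y install gdal-bin libspatialindex-dev
```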

### VSCode

@@ -218,6 +227,7 @@ To install the above-named executables:

- gdal: `brew install gdal`
- Tippecanoe: `brew install tippecanoe`
- spatialindex: `brew install spatialindex`

Note: For MacOS Monterey or M1 Macs, [you might need to follow these steps](https://stackoverflow.com/a/70880741) to install Scipy.

@@ -229,10 +239,19 @@ If you want to run tile generation, please install TippeCanoe [following these i

- Start a terminal
- Change to this directory (`/data/data-pipeline/`)
- Make sure you have at least Python 3.7 installed: `python -V` or `python3 -V`
- Make sure you have at least Python 3.8 installed: `python -V` or `python3 -V`
- We use [Poetry](https://python-poetry.org/) for managing dependencies and building the application. Please follow the instructions on their site to download.
- Install Poetry requirements with `poetry install`
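
For reference, the full sequence looks roughly like this (a sketch — the installer URL is Poetry's documented one; check their site for the current instructions):

```sh
python3 -V    # confirm Python 3.8 or newer
curl -sSL https://install.python-poetry.org | python3 -
poetry install    # install the project's dependencies
```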

### Running tox

Our full test and check suite is run using tox. This can be run using commands such
as `poetry run tox`.

Each run can take a while to build the whole environment. If you'd like to save time,
you can reuse the previously built environment by running `poetry run tox -e lint`,
which will drastically speed up the process.

### The Application entrypoint

After installing the poetry dependencies, you can see a list of commands with the following steps:
@@ -303,7 +322,11 @@ see [python-markdown docs](https://github.com/ipython-contrib/jupyter_contrib_nb

### Background

For this project, we make use of [pytest](https://docs.pytest.org/en/latest/) for testing purposes. To run tests, simply run `poetry run pytest` in this directory (i.e., `justice40-tool/data/data-pipeline`).
<!-- markdown-link-check-disable -->
For this project, we make use of [pytest](https://docs.pytest.org/en/latest/) for testing purposes.
<!-- markdown-link-check-enable-->

To run tests, simply run `poetry run pytest` in this directory (i.e., `justice40-tool/data/data-pipeline`).

Test data is configured via [fixtures](https://docs.pytest.org/en/latest/explanation/fixtures.html).
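
For instance, a fixture is a named function whose return value pytest injects into any test that lists it as a parameter (hypothetical names below — the project's real fixtures live in its `conftest.py` files):

```python
import pandas as pd
import pytest


@pytest.fixture
def sample_score_df() -> pd.DataFrame:
    # A tiny stand-in dataframe; the real fixtures load snapshot data instead.
    return pd.DataFrame(
        {"GEOID10_TRACT": ["01001020100"], "Total population": [1000]}
    )


def test_score_has_tract_id(sample_score_df):
    assert "GEOID10_TRACT" in sample_score_df.columns
```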

@@ -350,7 +373,8 @@ We have four pickle files that correspond to expected files:

To update the pickles, let's go one by one:

For the `score_transformed_expected.pkl`, put a breakpoint on [this line](https://github.com/usds/justice40-tool/blob/main/data/data-pipeline/data_pipeline/etl/score/tests/test_score_post.py#L58), before the `pdt.assert_frame_equal` and run:
For the `score_transformed_expected.pkl`, put a breakpoint on [this line](https://github.com/usds/justice40-tool/blob/main/data/data-pipeline/data_pipeline/etl/score/tests/test_score_post.py#L62), before the `pdt.assert_frame_equal` and run:
`pytest data_pipeline/etl/score/tests/test_score_post.py::test_transform_score`

Once on the breakpoint, capture the df to a pickle as follows:
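
The exact command is collapsed in this view. By analogy with the `score_data_expected.pkl` step below, it should look roughly like this (a sketch — the variable name and snapshot path are inferred from the neighboring example, so verify against the test file):

```python
score_transformed_actual.to_pickle(
    data_path / "data_pipeline" / "etl" / "score" / "tests" / "snapshots"
    / "score_transformed_expected.pkl",
    protocol=4,
)
```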
Expand Down Expand Up @@ -378,7 +402,7 @@ score_data_actual.to_pickle(data_path / "data_pipeline" / "etl" / "score" / "tes

Then take out the breakpoint and re-run the test: `pytest data_pipeline/etl/score/tests/test_score_post.py::test_create_score_data`

For the `tile_data_expected.pkl`, put a breakpoint on [this line](https://github.com/usds/justice40-tool/blob/main/data/data-pipeline/data_pipeline/etl/score/tests/test_score_post.py#L86), before the `pdt.assert_frame_equal` and run:
For the `tile_data_expected.pkl`, put a breakpoint on [this line](https://github.com/usds/justice40-tool/blob/main/data/data-pipeline/data_pipeline/etl/score/tests/test_score_post.py#L90), before the `pdt.assert_frame_equal` and run:
`pytest data_pipeline/etl/score/tests/test_score_post.py::test_create_tile_data`

Once on the breakpoint, capture the df to a pickle as follows:
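
The command is collapsed here as well; following the same pattern (inferred names):

```python
tile_data_actual.to_pickle(
    data_path / "data_pipeline" / "etl" / "score" / "tests" / "snapshots"
    / "tile_data_expected.pkl",
    protocol=4,
)
```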
Expand Down Expand Up @@ -418,7 +442,9 @@ In the future, we could adopt any of the below strategies to work around this:

1. We could use [pytest-snapshot](https://pypi.org/project/pytest-snapshot/) to automatically store the output of each test as data changes. This would make it so that you could avoid having to generate a pickle for each method - instead, you would only need to call `generate` once, and only when the dataframe had changed.

<!-- markdown-link-check-disable -->
Additionally, you could use a pandas type schema annotation such as [pandera](https://pandera.readthedocs.io/en/stable/schema_models.html?highlight=inputschema#basic-usage) to annotate input/output schemas for given functions, and your unit tests could use these to validate explicitly. This could be of very high value for annotating expectations.
<!-- markdown-link-check-enable-->
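
For instance, a schema model along these lines (illustrative column names only) lets a function validate its inputs and outputs at call time:

```python
import pandera as pa
from pandera.typing import DataFrame, Series


class ScoreSchema(pa.SchemaModel):
    # Illustrative columns; a real schema would mirror the score output.
    GEOID10_TRACT: Series[str] = pa.Field(nullable=False)
    total_population: Series[float] = pa.Field(ge=0, nullable=True)


@pa.check_types
def transform_score(df: DataFrame[ScoreSchema]) -> DataFrame[ScoreSchema]:
    return df
```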

Alternatively, or in conjunction, you could move toward using a more strictly-typed container format for read/writes such as SQL/SQLite, and use something like [SQLModel](https://github.com/tiangolo/sqlmodel) to handle more explicit type guarantees.

@@ -485,3 +511,13 @@ See above [Fixtures](#configuration--fixtures) section for information about whe
These make use of [tmp_path_factory](https://docs.pytest.org/en/latest/how-to/tmp_path.html) to create a file-system located under `temp_dir`, and validate whether the correct files are written to the correct locations.

Additional future modifications could include the use of Pandera and/or other schema validation tools, and or a more explicit test that the data written to file can be read back in and yield the same dataframe.
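
A minimal sketch of that pattern (the file name and writer are hypothetical):

```python
import pandas as pd


def test_etl_writes_output_csv(tmp_path_factory):
    # tmp_path_factory is a built-in pytest fixture that hands out
    # session-scoped temporary directories.
    out_dir = tmp_path_factory.mktemp("score_output")
    df = pd.DataFrame({"GEOID10_TRACT": ["01001020100"]})
    df.to_csv(out_dir / "usa.csv", index=False)  # stand-in for the ETL's writer
    assert (out_dir / "usa.csv").exists()
    pd.testing.assert_frame_equal(pd.read_csv(out_dir / "usa.csv", dtype=str), df)
```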

### Smoketests

To ensure the score and tiles are generated correctly, there is a suite of "smoke tests" that can be run after the ETL and score data have been run and outputs like the frontend GEOJSON have been created.
These tests are implemented as pytest tests, but are skipped by default. To run them:

1. Generate a full score with `poetry run python3 data_pipeline/application.py score-full-run`
2. Generate the tile data with `poetry run python3 data_pipeline/application.py generate-score-post`
3. Generate the frontend GEOJSON with `poetry run python3 data_pipeline/application.py geo-score`
4. Select the smoke tests for pytest with `poetry run pytest data_pipeline/tests -k smoketest`
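
A smoke test itself is an ordinary pytest test carrying the marker, along these lines (the marker name matches the `-m smoketest` selection used in the staging workflow; the asserted path is hypothetical):

```python
from pathlib import Path

import pytest


@pytest.mark.smoketest
def test_score_csv_was_generated():
    # Runs only when smoke tests are selected, assuming the project's
    # pytest configuration skips this marker by default.
    assert Path("data_pipeline/data/score/csv").exists()
```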
18 changes: 18 additions & 0 deletions data/data-pipeline/data_pipeline/application.py
@@ -10,6 +10,7 @@
score_post,
)
from data_pipeline.etl.sources.census.etl_utils import (
check_census_data_source,
reset_data_directories as census_reset,
zip_census_data,
)
@@ -96,6 +97,23 @@ def census_data_download(zip_compress):
sys.exit()


@cli.command(help="Retrieve census data from source")
@click.option(
"-s",
"--data-source",
default="local",
required=False,
type=str,
help=dataset_cli_help,
)
def pull_census_data(data_source: str):
logger.info("Pulling census data from %s", data_source)
data_path = settings.APP_ROOT / "data" / "census"
check_census_data_source(data_path, data_source)
logger.info("Finished pulling census data")
sys.exit()
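
# Usage, matching the staging workflow step added above (a sketch):
#   poetry run python3 data_pipeline/application.py pull-census-data -s aws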


@cli.command(
help="Run all ETL processes or a specific one",
)
Binary file not shown.
@@ -40,7 +40,7 @@ def validate_new_data(
assert (
checking_df[score_col].nunique() <= 3
), f"Error: there are too many values possible in {score_col}"
assert (True in checking_df[score_col].unique()) & (
assert (True in checking_df[score_col].unique()) | (
False in checking_df[score_col].unique()
), f"Error: {score_col} should be a boolean"

3 changes: 2 additions & 1 deletion data/data-pipeline/data_pipeline/config.py
@@ -12,7 +12,8 @@

# set root dir
settings.APP_ROOT = pathlib.Path(data_pipeline.__file__).resolve().parent

settings.DATA_PATH = settings.APP_ROOT / "data"
settings.REQUESTS_DEFAULT_TIMOUT = 3600
# To set an environment use:
# Linux/OSX: export ENV_FOR_DYNACONF=staging
# Windows: set ENV_FOR_DYNACONF=staging