uchicago-dsi · alankagiri · Dec 5, 2023 · Dec 5, 2023 · Dec 5, 2023 · Dec 5, 2023
diff --git a/notebooks/PA_EDA.ipynb b/notebooks/PA_EDA.ipynb
diff --git a/requirements.txt b/requirements.txt
@@ -11,3 +11,4 @@ pandas~=2.0.3
 plotly~=5.18.0
 bs4~=0.0.1
 nbformat~=5.9.2
+spacy~=3.7.2
diff --git a/utils/PA_Data_Web_Scraper.py b/utils/PA_Data_Web_Scraper.py
@@ -29,7 +29,7 @@ def download_PA_data(start_year: int, end_year: int):
             zippedfile.filename = zippedfile.filename.replace(
                 ".txt", "_" + str(year) + ".txt"
             )
-            zippedfiles.extract(zippedfile, "../data/Raw/PA")
+            zippedfiles.extract(zippedfile, "../data/raw/PA")
 
 
 def main():

diff --git a/utils/PA_EDA_Functions.py b/utils/PA_EDA_Functions.py
@@ -1,9 +1,6 @@
-# import sys
-
 import pandas as pd
 import plotly.express as px
 
-# sys.path.append("/home/alankagiri/2023-fall-clinic-climate-cabinet")
 from utils import constants as const
 
 
@@ -93,7 +90,7 @@ def pre_process_contributor_dataset(df: pd.DataFrame):
             "E_ZIPCODE",
             "SECTION",
             "CYCLE",
-            "CONT_DESCRIP",
+            "PURPOSE",
             "CONT_DATE_1",
             "CONT_AMT_1",
             "CONT_DATE_2",
@@ -328,17 +325,21 @@ def plot_recipients_by_office(merged_dataset: pd.DataFrame) -> object:
     Return:
         A table object"""
 
-    recep_per_office = merged_dataset.replace({"OFFICE": const.PA_OFFICE_ABBREV_DICT})
+    recep_per_office = merged_dataset.replace(
+        {"RECIPIENT_OFFICE": const.PA_OFFICE_ABBREV_DICT}
+    )
 
     recep_per_office = (
-        recep_per_office.groupby(["OFFICE"]).agg({"TOTAL_CONT_AMT": sum}).reset_index()
+        recep_per_office.groupby(["RECIPIENT_OFFICE"])
+        .agg({"TOTAL_CONT_AMT": sum})
+        .reset_index()
     )
 
     fig = px.bar(
         data_frame=recep_per_office,
-        x="OFFICE",
+        x="RECIPIENT_OFFICE",
         y="TOTAL_CONT_AMT",
-        title="PA Contributions Received by Office-Type From 2018-2023",
+        title="Pennsylvania Contributions Received by Office-Type From 2018-2023",
         labels={"TOTAL_CONT_AMT": "Total Contribution Amount"},
     )
     fig.show()
@@ -369,7 +370,7 @@ def compare_cont_by_donorType(merged_dataset: pd.DataFrame) -> object:
         x="YEAR",
         y="TOTAL_CONT_AMT",
         color="RECIPIENT_TYPE",
-        title="PA Recipients of Annual Contributions (2018 - 2023)",
+        title="Pennsylvania Recipients of Annual Contributions (2018 - 2023)",
         labels={
             "TOTAL_CONT_AMT": "Total Contribution Amount",
             "RECIPIENT_TYPE": "Type of Filer",

diff --git a/utils/clean.py b/utils/clean.py
@@ -47,7 +47,7 @@ def clean(self, data: list[pd.DataFrame]) -> list[pd.DataFrame]:
         not representing minimal viable transactions
 
         Inputs:
-                data: a list of 1 or 3 dataframes as outputted from preprocess method.
+            data: a list of 1 or 3 dataframes as output by the preprocess method.
 
         Returns: a list of dataframes. If state data is all in one format
             (i.e. there are not separate individual and transaction tables),

diff --git a/utils/constants.py b/utils/constants.py
@@ -79,7 +79,7 @@
     "CONT_AMT_2",
     "CONT_DATE_3",
     "CONT_AMT_3",
-    "CONT_DESCRIP",
+    "PURPOSE",
 ]
 
 PA_CONT_COLS_NAMES_POST2022: list = [
@@ -108,7 +108,7 @@
     "CONT_AMT_2",
     "CONT_DATE_3",
     "CONT_AMT_3",
-    "CONT_DESCRIP",
+    "PURPOSE",
 ]
 
 PA_FILER_COLS_NAMES_PRE2022: list = [
@@ -193,7 +193,7 @@
 
 PA_OFFICE_ABBREV_DICT: dict = {
     "GOV": "Governor",
-    "LTG": "Liutenant Gov",
+    "LTG": "Lieutenant Gov",
     "ATT": "Attorney General",
     "AUD": "Auditor General",
     "TRE": "State Treasurer",