Merge pull request #61 from dsi-clinic/arizona_corrections

updated docstrings and added transactions splitter
uchicago-dsi · Dec 7, 2023 · f806264 · f806264
2 parents 1e1516f + dd44abe
commit f806264
Show file tree

Hide file tree

Showing 2 changed files with 80 additions and 31 deletions.
diff --git a/utils/arizona.py b/utils/arizona.py
@@ -1,5 +1,3 @@
-# see also the README in this branch's utils folder
-
 import pandas as pd
 
 from utils.clean import StateCleaner
@@ -11,6 +9,7 @@
     az_transactions_convert,
     az_transactor_sorter,
     convert_date,
+    transactions_splitter,
 )
 from utils.constants import (
     AZ_INDIVIDUALS_FILEPATH,
@@ -62,7 +61,7 @@ def clean_state(self) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
         pipeline which takes in filenames and outputs cleaned,
         standardized, and schema-compliant tables
 
-        args: list of two filepaths which lead to dataframes
+        args: list of three filepaths which lead to dataframes
 
         returns: three schema-compliant tables for
         transactions, individuals, and organizations
@@ -97,11 +96,10 @@ def create_tables(
         Creates the Individuals, Organizations, and Transactions tables from
         the dataframe list outputted from standardize
 
-        Inputs:
-            data: a list of 1 or 3 dataframes as outputted from standardize method.
+        Inputs: data: list of 2 dataframes (transactions, details) from the standardize method.
 
-        Returns: (individuals_table, organizations_table, transactions_table)
-                    tuple containing the tables as defined in database schema
+        Returns: a nested tuple of dataframes, ordered as such: (individuals_table,
+        organizations_table, (transactions: ind->ind, ind->org, org->ind, org->org))
         """
 
         transactions, details = data
@@ -132,7 +130,12 @@ def create_tables(
         else:
             az_organizations = None
 
-        return (az_individuals, az_organizations, az_transactions)
+        # will result in small or empty dataframes if a subset of the data is used
+        transactions_tuple = transactions_splitter(
+            az_individuals, az_organizations, az_transactions
+        )
+
+        return (az_individuals, az_organizations, transactions_tuple)
 
     def standardize(self, details_df_list: list[pd.DataFrame]) -> list[pd.DataFrame]:
         """standardize names of entities
@@ -165,18 +168,14 @@ def standardize(self, details_df_list: list[pd.DataFrame]) -> list[pd.DataFrame]
     def clean(self, data: list[pd.DataFrame]) -> pd.DataFrame:
         """clean the contents of the columns
 
-        INCOMPLETE
-
         transactions and details dataframes undergo cleaning of
         transaction dates, names are imputed to the right column,
         and employer information is retrieved.
 
-        args: transactions and details dataframes
+        args: list of transactions and details dataframes
 
         returns: cleaned transactions and details dataframes
 
-        NOTE: TO DO: coerce correct dtypes and make text lowercase
-
         """
 
         transactions, details = data

diff --git a/utils/cleaner_utils.py b/utils/cleaner_utils.py
@@ -63,14 +63,14 @@ def az_transactions_convert(df: pd.DataFrame) -> pd.DataFrame:
     """
 
     d = {
-        "transaction_id": df["PublicTransactionId"].astype(int),
-        "donor_id": df["base_transactor_id"].astype(int),
+        "transaction_id": df["PublicTransactionId"].astype(int).astype(str),
+        "donor_id": df["base_transactor_id"].astype(int).astype(str),
         "year": df["TransactionDateYear"].astype(int),
-        "amount": df["Amount"].abs(),
-        "recipient_id": df["other_transactor_id"].astype(int),
-        "office_sought": df["office_sought"],
-        "purpose": df["Memo"],
-        "transaction_type": df["TransactionType"],
+        "amount": df["Amount"].abs().astype(float),
+        "recipient_id": df["other_transactor_id"].astype(int).astype(int),
+        "office_sought": df["office_sought"].astype(str).str.lower(),
+        "purpose": df["Memo"].astype(str).str.lower(),
+        "transaction_type": df["TransactionType"].astype(str).str.lower(),
         "TransactionTypeDispositionId": df["TransactionTypeDispositionId"],
     }
 
@@ -129,14 +129,14 @@ def az_individuals_convert(details_df: pd.DataFrame) -> pd.DataFrame:
             states_list.append(None)
 
     d = {
-        "id": details_df["retrieved_id"].astype(int),
+        "id": details_df["retrieved_id"].astype(int).astype(str),
         "first_name": None,
         "last_name": None,
-        "full_name": details_df["full_name"],
-        "entity_type": entity_type,
+        "full_name": details_df["full_name"].astype(str).str.lower(),
+        "entity_type": entity_type.astype(str).str.lower(),
         "state": states_list,
-        "party": details_df["party_name"],
-        "company": employer,
+        "party": details_df["party_name"].astype(str).str.lower(),
+        "company": employer.astype(str).str.lower(),
     }
 
     return pd.DataFrame(data=d)
@@ -175,10 +175,10 @@ def az_organizations_convert(df: pd.DataFrame) -> pd.DataFrame:
             states_list.append(None)
 
     d = {
-        "id": df["retrieved_id"].astype(int),
-        "name": df["committee_name"],
+        "id": df["retrieved_id"].astype(int).astype(str),
+        "name": df["committee_name"].astype(str).str.lower(),
         "state": states_list,
-        "entity_type": entity_type,
+        "entity_type": entity_type.astype(str).str.lower(),
     }
 
     return pd.DataFrame(data=d)
@@ -333,6 +333,56 @@ def az_id_table(
     id_map_df["database_id"] = [uuid.uuid4() for _ in range(len(id_map_df.index))]
 
     return id_map_df
-    # new_uuids = []
-    # for i in len(all_ids):
-    #     new_uuids.append(uid.uuid4())
+
+
+def transactions_splitter(
+    individuals, organizations, transactions, *args: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """Split transactions into four groups by donor and recipient type
+
+    We split the transactions dataframe into four groups depending on donor
+    and recipient. If the donor and recipient are both individuals, the
+    transaction is classified as individual->individual. If the donor
+    is an individual and the recipient an organization, the transaction
+    is classified as individual->organization, and so on. Ids are compared
+    as strings, so integer-typed id columns still match.
+
+    NOTE: If running on a subset of the data, such as in the demo,
+    the resulting transactions dataframes will be very small
+    or entirely empty, as many of the ids will not be present and
+    cannot be classified. If this is undesirable, do not employ
+    this function and instead return the raw transactions dataframe.
+
+    args: the individuals, organizations, and transactions dataframes
+    created by ArizonaCleaner.create_tables(). individuals or organizations
+    may be None, which is treated as an empty table. *args is unused.
+
+    returns: a tuple of four transactions dataframes split according
+    to the type of donor and recipient, ordered as follows:
+    individual->individual, individual->organization,
+    organization->individual, organization->organization
+    """
+
+    def _id_set(entities) -> set:
+        # create_tables() may pass None for a table it did not build
+        return set() if entities is None else set(entities["id"].astype(str))
+
+    inds_ids = _id_set(individuals)
+    org_ids = _id_set(organizations)
+
+    # cast both sides to str: isin() never matches an int against a str
+    donor_ids = transactions["donor_id"].astype(str)
+    recipient_ids = transactions["recipient_id"].astype(str)
+
+    # one boolean mask per (role, entity kind), computed once and reused
+    donor_is_ind = donor_ids.isin(inds_ids)
+    donor_is_org = donor_ids.isin(org_ids)
+    recipient_is_ind = recipient_ids.isin(inds_ids)
+    recipient_is_org = recipient_ids.isin(org_ids)
+
+    ind_ind = transactions[donor_is_ind & recipient_is_ind]
+    ind_org = transactions[donor_is_ind & recipient_is_org]
+    org_ind = transactions[donor_is_org & recipient_is_ind]
+    org_org = transactions[donor_is_org & recipient_is_org]
+
+    return (ind_ind, ind_org, org_ind, org_org)