Skip to content

Commit

Permalink
Merge pull request #61 from dsi-clinic/arizona_corrections
Browse files Browse the repository at this point in the history
updated docstrings and added transactions splitter
  • Loading branch information
trevorspreadbury authored Dec 7, 2023
2 parents 1e1516f + dd44abe commit f806264
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 31 deletions.
25 changes: 12 additions & 13 deletions utils/arizona.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# see also the README in this branch's utils folder

import pandas as pd

from utils.clean import StateCleaner
Expand All @@ -11,6 +9,7 @@
az_transactions_convert,
az_transactor_sorter,
convert_date,
transactions_splitter,
)
from utils.constants import (
AZ_INDIVIDUALS_FILEPATH,
Expand Down Expand Up @@ -62,7 +61,7 @@ def clean_state(self) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
pipeline which takes in filenames and outputs cleaned,
standardized, and schema-compliant tables
args: list of two filepaths which lead to dataframes
args: list of three filepaths which lead to dataframes
returns: three schema-compliant tables for
transactions, individuals, and organizations
Expand Down Expand Up @@ -97,11 +96,10 @@ def create_tables(
Creates the Individuals, Organizations, and Transactions tables from
the dataframe list outputted from standardize
Inputs:
data: a list of 1 or 3 dataframes as outputted from standardize method.
Inputs: data: a list of 3 dataframes as outputted from the standardize method.
Returns: (individuals_table, organizations_table, transactions_table)
tuple containing the tables as defined in database schema
Returns: a nested tuple of dataframes, ordered as such: (individuals_table,
organizations_table, (transactions: ind->ind, ind->org, org->ind, org->org))
"""

transactions, details = data
Expand Down Expand Up @@ -132,7 +130,12 @@ def create_tables(
else:
az_organizations = None

return (az_individuals, az_organizations, az_transactions)
# will result in small or empty dataframes if a subset of the data is used
transactions_tuple = transactions_splitter(
az_individuals, az_organizations, az_transactions
)

return (az_individuals, az_organizations, transactions_tuple)

def standardize(self, details_df_list: list[pd.DataFrame]) -> list[pd.DataFrame]:
"""standardize names of entities
Expand Down Expand Up @@ -165,18 +168,14 @@ def standardize(self, details_df_list: list[pd.DataFrame]) -> list[pd.DataFrame]
def clean(self, data: list[pd.DataFrame]) -> pd.DataFrame:
"""clean the contents of the columns
INCOMPLETE
transactions and details dataframes undergo cleaning of
transaction dates, names are imputed to the right column,
and employer information is retrieved,
args: transactions and details dataframes
args: list of transactions and details dataframes
returns: cleaned transactions and details dataframes
NOTE: TO DO: coerce correct dtypes and make text lowercase
"""

transactions, details = data
Expand Down
86 changes: 68 additions & 18 deletions utils/cleaner_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,14 @@ def az_transactions_convert(df: pd.DataFrame) -> pd.DataFrame:
"""

d = {
"transaction_id": df["PublicTransactionId"].astype(int),
"donor_id": df["base_transactor_id"].astype(int),
"transaction_id": df["PublicTransactionId"].astype(int).astype(str),
"donor_id": df["base_transactor_id"].astype(int).astype(str),
"year": df["TransactionDateYear"].astype(int),
"amount": df["Amount"].abs(),
"recipient_id": df["other_transactor_id"].astype(int),
"office_sought": df["office_sought"],
"purpose": df["Memo"],
"transaction_type": df["TransactionType"],
"amount": df["Amount"].abs().astype(float),
"recipient_id": df["other_transactor_id"].astype(int).astype(int),
"office_sought": df["office_sought"].astype(str).str.lower(),
"purpose": df["Memo"].astype(str).str.lower(),
"transaction_type": df["TransactionType"].astype(str).str.lower(),
"TransactionTypeDispositionId": df["TransactionTypeDispositionId"],
}

Expand Down Expand Up @@ -129,14 +129,14 @@ def az_individuals_convert(details_df: pd.DataFrame) -> pd.DataFrame:
states_list.append(None)

d = {
"id": details_df["retrieved_id"].astype(int),
"id": details_df["retrieved_id"].astype(int).astype(str),
"first_name": None,
"last_name": None,
"full_name": details_df["full_name"],
"entity_type": entity_type,
"full_name": details_df["full_name"].astype(str).str.lower(),
"entity_type": entity_type.astype(str).str.lower(),
"state": states_list,
"party": details_df["party_name"],
"company": employer,
"party": details_df["party_name"].astype(str).str.lower(),
"company": employer.astype(str).str.lower(),
}

return pd.DataFrame(data=d)
Expand Down Expand Up @@ -175,10 +175,10 @@ def az_organizations_convert(df: pd.DataFrame) -> pd.DataFrame:
states_list.append(None)

d = {
"id": df["retrieved_id"].astype(int),
"name": df["committee_name"],
"id": df["retrieved_id"].astype(int).astype(str),
"name": df["committee_name"].astype(str).str.lower(),
"state": states_list,
"entity_type": entity_type,
"entity_type": entity_type.astype(str).str.lower(),
}

return pd.DataFrame(data=d)
Expand Down Expand Up @@ -333,6 +333,56 @@ def az_id_table(
id_map_df["database_id"] = [uuid.uuid4() for _ in range(len(id_map_df.index))]

return id_map_df
# new_uuids = []
# for i in len(all_ids):
# new_uuids.append(uid.uuid4())


def transactions_splitter(
    individuals, organizations, transactions, *args: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split transactions into four groups by donor and recipient type.

    Each transaction is classified by whether its donor and its recipient
    appear in the individuals table or in the organizations table. A
    transaction whose donor and recipient are both individuals lands in
    individual->individual, an individual donor with an organization
    recipient lands in individual->organization, and so on.

    Ids are compared as strings on both sides, so a transaction still
    matches when its ``donor_id`` / ``recipient_id`` column is stored with a
    different dtype (for example int) than the ``id`` column of the entity
    tables. The returned dataframes are row subsets of ``transactions``;
    column dtypes are left untouched.

    NOTE: If running on a subset of the data, such as in the demo, the
    resulting dataframes will be very small or entirely empty, because many
    ids will be absent from the entity tables and cannot be classified.
    Transactions whose donor or recipient is in neither table are dropped
    from all four groups. If this is undesirable, do not employ this
    function and instead use the raw transactions dataframe.

    Args:
        individuals: dataframe with an ``id`` column, or None when no
            individuals table was produced (treated as having no ids).
        organizations: dataframe with an ``id`` column, or None when no
            organizations table was produced (treated as having no ids).
        transactions: dataframe with ``donor_id`` and ``recipient_id``
            columns.
        *args: accepted for backward compatibility and ignored.

    Returns:
        A tuple of four transactions dataframes, ordered as follows:
        individual->individual, individual->organization,
        organization->individual, organization->organization.
    """

    def _id_set(table) -> set:
        # A missing table contributes no ids rather than raising TypeError.
        if table is None:
            return set()
        return set(table["id"].astype(str))

    ind_ids = _id_set(individuals)
    org_ids = _id_set(organizations)

    # Normalise both transaction id columns to str so they are comparable
    # with the string id sets regardless of how they were stored.
    donor_ids = transactions["donor_id"].astype(str)
    recipient_ids = transactions["recipient_id"].astype(str)

    # Compute each membership mask once and reuse it across the four groups.
    donor_is_ind = donor_ids.isin(ind_ids)
    donor_is_org = donor_ids.isin(org_ids)
    recipient_is_ind = recipient_ids.isin(ind_ids)
    recipient_is_org = recipient_ids.isin(org_ids)

    ind_ind = transactions[donor_is_ind & recipient_is_ind]
    ind_org = transactions[donor_is_ind & recipient_is_org]
    org_ind = transactions[donor_is_org & recipient_is_ind]
    org_org = transactions[donor_is_org & recipient_is_org]

    return (ind_ind, ind_org, org_ind, org_org)

0 comments on commit f806264

Please sign in to comment.