Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add local authority data utility functions #1

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions src/data_common/csv/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import requests
import urllib3

import pandas as pd

import ssl

# WARNING: the next two lines disable TLS certificate verification globally
# for this whole process — every HTTPS request (ssl-based clients and
# urllib3/requests alike) becomes vulnerable to man-in-the-middle attacks,
# and the warnings that would normally flag this are silenced.
# NOTE(review): presumably a workaround for a broken CA bundle somewhere —
# confirm it is still needed; fixing the certificate store is the safer route.
ssl._create_default_https_context = ssl._create_unverified_context
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def get_google_sheet_as_csv(key, outfile, sheet_name=None):
    """
    Download a Google Sheet as CSV and write it to `outfile`.

    :param key: the spreadsheet key from the Google Sheets URL.
    :param outfile: path to write the CSV bytes to.
    :param sheet_name: optional worksheet name; defaults to the first sheet.
    :raises requests.HTTPError: if the download does not return 2xx.
    """
    sheet_url = f"https://docs.google.com/spreadsheets/d/{key}/gviz/tq?tqx=out:csv"
    params = {}
    if sheet_name is not None:
        # let requests URL-encode the sheet name (it may contain spaces etc.)
        params["sheet"] = sheet_name
    r = requests.get(sheet_url, params=params, timeout=60)
    # fail loudly rather than silently writing an HTML error page to disk
    r.raise_for_status()

    # do not shadow the `outfile` parameter with the file object
    with open(outfile, "wb") as f:
        f.write(r.content)


def replace_csv_headers(csv_file, new_headers, drop_empty_columns=True, outfile=None):
    """
    Rewrite the header row of a CSV file.

    :param csv_file: path of the CSV to read.
    :param new_headers: sequence of replacement column names; must match the
        number of (remaining) columns.
    :param drop_empty_columns: drop columns that contain no data before
        renaming.
    :param outfile: path to write to; defaults to overwriting `csv_file`.
    :raises ValueError: if `new_headers` does not match the column count.
    """
    if outfile is None:
        outfile = csv_file

    df = pd.read_csv(csv_file)
    if drop_empty_columns:
        # remove columns that are entirely empty (all values NaN)
        df = df.dropna(axis="columns", how="all")

    if len(new_headers) != len(df.columns):
        raise ValueError(
            f"expected {len(df.columns)} headers, got {len(new_headers)}"
        )
    df.columns = new_headers
    # pass the path so pandas opens and closes the file itself
    # (the previous open(outfile, "w") leaked the file handle)
    df.to_csv(outfile, index=False, header=True)
115 changes: 115 additions & 0 deletions src/data_common/local_authority/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
from os.path import join
import pandas as pd

from ..dataset import get_dataset_df

def fix_council_name(council: str) -> str:
    """
    Normalise a (lower-cased) council name for matching.

    Strips the word "council" and unitary-authority suffixes, then trims
    surrounding whitespace.
    """
    # removals applied in this exact order, mirroring the matching used
    # against the official authority-name lookup
    for fragment in ("council", " - unitary", "(unitary)"):
        council = council.replace(fragment, "")
    return council.strip()

def add_local_authority_code(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the local-authority-code to the dataframe.

    Matches a normalised, lower-cased version of the `council` column
    against the official name-to-registry lookup; unmatched rows get a null
    code. Returns a new dataframe with `local-authority-code` as the first
    column; the input dataframe is not modified.
    """

    name_to_code = get_dataset_df(
        repo="uk_local_authority_names_and_codes",
        package="uk_la_past_current",
        version="1",
        file="lookup_name_to_registry.csv",
    )
    # work on a copy so the caller's dataframe is not mutated by the
    # temporary council_lower join column
    df = df.copy()
    df["council_lower"] = df["council"].str.lower().apply(fix_council_name)
    name_to_code["council_lower"] = (
        name_to_code["la-name"].str.lower().apply(fix_council_name)
    )
    df = df.merge(name_to_code, on="council_lower", how="left")

    # move local-authority-code to the front by name, rather than assuming
    # it is the last column after the merge (column order of the lookup CSV
    # is not guaranteed)
    cols = ["local-authority-code"] + [
        c for c in df.columns if c != "local-authority-code"
    ]
    df = df[cols]
    df = df.drop(columns=["council_lower", "la-name"])
    return df

def add_region_and_county(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add `region` and `county` columns looked up via `local-authority-code`.

    Rows whose code is null, or not present in the current-authorities
    lookup, keep None in both new columns.
    """
    name_to_code = get_dataset_df(
        repo="uk_local_authority_names_and_codes",
        package="uk_la_past_current",
        version="1",
        file="uk_local_authorities_current.csv",
    )

    rows = len(df["council"])
    df["region"] = pd.Series([None] * rows, index=df.index)
    df["county"] = pd.Series([None] * rows, index=df.index)

    for index, row in df.iterrows():
        authority_code = row["local-authority-code"]
        if pd.isnull(authority_code):
            continue
        authority_match = name_to_code[
            name_to_code["local-authority-code"] == authority_code
        ]
        # guard: a non-null code missing from the lookup previously raised
        # IndexError on .values[0]; leave the row as None instead
        if authority_match.empty:
            continue
        df.at[index, "region"] = authority_match["region"].values[0]
        df.at[index, "county"] = authority_match["county-la"].values[0]

    return df


def add_gss_codes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a `gss_code` column looked up via `local-authority-code`.

    Rows whose code is null, or not present in the current-authorities
    lookup, keep None.
    """
    name_to_code = get_dataset_df(
        repo="uk_local_authority_names_and_codes",
        package="uk_la_past_current",
        version="1",
        file="uk_local_authorities_current.csv",
    )

    rows = len(df["council"])
    df["gss_code"] = pd.Series([None] * rows, index=df.index)

    for index, row in df.iterrows():
        authority_code = row["local-authority-code"]
        if pd.isnull(authority_code):
            continue
        authority_match = name_to_code[
            name_to_code["local-authority-code"] == authority_code
        ]
        # guard: a non-null code missing from the lookup previously raised
        # IndexError on .values[0]; leave the row as None instead
        if authority_match.empty:
            continue
        df.at[index, "gss_code"] = authority_match["gss-code"].values[0]

    return df


def add_extra_authority_info(df: pd.DataFrame) -> pd.DataFrame:
    """
    Merge extra authority columns (type, wdtk-id, mapit code, nation,
    gss-code) onto the dataframe via `local-authority-code`.

    Returns a new dataframe built from the full current-authorities lookup,
    left-joined with `df`, so it contains one row per authority in the
    lookup. Non-English authorities have their type forced to "UA".
    """
    lookup = get_dataset_df(
        repo="uk_local_authority_names_and_codes",
        package="uk_la_past_current",
        version="1",
        file="uk_local_authorities_current.csv",
    )

    wanted_columns = [
        "local-authority-code",
        "local-authority-type",
        "wdtk-id",
        "mapit-area-code",
        "nation",
        "gss-code",
    ]
    info = lookup[wanted_columns]

    # df may already carry stale copies of these columns from an earlier
    # load; drop them (ignoring any that are absent) so the merge does not
    # produce duplicate-suffixed columns
    stale = [column for column in info.columns if column != "local-authority-code"]
    df = df.drop(columns=stale, errors="ignore")

    # join the caller's data onto the lookup, keyed on the authority code
    info = info.merge(df, on="local-authority-code", how="left")

    # authority types outside England are unified to "UA"
    outside_england = info["nation"].isin(["Wales", "Scotland", "Northern Ireland"])
    info.loc[outside_england, "local-authority-type"] = "UA"

    return info