Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SARC-187 - Provide a prompt for mila-drac manual matching. #61

Merged
merged 10 commits into from
Jul 27, 2023
186 changes: 131 additions & 55 deletions sarc/account_matching/make_matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def perform_matching(
override_matches_mila_to_cc: dict[str, str],
name_distance_delta_threshold=2, # mostly for testing
verbose=False,
prompt=False,
):
"""
This is the function with the core functionality.
Expand All @@ -78,9 +79,16 @@ def perform_matching(
fetch things from the `config` or to the `database`.
All the SARC-related tasks are done outside of this function.

Returns a dict of dicts, indexed by @mila.quebec email addresses,
and containing entries of the form
If `prompt` is True, a command-line prompt will be provided
everywhere manual matching is required.

Returns a couple containing:
- a dict of dicts, indexed by @mila.quebec email addresses,
and containing entries of the form
{"mila_ldap": {...}, "drac_roles": {...}, "drac_members": {...}}
- a dict of the new manual matches made during matching,
mapping a mila email to a DRAC username. As manual matches
rely on the prompt, this dict is empty if `prompt` is False.
"""

# because this function feels entitled to modify the input data
Expand Down Expand Up @@ -164,74 +172,142 @@ def perform_matching(
# We have 206 drac_members accounts with @mila.quebec, out of 610.
# We have 42 drac_roles accounts with @mila.quebec, out of 610.

_matching_names(DLD_data, DD_persons, name_distance_delta_threshold)
# Matching.
new_manual_matches = _matching_names(
DLD_data, DD_persons, name_distance_delta_threshold, prompt
)

# NB: In any case (even with prompt), match overriding is applied.
# This means that even a manually-prompted match may be overridden
# if the related mila username is present in override_matches_mila_to_cc.
# Is this what we want?
_manual_matching(DLD_data, DD_persons, override_matches_mila_to_cc)

if verbose:
_make_matches_status_report(DLD_data, DD_persons)

return DD_persons
return DD_persons, new_manual_matches


def _matching_names(DLD_data, DD_persons, name_distance_delta_threshold, prompt=False):
    """
    Name-matching substep of the `perform_matching` function.

    The first three arguments have the same names as in the body of
    `perform_matching`. Two sources are processed in turn: "drac_members"
    (compared on its "name" field), then "drac_roles" (compared on "nom").
    For each mila display name, the closest names of the source are ranked:
    - if exactly one of them has a distance <= `name_distance_delta_threshold`,
      it is selected automatically;
    - otherwise, if `prompt` is True, the script user is asked to choose
      (manual match);
    - otherwise, no match is made.

    The selected source entry is written into the corresponding entry of
    `DD_persons`, which is therefore mutated in-place.

    Return a dictionary of the manual matches only,
    mapping a mila email to the manually-associated DRAC username.
    """
    manual_matches = {}

    for name_or_nom, cc_source in (("name", "drac_members"), ("nom", "drac_roles")):
        mila_names = [entry["display_name"] for entry in DLD_data["mila_ldap"]]
        cc_names = [entry[name_or_nom] for entry in DLD_data[cc_source]]
        # Up to 10 closest `cc_source` names for each mila display name.
        ranked_candidates = name_distances.find_best_word_matches(
            mila_names, cc_names, nb_best_matches=10
        )

        for mila_display_name, best_matches in ranked_candidates:
            # Candidates are (distance, name) couples.
            close_enough = [
                candidate
                for candidate in best_matches
                if candidate[0] <= name_distance_delta_threshold
            ]

            if len(close_enough) == 1:
                # Unambiguous: a single candidate within the threshold.
                is_manual = False
                cc_match = close_enough[0][1]
            elif prompt:
                # Ambiguous (zero or several candidates): ask the script user.
                is_manual = True
                cc_match = _prompt_manual_match(
                    mila_display_name,
                    cc_source,
                    [candidate[1] for candidate in best_matches],
                )
            else:
                # Ambiguous and no prompt allowed: leave unmatched.
                continue

            if cc_match is None:
                # The script user declined to choose.
                continue

            # Entry of `DD_persons` holding this mila display name.
            person = [
                candidate_person
                for candidate_person in DD_persons.values()
                if candidate_person["mila_ldap"]["display_name"] == mila_display_name
            ][0]
            # Entry of the source whose name is the selected one.
            cc_entry = [
                entry for entry in DLD_data[cc_source] if entry[name_or_nom] == cc_match
            ][0]

            # If this person already had a match for this source,
            # the previous and the new match must carry the same name.
            previous_entry = person.get(cc_source, None)
            if previous_entry is not None:
                assert previous_entry[name_or_nom] == cc_match
            # This is where the output is written: `person` is one of the
            # dicts stored in `DD_persons`.
            person[cc_source] = cc_entry

            if is_manual:
                # Only manual matches are reported back to the caller.
                mila_email = person["mila_ldap"]["mila_email_username"]
                cc_username = cc_entry["username"]
                manual_matches[mila_email] = cc_username

    return manual_matches


def _prompt_manual_match(mila_display_name, cc_source, best_matches):
    """
    Sub-step of `_matching_names`.

    Prompt script user to select a `cc_source` match for `mila_display_name`
    in `best_matches` choices.

    Arguments:
        mila_display_name: name shown in the prompt as the person to match.
        cc_source: label of the source being matched, shown in the prompt.
        best_matches: list of candidate matches, displayed with their index.

    The prompt is repeated until the answer is either empty or a valid index,
    i.e. an integer in `[0, len(best_matches) - 1]`. Negative numbers are
    rejected: they would otherwise silently select a candidate counted from
    the end of the list, which is never what is displayed to the user.

    Return selected match, or None if script user did not make a choice.
    """
    prompt_message = (
        f"\n"
        f"Ambiguous {cc_source}. "
        f"Type a number to choose match for: {mila_display_name} "
        f"(default: matching ignored):\n"
        + "\n".join(f"[{i}] {match}" for i, match in enumerate(best_matches))
        + "\n"
    )

    # Loop as long as we don't get a valid answer.
    while True:
        prompted_answer = input(prompt_message).strip()

        if not prompted_answer:
            # Empty answer: match is ignored.
            cc_match = None
            break

        try:
            index_match = int(prompted_answer)
        except ValueError as exc:
            print("Invalid index:", exc)
            continue  # Re-prompt.

        if not 0 <= index_match < len(best_matches):
            print("Invalid index:", f"{index_match} is out of range")
            continue  # Re-prompt.

        cc_match = best_matches[index_match]
        break

    # Compare to None (not truthiness) so that the message always agrees
    # with the returned value, even for a falsy candidate.
    if cc_match is not None:
        print(mila_display_name, "(matched with)", cc_match)
    else:
        print("(ignored)")

    return cc_match


def _manual_matching(DLD_data, DD_persons, override_matches_mila_to_cc):
Expand Down
51 changes: 14 additions & 37 deletions sarc/account_matching/name_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,45 +30,22 @@ def bow_distance(bow_A, bow_B):
return distance


def find_best_word_matches(L_names_A, L_names_B, nb_best_matches=10):
    """Rank the values of `L_names_B` by closeness to each value of `L_names_A`.

    Two names are compared with `bow_distance`, applied to their
    `bag_of_words_projection`.

    Return a list of couples, one per value of `L_names_A`, with format:
        (value_from_A, best_comparisons)
    Values from A appear in sorted order.

    `best_comparisons` is a sorted list of at most `nb_best_matches` couples
    with format (distance, value_from_B), smallest distances first.
    """
    # NB: values from A are sorted to make the matching pipeline more predictable.
    ordered_names_A = list(L_names_A)
    ordered_names_A.sort()
    projections_A = [
        (name_a, bag_of_words_projection(name_a)) for name_a in ordered_names_A
    ]
    projections_B = [
        (name_b, bag_of_words_projection(name_b)) for name_b in L_names_B
    ]

    LP_results = []
    for name_a, projection_a in projections_A:
        ranked = [
            (bow_distance(projection_a, projection_b), name_b)
            for name_b, projection_b in projections_B
        ]
        # Couples are ordered by distance first, then by name on ties.
        ranked.sort()
        LP_results.append((name_a, ranked[:nb_best_matches]))
    return LP_results
9 changes: 8 additions & 1 deletion sarc/cli/acquire/users.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,18 @@

from dataclasses import dataclass

from simple_parsing import field

import sarc.ldap.acquire


@dataclass
class AcquireUsers:
    prompt: bool = field(
        action="store_true",
        help="Provide a prompt for manual matching if automatic matching fails (default: False)",
    )

    def execute(self) -> int:
        # Forward the command-line flag to the acquisition script;
        # 0 is returned once the script has completed.
        prompt_for_manual_matching = self.prompt
        sarc.ldap.acquire.run(prompt=prompt_for_manual_matching)
        return 0
24 changes: 20 additions & 4 deletions sarc/ldap/acquire.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"""
This script is basically a wrapper around the "read_mila_ldap.py" script.
Instead of taking arguments from the command line, it takes them from
Instead of taking arguments from the command line, it takes them from
the SARC configuration file.

This is possible because the "read_mila_ldap.py" script has a `run` function
that takes the arguments as parameters, so the argparse step comes earlier.

As a result of running this script, the values in the collection
As a result of running this script, the values in the collection
referenced by "cfg.ldap.mongo_collection_name" will be updated.
"""

Expand All @@ -19,7 +19,9 @@
from sarc.config import config


def run():
def run(prompt=False):
"""If prompt is True, script will prompt for manual matching."""

cfg = config()

user_collection = cfg.mongo.database_instance[cfg.ldap.mongo_collection_name]
Expand Down Expand Up @@ -55,14 +57,18 @@ def run():
) as json_file:
make_matches_config = json.load(json_file)

DD_persons_matched = sarc.account_matching.make_matches.perform_matching(
(
DD_persons_matched,
new_manual_matches,
) = sarc.account_matching.make_matches.perform_matching(
DLD_data=DLD_data,
mila_emails_to_ignore=make_matches_config["L_phantom_mila_emails_to_ignore"],
override_matches_mila_to_cc=make_matches_config[
"D_override_matches_mila_to_cc_account_username"
],
name_distance_delta_threshold=0,
verbose=False,
prompt=prompt,
)

# from pprint import pprint
Expand All @@ -85,6 +91,16 @@ def run():
DD_persons_matched,
)

# If new manual matches are available, save them.
if new_manual_matches:
make_matches_config["D_override_matches_mila_to_cc_account_username"].update(
new_manual_matches
)
with open(
cfg.account_matching.make_matches_config, "w", encoding="utf-8"
) as json_file:
json.dump(make_matches_config, json_file, indent=4)


def fill_computed_fields(data: dict):
mila_ldap = data.get("mila_ldap", {}) or {}
Expand Down
Loading