Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addressed the situation where assign_default_confidence() returns a dataframe in which all confidence values are NaN #548

Merged
merged 3 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 22 additions & 23 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,33 +444,32 @@ def filter_redundant_rows(df: pd.DataFrame, ignore_predicate: bool = False) -> p
else:
key = [SUBJECT_ID, OBJECT_ID, PREDICATE_ID]
dfmax: pd.DataFrame
dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates()
max_conf: Dict[Tuple[str, ...], float] = {}
for _, row in dfmax.iterrows():
if not df.empty:
matentzn marked this conversation as resolved.
Show resolved Hide resolved
dfmax = df.groupby(key, as_index=False)[CONFIDENCE].apply(max).drop_duplicates()
max_conf: Dict[Tuple[str, ...], float] = {}
for _, row in dfmax.iterrows():
if ignore_predicate:
max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE]
else:
max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE]
if ignore_predicate:
max_conf[(row[SUBJECT_ID], row[OBJECT_ID])] = row[CONFIDENCE]
df = df[
df.apply(
lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])],
axis=1,
)
]
else:
max_conf[(row[SUBJECT_ID], row[OBJECT_ID], row[PREDICATE_ID])] = row[CONFIDENCE]
if ignore_predicate:
df = df[
df.apply(
lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID])],
axis=1,
)
]
else:
df = df[
df.apply(
lambda x: x[CONFIDENCE] >= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])],
axis=1,
)
]
df = df[
df.apply(
lambda x: x[CONFIDENCE]
>= max_conf[(x[SUBJECT_ID], x[OBJECT_ID], x[PREDICATE_ID])],
axis=1,
)
]
# Rows whose confidence is NaN are preserved as-is, without making assumptions about them.
# This means the result may still contain duplicate mappings.
# FutureWarning: The frame.append method is deprecated and
# will be removed from pandas in a future version.
# Use pandas.concat instead.
# return_df = df.append(nan_df).drop_duplicates()

confidence_reconciled_df = pd.concat([df, nan_df]).drop_duplicates()
matentzn marked this conversation as resolved.
Show resolved Hide resolved

# Reconciling dataframe rows based on the predicates with equal confidence.
Expand Down
7 changes: 7 additions & 0 deletions tests/test_reconcile.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ def test_filter(self):
df2 = filter_redundant_rows(self.msdf2.df)
self.assertEqual(18, len(df2.index))

# Overwrite the confidence column of msdf1's dataframe so that every value is NaN
import numpy as np

self.msdf1.df["confidence"] = np.NAN
df3 = filter_redundant_rows(self.msdf1.df)
self.assertEqual(11, len(df3.index))

def test_deal_with_negation(self):
"""Test handling negating returns the right number of rows."""
df1 = deal_with_negation(self.msdf1.df)
Expand Down
Loading