Review note: line structure and indentation of this patch were reconstructed
from a whitespace-collapsed copy; confirm context-line indentation against the
target tree before applying. The index hashes below are carried over from the
original patch and do not reflect the revised post-image content.

diff --git a/followthemoney/proxy.py b/followthemoney/proxy.py
index 3c94216a7..4172ecfe7 100644
--- a/followthemoney/proxy.py
+++ b/followthemoney/proxy.py
@@ -86,7 +86,13 @@ def __init__(
                 if key not in self.schema.properties:
                     continue
                 if cleaned:
-                    self.add(key, values, cleaned=cleaned)
+                    # This does not call `self.add` as it might be called millions of times
+                    # in some context and we want to avoid the performance overhead of doing so.
+                    # `dict.fromkeys` drops duplicates while keeping first-seen order
+                    # (dicts are insertion-ordered from Python 3.7 on).
+                    unique_values = list(dict.fromkeys(values))
+                    self._properties[key] = unique_values
+                    self._size += sum(map(len, unique_values))
                 else:
                     self.add(key, values, quiet=True)
 
diff --git a/tests/test_proxy.py b/tests/test_proxy.py
index 3d82f0eda..febb8f2d3 100644
--- a/tests/test_proxy.py
+++ b/tests/test_proxy.py
@@ -315,3 +315,15 @@ def test_value_deduplication(self):
         assert proxy.get("name") == ["ACME, Inc."]
         proxy.add("name", "ACME, Inc.")
         assert proxy.get("name") == ["ACME, Inc."]
+
+    def test_value_deduplication_cleaned(self):
+        # Duplicates in pre-cleaned input must still collapse to a single value.
+        proxy = EntityProxy.from_dict(model, {
+            "id": "acme-inc",
+            "schema": "Company",
+            "properties": {
+                "name": ["ACME, Inc.", "ACME, Inc."],
+            },
+        }, cleaned=True)
+
+        assert proxy.get("name") == ["ACME, Inc."]