diff --git a/contrib/benchmarks/bm_create_proxy_cleaned.py b/contrib/benchmarks/bm_create_proxy_cleaned.py new file mode 100644 index 000000000..4d4623449 --- /dev/null +++ b/contrib/benchmarks/bm_create_proxy_cleaned.py @@ -0,0 +1,40 @@ +import os +from platform import python_version +from timeit import timeit + +from followthemoney import model +from followthemoney.proxy import EntityProxy + +ITERATIONS = 1_000_000 + +ENTITY = { + "id": "test", + "schema": "Person", + "properties": { + "name": ["Ralph Tester", "Ralph Tester", "Ralf Tester"], + "birthDate": ["1972-05-01"], + "idNumber": ["9177171", "8e839023"], + "website": ["https://ralphtester.me"], + "phone": ["+12025557612", "+12025557612", "+12025557612"], + "email": ["info@ralphtester.me", "info@ralphtester.org", "info@ralphtester.com"], + "topics": ["role.spy"], + }, +} + + +def create_proxy(data): + proxy = EntityProxy.from_dict(model, data, cleaned=True) + data = proxy.to_dict() + return data + + +def benchmark(): + for _ in range(ITERATIONS): + create_proxy(ENTITY) + + +if __name__ == "__main__": + commit = os.environ.get("GIT_COMMIT", "None") + version = python_version() + time = timeit(benchmark, number=10) + print(f"{commit},{version},{time}") diff --git a/contrib/benchmarks/run.sh b/contrib/benchmarks/run.sh new file mode 100755 index 000000000..bf2587a78 --- /dev/null +++ b/contrib/benchmarks/run.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Run a benchmark script using all permutations of Python versions and commits. + +set -e + +benchmark="bm_create_proxy_cleaned.py" +declare -a py_versions=("3.11" "3.10" "3.9" "3.8") +declare -a commits=("83e8cd8f" "00bb7918" "f61f7395" "8ff8e264") + +if [ $# -eq 0 ] +then + echo "Usage: ./run.sh [BENCHMARK]" + exit 1; +fi + +for commit in ${commits[@]} +do + git checkout ${commit} > /dev/null + for py_version in ${py_versions[@]} + do + docker run \ + --rm \ + -it \ + -v $(pwd):/followthemoney \ + -e GIT_COMMIT=${commit} \ + python:${py_version} \ + bash -c "cd /followthemoney && pip3 install -e . &> /dev/null && python3 contrib/benchmarks/${benchmark}" + done +done \ No newline at end of file diff --git a/followthemoney/proxy.py b/followthemoney/proxy.py index dcbf23c12..19eb8b2b7 100644 --- a/followthemoney/proxy.py +++ b/followthemoney/proxy.py @@ -79,18 +79,22 @@ def __init__( #: than ``id``, ``schema`` or ``properties``, they will be kept in here #: and re-added upon serialization. self.context = data - self._properties: Dict[str, Set[str]] = {} + self._properties: Dict[str, List[str]] = {} self._size = 0 - for key, value in properties.items(): + for key, values in properties.items(): if key not in self.schema.properties: continue - if not cleaned: - self.add(key, value, cleaned=cleaned, quiet=True) + if cleaned: + # This does not call `self.add` as it might be called millions of times + # in some context and we want to avoid the performance overhead of doing so. + seen: Set[str] = set() + seen_add = seen.add + unique_values = [v for v in values if not (v in seen or seen_add(v))] + self._properties[key] = unique_values + self._size += sum([len(v) for v in unique_values]) else: - values = set(value) - self._properties[key] = values - self._size += sum([len(v) for v in values]) + self.add(key, values, quiet=True) def make_id(self, *parts: Any) -> Optional[str]: """Generate a (hopefully unique) ID for the given entity, composed @@ -127,11 +131,10 @@ def get(self, prop: P, quiet: bool = False) -> List[str]: prop_name = self._prop_name(prop, quiet=quiet) if prop_name is None: return [] - return list(self._properties.get(prop_name, [])) + return self._properties.get(prop_name, []) def first(self, prop: P, quiet: bool = False) -> Optional[str]: - """Get only the first value set for the property, in no particular - order. + """Get only the first value set for the property. :param prop: can be given as a name or an instance of :class:`~followthemoney.property.Property`. @@ -217,8 +220,9 @@ def unsafe_add( # log.warning(msg, prop.name) return None self._size += value_size - self._properties.setdefault(prop.name, set()) - self._properties[prop.name].add(value) + self._properties.setdefault(prop.name, list()) + if value not in self._properties[prop.name]: + self._properties[prop.name].append(value) return None def set( @@ -275,7 +279,7 @@ def remove(self, prop: P, value: str, quiet: bool = True) -> None: if prop_name is not None and prop_name in self._properties: try: self._properties[prop_name].remove(value) - except KeyError: + except (KeyError, ValueError): pass def iterprops(self) -> List[Property]: diff --git a/tests/test_proxy.py b/tests/test_proxy.py index d44544557..febb8f2d3 100644 --- a/tests/test_proxy.py +++ b/tests/test_proxy.py @@ -273,3 +273,57 @@ def test_pickle(self): assert proxy.id == proxy2.id assert hash(proxy) == hash(proxy2) assert proxy2.schema.name == ENTITY["schema"] + + def test_value_order(self): + one = EntityProxy.from_dict(model, { + "id": "one", + "schema": "Email", + "properties": { + "bodyHtml": ["Hello", "World"], + }, + }) + + two = EntityProxy.from_dict(model, { + "id": "one", + "schema": "Email", + "properties": { + "bodyHtml": ["World", "Hello"], + }, + }) + + assert one.get("bodyHtml") == ["Hello", "World"] + assert two.get("bodyHtml") == ["World", "Hello"] + + def test_value_deduplication(self): + proxy = EntityProxy.from_dict(model, { + "id": "acme-inc", + "schema": "Company", + "properties": { + "name": ["ACME, Inc.", "ACME, Inc."], + }, + }) + + assert proxy.get("name") == ["ACME, Inc."] + + proxy = EntityProxy.from_dict(model, { + "id": "acme-inc", + "schema": "Company", + }) + + assert proxy.get("name") == [] + proxy.add("name", "ACME, Inc.") + assert proxy.get("name") == ["ACME, Inc."] + proxy.add("name", "ACME, Inc.") + assert proxy.get("name") == ["ACME, Inc."] + + def test_value_deduplication_cleaned(self): + proxy = EntityProxy.from_dict(model, { + "id": "acme-inc", + "schema": "Company", + "properties": { + "name": ["ACME, Inc.", "ACME, Inc."], + }, + }, cleaned=True) + + assert proxy.get("name") == ["ACME, Inc."] + \ No newline at end of file