Skip to content

Commit

Permalink
Maintain insert order of property values (#1148)
Browse files Browse the repository at this point in the history
* Maintain insert order of property values

Fix #1139

* Unroll adding values in constructor to remove performance overhead

Implement PR feedback by @pudo

* Add benchmark script

* pin click to avoid type check errors

* Add missing type annotation

---------

Co-authored-by: Friedrich Lindenberg <[email protected]>
  • Loading branch information
tillprochaska and pudo authored Jul 12, 2023
1 parent b88ab98 commit b4d51f4
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 13 deletions.
40 changes: 40 additions & 0 deletions contrib/benchmarks/bm_create_proxy_cleaned.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os
from platform import python_version
from timeit import timeit

from followthemoney import model
from followthemoney.proxy import EntityProxy

ITERATIONS = 1_000_000

ENTITY = {
"id": "test",
"schema": "Person",
"properties": {
"name": ["Ralph Tester", "Ralph Tester", "Ralf Tester"],
"birthDate": ["1972-05-01"],
"idNumber": ["9177171", "8e839023"],
"website": ["https://ralphtester.me"],
"phone": ["+12025557612", "+12025557612", "+12025557612"],
"email": ["[email protected]", "[email protected]", "[email protected]"],
"topics": ["role.spy"],
},
}


def create_proxy(data):
proxy = EntityProxy.from_dict(model, data, cleaned=True)
data = proxy.to_dict()
return data


def benchmark():
for _ in range(ITERATIONS):
create_proxy(ENTITY)


if __name__ == "__main__":
commit = os.environ.get("GIT_COMMIT", "None")
version = python_version()
time = timeit(benchmark, number=10)
print(f"{commit},{version},{time}")
29 changes: 29 additions & 0 deletions contrib/benchmarks/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# Run a benchmark script using all permutations of Python versions and commits.

set -e

benchmark="bm_create_proxy_cleaned.py"
declare -a py_versions=("3.11" "3.10" "3.9" "3.8")
declare -a commits=("83e8cd8f" "00bb7918" "f61f7395" "8ff8e264")

if [ $# -eq 0 ]
then
echo "Usage: ./run.sh [BENCHMARK]"
exit 1;
fi

for commit in ${commits[@]}
do
git checkout ${commit} > /dev/null
for py_version in ${py_versions[@]}
do
docker run \
--rm \
-it \
-v $(pwd):/followthemoney \
-e GIT_COMMIT=${commit} \
python:${py_version} \
bash -c "cd /followthemoney && pip3 install -e . &> /dev/null && python3 contrib/benchmarks/${benchmark}"
done
done
30 changes: 17 additions & 13 deletions followthemoney/proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,18 +79,22 @@ def __init__(
#: than ``id``, ``schema`` or ``properties``, they will be kept in here
#: and re-added upon serialization.
self.context = data
self._properties: Dict[str, Set[str]] = {}
self._properties: Dict[str, List[str]] = {}
self._size = 0

for key, value in properties.items():
for key, values in properties.items():
if key not in self.schema.properties:
continue
if not cleaned:
self.add(key, value, cleaned=cleaned, quiet=True)
if cleaned:
# This does not call `self.add` as it might be called millions of times
# in some context and we want to avoid the performance overhead of doing so.
seen: Set[str] = set()
seen_add = seen.add
unique_values = [v for v in values if not (v in seen or seen_add(v))]
self._properties[key] = unique_values
self._size += sum([len(v) for v in unique_values])
else:
values = set(value)
self._properties[key] = values
self._size += sum([len(v) for v in values])
self.add(key, values, quiet=True)

def make_id(self, *parts: Any) -> Optional[str]:
"""Generate a (hopefully unique) ID for the given entity, composed
Expand Down Expand Up @@ -127,11 +131,10 @@ def get(self, prop: P, quiet: bool = False) -> List[str]:
prop_name = self._prop_name(prop, quiet=quiet)
if prop_name is None:
return []
return list(self._properties.get(prop_name, []))
return self._properties.get(prop_name, [])

def first(self, prop: P, quiet: bool = False) -> Optional[str]:
"""Get only the first value set for the property, in no particular
order.
"""Get only the first value set for the property.
:param prop: can be given as a name or an instance of
:class:`~followthemoney.property.Property`.
Expand Down Expand Up @@ -217,8 +220,9 @@ def unsafe_add(
# log.warning(msg, prop.name)
return None
self._size += value_size
self._properties.setdefault(prop.name, set())
self._properties[prop.name].add(value)
self._properties.setdefault(prop.name, list())
if value not in self._properties[prop.name]:
self._properties[prop.name].append(value)
return None

def set(
Expand Down Expand Up @@ -275,7 +279,7 @@ def remove(self, prop: P, value: str, quiet: bool = True) -> None:
if prop_name is not None and prop_name in self._properties:
try:
self._properties[prop_name].remove(value)
except KeyError:
except (KeyError, ValueError):
pass

def iterprops(self) -> List[Property]:
Expand Down
54 changes: 54 additions & 0 deletions tests/test_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,3 +273,57 @@ def test_pickle(self):
assert proxy.id == proxy2.id
assert hash(proxy) == hash(proxy2)
assert proxy2.schema.name == ENTITY["schema"]

def test_value_order(self):
one = EntityProxy.from_dict(model, {
"id": "one",
"schema": "Email",
"properties": {
"bodyHtml": ["Hello", "World"],
},
})

two = EntityProxy.from_dict(model, {
"id": "one",
"schema": "Email",
"properties": {
"bodyHtml": ["World", "Hello"],
},
})

assert one.get("bodyHtml") == ["Hello", "World"]
assert two.get("bodyHtml") == ["World", "Hello"]

def test_value_deduplication(self):
proxy = EntityProxy.from_dict(model, {
"id": "acme-inc",
"schema": "Company",
"properties": {
"name": ["ACME, Inc.", "ACME, Inc."],
},
})

assert proxy.get("name") == ["ACME, Inc."]

proxy = EntityProxy.from_dict(model, {
"id": "acme-inc",
"schema": "Company",
})

assert proxy.get("name") == []
proxy.add("name", "ACME, Inc.")
assert proxy.get("name") == ["ACME, Inc."]
proxy.add("name", "ACME, Inc.")
assert proxy.get("name") == ["ACME, Inc."]

def test_value_deduplication_cleaned(self):
proxy = EntityProxy.from_dict(model, {
"id": "acme-inc",
"schema": "Company",
"properties": {
"name": ["ACME, Inc.", "ACME, Inc."],
},
}, cleaned=True)

assert proxy.get("name") == ["ACME, Inc."]

0 comments on commit b4d51f4

Please sign in to comment.