Merge pull request #57 from jeanmaried/feat/delete-by-metadata
feat: add ability to delete by filtering metadata only
olirice authored Nov 14, 2023
2 parents ec961c4 + 7f18333 commit 7680c01
Showing 3 changed files with 60 additions and 16 deletions.
4 changes: 3 additions & 1 deletion docs/api.md
@@ -62,10 +62,12 @@ docs.upsert(

## Deleting vectors

-Deleting records removes them from the collection. To delete records, specify a list of `ids` to the `delete` method. The ids of the successfully deleted records are returned from the method. Note that attempting to delete non-existent records does not raise an error.
+Deleting records removes them from the collection. To delete records, specify a list of `ids` or metadata filters to the `delete` method. The ids of the successfully deleted records are returned from the method. Note that attempting to delete non-existent records does not raise an error.

```python
docs.delete(ids=["vec0", "vec1"])
+# or delete by a metadata filter
+docs.delete(filters={"year": {"$eq": 2012}})
```
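
A short follow-on sketch, not part of the commit: it assumes a populated `docs` collection and that `delete` accepts the same metadata operator syntax as `query` (`$eq`, `$gte`, `$lt`, `$and`, ...). The return value lists the ids that were actually removed:

```python
# capture the ids that were actually removed
removed = docs.delete(ids=["vec0", "vec1"])
print(removed)  # e.g. ["vec0", "vec1"] if both existed, [] otherwise

# compound metadata filter: delete everything tagged with a year in the 2010s
removed = docs.delete(
    filters={"$and": [{"year": {"$gte": 2010}}, {"year": {"$lt": 2020}}]}
)
```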

## Create an index
25 changes: 23 additions & 2 deletions src/tests/test_collection.py
@@ -1,3 +1,4 @@
+import itertools
 import random
 
 import numpy as np
@@ -91,24 +92,44 @@ def test_delete(client: vecs.Client) -> None:
f"vec{ix}",
vec,
{
"genre": random.choice(["action", "rom-com", "drama"]),
"genre": genre,
"year": int(50 * random.random()) + 1970,
},
)
for ix, vec in enumerate(np.random.random((n_records, dim)))
for (ix, vec), genre in zip(
enumerate(np.random.random((n_records, dim))),
itertools.cycle(["action", "rom-com", "drama"]),
)
]

# insert works
movies.upsert(records)

# delete by IDs.
delete_ids = ["vec0", "vec15", "vec99"]
movies.delete(ids=delete_ids)
assert len(movies) == n_records - len(delete_ids)

# insert works
movies.upsert(records)

# delete with filters
genre_to_delete = "action"
deleted_ids_by_genre = movies.delete(filters={"genre": {"$eq": genre_to_delete}})
assert len(deleted_ids_by_genre) == 34

# bad input
with pytest.raises(vecs.exc.ArgError):
movies.delete(ids="should_be_a_list")

# bad input: neither ids nor filters provided.
with pytest.raises(vecs.exc.ArgError):
movies.delete()

# bad input: should only provide either ids or filters, not both
with pytest.raises(vecs.exc.ArgError):
movies.delete(ids=["vec0"], filters={"genre": {"$eq": genre_to_delete}})


def test_repr(client: vecs.Client) -> None:
movies = client.get_or_create_collection(name="movies", dimension=99)
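
For context on the `assert len(deleted_ids_by_genre) == 34` expectation above: `itertools.cycle` hands out genres in order, so "action" lands on records 0, 3, ..., 99. Assuming `n_records = 100` (which the expected count implies), that is 34 records. A standalone sanity check of the arithmetic:

```python
import itertools

n_records = 100  # assumption implied by the expected count of 34
genres = [
    genre
    for _, genre in zip(range(n_records), itertools.cycle(["action", "rom-com", "drama"]))
]
assert genres.count("action") == 34  # indices 0, 3, ..., 99
```
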
47 changes: 34 additions & 13 deletions src/vecs/collection.py
@@ -375,33 +375,52 @@ def fetch(self, ids: Iterable[str]) -> List[Record]:
                     records.extend(chunk_records)
         return records
 
-    def delete(self, ids: Iterable[str]) -> List[str]:
+    def delete(
+        self, ids: Optional[Iterable[str]] = None, filters: Optional[Metadata] = None
+    ) -> List[str]:
         """
-        Deletes vectors from the collection by their identifiers.
+        Deletes vectors from the collection by matching filters or ids.
 
         Args:
-            ids (Iterable[str]): An iterable of vector identifiers.
+            ids (Iterable[str], optional): An iterable of vector identifiers.
+            filters (Optional[Dict], optional): Filters to apply to the search. Defaults to None.
 
         Returns:
             List[str]: A list of the identifiers of the deleted vectors.
         """
+        if ids is None and filters is None:
+            raise ArgError("Either ids or filters must be provided.")
+
+        if ids is not None and filters is not None:
+            raise ArgError("Either ids or filters must be provided, not both.")
+
         if isinstance(ids, str):
             raise ArgError("ids must be a list of strings")
 
-        chunk_size = 12
+        ids = ids or []
+        filters = filters or {}
+        del_ids = []
 
-        del_ids = list(ids)
-        ids = []
         with self.client.Session() as sess:
             with sess.begin():
-                for id_chunk in flu(del_ids).chunk(chunk_size):
+                if ids:
+                    for id_chunk in flu(ids).chunk(12):
+                        stmt = (
+                            delete(self.table)
+                            .where(self.table.c.id.in_(id_chunk))
+                            .returning(self.table.c.id)
+                        )
+                        del_ids.extend(sess.execute(stmt).scalars() or [])
+
+                if filters:
+                    meta_filter = build_filters(self.table.c.metadata, filters)
                     stmt = (
-                        delete(self.table)
-                        .where(self.table.c.id.in_(id_chunk))
-                        .returning(self.table.c.id)
+                        delete(self.table).where(meta_filter).returning(self.table.c.id)  # type: ignore
                     )
-                    ids.extend(sess.execute(stmt).scalars() or [])
-        return ids
+                    result = sess.execute(stmt).scalars()
+                    del_ids.extend(result.fetchall())
+
+        return del_ids

def __getitem__(self, items):
"""
@@ -516,7 +535,9 @@ def query(

         stmt = select(*cols)
         if filters:
-            stmt = stmt.filter(build_filters(self.table.c.metadata, filters))  # type: ignore
+            stmt = stmt.filter(
+                build_filters(self.table.c.metadata, filters)  # type: ignore
+            )
 
         stmt = stmt.order_by(distance_clause)
         stmt = stmt.limit(limit)
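
Taken together, a minimal usage sketch of the updated `delete` signature. The connection string and collection below are placeholders, not part of the commit:

```python
import vecs

client = vecs.create_client("postgresql://user:password@localhost:5432/postgres")
movies = client.get_or_create_collection(name="movies", dimension=3)

# delete by ids: issued in chunks of 12, returns the ids actually deleted
deleted = movies.delete(ids=["vec0", "vec15", "vec99"])

# delete by metadata filter: the WHERE clause is built via build_filters
deleted = movies.delete(filters={"genre": {"$eq": "action"}})

# passing neither argument, or both, raises vecs.exc.ArgError
try:
    movies.delete(ids=["vec0"], filters={"genre": {"$eq": "action"}})
except vecs.exc.ArgError:
    pass
```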
