Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate nvtext jaccard API to pylibcudf #17007

Merged
merged 9 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ nvtext

edit_distance
generate_ngrams
jaccard
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
jaccard
=======

.. automodule:: pylibcudf.nvtext.jaccard
    :members:
33 changes: 8 additions & 25 deletions python/cudf/cudf/_lib/nvtext/jaccard.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,16 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.jaccard cimport (
jaccard_index as cpp_jaccard_index,
)
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column

from pylibcudf import nvtext


@acquire_spill_lock()
def jaccard_index(Column input1, Column input2, int width):
    """
    Compute the Jaccard index for two columns by delegating to pylibcudf.

    Parameters
    ----------
    input1 : Column
        First input column; handed to pylibcudf in read-only mode.
    input2 : Column
        Second input column; handed to pylibcudf in read-only mode.
    width : int
        Forwarded unchanged as the ``width`` argument of
        ``pylibcudf.nvtext.jaccard.jaccard_index``.

    Returns
    -------
    Column
        The pylibcudf result wrapped back into a cudf ``Column``.
    """
    # The previous body called the libcudf C++ API directly and returned
    # before reaching this delegation; only the pylibcudf path is kept.
    result = nvtext.jaccard.jaccard_index(
        input1.to_pylibcudf(mode="read"),
        input2.to_pylibcudf(mode="read"),
        width,
    )
    return Column.from_pylibcudf(result)
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources edit_distance.pyx generate_ngrams.pyx)
set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport edit_distance, generate_ngrams
from . cimport edit_distance, generate_ngrams, jaccard

__all__ = [
"edit_distance",
"generate_ngrams",
"jaccard",
]
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import edit_distance, generate_ngrams
from . import edit_distance, generate_ngrams, jaccard

__all__ = [
"edit_distance",
"generate_ngrams",
"jaccard",
]
7 changes: 7 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/jaccard.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.types cimport size_type


# Declaration of jaccard_index: takes two Column inputs and a size_type
# ngram width, and returns a Column.
cpdef Column jaccard_index(Column input1, Column input2, size_type width)
47 changes: 47 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.jaccard cimport (
jaccard_index as cpp_jaccard_index,
)
from pylibcudf.libcudf.types cimport size_type


cpdef Column jaccard_index(Column input1, Column input2, size_type width):
    """
    Compute the row-wise Jaccard similarity of two strings columns.

    For details, see :cpp:func:`jaccard_index`

    Parameters
    ----------
    input1 : Column
        First strings column
    input2 : Column
        Second strings column
    width : size_type
        The ngram number to generate

    Returns
    -------
    Column
        Index calculation values
    """
    cdef unique_ptr[column] owned_result
    cdef column_view lhs_view = input1.view()
    cdef column_view rhs_view = input2.view()

    # The libcudf call is made with the GIL released.
    with nogil:
        owned_result = move(cpp_jaccard_index(lhs_view, rhs_view, width))

    # Hand ownership of the libcudf column to a pylibcudf Column.
    return Column.from_libcudf(move(owned_result))
37 changes: 37 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture(scope="module")
def input_data():
    """Return two equal-length pyarrow string arrays to be compared row by row."""
    first = pa.array(
        ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"]
    )
    second = pa.array(
        ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"]
    )
    return first, second


@pytest.mark.parametrize("width", [2, 3])
def test_jaccard_index(input_data, width):
    """Compare pylibcudf's jaccard_index with a pure-Python ngram reference."""

    def ngram_set(text):
        # Every contiguous substring of length ``width`` in ``text``.
        last_start = len(text) - width
        return {
            text[start : start + width] for start in range(last_start + 1)
        }

    def reference_index(lhs, rhs):
        # Jaccard index: |intersection| / |union| of the two ngram sets.
        lhs_grams = ngram_set(lhs)
        rhs_grams = ngram_set(rhs)
        shared = lhs_grams.intersection(rhs_grams)
        combined = lhs_grams.union(rhs_grams)
        return len(shared) / len(combined)

    input1, input2 = input_data
    got = plc.nvtext.jaccard.jaccard_index(
        plc.interop.from_arrow(input1),
        plc.interop.from_arrow(input2),
        width,
    )
    scores = [
        reference_index(left.as_py(), right.as_py())
        for left, right in zip(input1, input2)
    ]
    # The expected values are built as float32.
    expected = pa.array(scores, type=pa.float32())
    assert_column_eq(got, expected)
Loading