diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
index 2e03b589c8b..6300f77d686 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -6,3 +6,4 @@ nvtext
 
     edit_distance
     generate_ngrams
+    jaccard
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst
new file mode 100644
index 00000000000..ea59657c25e
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst
@@ -0,0 +1,6 @@
+=======
+jaccard
+=======
+
+.. automodule:: pylibcudf.nvtext.jaccard
+    :members:
diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
index 0ebf7c281e3..c964d0206b7 100644
--- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx
+++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
@@ -2,33 +2,16 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.jaccard cimport (
-    jaccard_index as cpp_jaccard_index,
-)
-from pylibcudf.libcudf.types cimport size_type
-
 from cudf._lib.column cimport Column
 
+from pylibcudf import nvtext
+
 
 @acquire_spill_lock()
 def jaccard_index(Column input1, Column input2, int width):
-    cdef column_view c_input1 = input1.view()
-    cdef column_view c_input2 = input2.view()
-    cdef size_type c_width = width
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_jaccard_index(
-                c_input1,
-                c_input2,
-                c_width
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.jaccard.jaccard_index(
+        input1.to_pylibcudf(mode="read"),
+        input2.to_pylibcudf(mode="read"),
+        width,
+    )
+    return Column.from_pylibcudf(result)
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
index eb5617a1da6..9913e1fbadb 100644
--- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources edit_distance.pyx generate_ngrams.pyx)
+set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
index 7f5fa2b9925..5f1762b1e3d 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport edit_distance, generate_ngrams
+from . cimport edit_distance, generate_ngrams, jaccard
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
+    "jaccard",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
index a66ce984745..1c0ddb1e5a4 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.py
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import edit_distance, generate_ngrams
+from . import edit_distance, generate_ngrams, jaccard
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
+    "jaccard",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd
new file mode 100644
index 00000000000..a4d4a15335b
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column jaccard_index(Column input1, Column input2, size_type width)
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
new file mode 100644
index 00000000000..9334d7ce751
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
@@ -0,0 +1,47 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.jaccard cimport (
+    jaccard_index as cpp_jaccard_index,
+)
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column jaccard_index(Column input1, Column input2, size_type width):
+    """
+    Returns the Jaccard similarity between individual rows in two strings columns.
+
+    For details, see :cpp:func:`jaccard_index`
+
+    Parameters
+    ----------
+    input1 : Column
+        Input strings column
+    input2 : Column
+        Input strings column
+    width : size_type
+        The ngram number to generate
+
+    Returns
+    -------
+    Column
+        Index calculation values
+    """
+    cdef column_view c_input1 = input1.view()
+    cdef column_view c_input2 = input2.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_jaccard_index(
+                c_input1,
+                c_input2,
+                width
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
new file mode 100644
index 00000000000..d5a168426b1
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture(scope="module")
+def input_data():
+    input1 = ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"]
+    input2 = ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"]
+    return pa.array(input1), pa.array(input2)
+
+
+@pytest.mark.parametrize("width", [2, 3])
+def test_jaccard_index(input_data, width):
+    def get_tokens(s, width):
+        return [s[i : i + width] for i in range(len(s) - width + 1)]
+
+    def jaccard_index(s1, s2, width):
+        x = set(get_tokens(s1, width))
+        y = set(get_tokens(s2, width))
+        return len(x & y) / len(x | y)
+
+    input1, input2 = input_data
+    result = plc.nvtext.jaccard.jaccard_index(
+        plc.interop.from_arrow(input1), plc.interop.from_arrow(input2), width
+    )
+    expected = pa.array(
+        [
+            jaccard_index(s1.as_py(), s2.as_py(), width)
+            for s1, s2 in zip(input1, input2)
+        ],
+        type=pa.float32(),
+    )
+    assert_column_eq(result, expected)