Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cudf-polars/pylibcudf string -> date parsing #16306

Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,5 @@ rapids_cython_create_modules(
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf
)

add_subdirectory(convert)
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ from . cimport (
case,
char_types,
contains,
convert,
find,
regex_flags,
regex_program,
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
case,
char_types,
contains,
convert,
find,
regex_flags,
regex_program,
Expand Down
22 changes: 22 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/convert/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# =============================================================================
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================

# Cython sources handed to rapids_cython_create_modules below.
set(cython_sources convert_durations.pyx convert_datetime.pyx)

# Libraries passed as LINKED_LIBRARIES for the modules created below.
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
CXX
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf
)
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/convert/__init__.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from . cimport convert_datetime, convert_durations
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/convert/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from . import convert_datetime, convert_durations
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.string cimport string

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.types cimport DataType


# C-level declarations for the cpdef functions implemented in
# convert_datetime.pyx. Each signature here must match its definition there.

cpdef Column to_timestamps(
    Column input,
    DataType timestamp_type,
    const string& format
)

cpdef Column from_timestamps(
    Column input,
    const string& format,
    Column input_strings_names
)

# is_timestamp is defined as a cpdef in convert_datetime.pyx; declaring it
# here makes it cimport-able in the same way as the two functions above.
cpdef Column is_timestamp(
    Column input,
    const string& format
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.strings.convert cimport (
convert_datetime as cpp_convert_datetime,
)

from cudf._lib.pylibcudf.types import DataType


cpdef Column to_timestamps(
    Column input,
    DataType timestamp_type,
    const string& format
):
    """Call libcudf's ``to_timestamps`` and wrap the result in a Column.

    Parameters
    ----------
    input : Column
        Column whose view is passed to libcudf.
    timestamp_type : DataType
        Type object whose ``c_obj`` is forwarded to libcudf.
    format : string
        Format string forwarded unchanged to libcudf.

    Returns
    -------
    Column
        New Column built from the libcudf result, which is moved into it.
    """
    cdef unique_ptr[column] c_timestamps

    # The libcudf call is made with the GIL released.
    with nogil:
        c_timestamps = cpp_convert_datetime.to_timestamps(
            input.view(), timestamp_type.c_obj, format
        )

    return Column.from_libcudf(move(c_timestamps))

cpdef Column from_timestamps(
    Column input,
    const string& format,
    Column input_strings_names
):
    """Call libcudf's ``from_timestamps`` and wrap the result in a Column.

    Parameters
    ----------
    input : Column
        Column whose view is passed to libcudf as the first argument.
    format : string
        Format string forwarded unchanged to libcudf.
    input_strings_names : Column
        Column whose view is passed to libcudf as the third argument.

    Returns
    -------
    Column
        New Column built from the libcudf result, which is moved into it.
    """
    cdef unique_ptr[column] c_strings

    # The libcudf call is made with the GIL released.
    with nogil:
        c_strings = cpp_convert_datetime.from_timestamps(
            input.view(), format, input_strings_names.view()
        )

    return Column.from_libcudf(move(c_strings))

cpdef Column is_timestamp(
    Column input,
    const string& format
):
    """Call libcudf's ``is_timestamp`` and wrap the result in a Column.

    Parameters
    ----------
    input : Column
        Column whose view is passed to libcudf.
    format : string
        Format string forwarded unchanged to libcudf.

    Returns
    -------
    Column
        New Column built from the libcudf result, which is moved into it.
    """
    cdef unique_ptr[column] c_flags

    # The libcudf call is made with the GIL released.
    with nogil:
        c_flags = cpp_convert_datetime.is_timestamp(input.view(), format)

    return Column.from_libcudf(move(c_flags))
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.string cimport string

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.types cimport DataType


# C-level declarations for the cpdef functions implemented in
# convert_durations.pyx. Each signature here must match its definition there.

# Takes a Column, a target DataType and a format string; returns a Column.
cpdef Column to_durations(
Column input,
DataType duration_type,
const string& format
)

# Takes a Column and a format string; returns a Column.
cpdef Column from_durations(
Column input,
const string& format
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.strings.convert cimport (
convert_durations as cpp_convert_durations,
)

from cudf._lib.pylibcudf.types import DataType


cpdef Column to_durations(
    Column input,
    DataType duration_type,
    const string& format
):
    """Call libcudf's ``to_durations`` and wrap the result in a Column.

    Parameters
    ----------
    input : Column
        Column whose view is passed to libcudf.
    duration_type : DataType
        Type object whose ``c_obj`` is forwarded to libcudf.
    format : string
        Format string forwarded unchanged to libcudf.

    Returns
    -------
    Column
        New Column built from the libcudf result, which is moved into it.
    """
    cdef unique_ptr[column] c_durations

    # The libcudf call is made with the GIL released.
    with nogil:
        c_durations = cpp_convert_durations.to_durations(
            input.view(), duration_type.c_obj, format
        )

    return Column.from_libcudf(move(c_durations))

cpdef Column from_durations(
    Column input,
    const string& format
):
    """Call libcudf's ``from_durations`` and wrap the result in a Column.

    Parameters
    ----------
    input : Column
        Column whose view is passed to libcudf.
    format : string
        Format string forwarded unchanged to libcudf.

    Returns
    -------
    Column
        New Column built from the libcudf result, which is moved into it.
    """
    cdef unique_ptr[column] c_strings

    # The libcudf call is made with the GIL released.
    with nogil:
        c_strings = cpp_convert_durations.from_durations(input.view(), format)

    return Column.from_libcudf(move(c_strings))
86 changes: 29 additions & 57 deletions python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,7 @@ from cudf._lib.pylibcudf.libcudf.strings.convert.convert_booleans cimport (
to_booleans as cpp_to_booleans,
)
from cudf._lib.pylibcudf.libcudf.strings.convert.convert_datetime cimport (
from_timestamps as cpp_from_timestamps,
is_timestamp as cpp_is_timestamp,
to_timestamps as cpp_to_timestamps,
)
from cudf._lib.pylibcudf.libcudf.strings.convert.convert_durations cimport (
from_durations as cpp_from_durations,
to_durations as cpp_to_durations,
)
from cudf._lib.pylibcudf.libcudf.strings.convert.convert_floats cimport (
from_floats as cpp_from_floats,
Expand All @@ -48,6 +42,8 @@ from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id
from cudf._lib.types cimport underlying_type_t_type_id

import cudf
import cudf._lib.pylibcudf as plc
from cudf._lib.types cimport dtype_to_pylibcudf_type


def floating_to_string(Column input_col):
Expand Down Expand Up @@ -521,19 +517,14 @@ def int2timestamp(
A Column with date-time represented in string format

"""
cdef column_view input_column_view = input_col.view()
cdef string c_timestamp_format = format.encode("UTF-8")
cdef column_view input_strings_names = names.view()

cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_from_timestamps(
input_column_view,
c_timestamp_format,
input_strings_names))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.convert.convert_datetime.from_timestamps(
input_col.to_pylibcudf(mode="read"),
c_timestamp_format,
names.to_pylibcudf(mode="read")
)
)


def timestamp2int(Column input_col, dtype, format):
Expand All @@ -550,23 +541,15 @@ def timestamp2int(Column input_col, dtype, format):
A Column with string represented in date-time format

"""
cdef column_view input_column_view = input_col.view()
cdef type_id tid = <type_id> (
<underlying_type_t_type_id> (
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype]
dtype = dtype_to_pylibcudf_type(dtype)
cdef string c_timestamp_format = format.encode('UTF-8')
return Column.from_pylibcudf(
plc.strings.convert.convert_datetime.to_timestamps(
input_col.to_pylibcudf(mode="read"),
dtype,
c_timestamp_format
)
)
cdef data_type out_type = data_type(tid)
cdef string c_timestamp_format = format.encode('UTF-8')
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_to_timestamps(
input_column_view,
out_type,
c_timestamp_format))

return Column.from_unique_ptr(move(c_result))


def istimestamp(Column input_col, str format):
Expand Down Expand Up @@ -612,23 +595,15 @@ def timedelta2int(Column input_col, dtype, format):
A Column with string represented in TimeDelta format

"""
cdef column_view input_column_view = input_col.view()
cdef type_id tid = <type_id> (
<underlying_type_t_type_id> (
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype]
dtype = dtype_to_pylibcudf_type(dtype)
cdef string c_timestamp_format = format.encode('UTF-8')
return Column.from_pylibcudf(
plc.strings.convert.convert_durations.to_durations(
input_col.to_pylibcudf(mode="read"),
dtype,
c_timestamp_format
)
)
cdef data_type out_type = data_type(tid)
cdef string c_duration_format = format.encode('UTF-8')
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_to_durations(
input_column_view,
out_type,
c_duration_format))

return Column.from_unique_ptr(move(c_result))


def int2timedelta(Column input_col, str format):
Expand All @@ -646,16 +621,13 @@ def int2timedelta(Column input_col, str format):

"""

cdef column_view input_column_view = input_col.view()
cdef string c_duration_format = format.encode('UTF-8')
cdef unique_ptr[column] c_result
with nogil:
c_result = move(
cpp_from_durations(
input_column_view,
c_duration_format))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.convert.convert_durations.from_durations(
input_col.to_pylibcudf(mode="read"),
c_duration_format
)
)


def int2ip(Column input_col):
Expand Down
Loading
Loading