diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 04a22b41247..c9212334a96 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -17,6 +17,7 @@ #include "compact_protocol_reader.hpp" #include "parquet.hpp" +#include "parquet_common.hpp" #include @@ -652,6 +653,9 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c) { using optional_size_statistics = parquet_field_optional>; + using optional_list_enc_stats = + parquet_field_optional, + parquet_field_struct_list>; auto op = std::make_tuple(parquet_field_enum(1, c->type), parquet_field_enum_list(2, c->encodings), parquet_field_string_list(3, c->path_in_schema), @@ -663,6 +667,7 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c) parquet_field_int64(10, c->index_page_offset), parquet_field_int64(11, c->dictionary_page_offset), parquet_field_struct(12, c->statistics), + optional_list_enc_stats(13, c->encoding_stats), optional_size_statistics(16, c->size_statistics)); function_builder(this, op); } @@ -758,13 +763,16 @@ void CompactProtocolReader::read(Statistics* s) { using optional_binary = parquet_field_optional, parquet_field_binary>; using optional_int64 = parquet_field_optional; + using optional_bool = parquet_field_optional; auto op = std::make_tuple(optional_binary(1, s->max), optional_binary(2, s->min), optional_int64(3, s->null_count), optional_int64(4, s->distinct_count), optional_binary(5, s->max_value), - optional_binary(6, s->min_value)); + optional_binary(6, s->min_value), + optional_bool(7, s->is_max_value_exact), + optional_bool(8, s->is_min_value_exact)); function_builder(this, op); } @@ -774,6 +782,14 @@ void CompactProtocolReader::read(ColumnOrder* c) function_builder(this, op); } +void CompactProtocolReader::read(PageEncodingStats* s) +{ + auto op = std::make_tuple(parquet_field_enum(1, s->page_type), + parquet_field_enum(2, s->encoding), + parquet_field_int32(3, s->count)); + function_builder(this, op); +} + void CompactProtocolReader::read(SortingColumn* s) { auto op = std::make_tuple(parquet_field_int32(1, s->column_idx), diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 2ad336a3052..bcc9adfc8c0 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -120,6 +120,7 @@ class CompactProtocolReader { void read(ColumnIndex* c); void read(Statistics* s); void read(ColumnOrder* c); + void read(PageEncodingStats* s); void read(SortingColumn* s); public: diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 1262ca1926d..14c99f728de 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -188,6 +188,7 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s) if (s.index_page_offset != 0) { c.field_int(10, s.index_page_offset); } if (s.dictionary_page_offset != 0) { c.field_int(11, s.dictionary_page_offset); } c.field_struct(12, s.statistics); + if (s.encoding_stats.has_value()) { c.field_struct_list(13, s.encoding_stats.value()); } if (s.size_statistics.has_value()) { c.field_struct(16, s.size_statistics.value()); } return c.value(); } @@ -201,6 +202,8 @@ size_t CompactProtocolWriter::write(Statistics const& s) if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); } if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); } if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); } + if (s.is_max_value_exact.has_value()) { c.field_bool(7, s.is_max_value_exact.value()); } + if (s.is_min_value_exact.has_value()) { c.field_bool(8, s.is_min_value_exact.value()); } return c.value(); } @@ -248,6 +251,15 @@ size_t CompactProtocolWriter::write(ColumnOrder const& co) return c.value(); } +size_t CompactProtocolWriter::write(PageEncodingStats const& enc) +{ + CompactProtocolFieldWriter c(*this); + c.field_int(1, static_cast(enc.page_type)); + c.field_int(2, static_cast(enc.encoding)); + c.field_int(3, enc.count); + return c.value(); +} + size_t CompactProtocolWriter::write(SortingColumn const& sc) { CompactProtocolFieldWriter c(*this); diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 2e39abadd24..c2e6178acbf 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -53,6 +53,7 @@ class CompactProtocolWriter { size_t write(OffsetIndex const&); size_t write(SizeStatistics const&); size_t write(ColumnOrder const&); + size_t write(PageEncodingStats const&); size_t write(SortingColumn const&); protected: diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 227f13db60e..11b18579c58 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -2944,6 +2944,9 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, auto const [min_ptr, min_size] = get_extremum(&s->min_value, dtype, scratch, true, NO_TRUNC_STATS); encoder.field_binary(6, min_ptr, min_size); + // cudf min/max statistics are always exact (i.e. not truncated) + encoder.field_bool(7, true); + encoder.field_bool(8, true); } encoder.end(&end); return end; diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 7f00d63b9c2..756726945cf 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -259,6 +259,10 @@ struct Statistics { thrust::optional> max_value; // min value for column determined by ColumnOrder thrust::optional> min_value; + // If true, max_value is the actual maximum value for a column + thrust::optional is_max_value_exact; + // If true, min_value is the actual minimum value for a column + thrust::optional is_min_value_exact; }; /** @@ -322,6 +326,15 @@ struct ColumnIndex { thrust::optional> definition_level_histogram; }; +/** + * @brief Thrift-derived struct describing page encoding statistics + */ +struct PageEncodingStats { + PageType page_type; // The page type (data/dic/...) + Encoding encoding; // Encoding of the page + int32_t count; // Number of pages of this type with this encoding +}; + /** * @brief Thrift-derived struct describing column sort order */ @@ -335,21 +348,36 @@ struct SortingColumn { * @brief Thrift-derived struct describing a column chunk */ struct ColumnChunkMetaData { + // Type of this column Type type = BOOLEAN; + // Set of all encodings used for this column. The purpose is to validate + // whether we can decode those pages. std::vector encodings; + // Path in schema std::vector path_in_schema; - Compression codec = UNCOMPRESSED; + // Compression codec + Compression codec = UNCOMPRESSED; + // Number of values in this column int64_t num_values = 0; - int64_t total_uncompressed_size = - 0; // total byte size of all uncompressed pages in this column chunk (including the headers) - int64_t total_compressed_size = - 0; // total byte size of all compressed pages in this column chunk (including the headers) - int64_t data_page_offset = 0; // Byte offset from beginning of file to first data page - int64_t index_page_offset = 0; // Byte offset from beginning of file to root index page - int64_t dictionary_page_offset = - 0; // Byte offset from the beginning of file to first (only) dictionary page - Statistics statistics; // Encoded chunk-level statistics - thrust::optional size_statistics; // Size statistics for the chunk + // Total byte size of all uncompressed pages in this column chunk (including the headers) + int64_t total_uncompressed_size = 0; + // Total byte size of all compressed pages in this column chunk (including the headers) + int64_t total_compressed_size = 0; + // Byte offset from beginning of file to first data page + int64_t data_page_offset = 0; + // Byte offset from beginning of file to root index page + int64_t index_page_offset = 0; + // Byte offset from the beginning of file to first (only) dictionary page + int64_t dictionary_page_offset = 0; + // Optional statistics for this column chunk + Statistics statistics; + // Set of all encodings used for pages in this column chunk. This information can be used to + // determine if all data pages are dictionary encoded for example. + thrust::optional> encoding_stats; + // Optional statistics to help estimate total memory when converted to in-memory representations. + // The histograms contained in these statistics can also be useful in some cases for more + // fine-grained nullability/list length filter pushdown. + thrust::optional size_statistics; }; /** diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 5509a33f9f0..286c7b361a9 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -22,6 +22,8 @@ #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" #include "io/comp/nvcomp_adapter.hpp" +#include "io/parquet/parquet.hpp" +#include "io/parquet/parquet_gpu.hpp" #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" #include "io/utilities/config_utils.hpp" @@ -214,6 +216,53 @@ void update_chunk_encodings(std::vector& encodings, uint32_t enc_mask) } } +/** + * @brief Update the encoding_stats field in the column chunk metadata. + * + * @param chunk_meta The `ColumnChunkMetaData` struct for the column chunk + * @param ck The column chunk to summarize stats for + * @param is_v2 True if V2 page headers are used + */ +void update_chunk_encoding_stats(ColumnChunkMetaData& chunk_meta, + EncColumnChunk const& ck, + bool is_v2) +{ + // don't set encoding stats if there are no pages + if (ck.num_pages == 0) { return; } + + // NOTE: since cudf doesn't use mixed encodings for a chunk, we really only need to account + // for the dictionary page (if there is one), and the encoding used for the data pages. We can + // examine the chunk's encodings field to figure out the encodings without having to examine + // the page data. + auto const num_data_pages = static_cast(ck.num_data_pages()); + auto const data_page_type = is_v2 ? PageType::DATA_PAGE_V2 : PageType::DATA_PAGE; + + std::vector result; + if (ck.use_dictionary) { + // For dictionary encoding, if V1 then both data and dictionary use PLAIN_DICTIONARY. For V2 + // the dictionary uses PLAIN and the data RLE_DICTIONARY. + auto const dict_enc = is_v2 ? Encoding::PLAIN : Encoding::PLAIN_DICTIONARY; + auto const data_enc = is_v2 ? Encoding::RLE_DICTIONARY : Encoding::PLAIN_DICTIONARY; + result.push_back({PageType::DICTIONARY_PAGE, dict_enc, 1}); + if (num_data_pages > 0) { result.push_back({data_page_type, data_enc, num_data_pages}); } + } else { + // No dictionary page, the pages are encoded with something other than RLE (unless it's a + // boolean column). + for (auto const enc : chunk_meta.encodings) { + if (enc != Encoding::RLE) { + result.push_back({data_page_type, enc, num_data_pages}); + break; + } + } + // if result is empty and we're using V2 headers, then assume the data is RLE as well + if (result.empty() and is_v2 and (ck.encodings & encoding_to_mask(Encoding::RLE)) != 0) { + result.push_back({data_page_type, Encoding::RLE, num_data_pages}); + } + } + + if (not result.empty()) { chunk_meta.encoding_stats = std::move(result); } +} + /** * @brief Compute size (in bytes) of the data stored in the given column. * @@ -2144,6 +2193,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, max_write_size = std::max(max_write_size, ck.compressed_size); update_chunk_encodings(column_chunk_meta.encodings, ck.encodings); + update_chunk_encoding_stats(column_chunk_meta, ck, write_v2_headers); if (ck.ck_stat_size != 0) { std::vector const stats_blob = cudf::detail::make_std_vector_sync( diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index a16b3d63177..3a8763ed9f3 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -903,6 +903,12 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) ASSERT_TRUE(stats.min_value.has_value()); ASSERT_TRUE(stats.max_value.has_value()); + // check that min and max for the column chunk are exact (i.e. not truncated) + ASSERT_TRUE(stats.is_max_value_exact.has_value()); + EXPECT_TRUE(stats.is_max_value_exact.value()); + ASSERT_TRUE(stats.is_min_value_exact.has_value()); + EXPECT_TRUE(stats.is_min_value_exact.value()); + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; @@ -1674,7 +1680,18 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings) // no nulls and no repetition, so the only encoding used should be for the data. // since we're writing v1, both dict and data pages should use PLAIN_DICTIONARY. auto const expect_enc = [&fmd](int idx, cudf::io::parquet::detail::Encoding enc) { - EXPECT_EQ(fmd.row_groups[0].columns[idx].meta_data.encodings[0], enc); + auto const& col_meta = fmd.row_groups[0].columns[idx].meta_data; + EXPECT_EQ(col_meta.encodings[0], enc); + + // also check encoding stats are written properly + ASSERT_TRUE(col_meta.encoding_stats.has_value()); + auto const& enc_stats = col_meta.encoding_stats.value(); + for (auto const& ec : enc_stats) { + if (ec.page_type == cudf::io::parquet::detail::PageType::DATA_PAGE) { + EXPECT_EQ(ec.encoding, enc); + EXPECT_EQ(ec.count, 1); + } + } }; // requested plain diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 0862995bc46..8143e7919a7 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -692,7 +692,7 @@ def contains( Returning an Index of booleans using only a literal pattern. - >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.NaN] + >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.nan] >>> idx = cudf.Index(data) >>> idx Index(['Mouse', 'dog', 'house and parrot', '23.0', None], dtype='object') diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 01842b5f0a9..cd42bf52ea1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1077,7 +1077,7 @@ def isna(self): >>> import cudf >>> import numpy as np >>> import pandas as pd - >>> df = cudf.DataFrame({'age': [5, 6, np.NaN], + >>> df = cudf.DataFrame({'age': [5, 6, np.nan], ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), ... pd.Timestamp('1940-04-25')], ... 'name': ['Alfred', 'Batman', ''], @@ -1095,7 +1095,7 @@ def isna(self): Show which entries in a Series are NA. - >>> ser = cudf.Series([5, 6, np.NaN, np.inf, -np.inf]) + >>> ser = cudf.Series([5, 6, np.nan, np.inf, -np.inf]) >>> ser 0 5.0 1 6.0 @@ -1113,7 +1113,7 @@ def isna(self): Show which entries in an Index are NA. - >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf]) + >>> idx = cudf.Index([1, 2, None, np.nan, 0.32, np.inf]) >>> idx Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') >>> idx.isna() @@ -1156,7 +1156,7 @@ def notna(self): >>> import cudf >>> import numpy as np >>> import pandas as pd - >>> df = cudf.DataFrame({'age': [5, 6, np.NaN], + >>> df = cudf.DataFrame({'age': [5, 6, np.nan], ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), ... pd.Timestamp('1940-04-25')], ... 'name': ['Alfred', 'Batman', ''], @@ -1174,7 +1174,7 @@ def notna(self): Show which entries in a Series are NA. - >>> ser = cudf.Series([5, 6, np.NaN, np.inf, -np.inf]) + >>> ser = cudf.Series([5, 6, np.nan, np.inf, -np.inf]) >>> ser 0 5.0 1 6.0 @@ -1192,7 +1192,7 @@ def notna(self): Show which entries in an Index are NA. - >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf]) + >>> idx = cudf.Index([1, 2, None, np.nan, 0.32, np.inf]) >>> idx Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') >>> idx.notna() diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6c0acdc5fb0..f55fa4c05b5 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -60,6 +60,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( + _NUMPY_SCTYPES, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, @@ -344,7 +345,10 @@ def _data(self): @_cudf_nvtx_annotate def __contains__(self, item): if isinstance(item, bool) or not isinstance( - item, tuple(np.sctypes["int"] + np.sctypes["float"] + [int, float]) + item, + tuple( + _NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float] + ), ): return False try: diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py index 9436d65e0b7..4abe210c6ea 100644 --- a/python/cudf/cudf/tests/test_api_types.py +++ b/python/cudf/cudf/tests/test_api_types.py @@ -33,7 +33,6 @@ (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -42,7 +41,6 @@ (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. @@ -61,7 +59,6 @@ (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -142,7 +139,6 @@ def test_is_categorical_dtype(obj, expect): (np.float64, True), (np.complex128, True), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -151,7 +147,6 @@ def test_is_categorical_dtype(obj, expect): (np.float64(), True), (np.complex128(), True), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. @@ -170,7 +165,6 @@ def test_is_categorical_dtype(obj, expect): (np.array([], dtype=np.float64), True), (np.array([], dtype=np.complex128), True), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -247,7 +241,6 @@ def test_is_numeric_dtype(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -256,7 +249,6 @@ def test_is_numeric_dtype(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. @@ -275,7 +267,6 @@ def test_is_numeric_dtype(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -352,7 +343,6 @@ def test_is_integer_dtype(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -361,7 +351,6 @@ def test_is_integer_dtype(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. @@ -380,7 +369,6 @@ def test_is_integer_dtype(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -458,7 +446,6 @@ def test_is_integer(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, True), - (np.unicode_, True), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -467,7 +454,6 @@ def test_is_integer(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), True), - (np.unicode_(), True), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. @@ -486,7 +472,6 @@ def test_is_integer(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), True), - (np.array([], dtype=np.unicode_), True), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), # (np.array([], dtype=object), False), @@ -577,7 +562,6 @@ def test_is_string_dtype(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, True), (np.timedelta64, False), # NumPy scalars. @@ -586,7 +570,6 @@ def test_is_string_dtype(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), True), (np.timedelta64(), False), # NumPy dtype objects. @@ -605,7 +588,6 @@ def test_is_string_dtype(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), True), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -682,7 +664,6 @@ def test_is_datetime_dtype(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -691,7 +672,6 @@ def test_is_datetime_dtype(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. @@ -710,7 +690,6 @@ def test_is_datetime_dtype(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -787,7 +766,6 @@ def test_is_list_dtype(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -796,7 +774,6 @@ def test_is_list_dtype(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. @@ -815,7 +792,6 @@ def test_is_list_dtype(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -895,7 +871,6 @@ def test_is_struct_dtype(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -904,7 +879,6 @@ def test_is_struct_dtype(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. @@ -923,7 +897,6 @@ def test_is_struct_dtype(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -1004,7 +977,6 @@ def test_is_decimal_dtype(obj, expect): np.float64, np.complex128, np.str_, - np.unicode_, np.datetime64, np.timedelta64, # NumPy scalars. @@ -1013,7 +985,6 @@ def test_is_decimal_dtype(obj, expect): np.float64(), np.complex128(), np.str_(), - np.unicode_(), np.datetime64(), np.timedelta64(), # NumPy dtype objects. @@ -1032,7 +1003,6 @@ def test_is_decimal_dtype(obj, expect): np.array([], dtype=np.float64), np.array([], dtype=np.complex128), np.array([], dtype=np.str_), - np.array([], dtype=np.unicode_), np.array([], dtype=np.datetime64), np.array([], dtype=np.timedelta64), np.array([], dtype=object), @@ -1088,7 +1058,6 @@ def test_pandas_agreement(obj): np.float64, np.complex128, np.str_, - np.unicode_, np.datetime64, np.timedelta64, # NumPy scalars. @@ -1097,7 +1066,6 @@ def test_pandas_agreement(obj): np.float64(), np.complex128(), np.str_(), - np.unicode_(), np.datetime64(), np.timedelta64(), # NumPy dtype objects. @@ -1116,7 +1084,6 @@ def test_pandas_agreement(obj): np.array([], dtype=np.float64), np.array([], dtype=np.complex128), np.array([], dtype=np.str_), - np.array([], dtype=np.unicode_), np.array([], dtype=np.datetime64), np.array([], dtype=np.timedelta64), np.array([], dtype=object), diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index e21fd53bee4..7aba2e45532 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -460,7 +460,7 @@ def test_categorical_dataframe_slice_copy(): pd.Series(["1.0", "2.5", "3.001", None, "9"], dtype="category"), pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]), pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]), - pd.Series([1, 2, 3, 89, None, np.nan, np.NaN], dtype="float64"), + pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"), pd.Series([1, 2, 3, 89], dtype="float64"), pd.Series([1, 2.5, 3.001, 89], dtype="float64"), pd.Series([None, None, None]), @@ -493,7 +493,7 @@ def test_categorical_typecast(data, cat_type): pd.Series([1, 2, 3, 89]), pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]), pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]), - pd.Series([1, 2, 3, 89, None, np.nan, np.NaN], dtype="float64"), + pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"), pd.Series([1, 2, 3, 89], dtype="float64"), pd.Series([1, 2.5, 3.001, 89], dtype="float64"), pd.Series([None, None, None]), diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 59e8b41e51a..e287603de07 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5199,20 +5199,20 @@ def test_df_constructor_dtype(dtype): cudf.DataFrame( { "a": [1, 2, 3, 4], - "b": [7, np.NaN, 9, 10], + "b": [7, np.nan, 9, 10], "c": cudf.Series( - [np.NaN, np.NaN, np.NaN, np.NaN], nan_as_null=False + [np.nan, np.nan, np.nan, np.nan], nan_as_null=False ), "d": cudf.Series([None, None, None, None], dtype="int64"), "e": [100, None, 200, None], - "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False), + "f": cudf.Series([10, None, np.nan, 11], nan_as_null=False), } ), cudf.DataFrame( { "a": [10, 11, 12, 13, 14, 15], "b": cudf.Series( - [10, None, np.NaN, 2234, None, np.NaN], nan_as_null=False + [10, None, np.nan, 2234, None, np.nan], nan_as_null=False ), } ), @@ -5264,11 +5264,11 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): gdf = cudf.DataFrame( { "a": [1, 2, 3, 4], - "b": [7, np.NaN, 9, 10], - "c": cudf.Series([np.NaN, np.NaN, np.NaN, np.NaN], dtype=float), + "b": [7, np.nan, 9, 10], + "c": cudf.Series([np.nan, np.nan, np.nan, np.nan], dtype=float), "d": cudf.Series([None, None, None, None], dtype="int64"), "e": [100, None, 200, None], - "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False), + "f": cudf.Series([10, None, np.nan, 11], nan_as_null=False), } ) @@ -5300,7 +5300,7 @@ def test_rowwise_ops_nullable_dtypes_partial_null(op): { "a": [10, 11, 12, 13, 14, 15], "b": cudf.Series( - [10, None, np.NaN, 2234, None, np.NaN], + [10, None, np.nan, 2234, None, np.nan], nan_as_null=False, ), } diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 56a4281aad9..6fb1d3d8ba5 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -211,7 +211,7 @@ def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64): # Randomly but reproducibly mark subset of rows as invalid random.seed(1337) mask = random.sample(range(nrows), nvalids) - test_pdf[test_pdf.index.isin(mask)] = np.NaN + test_pdf[test_pdf.index.isin(mask)] = np.nan if dtype: test_pdf = test_pdf.astype(dtype) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index b9eb42906e8..27811d0fcde 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -507,7 +507,7 @@ def test_df_corr(method): @pytest.mark.parametrize( "data", [ - [0.0, 1, 3, 6, np.NaN, 7, 5.0, np.nan, 5, 2, 3, -100], + [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100], [np.nan] * 3, [1, 5, 3], [], @@ -555,7 +555,7 @@ def test_nans_stats(data, ops, skipna): @pytest.mark.parametrize( "data", [ - [0.0, 1, 3, 6, np.NaN, 7, 5.0, np.nan, 5, 2, 3, -100], + [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100], [np.nan] * 3, [1, 5, 3], ], diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index a33b5ca139c..2aa3129ab30 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -91,6 +91,10 @@ BOOL_TYPES = {"bool"} ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES +# The NumPy scalar types are a bit of a mess as they align with the C types +# so for now we use the `sctypes` dict (although it was made private in 2.0) +_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes + def np_to_pa_dtype(dtype): """Util to convert numpy dtype to PyArrow dtype.""" @@ -335,7 +339,7 @@ def min_signed_type(x, min_size=8): Return the smallest *signed* integer dtype that can represent the integer ``x`` """ - for int_dtype in np.sctypes["int"]: + for int_dtype in _NUMPY_SCTYPES["int"]: if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: return int_dtype @@ -348,7 +352,7 @@ def min_unsigned_type(x, min_size=8): Return the smallest *unsigned* integer dtype that can represent the integer ``x`` """ - for int_dtype in np.sctypes["uint"]: + for int_dtype in _NUMPY_SCTYPES["uint"]: if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if 0 <= x <= np.iinfo(int_dtype).max: return int_dtype