From b6a4c15d5da6688ba2861d9003f385e08ce5dccb Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 16 Aug 2024 18:15:20 +0800 Subject: [PATCH 1/8] Handle num-nulls in Parquet correctly --- cpp/src/arrow/dataset/file_parquet.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 1f8b6cc4882cf..27273066bf9cf 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -366,8 +366,9 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr const parquet::Statistics& statistics) { auto field_expr = compute::field_ref(field_ref); + bool may_has_null = !statistics.HasNullCount() || statistics.null_count() != 0; // Optimize for corner case where all values are nulls - if (statistics.num_values() == 0 && statistics.null_count() > 0) { + if (statistics.num_values() == 0 && may_has_null) { return is_null(std::move(field_expr)); } @@ -378,7 +379,6 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr auto maybe_min = Cast(min, field.type()); auto maybe_max = Cast(max, field.type()); - if (maybe_min.ok() && maybe_max.ok()) { min = maybe_min.MoveValueUnsafe().scalar(); max = maybe_max.MoveValueUnsafe().scalar(); @@ -386,7 +386,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr if (min->Equals(*max)) { auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); - if (statistics.null_count() == 0) { + if (!may_has_null) { return single_value; } return compute::or_(std::move(single_value), is_null(std::move(field_expr))); @@ -412,8 +412,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr } else { in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); } - - if (statistics.null_count() != 0) { + if (may_has_null) { return compute::or_(std::move(in_range), compute::is_null(field_expr)); } return in_range; From 132363cca8fc312811654efdb5dc1ab061ed3462 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 22 Aug 2024 18:57:16 +0800 Subject: [PATCH 2/8] add test --- cpp/src/arrow/dataset/file_parquet.cc | 12 ++++-- cpp/src/arrow/dataset/file_parquet_test.cc | 47 ++++++++++++++++++++++ 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 27273066bf9cf..f54dc69a9c826 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -366,10 +366,16 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr const parquet::Statistics& statistics) { auto field_expr = compute::field_ref(field_ref); - bool may_has_null = !statistics.HasNullCount() || statistics.null_count() != 0; + bool may_has_null = !statistics.HasNullCount() || statistics.null_count() > 0; + bool must_has_null = statistics.HasNullCount() && statistics.null_count() > 0; // Optimize for corner case where all values are nulls - if (statistics.num_values() == 0 && may_has_null) { - return is_null(std::move(field_expr)); + if (statistics.num_values() == 0) { + if (must_has_null) { + return is_null(std::move(field_expr)); + } + // If there are no values and no nulls, it might be empty or contains + // only null. + return std::nullopt; } std::shared_ptr min, max; diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index bf626826d4d1b..8e57d34cba136 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -841,6 +841,53 @@ TEST(TestParquetStatistics, NullMax) { EXPECT_EQ(stat_expression->ToString(), "(x >= 1)"); } +TEST(TestParquetStatistics, NoNullCount) { + auto field = ::arrow::field("x", int32()); + auto parquet_node_ptr = ::parquet::schema::Int32("x", ::parquet::Repetition::REQUIRED); + ::parquet::ColumnDescriptor descr(parquet_node_ptr, /*max_definition_level=*/1, + /*max_repetition_level=*/0); + + auto int32_to_parquet_stats = [](int32_t v) { + std::string value; + value.resize(sizeof(int32_t)); + memcpy(value.data(), &v, sizeof(int32_t)); + return value; + }; + { + // Base case: when null_count is not set, the expression might contain null + ::parquet::EncodedStatistics encoded_stats; + encoded_stats.set_min(int32_to_parquet_stats(1)); + encoded_stats.set_max(int32_to_parquet_stats(100)); + encoded_stats.has_null_count = false; + encoded_stats.all_null_value = false; + encoded_stats.null_count = 0; + auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/10); + + auto stat_expression = + ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats); + EXPECT_EQ(stat_expression->ToString(), + "(((x >= 1) and (x <= 100)) or is_null(x, {nan_is_null=false}))"); + } + { + // Special case: when num_value is 0, if has_null, it would return + // "is_null", otherwise it cannot gurantees anything + ::parquet::EncodedStatistics encoded_stats; + encoded_stats.has_null_count = true; + encoded_stats.null_count = 1; + encoded_stats.all_null_value = true; + auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0); + auto stat_expression = + ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats); + EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})"); + + encoded_stats.has_null_count = false; + encoded_stats.all_null_value = false; + stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0); + stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats); + EXPECT_FALSE(stat_expression.has_value()); + } +} + class DelayedBufferReader : public ::arrow::io::BufferReader { public: explicit DelayedBufferReader(const std::shared_ptr<::arrow::Buffer>& buffer) From 49f41e25290e179c7319f7665c9102f1f1895b80 Mon Sep 17 00:00:00 2001 From: mwish <1506118561@qq.com> Date: Tue, 3 Sep 2024 21:41:47 +0800 Subject: [PATCH 3/8] Update cpp/src/arrow/dataset/file_parquet.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/dataset/file_parquet.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index f54dc69a9c826..6184a7eeab9ba 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -419,7 +419,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); } if (may_has_null) { - return compute::or_(std::move(in_range), compute::is_null(field_expr)); + return compute::or_(std::move(in_range), compute::is_null(std::move(field_expr))); } return in_range; } From caad9a1987cb41f4135523b4c739a68b88b9451d Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 3 Sep 2024 21:48:12 +0800 Subject: [PATCH 4/8] apply suggestions --- cpp/src/arrow/dataset/file_parquet.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 6184a7eeab9ba..33c32edbe9517 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -366,11 +366,11 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr const parquet::Statistics& statistics) { auto field_expr = compute::field_ref(field_ref); - bool may_has_null = !statistics.HasNullCount() || statistics.null_count() > 0; - bool must_has_null = statistics.HasNullCount() && statistics.null_count() > 0; + bool may_have_null = !statistics.HasNullCount() || statistics.null_count() > 0; + bool has_null = statistics.HasNullCount() && statistics.null_count() > 0; // Optimize for corner case where all values are nulls if (statistics.num_values() == 0) { - if (must_has_null) { + if (has_null) { return is_null(std::move(field_expr)); } // If there are no values and no nulls, it might be empty or contains @@ -392,7 +392,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr if (min->Equals(*max)) { auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); - if (!may_has_null) { + if (!may_have_null) { return single_value; } return compute::or_(std::move(single_value), is_null(std::move(field_expr))); @@ -418,7 +418,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr } else { in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); } - if (may_has_null) { + if (may_have_nulll) { return compute::or_(std::move(in_range), compute::is_null(std::move(field_expr))); } return in_range; @@ -428,7 +428,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr std::optional ParquetFileFragment::EvaluateStatisticsAsExpression( const Field& field, const parquet::Statistics& statistics) { - const auto field_name = field.name(); + auto field_name = field.name(); return EvaluateStatisticsAsExpression(field, FieldRef(std::move(field_name)), statistics); } From 0060abf01e23feb9d65d800fe24a099aa0bcebf6 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 3 Sep 2024 22:56:38 +0800 Subject: [PATCH 5/8] fix build --- cpp/src/arrow/dataset/file_parquet.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 33c32edbe9517..051287492b78b 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -418,7 +418,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr } else { in_range = compute::and_(std::move(lower_bound), std::move(upper_bound)); } - if (may_have_nulll) { + if (may_have_null) { return compute::or_(std::move(in_range), compute::is_null(std::move(field_expr))); } return in_range; From 374b5f7861bd3bc55ee74bcf2f9f0c2d3896ff6c Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 5 Sep 2024 08:46:20 +0800 Subject: [PATCH 6/8] apply suggestions --- cpp/src/arrow/dataset/file_parquet.cc | 12 +++++------- cpp/src/arrow/dataset/file_parquet_test.cc | 9 ++++++--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 051287492b78b..93eaa7a659d14 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -367,15 +367,13 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr auto field_expr = compute::field_ref(field_ref); bool may_have_null = !statistics.HasNullCount() || statistics.null_count() > 0; - bool has_null = statistics.HasNullCount() && statistics.null_count() > 0; // Optimize for corner case where all values are nulls if (statistics.num_values() == 0) { - if (has_null) { - return is_null(std::move(field_expr)); - } - // If there are no values and no nulls, it might be empty or contains - // only null. - return std::nullopt; + // If `statistics.HasNullCount()`, it means the all the values are nulls. + // + // If there are no values and no nulls, it might be empty or all values + // are nulls. In this case, we also return a null expression. + return is_null(std::move(field_expr)); } std::shared_ptr min, max; diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 8e57d34cba136..2c05dcd9be459 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -865,12 +865,13 @@ TEST(TestParquetStatistics, NoNullCount) { auto stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats); + ASSERT_TRUE(stat_expression.has_value()); EXPECT_EQ(stat_expression->ToString(), "(((x >= 1) and (x <= 100)) or is_null(x, {nan_is_null=false}))"); } { - // Special case: when num_value is 0, if has_null, it would return - // "is_null", otherwise it cannot gurantees anything + // Special case: when num_value is 0, it would return + // "is_null". ::parquet::EncodedStatistics encoded_stats; encoded_stats.has_null_count = true; encoded_stats.null_count = 1; @@ -878,13 +879,15 @@ TEST(TestParquetStatistics, NoNullCount) { auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0); auto stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats); + ASSERT_TRUE(stat_expression.has_value()); EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})"); encoded_stats.has_null_count = false; encoded_stats.all_null_value = false; stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0); stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats); - EXPECT_FALSE(stat_expression.has_value()); + ASSERT_TRUE(stat_expression.has_value()); + EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})"); } } From b2c5e9a3bd253a1288ac7c6e8c1e6a44330072c4 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 5 Sep 2024 23:32:36 +0800 Subject: [PATCH 7/8] update comment --- cpp/src/arrow/dataset/file_parquet.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 93eaa7a659d14..8d019924b8a82 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -371,8 +371,9 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr if (statistics.num_values() == 0) { // If `statistics.HasNullCount()`, it means the all the values are nulls. // - // If there are no values and no nulls, it might be empty or all values - // are nulls. In this case, we also return a null expression. + // If there are no values and `!statistics.HasNullCount()`, it might be + // empty or all values are nulls. In this case, we also return a null + // expression. return is_null(std::move(field_expr)); } From daad33ef203415c8b422377822e0ca43624aa429 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 5 Sep 2024 23:51:39 +0800 Subject: [PATCH 8/8] apply suggestions --- cpp/src/arrow/dataset/file_parquet.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 8d019924b8a82..ca391b4354c07 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -369,11 +369,9 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr bool may_have_null = !statistics.HasNullCount() || statistics.null_count() > 0; // Optimize for corner case where all values are nulls if (statistics.num_values() == 0) { - // If `statistics.HasNullCount()`, it means the all the values are nulls. - // - // If there are no values and `!statistics.HasNullCount()`, it might be - // empty or all values are nulls. In this case, we also return a null - // expression. + // If there are no non-null values, column `field_ref` in the fragment + // might be empty or all values are nulls. In this case, we also return + // a null expression. return is_null(std::move(field_expr)); }