Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-43712: [C++][Parquet] Dataset: Handle num-nulls in Parquet correctly when !HasNullCount() #43726

Merged
merged 10 commits into from
Sep 6, 2024
18 changes: 11 additions & 7 deletions cpp/src/arrow/dataset/file_parquet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -366,8 +366,14 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr
const parquet::Statistics& statistics) {
auto field_expr = compute::field_ref(field_ref);

bool may_have_null = !statistics.HasNullCount() || statistics.null_count() > 0;
// Optimize for corner case where all values are nulls
if (statistics.num_values() == 0 && statistics.null_count() > 0) {
if (statistics.num_values() == 0) {
// If `statistics.HasNullCount()`, it means the all the values are nulls.
//
// If there are no values and `!statistics.HasNullCount()`, it might be
pitrou marked this conversation as resolved.
Show resolved Hide resolved
// empty or all values are nulls. In this case, we also return a null
// expression.
return is_null(std::move(field_expr));
}

Expand All @@ -378,15 +384,14 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr

auto maybe_min = Cast(min, field.type());
auto maybe_max = Cast(max, field.type());

if (maybe_min.ok() && maybe_max.ok()) {
min = maybe_min.MoveValueUnsafe().scalar();
max = maybe_max.MoveValueUnsafe().scalar();

if (min->Equals(*max)) {
auto single_value = compute::equal(field_expr, compute::literal(std::move(min)));

if (statistics.null_count() == 0) {
if (!may_have_null) {
return single_value;
}
return compute::or_(std::move(single_value), is_null(std::move(field_expr)));
Expand All @@ -412,9 +417,8 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr
} else {
in_range = compute::and_(std::move(lower_bound), std::move(upper_bound));
}

if (statistics.null_count() != 0) {
return compute::or_(std::move(in_range), compute::is_null(field_expr));
if (may_have_null) {
return compute::or_(std::move(in_range), compute::is_null(std::move(field_expr)));
}
return in_range;
}
Expand All @@ -423,7 +427,7 @@ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpr

std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpression(
const Field& field, const parquet::Statistics& statistics) {
const auto field_name = field.name();
auto field_name = field.name();
return EvaluateStatisticsAsExpression(field, FieldRef(std::move(field_name)),
statistics);
}
Expand Down
50 changes: 50 additions & 0 deletions cpp/src/arrow/dataset/file_parquet_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,56 @@ TEST(TestParquetStatistics, NullMax) {
EXPECT_EQ(stat_expression->ToString(), "(x >= 1)");
}

TEST(TestParquetStatistics, NoNullCount) {
auto field = ::arrow::field("x", int32());
auto parquet_node_ptr = ::parquet::schema::Int32("x", ::parquet::Repetition::REQUIRED);
::parquet::ColumnDescriptor descr(parquet_node_ptr, /*max_definition_level=*/1,
/*max_repetition_level=*/0);

auto int32_to_parquet_stats = [](int32_t v) {
std::string value;
value.resize(sizeof(int32_t));
memcpy(value.data(), &v, sizeof(int32_t));
return value;
};
{
// Base case: when null_count is not set, the expression might contain null
::parquet::EncodedStatistics encoded_stats;
encoded_stats.set_min(int32_to_parquet_stats(1));
encoded_stats.set_max(int32_to_parquet_stats(100));
encoded_stats.has_null_count = false;
encoded_stats.all_null_value = false;
encoded_stats.null_count = 0;
auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/10);

auto stat_expression =
ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
ASSERT_TRUE(stat_expression.has_value());
EXPECT_EQ(stat_expression->ToString(),
"(((x >= 1) and (x <= 100)) or is_null(x, {nan_is_null=false}))");
}
{
// Special case: when num_value is 0, it would return
// "is_null".
::parquet::EncodedStatistics encoded_stats;
encoded_stats.has_null_count = true;
encoded_stats.null_count = 1;
encoded_stats.all_null_value = true;
auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0);
auto stat_expression =
ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
ASSERT_TRUE(stat_expression.has_value());
EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})");

encoded_stats.has_null_count = false;
encoded_stats.all_null_value = false;
stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0);
stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats);
ASSERT_TRUE(stat_expression.has_value());
EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})");
}
}

class DelayedBufferReader : public ::arrow::io::BufferReader {
public:
explicit DelayedBufferReader(const std::shared_ptr<::arrow::Buffer>& buffer)
Expand Down
Loading