// Patch overview (leading commentary; the patch text below is unchanged).
//
// This is a unified git diff over two files whose line breaks have been
// collapsed, so every hunk runs together on the lines below.
//
// cpp/src/parquet/encoding.cc -- three hunks, all under the hunk context
// "class PlainEncoder : public EncoderImpl, virtual public BooleanEnco...":
//   * Adds DCHECK_EQ(0, bit_writer_.bytes_written()) ahead of the cast of
//     `values`; the added comment states that putting an Arrow array cannot be
//     mixed with PlainEncoder::PutImpl.
//   * Adds a comment in the per-element loop noting that only valid (non-null)
//     entries reach `writer.Next`.
//   * Changes sink_.UnsafeAdvance(data.length()) to
//     sink_.UnsafeAdvance(data.length() - data.null_count()), so the sink is
//     advanced by the number of non-null values instead of the array length.
//
// cpp/src/parquet/encoding_test.cc -- three hunks:
//   * In TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut), replaces
//     acc.builder.reset(new ::arrow::StringBuilder) with std::make_unique.
//   * Adds EncodingAdHocTyped::PlainTwice(int seed): Puts the same generated
//     array into one PLAIN encoder twice, flushes, decodes the buffer with
//     DecodeArrow, and compares the result with the concatenation of the array
//     with itself.
//   * Adds TYPED_TEST(EncodingAdHocTyped, PlainArrowDirectPut2), which calls
//     PlainTwice for seeds 0 through 9.
//
// NOTE(review): several template argument lists look stripped in this copy
// (e.g. `checked_cast(values)`, `static_cast(buf->size())`,
// `std::is_same::value`, `MakeTypedEncoder(`) -- presumably an extraction
// artifact; restore them from the original patch before applying.
// NOTE(review): PlainTwice asserts a hard-coded result length of 100 --
// presumably GetValues(seed) yields 50 values; confirm against the fixture.
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index dda0e7701b1e4..429d446898e31 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -340,6 +340,8 @@ class PlainEncoder : public EncoderImpl, virtual public BooleanEnco throw ParquetException("direct put to boolean from " + values.type()->ToString() + " not supported"); } + // Put arrow array cannot mix with PlainEncoder::PutImpl. + DCHECK_EQ(0, bit_writer_.bytes_written()); const auto& data = checked_cast(values); if (data.null_count() == 0) { @@ -354,6 +356,7 @@ class PlainEncoder : public EncoderImpl, virtual public BooleanEnco sink_.length(), n_valid); for (int64_t i = 0; i < data.length(); i++) { + // Only valid boolean data will call `writer.Next`. if (data.IsValid(i)) { if (data.Value(i)) { writer.Set(); @@ -365,7 +368,7 @@ class PlainEncoder : public EncoderImpl, virtual public BooleanEnco } writer.Finish(); } - sink_.UnsafeAdvance(data.length()); + sink_.UnsafeAdvance(data.length() - data.null_count()); } private: diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 7a910e4220831..006f0a1594150 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -580,7 +580,7 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { decoder->SetData(num_values, buf->data(), static_cast(buf->size())); typename EncodingTraits::Accumulator acc; - acc.builder.reset(new ::arrow::StringBuilder); + acc.builder = std::make_unique<::arrow::StringBuilder>(); ASSERT_EQ(num_values, decoder->DecodeArrow(static_cast(values->length()), static_cast(values->null_count()), @@ -665,6 +665,33 @@ class EncodingAdHocTyped : public ::testing::Test { ::arrow::AssertArraysEqual(*values, *result, /*verbose=*/true); } + void PlainTwice(int seed) { + auto values_single = GetValues(seed); + auto encoder = MakeTypedEncoder( + Encoding::PLAIN, /*use_dictionary=*/false, column_descr()); + auto decoder = MakeTypedDecoder(Encoding::PLAIN, 
column_descr()); + + ASSERT_NO_THROW(encoder->Put(*values_single)); + ASSERT_NO_THROW(encoder->Put(*values_single)); + auto buf = encoder->FlushValues(); + + EXPECT_OK_AND_ASSIGN(auto values, + ::arrow::Concatenate({values_single, values_single})); + decoder->SetData(static_cast(values->length()), buf->data(), + static_cast(buf->size())); + + BuilderType acc(arrow_type(), ::arrow::default_memory_pool()); + ASSERT_EQ(values->length() - values->null_count(), + decoder->DecodeArrow(static_cast(values->length()), + static_cast(values->null_count()), + values->null_bitmap_data(), values->offset(), &acc)); + + std::shared_ptr<::arrow::Array> result; + ASSERT_OK(acc.Finish(&result)); + ASSERT_EQ(100, result->length()); + ::arrow::AssertArraysEqual(*values, *result, /*verbose=*/true); + } + void ByteStreamSplit(int seed) { if (!std::is_same::value && !std::is_same::value) { @@ -882,6 +909,12 @@ TYPED_TEST(EncodingAdHocTyped, PlainArrowDirectPut) { } } +TYPED_TEST(EncodingAdHocTyped, PlainArrowDirectPut2) { + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + this->PlainTwice(seed); + } +} + TYPED_TEST(EncodingAdHocTyped, ByteStreamSplitArrowDirectPut) { for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { this->ByteStreamSplit(seed);