Skip to content

Commit

Permalink
apacheGH-41760: [C++][Parquet] Add file metadata read/write benchmark (
Browse files Browse the repository at this point in the history
…apache#41761)

Following the discussions on the Parquet ML (see [this thread](https://lists.apache.org/thread/5jyhzkwyrjk9z52g0b49g31ygnz73gxo) and [this thread](https://lists.apache.org/thread/vs3w2z5bk6s3c975rrkqdttr1dpsdn7h)), and the various complaints about poor Parquet metadata performance on wide schemas, this adds a benchmark to measure the overhead of Parquet file metadata parsing or serialization for different numbers of row groups and columns.

Sample output:
```
-----------------------------------------------------------------------------------------------------------------------
Benchmark                                                             Time             CPU   Iterations UserCounters...
-----------------------------------------------------------------------------------------------------------------------
WriteFileMetadataAndData/num_columns:1/num_row_groups:1           11743 ns        11741 ns        59930 data_size=54 file_size=290 items_per_second=85.1726k/s
WriteFileMetadataAndData/num_columns:1/num_row_groups:100        843137 ns       842920 ns          832 data_size=5.4k file_size=20.486k items_per_second=1.18635k/s
WriteFileMetadataAndData/num_columns:1/num_row_groups:1000      8232304 ns      8230294 ns           85 data_size=54k file_size=207.687k items_per_second=121.502/s
WriteFileMetadataAndData/num_columns:10/num_row_groups:1         101214 ns       101190 ns         6910 data_size=540 file_size=2.11k items_per_second=9.8824k/s
WriteFileMetadataAndData/num_columns:10/num_row_groups:100      8026185 ns      8024361 ns           87 data_size=54k file_size=193.673k items_per_second=124.621/s
WriteFileMetadataAndData/num_columns:10/num_row_groups:1000    81370293 ns     81343455 ns            8 data_size=540k file_size=1.94392M items_per_second=12.2936/s
WriteFileMetadataAndData/num_columns:100/num_row_groups:1        955862 ns       955528 ns          733 data_size=5.4k file_size=20.694k items_per_second=1.04654k/s
WriteFileMetadataAndData/num_columns:100/num_row_groups:100    80115516 ns     80086117 ns            9 data_size=540k file_size=1.94729M items_per_second=12.4866/s
WriteFileMetadataAndData/num_columns:100/num_row_groups:1000  856428565 ns    856065370 ns            1 data_size=5.4M file_size=19.7673M items_per_second=1.16814/s
WriteFileMetadataAndData/num_columns:1000/num_row_groups:1      9330003 ns      9327439 ns           75 data_size=54k file_size=211.499k items_per_second=107.211/s
WriteFileMetadataAndData/num_columns:1000/num_row_groups:100  834609159 ns    834354590 ns            1 data_size=5.4M file_size=19.9623M items_per_second=1.19853/s

ReadFileMetadata/num_columns:1/num_row_groups:1                    3824 ns         3824 ns       182381 data_size=54 file_size=290 items_per_second=261.518k/s
ReadFileMetadata/num_columns:1/num_row_groups:100                 88519 ns        88504 ns         7879 data_size=5.4k file_size=20.486k items_per_second=11.299k/s
ReadFileMetadata/num_columns:1/num_row_groups:1000               849558 ns       849391 ns          825 data_size=54k file_size=207.687k items_per_second=1.17731k/s
ReadFileMetadata/num_columns:10/num_row_groups:1                  19918 ns        19915 ns        35449 data_size=540 file_size=2.11k items_per_second=50.2138k/s
ReadFileMetadata/num_columns:10/num_row_groups:100               715822 ns       715667 ns          975 data_size=54k file_size=193.673k items_per_second=1.3973k/s
ReadFileMetadata/num_columns:10/num_row_groups:1000             7017008 ns      7015432 ns          100 data_size=540k file_size=1.94392M items_per_second=142.543/s
ReadFileMetadata/num_columns:100/num_row_groups:1                175988 ns       175944 ns         3958 data_size=5.4k file_size=20.694k items_per_second=5.68363k/s
ReadFileMetadata/num_columns:100/num_row_groups:100             6814382 ns      6812781 ns          103 data_size=540k file_size=1.94729M items_per_second=146.783/s
ReadFileMetadata/num_columns:100/num_row_groups:1000           77858645 ns     77822157 ns            9 data_size=5.4M file_size=19.7673M items_per_second=12.8498/s
ReadFileMetadata/num_columns:1000/num_row_groups:1              1670001 ns      1669563 ns          419 data_size=54k file_size=211.499k items_per_second=598.959/s
ReadFileMetadata/num_columns:1000/num_row_groups:100           77339599 ns     77292924 ns            9 data_size=5.4M file_size=19.9623M items_per_second=12.9378/s
```

* GitHub Issue: apache#41760

Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
pitrou authored May 22, 2024
1 parent 065a6da commit f3d4639
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 0 deletions.
1 change: 1 addition & 0 deletions cpp/src/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,7 @@ add_parquet_benchmark(column_reader_benchmark)
add_parquet_benchmark(column_io_benchmark)
add_parquet_benchmark(encoding_benchmark)
add_parquet_benchmark(level_conversion_benchmark)
add_parquet_benchmark(metadata_benchmark)
add_parquet_benchmark(page_index_benchmark SOURCES page_index_benchmark.cc
benchmark_util.cc)
add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow")
156 changes: 156 additions & 0 deletions cpp/src/parquet/metadata_benchmark.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <memory>
#include <sstream>

#include <benchmark/benchmark.h>

#include "arrow/buffer.h"
#include "arrow/io/memory.h"
#include "arrow/util/logging.h"

#include "parquet/column_writer.h"
#include "parquet/file_reader.h"
#include "parquet/file_writer.h"
#include "parquet/metadata.h"
#include "parquet/platform.h"
#include "parquet/schema.h"

namespace parquet {

using ::arrow::Buffer;
using ::arrow::io::BufferOutputStream;
using ::arrow::io::BufferReader;
using schema::GroupNode;
using schema::NodePtr;
using schema::NodeVector;

class MetadataBenchmark {
public:
explicit MetadataBenchmark(benchmark::State* state)
: MetadataBenchmark(static_cast<int>(state->range(0)),
static_cast<int>(state->range(1))) {}

MetadataBenchmark(int num_columns, int num_row_groups)
: num_columns_(num_columns), num_row_groups_(num_row_groups) {
NodeVector fields;
for (int i = 0; i < num_columns_; ++i) {
std::stringstream ss;
ss << "col" << i;
fields.push_back(parquet::schema::Int32(ss.str(), Repetition::REQUIRED));
}
schema_root_ = std::static_pointer_cast<GroupNode>(
GroupNode::Make("schema", Repetition::REQUIRED, fields));

WriterProperties::Builder prop_builder;
writer_properties_ = prop_builder.version(ParquetVersion::PARQUET_2_6)
->disable_dictionary()
->data_page_version(ParquetDataPageVersion::V2)
->build();
}

std::shared_ptr<Buffer> WriteFile(benchmark::State* state) {
PARQUET_ASSIGN_OR_THROW(auto sink, BufferOutputStream::Create());

auto writer = ParquetFileWriter::Open(sink, schema_root_, writer_properties_);
std::vector<int32_t> int32_values(1, 42);
int64_t data_size = 0;
for (int rg = 0; rg < num_row_groups_; ++rg) {
auto row_group_writer = writer->AppendRowGroup();
for (int col = 0; col < num_columns_; ++col) {
auto col_writer = row_group_writer->NextColumn();
ARROW_CHECK_EQ(col_writer->type(), Type::INT32);
auto typed_col_writer = static_cast<Int32Writer*>(col_writer);
typed_col_writer->WriteBatch(
/*num_values=*/static_cast<int64_t>(int32_values.size()),
/*def_levels=*/nullptr, /*rep_levels=*/nullptr, int32_values.data());
typed_col_writer->Close();
}
row_group_writer->Close();
data_size += row_group_writer->total_compressed_bytes_written();
}
writer->Close();
PARQUET_ASSIGN_OR_THROW(auto buf, sink->Finish());
state->counters["file_size"] = static_cast<double>(buf->size());
// Note that "data_size" includes the Thrift page headers
state->counters["data_size"] = static_cast<double>(data_size);
return buf;
}

void ReadFile(std::shared_ptr<Buffer> contents) {
auto source = std::make_shared<BufferReader>(contents);
ReaderProperties props;
auto reader = ParquetFileReader::Open(source, props);
auto metadata = reader->metadata();
ARROW_CHECK_EQ(metadata->num_columns(), num_columns_);
ARROW_CHECK_EQ(metadata->num_row_groups(), num_row_groups_);
// There should be one row per row group
ARROW_CHECK_EQ(metadata->num_rows(), num_row_groups_);
reader->Close();
}

private:
int num_columns_;
int num_row_groups_;
std::shared_ptr<GroupNode> schema_root_;
std::shared_ptr<WriterProperties> writer_properties_;
};

// Generate the (num_columns, num_row_groups) argument grid shared by the
// write and read benchmarks.
void WriteMetadataSetArgs(benchmark::internal::Benchmark* bench) {
  bench->ArgNames({"num_columns", "num_row_groups"});

  // Full cartesian product for moderate column counts.
  constexpr int64_t kColumnCounts[] = {1, 10, 100};
  constexpr int64_t kRowGroupCounts[] = {1, 100, 1000};
  for (int64_t columns : kColumnCounts) {
    for (int64_t row_groups : kRowGroupCounts) {
      bench->Args({columns, row_groups});
    }
  }
  /* For larger num_columns, restrict num_row_groups to small values
   * to avoid blowing up benchmark execution time.
   */
  for (int64_t row_groups : {int64_t{1}, int64_t{100}}) {
    bench->Args({/*num_columns=*/1000, row_groups});
  }
}

// The read benchmark runs over the same argument grid as the write benchmark.
void ReadMetadataSetArgs(benchmark::internal::Benchmark* bench) {
  WriteMetadataSetArgs(bench);
}

// Benchmark serializing a whole Parquet file (metadata and data) to memory.
// With a single value per column chunk, metadata writing dominates the cost.
void WriteFileMetadataAndData(benchmark::State& state) {
  MetadataBenchmark benchmark(&state);

  for (auto _ : state) {
    // The returned buffer is deliberately discarded: we only measure the cost
    // of producing it.  (Previously it was bound to an unused local.)
    benchmark.WriteFile(&state);
  }
  state.SetItemsProcessed(state.iterations());
}

// Benchmark parsing file metadata from an in-memory Parquet file.
// The file is written once up front; only the read loop is timed.
void ReadFileMetadata(benchmark::State& state) {
  MetadataBenchmark benchmark(&state);
  const auto contents = benchmark.WriteFile(&state);

  for (auto _ : state) {
    benchmark.ReadFile(contents);
  }
  state.SetItemsProcessed(state.iterations());
}

// Register both benchmarks with the shared (num_columns, num_row_groups) grid.
BENCHMARK(WriteFileMetadataAndData)->Apply(WriteMetadataSetArgs);
BENCHMARK(ReadFileMetadata)->Apply(ReadMetadataSetArgs);

} // namespace parquet

0 comments on commit f3d4639

Please sign in to comment.