Skip to content

Commit

Permalink
GH-42102: [C++][Parquet] Add binary that extracts a footer from a par…
Browse files Browse the repository at this point in the history
…quet file (#42174)

### Rationale for this change

This binary will make it a lot easier for customers to share their parquet metadata with the community so that we can build a repository of footers that can be used for advancing the state of metadata in parquet.

### What changes are included in this PR?

Usage, as printed by the binary itself:
```
Usage: parquet-dump-footer
  -h|--help    Print help and exit
  --no-scrub   Do not scrub potentially confidential metadata
  --debug      Output text representation of footer for inspection
  --in <uri>   Input file (required): must be an URI or an absolute local path
  --out <path> Output file (optional, default stdout)

  Dump the footer of a Parquet file to stdout or to a file, optionally with
  potentially confidential metadata scrubbed.
```

### Are these changes tested?

Manually on existing parquet files.

### Are there any user-facing changes?

No.

* GitHub Issue: #42102

Lead-authored-by: Alkis Evlogimenos <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
alkis and pitrou authored Jul 22, 2024
1 parent 3483ac6 commit d21a924
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 1 deletion.
65 changes: 65 additions & 0 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#include <cinttypes>
#include <memory>
#include <ostream>
#include <random>
#include <sstream>
#include <string>
#include <string_view>
#include <utility>
Expand All @@ -29,6 +31,7 @@
#include "arrow/io/memory.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/pcg_random.h"
#include "parquet/encryption/encryption_internal.h"
#include "parquet/encryption/internal_file_decryptor.h"
#include "parquet/exception.h"
Expand Down Expand Up @@ -599,6 +602,49 @@ std::vector<SortingColumn> RowGroupMetaData::sorting_columns() const {
return impl_->sorting_columns();
}

// Overwrite every character of `*s`, in place, with a randomly drawn uppercase
// ASCII letter. The length of the string is left unchanged, so an empty string
// stays empty.
static void Scrub(std::string* s) {
  // Function-local static, default-constructed: the generator state persists
  // across calls, so one random number is consumed per scrubbed character.
  static ::arrow::random::pcg64 rng;
  constexpr int kFirstUpper = 65;  // 'A' in ASCII
  constexpr int kLastUpper = 90;   // 'Z' in ASCII
  std::uniform_int_distribution<> pick_letter(kFirstUpper, kLastUpper);
  std::string& text = *s;
  for (std::string::size_type i = 0; i < text.size(); ++i) {
    text[i] = pick_letter(rng);
  }
}

// Overwrite, in place, the potentially sensitive strings of a Thrift footer
// with random data (see Scrub(std::string*)). The strings touched are:
//   - the name of every schema element;
//   - per column chunk: file path, path in schema, key/value metadata,
//     min/max statistics, column-key crypto metadata and the encrypted
//     column metadata blob;
//   - the file-level key/value metadata and footer signing key metadata.
// Strings are visited in a fixed order; since all of them draw from the same
// random generator, that order determines the scrubbed output.
static void Scrub(format::FileMetaData* md) {
  // Scrub each component of a schema path.
  const auto scrub_path = [](auto& components) {
    for (auto& component : components) {
      Scrub(&component);
    }
  };
  // Scrub every key/value pair, key first and then value.
  const auto scrub_key_values = [](auto& pairs) {
    for (auto& pair : pairs) {
      Scrub(&pair.key);
      Scrub(&pair.value);
    }
  };

  for (auto& element : md->schema) {
    Scrub(&element.name);
  }

  for (auto& row_group : md->row_groups) {
    for (auto& column : row_group.columns) {
      Scrub(&column.file_path);

      if (column.__isset.meta_data) {
        auto& column_md = column.meta_data;
        scrub_path(column_md.path_in_schema);
        scrub_key_values(column_md.key_value_metadata);
        auto& stats = column_md.statistics;
        Scrub(&stats.max_value);
        Scrub(&stats.min_value);
        Scrub(&stats.min);
        Scrub(&stats.max);
      }

      auto& crypto = column.crypto_metadata;
      if (crypto.__isset.ENCRYPTION_WITH_COLUMN_KEY) {
        auto& column_key = crypto.ENCRYPTION_WITH_COLUMN_KEY;
        scrub_path(column_key.path_in_schema);
        Scrub(&column_key.key_metadata);
      }

      Scrub(&column.encrypted_column_metadata);
    }
  }

  scrub_key_values(md->key_value_metadata);
  Scrub(&md->footer_signing_key_metadata);
}

// file metadata
class FileMetaData::FileMetaDataImpl {
public:
Expand Down Expand Up @@ -821,6 +867,21 @@ class FileMetaData::FileMetaDataImpl {
return out;
}

// Serialize the footer metadata without encryption and return it as a string.
//
// scrub: when true, sensitive strings are randomized via Scrub() first.
// debug: when true, the result is the text produced by the Thrift struct's
//        printTo(); otherwise it is the output of ThriftSerializer.
std::string SerializeUnencrypted(bool scrub, bool debug) const {
  // Work on a copy so that scrubbing never alters the stored metadata.
  auto footer = *metadata_;
  if (scrub) {
    Scrub(&footer);
  }

  if (!debug) {
    std::string thrift_bytes;
    ThriftSerializer serializer;
    serializer.SerializeToString(&footer, &thrift_bytes);
    return thrift_bytes;
  }

  std::ostringstream text;
  footer.printTo(text);
  return text.str();
}

// Store `file_decryptor` in the file_decryptor_ member. The shared_ptr is taken
// by value and moved into the member, so no additional copy of it is made here.
void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor) {
file_decryptor_ = std::move(file_decryptor);
}
Expand Down Expand Up @@ -992,6 +1053,10 @@ std::shared_ptr<FileMetaData> FileMetaData::Subset(
return impl_->Subset(row_groups);
}

// Serialize the metadata unencrypted; forwards to the implementation object.
//
// scrub: whether sensitive strings are replaced with random data first.
// debug: whether to produce debug text (true) or Thrift bytes (false).
//
// The second parameter is named `debug` to agree with the declaration of this
// method and with the implementation it forwards to.
std::string FileMetaData::SerializeUnencrypted(bool scrub, bool debug) const {
  return impl_->SerializeUnencrypted(scrub, debug);
}

void FileMetaData::WriteTo(::arrow::io::OutputStream* dst,
const std::shared_ptr<Encryptor>& encryptor) const {
return impl_->WriteTo(dst, encryptor);
Expand Down
7 changes: 7 additions & 0 deletions cpp/src/parquet/metadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,13 @@ class PARQUET_EXPORT FileMetaData {
/// FileMetaData.
std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;

/// \brief Serialize metadata unencrypted as string
///
/// \param[in] scrub whether to replace potentially sensitive strings in the
/// metadata (such as schema names, key-value metadata and statistics) with
/// random-generated uppercase characters before serializing.
/// \param[in] debug whether to serialize the metadata as Thrift (if false) or
/// debug text (if true).
/// \return the serialized metadata.
std::string SerializeUnencrypted(bool scrub, bool debug) const;

private:
friend FileMetaDataBuilder;
friend class SerializedFile;
Expand Down
3 changes: 2 additions & 1 deletion cpp/tools/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# under the License.

if(PARQUET_BUILD_EXECUTABLES)
set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan)
set(PARQUET_TOOLS parquet-dump-footer parquet-dump-schema parquet-reader parquet-scan)

foreach(TOOL ${PARQUET_TOOLS})
string(REGEX REPLACE "-" "_" TOOL_SOURCE ${TOOL})
Expand All @@ -31,6 +31,7 @@ if(PARQUET_BUILD_EXECUTABLES)
install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
endforeach(TOOL)
target_link_libraries(parquet-dump-footer ${ARROW_LIBRARIES})

add_dependencies(parquet ${PARQUET_TOOLS})
endif()
135 changes: 135 additions & 0 deletions cpp/tools/parquet/parquet_dump_footer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <optional>

#include "arrow/filesystem/filesystem.h"
#include "arrow/util/endian.h"
#include "arrow/util/ubsan.h"
#include "parquet/metadata.h"

namespace parquet {
namespace {
// Decode the 32-bit little-endian unsigned integer stored in the 4 bytes
// starting at `p` and return it in host byte order.
uint32_t ReadLE32(const void* p) {
  const auto* bytes = static_cast<const uint8_t*>(p);
  const uint32_t raw = ::arrow::util::SafeLoadAs<uint32_t>(bytes);
  return ::arrow::bit_util::FromLittleEndian(raw);
}

// Append `v` to `*out` as 4 bytes in little-endian order.
void AppendLE32(uint32_t v, std::string* out) {
  const uint32_t little_endian = ::arrow::bit_util::ToLittleEndian(v);
  const char* bytes = reinterpret_cast<const char*>(&little_endian);
  out->append(bytes, sizeof(little_endian));
}

// Extract the footer of the Parquet file `in` and write it to `out`, or to
// stdout when `out` is empty.
//
// in:    URI or path of the input file.
// scrub: forwarded to FileMetaData::SerializeUnencrypted().
// json:  when true, the serialized text is written as is; when false, the
//        serialized footer is followed by its 4-byte little-endian length and
//        the "PAR1" magic.
// out:   output path; empty means stdout.
//
// Returns 0 on success, 3 or 5 if the file is too short, 4 if the trailing
// magic is not "PAR1", 6 if writing the output fails. Arrow errors are not
// handled here: results are unwrapped with ValueOrDie().
int DoIt(std::string in, bool scrub, bool json, std::string out) {
  std::string path;
  auto fs = ::arrow::fs::FileSystemFromUriOrPath(in, &path).ValueOrDie();
  auto file = fs->OpenInputFile(path).ValueOrDie();
  int64_t file_len = file->GetSize().ValueOrDie();
  if (file_len < 8) {
    std::cerr << "File too short: " << in << "\n";
    return 3;
  }
  // First do an opportunistic read of up to 1 MiB to try and get the entire footer.
  int64_t tail_len = std::min(file_len, int64_t{1} << 20);
  std::string tail;
  tail.resize(tail_len);
  char* data = tail.data();
  file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie();
  if (ReadLE32(data + tail_len - 4) != ReadLE32("PAR1")) {
    std::cerr << "Not a Parquet file: " << in << "\n";
    return 4;
  }
  uint32_t metadata_len = ReadLE32(data + tail_len - 8);
  // Metadata plus the 8 trailing bytes (length + magic). Computed in 64 bits
  // so that a corrupt length close to UINT32_MAX cannot wrap around.
  const int64_t footer_len = int64_t{metadata_len} + 8;
  // The whole footer, trailer included, must fit in the file; otherwise the
  // read offset computed below would be negative.
  if (footer_len > file_len) {
    std::cerr << "File too short: " << in << "\n";
    return 5;
  }
  if (tail_len >= footer_len) {
    // The footer is entirely in the initial read. Trim to size.
    tail.erase(0, static_cast<size_t>(tail_len - footer_len));
  } else {
    // The footer is larger than the initial read, read again the exact size.
    tail_len = footer_len;
    tail.resize(tail_len);
    data = tail.data();
    file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie();
  }
  auto md = FileMetaData::Make(tail.data(), &metadata_len);
  std::string ser = md->SerializeUnencrypted(scrub, json);
  if (!json) {
    AppendLE32(static_cast<uint32_t>(ser.size()), &ser);
    ser.append("PAR1", 4);
  }
  // The Thrift footer is binary data: open the file in binary mode so that no
  // newline translation can corrupt it. The text output keeps text mode.
  const std::ios::openmode mode =
      json ? std::ios::out : (std::ios::out | std::ios::binary);
  std::optional<std::fstream> fout;
  if (!out.empty()) fout.emplace(out, mode);
  std::ostream& os = fout ? *fout : std::cout;
  // Flush before reporting success so that buffered write errors are detected.
  if (!os.write(ser.data(), static_cast<std::streamsize>(ser.size())) || !os.flush()) {
    std::cerr << "Failed to write to output file: " << out << "\n";
    return 6;
  }

  return 0;
}
} // namespace
} // namespace parquet

// Print the usage text to stderr and return 1, the exit status used by main()
// both for -h/--help and for invalid command lines.
static int PrintHelp() {
std::cerr << R"(Usage: parquet-dump-footer
-h|--help Print help and exit
--no-scrub Do not scrub potentially confidential metadata
--debug Output text representation of footer for inspection
--in <uri> Input file (required): must be an URI or an absolute local path
--out <path> Output file (optional, default stdout)
Dump the footer of a Parquet file to stdout or to a file, optionally with
potentially confidential metadata scrubbed.
)";
return 1;
}

// Parse the command line and run parquet::DoIt().
//
// Recognized options: -h/--help, --no-scrub, --debug, --in <uri>, --out <path>.
// Any unknown option, a missing option value or a missing --in prints the
// usage text and returns its status.
int main(int argc, char** argv) {
  bool scrub = true;
  bool json = false;
  std::string in;
  std::string out;
  for (int i = 1; i < argc; i++) {
    char* arg = argv[i];
    if (!std::strcmp(arg, "-h") || !std::strcmp(arg, "--help")) {
      return PrintHelp();
    } else if (!std::strcmp(arg, "--no-scrub")) {
      scrub = false;
    } else if (!std::strcmp(arg, "--debug") || !std::strcmp(arg, "--json")) {
      // "--debug" is the spelling documented in the usage text; "--json" is
      // still accepted so that existing invocations keep working.
      json = true;
    } else if (!std::strcmp(arg, "--in")) {
      if (i + 1 >= argc) return PrintHelp();
      in = argv[++i];
    } else if (!std::strcmp(arg, "--out")) {
      if (i + 1 >= argc) return PrintHelp();
      out = argv[++i];
    } else {
      // Unknown option.
      return PrintHelp();
    }
  }
  if (in.empty()) return PrintHelp();

  return parquet::DoIt(in, scrub, json, out);
}

0 comments on commit d21a924

Please sign in to comment.