diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index ee8391818962c..7bab9104619ce 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include @@ -29,6 +31,7 @@ #include "arrow/io/memory.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" +#include "arrow/util/pcg_random.h" #include "parquet/encryption/encryption_internal.h" #include "parquet/encryption/internal_file_decryptor.h" #include "parquet/exception.h" @@ -599,6 +602,49 @@ std::vector RowGroupMetaData::sorting_columns() const { return impl_->sorting_columns(); } +// Replace string data with random-generated uppercase characters +static void Scrub(std::string* s) { + static ::arrow::random::pcg64 rng; + std::uniform_int_distribution<> caps(65, 90); + for (auto& c : *s) c = caps(rng); +} + +// Replace potentially sensitive metadata with random data +static void Scrub(format::FileMetaData* md) { + for (auto& s : md->schema) { + Scrub(&s.name); + } + for (auto& r : md->row_groups) { + for (auto& c : r.columns) { + Scrub(&c.file_path); + if (c.__isset.meta_data) { + auto& m = c.meta_data; + for (auto& p : m.path_in_schema) Scrub(&p); + for (auto& kv : m.key_value_metadata) { + Scrub(&kv.key); + Scrub(&kv.value); + } + Scrub(&m.statistics.max_value); + Scrub(&m.statistics.min_value); + Scrub(&m.statistics.min); + Scrub(&m.statistics.max); + } + + if (c.crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { + auto& m = c.crypto_metadata.ENCRYPTION_WITH_COLUMN_KEY; + for (auto& p : m.path_in_schema) Scrub(&p); + Scrub(&m.key_metadata); + } + Scrub(&c.encrypted_column_metadata); + } + } + for (auto& kv : md->key_value_metadata) { + Scrub(&kv.key); + Scrub(&kv.value); + } + Scrub(&md->footer_signing_key_metadata); +} + // file metadata class FileMetaData::FileMetaDataImpl { public: @@ -821,6 +867,21 @@ class FileMetaData::FileMetaDataImpl { return out; } + std::string SerializeUnencrypted(bool scrub, bool debug) const { + auto md = *metadata_; + if (scrub) Scrub(&md); + if (debug) { + std::ostringstream ss; + md.printTo(ss); + return ss.str(); + } else { + ThriftSerializer serializer; + std::string out; + serializer.SerializeToString(&md, &out); + return out; + } + } + void set_file_decryptor(std::shared_ptr file_decryptor) { file_decryptor_ = std::move(file_decryptor); } @@ -992,6 +1053,10 @@ std::shared_ptr FileMetaData::Subset( return impl_->Subset(row_groups); } +std::string FileMetaData::SerializeUnencrypted(bool scrub, bool json) const { + return impl_->SerializeUnencrypted(scrub, json); +} + void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { return impl_->WriteTo(dst, encryptor); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 9fc30df58e0d3..e02d2e7c852f0 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -396,6 +396,13 @@ class PARQUET_EXPORT FileMetaData { /// FileMetaData. std::shared_ptr Subset(const std::vector& row_groups) const; + /// \brief Serialize metadata unencrypted as string + /// + /// \param[in] scrub whether to remove sensitive information from the metadata. + /// \param[in] debug whether to serialize the metadata as Thrift (if false) or + /// debug text (if true). + std::string SerializeUnencrypted(bool scrub, bool debug) const; + private: friend FileMetaDataBuilder; friend class SerializedFile; diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 81ab49421d0f6..e05645da28a0e 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -16,7 +16,7 @@ # under the License. if(PARQUET_BUILD_EXECUTABLES) - set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan) + set(PARQUET_TOOLS parquet-dump-footer parquet-dump-schema parquet-reader parquet-scan) foreach(TOOL ${PARQUET_TOOLS}) string(REGEX REPLACE "-" "_" TOOL_SOURCE ${TOOL}) @@ -31,6 +31,7 @@ if(PARQUET_BUILD_EXECUTABLES) install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) + target_link_libraries(parquet-dump-footer ${ARROW_LIBRARIES}) add_dependencies(parquet ${PARQUET_TOOLS}) endif() diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc new file mode 100644 index 0000000000000..c7a4b78fdd823 --- /dev/null +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" +#include "arrow/util/endian.h" +#include "arrow/util/ubsan.h" +#include "parquet/metadata.h" + +namespace parquet { +namespace { +uint32_t ReadLE32(const void* p) { + uint32_t x = ::arrow::util::SafeLoadAs(static_cast(p)); + return ::arrow::bit_util::FromLittleEndian(x); +} + +void AppendLE32(uint32_t v, std::string* out) { + v = ::arrow::bit_util::ToLittleEndian(v); + out->append(reinterpret_cast(&v), sizeof(v)); +} + +int DoIt(std::string in, bool scrub, bool json, std::string out) { + std::string path; + auto fs = ::arrow::fs::FileSystemFromUriOrPath(in, &path).ValueOrDie(); + auto file = fs->OpenInputFile(path).ValueOrDie(); + int64_t file_len = file->GetSize().ValueOrDie(); + if (file_len < 8) { + std::cerr << "File too short: " << in << "\n"; + return 3; + } + // First do an opportunistic read of up to 1 MiB to try and get the entire footer. + int64_t tail_len = std::min(file_len, int64_t{1} << 20); + std::string tail; + tail.resize(tail_len); + char* data = tail.data(); + file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie(); + if (auto magic = ReadLE32(data + tail_len - 4); magic != ReadLE32("PAR1")) { + std::cerr << "Not a Parquet file: " << in << "\n"; + return 4; + } + uint32_t metadata_len = ReadLE32(data + tail_len - 8); + if (tail_len >= metadata_len + 8) { + // The footer is entirely in the initial read. Trim to size. + tail = tail.substr(tail_len - (metadata_len + 8)); + } else { + // The footer is larger than the initial read, read again the exact size. + if (metadata_len > file_len) { + std::cerr << "File too short: " << in << "\n"; + return 5; + } + tail_len = metadata_len + 8; + tail.resize(tail_len); + data = tail.data(); + file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie(); + } + auto md = FileMetaData::Make(tail.data(), &metadata_len); + std::string ser = md->SerializeUnencrypted(scrub, json); + if (!json) { + AppendLE32(static_cast(ser.size()), &ser); + ser.append("PAR1", 4); + } + std::optional fout; + if (!out.empty()) fout.emplace(out, std::ios::out); + std::ostream& os = fout ? *fout : std::cout; + if (!os.write(ser.data(), ser.size())) { + std::cerr << "Failed to write to output file: " << out << "\n"; + return 6; + } + + return 0; +} +} // namespace +} // namespace parquet + +static int PrintHelp() { + std::cerr << R"(Usage: parquet-dump-footer + -h|--help Print help and exit + --no-scrub Do not scrub potentially confidential metadata + --debug Output text represenation of footer for inspection + --in Input file (required): must be an URI or an absolute local path + --out Output file (optional, default stdout) + + Dump the footer of a Parquet file to stdout or to a file, optionally with + potentially confidential metadata scrubbed. +)"; + return 1; +} + +int main(int argc, char** argv) { + bool scrub = true; + bool json = false; + std::string in; + std::string out; + for (int i = 1; i < argc; i++) { + char* arg = argv[i]; + if (!std::strcmp(arg, "-h") || !std::strcmp(arg, "--help")) { + return PrintHelp(); + } else if (!std::strcmp(arg, "--no-scrub")) { + scrub = false; + } else if (!std::strcmp(arg, "--json")) { + json = true; + } else if (!std::strcmp(arg, "--in")) { + if (i + 1 >= argc) return PrintHelp(); + in = argv[++i]; + } else if (!std::strcmp(arg, "--out")) { + if (i + 1 >= argc) return PrintHelp(); + out = argv[++i]; + } else { + // Unknown option. + return PrintHelp(); + } + } + if (in.empty()) return PrintHelp(); + + return parquet::DoIt(in, scrub, json, out); +}