From a347f92c3cae752e4f5332105325f18e612ecdb6 Mon Sep 17 00:00:00 2001 From: anjakefala Date: Wed, 9 Oct 2024 13:55:38 -0700 Subject: [PATCH 01/10] GH-34535: [C++] Move ChunkResolver to the public API Co-authored-by: SChakravorti21 --- cpp/src/arrow/chunk_resolver.cc | 4 +-- cpp/src/arrow/chunk_resolver.h | 30 ++++++++++++------- cpp/src/arrow/chunk_resolver_benchmark.cc | 4 +-- cpp/src/arrow/chunked_array.h | 2 +- cpp/src/arrow/chunked_array_test.cc | 6 ++-- .../arrow/compute/kernels/chunked_internal.h | 2 +- cpp/src/arrow/compute/kernels/vector_sort.cc | 4 +-- .../compute/kernels/vector_sort_internal.h | 4 +-- cpp/src/arrow/stl_iterator.h | 2 +- docs/source/cpp/api/array.rst | 14 +++++++++ r/src/altrep.cpp | 4 +-- 11 files changed, 49 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index bda6b17810299..ca74ffa06c820 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -26,7 +26,7 @@ #include "arrow/array.h" #include "arrow/record_batch.h" -namespace arrow::internal { +namespace arrow { namespace { template @@ -167,4 +167,4 @@ void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t* logical_i logical_index_vec, out_chunk_location_vec, chunk_hint); } -} // namespace arrow::internal +} // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 4a5e27c05361f..f40514f980a8c 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -27,12 +27,12 @@ #include "arrow/type_fwd.h" #include "arrow/util/macros.h" -namespace arrow::internal { +namespace arrow { struct ChunkResolver; template -struct TypedChunkLocation { +struct ARROW_EXPORT TypedChunkLocation { /// \brief Index of the chunk in the array of chunks /// /// The value is always in the range `[0, chunks.size()]`. `chunks.size()` is used @@ -41,7 +41,7 @@ struct TypedChunkLocation { /// \brief Index of the value in the chunk /// - /// The value is UNDEFINED if chunk_index >= chunks.size() + /// The value is UNDEFINED if `chunk_index >= chunks.size()` IndexType index_in_chunk = 0; TypedChunkLocation() = default; @@ -61,7 +61,7 @@ using ChunkLocation = TypedChunkLocation; /// \brief An utility that incrementally resolves logical indices into /// physical indices in a chunked array. -struct ARROW_EXPORT ChunkResolver { +class ARROW_EXPORT ChunkResolver { private: /// \brief Array containing `chunks.size() + 1` offsets. /// @@ -75,8 +75,16 @@ struct ARROW_EXPORT ChunkResolver { mutable std::atomic cached_chunk_; public: + /// \brief Initialize from an `ArrayVector`. explicit ChunkResolver(const ArrayVector& chunks) noexcept; + + /// \brief Initialize from a vector of raw `Array` pointers. explicit ChunkResolver(const std::vector& chunks) noexcept; + + /// \brief Initialize from a `RecordBatchVector`. + /// + /// Because all `Array`s in a `RecordBatch` must have the same length, this + /// can be useful for iterating over multiple columns simultaneously. explicit ChunkResolver(const RecordBatchVector& batches) noexcept; /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets. @@ -115,11 +123,11 @@ struct ARROW_EXPORT ChunkResolver { /// The returned ChunkLocation contains the chunk index and the within-chunk index /// equivalent to the logical index. /// - /// \pre index >= 0 - /// \post location.chunk_index in [0, chunks.size()] + /// \pre `index >= 0` + /// \post `location.chunk_index` in `[0, chunks.size()]` /// \param index The logical index to resolve /// \return ChunkLocation with a valid chunk_index if index is within - /// bounds, or with chunk_index == chunks.size() if logical index is + /// bounds, or with `chunk_index == chunks.size()` if logical index is /// `>= chunked_array.length()`. inline ChunkLocation Resolve(int64_t index) const { const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed); @@ -133,13 +141,13 @@ struct ARROW_EXPORT ChunkResolver { /// The returned ChunkLocation contains the chunk index and the within-chunk index /// equivalent to the logical index. /// - /// \pre index >= 0 - /// \post location.chunk_index in [0, chunks.size()] + /// \pre `index >= 0` + /// \post `location.chunk_index` in `[0, chunks.size()]` /// \param index The logical index to resolve /// \param hint ChunkLocation{} or the last ChunkLocation returned by /// this ChunkResolver. /// \return ChunkLocation with a valid chunk_index if index is within - /// bounds, or with chunk_index == chunks.size() if logical index is + /// bounds, or with `chunk_index == chunks.size()` if logical index is /// `>= chunked_array.length()`. inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) const { assert(hint.chunk_index < static_cast(offsets_.size())); @@ -281,4 +289,4 @@ struct ARROW_EXPORT ChunkResolver { } }; -} // namespace arrow::internal +} // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver_benchmark.cc b/cpp/src/arrow/chunk_resolver_benchmark.cc index 0756de3fbe930..72b06d4e36fb1 100644 --- a/cpp/src/arrow/chunk_resolver_benchmark.cc +++ b/cpp/src/arrow/chunk_resolver_benchmark.cc @@ -28,8 +28,8 @@ namespace arrow { -using internal::ChunkResolver; -using internal::TypedChunkLocation; +using ChunkResolver; +using TypedChunkLocation; namespace { diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index c65b6cb6e227f..02bcd0f9026bc 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -199,7 +199,7 @@ class ARROW_EXPORT ChunkedArray { private: template friend class ::arrow::stl::ChunkedArrayIterator; - internal::ChunkResolver chunk_resolver_; + ChunkResolver chunk_resolver_; ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index f98dde689c237..62aff78b48035 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -36,9 +36,9 @@ namespace arrow { -using internal::ChunkLocation; -using internal::ChunkResolver; -using internal::TypedChunkLocation; +using arrow::ChunkLocation; +using arrow::ChunkResolver; +using arrow::TypedChunkLocation; class TestChunkedArray : public ::testing::Test { protected: diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h index 2b72e0ab3109e..f7cb615f3ed81 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -52,7 +52,7 @@ struct ResolvedChunk { class ChunkedArrayResolver { private: - ::arrow::internal::ChunkResolver resolver_; + ChunkResolver resolver_; std::vector chunks_; public: diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 8766ca3baac96..15fafa1f4d219 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -23,8 +23,8 @@ namespace arrow { +using arrow::ChunkLocation; using internal::checked_cast; -using internal::ChunkLocation; namespace compute { namespace internal { @@ -852,7 +852,7 @@ class TableSorter { const RecordBatchVector batches_; const SortOptions& options_; const NullPlacement null_placement_; - const ::arrow::internal::ChunkResolver left_resolver_, right_resolver_; + const ::arrow::ChunkResolver left_resolver_, right_resolver_; const std::vector sort_keys_; uint64_t* indices_begin_; uint64_t* indices_end_; diff --git a/cpp/src/arrow/compute/kernels/vector_sort_internal.h b/cpp/src/arrow/compute/kernels/vector_sort_internal.h index 564afb8c087d2..bee7f838a05da 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_sort_internal.h @@ -749,9 +749,9 @@ struct ResolvedTableSortKey { order(order), null_count(null_count) {} - using LocationType = ::arrow::internal::ChunkLocation; + using LocationType = ::arrow::ChunkLocation; - ResolvedChunk GetChunk(::arrow::internal::ChunkLocation loc) const { + ResolvedChunk GetChunk(::arrow::ChunkLocation loc) const { return {chunks[loc.chunk_index], loc.index_in_chunk}; } diff --git a/cpp/src/arrow/stl_iterator.h b/cpp/src/arrow/stl_iterator.h index 5f2acfb071b29..577066cba0fcd 100644 --- a/cpp/src/arrow/stl_iterator.h +++ b/cpp/src/arrow/stl_iterator.h @@ -237,7 +237,7 @@ class ChunkedArrayIterator { } private: - arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const { + arrow::ChunkLocation GetChunkLocation(int64_t index) const { assert(chunked_array_); return chunked_array_->chunk_resolver_.Resolve(index); } diff --git a/docs/source/cpp/api/array.rst b/docs/source/cpp/api/array.rst index a7e5d0cf07e0a..133a432884d3b 100644 --- a/docs/source/cpp/api/array.rst +++ b/docs/source/cpp/api/array.rst @@ -19,6 +19,9 @@ Arrays ====== +Base classes +============ + .. doxygenclass:: arrow::ArrayData :project: arrow_cpp :members: @@ -85,6 +88,17 @@ Chunked Arrays :project: arrow_cpp :members: +.. doxygenstruct:: arrow::ChunkLocation + :project: arrow_cpp + :members: + +.. doxygenstruct:: arrow::TypedChunkLocation + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::ChunkResolver + :project: arrow_cpp + :members: Utilities ========= diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index bdaac0a9ce5d2..e90e06599be91 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -87,13 +87,13 @@ class ArrowAltrepData { const std::shared_ptr& chunked_array() { return chunked_array_; } - arrow::internal::ChunkLocation locate(int64_t index) { + arrow::ChunkLocation locate(int64_t index) { return resolver_.Resolve(index); } private: std::shared_ptr chunked_array_; - arrow::internal::ChunkResolver resolver_; + arrow::ChunkResolver resolver_; }; // the ChunkedArray that is being wrapped by the altrep object From b4767cdc4be53ea53f96a234654adf29abce413e Mon Sep 17 00:00:00 2001 From: anjakefala Date: Wed, 9 Oct 2024 14:08:01 -0700 Subject: [PATCH 02/10] Resolve R linter error --- r/src/altrep.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index e90e06599be91..5c9c454aa7e76 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -87,9 +87,7 @@ class ArrowAltrepData { const std::shared_ptr& chunked_array() { return chunked_array_; } - arrow::ChunkLocation locate(int64_t index) { - return resolver_.Resolve(index); - } + arrow::ChunkLocation locate(int64_t index) { return resolver_.Resolve(index); } private: std::shared_ptr chunked_array_; From b7e977e2b560bc17f5005ca19eedda2a7e2fa53b Mon Sep 17 00:00:00 2001 From: anjakefala Date: Wed, 9 Oct 2024 14:23:52 -0700 Subject: [PATCH 03/10] Fix definition of ChunkResolver --- cpp/src/arrow/chunk_resolver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index f40514f980a8c..30831816a337d 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -29,7 +29,7 @@ namespace arrow { -struct ChunkResolver; +class ChunkResolver; template struct ARROW_EXPORT TypedChunkLocation { From 7e9fefb6727cc55981fbbbb03f895b0ac3439b70 Mon Sep 17 00:00:00 2001 From: anjakefala Date: Thu, 10 Oct 2024 09:28:16 -0700 Subject: [PATCH 04/10] Remove using declarations in benchmark --- cpp/src/arrow/chunk_resolver_benchmark.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver_benchmark.cc b/cpp/src/arrow/chunk_resolver_benchmark.cc index 72b06d4e36fb1..786075f2fc863 100644 --- a/cpp/src/arrow/chunk_resolver_benchmark.cc +++ b/cpp/src/arrow/chunk_resolver_benchmark.cc @@ -28,8 +28,8 @@ namespace arrow { -using ChunkResolver; -using TypedChunkLocation; +using arrow::ChunkResolver; +using arrow::TypedChunkLocation; namespace { From a838fd03accf47e980c13e7f5464451a805690d3 Mon Sep 17 00:00:00 2001 From: anjakefala Date: Thu, 10 Oct 2024 10:57:50 -0700 Subject: [PATCH 05/10] Explicitly instantiate template base struct - needed for DLL linking on Windows --- cpp/src/arrow/chunk_resolver.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 30831816a337d..457a5277c9014 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -289,4 +289,13 @@ class ARROW_EXPORT ChunkResolver { } }; +// Explicitly instantiate template base struct, for DLL linking on Windows +template struct arrow::TypedChunkLocation; +template struct arrow::TypedChunkLocation; +template struct arrow::TypedChunkLocation; +template struct arrow::TypedChunkLocation; +template struct arrow::TypedChunkLocation; +template struct arrow::TypedChunkLocation; +template struct arrow::TypedChunkLocation; +template struct arrow::TypedChunkLocation; } // namespace arrow From 8cd7d96514f21d575fd82017788d5fd1e80ceed3 Mon Sep 17 00:00:00 2001 From: anjakefala Date: Tue, 15 Oct 2024 13:30:07 -0700 Subject: [PATCH 06/10] Felipe PR comments --- cpp/src/arrow/chunk_resolver.h | 16 ++++++++-------- cpp/src/arrow/chunk_resolver_benchmark.cc | 3 --- cpp/src/arrow/chunked_array_test.cc | 4 ---- cpp/src/arrow/compute/kernels/vector_sort.cc | 1 - 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 457a5277c9014..20718677599a2 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -290,12 +290,12 @@ class ARROW_EXPORT ChunkResolver { }; // Explicitly instantiate template base struct, for DLL linking on Windows -template struct arrow::TypedChunkLocation; -template struct arrow::TypedChunkLocation; -template struct arrow::TypedChunkLocation; -template struct arrow::TypedChunkLocation; -template struct arrow::TypedChunkLocation; -template struct arrow::TypedChunkLocation; -template struct arrow::TypedChunkLocation; -template struct arrow::TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; } // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver_benchmark.cc b/cpp/src/arrow/chunk_resolver_benchmark.cc index 786075f2fc863..a6f539a444bbc 100644 --- a/cpp/src/arrow/chunk_resolver_benchmark.cc +++ b/cpp/src/arrow/chunk_resolver_benchmark.cc @@ -28,9 +28,6 @@ namespace arrow { -using arrow::ChunkResolver; -using arrow::TypedChunkLocation; - namespace { int64_t constexpr kChunkedArrayLength = std::numeric_limits::max(); diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index 62aff78b48035..b3944fd1b1927 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -36,10 +36,6 @@ namespace arrow { -using arrow::ChunkLocation; -using arrow::ChunkResolver; -using arrow::TypedChunkLocation; - class TestChunkedArray : public ::testing::Test { protected: virtual void Construct() { diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 15fafa1f4d219..395ed86a06b4a 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -23,7 +23,6 @@ namespace arrow { -using arrow::ChunkLocation; using internal::checked_cast; namespace compute { From 3646d0f7498ea71f62126cd125453cdcda61714d Mon Sep 17 00:00:00 2001 From: anjakefala Date: Thu, 17 Oct 2024 13:22:55 -0700 Subject: [PATCH 07/10] Remove docstrings for construcotrs that are being changed in a seperate PR --- cpp/src/arrow/chunk_resolver.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 20718677599a2..ab0e753d0040e 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -75,23 +75,12 @@ class ARROW_EXPORT ChunkResolver { mutable std::atomic cached_chunk_; public: - /// \brief Initialize from an `ArrayVector`. explicit ChunkResolver(const ArrayVector& chunks) noexcept; - /// \brief Initialize from a vector of raw `Array` pointers. explicit ChunkResolver(const std::vector& chunks) noexcept; - /// \brief Initialize from a `RecordBatchVector`. - /// - /// Because all `Array`s in a `RecordBatch` must have the same length, this - /// can be useful for iterating over multiple columns simultaneously. explicit ChunkResolver(const RecordBatchVector& batches) noexcept; - /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets. - /// - /// The first offset must be 0 and the last offset must be the logical length of the - /// chunked array. Each offset before the last represents the starting logical index of - /// the corresponding chunk. explicit ChunkResolver(std::vector offsets) noexcept : offsets_(std::move(offsets)), cached_chunk_(0) { #ifndef NDEBUG From d2d29b0d77bc7b0bf98810daf3cba36c86fdc2d3 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Thu, 17 Oct 2024 20:57:23 +0000 Subject: [PATCH 08/10] Guard ChunkResolver use in altrep.cpp --- r/src/altrep.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 5c9c454aa7e76..5c95b634a0979 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -80,6 +80,14 @@ void DeletePointer(std::shared_ptr* ptr) { template using Pointer = cpp11::external_pointer, DeletePointer>; +#if ARROW_MAJOR_VERSION >= 18 +using ChunkResolver = arrow::ChunkResolver; +using ChunkLocation = arrow::ChunkLocation; +#else +using ChunkResolver = arrow::internal::ChunkResolver; +using ChunkLocation = arrow::internal::ChunkLocation; +#endif + class ArrowAltrepData { public: explicit ArrowAltrepData(const std::shared_ptr& chunked_array) @@ -87,11 +95,11 @@ class ArrowAltrepData { const std::shared_ptr& chunked_array() { return chunked_array_; } - arrow::ChunkLocation locate(int64_t index) { return resolver_.Resolve(index); } + ChunkLocation locate(int64_t index) { return resolver_.Resolve(index); } private: std::shared_ptr chunked_array_; - arrow::ChunkResolver resolver_; + ChunkResolver resolver_; }; // the ChunkedArray that is being wrapped by the altrep object From 3d3e1be2f5c839b05bd2f8c1330d05273246d1fb Mon Sep 17 00:00:00 2001 From: SChakravorti21 Date: Thu, 17 Oct 2024 13:22:55 -0700 Subject: [PATCH 09/10] Remove docstrings for construcotrs that are being changed in a seperate PR --- cpp/src/arrow/chunk_resolver.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 20718677599a2..ab0e753d0040e 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -75,23 +75,12 @@ class ARROW_EXPORT ChunkResolver { mutable std::atomic cached_chunk_; public: - /// \brief Initialize from an `ArrayVector`. explicit ChunkResolver(const ArrayVector& chunks) noexcept; - /// \brief Initialize from a vector of raw `Array` pointers. explicit ChunkResolver(const std::vector& chunks) noexcept; - /// \brief Initialize from a `RecordBatchVector`. - /// - /// Because all `Array`s in a `RecordBatch` must have the same length, this - /// can be useful for iterating over multiple columns simultaneously. explicit ChunkResolver(const RecordBatchVector& batches) noexcept; - /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets. - /// - /// The first offset must be 0 and the last offset must be the logical length of the - /// chunked array. Each offset before the last represents the starting logical index of - /// the corresponding chunk. explicit ChunkResolver(std::vector offsets) noexcept : offsets_(std::move(offsets)), cached_chunk_(0) { #ifndef NDEBUG From 2d31866d4f836b5fe84fbe8929fff5a821a90b58 Mon Sep 17 00:00:00 2001 From: Anja Kefala Date: Mon, 21 Oct 2024 10:51:00 -0700 Subject: [PATCH 10/10] Update r/src/altrep.cpp Co-authored-by: Jacob Wujciak-Jens --- r/src/altrep.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 5c95b634a0979..90a459e19cb6d 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -80,7 +80,7 @@ void DeletePointer(std::shared_ptr* ptr) { template using Pointer = cpp11::external_pointer, DeletePointer>; -#if ARROW_MAJOR_VERSION >= 18 +#if ARROW_VERSION_MAJOR >= 18 using ChunkResolver = arrow::ChunkResolver; using ChunkLocation = arrow::ChunkLocation; #else