From 844aa5ea060c912fd90240d534e18e3de5c617cc Mon Sep 17 00:00:00 2001
From: Felipe Oliveira Carvalho
Date: Fri, 3 May 2024 20:44:21 -0300
Subject: [PATCH 01/20] Take: Add VectorKernel::ChunkedExec to
 SelectionKernelData

---
 .../arrow/compute/kernels/vector_selection_internal.cc |  1 +
 .../arrow/compute/kernels/vector_selection_internal.h  | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc
index 7189d42850e79..f5685ffa4139e 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc
@@ -60,6 +60,7 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc,
         {std::move(kernel_data.value_type), std::move(kernel_data.selection_type)},
         OutputType(FirstType));
     base_kernel.exec = kernel_data.exec;
+    base_kernel.exec_chunked = kernel_data.chunked_exec;
     DCHECK_OK(func->AddKernel(base_kernel));
   }
   kernels.clear();
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h
index 887bf08354120..558423733ca2b 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h
+++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h
@@ -19,6 +19,7 @@
 #include
 #include
+#include
 #include

 #include "arrow/array/data.h"
@@ -34,9 +35,18 @@ using FilterState = OptionsWrapper<FilterOptions>;
 using TakeState = OptionsWrapper<TakeOptions>;

 struct SelectionKernelData {
+  SelectionKernelData(InputType value_type, InputType selection_type,
+                      ArrayKernelExec exec,
+                      VectorKernel::ChunkedExec chunked_exec = NULLPTR)
+      : value_type(std::move(value_type)),
+        selection_type(std::move(selection_type)),
+        exec(exec),
+        chunked_exec(chunked_exec) {}
+
   InputType value_type;
   InputType selection_type;
   ArrayKernelExec exec;
+  VectorKernel::ChunkedExec chunked_exec;
 };

 void RegisterSelectionFunction(const std::string& name, FunctionDoc doc,
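
The shape of the new chunked_exec slot can be read off the functors registered
later in this series: a VectorKernel::ChunkedExec receives the whole ExecBatch,
whose values may be Datum::CHUNKED_ARRAY, and produces a Datum, instead of
operating span by span like an ArrayKernelExec. A sketch of the expected
signature (illustrative only; the authoritative typedef lives in
arrow/compute/kernel.h):

    // Sketch: a chunked exec sees the full batch and writes a Datum, so it
    // can return a ChunkedArray directly instead of filling a preallocated
    // array span.
    Status ExampleChunkedExec(KernelContext* ctx, const ExecBatch& batch,
                              Datum* out);
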
From 9797c9f30e89272f84bd55c10990510953ab4d2b Mon Sep 17 00:00:00 2001
From: Felipe Oliveira Carvalho
Date: Fri, 3 May 2024 22:02:11 -0300
Subject: [PATCH 02/20] Take: VectorKernel::output_chunked should be false for
 "array_take"

We will ensure "array_take" returns a ChunkedArray if at least one input is
chunked, just like "take" does. Even when the output fits in a single chunk.
---
 cpp/src/arrow/compute/kernels/vector_selection.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc
index b265673e23c86..b047763098c0d 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection.cc
@@ -293,6 +293,8 @@ std::shared_ptr<VectorFunction> MakeIndicesNonZeroFunction(std::string name,
   VectorKernel kernel;
   kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
   kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+  // "array_take" ensures that the output will be chunked when at least one
+  // input is chunked, so we need to set this to false.
   kernel.output_chunked = false;
   kernel.exec = IndicesNonZeroExec;
   kernel.exec_chunked = IndicesNonZeroExecChunked;
@@ -338,6 +340,7 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
   VectorKernel take_base;
   take_base.init = TakeState::Init;
   take_base.can_execute_chunkwise = false;
+  take_base.output_chunked = false;
   RegisterSelectionFunction("array_take", array_take_doc, take_base,
                             std::move(take_kernels), GetDefaultTakeOptions(),
                             registry);
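
From the caller's perspective, the guarantee described in this commit message
looks like the following. A minimal sketch (the helper name is hypothetical,
the include set approximate, and it assumes the behavior introduced by this
series):

    #include "arrow/chunked_array.h"
    #include "arrow/compute/api.h"

    // "array_take" with chunked values yields a ChunkedArray, even when the
    // result would fit in a single chunk.
    arrow::Result<std::shared_ptr<arrow::ChunkedArray>> TakeChunked(
        const std::shared_ptr<arrow::ChunkedArray>& values,
        const std::shared_ptr<arrow::Array>& indices) {
      ARROW_ASSIGN_OR_RAISE(
          arrow::Datum out,
          arrow::compute::CallFunction("array_take", {values, indices}));
      return out.chunked_array();  // CA->C: chunked in, chunked out
    }
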
From a135471d25680b4a09a2290ba73ee54081d0a12c Mon Sep 17 00:00:00 2001
From: Felipe Oliveira Carvalho
Date: Fri, 3 May 2024 22:17:20 -0300
Subject: [PATCH 03/20] Take: Make "array_take" handle CA->C cases by
 populating VectorKernel::exec_chunked

Before this commit, only the "take" meta function could handle CA
parameters.
---
 .../kernels/vector_selection_take_internal.cc | 175 +++++++++++++++---
 1 file changed, 148 insertions(+), 27 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
index c45cc552a2cc5..a770401fd34c2 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
@@ -579,20 +579,13 @@ class TakeMetaFunction : public MetaFunction {
     return result.array();
   }

-  static Result<std::shared_ptr<ArrayData>> TakeCAA(
-      const std::shared_ptr<ChunkedArray>& values, const Array& indices,
-      const TakeOptions& options, ExecContext* ctx) {
-    ARROW_ASSIGN_OR_RAISE(auto values_array,
-                          ChunkedArrayAsArray(values, ctx->memory_pool()));
-    std::vector<Datum> args = {std::move(values_array), indices};
-    return TakeAAA(args, options, ctx);
-  }
-
   static Result<std::shared_ptr<ChunkedArray>> TakeCAC(
       const std::shared_ptr<ChunkedArray>& values, const Array& indices,
       const TakeOptions& options, ExecContext* ctx) {
-    ARROW_ASSIGN_OR_RAISE(auto new_chunk, TakeCAA(values, indices, options, ctx));
-    return std::make_shared<ChunkedArray>(MakeArray(std::move(new_chunk)));
+    // "array_take" can handle CA->C cases directly
+    // (via their VectorKernel::exec_chunked)
+    ARROW_ASSIGN_OR_RAISE(auto result, CallArrayTake({values, indices}, options, ctx));
+    return result.chunked_array();
   }

   static Result<std::shared_ptr<ChunkedArray>> TakeCCC(
@@ -721,6 +714,115 @@ class TakeMetaFunction : public MetaFunction {

 // ----------------------------------------------------------------------

+/// \brief Prepare the output array like ExecuteArrayKernel::PrepareOutput()
+std::shared_ptr<ArrayData> PrepareOutput(const ExecBatch& batch, int64_t length) {
+  DCHECK_EQ(batch.length, length);
+  auto out = std::make_shared<ArrayData>(batch.values[0].type(), length);
+  out->buffers.resize(batch.values[0].type()->layout().buffers.size());
+  return out;
+}
+
+Status CallAAAKernel(ArrayKernelExec take_aaa_exec, KernelContext* ctx,
+                     std::shared_ptr<ArrayData> values,
+                     std::shared_ptr<ArrayData> indices, Datum* out) {
+  int64_t batch_length = values->length;
+  std::vector<Datum> args = {std::move(values), std::move(indices)};
+  ExecBatch array_array_batch(std::move(args), batch_length);
+  DCHECK_EQ(out->kind(), Datum::ARRAY);
+  ExecSpan exec_span{array_array_batch};
+  ExecResult result;
+  result.value = out->array();
+  return take_aaa_exec(ctx, exec_span, &result);
+}
+
+/// \brief Generic VectorKernel::exec_chunked for CA->C cases.
+///
+/// This function concatenates the chunks of values and then calls the
+/// AA->A take kernel.
+///
+/// \param take_aaa_exec The AA->A take kernel to use.
+Status GenericTakeChunkedExec(ArrayKernelExec take_aaa_exec, KernelContext* ctx,
+                              const ExecBatch& batch, Datum* out) {
+  auto& args = batch.values;
+  if (args[0].kind() == Datum::CHUNKED_ARRAY && args[1].kind() == Datum::ARRAY) {
+    auto& values = args[0].chunked_array();
+    auto& indices = args[1].array();
+    ARROW_ASSIGN_OR_RAISE(auto values_array, TakeMetaFunction::ChunkedArrayAsArray(
+                                                 values, ctx->memory_pool()));
+    DCHECK_EQ(values_array->length(), batch.length);
+    Datum result = PrepareOutput(batch, batch.length);
+    RETURN_NOT_OK(
+        CallAAAKernel(take_aaa_exec, ctx, values_array->data(), indices, &result));
+    out->value = std::make_shared<ChunkedArray>(MakeArray(result.array()));
+    return Status::OK();
+  }
+  return Status::NotImplemented(
+      "Unsupported kinds for 'array_take', try using 'take': "
+      "values=",
+      args[0].ToString(), ", indices=", args[1].ToString());
+}
+
+template <ArrayKernelExec kTakeAAAExec>
+struct GenericTakeChunkedExecFunctor {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    return GenericTakeChunkedExec(kTakeAAAExec, ctx, batch, out);
+  }
+
+  // XXX: to be removed
+  static Status ExecNonChunked(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Datum result =
+        std::make_shared<ChunkedArray>(MakeArray(PrepareOutput(batch, batch.length)));
+    RETURN_NOT_OK(Exec(ctx, batch, &result));
+    DCHECK_EQ(result.chunked_array()->num_chunks(), 1);
+    out->value = result.chunked_array()->chunk(0)->data();
+    return Status::OK();
+  }
+};
+
+Status SpecialTakeChunkedExec(const ArrayKernelExec take_aaa_exec,
+                              VectorKernel::ChunkedExec take_caa_exec,
+                              KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+  Datum result = PrepareOutput(batch, batch.length);
+  auto* pool = ctx->memory_pool();
+  auto& args = batch.values;
+  if (args[0].kind() == Datum::CHUNKED_ARRAY && args[1].kind() == Datum::ARRAY) {
+    auto& values = args[0].chunked_array();
+    auto& indices = args[1].array();
+    std::shared_ptr<Array> single_chunk = nullptr;
+    if (values->num_chunks() == 0 || values->length() == 0) {
+      ARROW_ASSIGN_OR_RAISE(single_chunk,
+                            MakeArrayOfNull(values->type(), /*length=*/0, pool));
+    } else if (values->num_chunks() == 1) {
+      single_chunk = values->chunk(0);
+    }
+    if (single_chunk) {
+      DCHECK_EQ(single_chunk->length(), batch.length);
+      // If the ChunkedArray was cheaply converted to a single chunk,
+      // we can use the AA->A take kernel directly.
+      RETURN_NOT_OK(
+          CallAAAKernel(take_aaa_exec, ctx, single_chunk->data(), indices, &result));
+    } else {
+      // Instead of concatenating the chunks, we call the CA->A take kernel
+      // which has a more efficient implementation for this case. At this point,
+      // that implementation doesn't have to care about empty or single-chunk
+      // ChunkedArrays.
+      RETURN_NOT_OK(take_caa_exec(ctx, batch, &result));
+    }
+    out->value = std::make_shared<ChunkedArray>(MakeArray(result.array()));
+    return Status::OK();
+  }
+  return Status::NotImplemented(
+      "Unsupported kinds for 'array_take', try using 'take': "
+      "values=",
+      args[0].ToString(), ", indices=", args[1].ToString());
+}
+
+template <ArrayKernelExec kTakeAAAExec, VectorKernel::ChunkedExec kTakeCAAExec>
+struct SpecialTakeChunkedExecFunctor {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    return SpecialTakeChunkedExec(kTakeAAAExec, kTakeCAAExec, ctx, batch, out);
+  }
+};
+
 }  // namespace

 const TakeOptions* GetDefaultTakeOptions() {
@@ -736,22 +838,41 @@ void PopulateTakeKernels(std::vector<SelectionKernelData>* out) {
   auto take_indices = match::Integer();

   *out = {
-      {InputType(match::Primitive()), take_indices, FixedWidthTakeExec},
-      {InputType(match::BinaryLike()), take_indices, VarBinaryTakeExec},
-      {InputType(match::LargeBinaryLike()), take_indices, LargeVarBinaryTakeExec},
-      {InputType(match::FixedSizeBinaryLike()), take_indices, FixedWidthTakeExec},
-      {InputType(null()), take_indices, NullTakeExec},
-      {InputType(Type::DICTIONARY), take_indices, DictionaryTake},
-      {InputType(Type::EXTENSION), take_indices, ExtensionTake},
-      {InputType(Type::LIST), take_indices, ListTakeExec},
-      {InputType(Type::LARGE_LIST), take_indices, LargeListTakeExec},
-      {InputType(Type::LIST_VIEW), take_indices, ListViewTakeExec},
-      {InputType(Type::LARGE_LIST_VIEW), take_indices, LargeListViewTakeExec},
-      {InputType(Type::FIXED_SIZE_LIST), take_indices, FSLTakeExec},
-      {InputType(Type::DENSE_UNION), take_indices, DenseUnionTakeExec},
-      {InputType(Type::SPARSE_UNION), take_indices, SparseUnionTakeExec},
-      {InputType(Type::STRUCT), take_indices, StructTakeExec},
-      {InputType(Type::MAP), take_indices, MapTakeExec},
+      {InputType(match::Primitive()), take_indices, FixedWidthTakeExec,
+       // XXX: doing this for testing SpecialTakeChunkedExec
+       SpecialTakeChunkedExecFunctor<
+           FixedWidthTakeExec,
+           GenericTakeChunkedExecFunctor<FixedWidthTakeExec>::ExecNonChunked>::Exec},
+      {InputType(match::BinaryLike()), take_indices, VarBinaryTakeExec,
+       GenericTakeChunkedExecFunctor<VarBinaryTakeExec>::Exec},
+      {InputType(match::LargeBinaryLike()), take_indices, LargeVarBinaryTakeExec,
+       GenericTakeChunkedExecFunctor<LargeVarBinaryTakeExec>::Exec},
+      {InputType(match::FixedSizeBinaryLike()), take_indices, FixedWidthTakeExec,
+       GenericTakeChunkedExecFunctor<FixedWidthTakeExec>::Exec},
+      {InputType(null()), take_indices, NullTakeExec,
+       GenericTakeChunkedExecFunctor<NullTakeExec>::Exec},
+      {InputType(Type::DICTIONARY), take_indices, DictionaryTake,
+       GenericTakeChunkedExecFunctor<DictionaryTake>::Exec},
+      {InputType(Type::EXTENSION), take_indices, ExtensionTake,
+       GenericTakeChunkedExecFunctor<ExtensionTake>::Exec},
+      {InputType(Type::LIST), take_indices, ListTakeExec,
+       GenericTakeChunkedExecFunctor<ListTakeExec>::Exec},
+      {InputType(Type::LARGE_LIST), take_indices, LargeListTakeExec,
+       GenericTakeChunkedExecFunctor<LargeListTakeExec>::Exec},
+      {InputType(Type::LIST_VIEW), take_indices, ListViewTakeExec,
+       GenericTakeChunkedExecFunctor<ListViewTakeExec>::Exec},
+      {InputType(Type::LARGE_LIST_VIEW), take_indices, LargeListViewTakeExec,
+       GenericTakeChunkedExecFunctor<LargeListViewTakeExec>::Exec},
+      {InputType(Type::FIXED_SIZE_LIST), take_indices, FSLTakeExec,
+       GenericTakeChunkedExecFunctor<FSLTakeExec>::Exec},
+      {InputType(Type::DENSE_UNION), take_indices, DenseUnionTakeExec,
+       GenericTakeChunkedExecFunctor<DenseUnionTakeExec>::Exec},
+      {InputType(Type::SPARSE_UNION), take_indices, SparseUnionTakeExec,
+       GenericTakeChunkedExecFunctor<SparseUnionTakeExec>::Exec},
+      {InputType(Type::STRUCT), take_indices, StructTakeExec,
+       GenericTakeChunkedExecFunctor<StructTakeExec>::Exec},
+      {InputType(Type::MAP), take_indices, MapTakeExec,
+       GenericTakeChunkedExecFunctor<MapTakeExec>::Exec},
   };
 }
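
What GenericTakeChunkedExec amounts to, sketched with public APIs only
(Concatenate and compute::Take are real public functions; the wrapper itself
is illustrative, not part of the patch):

    #include "arrow/array/concatenate.h"
    #include "arrow/chunked_array.h"
    #include "arrow/compute/api_vector.h"

    arrow::Result<std::shared_ptr<arrow::ChunkedArray>> TakeByConcatenating(
        const arrow::ChunkedArray& values,
        const std::shared_ptr<arrow::Array>& indices) {
      // Concatenate all value chunks into one array, then do a plain AA->A take.
      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> flat,
                            arrow::Concatenate(values.chunks()));
      ARROW_ASSIGN_OR_RAISE(arrow::Datum taken,
                            arrow::compute::Take(flat, indices));
      // Wrap the single result chunk, preserving the CA->C contract.
      return std::make_shared<arrow::ChunkedArray>(taken.make_array());
    }
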
From 36b69af8c672e9569849593f42e3ef70679d24dc Mon Sep 17 00:00:00 2001
From: Felipe Oliveira Carvalho
Date: Fri, 26 Apr 2024 22:49:45 -0300
Subject: [PATCH 04/20] gather_internal.h: Introduce GatherFromChunks

---
 .../arrow/compute/kernels/gather_internal.h | 165 +++++++++++++++++-
 1 file changed, 158 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/gather_internal.h b/cpp/src/arrow/compute/kernels/gather_internal.h
index 4c161533a7277..102e3db34d581 100644
--- a/cpp/src/arrow/compute/kernels/gather_internal.h
+++ b/cpp/src/arrow/compute/kernels/gather_internal.h
@@ -20,8 +20,14 @@
 #include
 #include
 #include
+#include

+#include "arrow/array/array_base.h"
 #include "arrow/array/data.h"
+#include "arrow/chunk_resolver.h"
+#include "arrow/chunked_array.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
 #include "arrow/util/bit_block_counter.h"
 #include "arrow/util/bit_run_reader.h"
 #include "arrow/util/bit_util.h"
@@ -52,6 +58,13 @@ class GatherBaseCRTP {
   ARROW_DEFAULT_MOVE_AND_ASSIGN(GatherBaseCRTP);

  protected:
+  template <typename IndexCType>
+  bool IsSrcValid(const ArraySpan& src_validity, const IndexCType* idx,
+                  int64_t position) const {
+    ARROW_COMPILER_ASSUME(src_validity.buffers[0].data != nullptr);
+    return src_validity.IsValid(idx[position]);
+  }
+
   ARROW_FORCE_INLINE int64_t ExecuteNoNulls(int64_t idx_length) {
     auto* self = static_cast(this);
     for (int64_t position = 0; position < idx_length; position++) {
@@ -76,8 +89,9 @@ class GatherBaseCRTP {
   // doesn't have to be called for resulting null positions. A position is
   // considered null if either the index or the source value is null at that
   // position.
-  template <bool kOutputIsZeroInitialized, typename IndexCType>
-  ARROW_FORCE_INLINE int64_t ExecuteWithNulls(const ArraySpan& src_validity,
+  template <bool kOutputIsZeroInitialized, typename ValiditySpan, typename IndexCType>
+  ARROW_FORCE_INLINE int64_t ExecuteWithNulls(const ValiditySpan& src_validity,
                                               int64_t idx_length, const IndexCType* idx,
                                               const ArraySpan& idx_validity,
                                               uint8_t* out_is_valid) {
@@ -116,12 +130,11 @@ class GatherBaseCRTP {
         position += block.length;
       }
     } else {
-      // Source values may be null, so we must do random access into src_validity
+      // Source values may be null, so we must do random access with IsSrcValid()
       if (block.popcount == block.length) {
         // Faster path: indices are not null but source values may be
         for (int64_t i = 0; i < block.length; ++i) {
-          ARROW_COMPILER_ASSUME(src_validity.buffers[0].data != nullptr);
-          if (src_validity.IsValid(idx[position])) {
+          if (self->IsSrcValid(src_validity, idx, position)) {
             // value is not null
             self->WriteValue(position);
             bit_util::SetBit(out_is_valid, position);
@@ -136,9 +149,9 @@ class GatherBaseCRTP {
         // random access in general we have to check the value nullness one by
         // one.
         for (int64_t i = 0; i < block.length; ++i) {
-          ARROW_COMPILER_ASSUME(src_validity.buffers[0].data != nullptr);
           ARROW_COMPILER_ASSUME(idx_validity.buffers[0].data != nullptr);
-          if (idx_validity.IsValid(position) && src_validity.IsValid(idx[position])) {
+          if (idx_validity.IsValid(position) &&
+              self->IsSrcValid(src_validity, idx, position)) {
             // index is not null && value is not null
             self->WriteValue(position);
             bit_util::SetBit(out_is_valid, position);
@@ -303,4 +316,142 @@ class Gather
   }
 };

+template <typename IndexCType>
+struct ChunkedValiditySpan {
+  const ChunkedArray& chunks_validity;
+  const IndexCType* chunk_index_vec;
+  const IndexCType* index_in_chunk_vec;
+  const bool may_have_nulls;
+
+  ChunkedValiditySpan(const ChunkedArray& chunks_validity,
+                      const IndexCType* chunk_index_vec,
+                      const IndexCType* index_in_chunk_vec)
+      : chunks_validity(chunks_validity),
+        chunk_index_vec(chunk_index_vec),
+        index_in_chunk_vec(index_in_chunk_vec),
+        may_have_nulls(chunks_validity.null_count() > 0) {}
+
+  bool MayHaveNulls() const { return may_have_nulls; }
+
+  bool IsValid(int64_t position) const {
+    auto chunk_index = chunk_index_vec[position];
+    auto index_in_chunk = index_in_chunk_vec[position];
+    return chunks_validity.chunk(static_cast<int>(chunk_index))
+        ->IsValid(index_in_chunk);
+  }
+};
+
+template <int kValueWidthInBits, typename IndexCType, bool kWithFactor>
+class GatherFromChunks
+    : public GatherBaseCRTP<
+          GatherFromChunks<kValueWidthInBits, IndexCType, kWithFactor>> {
+ private:
+  static_assert(!kWithFactor || kValueWidthInBits == 8,
+                "kWithFactor is only supported for kValueWidthInBits == 8");
+  static_assert(kValueWidthInBits == 1 || kValueWidthInBits % 8 == 0);
+  // kValueWidth should not be used if kValueWidthInBits == 1.
+  static constexpr int kValueWidth = kValueWidthInBits / 8;
+
+  // src_residual_bit_offsets_[i] is used to store the bit offset of the first
+  // byte (0-7) in src_chunks_[i] iff kValueWidthInBits == 1.
+  const int* src_residual_bit_offsets_ = NULLPTR;
+  // Pre-computed pointers to the start of the values in each chunk.
+  const uint8_t* const* src_chunks_;
+  // Number of indices resolved in chunk_index_vec_/index_in_chunk_vec_.
+  const int64_t idx_length_;
+  const IndexCType* chunk_index_vec_;
+  const IndexCType* index_in_chunk_vec_;
+
+  uint8_t* out_;
+  int64_t factor_;
+
+ public:
+  void WriteValue(int64_t position) {
+    auto chunk_index = chunk_index_vec_[position];
+    auto index_in_chunk = index_in_chunk_vec_[position];
+    auto* chunk = src_chunks_[chunk_index];
+    if constexpr (kValueWidthInBits == 1) {
+      auto src_offset = src_residual_bit_offsets_[chunk_index];
+      bit_util::SetBitTo(out_, position,
+                         bit_util::GetBit(chunk, src_offset + index_in_chunk));
+    } else if constexpr (kWithFactor) {
+      const int64_t scaled_factor = kValueWidth * factor_;
+      memcpy(out_ + position * scaled_factor, chunk + index_in_chunk * scaled_factor,
+             scaled_factor);
+    } else {
+      memcpy(out_ + position * kValueWidth, chunk + index_in_chunk * kValueWidth,
+             kValueWidth);
+    }
+  }
+
+  void WriteZero(int64_t position) {
+    if constexpr (kValueWidthInBits == 1) {
+      bit_util::ClearBit(out_, position);
+    } else if constexpr (kWithFactor) {
+      const int64_t scaled_factor = kValueWidth * factor_;
+      memset(out_ + position * scaled_factor, 0, scaled_factor);
+    } else {
+      memset(out_ + position * kValueWidth, 0, kValueWidth);
+    }
+  }
+
+  void WriteZeroSegment(int64_t position, int64_t block_length) {
+    if constexpr (kValueWidthInBits == 1) {
+      bit_util::SetBitsTo(out_, position, block_length, false);
+    } else if constexpr (kWithFactor) {
+      const int64_t scaled_factor = kValueWidth * factor_;
+      memset(out_ + position * scaled_factor, 0, block_length * scaled_factor);
+    } else {
+      memset(out_ + position * kValueWidth, 0, block_length * kValueWidth);
+    }
+  }
+
+  bool IsSrcValid(const ChunkedValiditySpan<IndexCType>& src_validity,
+                  const IndexCType* idx, int64_t position) const {
+    return src_validity.IsValid(position);
+  }
+
+ public:
+  GatherFromChunks(const int* src_residual_bit_offsets, const uint8_t* const* src_chunks,
+                   const int64_t idx_length, const IndexCType* chunk_index_vec,
+                   const IndexCType* index_in_chunk_vec, uint8_t* out, int64_t factor = 1)
+      : src_residual_bit_offsets_(src_residual_bit_offsets),
+        src_chunks_(src_chunks),
+        idx_length_(idx_length),
+        chunk_index_vec_(chunk_index_vec),
+        index_in_chunk_vec_(index_in_chunk_vec),
+        out_(out),
+        factor_(factor) {
+    assert(src_chunks && chunk_index_vec && index_in_chunk_vec && out);
+    if constexpr (kValueWidthInBits == 1) {
+      assert(src_residual_bit_offsets);
+    }
+    assert((kWithFactor || factor == 1) &&
+           "When kWithFactor is false, the factor is assumed to be 1 at compile time");
+  }
+
+  ARROW_FORCE_INLINE int64_t Execute() { return this->ExecuteNoNulls(idx_length_); }
+
+  /// \pre If kOutputIsZeroInitialized, then this->out_ has to be zero initialized.
+  /// \pre Bits in out_is_valid have to always be zero initialized.
+  /// \post The bits for the valid elements (and only those) are set in out_is_valid.
+  /// \post If !kOutputIsZeroInitialized, then positions in this->out_ containing null
+  ///       elements have 0s written to them. This might be less efficient than
+  ///       zero-initializing first and calling this->Execute() afterwards.
+  /// \return The number of valid elements in out.
+  template <bool kOutputIsZeroInitialized>
+  ARROW_FORCE_INLINE int64_t Execute(const ChunkedArray& src_validity,
+                                     const ArraySpan& idx_validity,
+                                     uint8_t* out_is_valid) {
+    assert(idx_length_ == idx_validity.length);
+    assert(out_is_valid);
+    assert(idx_validity.type->byte_width() == sizeof(IndexCType));
+    ChunkedValiditySpan src_validity_span{src_validity, chunk_index_vec_,
+                                          index_in_chunk_vec_};
+    assert(src_validity_span.MayHaveNulls() || idx_validity.MayHaveNulls());
+    // idx=NULLPTR because when it's passed to IsSrcValid() defined above, it's not used.
+    return this->template ExecuteWithNulls<kOutputIsZeroInitialized>(
+        src_validity_span, idx_length_, /*idx=*/NULLPTR, idx_validity, out_is_valid);
+  }
+};
+
 }  // namespace arrow::internal
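
The chunk_index_vec_/index_in_chunk_vec_ inputs consumed by GatherFromChunks
are expected to be precomputed by the take kernels from the logical take
indices. A sketch of that resolution step, assuming the arrow::ChunkResolver
API from arrow/chunk_resolver.h (which the patch includes; in some versions
the class lives in arrow::internal, and the helper name here is illustrative):

    #include <cstdint>
    #include "arrow/chunk_resolver.h"
    #include "arrow/chunked_array.h"

    // Resolve logical take indices into (chunk index, index-in-chunk) pairs,
    // the representation GatherFromChunks reads from.
    template <typename IndexCType>
    void ResolveIndices(const arrow::ChunkedArray& values, const IndexCType* idx,
                        int64_t idx_length, IndexCType* chunk_index_vec,
                        IndexCType* index_in_chunk_vec) {
      arrow::ChunkResolver resolver(values.chunks());
      for (int64_t i = 0; i < idx_length; ++i) {
        const auto loc = resolver.Resolve(static_cast<int64_t>(idx[i]));
        chunk_index_vec[i] = static_cast<IndexCType>(loc.chunk_index);
        index_in_chunk_vec[i] = static_cast<IndexCType>(loc.index_in_chunk);
      }
    }
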
From d4b48a38359d2ad8052bcff10b268035e56c7174 Mon Sep 17 00:00:00 2001
From: Felipe Oliveira Carvalho
Date: Sun, 5 May 2024 15:54:59 -0300
Subject: [PATCH 05/20] Take: Introduce ValuesSpan to delay dispatching on
 chunked-ness

---
 .../kernels/vector_selection_take_internal.cc | 87 ++++++++++++++-----
 1 file changed, 67 insertions(+), 20 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
index a770401fd34c2..7f9ce299044c9 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
@@ -326,6 +326,43 @@ namespace {

 using TakeState = OptionsWrapper<TakeOptions>;

+class ValuesSpan {
+ private:
+  const std::shared_ptr<ChunkedArray> chunked_ = nullptr;
+  const ArraySpan chunk0_;  // first chunk or the whole array
+
+ public:
+  explicit ValuesSpan(const std::shared_ptr<ChunkedArray> values)
+      : chunked_(std::move(values)), chunk0_{*values->chunk(0)->data()} {
+    DCHECK(chunked_);
+    DCHECK_GT(chunked_->num_chunks(), 0);
+  }
+
+  explicit ValuesSpan(const ArraySpan& values) : chunk0_(values) {}
+
+  bool is_chunked() const { return chunked_ != nullptr; }
+
+  const ChunkedArray& chunked_array() const {
+    DCHECK(is_chunked());
+    return *chunked_;
+  }
+
+  const ArraySpan& chunk0() const { return chunk0_; }
+
+  const ArraySpan& array() const {
+    DCHECK(!is_chunked());
+    return chunk0_;
+  }
+
+  const DataType* type() const { return chunk0_.type; }
+
+  int64_t length() const { return is_chunked() ? chunked_->length() : array().length; }
+
+  bool MayHaveNulls() const {
+    return is_chunked() ? chunked_->null_count() != 0 : array().MayHaveNulls();
+  }
+};
+
 // ----------------------------------------------------------------------
 // Implement optimized take for primitive types from boolean to
 // 1/2/4/8/16/32-byte C-type based types and fixed-size binary (0 or more
@@ -357,15 +394,22 @@ template
            (factor > 0 && kValueWidthInBits == 8 &&  // factors are used with bytes
             static_cast(factor * kValueWidthInBits) == bit_width));
 #endif
+    // XXX: support values.is_chunked() case
+    assert(!values.is_chunked());
+    return Exec(ctx, values.array(), indices, out_arr, factor);
+  }
+
+  static Status Exec(KernelContext* ctx, const ArraySpan& values,
+                     const ArraySpan& indices, ArrayData* out_arr, int64_t factor) {
     const bool out_has_validity = values.MayHaveNulls() || indices.MayHaveNulls();

     const uint8_t* src;
@@ -398,7 +442,7 @@ struct FixedWidthTakeImpl {
   };

 template