diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 6dc8358f502f5..678d121a05ab4 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -764,6 +764,7 @@ if(ARROW_COMPUTE) compute/kernels/scalar_arithmetic.cc compute/kernels/scalar_boolean.cc compute/kernels/scalar_compare.cc + compute/kernels/scalar_hash.cc compute/kernels/scalar_if_else.cc compute/kernels/scalar_nested.cc compute/kernels/scalar_random.cc diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 7c3bc46650e9f..022f13df8ebc0 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -927,6 +927,12 @@ Result MapLookup(const Datum& arg, MapLookupOptions options, ExecContext* return CallFunction("map_lookup", {arg}, &options, ctx); } +// ---------------------------------------------------------------------- +// Hash functions +Result Hash64(const Datum& input_array, ExecContext* ctx) { + return CallFunction("hash_64", {input_array}, ctx); +} + // ---------------------------------------------------------------------- } // namespace compute diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 947474e5962d0..ddde60f5f97e4 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -1718,5 +1718,21 @@ ARROW_EXPORT Result NanosecondsBetween(const Datum& left, const Datum& ri /// \note API not yet finalized ARROW_EXPORT Result MapLookup(const Datum& map, MapLookupOptions options, ExecContext* ctx = NULLPTR); + +/// \brief Construct a hash value for each row of the input. +/// +/// The result is an Array of length equal to the length of the input; however, the output +/// shall be a UInt64Array, with each element being a hash constructed from each row of +/// the input. 
If the input Array is a NestedArray, this means that each "attribute" or +/// "field" of the input NestedArray corresponding to the same "row" will collectively +/// produce a single uint64_t hash. At the moment, this function does not take options, +/// though these may be added in the future. +/// +/// \param[in] input_array input data to hash +/// \param[in] ctx function execution context, optional +/// \return elementwise hash values +ARROW_EXPORT +Result Hash64(const Datum& input_array, ExecContext* ctx = NULLPTR); + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 7c7b9c8b68d45..30756b572812c 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -73,6 +73,7 @@ add_arrow_compute_test(scalar_utility_test scalar_random_test.cc scalar_set_lookup_test.cc scalar_validity_test.cc + scalar_hash_test.cc EXTRA_LINK_LIBS arrow_compute_kernels_testing) @@ -87,6 +88,7 @@ add_arrow_benchmark(scalar_round_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_set_lookup_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_string_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_temporal_benchmark PREFIX "arrow-compute") +add_arrow_benchmark(scalar_hash_benchmark PREFIX "arrow-compute") # ---------------------------------------------------------------------- # Vector kernels diff --git a/cpp/src/arrow/compute/registry.cc b/cpp/src/arrow/compute/registry.cc index 0f535eb373269..5b4c33b369623 100644 --- a/cpp/src/arrow/compute/registry.cc +++ b/cpp/src/arrow/compute/registry.cc @@ -299,6 +299,7 @@ static std::unique_ptr CreateBuiltInRegistry() { RegisterScalarArithmetic(registry.get()); RegisterScalarBoolean(registry.get()); RegisterScalarComparison(registry.get()); + RegisterScalarHash(registry.get()); RegisterScalarIfElse(registry.get()); RegisterScalarNested(registry.get()); 
RegisterScalarRandom(registry.get()); // Nullary diff --git a/cpp/src/arrow/compute/registry_internal.h b/cpp/src/arrow/compute/registry_internal.h index cdc9f804e72f1..532c7fa9f76f0 100644 --- a/cpp/src/arrow/compute/registry_internal.h +++ b/cpp/src/arrow/compute/registry_internal.h @@ -30,6 +30,7 @@ void RegisterScalarBoolean(FunctionRegistry* registry); void RegisterScalarCast(FunctionRegistry* registry); void RegisterDictionaryDecode(FunctionRegistry* registry); void RegisterScalarComparison(FunctionRegistry* registry); +void RegisterScalarHash(FunctionRegistry* registry); void RegisterScalarIfElse(FunctionRegistry* registry); void RegisterScalarNested(FunctionRegistry* registry); void RegisterScalarRandom(FunctionRegistry* registry); // Nullary diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 4131bbdf6f912..c144b727a3db2 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1200,6 +1200,26 @@ Containment tests * \(8) Output is true iff :member:`MatchSubstringOptions::pattern` matches the corresponding input element at any position. + +Hash Functions +~~~~~~~~~~~~~~ + +Not to be confused with the "group by" functions, Hash functions produce an array of hash +values corresponding to the length of the input. Currently, these functions take a single +array as input. + ++-----------------------+-------+-----------------------------------+-------------+---------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=======================+=======+===================================+=============+===============+=======+ +| hash_64 | Unary | Any | UInt64 | | \(1) | ++-----------------------+-------+-----------------------------------+-------------+---------------+-------+ + +* \(1) The hashing algorithm is "xxHash-like", making some minor trade-offs in favor of + performance. 
Arrays containing nested types are recursively walked and flattened, so + that the fields or attributes corresponding to the same row are hashed and combined + into a single hash value. + + Categorizations ~~~~~~~~~~~~~~~