Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-41668: [C++] Support cast kernel from list-like to (large) string #41831

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
98 changes: 98 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_cast_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,102 @@ void AddBinaryToFixedSizeBinaryCast(CastFunction* func) {
AddBinaryToFixedSizeBinaryCast<FixedSizeBinaryType>(func);
}

// ----------------------------------------------------------------------
// List-like (List, LargeList, ListView, LargeListView, FixedSizeList, Map) to string

/// \brief Cast kernel that renders each list-like value as a string.
///
/// `O` is the output string type (it selects the builder), `I` is the input
/// list-like type (it selects the width of the offsets/sizes buffers).
///
/// A valid slot is rendered as `<type>[<elem>, <elem>, ...]`, where `<type>` is
/// `input.type->ToString(true)` and each element is its `Scalar::ToString()`
/// (or `null` for a null element). A null slot is rendered as the string "null".
template <typename O, typename I>
struct ListLikeToStringCastFunctor {
  using BuilderType = typename TypeTraits<O>::BuilderType;
  using offset_type = typename I::offset_type;

  /// Append element `j` of `values` to `ss`, preceded by ", " unless `j` is the
  /// first element of the current list (`start`).
  static Status AppendValue(const Array& values, std::stringstream& ss, int64_t j,
                            int64_t start) {
    if (j != start) {
      ss << ", ";
    }
    if (values.IsValid(j)) {
      std::shared_ptr<Scalar> value_scalar;
      RETURN_NOT_OK(values.GetScalar(j).Value(&value_scalar));
      ss << value_scalar->ToString();
    } else {
      ss << "null";
    }
    return Status::OK();
  }

  /// Span-based overload kept for compatibility with the previous signature.
  /// It materializes an Array on every call, so prefer the Array overload when
  /// appending many elements.
  static Status AppendValue(const ArraySpan& values, std::stringstream& ss, int64_t j,
                            int64_t start) {
    return AppendValue(*values.ToArray(), ss, j, start);
  }

  /// Kernel entry point: converts `batch[0]` (a list-like array) into a string
  /// array of the same length and stores it in `out`.
  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
    const ArraySpan& input = batch[0].array;
    const auto type_id = input.type->id();
    const bool is_fixed_size_list = type_id == Type::FIXED_SIZE_LIST;
    const bool is_list_view =
        type_id == Type::LIST_VIEW || type_id == Type::LARGE_LIST_VIEW;

    BuilderType builder(ctx->memory_pool());
    RETURN_NOT_OK(builder.Reserve(input.length));

    const std::string type_info = input.type->ToString(true);
    // Materialize the child array once instead of once per element.
    const std::shared_ptr<Array> values = input.child_data[0].ToArray();

    const offset_type* offsets = nullptr;
    const offset_type* sizes = nullptr;
    int64_t list_size = -1;
    if (is_fixed_size_list) {
      // Fixed-size lists carry no offsets buffer; extents derive from list_size.
      list_size = checked_cast<const FixedSizeListType&>(*input.type).list_size();
    } else {
      offsets = input.GetValues<offset_type>(1);
      if (is_list_view) {
        // List views store each list's length in a separate sizes buffer.
        sizes = input.GetValues<offset_type>(2);
      }
    }

    for (int64_t i = 0; i < input.length; ++i) {
      if (!input.IsValid(i)) {
        RETURN_NOT_OK(builder.Append("null"));
        continue;
      }

      std::stringstream ss;
      ss << type_info << "[";

      int64_t start, end;
      if (is_fixed_size_list) {
        // The parent's slice offset is not propagated to the child, so it has to
        // be applied here.
        start = (input.offset + i) * list_size;
        end = start + list_size;
      } else if (is_list_view) {
        // offsets[i + 1] is meaningless for views (and out of bounds for the last
        // slot): the extent is [offset, offset + size).
        start = offsets[i];
        end = start + sizes[i];
      } else {
        start = offsets[i];
        end = offsets[i + 1];
      }

      for (int64_t j = start; j < end; ++j) {
        RETURN_NOT_OK(AppendValue(*values, ss, j, start));
      }

      ss << "]";
      RETURN_NOT_OK(builder.Append(ss.str()));
    }

    std::shared_ptr<Array> output_array;
    RETURN_NOT_OK(builder.Finish(&output_array));
    out->value = output_array->data();
    return Status::OK();
  }
};

/// Register, on `func`, one list-like -> `OutType` string cast kernel for each of
/// List, LargeList, ListView, LargeListView, FixedSizeList and Map.
template <typename OutType>
void AddListLikeToStringCasts(CastFunction* func) {
  auto out_ty = TypeTraits<OutType>::type_singleton();

  // Every kernel is keyed on a single input type id and builds its own output,
  // so only the id and the exec function differ between registrations.
  auto register_kernel = [&](const Type::type in_id, auto exec) {
    DCHECK_OK(func->AddKernel(in_id, {InputType(in_id)}, out_ty, exec,
                              NullHandling::COMPUTED_NO_PREALLOCATE));
  };

  register_kernel(Type::LIST, ListLikeToStringCastFunctor<OutType, ListType>::Exec);
  register_kernel(Type::LARGE_LIST,
                  ListLikeToStringCastFunctor<OutType, LargeListType>::Exec);
  register_kernel(Type::LIST_VIEW,
                  ListLikeToStringCastFunctor<OutType, ListViewType>::Exec);
  register_kernel(Type::LARGE_LIST_VIEW,
                  ListLikeToStringCastFunctor<OutType, LargeListViewType>::Exec);
  register_kernel(Type::FIXED_SIZE_LIST,
                  ListLikeToStringCastFunctor<OutType, FixedSizeListType>::Exec);
  register_kernel(Type::MAP, ListLikeToStringCastFunctor<OutType, MapType>::Exec);
}

} // namespace

std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
Expand All @@ -528,6 +624,7 @@ std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
AddDecimalToStringCasts<StringType>(cast_string.get());
AddTemporalToStringCasts<StringType>(cast_string.get());
AddBinaryToBinaryCast<StringType>(cast_string.get());
AddListLikeToStringCasts<StringType>(cast_string.get());

auto cast_large_string =
std::make_shared<CastFunction>("cast_large_string", Type::LARGE_STRING);
Expand All @@ -536,6 +633,7 @@ std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
AddDecimalToStringCasts<LargeStringType>(cast_large_string.get());
AddTemporalToStringCasts<LargeStringType>(cast_large_string.get());
AddBinaryToBinaryCast<LargeStringType>(cast_large_string.get());
AddListLikeToStringCasts<LargeStringType>(cast_large_string.get());

auto cast_fsb =
std::make_shared<CastFunction>("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY);
Expand Down
167 changes: 141 additions & 26 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,20 @@ static std::shared_ptr<Array> MaskArrayWithNullsAt(std::shared_ptr<Array> input,
return MakeArray(masked);
}

// Fixture offering a helper that checks a cast to both string flavours.
class TestCastToString : public ::testing::Test {
 protected:
  // Builds an array of `src_type` from the JSON text `src_str`, casts it to utf8
  // and to large_utf8, and requires that each result has the requested type id
  // and that its ToString() rendering equals `expected_str`.
  static void CheckCastToString(const std::shared_ptr<DataType>& src_type,
                                const std::string& src_str,
                                const std::string& expected_str) {
    const std::shared_ptr<Array> input = ArrayFromJSON(src_type, src_str);
    const std::vector<std::shared_ptr<DataType>> string_types = {utf8(), large_utf8()};
    for (const std::shared_ptr<DataType>& target_type : string_types) {
      ASSERT_OK_AND_ASSIGN(auto result, Cast(*input, target_type));
      ASSERT_EQ(result->type()->id(), target_type->id());
      ASSERT_EQ(result->ToString(), expected_str);
    }
  }
};

TEST(Cast, CanCast) {
auto ExpectCanCast = [](std::shared_ptr<DataType> from,
std::vector<std::shared_ptr<DataType>> to_set,
Expand Down Expand Up @@ -2275,10 +2289,6 @@ TEST(Cast, BooleanToString) {
// Casting a list to a non-string primitive type is still unsupported.
// (list -> utf8 is no longer expected to fail: list-like to string kernels are
// registered by AddListLikeToStringCasts.)
TEST(Cast, ListToPrimitive) {
  ASSERT_RAISES(NotImplemented,
                Cast(*ArrayFromJSON(list(int8()), "[[1, 2], [3, 4]]"), uint8()));
}

using make_list_t = std::shared_ptr<DataType>(const std::shared_ptr<DataType>&);
Expand Down Expand Up @@ -2429,6 +2439,28 @@ TEST(Cast, FSLToList) {
CheckCast(fsl_int32, ArrayFromJSON(fixed_size_list(int16(), 1), "[[32689]]"), options);
}

// Checks the string rendering produced when casting fixed_size_list arrays to
// utf8 / large_utf8 (both are exercised by CheckCastToString).
// NOTE(review): the expected values are compared verbatim against the casted
// array's ToString() output, so whitespace inside the raw string literals is
// significant — confirm their indentation against the pretty-printer output.
TEST_F(TestCastToString, FSLToString) {
// Example with int32 list
// Covers fully valid lists, a null element inside a list, and a null list slot.
std::shared_ptr<DataType> fsl_type = fixed_size_list(int32(), 3);
const std::string fsl_json = R"([[1, 2, 3], [4, 5, 6], [7, null, 8], null])";
const std::string expected_str = R"([
"fixed_size_list<item: int32>[3][1, 2, 3]",
"fixed_size_list<item: int32>[3][4, 5, 6]",
"fixed_size_list<item: int32>[3][7, null, 8]",
"null"
])";
CheckCastToString(fsl_type, fsl_json, expected_str);

// Example with nested fixed_size_list<int32> of size 2
// Inner lists are rendered with their own type prefix; a null inner list is
// rendered as null.
fsl_type = fixed_size_list(fixed_size_list(int32(), 2), 2);
const std::string nested_fsl_json = R"([[[1, 2], [3, 4]], [[null, 5], null]])";
const std::string expected_nested_str = R"([
"fixed_size_list<item: fixed_size_list<item: int32>[2]>[2][fixed_size_list<item: int32>[2][1, 2], fixed_size_list<item: int32>[2][3, 4]]",
"fixed_size_list<item: fixed_size_list<item: int32>[2]>[2][fixed_size_list<item: int32>[2][null, 5], null]"
])";
CheckCastToString(fsl_type, nested_fsl_json, expected_nested_str);
}

TEST(Cast, ListToFSL) {
CheckCastList(list(int16()), fixed_size_list(int16(), 2),
"[[0, 1], [2, 3], null, [null, 5], null]");
Expand Down Expand Up @@ -2476,54 +2508,137 @@ TEST(Cast, ListToFSL) {
CastOptions::Safe(fixed_size_list(int32(), 3))));
}

TEST(Cast, CastMap) {
const std::string map_json =
"[[[\"x\", 1], [\"y\", 8], [\"z\", 9]], [[\"x\", 6]], [[\"y\", 36]]]";
const std::string map_json_nullable =
"[[[\"x\", 1], [\"y\", null], [\"z\", 9]], null, [[\"y\", 36]]]";
// Checks the string rendering produced when casting list, large_list, list_view
// and large_list_view arrays to utf8 / large_utf8 (both are exercised by
// CheckCastToString).
// NOTE(review): the expected values are compared verbatim against the casted
// array's ToString() output, so whitespace inside the raw string literals is
// significant — confirm their indentation against the pretty-printer output.
TEST_F(TestCastToString, ListToString) {
// Example with int32 list, large list, list view and large list view
// Each entry pairs a source type with the expected rendering of `list_json`.
const std::vector<std::pair<std::shared_ptr<DataType>, std::string>> list_types = {
{list(int32()), R"([
"list<item: int32>[1, 2]",
"list<item: int32>[3]",
"list<item: int32>[]"
])"},
{large_list(int32()), R"([
"large_list<item: int32>[1, 2]",
"large_list<item: int32>[3]",
"large_list<item: int32>[]"
])"},
{list_view(int32()), R"([
"list_view<item: int32>[1, 2]",
"list_view<item: int32>[3]",
"list_view<item: int32>[]"
])"},
{large_list_view(int32()), R"([
"large_list_view<item: int32>[1, 2]",
"large_list_view<item: int32>[3]",
"large_list_view<item: int32>[]"
])"}};

// Shared input: a two-element list, a one-element list and an empty list.
const std::string list_json = R"([[1, 2], [3], []])";

for (const auto& [list_type, expected_str] : list_types) {
CheckCastToString(list_type, list_json, expected_str);
}

// Example with nested list of int32. To avoid further code duplication, the code for
// large_list, list_view, and large_list_view is omitted.
const std::vector<std::pair<std::shared_ptr<DataType>, std::string>> nested_list_types =
{{list(list(int32())), R"([
"list<item: list<item: int32>>[list<item: int32>[1, 2], list<item: int32>[3]]",
"list<item: list<item: int32>>[list<item: int32>[4]]",
"list<item: list<item: int32>>[list<item: int32>[]]",
"list<item: list<item: int32>>[]"
])"}};

// Shared nested input, including an outer list holding one empty inner list and
// an empty outer list.
const std::string nested_list_json = R"([[[1, 2], [3]], [[4]], [[]], []])";

for (const auto& [nested_list_type, expected_nested_str] : nested_list_types) {
CheckCastToString(nested_list_type, nested_list_json, expected_nested_str);
}
}

class TestMapScalar : public TestCastToString {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just as TestMapScalar was written for MapType, I wanted to write similar classes for List and FixedSizeList.

However, I found it too extensive to address within this PR, so I only covered MapType.

It would be better to create a separate issue to handle this.

protected:
TestMapScalar() {
map_type = map(utf8(), int64());
map_json = R"([[["x", 1], ["y", 8], ["z", 9]], [["x", 6]], [["y", 36]]])";
map_json_nullable = R"([[["x", 1], ["y", null], ["z", 9]], null, [["y", 36]]])";
}

auto CheckMapCast = [map_json,
map_json_nullable](const std::shared_ptr<DataType>& dst_type) {
std::shared_ptr<DataType> src_type =
std::make_shared<MapType>(field("x", utf8(), false), field("y", int64()));
std::shared_ptr<Array> src = ArrayFromJSON(src_type, map_json);
void CheckMapCast(const std::shared_ptr<DataType>& dst_type) {
std::shared_ptr<Array> src = ArrayFromJSON(map_type, map_json);
std::shared_ptr<Array> dst = ArrayFromJSON(dst_type, map_json);
CheckCast(src, dst);

src = ArrayFromJSON(src_type, map_json_nullable);
src = ArrayFromJSON(map_type, map_json_nullable);
dst = ArrayFromJSON(dst_type, map_json_nullable);
CheckCast(src, dst);
};
}

protected:
std::shared_ptr<DataType> map_type;
std::string map_json;
std::string map_json_nullable;
};

// Verifies that a map cast can rename the key/value fields, including field
// names nested inside the value type. Casts that change the key/value types or
// target list / large_list are covered by the dedicated CastToMap, CastToList
// and CastToLargeList tests, so they are not repeated here.
TEST_F(TestMapScalar, RenameMap) {
  // Can rename fields
  CheckMapCast(std::make_shared<MapType>(field("a", utf8(), false), field("b", int64())));

  // Can rename nested field names
  std::shared_ptr<DataType> src_type = map(utf8(), field("x", list(field("a", int64()))));
  std::shared_ptr<DataType> dst_type = map(utf8(), field("y", list(field("b", int64()))));

  // Each array is built exactly once; the previous duplicated ArrayFromJSON
  // statements produced results that were discarded.
  std::shared_ptr<Array> src =
      ArrayFromJSON(src_type, R"([[["1", [1,2,3]]], [["2", [4,5,6]]]])");
  std::shared_ptr<Array> dst =
      ArrayFromJSON(dst_type, R"([[["1", [1,2,3]]], [["2", [4,5,6]]]])");

  CheckCast(src, dst);
}

// A map can be cast to another map whose key and value types differ.
TEST_F(TestMapScalar, CastToMap) {
  const auto target_type = map(large_utf8(), field("y", int32()));
  CheckMapCast(target_type);
}

// A map can be cast to a list<struct<keys=.., values=..>>.
TEST_F(TestMapScalar, CastToList) {
  const auto entry_type = struct_({field("a", utf8()), field("b", int64())});
  CheckMapCast(list(entry_type));
}

// Cannot cast to a list<struct<[fields]>> if there are not exactly 2 fields
dst_type = list(
// A map can be cast to a large_list<struct<keys=.., values=..>>.
TEST_F(TestMapScalar, CastToLargeList) {
  const auto entry_type = struct_({field("a", utf8()), field("b", int64())});
  CheckMapCast(large_list(entry_type));
}

// Casting a map to a list<struct> whose struct has three fields must fail with
// a TypeError mentioning the two-field requirement.
TEST_F(TestMapScalar, CastToListWithInvalidFields) {
  const std::shared_ptr<DataType> nested_map_type =
      map(utf8(), field("x", list(field("a", int64()))));
  const std::shared_ptr<Array> input =
      ArrayFromJSON(nested_map_type, R"([[["1", [1,2,3]]], [["2", [4,5,6]]]])");

  const std::shared_ptr<DataType> three_field_struct =
      struct_({field("key", int32()), field("value", int64()), field("extra", int64())});
  const std::shared_ptr<DataType> invalid_target = list(three_field_struct);

  EXPECT_RAISES_WITH_MESSAGE_THAT(
      TypeError,
      ::testing::HasSubstr("must be cast to a list<struct> with exactly two fields"),
      Cast(input, invalid_target));
}

// Checks the string rendering produced when casting the fixture's map arrays to
// utf8 / large_utf8 (both are exercised by CheckCastToString).
// NOTE(review): the expected values are compared verbatim against the casted
// array's ToString() output, so whitespace inside the raw string literals is
// significant — confirm their indentation against the pretty-printer output.
TEST_F(TestMapScalar, CastToString) {
// Fully valid input: every map entry is rendered as a {key, value} struct.
const std::string expected_str = R"([
"map<string, int64>[{key:string = x, value:int64 = 1}, {key:string = y, value:int64 = 8}, {key:string = z, value:int64 = 9}]",
"map<string, int64>[{key:string = x, value:int64 = 6}]",
"map<string, int64>[{key:string = y, value:int64 = 36}]"
])";
CheckCastToString(map_type, map_json, expected_str);

// Input with a null map value and a null map slot; both are rendered as null.
const std::string expected_str_nullable = R"([
"map<string, int64>[{key:string = x, value:int64 = 1}, {key:string = y, value:int64 = null}, {key:string = z, value:int64 = 9}]",
"null",
"map<string, int64>[{key:string = y, value:int64 = 36}]"
])";
CheckCastToString(map_type, map_json_nullable, expected_str_nullable);
}

static void CheckStructToStruct(
const std::vector<std::shared_ptr<DataType>>& value_types) {
for (const auto& src_value_type : value_types) {
Expand Down
16 changes: 6 additions & 10 deletions cpp/src/arrow/scalar_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1292,11 +1292,9 @@ class TestListLikeScalar : public ::testing::Test {
auto invalid_cast_type = fixed_size_list(value_->type(), 5);
CheckListCastError(scalar, invalid_cast_type);

// Cast() function doesn't support casting list-like to string, use Scalar::CastTo()
// instead.
ASSERT_OK_AND_ASSIGN(auto casted_str, scalar.CastTo(utf8()));
ASSERT_EQ(casted_str->type->id(), utf8()->id());
ASSERT_EQ(casted_str->ToString(), scalar.ToString());
ASSERT_OK_AND_ASSIGN(auto casted_str, Cast(scalar, utf8()));
ASSERT_EQ(casted_str.scalar()->type->id(), utf8()->id());
ASSERT_EQ(casted_str.scalar()->ToString(), scalar.ToString());
}

protected:
Expand Down Expand Up @@ -1337,11 +1335,9 @@ TEST(TestFixedSizeListScalar, Cast) {
auto invalid_cast_type = fixed_size_list(int16(), 4);
CheckListCastError(scalar, invalid_cast_type);

// Cast() function doesn't support casting list-like to string, use Scalar::CastTo()
// instead.
ASSERT_OK_AND_ASSIGN(auto casted_str, scalar.CastTo(utf8()));
ASSERT_EQ(casted_str->type->id(), utf8()->id());
ASSERT_EQ(casted_str->ToString(), scalar.ToString());
ASSERT_OK_AND_ASSIGN(auto casted_str, Cast(scalar, utf8()));
ASSERT_EQ(casted_str.scalar()->type->id(), utf8()->id());
ASSERT_EQ(casted_str.scalar()->ToString(), scalar.ToString());
}

TEST(TestMapScalar, Basics) {
Expand Down
Loading