From 2459f61da1cca8c55480b63436efe39e524c723a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 14 Aug 2024 13:58:12 -0700 Subject: [PATCH 01/58] Rename function and class containing `from_json` Signed-off-by: Nghia Truong --- src/main/cpp/CMakeLists.txt | 3 +- src/main/cpp/src/JSONUtilsJni.cpp | 14 +++++ src/main/cpp/src/MapUtilsJni.cpp | 36 ------------ .../cpp/src/{map_utils.hpp => from_json.hpp} | 2 +- .../{map_utils.cu => from_json_to_raw_map.cu} | 10 ++-- ...bug.cuh => from_json_to_raw_map_debug.cuh} | 0 .../nvidia/spark/rapids/jni/JSONUtils.java | 33 ++++++++++- .../com/nvidia/spark/rapids/jni/MapUtils.java | 55 ------------------- ...ilsTest.java => FromJsonToRawMapTest.java} | 6 +- 9 files changed, 54 insertions(+), 105 deletions(-) delete mode 100644 src/main/cpp/src/MapUtilsJni.cpp rename src/main/cpp/src/{map_utils.hpp => from_json.hpp} (95%) rename src/main/cpp/src/{map_utils.cu => from_json_to_raw_map.cu} (98%) rename src/main/cpp/src/{map_utils_debug.cuh => from_json_to_raw_map_debug.cuh} (100%) delete mode 100644 src/main/java/com/nvidia/spark/rapids/jni/MapUtils.java rename src/test/java/com/nvidia/spark/rapids/jni/{MapUtilsTest.java => FromJsonToRawMapTest.java} (95%) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index c748045ed8..08ef76aa4b 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -191,7 +191,6 @@ add_library( src/HashJni.cpp src/HistogramJni.cpp src/JSONUtilsJni.cpp - src/MapUtilsJni.cpp src/NativeParquetJni.cpp src/ParseURIJni.cpp src/RegexRewriteUtilsJni.cpp @@ -208,9 +207,9 @@ add_library( src/cast_string_to_float.cu src/datetime_rebase.cu src/decimal_utils.cu + src/from_json_to_raw_map.cu src/get_json_object.cu src/histogram.cu - src/map_utils.cu src/murmur_hash.cu src/parse_uri.cu src/regex_rewrite_utils.cu diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 15a6d9cf21..24e3648769 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -15,6 +15,7 @@ */ #include "cudf_jni_apis.hpp" +#include "from_json.hpp" #include "get_json_object.hpp" #include @@ -139,4 +140,17 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_getJsonObjectMultiplePaths(JNIEnv* en } CATCH_STD(env, 0); } + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_extractRawMapFromJsonString( + JNIEnv* env, jclass, jlong input_handle) +{ + JNI_NULL_CHECK(env, input_handle, "json_column_handle is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_handle); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::from_json_to_raw_map(*input).release()); + } + CATCH_STD(env, 0); +} } diff --git a/src/main/cpp/src/MapUtilsJni.cpp b/src/main/cpp/src/MapUtilsJni.cpp deleted file mode 100644 index 0fc5f3c280..0000000000 --- a/src/main/cpp/src/MapUtilsJni.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "map_utils.hpp" - -#include -#include - -extern "C" { - -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_MapUtils_extractRawMapFromJsonString( - JNIEnv* env, jclass, jlong input_handle) -{ - JNI_NULL_CHECK(env, input_handle, "json_column_handle is null", 0); - - try { - cudf::jni::auto_set_device(env); - auto const input = reinterpret_cast(input_handle); - return cudf::jni::ptr_as_jlong(spark_rapids_jni::from_json(*input).release()); - } - CATCH_STD(env, 0); -} -} diff --git a/src/main/cpp/src/map_utils.hpp b/src/main/cpp/src/from_json.hpp similarity index 95% rename from src/main/cpp/src/map_utils.hpp rename to src/main/cpp/src/from_json.hpp index 96ba6f7e9b..a253926be9 100644 --- a/src/main/cpp/src/map_utils.hpp +++ b/src/main/cpp/src/from_json.hpp @@ -26,7 +26,7 @@ namespace spark_rapids_jni { -std::unique_ptr from_json( +std::unique_ptr from_json_to_raw_map( cudf::column_view const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/from_json_to_raw_map.cu similarity index 98% rename from src/main/cpp/src/map_utils.cu rename to src/main/cpp/src/from_json_to_raw_map.cu index ebb12eee93..3c5a1adedd 100644 --- a/src/main/cpp/src/map_utils.cu +++ b/src/main/cpp/src/from_json_to_raw_map.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "map_utils_debug.cuh" +#include "from_json_to_raw_map_debug.cuh" #include #include @@ -71,7 +71,7 @@ rmm::device_uvector unify_json_strings(cudf::column_view const& input, auto const input_scv = cudf::strings_column_view{input}; auto const chars_size = input_scv.chars_size(stream); auto const output_size = - 2l + // two extra bracket characters '[' and ']' + 2l + // two extra bracket characters '[' and ']' static_cast(chars_size) + static_cast(input.size() - 1) + // append `,` character between input rows static_cast(input.null_count()) * 2l; // replace null with "{}" @@ -641,9 +641,9 @@ rmm::device_uvector compute_list_offsets( } // namespace -std::unique_ptr from_json(cudf::column_view const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr from_json_to_raw_map(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.type().id() == cudf::type_id::STRING, "Invalid input format"); diff --git a/src/main/cpp/src/map_utils_debug.cuh b/src/main/cpp/src/from_json_to_raw_map_debug.cuh similarity index 100% rename from src/main/cpp/src/map_utils_debug.cuh rename to src/main/cpp/src/from_json_to_raw_map_debug.cuh diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 001beea59c..ac41c15528 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -40,16 +40,16 @@ public static class PathInstructionJni { private final int index; public PathInstructionJni(PathInstructionType type, String name, long index) { - this.type = (byte)type.ordinal(); + this.type = (byte) type.ordinal(); this.name = name; if (index > Integer.MAX_VALUE) { throw new IllegalArgumentException("index is too large " + index); } - this.index = (int)index; + this.index = (int) index; } public PathInstructionJni(PathInstructionType type, String name, int index) { - this.type = (byte)type.ordinal(); + this.type = (byte) type.ordinal(); this.name = 
name; this.index = index; } @@ -137,6 +137,30 @@ public static ColumnVector[] getJsonObjectMultiplePaths(ColumnVector input, return ret; } + + /** + * Extract key-value pairs for each output map from the given json strings. These key-value are + * copied directly as substrings of the input without any type conversion. + *
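+ * For example (illustrative usage only; the input row below is made up):
+ *   try (ColumnVector json = ColumnVector.fromStrings("{\"a\": 1, \"b\": 2.5}");
+ *        ColumnVector map = JSONUtils.extractRawMapFromJsonString(json)) {
+ *     // The single output row contains the raw entries ("a", "1") and ("b", "2.5").
+ *   }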

+ * Since there is no validity check, the output of this function may differ from
+ * what is generated by Spark's `from_json` function. Situations that can lead to
+ * different/incorrect outputs may include:
+ * - The value in the input json string is invalid, such as an 'abc' value for an integer key.
+ * - The value string is in a non-normalized format for a floating-point type, such as '1.00000'.
+ *

+ * The output of these situations should all be NULL or a value '1.0', respectively. However, this + * function will just simply copy the input value strings to the output. + * + * @param jsonColumn The input strings column in which each row specifies a json object. + * @return A map column (i.e., a column of type {@code List>}) in + * which the key-value pairs are extracted directly from the input json strings. + */ + public static ColumnVector extractRawMapFromJsonString(ColumnView jsonColumn) { + assert jsonColumn.getType().equals(DType.STRING) : "Input type must be String"; + return new ColumnVector(extractRawMapFromJsonString(jsonColumn.getNativeView())); + } + + private static native int getMaxJSONPathDepth(); private static native long getJsonObject(long input, @@ -151,4 +175,7 @@ private static native long[] getJsonObjectMultiplePaths(long input, int[] pathOffsets, long memoryBudgetBytes, int parallelOverride); + + + private static native long extractRawMapFromJsonString(long jsonColumnHandle); } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/MapUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/MapUtils.java deleted file mode 100644 index 140455b462..0000000000 --- a/src/main/java/com/nvidia/spark/rapids/jni/MapUtils.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.jni; - -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.ColumnView; -import ai.rapids.cudf.DType; -import ai.rapids.cudf.NativeDepsLoader; - -public class MapUtils { - static { - NativeDepsLoader.loadNativeDeps(); - } - - - /** - * Extract key-value pairs for each output map from the given json strings. These key-value are - * copied directly as substrings of the input without any type conversion. - *

- * Since there is not any validity check, the output of this function may be different from - * what generated by Spark's `from_json` function. Situations that can lead to - * different/incorrect outputs may include:
- * - The value in the input json string is invalid, such as 'abc' value for an integer key.
- * - The value string can be non-clean format for floating-point type, such as '1.00000'. - *

- * The output of these situations should all be NULL or a value '1.0', respectively. However, this - * function will just simply copy the input value strings to the output. - * - * @param jsonColumn The input strings column in which each row specifies a json object. - * @return A map column (i.e., a column of type {@code List>}) in - * which the key-value pairs are extracted directly from the input json strings. - */ - public static ColumnVector extractRawMapFromJsonString(ColumnView jsonColumn) { - assert jsonColumn.getType().equals(DType.STRING) : "Input type must be String"; - return new ColumnVector(extractRawMapFromJsonString(jsonColumn.getNativeView())); - } - - - private static native long extractRawMapFromJsonString(long jsonColumnHandle); - -} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/MapUtilsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/FromJsonToRawMapTest.java similarity index 95% rename from src/test/java/com/nvidia/spark/rapids/jni/MapUtilsTest.java rename to src/test/java/com/nvidia/spark/rapids/jni/FromJsonToRawMapTest.java index 773ef7ac37..47259dac48 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/MapUtilsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/FromJsonToRawMapTest.java @@ -23,7 +23,7 @@ import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; -public class MapUtilsTest { +public class FromJsonToRawMapTest { @Test void testFromJsonSimpleInput() { @@ -36,7 +36,7 @@ void testFromJsonSimpleInput() { try (ColumnVector input = ColumnVector.fromStrings(jsonString1, jsonString2, null, jsonString3); - ColumnVector outputMap = MapUtils.extractRawMapFromJsonString(input); + ColumnVector outputMap = JSONUtils.extractRawMapFromJsonString(input); ColumnVector expectedKeys = ColumnVector.fromStrings("Zipcode", "ZipCodeType", "City", "State", "category", "index", "author", "title", "price"); @@ -65,7 +65,7 @@ void testFromJsonWithUTF8() { try (ColumnVector input = ColumnVector.fromStrings(jsonString1, jsonString2, null, jsonString3); - ColumnVector outputMap = MapUtils.extractRawMapFromJsonString(input); + ColumnVector outputMap = JSONUtils.extractRawMapFromJsonString(input); ColumnVector expectedKeys = ColumnVector.fromStrings("Zipc\u00f3de", "Z\u00edpCodeTyp" + "\u00e9", "City", "St\u00e2te", "Zipc\u00f3de", "Z\u00edpCodeTyp\u00e9", From 3b1e06795f0929fb3a1bfa9f741265888de7ac5a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 14 Aug 2024 14:03:27 -0700 Subject: [PATCH 02/58] Fix copyright year Signed-off-by: Nghia Truong --- .../java/com/nvidia/spark/rapids/jni/FromJsonToRawMapTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/com/nvidia/spark/rapids/jni/FromJsonToRawMapTest.java b/src/test/java/com/nvidia/spark/rapids/jni/FromJsonToRawMapTest.java index 47259dac48..8edff2f4c8 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/FromJsonToRawMapTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/FromJsonToRawMapTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 1959b87584f27fad3bcaf22f6a68119e32e2e6af Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 14 Aug 2024 14:09:22 -0700 Subject: [PATCH 03/58] Fix style Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_raw_map.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/cpp/src/from_json_to_raw_map.cu b/src/main/cpp/src/from_json_to_raw_map.cu index 3c5a1adedd..b747c4bb77 100644 --- a/src/main/cpp/src/from_json_to_raw_map.cu +++ b/src/main/cpp/src/from_json_to_raw_map.cu @@ -71,7 +71,7 @@ rmm::device_uvector unify_json_strings(cudf::column_view const& input, auto const input_scv = cudf::strings_column_view{input}; auto const chars_size = input_scv.chars_size(stream); auto const output_size = - 2l + // two extra bracket characters '[' and ']' + 2l + // two extra bracket characters '[' and ']' static_cast(chars_size) + static_cast(input.size() - 1) + // append `,` character between input rows static_cast(input.null_count()) * 2l; // replace null with "{}" From 2f83c57da7ca3b2358cde678231714f7dd9890b9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 14 Aug 2024 14:27:26 -0700 Subject: [PATCH 04/58] Rename parameter Signed-off-by: Nghia Truong --- .../java/com/nvidia/spark/rapids/jni/JSONUtils.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index ac41c15528..3a7c4a6a53 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -151,13 +151,13 @@ public static ColumnVector[] getJsonObjectMultiplePaths(ColumnVector input, * The output of these situations should all be NULL or a value '1.0', respectively. However, this * function will just simply copy the input value strings to the output. * - * @param jsonColumn The input strings column in which each row specifies a json object. + * @param input The input strings column in which each row specifies a json object * @return A map column (i.e., a column of type {@code List>}) in - * which the key-value pairs are extracted directly from the input json strings. 
+ * which the key-value pairs are extracted directly from the input json strings */ - public static ColumnVector extractRawMapFromJsonString(ColumnView jsonColumn) { - assert jsonColumn.getType().equals(DType.STRING) : "Input type must be String"; - return new ColumnVector(extractRawMapFromJsonString(jsonColumn.getNativeView())); + public static ColumnVector extractRawMapFromJsonString(ColumnView input) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new ColumnVector(extractRawMapFromJsonString(input.getNativeView())); } @@ -177,5 +177,5 @@ private static native long[] getJsonObjectMultiplePaths(long input, int parallelOverride); - private static native long extractRawMapFromJsonString(long jsonColumnHandle); + private static native long extractRawMapFromJsonString(long input); } From 8f1da4bcb7802b123bab36ecf09c959236626d61 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 14 Aug 2024 14:38:28 -0700 Subject: [PATCH 05/58] Change parameter type Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 9 +++++---- src/main/cpp/src/from_json.hpp | 4 ++-- src/main/cpp/src/from_json_to_raw_map.cu | 17 +++++++---------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 24e3648769..5a0c5dd341 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -142,14 +142,15 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_getJsonObjectMultiplePaths(JNIEnv* en } JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_extractRawMapFromJsonString( - JNIEnv* env, jclass, jlong input_handle) + JNIEnv* env, jclass, jlong j_input) { - JNI_NULL_CHECK(env, input_handle, "json_column_handle is null", 0); + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); try { cudf::jni::auto_set_device(env); - auto const input = reinterpret_cast(input_handle); - return cudf::jni::ptr_as_jlong(spark_rapids_jni::from_json_to_raw_map(*input).release()); + auto const input_cv = reinterpret_cast(j_input); + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::from_json_to_raw_map(cudf::strings_column_view{*input_cv}).release()); } CATCH_STD(env, 0); } diff --git a/src/main/cpp/src/from_json.hpp b/src/main/cpp/src/from_json.hpp index a253926be9..75fc3bc103 100644 --- a/src/main/cpp/src/from_json.hpp +++ b/src/main/cpp/src/from_json.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -27,7 +27,7 @@ namespace spark_rapids_jni { std::unique_ptr from_json_to_raw_map( - cudf::column_view const& input, + cudf::strings_column_view const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/cpp/src/from_json_to_raw_map.cu b/src/main/cpp/src/from_json_to_raw_map.cu index b747c4bb77..6396a110ba 100644 --- a/src/main/cpp/src/from_json_to_raw_map.cu +++ b/src/main/cpp/src/from_json_to_raw_map.cu @@ -59,7 +59,7 @@ namespace { // 1. Append one comma character (',') to the end of each input string, except the last one. // 2. Concatenate all input strings into one string. // 3. Add a pair of bracket characters ('[' and ']') to the beginning and the end of the output. 
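// For example (an illustrative case with three input rows, the second one being null):
//   {"a":1}   null   {"b":2}
// would be unified into the single string: [{"a":1},{},{"b":2}]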
-rmm::device_uvector unify_json_strings(cudf::column_view const& input, +rmm::device_uvector unify_json_strings(cudf::strings_column_view const& input, rmm::cuda_stream_view stream) { if (input.is_empty()) { @@ -67,11 +67,10 @@ rmm::device_uvector unify_json_strings(cudf::column_view const& input, std::vector{'[', ']'}, stream, rmm::mr::get_current_device_resource()); } - auto const d_strings = cudf::column_device_view::create(input, stream); - auto const input_scv = cudf::strings_column_view{input}; - auto const chars_size = input_scv.chars_size(stream); + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto const chars_size = input.chars_size(stream); auto const output_size = - 2l + // two extra bracket characters '[' and ']' + 2l + // two extra bracket characters '[' and ']' static_cast(chars_size) + static_cast(input.size() - 1) + // append `,` character between input rows static_cast(input.null_count()) * 2l; // replace null with "{}" @@ -81,7 +80,7 @@ rmm::device_uvector unify_json_strings(cudf::column_view const& input, "The input json column is too large and causes overflow."); auto const joined_input = cudf::strings::detail::join_strings( - input_scv, + input, cudf::string_scalar(","), // append `,` character between the input rows cudf::string_scalar("{}"), // replacement for null rows stream, @@ -641,12 +640,10 @@ rmm::device_uvector compute_list_offsets( } // namespace -std::unique_ptr from_json_to_raw_map(cudf::column_view const& input, +std::unique_ptr from_json_to_raw_map(cudf::strings_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(input.type().id() == cudf::type_id::STRING, "Invalid input format"); - // Firstly, concatenate all the input json strings into one giant input json string. // When testing/debugging, the output can be validated using // https://jsonformatter.curiousconcept.com. 
@@ -718,7 +715,7 @@ std::unique_ptr from_json_to_raw_map(cudf::column_view const& inpu std::move(offsets), std::move(structs_col), input.null_count(), - cudf::detail::copy_bitmask(input, stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } From 515bb6b559b3a7801ac8d96fc52513e47a8ccafb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 14 Aug 2024 14:39:23 -0700 Subject: [PATCH 06/58] Fix style Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_raw_map.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/cpp/src/from_json_to_raw_map.cu b/src/main/cpp/src/from_json_to_raw_map.cu index 6396a110ba..73c2c4b559 100644 --- a/src/main/cpp/src/from_json_to_raw_map.cu +++ b/src/main/cpp/src/from_json_to_raw_map.cu @@ -70,7 +70,7 @@ rmm::device_uvector unify_json_strings(cudf::strings_column_view const& in auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto const chars_size = input.chars_size(stream); auto const output_size = - 2l + // two extra bracket characters '[' and ']' + 2l + // two extra bracket characters '[' and ']' static_cast(chars_size) + static_cast(input.size() - 1) + // append `,` character between input rows static_cast(input.null_count()) * 2l; // replace null with "{}" From c7d5ad95da31b6bd05147447d715848798767684 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 14 Aug 2024 15:52:03 -0700 Subject: [PATCH 07/58] Exclude the new files from compiling Signed-off-by: Nghia Truong --- src/main/cpp/CMakeLists.txt | 1 + src/main/cpp/src/from_json.hpp | 5 + src/main/cpp/src/from_json_to_struct.cu | 385 ++++++++++++++++++++++++ 3 files changed, 391 insertions(+) create mode 100644 src/main/cpp/src/from_json_to_struct.cu diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 08ef76aa4b..9a01b513bb 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -208,6 +208,7 @@ add_library( src/datetime_rebase.cu src/decimal_utils.cu src/from_json_to_raw_map.cu + #src/from_json_to_struct.cu src/get_json_object.cu src/histogram.cu src/murmur_hash.cu diff --git a/src/main/cpp/src/from_json.hpp b/src/main/cpp/src/from_json.hpp index 75fc3bc103..0c4445a9db 100644 --- a/src/main/cpp/src/from_json.hpp +++ b/src/main/cpp/src/from_json.hpp @@ -31,4 +31,9 @@ std::unique_ptr from_json_to_raw_map( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +std::unique_ptr from_json_to_struct( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/from_json_to_struct.cu b/src/main/cpp/src/from_json_to_struct.cu new file mode 100644 index 0000000000..4d248f91e7 --- /dev/null +++ b/src/main/cpp/src/from_json_to_struct.cu @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "get_json_object.hpp" +#include "json_parser.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +namespace spark_rapids_jni { + +namespace detail { +/** + * @brief TODO + */ +template +__launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL + void from_json_kernel(cudf::column_device_view input, std::size_t num_threads_per_row) +{ + auto const tidx = cudf::detail::grid_1d::global_thread_id(); + auto const row_idx = tidx / num_threads_per_row; + if (row_idx >= input.size()) { return; } + + auto const path_idx = tidx % num_threads_per_row; + if (path_idx >= path_data.size()) { return; } + + auto const& path = path_data[path_idx]; + char* const dst = path.out_buf + path.offsets[row_idx]; + bool is_valid = false; + cudf::size_type out_size = 0; + + auto const str = input.element(row_idx); + if (str.size_bytes() > 0) { + json_parser p{char_range{str}}; + thrust::tie(is_valid, out_size) = + evaluate_path(p, path.path_commands, dst, max_path_depth_exceeded); + + // We did not terminate the `evaluate_path` function early to reduce complexity of the code. + // Instead, if max depth was encountered, we've just continued the evaluation until here + // then discard the output entirely. + if (p.max_nesting_depth_exceeded()) { + *max_path_depth_exceeded = 1; + return; + } + + auto const max_size = path.offsets[row_idx + 1] - path.offsets[row_idx]; + if (out_size > max_size) { *(path.has_out_of_bound) = 1; } + } + + // Write out `nullptr` in the output string_view to indicate that the output is a null. + // The situation `out_stringviews == nullptr` should only happen if the kernel is launched a + // second time due to out-of-bound write in the first launch. + if (path.out_stringviews) { + path.out_stringviews[row_idx] = {is_valid ? dst : nullptr, out_size}; + } +} + +/** + * @brief A utility class to launch the main kernel. + */ +struct kernel_launcher { + static void exec(cudf::column_device_view const& input, + cudf::device_span path_data, + int8_t* max_path_depth_exceeded, + rmm::cuda_stream_view stream) + { + // The optimal values for block_size and min_block_per_sm were found through testing, + // which are either 128-8 or 256-4. The pair 128-8 seems a bit better. + static constexpr int block_size = 128; + static constexpr int min_block_per_sm = 8; + + // The number of threads for processing one input row is at least one warp. 
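+ // For example (illustrative numbers): with 40 paths and a warp size of 32, this rounds
+ // up to 64 threads per row (two warps); the trailing 24 threads fail the
+ // `path_idx >= path_data.size()` check above and simply return early.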
+ auto const num_threads_per_row = + cudf::util::div_rounding_up_safe(path_data.size(), + static_cast(cudf::detail::warp_size)) * + cudf::detail::warp_size; + auto const num_blocks = cudf::util::div_rounding_up_safe(num_threads_per_row * input.size(), + static_cast(block_size)); + get_json_object_kernel + <<>>( + input, path_data, num_threads_per_row, max_path_depth_exceeded); + } +}; + +int64_t calc_scratch_size(cudf::strings_column_view const& input, + cudf::detail::input_offsetalator const& in_offsets, + rmm::cuda_stream_view stream) +{ + auto const max_row_size = thrust::transform_reduce( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + cuda::proclaim_return_type( + [in_offsets] __device__(auto const idx) { return in_offsets[idx + 1] - in_offsets[idx]; }), + int64_t{0}, + thrust::maximum{}); + + // We will use scratch buffers to store the output strings without knowing their sizes. + // Since we do not know their sizes, we need to allocate the buffer a bit larger than the input + // size so that we will not write output strings into an out-of-bound position. + // Checking out-of-bound needs to be performed in the main kernel to make sure we will not have + // data corruption. + auto const scratch_size = [&, max_row_size = max_row_size] { + // Pad the scratch buffer by an additional size that is a multiple of max row size. + auto constexpr padding_rows = 10; + return input.chars_size(stream) + max_row_size * padding_rows; + }(); + return scratch_size; +} + +/** + * @brief Error handling using error markers gathered after kernel launch. + * + * If the input JSON has nesting depth exceeds the maximum allowed value, an exception will be + * thrown as it is unacceptable. Otherwise, out of bound write is checked and returned. + * + * @param error_check The array of markers to check for error + * @return A boolean value indicating if there is any out of bound write + */ +bool check_error(cudf::detail::host_vector const& error_check) +{ + // The last value is to mark if nesting depth has exceeded. + CUDF_EXPECTS(error_check.back() == 0, + "The processed input has nesting depth exceeds depth limit."); + + // Do not use parallel check since we do not have many elements. + // The last element is not related, but its value is already `0` thus just check until + // the end of the array for simplicity. + return std::none_of( + error_check.cbegin(), error_check.cend(), [](auto const val) { return val != 0; }); +} + +std::vector> get_json_object_batch( + cudf::column_device_view const& input, + cudf::detail::input_offsetalator const& in_offsets, + std::vector const>> const& + json_paths, + int64_t scratch_size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const [d_json_paths, h_json_paths, d_inst_names, h_inst_names] = + construct_path_commands(json_paths, stream); + + auto const num_outputs = json_paths.size(); + std::vector> output; + + // The error check array contains markers denoting if there is any out-of-bound write occurs + // (first `num_outputs` elements), or if the nesting depth exceeded its limits (the last element). 
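+ // For example, with num_outputs == 3 the markers are laid out as
+ // [oob_path_0, oob_path_1, oob_path_2, max_path_depth_exceeded].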
+ rmm::device_uvector d_error_check(num_outputs + 1, stream); + auto const d_max_path_depth_exceeded = d_error_check.data() + num_outputs; + + std::vector> scratch_buffers; + std::vector>> out_stringviews; + std::vector h_path_data; + scratch_buffers.reserve(json_paths.size()); + out_stringviews.reserve(json_paths.size()); + h_path_data.reserve(json_paths.size()); + + for (std::size_t idx = 0; idx < num_outputs; ++idx) { + auto const& path = json_paths[idx]; + if (path.size() > MAX_JSON_PATH_DEPTH) { + CUDF_FAIL("JSON Path has depth exceeds the maximum allowed value."); + } + + scratch_buffers.emplace_back(rmm::device_uvector(scratch_size, stream)); + out_stringviews.emplace_back(rmm::device_uvector>{ + static_cast(input.size()), stream}); + + h_path_data.emplace_back(json_path_processing_data{d_json_paths[idx], + in_offsets, + out_stringviews.back().data(), + scratch_buffers.back().data(), + d_error_check.data() + idx}); + } + auto d_path_data = cudf::detail::make_device_uvector_async( + h_path_data, stream, rmm::mr::get_current_device_resource()); + thrust::uninitialized_fill( + rmm::exec_policy(stream), d_error_check.begin(), d_error_check.end(), 0); + + kernel_launcher::exec(input, d_path_data, d_max_path_depth_exceeded, stream); + auto h_error_check = cudf::detail::make_host_vector_sync(d_error_check, stream); + auto has_no_oob = check_error(h_error_check); + + // If we didn't see any out-of-bound write, everything is good so far. + // Just gather the output strings and return. + if (has_no_oob) { + for (auto const& out_sview : out_stringviews) { + output.emplace_back(cudf::make_strings_column(out_sview, stream, mr)); + } + return output; + } + // From here, we had out-of-bound write. Although this is very rare, it may still happen. + + std::vector> out_null_masks_and_null_counts; + std::vector, int64_t>> out_offsets_and_sizes; + std::vector> out_char_buffers; + std::vector oob_indices; + + // Check validity from the stored char pointers. + auto const validator = [] __device__(thrust::pair const item) { + return item.first != nullptr; + }; + + // Rebuild the data only for paths that had out of bound write. + h_path_data.clear(); + for (std::size_t idx = 0; idx < num_outputs; ++idx) { + auto const& out_sview = out_stringviews[idx]; + + if (h_error_check[idx]) { + oob_indices.emplace_back(idx); + output.emplace_back(nullptr); // just placeholder. + + out_null_masks_and_null_counts.emplace_back( + cudf::detail::valid_if(out_sview.begin(), out_sview.end(), validator, stream, mr)); + + // The string sizes computed in the previous kernel call will be used to allocate a new char + // buffer to store the output. + auto const size_it = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [string_pairs = out_sview.data()] __device__(auto const idx) { + return string_pairs[idx].second; + })); + out_offsets_and_sizes.emplace_back(cudf::strings::detail::make_offsets_child_column( + size_it, size_it + input.size(), stream, mr)); + out_char_buffers.emplace_back( + rmm::device_uvector(out_offsets_and_sizes.back().second, stream, mr)); + + h_path_data.emplace_back( + json_path_processing_data{d_json_paths[idx], + cudf::detail::offsetalator_factory::make_input_iterator( + out_offsets_and_sizes.back().first->view()), + nullptr /*out_stringviews*/, + out_char_buffers.back().data(), + d_error_check.data() + idx}); + } else { + output.emplace_back(cudf::make_strings_column(out_sview, stream, mr)); + } + } + // These buffers are no longer needed. 
+ scratch_buffers.clear(); + out_stringviews.clear(); + + // Push data to the GPU and launch the kernel again. + d_path_data = cudf::detail::make_device_uvector_async( + h_path_data, stream, rmm::mr::get_current_device_resource()); + thrust::uninitialized_fill( + rmm::exec_policy(stream), d_error_check.begin(), d_error_check.end(), 0); + kernel_launcher::exec(input, d_path_data, d_max_path_depth_exceeded, stream); + h_error_check = cudf::detail::make_host_vector_sync(d_error_check, stream); + has_no_oob = check_error(h_error_check); + + // The last kernel call should not encounter any out-of-bound write. + // If OOB is still detected, there must be something wrong happened. + CUDF_EXPECTS(has_no_oob, "Unexpected out-of-bound write in get_json_object kernel."); + + for (std::size_t idx = 0; idx < oob_indices.size(); ++idx) { + auto const out_idx = oob_indices[idx]; + output[out_idx] = + cudf::make_strings_column(input.size(), + std::move(out_offsets_and_sizes[idx].first), + out_char_buffers[idx].release(), + out_null_masks_and_null_counts[idx].second, + std::move(out_null_masks_and_null_counts[idx].first)); + } + return output; +} + +std::unique_ptr from_json_to_struct(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_outputs = json_paths.size(); + + // Input is empty or all nulls - just return all null columns. + if (input.is_empty() || input.size() == input.null_count()) { + std::vector> output; + for (std::size_t idx = 0; idx < num_outputs; ++idx) { + output.emplace_back(std::make_unique(input.parent(), stream, mr)); + } + return output; + } + + std::vector sorted_indices(json_paths.size()); + std::iota(sorted_indices.begin(), sorted_indices.end(), 0); // Fill with 0, 1, 2, ... + + // Sort indices based on the corresponding paths. 
+ std::sort(sorted_indices.begin(), sorted_indices.end(), [&json_paths](size_t i, size_t j) { + return json_paths[i] < json_paths[j]; + }); + + auto const in_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + auto const scratch_size = calc_scratch_size(input, in_offsets, stream); + if (memory_budget_bytes <= 0 && parallel_override <= 0) { + parallel_override = static_cast(sorted_indices.size()); + } + auto const d_input_ptr = cudf::column_device_view::create(input.parent(), stream); + std::vector> output(num_outputs); + + std::vector const>> batch; + std::vector output_ids; + + std::size_t starting_path = 0; + while (starting_path < num_outputs) { + std::size_t at = starting_path; + batch.resize(0); + output_ids.resize(0); + if (parallel_override > 0) { + int count = 0; + while (at < num_outputs && count < parallel_override) { + auto output_location = sorted_indices[at]; + batch.emplace_back(json_paths[output_location]); + output_ids.push_back(output_location); + at++; + count++; + } + } else { + long budget = 0; + while (at < num_outputs && budget < memory_budget_bytes) { + auto output_location = sorted_indices[at]; + batch.emplace_back(json_paths[output_location]); + output_ids.push_back(output_location); + at++; + budget += scratch_size; + } + } + auto tmp = get_json_object_batch(*d_input_ptr, in_offsets, batch, scratch_size, stream, mr); + for (std::size_t i = 0; i < tmp.size(); i++) { + std::size_t out_i = output_ids[i]; + output[out_i] = std::move(tmp[i]); + } + starting_path = at; + } + return output; +} + +} // namespace detail + +std::unique_ptr from_json_to_struct(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::from_json_to_struct(input, stream, mr); +} + +} // namespace spark_rapids_jni From d9e52c06f4c7849a5d24785b2db3fe64fa970e98 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 16 Aug 2024 13:20:04 -0700 Subject: [PATCH 08/58] WIP Signed-off-by: Nghia Truong --- src/main/cpp/CMakeLists.txt | 2 +- src/main/cpp/src/from_json.hpp | 2 +- ...n_to_struct.cu => from_json_to_structs.cu} | 23 +++++++++----- src/main/cpp/tests/CMakeLists.txt | 8 +++-- src/main/cpp/tests/from_json.cu | 30 +++++++++++++++++++ 5 files changed, 54 insertions(+), 11 deletions(-) rename src/main/cpp/src/{from_json_to_struct.cu => from_json_to_structs.cu} (94%) create mode 100644 src/main/cpp/tests/from_json.cu diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 9a01b513bb..5eb4bb4e4f 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -208,7 +208,7 @@ add_library( src/datetime_rebase.cu src/decimal_utils.cu src/from_json_to_raw_map.cu - #src/from_json_to_struct.cu + #src/from_json_to_structs.cu src/get_json_object.cu src/histogram.cu src/murmur_hash.cu diff --git a/src/main/cpp/src/from_json.hpp b/src/main/cpp/src/from_json.hpp index 0c4445a9db..a0645fa245 100644 --- a/src/main/cpp/src/from_json.hpp +++ b/src/main/cpp/src/from_json.hpp @@ -31,7 +31,7 @@ std::unique_ptr from_json_to_raw_map( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -std::unique_ptr from_json_to_struct( +std::unique_ptr from_json_to_structs( cudf::strings_column_view const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git 
a/src/main/cpp/src/from_json_to_struct.cu b/src/main/cpp/src/from_json_to_structs.cu similarity index 94% rename from src/main/cpp/src/from_json_to_struct.cu rename to src/main/cpp/src/from_json_to_structs.cu index 4d248f91e7..6e2873489a 100644 --- a/src/main/cpp/src/from_json_to_struct.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -43,6 +43,7 @@ #include +namespace test { namespace spark_rapids_jni { namespace detail { @@ -303,9 +304,9 @@ std::vector> get_json_object_batch( return output; } -std::unique_ptr from_json_to_struct(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr from_json_to_struct_bk(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { auto const num_outputs = json_paths.size(); @@ -372,14 +373,22 @@ std::unique_ptr from_json_to_struct(cudf::strings_column_view cons return output; } +std::unique_ptr from_json_to_structs(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return nullptr; +} + } // namespace detail -std::unique_ptr from_json_to_struct(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr from_json_to_structs(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::from_json_to_struct(input, stream, mr); + return detail::from_json_to_structs(input, stream, mr); } } // namespace spark_rapids_jni +} // namespace test diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index 244d18c903..b4f8ca0dbc 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -45,6 +45,10 @@ endfunction(ConfigureTest) ### test sources ################################################################################## ################################################################################################### +ConfigureTest(FROM_JSON + /home/nghiat/Devel/jni/1/src/main/cpp/src/from_json_to_structs.cu + from_json.cu) + ConfigureTest(CAST_STRING cast_string.cpp) @@ -60,8 +64,8 @@ ConfigureTest(CAST_FLOAT_TO_STRING ConfigureTest(DATETIME_REBASE datetime_rebase.cpp) -ConfigureTest(ROW_CONVERSION - row_conversion.cpp) +# ConfigureTest(ROW_CONVERSION +# row_conversion.cpp) ConfigureTest(HASH hash.cpp) diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu new file mode 100644 index 0000000000..fc3bb4731e --- /dev/null +++ b/src/main/cpp/tests/from_json.cu @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "from_json.hpp" + +#include +#include +#include + +#include + +class FromJsonTest : public cudf::test::BaseFixture {}; + +TEST_F(FromJsonTest, Initialization) +{ + // +} From e4bb4608db5addf9abe1134135acb221c17123d1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 16 Aug 2024 15:17:21 -0700 Subject: [PATCH 09/58] Change signatures for `from_json_to_structs` Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 63 +++++++++++++++++++ src/main/cpp/src/from_json.hpp | 6 +- src/main/cpp/src/from_json_to_structs.cu | 20 +++--- .../nvidia/spark/rapids/jni/JSONUtils.java | 17 +++++ 4 files changed, 97 insertions(+), 9 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 5a0c5dd341..0b6b395712 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -18,12 +18,22 @@ #include "from_json.hpp" #include "get_json_object.hpp" +#include #include +#include #include using path_instruction_type = spark_rapids_jni::path_instruction_type; +namespace cudf::jni { +cudf::io::schema_element read_schema_element(int& index, + cudf::jni::native_jintArray const& children, + cudf::jni::native_jstringArray const& names, + cudf::jni::native_jintArray const& types, + cudf::jni::native_jintArray const& scales); +} + extern "C" { JNIEXPORT jint JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_getMaxJSONPathDepth(JNIEnv* env, @@ -154,4 +164,57 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_extractRawMap } CATCH_STD(env, 0); } + +JNIEXPORT jlongArray JNICALL +Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, + jclass, + jlong j_input, + jintArray j_num_children, + jobjectArray j_col_names, + jintArray j_types, + jintArray j_scales) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + JNI_NULL_CHECK(env, j_num_children, "j_num_children is null", 0); + JNI_NULL_CHECK(env, j_col_names, "j_col_names is null", 0); + JNI_NULL_CHECK(env, j_types, "j_types is null", 0); + JNI_NULL_CHECK(env, j_scales, "j_scales is null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::jni::native_jstringArray n_col_names(env, j_col_names); + cudf::jni::native_jintArray n_types(env, j_types); + cudf::jni::native_jintArray n_scales(env, j_scales); + cudf::jni::native_jintArray n_children(env, j_num_children); + + if (n_types.size() != n_scales.size()) { + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and scales must match size", 0); + } + if (n_col_names.size() != n_types.size()) { + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and column names must match size", 0); + } + if (n_children.size() != n_types.size()) { + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and num children must match size", 0); + } + + std::map schema; + int at = 0; + while (at < n_types.size()) { + schema.emplace( + n_col_names.get(at).get(), + cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)); + } + + auto const input_cv = reinterpret_cast(j_input); + auto output = + spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{*input_cv}, schema); + + auto out_handles = cudf::jni::native_jlongArray(env, output.size()); + std::transform(output.begin(), output.end(), out_handles.begin(), [](auto& col) { + return cudf::jni::release_as_jlong(col); + }); + return out_handles.get_jArray(); + } + CATCH_STD(env, 0); +} } diff --git a/src/main/cpp/src/from_json.hpp b/src/main/cpp/src/from_json.hpp index a0645fa245..03afd99760 100644 --- 
a/src/main/cpp/src/from_json.hpp +++ b/src/main/cpp/src/from_json.hpp @@ -16,13 +16,16 @@ #pragma once +#include #include #include #include #include +#include #include +#include namespace spark_rapids_jni { @@ -31,8 +34,9 @@ std::unique_ptr from_json_to_raw_map( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -std::unique_ptr from_json_to_structs( +std::vector> from_json_to_structs( cudf::strings_column_view const& input, + std::map const& schema, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 6e2873489a..8a918ac2d4 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -373,21 +373,25 @@ std::unique_ptr from_json_to_struct_bk(cudf::strings_column_view c return output; } -std::unique_ptr from_json_to_structs(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::vector> from_json_to_structs( + cudf::strings_column_view const& input, + std::map const& schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - return nullptr; + return {}; } } // namespace detail -std::unique_ptr from_json_to_structs(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::vector> from_json_to_structs( + cudf::strings_column_view const& input, + std::map const& schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::from_json_to_structs(input, stream, mr); + return detail::from_json_to_structs(input, schema, stream, mr); } } // namespace spark_rapids_jni diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 3a7c4a6a53..9962a3bedc 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -160,6 +160,19 @@ public static ColumnVector extractRawMapFromJsonString(ColumnView input) { return new ColumnVector(extractRawMapFromJsonString(input.getNativeView())); } + /** + * Parse JSON strings column into a structs column. 
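+ * For example (illustrative semantics with a made-up schema): given a schema describing
+ * top-level columns `a` (INT32) and `b` (STRING), the input row {"a": 1, "b": "x"} is
+ * intended to yield 1 and "x" in the corresponding columns of the returned table.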
+ * TODO + * @param input + * @param schema + * @return + */ + public static Table fromJsonToStructs(ColumnView input, Schema schema) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new Table(fromJsonToStructs(input.getNativeView(), + schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), + schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales())); + } private static native int getMaxJSONPathDepth(); @@ -178,4 +191,8 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long extractRawMapFromJsonString(long input); + + private static native long[] fromJsonToStructs(long input, + int[] numChildren, String[] columnNames, + int[] dTypeIds, int[] dTypeScales); } From 9e239cbef402d1b7961aa4e3154179804c5e9919 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 19 Aug 2024 18:15:58 -0700 Subject: [PATCH 10/58] Add `convert_schema_to_paths` function and fix compile error Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 26 +++++++++++++++++++++- src/main/cpp/src/from_json.hpp | 2 ++ src/main/cpp/src/from_json_to_structs.cu | 28 +++++++++++++++++++++--- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 0b6b395712..48a40557ae 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -31,8 +31,30 @@ cudf::io::schema_element read_schema_element(int& index, cudf::jni::native_jintArray const& children, cudf::jni::native_jstringArray const& names, cudf::jni::native_jintArray const& types, - cudf::jni::native_jintArray const& scales); + cudf::jni::native_jintArray const& scales) +{ + auto d_type = cudf::data_type{static_cast(types[index]), scales[index]}; + if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) { + std::map child_elems; + int num_children = children[index]; + // go to the next entry, so recursion can parse it. + index++; + for (int i = 0; i < num_children; i++) { + child_elems.insert( + std::pair{names.get(index).get(), + cudf::jni::read_schema_element(index, children, names, types, scales)}); + } + return cudf::io::schema_element{d_type, std::move(child_elems)}; + } else { + if (children[index] != 0) { + throw std::invalid_argument("found children for a type that should have none"); + } + // go to the next entry before returning... 
+ index++; + return cudf::io::schema_element{d_type, {}}; + } } +} // namespace cudf::jni extern "C" { @@ -165,6 +187,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_extractRawMap CATCH_STD(env, 0); } +#if 0 JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, jclass, @@ -217,4 +240,5 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, } CATCH_STD(env, 0); } +#endif } diff --git a/src/main/cpp/src/from_json.hpp b/src/main/cpp/src/from_json.hpp index 03afd99760..554d68c6ca 100644 --- a/src/main/cpp/src/from_json.hpp +++ b/src/main/cpp/src/from_json.hpp @@ -34,10 +34,12 @@ std::unique_ptr from_json_to_raw_map( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +#if 0 std::vector> from_json_to_structs( cudf::strings_column_view const& input, std::map const& schema, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +#endif } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 8a918ac2d4..33da2631e6 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -43,10 +44,11 @@ #include -namespace test { namespace spark_rapids_jni { namespace detail { +#if 0 +namespace test { /** * @brief TODO */ @@ -373,13 +375,34 @@ std::unique_ptr from_json_to_struct_bk(cudf::strings_column_view c return output; } +} // namespace test +#endif + +std::vector>> +convert_schema_to_paths(std::map const& schema) +{ + std::vector>> paths; + + return paths; +} + +std::vector> get_json_object( + cudf::strings_column_view const& input, + std::vector>> const& + json_paths, + int64_t memory_budget_bytes, + int32_t parallel_override, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + std::vector> from_json_to_structs( cudf::strings_column_view const& input, std::map const& schema, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return {}; + auto const json_paths = convert_schema_to_paths(schema); + return get_json_object(input, json_paths, -1L, -1, stream, mr); } } // namespace detail @@ -395,4 +418,3 @@ std::vector> from_json_to_structs( } } // namespace spark_rapids_jni -} // namespace test From 187785bd14466dccdb5f4ba0b917c51857cdd728 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 20 Aug 2024 10:32:28 -0700 Subject: [PATCH 11/58] Convert schema into paths --- src/main/cpp/src/from_json_to_structs.cu | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 33da2631e6..f5c1ed3d67 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -378,10 +378,31 @@ std::unique_ptr from_json_to_struct_bk(cudf::strings_column_view c } // namespace test #endif +void travel_path( + std::vector>>& paths, + std::vector>& current_path, + std::string const& name, + cudf::io::schema_element const& column_schema) +{ + current_path.emplace_back(path_instruction_type::NAMED, name, -1); + if (column_schema.child_types.size() == 0) { // leaf of the schema + paths.push_back(current_path); // this will copy + } else { + for (auto const& [child_name, child_schema] : 
column_schema.child_types) { + travel_path(paths, current_path, child_name, child_schema); + } + } + current_path.pop_back(); +} + std::vector>> convert_schema_to_paths(std::map const& schema) { std::vector>> paths; + std::vector> current_path; + std::for_each(schema.begin(), schema.end(), [&](auto const& kv) { + travel_path(paths, current_path, kv.first, kv.second); + }); return paths; } From a02cfcab579d44ad4252d5cebd2887183fbb2d23 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 22 Aug 2024 11:00:36 -0700 Subject: [PATCH 12/58] WIP --- src/main/cpp/CMakeLists.txt | 2 +- src/main/cpp/src/JSONUtilsJni.cpp | 63 ++++++++++++++----- src/main/cpp/src/from_json.hpp | 6 +- src/main/cpp/src/from_json_to_structs.cu | 31 +++++++-- src/main/cpp/src/get_json_object.cu | 31 ++++++--- src/main/cpp/src/get_json_object.hpp | 2 + src/main/cpp/src/json_parser.cuh | 23 ++++++- src/main/cpp/tests/CMakeLists.txt | 2 +- src/main/cpp/tests/from_json.cu | 19 +++++- .../nvidia/spark/rapids/jni/JSONUtils.java | 7 +-- .../spark/rapids/jni/GetJsonObjectTest.java | 19 ++++++ 11 files changed, 167 insertions(+), 38 deletions(-) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 5eb4bb4e4f..d06cbb8a84 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -208,7 +208,7 @@ add_library( src/datetime_rebase.cu src/decimal_utils.cu src/from_json_to_raw_map.cu - #src/from_json_to_structs.cu + src/from_json_to_structs.cu src/get_json_object.cu src/histogram.cu src/murmur_hash.cu diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 48a40557ae..abb6d0b728 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -28,29 +28,46 @@ using path_instruction_type = spark_rapids_jni::path_instruction_type; namespace cudf::jni { cudf::io::schema_element read_schema_element(int& index, - cudf::jni::native_jintArray const& children, cudf::jni::native_jstringArray const& names, + cudf::jni::native_jintArray const& children, cudf::jni::native_jintArray const& types, cudf::jni::native_jintArray const& scales) { + printf("JNI line %d\n", __LINE__); + fflush(stdout); + auto d_type = cudf::data_type{static_cast(types[index]), scales[index]}; if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) { + printf("JNI line %d\n", __LINE__); + fflush(stdout); + std::map child_elems; int num_children = children[index]; // go to the next entry, so recursion can parse it. index++; for (int i = 0; i < num_children; i++) { - child_elems.insert( - std::pair{names.get(index).get(), - cudf::jni::read_schema_element(index, children, names, types, scales)}); + printf("JNI line %d\n", __LINE__); + fflush(stdout); + + auto const name = std::string{names.get(index).get()}; + child_elems.emplace(name, + cudf::jni::read_schema_element(index, names, children, types, scales)); } return cudf::io::schema_element{d_type, std::move(child_elems)}; } else { + printf("JNI line %d\n", __LINE__); + + printf("children size: %d, idx = %d\n", children.size(), index); + + fflush(stdout); + if (children[index] != 0) { throw std::invalid_argument("found children for a type that should have none"); } // go to the next entry before returning... 
index++; + printf("JNI line %d\n", __LINE__); + fflush(stdout); return cudf::io::schema_element{d_type, {}}; } } @@ -187,19 +204,18 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_extractRawMap CATCH_STD(env, 0); } -#if 0 JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, jclass, jlong j_input, - jintArray j_num_children, jobjectArray j_col_names, + jintArray j_num_children, jintArray j_types, jintArray j_scales) { JNI_NULL_CHECK(env, j_input, "j_input is null", 0); - JNI_NULL_CHECK(env, j_num_children, "j_num_children is null", 0); JNI_NULL_CHECK(env, j_col_names, "j_col_names is null", 0); + JNI_NULL_CHECK(env, j_num_children, "j_num_children is null", 0); JNI_NULL_CHECK(env, j_types, "j_types is null", 0); JNI_NULL_CHECK(env, j_scales, "j_scales is null", 0); @@ -220,18 +236,38 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and num children must match size", 0); } - std::map schema; - int at = 0; - while (at < n_types.size()) { - schema.emplace( - n_col_names.get(at).get(), - cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)); + printf("JNI line %d, size = %d\n", __LINE__, (int)n_types.size()); + fflush(stdout); + + std::vector> schema; + int idx = 0; + while (idx < n_types.size()) { + printf("JNI line %d\n", __LINE__); + fflush(stdout); + + auto const name = std::string{n_col_names.get(idx).get()}; + schema.emplace_back( + name, cudf::jni::read_schema_element(idx, n_col_names, n_children, n_types, n_scales)); + + // auto const name = n_col_names.get(at).get(); + printf("JNI line %d\n", __LINE__); + fflush(stdout); + + // auto child = cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, + // n_scales); printf("JNI line %d\n", __LINE__); fflush(stdout); + + // schema.emplace(name, std::move(child)); } + printf("JNI line %d\n", __LINE__); + fflush(stdout); auto const input_cv = reinterpret_cast(j_input); auto output = spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{*input_cv}, schema); + printf("JNI line %d\n", __LINE__); + fflush(stdout); + auto out_handles = cudf::jni::native_jlongArray(env, output.size()); std::transform(output.begin(), output.end(), out_handles.begin(), [](auto& col) { return cudf::jni::release_as_jlong(col); @@ -240,5 +276,4 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, } CATCH_STD(env, 0); } -#endif } diff --git a/src/main/cpp/src/from_json.hpp b/src/main/cpp/src/from_json.hpp index 554d68c6ca..a176535c52 100644 --- a/src/main/cpp/src/from_json.hpp +++ b/src/main/cpp/src/from_json.hpp @@ -23,9 +23,9 @@ #include #include -#include #include #include +#include namespace spark_rapids_jni { @@ -34,12 +34,10 @@ std::unique_ptr from_json_to_raw_map( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -#if 0 std::vector> from_json_to_structs( cudf::strings_column_view const& input, - std::map const& schema, + std::vector> const& schema, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -#endif } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index f5c1ed3d67..cfd8148f73 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -396,7 
+396,7 @@ void travel_path( } std::vector>> -convert_schema_to_paths(std::map const& schema) +convert_schema_to_paths(std::vector> const& schema) { std::vector>> paths; std::vector> current_path; @@ -413,24 +413,47 @@ std::vector> get_json_object( json_paths, int64_t memory_budget_bytes, int32_t parallel_override, + bool allow_leading_zero_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); std::vector> from_json_to_structs( cudf::strings_column_view const& input, - std::map const& schema, + std::vector> const& schema, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + printf("line %d\n", __LINE__); + fflush(stdout); auto const json_paths = convert_schema_to_paths(schema); - return get_json_object(input, json_paths, -1L, -1, stream, mr); + + printf("line %d\n", __LINE__); + fflush(stdout); + +#if 1 + for (auto const& path : json_paths) { + printf("\n\npath: \n"); + for (auto node : path) { + printf(".%s", std::get<1>(node).c_str()); + } + printf("\n"); + } + printf("\n\n"); + fflush(stdout); +#endif + + auto tmp = get_json_object(input, json_paths, -1L, -1, true, stream, mr); + printf("line %d\n", __LINE__); + fflush(stdout); + + return tmp; } } // namespace detail std::vector> from_json_to_structs( cudf::strings_column_view const& input, - std::map const& schema, + std::vector> const& schema, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index f836186192..2f8e03a08f 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -827,6 +827,7 @@ template __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL void get_json_object_kernel(cudf::column_device_view input, cudf::device_span path_data, + bool allow_leading_zero_numbers, std::size_t num_threads_per_row, int8_t* max_path_depth_exceeded) { @@ -845,6 +846,7 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL auto const str = input.element(row_idx); if (str.size_bytes() > 0) { json_parser p{char_range{str}}; + p.set_allow_leading_zero_numbers(allow_leading_zero_numbers); thrust::tie(is_valid, out_size) = evaluate_path(p, path.path_commands, dst, max_path_depth_exceeded); @@ -874,6 +876,7 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL struct kernel_launcher { static void exec(cudf::column_device_view const& input, cudf::device_span path_data, + bool allow_leading_zero_numbers, int8_t* max_path_depth_exceeded, rmm::cuda_stream_view stream) { @@ -891,7 +894,7 @@ struct kernel_launcher { static_cast(block_size)); get_json_object_kernel <<>>( - input, path_data, num_threads_per_row, max_path_depth_exceeded); + input, path_data, allow_leading_zero_numbers, num_threads_per_row, max_path_depth_exceeded); } }; @@ -1022,6 +1025,7 @@ std::vector> get_json_object_batch( std::vector const>> const& json_paths, int64_t scratch_size, + bool allow_leading_zero_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1064,7 +1068,8 @@ std::vector> get_json_object_batch( thrust::uninitialized_fill( rmm::exec_policy(stream), d_error_check.begin(), d_error_check.end(), 0); - kernel_launcher::exec(input, d_path_data, d_max_path_depth_exceeded, stream); + kernel_launcher::exec( + input, d_path_data, allow_leading_zero_numbers, d_max_path_depth_exceeded, stream); auto h_error_check = cudf::detail::make_host_vector_sync(d_error_check, stream); auto has_no_oob = check_error(h_error_check); @@ -1133,7 +1138,8 @@ std::vector> 
get_json_object_batch( h_path_data, stream, rmm::mr::get_current_device_resource()); thrust::uninitialized_fill( rmm::exec_policy(stream), d_error_check.begin(), d_error_check.end(), 0); - kernel_launcher::exec(input, d_path_data, d_max_path_depth_exceeded, stream); + kernel_launcher::exec( + input, d_path_data, allow_leading_zero_numbers, d_max_path_depth_exceeded, stream); h_error_check = cudf::detail::make_host_vector_sync(d_error_check, stream); has_no_oob = check_error(h_error_check); @@ -1159,6 +1165,7 @@ std::vector> get_json_object( json_paths, int64_t memory_budget_bytes, int32_t parallel_override, + bool allow_leading_zero_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1217,7 +1224,8 @@ std::vector> get_json_object( budget += scratch_size; } } - auto tmp = get_json_object_batch(*d_input_ptr, in_offsets, batch, scratch_size, stream, mr); + auto tmp = get_json_object_batch( + *d_input_ptr, in_offsets, batch, scratch_size, allow_leading_zero_numbers, stream, mr); for (std::size_t i = 0; i < tmp.size(); i++) { std::size_t out_i = output_ids[i]; output[out_i] = std::move(tmp[i]); @@ -1232,11 +1240,14 @@ std::vector> get_json_object( std::unique_ptr get_json_object( cudf::strings_column_view const& input, std::vector> const& instructions, + bool allow_leading_zero_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::move(detail::get_json_object(input, {instructions}, -1, -1, stream, mr).front()); + return std::move( + detail::get_json_object(input, {instructions}, -1, -1, allow_leading_zero_numbers, stream, mr) + .front()); } std::vector> get_json_object_multiple_paths( @@ -1245,12 +1256,18 @@ std::vector> get_json_object_multiple_paths( json_paths, int64_t memory_budget_bytes, int32_t parallel_override, + bool allow_leading_zero_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::get_json_object( - input, json_paths, memory_budget_bytes, parallel_override, stream, mr); + return detail::get_json_object(input, + json_paths, + memory_budget_bytes, + parallel_override, + allow_leading_zero_numbers, + stream, + mr); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/get_json_object.hpp b/src/main/cpp/src/get_json_object.hpp index 0cc773517f..bb1d73afc2 100644 --- a/src/main/cpp/src/get_json_object.hpp +++ b/src/main/cpp/src/get_json_object.hpp @@ -44,6 +44,7 @@ enum class path_instruction_type : int8_t { WILDCARD, INDEX, NAMED }; std::unique_ptr get_json_object( cudf::strings_column_view const& input, std::vector> const& instructions, + bool allow_leading_zero_numbers = false, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); @@ -66,6 +67,7 @@ std::vector> get_json_object_multiple_paths( json_paths, int64_t memory_budget_bytes, int32_t parallel_override, + bool allow_leading_zero_numbers = false, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/cpp/src/json_parser.cuh b/src/main/cpp/src/json_parser.cuh index 12863c3333..3061633d04 100644 --- a/src/main/cpp/src/json_parser.cuh +++ b/src/main/cpp/src/json_parser.cuh @@ -204,6 +204,7 @@ class char_range_reader { * 1e-5, 1E+5, 1e0, 1E0, 1.3e5 * 1e01 : allow leading zeor after 'e' * + * TODO: update this * Invalid number examples: * 00, -00 Leading zeroes not allowed * infinity, +infinity, 
-infinity @@ -220,7 +221,11 @@ class char_range_reader { class json_parser { public: __device__ inline explicit json_parser(char_range _chars) - : chars(_chars), curr_pos(0), current_token(json_token::INIT), max_depth_exceeded(false) + : chars(_chars), + curr_pos(0), + current_token(json_token::INIT), + max_depth_exceeded(false), + allow_leading_zero_numbers{false} { } @@ -1071,9 +1076,14 @@ class json_parser { // check leading zeros if (!eof()) { char const next_char_after_zero = chars[curr_pos]; + // TODO: check if the current char is `.` instead of non-numeric. if (next_char_after_zero >= '0' && next_char_after_zero <= '9') { // e.g.: 01 is invalid - return false; + if (!allow_leading_zero_numbers) { + return false; + } else { + number_digits_length += skip_zero_or_more_digits(); + } } } @@ -1415,6 +1425,7 @@ class json_parser { char_range_reader reader(current_range()); return write_string(reader, destination, escape_style::UNESCAPED); } + // change here case json_token::VALUE_NUMBER_INT: if (number_token_len == 2 && chars[current_token_start_pos] == '-' && chars[current_token_start_pos + 1] == '0') { @@ -1686,6 +1697,11 @@ class json_parser { __device__ inline bool max_nesting_depth_exceeded() const { return max_depth_exceeded; } + __device__ inline void set_allow_leading_zero_numbers(bool state) + { + allow_leading_zero_numbers = state; + } + private: char_range const chars; cudf::size_type curr_pos; @@ -1708,6 +1724,9 @@ class json_parser { // Error check if the maximum nesting depth has been reached. bool max_depth_exceeded; + + // Whether allow to have leading zero in numbers. + bool allow_leading_zero_numbers; }; } // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index b4f8ca0dbc..2dec79e834 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -46,7 +46,7 @@ endfunction(ConfigureTest) ################################################################################################### ConfigureTest(FROM_JSON - /home/nghiat/Devel/jni/1/src/main/cpp/src/from_json_to_structs.cu +# /home/nghiat/Devel/jni/1/src/main/cpp/src/from_json_to_structs.cu from_json.cu) ConfigureTest(CAST_STRING diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index fc3bb4731e..840ad2b774 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -26,5 +27,21 @@ class FromJsonTest : public cudf::test::BaseFixture {}; TEST_F(FromJsonTest, Initialization) { - // + // The last row is invalid (has an extra quote). 
+ auto const json_string = + cudf::test::strings_column_wrapper{R"({'a': 4478, "b": 'HIMST', "c": 1276})"}; + + std::vector> schema{ + {"c", {cudf::data_type{cudf::type_id::INT32}}}, + {"a", {cudf::data_type{cudf::type_id::STRING}}}, + }; + + auto const output = + spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{json_string}, schema); + + cudf::test::print(json_string); + + for (auto const& col : output) { + cudf::test::print(col->view()); + } } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 9962a3bedc..acbe8f6d35 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -170,7 +170,7 @@ public static ColumnVector extractRawMapFromJsonString(ColumnView input) { public static Table fromJsonToStructs(ColumnView input, Schema schema) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; return new Table(fromJsonToStructs(input.getNativeView(), - schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), + schema.getFlattenedColumnNames(), schema.getFlattenedNumChildren(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales())); } @@ -192,7 +192,6 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long extractRawMapFromJsonString(long input); - private static native long[] fromJsonToStructs(long input, - int[] numChildren, String[] columnNames, - int[] dTypeIds, int[] dTypeScales); + private static native long[] fromJsonToStructs(long input, String[] columnNames, + int[] numChildren, int[] dTypeIds, int[] dTypeScales); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java index 6a4acb9cb9..195a243fba 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java @@ -24,6 +24,7 @@ import java.util.List; import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertTablesAreEqual; import static org.junit.jupiter.api.Assertions.assertThrows; public class GetJsonObjectTest { @@ -788,4 +789,22 @@ private JSONUtils.PathInstructionJni namedPath(String name) { private JSONUtils.PathInstructionJni indexPath(int index) { return new JSONUtils.PathInstructionJni(JSONUtils.PathInstructionType.INDEX, "", index); } + + + /** + * This test is when an exception is thrown due to the input JSON path being too long. 
+ */ + @Test + void testFromJSON() { + ai.rapids.cudf.Schema schema = ai.rapids.cudf.Schema.builder() + .column(ai.rapids.cudf.DType.STRING, "a") + .build(); + try (ColumnVector input = ColumnVector.fromStrings("{'a': '1', 'b': '2'}"); + ai.rapids.cudf.Table expected = + new ai.rapids.cudf.Table.TestBuilder().column("1").build(); + ai.rapids.cudf.Table actual = JSONUtils.fromJsonToStructs(input, schema)) { + assertTablesAreEqual(expected, actual); + } + } + } From 49bbf7cea4feedccd88b9925b6247629261e7394 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 22 Aug 2024 11:18:58 -0700 Subject: [PATCH 13/58] Add `keep_quotes` parameter Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 7 +++++- src/main/cpp/src/get_json_object.cu | 28 ++++++++++++++++++------ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index cfd8148f73..cdfd1950c4 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -43,6 +43,7 @@ #include #include +#include namespace spark_rapids_jni { @@ -407,10 +408,12 @@ convert_schema_to_paths(std::vector> get_json_object( cudf::strings_column_view const& input, std::vector>> const& json_paths, + std::unordered_set const& keep_quotes, int64_t memory_budget_bytes, int32_t parallel_override, bool allow_leading_zero_numbers, @@ -423,6 +426,8 @@ std::vector> from_json_to_structs( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + std::unordered_set keep_quotes; + printf("line %d\n", __LINE__); fflush(stdout); auto const json_paths = convert_schema_to_paths(schema); @@ -442,7 +447,7 @@ std::vector> from_json_to_structs( fflush(stdout); #endif - auto tmp = get_json_object(input, json_paths, -1L, -1, true, stream, mr); + auto tmp = get_json_object(input, json_paths, keep_quotes, -1L, -1, true, stream, mr); printf("line %d\n", __LINE__); fflush(stdout); diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index 2f8e03a08f..0902c113ae 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -42,6 +42,7 @@ #include #include +#include namespace spark_rapids_jni { @@ -802,6 +803,7 @@ struct json_path_processing_data { thrust::pair* out_stringviews; char* out_buf; int8_t* has_out_of_bound; + bool keep_quote; }; /** @@ -1024,6 +1026,7 @@ std::vector> get_json_object_batch( cudf::detail::input_offsetalator const& in_offsets, std::vector const>> const& json_paths, + std::unordered_set const& keep_quotes, int64_t scratch_size, bool allow_leading_zero_numbers, rmm::cuda_stream_view stream, @@ -1061,7 +1064,8 @@ std::vector> get_json_object_batch( in_offsets, out_stringviews.back().data(), scratch_buffers.back().data(), - d_error_check.data() + idx}); + d_error_check.data() + idx, + keep_quotes.find(idx) != keep_quotes.end()}); } auto d_path_data = cudf::detail::make_device_uvector_async( h_path_data, stream, rmm::mr::get_current_device_resource()); @@ -1124,7 +1128,8 @@ std::vector> get_json_object_batch( out_offsets_and_sizes.back().first->view()), nullptr /*out_stringviews*/, out_char_buffers.back().data(), - d_error_check.data() + idx}); + d_error_check.data() + idx, + keep_quotes.find(idx) != keep_quotes.end()}); } else { output.emplace_back(cudf::make_strings_column(out_sview, stream, mr)); } @@ -1159,10 +1164,12 @@ std::vector> get_json_object_batch( return output; } +// TODO: update docs for keep_quotes std::vector> get_json_object( 
cudf::strings_column_view const& input, std::vector>> const& json_paths, + std::unordered_set const& keep_quotes, int64_t memory_budget_bytes, int32_t parallel_override, bool allow_leading_zero_numbers, @@ -1224,8 +1231,14 @@ std::vector> get_json_object( budget += scratch_size; } } - auto tmp = get_json_object_batch( - *d_input_ptr, in_offsets, batch, scratch_size, allow_leading_zero_numbers, stream, mr); + auto tmp = get_json_object_batch(*d_input_ptr, + in_offsets, + batch, + keep_quotes, + scratch_size, + allow_leading_zero_numbers, + stream, + mr); for (std::size_t i = 0; i < tmp.size(); i++) { std::size_t out_i = output_ids[i]; output[out_i] = std::move(tmp[i]); @@ -1245,9 +1258,9 @@ std::unique_ptr get_json_object( rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::move( - detail::get_json_object(input, {instructions}, -1, -1, allow_leading_zero_numbers, stream, mr) - .front()); + return std::move(detail::get_json_object( + input, {instructions}, {}, -1, -1, allow_leading_zero_numbers, stream, mr) + .front()); } std::vector> get_json_object_multiple_paths( @@ -1263,6 +1276,7 @@ std::vector> get_json_object_multiple_paths( CUDF_FUNC_RANGE(); return detail::get_json_object(input, json_paths, + {}, memory_budget_bytes, parallel_override, allow_leading_zero_numbers, From 14b463ef29bc68503808c0cfff60c143d3e853f8 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 22 Aug 2024 11:29:14 -0700 Subject: [PATCH 14/58] Find decimal columns in schema Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 30 +++++++++++++++++------- src/main/cpp/src/get_json_object.cu | 2 +- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index cdfd1950c4..183b983019 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -382,30 +382,39 @@ std::unique_ptr from_json_to_struct_bk(cudf::strings_column_view c void travel_path( std::vector>>& paths, std::vector>& current_path, + std::unordered_set& keep_quotes, std::string const& name, cudf::io::schema_element const& column_schema) { current_path.emplace_back(path_instruction_type::NAMED, name, -1); if (column_schema.child_types.size() == 0) { // leaf of the schema - paths.push_back(current_path); // this will copy + if (column_schema.type.id() == cudf::type_id::DECIMAL32 || + column_schema.type.id() == cudf::type_id::DECIMAL64 || + column_schema.type.id() == cudf::type_id::DECIMAL128) { + keep_quotes.insert(paths.size()); + } + paths.push_back(current_path); // this will copy } else { for (auto const& [child_name, child_schema] : column_schema.child_types) { - travel_path(paths, current_path, child_name, child_schema); + travel_path(paths, current_path, keep_quotes, child_name, child_schema); } } current_path.pop_back(); } -std::vector>> +std::pair>>, + std::unordered_set> convert_schema_to_paths(std::vector> const& schema) { std::vector>> paths; + std::unordered_set keep_quotes; + std::vector> current_path; std::for_each(schema.begin(), schema.end(), [&](auto const& kv) { - travel_path(paths, current_path, kv.first, kv.second); + travel_path(paths, current_path, keep_quotes, kv.first, kv.second); }); - return paths; + return {std::move(paths), std::move(keep_quotes)}; } // Extern @@ -426,11 +435,9 @@ std::vector> from_json_to_structs( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - std::unordered_set keep_quotes; - printf("line %d\n", __LINE__); 
fflush(stdout); - auto const json_paths = convert_schema_to_paths(schema); + auto const [json_paths, keep_quotes] = convert_schema_to_paths(schema); printf("line %d\n", __LINE__); fflush(stdout); @@ -443,7 +450,12 @@ std::vector> from_json_to_structs( } printf("\n"); } - printf("\n\n"); + + printf("keep quotes: \n"); + for (auto const i : keep_quotes) { + printf("%d, ", (int)i); + } + printf("\n\n\n"); fflush(stdout); #endif diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index 0902c113ae..71a53c689f 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -803,7 +803,7 @@ struct json_path_processing_data { thrust::pair* out_stringviews; char* out_buf; int8_t* has_out_of_bound; - bool keep_quote; + bool keep_quotes; }; /** From ae1df4a32ddc8ed9ee9b4405336ff05e2f5b58eb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 22 Aug 2024 19:21:20 -0700 Subject: [PATCH 15/58] Allow to specify `allow_leading_zero_numbers` Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 7 ++++--- src/main/cpp/src/from_json.hpp | 1 + src/main/cpp/src/from_json_to_structs.cu | 9 +++++++-- src/main/cpp/src/get_json_object.cu | 20 ++++++++++++++----- src/main/cpp/tests/from_json.cu | 2 +- .../nvidia/spark/rapids/jni/JSONUtils.java | 11 +++++++--- .../spark/rapids/jni/GetJsonObjectTest.java | 2 +- 7 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index abb6d0b728..e792c35d03 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -211,7 +211,8 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, jobjectArray j_col_names, jintArray j_num_children, jintArray j_types, - jintArray j_scales) + jintArray j_scales, + jboolean allow_leading_zero_numbers) { JNI_NULL_CHECK(env, j_input, "j_input is null", 0); JNI_NULL_CHECK(env, j_col_names, "j_col_names is null", 0); @@ -262,8 +263,8 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, fflush(stdout); auto const input_cv = reinterpret_cast(j_input); - auto output = - spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{*input_cv}, schema); + auto output = spark_rapids_jni::from_json_to_structs( + cudf::strings_column_view{*input_cv}, schema, allow_leading_zero_numbers); printf("JNI line %d\n", __LINE__); fflush(stdout); diff --git a/src/main/cpp/src/from_json.hpp b/src/main/cpp/src/from_json.hpp index a176535c52..6e32d8e95d 100644 --- a/src/main/cpp/src/from_json.hpp +++ b/src/main/cpp/src/from_json.hpp @@ -37,6 +37,7 @@ std::unique_ptr from_json_to_raw_map( std::vector> from_json_to_structs( cudf::strings_column_view const& input, std::vector> const& schema, + bool allow_leading_zero_numbers, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 183b983019..1396ce7b51 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -391,8 +391,10 @@ void travel_path( if (column_schema.type.id() == cudf::type_id::DECIMAL32 || column_schema.type.id() == cudf::type_id::DECIMAL64 || column_schema.type.id() == cudf::type_id::DECIMAL128) { + // TODO: comment keep_quotes.insert(paths.size()); } + printf("column_schema type: %d\n", static_cast(column_schema.type.id())); 
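// ----------------------------------------------------------------------------
// A minimal, standalone sketch of the schema-flattening idea implemented by
// `travel_path`/`convert_schema_to_paths` above: walk the schema depth-first,
// emit one JSON path per leaf, and remember which leaves must keep their
// quotes. Toy std-only types are used instead of the cudf ones, so every name
// below is an illustrative assumption, not the library API.
#include <map>
#include <set>
#include <string>
#include <vector>

struct toy_schema {                                  // stands in for cudf::io::schema_element
  bool keep_quotes = false;                          // e.g. decimal / fixed-width leaves
  std::map<std::string, toy_schema> children;        // empty => leaf
};

inline void flatten(std::string const& prefix, toy_schema const& node,
                    std::vector<std::string>& paths, std::set<std::size_t>& quoted)
{
  if (node.children.empty()) {                       // leaf: one extraction path
    if (node.keep_quotes) { quoted.insert(paths.size()); }
    paths.push_back(prefix);
    return;
  }
  for (auto const& [name, child] : node.children) {  // struct: recurse into children
    flatten(prefix + "." + name, child, paths, quoted);
  }
}
// Calling flatten("", root, paths, quoted) on a schema {a: INT, b: {c: DECIMAL}}
// yields paths {".a", ".b.c"} and records index 1 (the decimal leaf) in
// `quoted`, i.e. one get_json_object query per leaf column.
// ----------------------------------------------------------------------------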
paths.push_back(current_path); // this will copy } else { for (auto const& [child_name, child_schema] : column_schema.child_types) { @@ -432,6 +434,7 @@ std::vector> get_json_object( std::vector> from_json_to_structs( cudf::strings_column_view const& input, std::vector> const& schema, + bool allow_leading_zero_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -459,7 +462,8 @@ std::vector> from_json_to_structs( fflush(stdout); #endif - auto tmp = get_json_object(input, json_paths, keep_quotes, -1L, -1, true, stream, mr); + auto tmp = get_json_object( + input, json_paths, keep_quotes, -1L, -1, allow_leading_zero_numbers, stream, mr); printf("line %d\n", __LINE__); fflush(stdout); @@ -471,11 +475,12 @@ std::vector> from_json_to_structs( std::vector> from_json_to_structs( cudf::strings_column_view const& input, std::vector> const& schema, + bool allow_leading_zero_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::from_json_to_structs(input, schema, stream, mr); + return detail::from_json_to_structs(input, schema, allow_leading_zero_numbers, stream, mr); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index 71a53c689f..730281e4fa 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -154,6 +154,13 @@ class json_generator { return b; } + static __device__ cudf::size_type write_quote(char* out, bool keep_quotes) + { + if (!keep_quotes) { return 0; } + *out = '"'; + return 1; + } + /** * Get current text from JSON parser and then write the text * Note: Because JSON strings contains '\' to do escape, @@ -161,12 +168,13 @@ class json_generator { * then can not return a pointer and length pair (char *, len), * For number token, JSON parser can return a pair (char *, len) */ - __device__ void write_raw(json_parser& parser, char* out_begin) + __device__ void write_raw(json_parser& parser, char* out_begin, bool keep_quotes) { if (array_depth > 0) { is_curr_array_empty = false; } - auto copied = parser.write_unescaped_text(out_begin + offset + output_len); - output_len += copied; + output_len += write_quote(out_begin + offset + output_len, keep_quotes); + output_len += parser.write_unescaped_text(out_begin + offset + output_len); + output_len += write_quote(out_begin + offset + output_len, keep_quotes); } /** @@ -378,6 +386,7 @@ struct context { * * @param p The JSON parser for input string * @param path_commands The command buffer to be applied to the string + * TODO: update * @param out_buf Buffer user to store the string resulted from the query * @param max_path_depth_exceeded A marker to record if the maximum path depth has been reached * during parsing the input string @@ -386,6 +395,7 @@ struct context { __device__ thrust::pair evaluate_path( json_parser& p, cudf::device_span path_commands, + bool keep_quotes, char* out_buf, int8_t* max_path_depth_exceeded) { @@ -428,7 +438,7 @@ __device__ thrust::pair evaluate_path( ctx.style == write_style::RAW) { // there is no array wildcard or slice parent, emit this string without // quotes write current string in parser to generator - ctx.g.write_raw(p, out_buf); + ctx.g.write_raw(p, out_buf, keep_quotes); ctx.dirty = 1; ctx.task_is_done = true; } @@ -850,7 +860,7 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL json_parser p{char_range{str}}; p.set_allow_leading_zero_numbers(allow_leading_zero_numbers); thrust::tie(is_valid, out_size) = - 
evaluate_path(p, path.path_commands, dst, max_path_depth_exceeded); + evaluate_path(p, path.path_commands, path.keep_quotes, dst, max_path_depth_exceeded); // We did not terminate the `evaluate_path` function early to reduce complexity of the code. // Instead, if max depth was encountered, we've just continued the evaluation until here diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index 840ad2b774..0ac10893c4 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -37,7 +37,7 @@ TEST_F(FromJsonTest, Initialization) }; auto const output = - spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{json_string}, schema); + spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{json_string}, schema, false); cudf::test::print(json_string); diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index acbe8f6d35..beed206d65 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -167,11 +167,13 @@ public static ColumnVector extractRawMapFromJsonString(ColumnView input) { * @param schema * @return */ - public static Table fromJsonToStructs(ColumnView input, Schema schema) { + public static Table fromJsonToStructs(ColumnView input, Schema schema, + boolean allowNumericLeadingZeros) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; return new Table(fromJsonToStructs(input.getNativeView(), schema.getFlattenedColumnNames(), schema.getFlattenedNumChildren(), - schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales())); + schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), + allowNumericLeadingZeros)); } private static native int getMaxJSONPathDepth(); @@ -193,5 +195,8 @@ private static native long[] getJsonObjectMultiplePaths(long input, private static native long extractRawMapFromJsonString(long input); private static native long[] fromJsonToStructs(long input, String[] columnNames, - int[] numChildren, int[] dTypeIds, int[] dTypeScales); + int[] numChildren, + int[] dTypeIds, + int[] dTypeScales, + boolean allowNumericLeadingZeros); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java index 195a243fba..f82f3a09c9 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java @@ -802,7 +802,7 @@ void testFromJSON() { try (ColumnVector input = ColumnVector.fromStrings("{'a': '1', 'b': '2'}"); ai.rapids.cudf.Table expected = new ai.rapids.cudf.Table.TestBuilder().column("1").build(); - ai.rapids.cudf.Table actual = JSONUtils.fromJsonToStructs(input, schema)) { + ai.rapids.cudf.Table actual = JSONUtils.fromJsonToStructs(input, schema, false)) { assertTablesAreEqual(expected, actual); } } From 1a96dd586420fd231f5ffa99e883658ff8fcd6dc Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 22 Aug 2024 20:11:24 -0700 Subject: [PATCH 16/58] Fix order when checking `keep_quotes` Signed-off-by: Nghia Truong --- src/main/cpp/src/get_json_object.cu | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index 730281e4fa..14174b49cc 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -1036,6 +1036,7 @@ 
std::vector> get_json_object_batch( cudf::detail::input_offsetalator const& in_offsets, std::vector const>> const& json_paths, + std::vector const& output_ids, std::unordered_set const& keep_quotes, int64_t scratch_size, bool allow_leading_zero_numbers, @@ -1070,12 +1071,18 @@ std::vector> get_json_object_batch( out_stringviews.emplace_back(rmm::device_uvector>{ static_cast(input.size()), stream}); - h_path_data.emplace_back(json_path_processing_data{d_json_paths[idx], - in_offsets, - out_stringviews.back().data(), - scratch_buffers.back().data(), - d_error_check.data() + idx, - keep_quotes.find(idx) != keep_quotes.end()}); + printf("idx: %d, output_ids[idx]: %d\n", (int)idx, (int)output_ids[idx]); + printf("keep_quotes.find(output_ids[idx]) != keep_quotes.end(): %d\n", + (int)(keep_quotes.find(output_ids[idx]) != keep_quotes.end())); + fflush(stdout); + + h_path_data.emplace_back( + json_path_processing_data{d_json_paths[idx], + in_offsets, + out_stringviews.back().data(), + scratch_buffers.back().data(), + d_error_check.data() + idx, + keep_quotes.find(output_ids[idx]) != keep_quotes.end()}); } auto d_path_data = cudf::detail::make_device_uvector_async( h_path_data, stream, rmm::mr::get_current_device_resource()); @@ -1139,7 +1146,7 @@ std::vector> get_json_object_batch( nullptr /*out_stringviews*/, out_char_buffers.back().data(), d_error_check.data() + idx, - keep_quotes.find(idx) != keep_quotes.end()}); + keep_quotes.find(output_ids[idx]) != keep_quotes.end()}); } else { output.emplace_back(cudf::make_strings_column(out_sview, stream, mr)); } @@ -1244,6 +1251,7 @@ std::vector> get_json_object( auto tmp = get_json_object_batch(*d_input_ptr, in_offsets, batch, + output_ids, keep_quotes, scratch_size, allow_leading_zero_numbers, From 41304a461e08c4091facee64480a4fc91ffeed11 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 22 Aug 2024 23:10:35 -0700 Subject: [PATCH 17/58] Allow to specify `allow_non_numeric_numbers` Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 9 ++- src/main/cpp/src/from_json.hpp | 1 + src/main/cpp/src/from_json_to_structs.cu | 17 ++++- src/main/cpp/src/get_json_object.cu | 51 +++++++++++-- src/main/cpp/src/get_json_object.hpp | 5 ++ src/main/cpp/src/json_parser.cuh | 75 ++++++++++++++++++- src/main/cpp/tests/from_json.cu | 4 +- .../nvidia/spark/rapids/jni/JSONUtils.java | 8 +- .../spark/rapids/jni/GetJsonObjectTest.java | 2 +- 9 files changed, 151 insertions(+), 21 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index e792c35d03..d9e7bf55e4 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -212,7 +212,8 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, jintArray j_num_children, jintArray j_types, jintArray j_scales, - jboolean allow_leading_zero_numbers) + jboolean allow_leading_zero_numbers, + jboolean allow_non_numeric_numbers) { JNI_NULL_CHECK(env, j_input, "j_input is null", 0); JNI_NULL_CHECK(env, j_col_names, "j_col_names is null", 0); @@ -263,8 +264,10 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, fflush(stdout); auto const input_cv = reinterpret_cast(j_input); - auto output = spark_rapids_jni::from_json_to_structs( - cudf::strings_column_view{*input_cv}, schema, allow_leading_zero_numbers); + auto output = spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{*input_cv}, + schema, + allow_leading_zero_numbers, + allow_non_numeric_numbers); printf("JNI line %d\n", __LINE__); 
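// ----------------------------------------------------------------------------
// A standalone sketch of the pre-order "flattened schema" layout that
// `read_schema_element` above consumes from the JNI arrays (names,
// numChildren, typeIds, scales): each entry is followed immediately by the
// entries of its children, and a single index advances through all arrays.
// Toy types only; the concrete values are hypothetical.
#include <string>
#include <vector>

struct flat_entry {
  std::string name;
  int num_children;                                  // 0 for a leaf column
};

// Consume one subtree starting at `idx`, collecting dotted leaf names.
inline void decode(std::vector<flat_entry> const& flat, int& idx,
                   std::string const& prefix, std::vector<std::string>& leaves)
{
  auto const entry = flat[idx++];
  auto const full  = prefix.empty() ? entry.name : prefix + "." + entry.name;
  if (entry.num_children == 0) { leaves.push_back(full); return; }
  for (int i = 0; i < entry.num_children; ++i) { decode(flat, idx, full, leaves); }
}
// Looping `while (idx < (int)flat.size()) decode(flat, idx, "", leaves);` over
// {"a",0}, {"b",1}, {"c",0} yields the leaves {"a", "b.c"}.
// ----------------------------------------------------------------------------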
fflush(stdout); diff --git a/src/main/cpp/src/from_json.hpp b/src/main/cpp/src/from_json.hpp index 6e32d8e95d..decd3b8640 100644 --- a/src/main/cpp/src/from_json.hpp +++ b/src/main/cpp/src/from_json.hpp @@ -38,6 +38,7 @@ std::vector> from_json_to_structs( cudf::strings_column_view const& input, std::vector> const& schema, bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 1396ce7b51..fb5f79c29c 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -428,6 +428,7 @@ std::vector> get_json_object( int64_t memory_budget_bytes, int32_t parallel_override, bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -435,6 +436,7 @@ std::vector> from_json_to_structs( cudf::strings_column_view const& input, std::vector> const& schema, bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -462,8 +464,15 @@ std::vector> from_json_to_structs( fflush(stdout); #endif - auto tmp = get_json_object( - input, json_paths, keep_quotes, -1L, -1, allow_leading_zero_numbers, stream, mr); + auto tmp = get_json_object(input, + json_paths, + keep_quotes, + -1L, + -1, + allow_leading_zero_numbers, + allow_non_numeric_numbers, + stream, + mr); printf("line %d\n", __LINE__); fflush(stdout); @@ -476,11 +485,13 @@ std::vector> from_json_to_structs( cudf::strings_column_view const& input, std::vector> const& schema, bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::from_json_to_structs(input, schema, allow_leading_zero_numbers, stream, mr); + return detail::from_json_to_structs( + input, schema, allow_leading_zero_numbers, allow_non_numeric_numbers, stream, mr); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index 14174b49cc..40dc02eec5 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -149,6 +149,8 @@ class json_generator { if (array_depth > 0) { is_curr_array_empty = false; } + // printf("parser line %d\n", __LINE__); + auto [b, copy_len] = parser.copy_current_structure(out_begin + offset + output_len); output_len += copy_len; return b; @@ -464,6 +466,8 @@ __device__ thrust::pair evaluate_path( // case (_, Nil) // case path 3 else if (path_is_empty(ctx.path.size())) { + // printf("get obj line %d\n", __LINE__); + // general case: just copy the child tree verbatim if (!(ctx.g.copy_current_structure(p, out_buf))) { // JSON validation check @@ -713,6 +717,8 @@ __device__ thrust::pair evaluate_path( // case _ => // case path 12 else { + // printf("get obj line %d\n", __LINE__); + if (!p.try_skip_children()) { return {false, 0}; } // default case path, return false for this task ctx.dirty = 0; @@ -829,6 +835,7 @@ struct json_path_processing_data { * and we want to avoid spilling all the time or else the performance is really bad. This * essentially tells NVCC to prefer using lots of registers over spilling. 
* + * TODO update * @param input The input JSON strings stored in a strings column * @param path_data Array containing all path data * @param num_threads_per_row Number of threads processing each input row @@ -840,6 +847,7 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL void get_json_object_kernel(cudf::column_device_view input, cudf::device_span path_data, bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, std::size_t num_threads_per_row, int8_t* max_path_depth_exceeded) { @@ -859,6 +867,7 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL if (str.size_bytes() > 0) { json_parser p{char_range{str}}; p.set_allow_leading_zero_numbers(allow_leading_zero_numbers); + p.set_allow_non_numeric_numbers(allow_non_numeric_numbers); thrust::tie(is_valid, out_size) = evaluate_path(p, path.path_commands, path.keep_quotes, dst, max_path_depth_exceeded); @@ -889,6 +898,7 @@ struct kernel_launcher { static void exec(cudf::column_device_view const& input, cudf::device_span path_data, bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, int8_t* max_path_depth_exceeded, rmm::cuda_stream_view stream) { @@ -905,8 +915,12 @@ struct kernel_launcher { auto const num_blocks = cudf::util::div_rounding_up_safe(num_threads_per_row * input.size(), static_cast(block_size)); get_json_object_kernel - <<>>( - input, path_data, allow_leading_zero_numbers, num_threads_per_row, max_path_depth_exceeded); + <<>>(input, + path_data, + allow_leading_zero_numbers, + allow_non_numeric_numbers, + num_threads_per_row, + max_path_depth_exceeded); } }; @@ -1040,6 +1054,7 @@ std::vector> get_json_object_batch( std::unordered_set const& keep_quotes, int64_t scratch_size, bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1089,8 +1104,12 @@ std::vector> get_json_object_batch( thrust::uninitialized_fill( rmm::exec_policy(stream), d_error_check.begin(), d_error_check.end(), 0); - kernel_launcher::exec( - input, d_path_data, allow_leading_zero_numbers, d_max_path_depth_exceeded, stream); + kernel_launcher::exec(input, + d_path_data, + allow_leading_zero_numbers, + allow_non_numeric_numbers, + d_max_path_depth_exceeded, + stream); auto h_error_check = cudf::detail::make_host_vector_sync(d_error_check, stream); auto has_no_oob = check_error(h_error_check); @@ -1160,8 +1179,12 @@ std::vector> get_json_object_batch( h_path_data, stream, rmm::mr::get_current_device_resource()); thrust::uninitialized_fill( rmm::exec_policy(stream), d_error_check.begin(), d_error_check.end(), 0); - kernel_launcher::exec( - input, d_path_data, allow_leading_zero_numbers, d_max_path_depth_exceeded, stream); + kernel_launcher::exec(input, + d_path_data, + allow_leading_zero_numbers, + allow_non_numeric_numbers, + d_max_path_depth_exceeded, + stream); h_error_check = cudf::detail::make_host_vector_sync(d_error_check, stream); has_no_oob = check_error(h_error_check); @@ -1190,6 +1213,7 @@ std::vector> get_json_object( int64_t memory_budget_bytes, int32_t parallel_override, bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1255,6 +1279,7 @@ std::vector> get_json_object( keep_quotes, scratch_size, allow_leading_zero_numbers, + allow_non_numeric_numbers, stream, mr); for (std::size_t i = 0; i < tmp.size(); i++) { @@ -1272,12 +1297,20 @@ std::unique_ptr get_json_object( cudf::strings_column_view const& input, std::vector> const& instructions, 
bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::move(detail::get_json_object( - input, {instructions}, {}, -1, -1, allow_leading_zero_numbers, stream, mr) + return std::move(detail::get_json_object(input, + {instructions}, + {}, + -1, + -1, + allow_leading_zero_numbers, + allow_non_numeric_numbers, + stream, + mr) .front()); } @@ -1288,6 +1321,7 @@ std::vector> get_json_object_multiple_paths( int64_t memory_budget_bytes, int32_t parallel_override, bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1298,6 +1332,7 @@ std::vector> get_json_object_multiple_paths( memory_budget_bytes, parallel_override, allow_leading_zero_numbers, + allow_non_numeric_numbers, stream, mr); } diff --git a/src/main/cpp/src/get_json_object.hpp b/src/main/cpp/src/get_json_object.hpp index bb1d73afc2..f3511919f4 100644 --- a/src/main/cpp/src/get_json_object.hpp +++ b/src/main/cpp/src/get_json_object.hpp @@ -45,6 +45,7 @@ std::unique_ptr get_json_object( cudf::strings_column_view const& input, std::vector> const& instructions, bool allow_leading_zero_numbers = false, + bool allow_non_numeric_numbers = false, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); @@ -54,6 +55,9 @@ std::unique_ptr get_json_object( * This function processes all the JSON paths in parallel, which may be faster than calling * to `get_json_object` on the individual JSON paths. However, it may consume much more GPU * memory, proportional to the number of JSON paths. + * + * TODO update params + * * @param input the input string column to parse JSON from * @param json_paths the path operations to read extract * @param memory_budget_bytes a memory budget for temporary memory usage if > 0 @@ -68,6 +72,7 @@ std::vector> get_json_object_multiple_paths( int64_t memory_budget_bytes, int32_t parallel_override, bool allow_leading_zero_numbers = false, + bool allow_non_numeric_numbers = false, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/cpp/src/json_parser.cuh b/src/main/cpp/src/json_parser.cuh index 3061633d04..403e6e7fe2 100644 --- a/src/main/cpp/src/json_parser.cuh +++ b/src/main/cpp/src/json_parser.cuh @@ -95,6 +95,9 @@ enum class json_token : int8_t { // e.g.: 1.25 in {"key1" : 1.25} VALUE_NUMBER_FLOAT, + // One from case sensitive of {"NaN", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity"} + VALUE_NON_NUMERIC_FLOAT, + // e.g.: true in {"key1" : true} VALUE_TRUE, @@ -1018,7 +1021,43 @@ class json_parser { */ __device__ inline void parse_number_and_set_current() { - // parse sign + if (allow_non_numeric_numbers && (curr_pos + 2) < chars.size()) { + // Check for NaN only, no any `+` or `-` sign. 
+ if (chars[curr_pos] == 'N' && chars[curr_pos + 1] == 'a' && chars[curr_pos + 2] == 'N') { + current_token = json_token::VALUE_NON_NUMERIC_FLOAT; + curr_pos += 3; + number_token_len = curr_pos - current_token_start_pos; + // printf("parser line %d\n", __LINE__); + return; + } + + auto const matched_sign = chars[curr_pos] == '-' || chars[curr_pos] == '+'; + if (matched_sign) { ++curr_pos; } + + if ((curr_pos + 2) < chars.size() && chars[curr_pos] == 'I' && chars[curr_pos + 1] == 'N' && + chars[curr_pos + 2] == 'F') { + current_token = json_token::VALUE_NON_NUMERIC_FLOAT; + curr_pos += 3; + number_token_len = curr_pos - current_token_start_pos; + // printf("parser line %d\n", __LINE__); + + return; + } + if ((curr_pos + 7) < chars.size() && chars[curr_pos] == 'I' && chars[curr_pos + 1] == 'n' && + chars[curr_pos + 2] == 'f' && chars[curr_pos + 3] == 'i' && chars[curr_pos + 4] == 'n' && + chars[curr_pos + 5] == 'i' && chars[curr_pos + 6] == 't' && chars[curr_pos + 7] == 'y') { + current_token = json_token::VALUE_NON_NUMERIC_FLOAT; + curr_pos += 8; + number_token_len = curr_pos - current_token_start_pos; + + // printf("parser line %d\n", __LINE__); + return; + } + + // Restore to the original position to parse again as a regular number. + if (matched_sign) { --curr_pos; } + } + try_skip(curr_pos, '-'); // parse unsigned number @@ -1448,6 +1487,14 @@ class json_parser { cudf::strings::detail::stod(chars.slice_sv(current_token_start_pos, number_token_len)); return spark_rapids_jni::ftos_converter::double_normalization(d_value, destination); } + case json_token::VALUE_NON_NUMERIC_FLOAT: { + if (nullptr != destination) { + for (cudf::size_type i = 0; i < number_token_len; ++i) { + *destination++ = chars[current_token_start_pos + i]; + } + } + return number_token_len; + } case json_token::VALUE_TRUE: if (nullptr != destination) { *destination++ = 't'; @@ -1509,6 +1556,8 @@ class json_parser { */ __device__ cudf::size_type write_escaped_text(char* destination) const { + // printf("parser line %d\n", __LINE__); + switch (current_token) { case json_token::VALUE_STRING: { // can not copy from JSON directly due to escaped chars @@ -1534,6 +1583,17 @@ class json_parser { cudf::strings::detail::stod(chars.slice_sv(current_token_start_pos, number_token_len)); return spark_rapids_jni::ftos_converter::double_normalization(d_value, destination); } + case json_token::VALUE_NON_NUMERIC_FLOAT: { + if (nullptr != destination) { + // printf("parser line %d\n", __LINE__); + + for (cudf::size_type i = 0; i < number_token_len; ++i) { + *destination++ = chars[current_token_start_pos + i]; + } + } + // printf("parser line %d\n", __LINE__); + return number_token_len; + } case json_token::VALUE_TRUE: if (nullptr != destination) { *destination++ = 't'; @@ -1617,6 +1677,8 @@ class json_parser { */ __device__ thrust::pair copy_current_structure(char* copy_to) { + // printf("parser line %d\n", __LINE__); + switch (current_token) { case json_token::INIT: case json_token::ERROR: @@ -1626,12 +1688,15 @@ class json_parser { case json_token::END_OBJECT: return thrust::make_pair(false, 0); case json_token::VALUE_NUMBER_INT: case json_token::VALUE_NUMBER_FLOAT: + case json_token::VALUE_NON_NUMERIC_FLOAT: case json_token::VALUE_STRING: case json_token::VALUE_TRUE: case json_token::VALUE_FALSE: case json_token::VALUE_NULL: // copy terminal token if (nullptr != copy_to) { + // printf("parser line %d\n", __LINE__); + size_t copy_len = write_escaped_text(copy_to); return thrust::make_pair(true, copy_len); } else { @@ -1702,6 +1767,11 
@@ class json_parser { allow_leading_zero_numbers = state; } + __device__ inline void set_allow_non_numeric_numbers(bool state) + { + allow_non_numeric_numbers = state; + } + private: char_range const chars; cudf::size_type curr_pos; @@ -1727,6 +1797,9 @@ class json_parser { // Whether allow to have leading zero in numbers. bool allow_leading_zero_numbers; + + // TODO + bool allow_non_numeric_numbers; }; } // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index 0ac10893c4..361a0579a9 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -36,8 +36,8 @@ TEST_F(FromJsonTest, Initialization) {"a", {cudf::data_type{cudf::type_id::STRING}}}, }; - auto const output = - spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{json_string}, schema, false); + auto const output = spark_rapids_jni::from_json_to_structs( + cudf::strings_column_view{json_string}, schema, false, false); cudf::test::print(json_string); diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index beed206d65..756375a9dd 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -168,12 +168,13 @@ public static ColumnVector extractRawMapFromJsonString(ColumnView input) { * @return */ public static Table fromJsonToStructs(ColumnView input, Schema schema, - boolean allowNumericLeadingZeros) { + boolean allowNumericLeadingZeros, + boolean allowNonNumericNumbers) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; return new Table(fromJsonToStructs(input.getNativeView(), schema.getFlattenedColumnNames(), schema.getFlattenedNumChildren(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), - allowNumericLeadingZeros)); + allowNumericLeadingZeros, allowNonNumericNumbers)); } private static native int getMaxJSONPathDepth(); @@ -198,5 +199,6 @@ private static native long[] fromJsonToStructs(long input, String[] columnNames, int[] numChildren, int[] dTypeIds, int[] dTypeScales, - boolean allowNumericLeadingZeros); + boolean allowNumericLeadingZeros, + boolean allowNonNumericNumbers); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java index f82f3a09c9..0725d887df 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java @@ -802,7 +802,7 @@ void testFromJSON() { try (ColumnVector input = ColumnVector.fromStrings("{'a': '1', 'b': '2'}"); ai.rapids.cudf.Table expected = new ai.rapids.cudf.Table.TestBuilder().column("1").build(); - ai.rapids.cudf.Table actual = JSONUtils.fromJsonToStructs(input, schema, false)) { + ai.rapids.cudf.Table actual = JSONUtils.fromJsonToStructs(input, schema, false, false)) { assertTablesAreEqual(expected, actual); } } From 5ba150681ea470d7095dd74c8b8472ce33d848c1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 25 Aug 2024 14:07:03 -0700 Subject: [PATCH 18/58] WIP for supporting structs Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 62 ++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index fb5f79c29c..a6372cbbc7 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ 
b/src/main/cpp/src/from_json_to_structs.cu @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -388,9 +389,7 @@ void travel_path( { current_path.emplace_back(path_instruction_type::NAMED, name, -1); if (column_schema.child_types.size() == 0) { // leaf of the schema - if (column_schema.type.id() == cudf::type_id::DECIMAL32 || - column_schema.type.id() == cudf::type_id::DECIMAL64 || - column_schema.type.id() == cudf::type_id::DECIMAL128) { + if (cudf::is_fixed_width(column_schema.type)) { // TODO: comment keep_quotes.insert(paths.size()); } @@ -401,12 +400,13 @@ void travel_path( travel_path(paths, current_path, keep_quotes, child_name, child_schema); } } + travel_path(paths, current_path, keep_quotes, kv.first, kv.second); current_path.pop_back(); } std::pair>>, std::unordered_set> -convert_schema_to_paths(std::vector> const& schema) +flatten_schema_to_paths(std::vector> const& schema) { std::vector>> paths; std::unordered_set keep_quotes; @@ -419,6 +419,58 @@ convert_schema_to_paths(std::vector>& output, + std::vector> const& read_columns, + cudf::io::schema_element const& column_schema) +{ + if (column_schema.child_types.size() == 0) { // leaf of the schema + paths.push_back(current_path); // this will copy + } else { + for (auto const& [child_name, child_schema] : column_schema.child_types) { + travel_path(paths, current_path, keep_quotes, child_name, child_schema); + } + } +} + +std::vector> assemble_output( + std::vector> const& schema, + std::vector>> const& paths, + std::vector>& read_columns) +{ + // Build a map from column name to column. + std::unordered_map> map_read_columns; + for (std::size_t idx = 0; idx < paths.size(); ++idx) { + map_read_columns.emplace(schema[idx].first, std::move(read_columns[idx])); + } + + std::vector> output; + output.reserve(schema.size()); + std::for_each(schema.begin(), schema.end(), [&](auto const& kv) { + auto const& name = kv.first; + auto const& column_schema = kv.second; + if (column_schema.child_types.size() == 0) { + auto const it = map_read_columns.find(name); + CUDF_EXPECTS(it != map_read_columns.end() && it->second != nullptr, "TODO"); + output.push_back(std::move(it->second)); + } else if (column_schema.type.id() == cudf::type_id::STRUCT) { + auto children = assemble_output(); + auto const num_rows = children.front()->size(); + // TODO: generate null mask from input. 
+ output.emplace_back(std::make_unique(cudf::data_type{type_id::STRUCT}, + num_rows, + rmm::device_buffer{}, + {}, + 0, + std::move(children))); + } else { + CUDF_FAIL("Unsupported schema type"); + // TODO: support list + } + }); + + return output; +} + // Extern std::vector> get_json_object( cudf::strings_column_view const& input, @@ -442,7 +494,7 @@ std::vector> from_json_to_structs( { printf("line %d\n", __LINE__); fflush(stdout); - auto const [json_paths, keep_quotes] = convert_schema_to_paths(schema); + auto const [json_paths, keep_quotes] = flatten_schema_to_paths(schema); printf("line %d\n", __LINE__); fflush(stdout); From 43764386b5d0fe62ef69b34d84d112d2e8564491 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 10 Sep 2024 16:51:30 -0700 Subject: [PATCH 19/58] Support struct type in schema Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 67 +++++++++++------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index a6372cbbc7..36dadcca85 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -396,11 +396,15 @@ void travel_path( printf("column_schema type: %d\n", static_cast(column_schema.type.id())); paths.push_back(current_path); // this will copy } else { + if (column_schema.type.id() != cudf::type_id::STRUCT) { + CUDF_FAIL("Unsupported column type in schema"); + } + + auto const last_path_size = paths.size(); for (auto const& [child_name, child_schema] : column_schema.child_types) { travel_path(paths, current_path, keep_quotes, child_name, child_schema); } } - travel_path(paths, current_path, keep_quotes, kv.first, kv.second); current_path.pop_back(); } @@ -419,53 +423,46 @@ flatten_schema_to_paths(std::vector>& output, - std::vector> const& read_columns, - cudf::io::schema_element const& column_schema) +void assemble_column(std::size_t& column_order, + std::vector>& output, + std::vector>& read_columns, + std::string const& name, + cudf::io::schema_element const& column_schema) { if (column_schema.child_types.size() == 0) { // leaf of the schema - paths.push_back(current_path); // this will copy + output.emplace_back(std::move(read_columns[column_order])); + ++column_order; } else { + if (column_schema.type.id() != cudf::type_id::STRUCT) { + CUDF_FAIL("Unsupported column type in schema"); + } + + std::vector> children; for (auto const& [child_name, child_schema] : column_schema.child_types) { - travel_path(paths, current_path, keep_quotes, child_name, child_schema); + assemble_column(column_order, children, read_columns, child_name, child_schema); } + + // TODO: generate null mask from input. + auto const num_rows = children.front()->size(); + output.emplace_back(std::make_unique(cudf::data_type{cudf::type_id::STRUCT}, + num_rows, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0, + std::move(children))); } } std::vector> assemble_output( std::vector> const& schema, - std::vector>> const& paths, std::vector>& read_columns) { - // Build a map from column name to column. 
- std::unordered_map> map_read_columns; - for (std::size_t idx = 0; idx < paths.size(); ++idx) { - map_read_columns.emplace(schema[idx].first, std::move(read_columns[idx])); - } - std::vector> output; - output.reserve(schema.size()); + output.reserve(read_columns.size()); + + std::size_t column_order{0}; std::for_each(schema.begin(), schema.end(), [&](auto const& kv) { - auto const& name = kv.first; - auto const& column_schema = kv.second; - if (column_schema.child_types.size() == 0) { - auto const it = map_read_columns.find(name); - CUDF_EXPECTS(it != map_read_columns.end() && it->second != nullptr, "TODO"); - output.push_back(std::move(it->second)); - } else if (column_schema.type.id() == cudf::type_id::STRUCT) { - auto children = assemble_output(); - auto const num_rows = children.front()->size(); - // TODO: generate null mask from input. - output.emplace_back(std::make_unique(cudf::data_type{type_id::STRUCT}, - num_rows, - rmm::device_buffer{}, - {}, - 0, - std::move(children))); - } else { - CUDF_FAIL("Unsupported schema type"); - // TODO: support list - } + assemble_column(column_order, output, read_columns, kv.first, kv.second); }); return output; @@ -528,7 +525,7 @@ std::vector> from_json_to_structs( printf("line %d\n", __LINE__); fflush(stdout); - return tmp; + return assemble_output(schema, tmp); } } // namespace detail From 7c5e3ece477d1f2860b45e69628126483389974e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 11 Sep 2024 14:35:32 -0700 Subject: [PATCH 20/58] Apply null mask for structs Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 60 +++++++++++++++++++----- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 36dadcca85..7ff8e2ab1e 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -400,6 +400,8 @@ void travel_path( CUDF_FAIL("Unsupported column type in schema"); } + paths.push_back(current_path); // this will copy + auto const last_path_size = paths.size(); for (auto const& [child_name, child_schema] : column_schema.child_types) { travel_path(paths, current_path, keep_quotes, child_name, child_schema); @@ -427,7 +429,9 @@ void assemble_column(std::size_t& column_order, std::vector>& output, std::vector>& read_columns, std::string const& name, - cudf::io::schema_element const& column_schema) + cudf::io::schema_element const& column_schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { if (column_schema.child_types.size() == 0) { // leaf of the schema output.emplace_back(std::move(read_columns[column_order])); @@ -439,30 +443,32 @@ void assemble_column(std::size_t& column_order, std::vector> children; for (auto const& [child_name, child_schema] : column_schema.child_types) { - assemble_column(column_order, children, read_columns, child_name, child_schema); + assemble_column(column_order, children, read_columns, child_name, child_schema, stream, mr); } + auto const null_count = read_columns[column_order]->null_count(); + auto const null_mask = std::move(read_columns[column_order]->release().null_mask); + ++column_order; + // TODO: generate null mask from input. 
auto const num_rows = children.front()->size(); - output.emplace_back(std::make_unique(cudf::data_type{cudf::type_id::STRUCT}, - num_rows, - rmm::device_buffer{}, - rmm::device_buffer{}, - 0, - std::move(children))); + output.emplace_back(cudf::make_structs_column( + num_rows, std::move(children), null_count, std::move(*null_mask), stream, mr)); } } std::vector> assemble_output( std::vector> const& schema, - std::vector>& read_columns) + std::vector>& read_columns, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { std::vector> output; output.reserve(read_columns.size()); std::size_t column_order{0}; std::for_each(schema.begin(), schema.end(), [&](auto const& kv) { - assemble_column(column_order, output, read_columns, kv.first, kv.second); + assemble_column(column_order, output, read_columns, kv.first, kv.second, stream, mr); }); return output; @@ -511,6 +517,20 @@ std::vector> from_json_to_structs( } printf("\n\n\n"); fflush(stdout); + + auto ptr = input.chars_begin(stream); + auto size = input.chars_size(stream); + std::vector h_v(size); + CUDF_CUDA_TRY( + cudaMemcpyAsync(h_v.data(), ptr, sizeof(char) * size, cudaMemcpyDefault, stream.value())); + stream.synchronize(); + + printf("input (size = %d): ", (int)size); + for (auto c : h_v) { + printf("%c", c); + } + printf("\n"); + #endif auto tmp = get_json_object(input, @@ -525,7 +545,25 @@ std::vector> from_json_to_structs( printf("line %d\n", __LINE__); fflush(stdout); - return assemble_output(schema, tmp); + if (1) { + for (std::size_t i = 0; i < tmp.size(); ++i) { + auto out = cudf::strings_column_view{tmp[i]->view()}; + auto ptr = out.chars_begin(stream); + auto size = out.chars_size(stream); + std::vector h_v(size); + CUDF_CUDA_TRY( + cudaMemcpyAsync(h_v.data(), ptr, sizeof(char) * size, cudaMemcpyDefault, stream.value())); + stream.synchronize(); + + printf("out %d (size = %d): ", (int)i, (int)size); + for (auto c : h_v) { + printf("%c", c); + } + printf("\n"); + } + } + + return assemble_output(schema, tmp, stream, mr); } } // namespace detail From 41270403d22ba302af0002cffbbc8ce102762466 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 13 Sep 2024 22:17:57 -0700 Subject: [PATCH 21/58] Fix column order Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 7ff8e2ab1e..c5cbbc1747 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -441,15 +441,15 @@ void assemble_column(std::size_t& column_order, CUDF_FAIL("Unsupported column type in schema"); } + auto const null_count = read_columns[column_order]->null_count(); + auto const null_mask = std::move(read_columns[column_order]->release().null_mask); + ++column_order; + std::vector> children; for (auto const& [child_name, child_schema] : column_schema.child_types) { assemble_column(column_order, children, read_columns, child_name, child_schema, stream, mr); } - auto const null_count = read_columns[column_order]->null_count(); - auto const null_mask = std::move(read_columns[column_order]->release().null_mask); - ++column_order; - // TODO: generate null mask from input. 
auto const num_rows = children.front()->size(); output.emplace_back(cudf::make_structs_column( From 97cd60bbf81dee5f119df8849ee0e325b7496804 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 17 Sep 2024 10:54:06 -0700 Subject: [PATCH 22/58] Add `from_json_object` kernel Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 987 ++++++++++++++++++++++- 1 file changed, 945 insertions(+), 42 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index c5cbbc1747..b32435ca6d 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -49,14 +49,811 @@ namespace spark_rapids_jni { namespace detail { -#if 0 + namespace test { + +/** + * @brief JSON style to write. + */ +enum class write_style : int8_t { RAW, QUOTED, FLATTEN }; + /** - * @brief TODO + * @brief Instruction along a JSON path. + */ +struct path_instruction { + __device__ inline path_instruction(path_instruction_type _type) : type(_type) {} + + // used when type is named type + cudf::string_view name; + + // used when type is index + int index{-1}; + + path_instruction_type type; +}; + +/** + * @brief JSON generator used to write out JSON content. + * + * Because of get_json_object only outputs JSON object as a whole item, + * it's no need to store internal state for JSON object when outputing, + * only need to store internal state for JSON array. + */ +class json_generator { + public: + __device__ json_generator(int _offset = 0) : offset(_offset), output_len(0) {} + + // create a nested child generator based on this parent generator, + // child generator is a view, parent and child share the same byte array + __device__ json_generator new_child_generator() const + { + return json_generator(offset + output_len); + } + + // write [ + // add an extra comma if needed, + // e.g.: when JSON content is: [[1,2,3] + // writing a new [ should result: [[1,2,3],[ + __device__ void write_start_array(char* out_begin) + { + try_write_comma(out_begin); + + out_begin[offset + output_len] = '['; + output_len++; + array_depth++; + // new array is empty + is_curr_array_empty = true; + } + + // write ] + __device__ void write_end_array(char* out_begin) + { + out_begin[offset + output_len] = ']'; + output_len++; + // point to parent array + array_depth--; + // set parent array as non-empty because already had a closed child item. + is_curr_array_empty = false; + } + + // write first start array without output, only update internal state + __device__ void write_first_start_array_without_output() + { + // hide the outer start array token + // Note: do not inc output_len + array_depth++; + // new array is empty + is_curr_array_empty = true; + } + + // return true if it's in a array context and it's not writing the first item. + __device__ inline bool need_comma() const { return (array_depth > 0 && !is_curr_array_empty); } + + /** + * write comma accroding to current generator state + */ + __device__ void try_write_comma(char* out_begin) + { + if (need_comma()) { + // in array context and writes first item + out_begin[offset + output_len] = ','; + output_len++; + } + } + + /** + * copy current structure when parsing. If current token is start + * object/array, then copy to corresponding matched end object/array. 
return + * false if JSON format is invalid return true if JSON format is valid + */ + __device__ bool copy_current_structure(json_parser& parser, char* out_begin) + { + // first try add comma + try_write_comma(out_begin); + + if (array_depth > 0) { is_curr_array_empty = false; } + + // printf("parser line %d\n", __LINE__); + + auto [b, copy_len] = parser.copy_current_structure(out_begin + offset + output_len); + output_len += copy_len; + return b; + } + + static __device__ cudf::size_type write_quote(char* out, bool keep_quotes) + { + if (!keep_quotes) { return 0; } + *out = '"'; + return 1; + } + + /** + * Get current text from JSON parser and then write the text + * Note: Because JSON strings contains '\' to do escape, + * JSON parser should do unescape to remove '\' and JSON parser + * then can not return a pointer and length pair (char *, len), + * For number token, JSON parser can return a pair (char *, len) + */ + __device__ void write_raw(json_parser& parser, char* out_begin, bool keep_quotes) + { + if (array_depth > 0) { is_curr_array_empty = false; } + + output_len += write_quote(out_begin + offset + output_len, keep_quotes); + output_len += parser.write_unescaped_text(out_begin + offset + output_len); + output_len += write_quote(out_begin + offset + output_len, keep_quotes); + } + + /** + * write child raw value + * e.g.: + * + * write_outer_array_tokens = false + * need_comma = true + * [1,2,3]1,2,3 + * ^ + * | + * child pointer + * ==>> + * [1,2,3],1,2,3 + * + * + * write_outer_array_tokens = true + * need_comma = true + * [12,3,4 + * ^ + * | + * child pointer + * ==>> + * [1,[2,3,4] + * + * For more information about param write_outer_array_tokens, refer to + * `write_first_start_array_without_output` + * @param child_block_begin + * @param child_block_len + * @param write_outer_array_tokens whether write outer array tokens for child + * block + */ + __device__ void write_child_raw_value(char* child_block_begin, + int child_block_len, + bool write_outer_array_tokens) + { + bool insert_comma = need_comma(); + + if (array_depth > 0) { is_curr_array_empty = false; } + + if (write_outer_array_tokens) { + if (insert_comma) { + *(child_block_begin + child_block_len + 2) = ']'; + move_forward(child_block_begin, child_block_len, 2); + *(child_block_begin + 1) = '['; + *(child_block_begin) = ','; + } else { + *(child_block_begin + child_block_len + 1) = ']'; + move_forward(child_block_begin, child_block_len, 1); + *(child_block_begin) = '['; + } + } else { + if (insert_comma) { + move_forward(child_block_begin, child_block_len, 1); + *(child_block_begin) = ','; + } else { + // do not need comma && do not need write outer array tokens + // do nothing, because child generator buff is directly after the + // parent generator + } + } + + // update length + if (insert_comma) { output_len++; } + if (write_outer_array_tokens) { output_len += 2; } + output_len += child_block_len; + } + + // move memory block forward by specified bytes + // e.g.: memory is: 1 2 0 0, begin is 1, len is 2, after moving, + // memory is: 1 2 1 2. + // e.g.: memory is: 1 2 0 0, begin is 1, len is 1, after moving, + // memory is: 1 1 2 0. 
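  // (In other words, the block is shifted towards higher addresses by `forward` bytes so
  //  that a ',' and/or '[' can be written in front of it; see write_child_raw_value above.)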
+ // Note: should move from end to begin to avoid overwrite buffer + static __device__ void move_forward(char* begin, size_t len, int forward) + { + // TODO copy by 8 bytes + char* pos = begin + len + forward - 1; + char* e = begin + forward - 1; + while (pos > e) { + *pos = *(pos - forward); + pos--; + } + } + + __device__ inline int get_offset() const { return offset; } + __device__ inline int get_output_len() const { return output_len; } + + /** + * generator may contain trash output, e.g.: generator writes some output, + * then JSON format is invalid, the previous output becomes trash. + */ + __device__ inline void set_output_len_zero() { output_len = 0; } + + __device__ inline void set_output_len(size_t len) { output_len = len; } + + private: + int offset; // offset from the global output buffer + int output_len; + + int array_depth = 0; + + // whether already worte a item in current array + // used to decide whether add a comma before writing out a new item. + bool is_curr_array_empty; +}; + +/** + * path evaluator which can run on both CPU and GPU + */ +__device__ inline bool path_is_empty(size_t path_size) { return path_size == 0; } + +__device__ inline bool path_match_element(cudf::device_span path, + path_instruction_type path_type0) +{ + if (path.size() < 1) { return false; } + return path.data()[0].type == path_type0; +} + +__device__ inline bool path_match_elements(cudf::device_span path, + path_instruction_type path_type0, + path_instruction_type path_type1) +{ + if (path.size() < 2) { return false; } + return path.data()[0].type == path_type0 && path.data()[1].type == path_type1; +} + +__device__ inline thrust::tuple path_match_index( + cudf::device_span path) +{ + auto match = path_match_element(path, path_instruction_type::INDEX); + if (match) { + return thrust::make_tuple(true, path.data()[0].index); + } else { + return thrust::make_tuple(false, 0); + } +} + +__device__ inline thrust::tuple path_match_named( + cudf::device_span path) +{ + auto match = path_match_element(path, path_instruction_type::NAMED); + if (match) { + return thrust::make_tuple(true, path.data()[0].name); + } else { + return thrust::make_tuple(false, cudf::string_view()); + } +} + +__device__ inline thrust::tuple path_match_index_wildcard( + cudf::device_span path) +{ + auto match = + path_match_elements(path, path_instruction_type::INDEX, path_instruction_type::WILDCARD); + if (match) { + return thrust::make_tuple(true, path.data()[0].index); + } else { + return thrust::make_tuple(false, 0); + } +} + +/** + * @brief The cases that mirro Apache Spark case path in `jsonExpressions.scala#evaluatePath()`. + */ +enum class evaluation_case_path : int8_t { + INVALID = -1, + START_ARRAY___EMPTY_PATH___FLATTEN_STYLE = 2, + START_OBJECT___MATCHED_NAME_PATH = 4, + START_ARRAY___MATCHED_DOUBLE_WILDCARD = 5, + START_ARRAY___MATCHED_WILDCARD___STYLE_NOT_QUOTED = 6, + START_ARRAY___MATCHED_WILDCARD = 7, + START_ARRAY___MATCHED_INDEX_AND_WILDCARD = 8, + START_ARRAY___MATCHED_INDEX = 9 +}; + +/** + * @brief The struct to store states during processing JSON through different nested levels. 
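 *
 * Contexts are not allocated recursively: `evaluate_path` keeps them in a fixed-size array
 * (`context stack[MAX_JSON_PATH_DEPTH + 1]`, see below) and loops until the stack drains,
 * so walking deeply nested JSON does not grow the device call stack.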
+ */ +struct context { + // used to save current generator + json_generator g; + + // used to save child JSON generator for case path 6 + json_generator child_g; + + cudf::device_span path; + + // whether written output + // if dirty > 0, indicates success + int dirty; + + // which case path that this task is from + evaluation_case_path case_path; + + // current token + json_token token; + + write_style style; + + // for some case paths + bool is_first_enter; + + // is this context task is done + bool task_is_done; +}; + +/** + * @brief Parse a single json string using the provided command buffer. + * + * @param p The JSON parser for input string + * @param path_commands The command buffer to be applied to the string + * TODO: update + * @param out_buf Buffer user to store the string resulted from the query + * @param max_path_depth_exceeded A marker to record if the maximum path depth has been reached + * during parsing the input string + * @return A pair containing the result code and the output size + */ +__device__ thrust::pair evaluate_path( + json_parser& p, + cudf::device_span path_commands, + bool keep_quotes, + char* out_buf, + int8_t* max_path_depth_exceeded) +{ + p.next_token(); + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + + // Define stack; plus 1 indicates root context task needs an extra memory. + context stack[MAX_JSON_PATH_DEPTH + 1]; + int stack_size = 0; + + auto const push_context = [&](evaluation_case_path _case_path, + json_generator _g, + write_style _style, + cudf::device_span _path) { + if (stack_size > MAX_JSON_PATH_DEPTH) { + *max_path_depth_exceeded = 1; + // Because no more context is pushed, the evaluation output should be wrong. + // But that is not important, since we will throw exception after the kernel finishes. 
+ return; + } + auto& ctx = stack[stack_size++]; + ctx.g = std::move(_g); + ctx.path = std::move(_path); + ctx.dirty = 0; + ctx.case_path = _case_path; + ctx.token = p.get_current_token(); + ctx.style = _style; + ctx.is_first_enter = true; + ctx.task_is_done = false; + }; + + push_context(evaluation_case_path::INVALID, json_generator{}, write_style::RAW, path_commands); + + while (stack_size > 0) { + auto& ctx = stack[stack_size - 1]; + if (!ctx.task_is_done) { + // case (VALUE_STRING, Nil) if style == RawStyle + // case path 1 + if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path.size()) && + ctx.style == write_style::RAW) { + // there is no array wildcard or slice parent, emit this string without + // quotes write current string in parser to generator + ctx.g.write_raw(p, out_buf, keep_quotes); + ctx.dirty = 1; + ctx.task_is_done = true; + } + // case (START_ARRAY, Nil) if style == FlattenStyle + // case path 2 + else if (json_token::START_ARRAY == ctx.token && path_is_empty(ctx.path.size()) && + ctx.style == write_style::FLATTEN) { + // flatten this array into the parent + if (json_token::END_ARRAY != p.next_token()) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + // push back task + // add child task + push_context(evaluation_case_path::START_ARRAY___EMPTY_PATH___FLATTEN_STYLE, + ctx.g, + ctx.style, + {nullptr, 0}); + } else { + // END_ARRAY + ctx.task_is_done = true; + } + } + // case (_, Nil) + // case path 3 + else if (path_is_empty(ctx.path.size())) { + // printf("get obj line %d\n", __LINE__); + + // general case: just copy the child tree verbatim + if (!(ctx.g.copy_current_structure(p, out_buf))) { + // JSON validation check + return {false, 0}; + } + ctx.dirty = 1; + ctx.task_is_done = true; + } + // case (START_OBJECT, Named :: xs) + // case path 4 + else if (json_token::START_OBJECT == ctx.token && + thrust::get<0>(path_match_named(ctx.path))) { + if (!ctx.is_first_enter) { + // 2st enter + // skip the following children after the expect + if (ctx.dirty > 0) { + while (json_token::END_OBJECT != p.next_token()) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + + // skip FIELD_NAME token + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + + // skip value of FIELD_NAME + if (!p.try_skip_children()) { + // JSON validation check + return {false, 0}; + } + } + } + // Mark task is done regardless whether the expected child was found. 
+ ctx.task_is_done = true; + } else { + // below is 1st enter + ctx.is_first_enter = false; + // match first mached children with expected name + bool found_expected_child = false; + while (json_token::END_OBJECT != p.next_token()) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + + // need to try more children + auto match_named = path_match_named(ctx.path); + auto named = thrust::get<1>(match_named); + // current token is FIELD_NAME + if (p.match_current_field_name(named)) { + // skip FIELD_NAME token + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + + // meets null token, it's not expected, return false + if (json_token::VALUE_NULL == p.get_current_token()) { return {false, 0}; } + // push sub task; sub task will update the result of path 4 + push_context(evaluation_case_path::START_OBJECT___MATCHED_NAME_PATH, + ctx.g, + ctx.style, + {ctx.path.data() + 1, ctx.path.size() - 1}); + found_expected_child = true; + break; + } else { + // skip FIELD_NAME token + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + + // current child is not expected, skip current child + if (!p.try_skip_children()) { + // JSON validation check + return {false, 0}; + } + } + } + if (!found_expected_child) { + // did not find any expected sub child + ctx.task_is_done = true; + ctx.dirty = false; + } + } + } + // case (START_ARRAY, Wildcard :: Wildcard :: xs) + // case path 5 + else if (json_token::START_ARRAY == ctx.token && + path_match_elements( + ctx.path, path_instruction_type::WILDCARD, path_instruction_type::WILDCARD)) { + // special handling for the non-structure preserving double wildcard + // behavior in Hive + if (ctx.is_first_enter) { + ctx.is_first_enter = false; + ctx.g.write_start_array(out_buf); + } + + if (p.next_token() != json_token::END_ARRAY) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + push_context(evaluation_case_path::START_ARRAY___MATCHED_DOUBLE_WILDCARD, + ctx.g, + write_style::FLATTEN, + {ctx.path.data() + 2, ctx.path.size() - 2}); + } else { + ctx.g.write_end_array(out_buf); + ctx.task_is_done = true; + } + } + // case (START_ARRAY, Wildcard :: xs) if style != QuotedStyle + // case path 6 + else if (json_token::START_ARRAY == ctx.token && + path_match_element(ctx.path, path_instruction_type::WILDCARD) && + ctx.style != write_style::QUOTED) { + // retain Flatten, otherwise use Quoted... cannot use Raw within an array + write_style next_style = write_style::RAW; + switch (ctx.style) { + case write_style::RAW: next_style = write_style::QUOTED; break; + case write_style::FLATTEN: next_style = write_style::FLATTEN; break; + case write_style::QUOTED: next_style = write_style::QUOTED; // never happen + } + + // temporarily buffer child matches, the emitted json will need to be + // modified slightly if there is only a single element written + + json_generator child_g; + if (ctx.is_first_enter) { + ctx.is_first_enter = false; + // create a child generator with hide outer array tokens mode. 
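        // Intended behavior for this case (assuming Spark-compatible get_json_object
        // semantics for a path like $[*].a):
        //   [{"a":1}]           ->  1      (single match: no outer array in the output)
        //   [{"a":1},{"a":2}]   ->  [1,2]  (multiple matches: outer array is emitted)
        // so the opening '[' is recorded lazily here and only materialized later if more
        // than one element ends up being written.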
+ child_g = ctx.g.new_child_generator(); + // write first [ without output, without update len, only update internal state + child_g.write_first_start_array_without_output(); + } else { + child_g = ctx.child_g; + } + + if (p.next_token() != json_token::END_ARRAY) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + // track the number of array elements and only emit an outer array if + // we've written more than one element, this matches Hive's behavior + push_context(evaluation_case_path::START_ARRAY___MATCHED_WILDCARD___STYLE_NOT_QUOTED, + child_g, + next_style, + {ctx.path.data() + 1, ctx.path.size() - 1}); + } else { + char* child_g_start = out_buf + child_g.get_offset(); + int child_g_len = child_g.get_output_len(); + if (ctx.dirty > 1) { + // add outer array tokens + ctx.g.write_child_raw_value( + child_g_start, child_g_len, /* write_outer_array_tokens */ true); + } else if (ctx.dirty == 1) { + // remove outer array tokens + ctx.g.write_child_raw_value( + child_g_start, child_g_len, /* write_outer_array_tokens */ false); + } // else do not write anything + + // Done anyway, since we already reached the end array. + ctx.task_is_done = true; + } + } + // case (START_ARRAY, Wildcard :: xs) + // case path 7 + else if (json_token::START_ARRAY == ctx.token && + path_match_element(ctx.path, path_instruction_type::WILDCARD)) { + if (ctx.is_first_enter) { + ctx.is_first_enter = false; + ctx.g.write_start_array(out_buf); + } + if (p.next_token() != json_token::END_ARRAY) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + + // wildcards can have multiple matches, continually update the dirty + // count + push_context(evaluation_case_path::START_ARRAY___MATCHED_WILDCARD, + ctx.g, + write_style::QUOTED, + {ctx.path.data() + 1, ctx.path.size() - 1}); + } else { + ctx.g.write_end_array(out_buf); + ctx.task_is_done = true; + } + } + /* case (START_ARRAY, Index(idx) :: (xs@Wildcard :: _)) */ + // case path 8 + else if (json_token::START_ARRAY == ctx.token && + thrust::get<0>(path_match_index_wildcard(ctx.path))) { + int idx = thrust::get<1>(path_match_index_wildcard(ctx.path)); + + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + ctx.is_first_enter = false; + + int i = idx; + while (i > 0) { + if (p.get_current_token() == json_token::END_ARRAY) { + // terminate, nothing has been written + return {false, 0}; + } + + if (!p.try_skip_children()) { return {false, 0}; } + + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + + --i; + } + + // i == 0 + push_context(evaluation_case_path::START_ARRAY___MATCHED_INDEX_AND_WILDCARD, + ctx.g, + write_style::QUOTED, + {ctx.path.data() + 1, ctx.path.size() - 1}); + } + // case (START_ARRAY, Index(idx) :: xs) + // case path 9 + else if (json_token::START_ARRAY == ctx.token && thrust::get<0>(path_match_index(ctx.path))) { + int idx = thrust::get<1>(path_match_index(ctx.path)); + + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + + int i = idx; + while (i > 0) { + if (p.get_current_token() == json_token::END_ARRAY) { + // terminate, nothing has been written + return {false, 0}; + } + + if (!p.try_skip_children()) { return {false, 0}; } + + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + + --i; + 
} + + // i == 0 + push_context(evaluation_case_path::START_ARRAY___MATCHED_INDEX, + ctx.g, + ctx.style, + {ctx.path.data() + 1, ctx.path.size() - 1}); + } + // case _ => + // case path 12 + else { + // printf("get obj line %d\n", __LINE__); + + if (!p.try_skip_children()) { return {false, 0}; } + // default case path, return false for this task + ctx.dirty = 0; + ctx.task_is_done = true; + } + } // if (!ctx.task_is_done) + else { // current context is done. + // pop current top context + stack_size--; + + // has no parent task, stack is empty, will exit + if (stack_size == 0) { break; } + + // peek parent context task + // update parent task info according to current task result + auto& p_ctx = stack[stack_size - 1]; + + switch (ctx.case_path) { + // path 2: case (START_ARRAY, Nil) if style == FlattenStyle + // path 5: case (START_ARRAY, Wildcard :: Wildcard :: xs) + // path 7: case (START_ARRAY, Wildcard :: xs) + case evaluation_case_path::START_ARRAY___EMPTY_PATH___FLATTEN_STYLE: + case evaluation_case_path::START_ARRAY___MATCHED_DOUBLE_WILDCARD: + case evaluation_case_path::START_ARRAY___MATCHED_WILDCARD: { + // collect result from child task + p_ctx.dirty += ctx.dirty; + // copy generator states to parent task; + p_ctx.g = ctx.g; + + break; + } + + // case (START_OBJECT, Named :: xs) + // case path 4 + case evaluation_case_path::START_OBJECT___MATCHED_NAME_PATH: { + p_ctx.dirty = ctx.dirty; + // copy generator states to parent task; + p_ctx.g = ctx.g; + + break; + } + + // case (START_ARRAY, Wildcard :: xs) if style != QuotedStyle + // case path 6 + case evaluation_case_path::START_ARRAY___MATCHED_WILDCARD___STYLE_NOT_QUOTED: { + // collect result from child task + p_ctx.dirty += ctx.dirty; + // update child generator for parent task + p_ctx.child_g = ctx.g; + + break; + } + + /* case (START_ARRAY, Index(idx) :: (xs@Wildcard :: _)) */ + // case path 8 + // case (START_ARRAY, Index(idx) :: xs) + // case path 9 + case evaluation_case_path::START_ARRAY___MATCHED_INDEX_AND_WILDCARD: + case evaluation_case_path::START_ARRAY___MATCHED_INDEX: { + // collect result from child task + p_ctx.dirty += ctx.dirty; + + // post logic: + while (p.next_token() != json_token::END_ARRAY) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + // advance the token stream to the end of the array + if (!p.try_skip_children()) { return {false, 0}; } + } + // task is done + p_ctx.task_is_done = true; + // copy generator states to parent task; + p_ctx.g = ctx.g; + + break; + } + + default:; // Never happens! + } // end switch (ctx.case_path) + } // ctx.task_is_done + } // while (stack_size > 0) + + auto const success = stack[0].dirty > 0; + + // generator may contain trash output, e.g.: generator writes some output, + // then JSON format is invalid, the previous output becomes trash. + // We need to return output size as zero. + return {success, success ? stack[0].g.get_output_len() : 0}; +} + +/** + * @brief Struct storing data such as path instructions, output buffer etc, corresponding to a + * single JSON path. + */ +struct json_path_processing_data { + cudf::device_span path_commands; + cudf::detail::input_offsetalator offsets; + thrust::pair* out_stringviews; + char* out_buf; + int8_t* has_out_of_bound; + bool keep_quotes; +}; + +/** + * @brief Kernel for running the JSONPath query, in which one input row is processed by entire + * warp (or multiple warps) of threads. 
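 *
 * For example, with 3 JSON paths and a warp size of 32, `num_threads_per_row` works out to
 * 32: each input row is handled by a single warp, and the 29 surplus threads in that warp
 * have no path assigned to them.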
+ * + * The number of warps processing each row is computed as `ceil(num_paths / warp_size)`. + * + * We explicitly set a value for `min_block_per_sm` parameter in the launch bounds to avoid + * spilling from the kernel itself. By default NVCC uses a heuristic to find a balance between + * the maximum number of registers used by a kernel and the parallelism of the kernel. + * If lots of registers are used the parallelism may suffer. But in our case NVCC gets this wrong + * and we want to avoid spilling all the time or else the performance is really bad. This + * essentially tells NVCC to prefer using lots of registers over spilling. + * + * TODO update + * @param input The input JSON strings stored in a strings column + * @param path_data Array containing all path data + * @param num_threads_per_row Number of threads processing each input row + * @param max_path_depth_exceeded A marker to record if the maximum path depth has been reached + * during parsing the input string */ template __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL - void from_json_kernel(cudf::column_device_view input, std::size_t num_threads_per_row) + void get_json_object_kernel(cudf::column_device_view input, + cudf::device_span path_data, + bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, + std::size_t num_threads_per_row, + int8_t* max_path_depth_exceeded) { auto const tidx = cudf::detail::grid_1d::global_thread_id(); auto const row_idx = tidx / num_threads_per_row; @@ -73,8 +870,10 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL auto const str = input.element(row_idx); if (str.size_bytes() > 0) { json_parser p{char_range{str}}; + p.set_allow_leading_zero_numbers(allow_leading_zero_numbers); + p.set_allow_non_numeric_numbers(allow_non_numeric_numbers); thrust::tie(is_valid, out_size) = - evaluate_path(p, path.path_commands, dst, max_path_depth_exceeded); + evaluate_path(p, path.path_commands, path.keep_quotes, dst, max_path_depth_exceeded); // We did not terminate the `evaluate_path` function early to reduce complexity of the code. // Instead, if max depth was encountered, we've just continued the evaluation until here @@ -102,6 +901,8 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL struct kernel_launcher { static void exec(cudf::column_device_view const& input, cudf::device_span path_data, + bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, int8_t* max_path_depth_exceeded, rmm::cuda_stream_view stream) { @@ -118,11 +919,88 @@ struct kernel_launcher { auto const num_blocks = cudf::util::div_rounding_up_safe(num_threads_per_row * input.size(), static_cast(block_size)); get_json_object_kernel - <<>>( - input, path_data, num_threads_per_row, max_path_depth_exceeded); + <<>>(input, + path_data, + allow_leading_zero_numbers, + allow_non_numeric_numbers, + num_threads_per_row, + max_path_depth_exceeded); } }; +/** + * @brief Construct the device vector containing necessary data for the input JSON paths. + * + * All JSON paths are processed at once, without stream synchronization, to minimize overhead. + * + * A tuple of values are returned, however, only the first element is needed for further kernel + * launch. The remaining are unused but need to be kept alive as they contains data for later + * asynchronous host-device memcpy. 
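 *
 * All NAMED instruction strings are concatenated into a single host std::string and copied
 * to the device once (as a cudf::string_scalar); each device-side path_instruction keeps a
 * cudf::string_view into that buffer, which is why the scalar is part of the returned tuple
 * and must be kept alive.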
+ */ +std::tuple>, + std::unique_ptr>>, + cudf::string_scalar, + std::string> +construct_path_commands( + std::vector const>> const& + json_paths, + rmm::cuda_stream_view stream) +{ + // Concatenate all names from path instructions. + auto h_inst_names = [&] { + std::size_t length{0}; + for (auto const& instructions : json_paths) { + for (auto const& [type, name, index] : instructions) { + if (type == path_instruction_type::NAMED) { length += name.length(); } + } + } + std::string all_names; + all_names.reserve(length); + for (auto const& instructions : json_paths) { + for (auto const& [type, name, index] : instructions) { + if (type == path_instruction_type::NAMED) { all_names += name; } + } + } + return all_names; + }(); + auto d_inst_names = cudf::string_scalar(h_inst_names, true, stream); + + std::size_t name_pos{0}; + auto h_path_commands = std::make_unique>>(); + h_path_commands->reserve(json_paths.size()); + + for (auto const& instructions : json_paths) { + h_path_commands->emplace_back(); + auto& path_commands = h_path_commands->back(); + path_commands.reserve(instructions.size()); + + for (auto const& [type, name, index] : instructions) { + path_commands.emplace_back(path_instruction{type}); + + if (type == path_instruction_type::INDEX) { + path_commands.back().index = index; + } else if (type == path_instruction_type::NAMED) { + path_commands.back().name = cudf::string_view(d_inst_names.data() + name_pos, name.size()); + name_pos += name.size(); + } else if (type != path_instruction_type::WILDCARD) { + CUDF_FAIL("Invalid path instruction type"); + } + } + } + + auto d_path_commands = std::vector>{}; + d_path_commands.reserve(h_path_commands->size()); + for (auto const& path_commands : *h_path_commands) { + d_path_commands.emplace_back(cudf::detail::make_device_uvector_async( + path_commands, stream, rmm::mr::get_current_device_resource())); + } + + return {std::move(d_path_commands), + std::move(h_path_commands), + std::move(d_inst_names), + std::move(h_inst_names)}; +} + int64_t calc_scratch_size(cudf::strings_column_view const& input, cudf::detail::input_offsetalator const& in_offsets, rmm::cuda_stream_view stream) @@ -176,7 +1054,11 @@ std::vector> get_json_object_batch( cudf::detail::input_offsetalator const& in_offsets, std::vector const>> const& json_paths, + std::vector const& output_ids, + std::unordered_set const& keep_quotes, int64_t scratch_size, + bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -208,18 +1090,30 @@ std::vector> get_json_object_batch( out_stringviews.emplace_back(rmm::device_uvector>{ static_cast(input.size()), stream}); - h_path_data.emplace_back(json_path_processing_data{d_json_paths[idx], - in_offsets, - out_stringviews.back().data(), - scratch_buffers.back().data(), - d_error_check.data() + idx}); + printf("idx: %d, output_ids[idx]: %d\n", (int)idx, (int)output_ids[idx]); + printf("keep_quotes.find(output_ids[idx]) != keep_quotes.end(): %d\n", + (int)(keep_quotes.find(output_ids[idx]) != keep_quotes.end())); + fflush(stdout); + + h_path_data.emplace_back( + json_path_processing_data{d_json_paths[idx], + in_offsets, + out_stringviews.back().data(), + scratch_buffers.back().data(), + d_error_check.data() + idx, + keep_quotes.find(output_ids[idx]) != keep_quotes.end()}); } auto d_path_data = cudf::detail::make_device_uvector_async( h_path_data, stream, rmm::mr::get_current_device_resource()); thrust::uninitialized_fill( rmm::exec_policy(stream), 
d_error_check.begin(), d_error_check.end(), 0); - kernel_launcher::exec(input, d_path_data, d_max_path_depth_exceeded, stream); + kernel_launcher::exec(input, + d_path_data, + allow_leading_zero_numbers, + allow_non_numeric_numbers, + d_max_path_depth_exceeded, + stream); auto h_error_check = cudf::detail::make_host_vector_sync(d_error_check, stream); auto has_no_oob = check_error(h_error_check); @@ -274,7 +1168,8 @@ std::vector> get_json_object_batch( out_offsets_and_sizes.back().first->view()), nullptr /*out_stringviews*/, out_char_buffers.back().data(), - d_error_check.data() + idx}); + d_error_check.data() + idx, + keep_quotes.find(output_ids[idx]) != keep_quotes.end()}); } else { output.emplace_back(cudf::make_strings_column(out_sview, stream, mr)); } @@ -288,7 +1183,12 @@ std::vector> get_json_object_batch( h_path_data, stream, rmm::mr::get_current_device_resource()); thrust::uninitialized_fill( rmm::exec_policy(stream), d_error_check.begin(), d_error_check.end(), 0); - kernel_launcher::exec(input, d_path_data, d_max_path_depth_exceeded, stream); + kernel_launcher::exec(input, + d_path_data, + allow_leading_zero_numbers, + allow_non_numeric_numbers, + d_max_path_depth_exceeded, + stream); h_error_check = cudf::detail::make_host_vector_sync(d_error_check, stream); has_no_oob = check_error(h_error_check); @@ -308,9 +1208,18 @@ std::vector> get_json_object_batch( return output; } -std::unique_ptr from_json_to_struct_bk(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +// TODO: update docs for keep_quotes +std::vector> get_json_object( + cudf::strings_column_view const& input, + std::vector>> const& + json_paths, + std::unordered_set const& keep_quotes, + int64_t memory_budget_bytes, + int32_t parallel_override, + bool allow_leading_zero_numbers, + bool allow_non_numeric_numbers, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { auto const num_outputs = json_paths.size(); @@ -367,7 +1276,16 @@ std::unique_ptr from_json_to_struct_bk(cudf::strings_column_view c budget += scratch_size; } } - auto tmp = get_json_object_batch(*d_input_ptr, in_offsets, batch, scratch_size, stream, mr); + auto tmp = get_json_object_batch(*d_input_ptr, + in_offsets, + batch, + output_ids, + keep_quotes, + scratch_size, + allow_leading_zero_numbers, + allow_non_numeric_numbers, + stream, + mr); for (std::size_t i = 0; i < tmp.size(); i++) { std::size_t out_i = output_ids[i]; output[out_i] = std::move(tmp[i]); @@ -376,9 +1294,7 @@ std::unique_ptr from_json_to_struct_bk(cudf::strings_column_view c } return output; } - } // namespace test -#endif void travel_path( std::vector>>& paths, @@ -474,19 +1390,6 @@ std::vector> assemble_output( return output; } -// Extern -std::vector> get_json_object( - cudf::strings_column_view const& input, - std::vector>> const& - json_paths, - std::unordered_set const& keep_quotes, - int64_t memory_budget_bytes, - int32_t parallel_override, - bool allow_leading_zero_numbers, - bool allow_non_numeric_numbers, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - std::vector> from_json_to_structs( cudf::strings_column_view const& input, std::vector> const& schema, @@ -533,15 +1436,15 @@ std::vector> from_json_to_structs( #endif - auto tmp = get_json_object(input, - json_paths, - keep_quotes, - -1L, - -1, - allow_leading_zero_numbers, - allow_non_numeric_numbers, - stream, - mr); + auto tmp = test::get_json_object(input, + json_paths, + keep_quotes, + -1L, + -1, + allow_leading_zero_numbers, 
+ allow_non_numeric_numbers, + stream, + mr); printf("line %d\n", __LINE__); fflush(stdout); From 357c67194f6fe2150e6d095aced90434aeefcc08 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 17 Sep 2024 13:43:55 -0700 Subject: [PATCH 23/58] Add `type_ids` data for paths Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 38 +++++++++++++++++------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index b32435ca6d..b646eb94c4 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -401,6 +401,7 @@ struct context { __device__ thrust::pair evaluate_path( json_parser& p, cudf::device_span path_commands, + cudf::type_id path_type_id, bool keep_quotes, char* out_buf, int8_t* max_path_depth_exceeded) @@ -824,6 +825,7 @@ struct json_path_processing_data { char* out_buf; int8_t* has_out_of_bound; bool keep_quotes; + cudf::type_id type_id; }; /** @@ -872,8 +874,8 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL json_parser p{char_range{str}}; p.set_allow_leading_zero_numbers(allow_leading_zero_numbers); p.set_allow_non_numeric_numbers(allow_non_numeric_numbers); - thrust::tie(is_valid, out_size) = - evaluate_path(p, path.path_commands, path.keep_quotes, dst, max_path_depth_exceeded); + thrust::tie(is_valid, out_size) = evaluate_path( + p, path.path_commands, path.type_id, path.keep_quotes, dst, max_path_depth_exceeded); // We did not terminate the `evaluate_path` function early to reduce complexity of the code. // Instead, if max depth was encountered, we've just continued the evaluation until here @@ -1054,6 +1056,7 @@ std::vector> get_json_object_batch( cudf::detail::input_offsetalator const& in_offsets, std::vector const>> const& json_paths, + std::vector const& type_ids, std::vector const& output_ids, std::unordered_set const& keep_quotes, int64_t scratch_size, @@ -1101,7 +1104,8 @@ std::vector> get_json_object_batch( out_stringviews.back().data(), scratch_buffers.back().data(), d_error_check.data() + idx, - keep_quotes.find(output_ids[idx]) != keep_quotes.end()}); + keep_quotes.find(output_ids[idx]) != keep_quotes.end(), + type_ids[idx]}); } auto d_path_data = cudf::detail::make_device_uvector_async( h_path_data, stream, rmm::mr::get_current_device_resource()); @@ -1169,7 +1173,8 @@ std::vector> get_json_object_batch( nullptr /*out_stringviews*/, out_char_buffers.back().data(), d_error_check.data() + idx, - keep_quotes.find(output_ids[idx]) != keep_quotes.end()}); + keep_quotes.find(output_ids[idx]) != keep_quotes.end(), + type_ids[idx]}); } else { output.emplace_back(cudf::make_strings_column(out_sview, stream, mr)); } @@ -1213,6 +1218,7 @@ std::vector> get_json_object( cudf::strings_column_view const& input, std::vector>> const& json_paths, + std::vector const& type_ids, std::unordered_set const& keep_quotes, int64_t memory_budget_bytes, int32_t parallel_override, @@ -1249,19 +1255,23 @@ std::vector> get_json_object( auto const d_input_ptr = cudf::column_device_view::create(input.parent(), stream); std::vector> output(num_outputs); + // TODO: reserve std::vector const>> batch; + std::vector batch_type_ids; std::vector output_ids; std::size_t starting_path = 0; while (starting_path < num_outputs) { std::size_t at = starting_path; batch.resize(0); + batch_type_ids.resize(0); output_ids.resize(0); if (parallel_override > 0) { int count = 0; while (at < num_outputs && count < parallel_override) { auto 
output_location = sorted_indices[at]; batch.emplace_back(json_paths[output_location]); + batch_type_ids.push_back(type_ids[output_location]); output_ids.push_back(output_location); at++; count++; @@ -1271,6 +1281,7 @@ std::vector> get_json_object( while (at < num_outputs && budget < memory_budget_bytes) { auto output_location = sorted_indices[at]; batch.emplace_back(json_paths[output_location]); + batch_type_ids.push_back(type_ids[output_location]); output_ids.push_back(output_location); at++; budget += scratch_size; @@ -1279,6 +1290,7 @@ std::vector> get_json_object( auto tmp = get_json_object_batch(*d_input_ptr, in_offsets, batch, + batch_type_ids, output_ids, keep_quotes, scratch_size, @@ -1299,6 +1311,7 @@ std::vector> get_json_object( void travel_path( std::vector>>& paths, std::vector>& current_path, + std::vector& type_ids, std::unordered_set& keep_quotes, std::string const& name, cudf::io::schema_element const& column_schema) @@ -1311,34 +1324,38 @@ void travel_path( } printf("column_schema type: %d\n", static_cast(column_schema.type.id())); paths.push_back(current_path); // this will copy + type_ids.push_back(column_schema.type.id()); } else { if (column_schema.type.id() != cudf::type_id::STRUCT) { CUDF_FAIL("Unsupported column type in schema"); } paths.push_back(current_path); // this will copy + type_ids.push_back(column_schema.type.id()); auto const last_path_size = paths.size(); for (auto const& [child_name, child_schema] : column_schema.child_types) { - travel_path(paths, current_path, keep_quotes, child_name, child_schema); + travel_path(paths, current_path, type_ids, keep_quotes, child_name, child_schema); } } current_path.pop_back(); } -std::pair>>, - std::unordered_set> +std::tuple>>, + std::vector, + std::unordered_set> flatten_schema_to_paths(std::vector> const& schema) { std::vector>> paths; + std::vector type_ids; std::unordered_set keep_quotes; std::vector> current_path; std::for_each(schema.begin(), schema.end(), [&](auto const& kv) { - travel_path(paths, current_path, keep_quotes, kv.first, kv.second); + travel_path(paths, current_path, type_ids, keep_quotes, kv.first, kv.second); }); - return {std::move(paths), std::move(keep_quotes)}; + return {std::move(paths), std::move(type_ids), std::move(keep_quotes)}; } void assemble_column(std::size_t& column_order, @@ -1400,7 +1417,7 @@ std::vector> from_json_to_structs( { printf("line %d\n", __LINE__); fflush(stdout); - auto const [json_paths, keep_quotes] = flatten_schema_to_paths(schema); + auto const [json_paths, type_ids, keep_quotes] = flatten_schema_to_paths(schema); printf("line %d\n", __LINE__); fflush(stdout); @@ -1438,6 +1455,7 @@ std::vector> from_json_to_structs( auto tmp = test::get_json_object(input, json_paths, + type_ids, keep_quotes, -1L, -1, From 28968755800d4f7bd6fd11620dca9a4697a6ab48 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 17 Sep 2024 14:41:53 -0700 Subject: [PATCH 24/58] Fix struct null mask Signed-off-by: Nghia Truong --- src/main/cpp/src/from_json_to_structs.cu | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index b646eb94c4..7d2c03ba76 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -471,11 +471,15 @@ __device__ thrust::pair evaluate_path( // case (_, Nil) // case path 3 else if (path_is_empty(ctx.path.size())) { - // printf("get obj line %d\n", __LINE__); + // If this is a struct column, we only need to check to 
see if there exists a struct. + if (path_type_id == cudf::type_id::STRUCT) { + if (p.get_current_token() != json_token::START_OBJECT) { return {false, 0}; } + if (!p.try_skip_children()) { return {false, 0}; } - // general case: just copy the child tree verbatim - if (!(ctx.g.copy_current_structure(p, out_buf))) { - // JSON validation check + // Just write anything into the output, to mark the output as a non-null row. + // Such output will be discarded anyway. + ctx.g.write_start_array(out_buf); + } else if (!(ctx.g.copy_current_structure(p, out_buf))) { return {false, 0}; } ctx.dirty = 1; From ba3649db6fb0a5f75c51639fb56717609463ffc0 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 17 Sep 2024 21:42:44 -0700 Subject: [PATCH 25/58] Add Java test Signed-off-by: Nghia Truong --- .../nvidia/spark/rapids/jni/GetJsonObjectTest.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java index 6a4acb9cb9..b33b0be8ce 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java @@ -776,6 +776,18 @@ void getJsonObjectTest_ExceedMaxNestingDepthInJSONParser() { } } + @Test + void getJsonObjectTest_NamesWithEscapedCharacters() { + JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { + namedPath("data") + }; + try (ColumnVector input = ColumnVector.fromStrings( + "{'data': 'TEST1'}", "{'\\u0064\\u0061t\\u0061': 'TEST2'}"); + ColumnVector expected = ColumnVector.fromStrings("TEST1", "TEST2"); + ColumnVector output = JSONUtils.getJsonObject(input, query)) { + assertColumnsAreEqual(expected, output); + } + } private JSONUtils.PathInstructionJni wildcardPath() { return new JSONUtils.PathInstructionJni(JSONUtils.PathInstructionType.WILDCARD, "", -1); From 82ad2d51659392a3e6dfef2792a29c22e4768a33 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 18 Sep 2024 13:10:43 -0700 Subject: [PATCH 26/58] Fix character matching Signed-off-by: Nghia Truong --- src/main/cpp/src/json_parser.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/cpp/src/json_parser.cuh b/src/main/cpp/src/json_parser.cuh index 12863c3333..4e712937ed 100644 --- a/src/main/cpp/src/json_parser.cuh +++ b/src/main/cpp/src/json_parser.cuh @@ -978,7 +978,7 @@ class json_parser { if (!to_match.is_null()) { for (cudf::size_type i = 0; i < bytes; i++) { - if (!(to_match.eof() && to_match.current_char() == buff[i])) { return false; } + if (to_match.eof() || to_match.current_char() != buff[i]) { return false; } to_match.next(); } } From bff6d12f51e16cb53a172a718c99e0b0baff2d3f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 18 Sep 2024 22:51:33 -0700 Subject: [PATCH 27/58] Fix column order in schema Signed-off-by: Nghia Truong --- src/main/cpp/src/JSONUtilsJni.cpp | 28 ++++++++++++------------ src/main/cpp/src/from_json.hpp | 8 ++++++- src/main/cpp/src/from_json_to_structs.cu | 18 ++++++++------- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index d9e7bf55e4..e47bb4ac1f 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -26,12 +26,12 @@ using path_instruction_type = spark_rapids_jni::path_instruction_type; -namespace cudf::jni { -cudf::io::schema_element read_schema_element(int& index, - cudf::jni::native_jstringArray const& names, - 
cudf::jni::native_jintArray const& children, - cudf::jni::native_jintArray const& types, - cudf::jni::native_jintArray const& scales) +namespace spark_rapids_jni { +json_schema_element read_schema_element(int& index, + cudf::jni::native_jstringArray const& names, + cudf::jni::native_jintArray const& children, + cudf::jni::native_jintArray const& types, + cudf::jni::native_jintArray const& scales) { printf("JNI line %d\n", __LINE__); fflush(stdout); @@ -41,7 +41,7 @@ cudf::io::schema_element read_schema_element(int& index, printf("JNI line %d\n", __LINE__); fflush(stdout); - std::map child_elems; + std::vector> child_elems; int num_children = children[index]; // go to the next entry, so recursion can parse it. index++; @@ -50,10 +50,9 @@ cudf::io::schema_element read_schema_element(int& index, fflush(stdout); auto const name = std::string{names.get(index).get()}; - child_elems.emplace(name, - cudf::jni::read_schema_element(index, names, children, types, scales)); + child_elems.emplace_back(name, read_schema_element(index, names, children, types, scales)); } - return cudf::io::schema_element{d_type, std::move(child_elems)}; + return json_schema_element{d_type, std::move(child_elems)}; } else { printf("JNI line %d\n", __LINE__); @@ -68,10 +67,10 @@ cudf::io::schema_element read_schema_element(int& index, index++; printf("JNI line %d\n", __LINE__); fflush(stdout); - return cudf::io::schema_element{d_type, {}}; + return json_schema_element{d_type, {}}; } } -} // namespace cudf::jni +} // namespace spark_rapids_jni extern "C" { @@ -241,7 +240,7 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, printf("JNI line %d, size = %d\n", __LINE__, (int)n_types.size()); fflush(stdout); - std::vector> schema; + std::vector> schema; int idx = 0; while (idx < n_types.size()) { printf("JNI line %d\n", __LINE__); @@ -249,7 +248,8 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, auto const name = std::string{n_col_names.get(idx).get()}; schema.emplace_back( - name, cudf::jni::read_schema_element(idx, n_col_names, n_children, n_types, n_scales)); + name, + spark_rapids_jni::read_schema_element(idx, n_col_names, n_children, n_types, n_scales)); // auto const name = n_col_names.get(at).get(); printf("JNI line %d\n", __LINE__); diff --git a/src/main/cpp/src/from_json.hpp b/src/main/cpp/src/from_json.hpp index decd3b8640..7875296c4c 100644 --- a/src/main/cpp/src/from_json.hpp +++ b/src/main/cpp/src/from_json.hpp @@ -29,6 +29,12 @@ namespace spark_rapids_jni { +struct json_schema_element { + cudf::data_type type; + + std::vector> child_types; +}; + std::unique_ptr from_json_to_raw_map( cudf::strings_column_view const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -36,7 +42,7 @@ std::unique_ptr from_json_to_raw_map( std::vector> from_json_to_structs( cudf::strings_column_view const& input, - std::vector> const& schema, + std::vector> const& schema, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, rmm::cuda_stream_view stream = cudf::get_default_stream(), diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 7d2c03ba76..b087fd6302 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "from_json.hpp" #include "get_json_object.hpp" #include "json_parser.cuh" @@ -1318,7 +1319,7 @@ void travel_path( std::vector& type_ids, std::unordered_set& keep_quotes, std::string const& name, - cudf::io::schema_element const& column_schema) + json_schema_element const& column_schema) { current_path.emplace_back(path_instruction_type::NAMED, name, -1); if (column_schema.child_types.size() == 0) { // leaf of the schema @@ -1348,7 +1349,7 @@ void travel_path( std::tuple>>, std::vector, std::unordered_set> -flatten_schema_to_paths(std::vector> const& schema) +flatten_schema_to_paths(std::vector> const& schema) { std::vector>> paths; std::vector type_ids; @@ -1366,7 +1367,7 @@ void assemble_column(std::size_t& column_order, std::vector>& output, std::vector>& read_columns, std::string const& name, - cudf::io::schema_element const& column_schema, + json_schema_element const& column_schema, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1395,7 +1396,7 @@ void assemble_column(std::size_t& column_order, } std::vector> assemble_output( - std::vector> const& schema, + std::vector> const& schema, std::vector>& read_columns, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -1413,7 +1414,7 @@ std::vector> assemble_output( std::vector> from_json_to_structs( cudf::strings_column_view const& input, - std::vector> const& schema, + std::vector> const& schema, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, rmm::cuda_stream_view stream, @@ -1427,8 +1428,9 @@ std::vector> from_json_to_structs( fflush(stdout); #if 1 + int count{0}; for (auto const& path : json_paths) { - printf("\n\npath: \n"); + printf("\n\npath (%d/%d): \n", count++, (int)json_paths.size()); for (auto node : path) { printf(".%s", std::get<1>(node).c_str()); } @@ -1480,7 +1482,7 @@ std::vector> from_json_to_structs( cudaMemcpyAsync(h_v.data(), ptr, sizeof(char) * size, cudaMemcpyDefault, stream.value())); stream.synchronize(); - printf("out %d (size = %d): ", (int)i, (int)size); + printf("out %d / %d (size = %d): ", (int)i, (int)tmp.size(), (int)size); for (auto c : h_v) { printf("%c", c); } @@ -1495,7 +1497,7 @@ std::vector> from_json_to_structs( std::vector> from_json_to_structs( cudf::strings_column_view const& input, - std::vector> const& schema, + std::vector> const& schema, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, rmm::cuda_stream_view stream, From 1abd8f87fba4d0a3d1ff20144a08e0b798b9dc73 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 19 Sep 2024 15:52:36 -0700 Subject: [PATCH 28/58] Allow STRUCT type in JSON path --- src/main/cpp/src/from_json_to_structs.cu | 84 +++++++++++++++++------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index b087fd6302..f39819728a 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -473,13 +473,30 @@ __device__ thrust::pair evaluate_path( // case path 3 else if (path_is_empty(ctx.path.size())) { // If this is a struct column, we only need to check to see if there exists a struct. 
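        // (For a STRUCT schema node the children are extracted by their own flattened
        //  paths, so this match only needs to establish whether the row is null; the
        //  placeholder written for it is discarded when the final columns are assembled.)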
- if (path_type_id == cudf::type_id::STRUCT) { - if (p.get_current_token() != json_token::START_OBJECT) { return {false, 0}; } - if (!p.try_skip_children()) { return {false, 0}; } + if (path_type_id == cudf::type_id::STRUCT || path_type_id == cudf::type_id::LIST) { + if (path_type_id == cudf::type_id::STRUCT && + p.get_current_token() != json_token::START_OBJECT) { + return {false, 0}; + } + if (path_type_id == cudf::type_id::LIST && + p.get_current_token() != json_token::START_ARRAY) { + return {false, 0}; + } - // Just write anything into the output, to mark the output as a non-null row. - // Such output will be discarded anyway. - ctx.g.write_start_array(out_buf); + // TODO: for now, just copy the entire lists for output + // Need to parse the child elements instead. + if (path_type_id == cudf::type_id::STRUCT) { + if (!p.try_skip_children()) { return {false, 0}; } + } else if (!(ctx.g.copy_current_structure(p, out_buf))) { + return {false, 0}; + } + + // TODO: this should be for both strucs and list + if (path_type_id == cudf::type_id::STRUCT) { + // Just write anything into the output, to mark the output as a non-null row. + // Such output will be discarded anyway. + ctx.g.write_start_array(out_buf); + } } else if (!(ctx.g.copy_current_structure(p, out_buf))) { return {false, 0}; } @@ -1331,16 +1348,20 @@ void travel_path( paths.push_back(current_path); // this will copy type_ids.push_back(column_schema.type.id()); } else { - if (column_schema.type.id() != cudf::type_id::STRUCT) { - CUDF_FAIL("Unsupported column type in schema"); + if (column_schema.type.id() == cudf::type_id::STRUCT) { + printf("column_schema type: STRUCT\n"); } + if (column_schema.type.id() == cudf::type_id::LIST) { printf("column_schema type: LIST\n"); } paths.push_back(current_path); // this will copy type_ids.push_back(column_schema.type.id()); - auto const last_path_size = paths.size(); - for (auto const& [child_name, child_schema] : column_schema.child_types) { - travel_path(paths, current_path, type_ids, keep_quotes, child_name, child_schema); + // TODO: for now, don't have to parse child of lists column. + // Just output the entire lists. + if (column_schema.type.id() == cudf::type_id::STRUCT) { + for (auto const& [child_name, child_schema] : column_schema.child_types) { + travel_path(paths, current_path, type_ids, keep_quotes, child_name, child_schema); + } } } current_path.pop_back(); @@ -1375,23 +1396,36 @@ void assemble_column(std::size_t& column_order, output.emplace_back(std::move(read_columns[column_order])); ++column_order; } else { - if (column_schema.type.id() != cudf::type_id::STRUCT) { - CUDF_FAIL("Unsupported column type in schema"); - } + if (column_schema.type.id() == cudf::type_id::STRUCT) { + // TODO: null mask and null count should be extracted for both list and structs + auto const null_count = read_columns[column_order]->null_count(); + auto const null_mask = std::move(read_columns[column_order]->release().null_mask); + ++column_order; + + std::vector> children; + for (auto const& [child_name, child_schema] : column_schema.child_types) { + assemble_column(column_order, children, read_columns, child_name, child_schema, stream, mr); + } - auto const null_count = read_columns[column_order]->null_count(); - auto const null_mask = std::move(read_columns[column_order]->release().null_mask); - ++column_order; + // TODO: generate null mask from input. 
+ auto const num_rows = children.front()->size(); + output.emplace_back(cudf::make_structs_column( + num_rows, std::move(children), null_count, std::move(*null_mask), stream, mr)); + } else if (column_schema.type.id() == cudf::type_id::LIST) { + // TODO: split LIST into child column + // For now, just output as a strings column. + output.emplace_back(std::move(read_columns[column_order])); + ++column_order; + // std::vector> children; + // for (auto const& [child_name, child_schema] : column_schema.child_types) { + // assemble_column(column_order, children, read_columns, child_name, child_schema, stream, + // mr); + // } + // CUDF_EXPECTS(children.size() == 1, "TODO"); - std::vector> children; - for (auto const& [child_name, child_schema] : column_schema.child_types) { - assemble_column(column_order, children, read_columns, child_name, child_schema, stream, mr); + } else { + CUDF_FAIL("Unsupported type"); } - - // TODO: generate null mask from input. - auto const num_rows = children.front()->size(); - output.emplace_back(cudf::make_structs_column( - num_rows, std::move(children), null_count, std::move(*null_mask), stream, mr)); } } From a18ee48ff85296ebd1324049df75d32060c2584a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 19 Sep 2024 16:37:36 -0700 Subject: [PATCH 29/58] Add test with LIST --- src/main/cpp/tests/from_json.cu | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index 361a0579a9..989c343abc 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -29,18 +29,22 @@ TEST_F(FromJsonTest, Initialization) { // The last row is invalid (has an extra quote). auto const json_string = - cudf::test::strings_column_wrapper{R"({'a': 4478, "b": 'HIMST', "c": 1276})"}; + cudf::test::strings_column_wrapper{"{'a': [{'b': 1, 'c': 2}, {'b': 3, 'c': 4}]}"}; - std::vector> schema{ - {"c", {cudf::data_type{cudf::type_id::INT32}}}, - {"a", {cudf::data_type{cudf::type_id::STRING}}}, - }; + spark_rapids_jni::json_schema_element a{cudf::data_type{cudf::type_id::LIST}, {}}; + a.child_types.emplace_back( + "b", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::INT32}, {}}); + + std::vector> schema; + schema.emplace_back("a", std::move(a)); auto const output = spark_rapids_jni::from_json_to_structs( cudf::strings_column_view{json_string}, schema, false, false); + printf("\n\ninput: \n"); cudf::test::print(json_string); + printf("\n\noutput: \n"); for (auto const& col : output) { cudf::test::print(col->view()); } From adc45beb46c5206e7454509fce23ba5c63698031 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 20 Sep 2024 12:51:27 -0700 Subject: [PATCH 30/58] Output LIST as string --- src/main/cpp/src/from_json_to_structs.cu | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index f39819728a..404593fffd 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -1350,18 +1350,24 @@ void travel_path( } else { if (column_schema.type.id() == cudf::type_id::STRUCT) { printf("column_schema type: STRUCT\n"); - } - if (column_schema.type.id() == cudf::type_id::LIST) { printf("column_schema type: LIST\n"); } - - paths.push_back(current_path); // this will copy - type_ids.push_back(column_schema.type.id()); + if (column_schema.type.id() == cudf::type_id::STRUCT) { + for (auto const& [child_name, 
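// NOTE (hedged sketch, not part of the diff): what the assembly above boils down to for a
// one-level struct column "data" with children "b" and "c". The names extracted_b,
// extracted_c and marker_for_data are hypothetical stand-ins for entries of read_columns;
// the validity comes from the marker column that was read for "$.data" itself.
auto const num_rows   = extracted_b->size();
auto const null_count = marker_for_data->null_count();
auto contents         = marker_for_data->release();  // keep only its null mask

std::vector<std::unique_ptr<cudf::column>> children;
children.emplace_back(std::move(extracted_b));  // column extracted for "$.data.b"
children.emplace_back(std::move(extracted_c));  // column extracted for "$.data.c"

auto data_column = cudf::make_structs_column(num_rows,
                                             std::move(children),
                                             null_count,
                                             std::move(*contents.null_mask),
                                             stream,
                                             mr);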
child_schema] : column_schema.child_types) { + travel_path(paths, current_path, type_ids, keep_quotes, child_name, child_schema); + } + } + } else if (column_schema.type.id() == cudf::type_id::LIST) { + printf("column_schema type: LIST\n"); - // TODO: for now, don't have to parse child of lists column. - // Just output the entire lists. - if (column_schema.type.id() == cudf::type_id::STRUCT) { + CUDF_EXPECTS(column_schema.child_types.size() == 1, "TODO"); + current_path.emplace_back(path_instruction_type::WILDCARD, "", -1); for (auto const& [child_name, child_schema] : column_schema.child_types) { travel_path(paths, current_path, type_ids, keep_quotes, child_name, child_schema); } + current_path.pop_back(); + + } else { + // TODO + CUDF_FAIL("Unsupported type"); } } current_path.pop_back(); From 77128ebc10a3cfe1f6268cc02aef42aa455186fe Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 20 Sep 2024 13:18:28 -0700 Subject: [PATCH 31/58] Update tests and fix null mask --- src/main/cpp/src/from_json_to_structs.cu | 40 ++++++++++++++++-------- src/main/cpp/tests/from_json.cu | 35 ++++++++++++++++++++- 2 files changed, 61 insertions(+), 14 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 404593fffd..e376802e20 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -472,6 +472,8 @@ __device__ thrust::pair evaluate_path( // case (_, Nil) // case path 3 else if (path_is_empty(ctx.path.size())) { + printf("path is empty, path type = %d\n", (int)path_type_id); + // If this is a struct column, we only need to check to see if there exists a struct. if (path_type_id == cudf::type_id::STRUCT || path_type_id == cudf::type_id::LIST) { if (path_type_id == cudf::type_id::STRUCT && @@ -483,20 +485,17 @@ __device__ thrust::pair evaluate_path( return {false, 0}; } - // TODO: for now, just copy the entire lists for output - // Need to parse the child elements instead. if (path_type_id == cudf::type_id::STRUCT) { + // Or copy current structure? if (!p.try_skip_children()) { return {false, 0}; } - } else if (!(ctx.g.copy_current_structure(p, out_buf))) { + } else if (!(ctx.g.copy_current_structure(p, nullptr))) { + // not copy only if there is struct? return {false, 0}; } - // TODO: this should be for both strucs and list - if (path_type_id == cudf::type_id::STRUCT) { - // Just write anything into the output, to mark the output as a non-null row. - // Such output will be discarded anyway. - ctx.g.write_start_array(out_buf); - } + // Just write anything into the output, to mark the output as a non-null row. + // Such output will be discarded anyway. + ctx.g.write_start_array(out_buf); } else if (!(ctx.g.copy_current_structure(p, out_buf))) { return {false, 0}; } @@ -507,6 +506,8 @@ __device__ thrust::pair evaluate_path( // case path 4 else if (json_token::START_OBJECT == ctx.token && thrust::get<0>(path_match_named(ctx.path))) { + printf("start object\n"); + if (!ctx.is_first_enter) { // 2st enter // skip the following children after the expect @@ -606,6 +607,8 @@ __device__ thrust::pair evaluate_path( else if (json_token::START_ARRAY == ctx.token && path_match_element(ctx.path, path_instruction_type::WILDCARD) && ctx.style != write_style::QUOTED) { + printf("array * not quote\n"); + // retain Flatten, otherwise use Quoted... 
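// NOTE (illustrative, not from the diff): for a schema  a: LIST<STRUCT<b: INT32, c: INT32>>
// the traversal above flattens into one JSON path per leaf plus a marker path for the nested
// node itself, conceptually $.a, $.a[*].b and $.a[*].c, with type_ids recording the schema
// type at each emitted path. A single leaf path written out explicitly:
std::vector<std::tuple<path_instruction_type, std::string, int32_t>> path_to_b{
  {path_instruction_type::NAMED, "a", -1},
  {path_instruction_type::WILDCARD, "*", -1},
  {path_instruction_type::NAMED, "b", -1},
};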
cannot use Raw within an array write_style next_style = write_style::RAW; switch (ctx.style) { @@ -658,6 +661,8 @@ __device__ thrust::pair evaluate_path( // case path 7 else if (json_token::START_ARRAY == ctx.token && path_match_element(ctx.path, path_instruction_type::WILDCARD)) { + printf("array *\n"); + if (ctx.is_first_enter) { ctx.is_first_enter = false; ctx.g.write_start_array(out_buf); @@ -1348,7 +1353,10 @@ void travel_path( paths.push_back(current_path); // this will copy type_ids.push_back(column_schema.type.id()); } else { + type_ids.push_back(column_schema.type.id()); if (column_schema.type.id() == cudf::type_id::STRUCT) { + current_path.pop_back(); + paths.push_back(current_path); // this will copy printf("column_schema type: STRUCT\n"); if (column_schema.type.id() == cudf::type_id::STRUCT) { for (auto const& [child_name, child_schema] : column_schema.child_types) { @@ -1359,7 +1367,9 @@ void travel_path( printf("column_schema type: LIST\n"); CUDF_EXPECTS(column_schema.child_types.size() == 1, "TODO"); - current_path.emplace_back(path_instruction_type::WILDCARD, "", -1); + paths.push_back(current_path); // this will copy + current_path.emplace_back(path_instruction_type::WILDCARD, "*", -1); + for (auto const& [child_name, child_schema] : column_schema.child_types) { travel_path(paths, current_path, type_ids, keep_quotes, child_name, child_schema); } @@ -1370,7 +1380,7 @@ void travel_path( CUDF_FAIL("Unsupported type"); } } - current_path.pop_back(); + if (column_schema.type.id() != cudf::type_id::STRUCT) { current_path.pop_back(); } } std::tuple>>, @@ -1420,8 +1430,12 @@ void assemble_column(std::size_t& column_order, } else if (column_schema.type.id() == cudf::type_id::LIST) { // TODO: split LIST into child column // For now, just output as a strings column. - output.emplace_back(std::move(read_columns[column_order])); - ++column_order; + ++column_order; // todo: remove this when creating the lists column + for (auto const& [child_name, child_schema] : column_schema.child_types) { + // TODO: just ignore the current lists column + assemble_column(column_order, output, read_columns, child_name, child_schema, stream, mr); + } + // std::vector> children; // for (auto const& [child_name, child_schema] : column_schema.child_types) { // assemble_column(column_order, children, read_columns, child_name, child_schema, stream, diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index 989c343abc..2fcae57cad 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -25,7 +25,7 @@ class FromJsonTest : public cudf::test::BaseFixture {}; -TEST_F(FromJsonTest, Initialization) +TEST_F(FromJsonTest, T1) { // The last row is invalid (has an extra quote). 
auto const json_string = @@ -33,7 +33,40 @@ TEST_F(FromJsonTest, Initialization) spark_rapids_jni::json_schema_element a{cudf::data_type{cudf::type_id::LIST}, {}}; a.child_types.emplace_back( + "struct", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRUCT}, {}}); + a.child_types.front().second.child_types.emplace_back( "b", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::INT32}, {}}); + a.child_types.front().second.child_types.emplace_back( + "c", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::INT32}, {}}); + + std::vector> schema; + schema.emplace_back("a", std::move(a)); + + auto const output = spark_rapids_jni::from_json_to_structs( + cudf::strings_column_view{json_string}, schema, false, false); + + printf("\n\ninput: \n"); + cudf::test::print(json_string); + + printf("\n\noutput: \n"); + for (auto const& col : output) { + cudf::test::print(col->view()); + } +} + +TEST_F(FromJsonTest, T2) +{ + // The last row is invalid (has an extra quote). + auto const json_string = + cudf::test::strings_column_wrapper{"{'a': [{'b': \"1\", 'c': 2}, {'b': \"3\", 'c': 4}]}"}; + + spark_rapids_jni::json_schema_element a{cudf::data_type{cudf::type_id::LIST}, {}}; + a.child_types.emplace_back( + "struct", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRUCT}, {}}); + a.child_types.front().second.child_types.emplace_back( + "b", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::INT32}, {}}); + a.child_types.front().second.child_types.emplace_back( + "c", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::INT32}, {}}); std::vector> schema; schema.emplace_back("a", std::move(a)); From a78bb729b8e210c51a15507b27496748df07fa9f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 24 Sep 2024 10:21:24 -0700 Subject: [PATCH 32/58] Output list without outer brackets --- src/main/cpp/src/from_json_to_structs.cu | 30 ++++++++++++++---------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index e376802e20..2f97945ca4 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -578,6 +578,7 @@ __device__ thrust::pair evaluate_path( } } } +#if 0 // case (START_ARRAY, Wildcard :: Wildcard :: xs) // case path 5 else if (json_token::START_ARRAY == ctx.token && @@ -606,7 +607,7 @@ __device__ thrust::pair evaluate_path( // case path 6 else if (json_token::START_ARRAY == ctx.token && path_match_element(ctx.path, path_instruction_type::WILDCARD) && - ctx.style != write_style::QUOTED) { + ctx.style == write_style::QUOTED) { printf("array * not quote\n"); // retain Flatten, otherwise use Quoted... 
cannot use Raw within an array @@ -646,7 +647,7 @@ __device__ thrust::pair evaluate_path( if (ctx.dirty > 1) { // add outer array tokens ctx.g.write_child_raw_value( - child_g_start, child_g_len, /* write_outer_array_tokens */ true); + child_g_start, child_g_len, /* write_outer_array_tokens */ false); } else if (ctx.dirty == 1) { // remove outer array tokens ctx.g.write_child_raw_value( @@ -657,6 +658,7 @@ __device__ thrust::pair evaluate_path( ctx.task_is_done = true; } } +#endif // case (START_ARRAY, Wildcard :: xs) // case path 7 else if (json_token::START_ARRAY == ctx.token && @@ -665,7 +667,7 @@ __device__ thrust::pair evaluate_path( if (ctx.is_first_enter) { ctx.is_first_enter = false; - ctx.g.write_start_array(out_buf); + ctx.g.write_first_start_array_without_output(); } if (p.next_token() != json_token::END_ARRAY) { // JSON validation check @@ -678,7 +680,7 @@ __device__ thrust::pair evaluate_path( write_style::QUOTED, {ctx.path.data() + 1, ctx.path.size() - 1}); } else { - ctx.g.write_end_array(out_buf); + // ctx.g.write_end_array(out_buf); ctx.task_is_done = true; } } @@ -715,6 +717,7 @@ __device__ thrust::pair evaluate_path( write_style::QUOTED, {ctx.path.data() + 1, ctx.path.size() - 1}); } +#if 0 // case (START_ARRAY, Index(idx) :: xs) // case path 9 else if (json_token::START_ARRAY == ctx.token && thrust::get<0>(path_match_index(ctx.path))) { @@ -746,6 +749,7 @@ __device__ thrust::pair evaluate_path( ctx.style, {ctx.path.data() + 1, ctx.path.size() - 1}); } +#endif // case _ => // case path 12 else { @@ -1333,6 +1337,7 @@ std::vector> get_json_object( } return output; } + } // namespace test void travel_path( @@ -1430,18 +1435,17 @@ void assemble_column(std::size_t& column_order, } else if (column_schema.type.id() == cudf::type_id::LIST) { // TODO: split LIST into child column // For now, just output as a strings column. 
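// NOTE (illustrative, not from the diff): with the outer array tokens suppressed above, the
// string extracted for a wildcard path such as $.a[*].b over
//   {"a": [{"b": 1, "c": 2}, {"b": 3, "c": 4}]}
// is roughly "1,3" rather than "[1,3]". Keeping only the bare, separator-joined elements is
// what lets a later patch in this series rebuild a LIST column by splitting each row on that
// separator.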
- ++column_order; // todo: remove this when creating the lists column + auto const null_count = read_columns[column_order]->null_count(); + auto const null_mask = std::move(read_columns[column_order]->release().null_mask); + ++column_order; + + // std::vector> children; for (auto const& [child_name, child_schema] : column_schema.child_types) { - // TODO: just ignore the current lists column assemble_column(column_order, output, read_columns, child_name, child_schema, stream, mr); } - - // std::vector> children; - // for (auto const& [child_name, child_schema] : column_schema.child_types) { - // assemble_column(column_order, children, read_columns, child_name, child_schema, stream, - // mr); - // } - // CUDF_EXPECTS(children.size() == 1, "TODO"); + // auto const num_rows = children.front()->size(); + // output.emplace_back(cudf::make_lists_column( + // num_rows, std::move(children), null_count, std::move(*null_mask), stream, mr)); } else { CUDF_FAIL("Unsupported type"); From 9f13aede39fd97365d6c234feef8fa5788d55e94 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 24 Sep 2024 14:00:42 -0700 Subject: [PATCH 33/58] Output columns following the input schema --- src/main/cpp/src/from_json_to_structs.cu | 72 +++++++++++++++++++++--- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 2f97945ca4..41b4020e77 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -18,6 +18,8 @@ #include "get_json_object.hpp" #include "json_parser.cuh" +#include + #include #include #include @@ -26,8 +28,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -1405,6 +1409,44 @@ flatten_schema_to_paths(std::vector> return {std::move(paths), std::move(type_ids), std::move(keep_quotes)}; } +std::pair, std::unique_ptr> extract_lists( + std::unique_ptr& input, + json_schema_element const& column_schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + if (column_schema.type.id() == cudf::type_id::STRUCT) { + std::unique_ptr offsets{nullptr}; + std::vector> new_children; + cudf::size_type num_child_rows{-1}; + auto children = std::move(input->release().children); + for (std::size_t child_idx = 0; child_idx < children.size(); ++child_idx) { + auto& child = children[child_idx]; + auto [new_child_offsets, new_child] = + extract_lists(child, column_schema.child_types[child_idx].second, stream, mr); + if (num_child_rows < 0) { num_child_rows = new_child->size(); } + CUDF_EXPECTS(num_child_rows == new_child->size(), "TODO"); + + if (!offsets) { offsets = std::move(new_child_offsets); } + new_children.emplace_back(std::move(new_child)); + } + + // return cudf::make_structs_column( + // num_child_rows, std::move(children), null_count, std::move(*null_mask), stream, + // mr); + // TODO: fix null mask + return {std::move(offsets), + cudf::make_structs_column(num_child_rows, std::move(new_children), 0, {}, stream, mr)}; + } + + auto split_content = + cudf::strings::split_record( + cudf::strings_column_view{input->view()}, cudf::string_scalar{","}, -1, stream, mr) + ->release(); + return {std::move(split_content.children[cudf::lists_column_view::offsets_column_index]), + std::move(split_content.children[cudf::lists_column_view::child_column_index])}; +} + void assemble_column(std::size_t& column_order, std::vector>& output, std::vector>& read_columns, @@ -1418,7 +1460,6 @@ void assemble_column(std::size_t& 
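// NOTE (hedged sketch, not part of the diff): the core of extract_lists() above is
// cudf::strings::split_record, which turns one separator-joined string per row into a
// LIST<STRING> column whose offsets and child can be reused directly. A minimal standalone
// version; joined and num_rows are placeholders, and null handling is omitted:
auto split    = cudf::strings::split_record(cudf::strings_column_view{joined->view()},
                                            cudf::string_scalar{","},  // later: per-batch delimiter
                                            -1,                        // no limit on splits
                                            stream,
                                            mr);
auto contents = split->release();
auto offsets  = std::move(contents.children[cudf::lists_column_view::offsets_column_index]);
auto child    = std::move(contents.children[cudf::lists_column_view::child_column_index]);
auto lists    = cudf::make_lists_column(
  num_rows, std::move(offsets), std::move(child), 0, rmm::device_buffer{}, stream, mr);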
column_order, ++column_order; } else { if (column_schema.type.id() == cudf::type_id::STRUCT) { - // TODO: null mask and null count should be extracted for both list and structs auto const null_count = read_columns[column_order]->null_count(); auto const null_mask = std::move(read_columns[column_order]->release().null_mask); ++column_order; @@ -1435,18 +1476,33 @@ void assemble_column(std::size_t& column_order, } else if (column_schema.type.id() == cudf::type_id::LIST) { // TODO: split LIST into child column // For now, just output as a strings column. - auto const null_count = read_columns[column_order]->null_count(); - auto const null_mask = std::move(read_columns[column_order]->release().null_mask); + // auto const null_count = read_columns[column_order]->null_count(); + // auto const null_mask = std::move(read_columns[column_order]->release().null_mask); + auto const num_rows = read_columns[column_order]->size(); ++column_order; - // std::vector> children; + std::vector> children; for (auto const& [child_name, child_schema] : column_schema.child_types) { - assemble_column(column_order, output, read_columns, child_name, child_schema, stream, mr); + assemble_column(column_order, children, read_columns, child_name, child_schema, stream, mr); } - // auto const num_rows = children.front()->size(); - // output.emplace_back(cudf::make_lists_column( - // num_rows, std::move(children), null_count, std::move(*null_mask), stream, mr)); + printf("line %d\n", __LINE__); + cudf::test::print(children.front()->view()); + + auto [offsets, child] = + extract_lists(children.front(), column_schema.child_types.front().second, stream, mr); + + printf("line %d\n", __LINE__); + cudf::test::print(child->view()); + printf("line %d\n", __LINE__); + cudf::test::print(offsets->view()); + + // TODO: fix null mask + output.emplace_back( + cudf::make_lists_column(num_rows, std::move(offsets), std::move(child), 0, {}, stream, mr)); + + printf("line %d\n", __LINE__); + cudf::test::print(output.back()->view()); } else { CUDF_FAIL("Unsupported type"); } From 76a44da213c5df9192c7ab7610f64fbc2ec41c8e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 24 Sep 2024 21:12:58 -0700 Subject: [PATCH 34/58] Fix struct schema, and add test --- src/main/cpp/src/from_json_to_structs.cu | 64 +++++++++++++++--------- src/main/cpp/tests/from_json.cu | 26 ++++++++++ 2 files changed, 67 insertions(+), 23 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 41b4020e77..ab428656f8 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -476,7 +476,9 @@ __device__ thrust::pair evaluate_path( // case (_, Nil) // case path 3 else if (path_is_empty(ctx.path.size())) { - printf("path is empty, path type = %d\n", (int)path_type_id); + printf("path is empty, path type = %d, token = %d\n", + (int)path_type_id, + (int)p.get_current_token()); // If this is a struct column, we only need to check to see if there exists a struct. 
if (path_type_id == cudf::type_id::STRUCT || path_type_id == cudf::type_id::LIST) { @@ -510,7 +512,7 @@ __device__ thrust::pair evaluate_path( // case path 4 else if (json_token::START_OBJECT == ctx.token && thrust::get<0>(path_match_named(ctx.path))) { - printf("start object\n"); + // printf("start object\n"); if (!ctx.is_first_enter) { // 2st enter @@ -667,7 +669,7 @@ __device__ thrust::pair evaluate_path( // case path 7 else if (json_token::START_ARRAY == ctx.token && path_match_element(ctx.path, path_instruction_type::WILDCARD)) { - printf("array *\n"); + // printf("array *\n"); if (ctx.is_first_enter) { ctx.is_first_enter = false; @@ -1350,7 +1352,8 @@ void travel_path( std::vector& type_ids, std::unordered_set& keep_quotes, std::string const& name, - json_schema_element const& column_schema) + json_schema_element const& column_schema, + bool found_list_type = false) { current_path.emplace_back(path_instruction_type::NAMED, name, -1); if (column_schema.child_types.size() == 0) { // leaf of the schema @@ -1364,7 +1367,7 @@ void travel_path( } else { type_ids.push_back(column_schema.type.id()); if (column_schema.type.id() == cudf::type_id::STRUCT) { - current_path.pop_back(); + if (found_list_type) { current_path.pop_back(); } paths.push_back(current_path); // this will copy printf("column_schema type: STRUCT\n"); if (column_schema.type.id() == cudf::type_id::STRUCT) { @@ -1380,7 +1383,13 @@ void travel_path( current_path.emplace_back(path_instruction_type::WILDCARD, "*", -1); for (auto const& [child_name, child_schema] : column_schema.child_types) { - travel_path(paths, current_path, type_ids, keep_quotes, child_name, child_schema); + travel_path(paths, + current_path, + type_ids, + keep_quotes, + child_name, + child_schema, + /*found_list_type=*/true); } current_path.pop_back(); @@ -1389,7 +1398,9 @@ void travel_path( CUDF_FAIL("Unsupported type"); } } - if (column_schema.type.id() != cudf::type_id::STRUCT) { current_path.pop_back(); } + if (column_schema.type.id() != cudf::type_id::STRUCT || !found_list_type) { + current_path.pop_back(); + } } std::tuple>>, @@ -1476,9 +1487,11 @@ void assemble_column(std::size_t& column_order, } else if (column_schema.type.id() == cudf::type_id::LIST) { // TODO: split LIST into child column // For now, just output as a strings column. 
- // auto const null_count = read_columns[column_order]->null_count(); - // auto const null_mask = std::move(read_columns[column_order]->release().null_mask); - auto const num_rows = read_columns[column_order]->size(); + auto const num_rows = read_columns[column_order]->size(); + auto const null_count = read_columns[column_order]->null_count(); + auto const null_mask = std::move(read_columns[column_order]->release().null_mask); + + // printf("num rows: %d\n", num_rows); ++column_order; std::vector> children; @@ -1486,23 +1499,28 @@ void assemble_column(std::size_t& column_order, assemble_column(column_order, children, read_columns, child_name, child_schema, stream, mr); } - printf("line %d\n", __LINE__); - cudf::test::print(children.front()->view()); + // printf("line %d\n", __LINE__); + // cudf::test::print(children.front()->view()); auto [offsets, child] = extract_lists(children.front(), column_schema.child_types.front().second, stream, mr); - printf("line %d\n", __LINE__); - cudf::test::print(child->view()); - printf("line %d\n", __LINE__); - cudf::test::print(offsets->view()); + // printf("line %d\n", __LINE__); + // cudf::test::print(child->view()); + // printf("line %d\n", __LINE__); + // cudf::test::print(offsets->view()); // TODO: fix null mask - output.emplace_back( - cudf::make_lists_column(num_rows, std::move(offsets), std::move(child), 0, {}, stream, mr)); - - printf("line %d\n", __LINE__); - cudf::test::print(output.back()->view()); + output.emplace_back(cudf::make_lists_column(num_rows, + std::move(offsets), + std::move(child), + null_count, + std::move(*null_mask), + stream, + mr)); + + // printf("line %d\n", __LINE__); + // cudf::test::print(output.back()->view()); } else { CUDF_FAIL("Unsupported type"); } @@ -1541,7 +1559,7 @@ std::vector> from_json_to_structs( printf("line %d\n", __LINE__); fflush(stdout); -#if 1 +#if 0 int count{0}; for (auto const& path : json_paths) { printf("\n\npath (%d/%d): \n", count++, (int)json_paths.size()); @@ -1586,7 +1604,7 @@ std::vector> from_json_to_structs( printf("line %d\n", __LINE__); fflush(stdout); - if (1) { + if constexpr (0) { for (std::size_t i = 0; i < tmp.size(); ++i) { auto out = cudf::strings_column_view{tmp[i]->view()}; auto ptr = out.chars_begin(stream); diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index 2fcae57cad..524a42533a 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -82,3 +82,29 @@ TEST_F(FromJsonTest, T2) cudf::test::print(col->view()); } } + +TEST_F(FromJsonTest, T3) +{ + // The last row is invalid (has an extra quote). 
+ auto const json_string = cudf::test::strings_column_wrapper{"{'data': [1,0]}"}; + + spark_rapids_jni::json_schema_element a{cudf::data_type{cudf::type_id::STRUCT}, {}}; + a.child_types.emplace_back( + "b", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::INT32}, {}}); + a.child_types.emplace_back( + "c", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::INT32}, {}}); + + std::vector> schema; + schema.emplace_back("data", std::move(a)); + + auto const output = spark_rapids_jni::from_json_to_structs( + cudf::strings_column_view{json_string}, schema, false, false); + + printf("\n\ninput: \n"); + cudf::test::print(json_string); + + printf("\n\noutput: \n"); + for (auto const& col : output) { + cudf::test::print(col->view()); + } +} From 1f36d1780c83b3b3f6806a9b0e06fc7562b38cad Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 24 Sep 2024 21:27:14 -0700 Subject: [PATCH 35/58] Add test --- src/main/cpp/tests/from_json.cu | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index 524a42533a..4d0ed738ef 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -108,3 +108,27 @@ TEST_F(FromJsonTest, T3) cudf::test::print(col->view()); } } + +TEST_F(FromJsonTest, T4) +{ + // The last row is invalid (has an extra quote). + auto const json_string = cudf::test::strings_column_wrapper{"{'data': ['1', '2']}"}; + + spark_rapids_jni::json_schema_element a{cudf::data_type{cudf::type_id::LIST}, {}}; + a.child_types.emplace_back( + "string", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}}); + + std::vector> schema; + schema.emplace_back("data", std::move(a)); + + auto const output = spark_rapids_jni::from_json_to_structs( + cudf::strings_column_view{json_string}, schema, false, false); + + printf("\n\ninput: \n"); + cudf::test::print(json_string); + + printf("\n\noutput: \n"); + for (auto const& col : output) { + cudf::test::print(col->view()); + } +} From af97f21254bc8852ad8211e93a458e4c7d715fc3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 24 Sep 2024 22:28:23 -0700 Subject: [PATCH 36/58] Fix struct child of array --- src/main/cpp/src/from_json_to_structs.cu | 46 +++++++++++++++++------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index ab428656f8..0cff39d9b3 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -446,10 +446,10 @@ __device__ thrust::pair evaluate_path( if (!ctx.task_is_done) { // case (VALUE_STRING, Nil) if style == RawStyle // case path 1 - if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path.size()) && - ctx.style == write_style::RAW) { + if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path.size())) { // there is no array wildcard or slice parent, emit this string without // quotes write current string in parser to generator + ctx.g.try_write_comma(out_buf); ctx.g.write_raw(p, out_buf, keep_quotes); ctx.dirty = 1; ctx.task_is_done = true; @@ -613,7 +613,7 @@ __device__ thrust::pair evaluate_path( // case path 6 else if (json_token::START_ARRAY == ctx.token && path_match_element(ctx.path, path_instruction_type::WILDCARD) && - ctx.style == write_style::QUOTED) { + ctx.style != write_style::QUOTED) { printf("array * not quote\n"); // retain Flatten, otherwise use Quoted... 
cannot use Raw within an array @@ -669,7 +669,7 @@ __device__ thrust::pair evaluate_path( // case path 7 else if (json_token::START_ARRAY == ctx.token && path_match_element(ctx.path, path_instruction_type::WILDCARD)) { - // printf("array *\n"); + printf("array *\n"); if (ctx.is_first_enter) { ctx.is_first_enter = false; @@ -1379,18 +1379,38 @@ void travel_path( printf("column_schema type: LIST\n"); CUDF_EXPECTS(column_schema.child_types.size() == 1, "TODO"); + + // TODO: is this needed, if there is no struct child? paths.push_back(current_path); // this will copy + current_path.emplace_back(path_instruction_type::WILDCARD, "*", -1); + bool has_struct_child{false}; for (auto const& [child_name, child_schema] : column_schema.child_types) { - travel_path(paths, - current_path, - type_ids, - keep_quotes, - child_name, - child_schema, - /*found_list_type=*/true); + if (child_schema.type.id() == cudf::type_id::STRUCT) { + has_struct_child = true; + break; + } } + + // Only add a path name if this column is not under a list type. + if (has_struct_child) { + for (auto const& [child_name, child_schema] : column_schema.child_types) { + travel_path(paths, + current_path, + type_ids, + keep_quotes, + child_name, + child_schema, + /*found_list_type=*/true); + } + } else { + auto const child_type = column_schema.child_types.front().second.type; + if (cudf::is_fixed_width(child_type)) { keep_quotes.insert(paths.size()); } + paths.push_back(current_path); // this will copy + type_ids.push_back(child_type.id()); + } + current_path.pop_back(); } else { @@ -1559,7 +1579,7 @@ std::vector> from_json_to_structs( printf("line %d\n", __LINE__); fflush(stdout); -#if 0 +#if 1 int count{0}; for (auto const& path : json_paths) { printf("\n\npath (%d/%d): \n", count++, (int)json_paths.size()); @@ -1604,7 +1624,7 @@ std::vector> from_json_to_structs( printf("line %d\n", __LINE__); fflush(stdout); - if constexpr (0) { + if constexpr (1) { for (std::size_t i = 0; i < tmp.size(); ++i) { auto out = cudf::strings_column_view{tmp[i]->view()}; auto ptr = out.chars_begin(stream); From b14c3323ac26f45ba67814264857e35bb980f3dd Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 Sep 2024 10:56:11 -0700 Subject: [PATCH 37/58] Implement element delimiter --- src/main/cpp/src/from_json_to_structs.cu | 155 +++++++++++++++++++---- 1 file changed, 129 insertions(+), 26 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 0cff39d9b3..7b7d252eac 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -99,9 +99,9 @@ class json_generator { // add an extra comma if needed, // e.g.: when JSON content is: [[1,2,3] // writing a new [ should result: [[1,2,3],[ - __device__ void write_start_array(char* out_begin) + __device__ void write_start_array(char* out_begin, char element_delimiter) { - try_write_comma(out_begin); + try_write_comma(out_begin, element_delimiter); out_begin[offset + output_len] = '['; output_len++; @@ -137,11 +137,11 @@ class json_generator { /** * write comma accroding to current generator state */ - __device__ void try_write_comma(char* out_begin) + __device__ void try_write_comma(char* out_begin, char element_delimiter) { if (need_comma()) { // in array context and writes first item - out_begin[offset + output_len] = ','; + out_begin[offset + output_len] = element_delimiter; output_len++; } } @@ -151,10 +151,12 @@ class json_generator { * object/array, then copy to corresponding matched end object/array. 
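// NOTE (hedged rationale, not stated in the patch itself): a plausible reason for threading
// element_delimiter through the generator instead of hard-coding ',' is that string values can
// themselves contain commas, e.g. for $.a[*] over
//   {"a": ["x,y", "z"]}
// joining with ','              gives  x,y,z   -> later splits back into 3 elements (wrong)
// joining with an unused byte   keeps the 2-element structure intact (see find_delimiter below).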
return * false if JSON format is invalid return true if JSON format is valid */ - __device__ bool copy_current_structure(json_parser& parser, char* out_begin) + __device__ bool copy_current_structure(json_parser& parser, + char* out_begin, + char element_delimiter) { // first try add comma - try_write_comma(out_begin); + try_write_comma(out_begin, element_delimiter); if (array_depth > 0) { is_curr_array_empty = false; } @@ -408,6 +410,7 @@ __device__ thrust::pair evaluate_path( cudf::device_span path_commands, cudf::type_id path_type_id, bool keep_quotes, + char element_delimiter, char* out_buf, int8_t* max_path_depth_exceeded) { @@ -449,7 +452,7 @@ __device__ thrust::pair evaluate_path( if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path.size())) { // there is no array wildcard or slice parent, emit this string without // quotes write current string in parser to generator - ctx.g.try_write_comma(out_buf); + ctx.g.try_write_comma(out_buf, element_delimiter); ctx.g.write_raw(p, out_buf, keep_quotes); ctx.dirty = 1; ctx.task_is_done = true; @@ -494,15 +497,15 @@ __device__ thrust::pair evaluate_path( if (path_type_id == cudf::type_id::STRUCT) { // Or copy current structure? if (!p.try_skip_children()) { return {false, 0}; } - } else if (!(ctx.g.copy_current_structure(p, nullptr))) { + } else if (!(ctx.g.copy_current_structure(p, nullptr, ','))) { // not copy only if there is struct? return {false, 0}; } // Just write anything into the output, to mark the output as a non-null row. // Such output will be discarded anyway. - ctx.g.write_start_array(out_buf); - } else if (!(ctx.g.copy_current_structure(p, out_buf))) { + ctx.g.write_start_array(out_buf, element_delimiter); + } else if (!(ctx.g.copy_current_structure(p, out_buf, element_delimiter))) { return {false, 0}; } ctx.dirty = 1; @@ -889,6 +892,7 @@ template __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL void get_json_object_kernel(cudf::column_device_view input, cudf::device_span path_data, + char element_delimiter, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, std::size_t num_threads_per_row, @@ -911,8 +915,13 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL json_parser p{char_range{str}}; p.set_allow_leading_zero_numbers(allow_leading_zero_numbers); p.set_allow_non_numeric_numbers(allow_non_numeric_numbers); - thrust::tie(is_valid, out_size) = evaluate_path( - p, path.path_commands, path.type_id, path.keep_quotes, dst, max_path_depth_exceeded); + thrust::tie(is_valid, out_size) = evaluate_path(p, + path.path_commands, + path.type_id, + path.keep_quotes, + element_delimiter, + dst, + max_path_depth_exceeded); // We did not terminate the `evaluate_path` function early to reduce complexity of the code. 
// Instead, if max depth was encountered, we've just continued the evaluation until here @@ -940,6 +949,7 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL struct kernel_launcher { static void exec(cudf::column_device_view const& input, cudf::device_span path_data, + char element_delimiter, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, int8_t* max_path_depth_exceeded, @@ -960,6 +970,7 @@ struct kernel_launcher { get_json_object_kernel <<>>(input, path_data, + element_delimiter, allow_leading_zero_numbers, allow_non_numeric_numbers, num_threads_per_row, @@ -1096,6 +1107,7 @@ std::vector> get_json_object_batch( std::vector const& type_ids, std::vector const& output_ids, std::unordered_set const& keep_quotes, + char element_delimiter, int64_t scratch_size, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, @@ -1151,6 +1163,7 @@ std::vector> get_json_object_batch( kernel_launcher::exec(input, d_path_data, + element_delimiter, allow_leading_zero_numbers, allow_non_numeric_numbers, d_max_path_depth_exceeded, @@ -1227,6 +1240,7 @@ std::vector> get_json_object_batch( rmm::exec_policy(stream), d_error_check.begin(), d_error_check.end(), 0); kernel_launcher::exec(input, d_path_data, + element_delimiter, allow_leading_zero_numbers, allow_non_numeric_numbers, d_max_path_depth_exceeded, @@ -1257,6 +1271,7 @@ std::vector> get_json_object( json_paths, std::vector const& type_ids, std::unordered_set const& keep_quotes, + char element_delimiter, int64_t memory_budget_bytes, int32_t parallel_override, bool allow_leading_zero_numbers, @@ -1330,6 +1345,7 @@ std::vector> get_json_object( batch_type_ids, output_ids, keep_quotes, + element_delimiter, scratch_size, allow_leading_zero_numbers, allow_non_numeric_numbers, @@ -1443,6 +1459,7 @@ flatten_schema_to_paths(std::vector> std::pair, std::unique_ptr> extract_lists( std::unique_ptr& input, json_schema_element const& column_schema, + char element_delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1452,11 +1469,16 @@ std::pair, std::unique_ptr> extract_ cudf::size_type num_child_rows{-1}; auto children = std::move(input->release().children); for (std::size_t child_idx = 0; child_idx < children.size(); ++child_idx) { - auto& child = children[child_idx]; - auto [new_child_offsets, new_child] = - extract_lists(child, column_schema.child_types[child_idx].second, stream, mr); + auto& child = children[child_idx]; + auto [new_child_offsets, new_child] = extract_lists( + child, column_schema.child_types[child_idx].second, element_delimiter, stream, mr); if (num_child_rows < 0) { num_child_rows = new_child->size(); } - CUDF_EXPECTS(num_child_rows == new_child->size(), "TODO"); + if (num_child_rows != new_child->size()) { + printf("num_child_rows != new_child->size(): %d != %d\n", + (int)num_child_rows, + (int)new_child->size()); + } + CUDF_EXPECTS(num_child_rows == new_child->size(), "num_child_rows != new_child->size()"); if (!offsets) { offsets = std::move(new_child_offsets); } new_children.emplace_back(std::move(new_child)); @@ -1470,10 +1492,25 @@ std::pair, std::unique_ptr> extract_ cudf::make_structs_column(num_child_rows, std::move(new_children), 0, {}, stream, mr)}; } - auto split_content = - cudf::strings::split_record( - cudf::strings_column_view{input->view()}, cudf::string_scalar{","}, -1, stream, mr) - ->release(); + printf("before split:\n"); + cudf::test::print(input->view()); + + auto tmp = cudf::strings::split_record(cudf::strings_column_view{input->view()}, + 
cudf::string_scalar{std::string{element_delimiter}}, + -1, + stream, + mr); + // auto split_content = + // cudf::strings::split_record(cudf::strings_column_view{input->view()}, + // cudf::string_scalar{std::string{element_delimiter}}, + // -1, + // stream, + // mr) + // ->release(); + printf("after split:\n"); + cudf::test::print(tmp->view()); + auto split_content = tmp->release(); + return {std::move(split_content.children[cudf::lists_column_view::offsets_column_index]), std::move(split_content.children[cudf::lists_column_view::child_column_index])}; } @@ -1483,6 +1520,7 @@ void assemble_column(std::size_t& column_order, std::vector>& read_columns, std::string const& name, json_schema_element const& column_schema, + char element_delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1497,7 +1535,14 @@ void assemble_column(std::size_t& column_order, std::vector> children; for (auto const& [child_name, child_schema] : column_schema.child_types) { - assemble_column(column_order, children, read_columns, child_name, child_schema, stream, mr); + assemble_column(column_order, + children, + read_columns, + child_name, + child_schema, + element_delimiter, + stream, + mr); } // TODO: generate null mask from input. @@ -1516,14 +1561,21 @@ void assemble_column(std::size_t& column_order, std::vector> children; for (auto const& [child_name, child_schema] : column_schema.child_types) { - assemble_column(column_order, children, read_columns, child_name, child_schema, stream, mr); + assemble_column(column_order, + children, + read_columns, + child_name, + child_schema, + element_delimiter, + stream, + mr); } // printf("line %d\n", __LINE__); // cudf::test::print(children.front()->view()); - auto [offsets, child] = - extract_lists(children.front(), column_schema.child_types.front().second, stream, mr); + auto [offsets, child] = extract_lists( + children.front(), column_schema.child_types.front().second, element_delimiter, stream, mr); // printf("line %d\n", __LINE__); // cudf::test::print(child->view()); @@ -1547,9 +1599,55 @@ void assemble_column(std::size_t& column_order, } } +char find_delimiter(cudf::strings_column_view const& input, rmm::cuda_stream_view stream) +{ + auto constexpr num_levels = 256; + auto constexpr lower_level = std::numeric_limits::min(); + auto constexpr upper_level = std::numeric_limits::max(); + auto const num_chars = input.chars_size(stream); // stream sync + + // TODO: return when num_chars==0 + + rmm::device_uvector d_histogram(num_levels, stream); + thrust::fill(rmm::exec_policy(stream), d_histogram.begin(), d_histogram.end(), 0); + + size_t temp_storage_bytes = 0; + cub::DeviceHistogram::HistogramEven(nullptr, + temp_storage_bytes, + input.chars_begin(stream), + d_histogram.begin(), + num_levels, + lower_level, + upper_level, + num_chars, + stream.value()); + rmm::device_buffer d_temp(temp_storage_bytes, stream); + cub::DeviceHistogram::HistogramEven(d_temp.data(), + temp_storage_bytes, + input.chars_begin(stream), + d_histogram.begin(), + num_levels, + lower_level, + upper_level, + num_chars, + stream.value()); + + auto const zero_level = d_histogram.begin() - lower_level; + auto const first_zero_count_pos = + thrust::find(rmm::exec_policy(stream), zero_level + '\n', d_histogram.end(), 0); + if (first_zero_count_pos == d_histogram.end()) { + throw std::logic_error( + "can't find a character suitable as delimiter for combining json strings to json lines with " + "custom delimiter"); + } + auto const first_non_existing_char = first_zero_count_pos - 
zero_level; + return first_non_existing_char; +} + std::vector> assemble_output( std::vector> const& schema, std::vector>& read_columns, + char element_delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1558,7 +1656,8 @@ std::vector> assemble_output( std::size_t column_order{0}; std::for_each(schema.begin(), schema.end(), [&](auto const& kv) { - assemble_column(column_order, output, read_columns, kv.first, kv.second, stream, mr); + assemble_column( + column_order, output, read_columns, kv.first, kv.second, element_delimiter, stream, mr); }); return output; @@ -1611,10 +1710,14 @@ std::vector> from_json_to_structs( #endif + auto const delimiter = find_delimiter(input, stream); + printf("delimiter: %c (code: %d)\n", delimiter, (int)delimiter); + auto tmp = test::get_json_object(input, json_paths, type_ids, keep_quotes, + delimiter, -1L, -1, allow_leading_zero_numbers, @@ -1642,7 +1745,7 @@ std::vector> from_json_to_structs( } } - return assemble_output(schema, tmp, stream, mr); + return assemble_output(schema, tmp, delimiter, stream, mr); } } // namespace detail From 54a8e969321436441d12d724daae21aa9d126221 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 Sep 2024 12:52:27 -0700 Subject: [PATCH 38/58] Cleanup --- src/main/cpp/src/from_json_to_structs.cu | 203 ++++------------------- 1 file changed, 28 insertions(+), 175 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 7b7d252eac..feaffa016f 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -18,7 +18,7 @@ #include "get_json_object.hpp" #include "json_parser.cuh" -#include +// #include #include #include @@ -479,9 +479,9 @@ __device__ thrust::pair evaluate_path( // case (_, Nil) // case path 3 else if (path_is_empty(ctx.path.size())) { - printf("path is empty, path type = %d, token = %d\n", - (int)path_type_id, - (int)p.get_current_token()); + // printf("path is empty, path type = %d, token = %d\n", + // (int)path_type_id, + // (int)p.get_current_token()); // If this is a struct column, we only need to check to see if there exists a struct. if (path_type_id == cudf::type_id::STRUCT || path_type_id == cudf::type_id::LIST) { @@ -587,92 +587,11 @@ __device__ thrust::pair evaluate_path( } } } -#if 0 - // case (START_ARRAY, Wildcard :: Wildcard :: xs) - // case path 5 - else if (json_token::START_ARRAY == ctx.token && - path_match_elements( - ctx.path, path_instruction_type::WILDCARD, path_instruction_type::WILDCARD)) { - // special handling for the non-structure preserving double wildcard - // behavior in Hive - if (ctx.is_first_enter) { - ctx.is_first_enter = false; - ctx.g.write_start_array(out_buf); - } - - if (p.next_token() != json_token::END_ARRAY) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } - push_context(evaluation_case_path::START_ARRAY___MATCHED_DOUBLE_WILDCARD, - ctx.g, - write_style::FLATTEN, - {ctx.path.data() + 2, ctx.path.size() - 2}); - } else { - ctx.g.write_end_array(out_buf); - ctx.task_is_done = true; - } - } - // case (START_ARRAY, Wildcard :: xs) if style != QuotedStyle - // case path 6 - else if (json_token::START_ARRAY == ctx.token && - path_match_element(ctx.path, path_instruction_type::WILDCARD) && - ctx.style != write_style::QUOTED) { - printf("array * not quote\n"); - - // retain Flatten, otherwise use Quoted... 
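// NOTE (hedged sketch, not part of the diff; needs <array>, <stdexcept>, <string_view>): a
// host-side reference for the cub::DeviceHistogram search above — count every byte of the
// chars buffer, then take the first byte value at or above '\n' that never occurs. Signed-char
// boundary details of the device version are glossed over here.
inline char find_delimiter_reference(std::string_view chars)
{
  std::array<int64_t, 256> counts{};
  for (unsigned char c : chars) { ++counts[c]; }
  for (int b = '\n'; b <= 127; ++b) {
    if (counts[b] == 0) { return static_cast<char>(b); }  // first byte absent from the data
  }
  throw std::logic_error("no unused byte available to use as the element delimiter");
}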
cannot use Raw within an array - write_style next_style = write_style::RAW; - switch (ctx.style) { - case write_style::RAW: next_style = write_style::QUOTED; break; - case write_style::FLATTEN: next_style = write_style::FLATTEN; break; - case write_style::QUOTED: next_style = write_style::QUOTED; // never happen - } - - // temporarily buffer child matches, the emitted json will need to be - // modified slightly if there is only a single element written - - json_generator child_g; - if (ctx.is_first_enter) { - ctx.is_first_enter = false; - // create a child generator with hide outer array tokens mode. - child_g = ctx.g.new_child_generator(); - // write first [ without output, without update len, only update internal state - child_g.write_first_start_array_without_output(); - } else { - child_g = ctx.child_g; - } - - if (p.next_token() != json_token::END_ARRAY) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } - // track the number of array elements and only emit an outer array if - // we've written more than one element, this matches Hive's behavior - push_context(evaluation_case_path::START_ARRAY___MATCHED_WILDCARD___STYLE_NOT_QUOTED, - child_g, - next_style, - {ctx.path.data() + 1, ctx.path.size() - 1}); - } else { - char* child_g_start = out_buf + child_g.get_offset(); - int child_g_len = child_g.get_output_len(); - if (ctx.dirty > 1) { - // add outer array tokens - ctx.g.write_child_raw_value( - child_g_start, child_g_len, /* write_outer_array_tokens */ false); - } else if (ctx.dirty == 1) { - // remove outer array tokens - ctx.g.write_child_raw_value( - child_g_start, child_g_len, /* write_outer_array_tokens */ false); - } // else do not write anything - - // Done anyway, since we already reached the end array. 
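// NOTE (not part of the diff): the Index and Index+Wildcard cases removed in this cleanup can
// never match for from_json_to_structs, because the schema-driven flattening in this file only
// ever emits these two instruction kinds:
//   current_path.emplace_back(path_instruction_type::NAMED, name, -1);    // struct field step
//   current_path.emplace_back(path_instruction_type::WILDCARD, "*", -1);  // list element step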
- ctx.task_is_done = true; - } - } -#endif // case (START_ARRAY, Wildcard :: xs) // case path 7 else if (json_token::START_ARRAY == ctx.token && path_match_element(ctx.path, path_instruction_type::WILDCARD)) { - printf("array *\n"); + // printf("array *\n"); if (ctx.is_first_enter) { ctx.is_first_enter = false; @@ -693,72 +612,6 @@ __device__ thrust::pair evaluate_path( ctx.task_is_done = true; } } - /* case (START_ARRAY, Index(idx) :: (xs@Wildcard :: _)) */ - // case path 8 - else if (json_token::START_ARRAY == ctx.token && - thrust::get<0>(path_match_index_wildcard(ctx.path))) { - int idx = thrust::get<1>(path_match_index_wildcard(ctx.path)); - - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } - ctx.is_first_enter = false; - - int i = idx; - while (i > 0) { - if (p.get_current_token() == json_token::END_ARRAY) { - // terminate, nothing has been written - return {false, 0}; - } - - if (!p.try_skip_children()) { return {false, 0}; } - - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } - - --i; - } - - // i == 0 - push_context(evaluation_case_path::START_ARRAY___MATCHED_INDEX_AND_WILDCARD, - ctx.g, - write_style::QUOTED, - {ctx.path.data() + 1, ctx.path.size() - 1}); - } -#if 0 - // case (START_ARRAY, Index(idx) :: xs) - // case path 9 - else if (json_token::START_ARRAY == ctx.token && thrust::get<0>(path_match_index(ctx.path))) { - int idx = thrust::get<1>(path_match_index(ctx.path)); - - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } - - int i = idx; - while (i > 0) { - if (p.get_current_token() == json_token::END_ARRAY) { - // terminate, nothing has been written - return {false, 0}; - } - - if (!p.try_skip_children()) { return {false, 0}; } - - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } - - --i; - } - - // i == 0 - push_context(evaluation_case_path::START_ARRAY___MATCHED_INDEX, - ctx.g, - ctx.style, - {ctx.path.data() + 1, ctx.path.size() - 1}); - } -#endif // case _ => // case path 12 else { @@ -1142,10 +995,10 @@ std::vector> get_json_object_batch( out_stringviews.emplace_back(rmm::device_uvector>{ static_cast(input.size()), stream}); - printf("idx: %d, output_ids[idx]: %d\n", (int)idx, (int)output_ids[idx]); - printf("keep_quotes.find(output_ids[idx]) != keep_quotes.end(): %d\n", - (int)(keep_quotes.find(output_ids[idx]) != keep_quotes.end())); - fflush(stdout); + // printf("idx: %d, output_ids[idx]: %d\n", (int)idx, (int)output_ids[idx]); + // printf("keep_quotes.find(output_ids[idx]) != keep_quotes.end(): %d\n", + // (int)(keep_quotes.find(output_ids[idx]) != keep_quotes.end())); + // fflush(stdout); h_path_data.emplace_back( json_path_processing_data{d_json_paths[idx], @@ -1377,7 +1230,7 @@ void travel_path( // TODO: comment keep_quotes.insert(paths.size()); } - printf("column_schema type: %d\n", static_cast(column_schema.type.id())); + // printf("column_schema type: %d\n", static_cast(column_schema.type.id())); paths.push_back(current_path); // this will copy type_ids.push_back(column_schema.type.id()); } else { @@ -1385,14 +1238,14 @@ void travel_path( if (column_schema.type.id() == cudf::type_id::STRUCT) { if (found_list_type) { current_path.pop_back(); } paths.push_back(current_path); // this will copy - printf("column_schema type: STRUCT\n"); + // printf("column_schema type: STRUCT\n"); if 
(column_schema.type.id() == cudf::type_id::STRUCT) { for (auto const& [child_name, child_schema] : column_schema.child_types) { travel_path(paths, current_path, type_ids, keep_quotes, child_name, child_schema); } } } else if (column_schema.type.id() == cudf::type_id::LIST) { - printf("column_schema type: LIST\n"); + // printf("column_schema type: LIST\n"); CUDF_EXPECTS(column_schema.child_types.size() == 1, "TODO"); @@ -1474,9 +1327,9 @@ std::pair, std::unique_ptr> extract_ child, column_schema.child_types[child_idx].second, element_delimiter, stream, mr); if (num_child_rows < 0) { num_child_rows = new_child->size(); } if (num_child_rows != new_child->size()) { - printf("num_child_rows != new_child->size(): %d != %d\n", - (int)num_child_rows, - (int)new_child->size()); + // printf("num_child_rows != new_child->size(): %d != %d\n", + // (int)num_child_rows, + // (int)new_child->size()); } CUDF_EXPECTS(num_child_rows == new_child->size(), "num_child_rows != new_child->size()"); @@ -1492,8 +1345,8 @@ std::pair, std::unique_ptr> extract_ cudf::make_structs_column(num_child_rows, std::move(new_children), 0, {}, stream, mr)}; } - printf("before split:\n"); - cudf::test::print(input->view()); + // printf("before split:\n"); + // cudf::test::print(input->view()); auto tmp = cudf::strings::split_record(cudf::strings_column_view{input->view()}, cudf::string_scalar{std::string{element_delimiter}}, @@ -1507,8 +1360,8 @@ std::pair, std::unique_ptr> extract_ // stream, // mr) // ->release(); - printf("after split:\n"); - cudf::test::print(tmp->view()); + // printf("after split:\n"); + // cudf::test::print(tmp->view()); auto split_content = tmp->release(); return {std::move(split_content.children[cudf::lists_column_view::offsets_column_index]), @@ -1671,14 +1524,14 @@ std::vector> from_json_to_structs( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); auto const [json_paths, type_ids, keep_quotes] = flatten_schema_to_paths(schema); - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); -#if 1 +#if 0 int count{0}; for (auto const& path : json_paths) { printf("\n\npath (%d/%d): \n", count++, (int)json_paths.size()); @@ -1711,7 +1564,7 @@ std::vector> from_json_to_structs( #endif auto const delimiter = find_delimiter(input, stream); - printf("delimiter: %c (code: %d)\n", delimiter, (int)delimiter); + // printf("delimiter: %c (code: %d)\n", delimiter, (int)delimiter); auto tmp = test::get_json_object(input, json_paths, @@ -1724,10 +1577,10 @@ std::vector> from_json_to_structs( allow_non_numeric_numbers, stream, mr); - printf("line %d\n", __LINE__); - fflush(stdout); + // printf("line %d\n", __LINE__); + // fflush(stdout); - if constexpr (1) { + if constexpr (0) { for (std::size_t i = 0; i < tmp.size(); ++i) { auto out = cudf::strings_column_view{tmp[i]->view()}; auto ptr = out.chars_begin(stream); From 705dd6bd9b3809c7c4ccbbda1ef8be811162df9a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 Sep 2024 13:49:56 -0700 Subject: [PATCH 39/58] Add `isNullOrEmpty` function --- src/main/cpp/src/JSONUtilsJni.cpp | 15 +++++++++++ src/main/cpp/src/from_json.hpp | 5 ++++ src/main/cpp/src/from_json_to_structs.cu | 25 +++++++++++++++++++ .../nvidia/spark/rapids/jni/JSONUtils.java | 7 ++++++ 4 files changed, 52 insertions(+) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index e47bb4ac1f..e568c4db06 
100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -280,4 +280,19 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, } CATCH_STD(env, 0); } + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_isNullOrEmpty(JNIEnv* env, + jclass, + jlong j_input) +{ + JNI_NULL_CHECK(env, j_input, "j_input is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input_cv = reinterpret_cast(j_input); + return cudf::jni::release_as_jlong( + spark_rapids_jni::is_null_or_empty(cudf::strings_column_view{*input_cv})); + } + CATCH_STD(env, 0); +} } diff --git a/src/main/cpp/src/from_json.hpp b/src/main/cpp/src/from_json.hpp index 7875296c4c..706b7c25e4 100644 --- a/src/main/cpp/src/from_json.hpp +++ b/src/main/cpp/src/from_json.hpp @@ -48,4 +48,9 @@ std::vector> from_json_to_structs( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +std::unique_ptr is_null_or_empty( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index feaffa016f..201de20956 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -1603,6 +1603,31 @@ std::vector> from_json_to_structs( } // namespace detail +std::unique_ptr is_null_or_empty(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const d_input_ptr = cudf::column_device_view::create(input.parent(), stream); + rmm::device_uvector output(input.size(), stream, mr); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + output.begin(), + [input = *d_input_ptr] __device__(cudf::size_type idx) -> bool { + if (input.is_null(idx)) { return true; } + + auto const d_str = input.element(idx); + int i = 0; + for (; i < d_str.size_bytes(); ++i) { + if (d_str[i] != ' ') { break; } + } + auto const empty = i == d_str.size_bytes(); + return empty; + }); + + return std::make_unique(std::move(output), rmm::device_buffer{}, 0); +} + std::vector> from_json_to_structs( cudf::strings_column_view const& input, std::vector> const& schema, diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 756375a9dd..60d482b663 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -177,6 +177,11 @@ public static Table fromJsonToStructs(ColumnView input, Schema schema, allowNumericLeadingZeros, allowNonNumericNumbers)); } + public static ColumnVector isNullOrEmpty(ColumnVector input) { + assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; + return new ColumnVector(isNullOrEmpty(input.getNativeView())); + } + private static native int getMaxJSONPathDepth(); private static native long getJsonObject(long input, @@ -201,4 +206,6 @@ private static native long[] fromJsonToStructs(long input, String[] columnNames, int[] dTypeScales, boolean allowNumericLeadingZeros, boolean allowNonNumericNumbers); + + private static native long isNullOrEmpty(long input); } From a3bb86c53bdf53a76409707569f76847a01c248a Mon Sep 17 
00:00:00 2001 From: Nghia Truong Date: Wed, 25 Sep 2024 13:57:12 -0700 Subject: [PATCH 40/58] Cleanup --- src/main/cpp/src/JSONUtilsJni.cpp | 44 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index e568c4db06..5b0dd8282a 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -33,40 +33,38 @@ json_schema_element read_schema_element(int& index, cudf::jni::native_jintArray const& types, cudf::jni::native_jintArray const& scales) { - printf("JNI line %d\n", __LINE__); - fflush(stdout); + // printf("JNI line %d\n", __LINE__); + // fflush(stdout); auto d_type = cudf::data_type{static_cast(types[index]), scales[index]}; if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) { - printf("JNI line %d\n", __LINE__); - fflush(stdout); + // printf("JNI line %d\n", __LINE__); + // fflush(stdout); std::vector> child_elems; int num_children = children[index]; // go to the next entry, so recursion can parse it. index++; for (int i = 0; i < num_children; i++) { - printf("JNI line %d\n", __LINE__); - fflush(stdout); + // printf("JNI line %d\n", __LINE__); + // fflush(stdout); auto const name = std::string{names.get(index).get()}; child_elems.emplace_back(name, read_schema_element(index, names, children, types, scales)); } return json_schema_element{d_type, std::move(child_elems)}; } else { - printf("JNI line %d\n", __LINE__); - - printf("children size: %d, idx = %d\n", children.size(), index); - - fflush(stdout); + // printf("JNI line %d\n", __LINE__); + // printf("children size: %d, idx = %d\n", children.size(), index); + // fflush(stdout); if (children[index] != 0) { throw std::invalid_argument("found children for a type that should have none"); } // go to the next entry before returning... 
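    // For readers of this hunk: the Java side hands the requested schema over as parallel
    // flattened arrays (column names, child counts, type ids, decimal scales) laid out
    // depth-first, and read_schema_element() rebuilds the nested json_schema_element tree by
    // consuming one entry per call and recursing into children while advancing the shared
    // `index`. A rough, hypothetical illustration for a single requested column
    // s: STRUCT<a INT32, b STRING>:
    //
    //   names    = ["s",     "a",     "b"     ]
    //   children = [ 2,       0,       0      ]
    //   types    = [ STRUCT,  INT32,   STRING ]
    //
    // which is why a leaf entry must report zero children before the index advances below.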
index++; - printf("JNI line %d\n", __LINE__); - fflush(stdout); + // printf("JNI line %d\n", __LINE__); + // fflush(stdout); return json_schema_element{d_type, {}}; } } @@ -237,14 +235,14 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and num children must match size", 0); } - printf("JNI line %d, size = %d\n", __LINE__, (int)n_types.size()); - fflush(stdout); + // printf("JNI line %d, size = %d\n", __LINE__, (int)n_types.size()); + // fflush(stdout); std::vector> schema; int idx = 0; while (idx < n_types.size()) { - printf("JNI line %d\n", __LINE__); - fflush(stdout); + // printf("JNI line %d\n", __LINE__); + // fflush(stdout); auto const name = std::string{n_col_names.get(idx).get()}; schema.emplace_back( @@ -252,16 +250,16 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, spark_rapids_jni::read_schema_element(idx, n_col_names, n_children, n_types, n_scales)); // auto const name = n_col_names.get(at).get(); - printf("JNI line %d\n", __LINE__); - fflush(stdout); + // printf("JNI line %d\n", __LINE__); + // fflush(stdout); // auto child = cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, // n_scales); printf("JNI line %d\n", __LINE__); fflush(stdout); // schema.emplace(name, std::move(child)); } - printf("JNI line %d\n", __LINE__); - fflush(stdout); + // printf("JNI line %d\n", __LINE__); + // fflush(stdout); auto const input_cv = reinterpret_cast(j_input); auto output = spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{*input_cv}, @@ -269,8 +267,8 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, allow_leading_zero_numbers, allow_non_numeric_numbers); - printf("JNI line %d\n", __LINE__); - fflush(stdout); + // printf("JNI line %d\n", __LINE__); + // fflush(stdout); auto out_handles = cudf::jni::native_jlongArray(env, output.size()); std::transform(output.begin(), output.end(), out_handles.begin(), [](auto& col) { From 3edfb0dd79bc00f15bd3aefe4fb7b1d352a86c88 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 Sep 2024 13:59:31 -0700 Subject: [PATCH 41/58] Change memory budget --- src/main/cpp/src/from_json_to_structs.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 201de20956..5d95e346ca 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -1571,7 +1571,7 @@ std::vector> from_json_to_structs( type_ids, keep_quotes, delimiter, - -1L, + 1024 * 1024 * 1024 * 4L, -1, allow_leading_zero_numbers, allow_non_numeric_numbers, From 92c430817fccfc21c3eb76b14f0164ed6e314f38 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 Sep 2024 15:58:12 -0700 Subject: [PATCH 42/58] Add test --- src/main/cpp/tests/from_json.cu | 40 +++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index 4d0ed738ef..8ccfb27318 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -132,3 +132,43 @@ TEST_F(FromJsonTest, T4) cudf::test::print(col->view()); } } + +#include +#include +#include + +TEST_F(FromJsonTest, T5) +{ + // clang-format off +// {"BEACAHEBBO":{"GPECEDGF":"Az[M`Q.'mn`","MFCEINDHFNPJE":"FsZ!/*!){O5>M","OCIKAF":"FsZ!/*!)","GPIHMJ":"|i2l\\J)u8I*Z|TBG$Ho%t","JHG":"B]0r@jN&\"pvP=X}/##H8sRZCc?G [u\".T(FuW@bq2#AgS,S& 
gqy.emb3?!MfP8Vb.1*eW.WyK)7DF8b.\"","BJKAPMIHEGA":"Az[M","OFEIBPMAEIBALDDD":"FsZ!/*!"},"CGEGPD":[{"JD":"\">z\"'","GMFDD":"y:Mb`Efozq2","NHKPJLNJBJ":"Az[M`","BCCOEEALBP":"2Jn.","CJKIKCGA":"j8(9Sf)7wetOhXt{N%=y-Xu!k ijVfcNKQ+RX)Y}!!ezc)#6i!GX?Z~LvIpI.h/DBt`7y`mu*W6v6*K#8Aw\\.`\\(4G4","OPHLHN":"FsZ!/"}]} + // clang-format on + std::ifstream t("/home/nghiat/Devel/data/from_json_array_issue.json"); + std::string str((std::istreambuf_iterator(t)), std::istreambuf_iterator()); + std::cout << "input:\n" << str << std::endl; + + // The last row is invalid (has an extra quote). + auto const json_string = + cudf::test::strings_column_wrapper{std::initializer_list{str}}; + spark_rapids_jni::json_schema_element a{cudf::data_type{cudf::type_id::LIST}, {}}; + + a.child_types.emplace_back( + "struct", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRUCT}, {}}); + a.child_types.front().second.child_types.emplace_back( + "KMEJHDA", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}}); + a.child_types.front().second.child_types.emplace_back( + "CJKIKCGA", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}}); + + std::vector> schema; + schema.emplace_back("CGEGPD", std::move(a)); + + auto const output = spark_rapids_jni::from_json_to_structs( + cudf::strings_column_view{json_string}, schema, false, false); + + printf("\n\ninput: \n"); + cudf::test::print(json_string); + + printf("\n\noutput: \n"); + for (auto const& col : output) { + cudf::test::print(col->view()); + } +} From 395c0bc1ff9ad9439ab71eed7d7ec135a6c019fb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 Sep 2024 17:59:11 -0700 Subject: [PATCH 43/58] Search for both element delimiter and null placeholder --- src/main/cpp/src/from_json_to_structs.cu | 76 ++++++++++++++++-------- 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 5d95e346ca..b52f498b16 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -1220,10 +1220,11 @@ void travel_path( std::vector>& current_path, std::vector& type_ids, std::unordered_set& keep_quotes, + bool& has_list_type, std::string const& name, - json_schema_element const& column_schema, - bool found_list_type = false) + json_schema_element const& column_schema) { + bool popped{false}; current_path.emplace_back(path_instruction_type::NAMED, name, -1); if (column_schema.child_types.size() == 0) { // leaf of the schema if (cudf::is_fixed_width(column_schema.type)) { @@ -1236,18 +1237,23 @@ void travel_path( } else { type_ids.push_back(column_schema.type.id()); if (column_schema.type.id() == cudf::type_id::STRUCT) { - if (found_list_type) { current_path.pop_back(); } + if (has_list_type) { + popped = true; + current_path.pop_back(); + } paths.push_back(current_path); // this will copy // printf("column_schema type: STRUCT\n"); if (column_schema.type.id() == cudf::type_id::STRUCT) { for (auto const& [child_name, child_schema] : column_schema.child_types) { - travel_path(paths, current_path, type_ids, keep_quotes, child_name, child_schema); + travel_path( + paths, current_path, type_ids, keep_quotes, has_list_type, child_name, child_schema); } } } else if (column_schema.type.id() == cudf::type_id::LIST) { // printf("column_schema type: LIST\n"); CUDF_EXPECTS(column_schema.child_types.size() == 1, "TODO"); + has_list_type = true; // TODO: is this needed, if there is no struct child? 
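  // What this branch is building up: the schema is flattened into one JSON path per leaf column
  // (plus a path for the list itself), and a WILDCARD instruction is appended so that every
  // array element gets visited. Roughly, for a schema like data: ARRAY<STRUCT<a INT, b INT>>
  // the flattened paths include
  //
  //   $.data        -- used to recover the per-row element offsets
  //   $.data[*].a   -- the `a` values of all array elements
  //   $.data[*].b   -- the `b` values of all array elements
  //
  // and the extracted leaf strings are later reassembled into a LIST<STRUCT<...>> column by
  // assemble_output().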
paths.push_back(current_path); // this will copy @@ -1265,13 +1271,8 @@ void travel_path( // Only add a path name if this column is not under a list type. if (has_struct_child) { for (auto const& [child_name, child_schema] : column_schema.child_types) { - travel_path(paths, - current_path, - type_ids, - keep_quotes, - child_name, - child_schema, - /*found_list_type=*/true); + travel_path( + paths, current_path, type_ids, keep_quotes, has_list_type, child_name, child_schema); } } else { auto const child_type = column_schema.child_types.front().second.type; @@ -1287,26 +1288,27 @@ void travel_path( CUDF_FAIL("Unsupported type"); } } - if (column_schema.type.id() != cudf::type_id::STRUCT || !found_list_type) { - current_path.pop_back(); - } + // if (column_schema.type.id() != cudf::type_id::STRUCT || !has_list_type) { + if (column_schema.type.id() != cudf::type_id::STRUCT || !popped) { current_path.pop_back(); } } std::tuple>>, std::vector, - std::unordered_set> + std::unordered_set, + bool> flatten_schema_to_paths(std::vector> const& schema) { std::vector>> paths; std::vector type_ids; std::unordered_set keep_quotes; + bool has_list_type{false}; std::vector> current_path; std::for_each(schema.begin(), schema.end(), [&](auto const& kv) { - travel_path(paths, current_path, type_ids, keep_quotes, kv.first, kv.second); + travel_path(paths, current_path, type_ids, keep_quotes, has_list_type, kv.first, kv.second); }); - return {std::move(paths), std::move(type_ids), std::move(keep_quotes)}; + return {std::move(paths), std::move(type_ids), std::move(keep_quotes), has_list_type}; } std::pair, std::unique_ptr> extract_lists( @@ -1452,7 +1454,8 @@ void assemble_column(std::size_t& column_order, } } -char find_delimiter(cudf::strings_column_view const& input, rmm::cuda_stream_view stream) +std::pair find_delimiter(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream) { auto constexpr num_levels = 256; auto constexpr lower_level = std::numeric_limits::min(); @@ -1486,15 +1489,33 @@ char find_delimiter(cudf::strings_column_view const& input, rmm::cuda_stream_vie stream.value()); auto const zero_level = d_histogram.begin() - lower_level; - auto const first_zero_count_pos = - thrust::find(rmm::exec_policy(stream), zero_level + '\n', d_histogram.end(), 0); + auto first_zero_count_pos = + thrust::find(rmm::exec_policy(stream), zero_level, d_histogram.end(), 0); if (first_zero_count_pos == d_histogram.end()) { + // Try again... 
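    // (A zero-count bin in this 256-bin byte histogram is a byte value that never occurs
    // anywhere in the input, so it is safe to repurpose as a separator. The scan above starts at
    // the bin for value zero; the retry below widens the search to the whole histogram. Two such
    // absent bytes are now needed -- the element delimiter and the null placeholder -- hence the
    // second find() further down.)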
+ first_zero_count_pos = + thrust::find(rmm::exec_policy(stream), d_histogram.begin(), d_histogram.end(), 0); + if (first_zero_count_pos == d_histogram.end()) { + // TODO: change message + throw std::logic_error( + "can't find a character suitable as delimiter for combining json strings to json lines " + "with " + "custom delimiter"); + } + } + + auto second_zero_count_pos = + thrust::find(rmm::exec_policy(stream), first_zero_count_pos + 1, d_histogram.end(), 0); + if (second_zero_count_pos == d_histogram.end()) { + // TODO: change message throw std::logic_error( - "can't find a character suitable as delimiter for combining json strings to json lines with " + "can't find a character suitable as delimiter for combining json strings to json lines " + "with " "custom delimiter"); } - auto const first_non_existing_char = first_zero_count_pos - zero_level; - return first_non_existing_char; + + return {static_cast(first_zero_count_pos - zero_level), + static_cast(second_zero_count_pos - zero_level)}; } std::vector> assemble_output( @@ -1526,7 +1547,7 @@ std::vector> from_json_to_structs( { // printf("line %d\n", __LINE__); // fflush(stdout); - auto const [json_paths, type_ids, keep_quotes] = flatten_schema_to_paths(schema); + auto const [json_paths, type_ids, keep_quotes, has_list_type] = flatten_schema_to_paths(schema); // printf("line %d\n", __LINE__); // fflush(stdout); @@ -1563,8 +1584,11 @@ std::vector> from_json_to_structs( #endif - auto const delimiter = find_delimiter(input, stream); - // printf("delimiter: %c (code: %d)\n", delimiter, (int)delimiter); + // This should only run when there is LIST column. + char delimiter{','}, null_placeholder{'\0'}; + if (has_list_type) { std::tie(delimiter, null_placeholder) = find_delimiter(input, stream); } + printf("delimiter: %c (code: %d)\n", delimiter, (int)delimiter); + printf("null_placeholder: %c (code: %d)\n", null_placeholder, (int)null_placeholder); auto tmp = test::get_json_object(input, json_paths, From a04183c8aa95f5e7c11d4e503631f4e16a0768a8 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 Sep 2024 21:44:19 -0700 Subject: [PATCH 44/58] Fix null in array --- src/main/cpp/src/from_json_to_structs.cu | 127 +++++++++++++++++++---- src/main/cpp/tests/from_json.cu | 28 +++++ 2 files changed, 134 insertions(+), 21 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index b52f498b16..5059df556b 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -18,10 +18,11 @@ #include "get_json_object.hpp" #include "json_parser.cuh" -// #include +#include #include #include +#include #include #include #include @@ -190,6 +191,13 @@ class json_generator { output_len += write_quote(out_begin + offset + output_len, keep_quotes); } + __device__ void write_null_placeholder(char* out_begin, char null) + { + out_begin[offset + output_len] = null; + output_len += 1; + is_curr_array_empty = false; + } + /** * write child raw value * e.g.: @@ -387,6 +395,8 @@ struct context { write_style style; + bool is_in_array; + // for some case paths bool is_first_enter; @@ -411,6 +421,7 @@ __device__ thrust::pair evaluate_path( cudf::type_id path_type_id, bool keep_quotes, char element_delimiter, + char null_placeholder, char* out_buf, int8_t* max_path_depth_exceeded) { @@ -424,7 +435,8 @@ __device__ thrust::pair evaluate_path( auto const push_context = [&](evaluation_case_path _case_path, json_generator _g, write_style _style, - cudf::device_span _path) { + 
cudf::device_span _path, + bool is_in_array) { if (stack_size > MAX_JSON_PATH_DEPTH) { *max_path_depth_exceeded = 1; // Because no more context is pushed, the evaluation output should be wrong. @@ -438,11 +450,13 @@ __device__ thrust::pair evaluate_path( ctx.case_path = _case_path; ctx.token = p.get_current_token(); ctx.style = _style; + ctx.is_in_array = is_in_array; ctx.is_first_enter = true; ctx.task_is_done = false; }; - push_context(evaluation_case_path::INVALID, json_generator{}, write_style::RAW, path_commands); + push_context( + evaluation_case_path::INVALID, json_generator{}, write_style::RAW, path_commands, false); while (stack_size > 0) { auto& ctx = stack[stack_size - 1]; @@ -470,7 +484,8 @@ __device__ thrust::pair evaluate_path( push_context(evaluation_case_path::START_ARRAY___EMPTY_PATH___FLATTEN_STYLE, ctx.g, ctx.style, - {nullptr, 0}); + {nullptr, 0}, + true); } else { // END_ARRAY ctx.task_is_done = true; @@ -564,7 +579,8 @@ __device__ thrust::pair evaluate_path( push_context(evaluation_case_path::START_OBJECT___MATCHED_NAME_PATH, ctx.g, ctx.style, - {ctx.path.data() + 1, ctx.path.size() - 1}); + {ctx.path.data() + 1, ctx.path.size() - 1}, + ctx.is_in_array); found_expected_child = true; break; } else { @@ -581,9 +597,14 @@ __device__ thrust::pair evaluate_path( } } if (!found_expected_child) { - // did not find any expected sub child + if (ctx.is_in_array) { + ctx.g.try_write_comma(out_buf, element_delimiter); + ctx.g.write_null_placeholder(out_buf, null_placeholder); + ctx.dirty = 1; + } else { + ctx.dirty = false; + } ctx.task_is_done = true; - ctx.dirty = false; } } } @@ -606,7 +627,8 @@ __device__ thrust::pair evaluate_path( push_context(evaluation_case_path::START_ARRAY___MATCHED_WILDCARD, ctx.g, write_style::QUOTED, - {ctx.path.data() + 1, ctx.path.size() - 1}); + {ctx.path.data() + 1, ctx.path.size() - 1}, + true); } else { // ctx.g.write_end_array(out_buf); ctx.task_is_done = true; @@ -746,6 +768,7 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL void get_json_object_kernel(cudf::column_device_view input, cudf::device_span path_data, char element_delimiter, + char null_placeholder, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, std::size_t num_threads_per_row, @@ -773,6 +796,7 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL path.type_id, path.keep_quotes, element_delimiter, + null_placeholder, dst, max_path_depth_exceeded); @@ -803,6 +827,7 @@ struct kernel_launcher { static void exec(cudf::column_device_view const& input, cudf::device_span path_data, char element_delimiter, + char null_placeholder, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, int8_t* max_path_depth_exceeded, @@ -824,6 +849,7 @@ struct kernel_launcher { <<>>(input, path_data, element_delimiter, + null_placeholder, allow_leading_zero_numbers, allow_non_numeric_numbers, num_threads_per_row, @@ -961,6 +987,7 @@ std::vector> get_json_object_batch( std::vector const& output_ids, std::unordered_set const& keep_quotes, char element_delimiter, + char null_placeholder, int64_t scratch_size, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, @@ -1017,6 +1044,7 @@ std::vector> get_json_object_batch( kernel_launcher::exec(input, d_path_data, element_delimiter, + null_placeholder, allow_leading_zero_numbers, allow_non_numeric_numbers, d_max_path_depth_exceeded, @@ -1094,6 +1122,7 @@ std::vector> get_json_object_batch( kernel_launcher::exec(input, d_path_data, element_delimiter, + null_placeholder, allow_leading_zero_numbers, 
allow_non_numeric_numbers, d_max_path_depth_exceeded, @@ -1125,6 +1154,7 @@ std::vector> get_json_object( std::vector const& type_ids, std::unordered_set const& keep_quotes, char element_delimiter, + char null_placeholder, int64_t memory_budget_bytes, int32_t parallel_override, bool allow_leading_zero_numbers, @@ -1199,6 +1229,7 @@ std::vector> get_json_object( output_ids, keep_quotes, element_delimiter, + null_placeholder, scratch_size, allow_leading_zero_numbers, allow_non_numeric_numbers, @@ -1315,6 +1346,7 @@ std::pair, std::unique_ptr> extract_ std::unique_ptr& input, json_schema_element const& column_schema, char element_delimiter, + char null_placeholder, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1324,9 +1356,14 @@ std::pair, std::unique_ptr> extract_ cudf::size_type num_child_rows{-1}; auto children = std::move(input->release().children); for (std::size_t child_idx = 0; child_idx < children.size(); ++child_idx) { - auto& child = children[child_idx]; - auto [new_child_offsets, new_child] = extract_lists( - child, column_schema.child_types[child_idx].second, element_delimiter, stream, mr); + auto& child = children[child_idx]; + auto [new_child_offsets, new_child] = + extract_lists(child, + column_schema.child_types[child_idx].second, + element_delimiter, + null_placeholder, + stream, + mr); if (num_child_rows < 0) { num_child_rows = new_child->size(); } if (num_child_rows != new_child->size()) { // printf("num_child_rows != new_child->size(): %d != %d\n", @@ -1347,14 +1384,47 @@ std::pair, std::unique_ptr> extract_ cudf::make_structs_column(num_child_rows, std::move(new_children), 0, {}, stream, mr)}; } - // printf("before split:\n"); - // cudf::test::print(input->view()); + printf("before split:\n"); + cudf::test::print(input->view()); - auto tmp = cudf::strings::split_record(cudf::strings_column_view{input->view()}, + auto tmp = cudf::strings::split_record(cudf::strings_column_view{input->view()}, cudf::string_scalar{std::string{element_delimiter}}, -1, stream, mr); + auto split_content = tmp->release(); + auto const child_cv = split_content.children[cudf::lists_column_view::child_column_index]->view(); + auto const child_strview = cudf::strings_column_view{child_cv}; + + printf("child_cv:\n"); + cudf::test::print(child_cv); + + // Convert a row index into an invalid value (-1) if that row contains a null placeholder. + // Don't care about nulls in the child column, as they will be gathered to the output. + auto const gather_it = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [null_placeholder, + offsets = child_strview.offsets().begin(), + chars = child_strview.chars_begin(stream)] __device__(cudf::size_type idx) { + if (offsets[idx + 1] - offsets[idx] == 1) { + return chars[offsets[idx]] == null_placeholder ? 
-1 : idx; + } + return idx; + })); + auto out_child = std::move(cudf::detail::gather(cudf::table_view{{child_cv}}, + gather_it, + gather_it + child_cv.size(), + cudf::out_of_bounds_policy::NULLIFY, + stream, + mr) + ->release() + .front()); + printf("out_child:\n"); + cudf::test::print(out_child->view()); + + if (out_child->null_count() == 0) { out_child->set_null_mask(rmm::device_buffer{}, 0); } + // auto split_content = // cudf::strings::split_record(cudf::strings_column_view{input->view()}, // cudf::string_scalar{std::string{element_delimiter}}, @@ -1364,10 +1434,9 @@ std::pair, std::unique_ptr> extract_ // ->release(); // printf("after split:\n"); // cudf::test::print(tmp->view()); - auto split_content = tmp->release(); return {std::move(split_content.children[cudf::lists_column_view::offsets_column_index]), - std::move(split_content.children[cudf::lists_column_view::child_column_index])}; + std::move(out_child)}; } void assemble_column(std::size_t& column_order, @@ -1376,6 +1445,7 @@ void assemble_column(std::size_t& column_order, std::string const& name, json_schema_element const& column_schema, char element_delimiter, + char null_placeholder, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1396,6 +1466,7 @@ void assemble_column(std::size_t& column_order, child_name, child_schema, element_delimiter, + null_placeholder, stream, mr); } @@ -1422,6 +1493,7 @@ void assemble_column(std::size_t& column_order, child_name, child_schema, element_delimiter, + null_placeholder, stream, mr); } @@ -1429,8 +1501,12 @@ void assemble_column(std::size_t& column_order, // printf("line %d\n", __LINE__); // cudf::test::print(children.front()->view()); - auto [offsets, child] = extract_lists( - children.front(), column_schema.child_types.front().second, element_delimiter, stream, mr); + auto [offsets, child] = extract_lists(children.front(), + column_schema.child_types.front().second, + element_delimiter, + null_placeholder, + stream, + mr); // printf("line %d\n", __LINE__); // cudf::test::print(child->view()); @@ -1522,6 +1598,7 @@ std::vector> assemble_output( std::vector> const& schema, std::vector>& read_columns, char element_delimiter, + char null_placeholder, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1530,8 +1607,15 @@ std::vector> assemble_output( std::size_t column_order{0}; std::for_each(schema.begin(), schema.end(), [&](auto const& kv) { - assemble_column( - column_order, output, read_columns, kv.first, kv.second, element_delimiter, stream, mr); + assemble_column(column_order, + output, + read_columns, + kv.first, + kv.second, + element_delimiter, + null_placeholder, + stream, + mr); }); return output; @@ -1595,6 +1679,7 @@ std::vector> from_json_to_structs( type_ids, keep_quotes, delimiter, + null_placeholder, 1024 * 1024 * 1024 * 4L, -1, allow_leading_zero_numbers, @@ -1622,7 +1707,7 @@ std::vector> from_json_to_structs( } } - return assemble_output(schema, tmp, delimiter, stream, mr); + return assemble_output(schema, tmp, delimiter, null_placeholder, stream, mr); } } // namespace detail diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index 8ccfb27318..6937737ed7 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -172,3 +172,31 @@ TEST_F(FromJsonTest, T5) cudf::test::print(col->view()); } } + +TEST_F(FromJsonTest, T6) +{ + auto const json_string = + cudf::test::strings_column_wrapper{"{'data':[{'a':1}, {'a':2, 'b':3}, {'b':4}]}"}; + spark_rapids_jni::json_schema_element 
a{cudf::data_type{cudf::type_id::LIST}, {}}; + + a.child_types.emplace_back( + "struct", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRUCT}, {}}); + a.child_types.front().second.child_types.emplace_back( + "a", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::INT32}, {}}); + a.child_types.front().second.child_types.emplace_back( + "b", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::INT32}, {}}); + + std::vector> schema; + schema.emplace_back("data", std::move(a)); + + auto const output = spark_rapids_jni::from_json_to_structs( + cudf::strings_column_view{json_string}, schema, false, false); + + printf("\n\ninput: \n"); + cudf::test::print(json_string); + + printf("\n\noutput: \n"); + for (auto const& col : output) { + cudf::test::print(col->view()); + } +} From 1cad1f5bdc2bb57d61be0fe142098badd71fc410 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 Sep 2024 21:46:08 -0700 Subject: [PATCH 45/58] Cleanup --- src/main/cpp/src/from_json_to_structs.cu | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 5059df556b..733ee2a4ff 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -18,7 +18,7 @@ #include "get_json_object.hpp" #include "json_parser.cuh" -#include +// #include #include #include @@ -1384,8 +1384,8 @@ std::pair, std::unique_ptr> extract_ cudf::make_structs_column(num_child_rows, std::move(new_children), 0, {}, stream, mr)}; } - printf("before split:\n"); - cudf::test::print(input->view()); + // printf("before split:\n"); + // cudf::test::print(input->view()); auto tmp = cudf::strings::split_record(cudf::strings_column_view{input->view()}, cudf::string_scalar{std::string{element_delimiter}}, @@ -1396,8 +1396,8 @@ std::pair, std::unique_ptr> extract_ auto const child_cv = split_content.children[cudf::lists_column_view::child_column_index]->view(); auto const child_strview = cudf::strings_column_view{child_cv}; - printf("child_cv:\n"); - cudf::test::print(child_cv); + // printf("child_cv:\n"); + // cudf::test::print(child_cv); // Convert a row index into an invalid value (-1) if that row contains a null placeholder. // Don't care about nulls in the child column, as they will be gathered to the output. @@ -1420,8 +1420,8 @@ std::pair, std::unique_ptr> extract_ mr) ->release() .front()); - printf("out_child:\n"); - cudf::test::print(out_child->view()); + // printf("out_child:\n"); + // cudf::test::print(out_child->view()); if (out_child->null_count() == 0) { out_child->set_null_mask(rmm::device_buffer{}, 0); } @@ -1671,8 +1671,8 @@ std::vector> from_json_to_structs( // This should only run when there is LIST column. 
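  // How the two sentinel bytes are used (simplified): for a row such as
  //   {"data":[{"a":1}, {"a":2,"b":3}, {"b":4}]}
  // the values extracted for $.data[*].a are written one after another, separated by the chosen
  // delimiter, with a single placeholder byte standing in for the element that has no `a` field.
  // split_record() on the delimiter then recovers one entry per array element, and any entry that
  // is exactly the placeholder byte is turned into a null by the NULLIFY gather added earlier in
  // this series. find_delimiter() picks both bytes precisely because they never occur in the
  // input.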
char delimiter{','}, null_placeholder{'\0'}; if (has_list_type) { std::tie(delimiter, null_placeholder) = find_delimiter(input, stream); } - printf("delimiter: %c (code: %d)\n", delimiter, (int)delimiter); - printf("null_placeholder: %c (code: %d)\n", null_placeholder, (int)null_placeholder); + // printf("delimiter: %c (code: %d)\n", delimiter, (int)delimiter); + // printf("null_placeholder: %c (code: %d)\n", null_placeholder, (int)null_placeholder); auto tmp = test::get_json_object(input, json_paths, From aaa7d89bc4714bb709fd3dad0894e48c1ce5acc4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 Sep 2024 23:11:06 -0700 Subject: [PATCH 46/58] Add test --- src/main/cpp/tests/from_json.cu | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index 6937737ed7..964c159b9c 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -133,6 +133,32 @@ TEST_F(FromJsonTest, T4) } } +TEST_F(FromJsonTest, T42) +{ + // The last row is invalid (has an extra quote). + auto const json_string = + cudf::test::strings_column_wrapper{"{'data': {'a':'1', 'b':'2'}}", "{'data': ['a', 'b']}"}; + + spark_rapids_jni::json_schema_element a{cudf::data_type{cudf::type_id::LIST}, {}}; + a.child_types.emplace_back( + "string", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}}); + + std::vector> schema; + schema.emplace_back("data", std::move(a)); + + auto const output = spark_rapids_jni::from_json_to_structs( + cudf::strings_column_view{json_string}, schema, false, false); + + printf("\n\ninput: \n"); + cudf::test::print(json_string); + + printf("\n\noutput: \n"); + for (auto const& col : output) { + cudf::test::print(col->view()); + } +} + +#if 0 #include #include #include @@ -172,6 +198,7 @@ TEST_F(FromJsonTest, T5) cudf::test::print(col->view()); } } +#endif TEST_F(FromJsonTest, T6) { From 3b4c1c2759530b058532420c4b058a3384eec2e6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 Sep 2024 23:24:15 -0700 Subject: [PATCH 47/58] Fix null count equal size --- src/main/cpp/src/from_json_to_structs.cu | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 733ee2a4ff..d9040de1a0 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -1387,12 +1387,18 @@ std::pair, std::unique_ptr> extract_ // printf("before split:\n"); // cudf::test::print(input->view()); - auto tmp = cudf::strings::split_record(cudf::strings_column_view{input->view()}, + auto tmp = cudf::strings::split_record(cudf::strings_column_view{input->view()}, cudf::string_scalar{std::string{element_delimiter}}, -1, stream, mr); - auto split_content = tmp->release(); + auto split_content = tmp->release(); + + if (input->size() == input->null_count()) { + return {std::move(split_content.children[cudf::lists_column_view::offsets_column_index]), + std::move(split_content.children[cudf::lists_column_view::child_column_index])}; + } + auto const child_cv = split_content.children[cudf::lists_column_view::child_column_index]->view(); auto const child_strview = cudf::strings_column_view{child_cv}; @@ -1412,6 +1418,8 @@ std::pair, std::unique_ptr> extract_ } return idx; })); + + // TODO: report issue when the input is strings column has null == size auto out_child = std::move(cudf::detail::gather(cudf::table_view{{child_cv}}, gather_it, gather_it + 
child_cv.size(), From f8bf8ddbf8995a020ebb65f8e23aa555e26d397e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 Sep 2024 10:40:38 -0700 Subject: [PATCH 48/58] Fix struct under list --- src/main/cpp/src/from_json_to_structs.cu | 50 ++++++++++++++++++------ src/main/cpp/tests/from_json.cu | 47 +++++++++++++++++++++- 2 files changed, 82 insertions(+), 15 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index d9040de1a0..102675c06b 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -1252,6 +1252,7 @@ void travel_path( std::vector& type_ids, std::unordered_set& keep_quotes, bool& has_list_type, + bool parent_is_list, std::string const& name, json_schema_element const& column_schema) { @@ -1268,17 +1269,22 @@ void travel_path( } else { type_ids.push_back(column_schema.type.id()); if (column_schema.type.id() == cudf::type_id::STRUCT) { - if (has_list_type) { + // STRUCT directly under array does not have name field. + if (parent_is_list) { popped = true; - current_path.pop_back(); + current_path.pop_back(); // remove the last NAMED instruction. } paths.push_back(current_path); // this will copy - // printf("column_schema type: STRUCT\n"); - if (column_schema.type.id() == cudf::type_id::STRUCT) { - for (auto const& [child_name, child_schema] : column_schema.child_types) { - travel_path( - paths, current_path, type_ids, keep_quotes, has_list_type, child_name, child_schema); - } + // printf("column_schema type: STRUCT\n"); + for (auto const& [child_name, child_schema] : column_schema.child_types) { + travel_path(paths, + current_path, + type_ids, + keep_quotes, + has_list_type, + false /*parent_is_list*/, + child_name, + child_schema); } } else if (column_schema.type.id() == cudf::type_id::LIST) { // printf("column_schema type: LIST\n"); @@ -1302,8 +1308,14 @@ void travel_path( // Only add a path name if this column is not under a list type. if (has_struct_child) { for (auto const& [child_name, child_schema] : column_schema.child_types) { - travel_path( - paths, current_path, type_ids, keep_quotes, has_list_type, child_name, child_schema); + travel_path(paths, + current_path, + type_ids, + keep_quotes, + has_list_type, + true /*parent_is_list*/, + child_name, + child_schema); } } else { auto const child_type = column_schema.child_types.front().second.type; @@ -1312,7 +1324,7 @@ void travel_path( type_ids.push_back(child_type.id()); } - current_path.pop_back(); + current_path.pop_back(); // remove WILDCARD } else { // TODO @@ -1320,7 +1332,7 @@ void travel_path( } } // if (column_schema.type.id() != cudf::type_id::STRUCT || !has_list_type) { - if (column_schema.type.id() != cudf::type_id::STRUCT || !popped) { current_path.pop_back(); } + if (!popped) { current_path.pop_back(); } } std::tuple>>, @@ -1336,7 +1348,14 @@ flatten_schema_to_paths(std::vector> std::vector> current_path; std::for_each(schema.begin(), schema.end(), [&](auto const& kv) { - travel_path(paths, current_path, type_ids, keep_quotes, has_list_type, kv.first, kv.second); + travel_path(paths, + current_path, + type_ids, + keep_quotes, + has_list_type, + false /*parent_is_list*/, + kv.first, + kv.second); }); return {std::move(paths), std::move(type_ids), std::move(keep_quotes), has_list_type}; @@ -1676,6 +1695,9 @@ std::vector> from_json_to_structs( #endif + // array>> + // [{'a': {'b': 1, 'c' : 2}, 'x': []}, {}] + // This should only run when there is LIST column. 
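  // Why the parent_is_list flag introduced in this commit matters for inputs like the example
  // jotted above: a STRUCT that sits directly under an array element is addressed by the [*]
  // wildcard rather than by a field name, so travel_path() now drops the synthetic NAMED
  // instruction only when the immediate parent is a list, instead of keying off the sticky
  // has_list_type flag that stays set for the rest of the traversal once any list has been seen.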
char delimiter{','}, null_placeholder{'\0'}; if (has_list_type) { std::tie(delimiter, null_placeholder) = find_delimiter(input, stream); } @@ -1712,6 +1734,8 @@ std::vector> from_json_to_structs( printf("%c", c); } printf("\n"); + + // cudf::test::print(tmp[i]->view()); } } diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index 964c159b9c..c7689a59f2 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -202,8 +202,8 @@ TEST_F(FromJsonTest, T5) TEST_F(FromJsonTest, T6) { - auto const json_string = - cudf::test::strings_column_wrapper{"{'data':[{'a':1}, {'a':2, 'b':3}, {'b':4}]}"}; + auto const json_string = cudf::test::strings_column_wrapper{"{'data':[{'a':1}"}; + // cudf::test::strings_column_wrapper{"{'data':[{'a':1}, {'a':2, 'b':3}, {'b':4}]}"}; spark_rapids_jni::json_schema_element a{cudf::data_type{cudf::type_id::LIST}, {}}; a.child_types.emplace_back( @@ -225,5 +225,48 @@ TEST_F(FromJsonTest, T6) printf("\n\noutput: \n"); for (auto const& col : output) { cudf::test::print(col->view()); + printf("\n"); + } +} + +TEST_F(FromJsonTest, T7) +{ + auto const json_string = cudf::test::strings_column_wrapper{ + R"({"id": 1,"name": "John","tags": ["developer", "python"],"details": {"age": 30,"address": {"city": "San Francisco","zip": "94105"}}})"}; + + // id INT, name STRING, tags ARRAY, details STRUCT> + spark_rapids_jni::json_schema_element a{cudf::data_type{cudf::type_id::STRUCT}, {}}; + a.child_types.emplace_back( + "age", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::INT32}, {}}); + a.child_types.emplace_back( + "address", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRUCT}, {}}); + a.child_types.back().second.child_types.emplace_back( + "city", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}}); + a.child_types.back().second.child_types.emplace_back( + "zip", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}}); + + spark_rapids_jni::json_schema_element b{cudf::data_type{cudf::type_id::LIST}, {}}; + b.child_types.emplace_back( + "tags", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}}); + + std::vector> schema; + schema.emplace_back( + "id", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::INT32}, {}}); + schema.emplace_back( + "name", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}}); + schema.emplace_back("tags", std::move(b)); + schema.emplace_back("details", std::move(a)); + + auto const output = spark_rapids_jni::from_json_to_structs( + cudf::strings_column_view{json_string}, schema, false, false); + + printf("\n\ninput: \n"); + cudf::test::print(json_string); + + printf("\n\noutput: \n"); + for (auto const& col : output) { + cudf::test::print(col->view()); + printf("\n"); } } From d344eec46ffc992155b6b6fe8d69c9898bccbb57 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 Sep 2024 12:01:48 -0700 Subject: [PATCH 49/58] Cleanup --- src/main/cpp/src/get_json_object.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index 40dc02eec5..5da408c51d 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -1086,10 +1086,10 @@ std::vector> get_json_object_batch( out_stringviews.emplace_back(rmm::device_uvector>{ static_cast(input.size()), stream}); - printf("idx: %d, output_ids[idx]: %d\n", 
(int)idx, (int)output_ids[idx]); - printf("keep_quotes.find(output_ids[idx]) != keep_quotes.end(): %d\n", - (int)(keep_quotes.find(output_ids[idx]) != keep_quotes.end())); - fflush(stdout); + // printf("idx: %d, output_ids[idx]: %d\n", (int)idx, (int)output_ids[idx]); + // printf("keep_quotes.find(output_ids[idx]) != keep_quotes.end(): %d\n", + // (int)(keep_quotes.find(output_ids[idx]) != keep_quotes.end())); + // fflush(stdout); h_path_data.emplace_back( json_path_processing_data{d_json_paths[idx], From 3c6881482329bfe3e3ff20db95794ff64be74a61 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 Sep 2024 13:26:28 -0700 Subject: [PATCH 50/58] Remove redundant path --- src/main/cpp/src/from_json_to_structs.cu | 35 ++++++++++++++++++------ 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 102675c06b..2f3597c57d 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -1267,8 +1267,9 @@ void travel_path( paths.push_back(current_path); // this will copy type_ids.push_back(column_schema.type.id()); } else { - type_ids.push_back(column_schema.type.id()); if (column_schema.type.id() == cudf::type_id::STRUCT) { + type_ids.push_back(column_schema.type.id()); + // STRUCT directly under array does not have name field. if (parent_is_list) { popped = true; @@ -1292,11 +1293,6 @@ void travel_path( CUDF_EXPECTS(column_schema.child_types.size() == 1, "TODO"); has_list_type = true; - // TODO: is this needed, if there is no struct child? - paths.push_back(current_path); // this will copy - - current_path.emplace_back(path_instruction_type::WILDCARD, "*", -1); - bool has_struct_child{false}; for (auto const& [child_name, child_schema] : column_schema.child_types) { if (child_schema.type.id() == cudf::type_id::STRUCT) { @@ -1305,6 +1301,14 @@ void travel_path( } } + // TODO: is this needed, if there is no struct child? + if (has_struct_child) { + paths.push_back(current_path); // this will copy + type_ids.push_back(column_schema.type.id()); + } + + current_path.emplace_back(path_instruction_type::WILDCARD, "*", -1); + // Only add a path name if this column is not under a list type. if (has_struct_child) { for (auto const& [child_name, child_schema] : column_schema.child_types) { @@ -1505,12 +1509,25 @@ void assemble_column(std::size_t& column_order, } else if (column_schema.type.id() == cudf::type_id::LIST) { // TODO: split LIST into child column // For now, just output as a strings column. + + bool has_struct_child{false}; + for (auto const& [child_name, child_schema] : column_schema.child_types) { + if (child_schema.type.id() == cudf::type_id::STRUCT) { + has_struct_child = true; + break; + } + } + auto const num_rows = read_columns[column_order]->size(); auto const null_count = read_columns[column_order]->null_count(); - auto const null_mask = std::move(read_columns[column_order]->release().null_mask); + std::unique_ptr null_mask{nullptr}; // printf("num rows: %d\n", num_rows); - ++column_order; + // If there is struct child, ..... 
TODO + if (has_struct_child) { + null_mask = std::move(read_columns[column_order]->release().null_mask); + ++column_order; + } std::vector> children; for (auto const& [child_name, child_schema] : column_schema.child_types) { @@ -1541,6 +1558,8 @@ void assemble_column(std::size_t& column_order, // cudf::test::print(offsets->view()); // TODO: fix null mask + if (!has_struct_child) { null_mask = std::move(children.front()->release().null_mask); } + output.emplace_back(cudf::make_lists_column(num_rows, std::move(offsets), std::move(child), From c1755834166cae283d6548462da10cccabb403ea Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 Sep 2024 14:33:49 -0700 Subject: [PATCH 51/58] Fix parsing non-numeric number --- src/main/cpp/src/json_parser.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/json_parser.cuh b/src/main/cpp/src/json_parser.cuh index aad86fb1ba..8055ed88de 100644 --- a/src/main/cpp/src/json_parser.cuh +++ b/src/main/cpp/src/json_parser.cuh @@ -1034,8 +1034,8 @@ class json_parser { auto const matched_sign = chars[curr_pos] == '-' || chars[curr_pos] == '+'; if (matched_sign) { ++curr_pos; } - if ((curr_pos + 2) < chars.size() && chars[curr_pos] == 'I' && chars[curr_pos + 1] == 'N' && - chars[curr_pos + 2] == 'F') { + if (matched_sign && (curr_pos + 2) < chars.size() && chars[curr_pos] == 'I' && + chars[curr_pos + 1] == 'N' && chars[curr_pos + 2] == 'F') { current_token = json_token::VALUE_NON_NUMERIC_FLOAT; curr_pos += 3; number_token_len = curr_pos - current_token_start_pos; From 0f0403204549104a40bd3985e59d7ca5d659b71b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 Sep 2024 14:59:08 -0700 Subject: [PATCH 52/58] Add `allow_unquoted_control_chars` option --- src/main/cpp/src/JSONUtilsJni.cpp | 6 +++-- src/main/cpp/src/from_json.hpp | 1 + src/main/cpp/src/from_json_to_structs.cu | 21 ++++++++++++++-- src/main/cpp/src/json_parser.cuh | 24 +++++++++++++++---- src/main/cpp/tests/from_json.cu | 16 ++++++------- .../nvidia/spark/rapids/jni/JSONUtils.java | 8 ++++--- .../spark/rapids/jni/GetJsonObjectTest.java | 2 +- 7 files changed, 57 insertions(+), 21 deletions(-) diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index 5b0dd8282a..0aade9b575 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -210,7 +210,8 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, jintArray j_types, jintArray j_scales, jboolean allow_leading_zero_numbers, - jboolean allow_non_numeric_numbers) + jboolean allow_non_numeric_numbers, + jboolean allow_unquoted_control_chars) { JNI_NULL_CHECK(env, j_input, "j_input is null", 0); JNI_NULL_CHECK(env, j_col_names, "j_col_names is null", 0); @@ -265,7 +266,8 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env, auto output = spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{*input_cv}, schema, allow_leading_zero_numbers, - allow_non_numeric_numbers); + allow_non_numeric_numbers, + allow_unquoted_control_chars); // printf("JNI line %d\n", __LINE__); // fflush(stdout); diff --git a/src/main/cpp/src/from_json.hpp b/src/main/cpp/src/from_json.hpp index 706b7c25e4..43b7cafa43 100644 --- a/src/main/cpp/src/from_json.hpp +++ b/src/main/cpp/src/from_json.hpp @@ -45,6 +45,7 @@ std::vector> from_json_to_structs( std::vector> const& schema, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, + bool allow_unquoted_control_chars, rmm::cuda_stream_view stream = 
cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 2f3597c57d..92b87219da 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -771,6 +771,7 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL char null_placeholder, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, + bool allow_unquoted_control_chars, std::size_t num_threads_per_row, int8_t* max_path_depth_exceeded) { @@ -791,6 +792,7 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL json_parser p{char_range{str}}; p.set_allow_leading_zero_numbers(allow_leading_zero_numbers); p.set_allow_non_numeric_numbers(allow_non_numeric_numbers); + p.set_allow_unquoted_control_chars(allow_unquoted_control_chars); thrust::tie(is_valid, out_size) = evaluate_path(p, path.path_commands, path.type_id, @@ -830,6 +832,7 @@ struct kernel_launcher { char null_placeholder, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, + bool allow_unquoted_control_chars, int8_t* max_path_depth_exceeded, rmm::cuda_stream_view stream) { @@ -852,6 +855,7 @@ struct kernel_launcher { null_placeholder, allow_leading_zero_numbers, allow_non_numeric_numbers, + allow_unquoted_control_chars, num_threads_per_row, max_path_depth_exceeded); } @@ -991,6 +995,7 @@ std::vector> get_json_object_batch( int64_t scratch_size, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, + bool allow_unquoted_control_chars, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1047,6 +1052,7 @@ std::vector> get_json_object_batch( null_placeholder, allow_leading_zero_numbers, allow_non_numeric_numbers, + allow_unquoted_control_chars, d_max_path_depth_exceeded, stream); auto h_error_check = cudf::detail::make_host_vector_sync(d_error_check, stream); @@ -1125,6 +1131,7 @@ std::vector> get_json_object_batch( null_placeholder, allow_leading_zero_numbers, allow_non_numeric_numbers, + allow_unquoted_control_chars, d_max_path_depth_exceeded, stream); h_error_check = cudf::detail::make_host_vector_sync(d_error_check, stream); @@ -1159,6 +1166,7 @@ std::vector> get_json_object( int32_t parallel_override, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, + bool allow_unquoted_control_chars, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1233,6 +1241,7 @@ std::vector> get_json_object( scratch_size, allow_leading_zero_numbers, allow_non_numeric_numbers, + allow_unquoted_control_chars, stream, mr); for (std::size_t i = 0; i < tmp.size(); i++) { @@ -1672,6 +1681,7 @@ std::vector> from_json_to_structs( std::vector> const& schema, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, + bool allow_unquoted_control_chars, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -1733,6 +1743,7 @@ std::vector> from_json_to_structs( -1, allow_leading_zero_numbers, allow_non_numeric_numbers, + allow_unquoted_control_chars, stream, mr); // printf("line %d\n", __LINE__); @@ -1793,12 +1804,18 @@ std::vector> from_json_to_structs( std::vector> const& schema, bool allow_leading_zero_numbers, bool allow_non_numeric_numbers, + bool allow_unquoted_control_chars, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::from_json_to_structs( - input, schema, allow_leading_zero_numbers, allow_non_numeric_numbers, stream, mr); + return 
detail::from_json_to_structs(input, + schema, + allow_leading_zero_numbers, + allow_non_numeric_numbers, + allow_unquoted_control_chars, + stream, + mr); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/json_parser.cuh b/src/main/cpp/src/json_parser.cuh index 8055ed88de..7304ca0771 100644 --- a/src/main/cpp/src/json_parser.cuh +++ b/src/main/cpp/src/json_parser.cuh @@ -228,7 +228,9 @@ class json_parser { curr_pos(0), current_token(json_token::INIT), max_depth_exceeded(false), - allow_leading_zero_numbers{false} + allow_leading_zero_numbers{false}, + allow_non_numeric_numbers{false}, + allow_unquoted_control_chars{true} { } @@ -422,7 +424,7 @@ class json_parser { { // TODO eventually chars should be a reader so we can just pass it in... char_range_reader reader(chars, curr_pos); - auto [success, end_char_pos] = try_parse_string(reader); + auto [success, end_char_pos] = try_parse_string(reader, allow_unquoted_control_chars); if (success) { // TODO remove end_char_pos, and just get it from the reader... curr_pos = end_char_pos; @@ -562,7 +564,7 @@ class json_parser { // path 4: safe code point // handle single unescaped " char; happens when string is quoted by char ' - // e.g.: 'A"' string, escape to "A\\"" (5 chars: " A \ " ") + // e.g.: 'A"' string, escapcontrole to "A\\"" (5 chars: " A \ " ") if ('\"' == c && escape_style::ESCAPED == w_style) { if (copy_destination != nullptr) { *copy_destination++ = '\\'; } output_size_bytes++; @@ -625,6 +627,7 @@ class json_parser { */ static __device__ inline std::pair try_parse_string( char_range_reader& str, + bool allow_unquoted_control_chars, char_range_reader to_match = char_range_reader(char_range::null()), escape_style w_style = escape_style::UNESCAPED) { @@ -655,6 +658,8 @@ class json_parser { return std::make_pair(true, str.pos()); } else if (v >= 0 && v < 32) { + if (!allow_unquoted_control_chars) { return {false, 0}; } + // path 2: unescaped control char // copy if enabled, escape mode, write more chars @@ -1264,7 +1269,7 @@ class json_parser { // TODO eventually chars should be a reader so we can just pass it in... char_range_reader reader(chars, curr_pos); current_token_start_pos = curr_pos; - auto [success, end_char_pos] = try_parse_string(reader); + auto [success, end_char_pos] = try_parse_string(reader, allow_unquoted_control_chars); if (success) { // TODO remove end_char_pos, and just get it from the reader... 
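    // Note on the new flag threaded through this call: when allow_unquoted_control_chars is
    // false, try_parse_string() rejects any raw (unescaped) control byte in the range 0-31
    // inside a quoted string, so a value such as {"a": "x<TAB>y"} containing a literal tab byte
    // fails to parse; when it is true (the constructor default above), the byte is accepted as
    // before. This backs the allowUnquotedControlChars option exposed on the Java
    // fromJsonToStructs() entry point.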
curr_pos = end_char_pos; @@ -1662,7 +1667,8 @@ class json_parser { if (json_token::FIELD_NAME == current_token) { char_range_reader reader(current_range()); char_range_reader to_match(name); - auto [b, end_pos] = try_parse_string(reader, to_match, escape_style::UNESCAPED); + auto [b, end_pos] = + try_parse_string(reader, allow_unquoted_control_chars, to_match, escape_style::UNESCAPED); return b; } else { return false; @@ -1772,6 +1778,11 @@ class json_parser { allow_non_numeric_numbers = state; } + __device__ inline void set_allow_unquoted_control_chars(bool state) + { + allow_unquoted_control_chars = state; + } + private: char_range const chars; cudf::size_type curr_pos; @@ -1800,6 +1811,9 @@ class json_parser { // TODO bool allow_non_numeric_numbers; + + // TODO + bool allow_unquoted_control_chars; }; } // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index c7689a59f2..dc3f84e2ac 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -43,7 +43,7 @@ TEST_F(FromJsonTest, T1) schema.emplace_back("a", std::move(a)); auto const output = spark_rapids_jni::from_json_to_structs( - cudf::strings_column_view{json_string}, schema, false, false); + cudf::strings_column_view{json_string}, schema, false, false, false); printf("\n\ninput: \n"); cudf::test::print(json_string); @@ -72,7 +72,7 @@ TEST_F(FromJsonTest, T2) schema.emplace_back("a", std::move(a)); auto const output = spark_rapids_jni::from_json_to_structs( - cudf::strings_column_view{json_string}, schema, false, false); + cudf::strings_column_view{json_string}, schema, false, false, false); printf("\n\ninput: \n"); cudf::test::print(json_string); @@ -98,7 +98,7 @@ TEST_F(FromJsonTest, T3) schema.emplace_back("data", std::move(a)); auto const output = spark_rapids_jni::from_json_to_structs( - cudf::strings_column_view{json_string}, schema, false, false); + cudf::strings_column_view{json_string}, schema, false, false, false); printf("\n\ninput: \n"); cudf::test::print(json_string); @@ -122,7 +122,7 @@ TEST_F(FromJsonTest, T4) schema.emplace_back("data", std::move(a)); auto const output = spark_rapids_jni::from_json_to_structs( - cudf::strings_column_view{json_string}, schema, false, false); + cudf::strings_column_view{json_string}, schema, false, false, false); printf("\n\ninput: \n"); cudf::test::print(json_string); @@ -147,7 +147,7 @@ TEST_F(FromJsonTest, T42) schema.emplace_back("data", std::move(a)); auto const output = spark_rapids_jni::from_json_to_structs( - cudf::strings_column_view{json_string}, schema, false, false); + cudf::strings_column_view{json_string}, schema, false, false, false); printf("\n\ninput: \n"); cudf::test::print(json_string); @@ -188,7 +188,7 @@ TEST_F(FromJsonTest, T5) schema.emplace_back("CGEGPD", std::move(a)); auto const output = spark_rapids_jni::from_json_to_structs( - cudf::strings_column_view{json_string}, schema, false, false); + cudf::strings_column_view{json_string}, schema, false, false, false); printf("\n\ninput: \n"); cudf::test::print(json_string); @@ -217,7 +217,7 @@ TEST_F(FromJsonTest, T6) schema.emplace_back("data", std::move(a)); auto const output = spark_rapids_jni::from_json_to_structs( - cudf::strings_column_view{json_string}, schema, false, false); + cudf::strings_column_view{json_string}, schema, false, false, false); printf("\n\ninput: \n"); cudf::test::print(json_string); @@ -259,7 +259,7 @@ TEST_F(FromJsonTest, T7) schema.emplace_back("details", std::move(a)); auto const output = 
spark_rapids_jni::from_json_to_structs( - cudf::strings_column_view{json_string}, schema, false, false); + cudf::strings_column_view{json_string}, schema, false, false, false); printf("\n\ninput: \n"); cudf::test::print(json_string); diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 60d482b663..011464c9e4 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -169,12 +169,13 @@ public static ColumnVector extractRawMapFromJsonString(ColumnView input) { */ public static Table fromJsonToStructs(ColumnView input, Schema schema, boolean allowNumericLeadingZeros, - boolean allowNonNumericNumbers) { + boolean allowNonNumericNumbers, + boolean allowUnquotedControlChars) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; return new Table(fromJsonToStructs(input.getNativeView(), schema.getFlattenedColumnNames(), schema.getFlattenedNumChildren(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), - allowNumericLeadingZeros, allowNonNumericNumbers)); + allowNumericLeadingZeros, allowNonNumericNumbers, allowUnquotedControlChars)); } public static ColumnVector isNullOrEmpty(ColumnVector input) { @@ -205,7 +206,8 @@ private static native long[] fromJsonToStructs(long input, String[] columnNames, int[] dTypeIds, int[] dTypeScales, boolean allowNumericLeadingZeros, - boolean allowNonNumericNumbers); + boolean allowNonNumericNumbers, + boolean allowUnquotedControlChars); private static native long isNullOrEmpty(long input); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java index aac1137042..b04b3d2a75 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java @@ -814,7 +814,7 @@ void testFromJSON() { try (ColumnVector input = ColumnVector.fromStrings("{'a': '1', 'b': '2'}"); ai.rapids.cudf.Table expected = new ai.rapids.cudf.Table.TestBuilder().column("1").build(); - ai.rapids.cudf.Table actual = JSONUtils.fromJsonToStructs(input, schema, false, false)) { + ai.rapids.cudf.Table actual = JSONUtils.fromJsonToStructs(input, schema, false, false, false)) { assertTablesAreEqual(expected, actual); } } From 3feefae8863a6ea819056888daa1cd4b4ca98635 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 Sep 2024 20:11:46 -0700 Subject: [PATCH 53/58] Fix output array/struct --- src/main/cpp/src/from_json_to_structs.cu | 33 ++++++++++++++++++------ 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 92b87219da..be2ca10e8e 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -111,6 +111,12 @@ class json_generator { is_curr_array_empty = true; } + __device__ void write_start_object(char* out_begin) + { + out_begin[offset + output_len] = '{'; + output_len++; + } + // write ] __device__ void write_end_array(char* out_begin) { @@ -464,6 +470,9 @@ __device__ thrust::pair evaluate_path( // case (VALUE_STRING, Nil) if style == RawStyle // case path 1 if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path.size())) { + if (path_type_id == cudf::type_id::STRUCT || path_type_id == cudf::type_id::LIST) { + return {false, 0}; + } // there is no array wildcard or slice parent, 
emit this string without // quotes write current string in parser to generator ctx.g.try_write_comma(out_buf, element_delimiter); @@ -512,14 +521,22 @@ __device__ thrust::pair evaluate_path( if (path_type_id == cudf::type_id::STRUCT) { // Or copy current structure? if (!p.try_skip_children()) { return {false, 0}; } - } else if (!(ctx.g.copy_current_structure(p, nullptr, ','))) { - // not copy only if there is struct? - return {false, 0}; + // Just write anything into the output, to mark the output as a non-null row. + // Such output will be discarded anyway. + if (ctx.g.get_output_len() == 0) { ctx.g.write_start_object(out_buf); } + + } else { + if (!(ctx.g.copy_current_structure(p, nullptr, ','))) { + // not copy only if there is struct? + return {false, 0}; + } + // Just write anything into the output, to mark the output as a non-null row. + // Such output will be discarded anyway. + // length == 1 due to called write_first_start_array_without_output before + ctx.g.set_output_len(0); + ctx.g.write_start_array(out_buf, element_delimiter); } - // Just write anything into the output, to mark the output as a non-null row. - // Such output will be discarded anyway. - ctx.g.write_start_array(out_buf, element_delimiter); } else if (!(ctx.g.copy_current_structure(p, out_buf, element_delimiter))) { return {false, 0}; } @@ -1692,7 +1709,7 @@ std::vector> from_json_to_structs( // printf("line %d\n", __LINE__); // fflush(stdout); -#if 0 +#if 1 int count{0}; for (auto const& path : json_paths) { printf("\n\npath (%d/%d): \n", count++, (int)json_paths.size()); @@ -1749,7 +1766,7 @@ std::vector> from_json_to_structs( // printf("line %d\n", __LINE__); // fflush(stdout); - if constexpr (0) { + if constexpr (1) { for (std::size_t i = 0; i < tmp.size(); ++i) { auto out = cudf::strings_column_view{tmp[i]->view()}; auto ptr = out.chars_begin(stream); From feeeffce96cc54d334b9deaf6006bd487ef4b47c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 Sep 2024 20:11:50 -0700 Subject: [PATCH 54/58] Add test --- src/main/cpp/tests/from_json.cu | 50 +++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index dc3f84e2ac..b4a54c13d9 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -109,6 +109,56 @@ TEST_F(FromJsonTest, T3) } } +TEST_F(FromJsonTest, T32) +{ + // The last row is invalid (has an extra quote). + auto const json_string = cudf::test::strings_column_wrapper{"{'data': '1'}"}; + + spark_rapids_jni::json_schema_element a{cudf::data_type{cudf::type_id::STRUCT}, {}}; + a.child_types.emplace_back( + "b", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}}); + a.child_types.emplace_back( + "c", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}}); + + std::vector> schema; + schema.emplace_back("data", std::move(a)); + + auto const output = spark_rapids_jni::from_json_to_structs( + cudf::strings_column_view{json_string}, schema, false, false, false); + + printf("\n\ninput: \n"); + cudf::test::print(json_string); + + printf("\n\noutput: \n"); + for (auto const& col : output) { + cudf::test::print(col->view()); + } +} + +TEST_F(FromJsonTest, T33) +{ + // The last row is invalid (has an extra quote). 
+ auto const json_string = cudf::test::strings_column_wrapper{"{'data': []}", "{'data': 1}"}; + + spark_rapids_jni::json_schema_element a{cudf::data_type{cudf::type_id::LIST}, {}}; + a.child_types.emplace_back( + "", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}}); + + std::vector> schema; + schema.emplace_back("data", std::move(a)); + + auto const output = spark_rapids_jni::from_json_to_structs( + cudf::strings_column_view{json_string}, schema, false, false, false); + + printf("\n\ninput: \n"); + cudf::test::print(json_string); + + printf("\n\noutput: \n"); + for (auto const& col : output) { + cudf::test::print(col->view()); + } +} + TEST_F(FromJsonTest, T4) { // The last row is invalid (has an extra quote). From a6636062e00927048172e59393a1ee5d73a9e65b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 Sep 2024 10:47:08 -0700 Subject: [PATCH 55/58] Fix empty list --- src/main/cpp/src/from_json_to_structs.cu | 139 +++++++++++------------ 1 file changed, 69 insertions(+), 70 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index be2ca10e8e..7652529a7c 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -44,8 +44,10 @@ #include #include +#include #include #include +#include #include #include @@ -523,18 +525,23 @@ __device__ thrust::pair evaluate_path( if (!p.try_skip_children()) { return {false, 0}; } // Just write anything into the output, to mark the output as a non-null row. // Such output will be discarded anyway. - if (ctx.g.get_output_len() == 0) { ctx.g.write_start_object(out_buf); } + ctx.g.set_output_len(1); } else { - if (!(ctx.g.copy_current_structure(p, nullptr, ','))) { - // not copy only if there is struct? - return {false, 0}; + // This is just write [ + // if (!(ctx.g.copy_current_structure(p, nullptr, ','))) { + // // not copy only if there is struct? + // return {false, 0}; + // } + + if (p.next_token() == json_token::END_ARRAY) { + ctx.g.set_output_len(0); + ctx.g.write_null_placeholder(out_buf, null_placeholder); + } else { + // Just write anything into the output, to mark the output as a non-null row. + // Such output will be discarded anyway. + ctx.g.set_output_len(1); } - // Just write anything into the output, to mark the output as a non-null row. - // Such output will be discarded anyway. - // length == 1 due to called write_first_start_array_without_output before - ctx.g.set_output_len(0); - ctx.g.write_start_array(out_buf, element_delimiter); } } else if (!(ctx.g.copy_current_structure(p, out_buf, element_delimiter))) { @@ -647,6 +654,10 @@ __device__ thrust::pair evaluate_path( {ctx.path.data() + 1, ctx.path.size() - 1}, true); } else { + if (path_type_id == cudf::type_id::LIST) { + ctx.g.set_output_len(0); + ctx.g.write_null_placeholder(out_buf, null_placeholder); + } // ctx.g.write_end_array(out_buf); ctx.task_is_done = true; } @@ -1328,10 +1339,12 @@ void travel_path( } // TODO: is this needed, if there is no struct child? - if (has_struct_child) { - paths.push_back(current_path); // this will copy - type_ids.push_back(column_schema.type.id()); - } + // Needed, since we need to mark empty list in the kernel. 
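// Illustrative host-side sketch (assumed semantics, not this patch's device code) of how
// one row of the intermediate per-path strings column is read back once empty lists are
// marked in the kernel with a dedicated placeholder byte:
//   null row                        -> null list
//   a single null_placeholder byte  -> empty list []
//   "v1<delim>v2<delim>..."         -> ["v1", "v2", ...]
// find_delimiter is expected to pick delimiter/placeholder bytes that never occur in the
// real data, so the one-byte marker cannot collide with a genuine one-character value.
#include <optional>
#include <string>
#include <vector>

std::optional<std::vector<std::string>> decode_list_row(std::optional<std::string> const& row,
                                                        char delimiter,
                                                        char null_placeholder)
{
  if (!row) { return std::nullopt; }  // null input row -> null list
  if (row->size() == 1 && row->front() == null_placeholder) {
    return std::vector<std::string>{};  // empty-list marker
  }
  std::vector<std::string> values;
  std::size_t start = 0;
  while (true) {
    auto const pos = row->find(delimiter, start);
    values.push_back(row->substr(start, pos - start));
    if (pos == std::string::npos) { break; }
    start = pos + 1;
  }
  return values;
}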
+ // if (has_struct_child) { + + paths.push_back(current_path); // this will copy + type_ids.push_back(column_schema.type.id()); + // } current_path.emplace_back(path_instruction_type::WILDCARD, "*", -1); @@ -1399,48 +1412,15 @@ std::pair, std::unique_ptr> extract_ rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (column_schema.type.id() == cudf::type_id::STRUCT) { - std::unique_ptr offsets{nullptr}; - std::vector> new_children; - cudf::size_type num_child_rows{-1}; - auto children = std::move(input->release().children); - for (std::size_t child_idx = 0; child_idx < children.size(); ++child_idx) { - auto& child = children[child_idx]; - auto [new_child_offsets, new_child] = - extract_lists(child, - column_schema.child_types[child_idx].second, - element_delimiter, - null_placeholder, - stream, - mr); - if (num_child_rows < 0) { num_child_rows = new_child->size(); } - if (num_child_rows != new_child->size()) { - // printf("num_child_rows != new_child->size(): %d != %d\n", - // (int)num_child_rows, - // (int)new_child->size()); - } - CUDF_EXPECTS(num_child_rows == new_child->size(), "num_child_rows != new_child->size()"); - - if (!offsets) { offsets = std::move(new_child_offsets); } - new_children.emplace_back(std::move(new_child)); - } - - // return cudf::make_structs_column( - // num_child_rows, std::move(children), null_count, std::move(*null_mask), stream, - // mr); - // TODO: fix null mask - return {std::move(offsets), - cudf::make_structs_column(num_child_rows, std::move(new_children), 0, {}, stream, mr)}; - } - // printf("before split:\n"); // cudf::test::print(input->view()); - auto tmp = cudf::strings::split_record(cudf::strings_column_view{input->view()}, + auto tmp = cudf::strings::split_record(cudf::strings_column_view{input->view()}, cudf::string_scalar{std::string{element_delimiter}}, -1, stream, mr); + auto split_content = tmp->release(); if (input->size() == input->null_count()) { @@ -1536,24 +1516,44 @@ void assemble_column(std::size_t& column_order, // TODO: split LIST into child column // For now, just output as a strings column. - bool has_struct_child{false}; - for (auto const& [child_name, child_schema] : column_schema.child_types) { - if (child_schema.type.id() == cudf::type_id::STRUCT) { - has_struct_child = true; - break; - } - } - - auto const num_rows = read_columns[column_order]->size(); - auto const null_count = read_columns[column_order]->null_count(); - std::unique_ptr null_mask{nullptr}; + auto const num_rows = read_columns[column_order]->size(); + auto const cv = read_columns[column_order]->view(); + auto const cv_null_count = read_columns[column_order]->null_count(); + auto cv_null_mask = std::move(read_columns[column_order]->release().null_mask); + ++column_order; - // printf("num rows: %d\n", num_rows); - // If there is struct child, ..... 
TODO - if (has_struct_child) { - null_mask = std::move(read_columns[column_order]->release().null_mask); - ++column_order; - } + auto const d_col_ptr = cudf::column_device_view::create(cv, stream); + rmm::device_uvector empty_list_markers(cv.size(), stream); + thrust::tabulate(rmm::exec_policy(stream), + empty_list_markers.begin(), + empty_list_markers.end(), + [null_placeholder, data = *d_col_ptr] __device__(auto idx) -> bool { + if (data.is_null(idx)) { return false; } + auto const d_str = data.element(idx); + return d_str.size_bytes() == 1 && d_str[0] == null_placeholder; + }); + + auto const has_empty_list = thrust::count_if(rmm::exec_policy(stream), + empty_list_markers.begin(), + empty_list_markers.end(), + thrust::identity{}) > 0; + auto [null_mask, null_count] = [&] { + if (has_empty_list) { + return cudf::detail::valid_if( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(cv.size()), + [is_empty_list = empty_list_markers.begin(), + data = *d_col_ptr] __device__(auto idx) -> bool { + if (is_empty_list[idx]) { return true; } // data.is_valid(idx) should be false here + return data.is_valid(idx); + }, + stream, + mr); + } else { + return std::pair{std::move(*cv_null_mask), + cv_null_count}; + } + }(); std::vector> children; for (auto const& [child_name, child_schema] : column_schema.child_types) { @@ -1583,14 +1583,13 @@ void assemble_column(std::size_t& column_order, // printf("line %d\n", __LINE__); // cudf::test::print(offsets->view()); - // TODO: fix null mask - if (!has_struct_child) { null_mask = std::move(children.front()->release().null_mask); } + // auto const null_mask = std::move(children.front()->release().null_mask); output.emplace_back(cudf::make_lists_column(num_rows, std::move(offsets), std::move(child), null_count, - std::move(*null_mask), + std::move(null_mask), stream, mr)); @@ -1709,7 +1708,7 @@ std::vector> from_json_to_structs( // printf("line %d\n", __LINE__); // fflush(stdout); -#if 1 +#if 0 int count{0}; for (auto const& path : json_paths) { printf("\n\npath (%d/%d): \n", count++, (int)json_paths.size()); @@ -1766,7 +1765,7 @@ std::vector> from_json_to_structs( // printf("line %d\n", __LINE__); // fflush(stdout); - if constexpr (1) { + if constexpr (0) { for (std::size_t i = 0; i < tmp.size(); ++i) { auto out = cudf::strings_column_view{tmp[i]->view()}; auto ptr = out.chars_begin(stream); From de53ef06c9c288d2733d695ccb85512405be0eb5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 Sep 2024 13:26:49 -0700 Subject: [PATCH 56/58] Add test --- src/main/cpp/tests/from_json.cu | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/main/cpp/tests/from_json.cu b/src/main/cpp/tests/from_json.cu index b4a54c13d9..3e6a83a893 100644 --- a/src/main/cpp/tests/from_json.cu +++ b/src/main/cpp/tests/from_json.cu @@ -159,6 +159,30 @@ TEST_F(FromJsonTest, T33) } } +TEST_F(FromJsonTest, T34) +{ + // The last row is invalid (has an extra quote). 
+ auto const json_string = cudf::test::strings_column_wrapper{"{'data': [0]}"}; + + spark_rapids_jni::json_schema_element a{cudf::data_type{cudf::type_id::LIST}, {}}; + a.child_types.emplace_back( + "", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}}); + + std::vector> schema; + schema.emplace_back("data", std::move(a)); + + auto const output = spark_rapids_jni::from_json_to_structs( + cudf::strings_column_view{json_string}, schema, false, false, false); + + printf("\n\ninput: \n"); + cudf::test::print(json_string); + + printf("\n\noutput: \n"); + for (auto const& col : output) { + cudf::test::print(col->view()); + } +} + TEST_F(FromJsonTest, T4) { // The last row is invalid (has an extra quote). From 2407dec7a887755eae6e5bfb0ee48b83e0fa59ac Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 Sep 2024 13:26:56 -0700 Subject: [PATCH 57/58] Fix struct under list --- src/main/cpp/src/from_json_to_structs.cu | 113 +++++++++++++++++------ 1 file changed, 85 insertions(+), 28 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 7652529a7c..1f2439bb28 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -18,7 +18,7 @@ #include "get_json_object.hpp" #include "json_parser.cuh" -// #include +#include #include #include @@ -505,9 +505,9 @@ __device__ thrust::pair evaluate_path( // case (_, Nil) // case path 3 else if (path_is_empty(ctx.path.size())) { - // printf("path is empty, path type = %d, token = %d\n", - // (int)path_type_id, - // (int)p.get_current_token()); + printf("path is empty, path type = %d, token = %d\n", + (int)path_type_id, + (int)p.get_current_token()); // If this is a struct column, we only need to check to see if there exists a struct. 
if (path_type_id == cudf::type_id::STRUCT || path_type_id == cudf::type_id::LIST) { @@ -544,9 +544,16 @@ __device__ thrust::pair evaluate_path( } } + return {true, 1}; } else if (!(ctx.g.copy_current_structure(p, out_buf, element_delimiter))) { return {false, 0}; } + printf("done line %d, path type = %d, token = %d, stack size %d, %p\n", + __LINE__, + (int)path_type_id, + (int)p.get_current_token(), + (int)stack_size, + out_buf); ctx.dirty = 1; ctx.task_is_done = true; } @@ -554,12 +561,14 @@ __device__ thrust::pair evaluate_path( // case path 4 else if (json_token::START_OBJECT == ctx.token && thrust::get<0>(path_match_named(ctx.path))) { - // printf("start object\n"); + printf("start object, %p\n", out_buf); if (!ctx.is_first_enter) { // 2st enter // skip the following children after the expect if (ctx.dirty > 0) { + printf("object, token = %d, %p\n", (int)p.get_current_token(), out_buf); + while (json_token::END_OBJECT != p.next_token()) { // JSON validation check if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } @@ -571,6 +580,8 @@ __device__ thrust::pair evaluate_path( // skip value of FIELD_NAME if (!p.try_skip_children()) { + printf("object, can't skip, token = %d, %p\n", (int)p.get_current_token(), out_buf); + // JSON validation check return {false, 0}; } @@ -636,7 +647,7 @@ __device__ thrust::pair evaluate_path( // case path 7 else if (json_token::START_ARRAY == ctx.token && path_match_element(ctx.path, path_instruction_type::WILDCARD)) { - // printf("array *\n"); + printf("array *, %p\n", out_buf); if (ctx.is_first_enter) { ctx.is_first_enter = false; @@ -654,10 +665,10 @@ __device__ thrust::pair evaluate_path( {ctx.path.data() + 1, ctx.path.size() - 1}, true); } else { - if (path_type_id == cudf::type_id::LIST) { - ctx.g.set_output_len(0); - ctx.g.write_null_placeholder(out_buf, null_placeholder); - } + // if (ctx.is_first_enter && path_type_id == cudf::type_id::LIST) { + // ctx.g.set_output_len(0); + // ctx.g.write_null_placeholder(out_buf, null_placeholder); + // } // ctx.g.write_end_array(out_buf); ctx.task_is_done = true; } @@ -665,9 +676,11 @@ __device__ thrust::pair evaluate_path( // case _ => // case path 12 else { - // printf("get obj line %d\n", __LINE__); + printf("get obj line %d\n", __LINE__); if (!p.try_skip_children()) { return {false, 0}; } + printf("get obj line %d\n", __LINE__); + // default case path, return false for this task ctx.dirty = 0; ctx.task_is_done = true; @@ -751,6 +764,8 @@ __device__ thrust::pair evaluate_path( auto const success = stack[0].dirty > 0; + printf("dirty: %d\n", (int)stack[0].dirty); + // generator may contain trash output, e.g.: generator writes some output, // then JSON format is invalid, the previous output becomes trash. // We need to return output size as zero. 
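// A simplified sketch (assumed, not this patch's code) of how one row's result from
// evaluate_path is consumed when materializing the per-path strings column: only a
// positive dirty count keeps the row non-null, and a parse failure must report size 0
// so that any bytes already written to the scratch buffer are treated as trash.
struct row_result {
  bool valid;       // corresponds to stack[0].dirty > 0
  int output_size;  // bytes to keep from the scratch buffer (0 when invalid)
};

inline row_result finalize_row(bool parse_ok, int dirty, int bytes_written)
{
  if (!parse_ok || dirty <= 0) { return {false, 0}; }  // null row; partial output discarded
  return {true, bytes_written};
}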
@@ -1412,8 +1427,45 @@ std::pair, std::unique_ptr> extract_ rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - // printf("before split:\n"); - // cudf::test::print(input->view()); + printf("before split:\n"); + cudf::test::print(input->view()); + + if (column_schema.type.id() == cudf::type_id::STRUCT) { + std::unique_ptr offsets{nullptr}; + std::vector> new_children; + cudf::size_type num_child_rows{-1}; + // auto const null_count = input->null_count(); + auto input_content = input->release(); + auto children = std::move(input_content.children); + // auto null_mask = std::move(input_content.null_mask); + for (std::size_t child_idx = 0; child_idx < children.size(); ++child_idx) { + auto& child = children[child_idx]; + auto [new_child_offsets, new_child] = + extract_lists(child, + column_schema.child_types[child_idx].second, + element_delimiter, + null_placeholder, + stream, + mr); + if (num_child_rows < 0) { num_child_rows = new_child->size(); } + if (num_child_rows != new_child->size()) { + // printf("num_child_rows != new_child->size(): %d != %d\n", + // (int)num_child_rows, + // (int)new_child->size()); + } + CUDF_EXPECTS(num_child_rows == new_child->size(), "num_child_rows != new_child->size()"); + + if (!offsets) { offsets = std::move(new_child_offsets); } + new_children.emplace_back(std::move(new_child)); + } + + // return cudf::make_structs_column( + // num_child_rows, std::move(children), null_count, std::move(*null_mask), stream, + // mr); + // TODO: fix null mask + return {std::move(offsets), + cudf::make_structs_column(num_child_rows, std::move(new_children), 0, {}, stream, mr)}; + } auto tmp = cudf::strings::split_record(cudf::strings_column_view{input->view()}, cudf::string_scalar{std::string{element_delimiter}}, @@ -1421,6 +1473,9 @@ std::pair, std::unique_ptr> extract_ stream, mr); + printf("after split:\n"); + cudf::test::print(tmp->view()); + auto split_content = tmp->release(); if (input->size() == input->null_count()) { @@ -1431,8 +1486,8 @@ std::pair, std::unique_ptr> extract_ auto const child_cv = split_content.children[cudf::lists_column_view::child_column_index]->view(); auto const child_strview = cudf::strings_column_view{child_cv}; - // printf("child_cv:\n"); - // cudf::test::print(child_cv); + printf("child_cv:\n"); + cudf::test::print(child_cv); // Convert a row index into an invalid value (-1) if that row contains a null placeholder. // Don't care about nulls in the child column, as they will be gathered to the output. 
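// A small standalone usage sketch of the cudf primitive extract_lists builds on:
// splitting each row of a strings column on the chosen delimiter yields a LIST<STRING>
// column whose offsets and child strings are then reused to assemble the final list
// column. The call mirrors the one above; the sample data is made up.
#include <cudf_test/column_wrapper.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/split/split.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <string>

void split_record_example()
{
  auto const rows  = cudf::test::strings_column_wrapper{"1,2,3", "4"};
  auto const lists = cudf::strings::split_record(
    cudf::strings_column_view{rows}, cudf::string_scalar{std::string{","}}, /*maxsplit=*/-1);
  // lists->view() is LIST<STRING>: [["1", "2", "3"], ["4"]]
}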
@@ -1568,8 +1623,8 @@ void assemble_column(std::size_t& column_order, mr); } - // printf("line %d\n", __LINE__); - // cudf::test::print(children.front()->view()); + printf("line %d\n", __LINE__); + cudf::test::print(children.front()->view()); auto [offsets, child] = extract_lists(children.front(), column_schema.child_types.front().second, @@ -1578,10 +1633,10 @@ void assemble_column(std::size_t& column_order, stream, mr); - // printf("line %d\n", __LINE__); - // cudf::test::print(child->view()); - // printf("line %d\n", __LINE__); - // cudf::test::print(offsets->view()); + printf("line %d\n", __LINE__); + cudf::test::print(child->view()); + printf("line %d\n", __LINE__); + cudf::test::print(offsets->view()); // auto const null_mask = std::move(children.front()->release().null_mask); @@ -1593,8 +1648,8 @@ void assemble_column(std::size_t& column_order, stream, mr)); - // printf("line %d\n", __LINE__); - // cudf::test::print(output.back()->view()); + printf("line %d\n", __LINE__); + cudf::test::print(output.back()->view()); } else { CUDF_FAIL("Unsupported type"); } @@ -1604,6 +1659,8 @@ void assemble_column(std::size_t& column_order, std::pair find_delimiter(cudf::strings_column_view const& input, rmm::cuda_stream_view stream) { + return {',', 'N'}; + auto constexpr num_levels = 256; auto constexpr lower_level = std::numeric_limits::min(); auto constexpr upper_level = std::numeric_limits::max(); @@ -1708,7 +1765,7 @@ std::vector> from_json_to_structs( // printf("line %d\n", __LINE__); // fflush(stdout); -#if 0 +#if 1 int count{0}; for (auto const& path : json_paths) { printf("\n\npath (%d/%d): \n", count++, (int)json_paths.size()); @@ -1746,8 +1803,8 @@ std::vector> from_json_to_structs( // This should only run when there is LIST column. char delimiter{','}, null_placeholder{'\0'}; if (has_list_type) { std::tie(delimiter, null_placeholder) = find_delimiter(input, stream); } - // printf("delimiter: %c (code: %d)\n", delimiter, (int)delimiter); - // printf("null_placeholder: %c (code: %d)\n", null_placeholder, (int)null_placeholder); + printf("delimiter: %c (code: %d)\n", delimiter, (int)delimiter); + printf("null_placeholder: %c (code: %d)\n", null_placeholder, (int)null_placeholder); auto tmp = test::get_json_object(input, json_paths, @@ -1765,7 +1822,7 @@ std::vector> from_json_to_structs( // printf("line %d\n", __LINE__); // fflush(stdout); - if constexpr (0) { + if constexpr (1) { for (std::size_t i = 0; i < tmp.size(); ++i) { auto out = cudf::strings_column_view{tmp[i]->view()}; auto ptr = out.chars_begin(stream); @@ -1781,7 +1838,7 @@ std::vector> from_json_to_structs( } printf("\n"); - // cudf::test::print(tmp[i]->view()); + cudf::test::print(tmp[i]->view()); } } From 571d692293f44be90e89ba34b2f2d37ef4523a03 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 Sep 2024 13:46:20 -0700 Subject: [PATCH 58/58] Cleanup --- src/main/cpp/src/from_json_to_structs.cu | 75 ++++++++++++------------ 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 1f2439bb28..27ca5017ff 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -18,7 +18,7 @@ #include "get_json_object.hpp" #include "json_parser.cuh" -#include +// #include #include #include @@ -505,9 +505,9 @@ __device__ thrust::pair evaluate_path( // case (_, Nil) // case path 3 else if (path_is_empty(ctx.path.size())) { - printf("path is empty, path type = %d, token = %d\n", - (int)path_type_id, 
- (int)p.get_current_token()); + // printf("path is empty, path type = %d, token = %d\n", + // (int)path_type_id, + // (int)p.get_current_token()); // If this is a struct column, we only need to check to see if there exists a struct. if (path_type_id == cudf::type_id::STRUCT || path_type_id == cudf::type_id::LIST) { @@ -548,12 +548,12 @@ __device__ thrust::pair evaluate_path( } else if (!(ctx.g.copy_current_structure(p, out_buf, element_delimiter))) { return {false, 0}; } - printf("done line %d, path type = %d, token = %d, stack size %d, %p\n", - __LINE__, - (int)path_type_id, - (int)p.get_current_token(), - (int)stack_size, - out_buf); + // printf("done line %d, path type = %d, token = %d, stack size %d, %p\n", + // __LINE__, + // (int)path_type_id, + // (int)p.get_current_token(), + // (int)stack_size, + // out_buf); ctx.dirty = 1; ctx.task_is_done = true; } @@ -561,13 +561,13 @@ __device__ thrust::pair evaluate_path( // case path 4 else if (json_token::START_OBJECT == ctx.token && thrust::get<0>(path_match_named(ctx.path))) { - printf("start object, %p\n", out_buf); + // printf("start object, %p\n", out_buf); if (!ctx.is_first_enter) { // 2st enter // skip the following children after the expect if (ctx.dirty > 0) { - printf("object, token = %d, %p\n", (int)p.get_current_token(), out_buf); + // printf("object, token = %d, %p\n", (int)p.get_current_token(), out_buf); while (json_token::END_OBJECT != p.next_token()) { // JSON validation check @@ -580,7 +580,8 @@ __device__ thrust::pair evaluate_path( // skip value of FIELD_NAME if (!p.try_skip_children()) { - printf("object, can't skip, token = %d, %p\n", (int)p.get_current_token(), out_buf); + // printf("object, can't skip, token = %d, %p\n", (int)p.get_current_token(), + // out_buf); // JSON validation check return {false, 0}; @@ -647,7 +648,7 @@ __device__ thrust::pair evaluate_path( // case path 7 else if (json_token::START_ARRAY == ctx.token && path_match_element(ctx.path, path_instruction_type::WILDCARD)) { - printf("array *, %p\n", out_buf); + // printf("array *, %p\n", out_buf); if (ctx.is_first_enter) { ctx.is_first_enter = false; @@ -676,10 +677,10 @@ __device__ thrust::pair evaluate_path( // case _ => // case path 12 else { - printf("get obj line %d\n", __LINE__); + // printf("get obj line %d\n", __LINE__); if (!p.try_skip_children()) { return {false, 0}; } - printf("get obj line %d\n", __LINE__); + // printf("get obj line %d\n", __LINE__); // default case path, return false for this task ctx.dirty = 0; @@ -764,7 +765,7 @@ __device__ thrust::pair evaluate_path( auto const success = stack[0].dirty > 0; - printf("dirty: %d\n", (int)stack[0].dirty); + // printf("dirty: %d\n", (int)stack[0].dirty); // generator may contain trash output, e.g.: generator writes some output, // then JSON format is invalid, the previous output becomes trash. 
@@ -1427,8 +1428,8 @@ std::pair, std::unique_ptr> extract_ rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - printf("before split:\n"); - cudf::test::print(input->view()); + // printf("before split:\n"); + // cudf::test::print(input->view()); if (column_schema.type.id() == cudf::type_id::STRUCT) { std::unique_ptr offsets{nullptr}; @@ -1473,8 +1474,8 @@ std::pair, std::unique_ptr> extract_ stream, mr); - printf("after split:\n"); - cudf::test::print(tmp->view()); + // printf("after split:\n"); + // cudf::test::print(tmp->view()); auto split_content = tmp->release(); @@ -1486,8 +1487,8 @@ std::pair, std::unique_ptr> extract_ auto const child_cv = split_content.children[cudf::lists_column_view::child_column_index]->view(); auto const child_strview = cudf::strings_column_view{child_cv}; - printf("child_cv:\n"); - cudf::test::print(child_cv); + // printf("child_cv:\n"); + // cudf::test::print(child_cv); // Convert a row index into an invalid value (-1) if that row contains a null placeholder. // Don't care about nulls in the child column, as they will be gathered to the output. @@ -1623,8 +1624,8 @@ void assemble_column(std::size_t& column_order, mr); } - printf("line %d\n", __LINE__); - cudf::test::print(children.front()->view()); + // printf("line %d\n", __LINE__); + // cudf::test::print(children.front()->view()); auto [offsets, child] = extract_lists(children.front(), column_schema.child_types.front().second, @@ -1633,10 +1634,10 @@ void assemble_column(std::size_t& column_order, stream, mr); - printf("line %d\n", __LINE__); - cudf::test::print(child->view()); - printf("line %d\n", __LINE__); - cudf::test::print(offsets->view()); + // printf("line %d\n", __LINE__); + // cudf::test::print(child->view()); + // printf("line %d\n", __LINE__); + // cudf::test::print(offsets->view()); // auto const null_mask = std::move(children.front()->release().null_mask); @@ -1648,8 +1649,8 @@ void assemble_column(std::size_t& column_order, stream, mr)); - printf("line %d\n", __LINE__); - cudf::test::print(output.back()->view()); + // printf("line %d\n", __LINE__); + // cudf::test::print(output.back()->view()); } else { CUDF_FAIL("Unsupported type"); } @@ -1659,7 +1660,7 @@ void assemble_column(std::size_t& column_order, std::pair find_delimiter(cudf::strings_column_view const& input, rmm::cuda_stream_view stream) { - return {',', 'N'}; + // return {',', 'N'}; auto constexpr num_levels = 256; auto constexpr lower_level = std::numeric_limits::min(); @@ -1765,7 +1766,7 @@ std::vector> from_json_to_structs( // printf("line %d\n", __LINE__); // fflush(stdout); -#if 1 +#if 0 int count{0}; for (auto const& path : json_paths) { printf("\n\npath (%d/%d): \n", count++, (int)json_paths.size()); @@ -1803,8 +1804,8 @@ std::vector> from_json_to_structs( // This should only run when there is LIST column. 
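// Host-side sketch of what find_delimiter appears to compute (the real code histograms
// the device-side chars buffer with 256 levels): choose a delimiter byte and a
// null-placeholder byte that do not occur anywhere in the concatenated string data, so
// that joining and re-splitting row values cannot collide with real content. The helper
// name and the host implementation are illustrative assumptions.
#include <array>
#include <cstdint>
#include <stdexcept>
#include <string_view>
#include <utility>

std::pair<char, char> pick_unused_bytes(std::string_view all_chars)
{
  std::array<std::uint64_t, 256> freq{};  // one bucket per possible byte value
  for (unsigned char const c : all_chars) { ++freq[c]; }

  std::array<char, 2> picked{};
  std::size_t n_picked = 0;
  for (int b = 0; b < 256 && n_picked < 2; ++b) {
    if (freq[static_cast<std::size_t>(b)] == 0) { picked[n_picked++] = static_cast<char>(b); }
  }
  if (n_picked < 2) { throw std::runtime_error("no unused byte available"); }
  return {picked[0], picked[1]};  // {delimiter, null_placeholder}
}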
char delimiter{','}, null_placeholder{'\0'}; if (has_list_type) { std::tie(delimiter, null_placeholder) = find_delimiter(input, stream); } - printf("delimiter: %c (code: %d)\n", delimiter, (int)delimiter); - printf("null_placeholder: %c (code: %d)\n", null_placeholder, (int)null_placeholder); + // printf("delimiter: %c (code: %d)\n", delimiter, (int)delimiter); + // printf("null_placeholder: %c (code: %d)\n", null_placeholder, (int)null_placeholder); auto tmp = test::get_json_object(input, json_paths, @@ -1822,7 +1823,7 @@ std::vector> from_json_to_structs( // printf("line %d\n", __LINE__); // fflush(stdout); - if constexpr (1) { + if constexpr (0) { for (std::size_t i = 0; i < tmp.size(); ++i) { auto out = cudf::strings_column_view{tmp[i]->view()}; auto ptr = out.chars_begin(stream); @@ -1838,7 +1839,7 @@ std::vector> from_json_to_structs( } printf("\n"); - cudf::test::print(tmp[i]->view()); + // cudf::test::print(tmp[i]->view()); } }
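// Closing usage sketch in the style of the FromJsonTest cases above, exercising the
// struct-under-list shape that the last commits address. The sample schema and row are
// made up, the schema container alias is an assumption, and the spark_rapids_jni header
// declaring json_schema_element and from_json_to_structs is assumed to be included.
#include <cudf_test/column_wrapper.hpp>

#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>

#include <string>
#include <utility>
#include <vector>

void struct_under_list_example()
{
  auto const json_string =
    cudf::test::strings_column_wrapper{"{'data': [{'a': '1'}, {'a': '2'}]}"};

  // data: LIST<STRUCT<a: STRING>>
  spark_rapids_jni::json_schema_element elem{cudf::data_type{cudf::type_id::STRUCT}, {}};
  elem.child_types.emplace_back(
    "a", spark_rapids_jni::json_schema_element{cudf::data_type{cudf::type_id::STRING}, {}});
  spark_rapids_jni::json_schema_element arr{cudf::data_type{cudf::type_id::LIST}, {}};
  arr.child_types.emplace_back("", std::move(elem));

  // Assumed alias for the schema container used by the tests above.
  std::vector<std::pair<std::string, spark_rapids_jni::json_schema_element>> schema;
  schema.emplace_back("data", std::move(arr));

  // Signature as used by the tests: leading-zero numbers, non-numeric numbers and
  // unquoted control characters all disallowed.
  auto const output = spark_rapids_jni::from_json_to_structs(
    cudf::strings_column_view{json_string}, schema, false, false, false);
}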