Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

POC: Implement from_json_to_structs using get_json_object kernel #2449

Draft
wants to merge 67 commits into
base: branch-24.10
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
2459f61
Rename function and class containing `from_json`
ttnghia Aug 14, 2024
3b1e067
Fix copyright year
ttnghia Aug 14, 2024
1959b87
Fix style
ttnghia Aug 14, 2024
2f83c57
Rename parameter
ttnghia Aug 14, 2024
8f1da4b
Change parameter type
ttnghia Aug 14, 2024
515bb6b
Fix style
ttnghia Aug 14, 2024
c7d5ad9
Exclude the new files from compiling
ttnghia Aug 14, 2024
d9e52c0
WIP
ttnghia Aug 16, 2024
e4bb460
Change signatures for `from_json_to_structs`
ttnghia Aug 16, 2024
fe2bf9f
Merge branch 'branch-24.10' into from_json
ttnghia Aug 20, 2024
3412195
Merge branch 'branch-24.10' into from_json
ttnghia Aug 20, 2024
9e239cb
Add `convert_schema_to_paths` function and fix compile error
ttnghia Aug 20, 2024
7d5653c
Merge branch 'branch-24.10' into from_json
ttnghia Aug 20, 2024
187785b
Convert schema into paths
ttnghia Aug 20, 2024
a02cfca
WIP
ttnghia Aug 22, 2024
49bbf7c
Add `keep_quotes` parameter
ttnghia Aug 22, 2024
14b463e
Find decimal columns in schema
ttnghia Aug 22, 2024
ae1df4a
Allow to specify `allow_leading_zero_numbers`
ttnghia Aug 23, 2024
1a96dd5
Fix order when checking `keep_quotes`
ttnghia Aug 23, 2024
41304a4
Allow to specify `allow_non_numeric_numbers`
ttnghia Aug 23, 2024
5ba1506
WIP for supporting structs
ttnghia Aug 25, 2024
79398b1
Merge branch 'branch-24.10' into from_json
ttnghia Sep 5, 2024
e7a64d6
Merge branch 'branch-24.10' into from_json
ttnghia Sep 9, 2024
4376438
Support struct type in schema
ttnghia Sep 10, 2024
7c5e3ec
Apply null mask for structs
ttnghia Sep 11, 2024
4127040
Fix column order
ttnghia Sep 14, 2024
97cd60b
Add `from_json_object` kernel
ttnghia Sep 17, 2024
357c671
Add `type_ids` data for paths
ttnghia Sep 17, 2024
2896875
Fix struct null mask
ttnghia Sep 17, 2024
ba3649d
Add Java test
ttnghia Sep 18, 2024
82ad2d5
Fix character matching
ttnghia Sep 18, 2024
2c2adfc
Merge branch 'fix_match_escaped' into from_json
ttnghia Sep 18, 2024
bff6d12
Fix column order in schema
ttnghia Sep 19, 2024
1abd8f8
Allow STRUCT type in JSON path
ttnghia Sep 19, 2024
a18ee48
Add test with LIST
ttnghia Sep 19, 2024
adc45be
Output LIST as string
ttnghia Sep 20, 2024
77128eb
Update tests and fix null mask
ttnghia Sep 20, 2024
a78bb72
Output list without outer brackets
ttnghia Sep 24, 2024
9f13aed
Output columns following the input schema
ttnghia Sep 24, 2024
76a44da
Fix struct schema, and add test
ttnghia Sep 25, 2024
1f36d17
Add test
ttnghia Sep 25, 2024
af97f21
Fix struct child of array
ttnghia Sep 25, 2024
b14c332
Implement element delimiter
ttnghia Sep 25, 2024
233c6b5
Merge branch 'branch-24.10' into from_json
ttnghia Sep 25, 2024
54a8e96
Cleanup
ttnghia Sep 25, 2024
705dd6b
Add `isNullOrEmpty` function
ttnghia Sep 25, 2024
a3bb86c
Cleanup
ttnghia Sep 25, 2024
3edfb0d
Change memory budget
ttnghia Sep 25, 2024
92c4308
Add test
ttnghia Sep 25, 2024
395c0bc
Search for both element delimiter and null placeholder
ttnghia Sep 26, 2024
a04183c
Fix null in array
ttnghia Sep 26, 2024
1cad1f5
Cleanup
ttnghia Sep 26, 2024
aaa7d89
Add test
ttnghia Sep 26, 2024
3b4c1c2
Fix null count equal size
ttnghia Sep 26, 2024
f8bf8dd
Fix struct under list
ttnghia Sep 26, 2024
d344eec
Cleanup
ttnghia Sep 26, 2024
3c68814
Remove redundant path
ttnghia Sep 26, 2024
7910633
Merge branch 'branch-24.10' into from_json
ttnghia Sep 26, 2024
c175583
Fix parsing non-numeric number
ttnghia Sep 26, 2024
0f04032
Add `allow_unquoted_control_chars` option
ttnghia Sep 26, 2024
3feefae
Fix output array/struct
ttnghia Sep 27, 2024
feeeffc
Add test
ttnghia Sep 27, 2024
a663606
Fix empty list
ttnghia Sep 27, 2024
dd60c81
Merge branch 'branch-24.10' into from_json
ttnghia Sep 27, 2024
de53ef0
Add test
ttnghia Sep 27, 2024
2407dec
Fix struct under list
ttnghia Sep 27, 2024
571d692
Cleanup
ttnghia Sep 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/main/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ add_library(
src/datetime_rebase.cu
src/decimal_utils.cu
src/from_json_to_raw_map.cu
src/from_json_to_structs.cu
src/get_json_object.cu
src/histogram.cu
src/murmur_hash.cu
Expand Down
141 changes: 141 additions & 0 deletions src/main/cpp/src/JSONUtilsJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,58 @@
#include "from_json.hpp"
#include "get_json_object.hpp"

#include <cudf/io/json.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <map>
#include <vector>

using path_instruction_type = spark_rapids_jni::path_instruction_type;

namespace spark_rapids_jni {
/**
 * @brief Recursively build one schema element from the flattened schema arrays.
 *
 * The schema is described by parallel arrays. Entry `index` describes one column; when that
 * column is a STRUCT or LIST, its `children[index]` direct children (each possibly nested
 * itself) are read from the entries that follow it.
 *
 * @param index Position of the entry to read. Updated in place: on return it refers to the
 *        first entry after this element and all of its descendants.
 * @param names Column names, one per entry
 * @param children Number of direct children, one per entry
 * @param types Type ids, one per entry (cast to `cudf::type_id`)
 * @param scales Scales, one per entry (passed as the second argument of `cudf::data_type`)
 * @return The schema element for the entry, including any nested children
 * @throws std::invalid_argument if `index` is out of range, if a non-nested type declares
 *         children, if a nested type declares more children than there are entries left, or if
 *         a child column name is null
 */
json_schema_element read_schema_element(int& index,
                                        cudf::jni::native_jstringArray const& names,
                                        cudf::jni::native_jintArray const& children,
                                        cudf::jni::native_jintArray const& types,
                                        cudf::jni::native_jintArray const& scales)
{
  // Malformed child counts must surface as an error, not as an out-of-range read.
  if (index < 0 || index >= types.size()) {
    throw std::invalid_argument("schema index is out of range");
  }

  auto const d_type = cudf::data_type{static_cast<cudf::type_id>(types[index]), scales[index]};
  bool const is_nested =
    d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST;

  if (!is_nested) {
    if (children[index] != 0) {
      throw std::invalid_argument("found children for a type that should have none");
    }
    // go to the next entry before returning...
    ++index;
    return json_schema_element{d_type, {}};
  }

  std::vector<std::pair<std::string, json_schema_element>> child_elems;
  int const num_children = children[index];
  // go to the next entry, so recursion can parse it.
  ++index;
  for (int i = 0; i < num_children; ++i) {
    if (index >= names.size()) {
      throw std::invalid_argument("schema declares more children than there are entries");
    }
    // Constructing a std::string from a null pointer is undefined behaviour, so reject it.
    char const* const child_name = names.get(index).get();
    if (child_name == nullptr) {
      throw std::invalid_argument("schema column name must not be null");
    }
    // The name must be captured before the recursive call advances `index`.
    auto name  = std::string{child_name};
    auto child = read_schema_element(index, names, children, types, scales);
    child_elems.emplace_back(std::move(name), std::move(child));
  }
  return json_schema_element{d_type, std::move(child_elems)};
}
} // namespace spark_rapids_jni

extern "C" {

JNIEXPORT jint JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_getMaxJSONPathDepth(JNIEnv* env,
Expand Down Expand Up @@ -154,4 +200,99 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_extractRawMap
}
CATCH_STD(env, 0);
}

/**
 * @brief JNI entry point that converts a strings column of JSON into columns following a schema.
 *
 * The schema is passed as four parallel arrays (names, child counts, type ids, scales), which
 * are rebuilt into a tree of `json_schema_element` with `read_schema_element` before calling
 * `spark_rapids_jni::from_json_to_structs`.
 *
 * @param env JNI environment
 * @param j_input Native handle that is reinterpreted as a `cudf::column_view const*`
 * @param j_col_names Column names, one per schema entry
 * @param j_num_children Number of direct children, one per schema entry
 * @param j_types Type ids, one per schema entry
 * @param j_scales Scales, one per schema entry
 * @param allow_leading_zero_numbers Forwarded unchanged to `from_json_to_structs`
 * @param allow_non_numeric_numbers Forwarded unchanged to `from_json_to_structs`
 * @param allow_unquoted_control_chars Forwarded unchanged to `from_json_to_structs`
 * @return A Java long array holding one released native column handle per output column.
 *         Failures are reported through the JNI_NULL_CHECK / JNI_THROW_NEW / CATCH_STD macros,
 *         which are given 0 as the value to return.
 */
JNIEXPORT jlongArray JNICALL
Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJsonToStructs(JNIEnv* env,
                                                             jclass,
                                                             jlong j_input,
                                                             jobjectArray j_col_names,
                                                             jintArray j_num_children,
                                                             jintArray j_types,
                                                             jintArray j_scales,
                                                             jboolean allow_leading_zero_numbers,
                                                             jboolean allow_non_numeric_numbers,
                                                             jboolean allow_unquoted_control_chars)
{
  JNI_NULL_CHECK(env, j_input, "j_input is null", 0);
  JNI_NULL_CHECK(env, j_col_names, "j_col_names is null", 0);
  JNI_NULL_CHECK(env, j_num_children, "j_num_children is null", 0);
  JNI_NULL_CHECK(env, j_types, "j_types is null", 0);
  JNI_NULL_CHECK(env, j_scales, "j_scales is null", 0);

  try {
    cudf::jni::auto_set_device(env);
    cudf::jni::native_jstringArray n_col_names(env, j_col_names);
    cudf::jni::native_jintArray n_types(env, j_types);
    cudf::jni::native_jintArray n_scales(env, j_scales);
    cudf::jni::native_jintArray n_children(env, j_num_children);

    // All four arrays describe the same entries, so they must have the same length.
    if (n_types.size() != n_scales.size()) {
      JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and scales must match size", 0);
    }
    if (n_col_names.size() != n_types.size()) {
      JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and column names must match size", 0);
    }
    if (n_children.size() != n_types.size()) {
      JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and num children must match size", 0);
    }

    std::vector<std::pair<std::string, spark_rapids_jni::json_schema_element>> schema;
    int idx = 0;
    while (idx < n_types.size()) {
      // Constructing a std::string from a null pointer is undefined behaviour, so reject it.
      char const* const name_cstr = n_col_names.get(idx).get();
      if (name_cstr == nullptr) {
        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "column name must not be null", 0);
      }
      // Read the name in its own statement: read_schema_element advances `idx` past this column
      // and all of its descendants, and argument evaluation order is unspecified.
      auto name = std::string{name_cstr};
      auto element =
        spark_rapids_jni::read_schema_element(idx, n_col_names, n_children, n_types, n_scales);
      schema.emplace_back(std::move(name), std::move(element));
    }

    auto const input_cv = reinterpret_cast<cudf::column_view const*>(j_input);
    auto output = spark_rapids_jni::from_json_to_structs(cudf::strings_column_view{*input_cv},
                                                         schema,
                                                         allow_leading_zero_numbers,
                                                         allow_non_numeric_numbers,
                                                         allow_unquoted_control_chars);

    // Release each output column and hand its native handle to the Java side.
    auto out_handles = cudf::jni::native_jlongArray(env, output.size());
    std::transform(output.begin(), output.end(), out_handles.begin(), [](auto& col) {
      return cudf::jni::release_as_jlong(col);
    });
    return out_handles.get_jArray();
  }
  CATCH_STD(env, 0);
}

/**
 * @brief JNI entry point wrapping `spark_rapids_jni::is_null_or_empty`.
 *
 * @param env JNI environment
 * @param j_input Native handle that is reinterpreted as a `cudf::column_view const*` and viewed
 *        as a strings column
 * @return The released native handle of the column returned by `is_null_or_empty`. Failures are
 *         reported through the JNI_NULL_CHECK / CATCH_STD macros, which are given 0 as the value
 *         to return.
 */
JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_isNullOrEmpty(JNIEnv* env,
                                                                                 jclass,
                                                                                 jlong j_input)
{
  JNI_NULL_CHECK(env, j_input, "j_input is null", 0);

  try {
    cudf::jni::auto_set_device(env);

    // View the incoming handle as a strings column.
    auto const* column_ptr = reinterpret_cast<cudf::column_view const*>(j_input);
    cudf::strings_column_view const strings_view{*column_ptr};

    // Compute the result, then transfer its ownership to the caller as a raw handle.
    auto result = spark_rapids_jni::is_null_or_empty(strings_view);
    return cudf::jni::release_as_jlong(std::move(result));
  }
  CATCH_STD(env, 0);
}
}
23 changes: 23 additions & 0 deletions src/main/cpp/src/from_json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,42 @@

#pragma once

#include <cudf/io/json.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/resource_ref.hpp>

#include <memory>
#include <string>
#include <vector>

namespace spark_rapids_jni {

/**
 * @brief One node of a (possibly nested) schema tree.
 *
 * Each node carries its own data type plus an ordered list of named children, which are
 * themselves `json_schema_element` nodes.
 */
struct json_schema_element {
// Data type of this node.
cudf::data_type type;

// Ordered (name, schema) pairs for the children of this node; empty when there are none.
std::vector<std::pair<std::string, json_schema_element>> child_types;
};

/**
 * @brief Produce a column from a strings column of JSON input.
 *
 * NOTE(review): the name suggests each JSON row becomes a "raw map" of key/value strings; the
 * exact output layout is defined by the implementation, not by this declaration - confirm.
 *
 * @param input Strings column to process
 * @param stream CUDA stream; defaults to `cudf::get_default_stream()`
 * @param mr Device memory resource; defaults to `rmm::mr::get_current_device_resource()`
 * @return The resulting column
 */
std::unique_ptr<cudf::column> from_json_to_raw_map(
cudf::strings_column_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
 * @brief Produce a list of columns from a strings column of JSON input, guided by a schema.
 *
 * NOTE(review): the three `allow_*` flags are presumably JSON parsing leniency options matching
 * their names (leading zeros in numbers, non-numeric number literals, unquoted control
 * characters) - confirm against the implementation.
 *
 * @param input Strings column to process
 * @param schema Ordered (name, schema element) pairs describing the requested columns
 * @param allow_leading_zero_numbers See note above
 * @param allow_non_numeric_numbers See note above
 * @param allow_unquoted_control_chars See note above
 * @param stream CUDA stream; defaults to `cudf::get_default_stream()`
 * @param mr Device memory resource; defaults to `rmm::mr::get_current_device_resource()`
 * @return The resulting columns
 */
std::vector<std::unique_ptr<cudf::column>> from_json_to_structs(
cudf::strings_column_view const& input,
std::vector<std::pair<std::string, json_schema_element>> const& schema,
bool allow_leading_zero_numbers,
bool allow_non_numeric_numbers,
bool allow_unquoted_control_chars,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
 * @brief Produce a column from a strings column.
 *
 * NOTE(review): judging by the name, the result presumably flags rows that are null or empty;
 * the output type and the exact definition of "empty" are not visible here - confirm.
 *
 * @param input Strings column to process
 * @param stream CUDA stream; defaults to `cudf::get_default_stream()`
 * @param mr Device memory resource; defaults to `rmm::mr::get_current_device_resource()`
 * @return The resulting column
 */
std::unique_ptr<cudf::column> is_null_or_empty(
cudf::strings_column_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

} // namespace spark_rapids_jni
Loading
Loading