From b9576cfa72878e61e9b11339fa63c0fe7623cfca Mon Sep 17 00:00:00 2001 From: Juan Cruz Viotti Date: Mon, 22 Jul 2024 12:46:40 -0400 Subject: [PATCH] Extend the `validate` command to validate JSONL datasets (#131) Signed-off-by: Juan Cruz Viotti --- cmake/FindJSONToolkit.cmake | 1 - docs/validate.markdown | 14 ++--- src/CMakeLists.txt | 1 + src/command_validate.cc | 69 ++++++++++++++++++----- src/main.cc | 2 +- test/CMakeLists.txt | 7 +++ test/validate/fail_jsonl_all.sh | 40 +++++++++++++ test/validate/fail_jsonl_all_verbose.sh | 41 ++++++++++++++ test/validate/fail_jsonl_invalid_entry.sh | 32 +++++++++++ test/validate/fail_jsonl_one.sh | 42 ++++++++++++++ test/validate/fail_jsonl_one_verbose.sh | 45 +++++++++++++++ test/validate/pass_jsonl.sh | 32 +++++++++++ test/validate/pass_jsonl_verbose.sh | 39 +++++++++++++ 13 files changed, 343 insertions(+), 22 deletions(-) create mode 100755 test/validate/fail_jsonl_all.sh create mode 100755 test/validate/fail_jsonl_all_verbose.sh create mode 100755 test/validate/fail_jsonl_invalid_entry.sh create mode 100755 test/validate/fail_jsonl_one.sh create mode 100755 test/validate/fail_jsonl_one_verbose.sh create mode 100755 test/validate/pass_jsonl.sh create mode 100755 test/validate/pass_jsonl_verbose.sh diff --git a/cmake/FindJSONToolkit.cmake b/cmake/FindJSONToolkit.cmake index b65f6d3..d8b5678 100644 --- a/cmake/FindJSONToolkit.cmake +++ b/cmake/FindJSONToolkit.cmake @@ -1,6 +1,5 @@ if(NOT JSONToolkit_FOUND) set(JSONTOOLKIT_INSTALL OFF CACHE BOOL "disable installation") - set(JSONTOOLKIT_JSONL OFF CACHE BOOL "disable JSONL support") add_subdirectory("${PROJECT_SOURCE_DIR}/vendor/jsontoolkit") set(JSONToolkit_FOUND ON) endif() diff --git a/docs/validate.markdown b/docs/validate.markdown index 7335ae5..43125ce 100644 --- a/docs/validate.markdown +++ b/docs/validate.markdown @@ -7,14 +7,14 @@ Validating > Draft 2020-12 soon. ```sh -jsonschema validate [--http/-h] [--verbose/-v] - [--resolve/-r ...] +jsonschema validate [--http/-h] + [--verbose/-v] [--resolve/-r ...] ``` The most popular use case of JSON Schema is to validate JSON documents. The -JSON Schema CLI offers a `validate` command to evaluate a JSON instance against -a JSON Schema, presenting human-friendly information on unsuccessful -validation. +JSON Schema CLI offers a `validate` command to evaluate either a JSON instance +or a JSONL dataset against a JSON Schema, presenting human-friendly information +on unsuccessful validation. **If you want to validate that a schema adheres to its metaschema, use the [`metaschema`](./metaschema.markdown) command instead.** @@ -55,10 +55,10 @@ error: The target document is expected to be of the given type jsonschema validate path/to/my/schema.json path/to/my/instance.json ``` -### Validate a JSON Schema against it meta-schema +### Validate a JSONL dataset against a schema ```sh -jsonschema validate path/to/my/schema.json +jsonschema validate path/to/my/schema.json path/to/my/dataset.jsonl ``` ### Validate a JSON instance enabling HTTP resolution diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aca9dcb..deccfb2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -14,6 +14,7 @@ noa_add_default_options(PRIVATE jsonschema_cli) set_target_properties(jsonschema_cli PROPERTIES OUTPUT_NAME jsonschema) target_link_libraries(jsonschema_cli PRIVATE sourcemeta::jsontoolkit::uri) target_link_libraries(jsonschema_cli PRIVATE sourcemeta::jsontoolkit::json) +target_link_libraries(jsonschema_cli PRIVATE sourcemeta::jsontoolkit::jsonl) target_link_libraries(jsonschema_cli PRIVATE sourcemeta::jsontoolkit::jsonschema) target_link_libraries(jsonschema_cli PRIVATE sourcemeta::hydra::httpclient) diff --git a/src/command_validate.cc b/src/command_validate.cc index b355a68..e9d2157 100644 --- a/src/command_validate.cc +++ b/src/command_validate.cc @@ -1,4 +1,5 @@ #include +#include #include #include // EXIT_SUCCESS, EXIT_FAILURE @@ -46,26 +47,68 @@ auto intelligence::jsonschema::cli::validate( } bool result{true}; - const auto &instance_path{options.at("").at(1)}; + const std::filesystem::path instance_path{options.at("").at(1)}; const auto schema_template{sourcemeta::jsontoolkit::compile( schema, sourcemeta::jsontoolkit::default_schema_walker, custom_resolver, sourcemeta::jsontoolkit::default_schema_compiler)}; - const auto instance{sourcemeta::jsontoolkit::from_file(instance_path)}; + if (instance_path.extension() == ".jsonl") { + log_verbose(options) << "Interpreting input as JSONL\n"; + std::size_t index{0}; - std::ostringstream error; - result = sourcemeta::jsontoolkit::evaluate( - schema_template, instance, - sourcemeta::jsontoolkit::SchemaCompilerEvaluationMode::Fast, - pretty_evaluate_callback(error, sourcemeta::jsontoolkit::empty_pointer)); + auto stream{sourcemeta::jsontoolkit::read_file(instance_path)}; + try { + for (const auto &instance : sourcemeta::jsontoolkit::JSONL{stream}) { + std::ostringstream error; + const auto subresult = sourcemeta::jsontoolkit::evaluate( + schema_template, instance, + sourcemeta::jsontoolkit::SchemaCompilerEvaluationMode::Fast, + pretty_evaluate_callback(error, + sourcemeta::jsontoolkit::empty_pointer)); - if (result) { - log_verbose(options) - << "ok: " << std::filesystem::weakly_canonical(instance_path).string() - << "\n matches " - << std::filesystem::weakly_canonical(schema_path).string() << "\n"; + if (subresult) { + log_verbose(options) + << "ok: " + << std::filesystem::weakly_canonical(instance_path).string() + << " (entry #" << index << ")" + << "\n matches " + << std::filesystem::weakly_canonical(schema_path).string() + << "\n"; + } else { + std::cerr << "fail: " + << std::filesystem::weakly_canonical(instance_path).string() + << " (entry #" << index << ")\n\n"; + sourcemeta::jsontoolkit::prettify(instance, std::cerr); + std::cerr << "\n\n"; + std::cerr << error.str(); + result = false; + break; + } + + index += 1; + } + } catch (const sourcemeta::jsontoolkit::ParseError &error) { + // For producing better error messages + throw sourcemeta::jsontoolkit::FileParseError(instance_path, error); + } } else { - std::cerr << error.str(); + const auto instance{sourcemeta::jsontoolkit::from_file(instance_path)}; + + std::ostringstream error; + result = sourcemeta::jsontoolkit::evaluate( + schema_template, instance, + sourcemeta::jsontoolkit::SchemaCompilerEvaluationMode::Fast, + pretty_evaluate_callback(error, + sourcemeta::jsontoolkit::empty_pointer)); + + if (result) { + log_verbose(options) + << "ok: " << std::filesystem::weakly_canonical(instance_path).string() + << "\n matches " + << std::filesystem::weakly_canonical(schema_path).string() << "\n"; + } else { + std::cerr << error.str(); + } } return result ? EXIT_SUCCESS : EXIT_FAILURE; diff --git a/src/main.cc b/src/main.cc index 3d5edcb..d810a21 100644 --- a/src/main.cc +++ b/src/main.cc @@ -21,7 +21,7 @@ Global Options: Commands: - validate [--http/-h] + validate [--http/-h] Validate an instance against the given schema. diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9b2832a..79f6d84 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -61,6 +61,13 @@ add_jsonschema_test_unix(validate/pass_draft7) add_jsonschema_test_unix(validate/fail_draft4) add_jsonschema_test_unix(validate/fail_draft6) add_jsonschema_test_unix(validate/fail_draft7) +add_jsonschema_test_unix(validate/pass_jsonl) +add_jsonschema_test_unix(validate/pass_jsonl_verbose) +add_jsonschema_test_unix(validate/fail_jsonl_invalid_entry) +add_jsonschema_test_unix(validate/fail_jsonl_one) +add_jsonschema_test_unix(validate/fail_jsonl_one_verbose) +add_jsonschema_test_unix(validate/fail_jsonl_all) +add_jsonschema_test_unix(validate/fail_jsonl_all_verbose) # Test add_jsonschema_test_unix(test/fail_true_single_resolve) diff --git a/test/validate/fail_jsonl_all.sh b/test/validate/fail_jsonl_all.sh new file mode 100755 index 0000000..d11a894 --- /dev/null +++ b/test/validate/fail_jsonl_all.sh @@ -0,0 +1,40 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << 'EOF' > "$TMP/schema.json" +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "array" +} +EOF + +cat << 'EOF' > "$TMP/instance.jsonl" +{ "foo": 1 } +{ "foo": 2 } +{ "foo": 3 } +EOF + +"$1" validate "$TMP/schema.json" "$TMP/instance.jsonl" 2>"$TMP/stderr.txt" \ + && CODE="$?" || CODE="$?" +test "$CODE" = "1" || exit 1 + +cat << EOF > "$TMP/expected.txt" +fail: $(realpath "$TMP")/instance.jsonl (entry #0) + +{ + "foo": 1 +} + +error: Schema validation failure + The target document is expected to be of the given type + at instance location "" + at evaluate path "/type" +EOF + +diff "$TMP/stderr.txt" "$TMP/expected.txt" diff --git a/test/validate/fail_jsonl_all_verbose.sh b/test/validate/fail_jsonl_all_verbose.sh new file mode 100755 index 0000000..89e3fea --- /dev/null +++ b/test/validate/fail_jsonl_all_verbose.sh @@ -0,0 +1,41 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << 'EOF' > "$TMP/schema.json" +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "array" +} +EOF + +cat << 'EOF' > "$TMP/instance.jsonl" +{ "foo": 1 } +{ "foo": 2 } +{ "foo": 3 } +EOF + +"$1" validate "$TMP/schema.json" "$TMP/instance.jsonl" --verbose 2>"$TMP/stderr.txt" \ + && CODE="$?" || CODE="$?" +test "$CODE" = "1" || exit 1 + +cat << EOF > "$TMP/expected.txt" +Interpreting input as JSONL +fail: $(realpath "$TMP")/instance.jsonl (entry #0) + +{ + "foo": 1 +} + +error: Schema validation failure + The target document is expected to be of the given type + at instance location "" + at evaluate path "/type" +EOF + +diff "$TMP/stderr.txt" "$TMP/expected.txt" diff --git a/test/validate/fail_jsonl_invalid_entry.sh b/test/validate/fail_jsonl_invalid_entry.sh new file mode 100755 index 0000000..044bfa3 --- /dev/null +++ b/test/validate/fail_jsonl_invalid_entry.sh @@ -0,0 +1,32 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << 'EOF' > "$TMP/schema.json" +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object" +} +EOF + +cat << 'EOF' > "$TMP/instance.jsonl" +{ "foo": "first" } +{ "foo" "second" } +{ "foo": "third" } +EOF + +"$1" validate "$TMP/schema.json" "$TMP/instance.jsonl" 2>"$TMP/stderr.txt" \ + && CODE="$?" || CODE="$?" +test "$CODE" = "1" || exit 1 + +cat << EOF > "$TMP/expected.txt" +error: Failed to parse the JSON document at line 2 and column 10 + $(realpath "$TMP")/instance.jsonl +EOF + +diff "$TMP/stderr.txt" "$TMP/expected.txt" diff --git a/test/validate/fail_jsonl_one.sh b/test/validate/fail_jsonl_one.sh new file mode 100755 index 0000000..42b1e96 --- /dev/null +++ b/test/validate/fail_jsonl_one.sh @@ -0,0 +1,42 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << 'EOF' > "$TMP/schema.json" +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object" +} +EOF + +cat << 'EOF' > "$TMP/instance.jsonl" +{ "foo": 1 } +[ { "foo": 2 } ] +{ "foo": 3 } +EOF + +"$1" validate "$TMP/schema.json" "$TMP/instance.jsonl" 2>"$TMP/stderr.txt" \ + && CODE="$?" || CODE="$?" +test "$CODE" = "1" || exit 1 + +cat << EOF > "$TMP/expected.txt" +fail: $(realpath "$TMP")/instance.jsonl (entry #1) + +[ + { + "foo": 2 + } +] + +error: Schema validation failure + The target document is expected to be of the given type + at instance location "" + at evaluate path "/type" +EOF + +diff "$TMP/stderr.txt" "$TMP/expected.txt" diff --git a/test/validate/fail_jsonl_one_verbose.sh b/test/validate/fail_jsonl_one_verbose.sh new file mode 100755 index 0000000..57f2064 --- /dev/null +++ b/test/validate/fail_jsonl_one_verbose.sh @@ -0,0 +1,45 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << 'EOF' > "$TMP/schema.json" +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object" +} +EOF + +cat << 'EOF' > "$TMP/instance.jsonl" +{ "foo": 1 } +[ { "foo": 2 } ] +{ "foo": 3 } +EOF + +"$1" validate "$TMP/schema.json" "$TMP/instance.jsonl" --verbose 2>"$TMP/stderr.txt" \ + && CODE="$?" || CODE="$?" +test "$CODE" = "1" || exit 1 + +cat << EOF > "$TMP/expected.txt" +Interpreting input as JSONL +ok: $(realpath "$TMP")/instance.jsonl (entry #0) + matches $(realpath "$TMP")/schema.json +fail: $(realpath "$TMP")/instance.jsonl (entry #1) + +[ + { + "foo": 2 + } +] + +error: Schema validation failure + The target document is expected to be of the given type + at instance location "" + at evaluate path "/type" +EOF + +diff "$TMP/stderr.txt" "$TMP/expected.txt" diff --git a/test/validate/pass_jsonl.sh b/test/validate/pass_jsonl.sh new file mode 100755 index 0000000..97b846f --- /dev/null +++ b/test/validate/pass_jsonl.sh @@ -0,0 +1,32 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << 'EOF' > "$TMP/schema.json" +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "properties": { + "foo": { + "type": "string" + } + } +} +EOF + +cat << 'EOF' > "$TMP/instance.jsonl" +{ "foo": "first" } +{ "foo": "second" } +{ "foo": "third" } +EOF + +"$1" validate "$TMP/schema.json" "$TMP/instance.jsonl" 2> "$TMP/output.txt" 1>&2 + +cat << EOF > "$TMP/expected.txt" +EOF + +diff "$TMP/output.txt" "$TMP/expected.txt" diff --git a/test/validate/pass_jsonl_verbose.sh b/test/validate/pass_jsonl_verbose.sh new file mode 100755 index 0000000..cc055d1 --- /dev/null +++ b/test/validate/pass_jsonl_verbose.sh @@ -0,0 +1,39 @@ +#!/bin/sh + +set -o errexit +set -o nounset + +TMP="$(mktemp -d)" +clean() { rm -rf "$TMP"; } +trap clean EXIT + +cat << 'EOF' > "$TMP/schema.json" +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "properties": { + "foo": { + "type": "string" + } + } +} +EOF + +cat << 'EOF' > "$TMP/instance.jsonl" +{ "foo": "first" } +{ "foo": "second" } +{ "foo": "third" } +EOF + +"$1" validate "$TMP/schema.json" "$TMP/instance.jsonl" --verbose 2> "$TMP/output.txt" 1>&2 + +cat << EOF > "$TMP/expected.txt" +Interpreting input as JSONL +ok: $(realpath "$TMP")/instance.jsonl (entry #0) + matches $(realpath "$TMP")/schema.json +ok: $(realpath "$TMP")/instance.jsonl (entry #1) + matches $(realpath "$TMP")/schema.json +ok: $(realpath "$TMP")/instance.jsonl (entry #2) + matches $(realpath "$TMP")/schema.json +EOF + +diff "$TMP/output.txt" "$TMP/expected.txt"