From 8bfdc51588c58724ec754ed739cbce476ed9221b Mon Sep 17 00:00:00 2001 From: "Fuchs, Stefan" Date: Sat, 15 Oct 2022 10:18:53 +0200 Subject: [PATCH 1/3] added Json Schema for dtspec yaml format --- dtspec/schema/dtspec-schema.json | 361 +++++++++++++++++++++++++++++++ 1 file changed, 361 insertions(+) create mode 100644 dtspec/schema/dtspec-schema.json diff --git a/dtspec/schema/dtspec-schema.json b/dtspec/schema/dtspec-schema.json new file mode 100644 index 0000000..6820b73 --- /dev/null +++ b/dtspec/schema/dtspec-schema.json @@ -0,0 +1,361 @@ +{ + "$id": "https://schema.insidetrack.org/dtspec/0.1.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "JSON Schema for dtspec test scenario definitions", + "title": "DtspecSchema", + "additionalProperties": false, + "type": "object", + "definitions": { + "identifier_map": { + "type": "array", + "description": "Mapping of identifiers to table columns", + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "column", + "identifier" + ], + "properties": { + "column": { + "type": "string", + "description": "Name of the table column the identifier will be mapped to" + }, + "identifier": { + "type": "object", + "additionalProperties": false, + "description": "The identifier being mapped to the table", + "properties": { + "name": { + "type": "string", + "description": "Name of the identifier to map to that column" + }, + "attribute": { + "type": "string", + "description": "The identifiers attribute that refers to the generator to use (e.g. integer or string)" + } + } + } + } + } + }, + "data": { + "type": "array", + "description": "The input data that will be inserted into each source table before the test run", + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "source", + "table" + ], + "properties": { + "source": { + "type": "string", + "description": "Name of a source defined in the sources list where the data should be inserted." 
+ }, + "table": { + "type": "string", + "description": "Input data formatted as markdown table" + } + } + } + } + }, + "properties": { + "version": { + "enum": [ + "0.1" + ], + "description": "dtspec specification schema version" + }, + "description": { + "type": "string", + "description": "Short text describing the purpose of the test scenario" + }, + "identifiers": { + "type": "array", + "description": "List of available identifiers. Each identifier can define multiple generators, e.g. to generate both a string and an integer that refer to the same entity.", + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "identifier", + "attributes" + ], + "properties": { + "identifier": { + "type": "string", + "description": "An identifier tells dtspec which columns should be used to identify a record as belonging to a case" + }, + "attributes": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "field", + "generator" + ], + "properties": { + "field": { + "type": "string", + "description": "The field of an identifier is used to later link the generator to a source and target table column." + }, + "generator": { + "enum": [ + "unique_integer", + "unique_string" + ], + "description": "Defines if the generator produces either integer or string values" + }, + "prefix": { + "type": "string", + "description": "Optional argument for unique_string can help troubleshoot testing issues" + } + } + } + } + } + } + }, + "sources": { + "type": "array", + "description": "Declares the source tables available in the test scenarios and optionally defines the mapping of identifiers to table columns.", + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "source" + ], + "properties": { + "source": { + "type": "string", + "description": "The name of the source table" + }, + "description": { + "type": "string", + "description": "Short note describing the source." 
+ }, + "identifier_map": { + "$ref": "#/definitions/identifier_map" + }, + "defaults": { + "type": "array", + "description": "Source columns can be given defaults. This can be useful when a source field shouldn't be blank and needs some sensible default.", + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "column", + "value" + ], + "properties": { + "column": { + "type": "string", + "description": "Name of the column the default should be set for" + }, + "value": { + "type": "string", + "description": "The default value to set" + } + } + } + } + } + } + }, + "targets": { + "type": "array", + "description": "Declares the target tables available in the test scenarios and optionally defines the mapping of identifiers to table columns.", + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "target" + ], + "properties": { + "target": { + "type": "string", + "description": "The name of the target table" + }, + "description": { + "type": "string", + "description": "Short note describing the target." + }, + "identifier_map": { + "$ref": "#/definitions/identifier_map" + } + } + } + }, + "factories": { + "type": "array", + "description": "Declares the target tables available in the test scenarios and optionally defines the mapping of identifiers to table columns.", + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "factory", + "data" + ], + "properties": { + "factory": { + "type": "string", + "description": "The name of the factory" + }, + "description": { + "type": "string", + "description": "Short note describing the factory." + }, + "data": { + "$ref": "#/definitions/data" + }, + "parents": { + "type": "array", + "description": "List of parent factories. 
Allows to add additional data definitions from another factory specification to this factory.", + "items": { + "type": "string" + } + } + } + } + }, + "scenarios": { + "type": "array", + "description": "The list of test scenario definitions. Scenarios are collections of cases that share some common data factory or describe similar situations.", + "items": { + "type": "object", + "description": "A scenario definition with multiple test cases and optional references to factories.", + "additionalProperties": false, + "required": [ + "scenario", + "cases" + ], + "properties": { + "scenario": { + "type": "string", + "description": "Name of the test scenario" + }, + "description": { + "type": "string", + "description": "A short description of the test scenario" + }, + "factory": { + "type": "object", + "additionalProperties": false, + "required": [ + "parents" + ], + "properties": { + "parents": { + "type": "array", + "description": "List of factory names (defined under 'factories') that should be used to generate source data.", + "items": { + "type": "string" + } + } + } + }, + "cases": { + "type": "array", + "description": "A scenario can contain multiple test cases. 
The cases are stacked, so the user can load this stacked data into their data transformation system once, runs the data transformations once, and then collects the resulting output once.", + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "case", + "expected" + ], + "properties": { + "case": { + "type": "string", + "description": "Name of the test case" + }, + "description": { + "type": "string", + "description": "Short description of the purpose of the test case" + }, + "factory": { + "type": "object", + "description": "Test data generated for this specific case.", + "additionalProperties": false, + "required": [ + "data" + ], + "properties": { + "data": { + "$ref": "#/definitions/data" + } + } + }, + "expected": { + "type": "object", + "description": "The expected output data.", + "additionalProperties": false, + "required": [ + "data" + ], + "properties": { + "data": { + "type": "array", + "description": "List of target tables that should be validated after the test run.", + "items": { + "type": "object", + "description": "", + "additionalProperties": false, + "required": [ + "target", + "table" + ], + "properties": { + "target": { + "type": "string", + "description": "Name of the target table." + }, + "table": { + "type": "string", + "description": "The expected output data after the test run as markdown formatted table." + }, + "by": { + "type": "array", + "description": "List of column names to order the expected and actual output by to ensure a defined order for the comparison.", + "items": { + "type": "string" + } + }, + "values": { + "type": "array", + "description": "Mapping of constant values to columns that are expected in every row", + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "column", + "value" + ], + "properties": { + "column": { + "type": "string", + "description": "Table column name where the value is expected." 
+ }, + "value": { + "type": "string", + "description": "The value expected in every row of that column." + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } +} From 71f50205a990d16bf026e2600907996cf4936cab Mon Sep 17 00:00:00 2001 From: "Fuchs, Stefan" Date: Tue, 18 Oct 2022 18:02:45 +0200 Subject: [PATCH 2/3] modified Json Schema for dtspec yaml format based on SCHEMA in api.py --- dtspec/schema/dtspec-schema.json | 282 ++++++++++++++++--------------- 1 file changed, 150 insertions(+), 132 deletions(-) diff --git a/dtspec/schema/dtspec-schema.json b/dtspec/schema/dtspec-schema.json index 6820b73..4224483 100644 --- a/dtspec/schema/dtspec-schema.json +++ b/dtspec/schema/dtspec-schema.json @@ -1,21 +1,21 @@ { "$id": "https://schema.insidetrack.org/dtspec/0.1.json", "$schema": "https://json-schema.org/draft/2020-12/schema", - "description": "JSON Schema for dtspec test scenario definitions", - "title": "DtspecSchema", - "additionalProperties": false, - "type": "object", + "title": "Data Test Studio API spec", + "description": "Data Test Studio API spec", "definitions": { "identifier_map": { "type": "array", "description": "Mapping of identifiers to table columns", + "minItems": 1, + "uniqueItems": true, "items": { "type": "object", - "additionalProperties": false, "required": [ "column", "identifier" ], + "additionalProperties": false, "properties": { "column": { "type": "string", @@ -23,8 +23,12 @@ }, "identifier": { "type": "object", - "additionalProperties": false, "description": "The identifier being mapped to the table", + "required": [ + "name", + "attribute" + ], + "additionalProperties": false, "properties": { "name": { "type": "string", @@ -39,16 +43,17 @@ } } }, - "data": { + "factory_data": { "type": "array", - "description": "The input data that will be inserted into each source table before the test run", + "minItems": 1, + "uniqueItems": true, "items": { "type": "object", - "additionalProperties": false, "required": [ "source", "table" ], + 
"additionalProperties": false, "properties": { "source": { "type": "string", @@ -57,16 +62,91 @@ "table": { "type": "string", "description": "Input data formatted as markdown table" + }, + "values": { + "$ref": "#/definitions/column_values" + } + } + } + }, + "column_values": { + "type": "array", + "minItems": 1, + "uniqueItems": true, + "items": { + "type": "object", + "required": [ + "column", + "value" + ], + "additionalProperties": false, + "properties": { + "column": { + "type": "string", + "description": "Name of the column the default should be set for" + }, + "value": { + "type": [ + "string", + "null" + ], + "description": "The default value to set" + } + } + } + }, + "expected": { + "type": "object", + "required": [ + "data" + ], + "additionalProperties": false, + "description": "The expected output data.", + "properties": { + "data": { + "type": "array", + "minItems": 1, + "uniqueItems": true, + "description": "List of target tables that should be validated after the test run.", + "items": { + "type": "object", + "required": [ + "target" + ], + "additionalProperties": false, + "properties": { + "target": { + "type": "string", + "description": "Name of the target table." + }, + "table": { + "type": "string", + "description": "The expected output data after the test run as markdown formatted table." 
+ }, + "values": { + "$ref": "#/definitions/column_values", + "description": "Mapping of constant values to columns that are expected in every row" + }, + "by": { + "type": "array", + "description": "List of column names to order the expected and actual output by to ensure a defined order for the comparison.", + "items": { + "type": "string" + } + }, + "compare_via": { + "type": "string" + } + } } } } } }, + "type": "object", "properties": { "version": { - "enum": [ - "0.1" - ], + "type": "string", "description": "dtspec specification schema version" }, "description": { @@ -75,14 +155,16 @@ }, "identifiers": { "type": "array", + "minItems": 1, + "uniqueItems": true, "description": "List of available identifiers. Each identifier can define multiple generators, e.g. to generate both a string and an integer that refer to the same entity.", "items": { "type": "object", - "additionalProperties": false, "required": [ "identifier", "attributes" ], + "additionalProperties": false, "properties": { "identifier": { "type": "string", @@ -90,28 +172,27 @@ }, "attributes": { "type": "array", + "minItems": 1, + "uniqueItems": true, "items": { "type": "object", - "additionalProperties": false, "required": [ "field", "generator" ], + "additionalProperties": true, "properties": { "field": { "type": "string", "description": "The field of an identifier is used to later link the generator to a source and target table column." }, "generator": { - "enum": [ - "unique_integer", - "unique_string" - ], + "type": "string", "description": "Defines if the generator produces either integer or string values" }, "prefix": { "type": "string", - "description": "Optional argument for unique_string can help troubleshoot testing issues" + "description": "Optional argument for unique_string. 
Can help troubleshoot testing issues" } } } @@ -121,84 +202,71 @@ }, "sources": { "type": "array", + "minItems": 1, + "uniqueItems": true, "description": "Declares the source tables available in the test scenarios and optionally defines the mapping of identifiers to table columns.", "items": { "type": "object", - "additionalProperties": false, "required": [ "source" ], + "additionalProperties": false, "properties": { "source": { "type": "string", "description": "The name of the source table" }, - "description": { - "type": "string", - "description": "Short note describing the source." + "defaults": { + "$ref": "#/definitions/column_values", + "description": "Source columns can be given defaults. This can be useful when a source field shouldn't be blank and needs some sensible default." }, "identifier_map": { "$ref": "#/definitions/identifier_map" }, - "defaults": { - "type": "array", - "description": "Source columns can be given defaults. This can be useful when a source field shouldn't be blank and needs some sensible default.", - "items": { - "type": "object", - "additionalProperties": false, - "required": [ - "column", - "value" - ], - "properties": { - "column": { - "type": "string", - "description": "Name of the column the default should be set for" - }, - "value": { - "type": "string", - "description": "The default value to set" - } - } - } + "description": { + "type": "string", + "description": "Short note describing the source." 
 } } } }, "targets": { "type": "array", + "minItems": 1, + "uniqueItems": true, "description": "Declares the target tables available in the test scenarios and optionally defines the mapping of identifiers to table columns.", "items": { "type": "object", - "additionalProperties": false, "required": [ "target" ], + "additionalProperties": false, "properties": { "target": { "type": "string", "description": "The name of the target table" }, + "identifier_map": { + "$ref": "#/definitions/identifier_map" + }, "description": { "type": "string", "description": "Short note describing the target." - }, - "identifier_map": { - "$ref": "#/definitions/identifier_map" } } } }, "factories": { "type": "array", + "minItems": 1, + "uniqueItems": true, "description": "Declares the target tables available in the test scenarios and optionally defines the mapping of identifiers to table columns.", "items": { "type": "object", - "additionalProperties": false, "required": [ - "factory", - "data" + "factory" ], + "additionalProperties": false, "properties": { "factory": { "type": "string", @@ -208,30 +276,32 @@ "type": "string", "description": "Short note describing the factory." }, - "data": { - "$ref": "#/definitions/data" - }, "parents": { "type": "array", "description": "List of parent factories. Allows to add additional data definitions from another factory specification to this factory.", "items": { "type": "string" } + }, + "data": { + "$ref": "#/definitions/factory_data" } } } }, "scenarios": { "type": "array", + "minItems": 1, + "uniqueItems": true, "description": "The list of test scenario definitions. 
Scenarios are collections of cases that share some common data factory or describe similar situations.", "items": { "type": "object", "description": "A scenario definition with multiple test cases and optional references to factories.", - "additionalProperties": false, "required": [ "scenario", "cases" ], + "additionalProperties": false, "properties": { "scenario": { "type": "string", @@ -242,31 +312,29 @@ "description": "A short description of the test scenario" }, "factory": { - "type": "object", - "additionalProperties": false, - "required": [ - "parents" - ], - "properties": { - "parents": { - "type": "array", - "description": "List of factory names (defined under 'factories') that should be used to generate source data.", - "items": { - "type": "string" - } + "parents": { + "type": "array", + "description": "List of factory names (defined under 'factories') that should be used to generate source data.", + "items": { + "type": "string" } + }, + "data": { + "$ref": "#/definitions/factory_data" } }, "cases": { "type": "array", + "minItems": 1, + "uniqueItems": true, "description": "A scenario can contain multiple test cases. 
The cases are stacked, so the user can load this stacked data into their data transformation system once, runs the data transformations once, and then collects the resulting output once.", "items": { "type": "object", - "additionalProperties": false, "required": [ "case", "expected" ], + "additionalProperties": false, "properties": { "case": { "type": "string", @@ -278,84 +346,34 @@ }, "factory": { "type": "object", - "description": "Test data generated for this specific case.", - "additionalProperties": false, "required": [ "data" ], + "additionalProperties": false, + "description": "Test data generated for this specific case.", "properties": { "data": { - "$ref": "#/definitions/data" + "$ref": "#/definitions/factory_data" } } }, "expected": { - "type": "object", - "description": "The expected output data.", - "additionalProperties": false, - "required": [ - "data" - ], - "properties": { - "data": { - "type": "array", - "description": "List of target tables that should be validated after the test run.", - "items": { - "type": "object", - "description": "", - "additionalProperties": false, - "required": [ - "target", - "table" - ], - "properties": { - "target": { - "type": "string", - "description": "Name of the target table." - }, - "table": { - "type": "string", - "description": "The expected output data after the test run as markdown formatted table." - }, - "by": { - "type": "array", - "description": "List of column names to order the expected and actual output by to ensure a defined order for the comparison.", - "items": { - "type": "string" - } - }, - "values": { - "type": "array", - "description": "Mapping of constant values to columns that are expected in every row", - "items": { - "type": "object", - "additionalProperties": false, - "required": [ - "column", - "value" - ], - "properties": { - "column": { - "type": "string", - "description": "Table column name where the value is expected." 
- }, - "value": { - "type": "string", - "description": "The value expected in every row of that column." - } - } - } - } - } - } - } - } + "$ref": "#/definitions/expected" } } } } } } + }, + "metadata": { + "type": "object" } - } + }, + "required": [ + "version", + "sources", + "scenarios" + ], + "additionalProperties": false } From c0b1ec27f4bab23e5115d05a21ad4149372d4caf Mon Sep 17 00:00:00 2001 From: "Fuchs, Stefan" Date: Wed, 19 Oct 2022 10:34:31 +0200 Subject: [PATCH 3/3] load schema from file --- dtspec/api.py | 211 ++------------------------------------------------ 1 file changed, 6 insertions(+), 205 deletions(-) diff --git a/dtspec/api.py b/dtspec/api.py index 93f208a..b8b07e1 100644 --- a/dtspec/api.py +++ b/dtspec/api.py @@ -1,215 +1,16 @@ +import os + import networkx import jsonschema +import yaml from colorama import Fore, Style from dtspec.core import Identifier, Factory, Source, Target, Scenario, Case from dtspec.expectations import DataExpectation -SCHEMA = { - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Data Test Studio API spec", - "description": "Data Test Studio API spec", - "definitions": { - "identifier_map": { - "type": "array", - "minItems": 1, - "uniqueItems": True, - "items": { - "type": "object", - "required": ["column", "identifier"], - "additionalProperties": False, - "properties": { - "column": {"type": "string"}, - "identifier": { - "type": "object", - "required": ["name", "attribute"], - "additionalProperties": False, - "properties": { - "name": {"type": "string"}, - "attribute": {"type": "string"}, - }, - }, - }, - }, - }, - "factory_data": { - "type": "array", - "minItems": 1, - "uniqueItems": True, - "items": { - "type": "object", - "required": ["source", "table"], - "additionalProperties": False, - "properties": { - "source": {"type": "string"}, - "table": {"type": "string"}, - "values": {"$ref": "#/definitions/column_values"}, - }, - }, - }, - "column_values": { - "type": "array", - "minItems": 1, - 
"uniqueItems": True, - "items": { - "type": "object", - "required": ["column", "value"], - "additionalProperties": False, - "properties": { - "column": {"type": "string"}, - "value": {"type": ["string", "null"]}, - }, - }, - }, - "expected": { - "type": "object", - "required": ["data"], - "additionalProperties": False, - "properties": { - "data": { - "type": "array", - "minItems": 1, - "uniqueItems": True, - "items": { - "type": "object", - "required": ["target"], - "additionalProperties": False, - "properties": { - "target": {"type": "string"}, - "table": {"type": "string"}, - "values": {"$ref": "#/definitions/column_values"}, - "by": {"type": "array", "items": {"type": "string"}}, - "compare_via": {"type": "string"}, - }, - }, - } - }, - }, - }, - "type": "object", - "properties": { - "version": {"type": "string"}, - "description": {"type": "string"}, - "identifiers": { - "type": "array", - "minItems": 1, - "uniqueItems": True, - "items": { - "type": "object", - "required": ["identifier", "attributes"], - "additionalProperties": False, - "properties": { - "identifier": {"type": "string"}, - "attributes": { - "type": "array", - "minItems": 1, - "uniqueItems": True, - "items": { - "type": "object", - "required": ["field", "generator"], - "additionalProperties": True, - "properties": { - "field": {"type": "string"}, - "generator": {"type": "string"}, - }, - }, - }, - }, - }, - }, - "sources": { - "type": "array", - "minItems": 1, - "uniqueItems": True, - "items": { - "type": "object", - "required": ["source"], - "addtionalProperties": False, - "properties": { - "source": {"type": "string"}, - "defaults": {"$ref": "#/definitions/column_values"}, - "identifier_map": {"$ref": "#/definitions/identifier_map"}, - "description": {"type": "string"}, - }, - }, - }, - "targets": { - "type": "array", - "minItems": 1, - "uniqueItems": True, - "items": { - "type": "object", - "required": ["target"], - "addtionalProperties": False, - "properties": { - "target": {"type": 
"string"}, - "identifier_map": {"$ref": "#/definitions/identifier_map"}, - "description": {"type": "string"}, - }, - }, - }, - "factories": { - "type": "array", - "minItems": 1, - "uniqueItems": True, - "items": { - "type": "object", - "required": ["factory"], - "additionalProperties": False, - "properties": { - "factory": {"type": "string"}, - "description": {"type": "string"}, - "parents": {"type": "array", "items": {"type": "string"}}, - "data": {"$ref": "#/definitions/factory_data"}, - }, - }, - }, - "scenarios": { - "type": "array", - "minItems": 1, - "uniqueItems": True, - "items": { - "type": "object", - "required": ["scenario", "cases"], - "additionalProperties": False, - "properties": { - "scenario": {"type": "string"}, - "description": {"type": "string"}, - "factory": { - "parents": {"type": "array", "items": {"type": "string"}}, - "data": {"$ref": "#/definitions/factory_data"}, - }, - "cases": { - "type": "array", - "minItems": 1, - "uniqueItems": True, - "items": { - "type": "object", - "required": ["case", "expected"], - "additionalProperties": False, - "properties": { - "case": {"type": "string"}, - "description": {"type": "string"}, - "factory": { - "type": "object", - "required": ["data"], - "additionalProperties": False, - "properties": { - "data": {"$ref": "#/definitions/factory_data"} - }, - }, - "expected": {"$ref": "#/definitions/expected"}, - }, - }, - }, - }, - }, - }, - "metadata": {"type": "object"}, - }, - "required": ["version", "sources", "scenarios"], - "additionalProperties": False, -} + +with open(os.path.dirname(os.path.realpath(__file__)) + "/schema/dtspec-schema.json", "r") as stream: + SCHEMA = yaml.safe_load(stream) class ApiValidationError(Exception):