From ccca299c8b88de442fd7421452d927f7e5c9f61b Mon Sep 17 00:00:00 2001 From: owenlittlejohns Date: Fri, 13 Sep 2024 09:02:27 -0600 Subject: [PATCH] TRT-552 - Implement configuration file schema v1.0.0 * TRT-553 - Read all group metadata attributes * TRT-554 - Flatten overrides and supplements * TRT-555 - Remove CF_Supplements * TRT-556 - Remove ProductEpochs and Grid_Mapping_Data * TRT-556 - Rename CFOverrides to MetadataOverrides --- CHANGELOG.md | 43 +- README.md | 14 +- VERSION | 2 +- ...arthdata_varinfo_configuration_schema.json | 154 ++++++ config/1.0.0/sample_config_1.0.0.json | 504 ++++++++++++++++++ config/CHANGELOG.md | 41 ++ docs/earthdata-varinfo.ipynb | 9 +- tests/unit/data/test_config.json | 310 ++++++----- tests/unit/test_attribute_container.py | 240 +++++++++ tests/unit/test_cf_config.py | 133 +++-- tests/unit/test_group.py | 121 +++++ tests/unit/test_utilities.py | 262 +++++++++ tests/unit/test_var_info.py | 135 +++-- tests/unit/test_variable.py | 45 +- varinfo/attribute_container.py | 156 ++++++ varinfo/cf_config.py | 195 +++---- varinfo/cmr_search.py | 16 +- varinfo/generate_umm_var.py | 8 +- varinfo/group.py | 84 +++ varinfo/umm_var.py | 34 +- varinfo/utilities.py | 144 ++++- varinfo/var_info.py | 231 ++++---- varinfo/variable.py | 192 ++----- 23 files changed, 2335 insertions(+), 738 deletions(-) create mode 100644 config/1.0.0/earthdata_varinfo_configuration_schema.json create mode 100644 config/1.0.0/sample_config_1.0.0.json create mode 100644 tests/unit/test_attribute_container.py create mode 100644 tests/unit/test_group.py create mode 100644 varinfo/attribute_container.py create mode 100644 varinfo/group.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 88f306e..7f011c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,46 @@ +## v3.0.0 +### 2024-09-11 + +The configuration file schema for `earthdata-varinfo` is significantly updated +in this release. 
For more information, see the release notes for schema v1.0.0 +in `config/CHANGELOG.md`. + +### Added: + +* Groups within a NetCDF-4 or DMR file are now assigned to the `VarInfo*.groups` +dictionary, allowing for their metadata attributes to be accessed after parsing +an input file. + +### Changed: + +* `CFConfig.get_cf_attributes` has been renamed `CFConfig.get_metadata_overrides`, + as there are now only overrides to be returned from this method. Calls to + `CFConfig.get_metadata_overrides` now _must_ specify a variable path. All + overrides from a configuration file for a given collection are now retrievable + from the newly public `CFConfig.metadata_overrides` class attribute. +* Metadata overrides retrieved for a matching file path are ordered such that + the most specific applicable override to the variable takes precedence. For + example, when requesting the value of the "units" metadata attribute for + variable "/nested/variable", an applicability rule that exactly matches this + variable path will take precedence over rules matching to either the group, + or all variables in the file. +* Handling of nested `Applicability_Groups` has been removed from the `CFConfig` + class, as the configuration file no longer nests these items in overrides. + +### Removed: + +* `CFConfig._cf_supplements` has been deprecated in favour of specifying all + in-file metadata changes via a `MetadataOverrides` item (formerly + `CFOverrides`) instead. ## v2.3.0 ### 2024-08-26 -The VarInfoBase.get_missing_variable_attributes method has been added to allow +The `VarInfoBase.get_missing_variable_attributes` method has been added to allow someone to get metadata attributes from the configuration file for variables that are absent from a file. An example usage is when a CF Convention grid mapping variable is missing from a source file. 
-The VarInfoBase.get_references_for_attribute method has been added to retrieve +The `VarInfoBase.get_references_for_attribute` method has been added to retrieve all unique variable references contained in a single metadata attribute for a list of variables. For example, retrieving all references listed under the coordinates metadata attribute. @@ -14,8 +48,9 @@ coordinates metadata attribute. ## v2.2.2 ### 2024-07-16 -The generate_collection_umm_var function in earthdata-varinfo updated to support an -optional kwarg 'config_file=' for a configuration file, to be able to override known metadata errors. +The `generate_collection_umm_var` function in earthdata-varinfo updated to +support an optional kwarg `config_file` for a configuration file, to be able to +override known metadata errors. ## v2.2.1 diff --git a/README.md b/README.md index 58da649..12dc641 100644 --- a/README.md +++ b/README.md @@ -24,13 +24,14 @@ attributes. from varinfo import CFConfig cf_config = CFConfig('ICESat2', 'ATL03', config_file='config/0.0.1/sample_config_0.0.1.json') -cf_attributes = cf_config.get_cf_attributes('/full/variable/path') +metadata_attributes = cf_config.get_metadata_attributes('/full/variable/path') ``` ### VarInfo -A group of classes that contain the relations between all variables within a -single granule. Current classes include: +A group of classes that contain metadata attributes for all groups and +variables in a single granule, and the relations between all variables within +that granule. Current classes include: * VarInfoBase: An abstract base class that contains core logic and methods used by the child classes that parse different sources of granule information. @@ -66,9 +67,10 @@ var_info.get_spatial_dimensions({'/path/to/science/variable'}) The `VarInfoFromDmr` and `VarInfoFromNetCDF4` classes also have an optional argument `short_name`, which can be used upon instantiation to specify the -short name of the collection to which the granule belongs. 
This option is to be -used when a granule does not contain the collection short name within its -metadata global attributes (e.g., ABoVE collections from ORNL). +short name of the collection to which the granule belongs. This option is the +preferred way to specify a collection short name, and particularly encouraged +for use when a granule does not contain the collection short name within its +metadata attributes (e.g., ABoVE collections from ORNL). ``` var_info = VarInfoFromDmr('/path/to/local/file.dmr', short_name='ATL03') diff --git a/VERSION b/VERSION index 276cbf9..4a36342 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.3.0 +3.0.0 diff --git a/config/1.0.0/earthdata_varinfo_configuration_schema.json b/config/1.0.0/earthdata_varinfo_configuration_schema.json new file mode 100644 index 0000000..2f263ae --- /dev/null +++ b/config/1.0.0/earthdata_varinfo_configuration_schema.json @@ -0,0 +1,154 @@ +{ + "$schema": "http://json-schema.org/draft/2020-12/schema", + "title": "earthdata-varinfo configuration file", + "description": "A schema for the configuration file used by earthdata-varinfo to augment CF-Convention metadata in granules.", + "type": "object", + "additionalProperties": false, + "properties": { + "Identification": { + "description": "A description indicating the tool for which earthdata-varinfo and this configuration file will be used.", + "type": "string", + "minLength": 1 + }, + "Version": { + "description": "A numeric identifier for the version of the specific configuration file (not the schema version itself).", + "type": "integer" + }, + "CollectionShortNamePath": { + "description": "A list of HDF metadata attribute paths that provide the shortname value of the collection for the data file being processed. 
Processed in the listed order.", + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + }, + "Mission": { + "description": "A set of mission names that are defined for matching short name values.", + "type": "object", + "additionalProperties": { + "type": "string" + } + }, + "ExcludedScienceVariables": { + "description": "VarInfo classes currently assume that any variable that has a grid mapping attribute, or has a spatial or temporal dimension and is not itself a dimension or bounds variable, should be treated as a science variable. This may not be true in all cases, and so ExcludedScienceVariables provide a method to denote non-science variables that might otherwise be incorrectly identified.", + "type": "array", + "items": { + "$ref": "#/$defs/MissionVariablePatternType" + } + }, + "RequiredVariables": { + "description": "# VarInfo classes will calculate a set of required variables for a given science variable. This setting imposes additional contents for the required variables list.", + "type": "array", + "items": { + "$ref": "#/$defs/MissionVariablePatternType" + } + }, + "MetadataOverrides": { + "description": "# For cases where CF references do not exist, or are invalid. For example, variables that have no dimension references in the HDF-5 file contents", + "type": "array", + "items": { + "$ref": "#/$defs/MetadataOverridesItemType" + } + } + }, + "required": ["Identification", "Version", "CollectionShortNamePath", "Mission"], + "$defs": { + "ApplicabilityType": { + "description": "An object that specifies a combination of satellite mission, collection short name and variable patterns to which a set of attributes should be applied. At least one of those properties must be specified.", + "type": "object", + "properties": { + "Mission": { + "description": "The name of a mission to which the attributes can be applied. 
This mission name should match one listed in the Mission mapping of this schema.", + "type": "string" + }, + "ShortNamePath": { + "description": "The short name for the collection to which a granule belongs.", + "type": "string" + }, + "VariablePattern": { + "description": "A regular expression identifying all variables to which the schema item should be applied.", + "type": "string" + } + }, + "anyOf": [{ + "required": ["Mission"] + }, { + "required": ["ShortNamePath"] + }], + "additionalProperties": false + }, + "AttributesItemType": { + "description": "An object that includes the name and value that should be used to either extend or overwrite a metadata attribute for applicable variables.", + "type": "object", + "properties": { + "Name": { + "description": "The metadata attribute name.", + "type": "string" + }, + "Value": { + "description": "The overriding metadata attribute value. The value specified in the configuration file will replace the corresponding metadata value in any applicable source file.", + "anyOf": [{ + "type": ["number", "string"] + }, { + "type": "array", + "items": { + "type": "number" + } + }] + } + }, + "required": ["Name", "Value"], + "additionalProperties": false + }, + "AttributesType": { + "description": "A list of metadata attributes to be updated for variables identified by the applicability rule.", + "type": "array", + "items": { + "$ref": "#/$defs/AttributesItemType" + } + }, + "MissionVariablePatternType": { + "description": "An object that defines a list of variables, as strings or regular expressions, that should be considered as either required variables or excluded as science variables for a given collection.", + "type": "object", + "properties": { + "Applicability": { + "description": "The mission and/or collection short name to which the list of required variables or excluded variables should be applied.", + "$ref": "#/$defs/ApplicabilityType" + }, + "VariablePattern": { + "description": "A list of variable strings or 
regular expression patterns that should match variables to be excluded or required for a given collection or mission.", + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["Applicability", "VariablePattern"], + "additionalProperties": false + }, + "MetadataOverridesItemType": { + "description": "An item that details one or more metadata attributes to overwrite according to the supplied applicability rules.", + "type": "object", + "properties": { + "_Description": { + "description": "Explains the purpose and effect of these overrides.", + "type": "string" + }, + "Applicability": { + "description": "An applicability rule that indicates which groups and variables within a file a metadata override should apply to. If only a short name and/or mission is provided, the override will apply to all groups and variables. If a VariablePattern is also provided, the override is applied only to those groups or variables whose paths match the regular expression of the VariablePattern.", + "$ref": "#/$defs/ApplicabilityType" + }, + "Attributes" : { + "description": "Metadata attributes to override for variables or groups that match the mission, short name and/or VariablePattern criteria specified in the Applicability of this object.", + "type": "array", + "items": { + "description": "A list of metadata attributes with their names and values.", + "$ref": "#/$defs/AttributesItemType" + } + } + }, + "additionalProperties": false, + "required": ["Applicability", "Attributes"] + } + } +} diff --git a/config/1.0.0/sample_config_1.0.0.json b/config/1.0.0/sample_config_1.0.0.json new file mode 100644 index 0000000..75ce2a2 --- /dev/null +++ b/config/1.0.0/sample_config_1.0.0.json @@ -0,0 +1,504 @@ +{ + "Identification": "varinfo_sample_config", + "Version": 15, + "CollectionShortNamePath": [ + "/HDF5_GLOBAL/short_name", + "/NC_GLOBAL/short_name", + "/Metadata/DatasetIdentification/shortName", + "/METADATA/DatasetIdentification/shortName", + 
"/Metadata/SeriesIdentification/shortName", + "/METADATA/SeriesIdentification/shortName", + "/HDF5_GLOBAL/id", + "/NC_GLOBAL/id", + "short_name" + ], + "Mission": { + "ATL\\d{2}": "ICESat2", + "GEDI_L[1234][AB]|GEDI0[1234]_[AB]": "GEDI", + "SPL[1234].+": "SMAP", + "VIIRS_NPP-.+-L2P": "VIIRS_PO" + }, + "ExcludedScienceVariables": [ + { + "Applicability": { + "Mission": "ICESat2" + }, + "VariablePattern": [ + "/quality_assessment/.*", + "/orbit_info/.*", + "/atlas_impulse_response/.*" + ] + } + ], + "RequiredVariables": [ + { + "Applicability": { + "Mission": "GEDI" + }, + "VariablePattern": [ + ".*shot_number" + ] + } + ], + "MetadataOverrides": [ + { + "Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL4.*" + }, + "Attributes": [ + { + "Name": "grid_mapping", + "Value": "/EASE2_global_projection" + } + ] + }, + { + "Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL4.*", + "VariablePattern": "/EASE2_global_projection" + }, + "Attributes": [ + { + "Name": "false_easting", + "Value": 0.0 + }, + { + "Name": "false_northing", + "Value": 0.0 + }, + { + "Name": "grid_mapping_name", + "Value": "lambert_cylindrical_equal_area" + }, + { + "Name": "longitude_of_central_meridian", + "Value": 0.0 + }, + { + "Name": "standard_parallel", + "Value": 30.0 + } + ] + }, + { + "Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL3FT(P|P_E)", + "VariablePattern": "(?i).*global.*" + }, + "Attributes": [ + { + "Name": "grid_mapping", + "Value": "/EASE2_global_projection" + } + ] + }, + { + "Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL3FT(P|P_E)", + "VariablePattern": "/EASE2_global_projection" + }, + "Attributes": [ + { + "Name": "false_easting", + "Value": 0.0 + }, + { + "Name": "false_northing", + "Value": 0.0 + }, + { + "Name": "grid_mapping_name", + "Value": "lambert_cylindrical_equal_area" + }, + { + "Name": "longitude_of_central_meridian", + "Value": 0.0 + }, + { + "Name": "standard_parallel", + "Value": 30.0 + } + ] + }, + { + 
"Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL3FT(P|P_E)", + "VariablePattern": "(?i).*polar.*" + }, + "Attributes": [ + { + "Name": "grid_mapping", + "Value": "/EASE2_polar_projection" + } + ] + }, + { + "Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL3FT(P|P_E)", + "VariablePattern": "/EASE2_polar_projection" + }, + "Attributes": [ + { + "Name": "false_easting", + "Value": 0.0 + }, + { + "Name": "false_northing", + "Value": 0.0 + }, + { + "Name": "grid_mapping_name", + "Value": "lambert_azimuthal_equal_area" + }, + { + "Name": "latitude_of_projection_origin", + "Value": 90.0 + }, + { + "Name": "longitude_of_projection_origin", + "Value": 0.0 + } + ] + }, + { + "Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL3FTA" + }, + "Attributes": [ + { + "Name": "grid_mapping", + "Value": "/EASE2_polar_projection" + } + ] + }, + { + "Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL3FTA", + "VariablePattern": "/EASE2_polar_projection" + }, + "Attributes": [ + { + "Name": "false_easting", + "Value": 0.0 + }, + { + "Name": "false_northing", + "Value": 0.0 + }, + { + "Name": "grid_mapping_name", + "Value": "lambert_azimuthal_equal_area" + }, + { + "Name": "latitude_of_projection_origin", + "Value": 90.0 + }, + { + "Name": "longitude_of_projection_origin", + "Value": 0.0 + } + ] + }, + { + "Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL3FT(A|P|P_E)", + "VariablePattern": "/Freeze_Thaw_Retrieval_Data_Polar/(latitude|longitude).*" + }, + "Attributes": [ + { + "Name": "_fill", + "Value": "-9999" + } + ] + }, + { + "Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL3SMP_E", + "VariablePattern": "/$" + }, + "Attributes": [ + { + "Name": "Data_Organization", + "Value": "h5_grid" + } + ] + }, + { + "Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL3SMP_E", + "VariablePattern": "/Soil_Moisture_Retrieval_Data_AM/.*" + }, + "Attributes": [ + { + "Name": "coordinates", + "Value": 
"/Soil_Moisture_Retrieval_Data_AM/latitude, /Soil_Moisture_Retrieval_Data_AM/longitude" + } + ] + }, + { + "Applicability": { + "Mission": "SMAP", + "ShortNamePath": "SPL3SMP_E", + "VariablePattern": "/Soil_Moisture_Retrieval_Data_PM/.*" + }, + "Attributes": [ + { + "Name": "coordinates", + "Value": "/Soil_Moisture_Retrieval_Data_PM/latitude, /Soil_Moisture_Retrieval_Data_PM/longitude" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL20", + "VariablePattern": "/daily/day\\d{2}/.+" + }, + "Attributes": [ + { + "Name": "ancillary_variables", + "Value": "./delta_time_beg, ./delta_time_end" + }, + { + "Name": "coordinates", + "Value": "../../grid_x ../../grid_y" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL0[3-9]|ATL1[023]", + "VariablePattern": "/$" + }, + "Attributes": [ + { + "Name": "Data_Organization", + "Value": "h5_trajectory" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL0[3-9]|ATL1[023]", + "VariablePattern": "/gt[123][lr]/geolocation/.*" + }, + "Attributes": [ + { + "Name": "ancillary_variables", + "Value": "podppd_flag" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL03", + "VariablePattern": "/gt[123][lr]/geophys_corr/.*" + }, + "Attributes": [ + { + "Name": "subset_control_variables", + "Value": "../geolocation/delta_time, ../geolocation/reference_photon_lat, ../geolocation/reference_photon_lon" + }, + { + "Name": "subset_control_type", + "Value": "coordinates" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL03", + "VariablePattern": "/gt[123][lr]/heights/.*" + }, + "Attributes": [ + { + "Name": "subset_control_variables", + "Value": "../geolocation/ph_index_beg, ../geolocation/segment_ph_cnt" + }, + { + "Name": "subset_control_type", + "Value": "fwd_segment_index" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL03", + 
"VariablePattern": "/gt[123][lr]/geolocation/ph_index_beg" + }, + "Attributes": [ + { + "Name": "subset_control_variable_type", + "Value": "segment_index_beg" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL03", + "VariablePattern": "/gt[123][lr]/geolocation/ph_index_beg" + }, + "Attributes": [ + { + "Name": "subset_control_variable_type", + "Value": "segment_index_cnt" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL08", + "VariablePattern": "/gt[123][lr]/signal_photons/.*" + }, + "Attributes": [ + { + "Name": "subset_control_variables", + "Value": "../land_segments/ph_ndx_beg, ../land_segments/n_seg_ph" + }, + { + "Name": "subset_control_type", + "Value": "fwd_segment_index" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL08", + "VariablePattern": "/gt[123][lr]/land_segments/ph_ndx_beg" + }, + "Attributes": [ + { + "Name": "subset_control_variable_type", + "Value": "segment_index_beg" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL08", + "VariablePattern": "/gt[123][lr]/land_segments/n_seg_ph" + }, + "Attributes": [ + { + "Name": "subset_control_variable_type", + "Value": "segment_index_cnt" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL1[67]", + "VariablePattern": "/$" + }, + "Attributes": [ + { + "Name": "Data_Organization", + "Value": "h5_grid" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL20", + "VariablePattern": "/$" + }, + "Attributes": [ + { + "Name": "Data_Organization", + "Value": "h5_grid" + } + ] + }, + { + "Applicability": { + "Mission": "ICESat2", + "ShortNamePath": "ATL20", + "VariablePattern": ".*" + }, + "Attributes": [ + { + "Name": "coordinates", + "Value": "/crs" + } + ] + }, + { + "Applicability": { + "Mission": "GEDI", + "ShortNamePath": "GEDI_L[1234][AB]|GEDI0[1234]_[AB]", + "VariablePattern": "/$" + }, + 
"Attributes": [ + { + "Name": "Data_Organization", + "Value": "h5_trajectory" + } + ] + }, + { + "Applicability": { + "Mission": "GEDI", + "ShortNamePath": "GEDI_L2B|GEDI02_B", + "VariablePattern": "/BEAM[\\d]+/pgap_theta_z" + }, + "Attributes": [ + { + "Name": "subset_control_variables", + "Value": "rx_sample_start_index, rx_sample_count" + }, + { + "Name": "subset_control_type", + "Value": "fwd_segment_index" + } + ] + }, + { + "Applicability": { + "Mission": "GEDI", + "ShortNamePath": "GEDI_L2B|GEDI02_B", + "VariablePattern": "/BEAM[\\d]+/rx_sample_start_index" + }, + "Attributes": [ + { + "Name": "subset_control_variable_type", + "Value": "segment_index_beg" + } + ] + }, + { + "Applicability": { + "Mission": "GEDI", + "ShortNamePath": "GEDI_L2B|GEDI02_B", + "VariablePattern": "/BEAM[\\d]+/rx_sample_count" + }, + "Attributes": [ + { + "Name": "subset_control_variable_type", + "Value": "segment_index_cnt" + } + ] + } + ] +} diff --git a/config/CHANGELOG.md b/config/CHANGELOG.md index ac0b7e1..ace955e 100644 --- a/config/CHANGELOG.md +++ b/config/CHANGELOG.md @@ -3,6 +3,47 @@ This change log preserves the changes made between different schema versions of the earthdata-varinfo configuration file. +## 1.0.0 +### 2024-09-11 + +**TRT-552** Implementing a simpler `earthdata-varinfo` configuration file schema. + +This version of the configuration file schema makes several significant changes +to simplify the schema for broader use. + +### Changed: + +* `CF_Overrides` has been renamed to `MetadataOverrides`. +* `CFOverridesOrSupplementsItemType` has been renamed to + `MetadataOverridesItemType`. +* `Required_Fields` has been renamed to `RequiredVariables` to match the + terminology used for other schema properties. +* The casing of all attributes in the configuration file schema has been + updated to be `PascalCase` throughout. Affected schema attributes include: + * `CollectionShortNamePath`. + * `ExcludedScienceVariables`. 
+ +### Removed: + +* The `CF_Supplements` property of the schema has been removed. All metadata + changes must now be specified in a `MetadataOverrides` item (formerly + `CF_Overrides`). +* The `Global_Attributes` property has been removed from `MetadataOverrides`. + Global attribute overrides should now be specified in the same way as + metadata attributes on any other variable or group. That is: specifying the + path to the group in the `Applicability` part of the item, and including the + updated attributes under the `Attributes` property of the item. This allows + metadata overrides to all groups, not just the global attributes in an input + file. +* The `Applicability_Group` property within `MetadataOverrides` (formerly + `CF_Overrides`) has been removed. Now attributes should only be specified + within the `Attributes` property at the root level of an override or + supplement. The `Applicability` of a `MetadataOverrides` item must now + include either a `Mission` or a `ShortNamePath`. +* The unused `ProductEpochs` section of the schema has been removed. +* The `Grid_Mapping_Data` section of the schema has been removed, in favour of + specifying grid mapping attributes via `MetadataOverrides`. 
+ ## 0.0.1 ### 2023-01-09 diff --git a/docs/earthdata-varinfo.ipynb b/docs/earthdata-varinfo.ipynb index 742f879..2235568 100644 --- a/docs/earthdata-varinfo.ipynb +++ b/docs/earthdata-varinfo.ipynb @@ -26,7 +26,7 @@ "* Parses and extracts variable metadata from within source granules.\n", "* Also parses relationships between variables, primarily using dimension information or CF-Convention attributes expected to contain such information.\n", "* Variable classification via CF-Convention-based heuristics (e.g., using `units` and other attributes).\n", - "* Metadata can be supplemented or overwritten with a configuration file (which has a fully defined JSON schema).\n", + "* Metadata can be overwritten with a configuration file (which has a fully defined JSON schema).\n", "* Extensible for further input formats (uses abstract base classes).\n", "* Common Metadata Repository ([CMR](https://www.earthdata.nasa.gov/eosdis/science-system-description/eosdis-components/cmr)) compliant UMM-Var JSON generation.\n", "\n", @@ -73,7 +73,12 @@ "\n", "**VariableFromNetCDF4 and VariableFromDMR:**\n", "\n", - "A representation of a single granule. Extracts metadata attributes from the input source and fully qualifies references to other variables (to allow determination of relationships with other variables).\n", + "A representation of a single variable within the granule. Extracts metadata attributes from the input source and fully qualifies references to other variables (to allow determination of relationships with other variables).\n", + "\n", + "\n", + "**GroupFromNetCDF4 and GroupFromDMR:**\n", + "\n", + "A representation of a single group within the granule, including the root group. Extracts metadata attributes from the input source and fully qualifies references to variables. 
These classes also have a `variables` attribute that lists all child variables in the group.\n", "\n", "**CFConfig:**\n", "\n", diff --git a/tests/unit/data/test_config.json b/tests/unit/data/test_config.json index be20924..7080426 100644 --- a/tests/unit/data/test_config.json +++ b/tests/unit/data/test_config.json @@ -1,223 +1,241 @@ { "Identification": "var_subsetter_config", - "Version": 12, - "Collection_ShortName_Path": [ + "Version": 13, + "CollectionShortNamePath": [ "/HDF5_GLOBAL/short_name", "/NC_GLOBAL/short_name", "/Metadata/DatasetIdentification/shortName", "/METADATA/DatasetIdentification/shortName", "/Metadata/SeriesIdentification/shortName", "/METADATA/SeriesIdentification/shortName", + "short_name", "/id" ], "Mission": { "FAKE\\d{2}": "FakeSat", - "ATL03": "ICESat2" + "ATL03": "ICESat2", + "GEDI_L[1234][AB]|GEDI0[1234]_[AB]": "GEDI", + "SPL[1234].+": "SMAP", + "VIIRS_NPP-.+-L2P": "VIIRS_PO" }, - "Excluded_Science_Variables": [ + "ExcludedScienceVariables": [ { "Applicability": { "Mission": "FakeSat" }, - "Variable_Pattern": [ + "VariablePattern": [ "/exclude_one/.*", "/exclude_two/.*", "/exclude_three/.*" ] } ], - "Required_Fields": [ + "RequiredVariables": [ { "Applicability": { "Mission": "FakeSat" }, - "Variable_Pattern": [ + "VariablePattern": [ "/required_group/.*" ] } ], - "ProductEpochs": [ + "MetadataOverrides": [ { "Applicability": { - "Mission": "FakeSat" + "Mission": "FakeSat", + "ShortNamePath": "FAKE99" }, - "Epoch": "2005-01-01T00:00:00.000000" - } - ], - "Grid_Mapping_Data": [ - { - "Grid_Mapping_Dataset_Name": "EASE2_Global", - "grid_mapping_name": "lambert_cylindrical_equal_area", - "standard_parallel": 30.0, - "longitude_of_central_meridian": 0.0, - "false_easting": 0.0, - "false_northing": 0.0 + "Attributes": [ + { + "Name": "collection_override", + "Value": "collection value" + } + ] }, - { - "Grid_Mapping_Dataset_Name": "EASE2_Polar", - "grid_mapping_name": "lambert_azimuthal_equal_area", - "longitude_of_projection_origin": 0.0, 
- "latitude_of_projection_origin": 90.0, - "false_easting": 0.0, - "false_northing": 0.0 - } - ], - "CF_Overrides": [ { "Applicability": { "Mission": "FakeSat", - "ShortNamePath": "FAKE99" + "ShortNamePath": "FAKE99", + "VariablePattern": "/group/.*" }, "Attributes": [ { - "Name": "collection_override", - "Value": "collection value" + "Name": "group_override", + "Value": "group value" } - ], - "Applicability_Group": [ - { - "Applicability": { - "Variable_Pattern": "/group/.*" - }, - "Attributes": [ - { - "Name": "group_override", - "Value": "group value" - } - ] - }, + ] + }, + { + "Applicability": { + "Mission": "FakeSat", + "ShortNamePath": "FAKE99", + "VariablePattern": "/$" + }, + "Attributes": [ { - "Applicability": { - "ShortNamePath": "FAKE99", - "Variable_Pattern": "/group/variable" - }, - "Attributes": [ - { - "Name": "variable_override", - "Value": "variable value" - } - ] - }, + "Name": "global_override", + "Value": "GLOBAL" + } + ] + }, + { + "Applicability": { + "Mission": "FakeSat", + "ShortNamePath": "FAKE99", + "VariablePattern": "/group/variable" + }, + "Attributes": [ { - "Applicability": { - "Variable_Pattern": "/coordinates_group/.*" - }, - "Attributes": [ - { - "Name": "coordinates", - "Value": "lat, lon" - } - ] - }, + "Name": "variable_override", + "Value": "variable value" + } + ] + }, + { + "Applicability": { + "Mission": "FakeSat", + "ShortNamePath": "FAKE99", + "VariablePattern": "/coordinates_group/.*" + }, + "Attributes": [ { - "Applicability": { - "Variable_Pattern": "/absent_variable" - }, - "Attributes": [ - { - "Name": "extra_override", - "Value": "overriding value" - } - ] + "Name": "coordinates", + "Value": "lat, lon" } - ], - "Global_Attributes": [ + ] + }, + { + "Applicability": { + "Mission": "FakeSat", + "ShortNamePath": "FAKE99", + "VariablePattern": "/absent_variable" + }, + "Attributes": [ { - "Name": "global_override", - "Value": "GLOBAL" + "Name": "extra_override", + "Value": "overriding value" } ] }, { "Applicability": { 
"Mission": "FakeSat", - "ShortNamePath": "FAKE98" - }, - "Applicability_Group": [ - { - "Applicability": { - "Variable_Pattern": "/group2/.*" - }, - "Attributes": [ - { - "Name": "other_collection", - "Value": "canopy_height" - } - ] + "ShortNamePath": "FAKE98", + "VariablePattern": "/group2/.*" + }, + "Attributes": [ + { + "Name": "other_collection", + "Value": "canopy_height" } ] }, { "Applicability": { "Mission": "FakeSat2", - "ShortNamePath": "FAKE99" + "ShortNamePath": "FAKE99", + "VariablePattern": "/group3/.*" }, - "Applicability_Group": [ - { - "Applicability": { - "Variable_Pattern": "/group3/.*" - }, - "Attributes": [ - { - "Name": "other_mission", - "Value": "sea_surface_temperature" - } - ] + "Attributes": [ + { + "Name": "other_mission", + "Value": "sea_surface_temperature" } ] - } - ], - "CF_Supplements": [ + }, { "Applicability": { "Mission": "FakeSat", - "ShortNamePath": "FAKE99" + "ShortNamePath": "FAKE97", + "VariablePattern": "/.*" }, "Attributes": [ { - "Name": "collection_supplement", - "Value": "FAKE99 supplement" + "Name": "conflicting_attribute_global_and_group", + "Value": "applies to all" } - ], - "Applicability_Group": [ - { - "Applicability": { - "Variable_Pattern": "/group4/.*" - }, - "Attributes": [ - { - "Name": "group_supplement", - "Value": "FAKE99 group4" - } - ] - }, + ] + }, + { + "Applicability": { + "Mission": "FakeSat", + "ShortNamePath": "FAKE97", + "VariablePattern": "/group/.*" + }, + "Attributes": [ { - "Applicability": { - "Variable_Pattern": "/absent_variable" - }, - "Attributes": [ - { - "Name": "extra_override", - "Value": "supplemental value" - } - ] + "Name": "conflicting_attribute_global_and_group", + "Value": "applies to /group/.*" }, { - "Applicability": { - "Variable_Pattern": "/absent_supplement" - }, - "Attributes": [ - { - "Name": "extra_supplement", - "Value": "supplemental value" - } - ] + "Name": "conflicting_attribute_group_and_variable", + "Value": "applies to /group/.*" } - ], - "Global_Attributes": [ + 
] + }, + { + "Applicability": { + "Mission": "FakeSat", + "ShortNamePath": "FAKE97", + "VariablePattern": "/group/variable" + }, + "Attributes": [ + { + "Name": "conflicting_attribute_group_and_variable", + "Value": "applies to /group/variable" + } + ] + }, + { + "Applicability": { + "Mission": "FakeSat", + "ShortNamePath": "FAKE97", + "VariablePattern": "/other_group/variable" + }, + "Attributes": [ + { + "Name": "test_depth_priority_over_string_length", + "Value": "applies to /other_group/variable" + } + ] + }, + { + "Applicability": { + "Mission": "FakeSat", + "ShortNamePath": "FAKE97", + "VariablePattern": "/(other_group|long|regex|things).*" + }, + "Attributes": [ + { + "Name": "test_depth_priority_over_string_length", + "Value": "applies to lots of groups" + } + ] + }, + { + "Applicability": { + "Mission": "FakeSat", + "ShortNamePath": "FAKE97", + "VariablePattern": "/string_length/variable" + }, + "Attributes": [ + { + "Name": "test_string_length_same_depth", + "Value": "applies to /string_length/variable" + } + ] + }, + { + "Applicability": { + "Mission": "FakeSat", + "ShortNamePath": "FAKE97", + "VariablePattern": "/string_length/var.*" + }, + "Attributes": [ { - "Name": "fakesat_global_supplement", - "Value": "fakesat value" + "Name": "test_string_length_same_depth", + "Value": "applies to /string_length/var.*" } ] } diff --git a/tests/unit/test_attribute_container.py b/tests/unit/test_attribute_container.py new file mode 100644 index 0000000..59674ac --- /dev/null +++ b/tests/unit/test_attribute_container.py @@ -0,0 +1,240 @@ +from shutil import rmtree +from tempfile import mkdtemp +from unittest import TestCase +import xml.etree.ElementTree as ET + +from netCDF4 import Dataset + +from varinfo.attribute_container import ( + AttributeContainerFromDmr, + AttributeContainerFromNetCDF4, +) +from varinfo.cf_config import CFConfig + +from tests.utilities import netcdf4_global_attributes, write_skeleton_netcdf4 + + +class 
TestAttributeContainerFromDmr(TestCase): + """Tests to ensure the `AttributeContainerFromDmr` class instantiates + correctly. This is the superclass for `GroupFromDmr` and + `VariableFromDmr`. + + """ + + @classmethod + def setUpClass(cls): + """Set up test fixtures that can be reused between tests.""" + cls.config_file = 'tests/unit/data/test_config.json' + cls.fakesat_config = CFConfig('FakeSat', 'FAKE99', config_file=cls.config_file) + cls.namespace = 'namespace_string' + cls.dmr_group = ET.fromstring( + f'<{cls.namespace}Group name="science_group">' + f' <{cls.namespace}Attribute name="coordinates">' + f' <{cls.namespace}Value>lat_for_group lon_for_group' + f' ' + f'' + ) + cls.group_path = '/science_group' + cls.dmr_variable = ET.fromstring( + f'<{cls.namespace}Float64 name="variable">' + f' <{cls.namespace}Attribute name="units" type="String">' + f' <{cls.namespace}Value>m' + f' ' + f'' + ) + cls.variable_path = '/group/variable' + + def test_instantiation_for_group(self): + """Ensure an `AttributeContainerFromDmr` can be created from a group.""" + expected_attributes = { + 'collection_override': 'collection value', + 'coordinates': 'lat_for_group lon_for_group', + } + + container = AttributeContainerFromDmr( + self.dmr_group, + self.fakesat_config, + self.namespace, + self.group_path, + ) + + self.assertEqual(container.namespace, self.namespace) + self.assertEqual(container.full_name_path, self.group_path) + self.assertDictEqual( + container.attributes, + expected_attributes, + ) + + def test_instantiation_for_variable(self): + """Ensure an `AttributeContainerFromDmr` can be created from a variable.""" + expected_attributes = { + 'collection_override': 'collection value', + 'group_override': 'group value', + 'units': 'm', + 'variable_override': 'variable value', + } + + container = AttributeContainerFromDmr( + self.dmr_variable, + self.fakesat_config, + self.namespace, + self.variable_path, + ) + + self.assertEqual(container.namespace, self.namespace) + 
self.assertEqual(container.full_name_path, self.variable_path) + self.assertDictEqual( + container.attributes, + expected_attributes, + ) + + def test_get_attribute_value(self): + """Ensure attribute values can be correctly retrieved.""" + container = AttributeContainerFromDmr( + self.dmr_group, + self.fakesat_config, + self.namespace, + self.group_path, + ) + + with self.subTest('Value from DMR'): + self.assertEqual( + container.get_attribute_value('coordinates'), + 'lat_for_group lon_for_group', + ) + + with self.subTest('Override from CFConfig'): + self.assertEqual( + container.get_attribute_value('collection_override'), + 'collection value', + ) + + +class TestAttributeContainerFromNetCDF4(TestCase): + """Tests to ensure the `AttributeContainerFromNetCDF4` class instantiates + correctly. This is the superclass for `GroupFromNetCDF4` and + `VariableFromNetCDF4`. + + """ + + @classmethod + def setUpClass(cls): + """Set up properties of the class that do not need to reset between + tests. + + """ + cls.config_file = 'tests/unit/data/test_config.json' + cls.fakesat_config = CFConfig('FakeSat', 'FAKE99', config_file=cls.config_file) + cls.namespace = 'namespace string' + + def setUp(self): + """Set up properties to be reset for every test.""" + self.output_dir = mkdtemp() + + def tearDown(self): + """Perform clean-up after every test.""" + rmtree(self.output_dir) + + def test_instantatiation_for_root_group(self): + """Ensure an `AttributeContainerFromNetCDF4` can be created from a + group. 
+ + """ + netcdf4_path = write_skeleton_netcdf4(self.output_dir) + expected_attributes = { + 'collection_override': 'collection value', + 'global_override': 'GLOBAL', + **netcdf4_global_attributes, + } + + with Dataset(netcdf4_path) as dataset: + container = AttributeContainerFromNetCDF4( + dataset, + self.fakesat_config, + self.namespace, + dataset.path, + ) + + self.assertEqual(container.namespace, self.namespace) + self.assertEqual(container.full_name_path, '/') + self.assertDictEqual( + container.attributes, + expected_attributes, + ) + + def test_instantatiation_for_nested_group(self): + """Ensure an `AttributeContainerFromNetCDF4` can be created from a + group. + + """ + netcdf4_path = write_skeleton_netcdf4(self.output_dir) + expected_attributes = { + 'collection_override': 'collection value', + } + + with Dataset(netcdf4_path) as dataset: + container = AttributeContainerFromNetCDF4( + dataset['/group'], + self.fakesat_config, + self.namespace, + dataset['/group'].path, + ) + + self.assertEqual(container.namespace, self.namespace) + self.assertEqual(container.full_name_path, '/group') + self.assertDictEqual( + container.attributes, + expected_attributes, + ) + + def test_instantiation_for_variable(self): + """Ensure an `AttributeContainerFromNetCDF4` can be created from a + variable. 
+ + """ + netcdf4_path = write_skeleton_netcdf4(self.output_dir) + expected_attributes = { + 'collection_override': 'collection value', + 'coordinates': '/lat /lon', + 'description': 'A science variable for testing', + 'group_override': 'group value', + } + + with Dataset(netcdf4_path) as dataset: + container = AttributeContainerFromNetCDF4( + dataset['/group/science2'], + self.fakesat_config, + self.namespace, + '/group/science2', + ) + + self.assertEqual(container.namespace, self.namespace) + self.assertEqual(container.full_name_path, '/group/science2') + self.assertDictEqual( + container.attributes, + expected_attributes, + ) + + def test_get_attribute_value(self): + """Ensure attribute values can be correctly retrieved.""" + netcdf4_path = write_skeleton_netcdf4(self.output_dir) + + with Dataset(netcdf4_path) as dataset: + container = AttributeContainerFromNetCDF4( + dataset['/group/science2'], + self.fakesat_config, + self.namespace, + '/group/science2', + ) + + with self.subTest('Value from NetCDF4 variable'): + self.assertEqual( + container.get_attribute_value('description'), + 'A science variable for testing', + ) + + with self.subTest('Override from CFConfig'): + self.assertEqual( + container.get_attribute_value('collection_override'), + 'collection value', + ) diff --git a/tests/unit/test_cf_config.py b/tests/unit/test_cf_config.py index 91ef58c..9d3b29e 100644 --- a/tests/unit/test_cf_config.py +++ b/tests/unit/test_cf_config.py @@ -21,27 +21,20 @@ def setUpClass(cls): cls.test_config = 'tests/unit/data/test_config.json' cls.mission = 'FakeSat' cls.short_name = 'FAKE99' - cls.excluded_science_variables = { + cls.expected_excluded_science_variables = { '/exclude_one/.*', '/exclude_two/.*', '/exclude_three/.*', } cls.required_variables = {'/required_group/.*'} - cls.global_overrides = {'global_override': 'GLOBAL'} - cls.global_supplements = {'fakesat_global_supplement': 'fakesat value'} - cls.cf_overrides = { + cls.expected_metadata_overrides = { '.*': 
{'collection_override': 'collection value'}, + '/$': {'global_override': 'GLOBAL'}, '/absent_variable': {'extra_override': 'overriding value'}, '/coordinates_group/.*': {'coordinates': 'lat, lon'}, '/group/.*': {'group_override': 'group value'}, '/group/variable': {'variable_override': 'variable value'}, } - cls.cf_supplements = { - '.*': {'collection_supplement': 'FAKE99 supplement'}, - '/absent_variable': {'extra_override': 'supplemental value'}, - '/absent_supplement': {'extra_supplement': 'supplemental value'}, - '/group4/.*': {'group_supplement': 'FAKE99 group4'}, - } def test_instantiation(self): """Ensure the attributes of an object are set upon class @@ -56,22 +49,15 @@ def test_instantiation(self): self.assertEqual(self.short_name, config.short_name) self.assertSetEqual( - self.excluded_science_variables, config.excluded_science_variables + self.expected_excluded_science_variables, + config.excluded_science_variables, ) self.assertSetEqual(self.required_variables, config.required_variables) - - self.assertDictEqual(self.global_overrides, config.global_overrides) - self.assertDictEqual(self.global_supplements, config.global_supplements) - - # The attributes below are protected-access within the class, however, - # this test should still check they only contain the expected items. 
- self.assertEqual( - self.cf_overrides, config._cf_overrides - ) # pylint: disable=W0212 - self.assertEqual( - self.cf_supplements, config._cf_supplements - ) # pylint: disable=W0212 + self.assertDictEqual( + self.expected_metadata_overrides, + config.metadata_overrides, + ) def test_instantiation_no_file(self): """Ensure an instance of the `CFConfig` class can be produced when no @@ -84,8 +70,7 @@ def test_instantiation_no_file(self): self.assertEqual(self.short_name, config.short_name) self.assertSetEqual(set(), config.excluded_science_variables) self.assertSetEqual(set(), config.required_variables) - self.assertDictEqual({}, config.global_overrides) - self.assertDictEqual({}, config.global_supplements) + self.assertDictEqual({}, config.metadata_overrides) def test_instantiation_missing_configuration_file(self): """Ensure a MissingConfigurationFileError is raised when a path to a @@ -107,75 +92,89 @@ def test_instantiation_invalid_configuration_file_format(self): config_file='tests/unit/data/ATL03_example.dmr', ) - def test_get_cf_attributes_all(self): - """Ensure the CFConfig.get_cf_references method returns all the - overriding and supplemental references from the class, in - dictionaries that are keyed on the variable pattern. - - """ - config = CFConfig(self.mission, self.short_name, self.test_config) - self.assertDictEqual( - config.get_cf_attributes(), - {'cf_overrides': self.cf_overrides, 'cf_supplements': self.cf_supplements}, - ) - - def test_get_cf_attributes_variable(self): - """Ensure the CFConfig.get_cf_references method returns all overriding - and supplemental attributes where the variable pattern matches the - supplied variable name. If multiple patterns match the variable - name, then all attributes from those patterns should be combined - into a single output dictionary. 
+ def test_get_metadata_overrides_variable(self): + """Ensure the CFConfig.get_metadata_overrides method returns all + overriding attributes where the variable pattern matches the supplied + variable or group name. If multiple patterns match the variable name, + then the pattern that is the most specific, as determined by a + combination of hierarchical depth in the regular expression and, + secondarily, the length of the variable pattern string, will be + returned. """ - collection_overrides = {'collection_override': 'collection value'} - group_overrides = { + expected_collection_overrides = {'collection_override': 'collection value'} + expected_group_overrides = { 'collection_override': 'collection value', 'group_override': 'group value', } - variable_overrides = { + expected_variable_overrides = { 'collection_override': 'collection value', 'group_override': 'group value', 'variable_override': 'variable value', } - collection_supplements = {'collection_supplement': 'FAKE99 supplement'} - group4_supplements = { - 'collection_supplement': 'FAKE99 supplement', - 'group_supplement': 'FAKE99 group4', - } - test_args = [ [ 'Collection only', 'random_variable', - collection_overrides, - collection_supplements, + expected_collection_overrides, ], [ 'Group overrides', '/group/random', - group_overrides, - collection_supplements, + expected_group_overrides, ], [ 'Variable overrides', '/group/variable', - variable_overrides, - collection_supplements, - ], - [ - 'Group supplements', - '/group4/variable', - collection_overrides, - group4_supplements, + expected_variable_overrides, ], ] config = CFConfig(self.mission, self.short_name, self.test_config) - for description, variable, overrides, supplements in test_args: + for description, variable, expected_overrides in test_args: with self.subTest(description): self.assertDictEqual( - config.get_cf_attributes(variable), - {'cf_overrides': overrides, 'cf_supplements': supplements}, + config.get_metadata_overrides(variable), + 
expected_overrides, ) + + def test_get_metadata_overrides_variable_conflicts(self): + """Ensure that if a variable matches multiple override rules that + specify conflicting values for a metadata attribute, the most specific + matching metadata attribute takes precedence. + + The primary measure of specificity is the depth of the variable + hierarchy specified in the VariablePattern, with secondary sorting + based upon the length of strings at the same hierarchy. + + """ + config = CFConfig(self.mission, 'FAKE97', self.test_config) + + with self.subTest('Deeper matches takes precedence over shallower'): + self.assertDictEqual( + config.get_metadata_overrides('/group/variable'), + { + 'conflicting_attribute_global_and_group': 'applies to /group/.*', + 'conflicting_attribute_group_and_variable': 'applies to /group/variable', + }, + ) + + with self.subTest('Depth takes precedence over string length'): + self.assertDictEqual( + config.get_metadata_overrides('/other_group/variable'), + { + 'conflicting_attribute_global_and_group': 'applies to all', + 'test_depth_priority_over_string_length': 'applies to /other_group/variable', + }, + ) + + with self.subTest('String length decides between matches of equal depth'): + self.assertDictEqual( + config.get_metadata_overrides('/string_length/variable'), + { + 'conflicting_attribute_global_and_group': 'applies to all', + 'test_string_length_same_depth': 'applies to /string_length/variable', + }, + ) diff --git a/tests/unit/test_group.py b/tests/unit/test_group.py new file mode 100644 index 0000000..41ae19a --- /dev/null +++ b/tests/unit/test_group.py @@ -0,0 +1,121 @@ +from shutil import rmtree +from tempfile import mkdtemp +from unittest import TestCase +import xml.etree.ElementTree as ET + +from netCDF4 import Dataset + +from varinfo.cf_config import CFConfig +from varinfo.group import GroupFromDmr, GroupFromNetCDF4 + +from tests.utilities import netcdf4_global_attributes, write_skeleton_netcdf4 + + +class 
TestGroupFromDmr(TestCase): + """Tests for the `Group` class using `xml.etree.ElementTree` input.""" + + @classmethod + def setUpClass(cls): + """Set up properties of the class that do not need to be reset between + tests. + + """ + cls.config_file = 'tests/unit/data/test_config.json' + cls.fakesat_config = CFConfig('FakeSat', 'FAKE99', config_file=cls.config_file) + cls.namespace = 'namespace_string' + cls.dmr_group = ET.fromstring( + f'<{cls.namespace}Group name="science_group">' + f' <{cls.namespace}Float64 name="variable_one">' + f' ' + f' <{cls.namespace}Float64 name="variable_two">' + f' ' + f' <{cls.namespace}Attribute name="coordinates">' + f' <{cls.namespace}Value>lat lon' + f' ' + f'' + ) + cls.group_path = '/science_group' + + def test_group_instantiation(self): + """Ensure a group can be created from an input DMR XML element. + + This group should contain the correct metadata attributes, accounting + for any metadata overrides from CFConfig. + + Child variables should be listed via the `variables` class attribute. + + """ + # First attribute comes from CFConfig, the second from the DMR. + expected_attributes = { + 'collection_override': 'collection value', + 'coordinates': 'lat lon', + } + + group = GroupFromDmr( + self.dmr_group, + self.fakesat_config, + self.namespace, + self.group_path, + ) + + # Class attributes from AttributeContainer: + self.assertEqual(group.namespace, self.namespace) + self.assertEqual(group.full_name_path, self.group_path) + self.assertDictEqual(group.attributes, expected_attributes) + + # Group specific class attributes: + self.assertSetEqual( + group.variables, + {'/science_group/variable_one', '/science_group/variable_two'}, + ) + + +class TestGroupFromNetCDF4(TestCase): + """Tests for the `Group` class using NetCDF-4 input.""" + + @classmethod + def setUpClass(cls): + """Set up properties of the class that do not need to be reset between + tests. 
+ + """ + cls.config_file = 'tests/unit/data/test_config.json' + cls.fakesat_config = CFConfig('FakeSat', 'FAKE99', config_file=cls.config_file) + cls.namespace = 'namespace string' + + def setUp(self): + """Set up properties to be reset for every test.""" + self.output_dir = mkdtemp() + + def tearDown(self): + """Perform clean-up after every test.""" + rmtree(self.output_dir) + + def test_group_instantiation(self): + """Ensure a group can be created from an input NetCDF-4 file.""" + expected_attributes = { + 'collection_override': 'collection value', + 'global_override': 'GLOBAL', + **netcdf4_global_attributes, + } + + netcdf4_path = write_skeleton_netcdf4(self.output_dir) + + with Dataset(netcdf4_path) as dataset: + group = GroupFromNetCDF4( + dataset, + self.fakesat_config, + self.namespace, + dataset.path, + ) + + # Class attributes from AttributeContainer: + self.assertEqual(group.namespace, self.namespace) + self.assertEqual(group.full_name_path, '/') + self.assertDictEqual(group.attributes, expected_attributes) + + # Group specific class attributes + self.assertSetEqual( + group.variables, + {'/lat', '/lon', '/time', '/science1', '/scalar1'}, + ) diff --git a/tests/unit/test_utilities.py b/tests/unit/test_utilities.py index 99fd6fb..fa26f2d 100644 --- a/tests/unit/test_utilities.py +++ b/tests/unit/test_utilities.py @@ -1,17 +1,26 @@ +from shutil import rmtree +from tempfile import mkdtemp from typing import List from unittest import TestCase import xml.etree.ElementTree as ET +from netCDF4 import Dataset import numpy as np from varinfo.exceptions import DmrNamespaceError from varinfo.utilities import ( + get_full_path_netcdf4_attribute, + get_full_path_xml_attribute, get_xml_attribute, + get_xml_attribute_value, + get_xml_container_attribute, get_xml_namespace, recursive_get, split_attribute_path, ) +from tests.utilities import write_skeleton_netcdf4 + class TestUtilities(TestCase): """A class for testing functions in the varinfo.utilities module.""" @@ 
-20,6 +29,12 @@ class TestUtilities(TestCase): def setUpClass(cls): cls.namespace = 'namespace_string' + def setUp(self): + self.output_dir = mkdtemp() + + def tearDown(self): + rmtree(self.output_dir) + def test_recursive_get(self): """Can retrieve a nested dictionary value, or account for missing data. @@ -53,6 +68,7 @@ def test_split_attribute_path(self): '/Metadata/Series/short_name', ['Metadata', 'Series', 'short_name'], ], + ['Without leading slash', 'Metadata/Series', ['Metadata', 'Series']], ] for description, full_path, expected_key_list in test_args: @@ -103,15 +119,33 @@ def test_get_xml_attribute(self): f' <{self.namespace}Value>-90.0' f' <{self.namespace}Value>90.0' f' ' + f' <{self.namespace}Attribute name="named_container" type="Container">' + f' <{self.namespace}Attribute name="nested_one" type="Float64">' + f' <{self.namespace}Value>{value}' + f' ' + f' <{self.namespace}Attribute name="nested_two" type="Float64">' + f' <{self.namespace}Value>{value}' + f' ' + f' ' f'' ) + expected_container_outputs = { + 'nested_one': 12.0, + 'nested_two': 12.0, + } test_args = [ ['Element with Float64 attribute', 'valid_attr', value, np.float64], ['Absent Attribute uses default', 'missing_attr', default, type(default)], ['Attribute omitting type property', 'no_type', '12.0', str], ['Absent Value tag uses default', 'no_value', default, type(default)], ['Unexpected type property', 'bad_type', '12.0', str], + [ + 'Container attribute', + 'named_container', + expected_container_outputs, + dict, + ], ] for description, attr_name, expected_value, expected_type in test_args: @@ -132,3 +166,231 @@ def test_get_xml_attribute(self): for value in attribute_value: self.assertIsInstance(value, np.float64) + + def test_get_xml_attribute_value(self): + """Ensure a single or list value can be extracted from a given XML + Attribute tag. If there are no child Value tags, the default value + should be returned. 
+ + """ + with self.subTest('Single value is retrieved.'): + value = 'value_123' + attribute = ET.fromstring( + f' <{self.namespace}Attribute name="no_type">' + f' <{self.namespace}Value>{value}' + f' ' + ) + self.assertEqual( + get_xml_attribute_value(attribute, self.namespace, 'String', 'default'), + value, + ) + + with self.subTest('List of values are retrieved.'): + value_one = 'value_1' + value_two = 'value_2' + value_three = 'value_3' + attribute = ET.fromstring( + f' <{self.namespace}Attribute name="no_type">' + f' <{self.namespace}Value>{value_one}' + f' <{self.namespace}Value>{value_two}' + f' <{self.namespace}Value>{value_three}' + f' ' + ) + self.assertListEqual( + get_xml_attribute_value(attribute, self.namespace, 'String', 'default'), + [value_one, value_two, value_three], + ) + + with self.subTest('No values returns default.'): + default = 'default value' + attribute = ET.fromstring( + f' <{self.namespace}Attribute name="no_type">' + f' ' + ) + self.assertEqual( + get_xml_attribute_value(attribute, self.namespace, 'String', default), + default, + ) + + def test_get_xml_container_attribute(self): + """Ensure a dictionary of attributes is retrieved for a container.""" + + with self.subTest('Flat dictionary.'): + flat_container = ET.fromstring( + f'<{self.namespace}Attribute name="container" type="Container">' + f' <{self.namespace}Attribute name="attribute_one" type="Float64">' + f' <{self.namespace}Value>1.0' + f' ' + f' <{self.namespace}Attribute name="attribute_two" type="Float64">' + f' <{self.namespace}Value>2.0' + f' ' + f'' + ) + + expected_attribute_container = { + 'attribute_one': 1.0, + 'attribute_two': 2.0, + } + + self.assertDictEqual( + get_xml_container_attribute(flat_container, self.namespace), + expected_attribute_container, + ) + + with self.subTest('Nested dictionary.'): + nested_container = ET.fromstring( + f'<{self.namespace}Attribute name="named_container" type="Container">' + f' <{self.namespace}Attribute name="attribute_one" 
type="Float64">' + f' <{self.namespace}Value>1.0' + f' ' + f' <{self.namespace}Attribute name="group_one" type="Container">' + f' <{self.namespace}Attribute name="attribute_two" type="Float64">' + f' <{self.namespace}Value>2.0' + f' ' + f' <{self.namespace}Attribute name="group_two" type="Container">' + f' <{self.namespace}Attribute name="attribute_three" type="Float64">' + f' <{self.namespace}Value>3.0' + f' ' + f' ' + f' ' + f'' + ) + + expected_attribute_container = { + 'attribute_one': 1.0, + 'group_one': { + 'attribute_two': 2.0, + 'group_two': { + 'attribute_three': 3.0, + }, + }, + } + + self.assertDictEqual( + get_xml_container_attribute(nested_container, self.namespace), + expected_attribute_container, + ) + + def test_get_full_path_xml_attribute(self): + """Ensure an XML attribute nested to an arbitrary amount can have its + value retrieved from the element tree. + + """ + with open( + 'tests/unit/data/ATL03_example.dmr', 'r', encoding='utf-8' + ) as file_handler: + atl03_dmr = ET.fromstring(file_handler.read()) + + atl03_namespace = '{http://xml.opendap.org/ns/DAP/4.0#}' + + with self.subTest('Non nested attribute.'): + self.assertEqual( + get_full_path_xml_attribute( + atl03_dmr, + '/Conventions', + atl03_namespace, + ), + 'CF-1.6', + ) + + with self.subTest('No leading slash.'): + self.assertEqual( + get_full_path_xml_attribute( + atl03_dmr, + 'Conventions', + atl03_namespace, + ), + 'CF-1.6', + ) + + with self.subTest('Singly nested attribute.'): + self.assertEqual( + get_full_path_xml_attribute( + atl03_dmr, + '/gt1l/atlas_pce', + atl03_namespace, + ), + 'pce1', + ) + + with self.subTest('Deeply nested attribute.'): + self.assertEqual( + get_full_path_xml_attribute( + atl03_dmr, + '/gt1l/bckgrd_atlas/tlm_height_band1/coordinates', + atl03_namespace, + ), + 'delta_time', + ) + + with self.subTest('Attribute that does not exist returns None.'): + self.assertIsNone( + get_full_path_xml_attribute( + atl03_dmr, + '/NONEXISTENT', + atl03_namespace, + ) + ) 
+ + with self.subTest('Non-existent variable or group returns None.'): + self.assertIsNone( + get_full_path_xml_attribute( + atl03_dmr, + '/absent_attribute_container/units', + atl03_namespace, + ) + ) + + def test_get_full_path_netcdf4_attribute(self): + """Ensure a NetCDF-4 metadata attribute can be retrieved from anywhere + in the file. This includes the root group, nested groups, variables in + the root group and variables within groups. + + """ + netcdf_file_path = write_skeleton_netcdf4(self.output_dir) + + with self.subTest('Root group attribute.'): + with Dataset(netcdf_file_path) as dataset: + self.assertEqual( + get_full_path_netcdf4_attribute(dataset, '/short_name'), 'ATL03' + ) + + with self.subTest('Root group attribute without a leading slash.'): + with Dataset(netcdf_file_path) as dataset: + self.assertEqual( + get_full_path_netcdf4_attribute(dataset, 'short_name'), 'ATL03' + ) + + with self.subTest('Absent root group attribute returns None.'): + with Dataset(netcdf_file_path) as dataset: + self.assertIsNone( + get_full_path_netcdf4_attribute(dataset, 'missing_attribute'), + ) + + with self.subTest('Root-level variable.'): + with Dataset(netcdf_file_path) as dataset: + self.assertEqual( + get_full_path_netcdf4_attribute(dataset, '/science1/coordinates'), + '/lat /lon', + ) + + with self.subTest('Absent variable attribute returns None.'): + with Dataset(netcdf_file_path) as dataset: + self.assertIsNone( + get_full_path_netcdf4_attribute(dataset, '/science1/missing'), + ) + + with self.subTest('Attribute on non-existent variable returns None.'): + with Dataset(netcdf_file_path) as dataset: + self.assertIsNone( + get_full_path_netcdf4_attribute(dataset, '/science3/coordinates'), + ) + + with self.subTest('Variable in nested group.'): + with Dataset(netcdf_file_path) as dataset: + self.assertEqual( + get_full_path_netcdf4_attribute( + dataset, '/group/science2/coordinates' + ), + '/lat /lon', + ) diff --git a/tests/unit/test_var_info.py 
b/tests/unit/test_var_info.py index 8021233..5b7ea63 100644 --- a/tests/unit/test_var_info.py +++ b/tests/unit/test_var_info.py @@ -8,7 +8,7 @@ InvalidConfigFileFormatError, MissingConfigurationFileError, ) -from tests.utilities import netcdf4_global_attributes, write_dmr, write_skeleton_netcdf4 +from tests.utilities import write_dmr, write_skeleton_netcdf4 class TestVarInfoFromDmr(TestCase): @@ -20,9 +20,8 @@ def setUpClass(cls): tests. """ - cls.config_file = 'tests/unit/data/test_config.json' + cls.test_config_file = 'tests/unit/data/test_config.json' cls.namespace = 'namespace_string' - cls.sample_config = 'config/0.0.1/sample_config_0.0.1.json' cls.mock_geographic_dmr = 'tests/unit/data/mock_geographic.dmr' cls.mock_dmr_two = 'tests/unit/data/mock_dataset_two.dmr' cls.mock_geo_and_projected_dmr = 'tests/unit/data/mock_geo_and_projected.dmr' @@ -131,7 +130,7 @@ def test_var_info_short_name(self): ) dmr_path = write_dmr(self.output_dir, mock_dmr) - dataset = VarInfoFromDmr(dmr_path, config_file=self.sample_config) + dataset = VarInfoFromDmr(dmr_path, config_file=self.test_config_file) self.assertEqual(dataset.short_name, short_name) @@ -139,16 +138,16 @@ def test_var_info_short_name(self): mock_dmr = f'' dmr_path = write_dmr(self.output_dir, mock_dmr) - dataset = VarInfoFromDmr(dmr_path, config_file=self.sample_config) + dataset = VarInfoFromDmr(dmr_path, config_file=self.test_config_file) - self.assertEqual(dataset.short_name, None) + self.assertIsNone(dataset.short_name) with self.subTest('No short name in metadata, but given in call'): mock_dmr = f'' dmr_path = write_dmr(self.output_dir, mock_dmr) dataset = VarInfoFromDmr( - dmr_path, short_name='ATL03', config_file=self.sample_config + dmr_path, short_name='ATL03', config_file=self.test_config_file ) self.assertEqual(dataset.short_name, 'ATL03') @@ -164,7 +163,7 @@ def test_var_info_short_name(self): dmr_path = write_dmr(self.output_dir, mock_dmr) dataset = VarInfoFromDmr( - dmr_path, short_name='ATL08', 
config_file=self.sample_config + dmr_path, short_name='ATL08', config_file=self.test_config_file ) self.assertEqual(dataset.short_name, 'ATL08') @@ -194,7 +193,7 @@ def test_var_info_mission(self): ) dmr_path = write_dmr(self.output_dir, mock_dmr) - dataset = VarInfoFromDmr(dmr_path, config_file=self.sample_config) + dataset = VarInfoFromDmr(dmr_path, config_file=self.test_config_file) self.assertEqual(dataset.mission, expected_mission) @@ -250,19 +249,31 @@ def test_var_info_instantiation_no_augmentation(self): """Ensure VarInfo instantiates correctly, creating records of all the variables in the granule, and correctly deciding if they are science variables, metadata or references. This test uses a mission - and short name that do not have any CF overrides or supplements. + and short name that do not have any metadata overrides. """ - dataset = VarInfoFromDmr(self.mock_geographic_dmr, config_file=self.config_file) + dataset = VarInfoFromDmr( + self.mock_geographic_dmr, config_file=self.test_config_file + ) self.assertEqual(dataset.short_name, 'ATL03') self.assertEqual(dataset.mission, 'ICESat2') - self.assertEqual( - dataset.global_attributes, - {'METADATA': {'DatasetIdentification': {'shortName': 'ATL03'}}}, + + self.assertSetEqual( + set(dataset.groups.keys()), + { + '/', + '/METADATA', + '/METADATA/DatasetIdentification', + }, ) - self.assertEqual( + self.assertDictEqual( + dataset.groups['/METADATA/DatasetIdentification'].attributes, + {'shortName': 'ATL03'}, + ) + + self.assertSetEqual( set(dataset.variables.keys()), { '/ancillary_one', @@ -274,7 +285,7 @@ def test_var_info_instantiation_no_augmentation(self): '/subset_one', }, ) - self.assertEqual( + self.assertSetEqual( dataset.references, { '/ancillary_one', @@ -287,18 +298,42 @@ def test_var_info_instantiation_no_augmentation(self): def test_var_info_instantiation_cf_augmentation(self): """Ensure VarInfo instantiates correcly, using a missions that has - overrides and supplements in the CFConfig class. 
+ metadata attribute overrides in the CFConfig class. """ - dataset = VarInfoFromDmr(self.mock_dmr_two, config_file=self.config_file) + dataset = VarInfoFromDmr(self.mock_dmr_two, config_file=self.test_config_file) expected_global_attributes = { - 'METADATA': {'DatasetIdentification': {'shortName': 'FAKE99'}}, + 'collection_override': 'collection value', 'global_override': 'GLOBAL', - 'fakesat_global_supplement': 'fakesat value', } - self.assertEqual(dataset.global_attributes, expected_global_attributes) - self.assertEqual( + + self.assertSetEqual( + set(dataset.groups.keys()), + { + '/', + '/exclude_one', + '/required_group', + '/science', + '/METADATA', + '/METADATA/DatasetIdentification', + }, + ) + + self.assertDictEqual( + dataset.groups['/'].attributes, + expected_global_attributes, + ) + + self.assertDictEqual( + dataset.groups['/METADATA/DatasetIdentification'].attributes, + { + 'collection_override': 'collection value', + 'shortName': 'FAKE99', + }, + ) + + self.assertSetEqual( set(dataset.variables.keys()), { '/science/latitude', @@ -336,14 +371,20 @@ def test_var_info_from_dmr_instantiation_nested_attributes(self): '' ) dmr_path = write_dmr(self.output_dir, mock_dmr) - dataset = VarInfoFromDmr(dmr_path, config_file=self.config_file) + dataset = VarInfoFromDmr(dmr_path, config_file=self.test_config_file) expected_globals = { - 'history': history, - 'numeric_attribute': -90.0, - 'short_name': 'FAKESAT1', + 'HDF5_GLOBAL': { + 'history': history, + 'numeric_attribute': -90.0, + 'short_name': 'FAKESAT1', + } } - self.assertDictEqual(expected_globals, dataset.global_attributes) + + self.assertDictEqual( + expected_globals, + dataset.groups['/'].attributes, + ) def test_var_info_get_all_variables(self): """Ensure all variables from the input are returned, regardless of @@ -352,7 +393,7 @@ def test_var_info_get_all_variables(self): variables. 
""" - dataset = VarInfoFromDmr(self.mock_dmr_two, config_file=self.config_file) + dataset = VarInfoFromDmr(self.mock_dmr_two, config_file=self.test_config_file) expected_variables = { '/science/interesting_thing', @@ -371,7 +412,7 @@ def test_var_info_get_science_variables(self): associated instance of the `CFConfig` class. """ - dataset = VarInfoFromDmr(self.mock_dmr_two, config_file=self.config_file) + dataset = VarInfoFromDmr(self.mock_dmr_two, config_file=self.test_config_file) science_variables = dataset.get_science_variables() self.assertEqual(science_variables, {'/science/interesting_thing'}) @@ -386,7 +427,7 @@ def test_var_info_get_metadata_variables(self): excluded by the `CFConfig` instance. """ - dataset = VarInfoFromDmr(self.mock_dmr_two, config_file=self.config_file) + dataset = VarInfoFromDmr(self.mock_dmr_two, config_file=self.test_config_file) metadata_variables = dataset.get_metadata_variables() self.assertSetEqual(metadata_variables, {'/required_group/has_no_coordinates'}) @@ -400,7 +441,7 @@ class is asked for those variables required to make a viable output subset_control_variables. """ - dataset = VarInfoFromDmr(self.mock_dmr_two, config_file=self.config_file) + dataset = VarInfoFromDmr(self.mock_dmr_two, config_file=self.test_config_file) with self.subTest('All references to other variables are retrieved'): required_variables = dataset.get_required_variables( @@ -462,7 +503,7 @@ def test_exclude_fake_dimensions(self): required_variables = VarInfoFromDmr.exclude_fake_dimensions(input_variables) - self.assertEqual(required_variables, {'/science_variable', '/other_science'}) + self.assertSetEqual(required_variables, {'/science_variable', '/other_science'}) def test_get_variable(self): """Ensure a variable, both with or without, coordinates can be @@ -470,7 +511,7 @@ def test_get_variable(self): ensure `None` is returned. 
""" - dataset = VarInfoFromDmr(self.mock_dmr_two, config_file=self.config_file) + dataset = VarInfoFromDmr(self.mock_dmr_two, config_file=self.test_config_file) with self.subTest('A variable with coordinates'): science_variable = dataset.get_variable('/science/interesting_thing') @@ -526,7 +567,7 @@ def test_get_required_dimensions(self): ) dmr_path = write_dmr(self.output_dir, mock_dmr) - dataset = VarInfoFromDmr(dmr_path, config_file=self.config_file) + dataset = VarInfoFromDmr(dmr_path, config_file=self.test_config_file) with self.subTest('All dimensions are retrieved'): self.assertSetEqual( @@ -555,7 +596,7 @@ def test_get_spatial_dimensions(self): """ dataset = VarInfoFromDmr( - self.mock_geo_and_projected_dmr, config_file=self.config_file + self.mock_geo_and_projected_dmr, config_file=self.test_config_file ) with self.subTest('All horizontal spatial variables are returned'): @@ -588,7 +629,7 @@ def test_get_geographic_spatial_dimensions(self): """ dataset = VarInfoFromDmr( - self.mock_geo_and_projected_dmr, config_file=self.config_file + self.mock_geo_and_projected_dmr, config_file=self.test_config_file ) with self.subTest('All (and only) geographic variables are returned'): @@ -627,7 +668,7 @@ def test_get_projected_spatial_dimensions(self): """ dataset = VarInfoFromDmr( - self.mock_geo_and_projected_dmr, config_file=self.config_file + self.mock_geo_and_projected_dmr, config_file=self.test_config_file ) with self.subTest('All (and only) projected dimension variables are returned'): @@ -712,7 +753,7 @@ def test_get_temporal_dimensions(self): ) dmr_path = write_dmr(self.output_dir, mock_dmr) - dataset = VarInfoFromDmr(dmr_path, config_file=self.config_file) + dataset = VarInfoFromDmr(dmr_path, config_file=self.test_config_file) with self.subTest('All (and only) temporal variables are returned'): self.assertSetEqual( @@ -747,7 +788,7 @@ def test_get_variables_with_dimensions(self): """ dataset = VarInfoFromDmr( - self.mock_geo_and_projected_dmr, 
config_file=self.config_file + self.mock_geo_and_projected_dmr, config_file=self.test_config_file ) with self.subTest('Only geographically gridded variables are retrieved'): @@ -770,7 +811,7 @@ def test_get_variables_with_dimensions(self): def test_group_variables_by_dimensions(self): """Ensure all variables are grouped according to their dimensions.""" dataset = VarInfoFromDmr( - self.dimension_grouping_dmr, config_file=self.config_file + self.dimension_grouping_dmr, config_file=self.test_config_file ) expected_groups = { @@ -801,7 +842,7 @@ def test_group_variables_by_horizontal_dimensions(self): """ dataset = VarInfoFromDmr( - self.dimension_grouping_dmr, config_file=self.config_file + self.dimension_grouping_dmr, config_file=self.test_config_file ) expected_groups = { @@ -834,7 +875,7 @@ def test_var_info_netcdf4(self): """ netcdf4_path = write_skeleton_netcdf4(self.output_dir) - dataset = VarInfoFromNetCDF4(netcdf4_path, config_file=self.config_file) + dataset = VarInfoFromNetCDF4(netcdf4_path, config_file=self.test_config_file) self.assertSetEqual( dataset.get_science_variables(), {'/group/science2', '/science1'} ) @@ -843,7 +884,8 @@ def test_var_info_netcdf4(self): dataset.get_metadata_variables(), {'/scalar1', '/group/scalar2'} ) - self.assertDictEqual(dataset.global_attributes, netcdf4_global_attributes) + # Groups should now be saved to a new dictionary: + self.assertSetEqual(set(dataset.groups.keys()), {'/', '/group'}) def test_is_science_variable(self): """Ensure that a science variable is correctly recognized and @@ -871,14 +913,13 @@ def test_get_missing_variable_attributes(self): """ dataset = VarInfoFromDmr( - self.mock_dmr_two, 'FAKE99', config_file=self.config_file + self.mock_dmr_two, 'FAKE99', config_file=self.test_config_file ) with self.subTest('All CF attributes are retrieved for missing variable'): self.assertDictEqual( dataset.get_missing_variable_attributes('/absent_variable'), { - 'collection_supplement': 'FAKE99 supplement', 
'collection_override': 'collection value', 'extra_override': 'overriding value', }, @@ -891,7 +932,9 @@ def test_get_references_for_attribute(self): """ dmr_path = 'tests/unit/data/GPM_3IMERGHH_example.dmr' - dataset = VarInfoFromDmr(dmr_path, 'GPM_3IMERGHH', config_file=self.config_file) + dataset = VarInfoFromDmr( + dmr_path, 'GPM_3IMERGHH', config_file=self.test_config_file + ) with self.subTest('All coordinate references for a variable'): self.assertSetEqual( dataset.get_references_for_attribute( diff --git a/tests/unit/test_variable.py b/tests/unit/test_variable.py index 70bdb30..4ddde5a 100644 --- a/tests/unit/test_variable.py +++ b/tests/unit/test_variable.py @@ -131,9 +131,8 @@ def test_variable_instantiation(self): they are contained in the variable XML, to ensure data requests can be made against the variable with index ranges specified. - Any applicable attribute override or supplement for an absent - metadata attribute should also be adopted as the value for that - attribute, with overrides taking precedence over supplements. + Any applicable attribute override for an absent or incorrect metadata + attribute should also be adopted as the value for that attribute. 
""" variable = VariableFromDmr( @@ -154,7 +153,6 @@ def test_variable_instantiation(self): 'subset_control_variables', 'units', 'collection_override', - 'collection_supplement', 'group_override', 'variable_override', }, @@ -231,28 +229,10 @@ def test_variable_cf_override_non_reference(self): variable.attributes.get('collection_override'), 'collection value' ) - def test_variable_cf_supplement_non_reference(self): - """Ensure a metadata attribute is supplemented by the `CFConfig`.""" - dmr_variable = ET.fromstring( - f'<{self.namespace}Float64 name="science">' - f' <{self.namespace}Attribute name="group_supplement" type="String">' - f' <{self.namespace}Value>initial_value' - f' ' - f'' - ) - - variable = VariableFromDmr( - dmr_variable, self.fakesat_config, self.namespace, '/group4/science' - ) - - self.assertEqual( - variable.attributes.get('group_supplement'), 'initial_value, FAKE99 group4' - ) - def test_variable_cf_override_absent(self): """Ensure a metadata attribute adopts the override value, even if the granule metadata originally omitted that attribute. The overriding - value should be used, and any supplemental value should be ignored. + value should be used. """ dmr_variable = ET.fromstring( @@ -266,24 +246,6 @@ def test_variable_cf_override_absent(self): self.assertEqual(variable.attributes.get('extra_override'), 'overriding value') - def test_variable_cf_supplement_absent(self): - """Ensure a metadata attribute adopts the override value, even if the - granule metadata originally omitted that attribute. - - """ - dmr_variable = ET.fromstring( - f'<{self.namespace}Float64 name="absent_supplement">' - f'' - ) - - variable = VariableFromDmr( - dmr_variable, self.fakesat_config, self.namespace, '/absent_supplement' - ) - - self.assertEqual( - variable.attributes.get('extra_supplement'), 'supplemental value' - ) - def test_variable_reference_qualification(self): """Ensure different reference types (relative, absolute) are correctly qualified. 
@@ -761,7 +723,6 @@ def test_variable_from_netcdf4(self): 'valid_max', 'scale', 'collection_override', - 'collection_supplement', }, ) self.assertEqual(variable.attributes['units'], 'metres') diff --git a/varinfo/attribute_container.py b/varinfo/attribute_container.py new file mode 100644 index 0000000..456e59c --- /dev/null +++ b/varinfo/attribute_container.py @@ -0,0 +1,156 @@ +""" This module contains classes designed to capture information regarding + metadata attributes as read from either an OPeNDAP DMR or a NetCDF-4 file. + These classes are inherited by both representations of groups and of + variables. + +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Union +import xml.etree.ElementTree as ET + +from netCDF4 import Group as NetCDF4Group +from netCDF4 import Variable as NetCDF4Variable + +from varinfo.cf_config import CFConfig +from varinfo.utilities import get_xml_attribute + + +InputContainerType = Union[ET.Element, NetCDF4Group, NetCDF4Variable] + + +class AttributeContainerBase(ABC): + """A class to represent objects that have metadata attributes, such as + groups or variables within a NetCDF-4 file or OPeNDAP DMR. + + """ + + def __init__( + self, + container: InputContainerType, + cf_config: CFConfig, + namespace: str, + full_name_path: str, + ): + """Extract metadata attributes, including any overrides defined in the + supplied `CFConfig` instance. + + """ + self.namespace = namespace + self.full_name_path = full_name_path + self.metadata_overrides = cf_config.get_metadata_overrides(self.full_name_path) + self.attributes = self._get_attributes(container) + self._add_additional_attributes() + + @abstractmethod + def _get_attributes(self, container: InputContainerType) -> dict[str, Any]: + """Extract all attributes for the container. The contents of the output + dictionary will be as stored in the granule metadata, with any + applicable overrides from `CFConfig`. 

+
+        """
+
+    @abstractmethod
+    def _get_attribute(self, container: InputContainerType, attribute_name: str) -> Any:
+        """Extract an attribute value from the source granule metadata. Any
+        applicable overrides from `CFConfig` will be applied before returning
+        the attribute value.
+
+        """
+
+    def get_attribute_value(
+        self, attribute_name: str, default_value: Any | None = None
+    ) -> Any:
+        """A convenience function for the end-user to retrieve the value of a
+        specified attribute, or use an optional default value if that
+        attribute is not present in the container metadata. If no default
+        value is supplied, requesting the value of an absent attribute will
+        return `None`.
+
+        """
+        return self.attributes.get(attribute_name, default_value)
+
+    def _add_additional_attributes(self) -> None:
+        """Check the `CFConfig` instance for any metadata attributes that are
+        listed, but not included in the original granule metadata. These should
+        be added to the variable metadata attributes.
+
+        """
+        self._add_missing_attributes(self.metadata_overrides)
+
+    def _add_missing_attributes(self, extra_attributes: dict) -> None:
+        """Iterate through a dictionary of attributes from the `CFConfig`
+        instance matching this container. If there are any attributes listed
+        that are not already present in the `self.attributes` dictionary,
+        then add them with the value from the configuration file.
+
+        """
+        for attribute_name, attribute_value in extra_attributes.items():
+            if attribute_name not in self.attributes:
+                self.attributes[attribute_name] = attribute_value
+
+    def _get_configured_attribute(
+        self, attribute_name: str, raw_attribute_value: Any
+    ) -> Any:
+        """Check the `CFConfig` instance associated with the container for any
+        metadata attribute overrides that should be applied to the attribute
+        value.

+        """
+        return self.metadata_overrides.get(attribute_name, raw_attribute_value)
+
+
+class AttributeContainerFromDmr(AttributeContainerBase):
+    """This child class inherits from the `AttributeContainerBase` class and
+    implements the abstract methods assuming the container source is part of an
+    XML element tree.
+
+    """
+
+    def _get_attributes(self, container: ET.Element) -> dict[str, Any]:
+        """Locate all child Attribute elements of the container and extract
+        their associated values.
+
+        """
+        return {
+            attribute.get('name'): self._get_attribute(container, attribute.get('name'))
+            for attribute in container.findall(f'{self.namespace}Attribute')
+            if attribute.get('name') is not None
+        }
+
+    def _get_attribute(self, container: ET.Element, attribute_name: str) -> Any:
+        """Extract the value of an XML Attribute element, casting it to the
+        appropriate type, applying any necessary metadata overrides.
+
+        """
+        raw_value = get_xml_attribute(container, attribute_name, self.namespace)
+        return self._get_configured_attribute(attribute_name, raw_value)
+
+
+class AttributeContainerFromNetCDF4(AttributeContainerBase):
+    """This child class inherits from the `AttributeContainerBase` class and
+    implements the abstract methods assuming the container source is part of a
+    NetCDF-4 file.
+
+    """
+
+    def _get_attributes(
+        self, container: NetCDF4Group | NetCDF4Variable
+    ) -> dict[str, Any]:
+        """Identify all variable attributes and save them to a dictionary."""
+        return {
+            attribute_name: self._get_attribute(container, attribute_name)
+            for attribute_name in container.ncattrs()
+        }
+
+    def _get_attribute(
+        self, container: NetCDF4Group | NetCDF4Variable, attribute_name: str
+    ) -> Any:
+        """Extract the value of the metadata attribute, applying any necessary
+        override from the `CFConfig` instance.
+ + """ + raw_value = container.__dict__.get(attribute_name) + return self._get_configured_attribute(attribute_name, raw_value) diff --git a/varinfo/cf_config.py b/varinfo/cf_config.py index a4729b0..36c3325 100644 --- a/varinfo/cf_config.py +++ b/varinfo/cf_config.py @@ -1,15 +1,14 @@ """ This module contains a class designed to read and present information from - a JSON configuration file. This configuration file provides supplemental - and overriding information for attributes provided by granules on a per- - collection basis. This information is primarily intended to augment the - CF-Convention attributes for a dataset, but can also be used to alter non - CF-Convention metadata within a granule. - - Information within the configuration file is split into blocks that have - an Applicability_Group. This section should define a mission, collection - short name and (optionally) a regular expression compatible string for - relevant variable paths. These applicability groups can be nested, and so - the mission and short name can be inherited from the parent group. + a JSON configuration file. This configuration file provides overriding + information for attributes provided by granules on a per-collection basis. + This information is primarily intended to augment the CF-Convention + attributes for a dataset, but can also be used to alter non CF-Convention + metadata within a granule. + + Information within the configuration file is split into rules that have + an Applicability. This section should define a mission, collection short + name and (optionally) a regular expression compatible string for relevant + variable paths. 
The configuration file also specifies variables that are incorrectly considered as science variables by the VarInfo class, due to them having @@ -19,8 +18,10 @@ """ +from __future__ import annotations + from os.path import exists -from typing import Dict, Optional +from typing import Any import json import re @@ -32,9 +33,8 @@ class CFConfig: """This class should read the main configuration file, - see e.g. sample_config.json, which defines overriding values and - supplements for the attributes stored in fields such as - ancillary_variables, or dimensions. + see e.g. sample_config.json, which defines overriding values for the + attributes stored in fields such as ancillary_variables, or dimensions. Given a mission and collection short name, upon instantiation, the object should only retain information relevant to that specific @@ -44,9 +44,9 @@ class CFConfig: def __init__( self, - mission: str, - collection_short_name: str, - config_file: Optional[str] = None, + mission: str | None, + collection_short_name: str | None, + config_file: str | None = None, ): """Set supplied class attributes. 
Then read the designated configuration file to obtain mission and short name specific @@ -57,12 +57,9 @@ def __init__( self.mission = mission self.short_name = collection_short_name - self._cf_overrides = {} - self._cf_supplements = {} - self.excluded_science_variables = set() - self.required_variables = set() - self.global_supplements = {} - self.global_overrides = {} + self.metadata_overrides: dict[str, dict[str, Any]] = {} + self.excluded_science_variables: set[str] = set() + self.required_variables: set[str] = set() if self.mission is not None: self._read_config_file() @@ -85,51 +82,28 @@ def _read_config_file(self): self.excluded_science_variables = { pattern - for item in config.get('Excluded_Science_Variables', []) + for item in config.get('ExcludedScienceVariables', []) if self._is_applicable( item['Applicability'].get('Mission'), item['Applicability'].get('ShortNamePath'), ) - for pattern in item['Variable_Pattern'] + for pattern in item['VariablePattern'] } self.required_variables = { pattern - for item in config.get('Required_Fields', []) - if self._is_applicable( - item['Applicability'].get('Mission'), - item['Applicability'].get('ShortNamePath'), - ) - for pattern in item['Variable_Pattern'] - } - - self.global_supplements = { - attribute['Name']: attribute['Value'] - for item in config.get('CF_Supplements', []) - if self._is_applicable( - item['Applicability'].get('Mission'), - item['Applicability'].get('ShortNamePath'), - ) - for attribute in item.get('Global_Attributes', []) - } - - self.global_overrides = { - attribute['Name']: attribute['Value'] - for item in config.get('CF_Overrides', []) + for item in config.get('RequiredVariables', []) if self._is_applicable( item['Applicability'].get('Mission'), item['Applicability'].get('ShortNamePath'), ) - for attribute in item.get('Global_Attributes', []) + for pattern in item['VariablePattern'] } - for override in config.get('CF_Overrides', []): - self._process_cf_item(override, self._cf_overrides) + for 
override in config.get('MetadataOverrides', []): + self._process_cf_item(override, self.metadata_overrides) - for supplement in config.get('CF_Supplements', []): - self._process_cf_item(supplement, self._cf_supplements) - - def _is_applicable(self, mission: str, short_name: str = None) -> bool: + def _is_applicable(self, mission: str, short_name: str | None = None) -> bool: """Given a mission, and optionally also a collection short name, of an applicability within the configuration file, check for a match against the mission and short name specified when instantiating the @@ -146,21 +120,17 @@ class object. def _process_cf_item( self, - cf_item: Dict, - results: Dict[str, Dict], - input_mission: str = None, - input_short_name: str = None, + cf_item: dict, + results: dict[str, dict], + input_mission: str | None = None, + input_short_name: str | None = None, ): - """Process a single block in the CF overrides or CF supplements region - of the configuration file. First check that the applicability - matches the mission and short name for the class. Next, check - for a variable pattern. This is indicative of there being - overriding or supplemental attributes in this list item. - Assign any information to the results dictionary, with a key of - that variable pattern. Lastly, check for any nested references, - which are child blocks to be processed in the same way. The mission - and short name from this block are passed to all children, as they - may not both be defined, due to assumed inheritance. + """Process a single block in the `MetadataOverrides` region of the + configuration file. First check that the applicability matches the + mission and short name for the class. Next, check for a variable + pattern. This is indicative of there being overriding attributes in + this list item. Assign any information to the results dictionary, with + a key of that variable pattern. 

        """
        mission = cf_item['Applicability'].get('Mission') or input_mission
@@ -171,18 +141,11 @@ def _process_cf_item(
        # variable path - the assumption here is that the applicability is
        # to all variables (see ICESat2 dimensions override, SPL4.* and
        # SPL3FTA grid_mapping overrides)
-        pattern = cf_item['Applicability'].get('Variable_Pattern', '.*')
-
-        if 'Attributes' in cf_item:
-            results[pattern] = self._create_attributes_object(cf_item)
-
-        cf_references = cf_item.get('Applicability_Group', [])
-
-        for cf_reference in cf_references:
-            self._process_cf_item(cf_reference, results, mission, short_name)
+        pattern = cf_item['Applicability'].get('VariablePattern', '.*')
+        results[pattern] = self._create_attributes_object(cf_item)

    @staticmethod
-    def _create_attributes_object(cf_item: Dict) -> Dict[str, str]:
+    def _create_attributes_object(cf_item: dict) -> dict[str, str]:
        """Construct a dictionary object containing all contained attributes,
        which are specified as list items with Name and Value keys.

@@ -192,38 +155,56 @@ def _create_attributes_object(cf_item: Dict) -> Dict[str, str]:
            for attribute in cf_item.get('Attributes', {})
        }

-    def get_cf_attributes(self, variable: str = None) -> Dict[str, Dict[str, str]]:
-        """Return the CF overrides and supplements that match a given
-        variable. If a variable is not specified, then return all overrides
-        and supplements. If there are no overrides or supplements, then
-        empty dictionaries will be returned instead.
+    def get_metadata_overrides(self, variable_path: str) -> dict[str, Any]:
+        """Return the MetadataOverrides that match a given variable. If there
+        are no overrides, then an empty dictionary will be returned instead.

-        """
-        if variable is not None:
-            cf_overrides = self._get_matching_attributes(self._cf_overrides, variable)
-            cf_supplements = self._get_matching_attributes(
-                self._cf_supplements, variable
-            )
-        else:
-            cf_overrides = self._cf_overrides
-            cf_supplements = self._cf_supplements
+        First iterate through the self.metadata_overrides and find all items
+        with a variable pattern that matches the supplied variable (or group)
+        path.

-        return {'cf_overrides': cf_overrides, 'cf_supplements': cf_supplements}
+        Next sort that dictionary, so that matching patterns are:

-    @staticmethod
-    def _get_matching_attributes(
-        cf_references: Dict[str, Dict[str, str]], variable: str
-    ) -> Dict[str, str]:
-        """Iterate through either the self._cf_supplements or
-        self._cf_overrides and extract a dictionary that combines all
-        applicable attributes that apply to the specified variable. If
-        there are conflicting values for the same attribute, only the last
-        value will be returned for that attribute.
+        * Primarily sorted from shallowest to deepest, by counting the number of
+          slashes in the string.
+        * Within each depth (with the same number of slashes), patterns are
+          sorted from shortest to longest string length.
+
+        It is assumed that regular expressions that match deeper elements of
+        a file hierarchy are intended to be more specifically applied, and that
+        within a given depth, the string length is a proxy for specificity.
+
+        Last, combine the attribute names and values from each matching
+        override item. Because of the ordering in the previous step, if there
+        are multiple values supplied for the same metadata attribute, the value
+        retained will be the one with the longest variable pattern, which is a
+        proxy for how specific the override is.

        """
-        references = {}
-        for pattern, attributes in cf_references.items():
-            if re.match(pattern, variable) is not None:
-                references.update(attributes)
+        matching_overrides = {
+            pattern: attributes
+            for pattern, attributes in self.metadata_overrides.items()
+            if re.match(pattern, variable_path) is not None
+        }

-        return references
+        # Order MetadataOverrides items by:
+        # First: Depth of the variable hierarchy included in the regular
+        # expression pattern (number of slashes), shallowest to deepest.
+        # Second: The length of the variable pattern, from shortest to longest.
+        # The second ordering is applied to regular expressions specifying
+        # the same depth of variable hierarchy.
+        sorted_overrides = dict(
+            sorted(
+                matching_overrides.items(),
+                key=lambda pattern: (pattern[0].count('/'), len(pattern[0])),
+            ),
+        )
+
+        # Combine all overrides. In the case a metadata attribute appears in
+        # multiple matching overrides, the value that comes later based on the
+        # previous sorting of variable patterns will take precedence.
+        return {
+            attribute_name: attribute_value
+            for sorted_override in sorted_overrides.values()
+            for attribute_name, attribute_value in sorted_override.items()
+        }
diff --git a/varinfo/cmr_search.py b/varinfo/cmr_search.py
index d34d171..da49882 100644
--- a/varinfo/cmr_search.py
+++ b/varinfo/cmr_search.py
@@ -3,7 +3,9 @@
    `requests` library a granule is downloaded via https and saved locally.
""" -from typing import Literal, Union +from __future__ import annotations + +from typing import Literal import os.path from cmr import GranuleQuery, CMR_OPS, CMR_SIT, CMR_UAT @@ -28,12 +30,12 @@ def get_granules( - concept_id: str = None, - collection_shortname: str = None, - collection_version: str = None, - provider: str = None, + concept_id: str | None = None, + collection_shortname: str | None = None, + collection_version: str | None = None, + provider: str | None = None, cmr_env: CmrEnvType = CMR_OPS, - auth_header: str = None, + auth_header: str | None = None, ) -> list: """Search CMR to retrieve granules for a specific collection given: @@ -156,7 +158,7 @@ def download_granule( def get_edl_token_from_launchpad( launchpad_token: str, cmr_env: CmrEnvType -) -> Union[str, None]: +) -> str | None: """Retrieve an EDL token given a LaunchPad token. * launchpad_token: A LaunchPad token with no header prefixes: diff --git a/varinfo/generate_umm_var.py b/varinfo/generate_umm_var.py index af7a384..65b5c8f 100644 --- a/varinfo/generate_umm_var.py +++ b/varinfo/generate_umm_var.py @@ -10,8 +10,10 @@ """ +from __future__ import annotations + from tempfile import TemporaryDirectory -from typing import Dict, List, Union, Optional +from typing import Union import re from cmr import CMR_UAT @@ -29,7 +31,7 @@ # Custom return type: either a list of UMM-Var JSON (a list of dictionaries), # or a list of strings (either concept IDs or error strings). 
-UmmVarReturnType = List[Union[Dict, str]] +UmmVarReturnType = list[Union[dict, str]] def generate_collection_umm_var( @@ -37,7 +39,7 @@ def generate_collection_umm_var( auth_header: str, cmr_env: CmrEnvType = CMR_UAT, publish: bool = False, - config_file: Optional[str] = None, + config_file: str | None = None, ) -> UmmVarReturnType: """Run all the of the functions for downloading and publishing a UMM-Var entry to CMR given: diff --git a/varinfo/group.py b/varinfo/group.py new file mode 100644 index 0000000..04bf074 --- /dev/null +++ b/varinfo/group.py @@ -0,0 +1,84 @@ +""" This module contains classes that represent groups (e.g., containers of + variables within a NetCDF-4 file or OPeNDAP DMR). A group has metadata + attributes and child variables. + +""" + +from __future__ import annotations + +from abc import abstractmethod +from typing import Union +import xml.etree.ElementTree as ET + +from netCDF4 import Group as NetCDF4Group + +from varinfo.attribute_container import ( + AttributeContainerBase, + AttributeContainerFromDmr, + AttributeContainerFromNetCDF4, +) +from varinfo.cf_config import CFConfig +from varinfo.utilities import DAP4_TO_NUMPY_MAP + + +InputGroupType = Union[ET.Element, NetCDF4Group] + + +class GroupBase(AttributeContainerBase): + """A class to represent a single group contained within a granule + representation. It will produce an object with attributes and a set of + fully qualified variables within the group. + + """ + + def __init__( + self, + group: InputGroupType, + cf_config: CFConfig, + namespace: str, + full_name_path: str, + ): + """First extract all metadata attributes on the group, accounting for + overrides defined in the CFConfig file. Then parse the paths of all + child variables in the group. 
+ + """ + super().__init__(group, cf_config, namespace, full_name_path) + self.variables = self._parse_variables(group) + + @abstractmethod + def _parse_variables(self, group: InputGroupType) -> set[str]: + """An abstract method to retrieve a set of all variables that are + direct children of the group. + + """ + + +class GroupFromDmr(GroupBase, AttributeContainerFromDmr): + """This child class inherits from the `GroupBase` class and implements the + abstract methods assuming the group source is a Dataset Metadata Response + (DMR) XML document retrieved from OPeNDAP. + + """ + + def _parse_variables(self, group: ET.Element) -> set[str]: + """Returns full paths of all child variables in the group.""" + return { + '/'.join([self.full_name_path.rstrip('/'), child.get('name', '')]) + for child in group + if child.tag.replace(self.namespace, '') in DAP4_TO_NUMPY_MAP + } + + +class GroupFromNetCDF4(GroupBase, AttributeContainerFromNetCDF4): + """This child class inherits from the `GroupBase` class and implements the + abstract methods assuming the group source is a NetCDF-4 file. + + """ + + def _parse_variables(self, group: NetCDF4Group) -> set[str]: + """Returns full paths of all child variables in the group.""" + return { + '/'.join([self.full_name_path.rstrip('/'), variable]) + for variable in group.variables + } diff --git a/varinfo/umm_var.py b/varinfo/umm_var.py index ee9c46a..49d23b6 100644 --- a/varinfo/umm_var.py +++ b/varinfo/umm_var.py @@ -31,9 +31,11 @@ """ +from __future__ import annotations + from os import makedirs from os.path import isfile, join as join_path -from typing import Any, Dict, List, Optional, Union +from typing import Any import json import requests @@ -77,7 +79,7 @@ ] -def get_all_umm_var(var_info: VarInfoBase) -> Dict[str, Dict]: +def get_all_umm_var(var_info: VarInfoBase) -> dict[str, dict]: """Iterate through all variables detected from the source granule and return a list of UMM-Var records for those variables. 
@@ -88,7 +90,7 @@ def get_all_umm_var(var_info: VarInfoBase) -> Dict[str, Dict]: } -def get_umm_var(var_info: VarInfoBase, variable: VariableBase) -> Dict: +def get_umm_var(var_info: VarInfoBase, variable: VariableBase) -> dict: """Map the contents of a Variable instance to a UMM-Var record. Initial attempts will be made to extract all possibly information, with the return value being scrubbed of all UMM-Var attributes with @@ -134,7 +136,7 @@ def get_umm_var(var_info: VarInfoBase, variable: VariableBase) -> Dict: } -def export_all_umm_var_to_json(umm_var_records: List, output_dir: str = '.'): +def export_all_umm_var_to_json(umm_var_records: list, output_dir: str = '.'): """Iterate through a list of UMM-Var JSON records and save them all to files (one file per record). @@ -146,7 +148,7 @@ def export_all_umm_var_to_json(umm_var_records: List, output_dir: str = '.'): export_umm_var_to_json(umm_var_record, output_dir) -def export_umm_var_to_json(umm_var_record: Dict, output_dir: str = '.'): +def export_umm_var_to_json(umm_var_record: dict, output_dir: str = '.'): """Export a single UMM-Var JSON object to a JSON output file in a specified directory (the default is the directory in which the function is called). If the specified directory does not exist, it will be @@ -171,7 +173,7 @@ def export_umm_var_to_json(umm_var_record: Dict, output_dir: str = '.'): def get_first_matched_attribute( - variable: VariableBase, attribute_names: List[str], default_value: Any = None + variable: VariableBase, attribute_names: list[str], default_value: Any = None ) -> Any: """Check a list of metadata attributes and return the value of the first one that is present in the Variable. 
If none of the attributes are @@ -188,7 +190,7 @@ def get_first_matched_attribute( ) -def get_dimensions(var_info: VarInfoBase, variable: VariableBase) -> Optional[List]: +def get_dimensions(var_info: VarInfoBase, variable: VariableBase) -> list | None: """Return a list of all dimensions for the variable,""" dimensions = [ get_dimension_information(var_info, variable, dimension_name) @@ -203,7 +205,7 @@ def get_dimensions(var_info: VarInfoBase, variable: VariableBase) -> Optional[Li def get_dimension_information( var_info: VarInfoBase, variable: VariableBase, dimension_name: str -) -> Dict: +) -> dict: """Retrieve a DimensionType object for the given Variable dimension. This function is only called for named dimensions listed in the `dimensions` attribute of another variable, and so should exist as at least a @@ -255,7 +257,7 @@ def get_dimension_information( def get_dimension_size( var_info: VarInfoBase, variable_with_dim: VariableBase, dimension_name: str -) -> Union[str, int]: +) -> str | int: """Extract the size of a specific dimension for a variable. This function will attempt to retrieve the dimension size from the following locations (in the order given): @@ -300,7 +302,7 @@ def get_dimension_size( return dimension_size -def get_valid_ranges(variable: VariableBase) -> Optional[List[Dict]]: +def get_valid_ranges(variable: VariableBase) -> list[dict] | None: """Return a dictionary containing the valid minimum and/or valid maximum values from the variable metadata. If valid_min, valid_max or valid_range are not set, None is returned. @@ -326,7 +328,7 @@ def get_valid_ranges(variable: VariableBase) -> Optional[List[Dict]]: return valid_range -def get_fill_values(variable: VariableBase) -> Optional[List]: +def get_fill_values(variable: VariableBase) -> list | None: """Return a List containing elements of the UMM-Var FillValueType, if there is a fill value contained in the variable metadata. Otherwise return None. 
@@ -360,7 +362,7 @@ def get_umm_var_dtype(variable_data_type: str) -> str: return umm_var_type -def get_metadata_specification() -> Dict: +def get_metadata_specification() -> dict: """Return standard object for the UMM-Var specification, including the URL, Name and Version. @@ -388,7 +390,7 @@ def get_json_serializable_value(input_value: Any) -> Any: def generate_variable_native_id( - collection_concept_id: str, umm_var_record: Dict + collection_concept_id: str, umm_var_record: dict ) -> str: """A helper function to create a CMR native ID given the collection concept ID and the variable UMM-Var JSON. This native ID must be unique @@ -419,7 +421,7 @@ def get_variable_type(var_info: VarInfoBase, variable: VariableBase) -> str: def publish_umm_var( collection_id: str, - umm_var_dict: Dict, + umm_var_dict: dict, auth_header: str, cmr_env: CmrEnvType = CMR_UAT, ) -> str: @@ -466,10 +468,10 @@ def publish_umm_var( def publish_all_umm_var( collection_id: str, - all_umm_var_dict: Dict, + all_umm_var_dict: dict, auth_header: str, cmr_env: CmrEnvType = CMR_UAT, -) -> Dict[str, str]: +) -> dict[str, str]: """Publish all UMM-Var entries associated with a collection to CMR given: * collection_id: a collection's concept_id * all_umm_var_dict: a nested dictionary containing diff --git a/varinfo/utilities.py b/varinfo/utilities.py index 7233302..a6bb757 100644 --- a/varinfo/utilities.py +++ b/varinfo/utilities.py @@ -18,11 +18,14 @@ """ -from typing import Dict, List, Optional +from __future__ import annotations + +from typing import Any from xml.etree.ElementTree import Element import functools import re +from netCDF4 import Dataset as NetCDF4Dataset import numpy as np from varinfo.exceptions import DmrNamespaceError @@ -61,8 +64,8 @@ } -def recursive_get(input_dictionary: Dict, keys: List[str]): - """Extract a value from an aribtrarily nested dictionary.""" +def recursive_get(input_dictionary: dict, keys: list[str]): + """Extract a value from an arbitrarily nested dictionary.""" 
try: nested_value = functools.reduce(dict.get, keys, input_dictionary) except TypeError: @@ -72,7 +75,7 @@ def recursive_get(input_dictionary: Dict, keys: List[str]): return nested_value -def split_attribute_path(full_path: str) -> List[str]: +def split_attribute_path(full_path: str) -> list[str]: """Take the full path to a metadata attribute and return the list of keys that locate that attribute within the global attributes. This function can account for the input path to having, or omitting, a @@ -102,8 +105,8 @@ def get_xml_attribute( variable: Element, attribute_name: str, namespace: str, - default_value: Optional = None, -) -> Optional: + default_value: Any | None = None, +) -> Any | None: """Extract the value of an XML Attribute tag from a `.dmr`. First search the supplied variable element for a fully qualified Attribute child element, with a name property matching the requested attribute name. If @@ -112,7 +115,8 @@ def get_xml_attribute( cast as the type indicated by the Attribute tag's `type` property. Attributes with multiple Value children will return a list of all those - children, cast as the indicated type. + children, cast as the indicated type. Attributes that are containers of + nested attributes will return a dictionary structure. 
""" attribute_element = variable.find( @@ -121,20 +125,126 @@ def get_xml_attribute( if attribute_element is not None: value_type = attribute_element.get('type', 'String') - numpy_type = DAP4_TO_NUMPY_MAP.get(value_type, str) - - value_elements = attribute_element.findall(f'{namespace}Value') - if len(value_elements) > 1: - attribute_value = [ - numpy_type(value_element.text) for value_element in value_elements - ] - elif len(value_elements) == 1: - attribute_value = numpy_type(value_elements[0].text) + if value_type != 'Container': + attribute_value = get_xml_attribute_value( + attribute_element, + namespace, + value_type, + default_value, + ) else: - attribute_value = default_value + attribute_value = get_xml_container_attribute(attribute_element, namespace) + + else: + attribute_value = default_value + + return attribute_value + + +def get_xml_attribute_value( + attribute_element: Element, + namespace: str, + value_type: str, + default_value: Any | None = None, +) -> Any | None: + """Extract the value (single or list) for an XML attribute. If there are + no attributes matching the required name, then return the supplied default + value. If no default value is supplied, the default used is `None`. + """ + numpy_type = DAP4_TO_NUMPY_MAP.get(value_type, str) + + value_elements = attribute_element.findall(f'{namespace}Value') + + if len(value_elements) > 1: + attribute_value = [ + numpy_type(value_element.text) for value_element in value_elements + ] + elif len(value_elements) == 1: + attribute_value = numpy_type(value_elements[0].text) else: attribute_value = default_value return attribute_value + + +def get_xml_container_attribute( + container_element: Element, namespace: str +) -> dict[str, Any | None]: + """Extract a dictionary of attribute values when an attribute is a container + for further attributes. This function is recursive, and so nested containers + will be treated in the same way. 
+ + """ + attribute_dictionary = {} + + for child in list(container_element): + child_name = child.get('name') + child_type = child.get('type', 'String') + + if child_type != 'Container': + attribute_dictionary[child_name] = get_xml_attribute_value( + child, + namespace, + child_type, + ) + else: + attribute_dictionary[child_name] = get_xml_container_attribute( + child, + namespace, + ) + + return attribute_dictionary + + +def get_full_path_xml_attribute( + dmr_document: Element, + attribute_path: str, + namespace: str, +) -> Any | None: + """Helper function that retrieves the value of an XML attribute, given the + full path to that attribute. If the XML attribute is not present, then + `None` is returned. + + """ + attribute_element = dmr_document + + try: + for path_part in attribute_path.lstrip('/').split('/')[:-1]: + attribute_element = attribute_element.find(f'.//*[@name="{path_part}"]') + + attribute_value = get_xml_attribute( + attribute_element, + attribute_path.split('/')[-1], + namespace, + ) + except AttributeError: + attribute_value = None + + return attribute_value + + +def get_full_path_netcdf4_attribute( + netcdf_dataset: NetCDF4Dataset, + attribute_path: str, +) -> Any | None: + """Helper function that retrieves the value of a metadata attribute from a + NetCDF-4 file, given the full path to that attribute. If the metadata + attribute is not present, then `None` is returned. 
+ + """ + container_path = '/'.join(attribute_path.split('/')[:-1]) + attribute_name = attribute_path.split('/')[-1] + + try: + if container_path != '': + attribute_container = netcdf_dataset[container_path] + else: + attribute_container = netcdf_dataset + + attribute_value = attribute_container.getncattr(attribute_name) + except (KeyError, IndexError, AttributeError): + attribute_value = None + + return attribute_value diff --git a/varinfo/var_info.py b/varinfo/var_info.py index 076fce1..e797f50 100644 --- a/varinfo/var_info.py +++ b/varinfo/var_info.py @@ -4,9 +4,11 @@ """ +from __future__ import annotations + from abc import ABC, abstractmethod from os.path import exists -from typing import Dict, Optional, Set, Tuple, Union, Any +from typing import Any, Union import json import re import xml.etree.ElementTree as ET @@ -18,33 +20,34 @@ InvalidConfigFileFormatError, MissingConfigurationFileError, ) +from varinfo.group import GroupFromDmr, GroupFromNetCDF4 from varinfo.utilities import ( DAP4_TO_NUMPY_MAP, + get_full_path_netcdf4_attribute, + get_full_path_xml_attribute, get_xml_namespace, - split_attribute_path, - recursive_get, ) from varinfo.variable import VariableFromDmr, VariableFromNetCDF4 -DimensionsGroupType = Dict[Tuple[str], Set[str]] -OutputVariableType = Union[VariableFromDmr] +DimensionsGroupType = dict[tuple[str], set[str]] +OutputGroupType = Union[GroupFromDmr, GroupFromNetCDF4] +OutputVariableType = Union[VariableFromDmr, VariableFromNetCDF4] class VarInfoBase(ABC): """An abstract base class to represent the full dataset of a granule, - having reading information from a representation of that granule. - - A class to represent the full dataset of a granule, by parsing a `.dmr` - file from OPeNDAP. + having reading information from a representation of that granule. Currently + supported granule representations: OPeNDAP Dataset Metadata Response (DMR), + NetCDF-4 file. 
""" def __init__( self, file_path: str, - short_name: Optional[str] = None, - config_file: Optional[str] = None, + short_name: str | None = None, + config_file: str | None = None, ): """Distinguish between variables containing references to other datasets, and those that do not. The former are considered science @@ -57,21 +60,18 @@ def __init__( """ self.config_file = config_file - self.cf_config = None - self.global_attributes = {} self.short_name = short_name self.mission = None self.namespace = None - self.variables: Dict[str, OutputVariableType] = {} - self.references: Set[str] = set() - self.metadata = {} + self.groups: dict[str, OutputGroupType] = {} + self.variables: dict[str, OutputVariableType] = {} + self.references: set[str] = set() + self.metadata: dict[str, OutputVariableType] = {} self._set_var_info_config() self._read_dataset(file_path) - self._set_global_attributes() self._set_mission_and_short_name() - self._set_cf_config() - self._update_global_attributes() + self.cf_config = self._set_cf_config() self._extract_variables() @abstractmethod @@ -81,13 +81,6 @@ def _read_dataset(self, file_path: str): """ - @abstractmethod - def _set_global_attributes(self): - """Extract the global attributes from the granule representation using - functionality specific to the type of input. - - """ - @abstractmethod def _extract_variables(self): """Iterate through all variables in the retrieved dataset. For each @@ -103,9 +96,8 @@ def _assign_variable(self, variable_object): updated. 
""" - full_path = variable_object.full_name_path self.references.update(variable_object.get_references()) - self.variables[full_path] = variable_object + self.variables[variable_object.full_name_path] = variable_object def _set_var_info_config(self): """Read the VarInfo configuration JSON file, containing locations to @@ -123,13 +115,13 @@ def _set_var_info_config(self): else: self.var_info_config = {} - def _set_cf_config(self): + def _set_cf_config(self) -> CFConfig: """Instantiate a CFConfig object, to contain any rules for exclusions, required fields and augmentations to CF attributes that are not contained within a granule from the specified collection. """ - self.cf_config = CFConfig(self.mission, self.short_name, self.config_file) + return CFConfig(self.mission, self.short_name, self.config_file) def _set_mission_and_short_name(self): """Check a series of potential locations for the collection short name @@ -138,17 +130,7 @@ def _set_mission_and_short_name(self): """ if self.short_name is None: - self.short_name = next( - ( - recursive_get(self.global_attributes, split_attribute_path(item)) - for item in self.var_info_config.get( - 'Collection_ShortName_Path', [] - ) - if recursive_get(self.global_attributes, split_attribute_path(item)) - is not None - ), - None, - ) + self._set_short_name() if self.short_name is not None: self.mission = next( @@ -160,22 +142,15 @@ def _set_mission_and_short_name(self): None, ) - def _update_global_attributes(self): - """Having identified the mission and short_name for the granule, and - therefore obtained the relevant CF configuration overrides and - supplements, update the global attributes for this granule using - the CFConfig class instance. As the overrides are assumed to have - the strongest priority, the dictionary is updated with these values - last. 
+    @abstractmethod
+    def _set_short_name(self):
+        """Iterate through the locations in the data granule representation to
+        find the first value entry for the collection's short name in the granule
+        metadata.
 
         """
-        if self.cf_config.global_supplements:
-            self.global_attributes.update(self.cf_config.global_supplements)
-
-        if self.cf_config.global_overrides:
-            self.global_attributes.update(self.cf_config.global_overrides)
-
-    def get_variable(self, variable_path: str) -> Optional[OutputVariableType]:
+    def get_variable(self, variable_path: str) -> OutputVariableType | None:
         """Retrieve a variable specified by an absolute path. First check the
         variables with coordinates, before checking those without. If there
         are no matching variables, a value of `None` is returned.
@@ -183,11 +158,11 @@ def get_variable(self, variable_path: str) -> Optional[OutputVariableType]:
         """
         return self.variables.get(variable_path)
 
-    def get_all_variables(self) -> Set[str]:
+    def get_all_variables(self) -> set[str]:
         """Retrieve a set of names for all variables in the granule."""
         return set(self.variables.keys())
 
-    def get_variables_with_coordinates(self) -> Dict[str, OutputVariableType]:
+    def get_variables_with_coordinates(self) -> dict[str, OutputVariableType]:
         """Return only variables with a `coordinates` metadata attribute. This
         list excludes any variables listed as an excluded science variable
         in the configuration file supplied to the object.
@@ -239,7 +214,7 @@ def is_science_variable(self, variable: OutputVariableType) -> bool:
 
         return False
 
-    def get_science_variables(self) -> Set[str]:
+    def get_science_variables(self) -> set[str]:
         """Retrieve a set of names for all variables that have coordinate
         references, that are not themselves used as dimensions,
         coordinates or ancillary date for another variable. 
@@ -259,7 +234,7 @@ def get_science_variables(self) -> Set[str]: return filtered_with_coordinates - self.references - def get_metadata_variables(self) -> Set[str]: + def get_metadata_variables(self) -> set[str]: """Retrieve set of names for all variables that do no have coordinates references, that are not themselves used as dimensions, coordinates, ancillary data for another variable, or are @@ -302,7 +277,7 @@ def variable_is_excluded( return exclude_variable - def get_required_variables(self, requested_variables: Set[str]) -> Set[str]: + def get_required_variables(self, requested_variables: set[str]) -> set[str]: """Retrieve requested variables and recursively search for all associated dimension and coordinate variables. The returned set should be the union of the science variables, coordinates and @@ -327,7 +302,7 @@ def get_required_variables(self, requested_variables: Set[str]) -> Set[str]: cf_required_variables = set() requested_variables.update(cf_required_variables) - required_variables: Set[str] = set() + required_variables: set[str] = set() while len(requested_variables) > 0: variable_name = requested_variables.pop() @@ -350,7 +325,7 @@ def get_required_variables(self, requested_variables: Set[str]) -> Set[str]: return self.exclude_fake_dimensions(required_variables) - def get_required_dimensions(self, variables: Set[str]) -> Set[str]: + def get_required_dimensions(self, variables: set[str]) -> set[str]: """Return a single set of all variables that are used as dimensions for any of the listed variables. @@ -369,10 +344,7 @@ def get_missing_variable_attributes(self, variable_name: str) -> dict[str, Any]: variables would need to be in the configuration file. 
""" - variable_attributes = self.cf_config.get_cf_attributes(variable_name) - return ( - variable_attributes['cf_supplements'] | variable_attributes['cf_overrides'] - ) + return self.cf_config.get_metadata_overrides(variable_name) def get_references_for_attribute( self, list_of_variables: list[str], reference_attribute_name: str @@ -400,7 +372,7 @@ def get_references_for_attribute( for variable_reference in variable_references ) - def get_spatial_dimensions(self, variables: Set[str]) -> Set[str]: + def get_spatial_dimensions(self, variables: set[str]) -> set[str]: """Return a single set of all variables that are both used as dimensions for any of the input variables, and that are horizontal spatial dimensions (either geographic or projected). @@ -411,7 +383,7 @@ def get_spatial_dimensions(self, variables: Set[str]) -> Set[str]: self.get_projected_spatial_dimensions(variables), ) - def get_geographic_spatial_dimensions(self, variables: Set[str]) -> Set[str]: + def get_geographic_spatial_dimensions(self, variables: set[str]) -> set[str]: """Return a single set of all the variables that are both used as dimensions for any of the input variables, and that are geographic in nature (as determined by the `units` metadata attribute). 
@@ -426,7 +398,7 @@ def get_geographic_spatial_dimensions(self, variables: Set[str]) -> Set[str]: if self.get_variable(dimension).is_geographic() ) - def get_projected_spatial_dimensions(self, variables: Set[str]) -> Set[str]: + def get_projected_spatial_dimensions(self, variables: set[str]) -> set[str]: """Return a single set of all the variables that are both used as dimensions for any of the input variables, and that are projected in nature (as determined by the `standard_name` metadata @@ -439,7 +411,7 @@ def get_projected_spatial_dimensions(self, variables: Set[str]) -> Set[str]: if self.get_variable(dimension).is_projection_x_or_y() ) - def get_temporal_dimensions(self, variables: Set[str]) -> Set[str]: + def get_temporal_dimensions(self, variables: set[str]) -> set[str]: """Return a single set of all variables that are both used as dimensions for any of the input variables, and that are temporal in nature (as determined by the `units` metadata attribute). @@ -454,7 +426,7 @@ def get_temporal_dimensions(self, variables: Set[str]) -> Set[str]: if self.get_variable(dimension).is_temporal() ) - def get_variables_with_dimensions(self, dimensions: Set[str]) -> Set[str]: + def get_variables_with_dimensions(self, dimensions: set[str]) -> set[str]: """Return a single set of all variables that include all the supplied dimensions as a subset of their own dimensions. @@ -531,7 +503,7 @@ def group_variables_by_horizontal_dimensions(self) -> DimensionsGroupType: return horizontal_groups @staticmethod - def exclude_fake_dimensions(variable_set: Set[str]) -> Set[str]: + def exclude_fake_dimensions(variable_set: set[str]) -> set[str]: """An OPeNDAP `.dmr` can contain fake dimensions, used to supplement missing information for a granule. 
These cannot be retrieved when requesting a subset from an OPeNDAP server, and must be removed @@ -560,48 +532,28 @@ def _read_dataset(self, file_path: str): self.dataset = ET.fromstring(dmr_content) self.namespace = get_xml_namespace(self.dataset) - def _set_global_attributes(self): - """Extract all global attributes from a `.dmr` file. First this method - searches for a root level Attribute element with name - "HDF5_GLOBAL". If this is present, it is assumed to be a container - for the global attributes. If "HDF5_GLOBAL" is absent, the global - attributes are assumed to be direct children of the root Dataset - element in the XML tree. All child Attribute elements children with - a type property corresponding to a DAP4 variable type are placed in - an output dictionary. If the type is not recognised by the DAP4 - protocol, the attribute is assumed to be a string. + def _set_short_name(self): + """Iterate through all suggested locations for the collection short + name, as listed in the configuration file. For each location, perform a + search for an XML element in the DMR document for that element and, if + found, retrieve the value of that element. """ - def save_attribute(output, group_path, attribute): - attribute_name = attribute.get('name') - dap4_type = attribute.get('type') - - if dap4_type != 'Container': - attribute_value = attribute.find(f'{self.namespace}Value').text - numpy_type = DAP4_TO_NUMPY_MAP.get(dap4_type, str) - - group_dictionary = output - - if group_path != '': - # Recurse through group keys to retrieve the nested group - # to which the attribute belongs. 
If a group in the path - # doesn't exist, because this attribute is the first to be - # parsed from this group, then create a new nested - # dictionary for the group to contain the child attributes - nested_groups = group_path.lstrip('/').split('/') - for group in nested_groups: - group_dictionary = group_dictionary.setdefault(group, {}) - - group_dictionary[attribute_name] = numpy_type(attribute_value) - - globals_parent = ( - self.dataset.find(f'{self.namespace}Attribute[@name="HDF5_GLOBAL"]') - or self.dataset - ) - - self.traverse_elements( - globals_parent, {'Attribute'}, save_attribute, self.global_attributes + self.short_name = next( + ( + get_full_path_xml_attribute( + self.dataset, short_name_path, self.namespace + ) + for short_name_path in self.var_info_config.get( + 'CollectionShortNamePath', [] + ) + if get_full_path_xml_attribute( + self.dataset, short_name_path, self.namespace + ) + is not None + ), + None, ) def _extract_variables(self): @@ -627,7 +579,11 @@ def save_variable(output, group_path, element): all_variables = {} self.traverse_elements( - self.dataset, set(DAP4_TO_NUMPY_MAP.keys()), save_variable, all_variables + self.dataset, + set(DAP4_TO_NUMPY_MAP.keys()), + save_variable, + all_variables, + '/', ) self._remove_non_variable_references() @@ -649,25 +605,36 @@ def _remove_non_variable_references(self): def traverse_elements( self, element: ET.Element, - element_types: Set[str], + element_types: set[str], operation, output, - group_path: str = '', + group_path: str, ): """Perform a depth first search of the `.dmr` `Dataset` element. When a variable is located perform an operation on the supplied output object, using the supplied function or class. 
""" + self.groups[group_path] = GroupFromDmr( + element, + self.cf_config, + namespace=self.namespace, + full_name_path=group_path, + ) + + group_path = group_path.rstrip('/') + for child in list(element): # If it is in the DAP4 list: use the function - # else, if it is a Group, call this function again + # else, if it is a Group, assign to dictionary and call this + # function again element_type = child.tag.replace(self.namespace, '') if element_type in element_types: operation(output, group_path, child) elif element_type == 'Group': new_group_path = '/'.join([group_path, child.get('name')]) + self.traverse_elements( child, element_types, operation, output, new_group_path ) @@ -689,14 +656,26 @@ def _read_dataset(self, file_path: str): """ self.dataset = file_path - def _set_global_attributes(self): - """Extract all global attributes from the NetCDF-4 dataset. Using the - `Dataset.__dict__` method allows extraction of all global - attributes in a single call. + def _set_short_name(self): + """Iterate through all suggested locations for the collection short + name, as listed in the configuration file. For each location, perform a + search for a metadata attribute in the NetCDF-4 file for that attribute + and, if found, retrieve the value of that attribute. 
""" + with Dataset(self.dataset, 'r') as dataset: - self.global_attributes = dataset.__dict__ + self.short_name = next( + ( + get_full_path_netcdf4_attribute(dataset, short_name_path) + for short_name_path in self.var_info_config.get( + 'CollectionShortNamePath', [] + ) + if get_full_path_netcdf4_attribute(dataset, short_name_path) + is not None + ), + None, + ) def _extract_variables(self): """Traverse all groups of the NetCDF-4 file, beginning at the root @@ -706,13 +685,21 @@ def _extract_variables(self): with Dataset(self.dataset, 'r') as dataset: self._parse_group(dataset) - def _parse_group(self, group: Union[Dataset, Group]): + def _parse_group(self, group: Dataset | Group): """If the child matches one of the DAP4 variable types, then create an - instance of the `VariableFromDmr` class, and assign it to either + instance of the `VariableFromNetCDF4` class, and assign it to either the `variables_with_coordinates` or the `metadata_variables` - dictionary accordingly. + dictionary accordingly. Child groups are added to the `groups` + dictionary under the fully resolved path to that group. """ + self.groups[group.path] = GroupFromNetCDF4( + group, + self.cf_config, + namespace=self.namespace, + full_name_path=group.path, + ) + for netcdf4_variable in group.variables.values(): variable_path = '/'.join([group.path, netcdf4_variable.name]) variable_path = f'/{variable_path.lstrip("/")}' diff --git a/varinfo/variable.py b/varinfo/variable.py index 66b051f..00794d5 100644 --- a/varinfo/variable.py +++ b/varinfo/variable.py @@ -1,28 +1,35 @@ -""" This module contains a class designed to read information from a `.dmr` - file. This should group the input into science variables, metadata, - coordinates, dimensions and ancillary data sets. +""" This module contains classes designed to read information from an OPeNDAP + DMR or NetCDF-4 file. These classes will group the input into science + variables, metadata, coordinates, dimensions and ancillary data sets. 
""" -from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from __future__ import annotations + +from abc import abstractmethod +from typing import Union import re import xml.etree.ElementTree as ET from netCDF4 import Variable as NetCDF4Variable +from varinfo.attribute_container import ( + AttributeContainerBase, + AttributeContainerFromDmr, + AttributeContainerFromNetCDF4, +) from varinfo.cf_config import CFConfig -from varinfo.utilities import CF_REFERENCE_ATTRIBUTES, get_xml_attribute +from varinfo.utilities import CF_REFERENCE_ATTRIBUTES InputVariableType = Union[ET.Element, NetCDF4Variable] -class VariableBase(ABC): +class VariableBase(AttributeContainerBase): """A class to represent a single variable contained within a granule representation. It will produce an object in which references are - fully qualified, and also augmented by any overrides or supplements - from the supplied configuration file. + fully qualified, and also augmented by any overrides from the supplied + configuration file. """ @@ -40,14 +47,10 @@ def __init__( Additionally, store all metadata attributes in a dictionary. """ - self.namespace = namespace - self.full_name_path = full_name_path - self.cf_config = cf_config.get_cf_attributes(self.full_name_path) + super().__init__(variable, cf_config, namespace, full_name_path) self.group_path, self.name = self._extract_group_and_name() self.data_type = self._get_data_type(variable) self.shape = self._get_shape(variable) - self.attributes = self._get_attributes(variable) - self._get_additional_attributes() self.references = self._get_all_cf_references() self.dimensions = self._extract_dimensions(variable) @@ -66,58 +69,7 @@ def _get_raw_dimensions(self, variable: InputVariableType): """ - @abstractmethod - def _get_attributes(self, variable: InputVariableType) -> Dict[str, str]: - """Extract all attributes for the variable. 
The contents of the - output dictionary will be as stored in the granule metadata, with - augmentation from `CFConfig`. For variables references contained - in CF-Convention attributes, users should retrieve values from the - self.references dictionary. - - """ - - @abstractmethod - def _get_attribute(self, variable: InputVariableType, attribute_name: str) -> Any: - """Extract an attribute value from the source granule metadata. Any - applicable overrides or supplements from `CFConfig` will be - applied before returning the attribute value. - - """ - - def get_attribute_value( - self, attribute_name: str, default_value: Optional = None - ) -> Any: - """A convenience function for the end-user to retrieve the value of a - specified attribute, or use an optional default value if that - attribute is not present in the variable metadata. If no default - value is supplied, requesting the value of an absent attribute will - return `None`. - - """ - return self.attributes.get(attribute_name, default_value) - - def _get_additional_attributes(self) -> None: - """Check the CF-Configuration file for any metadata attributes that - are listed, but not included in the original granule metadata. - These should be added to the variable metadata attributes. - - """ - self._add_missing_attributes(self.cf_config.get('cf_overrides')) - self._add_missing_attributes(self.cf_config.get('cf_supplements')) - - def _add_missing_attributes(self, extra_attributes: Dict) -> None: - """Iterate through a dictionary of attributes from the configuration - file entry matching this variable. If there are any attributes - listed that are not already present in the self.attributes - dictionary, then add them with the values from the configuration - file. 
- - """ - for attribute_name, attribute_value in extra_attributes.items(): - if attribute_name not in self.attributes: - self.attributes[attribute_name] = attribute_value - - def get_range(self) -> Optional[List[float]]: + def get_range(self) -> list[float] | None: """Retrieve the range of valid data from the variable metadata. First, try to parse the `valid_range` metadata attribute. If this is absent, check for a combination of `valid_min` and `valid_max`. @@ -137,7 +89,7 @@ def get_range(self) -> Optional[List[float]]: return valid_range - def get_valid_min(self) -> Optional[float]: + def get_valid_min(self) -> float | None: """Retrieve the minimum valid value for variable data from the associated metadata. First try to retrieve data from the `valid_min` metadata attribute. If this is absent, then try to @@ -156,7 +108,7 @@ def get_valid_min(self) -> Optional[float]: return valid_min - def get_valid_max(self) -> Optional[float]: + def get_valid_max(self) -> float | None: """Retrieve the maximum valid value for variable data from the associated metadata. First try to retrieve data from the `valid_max` metadata attribute. If this is absent, then try to @@ -175,7 +127,7 @@ def get_valid_max(self) -> Optional[float]: return valid_max - def get_references(self) -> Set[str]: + def get_references(self) -> set[str]: """Combine the references extracted from the ancillary_variables, coordinates and dimensions data into a single set for VarInfo to use directly. @@ -253,7 +205,7 @@ def is_temporal(self) -> bool: """ return ' since ' in self.attributes.get('units', '') - def _get_all_cf_references(self) -> Dict[str, Set[str]]: + def _get_all_cf_references(self) -> dict[str, set[str]]: """Retrieve a dictionary containing all CF-Convention attributes within the variable that have references to other variables in the granule. These variable references will be fully qualified paths. 
@@ -265,7 +217,7 @@ def _get_all_cf_references(self) -> Dict[str, Set[str]]: if attribute_name in self.attributes } - def _get_cf_references(self, attribute_name: str) -> Set[str]: + def _get_cf_references(self, attribute_name: str) -> set[str]: """Retrieve an attribute from the parsed varaible metadata, correcting for any known artefacts (missing or incorrect references). Then split this string and qualify the individual references. @@ -273,32 +225,7 @@ def _get_cf_references(self, attribute_name: str) -> Set[str]: """ return self._extract_references(self.attributes.get(attribute_name)) - def _get_configured_attribute( - self, attribute_name: str, raw_attribute_value: Any - ) -> Any: - """Check the CFConfig instances assocatiated with the collection for - any metadata attribute overrides or supplements that should be - applied to the attribute value. A metadata supplement is assumed to - imply the attribute should be a string value, with the supplement - appended to the end of the value from the granule metadata. - - """ - cf_overrides = self.cf_config['cf_overrides'].get(attribute_name) - cf_supplements = self.cf_config['cf_supplements'].get(attribute_name) - - if cf_overrides is not None: - attribute_value = cf_overrides - else: - attribute_value = raw_attribute_value - - if cf_supplements is not None and attribute_value is not None: - attribute_value = f'{attribute_value}, {cf_supplements}' - elif cf_supplements is not None: - attribute_value = cf_supplements - - return attribute_value - - def _extract_references(self, attribute_string: str) -> Set[str]: + def _extract_references(self, attribute_string: str) -> set[str]: """Given a string value of an attribute, which may contain multiple references to dataset, split that string based on either commas, or spaces (or both together). 
Then if any reference is a relative
@@ -313,20 +240,18 @@ def _extract_references(self, attribute_string: str) -> Set[str]:
 
         return references
 
-    def _extract_dimensions(self, variable: ET.Element) -> List[str]:
+    def _extract_dimensions(self, variable: ET.Element) -> list[str]:
         """Find the dimensions for the variable in question. If there are
-        overriding or supplemental dimensions from the CF configuration
-        file, these are used instead of, or in addition to, the raw
-        dimensions from the `.dmr`. All references are converted to
-        absolute paths in the granule. A set of all fully qualified
-        references is returned.
+        overriding dimensions from the `earthdata-varinfo` configuration
+        file, these are used instead of the raw dimensions from the `.dmr`. All
+        references are converted to absolute paths in the granule. A list of all
+        fully qualified references is returned.
 
         """
-        overrides = self.cf_config['cf_overrides'].get('dimensions')
-        supplements = self.cf_config['cf_supplements'].get('dimensions')
+        dimensions_override = self.metadata_overrides.get('dimensions')
 
-        if overrides is not None:
-            dimensions = re.split(r'\s+|,\s*', overrides)
+        if dimensions_override is not None:
+            dimensions = re.split(r'\s+|,\s*', dimensions_override)
         else:
             dimensions = [
                 dimension
@@ -334,12 +259,9 @@ def _extract_dimensions(self, variable: ET.Element) -> list[str]:
                 if dimension is not None
             ]
 
-        if supplements is not None:
-            dimensions += re.split(r'\s+|,\s*', supplements)
-
         return self._qualify_references(dimensions)
 
-    def _qualify_references(self, raw_references: List[str]) -> List[str]:
+    def _qualify_references(self, raw_references: list[str]) -> list[str]:
         """Take a list of local references to other variables, and produce a
         list of absolute references. 
@@ -399,7 +321,7 @@ def _construct_absolute_path(self, reference: str) -> str:
             absolute_path = group_path_pieces + [reference]
         return '/'.join(absolute_path)
 
-    def _extract_group_and_name(self) -> Tuple[str]:
+    def _extract_group_and_name(self) -> tuple[str, str]:
         """Extract the group and base name of a variable from the full path,
         e.g. '/this/is/my/variable' should return a two-element tuple:
         ('/this/is/my', 'variable').
@@ -412,7 +334,7 @@ def _extract_group_and_name(self) -> Tuple[str]:
         return group_path, name
 
 
-class VariableFromDmr(VariableBase):
+class VariableFromDmr(VariableBase, AttributeContainerFromDmr):
     """This child class inherits from the `VariableBase` class, and implements
     the abstract methods assuming the variable source is part of an XML
     element tree.
@@ -423,33 +345,14 @@ def _get_data_type(self, variable: ET.Element) -> str:
         """Extract a string representation of the variable data type."""
         return variable.tag.lstrip(self.namespace).lower()
 
-    def _get_shape(self, variable: ET.Element) -> Tuple[int]:
+    def _get_shape(self, variable: ET.Element) -> tuple[int, ...]:
         """Extract the shape of the variable data array. This is not yet
         implemented as the Dimension information is currently unavailable to
         the Variable XML content.
 
         """
 
-    def _get_attributes(self, variable: ET.Element) -> Dict:
-        """Locate all child Attribute elements of the variable and extract
-        their associated values.
-
-        """
-        return {
-            attribute.get('name'): self._get_attribute(variable, attribute.get('name'))
-            for attribute in variable.findall(f'{self.namespace}Attribute')
-        }
-
-    def _get_attribute(self, variable: ET.Element, attribute_name: str) -> Any:
-        """Extract the value of an XML Attribute element, casting it to the
-        appropriate type. Apply any necessary metadata overrides or
-        supplements. 
-
-        """
-        raw_value = get_xml_attribute(variable, attribute_name, self.namespace)
-        return self._get_configured_attribute(attribute_name, raw_value)
-
-    def _get_raw_dimensions(self, variable: ET.Element) -> List[str]:
+    def _get_raw_dimensions(self, variable: ET.Element) -> list[str]:
         """Extract the raw dimension names from a XML element."""
         return [
             dimension.get('name')
@@ -457,7 +360,7 @@ def _get_raw_dimensions(self, variable: ET.Element) -> List[str]:
         ]
 
 
-class VariableFromNetCDF4(VariableBase):
+class VariableFromNetCDF4(VariableBase, AttributeContainerFromNetCDF4):
    """This child class inherits from the `VariableBase` class, and implements
    the abstract methods assuming the variable source is part of a NetCDF-4
    file.
@@ -468,26 +371,11 @@ def _get_data_type(self, variable: NetCDF4Variable) -> str:
        """Extract a string representation of the variable data type."""
        return variable.datatype.name
 
-    def _get_shape(self, variable: NetCDF4Variable) -> Tuple[int]:
+    def _get_shape(self, variable: NetCDF4Variable) -> tuple[int, ...]:
        """Extract the shape of the variable data array."""
        return variable.shape
 
-    def _get_attributes(self, variable: NetCDF4Variable) -> Dict:
-        """Identify all variable attributes and save them to a dictionary."""
-        return {
-            attribute_name: self._get_attribute(variable, attribute_name)
-            for attribute_name in variable.ncattrs()
-        }
-
-    def _get_attribute(self, variable: NetCDF4Variable, attribute_name: str) -> Any:
-        """Extract the value of the metadata attribute, applying any necessary
-        overrides or supplements.
-
-        """
-        raw_value = variable.__dict__.get(attribute_name)
-        return self._get_configured_attribute(attribute_name, raw_value)
-
-    def _get_raw_dimensions(self, variable: NetCDF4Variable) -> List[str]:
+    def _get_raw_dimensions(self, variable: NetCDF4Variable) -> list[str]:
     """Retrieve the dimension names as they are stored within the
     variable.