From 588bb0419339aced91cca3f0afe5b67346a6f43c Mon Sep 17 00:00:00 2001 From: loesvdbiggelaar Date: Wed, 17 Apr 2024 14:22:11 +0200 Subject: [PATCH] add namespaces to the rdf output --- biocypher/write/_batch_writer.py | 7 +- biocypher/write/_write.py | 3 +- biocypher/write/graph/_rdf.py | 106 +++++++++++++++---------------- test/fixtures/rdf.py | 1 + test/write/graph/test_rdf.py | 1 - 5 files changed, 61 insertions(+), 57 deletions(-) diff --git a/biocypher/write/_batch_writer.py b/biocypher/write/_batch_writer.py index 20e6d2e4..8ebc0dce 100644 --- a/biocypher/write/_batch_writer.py +++ b/biocypher/write/_batch_writer.py @@ -118,7 +118,8 @@ def __init__( db_password: str = None, db_host: str = None, db_port: str = None, - rdf_format: str = None + rdf_format: str = None, + rdf_namespaces: dict = {} ): """ @@ -201,6 +202,9 @@ class contains all methods expected by a bach writer instance, some of rdf_format: The format of RDF. + + rdf_namespaces: + The namespaces for RDF. """ self.db_name = db_name self.db_user = db_user @@ -208,6 +212,7 @@ class contains all methods expected by a bach writer instance, some of self.db_host = db_host or "localhost" self.db_port = db_port self.rdf_format = rdf_format + self.rdf_namespaces = rdf_namespaces self.delim, self.escaped_delim = self._process_delimiter(delimiter) self.adelim, self.escaped_adelim = self._process_delimiter( diff --git a/biocypher/write/_write.py b/biocypher/write/_write.py index 3e6b3ed8..317bdc22 100644 --- a/biocypher/write/_write.py +++ b/biocypher/write/_write.py @@ -105,5 +105,6 @@ def get_writer( db_user=dbms_config.get("user"), # psql db_password=dbms_config.get("password"), # psql db_port=dbms_config.get("port"), # psql - rdf_format= dbms_config.get("rdf_format") + rdf_format= dbms_config.get("rdf_format"), # rdf + rdf_namespaces= dbms_config.get("rdf_namespaces")# rdf ) diff --git a/biocypher/write/graph/_rdf.py b/biocypher/write/graph/_rdf.py index 1babb269..84423e88 100644 --- a/biocypher/write/graph/_rdf.py +++ b/biocypher/write/graph/_rdf.py @@ -17,7 +17,8 @@ from typing import Union, Optional from ast import literal_eval from collections import OrderedDict, defaultdict -from rdflib import Literal, RDFS, URIRef, Namespace, RDF, Graph +from rdflib import Literal, Namespace, Graph, RDF, RDFS, SKOS, DC, DCTERMS +from rdflib.namespace import _NAMESPACE_PREFIXES_CORE, _NAMESPACE_PREFIXES_RDFLIB import os from more_itertools import peekable @@ -149,31 +150,30 @@ def _write_single_edge_list_to_file( rdf_predicate = rdf_subject + rdf_object edge_label = self.translator.name_sentence_to_pascal(e.get_label()) - edge_uri = self.namespaces["biocypher"][edge_label] + edge_uri = self.rdf_namespaces["biocypher"][edge_label] g.add((edge_uri, RDF.type, RDFS.Class)) - g.add((self.namespaces["biocypher"][rdf_predicate], RDFS.Class, edge_uri)) - g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"]["subject"], self.label_to_uri(rdf_subject))) - g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"]["object"], self.label_to_uri(rdf_object))) + g.add((self.rdf_namespaces["biocypher"][rdf_predicate], RDF.type, edge_uri)) + g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.rdf_namespaces["biocypher"]["subject"], self.label_to_uri(rdf_subject))) + g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.rdf_namespaces["biocypher"]["object"], self.label_to_uri(rdf_object))) for key, value in rdf_properties.items(): # only write value if it exists. if value: if type(value) == list: for v in value: - g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"][key], Literal(v))) + g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.property_to_uri(key), Literal(v))) elif type(value) == str: if value.startswith("[") and value.endswith("]"): value = value.replace("[", "").replace("]", "").replace("'", "").split(", ") try: for v in value: - g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"][key], Literal(v))) + g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.property_to_uri(key), Literal(v))) except (SyntaxError, ValueError, TypeError): - g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"][key], Literal(value))) + g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.property_to_uri(key), Literal(value))) else: - g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"][key], Literal(value))) - + g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.property_to_uri(key), Literal(value))) else: - g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"][key], Literal(value))) + g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.property_to_uri(key), Literal(value))) g.serialize(destination=fileName, format=self.rdf_format) @@ -210,33 +210,6 @@ def _write_single_node_list_to_file( logger.error('Nodes must be passed as type BioCypherNode.') return False - for n in node_list: - - # do not check for deviations in properties. - # This is not applicable for rdf. - if False: - # check for deviations in properties - # node properties - n_props = n.get_properties() - n_keys = list(n_props.keys()) - # reference properties - ref_props = list(prop_dict.keys()) - - # compare lists order invariant - if not set(ref_props) == set(n_keys): - onode = n.get_id() - oprop1 = set(ref_props).difference(n_keys) - oprop2 = set(n_keys).difference(ref_props) - logger.error( - f'At least one node of the class {n.get_label()} ' - f'has more or fewer properties than another. ' - f'Offending node: {onode!r}, offending property: ' - f'{max([oprop1, oprop2])}. ' - f'All reference properties: {ref_props}, ' - f'All node properties: {n_keys}.', - ) - return False - # translate label to PascalCase label_pascal = self.translator.name_sentence_to_pascal(label) @@ -252,12 +225,12 @@ def _write_single_node_list_to_file( rdf_object = n.get_label() properties = n.get_properties() class_name = self.translator.name_sentence_to_pascal(rdf_object) - g.add((self.namespaces["biocypher"][class_name], RDF.type, RDFS.Class)) - g.add((self.label_to_uri(rdf_subject), RDFS.Class, self.namespaces["biocypher"][class_name])) + g.add((self.rdf_namespaces["biocypher"][class_name], RDF.type, RDFS.Class)) + g.add((self.label_to_uri(rdf_subject), RDF.type, self.rdf_namespaces["biocypher"][class_name])) for key, value in properties.items(): # only write value if it exists. if value: - g.add((self.label_to_uri(rdf_subject), self.namespaces["biocypher"][key], Literal(value))) + g.add((self.label_to_uri(rdf_subject), self.property_to_uri(key), Literal(value))) g.serialize(destination=fileName, format=self.rdf_format) @@ -383,24 +356,49 @@ def label_to_uri(self, input): """ _pref, _id = input.split(":") - if _pref in self.namespaces.keys(): - return self.namespaces[_pref][_id] + if _pref in self.rdf_namespaces.keys(): + return self.rdf_namespaces[_pref][_id] else: - return self.namespaces["biocypher"][input] + return self.rdf_namespaces["biocypher"][input] # TODO: this should flow out of the config file! # hardcoded it for now + def property_to_uri(self, input): + # These namespaces are core for rdflib; owl, rdf, rdfs, xsd and xml + for namespace in _NAMESPACE_PREFIXES_CORE.values(): + if input in namespace: + return namespace[input] + + # If the input is not found in the core, I want to search in these first + for namespace in [SKOS, DC, DCTERMS]: + if input in namespace: + return namespace[input] + + # Otherwise, try these other namespaces from rdflib + for namespace in _NAMESPACE_PREFIXES_RDFLIB.values(): + if input in namespace: + return namespace[input] + + # If "licence" is not found, we can try "license" + if input == "licence": + self.property_to_uri("license") + + # As a last resort return the biocypher namespace + return self.rdf_namespaces["biocypher"][input] + + def _init_namespaces(self, graph): - self.namespaces = {} - self.namespaces["biocypher"] = Namespace("http://example.org/biocypher#") - self.namespaces["chembl"] = Namespace("https://www.ebi.ac.uk/chembl/compound_report_card/") - self.namespaces["go"] = Namespace("http://purl.obolibrary.org/obo/GO_") - self.namespaces["mondo"] = Namespace("http://purl.obolibrary.org/obo/MONDO_") - self.namespaces["efo"] = Namespace("http://purl.obolibrary.org/obo/EFO_") - self.namespaces["hp"] = Namespace("http://purl.obolibrary.org/obo/HP_") - - for key, value in self.namespaces.items(): - graph.bind(key, Namespace(value)) + # add biocypher standard to self.rdf_namespaces + biocypher_standard = {"biocypher": "http://example.org/biocypher#"} + if not self.rdf_namespaces: + self.rdf_namespaces = biocypher_standard + else: + self.rdf_namespaces = self.rdf_namespaces | biocypher_standard + + for key, value in self.rdf_namespaces.items(): + namespace = Namespace(value) + self.rdf_namespaces[key] = namespace + graph.bind(key, namespace) \ No newline at end of file diff --git a/test/fixtures/rdf.py b/test/fixtures/rdf.py index b1195017..c89a3203 100644 --- a/test/fixtures/rdf.py +++ b/test/fixtures/rdf.py @@ -14,6 +14,7 @@ def bw_rdf(translator, deduplicator, tmp_path_session): delimiter=",", ) bw_rdf.rdf_format = "xml" + bw_rdf.namespaces = {} yield bw_rdf # teardown diff --git a/test/write/graph/test_rdf.py b/test/write/graph/test_rdf.py index b507268c..9a9dd4ff 100644 --- a/test/write/graph/test_rdf.py +++ b/test/write/graph/test_rdf.py @@ -65,7 +65,6 @@ def test_rdf_write_data(bw_rdf, length, _get_nodes, _get_edges): g = Graph() for file in rdf_files: - print(file) with open(file) as f: g_temp = Graph().parse(data=f.read(), format="xml") g += g_temp