From 588bb0419339aced91cca3f0afe5b67346a6f43c Mon Sep 17 00:00:00 2001
From: loesvdbiggelaar <loes@thehyve.nl>
Date: Wed, 17 Apr 2024 14:22:11 +0200
Subject: [PATCH] add namespaces to the rdf output

---
 biocypher/write/_batch_writer.py |   7 +-
 biocypher/write/_write.py        |   3 +-
 biocypher/write/graph/_rdf.py    | 106 +++++++++++++++----------------
 test/fixtures/rdf.py             |   1 +
 test/write/graph/test_rdf.py     |   1 -
 5 files changed, 61 insertions(+), 57 deletions(-)

diff --git a/biocypher/write/_batch_writer.py b/biocypher/write/_batch_writer.py
index 20e6d2e4..8ebc0dce 100644
--- a/biocypher/write/_batch_writer.py
+++ b/biocypher/write/_batch_writer.py
@@ -118,7 +118,8 @@ def __init__(
         db_password: str = None,
         db_host: str = None,
         db_port: str = None,
-        rdf_format: str = None
+        rdf_format: str = None,
+        rdf_namespaces: dict = {}
     ):
         """
 
@@ -201,6 +202,9 @@ class contains all methods expected by a bach writer instance, some of
             
             rdf_format:
                 The format of RDF.
+            
+            rdf_namespaces:
+                Mapping of namespace prefix to namespace URI for the RDF output.
         """
         self.db_name = db_name
         self.db_user = db_user
@@ -208,6 +212,7 @@ class contains all methods expected by a bach writer instance, some of
         self.db_host = db_host or "localhost"
         self.db_port = db_port
         self.rdf_format = rdf_format
+        self.rdf_namespaces = rdf_namespaces
 
         self.delim, self.escaped_delim = self._process_delimiter(delimiter)
         self.adelim, self.escaped_adelim = self._process_delimiter(
diff --git a/biocypher/write/_write.py b/biocypher/write/_write.py
index 3e6b3ed8..317bdc22 100644
--- a/biocypher/write/_write.py
+++ b/biocypher/write/_write.py
@@ -105,5 +105,6 @@ def get_writer(
             db_user=dbms_config.get("user"),  # psql
             db_password=dbms_config.get("password"),  # psql
             db_port=dbms_config.get("port"),  # psql
-            rdf_format= dbms_config.get("rdf_format")
+            rdf_format= dbms_config.get("rdf_format"), # rdf
+            rdf_namespaces= dbms_config.get("rdf_namespaces")# rdf
         )
diff --git a/biocypher/write/graph/_rdf.py b/biocypher/write/graph/_rdf.py
index 1babb269..84423e88 100644
--- a/biocypher/write/graph/_rdf.py
+++ b/biocypher/write/graph/_rdf.py
@@ -17,7 +17,8 @@
 from typing import Union, Optional
 from ast import literal_eval
 from collections import OrderedDict, defaultdict
-from rdflib import Literal, RDFS, URIRef, Namespace, RDF, Graph
+from rdflib import Literal, Namespace, Graph, RDF, RDFS, SKOS, DC, DCTERMS
+from rdflib.namespace import _NAMESPACE_PREFIXES_CORE, _NAMESPACE_PREFIXES_RDFLIB
 import os
 
 from more_itertools import peekable
@@ -149,31 +150,30 @@ def _write_single_edge_list_to_file(
                 rdf_predicate = rdf_subject + rdf_object
 
             edge_label = self.translator.name_sentence_to_pascal(e.get_label())
-            edge_uri = self.namespaces["biocypher"][edge_label]
+            edge_uri = self.rdf_namespaces["biocypher"][edge_label]
             g.add((edge_uri, RDF.type, RDFS.Class))
-            g.add((self.namespaces["biocypher"][rdf_predicate], RDFS.Class, edge_uri))
-            g.add((self.namespaces["biocypher"][rdf_predicate],  self.namespaces["biocypher"]["subject"], self.label_to_uri(rdf_subject)))
-            g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"]["object"], self.label_to_uri(rdf_object)))
+            g.add((self.rdf_namespaces["biocypher"][rdf_predicate], RDF.type, edge_uri))
+            g.add((self.rdf_namespaces["biocypher"][rdf_predicate],  self.rdf_namespaces["biocypher"]["subject"], self.label_to_uri(rdf_subject)))
+            g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.rdf_namespaces["biocypher"]["object"], self.label_to_uri(rdf_object)))
             
             for key, value in rdf_properties.items():
                     # only write value if it exists.
                     if value:
                         if type(value) == list:
                             for v in value:
-                                g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"][key], Literal(v)))
+                                g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.property_to_uri(key), Literal(v)))
                         elif type(value) == str:
                             if value.startswith("[") and value.endswith("]"):  
                                 value = value.replace("[", "").replace("]", "").replace("'", "").split(", ")
                                 try:
                                     for v in value:
-                                        g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"][key], Literal(v)))
+                                        g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.property_to_uri(key), Literal(v)))
                                 except (SyntaxError, ValueError, TypeError):
-                                    g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"][key], Literal(value)))
+                                    g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.property_to_uri(key), Literal(value)))
                             else:
-                                g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"][key], Literal(value)))
-
+                                g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.property_to_uri(key), Literal(value)))
                         else:
-                            g.add((self.namespaces["biocypher"][rdf_predicate], self.namespaces["biocypher"][key], Literal(value)))
+                            g.add((self.rdf_namespaces["biocypher"][rdf_predicate], self.property_to_uri(key), Literal(value)))
                     
         g.serialize(destination=fileName, format=self.rdf_format)
         
@@ -210,33 +210,6 @@ def _write_single_node_list_to_file(
             logger.error('Nodes must be passed as type BioCypherNode.')
             return False
 
-        for n in node_list:
-            
-            # do not check for deviations in properties.
-            # This is not applicable for rdf.
-            if False:
-                # check for deviations in properties
-                # node properties
-                n_props = n.get_properties()
-                n_keys = list(n_props.keys())
-                # reference properties
-                ref_props = list(prop_dict.keys())
-
-                # compare lists order invariant
-                if not set(ref_props) == set(n_keys):
-                    onode = n.get_id()
-                    oprop1 = set(ref_props).difference(n_keys)
-                    oprop2 = set(n_keys).difference(ref_props)
-                    logger.error(
-                        f'At least one node of the class {n.get_label()} '
-                        f'has more or fewer properties than another. '
-                        f'Offending node: {onode!r}, offending property: '
-                        f'{max([oprop1, oprop2])}. '
-                        f'All reference properties: {ref_props}, '
-                        f'All node properties: {n_keys}.',
-                    )
-                    return False
-
         # translate label to PascalCase
         label_pascal = self.translator.name_sentence_to_pascal(label)
 
@@ -252,12 +225,12 @@ def _write_single_node_list_to_file(
             rdf_object = n.get_label()
             properties = n.get_properties()
             class_name = self.translator.name_sentence_to_pascal(rdf_object)
-            g.add((self.namespaces["biocypher"][class_name], RDF.type, RDFS.Class))
-            g.add((self.label_to_uri(rdf_subject), RDFS.Class, self.namespaces["biocypher"][class_name]))
+            g.add((self.rdf_namespaces["biocypher"][class_name], RDF.type, RDFS.Class))
+            g.add((self.label_to_uri(rdf_subject), RDF.type, self.rdf_namespaces["biocypher"][class_name]))
             for key, value in properties.items():
                 # only write value if it exists.
                 if value:
-                    g.add((self.label_to_uri(rdf_subject), self.namespaces["biocypher"][key], Literal(value)))
+                    g.add((self.label_to_uri(rdf_subject), self.property_to_uri(key), Literal(value)))
                 
         
         g.serialize(destination=fileName, format=self.rdf_format)
@@ -383,24 +356,49 @@ def label_to_uri(self, input):
         """
         _pref, _id = input.split(":")
 
-        if _pref in self.namespaces.keys():
-            return self.namespaces[_pref][_id]
+        if _pref in self.rdf_namespaces.keys():
+            return self.rdf_namespaces[_pref][_id]
         else:
-            return self.namespaces["biocypher"][input]
+            return self.rdf_namespaces["biocypher"][input]
         
 
         # TODO: this should flow out of the config file!
         # hardcoded it for now
     
+    def property_to_uri(self, input):
+        """Return a URI for property ``input`` from the first vocabulary defining it.
+
+        Search order: rdflib core namespaces (owl, rdf, rdfs, xsd, xml), then
+        SKOS, DC and DCTERMS, then the other namespaces known to rdflib;
+        unknown names fall back to the "biocypher" namespace.
+        """
+        for namespace in _NAMESPACE_PREFIXES_CORE.values():
+            if input in namespace:
+                return namespace[input]
+        # Preferred vocabularies, checked before the rest of rdflib's namespaces.
+        for namespace in [SKOS, DC, DCTERMS]:
+            if input in namespace:
+                return namespace[input]
+        for namespace in _NAMESPACE_PREFIXES_RDFLIB.values():
+            if input in namespace:
+                return namespace[input]
+        # Retry "licence" as "license"; the result must be returned to take effect.
+        if input == "licence":
+            return self.property_to_uri("license")
+        # As a last resort return the biocypher namespace
+        return self.rdf_namespaces["biocypher"][input]
+            
+
     def _init_namespaces(self, graph):
-        self.namespaces = {}
-        self.namespaces["biocypher"] = Namespace("http://example.org/biocypher#")
-        self.namespaces["chembl"] = Namespace("https://www.ebi.ac.uk/chembl/compound_report_card/")
-        self.namespaces["go"] = Namespace("http://purl.obolibrary.org/obo/GO_")
-        self.namespaces["mondo"] = Namespace("http://purl.obolibrary.org/obo/MONDO_")
-        self.namespaces["efo"] = Namespace("http://purl.obolibrary.org/obo/EFO_")
-        self.namespaces["hp"] = Namespace("http://purl.obolibrary.org/obo/HP_")
-
-        for key, value in self.namespaces.items():
-            graph.bind(key, Namespace(value)) 
+        """Add the default "biocypher" namespace to self.rdf_namespaces and bind every prefix to graph."""
+        biocypher_standard = {"biocypher": "http://example.org/biocypher#"}
+        if not self.rdf_namespaces:
+            self.rdf_namespaces = biocypher_standard
+        else:
+            self.rdf_namespaces = self.rdf_namespaces | biocypher_standard
+        # Replace each URI value with a Namespace object and bind its prefix on the graph.
+        for key, value in self.rdf_namespaces.items():
+            namespace = Namespace(value)
+            self.rdf_namespaces[key] = namespace
+            graph.bind(key, namespace) 
     
\ No newline at end of file
diff --git a/test/fixtures/rdf.py b/test/fixtures/rdf.py
index b1195017..c89a3203 100644
--- a/test/fixtures/rdf.py
+++ b/test/fixtures/rdf.py
@@ -14,6 +14,7 @@ def bw_rdf(translator, deduplicator, tmp_path_session):
         delimiter=",",
     )
     bw_rdf.rdf_format = "xml"
+    bw_rdf.namespaces = {}
     yield bw_rdf
 
     # teardown
diff --git a/test/write/graph/test_rdf.py b/test/write/graph/test_rdf.py
index b507268c..9a9dd4ff 100644
--- a/test/write/graph/test_rdf.py
+++ b/test/write/graph/test_rdf.py
@@ -65,7 +65,6 @@ def test_rdf_write_data(bw_rdf, length, _get_nodes, _get_edges):
 
     g = Graph()
     for file in rdf_files:
-        print(file)
         with open(file) as f:
             g_temp = Graph().parse(data=f.read(), format="xml")
             g += g_temp