Merge pull request #19 from obophenotype/rel_validation

Relation validation
obophenotype · Apr 14, 2022 · 8186422 · 8186422
2 parents 7bb4614 + dcd9826
commit 8186422
Show file tree

Hide file tree

Showing 7 changed files with 26,301 additions and 0 deletions.
diff --git a/src/ontology/report/all_labels.csv b/src/ontology/report/all_labels.csv
diff --git a/src/ontology/report/mba_relations.csv b/src/ontology/report/mba_relations.csv
diff --git a/src/ontology/report/not_valid_relations.tsv b/src/ontology/report/not_valid_relations.tsv
diff --git a/src/ontology/report/not_valid_relations_lbl.tsv b/src/ontology/report/not_valid_relations_lbl.tsv
diff --git a/src/scripts/relation_validator.py b/src/scripts/relation_validator.py
@@ -0,0 +1,78 @@
+import csv
+import os
+import pandas as pd
+
+
+# Directory that contains this script; the report files are located relative to it.
+_SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+
+# Tab separated relation report that is read by this script.
+REL_REPORT_PATH = os.path.join(_SCRIPT_DIR, "../ontology/report/not_valid_relations.tsv")
+# Tab separated copy of the relation report, extended with labels, that is written by this script.
+REL_REPORT_LBL_PATH = os.path.join(_SCRIPT_DIR, "../ontology/report/not_valid_relations_lbl.tsv")
+
+# Comma separated labels table that is used to look up the labels.
+ALL_LABELS_PATH = os.path.join(_SCRIPT_DIR, "../ontology/report/all_labels.csv")
+
+
+def add_labels_to_report(report_path, labels_path, output_path):
+    """
+    Adds labels to the relation report and writes the extended report as a tab separated file.
+
+    Every row of the report is copied to the output. The values of the report's 'user_olabel' and 'user_slabel'
+    columns are used as lookup keys into the labels table: the keys themselves are written to the 'user_oiri' and
+    'user_siri' output columns and the labels found for them are written to the 'user_olabel' and 'user_slabel'
+    output columns.
+    Args:
+        report_path: Path of the tab separated report. It must provide the 'o', 's', 'olabel', 'slabel',
+            'user_olabel' and 'user_slabel' columns.
+        labels_path: Path of the comma separated labels table. Its first column is the lookup key and the label is
+            read from its 'label' column.
+        output_path: Path of the tab separated file to write.
+    """
+    output_columns = ["o", "s", "olabel", "slabel", "user_oiri", "user_olabel", "user_siri", "user_slabel"]
+
+    records = read_csv_to_dict(report_path, delimiter="\t", generated_ids=True)[1]
+    labels = read_csv_to_dict(labels_path)[1]
+
+    def label_of(term_id):
+        # A term that is missing from the labels table gets an empty label instead of aborting the whole report.
+        return labels.get(term_id, {}).get("label", "")
+
+    labelled_rows = []
+    for record in records.values():
+        labelled_rows.append({"o": record["o"],
+                              "s": record["s"],
+                              "olabel": record["olabel"],
+                              "slabel": record["slabel"],
+                              "user_oiri": record["user_olabel"],
+                              "user_olabel": label_of(record["user_olabel"]),
+                              "user_siri": record["user_slabel"],
+                              "user_slabel": label_of(record["user_slabel"])
+                              })
+
+    # Columns are passed explicitly so that the header row is written even when the report has no rows.
+    report_table = pd.DataFrame(labelled_rows, columns=output_columns)
+    report_table.to_csv(output_path, sep="\t", index=False)
+
+
+def read_csv_to_dict(csv_path, id_column=0, id_column_name="", delimiter=",", id_to_lower=False, generated_ids=False):
+    """
+    Reads csv/tsv file content into a dict. Key is the id column value and the value is dict representation of the
+    row values (each header is a key and column value is the value). The first row of the file is the header row.
+    Blank lines are skipped. Cells that have no matching header (rows longer than the header row) are ignored.
+    Args:
+        csv_path: Path of the CSV file
+        id_column: Id column becomes the keys of the dict. This column should be unique. Default is the first column.
+        id_column_name: Alternative to the numeric id_column, id_column_name specifies id_column by its header string.
+            If the name is not one of the headers, id_column is used.
+        delimiter: Value delimiter. Default is comma.
+        id_to_lower: applies string lowercase operation to the key
+        generated_ids: If 'True', uses row number as the key of the dict. The header is row 0, so the key of the
+            first data row is 1. Takes precedence over id_column, id_column_name and id_to_lower.
+
+    Returns:
+        Function provides two return values: first; headers of the table and second; the CSV content dict. Key of the
+        content is the id column value (or the row number) and the values are dict of row values. An empty file
+        yields an empty header list and an empty dict.
+    """
+    records = dict()
+
+    # newline="" lets the csv module handle line endings itself, including newlines inside quoted values.
+    with open(csv_path, newline="", encoding="utf-8") as fd:
+        rd = csv.reader(fd, delimiter=delimiter, quotechar='"')
+
+        headers = next(rd, [])
+        if id_column_name and id_column_name in headers:
+            id_column = headers.index(id_column_name)
+
+        # Numbering starts at 1 because the header row is row 0.
+        for row_num, row in enumerate(rd, start=1):
+            if not row:
+                # csv.reader returns an empty list for a blank line; there is nothing to record.
+                continue
+
+            if generated_ids:
+                _id = row_num
+            else:
+                _id = row[id_column]
+                if id_to_lower:
+                    _id = str(_id).lower()
+
+            # zip stops at the shorter sequence, so a surplus cell cannot cause an index error.
+            records[_id] = dict(zip(headers, row))
+
+    return headers, records
+
+
+# Executed at module level: reads the relation report and the labels table, writes the labelled report.
+add_labels_to_report(report_path=REL_REPORT_PATH,
+                     labels_path=ALL_LABELS_PATH,
+                     output_path=REL_REPORT_LBL_PATH)
diff --git a/src/sparql/all_labels.sparql b/src/sparql/all_labels.sparql
@@ -0,0 +1,15 @@
+# Lists every owl:Class that has an rdfs:label and whose IRI starts with one of the
+# MBA_, DMBA_, ABA_, DHBA_, HBA_ or PBA_ OBO prefixes, together with that label.
+# Output columns: ?term (class IRI) and ?label.
+# NOTE(review): the rdf, xsd and oboInOwl prefixes are declared but not used by this query.
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
+
+SELECT DISTINCT ?term ?label
+WHERE {
+    ?term a owl:Class .
+    ?term rdfs:label ?label .
+
+    # Keep only the terms whose IRI belongs to one of the six listed namespaces.
+    FILTER (STRSTARTS(STR(?term), "http://purl.obolibrary.org/obo/MBA_") || STRSTARTS(STR(?term), "http://purl.obolibrary.org/obo/DMBA_")
+    || STRSTARTS(STR(?term), "http://purl.obolibrary.org/obo/ABA_") || STRSTARTS(STR(?term), "http://purl.obolibrary.org/obo/DHBA_")
+    || STRSTARTS(STR(?term), "http://purl.obolibrary.org/obo/HBA_") || STRSTARTS(STR(?term), "http://purl.obolibrary.org/obo/PBA_"))
+}
diff --git a/src/sparql/relation_validation.sparql b/src/sparql/relation_validation.sparql
@@ -0,0 +1,22 @@
+# Finds pairs of distinct UBERON classes (?s, ?o) that are connected through a BFO_0000050 (part of) axiom
+# between two of their subclasses: ?s_mba is a subclass of ?s and of the restriction
+# "BFO_0000050 some ?o_mba", and ?o_mba is a subclass of ?o. Only MBA_ and DMBA_ classes are accepted as ?s_mba.
+# Output columns: ?o, ?s, ?olabel, ?slabel, ?user_olabel, ?user_slabel.
+# NOTE(review): ?user_olabel and ?user_slabel carry the IRIs of ?o_mba and ?s_mba as strings, not labels.
+# NOTE(review): this query does not itself test whether ?s is part of ?o in UBERON; presumably that check
+# happens in another step - confirm.
+# NOTE(review): the rdf, xsd, oboInOwl and UBERON prefixes are declared but not used by this query.
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
+PREFIX UBERON: <http://purl.obolibrary.org/obo/UBERON_>
+
+SELECT DISTINCT ?o ?s ?olabel ?slabel (str(?o_mba) as ?user_olabel) (str(?s_mba) as ?user_slabel)
+WHERE{
+    ?s_mba rdfs:subClassOf ?s .
+    # ?s_mba is also a subclass of an existential restriction on BFO_0000050 whose filler is ?o_mba.
+    ?s_mba rdfs:subClassOf ?restriction.
+    ?restriction owl:onProperty <http://purl.obolibrary.org/obo/BFO_0000050> .
+    ?restriction owl:someValuesFrom ?o_mba .
+    ?o_mba rdfs:subClassOf ?o .
+    ?s rdfs:label ?slabel .
+    ?o rdfs:label ?olabel .
+
+    # Both superclasses must be named UBERON classes and must differ from each other.
+    FILTER (isIRI(?o) && STRSTARTS(STR(?o), "http://purl.obolibrary.org/obo/UBERON_"))
+    FILTER (isIRI(?s) && STRSTARTS(STR(?s), "http://purl.obolibrary.org/obo/UBERON_"))
+    FILTER (?o != ?s)
+    # Only the subject side is restricted to the MBA_/DMBA_ namespaces; ?o_mba is not filtered.
+    FILTER (STRSTARTS(STR(?s_mba), "http://purl.obolibrary.org/obo/MBA_") || STRSTARTS(STR(?s_mba), "http://purl.obolibrary.org/obo/DMBA_"))
+}