Skip to content

Commit

Permalink
Merge pull request #49 from GeneDx/develop
Browse files Browse the repository at this point in the history
release v0.2.3
  • Loading branch information
vgainullin authored Mar 8, 2021
2 parents 06ad18e + 6af7bf7 commit 51c46ee
Show file tree
Hide file tree
Showing 11 changed files with 132 additions and 827 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/pythonpackage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pipenv
pip install .
pip install pytest flake8 pytest-cov
python setup.py develop
- name: Test with unittest
run: |
pip install pytest
pipenv run pytest
pytest tests
12 changes: 5 additions & 7 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,11 @@ pytest = "*"
pytest-cov = "*"

[packages]
nltk = "*"
spacy = "*"
scispacy = "*"
negspacy = "*"
phenopy = "*"
networkx = "*"
gensim = "*"
nltk = "==3.4.5"
spacy = "==2.2.4"
scispacy = "==0.2.4"
negspacy = "==0.1.9"
gensim = "==3.8.1"

[requires]
python_version = "3.7"
771 changes: 0 additions & 771 deletions Pipfile.lock

This file was deleted.

14 changes: 8 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,16 @@
]
},
install_requires=[
'phenopy',
'pandas',
'nltk',
'spacy',
'scispacy',
'negspacy',
'nltk==3.4.5',
'spacy==2.2.4',
'scispacy==0.2.4',
'negspacy==0.1.9',
'networkx',
'gensim',
'obonet',
'requests',
'gensim==3.8.1',


]
)
41 changes: 26 additions & 15 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from txt2hpo.extract import Extractor, Data, group_sequence
from txt2hpo.data import load_model
from tests.test_cases import *
from txt2hpo.util import hpo_network
from txt2hpo.util import hpo_network, non_phenos


class ExtractPhenotypesTestCase(unittest.TestCase):
Expand All @@ -24,15 +24,15 @@ def test_hpo(self):
extract = Extractor(correct_spelling=False)

# Test extracting single phenotype
truth = [{"hpid": ["HP:0001290"], "index": [0, 9], "matched": "Hypotonia"}]
truth = [{"hpid": ["HP:0001252"], "index": [0, 9], "matched": "Hypotonia"}]
self.assertEqual(extract.hpo("Hypotonia").entries_sans_context, truth)

# Test adding non phenotypic term
truth = [{"hpid": ["HP:0001290"], "index": [5, 14], "matched": "hypotonia"}]
truth = [{"hpid": ["HP:0001252"], "index": [5, 14], "matched": "hypotonia"}]
self.assertEqual(extract.hpo("Word hypotonia").entries_sans_context, truth)

# Test handling punctuation
truth = [{"hpid": ["HP:0001290"], "index": [6, 15], "matched": "hypotonia"}]
truth = [{"hpid": ["HP:0001252"], "index": [6, 15], "matched": "hypotonia"}]
self.assertEqual(extract.hpo("Word, hypotonia").entries_sans_context, truth)

# Test extracting a multiword phenotype
Expand All @@ -47,7 +47,7 @@ def test_hpo(self):
self.assertEqual(extract.hpo("Delay developmental").entries_sans_context, truth)

# Test extracting a phenotype with inflectional endings
truth = [{"hpid": ["HP:0001290"], "index": [0, 9], "matched": "Hypotonic"}]
truth = [{"hpid": ["HP:0001252"], "index": [0, 9], "matched": "Hypotonic"}]
self.assertEqual(extract.hpo("Hypotonic").entries_sans_context, truth)

# Test extracting a multiword phenotype with inflectional endings and reversed order
Expand All @@ -71,7 +71,7 @@ def test_hpo(self):

# Test spellchecker
extract = Extractor(correct_spelling=True)
truth = [{"hpid": ["HP:0001290"], "index": [0, 9], "matched": "Hypotonic"}]
truth = [{"hpid": ["HP:0001252"], "index": [0, 9], "matched": "Hypotonic"}]

self.assertEqual(extract.hpo("Hyptonic").entries_sans_context, truth)

Expand Down Expand Up @@ -105,7 +105,7 @@ def test_hpo(self):
self.assertEqual(extract.hpo("RA").entries_sans_context, truth)

# Test extracting multiple phenotypes
truth = [{"hpid": ["HP:0001290"], "index": [0, 9], "matched": "Hypotonia"},
truth = [{"hpid": ["HP:0001252"], "index": [0, 9], "matched": "Hypotonia"},
{"hpid": ["HP:0001263"], "index": [11, 30], "matched": "developmental delay"
}]
self.assertEqual(extract.hpo("Hypotonia, developmental delay").entries_sans_context, truth)
Expand All @@ -114,7 +114,7 @@ def test_hpo(self):
extract = Extractor(correct_spelling=False, max_length=20, chunk_by="max_length")
truth = [
{"hpid": ["HP:0001263"], "index": [0, 19], "matched": "Developmental delay"},
{"hpid": ["HP:0001290"], "index": [21, 30], "matched": "hypotonia"}
{"hpid": ["HP:0001252"], "index": [21, 30], "matched": "hypotonia"}
]
self.assertEqual(extract.hpo("Developmental delay, hypotonia").entries_sans_context, truth)

Expand Down Expand Up @@ -152,7 +152,8 @@ def test_hpo_big_text_spellcheck_on(self):
def test_hpo_big_text_spellcheck_off(self):
# test parsing a page
extract = Extractor(max_neighbors=2, correct_spelling=False, remove_overlapping=True)
self.assertEqual(extract.hpo(test_case11_text).n_entries, 7)
res = extract.hpo(test_case11_text)
self.assertEqual(res.n_entries, 7)

def test_hpo_big_text_spellcheck_off_max3(self):
# test parsing a page
Expand Down Expand Up @@ -319,6 +320,12 @@ def test_extract_json_property(self):
resp = extract.hpo("Wide gait and a wide mouth")
self.assertEqual(truth, resp.json)

def test_extract_full_context(self):
    """With phenotypes_only=False, non-phenotypic HPO terms (here a mode of
    inheritance) are retained and annotated via the 'type' key."""
    extractor = Extractor(max_neighbors=2, correct_spelling=False, phenotypes_only=False)
    first_entry = extractor.hpo("X linked").entries[0]
    self.assertEqual('HP:0001417', first_entry['hpid'][0])
    self.assertEqual('mode_of_inheritance', first_entry['type'])

def test_extract_without_negated(self):

# negation should not apply if negation is part of matched string
Expand Down Expand Up @@ -388,8 +395,12 @@ def test_multiple_matches(self):
resp = extract.hpo("Coloboma, microphthalmia, macrocephaly, ear pit.")
self.assertEqual(set(resp.hpids), set(['HP:0000589', 'HP:0004467', 'HP:0000568', 'HP:0000256']))

def test_handing_term_hyphenation(self):
extract = Extractor(correct_spelling=False, remove_overlapping=True, resolve_conflicts=True)
def test_handling_term_hyphenation(self):
extract = Extractor(correct_spelling=False,
remove_overlapping=True,
resolve_conflicts=True,
max_neighbors=2,
phenotypes_only=False)
hyphenated_phenos = \
[
(hpo_network.nodes()[x]['name'], x) for x in hpo_network.nodes() \
Expand All @@ -401,11 +412,11 @@ def test_handing_term_hyphenation(self):

]
# Phenotypes whose word order matters are a limitation of the current parsing method
known_bugs = ['HP:0000510', 'HP:0030932']
#known_bugs = []
long_phenos = ['HP:0011654', 'HP:0410303']
known_bugs = ['HP:0000510', 'HP:0030932', 'HP:0001215']
long_phenos = ['HP:0011654', 'HP:0410303', 'HP:0000654','HP:0000847','HP:0000864','HP:0000877','HP:0001074']
hyphenated_phenos = [x for x in hyphenated_phenos if x[1] not in known_bugs + long_phenos]

hyphenated_phenos = [x for x in hyphenated_phenos if x[1] not in non_phenos]
hyphenated_phenos = hyphenated_phenos[:10]
for test in hyphenated_phenos:
# current version is not expected to extract very long phenotypes
hpids = extract.hpo(test[0]).hpids
Expand Down
2 changes: 1 addition & 1 deletion txt2hpo/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__project__ = 'txt2hpo'
__version__ = '0.2.2'
__version__ = '0.2.3'
21 changes: 19 additions & 2 deletions txt2hpo/config.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
import configparser
import logging
import os
import requests

from gensim.models import KeyedVectors
from txt2hpo import __project__, __version__

# create logger
logger = logging.getLogger(__project__)
logger.setLevel(logging.DEBUG)
logger.setLevel(logging.ERROR)

# create console handler
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setLevel(logging.ERROR)

# create formatter and add it to the handler
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
Expand Down Expand Up @@ -58,6 +60,21 @@
wv.save(d2v_vw_path)
config['models']['doc2vec'] = d2v_vw_path

# Locate (or fetch once) the HPO ontology file and record its path in config.
config['hpo'] = {}
obo_path = os.path.join(data_directory, 'hp.obo')

if os.path.isfile(obo_path):
    config['hpo']['obo'] = obo_path
else:
    # One-time download; later runs reuse the cached copy on disk.
    url = "http://purl.obolibrary.org/obo/hp.obo"
    r = requests.get(url, allow_redirects=True)
    with open(obo_path, 'wb') as fh:
        fh.write(r.content)
    if os.path.isfile(obo_path):
        config['hpo']['obo'] = obo_path
    else:
        # Fix: the original passed `url` as a lazy-format argument to a message
        # with no placeholder, so the formatter raised and the message was
        # dropped; use %s so the failing URL is actually logged.
        logger.critical("Unable to download hp.obo from %s", url)

config['data'] = {}
spellcheck_vocab_path = os.path.join(os.path.dirname(__file__), 'data/spellcheck_vocab_upd032020.json')
config['data']['spellcheck_vocab'] = spellcheck_vocab_path
Expand Down
31 changes: 26 additions & 5 deletions txt2hpo/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from txt2hpo.nlp import st
from txt2hpo.data import load_model
from txt2hpo.build_tree import search_tree, build_search_tree
from txt2hpo.util import remove_key
from txt2hpo.util import remove_key, non_phenos


class Data(object):
Expand All @@ -29,8 +29,11 @@ def add(self,entry):
def remove(self, item):
self.entries.remove(item)

def remove_tagged(self, tag, state=True):
to_remove = [entry for entry in self.entries if entry[tag] is state]
def remove_tagged(self, tag, state=True, status=True):
if status is True:
to_remove = [entry for entry in self.entries if entry[tag] == state]
else:
to_remove = [entry for entry in self.entries if entry[tag] != state]
for element in to_remove:
self.remove(element)

Expand All @@ -46,13 +49,24 @@ def detect_negation(self):
entry['matched_words'] = []
entry['is_negated'] = True if set(entry['negated']).intersection(set(entry['matched_words'])) else False

def label_terms(self):
    """Annotate every entry with a 'type' key.

    The label is taken from the non_phenos lookup for the entry's HPO id
    (e.g. 'mode_of_inheritance'); ids absent from non_phenos are labeled
    'phenotype'. When an entry carries several ids, the last one wins,
    matching the original iteration order.
    """
    for record in self.entries:
        for term_id in record['hpid']:
            record['type'] = non_phenos.get(term_id, 'phenotype')

def remove_non_phenos(self):
    """Drop every entry whose 'type' label is not 'phenotype'."""
    # status=False inverts the match: entries with type != 'phenotype' go.
    self.remove_tagged(tag='type', state='phenotype', status=False)

def remove_negated(self):
self.detect_negation()
self.remove_tagged('is_negated')

def remove_overlapping(self):
self._mark_overlapping()
self.remove_tagged('is_longest', False)
self.remove_tagged('is_longest', state=False)

def _mark_overlapping(self):
"""
Expand Down Expand Up @@ -126,6 +140,7 @@ def entries_sans_context(self):
result = remove_key(result, 'context')
result = remove_key(result, 'matched_tokens')
result = remove_key(result, 'is_longest')
result = remove_key(result, 'type')
return result

@property
Expand Down Expand Up @@ -157,7 +172,8 @@ def __init__(self, correct_spelling=True,
model=None,
custom_synonyms=None,
negation_language="en",
chunk_by='phrase'
chunk_by='phrase',
phenotypes_only=True,
):

self.correct_spelling = correct_spelling
Expand All @@ -169,6 +185,7 @@ def __init__(self, correct_spelling=True,
self.context_window = context_window
self.negation_model = nlp_model(negation_language=negation_language)
self.chunk_by = chunk_by
self.phenotypes_only = phenotypes_only
if custom_synonyms:
self.search_tree = build_search_tree(custom_synonyms=custom_synonyms)
else:
Expand Down Expand Up @@ -243,6 +260,10 @@ def hpo(self, text):
if self.remove_overlapping:
extracted_terms.remove_overlapping()

extracted_terms.label_terms()
if self.phenotypes_only:
extracted_terms.remove_non_phenos()

return extracted_terms

def find_hpo_terms(self, phen_groups, stemmed_tokens, tokens, base_index):
Expand Down
9 changes: 5 additions & 4 deletions txt2hpo/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
def nlp_model(negation_language="en"):
try:
import en_core_sci_sm
nlp = en_core_sci_sm.load(disable=["tagger", "parser"])
nlp = en_core_sci_sm.load(disable=["tagger", "parser", "lemmatizer"])
nlp.add_pipe(nlp.create_pipe('sentencizer'))
negex = Negex(nlp, language=negation_language, chunk_prefix=["no"])
nlp.add_pipe(negex, last=True)
Expand All @@ -32,9 +32,10 @@ def nlp_model(negation_language="en"):

return nlp


try:
import en_core_sci_sm
nlp_sans_ner = en_core_sci_sm.load(disable=["tagger", "parser", "ner"])
nlp_sans_ner = en_core_sci_sm.load(disable=["tagger", "parser", "ner", "lemmatizer"])
logger.info('Using scispaCy language model\n')

except ModuleNotFoundError:
Expand All @@ -49,11 +50,11 @@ def nlp_model(negation_language="en"):
logger.info('Performing a one-time download of an English language model\n')
from spacy.cli import download
download('en_core_web_sm')
nlp_sans_ner = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
nlp_sans_ner = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "lemmatizer"])

# these words appear in HPO phenotype definitions, so prevent them from being filtered out as stop words
remove_from_stops = "first second third fourth fifth under over front back behind ca above below without no not "
remove_from_stops += "out up side right left more less during than take move full few all to"
remove_from_stops += "out up side right left more less during than take move full few all to i "

for not_a_stop in remove_from_stops.split(" "):
nlp_sans_ner.vocab[not_a_stop].is_stop = False
Expand Down
8 changes: 7 additions & 1 deletion txt2hpo/summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,16 @@
import numpy as np
from txt2hpo.util import group_pairs, summarize_tuples, df_from_tuples
from txt2hpo.config import logger
from phenopy.util import half_product
from functools import reduce


def half_product(num_rows, num_columns):
    """Yield index pairs (m, n) with m <= n: the upper triangle of an
    num_rows x num_columns grid, diagonal included."""
    for row_idx in range(num_rows):
        yield from ((row_idx, col_idx) for col_idx in range(row_idx, num_columns))


def phenotype_distance(extracted_hpos):
"""
Given the return from hpo, find the normalized distance between all terms in the document.
Expand Down
Loading

0 comments on commit 51c46ee

Please sign in to comment.