Skip to content

Commit

Permalink
Merge pull request #49 from GeneDx/develop
Browse files Browse the repository at this point in the history
release v0.2.3
  • Loading branch information
vgainullin authored Mar 8, 2021
2 parents 06ad18e + 6af7bf7 commit 51c46ee
Show file tree
Hide file tree
Showing 11 changed files with 132 additions and 827 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/pythonpackage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pipenv
pip install .
pip install pytest flake8 pytest-cov
python setup.py develop
- name: Test with unittest
run: |
pip install pytest
pipenv run pytest
pytest tests
12 changes: 5 additions & 7 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,11 @@ pytest = "*"
pytest-cov = "*"

[packages]
nltk = "*"
spacy = "*"
scispacy = "*"
negspacy = "*"
phenopy = "*"
networkx = "*"
gensim = "*"
nltk = "==3.4.5"
spacy = "==2.2.4"
scispacy = "==0.2.4"
negspacy = "==0.1.9"
gensim = "==3.8.1"

[requires]
python_version = "3.7"
771 changes: 0 additions & 771 deletions Pipfile.lock

This file was deleted.

14 changes: 8 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,16 @@
]
},
install_requires=[
'phenopy',
'pandas',
'nltk',
'spacy',
'scispacy',
'negspacy',
'nltk==3.4.5',
'spacy==2.2.4',
'scispacy==0.2.4',
'negspacy==0.1.9',
'networkx',
'gensim',
'obonet',
'requests',
'gensim==3.8.1',


]
)
41 changes: 26 additions & 15 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from txt2hpo.extract import Extractor, Data, group_sequence
from txt2hpo.data import load_model
from tests.test_cases import *
from txt2hpo.util import hpo_network
from txt2hpo.util import hpo_network, non_phenos


class ExtractPhenotypesTestCase(unittest.TestCase):
Expand All @@ -24,15 +24,15 @@ def test_hpo(self):
extract = Extractor(correct_spelling=False)

# Test extracting single phenotype
truth = [{"hpid": ["HP:0001290"], "index": [0, 9], "matched": "Hypotonia"}]
truth = [{"hpid": ["HP:0001252"], "index": [0, 9], "matched": "Hypotonia"}]
self.assertEqual(extract.hpo("Hypotonia").entries_sans_context, truth)

# Test adding non phenotypic term
truth = [{"hpid": ["HP:0001290"], "index": [5, 14], "matched": "hypotonia"}]
truth = [{"hpid": ["HP:0001252"], "index": [5, 14], "matched": "hypotonia"}]
self.assertEqual(extract.hpo("Word hypotonia").entries_sans_context, truth)

# Test handling punctuation
truth = [{"hpid": ["HP:0001290"], "index": [6, 15], "matched": "hypotonia"}]
truth = [{"hpid": ["HP:0001252"], "index": [6, 15], "matched": "hypotonia"}]
self.assertEqual(extract.hpo("Word, hypotonia").entries_sans_context, truth)

# Test extracting a multiword phenotype
Expand All @@ -47,7 +47,7 @@ def test_hpo(self):
self.assertEqual(extract.hpo("Delay developmental").entries_sans_context, truth)

# Test extracting a phenotype with inflectional endings
truth = [{"hpid": ["HP:0001290"], "index": [0, 9], "matched": "Hypotonic"}]
truth = [{"hpid": ["HP:0001252"], "index": [0, 9], "matched": "Hypotonic"}]
self.assertEqual(extract.hpo("Hypotonic").entries_sans_context, truth)

# Test extracting a multiword phenotype with inflectional endings and reversed order
Expand All @@ -71,7 +71,7 @@ def test_hpo(self):

# Test spellchecker
extract = Extractor(correct_spelling=True)
truth = [{"hpid": ["HP:0001290"], "index": [0, 9], "matched": "Hypotonic"}]
truth = [{"hpid": ["HP:0001252"], "index": [0, 9], "matched": "Hypotonic"}]

self.assertEqual(extract.hpo("Hyptonic").entries_sans_context, truth)

Expand Down Expand Up @@ -105,7 +105,7 @@ def test_hpo(self):
self.assertEqual(extract.hpo("RA").entries_sans_context, truth)

# Test extracting multiple phenotypes
truth = [{"hpid": ["HP:0001290"], "index": [0, 9], "matched": "Hypotonia"},
truth = [{"hpid": ["HP:0001252"], "index": [0, 9], "matched": "Hypotonia"},
{"hpid": ["HP:0001263"], "index": [11, 30], "matched": "developmental delay"
}]
self.assertEqual(extract.hpo("Hypotonia, developmental delay").entries_sans_context, truth)
Expand All @@ -114,7 +114,7 @@ def test_hpo(self):
extract = Extractor(correct_spelling=False, max_length=20, chunk_by="max_length")
truth = [
{"hpid": ["HP:0001263"], "index": [0, 19], "matched": "Developmental delay"},
{"hpid": ["HP:0001290"], "index": [21, 30], "matched": "hypotonia"}
{"hpid": ["HP:0001252"], "index": [21, 30], "matched": "hypotonia"}
]
self.assertEqual(extract.hpo("Developmental delay, hypotonia").entries_sans_context, truth)

Expand Down Expand Up @@ -152,7 +152,8 @@ def test_hpo_big_text_spellcheck_on(self):
def test_hpo_big_text_spellcheck_off(self):
# test parsing a page
extract = Extractor(max_neighbors=2, correct_spelling=False, remove_overlapping=True)
self.assertEqual(extract.hpo(test_case11_text).n_entries, 7)
res = extract.hpo(test_case11_text)
self.assertEqual(res.n_entries, 7)

def test_hpo_big_text_spellcheck_off_max3(self):
# test parsing a page
Expand Down Expand Up @@ -319,6 +320,12 @@ def test_extract_json_property(self):
resp = extract.hpo("Wide gait and a wide mouth")
self.assertEqual(truth, resp.json)

def test_extract_full_context(self):
    """With phenotypes_only=False, non-phenotypic HPO terms (here a mode of
    inheritance) are retained and annotated via the 'type' key."""
    extractor = Extractor(max_neighbors=2, correct_spelling=False, phenotypes_only=False)
    first_entry = extractor.hpo("X linked").entries[0]
    self.assertEqual('HP:0001417', first_entry['hpid'][0])
    self.assertEqual('mode_of_inheritance', first_entry['type'])

def test_extract_without_negated(self):

# negation should not apply if negation is part of matched string
Expand Down Expand Up @@ -388,8 +395,12 @@ def test_multiple_matches(self):
resp = extract.hpo("Coloboma, microphthalmia, macrocephaly, ear pit.")
self.assertEqual(set(resp.hpids), set(['HP:0000589', 'HP:0004467', 'HP:0000568', 'HP:0000256']))

def test_handing_term_hyphenation(self):
extract = Extractor(correct_spelling=False, remove_overlapping=True, resolve_conflicts=True)
def test_handling_term_hyphenation(self):
extract = Extractor(correct_spelling=False,
remove_overlapping=True,
resolve_conflicts=True,
max_neighbors=2,
phenotypes_only=False)
hyphenated_phenos = \
[
(hpo_network.nodes()[x]['name'], x) for x in hpo_network.nodes() \
Expand All @@ -401,11 +412,11 @@ def test_handing_term_hyphenation(self):

]
# Phenotypes whose word order matters are a limitation of the current parsing method
known_bugs = ['HP:0000510', 'HP:0030932']
#known_bugs = []
long_phenos = ['HP:0011654', 'HP:0410303']
known_bugs = ['HP:0000510', 'HP:0030932', 'HP:0001215']
long_phenos = ['HP:0011654', 'HP:0410303', 'HP:0000654','HP:0000847','HP:0000864','HP:0000877','HP:0001074']
hyphenated_phenos = [x for x in hyphenated_phenos if x[1] not in known_bugs + long_phenos]

hyphenated_phenos = [x for x in hyphenated_phenos if x[1] not in non_phenos]
hyphenated_phenos = hyphenated_phenos[:10]
for test in hyphenated_phenos:
# current version is not expected to extract very long phenotypes
hpids = extract.hpo(test[0]).hpids
Expand Down
2 changes: 1 addition & 1 deletion txt2hpo/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__project__ = 'txt2hpo'
__version__ = '0.2.2'
__version__ = '0.2.3'
21 changes: 19 additions & 2 deletions txt2hpo/config.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
import configparser
import logging
import os
import requests

from gensim.models import KeyedVectors
from txt2hpo import __project__, __version__

# create logger
logger = logging.getLogger(__project__)
logger.setLevel(logging.DEBUG)
logger.setLevel(logging.ERROR)

# create console handler
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setLevel(logging.ERROR)

# create formatter and add it to the handler
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
Expand Down Expand Up @@ -58,6 +60,21 @@
wv.save(d2v_vw_path)
config['models']['doc2vec'] = d2v_vw_path

# Locate (or fetch once) the HPO ontology file and record its path in config.
config['hpo'] = {}
obo_path = os.path.join(data_directory, 'hp.obo')

if os.path.isfile(obo_path):
    config['hpo']['obo'] = obo_path
else:
    # One-time download; later runs reuse the cached copy on disk.
    url = "http://purl.obolibrary.org/obo/hp.obo"
    r = requests.get(url, allow_redirects=True)
    with open(obo_path, 'wb') as fh:
        fh.write(r.content)
    if os.path.isfile(obo_path):
        config['hpo']['obo'] = obo_path
    else:
        # Fix: the original passed `url` as a lazy-format argument to a message
        # with no placeholder, so the formatter raised and the message was
        # dropped; use %s so the failing URL is actually logged.
        logger.critical("Unable to download hp.obo from %s", url)

config['data'] = {}
spellcheck_vocab_path = os.path.join(os.path.dirname(__file__), 'data/spellcheck_vocab_upd032020.json')
config['data']['spellcheck_vocab'] = spellcheck_vocab_path
Expand Down
31 changes: 26 additions & 5 deletions txt2hpo/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from txt2hpo.nlp import st
from txt2hpo.data import load_model
from txt2hpo.build_tree import search_tree, build_search_tree
from txt2hpo.util import remove_key
from txt2hpo.util import remove_key, non_phenos


class Data(object):
Expand All @@ -29,8 +29,11 @@ def add(self,entry):
def remove(self, item):
self.entries.remove(item)

def remove_tagged(self, tag, state=True):
to_remove = [entry for entry in self.entries if entry[tag] is state]
def remove_tagged(self, tag, state=True, status=True):
if status is True:
to_remove = [entry for entry in self.entries if entry[tag] == state]
else:
to_remove = [entry for entry in self.entries if entry[tag] != state]
for element in to_remove:
self.remove(element)

Expand All @@ -46,13 +49,24 @@ def detect_negation(self):
entry['matched_words'] = []
entry['is_negated'] = True if set(entry['negated']).intersection(set(entry['matched_words'])) else False

def label_terms(self):
    """Annotate every entry with a 'type' key.

    The label is taken from the non_phenos lookup for the entry's HPO id
    (e.g. 'mode_of_inheritance'); ids absent from non_phenos are labeled
    'phenotype'. When an entry carries several ids, the last one wins,
    matching the original iteration order.
    """
    for record in self.entries:
        for term_id in record['hpid']:
            record['type'] = non_phenos.get(term_id, 'phenotype')

def remove_non_phenos(self):
    """Drop every entry whose 'type' label is not 'phenotype'."""
    # status=False inverts the match: entries with type != 'phenotype' go.
    self.remove_tagged(tag='type', state='phenotype', status=False)

def remove_negated(self):
self.detect_negation()
self.remove_tagged('is_negated')

def remove_overlapping(self):
self._mark_overlapping()
self.remove_tagged('is_longest', False)
self.remove_tagged('is_longest', state=False)

def _mark_overlapping(self):
"""
Expand Down Expand Up @@ -126,6 +140,7 @@ def entries_sans_context(self):
result = remove_key(result, 'context')
result = remove_key(result, 'matched_tokens')
result = remove_key(result, 'is_longest')
result = remove_key(result, 'type')
return result

@property
Expand Down Expand Up @@ -157,7 +172,8 @@ def __init__(self, correct_spelling=True,
model=None,
custom_synonyms=None,
negation_language="en",
chunk_by='phrase'
chunk_by='phrase',
phenotypes_only=True,
):

self.correct_spelling = correct_spelling
Expand All @@ -169,6 +185,7 @@ def __init__(self, correct_spelling=True,
self.context_window = context_window
self.negation_model = nlp_model(negation_language=negation_language)
self.chunk_by = chunk_by
self.phenotypes_only = phenotypes_only
if custom_synonyms:
self.search_tree = build_search_tree(custom_synonyms=custom_synonyms)
else:
Expand Down Expand Up @@ -243,6 +260,10 @@ def hpo(self, text):
if self.remove_overlapping:
extracted_terms.remove_overlapping()

extracted_terms.label_terms()
if self.phenotypes_only:
extracted_terms.remove_non_phenos()

return extracted_terms

def find_hpo_terms(self, phen_groups, stemmed_tokens, tokens, base_index):
Expand Down
9 changes: 5 additions & 4 deletions txt2hpo/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
def nlp_model(negation_language="en"):
try:
import en_core_sci_sm
nlp = en_core_sci_sm.load(disable=["tagger", "parser"])
nlp = en_core_sci_sm.load(disable=["tagger", "parser", "lemmatizer"])
nlp.add_pipe(nlp.create_pipe('sentencizer'))
negex = Negex(nlp, language=negation_language, chunk_prefix=["no"])
nlp.add_pipe(negex, last=True)
Expand All @@ -32,9 +32,10 @@ def nlp_model(negation_language="en"):

return nlp


try:
import en_core_sci_sm
nlp_sans_ner = en_core_sci_sm.load(disable=["tagger", "parser", "ner"])
nlp_sans_ner = en_core_sci_sm.load(disable=["tagger", "parser", "ner", "lemmatizer"])
logger.info('Using scispaCy language model\n')

except ModuleNotFoundError:
Expand All @@ -49,11 +50,11 @@ def nlp_model(negation_language="en"):
logger.info('Performing a one-time download of an English language model\n')
from spacy.cli import download
download('en_core_web_sm')
nlp_sans_ner = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
nlp_sans_ner = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "lemmatizer"])

# these words appear in HPO phenotype definitions, so prevent them from being filtered out as stop words
remove_from_stops = "first second third fourth fifth under over front back behind ca above below without no not "
remove_from_stops += "out up side right left more less during than take move full few all to"
remove_from_stops += "out up side right left more less during than take move full few all to i "

for not_a_stop in remove_from_stops.split(" "):
nlp_sans_ner.vocab[not_a_stop].is_stop = False
Expand Down
8 changes: 7 additions & 1 deletion txt2hpo/summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,16 @@
import numpy as np
from txt2hpo.util import group_pairs, summarize_tuples, df_from_tuples
from txt2hpo.config import logger
from phenopy.util import half_product
from functools import reduce


def half_product(num_rows, num_columns):
    """Yield index pairs (m, n) with m <= n: the upper triangle of an
    num_rows x num_columns grid, diagonal included."""
    for row_idx in range(num_rows):
        yield from ((row_idx, col_idx) for col_idx in range(row_idx, num_columns))


def phenotype_distance(extracted_hpos):
"""
Given the return from hpo, find the normalized distance between all terms in the document.
Expand Down
Loading

0 comments on commit 51c46ee

Please sign in to comment.