diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
index c7a87fa..4bf2b7f 100644
--- a/.github/workflows/pythonpackage.yml
+++ b/.github/workflows/pythonpackage.yml
@@ -21,9 +21,8 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install pipenv
-        python setup.py develop
+        pip install .
-
 
     - name: Test with unittest
       run: |
         pip install pytest
diff --git a/setup.py b/setup.py
index 71441e5..819e079 100644
--- a/setup.py
+++ b/setup.py
@@ -31,9 +31,6 @@
         'negspacy',
         'networkx',
         'gensim',
-        'en_core_sci_sm @ https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz',
-
-    ],
-    dependency_links=['https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz#egg=en_core_sci_sm']
+    ]
 )
 
diff --git a/txt2hpo/nlp.py b/txt2hpo/nlp.py
index b348919..d24f802 100644
--- a/txt2hpo/nlp.py
+++ b/txt2hpo/nlp.py
@@ -1,24 +1,29 @@
 import spacy
-import en_core_sci_sm
 from negspacy.negation import Negex
 from gensim.parsing.preprocessing import remove_stopwords
 from txt2hpo.config import logger
-from txt2hpo.util import hpo_network
+from txt2hpo.util import hpo_network, download_model
 from nltk.stem import RegexpStemmer
 from spacy.tokens import Token
 
 
 def nlp_model(negation_language="en"):
     try:
+        import en_core_sci_sm
         nlp = en_core_sci_sm.load(disable=["tagger", "parser"])
         nlp.add_pipe(nlp.create_pipe('sentencizer'))
         negex = Negex(nlp, language=negation_language, chunk_prefix=["no"])
         nlp.add_pipe(negex, last=True)
         Token.set_extension('negex', default=False, force=True)
 
-    except OSError:
-        nlp = None
-        logger.info('Negation model could not be loaded\n')
+    except ModuleNotFoundError:
+        rl = download_model("https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz")
+        if rl == 0:
+            import en_core_sci_sm
+            nlp = en_core_sci_sm.load(disable=["tagger", "parser"])
+        else:
+            logger.info('Negation model could not be loaded\n')
+            nlp = None
 
     if nlp:
         for not_a_stop in remove_from_stops.split(" "):
@@ -28,15 +33,23 @@ def nlp_model(negation_language="en"):
     return nlp
 
 
 try:
+    import en_core_sci_sm
     nlp_sans_ner = en_core_sci_sm.load(disable=["tagger", "parser", "ner"])
-    logger.info('Using sci spacy language model\n')
+    logger.info('Using scispaCy language model\n')
-except OSError as e:
-    logger.info('Sci spacy language model could not be loaded\n')
-    logger.info('Performing a one-time download of an English language model\n')
-    from spacy.cli import download
-    download('en_core_web_sm')
-    nlp_sans_ner = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
+except ModuleNotFoundError:
+    rl = download_model(
+        "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz")
+    if rl == 0:
+        import en_core_sci_sm
+        nlp_sans_ner = en_core_sci_sm.load(disable=["tagger", "parser", "ner"])
+        logger.info('Using scispaCy language model\n')
+    else:
+        logger.info('scispaCy language model could not be loaded\n')
+        logger.info('Performing a one-time download of an English language model\n')
+        from spacy.cli import download
+        download('en_core_web_sm')
+        nlp_sans_ner = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
 
 # these are used in hpo as part of phenotype definition, should block from filtering
 remove_from_stops = "first second third fourth fifth under over front back behind ca above below without no not "
diff --git a/txt2hpo/util.py b/txt2hpo/util.py
index c7c552a..4d47310 100644
--- a/txt2hpo/util.py
+++ b/txt2hpo/util.py
@@ -4,6 +4,10 @@
 from phenopy.config import config as phenopy_config
 from phenopy import generate_annotated_hpo_network
 
+import sys
+import subprocess
+import os
+
 obo_file = phenopy_config.get('hpo', 'obo_file')
 
 
@@ -57,3 +61,12 @@ def remove_key(dict_list, key):
         if key in d:
             del d[key]
     return dict_list
+
+
+def download_model(filename, user_pip_args=None):
+    download_url = filename
+    pip_args = ["--no-cache-dir"]
+    if user_pip_args:
+        pip_args.extend(user_pip_args)
+    cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
+    return subprocess.call(cmd, env=os.environ.copy())
\ No newline at end of file