Skip to content

Commit

Permalink
semehr_annotate.py - working version
Browse files Browse the repository at this point in the history
  • Loading branch information
user name committed Feb 18, 2024
1 parent 57504ec commit e3095db
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 19 deletions.
8 changes: 5 additions & 3 deletions doc/annotation_creation.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,18 +38,20 @@ repopulated; it would not be possible to mix the two outputs.
The first (optional) step is to anonymise the documents:

```
semehr_anon.py --all -i txt_dir -o anon_dir [--xml]
semehr_anon.py -i txt_dir -o anon_dir [--xml]
```

Note: using `--all` writes an output file for every input file; otherwise,
only files actually anonymised (those containing PII) are written.

The annotation step can be performed with:

```
semehr_annotate.sh -i anon_dir/ -o annot_dir/
```

Input files must be named `*.txt`; output files will be named similarly, as `*.json`.
The annotation step requires a config file, specified with `-c`, unless CogStack-SemEHR
is in a well-known location, typically `/opt/semehr/CogStack-SemEHR`.

## DICOM SR annotation

This is similar to Standalone document annotation but with a preceding
Expand Down
1 change: 1 addition & 0 deletions doc/tools.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ export PYTHONPATH=/path/to/Smi_Common_Python # if SmiServices is not yet in your
## Run SemEHR on the sample document to get the semehr_results

```
# input files must be named *.txt, output files will be *.json
./semehr_annotate.sh -i ~/SemEHR/structuredreports/src/data/mtsamples_ihi_docs/ -o ~/SemEHR/structuredreports/src/data/mtsamples_ihi_semehr_results/
```

Expand Down
54 changes: 38 additions & 16 deletions src/applications/semehr_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
# Given an input directory of text files run SemEHR to produce annotations in JSON format.
# NOTE the text files must have .txt extension (see full_text_fn_ptn below).
# Requires:
# config file template in $CS/data/semehr_processor.json
# gcp/gate/bioyodie which requires java
# config file template in /opt/semehr/CogStack-SemEHR/data/semehr_processor.json
# /opt/gcp,gcp/gate/bioyodie which requires java
# RAM disk in /run/user/$id

import argparse
import datetime
from inspect import signature
Expand All @@ -14,17 +15,19 @@
import pprint
import shutil
import sys

from SemEHR.semehr_processor import process_semehr

# Configuration
semehr_conf_file='/opt/semehr/CogStack-SemEHR/data/semehr_processor.json'
semehr_path=None
gcp_path=None
keep_tmp=False

# Global variables
# User-specified options
input_docs_dir = None
output_docs_dir = None

# Global variables
copytree_can_ignore_dirs = 'dirs_exist_ok' in [x.name for x in signature(shutil.copytree).parameters.values()]

# Configure logging before doing anything
Expand All @@ -41,6 +44,7 @@ def find_java11():
else:
logger.error("Cannot find Java-11")
JAVA_HOME = None
logging.debug('Found java11 in %s' % JAVA_HOME)
return JAVA_HOME

# Parse command line arguments
Expand Down Expand Up @@ -74,6 +78,7 @@ def find_java11():
semehr_conf_file=os.path.join(semehr_path, "data", "semehr_processor.json")

# Read the config file
logging.debug('Reading %s' % semehr_conf_file)
conf = {}
with open(semehr_conf_file) as fd:
conf = json.load(fd)
Expand All @@ -92,6 +97,7 @@ def find_java11():
if not os.path.isdir(semehr_path):
raise Exception("cannot find SemEHR")
conf['env']['semehr_path'] = semehr_path
logging.debug('semehr_path %s' % semehr_path)

# If gcp_path given on command line then update config
if gcp_path:
Expand All @@ -110,28 +116,36 @@ def find_java11():
semehr_results_dir = os.path.join(semehr_data_dir, "semehr_results")
log = os.path.join(semehr_data_dir, "annotator.log")

logging.debug('temporary files in %s' % semehr_data_dir)
os.makedirs(semehr_data_dir, exist_ok = True)
os.makedirs(output_dir, exist_ok = True)
os.makedirs(semehr_results_dir, exist_ok = True)
if copytree_can_ignore_dirs:
os.makedirs(txt_dir, exist_ok = True)
else:
print('REMOVING !!! %s' % output_docs_dir)
logging.warning('renaming existing %s' % output_docs_dir)
if os.path.isdir(output_docs_dir):
shutil.rmtree(output_docs_dir) # XXX !!! *** will delete your output directory!!!
os.rename(output_docs_dir, output_docs_dir.rstrip('/')+datetime.datetime.now().strftime('%H%M%S'))

# ---------------------------------------------------------------------
def tidy():
    """Remove the temporary data directories unless keep_tmp is set.

    When keep_tmp is true nothing is deleted and the location of the
    temporary files (semehr_data_dir) is logged instead.  Otherwise the
    txt_dir, output_dir and semehr_results_dir trees are removed, with
    any deletion errors ignored.
    """
    if keep_tmp:
        logging.info('temporary files are kept in %s' % semehr_data_dir)
        return
    # semehr_data_dir itself is deliberately not removed wholesale, so that
    # the log file written there survives; only the data directories go.
    for data_dir in (txt_dir, output_dir, semehr_results_dir):
        shutil.rmtree(data_dir, ignore_errors=True)

# ---------------------------------------------------------------------
# If we find a valid Java then override config
j11 = find_java11()
if j11:
conf['env']['java_home'] = j11

# ---------------------------------------------------------------------
# Update the environment for sub-processes
os.environ['GCP_HOME'] = conf['env']['gcp_home']
os.environ['GATE_HOME'] = conf['env']['gate_home']
Expand All @@ -144,18 +158,19 @@ def tidy():
':'+os.environ['GCP_HOME'] + \
':'+os.environ['GATE_HOME']+'/bin'

# ---------------------------------------------------------------------
# NOTE to include a study use doc_ann_analysis.study_folder = ${STUDY_PATH}
# and doc_ann_analysis.rule_config_path = ${STUDY_CONFIG}

conf['yodie']['input_doc_file_path'] = txt_dir
conf['yodie']['config_xml_path'] = os.path.join(semehr_data_dir, "yodi.xml") # this should be created automatically
conf['yodie']['output_file_path'] = output_dir
conf['yodie']['input_doc_file_path'] = os.path.abspath(txt_dir)
conf['yodie']['config_xml_path'] = os.path.abspath(os.path.join(semehr_data_dir, "yodi.xml")) # this should be created automatically
conf['yodie']['output_file_path'] = os.path.abspath(output_dir)

# Update config file with custom directories
conf['job']['job_status_file_path'] = semehr_data_dir
conf['doc_ann_analysis']['ann_docs_path'] = output_dir
conf['doc_ann_analysis']['full_text_folder'] = input_docs_dir
conf['doc_ann_analysis']['output_folder'] = semehr_results_dir
conf['job']['job_status_file_path'] = os.path.abspath(semehr_data_dir)
conf['doc_ann_analysis']['ann_docs_path'] = os.path.abspath(output_dir)
conf['doc_ann_analysis']['full_text_folder'] = os.path.abspath(txt_dir)
conf['doc_ann_analysis']['output_folder'] = os.path.abspath(semehr_results_dir)
# Delete unused config
del conf['doc_ann_analysis']['rule_config_path']
del conf['doc_ann_analysis']['study_folder']
Expand All @@ -167,24 +182,31 @@ def tidy():
#logger.debug('Final conf: %s' % conf)
#pprint.pprint('Final conf: %s' % conf)

# ---------------------------------------------------------------------
# Copy the input documents into the working space (RAM disk)
logger.debug('Copy from %s to %s' % (input_docs_dir, txt_dir))
if copytree_can_ignore_dirs:
shutil.copytree(input_docs_dir, txt_dir, dirs_exist_ok=True)
else:
shutil.copytree(input_docs_dir, txt_dir)

# ---------------------------------------------------------------------
# Run the annotation code
# XXX do we need to chdir into semehr directory??
logger.debug('Annotating...')
##os.chdir('/opt/semehr/CogStack-SemEHR')
current_dir = os.getcwd()
process_semehr(conf)
os.chdir(current_dir)

# Copy the output documents from the working space back to the user dir
# ---------------------------------------------------------------------
# Copy the output documents from the working space back to the user dir.
# We could copy the fully-featured files in output_dir, but instead prefer
# the annotation-only files from semehr_results_dir.
logger.debug('Copy from %s to %s' % (semehr_results_dir, output_docs_dir))
if copytree_can_ignore_dirs:
shutil.copytree(semehr_results_dir, output_docs_dir, dirs_exist_ok=True)
else:
shutil.copytree(semehr_results_dir, output_docs_dir)

# ---------------------------------------------------------------------
tidy()

0 comments on commit e3095db

Please sign in to comment.