EHR Compatibility #7

Open
wants to merge 44 commits into master
44 commits
b4279a1
Moved some files to use via command line
meliao May 4, 2020
fe2f5a2
Added gitignore; removed error handling
meliao May 4, 2020
cc624ea
load_withdrawals should now accept command line args
meliao May 4, 2020
e0a9286
load_codings should now accept command line parameters
meliao May 4, 2020
6f0b9cc
Printing log messages for pheno query
meliao May 4, 2020
b30a2fd
More logging messages
meliao May 4, 2020
6b19567
Added one logging message
meliao May 5, 2020
143dc27
First commit for pg_clinical loading
meliao May 15, 2020
6dbc8c8
Minor changes from test env
meliao May 15, 2020
61cc7c9
Changes to primary care loading
meliao May 15, 2020
00ebe32
Need to drop duplicates
meliao May 15, 2020
ca3fd29
Need to segment by service provider
meliao May 15, 2020
9421069
I did not think there would be so much data cleaning
meliao May 15, 2020
cedecce
I will refactor later
meliao May 17, 2020
58e5ae4
Found small errors in testing environment. Now for uploading scripts …
meliao May 17, 2020
d8e8a78
PG scripts data parsing complete
meliao May 17, 2020
c3c488f
Changes from bionimbus
meliao May 17, 2020
57ccaef
Added gp_registrations loading
meliao May 19, 2020
7188bfd
Changes from bionimbus
meliao May 19, 2020
03ea9a0
Made some changes before pulling. Whoops
meliao May 19, 2020
a352260
Merge branch 'EHR_loading' of github.com:meliao/ukbrest into EHR_loading
meliao May 19, 2020
acad6e0
Support for Hospital Inpatient records
meliao May 19, 2020
8ff189b
Changes from bionimbus
meliao May 19, 2020
e2f046c
Changes
meliao May 19, 2020
dfd3625
Refactoring
meliao May 19, 2020
74a78d0
Headers for EHR test files
meliao May 20, 2020
0a572dc
Commit before deleting incorrect app.py and load_data.py files
meliao May 21, 2020
0b05ba4
Need to check out main to see how error handling works
meliao May 21, 2020
acb3dac
Beginning to write tests for EHR querying
meliao May 22, 2020
54f6072
More work on testing setup
meliao May 22, 2020
a0ded58
Almost finished; need to update and test dockerfile
meliao May 22, 2020
4765034
Updated environment and Dockerfile
meliao May 27, 2020
8e30a2c
Changed environment file for conda formatting
meliao May 27, 2020
23cc238
Changes to environment file
meliao May 27, 2020
7e5360b
Changes from bionimbus
meliao May 29, 2020
2b0fddb
Now chunking the primary care data loading, and abandoning hopes of a…
meliao Jun 2, 2020
ed728fc
Nonnull row drop warning no longer at chunk level
meliao Jun 2, 2020
69dcb97
Changes from bionimbus
meliao Jun 15, 2020
2140f07
Merge pull request #5 from meliao/master
meliao Jul 17, 2020
f980c24
Merge pull request #6 from meliao/EHR_loading
meliao Jul 22, 2020
9e01a43
Fixed sql schema problems
meliao Aug 7, 2020
c47b23e
Updated travis CI config file with new BGEN installation
meliao Aug 7, 2020
c4f3b84
Added resources to setup_app in wsgi script
meliao Aug 21, 2020
f698227
Merge branch 'master' into EHR_development
miltondp Jul 12, 2021
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,2 +1,3 @@
.DS_Store
__pycache__/
*.py[cod]

8 changes: 4 additions & 4 deletions .travis.yml
@@ -20,24 +20,24 @@ install:
fi
- bash miniconda.sh -b -p $HOME/miniconda
- hash -r
# Install bgenix
# Install bgenix
- wget http://code.enkre.net/bgen/tarball/release/bgen.tgz
- tar -xf bgen.tgz
- tar -xzf bgen.tgz
- cd bgen.tgz
- ./waf configure
- ./waf
- ./build/test/unit/test_bgen
- sudo cp ./build/apps/bgenix /usr/local/bin/
- cd ..
- sudo cp lib/qctool/* /usr/local/bin/
# Install conda and create environment
# Install conda and create environment
- source /home/travis/miniconda/etc/profile.d/conda.sh
- conda config --set always_yes yes --set changeps1 no
- conda update -q conda
- conda info -a
- conda env create -q -n test-environment --file environment.yml
- conda activate test-environment
# - sleep 10
# - sleep 10

script: nosetests --with-coverage

4 changes: 4 additions & 0 deletions Dockerfile
@@ -51,6 +51,10 @@ COPY misc/codings /var/lib/codings
# Other environmental variables
ENV UKBREST_WITHDRAWALS_PATH="/var/lib/withdrawals"

# EHR directories
ENV PRIMARY_CARE_DIR="/var/lib/primary_care"
ENV HOSPITAL_INPATIENT="/var/lib/hospital_inpatient"

WORKDIR /opt

COPY docker/start.py /opt/
2 changes: 1 addition & 1 deletion README.md
@@ -5,7 +5,6 @@

# ukbREST


**Title:** ukbREST: efficient and streamlined data access for reproducible research of large biobanks

**Authors:** Milton Pividori and Hae Kyung Im
@@ -32,6 +31,7 @@ These characteristics make ukbREST an important tool to make biobank’s valuabl
</p>

# News
* 2020-05-22: ukbREST supports [loading](https://github.com/hakyimlab/ukbrest/wiki/Load-real-UK-Biobank-data) and [querying](https://github.com/hakyimlab/ukbrest/wiki/Electronic-health-record-queries) electronic health records from the UK Biobank.
* 2019-12-06: the installation steps for macOS and PostgreSQL have been updated. [Check it out!](https://github.com/hakyimlab/ukbrest/wiki/Installation-instructions)
* 2018-11-25: fix when a dataset has a data-field already loaded. Docker image is now updated.
Check out the [documentation](https://github.com/hakyimlab/ukbrest/wiki/Load-real-UK-Biobank-data) (Section `Duplicated data-fields`).
Binary file removed codemap/.DS_Store
Binary file not shown.
21 changes: 20 additions & 1 deletion docker/start.py
@@ -7,7 +7,7 @@
import re

from ukbrest.config import logger, GENOTYPE_PATH_ENV, PHENOTYPE_PATH, PHENOTYPE_CSV_ENV, DB_URI_ENV, CODINGS_PATH, \
SAMPLES_DATA_PATH, WITHDRAWALS_PATH
SAMPLES_DATA_PATH, WITHDRAWALS_PATH, PRIMARY_CARE_DIR, HOSPITAL_INPATIENT_DIR


parser = argparse.ArgumentParser()
@@ -16,6 +16,7 @@
parser.add_argument('--load-codings', action='store_true', help='Loads a set of codings files (coding_NUM.tsv).')
parser.add_argument('--load-withdrawals', action='store_true', help='Loads a list of participants who has withdrawn consent (*.csv files).')
parser.add_argument('--load-samples-data', action='store_true', help='Loads a set of files containing information about samples.')
parser.add_argument('--load-ehr', action='store_true', help='Loads electronic health records.')

args, unknown_args = parser.parse_known_args()

@@ -118,6 +119,19 @@ def _setup_db_uri():
parser.error('No DB URI was specified. You have to set it using the environment variable UKBREST_DB_URI. For '
'example, for PostgreSQL, the format is: postgresql://user:pass@host:port/dbname')

def _setup_ehr_paths():
primary_care_dir = environ.get(PRIMARY_CARE_DIR, None)
if not isdir(primary_care_dir):
parser.error("The specified primary care directory does not exist.")

hospital_inpatient_dir = environ.get(HOSPITAL_INPATIENT_DIR, None)
if not isdir(hospital_inpatient_dir):
parser.error("The specified hospital inpatient directory does not exist")

if (primary_care_dir is None) and (hospital_inpatient_dir is None):
parser.error("Neither primary care nor hospital inpatient directories were specified.")



if __name__ == '__main__':
if args.load:
@@ -149,6 +163,11 @@ def _setup_db_uri():

commands = ('python', ['python', '/opt/ukbrest/load_data.py', '--load-samples-data'] + unknown_args)

elif args.load_ehr:
_setup_ehr_paths()
_setup_db_uri()

commands = ('python', ['python', '/opt/ukbrest/load_data.py', '--load-ehr'])
else:
_setup_genotype_path()
_setup_db_uri()
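A note on the _setup_ehr_paths hunk above: environ.get() returns None when a variable is unset, so isdir(None) would raise a TypeError before the final "neither directory was specified" error can ever fire. A minimal sketch of the same validation with the checks reordered (hypothetical, not the code in this PR, using the same names imported in docker/start.py):

def _setup_ehr_paths():
    primary_care_dir = environ.get(PRIMARY_CARE_DIR, None)
    hospital_inpatient_dir = environ.get(HOSPITAL_INPATIENT_DIR, None)

    # Fail first if neither directory was configured at all.
    if primary_care_dir is None and hospital_inpatient_dir is None:
        parser.error('Neither primary care nor hospital inpatient directories were specified.')

    # Only hand real paths to isdir(); skip the check for an unset variable.
    if primary_care_dir is not None and not isdir(primary_care_dir):
        parser.error('The specified primary care directory does not exist.')

    if hospital_inpatient_dir is not None and not isdir(hospital_inpatient_dir):
        parser.error('The specified hospital inpatient directory does not exist.')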
47 changes: 24 additions & 23 deletions environment.yml
@@ -1,26 +1,27 @@
name: ukbrest
channels:
- defaults
- conda-forge
- defaults
- conda-forge
dependencies:
- coveralls=1.2.0
- coverage=4.5.1
- eventlet=0.21.0
- flask-restful=0.3.6
- ruamel.yaml=0.15.34
- beautifulsoup4=4.6.0
- flask=0.12.2
- gevent=1.2.2
- gunicorn=19.7.1
- html5lib=0.999999999
- ipython=6.2.1
- joblib=0.11
- lxml=4.1.1
- numpy=1.13.3
- pandas=0.21.0
- psycopg2=2.7.3.2
- python=3.6.3
- sqlalchemy=1.1.13
- sqlite=3.20.1
- nose=1.3.7
- flask-httpauth=3.2
- coverage
- flask-httpauth=3.2
- eventlet=0.21.0
- ruamel.yaml=0.15.34
- sqlalchemy
- flask

Collaborator comment:
I think it's better to fix most of the versions here, since otherwise the Docker image will always be built with different versions of critical packages like flask.

My current approach is to have two environment.yml files:

  1. One with most of the packages with fixed major versions, like, for instance, python==3.8 or numpy=1.13 (note that I'm not fixing the revision part of the version). This one is for production, building the Docker image, etc.
  2. Another one, intended for developers, with the list of needed packages and almost no versions; this one is to easily update the environment when you want to do so. For example, for a major release of ukbREST, you use this file to create a new conda environment, and then export that environment to update the first file (the one for production).

- gevent=1.2.2
- sqlite=3.20.1
- coveralls=1.2.0
- python=3.6.3
- psycopg2=2.7.3.2
- lxml=4.1.1
- html5lib=0.999999999
- gunicorn=19.7.1
- joblib=0.11
- nose=1.3.7
- numpy=1.13.3
- ipython=6.2.1
- beautifulsoup4=4.6.0
- flask-restful=0.3.6
- pandas=0.21.0
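
Regarding the pinning comment above, a quick hypothetical helper (not part of the PR) that lists which conda dependencies in this environment.yml are left unpinned, using the ruamel.yaml package that is already in the environment:

import ruamel.yaml

yaml = ruamel.yaml.YAML(typ='safe')
with open('environment.yml') as f:
    env = yaml.load(f)

# Dependencies here are plain 'name=version' strings; anything without '=' is unpinned.
unpinned = [dep for dep in env['dependencies'] if isinstance(dep, str) and '=' not in dep]
print('Unpinned packages:', unpinned)  # e.g. ['coverage', 'sqlalchemy', 'flask']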

159 changes: 159 additions & 0 deletions start.py
@@ -0,0 +1,159 @@
#!/usr/bin/env python

Collaborator comment:
Why is this file here? It looks similar to docker/start.py, but without the new methods you introduced like _setup_ehr_paths.


from os import listdir, execvp
from os import environ
from os.path import isdir, join, basename
import argparse
import re

from ukbrest.config import logger, GENOTYPE_PATH_ENV, PHENOTYPE_PATH, PHENOTYPE_CSV_ENV, DB_URI_ENV, CODINGS_PATH, \
SAMPLES_DATA_PATH, WITHDRAWALS_PATH


parser = argparse.ArgumentParser()
parser.add_argument('--load', action='store_true', help='Specifies whether data should be loaded into the DB.')
parser.add_argument('--load-sql', action='store_true', help='Loads some useful SQL functions into the database.')
parser.add_argument('--load-codings', action='store_true', help='Loads a set of codings files (coding_NUM.tsv).')
parser.add_argument('--load-withdrawals', action='store_true', help='Loads a list of participants who has withdrawn consent (*.csv files).')
parser.add_argument('--load-samples-data', action='store_true', help='Loads a set of files containing information about samples.')

args, unknown_args = parser.parse_known_args()


def _setup_genotype_path():
genotype_path = environ.get(GENOTYPE_PATH_ENV, None)

if not isdir(genotype_path):
logger.warning('The genotype directory does not exist. You have to mount it using '
'the option "-v hostDir:{}" of "docker run"'.format(genotype_path))
return

bgen_files = [f for f in listdir(genotype_path) if f.lower().endswith('.bgen')]
if len(bgen_files) == 0:
logger.warning('No .bgen files were found in the genotype directory')

bgi_files = [f for f in listdir(genotype_path) if f.lower().endswith('.bgi')]
if len(bgi_files) == 0:
logger.warning('No .bgi files were found in the genotype directory')


def _setup_phenotype_path():
phenotype_path = environ.get(PHENOTYPE_PATH, None)

if not isdir(phenotype_path):
parser.error('The phenotype directory does not exist. You have to mount it using '
'the option "-v hostDir:{}" of "docker run"'.format(phenotype_path))

filename_number_pattern = re.compile('(?P<dataset_id>\d+)')

def sort_datasets(f):
"""Returns the first number found in the filename as a float. If none is found, then return a minimum number."""
filename = basename(f)

m = re.search(filename_number_pattern, filename)
if m is not None:
return float(m.group('dataset_id'))

return float('-inf')

# by default, sort .csv files in reverse order taking the first number found in their names.
# So for instance, these files: ukb00.csv, ukb01.csv and ukb50.csv would be loaded in
# this order: ukb50.csv, ukb01.csv and ukb00.csv
# the number in the file is interpreted as the dataset id, and greater means newer.
phenotype_csv_file = sorted(
[f for f in listdir(phenotype_path) if f.lower().endswith('.csv')],
key=sort_datasets,
reverse=True
)

# check whether there is at least one and only one csv file
if len(phenotype_csv_file) == 0:
parser.error('No .csv files were found in the phenotype directory')

environ[PHENOTYPE_CSV_ENV] = ';'.join([join(phenotype_path, csv_file) for csv_file in phenotype_csv_file])


def _setup_codings():
phenotype_path = environ.get(PHENOTYPE_PATH, None)
coding_path = environ.get(CODINGS_PATH, None)

if coding_path is None:
environ[CODINGS_PATH] = 'codings'
coding_path = 'codings'

coding_path = join(phenotype_path, coding_path)

if not isdir(coding_path):
parser.error('The codings directory does not exist: {}'.format(coding_path))


def _setup_withdrawals():
withdrawals_path = environ.get(WITHDRAWALS_PATH, None)

if withdrawals_path is None:
parser.error('The withdrawals directory was not specified')

if not isdir(withdrawals_path):
parser.error('The withdrawals directory does not exist: {}'.format(withdrawals_path))


def _setup_samples_data():
phenotype_path = environ.get(PHENOTYPE_PATH, None)
samples_data_path = environ.get(SAMPLES_DATA_PATH, None)

if samples_data_path is None:
environ[SAMPLES_DATA_PATH] = 'samples_data'
samples_data_path = 'samples_data'

samples_data_path = join(phenotype_path, samples_data_path)

if not isdir(samples_data_path):
parser.error('The samples data directory does not exist: {}'.format(samples_data_path))


def _setup_db_uri():
db_uri = environ.get(DB_URI_ENV, None)

if db_uri is None:
parser.error('No DB URI was specified. You have to set it using the environment variable UKBREST_DB_URI. For '
'example, for PostgreSQL, the format is: postgresql://user:pass@host:port/dbname')


if __name__ == '__main__':
if args.load:
_setup_phenotype_path()
_setup_db_uri()

commands = ('python', ['python', '/opt/ukbrest/load_data.py'])

elif args.load_sql:
_setup_db_uri()

commands = ('python', ['python', '/opt/ukbrest/load_data.py', '--load-sql'])

elif args.load_codings:
_setup_codings()
_setup_db_uri()

commands = ('python', ['python', '/opt/ukbrest/load_data.py', '--load-codings'])

elif args.load_withdrawals:
_setup_withdrawals()
_setup_db_uri()

commands = ('python', ['python', '/opt/ukbrest/load_data.py', '--load-withdrawals'])

elif args.load_samples_data:
_setup_samples_data()
_setup_db_uri()

commands = ('python', ['python', '/opt/ukbrest/load_data.py', '--load-samples-data'] + unknown_args)

else:
_setup_genotype_path()
_setup_db_uri()
# TODO: check if data was loaded into PostgreSQL

commands = ('gunicorn', ['gunicorn', 'ukbrest.wsgi:app'])

execvp(*commands)
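
For reference, a small standalone illustration (not part of the PR) of how the sort_datasets key above orders phenotype CSV files:

import re

filename_number_pattern = re.compile(r'(?P<dataset_id>\d+)')

def sort_datasets(filename):
    # Use the first number in the filename as the dataset id; no number sorts last.
    m = re.search(filename_number_pattern, filename)
    return float(m.group('dataset_id')) if m is not None else float('-inf')

files = ['ukb00.csv', 'ukb01.csv', 'ukb50.csv']
print(sorted(files, key=sort_datasets, reverse=True))
# ['ukb50.csv', 'ukb01.csv', 'ukb00.csv']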
7 changes: 7 additions & 0 deletions tests/data/ehr/gp_clinical.txt
@@ -0,0 +1,7 @@
eid data_provider event_dt read_2 read_3 value1 value2 value3

Collaborator comment:
Make sure this is not real data.


Collaborator comment:
And I would add a proper extension to the file, probably .tsv here?

1 2 22/03/2014 j550.
2 1 17/03/2016 42Z7. 12.800
2 1 17/03/2016 42Z7. 12.800
3 1 04/11/2013 426.. 4.000
3 1 04/11/2013 4266. 0.000
3 1 04/11/2013 428.. 28.700
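
The commit history above mentions chunking the primary care loading and dropping duplicates. As a rough, hypothetical sketch (the PR's actual loader lives in load_data.py, which is not part of this diff), a tab-delimited gp_clinical extract like the test file above could be cleaned as follows:

import pandas as pd

# Hypothetical sketch; assumes the file is tab-delimited with the header shown above.
for chunk in pd.read_csv('tests/data/ehr/gp_clinical.txt', sep='\t', dtype=str, chunksize=10000):
    chunk = chunk.drop_duplicates()  # e.g. the repeated 42Z7. row for eid 2
    chunk['event_dt'] = pd.to_datetime(chunk['event_dt'], format='%d/%m/%Y', errors='coerce')
    # ...append the cleaned chunk to the gp_clinical table here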
8 changes: 8 additions & 0 deletions tests/data/ehr/gp_registrations.txt
@@ -0,0 +1,8 @@
eid data_provider reg_date deduct_date
1 2 02/02/1902
2 1 12/06/2002 21/03/2014
2 1 25/04/2014
3 1 30/09/1993
3 30/09/1993
4 30/09/1993
5 6
9 changes: 9 additions & 0 deletions tests/data/ehr/gp_scripts.txt
@@ -0,0 +1,9 @@
eid data_provider issue_date read_2 bnf_code dmd_code drug_name quantity
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet
1 2 30/10/2015 02.08.02.00.00 Warfarin 500microgram tablets 56 tablet
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet
1 2 07/07/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet
1 2 07/07/2015 02.08.02.00.00 Warfarin 500microgram tablets 56 tablet
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet
1 2 07/12/2015 07.04.01.01.00 Tamsulosin 400microgram / Dutasteride 500microgram capsules 30 capsule
5 changes: 5 additions & 0 deletions tests/data/ehr/hesin.txt
@@ -0,0 +1,5 @@
eid ins_index dsource source epistart epiend epidur bedyear epistat epitype epiorder spell_index spell_seq spelbgin spelend speldur pctcode gpprpct category elecdate elecdur admidate admimeth_uni admimeth admisorc_uni admisorc firstreg classpat_uni classpat intmanag_uni intmanag mainspef_uni mainspef tretspef_uni tretspef operstat disdate dismeth_uni dismeth disdest_uni disdest carersi
1 1 HES 18 20120305 20120306 1 1 3
1 2 HES 18 20120305 20120306 1 2 3
1 3 GGG 19 20120305 20120306 1 1 3
2 1 HES 18 20120305 20120306 1 1 3
6 changes: 6 additions & 0 deletions tests/data/ehr/hesin_diag.txt
@@ -0,0 +1,6 @@
eid ins_index arr_index level diag_icd9 diag_icd9_nb diag_icd10 diag_icd10_nb
1 1 1 Code_A
1 2 1 Code_B
1 2 2 Code_A
2 1 1 Code_C
3 1 1 Code_B
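
The hesin and hesin_diag tables share the eid and ins_index keys, so each diagnosis can be linked back to its hospital episode. A small hypothetical pandas sketch (not the PR's query code; column names are taken from the headers above):

import pandas as pd

hesin = pd.read_csv('tests/data/ehr/hesin.txt', sep='\t', dtype=str)
hesin_diag = pd.read_csv('tests/data/ehr/hesin_diag.txt', sep='\t', dtype=str)

# One row per diagnosis, annotated with the episode start and end dates from hesin.
diag_episodes = hesin_diag.merge(
    hesin[['eid', 'ins_index', 'epistart', 'epiend']],
    on=['eid', 'ins_index'],
    how='left',
)
print(diag_episodes.head())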
7 changes: 7 additions & 0 deletions tests/data/ehr_missing/gp_clinical.txt
@@ -0,0 +1,7 @@
eid data_provider event_dt read_2 read_3 value1 value2 value3
1 2 22/03/2014 j550.
2 1 17/03/2016 42Z7. 12.800
2 1 17/03/2016 42Z7. 12.800
3 1 04/11/2013 426.. 4.000
3 1 04/11/2013 4266. 0.000
3 1 04/11/2013 428.. 28.700
9 changes: 9 additions & 0 deletions tests/data/ehr_missing/gp_scripts.txt
@@ -0,0 +1,9 @@
eid data_provider issue_date read_2 bnf_code dmd_code drug_name quantity
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet
1 2 30/10/2015 02.08.02.00.00 Warfarin 500microgram tablets 56 tablet
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet
1 2 07/07/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet
1 2 07/07/2015 02.08.02.00.00 Warfarin 500microgram tablets 56 tablet
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet
1 2 07/12/2015 07.04.01.01.00 Tamsulosin 400microgram / Dutasteride 500microgram capsules 30 capsule
5 changes: 5 additions & 0 deletions tests/data/ehr_missing/hesin.txt
@@ -0,0 +1,5 @@
eid ins_index dsource source epistart epiend epidur bedyear epistat epitype epiorder spell_index spell_seq spelbgin spelend speldur pctcode gpprpct category elecdate elecdur admidate admimeth_uni admimeth admisorc_uni admisorc firstreg classpat_uni classpat intmanag_uni intmanag mainspef_uni mainspef tretspef_uni tretspef operstat disdate dismeth_uni dismeth disdest_uni disdest carersi
1 1 HES 18 20120305 20120306 1 1 3
1 2 HES 18 20120305 20120306 1 2 3
1 3 GGG 19 20120305 20120306 1 1 3
2 1 HES 18 20120305 20120306 1 1 3
4 changes: 4 additions & 0 deletions tests/data/withdrawals/withdrawals.csv
@@ -0,0 +1,4 @@
1000
1001
1002
1003
1 change: 1 addition & 0 deletions tests/settings.py
@@ -6,5 +6,6 @@
# -e POSTGRES_DB=ukb -p 5432:5432 postgres:9.6
POSTGRESQL_ENGINE='postgresql://test:test@localhost:5432/ukb'


# SQLite
SQLITE_ENGINE='sqlite:///tmp.db'
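
Both values are plain SQLAlchemy connection URIs; a minimal, hypothetical check that the dockerized PostgreSQL instance from the comment above is reachable:

from sqlalchemy import create_engine, text

from tests.settings import POSTGRESQL_ENGINE

engine = create_engine(POSTGRESQL_ENGINE)
with engine.connect() as conn:
    # Prints (1,) when the test database is up.
    print(conn.execute(text('SELECT 1')).fetchone())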