-
Notifications
You must be signed in to change notification settings - Fork 22
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
EHR Compatibility #7
base: master
Are you sure you want to change the base?
Changes from all commits
b4279a1
fe2f5a2
cc624ea
e0a9286
6f0b9cc
b30a2fd
6b19567
143dc27
6dbc8c8
61cc7c9
00ebe32
ca3fd29
9421069
cedecce
58e5ae4
d8e8a78
c3c488f
57ccaef
7188bfd
03ea9a0
a352260
acad6e0
8ff189b
e2f046c
dfd3625
74a78d0
0a572dc
0b05ba4
acb3dac
54f6072
a0ded58
4765034
8e30a2c
23cc238
7e5360b
2b0fddb
ed728fc
69dcb97
2140f07
f980c24
9e01a43
c47b23e
c4f3b84
f698227
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
.DS_Store | ||
__pycache__/ | ||
*.py[cod] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,26 +1,27 @@ | ||
name: ukbrest | ||
channels: | ||
- defaults | ||
- conda-forge | ||
- defaults | ||
- conda-forge | ||
dependencies: | ||
- coveralls=1.2.0 | ||
- coverage=4.5.1 | ||
- eventlet=0.21.0 | ||
- flask-restful=0.3.6 | ||
- ruamel.yaml=0.15.34 | ||
- beautifulsoup4=4.6.0 | ||
- flask=0.12.2 | ||
- gevent=1.2.2 | ||
- gunicorn=19.7.1 | ||
- html5lib=0.999999999 | ||
- ipython=6.2.1 | ||
- joblib=0.11 | ||
- lxml=4.1.1 | ||
- numpy=1.13.3 | ||
- pandas=0.21.0 | ||
- psycopg2=2.7.3.2 | ||
- python=3.6.3 | ||
- sqlalchemy=1.1.13 | ||
- sqlite=3.20.1 | ||
- nose=1.3.7 | ||
- flask-httpauth=3.2 | ||
- coverage | ||
- flask-httpauth=3.2 | ||
- eventlet=0.21.0 | ||
- ruamel.yaml=0.15.34 | ||
- sqlalchemy | ||
- flask | ||
- gevent=1.2.2 | ||
- sqlite=3.20.1 | ||
- coveralls=1.2.0 | ||
- python=3.6.3 | ||
- psycopg2=2.7.3.2 | ||
- lxml=4.1.1 | ||
- html5lib=0.999999999 | ||
- gunicorn=19.7.1 | ||
- joblib=0.11 | ||
- nose=1.3.7 | ||
- numpy=1.13.3 | ||
- ipython=6.2.1 | ||
- beautifulsoup4=4.6.0 | ||
- flask-restful=0.3.6 | ||
- pandas=0.21.0 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
#!/usr/bin/env python | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this file here? It looks similar to |
||
|
||
from os import listdir, execvp | ||
from os import environ | ||
from os.path import isdir, join, basename | ||
import argparse | ||
import re | ||
|
||
from ukbrest.config import logger, GENOTYPE_PATH_ENV, PHENOTYPE_PATH, PHENOTYPE_CSV_ENV, DB_URI_ENV, CODINGS_PATH, \ | ||
SAMPLES_DATA_PATH, WITHDRAWALS_PATH | ||
|
||
|
||
# Command-line switches understood by this start-up script. Each entry is
# (option string, help text); all of them are plain boolean flags.
_SWITCHES = (
    ('--load',
     'Specifies whether data should be loaded into the DB.'),
    ('--load-sql',
     'Loads some useful SQL functions into the database.'),
    ('--load-codings',
     'Loads a set of codings files (coding_NUM.tsv).'),
    ('--load-withdrawals',
     'Loads a list of participants who has withdrawn consent (*.csv files).'),
    ('--load-samples-data',
     'Loads a set of files containing information about samples.'),
)

parser = argparse.ArgumentParser()
for _option, _description in _SWITCHES:
    parser.add_argument(_option, action='store_true', help=_description)

# Arguments this parser does not recognise are kept in ``unknown_args``
# instead of causing an error.
args, unknown_args = parser.parse_known_args()
|
||
|
||
def _setup_genotype_path():
    """Warn when the genotype directory or its .bgen/.bgi files are missing.

    The directory is read from the environment variable named by
    ``GENOTYPE_PATH_ENV``. Nothing is returned and nothing is raised: every
    problem found is only reported through ``logger.warning``.
    """
    genotype_path = environ.get(GENOTYPE_PATH_ENV)

    # isdir(None) raises TypeError, so an unset variable is reported explicitly
    # instead of crashing before the warning can be emitted.
    if genotype_path is None:
        logger.warning('The genotype directory was not specified')
        return

    if not isdir(genotype_path):
        logger.warning('The genotype directory does not exist. You have to mount it using '
                       'the option "-v hostDir:{}" of "docker run"'.format(genotype_path))
        return

    # List the directory once and reuse the result for both extension checks.
    filenames = [f.lower() for f in listdir(genotype_path)]

    if not any(f.endswith('.bgen') for f in filenames):
        logger.warning('No .bgen files were found in the genotype directory')

    if not any(f.endswith('.bgi') for f in filenames):
        logger.warning('No .bgi files were found in the genotype directory')
|
||
|
||
def _setup_phenotype_path():
    """Validate the phenotype directory and publish its .csv files.

    The directory is read from the environment variable named by
    ``PHENOTYPE_PATH``. All ``.csv`` files found there (case-insensitive
    extension) are joined with ``;`` as full paths and stored in the
    environment variable named by ``PHENOTYPE_CSV_ENV``.

    ``parser.error`` is called (printing usage and exiting) when the directory
    is not specified, does not exist, or contains no ``.csv`` file.
    """
    phenotype_path = environ.get(PHENOTYPE_PATH)

    # isdir(None) raises TypeError; report a usage error instead.
    if phenotype_path is None:
        parser.error('The phenotype directory was not specified')

    if not isdir(phenotype_path):
        parser.error('The phenotype directory does not exist. You have to mount it using '
                     'the option "-v hostDir:{}" of "docker run"'.format(phenotype_path))

    # Raw string so that \d is a regex class rather than a string escape.
    filename_number_pattern = re.compile(r'(?P<dataset_id>\d+)')

    def sort_datasets(f):
        """Return the first number found in the filename as a float, or -inf if there is none."""
        m = filename_number_pattern.search(basename(f))
        if m is not None:
            return float(m.group('dataset_id'))

        return float('-inf')

    # by default, sort .csv files in reverse order taking the first number found in their names.
    # So for instance, these files: ukb00.csv, ukb01.csv and ukb50.csv would be loaded in
    # this order: ukb50.csv, ukb01.csv and ukb00.csv
    # the number in the file is interpreted as the dataset id, and greater means newer.
    phenotype_csv_file = sorted(
        [f for f in listdir(phenotype_path) if f.lower().endswith('.csv')],
        key=sort_datasets,
        reverse=True
    )

    # check whether there is at least one csv file (several are allowed)
    if not phenotype_csv_file:
        parser.error('No .csv files were found in the phenotype directory')

    environ[PHENOTYPE_CSV_ENV] = ';'.join(join(phenotype_path, csv_file) for csv_file in phenotype_csv_file)
|
||
|
||
def _setup_codings():
    """Check that the codings directory exists.

    The phenotype directory is read from the environment variable named by
    ``PHENOTYPE_PATH`` and the codings directory from the one named by
    ``CODINGS_PATH``; when the latter is unset it is set to ``'codings'`` in
    the environment. The codings path is joined onto the phenotype directory
    before it is checked.

    ``parser.error`` is called (printing usage and exiting) when the phenotype
    directory is not specified or the resulting codings directory does not
    exist.
    """
    phenotype_path = environ.get(PHENOTYPE_PATH)

    # join(None, ...) raises TypeError; report a usage error instead.
    if phenotype_path is None:
        parser.error('The phenotype directory was not specified')

    # setdefault stores the default in the environment when the variable is unset.
    coding_path = join(phenotype_path, environ.setdefault(CODINGS_PATH, 'codings'))

    if not isdir(coding_path):
        parser.error('The codings directory does not exist: {}'.format(coding_path))
|
||
|
||
def _setup_withdrawals():
    """Check that the withdrawals directory is configured and present.

    The location comes from the environment variable named by
    ``WITHDRAWALS_PATH``. ``parser.error`` is invoked when the variable is
    unset or when it does not point to an existing directory.
    """
    directory = environ.get(WITHDRAWALS_PATH)

    if directory is None:
        parser.error('The withdrawals directory was not specified')

    directory_exists = isdir(directory)
    if not directory_exists:
        message = 'The withdrawals directory does not exist: {}'.format(directory)
        parser.error(message)
|
||
|
||
def _setup_samples_data():
    """Check that the samples data directory exists.

    The phenotype directory is read from the environment variable named by
    ``PHENOTYPE_PATH`` and the samples data directory from the one named by
    ``SAMPLES_DATA_PATH``; when the latter is unset it is set to
    ``'samples_data'`` in the environment. The samples data path is joined
    onto the phenotype directory before it is checked.

    ``parser.error`` is called (printing usage and exiting) when the phenotype
    directory is not specified or the resulting samples data directory does
    not exist.
    """
    phenotype_path = environ.get(PHENOTYPE_PATH)

    # join(None, ...) raises TypeError; report a usage error instead.
    if phenotype_path is None:
        parser.error('The phenotype directory was not specified')

    # setdefault stores the default in the environment when the variable is unset.
    samples_data_path = join(phenotype_path, environ.setdefault(SAMPLES_DATA_PATH, 'samples_data'))

    if not isdir(samples_data_path):
        parser.error('The samples data directory does not exist: {}'.format(samples_data_path))
|
||
|
||
def _setup_db_uri():
    """Abort with a usage error when no database URI is configured.

    The URI is looked up in the environment variable named by ``DB_URI_ENV``;
    its value is not otherwise inspected here.
    """
    if environ.get(DB_URI_ENV) is not None:
        return

    parser.error('No DB URI was specified. You have to set it using the environment variable UKBREST_DB_URI. For '
                 'example, for PostgreSQL, the format is: postgresql://user:pass@host:port/dbname')
|
||
|
||
if __name__ == '__main__':
    loader = ['python', '/opt/ukbrest/load_data.py']

    # Ordered table of run modes: (selected, setup steps, program, argv).
    # The first selected entry wins, so the order encodes the priority of the
    # command-line switches. The last entry is the fallback that starts the
    # web server when no loading switch was given.
    # TODO: check if data was loaded into PostgreSQL
    modes = (
        (args.load,
         (_setup_phenotype_path, _setup_db_uri),
         'python', loader),
        (args.load_sql,
         (_setup_db_uri,),
         'python', loader + ['--load-sql']),
        (args.load_codings,
         (_setup_codings, _setup_db_uri),
         'python', loader + ['--load-codings']),
        (args.load_withdrawals,
         (_setup_withdrawals, _setup_db_uri),
         'python', loader + ['--load-withdrawals']),
        # Only this mode forwards the unrecognised command-line arguments.
        (args.load_samples_data,
         (_setup_samples_data, _setup_db_uri),
         'python', loader + ['--load-samples-data'] + unknown_args),
        (True,
         (_setup_genotype_path, _setup_db_uri),
         'gunicorn', ['gunicorn', 'ukbrest.wsgi:app']),
    )

    _, setup_steps, program, argv = next(mode for mode in modes if mode[0])

    # Run the environment checks for the chosen mode before handing over.
    for step in setup_steps:
        step()

    # Replace the current process with the selected program.
    execvp(program, argv)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
eid data_provider event_dt read_2 read_3 value1 value2 value3 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Make sure this is not real data. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. And I would add a proper extension to the file, probably |
||
1 2 22/03/2014 j550. | ||
2 1 17/03/2016 42Z7. 12.800 | ||
2 1 17/03/2016 42Z7. 12.800 | ||
3 1 04/11/2013 426.. 4.000 | ||
3 1 04/11/2013 4266. 0.000 | ||
3 1 04/11/2013 428.. 28.700 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
eid data_provider reg_date deduct_date | ||
1 2 02/02/1902 | ||
2 1 12/06/2002 21/03/2014 | ||
2 1 25/04/2014 | ||
3 1 30/09/1993 | ||
3 30/09/1993 | ||
4 30/09/1993 | ||
5 6 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
eid data_provider issue_date read_2 bnf_code dmd_code drug_name quantity | ||
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet | ||
1 2 30/10/2015 02.08.02.00.00 Warfarin 500microgram tablets 56 tablet | ||
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet | ||
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet | ||
1 2 07/07/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet | ||
1 2 07/07/2015 02.08.02.00.00 Warfarin 500microgram tablets 56 tablet | ||
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet | ||
1 2 07/12/2015 07.04.01.01.00 Tamsulosin 400microgram / Dutasteride 500microgram capsules 30 capsule |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
eid ins_index dsource source epistart epiend epidur bedyear epistat epitype epiorder spell_index spell_seq spelbgin spelend speldur pctcode gpprpct category elecdate elecdur admidate admimeth_uni admimeth admisorc_uni admisorc firstreg classpat_uni classpat intmanag_uni intmanag mainspef_uni mainspef tretspef_uni tretspef operstat disdate dismeth_uni dismeth disdest_uni disdest carersi | ||
1 1 HES 18 20120305 20120306 1 1 3 | ||
1 2 HES 18 20120305 20120306 1 2 3 | ||
1 3 GGG 19 20120305 20120306 1 1 3 | ||
2 1 HES 18 20120305 20120306 1 1 3 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
eid ins_index arr_index level diag_icd9 diag_icd9_nb diag_icd10 diag_icd10_nb | ||
1 1 1 Code_A | ||
1 2 1 Code_B | ||
1 2 2 Code_A | ||
2 1 1 Code_C | ||
3 1 1 Code_B |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
eid data_provider event_dt read_2 read_3 value1 value2 value3 | ||
1 2 22/03/2014 j550. | ||
2 1 17/03/2016 42Z7. 12.800 | ||
2 1 17/03/2016 42Z7. 12.800 | ||
3 1 04/11/2013 426.. 4.000 | ||
3 1 04/11/2013 4266. 0.000 | ||
3 1 04/11/2013 428.. 28.700 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
eid data_provider issue_date read_2 bnf_code dmd_code drug_name quantity | ||
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet | ||
1 2 30/10/2015 02.08.02.00.00 Warfarin 500microgram tablets 56 tablet | ||
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet | ||
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet | ||
1 2 07/07/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet | ||
1 2 07/07/2015 02.08.02.00.00 Warfarin 500microgram tablets 56 tablet | ||
1 2 30/10/2015 02.08.02.00.00 Warfarin 1mg tablets 28 tablet | ||
1 2 07/12/2015 07.04.01.01.00 Tamsulosin 400microgram / Dutasteride 500microgram capsules 30 capsule |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
eid ins_index dsource source epistart epiend epidur bedyear epistat epitype epiorder spell_index spell_seq spelbgin spelend speldur pctcode gpprpct category elecdate elecdur admidate admimeth_uni admimeth admisorc_uni admisorc firstreg classpat_uni classpat intmanag_uni intmanag mainspef_uni mainspef tretspef_uni tretspef operstat disdate dismeth_uni dismeth disdest_uni disdest carersi | ||
1 1 HES 18 20120305 20120306 1 1 3 | ||
1 2 HES 18 20120305 20120306 1 2 3 | ||
1 3 GGG 19 20120305 20120306 1 1 3 | ||
2 1 HES 18 20120305 20120306 1 1 3 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
1000 | ||
1001 | ||
1002 | ||
1003 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's better to fix (pin) most of the versions here, since otherwise the Docker image will always be built with different versions of critical packages like `flask`. My current approach is to have two `environment.yml` files: one with pinned versions such as `python==3.8` or `numpy=1.13` (note that I'm not fixing the revision part of the version). This one is for production, building the Docker image, etc.