From c7ca5671f521934a035f0db0be389911c372a548 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 2 May 2021 10:16:13 -0500 Subject: [PATCH 01/46] Initial mdt package structure --- src/mdt/__init__.py | 0 src/mdt/meps/__init__.py | 0 src/mdt/rxnorm/__init__.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/mdt/__init__.py create mode 100644 src/mdt/meps/__init__.py create mode 100644 src/mdt/rxnorm/__init__.py diff --git a/src/mdt/__init__.py b/src/mdt/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/mdt/meps/__init__.py b/src/mdt/meps/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/mdt/rxnorm/__init__.py b/src/mdt/rxnorm/__init__.py new file mode 100644 index 0000000..e69de29 From b03342fe99f78a352266f65b3bd069043986c136 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 2 May 2021 10:29:21 -0500 Subject: [PATCH 02/46] Initial database module within mdt package --- src/mdt/database.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/mdt/database.py diff --git a/src/mdt/database.py b/src/mdt/database.py new file mode 100644 index 0000000..e69de29 From d723c173a19c44d3af06f33a64a032910d274ce4 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 2 May 2021 10:29:53 -0500 Subject: [PATCH 03/46] Initial synthea module --- src/mdt/synthea.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/mdt/synthea.py diff --git a/src/mdt/synthea.py b/src/mdt/synthea.py new file mode 100644 index 0000000..e69de29 From 19a0a28e7fa62cea92a34f224ffb1830b9e33481 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 2 May 2021 10:53:30 -0500 Subject: [PATCH 04/46] Moved mdt_functions methods into rxclass.py --- src/mdt/rxnorm/rxclass.py | 67 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 src/mdt/rxnorm/rxclass.py diff --git a/src/mdt/rxnorm/rxclass.py b/src/mdt/rxnorm/rxclass.py new file 
mode 100644 index 0000000..70f422e --- /dev/null +++ b/src/mdt/rxnorm/rxclass.py @@ -0,0 +1,67 @@ + + +def rxclass_findclassesbyid_payload(class_id): + """Generates and returns URLs as strings for hitting the RxClass API function FindClassesById.""" + + param_dict = {'classId':class_id} + + payload = payload_constructor('https://rxnav.nlm.nih.gov/REST/rxclass/class/byId.json?', param_dict) + + return payload + + +def rxclass_getclassmember_payload(class_id, relation, ttys = ['IN','MIN']): + """Generates and returns URLs as strings for hitting the RxClass API function GetClassMembers.""" + + relation_dict = { + 'ATC':"ATC", + 'has_EPC':"DailyMed", + 'has_Chemical_Structure':"DailyMed", + 'has_MoA':"DailyMed", + 'has_PE':"DailyMed", + 'has_EPC':"FDASPL", + 'has_Chemical_Structure':"FDASPL", + #'has_MoA':"FDASPL", + #'has_PE':"FDASPL", + 'has_TC': "FMTSME", + 'CI_with': "MEDRT", + 'induces': "MEDRT", + 'may_diagnose': "MEDRT", + 'may_prevent': "MEDRT", + 'may_treat': "MEDRT", + 'CI_ChemClass': "MEDRT", + 'has_active_metabolites': "MEDRT", + 'has_Ingredient': "MEDRT", + 'CI_MoA': "MEDRT", + #'has_MoA': "MEDRT", + 'has_PK': "MEDRT", + 'site_of_metabolism': "MEDRT", + 'CI_PE': "MEDRT", + #'has_PE': "MEDRT", + 'has_schedule':'RXNORM', + 'MESH': "MESH", + 'isa_disposition': "SNOWMEDCT", + 'isa_structure': "SNOWMEDCT", + 'has_VAClass': "VA", + 'has_VAClass_extended': "VA", + } + + if relation not in list(relation_dict.keys()): + raise ValueError("results: relation must be one of %r." % list(relation_dict.keys())) + + #If relaSource is VA or RXNORM, specify ttys as one or more of: SCD, SBD, GPCK, BPCK. The default TTYs do not intersect VA or RXNORM classes. 
+ if relation_dict.get(relation) in ['VA','RXNORM']: + ttys = ttys.extend(['SCD','SBD','GPCK','BPCK']) + + + param_dict = {'classId':class_id, + 'relaSource':relation_dict.get(relation), + 'ttys':'+'.join(ttys)} + + #Does not send rela parameter on data sources with single rela, see RxClass API documentation + if relation not in ['MESH','ATC']: + param_dict['rela'] = relation + + payload = payload_constructor('https://rxnav.nlm.nih.gov/REST/rxclass/classMembers.json?', param_dict) + + return payload From 8336a3e06c812132b6f332476a7b5328bac59e47 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 2 May 2021 10:54:15 -0500 Subject: [PATCH 05/46] Moved mdt_functions into database.py --- src/mdt/database.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/mdt/database.py b/src/mdt/database.py index e69de29..9ef2370 100644 --- a/src/mdt/database.py +++ b/src/mdt/database.py @@ -0,0 +1,41 @@ + + +def create_mdt_con(): + """create defualt connection to the data/MDT.db database. If database does not exist it creates it.""" + conn = sql.connect('data/MDT.db') + return conn + + +def sql_create_table(table_name, df, conn=None, delete_df=True): + """Creates a table in the connected database when passed a pandas dataframe. 
+ Note default is to delete dataframe if table name is same as global variable name that stores the df and delete_df is True""" + + if conn == None: + conn = create_mdt_con() + + try: + df.to_sql(table_name, conn, if_exists='replace',index=False) + print('{} table created in DB'.format(table_name)) + except: + print('Could not create table {0} in DB'.format(table_name)) + + +def db_query(query_str,conn=None): + """Sends Query to DB and returns results as a dataframe""" + + if conn == None: + conn = create_mdt_con() + + return pd.read_sql(query_str,conn) + + +def read_sql_string(file_name): + """reads the contents of a sql script into a string for python to use in a query""" + + fd = open(file_name, 'r') + query_str = fd.read() + fd.close() + + print('Read {0} file as string'.format(file_name)) + + return query_str From 96dee8bada01aab836bd236810be3e8e208dfa0d Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 2 May 2021 10:54:47 -0500 Subject: [PATCH 06/46] Moved mdt_functions into synthea.py --- src/mdt/synthea.py | 253 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) diff --git a/src/mdt/synthea.py b/src/mdt/synthea.py index e69de29..198074a 100644 --- a/src/mdt/synthea.py +++ b/src/mdt/synthea.py @@ -0,0 +1,253 @@ + + +def read_json(file_name): + # Opening JSON file + f = open(file_name,) + + # returns JSON object as a dictionary + data = json.load(f) + return data + + +def age_values(file_name): + """reads age_ranges from JSON to create dataframe with age_values""" + + data = {} + data['age'] = read_json('mdt_config.json')['age'] + data['age_values'] = [list(range(int(age.split('-')[0]), int(age.split('-')[1])+1)) for age in data['age']] + df = pd.DataFrame(data) + df = df.explode('age_values') + return df + + +#TODO: Add option to string search doseage form +def rxcui_ndc_matcher(rxcui_list): + """mashes list of RxCUIs against RxNorm combined table to get matching NDCs. 
+ Select output of return, clipboard, csv....return is default""" + + df = db_query('SELECT * FROM rxcui_ndc') + filtered_df = df[df['medication_ingredient_rxcui'].isin(rxcui_list) | df['medication_product_rxcui'].isin(rxcui_list)] + + print("RXCUI list matched on {0} NDCs".format(filtered_df['medication_ndc'].count())) + + return filtered_df + + +def output_df(df,output='csv', filename='df_output'): + """Outputs a dataframe to a csv of clipboard if you use the output=clipboard arguement""" + + + if output == 'clipboard': + df.to_clipboard(index=False,excel=True) + elif output == 'csv': + df.to_csv('data/'+filename+'.csv',index=False) + + +def output_json(data, filename='json_output'): + with open('data/'+filename+'.json', 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + + +def normalize_name(name): + #Replace all non-alphanumeric characters with an underscore + name = re.sub(r"[^a-zA-Z0-9]", "_", name) + #Then, replace all duplicate underscores with just one underscore + name = re.sub(r"_{2,}", "_", name) + #If there'a an underscore at the end of the word, remove + name = re.sub(r"_$", "", name) + return name + + +def generate_module(rxcui_ndc_df, rxclass_name): + module_dict = {} + state_prefix = 'Prescribe_' + + + rxclass_name = normalize_name(rxclass_name) + module_dict['name'] = rxclass_name + ' Medications' + module_dict['remarks'] = ['Remarks go here', 'and here.'] + #NOTE: not sure the difference between 1 and 2... I think 2 is the most recent version(?) 
+ module_dict['gmf_version'] = 2 + + states_dict = {} + + #Initial state (required) + #NOTE: if we change to conditional to check for existence of medication, channge direct_transition to transition + states_dict['Initial'] = { + 'type': 'Initial', + 'direct_transition': state_prefix + 'Ingredient' + } + + #Terminal state (required) + states_dict['Terminal'] = { + 'type': 'Terminal' + } + + #Get tuples of medication_product names and medication_product RXCUIs and loop through to generate MedicationOrders + + #Read in MEPS Reference table + meps_reference_str = read_sql_string('meps_reference.sql') + meps_reference = db_query(meps_reference_str) + + #Join MEPS to filtered rxcui_ndc dataframe (rxcui_list) + meps_rxcui = meps_reference.astype(str).merge(rxcui_ndc_df.astype(str)[['medication_ingredient_name', 'medication_ingredient_rxcui','medication_product_name', 'medication_product_rxcui', 'medication_ndc']], how = 'inner', left_on = 'RXNDC', right_on = 'medication_ndc') + + #Optional: Age range join - can be customized in the mdt_config.json file + #groupby_demographic_variable: must be either an empty list [] or list of patient demographics (e.g., age, gender, state) - based on user inputs in the mdt_config.json file + + data = read_json('mdt_config.json') + demographic_distrib_flags = data['demographic_distrib_flags'] + + groupby_demographic_variables = [] + for k, v in demographic_distrib_flags.items(): + if v == 'Y': + groupby_demographic_variables.append(k) + + if demographic_distrib_flags['age'] == 'Y': + age_ranges = age_values('mdt_config.json') + meps_rxcui = meps_rxcui.merge(age_ranges.astype(str), how='inner', left_on='AGELAST', right_on='age_values') + #Optional: State-region mapping from MEPS + if demographic_distrib_flags['state'] == 'Y': + meps_rxcui = meps_rxcui.merge(meps_region_states.astype(str), how='inner', left_on='region_num', right_on='region_value') + + + #Clean text to JSON/SQL-friendly format + for col in 
meps_rxcui[['medication_ingredient_name', 'medication_product_name']]: + meps_rxcui[col] = meps_rxcui[col].apply(lambda x: normalize_name(x)) + + + dcp_dict = {} + output = 'csv' + medication_ingredient_list = meps_rxcui['medication_ingredient_name'].unique().tolist() + + #Ingredient Name Distribution (Transition 1) + + """Numerator = ingred_name + Denominator = total population [filtered by rxclass_name upstream between rxcui_ndc & rxclass] + 1. Find distinct count of patients (DUPERSID) = patient_count + 2. Multiply count of patients * personweight = weighted_patient_count + 3. Add the weighted_patient_counts, segmented by ingredient_name + selected patient demographics = patients_by_demographics (Numerator) + 4. Add the patients_by_demographics from Step 3 = weighted_patient_count_total (Denominator) -- Taking SUM of SUMs to make the Denominator = 100% + 5. Calculate percentage (Output from Step 3/Output from Step 4) -- format as 0.0-1.0 per Synthea requirements. + 6. Add the 'prescribe_' prefix to the medication_ingredient_name (e.g., 'prescribe_fluticasone') + 7. 
Pivot the dataframe to transpose medication_ingredient_names from rows to columns """ + + filename = rxclass_name + '_ingredient_distrib' + #1 + dcp_dict['patient_count_ingredient'] = meps_rxcui[['medication_ingredient_name', 'medication_ingredient_rxcui', 'person_weight', 'DUPERSID']+groupby_demographic_variables].groupby(['medication_ingredient_name', 'medication_ingredient_rxcui', 'person_weight']+groupby_demographic_variables)['DUPERSID'].nunique() + dcp_df = pd.DataFrame(dcp_dict['patient_count_ingredient']).reset_index() + #2 + dcp_df['weighted_patient_count_ingredient'] = dcp_df['person_weight'].astype(float)*dcp_df['DUPERSID'] + #3 + dcp_dict['patients_by_demographics_ingredient'] = dcp_df.groupby(['medication_ingredient_name']+groupby_demographic_variables)['weighted_patient_count_ingredient'].sum() + dcp_demographic_df = pd.DataFrame(dcp_dict['patients_by_demographics_ingredient']).reset_index() + #4 + if len(groupby_demographic_variables) > 0: + dcp_demographictotal_df = pd.merge(dcp_demographic_df, dcp_demographic_df.groupby(groupby_demographic_variables)['weighted_patient_count_ingredient'].sum(), how = 'inner', left_on = groupby_demographic_variables, right_index=True, suffixes = ('_demographic', '_total')) + else: + dcp_demographictotal_df = dcp_demographic_df + dcp_demographictotal_df['weighted_patient_count_ingredient_demographic'] = dcp_demographic_df['weighted_patient_count_ingredient'] + dcp_demographictotal_df['weighted_patient_count_ingredient_total'] = dcp_demographic_df['weighted_patient_count_ingredient'].sum() + #5 + dcp_demographictotal_df['percent_ingredient_patients'] = round(dcp_demographictotal_df['weighted_patient_count_ingredient_demographic']/dcp_demographictotal_df['weighted_patient_count_ingredient_total'], 3) + #6 TODO: change this column to medication_product_state_name(?) 
+ dcp_demographictotal_df['medication_ingredient_name'] = dcp_demographictotal_df['medication_ingredient_name'].apply(lambda x: normalize_name(state_prefix + x)) + #Generate ingredient table transition + lookup_table_transition = [] + lookup_table_name = filename + '.' + output + module_medication_ingredient_name_list = dcp_demographictotal_df['medication_ingredient_name'].unique().tolist() + for idx, transition in enumerate(module_medication_ingredient_name_list): + lookup_table_transition.append({ + 'transition': transition, + 'default_probability': '1' if idx == 0 else '0', + 'lookup_table_name': lookup_table_name + }) + state_name = state_prefix + 'Ingredient' + states_dict[state_name] = { + 'type': 'Simple', + 'name': state_name, + 'lookup_table_transition': lookup_table_transition + } + #7 + dcp_dict['percent_ingredient_patients'] = dcp_demographictotal_df + if len(groupby_demographic_variables) > 0: + dcp_dict['percent_ingredient_patients'] = dcp_dict['percent_ingredient_patients'].reset_index().pivot(index= groupby_demographic_variables, columns = 'medication_ingredient_name', values='percent_ingredient_patients').reset_index() + else: + dcp_dict['percent_ingredient_patients'] = dcp_dict['percent_ingredient_patients'][['medication_ingredient_name', 'percent_ingredient_patients']].set_index('medication_ingredient_name').T + + #Fill NULLs and save as CSV + dcp_dict['percent_ingredient_patients'].fillna(0, inplace=True) + output_df(dcp_dict['percent_ingredient_patients'], output=output, filename=filename) + + #Product Name Distribution (Transition 2) + """Numerator = product_name + Denominator = ingred_name + Loop through all the ingredient_names to create product distributions by ingredient name + Same steps as above for Ingredient Name Distribution (1-7), but first filter medication_product_names for only those that have the same medication_ingredient_name (Step 0) """ + + for ingred_name in medication_ingredient_list: + filename = rxclass_name + '_product_' 
+ ingred_name + '_distrib' + #0 + meps_rxcui_ingred = meps_rxcui[meps_rxcui['medication_ingredient_name']==ingred_name][['medication_product_name', 'medication_product_rxcui', 'medication_ingredient_name', 'medication_ingredient_rxcui', 'person_weight', 'DUPERSID']+groupby_demographic_variables] + #1 + dcp_dict['patient_count_product'] = meps_rxcui_ingred.groupby(['medication_product_name', 'medication_product_rxcui', 'medication_ingredient_name', 'medication_ingredient_rxcui', 'person_weight']+groupby_demographic_variables)['DUPERSID'].nunique() + dcp_df = pd.DataFrame(dcp_dict['patient_count_product']).reset_index() + #2 + dcp_df['weighted_patient_count_product'] = dcp_df['person_weight'].astype(float)*dcp_df['DUPERSID'] + #3 + dcp_dict['patients_by_demographics_product'] = dcp_df.groupby(['medication_product_name', 'medication_ingredient_name']+groupby_demographic_variables)['weighted_patient_count_product'].sum() + dcp_demographic_df = pd.DataFrame(dcp_dict['patients_by_demographics_product']).reset_index() + #4 + dcp_demographictotal_df = pd.merge(dcp_demographic_df, dcp_demographic_df.groupby(['medication_ingredient_name']+groupby_demographic_variables)['weighted_patient_count_product'].sum(), how = 'inner', left_on = ['medication_ingredient_name']+groupby_demographic_variables, right_index=True, suffixes = ('_demographic', '_total')) + #5 + dcp_demographictotal_df['percent_product_patients'] = round(dcp_demographictotal_df['weighted_patient_count_product_demographic']/dcp_demographictotal_df['weighted_patient_count_product_total'], 3) + #6 TODO: change this column to medication_product_state_name or medication_product_transition_name(?) + dcp_demographictotal_df['medication_product_name'] = dcp_demographictotal_df['medication_product_name'].apply(lambda x: normalize_name(state_prefix + x)) + #Generate product table transition + lookup_table_transition = [] + lookup_table_name = filename + '.' 
+ output + module_medication_product_name_list = dcp_demographictotal_df['medication_product_name'].unique().tolist() + for idx, transition in enumerate(module_medication_product_name_list): + lookup_table_transition.append({ + 'transition': transition, + 'default_probability': '1' if idx == 1 else '0', + 'lookup_table_name': lookup_table_name + }) + state_name = state_prefix + ingred_name + states_dict[state_name] = { + 'type': 'Simple', + 'name': state_name, + 'lookup_table_transition': lookup_table_transition + } + #7 + dcp_dict['percent_product_patients'] = dcp_demographictotal_df + if len(groupby_demographic_variables) > 0: + dcp_dict['percent_product_patients'] = dcp_dict['percent_product_patients'].reset_index().pivot(index= groupby_demographic_variables, columns = 'medication_product_name', values='percent_product_patients').reset_index() + else: + dcp_dict['percent_product_patients'] = dcp_dict['percent_product_patients'][['medication_product_name', 'percent_product_patients']].set_index('medication_product_name').T + + #Fill NULLs and save as CSV + dcp_dict['percent_product_patients'].fillna(0, inplace=True) + output_df(dcp_dict['percent_product_patients'], output=output, filename=filename) + + #Generate MedicationOrder states + medication_products = list(meps_rxcui[['medication_product_name', 'medication_product_rxcui']].to_records(index=False)) + for (medication_product_name, medication_product_rxcui) in medication_products: + state_name = normalize_name(state_prefix + medication_product_name) + attribute = normalize_name(rxclass_name + '_prescription') + codes = { + 'system': 'RxNorm', + 'code': medication_product_rxcui, + 'display': medication_product_name + } + states_dict[state_name] = { + 'type': 'MedicationOrder', + 'assign_to_attribute': attribute, + 'codes': [ codes ], + 'direct_transition': 'Terminal', + 'name': state_name + } + + module_dict['states'] = states_dict + + output_json(module_dict) From dc90fd12913a2fac8b95299945cae4fc8f68c718 Mon 
Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 2 May 2021 10:56:06 -0500 Subject: [PATCH 07/46] Moved mdt_functions into utils.py --- src/mdt/rxnorm/utils.py | 47 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 src/mdt/rxnorm/utils.py diff --git a/src/mdt/rxnorm/utils.py b/src/mdt/rxnorm/utils.py new file mode 100644 index 0000000..da45a2a --- /dev/null +++ b/src/mdt/rxnorm/utils.py @@ -0,0 +1,47 @@ + + +def json_extract(obj, key): + """Recursively fetch values from nested JSON.""" + arr = [] + + def extract(obj, arr, key): + """Recursively search for values of key in JSON tree.""" + if isinstance(obj, dict): + for k, v in obj.items(): + if isinstance(v, (dict, list)): + extract(v, arr, key) + elif k == key: + arr.append(v) + elif isinstance(obj, list): + for item in obj: + extract(item, arr, key) + return arr + + values = extract(obj, arr, key) + print(values) + return values + + +def payload_constructor(base_url,params): + #TODO: exception handling for params as dict + + params_str = urllib.parse.urlencode(params, safe=':+') + payload = {'base_url':base_url, + 'params':params_str} + + #debug print out + print("""Payload built with base URL: {0} and parameters: {1}""".format(base_url,params_str)) + + return payload + + +def rxapi_get_requestor(request_dict): + """Sends a GET request to either RxNorm or RxClass""" + response = requests.get(request_dict['base_url'],params=request_dict['params']) + + #debug print out + print("GET Request sent to URL: {0}".format(response.url)) + print("Response HTTP Code: {0}".format(response.status_code)) + if response.status_code == 200: + #TODO: Add execption handling that can manage 200 responses with no JSON + return response.json() From f2e6f5fc0f77ef92c0e1c172658d6b2231f674f6 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 2 May 2021 11:59:24 -0500 Subject: [PATCH 08/46] Initial setup.py --- setup.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file 
changed, 42 insertions(+) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f88bdfb --- /dev/null +++ b/setup.py @@ -0,0 +1,42 @@ +from setuptools import setup, find_packages +import pathlib + +here = pathlib.Path(__file__).parent.resolve() + +# Get the long description from the README file +long_description = (here / 'README.md').read_text(encoding='utf-8') + +setup( + name='medicationDiversification', + version='1.0.0', + # description='A sample Python project', # Optional + # long_description=long_description, # Optional + # long_description_content_type='text/markdown', # Optional (see note above) + # url='https://github.com/pypa/sampleproject', # Optional + # author='A. Random Developer', # Optional + # author_email='author@example.com', # Optional + # keywords='sample, setuptools, development', # Optional + package_dir={'': 'src'}, + packages=find_packages(where='src'), + python_requires='>=3.6, <4', + # install_requires=['peppercorn'], # Optional + + # If there are data files included in your packages that need to be + # installed, specify them here. + # package_data={ # Optional + # 'sample': ['package_data.dat'], + #}, + + # Although 'package_data' is the preferred approach, in some case you may + # need to place data files outside of your packages. 
See: + # http://docs.python.org/distutils/setupscript.html#installing-additional-files + # + # In this case, 'data_file' will be installed into '/my_data' + # data_files=[('my_data', ['data/data_file'])], # Optional + + entry_points={ # Optional + 'console_scripts': [ + 'mdt=mdt.cli:entry_point', + ], + }, +) From 957669f8c07e91f74a1245df43feb3ca4d75f16e Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 2 May 2021 12:00:32 -0500 Subject: [PATCH 09/46] Comment out entrypoint setup in setup.py for now --- setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index f88bdfb..0f5544d 100644 --- a/setup.py +++ b/setup.py @@ -34,9 +34,9 @@ # In this case, 'data_file' will be installed into '/my_data' # data_files=[('my_data', ['data/data_file'])], # Optional - entry_points={ # Optional - 'console_scripts': [ - 'mdt=mdt.cli:entry_point', - ], - }, + # entry_points={ # Optional + # 'console_scripts': [ + # 'mdt=mdt.cli:entry_point', + # ], + # }, ) From 2d122e0ed343fceae55e9451476e6c3823822578 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 2 May 2021 12:01:12 -0500 Subject: [PATCH 10/46] Initial run_mdt.py main script/module --- src/mdt/run_mdt.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 src/mdt/run_mdt.py diff --git a/src/mdt/run_mdt.py b/src/mdt/run_mdt.py new file mode 100644 index 0000000..6c94e05 --- /dev/null +++ b/src/mdt/run_mdt.py @@ -0,0 +1,9 @@ +from mdt.database import create_mdt_con + + +def main(): + conn = create_mdt_con() + + +if __name__ == '__main__': + main() From a20c4fe1ca55d05698edf6fb358f557c1e3815b1 Mon Sep 17 00:00:00 2001 From: Bridg109 <40433162+Bridg109@users.noreply.github.com> Date: Mon, 3 May 2021 22:03:48 -0500 Subject: [PATCH 11/46] added function to download RxNorm, --- src/mdt/rxnorm/utils.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/mdt/rxnorm/utils.py b/src/mdt/rxnorm/utils.py index 
da45a2a..a8d49c8 100644 --- a/src/mdt/rxnorm/utils.py +++ b/src/mdt/rxnorm/utils.py @@ -1,4 +1,6 @@ - +from pathlib import Path +import requests, os +from typing import Callable def json_extract(obj, key): """Recursively fetch values from nested JSON.""" @@ -45,3 +47,16 @@ def rxapi_get_requestor(request_dict): if response.status_code == 200: #TODO: Add execption handling that can manage 200 responses with no JSON return response.json() + + +def get_dataset( + dest: os.PathLike = Path.cwd(), + handler: Callable[[any], None] = None +): + url = f'https://download.nlm.nih.gov/rxnorm/RxNorm_full_prescribe_current.zip' + response = requests.get(url) + if handler: + return handler(response.content) + (dest / url.split('/')[-1]).write_bytes(response.content) + return response + From 16bd740301ae91b032aacc42e449319a1b7965cf Mon Sep 17 00:00:00 2001 From: Bridg109 <40433162+Bridg109@users.noreply.github.com> Date: Mon, 3 May 2021 22:04:59 -0500 Subject: [PATCH 12/46] adds to download and load RxNorm, Pathlib use --- src/mdt/database.py | 42 ++++++++++++++++++++++++++++++++++++++++-- src/mdt/run_mdt.py | 4 ++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/mdt/database.py b/src/mdt/database.py index 9ef2370..7e0e6b2 100644 --- a/src/mdt/database.py +++ b/src/mdt/database.py @@ -1,12 +1,27 @@ +from .rxnorm.utils import get_dataset + +from pathlib import Path +import zipfile,io, sqlite3 +import pandas as pd + + +def to_data(): + """creates paths to data folder, making directory if not present""" + path = Path.cwd() / 'data' + try: + path.mkdir(exist_ok=False) + except: + pass + return path def create_mdt_con(): """create defualt connection to the data/MDT.db database. 
If database does not exist it creates it.""" - conn = sql.connect('data/MDT.db') + conn = sqlite3.connect(to_data() / 'MDT.db') return conn -def sql_create_table(table_name, df, conn=None, delete_df=True): +def sql_create_table(table_name, df, conn=None): """Creates a table in the connected database when passed a pandas dataframe. Note default is to delete dataframe if table name is same as global variable name that stores the df and delete_df is True""" @@ -39,3 +54,26 @@ def read_sql_string(file_name): print('Read {0} file as string'.format(file_name)) return query_str + + +def load_rxnorm(): + """downloads and loads RxNorm dataset into database""" + + z = zipfile.ZipFile(get_dataset(handler=io.BytesIO)) + + col_names = ['RXCUI','LAT','TS','LUI','STT','SUI','ISPREF','RXAUI','SAUI','SCUI','SDUI','SAB','TTY','CODE','STR','SRL','SUPPRESS','CVF','test'] + rxnconso = pd.read_csv(z.open('rrf/RXNCONSO.RRF'),sep='|',header=None,dtype=object,names=col_names) + sql_create_table('rxnconso',rxnconso) + del rxnconso + + col_names = ['RXCUI1','RXAUI1','STYPE1','REL','RXCUI2','RXAUI2','STYPE2','RELA','RUI','SRUI','SAB','SL','DIR','RG','SUPPRESS','CVF','test'] + rxnrel = pd.read_csv(z.open('rrf/RXNREL.RRF'),sep='|',dtype=object,header=None,names=col_names) + sql_create_table('rxnrel',rxnrel) + del rxnrel + + col_names = ['RXCUI','LUI','SUI','RXAUI','STYPE','CODE','ATUI','SATUI','ATN','SAB','ATV','SUPPRESS','CVF','test'] + rxnsat = pd.read_csv(z.open('rrf/RXNSAT.RRF'),sep='|',dtype=object,header=None,names=col_names) + sql_create_table('rxnsat',rxnsat) + del rxnsat + + del z \ No newline at end of file diff --git a/src/mdt/run_mdt.py b/src/mdt/run_mdt.py index 6c94e05..096cb81 100644 --- a/src/mdt/run_mdt.py +++ b/src/mdt/run_mdt.py @@ -1,8 +1,8 @@ -from mdt.database import create_mdt_con +from mdt.database import load_rxnorm def main(): - conn = create_mdt_con() + load_rxnorm() if __name__ == '__main__': From 1af83aac8a482f9586770b704df830b469986566 Mon Sep 17 00:00:00 2001 
From: Yevgeny Bulochnik Date: Wed, 5 May 2021 08:26:43 -0500 Subject: [PATCH 13/46] ignore .vim, .ds_store and python egg-info --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5540557..576c65e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,14 @@ data __pycache__ .vscode +.vim +.DS_Store venv .venv +*.egg-info + .ipynb_checkpoints */.ipynb_checkpoints/* -*.ipynb \ No newline at end of file +*.ipynb From 49e30ddbf3afb7ae78d6225daaf2fa636383351f Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Fri, 7 May 2021 08:00:32 -0500 Subject: [PATCH 14/46] Meps utils module with get_dataset function --- src/mdt/meps/utils.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 src/mdt/meps/utils.py diff --git a/src/mdt/meps/utils.py b/src/mdt/meps/utils.py new file mode 100644 index 0000000..70e24c5 --- /dev/null +++ b/src/mdt/meps/utils.py @@ -0,0 +1,27 @@ +import os +from pathlib import Path +from typing import Callable +import requests + + +def get_dataset( + dat_name: str, + dest: os.PathLike = Path.cwd(), + handler: Callable[[any], None] = None +): + """Get a MEPS Dataset given a dat name + extension + + Args: + dat_name (str): MEPS dat file name, ie: h206adat.zip + dest (Path): Destination path to save file, defaults to CWD + hander (func, optional): Function to bypass CWD save + """ + url = f'https://www.meps.ahrq.gov/mepsweb/data_files/pufs/{dat_name}' + response = requests.get(url) + + if handler: + return handler(response) + + (dest / url.split('/')[-1]).write_bytes(response.content) + + return response From 548342eed05e9fc89cde63b4077004b20272ee28 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Fri, 7 May 2021 08:28:18 -0500 Subject: [PATCH 15/46] Move meps_lists vars into new meps module columns.py --- src/mdt/meps/columns.py | 383 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 383 insertions(+) create mode 100644 src/mdt/meps/columns.py 
diff --git a/src/mdt/meps/columns.py b/src/mdt/meps/columns.py new file mode 100644 index 0000000..e372f31 --- /dev/null +++ b/src/mdt/meps/columns.py @@ -0,0 +1,383 @@ +import pandas as pd + +#Source: https://www.meps.ahrq.gov/survey_comp/hc_technical_notes.shtml +meps_region_states = pd.DataFrame({'region_value': [1, 2, 3, 4], + 'region_label': ['Northeast', 'Midwest', 'South', 'West'], + 'state': [['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'New Jersey', + 'New York', 'Pennsylvania', 'Rhode Island', 'Vermont'], + ['Indiana', 'Illinois', 'Iowa', 'Kansas', 'Michigan', 'Minnesota', 'Missouri', + 'Nebraska', 'North Dakota', 'Ohio', 'South Dakota', 'Wisconsin'], + ['Alabama', 'Arkansas', 'Delaware', 'District of Columbia', 'Florida', + 'Georgia', 'Kentucky', 'Louisiana', 'Maryland', 'Mississippi', 'North Carolina', 'Oklahoma', 'South Carolina', 'Tennessee', 'Texas', 'Virginia', + 'West Virginia'], + ['Alaska', 'Arizona', 'California', 'Colorado', 'Hawaii', 'Idaho', 'Montana', + 'Nevada', 'New Mexico', 'Oregon', 'Utah', 'Washington', 'Wyoming']] + }).set_index(['region_value'])['state'].apply(pd.Series).stack().reset_index(level=1, drop=True).reset_index().rename(columns={0:'state'}).astype(str) + + + +d_col_names=["DUID", "PID", "DUPERSID", "PANEL", "FAMID31", "FAMID42", + "FAMID53", "FAMID18", "FAMIDYR", "CPSFAMID", "FCSZ1231", + "FCRP1231", "RULETR31", "RULETR42", "RULETR53", "RULETR18", + "RUSIZE31", "RUSIZE42", "RUSIZE53", "RUSIZE18", "RUCLAS31", + "RUCLAS42", "RUCLAS53", "RUCLAS18", "FAMSZE31", "FAMSZE42", + "FAMSZE53", "FAMSZE18", "FMRS1231", "FAMS1231", "FAMSZEYR", + "FAMRFPYR", "REGION31", "REGION42", "REGION53", "REGION18", + "REFPRS31", "REFPRS42", "REFPRS53", "REFPRS18", "RESP31", + "RESP42", "RESP53", "RESP18", "PROXY31", "PROXY42", + "PROXY53", "PROXY18", "INTVLANG", "BEGRFM31", "BEGRFY31", + "ENDRFM31", "ENDRFY31", "BEGRFM42", "BEGRFY42", "ENDRFM42", + "ENDRFY42", "BEGRFM53", "BEGRFY53", "ENDRFM53", "ENDRFY53", + "ENDRFM18", "ENDRFY18", 
"KEYNESS", "INSCOP31", "INSCOP42", + "INSCOP53", "INSCOP18", "INSC1231", "INSCOPE", "ELGRND31", + "ELGRND42", "ELGRND53", "ELGRND18", "PSTATS31", "PSTATS42", + "PSTATS53", "RURSLT31", "RURSLT42", "RURSLT53", "AGE31X", + "AGE42X", "AGE53X", "AGE18X", "AGELAST", "DOBMM", "DOBYY", + "SEX", "RACEV1X", "RACEV2X", "RACEAX", "RACEBX", "RACEWX", + "RACETHX", "HISPANX", "HISPNCAT", "MARRY31X", "MARRY42X", + "MARRY53X", "MARRY18X", "SPOUID31", "SPOUID42", "SPOUID53", + "SPOUID18", "SPOUIN31", "SPOUIN42", "SPOUIN53", "SPOUIN18", + "EDUCYR", "HIDEG", "FTSTU31X", "FTSTU42X", "FTSTU53X", + "FTSTU18X", "ACTDTY31", "ACTDTY42", "ACTDTY53", "HONRDC31", + "HONRDC42", "REFRL31X", "REFRL42X", "REFRL53X", "REFRL18X", + "OTHLANG", "LANGSPK", "HWELLSPE", "OTHLGSPK", "WHTLGSPK", + "HWELLSPK", "BORNUSA", "YRSINUS", "MOPID31X", "MOPID42X", + "MOPID53X", "DAPID31X", "DAPID42X", "DAPID53X", "RTHLTH31", + "RTHLTH42", "RTHLTH53", "MNHLTH31", "MNHLTH42", "MNHLTH53", + "HIBPDX", "HIBPAGED", "BPMLDX", "CHDDX", "CHDAGED", + "ANGIDX", "ANGIAGED", "MIDX", "MIAGED", "OHRTDX", + "OHRTAGED", "OHRTTYPE", "STRKDX", "STRKAGED", "EMPHDX", + "EMPHAGED", "CHBRON31", "CHOLDX", "CHOLAGED", "CANCERDX", + "CABLADDR", "CABREAST", "CACERVIX", "CACOLON", "CALUNG", + "CALYMPH", "CAMELANO", "CAOTHER", "CAPROSTA", "CASKINNM", + "CASKINDK", "CAUTERUS", "DIABDX_M18", "DIABAGED", + "JTPAIN31_M18", "ARTHDX", "ARTHTYPE", "ARTHAGED", "ASTHDX", + "ASTHAGED", "ASSTIL31", "ASATAK31", "ASTHEP31", "ASACUT31", + "ASMRCN31", "ASPREV31", "ASDALY31", "ASPKFL31", "ASEVFL31", + "ASWNFL31", "ADHDADDX", "ADHDAGED", "IADLHP31", "ADLHLP31", + "AIDHLP31", "WLKLIM31", "LFTDIF31", "STPDIF31", "WLKDIF31", + "MILDIF31", "STNDIF31", "BENDIF31", "RCHDIF31", "FNGRDF31", + "ACTLIM31", "WRKLIM31", "HSELIM31", "SCHLIM31", "UNABLE31", + "SOCLIM31", "COGLIM31", "DFHEAR42", "DFSEE42", "DFCOG42", + "DFWLKC42", "DFDRSB42", "DFERND42", "ANYLMI18", "CHPMED42", + "CHPMHB42", "CHPMCN42", "CHSERV42", "CHSRHB42", "CHSRCN42", + "CHLIMI42", "CHLIHB42", "CHLICO42", 
"CHTHER42", "CHTHHB42", + "CHTHCO42", "CHCOUN42", "CHEMPB42", "CSHCN42", "MESHGT42", + "WHNHGT42", "MESWGT42", "WHNWGT42", "CHBMIX42", "MESVIS42", + "EATHLT42", "WHNEAT42", "PHYSCL42", "WHNPHY42", "SAFEST42", + "WHNSAF42", "BOOST42", "WHNBST42", "LAPBLT42", "WHNLAP42", + "HELMET42", "WHNHEL42", "NOSMOK42", "WHNSMK42", "TIMALN42", + "LSTETH53", "PHYEXE53", "OFTSMK53", "SAQELIG", "ADSEX42", + "ADAGE42", "ADPROX42", "ADGENH42", "ADDAYA42", "ADCLIM42", + "ADACLS42", "ADWKLM42", "ADEMLS42", "ADMWCF42", "ADPAIN42", + "ADPCFL42", "ADENGY42", "ADPRST42", "ADSOCA42", "VPCS42", + "VMCS42", "VRFLAG42", "ADNERV42", "ADHOPE42", "ADREST42", + "ADSAD42", "ADEFRT42", "ADWRTH42", "K6SUM42", "ADINTR42", + "ADDPRS42", "PHQ242", "ADBRTC42", "ADMDVT42", "ADFLST42", + "ADWGHD42", "ADBMI42", "ADWTAD42", "ADKALC42", "ADRNK542", + "ADRNK442", "ADSTAL42", "ADTBAC42", "ADOFTB42", "ADQTTB42", + "ADQTMD42", "ADQTHP42", "ADMOOD42", "ADBPCK42", "ADCHLC42", + "ADPNEU42", "ADSHNG42", "ADNOAP42", "ADDSCU42", "ADCOLN42", + "ADCLNS42", "ADSGMD42", "ADBLDS42", "ADPROS42", "ADPSAG42", + "ADUTRM42", "ADPAP42", "ADPAPG42", "ADOSTP42", "ADBNDN42", + "ADBRST42", "ADMMGR42", "ADCMPM42", "ADCMPY42", "ADLANG42", + "VSAQELIG", "VACTDY53", "VAPRHT53", "VACOPD53", "VADERM53", + "VAGERD53", "VAHRLS53", "VABACK53", "VAJTPN53", "VARTHR53", + "VAGOUT53", "VANECK53", "VATMD53", "VAPTSD53", "VALCOH53", + "VABIPL53", "VADEPR53", "VAMOOD53", "VAPROS53", "VARHAB53", + "VAMNHC53", "VAGCNS53", "VARXMD53", "VACRGV53", "VAMOBL53", + "VACOST53", "VARECM53", "VAREP53", "VAWAIT53", "VALOCT53", + "VANTWK53", "VANEED53", "VAOUT53", "VAPAST53", "VACOMP53", + "VAMREC53", "VAGTRC53", "VACARC53", "VAPROB53", "VACARE53", + "VAPACT53", "VAPCPR53", "VAPROV53", "VAPCOT53", "VAPCCO53", + "VAPCRC53", "VAPCSN53", "VAPCRF53", "VAPCSO53", "VAPCOU53", + "VAPCUN53", "VASPCL53", "VASPMH53", "VASPOU53", "VASPUN53", + "VACMPM53", "VACMPY53", "VAPROX53", "DCSELIG", "DSDIA53", + "DSA1C53", "DSFT1953", "DSFT1853", "DSFT1753", "DSFB1753", + 
"DSFTNV53", "DSEY1953", "DSEY1853", "DSEY1753", "DSEB1753", + "DSEYNV53", "DSCH1953", "DSCH1853", "DSCH1753", "DSCB1753", + "DSCHNV53", "DSFL1953", "DSFL1853", "DSFL1753", "DSVB1753", + "DSFLNV53", "DSKIDN53", "DSEYPR53", "DSDIET53", "DSMED53", + "DSINSU53", "DSCPCP53", "DSCNPC53", "DSCPHN53", "DSCINT53", + "DSCGRP53", "DSCONF53", "DSPRX53", "DDNWRK18", "OTHDYS18", + "OTHNDD18", "ACCELI42", "HAVEUS42", "PRACTP42", + "YNOUSC42_M18", "PROVTY42_M18", "PLCTYP42", "TMTKUS42", + "TYPEPE42", "LOCATN42", "HSPLAP42", "WHITPR42", "BLCKPR42", + "ASIANP42", "NATAMP42", "PACISP42", "OTHRCP42", "GENDRP42", + "PHNREG42", "OFFHOU42", "AFTHOU42", "TREATM42", "DECIDE42", + "EXPLOP42", "PRVSPK42", "DLAYCA42", "AFRDCA42", "DLAYDN42", + "AFRDDN42", "DLAYPM42", "AFRDPM42", "EMPST31", "EMPST42", + "EMPST53", "RNDFLG31", "MORJOB31", "MORJOB42", "MORJOB53", + "EVRWRK", "HRWG31X", "HRWG42X", "HRWG53X", "HRWGIM31", + "HRWGIM42", "HRWGIM53", "HRHOW31", "HRHOW42", "HRHOW53", + "DIFFWG31", "DIFFWG42", "DIFFWG53", "NHRWG31", "NHRWG42", + "NHRWG53", "HOUR31", "HOUR42", "HOUR53", "TEMPJB31", + "TEMPJB42", "TEMPJB53", "SSNLJB31", "SSNLJB42", "SSNLJB53", + "SELFCM31", "SELFCM42", "SELFCM53", "DISVW31X", "DISVW42X", + "DISVW53X", "CHOIC31", "CHOIC42", "CHOIC53", "INDCAT31", + "INDCAT42", "INDCAT53", "NUMEMP31", "NUMEMP42", "NUMEMP53", + "MORE31", "MORE42", "MORE53", "UNION31", "UNION42", + "UNION53", "NWK31", "NWK42", "NWK53", "CHGJ3142", + "CHGJ4253", "YCHJ3142", "YCHJ4253", "STJBMM31", "STJBYY31", + "STJBMM42", "STJBYY42", "STJBMM53", "STJBYY53", "EVRETIRE", + "OCCCAT31", "OCCCAT42", "OCCCAT53", "PAYVAC31", "PAYVAC42", + "PAYVAC53", "SICPAY31", "SICPAY42", "SICPAY53", "PAYDR31", + "PAYDR42", "PAYDR53", "RETPLN31", "RETPLN42", "RETPLN53", + "BSNTY31", "BSNTY42", "BSNTY53", "JOBORG31", "JOBORG42", + "JOBORG53", "HELD31X", "HELD42X", "HELD53X", "OFFER31X", + "OFFER42X", "OFFER53X", "OFREMP31", "OFREMP42", "OFREMP53", + "EMPST31H", "EMPST42H", "EMPST53H", "SLFCM31H", "SLFCM42H", + "SLFCM53H", 
"NMEMP31H", "NMEMP42H", "NMEMP53H", "MORE31H", + "MORE42H", "MORE53H", "INDCT31H", "INDCT42H", "INDCT53H", + "OCCCT31H", "OCCCT42H", "OCCCT53H", "HOUR31H", "HOUR42H", + "HOUR53H", "JBORG31H", "JBORG42H", "JBORG53H", "UNION31H", + "UNION42H", "UNION53H", "BSNTY31H", "BSNTY42H", "BSNTY53H", + "HRWG31H", "HRWG42H", "HRWG53H", "CMJHLD31", "CMJHLD42", + "CMJHLD53", "OFFER31H", "OFFER42H", "OFFER53H", "OFEMP31H", + "OFEMP42H", "OFEMP53H", "PYVAC31H", "PYVAC42H", "PYVAC53H", + "SCPAY31H", "SCPAY42H", "SCPAY53H", "PAYDR31H", "PAYDR42H", + "PAYDR53H", "RTPLN31H", "RTPLN42H", "RTPLN53H", "FILEDR18", + "WILFIL18", "FLSTAT18", "FILER18", "JTINRU18", "JNTPID18", + "TAXFRM18", "FOODST18", "FOODMN18", "FOODVL18", "TTLP18X", + "FAMINC18", "POVCAT18", "POVLEV18", "WAGEP18X", "WAGIMP18", + "BUSNP18X", "BUSIMP18", "UNEMP18X", "UNEIMP18", "WCMPP18X", + "WCPIMP18", "INTRP18X", "INTIMP18", "DIVDP18X", "DIVIMP18", + "SALEP18X", "SALIMP18", "PENSP18X", "PENIMP18", "SSECP18X", + "SSCIMP18", "TRSTP18X", "TRTIMP18", "VETSP18X", "VETIMP18", + "IRASP18X", "IRAIMP18", "ALIMP18X", "ALIIMP18", "CHLDP18X", + "CHLIMP18", "CASHP18X", "CSHIMP18", "SSIP18X", "SSIIMP18", + "PUBP18X", "PUBIMP18", "OTHRP18X", "OTHIMP18", "HIEUIDX", + "TRIJA18X", "TRIFE18X", "TRIMA18X", "TRIAP18X", "TRIMY18X", + "TRIJU18X", "TRIJL18X", "TRIAU18X", "TRISE18X", "TRIOC18X", + "TRINO18X", "TRIDE18X", "MCRJA18", "MCRFE18", "MCRMA18", + "MCRAP18", "MCRMY18", "MCRJU18", "MCRJL18", "MCRAU18", + "MCRSE18", "MCROC18", "MCRNO18", "MCRDE18", "MCRJA18X", + "MCRFE18X", "MCRMA18X", "MCRAP18X", "MCRMY18X", "MCRJU18X", + "MCRJL18X", "MCRAU18X", "MCRSE18X", "MCROC18X", "MCRNO18X", + "MCRDE18X", "MCDJA18", "MCDFE18", "MCDMA18", "MCDAP18", + "MCDMY18", "MCDJU18", "MCDJL18", "MCDAU18", "MCDSE18", + "MCDOC18", "MCDNO18", "MCDDE18", "MCDJA18X", "MCDFE18X", + "MCDMA18X", "MCDAP18X", "MCDMY18X", "MCDJU18X", "MCDJL18X", + "MCDAU18X", "MCDSE18X", "MCDOC18X", "MCDNO18X", "MCDDE18X", + "GVAJA18", "GVAFE18", "GVAMA18", "GVAAP18", "GVAMY18", + 
"GVAJU18", "GVAJL18", "GVAAU18", "GVASE18", "GVAOC18", + "GVANO18", "GVADE18", "GVBJA18", "GVBFE18", "GVBMA18", + "GVBAP18", "GVBMY18", "GVBJU18", "GVBJL18", "GVBAU18", + "GVBSE18", "GVBOC18", "GVBNO18", "GVBDE18", "GVCJA18", + "GVCFE18", "GVCMA18", "GVCAP18", "GVCMY18", "GVCJU18", + "GVCJL18", "GVCAU18", "GVCSE18", "GVCOC18", "GVCNO18", + "GVCDE18", "VAPJA18", "VAPFE18", "VAPMA18", "VAPAP18", + "VAPMY18", "VAPJU18", "VAPJL18", "VAPAU18", "VAPSE18", + "VAPOC18", "VAPNO18", "VAPDE18", "IHSJA18", "IHSFE18", + "IHSMA18", "IHSAP18", "IHSMY18", "IHSJU18", "IHSJL18", + "IHSAU18", "IHSSE18", "IHSOC18", "IHSNO18", "IHSDE18", + "PUBJA18X", "PUBFE18X", "PUBMA18X", "PUBAP18X", "PUBMY18X", + "PUBJU18X", "PUBJL18X", "PUBAU18X", "PUBSE18X", "PUBOC18X", + "PUBNO18X", "PUBDE18X", "PEGJA18", "PEGFE18", "PEGMA18", + "PEGAP18", "PEGMY18", "PEGJU18", "PEGJL18", "PEGAU18", + "PEGSE18", "PEGOC18", "PEGNO18", "PEGDE18", "PDKJA18", + "PDKFE18", "PDKMA18", "PDKAP18", "PDKMY18", "PDKJU18", + "PDKJL18", "PDKAU18", "PDKSE18", "PDKOC18", "PDKNO18", + "PDKDE18", "PNGJA18", "PNGFE18", "PNGMA18", "PNGAP18", + "PNGMY18", "PNGJU18", "PNGJL18", "PNGAU18", "PNGSE18", + "PNGOC18", "PNGNO18", "PNGDE18", "POGJA18", "POGFE18", + "POGMA18", "POGAP18", "POGMY18", "POGJU18", "POGJL18", + "POGAU18", "POGSE18", "POGOC18", "POGNO18", "POGDE18", + "POEJA18", "POEFE18", "POEMA18", "POEAP18", "POEMY18", + "POEJU18", "POEJL18", "POEAU18", "POESE18", "POEOC18", + "POENO18", "POEDE18", "PNEJA18", "PNEFE18", "PNEMA18", + "PNEAP18", "PNEMY18", "PNEJU18", "PNEJL18", "PNEAU18", + "PNESE18", "PNEOC18", "PNENO18", "PNEDE18", "PRXJA18", + "PRXFE18", "PRXMA18", "PRXAP18", "PRXMY18", "PRXJU18", + "PRXJL18", "PRXAU18", "PRXSE18", "PRXOC18", "PRXNO18", + "PRXDE18", "PRIJA18", "PRIFE18", "PRIMA18", "PRIAP18", + "PRIMY18", "PRIJU18", "PRIJL18", "PRIAU18", "PRISE18", + "PRIOC18", "PRINO18", "PRIDE18", "HPEJA18", "HPEFE18", + "HPEMA18", "HPEAP18", "HPEMY18", "HPEJU18", "HPEJL18", + "HPEAU18", "HPESE18", "HPEOC18", "HPENO18", 
"HPEDE18", + "HPDJA18", "HPDFE18", "HPDMA18", "HPDAP18", "HPDMY18", + "HPDJU18", "HPDJL18", "HPDAU18", "HPDSE18", "HPDOC18", + "HPDNO18", "HPDDE18", "HPNJA18", "HPNFE18", "HPNMA18", + "HPNAP18", "HPNMY18", "HPNJU18", "HPNJL18", "HPNAU18", + "HPNSE18", "HPNOC18", "HPNNO18", "HPNDE18", "HPOJA18", + "HPOFE18", "HPOMA18", "HPOAP18", "HPOMY18", "HPOJU18", + "HPOJL18", "HPOAU18", "HPOSE18", "HPOOC18", "HPONO18", + "HPODE18", "HPXJA18", "HPXFE18", "HPXMA18", "HPXAP18", + "HPXMY18", "HPXJU18", "HPXJL18", "HPXAU18", "HPXSE18", + "HPXOC18", "HPXNO18", "HPXDE18", "HPRJA18", "HPRFE18", + "HPRMA18", "HPRAP18", "HPRMY18", "HPRJU18", "HPRJL18", + "HPRAU18", "HPRSE18", "HPROC18", "HPRNO18", "HPRDE18", + "INSJA18X", "INSFE18X", "INSMA18X", "INSAP18X", "INSMY18X", + "INSJU18X", "INSJL18X", "INSAU18X", "INSSE18X", "INSOC18X", + "INSNO18X", "INSDE18X", "PRVEV18", "TRIEV18", "MCREV18", + "MCDEV18", "VAEV18", "GVAEV18", "GVBEV18", "GVCEV18", + "UNINS18", "INSCOV18", "INSURC18", "TRIST31X", "TRIST42X", + "TRIST18X", "TRIPR31X", "TRIPR42X", "TRIPR18X", "TRIEX31X", + "TRIEX42X", "TRIEX18X", "TRILI31X", "TRILI42X", "TRILI18X", + "TRICH31X", "TRICH42X", "TRICH18X", "MCRPD31", "MCRPD42", + "MCRPD18", "MCRPD31X", "MCRPD42X", "MCRPD18X", "MCRPB31", + "MCRPB42", "MCRPB18", "MCRPHO31", "MCRPHO42", "MCRPHO18", + "MCDHMO31", "MCDHMO42", "MCDHMO18", "MCDMC31", "MCDMC42", + "MCDMC18", "PRVHMO31", "PRVHMO42", "PRVHMO18", "FSAGT31", + "HASFSA31", "PFSAMT31", "PREVCOVR", "MORECOVR", "TRICR31X", + "TRICR42X", "TRICR53X", "TRICR18X", "TRIAT31X", "TRIAT42X", + "TRIAT53X", "TRIAT18X", "MCAID31", "MCAID42", "MCAID53", + "MCAID18", "MCAID31X", "MCAID42X", "MCAID53X", "MCAID18X", + "MCARE31", "MCARE42", "MCARE53", "MCARE18", "MCARE31X", + "MCARE42X", "MCARE53X", "MCARE18X", "MCDAT31X", "MCDAT42X", + "MCDAT53X", "MCDAT18X", "GOVTA31", "GOVTA42", "GOVTA53", + "GOVTA18", "GOVAAT31", "GOVAAT42", "GOVAAT53", "GOVAAT18", + "GOVTB31", "GOVTB42", "GOVTB53", "GOVTB18", "GOVBAT31", + "GOVBAT42", "GOVBAT53", "GOVBAT18", 
"GOVTC31", "GOVTC42", + "GOVTC53", "GOVTC18", "GOVCAT31", "GOVCAT42", "GOVCAT53", + "GOVCAT18", "VAPROG31", "VAPROG42", "VAPROG53", "VAPROG18", + "VAPRAT31", "VAPRAT42", "VAPRAT53", "VAPRAT18", "IHS31", + "IHS42", "IHS53", "IHS18", "IHSAT31", "IHSAT42", "IHSAT53", + "IHSAT18", "PRIDK31", "PRIDK42", "PRIDK53", "PRIDK18", + "PRIEU31", "PRIEU42", "PRIEU53", "PRIEU18", "PRING31", + "PRING42", "PRING53", "PRING18", "PRIOG31", "PRIOG42", + "PRIOG53", "PRIOG18", "PRINEO31", "PRINEO42", "PRINEO53", + "PRINEO18", "PRIEUO31", "PRIEUO42", "PRIEUO53", "PRIEUO18", + "PRSTX31", "PRSTX42", "PRSTX53", "PRSTX18", "PRIV31", + "PRIV42", "PRIV53", "PRIV18", "PRIVAT31", "PRIVAT42", + "PRIVAT53", "PRIVAT18", "PUB31X", "PUB42X", "PUB53X", + "PUB18X", "PUBAT31X", "PUBAT42X", "PUBAT53X", "PUBAT18X", + "VERFLG31", "VERFLG42", "VERFLG18", "INS31X", "INS42X", + "INS53X", "INS18X", "INSAT31X", "INSAT42X", "INSAT53X", + "INSAT18X", "DENTIN31", "DENTIN42", "DENTIN53", "DNTINS31", + "DNTINS18", "PMEDIN31", "PMEDIN42", "PMEDIN53", "PMDINS31", + "PMDINS18", "PROBPY42", "CRFMPY42", "PYUNBL42", "PMEDUP31", + "PMEDUP42", "PMEDUP53", "PMEDPY31", "PMEDPY42", "PMEDPY53", + "TOTTCH18", "TOTEXP18", "TOTSLF18", "TOTMCR18", "TOTMCD18", + "TOTPRV18", "TOTVA18", "TOTTRI18", "TOTOFD18", "TOTSTL18", + "TOTWCP18", "TOTOPR18", "TOTOPU18", "TOTOSR18", "TOTPTR18", + "TOTOTH18", "OBTOTV18", "OBVTCH18", "OBVEXP18", "OBVSLF18", + "OBVMCR18", "OBVMCD18", "OBVPRV18", "OBVVA18", "OBVTRI18", + "OBVOFD18", "OBVSTL18", "OBVWCP18", "OBVOPR18", "OBVOPU18", + "OBVOSR18", "OBVPTR18", "OBVOTH18", "OBDRV18", "OBDTCH18", + "OBDEXP18", "OBDSLF18", "OBDMCR18", "OBDMCD18", "OBDPRV18", + "OBDVA18", "OBDTRI18", "OBDOFD18", "OBDSTL18", "OBDWCP18", + "OBDOPR18", "OBDOPU18", "OBDOSR18", "OBDPTR18", "OBDOTH18", + "OPTOTV18", "OPTTCH18", "OPTEXP18", "OPTSLF18", "OPTMCR18", + "OPTMCD18", "OPTPRV18", "OPTVA18", "OPTTRI18", "OPTOFD18", + "OPTSTL18", "OPTWCP18", "OPTOPR18", "OPTOPU18", "OPTOSR18", + "OPTPTR18", "OPTOTH18", "OPFTCH18", 
"OPFEXP18", "OPFSLF18", + "OPFMCR18", "OPFMCD18", "OPFPRV18", "OPFVA18", "OPFTRI18", + "OPFOFD18", "OPFSTL18", "OPFWCP18", "OPFOPR18", "OPFOPU18", + "OPFOSR18", "OPFPTR18", "OPFOTH18", "OPDEXP18", "OPDTCH18", + "OPDSLF18", "OPDMCR18", "OPDMCD18", "OPDPRV18", "OPDVA18", + "OPDTRI18", "OPDOFD18", "OPDSTL18", "OPDWCP18", "OPDOPR18", + "OPDOPU18", "OPDOSR18", "OPDPTR18", "OPDOTH18", "OPDRV18", + "OPVTCH18", "OPVEXP18", "OPVSLF18", "OPVMCR18", "OPVMCD18", + "OPVPRV18", "OPVVA18", "OPVTRI18", "OPVOFD18", "OPVSTL18", + "OPVWCP18", "OPVOPR18", "OPVOPU18", "OPVOSR18", "OPVPTR18", + "OPVOTH18", "OPSEXP18", "OPSTCH18", "OPSSLF18", "OPSMCR18", + "OPSMCD18", "OPSPRV18", "OPSVA18", "OPSTRI18", "OPSOFD18", + "OPSSTL18", "OPSWCP18", "OPSOPR18", "OPSOPU18", "OPSOSR18", + "OPSPTR18", "OPSOTH18", "ERTOT18", "ERTTCH18", "ERTEXP18", + "ERTSLF18", "ERTMCR18", "ERTMCD18", "ERTPRV18", "ERTVA18", + "ERTTRI18", "ERTOFD18", "ERTSTL18", "ERTWCP18", "ERTOPR18", + "ERTOPU18", "ERTOSR18", "ERTPTR18", "ERTOTH18", "ERFTCH18", + "ERFEXP18", "ERFSLF18", "ERFMCR18", "ERFMCD18", "ERFPRV18", + "ERFVA18", "ERFTRI18", "ERFOFD18", "ERFSTL18", "ERFWCP18", + "ERFOPR18", "ERFOPU18", "ERFOSR18", "ERFPTR18", "ERFOTH18", + "ERDEXP18", "ERDTCH18", "ERDSLF18", "ERDMCR18", "ERDMCD18", + "ERDPRV18", "ERDVA18", "ERDTRI18", "ERDOFD18", "ERDSTL18", + "ERDWCP18", "ERDOPR18", "ERDOPU18", "ERDOSR18", "ERDPTR18", + "ERDOTH18", "IPDIS18", "IPTEXP18", "IPTTCH18", "IPTSLF18", + "IPTMCR18", "IPTMCD18", "IPTPRV18", "IPTVA18", "IPTTRI18", + "IPTOFD18", "IPTSTL18", "IPTWCP18", "IPTOPR18", "IPTOPU18", + "IPTOSR18", "IPTPTR18", "IPTOTH18", "IPFEXP18", "IPFTCH18", + "IPFSLF18", "IPFMCR18", "IPFMCD18", "IPFPRV18", "IPFVA18", + "IPFTRI18", "IPFOFD18", "IPFSTL18", "IPFWCP18", "IPFOPR18", + "IPFOPU18", "IPFOSR18", "IPFPTR18", "IPFOTH18", "IPDEXP18", + "IPDTCH18", "IPDSLF18", "IPDMCR18", "IPDMCD18", "IPDPRV18", + "IPDVA18", "IPDTRI18", "IPDOFD18", "IPDSTL18", "IPDWCP18", + "IPDOPR18", "IPDOPU18", "IPDOSR18", "IPDPTR18", "IPDOTH18", + 
"IPNGTD18", "DVTOT18", "DVTTCH18", "DVTEXP18", "DVTSLF18", + "DVTMCR18", "DVTMCD18", "DVTPRV18", "DVTVA18", "DVTTRI18", + "DVTOFD18", "DVTSTL18", "DVTWCP18", "DVTOPR18", "DVTOPU18", + "DVTOSR18", "DVTPTR18", "DVTOTH18", "HHTOTD18", "HHAGD18", + "HHATCH18", "HHAEXP18", "HHASLF18", "HHAMCR18", "HHAMCD18", + "HHAPRV18", "HHAVA18", "HHATRI18", "HHAOFD18", "HHASTL18", + "HHAWCP18", "HHAOPR18", "HHAOPU18", "HHAOSR18", "HHAPTR18", + "HHAOTH18", "HHINDD18", "HHNTCH18", "HHNEXP18", "HHNSLF18", + "HHNMCD18", "HHNMCR18", "HHNPRV18", "HHNVA18", "HHNTRI18", + "HHNOFD18", "HHNSTL18", "HHNWCP18", "HHNOPR18", "HHNOPU18", + "HHNOSR18", "HHNPTR18", "HHNOTH18", "HHINFD18", "VISEXP18", + "VISTCH18", "VISSLF18", "VISMCR18", "VISMCD18", "VISPRV18", + "VISVA18", "VISTRI18", "VISOFD18", "VISSTL18", "VISWCP18", + "VISOPR18", "VISOPU18", "VISOSR18", "VISPTR18", "VISOTH18", + "OTHTCH18", "OTHEXP18", "OTHSLF18", "OTHMCR18", "OTHMCD18", + "OTHPRV18", "OTHVA18", "OTHTRI18", "OTHOFD18", "OTHSTL18", + "OTHWCP18", "OTHOPR18", "OTHOPU18", "OTHOSR18", "OTHPTR18", + "OTHOTH18", "RXTOT18", "RXEXP18", "RXSLF18", "RXMCR18", + "RXMCD18", "RXPRV18", "RXVA18", "RXTRI18", "RXOFD18", + "RXSTL18", "RXWCP18", "RXOPR18", "RXOPU18", "RXOSR18", + "RXPTR18", "RXOTH18", "PERWT18F", "FAMWT18F", "FAMWT18C", + "SAQWT18F", "DIABW18F", "VSAQW18F", "VARSTR", "VARPSU"] + +d_col_spaces = [(0,7), +(7,10), +(10,20), +(20,22), +(22,24), +(24,26), +(26,28), +(28,30), +(30,32), +(32,34), +(34,36), +(36,38), +(38,40), +(40,42), +(42,44), +(44,47), +(47,49), +(49,51), +(51,53), +(53,55), +(55,57), +(57,59), +(59,61), +(61,62), +(62,64), +(64,66), +(66,68), +(68,70), +(70,72), +(72,74), +(74,76), +(76,77), +(77,79), +(79,81), +(81,83), +(83,85), +(85,88), +(88,91), +(91,94), +(94,97), +(97,98), +(98,99), +(99,100), +(100,101), 
+(101,103),(103,105),(105,107),(107,108),(108,110),(110,112),(112,116),(116,118),(118,122),(122,124),(124,128),(128,130),(130,134),(134,136),(136,140),(140,142),(142,146),(146,148),(148,152),(152,153),(153,154),(154,155),(155,156),(156,157),(157,158),(158,159),(159,160),(160,161),(161,162),(162,163),(163,165),(165,167),(167,169),(169,171),(171,173),(173,175),(175,177),(177,179),(179,181),(181,183),(183,185),(185,187),(187,191),(191,192),(192,193),(193,195),(195,196),(196,197),(197,198),(198,199),(199,200),(200,201),(201,203),(203,205),(205,207),(207,209),(209,212),(212,215),(215,218),(218,221),(221,224),(224,227),(227,230),(230,233),(233,236),(236,239),(239,241),(241,243),(243,245),(245,247),(247,249),(249,251),(251,253),(253,256),(256,259),(259,261),(261,263),(263,265),(265,267),(267,270),(270,272),(272,275),(275,278),(278,280),(280,282),(282,285),(285,287),(287,290),(290,293),(293,296),(296,299),(299,302),(302,305),(305,307),(307,309),(309,311),(311,313),(313,315),(315,317),(317,320),(320,322),(322,324),(324,327),(327,329),(329,332),(332,334),(334,337),(337,339),(339,342),(342,344),(344,346),(346,349),(349,351),(351,354),(354,356),(356,358),(358,361),(361,363),(363,366),(366,368),(368,370),(370,372),(372,374),(374,376),(376,378),(378,380),(380,382),(382,384),(384,386),(386,388),(388,390),(390,393),(393,395),(395,397),(397,400),(400,402),(402,405),(405,407),(407,409),(409,412),(412,415),(415,417),(417,420),(420,422),(422,425),(425,427),(427,430),(430,432),(432,434),(434,437),(437,439),(439,441),(441,443),(443,445),(445,447),(447,449),(449,451),(451,453),(453,455),(455,457),(457,459),(459,461),(461,463),(463,465),(465,467),(467,469),(469,471),(471,473),(473,475),(475,477),(477,479),(479,481),(481,483),(483,485),(485,487),(487,489),(489,492),(492,495),(495,497),(497,499),(499,502),(502,504),(504,506),(506,509),(509,511),(511,513),(513,516),(516,518),(518,520),(520,523),(523,525),(525,527),(527,530),(530,532),(532,535),(535,537),(537,542),(542,545),(545,548),(548,550)
,(550,553),(553,555),(555,557),(557,559),(559,562),(562,564),(564,567),(567,569),(569,572),(572,574),(574,577),(577,579),(579,582),(582,584),(584,586),(586,588),(588,589),(589,592),(592,595),(595,598),(598,601),(601,604),(604,607),(607,610),(610,613),(613,616),(616,619),(619,622),(622,625),(625,628),(628,631),(631,634),(634,640),(640,646),(646,648),(648,651),(651,654),(654,657),(657,660),(660,663),(663,666),(666,669),(669,672),(672,675),(675,678),(678,681),(681,684),(684,687),(687,690),(690,695),(695,698),(698,701),(701,704),(704,707),(707,710),(710,713),(713,716),(716,719),(719,722),(722,725),(725,728),(728,731),(731,734),(734,737),(737,740),(740,743),(743,746),(746,749),(749,752),(752,755),(755,758),(758,761),(761,764),(764,767),(767,770),(770,773),(773,776),(776,779),(779,782),(782,785),(785,788),(788,792),(792,794),(794,795),(795,797),(797,800),(800,803),(803,806),(806,809),(809,812),(812,815),(815,818),(818,821),(821,824),(824,827),(827,830),(830,833),(833,836),(836,839),(839,842),(842,845),(845,848),(848,851),(851,854),(854,857),(857,860),(860,863),(863,866),(866,869),(869,872),(872,875),(875,878),(878,881),(881,884),(884,887),(887,890),(890,893),(893,896),(896,899),(899,902),(902,905),(905,908),(908,911),(911,914),(914,917),(917,920),(920,923),(923,926),(926,929),(929,931),(931,934),(934,937),(937,940),(940,942),(942,945),(945,948),(948,951),(951,954),(954,957),(957,961),(961,964),(964,965),(965,967),(967,970),(970,973),(973,976),(976,979),(979,982),(982,985),(985,988),(988,991),(991,994),(994,997),(997,1000), 
+(1000,1003),(1003,1006),(1006,1009),(1009,1012),(1012,1015),(1015,1018),(1018,1021),(1021,1024),(1024,1027),(1027,1030),(1030,1033),(1033,1036),(1036,1039),(1039,1042),(1042,1045),(1045,1047),(1047,1049),(1049,1051),(1051,1053),(1053,1055),(1055,1058),(1058,1061),(1061,1064),(1064,1067),(1067,1070),(1070,1072),(1072,1074),(1074,1076),(1076,1078),(1078,1080),(1080,1082),(1082,1084),(1084,1086),(1086,1088),(1088,1090),(1090,1092),(1092,1094),(1094,1096),(1096,1098),(1098,1100),(1100,1102),(1102,1104),(1104,1106),(1106,1108),(1108,1110),(1110,1112),(1112,1114),(1114,1116),(1116,1118),(1118,1120),(1120,1122),(1122,1124),(1124,1126),(1126,1128),(1128,1130),(1130,1133),(1133,1136),(1136,1139),(1139,1141),(1141,1144),(1144,1147),(1147,1150),(1150,1153),(1153,1159),(1159,1165),(1165,1171),(1171,1172),(1172,1173),(1173,1174),(1174,1177),(1177,1180),(1180,1183),(1183,1186),(1186,1189),(1189,1192),(1192,1198),(1198,1204),(1204,1210),(1210,1213),(1213,1216),(1216,1219),(1219,1222),(1222,1225),(1225,1228),(1228,1231),(1231,1234),(1234,1237),(1237,1240),(1240,1243),(1243,1246),(1246,1249),(1249,1252),(1252,1255),(1255,1258),(1258,1261),(1261,1264),(1264,1267),(1267,1270),(1270,1273),(1273,1276),(1276,1279),(1279,1282),(1282,1285),(1285,1288),(1288,1291),(1291,1294),(1294,1297),(1297,1300),(1300,1303),(1303,1305),(1305,1307),(1307,1310),(1310,1313),(1313,1316),(1316,1319),(1319,1322),(1322,1326),(1326,1329),(1329,1333),(1333,1336),(1336,1340),(1340,1343),(1343,1346),(1346,1349),(1349,1352),(1352,1355),(1355,1358),(1358,1361),(1361,1364),(1364,1367),(1367,1370),(1370,1373),(1373,1376),(1376,1379),(1379,1382),(1382,1385),(1385,1388),(1388,1391),(1391,1394),(1394,1397),(1397,1400),(1400,1403),(1403,1406),(1406,1409),(1409,1412),(1412,1415),(1415,1418),(1418,1421),(1421,1424),(1424,1427),(1427,1430),(1430,1433),(1433,1435),(1435,1437),(1437,1439),(1439,1441),(1441,1443),(1443,1445),(1445,1448),(1448,1451),(1451,1454),(1454,1456),(1456,1458),(1458,1460),(1460,1462),(1462,1464),(1464,1
466),(1466,1468),(1468,1470),(1470,1472),(1472,1475),(1475,1478),(1478,1481),(1481,1483),(1483,1485),(1485,1487),(1487,1489),(1489,1491),(1491,1493),(1493,1495),(1495,1497),(1497,1499),(1499,1505),(1505,1511),(1511,1517),(1517,1519),(1519,1521),(1521,1523),(1523,1525),(1525,1527),(1527,1529),(1529,1531),(1531,1533),(1533,1535),(1535,1537),(1537,1539),(1539,1541),(1541,1543),(1543,1545),(1545,1547),(1547,1549),(1549,1551),(1551,1553),(1553,1555),(1555,1557),(1557,1559),(1559,1561),(1561,1563),(1563,1565),(1565,1567),(1567,1569),(1569,1572),(1572,1574),(1574,1576),(1576,1578),(1578,1582),(1582,1589),(1589,1596),(1596,1597),(1597,1609),(1609,1615),(1615,1616),(1616,1622),(1622,1623),(1623,1628),(1628,1629),(1629,1634),(1634,1635),(1635,1640),(1640,1641),(1641,1646),(1646,1647),(1647,1654),(1654,1655),(1655,1660),(1660,1661),(1661,1666),(1666,1667),(1667,1674),(1674,1675),(1675,1680),(1680,1681),(1681,1686),(1686,1687),(1687,1692),(1692,1693),(1693,1698),(1698,1699),(1699,1704),(1704,1705),(1705,1710),(1710,1711),(1711,1716),(1716,1717),(1717,1723),(1723,1724),(1724,1733),(1733,1735),(1735,1737),(1737,1739),(1739,1741),(1741,1743),(1743,1745),(1745,1747),(1747,1749),(1749,1751),(1751,1753),(1753,1755),(1755,1757),(1757,1759),(1759,1761),(1761,1763),(1763,1765),(1765,1767),(1767,1769),(1769,1771),(1771,1773),(1773,1775),(1775,1777),(1777,1779),(1779,1781),(1781,1783),(1783,1785),(1785,1787),(1787,1789),(1789,1791),(1791,1793),(1793,1795),(1795,1797),(1797,1799),(1799,1801),(1801,1803),(1803,1805),(1805,1807),(1807,1809),(1809,1811),(1811,1813),(1813,1815),(1815,1817),(1817,1819),(1819,1821),(1821,1823),(1823,1825),(1825,1827),(1827,1829),(1829,1831),(1831,1833),(1833,1835),(1835,1837),(1837,1839),(1839,1841),(1841,1843),(1843,1845),(1845,1847),(1847,1849),(1849,1851),(1851,1853),(1853,1855),(1855,1857),(1857,1859),(1859,1861),(1861,1863),(1863,1865),(1865,1867),(1867,1869),(1869,1871),(1871,1873),(1873,1875),(1875,1877),(1877,1879),(1879,1881),(1881,1883),(1883,1885),(18
85,1887),(1887,1889),(1889,1891),(1891,1893),(1893,1895),(1895,1897),(1897,1899),(1899,1901),(1901,1903),(1903,1905),(1905,1907),(1907,1909),(1909,1911),(1911,1913),(1913,1915),(1915,1917),(1917,1919),(1919,1921),(1921,1923),(1923,1925),(1925,1927),(1927,1929),(1929,1931),(1931,1933),(1933,1935),(1935,1937),(1937,1939),(1939,1941),(1941,1943),(1943,1945),(1945,1947),(1947,1949),(1949,1951),(1951,1953),(1953,1955),(1955,1957),(1957,1959),(1959,1961),(1961,1963),(1963,1965),(1965,1967),(1967,1969),(1969,1971),(1971,1973),(1973,1975),(1975,1977),(1977,1979),(1979,1981),(1981,1983),(1983,1985),(1985,1987),(1987,1989),(1989,1991),(1991,1993),(1993,1995),(1995,1997),(1997,1999),(1999,2001),(2001,2003),(2003,2005),(2005,2007),(2007,2009),(2009,2011),(2011,2013),(2013,2015),(2015,2017),(2017,2019),(2019,2021),(2021,2023),(2023,2025),(2025,2027),(2027,2029),(2029,2031),(2031,2033),(2033,2035),(2035,2037),(2037,2039),(2039,2041),(2041,2043),(2043,2045),(2045,2047),(2047,2049),(2049,2051),(2051,2053),(2053,2055),(2055,2057),(2057,2059),(2059,2061),(2061,2063),(2063,2065),(2065,2067),(2067,2069),(2069,2071),(2071,2073),(2073,2075),(2075,2077),(2077,2079),(2079,2081),(2081,2083),(2083,2085),(2085,2087),(2087,2089),(2089,2091),(2091,2093),(2093,2095),(2095,2097),(2097,2099),(2099,2101),(2101,2103),(2103,2105),(2105,2107),(2107,2109),(2109,2111),(2111,2113),(2113,2115),(2115,2117),(2117,2119),(2119,2121),(2121,2123),(2123,2125),(2125,2127),(2127,2129),(2129,2131),(2131,2133),(2133,2135),(2135,2137),(2137,2139),(2139,2141),(2141,2143),(2143,2145),(2145,2147),(2147,2149),(2149,2151),(2151,2153),(2153,2155),(2155,2157),(2157,2159),(2159,2161),(2161,2163),(2163,2165),(2165,2167),(2167,2169),(2169,2171),(2171,2173),(2173,2175),(2175,2177),(2177,2179),(2179,2181),(2181,2183),(2183,2185),(2185,2187),(2187,2189),(2189,2191),(2191,2193),(2193,2195),(2195,2197),(2197,2199),(2199,2201),(2201,2203),(2203,2205),(2205,2207),(2207,2209),(2209,2211),(2211,2213),(2213,2215),(2215,2217),(2217,2219)
,(2219,2221),(2221,2223),(2223,2225),(2225,2227),(2227,2229),(2229,2231),(2231,2233),(2233,2235),(2235,2237),(2237,2239),(2239,2241),(2241,2243),(2243,2245),(2245,2247),(2247,2249),(2249,2251),(2251,2253),(2253,2255),(2255,2257),(2257,2259),(2259,2261),(2261,2263),(2263,2265),(2265,2267),(2267,2269),(2269,2271),(2271,2273),(2273,2275),(2275,2277),(2277,2279),(2279,2281),(2281,2283),(2283,2285),(2285,2287),(2287,2289),(2289,2291),(2291,2293),(2293,2295),(2295,2297),(2297,2299),(2299,2301),(2301,2303),(2303,2305),(2305,2307),(2307,2309),(2309,2311),(2311,2313),(2313,2315),(2315,2317),(2317,2319),(2319,2321),(2321,2323),(2323,2325),(2325,2327),(2327,2329),(2329,2331),(2331,2333),(2333,2335),(2335,2337),(2337,2339),(2339,2341),(2341,2343),(2343,2345),(2345,2347),(2347,2349),(2349,2351),(2351,2353),(2353,2355),(2355,2357),(2357,2358),(2358,2359),(2359,2360),(2360,2361),(2361,2362),(2362,2363),(2363,2364),(2364,2365),(2365,2366),(2366,2367),(2367,2368),(2368,2370),(2370,2372),(2372,2374),(2374,2376),(2376,2378),(2378,2380),(2380,2382),(2382,2384),(2384,2386),(2386,2388),(2388,2390),(2390,2392),(2392,2394),(2394,2396),(2396,2398),(2398,2401),(2401,2404),(2404,2407),(2407,2410),(2410,2413),(2413,2416),(2416,2419),(2419,2422),(2422,2425),(2425,2428),(2428,2431),(2431,2434),(2434,2437),(2437,2440),(2440,2443),(2443,2446),(2446,2449),(2449,2452),(2452,2455),(2455,2458),(2458,2461),(2461,2463),(2463,2466),(2466,2470),(2470,2472),(2472,2474),(2474,2476),(2476,2478),(2478,2480),(2480,2482),(2482,2484),(2484,2486),(2486,2488),(2488,2490),(2490,2492),(2492,2494),(2494,2496),(2496,2498),(2498,2500),(2500,2502),(2502,2504),(2504,2506),(2506,2508),(2508,2510),(2510,2512),(2512,2514),(2514,2516),(2516,2518),(2518,2520),(2520,2522),(2522,2524),(2524,2526),(2526,2528),(2528,2530),(2530,2532),(2532,2534),(2534,2536),(2536,2538),(2538,2540),(2540,2542),(2542,2544),(2544,2546),(2546,2548),(2548,2550),(2550,2552),(2552,2554),(2554,2556),(2556,2558),(2558,2560),(2560,2562),(2562,2564),(2564,2
566),(2566,2568),(2568,2570),(2570,2572),(2572,2574),(2574,2576),(2576,2578),(2578,2580),(2580,2582),(2582,2584),(2584,2586),(2586,2588),(2588,2590),(2590,2592),(2592,2594),(2594,2596),(2596,2598),(2598,2600),(2600,2602),(2602,2604),(2604,2606),(2606,2608),(2608,2610),(2610,2612),(2612,2614),(2614,2616),(2616,2618),(2618,2620),(2620,2622),(2622,2624),(2624,2626),(2626,2628),(2628,2630),(2630,2632),(2632,2634),(2634,2636),(2636,2638),(2638,2640),(2640,2642),(2642,2644),(2644,2646),(2646,2648),(2648,2650),(2650,2652),(2652,2654),(2654,2656),(2656,2658),(2658,2660),(2660,2662),(2662,2664),(2664,2666),(2666,2668),(2668,2670),(2670,2672),(2672,2674),(2674,2676),(2676,2678),(2678,2680),(2680,2682),(2682,2684),(2684,2686),(2686,2688),(2688,2690),(2690,2692),(2692,2694),(2694,2696),(2696,2698),(2698,2700),(2700,2702),(2702,2704),(2704,2706),(2706,2708),(2708,2710),(2710,2712),(2712,2714),(2714,2716),(2716,2718),(2718,2720),(2720,2722),(2722,2724),(2724,2726),(2726,2728),(2728,2730),(2730,2732),(2732,2734),(2734,2736),(2736,2738),(2738,2740),(2740,2742),(2742,2744),(2744,2746),(2746,2748),(2748,2750),(2750,2752),(2752,2755),(2755,2758),(2758,2761),(2761,2768),(2768,2774),(2774,2780),(2780,2786),(2786,2792),(2792,2798),(2798,2804),(2804,2810),(2810,2815),(2815,2820),(2820,2825),(2825,2831),(2831,2836),(2836,2842),(2842,2848),(2848,2854),(2854,2857),(2857,2864),(2864,2870),(2870,2876),(2876,2882),(2882,2887),(2887,2893),(2893,2898),(2898,2903),(2903,2908),(2908,2912),(2912,2917),(2917,2922),(2922,2926),(2926,2931),(2931,2937),(2937,2942),(2942,2945),(2945,2952),(2952,2958),(2958,2964),(2964,2970),(2970,2975),(2975,2981),(2981,2986),(2986,2991),(2991,2996),(2996,3000),(3000,3005),(3005,3010),(3010,3014),(3014,3019),(3019,3025),(3025,3030),(3030,3033),(3033,3040),(3040,3046),(3046,3051),(3051,3056),(3056,3062),(3062,3068),(3068,3073),(3073,3078),(3078,3082),(3082,3087),(3087,3092),(3092,3096),(3096,3100),(3100,3105),(3105,3111),(3111,3116),(3116,3123),(3123,3129),(3129,3134),(31
34,3139),(3139,3145),(3145,3151),(3151,3156),(3156,3161),(3161,3165),(3165,3170),(3170,3175),(3175,3179),(3179,3183),(3183,3188),(3188,3194),(3194,3199),(3199,3204),(3204,3209),(3209,3213),(3213,3218),(3218,3223),(3223,3228),(3228,3233),(3233,3237),(3237,3238),(3238,3241),(3241,3245),(3245,3249),(3249,3253),(3253,3258),(3258,3263),(3263,3268),(3268,3270),(3270,3277),(3277,3283),(3283,3288),(3288,3293),(3293,3299),(3299,3304),(3304,3309),(3309,3313),(3313,3317),(3317,3321),(3321,3326),(3326,3330),(3330,3334),(3334,3339),(3339,3344),(3344,3349),(3349,3354),(3354,3359),(3359,3363),(3363,3368),(3368,3372),(3372,3377),(3377,3381),(3381,3385),(3385,3386),(3386,3389),(3389,3393),(3393,3397),(3397,3401),(3401,3406),(3406,3411),(3411,3416),(3416,3418),(3418,3424),(3424,3429),(3429,3433),(3433,3438),(3438,3443),(3443,3448),(3448,3452),(3452,3456),(3456,3460),(3460,3465),(3465,3470),(3470,3474),(3474,3478),(3478,3482),(3482,3487),(3487,3492),(3492,3498),(3498,3503),(3503,3507),(3507,3512),(3512,3517),(3517,3522),(3522,3526),(3526,3530),(3530,3534),(3534,3539),(3539,3544),(3544,3548),(3548,3552),(3552,3556),(3556,3561),(3561,3566),(3566,3571),(3571,3576),(3576,3580),(3580,3584),(3584,3588),(3588,3592),(3592,3596),(3596,3600),(3600,3601),(3601,3604),(3604,3608),(3608,3612),(3612,3615),(3615,3619),(3619,3623),(3623,3627),(3627,3629),(3629,3635),(3635,3642),(3642,3647),(3647,3653),(3653,3659),(3659,3665),(3665,3671),(3671,3676),(3676,3681),(3681,3686),(3686,3691),(3691,3696),(3696,3701),(3701,3707),(3707,3713),(3713,3719),(3719,3725),(3725,3732),(3732,3737),(3737,3743),(3743,3749),(3749,3755),(3755,3761),(3761,3766),(3766,3771),(3771,3776),(3776,3781),(3781,3786),(3786,3791),(3791,3797),(3797,3803),(3803,3809),(3809,3814),(3814,3820),(3820,3824),(3824,3829),(3829,3834),(3834,3839),(3839,3844),(3844,3848),(3848,3849),(3849,3853),(3853,3857),(3857,3861),(3861,3865),(3865,3870),(3870,3875),(3875,3880),(3880,3883),(3883,3885),(3885,3890),(3890,3895),(3895,3900),(3900,3904),(3904,3909)
,(3909,3914),(3914,3918),(3918,3922),(3922,3926),(3926,3930),(3930,3933),(3933,3937),(3937,3941),(3941,3945),(3945,3950),(3950,3954),(3954,3957),(3957,3960),(3960,3966),(3966,3972),(3972,3977),(3977,3983),(3983,3989),(3989,3994),(3994,3999),(3999,4000),(4000,4004),(4004,4009),(4009,4010),(4010,4014),(4014,4018),(4018,4022),(4022,4027),(4027,4032),(4032,4035),(4035,4041),(4041,4047),(4047,4053),(4053,4058),(4058,4062),(4062,4066),(4066,4071),(4071,4073),(4073,4074),(4074,4077),(4077,4078),(4078,4081),(4081,4082),(4082,4087),(4087,4091),(4091,4096),(4096,4099),(4099,4104),(4104,4109),(4109,4113),(4113,4117),(4117,4122),(4122,4126),(4126,4129),(4129,4132),(4132,4135),(4135,4138),(4138,4141),(4141,4145),(4145,4148),(4148,4152),(4152,4156),(4156,4160),(4160,4165),(4165,4170),(4170,4175),(4175,4180),(4180,4185),(4185,4190),(4190,4194),(4194,4199),(4199,4204),(4204,4208),(4208,4213),(4213,4217),(4217,4221),(4221,4225),(4225,4230),(4230,4235),(4235,4238),(4238,4244),(4244,4249),(4249,4255),(4255,4261),(4261,4267),(4267,4272),(4272,4277),(4277,4282),(4282,4287),(4287,4291),(4291,4297),(4297,4302),(4302,4308),(4308,4314),(4314,4320),(4320,4332),(4332,4344),(4344,4356),(4356,4369),(4369,4381),(4381,4393),(4393,4397),(4397,None)] + + + + +p_col_names = ['DUID', 'PID', 'DUPERSID', 'DRUGIDX', 'RXRECIDX', 'LINKIDX','PANEL', 'PURCHRD', 'RXBEGMM', 'RXBEGYRX', 'RXNAME', + 'RXDRGNAM', 'RXNDC', 'RXQUANTY', 'RXFORM', 'RXFRMUNT','RXSTRENG', 'RXSTRUNT', 'RXDAYSUP', 'PHARTP1', 'PHARTP2', + 'PHARTP3', 'PHARTP4', 'PHARTP5', 'PHARTP6', 'PHARTP7','PHARTP8', 'PHARTP9', 'RXFLG', 'IMPFLAG', 'PCIMPFLG', + 'DIABEQUIP', 'INPCFLG', 'PREGCAT', 'TC1', 'TC1S1','TC1S1_1', 'TC1S1_2', 'TC1S2', 'TC1S2_1', 'TC1S3', + 'TC1S3_1', 'TC2', 'TC2S1', 'TC2S1_1', 'TC2S1_2', 'TC2S2','TC3', 'TC3S1', 'TC3S1_1', 'RXSF18X', 'RXMR18X', 'RXMD18X', + 'RXPV18X', 'RXVA18X', 'RXTR18X', 'RXOF18X', 'RXSL18X','RXWC18X', 'RXOT18X', 'RXOR18X', 'RXOU18X', 'RXXP18X', + 'PERWT18F', 'VARSTR', 'VARPSU'] + +p_col_spaces = 
[(0,7),(7,10),(10,20),(20,33),(33,52),(52,68),(68,70),(70,71),(71,74),(74,78),(78,128),(128,188),(188,199), + (199,206),(206,256),(256,306),(306,356),(356,406),(406,409),(409,412),(412,414),(414,416),(416,418),(418,420),(420,422), + (422,424),(424,426),(426,428),(428,429),(429,430),(430,431),(431,432),(432,433),(433,436),(436,439),(439,442),(442,445), + (445,448),(448,451),(451,454),(454,456),(456,458),(458,461),(461,464),(464,467),(467,470),(470,473),(473,476),(476,479), + (479,482),(482,490),(490,498),(498,506),(506,514),(514,522),(522,529),(529,536),(536,543),(543,550),(550,558),(558,566), + (566,573),(573,581),(581,593),(593,597),(597,None)] From bdf7ef2ab065465c72723f87f1b808cd189ff7d3 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Fri, 7 May 2021 12:35:39 -0500 Subject: [PATCH 16/46] Change meps get_dataset to get response.content vs response --- src/mdt/meps/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdt/meps/utils.py b/src/mdt/meps/utils.py index 70e24c5..c7b73d4 100644 --- a/src/mdt/meps/utils.py +++ b/src/mdt/meps/utils.py @@ -20,7 +20,7 @@ def get_dataset( response = requests.get(url) if handler: - return handler(response) + return handler(response.content) (dest / url.split('/')[-1]).write_bytes(response.content) From 2889fcbd6de4d46558d14669df38fbc68955b038 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Fri, 7 May 2021 12:38:34 -0500 Subject: [PATCH 17/46] Allow modules to be imported from package namespace --- src/mdt/meps/__init__.py | 4 ++++ src/mdt/rxnorm/__init__.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/mdt/meps/__init__.py b/src/mdt/meps/__init__.py index e69de29..41a97c2 100644 --- a/src/mdt/meps/__init__.py +++ b/src/mdt/meps/__init__.py @@ -0,0 +1,4 @@ +from . import utils +from . 
import columns + +__all__ = ['utils', 'columns'] diff --git a/src/mdt/rxnorm/__init__.py b/src/mdt/rxnorm/__init__.py index e69de29..2b01e39 100644 --- a/src/mdt/rxnorm/__init__.py +++ b/src/mdt/rxnorm/__init__.py @@ -0,0 +1,4 @@ +from . import rxclass +from . import utils + +__all__ = ['rxclass', 'utils'] From 73dd0e9da98512c5283e02243d51e9be12ece4c0 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Fri, 7 May 2021 12:45:00 -0500 Subject: [PATCH 18/46] Add load_meps function to database.py --- src/mdt/database.py | 69 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/src/mdt/database.py b/src/mdt/database.py index 7e0e6b2..e2d0e04 100644 --- a/src/mdt/database.py +++ b/src/mdt/database.py @@ -1,7 +1,8 @@ -from .rxnorm.utils import get_dataset - +from . import rxnorm, meps from pathlib import Path -import zipfile,io, sqlite3 +import zipfile +import io +import sqlite3 import pandas as pd @@ -59,7 +60,7 @@ def read_sql_string(file_name): def load_rxnorm(): """downloads and loads RxNorm dataset into database""" - z = zipfile.ZipFile(get_dataset(handler=io.BytesIO)) + z = zipfile.ZipFile(rxnorm.utils.get_dataset(handler=io.BytesIO)) col_names = ['RXCUI','LAT','TS','LUI','STT','SUI','ISPREF','RXAUI','SAUI','SCUI','SDUI','SAB','TTY','CODE','STR','SRL','SUPPRESS','CVF','test'] rxnconso = pd.read_csv(z.open('rrf/RXNCONSO.RRF'),sep='|',header=None,dtype=object,names=col_names) @@ -76,4 +77,62 @@ def load_rxnorm(): sql_create_table('rxnsat',rxnsat) del rxnsat - del z \ No newline at end of file + del z + + +def load_meps(): + '''Load Meps data into db''' + z = zipfile.ZipFile( + meps.utils.get_dataset('h206adat.zip', handler=io.BytesIO) + ) + + meps_prescription = pd.read_fwf( + z.open('H206A.dat'), + header=None, + names=meps.columns.p_col_names, + converters={col: str for col in meps.columns.p_col_names}, + colspecs=meps.columns.p_col_spaces, + ) + + sql_create_table('meps_prescription', meps_prescription) + 
del meps_prescription + del z + + z = zipfile.ZipFile( + meps.utils.get_dataset('h209dat.zip', handler=io.BytesIO) + ) + + meps_demographics = pd.read_fwf( + z.open('h209.dat'), + header=None, + names=meps.columns.d_col_names, + converters={col: str for col in meps.columns.d_col_names}, + colspecs=meps.columns.d_col_spaces, + usecols=['DUPERSID', 'PERWT18F', "REGION18", 'SEX', 'AGELAST'] + ) + + # removing numbers from meps_demographic column names, since the '18' in region18 and perwt18f in MEPS are year-specific + meps_demographics.columns = meps_demographics.columns.str.replace(r'\d+', '',regex=True) + sql_create_table('meps_demographics', meps_demographics) + del meps_demographics + del z + + sql_create_table('meps_region_states', meps.columns.meps_region_states) + + meps_reference_str = read_sql_string('meps_reference.sql') + meps_reference = db_query(meps_reference_str) + sql_create_table('meps_reference', meps_reference) + del meps_reference + + # TEST!!!!!!!!!!!!!!!! reads record count from created database + meps_prescription = db_query("Select count(*) AS records from meps_prescription") + print('DB table meps_prescription has {0} records'.format(meps_prescription['records'].iloc[0])) + + meps_demographics = db_query("Select count(*) AS records from meps_demographics") + print('DB table meps_demographics has {0} records'.format(meps_demographics['records'].iloc[0])) + + meps_reference = db_query("Select count(*) AS records from meps_reference") + print('DB table meps_reference has {0} records'.format(meps_reference['records'].iloc[0])) + + meps_region_states = db_query("Select count(*) AS records from meps_region_states") + print('DB table meps_region_states has {0} records'.format(meps_region_states['records'].iloc[0])) From 832ec257cf58873f094c50e05de9633432c02767 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Fri, 7 May 2021 12:45:22 -0500 Subject: [PATCH 19/46] Add load_meps to main function of run_mdt module --- src/mdt/run_mdt.py | 3 ++- 1 
file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mdt/run_mdt.py b/src/mdt/run_mdt.py index 096cb81..92c8ed5 100644 --- a/src/mdt/run_mdt.py +++ b/src/mdt/run_mdt.py @@ -1,8 +1,9 @@ -from mdt.database import load_rxnorm +from mdt.database import load_rxnorm, load_meps def main(): load_rxnorm() + load_meps() if __name__ == '__main__': From bd9e747906184d03a91356bd4765a818f39dd799 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sat, 8 May 2021 09:11:06 -0500 Subject: [PATCH 20/46] Require requests and pandas to install if mdt is installed --- setup.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 0f5544d..764f3f5 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ version='1.0.0', # description='A sample Python project', # Optional # long_description=long_description, # Optional - # long_description_content_type='text/markdown', # Optional (see note above) + # long_description_content_type='text/markdown', # Optional # url='https://github.com/pypa/sampleproject', # Optional # author='A. Random Developer', # Optional # author_email='author@example.com', # Optional @@ -19,13 +19,16 @@ package_dir={'': 'src'}, packages=find_packages(where='src'), python_requires='>=3.6, <4', - # install_requires=['peppercorn'], # Optional + install_requires=[ + 'requests', + 'pandas' + ], # Optional # If there are data files included in your packages that need to be # installed, specify them here. # package_data={ # Optional # 'sample': ['package_data.dat'], - #}, + # }, # Although 'package_data' is the preferred approach, in some case you may # need to place data files outside of your packages. 
See: From 3ce65feb6584a4dd1975d2c60ec48b86f2459838 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 9 May 2021 18:07:41 -0500 Subject: [PATCH 21/46] Change package install name to mdt, include .sql files in packages --- setup.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 764f3f5..8893d23 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ long_description = (here / 'README.md').read_text(encoding='utf-8') setup( - name='medicationDiversification', + name='mdt', version='1.0.0', # description='A sample Python project', # Optional # long_description=long_description, # Optional @@ -26,10 +26,9 @@ # If there are data files included in your packages that need to be # installed, specify them here. - # package_data={ # Optional - # 'sample': ['package_data.dat'], - # }, - + package_data={ + "":['*.sql'] + } # Although 'package_data' is the preferred approach, in some case you may # need to place data files outside of your packages. 
See: # http://docs.python.org/distutils/setupscript.html#installing-additional-files From ceee61e3c3c888de110d1df8dfe6a23607f8b675 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 9 May 2021 18:10:17 -0500 Subject: [PATCH 22/46] Add sql packages to rxnorm & meps, include sql files --- src/mdt/meps/sql/__init__.py | 0 src/mdt/meps/sql/meps_reference.sql | 14 ++ src/mdt/rxnorm/sql/__init__.py | 0 src/mdt/rxnorm/sql/rxcui_ndc.sql | 206 ++++++++++++++++++++++++++++ 4 files changed, 220 insertions(+) create mode 100644 src/mdt/meps/sql/__init__.py create mode 100644 src/mdt/meps/sql/meps_reference.sql create mode 100644 src/mdt/rxnorm/sql/__init__.py create mode 100644 src/mdt/rxnorm/sql/rxcui_ndc.sql diff --git a/src/mdt/meps/sql/__init__.py b/src/mdt/meps/sql/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/mdt/meps/sql/meps_reference.sql b/src/mdt/meps/sql/meps_reference.sql new file mode 100644 index 0000000..7cd2e62 --- /dev/null +++ b/src/mdt/meps/sql/meps_reference.sql @@ -0,0 +1,14 @@ +--"Sex" assignments are from MEPS, source: https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=PROJYR15&varName=SEX + +SELECT DISTINCT + t1.dupersid, + t2.perwtf AS person_weight, + t1.rxndc, + CASE WHEN t2.sex = 1 THEN 'M' + WHEN t2.sex = 2 THEN 'F' + END AS gender, + t2.agelast, --patient's last known age; advantage of using this col over other age cols is every patient has age (no NULLs) + t2.region AS region_num + FROM meps_prescription AS t1 + INNER JOIN meps_demographics AS t2 + ON t1.dupersid = t2.dupersid \ No newline at end of file diff --git a/src/mdt/rxnorm/sql/__init__.py b/src/mdt/rxnorm/sql/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/mdt/rxnorm/sql/rxcui_ndc.sql b/src/mdt/rxnorm/sql/rxcui_ndc.sql new file mode 100644 index 0000000..920fc4d --- /dev/null +++ b/src/mdt/rxnorm/sql/rxcui_ndc.sql @@ -0,0 +1,206 @@ +select distinct + sq.medication_ingredient_rxcui + , 
sq.medication_ingredient_name + , sq.medication_ingredient_tty + , sq.medication_product_rxcui + , sq.medication_product_name + , sq.medication_product_tty + + , df_rxnconso.rxcui as dose_form_rxcui + , df_rxnconso.str as dose_form_name + , df_rxnconso.tty as dose_form_tty + + --, dfg_rxnconso.rxcui as dose_form_group_rxcui + --, dfg_rxnconso.str as dose_form_group_name + --, dfg_rxnconso.tty as dose_form_group_tty + + , ndc_rxnsat.atv as medication_ndc + +from ( + + select in_rxnconso.rxcui as medication_ingredient_rxcui + , in_rxnconso.str as medication_ingredient_name + , in_rxnconso.tty as medication_ingredient_tty + , scd_rxnconso.rxcui as medication_product_rxcui + , scd_rxnconso.str as medication_product_name + , scd_rxnconso.tty as medication_product_tty + + -- medication ingredient (IN) + from rxnconso in_rxnconso + + -- medication product (SCDC -> SCD) + left join rxnrel scdc_rxnrel on scdc_rxnrel.rxcui2 = in_rxnconso.rxcui and scdc_rxnrel.rela = 'ingredient_of' + left join rxnconso scdc_rxnconso on scdc_rxnconso.rxcui = scdc_rxnrel.rxcui1 and scdc_rxnconso.sab = 'RXNORM' and scdc_rxnconso.tty = 'SCDC' + left join rxnrel scd_rxnrel on scd_rxnrel.rxcui2 = scdc_rxnrel.rxcui1 and scd_rxnrel.rela = 'constitutes' + left join rxnconso scd_rxnconso on scd_rxnconso.rxcui = scd_rxnrel.rxcui1 and scd_rxnconso.sab = 'RXNORM' and scd_rxnconso.tty = 'SCD' + + where in_rxnconso.tty = 'IN' + and in_rxnconso.sab = 'RXNORM' + +union all + + select in_rxnconso.rxcui as medication_ingredient_rxcui + , in_rxnconso.str as medication_ingredient_name + , in_rxnconso.tty as medication_ingredient_tty + , sbd_rxnconso.rxcui as medication_product_rxcui + , sbd_rxnconso.str as medication_product_name + , sbd_rxnconso.tty as medication_product_tty + + -- medication ingredient (IN) + from rxnconso in_rxnconso + + -- medication product (BN -> SBD) + left join rxnrel bn_rxnrel on bn_rxnrel.rxcui2 = in_rxnconso.rxcui and bn_rxnrel.rela = 'has_tradename' + left join rxnconso bn_rxnconso 
on bn_rxnconso.rxcui = bn_rxnrel.rxcui1 and bn_rxnconso.sab = 'RXNORM' and bn_rxnconso.tty = 'BN' + left join rxnrel sbd_rxnrel on sbd_rxnrel.rxcui2 = bn_rxnrel.rxcui1 and sbd_rxnrel.rela = 'ingredient_of' + left join rxnconso sbd_rxnconso on sbd_rxnconso.rxcui = sbd_rxnrel.rxcui1 and sbd_rxnconso.sab = 'RXNORM' and sbd_rxnconso.tty = 'SBD' + + where in_rxnconso.tty = 'IN' + and in_rxnconso.sab = 'RXNORM' + +union all + + select in_rxnconso.rxcui as medication_ingredient_rxcui + , in_rxnconso.str as medication_ingredient_name + , in_rxnconso.tty as medication_ingredient_tty + , gpck_rxnconso.rxcui as medication_product_rxcui + , gpck_rxnconso.str as medication_product_name + , gpck_rxnconso.tty as medication_product_tty + + -- medication ingredient (IN) + from rxnconso in_rxnconso + + -- medication product (SCDC -> SCD -> GPCK) + left join rxnrel scdc_rxnrel on scdc_rxnrel.rxcui2 = in_rxnconso.rxcui and scdc_rxnrel.rela = 'ingredient_of' + left join rxnconso scdc_rxnconso on scdc_rxnconso.rxcui = scdc_rxnrel.rxcui1 and scdc_rxnconso.sab = 'RXNORM' and scdc_rxnconso.tty = 'SCDC' + left join rxnrel scd_rxnrel on scd_rxnrel.rxcui2 = scdc_rxnrel.rxcui1 and scd_rxnrel.rela = 'constitutes' + left join rxnconso scd_rxnconso on scd_rxnconso.rxcui = scd_rxnrel.rxcui1 and scd_rxnconso.sab = 'RXNORM' and scd_rxnconso.tty = 'SCD' + left join rxnrel gpck_rxnrel on gpck_rxnrel.rxcui2 = scd_rxnrel.rxcui1 and gpck_rxnrel.rela = 'contained_in' + left join rxnconso gpck_rxnconso on gpck_rxnconso.rxcui = gpck_rxnrel.rxcui1 and gpck_rxnconso.sab = 'RXNORM' and gpck_rxnconso.tty = 'GPCK' + + where in_rxnconso.tty = 'IN' + and in_rxnconso.sab = 'RXNORM' + +union all + + select in_rxnconso.rxcui as medication_ingredient_rxcui + , in_rxnconso.str as medication_ingredient_name + , in_rxnconso.tty as medication_ingredient_tty + , bpck_rxnconso.rxcui as medication_product_rxcui + , bpck_rxnconso.str as medication_product_name + , bpck_rxnconso.tty as medication_product_tty + + -- medication 
ingredient (IN) + from rxnconso in_rxnconso + + -- medication product (SCDC -> SCD -> GPCK -> BPCK) + left join rxnrel scdc_rxnrel on scdc_rxnrel.rxcui2 = in_rxnconso.rxcui and scdc_rxnrel.rela = 'ingredient_of' + left join rxnconso scdc_rxnconso on scdc_rxnconso.rxcui = scdc_rxnrel.rxcui1 and scdc_rxnconso.sab = 'RXNORM' and scdc_rxnconso.tty = 'SCDC' + left join rxnrel scd_rxnrel on scd_rxnrel.rxcui2 = scdc_rxnrel.rxcui1 and scd_rxnrel.rela = 'constitutes' + left join rxnconso scd_rxnconso on scd_rxnconso.rxcui = scd_rxnrel.rxcui1 and scd_rxnconso.sab = 'RXNORM' and scd_rxnconso.tty = 'SCD' + left join rxnrel gpck_rxnrel on gpck_rxnrel.rxcui2 = scd_rxnrel.rxcui1 and gpck_rxnrel.rela = 'contained_in' + left join rxnconso gpck_rxnconso on gpck_rxnconso.rxcui = gpck_rxnrel.rxcui1 and gpck_rxnconso.sab = 'RXNORM' and gpck_rxnconso.tty = 'GPCK' + left join rxnrel bpck_rxnrel on bpck_rxnrel.rxcui2 = gpck_rxnrel.rxcui1 and bpck_rxnrel.rela = 'has_tradename' + left join rxnconso bpck_rxnconso on bpck_rxnconso.rxcui = bpck_rxnrel.rxcui1 and bpck_rxnconso.sab = 'RXNORM' and bpck_rxnconso.tty = 'BPCK' + + where in_rxnconso.tty = 'IN' + and in_rxnconso.sab = 'RXNORM' + +union all + + select min_rxnconso.rxcui as medication_ingredient_rxcui + , min_rxnconso.str as medication_ingredient_name + , min_rxnconso.tty as medication_ingredient_tty + , scd_rxnconso.rxcui as medication_product_rxcui + , scd_rxnconso.str as medication_product_name + , scd_rxnconso.tty as medication_product_tty + + -- medication ingredient (MIN) + from rxnconso min_rxnconso + + -- medication product (SCD) + left join rxnrel scd_rxnrel on scd_rxnrel.rxcui2 = min_rxnconso.rxcui and scd_rxnrel.rela = 'ingredients_of' + left join rxnconso scd_rxnconso on scd_rxnconso.rxcui = scd_rxnrel.rxcui1 and scd_rxnconso.sab = 'RXNORM' and scd_rxnconso.tty = 'SCD' + + where min_rxnconso.tty = 'MIN' + and min_rxnconso.sab = 'RXNORM' + +union all + + select min_rxnconso.rxcui as medication_ingredient_rxcui + , 
min_rxnconso.str as medication_ingredient_name + , min_rxnconso.tty as medication_ingredient_tty + , sbd_rxnconso.rxcui as medication_product_rxcui + , sbd_rxnconso.str as medication_product_name + , sbd_rxnconso.tty as medication_product_tty + + -- medication ingredient (MIN) + from rxnconso min_rxnconso + + -- medication product (SCD -> SBD) + left join rxnrel scd_rxnrel on scd_rxnrel.rxcui2 = min_rxnconso.rxcui and scd_rxnrel.rela = 'ingredients_of' + left join rxnconso scd_rxnconso on scd_rxnconso.rxcui = scd_rxnrel.rxcui1 and scd_rxnconso.sab = 'RXNORM' and scd_rxnconso.tty = 'SCD' + left join rxnrel sbd_rxnrel on sbd_rxnrel.rxcui2 = scd_rxnrel.rxcui1 and sbd_rxnrel.rela = 'has_tradename' + left join rxnconso sbd_rxnconso on sbd_rxnconso.rxcui = sbd_rxnrel.rxcui1 and sbd_rxnconso.sab = 'RXNORM' and sbd_rxnconso.tty = 'SBD' + + where min_rxnconso.tty = 'MIN' + and min_rxnconso.sab = 'RXNORM' + +union all + + select min_rxnconso.rxcui as medication_ingredient_rxcui + , min_rxnconso.str as medication_ingredient_name + , min_rxnconso.tty as medication_ingredient_tty + , gpck_rxnconso.rxcui as medication_product_rxcui + , gpck_rxnconso.str as medication_product_name + , gpck_rxnconso.tty as medication_product_tty + + -- medication ingredient (MIN) + from rxnconso min_rxnconso + + -- medication product (SCD -> GPCK) + left join rxnrel scd_rxnrel on scd_rxnrel.rxcui2 = min_rxnconso.rxcui and scd_rxnrel.rela = 'ingredients_of' + left join rxnconso scd_rxnconso on scd_rxnconso.rxcui = scd_rxnrel.rxcui1 and scd_rxnconso.sab = 'RXNORM' and scd_rxnconso.tty = 'SCD' + left join rxnrel gpck_rxnrel on gpck_rxnrel.rxcui2 = scd_rxnrel.rxcui1 and gpck_rxnrel.rela = 'contained_in' + left join rxnconso gpck_rxnconso on gpck_rxnconso.rxcui = gpck_rxnrel.rxcui1 and gpck_rxnconso.sab = 'RXNORM' and gpck_rxnconso.tty = 'GPCK' + + where min_rxnconso.tty = 'MIN' + and min_rxnconso.sab = 'RXNORM' + +union all + + select min_rxnconso.rxcui as medication_ingredient_rxcui + , 
min_rxnconso.str as medication_ingredient_name + , min_rxnconso.tty as medication_ingredient_tty + , bpck_rxnconso.rxcui as medication_product_rxcui + , bpck_rxnconso.str as medication_product_name + , bpck_rxnconso.tty as medication_product_tty + + -- medication ingredient (MIN) + from rxnconso min_rxnconso + + -- medication product (SCD -> SBD -> BPCK) + left join rxnrel scd_rxnrel on scd_rxnrel.rxcui2 = min_rxnconso.rxcui and scd_rxnrel.rela = 'ingredients_of' + left join rxnconso scd_rxnconso on scd_rxnconso.rxcui = scd_rxnrel.rxcui1 and scd_rxnconso.sab = 'RXNORM' and scd_rxnconso.tty = 'SCD' + left join rxnrel sbd_rxnrel on sbd_rxnrel.rxcui2 = scd_rxnrel.rxcui1 and sbd_rxnrel.rela = 'has_tradename' + left join rxnconso sbd_rxnconso on sbd_rxnconso.rxcui = sbd_rxnrel.rxcui1 and sbd_rxnconso.sab = 'RXNORM' and sbd_rxnconso.tty = 'SBD' + left join rxnrel bpck_rxnrel on bpck_rxnrel.rxcui2 = sbd_rxnrel.rxcui1 and bpck_rxnrel.rela = 'contained_in' + left join rxnconso bpck_rxnconso on bpck_rxnconso.rxcui = bpck_rxnrel.rxcui1 and bpck_rxnconso.sab = 'RXNORM' and bpck_rxnconso.tty = 'BPCK' + + where min_rxnconso.tty = 'MIN' + and min_rxnconso.sab = 'RXNORM' +) as sq + +-- dose form +left join rxnrel df_rxnrel on df_rxnrel.rxcui2 = sq.medication_product_rxcui and df_rxnrel.rela = 'has_dose_form' +left join rxnconso df_rxnconso on df_rxnconso.rxcui = df_rxnrel.rxcui1 and df_rxnconso.sab = 'RXNORM' and df_rxnconso.tty = 'DF' + +-- dose form group +--left join rxnrel dfg_rxnrel on dfg_rxnrel.rxcui2 = df_rxnrel.rxcui1 and dfg_rxnrel.rela = 'isa' +--left join rxnconso dfg_rxnconso on dfg_rxnconso.rxcui = dfg_rxnrel.rxcui1 and dfg_rxnconso.sab = 'RXNORM' and dfg_rxnconso.tty = 'DFG' + +-- ndc +left join rxnsat ndc_rxnsat on ndc_rxnsat.rxcui = sq.medication_product_rxcui and ndc_rxnsat.sab = 'RXNORM' and ndc_rxnsat.atn = 'NDC' + +where ndc_rxnsat.atv is not null +-- and sq.medication_ingredient_rxcui in ('285155','10582','10814','10565','325521','10572') \ No newline at end 
of file From e2defa1d66b0af5b85c5bd0f1e6b8a70115f806c Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 9 May 2021 18:14:03 -0500 Subject: [PATCH 23/46] Initial get_sql function to get meps package sql files --- src/mdt/meps/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/mdt/meps/utils.py b/src/mdt/meps/utils.py index c7b73d4..e2b9650 100644 --- a/src/mdt/meps/utils.py +++ b/src/mdt/meps/utils.py @@ -1,8 +1,12 @@ import os +import importlib.resources as pkg_resources from pathlib import Path from typing import Callable import requests +from . import sql + + def get_dataset( dat_name: str, @@ -25,3 +29,8 @@ def get_dataset( (dest / url.split('/')[-1]).write_bytes(response.content) return response + + +def get_sql(file_name): + meps_sql = pkg_resources.read_text(sql, file_name) + return meps_sql From 5f5a454316cacb07479d8c9c6e294565891aec85 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Mon, 10 May 2021 07:51:10 -0500 Subject: [PATCH 24/46] Import lib requires python >3.7 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8893d23..be37c6e 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ # keywords='sample, setuptools, development', # Optional package_dir={'': 'src'}, packages=find_packages(where='src'), - python_requires='>=3.6, <4', + python_requires='>=3.7 <4', install_requires=[ 'requests', 'pandas' From 90a7f0897f12b9616b7330bf50d3ea5f59ade8c9 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sat, 15 May 2021 08:23:38 -0500 Subject: [PATCH 25/46] Missing comma in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index be37c6e..fe22fef 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ # keywords='sample, setuptools, development', # Optional package_dir={'': 'src'}, packages=find_packages(where='src'), - python_requires='>=3.7 <4', + python_requires='>=3.7, <4', install_requires=[ 'requests', 
'pandas' From f303c5cf1febad2dfac1fa028ce9714c7f4a6060 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 08:30:40 -0500 Subject: [PATCH 26/46] Use meps utils function to get reference sql --- src/mdt/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdt/database.py b/src/mdt/database.py index e2d0e04..07b288f 100644 --- a/src/mdt/database.py +++ b/src/mdt/database.py @@ -119,7 +119,7 @@ def load_meps(): sql_create_table('meps_region_states', meps.columns.meps_region_states) - meps_reference_str = read_sql_string('meps_reference.sql') + meps_reference_str = meps.utils.get_sql('meps_reference.sql') meps_reference = db_query(meps_reference_str) sql_create_table('meps_reference', meps_reference) del meps_reference From fcbe75bb7f5b5643e5a67a46d112cd75ab7e4579 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 08:33:18 -0500 Subject: [PATCH 27/46] Add get_sql function to rxnorm utils --- src/mdt/rxnorm/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/mdt/rxnorm/utils.py b/src/mdt/rxnorm/utils.py index a8d49c8..b4a56ee 100644 --- a/src/mdt/rxnorm/utils.py +++ b/src/mdt/rxnorm/utils.py @@ -1,7 +1,11 @@ from pathlib import Path +import importlib.resources as pkg_resources import requests, os from typing import Callable +from . 
import sql + + def json_extract(obj, key): """Recursively fetch values from nested JSON.""" arr = [] @@ -60,3 +64,7 @@ def get_dataset( (dest / url.split('/')[-1]).write_bytes(response.content) return response + +def get_sql(file_name): + meps_sql = pkg_resources.read_text(sql, file_name) + return meps_sql From 2d08999324b8899874f7f3b4930260acb05787c5 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 14:43:12 -0500 Subject: [PATCH 28/46] Rename synthea.py to utils.py --- src/mdt/{synthea.py => utils.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/mdt/{synthea.py => utils.py} (100%) diff --git a/src/mdt/synthea.py b/src/mdt/utils.py similarity index 100% rename from src/mdt/synthea.py rename to src/mdt/utils.py From 9f46e230520aa3ec7a9eb275c31f3f7fc6d73f76 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 14:45:57 -0500 Subject: [PATCH 29/46] Basic mdt config.py, will need future refactor --- src/mdt/config.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 src/mdt/config.py diff --git a/src/mdt/config.py b/src/mdt/config.py new file mode 100644 index 0000000..6fb0551 --- /dev/null +++ b/src/mdt/config.py @@ -0,0 +1,5 @@ +MEPS_CONFIG={ + "age": ["0-3", "4-7", "8-11", "12-18", "19-49", "50-64", "65-99"], + "demographic_distrib_flags" : {"age": "Y", "gender": "Y", "state": "Y"}, + "meps_year" : "18" +} From a1dbc3888bcba9893a5fc6041739093d6799bee6 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 16:13:08 -0500 Subject: [PATCH 30/46] Add missing payload constructor import --- src/mdt/rxnorm/rxclass.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mdt/rxnorm/rxclass.py b/src/mdt/rxnorm/rxclass.py index 70f422e..73cb971 100644 --- a/src/mdt/rxnorm/rxclass.py +++ b/src/mdt/rxnorm/rxclass.py @@ -1,3 +1,4 @@ +from .utils import payload_constructor def rxclass_findclassesbyid_payload(class_id): From 9b3b854959715df8ee792d383f3aa0d6d478b3b8 Mon Sep 17 00:00:00 2001 From: 
Yevgeny Bulochnik Date: Sun, 16 May 2021 16:17:39 -0500 Subject: [PATCH 31/46] Add missing urllib import --- src/mdt/rxnorm/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mdt/rxnorm/utils.py b/src/mdt/rxnorm/utils.py index b4a56ee..bd88458 100644 --- a/src/mdt/rxnorm/utils.py +++ b/src/mdt/rxnorm/utils.py @@ -1,3 +1,4 @@ +import urllib from pathlib import Path import importlib.resources as pkg_resources import requests, os From 25ed016a259ed3b9b42129565095cec26f2601f0 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 16:20:50 -0500 Subject: [PATCH 32/46] Add missing imports, re, pandas, meps, database functions --- src/mdt/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mdt/utils.py b/src/mdt/utils.py index 198074a..2f2653d 100644 --- a/src/mdt/utils.py +++ b/src/mdt/utils.py @@ -1,4 +1,8 @@ - +import json +import re +import pandas as pd +from mdt.database import db_query, read_sql_string +from mdt import meps def read_json(file_name): # Opening JSON file @@ -108,7 +112,7 @@ def generate_module(rxcui_ndc_df, rxclass_name): meps_rxcui = meps_rxcui.merge(age_ranges.astype(str), how='inner', left_on='AGELAST', right_on='age_values') #Optional: State-region mapping from MEPS if demographic_distrib_flags['state'] == 'Y': - meps_rxcui = meps_rxcui.merge(meps_region_states.astype(str), how='inner', left_on='region_num', right_on='region_value') + meps_rxcui = meps_rxcui.merge(meps.columns.meps_region_states.astype(str), how='inner', left_on='region_num', right_on='region_value') #Clean text to JSON/SQL-friendly format From 1ebb93443c0363f8f130122b01933d29b4916655 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 16:26:34 -0500 Subject: [PATCH 33/46] Move rx_api script into run_mdt.py, fix imports, currently broken --- src/mdt/run_mdt.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/src/mdt/run_mdt.py b/src/mdt/run_mdt.py 
index 92c8ed5..cd4d2e9 100644 --- a/src/mdt/run_mdt.py +++ b/src/mdt/run_mdt.py @@ -1,10 +1,72 @@ from mdt.database import load_rxnorm, load_meps +from mdt import rxnorm +from mdt.utils import ( + rxcui_ndc_matcher, + output_df, + generate_module +) def main(): load_rxnorm() load_meps() + #TODO: replace this with config settings or JSON input + #For testing: D007037 = Hypothyroidism, D001249 = Asthma + rxclass_id = 'D001249' + rxclass_rela = 'may_treat' + + #Call RxClass FindClassesById API to get class info (name primarily) of the specified class + rxclass_response = rxnorm.utils.rxapi_get_requestor( + rxnorm.rxclass.rxclass_findclassesbyid_payload(rxclass_id) + ) + rxclass_names = rxnorm.utils.json_extract(rxclass_response, 'className') + #TODO: allow for name override in input settings + #TODO: build in better error handling if rxclass_id is garbage or returns no info + rxclass_name = rxclass_names[0] if len(rxclass_names) > 0 else 'unspecified' + + #Call RxClass GetClassMember API to get members of the specified class with specified relationship(s) + rxclass_response = rxnorm.utils.rxapi_get_requestor( + rxnorm.rxclass.rxclass_getclassmember_payload(rxclass_id, rxclass_rela) + ) + + #First, get all medications that contain one of the ingredient RXCUIs + #This will result in duplicate NDCs and potentially no MINs + rxcui_ingredient_list = rxnorm.utils.json_extract(rxclass_response, 'rxcui') + rxcui_ingredient_df = rxcui_ndc_matcher(rxcui_ingredient_list) + + #Second, get all of the medications that contain one of the product RXCUIs in the df above + #This will result in potentially INs and MINs, but still duplicate NDCs + rxcui_product_list = rxcui_ingredient_df['medication_product_rxcui'].drop_duplicates().tolist() + rxcui_product_df = rxcui_ndc_matcher(rxcui_product_list) + + #Third, query the df above with a window function to group by NDC and prefer MIN over IN + #This will result in only distinct NDCs that map to either an MIN (preferred) or an IN + 
#https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_sql.html#top-n-rows-per-group + rxcui_ndc_df = rxcui_product_df.assign( + rn = rxcui_product_df.sort_values(['medication_ingredient_tty'], ascending=False) + .groupby(['medication_ndc']) + .cumcount() + + 1 + ).query('rn < 2').drop(columns=['rn']) + + #Filter by dose form group (DFG) or dose form (DF) + #Function expects the rxcui_ndc_df, a list of DFG or DF names, and a flag for whether to include (default) or exclude + #If list of DFGs or DFs is empty, then nothing is filtered out + #https://www.nlm.nih.gov/research/umls/rxnorm/docs/appendix3.html + + # Add in after adding dfg info + # dfg_df_list = [] + # rxcui_ndc_df = filter_by_df(rxcui_ndc_df, dfg_df_list) + + #Saves df to csv + output_df(rxcui_ndc_df) + + #Gets distributions for the rxcui_ndc_df products + #TODO: adjust the second argument so that it'll grab the rxclass_sources (class + description, e.g., asthma_may_prevent or ATC, e.g., CCBs) + #TODO: maybe add an input for a population_df so we can modularize MEPS in case they replace it with another population source + generate_module(rxcui_ndc_df, rxclass_name) + if __name__ == '__main__': main() From c016d304d55721e57681c86c19beb5a986746779 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 16:40:17 -0500 Subject: [PATCH 34/46] Use meps get_sql function --- src/mdt/database.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/mdt/database.py b/src/mdt/database.py index 07b288f..e8631e0 100644 --- a/src/mdt/database.py +++ b/src/mdt/database.py @@ -79,6 +79,10 @@ def load_rxnorm(): del z + rxcui_ndc = db_query(rxnorm.utils.get_sql('rxcui_ndc.sql')) + sql_create_table('rxcui_ndc', rxcui_ndc) + del rxcui_ndc + def load_meps(): '''Load Meps data into db''' From ed6483ad64fae124c918e52d17d70018957fae0e Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 16:40:56 -0500 Subject: [PATCH 35/46] Use meps get_sql function in 
mdt.utils --- src/mdt/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mdt/utils.py b/src/mdt/utils.py index 2f2653d..fa7d5ba 100644 --- a/src/mdt/utils.py +++ b/src/mdt/utils.py @@ -90,8 +90,7 @@ def generate_module(rxcui_ndc_df, rxclass_name): #Get tuples of medication_product names and medication_product RXCUIs and loop through to generate MedicationOrders #Read in MEPS Reference table - meps_reference_str = read_sql_string('meps_reference.sql') - meps_reference = db_query(meps_reference_str) + meps_reference = db_query(meps.utils.get_sql('meps_reference.sql')) #Join MEPS to filtered rxcui_ndc dataframe (rxcui_list) meps_rxcui = meps_reference.astype(str).merge(rxcui_ndc_df.astype(str)[['medication_ingredient_name', 'medication_ingredient_rxcui','medication_product_name', 'medication_product_rxcui', 'medication_ndc']], how = 'inner', left_on = 'RXNDC', right_on = 'medication_ndc') From d689303b4ef6d971b0c4bddd1aaaa82157edefa1 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 16:53:08 -0500 Subject: [PATCH 36/46] Monkey patch to read age and age values from python config.py --- src/mdt/utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/mdt/utils.py b/src/mdt/utils.py index fa7d5ba..dcb7335 100644 --- a/src/mdt/utils.py +++ b/src/mdt/utils.py @@ -1,6 +1,7 @@ import json import re import pandas as pd +from mdt.config import MEPS_CONFIG from mdt.database import db_query, read_sql_string from mdt import meps @@ -13,11 +14,12 @@ def read_json(file_name): return data -def age_values(file_name): +# Monkey patched this function to get run_mdt working by removing the filename arg and importing from config +def age_values(): """reads age_ranges from JSON to create dataframe with age_values""" - + data = {} - data['age'] = read_json('mdt_config.json')['age'] + data['age'] = MEPS_CONFIG['age'] data['age_values'] = [list(range(int(age.split('-')[0]), int(age.split('-')[1])+1)) for 
age in data['age']] df = pd.DataFrame(data) df = df.explode('age_values') @@ -98,7 +100,7 @@ def generate_module(rxcui_ndc_df, rxclass_name): #Optional: Age range join - can be customized in the mdt_config.json file #groupby_demographic_variable: must be either an empty list [] or list of patient demographics (e.g., age, gender, state) - based on user inputs in the mdt_config.json file - data = read_json('mdt_config.json') + data = MEPS_CONFIG demographic_distrib_flags = data['demographic_distrib_flags'] groupby_demographic_variables = [] @@ -107,7 +109,7 @@ def generate_module(rxcui_ndc_df, rxclass_name): groupby_demographic_variables.append(k) if demographic_distrib_flags['age'] == 'Y': - age_ranges = age_values('mdt_config.json') + age_ranges = age_values() meps_rxcui = meps_rxcui.merge(age_ranges.astype(str), how='inner', left_on='AGELAST', right_on='age_values') #Optional: State-region mapping from MEPS if demographic_distrib_flags['state'] == 'Y': From 1848d756c43d1126a7469dfd00be64676610040a Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 16:55:29 -0500 Subject: [PATCH 37/46] Skip loading rxnorm and meps if MDT.db exists --- src/mdt/run_mdt.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mdt/run_mdt.py b/src/mdt/run_mdt.py index cd4d2e9..759bce4 100644 --- a/src/mdt/run_mdt.py +++ b/src/mdt/run_mdt.py @@ -1,3 +1,4 @@ +from pathlib import Path from mdt.database import load_rxnorm, load_meps from mdt import rxnorm from mdt.utils import ( @@ -8,8 +9,10 @@ def main(): - load_rxnorm() - load_meps() + + if not (Path.cwd() / 'data' / 'MDT.db'): + load_rxnorm() + load_meps() #TODO: replace this with config settings or JSON input #For testing: D007037 = Hypothyroidism, D001249 = Asthma From 270bf4feae3b5e49802d1a64b6aeecead9da7355 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 16:58:57 -0500 Subject: [PATCH 38/46] Uses system args to pass rxclass_id and rxclass_rela to run_mdt.py --- 
src/mdt/run_mdt.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/mdt/run_mdt.py b/src/mdt/run_mdt.py index 759bce4..f03a4d1 100644 --- a/src/mdt/run_mdt.py +++ b/src/mdt/run_mdt.py @@ -1,3 +1,4 @@ +import sys from pathlib import Path from mdt.database import load_rxnorm, load_meps from mdt import rxnorm @@ -8,17 +9,15 @@ ) -def main(): +#TODO: replace this with config settings or JSON input +#For testing: D007037 = Hypothyroidism, D001249 = Asthma + +def main(rxclass_id, rxclass_rela): if not (Path.cwd() / 'data' / 'MDT.db'): load_rxnorm() load_meps() - #TODO: replace this with config settings or JSON input - #For testing: D007037 = Hypothyroidism, D001249 = Asthma - rxclass_id = 'D001249' - rxclass_rela = 'may_treat' - #Call RxClass FindClassesById API to get class info (name primarily) of the specified class rxclass_response = rxnorm.utils.rxapi_get_requestor( rxnorm.rxclass.rxclass_findclassesbyid_payload(rxclass_id) @@ -72,4 +71,6 @@ def main(): if __name__ == '__main__': - main() + rxclass_id = sys.argv[1] + rxclass_rela = sys.argv[2] + main(rxclass_id, rxclass_rela) From 23569cdcee9bce10be29089f60860669806afbc5 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 17:16:44 -0500 Subject: [PATCH 39/46] Add rxnorm dosage form sql --- src/mdt/rxnorm/sql/dfg_df.sql | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 src/mdt/rxnorm/sql/dfg_df.sql diff --git a/src/mdt/rxnorm/sql/dfg_df.sql b/src/mdt/rxnorm/sql/dfg_df.sql new file mode 100644 index 0000000..e7a57c1 --- /dev/null +++ b/src/mdt/rxnorm/sql/dfg_df.sql @@ -0,0 +1,10 @@ +select distinct df_rxnconso.str as df, dfg_rxnconso.str as dfg + +-- dose form +from rxnconso df_rxnconso + +-- dose form group +left join rxnrel dfg_rxnrel on dfg_rxnrel.rxcui2 = df_rxnconso.rxcui and dfg_rxnrel.rela = 'isa' +left join rxnconso dfg_rxnconso on dfg_rxnconso.rxcui = dfg_rxnrel.rxcui1 and dfg_rxnconso.sab = 'RXNORM' and dfg_rxnconso.tty = 'DFG' + 
+where df_rxnconso.sab = 'RXNORM' and df_rxnconso.tty = 'DF' From fffc61e1c3802b1e36c042afdb7b526c953369ff Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 17:17:44 -0500 Subject: [PATCH 40/46] Load dfg table with load_rxnorm --- src/mdt/database.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/mdt/database.py b/src/mdt/database.py index e8631e0..ae5f560 100644 --- a/src/mdt/database.py +++ b/src/mdt/database.py @@ -75,7 +75,7 @@ def load_rxnorm(): col_names = ['RXCUI','LUI','SUI','RXAUI','STYPE','CODE','ATUI','SATUI','ATN','SAB','ATV','SUPPRESS','CVF','test'] rxnsat = pd.read_csv(z.open('rrf/RXNSAT.RRF'),sep='|',dtype=object,header=None,names=col_names) sql_create_table('rxnsat',rxnsat) - del rxnsat + del rxnsat del z @@ -83,6 +83,10 @@ def load_rxnorm(): sql_create_table('rxcui_ndc', rxcui_ndc) del rxcui_ndc + dfg_df = db_query(rxnorm.utils.get_sql('dfg_df.sql')) + sql_create_table('dfg_df', dfg_df) + del dfg_df + def load_meps(): '''Load Meps data into db''' From 029fd498c944d1eee0129cd2a96af22cc31f4799 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 17:19:20 -0500 Subject: [PATCH 41/46] Add filter_by_df function to mdt.utils, missing path.exists added --- src/mdt/run_mdt.py | 7 ++++--- src/mdt/utils.py | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/mdt/run_mdt.py b/src/mdt/run_mdt.py index f03a4d1..15f6697 100644 --- a/src/mdt/run_mdt.py +++ b/src/mdt/run_mdt.py @@ -4,6 +4,7 @@ from mdt import rxnorm from mdt.utils import ( rxcui_ndc_matcher, + filter_by_df, output_df, generate_module ) @@ -14,7 +15,7 @@ def main(rxclass_id, rxclass_rela): - if not (Path.cwd() / 'data' / 'MDT.db'): + if not (Path.cwd() / 'data' / 'MDT.db').exists(): load_rxnorm() load_meps() @@ -58,8 +59,8 @@ def main(rxclass_id, rxclass_rela): #https://www.nlm.nih.gov/research/umls/rxnorm/docs/appendix3.html # Add in after adding dfg info - # dfg_df_list = [] - # 
rxcui_ndc_df = filter_by_df(rxcui_ndc_df, dfg_df_list) + dfg_df_list = [] + rxcui_ndc_df = filter_by_df(rxcui_ndc_df, dfg_df_list) #Saves df to csv output_df(rxcui_ndc_df) diff --git a/src/mdt/utils.py b/src/mdt/utils.py index dcb7335..662f62c 100644 --- a/src/mdt/utils.py +++ b/src/mdt/utils.py @@ -38,6 +38,29 @@ def rxcui_ndc_matcher(rxcui_list): return filtered_df +def filter_by_df(rxcui_ndc_df, dfg_df_list, method='include'): + """Gets DFs from dfg_df table that match either a DF in the list, or have a DFG that matches a DFG in the list + If dfg_df list is empty, return the rxcui_ndc_df without filtering + Select method option of include or exclude....include is default""" + + if len(dfg_df_list) == 0: + return rxcui_ndc_df + + dfg_df_df = db_query('SELECT * FROM dfg_df') + filtered_dfg_df_df = dfg_df_df[dfg_df_df['dfg'].isin(dfg_df_list) | dfg_df_df['df'].isin(dfg_df_list)] + df_list = filtered_dfg_df_df['df'].tolist() + + if method == 'include': + filtered_rxcui_ndc_df = rxcui_ndc_df[rxcui_ndc_df['dose_form_name'].isin(df_list)] + elif method == 'exclude': + filtered_rxcui_ndc_df = rxcui_ndc_df[~rxcui_ndc_df['dose_form_name'].isin(df_list)] + else: + filtered_rxcui_ndc_df = rxcui_ndc_df + + print("RXCUI list filtered on DF matched on {0} NDCs".format(filtered_rxcui_ndc_df['medication_ndc'].count())) + + return filtered_rxcui_ndc_df + def output_df(df,output='csv', filename='df_output'): """Outputs a dataframe to a csv of clipboard if you use the output=clipboard arguement""" From d87c2ca859d75c588a74246e39adb7eef5feb1a0 Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 17:25:41 -0500 Subject: [PATCH 42/46] Initial FDA subpackage setup --- src/mdt/fda/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/mdt/fda/__init__.py diff --git a/src/mdt/fda/__init__.py b/src/mdt/fda/__init__.py new file mode 100644 index 0000000..e69de29 From 08804335717282e8669af9401d0e0e9951767734 Mon Sep 17 00:00:00 2001 From: 
Yevgeny Bulochnik Date: Sun, 16 May 2021 17:28:42 -0500 Subject: [PATCH 43/46] fda utils module setup, get_dataset function --- src/mdt/fda/__init__.py | 1 + src/mdt/fda/utils.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 src/mdt/fda/utils.py diff --git a/src/mdt/fda/__init__.py b/src/mdt/fda/__init__.py index e69de29..eb018c3 100644 --- a/src/mdt/fda/__init__.py +++ b/src/mdt/fda/__init__.py @@ -0,0 +1 @@ +from . import utils diff --git a/src/mdt/fda/utils.py b/src/mdt/fda/utils.py new file mode 100644 index 0000000..0d7a332 --- /dev/null +++ b/src/mdt/fda/utils.py @@ -0,0 +1,17 @@ +import requests +from pathlib import Path + + +def get_dataset( + dest = Path.cwd(), + handler = None +): + url = f'https://www.accessdata.fda.gov/cder/ndctext.zip' + response = requests.get(url) + + if handler: + return handler(response.content) + + (dest / url.split('/')[-1]).write_bytes(response.content) + + return response From e1a413e902507a44bf6e91f319bac44584b0247b Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 17:46:00 -0500 Subject: [PATCH 44/46] load_fda function setup --- src/mdt/database.py | 35 ++++++++++++++++++++++++++++++++++- src/mdt/run_mdt.py | 3 ++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/src/mdt/database.py b/src/mdt/database.py index ae5f560..dc2266a 100644 --- a/src/mdt/database.py +++ b/src/mdt/database.py @@ -1,4 +1,4 @@ -from . import rxnorm, meps +from . 
import rxnorm, meps, fda from pathlib import Path import zipfile import io @@ -144,3 +144,36 @@ def load_meps(): meps_region_states = db_query("Select count(*) AS records from meps_region_states") print('DB table meps_region_states has {0} records'.format(meps_region_states['records'].iloc[0])) + + +def load_fda(): + '''Load FDA tables into db''' + + z = zipfile.ZipFile( + fda.utils.get_dataset(handler=io.BytesIO) + ) + product = pd.read_csv(z.open('product.txt'),sep='\t',dtype=object,header=0,encoding='cp1252') + package = pd.read_csv(z.open('package.txt'),sep='\t',dtype=object,header=0,encoding='cp1252') + sql_create_table('product',product) + sql_create_table('package',package) + del product + del package + + #deletes FDA ZIP + del z + + #NOTE: Rob's python code to join one of these tables with the rxcui_ndc table goes here + """ + rxcui_ndc_string = read_sql_string('rxcui_ndc.sql') + rxcui_ndc = db_query(rxcui_ndc_string) + sql_create_table('rxcui_ndc', rxcui_ndc) + del rxcui_ndc + """ + + + #TEST!!!!!!!!!!!!!!!! 
reads record count from created database + product = db_query("Select count(*) AS records from product limit 1") + print('DB table product has {0} records'.format(product['records'].iloc[0])) + + package = db_query("Select count(*) AS records from package limit 1") + print('DB table package has {0} records'.format(package['records'].iloc[0])) diff --git a/src/mdt/run_mdt.py b/src/mdt/run_mdt.py index 15f6697..cbafb2a 100644 --- a/src/mdt/run_mdt.py +++ b/src/mdt/run_mdt.py @@ -1,6 +1,6 @@ import sys from pathlib import Path -from mdt.database import load_rxnorm, load_meps +from mdt.database import load_rxnorm, load_meps, load_fda from mdt import rxnorm from mdt.utils import ( rxcui_ndc_matcher, @@ -18,6 +18,7 @@ def main(rxclass_id, rxclass_rela): if not (Path.cwd() / 'data' / 'MDT.db').exists(): load_rxnorm() load_meps() + load_fda() #Call RxClass FindClassesById API to get class info (name primarily) of the specified class rxclass_response = rxnorm.utils.rxapi_get_requestor( From 5f72506a44655f3ac3f320c6bd6e2535f1fa46ec Mon Sep 17 00:00:00 2001 From: Yevgeny Bulochnik Date: Sun, 16 May 2021 17:54:48 -0500 Subject: [PATCH 45/46] Dev setup in readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 0fc6b50..8d1839b 100644 --- a/README.md +++ b/README.md @@ -85,3 +85,11 @@ src/ │ │ │ ├─ hypothyroidism.json │ │ │ ├─ ... ``` + +## Contribution Guide +1. Set up a venv with `python -m venv venv`, this will create a directory called venv in your current working directory +2. Activate your venv with `source venv/bin/activate` or on windows `venv/Scripts/Activate` +3. Install MDT with `pip install -e .`, this sets up mdt as an installed editable package +4.
Run MDT with `python -m mdt.run_mdt D007037 may_treat` + - `run_mdt` takes two system args the rxclass_id and rxclass_rela these must be specified + - the initial run of `run_mdt` will download all necessary files and build the database in `data/` in the current working directory From cc1154d98327c5a521578e114d31fb8c9abaf43a Mon Sep 17 00:00:00 2001 From: kristentaytok Date: Sun, 16 May 2021 22:28:35 -0700 Subject: [PATCH 46/46] #30 adding marketyears to the generate_module function Taking Eugene's restructure branch (from PR #59), this updates the following: 1. Added cleaned version of Rob's code to database.py --> creates an ingredient_rxcui_year table & a product_rxcui_year table, for their respective distributions in generate_module 2. added 'year' column to the generate_module dataframes/CSV files. 3. fixed default_probability typo in utils.py (if idx == 1 --> changed to if idx == 0) --- src/mdt/database.py | 60 +++++++++++++++++++++++++++++++++++++++------ src/mdt/utils.py | 40 +++++++++++++++++++----------- 2 files changed, 79 insertions(+), 21 deletions(-) diff --git a/src/mdt/database.py b/src/mdt/database.py index dc2266a..3e216f9 100644 --- a/src/mdt/database.py +++ b/src/mdt/database.py @@ -1,9 +1,10 @@ -from .
import rxnorm, meps, fda +from mdt import rxnorm, meps, fda from pathlib import Path import zipfile import io import sqlite3 import pandas as pd +from datetime import datetime def to_data(): @@ -152,28 +153,73 @@ def load_fda(): z = zipfile.ZipFile( fda.utils.get_dataset(handler=io.BytesIO) ) + + #moves FDA files to sqlite database by reading as dataframes product = pd.read_csv(z.open('product.txt'),sep='\t',dtype=object,header=0,encoding='cp1252') package = pd.read_csv(z.open('package.txt'),sep='\t',dtype=object,header=0,encoding='cp1252') sql_create_table('product',product) sql_create_table('package',package) - del product - del package + #deletes FDA ZIP del z - #NOTE: Rob's python code to join one of these tables with the rxcui_ndc table goes here - """ + + + #join product table with the rxcui_ndc table rxcui_ndc_string = read_sql_string('rxcui_ndc.sql') rxcui_ndc = db_query(rxcui_ndc_string) sql_create_table('rxcui_ndc', rxcui_ndc) - del rxcui_ndc - """ + product['PRODUCTNDC'] = product['PRODUCTNDC'].str.replace('-', '').str.zfill(9) + rxcui_ndc['medication_ndc'] = rxcui_ndc['medication_ndc'].astype(str).str.zfill(9) + product_rxcui = product.merge(rxcui_ndc, left_on = 'PRODUCTNDC', right_on = rxcui_ndc['medication_ndc'].str.slice(start=0,stop=9), how = 'left') + + + #extract year from startmarketingdate & endmarketingdate + #fill NULL endmarketingyear with current year + product_rxcui['STARTMARKETINGYEAR'] = product_rxcui['STARTMARKETINGDATE'].str.slice(start=0, stop=4).astype(int) + product_rxcui['ENDMARKETINGYEAR'] = product_rxcui['ENDMARKETINGDATE'].str.slice(start=0, stop=4) + product_rxcui['ENDMARKETINGYEAR'] = product_rxcui['ENDMARKETINGYEAR'].fillna(datetime.now().year) + product_rxcui['ENDMARKETINGYEAR'] = product_rxcui['ENDMARKETINGYEAR'].astype(int) + product_rxcui = product_rxcui[['medication_ingredient_rxcui', 'medication_ingredient_name', 'medication_product_rxcui', + 'medication_product_name', 'STARTMARKETINGYEAR', 'ENDMARKETINGYEAR']] + + 
med_marketing_year_dict = {} + med_state_level_list = ['medication_ingredient', 'medication_product'] + + #create a dictionary of df's (one for ingredient, other for product) that contains a range of years that each rxcui was available o nthe market + def med_marketing_year(med_state_level_list): + for med_state_level in med_state_level_list: + #takes MIN startmarketingdate and MAX endmarketingdate for each rxcui + med_marketing_year_dict[med_state_level+'_max_marketingyear_range'] = product_rxcui.groupby([med_state_level+'_rxcui', med_state_level+'_name']).agg({'STARTMARKETINGYEAR': 'min', 'ENDMARKETINGYEAR': 'max'}).reset_index() + + #creates a row for each year between startmarketingdate and endmarketingdate for each rxcui + zipped = zip(med_marketing_year_dict[med_state_level+'_max_marketingyear_range'][med_state_level+'_rxcui'], med_marketing_year_dict[med_state_level+'_max_marketingyear_range']['STARTMARKETINGYEAR'], med_marketing_year_dict[med_state_level+'_max_marketingyear_range']['ENDMARKETINGYEAR']) + med_marketing_year_dict[med_state_level+'_rxcui_years'] = pd.DataFrame([(i, y) for i, s, e in zipped for y in range(s, e+1)], + columns=[med_state_level+'_rxcui','year']) + sql_create_table(med_state_level+'_rxcui_years',med_marketing_year_dict[med_state_level+'_rxcui_years']) + print(med_state_level+'_rxcui_years') + + med_marketing_year(med_state_level_list) + + #deletes other dataframes + del product + del package + del rxcui_ndc + del medication_ingredient_rxcui_years + del medication_product_rxcui_years + #TEST!!!!!!!!!!!!!!!! 
reads record count from created database product = db_query("Select count(*) AS records from product limit 1") print('DB table product has {0} records'.format(product['records'].iloc[0])) package = db_query("Select count(*) AS records from package limit 1") print('DB table package has {0} records'.format(package['records'].iloc[0])) + + medication_product_rxcui_years = db_query("Select count(*) AS records from medication_product_rxcui_years limit 1") + print('DB table medication_product_rxcui_years has {0} records'.format(medication_product_rxcui_years['records'].iloc[0])) + + medication_ingredient_rxcui_years = db_query("Select count(*) AS records from medication_ingredient_rxcui_years limit 1") + print('DB table medication_ingredient_rxcui_years has {0} records'.format(medication_ingredient_rxcui_years['records'].iloc[0])) \ No newline at end of file diff --git a/src/mdt/utils.py b/src/mdt/utils.py index 662f62c..367edbe 100644 --- a/src/mdt/utils.py +++ b/src/mdt/utils.py @@ -117,9 +117,16 @@ def generate_module(rxcui_ndc_df, rxclass_name): #Read in MEPS Reference table meps_reference = db_query(meps.utils.get_sql('meps_reference.sql')) + #Read in FDA Ingredient-RxCUI-Years Reference table (for years that a given ingredient was available on the market) + ingredient_rxcui_years = db_query('SELECT * FROM medication_ingredient_rxcui_years') + + #Read in FDA Product-RxCUI-Years Reference table (for years that a given product was available on the market) + product_rxcui_years = db_query('SELECT * FROM medication_product_rxcui_years') + #Join MEPS to filtered rxcui_ndc dataframe (rxcui_list) meps_rxcui = meps_reference.astype(str).merge(rxcui_ndc_df.astype(str)[['medication_ingredient_name', 'medication_ingredient_rxcui','medication_product_name', 'medication_product_rxcui', 'medication_ndc']], how = 'inner', left_on = 'RXNDC', right_on = 'medication_ndc') + #Optional: Age range join - can be customized in the mdt_config.json file #groupby_demographic_variable: must 
be either an empty list [] or list of patient demographics (e.g., age, gender, state) - based on user inputs in the mdt_config.json file @@ -162,20 +169,23 @@ def generate_module(rxcui_ndc_df, rxclass_name): filename = rxclass_name + '_ingredient_distrib' #1 - dcp_dict['patient_count_ingredient'] = meps_rxcui[['medication_ingredient_name', 'medication_ingredient_rxcui', 'person_weight', 'DUPERSID']+groupby_demographic_variables].groupby(['medication_ingredient_name', 'medication_ingredient_rxcui', 'person_weight']+groupby_demographic_variables)['DUPERSID'].nunique() + #Join MEPS to ingredient_rxcui_years dataframe (rxcuis_by_fda_marketingdates) + meps_rxcui_ingred_years = meps_rxcui.astype(str).merge(ingredient_rxcui_years.astype(str)[['medication_ingredient_rxcui', 'year']], how = 'inner', on = 'medication_ingredient_rxcui') + dcp_dict['patient_count_ingredient'] = meps_rxcui_ingred_years[['medication_ingredient_name', 'medication_ingredient_rxcui', 'year', 'person_weight', 'DUPERSID']+groupby_demographic_variables].groupby(['medication_ingredient_name', 'medication_ingredient_rxcui', 'year', 'person_weight']+groupby_demographic_variables)['DUPERSID'].nunique() dcp_df = pd.DataFrame(dcp_dict['patient_count_ingredient']).reset_index() #2 dcp_df['weighted_patient_count_ingredient'] = dcp_df['person_weight'].astype(float)*dcp_df['DUPERSID'] #3 - dcp_dict['patients_by_demographics_ingredient'] = dcp_df.groupby(['medication_ingredient_name']+groupby_demographic_variables)['weighted_patient_count_ingredient'].sum() + dcp_dict['patients_by_demographics_ingredient'] = dcp_df.groupby(['medication_ingredient_name', 'year']+groupby_demographic_variables)['weighted_patient_count_ingredient'].sum() dcp_demographic_df = pd.DataFrame(dcp_dict['patients_by_demographics_ingredient']).reset_index() #4 if len(groupby_demographic_variables) > 0: - dcp_demographictotal_df = pd.merge(dcp_demographic_df, 
dcp_demographic_df.groupby(groupby_demographic_variables)['weighted_patient_count_ingredient'].sum(), how = 'inner', left_on = groupby_demographic_variables, right_index=True, suffixes = ('_demographic', '_total')) + dcp_demographictotal_df = pd.merge(dcp_demographic_df, dcp_demographic_df.groupby(groupby_demographic_variables+['year'])['weighted_patient_count_ingredient'].sum(), how = 'inner', left_on = groupby_demographic_variables+['year'], right_index=True, suffixes = ('_demographic', '_total')) else: - dcp_demographictotal_df = dcp_demographic_df - dcp_demographictotal_df['weighted_patient_count_ingredient_demographic'] = dcp_demographic_df['weighted_patient_count_ingredient'] - dcp_demographictotal_df['weighted_patient_count_ingredient_total'] = dcp_demographic_df['weighted_patient_count_ingredient'].sum() + # dcp_demographictotal_df = dcp_demographic_df + # dcp_demographictotal_df['weighted_patient_count_ingredient_demographic'] = dcp_demographic_df['weighted_patient_count_ingredient'] + # dcp_demographictotal_df['weighted_patient_count_ingredient_total'] = dcp_demographic_df['weighted_patient_count_ingredient'].sum() + dcp_demographictotal_df = pd.merge(dcp_demographic_df, dcp_demographic_df.groupby('year')['weighted_patient_count_ingredient'].sum(), how = 'inner', left_on = 'year', right_index=True, suffixes = ('_demographic', '_total')) #5 dcp_demographictotal_df['percent_ingredient_patients'] = round(dcp_demographictotal_df['weighted_patient_count_ingredient_demographic']/dcp_demographictotal_df['weighted_patient_count_ingredient_total'], 3) #6 TODO: change this column to medication_product_state_name(?) 
@@ -199,9 +209,9 @@ def generate_module(rxcui_ndc_df, rxclass_name): #7 dcp_dict['percent_ingredient_patients'] = dcp_demographictotal_df if len(groupby_demographic_variables) > 0: - dcp_dict['percent_ingredient_patients'] = dcp_dict['percent_ingredient_patients'].reset_index().pivot(index= groupby_demographic_variables, columns = 'medication_ingredient_name', values='percent_ingredient_patients').reset_index() + dcp_dict['percent_ingredient_patients'] = dcp_dict['percent_ingredient_patients'].reset_index().pivot(index= groupby_demographic_variables+['year'], columns = 'medication_ingredient_name', values='percent_ingredient_patients').reset_index() else: - dcp_dict['percent_ingredient_patients'] = dcp_dict['percent_ingredient_patients'][['medication_ingredient_name', 'percent_ingredient_patients']].set_index('medication_ingredient_name').T + dcp_dict['percent_ingredient_patients'] = dcp_dict['percent_ingredient_patients'][['medication_ingredient_name', 'percent_ingredient_patients', 'year']].set_index('medication_ingredient_name').T #Fill NULLs and save as CSV dcp_dict['percent_ingredient_patients'].fillna(0, inplace=True) @@ -216,17 +226,19 @@ def generate_module(rxcui_ndc_df, rxclass_name): for ingred_name in medication_ingredient_list: filename = rxclass_name + '_product_' + ingred_name + '_distrib' #0 - meps_rxcui_ingred = meps_rxcui[meps_rxcui['medication_ingredient_name']==ingred_name][['medication_product_name', 'medication_product_rxcui', 'medication_ingredient_name', 'medication_ingredient_rxcui', 'person_weight', 'DUPERSID']+groupby_demographic_variables] + #Join MEPS to product_rxcui_years dataframe (rxcuis_by_fda_marketingdates) + meps_rxcui_prod_years = meps_rxcui.astype(str).merge(product_rxcui_years.astype(str)[['medication_product_rxcui', 'year']], how = 'inner', on = 'medication_product_rxcui') + meps_rxcui_ingred = meps_rxcui_prod_years[meps_rxcui_prod_years['medication_ingredient_name']==ingred_name][['medication_product_name', 
'medication_product_rxcui', 'medication_ingredient_name', 'medication_ingredient_rxcui', 'year', 'person_weight', 'DUPERSID']+groupby_demographic_variables] #1 - dcp_dict['patient_count_product'] = meps_rxcui_ingred.groupby(['medication_product_name', 'medication_product_rxcui', 'medication_ingredient_name', 'medication_ingredient_rxcui', 'person_weight']+groupby_demographic_variables)['DUPERSID'].nunique() + dcp_dict['patient_count_product'] = meps_rxcui_ingred.groupby(['medication_product_name', 'medication_product_rxcui', 'medication_ingredient_name', 'medication_ingredient_rxcui', 'year', 'person_weight']+groupby_demographic_variables)['DUPERSID'].nunique() dcp_df = pd.DataFrame(dcp_dict['patient_count_product']).reset_index() #2 dcp_df['weighted_patient_count_product'] = dcp_df['person_weight'].astype(float)*dcp_df['DUPERSID'] #3 - dcp_dict['patients_by_demographics_product'] = dcp_df.groupby(['medication_product_name', 'medication_ingredient_name']+groupby_demographic_variables)['weighted_patient_count_product'].sum() + dcp_dict['patients_by_demographics_product'] = dcp_df.groupby(['medication_product_name', 'medication_ingredient_name', 'year']+groupby_demographic_variables)['weighted_patient_count_product'].sum() dcp_demographic_df = pd.DataFrame(dcp_dict['patients_by_demographics_product']).reset_index() #4 - dcp_demographictotal_df = pd.merge(dcp_demographic_df, dcp_demographic_df.groupby(['medication_ingredient_name']+groupby_demographic_variables)['weighted_patient_count_product'].sum(), how = 'inner', left_on = ['medication_ingredient_name']+groupby_demographic_variables, right_index=True, suffixes = ('_demographic', '_total')) + dcp_demographictotal_df = pd.merge(dcp_demographic_df, dcp_demographic_df.groupby(['medication_ingredient_name', 'year']+groupby_demographic_variables)['weighted_patient_count_product'].sum(), how = 'inner', left_on = ['medication_ingredient_name', 'year']+groupby_demographic_variables, right_index=True, suffixes = 
('_demographic', '_total')) #5 dcp_demographictotal_df['percent_product_patients'] = round(dcp_demographictotal_df['weighted_patient_count_product_demographic']/dcp_demographictotal_df['weighted_patient_count_product_total'], 3) #6 TODO: change this column to medication_product_state_name or medication_product_transition_name(?) @@ -250,9 +262,9 @@ def generate_module(rxcui_ndc_df, rxclass_name): #7 dcp_dict['percent_product_patients'] = dcp_demographictotal_df if len(groupby_demographic_variables) > 0: - dcp_dict['percent_product_patients'] = dcp_dict['percent_product_patients'].reset_index().pivot(index= groupby_demographic_variables, columns = 'medication_product_name', values='percent_product_patients').reset_index() + dcp_dict['percent_product_patients'] = dcp_dict['percent_product_patients'].reset_index().pivot(index= groupby_demographic_variables+['year'], columns = 'medication_product_name', values='percent_product_patients').reset_index() else: - dcp_dict['percent_product_patients'] = dcp_dict['percent_product_patients'][['medication_product_name', 'percent_product_patients']].set_index('medication_product_name').T + dcp_dict['percent_product_patients'] = dcp_dict['percent_product_patients'][['medication_product_name', 'percent_product_patients', 'year']].set_index('medication_product_name').T #Fill NULLs and save as CSV dcp_dict['percent_product_patients'].fillna(0, inplace=True)