From 1557f3611b4758b747ba84c425240141bb18cc3b Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 22 Aug 2023 14:30:47 -0400 Subject: [PATCH 01/14] added scripts --- scripts/npi_to_mbi.py | 51 +++++++ scripts/significance_corr.py | 265 +++++++++++++++++++++++++++++++++++ 2 files changed, 316 insertions(+) create mode 100644 scripts/npi_to_mbi.py create mode 100644 scripts/significance_corr.py diff --git a/scripts/npi_to_mbi.py b/scripts/npi_to_mbi.py new file mode 100644 index 0000000..5aa3607 --- /dev/null +++ b/scripts/npi_to_mbi.py @@ -0,0 +1,51 @@ +import pandas as pd +from pathlib import Path + +## Load NPIQ or NPI scores and convert to MBI framework +## Domains: https://adni.bitbucket.io/reference/npi.html + + +def _mbi_conversion(df): + # Calculate MBI domains + df["decreased_motivation"] = df["NPIG"] + df["emotional_dysregulation"] = df["NPID"] + df["NPIE"] + df["NPIF"] + df["impulse_dyscontrol"] = df["NPIC"] + df["NPII"] + df["NPIJ"] + df["social_inappropriateness"] = df["NPIH"] + df["abnormal_perception"] = df["NPIA"] + df["NPIB"] + + # Calculate MBI total score + mbi_domains = [ + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + ] + df["MBI_total_score"] = df[mbi_domains].sum(axis=1) + df["mbi_status"] = (df["MBI_total_score"] >= 1).astype(int) + + mbi_columns = [ + "RID", + "EXAMDATE", + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + "MBI_total_score", + "mbi_status", + ] + mbi_df = df[mbi_columns].copy() + + return mbi_df + + +root_p = Path("__file__").resolve().parents[1] / "data" / "adni" + +# Load NPI or NPI-Q scores +npi_df = pd.read_csv(root_p / "NPIQ_22Aug2023.csv") + +mbi_df = _mbi_conversion(npi_df) + +# Save results - make sure to change name +mbi_df.to_csv(root_p / "adni_npiq_mbi_status_20230822.csv", index=False) diff --git a/scripts/significance_corr.py b/scripts/significance_corr.py new file mode 100644 index 0000000..05a1e7e --- /dev/null +++ b/scripts/significance_corr.py @@ -0,0 +1,265 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +import util +import time +import os +import itertools +from statsmodels.stats.multitest import multipletests +from scipy.stats import pearsonr +from argparse import ArgumentParser + + +# SEBASTIEN URCHS +def p_permut(empirical_value, permutation_values): + n_permutation = len(permutation_values) + if empirical_value >= 0: + return (np.sum(permutation_values > empirical_value) + 1) / (n_permutation + 1) + return (np.sum(permutation_values < empirical_value) + 1) / (n_permutation + 1) + + +def filter_fdr(df, contrasts): + df_filtered = df[ + (df["pair0"].isin(contrasts)) & (df["pair1"].isin(contrasts)) + ].copy() + _, fdr, _, _ = multipletests(df_filtered["pval"], method="fdr_bh") + df_filtered["fdr_filtered"] = fdr + return df_filtered + + +def mat_form(df, contrasts, value="betamap_corr"): + n = len(contrasts) + d = dict(zip(contrasts, range(n))) + mat = np.zeros((n, n)) + for c in contrasts: + # fill out vertical strip of mat + for i in range(n): + if i == d[c]: + val = 1 + else: + val = df[ + ((df["pair0"] == c) | (df["pair1"] == c)) + & ((df["pair0"] == contrasts[i]) | (df["pair1"] == contrasts[i])) + ][value] + mat[i, d[c]] = val + mat[d[c], i] = val + return pd.DataFrame(mat, columns=contrasts, index=contrasts) + + +def make_matrices(df, contrasts, fdr="fdr_filtered"): + "Param fdr can be set to 
'fdr_filtered': FDR is performed using the pvalues only from the chosen contrasts" + " or 'fdr': values taken from FDR performed on full set of 42 contrasts" + if fdr == "fdr_filtered": + df = filter_fdr(df, contrasts) + mat_corr = mat_form(df, contrasts, value="betamap_corr") + mat_pval = mat_form(df, contrasts, value="pval") + mat_fdr = mat_form(df, contrasts, value=fdr) + return mat_corr, mat_pval, mat_fdr + + +def get_corr_dist(cases, nulls, path_out, tag="wholeconn"): + # For each unique pair, between the null maps. + n_pairs = int((len(cases)) * (len(cases) - 1) / 2) + corr = np.zeros((n_pairs, 5000)) + + print( + "Getting correlation between 5000 null maps for {} unique pairs for {} cases...".format( + n_pairs, len(cases) + ) + ) + pair = [] + l = 0 + for i in itertools.combinations(cases, 2): + for j in range(5000): + corr[l, j] = pearsonr( + nulls.loc[i[0]].values[j, :], nulls.loc[i[1]].values[j, :] + )[0] + + pair.append(i) + if l % 50 == 0: + print("{}/{}".format(l, n_pairs)) + l = l + 1 + + df = pd.DataFrame(corr) + df["pair"] = pair + df.to_csv(os.path.join(path_out, "correlation_dist_{}.csv".format(tag))) + return df + + +def get_corr(cases, betas, path_out, tag="wholeconn"): + # For each unique pair, correlation between betamaps. Use standardized betas here (as in rest of paper). + n_pairs = int((len(cases)) * (len(cases) - 1) / 2) + corr = np.zeros(n_pairs) + + print( + "Getting correlation between betamaps for {} unique pairs for {} cases...".format( + n_pairs, len(cases) + ) + ) + pair = [] + l = 0 + for i in itertools.combinations(cases, 2): + corr[l] = pearsonr(betas.loc[i[0]].values, betas.loc[i[1]].values)[0] + l = l + 1 + pair.append(i) + df = pd.DataFrame(corr) + df["pair"] = pair + df.to_csv(os.path.join(path_out, "correlation_betas_{}.csv".format(tag))) + return df + + +def get_corr_pval(maps, nulls, betas, path_out, tag="wholeconn"): + df = get_corr_dist(maps, nulls, path_out, tag=tag) + df_bb = get_corr(maps, betas, path_out, tag=tag) + + df_bb = df_bb.rename(columns={0: "betamap_corr"}) + df_master = df_bb.merge(df, on="pair") + + print("Calculating pvals...") + # CALCULATE PVALS + pval = [] + for i in df_master.index: + p = p_permut(df_master.loc[i, "betamap_corr"], df_master[range(5000)].loc[i]) + pval.append(p) + df_master["pval"] = pval + + # ADD LABELS + pair0 = [p[0] for p in df["pair"].tolist()] + pair1 = [p[1] for p in df["pair"].tolist()] + df_master["pair0"] = pair0 + df_master["pair1"] = pair1 + + df_compact = df_master[["pair0", "pair1", "betamap_corr", "pval"]] + df_compact.to_csv( + os.path.join(path_out, "corr_pval_null_v_null_{}.csv".format(tag)) + ) + + return df_compact + + +def _make_region_masks(row): + mask = np.tri(64, k=0, dtype=bool) + + # Get the parcel number from the beginning of the row and subtract 1 + parcel_num = int(row[0]) - 1 + + # Get the parcel name from the second column value + parcel_name = str(row[1]).upper().replace(" ", "_") + + # Generate the mask + parcel = np.zeros((64, 64), bool) + parcel[:, parcel_num] = True + parcel_mask = parcel + np.transpose(parcel) + parcel_mask = np.tril(parcel_mask) + parcel_mask = parcel_mask[mask] + + return (parcel_mask, parcel_name) + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument( + "--n_path_mc", help="path to mc null models dir", dest="n_path_mc" + ) + parser.add_argument("--b_path_mc", help="path to mc betamaps dir", dest="b_path_mc") + parser.add_argument("--path_out", help="path to output directory", dest="path_out") + parser.add_argument( + 
"--path_corr", help="path to corr dir", dest="path_corr", default=None + ) + parser.add_argument( + "--path_parcellation", help="path to MIST_64.cv", dest="path_parcellation" + ) + args = parser.parse_args() + + n_path_mc = os.path.join(args.n_path_mc, "null_model_{}_vs_control_mc.npy") + # cont_n_path_mc = os.path.join(args.n_path_mc,'{}_null_model_mc.npy') + b_path_mc = os.path.join(args.b_path_mc, "{}_vs_con_mean.csv") + # cont_b_path_mc = os.path.join(args.b_path_mc,'cont_{}_results_mc.csv') + path_out = args.path_out + path_corr = args.path_corr + + cases = ["mci_neg", "mci_pos", "sz"] + + maps = cases # + cont + + ############# + # LOAD DATA # + ############# + null = [] + beta_std = [] + + for c in cases: + null.append(pd.DataFrame(np.load(n_path_mc.format(c)))) + beta_std.append(pd.read_csv(b_path_mc.format(c))["stand_betas"].values) + + betamaps_std = pd.DataFrame(beta_std, index=maps) + nullmodels = pd.concat(null, keys=maps) + + #################### + # WHOLE CONNECTOME # + #################### + print("Creating correlation distributions for whole connectome...") + df_wc = get_corr_pval(maps, nullmodels, betamaps_std, path_out, tag="wholeconn") + + # make matrices + print("Preparing correlation matrices...") + subset = ["mci_neg", "mci_pos", "sz"] + + corr, pval, fdr = make_matrices(df_wc, subset, fdr="fdr_filtered") + + corr.to_csv(os.path.join(path_out, "FC_corr_wholebrain_mc_null_v_null.csv")) + pval.to_csv(os.path.join(path_out, "FC_corr_pval_wholebrain_mc_null_v_null.csv")) + fdr.to_csv( + os.path.join(path_out, "FC_corr_fdr_filtered_wholebrain_mc_null_v_null.csv") + ) + + #################### + # REGIONS # + #################### + df_MIST = pd.read_csv(args.path_parcellation, delimiter=";") + + for index, row in df_MIST.iterrows(): + # create region mask + parcel_mask, parcel_name = _make_region_masks(row) + + # filter maps + null_parcel = [n.transpose()[parcel_mask].transpose() for n in null] + beta_std_parcel = [b[parcel_mask] for b in beta_std] + + betamaps_std_parcel = pd.DataFrame(beta_std_parcel, index=maps) + nullmodels_parcel = pd.concat(null_parcel, keys=maps) + + print(f"Creating correlation distributions for {parcel_name}...") + df_parcel = get_corr_pval( + maps, nullmodels_parcel, betamaps_std_parcel, path_out, tag=parcel_name + ) + + # make matrices + corr_parcel, pval_parcel, fdr_filtered_parcel = make_matrices( + df_parcel, subset, fdr="fdr_filtered" + ) + + _, pval_parcel_fdr_corrected, _, _ = multipletests(pval_parcel, method="fdr_bh") + + # TO DO - see if I need to correct fdr filtered, I think so? 
+ if (pval_parcel < 0.05).any(): + corr_parcel.to_csv( + os.path.join(path_out, f"FC_corr_{parcel_name}_mc_null_v_null.csv") + ) + pval_parcel.to_csv( + os.path.join(path_out, f"FC_corr_pval_{parcel_name}_mc_null_v_null.csv") + ) + pval_parcel_fdr_corrected.to_csv( + os.path.join( + path_out, + f"FC_corr_pval_fdr_corrected_{parcel_name}_mc_null_v_null.csv", + ) + ) + fdr_filtered_parcel.to_csv( + os.path.join( + path_out, f"FC_corr_fdr_filtered_{parcel_name}_mc_null_v_null.csv" + ) + ) + + print("Done!") From 80fe67e78a2dbe1f41f339a31fa97d6e67deb7d6 Mon Sep 17 00:00:00 2001 From: Natasha Clarke <57987005+clarkenj@users.noreply.github.com> Date: Tue, 22 Aug 2023 15:28:39 -0400 Subject: [PATCH 02/14] Delete baseline_biomarkers.py No longer using this approach --- scripts/baseline_biomarkers.py | 173 --------------------------------- 1 file changed, 173 deletions(-) delete mode 100644 scripts/baseline_biomarkers.py diff --git a/scripts/baseline_biomarkers.py b/scripts/baseline_biomarkers.py deleted file mode 100644 index e7ea469..0000000 --- a/scripts/baseline_biomarkers.py +++ /dev/null @@ -1,173 +0,0 @@ -import pandas as pd -import numpy as np - -from functools import reduce -from pathlib import Path - -# paths to files - -adni_data = Path("__file__").resolve().parent / 'data' / 'adni_spreadsheet.csv' -tau_data = Path("__file__").resolve().parent / 'data' / 'UCBERKELEYAV1451_04_26_22.csv' -other_biomarker_data = Path("__file__").resolve().parent / 'data' / 'ADNIMERGE.csv' - -# functions - -def get_baselines(file): - - # load baseline phenotypic data - pheno = pd.read_csv(file, index_col=0, header=0) - - # keep only the variables of interest - pheno = pheno.filter(['Subject ID','Phase','Sex','Research Group', 'Visit','Study Date','Age'], axis=1) - - # convert 'study date' to 'session' in datetime format, to match other spreadsheets - pheno['session'] = pd.to_datetime(pheno['Study Date']) - - # pull out only the subject id and asign it to the index - pheno_subj = [] - for i in pheno['Subject ID']: - subj = i.split('_')[2].lstrip("0") # remove leading zeros since it won't match ADNI IDs - pheno_subj.append(subj) - - pheno.index = pheno_subj - pheno.rename_axis('RID',inplace=True) - pheno.index = pheno.index.astype('int64') - - # separate patients and controls, because (in theory) we can use any control visit as baseline, but - # for patients we want their actual baseline data - patient_diagnoses = ['AD', 'EMCI', 'LMCI', 'MCI', 'SMC'] - patient_df = pheno[pheno['Research Group'].isin(patient_diagnoses)] # df of patient diagnoses - - control_df = pheno.loc[pheno['Research Group'] == 'CN'] # df of control diagnoses - - # I think these visits are acceptable as baseline data, i.e. 
actual baseline +/-3 months, excluding - # any initial visits where patient continued from a previous phase - bl_visits = ['ADNI Screening','ADNI2 Month 3 MRI-New Pt', 'ADNI2 Screening MRI-New Pt', - 'ADNIGO Month 3 MRI','ADNIGO Screening MRI'] - - patient_df_bl = patient_df[patient_df['Visit'].isin(bl_visits)] - - # rejoin the patients to the controls - new_df = pd.concat([control_df,patient_df_bl]) - - # select the earliest visit available for each participant - new_df.sort_values(['Subject ID', 'Age'], inplace=True) # sort by age - baseline_df = new_df[~new_df.duplicated(['Subject ID'], keep='first')] # select the first row - - # sort df by index - baseline_df.sort_values(by='RID', inplace=True) - - # calculate window for acceptable biomarker data, currently +- 12months - baseline_df.loc[:,('date_lower')] = baseline_df.loc[:,('session')] - pd.DateOffset(months=18) - baseline_df.loc[:,('date_upper')] = baseline_df.loc[:,('session')] + pd.DateOffset(months=18) - - return (baseline_df) - -def load_biomarker_df(biomarker_data): - - # load data - biomarker_df = pd.read_csv(biomarker_data, index_col=0, header=0, low_memory=False) - - # convert examdate to datetime - biomarker_df['EXAMDATE'] = pd.to_datetime(biomarker_df['EXAMDATE']) - - # sort df by index and date - biomarker_df.sort_values(by=['RID', 'EXAMDATE'],inplace=True) - - # create column from index (useful for later functions) - biomarker_df['RID'] = biomarker_df.index - - return (biomarker_df) - -def match_baselines_biomarker(biomarker_df, baseline_df, biomarker): - - df = pd.DataFrame(columns=['RID',biomarker,biomarker+'_EXAMDATE']) #create df - common_ids = biomarker_df.index.intersection(baseline_df.index) #find ids common to the biomarker and baseline dfs - biomarker_df = biomarker_df.set_index('EXAMDATE') #reindex, needed to use 'nearest' method - - for rid in common_ids: - participant_df = biomarker_df[(biomarker_df['RID'] == rid)] #create df of all participants results - participant_df = participant_df.dropna(subset=[biomarker]) #drop NaNs, since we only want nearest date with result available - - baseline = baseline_df.loc[rid] #create df of participants baseline data - session = baseline['session'] #participant's baseline date - - if participant_df.empty: - pass - else: - idx_nearest = participant_df.index.get_loc(session, method='nearest') #find the index of the closest test date to session - nearest_date = participant_df.index[idx_nearest] #which date is it - nearest_result = participant_df[biomarker][idx_nearest] #find the biomarker result associated with closest date - - df.loc[len(df)] = [rid,nearest_result,nearest_date] #add to df - - df = df.set_index('RID') - - return (df) - -def check_visit_window(biomarker_df, baseline_df, biomarker): - - ''' - Join closest biomarkers to baseline info, check if the result was collected on a date within the baseline - window, and if not replace with NaN. Drop unwanted columns and return biomarker info again, ready to merge. 
- ''' - - # create new df, merging biomarker data into baseline df - baseline_bio = baseline_df.join(biomarker_df) - - # create mask of date range, between lower and upper acceptable dates - mask_date_range = (baseline_bio[biomarker+'_EXAMDATE'] > baseline_bio['date_lower']) & (baseline_bio[biomarker+'_EXAMDATE'] < baseline_bio['date_upper']) - - # fill values collected outside date range with NaN - baseline_bio[biomarker][~mask_date_range] = np.nan - - cols_to_drop = ['Subject ID', - 'Phase', - 'Sex', - 'Research Group', - 'Visit', - 'Study Date', - 'Age', - 'session', - 'date_lower', - 'date_upper'] - - baseline_bio = baseline_bio.drop(cols_to_drop, axis=1) #drop unwanted columns - - return (baseline_bio) - -def get_biomarker(biomarker_df, baseline_df, biomarker): - - #runs two functions to match the baseline to closest biomarker result, and then checks if it falls in the window - - find_nearest_biomarker = match_baselines_biomarker(biomarker_df, baseline_df, biomarker) - window_checked = check_visit_window(find_nearest_biomarker, baseline_df, biomarker) - - return (window_checked) - -if __name__ == '__main__': - - #get baselines from adni spreadsheet - baseline_df = get_baselines(adni_data) - - # load biomarker files - tau_df = load_biomarker_df(tau_data) - other_biomarkers_df = load_biomarker_df(other_biomarker_data) - - # create one df per biomarker with closest result within a one year window of participant baseline - tau = get_biomarker(tau_df, baseline_df, 'META_TEMPORAL_SUVR') - abeta = get_biomarker(other_biomarkers_df, baseline_df, 'ABETA') - ptau = get_biomarker(other_biomarkers_df, baseline_df, 'PTAU') - av45 = get_biomarker(other_biomarkers_df, baseline_df, 'AV45') - fbb = get_biomarker(other_biomarkers_df, baseline_df, 'FBB') - - # create list of dataframes (baseline data and all individual biomarkers) - data_frames = [baseline_df, tau, abeta, ptau, av45, fbb] - - # merge dataframes - master_biomarkers = reduce(lambda left, right: - pd.merge_asof(left, right, left_index=True, right_index=True), - data_frames) - - # save baseline and matched biomarker data in data directory - master_biomarkers.to_csv(Path("__file__").resolve().parent / 'data' / 'master_biomarkers.csv') From 5742a96dad10a9aeb5905ee7350751f1f4749d2c Mon Sep 17 00:00:00 2001 From: Natasha Clarke <57987005+clarkenj@users.noreply.github.com> Date: Tue, 22 Aug 2023 15:28:51 -0400 Subject: [PATCH 03/14] Delete biomarker_thresholding.ipynb https://github.com/SIMEXP/ad_sz/tree/new_afc_scripts/scripts --- scripts/biomarker_thresholding.ipynb | 294 --------------------------- 1 file changed, 294 deletions(-) delete mode 100644 scripts/biomarker_thresholding.ipynb diff --git a/scripts/biomarker_thresholding.ipynb b/scripts/biomarker_thresholding.ipynb deleted file mode 100644 index dedac85..0000000 --- a/scripts/biomarker_thresholding.ipynb +++ /dev/null @@ -1,294 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 22, - "id": "d213377a", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "9a4034fa", - "metadata": {}, - "outputs": [], - "source": [ - "# set thresholds and calculate 5% margin\n", - "\n", - "# CSF biomarkers\n", - "ab42_threshold = 980\n", - "ab42_margin = (5*ab42_threshold)/100\n", - "ab42_upper = ab42_threshold+ab42_margin\n", - "ab42_lower = ab42_threshold-ab42_margin\n", - "\n", - "ptau_threshold = 23\n", - "ptau_margin = 
(5*ptau_threshold)/100\n", - "ptau_upper = ptau_threshold+ptau_margin\n", - "ptau_lower = ptau_threshold-ptau_margin\n", - "\n", - "# PET biomarkers\n", - "fbb_threshold = 1.08\n", - "fbb_margin = (5*fbb_threshold)/100\n", - "fbb_upper = fbb_threshold+fbb_margin\n", - "fbb_lower = fbb_threshold-fbb_margin\n", - "\n", - "av45_threshold = 1.08\n", - "av45_margin = (5*av45_threshold)/100\n", - "av45_upper = av45_threshold+av45_margin\n", - "av45_lower = av45_threshold-av45_margin" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a913b27a", - "metadata": {}, - "outputs": [], - "source": [ - "def calculate_tau_cutoff(df):\n", - " \n", - " '''\n", - " Take a df with biomarker results for all participants. Select controls only. Find those negative for amyloid\n", - " PET (either av45 or FBB). For those who hav flortaucipir results, calculate average. Calculate +2SD, this is\n", - " the flortaucipir cutoff.\n", - " \n", - " '''\n", - " \n", - " controls = df.loc[df['Research Group'] == 'CN']\n", - " \n", - " controls_Apet_neg = (controls[(controls['FBB'] < fbb_lower) | \n", - " (controls['AV45'] < av45_lower)])\n", - " \n", - " tau_mean = controls_Apet_neg['META_ROI'].mean()\n", - " tau_sd = controls_Apet_neg['META_ROI'].std()\n", - " tau_cutoff = tau_mean+(tau_sd*2)\n", - " \n", - " return (tau_cutoff) " - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "bfa2dd1c", - "metadata": {}, - "outputs": [], - "source": [ - "# paths to files\n", - "biomarker_data = Path(\"__file__\").resolve().parents[1] / 'data' / 'master_biomarkers.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "837cb329", - "metadata": {}, - "outputs": [], - "source": [ - "# load baseline data\n", - "df = pd.read_csv(biomarker_data, index_col=0, header=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "c75a9dee", - "metadata": {}, - "outputs": [], - "source": [ - "# replace some biomarker data which has been coded as > or <\n", - "df = df.replace({'ABETA': {'>1700': 1701, '<200': 199}})\n", - "df = df.replace({'PTAU': {'<8': 7, '>120':121}})\n", - "\n", - "df[['ABETA', 'PTAU']] = df[['ABETA', 'PTAU']].apply(pd.to_numeric)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "b2d48836", - "metadata": {}, - "outputs": [], - "source": [ - "# calculate study specific cutoff for tau PET meta temporal ROI \n", - "tau_threshold = calculate_tau_cutoff(df)\n", - "tau_margin = (5*tau_threshold)/100\n", - "tau_upper = tau_threshold+tau_margin\n", - "tau_lower = tau_threshold-tau_margin" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "60d0393b", - "metadata": {}, - "outputs": [], - "source": [ - "def biomarker_negative_controls(df):\n", - " \n", - " '''\n", - " Take a df with biomarker results for participants labelled as controls, and return a df excluding those\n", - " who have a positive result on any one of five markers (or no biomarker information).\n", - " \n", - " To get more info on which biomarkers are positive, uncomment last section. 
But this is only to give an \n", - " idea, since some may be positive on more than one marker.\n", - " '''\n", - " \n", - " controls_all = df.loc[df['Research Group'] == 'CN']\n", - " \n", - " # drop participants who don't have any biomarker data\n", - " controls = controls_all.dropna(subset=['ABETA','PTAU','AV45','FBB','META_ROI'], how='all')\n", - " \n", - " controls_biomarker_neg = (controls[(controls['FBB'] < fbb_lower) | \n", - " (controls['AV45'] < av45_lower) | \n", - " (controls['ABETA'] > ab42_upper) | \n", - " (controls['PTAU'] < ptau_lower) | \n", - " (controls['META_ROI'] < tau_lower)])\n", - " \n", - " n_controls = len(controls_all)\n", - " no_markers = n_controls-len(controls)\n", - " n_controls_negative = len(controls_biomarker_neg)\n", - " n_dropped = (n_controls-no_markers) - n_controls_negative\n", - " \n", - " print ('Of {} controls, {} dropped due to no biomarker data and {} due to positive/borderline biomarkers. {} remaining.'.format(\n", - " n_controls,\n", - " no_markers, \n", - " n_dropped,\n", - " n_controls_negative))\n", - " \n", - " '''\n", - " controls_biomarker_pos = pd.concat([controls,controls_biomarker_neg]).drop_duplicates(keep=False)\n", - " print ('{} positive/borderline for Abeta42.'.format(len(controls_biomarker_pos[(controls_biomarker_pos['ABETA'] < ab42_upper)])))\n", - " print ('{} positive/borderline for Ptau.'.format(len(controls_biomarker_pos[(controls_biomarker_pos['PTAU'] > ptau_lower)])))\n", - " print ('{} positive/borderline for PET amyloid (FBB tracer).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['FBB'] > fbb_lower)])))\n", - " print ('{} positive/borderline for PET amyloid (AV45 tracer).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['AV45'] > av45_lower)])))\n", - " print ('{} positive/borderline for PET tau (meta roi).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['META_ROI'] > tau_lower)])))\n", - " '''\n", - " return (controls_biomarker_neg)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "b3a95289", - "metadata": {}, - "outputs": [], - "source": [ - "def biomarker_positive_patients(df, diagnoses):\n", - " \n", - " '''\n", - " Take a df with biomarker results for participants labelled as a patient (specifying which), \n", - " and return a df including only those who have a positive result on any one of five markers. \n", - " Exclude any with no biomarker information.\n", - " \n", - " To get more info on which biomarkers are negative, uncomment last section. But this is only to give an \n", - " idea, since some may be negative on more than one marker.\n", - " '''\n", - " \n", - " patients_all = df.loc[df['Research Group'].isin(diagnoses)]\n", - " \n", - " # drop participants who don't have any biomarker data\n", - " patients = patients_all.dropna(subset=['ABETA','PTAU','AV45','FBB','META_ROI'], how='all')\n", - " \n", - " patients_biomarker_pos = (patients[(patients['FBB'] > fbb_upper) | \n", - " (patients['AV45'] > av45_upper) | \n", - " (patients['ABETA'] < ab42_lower) | \n", - " (patients['PTAU'] > ptau_upper) |\n", - " (patients['META_ROI'] > tau_upper)])\n", - "\n", - " n_patients = len(patients_all)\n", - " no_markers = n_patients-len(patients)\n", - " n_patients_positive = len(patients_biomarker_pos)\n", - " n_dropped = (n_patients-no_markers) - n_patients_positive\n", - " \n", - " print ('Of {} {}, {} dropped due to no biomarker data and {} due to negative/borderline biomarkers. 
{} remaining.'.format(\n", - " n_patients,\n", - " diagnoses,\n", - " no_markers,\n", - " n_dropped,\n", - " n_patients_positive))\n", - " \n", - " '''\n", - " patients_neg = pd.concat([patients,patients_biomarker_pos]).drop_duplicates(keep=False)\n", - " \n", - " print ('{} negative/borderline for Abeta42.'.format(len(patients_neg[(patients_neg['ABETA'] > ab42_lower)])))\n", - " print ('{} negative/borderline for Ptau.'.format(len(patients_neg[(patients_neg['PTAU'] < ptau_upper)])))\n", - " print ('{} negative/borderline for PET amyloid (FBB tracer).'.format(len(patients_neg[(patients_neg['FBB'] < fbb_upper)])))\n", - " print ('{} negative/borderline for PET amyloid (AV45 tracer).'.format(len(patients_neg[(patients_neg['AV45'] < av45_upper)])))\n", - " print ('{} negative/borderline for PET tau (meta roi).'.format(len(patients_neg[(patients_neg['META_ROI'] < tau_upper)])))\n", - " '''\n", - " return (patients_biomarker_pos)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "0d123b9f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Of 362 controls, 44 dropped due to no biomarker data and 60 due to positive/borderline biomarkers. 258 remaining.\n", - "Of 58 ['AD'], 7 dropped due to no biomarker data and 3 due to negative/borderline biomarkers. 48 remaining.\n", - "Of 165 ['EMCI', 'LMCI', 'MCI'], 25 dropped due to no biomarker data and 54 due to negative/borderline biomarkers. 86 remaining.\n" - ] - } - ], - "source": [ - "# run functions to get negative controls and positive patients\n", - "controls_bio_neg = biomarker_negative_controls(df)\n", - "ad_bio_positive = biomarker_positive_patients(df, ['AD'])\n", - "mci_bio_positive = biomarker_positive_patients(df, ['EMCI','LMCI','MCI'])" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "979848a1", - "metadata": {}, - "outputs": [], - "source": [ - "master_df = pd.concat([controls_bio_neg, ad_bio_positive, mci_bio_positive])" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "cf508b49", - "metadata": {}, - "outputs": [], - "source": [ - "master_df.to_csv(Path(\"__file__\").resolve().parents[1] / 'data' / 'final_biomarker_spreadsheet.csv')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From c444bd31ecff21254fcf8f1874be7b220a97ed9a Mon Sep 17 00:00:00 2001 From: Natasha Clarke <57987005+clarkenj@users.noreply.github.com> Date: Tue, 22 Aug 2023 15:29:55 -0400 Subject: [PATCH 04/14] Delete calculate_meta_roi.ipynb No longer using this approach --- scripts/calculate_meta_roi.ipynb | 723 ------------------------------- 1 file changed, 723 deletions(-) delete mode 100644 scripts/calculate_meta_roi.ipynb diff --git a/scripts/calculate_meta_roi.ipynb b/scripts/calculate_meta_roi.ipynb deleted file mode 100644 index b3cb980..0000000 --- a/scripts/calculate_meta_roi.ipynb +++ /dev/null @@ -1,723 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "3881d9a6", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - 
"execution_count": 2, - "id": "3fdee605", - "metadata": {}, - "outputs": [], - "source": [ - "tau_data = Path(\"__file__\").resolve().parents[1] / 'data' / 'UCBERKELEYAV1451_04_26_22.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "7b6b5db3", - "metadata": {}, - "outputs": [], - "source": [ - "# load tau PET data\n", - "df = pd.read_csv(tau_data, index_col=0, header=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "83a462f2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " VISCODE VISCODE2 EXAMDATE INFERIORCEREBELLUM_SUVR \\\n", - "RID \n", - "21 init m144 2018-02-02 1.0285 \n", - "31 init m150 2018-04-24 0.9647 \n", - "31 y1 m162 2019-04-23 0.9636 \n", - "56 init m144 2018-02-20 0.9790 \n", - "56 y1 m156 2019-01-10 0.9548 \n", - "\n", - " INFERIORCEREBELLUM_VOLUME ERODED_SUBCORTICALWM_SUVR \\\n", - "RID \n", - "21 55285 1.1771 \n", - "31 62344 1.0258 \n", - "31 61510 1.0193 \n", - "56 64415 1.2442 \n", - "56 63302 1.2863 \n", - "\n", - " ERODED_SUBCORTICALWM_VOLUME BRAAK1_SUVR BRAAK1_VOLUME BRAAK34_SUVR \\\n", - "RID \n", - "21 110695 1.1937 2869 1.2152 \n", - "31 55683 1.2463 2558 1.0969 \n", - "31 50427 1.2264 2508 1.0675 \n", - "56 114401 1.2682 3809 1.2016 \n", - "56 118625 1.1931 3666 1.1839 \n", - "\n", - " ... RIGHT_PUTAMEN_VOLUME RIGHT_THALAMUS_PROPER_SUVR \\\n", - "RID ... \n", - "21 ... 4169 1.5224 \n", - "31 ... 5766 1.1421 \n", - "31 ... 5537 1.0801 \n", - "56 ... 4026 1.3396 \n", - "56 ... 4145 1.3618 \n", - "\n", - " RIGHT_THALAMUS_PROPER_VOLUME RIGHT_VENTRALDC_SUVR \\\n", - "RID \n", - "21 5874 1.3860 \n", - "31 6016 1.2340 \n", - "31 5865 1.1644 \n", - "56 6601 1.3625 \n", - "56 6349 1.3873 \n", - "\n", - " RIGHT_VENTRALDC_VOLUME RIGHT_VESSEL_SUVR RIGHT_VESSEL_VOLUME \\\n", - "RID \n", - "21 3378 1.4576 10.0 \n", - "31 3178 1.3691 2.0 \n", - "31 3060 1.4203 15.0 \n", - "56 3655 1.6190 42.0 \n", - "56 3625 1.4511 10.0 \n", - "\n", - " WM_HYPOINTENSITIES_SUVR WM_HYPOINTENSITIES_VOLUME update_stamp \n", - "RID \n", - "21 1.0615 2460 2022-04-27 15:31:20.0 \n", - "31 0.7641 26845 2022-04-27 15:31:20.0 \n", - "31 0.7805 28306 2022-04-27 15:31:20.0 \n", - "56 1.0556 1385 2022-04-27 15:31:20.0 \n", - "56 1.0850 1427 2022-04-27 15:31:20.0 \n", - "\n", - "[5 rows x 242 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "630a3d56", - "metadata": {}, - "outputs": [], - "source": [ - "result_cols = ['CTX_LH_ENTORHINAL_SUVR',\n", - "'CTX_RH_ENTORHINAL_SUVR',\n", - "'LEFT_AMYGDALA_SUVR',\n", - "'RIGHT_AMYGDALA_SUVR',\n", - "'CTX_LH_PARAHIPPOCAMPAL_SUVR',\n", - "'CTX_RH_PARAHIPPOCAMPAL_SUVR',\n", - "'CTX_LH_FUSIFORM_SUVR',\n", - "'CTX_RH_FUSIFORM_SUVR',\n", - "'CTX_LH_INFERIORTEMPORAL_SUVR',\n", - "'CTX_RH_INFERIORTEMPORAL_SUVR',\n", - "'CTX_LH_MIDDLETEMPORAL_SUVR',\n", - "'CTX_RH_MIDDLETEMPORAL_SUVR']" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "79cb3167", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " VISCODE VISCODE2 EXAMDATE INFERIORCEREBELLUM_SUVR \\\n", - "RID \n", - "21 init m144 2018-02-02 1.0285 \n", - "31 init m150 2018-04-24 0.9647 \n", - "31 y1 m162 2019-04-23 0.9636 \n", - "56 init m144 2018-02-20 0.9790 \n", - "56 y1 m156 2019-01-10 0.9548 \n", - "\n", - " INFERIORCEREBELLUM_VOLUME ERODED_SUBCORTICALWM_SUVR \\\n", - "RID \n", - "21 55285 1.1771 \n", - "31 62344 1.0258 \n", - "31 61510 1.0193 \n", - "56 64415 1.2442 \n", - "56 63302 1.2863 \n", - "\n", - " ERODED_SUBCORTICALWM_VOLUME BRAAK1_SUVR BRAAK1_VOLUME BRAAK34_SUVR \\\n", - "RID \n", - "21 110695 1.1937 2869 1.2152 \n", - "31 55683 1.2463 2558 1.0969 \n", - "31 50427 1.2264 2508 1.0675 \n", - "56 114401 1.2682 3809 1.2016 \n", - "56 118625 1.1931 3666 1.1839 \n", - "\n", - " ... RIGHT_THALAMUS_PROPER_VOLUME RIGHT_VENTRALDC_SUVR \\\n", - "RID ... \n", - "21 ... 5874 1.3860 \n", - "31 ... 6016 1.2340 \n", - "31 ... 5865 1.1644 \n", - "56 ... 6601 1.3625 \n", - "56 ... 6349 1.3873 \n", - "\n", - " RIGHT_VENTRALDC_VOLUME RIGHT_VESSEL_SUVR RIGHT_VESSEL_VOLUME \\\n", - "RID \n", - "21 3378 1.4576 10.0 \n", - "31 3178 1.3691 2.0 \n", - "31 3060 1.4203 15.0 \n", - "56 3655 1.6190 42.0 \n", - "56 3625 1.4511 10.0 \n", - "\n", - " WM_HYPOINTENSITIES_SUVR WM_HYPOINTENSITIES_VOLUME \\\n", - "RID \n", - "21 1.0615 2460 \n", - "31 0.7641 26845 \n", - "31 0.7805 28306 \n", - "56 1.0556 1385 \n", - "56 1.0850 1427 \n", - "\n", - " update_stamp meta_roi META_ROI \n", - "RID \n", - "21 2022-04-27 15:31:20.0 1.234958 1.234958 \n", - "31 2022-04-27 15:31:20.0 1.145025 1.145025 \n", - "31 2022-04-27 15:31:20.0 1.137825 1.137825 \n", - "56 2022-04-27 15:31:20.0 1.235883 1.235883 \n", - "56 2022-04-27 15:31:20.0 1.208933 1.208933 \n", - "\n", - "[5 rows x 244 columns]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['META_ROI'] = df[result_cols].mean(axis=1)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "5aec7b55", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " EXAMDATE META_ROI\n", - "RID \n", - "21 2018-02-02 1.234958\n", - "31 2018-04-24 1.145025\n", - "31 2019-04-23 1.137825\n", - "56 2018-02-20 1.235883\n", - "56 2019-01-10 1.208933" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_df = df[['EXAMDATE','META_ROI']].copy()\n", - "results_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "5e52c415", - "metadata": {}, - "outputs": [], - "source": [ - "results_df.to_csv(Path(\"__file__\").resolve().parents[1] / 'data' / 'tau_meta_roi.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "663dc7b4", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From bd9c875495a6d8dc6f95069bbcb52cc5ac6677d7 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 28 Aug 2023 12:56:32 -0400 Subject: [PATCH 05/14] added get_ds00030_participants.py --- scripts/get_ds00030_participants.py | 33 +++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 scripts/get_ds00030_participants.py diff --git a/scripts/get_ds00030_participants.py b/scripts/get_ds00030_participants.py new file mode 100644 index 0000000..4319183 --- /dev/null +++ b/scripts/get_ds00030_participants.py @@ -0,0 +1,33 @@ +import pandas as pd +from pathlib import Path + +## Load ds000030 participants.csv and keep only required subjects and columns + +root_p = Path("__file__").resolve().parents[1] / "data" / "ds000030" + +df = pd.read_csv(root_p / "participants.csv") + +# Keep only rows where diagnosis is CONTROL or SCHZ +df = df[df["diagnosis"].isin(["CONTROL", "SCHZ"])] + +# Add columns SZ and CON +df["SZ"] = (df["diagnosis"] == "SCHZ").astype(int) +df["CON"] = (df["diagnosis"] == "CONTROL").astype(int) + +# Add GROUP column +df["GROUP"] = df["diagnosis"] +df["GROUP"] = df["GROUP"].replace({"SCHZ": "SZ", "CONTROL": "CON"}) + +# Rename some columns and map values for sex +df.rename(columns={"gender": "SEX", "age": "AGE"}, inplace=True) +df["SEX"] = df["SEX"].map({"M": 0, "F": 1}) + +# Add COHORT and SITE columns +df["COHORT"] = "ds000030" +df["SITE"] = 0 + +cols = ["participant_id", "AGE", "SEX", "SITE", "SZ", "CON", "COHORT", "GROUP"] +df = df[cols] + +# Save results +df.to_csv(root_p / "ds000030_participants.csv", index=False) From 976273a52fc579c166bf10795a7c537e7312c56d Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 28 Aug 2023 15:02:04 -0400 Subject: [PATCH 06/14] added scripts for sz data wrangling --- scripts/get_cobre_participants.py | 33 +++++++++++++++++++++++++++++ scripts/get_ds00030_participants.py | 22 +++++++++---------- scripts/get_srpbs_participants.py | 31 +++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 11 deletions(-) create mode 100644 scripts/get_cobre_participants.py create mode 100644 scripts/get_srpbs_participants.py diff --git a/scripts/get_cobre_participants.py b/scripts/get_cobre_participants.py new file mode 100644 index 0000000..bec947a --- /dev/null +++ b/scripts/get_cobre_participants.py @@ -0,0 +1,33 @@ +import pandas as pd +from pathlib import Path + +## Load cobre 
participant data and do some stuff + +root_p = Path("__file__").resolve().parents[1] / "data" + +df = pd.read_csv(root_p / "cobre" / "COBRE_phenotypic_data.csv") + +# Add columns SZ and CON +df["SZ"] = (df["Subject Type"] == "Patient").astype(int) +df["CON"] = (df["Subject Type"] == "Control").astype(int) + +# Add group column +df["group"] = df["Subject Type"] +df["group"] = df["group"].replace({"Patient": "SZ", "Control": "CON"}) + +# Rename some columns and map values for sex +df.rename( + columns={"Gender": "sex", "Current Age": "age", "Unnamed: 0": "participant_id"}, + inplace=True, +) +df["sex"] = df["sex"].map({"Male": 0, "Female": 1}) + +# Add cohort and site columns +df["cohort"] = "COBRE" +df["site"] = 0 + +cols = ["participant_id", "age", "sex", "site", "cohort", "group", "SZ", "CON"] +df = df[cols] + +# Save results +df.to_csv(root_p / "cobre_participants.csv", index=False) diff --git a/scripts/get_ds00030_participants.py b/scripts/get_ds00030_participants.py index 4319183..e49d66c 100644 --- a/scripts/get_ds00030_participants.py +++ b/scripts/get_ds00030_participants.py @@ -1,11 +1,11 @@ import pandas as pd from pathlib import Path -## Load ds000030 participants.csv and keep only required subjects and columns +## Load ds000030 participant data and do some stuff -root_p = Path("__file__").resolve().parents[1] / "data" / "ds000030" +root_p = Path("__file__").resolve().parents[1] / "data" -df = pd.read_csv(root_p / "participants.csv") +df = pd.read_csv(root_p / "ds000030" / "participants.csv") # Keep only rows where diagnosis is CONTROL or SCHZ df = df[df["diagnosis"].isin(["CONTROL", "SCHZ"])] @@ -14,19 +14,19 @@ df["SZ"] = (df["diagnosis"] == "SCHZ").astype(int) df["CON"] = (df["diagnosis"] == "CONTROL").astype(int) -# Add GROUP column -df["GROUP"] = df["diagnosis"] -df["GROUP"] = df["GROUP"].replace({"SCHZ": "SZ", "CONTROL": "CON"}) +# Add group column +df["group"] = df["diagnosis"] +df["group"] = df["group"].replace({"SCHZ": "SZ", "CONTROL": "CON"}) # Rename some columns and map values for sex -df.rename(columns={"gender": "SEX", "age": "AGE"}, inplace=True) -df["SEX"] = df["SEX"].map({"M": 0, "F": 1}) +df.rename(columns={"gender": "sex"}, inplace=True) +df["sex"] = df["sex"].map({"M": 0, "F": 1}) # Add COHORT and SITE columns -df["COHORT"] = "ds000030" -df["SITE"] = 0 +df["cohort"] = "ds000030" +df["site"] = 0 -cols = ["participant_id", "AGE", "SEX", "SITE", "SZ", "CON", "COHORT", "GROUP"] +cols = ["participant_id", "age", "sex", "site", "cohort", "group", "SZ", "CON"] df = df[cols] # Save results diff --git a/scripts/get_srpbs_participants.py b/scripts/get_srpbs_participants.py new file mode 100644 index 0000000..fddba39 --- /dev/null +++ b/scripts/get_srpbs_participants.py @@ -0,0 +1,31 @@ +import pandas as pd +from pathlib import Path + +## Load srpbs participant data and do some stuff + +root_p = Path("__file__").resolve().parents[1] / "data" + +df = pd.read_csv(root_p / "srpbs" / "participants.tsv", sep="\t") + +# Keep only rows where diagnosis is 0 (control) or 4 (sz) +df = df[df["diag"].isin([0, 4])] + +# Add columns SZ and CON +df["SZ"] = (df["diag"] == 4).astype(int) +df["CON"] = (df["diag"] == 0).astype(int) + +# Add group column +df["group"] = df["diag"] +df["group"] = df["group"].replace({4: "SZ", 0: "CON"}) + +# Map values for sex, from total N in Tanaka paper +df["sex"] = df["sex"].map({1: 0, 2: 1}) + +# Add cohort and site columns +df["cohort"] = "SRPBS" + +cols = ["participant_id", "age", "sex", "site", "cohort", "group", "SZ", "CON"] +df = df[cols] + +# Save 
results +df.to_csv(root_p / "srpbs_participants.csv", index=False) From da09b0b7f8e3809ff292514a59c41bf8329f24ff Mon Sep 17 00:00:00 2001 From: clarkenj Date: Wed, 6 Sep 2023 11:21:30 -0400 Subject: [PATCH 07/14] added more wrangling scripts --- scripts/{npi_to_mbi.py => adni_npi_to_mbi.py} | 12 +- scripts/cimaq_npi_to_mbi.py | 87 ++++ scripts/cnvfc/stats.py | 348 +++++++++++++++ scripts/oasis_wrangling_draft.ipynb | 401 ++++++++++++++++++ 4 files changed, 842 insertions(+), 6 deletions(-) rename scripts/{npi_to_mbi.py => adni_npi_to_mbi.py} (77%) create mode 100644 scripts/cimaq_npi_to_mbi.py create mode 100644 scripts/cnvfc/stats.py create mode 100644 scripts/oasis_wrangling_draft.ipynb diff --git a/scripts/npi_to_mbi.py b/scripts/adni_npi_to_mbi.py similarity index 77% rename from scripts/npi_to_mbi.py rename to scripts/adni_npi_to_mbi.py index 5aa3607..8ff48ba 100644 --- a/scripts/npi_to_mbi.py +++ b/scripts/adni_npi_to_mbi.py @@ -1,7 +1,7 @@ import pandas as pd from pathlib import Path -## Load NPIQ or NPI scores and convert to MBI framework +## Load NPIQ or NPI scores for ADNI and convert to MBI framework ## Domains: https://adni.bitbucket.io/reference/npi.html @@ -21,8 +21,8 @@ def _mbi_conversion(df): "social_inappropriateness", "abnormal_perception", ] - df["MBI_total_score"] = df[mbi_domains].sum(axis=1) - df["mbi_status"] = (df["MBI_total_score"] >= 1).astype(int) + df["mbi_total_score"] = df[mbi_domains].sum(axis=1) + df["mbi_status"] = (df["mbi_total_score"] >= 1).astype(int) mbi_columns = [ "RID", @@ -32,7 +32,7 @@ def _mbi_conversion(df): "impulse_dyscontrol", "social_inappropriateness", "abnormal_perception", - "MBI_total_score", + "mbi_total_score", "mbi_status", ] mbi_df = df[mbi_columns].copy() @@ -43,9 +43,9 @@ def _mbi_conversion(df): root_p = Path("__file__").resolve().parents[1] / "data" / "adni" # Load NPI or NPI-Q scores -npi_df = pd.read_csv(root_p / "NPIQ_22Aug2023.csv") +npi_df = pd.read_csv(root_p / "NPI_22Aug2023.csv") mbi_df = _mbi_conversion(npi_df) # Save results - make sure to change name -mbi_df.to_csv(root_p / "adni_npiq_mbi_status_20230822.csv", index=False) +mbi_df.to_csv(root_p / "adni_npi_mbi_status_20230829.csv", index=False) diff --git a/scripts/cimaq_npi_to_mbi.py b/scripts/cimaq_npi_to_mbi.py new file mode 100644 index 0000000..c94ec4a --- /dev/null +++ b/scripts/cimaq_npi_to_mbi.py @@ -0,0 +1,87 @@ +import pandas as pd +from pathlib import Path + +## Load NPIQ scores for CIMA-Q and convert to MBI framework + + +def _map_values(df): + # Map values for columns we are using + mapping = {"0_non": 0, "1_oui_léger": 1, "2_oui_modéré": 2, "3_oui_sévère": 3} + + columns_to_map = [ + "22901_apathie", + "22901_depression_dysphorie", + "22901_anxiete", + "22901_euphorie", + "22901_agitation_aggressivite", + "22901_irritabilite", + "22901_comp_moteur_aberrant", + "22901_impulsivite", + "22901_idees_delirantes", + "22901_hallucinations", + ] + + for column in columns_to_map: + df[column] = df[column].map(mapping) + + return df + + +def _mbi_conversion(df): + # Calculate MBI domains + df["decreased_motivation"] = df["22901_apathie"] + df["emotional_dysregulation"] = ( + df["22901_depression_dysphorie"] + df["22901_anxiete"] + df["22901_euphorie"] + ) + df["impulse_dyscontrol"] = ( + df["22901_agitation_aggressivite"] + + df["22901_irritabilite"] + + df["22901_comp_moteur_aberrant"] + ) + df["social_inappropriateness"] = df["22901_impulsivite"] + df["abnormal_perception"] = ( + df["22901_idees_delirantes"] + df["22901_hallucinations"] + ) + + # Calculate MBI 
total score + mbi_domains = [ + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + ] + + df["mbi_total_score"] = df[mbi_domains].sum(axis=1) + df["mbi_status"] = (df["mbi_total_score"] >= 1).astype(int) + + mbi_columns = [ + "PSCID", + "Date_taken", + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + "mbi_total_score", + "mbi_status", + ] + + mbi_df = df[mbi_columns].copy() + + return mbi_df + + +root_p = Path("__file__").resolve().parents[1] / "data" / "cimaq" + +# Load NPI-Q scores +npi_df = pd.read_csv( + root_p / "22901_inventaire_neuropsychiatrique_q_initiale.tsv", sep="\t" +) + +npi_df = npi_df.dropna(subset=["Date_taken"]) +npi_df = _map_values(npi_df) +mbi_df = _mbi_conversion(npi_df) + +# Save results - make sure to change name +mbi_df.to_csv(root_p / "cimaq_npiq_mbi_status_20230829.csv", index=False) diff --git a/scripts/cnvfc/stats.py b/scripts/cnvfc/stats.py new file mode 100644 index 0000000..3a4f28e --- /dev/null +++ b/scripts/cnvfc/stats.py @@ -0,0 +1,348 @@ +import sys +import time +import warnings +import scipy as sp +import numpy as np +import patsy as pat +import pandas as pd +from scipy import stats +import statsmodels.api as sm +#from .tools import conn2mat +from sklearn import linear_model as sln +from sklearn import preprocessing as skp +from sklearn import model_selection as skm +from statsmodels.sandbox.stats.multicomp import multipletests as stm + + +def make_weights(data, pattern): + if not data.shape[1:] == pattern.shape: + raise Exception(f'data and pattern shape mismatch: {data.shape[1:]}, {pattern.shape}') + n_nodes = pattern.shape[0] + n_data = data.shape[0] + weights = np.array([[np.corrcoef(data[data_id, node_id, :], + pattern[node_id, :])[0, 1] + for node_id in range(n_nodes)] + for data_id in range(n_data)]) + return weights + + +def categorical_enrichment(pheno, data, group, case=None, control=None, node_labels='node_name_not_specificed'): + sub_mask, cases_mask = find_subset(pheno, group, [case, control]) + sub_data = data[sub_mask, ...] 
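+    # find_subset returns a boolean mask over all rows of pheno plus, for each requested
+    # label, a mask indexed within that subset; sub_data therefore holds only the case
+    # and control subjects, and cases_mask is used below to split it into the two groups.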
+ n_data = sub_data.shape[1] + n_case = np.sum(cases_mask[case]) + n_control = np.sum(cases_mask[control]) + + results = {key: list() for key in ['node', 'U', 'p_value', + f'median_{group}_{case}', + f'median_{group}_{control}', + 'rank_biserial_correlation']} + # Conduct Mann-Whitney-U for each node region + for node_id in range(n_data): + u_right, p = sp.stats.mannwhitneyu(sub_data[cases_mask[case], node_id], sub_data[cases_mask[control], node_id], + alternative='two-sided') + u_left = n_case * n_control - u_right + u_min = np.min([u_left, u_right]) + # Compute the median for the case and control groups + median_case = np.median(sub_data[cases_mask[case], node_id]) + median_control = np.median(sub_data[cases_mask[control], node_id]) + # Compute rank biserial correlation + r_b = 1 - (2 * u_min) / (n_case * n_control) + # Determine if cases > controls or reverse + case_gt_con = u_right > u_min + if not case_gt_con: + r_b = -r_b + # Store the results + results['node'].append(node_id + 1) + results['U'].append(u_min) + results['p_value'].append(p) + results[f'median_{group}_{case}'].append(median_case) + results[f'median_{group}_{control}'].append(median_control) + results['rank_biserial_correlation'].append(r_b) + + results_table = pd.DataFrame(data=results) + # Correct for multiple comparisons + (fdr_pass, qval, _, _) = stm(results_table.p_value, alpha=0.05, method='fdr_bh') + results_table['q_value'] = qval + # Add the node_names + results_table['node_name'] = node_labels + + return results_table + + +def continuous_enrichment(pheno, data, covariate, node_labels='node_name_not_specificed'): + sub_mask = find_subset(pheno, covariate) + sub_pheno = pheno.loc[sub_mask] + sub_data = data[sub_mask, :] + n_data = sub_data.shape[1] + + results = {key: list() for key in ['node', 'pearson_r', 'p_value']} + for node_id in range(n_data): + r, p = stats.pearsonr(sub_pheno[covariate], sub_data[:, node_id]) + # Store the results + results['node'].append(node_id + 1) + results['pearson_r'].append(r) + results['p_value'].append(p) + + results_table = pd.DataFrame(data=results) + # Correct for multiple comparisons + (fdr_pass, qval, _, _) = stm(results_table.p_value, alpha=0.05, method='fdr_bh') + results_table['q_value'] = qval + # Add the node_names + results_table['node_name'] = node_labels + + return results_table + + +def find_subset(pheno, column, cases=None): + # TODO check pandas type input + subset_mask = np.array(~pheno[column].isnull()) + if cases is not None and not not cases: + all_cases = pheno.loc[subset_mask][column].unique() + try: + case_available = np.array([True if case in all_cases else False for case in cases]) + except TypeError as e: + raise Exception(f'the attribute "cases" needs to be iterable but is: {type(cases)}') from e + if not all(case_available): + if not any(case_available): + raise Exception(f'none of the requested cases of "{column}" are available') + else: + warnings.warn( + f'\nnot all requested cases of "{column}" are available: {list(zip(cases, case_available))}', + RuntimeWarning) + case_masks = np.array([pheno[column] == case for case in cases]) + subset_mask = np.any(case_masks, 0) + # Return the masked instances of the requested cases + cases_dict = {case: case_masks[idx][subset_mask] for idx, case in enumerate(cases)} + return subset_mask, cases_dict + else: + return subset_mask + + +def standardize(data, mask): + scaler = skp.StandardScaler(with_mean=False, with_std=True) + scaler.fit(data[mask, :]) + standardized_data = scaler.transform(data) + return 
standardized_data + + +def find_contrast(design_matrix, contrast): + # Find the contrast column + contrast_columns = [(col_id, col) for col_id, col in enumerate(design_matrix.columns) if f'{contrast}' in col] + if not len(contrast_columns) == 1: + raise Exception(f'There is no single factor that matches {contrast}: {(list(design_matrix.columns))}') + return contrast_columns + + +def fast_glm(data, design_matrix, contrast): + # Does not compute p-values but operates in parallel + contrast_id, contrast_name = find_contrast(design_matrix, contrast)[0] + glm = sln.LinearRegression(fit_intercept=False, normalize=False, n_jobs=-2) + res = glm.fit(design_matrix, data) + betas = res.coef_[:, contrast_id] + return betas + + +def glm(data, design_matrix, contrast): + contrast_id, contrast_name = find_contrast(design_matrix, contrast)[0] + n_data = data.shape[1] + + # Conduct the GLM + betas = np.zeros(shape=n_data) + pvals = np.zeros(shape=n_data) + for conn_id in range(n_data): + model = sm.OLS(data[:, conn_id], design_matrix) + results = model.fit() + betas[conn_id] = results.params[contrast_id] + pvals[conn_id] = results.pvalues[contrast_id] + + return betas, pvals + + +def glm_wrap_cc(conn, pheno, group, case, control, regressors='', report=False, fast=False): + # Make sure pheno and conn have the same number of cases + if not conn.shape[0] == pheno.shape[0]: + print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases') + + # Define the subset of the sample + sub_mask, case_masks = find_subset(pheno, group, [case, control]) + sub_conn = conn[sub_mask, :] + sub_pheno = pheno.loc[sub_mask] + n_sub = np.sum(sub_mask) + n_case = np.sum(case_masks[case]) + n_control = np.sum(case_masks[control]) + n_data = sub_conn.shape[1] + if report: + print(f'Selected sample based on group variable {group}.\n' + f'cases: {case} (n={n_case})\n' + f'controls: {control} (n={n_control})\n' + f'original sample: n={pheno.shape[0]}; new sample: n={n_sub}\n' + f'{n_data} data points available\n' + f'standardized estimators are based on {group}=={control}') + + stand_conn = standardize(sub_conn, case_masks[control]) + + # Construct design matrix + if type(control) == str: + contrast = f'C({group}, Treatment("{control}"))' + else: + contrast = f'C({group}, Treatment({control}))' + formula = ' + '.join((regressors, contrast)) + dmat = pat.dmatrix(formula, sub_pheno, return_type='dataframe') + + if fast: + betas = fast_glm(sub_conn, dmat, group) + table = pd.DataFrame(data={'betas': betas}) + else: + betas, pvals = glm(sub_conn, dmat, group) + stand_betas, _ = glm(stand_conn, dmat, group) + table = pd.DataFrame(data={'betas': betas, 'stand_betas': stand_betas, 'pvals': pvals}) + + return table + + +def glm_wrap_continuous(conn, pheno, contrast, regressors, report=False, fast=False): + # Make sure pheno and conn have the same number of cases + if not conn.shape[0] == pheno.shape[0]: + print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases') + + # Define the subset of the sample + sub_mask = find_subset(pheno, contrast) + sub_conn = conn[sub_mask, :] + sub_pheno = pheno.loc[sub_mask] + n_sub = sub_pheno.shape[0] + n_data = sub_conn.shape[1] + sub_conn_stand = standardize(sub_conn, np.ones(n_sub).astype(bool)) + + if report: + print(f'Selected sample based on contrast variable {contrast}.\n' + f'Found {n_sub} subjects with no missing data for {contrast}\n' + f'original sample: n={pheno.shape[0]}; new sample: n={n_sub}\n' + f'{n_data} data points available\n' + 
f'standardized estimators are based on all subjects with no missing data for {contrast}') + + formula = ' + '.join((regressors, contrast)) + design_matrix = pat.dmatrix(formula, sub_pheno, return_type='dataframe') + + if fast: + betas = fast_glm(sub_conn, design_matrix, contrast) + table = pd.DataFrame(data={'betas': betas}) + else: + betas, pvals = glm(sub_conn, design_matrix, contrast) + stand_betas, _ = glm(sub_conn_stand, design_matrix, contrast) + table = pd.DataFrame(data={'betas': betas, 'stand_betas': stand_betas, 'pvals': pvals}) + + return table + + +def permutation_glm_contrast(conn, pheno, n_a, n_a_1, n_b, n_b_1, n_iter=5000): + # conn: connectome (n_sub, n_conn) + # pheno: pheno pandas table, same order as conn + # n_a / n_b: number of subs per group a / b + # n_a_1 / n_b_1: number of subs in the first contrast of group a / b + # n_iter: number of permutations + + # Some sanity checks because people can be stupid + if n_a_1 > n_a: + raise ValueError(f'n_a_1 ({n_a_1}) cannot be larger than n_a ({n_a})') + if n_b_1 > n_b: + raise ValueError(f'n_b_1 ({n_b_1}) cannot be larger than n_b ({n_b})') + + shsp = skm.ShuffleSplit(n_splits=n_iter, test_size=0.5) + pheno.reset_index(inplace=True, drop=True) + pheno.loc[:, 'group'] = 0 + n_subjects = pheno.shape[0] + beta = np.zeros((conn.shape[1], 2, n_iter)) + + start = time.time() + for fold_id, (group_a, group_b) in enumerate(shsp.split(np.ones(n_subjects))): + pheno_a = pheno.loc[group_a[:n_a]].copy() + pheno_b = pheno.loc[group_b[:n_b]].copy() + # Assign the groups + pheno_a.loc[group_a[:n_a_1], 'group'] = 1 + pheno_b.loc[group_b[:n_b_1], 'group'] = 1 + mat1 = pat.dmatrix('sex_dummy + Age_Bin + FD_scrubbed_s1_s2 + group', pheno_a, return_type='dataframe') + mat2 = pat.dmatrix('sex_dummy + Age_Bin + FD_scrubbed_s1_s2 + group', pheno_b, return_type='dataframe') + beta[:, 0, fold_id] = fast_glm(conn[group_a[:n_a], :], mat1, 'group') + beta[:, 1, fold_id] = fast_glm(conn[group_b[:n_b], :], mat2, 'group') + + elapsed = time.time() - start + done = fold_id + 1 + remaining = n_iter - done + time_left = (elapsed / done) * remaining + + sys.stdout.write('\r {}/{}. {:.2f}s left ({:.3f}s)'.format(done, n_iter, time_left, elapsed / done)) + sys.stdout.flush() + sys.stdout.write('\r Done. 
Took {:.2f}s'.format(elapsed)) + sys.stdout.flush() + + return beta + + +def contrast_correlation(beta, mask, mode='region'): + # mode: 'region' or 'global' + if mode == 'region': + n_iter = beta.shape[2] + n_seed = mask.shape[0] + corr = np.zeros((n_seed, n_iter)) + for fold_id in range(n_iter): + # Map the betas back to a matrix + beta1_mat = conn2mat(beta[:, 0, fold_id], mask) + beta2_mat = conn2mat(beta[:, 1, fold_id], mask) + # Compute the correlation + corr[:, fold_id] = np.array([np.corrcoef(beta1_mat[nid, :], beta2_mat[nid, :])[0, 1] for nid in range(64)]) + return corr + elif mode == 'global': + n_iter = beta.shape[2] + corr = np.array([np.corrcoef(beta[:, 0, fold_id], beta[:, 1, fold_id])[0, 1] for fold_id in range(n_iter)]) + return corr + else: + raise ValueError(f'mode {mode} is not configured!') + + +def permutation_glm(pheno, conn, group, case, control, regressors='', n_iter=1000, stand=False): + # Make sure pheno and conn have the same number of cases + if not conn.shape[0] == pheno.shape[0]: + print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases') + sub_mask, case_masks = find_subset(pheno, group, [case, control]) + sub_pheno = pheno.loc[sub_mask, :] + sub_pheno.loc[:, 'group'] = '' + sub_conn = conn[sub_mask, :] + # Standardize the input data + if stand: + sub_conn = standardize(sub_conn, case_masks[control]) + n_conn = sub_conn.shape[1] + + n1 = np.sum(case_masks[case]) + n2 = np.sum(case_masks[control]) + r2 = n2 / (n1 + n2) + + splitter = skm.ShuffleSplit(n_splits=n_iter, test_size=r2, random_state=1) + + i_pheno = sub_pheno.copy() + group_id = list(i_pheno.columns).index('group') + + betas = np.zeros(shape=(n_iter, n_conn)) + start = time.time() + for fold_id, (train_id, test_id) in enumerate(splitter.split(sub_conn)): + n1_ids = train_id[:n1] + n2_ids = test_id[:n2] + i_pheno = sub_pheno.copy() + i_pheno['group'] = '' + i_pheno.iloc[n1_ids, group_id] = 'case' + i_pheno.iloc[n2_ids, group_id] = 'control' + table = glm_wrap_cc(sub_conn, i_pheno, 'group', 'case', 'control', regressors=regressors, + report=False, fast=True) + betas[fold_id, :] = table['betas'].values + + elapsed = time.time() - start + done = fold_id + 1 + remaining = n_iter - done + time_left = (elapsed / done) * remaining + + sys.stdout.write('\r {}/{}. {:.2f}s left ({:.3f}s per permutation)'.format(done, n_iter, time_left, elapsed / done)) + sys.stdout.flush() + sys.stdout.write('\r Done. Took {:.2f}s'.format(elapsed)) + sys.stdout.flush() + + return betas diff --git a/scripts/oasis_wrangling_draft.ipynb b/scripts/oasis_wrangling_draft.ipynb new file mode 100644 index 0000000..8a875a8 --- /dev/null +++ b/scripts/oasis_wrangling_draft.ipynb @@ -0,0 +1,401 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 33, + "id": "476173ef-0750-4714-9822-1cbe24db08ee", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "\n", + "## Load cobre participant data and do some stuff\n", + "\n", + "root_p = Path(\"__file__\").resolve().parents[1] / \"data\" / \"oasis\"\n", + "\n", + "df = pd.read_csv(root_p / \"clarken_8_28_2023_13_54_33.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "1aaf1c89-1f7c-4b75-be35-c120ad9484bc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[HTML table rendering of df.head() omitted (5 rows × 120 columns); the equivalent text/plain output is kept below]
" + ], + "text/plain": [ + " MR ID Subject Age Scanner \\\n", + "0 OAS30001_MR_d0129 OAS30001 65.0 3.0T \n", + "1 OAS30001_MR_d0757 OAS30001 67.0 3.0T \n", + "2 OAS30001_MR_d2430 OAS30001 71.0 3.0T \n", + "3 OAS30001_MR_d3132 OAS30001 73.0 3.0T \n", + "4 OAS30001_MR_d3746 OAS30001 NaN NaN \n", + "\n", + " Scans M/F NPIQINF NPIQINFX \\\n", + "0 bold(3), T1w(2), T2w(2) F 2.0 NaN \n", + "1 bold(2), dwi(1), minIP(1), swi(1), T1w(2), T2s... F 2.0 NaN \n", + "2 angio(1), asl(1), dwi(2), fieldmap(3), FLAIR(1... F 2.0 NaN \n", + "3 asl(2), bold(2), dwi(2), fieldmap(3), T1w(1), ... F 2.0 NaN \n", + "4 angio(1), asl(1), bold(2), fieldmap(3), FLAIR(... F 2.0 NaN \n", + "\n", + " DEL DELSEV ... NEOPIF COGOTH COGOTHX COGOTHIF COGOTH2 COGOTH2X \\\n", + "0 0.0 NaN ... NaN 0.0 NaN NaN NaN NaN \n", + "1 0.0 NaN ... NaN 0.0 NaN NaN NaN NaN \n", + "2 0.0 NaN ... NaN 0.0 NaN NaN 0.0 NaN \n", + "3 0.0 NaN ... NaN 0.0 NaN NaN 0.0 NaN \n", + "4 0.0 NaN ... NaN 0.0 NaN NaN NaN NaN \n", + "\n", + " COGOTH2F COGOTH3 COGOTH3X COGOTH3F \n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN 0.0 NaN NaN \n", + "3 NaN 0.0 NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + "[5 rows x 120 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "e24c5b24-8227-413c-b2b8-a5d537b0c01c", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename some columns and map values for sex\n", + "df.rename(\n", + " columns={\"M/F\": \"sex\", \"Age\": \"age\", \"Subject\": \"participant_id\"},\n", + " inplace=True,\n", + ")\n", + "df[\"sex\"] = df[\"sex\"].map({\"M\": 0, \"F\": 1})" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "03e5edfa-c403-4cae-85be-e9fcb18e3057", + "metadata": {}, + "outputs": [], + "source": [ + "# Find controls in one of two ways, due to how missing data was coded \n", + "# Create a list of column names that need to be empty\n", + "empty_columns = [\"NORMCOG\", \"DEMENTED\", \"MCIAMEM\", \"MCIAPLUS\", \"MCIAPLAN\", \"MCIAPATT\", \"MCIAPEX\", \"MCIAPVIS\",\n", + " \"MCINON1\", \"MCIN1LAN\", \"MCIN1ATT\", \"MCIN1EX\", \"MCIN1VIS\", \"MCINON2\", \"MCIN2LAN\", \"MCIN2ATT\",\n", + " \"MCIN2EX\", \"MCIN2VIS\"]\n", + "\n", + "# Use boolean indexing to filter rows based on conditions\n", + "control_df = df[((df[\"dx1\"] == 'Cognitively healthy') & (df[empty_columns].isna().all(axis=1))) |\n", + " (df['NORMCOG'] == 1)]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "af86c09e-09dc-4a9b-b856-d91e0ebbd79e", + "metadata": {}, + "outputs": [], + "source": [ + "# MCI only\n", + "# Filter rows where any of the MCI diagnoses are True\n", + "mci_df = df[df[['MCIAMEM', 'MCIAPLUS', 'MCIAPLAN', 'MCIAPATT', 'MCIAPEX',\n", + " 'MCIAPVIS', 'MCINON1', 'MCIN1LAN', 'MCIN1ATT', 'MCIN1EX',\n", + " 'MCIN1VIS', 'MCINON2', 'MCIN2LAN', 'MCIN2ATT', 'MCIN2EX', 'MCIN2VIS']].any(axis=1)]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "3d728ec7-d0a5-492c-a26d-a3798af9739d", + "metadata": {}, + "outputs": [], + "source": [ + "# Alzheimers only\n", + "# Filter rows where DEMENTED column is True\n", + "ad_df = df[df['DEMENTED'] == 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "5d62faec-57c0-469e-99f3-88a9d4ea064d", + "metadata": {}, + "outputs": [], + "source": [ + "# Add group column\n", + "control_df.loc[:, \"group\"] = \"CON\"\n", + "mci_df.loc[:, \"group\"] = \"MCI\"\n", + "ad_df.loc[:, \"group\"] = 
\"AD\"" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "1a0facb6-4dc4-4ecb-a332-131e8efbcbfe", + "metadata": {}, + "outputs": [], + "source": [ + "def _mbi_conversion(df):\n", + " # Calculate MBI domains\n", + " df[\"decreased_motivation\"] = df[\"APA\"]\n", + " df[\"emotional_dysregulation\"] = df[\"DEPD\"] + df[\"ANX\"] + df[\"ELAT\"]\n", + " df[\"impulse_dyscontrol\"] = df[\"AGIT\"] + df[\"IRR\"] + df[\"MOT\"]\n", + " df[\"social_inappropriateness\"] = df[\"DISN\"]\n", + " df[\"abnormal_perception\"] = df[\"DEL\"] + df[\"HALL\"]\n", + "\n", + " # Calculate MBI total score\n", + " mbi_domains = [\n", + " \"decreased_motivation\",\n", + " \"emotional_dysregulation\",\n", + " \"impulse_dyscontrol\",\n", + " \"social_inappropriateness\",\n", + " \"abnormal_perception\",\n", + " ]\n", + " df[\"MBI_total_score\"] = df[mbi_domains].sum(axis=1)\n", + " df[\"mbi_status\"] = (df[\"MBI_total_score\"] >= 1).astype(int)\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "59093612-7228-4859-90af-2d4dc6d8578d", + "metadata": {}, + "outputs": [], + "source": [ + "mci_df = _mbi_conversion(mci_df.copy()) \n", + "dementia_df = _mbi_conversion(dementia_df.copy()) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a10dfed-85f3-41a7-8f3c-6581db86b56a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d1a3f3a-bb9e-46a0-8721-00604aa2248d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1082b745-09a1-478a-b889-b35cef7518fd", + "metadata": {}, + "outputs": [], + "source": [ + "# Next steps: match connectomes to patient df to check which ones we have. select earliest one with mbi status.\n", + "# then separate into positive and negative. 
select all the correct columns, then merge with controls\n", + "\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From fdc8f47b740b890d4cc4818ed2aef7a8beda2de7 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 9 Apr 2024 15:09:42 -0400 Subject: [PATCH 08/14] tidied up, added npy script --- scripts/adni_npi_to_mbi.py | 51 ---- scripts/cimaq_npi_to_mbi.py | 87 ------ scripts/get_cobre_participants.py | 33 --- scripts/get_ds00030_participants.py | 33 --- scripts/get_srpbs_participants.py | 31 --- scripts/h5_to_npy.py | 173 ++++++++++++ scripts/oasis_wrangling_draft.ipynb | 401 ---------------------------- 7 files changed, 173 insertions(+), 636 deletions(-) delete mode 100644 scripts/adni_npi_to_mbi.py delete mode 100644 scripts/cimaq_npi_to_mbi.py delete mode 100644 scripts/get_cobre_participants.py delete mode 100644 scripts/get_ds00030_participants.py delete mode 100644 scripts/get_srpbs_participants.py create mode 100644 scripts/h5_to_npy.py delete mode 100644 scripts/oasis_wrangling_draft.ipynb diff --git a/scripts/adni_npi_to_mbi.py b/scripts/adni_npi_to_mbi.py deleted file mode 100644 index 8ff48ba..0000000 --- a/scripts/adni_npi_to_mbi.py +++ /dev/null @@ -1,51 +0,0 @@ -import pandas as pd -from pathlib import Path - -## Load NPIQ or NPI scores for ADNI and convert to MBI framework -## Domains: https://adni.bitbucket.io/reference/npi.html - - -def _mbi_conversion(df): - # Calculate MBI domains - df["decreased_motivation"] = df["NPIG"] - df["emotional_dysregulation"] = df["NPID"] + df["NPIE"] + df["NPIF"] - df["impulse_dyscontrol"] = df["NPIC"] + df["NPII"] + df["NPIJ"] - df["social_inappropriateness"] = df["NPIH"] - df["abnormal_perception"] = df["NPIA"] + df["NPIB"] - - # Calculate MBI total score - mbi_domains = [ - "decreased_motivation", - "emotional_dysregulation", - "impulse_dyscontrol", - "social_inappropriateness", - "abnormal_perception", - ] - df["mbi_total_score"] = df[mbi_domains].sum(axis=1) - df["mbi_status"] = (df["mbi_total_score"] >= 1).astype(int) - - mbi_columns = [ - "RID", - "EXAMDATE", - "decreased_motivation", - "emotional_dysregulation", - "impulse_dyscontrol", - "social_inappropriateness", - "abnormal_perception", - "mbi_total_score", - "mbi_status", - ] - mbi_df = df[mbi_columns].copy() - - return mbi_df - - -root_p = Path("__file__").resolve().parents[1] / "data" / "adni" - -# Load NPI or NPI-Q scores -npi_df = pd.read_csv(root_p / "NPI_22Aug2023.csv") - -mbi_df = _mbi_conversion(npi_df) - -# Save results - make sure to change name -mbi_df.to_csv(root_p / "adni_npi_mbi_status_20230829.csv", index=False) diff --git a/scripts/cimaq_npi_to_mbi.py b/scripts/cimaq_npi_to_mbi.py deleted file mode 100644 index c94ec4a..0000000 --- a/scripts/cimaq_npi_to_mbi.py +++ /dev/null @@ -1,87 +0,0 @@ -import pandas as pd -from pathlib import Path - -## Load NPIQ scores for CIMA-Q and convert to MBI framework - - -def _map_values(df): - # Map values for columns we are using - mapping = {"0_non": 0, "1_oui_léger": 1, "2_oui_modéré": 2, "3_oui_sévère": 3} - - columns_to_map = [ - "22901_apathie", - "22901_depression_dysphorie", - "22901_anxiete", - "22901_euphorie", - 
"22901_agitation_aggressivite", - "22901_irritabilite", - "22901_comp_moteur_aberrant", - "22901_impulsivite", - "22901_idees_delirantes", - "22901_hallucinations", - ] - - for column in columns_to_map: - df[column] = df[column].map(mapping) - - return df - - -def _mbi_conversion(df): - # Calculate MBI domains - df["decreased_motivation"] = df["22901_apathie"] - df["emotional_dysregulation"] = ( - df["22901_depression_dysphorie"] + df["22901_anxiete"] + df["22901_euphorie"] - ) - df["impulse_dyscontrol"] = ( - df["22901_agitation_aggressivite"] - + df["22901_irritabilite"] - + df["22901_comp_moteur_aberrant"] - ) - df["social_inappropriateness"] = df["22901_impulsivite"] - df["abnormal_perception"] = ( - df["22901_idees_delirantes"] + df["22901_hallucinations"] - ) - - # Calculate MBI total score - mbi_domains = [ - "decreased_motivation", - "emotional_dysregulation", - "impulse_dyscontrol", - "social_inappropriateness", - "abnormal_perception", - ] - - df["mbi_total_score"] = df[mbi_domains].sum(axis=1) - df["mbi_status"] = (df["mbi_total_score"] >= 1).astype(int) - - mbi_columns = [ - "PSCID", - "Date_taken", - "decreased_motivation", - "emotional_dysregulation", - "impulse_dyscontrol", - "social_inappropriateness", - "abnormal_perception", - "mbi_total_score", - "mbi_status", - ] - - mbi_df = df[mbi_columns].copy() - - return mbi_df - - -root_p = Path("__file__").resolve().parents[1] / "data" / "cimaq" - -# Load NPI-Q scores -npi_df = pd.read_csv( - root_p / "22901_inventaire_neuropsychiatrique_q_initiale.tsv", sep="\t" -) - -npi_df = npi_df.dropna(subset=["Date_taken"]) -npi_df = _map_values(npi_df) -mbi_df = _mbi_conversion(npi_df) - -# Save results - make sure to change name -mbi_df.to_csv(root_p / "cimaq_npiq_mbi_status_20230829.csv", index=False) diff --git a/scripts/get_cobre_participants.py b/scripts/get_cobre_participants.py deleted file mode 100644 index bec947a..0000000 --- a/scripts/get_cobre_participants.py +++ /dev/null @@ -1,33 +0,0 @@ -import pandas as pd -from pathlib import Path - -## Load cobre participant data and do some stuff - -root_p = Path("__file__").resolve().parents[1] / "data" - -df = pd.read_csv(root_p / "cobre" / "COBRE_phenotypic_data.csv") - -# Add columns SZ and CON -df["SZ"] = (df["Subject Type"] == "Patient").astype(int) -df["CON"] = (df["Subject Type"] == "Control").astype(int) - -# Add group column -df["group"] = df["Subject Type"] -df["group"] = df["group"].replace({"Patient": "SZ", "Control": "CON"}) - -# Rename some columns and map values for sex -df.rename( - columns={"Gender": "sex", "Current Age": "age", "Unnamed: 0": "participant_id"}, - inplace=True, -) -df["sex"] = df["sex"].map({"Male": 0, "Female": 1}) - -# Add cohort and site columns -df["cohort"] = "COBRE" -df["site"] = 0 - -cols = ["participant_id", "age", "sex", "site", "cohort", "group", "SZ", "CON"] -df = df[cols] - -# Save results -df.to_csv(root_p / "cobre_participants.csv", index=False) diff --git a/scripts/get_ds00030_participants.py b/scripts/get_ds00030_participants.py deleted file mode 100644 index e49d66c..0000000 --- a/scripts/get_ds00030_participants.py +++ /dev/null @@ -1,33 +0,0 @@ -import pandas as pd -from pathlib import Path - -## Load ds000030 participant data and do some stuff - -root_p = Path("__file__").resolve().parents[1] / "data" - -df = pd.read_csv(root_p / "ds000030" / "participants.csv") - -# Keep only rows where diagnosis is CONTROL or SCHZ -df = df[df["diagnosis"].isin(["CONTROL", "SCHZ"])] - -# Add columns SZ and CON -df["SZ"] = (df["diagnosis"] == 
"SCHZ").astype(int) -df["CON"] = (df["diagnosis"] == "CONTROL").astype(int) - -# Add group column -df["group"] = df["diagnosis"] -df["group"] = df["group"].replace({"SCHZ": "SZ", "CONTROL": "CON"}) - -# Rename some columns and map values for sex -df.rename(columns={"gender": "sex"}, inplace=True) -df["sex"] = df["sex"].map({"M": 0, "F": 1}) - -# Add COHORT and SITE columns -df["cohort"] = "ds000030" -df["site"] = 0 - -cols = ["participant_id", "age", "sex", "site", "cohort", "group", "SZ", "CON"] -df = df[cols] - -# Save results -df.to_csv(root_p / "ds000030_participants.csv", index=False) diff --git a/scripts/get_srpbs_participants.py b/scripts/get_srpbs_participants.py deleted file mode 100644 index fddba39..0000000 --- a/scripts/get_srpbs_participants.py +++ /dev/null @@ -1,31 +0,0 @@ -import pandas as pd -from pathlib import Path - -## Load srpbs participant data and do some stuff - -root_p = Path("__file__").resolve().parents[1] / "data" - -df = pd.read_csv(root_p / "srpbs" / "participants.tsv", sep="\t") - -# Keep only rows where diagnosis is 0 (control) or 4 (sz) -df = df[df["diag"].isin([0, 4])] - -# Add columns SZ and CON -df["SZ"] = (df["diag"] == 4).astype(int) -df["CON"] = (df["diag"] == 0).astype(int) - -# Add group column -df["group"] = df["diag"] -df["group"] = df["group"].replace({4: "SZ", 0: "CON"}) - -# Map values for sex, from total N in Tanaka paper -df["sex"] = df["sex"].map({1: 0, 2: 1}) - -# Add cohort and site columns -df["cohort"] = "SRPBS" - -cols = ["participant_id", "age", "sex", "site", "cohort", "group", "SZ", "CON"] -df = df[cols] - -# Save results -df.to_csv(root_p / "srpbs_participants.csv", index=False) diff --git a/scripts/h5_to_npy.py b/scripts/h5_to_npy.py new file mode 100644 index 0000000..805bb0b --- /dev/null +++ b/scripts/h5_to_npy.py @@ -0,0 +1,173 @@ +#import os dont think I need this? +import h5py +import numpy as np +import pandas as pd +from pathlib import Path + +def load_connectome(file, participant_id, session, identifier, no_session): + # Construct the dataset path + if no_session: + dataset_path = f"sub-{participant_id}/{identifier}_atlas-MIST_desc-64_connectome" + else: + dataset_path = f"sub-{participant_id}/ses-{session}/{identifier}_atlas-MIST_desc-64_connectome" + + dataset = file.get(dataset_path) + + connectome_npy = None # Initialise to None in case missing + if dataset is not None: + connectome_npy = dataset[...] 
# Load the dataset into a NumPy array + return connectome_npy + +def save_connectome(connectome_npy, output_p, identifier): + if connectome_npy is not None: + output_file_p = output_p / f"{identifier}.npy" + np.save(output_file_p, connectome_npy) + print(f"Saved data for {identifier}") + else: + print(f"Connectome for {identifier} not found or could not be loaded") + +def process_connectome_participant(row, base_dir, output_p): + identifier = row["identifier"] + participant_id = row["participant_id"] + session = row["ses"] + # Construct path to subject's HDF5 file + hdf5_file_p = base_dir / f"{participant_id}" / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5" + + if hdf5_file_p.exists(): + with h5py.File(hdf5_file_p, "r") as file: + connectome_npy = load_connectome(file, participant_id, session, identifier) + save_connectome(connectome_npy, output_p, identifier) + +def process_connectome_group(row, file, output_p, no_session): + identifier = row["identifier"] + participant_id = row["participant_id"] + session = row["ses"] + + connectome_npy = load_connectome(file, participant_id, session, identifier, no_session) + save_connectome(connectome_npy, output_p, identifier) + +def process_cobre(conn_p, df, output_dir): + base_dir = conn_p / "cobre_connectome-0.4.1" + hdf5_file_p = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" + output_p = output_dir / "cobre" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'cobre'] + + with h5py.File(hdf5_file_p, "r") as file: + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_group(row, file, output_p) + +def process_ds000030(conn_p, df, output_dir): + base_dir = conn_p / "ds000030_connectomes-0.4.1" + hdf5_file_p = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" + output_p = output_dir / "ds000030" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'ds000030'] + + with h5py.File(hdf5_file_p, "r") as file: + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_group(row, file, output_p, no_session=True) + +def process_hcpep(conn_p, df, output_dir): + base_dir = conn_p / "hcp-ep_connectome-0.4.1" + output_p = output_dir / "hcpep" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'hcpep'] + + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_participant(row, base_dir, output_p) + +def process_srpbs(conn_p, df, output_dir): + base_dir = conn_p / "srpbs_connectome-0.4.1" + output_p = output_dir / "srpbs" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'srpbs'] + + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_participant(row, base_dir, output_p) + +def process_adni(conn_p, final_file_p, output_dir): + base_dir = conn_p / "adni_connectomes-0.4.1" + final_file = final_file_p / "final_adni.tsv" + output_p = output_dir / "adni" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(final_file, sep="\t") + + # Loop through the filtered dataframe + for index, row in df.iterrows(): + 
process_connectome_participant(row, base_dir, output_p) + +def process_oasis3(conn_p, final_file_p, output_dir): + base_dir = conn_p / "oasis3_connectomes-0.4.1" + final_file = final_file_p / "final_oasis3.tsv" + output_p = output_dir / "oasis3" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(final_file, sep="\t") + + # Loop through the filtered dataframe + for index, row in df.iterrows(): + process_connectome_participant(row, base_dir, output_p) + +def process_cimaq(conn_p, final_file_p, output_dir): + base_dir = conn_p / "cimaq_connectomes-0.4.1" + final_file = final_file_p / "final_cimaq.tsv" + output_p = output_dir / "cimaq" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(final_file, sep="\t") + + # Loop through the filtered dataframe + for index, row in df.iterrows(): + process_connectome_participant(row, base_dir, output_p) + + +if __name__ == "__main__": + passed_qc_file_p = Path("/home/nclarke/projects/rrg-pbellec/nclarke/ad_sz/files/passed_qc_master.tsv") + conn_p = Path("/home/nclarke/scratch") + output_dir = Path("/home/nclarke/scratch/npy_connectome") + + # These datasets needed some extra steps aftre checking QC, so get identifiers from a different tsv + final_file_p = Path("/home/nclarke/projects/rrg-pbellec/nclarke/ad_sz/files") + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(passed_qc_file_p, sep="\t") + + #process_cobre(conn_p, df, output_dir) + process_ds000030(conn_p, df, output_dir) + #process_hcpep(conn_p, df, output_dir) + #process_srpbs(conn_p, df, output_dir) + #process_adni(conn_p, final_file_p, output_dir) + #process_cimaq(conn_p, final_file_p, output_dir) + #process_oasis3(conn_p, final_file_p, output_dir) + + + + + + + + diff --git a/scripts/oasis_wrangling_draft.ipynb b/scripts/oasis_wrangling_draft.ipynb deleted file mode 100644 index 8a875a8..0000000 --- a/scripts/oasis_wrangling_draft.ipynb +++ /dev/null @@ -1,401 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 33, - "id": "476173ef-0750-4714-9822-1cbe24db08ee", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from pathlib import Path\n", - "\n", - "## Load cobre participant data and do some stuff\n", - "\n", - "root_p = Path(\"__file__\").resolve().parents[1] / \"data\" / \"oasis\"\n", - "\n", - "df = pd.read_csv(root_p / \"clarken_8_28_2023_13_54_33.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "1aaf1c89-1f7c-4b75-be35-c120ad9484bc", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[HTML table rendering of df.head() omitted (5 rows × 120 columns); the equivalent text/plain output follows]
" - ], - "text/plain": [ - " MR ID Subject Age Scanner \\\n", - "0 OAS30001_MR_d0129 OAS30001 65.0 3.0T \n", - "1 OAS30001_MR_d0757 OAS30001 67.0 3.0T \n", - "2 OAS30001_MR_d2430 OAS30001 71.0 3.0T \n", - "3 OAS30001_MR_d3132 OAS30001 73.0 3.0T \n", - "4 OAS30001_MR_d3746 OAS30001 NaN NaN \n", - "\n", - " Scans M/F NPIQINF NPIQINFX \\\n", - "0 bold(3), T1w(2), T2w(2) F 2.0 NaN \n", - "1 bold(2), dwi(1), minIP(1), swi(1), T1w(2), T2s... F 2.0 NaN \n", - "2 angio(1), asl(1), dwi(2), fieldmap(3), FLAIR(1... F 2.0 NaN \n", - "3 asl(2), bold(2), dwi(2), fieldmap(3), T1w(1), ... F 2.0 NaN \n", - "4 angio(1), asl(1), bold(2), fieldmap(3), FLAIR(... F 2.0 NaN \n", - "\n", - " DEL DELSEV ... NEOPIF COGOTH COGOTHX COGOTHIF COGOTH2 COGOTH2X \\\n", - "0 0.0 NaN ... NaN 0.0 NaN NaN NaN NaN \n", - "1 0.0 NaN ... NaN 0.0 NaN NaN NaN NaN \n", - "2 0.0 NaN ... NaN 0.0 NaN NaN 0.0 NaN \n", - "3 0.0 NaN ... NaN 0.0 NaN NaN 0.0 NaN \n", - "4 0.0 NaN ... NaN 0.0 NaN NaN NaN NaN \n", - "\n", - " COGOTH2F COGOTH3 COGOTH3X COGOTH3F \n", - "0 NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN \n", - "2 NaN 0.0 NaN NaN \n", - "3 NaN 0.0 NaN NaN \n", - "4 NaN NaN NaN NaN \n", - "\n", - "[5 rows x 120 columns]" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "e24c5b24-8227-413c-b2b8-a5d537b0c01c", - "metadata": {}, - "outputs": [], - "source": [ - "# Rename some columns and map values for sex\n", - "df.rename(\n", - " columns={\"M/F\": \"sex\", \"Age\": \"age\", \"Subject\": \"participant_id\"},\n", - " inplace=True,\n", - ")\n", - "df[\"sex\"] = df[\"sex\"].map({\"M\": 0, \"F\": 1})" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "03e5edfa-c403-4cae-85be-e9fcb18e3057", - "metadata": {}, - "outputs": [], - "source": [ - "# Find controls in one of two ways, due to how missing data was coded \n", - "# Create a list of column names that need to be empty\n", - "empty_columns = [\"NORMCOG\", \"DEMENTED\", \"MCIAMEM\", \"MCIAPLUS\", \"MCIAPLAN\", \"MCIAPATT\", \"MCIAPEX\", \"MCIAPVIS\",\n", - " \"MCINON1\", \"MCIN1LAN\", \"MCIN1ATT\", \"MCIN1EX\", \"MCIN1VIS\", \"MCINON2\", \"MCIN2LAN\", \"MCIN2ATT\",\n", - " \"MCIN2EX\", \"MCIN2VIS\"]\n", - "\n", - "# Use boolean indexing to filter rows based on conditions\n", - "control_df = df[((df[\"dx1\"] == 'Cognitively healthy') & (df[empty_columns].isna().all(axis=1))) |\n", - " (df['NORMCOG'] == 1)]" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "af86c09e-09dc-4a9b-b856-d91e0ebbd79e", - "metadata": {}, - "outputs": [], - "source": [ - "# MCI only\n", - "# Filter rows where any of the MCI diagnoses are True\n", - "mci_df = df[df[['MCIAMEM', 'MCIAPLUS', 'MCIAPLAN', 'MCIAPATT', 'MCIAPEX',\n", - " 'MCIAPVIS', 'MCINON1', 'MCIN1LAN', 'MCIN1ATT', 'MCIN1EX',\n", - " 'MCIN1VIS', 'MCINON2', 'MCIN2LAN', 'MCIN2ATT', 'MCIN2EX', 'MCIN2VIS']].any(axis=1)]" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "3d728ec7-d0a5-492c-a26d-a3798af9739d", - "metadata": {}, - "outputs": [], - "source": [ - "# Alzheimers only\n", - "# Filter rows where DEMENTED column is True\n", - "ad_df = df[df['DEMENTED'] == 1]" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "5d62faec-57c0-469e-99f3-88a9d4ea064d", - "metadata": {}, - "outputs": [], - "source": [ - "# Add group column\n", - "control_df.loc[:, \"group\"] = \"CON\"\n", - "mci_df.loc[:, \"group\"] = \"MCI\"\n", - "ad_df.loc[:, \"group\"] = 
\"AD\"" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "1a0facb6-4dc4-4ecb-a332-131e8efbcbfe", - "metadata": {}, - "outputs": [], - "source": [ - "def _mbi_conversion(df):\n", - " # Calculate MBI domains\n", - " df[\"decreased_motivation\"] = df[\"APA\"]\n", - " df[\"emotional_dysregulation\"] = df[\"DEPD\"] + df[\"ANX\"] + df[\"ELAT\"]\n", - " df[\"impulse_dyscontrol\"] = df[\"AGIT\"] + df[\"IRR\"] + df[\"MOT\"]\n", - " df[\"social_inappropriateness\"] = df[\"DISN\"]\n", - " df[\"abnormal_perception\"] = df[\"DEL\"] + df[\"HALL\"]\n", - "\n", - " # Calculate MBI total score\n", - " mbi_domains = [\n", - " \"decreased_motivation\",\n", - " \"emotional_dysregulation\",\n", - " \"impulse_dyscontrol\",\n", - " \"social_inappropriateness\",\n", - " \"abnormal_perception\",\n", - " ]\n", - " df[\"MBI_total_score\"] = df[mbi_domains].sum(axis=1)\n", - " df[\"mbi_status\"] = (df[\"MBI_total_score\"] >= 1).astype(int)\n", - "\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "59093612-7228-4859-90af-2d4dc6d8578d", - "metadata": {}, - "outputs": [], - "source": [ - "mci_df = _mbi_conversion(mci_df.copy()) \n", - "dementia_df = _mbi_conversion(dementia_df.copy()) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a10dfed-85f3-41a7-8f3c-6581db86b56a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d1a3f3a-bb9e-46a0-8721-00604aa2248d", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1082b745-09a1-478a-b889-b35cef7518fd", - "metadata": {}, - "outputs": [], - "source": [ - "# Next steps: match connectomes to patient df to check which ones we have. select earliest one with mbi status.\n", - "# then separate into positive and negative. select all the correct columns, then merge with controls\n", - "\n", - "\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From cdf45dc2886f7d1c68765e42d4348b605824b38b Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 9 Apr 2024 15:51:55 -0400 Subject: [PATCH 09/14] add first draft combat file script --- scripts/prepare_combat_file.py | 61 ++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 scripts/prepare_combat_file.py diff --git a/scripts/prepare_combat_file.py b/scripts/prepare_combat_file.py new file mode 100644 index 0000000..5d09044 --- /dev/null +++ b/scripts/prepare_combat_file.py @@ -0,0 +1,61 @@ +from pathlib import Path +import numpy as np + +def load_connectomes(base_directory_p): + """ + Load connectome data from .npy files across multiple subdirectories. + + Parameters: + - base_directory_p: Path to the base directory containing subdirectories of connectome files. + + Returns: + - connectomes: A list of connectome matrices, one per participant across all subdirectories. 
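    Example (a minimal usage sketch; the base directory shown here is hypothetical):
        connectomes = load_connectomes(Path("/path/to/npy_connectome"))
        # each element is one participant's square matrix, e.g. shape (64, 64) for the MIST-64 atlas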
+ """ + connectomes = [] + # Iterate over each subdirectory in the base directory + for directory_p in base_directory_p.iterdir(): + if directory_p.is_dir(): + for file_p in directory_p.glob("*.npy"): + connectome = np.load(file_p) + connectomes.append(connectome) + return connectomes + +def connectome_to_edges_matrix(connectomes): + """ + Transform a list of connectomes into a matrix suitable for ComBat, + where rows are ROI pairs and columns are participants. + """ + num_participants = len(connectomes) + num_rois = connectomes[0].shape[0] + num_edges = num_rois * (num_rois - 1) // 2 + + edges_matrix = np.zeros((num_edges, num_participants)) + for i, connectome in enumerate(connectomes): + edge_index = 0 + for row in range(num_rois): + for col in range(row + 1, num_rois): + edges_matrix[edge_index, i] = connectome[row, col] + edge_index += 1 + + return edges_matrix + +# Set paths +base_directory_p = Path("/home/neuromod/ad_sz/data/npy_connectome") +output_p = "/home/neuromod/ad_sz/data/edges_matrix.tsv" + +# Load connectomes from all subdirectories. +connectomes = load_connectomes(base_directory_p) + +print(f"Number of connectomes loaded: {len(connectomes)}") +for connectome in connectomes[:5]: # Print the shape of the first 5 connectomes as a check + print(connectome.shape) + +# Transform into the ComBat-ready matrix. +edges_matrix = connectome_to_edges_matrix(connectomes) + +print(f"Edges matrix shape: {edges_matrix.shape}") + +# Save the edges_matrix to a .tsv file +np.savetxt(output_p, edges_matrix, delimiter='\t') + +print(f"Saved edges matrix to {output_p}") From 166331d524bccbe9fd4c7910beb4ba4783a6e5a3 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 29 Apr 2024 16:46:59 -0400 Subject: [PATCH 10/14] added data processing script --- scripts/prepare_input_data.py | 151 ++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 scripts/prepare_input_data.py diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py new file mode 100644 index 0000000..5962502 --- /dev/null +++ b/scripts/prepare_input_data.py @@ -0,0 +1,151 @@ +import h5py +import numpy as np +import pandas as pd +from pathlib import Path + +dataset_configs = { + 'cobre': { + 'connectome_path': "cobre_connectome-0.4.1_MIST_afc", + 'no_session': False, + 'subject_folder': False + }, + 'cimaq': { + 'connectome_path': "cimaq_connectome-0.4.1_MIST_afc", + 'no_session': False, + 'subject_folder': True + }, + 'hcpep': { + 'connectome_path': "hcp-ep_connectome-0.4.1", + 'no_session': False, + 'subject_folder': True + }, + 'ds000030': { + 'connectome_path': "ds000030_connectomes-0.4.1", + 'no_session': True, + 'subject_folder': False + }, +} + +def save_npy_connectome(connectome_npy, output_p, identifier): + # Not using this function currently + if connectome_npy is not None: + output_file_p = output_p / f"{identifier}.npy" + np.save(output_file_p, connectome_npy) + print(f"Saved data for {identifier}") + else: + print(f"Connectome for {identifier} not found or could not be loaded") + +def connectome_to_edges_matrix_with_ids(connectomes_with_ids): + # Update function to new one + num_participants = len(connectomes_with_ids) + if num_participants == 0: + return np.array([]), [] # Handling the case of no connectomes + + num_rois = connectomes_with_ids[0][1].shape[0] # Shape from the first connectome + num_edges = num_rois * (num_rois + 1) // 2 + + edges_matrix = np.zeros((num_edges, num_participants)) + identifiers = [] + + for i, (identifier, connectome) in 
enumerate(connectomes_with_ids): + identifiers.append(identifier) + edge_index = 0 + for row in range(num_rois): + for col in range(row, num_rois): # Include the diagonal + edges_matrix[edge_index, i] = connectome[row, col] + edge_index += 1 + + return edges_matrix, identifiers + + +def process_datasets(conn_p, df, dataset_name): + config = dataset_configs[dataset_name] + base_dir = conn_p / config['connectome_path'] + + filtered_df = df[df['dataset'] == dataset_name] + + connectomes_ids = [] + covariate_data_list = [] + for index, row in filtered_df.iterrows(): + participant_id = row["participant_id"] + identifier = row["identifier"] + session = None if config['no_session'] else row.get("ses", None) + site = row["site"] + sex = row["sex"] + age = row["age"] + #diagnosis = row["diagnosis"] + + # Adjust file path based on whether dataset uses a subject folder + if config['subject_folder']: + file_path = base_dir / participant_id / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5" + else: + file_path = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" + try: + with h5py.File(file_path, "r") as file: + connectome = load_connectome(file, participant_id, session, identifier, config['no_session']) + if connectome is not None: + connectomes_ids.append((identifier, connectome)) + #covariates = pd.DataFrame({'SITE': site, 'SEX': sex, 'AGE': age, 'DIAGNOSIS': diagnosis}, index=[identifier]) + covariates = pd.DataFrame({'SITE': site, 'SEX': sex, 'AGE': age}, index=[identifier]) + covariate_data_list.append(covariates) + except FileNotFoundError: + print(f"File not found: {file_path}") + + if connectomes_ids: + edges_matrix, identifiers = connectome_to_edges_matrix_with_ids(connectomes_ids) + edges_df = pd.DataFrame(edges_matrix.T, index=identifiers) + covariate_df = pd.concat(covariate_data_list, ignore_index=False) + return edges_df, covariate_df + else: + return pd.DataFrame(), pd.DataFrame() + +def load_connectome(file, participant_id, session, identifier, no_session): + # Path construction within .h5 file + dataset_path = f"sub-{participant_id}" + + if not no_session: + dataset_path += f"/ses-{session}" + + dataset_path += f"/{identifier}_atlas-MIST_desc-64_connectome" + dataset = file.get(dataset_path) + + if dataset is not None: + return dataset[...] 
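        # Illustrative example (the identifier format is assumed, not taken from the data):
        # for participant "12345", session "1" and identifier "sub-12345_ses-1_task-rest",
        # the dataset_path built above resolves to
        #   "sub-12345/ses-1/sub-12345_ses-1_task-rest_atlas-MIST_desc-64_connectome"
        # and dataset[...] reads the stored 64x64 connectome into memory as a NumPy array.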
+ else: + print(f"Connectome for {identifier} not found or could not be loaded") + return None + +def one_hot_encode_column(df, column_name): + dummies = pd.get_dummies(df[column_name], prefix=column_name) + df = pd.concat([df, dummies], axis=1) + df.drop(column_name, axis=1, inplace=True) + return df + + +if __name__ == "__main__": + conn_p = Path("/home/neuromod/ad_sz/data") + output_p = Path("/home/neuromod/ad_sz/data/input_data") + df = pd.read_csv("/home/neuromod/wrangling-phenotype/outputs/final_master_pheno.tsv",sep="\t") + + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + all_edges = [] + all_covariates = [] + for dataset in dataset_configs: + edges_df, covariates_df = process_datasets(conn_p, df, dataset) + all_edges.append(edges_df) + all_covariates.append(covariates_df) + + # Concatenate all edges and covariates + combined_edges_df = pd.concat(all_edges) + combined_covariates_df = pd.concat(all_covariates) + + # One-hot encode covariates + combined_covariates_df = one_hot_encode_column(combined_covariates_df, 'SEX') + #combined_covariates_df = one_hot_encode_column(combined_covariates_df, 'DIAGNOSIS') + + combined_edges_df.sort_index().to_csv(output_p / 'combined_edges.tsv', sep='\t', index=True) + combined_covariates_df.sort_index().to_csv(output_p / 'combat_covariates.tsv', sep='\t', index=True) + + print(f"Saved edges matrix and coavraites to {output_p}") From 09f19284e1932ef37d79158d53e71db650397eef Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 30 Apr 2024 17:10:39 -0400 Subject: [PATCH 11/14] continued work on data loading --- scripts/prepare_input_data.py | 185 +++++++++++++++++++++++----------- 1 file changed, 124 insertions(+), 61 deletions(-) diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py index 5962502..a256e6f 100644 --- a/scripts/prepare_input_data.py +++ b/scripts/prepare_input_data.py @@ -26,6 +26,22 @@ }, } +def load_connectome(file, participant_id, session, identifier, no_session): + # Path construction within .h5 file + dataset_path = f"sub-{participant_id}" + + if not no_session: + dataset_path += f"/ses-{session}" + + dataset_path += f"/{identifier}_atlas-MIST_desc-64_connectome" + dataset = file.get(dataset_path) + + if dataset is not None: + return dataset[...] 
+ else: + print(f"Connectome for {identifier} not found") + return None + def save_npy_connectome(connectome_npy, output_p, identifier): # Not using this function currently if connectome_npy is not None: @@ -33,47 +49,63 @@ def save_npy_connectome(connectome_npy, output_p, identifier): np.save(output_file_p, connectome_npy) print(f"Saved data for {identifier}") else: - print(f"Connectome for {identifier} not found or could not be loaded") - -def connectome_to_edges_matrix_with_ids(connectomes_with_ids): - # Update function to new one - num_participants = len(connectomes_with_ids) - if num_participants == 0: - return np.array([]), [] # Handling the case of no connectomes - - num_rois = connectomes_with_ids[0][1].shape[0] # Shape from the first connectome - num_edges = num_rois * (num_rois + 1) // 2 - - edges_matrix = np.zeros((num_edges, num_participants)) - identifiers = [] - - for i, (identifier, connectome) in enumerate(connectomes_with_ids): - identifiers.append(identifier) - edge_index = 0 - for row in range(num_rois): - for col in range(row, num_rois): # Include the diagonal - edges_matrix[edge_index, i] = connectome[row, col] - edge_index += 1 - - return edges_matrix, identifiers - + print(f"Connectome for {identifier} not found") + +def connectome_to_edge_matrix(connectome): + """ + Transform a single connectome into an edge matrix. + Extract the upper triangular part of the connectome, including the diagonal. + """ + # Get the upper triangular indices including the diagonal + row_indices, col_indices = np.triu_indices_from(connectome) + + # Extract the values at these indices + edge_matrix = connectome[row_indices, col_indices] + + return edge_matrix + +def process_connectomes(connectomes_by_participant): + """ + For multiple connectomes per participant, average them, apply Fisher z transform, and convert to an edge matrix. 
+ """ + edges_matrix_dict = {} + for participant_id, connectomes in connectomes_by_participant.items(): + # Compute the mean connectome + mean_connectome = np.mean(connectomes, axis=0) + # Apply Fisher Z transformation + transformed_connectome = np.arctanh(mean_connectome) + # Convert connectome to a marix for ComBat + edges_matrix = connectome_to_edge_matrix(transformed_connectome) + edges_matrix_dict[participant_id] = edges_matrix + + return edges_matrix_dict + + +def matrix_dict_to_df(edges_matrix_dict): + # Create a DataFrame from the edge matrices + # Convert dictionary values to a 2D numpy array + edge_matrix_combined = np.column_stack(list(edges_matrix_dict.values())) + participants = list(edges_matrix_dict.keys()) + edges_df = pd.DataFrame(edge_matrix_combined, columns=participants) + + # Transpose so suitable for ComBat + edges_df = edges_df.T + + return edges_df def process_datasets(conn_p, df, dataset_name): config = dataset_configs[dataset_name] base_dir = conn_p / config['connectome_path'] + # Filter the df for required scans for the dataset filtered_df = df[df['dataset'] == dataset_name] + valid_rows = [] # To store rows where the connectome was found - connectomes_ids = [] - covariate_data_list = [] + connectomes_by_participant = {} for index, row in filtered_df.iterrows(): participant_id = row["participant_id"] identifier = row["identifier"] session = None if config['no_session'] else row.get("ses", None) - site = row["site"] - sex = row["sex"] - age = row["age"] - #diagnosis = row["diagnosis"] # Adjust file path based on whether dataset uses a subject folder if config['subject_folder']: @@ -84,36 +116,49 @@ def process_datasets(conn_p, df, dataset_name): with h5py.File(file_path, "r") as file: connectome = load_connectome(file, participant_id, session, identifier, config['no_session']) if connectome is not None: - connectomes_ids.append((identifier, connectome)) - #covariates = pd.DataFrame({'SITE': site, 'SEX': sex, 'AGE': age, 'DIAGNOSIS': diagnosis}, index=[identifier]) - covariates = pd.DataFrame({'SITE': site, 'SEX': sex, 'AGE': age}, index=[identifier]) - covariate_data_list.append(covariates) + # Append the connectome to the participant's list in the dictionary + if participant_id not in connectomes_by_participant: + connectomes_by_participant[participant_id] = [] + connectomes_by_participant[participant_id].append(connectome) + + # Add the row to valid_rows if the connectome is found + valid_rows.append(row) + except FileNotFoundError: print(f"File not found: {file_path}") - if connectomes_ids: - edges_matrix, identifiers = connectome_to_edges_matrix_with_ids(connectomes_ids) - edges_df = pd.DataFrame(edges_matrix.T, index=identifiers) - covariate_df = pd.concat(covariate_data_list, ignore_index=False) - return edges_df, covariate_df - else: - return pd.DataFrame(), pd.DataFrame() + # Process connectomes + if connectomes_by_participant: + edges_matrix_dict = process_connectomes(connectomes_by_participant) + edges_df = matrix_dict_to_df(edges_matrix_dict) -def load_connectome(file, participant_id, session, identifier, no_session): - # Path construction within .h5 file - dataset_path = f"sub-{participant_id}" + # Convert the list of valid rows to a DataFrame + valid_df = pd.DataFrame(valid_rows) - if not no_session: - dataset_path += f"/ses-{session}" + return edges_df, valid_df - dataset_path += f"/{identifier}_atlas-MIST_desc-64_connectome" - dataset = file.get(dataset_path) +def create_pheno_df(valid_df): + # Group by 'participant_id' and aggregate, taking the 
first entry for variables that do not change (for this use case) + aggregation_functions = { + 'sex': 'first', + 'age': 'first', + 'diagnosis': 'first', + 'site': 'first', + 'mean_fd_scrubbed': 'mean', # Averaging mean framewise displacement across scans + 'mbi_status': 'first' + } + pheno_df = valid_df.groupby('participant_id').agg(aggregation_functions) - if dataset is not None: - return dataset[...] - else: - print(f"Connectome for {identifier} not found or could not be loaded") - return None + return pheno_df + +def create_covariates_df(valid_df): + # Group by participant_id and select the first entry for each group + first_entries = valid_df.groupby('participant_id').first() + + # Extract the relevant covariates + covariates_df = first_entries[['site', 'sex', 'age', 'diagnosis']] + + return covariates_df def one_hot_encode_column(df, column_name): dummies = pd.get_dummies(df[column_name], prefix=column_name) @@ -121,6 +166,16 @@ def one_hot_encode_column(df, column_name): df.drop(column_name, axis=1, inplace=True) return df +def process_covariates(combined_covariates_df): + processed_covariates_df = combined_covariates_df.copy() + # Rename diagnosis variable + processed_covariates_df['diagnosis'] = processed_covariates_df['diagnosis'].replace('PSYC', 'SCHZ') + + # One-hot encode covariates + processed_covariates_df = one_hot_encode_column(processed_covariates_df, 'sex') + processed_covariates_df = one_hot_encode_column(processed_covariates_df, 'diagnosis') + + return processed_covariates_df if __name__ == "__main__": conn_p = Path("/home/neuromod/ad_sz/data") @@ -131,21 +186,29 @@ def one_hot_encode_column(df, column_name): output_p.mkdir(parents=True, exist_ok=True) all_edges = [] + all_pheno = [] all_covariates = [] for dataset in dataset_configs: - edges_df, covariates_df = process_datasets(conn_p, df, dataset) + edges_df, valid_df = process_datasets(conn_p, df, dataset) + pheno_df = create_pheno_df(valid_df) + covariates_df = create_covariates_df(valid_df) + + # Collect data per dataset all_edges.append(edges_df) + all_pheno.append(pheno_df) all_covariates.append(covariates_df) - # Concatenate all edges and covariates + # Concatenate data across datasets combined_edges_df = pd.concat(all_edges) + combined_pheno_df = pd.concat(all_pheno) combined_covariates_df = pd.concat(all_covariates) - # One-hot encode covariates - combined_covariates_df = one_hot_encode_column(combined_covariates_df, 'SEX') - #combined_covariates_df = one_hot_encode_column(combined_covariates_df, 'DIAGNOSIS') + # Process covariates and pheno data suitable for analysis + processed_covariates_df = process_covariates(combined_covariates_df) - combined_edges_df.sort_index().to_csv(output_p / 'combined_edges.tsv', sep='\t', index=True) - combined_covariates_df.sort_index().to_csv(output_p / 'combat_covariates.tsv', sep='\t', index=True) + # Ensure order of data is identical and output + combined_edges_df.sort_index().to_csv(output_p / 'combat_edges.tsv', sep='\t', index=True) + processed_covariates_df.sort_index().to_csv(output_p / 'combat_covariates.tsv', sep='\t', index=True) + combined_pheno_df.sort_index().to_csv(output_p / 'pheno.tsv', sep='\t', index=True) - print(f"Saved edges matrix and coavraites to {output_p}") + print(f"Saved edges matrix, covariates and pheno to {output_p}") From 0a8ab25a36611046b5216e65222408c521f541b9 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 6 May 2024 13:09:26 -0400 Subject: [PATCH 12/14] fully working script --- .gitignore | 2 +- scripts/prepare_input_data.py | 146 
++++++++++++++++++++++------------ 2 files changed, 94 insertions(+), 54 deletions(-) diff --git a/.gitignore b/.gitignore index 0d05551..918137f 100644 --- a/.gitignore +++ b/.gitignore @@ -113,7 +113,7 @@ venv/ ENV/ env.bak/ venv.bak/ -cwas_neuro_env/ +ad_sz_env/ # Spyder project settings .spyderproject diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py index a256e6f..6ba6fed 100644 --- a/scripts/prepare_input_data.py +++ b/scripts/prepare_input_data.py @@ -4,28 +4,24 @@ from pathlib import Path dataset_configs = { - 'cobre': { - 'connectome_path': "cobre_connectome-0.4.1_MIST_afc", - 'no_session': False, - 'subject_folder': False + "cobre": { + "connectome_path": "cobre_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": False, }, - 'cimaq': { - 'connectome_path': "cimaq_connectome-0.4.1_MIST_afc", - 'no_session': False, - 'subject_folder': True + "cimaq": { + "connectome_path": "cimaq_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": True, }, - 'hcpep': { - 'connectome_path': "hcp-ep_connectome-0.4.1", - 'no_session': False, - 'subject_folder': True - }, - 'ds000030': { - 'connectome_path': "ds000030_connectomes-0.4.1", - 'no_session': True, - 'subject_folder': False + "hcpep": { + "connectome_path": "hcp-ep_connectome-0.4.1", + "no_session": False, + "subject_folder": True, }, } + def load_connectome(file, participant_id, session, identifier, no_session): # Path construction within .h5 file dataset_path = f"sub-{participant_id}" @@ -42,6 +38,7 @@ def load_connectome(file, participant_id, session, identifier, no_session): print(f"Connectome for {identifier} not found") return None + def save_npy_connectome(connectome_npy, output_p, identifier): # Not using this function currently if connectome_npy is not None: @@ -51,6 +48,7 @@ def save_npy_connectome(connectome_npy, output_p, identifier): else: print(f"Connectome for {identifier} not found") + def connectome_to_edge_matrix(connectome): """ Transform a single connectome into an edge matrix. @@ -64,6 +62,7 @@ def connectome_to_edge_matrix(connectome): return edge_matrix + def process_connectomes(connectomes_by_participant): """ For multiple connectomes per participant, average them, apply Fisher z transform, and convert to an edge matrix. 
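For reference, the edge extraction and Fisher-z averaging shown in the hunks above can be exercised on a toy matrix as in the sketch below. This is an illustrative sketch only: the 64x64 size mirrors the MIST-64 connectomes, while the random toy values and the clip before np.arctanh are assumptions of the sketch (the script itself applies np.arctanh directly to the averaged connectome).

import numpy as np

rng = np.random.default_rng(0)
a = rng.uniform(-0.8, 0.8, size=(64, 64))
toy_connectome = (a + a.T) / 2           # symmetric, correlation-like values
np.fill_diagonal(toy_connectome, 1.0)

# connectome_to_edge_matrix: upper triangle including the diagonal
rows, cols = np.triu_indices_from(toy_connectome)
edges = toy_connectome[rows, cols]       # 64 * 65 / 2 = 2080 edges per participant

# process_connectomes: average repeated scans, then Fisher z-transform.
# The clip keeps the diagonal of 1.0 finite in this sketch, since np.arctanh(1.0) is +inf.
mean_connectome = np.mean([toy_connectome, toy_connectome], axis=0)
z_connectome = np.arctanh(np.clip(mean_connectome, -1 + 1e-7, 1 - 1e-7))

print(edges.shape, z_connectome.shape)   # (2080,) (64, 64)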
@@ -82,7 +81,7 @@ def process_connectomes(connectomes_by_participant): def matrix_dict_to_df(edges_matrix_dict): - # Create a DataFrame from the edge matrices + # Create a df from the edge matrices # Convert dictionary values to a 2D numpy array edge_matrix_combined = np.column_stack(list(edges_matrix_dict.values())) participants = list(edges_matrix_dict.keys()) @@ -93,28 +92,35 @@ def matrix_dict_to_df(edges_matrix_dict): return edges_df + def process_datasets(conn_p, df, dataset_name): config = dataset_configs[dataset_name] - base_dir = conn_p / config['connectome_path'] + base_dir = conn_p / config["connectome_path"] # Filter the df for required scans for the dataset - filtered_df = df[df['dataset'] == dataset_name] + filtered_df = df[df["dataset"] == dataset_name] valid_rows = [] # To store rows where the connectome was found connectomes_by_participant = {} for index, row in filtered_df.iterrows(): participant_id = row["participant_id"] identifier = row["identifier"] - session = None if config['no_session'] else row.get("ses", None) + session = None if config["no_session"] else row.get("ses", None) # Adjust file path based on whether dataset uses a subject folder - if config['subject_folder']: - file_path = base_dir / participant_id / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5" + if config["subject_folder"]: + file_path = ( + base_dir + / participant_id + / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5" + ) else: file_path = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" try: with h5py.File(file_path, "r") as file: - connectome = load_connectome(file, participant_id, session, identifier, config['no_session']) + connectome = load_connectome( + file, participant_id, session, identifier, config["no_session"] + ) if connectome is not None: # Append the connectome to the participant's list in the dictionary if participant_id not in connectomes_by_participant: @@ -127,88 +133,122 @@ def process_datasets(conn_p, df, dataset_name): except FileNotFoundError: print(f"File not found: {file_path}") + edges_df = pd.DataFrame() # Process connectomes if connectomes_by_participant: edges_matrix_dict = process_connectomes(connectomes_by_participant) edges_df = matrix_dict_to_df(edges_matrix_dict) - # Convert the list of valid rows to a DataFrame + # Convert the list of valid rows to a df valid_df = pd.DataFrame(valid_rows) return edges_df, valid_df + def create_pheno_df(valid_df): # Group by 'participant_id' and aggregate, taking the first entry for variables that do not change (for this use case) aggregation_functions = { - 'sex': 'first', - 'age': 'first', - 'diagnosis': 'first', - 'site': 'first', - 'mean_fd_scrubbed': 'mean', # Averaging mean framewise displacement across scans - 'mbi_status': 'first' + "sex": "first", + "age": "first", + "diagnosis": "first", + "site": "first", + "mean_fd_scrubbed": "mean", # Averaging mean framewise displacement across scans + "group": "first", } - pheno_df = valid_df.groupby('participant_id').agg(aggregation_functions) + pheno_df = valid_df.groupby("participant_id").agg(aggregation_functions) return pheno_df -def create_covariates_df(valid_df): + +def create_covariates_df_old(valid_df): # Group by participant_id and select the first entry for each group - first_entries = valid_df.groupby('participant_id').first() + first_entries = valid_df.groupby("participant_id").first() # Extract the relevant covariates - covariates_df = first_entries[['site', 'sex', 'age', 'diagnosis']] + covariates_df = first_entries[["site", "sex", "age", 
"diagnosis"]] return covariates_df + def one_hot_encode_column(df, column_name): dummies = pd.get_dummies(df[column_name], prefix=column_name) df = pd.concat([df, dummies], axis=1) df.drop(column_name, axis=1, inplace=True) return df -def process_covariates(combined_covariates_df): - processed_covariates_df = combined_covariates_df.copy() - # Rename diagnosis variable - processed_covariates_df['diagnosis'] = processed_covariates_df['diagnosis'].replace('PSYC', 'SCHZ') - # One-hot encode covariates - processed_covariates_df = one_hot_encode_column(processed_covariates_df, 'sex') - processed_covariates_df = one_hot_encode_column(processed_covariates_df, 'diagnosis') +def one_hot_encode_column_no_prefix(df, column_name): + dummies = pd.get_dummies(df[column_name]) + df = pd.concat([df, dummies], axis=1) + df.drop(column_name, axis=1, inplace=True) + return df + + +def create_covariates_df(pheno_df): + # Extract the relevant covariates from pheno_df + covariates_df = pheno_df[ + [ + "site", + "sex_male", + "sex_female", + "age", + "diagnosis_MCI", + "diagnosis_ADD", + "diagnosis_CON", + "diagnosis_SCHZ", + ] + ] # split CON into controls from AD and SCHZ datasets? + + return covariates_df - return processed_covariates_df if __name__ == "__main__": conn_p = Path("/home/neuromod/ad_sz/data") output_p = Path("/home/neuromod/ad_sz/data/input_data") - df = pd.read_csv("/home/neuromod/wrangling-phenotype/outputs/final_master_pheno.tsv",sep="\t") + df = pd.read_csv( + "/home/neuromod/wrangling-phenotype/outputs/final_qc_pheno.tsv", sep="\t" + ) if not output_p.exists(): output_p.mkdir(parents=True, exist_ok=True) all_edges = [] all_pheno = [] - all_covariates = [] for dataset in dataset_configs: edges_df, valid_df = process_datasets(conn_p, df, dataset) pheno_df = create_pheno_df(valid_df) - covariates_df = create_covariates_df(valid_df) # Collect data per dataset all_edges.append(edges_df) all_pheno.append(pheno_df) - all_covariates.append(covariates_df) # Concatenate data across datasets combined_edges_df = pd.concat(all_edges) combined_pheno_df = pd.concat(all_pheno) - combined_covariates_df = pd.concat(all_covariates) - # Process covariates and pheno data suitable for analysis - processed_covariates_df = process_covariates(combined_covariates_df) + # One-hot encode columns + combined_pheno_df = one_hot_encode_column(combined_pheno_df, "sex") + combined_pheno_df = one_hot_encode_column(combined_pheno_df, "diagnosis") + combined_pheno_df = one_hot_encode_column_no_prefix(combined_pheno_df, "group") + + # Create covariates_df from pheno_df + covariates_df = create_covariates_df(combined_pheno_df) + + # Capitalize column names, necessary for combat etc + covariates_df.columns = [x.upper() for x in covariates_df.columns] + combined_pheno_df.columns = [x.upper() for x in combined_pheno_df.columns] + + print("Shape of combined_edges_df = ", combined_edges_df.shape) + print("Shape of processed_covariates_df = ", covariates_df.shape) + print("Shape of pheno_hot_df = ", combined_pheno_df.shape) # Ensure order of data is identical and output - combined_edges_df.sort_index().to_csv(output_p / 'combat_edges.tsv', sep='\t', index=True) - processed_covariates_df.sort_index().to_csv(output_p / 'combat_covariates.tsv', sep='\t', index=True) - combined_pheno_df.sort_index().to_csv(output_p / 'pheno.tsv', sep='\t', index=True) + combined_edges_df.sort_index().to_csv( + output_p / "combat_edges.tsv", sep="\t", index=True + ) + covariates_df.sort_index().to_csv( + output_p / "combat_covariates.tsv", sep="\t", 
index=True + ) + combined_pheno_df.sort_index().to_csv(output_p / "pheno.tsv", sep="\t", index=True) print(f"Saved edges matrix, covariates and pheno to {output_p}") From 2b88c3c7a72e5c2d95a2c508bc40e0c7944bde6c Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 7 May 2024 12:57:17 -0400 Subject: [PATCH 13/14] added global signal --- scripts/prepare_input_data.py | 52 ++++++++++++++++------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py index 6ba6fed..d81c26e 100644 --- a/scripts/prepare_input_data.py +++ b/scripts/prepare_input_data.py @@ -4,18 +4,8 @@ from pathlib import Path dataset_configs = { - "cobre": { - "connectome_path": "cobre_connectome-0.4.1_MIST_afc", - "no_session": False, - "subject_folder": False, - }, - "cimaq": { - "connectome_path": "cimaq_connectome-0.4.1_MIST_afc", - "no_session": False, - "subject_folder": True, - }, - "hcpep": { - "connectome_path": "hcp-ep_connectome-0.4.1", + "adni": { + "connectome_path": "adni_connectome-0.4.1_MIST_afc", "no_session": False, "subject_folder": True, }, @@ -65,19 +55,23 @@ def connectome_to_edge_matrix(connectome): def process_connectomes(connectomes_by_participant): """ - For multiple connectomes per participant, average them, apply Fisher z transform, and convert to an edge matrix. + Apply Fisher z transform, compute the global signal and convert to an edge matrix. For multiple connectomes per participant, connectomes are averaged first. """ edges_matrix_dict = {} + global_signal_dict = {} for participant_id, connectomes in connectomes_by_participant.items(): # Compute the mean connectome mean_connectome = np.mean(connectomes, axis=0) # Apply Fisher Z transformation transformed_connectome = np.arctanh(mean_connectome) + # Compute global signal (mean of the Z values) + global_signal = np.mean(transformed_connectome) + global_signal_dict[participant_id] = global_signal # Convert connectome to a marix for ComBat edges_matrix = connectome_to_edge_matrix(transformed_connectome) edges_matrix_dict[participant_id] = edges_matrix - return edges_matrix_dict + return edges_matrix_dict, global_signal_dict def matrix_dict_to_df(edges_matrix_dict): @@ -136,27 +130,31 @@ def process_datasets(conn_p, df, dataset_name): edges_df = pd.DataFrame() # Process connectomes if connectomes_by_participant: - edges_matrix_dict = process_connectomes(connectomes_by_participant) + edges_matrix_dict, global_signal_dict = process_connectomes( + connectomes_by_participant + ) edges_df = matrix_dict_to_df(edges_matrix_dict) # Convert the list of valid rows to a df valid_df = pd.DataFrame(valid_rows) - return edges_df, valid_df + return edges_df, valid_df, global_signal_dict -def create_pheno_df(valid_df): +def create_pheno_df(valid_df, global_signal_dict): # Group by 'participant_id' and aggregate, taking the first entry for variables that do not change (for this use case) aggregation_functions = { "sex": "first", "age": "first", "diagnosis": "first", "site": "first", + "dataset": "first", "mean_fd_scrubbed": "mean", # Averaging mean framewise displacement across scans "group": "first", } pheno_df = valid_df.groupby("participant_id").agg(aggregation_functions) - + # Add global signal to pheno_df + pheno_df["mean_conn"] = pheno_df.index.map(global_signal_dict) return pheno_df @@ -203,11 +201,9 @@ def create_covariates_df(pheno_df): if __name__ == "__main__": - conn_p = Path("/home/neuromod/ad_sz/data") - output_p = Path("/home/neuromod/ad_sz/data/input_data") - df = 
pd.read_csv( - "/home/neuromod/wrangling-phenotype/outputs/final_qc_pheno.tsv", sep="\t" - ) + conn_p = Path("/home/nclarke/scratch") + output_p = Path("/home/nclarke") + df = pd.read_csv("/home/nclarke/final_qc_pheno.tsv", sep="\t") if not output_p.exists(): output_p.mkdir(parents=True, exist_ok=True) @@ -215,8 +211,8 @@ def create_covariates_df(pheno_df): all_edges = [] all_pheno = [] for dataset in dataset_configs: - edges_df, valid_df = process_datasets(conn_p, df, dataset) - pheno_df = create_pheno_df(valid_df) + edges_df, valid_df, global_signal_dict = process_datasets(conn_p, df, dataset) + pheno_df = create_pheno_df(valid_df, global_signal_dict) # Collect data per dataset all_edges.append(edges_df) @@ -229,7 +225,7 @@ def create_covariates_df(pheno_df): # One-hot encode columns combined_pheno_df = one_hot_encode_column(combined_pheno_df, "sex") combined_pheno_df = one_hot_encode_column(combined_pheno_df, "diagnosis") - combined_pheno_df = one_hot_encode_column_no_prefix(combined_pheno_df, "group") + combined_pheno_df = one_hot_encode_column(combined_pheno_df, "group") # Create covariates_df from pheno_df covariates_df = create_covariates_df(combined_pheno_df) @@ -239,8 +235,8 @@ def create_covariates_df(pheno_df): combined_pheno_df.columns = [x.upper() for x in combined_pheno_df.columns] print("Shape of combined_edges_df = ", combined_edges_df.shape) - print("Shape of processed_covariates_df = ", covariates_df.shape) - print("Shape of pheno_hot_df = ", combined_pheno_df.shape) + print("Shape of covariates_df = ", covariates_df.shape) + print("Shape of combined_pheno_df = ", combined_pheno_df.shape) # Ensure order of data is identical and output combined_edges_df.sort_index().to_csv( From 230f925f9ae6f5dbd765d290d860b22c2587d51a Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 7 May 2024 17:55:49 -0400 Subject: [PATCH 14/14] fixed adni session bug --- scripts/prepare_input_data.py | 40 ++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py index d81c26e..34f5f76 100644 --- a/scripts/prepare_input_data.py +++ b/scripts/prepare_input_data.py @@ -4,7 +4,37 @@ from pathlib import Path dataset_configs = { - "adni": { + "cobre": { + "connectome_path": "cobre_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": False, + }, + "cimaq": { + "connectome_path": "cimaq_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": True, + }, + "hcpep": { + "connectome_path": "hcp-ep_connectome-0.4.1", + "no_session": False, + "subject_folder": True, + }, + "ds000030": { + "connectome_path": "ds000030_connectomes-0.4.1", + "no_session": True, + "subject_folder": False, + }, + "oasis3": { + "connectome_path": "oasis3_connectomes-0.4.1", + "no_session": False, + "subject_folder": True, + }, + "srpbs": { + "connectome_path": "srpbs_connectome-0.4.1", + "no_session": False, + "subject_folder": True, + }, + "adni": { "connectome_path": "adni_connectome-0.4.1_MIST_afc", "no_session": False, "subject_folder": True, @@ -202,8 +232,12 @@ def create_covariates_df(pheno_df): if __name__ == "__main__": conn_p = Path("/home/nclarke/scratch") - output_p = Path("/home/nclarke") - df = pd.read_csv("/home/nclarke/final_qc_pheno.tsv", sep="\t") + output_p = Path("/home/nclarke/ad_sz/data/input_data") + df = pd.read_csv("/home/nclarke/ad_sz/data/final_qc_pheno.tsv", sep="\t") + + # Convert adni sessions to strings so can formulate paths correctly + mask = df["dataset"] == "adni" + 
df.loc[mask, "ses"] = pd.to_datetime(df.loc[mask, "ses"]).dt.strftime("%Y%m%d") if not output_p.exists(): output_p.mkdir(parents=True, exist_ok=True)
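To illustrate the session fix above (an aside, not part of the diff): the intent is that ADNI session labels become compact YYYYMMDD strings before they are used to formulate connectome paths, while sessions from the other datasets are left untouched. The toy values below are hypothetical; the real final_qc_pheno.tsv may store ADNI sessions in a different date-like form.

import pandas as pd

toy = pd.DataFrame(
    {
        "dataset": ["adni", "adni", "cobre"],
        "ses": ["2011-06-15", "2012-01-03", "A"],
    }
)
mask = toy["dataset"] == "adni"
toy.loc[mask, "ses"] = pd.to_datetime(toy.loc[mask, "ses"]).dt.strftime("%Y%m%d")
print(toy["ses"].tolist())  # ['20110615', '20120103', 'A']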