From 1557f3611b4758b747ba84c425240141bb18cc3b Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 22 Aug 2023 14:30:47 -0400 Subject: [PATCH 01/14] added scripts --- scripts/npi_to_mbi.py | 51 +++++++ scripts/significance_corr.py | 265 +++++++++++++++++++++++++++++++++++ 2 files changed, 316 insertions(+) create mode 100644 scripts/npi_to_mbi.py create mode 100644 scripts/significance_corr.py diff --git a/scripts/npi_to_mbi.py b/scripts/npi_to_mbi.py new file mode 100644 index 0000000..5aa3607 --- /dev/null +++ b/scripts/npi_to_mbi.py @@ -0,0 +1,51 @@ +import pandas as pd +from pathlib import Path + +## Load NPIQ or NPI scores and convert to MBI framework +## Domains: https://adni.bitbucket.io/reference/npi.html + + +def _mbi_conversion(df): + # Calculate MBI domains + df["decreased_motivation"] = df["NPIG"] + df["emotional_dysregulation"] = df["NPID"] + df["NPIE"] + df["NPIF"] + df["impulse_dyscontrol"] = df["NPIC"] + df["NPII"] + df["NPIJ"] + df["social_inappropriateness"] = df["NPIH"] + df["abnormal_perception"] = df["NPIA"] + df["NPIB"] + + # Calculate MBI total score + mbi_domains = [ + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + ] + df["MBI_total_score"] = df[mbi_domains].sum(axis=1) + df["mbi_status"] = (df["MBI_total_score"] >= 1).astype(int) + + mbi_columns = [ + "RID", + "EXAMDATE", + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + "MBI_total_score", + "mbi_status", + ] + mbi_df = df[mbi_columns].copy() + + return mbi_df + + +root_p = Path("__file__").resolve().parents[1] / "data" / "adni" + +# Load NPI or NPI-Q scores +npi_df = pd.read_csv(root_p / "NPIQ_22Aug2023.csv") + +mbi_df = _mbi_conversion(npi_df) + +# Save results - make sure to change name +mbi_df.to_csv(root_p / "adni_npiq_mbi_status_20230822.csv", index=False) diff --git a/scripts/significance_corr.py b/scripts/significance_corr.py new file mode 100644 index 0000000..05a1e7e --- /dev/null +++ b/scripts/significance_corr.py @@ -0,0 +1,265 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +import util +import time +import os +import itertools +from statsmodels.stats.multitest import multipletests +from scipy.stats import pearsonr +from argparse import ArgumentParser + + +# SEBASTIEN URCHS +def p_permut(empirical_value, permutation_values): + n_permutation = len(permutation_values) + if empirical_value >= 0: + return (np.sum(permutation_values > empirical_value) + 1) / (n_permutation + 1) + return (np.sum(permutation_values < empirical_value) + 1) / (n_permutation + 1) + + +def filter_fdr(df, contrasts): + df_filtered = df[ + (df["pair0"].isin(contrasts)) & (df["pair1"].isin(contrasts)) + ].copy() + _, fdr, _, _ = multipletests(df_filtered["pval"], method="fdr_bh") + df_filtered["fdr_filtered"] = fdr + return df_filtered + + +def mat_form(df, contrasts, value="betamap_corr"): + n = len(contrasts) + d = dict(zip(contrasts, range(n))) + mat = np.zeros((n, n)) + for c in contrasts: + # fill out vertical strip of mat + for i in range(n): + if i == d[c]: + val = 1 + else: + val = df[ + ((df["pair0"] == c) | (df["pair1"] == c)) + & ((df["pair0"] == contrasts[i]) | (df["pair1"] == contrasts[i])) + ][value] + mat[i, d[c]] = val + mat[d[c], i] = val + return pd.DataFrame(mat, columns=contrasts, index=contrasts) + + +def make_matrices(df, contrasts, fdr="fdr_filtered"): + "Param fdr can be set to 
'fdr_filtered': FDR is performed using the pvalues only from the chosen contrasts" + " or 'fdr': values taken from FDR performed on full set of 42 contrasts" + if fdr == "fdr_filtered": + df = filter_fdr(df, contrasts) + mat_corr = mat_form(df, contrasts, value="betamap_corr") + mat_pval = mat_form(df, contrasts, value="pval") + mat_fdr = mat_form(df, contrasts, value=fdr) + return mat_corr, mat_pval, mat_fdr + + +def get_corr_dist(cases, nulls, path_out, tag="wholeconn"): + # For each unique pair, between the null maps. + n_pairs = int((len(cases)) * (len(cases) - 1) / 2) + corr = np.zeros((n_pairs, 5000)) + + print( + "Getting correlation between 5000 null maps for {} unique pairs for {} cases...".format( + n_pairs, len(cases) + ) + ) + pair = [] + l = 0 + for i in itertools.combinations(cases, 2): + for j in range(5000): + corr[l, j] = pearsonr( + nulls.loc[i[0]].values[j, :], nulls.loc[i[1]].values[j, :] + )[0] + + pair.append(i) + if l % 50 == 0: + print("{}/{}".format(l, n_pairs)) + l = l + 1 + + df = pd.DataFrame(corr) + df["pair"] = pair + df.to_csv(os.path.join(path_out, "correlation_dist_{}.csv".format(tag))) + return df + + +def get_corr(cases, betas, path_out, tag="wholeconn"): + # For each unique pair, correlation between betamaps. Use standardized betas here (as in rest of paper). + n_pairs = int((len(cases)) * (len(cases) - 1) / 2) + corr = np.zeros(n_pairs) + + print( + "Getting correlation between betamaps for {} unique pairs for {} cases...".format( + n_pairs, len(cases) + ) + ) + pair = [] + l = 0 + for i in itertools.combinations(cases, 2): + corr[l] = pearsonr(betas.loc[i[0]].values, betas.loc[i[1]].values)[0] + l = l + 1 + pair.append(i) + df = pd.DataFrame(corr) + df["pair"] = pair + df.to_csv(os.path.join(path_out, "correlation_betas_{}.csv".format(tag))) + return df + + +def get_corr_pval(maps, nulls, betas, path_out, tag="wholeconn"): + df = get_corr_dist(maps, nulls, path_out, tag=tag) + df_bb = get_corr(maps, betas, path_out, tag=tag) + + df_bb = df_bb.rename(columns={0: "betamap_corr"}) + df_master = df_bb.merge(df, on="pair") + + print("Calculating pvals...") + # CALCULATE PVALS + pval = [] + for i in df_master.index: + p = p_permut(df_master.loc[i, "betamap_corr"], df_master[range(5000)].loc[i]) + pval.append(p) + df_master["pval"] = pval + + # ADD LABELS + pair0 = [p[0] for p in df["pair"].tolist()] + pair1 = [p[1] for p in df["pair"].tolist()] + df_master["pair0"] = pair0 + df_master["pair1"] = pair1 + + df_compact = df_master[["pair0", "pair1", "betamap_corr", "pval"]] + df_compact.to_csv( + os.path.join(path_out, "corr_pval_null_v_null_{}.csv".format(tag)) + ) + + return df_compact + + +def _make_region_masks(row): + mask = np.tri(64, k=0, dtype=bool) + + # Get the parcel number from the beginning of the row and subtract 1 + parcel_num = int(row[0]) - 1 + + # Get the parcel name from the second column value + parcel_name = str(row[1]).upper().replace(" ", "_") + + # Generate the mask + parcel = np.zeros((64, 64), bool) + parcel[:, parcel_num] = True + parcel_mask = parcel + np.transpose(parcel) + parcel_mask = np.tril(parcel_mask) + parcel_mask = parcel_mask[mask] + + return (parcel_mask, parcel_name) + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument( + "--n_path_mc", help="path to mc null models dir", dest="n_path_mc" + ) + parser.add_argument("--b_path_mc", help="path to mc betamaps dir", dest="b_path_mc") + parser.add_argument("--path_out", help="path to output directory", dest="path_out") + parser.add_argument( + 
"--path_corr", help="path to corr dir", dest="path_corr", default=None + ) + parser.add_argument( + "--path_parcellation", help="path to MIST_64.cv", dest="path_parcellation" + ) + args = parser.parse_args() + + n_path_mc = os.path.join(args.n_path_mc, "null_model_{}_vs_control_mc.npy") + # cont_n_path_mc = os.path.join(args.n_path_mc,'{}_null_model_mc.npy') + b_path_mc = os.path.join(args.b_path_mc, "{}_vs_con_mean.csv") + # cont_b_path_mc = os.path.join(args.b_path_mc,'cont_{}_results_mc.csv') + path_out = args.path_out + path_corr = args.path_corr + + cases = ["mci_neg", "mci_pos", "sz"] + + maps = cases # + cont + + ############# + # LOAD DATA # + ############# + null = [] + beta_std = [] + + for c in cases: + null.append(pd.DataFrame(np.load(n_path_mc.format(c)))) + beta_std.append(pd.read_csv(b_path_mc.format(c))["stand_betas"].values) + + betamaps_std = pd.DataFrame(beta_std, index=maps) + nullmodels = pd.concat(null, keys=maps) + + #################### + # WHOLE CONNECTOME # + #################### + print("Creating correlation distributions for whole connectome...") + df_wc = get_corr_pval(maps, nullmodels, betamaps_std, path_out, tag="wholeconn") + + # make matrices + print("Preparing correlation matrices...") + subset = ["mci_neg", "mci_pos", "sz"] + + corr, pval, fdr = make_matrices(df_wc, subset, fdr="fdr_filtered") + + corr.to_csv(os.path.join(path_out, "FC_corr_wholebrain_mc_null_v_null.csv")) + pval.to_csv(os.path.join(path_out, "FC_corr_pval_wholebrain_mc_null_v_null.csv")) + fdr.to_csv( + os.path.join(path_out, "FC_corr_fdr_filtered_wholebrain_mc_null_v_null.csv") + ) + + #################### + # REGIONS # + #################### + df_MIST = pd.read_csv(args.path_parcellation, delimiter=";") + + for index, row in df_MIST.iterrows(): + # create region mask + parcel_mask, parcel_name = _make_region_masks(row) + + # filter maps + null_parcel = [n.transpose()[parcel_mask].transpose() for n in null] + beta_std_parcel = [b[parcel_mask] for b in beta_std] + + betamaps_std_parcel = pd.DataFrame(beta_std_parcel, index=maps) + nullmodels_parcel = pd.concat(null_parcel, keys=maps) + + print(f"Creating correlation distributions for {parcel_name}...") + df_parcel = get_corr_pval( + maps, nullmodels_parcel, betamaps_std_parcel, path_out, tag=parcel_name + ) + + # make matrices + corr_parcel, pval_parcel, fdr_filtered_parcel = make_matrices( + df_parcel, subset, fdr="fdr_filtered" + ) + + _, pval_parcel_fdr_corrected, _, _ = multipletests(pval_parcel, method="fdr_bh") + + # TO DO - see if I need to correct fdr filtered, I think so? 
+ if (pval_parcel < 0.05).any(): + corr_parcel.to_csv( + os.path.join(path_out, f"FC_corr_{parcel_name}_mc_null_v_null.csv") + ) + pval_parcel.to_csv( + os.path.join(path_out, f"FC_corr_pval_{parcel_name}_mc_null_v_null.csv") + ) + pval_parcel_fdr_corrected.to_csv( + os.path.join( + path_out, + f"FC_corr_pval_fdr_corrected_{parcel_name}_mc_null_v_null.csv", + ) + ) + fdr_filtered_parcel.to_csv( + os.path.join( + path_out, f"FC_corr_fdr_filtered_{parcel_name}_mc_null_v_null.csv" + ) + ) + + print("Done!") From 80fe67e78a2dbe1f41f339a31fa97d6e67deb7d6 Mon Sep 17 00:00:00 2001 From: Natasha Clarke <57987005+clarkenj@users.noreply.github.com> Date: Tue, 22 Aug 2023 15:28:39 -0400 Subject: [PATCH 02/14] Delete baseline_biomarkers.py No longer using this approach --- scripts/baseline_biomarkers.py | 173 --------------------------------- 1 file changed, 173 deletions(-) delete mode 100644 scripts/baseline_biomarkers.py diff --git a/scripts/baseline_biomarkers.py b/scripts/baseline_biomarkers.py deleted file mode 100644 index e7ea469..0000000 --- a/scripts/baseline_biomarkers.py +++ /dev/null @@ -1,173 +0,0 @@ -import pandas as pd -import numpy as np - -from functools import reduce -from pathlib import Path - -# paths to files - -adni_data = Path("__file__").resolve().parent / 'data' / 'adni_spreadsheet.csv' -tau_data = Path("__file__").resolve().parent / 'data' / 'UCBERKELEYAV1451_04_26_22.csv' -other_biomarker_data = Path("__file__").resolve().parent / 'data' / 'ADNIMERGE.csv' - -# functions - -def get_baselines(file): - - # load baseline phenotypic data - pheno = pd.read_csv(file, index_col=0, header=0) - - # keep only the variables of interest - pheno = pheno.filter(['Subject ID','Phase','Sex','Research Group', 'Visit','Study Date','Age'], axis=1) - - # convert 'study date' to 'session' in datetime format, to match other spreadsheets - pheno['session'] = pd.to_datetime(pheno['Study Date']) - - # pull out only the subject id and asign it to the index - pheno_subj = [] - for i in pheno['Subject ID']: - subj = i.split('_')[2].lstrip("0") # remove leading zeros since it won't match ADNI IDs - pheno_subj.append(subj) - - pheno.index = pheno_subj - pheno.rename_axis('RID',inplace=True) - pheno.index = pheno.index.astype('int64') - - # separate patients and controls, because (in theory) we can use any control visit as baseline, but - # for patients we want their actual baseline data - patient_diagnoses = ['AD', 'EMCI', 'LMCI', 'MCI', 'SMC'] - patient_df = pheno[pheno['Research Group'].isin(patient_diagnoses)] # df of patient diagnoses - - control_df = pheno.loc[pheno['Research Group'] == 'CN'] # df of control diagnoses - - # I think these visits are acceptable as baseline data, i.e. 
actual baseline +/-3 months, excluding - # any initial visits where patient continued from a previous phase - bl_visits = ['ADNI Screening','ADNI2 Month 3 MRI-New Pt', 'ADNI2 Screening MRI-New Pt', - 'ADNIGO Month 3 MRI','ADNIGO Screening MRI'] - - patient_df_bl = patient_df[patient_df['Visit'].isin(bl_visits)] - - # rejoin the patients to the controls - new_df = pd.concat([control_df,patient_df_bl]) - - # select the earliest visit available for each participant - new_df.sort_values(['Subject ID', 'Age'], inplace=True) # sort by age - baseline_df = new_df[~new_df.duplicated(['Subject ID'], keep='first')] # select the first row - - # sort df by index - baseline_df.sort_values(by='RID', inplace=True) - - # calculate window for acceptable biomarker data, currently +- 12months - baseline_df.loc[:,('date_lower')] = baseline_df.loc[:,('session')] - pd.DateOffset(months=18) - baseline_df.loc[:,('date_upper')] = baseline_df.loc[:,('session')] + pd.DateOffset(months=18) - - return (baseline_df) - -def load_biomarker_df(biomarker_data): - - # load data - biomarker_df = pd.read_csv(biomarker_data, index_col=0, header=0, low_memory=False) - - # convert examdate to datetime - biomarker_df['EXAMDATE'] = pd.to_datetime(biomarker_df['EXAMDATE']) - - # sort df by index and date - biomarker_df.sort_values(by=['RID', 'EXAMDATE'],inplace=True) - - # create column from index (useful for later functions) - biomarker_df['RID'] = biomarker_df.index - - return (biomarker_df) - -def match_baselines_biomarker(biomarker_df, baseline_df, biomarker): - - df = pd.DataFrame(columns=['RID',biomarker,biomarker+'_EXAMDATE']) #create df - common_ids = biomarker_df.index.intersection(baseline_df.index) #find ids common to the biomarker and baseline dfs - biomarker_df = biomarker_df.set_index('EXAMDATE') #reindex, needed to use 'nearest' method - - for rid in common_ids: - participant_df = biomarker_df[(biomarker_df['RID'] == rid)] #create df of all participants results - participant_df = participant_df.dropna(subset=[biomarker]) #drop NaNs, since we only want nearest date with result available - - baseline = baseline_df.loc[rid] #create df of participants baseline data - session = baseline['session'] #participant's baseline date - - if participant_df.empty: - pass - else: - idx_nearest = participant_df.index.get_loc(session, method='nearest') #find the index of the closest test date to session - nearest_date = participant_df.index[idx_nearest] #which date is it - nearest_result = participant_df[biomarker][idx_nearest] #find the biomarker result associated with closest date - - df.loc[len(df)] = [rid,nearest_result,nearest_date] #add to df - - df = df.set_index('RID') - - return (df) - -def check_visit_window(biomarker_df, baseline_df, biomarker): - - ''' - Join closest biomarkers to baseline info, check if the result was collected on a date within the baseline - window, and if not replace with NaN. Drop unwanted columns and return biomarker info again, ready to merge. 
- ''' - - # create new df, merging biomarker data into baseline df - baseline_bio = baseline_df.join(biomarker_df) - - # create mask of date range, between lower and upper acceptable dates - mask_date_range = (baseline_bio[biomarker+'_EXAMDATE'] > baseline_bio['date_lower']) & (baseline_bio[biomarker+'_EXAMDATE'] < baseline_bio['date_upper']) - - # fill values collected outside date range with NaN - baseline_bio[biomarker][~mask_date_range] = np.nan - - cols_to_drop = ['Subject ID', - 'Phase', - 'Sex', - 'Research Group', - 'Visit', - 'Study Date', - 'Age', - 'session', - 'date_lower', - 'date_upper'] - - baseline_bio = baseline_bio.drop(cols_to_drop, axis=1) #drop unwanted columns - - return (baseline_bio) - -def get_biomarker(biomarker_df, baseline_df, biomarker): - - #runs two functions to match the baseline to closest biomarker result, and then checks if it falls in the window - - find_nearest_biomarker = match_baselines_biomarker(biomarker_df, baseline_df, biomarker) - window_checked = check_visit_window(find_nearest_biomarker, baseline_df, biomarker) - - return (window_checked) - -if __name__ == '__main__': - - #get baselines from adni spreadsheet - baseline_df = get_baselines(adni_data) - - # load biomarker files - tau_df = load_biomarker_df(tau_data) - other_biomarkers_df = load_biomarker_df(other_biomarker_data) - - # create one df per biomarker with closest result within a one year window of participant baseline - tau = get_biomarker(tau_df, baseline_df, 'META_TEMPORAL_SUVR') - abeta = get_biomarker(other_biomarkers_df, baseline_df, 'ABETA') - ptau = get_biomarker(other_biomarkers_df, baseline_df, 'PTAU') - av45 = get_biomarker(other_biomarkers_df, baseline_df, 'AV45') - fbb = get_biomarker(other_biomarkers_df, baseline_df, 'FBB') - - # create list of dataframes (baseline data and all individual biomarkers) - data_frames = [baseline_df, tau, abeta, ptau, av45, fbb] - - # merge dataframes - master_biomarkers = reduce(lambda left, right: - pd.merge_asof(left, right, left_index=True, right_index=True), - data_frames) - - # save baseline and matched biomarker data in data directory - master_biomarkers.to_csv(Path("__file__").resolve().parent / 'data' / 'master_biomarkers.csv') From 5742a96dad10a9aeb5905ee7350751f1f4749d2c Mon Sep 17 00:00:00 2001 From: Natasha Clarke <57987005+clarkenj@users.noreply.github.com> Date: Tue, 22 Aug 2023 15:28:51 -0400 Subject: [PATCH 03/14] Delete biomarker_thresholding.ipynb https://github.com/SIMEXP/ad_sz/tree/new_afc_scripts/scripts --- scripts/biomarker_thresholding.ipynb | 294 --------------------------- 1 file changed, 294 deletions(-) delete mode 100644 scripts/biomarker_thresholding.ipynb diff --git a/scripts/biomarker_thresholding.ipynb b/scripts/biomarker_thresholding.ipynb deleted file mode 100644 index dedac85..0000000 --- a/scripts/biomarker_thresholding.ipynb +++ /dev/null @@ -1,294 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 22, - "id": "d213377a", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "9a4034fa", - "metadata": {}, - "outputs": [], - "source": [ - "# set thresholds and calculate 5% margin\n", - "\n", - "# CSF biomarkers\n", - "ab42_threshold = 980\n", - "ab42_margin = (5*ab42_threshold)/100\n", - "ab42_upper = ab42_threshold+ab42_margin\n", - "ab42_lower = ab42_threshold-ab42_margin\n", - "\n", - "ptau_threshold = 23\n", - "ptau_margin = 
(5*ptau_threshold)/100\n", - "ptau_upper = ptau_threshold+ptau_margin\n", - "ptau_lower = ptau_threshold-ptau_margin\n", - "\n", - "# PET biomarkers\n", - "fbb_threshold = 1.08\n", - "fbb_margin = (5*fbb_threshold)/100\n", - "fbb_upper = fbb_threshold+fbb_margin\n", - "fbb_lower = fbb_threshold-fbb_margin\n", - "\n", - "av45_threshold = 1.08\n", - "av45_margin = (5*av45_threshold)/100\n", - "av45_upper = av45_threshold+av45_margin\n", - "av45_lower = av45_threshold-av45_margin" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a913b27a", - "metadata": {}, - "outputs": [], - "source": [ - "def calculate_tau_cutoff(df):\n", - " \n", - " '''\n", - " Take a df with biomarker results for all participants. Select controls only. Find those negative for amyloid\n", - " PET (either av45 or FBB). For those who hav flortaucipir results, calculate average. Calculate +2SD, this is\n", - " the flortaucipir cutoff.\n", - " \n", - " '''\n", - " \n", - " controls = df.loc[df['Research Group'] == 'CN']\n", - " \n", - " controls_Apet_neg = (controls[(controls['FBB'] < fbb_lower) | \n", - " (controls['AV45'] < av45_lower)])\n", - " \n", - " tau_mean = controls_Apet_neg['META_ROI'].mean()\n", - " tau_sd = controls_Apet_neg['META_ROI'].std()\n", - " tau_cutoff = tau_mean+(tau_sd*2)\n", - " \n", - " return (tau_cutoff) " - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "bfa2dd1c", - "metadata": {}, - "outputs": [], - "source": [ - "# paths to files\n", - "biomarker_data = Path(\"__file__\").resolve().parents[1] / 'data' / 'master_biomarkers.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "837cb329", - "metadata": {}, - "outputs": [], - "source": [ - "# load baseline data\n", - "df = pd.read_csv(biomarker_data, index_col=0, header=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "c75a9dee", - "metadata": {}, - "outputs": [], - "source": [ - "# replace some biomarker data which has been coded as > or <\n", - "df = df.replace({'ABETA': {'>1700': 1701, '<200': 199}})\n", - "df = df.replace({'PTAU': {'<8': 7, '>120':121}})\n", - "\n", - "df[['ABETA', 'PTAU']] = df[['ABETA', 'PTAU']].apply(pd.to_numeric)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "b2d48836", - "metadata": {}, - "outputs": [], - "source": [ - "# calculate study specific cutoff for tau PET meta temporal ROI \n", - "tau_threshold = calculate_tau_cutoff(df)\n", - "tau_margin = (5*tau_threshold)/100\n", - "tau_upper = tau_threshold+tau_margin\n", - "tau_lower = tau_threshold-tau_margin" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "60d0393b", - "metadata": {}, - "outputs": [], - "source": [ - "def biomarker_negative_controls(df):\n", - " \n", - " '''\n", - " Take a df with biomarker results for participants labelled as controls, and return a df excluding those\n", - " who have a positive result on any one of five markers (or no biomarker information).\n", - " \n", - " To get more info on which biomarkers are positive, uncomment last section. 
But this is only to give an \n", - " idea, since some may be positive on more than one marker.\n", - " '''\n", - " \n", - " controls_all = df.loc[df['Research Group'] == 'CN']\n", - " \n", - " # drop participants who don't have any biomarker data\n", - " controls = controls_all.dropna(subset=['ABETA','PTAU','AV45','FBB','META_ROI'], how='all')\n", - " \n", - " controls_biomarker_neg = (controls[(controls['FBB'] < fbb_lower) | \n", - " (controls['AV45'] < av45_lower) | \n", - " (controls['ABETA'] > ab42_upper) | \n", - " (controls['PTAU'] < ptau_lower) | \n", - " (controls['META_ROI'] < tau_lower)])\n", - " \n", - " n_controls = len(controls_all)\n", - " no_markers = n_controls-len(controls)\n", - " n_controls_negative = len(controls_biomarker_neg)\n", - " n_dropped = (n_controls-no_markers) - n_controls_negative\n", - " \n", - " print ('Of {} controls, {} dropped due to no biomarker data and {} due to positive/borderline biomarkers. {} remaining.'.format(\n", - " n_controls,\n", - " no_markers, \n", - " n_dropped,\n", - " n_controls_negative))\n", - " \n", - " '''\n", - " controls_biomarker_pos = pd.concat([controls,controls_biomarker_neg]).drop_duplicates(keep=False)\n", - " print ('{} positive/borderline for Abeta42.'.format(len(controls_biomarker_pos[(controls_biomarker_pos['ABETA'] < ab42_upper)])))\n", - " print ('{} positive/borderline for Ptau.'.format(len(controls_biomarker_pos[(controls_biomarker_pos['PTAU'] > ptau_lower)])))\n", - " print ('{} positive/borderline for PET amyloid (FBB tracer).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['FBB'] > fbb_lower)])))\n", - " print ('{} positive/borderline for PET amyloid (AV45 tracer).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['AV45'] > av45_lower)])))\n", - " print ('{} positive/borderline for PET tau (meta roi).'.format(len(controls_biomarker_pos[(controls_biomarker_pos['META_ROI'] > tau_lower)])))\n", - " '''\n", - " return (controls_biomarker_neg)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "b3a95289", - "metadata": {}, - "outputs": [], - "source": [ - "def biomarker_positive_patients(df, diagnoses):\n", - " \n", - " '''\n", - " Take a df with biomarker results for participants labelled as a patient (specifying which), \n", - " and return a df including only those who have a positive result on any one of five markers. \n", - " Exclude any with no biomarker information.\n", - " \n", - " To get more info on which biomarkers are negative, uncomment last section. But this is only to give an \n", - " idea, since some may be negative on more than one marker.\n", - " '''\n", - " \n", - " patients_all = df.loc[df['Research Group'].isin(diagnoses)]\n", - " \n", - " # drop participants who don't have any biomarker data\n", - " patients = patients_all.dropna(subset=['ABETA','PTAU','AV45','FBB','META_ROI'], how='all')\n", - " \n", - " patients_biomarker_pos = (patients[(patients['FBB'] > fbb_upper) | \n", - " (patients['AV45'] > av45_upper) | \n", - " (patients['ABETA'] < ab42_lower) | \n", - " (patients['PTAU'] > ptau_upper) |\n", - " (patients['META_ROI'] > tau_upper)])\n", - "\n", - " n_patients = len(patients_all)\n", - " no_markers = n_patients-len(patients)\n", - " n_patients_positive = len(patients_biomarker_pos)\n", - " n_dropped = (n_patients-no_markers) - n_patients_positive\n", - " \n", - " print ('Of {} {}, {} dropped due to no biomarker data and {} due to negative/borderline biomarkers. 
{} remaining.'.format(\n", - " n_patients,\n", - " diagnoses,\n", - " no_markers,\n", - " n_dropped,\n", - " n_patients_positive))\n", - " \n", - " '''\n", - " patients_neg = pd.concat([patients,patients_biomarker_pos]).drop_duplicates(keep=False)\n", - " \n", - " print ('{} negative/borderline for Abeta42.'.format(len(patients_neg[(patients_neg['ABETA'] > ab42_lower)])))\n", - " print ('{} negative/borderline for Ptau.'.format(len(patients_neg[(patients_neg['PTAU'] < ptau_upper)])))\n", - " print ('{} negative/borderline for PET amyloid (FBB tracer).'.format(len(patients_neg[(patients_neg['FBB'] < fbb_upper)])))\n", - " print ('{} negative/borderline for PET amyloid (AV45 tracer).'.format(len(patients_neg[(patients_neg['AV45'] < av45_upper)])))\n", - " print ('{} negative/borderline for PET tau (meta roi).'.format(len(patients_neg[(patients_neg['META_ROI'] < tau_upper)])))\n", - " '''\n", - " return (patients_biomarker_pos)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "0d123b9f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Of 362 controls, 44 dropped due to no biomarker data and 60 due to positive/borderline biomarkers. 258 remaining.\n", - "Of 58 ['AD'], 7 dropped due to no biomarker data and 3 due to negative/borderline biomarkers. 48 remaining.\n", - "Of 165 ['EMCI', 'LMCI', 'MCI'], 25 dropped due to no biomarker data and 54 due to negative/borderline biomarkers. 86 remaining.\n" - ] - } - ], - "source": [ - "# run functions to get negative controls and positive patients\n", - "controls_bio_neg = biomarker_negative_controls(df)\n", - "ad_bio_positive = biomarker_positive_patients(df, ['AD'])\n", - "mci_bio_positive = biomarker_positive_patients(df, ['EMCI','LMCI','MCI'])" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "979848a1", - "metadata": {}, - "outputs": [], - "source": [ - "master_df = pd.concat([controls_bio_neg, ad_bio_positive, mci_bio_positive])" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "cf508b49", - "metadata": {}, - "outputs": [], - "source": [ - "master_df.to_csv(Path(\"__file__\").resolve().parents[1] / 'data' / 'final_biomarker_spreadsheet.csv')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From c444bd31ecff21254fcf8f1874be7b220a97ed9a Mon Sep 17 00:00:00 2001 From: Natasha Clarke <57987005+clarkenj@users.noreply.github.com> Date: Tue, 22 Aug 2023 15:29:55 -0400 Subject: [PATCH 04/14] Delete calculate_meta_roi.ipynb No longer using this approach --- scripts/calculate_meta_roi.ipynb | 723 ------------------------------- 1 file changed, 723 deletions(-) delete mode 100644 scripts/calculate_meta_roi.ipynb diff --git a/scripts/calculate_meta_roi.ipynb b/scripts/calculate_meta_roi.ipynb deleted file mode 100644 index b3cb980..0000000 --- a/scripts/calculate_meta_roi.ipynb +++ /dev/null @@ -1,723 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "3881d9a6", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "from pathlib import Path" - ] - }, - { - "cell_type": "code", - 
"execution_count": 2, - "id": "3fdee605", - "metadata": {}, - "outputs": [], - "source": [ - "tau_data = Path(\"__file__\").resolve().parents[1] / 'data' / 'UCBERKELEYAV1451_04_26_22.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "7b6b5db3", - "metadata": {}, - "outputs": [], - "source": [ - "# load tau PET data\n", - "df = pd.read_csv(tau_data, index_col=0, header=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "83a462f2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " VISCODE VISCODE2 EXAMDATE INFERIORCEREBELLUM_SUVR \\\n", - "RID \n", - "21 init m144 2018-02-02 1.0285 \n", - "31 init m150 2018-04-24 0.9647 \n", - "31 y1 m162 2019-04-23 0.9636 \n", - "56 init m144 2018-02-20 0.9790 \n", - "56 y1 m156 2019-01-10 0.9548 \n", - "\n", - " INFERIORCEREBELLUM_VOLUME ERODED_SUBCORTICALWM_SUVR \\\n", - "RID \n", - "21 55285 1.1771 \n", - "31 62344 1.0258 \n", - "31 61510 1.0193 \n", - "56 64415 1.2442 \n", - "56 63302 1.2863 \n", - "\n", - " ERODED_SUBCORTICALWM_VOLUME BRAAK1_SUVR BRAAK1_VOLUME BRAAK34_SUVR \\\n", - "RID \n", - "21 110695 1.1937 2869 1.2152 \n", - "31 55683 1.2463 2558 1.0969 \n", - "31 50427 1.2264 2508 1.0675 \n", - "56 114401 1.2682 3809 1.2016 \n", - "56 118625 1.1931 3666 1.1839 \n", - "\n", - " ... RIGHT_PUTAMEN_VOLUME RIGHT_THALAMUS_PROPER_SUVR \\\n", - "RID ... \n", - "21 ... 4169 1.5224 \n", - "31 ... 5766 1.1421 \n", - "31 ... 5537 1.0801 \n", - "56 ... 4026 1.3396 \n", - "56 ... 4145 1.3618 \n", - "\n", - " RIGHT_THALAMUS_PROPER_VOLUME RIGHT_VENTRALDC_SUVR \\\n", - "RID \n", - "21 5874 1.3860 \n", - "31 6016 1.2340 \n", - "31 5865 1.1644 \n", - "56 6601 1.3625 \n", - "56 6349 1.3873 \n", - "\n", - " RIGHT_VENTRALDC_VOLUME RIGHT_VESSEL_SUVR RIGHT_VESSEL_VOLUME \\\n", - "RID \n", - "21 3378 1.4576 10.0 \n", - "31 3178 1.3691 2.0 \n", - "31 3060 1.4203 15.0 \n", - "56 3655 1.6190 42.0 \n", - "56 3625 1.4511 10.0 \n", - "\n", - " WM_HYPOINTENSITIES_SUVR WM_HYPOINTENSITIES_VOLUME update_stamp \n", - "RID \n", - "21 1.0615 2460 2022-04-27 15:31:20.0 \n", - "31 0.7641 26845 2022-04-27 15:31:20.0 \n", - "31 0.7805 28306 2022-04-27 15:31:20.0 \n", - "56 1.0556 1385 2022-04-27 15:31:20.0 \n", - "56 1.0850 1427 2022-04-27 15:31:20.0 \n", - "\n", - "[5 rows x 242 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "630a3d56", - "metadata": {}, - "outputs": [], - "source": [ - "result_cols = ['CTX_LH_ENTORHINAL_SUVR',\n", - "'CTX_RH_ENTORHINAL_SUVR',\n", - "'LEFT_AMYGDALA_SUVR',\n", - "'RIGHT_AMYGDALA_SUVR',\n", - "'CTX_LH_PARAHIPPOCAMPAL_SUVR',\n", - "'CTX_RH_PARAHIPPOCAMPAL_SUVR',\n", - "'CTX_LH_FUSIFORM_SUVR',\n", - "'CTX_RH_FUSIFORM_SUVR',\n", - "'CTX_LH_INFERIORTEMPORAL_SUVR',\n", - "'CTX_RH_INFERIORTEMPORAL_SUVR',\n", - "'CTX_LH_MIDDLETEMPORAL_SUVR',\n", - "'CTX_RH_MIDDLETEMPORAL_SUVR']" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "79cb3167", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " VISCODE VISCODE2 EXAMDATE INFERIORCEREBELLUM_SUVR \\\n", - "RID \n", - "21 init m144 2018-02-02 1.0285 \n", - "31 init m150 2018-04-24 0.9647 \n", - "31 y1 m162 2019-04-23 0.9636 \n", - "56 init m144 2018-02-20 0.9790 \n", - "56 y1 m156 2019-01-10 0.9548 \n", - "\n", - " INFERIORCEREBELLUM_VOLUME ERODED_SUBCORTICALWM_SUVR \\\n", - "RID \n", - "21 55285 1.1771 \n", - "31 62344 1.0258 \n", - "31 61510 1.0193 \n", - "56 64415 1.2442 \n", - "56 63302 1.2863 \n", - "\n", - " ERODED_SUBCORTICALWM_VOLUME BRAAK1_SUVR BRAAK1_VOLUME BRAAK34_SUVR \\\n", - "RID \n", - "21 110695 1.1937 2869 1.2152 \n", - "31 55683 1.2463 2558 1.0969 \n", - "31 50427 1.2264 2508 1.0675 \n", - "56 114401 1.2682 3809 1.2016 \n", - "56 118625 1.1931 3666 1.1839 \n", - "\n", - " ... RIGHT_THALAMUS_PROPER_VOLUME RIGHT_VENTRALDC_SUVR \\\n", - "RID ... \n", - "21 ... 5874 1.3860 \n", - "31 ... 6016 1.2340 \n", - "31 ... 5865 1.1644 \n", - "56 ... 6601 1.3625 \n", - "56 ... 6349 1.3873 \n", - "\n", - " RIGHT_VENTRALDC_VOLUME RIGHT_VESSEL_SUVR RIGHT_VESSEL_VOLUME \\\n", - "RID \n", - "21 3378 1.4576 10.0 \n", - "31 3178 1.3691 2.0 \n", - "31 3060 1.4203 15.0 \n", - "56 3655 1.6190 42.0 \n", - "56 3625 1.4511 10.0 \n", - "\n", - " WM_HYPOINTENSITIES_SUVR WM_HYPOINTENSITIES_VOLUME \\\n", - "RID \n", - "21 1.0615 2460 \n", - "31 0.7641 26845 \n", - "31 0.7805 28306 \n", - "56 1.0556 1385 \n", - "56 1.0850 1427 \n", - "\n", - " update_stamp meta_roi META_ROI \n", - "RID \n", - "21 2022-04-27 15:31:20.0 1.234958 1.234958 \n", - "31 2022-04-27 15:31:20.0 1.145025 1.145025 \n", - "31 2022-04-27 15:31:20.0 1.137825 1.137825 \n", - "56 2022-04-27 15:31:20.0 1.235883 1.235883 \n", - "56 2022-04-27 15:31:20.0 1.208933 1.208933 \n", - "\n", - "[5 rows x 244 columns]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['META_ROI'] = df[result_cols].mean(axis=1)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "5aec7b55", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - " EXAMDATE META_ROI\n", - "RID \n", - "21 2018-02-02 1.234958\n", - "31 2018-04-24 1.145025\n", - "31 2019-04-23 1.137825\n", - "56 2018-02-20 1.235883\n", - "56 2019-01-10 1.208933" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_df = df[['EXAMDATE','META_ROI']].copy()\n", - "results_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "5e52c415", - "metadata": {}, - "outputs": [], - "source": [ - "results_df.to_csv(Path(\"__file__\").resolve().parents[1] / 'data' / 'tau_meta_roi.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "663dc7b4", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From bd9c875495a6d8dc6f95069bbcb52cc5ac6677d7 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 28 Aug 2023 12:56:32 -0400 Subject: [PATCH 05/14] added get_ds00030_participants.py --- scripts/get_ds00030_participants.py | 33 +++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 scripts/get_ds00030_participants.py diff --git a/scripts/get_ds00030_participants.py b/scripts/get_ds00030_participants.py new file mode 100644 index 0000000..4319183 --- /dev/null +++ b/scripts/get_ds00030_participants.py @@ -0,0 +1,33 @@ +import pandas as pd +from pathlib import Path + +## Load ds000030 participants.csv and keep only required subjects and columns + +root_p = Path("__file__").resolve().parents[1] / "data" / "ds000030" + +df = pd.read_csv(root_p / "participants.csv") + +# Keep only rows where diagnosis is CONTROL or SCHZ +df = df[df["diagnosis"].isin(["CONTROL", "SCHZ"])] + +# Add columns SZ and CON +df["SZ"] = (df["diagnosis"] == "SCHZ").astype(int) +df["CON"] = (df["diagnosis"] == "CONTROL").astype(int) + +# Add GROUP column +df["GROUP"] = df["diagnosis"] +df["GROUP"] = df["GROUP"].replace({"SCHZ": "SZ", "CONTROL": "CON"}) + +# Rename some columns and map values for sex +df.rename(columns={"gender": "SEX", "age": "AGE"}, inplace=True) +df["SEX"] = df["SEX"].map({"M": 0, "F": 1}) + +# Add COHORT and SITE columns +df["COHORT"] = "ds000030" +df["SITE"] = 0 + +cols = ["participant_id", "AGE", "SEX", "SITE", "SZ", "CON", "COHORT", "GROUP"] +df = df[cols] + +# Save results +df.to_csv(root_p / "ds000030_participants.csv", index=False) From 976273a52fc579c166bf10795a7c537e7312c56d Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 28 Aug 2023 15:02:04 -0400 Subject: [PATCH 06/14] added scripts for sz data wrangling --- scripts/get_cobre_participants.py | 33 +++++++++++++++++++++++++++++ scripts/get_ds00030_participants.py | 22 +++++++++---------- scripts/get_srpbs_participants.py | 31 +++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 11 deletions(-) create mode 100644 scripts/get_cobre_participants.py create mode 100644 scripts/get_srpbs_participants.py diff --git a/scripts/get_cobre_participants.py b/scripts/get_cobre_participants.py new file mode 100644 index 0000000..bec947a --- /dev/null +++ b/scripts/get_cobre_participants.py @@ -0,0 +1,33 @@ +import pandas as pd +from pathlib import Path + +## Load cobre 
participant data and do some stuff + +root_p = Path("__file__").resolve().parents[1] / "data" + +df = pd.read_csv(root_p / "cobre" / "COBRE_phenotypic_data.csv") + +# Add columns SZ and CON +df["SZ"] = (df["Subject Type"] == "Patient").astype(int) +df["CON"] = (df["Subject Type"] == "Control").astype(int) + +# Add group column +df["group"] = df["Subject Type"] +df["group"] = df["group"].replace({"Patient": "SZ", "Control": "CON"}) + +# Rename some columns and map values for sex +df.rename( + columns={"Gender": "sex", "Current Age": "age", "Unnamed: 0": "participant_id"}, + inplace=True, +) +df["sex"] = df["sex"].map({"Male": 0, "Female": 1}) + +# Add cohort and site columns +df["cohort"] = "COBRE" +df["site"] = 0 + +cols = ["participant_id", "age", "sex", "site", "cohort", "group", "SZ", "CON"] +df = df[cols] + +# Save results +df.to_csv(root_p / "cobre_participants.csv", index=False) diff --git a/scripts/get_ds00030_participants.py b/scripts/get_ds00030_participants.py index 4319183..e49d66c 100644 --- a/scripts/get_ds00030_participants.py +++ b/scripts/get_ds00030_participants.py @@ -1,11 +1,11 @@ import pandas as pd from pathlib import Path -## Load ds000030 participants.csv and keep only required subjects and columns +## Load ds000030 participant data and do some stuff -root_p = Path("__file__").resolve().parents[1] / "data" / "ds000030" +root_p = Path("__file__").resolve().parents[1] / "data" -df = pd.read_csv(root_p / "participants.csv") +df = pd.read_csv(root_p / "ds000030" / "participants.csv") # Keep only rows where diagnosis is CONTROL or SCHZ df = df[df["diagnosis"].isin(["CONTROL", "SCHZ"])] @@ -14,19 +14,19 @@ df["SZ"] = (df["diagnosis"] == "SCHZ").astype(int) df["CON"] = (df["diagnosis"] == "CONTROL").astype(int) -# Add GROUP column -df["GROUP"] = df["diagnosis"] -df["GROUP"] = df["GROUP"].replace({"SCHZ": "SZ", "CONTROL": "CON"}) +# Add group column +df["group"] = df["diagnosis"] +df["group"] = df["group"].replace({"SCHZ": "SZ", "CONTROL": "CON"}) # Rename some columns and map values for sex -df.rename(columns={"gender": "SEX", "age": "AGE"}, inplace=True) -df["SEX"] = df["SEX"].map({"M": 0, "F": 1}) +df.rename(columns={"gender": "sex"}, inplace=True) +df["sex"] = df["sex"].map({"M": 0, "F": 1}) # Add COHORT and SITE columns -df["COHORT"] = "ds000030" -df["SITE"] = 0 +df["cohort"] = "ds000030" +df["site"] = 0 -cols = ["participant_id", "AGE", "SEX", "SITE", "SZ", "CON", "COHORT", "GROUP"] +cols = ["participant_id", "age", "sex", "site", "cohort", "group", "SZ", "CON"] df = df[cols] # Save results diff --git a/scripts/get_srpbs_participants.py b/scripts/get_srpbs_participants.py new file mode 100644 index 0000000..fddba39 --- /dev/null +++ b/scripts/get_srpbs_participants.py @@ -0,0 +1,31 @@ +import pandas as pd +from pathlib import Path + +## Load srpbs participant data and do some stuff + +root_p = Path("__file__").resolve().parents[1] / "data" + +df = pd.read_csv(root_p / "srpbs" / "participants.tsv", sep="\t") + +# Keep only rows where diagnosis is 0 (control) or 4 (sz) +df = df[df["diag"].isin([0, 4])] + +# Add columns SZ and CON +df["SZ"] = (df["diag"] == 4).astype(int) +df["CON"] = (df["diag"] == 0).astype(int) + +# Add group column +df["group"] = df["diag"] +df["group"] = df["group"].replace({4: "SZ", 0: "CON"}) + +# Map values for sex, from total N in Tanaka paper +df["sex"] = df["sex"].map({1: 0, 2: 1}) + +# Add cohort and site columns +df["cohort"] = "SRPBS" + +cols = ["participant_id", "age", "sex", "site", "cohort", "group", "SZ", "CON"] +df = df[cols] + +# Save 
results +df.to_csv(root_p / "srpbs_participants.csv", index=False) From da09b0b7f8e3809ff292514a59c41bf8329f24ff Mon Sep 17 00:00:00 2001 From: clarkenj Date: Wed, 6 Sep 2023 11:21:30 -0400 Subject: [PATCH 07/14] added more wrangling scripts --- scripts/{npi_to_mbi.py => adni_npi_to_mbi.py} | 12 +- scripts/cimaq_npi_to_mbi.py | 87 ++++ scripts/cnvfc/stats.py | 348 +++++++++++++++ scripts/oasis_wrangling_draft.ipynb | 401 ++++++++++++++++++ 4 files changed, 842 insertions(+), 6 deletions(-) rename scripts/{npi_to_mbi.py => adni_npi_to_mbi.py} (77%) create mode 100644 scripts/cimaq_npi_to_mbi.py create mode 100644 scripts/cnvfc/stats.py create mode 100644 scripts/oasis_wrangling_draft.ipynb diff --git a/scripts/npi_to_mbi.py b/scripts/adni_npi_to_mbi.py similarity index 77% rename from scripts/npi_to_mbi.py rename to scripts/adni_npi_to_mbi.py index 5aa3607..8ff48ba 100644 --- a/scripts/npi_to_mbi.py +++ b/scripts/adni_npi_to_mbi.py @@ -1,7 +1,7 @@ import pandas as pd from pathlib import Path -## Load NPIQ or NPI scores and convert to MBI framework +## Load NPIQ or NPI scores for ADNI and convert to MBI framework ## Domains: https://adni.bitbucket.io/reference/npi.html @@ -21,8 +21,8 @@ def _mbi_conversion(df): "social_inappropriateness", "abnormal_perception", ] - df["MBI_total_score"] = df[mbi_domains].sum(axis=1) - df["mbi_status"] = (df["MBI_total_score"] >= 1).astype(int) + df["mbi_total_score"] = df[mbi_domains].sum(axis=1) + df["mbi_status"] = (df["mbi_total_score"] >= 1).astype(int) mbi_columns = [ "RID", @@ -32,7 +32,7 @@ def _mbi_conversion(df): "impulse_dyscontrol", "social_inappropriateness", "abnormal_perception", - "MBI_total_score", + "mbi_total_score", "mbi_status", ] mbi_df = df[mbi_columns].copy() @@ -43,9 +43,9 @@ def _mbi_conversion(df): root_p = Path("__file__").resolve().parents[1] / "data" / "adni" # Load NPI or NPI-Q scores -npi_df = pd.read_csv(root_p / "NPIQ_22Aug2023.csv") +npi_df = pd.read_csv(root_p / "NPI_22Aug2023.csv") mbi_df = _mbi_conversion(npi_df) # Save results - make sure to change name -mbi_df.to_csv(root_p / "adni_npiq_mbi_status_20230822.csv", index=False) +mbi_df.to_csv(root_p / "adni_npi_mbi_status_20230829.csv", index=False) diff --git a/scripts/cimaq_npi_to_mbi.py b/scripts/cimaq_npi_to_mbi.py new file mode 100644 index 0000000..c94ec4a --- /dev/null +++ b/scripts/cimaq_npi_to_mbi.py @@ -0,0 +1,87 @@ +import pandas as pd +from pathlib import Path + +## Load NPIQ scores for CIMA-Q and convert to MBI framework + + +def _map_values(df): + # Map values for columns we are using + mapping = {"0_non": 0, "1_oui_léger": 1, "2_oui_modéré": 2, "3_oui_sévère": 3} + + columns_to_map = [ + "22901_apathie", + "22901_depression_dysphorie", + "22901_anxiete", + "22901_euphorie", + "22901_agitation_aggressivite", + "22901_irritabilite", + "22901_comp_moteur_aberrant", + "22901_impulsivite", + "22901_idees_delirantes", + "22901_hallucinations", + ] + + for column in columns_to_map: + df[column] = df[column].map(mapping) + + return df + + +def _mbi_conversion(df): + # Calculate MBI domains + df["decreased_motivation"] = df["22901_apathie"] + df["emotional_dysregulation"] = ( + df["22901_depression_dysphorie"] + df["22901_anxiete"] + df["22901_euphorie"] + ) + df["impulse_dyscontrol"] = ( + df["22901_agitation_aggressivite"] + + df["22901_irritabilite"] + + df["22901_comp_moteur_aberrant"] + ) + df["social_inappropriateness"] = df["22901_impulsivite"] + df["abnormal_perception"] = ( + df["22901_idees_delirantes"] + df["22901_hallucinations"] + ) + + # Calculate MBI 
total score + mbi_domains = [ + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + ] + + df["mbi_total_score"] = df[mbi_domains].sum(axis=1) + df["mbi_status"] = (df["mbi_total_score"] >= 1).astype(int) + + mbi_columns = [ + "PSCID", + "Date_taken", + "decreased_motivation", + "emotional_dysregulation", + "impulse_dyscontrol", + "social_inappropriateness", + "abnormal_perception", + "mbi_total_score", + "mbi_status", + ] + + mbi_df = df[mbi_columns].copy() + + return mbi_df + + +root_p = Path("__file__").resolve().parents[1] / "data" / "cimaq" + +# Load NPI-Q scores +npi_df = pd.read_csv( + root_p / "22901_inventaire_neuropsychiatrique_q_initiale.tsv", sep="\t" +) + +npi_df = npi_df.dropna(subset=["Date_taken"]) +npi_df = _map_values(npi_df) +mbi_df = _mbi_conversion(npi_df) + +# Save results - make sure to change name +mbi_df.to_csv(root_p / "cimaq_npiq_mbi_status_20230829.csv", index=False) diff --git a/scripts/cnvfc/stats.py b/scripts/cnvfc/stats.py new file mode 100644 index 0000000..3a4f28e --- /dev/null +++ b/scripts/cnvfc/stats.py @@ -0,0 +1,348 @@ +import sys +import time +import warnings +import scipy as sp +import numpy as np +import patsy as pat +import pandas as pd +from scipy import stats +import statsmodels.api as sm +#from .tools import conn2mat +from sklearn import linear_model as sln +from sklearn import preprocessing as skp +from sklearn import model_selection as skm +from statsmodels.sandbox.stats.multicomp import multipletests as stm + + +def make_weights(data, pattern): + if not data.shape[1:] == pattern.shape: + raise Exception(f'data and pattern shape mismatch: {data.shape[1:]}, {pattern.shape}') + n_nodes = pattern.shape[0] + n_data = data.shape[0] + weights = np.array([[np.corrcoef(data[data_id, node_id, :], + pattern[node_id, :])[0, 1] + for node_id in range(n_nodes)] + for data_id in range(n_data)]) + return weights + + +def categorical_enrichment(pheno, data, group, case=None, control=None, node_labels='node_name_not_specificed'): + sub_mask, cases_mask = find_subset(pheno, group, [case, control]) + sub_data = data[sub_mask, ...] 
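+    # find_subset returns a boolean mask over all rows of pheno plus, for each requested
+    # label, a mask indexed within that subset; sub_data therefore holds only the case
+    # and control subjects, and cases_mask is used below to split it into the two groups.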
+ n_data = sub_data.shape[1] + n_case = np.sum(cases_mask[case]) + n_control = np.sum(cases_mask[control]) + + results = {key: list() for key in ['node', 'U', 'p_value', + f'median_{group}_{case}', + f'median_{group}_{control}', + 'rank_biserial_correlation']} + # Conduct Mann-Whitney-U for each node region + for node_id in range(n_data): + u_right, p = sp.stats.mannwhitneyu(sub_data[cases_mask[case], node_id], sub_data[cases_mask[control], node_id], + alternative='two-sided') + u_left = n_case * n_control - u_right + u_min = np.min([u_left, u_right]) + # Compute the median for the case and control groups + median_case = np.median(sub_data[cases_mask[case], node_id]) + median_control = np.median(sub_data[cases_mask[control], node_id]) + # Compute rank biserial correlation + r_b = 1 - (2 * u_min) / (n_case * n_control) + # Determine if cases > controls or reverse + case_gt_con = u_right > u_min + if not case_gt_con: + r_b = -r_b + # Store the results + results['node'].append(node_id + 1) + results['U'].append(u_min) + results['p_value'].append(p) + results[f'median_{group}_{case}'].append(median_case) + results[f'median_{group}_{control}'].append(median_control) + results['rank_biserial_correlation'].append(r_b) + + results_table = pd.DataFrame(data=results) + # Correct for multiple comparisons + (fdr_pass, qval, _, _) = stm(results_table.p_value, alpha=0.05, method='fdr_bh') + results_table['q_value'] = qval + # Add the node_names + results_table['node_name'] = node_labels + + return results_table + + +def continuous_enrichment(pheno, data, covariate, node_labels='node_name_not_specificed'): + sub_mask = find_subset(pheno, covariate) + sub_pheno = pheno.loc[sub_mask] + sub_data = data[sub_mask, :] + n_data = sub_data.shape[1] + + results = {key: list() for key in ['node', 'pearson_r', 'p_value']} + for node_id in range(n_data): + r, p = stats.pearsonr(sub_pheno[covariate], sub_data[:, node_id]) + # Store the results + results['node'].append(node_id + 1) + results['pearson_r'].append(r) + results['p_value'].append(p) + + results_table = pd.DataFrame(data=results) + # Correct for multiple comparisons + (fdr_pass, qval, _, _) = stm(results_table.p_value, alpha=0.05, method='fdr_bh') + results_table['q_value'] = qval + # Add the node_names + results_table['node_name'] = node_labels + + return results_table + + +def find_subset(pheno, column, cases=None): + # TODO check pandas type input + subset_mask = np.array(~pheno[column].isnull()) + if cases is not None and not not cases: + all_cases = pheno.loc[subset_mask][column].unique() + try: + case_available = np.array([True if case in all_cases else False for case in cases]) + except TypeError as e: + raise Exception(f'the attribute "cases" needs to be iterable but is: {type(cases)}') from e + if not all(case_available): + if not any(case_available): + raise Exception(f'none of the requested cases of "{column}" are available') + else: + warnings.warn( + f'\nnot all requested cases of "{column}" are available: {list(zip(cases, case_available))}', + RuntimeWarning) + case_masks = np.array([pheno[column] == case for case in cases]) + subset_mask = np.any(case_masks, 0) + # Return the masked instances of the requested cases + cases_dict = {case: case_masks[idx][subset_mask] for idx, case in enumerate(cases)} + return subset_mask, cases_dict + else: + return subset_mask + + +def standardize(data, mask): + scaler = skp.StandardScaler(with_mean=False, with_std=True) + scaler.fit(data[mask, :]) + standardized_data = scaler.transform(data) + return 
standardized_data + + +def find_contrast(design_matrix, contrast): + # Find the contrast column + contrast_columns = [(col_id, col) for col_id, col in enumerate(design_matrix.columns) if f'{contrast}' in col] + if not len(contrast_columns) == 1: + raise Exception(f'There is no single factor that matches {contrast}: {(list(design_matrix.columns))}') + return contrast_columns + + +def fast_glm(data, design_matrix, contrast): + # Does not compute p-values but operates in parallel + contrast_id, contrast_name = find_contrast(design_matrix, contrast)[0] + glm = sln.LinearRegression(fit_intercept=False, normalize=False, n_jobs=-2) + res = glm.fit(design_matrix, data) + betas = res.coef_[:, contrast_id] + return betas + + +def glm(data, design_matrix, contrast): + contrast_id, contrast_name = find_contrast(design_matrix, contrast)[0] + n_data = data.shape[1] + + # Conduct the GLM + betas = np.zeros(shape=n_data) + pvals = np.zeros(shape=n_data) + for conn_id in range(n_data): + model = sm.OLS(data[:, conn_id], design_matrix) + results = model.fit() + betas[conn_id] = results.params[contrast_id] + pvals[conn_id] = results.pvalues[contrast_id] + + return betas, pvals + + +def glm_wrap_cc(conn, pheno, group, case, control, regressors='', report=False, fast=False): + # Make sure pheno and conn have the same number of cases + if not conn.shape[0] == pheno.shape[0]: + print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases') + + # Define the subset of the sample + sub_mask, case_masks = find_subset(pheno, group, [case, control]) + sub_conn = conn[sub_mask, :] + sub_pheno = pheno.loc[sub_mask] + n_sub = np.sum(sub_mask) + n_case = np.sum(case_masks[case]) + n_control = np.sum(case_masks[control]) + n_data = sub_conn.shape[1] + if report: + print(f'Selected sample based on group variable {group}.\n' + f'cases: {case} (n={n_case})\n' + f'controls: {control} (n={n_control})\n' + f'original sample: n={pheno.shape[0]}; new sample: n={n_sub}\n' + f'{n_data} data points available\n' + f'standardized estimators are based on {group}=={control}') + + stand_conn = standardize(sub_conn, case_masks[control]) + + # Construct design matrix + if type(control) == str: + contrast = f'C({group}, Treatment("{control}"))' + else: + contrast = f'C({group}, Treatment({control}))' + formula = ' + '.join((regressors, contrast)) + dmat = pat.dmatrix(formula, sub_pheno, return_type='dataframe') + + if fast: + betas = fast_glm(sub_conn, dmat, group) + table = pd.DataFrame(data={'betas': betas}) + else: + betas, pvals = glm(sub_conn, dmat, group) + stand_betas, _ = glm(stand_conn, dmat, group) + table = pd.DataFrame(data={'betas': betas, 'stand_betas': stand_betas, 'pvals': pvals}) + + return table + + +def glm_wrap_continuous(conn, pheno, contrast, regressors, report=False, fast=False): + # Make sure pheno and conn have the same number of cases + if not conn.shape[0] == pheno.shape[0]: + print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases') + + # Define the subset of the sample + sub_mask = find_subset(pheno, contrast) + sub_conn = conn[sub_mask, :] + sub_pheno = pheno.loc[sub_mask] + n_sub = sub_pheno.shape[0] + n_data = sub_conn.shape[1] + sub_conn_stand = standardize(sub_conn, np.ones(n_sub).astype(bool)) + + if report: + print(f'Selected sample based on contrast variable {contrast}.\n' + f'Found {n_sub} subjects with no missing data for {contrast}\n' + f'original sample: n={pheno.shape[0]}; new sample: n={n_sub}\n' + f'{n_data} data points available\n' + 
f'standardized estimators are based on all subjects with no missing data for {contrast}') + + formula = ' + '.join((regressors, contrast)) + design_matrix = pat.dmatrix(formula, sub_pheno, return_type='dataframe') + + if fast: + betas = fast_glm(sub_conn, design_matrix, contrast) + table = pd.DataFrame(data={'betas': betas}) + else: + betas, pvals = glm(sub_conn, design_matrix, contrast) + stand_betas, _ = glm(sub_conn_stand, design_matrix, contrast) + table = pd.DataFrame(data={'betas': betas, 'stand_betas': stand_betas, 'pvals': pvals}) + + return table + + +def permutation_glm_contrast(conn, pheno, n_a, n_a_1, n_b, n_b_1, n_iter=5000): + # conn: connectome (n_sub, n_conn) + # pheno: pheno pandas table, same order as conn + # n_a / n_b: number of subs per group a / b + # n_a_1 / n_b_1: number of subs in the first contrast of group a / b + # n_iter: number of permutations + + # Some sanity checks because people can be stupid + if n_a_1 > n_a: + raise ValueError(f'n_a_1 ({n_a_1}) cannot be larger than n_a ({n_a})') + if n_b_1 > n_b: + raise ValueError(f'n_b_1 ({n_b_1}) cannot be larger than n_b ({n_b})') + + shsp = skm.ShuffleSplit(n_splits=n_iter, test_size=0.5) + pheno.reset_index(inplace=True, drop=True) + pheno.loc[:, 'group'] = 0 + n_subjects = pheno.shape[0] + beta = np.zeros((conn.shape[1], 2, n_iter)) + + start = time.time() + for fold_id, (group_a, group_b) in enumerate(shsp.split(np.ones(n_subjects))): + pheno_a = pheno.loc[group_a[:n_a]].copy() + pheno_b = pheno.loc[group_b[:n_b]].copy() + # Assign the groups + pheno_a.loc[group_a[:n_a_1], 'group'] = 1 + pheno_b.loc[group_b[:n_b_1], 'group'] = 1 + mat1 = pat.dmatrix('sex_dummy + Age_Bin + FD_scrubbed_s1_s2 + group', pheno_a, return_type='dataframe') + mat2 = pat.dmatrix('sex_dummy + Age_Bin + FD_scrubbed_s1_s2 + group', pheno_b, return_type='dataframe') + beta[:, 0, fold_id] = fast_glm(conn[group_a[:n_a], :], mat1, 'group') + beta[:, 1, fold_id] = fast_glm(conn[group_b[:n_b], :], mat2, 'group') + + elapsed = time.time() - start + done = fold_id + 1 + remaining = n_iter - done + time_left = (elapsed / done) * remaining + + sys.stdout.write('\r {}/{}. {:.2f}s left ({:.3f}s)'.format(done, n_iter, time_left, elapsed / done)) + sys.stdout.flush() + sys.stdout.write('\r Done. 
Took {:.2f}s'.format(elapsed)) + sys.stdout.flush() + + return beta + + +def contrast_correlation(beta, mask, mode='region'): + # mode: 'region' or 'global' + if mode == 'region': + n_iter = beta.shape[2] + n_seed = mask.shape[0] + corr = np.zeros((n_seed, n_iter)) + for fold_id in range(n_iter): + # Map the betas back to a matrix + beta1_mat = conn2mat(beta[:, 0, fold_id], mask) + beta2_mat = conn2mat(beta[:, 1, fold_id], mask) + # Compute the correlation + corr[:, fold_id] = np.array([np.corrcoef(beta1_mat[nid, :], beta2_mat[nid, :])[0, 1] for nid in range(64)]) + return corr + elif mode == 'global': + n_iter = beta.shape[2] + corr = np.array([np.corrcoef(beta[:, 0, fold_id], beta[:, 1, fold_id])[0, 1] for fold_id in range(n_iter)]) + return corr + else: + raise ValueError(f'mode {mode} is not configured!') + + +def permutation_glm(pheno, conn, group, case, control, regressors='', n_iter=1000, stand=False): + # Make sure pheno and conn have the same number of cases + if not conn.shape[0] == pheno.shape[0]: + print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) must be same number of cases') + sub_mask, case_masks = find_subset(pheno, group, [case, control]) + sub_pheno = pheno.loc[sub_mask, :] + sub_pheno.loc[:, 'group'] = '' + sub_conn = conn[sub_mask, :] + # Standardize the input data + if stand: + sub_conn = standardize(sub_conn, case_masks[control]) + n_conn = sub_conn.shape[1] + + n1 = np.sum(case_masks[case]) + n2 = np.sum(case_masks[control]) + r2 = n2 / (n1 + n2) + + splitter = skm.ShuffleSplit(n_splits=n_iter, test_size=r2, random_state=1) + + i_pheno = sub_pheno.copy() + group_id = list(i_pheno.columns).index('group') + + betas = np.zeros(shape=(n_iter, n_conn)) + start = time.time() + for fold_id, (train_id, test_id) in enumerate(splitter.split(sub_conn)): + n1_ids = train_id[:n1] + n2_ids = test_id[:n2] + i_pheno = sub_pheno.copy() + i_pheno['group'] = '' + i_pheno.iloc[n1_ids, group_id] = 'case' + i_pheno.iloc[n2_ids, group_id] = 'control' + table = glm_wrap_cc(sub_conn, i_pheno, 'group', 'case', 'control', regressors=regressors, + report=False, fast=True) + betas[fold_id, :] = table['betas'].values + + elapsed = time.time() - start + done = fold_id + 1 + remaining = n_iter - done + time_left = (elapsed / done) * remaining + + sys.stdout.write('\r {}/{}. {:.2f}s left ({:.3f}s per permutation)'.format(done, n_iter, time_left, elapsed / done)) + sys.stdout.flush() + sys.stdout.write('\r Done. Took {:.2f}s'.format(elapsed)) + sys.stdout.flush() + + return betas diff --git a/scripts/oasis_wrangling_draft.ipynb b/scripts/oasis_wrangling_draft.ipynb new file mode 100644 index 0000000..8a875a8 --- /dev/null +++ b/scripts/oasis_wrangling_draft.ipynb @@ -0,0 +1,401 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 33, + "id": "476173ef-0750-4714-9822-1cbe24db08ee", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "\n", + "## Load cobre participant data and do some stuff\n", + "\n", + "root_p = Path(\"__file__\").resolve().parents[1] / \"data\" / \"oasis\"\n", + "\n", + "df = pd.read_csv(root_p / \"clarken_8_28_2023_13_54_33.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "1aaf1c89-1f7c-4b75-be35-c120ad9484bc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[HTML table rendering of df.head() omitted (5 rows × 120 columns); the equivalent text/plain output is kept below]
" + ], + "text/plain": [ + " MR ID Subject Age Scanner \\\n", + "0 OAS30001_MR_d0129 OAS30001 65.0 3.0T \n", + "1 OAS30001_MR_d0757 OAS30001 67.0 3.0T \n", + "2 OAS30001_MR_d2430 OAS30001 71.0 3.0T \n", + "3 OAS30001_MR_d3132 OAS30001 73.0 3.0T \n", + "4 OAS30001_MR_d3746 OAS30001 NaN NaN \n", + "\n", + " Scans M/F NPIQINF NPIQINFX \\\n", + "0 bold(3), T1w(2), T2w(2) F 2.0 NaN \n", + "1 bold(2), dwi(1), minIP(1), swi(1), T1w(2), T2s... F 2.0 NaN \n", + "2 angio(1), asl(1), dwi(2), fieldmap(3), FLAIR(1... F 2.0 NaN \n", + "3 asl(2), bold(2), dwi(2), fieldmap(3), T1w(1), ... F 2.0 NaN \n", + "4 angio(1), asl(1), bold(2), fieldmap(3), FLAIR(... F 2.0 NaN \n", + "\n", + " DEL DELSEV ... NEOPIF COGOTH COGOTHX COGOTHIF COGOTH2 COGOTH2X \\\n", + "0 0.0 NaN ... NaN 0.0 NaN NaN NaN NaN \n", + "1 0.0 NaN ... NaN 0.0 NaN NaN NaN NaN \n", + "2 0.0 NaN ... NaN 0.0 NaN NaN 0.0 NaN \n", + "3 0.0 NaN ... NaN 0.0 NaN NaN 0.0 NaN \n", + "4 0.0 NaN ... NaN 0.0 NaN NaN NaN NaN \n", + "\n", + " COGOTH2F COGOTH3 COGOTH3X COGOTH3F \n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN 0.0 NaN NaN \n", + "3 NaN 0.0 NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + "[5 rows x 120 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "e24c5b24-8227-413c-b2b8-a5d537b0c01c", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename some columns and map values for sex\n", + "df.rename(\n", + " columns={\"M/F\": \"sex\", \"Age\": \"age\", \"Subject\": \"participant_id\"},\n", + " inplace=True,\n", + ")\n", + "df[\"sex\"] = df[\"sex\"].map({\"M\": 0, \"F\": 1})" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "03e5edfa-c403-4cae-85be-e9fcb18e3057", + "metadata": {}, + "outputs": [], + "source": [ + "# Find controls in one of two ways, due to how missing data was coded \n", + "# Create a list of column names that need to be empty\n", + "empty_columns = [\"NORMCOG\", \"DEMENTED\", \"MCIAMEM\", \"MCIAPLUS\", \"MCIAPLAN\", \"MCIAPATT\", \"MCIAPEX\", \"MCIAPVIS\",\n", + " \"MCINON1\", \"MCIN1LAN\", \"MCIN1ATT\", \"MCIN1EX\", \"MCIN1VIS\", \"MCINON2\", \"MCIN2LAN\", \"MCIN2ATT\",\n", + " \"MCIN2EX\", \"MCIN2VIS\"]\n", + "\n", + "# Use boolean indexing to filter rows based on conditions\n", + "control_df = df[((df[\"dx1\"] == 'Cognitively healthy') & (df[empty_columns].isna().all(axis=1))) |\n", + " (df['NORMCOG'] == 1)]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "af86c09e-09dc-4a9b-b856-d91e0ebbd79e", + "metadata": {}, + "outputs": [], + "source": [ + "# MCI only\n", + "# Filter rows where any of the MCI diagnoses are True\n", + "mci_df = df[df[['MCIAMEM', 'MCIAPLUS', 'MCIAPLAN', 'MCIAPATT', 'MCIAPEX',\n", + " 'MCIAPVIS', 'MCINON1', 'MCIN1LAN', 'MCIN1ATT', 'MCIN1EX',\n", + " 'MCIN1VIS', 'MCINON2', 'MCIN2LAN', 'MCIN2ATT', 'MCIN2EX', 'MCIN2VIS']].any(axis=1)]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "3d728ec7-d0a5-492c-a26d-a3798af9739d", + "metadata": {}, + "outputs": [], + "source": [ + "# Alzheimers only\n", + "# Filter rows where DEMENTED column is True\n", + "ad_df = df[df['DEMENTED'] == 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "5d62faec-57c0-469e-99f3-88a9d4ea064d", + "metadata": {}, + "outputs": [], + "source": [ + "# Add group column\n", + "control_df.loc[:, \"group\"] = \"CON\"\n", + "mci_df.loc[:, \"group\"] = \"MCI\"\n", + "ad_df.loc[:, \"group\"] = 
\"AD\"" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "1a0facb6-4dc4-4ecb-a332-131e8efbcbfe", + "metadata": {}, + "outputs": [], + "source": [ + "def _mbi_conversion(df):\n", + " # Calculate MBI domains\n", + " df[\"decreased_motivation\"] = df[\"APA\"]\n", + " df[\"emotional_dysregulation\"] = df[\"DEPD\"] + df[\"ANX\"] + df[\"ELAT\"]\n", + " df[\"impulse_dyscontrol\"] = df[\"AGIT\"] + df[\"IRR\"] + df[\"MOT\"]\n", + " df[\"social_inappropriateness\"] = df[\"DISN\"]\n", + " df[\"abnormal_perception\"] = df[\"DEL\"] + df[\"HALL\"]\n", + "\n", + " # Calculate MBI total score\n", + " mbi_domains = [\n", + " \"decreased_motivation\",\n", + " \"emotional_dysregulation\",\n", + " \"impulse_dyscontrol\",\n", + " \"social_inappropriateness\",\n", + " \"abnormal_perception\",\n", + " ]\n", + " df[\"MBI_total_score\"] = df[mbi_domains].sum(axis=1)\n", + " df[\"mbi_status\"] = (df[\"MBI_total_score\"] >= 1).astype(int)\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "59093612-7228-4859-90af-2d4dc6d8578d", + "metadata": {}, + "outputs": [], + "source": [ + "mci_df = _mbi_conversion(mci_df.copy()) \n", + "dementia_df = _mbi_conversion(dementia_df.copy()) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a10dfed-85f3-41a7-8f3c-6581db86b56a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d1a3f3a-bb9e-46a0-8721-00604aa2248d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1082b745-09a1-478a-b889-b35cef7518fd", + "metadata": {}, + "outputs": [], + "source": [ + "# Next steps: match connectomes to patient df to check which ones we have. select earliest one with mbi status.\n", + "# then separate into positive and negative. 
select all the correct columns, then merge with controls\n", + "\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From fdc8f47b740b890d4cc4818ed2aef7a8beda2de7 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 9 Apr 2024 15:09:42 -0400 Subject: [PATCH 08/14] tidied up, added npy script --- scripts/adni_npi_to_mbi.py | 51 ---- scripts/cimaq_npi_to_mbi.py | 87 ------ scripts/get_cobre_participants.py | 33 --- scripts/get_ds00030_participants.py | 33 --- scripts/get_srpbs_participants.py | 31 --- scripts/h5_to_npy.py | 173 ++++++++++++ scripts/oasis_wrangling_draft.ipynb | 401 ---------------------------- 7 files changed, 173 insertions(+), 636 deletions(-) delete mode 100644 scripts/adni_npi_to_mbi.py delete mode 100644 scripts/cimaq_npi_to_mbi.py delete mode 100644 scripts/get_cobre_participants.py delete mode 100644 scripts/get_ds00030_participants.py delete mode 100644 scripts/get_srpbs_participants.py create mode 100644 scripts/h5_to_npy.py delete mode 100644 scripts/oasis_wrangling_draft.ipynb diff --git a/scripts/adni_npi_to_mbi.py b/scripts/adni_npi_to_mbi.py deleted file mode 100644 index 8ff48ba..0000000 --- a/scripts/adni_npi_to_mbi.py +++ /dev/null @@ -1,51 +0,0 @@ -import pandas as pd -from pathlib import Path - -## Load NPIQ or NPI scores for ADNI and convert to MBI framework -## Domains: https://adni.bitbucket.io/reference/npi.html - - -def _mbi_conversion(df): - # Calculate MBI domains - df["decreased_motivation"] = df["NPIG"] - df["emotional_dysregulation"] = df["NPID"] + df["NPIE"] + df["NPIF"] - df["impulse_dyscontrol"] = df["NPIC"] + df["NPII"] + df["NPIJ"] - df["social_inappropriateness"] = df["NPIH"] - df["abnormal_perception"] = df["NPIA"] + df["NPIB"] - - # Calculate MBI total score - mbi_domains = [ - "decreased_motivation", - "emotional_dysregulation", - "impulse_dyscontrol", - "social_inappropriateness", - "abnormal_perception", - ] - df["mbi_total_score"] = df[mbi_domains].sum(axis=1) - df["mbi_status"] = (df["mbi_total_score"] >= 1).astype(int) - - mbi_columns = [ - "RID", - "EXAMDATE", - "decreased_motivation", - "emotional_dysregulation", - "impulse_dyscontrol", - "social_inappropriateness", - "abnormal_perception", - "mbi_total_score", - "mbi_status", - ] - mbi_df = df[mbi_columns].copy() - - return mbi_df - - -root_p = Path("__file__").resolve().parents[1] / "data" / "adni" - -# Load NPI or NPI-Q scores -npi_df = pd.read_csv(root_p / "NPI_22Aug2023.csv") - -mbi_df = _mbi_conversion(npi_df) - -# Save results - make sure to change name -mbi_df.to_csv(root_p / "adni_npi_mbi_status_20230829.csv", index=False) diff --git a/scripts/cimaq_npi_to_mbi.py b/scripts/cimaq_npi_to_mbi.py deleted file mode 100644 index c94ec4a..0000000 --- a/scripts/cimaq_npi_to_mbi.py +++ /dev/null @@ -1,87 +0,0 @@ -import pandas as pd -from pathlib import Path - -## Load NPIQ scores for CIMA-Q and convert to MBI framework - - -def _map_values(df): - # Map values for columns we are using - mapping = {"0_non": 0, "1_oui_léger": 1, "2_oui_modéré": 2, "3_oui_sévère": 3} - - columns_to_map = [ - "22901_apathie", - "22901_depression_dysphorie", - "22901_anxiete", - "22901_euphorie", - 
"22901_agitation_aggressivite", - "22901_irritabilite", - "22901_comp_moteur_aberrant", - "22901_impulsivite", - "22901_idees_delirantes", - "22901_hallucinations", - ] - - for column in columns_to_map: - df[column] = df[column].map(mapping) - - return df - - -def _mbi_conversion(df): - # Calculate MBI domains - df["decreased_motivation"] = df["22901_apathie"] - df["emotional_dysregulation"] = ( - df["22901_depression_dysphorie"] + df["22901_anxiete"] + df["22901_euphorie"] - ) - df["impulse_dyscontrol"] = ( - df["22901_agitation_aggressivite"] - + df["22901_irritabilite"] - + df["22901_comp_moteur_aberrant"] - ) - df["social_inappropriateness"] = df["22901_impulsivite"] - df["abnormal_perception"] = ( - df["22901_idees_delirantes"] + df["22901_hallucinations"] - ) - - # Calculate MBI total score - mbi_domains = [ - "decreased_motivation", - "emotional_dysregulation", - "impulse_dyscontrol", - "social_inappropriateness", - "abnormal_perception", - ] - - df["mbi_total_score"] = df[mbi_domains].sum(axis=1) - df["mbi_status"] = (df["mbi_total_score"] >= 1).astype(int) - - mbi_columns = [ - "PSCID", - "Date_taken", - "decreased_motivation", - "emotional_dysregulation", - "impulse_dyscontrol", - "social_inappropriateness", - "abnormal_perception", - "mbi_total_score", - "mbi_status", - ] - - mbi_df = df[mbi_columns].copy() - - return mbi_df - - -root_p = Path("__file__").resolve().parents[1] / "data" / "cimaq" - -# Load NPI-Q scores -npi_df = pd.read_csv( - root_p / "22901_inventaire_neuropsychiatrique_q_initiale.tsv", sep="\t" -) - -npi_df = npi_df.dropna(subset=["Date_taken"]) -npi_df = _map_values(npi_df) -mbi_df = _mbi_conversion(npi_df) - -# Save results - make sure to change name -mbi_df.to_csv(root_p / "cimaq_npiq_mbi_status_20230829.csv", index=False) diff --git a/scripts/get_cobre_participants.py b/scripts/get_cobre_participants.py deleted file mode 100644 index bec947a..0000000 --- a/scripts/get_cobre_participants.py +++ /dev/null @@ -1,33 +0,0 @@ -import pandas as pd -from pathlib import Path - -## Load cobre participant data and do some stuff - -root_p = Path("__file__").resolve().parents[1] / "data" - -df = pd.read_csv(root_p / "cobre" / "COBRE_phenotypic_data.csv") - -# Add columns SZ and CON -df["SZ"] = (df["Subject Type"] == "Patient").astype(int) -df["CON"] = (df["Subject Type"] == "Control").astype(int) - -# Add group column -df["group"] = df["Subject Type"] -df["group"] = df["group"].replace({"Patient": "SZ", "Control": "CON"}) - -# Rename some columns and map values for sex -df.rename( - columns={"Gender": "sex", "Current Age": "age", "Unnamed: 0": "participant_id"}, - inplace=True, -) -df["sex"] = df["sex"].map({"Male": 0, "Female": 1}) - -# Add cohort and site columns -df["cohort"] = "COBRE" -df["site"] = 0 - -cols = ["participant_id", "age", "sex", "site", "cohort", "group", "SZ", "CON"] -df = df[cols] - -# Save results -df.to_csv(root_p / "cobre_participants.csv", index=False) diff --git a/scripts/get_ds00030_participants.py b/scripts/get_ds00030_participants.py deleted file mode 100644 index e49d66c..0000000 --- a/scripts/get_ds00030_participants.py +++ /dev/null @@ -1,33 +0,0 @@ -import pandas as pd -from pathlib import Path - -## Load ds000030 participant data and do some stuff - -root_p = Path("__file__").resolve().parents[1] / "data" - -df = pd.read_csv(root_p / "ds000030" / "participants.csv") - -# Keep only rows where diagnosis is CONTROL or SCHZ -df = df[df["diagnosis"].isin(["CONTROL", "SCHZ"])] - -# Add columns SZ and CON -df["SZ"] = (df["diagnosis"] == 
"SCHZ").astype(int) -df["CON"] = (df["diagnosis"] == "CONTROL").astype(int) - -# Add group column -df["group"] = df["diagnosis"] -df["group"] = df["group"].replace({"SCHZ": "SZ", "CONTROL": "CON"}) - -# Rename some columns and map values for sex -df.rename(columns={"gender": "sex"}, inplace=True) -df["sex"] = df["sex"].map({"M": 0, "F": 1}) - -# Add COHORT and SITE columns -df["cohort"] = "ds000030" -df["site"] = 0 - -cols = ["participant_id", "age", "sex", "site", "cohort", "group", "SZ", "CON"] -df = df[cols] - -# Save results -df.to_csv(root_p / "ds000030_participants.csv", index=False) diff --git a/scripts/get_srpbs_participants.py b/scripts/get_srpbs_participants.py deleted file mode 100644 index fddba39..0000000 --- a/scripts/get_srpbs_participants.py +++ /dev/null @@ -1,31 +0,0 @@ -import pandas as pd -from pathlib import Path - -## Load srpbs participant data and do some stuff - -root_p = Path("__file__").resolve().parents[1] / "data" - -df = pd.read_csv(root_p / "srpbs" / "participants.tsv", sep="\t") - -# Keep only rows where diagnosis is 0 (control) or 4 (sz) -df = df[df["diag"].isin([0, 4])] - -# Add columns SZ and CON -df["SZ"] = (df["diag"] == 4).astype(int) -df["CON"] = (df["diag"] == 0).astype(int) - -# Add group column -df["group"] = df["diag"] -df["group"] = df["group"].replace({4: "SZ", 0: "CON"}) - -# Map values for sex, from total N in Tanaka paper -df["sex"] = df["sex"].map({1: 0, 2: 1}) - -# Add cohort and site columns -df["cohort"] = "SRPBS" - -cols = ["participant_id", "age", "sex", "site", "cohort", "group", "SZ", "CON"] -df = df[cols] - -# Save results -df.to_csv(root_p / "srpbs_participants.csv", index=False) diff --git a/scripts/h5_to_npy.py b/scripts/h5_to_npy.py new file mode 100644 index 0000000..805bb0b --- /dev/null +++ b/scripts/h5_to_npy.py @@ -0,0 +1,173 @@ +#import os dont think I need this? +import h5py +import numpy as np +import pandas as pd +from pathlib import Path + +def load_connectome(file, participant_id, session, identifier, no_session): + # Construct the dataset path + if no_session: + dataset_path = f"sub-{participant_id}/{identifier}_atlas-MIST_desc-64_connectome" + else: + dataset_path = f"sub-{participant_id}/ses-{session}/{identifier}_atlas-MIST_desc-64_connectome" + + dataset = file.get(dataset_path) + + connectome_npy = None # Initialise to None in case missing + if dataset is not None: + connectome_npy = dataset[...] 
# Load the dataset into a NumPy array + return connectome_npy + +def save_connectome(connectome_npy, output_p, identifier): + if connectome_npy is not None: + output_file_p = output_p / f"{identifier}.npy" + np.save(output_file_p, connectome_npy) + print(f"Saved data for {identifier}") + else: + print(f"Connectome for {identifier} not found or could not be loaded") + +def process_connectome_participant(row, base_dir, output_p): + identifier = row["identifier"] + participant_id = row["participant_id"] + session = row["ses"] + # Construct path to subject's HDF5 file + hdf5_file_p = base_dir / f"{participant_id}" / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5" + + if hdf5_file_p.exists(): + with h5py.File(hdf5_file_p, "r") as file: + connectome_npy = load_connectome(file, participant_id, session, identifier) + save_connectome(connectome_npy, output_p, identifier) + +def process_connectome_group(row, file, output_p, no_session): + identifier = row["identifier"] + participant_id = row["participant_id"] + session = row["ses"] + + connectome_npy = load_connectome(file, participant_id, session, identifier, no_session) + save_connectome(connectome_npy, output_p, identifier) + +def process_cobre(conn_p, df, output_dir): + base_dir = conn_p / "cobre_connectome-0.4.1" + hdf5_file_p = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" + output_p = output_dir / "cobre" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'cobre'] + + with h5py.File(hdf5_file_p, "r") as file: + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_group(row, file, output_p) + +def process_ds000030(conn_p, df, output_dir): + base_dir = conn_p / "ds000030_connectomes-0.4.1" + hdf5_file_p = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" + output_p = output_dir / "ds000030" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'ds000030'] + + with h5py.File(hdf5_file_p, "r") as file: + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_group(row, file, output_p, no_session=True) + +def process_hcpep(conn_p, df, output_dir): + base_dir = conn_p / "hcp-ep_connectome-0.4.1" + output_p = output_dir / "hcpep" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'hcpep'] + + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_participant(row, base_dir, output_p) + +def process_srpbs(conn_p, df, output_dir): + base_dir = conn_p / "srpbs_connectome-0.4.1" + output_p = output_dir / "srpbs" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Filter df for dataset + filtered_df = df[df['dataset'] == 'srpbs'] + + # Loop through the filtered dataframe + for index, row in filtered_df.iterrows(): + process_connectome_participant(row, base_dir, output_p) + +def process_adni(conn_p, final_file_p, output_dir): + base_dir = conn_p / "adni_connectomes-0.4.1" + final_file = final_file_p / "final_adni.tsv" + output_p = output_dir / "adni" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(final_file, sep="\t") + + # Loop through the filtered dataframe + for index, row in df.iterrows(): + 
process_connectome_participant(row, base_dir, output_p) + +def process_oasis3(conn_p, final_file_p, output_dir): + base_dir = conn_p / "oasis3_connectomes-0.4.1" + final_file = final_file_p / "final_oasis3.tsv" + output_p = output_dir / "oasis3" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(final_file, sep="\t") + + # Loop through the filtered dataframe + for index, row in df.iterrows(): + process_connectome_participant(row, base_dir, output_p) + +def process_cimaq(conn_p, final_file_p, output_dir): + base_dir = conn_p / "cimaq_connectomes-0.4.1" + final_file = final_file_p / "final_cimaq.tsv" + output_p = output_dir / "cimaq" + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(final_file, sep="\t") + + # Loop through the filtered dataframe + for index, row in df.iterrows(): + process_connectome_participant(row, base_dir, output_p) + + +if __name__ == "__main__": + passed_qc_file_p = Path("/home/nclarke/projects/rrg-pbellec/nclarke/ad_sz/files/passed_qc_master.tsv") + conn_p = Path("/home/nclarke/scratch") + output_dir = Path("/home/nclarke/scratch/npy_connectome") + + # These datasets needed some extra steps aftre checking QC, so get identifiers from a different tsv + final_file_p = Path("/home/nclarke/projects/rrg-pbellec/nclarke/ad_sz/files") + + # Load the file to get identifiers and participant_ids + df = pd.read_csv(passed_qc_file_p, sep="\t") + + #process_cobre(conn_p, df, output_dir) + process_ds000030(conn_p, df, output_dir) + #process_hcpep(conn_p, df, output_dir) + #process_srpbs(conn_p, df, output_dir) + #process_adni(conn_p, final_file_p, output_dir) + #process_cimaq(conn_p, final_file_p, output_dir) + #process_oasis3(conn_p, final_file_p, output_dir) + + + + + + + + diff --git a/scripts/oasis_wrangling_draft.ipynb b/scripts/oasis_wrangling_draft.ipynb deleted file mode 100644 index 8a875a8..0000000 --- a/scripts/oasis_wrangling_draft.ipynb +++ /dev/null @@ -1,401 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 33, - "id": "476173ef-0750-4714-9822-1cbe24db08ee", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from pathlib import Path\n", - "\n", - "## Load cobre participant data and do some stuff\n", - "\n", - "root_p = Path(\"__file__\").resolve().parents[1] / \"data\" / \"oasis\"\n", - "\n", - "df = pd.read_csv(root_p / \"clarken_8_28_2023_13_54_33.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "1aaf1c89-1f7c-4b75-be35-c120ad9484bc", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[HTML table rendering of df.head() omitted (5 rows × 120 columns); the equivalent text/plain output follows]
" - ], - "text/plain": [ - " MR ID Subject Age Scanner \\\n", - "0 OAS30001_MR_d0129 OAS30001 65.0 3.0T \n", - "1 OAS30001_MR_d0757 OAS30001 67.0 3.0T \n", - "2 OAS30001_MR_d2430 OAS30001 71.0 3.0T \n", - "3 OAS30001_MR_d3132 OAS30001 73.0 3.0T \n", - "4 OAS30001_MR_d3746 OAS30001 NaN NaN \n", - "\n", - " Scans M/F NPIQINF NPIQINFX \\\n", - "0 bold(3), T1w(2), T2w(2) F 2.0 NaN \n", - "1 bold(2), dwi(1), minIP(1), swi(1), T1w(2), T2s... F 2.0 NaN \n", - "2 angio(1), asl(1), dwi(2), fieldmap(3), FLAIR(1... F 2.0 NaN \n", - "3 asl(2), bold(2), dwi(2), fieldmap(3), T1w(1), ... F 2.0 NaN \n", - "4 angio(1), asl(1), bold(2), fieldmap(3), FLAIR(... F 2.0 NaN \n", - "\n", - " DEL DELSEV ... NEOPIF COGOTH COGOTHX COGOTHIF COGOTH2 COGOTH2X \\\n", - "0 0.0 NaN ... NaN 0.0 NaN NaN NaN NaN \n", - "1 0.0 NaN ... NaN 0.0 NaN NaN NaN NaN \n", - "2 0.0 NaN ... NaN 0.0 NaN NaN 0.0 NaN \n", - "3 0.0 NaN ... NaN 0.0 NaN NaN 0.0 NaN \n", - "4 0.0 NaN ... NaN 0.0 NaN NaN NaN NaN \n", - "\n", - " COGOTH2F COGOTH3 COGOTH3X COGOTH3F \n", - "0 NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN \n", - "2 NaN 0.0 NaN NaN \n", - "3 NaN 0.0 NaN NaN \n", - "4 NaN NaN NaN NaN \n", - "\n", - "[5 rows x 120 columns]" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "e24c5b24-8227-413c-b2b8-a5d537b0c01c", - "metadata": {}, - "outputs": [], - "source": [ - "# Rename some columns and map values for sex\n", - "df.rename(\n", - " columns={\"M/F\": \"sex\", \"Age\": \"age\", \"Subject\": \"participant_id\"},\n", - " inplace=True,\n", - ")\n", - "df[\"sex\"] = df[\"sex\"].map({\"M\": 0, \"F\": 1})" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "03e5edfa-c403-4cae-85be-e9fcb18e3057", - "metadata": {}, - "outputs": [], - "source": [ - "# Find controls in one of two ways, due to how missing data was coded \n", - "# Create a list of column names that need to be empty\n", - "empty_columns = [\"NORMCOG\", \"DEMENTED\", \"MCIAMEM\", \"MCIAPLUS\", \"MCIAPLAN\", \"MCIAPATT\", \"MCIAPEX\", \"MCIAPVIS\",\n", - " \"MCINON1\", \"MCIN1LAN\", \"MCIN1ATT\", \"MCIN1EX\", \"MCIN1VIS\", \"MCINON2\", \"MCIN2LAN\", \"MCIN2ATT\",\n", - " \"MCIN2EX\", \"MCIN2VIS\"]\n", - "\n", - "# Use boolean indexing to filter rows based on conditions\n", - "control_df = df[((df[\"dx1\"] == 'Cognitively healthy') & (df[empty_columns].isna().all(axis=1))) |\n", - " (df['NORMCOG'] == 1)]" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "af86c09e-09dc-4a9b-b856-d91e0ebbd79e", - "metadata": {}, - "outputs": [], - "source": [ - "# MCI only\n", - "# Filter rows where any of the MCI diagnoses are True\n", - "mci_df = df[df[['MCIAMEM', 'MCIAPLUS', 'MCIAPLAN', 'MCIAPATT', 'MCIAPEX',\n", - " 'MCIAPVIS', 'MCINON1', 'MCIN1LAN', 'MCIN1ATT', 'MCIN1EX',\n", - " 'MCIN1VIS', 'MCINON2', 'MCIN2LAN', 'MCIN2ATT', 'MCIN2EX', 'MCIN2VIS']].any(axis=1)]" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "3d728ec7-d0a5-492c-a26d-a3798af9739d", - "metadata": {}, - "outputs": [], - "source": [ - "# Alzheimers only\n", - "# Filter rows where DEMENTED column is True\n", - "ad_df = df[df['DEMENTED'] == 1]" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "5d62faec-57c0-469e-99f3-88a9d4ea064d", - "metadata": {}, - "outputs": [], - "source": [ - "# Add group column\n", - "control_df.loc[:, \"group\"] = \"CON\"\n", - "mci_df.loc[:, \"group\"] = \"MCI\"\n", - "ad_df.loc[:, \"group\"] = 
\"AD\"" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "1a0facb6-4dc4-4ecb-a332-131e8efbcbfe", - "metadata": {}, - "outputs": [], - "source": [ - "def _mbi_conversion(df):\n", - " # Calculate MBI domains\n", - " df[\"decreased_motivation\"] = df[\"APA\"]\n", - " df[\"emotional_dysregulation\"] = df[\"DEPD\"] + df[\"ANX\"] + df[\"ELAT\"]\n", - " df[\"impulse_dyscontrol\"] = df[\"AGIT\"] + df[\"IRR\"] + df[\"MOT\"]\n", - " df[\"social_inappropriateness\"] = df[\"DISN\"]\n", - " df[\"abnormal_perception\"] = df[\"DEL\"] + df[\"HALL\"]\n", - "\n", - " # Calculate MBI total score\n", - " mbi_domains = [\n", - " \"decreased_motivation\",\n", - " \"emotional_dysregulation\",\n", - " \"impulse_dyscontrol\",\n", - " \"social_inappropriateness\",\n", - " \"abnormal_perception\",\n", - " ]\n", - " df[\"MBI_total_score\"] = df[mbi_domains].sum(axis=1)\n", - " df[\"mbi_status\"] = (df[\"MBI_total_score\"] >= 1).astype(int)\n", - "\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "59093612-7228-4859-90af-2d4dc6d8578d", - "metadata": {}, - "outputs": [], - "source": [ - "mci_df = _mbi_conversion(mci_df.copy()) \n", - "dementia_df = _mbi_conversion(dementia_df.copy()) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a10dfed-85f3-41a7-8f3c-6581db86b56a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d1a3f3a-bb9e-46a0-8721-00604aa2248d", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1082b745-09a1-478a-b889-b35cef7518fd", - "metadata": {}, - "outputs": [], - "source": [ - "# Next steps: match connectomes to patient df to check which ones we have. select earliest one with mbi status.\n", - "# then separate into positive and negative. select all the correct columns, then merge with controls\n", - "\n", - "\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From cdf45dc2886f7d1c68765e42d4348b605824b38b Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 9 Apr 2024 15:51:55 -0400 Subject: [PATCH 09/14] add first draft combat file script --- scripts/prepare_combat_file.py | 61 ++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 scripts/prepare_combat_file.py diff --git a/scripts/prepare_combat_file.py b/scripts/prepare_combat_file.py new file mode 100644 index 0000000..5d09044 --- /dev/null +++ b/scripts/prepare_combat_file.py @@ -0,0 +1,61 @@ +from pathlib import Path +import numpy as np + +def load_connectomes(base_directory_p): + """ + Load connectome data from .npy files across multiple subdirectories. + + Parameters: + - base_directory_p: Path to the base directory containing subdirectories of connectome files. + + Returns: + - connectomes: A list of connectome matrices, one per participant across all subdirectories. 
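    Example (a minimal usage sketch; the base directory shown here is hypothetical):
        connectomes = load_connectomes(Path("/path/to/npy_connectome"))
        # each element is one participant's square matrix, e.g. shape (64, 64) for the MIST-64 atlas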
+ """ + connectomes = [] + # Iterate over each subdirectory in the base directory + for directory_p in base_directory_p.iterdir(): + if directory_p.is_dir(): + for file_p in directory_p.glob("*.npy"): + connectome = np.load(file_p) + connectomes.append(connectome) + return connectomes + +def connectome_to_edges_matrix(connectomes): + """ + Transform a list of connectomes into a matrix suitable for ComBat, + where rows are ROI pairs and columns are participants. + """ + num_participants = len(connectomes) + num_rois = connectomes[0].shape[0] + num_edges = num_rois * (num_rois - 1) // 2 + + edges_matrix = np.zeros((num_edges, num_participants)) + for i, connectome in enumerate(connectomes): + edge_index = 0 + for row in range(num_rois): + for col in range(row + 1, num_rois): + edges_matrix[edge_index, i] = connectome[row, col] + edge_index += 1 + + return edges_matrix + +# Set paths +base_directory_p = Path("/home/neuromod/ad_sz/data/npy_connectome") +output_p = "/home/neuromod/ad_sz/data/edges_matrix.tsv" + +# Load connectomes from all subdirectories. +connectomes = load_connectomes(base_directory_p) + +print(f"Number of connectomes loaded: {len(connectomes)}") +for connectome in connectomes[:5]: # Print the shape of the first 5 connectomes as a check + print(connectome.shape) + +# Transform into the ComBat-ready matrix. +edges_matrix = connectome_to_edges_matrix(connectomes) + +print(f"Edges matrix shape: {edges_matrix.shape}") + +# Save the edges_matrix to a .tsv file +np.savetxt(output_p, edges_matrix, delimiter='\t') + +print(f"Saved edges matrix to {output_p}") From 166331d524bccbe9fd4c7910beb4ba4783a6e5a3 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 29 Apr 2024 16:46:59 -0400 Subject: [PATCH 10/14] added data processing script --- scripts/prepare_input_data.py | 151 ++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 scripts/prepare_input_data.py diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py new file mode 100644 index 0000000..5962502 --- /dev/null +++ b/scripts/prepare_input_data.py @@ -0,0 +1,151 @@ +import h5py +import numpy as np +import pandas as pd +from pathlib import Path + +dataset_configs = { + 'cobre': { + 'connectome_path': "cobre_connectome-0.4.1_MIST_afc", + 'no_session': False, + 'subject_folder': False + }, + 'cimaq': { + 'connectome_path': "cimaq_connectome-0.4.1_MIST_afc", + 'no_session': False, + 'subject_folder': True + }, + 'hcpep': { + 'connectome_path': "hcp-ep_connectome-0.4.1", + 'no_session': False, + 'subject_folder': True + }, + 'ds000030': { + 'connectome_path': "ds000030_connectomes-0.4.1", + 'no_session': True, + 'subject_folder': False + }, +} + +def save_npy_connectome(connectome_npy, output_p, identifier): + # Not using this function currently + if connectome_npy is not None: + output_file_p = output_p / f"{identifier}.npy" + np.save(output_file_p, connectome_npy) + print(f"Saved data for {identifier}") + else: + print(f"Connectome for {identifier} not found or could not be loaded") + +def connectome_to_edges_matrix_with_ids(connectomes_with_ids): + # Update function to new one + num_participants = len(connectomes_with_ids) + if num_participants == 0: + return np.array([]), [] # Handling the case of no connectomes + + num_rois = connectomes_with_ids[0][1].shape[0] # Shape from the first connectome + num_edges = num_rois * (num_rois + 1) // 2 + + edges_matrix = np.zeros((num_edges, num_participants)) + identifiers = [] + + for i, (identifier, connectome) in 
enumerate(connectomes_with_ids): + identifiers.append(identifier) + edge_index = 0 + for row in range(num_rois): + for col in range(row, num_rois): # Include the diagonal + edges_matrix[edge_index, i] = connectome[row, col] + edge_index += 1 + + return edges_matrix, identifiers + + +def process_datasets(conn_p, df, dataset_name): + config = dataset_configs[dataset_name] + base_dir = conn_p / config['connectome_path'] + + filtered_df = df[df['dataset'] == dataset_name] + + connectomes_ids = [] + covariate_data_list = [] + for index, row in filtered_df.iterrows(): + participant_id = row["participant_id"] + identifier = row["identifier"] + session = None if config['no_session'] else row.get("ses", None) + site = row["site"] + sex = row["sex"] + age = row["age"] + #diagnosis = row["diagnosis"] + + # Adjust file path based on whether dataset uses a subject folder + if config['subject_folder']: + file_path = base_dir / participant_id / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5" + else: + file_path = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" + try: + with h5py.File(file_path, "r") as file: + connectome = load_connectome(file, participant_id, session, identifier, config['no_session']) + if connectome is not None: + connectomes_ids.append((identifier, connectome)) + #covariates = pd.DataFrame({'SITE': site, 'SEX': sex, 'AGE': age, 'DIAGNOSIS': diagnosis}, index=[identifier]) + covariates = pd.DataFrame({'SITE': site, 'SEX': sex, 'AGE': age}, index=[identifier]) + covariate_data_list.append(covariates) + except FileNotFoundError: + print(f"File not found: {file_path}") + + if connectomes_ids: + edges_matrix, identifiers = connectome_to_edges_matrix_with_ids(connectomes_ids) + edges_df = pd.DataFrame(edges_matrix.T, index=identifiers) + covariate_df = pd.concat(covariate_data_list, ignore_index=False) + return edges_df, covariate_df + else: + return pd.DataFrame(), pd.DataFrame() + +def load_connectome(file, participant_id, session, identifier, no_session): + # Path construction within .h5 file + dataset_path = f"sub-{participant_id}" + + if not no_session: + dataset_path += f"/ses-{session}" + + dataset_path += f"/{identifier}_atlas-MIST_desc-64_connectome" + dataset = file.get(dataset_path) + + if dataset is not None: + return dataset[...] 
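        # Illustrative example (the identifier format is assumed, not taken from the data):
        # for participant "12345", session "1" and identifier "sub-12345_ses-1_task-rest",
        # the dataset_path built above resolves to
        #   "sub-12345/ses-1/sub-12345_ses-1_task-rest_atlas-MIST_desc-64_connectome"
        # and dataset[...] reads the stored 64x64 connectome into memory as a NumPy array.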
+ else: + print(f"Connectome for {identifier} not found or could not be loaded") + return None + +def one_hot_encode_column(df, column_name): + dummies = pd.get_dummies(df[column_name], prefix=column_name) + df = pd.concat([df, dummies], axis=1) + df.drop(column_name, axis=1, inplace=True) + return df + + +if __name__ == "__main__": + conn_p = Path("/home/neuromod/ad_sz/data") + output_p = Path("/home/neuromod/ad_sz/data/input_data") + df = pd.read_csv("/home/neuromod/wrangling-phenotype/outputs/final_master_pheno.tsv",sep="\t") + + if not output_p.exists(): + output_p.mkdir(parents=True, exist_ok=True) + + all_edges = [] + all_covariates = [] + for dataset in dataset_configs: + edges_df, covariates_df = process_datasets(conn_p, df, dataset) + all_edges.append(edges_df) + all_covariates.append(covariates_df) + + # Concatenate all edges and covariates + combined_edges_df = pd.concat(all_edges) + combined_covariates_df = pd.concat(all_covariates) + + # One-hot encode covariates + combined_covariates_df = one_hot_encode_column(combined_covariates_df, 'SEX') + #combined_covariates_df = one_hot_encode_column(combined_covariates_df, 'DIAGNOSIS') + + combined_edges_df.sort_index().to_csv(output_p / 'combined_edges.tsv', sep='\t', index=True) + combined_covariates_df.sort_index().to_csv(output_p / 'combat_covariates.tsv', sep='\t', index=True) + + print(f"Saved edges matrix and coavraites to {output_p}") From 09f19284e1932ef37d79158d53e71db650397eef Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 30 Apr 2024 17:10:39 -0400 Subject: [PATCH 11/14] continued work on data loading --- scripts/prepare_input_data.py | 185 +++++++++++++++++++++++----------- 1 file changed, 124 insertions(+), 61 deletions(-) diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py index 5962502..a256e6f 100644 --- a/scripts/prepare_input_data.py +++ b/scripts/prepare_input_data.py @@ -26,6 +26,22 @@ }, } +def load_connectome(file, participant_id, session, identifier, no_session): + # Path construction within .h5 file + dataset_path = f"sub-{participant_id}" + + if not no_session: + dataset_path += f"/ses-{session}" + + dataset_path += f"/{identifier}_atlas-MIST_desc-64_connectome" + dataset = file.get(dataset_path) + + if dataset is not None: + return dataset[...] 
+ else: + print(f"Connectome for {identifier} not found") + return None + def save_npy_connectome(connectome_npy, output_p, identifier): # Not using this function currently if connectome_npy is not None: @@ -33,47 +49,63 @@ def save_npy_connectome(connectome_npy, output_p, identifier): np.save(output_file_p, connectome_npy) print(f"Saved data for {identifier}") else: - print(f"Connectome for {identifier} not found or could not be loaded") - -def connectome_to_edges_matrix_with_ids(connectomes_with_ids): - # Update function to new one - num_participants = len(connectomes_with_ids) - if num_participants == 0: - return np.array([]), [] # Handling the case of no connectomes - - num_rois = connectomes_with_ids[0][1].shape[0] # Shape from the first connectome - num_edges = num_rois * (num_rois + 1) // 2 - - edges_matrix = np.zeros((num_edges, num_participants)) - identifiers = [] - - for i, (identifier, connectome) in enumerate(connectomes_with_ids): - identifiers.append(identifier) - edge_index = 0 - for row in range(num_rois): - for col in range(row, num_rois): # Include the diagonal - edges_matrix[edge_index, i] = connectome[row, col] - edge_index += 1 - - return edges_matrix, identifiers - + print(f"Connectome for {identifier} not found") + +def connectome_to_edge_matrix(connectome): + """ + Transform a single connectome into an edge matrix. + Extract the upper triangular part of the connectome, including the diagonal. + """ + # Get the upper triangular indices including the diagonal + row_indices, col_indices = np.triu_indices_from(connectome) + + # Extract the values at these indices + edge_matrix = connectome[row_indices, col_indices] + + return edge_matrix + +def process_connectomes(connectomes_by_participant): + """ + For multiple connectomes per participant, average them, apply Fisher z transform, and convert to an edge matrix. 
+ """ + edges_matrix_dict = {} + for participant_id, connectomes in connectomes_by_participant.items(): + # Compute the mean connectome + mean_connectome = np.mean(connectomes, axis=0) + # Apply Fisher Z transformation + transformed_connectome = np.arctanh(mean_connectome) + # Convert connectome to a marix for ComBat + edges_matrix = connectome_to_edge_matrix(transformed_connectome) + edges_matrix_dict[participant_id] = edges_matrix + + return edges_matrix_dict + + +def matrix_dict_to_df(edges_matrix_dict): + # Create a DataFrame from the edge matrices + # Convert dictionary values to a 2D numpy array + edge_matrix_combined = np.column_stack(list(edges_matrix_dict.values())) + participants = list(edges_matrix_dict.keys()) + edges_df = pd.DataFrame(edge_matrix_combined, columns=participants) + + # Transpose so suitable for ComBat + edges_df = edges_df.T + + return edges_df def process_datasets(conn_p, df, dataset_name): config = dataset_configs[dataset_name] base_dir = conn_p / config['connectome_path'] + # Filter the df for required scans for the dataset filtered_df = df[df['dataset'] == dataset_name] + valid_rows = [] # To store rows where the connectome was found - connectomes_ids = [] - covariate_data_list = [] + connectomes_by_participant = {} for index, row in filtered_df.iterrows(): participant_id = row["participant_id"] identifier = row["identifier"] session = None if config['no_session'] else row.get("ses", None) - site = row["site"] - sex = row["sex"] - age = row["age"] - #diagnosis = row["diagnosis"] # Adjust file path based on whether dataset uses a subject folder if config['subject_folder']: @@ -84,36 +116,49 @@ def process_datasets(conn_p, df, dataset_name): with h5py.File(file_path, "r") as file: connectome = load_connectome(file, participant_id, session, identifier, config['no_session']) if connectome is not None: - connectomes_ids.append((identifier, connectome)) - #covariates = pd.DataFrame({'SITE': site, 'SEX': sex, 'AGE': age, 'DIAGNOSIS': diagnosis}, index=[identifier]) - covariates = pd.DataFrame({'SITE': site, 'SEX': sex, 'AGE': age}, index=[identifier]) - covariate_data_list.append(covariates) + # Append the connectome to the participant's list in the dictionary + if participant_id not in connectomes_by_participant: + connectomes_by_participant[participant_id] = [] + connectomes_by_participant[participant_id].append(connectome) + + # Add the row to valid_rows if the connectome is found + valid_rows.append(row) + except FileNotFoundError: print(f"File not found: {file_path}") - if connectomes_ids: - edges_matrix, identifiers = connectome_to_edges_matrix_with_ids(connectomes_ids) - edges_df = pd.DataFrame(edges_matrix.T, index=identifiers) - covariate_df = pd.concat(covariate_data_list, ignore_index=False) - return edges_df, covariate_df - else: - return pd.DataFrame(), pd.DataFrame() + # Process connectomes + if connectomes_by_participant: + edges_matrix_dict = process_connectomes(connectomes_by_participant) + edges_df = matrix_dict_to_df(edges_matrix_dict) -def load_connectome(file, participant_id, session, identifier, no_session): - # Path construction within .h5 file - dataset_path = f"sub-{participant_id}" + # Convert the list of valid rows to a DataFrame + valid_df = pd.DataFrame(valid_rows) - if not no_session: - dataset_path += f"/ses-{session}" + return edges_df, valid_df - dataset_path += f"/{identifier}_atlas-MIST_desc-64_connectome" - dataset = file.get(dataset_path) +def create_pheno_df(valid_df): + # Group by 'participant_id' and aggregate, taking the 
first entry for variables that do not change (for this use case) + aggregation_functions = { + 'sex': 'first', + 'age': 'first', + 'diagnosis': 'first', + 'site': 'first', + 'mean_fd_scrubbed': 'mean', # Averaging mean framewise displacement across scans + 'mbi_status': 'first' + } + pheno_df = valid_df.groupby('participant_id').agg(aggregation_functions) - if dataset is not None: - return dataset[...] - else: - print(f"Connectome for {identifier} not found or could not be loaded") - return None + return pheno_df + +def create_covariates_df(valid_df): + # Group by participant_id and select the first entry for each group + first_entries = valid_df.groupby('participant_id').first() + + # Extract the relevant covariates + covariates_df = first_entries[['site', 'sex', 'age', 'diagnosis']] + + return covariates_df def one_hot_encode_column(df, column_name): dummies = pd.get_dummies(df[column_name], prefix=column_name) @@ -121,6 +166,16 @@ def one_hot_encode_column(df, column_name): df.drop(column_name, axis=1, inplace=True) return df +def process_covariates(combined_covariates_df): + processed_covariates_df = combined_covariates_df.copy() + # Rename diagnosis variable + processed_covariates_df['diagnosis'] = processed_covariates_df['diagnosis'].replace('PSYC', 'SCHZ') + + # One-hot encode covariates + processed_covariates_df = one_hot_encode_column(processed_covariates_df, 'sex') + processed_covariates_df = one_hot_encode_column(processed_covariates_df, 'diagnosis') + + return processed_covariates_df if __name__ == "__main__": conn_p = Path("/home/neuromod/ad_sz/data") @@ -131,21 +186,29 @@ def one_hot_encode_column(df, column_name): output_p.mkdir(parents=True, exist_ok=True) all_edges = [] + all_pheno = [] all_covariates = [] for dataset in dataset_configs: - edges_df, covariates_df = process_datasets(conn_p, df, dataset) + edges_df, valid_df = process_datasets(conn_p, df, dataset) + pheno_df = create_pheno_df(valid_df) + covariates_df = create_covariates_df(valid_df) + + # Collect data per dataset all_edges.append(edges_df) + all_pheno.append(pheno_df) all_covariates.append(covariates_df) - # Concatenate all edges and covariates + # Concatenate data across datasets combined_edges_df = pd.concat(all_edges) + combined_pheno_df = pd.concat(all_pheno) combined_covariates_df = pd.concat(all_covariates) - # One-hot encode covariates - combined_covariates_df = one_hot_encode_column(combined_covariates_df, 'SEX') - #combined_covariates_df = one_hot_encode_column(combined_covariates_df, 'DIAGNOSIS') + # Process covariates and pheno data suitable for analysis + processed_covariates_df = process_covariates(combined_covariates_df) - combined_edges_df.sort_index().to_csv(output_p / 'combined_edges.tsv', sep='\t', index=True) - combined_covariates_df.sort_index().to_csv(output_p / 'combat_covariates.tsv', sep='\t', index=True) + # Ensure order of data is identical and output + combined_edges_df.sort_index().to_csv(output_p / 'combat_edges.tsv', sep='\t', index=True) + processed_covariates_df.sort_index().to_csv(output_p / 'combat_covariates.tsv', sep='\t', index=True) + combined_pheno_df.sort_index().to_csv(output_p / 'pheno.tsv', sep='\t', index=True) - print(f"Saved edges matrix and coavraites to {output_p}") + print(f"Saved edges matrix, covariates and pheno to {output_p}") From 0a8ab25a36611046b5216e65222408c521f541b9 Mon Sep 17 00:00:00 2001 From: clarkenj Date: Mon, 6 May 2024 13:09:26 -0400 Subject: [PATCH 12/14] fully working script --- .gitignore | 2 +- scripts/prepare_input_data.py | 146 
++++++++++++++++++++++------------ 2 files changed, 94 insertions(+), 54 deletions(-) diff --git a/.gitignore b/.gitignore index 0d05551..918137f 100644 --- a/.gitignore +++ b/.gitignore @@ -113,7 +113,7 @@ venv/ ENV/ env.bak/ venv.bak/ -cwas_neuro_env/ +ad_sz_env/ # Spyder project settings .spyderproject diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py index a256e6f..6ba6fed 100644 --- a/scripts/prepare_input_data.py +++ b/scripts/prepare_input_data.py @@ -4,28 +4,24 @@ from pathlib import Path dataset_configs = { - 'cobre': { - 'connectome_path': "cobre_connectome-0.4.1_MIST_afc", - 'no_session': False, - 'subject_folder': False + "cobre": { + "connectome_path": "cobre_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": False, }, - 'cimaq': { - 'connectome_path': "cimaq_connectome-0.4.1_MIST_afc", - 'no_session': False, - 'subject_folder': True + "cimaq": { + "connectome_path": "cimaq_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": True, }, - 'hcpep': { - 'connectome_path': "hcp-ep_connectome-0.4.1", - 'no_session': False, - 'subject_folder': True - }, - 'ds000030': { - 'connectome_path': "ds000030_connectomes-0.4.1", - 'no_session': True, - 'subject_folder': False + "hcpep": { + "connectome_path": "hcp-ep_connectome-0.4.1", + "no_session": False, + "subject_folder": True, }, } + def load_connectome(file, participant_id, session, identifier, no_session): # Path construction within .h5 file dataset_path = f"sub-{participant_id}" @@ -42,6 +38,7 @@ def load_connectome(file, participant_id, session, identifier, no_session): print(f"Connectome for {identifier} not found") return None + def save_npy_connectome(connectome_npy, output_p, identifier): # Not using this function currently if connectome_npy is not None: @@ -51,6 +48,7 @@ def save_npy_connectome(connectome_npy, output_p, identifier): else: print(f"Connectome for {identifier} not found") + def connectome_to_edge_matrix(connectome): """ Transform a single connectome into an edge matrix. @@ -64,6 +62,7 @@ def connectome_to_edge_matrix(connectome): return edge_matrix + def process_connectomes(connectomes_by_participant): """ For multiple connectomes per participant, average them, apply Fisher z transform, and convert to an edge matrix. 
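For reference, the edge extraction and Fisher-z averaging shown in the hunks above can be exercised on a toy matrix as in the sketch below. This is an illustrative sketch only: the 64x64 size mirrors the MIST-64 connectomes, while the random toy values and the clip before np.arctanh are assumptions of the sketch (the script itself applies np.arctanh directly to the averaged connectome).

import numpy as np

rng = np.random.default_rng(0)
a = rng.uniform(-0.8, 0.8, size=(64, 64))
toy_connectome = (a + a.T) / 2           # symmetric, correlation-like values
np.fill_diagonal(toy_connectome, 1.0)

# connectome_to_edge_matrix: upper triangle including the diagonal
rows, cols = np.triu_indices_from(toy_connectome)
edges = toy_connectome[rows, cols]       # 64 * 65 / 2 = 2080 edges per participant

# process_connectomes: average repeated scans, then Fisher z-transform.
# The clip keeps the diagonal of 1.0 finite in this sketch, since np.arctanh(1.0) is +inf.
mean_connectome = np.mean([toy_connectome, toy_connectome], axis=0)
z_connectome = np.arctanh(np.clip(mean_connectome, -1 + 1e-7, 1 - 1e-7))

print(edges.shape, z_connectome.shape)   # (2080,) (64, 64)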
@@ -82,7 +81,7 @@ def process_connectomes(connectomes_by_participant): def matrix_dict_to_df(edges_matrix_dict): - # Create a DataFrame from the edge matrices + # Create a df from the edge matrices # Convert dictionary values to a 2D numpy array edge_matrix_combined = np.column_stack(list(edges_matrix_dict.values())) participants = list(edges_matrix_dict.keys()) @@ -93,28 +92,35 @@ def matrix_dict_to_df(edges_matrix_dict): return edges_df + def process_datasets(conn_p, df, dataset_name): config = dataset_configs[dataset_name] - base_dir = conn_p / config['connectome_path'] + base_dir = conn_p / config["connectome_path"] # Filter the df for required scans for the dataset - filtered_df = df[df['dataset'] == dataset_name] + filtered_df = df[df["dataset"] == dataset_name] valid_rows = [] # To store rows where the connectome was found connectomes_by_participant = {} for index, row in filtered_df.iterrows(): participant_id = row["participant_id"] identifier = row["identifier"] - session = None if config['no_session'] else row.get("ses", None) + session = None if config["no_session"] else row.get("ses", None) # Adjust file path based on whether dataset uses a subject folder - if config['subject_folder']: - file_path = base_dir / participant_id / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5" + if config["subject_folder"]: + file_path = ( + base_dir + / participant_id + / f"sub-{participant_id}_atlas-MIST_desc-scrubbing.5+gsr.h5" + ) else: file_path = base_dir / "atlas-MIST_desc-scrubbing.5+gsr.h5" try: with h5py.File(file_path, "r") as file: - connectome = load_connectome(file, participant_id, session, identifier, config['no_session']) + connectome = load_connectome( + file, participant_id, session, identifier, config["no_session"] + ) if connectome is not None: # Append the connectome to the participant's list in the dictionary if participant_id not in connectomes_by_participant: @@ -127,88 +133,122 @@ def process_datasets(conn_p, df, dataset_name): except FileNotFoundError: print(f"File not found: {file_path}") + edges_df = pd.DataFrame() # Process connectomes if connectomes_by_participant: edges_matrix_dict = process_connectomes(connectomes_by_participant) edges_df = matrix_dict_to_df(edges_matrix_dict) - # Convert the list of valid rows to a DataFrame + # Convert the list of valid rows to a df valid_df = pd.DataFrame(valid_rows) return edges_df, valid_df + def create_pheno_df(valid_df): # Group by 'participant_id' and aggregate, taking the first entry for variables that do not change (for this use case) aggregation_functions = { - 'sex': 'first', - 'age': 'first', - 'diagnosis': 'first', - 'site': 'first', - 'mean_fd_scrubbed': 'mean', # Averaging mean framewise displacement across scans - 'mbi_status': 'first' + "sex": "first", + "age": "first", + "diagnosis": "first", + "site": "first", + "mean_fd_scrubbed": "mean", # Averaging mean framewise displacement across scans + "group": "first", } - pheno_df = valid_df.groupby('participant_id').agg(aggregation_functions) + pheno_df = valid_df.groupby("participant_id").agg(aggregation_functions) return pheno_df -def create_covariates_df(valid_df): + +def create_covariates_df_old(valid_df): # Group by participant_id and select the first entry for each group - first_entries = valid_df.groupby('participant_id').first() + first_entries = valid_df.groupby("participant_id").first() # Extract the relevant covariates - covariates_df = first_entries[['site', 'sex', 'age', 'diagnosis']] + covariates_df = first_entries[["site", "sex", "age", 
"diagnosis"]] return covariates_df + def one_hot_encode_column(df, column_name): dummies = pd.get_dummies(df[column_name], prefix=column_name) df = pd.concat([df, dummies], axis=1) df.drop(column_name, axis=1, inplace=True) return df -def process_covariates(combined_covariates_df): - processed_covariates_df = combined_covariates_df.copy() - # Rename diagnosis variable - processed_covariates_df['diagnosis'] = processed_covariates_df['diagnosis'].replace('PSYC', 'SCHZ') - # One-hot encode covariates - processed_covariates_df = one_hot_encode_column(processed_covariates_df, 'sex') - processed_covariates_df = one_hot_encode_column(processed_covariates_df, 'diagnosis') +def one_hot_encode_column_no_prefix(df, column_name): + dummies = pd.get_dummies(df[column_name]) + df = pd.concat([df, dummies], axis=1) + df.drop(column_name, axis=1, inplace=True) + return df + + +def create_covariates_df(pheno_df): + # Extract the relevant covariates from pheno_df + covariates_df = pheno_df[ + [ + "site", + "sex_male", + "sex_female", + "age", + "diagnosis_MCI", + "diagnosis_ADD", + "diagnosis_CON", + "diagnosis_SCHZ", + ] + ] # split CON into controls from AD and SCHZ datasets? + + return covariates_df - return processed_covariates_df if __name__ == "__main__": conn_p = Path("/home/neuromod/ad_sz/data") output_p = Path("/home/neuromod/ad_sz/data/input_data") - df = pd.read_csv("/home/neuromod/wrangling-phenotype/outputs/final_master_pheno.tsv",sep="\t") + df = pd.read_csv( + "/home/neuromod/wrangling-phenotype/outputs/final_qc_pheno.tsv", sep="\t" + ) if not output_p.exists(): output_p.mkdir(parents=True, exist_ok=True) all_edges = [] all_pheno = [] - all_covariates = [] for dataset in dataset_configs: edges_df, valid_df = process_datasets(conn_p, df, dataset) pheno_df = create_pheno_df(valid_df) - covariates_df = create_covariates_df(valid_df) # Collect data per dataset all_edges.append(edges_df) all_pheno.append(pheno_df) - all_covariates.append(covariates_df) # Concatenate data across datasets combined_edges_df = pd.concat(all_edges) combined_pheno_df = pd.concat(all_pheno) - combined_covariates_df = pd.concat(all_covariates) - # Process covariates and pheno data suitable for analysis - processed_covariates_df = process_covariates(combined_covariates_df) + # One-hot encode columns + combined_pheno_df = one_hot_encode_column(combined_pheno_df, "sex") + combined_pheno_df = one_hot_encode_column(combined_pheno_df, "diagnosis") + combined_pheno_df = one_hot_encode_column_no_prefix(combined_pheno_df, "group") + + # Create covariates_df from pheno_df + covariates_df = create_covariates_df(combined_pheno_df) + + # Capitalize column names, necessary for combat etc + covariates_df.columns = [x.upper() for x in covariates_df.columns] + combined_pheno_df.columns = [x.upper() for x in combined_pheno_df.columns] + + print("Shape of combined_edges_df = ", combined_edges_df.shape) + print("Shape of processed_covariates_df = ", covariates_df.shape) + print("Shape of pheno_hot_df = ", combined_pheno_df.shape) # Ensure order of data is identical and output - combined_edges_df.sort_index().to_csv(output_p / 'combat_edges.tsv', sep='\t', index=True) - processed_covariates_df.sort_index().to_csv(output_p / 'combat_covariates.tsv', sep='\t', index=True) - combined_pheno_df.sort_index().to_csv(output_p / 'pheno.tsv', sep='\t', index=True) + combined_edges_df.sort_index().to_csv( + output_p / "combat_edges.tsv", sep="\t", index=True + ) + covariates_df.sort_index().to_csv( + output_p / "combat_covariates.tsv", sep="\t", 
index=True + ) + combined_pheno_df.sort_index().to_csv(output_p / "pheno.tsv", sep="\t", index=True) print(f"Saved edges matrix, covariates and pheno to {output_p}") From 2b88c3c7a72e5c2d95a2c508bc40e0c7944bde6c Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 7 May 2024 12:57:17 -0400 Subject: [PATCH 13/14] added global signal --- scripts/prepare_input_data.py | 52 ++++++++++++++++------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py index 6ba6fed..d81c26e 100644 --- a/scripts/prepare_input_data.py +++ b/scripts/prepare_input_data.py @@ -4,18 +4,8 @@ from pathlib import Path dataset_configs = { - "cobre": { - "connectome_path": "cobre_connectome-0.4.1_MIST_afc", - "no_session": False, - "subject_folder": False, - }, - "cimaq": { - "connectome_path": "cimaq_connectome-0.4.1_MIST_afc", - "no_session": False, - "subject_folder": True, - }, - "hcpep": { - "connectome_path": "hcp-ep_connectome-0.4.1", + "adni": { + "connectome_path": "adni_connectome-0.4.1_MIST_afc", "no_session": False, "subject_folder": True, }, @@ -65,19 +55,23 @@ def connectome_to_edge_matrix(connectome): def process_connectomes(connectomes_by_participant): """ - For multiple connectomes per participant, average them, apply Fisher z transform, and convert to an edge matrix. + Apply Fisher z transform, compute the global signal and convert to an edge matrix. For multiple connectomes per participant, connectomes are averaged first. """ edges_matrix_dict = {} + global_signal_dict = {} for participant_id, connectomes in connectomes_by_participant.items(): # Compute the mean connectome mean_connectome = np.mean(connectomes, axis=0) # Apply Fisher Z transformation transformed_connectome = np.arctanh(mean_connectome) + # Compute global signal (mean of the Z values) + global_signal = np.mean(transformed_connectome) + global_signal_dict[participant_id] = global_signal # Convert connectome to a marix for ComBat edges_matrix = connectome_to_edge_matrix(transformed_connectome) edges_matrix_dict[participant_id] = edges_matrix - return edges_matrix_dict + return edges_matrix_dict, global_signal_dict def matrix_dict_to_df(edges_matrix_dict): @@ -136,27 +130,31 @@ def process_datasets(conn_p, df, dataset_name): edges_df = pd.DataFrame() # Process connectomes if connectomes_by_participant: - edges_matrix_dict = process_connectomes(connectomes_by_participant) + edges_matrix_dict, global_signal_dict = process_connectomes( + connectomes_by_participant + ) edges_df = matrix_dict_to_df(edges_matrix_dict) # Convert the list of valid rows to a df valid_df = pd.DataFrame(valid_rows) - return edges_df, valid_df + return edges_df, valid_df, global_signal_dict -def create_pheno_df(valid_df): +def create_pheno_df(valid_df, global_signal_dict): # Group by 'participant_id' and aggregate, taking the first entry for variables that do not change (for this use case) aggregation_functions = { "sex": "first", "age": "first", "diagnosis": "first", "site": "first", + "dataset": "first", "mean_fd_scrubbed": "mean", # Averaging mean framewise displacement across scans "group": "first", } pheno_df = valid_df.groupby("participant_id").agg(aggregation_functions) - + # Add global signal to pheno_df + pheno_df["mean_conn"] = pheno_df.index.map(global_signal_dict) return pheno_df @@ -203,11 +201,9 @@ def create_covariates_df(pheno_df): if __name__ == "__main__": - conn_p = Path("/home/neuromod/ad_sz/data") - output_p = Path("/home/neuromod/ad_sz/data/input_data") - df = 
pd.read_csv( - "/home/neuromod/wrangling-phenotype/outputs/final_qc_pheno.tsv", sep="\t" - ) + conn_p = Path("/home/nclarke/scratch") + output_p = Path("/home/nclarke") + df = pd.read_csv("/home/nclarke/final_qc_pheno.tsv", sep="\t") if not output_p.exists(): output_p.mkdir(parents=True, exist_ok=True) @@ -215,8 +211,8 @@ def create_covariates_df(pheno_df): all_edges = [] all_pheno = [] for dataset in dataset_configs: - edges_df, valid_df = process_datasets(conn_p, df, dataset) - pheno_df = create_pheno_df(valid_df) + edges_df, valid_df, global_signal_dict = process_datasets(conn_p, df, dataset) + pheno_df = create_pheno_df(valid_df, global_signal_dict) # Collect data per dataset all_edges.append(edges_df) @@ -229,7 +225,7 @@ def create_covariates_df(pheno_df): # One-hot encode columns combined_pheno_df = one_hot_encode_column(combined_pheno_df, "sex") combined_pheno_df = one_hot_encode_column(combined_pheno_df, "diagnosis") - combined_pheno_df = one_hot_encode_column_no_prefix(combined_pheno_df, "group") + combined_pheno_df = one_hot_encode_column(combined_pheno_df, "group") # Create covariates_df from pheno_df covariates_df = create_covariates_df(combined_pheno_df) @@ -239,8 +235,8 @@ def create_covariates_df(pheno_df): combined_pheno_df.columns = [x.upper() for x in combined_pheno_df.columns] print("Shape of combined_edges_df = ", combined_edges_df.shape) - print("Shape of processed_covariates_df = ", covariates_df.shape) - print("Shape of pheno_hot_df = ", combined_pheno_df.shape) + print("Shape of covariates_df = ", covariates_df.shape) + print("Shape of combined_pheno_df = ", combined_pheno_df.shape) # Ensure order of data is identical and output combined_edges_df.sort_index().to_csv( From 230f925f9ae6f5dbd765d290d860b22c2587d51a Mon Sep 17 00:00:00 2001 From: clarkenj Date: Tue, 7 May 2024 17:55:49 -0400 Subject: [PATCH 14/14] fixed adni session bug --- scripts/prepare_input_data.py | 40 ++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/scripts/prepare_input_data.py b/scripts/prepare_input_data.py index d81c26e..34f5f76 100644 --- a/scripts/prepare_input_data.py +++ b/scripts/prepare_input_data.py @@ -4,7 +4,37 @@ from pathlib import Path dataset_configs = { - "adni": { + "cobre": { + "connectome_path": "cobre_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": False, + }, + "cimaq": { + "connectome_path": "cimaq_connectome-0.4.1_MIST_afc", + "no_session": False, + "subject_folder": True, + }, + "hcpep": { + "connectome_path": "hcp-ep_connectome-0.4.1", + "no_session": False, + "subject_folder": True, + }, + "ds000030": { + "connectome_path": "ds000030_connectomes-0.4.1", + "no_session": True, + "subject_folder": False, + }, + "oasis3": { + "connectome_path": "oasis3_connectomes-0.4.1", + "no_session": False, + "subject_folder": True, + }, + "srpbs": { + "connectome_path": "srpbs_connectome-0.4.1", + "no_session": False, + "subject_folder": True, + }, + "adni": { "connectome_path": "adni_connectome-0.4.1_MIST_afc", "no_session": False, "subject_folder": True, @@ -202,8 +232,12 @@ def create_covariates_df(pheno_df): if __name__ == "__main__": conn_p = Path("/home/nclarke/scratch") - output_p = Path("/home/nclarke") - df = pd.read_csv("/home/nclarke/final_qc_pheno.tsv", sep="\t") + output_p = Path("/home/nclarke/ad_sz/data/input_data") + df = pd.read_csv("/home/nclarke/ad_sz/data/final_qc_pheno.tsv", sep="\t") + + # Convert adni sessions to strings so can formulate paths correctly + mask = df["dataset"] == "adni" + 
df.loc[mask, "ses"] = pd.to_datetime(df.loc[mask, "ses"]).dt.strftime("%Y%m%d") if not output_p.exists(): output_p.mkdir(parents=True, exist_ok=True)
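To illustrate the session fix above (an aside, not part of the diff): the intent is that ADNI session labels become compact YYYYMMDD strings before they are used to formulate connectome paths, while sessions from the other datasets are left untouched. The toy values below are hypothetical; the real final_qc_pheno.tsv may store ADNI sessions in a different date-like form.

import pandas as pd

toy = pd.DataFrame(
    {
        "dataset": ["adni", "adni", "cobre"],
        "ses": ["2011-06-15", "2012-01-03", "A"],
    }
)
mask = toy["dataset"] == "adni"
toy.loc[mask, "ses"] = pd.to_datetime(toy.loc[mask, "ses"]).dt.strftime("%Y%m%d")
print(toy["ses"].tolist())  # ['20110615', '20120103', 'A']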