Skip to content

Commit

Permalink
Add arguments and filters to the calculate_msN_summary functions
Browse files Browse the repository at this point in the history
  • Loading branch information
bkieft-usa committed Apr 15, 2024
1 parent 3017b9f commit d2770ac
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 9 deletions.
28 changes: 23 additions & 5 deletions metatlas/io/feature_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,9 +343,10 @@ def get_atlas_data_from_file(filename,atlas,desired_key='ms1_pos'):#,bundle=True
return df.reset_index(drop=True)


def calculate_ms1_summary(df):
def calculate_ms1_summary(df, feature_filter=True):
"""
Calculate summary properties for features from MS1 data
Use feature_filter=False to keep unmatched data
"""

summary = {'label': [],
Expand All @@ -355,7 +356,15 @@ def calculate_ms1_summary(df):
'mz_centroid':[],
'rt_peak':[]}

for label_group, label_data in df[df['in_feature']==True].groupby('label'):
if 'label' not in df:

df['label'] = "Untitled"

if(feature_filter == True):

df = df[df['in_feature']==True]

for label_group, label_data in df.groupby('label'):

summary['label'].append(label_group)
summary['num_datapoints'].append(label_data['i'].count())
Expand All @@ -368,9 +377,10 @@ def calculate_ms1_summary(df):
return pd.DataFrame(summary)


def calculate_ms2_summary(df):
def calculate_ms2_summary(df, feature_filter=True):
"""
Calculate summary properties for features from MS2 data
Use feature_filter=False to keep unmatched data
"""

spectra = {'label':[],
Expand All @@ -379,7 +389,15 @@ def calculate_ms2_summary(df):
'precursor_mz':[],
'precursor_peak_height':[]}

for label_group, label_data in df[df['in_feature']==True].groupby('label'):
if 'label' not in df:

df['label'] = "Untitled"

if(feature_filter == True):

df = df[df['in_feature']==True]

for label_group, label_data in df.groupby('label'):

for rt_group, rt_data in pd.DataFrame(label_data).groupby('rt'):

Expand Down Expand Up @@ -430,7 +448,7 @@ def get_data(input_data,return_data=False,save_file=True):
with pd.HDFStore(input_data['outfile'],mode='a',complib='zlib',complevel=9) as f:
f.put('ms1_data',d,data_columns=True)

d = calculate_ms1_summary(d).reset_index()
d = calculate_ms1_summary(d, feature_filter=True).reset_index()

if d.shape[0]==0: #there isn't any data!
for c in ['num_datapoints','peak_area','peak_height','mz_centroid','rt_peak']:
Expand Down
35 changes: 31 additions & 4 deletions tests/unit/test_feature_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,31 +49,58 @@ def test_setup_file_slicing_parameters002(mocker, lcmsrun, metatlas_dataset_with
assert all(isinstance(ele, dict) for ele in slicing_dicts) ## Assert by type
#assert slicing_dicts[0].get('lcmsrun') == filenames[0] ## Assert by a value that's not explicitly passed

def test_calculate_ms1_summary(df_container):
def test_calculate_ms1_summary001(df_container): ## Test with feature_filter on

desired_key = 'ms1_pos'

unfiltered_data = df_container[desired_key]
unfiltered_data['label'] = np.concatenate([np.array(np.repeat("Compound1", 37)), np.array(np.repeat("Compound2", 37))]) ## Fake two compounds
unfiltered_data['in_feature'] = True

summary_df = feature_tools.calculate_ms1_summary(unfiltered_data)
summary_df = feature_tools.calculate_ms1_summary(unfiltered_data, feature_filter=True)

assert summary_df.shape == (2,6) ## MS1 data is split by compound only, resulting in two groups (rows in summary)


def test_calculate_ms2_summary(df_container):
def test_calculate_ms1_summary002(df_container): ## Test with feature_filter off
    """
    Check that calculate_ms1_summary keeps unmatched rows when feature_filter=False.

    Every row is flagged in_feature=False, so the expected two-row summary can only
    be produced if the in_feature filter is really skipped. (With all rows flagged
    True, as before, this test could not tell filtered from unfiltered behavior.)
    """

    desired_key = 'ms1_pos'

    # Copy so the column assignments below do not write into the fixture's own frame
    unfiltered_data = df_container[desired_key].copy()

    # Fake two compounds; the fixture frame is expected to hold 2 x 37 rows
    unfiltered_data['label'] = np.repeat(["Compound1", "Compound2"], 37)

    # Mark all rows as unmatched so the result depends on the filter being off
    unfiltered_data['in_feature'] = False

    summary_df = feature_tools.calculate_ms1_summary(unfiltered_data, feature_filter=False)

    assert summary_df.shape == (2,6) ## MS1 data is split by compound only, resulting in two groups (rows in summary)


def test_calculate_ms2_summary001(df_container): ## Test with feature_filter on

desired_key = 'ms2_pos'

unfiltered_data = df_container[desired_key]
unfiltered_data['label'] = np.concatenate([np.array(np.repeat("Compound1", 4)), np.array(np.repeat("Compound2", 4))]) ## Fake two compounds
unfiltered_data['in_feature'] = True

summary_df = feature_tools.calculate_ms2_summary(unfiltered_data)
summary_df = feature_tools.calculate_ms2_summary(unfiltered_data, feature_filter=True)

assert summary_df.shape == (4,5) ## MS2 df is split by compound and by rt, resulting in two groups of two (rows in summary)


def test_calculate_ms2_summary002(df_container): ## Test with feature_filter off
    """
    Check that calculate_ms2_summary keeps unmatched rows when feature_filter=False.

    Every row is flagged in_feature=False, so the expected four-row summary can only
    be produced if the in_feature filter is really skipped. (With all rows flagged
    True, as before, this test could not tell filtered from unfiltered behavior.)
    """

    desired_key = 'ms2_pos'

    # Copy so the column assignments below do not write into the fixture's own frame
    unfiltered_data = df_container[desired_key].copy()

    # Fake two compounds; the fixture frame is expected to hold 2 x 4 rows
    unfiltered_data['label'] = np.repeat(["Compound1", "Compound2"], 4)

    # Mark all rows as unmatched so the result depends on the filter being off
    unfiltered_data['in_feature'] = False

    summary_df = feature_tools.calculate_ms2_summary(unfiltered_data, feature_filter=False)

    assert summary_df.shape == (4,5) ## MS2 df is split by compound and by rt, resulting in two groups of two (rows in summary)


def test_map_mzgroups_to_data(metatlas_dataset_with_2_cids, eic):

mz_atlas = metatlas_dataset_with_2_cids.atlas_df['mz'].values[:]
Expand Down

0 comments on commit d2770ac

Please sign in to comment.