Add arguments and filters to the calculate_msN_summary functions

biorack · Apr 15, 2024 · d2770ac
1 parent 3017b9f
commit d2770ac
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 9 deletions.
diff --git a/metatlas/io/feature_tools.py b/metatlas/io/feature_tools.py
@@ -343,9 +343,10 @@ def get_atlas_data_from_file(filename,atlas,desired_key='ms1_pos'):#,bundle=True
         return df.reset_index(drop=True)
 
 
-def calculate_ms1_summary(df):
+def calculate_ms1_summary(df, feature_filter=True):
     """
     Calculate summary properties for features from MS1 data
+    Use feature_filter=False to keep unmatched data
     """
 
     summary = {'label': [],
@@ -355,7 +356,15 @@ def calculate_ms1_summary(df):
                'mz_centroid':[],
                'rt_peak':[]}
 
-    for label_group, label_data in df[df['in_feature']==True].groupby('label'):
+    if 'label' not in df:
+
+        df['label'] = "Untitled"
+
+    if(feature_filter == True):
+
+        df = df[df['in_feature']==True]
+
+    for label_group, label_data in df.groupby('label'):
 
         summary['label'].append(label_group)
         summary['num_datapoints'].append(label_data['i'].count())
@@ -368,9 +377,10 @@ def calculate_ms1_summary(df):
     return pd.DataFrame(summary)
 
 
-def calculate_ms2_summary(df):
+def calculate_ms2_summary(df, feature_filter=True):
     """
     Calculate summary properties for features from MS2 data
+    Use feature_filter=False to keep unmatched data
     """
 
     spectra = {'label':[], 
@@ -379,7 +389,15 @@ def calculate_ms2_summary(df):
                'precursor_mz':[],
                'precursor_peak_height':[]}
 
-    for label_group, label_data in df[df['in_feature']==True].groupby('label'):
+    if 'label' not in df:
+
+        df['label'] = "Untitled"
+
+    if(feature_filter == True):
+
+        df = df[df['in_feature']==True]
+
+    for label_group, label_data in df.groupby('label'):
 
         for rt_group, rt_data in pd.DataFrame(label_data).groupby('rt'):
 
@@ -430,7 +448,7 @@ def get_data(input_data,return_data=False,save_file=True):
         with pd.HDFStore(input_data['outfile'],mode='a',complib='zlib',complevel=9) as f:
             f.put('ms1_data',d,data_columns=True)
 
-    d = calculate_ms1_summary(d).reset_index()
+    d = calculate_ms1_summary(d, feature_filter=True).reset_index()
 
     if d.shape[0]==0: #there isn't any data!
         for c in ['num_datapoints','peak_area','peak_height','mz_centroid','rt_peak']:

diff --git a/tests/unit/test_feature_tools.py b/tests/unit/test_feature_tools.py
@@ -49,31 +49,58 @@ def test_setup_file_slicing_parameters002(mocker, lcmsrun, metatlas_dataset_with
     assert all(isinstance(ele, dict) for ele in slicing_dicts)  ## Assert by type
     #assert slicing_dicts[0].get('lcmsrun') == filenames[0]  ## Assert by a value that's not explicitly passed
 
-def test_calculate_ms1_summary(df_container):
+def test_calculate_ms1_summary001(df_container):  ## Test with feature_filter on
 
     desired_key = 'ms1_pos'    
 
     unfiltered_data = df_container[desired_key]
     unfiltered_data['label'] = np.concatenate([np.array(np.repeat("Compound1", 37)), np.array(np.repeat("Compound2", 37))])  ## Fake two compounds
     unfiltered_data['in_feature'] = True
 
-    summary_df = feature_tools.calculate_ms1_summary(unfiltered_data)
+    summary_df = feature_tools.calculate_ms1_summary(unfiltered_data, feature_filter=True)
 
     assert summary_df.shape == (2,6)  ## MS1 data is split by compound only, resulting in two groups (rows in summary)
 
 
-def test_calculate_ms2_summary(df_container):
+def test_calculate_ms1_summary002(df_container):  ## Test with feature_filter off
+
+    desired_key = 'ms1_pos'    
+
+    unfiltered_data = df_container[desired_key]
+    unfiltered_data['label'] = np.concatenate([np.array(np.repeat("Compound1", 37)), np.array(np.repeat("Compound2", 37))])  ## Fake two compounds
+    unfiltered_data['in_feature'] = True
+
+    summary_df = feature_tools.calculate_ms1_summary(unfiltered_data, feature_filter=False)
+
+    assert summary_df.shape == (2,6)  ## MS1 data is split by compound only, resulting in two groups (rows in summary)
+
+
+def test_calculate_ms2_summary001(df_container):  ## Test with feature_filter on
 
     desired_key = 'ms2_pos'    
 
     unfiltered_data = df_container[desired_key]
     unfiltered_data['label'] = np.concatenate([np.array(np.repeat("Compound1", 4)), np.array(np.repeat("Compound2", 4))])  ## Fake two compounds
     unfiltered_data['in_feature'] = True
 
-    summary_df = feature_tools.calculate_ms2_summary(unfiltered_data)
+    summary_df = feature_tools.calculate_ms2_summary(unfiltered_data, feature_filter=True)
 
     assert summary_df.shape == (4,5)  ## MS2 df is split by compound and by rt, resulting in two groups of two (rows in summary)
 
+
+def test_calculate_ms2_summary002(df_container):  ## Test with feature_filter off
+
+    desired_key = 'ms2_pos'    
+
+    unfiltered_data = df_container[desired_key]
+    unfiltered_data['label'] = np.concatenate([np.array(np.repeat("Compound1", 4)), np.array(np.repeat("Compound2", 4))])  ## Fake two compounds
+    unfiltered_data['in_feature'] = True
+
+    summary_df = feature_tools.calculate_ms2_summary(unfiltered_data, feature_filter=False)
+
+    assert summary_df.shape == (4,5)  ## MS2 df is split by compound and by rt, resulting in two groups of two (rows in summary)
+
+
 def test_map_mzgroups_to_data(metatlas_dataset_with_2_cids, eic):
 
     mz_atlas = metatlas_dataset_with_2_cids.atlas_df['mz'].values[:]