Skip to content

Commit

Permalink
feat: ability to define the location of raw data on disk or have the …
Browse files Browse the repository at this point in the history
…untargeted pipeline infer it
  • Loading branch information
bkieft-usa committed Oct 11, 2024
1 parent 551e70b commit 77054a7
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 51 deletions.
3 changes: 1 addition & 2 deletions metatlas/untargeted/run_untargeted_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ def main():
skip_fbmn_download=step_bools[5])

##### Step 7/7: Zipping up and (optionally) uploading output folders to gdrive
mzm.zip_and_upload_untargeted_results(download_folder=args.download_dir,output_dir=args.output_dir, \
raw_data_subdir=args.raw_data_subdir, upload=args.gdrive_upload,overwrite_zip=args.overwrite_zip, \
mzm.zip_and_upload_untargeted_results(download_folder=args.download_dir,output_dir=args.output_dir, upload=args.gdrive_upload,overwrite_zip=args.overwrite_zip, \
overwrite_drive=args.overwrite_drive, min_features_admissible=args.min_features, skip_zip_upload=step_bools[6], \
add_documentation=args.add_gnps2_documentation,doc_name=args.gnps2_doc_name,direct_input=args.direct_input, \
abridged_filenames=args.abridged_filenames)
Expand Down
102 changes: 53 additions & 49 deletions metatlas/untargeted/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@ def zip_and_upload_untargeted_results(
overwrite_zip: bool,
overwrite_drive: bool,
direct_input: Optional[str] = None,
raw_data_subdir: Optional[str] = None,
min_features_admissible: int = 0
) -> None:
"""
Expand Down Expand Up @@ -233,7 +232,7 @@ def zip_and_upload_untargeted_results(
if os.path.exists(pos_mzmine_job_id_filename):
os.remove(pos_mzmine_job_id_filename)

zip_untargeted_results(target_dirs=[neg_directory,pos_directory], raw_data_subdir=raw_data_subdir, abridged_filenames=abridged_filenames, \
zip_untargeted_results(target_dirs=[neg_directory,pos_directory], abridged_filenames=abridged_filenames, \
add_documentation=add_documentation, download_folder=download_folder, doc_name=doc_name, output_zip_archive=output_zip_archive)
zip_count += 1

Expand All @@ -258,7 +257,7 @@ def zip_and_upload_untargeted_results(
if os.path.exists(pos_mzmine_job_id_filename):
os.remove(pos_mzmine_job_id_filename)

zip_untargeted_results(target_dirs=[pos_directory], raw_data_subdir=raw_data_subdir, abridged_filenames=abridged_filenames, \
zip_untargeted_results(target_dirs=[pos_directory], abridged_filenames=abridged_filenames, \
add_documentation=add_documentation, download_folder=download_folder, doc_name=doc_name, output_zip_archive=output_zip_archive)
zip_count += 1

Expand All @@ -283,7 +282,7 @@ def zip_and_upload_untargeted_results(
if os.path.exists(neg_mzmine_job_id_filename):
os.remove(neg_mzmine_job_id_filename)

zip_untargeted_results(target_dirs=[neg_directory], raw_data_subdir=raw_data_subdir, abridged_filenames=abridged_filenames, \
zip_untargeted_results(target_dirs=[neg_directory], abridged_filenames=abridged_filenames, \
add_documentation=add_documentation, download_folder=download_folder, doc_name=doc_name, output_zip_archive=output_zip_archive)
zip_count += 1

Expand All @@ -306,7 +305,6 @@ def zip_untargeted_results(
doc_name: str,
add_documentation: bool,
target_dirs: Optional[List[str]] = None,
raw_data_subdir: Optional[str] = None,
download_folder: Optional[str] = None,
output_zip_archive: Optional[str] = None
) -> None:
Expand Down Expand Up @@ -344,7 +342,7 @@ def zip_untargeted_results(
logging.info(tab_print("New untargeted results in %s zipped"%(target_dirs), 1))

if abridged_filenames is True:
rename_untargeted_files_in_archive(output_zip_archive=output_zip_archive, raw_data_subdir=raw_data_subdir)
rename_untargeted_files_in_archive(output_zip_archive=output_zip_archive)

# Change permissions of resulting zip
try:
Expand All @@ -354,33 +352,17 @@ def zip_untargeted_results(


def rename_untargeted_files_in_archive(
output_zip_archive: Optional[str] = None,
raw_data_subdir: Optional[str] = None
output_zip_archive: Optional[str] = None
) -> None:
if output_zip_archive is None:
logging.warning(tab_print("Warning! No output zip archive provided for renaming untargeted results files, but rename function is set to True.", 1))
return

project_name = os.path.splitext(os.path.basename(output_zip_archive))[0]

if raw_data_subdir is None: # Use the department name from the project name
_, validate_department, _ = vfn.field_exists(PurePath(project_name), field_num=1)
try:
department = validate_department.lower()
if department =='eb':
department = 'egsb'
if not department in ['jgi','egsb']:
logging.warning(tab_print("Warning! %s does not have a valid department name in the second field. Use --raw_data_subdir to provide a custom subdirectory for the raw data."%(project_name), 2))
return
raw_data_subdir = department.upper()
except:
logging.warning(tab_print("Warning! %s does not have a valid department name in the second field. Use --raw_data_subdir to provide a custom subdirectory for the raw data."%(project_name), 2))
return
else:
department = raw_data_subdir.upper()

if len(project_name.split('_')) >= 9:
date = project_name.split('_')[0]
department = project_name.split('_')[1]
submitter = project_name.split('_')[2]
pid = project_name.split('_')[3]
chromatography = project_name.split('_')[7]
Expand All @@ -392,10 +374,10 @@ def rename_untargeted_files_in_archive(
if not any(substring.lower() in chromatography.lower() for substring in ['C18', 'LIPID', 'HILIC']) or \
not date.isdigit() or len(date) != 8:
logging.warning(tab_print("Warning! Project name %s does not follow the standard naming convention. Skipping renaming files before zip..."%(project_name), 1))
logging.warning(tab_print("Here is what could be extracted: Date: %s, Department: %s, Submitter: %s, PID: %s, Chromatography: %s"%(date, raw_data_subdir, submitter, pid, chromatography), 2))
logging.warning(tab_print("Here is what could be extracted: Date: %s, Department: %s, Submitter: %s, PID: %s, Chromatography: %s"%(date, department, submitter, pid, chromatography), 2))
return
else:
new_project_name = f"{date}_{raw_data_subdir}_{submitter}_{pid}_{chromatography}"
new_project_name = f"{date}_{department}_{submitter}_{pid}_{chromatography}"

# Unzip the archive and rename all files
temp_dir = f"/tmp/{project_name}"
Expand Down Expand Up @@ -1021,7 +1003,21 @@ def mirror_raw_data_to_gnps2(
logging.info(f"Mirroring raw data (mzML files) for {project} to GNPS2...")

local_directory = os.path.join(raw_data_dir, raw_data_subdir, project)
remote_directory = f"/raw_data/{raw_data_subdir}/{project}"
if not os.path.exists(local_directory): # Try other possible subdirectories where raw data might be found on perlmutter
local_directory = os.path.join(raw_data_dir, "jgi", project)
if not os.path.exists(local_directory):
local_directory = os.path.join(raw_data_dir, "egsb", project)
if not os.path.exists(local_directory):
local_directory = os.path.join(raw_data_dir, "akuftin", project)
if not os.path.exists(local_directory):
local_directory = os.path.join(raw_data_dir, "agolini", project)
if not os.path.exists(local_directory):
local_directory = os.path.join(raw_data_dir, "kblouie", project)
if not os.path.exists(local_directory):
logging.error(f"Local directory for {project} raw data not found after trying several possible alternatives. Exiting.")
return
remote_subdir = local_directory.parent.name # Use the same subdir as on perlmutter instead of inferring from the project name directly
remote_directory = f"/raw_data/{remote_subdir}/{project}"
polarity_directory = f"{remote_directory}/{polarity}"
remote_host = "sftp.gnps2.org"
remote_port = 443
Expand Down Expand Up @@ -1221,16 +1217,12 @@ def submit_fbmn_jobs(
if raw_data_subdir is None:
_, validate_department, _ = vfn.field_exists(PurePath(project_name), field_num=1)
try:
department = validate_department.lower()
if department =='eb':
department = 'egsb'
if not department in ['jgi','egsb']:
logging.warning(tab_print("Warning! %s does not have a valid department name in the second field. Use --raw_data_subdir to provide a custom subdirectory for the raw data."%(project_name), 2))
continue
raw_data_subdir = department
raw_data_subdir = validate_department.lower()
if raw_data_subdir == 'eb':
raw_data_subdir = 'egsb'
except:
logging.warning(tab_print("Warning! %s does not have a valid department name in the second field. Use --raw_data_subdir to provide a custom subdirectory for the raw data."%(project_name), 2))
continue
logging.warning(tab_print("Warning! Could not infer department/raw data location for %s. Defaulting to 'jgi'. Use --raw_data_subdir to provide a custom subdirectory for the raw data."%(project_name), 2))
raw_data_subdir = "jgi" # Default to JGI

# Get mzmine results files and raw data to GNPS2 before starting FBMN job
logging.info(tab_print("Ensuring MZmine results are at GNPS2 before submitting FBMN job...", 2))
Expand Down Expand Up @@ -1754,22 +1746,34 @@ def write_metadata_per_new_project(
# Filter the DataFrame for the current parent_dir
df_filtered = df[df['parent_dir'] == parent_dir]

# Finish raw_data path
if raw_data_subdir is None:
if raw_data_subdir is None: # This means we'll have to try to infer the locations of the mzML files from the project name
_, validate_department, _ = vfn.field_exists(PurePath(parent_dir), field_num=1)
try:
department = validate_department.lower()
if department =='eb':
department = 'egsb'
if not department in ['jgi','egsb']:
logging.warning(tab_print("Warning! %s does not have a valid department name in the second field. Use --raw_data_subdir to provide a custom subdirectory for the raw data."%(parent_dir), 2))
continue
raw_data_subdir = department
except:
logging.warning(tab_print("Warning! %s does not have a valid department name in the second field. Use --raw_data_subdir to provide a custom subdirectory for the raw data."%(parent_dir), 2))
raw_data_subdir = validate_department.lower()
if raw_data_subdir == 'eb':
raw_data_subdir = 'egsb'
check_project_raw_dir = os.path.join(raw_data_dir,raw_data_subdir,parent_dir)
if os.path.exists(check_project_raw_dir):
full_mzml_path = check_project_raw_dir
else:
possible_subdirs = ["jgi", "egsb", "akuftin", "agolini", "kblouie"]
for subdir in possible_subdirs:
check_project_raw_dir = os.path.join(raw_data_dir, subdir, parent_dir)
if os.path.exists(check_project_raw_dir):
full_mzml_path = check_project_raw_dir
break
else:
logging.error(f"Raw data directory for {parent_dir} could not be found after trying several possible alternatives. Not creating metadata.")
continue
except Exception as e:
logging.error(tab_print(f"Error when trying to locate mzML files on disk: {e}", 2))
continue
full_mzml_path = os.path.join(raw_data_dir,raw_data_subdir,parent_dir)

else:
full_mzml_path = os.path.join(raw_data_dir,raw_data_subdir,parent_dir)
if not os.path.exists(full_mzml_path):
logging.error(tab_print(f"Raw data directory for {parent_dir} could not be found with user input --raw_data_subdir flag of {raw_data_subdir}. Not creating metadata.", 2))
continue

# Initialize info dict
project_dict = {'parent_dir': parent_dir}
project_dict['positive'] = {}
Expand Down

0 comments on commit 77054a7

Please sign in to comment.