Merge branch 'main' into single_instance_download

ImagingDataCommons · Oct 7, 2024 · 0b20e76 · 0b20e76
2 parents 2f81497 + 1e52cc4
commit 0b20e76
Show file tree

Hide file tree

Showing 3 changed files with 145 additions and 23 deletions.
diff --git a/idc_index/cli.py b/idc_index/cli.py
@@ -83,6 +83,13 @@ def set_log_level(log_level):
     default=None,
     help="DICOM SeriesInstanceUID(s) to filter by.",
 )
+@click.option(
+    "--crdc-series-uuid",
+    type=str,
+    multiple=True,
+    default=None,
+    help="crdc_series_uuid(s) to filter by.",
+)
 @click.option(
     "--quiet",
     type=bool,
@@ -122,6 +129,7 @@ def download_from_selection(
     patient_id,
     study_instance_uid,
     series_instance_uid,
+    crdc_series_uuid,
     quiet,
     show_progress_bar,
     use_s5cmd_sync,
@@ -159,11 +167,17 @@ def download_from_selection(
         if series_instance_uid
         else None
     )
+    crdc_series_uuid = (
+        [uid.strip() for uid in (",".join(crdc_series_uuid)).split(",")]
+        if crdc_series_uuid
+        else None
+    )
     logger_cli.debug("Inputs received from cli download:")
     logger_cli.debug(f"collection_id: {collection_id}")
     logger_cli.debug(f"patient_id: {patient_id}")
     logger_cli.debug(f"study_instance_uid: {study_instance_uid}")
     logger_cli.debug(f"series_instance_uid: {series_instance_uid}")
+    logger_cli.debug(f"crdc_series_uuid: {crdc_series_uuid}")
     logger_cli.debug(f"dry_run: {dry_run}")
     logger_cli.debug(f"quiet: {quiet}")
     logger_cli.debug(f"show_progress_bar: {show_progress_bar}")
@@ -177,6 +191,7 @@ def download_from_selection(
         patientId=patient_id,
         studyInstanceUID=study_instance_uid,
         seriesInstanceUID=series_instance_uid,
+        crdc_series_uuid=crdc_series_uuid,
         quiet=quiet,
         show_progress_bar=show_progress_bar,
         use_s5cmd_sync=use_s5cmd_sync,
@@ -346,9 +361,12 @@ def check_and_download(column_name, item_ids, download_dir, kwarg_name):
         matches_found += check_and_download(
             "SeriesInstanceUID", item_ids, download_dir, "seriesInstanceUID"
         )
+        matches_found += check_and_download(
+            "crdc_series_uuid", item_ids, download_dir, "crdc_series_uuid"
+        )
         if not matches_found:
             logger_cli.error(
-                "None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID."
+                "None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID, crdc_series_uuid."
             )
 
 

diff --git a/idc_index/index.py b/idc_index/index.py
@@ -64,9 +64,21 @@ def __init__(self):
         logger.debug(f"Reading index file v{idc_index_data.__version__}")
         self.index = pd.read_parquet(file_path)
 
+        # initialize crdc_series_uuid for the index
+        # TODO: in the future, after https://github.com/ImagingDataCommons/idc-index/pull/113
+        # is merged (to minimize disruption), it will make more sense to change
+        # idc-index-data to separate bucket from crdc_series_uuid, add support for GCP,
+        # and consequently simplify the code here
+        self.index["crdc_series_uuid"] = (
+            self.index["series_aws_url"].str.split("/").str[3]
+        )
+
         self.previous_versions_index_path = (
             idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH
         )
+        file_path = idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH
+
+        self.previous_versions_index = pd.read_parquet(file_path)
 
         # self.index = self.index.astype(str).replace("nan", "")
         self.index["series_size_MB"] = self.index["series_size_MB"].astype(float)
@@ -142,6 +154,7 @@ def _safe_filter_by_selection(
         studyInstanceUID,
         seriesInstanceUID,
         sopInstanceUID=None,
+        crdc_series_uuid,
     ):
         if collection_id is not None:
             if not isinstance(collection_id, str) and not isinstance(
@@ -167,30 +180,54 @@ def _safe_filter_by_selection(
             ):
                 raise TypeError("sopInstanceUID must be a string or list of strings")
 
-        if collection_id is not None:
-            result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
-        else:
-            result_df = df_index
-
-        if patientId is not None:
-            result_df = IDCClient._filter_by_patient_id(result_df, patientId)
-
-        if studyInstanceUID is not None:
-            result_df = IDCClient._filter_by_dicom_study_uid(
-                result_df, studyInstanceUID
-            )
-
-        if seriesInstanceUID is not None:
-            result_df = IDCClient._filter_by_dicom_series_uid(
-                result_df, seriesInstanceUID
+        if crdc_series_uuid is not None:
+            if not isinstance(crdc_series_uuid, str) and not isinstance(
+                crdc_series_uuid, list
+            ):
+                raise TypeError("crdc_series_uuid must be a string or list of strings")
+
+        # Here we go down-up the hierarchy of filtering, taking into
+        # account the direction of one-to-many relationships
+        #   one crdc_series_uuid can be associated with one and only one SeriesInstanceUID
+        #   one SeriesInstanceUID can be associated with one and only one StudyInstanceUID
+        #   one StudyInstanceUID can be associated with one and only one PatientID
+        #   one PatientID can be associated with one and only one collection_id
+        # because of this we do not need to apply attributes above the given defined
+        # attribute in the hierarchy
+        # The earlier implemented behavior was a relic of the API from a different system
+        # that influenced the API of SlicerIDCIndex, and propagated into idc-index. Unfortunately.
+
+        if crdc_series_uuid is not None:
+            result_df = IDCClient._filter_dataframe_by_id(
+                "crdc_series_uuid", df_index, crdc_series_uuid
             )
+            return result_df
 
         if sopInstanceUID is not None:
             result_df = IDCClient._filter_by_dicom_instance_uid(
                 result_df, sopInstanceUID
             )
+            return result_df
+
+        if seriesInstanceUID is not None:
+            result_df = IDCClient._filter_by_dicom_series_uid(
+                df_index, seriesInstanceUID
+            )
+            return result_df
 
-        return result_df
+        if studyInstanceUID is not None:
+            result_df = IDCClient._filter_by_dicom_study_uid(df_index, studyInstanceUID)
+            return result_df
+
+        if patientId is not None:
+            result_df = IDCClient._filter_by_patient_id(df_index, patientId)
+            return result_df
+
+        if collection_id is not None:
+            result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
+            return result_df
+
+        return None
 
     @staticmethod
     def _filter_by_collection_id(df_index, collection_id):
@@ -671,7 +708,28 @@ def _validate_update_manifest_and_get_download_size(
         manifest_df.columns = ["manifest_cp_cmd"]
 
         # create a copy of the index
-        index_df_copy = self.index
+        index_df_copy = self.index[
+            [
+                "SeriesInstanceUID",
+                "series_aws_url",
+                "series_size_MB",
+                "PatientID",
+                "collection_id",
+                "Modality",
+                "StudyInstanceUID",
+            ]
+        ]
+        previous_versions_index_df_copy = self.previous_versions_index[
+            [
+                "SeriesInstanceUID",
+                "series_aws_url",
+                "series_size_MB",
+                "PatientID",
+                "collection_id",
+                "Modality",
+                "StudyInstanceUID",
+            ]
+        ]
 
         # use default hierarchy
         if dirTemplate is not None:
@@ -766,7 +824,7 @@ def _validate_update_manifest_and_get_download_size(
                 series_size_MB,
                  {hierarchy} AS path,
             FROM
-                '{self.previous_versions_index_path}' pvip
+                previous_versions_index_df_copy pvip
 
             ),
             index_temp AS (
@@ -1447,12 +1505,14 @@ def citations_from_selection(
         Returns:
             List of citations in the requested format.
         """
+
         result_df = self._safe_filter_by_selection(
             self.index,
             collection_id=collection_id,
             patientId=patientId,
             studyInstanceUID=studyInstanceUID,
             seriesInstanceUID=seriesInstanceUID,
+            crdc_series_uuid=None,
         )
 
         citations = []
@@ -1504,6 +1564,7 @@ def download_from_selection(
         studyInstanceUID=None,
         seriesInstanceUID=None,
         sopInstanceUID=None,
+        crdc_series_uuid=None,
         quiet=True,
         show_progress_bar=True,
         use_s5cmd_sync=False,
@@ -1520,6 +1581,7 @@ def download_from_selection(
             studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by
             seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by
             sopInstanceUID: string or list of strings containing the values of DICOM SOPInstanceUID to filter by
+            crdc_series_uuid: string or list of strings containing the values of crdc_series_uuid to filter by
             quiet (bool): If True, suppresses the output of the subprocess. Defaults to True
             show_progress_bar (bool): If True, tracks the progress of download
             use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded
@@ -1534,7 +1596,7 @@ def download_from_selection(
             if hasattr(
                 self, "sm_instance_index"
             ):  # check if instance-level index is installed
-                index_to_be_filtered = self.sm_instance_index
+                download_df = self.sm_instance_index
             else:
                 logger.error(
                     "Instance-level access not possible because instance-level index not installed."
@@ -1543,15 +1605,48 @@ def download_from_selection(
                     "Instance-level access not possible because instance-level index not installed."
                 )
         else:
-            index_to_be_filtered = self.index
+            download_df = self.index
+
+        if crdc_series_uuid is not None:
+            download_df = pd.concat(
+                [
+                    self.index[
+                        [
+                            "PatientID",
+                            "collection_id",
+                            "Modality",
+                            "StudyInstanceUID",
+                            "SeriesInstanceUID",
+                            "crdc_series_uuid",
+                            "series_aws_url",
+                            "series_size_MB",
+                        ]
+                    ],
+                    self.previous_versions_index[
+                        [
+                            "PatientID",
+                            "collection_id",
+                            "Modality",
+                            "StudyInstanceUID",
+                            "SeriesInstanceUID",
+                            "crdc_series_uuid",
+                            "series_aws_url",
+                            "series_size_MB",
+                        ]
+                    ],
+                ],
+            )
+        else:
+            download_df = self.index
 
         result_df = self._safe_filter_by_selection(
-            index_to_be_filtered,
+            download_df,
             collection_id=collection_id,
             patientId=patientId,
             studyInstanceUID=studyInstanceUID,
             seriesInstanceUID=seriesInstanceUID,
             sopInstanceUID=sopInstanceUID,
+            crdc_series_uuid=crdc_series_uuid,
         )
 
         if not sopInstanceUID:

diff --git a/tests/idcindex.py b/tests/idcindex.py
@@ -480,10 +480,19 @@ def test_cli_download(self):
         with runner.isolated_filesystem():
             result = runner.invoke(
                 self.download,
+                # StudyInstanceUID:
                 ["1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462"],
             )
             assert len(os.listdir(Path.cwd())) != 0
 
+        with runner.isolated_filesystem():
+            result = runner.invoke(
+                self.download,
+                # crdc_series_uuid:
+                ["e5c5c71d-62c4-4c50-a8a9-b6799c7f8dea"],
+            )
+            assert len(os.listdir(Path.cwd())) != 0
+
     def test_prior_version_manifest(self):
         # Define the values for each optional parameter
         quiet_values = [True, False]