Skip to content

Commit

Permalink
Merge branch 'main' into single_instance_download
Browse files Browse the repository at this point in the history
  • Loading branch information
fedorov authored Oct 7, 2024
2 parents 2f81497 + 1e52cc4 commit 0b20e76
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 23 deletions.
20 changes: 19 additions & 1 deletion idc_index/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,13 @@ def set_log_level(log_level):
default=None,
help="DICOM SeriesInstanceUID(s) to filter by.",
)
@click.option(
"--crdc-series-uuid",
type=str,
multiple=True,
default=None,
help="crdc_series_uuid(s) to filter by.",
)
@click.option(
"--quiet",
type=bool,
Expand Down Expand Up @@ -122,6 +129,7 @@ def download_from_selection(
patient_id,
study_instance_uid,
series_instance_uid,
crdc_series_uuid,
quiet,
show_progress_bar,
use_s5cmd_sync,
Expand Down Expand Up @@ -159,11 +167,17 @@ def download_from_selection(
if series_instance_uid
else None
)
crdc_series_uuid = (
[uid.strip() for uid in (",".join(crdc_series_uuid)).split(",")]
if crdc_series_uuid
else None
)
logger_cli.debug("Inputs received from cli download:")
logger_cli.debug(f"collection_id: {collection_id}")
logger_cli.debug(f"patient_id: {patient_id}")
logger_cli.debug(f"study_instance_uid: {study_instance_uid}")
logger_cli.debug(f"series_instance_uid: {series_instance_uid}")
logger_cli.debug(f"crdc_series_uuid: {crdc_series_uuid}")
logger_cli.debug(f"dry_run: {dry_run}")
logger_cli.debug(f"quiet: {quiet}")
logger_cli.debug(f"show_progress_bar: {show_progress_bar}")
Expand All @@ -177,6 +191,7 @@ def download_from_selection(
patientId=patient_id,
studyInstanceUID=study_instance_uid,
seriesInstanceUID=series_instance_uid,
crdc_series_uuid=crdc_series_uuid,
quiet=quiet,
show_progress_bar=show_progress_bar,
use_s5cmd_sync=use_s5cmd_sync,
Expand Down Expand Up @@ -346,9 +361,12 @@ def check_and_download(column_name, item_ids, download_dir, kwarg_name):
matches_found += check_and_download(
"SeriesInstanceUID", item_ids, download_dir, "seriesInstanceUID"
)
matches_found += check_and_download(
"crdc_series_uuid", item_ids, download_dir, "crdc_series_uuid"
)
if not matches_found:
logger_cli.error(
"None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID."
"None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID, crdc_series_uuid."
)


Expand Down
139 changes: 117 additions & 22 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,21 @@ def __init__(self):
logger.debug(f"Reading index file v{idc_index_data.__version__}")
self.index = pd.read_parquet(file_path)

# initialize crdc_series_uuid for the index
# TODO: in the future, after https://github.com/ImagingDataCommons/idc-index/pull/113
# is merged (to minimize disruption), it will make more sense to change
# idc-index-data to separate bucket from crdc_series_uuid, add support for GCP,
# and consequently simplify the code here
self.index["crdc_series_uuid"] = (
self.index["series_aws_url"].str.split("/").str[3]
)

self.previous_versions_index_path = (
idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH
)
file_path = idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH

self.previous_versions_index = pd.read_parquet(file_path)

# self.index = self.index.astype(str).replace("nan", "")
self.index["series_size_MB"] = self.index["series_size_MB"].astype(float)
Expand Down Expand Up @@ -142,6 +154,7 @@ def _safe_filter_by_selection(
studyInstanceUID,
seriesInstanceUID,
sopInstanceUID=None,
crdc_series_uuid,
):
if collection_id is not None:
if not isinstance(collection_id, str) and not isinstance(
Expand All @@ -167,30 +180,54 @@ def _safe_filter_by_selection(
):
raise TypeError("sopInstanceUID must be a string or list of strings")

if collection_id is not None:
result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
else:
result_df = df_index

if patientId is not None:
result_df = IDCClient._filter_by_patient_id(result_df, patientId)

if studyInstanceUID is not None:
result_df = IDCClient._filter_by_dicom_study_uid(
result_df, studyInstanceUID
)

if seriesInstanceUID is not None:
result_df = IDCClient._filter_by_dicom_series_uid(
result_df, seriesInstanceUID
if crdc_series_uuid is not None:
if not isinstance(crdc_series_uuid, str) and not isinstance(
crdc_series_uuid, list
):
raise TypeError("crdc_series_uuid must be a string or list of strings")

# Here we go down-up the hierarchy of filtering, taking into
# account the direction of one-to-many relationships
# one crdc_series_uuid can be associated with one and only one SeriesInstanceUID
# one SeriesInstanceUID can be associated with one and only one StudyInstanceUID
# one StudyInstanceUID can be associated with one and only one PatientID
# one PatientID can be associated with one and only one collection_id
# because of this we do not need to apply attributes above the given defined
# attribute in the hierarchy
# The earlier implemented behavior was a relic of the API from a different system
# that influenced the API of SlicerIDCIndex, and propagated into idc-index. Unfortunately.

if crdc_series_uuid is not None:
result_df = IDCClient._filter_dataframe_by_id(
"crdc_series_uuid", df_index, crdc_series_uuid
)
return result_df

if sopInstanceUID is not None:
result_df = IDCClient._filter_by_dicom_instance_uid(
result_df, sopInstanceUID
)
return result_df

if seriesInstanceUID is not None:
result_df = IDCClient._filter_by_dicom_series_uid(
df_index, seriesInstanceUID
)
return result_df

return result_df
if studyInstanceUID is not None:
result_df = IDCClient._filter_by_dicom_study_uid(df_index, studyInstanceUID)
return result_df

if patientId is not None:
result_df = IDCClient._filter_by_patient_id(df_index, patientId)
return result_df

if collection_id is not None:
result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
return result_df

return None

@staticmethod
def _filter_by_collection_id(df_index, collection_id):
Expand Down Expand Up @@ -671,7 +708,28 @@ def _validate_update_manifest_and_get_download_size(
manifest_df.columns = ["manifest_cp_cmd"]

# create a copy of the index
index_df_copy = self.index
index_df_copy = self.index[
[
"SeriesInstanceUID",
"series_aws_url",
"series_size_MB",
"PatientID",
"collection_id",
"Modality",
"StudyInstanceUID",
]
]
previous_versions_index_df_copy = self.previous_versions_index[
[
"SeriesInstanceUID",
"series_aws_url",
"series_size_MB",
"PatientID",
"collection_id",
"Modality",
"StudyInstanceUID",
]
]

# use default hierarchy
if dirTemplate is not None:
Expand Down Expand Up @@ -766,7 +824,7 @@ def _validate_update_manifest_and_get_download_size(
series_size_MB,
{hierarchy} AS path,
FROM
'{self.previous_versions_index_path}' pvip
previous_versions_index_df_copy pvip
),
index_temp AS (
Expand Down Expand Up @@ -1447,12 +1505,14 @@ def citations_from_selection(
Returns:
List of citations in the requested format.
"""

result_df = self._safe_filter_by_selection(
self.index,
collection_id=collection_id,
patientId=patientId,
studyInstanceUID=studyInstanceUID,
seriesInstanceUID=seriesInstanceUID,
crdc_series_uuid=None,
)

citations = []
Expand Down Expand Up @@ -1504,6 +1564,7 @@ def download_from_selection(
studyInstanceUID=None,
seriesInstanceUID=None,
sopInstanceUID=None,
crdc_series_uuid=None,
quiet=True,
show_progress_bar=True,
use_s5cmd_sync=False,
Expand All @@ -1520,6 +1581,7 @@ def download_from_selection(
studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by
seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by
sopInstanceUID: string or list of strings containing the values of DICOM SOPInstanceUID to filter by
crdc_series_uuid: string or list of strings containing the values of crdc_series_uuid to filter by
quiet (bool): If True, suppresses the output of the subprocess. Defaults to True
show_progress_bar (bool): If True, tracks the progress of download
use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded
Expand All @@ -1534,7 +1596,7 @@ def download_from_selection(
if hasattr(
self, "sm_instance_index"
): # check if instance-level index is installed
index_to_be_filtered = self.sm_instance_index
download_df = self.sm_instance_index
else:
logger.error(
"Instance-level access not possible because instance-level index not installed."
Expand All @@ -1543,15 +1605,48 @@ def download_from_selection(
"Instance-level access not possible because instance-level index not installed."
)
else:
index_to_be_filtered = self.index
download_df = self.index

if crdc_series_uuid is not None:
download_df = pd.concat(
[
self.index[
[
"PatientID",
"collection_id",
"Modality",
"StudyInstanceUID",
"SeriesInstanceUID",
"crdc_series_uuid",
"series_aws_url",
"series_size_MB",
]
],
self.previous_versions_index[
[
"PatientID",
"collection_id",
"Modality",
"StudyInstanceUID",
"SeriesInstanceUID",
"crdc_series_uuid",
"series_aws_url",
"series_size_MB",
]
],
],
)
else:
download_df = self.index

result_df = self._safe_filter_by_selection(
index_to_be_filtered,
download_df,
collection_id=collection_id,
patientId=patientId,
studyInstanceUID=studyInstanceUID,
seriesInstanceUID=seriesInstanceUID,
sopInstanceUID=sopInstanceUID,
crdc_series_uuid=crdc_series_uuid,
)

if not sopInstanceUID:
Expand Down
9 changes: 9 additions & 0 deletions tests/idcindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,10 +480,19 @@ def test_cli_download(self):
with runner.isolated_filesystem():
result = runner.invoke(
self.download,
# StudyInstanceUID:
["1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462"],
)
assert len(os.listdir(Path.cwd())) != 0

with runner.isolated_filesystem():
result = runner.invoke(
self.download,
# crdc_series_uuid:
["e5c5c71d-62c4-4c50-a8a9-b6799c7f8dea"],
)
assert len(os.listdir(Path.cwd())) != 0

def test_prior_version_manifest(self):
# Define the values for each optional parameter
quiet_values = [True, False]
Expand Down

0 comments on commit 0b20e76

Please sign in to comment.