From 0116ca319e27da0dcd47a0073e5b521b1bebc3c0 Mon Sep 17 00:00:00 2001 From: Sarah Wessel Date: Thu, 17 Oct 2024 13:22:18 -0400 Subject: [PATCH 1/7] Define dimension type crud, clarify terminology --- breadbox/breadbox/crud/dimension_type.py | 34 ++++++++++++++++++++++++ breadbox/breadbox/models/dataset.py | 4 +++ 2 files changed, 38 insertions(+) create mode 100644 breadbox/breadbox/crud/dimension_type.py diff --git a/breadbox/breadbox/crud/dimension_type.py b/breadbox/breadbox/crud/dimension_type.py new file mode 100644 index 00000000..5f2d4d6e --- /dev/null +++ b/breadbox/breadbox/crud/dimension_type.py @@ -0,0 +1,34 @@ +from sqlalchemy import and_ + +from breadbox.db.session import SessionWithUser +from breadbox.models.dataset import ( + DimensionType, + TabularColumn, + TabularCell, +) + + +def get_dimension_type(db: SessionWithUser, name: str) -> DimensionType: + """Get a dimension type object by name""" + return db.query(DimensionType).filter(DimensionType.name == name).one() + + +def get_dimension_type_labels_by_id( + db: SessionWithUser, dimension_type: DimensionType +) -> dict[str, str]: + """ + For a given dimension, get all IDs and labels that exist in the metadata. + """ + label_filter_statements = [ + TabularColumn.dataset_id == dimension_type.dataset_id, + TabularColumn.given_id == "label", + ] + + labels_by_id = ( + db.query(TabularCell) + .join(TabularColumn) + .filter(and_(True, *label_filter_statements)) + .with_entities(TabularCell.dimension_given_id, TabularCell.value) + .all() + ) + return labels_by_id diff --git a/breadbox/breadbox/models/dataset.py b/breadbox/breadbox/models/dataset.py index d116ae6c..49b163d5 100644 --- a/breadbox/breadbox/models/dataset.py +++ b/breadbox/breadbox/models/dataset.py @@ -223,6 +223,8 @@ class Dimension(Base, UUIDMixin, GroupMixin): class DatasetFeature(Dimension): + """A matrix dataset feature dimension""" + # @declared_attr # def index(cls) -> Column[Integer]: # "0-indexed column number in hdf5 file" @@ -232,6 +234,8 @@ class DatasetFeature(Dimension): class DatasetSample(Dimension): + """A matrix dataset sample dimension""" + # @declared_attr # def index(cls) -> Column[Integer]: # "0-indexed column number in hdf5 file" From 1186035a9ceca9e6f37b26f1ba37a9909aa70e60 Mon Sep 17 00:00:00 2001 From: Sarah Wessel Date: Thu, 17 Oct 2024 13:29:11 -0400 Subject: [PATCH 2/7] Improve typing, use new helper --- breadbox/breadbox/crud/dataset.py | 38 +++++++++++-------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/breadbox/breadbox/crud/dataset.py b/breadbox/breadbox/crud/dataset.py index 39cb76f3..2fb64bf2 100644 --- a/breadbox/breadbox/crud/dataset.py +++ b/breadbox/breadbox/crud/dataset.py @@ -57,6 +57,7 @@ get_slice, delete_data_files, ) +from breadbox.crud.dimension_type import get_dimension_type from .metadata import cast_tabular_cell_value_type from .dataset_reference import add_id_mapping import typing @@ -995,7 +996,7 @@ def get_dataset_samples(db: SessionWithUser, dataset: Dataset, user: str): def get_dataset_feature_labels_by_id( - db: SessionWithUser, user: str, dataset: Dataset, + db: SessionWithUser, user: str, dataset: MatrixDataset, ) -> dict[str, str]: """ Try loading feature labels from metadata. @@ -1013,7 +1014,7 @@ def get_dataset_feature_labels_by_id( def get_dataset_sample_labels_by_id( - db: SessionWithUser, user: str, dataset: Dataset, + db: SessionWithUser, user: str, dataset: MatrixDataset, ) -> dict[str, str]: """ Try loading sample labels from metadata. @@ -1034,7 +1035,7 @@ def get_dataset_sample_labels_by_id( # TODO: This can probably be merged. def get_dataset_feature_annotations( - db: SessionWithUser, user: str, dataset: Dataset, metadata_col_name: str, + db: SessionWithUser, user: str, dataset: MatrixDataset, metadata_col_name: str, ) -> dict[str, Any]: """ For the given dataset, load metadata of the specified type, keyed by feature id. @@ -1047,16 +1048,9 @@ def get_dataset_feature_annotations( # Try to find the associated metadata dataset feature_metadata_dataset_id = None - if dataset.format == "matrix_dataset": - if dataset.feature_type is not None: - feature_type = ( - db.query(DimensionType) - .filter(DimensionType.name == dataset.feature_type_name) - .one() - ) - feature_metadata_dataset_id = feature_type.dataset_id - else: - feature_metadata_dataset_id = dataset.id + if dataset.feature_type is not None: + feature_type = get_dimension_type(db, dataset.feature_type_name) + feature_metadata_dataset_id = feature_type.dataset_id data_dataset_feature = aliased(DatasetFeature) @@ -1082,7 +1076,7 @@ def get_dataset_feature_annotations( def get_dataset_sample_annotations( - db: SessionWithUser, user: str, dataset: Dataset, metadata_col_name: str + db: SessionWithUser, user: str, dataset: MatrixDataset, metadata_col_name: str ) -> dict[str, Any]: """ For the given dataset, load metadata of the specified type, keyed by sample id. @@ -1095,16 +1089,9 @@ def get_dataset_sample_annotations( # Try to find the associated metadata dataset sample_metadata_dataset_id = None - if dataset.format == "matrix_dataset": - if dataset.sample_type is not None: - sample_type = ( - db.query(DimensionType) - .filter(DimensionType.name == dataset.sample_type_name) - .one() - ) - sample_metadata_dataset_id = sample_type.dataset_id - else: - sample_metadata_dataset_id = dataset.id + if dataset.sample_type is not None: + sample_type = get_dimension_type(db, dataset.sample_type_name) + sample_metadata_dataset_id = sample_type.dataset_id data_dataset_sample = aliased(DatasetSample) @@ -1513,6 +1500,7 @@ def get_subsetted_tabular_dataset_df( ) if tabular_dimensions_info.identifier == FeatureSampleIdentifier.label: + # TODO: move this duplicate query into a helper function, take optional index filter # Get the corresponding dimension ids for the dimension labels from the dataset's dimension type and use the dimension ids to filter values by dimension_type: DimensionType = db.query(DimensionType).filter( DimensionType.name == dataset.index_type_name @@ -1665,7 +1653,7 @@ def get_missing_tabular_columns_and_indices( def get_subsetted_matrix_dataset_df( db: SessionWithUser, user: str, - dataset: Dataset, + dataset: MatrixDataset, dimensions_info: MatrixDimensionsInfo, filestore_location, strict: bool = False, # False default for backwards compatibility From aac3b86036b93e3740388c933e12f3962275fced Mon Sep 17 00:00:00 2001 From: Sarah Wessel Date: Thu, 17 Oct 2024 13:53:17 -0400 Subject: [PATCH 3/7] Draft service layer --- breadbox/breadbox/crud/dataset.py | 157 +--------------------- breadbox/breadbox/service/__init__.py | 0 breadbox/breadbox/service/data_loading.py | 98 ++++++++++++++ breadbox/breadbox/service/labels.py | 93 +++++++++++++ 4 files changed, 192 insertions(+), 156 deletions(-) create mode 100644 breadbox/breadbox/service/__init__.py create mode 100644 breadbox/breadbox/service/data_loading.py create mode 100644 breadbox/breadbox/service/labels.py diff --git a/breadbox/breadbox/crud/dataset.py b/breadbox/breadbox/crud/dataset.py index 2fb64bf2..76432a4d 100644 --- a/breadbox/breadbox/crud/dataset.py +++ b/breadbox/breadbox/crud/dataset.py @@ -53,10 +53,7 @@ TRANSIENT_GROUP_ID, get_transient_group, ) -from breadbox.io.filestore_crud import ( - get_slice, - delete_data_files, -) +from breadbox.io.filestore_crud import delete_data_files from breadbox.crud.dimension_type import get_dimension_type from .metadata import cast_tabular_cell_value_type from .dataset_reference import add_id_mapping @@ -995,41 +992,6 @@ def get_dataset_samples(db: SessionWithUser, dataset: Dataset, user: str): return dataset_samples -def get_dataset_feature_labels_by_id( - db: SessionWithUser, user: str, dataset: MatrixDataset, -) -> dict[str, str]: - """ - Try loading feature labels from metadata. - If there are no labels in the metadata or there is no metadata, then just return the feature names. - """ - metadata_labels_by_given_id = get_dataset_feature_annotations( - db=db, user=user, dataset=dataset, metadata_col_name="label" - ) - - if metadata_labels_by_given_id: - return metadata_labels_by_given_id - else: - all_dataset_features = get_dataset_features(db=db, dataset=dataset, user=user) - return {feature.given_id: feature.given_id for feature in all_dataset_features} - - -def get_dataset_sample_labels_by_id( - db: SessionWithUser, user: str, dataset: MatrixDataset, -) -> dict[str, str]: - """ - Try loading sample labels from metadata. - If there are no labels in the metadata or there is no metadata, then just return the sample names. - """ - metadata_labels = get_dataset_sample_annotations( - db=db, user=user, dataset=dataset, metadata_col_name="label" - ) - if metadata_labels: - return metadata_labels - else: - samples = get_dataset_samples(db=db, dataset=dataset, user=user) - return {sample.given_id: sample.given_id for sample in samples} - - from typing import Any @@ -1413,50 +1375,6 @@ def get_dataset_sample_by_given_id( return sample -def get_dataset_feature_by_label( - db: SessionWithUser, dataset_id: str, feature_label: str -) -> DatasetFeature: - """Load the dataset feature corresponding to the given dataset ID and feature label""" - - dataset = get_dataset(db, db.user, dataset_id) - if dataset is None: - raise ResourceNotFoundError(f"Dataset '{dataset_id}' not found.") - assert_user_has_access_to_dataset(dataset, db.user) - assert isinstance(dataset, MatrixDataset) - - labels_by_given_id = get_dataset_feature_labels_by_id(db, db.user, dataset) - given_ids_by_label = {label: id for id, label in labels_by_given_id.items()} - feature_given_id = given_ids_by_label.get(feature_label) - if feature_given_id is None: - raise ResourceNotFoundError( - f"Feature label '{feature_label}' not found in dataset '{dataset_id}'." - ) - - return get_dataset_feature_by_given_id(db, dataset_id, feature_given_id) - - -def get_dataset_sample_by_label( - db: SessionWithUser, dataset_id: str, sample_label: str -) -> DatasetSample: - """Load the dataset sample corresponding to the given dataset ID and sample label""" - - dataset = get_dataset(db, db.user, dataset_id) - if dataset is None: - raise ResourceNotFoundError(f"Dataset '{dataset_id}' not found.") - assert_user_has_access_to_dataset(dataset, db.user) - assert isinstance(dataset, MatrixDataset) - - labels_by_given_id = get_dataset_sample_labels_by_id(db, db.user, dataset) - given_ids_by_label = {label: id for id, label in labels_by_given_id.items()} - sample_given_id = given_ids_by_label.get(sample_label) - if sample_given_id is None: - raise ResourceNotFoundError( - f"Sample label '{sample_label}' not found in dataset '{dataset_id}'." - ) - - return get_dataset_sample_by_given_id(db, dataset_id, sample_given_id) - - def _get_column_types(columns_metadata, columns: Optional[List[str]]): col_and_column_metadata_pairs = columns_metadata.items() if columns is None: @@ -1648,76 +1566,3 @@ def get_missing_tabular_columns_and_indices( ) return missing_columns, missing_indices - - -def get_subsetted_matrix_dataset_df( - db: SessionWithUser, - user: str, - dataset: MatrixDataset, - dimensions_info: MatrixDimensionsInfo, - filestore_location, - strict: bool = False, # False default for backwards compatibility -): - """ - Load a dataframe containing data for the specified dimensions. - If the dimensions are specified by label, then return a result indexed by labels - """ - - missing_features = [] - missing_samples = [] - - if dimensions_info.features is None: - feature_indexes = None - elif dimensions_info.feature_identifier.value == "id": - feature_indexes, missing_features = get_feature_indexes_by_given_ids( - db, user, dataset, dimensions_info.features - ) - else: - assert dimensions_info.feature_identifier.value == "label" - feature_indexes, missing_features = get_dimension_indexes_of_labels( - db, user, dataset, axis="feature", dimension_labels=dimensions_info.features - ) - - if len(missing_features) > 0: - log.warning(f"Could not find features: {missing_features}") - - if dimensions_info.samples is None: - sample_indexes = None - elif dimensions_info.sample_identifier.value == "id": - sample_indexes, missing_samples = get_sample_indexes_by_given_ids( - db, user, dataset, dimensions_info.samples - ) - else: - sample_indexes, missing_samples = get_dimension_indexes_of_labels( - db, user, dataset, axis="sample", dimension_labels=dimensions_info.samples - ) - - if len(missing_samples) > 0: - log.warning(f"Could not find samples: {missing_samples}") - - if strict: - num_missing_features = len(missing_features) - missing_features_msg = f"{num_missing_features} missing features: {missing_features[:20] + ['...'] if num_missing_features >= 20 else missing_features}" - num_missing_samples = len(missing_samples) - missing_samples_msg = f"{num_missing_samples} missing samples: {missing_samples[:20] + ['...'] if num_missing_samples >= 20 else missing_samples}" - if len(missing_features) > 0 or len(missing_samples) > 0: - raise UserError(f"{missing_features_msg} and {missing_samples_msg}") - - # call sort on the indices because hdf5_read requires indices be in ascending order - if feature_indexes is not None: - feature_indexes = sorted(feature_indexes) - if sample_indexes is not None: - sample_indexes = sorted(sample_indexes) - - df = get_slice(dataset, feature_indexes, sample_indexes, filestore_location) - - # Re-index by label if applicable - if dimensions_info.feature_identifier == FeatureSampleIdentifier.label: - labels_by_id = get_dataset_feature_labels_by_id(db, user, dataset) - df = df.rename(columns=labels_by_id) - - if dimensions_info.sample_identifier == FeatureSampleIdentifier.label: - label_by_id = get_dataset_sample_labels_by_id(db, user, dataset) - df = df.rename(index=label_by_id) - - return df diff --git a/breadbox/breadbox/service/__init__.py b/breadbox/breadbox/service/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/breadbox/breadbox/service/data_loading.py b/breadbox/breadbox/service/data_loading.py new file mode 100644 index 00000000..16b19011 --- /dev/null +++ b/breadbox/breadbox/service/data_loading.py @@ -0,0 +1,98 @@ +import logging + +from breadbox.db.session import SessionWithUser +from breadbox.crud import dataset as dataset_crud +from breadbox.io import filestore_crud as filestore_crud +from breadbox.models.dataset import MatrixDataset +from breadbox.service.labels import ( + get_dataset_feature_labels_by_id, + get_dataset_sample_labels_by_id, +) +from breadbox.schemas.custom_http_exception import UserError +from breadbox.schemas.dataset import ( + FeatureSampleIdentifier, + MatrixDimensionsInfo, +) + +log = logging.getLogger(__name__) + + +def get_subsetted_matrix_dataset_df( + db: SessionWithUser, + user: str, + dataset: MatrixDataset, + dimensions_info: MatrixDimensionsInfo, + filestore_location, + strict: bool = False, # False default for backwards compatibility +): + """ + Load a dataframe containing data for the specified dimensions. + If the dimensions are specified by label, then return a result indexed by labels + """ + + missing_features = [] + missing_samples = [] + + if dimensions_info.features is None: + feature_indexes = None + elif dimensions_info.feature_identifier.value == "id": + ( + feature_indexes, + missing_features, + ) = dataset_crud.get_feature_indexes_by_given_ids( + db, user, dataset, dimensions_info.features + ) + else: + assert dimensions_info.feature_identifier.value == "label" + ( + feature_indexes, + missing_features, + ) = dataset_crud.get_dimension_indexes_of_labels( + db, user, dataset, axis="feature", dimension_labels=dimensions_info.features + ) + + if len(missing_features) > 0: + log.warning(f"Could not find features: {missing_features}") + + if dimensions_info.samples is None: + sample_indexes = None + elif dimensions_info.sample_identifier.value == "id": + sample_indexes, missing_samples = dataset_crud.get_sample_indexes_by_given_ids( + db, user, dataset, dimensions_info.samples + ) + else: + sample_indexes, missing_samples = dataset_crud.get_dimension_indexes_of_labels( + db, user, dataset, axis="sample", dimension_labels=dimensions_info.samples + ) + + if len(missing_samples) > 0: + log.warning(f"Could not find samples: {missing_samples}") + + if strict: + num_missing_features = len(missing_features) + missing_features_msg = f"{num_missing_features} missing features: {missing_features[:20] + ['...'] if num_missing_features >= 20 else missing_features}" + num_missing_samples = len(missing_samples) + missing_samples_msg = f"{num_missing_samples} missing samples: {missing_samples[:20] + ['...'] if num_missing_samples >= 20 else missing_samples}" + if len(missing_features) > 0 or len(missing_samples) > 0: + raise UserError(f"{missing_features_msg} and {missing_samples_msg}") + + # call sort on the indices because hdf5_read requires indices be in ascending order + if feature_indexes is not None: + feature_indexes = sorted(feature_indexes) + if sample_indexes is not None: + sample_indexes = sorted(sample_indexes) + + df = filestore_crud.get_slice( + dataset, feature_indexes, sample_indexes, filestore_location + ) + + # Re-index by label if applicable + if dimensions_info.feature_identifier == FeatureSampleIdentifier.label: + labels_by_id = get_dataset_feature_labels_by_id(db, user, dataset) + df = df.rename(columns=labels_by_id) + + if dimensions_info.sample_identifier == FeatureSampleIdentifier.label: + label_by_id = get_dataset_sample_labels_by_id(db, user, dataset) + df = df.rename(index=label_by_id) + + return df diff --git a/breadbox/breadbox/service/labels.py b/breadbox/breadbox/service/labels.py new file mode 100644 index 00000000..3b193ed6 --- /dev/null +++ b/breadbox/breadbox/service/labels.py @@ -0,0 +1,93 @@ +from breadbox.db.session import SessionWithUser +from breadbox.crud import dataset as dataset_crud +from breadbox.crud import dimension_type as dimension_type_crud +from breadbox.models.dataset import ( + DatasetFeature, + DatasetSample, + MatrixDataset, + TabularDataset, +) +from breadbox.schemas.custom_http_exception import ResourceNotFoundError + + +def get_dataset_feature_labels_by_id( + db: SessionWithUser, user: str, dataset: MatrixDataset, +) -> dict[str, str]: + """ + Try loading feature labels from metadata. + If there are no labels in the metadata or there is no metadata, then just return the feature names. + """ + metadata_labels_by_given_id = dataset_crud.get_dataset_feature_annotations( + db=db, user=user, dataset=dataset, metadata_col_name="label" + ) + + if metadata_labels_by_given_id: + return metadata_labels_by_given_id + else: + all_dataset_features = dataset_crud.get_dataset_features( + db=db, dataset=dataset, user=user + ) + return {feature.given_id: feature.given_id for feature in all_dataset_features} + + +def get_dataset_sample_labels_by_id( + db: SessionWithUser, user: str, dataset: MatrixDataset, +) -> dict[str, str]: + """ + Try loading sample labels from metadata. + If there are no labels in the metadata or there is no metadata, then just return the sample names. + """ + metadata_labels = dataset_crud.get_dataset_sample_annotations( + db=db, user=user, dataset=dataset, metadata_col_name="label" + ) + if metadata_labels: + return metadata_labels + else: + samples = dataset_crud.get_dataset_samples(db=db, dataset=dataset, user=user) + return {sample.given_id: sample.given_id for sample in samples} + + +def get_dataset_feature_by_label( + db: SessionWithUser, dataset_id: str, feature_label: str +) -> DatasetFeature: + """Load the dataset feature corresponding to the given dataset ID and feature label""" + + dataset = dataset_crud.get_dataset(db, db.user, dataset_id) + if dataset is None: + raise ResourceNotFoundError(f"Dataset '{dataset_id}' not found.") + dataset_crud.assert_user_has_access_to_dataset(dataset, db.user) + assert isinstance(dataset, MatrixDataset) + + labels_by_given_id = get_dataset_feature_labels_by_id(db, db.user, dataset) + given_ids_by_label = {label: id for id, label in labels_by_given_id.items()} + feature_given_id = given_ids_by_label.get(feature_label) + if feature_given_id is None: + raise ResourceNotFoundError( + f"Feature label '{feature_label}' not found in dataset '{dataset_id}'." + ) + + return dataset_crud.get_dataset_feature_by_given_id( + db, dataset_id, feature_given_id + ) + + +def get_dataset_sample_by_label( + db: SessionWithUser, dataset_id: str, sample_label: str +) -> DatasetSample: + """Load the dataset sample corresponding to the given dataset ID and sample label""" + + dataset = dataset_crud.get_dataset(db, db.user, dataset_id) + if dataset is None: + raise ResourceNotFoundError(f"Dataset '{dataset_id}' not found.") + dataset_crud.assert_user_has_access_to_dataset(dataset, db.user) + assert isinstance(dataset, MatrixDataset) + + labels_by_given_id = get_dataset_sample_labels_by_id(db, db.user, dataset) + given_ids_by_label = {label: id for id, label in labels_by_given_id.items()} + sample_given_id = given_ids_by_label.get(sample_label) + if sample_given_id is None: + raise ResourceNotFoundError( + f"Sample label '{sample_label}' not found in dataset '{dataset_id}'." + ) + + return dataset_crud.get_dataset_sample_by_given_id(db, dataset_id, sample_given_id) From c3c53918b896c2b91a8980a83cba36a88ac6c1e3 Mon Sep 17 00:00:00 2001 From: Sarah Wessel Date: Thu, 17 Oct 2024 14:17:45 -0400 Subject: [PATCH 4/7] Fix references to service functions --- breadbox/breadbox/api/datasets.py | 15 ++-- breadbox/breadbox/compute/analysis_tasks.py | 13 +-- breadbox/breadbox/compute/download_tasks.py | 2 +- breadbox/breadbox/crud/slice.py | 15 +++- breadbox/breadbox/service/labels.py | 2 - breadbox/tests/crud/test_dataset.py | 83 ------------------- breadbox/tests/service/__init__.py | 0 breadbox/tests/service/test_labels.py | 88 +++++++++++++++++++++ 8 files changed, 117 insertions(+), 101 deletions(-) create mode 100644 breadbox/tests/service/__init__.py create mode 100644 breadbox/tests/service/test_labels.py diff --git a/breadbox/breadbox/api/datasets.py b/breadbox/breadbox/api/datasets.py index 9bd30a43..ba91b777 100644 --- a/breadbox/breadbox/api/datasets.py +++ b/breadbox/breadbox/api/datasets.py @@ -54,6 +54,11 @@ DimensionDataResponse, SliceQueryIdentifierType, ) +from breadbox.service.data_loading import get_subsetted_matrix_dataset_df +from breadbox.service.labels import ( + get_dataset_feature_labels_by_id, + get_dataset_sample_labels_by_id, +) from .dependencies import get_dataset as get_dataset_dep from .dependencies import get_db_with_user, get_user @@ -110,7 +115,7 @@ def get_dataset_features( if dataset is None: raise HTTPException(404, "Dataset not found") - feature_labels_by_id = dataset_crud.get_dataset_feature_labels_by_id( + feature_labels_by_id = get_dataset_feature_labels_by_id( db=db, user=user, dataset=dataset, ) return [{"id": id, "label": label} for id, label in feature_labels_by_id.items()] @@ -133,7 +138,7 @@ def get_dataset_samples( if dataset is None: raise HTTPException(404, "Dataset not found") - sample_labels_by_id = dataset_crud.get_dataset_sample_labels_by_id( + sample_labels_by_id = get_dataset_sample_labels_by_id( db=db, user=user, dataset=dataset, ) return [{"id": id, "label": label} for id, label in sample_labels_by_id.items()] @@ -185,7 +190,7 @@ def get_feature_data( # Get the feature label if dataset.feature_type_name: # Note: this would be faster if we had a query to load one label instead of all labels - but performance hasn't been an issue - feature_labels_by_id = dataset_crud.get_dataset_feature_labels_by_id( + feature_labels_by_id = get_dataset_feature_labels_by_id( db=db, user=user, dataset=dataset ) label = feature_labels_by_id[feature.given_id] @@ -323,7 +328,7 @@ def get_matrix_dataset_data( ] = False, ): try: - df = dataset_crud.get_subsetted_matrix_dataset_df( + df = get_subsetted_matrix_dataset_df( db, user, dataset, @@ -404,7 +409,7 @@ def get_dataset_data( except UserError as e: raise e - df = dataset_crud.get_subsetted_matrix_dataset_df( + df = get_subsetted_matrix_dataset_df( db, user, dataset, dim_info, settings.filestore_location ) diff --git a/breadbox/breadbox/compute/analysis_tasks.py b/breadbox/breadbox/compute/analysis_tasks.py index 58395459..c65cd11b 100644 --- a/breadbox/breadbox/compute/analysis_tasks.py +++ b/breadbox/breadbox/compute/analysis_tasks.py @@ -23,8 +23,11 @@ ValueType, ) from breadbox.schemas.custom_http_exception import ResourceNotFoundError, UserError - from breadbox.schemas.dataset import MatrixDatasetIn +from breadbox.service.labels import ( + get_dataset_feature_labels_by_id, + get_dataset_feature_by_label, +) from ..crud.types import get_dimension_type from ..crud import dataset as dataset_crud @@ -170,9 +173,7 @@ def get_features_info_and_dataset( result_features: List[Feature] = [] dataset_feature_ids: List[str] = [] datasets: List[Dataset] = [] - feature_labels_by_id = dataset_crud.get_dataset_feature_labels_by_id( - db, user, dataset - ) + feature_labels_by_id = get_dataset_feature_labels_by_id(db, user, dataset) feature_indices = [] for dataset_feat in dataset_features: @@ -493,12 +494,12 @@ def create_cell_line_group( # Return the feature ID associated with the new dataset feature if use_feature_ids: - feature: DatasetFeature = dataset_crud.get_dataset_feature_by_label( + feature: DatasetFeature = get_dataset_feature_by_label( db=db, dataset_id=dataset_id, feature_label=feature_label ) return _format_breadbox_shim_slice_id(feature.dataset_id, feature.given_id) else: - dataset_feature = dataset_crud.get_dataset_feature_by_label( + dataset_feature = get_dataset_feature_by_label( db, dataset_id, feature_label ) return str(dataset_feature.id) diff --git a/breadbox/breadbox/compute/download_tasks.py b/breadbox/breadbox/compute/download_tasks.py index 16cd39e7..b27c18fc 100644 --- a/breadbox/breadbox/compute/download_tasks.py +++ b/breadbox/breadbox/compute/download_tasks.py @@ -11,7 +11,6 @@ from breadbox.crud.dataset import get_sample_indexes_by_given_ids from breadbox.crud.dataset import get_all_sample_indexes from breadbox.crud.partial import get_cell_line_selector_lines -from breadbox.crud.dataset import get_dataset_feature_labels_by_id from ..config import get_settings from ..models.dataset import ( Dataset, @@ -21,6 +20,7 @@ ) from .celery import app from ..db.util import db_context +from breadbox.service.labels import get_dataset_feature_labels_by_id def _progress_callback(task, percentage, message="Fetching data"): diff --git a/breadbox/breadbox/crud/slice.py b/breadbox/breadbox/crud/slice.py index 2501b097..a1837be1 100644 --- a/breadbox/breadbox/crud/slice.py +++ b/breadbox/breadbox/crud/slice.py @@ -13,6 +13,13 @@ get_sample_slice, ) +from breadbox.service.labels import ( + get_dataset_feature_labels_by_id, + get_dataset_sample_labels_by_id, + get_dataset_feature_by_label, + get_dataset_sample_by_label, +) + from depmap_compute.slice import SliceQuery @@ -57,7 +64,7 @@ def get_slice_data( slice_data = get_feature_slice(dataset, [feature.index], filestore_location) elif slice_query.identifier_type == "feature_label": - feature = dataset_crud.get_dataset_feature_by_label( + feature = get_dataset_feature_by_label( db, dataset_id, feature_label=slice_query.identifier, ) slice_data = get_feature_slice(dataset, [feature.index], filestore_location) @@ -69,7 +76,7 @@ def get_slice_data( slice_data = get_sample_slice(dataset, [sample.index], filestore_location) elif slice_query.identifier_type == "sample_label": - sample = dataset_crud.get_dataset_sample_by_label( + sample = get_dataset_sample_by_label( db, dataset_id, sample_label=slice_query.identifier ) slice_data = get_sample_slice(dataset, [sample.index], filestore_location) @@ -99,9 +106,9 @@ def get_labels_for_slice_type( raise ResourceNotFoundError(f"Dataset '{slice_query.dataset_id}' not found.") if slice_query.identifier_type in {"feature_label", "feature_id"}: - return dataset_crud.get_dataset_sample_labels_by_id(db, db.user, dataset) + return get_dataset_sample_labels_by_id(db, db.user, dataset) elif slice_query.identifier_type in {"sample_label", "sample_id"}: - return dataset_crud.get_dataset_feature_labels_by_id(db, db.user, dataset) + return get_dataset_feature_labels_by_id(db, db.user, dataset) else: # Columns don't have labels, so just return None return None diff --git a/breadbox/breadbox/service/labels.py b/breadbox/breadbox/service/labels.py index 3b193ed6..df41cc5b 100644 --- a/breadbox/breadbox/service/labels.py +++ b/breadbox/breadbox/service/labels.py @@ -1,11 +1,9 @@ from breadbox.db.session import SessionWithUser from breadbox.crud import dataset as dataset_crud -from breadbox.crud import dimension_type as dimension_type_crud from breadbox.models.dataset import ( DatasetFeature, DatasetSample, MatrixDataset, - TabularDataset, ) from breadbox.schemas.custom_http_exception import ResourceNotFoundError diff --git a/breadbox/tests/crud/test_dataset.py b/breadbox/tests/crud/test_dataset.py index 27b9ca3c..88edf605 100644 --- a/breadbox/tests/crud/test_dataset.py +++ b/breadbox/tests/crud/test_dataset.py @@ -1,15 +1,10 @@ import numpy as np -import pandas as pd -import pytest from breadbox.db.session import SessionWithUser from breadbox.crud.dataset import ( - get_dataset_feature_by_label, get_dataset, get_dataset_feature_by_given_id, ) -from breadbox.models.dataset import AnnotationType -from breadbox.schemas.custom_http_exception import ResourceNotFoundError from tests import factories @@ -38,84 +33,6 @@ def test_get_dataset(minimal_db, settings): assert get_dataset(minimal_db, minimal_db.user, "foo") is None -def test_get_dataset_feature_by_label(minimal_db: SessionWithUser, settings): - - # Test the case where there is no metadata (where labels are given_ids) - example_matrix_values = factories.matrix_csv_data_file_with_values( - feature_ids=["featureID1", "featureID2", "featureID3"], - sample_ids=["sampleID1", "sampleID2", "sampleID3"], - values=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), - ) - dataset_with_generic_features = factories.matrix_dataset( - minimal_db, settings, feature_type=None, data_file=example_matrix_values - ) - minimal_db.reset_user(settings.default_user) - feature = get_dataset_feature_by_label( - minimal_db, - dataset_id=dataset_with_generic_features.id, - feature_label="featureID1", - ) - assert feature.dataset_id == dataset_with_generic_features.id - assert feature.given_id == "featureID1" - - # Test the case where metadata does exist - minimal_db.reset_user(settings.admin_users[0]) - factories.add_dimension_type( - minimal_db, - settings, - user=settings.admin_users[0], - name="feature-with-metadata", - display_name="Feature With Metadata", - id_column="ID", - annotation_type_mapping={ - "ID": AnnotationType.text, - "label": AnnotationType.text, - }, - axis="feature", - metadata_df=pd.DataFrame( - { - "ID": ["featureID1", "featureID2", "featureID3"], - "label": ["featureLabel1", "featureLabel2", "featureLabel3"], - } - ), - ) - example_matrix_values = factories.matrix_csv_data_file_with_values( - feature_ids=["featureID1", "featureID2", "featureID3"], - sample_ids=["sampleID1", "sampleID2", "sampleID3"], - values=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), - ) - dataset_with_metadata = factories.matrix_dataset( - minimal_db, - settings, - feature_type="feature-with-metadata", - data_file=example_matrix_values, - ) - - # Query with the non-admin user - minimal_db.reset_user(settings.default_user) - feature = get_dataset_feature_by_label( - minimal_db, dataset_id=dataset_with_metadata.id, feature_label="featureLabel1", - ) - assert feature.dataset_id == dataset_with_metadata.id - assert feature.given_id == "featureID1" - - # When the dataset does not exist, a clear error should be raised - with pytest.raises(ResourceNotFoundError): - get_dataset_feature_by_label( - minimal_db, - dataset_id="Undefined-dataset", - feature_label="someFeatureLabel", - ) - - # When the featureLabel does not exist within the dataset, a clear error should be raised - with pytest.raises(ResourceNotFoundError): - get_dataset_feature_by_label( - minimal_db, - dataset_id=dataset_with_generic_features.id, - feature_label="Undefined_Feature_Label", - ) - - def test_get_dataset_feature_by_given_id(minimal_db: SessionWithUser, settings): """ Test that this works regardless of whether dataset id or given id are passed in. diff --git a/breadbox/tests/service/__init__.py b/breadbox/tests/service/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/breadbox/tests/service/test_labels.py b/breadbox/tests/service/test_labels.py new file mode 100644 index 00000000..dd9e3319 --- /dev/null +++ b/breadbox/tests/service/test_labels.py @@ -0,0 +1,88 @@ +import numpy as np +import pandas as pd +import pytest + +from breadbox.db.session import SessionWithUser +from breadbox.service.labels import get_dataset_feature_by_label +from breadbox.models.dataset import AnnotationType +from breadbox.schemas.custom_http_exception import ResourceNotFoundError + +from tests import factories + + +def test_get_dataset_feature_by_label(minimal_db: SessionWithUser, settings): + + # Test the case where there is no metadata (where labels are given_ids) + example_matrix_values = factories.matrix_csv_data_file_with_values( + feature_ids=["featureID1", "featureID2", "featureID3"], + sample_ids=["sampleID1", "sampleID2", "sampleID3"], + values=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), + ) + dataset_with_generic_features = factories.matrix_dataset( + minimal_db, settings, feature_type=None, data_file=example_matrix_values + ) + minimal_db.reset_user(settings.default_user) + feature = get_dataset_feature_by_label( + minimal_db, + dataset_id=dataset_with_generic_features.id, + feature_label="featureID1", + ) + assert feature.dataset_id == dataset_with_generic_features.id + assert feature.given_id == "featureID1" + + # Test the case where metadata does exist + minimal_db.reset_user(settings.admin_users[0]) + factories.add_dimension_type( + minimal_db, + settings, + user=settings.admin_users[0], + name="feature-with-metadata", + display_name="Feature With Metadata", + id_column="ID", + annotation_type_mapping={ + "ID": AnnotationType.text, + "label": AnnotationType.text, + }, + axis="feature", + metadata_df=pd.DataFrame( + { + "ID": ["featureID1", "featureID2", "featureID3"], + "label": ["featureLabel1", "featureLabel2", "featureLabel3"], + } + ), + ) + example_matrix_values = factories.matrix_csv_data_file_with_values( + feature_ids=["featureID1", "featureID2", "featureID3"], + sample_ids=["sampleID1", "sampleID2", "sampleID3"], + values=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), + ) + dataset_with_metadata = factories.matrix_dataset( + minimal_db, + settings, + feature_type="feature-with-metadata", + data_file=example_matrix_values, + ) + + # Query with the non-admin user + minimal_db.reset_user(settings.default_user) + feature = get_dataset_feature_by_label( + minimal_db, dataset_id=dataset_with_metadata.id, feature_label="featureLabel1", + ) + assert feature.dataset_id == dataset_with_metadata.id + assert feature.given_id == "featureID1" + + # When the dataset does not exist, a clear error should be raised + with pytest.raises(ResourceNotFoundError): + get_dataset_feature_by_label( + minimal_db, + dataset_id="Undefined-dataset", + feature_label="someFeatureLabel", + ) + + # When the featureLabel does not exist within the dataset, a clear error should be raised + with pytest.raises(ResourceNotFoundError): + get_dataset_feature_by_label( + minimal_db, + dataset_id=dataset_with_generic_features.id, + feature_label="Undefined_Feature_Label", + ) From 57fa9695aed730ae6bbdef5905318d3a7df2bbc8 Mon Sep 17 00:00:00 2001 From: Sarah Wessel Date: Thu, 17 Oct 2024 14:21:54 -0400 Subject: [PATCH 5/7] Move slice utils to service layer --- breadbox/breadbox/api/datasets.py | 6 +++--- breadbox/breadbox/{crud => service}/slice.py | 0 breadbox/tests/{crud => service}/test_slice.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename breadbox/breadbox/{crud => service}/slice.py (100%) rename breadbox/tests/{crud => service}/test_slice.py (99%) diff --git a/breadbox/breadbox/api/datasets.py b/breadbox/breadbox/api/datasets.py index ba91b777..75285b9a 100644 --- a/breadbox/breadbox/api/datasets.py +++ b/breadbox/breadbox/api/datasets.py @@ -30,7 +30,6 @@ from breadbox.crud.access_control import PUBLIC_GROUP_ID from ..crud import dataset as dataset_crud from ..crud import types as type_crud -from ..crud import slice as slice_crud from ..models.dataset import ( Dataset as DatasetModel, @@ -59,6 +58,7 @@ get_dataset_feature_labels_by_id, get_dataset_sample_labels_by_id, ) +from breadbox.service.slice import get_slice_data, get_labels_for_slice_type from .dependencies import get_dataset as get_dataset_dep from .dependencies import get_db_with_user, get_user @@ -495,13 +495,13 @@ def get_dimension_data( identifier=identifier, identifier_type=identifier_type.name, ) - slice_values_by_id = slice_crud.get_slice_data( + slice_values_by_id = get_slice_data( db, settings.filestore_location, parsed_slice_query ) # Load the labels separately, ensuring they're in the same order as the other values slice_ids: list = slice_values_by_id.index.tolist() - labels_by_id = slice_crud.get_labels_for_slice_type(db, parsed_slice_query) + labels_by_id = get_labels_for_slice_type(db, parsed_slice_query) slice_labels = [labels_by_id[id] for id in slice_ids] if labels_by_id else slice_ids return { diff --git a/breadbox/breadbox/crud/slice.py b/breadbox/breadbox/service/slice.py similarity index 100% rename from breadbox/breadbox/crud/slice.py rename to breadbox/breadbox/service/slice.py diff --git a/breadbox/tests/crud/test_slice.py b/breadbox/tests/service/test_slice.py similarity index 99% rename from breadbox/tests/crud/test_slice.py rename to breadbox/tests/service/test_slice.py index 86a5a139..4ed5e469 100644 --- a/breadbox/tests/crud/test_slice.py +++ b/breadbox/tests/service/test_slice.py @@ -2,9 +2,9 @@ import pandas as pd from breadbox.db.session import SessionWithUser -from breadbox.crud.slice import get_slice_data from breadbox.models.dataset import AnnotationType from breadbox.schemas.dataset import ColumnMetadata +from breadbox.service.slice import get_slice_data from depmap_compute.slice import SliceQuery From b3aed79af6358b2288368c479af9e707a472dd44 Mon Sep 17 00:00:00 2001 From: Sarah Wessel Date: Thu, 17 Oct 2024 14:25:55 -0400 Subject: [PATCH 6/7] Debug pyright --- breadbox/breadbox/service/data_loading.py | 8 +++++--- breadbox/pyright-ratchet-errors.txt | 1 - 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/breadbox/breadbox/service/data_loading.py b/breadbox/breadbox/service/data_loading.py index 16b19011..d1ed7312 100644 --- a/breadbox/breadbox/service/data_loading.py +++ b/breadbox/breadbox/service/data_loading.py @@ -33,9 +33,10 @@ def get_subsetted_matrix_dataset_df( missing_features = [] missing_samples = [] + feature_identifier = dimensions_info.feature_identifier if dimensions_info.features is None: feature_indexes = None - elif dimensions_info.feature_identifier.value == "id": + elif feature_identifier and feature_identifier.value == "id": ( feature_indexes, missing_features, @@ -43,7 +44,7 @@ def get_subsetted_matrix_dataset_df( db, user, dataset, dimensions_info.features ) else: - assert dimensions_info.feature_identifier.value == "label" + assert feature_identifier and feature_identifier.value == "label" ( feature_indexes, missing_features, @@ -54,9 +55,10 @@ def get_subsetted_matrix_dataset_df( if len(missing_features) > 0: log.warning(f"Could not find features: {missing_features}") + sample_identifier = dimensions_info.sample_identifier if dimensions_info.samples is None: sample_indexes = None - elif dimensions_info.sample_identifier.value == "id": + elif sample_identifier and sample_identifier.value == "id": sample_indexes, missing_samples = dataset_crud.get_sample_indexes_by_given_ids( db, user, dataset, dimensions_info.samples ) diff --git a/breadbox/pyright-ratchet-errors.txt b/breadbox/pyright-ratchet-errors.txt index 1d5c7b22..593cc858 100644 --- a/breadbox/pyright-ratchet-errors.txt +++ b/breadbox/pyright-ratchet-errors.txt @@ -7,7 +7,6 @@ data_validation.py: error: Argument of type "BinaryIO" cannot be assigned to par data_validation.py: error: Cannot access attribute "all" for class "bool_t" data_validation.py: error: Invalid conditional operand of type "Series | bool_t | Unknown" data_validation.py: error: No overloads for "read_csv" match the provided arguments (reportCallIssue) -dataset.py: error: "value" is not a known attribute of "None" (reportOptionalMemberAccess) dataset.py: error: Argument of type "Hashable" cannot be assigned to parameter "dimension_given_id" of type "str" in function "__init__" dataset.py: error: Argument of type "list[dict[str, Series | Unknown | ndarray[Any, Unknown] | Any | NDArray[Unknown]]]" cannot be assigned to parameter "matching_properties" of type "List[Dict[str, str]]" in function "__init__" (reportArgumentType) dataset_tasks.py: error: Expression of type "None" is incompatible with declared type "List[Unknown]" From 69a8b2cd3cbdfebb732d2423738b0cb728500a78 Mon Sep 17 00:00:00 2001 From: Sarah Wessel Date: Thu, 17 Oct 2024 14:41:35 -0400 Subject: [PATCH 7/7] Remove old comment --- breadbox/breadbox/crud/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/breadbox/breadbox/crud/dataset.py b/breadbox/breadbox/crud/dataset.py index 76432a4d..22cb9e0a 100644 --- a/breadbox/breadbox/crud/dataset.py +++ b/breadbox/breadbox/crud/dataset.py @@ -1418,7 +1418,6 @@ def get_subsetted_tabular_dataset_df( ) if tabular_dimensions_info.identifier == FeatureSampleIdentifier.label: - # TODO: move this duplicate query into a helper function, take optional index filter # Get the corresponding dimension ids for the dimension labels from the dataset's dimension type and use the dimension ids to filter values by dimension_type: DimensionType = db.query(DimensionType).filter( DimensionType.name == dataset.index_type_name