diff --git a/ingest/config.py b/ingest/config.py index 9a927c4f..98fc3a8b 100644 --- a/ingest/config.py +++ b/ingest/config.py @@ -10,12 +10,12 @@ MONGO_CONNECTION = MongoConnection() -def init(study_id, study_file_id, user_metric_uuid=None): +def init(study_id, study_file_id, user_metric_uuid=None, action=None): global __metric_properties study = Study(study_id) study_file = StudyFile(study_file_id) - __metric_properties = MetricProperties(study, study_file, user_metric_uuid) + __metric_properties = MetricProperties(study, study_file, user_metric_uuid, action) def set_parent_event_name(event_name): @@ -39,7 +39,7 @@ class MetricProperties: # This is a generic write-only log token, not a secret USER_ID = "2f30ec50-a04d-4d43-8fd1-b136a2045079" - def __init__(self, study, study_file, user_uuid=None): + def __init__(self, study, study_file, user_uuid=None, action=None): distinct_id = user_uuid if user_uuid else MetricProperties.USER_ID self.__properties = { "distinct_id": distinct_id, @@ -50,7 +50,11 @@ def __init__(self, study, study_file, user_uuid=None): "trigger": study_file.trigger, "logger": "ingest-pipeline", "appId": "single-cell-portal", + "action": action } + # merge in referenceAnnDataFile if necessary + if study_file.file_type == 'AnnData': + self.__properties["referenceAnnDataFile"] = study_file.is_reference_anndata def get_properties(self): return self.__properties @@ -171,12 +175,15 @@ def study_file(self, study_file_id): self.file_type = self.study_file["file_type"] self.file_size = self.study_file["upload_file_size"] self.file_name = self.study_file["name"] + upload_trigger = self.study_file.get("options", {}).get("upload_trigger") # when set, remote_location is the name of the file in the bucket - if self.study_file.get("remote_location") is not None: - if self.study_file["remote_location"] == "": - self.trigger = 'upload' - else: - self.trigger = 'sync' + if upload_trigger is not None: + self.trigger = upload_trigger + elif self.study_file["remote_location"] is not None: + self.trigger = 'upload' if self.study_file["remote_location"] == "" else 'sync' # indicate trigger state for tests/mocks else: self.trigger = 'not set' + + if self.study_file["file_type"] == 'AnnData': + self.is_reference_anndata = self.study_file.get("ann_data_file_info", {}).get("reference_file") diff --git a/ingest/ingest_pipeline.py b/ingest/ingest_pipeline.py index e2af3edc..68fc348b 100644 --- a/ingest/ingest_pipeline.py +++ b/ingest/ingest_pipeline.py @@ -124,6 +124,12 @@ class IngestPipeline: "../schema/alexandria_convention/alexandria_convention_schema.json" ) + # array of actions to use when reporting to Mixpanel + ACTION_NAMES = [ + 'ingest_cluster', 'ingest_cell_metadata', 'ingest_expression', 'ingest_anndata', 'ingest_subsample', + 'ingest_differential_expression', 'differential_expression', 'render_expression_arrays', 'rank_genes' + ] + # Logger provides more details for trouble shooting dev_logger = setup_logger(__name__, "log.txt", format="support_configs") user_logger = setup_logger(__name__ + ".usr", "user_log.txt", level=logging.ERROR) @@ -786,6 +792,10 @@ def exit_pipeline(ingest, status, status_cell_metadata, arguments): sys.exit(os.EX_DATAERR) sys.exit(1) +def get_action_from_args(arguments): + """Get the action from list of arguments denoting which data type is being ingested/extracted""" + action = list(set(IngestPipeline.ACTION_NAMES) & set(arguments)) + return action[0] if action else "" def main() -> None: """Enables running Ingest Pipeline via CLI @@ -811,6 +821,7 @@ def main() -> None: arguments["study_id"], arguments["study_file_id"], arguments["user_metrics_uuid"], + get_action_from_args(arguments) ) ingest = IngestPipeline(**arguments) status, status_cell_metadata = run_ingest(ingest, arguments, parsed_args) diff --git a/tests/test_ingest.py b/tests/test_ingest.py index 7be105a1..5fb64a4a 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -47,6 +47,7 @@ IngestPipeline, exit_pipeline, run_ingest, + get_action_from_args ) from expression_files.expression_files import GeneExpression @@ -793,6 +794,25 @@ def test_extract_processed_matrix_from_anndata(self): except: print(f"Error while deleting file : {file}") + def test_get_action_from_args(self): + args = [ + "--study-id", + "5d276a50421aa9117c982845", + "--study-file-id", + "5dd5ae25421aa910a723a337", + "ingest_subsample", + "--cluster-file", + "../tests/data/good_subsample_cluster.csv", + "--name", + "cluster1", + "--cell-metadata-file", + "../tests/data/test_cell_metadata.csv", + "--subsample", + ] + self.assertEqual("ingest_subsample", get_action_from_args(args)) + bad_args = ["foo", "bar", "bing"] + self.assertEqual("", get_action_from_args(bad_args)) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_metrics_service.py b/tests/test_metrics_service.py index c4162b3a..0ab9dd60 100644 --- a/tests/test_metrics_service.py +++ b/tests/test_metrics_service.py @@ -21,6 +21,7 @@ def mock_post_event(props): props = json.loads(props) expected_props = { "properties": { + "action": "ingest_expression", "studyAccession": "SCP123", "fileName": "File_name.txt", "fileType": "Expression matrix", @@ -29,6 +30,7 @@ def mock_post_event(props): "logger": "ingest-pipeline", "appId": "single-cell-portal", "status": "success", + "trigger": "bucket" } } if props["event"] == "file-validation": @@ -56,6 +58,8 @@ class MetricsServiceTestCase(unittest.TestCase): "file_type": "Expression matrix", "upload_file_size": 400, "name": "File_name.txt", + "remote_location": "File_name.txt", + "options": { "upload_trigger": "bucket" } } ] # Build client mock with functions and return values needed to query @@ -94,7 +98,7 @@ def test_log(self, mock_execute_ingest, mock_MONGO_CONNECTION, mock_post_event): "dense", ] # Initialize global variables - config.init("5d276a50421aa9117c982845", "5dd5ae25421aa910a723a337") + config.init("5d276a50421aa9117c982845", "5dd5ae25421aa910a723a337", None, 'ingest_expression') config.set_parent_event_name("ingest-pipeline:expression:ingest") IngestTestCase.execute_ingest(args) metrics_model = config.get_metric_properties() @@ -108,10 +112,39 @@ def test_log(self, mock_execute_ingest, mock_MONGO_CONNECTION, mock_post_event): # args.insert(3).append(user_metrics_uuid) # Initialize global variables config.init( - "5d276a50421aa9117c982845", "5dd5ae25421aa910a723a337", user_metrics_uuid + "5d276a50421aa9117c982845", "5dd5ae25421aa910a723a337", user_metrics_uuid, 'ingest_expression' ) config.set_parent_event_name("ingest-pipeline:expression:ingest") IngestTestCase.execute_ingest(args) metrics_model = config.get_metric_properties() # Log Mixpanel events MetricsService.log(config.get_parent_event_name(), metrics_model) + + @patch("config.MONGO_CONNECTION") + def test_init(self, mock_mongo_conn): + client_values = {} + client_values["study_accessions"] = MagicMock() + client_values["study_accessions"].find.return_value = [{"accession": "SCP123"}] + client_values["study_files"] = MagicMock() + client_values["study_files"].find.return_value = [ + { + "file_type": "AnnData", + "upload_file_size": 400, + "name": "matrix.h5ad", + "remote_location": "", + "ann_data_file_info": { + "reference_file": False + } + } + ] + # Build client mock with functions and return values needed to query + client_mock = MagicMock() + client_mock.__getitem__.side_effect = client_values.__getitem__ + mock_mongo_conn._client = client_mock + config.init( + "5d276a50421aa9117c982845", "5dd5ae25421aa910a723a337", user_metrics_uuid, "ingest_anndata" + ) + props = config.get_metric_properties().get_properties() + self.assertEqual("ingest_anndata", props["action"]) + self.assertEqual("upload", props["trigger"]) + self.assertFalse(props["referenceAnnDataFile"]) \ No newline at end of file