From c16248c9dfa547e0fec4cd9329e7d12f131eccf4 Mon Sep 17 00:00:00 2001 From: Steffengreiner Date: Tue, 14 Nov 2023 16:03:15 +0100 Subject: [PATCH] Remove mandatory requirement of run_id during parsing of nf-core datasets (#132) * Make run_id optional during nf-core parsing * Remove empty lines to trigger Github actions again --- pom.xml | 2 +- .../utils/BioinformaticAnalysisParser.groovy | 106 ++++++++++-------- .../life/qbic/utils/NanoporeParser.groovy | 2 - .../utils/BioinformaticAnalysisSpec.groovy | 37 ++++++ .../multiqc/star_salmon/multiqc_report.html | 0 .../execution_report_1234-56-78_90-12-34.html | 0 .../pipeline_info/software_versions.yml | 0 .../salmon/salmon.merged.gene_tpm.tsv | 0 .../validates-no-run-id/sample_ids.txt | 0 9 files changed, 96 insertions(+), 51 deletions(-) create mode 100644 src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/multiqc/star_salmon/multiqc_report.html create mode 100644 src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/pipeline_info/execution_report_1234-56-78_90-12-34.html create mode 100644 src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/pipeline_info/software_versions.yml create mode 100644 src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/salmon/salmon.merged.gene_tpm.tsv create mode 100644 src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/sample_ids.txt diff --git a/pom.xml b/pom.xml index 424e2d94..69e039e0 100644 --- a/pom.xml +++ b/pom.xml @@ -128,7 +128,7 @@ data-model-lib life.qbic - 2.27.0 + 2.28.0 org.mockito diff --git a/src/main/groovy/life/qbic/utils/BioinformaticAnalysisParser.groovy b/src/main/groovy/life/qbic/utils/BioinformaticAnalysisParser.groovy index 9b430753..71176167 100644 --- a/src/main/groovy/life/qbic/utils/BioinformaticAnalysisParser.groovy +++ b/src/main/groovy/life/qbic/utils/BioinformaticAnalysisParser.groovy @@ -17,7 +17,6 @@ import java.nio.file.NotDirectoryException import java.nio.file.Path import java.text.ParseException - /** *

 * Parser storing the fileTree of a nf-core pipeline output directory into JSON format
 *
*
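
The hunks below replace the per-name switch statements in the root-directory parsing code with a lookup over the enum constants: each RequiredRootFolderKeys and RequiredRootFileKeys entry now carries the directory or file name it stands for, and any directory without a matching constant is collected as a process folder. A minimal, self-contained Groovy sketch of that lookup pattern, using plain maps instead of the project's tree nodes (names are illustrative, not the project's API):

enum RootFolderKey {
    QUALITY_CONTROL("qualityControl", "multiqc"),
    PIPELINE_INFORMATION("pipelineInformation", "pipeline_info"),
    // process folder names vary, so no fixed directory name can be assumed
    PROCESS_FOLDERS("processFolders", null)

    final String keyName
    final String folderName

    RootFolderKey(String keyName, String folderName) {
        this.keyName = keyName
        this.folderName = folderName
    }
}

Map<String, Object> properties = [:]
List<Map> processFolders = []
[[name: "multiqc"], [name: "pipeline_info"], [name: "salmon"]].each { Map folder ->
    RootFolderKey match = RootFolderKey.values().find { it.folderName == folder.name }
    if (match) {
        properties[match.keyName] = folder   // known folder, stored under its property key
    } else {
        processFolders << folder             // everything else is treated as a process folder
    }
}
assert properties.keySet() == ["qualityControl", "pipelineInformation"] as Set
assert processFolders*.name == ["salmon"]

Keeping the folder-to-key mapping inside the enum means adding a new required folder only touches the enum, not the parsing logic.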
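
parsePipelineInformation() is refactored the same way: instead of an exact match on software_versions.yml plus a regex for the execution report, each child file name is matched with contains() against the fileName carried by RequiredPipelineFileKeys, which covers both the fixed software_versions.yml and the timestamped execution_report_*.html. A standalone sketch of that matching rule (illustrative enum, not the project classes):

enum PipelineFileKey {
    SOFTWARE_VERSIONS("softwareVersions", "software_versions.yml"),
    EXECUTION_REPORT("executionReport", "execution_report")

    final String keyName
    final String fileName

    PipelineFileKey(String keyName, String fileName) {
        this.keyName = keyName
        this.fileName = fileName
    }
}

// resolve a file name to its property key, or null if it is not a required pipeline file
def classify = { String fileName ->
    PipelineFileKey.values().find { fileName.contains(it.fileName) }?.keyName
}

assert classify("software_versions.yml") == "softwareVersions"
assert classify("execution_report_1234-56-78_90-12-34.html") == "executionReport"
assert classify("multiqc_report.html") == null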
@@ -27,7 +26,7 @@ import java.text.ParseException * @param directory path of nf-core directory whose fileTree should be converted into a JSON String * */ -class BioinformaticAnalysisParser implements DatasetParser{ +class BioinformaticAnalysisParser implements DatasetParser { /** * Contains the associated keys of the required root directory subFolders @@ -36,19 +35,28 @@ class BioinformaticAnalysisParser implements DatasetParser * @since 1.8.0 */ enum RequiredRootFolderKeys { - QUALITY_CONTROL("qualityControl"), - PIPELINE_INFORMATION("pipelineInformation"), - PROCESS_FOLDERS("processFolders") + QUALITY_CONTROL("qualityControl", "multiqc"), + PIPELINE_INFORMATION("pipelineInformation", "pipeline_info"), + //Process_Folder names can vary so no directory name can be assumed for now + PROCESS_FOLDERS("processFolders", null) private String keyName - RequiredRootFolderKeys(String keyName) { + private String folderName + + RequiredRootFolderKeys(String keyName, String folderName) { this.keyName = keyName + this.folderName = folderName } String getKeyName() { return this.keyName } + + String getFolderName() { + return this.folderName + } + } /** @@ -58,18 +66,24 @@ class BioinformaticAnalysisParser implements DatasetParser * @since 1.8.0 */ enum RequiredRootFileKeys { - RUN_ID("runId"), - SAMPLE_ID("sampleIds"), + RUN_ID("runId", "run_id.txt"), + SAMPLE_ID("sampleIds", "sample_ids.txt") private String keyName + private String fileName - RequiredRootFileKeys(String keyName) { + RequiredRootFileKeys(String keyName, String fileName) { this.keyName = keyName + this.fileName = fileName } String getKeyName() { return this.keyName } + + String getFileName() { + return this.fileName + } } /** @@ -79,19 +93,25 @@ class BioinformaticAnalysisParser implements DatasetParser * @since 1.8.0 */ enum RequiredPipelineFileKeys { - SOFTWARE_VERSIONS("softwareVersions"), - EXECUTION_REPORT("executionReport"), + SOFTWARE_VERSIONS("softwareVersions", "software_versions.yml"), + EXECUTION_REPORT("executionReport", "execution_report"), private String keyName - RequiredPipelineFileKeys(String keyName) { + private String fileName + + RequiredPipelineFileKeys(String keyName, String fileName) { this.keyName = keyName + this.fileName = fileName } String getKeyName() { return this.keyName } + String getFileName() { + return this.fileName + } } /** {@InheritDoc} */ @@ -139,32 +159,23 @@ class BioinformaticAnalysisParser implements DatasetParser List processFolders = [] rootChildren.each { currentChild -> if (currentChild.containsKey("children")) { - //folder + //directory String folderName = currentChild.get("name") - switch (folderName) { - case "multiqc": - insertAsProperty(map, currentChild, RequiredRootFolderKeys.QUALITY_CONTROL.getKeyName()) - break - case "pipeline_info": + RequiredRootFolderKeys requiredRootFolderKeys = RequiredRootFolderKeys.values().find { rootFolderKeys -> (rootFolderKeys.getFolderName() == folderName) } + if (requiredRootFolderKeys) { + if (requiredRootFolderKeys == RequiredRootFolderKeys.PIPELINE_INFORMATION) { parsePipelineInformation(currentChild) - insertAsProperty(map, currentChild, RequiredRootFolderKeys.PIPELINE_INFORMATION.getKeyName()) - break - default: - processFolders.add(currentChild) - break + } + insertAsProperty(map, currentChild, requiredRootFolderKeys.getKeyName()) + } else { + processFolders.add(currentChild) } } else if (currentChild.containsKey("fileType")) { //file - switch (currentChild.get("name")) { - case "run_id.txt": - insertAsProperty(map, currentChild, 
RequiredRootFileKeys.RUN_ID.getKeyName()) - break - case "sample_ids.txt": - insertAsProperty(map, currentChild, RequiredRootFileKeys.SAMPLE_ID.getKeyName()) - break - default: - //ignore other files - break + String fileName = currentChild.get("name") + RequiredRootFileKeys requiredRootFileKeys = RequiredRootFileKeys.values().find { rootFileKeys -> (rootFileKeys.getFileName() == fileName) } + if (requiredRootFileKeys) { + insertAsProperty(map, currentChild, requiredRootFileKeys.getKeyName()) } } } @@ -200,12 +211,10 @@ class BioinformaticAnalysisParser implements DatasetParser private static void parsePipelineInformation(Map pipelineInformation) { pipelineInformation.get("children").each { Map child -> - String filename = child.get("name") - if(filename.equals("software_versions.yml")){ - insertAsProperty(pipelineInformation, child, RequiredPipelineFileKeys.SOFTWARE_VERSIONS.getKeyName()) - } - else if(filename.matches("^execution_report.*")) { - insertAsProperty(pipelineInformation, child, RequiredPipelineFileKeys.EXECUTION_REPORT.getKeyName()) + String fileName = child.get("name") + RequiredPipelineFileKeys requiredPipelineFileKeys = RequiredPipelineFileKeys.values().find { pipelineFileKeys -> (fileName.contains(pipelineFileKeys.fileName)) } + if (requiredPipelineFileKeys) { + insertAsProperty(pipelineInformation, child, requiredPipelineFileKeys.getKeyName()) } } } @@ -218,7 +227,7 @@ class BioinformaticAnalysisParser implements DatasetParser * @since 1.8.0 */ private static void insertAsProperty(Map parent, Object content, String propertyName) { - parent.put(propertyName,content) + parent.put(propertyName, content) } /** @@ -236,7 +245,7 @@ class BioinformaticAnalysisParser implements DatasetParser /** * Method which checks if a given Json String matches a given Json schema * @param json Json String which will be compared to schema - * @param path to Json schema for validation of Json String + * @param path to Json schema for validation of Json String * @throws org.everit.json.schema.ValidationException */ private static void validateJson(String json) throws ValidationException { @@ -258,6 +267,7 @@ class BioinformaticAnalysisParser implements DatasetParser /* * Converts a file tree into a json object. 
*/ + private static class DirectoryConverter { /** @@ -321,11 +331,11 @@ class BioinformaticAnalysisParser implements DatasetParser private static Map convertToRelativePaths(Map content, Path root) { //Since each value in the root map is a map we need to iterate over each key/value pair - content["path"] = toRelativePath(content["path"] as String, root) - if (content["children"]) { - // Children always contains a map, so convert recursively - content["children"] = (content["children"] as List).collect { convertToRelativePaths(it as Map, root) } - } + content["path"] = toRelativePath(content["path"] as String, root) + if (content["children"]) { + // Children always contains a map, so convert recursively + content["children"] = (content["children"] as List).collect { convertToRelativePaths(it as Map, root) } + } return content } @@ -351,8 +361,8 @@ class BioinformaticAnalysisParser implements DatasetParser def convertedFile = [ - "name" : name, - "path" : path, + "name" : name, + "path" : path, "fileType": fileType ] return convertedFile diff --git a/src/main/groovy/life/qbic/utils/NanoporeParser.groovy b/src/main/groovy/life/qbic/utils/NanoporeParser.groovy index 8362a035..29a212d3 100644 --- a/src/main/groovy/life/qbic/utils/NanoporeParser.groovy +++ b/src/main/groovy/life/qbic/utils/NanoporeParser.groovy @@ -42,9 +42,7 @@ class NanoporeParser { for (File hiddenFile : hiddenFiles) { deleteFile(hiddenFile) } - return convertedExperiment - } private static void deleteFile(File file) { diff --git a/src/test/groovy/life/qbic/utils/BioinformaticAnalysisSpec.groovy b/src/test/groovy/life/qbic/utils/BioinformaticAnalysisSpec.groovy index d3185ae8..008c51c8 100644 --- a/src/test/groovy/life/qbic/utils/BioinformaticAnalysisSpec.groovy +++ b/src/test/groovy/life/qbic/utils/BioinformaticAnalysisSpec.groovy @@ -62,8 +62,45 @@ class BioinformaticAnalysisSpec extends Specification { ExecutionReport executionReport = pipelineInfo.getExecutionReport() assert executionReport.getRelativePath() == "./pipeline_info/execution_report_1234-56-78_90-12-34.html" assert executionReport.getName() == "execution_report_1234-56-78_90-12-34.html" + } + def "parsing a valid file structure without a run_id also returns a NfCorePipelineResult object"() { + given: "A valid nf-core pipeline output data structure" + def pathToDirectory = Paths.get(exampleDirectoriesRoot, "validates-no-run-id") + when: "we parse this valid structure" + NfCorePipelineResult nfCorePipelineResult = bioinformaticAnalysisParser.parseFrom(pathToDirectory) + then: "we expect no exception should be thrown" + assert nfCorePipelineResult instanceof NfCorePipelineResult + //Root files can be parsed + assert !nfCorePipelineResult.runId + assert !nfCorePipelineResult.runId + assert nfCorePipelineResult.sampleIds.getRelativePath() == "./sample_ids.txt" + assert nfCorePipelineResult.sampleIds.getName()== "sample_ids.txt" + //Root Folder can be parsed + QualityControlFolder multiQc = nfCorePipelineResult.getQualityControlFolder() + assert multiQc.getRelativePath() == "./multiqc" + assert multiQc.getName() == "multiqc" + assert multiQc instanceof DataFolder + PipelineInformationFolder pipelineInfo = nfCorePipelineResult.getPipelineInformation() + assert pipelineInfo.getRelativePath() == "./pipeline_info" + assert pipelineInfo.getName() == "pipeline_info" + assert pipelineInfo instanceof DataFolder + + List processFolders = nfCorePipelineResult.getProcessFolders() + assert processFolders[0].getRelativePath()== "./salmon" + assert processFolders[0].getName() == 
"salmon" + assert processFolders[0] instanceof DataFolder + + //Files in Root folders can be parsed + + SoftwareVersions softwareVersions = pipelineInfo.getSoftwareVersions() + assert softwareVersions.getRelativePath() == "./pipeline_info/software_versions.yml" + assert softwareVersions.getName() == "software_versions.yml" + + ExecutionReport executionReport = pipelineInfo.getExecutionReport() + assert executionReport.getRelativePath() == "./pipeline_info/execution_report_1234-56-78_90-12-34.html" + assert executionReport.getName() == "execution_report_1234-56-78_90-12-34.html" } def "parsing an invalid file structure throws DatasetValidationException"() { diff --git a/src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/multiqc/star_salmon/multiqc_report.html b/src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/multiqc/star_salmon/multiqc_report.html new file mode 100644 index 00000000..e69de29b diff --git a/src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/pipeline_info/execution_report_1234-56-78_90-12-34.html b/src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/pipeline_info/execution_report_1234-56-78_90-12-34.html new file mode 100644 index 00000000..e69de29b diff --git a/src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/pipeline_info/software_versions.yml b/src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/pipeline_info/software_versions.yml new file mode 100644 index 00000000..e69de29b diff --git a/src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/salmon/salmon.merged.gene_tpm.tsv b/src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/salmon/salmon.merged.gene_tpm.tsv new file mode 100644 index 00000000..e69de29b diff --git a/src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/sample_ids.txt b/src/test/resources/dummyFileSystem/bioinformatic-analysis-output/validates-no-run-id/sample_ids.txt new file mode 100644 index 00000000..e69de29b