Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable NanoporeParser to validate and parse pod5 based nanopore structures #126

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@
<dependency>
<artifactId>data-model-lib</artifactId>
<groupId>life.qbic</groupId>
<version>2.25.0</version>
<version>2.27.0</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
Expand Down
42 changes: 25 additions & 17 deletions src/main/groovy/life/qbic/utils/NanoporeParser.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package life.qbic.utils
import com.fasterxml.jackson.databind.ObjectMapper
import groovy.json.JsonSlurper
import groovy.util.logging.Log4j2
import life.qbic.datamodel.instruments.OxfordNanoporeInstrumentOutputDoradoMinimal
import life.qbic.datamodel.instruments.OxfordNanoporeInstrumentOutputMinimal
import net.jimblackler.jsonschemafriend.Schema
import net.jimblackler.jsonschemafriend.SchemaStore
Expand All @@ -14,7 +15,6 @@ import java.nio.file.Path
import java.nio.file.Paths
import java.text.ParseException
import life.qbic.datamodel.datasets.OxfordNanoporeExperiment

import java.util.stream.Collectors

@Log4j2
Expand Down Expand Up @@ -95,8 +95,8 @@ class NanoporeParser {
jsonStarted = true
}
if (jsonStarted) {
def split = line.replaceAll("\\s+","").split(":")
if(split.size() == 2 && split[1].replaceAll('"',"").size() <= 1){
def split = line.replaceAll("\\s+", "").split(":")
if (split.size() == 2 && split[1].replaceAll('"', "").size() <= 1) {
log.info("Metadata value ${split[0]} missing in ${reportFile["path"]}")
}
buffer.append(line)
Expand All @@ -110,12 +110,11 @@ class NanoporeParser {
new File(Paths.get(root.toString(), summaryFile["path"].toString()) as String)
.readLines().each { line ->
def split = line.split("=")
if(split.size() > 1){
if (split.size() > 1) {
finalMetaData[split[0]] = split[1]
}
else {
} else {
log.info("Metadata value ${split[0]} missing in ${summaryFile["path"]}, defaulting to empty value")
finalMetaData[split[0]] = ""
finalMetaData[split[0]] = ""
}
}
return finalMetaData
Expand Down Expand Up @@ -178,18 +177,27 @@ class NanoporeParser {
* @throws net.jimblackler.jsonschemafriend.ValidationException
*/
private static void validateJson(String json) throws ValidationException {
    // Deserialize the JSON text into a generic object tree that the validator can inspect
    ObjectMapper objectMapper = new ObjectMapper()
    Object jsonObject = objectMapper.readValue(json, Object)

    SchemaStore schemaStore = new SchemaStore()
    Validator validator = new Validator()
    try {
        // Validate against Fast5 Based Oxford Measurement
        Schema schema = schemaStore.loadSchema(OxfordNanoporeInstrumentOutputMinimal.getSchemaAsStream())
        validator.validate(schema, jsonObject)
    } catch (ValidationException ignored) {
        // The Fast5 schema did not match, so validate against Pod5 Based Oxford Measurement instead.
        // The first failure is intentionally dropped; a failure of this second attempt
        // propagates to the caller as the ValidationException declared by this method.
        Schema schema = schemaStore.loadSchema(OxfordNanoporeInstrumentOutputDoradoMinimal.getSchemaAsStream())
        validator.validate(schema, jsonObject)
    }
}

/*
* Converts a file tree into a json object.
*/

private static class DirectoryConverter {
private static final PREDEFINED_EXTENSIONS = ["fastq.gz"]
private static final IGNORED_FOLDERNAMES = ["qc"]
Expand Down Expand Up @@ -239,11 +247,11 @@ class NanoporeParser {
List<File> children = currentDirectory.listFiles()

List<File> visibleChildren = children.stream()
.filter(file -> !file.isHidden()).collect(Collectors.toList());
.filter(file -> !file.isHidden()).collect(Collectors.toList())

for (File file : children) {
if (!visibleChildren.contains(file)) {
hiddenFiles.add(file);
hiddenFiles.add(file)
}
}

Expand All @@ -252,11 +260,11 @@ class NanoporeParser {
return !IGNORED_FOLDERNAMES.contains(currentFolderName)
}.collect {
file ->
if (file.isFile()) {
convertFile(file.toPath())
} else if (file.isDirectory()) {
convertDirectory(file.toPath())
}
if (file.isFile()) {
convertFile(file.toPath())
} else if (file.isDirectory()) {
convertDirectory(file.toPath())
}
}

def convertedDirectory = [
Expand Down
35 changes: 35 additions & 0 deletions src/test/groovy/life/qbic/utils/NanoporeParserSpec.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,41 @@ class NanoporeParserSpec extends Specification {
thrown(ValidationException)
}

def "parsing a valid minimal file structure for dorado based basecalling containing additional unknown files and folder still returns an OxfordNanoporeExperiment Object"() {
    given:
    // Example directory used as the minimal dorado based input
    def runDirectory = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_nanopore_valid_dorado_minimal")

    when:
    def parsedExperiment = NanoporeParser.parseFileStructure(runDirectory)

    then:
    parsedExperiment instanceof OxfordNanoporeExperiment
    def firstMeasurement = parsedExperiment.getMeasurements().get(0)
    // The machine host is expected to come from the report file
    firstMeasurement.getMachineHost() == "PCT0094"
    // The library preparation kit is expected to come from the summary file
    firstMeasurement.getLibraryPreparationKit() == "SQK-LSK109-XL"
}

def "parsing a valid file structure for dorado based basecalling containing additional unknown files and folder still returns an OxfordNanoporeExperiment Object"() {
    given:
    // Example directory used as the dorado based input
    def runDirectory = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_nanopore_valid_dorado_example")

    when:
    def parsedExperiment = NanoporeParser.parseFileStructure(runDirectory)

    then:
    parsedExperiment instanceof OxfordNanoporeExperiment
    def firstMeasurement = parsedExperiment.getMeasurements().get(0)
    // The machine host is expected to come from the report file
    firstMeasurement.getMachineHost() == "PCT0094"
    // The library preparation kit is expected to come from the summary file
    firstMeasurement.getLibraryPreparationKit() == "SQK-LSK109-XL"
}

def "parsing an invalid minimal file structure for dorado based basecalling leads to a ValidationException"() {
    given:
    // Example directory from the "fails" fixtures; its name indicates a missing skip folder
    def pathToDirectory = Paths.get(exampleDirectoriesRoot, "fails/QABCD001AB_E12A345a01_PAE12345_missing_skip_folder")

    when:
    // The result is not bound to a variable: the call is expected to throw before returning
    NanoporeParser.parseFileStructure(pathToDirectory)

    then:
    thrown(ValidationException)
}

def "parsing the alternative valid file structure with metadata missing returns an OxfordNanoporeExperiment Object"() {
given:
def pathToDirectory = Paths.get(exampleDirectoriesRoot, "validates/QABCD001AB_E12A345a01_PAE12345_nanopore_new_minimal")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
instrument=PCT0094
position=1-A3-D3
flow_cell_id=PAE24142
sample_id=QNANO027AE_E19D023a01_200211
protocol_group_id=20200211_QNANO
protocol=sequencing/sequencing_PRO002_DNA:FLO-PRO002:SQK-LSK109-XL
protocol_run_id=5a7cfc2a-81b0-412d-baa0-51b939cd8e76
acquisition_run_id=c6028297dff19d01e7c5fba6487de807d1e99c05
started=2020-02-11T15:52:10.465982+01:00
acquisition_stopped=2020-02-14T08:39:54.688916+01:00
processing_stopped=2020-02-14T08:39:58.804639+01:00
basecalling_enabled=1
sequencing_summary_file=sequencing_summary_PAE24142_c6028297.txt
fast5_files_in_final_dest=2189
fast5_files_in_fallback=0
fastq_files_in_final_dest=2189
fastq_files_in_fallback=0
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
Tracking ID
===========

{
"asic_id": "0004A30B0022C63E",
"asic_id_eeprom": "0004A30B0022C63E",
"asic_temp": "32.631687",
"asic_version": "Unknown",
"auto_update": "0",
"auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/",
"bream_is_standard": "0",
"configuration_version": "1.0.7",
"device_id": "1-E9-H9",
"device_type": "promethion",
"distribution_status": "stable",
"distribution_version": "19.12.5",
"exp_script_name": "N/A",
"exp_script_purpose": "sequencing_run",
"exp_start_time": "2020-01-28T15:17:38Z",
"flow_cell_id": "PAE26989",
"flow_cell_product_code": "FLO-PRO002",
"guppy_version": "3.2.8+bd67289",
"heatsink_temp": "36.179111",
"hostname": "PCT0094",
"hublett_board_id": "0132136faade2e15",
"hublett_firmware_version": "2.0.12",
"installation_type": "nc",
"ip_address": "",
"local_firmware_file": "1",
"mac_address": "",
"operating_system": "ubuntu 16.04",
"protocol_group_id": "20200128_QNANO",
"protocol_run_id": "",
"protocols_version": "4.3.16",
"run_id": "db9e9383d44d80bbe1e2600c7a7419056610d46d",
"sample_id": "QNANO036AD_E19D023b04",
"satellite_board_id": "0000000000000000",
"satellite_firmware_version": "2.0.12",
"usb_config": "firm_1.2.3_ware#rbt_4.5.6_rbt#ctrl#USB3",
"version": "3.6.1"
}

Duty Time
=========

ID: db9e9383d44d80bbe1e2600c7a7419056610d46d

Channel State,Experiment Time (minutes),State Time (samples),
strand,0,144832342
strand,1,158421270
strand,2,378095352
strand,3,472685319
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
instrument=PCT0094
position=1-A3-D3
flow_cell_id=PAE24142
sample_id=QNANO027AE_E19D023a01_200211
protocol_group_id=20200211_QNANO
protocol=sequencing/sequencing_PRO002_DNA:FLO-PRO002:SQK-LSK109-XL
protocol_run_id=5a7cfc2a-81b0-412d-baa0-51b939cd8e76
acquisition_run_id=c6028297dff19d01e7c5fba6487de807d1e99c05
started=2020-02-11T15:52:10.465982+01:00
acquisition_stopped=2020-02-14T08:39:54.688916+01:00
processing_stopped=2020-02-14T08:39:58.804639+01:00
basecalling_enabled=1
sequencing_summary_file=sequencing_summary_PAE24142_c6028297.txt
fast5_files_in_final_dest=2189
fast5_files_in_fallback=0
fastq_files_in_final_dest=2189
fastq_files_in_fallback=0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
Tracking ID
===========

{
"asic_id": "0004A30B0022C63E",
"asic_id_eeprom": "0004A30B0022C63E",
"asic_temp": "32.631687",
"asic_version": "Unknown",
"auto_update": "0",
"auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/",
"bream_is_standard": "0",
"configuration_version": "1.0.7",
"device_id": "1-E9-H9",
"device_type": "promethion",
"distribution_status": "stable",
"distribution_version": "19.12.5",
"exp_script_name": "N/A",
"exp_script_purpose": "sequencing_run",
"exp_start_time": "2020-01-28T15:17:38Z",
"flow_cell_id": "PAE26989",
"flow_cell_product_code": "FLO-PRO002",
"guppy_version": "3.2.8+bd67289",
"heatsink_temp": "36.179111",
"hostname": "PCT0094",
"hublett_board_id": "0132136faade2e15",
"hublett_firmware_version": "2.0.12",
"installation_type": "nc",
"ip_address": "",
"local_firmware_file": "1",
"mac_address": "",
"operating_system": "ubuntu 16.04",
"protocol_group_id": "20200128_QNANO",
"protocol_run_id": "",
"protocols_version": "4.3.16",
"run_id": "db9e9383d44d80bbe1e2600c7a7419056610d46d",
"sample_id": "QNANO036AD_E19D023b04",
"satellite_board_id": "0000000000000000",
"satellite_firmware_version": "2.0.12",
"usb_config": "firm_1.2.3_ware#rbt_4.5.6_rbt#ctrl#USB3",
"version": "3.6.1"
}

Duty Time
=========

ID: db9e9383d44d80bbe1e2600c7a7419056610d46d

Channel State,Experiment Time (minutes),State Time (samples),
strand,0,144832342
strand,1,158421270
strand,2,378095352
strand,3,472685319
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is some text
Loading
Loading