From 202c9086e12f225de3a9d8eadc5146452026effc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Laura=20R=2E=20Botigu=C3=A9?= Date: Fri, 27 Jan 2017 12:09:46 +0000 Subject: [PATCH 1/3] First test to introduce mutations to fastq files --- PAFTOL_pipeline/mutateFastq.py | 31 ++++++++++++++++ PAFTOL_pipeline/test_HybPiper_Performance.py | 31 ++++++++++++++++ PAFTOL_pipeline/test_HybPiper_Pipeline.py | 38 ++++++++++++++++++++ 3 files changed, 100 insertions(+) create mode 100644 PAFTOL_pipeline/mutateFastq.py create mode 100644 PAFTOL_pipeline/test_HybPiper_Performance.py create mode 100644 PAFTOL_pipeline/test_HybPiper_Pipeline.py diff --git a/PAFTOL_pipeline/mutateFastq.py b/PAFTOL_pipeline/mutateFastq.py new file mode 100644 index 0000000..588b3f3 --- /dev/null +++ b/PAFTOL_pipeline/mutateFastq.py @@ -0,0 +1,31 @@ +"""AIM: The goal of this script is to randomly introduce variation into a series of fastq files. +""" + +from Bio import SeqIO +import random +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.Seq import MutableSeq + +inFile = SeqIO.parse("/Users/laurabotigue/HybPiper/test_dataset/EG30_R1_test.fastq", "fastq") +outFile= open("/Users/laurabotigue/HybPiper/test_dataset/EG30_R1_iter2.fastq", "w") + +bases=['A','C','G','T'] + +for read in inFile: + len_seq=len(read.seq) + pos = random.randint(0,len_seq-1) + bases.remove(str(read.seq[pos])) #check if using dictionary instead of list is more elegant. + mut = random.sample(bases, 1) + output_seq = read.seq.tomutable() + output_seq[pos] = mut[0] + bases=['A','C','G','T'] ## This is to restore the poss number of mutations + + new_seq = SeqRecord(output_seq, id=read.id, description=read.description) + new_seq.letter_annotations["phred_quality"]=read.letter_annotations["phred_quality"] # A rather convoluted way to pair the new sequence with the old fastq information + + SeqIO.write(new_seq, outFile, "fastq") + +outFile.close() + + diff --git a/PAFTOL_pipeline/test_HybPiper_Performance.py b/PAFTOL_pipeline/test_HybPiper_Performance.py new file mode 100644 index 0000000..83fe7c3 --- /dev/null +++ b/PAFTOL_pipeline/test_HybPiper_Performance.py @@ -0,0 +1,31 @@ +"""AIM: The goal of this piece of code is to test the performance of HybPiper +as the test sequences are more and more different from the target sequences. +The motivation is to explore scenarios where the species we are using is +very diverged from the ones used to generate the baits of the HybSeq array. + +This pipeline uses files within test_dataset directory in HybPiper. We will create +a loop and each time we are going to randomly introduce variation in the test_sequences. + +Every time we will generate heatmap and summary stats to explore how HybPiper performance changes. + +Notes for me: +We need to be extracareful to keep the HybPiper hierarchical directory structure, otherwise +it will fail. +Every new file should be created within the test_dataset folder. + +Remember to change R code accordingly when we do the LOOP! +Is mutable module or class? +""" +import os + +# We first want to run HybPiper's pipeline as it is +os.system("Rscript multiSampleLoop.R") + +# Now we generate heatmap and summstats files. +os.system("python ../get_seq_lengths.py test_targets.fasta namelist.txt dna > test_seq_lengths.txt") +os.system("Rscript gene_recovery_heatmap.R") +os.system("python hybpiper_stats.py test_seq_lengths.txt namelist.txt > test_stats.txt") + +# Now we change the test reads fastq. We plan to use the mutable module or class. + + diff --git a/PAFTOL_pipeline/test_HybPiper_Pipeline.py b/PAFTOL_pipeline/test_HybPiper_Pipeline.py new file mode 100644 index 0000000..efbb137 --- /dev/null +++ b/PAFTOL_pipeline/test_HybPiper_Pipeline.py @@ -0,0 +1,38 @@ + +"""AIM: The goal of this piece of code is to determine how the performance of HybPiper changes +as the sequenced reads are more diverged from the target sequence. + +Tasks: We will determine this through three different steps: + +1) Use Biopython to generate a series of fastq files of paired end reads where we are randomly introducing +an ever incerasing number of variation from the target sequence. + +2) Test HybPiper and see how it performs. What proportions of the reads are aligned with the target and what proportion of them fail.""" + +from Bio import SeqIO +from Bio import Seq +import random + +R1_out=open("200bases20Read_R1.fastqc","write") +R2_out=open("200bases20Read_R2.fastqc","write") + +sample='@M00223:27:000000000-AAF1Y:1:1101:' +fwd_read='1:N:0:14' +rev_read='2:N:0:14' + +for record in SeqIO.parse("../test_dataset/test_targets.fasta", "fasta"): + seq = record.seq + totLength=len(seq) + for read in range(0,20): # We are going to simulate 20 reads for now, each=200bases long. + n1=random.randint(20000,25000) ## this number is unique to the read + n2=random.randint(1000,2000) ## this number is an unique to the read + line1=sample+":"+str(n1)+":"+str(n2) ## this will be the first line of the fastq file + point1=random.randint(0,totLength-1) + point2=random.randint(point1+200,totLength-1) + if point1 < (totLength - 200): + point2=random.randint(point1+200,totLength-1) + R=seq[point:point+200] + line2=(str(R) + else: + R=seq[point-200:point] + line2=print(str(R)) From 747a14878702b81c3fe3c926abefece29e8bc250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Laura=20R=2E=20Botigu=C3=A9?= Date: Fri, 27 Jan 2017 12:27:40 +0000 Subject: [PATCH 2/3] Cleaning the performance script --- PAFTOL_pipeline/test_HybPiper_Performance.sh | 31 ++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 PAFTOL_pipeline/test_HybPiper_Performance.sh diff --git a/PAFTOL_pipeline/test_HybPiper_Performance.sh b/PAFTOL_pipeline/test_HybPiper_Performance.sh new file mode 100644 index 0000000..f8b555e --- /dev/null +++ b/PAFTOL_pipeline/test_HybPiper_Performance.sh @@ -0,0 +1,31 @@ +"""AIM: The goal of this piece of code is to test the performance of HybPiper +as the test sequences are more and more different from the target sequences. +The motivation is to explore scenarios where the species we are using is +very diverged from the ones used to generate the baits of the HybSeq array. + +This pipeline uses files within test_dataset directory in HybPiper. We will create +a loop and each time we are going to randomly introduce variation in the test_sequences. + +Every time we will generate heatmap and summary stats to explore how HybPiper performance changes. + +Notes for me: +We need to be extracareful to keep the HybPiper hierarchical directory structure, otherwise +it will fail. +Every new file should be created within the test_dataset folder. + +Remember to change R code accordingly when we do the LOOP! +Is mutable module or class? +""" +## PENDING: Generate the loop across iterations with appropriate variables + +# We first want to run HybPiper's pipeline as it is +Rscript multiSampleLoop.R + +# Now we generate heatmap and summstats files. +python ../get_seq_lengths.py test_targets.fasta namelist.txt dna > test_seq_lengths.txt +Rscript gene_recovery_heatmap.R +python hybpiper_stats.py test_seq_lengths.txt namelist.txt > ~/HybPiper/PAFTOL_pipeline/test_stats.txt + +# Now we change the test reads fastq. We plan to use the mutable module or class. +python ~/HybPiper/PAFTOL_pipeline/mutateFastq.py + From 5500bc99112f2f0e97dcb4ade3f7034e14166edf Mon Sep 17 00:00:00 2001 From: rb-laura Date: Mon, 30 Jan 2017 12:03:46 +0000 Subject: [PATCH 3/3] Delete test_HybPiper_Performance.py Now a shells script --- PAFTOL_pipeline/test_HybPiper_Performance.py | 31 -------------------- 1 file changed, 31 deletions(-) delete mode 100644 PAFTOL_pipeline/test_HybPiper_Performance.py diff --git a/PAFTOL_pipeline/test_HybPiper_Performance.py b/PAFTOL_pipeline/test_HybPiper_Performance.py deleted file mode 100644 index 83fe7c3..0000000 --- a/PAFTOL_pipeline/test_HybPiper_Performance.py +++ /dev/null @@ -1,31 +0,0 @@ -"""AIM: The goal of this piece of code is to test the performance of HybPiper -as the test sequences are more and more different from the target sequences. -The motivation is to explore scenarios where the species we are using is -very diverged from the ones used to generate the baits of the HybSeq array. - -This pipeline uses files within test_dataset directory in HybPiper. We will create -a loop and each time we are going to randomly introduce variation in the test_sequences. - -Every time we will generate heatmap and summary stats to explore how HybPiper performance changes. - -Notes for me: -We need to be extracareful to keep the HybPiper hierarchical directory structure, otherwise -it will fail. -Every new file should be created within the test_dataset folder. - -Remember to change R code accordingly when we do the LOOP! -Is mutable module or class? -""" -import os - -# We first want to run HybPiper's pipeline as it is -os.system("Rscript multiSampleLoop.R") - -# Now we generate heatmap and summstats files. -os.system("python ../get_seq_lengths.py test_targets.fasta namelist.txt dna > test_seq_lengths.txt") -os.system("Rscript gene_recovery_heatmap.R") -os.system("python hybpiper_stats.py test_seq_lengths.txt namelist.txt > test_stats.txt") - -# Now we change the test reads fastq. We plan to use the mutable module or class. - -