mossmatters · rb-laura · Jan 27, 2017 · Jan 27, 2017 · Jan 30, 2017
diff --git a/PAFTOL_pipeline/mutateFastq.py b/PAFTOL_pipeline/mutateFastq.py
@@ -0,0 +1,31 @@
+"""AIM: The goal of this script is to randomly introduce variation into a series of fastq files.
+"""
+
+from Bio import SeqIO
+import random
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import MutableSeq
+
+inFile = SeqIO.parse("/Users/laurabotigue/HybPiper/test_dataset/EG30_R1_test.fastq", "fastq")
+outFile= open("/Users/laurabotigue/HybPiper/test_dataset/EG30_R1_iter2.fastq", "w")
+
+bases=['A','C','G','T']
+
+for read in inFile:
+	len_seq=len(read.seq)
+	pos = random.randint(0,len_seq-1)
+	bases.remove(str(read.seq[pos])) #check if using dictionary instead of list is more elegant.
+	mut = random.sample(bases, 1) 
+	output_seq = read.seq.tomutable()
+	output_seq[pos] = mut[0]
+	bases=['A','C','G','T'] ## This is to restore the poss number of mutations
+
+	new_seq = SeqRecord(output_seq, id=read.id, description=read.description)
+	new_seq.letter_annotations["phred_quality"]=read.letter_annotations["phred_quality"] # A rather convoluted way to pair the new sequence with the old fastq information	
+
+	SeqIO.write(new_seq, outFile, "fastq")
+
+outFile.close()
+
+
diff --git a/PAFTOL_pipeline/test_HybPiper_Performance.sh b/PAFTOL_pipeline/test_HybPiper_Performance.sh
@@ -0,0 +1,31 @@
+"""AIM: The goal of this piece of code is to test the performance of HybPiper
+as the test sequences are more and more different from the target sequences.
+The motivation is to explore scenarios where the species we are using is 
+very diverged from the ones used to generate the baits of the HybSeq array.
+
+This pipeline uses files within test_dataset directory in HybPiper. We will create 
+a loop and each time we are going to randomly introduce variation in the test_sequences.
+
+Every time we will generate heatmap and summary stats to explore how HybPiper performance changes.
+
+Notes for me:
+We need to be extracareful to keep the HybPiper hierarchical directory structure, otherwise
+it will fail.
+Every new file should be created within the test_dataset folder.
+
+Remember to change R code accordingly  when we do the LOOP!
+Is mutable module or class?
+"""
+## PENDING: Generate the loop across iterations with appropriate variables
+
+# We first want to run HybPiper's pipeline as it is
+Rscript multiSampleLoop.R
+
+# Now we generate heatmap and summstats files.
+python ../get_seq_lengths.py test_targets.fasta namelist.txt dna > test_seq_lengths.txt
+Rscript gene_recovery_heatmap.R
+python hybpiper_stats.py test_seq_lengths.txt namelist.txt > ~/HybPiper/PAFTOL_pipeline/test_stats.txt
+
+# Now we change the test reads fastq. We plan to use the mutable module or class.
+python ~/HybPiper/PAFTOL_pipeline/mutateFastq.py 
+
diff --git a/PAFTOL_pipeline/test_HybPiper_Pipeline.py b/PAFTOL_pipeline/test_HybPiper_Pipeline.py
@@ -0,0 +1,38 @@
+
+"""AIM: The goal of this piece of code is to determine how the performance of HybPiper changes
+as the sequenced reads are more diverged from the target sequence.
+
+Tasks: We will determine this through three different steps:
+
+1) Use Biopython to generate a series of fastq files of paired end reads where we are randomly introducing
+an ever incerasing number of variation from the target sequence.
+
+2) Test HybPiper and see how it performs. What proportions of the reads are aligned with the target and what proportion of them fail."""
+
+from Bio import SeqIO
+from Bio import Seq
+import random
+
+R1_out=open("200bases20Read_R1.fastqc","write")
+R2_out=open("200bases20Read_R2.fastqc","write")
+
+sample='@M00223:27:000000000-AAF1Y:1:1101:'
+fwd_read='1:N:0:14'
+rev_read='2:N:0:14'
+
+for record in SeqIO.parse("../test_dataset/test_targets.fasta", "fasta"):
+	seq = record.seq
+	totLength=len(seq)
+	for read in range(0,20): # We are going to simulate 20 reads for now, each=200bases long.
+		n1=random.randint(20000,25000) ## this number is unique to the read
+		n2=random.randint(1000,2000) ## this number is an unique to the read
+		line1=sample+":"+str(n1)+":"+str(n2) ## this will be the first line of the fastq file
+		point1=random.randint(0,totLength-1)
+		point2=random.randint(point1+200,totLength-1)
+		if point1 < (totLength - 200):
+			point2=random.randint(point1+200,totLength-1)
+			R=seq[point:point+200]
+			line2=(str(R)
+		else:
+			R=seq[point-200:point]
+			line2=print(str(R))