From 202c9086e12f225de3a9d8eadc5146452026effc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Laura=20R=2E=20Botigu=C3=A9?= <rb.laura@gmail.com>
Date: Fri, 27 Jan 2017 12:09:46 +0000
Subject: [PATCH 1/3] First test to introduce mutations to fastq files

---
 PAFTOL_pipeline/mutateFastq.py               | 31 ++++++++++++++++
 PAFTOL_pipeline/test_HybPiper_Performance.py | 31 ++++++++++++++++
 PAFTOL_pipeline/test_HybPiper_Pipeline.py    | 38 ++++++++++++++++++++
 3 files changed, 100 insertions(+)
 create mode 100644 PAFTOL_pipeline/mutateFastq.py
 create mode 100644 PAFTOL_pipeline/test_HybPiper_Performance.py
 create mode 100644 PAFTOL_pipeline/test_HybPiper_Pipeline.py

diff --git a/PAFTOL_pipeline/mutateFastq.py b/PAFTOL_pipeline/mutateFastq.py
new file mode 100644
index 0000000..588b3f3
--- /dev/null
+++ b/PAFTOL_pipeline/mutateFastq.py
@@ -0,0 +1,31 @@
+"""AIM: The goal of this script is to randomly introduce variation into a series of fastq files.
+"""
+
+from Bio import SeqIO
+import random
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import MutableSeq
+
+inFile = SeqIO.parse("/Users/laurabotigue/HybPiper/test_dataset/EG30_R1_test.fastq", "fastq")
+outFile= open("/Users/laurabotigue/HybPiper/test_dataset/EG30_R1_iter2.fastq", "w")
+
+bases=['A','C','G','T']
+
+for read in inFile:
+	len_seq=len(read.seq)
+	pos = random.randint(0,len_seq-1)
+	bases.remove(str(read.seq[pos])) #check if using dictionary instead of list is more elegant.
+	mut = random.sample(bases, 1) 
+	output_seq = read.seq.tomutable()
+	output_seq[pos] = mut[0]
+	bases=['A','C','G','T'] ## This is to restore the poss number of mutations
+
+	new_seq = SeqRecord(output_seq, id=read.id, description=read.description)
+	new_seq.letter_annotations["phred_quality"]=read.letter_annotations["phred_quality"] # A rather convoluted way to pair the new sequence with the old fastq information	
+	
+	SeqIO.write(new_seq, outFile, "fastq")
+	
+outFile.close()
+
+
diff --git a/PAFTOL_pipeline/test_HybPiper_Performance.py b/PAFTOL_pipeline/test_HybPiper_Performance.py
new file mode 100644
index 0000000..83fe7c3
--- /dev/null
+++ b/PAFTOL_pipeline/test_HybPiper_Performance.py
@@ -0,0 +1,31 @@
+"""AIM: The goal of this piece of code is to test the performance of HybPiper
+as the test sequences are more and more different from the target sequences.
+The motivation is to explore scenarios where the species we are using is 
+very diverged from the ones used to generate the baits of the HybSeq array.
+
+This pipeline uses files within test_dataset directory in HybPiper. We will create 
+a loop and each time we are going to randomly introduce variation in the test_sequences.
+
+Every time we will generate heatmap and summary stats to explore how HybPiper performance changes.
+
+Notes for me:
+We need to be extracareful to keep the HybPiper hierarchical directory structure, otherwise
+it will fail.
+Every new file should be created within the test_dataset folder.
+
+Remember to change R code accordingly  when we do the LOOP!
+Is mutable module or class?
+"""
+import os
+
+# We first want to run HybPiper's pipeline as it is
+os.system("Rscript multiSampleLoop.R")
+
+# Now we generate heatmap and summstats files.
+os.system("python ../get_seq_lengths.py test_targets.fasta namelist.txt dna > test_seq_lengths.txt")
+os.system("Rscript gene_recovery_heatmap.R")
+os.system("python hybpiper_stats.py test_seq_lengths.txt namelist.txt > test_stats.txt")
+
+# Now we change the test reads fastq. We plan to use the mutable module or class.
+ 
+
diff --git a/PAFTOL_pipeline/test_HybPiper_Pipeline.py b/PAFTOL_pipeline/test_HybPiper_Pipeline.py
new file mode 100644
index 0000000..efbb137
--- /dev/null
+++ b/PAFTOL_pipeline/test_HybPiper_Pipeline.py
@@ -0,0 +1,38 @@
+
+"""AIM: The goal of this piece of code is to determine how the performance of HybPiper changes
+as the sequenced reads are more diverged from the target sequence.
+
+Tasks: We will determine this through three different steps:
+
+1) Use Biopython to generate a series of fastq files of paired end reads where we are randomly introducing
+an ever incerasing number of variation from the target sequence.
+
+2) Test HybPiper and see how it performs. What proportions of the reads are aligned with the target and what proportion of them fail."""
+
+from Bio import SeqIO
+from Bio import Seq
+import random
+
+R1_out=open("200bases20Read_R1.fastqc","write")
+R2_out=open("200bases20Read_R2.fastqc","write")
+
+sample='@M00223:27:000000000-AAF1Y:1:1101:'
+fwd_read='1:N:0:14'
+rev_read='2:N:0:14'
+
+for record in SeqIO.parse("../test_dataset/test_targets.fasta", "fasta"):
+	seq = record.seq
+	totLength=len(seq)
+	for read in range(0,20): # We are going to simulate 20 reads for now, each=200bases long.
+		n1=random.randint(20000,25000) ## this number is unique to the read
+		n2=random.randint(1000,2000) ## this number is an unique to the read
+		line1=sample+":"+str(n1)+":"+str(n2) ## this will be the first line of the fastq file
+		point1=random.randint(0,totLength-1)
+		point2=random.randint(point1+200,totLength-1)
+		if point1 < (totLength - 200):
+			point2=random.randint(point1+200,totLength-1)
+			R=seq[point:point+200]
+			line2=(str(R)
+		else:
+			R=seq[point-200:point]
+			line2=print(str(R))	

From 747a14878702b81c3fe3c926abefece29e8bc250 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Laura=20R=2E=20Botigu=C3=A9?= <rb.laura@gmail.com>
Date: Fri, 27 Jan 2017 12:27:40 +0000
Subject: [PATCH 2/3] Cleaning the performance script

---
 PAFTOL_pipeline/test_HybPiper_Performance.sh | 31 ++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 PAFTOL_pipeline/test_HybPiper_Performance.sh

diff --git a/PAFTOL_pipeline/test_HybPiper_Performance.sh b/PAFTOL_pipeline/test_HybPiper_Performance.sh
new file mode 100644
index 0000000..f8b555e
--- /dev/null
+++ b/PAFTOL_pipeline/test_HybPiper_Performance.sh
@@ -0,0 +1,31 @@
+"""AIM: The goal of this piece of code is to test the performance of HybPiper
+as the test sequences are more and more different from the target sequences.
+The motivation is to explore scenarios where the species we are using is 
+very diverged from the ones used to generate the baits of the HybSeq array.
+
+This pipeline uses files within test_dataset directory in HybPiper. We will create 
+a loop and each time we are going to randomly introduce variation in the test_sequences.
+
+Every time we will generate heatmap and summary stats to explore how HybPiper performance changes.
+
+Notes for me:
+We need to be extracareful to keep the HybPiper hierarchical directory structure, otherwise
+it will fail.
+Every new file should be created within the test_dataset folder.
+
+Remember to change R code accordingly  when we do the LOOP!
+Is mutable module or class?
+"""
+## PENDING: Generate the loop across iterations with appropriate variables
+
+# We first want to run HybPiper's pipeline as it is
+Rscript multiSampleLoop.R
+
+# Now we generate heatmap and summstats files.
+python ../get_seq_lengths.py test_targets.fasta namelist.txt dna > test_seq_lengths.txt
+Rscript gene_recovery_heatmap.R
+python hybpiper_stats.py test_seq_lengths.txt namelist.txt > ~/HybPiper/PAFTOL_pipeline/test_stats.txt
+
+# Now we change the test reads fastq. We plan to use the mutable module or class.
+python ~/HybPiper/PAFTOL_pipeline/mutateFastq.py 
+

From 5500bc99112f2f0e97dcb4ade3f7034e14166edf Mon Sep 17 00:00:00 2001
From: rb-laura <rb.laura@gmail.com>
Date: Mon, 30 Jan 2017 12:03:46 +0000
Subject: [PATCH 3/3] Delete test_HybPiper_Performance.py

Now a shells script
---
 PAFTOL_pipeline/test_HybPiper_Performance.py | 31 --------------------
 1 file changed, 31 deletions(-)
 delete mode 100644 PAFTOL_pipeline/test_HybPiper_Performance.py

diff --git a/PAFTOL_pipeline/test_HybPiper_Performance.py b/PAFTOL_pipeline/test_HybPiper_Performance.py
deleted file mode 100644
index 83fe7c3..0000000
--- a/PAFTOL_pipeline/test_HybPiper_Performance.py
+++ /dev/null
@@ -1,31 +0,0 @@
-"""AIM: The goal of this piece of code is to test the performance of HybPiper
-as the test sequences are more and more different from the target sequences.
-The motivation is to explore scenarios where the species we are using is 
-very diverged from the ones used to generate the baits of the HybSeq array.
-
-This pipeline uses files within test_dataset directory in HybPiper. We will create 
-a loop and each time we are going to randomly introduce variation in the test_sequences.
-
-Every time we will generate heatmap and summary stats to explore how HybPiper performance changes.
-
-Notes for me:
-We need to be extracareful to keep the HybPiper hierarchical directory structure, otherwise
-it will fail.
-Every new file should be created within the test_dataset folder.
-
-Remember to change R code accordingly  when we do the LOOP!
-Is mutable module or class?
-"""
-import os
-
-# We first want to run HybPiper's pipeline as it is
-os.system("Rscript multiSampleLoop.R")
-
-# Now we generate heatmap and summstats files.
-os.system("python ../get_seq_lengths.py test_targets.fasta namelist.txt dna > test_seq_lengths.txt")
-os.system("Rscript gene_recovery_heatmap.R")
-os.system("python hybpiper_stats.py test_seq_lengths.txt namelist.txt > test_stats.txt")
-
-# Now we change the test reads fastq. We plan to use the mutable module or class.
- 
-