-
Notifications
You must be signed in to change notification settings - Fork 4
/
qc.R
162 lines (148 loc) · 6.1 KB
/
qc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env Rscript
#--vanilla is not working
##working on collect all QC in one single file
##replace the rRNA in qc53.txt
options(stringsAsFactors=FALSE)
readqcinfo<-function(type, name)
{
read.delim(paste0("multiqc/",type,"_data/multiqc_",name,".txt"),
sep="\t",head=TRUE,row.names=1,check.names=FALSE,strip.white=TRUE)
}
checknames<-function(x,title)
{
if(sum(rownames(x)!=samples)==0) return(1)
#to fix the problem of multqc that the order of S1 is later than S10
if(sum(sort(rownames(x))!=sort(samples))==0) return(2)
stop(paste0(title," is wrong"))
}
#read star qc info the multiqc misses chimeric%, go with manully collected data from Log.final.out
star<-t(read.delim("star_align/star_QC.txt",sep="\t",row.names=1,strip.white=TRUE,check.names=FALSE))
star<-star[,c(6,7,9:17,25,27,29,30,31,34)]
colnames(star)<-c("reads","avg_input_read_length","uniquely_mapped","%uniquely_mapped","avg_mapped_read_length",
"num_splices","num_annotated_splices","num_GTAG_splices","num_GCAG_splices","num_ATAC_splices",
"num_noncanonical_splices","%multimapped","%multimapped_toomany","%unmapped_mismatches","%unmapped_tooshort",
"%unmapped_other","%chimeric")
star<-sub("%$","",star)
rownames(star)<-sub(" \\|$","",sub("^star_align \\| ","",rownames(star)))
samples<-sort(rownames(star))
star<-star[samples,]
NS<-length(samples)
#read fastqc info
fastqc<-readqcinfo("pre_align","fastqc")[,c("Total Sequences","%GC","total_deduplicated_percentage")]
All_Single<-TRUE
if(length(grep("_R2$",rownames(fastqc)))>0){
#some are paired
All_single<-FALSE
##paired, so we do the average of R1 and R2
#id<-(1:(nrow(fastqc)/2))*2
#if(sum(sub("_R2$","_R1",rownames(fastqc)[id])
# !=rownames(fastqc)[id-1])!=0){
# stop("the R1 and R2 files do not match in the fastqc info")
#}
##fastqc<-(fastqc[id-1,]+fastqc[id,])/2
lab<-sub("_R2$","_R1",rownames(fastqc))
fastqc<-apply(fastqc,2,function(x){tapply(x,lab,mean)})
}
rownames(fastqc)<-sub("_R1$","",rownames(fastqc))
#read the trim info and also clean-up fastqc when necessary
TRIM<-dir.exists("fastq_trim")
UMI<-dir.exists("fastq_attach")
if(TRIM){
trim<-readqcinfo("pre_align","cutadapt")
rownames(trim)<-sub("_R[12]$","",rownames(trim))
if(checknames(trim,"trim")==2){
trim<-trim[samples,]
}
fastqc_raw<-fastqc[grep("^fastqc_raw \\| ",rownames(fastqc)),]
rownames(fastqc_raw)<-sub("^fastqc_raw \\| ","",rownames(fastqc_raw))
if(checknames(fastqc_raw,"fastq_raw")==2){
fastqc_raw<-fastqc_raw[samples,]
}
fastqc<-fastqc[grep("^fastqc \\| ",rownames(fastqc)),]
}
rownames(fastqc)<-sub("^fastqc \\| ","",rownames(fastqc))
if(checknames(fastqc,"fastqc")==2){
fastqc<-fastqc[samples,]
}
##read qc53,note that the multiqc is not working well for collectRNAmetrics
qc53<-t(read.delim("qc53.txt",sep="\t",head=TRUE,row.names=1,check.names=FALSE))
qc53<-qc53[,-match(c("RIBOSOMAL_BASES","PCT_RIBOSOMAL_BASES","SAMPLE","LIBRARY","READ_GROUP"),
colnames(qc53)),drop=FALSE]
Nqc53<-ncol(qc53)
nqc53<-nrow(qc53)
if(nqc53==NS){
qc53<-qc53[samples,,drop=FALSE]
}else{
stop("star_qc and qc53 do not match")
}
id<-c("CODING","UTR","INTRONIC","INTERGENIC","MRNA")
loc<-match(c(paste0("PCT_",id,"_BASES"),"MEDIAN_5PRIME_TO_3PRIME_BIAS"),
colnames(qc53)[1:Nqc53])
qc53<-qc53[,loc,drop=FALSE]
id2<-tolower(id)
colnames(qc53)[1:6]<-c(paste0("%",id2),"median_5_3_bias")
id<-grep("^%",colnames(qc53))
qc53[,id]<-qc53[,id]*100
qc53<-round(qc53,dig=3)
##collect data for rRNA, globin and phix and duplicates when present
misc<-matrix(NA,NS,6)
colnames(misc)<-c("globin","rRNA","phix","picard_dup","UMI_dup","adapter_detected")
for(i in 1:NS){
SID<-samples[i]
for(j in 1:3){ #change to 4 when good
zz<-pipe(paste0("tail -1 ",colnames(misc)[j],"/",SID,".txt |tr -d %"))
misc[i,j]<-scan(zz,n=1,quiet=TRUE)
close(zz)
}
misc[i,4]<-read.delim(paste0("mark_dup/",SID,".dup_metrics"),skip=6,head=TRUE,row=1)[1,"PERCENT_DUPLICATION"]*100
if(UMI){
zz<-pipe(paste0("tail -1 star_align/",SID,"/",SID,"_dup_log.txt |cut -f 6"))
misc[i,5]<-scan(zz,n=1,quiet=TRUE)*100
close(zz)
}
if(TRIM){
zz<-pipe(paste0("grep \"with adapter\" fastq_trim/log/log.",SID,"|awk -F '[(%]' '{print $2}'"))
zval<-scan(zz,quiet=TRUE)
##future plan is to consider the paired info for each indivudal files
#if(length(zval)!= PAIRED+1){
# stop("the fastq_trim log for the contained adapter% is not consistent with the pairedness of the fastq data for sample", SID)
#}
misc[i,6]<-mean(zval)
close(zz)
}
}
misc<-round(misc,dig=3)
colnames(misc)<-paste0("%",colnames(misc))
##Read the chr_info.txt
chr_info<-matrix(NA,NS,5)
colnames(chr_info)<-c("chrX","chrY","chrM","chrAuto","contig")
readchr<-function(sid){
x<-read.delim(paste0("star_align/",sid,"/chr_info.txt"),
sep="\t",row.names=1,header=FALSE)
xt<-sum(x[,2])
y<-x[c("chrX","chrY","chrM"),2]/xt*100
y<-c(y,sum(x[grep("^chr[1-9]",rownames(x)),2])/xt*100)
c(y,100-sum(y))
}
for(i in 1:NS){
chr_info[i,1:5]<-readchr(samples[i])
}
colnames(chr_info)<-paste0("%",colnames(chr_info))
##keep chry with four digits
chr_info[,2]<-round(chr_info[,2],dig=5)
chr_info[,-2]<-round(chr_info[,-2],dig=3)
#Now putting all togther
qc<-NULL
if(TRIM) qc<-cbind("reads_raw"=fastqc_raw[,1],"%adapter_detected"=misc[,6],"%trimmed"=round(100-as.numeric(star[,1])/fastqc_raw[,"Total Sequences"]*100,dig=3),
"%trimmed_bases"=round(trim[,"percent_trimmed"],dig=3))
qc<-cbind(qc,reads=star[,1],"%GC"=round(fastqc[,"%GC"],dig=3),"%dup_sequence"=round(100-fastqc[,"total_deduplicated_percentage"],dig=3),
misc[,1:4])
if(UMI){
qc<-cbind(qc,"%umi_dup"=misc[,5])
}
qc<-cbind(qc,star[,-1],chr_info,qc53)
id<-grep("_percent$",colnames(qc))
colnames(qc)[id]<-paste0("%",sub("_percent$","",colnames(qc)[id]))
colnames(qc)<-sub("^%","pct_",colnames(qc))
#We could have write a date and the project folder here
write.table(qc,"qc_info.csv",row.names=TRUE,col.names=NA,quote=FALSE,sep=",")