-
Notifications
You must be signed in to change notification settings - Fork 1
/
nurd-prc2-common.R
executable file
·162 lines (133 loc) · 6.65 KB
/
nurd-prc2-common.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env Rscript
source("~/source/Rscripts/functions.R")# {{{
source("~/source/Rscripts/granges-functions.R")
x <- c('Repitools', 'GenomicRanges', 'rtracklayer', 'plyr', 'ggplot2', 'gridExtra', 'reshape')
lapply(x, suppressMessages(library), character.only=T)# }}}
load("/nfs/research2/bertone/user/mxenoph/genome_dir/M_musculus_9/mm9.e67.ann.Rdata")
gene_assoc_IDs <- read.table("/nfs/research2/bertone/user/mxenoph/genome_dir/M_musculus_9/MM9.maps/mm9.e67.associated.geneIDs.txt", header=TRUE, as.is=TRUE, sep="\t")
args <- commandArgs(trailingOnly=TRUE)
peaks_1 <- read.table(args[1], header= TRUE, sep="\t")
peaks_2 <- read.table(args[2], header=TRUE, sep="\t")
common_targets <- as.vector(unlist(read.table(args[3])))
de <- args[4]
# Names for args[1] and args[2] separated by ;. e.g. 'SUZ12;MBD3'
pnames <- unlist(strsplit(args[5], ';'))
descr <- args[6]
out_dir <- args[7]
# Check if 'plots' dir exists in the wdr and create if not
plot_dir <- file.path(out_dir, 'plots', '')
dir.create(plot_dir)
peaks_1 <- "/nfs/research2/bertone/user/mxenoph/hendrich/chip/marks_2012/macs/2i-Suz12_peaks.xls"
peaks_2 <- "/nfs/research2/bertone/user/mxenoph/hendrich/chip/bh/chipseq/macs/7E12-2i-M2_peaks.xls"
de <- "/nfs/research2/bertone/user/mxenoph/hendrich/htseq_counts/de_anal/WT_2ivsKO_2i.txt"
common_targets <- as.vector(unlist(read.table("/nfs/research2/bertone/user/mxenoph/hendrich/ChEA/M2-Chd4.2i.SUZ12_CommonTargets.txt")))
pnames <- unlist(strsplit('SUZ12;MBD3', ';'))
##
# TODO: Check if files are xls before using macs_to_granges# {{{
peaks <- lapply(c(peaks_1, peaks_2), function(x){
#peaks <- lapply(args[1:2], function(x){
# For macs xls files
if( grepl("\\.xls$", x) ){
macs_to_granges(x)
}else if( grepl("\\.bed$", x) ){
import(x, format='bed')
}
})
names(peaks) <- pnames
genes <- my.annotation$genes
extended_genes <- resize(promoters(genes, upstream=2000), width=width(genes)+2500)# }}}
# select for the common genes
keep <- gene_assoc_IDs[toupper(gene_assoc_IDs[["Associated.Gene.Name"]]) %in% common_targets, 'Ensembl.Gene.ID']
extended_genes <- extended_genes[keep,]
peaks_map_genes <- lapply(seq_along(peaks), function(ind){
x <- peaks[[ind]]
over <- findOverlaps(x, extended_genes)
x <- x[queryHits(over)]
df <- data.frame('Name'= names(extended_genes[subjectHits(over)]))
values(x) <- cbind(values(x), df)
x <- split(x, x$Name)
if(length(x) < length(common_targets)){
warning(paste0("Not all input targets have a peak in ", names(peaks)[ind]))
x
} #shouldn't it be greater than?
else if ( length(x) < length(extended_genes)) {
# Because one associated gene name can be mapped to more than
# one unique ensembl id
x
}
})
names(peaks_map_genes) <- pnames
# Returned genes are more than the common targets used as input. WHY ???
# Answer not all genes reported by ChEA to be targets of both MBD3 and
# SUZ12 have a SUZ12 peak in the marks dataset
#
# It's a bit weird that by intersect() you don't get the min number of
# targets of the 2 ChIP. Look into it
dist <- lapply(intersect(names(peaks_map_genes[[1]]), names(peaks_map_genes[[2]])), function(name){
summit_1 <<- sort(unlist(peaks_map_genes[[1]][[name]]$summit))
summit_2 <<- sort(unlist(peaks_map_genes[[2]][[name]]$summit))
min_dist <- NULL
max_dist <- NULL
if(length(summit_1) < length(summit_2)){
small <- summit_1
big <- summit_2
} else {
small <- summit_2
big <- summit_1
}
for(i in 1:length(small)){
tmp <- min(abs(big - small[i]))
if (i ==1 || tmp < min_dist ) min_dist <- tmp
tmp <- max(abs(big - small[i]))
if (i ==1 || tmp > max_dist ) max_dist <- tmp
}
return(data.frame('Shortest.distance'= min_dist, 'Longest.distance' = max_dist, row.names=name))
})
tmp <- rbind.fill(dist)
rownames(tmp) <- unlist(lapply(dist, row.names))
dist <- tmp
# Subsets based on the 75th percentile
perc75 <- dist[dist$Shortest.distance < quantile(dist$Shortest.distance)[4], 'Shortest.distance']
de <- deseq2vect(de)
detar <- dist[rownames(dist) %in% rownames(de$up) | rownames(dist) %in% rownames(de$down), 'Shortest.distance', drop=F]
detar[rownames(detar) %in% rownames(de$up), 'DE'] <- 'up'
detar[rownames(detar) %in% rownames(de$down), 'DE'] <- 'down'
#detar75 <- detar[detar$Shortest.distance < quantile(detar$Shortest.distance)[4],]
# Subset based on the 75 percentile for the whole set not just the DE
detar75 <- detar[detar$Shortest.distance < quantile(dist$Shortest.distance)[4],]
pdf(paste0(plot_dir, paste(pnames, collapse='-'), ".", descr, '.pdf'), paper='a4')
#p <- ggplot(data= melt(dist), aes(x=factor(variable), y=value/1000))
#p <- p + geom_violin(scale="count", colour="black", fill="black", adjust=0.5)
p <- ggplot(data=melt(dist[, 'Shortest.distance']), aes(x=value/1000))
p <- p + geom_histogram()
p <- p + theme_bw()
p <- p + theme(panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.background= element_blank(),
panel.border= element_blank(),
axis.line= element_line())
x <- ggplot(data=melt(detar75), aes(x=value/1000, fill=DE))
x <- x + geom_density(alpha=.3)
#x <- x + geom_histogram()
x <- x + theme_bw()
x <- x + theme(panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.background= element_blank(),
panel.border= element_blank(),
axis.line= element_line(),
legend.position= "top")
grid.arrange(ggplotGrob(p), ggplotGrob(p %+% melt(perc75)), ggplotGrob(x),
nrow=1, ncol=3,
main= paste0("Distance (min) between ", paste(pnames, collapse="-"), " in ", descr))
grid.newpage()
de$df[is.na(de$df$padj), 'padj'] <- 1
de$df[, 'de'] <- de$df$padj < 0.05
de$df[, 'common'] <- rownames(de$df) %in% gene_assoc_IDs[toupper(gene_assoc_IDs[['Associated.Gene.Name']]) %in% common_targets, 'Ensembl.Gene.ID']
dev.off()
pp <- list('a'=p, 'b'=p %+% melt(perc75), 'c'=x)
save(pp, file=paste0("PlotObj-", descr, ".Rda"))
# do.call("grid.arrange", c(pp, ncol=2))
#selecting loci based on length (1/4) of overlaps
#wd <- width(pintersect(peaks[[1]][over$queryHits], genes[o$subjectHits]))
#ok <- wd > width(peaks[[1]][queryHits(h)]) / 4
#d <- over[ok]