annotation and ora for diffbind, fix typo in DEG.Rmd

bcbio · Oct 11, 2024 · 1e718b3 · 1e718b3
1 parent a0ad6c9
commit 1e718b3
Show file tree

Hide file tree

Showing 3 changed files with 117 additions and 5 deletions.
diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd
@@ -103,6 +103,8 @@ library(qs)
 library(EnhancedVolcano)
 library(ggprism)
 library(ChIPseeker)
+library(msigdbr)
+library(fgsea)
 
 if (params$species == 'mouse'){
   library(TxDb.Mmusculus.UCSC.mm10.knownGene)
@@ -265,9 +267,15 @@ results_anno_batch <- annotatePeakInBatch(results_report,
 
 results_anno_batch_df <- results_anno_batch %>% as.data.frame()
 
-entrez_to_symbol <- AnnotationDbi::select(org.Mm.eg.db, results_anno_batch_df$feature, 
-                                          "ENTREZID", columns = 'SYMBOL') %>%
-  filter(!is.na(ENTREZID)) %>% distinct()
+if(params$species == 'mouse'){
+  entrez_to_symbol <- AnnotationDbi::select(org.Mm.eg.db, results_anno_batch_df$feature, 
+                                            "ENTREZID", columns = 'SYMBOL') %>%
+    filter(!is.na(ENTREZID)) %>% distinct()
+} else if (params$species == 'human'){
+  entrez_to_symbol <- AnnotationDbi::select(org.Hs.eg.db, results_anno_batch_df$feature, 
+                                            "ENTREZID", columns = 'SYMBOL') %>%
+    filter(!is.na(ENTREZID)) %>% distinct()
+}
 
 results_anno_batch_df <- results_anno_batch_df %>% 
   left_join(entrez_to_symbol %>% dplyr::select(feature = ENTREZID, gene_name = SYMBOL))
@@ -316,10 +324,12 @@ results_mod <- results_sig_anno_batch_df %>%
 # show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "gene_name")])
 
 show <- results_mod %>% filter(!is.na(gene_name)) %>% slice_min(n = 6, order_by = FDR)
+
+results_mod <- results_mod %>% mutate(gene_name = ifelse(peak %in% show$peak  , gene_name, NA))
 EnhancedVolcano(results_mod,
                 lab= results_mod$gene_name,
                 pCutoff = 0.05, 
-                # selectLab = c(show$gene_name),
+                selectLab = c(show$gene_name),
                 FCcutoff = 0.5,
                 x = 'Fold',
                 y = 'FDR', 
@@ -369,6 +379,64 @@ plotDistToTSS(results_sig_anno)
 
 # Functional Enrichment
 
+Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially bound genes (DEGs) from ChIP-seq. Adventages of ORA:
+
+- Simplicity: Easy to perform and interpret.
+- Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied.
+- Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets.
+
+```{r get databases}
+if(params$species == 'human'){
+  all_in_life=get_databases()
+} else if (params$species == 'mouse'){
+  all_in_life = get_databases('Mus musculus')
+}
+```
+
+```{r ora}
+
+universe_mapping = results_anno_batch_df %>% 
+    filter(!is.na(FDR), !is.na(feature)) %>% 
+  dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
+
+ora_input = results_anno_batch_df %>% 
+  filter(!is.na(FDR), FDR < 0.01, abs(Fold) > 0.3, !is.na(feature)) %>% 
+  dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
+all = run_fora(ora_input, universe_mapping, all_in_life)
+
+ora_input = results_anno_batch_df %>% 
+  filter(!is.na(FDR), FDR < 0.01, Fold > 0.3, !is.na(feature)) %>% 
+  dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
+up = run_fora(ora_input, universe_mapping, all_in_life)
+
+ora_input = results_anno_batch_df %>% 
+  filter(!is.na(FDR), FDR < 0.01, Fold < -0.3, !is.na(feature)) %>% 
+  dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct()
+down = run_fora(ora_input, universe_mapping, all_in_life)
+  
+```
+
+
+## Significant pathways using all DB genes
+
+```{r all pathways}
+all %>% sanitize_datatable()
+```
+
+
+## Significant pathways using increased DB genes
+
+```{r up pathways}
+up %>% sanitize_datatable()
+```
+
+
+## Significant pathways using decreased DB genes
+
+```{r down pathways, results='asis'}
+down %>% sanitize_datatable()
+```
+
 # R session
 
 List and version of tools used for the report generation.

diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R
@@ -114,3 +114,47 @@ make_diffbind_samplesheet <- function(coldata, bam_dir, peaks_dir, column = NULL
 
   return(samplesheet)
 }
+
+get_databases=function(sps="human"){
+  all_in_life=list(
+    msigdbr(species = sps, category = "H") %>% mutate(gs_subcat="Hallmark"),
+    #  msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"),
+    msigdbr(species = sps, category = "C2", subcategory = "CP:KEGG"),
+    #  msigdbr(species = "human", category = "C2", subcategory = "CP:PID"),
+    msigdbr(species = sps, category = "C5", subcategory = "GO:BP"),
+    msigdbr(species = sps, category = "C5", subcategory = "GO:MF")
+    #  msigdbr(species = "human", category = "C5", subcategory = "HPO"),
+    #  msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"),
+    #  msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic")
+  )
+  all_in_life
+}
+
+run_fora=function(input, uni,all_in_life){
+  # browser()
+  total_deg=length(unique(input))/length(unique(uni$ENTREZID))
+  pathways_ora_all = lapply(all_in_life, function(p){
+    pathway = split(x = p$entrez_gene, f = p$gs_name)
+    db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":")
+    respath <- fora(pathways = pathway,
+                    genes = unique(input$ENTREZID),
+                    universe = unique(uni$ENTREZID),
+                    minSize  = 15,
+                    maxSize  = 500)
+    # coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1],
+    #                                    pathway, unique(input$ENTREZID), unique(uni$ENTREZID))
+    as_tibble(respath)  %>%
+      mutate(database=db_name, NES=(overlap/size)/(total_deg))
+  }) %>% bind_rows() %>%
+    mutate(analysis="ORA")
+  ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>%
+    group_by(pathway) %>%
+    left_join(uni, by =c("overlapGenes"="ENTREZID")) %>%
+    dplyr::select(pathway, padj, NES, SYMBOL, analysis,
+                  database) %>%
+    group_by(pathway,padj,NES,database,analysis) %>%
+    summarise(genes=paste(SYMBOL,collapse = ","))
+  ora_tb
+
+}
+
diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd
@@ -604,7 +604,7 @@ fa_list=lapply(de_list,function(contrast){
   input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))
   up=run_fora(input_entrezid, universe_mapping,all_in_life)
 
-  ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc<0.3) %>% pull(gene_id)
+  ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc<-0.3) %>% pull(gene_id)
   #change to the right species
   input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))
   down=run_fora(input_entrezid, universe_mapping,all_in_life)