From dd1b5b5a274fd6dc18f467232bb95f0f33f63cb6 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 29 May 2024 07:25:29 -0400 Subject: [PATCH 01/93] Initial commit --- .gitignore | 8 +- DESCRIPTION | 6 +- NAMESPACE | 3 + R/bcbioR-package.R | 2 + R/hello.R | 153 - R/helpers.R | 261 + R/orgs.R | 0 README.md | 3 +- inst/orgs/hcbc.yml | 4 + .../templates/rnaseq/skeleton/DE/DEG.Rmd | 5 + inst/templates/base/.gitignore | 8 + inst/templates/base/README.md | 39 + inst/templates/base/code/placeholder.R | 0 inst/templates/base/information.R | 6 + inst/templates/base/meta/placeholder.R | 0 inst/templates/base/scripts/placeholder | 0 inst/templates/rnaseq/de/DEG.Rmd | 443 ++ .../rnaseq/de/DE_tumor_vs_normal.html | 5864 +++++++++++++++++ .../rnaseq/de/Multiplicative_DE_docs.md | 129 + .../rnaseq/de/PCA_variance_analysis.Rmd | 32 + inst/templates/rnaseq/de/load_data.R | 146 + inst/templates/rnaseq/de/params_de-example.R | 18 + inst/templates/rnaseq/de/params_de.R | 22 + inst/templates/rnaseq/de/run_markdown.R | 32 + inst/templates/rnaseq/org/hcbc_README.md | 75 + inst/templates/rnaseq/qc/QC.Rmd | 410 ++ inst/templates/rnaseq/qc/QC_nf-core.Rmd | 583 ++ inst/templates/rnaseq/qc/params_qc.R | 4 + .../rnaseq/qc/params_qc_nf-core-example.R | 9 + inst/templates/rnaseq/qc/placeholder | 0 inst/templates/rnaseq/qc/run_markdown.R | 13 + man/bcbio_nfcore_check.Rd | 2 +- man/bcbio_set_project.Rd | 2 +- man/bcbio_templates.Rd | 2 +- 34 files changed, 8124 insertions(+), 160 deletions(-) delete mode 100644 R/hello.R create mode 100644 R/helpers.R create mode 100644 R/orgs.R create mode 100644 inst/orgs/hcbc.yml create mode 100644 inst/templates/base/.gitignore create mode 100644 inst/templates/base/README.md create mode 100644 inst/templates/base/code/placeholder.R create mode 100644 inst/templates/base/information.R create mode 100644 inst/templates/base/meta/placeholder.R create mode 100644 inst/templates/base/scripts/placeholder create mode 100644 inst/templates/rnaseq/de/DEG.Rmd create mode 100644 inst/templates/rnaseq/de/DE_tumor_vs_normal.html create mode 100644 inst/templates/rnaseq/de/Multiplicative_DE_docs.md create mode 100644 inst/templates/rnaseq/de/PCA_variance_analysis.Rmd create mode 100644 inst/templates/rnaseq/de/load_data.R create mode 100644 inst/templates/rnaseq/de/params_de-example.R create mode 100644 inst/templates/rnaseq/de/params_de.R create mode 100644 inst/templates/rnaseq/de/run_markdown.R create mode 100644 inst/templates/rnaseq/org/hcbc_README.md create mode 100644 inst/templates/rnaseq/qc/QC.Rmd create mode 100644 inst/templates/rnaseq/qc/QC_nf-core.Rmd create mode 100644 inst/templates/rnaseq/qc/params_qc.R create mode 100644 inst/templates/rnaseq/qc/params_qc_nf-core-example.R create mode 100644 inst/templates/rnaseq/qc/placeholder create mode 100644 inst/templates/rnaseq/qc/run_markdown.R diff --git a/.gitignore b/.gitignore index 607f2ff..7dead86 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,10 @@ inst/doc docs /doc/ /Meta/ -.DS* \ No newline at end of file +.DS* +inst/rmarkdown/templates/rnaseq/skeleton/DE/Multiplicative_DGE_Analysis.Rmd +tests/* +.Rdata +.httr-oauth +.DS_Store +.quarto diff --git a/DESCRIPTION b/DESCRIPTION index 66b3f42..dd2e149 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,11 @@ Imports: ggprism, grDevices, R.utils, - readr + readr, + usethis, + fs, + jsonlite, + yaml Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 3cc700a..e202aa8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,11 +8,14 @@ export(cb_friendly_pal) export(list_cb_friendly_cols) export(scale_color_cb_friendly) export(scale_fill_cb_friendly) +export(use_bcbio_projects) import(DESeq2) import(R.utils) +import(fs) import(ggplot2) import(ggprism) import(hues) +import(usethis) importFrom(grDevices,colorRampPalette) importFrom(magrittr,"%>%") importFrom(readr,read_csv) diff --git a/R/bcbioR-package.R b/R/bcbioR-package.R index 64459df..903be7b 100644 --- a/R/bcbioR-package.R +++ b/R/bcbioR-package.R @@ -9,6 +9,8 @@ ## usethis namespace: end #' @import DESeq2 #' @import ggplot2 +#' @import usethis +#' @import fs #' @import hues #' @import ggprism #' @import R.utils diff --git a/R/hello.R b/R/hello.R deleted file mode 100644 index 7f3cde0..0000000 --- a/R/hello.R +++ /dev/null @@ -1,153 +0,0 @@ -.fix <- function(x){ - x <- tolower(x) %>% str_replace_all(., "[[:punct:]]", "_") - x <- str_replace_all(x, " ", "_") - return(x) -} - - -#' Function to check samplesheet for nf-core -#' -#' @param file path to CSV file for nf-core -#' @examples -#' -#' bcbio_nfcore_check(system.file("extdata", "rnaseq_good.csv", package = "bcbioR") ) -#' -#' @export -bcbio_nfcore_check <- function(file){ - required=c("sample","fastq_1","fastq_2","strandedness") - samplesheet=read_csv(file) - - if (!(all(required %in% colnames(samplesheet)))){ - print(colnames(samplesheet)) - stop("Missing required columns ", paste(required, collapse = " ")) - }else if (any(grepl("^[1-9]", samplesheet[["sample"]]))){ - stop("Avoid samples starting with numbers ") - }else if (any(is.na(samplesheet))){ - warning("Columns with missing values") - }else{ - message("All good.") - } -} - -#' Function to help deploy analysis folder inside a project folder -#' -#' This function contains Rmd, R, md, files that help to structure -#' an analysis following HCBC best-practices. -#' For rnaseq, it will deploy: QC and DE Rmd with additional files to help -#' to facilitate the analysis as needed. -#' -#' Normally these helper files are inside a report folder inside a -#' project folder. -#' -#' @param type string indicating the type of analysis, supported: rnaseq. -#' -#' @param outpath string path indicating where to copy all the files to -#' @examples -#' \dontrun{ -#' bcbio_templates("rnaseq", "path_to_projects/project1/reports") -#' } -#' @export -bcbio_templates <- function(type="rnaseq", outpath){ - switch(type, - rnaseq={ - - fpath <- system.file("rmarkdown/templates/rnaseq", "skeleton", package="bcbioR") - #file.copy(fpath, outpath, recursive = T) - copyDirectory(fpath, outpath) - }, - scrnaseq={ - - fpath <- system.file("rmarkdown/templates/singlecell", "skeleton", package="bcbioR") - #file.copy(fpath, outpath, recursive = T) - copyDirectory(fpath, outpath) - }, - { - stop('project type not recognize, please choose: ', 'rnaseq', 'scrnaseq') - } - ) -} - -#' Function to help with project name used for parent folder -#' -#' This function will ask for user input about: -#' * numeric code -#' * PI full name -#' * technology -#' * tissue -#' * organism -#' * project description -#' -#' It removes special character with `_`. The output is a guideline to -#' what the folder used can be. -#' -#' @returns A string list with hbc_code, and project folder name -#' @export -bcbio_set_project <- function() { - hbc_code <- readline("What is the hbc code (only numbers):\n") - hbc_code <- paste0("hbc", hbc_code) - pi <- readline("What is PI last name:\n") - technology <- readline("What is the technology:\n") - tissue <- readline("What is the tissue:\n") - org <- readline("What is the organism:\n") - project <- readline("What is the project name:\n") - #dropbox <- readline("What is the dropbox name:\n") - #github_org <- readline("What is the github organization:\n") - #hbc_$technology_of_$pilastname_$intervention_on_$tissue_in_$organism_$hbccode - project_full <- paste(technology, .fix(pi), .fix(project), tissue, org, hbc_code, sep="_") - #github <- c(github_org,project_full) - opts <- list(code=hbc_code, project=project_full) - #dropbox=file.path(dropbox,project_full), - #github=github) - print(opts) - return(opts) -} - - -bcbio_start_project <- function(options) { - -} - -bcbio_gitignore <- function(options) { - -} - -# This function showcases how one might write a function to be used as an -# RStudio project template. This function will be called when the user invokes -# the New Project wizard using the project template defined in the template file -# at: -# -# inst/rstudio/templates/project/hello_world.dcf -# -# The function itself just echos its inputs and outputs to a file called INDEX, -# which is then opened by RStudio when the new project is opened. -rnaseq <- function(path, ...) { - - # ensure path exists - dir.create(path, recursive = TRUE, showWarnings = FALSE) - - # generate header - header <- c( - "# This file was generated by a call to 'ptexamples::hello_world()'.", - "# The following inputs were received:", - "" - ) - - # collect inputs - dots <- list(...) - text <- lapply(seq_along(dots), function(i) { - key <- names(dots)[[i]] - val <- dots[[i]] - paste0(key, ": ", val) - }) - - # collect into single text string - contents <- paste( - paste(header, collapse = "\n"), - paste(text, collapse = "\n"), - sep = "\n" - ) - - # write to index file - writeLines(contents, con = file.path(path, "README.md")) - -} diff --git a/R/helpers.R b/R/helpers.R new file mode 100644 index 0000000..89ba469 --- /dev/null +++ b/R/helpers.R @@ -0,0 +1,261 @@ +.fix <- function(x){ + x <- tolower(x) %>% str_replace_all(., "[[:punct:]]", "_") + x <- str_replace_all(x, " ", "_") + return(x) +} + + +#' Function to check samplesheet for nf-core +#' +#' @param file path to CSV file for nf-core +#' @examples +#' +#' bcbio_nfcore_check(system.file("extdata", "rnaseq_good.csv", package = "bcbioR") ) +#' +#' @export +bcbio_nfcore_check <- function(file){ + required=c("sample","fastq_1","fastq_2","strandedness") + samplesheet=read_csv(file) + + if (!(all(required %in% colnames(samplesheet)))){ + print(colnames(samplesheet)) + stop("Missing required columns ", paste(required, collapse = " ")) + }else if (any(grepl("^[1-9]", samplesheet[["sample"]]))){ + stop("Avoid samples starting with numbers ") + }else if (any(is.na(samplesheet))){ + warning("Columns with missing values") + }else{ + message("All good.") + } +} + +#' Function to help deploy analysis folder inside a project folder +#' +#' This function contains Rmd, R, md, files that help to structure +#' an analysis following HCBC best-practices. +#' For rnaseq, it will deploy: QC and DE Rmd with additional files to help +#' to facilitate the analysis as needed. +#' +#' Normally these helper files are inside a report folder inside a +#' project folder. +#' +#' @param type string indicating the type of analysis, supported: rnaseq. +#' +#' @param outpath string path indicating where to copy all the files to +#' @examples +#' \dontrun{ +#' bcbio_templates("rnaseq", "path_to_projects/project1/reports") +#' } +#' @export +bcbio_templates <- function(type="rnaseq", outpath){ + switch(type, + rnaseq={ + + fpath <- system.file("rmarkdown/templates/rnaseq", "skeleton", package="bcbioR") + #file.copy(fpath, outpath, recursive = T) + copyDirectory(fpath, outpath) + }, + scrnaseq={ + + fpath <- system.file("rmarkdown/templates/singlecell", "skeleton", package="bcbioR") + #file.copy(fpath, outpath, recursive = T) + copyDirectory(fpath, outpath) + }, + { + stop('project type not recognize, please choose: ', 'rnaseq', 'scrnaseq') + } + ) +} + +#' Function to help with project name used for parent folder +#' +#' This function will ask for user input about: +#' * numeric code +#' * PI full name +#' * technology +#' * tissue +#' * organism +#' * project description +#' +#' It removes special character with `_`. The output is a guideline to +#' what the folder used can be. +#' +#' @returns A string list with hbc_code, and project folder name +#' @export +bcbio_set_project <- function() { + hbc_code <- readline("What is the hbc code (only numbers):\n") + hbc_code <- paste0("hbc", hbc_code) + pi <- readline("What is PI last name:\n") + technology <- readline("What is the technology:\n") + tissue <- readline("What is the tissue:\n") + org <- readline("What is the organism:\n") + project <- readline("What is the project name:\n") + #dropbox <- readline("What is the dropbox name:\n") + #github_org <- readline("What is the github organization:\n") + #hbc_$technology_of_$pilastname_$intervention_on_$tissue_in_$organism_$hbccode + project_full <- paste(technology, .fix(pi), .fix(project), tissue, org, hbc_code, sep="_") + #github <- c(github_org,project_full) + opts <- list(code=hbc_code, project=project_full) + #dropbox=file.path(dropbox,project_full), + #github=github) + print(opts) + return(opts) +} + + +guess_analysis <- function(path){ + if (!fs::dir_exists(path)) + ui_abort("{ui_val(path)} doesn't exist") + + # This file is inside star_salmon/ folder + counts_fn <- fs::path_join(path, '/star_salmon/salmon.merged.gene_counts.tsv') + # This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder + multiqc_data_dir <- fs::path_join(path, 'star_salmon/multiqc_report_data') + # This file is inside star_salmon/ folder + se_object <- fs::path_join(path, 'star_salmon/salmon.merged.gene_counts.rds') + +} + +read_pipeline_info <- function(path){ + # pipeline_info/params_2024-05-28_12-28-51.json + config <- fs::path_join(nfcore, "pipeline_info") + params <- fs::dir_ls(config, regexp = "params") + metadata <- jsonlite::read_json(params)[["input"]] + # input + # tmp_rna/pipeline_info/software_versions.yml + software <- fs::path_join(nfcore, "pipeline_info", "software_versions.yml") + software_txt <- yaml::read_yaml(software) + pipeline <- grep("nf-core", names(software_text$Workflow), value = TRUE) + # Workflow: + # Nextflow: 24.04.1 + # nf-core/rnaseq: 3.14.0 + # check only rnaseq is supported + if (!(pipeline %in% c("nf-core/rnasew"))){ + iu_abort("Sorry, we don't yet support {.ui_value(pipeline)}") + } + list(metadata=metadata, pipeline=pipeline) +} + + +bcbio_params <-function(path, pipeline, metadata, copy){ + + if (pipeline=="nf-core/rnaseq"){ + if (!copy){ + se_object <- fs::path_join(path, "star_salmon/salmon.merged.gene_counts.rds") + metadata_fn <- metadata + counts_fn <- fs::path_join(path, "star_salmon/salmon.merged.gene_counts.tsv") + multiqc_data_dir <- fs::path_join(path, "multiqc/star_salmon/multiqc-report-data/") + gtf_fn <- fs::path_join(path, "genome/genome.filtered.gtf") + } + + analysis_template <- fs::path_package("bcbioR", "templates", "rnaseq", "qc") + fs::dir_copy(analysis_template, fs::path_join(path, "reports"), overwrite = FALSE) + analysis_template <- fs::path_package("bcbioR", "templates", "rnaseq", "de") + fs::dir_copy(analysis_template, fs::path_join(path, "reports"), overwrite = FALSE) + + ui_info("Please, to start the analysis, modify these parameter in QC/QC.rmd") + ui_todo("set genome to hg38, mm10, mm39, or other") + ui_todo("set factor_of_interest to a column in your metadata") + } + +} + +#' @export +use_bcbio_analysis <- function(path, nfcore=NULL, copy=FALSE, metadata=NULL){ + + if (copy){ + # deploy files + ui_info("Rmd templates will be copied but variables path won't be filled automatically.") + }else{ + if (!fs::dir_exists(nfcore)) + ui_abort("{ui_value(nfcore)} doesn't exist. point to nfcore path or turn on copy mode.") + + #guess analysis from pipeline file + information <- read_pipeline_info(nfcore) + fs::dir_create(fs::path_join(path, "meta")) + meta_path <- fs::path_join(path, "meta", fs::path_file(information$metadata)) + pipeline <- information$pipeline + if (!is.null(metadata)){ + if (!(fs::file_exists(metadata))) + ui_abort("{ui_value(metadata)} doesn't exist.") + fs::file_copy(metadata, meta_path) + }else{ + if (!fs::file_exists(information$metadata)){ + ui_warn("{ui_value(metadata)} not found. We can only work with local filesytems. For now.") + ui_todo("Please, copy {ui_value(metadata)} to {ui_value(meta_path)}.") + ui_warn("If this file is not in the folder, the code will fail.") + }else{ + fs::file_copy(information$metadata, meta_path) + } + metadata <- meta_path + } + + # set all files from analysis + bcbio_params <- set_bcbio_params(nfcore, pipeline, metadata, copy=copy) + } + +} + +#' @export +#' @examples +#' path <- withr::local_tempdir() +#' # use_bcbio_projects(path,nfcore="nf-core/rnaseq",copy=TRUE) +use_bcbio_projects <- function(path, nfcore=NULL, metadata=NULL, git=TRUE, gh=FALSE, org=NULL, copy=FALSE) { + + ui_info("Creating project at {ui_value(path)}") + if (!fs::dir_exists(path)) + fs::dir_create(path, mode = "u=xrw,g=xwr,o=r", recurse = TRUE) + + ui_info("Populating base project") + base_template <- fs::path_package("bcbioR", "templates", "base") + fs::dir_copy(base_template, path, overwrite = FALSE) + + if (is.null(nfcore)){ + is_nfcore_ready <- ui_yeah("Have you already run nf-core pipeline?", + n_yes=1, n_no =1) + if (is_nfcore_ready){ + nfcore <- readline("? Enter path to nf-core output: ") + }else{ + ui_warn("Please, turn copy = TRUE to only deploy files or") + ui_abort("Please use {.run use_bcbio_projects} again when you have the nf-core output.") + } + use_bcbio_analysis(path, nfcore, copy, metadata) + }else{ + if (fs::dir_exists(nfcore)){ + ui_info("Checking {.ui_value(nfcore)} as nf-core output directory") + use_bcbio_analysis(path, nfcore, copy, metadata) + }else if (copy){ + # deploy only files + ui_info("Deploying only templates without pipeline information.") + use_bcbio_analysis(path, nfcore, metadata=metadata, copy = TRUE) + }else{ + ui_warn("Please, provide nfcore working directory or") + ui_warn("turn copy = TRUE to only deploy files.") + } + } + + if (git){ + ui_info("Create Git local repo at {ui_value(path)}") + use_git() + } + if (gh){ + ui_info("Create GitHub repo at {ui_value(path)}") + whoami <- suppressMessages(gh::gh_whoami()) + if (is.null(whoami)) { + ui_abort(c( + "x" = "Unable to discover a GitHub personal access token.", + "i" = "A token is required in order to create and push to a new repo.", + "_" = "Call {.run usethis::gh_token_help()} for help configuring a token." + )) + } + use_github(organisation=org) + } + + answer <- ui_yeah("Please, read the README.md file as the session starts.Are you ready?", + n_yes=1, n_no =1) + if (answer) + proj_activate(path) + if (!answer) + ui_info("Please use {.run proj_activate({ui_value(path)})} to start this project.") + +} diff --git a/R/orgs.R b/R/orgs.R new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index f2822fb..278bd70 100644 --- a/README.md +++ b/README.md @@ -47,8 +47,7 @@ As well, You can get this by going to File -\> New File -\> R markdown: - then ` ### Set RNAseq report folder -This code will populate the folder with HCBC data structure guidelines and Rmd code: -**You do not need to create a reports folder prior to running this code. This will create and populate the reports folder.** +This code will populate the folder with HCBC data structure guidelines and Rmd code: **You do not need to create a reports folder prior to running this code. This will create and populate the reports folder.** ``` r bcbio_templates(type="rnaseq", outpath="/path/to/analysis/folder/reports") diff --git a/inst/orgs/hcbc.yml b/inst/orgs/hcbc.yml new file mode 100644 index 0000000..9f263a5 --- /dev/null +++ b/inst/orgs/hcbc.yml @@ -0,0 +1,4 @@ +name: HCBC +description: Variables specifics to HCBC +github_org: hbc + diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd index be18e1f..ceb3b37 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd +++ b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd @@ -31,6 +31,11 @@ params: --- +```{r} +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +``` + ```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} # 1. Set up input files in this R file (params_de.R) source(params$params_file) diff --git a/inst/templates/base/.gitignore b/inst/templates/base/.gitignore new file mode 100644 index 0000000..55e4e88 --- /dev/null +++ b/inst/templates/base/.gitignore @@ -0,0 +1,8 @@ +.Rproj.user +data/* +docs/* +**/*html +**/*rds +**/*rda +**/*csv +**/*tsv diff --git a/inst/templates/base/README.md b/inst/templates/base/README.md new file mode 100644 index 0000000..3f00ca3 --- /dev/null +++ b/inst/templates/base/README.md @@ -0,0 +1,39 @@ +# Guidelines + +## Set up work-space + +- [ ] Replace the Title in this file matching the title projects +- [ ] Modify `information.R` with the right text for this project, it can be used to source in other `Rmd` files. The main `Rmd` file in this directory can be used to show general information of the project if needed. +- [ ] Use the same parent folder name to create a folder in *Dropbox*, and *GitHub* repo +- [ ] use the function `bcbio_templates` to create templates inside `reports` for each type of analysis. For instance, for *RNAseq*: + - `bcbio_templates(type="rnaseq", outpath="reports")` or + - `bcbio_templates(type="rnaseq", outpath="reports/experiment1")` + - Then go to that folder and read the `README.md` + +## Folders + +- `meta` should contain the CSV/YAML files used by *bcbio* or *nextflow* +- `scripts` should contain `sbatch` scripts or any custom scripts used in this project +- `data` contains raw data, it can contains big data objects +- `reports` contains `Rmd` and `html` together with their files that will be added to *DropBox*. Each type of project have different guidelines. +- `final` contains the output of *nextflow/bcbio* +- `code` contains any other files that support custom analysis and don't generate a report +- For any relevant client files or papers use the `docs` folder on *DropBox* + +## Download + +- [ ] Download data to the `data` directory on O2. Check the md5 checksums if available. + +## Analysis + +- [ ] Make sure that final folder is copied from *scratch* or *S3* to `/n/data1/cores/bcbio/PIs/` + +## GitHub + +- [ ] Track in *GitHub* this `README` file +- [ ] Track in *GitHub* files in `scripts`, `meta`, and `reports` that belongs to these type: + - **Note** Git add `*.Rmd *.R *ipynb *.sh *.yaml`. (feel free use `.gitignore` if you use a GUI for non-tracked files). *DO NOT* use `git add *`. *DO NOT* track `html/csv/figures` + +## Dropbox + +- [ ] Add to the *DropBox* folder all files in `reports` diff --git a/inst/templates/base/code/placeholder.R b/inst/templates/base/code/placeholder.R new file mode 100644 index 0000000..e69de29 diff --git a/inst/templates/base/information.R b/inst/templates/base/information.R new file mode 100644 index 0000000..6e15eef --- /dev/null +++ b/inst/templates/base/information.R @@ -0,0 +1,6 @@ +# info params +project = "name_hbcXXXXX" +PI = 'person name' +experiment = 'short description' +aim = 'short description' +analyst = 'person in the core' diff --git a/inst/templates/base/meta/placeholder.R b/inst/templates/base/meta/placeholder.R new file mode 100644 index 0000000..e69de29 diff --git a/inst/templates/base/scripts/placeholder b/inst/templates/base/scripts/placeholder new file mode 100644 index 0000000..e69de29 diff --git a/inst/templates/rnaseq/de/DEG.Rmd b/inst/templates/rnaseq/de/DEG.Rmd new file mode 100644 index 0000000..a2741a7 --- /dev/null +++ b/inst/templates/rnaseq/de/DEG.Rmd @@ -0,0 +1,443 @@ +--- +title: "Differential Expression" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + numerator: tumor + denominator: normal + column: sample_type + subset_column: null + subset_value: null + # Put hg38, mm10, mm39, or other + genome: hg38 + ruv: false + functions_file: load_data.R +--- + +```{r, echo = F} +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +``` + +```{r} +metadata_fn={{metadata_fn}} +se_object={{se_object}} +# This folder is in the output directory inside multiqc folder +multiqc_data_dir={{multiqc_data_dir}} +# This file is inside the genome folder in the output directory +gtf_fn={{gtf_fn}} +``` + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. Set up input files in this R file (params_de.R) +source(params$params_file) +# 3. Load custom functions to load data from coldata/metrics/counts +source(params$functions_file) +# IMPORTANT set these values if you are not using the parameters at the top +genome=params$genome +column=params$column +numerator=params$numerator +denominator=params$denominator +subset_column=params$subset_column +subset_value=params$subset_value +run_ruv=params$ruv +``` + +# Overview + +{{project}} + +- Comparison: `r ifelse(is.null(subset_value), paste0(numerator, ' vs. ', denominator), paste0(subset_value, ': ', numerator, ' vs. ', denominator))` + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(rtracklayer) +library(DESeq2) +library(tidyverse) +library(stringr) +library(RUVSeq) +library(DEGreport) +library(ggpubr) +library(msigdbr) +library(fgsea) +library(org.Hs.eg.db) +library(knitr) +library(EnhancedVolcano) +library(bcbioR) +library(ggprism) +library(viridis) +library(pheatmap) +library(janitor) +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + + +```{r load_data, message=F, warning=F} +# This code will load from bcbio or nf-core folder +# NOTE make sure to set numerator and denominator +coldata <- load_coldata(metadata_fn, column, + numerator, denominator, + subset_column, subset_value) +coldata$sample=row.names(coldata) + +counts <- load_counts(counts_fn) +counts <- counts[,colnames(counts) %in% coldata$sample] + +metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% + left_join(coldata, by = c('sample')) %>% + as.data.frame() +rownames(metrics) <- metrics$sample +# if the names don't match in order or string check files names and coldata information +counts = counts[rownames(metrics)] +coldata = coldata[rownames(metrics),] +stopifnot(all(names(counts) == rownames(metrics))) +``` + +```{r sanitize_datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + + +```{r create_filenames} + +if (!is.null(subset_value) & !is.null(subset_value)){ + filenames = str_interp("${subset_value}_${numerator}_vs_${denominator}") +} else { + filenames = str_interp("${numerator}_vs_${denominator}") +} + +contrasts = c(column,numerator,denominator) +coef=paste0(column,"_",numerator,"_vs_",denominator) + +name_expression_fn=file.path( + basedir, + str_interp("${filenames}_expression.csv")) +name_deg_fn=file.path( + basedir, + str_interp("${filenames}_deg.csv")) +name_pathways_fn=file.path( + basedir, + str_interp("${filenames}_pathways.csv")) + +``` + +```{r load_counts_data} + +rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL') %>% + dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) + +``` + +```{r setup_RUV} + +dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1) + +vsd_before <- vst(dds_to_use) +norm_matrix = assay(vsd_before) +``` + +# Covariate analysis + +```{r covariates, fig.height = 6, fig.width = 10} +degCovariates( + norm_matrix, + metrics, +) +``` + +# PCA analysis + +```{r before_RUV} + +pca1 <- degPCA(norm_matrix, colData(dds_to_use), + condition = column) + ggtitle('PCA') +pca1 + scale_color_cb_friendly() + +``` + +```{r init_DESEQ} +formula <- as.formula(paste0("~ ", " + ", column)) +## Check if sample name matches +stopifnot(all(names(counts) == rownames(coldata))) + +dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = formula) + +vsd_before <- vst(dds_to_use) +norm_matrix = assay(vsd_before) +new_cdata <- coldata +``` + +```{r, eval=run_ruv, results='asis', echo=FALSE} +cat("# Remove Unwanted Variability + +When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") + +``` + +```{r do_RUV, eval=run_ruv} +# If you want to skip the code, just set up formula to be your model in the next chunk of code +design <- coldata[[column]] +diffs <- makeGroups(design) +dat <- norm_matrix +ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F) +vars <- ruvset$W + +new_cdata <- cbind(coldata, vars) + +formula <- as.formula(paste0("~ ", + paste0( + colnames(new_cdata)[grepl("W", colnames(new_cdata))], + collapse = " + " + ), " + ", column) +) +norm_matrix=ruvset$normalizedCounts +pca2 <- degPCA(norm_matrix, new_cdata, + condition = column) + ggtitle('After RUV') +pca2 + scale_color_cb_friendly() + +``` + +```{r after_RUV, eval=run_ruv} + +dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) +vsd_to_use<- vst(dds_to_use, blind=FALSE) + +``` + + +# Differential Expression + +Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model. + +Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit. + +We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. + +```{r DE} +de <- DESeq(dds_to_use) + +DESeq2::plotDispEsts(de) +``` + +Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level. + +```{r lfc_shrink} +# resultsNames(de) # check the order is right +resLFC = results(de, contrast=contrasts) +resLFCS <- lfcShrink(de, coef=coef, type="apeglm") + +res <- as.data.frame(resLFCS) %>% + rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% + relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>% + mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi) + +res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj) %>% + mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name)) + +res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5)) +show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")]) + +degMA(as.DEGSet(resLFC)) + ggtitle('Before LFC Shrinking') +``` + +## MA plot + +```{r after_lfc_shrink} +degMA(as.DEGSet(resLFCS), limit = 2) + ggtitle('After LFC Shrinking') + +``` + +## Volcano plot + +This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in red are genes that have padj < 0.05 and a log2-fold change > 1. Points in blue have a padj < 0.05 and a log2-fold change < 1 and points in green have a padj > 0.05 and a log2-fold change > 2. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2 foldchance and padj that we have chosen. + +```{r volcano_plot, fig.height=6} +# degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) +EnhancedVolcano(res_mod, + lab= res_mod$gene_name, + pCutoff = 1.345719e-03, + selectLab = c(res_sig$gene_name[1:15]), + FCcutoff = 0.5, + x = 'lfc', + y = 'padj', + title="Volcano Tumor vs. Normal", + col=as.vector(colors[c("dark_grey", "light_blue", + "purple", "purple")]), + subtitle = "", xlim=c(-5,5)) + +``` + +## Heatmap + +```{r heapmap} +### Run pheatmap using the metadata data frame for the annotation +ma=norm_matrix[res_sig$gene_id,] +colma=coldata[,c(column), drop=FALSE] +colors=lapply(colnames(colma), function(c){ + l.col=colors[1:length(unique(colma[[c]]))] + names(l.col)=unique(colma[[c]]) + l.col +}) +names(colors)=colnames(colma) +pheatmap(ma, + color = inferno(10), + cluster_rows = T, + show_rownames = F, + annotation = colma, + annotation_colors = colors, + border_color = NA, + fontsize = 10, + scale = "row", + fontsize_row = 10, + height = 20) +``` + + +## Differentially Expressed Genes + +```{r sig_genes_table} +res_sig %>% sanitize_datatable +``` + +## Plot top 16 genes + +```{r top n DEGs, fig.height = 6, fig.width = 8} +n = 16 +top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% + dplyr::select(gene_name, gene_id) +top_n_exp <- norm_matrix %>% as.data.frame() %>% + rownames_to_column('gene_id') %>% + # dplyr::select(-group, -group_name) %>% + pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>% + right_join(top_n, relationship = "many-to-many") %>% + left_join(coldata, by = 'sample') + +ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + + geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + + geom_point() + + facet_wrap(~gene_name) + + ggtitle(str_interp('Expression of Top ${n} DEGs')) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + +``` + +# Pathway Enrichment + +From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment. + +```{r} +universe=res %>% + filter(!is.na(padj)) %>% pull(gene_id) +mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENTREZID', 'ENSEMBL') + +all_in_life=list( + msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"), + msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), + msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG"), + msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), + msigdbr(species = "human", category = "C5", subcategory = "GO:BP"), + msigdbr(species = "human", category = "C5", subcategory = "GO:MF"), + msigdbr(species = "human", category = "C5", subcategory = "HPO"), + msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), + msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") +) + +ora_input = res %>% filter(!is.na(padj), padj<0.01, abs(lfc)>0.3) %>% pull(gene_id) +input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) + +total_deg=length(unique(ora_input))/length(unique(mapping$ENTREZID)) +pathways_ora_all = lapply(all_in_life, function(p){ + pathway = split(x = p$entrez_gene, f = p$gs_name) + db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") + respath <- fora(pathways = pathway, + genes = unique(input_entrezid$ENTREZID), + universe = unique(mapping$ENTREZID), + minSize = 15, + maxSize = 500) + coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], + pathway, unique(input_entrezid$ENTREZID), unique(mapping$ENTREZID)) + as_tibble(respath[pathway %in% coll_respath$mainPathways]) %>% + mutate(database=db_name, NES=(overlap/size)/(total_deg)) +}) %>% bind_rows() %>% + mutate(analysis="ORA") + +ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% + group_by(pathway) %>% + left_join(mapping, by =c("overlapGenes"="ENTREZID")) %>% + dplyr::select(pathway, padj, NES, ENSEMBL, analysis, + database) + +pathways_long = ora_tb + +``` + + +```{r pathaways_table} +pathways_ora_all %>% sanitize_datatable() +``` + + +```{r write-files} +counts_norm=norm_matrix %>% as.data.frame() %>% + rownames_to_column("gene_id") %>% + mutate(comparison = str_interp("${numerator}_vs_${denominator}")) + +res_for_writing <- res %>% + mutate(comparison = str_interp("${numerator}_vs_${denominator}")) + +pathways_for_writing <- pathways_long %>% + mutate(comparison = str_interp("${numerator}_vs_${denominator}")) + +if (!is.null(subset_value)){ + counts_norm <- counts_norm %>% + mutate(subset = subset_value) + res_for_writing <- res_for_writing %>% + mutate(subset = subset_value) + pathways_for_writing <- pathways_for_writing %>% + mutate(subset = subset_value) +} + +write_csv(counts_norm, name_expression_fn) +write_csv(res_for_writing, name_deg_fn) +write_csv(pathways_for_writing, name_pathways_fn) +``` +# R session + +List and version of tools used for the DE report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/rnaseq/de/DE_tumor_vs_normal.html b/inst/templates/rnaseq/de/DE_tumor_vs_normal.html new file mode 100644 index 0000000..9060229 --- /dev/null +++ b/inst/templates/rnaseq/de/DE_tumor_vs_normal.html @@ -0,0 +1,5864 @@ + + + + + + + + + + + + + + + +Differential Expression + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + +
# 1. Set up input files in this R file (params_de.R)
+source(params$params_file)
+# 2. Set up project file (already done from QC probably)
+source(params$project_file)
+# 3. Load custom functions to load data from coldata/metrics/counts
+source(params$functions_file)
+# IMPORTANT set these values if you are not using the parameters at the top
+genome=params$genome
+column=params$column
+numerator=params$numerator
+denominator=params$denominator
+subset_column=params$subset_column
+subset_value=params$subset_value
+run_ruv=params$ruv
+
library(rtracklayer)
+library(DESeq2)
+library(tidyverse)
+library(stringr)
+library(RUVSeq)
+library(DEGreport)
+library(ggpubr)
+library(msigdbr)
+library(fgsea)
+library(org.Hs.eg.db)
+library(knitr)
+library(EnhancedVolcano)
+library(bcbioR)
+library(ggprism)
+library(viridis)
+library(pheatmap)
+library(janitor)
+colors=cb_friendly_cols(1:15)
+ggplot2::theme_set(theme_prism(base_size = 14))
+opts_chunk[["set"]](
+    cache = F,
+    cache.lazy = FALSE,
+    dev = c("png", "pdf"),
+    error = TRUE,
+    highlight = TRUE,
+    message = FALSE,
+    prompt = FALSE,
+    tidy = FALSE,
+    warning = FALSE,
+    echo = T, 
+    fig.height = 4)
+
+# set seed for reproducibility
+set.seed(1234567890L)
+
# This code will load from bcbio or nf-core folder
+# NOTE make sure to set numerator and denominator
+coldata <- load_coldata(coldata_fn, column,
+                        numerator, denominator,
+                        subset_column, subset_value)
+coldata$sample=row.names(coldata)
+
+counts <- load_counts(counts_fn)
+counts <- counts[,colnames(counts) %in% coldata$sample]
+
+metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% 
+  left_join(coldata, by = c('sample')) %>% 
+  as.data.frame()
+rownames(metrics) <- metrics$sample
+# if the names don't match in order or string check files names and coldata information
+counts = counts[rownames(metrics)]
+coldata = coldata[rownames(metrics),]
+stopifnot(all(names(counts) == rownames(metrics)))
+
sanitize_datatable = function(df, ...) {
+ # remove dashes which cause wrapping
+ DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)),
+                   colnames=gsub("-", "_", colnames(df)))
+}
+
+

1 Overview

+
    +
  • Project: name_hbcXXXXX
  • +
  • PI: person name
  • +
  • Analyst: person in the core
  • +
  • Experiment: short description
  • +
  • Aim: short description
  • +
  • Comparison: tumor vs. normal
  • +
+
if (!is.null(subset_value) & !is.null(subset_value)){
+  filenames = str_interp("${subset_value}_${numerator}_vs_${denominator}")
+} else {
+  filenames = str_interp("${numerator}_vs_${denominator}")
+}
+
+contrasts = c(column,numerator,denominator)
+coef=paste0(column,"_",numerator,"_vs_",denominator)
+
+name_expression_fn=file.path(
+                             basedir,
+                             str_interp("${filenames}_expression.csv"))
+name_deg_fn=file.path(
+                      basedir,
+                      str_interp("${filenames}_deg.csv"))
+name_pathways_fn=file.path(
+                           basedir,
+                           str_interp("${filenames}_pathways.csv"))
+
rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL') %>%
+  dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL)
+
dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1)
+
+vsd_before <- vst(dds_to_use)
+norm_matrix = assay(vsd_before)
+
+
+

2 Covariate analysis

+
degCovariates(
+  norm_matrix,
+  metrics,
+)
+

+
+
+

3 PCA analysis

+
pca1 <- degPCA(norm_matrix, colData(dds_to_use), 
+       condition = column) + ggtitle('PCA')
+pca1 + scale_color_cb_friendly()
+

+
formula <- as.formula(paste0("~ ", " + ", column))
+## Check if sample name matches
+stopifnot(all(names(counts) == rownames(coldata)))
+
+dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = formula)
+
+vsd_before <- vst(dds_to_use)
+norm_matrix = assay(vsd_before)
+new_cdata <- coldata
+
# If you want to skip the code, just set up formula to be your model in the next chunk of code
+design <- coldata[[column]]
+diffs <- makeGroups(design)
+dat <- norm_matrix
+ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F)
+vars <- ruvset$W
+
+new_cdata <- cbind(coldata, vars)
+
+formula <- as.formula(paste0("~ ", 
+                          paste0(
+                            colnames(new_cdata)[grepl("W", colnames(new_cdata))], 
+                            collapse = " + "
+                          ), " + ", column)
+)
+norm_matrix=ruvset$normalizedCounts
+pca2 <- degPCA(norm_matrix, new_cdata, 
+       condition = column) + ggtitle('After RUV')
+pca2 + scale_color_cb_friendly()
+
dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula)
+vsd_to_use<- vst(dds_to_use, blind=FALSE)
+
+
+

4 Differential +Expression

+

Differential gene expression analysis of count data was performed +using the Bioconductor R package, DESeq2, which fits the count data to a +negative binomial model.

+

Before fitting the model, we often look at a metric called +dispersion, which is a measure for variance which also takes into +consideration mean expression. A dispersion value is estimated for each +individual gene, then ‘shrunken’ to a more accurate value based on +expected variation for a typical gene exhibiting that level of +expression. Finally, the shrunken dispersion value is used in the final +GLM fit.

+

We use the below dispersion plot, which should show an inverse +relationship between dispersion and mean expression, to get an idea of +whether our data is a good fit for the model.

+
de <- DESeq(dds_to_use)
+
+DESeq2::plotDispEsts(de)
+

+

Because it is difficult to accurately detect and quantify the +expression of lowly expressed genes, differences in their expression +between treatment conditions can be unduly exaggerated after the model +is fit. We correct for this so that gene LFC is not dependent overall on +basal gene expression level.

+
# resultsNames(de) # check the order is right
+resLFC = results(de, contrast=contrasts)
+resLFCS <- lfcShrink(de, coef=coef, type="apeglm")
+
+res <- as.data.frame(resLFCS) %>%
+  rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% 
+  relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>%
+  mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi)
+
+res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj) %>%
+  mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name))
+
+res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5))
+show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")])
+
+degMA(as.DEGSet(resLFC)) + ggtitle('Before LFC Shrinking')
+

+
+

4.1 MA plot

+
degMA(as.DEGSet(resLFCS), limit = 2) + ggtitle('After LFC Shrinking')
+

+
+
+

4.2 Volcano plot

+

This volcano plot shows the genes that are significantly up- and +down-regulated as a result of the analysis comparison. The points +highlighted in red are genes that have padj < 0.05 and a log2-fold +change > 1. Points in blue have a padj < 0.05 and a log2-fold +change < 1 and points in green have a padj > 0.05 and a log2-fold +change > 2. Grey points are non-significant. The dashed lines +correspond to the cutoff values of log2 foldchance and padj that we have +chosen.

+
# degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) 
+EnhancedVolcano(res_mod,
+                lab= res_mod$gene_name, 
+                pCutoff = 1.345719e-03, 
+                selectLab = c(res_sig$gene_name[1:15]),
+                FCcutoff = 0.5,
+                x = 'lfc',
+                y = 'padj', 
+                title="Volcano Tumor vs. Normal",
+                col=as.vector(colors[c("dark_grey", "light_blue",
+                                         "purple", "purple")]),
+                subtitle = "", xlim=c(-5,5)) 
+

+
+
+

4.3 Heatmap

+
### Run pheatmap using the metadata data frame for the annotation
+ma=norm_matrix[res_sig$gene_id,]
+colma=coldata[,c(column), drop=FALSE]
+colors=lapply(colnames(colma), function(c){
+  l.col=colors[1:length(unique(colma[[c]]))]
+  names(l.col)=unique(colma[[c]])
+  l.col
+})
+names(colors)=colnames(colma)
+pheatmap(ma, 
+         color = inferno(10), 
+         cluster_rows = T, 
+         show_rownames = F,
+         annotation = colma, 
+         annotation_colors = colors,
+         border_color = NA, 
+         fontsize = 10, 
+         scale = "row", 
+         fontsize_row = 10, 
+         height = 20)
+
+
+

4.4 Differentially +Expressed Genes

+
res_sig %>% sanitize_datatable
+
+ +
+
+

4.5 Plot top 16 +genes

+
n = 16
+top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% 
+  dplyr::select(gene_name, gene_id)
+top_n_exp <- norm_matrix %>% as.data.frame() %>% 
+  rownames_to_column('gene_id') %>%
+  # dplyr::select(-group, -group_name) %>% 
+  pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>%
+  right_join(top_n, relationship = "many-to-many") %>%
+  left_join(coldata, by = 'sample')
+
+ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) +
+  geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + 
+  geom_point() +
+  facet_wrap(~gene_name) + 
+  ggtitle(str_interp('Expression of Top ${n} DEGs')) +
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
+

+
+
+
+

5 Pathway Enrichment

+

From the set of differentially expressed genes and using publicly +available information about gene sets involved in biological processes +and functions, we can calculate which biological processes and functions +are significantly perturbed as a result of the treatment.

+
universe=res %>% 
+  filter(!is.na(padj)) %>% pull(gene_id)
+mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENTREZID', 'ENSEMBL')
+
+all_in_life=list(
+  msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"),
+  msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"),
+  msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG"),
+  msigdbr(species = "human", category = "C2", subcategory = "CP:PID"),
+  msigdbr(species = "human", category = "C5", subcategory = "GO:BP"),
+  msigdbr(species = "human", category = "C5", subcategory = "GO:MF"),
+  msigdbr(species = "human", category = "C5", subcategory = "HPO"),
+  msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"),
+  msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic")
+)
+
+ora_input = res %>% filter(!is.na(padj), padj<0.01, abs(lfc)>0.3) %>% pull(gene_id)
+input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))
+
+total_deg=length(unique(ora_input))/length(unique(mapping$ENTREZID))
+pathways_ora_all = lapply(all_in_life, function(p){
+  pathway = split(x = p$entrez_gene, f = p$gs_name)
+  db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":")
+  respath <- fora(pathways = pathway, 
+                  genes = unique(input_entrezid$ENTREZID),
+                  universe = unique(mapping$ENTREZID),
+                  minSize  = 15,
+                  maxSize  = 500)
+  coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], 
+                                     pathway, unique(input_entrezid$ENTREZID), unique(mapping$ENTREZID))
+  as_tibble(respath[pathway %in% coll_respath$mainPathways])  %>% 
+    mutate(database=db_name, NES=(overlap/size)/(total_deg))
+}) %>% bind_rows() %>% 
+  mutate(analysis="ORA")
+  
+ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>%
+  group_by(pathway) %>% 
+  left_join(mapping, by =c("overlapGenes"="ENTREZID")) %>% 
+  dplyr::select(pathway, padj, NES, ENSEMBL, analysis,
+                database)
+
+pathways_long = ora_tb
+
pathways_ora_all %>% sanitize_datatable()
+
+ +
counts_norm=norm_matrix %>% as.data.frame() %>% 
+  rownames_to_column("gene_id") %>% 
+  mutate(comparison = str_interp("${numerator}_vs_${denominator}"))
+
+res_for_writing <- res %>% 
+  mutate(comparison = str_interp("${numerator}_vs_${denominator}"))
+
+pathways_for_writing <- pathways_long %>% 
+  mutate(comparison = str_interp("${numerator}_vs_${denominator}"))
+         
+if (!is.null(subset_value)){
+  counts_norm <- counts_norm %>% 
+    mutate(subset = subset_value)  
+  res_for_writing <- res_for_writing %>% 
+    mutate(subset = subset_value)
+  pathways_for_writing <- pathways_for_writing %>% 
+    mutate(subset = subset_value)
+}
+
+write_csv(counts_norm, name_expression_fn)
+write_csv(res_for_writing, name_deg_fn)
+write_csv(pathways_for_writing, name_pathways_fn)
+
+
+

6 R session

+

List and version of tools used for the DE report generation.

+
sessionInfo()
+
## R version 4.3.3 (2024-02-29)
+## Platform: aarch64-apple-darwin20 (64-bit)
+## Running under: macOS Sonoma 14.4.1
+## 
+## Matrix products: default
+## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
+## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
+## 
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+## 
+## time zone: America/New_York
+## tzcode source: internal
+## 
+## attached base packages:
+## [1] stats4    stats     graphics  grDevices utils     datasets  methods   base     
+## 
+## other attached packages:
+##  [1] rmarkdown_2.26              viridis_0.6.5               viridisLite_0.4.2          
+##  [4] ggprism_1.0.5               EnhancedVolcano_1.20.0      org.Hs.eg.db_3.18.0        
+##  [7] AnnotationDbi_1.64.1        fgsea_1.28.0                msigdbr_7.5.1              
+## [10] ggpubr_0.6.0                RUVSeq_1.36.0               edgeR_4.0.16               
+## [13] limma_3.58.1                EDASeq_2.36.0               ShortRead_1.60.0           
+## [16] GenomicAlignments_1.36.0    Rsamtools_2.16.0            Biostrings_2.68.1          
+## [19] XVector_0.40.0              BiocParallel_1.34.2         janitor_2.2.0              
+## [22] DT_0.33                     pheatmap_1.0.12             ggrepel_0.9.5              
+## [25] DEGreport_1.38.5            DESeq2_1.42.1               SummarizedExperiment_1.30.2
+## [28] Biobase_2.60.0              MatrixGenerics_1.12.3       matrixStats_1.3.0          
+## [31] rtracklayer_1.60.1          GenomicRanges_1.52.1        GenomeInfoDb_1.36.4        
+## [34] IRanges_2.34.1              S4Vectors_0.38.2            BiocGenerics_0.46.0        
+## [37] knitr_1.45                  lubridate_1.9.3             forcats_1.0.0              
+## [40] stringr_1.5.1               dplyr_1.1.4                 purrr_1.0.2                
+## [43] readr_2.1.5                 tidyr_1.3.1                 tibble_3.2.1               
+## [46] ggplot2_3.5.0               tidyverse_2.0.0             bcbioR_0.1.2               
+## 
+## loaded via a namespace (and not attached):
+##   [1] BiocIO_1.10.0               bitops_1.0-7                filelock_1.0.3             
+##   [4] R.oo_1.26.0                 XML_3.99-0.16.1             lifecycle_1.0.4            
+##   [7] rstatix_0.7.2               doParallel_1.0.17           lattice_0.22-5             
+##  [10] vroom_1.6.5                 MASS_7.3-60.0.1             crosstalk_1.2.1            
+##  [13] backports_1.4.1             magrittr_2.0.3              sass_0.4.9                 
+##  [16] jquerylib_0.1.4             yaml_2.3.8                  cowplot_1.1.3              
+##  [19] DBI_1.2.2                   RColorBrewer_1.1-3          ConsensusClusterPlus_1.66.0
+##  [22] abind_1.4-5                 zlibbioc_1.46.0             R.utils_2.12.3             
+##  [25] RCurl_1.98-1.14             rappdirs_0.3.3              circlize_0.4.16            
+##  [28] GenomeInfoDbData_1.2.10     codetools_0.2-19            DelayedArray_0.26.7        
+##  [31] xml2_1.3.6                  tidyselect_1.2.1            shape_1.4.6.1              
+##  [34] farver_2.1.1                BiocFileCache_2.8.0         jsonlite_1.8.8             
+##  [37] GetoptLong_1.0.5            iterators_1.0.14            bbmle_1.0.25.1             
+##  [40] foreach_1.5.2               tools_4.3.3                 progress_1.2.3             
+##  [43] Rcpp_1.0.12                 glue_1.7.0                  gridExtra_2.3              
+##  [46] mnormt_2.1.1                SparseArray_1.2.4           xfun_0.43                  
+##  [49] numDeriv_2016.8-1.1         withr_3.0.0                 fastmap_1.1.1              
+##  [52] latticeExtra_0.6-30         fansi_1.0.6                 digest_0.6.35              
+##  [55] timechange_0.3.0            R6_2.5.1                    colorspace_2.1-0           
+##  [58] jpeg_0.1-10                 biomaRt_2.56.1              RSQLite_2.3.6              
+##  [61] R.methodsS3_1.8.2           utf8_1.2.4                  generics_0.1.3             
+##  [64] data.table_1.15.4           prettyunits_1.2.0           httr_1.4.7                 
+##  [67] htmlwidgets_1.6.4           S4Arrays_1.2.1              pkgconfig_2.0.3            
+##  [70] gtable_0.3.4                blob_1.2.4                  ComplexHeatmap_2.18.0      
+##  [73] hwriter_1.3.2.1             htmltools_0.5.8.1           carData_3.0-5              
+##  [76] clue_0.3-65                 scales_1.3.0                logging_0.10-108           
+##  [79] hues_0.2.0                  png_0.1-8                   snakecase_0.11.1           
+##  [82] ggdendro_0.2.0              rstudioapi_0.16.0           tzdb_0.4.0                 
+##  [85] rjson_0.2.21                coda_0.19-4.1               nlme_3.1-164               
+##  [88] curl_5.2.1                  bdsmatrix_1.3-7             cachem_1.0.8               
+##  [91] GlobalOptions_0.1.2         parallel_4.3.3              restfulr_0.0.15            
+##  [94] apeglm_1.24.0               pillar_1.9.0                grid_4.3.3                 
+##  [97] reshape_0.8.9               vctrs_0.6.5                 car_3.1-2                  
+## [100] dbplyr_2.5.0                cluster_2.1.6               evaluate_0.23              
+## [103] GenomicFeatures_1.52.2      mvtnorm_1.2-4               cli_3.6.2                  
+## [106] locfit_1.5-9.9              compiler_4.3.3              rlang_1.1.3                
+## [109] crayon_1.5.2                ggsignif_0.6.4              labeling_0.4.3             
+## [112] interp_1.1-6                aroma.light_3.32.0          emdbook_1.3.13             
+## [115] plyr_1.8.9                  stringi_1.8.3               psych_2.4.3                
+## [118] deldir_2.0-4                babelgene_22.9              munsell_0.5.1              
+## [121] Matrix_1.6-5                hms_1.1.3                   bit64_4.0.5                
+## [124] KEGGREST_1.40.1             statmod_1.5.0               highr_0.10                 
+## [127] broom_1.0.5                 memoise_2.0.1               bslib_0.7.0                
+## [130] fastmatch_1.1-4             bit_4.0.5
+
+ + + +
+
+ +
+ + + + + + + + + + + + + + + + + diff --git a/inst/templates/rnaseq/de/Multiplicative_DE_docs.md b/inst/templates/rnaseq/de/Multiplicative_DE_docs.md new file mode 100644 index 0000000..aa35021 --- /dev/null +++ b/inst/templates/rnaseq/de/Multiplicative_DE_docs.md @@ -0,0 +1,129 @@ +# Overview + +This is an example of complex DE analysis with multiple covariates with multiple levels. + +We have the SEX variable (2 levels) and the GENOTYPE VARIABLE (4 levels) + +# Intercept Analysis + +``` +# Model design and creating dds object from the dataset +design = ~sex + genotype + sex:genotype +dds <- DESeqDataSet(se_Striatum, design) +``` + +## Filtering lowly expressed genes +We are filtering out genes with fewer than 10 raw counts in total and are present in fewer than 3 samples. + +``` +keep <- rowSums(counts(dds)>=10) >=4 +dds <- dds[keep, ] +#dds # comment out this line to print the dds object and compare the dimension of the dataset before and after filtering is applied. +``` + +setting up WT as reference genotype and Male and reference sex. Otherwise DESeq2 will use the conditions in their alphabetical order. + +``` +dds$genotype <- relevel(dds$genotype, ref = "WT") +dds$sex <- relevel(dds$sex, ref = "Male") + +#Checking model design and reference condition comment out the three lines below to print the design and order of genotype and sex +design(dds) +levels(dds$genotype) +levels(dds$sex) + +#estimating size factors for normalization and fitting our model with DESeq model +dds <- estimateSizeFactors(dds) +dds <- DESeq(dds) +resultsNames(dds) #This will print out the name of coefficients being compared, comment it out to view + +# get coefficient matrix +mod_mat <- model.matrix(design(dds), data = colData(dds)) +mod_mat + +(Intercept) sexFemale genotypeCR3KO genotypeQ175 genotypeQ175_CR3KO sexFemale:genotypeCR3KO sexFemale:genotypeQ175 sexFemale:genotypeQ175_CR3KO +a10_st_q175_m_r1 1 0 0 1 0 0 0 0 +a12_st_q175_f_r1 1 1 0 1 0 0 1 0 +a14_st_wt_m_r1 1 0 0 0 0 0 0 0 +a16_st_wt_f_r1 1 1 0 0 0 0 0 0 +``` + + +coefficient weights extracted from the mod_mat above + +``` +WT_M <- c(1, 0, 0, 0, 0, 0, 0, 0) +WT_F <- c(1, 1, 0, 0, 0, 0, 0, 0) +WTCR3ko_M <- c(1, 0, 1, 0, 0, 0, 0, 0) +WTCR3ko_F <- c( 1, 1, 1, 0, 0, 1, 0, 0) +Q175_M <- c(1, 0, 0, 1, 0, 0, 0, 0) +Q175_F <- c(1, 1, 0, 1, 0, 0, 1, 0) +Q175CR3ko_M <- c(1, 0, 0, 0, 1, 0, 0, 0) +Q175CR3ko_F <- c(1, 1, 0, 0, 1, 0, 0, 1) +``` + +# Differential gene expression analysis + +## Comp_2: Female vs Male : WTCR3ko +``` +comp2_F.v.M_WTCR3ko <- results(dds, contrast = c(WTCR3ko_F - WTCR3ko_M)) +comp2_F.v.M_WTCR3ko_shrink <- lfcShrink(dds, contrast = c(WTCR3ko_F - WTCR3ko_M), type = "ashr") +summary(comp2_F.v.M_WTCR3ko) +``` + +## Comp_3: Female vs Male : Q175 +``` +comp3_F.v.M_Q175 <- results(dds, contrast = c(Q175_F - Q175_M)) +comp3_F.v.M_Q175_shrink <- lfcShrink(dds, contrast = c(Q175_F - Q175_M), type = "ashr") +summary(comp3_F.v.M_Q175) +``` + +## Comp_5: WTCR3ko vs WT : Male +```{r} +comp5_WTCR3ko.v.WT_Male <- results(dds, contrast = c(WTCR3ko_M - WT_M)) +comp5_WTCR3ko.v.WT_Male_shrink <- lfcShrink(dds, contrast = c(WTCR3ko_M - WT_M), type = "ashr") +summary(comp5_WTCR3ko.v.WT_Male) +``` + +## Comp_6: WTCR3ko vs WT : Female +```{r} +comp6_WTCR3ko.v.WT_Female <- results(dds, contrast = c(WTCR3ko_F - WT_F)) +comp6_WTCR3ko.v.WT_Female_shrink <- lfcShrink(dds, contrast = c(WTCR3ko_F - WT_F), type = "ashr") +summary(comp6_WTCR3ko.v.WT_Female) +``` + +## Comp_11: Q175CR3ko vs Q175 : Male +``` +comp11_Q175CR3ko.v.Q175_Male <- results(dds, contrast = c(Q175CR3ko_M - Q175_M)) +comp11_Q175CR3ko.v.Q175_Male_shrink <- lfcShrink(dds, contrast = c(Q175CR3ko_M - Q175_M), type = "ashr") +summary(comp11_Q175CR3ko.v.Q175_Male) +``` + +## Comp_12: Q175CR3ko vs Q175 : Female +``` +comp12_Q175CR3ko.v.Q175_Female <- results(dds, contrast = c(Q175CR3ko_F - Q175_F)) +comp12_Q175CR3ko.v.Q175_Female_shrink <- lfcShrink(dds, contrast = c(Q175CR3ko_F - Q175_F), type = "ashr") +summary(comp12_Q175CR3ko.v.Q175_Female) +``` + +## Comp_15: (Q175CR3ko-Q176) - (WTCR3ko - WT) : Male + +Does the CR3 knockout in Q175 differ from CR3 knockout in WT for Males? + +``` +comp15_CR3koinQ175.v.CR3koinWT_Male <- results(dds, + contrast = c(Q175CR3ko_M - Q175_M) - (WTCR3ko_M - WT_M)) +comp15_CR3koinQ175.v.CR3koinWT_Male_shrink <- lfcShrink(dds, + contrast = c(Q175CR3ko_M - Q175_M) - (WTCR3ko_M - WT_M), type = "ashr") +summary(comp15_CR3koinQ175.v.CR3koinWT_Male) +``` + +## Comp_17: (WTCR3koall) - (WTall) + +Does the average of the samples in WTCR3KO differ from average of the samples in WT + +``` +comp17_WTCR3all.v.WTall <- results(dds, contrast = c(WTCR3ko_M + WTCR3ko_F)/2 - (WT_M + WT_F)/2) +comp17_WTCR3all.v.WTall_shrink <- lfcShrink(dds, contrast = c((WTCR3ko_M + WTCR3ko_F)/2 - (WT_M + WT_F)/2), type = "ashr") +summary(comp17_WTCR3all.v.WTall) +``` diff --git a/inst/templates/rnaseq/de/PCA_variance_analysis.Rmd b/inst/templates/rnaseq/de/PCA_variance_analysis.Rmd new file mode 100644 index 0000000..101c6b8 --- /dev/null +++ b/inst/templates/rnaseq/de/PCA_variance_analysis.Rmd @@ -0,0 +1,32 @@ +--- +title: "PCA with variance analysis" +author: "Harvard Chan Bioinformatics Core" +--- + +```{r } +library(DEGreport) +library(ggplot2) +library(ggforce) + +data("bcbio_vsd_data") + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) + +pca <- degPCA(assay(bcbio_vsd_data), colData(bcbio_vsd_data), + condition = "sample_type", name = "sample", data = T) + +pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vst), " genes")) + + theme(plot.title=element_text(hjust=0.5)) + + geom_mark_ellipse(aes(color = sample_type)) +``` +```{r} +# NOTE:This is not confirmed to be a valid test but it could help to understand the data +library(vegan) +vare.disa <- vegdist(t(assay(bcbio_vsd_data))) + +mod = betadisper(vare.disa, colData(bcbio_vsd_data)[['sample_type']]) +anova(mod) +``` + + diff --git a/inst/templates/rnaseq/de/load_data.R b/inst/templates/rnaseq/de/load_data.R new file mode 100644 index 0000000..8a1d297 --- /dev/null +++ b/inst/templates/rnaseq/de/load_data.R @@ -0,0 +1,146 @@ +library(tidyverse) +library(SummarizedExperiment) +library(janitor) +load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){ + + # bcbio input + if (!is.na(se_object)){ + + se <- readRDS(se_object) + metrics <- metadata(se)$metrics %>% as.data.frame() + # left_join(coldata %>% rownames_to_column('sample')) %>% column_to_rownames('sample') + } else { #nf-core input + + # Get metrics from nf-core into bcbio like table + # many metrics are already in the Genereal Table of MultiQC, this reads the file + metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) + + # we get some more metrics from Qualimap and rename columns + metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) + metrics <- metrics %>% full_join(metrics_qualimap) + metrics <- metrics %>% + clean_names() %>% + dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) + + # This uses the fastqc metrics to get total reads + total_reads <- metrics %>% + dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>% + remove_empty(which = 'cols') %>% + dplyr::rename(single_sample = sample) %>% + mutate(sample = gsub('_[12]+$', '', single_sample)) %>% + group_by(sample) %>% + summarize(total_reads = sum(fastqc_raw_total_sequences)) + + # This renames to user-friendly names the metrics columns + metrics <- metrics %>% + dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% + remove_empty(which = 'cols') %>% + full_join(total_reads) %>% + mutate(mapped_reads = samtools_reads_mapped) %>% + mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% + mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% + mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% + mutate(x5_3_bias = qualimap_5_3_bias) + + # Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case + gtf <- NULL + if (genome =="other"){ + gtf <- gtf_fn + }else{ + if (genome == "hg38") { + gtf <- "hg38.rna.gtf.gz" + } else if (genome == "mm10") { + gtf <- "mm10.rna.gtf.gz" + } else if (genome == "mm39") { + gtf <- "mm39.rna.gtf.gz" + } + gtf <- system.file("extdata", "annotation", + gtf, + package="bcbioR") + } + if (is.null(gtf)) { + print("No genome provided! Please add it at the top of this Rmd") + } + + gtf=rtracklayer::import(gtf) + + + one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) + another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) + biotype=NULL + if(length(one)==1){ + biotype=one + }else if(length(another)==1){ + biotype=another + }else{ + warning("No gene biotype founded") + } + + if (!is.null(biotype)){ + annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] + rRNA=grepl("rRNA|tRNA",annotation[[biotype]]) + genes=intersect(annotation[rRNA,"gene_id"],row.names(counts)) + ratio=data.frame(sample=colnames(counts), + r_and_t_rna_rate=colSums(counts[genes,])/colSums(counts)) + metrics = left_join(metrics, ratio, by="sample") + }else{ + metrics[["r_and_t_rna_rate"]] <- NA + } + + # if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){ + # metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna) + # }else{ + # metrics[["r_rna_rate"]] <- NA + # } + metrics=metrics[,c("sample","mapped_reads","exonic_rate","intronic_rate", + "total_reads", + "x5_3_bias", "r_and_t_rna_rate","intergenic_rate")] + } + metrics$sample <- make.names(metrics$sample) + rownames(metrics) <- metrics$sample + return(metrics) +} + +load_coldata <- function(coldata_fn, column, numerator, denominator, subset_column = NULL, subset_value = NULL){ + coldata=read.csv(coldata_fn) %>% + dplyr::select(!matches("fastq") & !matches("strandness")) %>% + distinct() + if('description' %in% names(coldata)){ + coldata$sample <- tolower(coldata$description) + } + coldata <- coldata %>% distinct(sample, .keep_all = T) + stopifnot(column %in% names(coldata)) + + # use only some samples, by default use all + if (!is.null(subset_column)){ + coldata <- coldata[coldata[[paste(subset_column)]] == subset_value, ] + } + #coldata <- coldata[coldata[[paste(column)]] %in% c(numerator, denominator), ] + #browser() + coldata$sample <- make.names(coldata$sample) + rownames(coldata) <- coldata$sample + coldata$description <- coldata$sample + + coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) + + return(coldata) +} + +load_counts <- function(counts_fn){ + + # bcbio input + if(grepl('csv', counts_fn)){ + counts <- read_csv(counts_fn) %>% + mutate(gene = str_replace(gene, pattern = "\\.[0-9]+$", "")) %>% + column_to_rownames('gene') + colnames(counts) = tolower(colnames(counts)) + return(counts) + } else { # nf-core input + counts <- read_tsv(counts_fn) %>% dplyr::select(-gene_name) %>% + mutate(gene_id = str_replace(gene_id, pattern = "\\.[0-9]+$", "")) %>% + column_to_rownames('gene_id') %>% round + + return(counts) + } + +} diff --git a/inst/templates/rnaseq/de/params_de-example.R b/inst/templates/rnaseq/de/params_de-example.R new file mode 100644 index 0000000..cc75ad2 --- /dev/null +++ b/inst/templates/rnaseq/de/params_de-example.R @@ -0,0 +1,18 @@ +# project params +date = "YYYYMMDD" +basedir <- './' # where to write down output files + +# params for bcbio +# coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" +# counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' +# se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") +# + +# Example data +coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' +counts_fn=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv') +# This folder is in the output directory inside multiqc folder +multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' +# This file is inside the genome folder in the output directory +gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' +se_object = NA diff --git a/inst/templates/rnaseq/de/params_de.R b/inst/templates/rnaseq/de/params_de.R new file mode 100644 index 0000000..8426428 --- /dev/null +++ b/inst/templates/rnaseq/de/params_de.R @@ -0,0 +1,22 @@ +# project params +date = "YYYYMMDD" +basedir <- './' # where to write down output files + +# params for bcbio +# coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" +# counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' +# se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") +# + +# params for nfcore +# Your data +# This is the file used to run nf-core or compatible to that +coldata_fn='/Path/to/metadata/meta.csv' +# This file is inside star_salmon/ folder +counts_fn='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv' +# This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder +multiqc_data_dir='/path/to/nf-core/output/star_salmon/multiqc_report_data' +# This file is inside the genome folder in the output directory, use this only non-model organism +# gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' +se_object = NA + diff --git a/inst/templates/rnaseq/de/run_markdown.R b/inst/templates/rnaseq/de/run_markdown.R new file mode 100644 index 0000000..79e15a0 --- /dev/null +++ b/inst/templates/rnaseq/de/run_markdown.R @@ -0,0 +1,32 @@ +library(rmarkdown) +# set working directory to this file before using the function + + +# set directory to this file folder +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +# example running with test data +render_de <- function(column, numerator, denominator, subset_value = NULL, + params_file = 'params_de-testdata.R'){ + + rmarkdown::render(input = "DEG.Rmd", + output_dir = ".", + output_format = "html_document", + output_file = ifelse(!is.null(subset_value), + paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), + paste0('DE_', numerator, '_vs_', denominator, '.html') + ), + clean = TRUE, + envir = new.env(), + params = list( + column = column, + subset_value = subset_value, + numerator = numerator, + denominator = denominator, + params_file = params_file, + project_file = '../information.R', + functions_file = 'load_data.R' + ) + ) +} +#Example data +render_de("sample_type","tumor", "normal") diff --git a/inst/templates/rnaseq/org/hcbc_README.md b/inst/templates/rnaseq/org/hcbc_README.md new file mode 100644 index 0000000..50f8f16 --- /dev/null +++ b/inst/templates/rnaseq/org/hcbc_README.md @@ -0,0 +1,75 @@ +# Guideline for RNAseq downstream analysis + +Make sure there is a project name for this. + +## Run data with nf-core rnaseq + +- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) +- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input/rawdata` folder +- Prepare the CSV file according this [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: + +```csv +sample,fastq_1,fastq_2,strandedness +CONTROL_REP1,s3path/AEG588A1_S1_L002_R1_001.fastq.gz,s3path/AEG588A1_S1_L002_R2_001.fastq.gz,auto +CONTROL_REP1,s3path/AEG588A1_S1_L003_R1_001.fastq.gz,s3path/AEG588A1_S1_L003_R2_001.fastq.gz,auto +CONTROL_REP1,s3path/AEG588A1_S1_L004_R1_001.fastq.gz,s3path/AEG588A1_S1_L004_R2_001.fastq.gz,auto +``` + +Use `bcbio_nfcore_check(csv_file)` to check the file is correct. + +You can add more columns to this file with more metadata, and use this file as the `coldata` file the templates. + +- Upload file to our `Datasets` in Seqera using the name of the project but starting with `nfcore-rnaseq` +- Go to `Launchpad`, select `nf-core_rnaseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` + - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 +- When pipeline is down, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder + +## Downstream analysis + +Please, modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. + +### QC + +`QC/QC.Rmd` is a template for QC metrics. Use `params_qc.R` for `bcbio` + or `QC/QC_nf-core.Rmd` `params_qc_nf-core.R` for `nf-core/rnaseq` outputs. + +Read instruction in the R and Rmd scripts to render it. + +### DE + +`DE/DEG.Rmd` is a template for two groups comparison. `params_de.R` has the information of the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. + +On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: + +- sub-setting data +- two groups comparison +- volcano plot +- MA plot +- Pathway analysis +- Tables + +There are some code related to alternative analysis: + +- `DE/Multiplicative_DE_docs.md` that shows some cases when there is multiple variables in the model with multiple levels: sex (2 levels) and genotype (4 levels) + +## DropBox + +- In `reports/QC` + - [ ] copy `bcbio-se.rds` and `tximport-counts.csv` + - [ ] copy QC `Rmd/R/html/figures` +- In `reports/DE` + - [ ] Normalized counts for all genes x all samples (csv format) +- In `reports/DE`, for *each analysis*: + - **Note** For multiple comparisons/analysis, do a single report/template if possible in the parent folder using parameters whenever possible. + - Create a folder with the comparison names in the files. Numbering by comparison (`01.1_DE_comp1`, `01.2_DE_comp2`, etc.). If you’re running multiple models for the same comparison, append `_M#`. Add the following files under each folder: + - [ ] Normalized count table with the samples used in this analysis/comparison. + - [ ] Full results `DESeq2` for all genes (csv format) with annotation columns appended. + - [ ] Significant genes results file (subset of annotated full results by chosen p-value and LFC). Separate files will be created for each individual contrast. + - [ ] Significant genes results file as described above, but additionally append columns containing normalized count values for each sample. + - Make sure to append the gene symbols to these tables so the researcher can interpret the results. + +## GitHub + +- [ ] Push all `*Rmd` `*R` files used for the *QC* and *DE* analysis respecting folder structure. + +Please, ignore `*html/figures/csv` and any output of the code. diff --git a/inst/templates/rnaseq/qc/QC.Rmd b/inst/templates/rnaseq/qc/QC.Rmd new file mode 100644 index 0000000..48afe76 --- /dev/null +++ b/inst/templates/rnaseq/qc/QC.Rmd @@ -0,0 +1,410 @@ +--- +title: "Quality Control" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + params_file: params_qc.R + project_file: ../information.R +--- + + +```{r source_params, echo = F} +source(params$params_file) +source(params$project_file) +``` + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) +library(knitr) +library(DESeq2) +library(DEGreport) +library(ggrepel) +library(pheatmap) +# library(RColorBrewer) +library(DT) +library(pheatmap) +library(bcbioR) +ggplot2::theme_set(theme_light(base_size = 14)) +opts_chunk[["set"]]( + cache = FALSE, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4) +``` + + +```{r subchunkify, echo=FALSE, eval=FALSE} +#' Create sub-chunks for plots +#' +#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots +#' +#' @param pl a plot object +#' @param fig.height figure height +#' @param fig.width figure width +#' @param chunk_name name of the chunk +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' +subchunkify = function(pl, + fig.height = 7, + fig.width = 5, + chunk_name = 'plot') { + pl_deparsed = paste0(deparse(function() { + pl + }), collapse = '') + + sub_chunk = paste0( + "```{r ", + chunk_name, + ", fig.height=", + fig.height, + ", fig.width=", + fig.width, + ", dpi=72", + ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}", + "\n(", + pl_deparsed, + ")()", + "\n```" + ) + + cat(knitr::knit( + text = knitr::knit_expand(text = sub_chunk), + quiet = TRUE + )) +} + +``` + + +```{r sanitize-datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + + +# Samples and metadata + +```{r load_metadata} +meta_df=read_csv(metadata_fn) %>% mutate(sample = tolower(description)) %>% + dplyr::select(-description) + +ggplot(meta_df, aes(sample_type, fill = sample_type)) + + geom_bar() + ylab("") + xlab("") + + scale_fill_cb_friendly() +``` + + +```{r show-metadata} +se <- readRDS(se_object) #local + + +metrics <- metadata(se)$metrics %>% + full_join(meta_df , by = c("sample" = "sample")) + +meta_sm <- meta_df %>% + as.data.frame() %>% + column_to_rownames("sample") + +meta_sm %>% sanitize_datatable() + +``` + +# Read metrics {.tabset} + +## Total reads + +Here, we want to see consistency and a minimum of 20 million reads. + +```{r plot_total_reads} +metrics %>% + ggplot(aes(x = sample_type, + y = total_reads, + color = sample_type)) + + geom_point(alpha=0.5) + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_color_cb_friendly() + + ggtitle("Total reads") + +``` + +```{r calc_min_max_pct_mapped} +#get min percent mapped reads for reference +min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1) +max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) +``` + +## Mapping rate + +The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping. These samples have mapping rates (`r min_pct_mapped` - `r max_pct_mapped`%). + +```{r plot_mapping_rate} +metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1) +metrics %>% + ggplot(aes(x = sample_type, + y = mapped_reads_pct, + color = sample_type)) + + geom_point() + + coord_flip() + + scale_color_cb_friendly() + + ylim(0, 100) + + ggtitle("Mapping rate") + + geom_hline(yintercept=70, color = cb_friendly_cols('blue')) +``` + + +## Number of genes detected + +The number of genes represented in every sample is expected to be consistent and over 20K (blue line). + +```{r plot_genes_detected} +genes_detected <- colSums(assays(se)[["raw"]] > 0) %>% enframe() +sample_names <- metrics[,c("sample"), drop=F] +genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample")) +genes_detected <- genes_detected %>% group_by(name) +genes_detected <- summarise(genes_detected, + n_genes = max(value)) + +metrics <- metrics %>% + left_join(genes_detected, by = c("sample" = "name")) +ggplot(metrics,aes(x = sample_type, + y = n_genes, color = sample_type)) + + geom_point() + + coord_flip() + + scale_color_cb_friendly() + + ggtitle("Number of genes") + + ylab("Number of genes") + + xlab("") + + geom_hline(yintercept=20000, color = cb_friendly_cols('blue')) +``` + + +## Gene detection saturation + +This plot shows how complex the samples are. We expect samples with more reads to detect more genes. + +```{r plot_gene_saturation} +metrics %>% + ggplot(aes(x = total_reads, + y = n_genes, + color = sample_type)) + + geom_point()+ + scale_x_log10() + + scale_color_cb_friendly() + + ggtitle("Gene saturation") + + ylab("Number of genes") +``` + +## Exonic mapping rate + +Here we are looking for consistency, and exonic mapping rates around 70% or 75% (blue and red lines, respectively). + +```{r plot_exonic_mapping_rate} +metrics %>% + ggplot(aes(x = sample_type, + y = exonic_rate * 100, + color = sample_type)) + + geom_point() + + ylab("Exonic rate %") + + ggtitle("Exonic mapping rate") + + scale_color_cb_friendly() + + coord_flip() + + xlab("") + + ylim(c(0,100)) + + geom_hline(yintercept=70, color = cb_friendly_cols('blue')) + + geom_hline(yintercept=75, color = cb_friendly_cols('brown')) +``` + +## Intronic mapping rate + +Here, we expect a low intronic mapping rate (≤ 15% - 20%) + +```{r plot_intronic_mapping_rate} +metrics %>% + ggplot(aes(x = sample_type, + y = intronic_rate * 100, + color = sample_type)) + + geom_point() + + ylab("Intronic rate %") + + ggtitle("Intronic mapping rate") + + scale_color_cb_friendly() + + coord_flip() + + xlab("") + + ylim(c(0,100)) + + geom_hline(yintercept=20, color = cb_friendly_cols('blue')) + + geom_hline(yintercept=15, color = cb_friendly_cols('brown')) +``` + +## Intergenic mapping rate + +Here, we expect a low intergenic mapping rate, which is true for all samples. + +```{r plot_intergenic_mapping_rate} +metrics %>% + ggplot(aes(x = sample_type, + y = intergenic_rate * 100, + color = sample_type)) + + geom_point() + + ylab("Intergenic rate %") + + ggtitle("Intergenic mapping rate") + + coord_flip() + + scale_color_cb_friendly() + + ylim(c(0, 100)) +``` + +## rRNA mapping rate + +Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10% + +```{r plot_rrna_mapping_rate} +# for some bad samples it could be > 50% +rrna_ylim <- max(round(metrics$r_rna_rate*100, 2)) + 10 +metrics %>% + ggplot(aes(x = sample_type, + y = r_rna_rate * 100, + color = sample_type)) + + geom_point() + + ylab("rRNA rate, %")+ + ylim(0, rrna_ylim) + + ggtitle("rRNA mapping rate") + + coord_flip() + + scale_color_cb_friendly() +``` + +## 5'->3' bias + +There should be little bias, i.e. the values should be close to 1, or at least consistent among samples + +```{r plot_53_bias} +metrics %>% + ggplot(aes(x = sample_type, + y = x5_3_bias, + color = sample_type)) + + geom_point() + + ggtitle("5'-3' bias") + + coord_flip() + + ylim(c(0.5,1.5)) + + scale_color_cb_friendly()+ + geom_hline(yintercept=1, color = cb_friendly_cols('blue')) +``` + +## Counts per gene - all genes + +We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar + +```{r plot_counts_per_gene} +metrics_small <- metrics %>% dplyr::select(sample, sample_type) +metrics_small <- left_join(sample_names, metrics_small) + +counts <- + assays(se)[["raw"]] %>% + as_tibble() %>% + filter(rowSums(.)!=0) %>% + gather(name, counts) + +counts <- left_join(counts, metrics, by = c("name" = "sample")) + +ggplot(counts, aes(sample_type, + log2(counts+1), + fill = sample_type)) + + geom_boxplot() + + scale_fill_cb_friendly() + + ggtitle("Counts per gene, all non-zero genes") + + scale_color_cb_friendly() +``` + + +# Sample similarity analysis + +In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA). + +## Principal component analysis (PCA) {.tabset} + +Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells). + + +```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} +raw_counts <- assays(se)[["raw"]] %>% + as_tibble() %>% + filter(rowSums(.)!=0) %>% + as.matrix() + +vst <- vst(raw_counts) + +#fix samples names +coldat_for_pca <- as.data.frame(metrics) +rownames(coldat_for_pca) <- coldat_for_pca$sample +coldat_for_pca <- coldat_for_pca[colnames(raw_counts),] +pca1 <- degPCA(vst, coldat_for_pca, + condition = "sample_type", data = T)[["plot"]] +pca2 <- degPCA(vst, coldat_for_pca, + condition = "sample_type", data = T, pc1="PC3", pc2="PC4")[["plot"]] + +pca1 + scale_color_cb_friendly() +pca2 + scale_color_cb_friendly() +``` + + +```{r, eval=FALSE} +variables=degCovariates(vst, coldat_for_pca) +``` + + +```{r clustering fig, fig.width = 10, fig.asp = .62} +## Hierarchical clustering + +vst_cor <- cor(vst) + +annotation_cols <- cb_friendly_pal('grey')(length(unique(coldat_for_pca$sample_type))) +names(annotation_cols) <- unique(coldat_for_pca$sample_type) + +p <- pheatmap(vst_cor, + annotation = coldat_for_pca %>% select(sample_type) %>% mutate(sample_type = as.factor(sample_type)), + show_rownames = T, + show_colnames = T, + color = cb_friendly_pal('heatmap')(15), + annotation_colors = list(sample_type = annotation_cols) +) +p + +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/rnaseq/qc/QC_nf-core.Rmd b/inst/templates/rnaseq/qc/QC_nf-core.Rmd new file mode 100644 index 0000000..b376eb3 --- /dev/null +++ b/inst/templates/rnaseq/qc/QC_nf-core.Rmd @@ -0,0 +1,583 @@ +--- +title: "Quality Control" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + # Put hg38, mm10, mm39, or other + genome: hg38 + factor_of_interest: sample_type +--- + +```{r, echo = F} +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +``` + +```{r} +metadata_fn={{metadata_fn}} +se_object={{se_object}} +# This folder is in the output directory inside multiqc folder +multiqc_data_dir={{multiqc_data_dir}} +# This file is inside the genome folder in the output directory +gtf_fn={{gtf_fn}} +``` + +```{r source_params, echo = F} +#knitr::opts_knit$set(root.dir = getSourceEditorContext()$path) +# 1. set up factor_of_interest parameter from parameter above or manualy +# this is used to color plots, it needs to be part of the metadata +factor_of_interest=params$factor_of_interest +genome=params$genome +# 2. Set input files in this file +# This is the file used to run nf-core or compatible to that +metadata_fn='/Path/to/metadata/meta.csv' +# This file is inside star_salmon/ folder +se_object='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.rds' +# This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder +multiqc_data_dir='/path/to/nf-core/output/star_salmon/multiqc_report_data' +# This file is inside the genome folder in the output directory, use this only for non-model organism +# gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' +``` + +# Overview + +{{ project }} + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) +library(knitr) +library(rtracklayer) +library(DESeq2) +library(DEGreport) +library(ggrepel) +# library(RColorBrewer) +library(DT) +library(pheatmap) +library(bcbioR) +library(janitor) +ggplot2::theme_set(theme_light(base_size = 14)) +opts_chunk[["set"]]( + cache = FALSE, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 6) +``` + + +```{r subchunkify, echo=FALSE, eval=FALSE} +#' Create sub-chunks for plots +#' +#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots +#' +#' @param pl a plot object +#' @param fig.height figure height +#' @param fig.width figure width +#' @param chunk_name name of the chunk +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' +subchunkify = function(pl, + fig.height = 7, + fig.width = 5, + chunk_name = 'plot') { + pl_deparsed = paste0(deparse(function() { + pl + }), collapse = '') + + sub_chunk = paste0( + "```{r ", + chunk_name, + ", fig.height=", + fig.height, + ", fig.width=", + fig.width, + ", dpi=72", + ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}", + "\n(", + pl_deparsed, + ")()", + "\n```" + ) + + cat(knitr::knit( + text = knitr::knit_expand(text = sub_chunk), + quiet = TRUE + )) +} + +``` + + +```{r sanitize-datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + + +# Samples and metadata + + +```{r load_metadata} + +meta_df=read_csv(metadata_fn) %>% + arrange(.data[[factor_of_interest]]) %>% + distinct(sample, .keep_all = T) %>% + dplyr::select(!matches("fastq"), !matches("strandness")) +meta_df$sample <- make.names(meta_df$sample) +order <- meta_df$sample + +# remove some columns +meta_df <- data.frame(meta_df[,!(colnames(meta_df) %in% c("fastq_1", "fastq_2", "strandedness"))]) + + +ggplot(meta_df, aes(.data[[factor_of_interest]], + fill = .data[[factor_of_interest]])) + + geom_bar() + ylab("") + xlab("") + ylab("# of samples") + + scale_fill_cb_friendly() +``` + + +```{r} +# read counts from SE object +se <- readRDS(se_object) +raw_counts <- assays(se)[["counts"]] %>% round() %>% + as.matrix() +raw_counts=raw_counts[rowSums(raw_counts)!=0,] +``` + +```{r prepare metrics} +# Get metrics from nf-core into bcbio like table +# many metrics are already in the Genereal Table of MultiQC, this reads the file +metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) + +# we get some more metrics from Qualimap and rename columns +metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) +metrics <- metrics %>% full_join(metrics_qualimap) +metrics <- metrics %>% + clean_names() %>% + dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) + +# This uses the fastqc metrics to get total reads +total_reads <- metrics %>% + dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>% + remove_empty(which = 'cols') %>% + dplyr::rename(single_sample = sample) %>% + mutate(sample = gsub('_[12]+$', '', single_sample)) %>% + group_by(sample) %>% + summarize(total_reads = sum(fastqc_raw_total_sequences)) + +# This renames to user-friendly names the metrics columns +metrics <- metrics %>% + dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% + remove_empty(which = 'cols') %>% + full_join(total_reads) %>% + mutate(mapped_reads = samtools_reads_mapped) %>% + mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% + mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% + mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% + mutate(x5_3_bias = qualimap_5_3_bias) + +# Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case +gtf <- NULL +if (genome =="other"){ + gtf <- gtf_fn +}else{ + if (genome == "hg38") { + gtf <- "hg38.rna.gtf.gz" + } else if (genome == "mm10") { + gtf <- "mm10.rna.gtf.gz" + } else if (genome == "mm39") { + gtf <- "mm39.rna.gtf.gz" + } + gtf <- system.file("extdata", "annotation", + gtf, + package="bcbioR") +} +if (is.null(gtf)) { + print("No genome provided! Please add it at the top of this Rmd") +} + +gtf=rtracklayer::import(gtf) + +one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) +another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) +biotype=NULL +if(length(one)==1){ + biotype=one +}else if(length(another)==1){ + biotype=another +}else{ + warning("No gene biotype founded") +} + +if (!is.null(biotype)){ + annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] + rRNA=grepl("rRNA|tRNA",annotation[[biotype]]) + genes=intersect(annotation[rRNA,"gene_id"],row.names(raw_counts)) + ratio=data.frame(sample=colnames(raw_counts), + r_and_t_rna_rate=colSums(raw_counts[genes,])/colSums(raw_counts)) + metrics = left_join(metrics, ratio, by="sample") +}else{ + metrics[["r_and_t_rna_rate"]] <- NA +} + +# if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){ +# metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna) +# }else{ +# metrics[["r_rna_rate"]] <- NA +# } +metrics=metrics[,c("sample","mapped_reads","exonic_rate","intronic_rate", + "total_reads", + "x5_3_bias", "r_and_t_rna_rate","intergenic_rate")] +metrics$sample <- make.names(metrics$sample) +metrics <- metrics %>% + full_join(meta_df , by = c("sample" = "sample")) %>% + dplyr::select(where(~!all(is.na(.)))) + +``` + +```{r show-metadata} +meta_sm <- meta_df %>% + as.data.frame() %>% + column_to_rownames("sample") + +meta_sm %>% sanitize_datatable() + +``` + +# Read metrics {.tabset} + +## Total reads + +Here, we want to see consistency and a minimum of 20 million reads (the grey line). + +```{r plot_total_reads} +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = total_reads, + fill = .data[[factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Total reads") + + geom_hline(yintercept=20000000, color = "grey", size=2) + +metrics %>% + ggplot(aes(x = .data[[factor_of_interest]], + y = total_reads, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_color_cb_friendly() + xlab("") + + ggtitle("Total reads") +``` + +```{r calc_min_max_pct_mapped} +#get min percent mapped reads for reference +min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1) +max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) +``` + +## Mapping rate + +The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping (the grey line). These samples have mapping rates: `r min_pct_mapped` - `r max_pct_mapped`%. + +```{r plot_mapping_rate} +metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1) +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = mapped_reads_pct, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + coord_flip() + + scale_color_cb_friendly() + + ylim(0, 100) + + ggtitle("Mapping rate") + xlab("") + + geom_hline(yintercept=70, color = "grey", size=2) +``` + + +## Number of genes detected + +The number of genes represented in every sample is expected to be consistent and over 20K (grey line). + +```{r calc_genes_detected} +genes_detected <- colSums(assays(se)[["counts"]] > 0) %>% enframe() +sample_names <- metrics[,c("sample"), drop=F] +genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample")) +genes_detected <- genes_detected %>% group_by(name) +genes_detected <- summarise(genes_detected, + n_genes = max(value)) + +metrics <- metrics %>% + left_join(genes_detected, by = c("sample" = "name")) + +``` + + +```{r plot_genes_detected} +ggplot(metrics,aes(x = factor(sample, level = order), + y = n_genes, fill = .data[[factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_fill_cb_friendly() + + ggtitle("Number of genes") + + ylab("Number of genes") + + xlab("") + + geom_hline(yintercept=20000, color = "grey", size=2) + +metrics %>% + ggplot(aes(x = .data[[factor_of_interest]], + y = n_genes, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_color_cb_friendly() + xlab("") + + ggtitle("Total reads") + +``` + + + +## Gene detection saturation + +This plot shows how complex the samples are. We expect samples with more reads to detect more genes. + +```{r plot_gene_saturation} +metrics %>% + ggplot(aes(x = total_reads, + y = n_genes, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + scale_x_log10() + + scale_color_cb_friendly() + + ggtitle("Gene saturation") + + ylab("Number of genes") +``` + +## Exonic mapping rate + +Here we are looking for consistency, and exonic mapping rates around or above 70% (grey line). + +```{r plot_exonic_mapping_rate} +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = exonic_rate * 100, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + ylab("Exonic rate %") + + ggtitle("Exonic mapping rate") + + scale_color_cb_friendly() + + coord_flip() + + xlab("") + + ylim(c(0,100)) + + geom_hline(yintercept=70, color = "grey", size=2) +``` + +## Intronic mapping rate + +Here, we expect a low intronic mapping rate (≤ 15% - 20%). The grey line indicates 20%. + +```{r plot_intronic_mapping_rate} +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = intronic_rate * 100, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + ylab("Intronic rate %") + + ggtitle("Intronic mapping rate") + + scale_color_cb_friendly() + + coord_flip() + + xlab("") + + ylim(c(0,100)) + + geom_hline(yintercept=20, color = "grey", size=2) +``` + +## Intergenic mapping rate + +Here, we expect a low intergenic mapping rate, which is true for all samples. The grey line indicates 15% + +```{r plot_intergenic_mapping_rate} +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = intergenic_rate * 100, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + ylab("Intergenic rate %") + + ggtitle("Intergenic mapping rate") + + coord_flip() + xlab("") + + scale_color_cb_friendly() + + ylim(c(0, 100)) + + geom_hline(yintercept=15, color = "grey", size=2) +``` + +## tRNA/rRNA mapping rate + +Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10% (the grey line). + +```{r plot_rrna_mapping_rate} + +rrna_ylim <- max(round(metrics$r_and_t_rna_rate*100, 2)) + 10 +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = r_and_t_rna_rate * 100, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5) + + ylab("tRNA/rRNA rate, %")+ + ylim(0, rrna_ylim) + + ggtitle("tRNA/rRNA mapping rate") + + coord_flip() + + scale_color_cb_friendly() + + ylim(c(0, 100)) + xlab("") + + geom_hline(yintercept=10, color = "grey", size=2) +``` + +## 5'->3' bias + +There should be little bias, i.e. the values should be close to 1, or at least consistent among samples + +```{r plot_53_bias} +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = x5_3_bias, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + ggtitle("5'-3' bias") + + coord_flip() + + ylim(c(0.5,1.5)) + xlab("") + ylab("5'-3' bias") + + scale_color_cb_friendly()+ + geom_hline(yintercept=1, color = "grey", size=2) +``` + +## Counts per gene - all genes + +We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar + +```{r plot_counts_per_gene} +metrics_small <- metrics %>% dplyr::select(sample, .data[[factor_of_interest]]) +metrics_small <- left_join(sample_names, metrics_small) + +counts <- + assays(se)[["counts"]] %>% + as_tibble() %>% + filter(rowSums(.)!=0) %>% + gather(name, counts) + +counts <- left_join(counts, metrics_small, by = c("name" = "sample")) + +ggplot(counts, aes(factor(name, level = order), + log2(counts+1), + fill = .data[[factor_of_interest]])) + + geom_boxplot() + + scale_fill_cb_friendly() + + coord_flip() + xlab("") + + ggtitle("Counts per gene, all non-zero genes") + + scale_color_cb_friendly() +``` + + +# Sample similarity analysis + +In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA). + +## Principal component analysis (PCA) + +Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells). + + +```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} + +vst <- vst(raw_counts) + +coldat_for_pca <- as.data.frame(metrics) +rownames(coldat_for_pca) <- coldat_for_pca$sample +coldat_for_pca <- coldat_for_pca[colnames(raw_counts),] +pca1 <- degPCA(vst, coldat_for_pca, + condition = factor_of_interest, data = T)[["plot"]] +pca2 <- degPCA(vst, coldat_for_pca, + condition = factor_of_interest, data = T, pc1="PC3", pc2="PC4")[["plot"]] + + + +pca1 + scale_color_cb_friendly() +pca2 + scale_color_cb_friendly() + +``` + +# Covariates analysis + +When there are multiple factors that can influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. This method adapts the method described by Daily et al. for which they integrated a method to correlate covariates with principal components values to determine the importance of each factor. + +```{r covariate-plot,fig.height=12, fig.width=10} +## Remove non-useful columns output by nf-core +coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness", "qualimap_5_3_bias"))]) + +# Remove missing data +coldat_2 <- na.omit(coldat_2) +degCovariates(vst, metadata = coldat_2) +``` + +## Hierarchical clustering + +Inter-correlation analysis (ICA) is another way to look at how well samples +cluster by plotting the correlation between the expression profiles of the +samples. + +```{r clustering fig, fig.width = 10, fig.asp = .62} + +vst_cor <- cor(vst) + +colma=meta_df %>% as.data.frame() +rownames(colma) <- colma$sample +colma <- colma[rownames(vst_cor), ] +colma <- colma %>% dplyr::select(.data[[factor_of_interest]]) +anno_colors=lapply(colnames(colma), function(c){ + l.col=cb_friendly_pal('grey')(length(unique(colma[[c]]))) + names(l.col)=unique(colma[[c]]) + l.col +}) +names(anno_colors)=colnames(colma) + +p <- pheatmap(vst_cor, + annotation = colma, + annotation_colors = anno_colors, + show_rownames = T, + show_colnames = T, + color = cb_friendly_pal('heatmap')(15) + ) +p +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/rnaseq/qc/params_qc.R b/inst/templates/rnaseq/qc/params_qc.R new file mode 100644 index 0000000..84dabfa --- /dev/null +++ b/inst/templates/rnaseq/qc/params_qc.R @@ -0,0 +1,4 @@ +# info params + +metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv' +se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") diff --git a/inst/templates/rnaseq/qc/params_qc_nf-core-example.R b/inst/templates/rnaseq/qc/params_qc_nf-core-example.R new file mode 100644 index 0000000..dae62ce --- /dev/null +++ b/inst/templates/rnaseq/qc/params_qc_nf-core-example.R @@ -0,0 +1,9 @@ +# info params + +# Example data: COMMENT THESE LINE IF YOU ARE USING YOUR DATA +metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' +se_object=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.rds') +# This folder is in the output directory inside multiqc folder +multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' +# This file is inside the genome folder in the output directory +gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' diff --git a/inst/templates/rnaseq/qc/placeholder b/inst/templates/rnaseq/qc/placeholder new file mode 100644 index 0000000..e69de29 diff --git a/inst/templates/rnaseq/qc/run_markdown.R b/inst/templates/rnaseq/qc/run_markdown.R new file mode 100644 index 0000000..51acbef --- /dev/null +++ b/inst/templates/rnaseq/qc/run_markdown.R @@ -0,0 +1,13 @@ +library(rmarkdown) + +# set directory to this file folder +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +# example running with test data +rmarkdown::render("QC_nf-core.Rmd", + output_dir = ".", + clean = TRUE, + output_format = "html_document", + params = list( + params_file = 'params_qc_nf-core-testdata.R', + project_file = '../information.R') + ) diff --git a/man/bcbio_nfcore_check.Rd b/man/bcbio_nfcore_check.Rd index 76c954f..022c903 100644 --- a/man/bcbio_nfcore_check.Rd +++ b/man/bcbio_nfcore_check.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hello.R +% Please edit documentation in R/helpers.R \name{bcbio_nfcore_check} \alias{bcbio_nfcore_check} \title{Function to check samplesheet for nf-core} diff --git a/man/bcbio_set_project.Rd b/man/bcbio_set_project.Rd index b3c67ef..4e686bf 100644 --- a/man/bcbio_set_project.Rd +++ b/man/bcbio_set_project.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hello.R +% Please edit documentation in R/helpers.R \name{bcbio_set_project} \alias{bcbio_set_project} \title{Function to help with project name used for parent folder} diff --git a/man/bcbio_templates.Rd b/man/bcbio_templates.Rd index bc2e50f..217ea0c 100644 --- a/man/bcbio_templates.Rd +++ b/man/bcbio_templates.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hello.R +% Please edit documentation in R/helpers.R \name{bcbio_templates} \alias{bcbio_templates} \title{Function to help deploy analysis folder inside a project folder} From 6b4b52571912ed578763f10cdb3f2a0a442a20db Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 29 May 2024 11:17:57 -0400 Subject: [PATCH 02/93] working code to deploy rnaseq --- R/helpers.R | 119 ++++++++++++------ inst/templates/rnaseq/README.md | 31 +++++ inst/templates/rnaseq/de/DEG.Rmd | 13 +- .../rnaseq/de/PCA_variance_analysis.Rmd | 3 + inst/templates/rnaseq/de/params_de.R | 22 ---- .../rnaseq/qc/{QC.Rmd => QC-bcbio.Rmd} | 5 +- inst/templates/rnaseq/qc/QC_nf-core.Rmd | 8 +- inst/templates/rnaseq/qc/params_qc.R | 4 - tests/testthat/rnaseq.R | 54 ++++---- 9 files changed, 159 insertions(+), 100 deletions(-) create mode 100644 inst/templates/rnaseq/README.md delete mode 100644 inst/templates/rnaseq/de/params_de.R rename inst/templates/rnaseq/qc/{QC.Rmd => QC-bcbio.Rmd} (98%) delete mode 100644 inst/templates/rnaseq/qc/params_qc.R diff --git a/R/helpers.R b/R/helpers.R index 89ba469..fca3034 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -105,59 +105,81 @@ bcbio_set_project <- function() { guess_analysis <- function(path){ if (!fs::dir_exists(path)) - ui_abort("{ui_val(path)} doesn't exist") + ui_stop("{ui_val(path)} doesn't exist") # This file is inside star_salmon/ folder - counts_fn <- fs::path_join(path, '/star_salmon/salmon.merged.gene_counts.tsv') + counts_fn <- fs::path_join(c(path, '/star_salmon/salmon.merged.gene_counts.tsv')) # This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder - multiqc_data_dir <- fs::path_join(path, 'star_salmon/multiqc_report_data') + multiqc_data_dir <- fs::path_join(c(path, 'star_salmon/multiqc_report_data')) # This file is inside star_salmon/ folder - se_object <- fs::path_join(path, 'star_salmon/salmon.merged.gene_counts.rds') + se_object <- fs::path_join(c(path, 'star_salmon/salmon.merged.gene_counts.rds')) } -read_pipeline_info <- function(path){ +read_pipeline_info <- function(nfcore){ # pipeline_info/params_2024-05-28_12-28-51.json - config <- fs::path_join(nfcore, "pipeline_info") + config <- fs::path_join(c(nfcore, "pipeline_info")) params <- fs::dir_ls(config, regexp = "params") metadata <- jsonlite::read_json(params)[["input"]] # input # tmp_rna/pipeline_info/software_versions.yml - software <- fs::path_join(nfcore, "pipeline_info", "software_versions.yml") + software <- fs::path_join(c(nfcore, "pipeline_info", "software_versions.yml")) software_txt <- yaml::read_yaml(software) - pipeline <- grep("nf-core", names(software_text$Workflow), value = TRUE) + pipeline <- grep("nf-core", names(software_txt$Workflow), value = TRUE) # Workflow: # Nextflow: 24.04.1 # nf-core/rnaseq: 3.14.0 # check only rnaseq is supported - if (!(pipeline %in% c("nf-core/rnasew"))){ - iu_abort("Sorry, we don't yet support {.ui_value(pipeline)}") + if (!(pipeline %in% c("nf-core/rnaseq"))){ + ui_stop("Sorry, we don't yet support {ui_value(pipeline)}") } list(metadata=metadata, pipeline=pipeline) } -bcbio_params <-function(path, pipeline, metadata, copy){ +render_rmd <- function(infile, outfile, ls_data){ + whisker.render(read_file(infile), + ls_data) %>% + write_file(outfile) +} +bcbio_params <-function(nfcore_path, pipeline, metadata, copy){ + ui_info("Reading input files from {ui_value(nfcore_path)}") if (pipeline=="nf-core/rnaseq"){ if (!copy){ - se_object <- fs::path_join(path, "star_salmon/salmon.merged.gene_counts.rds") - metadata_fn <- metadata - counts_fn <- fs::path_join(path, "star_salmon/salmon.merged.gene_counts.tsv") - multiqc_data_dir <- fs::path_join(path, "multiqc/star_salmon/multiqc-report-data/") - gtf_fn <- fs::path_join(path, "genome/genome.filtered.gtf") + ls_data<-list( + se_object =fs::path_join(c(nfcore_path, "star_salmon/salmon.merged.gene_counts.rds")), + metadata_fn = metadata, + counts_fn = fs::path_join(c(nfcore_path, "star_salmon/salmon.merged.gene_counts.tsv")), + multiqc_data_dir = fs::path_join(c(nfcore_path, "multiqc/star_salmon/multiqc-report-data/")), + gtf_fn = fs::path_join(c(nfcore_path, "genome/genome.filtered.gtf"))) + return(ls_data) } + } + +} +bcbio_render <- function(path, pipeline, data){ + if (pipeline=="nf-core/rnaseq"){ analysis_template <- fs::path_package("bcbioR", "templates", "rnaseq", "qc") - fs::dir_copy(analysis_template, fs::path_join(path, "reports"), overwrite = FALSE) + fs::dir_copy(analysis_template, fs::path_join(c(path, "reports", "qc")), overwrite=TRUE) analysis_template <- fs::path_package("bcbioR", "templates", "rnaseq", "de") - fs::dir_copy(analysis_template, fs::path_join(path, "reports"), overwrite = FALSE) + fs::dir_copy(analysis_template, fs::path_join(c(path, "reports", "de")), overwrite=TRUE) + render_rmd( + fs::path_join(c(path, "reports", "qc", "QC_nf-core.Rmd")), + fs::path_join(c(path, "reports", "qc", "QC_nf-core.Rmd")), + data + ) + render_rmd( + fs::path_join(c(path, "reports", "de", "DEG.Rmd")), + fs::path_join(c(path, "reports", "de", "DEG.Rmd")), + data + ) ui_info("Please, to start the analysis, modify these parameter in QC/QC.rmd") ui_todo("set genome to hg38, mm10, mm39, or other") ui_todo("set factor_of_interest to a column in your metadata") } - } #' @export @@ -166,36 +188,54 @@ use_bcbio_analysis <- function(path, nfcore=NULL, copy=FALSE, metadata=NULL){ if (copy){ # deploy files ui_info("Rmd templates will be copied but variables path won't be filled automatically.") + pipeline <- nfcore }else{ if (!fs::dir_exists(nfcore)) - ui_abort("{ui_value(nfcore)} doesn't exist. point to nfcore path or turn on copy mode.") + ui_stop("{ui_value(nfcore)} doesn't exist. point to nfcore path or turn on copy mode.") #guess analysis from pipeline file information <- read_pipeline_info(nfcore) - fs::dir_create(fs::path_join(path, "meta")) - meta_path <- fs::path_join(path, "meta", fs::path_file(information$metadata)) + fs::dir_create(fs::path_join(c(path, "meta"))) + meta_path <- fs::path_join(c(path, "meta", fs::path_file(information$metadata))) pipeline <- information$pipeline if (!is.null(metadata)){ if (!(fs::file_exists(metadata))) - ui_abort("{ui_value(metadata)} doesn't exist.") + ui_stop("{ui_value(metadata)} doesn't exist.") fs::file_copy(metadata, meta_path) }else{ if (!fs::file_exists(information$metadata)){ ui_warn("{ui_value(metadata)} not found. We can only work with local filesytems. For now.") ui_todo("Please, copy {ui_value(metadata)} to {ui_value(meta_path)}.") - ui_warn("If this file is not in the folder, the code will fail.") + ui_warn("If this file is manually set up, the Rmd code will fail.") }else{ fs::file_copy(information$metadata, meta_path) } metadata <- meta_path } - - # set all files from analysis - bcbio_params <- set_bcbio_params(nfcore, pipeline, metadata, copy=copy) } + # set all files from analysis + data <- bcbio_params(nfcore, pipeline, metadata, copy=copy) + if (!copy) + bcbio_render(path, pipeline, data) } +copy_files_in_folder<- function(origin, remote){ + to_copy <- fs::dir_ls(origin) + for (element in to_copy){ + full_new_path <- fs::path_join(c(remote, fs::path_file(element))) + + if (fs::is_dir(element)){ + if (!(fs::dir_exists(full_new_path))) + fs::dir_copy(element, full_new_path) + } + if (fs::is_file(element)){ + if (!(fs::file_exists(full_new_path))) + fs::file_copy(element, full_new_path) + } + } +} + #' @export #' @examples #' path <- withr::local_tempdir() @@ -208,26 +248,27 @@ use_bcbio_projects <- function(path, nfcore=NULL, metadata=NULL, git=TRUE, gh=FA ui_info("Populating base project") base_template <- fs::path_package("bcbioR", "templates", "base") - fs::dir_copy(base_template, path, overwrite = FALSE) + copy_files_in_folder(base_template, path) - if (is.null(nfcore)){ + is_nfcore_ready <- FALSE + if (is.null(nfcore) && rlang::is_interactive()){ is_nfcore_ready <- ui_yeah("Have you already run nf-core pipeline?", n_yes=1, n_no =1) - if (is_nfcore_ready){ + if (is_nfcore_ready && rlang::is_interactive()){ nfcore <- readline("? Enter path to nf-core output: ") }else{ - ui_warn("Please, turn copy = TRUE to only deploy files or") - ui_abort("Please use {.run use_bcbio_projects} again when you have the nf-core output.") + ui_warn("Please, turn copy = TRUE to only deploy files or,") + ui_stop("Please use {ui_code('use_bcbio_projects')} again when you have the nf-core output.") } use_bcbio_analysis(path, nfcore, copy, metadata) }else{ if (fs::dir_exists(nfcore)){ - ui_info("Checking {.ui_value(nfcore)} as nf-core output directory") + ui_info("Checking {ui_value(nfcore)} as nf-core output directory") use_bcbio_analysis(path, nfcore, copy, metadata) }else if (copy){ # deploy only files ui_info("Deploying only templates without pipeline information.") - use_bcbio_analysis(path, nfcore, metadata=metadata, copy = TRUE) + use_bcbio_analysis(path, nfcore, copy = TRUE, metadata=metadata) }else{ ui_warn("Please, provide nfcore working directory or") ui_warn("turn copy = TRUE to only deploy files.") @@ -242,7 +283,7 @@ use_bcbio_projects <- function(path, nfcore=NULL, metadata=NULL, git=TRUE, gh=FA ui_info("Create GitHub repo at {ui_value(path)}") whoami <- suppressMessages(gh::gh_whoami()) if (is.null(whoami)) { - ui_abort(c( + ui_stop(c( "x" = "Unable to discover a GitHub personal access token.", "i" = "A token is required in order to create and push to a new repo.", "_" = "Call {.run usethis::gh_token_help()} for help configuring a token." @@ -251,11 +292,13 @@ use_bcbio_projects <- function(path, nfcore=NULL, metadata=NULL, git=TRUE, gh=FA use_github(organisation=org) } - answer <- ui_yeah("Please, read the README.md file as the session starts.Are you ready?", - n_yes=1, n_no =1) + answer <- FALSE + if (rlang::is_interactive()) + answer <- ui_yeah("Please, read the README.md file as the session starts.Are you ready?", + n_yes=1, n_no =1, shuffle=FALSE) if (answer) proj_activate(path) if (!answer) - ui_info("Please use {.run proj_activate({ui_value(path)})} to start this project.") + ui_info("Please use proj_activate({ui_value(path)})} to start this project.") } diff --git a/inst/templates/rnaseq/README.md b/inst/templates/rnaseq/README.md new file mode 100644 index 0000000..7df3cb1 --- /dev/null +++ b/inst/templates/rnaseq/README.md @@ -0,0 +1,31 @@ +# Guideline for RNAseq downstream analysis + + +## Downstream analysis + +Please, modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. + +### QC + +`QC/QC.Rmd` is a template for QC metrics. Use `params_qc.R` for `bcbio` + or `QC/QC_nf-core.Rmd` `params_qc_nf-core.R` for `nf-core/rnaseq` outputs. + +Read instruction in the R and Rmd scripts to render it. + +### DE + +`DE/DEG.Rmd` is a template for two groups comparison. `params_de.R` has the information of the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. + +On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: + +- sub-setting data +- two groups comparison +- volcano plot +- MA plot +- Pathway analysis +- Tables + +There are some code related to alternative analysis: + +- `DE/PCA_variance_analysis.R` that shows how to compare variance among groups to decide how to perform DE analysis. +- `DE/Multiplicative_DE_docs.md` that shows some cases when there is multiple variables in the model with multiple levels: sex (2 levels) and genotype (4 levels) diff --git a/inst/templates/rnaseq/de/DEG.Rmd b/inst/templates/rnaseq/de/DEG.Rmd index a2741a7..fd01681 100644 --- a/inst/templates/rnaseq/de/DEG.Rmd +++ b/inst/templates/rnaseq/de/DEG.Rmd @@ -34,12 +34,17 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) ``` ```{r} -metadata_fn={{metadata_fn}} -se_object={{se_object}} +# params for bcbio +# coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" +# counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' +# se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") +# Params for nfcore/rnaseq +metadata_fn="{{metadata_fn}}" +se_object="{{se_object}}" # This folder is in the output directory inside multiqc folder -multiqc_data_dir={{multiqc_data_dir}} +multiqc_data_dir="{{multiqc_data_dir}}" # This file is inside the genome folder in the output directory -gtf_fn={{gtf_fn}} +gtf_fn="{{gtf_fn}}" ``` ```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} diff --git a/inst/templates/rnaseq/de/PCA_variance_analysis.Rmd b/inst/templates/rnaseq/de/PCA_variance_analysis.Rmd index 101c6b8..aadabec 100644 --- a/inst/templates/rnaseq/de/PCA_variance_analysis.Rmd +++ b/inst/templates/rnaseq/de/PCA_variance_analysis.Rmd @@ -20,6 +20,9 @@ pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vst), " genes")) + theme(plot.title=element_text(hjust=0.5)) + geom_mark_ellipse(aes(color = sample_type)) ``` + +Information on [betadisper](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) to do analyses of multivariate homogeneity of group dispersions (variances). + ```{r} # NOTE:This is not confirmed to be a valid test but it could help to understand the data library(vegan) diff --git a/inst/templates/rnaseq/de/params_de.R b/inst/templates/rnaseq/de/params_de.R deleted file mode 100644 index 8426428..0000000 --- a/inst/templates/rnaseq/de/params_de.R +++ /dev/null @@ -1,22 +0,0 @@ -# project params -date = "YYYYMMDD" -basedir <- './' # where to write down output files - -# params for bcbio -# coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" -# counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' -# se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") -# - -# params for nfcore -# Your data -# This is the file used to run nf-core or compatible to that -coldata_fn='/Path/to/metadata/meta.csv' -# This file is inside star_salmon/ folder -counts_fn='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv' -# This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder -multiqc_data_dir='/path/to/nf-core/output/star_salmon/multiqc_report_data' -# This file is inside the genome folder in the output directory, use this only non-model organism -# gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' -se_object = NA - diff --git a/inst/templates/rnaseq/qc/QC.Rmd b/inst/templates/rnaseq/qc/QC-bcbio.Rmd similarity index 98% rename from inst/templates/rnaseq/qc/QC.Rmd rename to inst/templates/rnaseq/qc/QC-bcbio.Rmd index 48afe76..aca91c1 100644 --- a/inst/templates/rnaseq/qc/QC.Rmd +++ b/inst/templates/rnaseq/qc/QC-bcbio.Rmd @@ -18,13 +18,12 @@ editor_options: chunk_output_type: console params: params_file: params_qc.R - project_file: ../information.R --- ```{r source_params, echo = F} -source(params$params_file) -source(params$project_file) +metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv' +se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") ``` ```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} diff --git a/inst/templates/rnaseq/qc/QC_nf-core.Rmd b/inst/templates/rnaseq/qc/QC_nf-core.Rmd index b376eb3..87a6a9e 100644 --- a/inst/templates/rnaseq/qc/QC_nf-core.Rmd +++ b/inst/templates/rnaseq/qc/QC_nf-core.Rmd @@ -28,12 +28,12 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) ``` ```{r} -metadata_fn={{metadata_fn}} -se_object={{se_object}} +metadata_fn="{{metadata_fn}}" +se_object="{{se_object}}" # This folder is in the output directory inside multiqc folder -multiqc_data_dir={{multiqc_data_dir}} +multiqc_data_dir="{{multiqc_data_dir}}" # This file is inside the genome folder in the output directory -gtf_fn={{gtf_fn}} +gtf_fn="{{gtf_fn}}" ``` ```{r source_params, echo = F} diff --git a/inst/templates/rnaseq/qc/params_qc.R b/inst/templates/rnaseq/qc/params_qc.R deleted file mode 100644 index 84dabfa..0000000 --- a/inst/templates/rnaseq/qc/params_qc.R +++ /dev/null @@ -1,4 +0,0 @@ -# info params - -metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv' -se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") diff --git a/tests/testthat/rnaseq.R b/tests/testthat/rnaseq.R index 9a683ac..a53926e 100644 --- a/tests/testthat/rnaseq.R +++ b/tests/testthat/rnaseq.R @@ -1,29 +1,33 @@ library(bcbioR) - -test_that("rnaseq testing", { +test_that("rnaseq copy",{ path <- withr::local_tempdir() - print(path) - bcbio_templates(type="rnaseq", outpath=path) - numerator="tumor" - denominator="normal" - subset_value=NA - rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), - output_dir = file.path(path,"DE"), - output_format = "html_document", - output_file = ifelse(!is.na(subset_value), - paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), - paste0('DE_', numerator, '_vs_', denominator, '.html') - ), - clean = TRUE, - envir = new.env(), - params = list( - subset_value = subset_value, - numerator = numerator, - denominator = denominator, - params_file = file.path(path,'DE/params_de.R'), - project_file = file.path(path,'information.R'), - functions_file = file.path(path,'DE/load_data.R') - ) - ) + use_bcbio_projects(path, nfcore="nf-core/rnaseq", copy=TRUE, git=FALSE) }) + +# test_that("rnaseq testing", { +# path <- withr::local_tempdir() +# print(path) +# bcbio_templates(type="rnaseq", outpath=path) +# numerator="tumor" +# denominator="normal" +# subset_value=NA +# rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), +# output_dir = file.path(path,"DE"), +# output_format = "html_document", +# output_file = ifelse(!is.na(subset_value), +# paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), +# paste0('DE_', numerator, '_vs_', denominator, '.html') +# ), +# clean = TRUE, +# envir = new.env(), +# params = list( +# subset_value = subset_value, +# numerator = numerator, +# denominator = denominator, +# params_file = file.path(path,'DE/params_de.R'), +# project_file = file.path(path,'information.R'), +# functions_file = file.path(path,'DE/load_data.R') +# ) +# ) +# }) From 3b5300d97afd0632684915cfe63eb07ee13610bc Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 29 May 2024 11:18:13 -0400 Subject: [PATCH 03/93] update DESCRIPTION --- DESCRIPTION | 4 +- .../rnaseq/de/DE_tumor_vs_normal.html | 5864 ----------------- 2 files changed, 3 insertions(+), 5865 deletions(-) delete mode 100644 inst/templates/rnaseq/de/DE_tumor_vs_normal.html diff --git a/DESCRIPTION b/DESCRIPTION index dd2e149..0af4683 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,7 +23,9 @@ Imports: usethis, fs, jsonlite, - yaml + yaml, + whisker, + rlang Suggests: knitr, rmarkdown, diff --git a/inst/templates/rnaseq/de/DE_tumor_vs_normal.html b/inst/templates/rnaseq/de/DE_tumor_vs_normal.html deleted file mode 100644 index 9060229..0000000 --- a/inst/templates/rnaseq/de/DE_tumor_vs_normal.html +++ /dev/null @@ -1,5864 +0,0 @@ - - - - - - - - - - - - - - - -Differential Expression - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
-
-
-
- -
- - - - - - - -
# 1. Set up input files in this R file (params_de.R)
-source(params$params_file)
-# 2. Set up project file (already done from QC probably)
-source(params$project_file)
-# 3. Load custom functions to load data from coldata/metrics/counts
-source(params$functions_file)
-# IMPORTANT set these values if you are not using the parameters at the top
-genome=params$genome
-column=params$column
-numerator=params$numerator
-denominator=params$denominator
-subset_column=params$subset_column
-subset_value=params$subset_value
-run_ruv=params$ruv
-
library(rtracklayer)
-library(DESeq2)
-library(tidyverse)
-library(stringr)
-library(RUVSeq)
-library(DEGreport)
-library(ggpubr)
-library(msigdbr)
-library(fgsea)
-library(org.Hs.eg.db)
-library(knitr)
-library(EnhancedVolcano)
-library(bcbioR)
-library(ggprism)
-library(viridis)
-library(pheatmap)
-library(janitor)
-colors=cb_friendly_cols(1:15)
-ggplot2::theme_set(theme_prism(base_size = 14))
-opts_chunk[["set"]](
-    cache = F,
-    cache.lazy = FALSE,
-    dev = c("png", "pdf"),
-    error = TRUE,
-    highlight = TRUE,
-    message = FALSE,
-    prompt = FALSE,
-    tidy = FALSE,
-    warning = FALSE,
-    echo = T, 
-    fig.height = 4)
-
-# set seed for reproducibility
-set.seed(1234567890L)
-
# This code will load from bcbio or nf-core folder
-# NOTE make sure to set numerator and denominator
-coldata <- load_coldata(coldata_fn, column,
-                        numerator, denominator,
-                        subset_column, subset_value)
-coldata$sample=row.names(coldata)
-
-counts <- load_counts(counts_fn)
-counts <- counts[,colnames(counts) %in% coldata$sample]
-
-metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% 
-  left_join(coldata, by = c('sample')) %>% 
-  as.data.frame()
-rownames(metrics) <- metrics$sample
-# if the names don't match in order or string check files names and coldata information
-counts = counts[rownames(metrics)]
-coldata = coldata[rownames(metrics),]
-stopifnot(all(names(counts) == rownames(metrics)))
-
sanitize_datatable = function(df, ...) {
- # remove dashes which cause wrapping
- DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)),
-                   colnames=gsub("-", "_", colnames(df)))
-}
-
-

1 Overview

-
    -
  • Project: name_hbcXXXXX
  • -
  • PI: person name
  • -
  • Analyst: person in the core
  • -
  • Experiment: short description
  • -
  • Aim: short description
  • -
  • Comparison: tumor vs. normal
  • -
-
if (!is.null(subset_value) & !is.null(subset_value)){
-  filenames = str_interp("${subset_value}_${numerator}_vs_${denominator}")
-} else {
-  filenames = str_interp("${numerator}_vs_${denominator}")
-}
-
-contrasts = c(column,numerator,denominator)
-coef=paste0(column,"_",numerator,"_vs_",denominator)
-
-name_expression_fn=file.path(
-                             basedir,
-                             str_interp("${filenames}_expression.csv"))
-name_deg_fn=file.path(
-                      basedir,
-                      str_interp("${filenames}_deg.csv"))
-name_pathways_fn=file.path(
-                           basedir,
-                           str_interp("${filenames}_pathways.csv"))
-
rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL') %>%
-  dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL)
-
dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1)
-
-vsd_before <- vst(dds_to_use)
-norm_matrix = assay(vsd_before)
-
-
-

2 Covariate analysis

-
degCovariates(
-  norm_matrix,
-  metrics,
-)
-

-
-
-

3 PCA analysis

-
pca1 <- degPCA(norm_matrix, colData(dds_to_use), 
-       condition = column) + ggtitle('PCA')
-pca1 + scale_color_cb_friendly()
-

-
formula <- as.formula(paste0("~ ", " + ", column))
-## Check if sample name matches
-stopifnot(all(names(counts) == rownames(coldata)))
-
-dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = formula)
-
-vsd_before <- vst(dds_to_use)
-norm_matrix = assay(vsd_before)
-new_cdata <- coldata
-
# If you want to skip the code, just set up formula to be your model in the next chunk of code
-design <- coldata[[column]]
-diffs <- makeGroups(design)
-dat <- norm_matrix
-ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F)
-vars <- ruvset$W
-
-new_cdata <- cbind(coldata, vars)
-
-formula <- as.formula(paste0("~ ", 
-                          paste0(
-                            colnames(new_cdata)[grepl("W", colnames(new_cdata))], 
-                            collapse = " + "
-                          ), " + ", column)
-)
-norm_matrix=ruvset$normalizedCounts
-pca2 <- degPCA(norm_matrix, new_cdata, 
-       condition = column) + ggtitle('After RUV')
-pca2 + scale_color_cb_friendly()
-
dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula)
-vsd_to_use<- vst(dds_to_use, blind=FALSE)
-
-
-

4 Differential -Expression

-

Differential gene expression analysis of count data was performed -using the Bioconductor R package, DESeq2, which fits the count data to a -negative binomial model.

-

Before fitting the model, we often look at a metric called -dispersion, which is a measure for variance which also takes into -consideration mean expression. A dispersion value is estimated for each -individual gene, then ‘shrunken’ to a more accurate value based on -expected variation for a typical gene exhibiting that level of -expression. Finally, the shrunken dispersion value is used in the final -GLM fit.

-

We use the below dispersion plot, which should show an inverse -relationship between dispersion and mean expression, to get an idea of -whether our data is a good fit for the model.

-
de <- DESeq(dds_to_use)
-
-DESeq2::plotDispEsts(de)
-

-

Because it is difficult to accurately detect and quantify the -expression of lowly expressed genes, differences in their expression -between treatment conditions can be unduly exaggerated after the model -is fit. We correct for this so that gene LFC is not dependent overall on -basal gene expression level.

-
# resultsNames(de) # check the order is right
-resLFC = results(de, contrast=contrasts)
-resLFCS <- lfcShrink(de, coef=coef, type="apeglm")
-
-res <- as.data.frame(resLFCS) %>%
-  rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% 
-  relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>%
-  mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi)
-
-res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj) %>%
-  mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name))
-
-res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5))
-show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")])
-
-degMA(as.DEGSet(resLFC)) + ggtitle('Before LFC Shrinking')
-

-
-

4.1 MA plot

-
degMA(as.DEGSet(resLFCS), limit = 2) + ggtitle('After LFC Shrinking')
-

-
-
-

4.2 Volcano plot

-

This volcano plot shows the genes that are significantly up- and -down-regulated as a result of the analysis comparison. The points -highlighted in red are genes that have padj < 0.05 and a log2-fold -change > 1. Points in blue have a padj < 0.05 and a log2-fold -change < 1 and points in green have a padj > 0.05 and a log2-fold -change > 2. Grey points are non-significant. The dashed lines -correspond to the cutoff values of log2 foldchance and padj that we have -chosen.

-
# degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) 
-EnhancedVolcano(res_mod,
-                lab= res_mod$gene_name, 
-                pCutoff = 1.345719e-03, 
-                selectLab = c(res_sig$gene_name[1:15]),
-                FCcutoff = 0.5,
-                x = 'lfc',
-                y = 'padj', 
-                title="Volcano Tumor vs. Normal",
-                col=as.vector(colors[c("dark_grey", "light_blue",
-                                         "purple", "purple")]),
-                subtitle = "", xlim=c(-5,5)) 
-

-
-
-

4.3 Heatmap

-
### Run pheatmap using the metadata data frame for the annotation
-ma=norm_matrix[res_sig$gene_id,]
-colma=coldata[,c(column), drop=FALSE]
-colors=lapply(colnames(colma), function(c){
-  l.col=colors[1:length(unique(colma[[c]]))]
-  names(l.col)=unique(colma[[c]])
-  l.col
-})
-names(colors)=colnames(colma)
-pheatmap(ma, 
-         color = inferno(10), 
-         cluster_rows = T, 
-         show_rownames = F,
-         annotation = colma, 
-         annotation_colors = colors,
-         border_color = NA, 
-         fontsize = 10, 
-         scale = "row", 
-         fontsize_row = 10, 
-         height = 20)
-
-
-

4.4 Differentially -Expressed Genes

-
res_sig %>% sanitize_datatable
-
- -
-
-

4.5 Plot top 16 -genes

-
n = 16
-top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% 
-  dplyr::select(gene_name, gene_id)
-top_n_exp <- norm_matrix %>% as.data.frame() %>% 
-  rownames_to_column('gene_id') %>%
-  # dplyr::select(-group, -group_name) %>% 
-  pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>%
-  right_join(top_n, relationship = "many-to-many") %>%
-  left_join(coldata, by = 'sample')
-
-ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) +
-  geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + 
-  geom_point() +
-  facet_wrap(~gene_name) + 
-  ggtitle(str_interp('Expression of Top ${n} DEGs')) +
-  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
-

-
-
-
-

5 Pathway Enrichment

-

From the set of differentially expressed genes and using publicly -available information about gene sets involved in biological processes -and functions, we can calculate which biological processes and functions -are significantly perturbed as a result of the treatment.

-
universe=res %>% 
-  filter(!is.na(padj)) %>% pull(gene_id)
-mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENTREZID', 'ENSEMBL')
-
-all_in_life=list(
-  msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"),
-  msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"),
-  msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG"),
-  msigdbr(species = "human", category = "C2", subcategory = "CP:PID"),
-  msigdbr(species = "human", category = "C5", subcategory = "GO:BP"),
-  msigdbr(species = "human", category = "C5", subcategory = "GO:MF"),
-  msigdbr(species = "human", category = "C5", subcategory = "HPO"),
-  msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"),
-  msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic")
-)
-
-ora_input = res %>% filter(!is.na(padj), padj<0.01, abs(lfc)>0.3) %>% pull(gene_id)
-input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL'))
-
-total_deg=length(unique(ora_input))/length(unique(mapping$ENTREZID))
-pathways_ora_all = lapply(all_in_life, function(p){
-  pathway = split(x = p$entrez_gene, f = p$gs_name)
-  db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":")
-  respath <- fora(pathways = pathway, 
-                  genes = unique(input_entrezid$ENTREZID),
-                  universe = unique(mapping$ENTREZID),
-                  minSize  = 15,
-                  maxSize  = 500)
-  coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], 
-                                     pathway, unique(input_entrezid$ENTREZID), unique(mapping$ENTREZID))
-  as_tibble(respath[pathway %in% coll_respath$mainPathways])  %>% 
-    mutate(database=db_name, NES=(overlap/size)/(total_deg))
-}) %>% bind_rows() %>% 
-  mutate(analysis="ORA")
-  
-ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>%
-  group_by(pathway) %>% 
-  left_join(mapping, by =c("overlapGenes"="ENTREZID")) %>% 
-  dplyr::select(pathway, padj, NES, ENSEMBL, analysis,
-                database)
-
-pathways_long = ora_tb
-
pathways_ora_all %>% sanitize_datatable()
-
- -
counts_norm=norm_matrix %>% as.data.frame() %>% 
-  rownames_to_column("gene_id") %>% 
-  mutate(comparison = str_interp("${numerator}_vs_${denominator}"))
-
-res_for_writing <- res %>% 
-  mutate(comparison = str_interp("${numerator}_vs_${denominator}"))
-
-pathways_for_writing <- pathways_long %>% 
-  mutate(comparison = str_interp("${numerator}_vs_${denominator}"))
-         
-if (!is.null(subset_value)){
-  counts_norm <- counts_norm %>% 
-    mutate(subset = subset_value)  
-  res_for_writing <- res_for_writing %>% 
-    mutate(subset = subset_value)
-  pathways_for_writing <- pathways_for_writing %>% 
-    mutate(subset = subset_value)
-}
-
-write_csv(counts_norm, name_expression_fn)
-write_csv(res_for_writing, name_deg_fn)
-write_csv(pathways_for_writing, name_pathways_fn)
-
-
-

6 R session

-

List and version of tools used for the DE report generation.

-
sessionInfo()
-
## R version 4.3.3 (2024-02-29)
-## Platform: aarch64-apple-darwin20 (64-bit)
-## Running under: macOS Sonoma 14.4.1
-## 
-## Matrix products: default
-## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
-## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
-## 
-## locale:
-## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
-## 
-## time zone: America/New_York
-## tzcode source: internal
-## 
-## attached base packages:
-## [1] stats4    stats     graphics  grDevices utils     datasets  methods   base     
-## 
-## other attached packages:
-##  [1] rmarkdown_2.26              viridis_0.6.5               viridisLite_0.4.2          
-##  [4] ggprism_1.0.5               EnhancedVolcano_1.20.0      org.Hs.eg.db_3.18.0        
-##  [7] AnnotationDbi_1.64.1        fgsea_1.28.0                msigdbr_7.5.1              
-## [10] ggpubr_0.6.0                RUVSeq_1.36.0               edgeR_4.0.16               
-## [13] limma_3.58.1                EDASeq_2.36.0               ShortRead_1.60.0           
-## [16] GenomicAlignments_1.36.0    Rsamtools_2.16.0            Biostrings_2.68.1          
-## [19] XVector_0.40.0              BiocParallel_1.34.2         janitor_2.2.0              
-## [22] DT_0.33                     pheatmap_1.0.12             ggrepel_0.9.5              
-## [25] DEGreport_1.38.5            DESeq2_1.42.1               SummarizedExperiment_1.30.2
-## [28] Biobase_2.60.0              MatrixGenerics_1.12.3       matrixStats_1.3.0          
-## [31] rtracklayer_1.60.1          GenomicRanges_1.52.1        GenomeInfoDb_1.36.4        
-## [34] IRanges_2.34.1              S4Vectors_0.38.2            BiocGenerics_0.46.0        
-## [37] knitr_1.45                  lubridate_1.9.3             forcats_1.0.0              
-## [40] stringr_1.5.1               dplyr_1.1.4                 purrr_1.0.2                
-## [43] readr_2.1.5                 tidyr_1.3.1                 tibble_3.2.1               
-## [46] ggplot2_3.5.0               tidyverse_2.0.0             bcbioR_0.1.2               
-## 
-## loaded via a namespace (and not attached):
-##   [1] BiocIO_1.10.0               bitops_1.0-7                filelock_1.0.3             
-##   [4] R.oo_1.26.0                 XML_3.99-0.16.1             lifecycle_1.0.4            
-##   [7] rstatix_0.7.2               doParallel_1.0.17           lattice_0.22-5             
-##  [10] vroom_1.6.5                 MASS_7.3-60.0.1             crosstalk_1.2.1            
-##  [13] backports_1.4.1             magrittr_2.0.3              sass_0.4.9                 
-##  [16] jquerylib_0.1.4             yaml_2.3.8                  cowplot_1.1.3              
-##  [19] DBI_1.2.2                   RColorBrewer_1.1-3          ConsensusClusterPlus_1.66.0
-##  [22] abind_1.4-5                 zlibbioc_1.46.0             R.utils_2.12.3             
-##  [25] RCurl_1.98-1.14             rappdirs_0.3.3              circlize_0.4.16            
-##  [28] GenomeInfoDbData_1.2.10     codetools_0.2-19            DelayedArray_0.26.7        
-##  [31] xml2_1.3.6                  tidyselect_1.2.1            shape_1.4.6.1              
-##  [34] farver_2.1.1                BiocFileCache_2.8.0         jsonlite_1.8.8             
-##  [37] GetoptLong_1.0.5            iterators_1.0.14            bbmle_1.0.25.1             
-##  [40] foreach_1.5.2               tools_4.3.3                 progress_1.2.3             
-##  [43] Rcpp_1.0.12                 glue_1.7.0                  gridExtra_2.3              
-##  [46] mnormt_2.1.1                SparseArray_1.2.4           xfun_0.43                  
-##  [49] numDeriv_2016.8-1.1         withr_3.0.0                 fastmap_1.1.1              
-##  [52] latticeExtra_0.6-30         fansi_1.0.6                 digest_0.6.35              
-##  [55] timechange_0.3.0            R6_2.5.1                    colorspace_2.1-0           
-##  [58] jpeg_0.1-10                 biomaRt_2.56.1              RSQLite_2.3.6              
-##  [61] R.methodsS3_1.8.2           utf8_1.2.4                  generics_0.1.3             
-##  [64] data.table_1.15.4           prettyunits_1.2.0           httr_1.4.7                 
-##  [67] htmlwidgets_1.6.4           S4Arrays_1.2.1              pkgconfig_2.0.3            
-##  [70] gtable_0.3.4                blob_1.2.4                  ComplexHeatmap_2.18.0      
-##  [73] hwriter_1.3.2.1             htmltools_0.5.8.1           carData_3.0-5              
-##  [76] clue_0.3-65                 scales_1.3.0                logging_0.10-108           
-##  [79] hues_0.2.0                  png_0.1-8                   snakecase_0.11.1           
-##  [82] ggdendro_0.2.0              rstudioapi_0.16.0           tzdb_0.4.0                 
-##  [85] rjson_0.2.21                coda_0.19-4.1               nlme_3.1-164               
-##  [88] curl_5.2.1                  bdsmatrix_1.3-7             cachem_1.0.8               
-##  [91] GlobalOptions_0.1.2         parallel_4.3.3              restfulr_0.0.15            
-##  [94] apeglm_1.24.0               pillar_1.9.0                grid_4.3.3                 
-##  [97] reshape_0.8.9               vctrs_0.6.5                 car_3.1-2                  
-## [100] dbplyr_2.5.0                cluster_2.1.6               evaluate_0.23              
-## [103] GenomicFeatures_1.52.2      mvtnorm_1.2-4               cli_3.6.2                  
-## [106] locfit_1.5-9.9              compiler_4.3.3              rlang_1.1.3                
-## [109] crayon_1.5.2                ggsignif_0.6.4              labeling_0.4.3             
-## [112] interp_1.1-6                aroma.light_3.32.0          emdbook_1.3.13             
-## [115] plyr_1.8.9                  stringi_1.8.3               psych_2.4.3                
-## [118] deldir_2.0-4                babelgene_22.9              munsell_0.5.1              
-## [121] Matrix_1.6-5                hms_1.1.3                   bit64_4.0.5                
-## [124] KEGGREST_1.40.1             statmod_1.5.0               highr_0.10                 
-## [127] broom_1.0.5                 memoise_2.0.1               bslib_0.7.0                
-## [130] fastmatch_1.1-4             bit_4.0.5
-
- - - -
-
- -
- - - - - - - - - - - - - - - - - From f8ee3573805e9b025e41690be1889805f02de973 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Fri, 31 May 2024 09:05:57 -0400 Subject: [PATCH 04/93] add app project name --- R/app.R | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ R/helpers.R | 21 +++++---------------- 2 files changed, 54 insertions(+), 16 deletions(-) create mode 100644 R/app.R diff --git a/R/app.R b/R/app.R new file mode 100644 index 0000000..0991c9c --- /dev/null +++ b/R/app.R @@ -0,0 +1,49 @@ +# Global variables can go here +library(stringr) +.fix <- function(x){ + x <- tolower(x) + x <- str_replace_all(x, "[[:punct:]]", "_") + x <- str_replace_all(x, " ", "_") + return(x) +} + + +# Define the UI +ui <- fluidPage( + # Application title + titlePanel("Create project name"), + + sidebarLayout( + # Sidebar with a slider and selection inputs + sidebarPanel( + textInput('hbc', 'hbc-code (no letters)', value = "00000"), + textInput('pi', 'What is PI last name:', value = "lastname"), + textInput('tech', 'What is the technology:', value = "rnaseq"), + textInput('tissue', 'What is the tissue:', value = "mix|cells|heart"), + textInput('org', 'What is the organism:', value = "mix|human"), + textInput('proj', 'What is the project name:', value = "this_analysis_is_cool"), + + ), + + # Show Word Cloud + mainPanel( + br("Suggested project name:"), + br(), + verbatimTextOutput('project') + ) + ) +) + + +# Define the server code +server <- function(input, output, session) { + output$project <- renderText({ + hbc_code <- paste0("hbc", input$hbc) + project_full <- paste(input$tech, .fix(input$pi), .fix(input$project), + input$tissue, input$org, hbc_code, sep="_") + project_full + }) +} + +# Return a Shiny app object +shinyApp(ui = ui, server = server) diff --git a/R/helpers.R b/R/helpers.R index fca3034..5b944a0 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -103,19 +103,6 @@ bcbio_set_project <- function() { } -guess_analysis <- function(path){ - if (!fs::dir_exists(path)) - ui_stop("{ui_val(path)} doesn't exist") - - # This file is inside star_salmon/ folder - counts_fn <- fs::path_join(c(path, '/star_salmon/salmon.merged.gene_counts.tsv')) - # This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder - multiqc_data_dir <- fs::path_join(c(path, 'star_salmon/multiqc_report_data')) - # This file is inside star_salmon/ folder - se_object <- fs::path_join(c(path, 'star_salmon/salmon.merged.gene_counts.rds')) - -} - read_pipeline_info <- function(nfcore){ # pipeline_info/params_2024-05-28_12-28-51.json config <- fs::path_join(c(nfcore, "pipeline_info")) @@ -136,7 +123,6 @@ read_pipeline_info <- function(nfcore){ list(metadata=metadata, pipeline=pipeline) } - render_rmd <- function(infile, outfile, ls_data){ whisker.render(read_file(infile), ls_data) %>% @@ -193,7 +179,7 @@ use_bcbio_analysis <- function(path, nfcore=NULL, copy=FALSE, metadata=NULL){ if (!fs::dir_exists(nfcore)) ui_stop("{ui_value(nfcore)} doesn't exist. point to nfcore path or turn on copy mode.") - #guess analysis from pipeline file + # guess analysis from pipeline file information <- read_pipeline_info(nfcore) fs::dir_create(fs::path_join(c(path, "meta"))) meta_path <- fs::path_join(c(path, "meta", fs::path_file(information$metadata))) @@ -206,12 +192,15 @@ use_bcbio_analysis <- function(path, nfcore=NULL, copy=FALSE, metadata=NULL){ if (!fs::file_exists(information$metadata)){ ui_warn("{ui_value(metadata)} not found. We can only work with local filesytems. For now.") ui_todo("Please, copy {ui_value(metadata)} to {ui_value(meta_path)}.") - ui_warn("If this file is manually set up, the Rmd code will fail.") + ui_warn("If this file isn't manually set up, the Rmd code will fail.") }else{ + ui_info("Copy metadata to {ui_value(meta_path)}") fs::file_copy(information$metadata, meta_path) } metadata <- meta_path } + path_final <- fs::path_join(c(path, "final")) + ui_todo("Please, copy nf-core output directory to {ui_value(path_final)}") } # set all files from analysis data <- bcbio_params(nfcore, pipeline, metadata, copy=copy) From cc29d405fb3927ca94c7e3675c5c09c99c18517c Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Fri, 31 May 2024 17:03:13 -0400 Subject: [PATCH 05/93] fix typo --- R/app.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/app.R b/R/app.R index 0991c9c..531206b 100644 --- a/R/app.R +++ b/R/app.R @@ -18,6 +18,7 @@ ui <- fluidPage( sidebarPanel( textInput('hbc', 'hbc-code (no letters)', value = "00000"), textInput('pi', 'What is PI last name:', value = "lastname"), + textInput('scientist', 'What is the scientist last name:', value = "scientist"), textInput('tech', 'What is the technology:', value = "rnaseq"), textInput('tissue', 'What is the tissue:', value = "mix|cells|heart"), textInput('org', 'What is the organism:', value = "mix|human"), @@ -39,7 +40,8 @@ ui <- fluidPage( server <- function(input, output, session) { output$project <- renderText({ hbc_code <- paste0("hbc", input$hbc) - project_full <- paste(input$tech, .fix(input$pi), .fix(input$project), + project_full <- paste(input$tech, .fix(input$pi), .fix(input$scientist), + .fix(input$proj), input$tissue, input$org, hbc_code, sep="_") project_full }) From f9215227176329abca0e9e140473a44a50c1b95d Mon Sep 17 00:00:00 2001 From: eberdan Date: Tue, 2 Jul 2024 11:49:41 -0400 Subject: [PATCH 06/93] Update DEG.Rmd added extra filtering --- inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd | 3 +++ 1 file changed, 3 insertions(+) diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd index 5da794c..be4487d 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd +++ b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd @@ -400,6 +400,9 @@ res <- as.data.frame(resLFCS) %>% relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>% mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi) +## Filter out genes that have no expression or were filtered out by DESEQ2 +res <- res[res$baseMean>0,] %>% drop_na(padj) %>% drop_na(pvalue) + res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj) %>% mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name)) From 0aa7569d5321f458a7cd081fd5016e3336dd9ad3 Mon Sep 17 00:00:00 2001 From: eberdan Date: Wed, 3 Jul 2024 16:51:15 -0400 Subject: [PATCH 07/93] first part drafted --- .../singlecell/skeleton/starting_steps.md | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md diff --git a/inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md b/inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md new file mode 100644 index 0000000..f4fbfe8 --- /dev/null +++ b/inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md @@ -0,0 +1,146 @@ +--- +title: "From raw data to Seurat" +--- + + +# Overview + +This tutorial assumes that you are starting with 10x genomic data that has not yet been run through cellranger. If you have output files from cellranger (raw_feature_bc_matrix.h5) files skip to step 2. + +# Step 1 running cellranger + +## Set up + +Here are the steps that need to be completed prior to running cellranger. + +### Locate or create your genome + +#### I have mouse or human + +We have prebuilt references for mouse and human located here: + +#### I have another genome + +It is easy to generate a cellranger reference for any genome. All you need as input are a fasta file and a gtf file. Here is some [information](https://kb.10xgenomics.com/hc/en-us/articles/115003327112-How-can-we-add-genes-to-a-reference-package-for-Cell-Ranger) about what is required for the gtf file. + +**Note: what is listed as "gene_id" (required in gtf) or "gene_name" (if used will be preferred) will be your row names (i.e. gene names). Make sure this is something useful or can be connected to information on what these genes are.** + +Below is an example script for a non-model reference + +``` +#!/bin/sh +#SBATCH --partition=short +#SBATCH -o run.o +#SBATCH -e run.e +#SBATCH -t 0-2:30 +#SBATCH -c 1 +#SBATCH --mem=48G + +module load cellranger/7.1.0 + +cellranger mkref \ + --genome=my_nonmodel_genome \ + --fasta=/path/to/my/genome/fasta/file/my_nonmodel_genome.fasta \ + --genes=/path/to/my/genome/annotation/file/my_nonmodel_genome.gtf + +``` + + +### Fastq data + +You should have a number of output files from each sample. These should look like those below: + +``` +sample1_I1_001.fastq.gz +sample1_I2_001.fastq.gz +sample1_R1_001.fastq.gz +sample1_R2_001.fastq.gz +``` + +Cellranger will be looking for both the I and R files. If you do not have both you may have to run demultiplexing, [See here](https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/inputs/cr-mkfastq). + +**NOTE: you may have multiple lanes per sample. there is no need to concatentate these prior to running cellranger.** + + +It is best to create one folder per sample with that sample name and put all of the files there. + +**Cellranger expects 1 folder per sample** + +Here is an example file structure + +``` +fastq_files +├── sample1 +│   ├── sample1_I1_001.fastq.gz +│   ├── sample1_I2_001.fastq.gz +│   ├── sample1_R1_001.fastq.gz +│   ├── sample1_R2_001.fastq.gz +├── sample2 +│   ├── sample2_I1_001.fastq.gz +``` + +## Run Cellranger + +The easiest way to run cellranger is using the array feature on O2. [Here](https://github.com/hbc/knowledgebase/blob/master/rc/arrays_in_slurm.md) is a tutorial on arrays. + +To run cellranger as an array you will need one extra file. This file called `samples.txt` will have the name of each sample on its own line. + +``` +sample1 +sample2 +sample3 +... +sampleN +``` + +for ease `samples.txt` should be in the same directory as your sbatch script. + +Here is an example sbatch script for running cellranger as an array + +```(bash) +#!/bin/bash + +#SBATCH --job-name=CellRangerCount3 # Job name +#SBATCH --partition=short # Partition name +#SBATCH --time=0-05:00 # Runtime in D-HH:MM format +#SBATCH --nodes=1 # Number of nodes (keep at 1) +#SBATCH --ntasks=1 # Number of tasks per node (keep at 1) +#SBATCH --cpus-per-task=16 # CPU cores requested per task (change for threaded jobs) +#SBATCH --mem=128G # Memory needed per node (total) +#SBATCH --error=jobid_%j.err # File to which STDERR will be written, including job ID +#SBATCH --output=jobid_%j.out # File to which STDOUT will be written, including job ID + + +samp=$(awk -v awkvar="${SLURM_ARRAY_TASK_ID}" 'NR==awkvar' samples.txt) ### This line will take the numeric slurm array task id and find the corresponding line number in samples.txt. The sample name is made into a variable called samp. + +module load cellranger/7.1.0 + +cellranger count \ + --id=${samp} \ ## This is what your output folders will be named + --fastqs=/path/to/your/folder/of/fastq/folders/${samp} \ + --transcriptome=/path/to/your/genome \ + --localcores=16 \ + --localmem=128 + +``` + +This script can be run depending on the number of samples you have. Here we will call it N: + +``` +sbatch --array=1-N run_cellranger.sh +``` + +For example if you have 9 samples to run you can use: + +``` +sbatch --array=1-9 run_cellranger.sh +``` + +Arrays are also handy if you need to re-run just a single sample. Let's say you need to re-run the 1st and 9th sample in samples.txt + +``` +sbatch --array=1,9 run_cellranger.sh +``` + +# Step 2 - going from cellranger output to Seurat + From 8e236c167db566f6aa5219469c7d15d8b810e0ab Mon Sep 17 00:00:00 2001 From: eberdan Date: Wed, 3 Jul 2024 17:30:25 -0400 Subject: [PATCH 08/93] finished first draft --- .../singlecell/skeleton/starting_steps.md | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) diff --git a/inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md b/inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md index f4fbfe8..4299b4c 100644 --- a/inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md +++ b/inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md @@ -144,3 +144,211 @@ sbatch --array=1,9 run_cellranger.sh # Step 2 - going from cellranger output to Seurat +All of your output is now in the output folders you denoted in your cellranger script. These files are almost identical inside. The bit we want to keep is going to always be the same + +``` +sample1/out/raw_feature_bc_matrix.h5 +``` + +**Note: Always use the raw matrix and not filtered_feature_bc_matrix.h5. We will do our own QC downstream.** + +In this part of the tutorial we will go through the parts of creating the seurat object piece by piece. At the bottom is the entire script that you can copy and paste and edit. + +## Part 1 - reading all the files in and creating initial seurat object + +This should all be done in an interactive session on O2 with at least 96G of memory. + + +```(R) + +library(Seurat) +library(data.table) +library(hdf5r) + +### Set up run information +data_dir <- "/path/to/cellranger/output/folders/" + +samples <- c("sample1", "sample2", "sample3") + +### Make individual seurat objects for each sample + +for (i in 1:length(samples)){ + seurat_data <- Read10X_h5(paste(c(data_dir,samples[i],"/outs/raw_feature_bc_matrix.h5"),sep="",collapse = "")) + seurat_obj <- CreateSeuratObject(counts = seurat_data, + min.features = 100, ## only keep cells with at least 100 genes + project = samples[i]) + assign(paste0(samples[i], "_seurat"), + seurat_obj) # stores Seurat object in variable of corresponding sample name +} + +### Merge all seurat objects + +seurat_ID <- paste0(samples, "_seurat") # get names of all objects + + +u <- get(seurat_ID[2]) +for (i in 3:length(seurat_ID)) { + u <- c(u, get(seurat_ID[i])) +} ## makes a list of all seurat objects + +seurat_merge <- merge(x = get(seurat_ID[1]), + y = u, + add.cell.ids = all_samples, + project = "my_scRNA_project") + +``` + +## Part 2 - Add mitochondrial information + +This code is for using mouse mitochondrial genes: + + +```(R) +# Mitochondrial genes for mouse genome +idx <- grep("^mt-", rownames(GetAssay(seurat_merge, "RNA"))) +rownames(GetAssay(seurat_merge, "RNA"))[idx] +# Mitochondrial genes vs. nuclear genes ratio +seurat_merge$mitoRatio <- PercentageFeatureSet(object = seurat_merge, pattern = "^mt-") +seurat_merge$mitoRatio <- seurat_merge@meta.data$mitoRatio/100 # Divide by 100 for Ratio instead of Percentage +``` + +This code is for using human mitochondrial genes: + +```(R) +# Mitochondrial genes for human genome +idx <- grep("^MT-", rownames(GetAssay(seurat_obj, "RNA"))) +rownames(GetAssay(seurat_merge, "RNA"))[idx] +# Mitochondrial genes vs. nuclear genes ratio +seurat_merge$mitoRatio <- PercentageFeatureSet(object = seurat_merge, pattern = "^mt-") +seurat_merge$mitoRatio <- seurat_merge@meta.data$mitoRatio/100 # Divide by 100 for Ratio instead of Percentage +``` + +Below is an example code from the siberian hamster. Here I all of the mitochondrial genes were found manually a list was created. + +```(R) +mito_genes <- c("ND1", "ND2","COX1","COX2","ATP8","ATP6","COX3","ND3","ND4L","ND4","ND5","ND6","CYTB") +idx <- (rownames(GetAssay(seurat_merge, "RNA")) %in% mito_genes) +rownames(GetAssay(seurat_merge, "RNA"))[idx] +# Mitochondrial genes vs. nuclear genes ratio +seurat_merge$mitoRatio <- PercentageFeatureSet(object = seurat_merge, features = mito_genes) +seurat_merge$mitoRatio <- seurat_merge@meta.data$mitoRatio/100 # +``` + +## Part 3 - Add additional metadata + +Here we add some additonal metrics and sample metadata from the client. + +```(R) +# Number of genes per UMI for each cell +seurat_merge$Log10GenesPerUMI <- log10(seurat_merge$nFeature_RNA) / log10(seurat_merge$nCount_RNA) + +# Import experimental metadata +metaexp <- read.csv("/path/to/experimental/metadata/meta.csv") + +# Check matching of IDs +all(metaexp$sample %in% metadata$orig.ident) +all(metadata$orig.ident %in% metaexp$sample) + +#change headings to match +colnames(metaexp)[1] <- "orig.ident" + +metafull <- plyr::join(metadata, metaexp, + by = c("orig.ident")) + +# Replace seurat object metadata +if(all(metafull$barcode == rownames(seurat_merge@meta.data))) { + rownames(metafull) <- metafull$barcode + seurat_merge@meta.data <- metafull +} +``` + +## Part 4 - Save object + +```(R) +## Join layers (each sample is a separate layer) +seurat_merge[["RNA"]] <- JoinLayers(seurat_merge[["RNA"]]) + +### Save Seurat object for future processing +save(seurat_merge, file = "seurat_pre-filtered.rds") +write.csv(seurat_merge@meta.data, file = "metadata_pre-filtered.csv") +``` + +## Full script + +Below find all pieces to copy and paste. We are assuming a mouse genome. + +``` + +library(Seurat) +library(data.table) +library(hdf5r) + +### Set up run information +data_dir <- "/path/to/cellranger/output/folders/" + +samples <- c("sample1", "sample2", "sample3") + +### Make individual seurat objects for each sample + +for (i in 1:length(samples)){ + seurat_data <- Read10X_h5(paste(c(data_dir,samples[i],"/outs/raw_feature_bc_matrix.h5"),sep="",collapse = "")) + seurat_obj <- CreateSeuratObject(counts = seurat_data, + min.features = 100, ## only keep cells with at least 100 genes + project = samples[i]) + assign(paste0(samples[i], "_seurat"), + seurat_obj) # stores Seurat object in variable of corresponding sample name +} + +### Merge all seurat objects + +seurat_ID <- paste0(samples, "_seurat") # get names of all objects + + +u <- get(seurat_ID[2]) +for (i in 3:length(seurat_ID)) { + u <- c(u, get(seurat_ID[i])) +} ## makes a list of all seurat objects + +seurat_merge <- merge(x = get(seurat_ID[1]), + y = u, + add.cell.ids = all_samples, + project = "my_scRNA_project") + + +# Mitochondrial genes for mouse genome +idx <- grep("^mt-", rownames(GetAssay(seurat_merge, "RNA"))) +rownames(GetAssay(seurat_merge, "RNA"))[idx] +# Mitochondrial genes vs. nuclear genes ratio +seurat_merge$mitoRatio <- PercentageFeatureSet(object = seurat_merge, pattern = "^mt-") +seurat_merge$mitoRatio <- seurat_merge@meta.data$mitoRatio/100 # Divide by 100 for Ratio instead of Percentage + +# Number of genes per UMI for each cell +seurat_merge$Log10GenesPerUMI <- log10(seurat_merge$nFeature_RNA) / log10(seurat_merge$nCount_RNA) + +# Import experimental metadata +metaexp <- read.csv("/path/to/experimental/metadata/meta.csv") + +# Check matching of IDs +all(metaexp$sample %in% metadata$orig.ident) +all(metadata$orig.ident %in% metaexp$sample) + +#change headings to match +colnames(metaexp)[1] <- "orig.ident" + +metafull <- plyr::join(metadata, metaexp, + by = c("orig.ident")) + +# Replace seurat object metadata +if(all(metafull$barcode == rownames(seurat_merge@meta.data))) { + rownames(metafull) <- metafull$barcode + seurat_merge@meta.data <- metafull +} + + +## Join layers (each sample is a separate layer) +seurat_merge[["RNA"]] <- JoinLayers(seurat_merge[["RNA"]]) + +### Save Seurat object for future processing +save(seurat_merge, file = "seurat_pre-filtered.rds") +write.csv(seurat_merge@meta.data, file = "metadata_pre-filtered.csv") +``` \ No newline at end of file From f7888247aa662e781f3d12fb2fcc98847f5e557f Mon Sep 17 00:00:00 2001 From: eberdan Date: Mon, 8 Jul 2024 11:55:46 -0400 Subject: [PATCH 09/93] changed how object saved --- .../rmarkdown/templates/singlecell/skeleton/starting_steps.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md b/inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md index 4299b4c..1e10850 100644 --- a/inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md +++ b/inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md @@ -269,7 +269,7 @@ if(all(metafull$barcode == rownames(seurat_merge@meta.data))) { seurat_merge[["RNA"]] <- JoinLayers(seurat_merge[["RNA"]]) ### Save Seurat object for future processing -save(seurat_merge, file = "seurat_pre-filtered.rds") +saveRDS(seurat_merge, file = "seurat_pre-filtered.rds") write.csv(seurat_merge@meta.data, file = "metadata_pre-filtered.csv") ``` @@ -349,6 +349,6 @@ if(all(metafull$barcode == rownames(seurat_merge@meta.data))) { seurat_merge[["RNA"]] <- JoinLayers(seurat_merge[["RNA"]]) ### Save Seurat object for future processing -save(seurat_merge, file = "seurat_pre-filtered.rds") +saveRDS(seurat_merge, file = "seurat_pre-filtered.rds") write.csv(seurat_merge@meta.data, file = "metadata_pre-filtered.csv") ``` \ No newline at end of file From 3a4ed7348f20f900fda7d413bef21839f4ae925c Mon Sep 17 00:00:00 2001 From: eberdan Date: Mon, 8 Jul 2024 12:19:36 -0400 Subject: [PATCH 10/93] initial QC --- .../singlecell/skeleton/scRNA_qc_template.rmd | 414 ++++++++++++++++++ 1 file changed, 414 insertions(+) create mode 100644 inst/rmarkdown/templates/singlecell/skeleton/scRNA_qc_template.rmd diff --git a/inst/rmarkdown/templates/singlecell/skeleton/scRNA_qc_template.rmd b/inst/rmarkdown/templates/singlecell/skeleton/scRNA_qc_template.rmd new file mode 100644 index 0000000..44bba13 --- /dev/null +++ b/inst/rmarkdown/templates/singlecell/skeleton/scRNA_qc_template.rmd @@ -0,0 +1,414 @@ +--- +title: "scRNA QC" +output: html_document +date: "`r Sys.Date()`" +params: + ## If you have Ribosomal ratio in your raw seurat object put this as TRUE otherwise leave as FALSE + ribosomal: FALSE +--- + +# Overview + +- Project: project +- PI: PI +- Analyst: analyst + + +```{r, eval=FALSE} +### READ ME FIRST + +# This is a template for scRNA QC to present to your client. The actual QC can be done using our rshiny app: + +# After you have decided on your QC metrics load your raw object (i.e. right after you first read data into seurat) and create your QC object by editing lines 49-67. + +# Edit text line 246 with your chosen QC cutoffs! +``` + + + +```{r setup, include=FALSE} +library(Seurat) +library(tidyverse) +library(ggplot2) + +knitr::opts_chunk[["set"]]( + cache = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4, + echo=FALSE) +``` + + + +```{r load and filter} +## Load data + +seurat_raw <- seurat_clust <- readRDS("seurat_pre-filtered.rds") + +## Creat QC object USE METRICS YOU CHOSE IN THE RSHINY APP + +seurat_qc <- subset(x = seurat_raw, + subset = (nCount_RNA >= 1500) + & (nFeature_RNA >= 2200) + & (mitoRatio < 0.1) + ## & (riboRatio < 0.4) + & (Log10GenesPerUMI > 0.80) + ) + + +## Save QC object + +saveRDS(seurat_qc, file = "seurat_post-QC.rds") + +``` + + + +```{r prep-info} + +## Prep information for plotting +metadata0 <- seurat_raw@meta.data + +metadata0 = metadata0 %>% dplyr::rename(nUMI = nCount_RNA, + nGene = nFeature_RNA) + +metadata1 <- seurat_qc@meta.data + + +metadata1 = metadata1 %>% dplyr::rename(nUMI = nCount_RNA, + nGene = nFeature_RNA) +``` + + +# QC metrics: raw data {.tabset} + +In this section, we review quality control (QC) metrics for the **raw feature matrices** generated by `Cellranger`. Only a low level filter excluding cells with <100 nUMIs (= number of unique molecular identifiers, or sequenced reads per cell) was applied when uploading the data into `R`. + + +## Cells per sample + +```{r load_data} +table(metadata0$orig.ident) + +``` + + +## UMIs per cell + +Here, we look at the distribution of UMIs (unique molecular identifiers, or sequenced reads) per cell (droplet) in the dataset. Before QC, we expect a biomodal distribution with a first *small* peak at low numbers of UMIs (<250) corresponding to droplets that encapsulated background/dying cells, and a second higher peak centered at >1000. The line is at 250. + + +```{r raw_nUMIs} +metadata0 %>% + ggplot(aes(x = nUMI, color = orig.ident, fill = orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + ylab("Cell density") + scale_x_log10() + + geom_vline(xintercept = 250) + + facet_wrap(. ~ orig.ident) + + ggtitle("UMIs per cell in raw dataset") +``` + + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata0 %>% + ggplot(aes(x=orig.ident, y=log10(nUMI), fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(log10(1000), log10(50000))) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + +## Genes per cell + +Here, we look at the number of different genes that were detected in each cell. By "detected", we mean genes with a non-zero read count measurement. Gene detection in the range of 500 to 5000 is normal for most single-cell experiments. The line is at 750. + +```{r raw_nGene} +# Visualize the distribution of genes detected per cell (histogram) +metadata0 %>% + ggplot(aes(x = nGene, color = orig.ident, fill = orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + scale_x_log10() + + geom_vline(xintercept = c(700)) + + facet_wrap(. ~ orig.ident) + + ggtitle("Detected genes per cell in raw dataset") +``` + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata0 %>% + ggplot(aes(x=orig.ident, y=log10(nGene), fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(log10(700))) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + + +## Mitochondrial ratio + +We evaluate overall mitochondrial gene expression as a biomarker of cellular stress during sample preparation. Typically, we expect mitochondrial genes to account for <20% of overall transcripts in each cell. The line indicates 10%. + +```{r raw_mito, warning=FALSE} +# Visualize the distribution of mitochondrial gene expression detected per cell +metadata0 %>% + ggplot(aes(color = orig.ident, x = mitoRatio, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + geom_vline(xintercept = c(0.1)) + + facet_wrap(. ~ surgery) + + ggtitle("Percentage of mitochondrial gene expression per cell in raw dataset") +``` + + + +```{r raw_ribo, eval=ribosomal, warning=FALSE, results='asis'} + +cat("## Ribosomal ratio \n") + +cat("We evaluate overall ribosomal gene expression. The line indicates 5%. \n" +) +# Visualize the distribution of mitochondrial gene expression detected per cell +metadata0 %>% + ggplot(aes(color = orig.ident, x = riboRatio, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + geom_vline(xintercept = c(0.05)) + + facet_wrap(. ~ orig.ident) + + ggtitle("Percentage of ribosomal gene expression per cell in raw dataset") +``` + + +## UMIs vs. Genes + +By plotting the number of UMIs per cell (x-axis) vs. the number of genes per cell (y-axis), we can visually assess whether there is a large proportion of low quality cells with low read counts and/or gene detection (bottom left quadrant of the plot). In the following representation, cells are further color-coded based on the percentage of mitochondrial genes found among total detected genes. The line for nUMI is at 1000 and the line for nGene is at 700. + +```{r raw_gene_by_umi, fig.height=12, fig.width=15, warning=FALSE} +# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs +metadata0 %>% + ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) + + geom_point() + + stat_smooth(method=lm) + + scale_x_log10() + + scale_y_log10() + + theme_classic() + + geom_vline(xintercept = 1000) + + geom_hline(yintercept = 700) + + ggtitle("Genes vs. nUMIs in raw dataset") + + facet_wrap(~orig.ident) +``` + + +## Complexity + +Another way to assess the quality and purity of a single-cell dataset is to look for cells that have fewer detected genes per UMI than others. Typical values for this metric are >0.8 for most cells. Cells with lower diversity in the genes they express may be low-complexity cell types such as red blood cells. With sorted populations, we expect high purity and a very similar complexity distribution across samples. + +```{r raw_novelty} +# Visualize the overall novelty of the gene expression by visualizing the genes detected per UMI +metadata0 %>% + ggplot(aes(x = Log10GenesPerUMI, color = orig.ident, fill = orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + geom_vline(xintercept = c(0.85)) + + facet_wrap(. ~ orig.ident) + + ggtitle("log10(Genes per UMI) in raw dataset") +``` + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata0 %>% + ggplot(aes(x=orig.ident, Log10GenesPerUMI, fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(0.8, 0.85)) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + + +# QC metrics: Filtered data {.tabset} + +Based on the above QC metrics, we filtered the dataset to isolate cells passing the following thresholds: **>250 UMIs, >250 genes, <0.2 mitochondrial gene ratio, and >0.8 complexity**. + +In this section, we review QC metrics for our filtered dataset. + +```{r filtering1, echo=F} +# Filter out low quality reads using selected thresholds +seurat_qc <- subset(x = seurat_merge, + subset = (nCount_RNA >= 250) + #& (nCount_RNA < 50000) + & (nFeature_RNA >= 250) + & (mitoRatio < 0.2) + & (Log10GenesPerUMI > 0.80) + ) +#saveRDS(seurat_qc, file = "seurat_olfactory_mice_qc_full.rds") +#write.csv(seurat_qc@meta.data, file = "metadata_olfactory_mice_qc_1_full.csv") +nCells <- nrow(seurat_qc@meta.data) +metadata1 <- seurat_qc@meta.data + + +metadata1 = metadata1 %>% dplyr::rename(nUMI = nCount_RNA, + nGene = nFeature_RNA) + +table(metadata1$orig.ident) +``` + + +## UMIs per cell + +The line is at 1000 + +```{r qc1_nUMIs} +metadata1 %>% + ggplot(aes(color = orig.ident, x = nUMI, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + ylab("Cell density") + xlab("nUMI") + + geom_vline(xintercept = c(1000)) + + facet_wrap(. ~ orig.ident) +``` + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata1 %>% + ggplot(aes(x=orig.ident, y=log10(nUMI), fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(log10(1000))) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + + +## Genes detected + +The line is at 750 + +```{r qc1_genes} +# Visualize the distribution of genes detected per cell via histogram +metadata1 %>% + ggplot(aes(color = orig.ident, x = nGene, fill= orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + scale_x_log10() + xlab("nGene") + + facet_wrap(. ~ orig.ident) + + geom_vline(xintercept = c(750)) +``` + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata1 %>% + ggplot(aes(x=orig.ident, y=log10(nGene), fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(log10(7500))) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) + +``` + + +## Mitochondrial ratio + +The line is at 10%. + +```{r qc1_mitoratio, message=FALSE, warning=FALSE} +# Visualize the distribution of mitochondrial gene expression detected per cell +metadata1 %>% + ggplot(aes(color = orig.ident, x = mitoRatio, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + geom_vline(xintercept = 0.1) + + facet_wrap(. ~ surgery) +``` + + +```{r qc1_ribo, eval=ribosomal, warning=FALSE, results='asis'} + +cat("## Ribosomal ratio \n") + +cat("We evaluate overall ribosomal gene expression. The line indicates 10%. \n" +) +# Visualize the distribution of mitochondrial gene expression detected per cell +metadata0 %>% + ggplot(aes(color = orig.ident, x = riboRatio, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + geom_vline(xintercept = c(0.1)) + + facet_wrap(. ~ orig.ident) + + ggtitle("Percentage of ribosomal gene expression per cell in raw dataset") +``` + + +## UMIs vs. Genes + +Both the horizontal and vertical lines are at 1000. + +```{r qc1_genes_per_UMI, fig.height=12, fig.width=15, warning=FALSE} +# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs +metadata1 %>% + ggplot(aes(x = nUMI, y = nGene, color = mitoRatio)) + + geom_point() + + stat_smooth(method=lm) + + scale_x_log10() + + scale_y_log10() + + theme_classic() + + geom_vline(xintercept = c(1000)) + + geom_hline(yintercept = c(1000)) + + ggtitle("Genes vs. nUMIs in raw dataset") + + xlab("nUMI") + ylab("nGene") + + facet_wrap(~orig.ident) +``` + + +## Complexity + +```{r qc1_complexity} +# Visualize the overall novelty of the gene expression by visualizing the genes detected per UMI +metadata1 %>% + ggplot(aes(x = Log10GenesPerUMI, color = orig.ident, fill = orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + #geom_vline(xintercept = c(0.85)) + + facet_wrap(. ~ orig.ident) +``` + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata1 %>% + ggplot(aes(x=orig.ident, Log10GenesPerUMI, fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + #geom_hline(yintercept = c(0.85)) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + + + +# R session + +```{r} +sessionInfo() +``` + From 39815250a6bc96872a697358ae5b32b987f610e4 Mon Sep 17 00:00:00 2001 From: eberdan Date: Mon, 8 Jul 2024 12:42:56 -0400 Subject: [PATCH 11/93] few edits --- .../singlecell/skeleton/scRNA_qc_template.rmd | 22 ++++--------------- 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/inst/rmarkdown/templates/singlecell/skeleton/scRNA_qc_template.rmd b/inst/rmarkdown/templates/singlecell/skeleton/scRNA_qc_template.rmd index 44bba13..60bb9de 100644 --- a/inst/rmarkdown/templates/singlecell/skeleton/scRNA_qc_template.rmd +++ b/inst/rmarkdown/templates/singlecell/skeleton/scRNA_qc_template.rmd @@ -93,7 +93,7 @@ In this section, we review quality control (QC) metrics for the **raw feature ma ## Cells per sample -```{r load_data} +```{r cells raw} table(metadata0$orig.ident) ``` @@ -247,25 +247,11 @@ Based on the above QC metrics, we filtered the dataset to isolate cells passing In this section, we review QC metrics for our filtered dataset. -```{r filtering1, echo=F} -# Filter out low quality reads using selected thresholds -seurat_qc <- subset(x = seurat_merge, - subset = (nCount_RNA >= 250) - #& (nCount_RNA < 50000) - & (nFeature_RNA >= 250) - & (mitoRatio < 0.2) - & (Log10GenesPerUMI > 0.80) - ) -#saveRDS(seurat_qc, file = "seurat_olfactory_mice_qc_full.rds") -#write.csv(seurat_qc@meta.data, file = "metadata_olfactory_mice_qc_1_full.csv") -nCells <- nrow(seurat_qc@meta.data) -metadata1 <- seurat_qc@meta.data - - -metadata1 = metadata1 %>% dplyr::rename(nUMI = nCount_RNA, - nGene = nFeature_RNA) +## Cells per sample +```{r cells filtered} table(metadata1$orig.ident) + ``` From 2770e95e9823b87f86389b6357ee176490fce29a Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Mon, 8 Jul 2024 12:47:39 -0400 Subject: [PATCH 12/93] org DE code, add pairwise comparison --- .../skeleton/DE/Cross-comparison-analysis.Rmd | 236 ++++++++++++++++++ .../PCA_variance_analysis.Rmd | 0 2 files changed, 236 insertions(+) create mode 100644 inst/rmarkdown/templates/rnaseq/skeleton/DE/Cross-comparison-analysis.Rmd rename {inst/rmarkdown/templates/rnaseq/skeleton/DE => vignettes}/PCA_variance_analysis.Rmd (100%) diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/Cross-comparison-analysis.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/Cross-comparison-analysis.Rmd new file mode 100644 index 0000000..4e78ea6 --- /dev/null +++ b/inst/rmarkdown/templates/rnaseq/skeleton/DE/Cross-comparison-analysis.Rmd @@ -0,0 +1,236 @@ +--- +title: "Comparing DE Results - Pairwise" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + project_file: ../information.R +--- + + + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. Set up input files in this R file (params_pairwisecomp.R) +## Full results file (all genes) for contrastt 1 +comp1_fn <- 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group1.csv.gz' +## Name of contrast 1. This will be displayed on the figures +comp1_name <- "DMSO vs. Group1" +## Full results file (all genes) for contrast 2 +comp2_fn <- 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group2.csv.gz' +## Name of contrast 2. This will be displayed on the figures +comp2_name <- "DMSO vs. Group2" +## Adjusted P-value used for significance +padj_co <- 0.05 +## Log2FC used for significance. If no cutoff used put 0 +LFC <- 0.5 + +comp1 <- read_csv(comp1_fn) %>% + dplyr::filter(padj <= 1) +comp2 <- read_csv(comp2_fn) %>% + dplyr::filter(padj <= 1) +``` + + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(rtracklayer) +library(tidyverse) +library(stringr) +library(ggpubr) +library(knitr) +library(bcbioR) +library(ggprism) +library(viridis) +library(pheatmap) +library(janitor) +library(ggvenn) +library(ggplot2) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + + +# Load Data + +We load our dataset + +```{r load_data} +# this code will load from bcbio or nf-core folder +# NOTE make sure to set numerator and denominator + +comp1_sig <- comp1 %>% + dplyr::filter(padj < padj_co, abs(lfc) > LFC) + +comp2_sig <- comp2 %>% + dplyr::filter(padj < padj_co, abs(lfc) > LFC) +``` + + + +# Comparisons + +We start with a venn diagram looking at the overlap between our two contrasts + +```{r, fig.height=8, fig.width=8} +name1 <- rlang::ensym(comp1_name) +name2 <- rlang::ensym(comp2_name) +names <- c(name1, name2) + +full <- list(comp1_sig$gene_id,comp2_sig$gene_id) +names(full) <-names + +ggvenn(full, show_percentage = F) + +``` + +## Compare effect sizes and direction + +We plot Log2FC for our contrasts and color points by whether or not they are significant in our contrasts. The black line is 1:1. + + +```{r fig.height=6, fig.width=8} +# Edit based on the data you are using + +#make sure to only use genes present in both results files +test_intersect <- intersect(comp1$gene_id, comp2$gene_id) +comp1_sub <- subset(comp1, comp1$gene_id %in% test_intersect) +comp2_sub <- subset(comp2, comp2$gene_id %in% test_intersect) + +## Check that gene names match +all(comp1_sub$gene_id== comp2_sub$gene_id) + +## Gather necessary data +lfc <- data.frame(comp1_sub$gene_id, comp1_sub$gene_name, comp1_sub$lfc, comp2_sub$lfc) +colnames(lfc) <- c("gene_id","gene_name", "comp1", "comp2") + +# subset to only include genes in both datasets and color by grouping +DE_comp1 <- setdiff(comp1_sig$gene_id, comp2_sig$gene_id) +DE_comp2 <- setdiff(comp2_sig$gene_id, comp1_sig$gene_id) +DE_both <- intersect(comp2_sig$gene_id, comp1_sig$gene_id) +not_sig <- comp1_sub$gene_id[!(comp1_sub$gene_id %in% c(DE_comp1,DE_comp2,DE_both))] + + +col <- rep(4, nrow(lfc)) +col[lfc$gene_id %in% not_sig] <- 1 +col[lfc$gene_id %in% DE_comp1] <- 2 +col[lfc$gene_id %in% DE_comp2] <- 3 +col[lfc$gene_id %in% DE_both] <- 4 + + +lfc$col <- lfc %>% + dplyr::mutate(color = case_when( + gene_id %in% DE_both ~ 3, + gene_id %in% DE_comp1 ~ 1, + gene_id %in% DE_comp2 ~ 2, + gene_id %in% not_sig ~ 8 + )) %>% pull(color) +lfc$col <- as.factor(lfc$col) + + +ggplot(lfc, aes(x=comp1, y=comp2, color=col)) + geom_point() + + labs(color="Group") + + scale_color_discrete(name = "Group", labels = c(paste0("Only DE in ",paste0(comp1_name)), paste0("Only DE in ",paste0(comp2_name)),"DE in both comparisons", "Not Significant")) + + geom_abline(intercept=0, slope=1) + + geom_hline(aes(yintercept=0)) + + geom_vline(aes(xintercept=0)) + + scale_color_cb_friendly() + + xlab(paste0("Log2FC in ",paste0(comp1_name))) + + ylab(paste0("Log2FC in ",paste0(comp2_name))) + +``` + + + +## Compare ajusted P-values + +We plot adjusted P-values for our contrasts and color points by whether or not they are significant in our contrasts. The black line is 1:1. + + +```{r fig.height=6, fig.width=8} +# Edit based on the data you are using + +#make sure to only use genes present in both results files +test_intersect <- intersect(comp1$gene_id, comp2$gene_id) +comp1_sub <- subset(comp1, comp1$gene_id %in% test_intersect) +comp2_sub <- subset(comp2, comp2$gene_id %in% test_intersect) + +## Check that gene names match +all(comp1_sub$gene_id== comp2_sub$gene_id) + +## Gather necessary data +lfc <- data.frame(comp1_sub$gene_id, comp1_sub$gene_name, comp1_sub$padj, comp2_sub$padj) +colnames(lfc) <- c("gene_id","gene_name", "comp1", "comp2") + +# subset to only include genes in both datasets and color by grouping +DE_comp1 <- setdiff(comp1_sig$gene_id, comp2_sig$gene_id) +DE_comp2 <- setdiff(comp2_sig$gene_id, comp1_sig$gene_id) +DE_both <- intersect(comp2_sig$gene_id, comp1_sig$gene_id) +not_sig <- comp1_sub$gene_id[!(comp1_sub$gene_id %in% c(DE_comp1,DE_comp2,DE_both))] + + +col <- rep(4, nrow(lfc)) +col[lfc$gene_id %in% not_sig] <- 1 +col[lfc$gene_id %in% DE_comp1] <- 2 +col[lfc$gene_id %in% DE_comp2] <- 3 +col[lfc$gene_id %in% DE_both] <- 4 + + +lfc$col <- lfc %>% + dplyr::mutate(color = case_when( + gene_id %in% DE_both ~ 3, + gene_id %in% DE_comp1 ~ 1, + gene_id %in% DE_comp2 ~ 2, + gene_id %in% not_sig ~ 8 + )) %>% pull(color) +lfc$col <- as.factor(lfc$col) + + +ggplot(lfc, aes(x=-log10(comp1), y=-log10(comp2), color=col)) + + geom_point() + labs(color="Group") + + scale_color_discrete(name = "Group", labels = c(paste0("-Log10 adjusted p-value ",paste0(comp1_name)), paste0("-Log10 adjusted p-value ",paste0(comp2_name)),"DE in both comparisons", "Not Significant")) + + geom_abline(intercept=0, slope=1) + + geom_hline(aes(yintercept=0)) + + geom_vline(aes(xintercept=0)) + + scale_color_cb_friendly() + + xlab(paste0("Log2FC in ",paste0(comp1_name))) + + ylab(paste0("Log2FC in ",paste0(comp2_name))) + +``` + + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/PCA_variance_analysis.Rmd b/vignettes/PCA_variance_analysis.Rmd similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/DE/PCA_variance_analysis.Rmd rename to vignettes/PCA_variance_analysis.Rmd From 531a5186f458ccf190f1bca685a16ff65f147fb3 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 11 Jul 2024 16:43:12 -0400 Subject: [PATCH 13/93] add intersection analysis --- .../skeleton/DE/Comparison-intersections.Rmd | 258 ++++++++++++++++++ 1 file changed, 258 insertions(+) create mode 100644 inst/rmarkdown/templates/rnaseq/skeleton/DE/Comparison-intersections.Rmd diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/Comparison-intersections.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/Comparison-intersections.Rmd new file mode 100644 index 0000000..b073469 --- /dev/null +++ b/inst/rmarkdown/templates/rnaseq/skeleton/DE/Comparison-intersections.Rmd @@ -0,0 +1,258 @@ +--- +title: "Comparing DE Results - Multiple Contrasts" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + project_file: ../information.R +--- + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +## Adjusted P-value used for significance +padj_co <- 0.05 +## Log2FC used for significance. If no cutoff used put 0 +LFC <- 0.5 +## Normalized counts for ALL samples +# norm <- "/Users/emb016/Documents/comparisons_templates/norm_counts.csv" +# Load the count data, for this example it is the last columns of the DE table +norm_counts <- read.csv("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/norm_counts.csv.gz", + row.names = 1) + +# Load the meta data, here we are making one for the exmaple +metadata <- read_csv("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/meta.csv.gz") %>% as.data.frame() + +## Full results file (all genes) for contrast 1 +files=c("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group1.csv.gz", + "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group2.csv.gz", + "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group3.csv.gz") + +``` + + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(rtracklayer) +library(tidyverse) +library(stringr) +library(ggpubr) +library(knitr) +library(bcbioR) +library(ggprism) +library(viridis) +library(pheatmap) +library(janitor) +library(ggvenn) +library(ggplot2) +library(UpSetR) +library(ggprism) +#library(org.Ce.eg.db) +library(org.Hs.eg.db) +#library(org.Mm.eg.db) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + + +# Load Data + +We load our dataset + +```{r load_data} + +## Name of contrast. This will be displayed on the figures. +# you can manually indicate a list of names as comp_names=c("name1","name2"...) +comp_names = basename(files) %>% + str_remove_all("all_results_|.csv|.gz") %>% + str_replace_all("_", " ") +names(files)=comp_names +N=length(files) +stopifnot(length(files)==length(comp_names)) + +## Make sure you have set up N above +all_genes=lapply(names(files), function(name){ + data <- read_csv(files[name]) %>% + dplyr::filter(padj <= 1) +}) +sign_genes=lapply(names(files), function(name){ + data <- read_csv(files[name]) %>% + dplyr::filter(padj <= 1) + data %>% + dplyr::filter(padj < padj_co, abs(lfc) > LFC) +}) +``` + + + +# Make list of comparisons + + +```{r, fig.height=8, fig.width=8, warning=FALSE, error=FALSE, message=FALSE} +de=lapply(sign_genes, function(x){ + x$gene_id +}) +names(de) <- comp_names +``` + +## Make an upset plot + +Because we have done so many tests venn diagrams no longer work for our data. Instead we will use upset plots. *These plots are relatively intuitive for 2 or 3 categories, but can tend to get more complex for >3 categories. In all cases, you will find the categories being compared and their size listed below the bar plots on the left. As you look to the right (directly below each bar) there are dots with connecting lines that denote which categories the overlap is between, or if there is no overlap (just a dot). The numbers at the top of the bars denote the size of the overlap.* + + +```{r, fig.height=8, fig.width=12} +upset(fromList(de), order.by = "freq", nsets=N) + +``` + +## Pull intersect(s) of interest + +After identifying intersect(s) of interest we can determine which genes are found in which intersections + + +```{r, warning=FALSE, error=FALSE, message=FALSE} +## Grab intersection +gene_names <- data.frame(gene=unique(unlist(de))) + +df1 <- lapply(de,function(x){ + data.frame(gene = x) +}) %>% + bind_rows(.id = "path") + +df_int <- lapply(gene_names$gene,function(x){ + # pull the name of the intersections + intersection <- df1 %>% + dplyr::filter(gene==x) %>% + arrange(path) %>% + pull("path") %>% + paste0(collapse = "|") + # build the dataframe + data.frame(gene = x,int = intersection) +}) %>% bind_rows() +``` + + +```{r, eval=F} +## Run this code to find the name of your intersect of interest. You will use this in the next code chunk +table(df_int$int) +``` + +```{r, warning=FALSE, error=FALSE, message=FALSE} +## subset interaction of interest replace the intersect name with the name of the intersect from above. You can copy and paste the below commands to grab multiple intersects. + +Intersect1 <- subset(df_int, df_int$int=="DMSO vs Group2|DMSO vs Group3") +``` + +## Get annotation data +```{r, warning=FALSE, error=FALSE, message=FALSE} + +# edit this to be the correct organism. One set of annotations per intersect. +# rdata = AnnotationDbi::select(org.Hs.eg.db, Intersect1$gene, 'SYMBOL', 'ENSEMBL') %>% +# dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) %>% distinct(gene_id, .keep_all = T) + +# FIX: following code is only for test data, use the above with real data +rdata=data.frame(gene_id=row.names(norm_counts), gene_name=row.names(norm_counts)) +``` + + + +## Heatmap of intersect + +We generate a heatmap with all samples to see the patterns contained in this intersect. + +```{r, fig.height=6, warning=FALSE, error=FALSE, message=FALSE} +## Assign factors of interest. These need to correspond to columns in your metadata. + +factor1 <- "Treatment" +factor2 <- "Cell_line" + +# Extract significant genes +stopifnot(all(Intersect1$gene %in% row.names(norm_counts))) +sigGenes <- Intersect1$gene + +### Extract normalized expression for significant genes +norm_sig <- norm_counts[sigGenes,] +meta <- data.frame(metadata[,print(factor1)],metadata[,print(factor1)]) +colnames(meta) <- c(print(factor1),print(factor2)) +rownames(meta) <- colnames(norm_sig) +### Set a color palette +heat_colors <- lapply(colnames(norm_sig), function(c){ + l.col=colors[1:length(unique(norm_sig[[c]]))] + names(l.col)=unique(norm_sig[[c]]) + l.col +}) + +### Run pheatmap using the metadata data frame for the annotation (11 x 5) +pheatmap(norm_sig, + color = inferno(10), + cluster_rows = T, + show_rownames = F, + annotation = meta, + annotation_colors = heat_colors, + border_color = NA, + fontsize = 10, + scale = "row", + fontsize_row = 10, + height = 20) +``` + + +## Graph all genes in intersect + +```{r, warning=FALSE, error=FALSE, message=FALSE} +Intersect1_annot <- Intersect1 %>% left_join(rdata, by=c("gene"="gene_id")) +# REMOVE to plot all +Intersect1_annot <- Intersect1_annot[1:10] + +graphs <- length(Intersect1_annot$gene) +to_test <- t(norm_counts) +rna = Intersect1_annot$gene +names = Intersect1_annot$gene_name + +to_graph = data.frame(to_test[,rna]) +to_graph = to_graph[Intersect1_annot$gene] +to_graph$Factor1 <- metadata[,factor1] +to_graph$Factor2 <- metadata[,factor2] + +#out <- vector("list", length = graphs) +for (i in seq(1,graphs)) { + to_graph$temp=to_graph[[i]] + print(ggplot(to_graph,aes(x=Factor1,y=temp,color=Factor2)) + + geom_boxplot() + geom_point(alpha=0.5) + ylab(paste0(names[[i]])) + xlab(factor1) + scale_color_discrete(name = "Covariate")) +} +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` From 9702600a9bd4cb2e327cd6c0650c345d0639433f Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Fri, 19 Jul 2024 11:12:38 -0400 Subject: [PATCH 14/93] cellchat template --- .../templates/cellchat/skeleton/cellchat.Rmd | 440 ++++++++++++++++++ .../templates/cellchat/template.yaml | 3 + 2 files changed, 443 insertions(+) create mode 100644 inst/rmarkdown/templates/cellchat/skeleton/cellchat.Rmd create mode 100644 inst/rmarkdown/templates/cellchat/template.yaml diff --git a/inst/rmarkdown/templates/cellchat/skeleton/cellchat.Rmd b/inst/rmarkdown/templates/cellchat/skeleton/cellchat.Rmd new file mode 100644 index 0000000..57cc24c --- /dev/null +++ b/inst/rmarkdown/templates/cellchat/skeleton/cellchat.Rmd @@ -0,0 +1,440 @@ +--- +title: "CellChat" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: false + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + project_file: information_snrna.R + seurat_fn: ../data/fDat_sn_RC.rds + cellchat_fn: ../data/snrna_cellchat.qs + cellchat_grade2_fn: ../data/snrna_cellchat_grade2.qs + cellchat_grade0_fn: ../data/snrna_cellchat_grade0.qs +--- + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,} + +reticulate::use_virtualenv("/n/app/bcbio/R4.3.1_python_cellchat") +reticulate::py_config() # should show v3.9.14 +Sys.getenv("PYTHONPATH") # should be empty + +current_libs <- .libPaths() +.libPaths(c('/n/app/bcbio/R4.3.1_cellchat/', current_libs)) +library(CellChat) + +library(tidyverse) +library(Seurat) +library(bcbioR) +library(ggprism) +library(knitr) +library(tools) +library(qs) +library(patchwork) +library(ComplexHeatmap) + +options(stringsAsFactors = FALSE) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) + +cellchat_ran <- file.exists(params$cellchat_fn) +cellchat_rejection_ran <- file.exists(params$cellchat_grade2_fn) & file.exists(params$cellchat_grade0_fn) + +``` + +# Clustering + +```{r load_data } + +snrna <- readRDS(params$seurat_fn) + +# in this case, Chris_annot = cell_type +DimPlot(snrna, reduction = 'umap', group.by = 'Chris_annot') + +``` + +```{r prep cellchat inputs, eval = !cellchat_ran } + +# need to use normalized counts as input +data.input <- snrna[["SCT"]]@data +labels <- snrna$Chris_annot +meta <- data.frame(labels = labels, row.names = names(labels), samples = snrna$orig.ident) + +``` + +```{r create cellchat object, eval = !cellchat_ran } +cellchat <- createCellChat(object = data.input, meta = meta, group.by = "labels") + +``` + +```{r set cellchat db, eval = !cellchat_ran} +CellChatDB <- CellChatDB.human +CellChatDB.use <- subsetDB(CellChatDB) +cellchat@DB <- CellChatDB.use + +``` + +```{r subset and preprocess data, eval = !cellchat_ran } + +cellchat <- subsetData(cellchat) +cellchat <- updateCellChat(cellchat) +future::plan("multisession", workers = 8) # recommend running with at 8-16 cores +cellchat <- identifyOverExpressedGenes(cellchat) # may take a couple minutes +cellchat <- identifyOverExpressedInteractions(cellchat) # may take a couple minutes + +``` + +```{r compute communication prob, eval = !cellchat_ran} + +# Not recommended: project gene expression data onto protein-protein interaction network. +# Useful with shallow sequencing depth but introduces many weak communications. +# If used, must set raw.use = FALSE when running computeCommunProb +# cellchat <- projectData(cellchat, PPI.human) + + +# this next command takes 0.5-2+ hours +# can choose various methods for caculating average gene exp per group, +# 'triMean' allegedly produces fewer but stronger interactions +cellchat <- computeCommunProb(cellchat, type = "triMean") + +# filter out the cell-cell communication if < 50 cells per group +cellchat <- filterCommunication(cellchat, min.cells = 50) + +qsave(cellchat, '../data/snrna_cellchat.qs', preset = 'fast') + +``` + +# Overall Results + +```{r load cellchat, eval = cellchat_ran} +cellchat <- qread(params$cellchat_fn) + +df.net <- subsetCommunication(cellchat) %>% dplyr::arrange(pval) +df.net %>% sanitize_datatable() + +``` + +## Top interactions {.tabset} +```{r check pairs, results = 'asis', fig.width = 8, fig.height = 12} + +top_ints <- (df.net %>% pull(interaction_name) %>% unique)[1:10] +for (interaction in top_ints){ + cat('\n') + cat('### ', as.character(interaction), '\n') + interactors <- unlist(strsplit(as.character(interaction), '_')) + p1 <- VlnPlot(snrna, features = interactors, group.by = 'Chris_annot', + pt.size = 0.1, log = T, ncol = 1) + print(p1) + cat('\n') +} + +``` + +```{r compute pathway communication probs, eval = cellchat_ran} +cellchat <- computeCommunProbPathway(cellchat) +cellchat <- aggregateNet(cellchat) + +``` + +## Visualize Cell-Cell Communication Networks + +```{r chord plots, fig.width = 10, fig.height = 8, eval = cellchat_ran} + +groupSize <- as.numeric(table(cellchat@idents)) +par(mfrow = c(1,2), xpd=TRUE) +netVisual_circle(cellchat@net$count, vertex.weight = rowSums(cellchat@net$count), + weight.scale = T, label.edge= F, title.name = "Number of interactions") +netVisual_circle(cellchat@net$weight, vertex.weight = rowSums(cellchat@net$weight), + weight.scale = T, label.edge= F, title.name = "Interaction weights/strength") + +``` + +```{r heatmaps, eval = cellchat_ran} + +netVisual_heatmap(cellchat, measure = "count", color.heatmap = "Blues") +netVisual_heatmap(cellchat, measure = "weight", color.heatmap = "Blues") + +``` + +# Comparison Results + +Here we run the CellChat analysis twice, once on the Grade 2 rejection samples and once on the Grade 0 rejection samples. We compare the significant signaling interactions and investigate changes in them between rejection grades. + +```{r prep inputs rejection, eval=!cellchat_rejection_ran} + +grade2 <- subset(snrna, orig.ident %in% c('BRI-2396', 'BRI-2402')) +grade0 <- subset(snrna, orig.ident %in% c('BRI-2395', 'BRI-2411')) + +data.input_grade2 <- grade2[["SCT"]]@data +labels_grade2 <- grade2$Chris_annot +meta_grade2 <- data.frame(labels = labels_grade2, row.names = names(labels_grade2), samples = grade2$orig.ident) + +data.input_grade0 <- grade0[["SCT"]]@data +labels_grade0 <- grade0$Chris_annot +meta_grade0 <- data.frame(labels = labels_grade0, row.names = names(labels_grade0), samples = grade0$orig.ident) + +``` + +```{r create cellchat object rejection, eval=!cellchat_rejection_ran} +cellchat_grade2 <- createCellChat(object = data.input_grade2, meta = meta_grade2, group.by = "labels") +cellchat_grade0 <- createCellChat(object = data.input_grade0, meta = meta_grade0, group.by = "labels") + +``` + +```{r subset and preprocess data rejection, eval=!cellchat_rejection_ran} + +future::plan("multisession", workers = 8) # recommend running with at 8-16 cores + +cellchat_grade2@DB <- CellChatDB.use +cellchat_grade0@DB <- CellChatDB.use + +cellchat_grade2 <- subsetData(cellchat_grade2) +cellchat_grade2 <- updateCellChat(cellchat_grade2) +cellchat_grade2 <- identifyOverExpressedGenes(cellchat_grade2) # may take a couple minutes +cellchat_grade2 <- identifyOverExpressedInteractions(cellchat_grade2) # may take a couple minutes + +cellchat_grade0 <- subsetData(cellchat_grade0) +cellchat_grade0 <- updateCellChat(cellchat_grade0) +cellchat_grade0 <- identifyOverExpressedGenes(cellchat_grade0) # may take a couple minutes +cellchat_grade0 <- identifyOverExpressedInteractions(cellchat_grade0) # may take a couple minutes + +``` + +```{r compute communication prob rejection, eval=!cellchat_rejection_ran} +cellchat_grade2 <- computeCommunProb(cellchat_grade2, type = "triMean") # command takes 0.5-2+ hours +cellchat_grade2 <- filterCommunication(cellchat_grade2, min.cells = 50) +qsave(cellchat_grade2, params$cellchat_grade2_fn, preset = 'fast') + +cellchat_grade0 <- computeCommunProb(cellchat_grade0, type = "triMean") # command takes 0.5-2+ hours +cellchat_grade0 <- filterCommunication(cellchat_grade0, min.cells = 50) +qsave(cellchat_grade0, params$cellchat_grade0_fn, preset = 'fast') + +``` + +```{r load cellchat rejection, eval = cellchat_rejection_ran} + +cellchat_grade2 <- qread(params$cellchat_grade2_fn) +cellchat_grade0 <- qread(params$cellchat_grade0_fn) + +cellchat_grade2 <- filterCommunication(cellchat_grade2, min.cells = 50) +cellchat_grade0 <- filterCommunication(cellchat_grade0, min.cells = 50) + +df.net_grade2 <- subsetCommunication(cellchat_grade2)%>% dplyr::arrange(pval) +df.net_grade0 <- subsetCommunication(cellchat_grade0)%>% dplyr::arrange(pval) + +``` + +## Grade 2 + +```{r datatable grade 2, eval = cellchat_rejection_ran} +df.net_grade2 %>% sanitize_datatable() + +``` + +### Top interactions {.tabset} +```{r check pairs grade 2, results = 'asis', fig.width = 8, fig.height = 12} + +top_ints <- (df.net_grade2 %>% pull(interaction_name) %>% unique)[1:10] +for (interaction in top_ints){ + cat('\n') + cat('#### ', as.character(interaction), '\n') + interactors <- unlist(strsplit(as.character(interaction), '_')) + p1 <- VlnPlot(snrna, features = interactors, group.by = 'Chris_annot', pt.size = 0.1, log = T, ncol = 1) + print(p1) + cat('\n') +} + +``` + + +## Grade 0 + +```{r datatable grade 0, eval = cellchat_rejection_ran} +df.net_grade0 %>% sanitize_datatable() + +``` + +### Top interactions {.tabset} +```{r check pairs grade 0, results = 'asis', fig.width = 8, fig.height = 12} + +top_ints <- (df.net_grade0 %>% pull(interaction_name) %>% unique)[1:10] +for (interaction in top_ints){ + cat('\n') + cat('#### ', as.character(interaction), '\n') + interactors <- unlist(strsplit(as.character(interaction), '_')) + p1 <- VlnPlot(snrna, features = interactors, group.by = 'Chris_annot', pt.size = 0.1, log = T, ncol = 1) + print(p1) + cat('\n') +} + +``` + +```{r merge rejection objects, eval = cellchat_rejection_ran} + +cellchat_grade2 <- computeCommunProbPathway(cellchat_grade2) +cellchat_grade2 <- aggregateNet(cellchat_grade2) +cellchat_grade2 <- netAnalysis_computeCentrality(cellchat_grade2) +cellchat_grade0 <- computeCommunProbPathway(cellchat_grade0) +cellchat_grade0 <- aggregateNet(cellchat_grade0) +cellchat_grade0 <- netAnalysis_computeCentrality(cellchat_grade0) + +object.list <- list(grade0 = cellchat_grade0, grade2 = cellchat_grade2) +cellchat_merged <- mergeCellChat(object.list, add.names = names(object.list)) + +df.net_merged <- subsetCommunication(cellchat_merged) + +``` + +## Compare Interactions/Interaction Strength + +```{r compare interactions, eval = cellchat_rejection_ran} + +gg1 <- compareInteractions(cellchat_merged, show.legend = F, group = c(1,2)) +gg2 <- compareInteractions(cellchat_merged, show.legend = F, group = c(1,2), measure = "weight") +gg1 + gg2 + +``` + +```{r chord plots merged, eval = cellchat_rejection_ran, fig.width = 10, fig.height = 8} +par(mfrow = c(1,2), xpd=TRUE) +netVisual_diffInteraction(cellchat_merged, weight.scale = T) +netVisual_diffInteraction(cellchat_merged, weight.scale = T, measure = "weight") + +``` + +```{r heatmaps merged, eval = cellchat_rejection_ran, fig.width = 10, fig.height = 8} + +gg1 <- netVisual_heatmap(cellchat_merged) +gg2 <- netVisual_heatmap(cellchat_merged, measure = "weight") +gg1 + gg2 + +``` + +## Compare Major Pathway Sources and Targets + +From the CellChat documentation: "Comparing the outgoing and incoming interaction strength in a 2D space allows ready identification of the cell populations with significant changes in sending or receiving signals between different datasets." + +```{r compare send/receive changes, eval = cellchat_rejection_ran, fig.width = 10, fig.height = 6} + +num.link <- sapply(object.list, function(x) {rowSums(x@net$count) + colSums(x@net$count)-diag(x@net$count)}) +weight.MinMax <- c(min(num.link), max(num.link)) # control the dot size in the different datasets +gg <- list() +for (i in 1:length(object.list)) { + gg[[i]] <- netAnalysis_signalingRole_scatter(object.list[[i]], title = names(object.list)[i], weight.MinMax = weight.MinMax) +} +patchwork::wrap_plots(plots = gg) +``` + + +```{r identify signaling changes, eval = cellchat_rejection_ran, fig.width = 12, fig.height = 12} +gg1 <- netAnalysis_signalingChanges_scatter(cellchat_merged, idents.use = "Vascular_EC") +gg2 <- netAnalysis_signalingChanges_scatter(cellchat_merged, idents.use = "Lymphatic_EC") +gg3 <- netAnalysis_signalingChanges_scatter(cellchat_merged, idents.use = "Pericyte") +patchwork::wrap_plots(plots = list(gg1,gg2,gg3), nrow = 3, ncol = 1) + +``` + +## Cluster Altered Signaling Interactions + +From the CellChat documentation: "CellChat performs joint manifold learning and classification of the inferred communication networks based on their functional and topological similarity across different conditions. + +By quantifying the similarity between the cellular communication networks of signaling pathways across conditions, this analysis highlights the potentially altered signaling pathways. CellChat adopts the concept of network rewiring from network biology and hypothesized that the difference between different communication networks may affect biological processes across conditions. UMAP is used for visualizing signaling relationship and interpreting our signaling outputs in an intuitive way without involving the classification of conditions. + +Functional similarity: High degree of functional similarity indicates major senders and receivers are similar, and it can be interpreted as the two signaling pathways or two ligand-receptor pairs exhibit similar and/or redundant roles. + +Structural similarity: A structural similarity was used to compare their signaling network structure, without considering the similarity of senders and receivers." + + +### Based on Functional Similarity + +```{r identify signaling groups functional, eval = cellchat_rejection_ran} + +cellchat_merged <- computeNetSimilarityPairwise(cellchat_merged, type = "functional") +cellchat_merged <- netEmbedding(cellchat_merged, type = "functional") +cellchat_merged <- netClustering(cellchat_merged, type = "functional") +netVisual_embeddingPairwise(cellchat_merged, type = "functional", label.size = 3.5) + +``` + +### Based on Structural Similarity + +```{r identify signaling groups structural, eval = cellchat_rejection_ran} +cellchat_merged <- computeNetSimilarityPairwise(cellchat_merged, type = "structural") +cellchat_merged <- netEmbedding(cellchat_merged, type = "structural") +cellchat_merged <- netClustering(cellchat_merged, type = "structural") +netVisual_embeddingPairwise(cellchat_merged, type = "structural", label.size = 3.5) +``` + +## Compare Overall Signaling Information Flow + +"CellChat can identify the conserved and context-specific signaling pathways by simply comparing the information flow for each signaling pathway, which is defined by the sum of communication probability among all pairs of cell groups in the inferred network (i.e., the total weights in the network)." + +```{r info flow, fig.height = 9, eval = cellchat_rejection_ran} + +rankNet(cellchat_merged, mode = "comparison", measure = "weight", sources.use = NULL, targets.use = NULL, stacked = F, do.stat = TRUE) + +``` + +## Compare Signaling Patterns Across Cell Populations + +"In this heatmap, colobar represents the relative signaling strength of a signaling pathway across cell groups (Note that values are row-scaled). The top colored bar plot shows the total signaling strength of a cell group by summarizing all signaling pathways displayed in the heatmap. The right grey bar plot shows the total signaling strength of a signaling pathway by summarizing all cell groups displayed in the heatmap." + + +```{r outgoing signaling, fig.height = 9, eval = cellchat_rejection_ran} + +i = 1 +pathway.union <- union(object.list[[i]]@netP$pathways, object.list[[i+1]]@netP$pathways) +ht1 = netAnalysis_signalingRole_heatmap(object.list[[i]], pattern = "outgoing", signaling = pathway.union, title = names(object.list)[i], width = 5, height = 16, cluster.cols = T) +ht2 = netAnalysis_signalingRole_heatmap(object.list[[i+1]], pattern = "outgoing", signaling = pathway.union, title = names(object.list)[i+1], width = 5, height = 16, cluster.cols = T) +draw(ht1 + ht2, ht_gap = unit(0.5, "cm")) +``` + +```{r incoming signaling, fig.height = 9, eval = cellchat_rejection_ran} +ht1 = netAnalysis_signalingRole_heatmap(object.list[[i]], pattern = "incoming", signaling = pathway.union, title = names(object.list)[i], width = 5, height = 16, cluster.cols = T) +ht2 = netAnalysis_signalingRole_heatmap(object.list[[i+1]], pattern = "incoming", signaling = pathway.union, title = names(object.list)[i+1], width = 5, height = 16, cluster.cols = T) +draw(ht1 + ht2, ht_gap = unit(0.5, "cm")) +``` + +## Identify Dysfunctional Interaction Signaling Using Communication Probabilities + +"CellChat can identify the up-regulated (increased) and down-regulated (decreased) signaling ligand-receptor pairs in one dataset compared to the other dataset by comparing the communication probability between two datasets for each L-R pair and each pair of cell groups" + +```{r compare signaling, fig.height = 12, fig.width = 8, eval = cellchat_rejection_ran} + +gg1 <- netVisual_bubble(cellchat_merged, + # sources.use = c('Vascular_EC', 'Lymphatic_EC', 'Pericyte'), + # targets.use = c('Vascular_EC', 'Lymphatic_EC', 'Pericyte'), + comparison = c(1, 2), + max.dataset = 2, + title.name = "Increased signaling in Grade 2", + angle.x = 45, + remove.isolate = T) +gg1 +signaling.grade2_increased = gg1$data \ No newline at end of file diff --git a/inst/rmarkdown/templates/cellchat/template.yaml b/inst/rmarkdown/templates/cellchat/template.yaml new file mode 100644 index 0000000..aa08347 --- /dev/null +++ b/inst/rmarkdown/templates/cellchat/template.yaml @@ -0,0 +1,3 @@ +name: bcbio CellChat +description: Standard CellChat analyses +create_dir: false From 48f1b44ab4bb2fc6a394fd9a6eb3b15e605823e2 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 25 Jul 2024 14:54:35 -0400 Subject: [PATCH 15/93] move to templates --- .../common/skeleton/code/placeholder.R | 0 .../templates/common/skeleton/data/readme | 0 .../common/skeleton/meta/placeholder.R | 0 .../common/skeleton/scripts/placeholder | 0 .../templates/rnaseq/skeleton/QC/placeholder | 0 .../templates/teaseq/skeleton/skeleton.Rmd | 25 ------------------- .../skeleton => templates/base}/.gitignore | 0 .../skeleton => templates/base}/README.md | 0 .../skeleton => templates/base}/information.R | 0 .../skeleton => templates/cosmx}/QC/QC.Rmd | 0 .../cosmx}/QC/run_markdown.R | 0 .../cosmx}/information.R | 0 .../rnaseq}/DE/Comparison-intersections.Rmd | 0 .../rnaseq}/DE/Cross-comparison-analysis.Rmd | 0 .../skeleton => templates/rnaseq}/DE/DEG.Rmd | 0 .../rnaseq}/DE/load_data.R | 0 .../rnaseq}/DE/params_de-example.R | 0 .../rnaseq}/DE/params_de.R | 0 .../rnaseq}/DE/run_markdown.R | 0 .../skeleton => templates/rnaseq}/QC/QC.Rmd | 0 .../rnaseq}/QC/QC_nf-core.Rmd | 0 .../rnaseq}/QC/params_qc.R | 0 .../rnaseq}/QC/params_qc_nf-core-example.R | 0 .../rnaseq}/QC/params_qc_nf-core.R | 0 .../rnaseq}/QC/run_markdown.R | 0 .../skeleton => templates/rnaseq}/README.md | 0 .../rnaseq}/information.R | 0 .../singlecell}/Integration/helpers.R | 0 .../singlecell}/README.md | 0 .../singlecell}/cellchat.Rmd | 2 +- .../singlecell}/information.R | 0 .../singlecell}/scRNA_qc_template.rmd | 0 .../singlecell}/skeleton.Rmd | 0 .../singlecell}/starting_steps.md | 0 .../teaseq}/QC/QC-01-load_data.R | 0 .../teaseq}/QC/QC-02-run_analysis.R | 0 .../skeleton => templates/teaseq}/QC/QC.Rmd | 0 .../skeleton => templates/teaseq}/README.md | 0 .../teaseq}/information.R | 0 .../teaseq}/scripts/fix_filenames.R | 0 .../teaseq}/scripts/gex_adt_hto.sbatch | 0 .../teaseq}/scripts/gex_atac.sbatch | 0 42 files changed, 1 insertion(+), 26 deletions(-) delete mode 100644 inst/rmarkdown/templates/common/skeleton/code/placeholder.R delete mode 100644 inst/rmarkdown/templates/common/skeleton/data/readme delete mode 100644 inst/rmarkdown/templates/common/skeleton/meta/placeholder.R delete mode 100644 inst/rmarkdown/templates/common/skeleton/scripts/placeholder delete mode 100644 inst/rmarkdown/templates/rnaseq/skeleton/QC/placeholder delete mode 100644 inst/rmarkdown/templates/teaseq/skeleton/skeleton.Rmd rename inst/{rmarkdown/templates/common/skeleton => templates/base}/.gitignore (100%) rename inst/{rmarkdown/templates/common/skeleton => templates/base}/README.md (100%) rename inst/{rmarkdown/templates/common/skeleton => templates/base}/information.R (100%) rename inst/{rmarkdown/templates/cosmx/skeleton => templates/cosmx}/QC/QC.Rmd (100%) rename inst/{rmarkdown/templates/cosmx/skeleton => templates/cosmx}/QC/run_markdown.R (100%) rename inst/{rmarkdown/templates/cosmx/skeleton => templates/cosmx}/information.R (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/DE/Comparison-intersections.Rmd (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/DE/Cross-comparison-analysis.Rmd (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/DE/DEG.Rmd (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/DE/load_data.R (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/DE/params_de-example.R (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/DE/params_de.R (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/DE/run_markdown.R (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/QC/QC.Rmd (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/QC/QC_nf-core.Rmd (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/QC/params_qc.R (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/QC/params_qc_nf-core-example.R (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/QC/params_qc_nf-core.R (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/QC/run_markdown.R (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/README.md (100%) rename inst/{rmarkdown/templates/rnaseq/skeleton => templates/rnaseq}/information.R (100%) rename inst/{rmarkdown/templates/singlecell/skeleton => templates/singlecell}/Integration/helpers.R (100%) rename inst/{rmarkdown/templates/singlecell/skeleton => templates/singlecell}/README.md (100%) rename inst/{rmarkdown/templates/cellchat/skeleton => templates/singlecell}/cellchat.Rmd (99%) rename inst/{rmarkdown/templates/singlecell/skeleton => templates/singlecell}/information.R (100%) rename inst/{rmarkdown/templates/singlecell/skeleton => templates/singlecell}/scRNA_qc_template.rmd (100%) rename inst/{rmarkdown/templates/singlecell/skeleton => templates/singlecell}/skeleton.Rmd (100%) rename inst/{rmarkdown/templates/singlecell/skeleton => templates/singlecell}/starting_steps.md (100%) rename inst/{rmarkdown/templates/teaseq/skeleton => templates/teaseq}/QC/QC-01-load_data.R (100%) rename inst/{rmarkdown/templates/teaseq/skeleton => templates/teaseq}/QC/QC-02-run_analysis.R (100%) rename inst/{rmarkdown/templates/teaseq/skeleton => templates/teaseq}/QC/QC.Rmd (100%) rename inst/{rmarkdown/templates/teaseq/skeleton => templates/teaseq}/README.md (100%) rename inst/{rmarkdown/templates/teaseq/skeleton => templates/teaseq}/information.R (100%) rename inst/{rmarkdown/templates/teaseq/skeleton => templates/teaseq}/scripts/fix_filenames.R (100%) rename inst/{rmarkdown/templates/teaseq/skeleton => templates/teaseq}/scripts/gex_adt_hto.sbatch (100%) rename inst/{rmarkdown/templates/teaseq/skeleton => templates/teaseq}/scripts/gex_atac.sbatch (100%) diff --git a/inst/rmarkdown/templates/common/skeleton/code/placeholder.R b/inst/rmarkdown/templates/common/skeleton/code/placeholder.R deleted file mode 100644 index e69de29..0000000 diff --git a/inst/rmarkdown/templates/common/skeleton/data/readme b/inst/rmarkdown/templates/common/skeleton/data/readme deleted file mode 100644 index e69de29..0000000 diff --git a/inst/rmarkdown/templates/common/skeleton/meta/placeholder.R b/inst/rmarkdown/templates/common/skeleton/meta/placeholder.R deleted file mode 100644 index e69de29..0000000 diff --git a/inst/rmarkdown/templates/common/skeleton/scripts/placeholder b/inst/rmarkdown/templates/common/skeleton/scripts/placeholder deleted file mode 100644 index e69de29..0000000 diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/placeholder b/inst/rmarkdown/templates/rnaseq/skeleton/QC/placeholder deleted file mode 100644 index e69de29..0000000 diff --git a/inst/rmarkdown/templates/teaseq/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/teaseq/skeleton/skeleton.Rmd deleted file mode 100644 index dc4bbf5..0000000 --- a/inst/rmarkdown/templates/teaseq/skeleton/skeleton.Rmd +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: "General Project Information" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - params_file: information.R ---- - -```{r echo = F} -source(params$params_file) -``` diff --git a/inst/rmarkdown/templates/common/skeleton/.gitignore b/inst/templates/base/.gitignore similarity index 100% rename from inst/rmarkdown/templates/common/skeleton/.gitignore rename to inst/templates/base/.gitignore diff --git a/inst/rmarkdown/templates/common/skeleton/README.md b/inst/templates/base/README.md similarity index 100% rename from inst/rmarkdown/templates/common/skeleton/README.md rename to inst/templates/base/README.md diff --git a/inst/rmarkdown/templates/common/skeleton/information.R b/inst/templates/base/information.R similarity index 100% rename from inst/rmarkdown/templates/common/skeleton/information.R rename to inst/templates/base/information.R diff --git a/inst/rmarkdown/templates/cosmx/skeleton/QC/QC.Rmd b/inst/templates/cosmx/QC/QC.Rmd similarity index 100% rename from inst/rmarkdown/templates/cosmx/skeleton/QC/QC.Rmd rename to inst/templates/cosmx/QC/QC.Rmd diff --git a/inst/rmarkdown/templates/cosmx/skeleton/QC/run_markdown.R b/inst/templates/cosmx/QC/run_markdown.R similarity index 100% rename from inst/rmarkdown/templates/cosmx/skeleton/QC/run_markdown.R rename to inst/templates/cosmx/QC/run_markdown.R diff --git a/inst/rmarkdown/templates/cosmx/skeleton/information.R b/inst/templates/cosmx/information.R similarity index 100% rename from inst/rmarkdown/templates/cosmx/skeleton/information.R rename to inst/templates/cosmx/information.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/Comparison-intersections.Rmd b/inst/templates/rnaseq/DE/Comparison-intersections.Rmd similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/DE/Comparison-intersections.Rmd rename to inst/templates/rnaseq/DE/Comparison-intersections.Rmd diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/Cross-comparison-analysis.Rmd b/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/DE/Cross-comparison-analysis.Rmd rename to inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd rename to inst/templates/rnaseq/DE/DEG.Rmd diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/load_data.R b/inst/templates/rnaseq/DE/load_data.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/DE/load_data.R rename to inst/templates/rnaseq/DE/load_data.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de-example.R b/inst/templates/rnaseq/DE/params_de-example.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de-example.R rename to inst/templates/rnaseq/DE/params_de-example.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de.R b/inst/templates/rnaseq/DE/params_de.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de.R rename to inst/templates/rnaseq/DE/params_de.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/run_markdown.R b/inst/templates/rnaseq/DE/run_markdown.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/DE/run_markdown.R rename to inst/templates/rnaseq/DE/run_markdown.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd b/inst/templates/rnaseq/QC/QC.Rmd similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd rename to inst/templates/rnaseq/QC/QC.Rmd diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd b/inst/templates/rnaseq/QC/QC_nf-core.Rmd similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd rename to inst/templates/rnaseq/QC/QC_nf-core.Rmd diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc.R b/inst/templates/rnaseq/QC/params_qc.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc.R rename to inst/templates/rnaseq/QC/params_qc.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core-example.R b/inst/templates/rnaseq/QC/params_qc_nf-core-example.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core-example.R rename to inst/templates/rnaseq/QC/params_qc_nf-core-example.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R b/inst/templates/rnaseq/QC/params_qc_nf-core.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R rename to inst/templates/rnaseq/QC/params_qc_nf-core.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/run_markdown.R b/inst/templates/rnaseq/QC/run_markdown.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/QC/run_markdown.R rename to inst/templates/rnaseq/QC/run_markdown.R diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/README.md b/inst/templates/rnaseq/README.md similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/README.md rename to inst/templates/rnaseq/README.md diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/information.R b/inst/templates/rnaseq/information.R similarity index 100% rename from inst/rmarkdown/templates/rnaseq/skeleton/information.R rename to inst/templates/rnaseq/information.R diff --git a/inst/rmarkdown/templates/singlecell/skeleton/Integration/helpers.R b/inst/templates/singlecell/Integration/helpers.R similarity index 100% rename from inst/rmarkdown/templates/singlecell/skeleton/Integration/helpers.R rename to inst/templates/singlecell/Integration/helpers.R diff --git a/inst/rmarkdown/templates/singlecell/skeleton/README.md b/inst/templates/singlecell/README.md similarity index 100% rename from inst/rmarkdown/templates/singlecell/skeleton/README.md rename to inst/templates/singlecell/README.md diff --git a/inst/rmarkdown/templates/cellchat/skeleton/cellchat.Rmd b/inst/templates/singlecell/cellchat.Rmd similarity index 99% rename from inst/rmarkdown/templates/cellchat/skeleton/cellchat.Rmd rename to inst/templates/singlecell/cellchat.Rmd index 57cc24c..d4cc6f4 100644 --- a/inst/rmarkdown/templates/cellchat/skeleton/cellchat.Rmd +++ b/inst/templates/singlecell/cellchat.Rmd @@ -17,7 +17,7 @@ output: editor_options: chunk_output_type: console params: - project_file: information_snrna.R + project_file: information.R seurat_fn: ../data/fDat_sn_RC.rds cellchat_fn: ../data/snrna_cellchat.qs cellchat_grade2_fn: ../data/snrna_cellchat_grade2.qs diff --git a/inst/rmarkdown/templates/singlecell/skeleton/information.R b/inst/templates/singlecell/information.R similarity index 100% rename from inst/rmarkdown/templates/singlecell/skeleton/information.R rename to inst/templates/singlecell/information.R diff --git a/inst/rmarkdown/templates/singlecell/skeleton/scRNA_qc_template.rmd b/inst/templates/singlecell/scRNA_qc_template.rmd similarity index 100% rename from inst/rmarkdown/templates/singlecell/skeleton/scRNA_qc_template.rmd rename to inst/templates/singlecell/scRNA_qc_template.rmd diff --git a/inst/rmarkdown/templates/singlecell/skeleton/skeleton.Rmd b/inst/templates/singlecell/skeleton.Rmd similarity index 100% rename from inst/rmarkdown/templates/singlecell/skeleton/skeleton.Rmd rename to inst/templates/singlecell/skeleton.Rmd diff --git a/inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md b/inst/templates/singlecell/starting_steps.md similarity index 100% rename from inst/rmarkdown/templates/singlecell/skeleton/starting_steps.md rename to inst/templates/singlecell/starting_steps.md diff --git a/inst/rmarkdown/templates/teaseq/skeleton/QC/QC-01-load_data.R b/inst/templates/teaseq/QC/QC-01-load_data.R similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/QC/QC-01-load_data.R rename to inst/templates/teaseq/QC/QC-01-load_data.R diff --git a/inst/rmarkdown/templates/teaseq/skeleton/QC/QC-02-run_analysis.R b/inst/templates/teaseq/QC/QC-02-run_analysis.R similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/QC/QC-02-run_analysis.R rename to inst/templates/teaseq/QC/QC-02-run_analysis.R diff --git a/inst/rmarkdown/templates/teaseq/skeleton/QC/QC.Rmd b/inst/templates/teaseq/QC/QC.Rmd similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/QC/QC.Rmd rename to inst/templates/teaseq/QC/QC.Rmd diff --git a/inst/rmarkdown/templates/teaseq/skeleton/README.md b/inst/templates/teaseq/README.md similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/README.md rename to inst/templates/teaseq/README.md diff --git a/inst/rmarkdown/templates/teaseq/skeleton/information.R b/inst/templates/teaseq/information.R similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/information.R rename to inst/templates/teaseq/information.R diff --git a/inst/rmarkdown/templates/teaseq/skeleton/scripts/fix_filenames.R b/inst/templates/teaseq/scripts/fix_filenames.R similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/scripts/fix_filenames.R rename to inst/templates/teaseq/scripts/fix_filenames.R diff --git a/inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_adt_hto.sbatch b/inst/templates/teaseq/scripts/gex_adt_hto.sbatch similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_adt_hto.sbatch rename to inst/templates/teaseq/scripts/gex_adt_hto.sbatch diff --git a/inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_atac.sbatch b/inst/templates/teaseq/scripts/gex_atac.sbatch similarity index 100% rename from inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_atac.sbatch rename to inst/templates/teaseq/scripts/gex_atac.sbatch From 2a4cf2fcaedebffb4d6bdadf2260eb30b7bf5534 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 25 Jul 2024 15:07:16 -0400 Subject: [PATCH 16/93] save QC --- inst/templates/rnaseq/qc/QC_nf-core.Rmd | 84 +++++++++++++------------ 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/inst/templates/rnaseq/qc/QC_nf-core.Rmd b/inst/templates/rnaseq/qc/QC_nf-core.Rmd index 87a6a9e..dee4712 100644 --- a/inst/templates/rnaseq/qc/QC_nf-core.Rmd +++ b/inst/templates/rnaseq/qc/QC_nf-core.Rmd @@ -17,45 +17,41 @@ output: editor_options: chunk_output_type: console params: + # params_file: params_qc_nf-core-example.R # example data + # Fill this file with the right paths to nfcore output + params_file: params_qc_nf-core.R # Put hg38, mm10, mm39, or other genome: hg38 + project_file: ../information.R factor_of_interest: sample_type --- -```{r, echo = F} +```{r} +# This set up the working directory to this file so all files can be found library(rstudioapi) setwd(fs::path_dir(getSourceEditorContext()$path)) ``` -```{r} -metadata_fn="{{metadata_fn}}" -se_object="{{se_object}}" -# This folder is in the output directory inside multiqc folder -multiqc_data_dir="{{multiqc_data_dir}}" -# This file is inside the genome folder in the output directory -gtf_fn="{{gtf_fn}}" -``` ```{r source_params, echo = F} -#knitr::opts_knit$set(root.dir = getSourceEditorContext()$path) -# 1. set up factor_of_interest parameter from parameter above or manualy +# 1. set up factor_of_interest parameter from parameter above or manually # this is used to color plots, it needs to be part of the metadata factor_of_interest=params$factor_of_interest genome=params$genome # 2. Set input files in this file -# This is the file used to run nf-core or compatible to that -metadata_fn='/Path/to/metadata/meta.csv' -# This file is inside star_salmon/ folder -se_object='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.rds' -# This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder -multiqc_data_dir='/path/to/nf-core/output/star_salmon/multiqc_report_data' -# This file is inside the genome folder in the output directory, use this only for non-model organism -# gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' +source(params$params_file) +# 3. If you set up this file, project information will be printed below and +#. it can be reused for other Rmd files. +source(params$project_file) ``` # Overview -{{ project }} +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` + ```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} library(tidyverse) @@ -80,7 +76,7 @@ opts_chunk[["set"]]( prompt = FALSE, tidy = FALSE, warning = FALSE, - fig.height = 6) + fig.height = 4) ``` @@ -149,18 +145,14 @@ meta_df=read_csv(metadata_fn) %>% meta_df$sample <- make.names(meta_df$sample) order <- meta_df$sample -# remove some columns -meta_df <- data.frame(meta_df[,!(colnames(meta_df) %in% c("fastq_1", "fastq_2", "strandedness"))]) - - ggplot(meta_df, aes(.data[[factor_of_interest]], fill = .data[[factor_of_interest]])) + geom_bar() + ylab("") + xlab("") + ylab("# of samples") + - scale_fill_cb_friendly() + scale_fill_cb_friendly() + theme(axis.text.x=element_text(angle = 90, vjust = 0.5), legend.position = "none") ``` -```{r} +```{r load_data} # read counts from SE object se <- readRDS(se_object) raw_counts <- assays(se)[["counts"]] %>% round() %>% @@ -168,9 +160,9 @@ raw_counts <- assays(se)[["counts"]] %>% round() %>% raw_counts=raw_counts[rowSums(raw_counts)!=0,] ``` -```{r prepare metrics} +```{r prepare_metrics} # Get metrics from nf-core into bcbio like table -# many metrics are already in the Genereal Table of MultiQC, this reads the file +# many metrics are already in the General Table of MultiQC, this reads the file metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) # we get some more metrics from Qualimap and rename columns @@ -259,7 +251,7 @@ metrics <- metrics %>% ``` -```{r show-metadata} +```{r show_metadata} meta_sm <- meta_df %>% as.data.frame() %>% column_to_rownames("sample") @@ -282,9 +274,10 @@ metrics %>% geom_bar(stat = "identity") + coord_flip() + scale_y_continuous(name = "million reads") + + scale_x_discrete(limits = rev) + scale_fill_cb_friendly() + xlab("") + ggtitle("Total reads") + - geom_hline(yintercept=20000000, color = "grey", size=2) + geom_hline(yintercept=20000000, color = "grey", linewidth=2) metrics %>% ggplot(aes(x = .data[[factor_of_interest]], @@ -315,10 +308,11 @@ metrics %>% color = .data[[factor_of_interest]])) + geom_point(alpha = 0.5, size=4) + coord_flip() + + scale_x_discrete(limits = rev) + scale_color_cb_friendly() + ylim(0, 100) + ggtitle("Mapping rate") + xlab("") + - geom_hline(yintercept=70, color = "grey", size=2) + geom_hline(yintercept=70, color = "grey", linewidth=2) ``` @@ -346,10 +340,11 @@ ggplot(metrics,aes(x = factor(sample, level = order), geom_bar(stat = "identity") + coord_flip() + scale_fill_cb_friendly() + + scale_x_discrete(limits = rev) + ggtitle("Number of genes") + ylab("Number of genes") + xlab("") + - geom_hline(yintercept=20000, color = "grey", size=2) + geom_hline(yintercept=20000, color = "grey", linewidth=2) metrics %>% ggplot(aes(x = .data[[factor_of_interest]], @@ -357,9 +352,10 @@ metrics %>% color = .data[[factor_of_interest]])) + geom_point(alpha = 0.5, size=4) + coord_flip() + + scale_x_discrete(limits = rev) + scale_y_continuous(name = "million reads") + scale_color_cb_friendly() + xlab("") + - ggtitle("Total reads") + ggtitle("Number of Genes") ``` @@ -393,11 +389,12 @@ metrics %>% geom_point(alpha = 0.5, size=4) + ylab("Exonic rate %") + ggtitle("Exonic mapping rate") + + scale_x_discrete(limits = rev) + scale_color_cb_friendly() + coord_flip() + xlab("") + ylim(c(0,100)) + - geom_hline(yintercept=70, color = "grey", size=2) + geom_hline(yintercept=70, color = "grey", linewidth=2) ``` ## Intronic mapping rate @@ -412,11 +409,12 @@ metrics %>% geom_point(alpha = 0.5, size=4) + ylab("Intronic rate %") + ggtitle("Intronic mapping rate") + + scale_x_discrete(limits = rev) + scale_color_cb_friendly() + coord_flip() + xlab("") + ylim(c(0,100)) + - geom_hline(yintercept=20, color = "grey", size=2) + geom_hline(yintercept=20, color = "grey", linewidth=2) ``` ## Intergenic mapping rate @@ -432,9 +430,10 @@ metrics %>% ylab("Intergenic rate %") + ggtitle("Intergenic mapping rate") + coord_flip() + xlab("") + + scale_x_discrete(limits = rev) + scale_color_cb_friendly() + ylim(c(0, 100)) + - geom_hline(yintercept=15, color = "grey", size=2) + geom_hline(yintercept=15, color = "grey", linewidth=2) ``` ## tRNA/rRNA mapping rate @@ -448,14 +447,15 @@ metrics %>% ggplot(aes(x = factor(sample, level = order), y = r_and_t_rna_rate * 100, color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5) + + geom_point(alpha = 0.5, size=4) + ylab("tRNA/rRNA rate, %")+ ylim(0, rrna_ylim) + ggtitle("tRNA/rRNA mapping rate") + coord_flip() + + scale_x_discrete(limits = rev) + scale_color_cb_friendly() + ylim(c(0, 100)) + xlab("") + - geom_hline(yintercept=10, color = "grey", size=2) + geom_hline(yintercept=10, color = "grey", linewidth=2) ``` ## 5'->3' bias @@ -470,9 +470,10 @@ metrics %>% geom_point(alpha = 0.5, size=4) + ggtitle("5'-3' bias") + coord_flip() + + scale_x_discrete(limits = rev) + ylim(c(0.5,1.5)) + xlab("") + ylab("5'-3' bias") + scale_color_cb_friendly()+ - geom_hline(yintercept=1, color = "grey", size=2) + geom_hline(yintercept=1, color = "grey", linewidth=2) ``` ## Counts per gene - all genes @@ -495,6 +496,7 @@ ggplot(counts, aes(factor(name, level = order), log2(counts+1), fill = .data[[factor_of_interest]])) + geom_boxplot() + + scale_x_discrete(limits = rev) + scale_fill_cb_friendly() + coord_flip() + xlab("") + ggtitle("Counts per gene, all non-zero genes") + @@ -510,7 +512,7 @@ In this section, we look at how well the different groups in the dataset cluster Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells). - + ```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} vst <- vst(raw_counts) From 7918f2c8e51909f7e0932f1c5a1335a599aec19a Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 25 Jul 2024 15:08:29 -0400 Subject: [PATCH 17/93] small fix --- inst/templates/rnaseq/de/DEG.Rmd | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/inst/templates/rnaseq/de/DEG.Rmd b/inst/templates/rnaseq/de/DEG.Rmd index bd3db73..c2069be 100644 --- a/inst/templates/rnaseq/de/DEG.Rmd +++ b/inst/templates/rnaseq/de/DEG.Rmd @@ -175,7 +175,6 @@ rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL ``` ```{r setup_RUV} - dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1) vsd_before <- vst(dds_to_use) @@ -579,4 +578,4 @@ List and version of tools used for the DE report generation. ```{r} sessionInfo() -``` \ No newline at end of file +``` From cff4a5c0daeac3b4d62a6cfe01c95d034167be5e Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 25 Jul 2024 15:47:35 -0400 Subject: [PATCH 18/93] Initial commit --- DESCRIPTION | 4 +- NAMESPACE | 2 +- R/helpers.R | 157 +++-- {R => inst/apps}/app.R | 0 inst/templates/rnaseq/DE/load_data.R | 146 ----- inst/templates/rnaseq/QC/QC_nf-core.Rmd | 585 ------------------ .../rnaseq/QC/params_qc_nf-core-example.R | 9 - inst/templates/rnaseq/de/params_de-example.R | 18 - inst/templates/rnaseq/de/run_markdown.R | 32 - inst/templates/rnaseq/qc/run_markdown.R | 13 - man/bcbio_set_project.Rd | 26 - man/bcbio_templates.Rd | 8 +- 12 files changed, 73 insertions(+), 927 deletions(-) rename {R => inst/apps}/app.R (100%) delete mode 100644 inst/templates/rnaseq/DE/load_data.R delete mode 100644 inst/templates/rnaseq/QC/QC_nf-core.Rmd delete mode 100644 inst/templates/rnaseq/QC/params_qc_nf-core-example.R delete mode 100644 inst/templates/rnaseq/de/params_de-example.R delete mode 100644 inst/templates/rnaseq/de/run_markdown.R delete mode 100644 inst/templates/rnaseq/qc/run_markdown.R delete mode 100644 man/bcbio_set_project.Rd diff --git a/DESCRIPTION b/DESCRIPTION index d556d1a..e7d76dc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,7 +20,7 @@ Imports: grDevices, R.utils, readr, - withr + withr, usethis, fs, jsonlite, @@ -35,4 +35,4 @@ VignetteBuilder: knitr URL: http://bcb.io/bcbioR/ Config/testthat/edition: 3 Depends: - R (>= 2.10) + R (>= 4.3.1) diff --git a/NAMESPACE b/NAMESPACE index e202aa8..00ac2cc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,13 +1,13 @@ # Generated by roxygen2: do not edit by hand export(bcbio_nfcore_check) -export(bcbio_set_project) export(bcbio_templates) export(cb_friendly_cols) export(cb_friendly_pal) export(list_cb_friendly_cols) export(scale_color_cb_friendly) export(scale_fill_cb_friendly) +export(use_bcbio_analysis) export(use_bcbio_projects) import(DESeq2) import(R.utils) diff --git a/R/helpers.R b/R/helpers.R index 7d13000..4d58de7 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -48,61 +48,22 @@ bcbio_nfcore_check <- function(file){ #' } #' @export bcbio_templates <- function(type="rnaseq", outpath){ + fs::dir_create(outpath) switch(type, rnaseq={ - - fpath <- system.file("rmarkdown/templates/rnaseq", "skeleton", package="bcbioR") #file.copy(fpath, outpath, recursive = T) - copyDirectory(fpath, outpath) + copy_templates(outpath, "nf-core/rnaseq") }, scrnaseq={ - - fpath <- system.file("rmarkdown/templates/singlecell", "skeleton", package="bcbioR") #file.copy(fpath, outpath, recursive = T) - copyDirectory(fpath, outpath) + copy_templates(outpath, "singlecell") }, { - stop('project type not recognize, please choose: ', 'rnaseq', 'scrnaseq') + stop('project type not recognize, please choose: ', 'rnaseq', 'singlecell') } ) } -#' Function to help with project name used for parent folder -#' -#' This function will ask for user input about: -#' * numeric code -#' * PI full name -#' * technology -#' * tissue -#' * organism -#' * project description -#' -#' It removes special character with `_`. The output is a guideline to -#' what the folder used can be. -#' -#' @returns A string list with hbc_code, and project folder name -#' @export -bcbio_set_project <- function() { - hbc_code <- readline("What is the hbc code (only numbers):\n") - hbc_code <- paste0("hbc", hbc_code) - pi <- readline("What is PI last name:\n") - technology <- readline("What is the technology:\n") - tissue <- readline("What is the tissue:\n") - org <- readline("What is the organism:\n") - project <- readline("What is the project name:\n") - #dropbox <- readline("What is the dropbox name:\n") - #github_org <- readline("What is the github organization:\n") - #hbc_$technology_of_$pilastname_$intervention_on_$tissue_in_$organism_$hbccode - project_full <- paste(technology, .fix(pi), .fix(project), tissue, org, hbc_code, sep="_") - #github <- c(github_org,project_full) - opts <- list(code=hbc_code, project=project_full) - #dropbox=file.path(dropbox,project_full), - #github=github) - print(opts) - return(opts) -} - - read_pipeline_info <- function(nfcore){ # pipeline_info/params_2024-05-28_12-28-51.json config <- fs::path_join(c(nfcore, "pipeline_info")) @@ -145,8 +106,10 @@ bcbio_params <-function(nfcore_path, pipeline, metadata, copy){ } -copy_files_in_folder<- function(origin, remote){ +copy_files_in_folder<- function(remote, origin){ to_copy <- fs::dir_ls(origin) + to_copy <- grep("org", to_copy, + value = TRUE, invert = TRUE) for (element in to_copy){ full_new_path <- fs::path_join(c(remote, fs::path_file(element))) @@ -162,24 +125,25 @@ copy_files_in_folder<- function(origin, remote){ } copy_templates <- function(path, pipeline){ - base = c("bcbioR", "templates") + base = c("bcbioR") if (pipeline=="nf-core/rnaseq"){ - parts = c(base, "rnaseq") - }else if(pipeline=="scrnaseq"){ - parts = c(base, "scrnaseq") + parts = c("templates/rnaseq") + }else if(pipeline=="singlecell"){ + parts = c("templates/singlecell") }else if(pipeline=="teaseq"){ - parts = c(base, "teaseq") + parts = c("templates/teaseq") }else if(pipeline=="cosmx"){ - parts = c(base, "cosmx") + parts = c("templates/cosmx") } - analysis_template <- fs::path_package(parts) - ls_files <- grep("org", list.files(analysis_template, full.names = TRUE), - value = TRUE, invert = TRUE) - copy_files_in_folder(path, ls_files) + analysis_template <- fs::path_package(base, parts) + ui_info("Getting templates from {ui_value(analysis_template)}") + # ls_files <- grep("org", list.files(analysis_template, full.names = TRUE), + # value = TRUE, invert = TRUE) + # ui_info("{ui_value(length(ls_files))} amount of files to copy") + copy_files_in_folder(path, analysis_template) } bcbio_render <- function(path, pipeline, data){ - copy_templates(fs::path_join(c(path, "reports"), pipeline)) if (pipeline=="nf-core/rnaseq"){ # analysis_template <- fs::path_package("bcbioR", "templates", "rnaseq", "qc") @@ -206,18 +170,23 @@ bcbio_render <- function(path, pipeline, data){ } #' @export -use_bcbio_analysis <- function(path, nfcore=NULL, copy=FALSE, metadata=NULL){ +use_bcbio_analysis <- function(path, pipeline, copy=FALSE, metadata=NULL){ if (copy){ # deploy files ui_info("Rmd templates will be copied but variables path won't be filled automatically.") - pipeline <- nfcore + if (!is.null(metadata)){ + meta_path <- fs::path_join(c(path, "meta", fs::path_file(metadata))) + if (!(fs::file_exists(metadata))) + ui_stop("{ui_value(metadata)} doesn't exist.") + fs::file_copy(metadata, meta_path) + } }else{ - if (!fs::dir_exists(nfcore)) + if (!fs::dir_exists(pipeline)) ui_stop("{ui_value(nfcore)} doesn't exist. point to nfcore path or turn on copy mode.") # guess analysis from pipeline file - information <- read_pipeline_info(nfcore) + information <- read_pipeline_info(pipeline) fs::dir_create(fs::path_join(c(path, "meta"))) meta_path <- fs::path_join(c(path, "meta", fs::path_file(information$metadata))) pipeline <- information$pipeline @@ -227,7 +196,7 @@ use_bcbio_analysis <- function(path, nfcore=NULL, copy=FALSE, metadata=NULL){ fs::file_copy(metadata, meta_path) }else{ if (!fs::file_exists(information$metadata)){ - ui_warn("{ui_value(metadata)} not found. We can only work with local filesytems. For now.") + ui_warn("{ui_value(metadata)} not found. We can only work with local filesytems right now.") ui_todo("Please, copy {ui_value(metadata)} to {ui_value(meta_path)}.") ui_warn("If this file isn't manually set up, the Rmd code will fail.") }else{ @@ -240,17 +209,22 @@ use_bcbio_analysis <- function(path, nfcore=NULL, copy=FALSE, metadata=NULL){ ui_todo("Please, copy nf-core output directory to {ui_value(path_final)}") } # set all files from analysis - data <- bcbio_params(nfcore, pipeline, metadata, copy=copy) - if (!copy) + copy_templates(fs::path_join(c(path, "reports")), pipeline) + if (!copy){ + data <- bcbio_params(nfcore, pipeline, metadata, copy=copy) bcbio_render(path, pipeline, data) + } + } #' @export #' @examples #' path <- withr::local_tempdir() -#' # use_bcbio_projects(path,nfcore="nf-core/rnaseq",copy=TRUE) -use_bcbio_projects <- function(path, nfcore=NULL, metadata=NULL, git=TRUE, gh=FALSE, org=NULL, copy=FALSE) { +#' use_bcbio_projects(path,pipeline="nf-core/rnaseq",copy=TRUE) +#' fs::dir_ls(path) +use_bcbio_projects <- function(path, pipeline=NULL, metadata=NULL, + git=TRUE, gh=FALSE, org=NULL, copy=TRUE) { ui_info("Creating project at {ui_value(path)}") if (!fs::dir_exists(path)) @@ -260,30 +234,34 @@ use_bcbio_projects <- function(path, nfcore=NULL, metadata=NULL, git=TRUE, gh=FA base_template <- fs::path_package("bcbioR", "templates", "base") copy_files_in_folder(base_template, path) - is_nfcore_ready <- FALSE - if (is.null(nfcore) && rlang::is_interactive()){ - is_nfcore_ready <- ui_yeah("Have you already run nf-core pipeline?", - n_yes=1, n_no =1) - if (is_nfcore_ready && rlang::is_interactive()){ - nfcore <- readline("? Enter path to nf-core output: ") - }else{ - ui_warn("Please, turn copy = TRUE to only deploy files or,") - ui_stop("Please use {ui_code('use_bcbio_projects')} again when you have the nf-core output.") - } - use_bcbio_analysis(path, nfcore, copy, metadata) - }else{ - if (fs::dir_exists(nfcore)){ - ui_info("Checking {ui_value(nfcore)} as nf-core output directory") - use_bcbio_analysis(path, nfcore, copy, metadata) - }else if (copy){ - # deploy only files - ui_info("Deploying only templates without pipeline information.") - use_bcbio_analysis(path, nfcore, copy = TRUE, metadata=metadata) - }else{ - ui_warn("Please, provide nfcore working directory or") - ui_warn("turn copy = TRUE to only deploy files.") - } + if (!is.null(pipeline)){ + ui_info("Using this pipeline templates {ui_value(pipeline)}") + use_bcbio_analysis(path, pipeline, copy = TRUE, metadata=metadata) } + # is_nfcore_ready <- FALSE + # if (is.null(pipeline) && rlang::is_interactive()){ + # is_nfcore_ready <- ui_yeah("Have you already run nf-core pipeline?", + # n_yes=1, n_no =1) + # if (is_nfcore_ready && rlang::is_interactive()){ + # nfcore <- readline("? Enter path to nf-core output: ") + # }else{ + # ui_warn("Please, turn copy = TRUE to only deploy files or,") + # ui_stop("Please use {ui_code('use_bcbio_projects')} again when you have the nf-core output.") + # } + # use_bcbio_analysis(path, nfcore, copy, metadata) + # }else{ + # if (fs::dir_exists(nfcore)){ + # ui_info("Checking {ui_value(nfcore)} as nf-core output directory") + # use_bcbio_analysis(path, nfcore, copy, metadata) + # }else if (copy){ + # # deploy only files + # ui_info("Deploying only templates without pipeline information.") + # use_bcbio_analysis(path, nfcore, copy = TRUE, metadata=metadata) + # }else{ + # ui_warn("Please, provide nfcore working directory or") + # ui_warn("turn copy = TRUE to only deploy files.") + # } + # } if (git){ ui_info("Create Git local repo at {ui_value(path)}") @@ -299,7 +277,10 @@ use_bcbio_projects <- function(path, nfcore=NULL, metadata=NULL, git=TRUE, gh=FA "_" = "Call {.run usethis::gh_token_help()} for help configuring a token." )) } - use_github(organisation=org) + use_github(organisation=org, private = TRUE) + }else{ + ui_info("You decided not to create a repo, please use this to push when ready") + ui_info("use_github(organisation=org), private = TRUE") } answer <- FALSE diff --git a/R/app.R b/inst/apps/app.R similarity index 100% rename from R/app.R rename to inst/apps/app.R diff --git a/inst/templates/rnaseq/DE/load_data.R b/inst/templates/rnaseq/DE/load_data.R deleted file mode 100644 index 8a1d297..0000000 --- a/inst/templates/rnaseq/DE/load_data.R +++ /dev/null @@ -1,146 +0,0 @@ -library(tidyverse) -library(SummarizedExperiment) -library(janitor) -load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){ - - # bcbio input - if (!is.na(se_object)){ - - se <- readRDS(se_object) - metrics <- metadata(se)$metrics %>% as.data.frame() - # left_join(coldata %>% rownames_to_column('sample')) %>% column_to_rownames('sample') - } else { #nf-core input - - # Get metrics from nf-core into bcbio like table - # many metrics are already in the Genereal Table of MultiQC, this reads the file - metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) - - # we get some more metrics from Qualimap and rename columns - metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) - metrics <- metrics %>% full_join(metrics_qualimap) - metrics <- metrics %>% - clean_names() %>% - dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) - - # This uses the fastqc metrics to get total reads - total_reads <- metrics %>% - dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>% - remove_empty(which = 'cols') %>% - dplyr::rename(single_sample = sample) %>% - mutate(sample = gsub('_[12]+$', '', single_sample)) %>% - group_by(sample) %>% - summarize(total_reads = sum(fastqc_raw_total_sequences)) - - # This renames to user-friendly names the metrics columns - metrics <- metrics %>% - dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% - remove_empty(which = 'cols') %>% - full_join(total_reads) %>% - mutate(mapped_reads = samtools_reads_mapped) %>% - mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% - mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% - mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% - mutate(x5_3_bias = qualimap_5_3_bias) - - # Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case - gtf <- NULL - if (genome =="other"){ - gtf <- gtf_fn - }else{ - if (genome == "hg38") { - gtf <- "hg38.rna.gtf.gz" - } else if (genome == "mm10") { - gtf <- "mm10.rna.gtf.gz" - } else if (genome == "mm39") { - gtf <- "mm39.rna.gtf.gz" - } - gtf <- system.file("extdata", "annotation", - gtf, - package="bcbioR") - } - if (is.null(gtf)) { - print("No genome provided! Please add it at the top of this Rmd") - } - - gtf=rtracklayer::import(gtf) - - - one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) - another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) - biotype=NULL - if(length(one)==1){ - biotype=one - }else if(length(another)==1){ - biotype=another - }else{ - warning("No gene biotype founded") - } - - if (!is.null(biotype)){ - annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] - rRNA=grepl("rRNA|tRNA",annotation[[biotype]]) - genes=intersect(annotation[rRNA,"gene_id"],row.names(counts)) - ratio=data.frame(sample=colnames(counts), - r_and_t_rna_rate=colSums(counts[genes,])/colSums(counts)) - metrics = left_join(metrics, ratio, by="sample") - }else{ - metrics[["r_and_t_rna_rate"]] <- NA - } - - # if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){ - # metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna) - # }else{ - # metrics[["r_rna_rate"]] <- NA - # } - metrics=metrics[,c("sample","mapped_reads","exonic_rate","intronic_rate", - "total_reads", - "x5_3_bias", "r_and_t_rna_rate","intergenic_rate")] - } - metrics$sample <- make.names(metrics$sample) - rownames(metrics) <- metrics$sample - return(metrics) -} - -load_coldata <- function(coldata_fn, column, numerator, denominator, subset_column = NULL, subset_value = NULL){ - coldata=read.csv(coldata_fn) %>% - dplyr::select(!matches("fastq") & !matches("strandness")) %>% - distinct() - if('description' %in% names(coldata)){ - coldata$sample <- tolower(coldata$description) - } - coldata <- coldata %>% distinct(sample, .keep_all = T) - stopifnot(column %in% names(coldata)) - - # use only some samples, by default use all - if (!is.null(subset_column)){ - coldata <- coldata[coldata[[paste(subset_column)]] == subset_value, ] - } - #coldata <- coldata[coldata[[paste(column)]] %in% c(numerator, denominator), ] - #browser() - coldata$sample <- make.names(coldata$sample) - rownames(coldata) <- coldata$sample - coldata$description <- coldata$sample - - coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) - - return(coldata) -} - -load_counts <- function(counts_fn){ - - # bcbio input - if(grepl('csv', counts_fn)){ - counts <- read_csv(counts_fn) %>% - mutate(gene = str_replace(gene, pattern = "\\.[0-9]+$", "")) %>% - column_to_rownames('gene') - colnames(counts) = tolower(colnames(counts)) - return(counts) - } else { # nf-core input - counts <- read_tsv(counts_fn) %>% dplyr::select(-gene_name) %>% - mutate(gene_id = str_replace(gene_id, pattern = "\\.[0-9]+$", "")) %>% - column_to_rownames('gene_id') %>% round - - return(counts) - } - -} diff --git a/inst/templates/rnaseq/QC/QC_nf-core.Rmd b/inst/templates/rnaseq/QC/QC_nf-core.Rmd deleted file mode 100644 index dee4712..0000000 --- a/inst/templates/rnaseq/QC/QC_nf-core.Rmd +++ /dev/null @@ -1,585 +0,0 @@ ---- -title: "Quality Control" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - # params_file: params_qc_nf-core-example.R # example data - # Fill this file with the right paths to nfcore output - params_file: params_qc_nf-core.R - # Put hg38, mm10, mm39, or other - genome: hg38 - project_file: ../information.R - factor_of_interest: sample_type ---- - -```{r} -# This set up the working directory to this file so all files can be found -library(rstudioapi) -setwd(fs::path_dir(getSourceEditorContext()$path)) -``` - - -```{r source_params, echo = F} -# 1. set up factor_of_interest parameter from parameter above or manually -# this is used to color plots, it needs to be part of the metadata -factor_of_interest=params$factor_of_interest -genome=params$genome -# 2. Set input files in this file -source(params$params_file) -# 3. If you set up this file, project information will be printed below and -#. it can be reused for other Rmd files. -source(params$project_file) -``` - -# Overview - -- Project: `r project` -- PI: `r PI` -- Analyst: `r analyst` -- Experiment: `r experiment` - - -```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} -library(tidyverse) -library(knitr) -library(rtracklayer) -library(DESeq2) -library(DEGreport) -library(ggrepel) -# library(RColorBrewer) -library(DT) -library(pheatmap) -library(bcbioR) -library(janitor) -ggplot2::theme_set(theme_light(base_size = 14)) -opts_chunk[["set"]]( - cache = FALSE, - cache.lazy = FALSE, - dev = c("png", "pdf"), - error = TRUE, - highlight = TRUE, - message = FALSE, - prompt = FALSE, - tidy = FALSE, - warning = FALSE, - fig.height = 4) -``` - - -```{r subchunkify, echo=FALSE, eval=FALSE} -#' Create sub-chunks for plots -#' -#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots -#' -#' @param pl a plot object -#' @param fig.height figure height -#' @param fig.width figure width -#' @param chunk_name name of the chunk -#' -#' @author Andreas Scharmueller \email{andschar@@protonmail.com} -#' -subchunkify = function(pl, - fig.height = 7, - fig.width = 5, - chunk_name = 'plot') { - pl_deparsed = paste0(deparse(function() { - pl - }), collapse = '') - - sub_chunk = paste0( - "```{r ", - chunk_name, - ", fig.height=", - fig.height, - ", fig.width=", - fig.width, - ", dpi=72", - ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}", - "\n(", - pl_deparsed, - ")()", - "\n```" - ) - - cat(knitr::knit( - text = knitr::knit_expand(text = sub_chunk), - quiet = TRUE - )) -} - -``` - - -```{r sanitize-datatable} -sanitize_datatable = function(df, ...) { - # remove dashes which cause wrapping - DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), - colnames=gsub("-", "_", colnames(df))) -} -``` - - -# Samples and metadata - - -```{r load_metadata} - -meta_df=read_csv(metadata_fn) %>% - arrange(.data[[factor_of_interest]]) %>% - distinct(sample, .keep_all = T) %>% - dplyr::select(!matches("fastq"), !matches("strandness")) -meta_df$sample <- make.names(meta_df$sample) -order <- meta_df$sample - -ggplot(meta_df, aes(.data[[factor_of_interest]], - fill = .data[[factor_of_interest]])) + - geom_bar() + ylab("") + xlab("") + ylab("# of samples") + - scale_fill_cb_friendly() + theme(axis.text.x=element_text(angle = 90, vjust = 0.5), legend.position = "none") -``` - - -```{r load_data} -# read counts from SE object -se <- readRDS(se_object) -raw_counts <- assays(se)[["counts"]] %>% round() %>% - as.matrix() -raw_counts=raw_counts[rowSums(raw_counts)!=0,] -``` - -```{r prepare_metrics} -# Get metrics from nf-core into bcbio like table -# many metrics are already in the General Table of MultiQC, this reads the file -metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) - -# we get some more metrics from Qualimap and rename columns -metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) -metrics <- metrics %>% full_join(metrics_qualimap) -metrics <- metrics %>% - clean_names() %>% - dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) - -# This uses the fastqc metrics to get total reads -total_reads <- metrics %>% - dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>% - remove_empty(which = 'cols') %>% - dplyr::rename(single_sample = sample) %>% - mutate(sample = gsub('_[12]+$', '', single_sample)) %>% - group_by(sample) %>% - summarize(total_reads = sum(fastqc_raw_total_sequences)) - -# This renames to user-friendly names the metrics columns -metrics <- metrics %>% - dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% - remove_empty(which = 'cols') %>% - full_join(total_reads) %>% - mutate(mapped_reads = samtools_reads_mapped) %>% - mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% - mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% - mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% - mutate(x5_3_bias = qualimap_5_3_bias) - -# Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case -gtf <- NULL -if (genome =="other"){ - gtf <- gtf_fn -}else{ - if (genome == "hg38") { - gtf <- "hg38.rna.gtf.gz" - } else if (genome == "mm10") { - gtf <- "mm10.rna.gtf.gz" - } else if (genome == "mm39") { - gtf <- "mm39.rna.gtf.gz" - } - gtf <- system.file("extdata", "annotation", - gtf, - package="bcbioR") -} -if (is.null(gtf)) { - print("No genome provided! Please add it at the top of this Rmd") -} - -gtf=rtracklayer::import(gtf) - -one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) -another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) -biotype=NULL -if(length(one)==1){ - biotype=one -}else if(length(another)==1){ - biotype=another -}else{ - warning("No gene biotype founded") -} - -if (!is.null(biotype)){ - annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] - rRNA=grepl("rRNA|tRNA",annotation[[biotype]]) - genes=intersect(annotation[rRNA,"gene_id"],row.names(raw_counts)) - ratio=data.frame(sample=colnames(raw_counts), - r_and_t_rna_rate=colSums(raw_counts[genes,])/colSums(raw_counts)) - metrics = left_join(metrics, ratio, by="sample") -}else{ - metrics[["r_and_t_rna_rate"]] <- NA -} - -# if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){ -# metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna) -# }else{ -# metrics[["r_rna_rate"]] <- NA -# } -metrics=metrics[,c("sample","mapped_reads","exonic_rate","intronic_rate", - "total_reads", - "x5_3_bias", "r_and_t_rna_rate","intergenic_rate")] -metrics$sample <- make.names(metrics$sample) -metrics <- metrics %>% - full_join(meta_df , by = c("sample" = "sample")) %>% - dplyr::select(where(~!all(is.na(.)))) - -``` - -```{r show_metadata} -meta_sm <- meta_df %>% - as.data.frame() %>% - column_to_rownames("sample") - -meta_sm %>% sanitize_datatable() - -``` - -# Read metrics {.tabset} - -## Total reads - -Here, we want to see consistency and a minimum of 20 million reads (the grey line). - -```{r plot_total_reads} -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = total_reads, - fill = .data[[factor_of_interest]])) + - geom_bar(stat = "identity") + - coord_flip() + - scale_y_continuous(name = "million reads") + - scale_x_discrete(limits = rev) + - scale_fill_cb_friendly() + xlab("") + - ggtitle("Total reads") + - geom_hline(yintercept=20000000, color = "grey", linewidth=2) - -metrics %>% - ggplot(aes(x = .data[[factor_of_interest]], - y = total_reads, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - coord_flip() + - scale_y_continuous(name = "million reads") + - scale_color_cb_friendly() + xlab("") + - ggtitle("Total reads") -``` - -```{r calc_min_max_pct_mapped} -#get min percent mapped reads for reference -min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1) -max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) -``` - -## Mapping rate - -The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping (the grey line). These samples have mapping rates: `r min_pct_mapped` - `r max_pct_mapped`%. - -```{r plot_mapping_rate} -metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1) -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = mapped_reads_pct, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - coord_flip() + - scale_x_discrete(limits = rev) + - scale_color_cb_friendly() + - ylim(0, 100) + - ggtitle("Mapping rate") + xlab("") + - geom_hline(yintercept=70, color = "grey", linewidth=2) -``` - - -## Number of genes detected - -The number of genes represented in every sample is expected to be consistent and over 20K (grey line). - -```{r calc_genes_detected} -genes_detected <- colSums(assays(se)[["counts"]] > 0) %>% enframe() -sample_names <- metrics[,c("sample"), drop=F] -genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample")) -genes_detected <- genes_detected %>% group_by(name) -genes_detected <- summarise(genes_detected, - n_genes = max(value)) - -metrics <- metrics %>% - left_join(genes_detected, by = c("sample" = "name")) - -``` - - -```{r plot_genes_detected} -ggplot(metrics,aes(x = factor(sample, level = order), - y = n_genes, fill = .data[[factor_of_interest]])) + - geom_bar(stat = "identity") + - coord_flip() + - scale_fill_cb_friendly() + - scale_x_discrete(limits = rev) + - ggtitle("Number of genes") + - ylab("Number of genes") + - xlab("") + - geom_hline(yintercept=20000, color = "grey", linewidth=2) - -metrics %>% - ggplot(aes(x = .data[[factor_of_interest]], - y = n_genes, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - coord_flip() + - scale_x_discrete(limits = rev) + - scale_y_continuous(name = "million reads") + - scale_color_cb_friendly() + xlab("") + - ggtitle("Number of Genes") - -``` - - - -## Gene detection saturation - -This plot shows how complex the samples are. We expect samples with more reads to detect more genes. - -```{r plot_gene_saturation} -metrics %>% - ggplot(aes(x = total_reads, - y = n_genes, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - scale_x_log10() + - scale_color_cb_friendly() + - ggtitle("Gene saturation") + - ylab("Number of genes") -``` - -## Exonic mapping rate - -Here we are looking for consistency, and exonic mapping rates around or above 70% (grey line). - -```{r plot_exonic_mapping_rate} -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = exonic_rate * 100, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - ylab("Exonic rate %") + - ggtitle("Exonic mapping rate") + - scale_x_discrete(limits = rev) + - scale_color_cb_friendly() + - coord_flip() + - xlab("") + - ylim(c(0,100)) + - geom_hline(yintercept=70, color = "grey", linewidth=2) -``` - -## Intronic mapping rate - -Here, we expect a low intronic mapping rate (≤ 15% - 20%). The grey line indicates 20%. - -```{r plot_intronic_mapping_rate} -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = intronic_rate * 100, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - ylab("Intronic rate %") + - ggtitle("Intronic mapping rate") + - scale_x_discrete(limits = rev) + - scale_color_cb_friendly() + - coord_flip() + - xlab("") + - ylim(c(0,100)) + - geom_hline(yintercept=20, color = "grey", linewidth=2) -``` - -## Intergenic mapping rate - -Here, we expect a low intergenic mapping rate, which is true for all samples. The grey line indicates 15% - -```{r plot_intergenic_mapping_rate} -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = intergenic_rate * 100, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - ylab("Intergenic rate %") + - ggtitle("Intergenic mapping rate") + - coord_flip() + xlab("") + - scale_x_discrete(limits = rev) + - scale_color_cb_friendly() + - ylim(c(0, 100)) + - geom_hline(yintercept=15, color = "grey", linewidth=2) -``` - -## tRNA/rRNA mapping rate - -Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10% (the grey line). - -```{r plot_rrna_mapping_rate} - -rrna_ylim <- max(round(metrics$r_and_t_rna_rate*100, 2)) + 10 -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = r_and_t_rna_rate * 100, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - ylab("tRNA/rRNA rate, %")+ - ylim(0, rrna_ylim) + - ggtitle("tRNA/rRNA mapping rate") + - coord_flip() + - scale_x_discrete(limits = rev) + - scale_color_cb_friendly() + - ylim(c(0, 100)) + xlab("") + - geom_hline(yintercept=10, color = "grey", linewidth=2) -``` - -## 5'->3' bias - -There should be little bias, i.e. the values should be close to 1, or at least consistent among samples - -```{r plot_53_bias} -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = x5_3_bias, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - ggtitle("5'-3' bias") + - coord_flip() + - scale_x_discrete(limits = rev) + - ylim(c(0.5,1.5)) + xlab("") + ylab("5'-3' bias") + - scale_color_cb_friendly()+ - geom_hline(yintercept=1, color = "grey", linewidth=2) -``` - -## Counts per gene - all genes - -We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar - -```{r plot_counts_per_gene} -metrics_small <- metrics %>% dplyr::select(sample, .data[[factor_of_interest]]) -metrics_small <- left_join(sample_names, metrics_small) - -counts <- - assays(se)[["counts"]] %>% - as_tibble() %>% - filter(rowSums(.)!=0) %>% - gather(name, counts) - -counts <- left_join(counts, metrics_small, by = c("name" = "sample")) - -ggplot(counts, aes(factor(name, level = order), - log2(counts+1), - fill = .data[[factor_of_interest]])) + - geom_boxplot() + - scale_x_discrete(limits = rev) + - scale_fill_cb_friendly() + - coord_flip() + xlab("") + - ggtitle("Counts per gene, all non-zero genes") + - scale_color_cb_friendly() -``` - - -# Sample similarity analysis - -In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA). - -## Principal component analysis (PCA) - -Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells). - - -```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} - -vst <- vst(raw_counts) - -coldat_for_pca <- as.data.frame(metrics) -rownames(coldat_for_pca) <- coldat_for_pca$sample -coldat_for_pca <- coldat_for_pca[colnames(raw_counts),] -pca1 <- degPCA(vst, coldat_for_pca, - condition = factor_of_interest, data = T)[["plot"]] -pca2 <- degPCA(vst, coldat_for_pca, - condition = factor_of_interest, data = T, pc1="PC3", pc2="PC4")[["plot"]] - - - -pca1 + scale_color_cb_friendly() -pca2 + scale_color_cb_friendly() - -``` - -# Covariates analysis - -When there are multiple factors that can influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. This method adapts the method described by Daily et al. for which they integrated a method to correlate covariates with principal components values to determine the importance of each factor. - -```{r covariate-plot,fig.height=12, fig.width=10} -## Remove non-useful columns output by nf-core -coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness", "qualimap_5_3_bias"))]) - -# Remove missing data -coldat_2 <- na.omit(coldat_2) -degCovariates(vst, metadata = coldat_2) -``` - -## Hierarchical clustering - -Inter-correlation analysis (ICA) is another way to look at how well samples -cluster by plotting the correlation between the expression profiles of the -samples. - -```{r clustering fig, fig.width = 10, fig.asp = .62} - -vst_cor <- cor(vst) - -colma=meta_df %>% as.data.frame() -rownames(colma) <- colma$sample -colma <- colma[rownames(vst_cor), ] -colma <- colma %>% dplyr::select(.data[[factor_of_interest]]) -anno_colors=lapply(colnames(colma), function(c){ - l.col=cb_friendly_pal('grey')(length(unique(colma[[c]]))) - names(l.col)=unique(colma[[c]]) - l.col -}) -names(anno_colors)=colnames(colma) - -p <- pheatmap(vst_cor, - annotation = colma, - annotation_colors = anno_colors, - show_rownames = T, - show_colnames = T, - color = cb_friendly_pal('heatmap')(15) - ) -p -``` - -# R session - -List and version of tools used for the QC report generation. - -```{r} -sessionInfo() -``` diff --git a/inst/templates/rnaseq/QC/params_qc_nf-core-example.R b/inst/templates/rnaseq/QC/params_qc_nf-core-example.R deleted file mode 100644 index dae62ce..0000000 --- a/inst/templates/rnaseq/QC/params_qc_nf-core-example.R +++ /dev/null @@ -1,9 +0,0 @@ -# info params - -# Example data: COMMENT THESE LINE IF YOU ARE USING YOUR DATA -metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' -se_object=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.rds') -# This folder is in the output directory inside multiqc folder -multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' -# This file is inside the genome folder in the output directory -gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' diff --git a/inst/templates/rnaseq/de/params_de-example.R b/inst/templates/rnaseq/de/params_de-example.R deleted file mode 100644 index cc75ad2..0000000 --- a/inst/templates/rnaseq/de/params_de-example.R +++ /dev/null @@ -1,18 +0,0 @@ -# project params -date = "YYYYMMDD" -basedir <- './' # where to write down output files - -# params for bcbio -# coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" -# counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' -# se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") -# - -# Example data -coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' -counts_fn=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv') -# This folder is in the output directory inside multiqc folder -multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' -# This file is inside the genome folder in the output directory -gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' -se_object = NA diff --git a/inst/templates/rnaseq/de/run_markdown.R b/inst/templates/rnaseq/de/run_markdown.R deleted file mode 100644 index 79e15a0..0000000 --- a/inst/templates/rnaseq/de/run_markdown.R +++ /dev/null @@ -1,32 +0,0 @@ -library(rmarkdown) -# set working directory to this file before using the function - - -# set directory to this file folder -setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -# example running with test data -render_de <- function(column, numerator, denominator, subset_value = NULL, - params_file = 'params_de-testdata.R'){ - - rmarkdown::render(input = "DEG.Rmd", - output_dir = ".", - output_format = "html_document", - output_file = ifelse(!is.null(subset_value), - paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), - paste0('DE_', numerator, '_vs_', denominator, '.html') - ), - clean = TRUE, - envir = new.env(), - params = list( - column = column, - subset_value = subset_value, - numerator = numerator, - denominator = denominator, - params_file = params_file, - project_file = '../information.R', - functions_file = 'load_data.R' - ) - ) -} -#Example data -render_de("sample_type","tumor", "normal") diff --git a/inst/templates/rnaseq/qc/run_markdown.R b/inst/templates/rnaseq/qc/run_markdown.R deleted file mode 100644 index 51acbef..0000000 --- a/inst/templates/rnaseq/qc/run_markdown.R +++ /dev/null @@ -1,13 +0,0 @@ -library(rmarkdown) - -# set directory to this file folder -setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -# example running with test data -rmarkdown::render("QC_nf-core.Rmd", - output_dir = ".", - clean = TRUE, - output_format = "html_document", - params = list( - params_file = 'params_qc_nf-core-testdata.R', - project_file = '../information.R') - ) diff --git a/man/bcbio_set_project.Rd b/man/bcbio_set_project.Rd deleted file mode 100644 index 4e686bf..0000000 --- a/man/bcbio_set_project.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpers.R -\name{bcbio_set_project} -\alias{bcbio_set_project} -\title{Function to help with project name used for parent folder} -\usage{ -bcbio_set_project() -} -\value{ -A string list with hbc_code, and project folder name -} -\description{ -This function will ask for user input about: -\itemize{ -\item numeric code -\item PI full name -\item technology -\item tissue -\item organism -\item project description -} -} -\details{ -It removes special character with \verb{_}. The output is a guideline to -what the folder used can be. -} diff --git a/man/bcbio_templates.Rd b/man/bcbio_templates.Rd index c47ab63..217ea0c 100644 --- a/man/bcbio_templates.Rd +++ b/man/bcbio_templates.Rd @@ -7,13 +7,7 @@ bcbio_templates(type = "rnaseq", outpath) } \arguments{ -\item{type}{string indicating the type of analysis, supported: -\itemize{ -\item base -\item rnaseq, scrnaseq, -\item teaseq -\item cosmx -}} +\item{type}{string indicating the type of analysis, supported: rnaseq.} \item{outpath}{string path indicating where to copy all the files to} } From 304c57d35ac6cc0ea3243c5b39dfa2632d69720e Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 25 Jul 2024 16:07:21 -0400 Subject: [PATCH 19/93] update readme and get working projects --- R/helpers.R | 60 ++++++++++++++++++----------------- inst/templates/base/README.md | 17 ++++++++-- 2 files changed, 45 insertions(+), 32 deletions(-) diff --git a/R/helpers.R b/R/helpers.R index 4d58de7..799ae39 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -106,7 +106,7 @@ bcbio_params <-function(nfcore_path, pipeline, metadata, copy){ } -copy_files_in_folder<- function(remote, origin){ +copy_files_in_folder<- function(origin, remote){ to_copy <- fs::dir_ls(origin) to_copy <- grep("org", to_copy, value = TRUE, invert = TRUE) @@ -140,7 +140,7 @@ copy_templates <- function(path, pipeline){ # ls_files <- grep("org", list.files(analysis_template, full.names = TRUE), # value = TRUE, invert = TRUE) # ui_info("{ui_value(length(ls_files))} amount of files to copy") - copy_files_in_folder(path, analysis_template) + copy_files_in_folder(analysis_template, path) } bcbio_render <- function(path, pipeline, data){ @@ -170,7 +170,7 @@ bcbio_render <- function(path, pipeline, data){ } #' @export -use_bcbio_analysis <- function(path, pipeline, copy=FALSE, metadata=NULL){ +use_bcbio_analysis <- function(path, pipeline, copy=TRUE, metadata=NULL){ if (copy){ # deploy files @@ -181,10 +181,10 @@ use_bcbio_analysis <- function(path, pipeline, copy=FALSE, metadata=NULL){ ui_stop("{ui_value(metadata)} doesn't exist.") fs::file_copy(metadata, meta_path) } - }else{ - if (!fs::dir_exists(pipeline)) - ui_stop("{ui_value(nfcore)} doesn't exist. point to nfcore path or turn on copy mode.") - + } + if (!is.null(pipeline) & fs::dir_exists(pipeline)){ + # ui_stop("{ui_value(nfcore)} doesn't exist. point to nfcore path or turn on copy mode.") + ui_info("Trying to guess nf-core pipeline at {ui_value(pipeline)}") # guess analysis from pipeline file information <- read_pipeline_info(pipeline) fs::dir_create(fs::path_join(c(path, "meta"))) @@ -210,8 +210,8 @@ use_bcbio_analysis <- function(path, pipeline, copy=FALSE, metadata=NULL){ } # set all files from analysis copy_templates(fs::path_join(c(path, "reports")), pipeline) - if (!copy){ - data <- bcbio_params(nfcore, pipeline, metadata, copy=copy) + if (fs::dir_exists(pipeline)){ + data <- bcbio_params(nfcore, pipeline, metadata) bcbio_render(path, pipeline, data) } @@ -236,7 +236,7 @@ use_bcbio_projects <- function(path, pipeline=NULL, metadata=NULL, if (!is.null(pipeline)){ ui_info("Using this pipeline templates {ui_value(pipeline)}") - use_bcbio_analysis(path, pipeline, copy = TRUE, metadata=metadata) + use_bcbio_analysis(path, pipeline, copy = copy, metadata=metadata) } # is_nfcore_ready <- FALSE # if (is.null(pipeline) && rlang::is_interactive()){ @@ -263,25 +263,27 @@ use_bcbio_projects <- function(path, pipeline=NULL, metadata=NULL, # } # } - if (git){ - ui_info("Create Git local repo at {ui_value(path)}") - use_git() - } - if (gh){ - ui_info("Create GitHub repo at {ui_value(path)}") - whoami <- suppressMessages(gh::gh_whoami()) - if (is.null(whoami)) { - ui_stop(c( - "x" = "Unable to discover a GitHub personal access token.", - "i" = "A token is required in order to create and push to a new repo.", - "_" = "Call {.run usethis::gh_token_help()} for help configuring a token." - )) - } - use_github(organisation=org, private = TRUE) - }else{ - ui_info("You decided not to create a repo, please use this to push when ready") - ui_info("use_github(organisation=org), private = TRUE") - } + # if (git){ + # ui_info("Create Git local repo at {ui_value(path)}") + # use_git() + # } + # if (gh){ + # ui_info("Create GitHub repo at {ui_value(path)}") + # whoami <- suppressMessages(gh::gh_whoami()) + # if (is.null(whoami)) { + # ui_warn(c( + # "x" = "Unable to discover a GitHub personal access token.", + # "i" = "A token is required in order to create and push to a new repo.", + # "_" = "Call {.run usethis::gh_token_help()} for help configuring a token." + # )) + # ui_todo("Try this later: use_github(organisation=org), private = TRUE") + # + # } + # use_github(organisation=org, private = TRUE) + # }else{ + # ui_info("You decided not to create a repo, please use this to push when ready") + # ui_todo("Try this later: use_github(organisation=org), private = TRUE") + # } answer <- FALSE if (rlang::is_interactive()) diff --git a/inst/templates/base/README.md b/inst/templates/base/README.md index 6218884..d286e6a 100644 --- a/inst/templates/base/README.md +++ b/inst/templates/base/README.md @@ -1,13 +1,24 @@ # Guidelines +## Set Repository + +- Start a git repository: `usethis::use_git()` +- Push this project to GitHub, follow these steps: + +* Only once every 30 days, set up your github credentials: `usethis::gh_token_help()` + * **NOTE** You may want to run this first (one time) to keep this token working in future sessions: `git config --global credential.helper store` + +- Push repository to HBC github as private: `usethis::use_github(org="hbc",private=TRUE)` + ## Set up work-space - [ ] Replace the title in this file to match the project's title - [ ] Modify `information.R` with the right text for this project, it can be used to source in other `Rmd` files. The main `Rmd` file in this directory can be used to show general information of the project if needed. - [ ] Use the same project name to create a folder in *Dropbox* and a repo in *GitHub* -- [ ] Use the function `bcbio_templates` to create templates inside `reports` for each type of analysis. For instance, for *RNAseq*: - - `bcbio_templates(type="rnaseq", outpath="reports")` or - - `bcbio_templates(type="rnaseq", outpath="reports/experiment1")` +- [ ] If you didn't provide the pipeline when creating this project: + Use the function `bcbio_templates` to create templates inside `reports` for each type of analysis. For instance, for *RNAseq*: + - `use_bcbio_analysis(".", 'nf-core/rnaseq', copy = TRUE)` or + - `use_bcbio_analysis(".", 'singlecell', copy = TRUE)` - Then go to that folder and read the `README.md` ## Folders From 9b83b1fc2befe4f243f44cf69c78d7ce50125d56 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 25 Jul 2024 16:10:37 -0400 Subject: [PATCH 20/93] remove rmarkdown --- .../templates/cellchat/template.yaml | 3 -- .../templates/common/skeleton/skeleton.Rmd | 28 ------------------- inst/rmarkdown/templates/common/template.yaml | 3 -- inst/rmarkdown/templates/cosmx/template.yml | 3 -- .../templates/rnaseq/skeleton/skeleton.Rmd | 25 ----------------- inst/rmarkdown/templates/rnaseq/template.yaml | 3 -- .../templates/singlecell/template.yml | 3 -- inst/rmarkdown/templates/teaseq/template.yml | 3 -- 8 files changed, 71 deletions(-) delete mode 100644 inst/rmarkdown/templates/cellchat/template.yaml delete mode 100644 inst/rmarkdown/templates/common/skeleton/skeleton.Rmd delete mode 100644 inst/rmarkdown/templates/common/template.yaml delete mode 100644 inst/rmarkdown/templates/cosmx/template.yml delete mode 100644 inst/rmarkdown/templates/rnaseq/skeleton/skeleton.Rmd delete mode 100644 inst/rmarkdown/templates/rnaseq/template.yaml delete mode 100644 inst/rmarkdown/templates/singlecell/template.yml delete mode 100644 inst/rmarkdown/templates/teaseq/template.yml diff --git a/inst/rmarkdown/templates/cellchat/template.yaml b/inst/rmarkdown/templates/cellchat/template.yaml deleted file mode 100644 index aa08347..0000000 --- a/inst/rmarkdown/templates/cellchat/template.yaml +++ /dev/null @@ -1,3 +0,0 @@ -name: bcbio CellChat -description: Standard CellChat analyses -create_dir: false diff --git a/inst/rmarkdown/templates/common/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/common/skeleton/skeleton.Rmd deleted file mode 100644 index 49c7ed5..0000000 --- a/inst/rmarkdown/templates/common/skeleton/skeleton.Rmd +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: "General Project Information" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - params_file: information.R ---- - -```{r echo = F} -source(params$params_file) -``` - - - diff --git a/inst/rmarkdown/templates/common/template.yaml b/inst/rmarkdown/templates/common/template.yaml deleted file mode 100644 index 328d8b0..0000000 --- a/inst/rmarkdown/templates/common/template.yaml +++ /dev/null @@ -1,3 +0,0 @@ -name: bcbio base -description: Standard NGS down-stream analyses -create_dir: false diff --git a/inst/rmarkdown/templates/cosmx/template.yml b/inst/rmarkdown/templates/cosmx/template.yml deleted file mode 100644 index 12712ff..0000000 --- a/inst/rmarkdown/templates/cosmx/template.yml +++ /dev/null @@ -1,3 +0,0 @@ -name: bcbio CosMx -description: Standard CoxMx down-stream analyses -create_dir: false diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/skeleton.Rmd deleted file mode 100644 index dc4bbf5..0000000 --- a/inst/rmarkdown/templates/rnaseq/skeleton/skeleton.Rmd +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: "General Project Information" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - params_file: information.R ---- - -```{r echo = F} -source(params$params_file) -``` diff --git a/inst/rmarkdown/templates/rnaseq/template.yaml b/inst/rmarkdown/templates/rnaseq/template.yaml deleted file mode 100644 index 17cd50f..0000000 --- a/inst/rmarkdown/templates/rnaseq/template.yaml +++ /dev/null @@ -1,3 +0,0 @@ -name: bcbio RNAseq -description: Standard RNAseq down-stream analyses -create_dir: false diff --git a/inst/rmarkdown/templates/singlecell/template.yml b/inst/rmarkdown/templates/singlecell/template.yml deleted file mode 100644 index f6b1119..0000000 --- a/inst/rmarkdown/templates/singlecell/template.yml +++ /dev/null @@ -1,3 +0,0 @@ -name: bcbio scRNAseq -description: Standard scRNAseq down-stream analyses -create_dir: false diff --git a/inst/rmarkdown/templates/teaseq/template.yml b/inst/rmarkdown/templates/teaseq/template.yml deleted file mode 100644 index 6838f13..0000000 --- a/inst/rmarkdown/templates/teaseq/template.yml +++ /dev/null @@ -1,3 +0,0 @@ -name: bcbio TEAseq -description: Standard TEAseq down-stream analyses -create_dir: false From 57f45453f2197beb13f305939e6258d576537f25 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 25 Jul 2024 16:34:02 -0400 Subject: [PATCH 21/93] reorg singlecell --- inst/templates/singlecell/README.md | 10 ++- ...g_steps.md => pre-process-w-cellranger.md} | 81 +------------------ .../singlecell/scripts/seurat_init.R | 73 +++++++++++++++++ inst/templates/singlecell/skeleton.Rmd | 25 ------ 4 files changed, 81 insertions(+), 108 deletions(-) rename inst/templates/singlecell/{starting_steps.md => pre-process-w-cellranger.md} (77%) create mode 100644 inst/templates/singlecell/scripts/seurat_init.R delete mode 100644 inst/templates/singlecell/skeleton.Rmd diff --git a/inst/templates/singlecell/README.md b/inst/templates/singlecell/README.md index 0a6a5b2..c698f79 100644 --- a/inst/templates/singlecell/README.md +++ b/inst/templates/singlecell/README.md @@ -1,6 +1,10 @@ -# Tipical steps for scRNAseq downstream analysis +# Project name -# DropBox +## Start with cell-ranger + +`pre-process-w-cellranger.md` contains step by step guidelines on how to run cellranger in O2 and load data into R. This `scripts/seurat_init.R` script contains all the pieces to go from cellranger output to Seurat obj. It is assuming a mouse genome. + +## DropBox - In `reports/QC` - [ ] copy QC `Rmd/R/html/figures` @@ -10,7 +14,7 @@ - In `reports/DE`, for *each analysis*: - TBD -# GitHub +## GitHub - [ ] Push all `*Rmd` `*R` files used for the *QC* and *DE* analysis respecting folder structure. diff --git a/inst/templates/singlecell/starting_steps.md b/inst/templates/singlecell/pre-process-w-cellranger.md similarity index 77% rename from inst/templates/singlecell/starting_steps.md rename to inst/templates/singlecell/pre-process-w-cellranger.md index 1e10850..05a57e4 100644 --- a/inst/templates/singlecell/starting_steps.md +++ b/inst/templates/singlecell/pre-process-w-cellranger.md @@ -1,3 +1,4 @@ +# Tipical steps for scRNAseq downstream analysis --- title: "From raw data to Seurat" --- @@ -272,83 +273,3 @@ seurat_merge[["RNA"]] <- JoinLayers(seurat_merge[["RNA"]]) saveRDS(seurat_merge, file = "seurat_pre-filtered.rds") write.csv(seurat_merge@meta.data, file = "metadata_pre-filtered.csv") ``` - -## Full script - -Below find all pieces to copy and paste. We are assuming a mouse genome. - -``` - -library(Seurat) -library(data.table) -library(hdf5r) - -### Set up run information -data_dir <- "/path/to/cellranger/output/folders/" - -samples <- c("sample1", "sample2", "sample3") - -### Make individual seurat objects for each sample - -for (i in 1:length(samples)){ - seurat_data <- Read10X_h5(paste(c(data_dir,samples[i],"/outs/raw_feature_bc_matrix.h5"),sep="",collapse = "")) - seurat_obj <- CreateSeuratObject(counts = seurat_data, - min.features = 100, ## only keep cells with at least 100 genes - project = samples[i]) - assign(paste0(samples[i], "_seurat"), - seurat_obj) # stores Seurat object in variable of corresponding sample name -} - -### Merge all seurat objects - -seurat_ID <- paste0(samples, "_seurat") # get names of all objects - - -u <- get(seurat_ID[2]) -for (i in 3:length(seurat_ID)) { - u <- c(u, get(seurat_ID[i])) -} ## makes a list of all seurat objects - -seurat_merge <- merge(x = get(seurat_ID[1]), - y = u, - add.cell.ids = all_samples, - project = "my_scRNA_project") - - -# Mitochondrial genes for mouse genome -idx <- grep("^mt-", rownames(GetAssay(seurat_merge, "RNA"))) -rownames(GetAssay(seurat_merge, "RNA"))[idx] -# Mitochondrial genes vs. nuclear genes ratio -seurat_merge$mitoRatio <- PercentageFeatureSet(object = seurat_merge, pattern = "^mt-") -seurat_merge$mitoRatio <- seurat_merge@meta.data$mitoRatio/100 # Divide by 100 for Ratio instead of Percentage - -# Number of genes per UMI for each cell -seurat_merge$Log10GenesPerUMI <- log10(seurat_merge$nFeature_RNA) / log10(seurat_merge$nCount_RNA) - -# Import experimental metadata -metaexp <- read.csv("/path/to/experimental/metadata/meta.csv") - -# Check matching of IDs -all(metaexp$sample %in% metadata$orig.ident) -all(metadata$orig.ident %in% metaexp$sample) - -#change headings to match -colnames(metaexp)[1] <- "orig.ident" - -metafull <- plyr::join(metadata, metaexp, - by = c("orig.ident")) - -# Replace seurat object metadata -if(all(metafull$barcode == rownames(seurat_merge@meta.data))) { - rownames(metafull) <- metafull$barcode - seurat_merge@meta.data <- metafull -} - - -## Join layers (each sample is a separate layer) -seurat_merge[["RNA"]] <- JoinLayers(seurat_merge[["RNA"]]) - -### Save Seurat object for future processing -saveRDS(seurat_merge, file = "seurat_pre-filtered.rds") -write.csv(seurat_merge@meta.data, file = "metadata_pre-filtered.csv") -``` \ No newline at end of file diff --git a/inst/templates/singlecell/scripts/seurat_init.R b/inst/templates/singlecell/scripts/seurat_init.R new file mode 100644 index 0000000..d345556 --- /dev/null +++ b/inst/templates/singlecell/scripts/seurat_init.R @@ -0,0 +1,73 @@ + +library(Seurat) +library(data.table) +library(hdf5r) + +### Set up run information +data_dir <- "/path/to/cellranger/output/folders/" + +samples <- c("sample1", "sample2", "sample3") + +### Make individual seurat objects for each sample + +for (i in 1:length(samples)){ + seurat_data <- Read10X_h5(paste(c(data_dir,samples[i],"/outs/raw_feature_bc_matrix.h5"),sep="",collapse = "")) + seurat_obj <- CreateSeuratObject(counts = seurat_data, + min.features = 100, ## only keep cells with at least 100 genes + project = samples[i]) + assign(paste0(samples[i], "_seurat"), + seurat_obj) # stores Seurat object in variable of corresponding sample name +} + +### Merge all seurat objects + +seurat_ID <- paste0(samples, "_seurat") # get names of all objects + + +u <- get(seurat_ID[2]) +for (i in 3:length(seurat_ID)) { + u <- c(u, get(seurat_ID[i])) +} ## makes a list of all seurat objects + +seurat_merge <- merge(x = get(seurat_ID[1]), + y = u, + add.cell.ids = all_samples, + project = "my_scRNA_project") + + +# Mitochondrial genes for mouse genome +idx <- grep("^mt-", rownames(GetAssay(seurat_merge, "RNA"))) +rownames(GetAssay(seurat_merge, "RNA"))[idx] +# Mitochondrial genes vs. nuclear genes ratio +seurat_merge$mitoRatio <- PercentageFeatureSet(object = seurat_merge, pattern = "^mt-") +seurat_merge$mitoRatio <- seurat_merge@meta.data$mitoRatio/100 # Divide by 100 for Ratio instead of Percentage + +# Number of genes per UMI for each cell +seurat_merge$Log10GenesPerUMI <- log10(seurat_merge$nFeature_RNA) / log10(seurat_merge$nCount_RNA) + +# Import experimental metadata +metaexp <- read.csv("/path/to/experimental/metadata/meta.csv") + +# Check matching of IDs +all(metaexp$sample %in% metadata$orig.ident) +all(metadata$orig.ident %in% metaexp$sample) + +#change headings to match +colnames(metaexp)[1] <- "orig.ident" + +metafull <- plyr::join(metadata, metaexp, + by = c("orig.ident")) + +# Replace seurat object metadata +if(all(metafull$barcode == rownames(seurat_merge@meta.data))) { + rownames(metafull) <- metafull$barcode + seurat_merge@meta.data <- metafull +} + + +## Join layers (each sample is a separate layer) +seurat_merge[["RNA"]] <- JoinLayers(seurat_merge[["RNA"]]) + +### Save Seurat object for future processing +saveRDS(seurat_merge, file = "seurat_pre-filtered.rds") +write.csv(seurat_merge@meta.data, file = "metadata_pre-filtered.csv") diff --git a/inst/templates/singlecell/skeleton.Rmd b/inst/templates/singlecell/skeleton.Rmd deleted file mode 100644 index dc4bbf5..0000000 --- a/inst/templates/singlecell/skeleton.Rmd +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: "General Project Information" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - params_file: information.R ---- - -```{r echo = F} -source(params$params_file) -``` From cb64952673b30013024ebe9fe0e72689e50de23b Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 25 Jul 2024 16:36:54 -0400 Subject: [PATCH 22/93] move cell chat --- inst/templates/singlecell/{ => CellToCell}/cellchat.Rmd | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename inst/templates/singlecell/{ => CellToCell}/cellchat.Rmd (100%) diff --git a/inst/templates/singlecell/cellchat.Rmd b/inst/templates/singlecell/CellToCell/cellchat.Rmd similarity index 100% rename from inst/templates/singlecell/cellchat.Rmd rename to inst/templates/singlecell/CellToCell/cellchat.Rmd From ef4f8050f71edd8effffb6491f8821d9d87a454a Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 25 Jul 2024 16:37:28 -0400 Subject: [PATCH 23/93] remove common information --- inst/templates/singlecell/README.md | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/inst/templates/singlecell/README.md b/inst/templates/singlecell/README.md index c698f79..b49763e 100644 --- a/inst/templates/singlecell/README.md +++ b/inst/templates/singlecell/README.md @@ -3,19 +3,3 @@ ## Start with cell-ranger `pre-process-w-cellranger.md` contains step by step guidelines on how to run cellranger in O2 and load data into R. This `scripts/seurat_init.R` script contains all the pieces to go from cellranger output to Seurat obj. It is assuming a mouse genome. - -## DropBox - -- In `reports/QC` - - [ ] copy QC `Rmd/R/html/figures` -- In `reports/Clusters` - - [ ] the analysis of `SCTransform`, ,`RunPCA` ,`FindNeighbors`, ,`FindClusters`, `RunUMAP` - - [ ] the analysis of `FindMarkers` and `Cell Identification` -- In `reports/DE`, for *each analysis*: - - TBD - -## GitHub - -- [ ] Push all `*Rmd` `*R` files used for the *QC* and *DE* analysis respecting folder structure. - -Please, ignore `*html/figures/csv` and any output of the code. From 19efbff24ee08674a30f34e01a1806d92bc35d50 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 25 Jul 2024 16:39:24 -0400 Subject: [PATCH 24/93] rename integration --- .../singlecell/Integration/{helpers.R => 01-integration.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename inst/templates/singlecell/Integration/{helpers.R => 01-integration.R} (100%) diff --git a/inst/templates/singlecell/Integration/helpers.R b/inst/templates/singlecell/Integration/01-integration.R similarity index 100% rename from inst/templates/singlecell/Integration/helpers.R rename to inst/templates/singlecell/Integration/01-integration.R From fe3827277f35463ecd2866a8fa7e4daeb8934db2 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 25 Jul 2024 16:46:12 -0400 Subject: [PATCH 25/93] remove base project information from analysis --- inst/templates/rnaseq/README.md | 5 ----- inst/templates/singlecell/README.md | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/inst/templates/rnaseq/README.md b/inst/templates/rnaseq/README.md index b04a5d5..6de2117 100644 --- a/inst/templates/rnaseq/README.md +++ b/inst/templates/rnaseq/README.md @@ -64,8 +64,3 @@ On the `YAML` header file of the `Rmd` you can specify some parameters or just s - [ ] Significant genes results file as described above, but additionally append columns containing normalized count values for each sample. - Make sure to append the gene symbols to these tables so the researcher can interpret the results. -## GitHub - -- [ ] Push all `*Rmd` `*R` files used for the *QC* and *DE* analysis respecting folder structure. - -Please, ignore `*html/figures/csv` and any output of the code. diff --git a/inst/templates/singlecell/README.md b/inst/templates/singlecell/README.md index b49763e..28d4347 100644 --- a/inst/templates/singlecell/README.md +++ b/inst/templates/singlecell/README.md @@ -1,5 +1,5 @@ # Project name -## Start with cell-ranger +# Start with cell-ranger `pre-process-w-cellranger.md` contains step by step guidelines on how to run cellranger in O2 and load data into R. This `scripts/seurat_init.R` script contains all the pieces to go from cellranger output to Seurat obj. It is assuming a mouse genome. From 6e5d6106f37721ebf6794b9050673c3e35b85e00 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Fri, 26 Jul 2024 12:33:52 -0400 Subject: [PATCH 26/93] get ready for testing --- .../templates/singlecell/CellToCell/cellchat.Rmd | 2 +- .../singlecell/{scRNA_qc_template.rmd => QC.rmd} | 0 inst/templates/singlecell/README.md | 16 ++++++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) rename inst/templates/singlecell/{scRNA_qc_template.rmd => QC.rmd} (100%) diff --git a/inst/templates/singlecell/CellToCell/cellchat.Rmd b/inst/templates/singlecell/CellToCell/cellchat.Rmd index d4cc6f4..b9da33e 100644 --- a/inst/templates/singlecell/CellToCell/cellchat.Rmd +++ b/inst/templates/singlecell/CellToCell/cellchat.Rmd @@ -437,4 +437,4 @@ gg1 <- netVisual_bubble(cellchat_merged, angle.x = 45, remove.isolate = T) gg1 -signaling.grade2_increased = gg1$data \ No newline at end of file +signaling.grade2_increased = gg1$data diff --git a/inst/templates/singlecell/scRNA_qc_template.rmd b/inst/templates/singlecell/QC.rmd similarity index 100% rename from inst/templates/singlecell/scRNA_qc_template.rmd rename to inst/templates/singlecell/QC.rmd diff --git a/inst/templates/singlecell/README.md b/inst/templates/singlecell/README.md index 28d4347..9680182 100644 --- a/inst/templates/singlecell/README.md +++ b/inst/templates/singlecell/README.md @@ -3,3 +3,19 @@ # Start with cell-ranger `pre-process-w-cellranger.md` contains step by step guidelines on how to run cellranger in O2 and load data into R. This `scripts/seurat_init.R` script contains all the pieces to go from cellranger output to Seurat obj. It is assuming a mouse genome. + +# QC + +Currently we are working on deploying a shiny app to inspect the single cell object and find the best cut-offs for filtering. The Rmd that helps to visualize the before and after is `QC.Rmd`. + +# Integration + +Currently we are working on guidelines and templates for this step. There is some draft under *Integration** folder. + +# Cell to cell communication + +CellChat template is at `CellToCell/cellchat.Rmd`. We have built a stable environment in O2 using the following modules: + +``` +# gcc/9.2.0 imageMagick/7.1.0 geos/3.10.2 cmake/3.22.2 R/4.3.1 fftw/3.3.10 gdal/3.1.4 udunits/2.2.28 boost/1.75.0 python/3.9.14 +``` From 5a8401ce598b9bbed99543fc7b9344db1fa14e2a Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Fri, 26 Jul 2024 12:37:43 -0400 Subject: [PATCH 27/93] update readme --- README.md | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 35161bb..649e1f6 100644 --- a/README.md +++ b/README.md @@ -20,15 +20,6 @@ devtools::install_github("bcbio/bcbioR",ref = "devel") ## Quick start -Use this code to generate a standard project name for all of your folders. **This code will not create any folders or files.** - -``` -library(bcbioR) -## basic example code -# will help you to build a folder name following HCBC naming rules -bcbio_set_project() -``` - ### Set base project use `setwd()` to set your current directory to the place where you want to work. The bcbioR functions will automatically write to whatever directory you have set. @@ -40,7 +31,9 @@ setwd("/path/to/analysis/folder") The following code will pop up a Rmd template will populate that folder with HCBC data structure guidelines ``` -bcbio_templates(type="base", outpath="/path/to/analysis/folder") +use_bcbio_projects(path,pipeline="nf-core/rnaseq") +use_bcbio_projects(path,pipeline="singlecell") + ``` ### Set RNAseq report folder From eb49e562f2100d5f0480546e6d5a5594326b652b Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 31 Jul 2024 17:55:01 -0400 Subject: [PATCH 28/93] move templates --- inst/templates/{ => multiomics}/teaseq/QC/QC-01-load_data.R | 0 inst/templates/{ => multiomics}/teaseq/QC/QC-02-run_analysis.R | 0 inst/templates/{ => multiomics}/teaseq/QC/QC.Rmd | 0 inst/templates/{ => multiomics}/teaseq/README.md | 0 inst/templates/{ => multiomics}/teaseq/information.R | 0 inst/templates/{ => multiomics}/teaseq/scripts/fix_filenames.R | 0 inst/templates/{ => multiomics}/teaseq/scripts/gex_adt_hto.sbatch | 0 inst/templates/{ => multiomics}/teaseq/scripts/gex_atac.sbatch | 0 inst/templates/{ => spatial}/cosmx/QC/QC.Rmd | 0 inst/templates/{ => spatial}/cosmx/QC/run_markdown.R | 0 inst/templates/{ => spatial}/cosmx/information.R | 0 11 files changed, 0 insertions(+), 0 deletions(-) rename inst/templates/{ => multiomics}/teaseq/QC/QC-01-load_data.R (100%) rename inst/templates/{ => multiomics}/teaseq/QC/QC-02-run_analysis.R (100%) rename inst/templates/{ => multiomics}/teaseq/QC/QC.Rmd (100%) rename inst/templates/{ => multiomics}/teaseq/README.md (100%) rename inst/templates/{ => multiomics}/teaseq/information.R (100%) rename inst/templates/{ => multiomics}/teaseq/scripts/fix_filenames.R (100%) rename inst/templates/{ => multiomics}/teaseq/scripts/gex_adt_hto.sbatch (100%) rename inst/templates/{ => multiomics}/teaseq/scripts/gex_atac.sbatch (100%) rename inst/templates/{ => spatial}/cosmx/QC/QC.Rmd (100%) rename inst/templates/{ => spatial}/cosmx/QC/run_markdown.R (100%) rename inst/templates/{ => spatial}/cosmx/information.R (100%) diff --git a/inst/templates/teaseq/QC/QC-01-load_data.R b/inst/templates/multiomics/teaseq/QC/QC-01-load_data.R similarity index 100% rename from inst/templates/teaseq/QC/QC-01-load_data.R rename to inst/templates/multiomics/teaseq/QC/QC-01-load_data.R diff --git a/inst/templates/teaseq/QC/QC-02-run_analysis.R b/inst/templates/multiomics/teaseq/QC/QC-02-run_analysis.R similarity index 100% rename from inst/templates/teaseq/QC/QC-02-run_analysis.R rename to inst/templates/multiomics/teaseq/QC/QC-02-run_analysis.R diff --git a/inst/templates/teaseq/QC/QC.Rmd b/inst/templates/multiomics/teaseq/QC/QC.Rmd similarity index 100% rename from inst/templates/teaseq/QC/QC.Rmd rename to inst/templates/multiomics/teaseq/QC/QC.Rmd diff --git a/inst/templates/teaseq/README.md b/inst/templates/multiomics/teaseq/README.md similarity index 100% rename from inst/templates/teaseq/README.md rename to inst/templates/multiomics/teaseq/README.md diff --git a/inst/templates/teaseq/information.R b/inst/templates/multiomics/teaseq/information.R similarity index 100% rename from inst/templates/teaseq/information.R rename to inst/templates/multiomics/teaseq/information.R diff --git a/inst/templates/teaseq/scripts/fix_filenames.R b/inst/templates/multiomics/teaseq/scripts/fix_filenames.R similarity index 100% rename from inst/templates/teaseq/scripts/fix_filenames.R rename to inst/templates/multiomics/teaseq/scripts/fix_filenames.R diff --git a/inst/templates/teaseq/scripts/gex_adt_hto.sbatch b/inst/templates/multiomics/teaseq/scripts/gex_adt_hto.sbatch similarity index 100% rename from inst/templates/teaseq/scripts/gex_adt_hto.sbatch rename to inst/templates/multiomics/teaseq/scripts/gex_adt_hto.sbatch diff --git a/inst/templates/teaseq/scripts/gex_atac.sbatch b/inst/templates/multiomics/teaseq/scripts/gex_atac.sbatch similarity index 100% rename from inst/templates/teaseq/scripts/gex_atac.sbatch rename to inst/templates/multiomics/teaseq/scripts/gex_atac.sbatch diff --git a/inst/templates/cosmx/QC/QC.Rmd b/inst/templates/spatial/cosmx/QC/QC.Rmd similarity index 100% rename from inst/templates/cosmx/QC/QC.Rmd rename to inst/templates/spatial/cosmx/QC/QC.Rmd diff --git a/inst/templates/cosmx/QC/run_markdown.R b/inst/templates/spatial/cosmx/QC/run_markdown.R similarity index 100% rename from inst/templates/cosmx/QC/run_markdown.R rename to inst/templates/spatial/cosmx/QC/run_markdown.R diff --git a/inst/templates/cosmx/information.R b/inst/templates/spatial/cosmx/information.R similarity index 100% rename from inst/templates/cosmx/information.R rename to inst/templates/spatial/cosmx/information.R From d243a707435bac6eccd9396f4e38bb964dacba0a Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 31 Jul 2024 17:56:16 -0400 Subject: [PATCH 29/93] move qc single cell --- inst/templates/singlecell/QC/QC.rmd | 400 ++++++++++++++++++++++++++++ 1 file changed, 400 insertions(+) create mode 100644 inst/templates/singlecell/QC/QC.rmd diff --git a/inst/templates/singlecell/QC/QC.rmd b/inst/templates/singlecell/QC/QC.rmd new file mode 100644 index 0000000..60bb9de --- /dev/null +++ b/inst/templates/singlecell/QC/QC.rmd @@ -0,0 +1,400 @@ +--- +title: "scRNA QC" +output: html_document +date: "`r Sys.Date()`" +params: + ## If you have Ribosomal ratio in your raw seurat object put this as TRUE otherwise leave as FALSE + ribosomal: FALSE +--- + +# Overview + +- Project: project +- PI: PI +- Analyst: analyst + + +```{r, eval=FALSE} +### READ ME FIRST + +# This is a template for scRNA QC to present to your client. The actual QC can be done using our rshiny app: + +# After you have decided on your QC metrics load your raw object (i.e. right after you first read data into seurat) and create your QC object by editing lines 49-67. + +# Edit text line 246 with your chosen QC cutoffs! +``` + + + +```{r setup, include=FALSE} +library(Seurat) +library(tidyverse) +library(ggplot2) + +knitr::opts_chunk[["set"]]( + cache = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4, + echo=FALSE) +``` + + + +```{r load and filter} +## Load data + +seurat_raw <- seurat_clust <- readRDS("seurat_pre-filtered.rds") + +## Creat QC object USE METRICS YOU CHOSE IN THE RSHINY APP + +seurat_qc <- subset(x = seurat_raw, + subset = (nCount_RNA >= 1500) + & (nFeature_RNA >= 2200) + & (mitoRatio < 0.1) + ## & (riboRatio < 0.4) + & (Log10GenesPerUMI > 0.80) + ) + + +## Save QC object + +saveRDS(seurat_qc, file = "seurat_post-QC.rds") + +``` + + + +```{r prep-info} + +## Prep information for plotting +metadata0 <- seurat_raw@meta.data + +metadata0 = metadata0 %>% dplyr::rename(nUMI = nCount_RNA, + nGene = nFeature_RNA) + +metadata1 <- seurat_qc@meta.data + + +metadata1 = metadata1 %>% dplyr::rename(nUMI = nCount_RNA, + nGene = nFeature_RNA) +``` + + +# QC metrics: raw data {.tabset} + +In this section, we review quality control (QC) metrics for the **raw feature matrices** generated by `Cellranger`. Only a low level filter excluding cells with <100 nUMIs (= number of unique molecular identifiers, or sequenced reads per cell) was applied when uploading the data into `R`. + + +## Cells per sample + +```{r cells raw} +table(metadata0$orig.ident) + +``` + + +## UMIs per cell + +Here, we look at the distribution of UMIs (unique molecular identifiers, or sequenced reads) per cell (droplet) in the dataset. Before QC, we expect a biomodal distribution with a first *small* peak at low numbers of UMIs (<250) corresponding to droplets that encapsulated background/dying cells, and a second higher peak centered at >1000. The line is at 250. + + +```{r raw_nUMIs} +metadata0 %>% + ggplot(aes(x = nUMI, color = orig.ident, fill = orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + ylab("Cell density") + scale_x_log10() + + geom_vline(xintercept = 250) + + facet_wrap(. ~ orig.ident) + + ggtitle("UMIs per cell in raw dataset") +``` + + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata0 %>% + ggplot(aes(x=orig.ident, y=log10(nUMI), fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(log10(1000), log10(50000))) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + +## Genes per cell + +Here, we look at the number of different genes that were detected in each cell. By "detected", we mean genes with a non-zero read count measurement. Gene detection in the range of 500 to 5000 is normal for most single-cell experiments. The line is at 750. + +```{r raw_nGene} +# Visualize the distribution of genes detected per cell (histogram) +metadata0 %>% + ggplot(aes(x = nGene, color = orig.ident, fill = orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + scale_x_log10() + + geom_vline(xintercept = c(700)) + + facet_wrap(. ~ orig.ident) + + ggtitle("Detected genes per cell in raw dataset") +``` + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata0 %>% + ggplot(aes(x=orig.ident, y=log10(nGene), fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(log10(700))) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + + +## Mitochondrial ratio + +We evaluate overall mitochondrial gene expression as a biomarker of cellular stress during sample preparation. Typically, we expect mitochondrial genes to account for <20% of overall transcripts in each cell. The line indicates 10%. + +```{r raw_mito, warning=FALSE} +# Visualize the distribution of mitochondrial gene expression detected per cell +metadata0 %>% + ggplot(aes(color = orig.ident, x = mitoRatio, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + geom_vline(xintercept = c(0.1)) + + facet_wrap(. ~ surgery) + + ggtitle("Percentage of mitochondrial gene expression per cell in raw dataset") +``` + + + +```{r raw_ribo, eval=ribosomal, warning=FALSE, results='asis'} + +cat("## Ribosomal ratio \n") + +cat("We evaluate overall ribosomal gene expression. The line indicates 5%. \n" +) +# Visualize the distribution of mitochondrial gene expression detected per cell +metadata0 %>% + ggplot(aes(color = orig.ident, x = riboRatio, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + geom_vline(xintercept = c(0.05)) + + facet_wrap(. ~ orig.ident) + + ggtitle("Percentage of ribosomal gene expression per cell in raw dataset") +``` + + +## UMIs vs. Genes + +By plotting the number of UMIs per cell (x-axis) vs. the number of genes per cell (y-axis), we can visually assess whether there is a large proportion of low quality cells with low read counts and/or gene detection (bottom left quadrant of the plot). In the following representation, cells are further color-coded based on the percentage of mitochondrial genes found among total detected genes. The line for nUMI is at 1000 and the line for nGene is at 700. + +```{r raw_gene_by_umi, fig.height=12, fig.width=15, warning=FALSE} +# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs +metadata0 %>% + ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) + + geom_point() + + stat_smooth(method=lm) + + scale_x_log10() + + scale_y_log10() + + theme_classic() + + geom_vline(xintercept = 1000) + + geom_hline(yintercept = 700) + + ggtitle("Genes vs. nUMIs in raw dataset") + + facet_wrap(~orig.ident) +``` + + +## Complexity + +Another way to assess the quality and purity of a single-cell dataset is to look for cells that have fewer detected genes per UMI than others. Typical values for this metric are >0.8 for most cells. Cells with lower diversity in the genes they express may be low-complexity cell types such as red blood cells. With sorted populations, we expect high purity and a very similar complexity distribution across samples. + +```{r raw_novelty} +# Visualize the overall novelty of the gene expression by visualizing the genes detected per UMI +metadata0 %>% + ggplot(aes(x = Log10GenesPerUMI, color = orig.ident, fill = orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + geom_vline(xintercept = c(0.85)) + + facet_wrap(. ~ orig.ident) + + ggtitle("log10(Genes per UMI) in raw dataset") +``` + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata0 %>% + ggplot(aes(x=orig.ident, Log10GenesPerUMI, fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(0.8, 0.85)) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + + +# QC metrics: Filtered data {.tabset} + +Based on the above QC metrics, we filtered the dataset to isolate cells passing the following thresholds: **>250 UMIs, >250 genes, <0.2 mitochondrial gene ratio, and >0.8 complexity**. + +In this section, we review QC metrics for our filtered dataset. + +## Cells per sample + +```{r cells filtered} +table(metadata1$orig.ident) + +``` + + +## UMIs per cell + +The line is at 1000 + +```{r qc1_nUMIs} +metadata1 %>% + ggplot(aes(color = orig.ident, x = nUMI, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + ylab("Cell density") + xlab("nUMI") + + geom_vline(xintercept = c(1000)) + + facet_wrap(. ~ orig.ident) +``` + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata1 %>% + ggplot(aes(x=orig.ident, y=log10(nUMI), fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(log10(1000))) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + + +## Genes detected + +The line is at 750 + +```{r qc1_genes} +# Visualize the distribution of genes detected per cell via histogram +metadata1 %>% + ggplot(aes(color = orig.ident, x = nGene, fill= orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + scale_x_log10() + xlab("nGene") + + facet_wrap(. ~ orig.ident) + + geom_vline(xintercept = c(750)) +``` + + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata1 %>% + ggplot(aes(x=orig.ident, y=log10(nGene), fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + geom_hline(yintercept = c(log10(7500))) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) + +``` + + +## Mitochondrial ratio + +The line is at 10%. + +```{r qc1_mitoratio, message=FALSE, warning=FALSE} +# Visualize the distribution of mitochondrial gene expression detected per cell +metadata1 %>% + ggplot(aes(color = orig.ident, x = mitoRatio, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + geom_vline(xintercept = 0.1) + + facet_wrap(. ~ surgery) +``` + + +```{r qc1_ribo, eval=ribosomal, warning=FALSE, results='asis'} + +cat("## Ribosomal ratio \n") + +cat("We evaluate overall ribosomal gene expression. The line indicates 10%. \n" +) +# Visualize the distribution of mitochondrial gene expression detected per cell +metadata0 %>% + ggplot(aes(color = orig.ident, x = riboRatio, fill = orig.ident)) + + geom_density(alpha = 0.2) + + scale_x_log10() + + theme_classic() + + geom_vline(xintercept = c(0.1)) + + facet_wrap(. ~ orig.ident) + + ggtitle("Percentage of ribosomal gene expression per cell in raw dataset") +``` + + +## UMIs vs. Genes + +Both the horizontal and vertical lines are at 1000. + +```{r qc1_genes_per_UMI, fig.height=12, fig.width=15, warning=FALSE} +# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs +metadata1 %>% + ggplot(aes(x = nUMI, y = nGene, color = mitoRatio)) + + geom_point() + + stat_smooth(method=lm) + + scale_x_log10() + + scale_y_log10() + + theme_classic() + + geom_vline(xintercept = c(1000)) + + geom_hline(yintercept = c(1000)) + + ggtitle("Genes vs. nUMIs in raw dataset") + + xlab("nUMI") + ylab("nGene") + + facet_wrap(~orig.ident) +``` + + +## Complexity + +```{r qc1_complexity} +# Visualize the overall novelty of the gene expression by visualizing the genes detected per UMI +metadata1 %>% + ggplot(aes(x = Log10GenesPerUMI, color = orig.ident, fill = orig.ident)) + + geom_density(alpha = 0.2) + + theme_classic() + + #geom_vline(xintercept = c(0.85)) + + facet_wrap(. ~ orig.ident) +``` + +```{r} +# Visualize the distribution of nUMIs per cell (boxplot) +metadata1 %>% + ggplot(aes(x=orig.ident, Log10GenesPerUMI, fill=orig.ident)) + + geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + + theme_classic() + + #geom_hline(yintercept = c(0.85)) + + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +``` + + + +# R session + +```{r} +sessionInfo() +``` + From 57d1ff0e82f137eab3d14d492e7c8aba7f46ed3b Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 31 Jul 2024 17:57:08 -0400 Subject: [PATCH 30/93] remove file --- inst/templates/singlecell/QC.rmd | 400 ------------------------------- 1 file changed, 400 deletions(-) delete mode 100644 inst/templates/singlecell/QC.rmd diff --git a/inst/templates/singlecell/QC.rmd b/inst/templates/singlecell/QC.rmd deleted file mode 100644 index 60bb9de..0000000 --- a/inst/templates/singlecell/QC.rmd +++ /dev/null @@ -1,400 +0,0 @@ ---- -title: "scRNA QC" -output: html_document -date: "`r Sys.Date()`" -params: - ## If you have Ribosomal ratio in your raw seurat object put this as TRUE otherwise leave as FALSE - ribosomal: FALSE ---- - -# Overview - -- Project: project -- PI: PI -- Analyst: analyst - - -```{r, eval=FALSE} -### READ ME FIRST - -# This is a template for scRNA QC to present to your client. The actual QC can be done using our rshiny app: - -# After you have decided on your QC metrics load your raw object (i.e. right after you first read data into seurat) and create your QC object by editing lines 49-67. - -# Edit text line 246 with your chosen QC cutoffs! -``` - - - -```{r setup, include=FALSE} -library(Seurat) -library(tidyverse) -library(ggplot2) - -knitr::opts_chunk[["set"]]( - cache = FALSE, - dev = c("png", "pdf"), - error = TRUE, - highlight = TRUE, - message = FALSE, - prompt = FALSE, - tidy = FALSE, - warning = FALSE, - fig.height = 4, - echo=FALSE) -``` - - - -```{r load and filter} -## Load data - -seurat_raw <- seurat_clust <- readRDS("seurat_pre-filtered.rds") - -## Creat QC object USE METRICS YOU CHOSE IN THE RSHINY APP - -seurat_qc <- subset(x = seurat_raw, - subset = (nCount_RNA >= 1500) - & (nFeature_RNA >= 2200) - & (mitoRatio < 0.1) - ## & (riboRatio < 0.4) - & (Log10GenesPerUMI > 0.80) - ) - - -## Save QC object - -saveRDS(seurat_qc, file = "seurat_post-QC.rds") - -``` - - - -```{r prep-info} - -## Prep information for plotting -metadata0 <- seurat_raw@meta.data - -metadata0 = metadata0 %>% dplyr::rename(nUMI = nCount_RNA, - nGene = nFeature_RNA) - -metadata1 <- seurat_qc@meta.data - - -metadata1 = metadata1 %>% dplyr::rename(nUMI = nCount_RNA, - nGene = nFeature_RNA) -``` - - -# QC metrics: raw data {.tabset} - -In this section, we review quality control (QC) metrics for the **raw feature matrices** generated by `Cellranger`. Only a low level filter excluding cells with <100 nUMIs (= number of unique molecular identifiers, or sequenced reads per cell) was applied when uploading the data into `R`. - - -## Cells per sample - -```{r cells raw} -table(metadata0$orig.ident) - -``` - - -## UMIs per cell - -Here, we look at the distribution of UMIs (unique molecular identifiers, or sequenced reads) per cell (droplet) in the dataset. Before QC, we expect a biomodal distribution with a first *small* peak at low numbers of UMIs (<250) corresponding to droplets that encapsulated background/dying cells, and a second higher peak centered at >1000. The line is at 250. - - -```{r raw_nUMIs} -metadata0 %>% - ggplot(aes(x = nUMI, color = orig.ident, fill = orig.ident)) + - geom_density(alpha = 0.2) + - theme_classic() + - ylab("Cell density") + scale_x_log10() + - geom_vline(xintercept = 250) + - facet_wrap(. ~ orig.ident) + - ggtitle("UMIs per cell in raw dataset") -``` - - - -```{r} -# Visualize the distribution of nUMIs per cell (boxplot) -metadata0 %>% - ggplot(aes(x=orig.ident, y=log10(nUMI), fill=orig.ident)) + - geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + - theme_classic() + - geom_hline(yintercept = c(log10(1000), log10(50000))) + - theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + - theme(plot.title = element_text(hjust = 0.5, face = "bold")) -``` - -## Genes per cell - -Here, we look at the number of different genes that were detected in each cell. By "detected", we mean genes with a non-zero read count measurement. Gene detection in the range of 500 to 5000 is normal for most single-cell experiments. The line is at 750. - -```{r raw_nGene} -# Visualize the distribution of genes detected per cell (histogram) -metadata0 %>% - ggplot(aes(x = nGene, color = orig.ident, fill = orig.ident)) + - geom_density(alpha = 0.2) + - theme_classic() + - scale_x_log10() + - geom_vline(xintercept = c(700)) + - facet_wrap(. ~ orig.ident) + - ggtitle("Detected genes per cell in raw dataset") -``` - - -```{r} -# Visualize the distribution of nUMIs per cell (boxplot) -metadata0 %>% - ggplot(aes(x=orig.ident, y=log10(nGene), fill=orig.ident)) + - geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + - theme_classic() + - geom_hline(yintercept = c(log10(700))) + - theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + - theme(plot.title = element_text(hjust = 0.5, face = "bold")) -``` - - -## Mitochondrial ratio - -We evaluate overall mitochondrial gene expression as a biomarker of cellular stress during sample preparation. Typically, we expect mitochondrial genes to account for <20% of overall transcripts in each cell. The line indicates 10%. - -```{r raw_mito, warning=FALSE} -# Visualize the distribution of mitochondrial gene expression detected per cell -metadata0 %>% - ggplot(aes(color = orig.ident, x = mitoRatio, fill = orig.ident)) + - geom_density(alpha = 0.2) + - scale_x_log10() + - theme_classic() + - geom_vline(xintercept = c(0.1)) + - facet_wrap(. ~ surgery) + - ggtitle("Percentage of mitochondrial gene expression per cell in raw dataset") -``` - - - -```{r raw_ribo, eval=ribosomal, warning=FALSE, results='asis'} - -cat("## Ribosomal ratio \n") - -cat("We evaluate overall ribosomal gene expression. The line indicates 5%. \n" -) -# Visualize the distribution of mitochondrial gene expression detected per cell -metadata0 %>% - ggplot(aes(color = orig.ident, x = riboRatio, fill = orig.ident)) + - geom_density(alpha = 0.2) + - scale_x_log10() + - theme_classic() + - geom_vline(xintercept = c(0.05)) + - facet_wrap(. ~ orig.ident) + - ggtitle("Percentage of ribosomal gene expression per cell in raw dataset") -``` - - -## UMIs vs. Genes - -By plotting the number of UMIs per cell (x-axis) vs. the number of genes per cell (y-axis), we can visually assess whether there is a large proportion of low quality cells with low read counts and/or gene detection (bottom left quadrant of the plot). In the following representation, cells are further color-coded based on the percentage of mitochondrial genes found among total detected genes. The line for nUMI is at 1000 and the line for nGene is at 700. - -```{r raw_gene_by_umi, fig.height=12, fig.width=15, warning=FALSE} -# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs -metadata0 %>% - ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) + - geom_point() + - stat_smooth(method=lm) + - scale_x_log10() + - scale_y_log10() + - theme_classic() + - geom_vline(xintercept = 1000) + - geom_hline(yintercept = 700) + - ggtitle("Genes vs. nUMIs in raw dataset") + - facet_wrap(~orig.ident) -``` - - -## Complexity - -Another way to assess the quality and purity of a single-cell dataset is to look for cells that have fewer detected genes per UMI than others. Typical values for this metric are >0.8 for most cells. Cells with lower diversity in the genes they express may be low-complexity cell types such as red blood cells. With sorted populations, we expect high purity and a very similar complexity distribution across samples. - -```{r raw_novelty} -# Visualize the overall novelty of the gene expression by visualizing the genes detected per UMI -metadata0 %>% - ggplot(aes(x = Log10GenesPerUMI, color = orig.ident, fill = orig.ident)) + - geom_density(alpha = 0.2) + - theme_classic() + - geom_vline(xintercept = c(0.85)) + - facet_wrap(. ~ orig.ident) + - ggtitle("log10(Genes per UMI) in raw dataset") -``` - - -```{r} -# Visualize the distribution of nUMIs per cell (boxplot) -metadata0 %>% - ggplot(aes(x=orig.ident, Log10GenesPerUMI, fill=orig.ident)) + - geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + - theme_classic() + - geom_hline(yintercept = c(0.8, 0.85)) + - theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + - theme(plot.title = element_text(hjust = 0.5, face = "bold")) -``` - - -# QC metrics: Filtered data {.tabset} - -Based on the above QC metrics, we filtered the dataset to isolate cells passing the following thresholds: **>250 UMIs, >250 genes, <0.2 mitochondrial gene ratio, and >0.8 complexity**. - -In this section, we review QC metrics for our filtered dataset. - -## Cells per sample - -```{r cells filtered} -table(metadata1$orig.ident) - -``` - - -## UMIs per cell - -The line is at 1000 - -```{r qc1_nUMIs} -metadata1 %>% - ggplot(aes(color = orig.ident, x = nUMI, fill = orig.ident)) + - geom_density(alpha = 0.2) + - scale_x_log10() + - theme_classic() + - ylab("Cell density") + xlab("nUMI") + - geom_vline(xintercept = c(1000)) + - facet_wrap(. ~ orig.ident) -``` - - -```{r} -# Visualize the distribution of nUMIs per cell (boxplot) -metadata1 %>% - ggplot(aes(x=orig.ident, y=log10(nUMI), fill=orig.ident)) + - geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + - theme_classic() + - geom_hline(yintercept = c(log10(1000))) + - theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + - theme(plot.title = element_text(hjust = 0.5, face = "bold")) -``` - - -## Genes detected - -The line is at 750 - -```{r qc1_genes} -# Visualize the distribution of genes detected per cell via histogram -metadata1 %>% - ggplot(aes(color = orig.ident, x = nGene, fill= orig.ident)) + - geom_density(alpha = 0.2) + - theme_classic() + - scale_x_log10() + xlab("nGene") + - facet_wrap(. ~ orig.ident) + - geom_vline(xintercept = c(750)) -``` - - -```{r} -# Visualize the distribution of nUMIs per cell (boxplot) -metadata1 %>% - ggplot(aes(x=orig.ident, y=log10(nGene), fill=orig.ident)) + - geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + - theme_classic() + - geom_hline(yintercept = c(log10(7500))) + - theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + - theme(plot.title = element_text(hjust = 0.5, face = "bold")) - -``` - - -## Mitochondrial ratio - -The line is at 10%. - -```{r qc1_mitoratio, message=FALSE, warning=FALSE} -# Visualize the distribution of mitochondrial gene expression detected per cell -metadata1 %>% - ggplot(aes(color = orig.ident, x = mitoRatio, fill = orig.ident)) + - geom_density(alpha = 0.2) + - scale_x_log10() + - theme_classic() + - geom_vline(xintercept = 0.1) + - facet_wrap(. ~ surgery) -``` - - -```{r qc1_ribo, eval=ribosomal, warning=FALSE, results='asis'} - -cat("## Ribosomal ratio \n") - -cat("We evaluate overall ribosomal gene expression. The line indicates 10%. \n" -) -# Visualize the distribution of mitochondrial gene expression detected per cell -metadata0 %>% - ggplot(aes(color = orig.ident, x = riboRatio, fill = orig.ident)) + - geom_density(alpha = 0.2) + - scale_x_log10() + - theme_classic() + - geom_vline(xintercept = c(0.1)) + - facet_wrap(. ~ orig.ident) + - ggtitle("Percentage of ribosomal gene expression per cell in raw dataset") -``` - - -## UMIs vs. Genes - -Both the horizontal and vertical lines are at 1000. - -```{r qc1_genes_per_UMI, fig.height=12, fig.width=15, warning=FALSE} -# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs -metadata1 %>% - ggplot(aes(x = nUMI, y = nGene, color = mitoRatio)) + - geom_point() + - stat_smooth(method=lm) + - scale_x_log10() + - scale_y_log10() + - theme_classic() + - geom_vline(xintercept = c(1000)) + - geom_hline(yintercept = c(1000)) + - ggtitle("Genes vs. nUMIs in raw dataset") + - xlab("nUMI") + ylab("nGene") + - facet_wrap(~orig.ident) -``` - - -## Complexity - -```{r qc1_complexity} -# Visualize the overall novelty of the gene expression by visualizing the genes detected per UMI -metadata1 %>% - ggplot(aes(x = Log10GenesPerUMI, color = orig.ident, fill = orig.ident)) + - geom_density(alpha = 0.2) + - theme_classic() + - #geom_vline(xintercept = c(0.85)) + - facet_wrap(. ~ orig.ident) -``` - -```{r} -# Visualize the distribution of nUMIs per cell (boxplot) -metadata1 %>% - ggplot(aes(x=orig.ident, Log10GenesPerUMI, fill=orig.ident)) + - geom_violin() + geom_boxplot(width = 0.1, fill = alpha("white", 0.7)) + - theme_classic() + - #geom_hline(yintercept = c(0.85)) + - theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + - theme(plot.title = element_text(hjust = 0.5, face = "bold")) -``` - - - -# R session - -```{r} -sessionInfo() -``` - From 0a1dcfa85df5bfb0f3cb4bfe4a1340e3562f8d86 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 31 Jul 2024 18:01:34 -0400 Subject: [PATCH 31/93] remove old files --- inst/templates/rnaseq/QC/QC.Rmd | 410 ------------------ .../rnaseq/de/Multiplicative_DE_docs.md | 129 ------ .../rnaseq/de/PCA_variance_analysis.Rmd | 35 -- inst/templates/rnaseq/qc/placeholder | 0 4 files changed, 574 deletions(-) delete mode 100644 inst/templates/rnaseq/QC/QC.Rmd delete mode 100644 inst/templates/rnaseq/de/Multiplicative_DE_docs.md delete mode 100644 inst/templates/rnaseq/de/PCA_variance_analysis.Rmd delete mode 100644 inst/templates/rnaseq/qc/placeholder diff --git a/inst/templates/rnaseq/QC/QC.Rmd b/inst/templates/rnaseq/QC/QC.Rmd deleted file mode 100644 index 48afe76..0000000 --- a/inst/templates/rnaseq/QC/QC.Rmd +++ /dev/null @@ -1,410 +0,0 @@ ---- -title: "Quality Control" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - params_file: params_qc.R - project_file: ../information.R ---- - - -```{r source_params, echo = F} -source(params$params_file) -source(params$project_file) -``` - -```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} -library(tidyverse) -library(knitr) -library(DESeq2) -library(DEGreport) -library(ggrepel) -library(pheatmap) -# library(RColorBrewer) -library(DT) -library(pheatmap) -library(bcbioR) -ggplot2::theme_set(theme_light(base_size = 14)) -opts_chunk[["set"]]( - cache = FALSE, - cache.lazy = FALSE, - dev = c("png", "pdf"), - error = TRUE, - highlight = TRUE, - message = FALSE, - prompt = FALSE, - tidy = FALSE, - warning = FALSE, - fig.height = 4) -``` - - -```{r subchunkify, echo=FALSE, eval=FALSE} -#' Create sub-chunks for plots -#' -#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots -#' -#' @param pl a plot object -#' @param fig.height figure height -#' @param fig.width figure width -#' @param chunk_name name of the chunk -#' -#' @author Andreas Scharmueller \email{andschar@@protonmail.com} -#' -subchunkify = function(pl, - fig.height = 7, - fig.width = 5, - chunk_name = 'plot') { - pl_deparsed = paste0(deparse(function() { - pl - }), collapse = '') - - sub_chunk = paste0( - "```{r ", - chunk_name, - ", fig.height=", - fig.height, - ", fig.width=", - fig.width, - ", dpi=72", - ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}", - "\n(", - pl_deparsed, - ")()", - "\n```" - ) - - cat(knitr::knit( - text = knitr::knit_expand(text = sub_chunk), - quiet = TRUE - )) -} - -``` - - -```{r sanitize-datatable} -sanitize_datatable = function(df, ...) { - # remove dashes which cause wrapping - DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), - colnames=gsub("-", "_", colnames(df))) -} -``` - -# Overview - -- Project: `r project` -- PI: `r PI` -- Analyst: `r analyst` -- Experiment: `r experiment` -- Aim: `r aim` - - -# Samples and metadata - -```{r load_metadata} -meta_df=read_csv(metadata_fn) %>% mutate(sample = tolower(description)) %>% - dplyr::select(-description) - -ggplot(meta_df, aes(sample_type, fill = sample_type)) + - geom_bar() + ylab("") + xlab("") + - scale_fill_cb_friendly() -``` - - -```{r show-metadata} -se <- readRDS(se_object) #local - - -metrics <- metadata(se)$metrics %>% - full_join(meta_df , by = c("sample" = "sample")) - -meta_sm <- meta_df %>% - as.data.frame() %>% - column_to_rownames("sample") - -meta_sm %>% sanitize_datatable() - -``` - -# Read metrics {.tabset} - -## Total reads - -Here, we want to see consistency and a minimum of 20 million reads. - -```{r plot_total_reads} -metrics %>% - ggplot(aes(x = sample_type, - y = total_reads, - color = sample_type)) + - geom_point(alpha=0.5) + - coord_flip() + - scale_y_continuous(name = "million reads") + - scale_color_cb_friendly() + - ggtitle("Total reads") - -``` - -```{r calc_min_max_pct_mapped} -#get min percent mapped reads for reference -min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1) -max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) -``` - -## Mapping rate - -The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping. These samples have mapping rates (`r min_pct_mapped` - `r max_pct_mapped`%). - -```{r plot_mapping_rate} -metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1) -metrics %>% - ggplot(aes(x = sample_type, - y = mapped_reads_pct, - color = sample_type)) + - geom_point() + - coord_flip() + - scale_color_cb_friendly() + - ylim(0, 100) + - ggtitle("Mapping rate") + - geom_hline(yintercept=70, color = cb_friendly_cols('blue')) -``` - - -## Number of genes detected - -The number of genes represented in every sample is expected to be consistent and over 20K (blue line). - -```{r plot_genes_detected} -genes_detected <- colSums(assays(se)[["raw"]] > 0) %>% enframe() -sample_names <- metrics[,c("sample"), drop=F] -genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample")) -genes_detected <- genes_detected %>% group_by(name) -genes_detected <- summarise(genes_detected, - n_genes = max(value)) - -metrics <- metrics %>% - left_join(genes_detected, by = c("sample" = "name")) -ggplot(metrics,aes(x = sample_type, - y = n_genes, color = sample_type)) + - geom_point() + - coord_flip() + - scale_color_cb_friendly() + - ggtitle("Number of genes") + - ylab("Number of genes") + - xlab("") + - geom_hline(yintercept=20000, color = cb_friendly_cols('blue')) -``` - - -## Gene detection saturation - -This plot shows how complex the samples are. We expect samples with more reads to detect more genes. - -```{r plot_gene_saturation} -metrics %>% - ggplot(aes(x = total_reads, - y = n_genes, - color = sample_type)) + - geom_point()+ - scale_x_log10() + - scale_color_cb_friendly() + - ggtitle("Gene saturation") + - ylab("Number of genes") -``` - -## Exonic mapping rate - -Here we are looking for consistency, and exonic mapping rates around 70% or 75% (blue and red lines, respectively). - -```{r plot_exonic_mapping_rate} -metrics %>% - ggplot(aes(x = sample_type, - y = exonic_rate * 100, - color = sample_type)) + - geom_point() + - ylab("Exonic rate %") + - ggtitle("Exonic mapping rate") + - scale_color_cb_friendly() + - coord_flip() + - xlab("") + - ylim(c(0,100)) + - geom_hline(yintercept=70, color = cb_friendly_cols('blue')) + - geom_hline(yintercept=75, color = cb_friendly_cols('brown')) -``` - -## Intronic mapping rate - -Here, we expect a low intronic mapping rate (≤ 15% - 20%) - -```{r plot_intronic_mapping_rate} -metrics %>% - ggplot(aes(x = sample_type, - y = intronic_rate * 100, - color = sample_type)) + - geom_point() + - ylab("Intronic rate %") + - ggtitle("Intronic mapping rate") + - scale_color_cb_friendly() + - coord_flip() + - xlab("") + - ylim(c(0,100)) + - geom_hline(yintercept=20, color = cb_friendly_cols('blue')) + - geom_hline(yintercept=15, color = cb_friendly_cols('brown')) -``` - -## Intergenic mapping rate - -Here, we expect a low intergenic mapping rate, which is true for all samples. - -```{r plot_intergenic_mapping_rate} -metrics %>% - ggplot(aes(x = sample_type, - y = intergenic_rate * 100, - color = sample_type)) + - geom_point() + - ylab("Intergenic rate %") + - ggtitle("Intergenic mapping rate") + - coord_flip() + - scale_color_cb_friendly() + - ylim(c(0, 100)) -``` - -## rRNA mapping rate - -Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10% - -```{r plot_rrna_mapping_rate} -# for some bad samples it could be > 50% -rrna_ylim <- max(round(metrics$r_rna_rate*100, 2)) + 10 -metrics %>% - ggplot(aes(x = sample_type, - y = r_rna_rate * 100, - color = sample_type)) + - geom_point() + - ylab("rRNA rate, %")+ - ylim(0, rrna_ylim) + - ggtitle("rRNA mapping rate") + - coord_flip() + - scale_color_cb_friendly() -``` - -## 5'->3' bias - -There should be little bias, i.e. the values should be close to 1, or at least consistent among samples - -```{r plot_53_bias} -metrics %>% - ggplot(aes(x = sample_type, - y = x5_3_bias, - color = sample_type)) + - geom_point() + - ggtitle("5'-3' bias") + - coord_flip() + - ylim(c(0.5,1.5)) + - scale_color_cb_friendly()+ - geom_hline(yintercept=1, color = cb_friendly_cols('blue')) -``` - -## Counts per gene - all genes - -We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar - -```{r plot_counts_per_gene} -metrics_small <- metrics %>% dplyr::select(sample, sample_type) -metrics_small <- left_join(sample_names, metrics_small) - -counts <- - assays(se)[["raw"]] %>% - as_tibble() %>% - filter(rowSums(.)!=0) %>% - gather(name, counts) - -counts <- left_join(counts, metrics, by = c("name" = "sample")) - -ggplot(counts, aes(sample_type, - log2(counts+1), - fill = sample_type)) + - geom_boxplot() + - scale_fill_cb_friendly() + - ggtitle("Counts per gene, all non-zero genes") + - scale_color_cb_friendly() -``` - - -# Sample similarity analysis - -In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA). - -## Principal component analysis (PCA) {.tabset} - -Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells). - - -```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} -raw_counts <- assays(se)[["raw"]] %>% - as_tibble() %>% - filter(rowSums(.)!=0) %>% - as.matrix() - -vst <- vst(raw_counts) - -#fix samples names -coldat_for_pca <- as.data.frame(metrics) -rownames(coldat_for_pca) <- coldat_for_pca$sample -coldat_for_pca <- coldat_for_pca[colnames(raw_counts),] -pca1 <- degPCA(vst, coldat_for_pca, - condition = "sample_type", data = T)[["plot"]] -pca2 <- degPCA(vst, coldat_for_pca, - condition = "sample_type", data = T, pc1="PC3", pc2="PC4")[["plot"]] - -pca1 + scale_color_cb_friendly() -pca2 + scale_color_cb_friendly() -``` - - -```{r, eval=FALSE} -variables=degCovariates(vst, coldat_for_pca) -``` - - -```{r clustering fig, fig.width = 10, fig.asp = .62} -## Hierarchical clustering - -vst_cor <- cor(vst) - -annotation_cols <- cb_friendly_pal('grey')(length(unique(coldat_for_pca$sample_type))) -names(annotation_cols) <- unique(coldat_for_pca$sample_type) - -p <- pheatmap(vst_cor, - annotation = coldat_for_pca %>% select(sample_type) %>% mutate(sample_type = as.factor(sample_type)), - show_rownames = T, - show_colnames = T, - color = cb_friendly_pal('heatmap')(15), - annotation_colors = list(sample_type = annotation_cols) -) -p - -``` - -# R session - -List and version of tools used for the QC report generation. - -```{r} -sessionInfo() -``` diff --git a/inst/templates/rnaseq/de/Multiplicative_DE_docs.md b/inst/templates/rnaseq/de/Multiplicative_DE_docs.md deleted file mode 100644 index aa35021..0000000 --- a/inst/templates/rnaseq/de/Multiplicative_DE_docs.md +++ /dev/null @@ -1,129 +0,0 @@ -# Overview - -This is an example of complex DE analysis with multiple covariates with multiple levels. - -We have the SEX variable (2 levels) and the GENOTYPE VARIABLE (4 levels) - -# Intercept Analysis - -``` -# Model design and creating dds object from the dataset -design = ~sex + genotype + sex:genotype -dds <- DESeqDataSet(se_Striatum, design) -``` - -## Filtering lowly expressed genes -We are filtering out genes with fewer than 10 raw counts in total and are present in fewer than 3 samples. - -``` -keep <- rowSums(counts(dds)>=10) >=4 -dds <- dds[keep, ] -#dds # comment out this line to print the dds object and compare the dimension of the dataset before and after filtering is applied. -``` - -setting up WT as reference genotype and Male and reference sex. Otherwise DESeq2 will use the conditions in their alphabetical order. - -``` -dds$genotype <- relevel(dds$genotype, ref = "WT") -dds$sex <- relevel(dds$sex, ref = "Male") - -#Checking model design and reference condition comment out the three lines below to print the design and order of genotype and sex -design(dds) -levels(dds$genotype) -levels(dds$sex) - -#estimating size factors for normalization and fitting our model with DESeq model -dds <- estimateSizeFactors(dds) -dds <- DESeq(dds) -resultsNames(dds) #This will print out the name of coefficients being compared, comment it out to view - -# get coefficient matrix -mod_mat <- model.matrix(design(dds), data = colData(dds)) -mod_mat - -(Intercept) sexFemale genotypeCR3KO genotypeQ175 genotypeQ175_CR3KO sexFemale:genotypeCR3KO sexFemale:genotypeQ175 sexFemale:genotypeQ175_CR3KO -a10_st_q175_m_r1 1 0 0 1 0 0 0 0 -a12_st_q175_f_r1 1 1 0 1 0 0 1 0 -a14_st_wt_m_r1 1 0 0 0 0 0 0 0 -a16_st_wt_f_r1 1 1 0 0 0 0 0 0 -``` - - -coefficient weights extracted from the mod_mat above - -``` -WT_M <- c(1, 0, 0, 0, 0, 0, 0, 0) -WT_F <- c(1, 1, 0, 0, 0, 0, 0, 0) -WTCR3ko_M <- c(1, 0, 1, 0, 0, 0, 0, 0) -WTCR3ko_F <- c( 1, 1, 1, 0, 0, 1, 0, 0) -Q175_M <- c(1, 0, 0, 1, 0, 0, 0, 0) -Q175_F <- c(1, 1, 0, 1, 0, 0, 1, 0) -Q175CR3ko_M <- c(1, 0, 0, 0, 1, 0, 0, 0) -Q175CR3ko_F <- c(1, 1, 0, 0, 1, 0, 0, 1) -``` - -# Differential gene expression analysis - -## Comp_2: Female vs Male : WTCR3ko -``` -comp2_F.v.M_WTCR3ko <- results(dds, contrast = c(WTCR3ko_F - WTCR3ko_M)) -comp2_F.v.M_WTCR3ko_shrink <- lfcShrink(dds, contrast = c(WTCR3ko_F - WTCR3ko_M), type = "ashr") -summary(comp2_F.v.M_WTCR3ko) -``` - -## Comp_3: Female vs Male : Q175 -``` -comp3_F.v.M_Q175 <- results(dds, contrast = c(Q175_F - Q175_M)) -comp3_F.v.M_Q175_shrink <- lfcShrink(dds, contrast = c(Q175_F - Q175_M), type = "ashr") -summary(comp3_F.v.M_Q175) -``` - -## Comp_5: WTCR3ko vs WT : Male -```{r} -comp5_WTCR3ko.v.WT_Male <- results(dds, contrast = c(WTCR3ko_M - WT_M)) -comp5_WTCR3ko.v.WT_Male_shrink <- lfcShrink(dds, contrast = c(WTCR3ko_M - WT_M), type = "ashr") -summary(comp5_WTCR3ko.v.WT_Male) -``` - -## Comp_6: WTCR3ko vs WT : Female -```{r} -comp6_WTCR3ko.v.WT_Female <- results(dds, contrast = c(WTCR3ko_F - WT_F)) -comp6_WTCR3ko.v.WT_Female_shrink <- lfcShrink(dds, contrast = c(WTCR3ko_F - WT_F), type = "ashr") -summary(comp6_WTCR3ko.v.WT_Female) -``` - -## Comp_11: Q175CR3ko vs Q175 : Male -``` -comp11_Q175CR3ko.v.Q175_Male <- results(dds, contrast = c(Q175CR3ko_M - Q175_M)) -comp11_Q175CR3ko.v.Q175_Male_shrink <- lfcShrink(dds, contrast = c(Q175CR3ko_M - Q175_M), type = "ashr") -summary(comp11_Q175CR3ko.v.Q175_Male) -``` - -## Comp_12: Q175CR3ko vs Q175 : Female -``` -comp12_Q175CR3ko.v.Q175_Female <- results(dds, contrast = c(Q175CR3ko_F - Q175_F)) -comp12_Q175CR3ko.v.Q175_Female_shrink <- lfcShrink(dds, contrast = c(Q175CR3ko_F - Q175_F), type = "ashr") -summary(comp12_Q175CR3ko.v.Q175_Female) -``` - -## Comp_15: (Q175CR3ko-Q176) - (WTCR3ko - WT) : Male - -Does the CR3 knockout in Q175 differ from CR3 knockout in WT for Males? - -``` -comp15_CR3koinQ175.v.CR3koinWT_Male <- results(dds, - contrast = c(Q175CR3ko_M - Q175_M) - (WTCR3ko_M - WT_M)) -comp15_CR3koinQ175.v.CR3koinWT_Male_shrink <- lfcShrink(dds, - contrast = c(Q175CR3ko_M - Q175_M) - (WTCR3ko_M - WT_M), type = "ashr") -summary(comp15_CR3koinQ175.v.CR3koinWT_Male) -``` - -## Comp_17: (WTCR3koall) - (WTall) - -Does the average of the samples in WTCR3KO differ from average of the samples in WT - -``` -comp17_WTCR3all.v.WTall <- results(dds, contrast = c(WTCR3ko_M + WTCR3ko_F)/2 - (WT_M + WT_F)/2) -comp17_WTCR3all.v.WTall_shrink <- lfcShrink(dds, contrast = c((WTCR3ko_M + WTCR3ko_F)/2 - (WT_M + WT_F)/2), type = "ashr") -summary(comp17_WTCR3all.v.WTall) -``` diff --git a/inst/templates/rnaseq/de/PCA_variance_analysis.Rmd b/inst/templates/rnaseq/de/PCA_variance_analysis.Rmd deleted file mode 100644 index aadabec..0000000 --- a/inst/templates/rnaseq/de/PCA_variance_analysis.Rmd +++ /dev/null @@ -1,35 +0,0 @@ ---- -title: "PCA with variance analysis" -author: "Harvard Chan Bioinformatics Core" ---- - -```{r } -library(DEGreport) -library(ggplot2) -library(ggforce) - -data("bcbio_vsd_data") - -colors=cb_friendly_cols(1:15) -ggplot2::theme_set(theme_prism(base_size = 14)) - -pca <- degPCA(assay(bcbio_vsd_data), colData(bcbio_vsd_data), - condition = "sample_type", name = "sample", data = T) - -pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vst), " genes")) + - theme(plot.title=element_text(hjust=0.5)) + - geom_mark_ellipse(aes(color = sample_type)) -``` - -Information on [betadisper](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) to do analyses of multivariate homogeneity of group dispersions (variances). - -```{r} -# NOTE:This is not confirmed to be a valid test but it could help to understand the data -library(vegan) -vare.disa <- vegdist(t(assay(bcbio_vsd_data))) - -mod = betadisper(vare.disa, colData(bcbio_vsd_data)[['sample_type']]) -anova(mod) -``` - - diff --git a/inst/templates/rnaseq/qc/placeholder b/inst/templates/rnaseq/qc/placeholder deleted file mode 100644 index e69de29..0000000 From 94a149eefe0d723b5841c77ca11db941f140c031 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 31 Jul 2024 18:02:12 -0400 Subject: [PATCH 32/93] trigger name change --- inst/templates/rnaseq/DEa/DEG.Rmd | 581 ++++++++++++++++++++++++++++++ 1 file changed, 581 insertions(+) create mode 100644 inst/templates/rnaseq/DEa/DEG.Rmd diff --git a/inst/templates/rnaseq/DEa/DEG.Rmd b/inst/templates/rnaseq/DEa/DEG.Rmd new file mode 100644 index 0000000..c2069be --- /dev/null +++ b/inst/templates/rnaseq/DEa/DEG.Rmd @@ -0,0 +1,581 @@ +--- +title: "Differential Expression" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + # Put hg38, mm10, mm39, or other + + ## Combatseq and ruv can both be false or ONLY ONE can be true + ## Both cannot be true + numerator: tumor + denominator: normal + column: sample_type + subset_column: null + subset_value: null + genome: hg38 + ruv: false + combatseq: false + params_file: params_de-example.R + project_file: ../information.R + functions_file: load_data.R +--- + + +```{r} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +``` + + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. Set up input files in this R file (params_de.R) +source(params$params_file) +# 2. Set up project file (already done from QC probably) +source(params$project_file) +# 3. Load custom functions to load data from coldata/metrics/counts +source(params$functions_file) +# IMPORTANT set these values if you are not using the parameters in the header (lines 22-31) +genome=params$genome +column=params$column +numerator=params$numerator +denominator=params$denominator +subset_column=params$subset_column +subset_value=params$subset_value +run_ruv=params$ruv +run_combatseq=params$combatseq +factor_of_interest <- column +``` + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(rtracklayer) +library(DESeq2) +library(tidyverse) +library(stringr) +library(DEGreport) +library(ggpubr) +library(msigdbr) +library(fgsea) +library(org.Hs.eg.db) +library(knitr) +library(EnhancedVolcano) +library(bcbioR) +library(ggprism) +library(viridis) +library(pheatmap) +library(janitor) +library(ggforce) +library(vegan) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + +```{r sanitize_datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + + + +```{r load_data, message=F, warning=F} +# This code will load from bcbio or nf-core folder +# NOTE make sure to set numerator and denominator +coldata <- load_coldata(coldata_fn, column, + numerator, denominator, + subset_column, subset_value) +coldata$sample=row.names(coldata) + +counts <- load_counts(counts_fn) +counts <- counts[,colnames(counts) %in% coldata$sample] + +metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% + left_join(coldata, by = c('sample')) %>% + as.data.frame() +rownames(metrics) <- metrics$sample +# if the names don't match in order or string check files names and coldata information +counts = counts[,rownames(metrics)] +coldata = coldata[rownames(metrics),] +stopifnot(all(names(counts) == rownames(metrics))) +``` + + + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` +- Comparison: `r ifelse(is.null(subset_value), paste0(numerator, ' vs. ', denominator), paste0(subset_value, ': ', numerator, ' vs. ', denominator))` + +```{r create_filenames} + +if (!is.null(subset_value) & !is.null(subset_value)){ + filenames = str_interp("${subset_value}_${numerator}_vs_${denominator}") +} else { + filenames = str_interp("${numerator}_vs_${denominator}") +} + +contrasts = c(column,numerator,denominator) +coef=paste0(column,"_",numerator,"_vs_",denominator) + +name_expression_fn=file.path( + basedir, + str_interp("${filenames}_expression.csv")) +name_deg_fn=file.path( + basedir, + str_interp("${filenames}_deg.csv")) +name_pathways_fn=file.path( + basedir, + str_interp("${filenames}_pathways.csv")) + +``` + +```{r load_counts_data} + +rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL') %>% + dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) %>% + distinct(gene_id, .keep_all = TRUE) + +``` + +```{r setup_RUV} +dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1) + +vsd_before <- vst(dds_to_use) +norm_matrix = assay(vsd_before) +``` + + +# PCA and group level variance. + +**Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).** + +Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm. + +**It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.** + +**This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.** + +```{r set group, eval=FALSE, echo=FALSE} +## Example of creating a group covariate + +meta$group <- paste0(meta$sex,"_", meta$age,"_",meta$treatment) + +factor_of_interest <- "insert column name for covariate of interest" +``` + + +```{r PCA} +pca <- degPCA(norm_matrix, metrics, + condition = factor_of_interest, name = "sample", data = T) + +pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) + + theme(plot.title=element_text(hjust=0.5)) + + geom_mark_ellipse(aes(color = sample_type)) + scale_color_cb_friendly() +``` + +## PERMDISP + +Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) + +Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison. + +```{r PERMDISP} +vare.disa <- vegdist(t(assay(vsd_before))) + +mod = betadisper(vare.disa, metrics[[factor_of_interest]]) +anova(mod) +permutest(mod, pairwise = TRUE) + +``` + + + +# Covariate analysis + +Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values. + +Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.** + + +```{r covariates, fig.height = 6, fig.width = 10} +degCovariates( + norm_matrix, + metrics, +) +``` + + + +```{r init_DESEQ} +formula <- as.formula(paste0("~ ", " + ", column)) +## Check if sample name matches +stopifnot(all(names(counts) == rownames(coldata))) + +dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = formula) + +vsd_before <- vst(dds_to_use) +norm_matrix = assay(vsd_before) +new_cdata <- coldata +``` + + +```{r, eval=F, echo=FALSE} +#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION + +### RUV - LINES 261-296 +### COMBATSEQ - LINES 303-369 +``` + + + +```{r, eval=run_ruv, results='asis', echo=run_ruv} +cat("# Remove Unwanted Variability + +When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") +``` + +```{r do_RUV, eval=run_ruv, echo=run_ruv} +library(RUVSeq) + +# If you want to skip the code, just set up formula to be your model in the next chunk of code +design <- coldata[[column]] +diffs <- makeGroups(design) +dat <- norm_matrix +# by default is running one variable, +# change K parameter to other number to find more unknown covariates +ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F) +vars <- ruvset$W + +new_cdata <- cbind(coldata, vars) + +formula <- as.formula(paste0("~ ", + paste0( + colnames(new_cdata)[grepl("W", colnames(new_cdata))], + collapse = " + " + ), " + ", column) +) +norm_matrix=ruvset$normalizedCounts +pca2 <- degPCA(norm_matrix, new_cdata, + condition = column) + ggtitle('After RUV') +pca2 + scale_color_cb_friendly() + +``` + +```{r after_RUV, eval=run_ruv} + +dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) +vsd_to_use<- vst(dds_to_use, blind=FALSE) + +``` + +```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq} +library(sva) + +cat("# Remove Batch Effects + +Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. + +Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).") + +``` + + +```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq} + +## FILL OUT THIS CHUNK OF CODE IF YOU WANT TO RUN COMBATSEQ + +## Set your batch effect variable here this is the variable that combatseq will try to remove + +## Column name of your batch variable +to_remove = "batch" + +## Column name of of your variable(s) of interest + +to_keep = "sample_type" + + +coldata[[to_remove]] <- as.factor(coldata[[to_remove]]) +coldata[[to_keep]] <- as.factor(coldata[[to_keep]]) + + +batch = coldata[[to_remove]] +treatment = coldata[[to_keep]] + +## If you have multiple variables of interest you will need to cbind them into one variable + +#treatment1 = metrics[[to_keep]] +#treatment2 = metrics[[to_keep]] +#treatment3 = metrics[[to_keep]] + + +# imp = cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3))) + +``` + + +```{r do_combatseq, eval=run_combatseq} +adjusted_counts <- ComBat_seq(as.matrix(counts), batch=batch, group = treatment) + +## For multiple variables of interest + +# adjusted_counts <- ComBat_seq(as.matrix(counts2), batch=batch, covar_mod = imp) + +``` + +```{r after_combatseq, eval=run_combatseq} +# NOTE: Make sure the formula doens't contain the covariates used in combatseq above +dds_to_use <- DESeqDataSetFromMatrix(adjusted_counts, coldata, design = formula) +vsd_combat<- vst(dds_to_use, blind=FALSE) + +combat_matrix = assay(vsd_combat) + +pca_combat <- degPCA(combat_matrix, coldata, + condition = column) + ggtitle('After Combatseq') +pca_combat + scale_color_cb_friendly() + +``` + + +# Differential Expression + +Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model. + +Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit. + +We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. + +```{r DE} +de <- DESeq(dds_to_use) + +DESeq2::plotDispEsts(de) +``` + +Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level. + +```{r lfc_shrink} +# resultsNames(de) # check the order is right +resLFC = results(de, contrast=contrasts) +resLFCS <- lfcShrink(de, coef=coef, type="apeglm") + +res <- as.data.frame(resLFCS) %>% + rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% + relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>% + mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi) + +## Filter out genes that have no expression or were filtered out by DESEQ2 +res <- res[res$baseMean>0,] %>% drop_na(padj) %>% drop_na(pvalue) + +res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj) %>% + mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name)) + +res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5)) +show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")]) + +degMA(as.DEGSet(resLFC)) + ggtitle('Before LFC Shrinking') +``` + +## MA plot + +```{r after_lfc_shrink} +degMA(as.DEGSet(resLFCS), limit = 2) + ggtitle('After LFC Shrinking') + +``` + +## Volcano plot + +This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in red are genes that have padj < 0.05 and a log2-fold change > 1. Points in blue have a padj < 0.05 and a log2-fold change < 1 and points in green have a padj > 0.05 and a log2-fold change > 2. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2 foldchance and padj that we have chosen. + +```{r volcano_plot, fig.height=6} +# degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) +EnhancedVolcano(res_mod, + lab= res_mod$gene_name, + pCutoff = 0.05, + selectLab = c(res_sig$gene_name[1:15]), + FCcutoff = 0.5, + x = 'lfc', + y = 'padj', + title="Volcano Tumor vs. Normal", + col=as.vector(colors[c("dark_grey", "light_blue", + "purple", "purple")]), + subtitle = "", drawConnectors = T, max.overlaps = Inf) +``` + +## Heatmap + +```{r heapmap} +### Run pheatmap using the metadata data frame for the annotation +ma=norm_matrix[res_sig$gene_id,] +colma=coldata[,c(column), drop=FALSE] +colors=lapply(colnames(colma), function(c){ + l.col=colors[1:length(unique(colma[[c]]))] + names(l.col)=unique(colma[[c]]) + l.col +}) +names(colors)=colnames(colma) +pheatmap(ma, + color = inferno(10), + cluster_rows = T, + show_rownames = F, + annotation = colma, + annotation_colors = colors, + border_color = NA, + fontsize = 10, + scale = "row", + fontsize_row = 10, + height = 20) +``` + + +## Differentially Expressed Genes + +```{r sig_genes_table} +res_sig %>% sanitize_datatable +``` + +## Plot top 16 genes + +```{r top n DEGs, fig.height = 6, fig.width = 8} +n = 16 +top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% + dplyr::select(gene_name, gene_id) +top_n_exp <- norm_matrix %>% as.data.frame() %>% + rownames_to_column('gene_id') %>% + # dplyr::select(-group, -group_name) %>% + pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>% + right_join(top_n, relationship = "many-to-many") %>% + left_join(coldata, by = 'sample') + +ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + + geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + + geom_point() + + facet_wrap(~gene_name) + + ggtitle(str_interp('Expression of Top ${n} DEGs')) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + +``` + +# Pathway Enrichment + +From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment. + +```{r} +universe=res %>% + filter(!is.na(padj)) %>% pull(gene_id) +mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENTREZID', 'ENSEMBL') + +all_in_life=list( + msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"), + msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), + msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG"), + msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), + msigdbr(species = "human", category = "C5", subcategory = "GO:BP"), + msigdbr(species = "human", category = "C5", subcategory = "GO:MF"), + msigdbr(species = "human", category = "C5", subcategory = "HPO"), + msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), + msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") +) + +ora_input = res %>% filter(!is.na(padj), padj<0.01, abs(lfc)>0.3) %>% pull(gene_id) +input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) + +total_deg=length(unique(ora_input))/length(unique(mapping$ENTREZID)) +pathways_ora_all = lapply(all_in_life, function(p){ + pathway = split(x = p$entrez_gene, f = p$gs_name) + db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") + respath <- fora(pathways = pathway, + genes = unique(input_entrezid$ENTREZID), + universe = unique(mapping$ENTREZID), + minSize = 15, + maxSize = 500) + coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], + pathway, unique(input_entrezid$ENTREZID), unique(mapping$ENTREZID)) + as_tibble(respath[pathway %in% coll_respath$mainPathways]) %>% + mutate(database=db_name, NES=(overlap/size)/(total_deg)) +}) %>% bind_rows() %>% + mutate(analysis="ORA") + +ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% + group_by(pathway) %>% + left_join(mapping, by =c("overlapGenes"="ENTREZID")) %>% + dplyr::select(pathway, padj, NES, ENSEMBL, analysis, + database) + +pathways_long = ora_tb + +``` + + +```{r pathaways_table} +pathways_ora_all %>% sanitize_datatable() +``` + + +```{r write-files} +counts_norm=norm_matrix %>% as.data.frame() %>% + rownames_to_column("gene_id") %>% + mutate(comparison = str_interp("${numerator}_vs_${denominator}")) + +res_for_writing <- res %>% + mutate(comparison = str_interp("${numerator}_vs_${denominator}")) + +pathways_for_writing <- pathways_long %>% + mutate(comparison = str_interp("${numerator}_vs_${denominator}")) + +if (!is.null(subset_value)){ + counts_norm <- counts_norm %>% + mutate(subset = subset_value) + res_for_writing <- res_for_writing %>% + mutate(subset = subset_value) + pathways_for_writing <- pathways_for_writing %>% + mutate(subset = subset_value) +} + +write_csv(counts_norm, name_expression_fn) +write_csv(res_for_writing, name_deg_fn) +write_csv(pathways_for_writing, name_pathways_fn) +``` + +# R session + +List and version of tools used for the DE report generation. + +```{r} +sessionInfo() +``` From c67499a1b028de1b1abcfc03437eed86bb9a2fd4 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 31 Jul 2024 18:03:53 -0400 Subject: [PATCH 33/93] remove rnaseq --- .../rnaseq/DE/Comparison-intersections.Rmd | 258 -------- .../rnaseq/DE/Cross-comparison-analysis.Rmd | 236 ------- inst/templates/rnaseq/DE/params_de-example.R | 18 - inst/templates/rnaseq/DE/params_de.R | 22 - inst/templates/rnaseq/DE/run_markdown.R | 32 - inst/templates/rnaseq/DEa/DEG.Rmd | 581 ----------------- inst/templates/rnaseq/QC/params_qc.R | 4 - inst/templates/rnaseq/QC/params_qc_nf-core.R | 11 - inst/templates/rnaseq/QC/run_markdown.R | 13 - inst/templates/rnaseq/README.md | 66 -- inst/templates/rnaseq/de/DEG.Rmd | 581 ----------------- inst/templates/rnaseq/de/load_data.R | 146 ----- inst/templates/rnaseq/information.R | 6 - inst/templates/rnaseq/org/hcbc_README.md | 75 --- inst/templates/rnaseq/qc/QC-bcbio.Rmd | 409 ------------ inst/templates/rnaseq/qc/QC_nf-core.Rmd | 585 ------------------ .../rnaseq/qc/params_qc_nf-core-example.R | 9 - 17 files changed, 3052 deletions(-) delete mode 100644 inst/templates/rnaseq/DE/Comparison-intersections.Rmd delete mode 100644 inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd delete mode 100644 inst/templates/rnaseq/DE/params_de-example.R delete mode 100644 inst/templates/rnaseq/DE/params_de.R delete mode 100644 inst/templates/rnaseq/DE/run_markdown.R delete mode 100644 inst/templates/rnaseq/DEa/DEG.Rmd delete mode 100644 inst/templates/rnaseq/QC/params_qc.R delete mode 100644 inst/templates/rnaseq/QC/params_qc_nf-core.R delete mode 100644 inst/templates/rnaseq/QC/run_markdown.R delete mode 100644 inst/templates/rnaseq/README.md delete mode 100644 inst/templates/rnaseq/de/DEG.Rmd delete mode 100644 inst/templates/rnaseq/de/load_data.R delete mode 100644 inst/templates/rnaseq/information.R delete mode 100644 inst/templates/rnaseq/org/hcbc_README.md delete mode 100644 inst/templates/rnaseq/qc/QC-bcbio.Rmd delete mode 100644 inst/templates/rnaseq/qc/QC_nf-core.Rmd delete mode 100644 inst/templates/rnaseq/qc/params_qc_nf-core-example.R diff --git a/inst/templates/rnaseq/DE/Comparison-intersections.Rmd b/inst/templates/rnaseq/DE/Comparison-intersections.Rmd deleted file mode 100644 index b073469..0000000 --- a/inst/templates/rnaseq/DE/Comparison-intersections.Rmd +++ /dev/null @@ -1,258 +0,0 @@ ---- -title: "Comparing DE Results - Multiple Contrasts" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - project_file: ../information.R ---- - -```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} -## Adjusted P-value used for significance -padj_co <- 0.05 -## Log2FC used for significance. If no cutoff used put 0 -LFC <- 0.5 -## Normalized counts for ALL samples -# norm <- "/Users/emb016/Documents/comparisons_templates/norm_counts.csv" -# Load the count data, for this example it is the last columns of the DE table -norm_counts <- read.csv("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/norm_counts.csv.gz", - row.names = 1) - -# Load the meta data, here we are making one for the exmaple -metadata <- read_csv("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/meta.csv.gz") %>% as.data.frame() - -## Full results file (all genes) for contrast 1 -files=c("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group1.csv.gz", - "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group2.csv.gz", - "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group3.csv.gz") - -``` - - - -```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} -library(rtracklayer) -library(tidyverse) -library(stringr) -library(ggpubr) -library(knitr) -library(bcbioR) -library(ggprism) -library(viridis) -library(pheatmap) -library(janitor) -library(ggvenn) -library(ggplot2) -library(UpSetR) -library(ggprism) -#library(org.Ce.eg.db) -library(org.Hs.eg.db) -#library(org.Mm.eg.db) - -colors=cb_friendly_cols(1:15) -ggplot2::theme_set(theme_prism(base_size = 14)) -opts_chunk[["set"]]( - cache = F, - cache.lazy = FALSE, - dev = c("png", "pdf"), - error = TRUE, - highlight = TRUE, - message = FALSE, - prompt = FALSE, - tidy = FALSE, - warning = FALSE, - echo = T, - fig.height = 4) - -# set seed for reproducibility -set.seed(1234567890L) -``` - - -# Load Data - -We load our dataset - -```{r load_data} - -## Name of contrast. This will be displayed on the figures. -# you can manually indicate a list of names as comp_names=c("name1","name2"...) -comp_names = basename(files) %>% - str_remove_all("all_results_|.csv|.gz") %>% - str_replace_all("_", " ") -names(files)=comp_names -N=length(files) -stopifnot(length(files)==length(comp_names)) - -## Make sure you have set up N above -all_genes=lapply(names(files), function(name){ - data <- read_csv(files[name]) %>% - dplyr::filter(padj <= 1) -}) -sign_genes=lapply(names(files), function(name){ - data <- read_csv(files[name]) %>% - dplyr::filter(padj <= 1) - data %>% - dplyr::filter(padj < padj_co, abs(lfc) > LFC) -}) -``` - - - -# Make list of comparisons - - -```{r, fig.height=8, fig.width=8, warning=FALSE, error=FALSE, message=FALSE} -de=lapply(sign_genes, function(x){ - x$gene_id -}) -names(de) <- comp_names -``` - -## Make an upset plot - -Because we have done so many tests venn diagrams no longer work for our data. Instead we will use upset plots. *These plots are relatively intuitive for 2 or 3 categories, but can tend to get more complex for >3 categories. In all cases, you will find the categories being compared and their size listed below the bar plots on the left. As you look to the right (directly below each bar) there are dots with connecting lines that denote which categories the overlap is between, or if there is no overlap (just a dot). The numbers at the top of the bars denote the size of the overlap.* - - -```{r, fig.height=8, fig.width=12} -upset(fromList(de), order.by = "freq", nsets=N) - -``` - -## Pull intersect(s) of interest - -After identifying intersect(s) of interest we can determine which genes are found in which intersections - - -```{r, warning=FALSE, error=FALSE, message=FALSE} -## Grab intersection -gene_names <- data.frame(gene=unique(unlist(de))) - -df1 <- lapply(de,function(x){ - data.frame(gene = x) -}) %>% - bind_rows(.id = "path") - -df_int <- lapply(gene_names$gene,function(x){ - # pull the name of the intersections - intersection <- df1 %>% - dplyr::filter(gene==x) %>% - arrange(path) %>% - pull("path") %>% - paste0(collapse = "|") - # build the dataframe - data.frame(gene = x,int = intersection) -}) %>% bind_rows() -``` - - -```{r, eval=F} -## Run this code to find the name of your intersect of interest. You will use this in the next code chunk -table(df_int$int) -``` - -```{r, warning=FALSE, error=FALSE, message=FALSE} -## subset interaction of interest replace the intersect name with the name of the intersect from above. You can copy and paste the below commands to grab multiple intersects. - -Intersect1 <- subset(df_int, df_int$int=="DMSO vs Group2|DMSO vs Group3") -``` - -## Get annotation data -```{r, warning=FALSE, error=FALSE, message=FALSE} - -# edit this to be the correct organism. One set of annotations per intersect. -# rdata = AnnotationDbi::select(org.Hs.eg.db, Intersect1$gene, 'SYMBOL', 'ENSEMBL') %>% -# dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) %>% distinct(gene_id, .keep_all = T) - -# FIX: following code is only for test data, use the above with real data -rdata=data.frame(gene_id=row.names(norm_counts), gene_name=row.names(norm_counts)) -``` - - - -## Heatmap of intersect - -We generate a heatmap with all samples to see the patterns contained in this intersect. - -```{r, fig.height=6, warning=FALSE, error=FALSE, message=FALSE} -## Assign factors of interest. These need to correspond to columns in your metadata. - -factor1 <- "Treatment" -factor2 <- "Cell_line" - -# Extract significant genes -stopifnot(all(Intersect1$gene %in% row.names(norm_counts))) -sigGenes <- Intersect1$gene - -### Extract normalized expression for significant genes -norm_sig <- norm_counts[sigGenes,] -meta <- data.frame(metadata[,print(factor1)],metadata[,print(factor1)]) -colnames(meta) <- c(print(factor1),print(factor2)) -rownames(meta) <- colnames(norm_sig) -### Set a color palette -heat_colors <- lapply(colnames(norm_sig), function(c){ - l.col=colors[1:length(unique(norm_sig[[c]]))] - names(l.col)=unique(norm_sig[[c]]) - l.col -}) - -### Run pheatmap using the metadata data frame for the annotation (11 x 5) -pheatmap(norm_sig, - color = inferno(10), - cluster_rows = T, - show_rownames = F, - annotation = meta, - annotation_colors = heat_colors, - border_color = NA, - fontsize = 10, - scale = "row", - fontsize_row = 10, - height = 20) -``` - - -## Graph all genes in intersect - -```{r, warning=FALSE, error=FALSE, message=FALSE} -Intersect1_annot <- Intersect1 %>% left_join(rdata, by=c("gene"="gene_id")) -# REMOVE to plot all -Intersect1_annot <- Intersect1_annot[1:10] - -graphs <- length(Intersect1_annot$gene) -to_test <- t(norm_counts) -rna = Intersect1_annot$gene -names = Intersect1_annot$gene_name - -to_graph = data.frame(to_test[,rna]) -to_graph = to_graph[Intersect1_annot$gene] -to_graph$Factor1 <- metadata[,factor1] -to_graph$Factor2 <- metadata[,factor2] - -#out <- vector("list", length = graphs) -for (i in seq(1,graphs)) { - to_graph$temp=to_graph[[i]] - print(ggplot(to_graph,aes(x=Factor1,y=temp,color=Factor2)) + - geom_boxplot() + geom_point(alpha=0.5) + ylab(paste0(names[[i]])) + xlab(factor1) + scale_color_discrete(name = "Covariate")) -} -``` - -# R session - -List and version of tools used for the QC report generation. - -```{r} -sessionInfo() -``` diff --git a/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd b/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd deleted file mode 100644 index 4e78ea6..0000000 --- a/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd +++ /dev/null @@ -1,236 +0,0 @@ ---- -title: "Comparing DE Results - Pairwise" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - project_file: ../information.R ---- - - - -```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} -# 1. Set up input files in this R file (params_pairwisecomp.R) -## Full results file (all genes) for contrastt 1 -comp1_fn <- 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group1.csv.gz' -## Name of contrast 1. This will be displayed on the figures -comp1_name <- "DMSO vs. Group1" -## Full results file (all genes) for contrast 2 -comp2_fn <- 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group2.csv.gz' -## Name of contrast 2. This will be displayed on the figures -comp2_name <- "DMSO vs. Group2" -## Adjusted P-value used for significance -padj_co <- 0.05 -## Log2FC used for significance. If no cutoff used put 0 -LFC <- 0.5 - -comp1 <- read_csv(comp1_fn) %>% - dplyr::filter(padj <= 1) -comp2 <- read_csv(comp2_fn) %>% - dplyr::filter(padj <= 1) -``` - - - -```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} -library(rtracklayer) -library(tidyverse) -library(stringr) -library(ggpubr) -library(knitr) -library(bcbioR) -library(ggprism) -library(viridis) -library(pheatmap) -library(janitor) -library(ggvenn) -library(ggplot2) - -colors=cb_friendly_cols(1:15) -ggplot2::theme_set(theme_prism(base_size = 14)) -opts_chunk[["set"]]( - cache = F, - cache.lazy = FALSE, - dev = c("png", "pdf"), - error = TRUE, - highlight = TRUE, - message = FALSE, - prompt = FALSE, - tidy = FALSE, - warning = FALSE, - echo = T, - fig.height = 4) - -# set seed for reproducibility -set.seed(1234567890L) -``` - - -# Load Data - -We load our dataset - -```{r load_data} -# this code will load from bcbio or nf-core folder -# NOTE make sure to set numerator and denominator - -comp1_sig <- comp1 %>% - dplyr::filter(padj < padj_co, abs(lfc) > LFC) - -comp2_sig <- comp2 %>% - dplyr::filter(padj < padj_co, abs(lfc) > LFC) -``` - - - -# Comparisons - -We start with a venn diagram looking at the overlap between our two contrasts - -```{r, fig.height=8, fig.width=8} -name1 <- rlang::ensym(comp1_name) -name2 <- rlang::ensym(comp2_name) -names <- c(name1, name2) - -full <- list(comp1_sig$gene_id,comp2_sig$gene_id) -names(full) <-names - -ggvenn(full, show_percentage = F) - -``` - -## Compare effect sizes and direction - -We plot Log2FC for our contrasts and color points by whether or not they are significant in our contrasts. The black line is 1:1. - - -```{r fig.height=6, fig.width=8} -# Edit based on the data you are using - -#make sure to only use genes present in both results files -test_intersect <- intersect(comp1$gene_id, comp2$gene_id) -comp1_sub <- subset(comp1, comp1$gene_id %in% test_intersect) -comp2_sub <- subset(comp2, comp2$gene_id %in% test_intersect) - -## Check that gene names match -all(comp1_sub$gene_id== comp2_sub$gene_id) - -## Gather necessary data -lfc <- data.frame(comp1_sub$gene_id, comp1_sub$gene_name, comp1_sub$lfc, comp2_sub$lfc) -colnames(lfc) <- c("gene_id","gene_name", "comp1", "comp2") - -# subset to only include genes in both datasets and color by grouping -DE_comp1 <- setdiff(comp1_sig$gene_id, comp2_sig$gene_id) -DE_comp2 <- setdiff(comp2_sig$gene_id, comp1_sig$gene_id) -DE_both <- intersect(comp2_sig$gene_id, comp1_sig$gene_id) -not_sig <- comp1_sub$gene_id[!(comp1_sub$gene_id %in% c(DE_comp1,DE_comp2,DE_both))] - - -col <- rep(4, nrow(lfc)) -col[lfc$gene_id %in% not_sig] <- 1 -col[lfc$gene_id %in% DE_comp1] <- 2 -col[lfc$gene_id %in% DE_comp2] <- 3 -col[lfc$gene_id %in% DE_both] <- 4 - - -lfc$col <- lfc %>% - dplyr::mutate(color = case_when( - gene_id %in% DE_both ~ 3, - gene_id %in% DE_comp1 ~ 1, - gene_id %in% DE_comp2 ~ 2, - gene_id %in% not_sig ~ 8 - )) %>% pull(color) -lfc$col <- as.factor(lfc$col) - - -ggplot(lfc, aes(x=comp1, y=comp2, color=col)) + geom_point() + - labs(color="Group") + - scale_color_discrete(name = "Group", labels = c(paste0("Only DE in ",paste0(comp1_name)), paste0("Only DE in ",paste0(comp2_name)),"DE in both comparisons", "Not Significant")) + - geom_abline(intercept=0, slope=1) + - geom_hline(aes(yintercept=0)) + - geom_vline(aes(xintercept=0)) + - scale_color_cb_friendly() + - xlab(paste0("Log2FC in ",paste0(comp1_name))) + - ylab(paste0("Log2FC in ",paste0(comp2_name))) - -``` - - - -## Compare ajusted P-values - -We plot adjusted P-values for our contrasts and color points by whether or not they are significant in our contrasts. The black line is 1:1. - - -```{r fig.height=6, fig.width=8} -# Edit based on the data you are using - -#make sure to only use genes present in both results files -test_intersect <- intersect(comp1$gene_id, comp2$gene_id) -comp1_sub <- subset(comp1, comp1$gene_id %in% test_intersect) -comp2_sub <- subset(comp2, comp2$gene_id %in% test_intersect) - -## Check that gene names match -all(comp1_sub$gene_id== comp2_sub$gene_id) - -## Gather necessary data -lfc <- data.frame(comp1_sub$gene_id, comp1_sub$gene_name, comp1_sub$padj, comp2_sub$padj) -colnames(lfc) <- c("gene_id","gene_name", "comp1", "comp2") - -# subset to only include genes in both datasets and color by grouping -DE_comp1 <- setdiff(comp1_sig$gene_id, comp2_sig$gene_id) -DE_comp2 <- setdiff(comp2_sig$gene_id, comp1_sig$gene_id) -DE_both <- intersect(comp2_sig$gene_id, comp1_sig$gene_id) -not_sig <- comp1_sub$gene_id[!(comp1_sub$gene_id %in% c(DE_comp1,DE_comp2,DE_both))] - - -col <- rep(4, nrow(lfc)) -col[lfc$gene_id %in% not_sig] <- 1 -col[lfc$gene_id %in% DE_comp1] <- 2 -col[lfc$gene_id %in% DE_comp2] <- 3 -col[lfc$gene_id %in% DE_both] <- 4 - - -lfc$col <- lfc %>% - dplyr::mutate(color = case_when( - gene_id %in% DE_both ~ 3, - gene_id %in% DE_comp1 ~ 1, - gene_id %in% DE_comp2 ~ 2, - gene_id %in% not_sig ~ 8 - )) %>% pull(color) -lfc$col <- as.factor(lfc$col) - - -ggplot(lfc, aes(x=-log10(comp1), y=-log10(comp2), color=col)) + - geom_point() + labs(color="Group") + - scale_color_discrete(name = "Group", labels = c(paste0("-Log10 adjusted p-value ",paste0(comp1_name)), paste0("-Log10 adjusted p-value ",paste0(comp2_name)),"DE in both comparisons", "Not Significant")) + - geom_abline(intercept=0, slope=1) + - geom_hline(aes(yintercept=0)) + - geom_vline(aes(xintercept=0)) + - scale_color_cb_friendly() + - xlab(paste0("Log2FC in ",paste0(comp1_name))) + - ylab(paste0("Log2FC in ",paste0(comp2_name))) - -``` - - -# R session - -List and version of tools used for the QC report generation. - -```{r} -sessionInfo() -``` diff --git a/inst/templates/rnaseq/DE/params_de-example.R b/inst/templates/rnaseq/DE/params_de-example.R deleted file mode 100644 index cc75ad2..0000000 --- a/inst/templates/rnaseq/DE/params_de-example.R +++ /dev/null @@ -1,18 +0,0 @@ -# project params -date = "YYYYMMDD" -basedir <- './' # where to write down output files - -# params for bcbio -# coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" -# counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' -# se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") -# - -# Example data -coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' -counts_fn=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv') -# This folder is in the output directory inside multiqc folder -multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' -# This file is inside the genome folder in the output directory -gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' -se_object = NA diff --git a/inst/templates/rnaseq/DE/params_de.R b/inst/templates/rnaseq/DE/params_de.R deleted file mode 100644 index 8426428..0000000 --- a/inst/templates/rnaseq/DE/params_de.R +++ /dev/null @@ -1,22 +0,0 @@ -# project params -date = "YYYYMMDD" -basedir <- './' # where to write down output files - -# params for bcbio -# coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" -# counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' -# se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") -# - -# params for nfcore -# Your data -# This is the file used to run nf-core or compatible to that -coldata_fn='/Path/to/metadata/meta.csv' -# This file is inside star_salmon/ folder -counts_fn='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv' -# This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder -multiqc_data_dir='/path/to/nf-core/output/star_salmon/multiqc_report_data' -# This file is inside the genome folder in the output directory, use this only non-model organism -# gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' -se_object = NA - diff --git a/inst/templates/rnaseq/DE/run_markdown.R b/inst/templates/rnaseq/DE/run_markdown.R deleted file mode 100644 index 79e15a0..0000000 --- a/inst/templates/rnaseq/DE/run_markdown.R +++ /dev/null @@ -1,32 +0,0 @@ -library(rmarkdown) -# set working directory to this file before using the function - - -# set directory to this file folder -setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -# example running with test data -render_de <- function(column, numerator, denominator, subset_value = NULL, - params_file = 'params_de-testdata.R'){ - - rmarkdown::render(input = "DEG.Rmd", - output_dir = ".", - output_format = "html_document", - output_file = ifelse(!is.null(subset_value), - paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), - paste0('DE_', numerator, '_vs_', denominator, '.html') - ), - clean = TRUE, - envir = new.env(), - params = list( - column = column, - subset_value = subset_value, - numerator = numerator, - denominator = denominator, - params_file = params_file, - project_file = '../information.R', - functions_file = 'load_data.R' - ) - ) -} -#Example data -render_de("sample_type","tumor", "normal") diff --git a/inst/templates/rnaseq/DEa/DEG.Rmd b/inst/templates/rnaseq/DEa/DEG.Rmd deleted file mode 100644 index c2069be..0000000 --- a/inst/templates/rnaseq/DEa/DEG.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Differential Expression" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - # Put hg38, mm10, mm39, or other - - ## Combatseq and ruv can both be false or ONLY ONE can be true - ## Both cannot be true - numerator: tumor - denominator: normal - column: sample_type - subset_column: null - subset_value: null - genome: hg38 - ruv: false - combatseq: false - params_file: params_de-example.R - project_file: ../information.R - functions_file: load_data.R ---- - - -```{r} -# This set up the working directory to this file so all files can be found -library(rstudioapi) -setwd(fs::path_dir(getSourceEditorContext()$path)) -``` - - -```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} -# 1. Set up input files in this R file (params_de.R) -source(params$params_file) -# 2. Set up project file (already done from QC probably) -source(params$project_file) -# 3. Load custom functions to load data from coldata/metrics/counts -source(params$functions_file) -# IMPORTANT set these values if you are not using the parameters in the header (lines 22-31) -genome=params$genome -column=params$column -numerator=params$numerator -denominator=params$denominator -subset_column=params$subset_column -subset_value=params$subset_value -run_ruv=params$ruv -run_combatseq=params$combatseq -factor_of_interest <- column -``` - - -```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} -library(rtracklayer) -library(DESeq2) -library(tidyverse) -library(stringr) -library(DEGreport) -library(ggpubr) -library(msigdbr) -library(fgsea) -library(org.Hs.eg.db) -library(knitr) -library(EnhancedVolcano) -library(bcbioR) -library(ggprism) -library(viridis) -library(pheatmap) -library(janitor) -library(ggforce) -library(vegan) - -colors=cb_friendly_cols(1:15) -ggplot2::theme_set(theme_prism(base_size = 14)) -opts_chunk[["set"]]( - cache = F, - cache.lazy = FALSE, - dev = c("png", "pdf"), - error = TRUE, - highlight = TRUE, - message = FALSE, - prompt = FALSE, - tidy = FALSE, - warning = FALSE, - echo = T, - fig.height = 4) - -# set seed for reproducibility -set.seed(1234567890L) -``` - -```{r sanitize_datatable} -sanitize_datatable = function(df, ...) { - # remove dashes which cause wrapping - DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), - colnames=gsub("-", "_", colnames(df))) -} -``` - - - -```{r load_data, message=F, warning=F} -# This code will load from bcbio or nf-core folder -# NOTE make sure to set numerator and denominator -coldata <- load_coldata(coldata_fn, column, - numerator, denominator, - subset_column, subset_value) -coldata$sample=row.names(coldata) - -counts <- load_counts(counts_fn) -counts <- counts[,colnames(counts) %in% coldata$sample] - -metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% - left_join(coldata, by = c('sample')) %>% - as.data.frame() -rownames(metrics) <- metrics$sample -# if the names don't match in order or string check files names and coldata information -counts = counts[,rownames(metrics)] -coldata = coldata[rownames(metrics),] -stopifnot(all(names(counts) == rownames(metrics))) -``` - - - -# Overview - -- Project: `r project` -- PI: `r PI` -- Analyst: `r analyst` -- Experiment: `r experiment` -- Aim: `r aim` -- Comparison: `r ifelse(is.null(subset_value), paste0(numerator, ' vs. ', denominator), paste0(subset_value, ': ', numerator, ' vs. ', denominator))` - -```{r create_filenames} - -if (!is.null(subset_value) & !is.null(subset_value)){ - filenames = str_interp("${subset_value}_${numerator}_vs_${denominator}") -} else { - filenames = str_interp("${numerator}_vs_${denominator}") -} - -contrasts = c(column,numerator,denominator) -coef=paste0(column,"_",numerator,"_vs_",denominator) - -name_expression_fn=file.path( - basedir, - str_interp("${filenames}_expression.csv")) -name_deg_fn=file.path( - basedir, - str_interp("${filenames}_deg.csv")) -name_pathways_fn=file.path( - basedir, - str_interp("${filenames}_pathways.csv")) - -``` - -```{r load_counts_data} - -rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL') %>% - dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) %>% - distinct(gene_id, .keep_all = TRUE) - -``` - -```{r setup_RUV} -dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1) - -vsd_before <- vst(dds_to_use) -norm_matrix = assay(vsd_before) -``` - - -# PCA and group level variance. - -**Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).** - -Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm. - -**It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.** - -**This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.** - -```{r set group, eval=FALSE, echo=FALSE} -## Example of creating a group covariate - -meta$group <- paste0(meta$sex,"_", meta$age,"_",meta$treatment) - -factor_of_interest <- "insert column name for covariate of interest" -``` - - -```{r PCA} -pca <- degPCA(norm_matrix, metrics, - condition = factor_of_interest, name = "sample", data = T) - -pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) + - theme(plot.title=element_text(hjust=0.5)) + - geom_mark_ellipse(aes(color = sample_type)) + scale_color_cb_friendly() -``` - -## PERMDISP - -Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) - -Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison. - -```{r PERMDISP} -vare.disa <- vegdist(t(assay(vsd_before))) - -mod = betadisper(vare.disa, metrics[[factor_of_interest]]) -anova(mod) -permutest(mod, pairwise = TRUE) - -``` - - - -# Covariate analysis - -Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values. - -Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.** - - -```{r covariates, fig.height = 6, fig.width = 10} -degCovariates( - norm_matrix, - metrics, -) -``` - - - -```{r init_DESEQ} -formula <- as.formula(paste0("~ ", " + ", column)) -## Check if sample name matches -stopifnot(all(names(counts) == rownames(coldata))) - -dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = formula) - -vsd_before <- vst(dds_to_use) -norm_matrix = assay(vsd_before) -new_cdata <- coldata -``` - - -```{r, eval=F, echo=FALSE} -#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION - -### RUV - LINES 261-296 -### COMBATSEQ - LINES 303-369 -``` - - - -```{r, eval=run_ruv, results='asis', echo=run_ruv} -cat("# Remove Unwanted Variability - -When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") -``` - -```{r do_RUV, eval=run_ruv, echo=run_ruv} -library(RUVSeq) - -# If you want to skip the code, just set up formula to be your model in the next chunk of code -design <- coldata[[column]] -diffs <- makeGroups(design) -dat <- norm_matrix -# by default is running one variable, -# change K parameter to other number to find more unknown covariates -ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F) -vars <- ruvset$W - -new_cdata <- cbind(coldata, vars) - -formula <- as.formula(paste0("~ ", - paste0( - colnames(new_cdata)[grepl("W", colnames(new_cdata))], - collapse = " + " - ), " + ", column) -) -norm_matrix=ruvset$normalizedCounts -pca2 <- degPCA(norm_matrix, new_cdata, - condition = column) + ggtitle('After RUV') -pca2 + scale_color_cb_friendly() - -``` - -```{r after_RUV, eval=run_ruv} - -dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) -vsd_to_use<- vst(dds_to_use, blind=FALSE) - -``` - -```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq} -library(sva) - -cat("# Remove Batch Effects - -Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. - -Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).") - -``` - - -```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq} - -## FILL OUT THIS CHUNK OF CODE IF YOU WANT TO RUN COMBATSEQ - -## Set your batch effect variable here this is the variable that combatseq will try to remove - -## Column name of your batch variable -to_remove = "batch" - -## Column name of of your variable(s) of interest - -to_keep = "sample_type" - - -coldata[[to_remove]] <- as.factor(coldata[[to_remove]]) -coldata[[to_keep]] <- as.factor(coldata[[to_keep]]) - - -batch = coldata[[to_remove]] -treatment = coldata[[to_keep]] - -## If you have multiple variables of interest you will need to cbind them into one variable - -#treatment1 = metrics[[to_keep]] -#treatment2 = metrics[[to_keep]] -#treatment3 = metrics[[to_keep]] - - -# imp = cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3))) - -``` - - -```{r do_combatseq, eval=run_combatseq} -adjusted_counts <- ComBat_seq(as.matrix(counts), batch=batch, group = treatment) - -## For multiple variables of interest - -# adjusted_counts <- ComBat_seq(as.matrix(counts2), batch=batch, covar_mod = imp) - -``` - -```{r after_combatseq, eval=run_combatseq} -# NOTE: Make sure the formula doens't contain the covariates used in combatseq above -dds_to_use <- DESeqDataSetFromMatrix(adjusted_counts, coldata, design = formula) -vsd_combat<- vst(dds_to_use, blind=FALSE) - -combat_matrix = assay(vsd_combat) - -pca_combat <- degPCA(combat_matrix, coldata, - condition = column) + ggtitle('After Combatseq') -pca_combat + scale_color_cb_friendly() - -``` - - -# Differential Expression - -Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model. - -Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit. - -We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. - -```{r DE} -de <- DESeq(dds_to_use) - -DESeq2::plotDispEsts(de) -``` - -Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level. - -```{r lfc_shrink} -# resultsNames(de) # check the order is right -resLFC = results(de, contrast=contrasts) -resLFCS <- lfcShrink(de, coef=coef, type="apeglm") - -res <- as.data.frame(resLFCS) %>% - rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% - relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>% - mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi) - -## Filter out genes that have no expression or were filtered out by DESEQ2 -res <- res[res$baseMean>0,] %>% drop_na(padj) %>% drop_na(pvalue) - -res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj) %>% - mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name)) - -res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5)) -show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")]) - -degMA(as.DEGSet(resLFC)) + ggtitle('Before LFC Shrinking') -``` - -## MA plot - -```{r after_lfc_shrink} -degMA(as.DEGSet(resLFCS), limit = 2) + ggtitle('After LFC Shrinking') - -``` - -## Volcano plot - -This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in red are genes that have padj < 0.05 and a log2-fold change > 1. Points in blue have a padj < 0.05 and a log2-fold change < 1 and points in green have a padj > 0.05 and a log2-fold change > 2. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2 foldchance and padj that we have chosen. - -```{r volcano_plot, fig.height=6} -# degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) -EnhancedVolcano(res_mod, - lab= res_mod$gene_name, - pCutoff = 0.05, - selectLab = c(res_sig$gene_name[1:15]), - FCcutoff = 0.5, - x = 'lfc', - y = 'padj', - title="Volcano Tumor vs. Normal", - col=as.vector(colors[c("dark_grey", "light_blue", - "purple", "purple")]), - subtitle = "", drawConnectors = T, max.overlaps = Inf) -``` - -## Heatmap - -```{r heapmap} -### Run pheatmap using the metadata data frame for the annotation -ma=norm_matrix[res_sig$gene_id,] -colma=coldata[,c(column), drop=FALSE] -colors=lapply(colnames(colma), function(c){ - l.col=colors[1:length(unique(colma[[c]]))] - names(l.col)=unique(colma[[c]]) - l.col -}) -names(colors)=colnames(colma) -pheatmap(ma, - color = inferno(10), - cluster_rows = T, - show_rownames = F, - annotation = colma, - annotation_colors = colors, - border_color = NA, - fontsize = 10, - scale = "row", - fontsize_row = 10, - height = 20) -``` - - -## Differentially Expressed Genes - -```{r sig_genes_table} -res_sig %>% sanitize_datatable -``` - -## Plot top 16 genes - -```{r top n DEGs, fig.height = 6, fig.width = 8} -n = 16 -top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% - dplyr::select(gene_name, gene_id) -top_n_exp <- norm_matrix %>% as.data.frame() %>% - rownames_to_column('gene_id') %>% - # dplyr::select(-group, -group_name) %>% - pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>% - right_join(top_n, relationship = "many-to-many") %>% - left_join(coldata, by = 'sample') - -ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + - geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + - geom_point() + - facet_wrap(~gene_name) + - ggtitle(str_interp('Expression of Top ${n} DEGs')) + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - -# Pathway Enrichment - -From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment. - -```{r} -universe=res %>% - filter(!is.na(padj)) %>% pull(gene_id) -mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENTREZID', 'ENSEMBL') - -all_in_life=list( - msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"), - msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), - msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG"), - msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), - msigdbr(species = "human", category = "C5", subcategory = "GO:BP"), - msigdbr(species = "human", category = "C5", subcategory = "GO:MF"), - msigdbr(species = "human", category = "C5", subcategory = "HPO"), - msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), - msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") -) - -ora_input = res %>% filter(!is.na(padj), padj<0.01, abs(lfc)>0.3) %>% pull(gene_id) -input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) - -total_deg=length(unique(ora_input))/length(unique(mapping$ENTREZID)) -pathways_ora_all = lapply(all_in_life, function(p){ - pathway = split(x = p$entrez_gene, f = p$gs_name) - db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") - respath <- fora(pathways = pathway, - genes = unique(input_entrezid$ENTREZID), - universe = unique(mapping$ENTREZID), - minSize = 15, - maxSize = 500) - coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], - pathway, unique(input_entrezid$ENTREZID), unique(mapping$ENTREZID)) - as_tibble(respath[pathway %in% coll_respath$mainPathways]) %>% - mutate(database=db_name, NES=(overlap/size)/(total_deg)) -}) %>% bind_rows() %>% - mutate(analysis="ORA") - -ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% - group_by(pathway) %>% - left_join(mapping, by =c("overlapGenes"="ENTREZID")) %>% - dplyr::select(pathway, padj, NES, ENSEMBL, analysis, - database) - -pathways_long = ora_tb - -``` - - -```{r pathaways_table} -pathways_ora_all %>% sanitize_datatable() -``` - - -```{r write-files} -counts_norm=norm_matrix %>% as.data.frame() %>% - rownames_to_column("gene_id") %>% - mutate(comparison = str_interp("${numerator}_vs_${denominator}")) - -res_for_writing <- res %>% - mutate(comparison = str_interp("${numerator}_vs_${denominator}")) - -pathways_for_writing <- pathways_long %>% - mutate(comparison = str_interp("${numerator}_vs_${denominator}")) - -if (!is.null(subset_value)){ - counts_norm <- counts_norm %>% - mutate(subset = subset_value) - res_for_writing <- res_for_writing %>% - mutate(subset = subset_value) - pathways_for_writing <- pathways_for_writing %>% - mutate(subset = subset_value) -} - -write_csv(counts_norm, name_expression_fn) -write_csv(res_for_writing, name_deg_fn) -write_csv(pathways_for_writing, name_pathways_fn) -``` - -# R session - -List and version of tools used for the DE report generation. - -```{r} -sessionInfo() -``` diff --git a/inst/templates/rnaseq/QC/params_qc.R b/inst/templates/rnaseq/QC/params_qc.R deleted file mode 100644 index 84dabfa..0000000 --- a/inst/templates/rnaseq/QC/params_qc.R +++ /dev/null @@ -1,4 +0,0 @@ -# info params - -metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv' -se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") diff --git a/inst/templates/rnaseq/QC/params_qc_nf-core.R b/inst/templates/rnaseq/QC/params_qc_nf-core.R deleted file mode 100644 index 08b3ec0..0000000 --- a/inst/templates/rnaseq/QC/params_qc_nf-core.R +++ /dev/null @@ -1,11 +0,0 @@ -# info params - -# Your data -# This is the file used to run nf-core or compatible to that -metadata_fn='/Path/to/metadata/meta.csv' -# This file is inside star_salmon/ folder -se_object='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.rds' -# This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder -multiqc_data_dir='/path/to/nf-core/output/multiqc/star_salmon/multiqc_report_data' -# This file is inside the genome folder in the output directory, use this only for non-model organism -# gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' diff --git a/inst/templates/rnaseq/QC/run_markdown.R b/inst/templates/rnaseq/QC/run_markdown.R deleted file mode 100644 index 51acbef..0000000 --- a/inst/templates/rnaseq/QC/run_markdown.R +++ /dev/null @@ -1,13 +0,0 @@ -library(rmarkdown) - -# set directory to this file folder -setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) -# example running with test data -rmarkdown::render("QC_nf-core.Rmd", - output_dir = ".", - clean = TRUE, - output_format = "html_document", - params = list( - params_file = 'params_qc_nf-core-testdata.R', - project_file = '../information.R') - ) diff --git a/inst/templates/rnaseq/README.md b/inst/templates/rnaseq/README.md deleted file mode 100644 index 6de2117..0000000 --- a/inst/templates/rnaseq/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# Guideline for RNAseq downstream analysis - -Make sure there is a project name for this. - -## Run data with nf-core rnaseq - -- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) -- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input/rawdata` folder -- Prepare the CSV file according this [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: - -```csv -sample,fastq_1,fastq_2,strandedness -CONTROL_REP1,s3path/AEG588A1_S1_L002_R1_001.fastq.gz,s3path/AEG588A1_S1_L002_R2_001.fastq.gz,auto -CONTROL_REP1,s3path/AEG588A1_S1_L003_R1_001.fastq.gz,s3path/AEG588A1_S1_L003_R2_001.fastq.gz,auto -CONTROL_REP1,s3path/AEG588A1_S1_L004_R1_001.fastq.gz,s3path/AEG588A1_S1_L004_R2_001.fastq.gz,auto -``` - -Use `bcbio_nfcore_check(csv_file)` to check the file is correct. - -You can add more columns to this file with more metadata, and use this file as the `coldata` file in the templates. - -- Upload file to our `Datasets` in Seqera using the name of the project but starting with `rnaseq-pi_lastname-hbc_code` -- Go to `Launchpad`, select `nf-core_rnaseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` - - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 -- When pipeline is down, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder - -## Downstream analysis - -Please, modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. - -### QC - -`QC/QC.Rmd` is a template for QC metrics. Use `params_qc.R` for `bcbio` - or `QC/QC_nf-core.Rmd` `params_qc_nf-core.R` for `nf-core/rnaseq` outputs. - -Read instruction in the R and Rmd scripts to render it. - -### DE - -`DE/DEG.Rmd` is a template for two groups comparison. `params_de.R` has the information of the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. - -On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: - -- sub-setting data -- two groups comparison -- volcano plot -- MA plot -- Pathway analysis -- Tables - -## DropBox - -- In `reports/QC` - - [ ] copy `bcbio-se.rds` and `tximport-counts.csv` - - [ ] copy QC `Rmd/R/html/figures` -- In `reports/DE` - - [ ] Normalized counts for all genes x all samples (csv format) -- In `reports/DE`, for *each analysis*: - - **Note** For multiple comparisons/analysis, do a single report/template if possible in the parent folder using parameters whenever possible. - - Create a folder with the comparison names in the files. Numbering by comparison (`01.1_DE_comp1`, `01.2_DE_comp2`, etc.). If you’re running multiple models for the same comparison, append `_M#`. Add the following files under each folder: - - [ ] Normalized count table with the samples used in this analysis/comparison. - - [ ] Full results `DESeq2` for all genes (csv format) with annotation columns appended. - - [ ] Significant genes results file (subset of annotated full results by chosen p-value and LFC). Separate files will be created for each individual contrast. - - [ ] Significant genes results file as described above, but additionally append columns containing normalized count values for each sample. - - Make sure to append the gene symbols to these tables so the researcher can interpret the results. - diff --git a/inst/templates/rnaseq/de/DEG.Rmd b/inst/templates/rnaseq/de/DEG.Rmd deleted file mode 100644 index c2069be..0000000 --- a/inst/templates/rnaseq/de/DEG.Rmd +++ /dev/null @@ -1,581 +0,0 @@ ---- -title: "Differential Expression" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - # Put hg38, mm10, mm39, or other - - ## Combatseq and ruv can both be false or ONLY ONE can be true - ## Both cannot be true - numerator: tumor - denominator: normal - column: sample_type - subset_column: null - subset_value: null - genome: hg38 - ruv: false - combatseq: false - params_file: params_de-example.R - project_file: ../information.R - functions_file: load_data.R ---- - - -```{r} -# This set up the working directory to this file so all files can be found -library(rstudioapi) -setwd(fs::path_dir(getSourceEditorContext()$path)) -``` - - -```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} -# 1. Set up input files in this R file (params_de.R) -source(params$params_file) -# 2. Set up project file (already done from QC probably) -source(params$project_file) -# 3. Load custom functions to load data from coldata/metrics/counts -source(params$functions_file) -# IMPORTANT set these values if you are not using the parameters in the header (lines 22-31) -genome=params$genome -column=params$column -numerator=params$numerator -denominator=params$denominator -subset_column=params$subset_column -subset_value=params$subset_value -run_ruv=params$ruv -run_combatseq=params$combatseq -factor_of_interest <- column -``` - - -```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} -library(rtracklayer) -library(DESeq2) -library(tidyverse) -library(stringr) -library(DEGreport) -library(ggpubr) -library(msigdbr) -library(fgsea) -library(org.Hs.eg.db) -library(knitr) -library(EnhancedVolcano) -library(bcbioR) -library(ggprism) -library(viridis) -library(pheatmap) -library(janitor) -library(ggforce) -library(vegan) - -colors=cb_friendly_cols(1:15) -ggplot2::theme_set(theme_prism(base_size = 14)) -opts_chunk[["set"]]( - cache = F, - cache.lazy = FALSE, - dev = c("png", "pdf"), - error = TRUE, - highlight = TRUE, - message = FALSE, - prompt = FALSE, - tidy = FALSE, - warning = FALSE, - echo = T, - fig.height = 4) - -# set seed for reproducibility -set.seed(1234567890L) -``` - -```{r sanitize_datatable} -sanitize_datatable = function(df, ...) { - # remove dashes which cause wrapping - DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), - colnames=gsub("-", "_", colnames(df))) -} -``` - - - -```{r load_data, message=F, warning=F} -# This code will load from bcbio or nf-core folder -# NOTE make sure to set numerator and denominator -coldata <- load_coldata(coldata_fn, column, - numerator, denominator, - subset_column, subset_value) -coldata$sample=row.names(coldata) - -counts <- load_counts(counts_fn) -counts <- counts[,colnames(counts) %in% coldata$sample] - -metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% - left_join(coldata, by = c('sample')) %>% - as.data.frame() -rownames(metrics) <- metrics$sample -# if the names don't match in order or string check files names and coldata information -counts = counts[,rownames(metrics)] -coldata = coldata[rownames(metrics),] -stopifnot(all(names(counts) == rownames(metrics))) -``` - - - -# Overview - -- Project: `r project` -- PI: `r PI` -- Analyst: `r analyst` -- Experiment: `r experiment` -- Aim: `r aim` -- Comparison: `r ifelse(is.null(subset_value), paste0(numerator, ' vs. ', denominator), paste0(subset_value, ': ', numerator, ' vs. ', denominator))` - -```{r create_filenames} - -if (!is.null(subset_value) & !is.null(subset_value)){ - filenames = str_interp("${subset_value}_${numerator}_vs_${denominator}") -} else { - filenames = str_interp("${numerator}_vs_${denominator}") -} - -contrasts = c(column,numerator,denominator) -coef=paste0(column,"_",numerator,"_vs_",denominator) - -name_expression_fn=file.path( - basedir, - str_interp("${filenames}_expression.csv")) -name_deg_fn=file.path( - basedir, - str_interp("${filenames}_deg.csv")) -name_pathways_fn=file.path( - basedir, - str_interp("${filenames}_pathways.csv")) - -``` - -```{r load_counts_data} - -rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL') %>% - dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) %>% - distinct(gene_id, .keep_all = TRUE) - -``` - -```{r setup_RUV} -dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1) - -vsd_before <- vst(dds_to_use) -norm_matrix = assay(vsd_before) -``` - - -# PCA and group level variance. - -**Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).** - -Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm. - -**It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.** - -**This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.** - -```{r set group, eval=FALSE, echo=FALSE} -## Example of creating a group covariate - -meta$group <- paste0(meta$sex,"_", meta$age,"_",meta$treatment) - -factor_of_interest <- "insert column name for covariate of interest" -``` - - -```{r PCA} -pca <- degPCA(norm_matrix, metrics, - condition = factor_of_interest, name = "sample", data = T) - -pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) + - theme(plot.title=element_text(hjust=0.5)) + - geom_mark_ellipse(aes(color = sample_type)) + scale_color_cb_friendly() -``` - -## PERMDISP - -Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) - -Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison. - -```{r PERMDISP} -vare.disa <- vegdist(t(assay(vsd_before))) - -mod = betadisper(vare.disa, metrics[[factor_of_interest]]) -anova(mod) -permutest(mod, pairwise = TRUE) - -``` - - - -# Covariate analysis - -Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values. - -Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.** - - -```{r covariates, fig.height = 6, fig.width = 10} -degCovariates( - norm_matrix, - metrics, -) -``` - - - -```{r init_DESEQ} -formula <- as.formula(paste0("~ ", " + ", column)) -## Check if sample name matches -stopifnot(all(names(counts) == rownames(coldata))) - -dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = formula) - -vsd_before <- vst(dds_to_use) -norm_matrix = assay(vsd_before) -new_cdata <- coldata -``` - - -```{r, eval=F, echo=FALSE} -#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION - -### RUV - LINES 261-296 -### COMBATSEQ - LINES 303-369 -``` - - - -```{r, eval=run_ruv, results='asis', echo=run_ruv} -cat("# Remove Unwanted Variability - -When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") -``` - -```{r do_RUV, eval=run_ruv, echo=run_ruv} -library(RUVSeq) - -# If you want to skip the code, just set up formula to be your model in the next chunk of code -design <- coldata[[column]] -diffs <- makeGroups(design) -dat <- norm_matrix -# by default is running one variable, -# change K parameter to other number to find more unknown covariates -ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F) -vars <- ruvset$W - -new_cdata <- cbind(coldata, vars) - -formula <- as.formula(paste0("~ ", - paste0( - colnames(new_cdata)[grepl("W", colnames(new_cdata))], - collapse = " + " - ), " + ", column) -) -norm_matrix=ruvset$normalizedCounts -pca2 <- degPCA(norm_matrix, new_cdata, - condition = column) + ggtitle('After RUV') -pca2 + scale_color_cb_friendly() - -``` - -```{r after_RUV, eval=run_ruv} - -dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) -vsd_to_use<- vst(dds_to_use, blind=FALSE) - -``` - -```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq} -library(sva) - -cat("# Remove Batch Effects - -Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. - -Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).") - -``` - - -```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq} - -## FILL OUT THIS CHUNK OF CODE IF YOU WANT TO RUN COMBATSEQ - -## Set your batch effect variable here this is the variable that combatseq will try to remove - -## Column name of your batch variable -to_remove = "batch" - -## Column name of of your variable(s) of interest - -to_keep = "sample_type" - - -coldata[[to_remove]] <- as.factor(coldata[[to_remove]]) -coldata[[to_keep]] <- as.factor(coldata[[to_keep]]) - - -batch = coldata[[to_remove]] -treatment = coldata[[to_keep]] - -## If you have multiple variables of interest you will need to cbind them into one variable - -#treatment1 = metrics[[to_keep]] -#treatment2 = metrics[[to_keep]] -#treatment3 = metrics[[to_keep]] - - -# imp = cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3))) - -``` - - -```{r do_combatseq, eval=run_combatseq} -adjusted_counts <- ComBat_seq(as.matrix(counts), batch=batch, group = treatment) - -## For multiple variables of interest - -# adjusted_counts <- ComBat_seq(as.matrix(counts2), batch=batch, covar_mod = imp) - -``` - -```{r after_combatseq, eval=run_combatseq} -# NOTE: Make sure the formula doens't contain the covariates used in combatseq above -dds_to_use <- DESeqDataSetFromMatrix(adjusted_counts, coldata, design = formula) -vsd_combat<- vst(dds_to_use, blind=FALSE) - -combat_matrix = assay(vsd_combat) - -pca_combat <- degPCA(combat_matrix, coldata, - condition = column) + ggtitle('After Combatseq') -pca_combat + scale_color_cb_friendly() - -``` - - -# Differential Expression - -Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model. - -Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit. - -We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. - -```{r DE} -de <- DESeq(dds_to_use) - -DESeq2::plotDispEsts(de) -``` - -Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level. - -```{r lfc_shrink} -# resultsNames(de) # check the order is right -resLFC = results(de, contrast=contrasts) -resLFCS <- lfcShrink(de, coef=coef, type="apeglm") - -res <- as.data.frame(resLFCS) %>% - rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% - relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>% - mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi) - -## Filter out genes that have no expression or were filtered out by DESEQ2 -res <- res[res$baseMean>0,] %>% drop_na(padj) %>% drop_na(pvalue) - -res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj) %>% - mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name)) - -res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5)) -show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")]) - -degMA(as.DEGSet(resLFC)) + ggtitle('Before LFC Shrinking') -``` - -## MA plot - -```{r after_lfc_shrink} -degMA(as.DEGSet(resLFCS), limit = 2) + ggtitle('After LFC Shrinking') - -``` - -## Volcano plot - -This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in red are genes that have padj < 0.05 and a log2-fold change > 1. Points in blue have a padj < 0.05 and a log2-fold change < 1 and points in green have a padj > 0.05 and a log2-fold change > 2. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2 foldchance and padj that we have chosen. - -```{r volcano_plot, fig.height=6} -# degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) -EnhancedVolcano(res_mod, - lab= res_mod$gene_name, - pCutoff = 0.05, - selectLab = c(res_sig$gene_name[1:15]), - FCcutoff = 0.5, - x = 'lfc', - y = 'padj', - title="Volcano Tumor vs. Normal", - col=as.vector(colors[c("dark_grey", "light_blue", - "purple", "purple")]), - subtitle = "", drawConnectors = T, max.overlaps = Inf) -``` - -## Heatmap - -```{r heapmap} -### Run pheatmap using the metadata data frame for the annotation -ma=norm_matrix[res_sig$gene_id,] -colma=coldata[,c(column), drop=FALSE] -colors=lapply(colnames(colma), function(c){ - l.col=colors[1:length(unique(colma[[c]]))] - names(l.col)=unique(colma[[c]]) - l.col -}) -names(colors)=colnames(colma) -pheatmap(ma, - color = inferno(10), - cluster_rows = T, - show_rownames = F, - annotation = colma, - annotation_colors = colors, - border_color = NA, - fontsize = 10, - scale = "row", - fontsize_row = 10, - height = 20) -``` - - -## Differentially Expressed Genes - -```{r sig_genes_table} -res_sig %>% sanitize_datatable -``` - -## Plot top 16 genes - -```{r top n DEGs, fig.height = 6, fig.width = 8} -n = 16 -top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% - dplyr::select(gene_name, gene_id) -top_n_exp <- norm_matrix %>% as.data.frame() %>% - rownames_to_column('gene_id') %>% - # dplyr::select(-group, -group_name) %>% - pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>% - right_join(top_n, relationship = "many-to-many") %>% - left_join(coldata, by = 'sample') - -ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + - geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + - geom_point() + - facet_wrap(~gene_name) + - ggtitle(str_interp('Expression of Top ${n} DEGs')) + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - -# Pathway Enrichment - -From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment. - -```{r} -universe=res %>% - filter(!is.na(padj)) %>% pull(gene_id) -mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENTREZID', 'ENSEMBL') - -all_in_life=list( - msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"), - msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), - msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG"), - msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), - msigdbr(species = "human", category = "C5", subcategory = "GO:BP"), - msigdbr(species = "human", category = "C5", subcategory = "GO:MF"), - msigdbr(species = "human", category = "C5", subcategory = "HPO"), - msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), - msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") -) - -ora_input = res %>% filter(!is.na(padj), padj<0.01, abs(lfc)>0.3) %>% pull(gene_id) -input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) - -total_deg=length(unique(ora_input))/length(unique(mapping$ENTREZID)) -pathways_ora_all = lapply(all_in_life, function(p){ - pathway = split(x = p$entrez_gene, f = p$gs_name) - db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") - respath <- fora(pathways = pathway, - genes = unique(input_entrezid$ENTREZID), - universe = unique(mapping$ENTREZID), - minSize = 15, - maxSize = 500) - coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], - pathway, unique(input_entrezid$ENTREZID), unique(mapping$ENTREZID)) - as_tibble(respath[pathway %in% coll_respath$mainPathways]) %>% - mutate(database=db_name, NES=(overlap/size)/(total_deg)) -}) %>% bind_rows() %>% - mutate(analysis="ORA") - -ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% - group_by(pathway) %>% - left_join(mapping, by =c("overlapGenes"="ENTREZID")) %>% - dplyr::select(pathway, padj, NES, ENSEMBL, analysis, - database) - -pathways_long = ora_tb - -``` - - -```{r pathaways_table} -pathways_ora_all %>% sanitize_datatable() -``` - - -```{r write-files} -counts_norm=norm_matrix %>% as.data.frame() %>% - rownames_to_column("gene_id") %>% - mutate(comparison = str_interp("${numerator}_vs_${denominator}")) - -res_for_writing <- res %>% - mutate(comparison = str_interp("${numerator}_vs_${denominator}")) - -pathways_for_writing <- pathways_long %>% - mutate(comparison = str_interp("${numerator}_vs_${denominator}")) - -if (!is.null(subset_value)){ - counts_norm <- counts_norm %>% - mutate(subset = subset_value) - res_for_writing <- res_for_writing %>% - mutate(subset = subset_value) - pathways_for_writing <- pathways_for_writing %>% - mutate(subset = subset_value) -} - -write_csv(counts_norm, name_expression_fn) -write_csv(res_for_writing, name_deg_fn) -write_csv(pathways_for_writing, name_pathways_fn) -``` - -# R session - -List and version of tools used for the DE report generation. - -```{r} -sessionInfo() -``` diff --git a/inst/templates/rnaseq/de/load_data.R b/inst/templates/rnaseq/de/load_data.R deleted file mode 100644 index 8a1d297..0000000 --- a/inst/templates/rnaseq/de/load_data.R +++ /dev/null @@ -1,146 +0,0 @@ -library(tidyverse) -library(SummarizedExperiment) -library(janitor) -load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){ - - # bcbio input - if (!is.na(se_object)){ - - se <- readRDS(se_object) - metrics <- metadata(se)$metrics %>% as.data.frame() - # left_join(coldata %>% rownames_to_column('sample')) %>% column_to_rownames('sample') - } else { #nf-core input - - # Get metrics from nf-core into bcbio like table - # many metrics are already in the Genereal Table of MultiQC, this reads the file - metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) - - # we get some more metrics from Qualimap and rename columns - metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) - metrics <- metrics %>% full_join(metrics_qualimap) - metrics <- metrics %>% - clean_names() %>% - dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) - - # This uses the fastqc metrics to get total reads - total_reads <- metrics %>% - dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>% - remove_empty(which = 'cols') %>% - dplyr::rename(single_sample = sample) %>% - mutate(sample = gsub('_[12]+$', '', single_sample)) %>% - group_by(sample) %>% - summarize(total_reads = sum(fastqc_raw_total_sequences)) - - # This renames to user-friendly names the metrics columns - metrics <- metrics %>% - dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% - remove_empty(which = 'cols') %>% - full_join(total_reads) %>% - mutate(mapped_reads = samtools_reads_mapped) %>% - mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% - mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% - mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% - mutate(x5_3_bias = qualimap_5_3_bias) - - # Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case - gtf <- NULL - if (genome =="other"){ - gtf <- gtf_fn - }else{ - if (genome == "hg38") { - gtf <- "hg38.rna.gtf.gz" - } else if (genome == "mm10") { - gtf <- "mm10.rna.gtf.gz" - } else if (genome == "mm39") { - gtf <- "mm39.rna.gtf.gz" - } - gtf <- system.file("extdata", "annotation", - gtf, - package="bcbioR") - } - if (is.null(gtf)) { - print("No genome provided! Please add it at the top of this Rmd") - } - - gtf=rtracklayer::import(gtf) - - - one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) - another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) - biotype=NULL - if(length(one)==1){ - biotype=one - }else if(length(another)==1){ - biotype=another - }else{ - warning("No gene biotype founded") - } - - if (!is.null(biotype)){ - annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] - rRNA=grepl("rRNA|tRNA",annotation[[biotype]]) - genes=intersect(annotation[rRNA,"gene_id"],row.names(counts)) - ratio=data.frame(sample=colnames(counts), - r_and_t_rna_rate=colSums(counts[genes,])/colSums(counts)) - metrics = left_join(metrics, ratio, by="sample") - }else{ - metrics[["r_and_t_rna_rate"]] <- NA - } - - # if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){ - # metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna) - # }else{ - # metrics[["r_rna_rate"]] <- NA - # } - metrics=metrics[,c("sample","mapped_reads","exonic_rate","intronic_rate", - "total_reads", - "x5_3_bias", "r_and_t_rna_rate","intergenic_rate")] - } - metrics$sample <- make.names(metrics$sample) - rownames(metrics) <- metrics$sample - return(metrics) -} - -load_coldata <- function(coldata_fn, column, numerator, denominator, subset_column = NULL, subset_value = NULL){ - coldata=read.csv(coldata_fn) %>% - dplyr::select(!matches("fastq") & !matches("strandness")) %>% - distinct() - if('description' %in% names(coldata)){ - coldata$sample <- tolower(coldata$description) - } - coldata <- coldata %>% distinct(sample, .keep_all = T) - stopifnot(column %in% names(coldata)) - - # use only some samples, by default use all - if (!is.null(subset_column)){ - coldata <- coldata[coldata[[paste(subset_column)]] == subset_value, ] - } - #coldata <- coldata[coldata[[paste(column)]] %in% c(numerator, denominator), ] - #browser() - coldata$sample <- make.names(coldata$sample) - rownames(coldata) <- coldata$sample - coldata$description <- coldata$sample - - coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) - - return(coldata) -} - -load_counts <- function(counts_fn){ - - # bcbio input - if(grepl('csv', counts_fn)){ - counts <- read_csv(counts_fn) %>% - mutate(gene = str_replace(gene, pattern = "\\.[0-9]+$", "")) %>% - column_to_rownames('gene') - colnames(counts) = tolower(colnames(counts)) - return(counts) - } else { # nf-core input - counts <- read_tsv(counts_fn) %>% dplyr::select(-gene_name) %>% - mutate(gene_id = str_replace(gene_id, pattern = "\\.[0-9]+$", "")) %>% - column_to_rownames('gene_id') %>% round - - return(counts) - } - -} diff --git a/inst/templates/rnaseq/information.R b/inst/templates/rnaseq/information.R deleted file mode 100644 index 6e15eef..0000000 --- a/inst/templates/rnaseq/information.R +++ /dev/null @@ -1,6 +0,0 @@ -# info params -project = "name_hbcXXXXX" -PI = 'person name' -experiment = 'short description' -aim = 'short description' -analyst = 'person in the core' diff --git a/inst/templates/rnaseq/org/hcbc_README.md b/inst/templates/rnaseq/org/hcbc_README.md deleted file mode 100644 index 50f8f16..0000000 --- a/inst/templates/rnaseq/org/hcbc_README.md +++ /dev/null @@ -1,75 +0,0 @@ -# Guideline for RNAseq downstream analysis - -Make sure there is a project name for this. - -## Run data with nf-core rnaseq - -- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) -- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input/rawdata` folder -- Prepare the CSV file according this [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: - -```csv -sample,fastq_1,fastq_2,strandedness -CONTROL_REP1,s3path/AEG588A1_S1_L002_R1_001.fastq.gz,s3path/AEG588A1_S1_L002_R2_001.fastq.gz,auto -CONTROL_REP1,s3path/AEG588A1_S1_L003_R1_001.fastq.gz,s3path/AEG588A1_S1_L003_R2_001.fastq.gz,auto -CONTROL_REP1,s3path/AEG588A1_S1_L004_R1_001.fastq.gz,s3path/AEG588A1_S1_L004_R2_001.fastq.gz,auto -``` - -Use `bcbio_nfcore_check(csv_file)` to check the file is correct. - -You can add more columns to this file with more metadata, and use this file as the `coldata` file the templates. - -- Upload file to our `Datasets` in Seqera using the name of the project but starting with `nfcore-rnaseq` -- Go to `Launchpad`, select `nf-core_rnaseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` - - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 -- When pipeline is down, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder - -## Downstream analysis - -Please, modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. - -### QC - -`QC/QC.Rmd` is a template for QC metrics. Use `params_qc.R` for `bcbio` - or `QC/QC_nf-core.Rmd` `params_qc_nf-core.R` for `nf-core/rnaseq` outputs. - -Read instruction in the R and Rmd scripts to render it. - -### DE - -`DE/DEG.Rmd` is a template for two groups comparison. `params_de.R` has the information of the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. - -On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: - -- sub-setting data -- two groups comparison -- volcano plot -- MA plot -- Pathway analysis -- Tables - -There are some code related to alternative analysis: - -- `DE/Multiplicative_DE_docs.md` that shows some cases when there is multiple variables in the model with multiple levels: sex (2 levels) and genotype (4 levels) - -## DropBox - -- In `reports/QC` - - [ ] copy `bcbio-se.rds` and `tximport-counts.csv` - - [ ] copy QC `Rmd/R/html/figures` -- In `reports/DE` - - [ ] Normalized counts for all genes x all samples (csv format) -- In `reports/DE`, for *each analysis*: - - **Note** For multiple comparisons/analysis, do a single report/template if possible in the parent folder using parameters whenever possible. - - Create a folder with the comparison names in the files. Numbering by comparison (`01.1_DE_comp1`, `01.2_DE_comp2`, etc.). If you’re running multiple models for the same comparison, append `_M#`. Add the following files under each folder: - - [ ] Normalized count table with the samples used in this analysis/comparison. - - [ ] Full results `DESeq2` for all genes (csv format) with annotation columns appended. - - [ ] Significant genes results file (subset of annotated full results by chosen p-value and LFC). Separate files will be created for each individual contrast. - - [ ] Significant genes results file as described above, but additionally append columns containing normalized count values for each sample. - - Make sure to append the gene symbols to these tables so the researcher can interpret the results. - -## GitHub - -- [ ] Push all `*Rmd` `*R` files used for the *QC* and *DE* analysis respecting folder structure. - -Please, ignore `*html/figures/csv` and any output of the code. diff --git a/inst/templates/rnaseq/qc/QC-bcbio.Rmd b/inst/templates/rnaseq/qc/QC-bcbio.Rmd deleted file mode 100644 index aca91c1..0000000 --- a/inst/templates/rnaseq/qc/QC-bcbio.Rmd +++ /dev/null @@ -1,409 +0,0 @@ ---- -title: "Quality Control" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - params_file: params_qc.R ---- - - -```{r source_params, echo = F} -metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv' -se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") -``` - -```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} -library(tidyverse) -library(knitr) -library(DESeq2) -library(DEGreport) -library(ggrepel) -library(pheatmap) -# library(RColorBrewer) -library(DT) -library(pheatmap) -library(bcbioR) -ggplot2::theme_set(theme_light(base_size = 14)) -opts_chunk[["set"]]( - cache = FALSE, - cache.lazy = FALSE, - dev = c("png", "pdf"), - error = TRUE, - highlight = TRUE, - message = FALSE, - prompt = FALSE, - tidy = FALSE, - warning = FALSE, - fig.height = 4) -``` - - -```{r subchunkify, echo=FALSE, eval=FALSE} -#' Create sub-chunks for plots -#' -#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots -#' -#' @param pl a plot object -#' @param fig.height figure height -#' @param fig.width figure width -#' @param chunk_name name of the chunk -#' -#' @author Andreas Scharmueller \email{andschar@@protonmail.com} -#' -subchunkify = function(pl, - fig.height = 7, - fig.width = 5, - chunk_name = 'plot') { - pl_deparsed = paste0(deparse(function() { - pl - }), collapse = '') - - sub_chunk = paste0( - "```{r ", - chunk_name, - ", fig.height=", - fig.height, - ", fig.width=", - fig.width, - ", dpi=72", - ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}", - "\n(", - pl_deparsed, - ")()", - "\n```" - ) - - cat(knitr::knit( - text = knitr::knit_expand(text = sub_chunk), - quiet = TRUE - )) -} - -``` - - -```{r sanitize-datatable} -sanitize_datatable = function(df, ...) { - # remove dashes which cause wrapping - DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), - colnames=gsub("-", "_", colnames(df))) -} -``` - -# Overview - -- Project: `r project` -- PI: `r PI` -- Analyst: `r analyst` -- Experiment: `r experiment` -- Aim: `r aim` - - -# Samples and metadata - -```{r load_metadata} -meta_df=read_csv(metadata_fn) %>% mutate(sample = tolower(description)) %>% - dplyr::select(-description) - -ggplot(meta_df, aes(sample_type, fill = sample_type)) + - geom_bar() + ylab("") + xlab("") + - scale_fill_cb_friendly() -``` - - -```{r show-metadata} -se <- readRDS(se_object) #local - - -metrics <- metadata(se)$metrics %>% - full_join(meta_df , by = c("sample" = "sample")) - -meta_sm <- meta_df %>% - as.data.frame() %>% - column_to_rownames("sample") - -meta_sm %>% sanitize_datatable() - -``` - -# Read metrics {.tabset} - -## Total reads - -Here, we want to see consistency and a minimum of 20 million reads. - -```{r plot_total_reads} -metrics %>% - ggplot(aes(x = sample_type, - y = total_reads, - color = sample_type)) + - geom_point(alpha=0.5) + - coord_flip() + - scale_y_continuous(name = "million reads") + - scale_color_cb_friendly() + - ggtitle("Total reads") - -``` - -```{r calc_min_max_pct_mapped} -#get min percent mapped reads for reference -min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1) -max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) -``` - -## Mapping rate - -The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping. These samples have mapping rates (`r min_pct_mapped` - `r max_pct_mapped`%). - -```{r plot_mapping_rate} -metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1) -metrics %>% - ggplot(aes(x = sample_type, - y = mapped_reads_pct, - color = sample_type)) + - geom_point() + - coord_flip() + - scale_color_cb_friendly() + - ylim(0, 100) + - ggtitle("Mapping rate") + - geom_hline(yintercept=70, color = cb_friendly_cols('blue')) -``` - - -## Number of genes detected - -The number of genes represented in every sample is expected to be consistent and over 20K (blue line). - -```{r plot_genes_detected} -genes_detected <- colSums(assays(se)[["raw"]] > 0) %>% enframe() -sample_names <- metrics[,c("sample"), drop=F] -genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample")) -genes_detected <- genes_detected %>% group_by(name) -genes_detected <- summarise(genes_detected, - n_genes = max(value)) - -metrics <- metrics %>% - left_join(genes_detected, by = c("sample" = "name")) -ggplot(metrics,aes(x = sample_type, - y = n_genes, color = sample_type)) + - geom_point() + - coord_flip() + - scale_color_cb_friendly() + - ggtitle("Number of genes") + - ylab("Number of genes") + - xlab("") + - geom_hline(yintercept=20000, color = cb_friendly_cols('blue')) -``` - - -## Gene detection saturation - -This plot shows how complex the samples are. We expect samples with more reads to detect more genes. - -```{r plot_gene_saturation} -metrics %>% - ggplot(aes(x = total_reads, - y = n_genes, - color = sample_type)) + - geom_point()+ - scale_x_log10() + - scale_color_cb_friendly() + - ggtitle("Gene saturation") + - ylab("Number of genes") -``` - -## Exonic mapping rate - -Here we are looking for consistency, and exonic mapping rates around 70% or 75% (blue and red lines, respectively). - -```{r plot_exonic_mapping_rate} -metrics %>% - ggplot(aes(x = sample_type, - y = exonic_rate * 100, - color = sample_type)) + - geom_point() + - ylab("Exonic rate %") + - ggtitle("Exonic mapping rate") + - scale_color_cb_friendly() + - coord_flip() + - xlab("") + - ylim(c(0,100)) + - geom_hline(yintercept=70, color = cb_friendly_cols('blue')) + - geom_hline(yintercept=75, color = cb_friendly_cols('brown')) -``` - -## Intronic mapping rate - -Here, we expect a low intronic mapping rate (≤ 15% - 20%) - -```{r plot_intronic_mapping_rate} -metrics %>% - ggplot(aes(x = sample_type, - y = intronic_rate * 100, - color = sample_type)) + - geom_point() + - ylab("Intronic rate %") + - ggtitle("Intronic mapping rate") + - scale_color_cb_friendly() + - coord_flip() + - xlab("") + - ylim(c(0,100)) + - geom_hline(yintercept=20, color = cb_friendly_cols('blue')) + - geom_hline(yintercept=15, color = cb_friendly_cols('brown')) -``` - -## Intergenic mapping rate - -Here, we expect a low intergenic mapping rate, which is true for all samples. - -```{r plot_intergenic_mapping_rate} -metrics %>% - ggplot(aes(x = sample_type, - y = intergenic_rate * 100, - color = sample_type)) + - geom_point() + - ylab("Intergenic rate %") + - ggtitle("Intergenic mapping rate") + - coord_flip() + - scale_color_cb_friendly() + - ylim(c(0, 100)) -``` - -## rRNA mapping rate - -Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10% - -```{r plot_rrna_mapping_rate} -# for some bad samples it could be > 50% -rrna_ylim <- max(round(metrics$r_rna_rate*100, 2)) + 10 -metrics %>% - ggplot(aes(x = sample_type, - y = r_rna_rate * 100, - color = sample_type)) + - geom_point() + - ylab("rRNA rate, %")+ - ylim(0, rrna_ylim) + - ggtitle("rRNA mapping rate") + - coord_flip() + - scale_color_cb_friendly() -``` - -## 5'->3' bias - -There should be little bias, i.e. the values should be close to 1, or at least consistent among samples - -```{r plot_53_bias} -metrics %>% - ggplot(aes(x = sample_type, - y = x5_3_bias, - color = sample_type)) + - geom_point() + - ggtitle("5'-3' bias") + - coord_flip() + - ylim(c(0.5,1.5)) + - scale_color_cb_friendly()+ - geom_hline(yintercept=1, color = cb_friendly_cols('blue')) -``` - -## Counts per gene - all genes - -We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar - -```{r plot_counts_per_gene} -metrics_small <- metrics %>% dplyr::select(sample, sample_type) -metrics_small <- left_join(sample_names, metrics_small) - -counts <- - assays(se)[["raw"]] %>% - as_tibble() %>% - filter(rowSums(.)!=0) %>% - gather(name, counts) - -counts <- left_join(counts, metrics, by = c("name" = "sample")) - -ggplot(counts, aes(sample_type, - log2(counts+1), - fill = sample_type)) + - geom_boxplot() + - scale_fill_cb_friendly() + - ggtitle("Counts per gene, all non-zero genes") + - scale_color_cb_friendly() -``` - - -# Sample similarity analysis - -In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA). - -## Principal component analysis (PCA) {.tabset} - -Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells). - - -```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} -raw_counts <- assays(se)[["raw"]] %>% - as_tibble() %>% - filter(rowSums(.)!=0) %>% - as.matrix() - -vst <- vst(raw_counts) - -#fix samples names -coldat_for_pca <- as.data.frame(metrics) -rownames(coldat_for_pca) <- coldat_for_pca$sample -coldat_for_pca <- coldat_for_pca[colnames(raw_counts),] -pca1 <- degPCA(vst, coldat_for_pca, - condition = "sample_type", data = T)[["plot"]] -pca2 <- degPCA(vst, coldat_for_pca, - condition = "sample_type", data = T, pc1="PC3", pc2="PC4")[["plot"]] - -pca1 + scale_color_cb_friendly() -pca2 + scale_color_cb_friendly() -``` - - -```{r, eval=FALSE} -variables=degCovariates(vst, coldat_for_pca) -``` - - -```{r clustering fig, fig.width = 10, fig.asp = .62} -## Hierarchical clustering - -vst_cor <- cor(vst) - -annotation_cols <- cb_friendly_pal('grey')(length(unique(coldat_for_pca$sample_type))) -names(annotation_cols) <- unique(coldat_for_pca$sample_type) - -p <- pheatmap(vst_cor, - annotation = coldat_for_pca %>% select(sample_type) %>% mutate(sample_type = as.factor(sample_type)), - show_rownames = T, - show_colnames = T, - color = cb_friendly_pal('heatmap')(15), - annotation_colors = list(sample_type = annotation_cols) -) -p - -``` - -# R session - -List and version of tools used for the QC report generation. - -```{r} -sessionInfo() -``` diff --git a/inst/templates/rnaseq/qc/QC_nf-core.Rmd b/inst/templates/rnaseq/qc/QC_nf-core.Rmd deleted file mode 100644 index dee4712..0000000 --- a/inst/templates/rnaseq/qc/QC_nf-core.Rmd +++ /dev/null @@ -1,585 +0,0 @@ ---- -title: "Quality Control" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: true - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - # params_file: params_qc_nf-core-example.R # example data - # Fill this file with the right paths to nfcore output - params_file: params_qc_nf-core.R - # Put hg38, mm10, mm39, or other - genome: hg38 - project_file: ../information.R - factor_of_interest: sample_type ---- - -```{r} -# This set up the working directory to this file so all files can be found -library(rstudioapi) -setwd(fs::path_dir(getSourceEditorContext()$path)) -``` - - -```{r source_params, echo = F} -# 1. set up factor_of_interest parameter from parameter above or manually -# this is used to color plots, it needs to be part of the metadata -factor_of_interest=params$factor_of_interest -genome=params$genome -# 2. Set input files in this file -source(params$params_file) -# 3. If you set up this file, project information will be printed below and -#. it can be reused for other Rmd files. -source(params$project_file) -``` - -# Overview - -- Project: `r project` -- PI: `r PI` -- Analyst: `r analyst` -- Experiment: `r experiment` - - -```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} -library(tidyverse) -library(knitr) -library(rtracklayer) -library(DESeq2) -library(DEGreport) -library(ggrepel) -# library(RColorBrewer) -library(DT) -library(pheatmap) -library(bcbioR) -library(janitor) -ggplot2::theme_set(theme_light(base_size = 14)) -opts_chunk[["set"]]( - cache = FALSE, - cache.lazy = FALSE, - dev = c("png", "pdf"), - error = TRUE, - highlight = TRUE, - message = FALSE, - prompt = FALSE, - tidy = FALSE, - warning = FALSE, - fig.height = 4) -``` - - -```{r subchunkify, echo=FALSE, eval=FALSE} -#' Create sub-chunks for plots -#' -#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots -#' -#' @param pl a plot object -#' @param fig.height figure height -#' @param fig.width figure width -#' @param chunk_name name of the chunk -#' -#' @author Andreas Scharmueller \email{andschar@@protonmail.com} -#' -subchunkify = function(pl, - fig.height = 7, - fig.width = 5, - chunk_name = 'plot') { - pl_deparsed = paste0(deparse(function() { - pl - }), collapse = '') - - sub_chunk = paste0( - "```{r ", - chunk_name, - ", fig.height=", - fig.height, - ", fig.width=", - fig.width, - ", dpi=72", - ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}", - "\n(", - pl_deparsed, - ")()", - "\n```" - ) - - cat(knitr::knit( - text = knitr::knit_expand(text = sub_chunk), - quiet = TRUE - )) -} - -``` - - -```{r sanitize-datatable} -sanitize_datatable = function(df, ...) { - # remove dashes which cause wrapping - DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), - colnames=gsub("-", "_", colnames(df))) -} -``` - - -# Samples and metadata - - -```{r load_metadata} - -meta_df=read_csv(metadata_fn) %>% - arrange(.data[[factor_of_interest]]) %>% - distinct(sample, .keep_all = T) %>% - dplyr::select(!matches("fastq"), !matches("strandness")) -meta_df$sample <- make.names(meta_df$sample) -order <- meta_df$sample - -ggplot(meta_df, aes(.data[[factor_of_interest]], - fill = .data[[factor_of_interest]])) + - geom_bar() + ylab("") + xlab("") + ylab("# of samples") + - scale_fill_cb_friendly() + theme(axis.text.x=element_text(angle = 90, vjust = 0.5), legend.position = "none") -``` - - -```{r load_data} -# read counts from SE object -se <- readRDS(se_object) -raw_counts <- assays(se)[["counts"]] %>% round() %>% - as.matrix() -raw_counts=raw_counts[rowSums(raw_counts)!=0,] -``` - -```{r prepare_metrics} -# Get metrics from nf-core into bcbio like table -# many metrics are already in the General Table of MultiQC, this reads the file -metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) - -# we get some more metrics from Qualimap and rename columns -metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) -metrics <- metrics %>% full_join(metrics_qualimap) -metrics <- metrics %>% - clean_names() %>% - dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) - -# This uses the fastqc metrics to get total reads -total_reads <- metrics %>% - dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>% - remove_empty(which = 'cols') %>% - dplyr::rename(single_sample = sample) %>% - mutate(sample = gsub('_[12]+$', '', single_sample)) %>% - group_by(sample) %>% - summarize(total_reads = sum(fastqc_raw_total_sequences)) - -# This renames to user-friendly names the metrics columns -metrics <- metrics %>% - dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% - remove_empty(which = 'cols') %>% - full_join(total_reads) %>% - mutate(mapped_reads = samtools_reads_mapped) %>% - mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% - mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% - mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% - mutate(x5_3_bias = qualimap_5_3_bias) - -# Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case -gtf <- NULL -if (genome =="other"){ - gtf <- gtf_fn -}else{ - if (genome == "hg38") { - gtf <- "hg38.rna.gtf.gz" - } else if (genome == "mm10") { - gtf <- "mm10.rna.gtf.gz" - } else if (genome == "mm39") { - gtf <- "mm39.rna.gtf.gz" - } - gtf <- system.file("extdata", "annotation", - gtf, - package="bcbioR") -} -if (is.null(gtf)) { - print("No genome provided! Please add it at the top of this Rmd") -} - -gtf=rtracklayer::import(gtf) - -one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) -another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) -biotype=NULL -if(length(one)==1){ - biotype=one -}else if(length(another)==1){ - biotype=another -}else{ - warning("No gene biotype founded") -} - -if (!is.null(biotype)){ - annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] - rRNA=grepl("rRNA|tRNA",annotation[[biotype]]) - genes=intersect(annotation[rRNA,"gene_id"],row.names(raw_counts)) - ratio=data.frame(sample=colnames(raw_counts), - r_and_t_rna_rate=colSums(raw_counts[genes,])/colSums(raw_counts)) - metrics = left_join(metrics, ratio, by="sample") -}else{ - metrics[["r_and_t_rna_rate"]] <- NA -} - -# if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){ -# metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna) -# }else{ -# metrics[["r_rna_rate"]] <- NA -# } -metrics=metrics[,c("sample","mapped_reads","exonic_rate","intronic_rate", - "total_reads", - "x5_3_bias", "r_and_t_rna_rate","intergenic_rate")] -metrics$sample <- make.names(metrics$sample) -metrics <- metrics %>% - full_join(meta_df , by = c("sample" = "sample")) %>% - dplyr::select(where(~!all(is.na(.)))) - -``` - -```{r show_metadata} -meta_sm <- meta_df %>% - as.data.frame() %>% - column_to_rownames("sample") - -meta_sm %>% sanitize_datatable() - -``` - -# Read metrics {.tabset} - -## Total reads - -Here, we want to see consistency and a minimum of 20 million reads (the grey line). - -```{r plot_total_reads} -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = total_reads, - fill = .data[[factor_of_interest]])) + - geom_bar(stat = "identity") + - coord_flip() + - scale_y_continuous(name = "million reads") + - scale_x_discrete(limits = rev) + - scale_fill_cb_friendly() + xlab("") + - ggtitle("Total reads") + - geom_hline(yintercept=20000000, color = "grey", linewidth=2) - -metrics %>% - ggplot(aes(x = .data[[factor_of_interest]], - y = total_reads, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - coord_flip() + - scale_y_continuous(name = "million reads") + - scale_color_cb_friendly() + xlab("") + - ggtitle("Total reads") -``` - -```{r calc_min_max_pct_mapped} -#get min percent mapped reads for reference -min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1) -max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) -``` - -## Mapping rate - -The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping (the grey line). These samples have mapping rates: `r min_pct_mapped` - `r max_pct_mapped`%. - -```{r plot_mapping_rate} -metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1) -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = mapped_reads_pct, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - coord_flip() + - scale_x_discrete(limits = rev) + - scale_color_cb_friendly() + - ylim(0, 100) + - ggtitle("Mapping rate") + xlab("") + - geom_hline(yintercept=70, color = "grey", linewidth=2) -``` - - -## Number of genes detected - -The number of genes represented in every sample is expected to be consistent and over 20K (grey line). - -```{r calc_genes_detected} -genes_detected <- colSums(assays(se)[["counts"]] > 0) %>% enframe() -sample_names <- metrics[,c("sample"), drop=F] -genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample")) -genes_detected <- genes_detected %>% group_by(name) -genes_detected <- summarise(genes_detected, - n_genes = max(value)) - -metrics <- metrics %>% - left_join(genes_detected, by = c("sample" = "name")) - -``` - - -```{r plot_genes_detected} -ggplot(metrics,aes(x = factor(sample, level = order), - y = n_genes, fill = .data[[factor_of_interest]])) + - geom_bar(stat = "identity") + - coord_flip() + - scale_fill_cb_friendly() + - scale_x_discrete(limits = rev) + - ggtitle("Number of genes") + - ylab("Number of genes") + - xlab("") + - geom_hline(yintercept=20000, color = "grey", linewidth=2) - -metrics %>% - ggplot(aes(x = .data[[factor_of_interest]], - y = n_genes, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - coord_flip() + - scale_x_discrete(limits = rev) + - scale_y_continuous(name = "million reads") + - scale_color_cb_friendly() + xlab("") + - ggtitle("Number of Genes") - -``` - - - -## Gene detection saturation - -This plot shows how complex the samples are. We expect samples with more reads to detect more genes. - -```{r plot_gene_saturation} -metrics %>% - ggplot(aes(x = total_reads, - y = n_genes, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - scale_x_log10() + - scale_color_cb_friendly() + - ggtitle("Gene saturation") + - ylab("Number of genes") -``` - -## Exonic mapping rate - -Here we are looking for consistency, and exonic mapping rates around or above 70% (grey line). - -```{r plot_exonic_mapping_rate} -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = exonic_rate * 100, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - ylab("Exonic rate %") + - ggtitle("Exonic mapping rate") + - scale_x_discrete(limits = rev) + - scale_color_cb_friendly() + - coord_flip() + - xlab("") + - ylim(c(0,100)) + - geom_hline(yintercept=70, color = "grey", linewidth=2) -``` - -## Intronic mapping rate - -Here, we expect a low intronic mapping rate (≤ 15% - 20%). The grey line indicates 20%. - -```{r plot_intronic_mapping_rate} -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = intronic_rate * 100, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - ylab("Intronic rate %") + - ggtitle("Intronic mapping rate") + - scale_x_discrete(limits = rev) + - scale_color_cb_friendly() + - coord_flip() + - xlab("") + - ylim(c(0,100)) + - geom_hline(yintercept=20, color = "grey", linewidth=2) -``` - -## Intergenic mapping rate - -Here, we expect a low intergenic mapping rate, which is true for all samples. The grey line indicates 15% - -```{r plot_intergenic_mapping_rate} -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = intergenic_rate * 100, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - ylab("Intergenic rate %") + - ggtitle("Intergenic mapping rate") + - coord_flip() + xlab("") + - scale_x_discrete(limits = rev) + - scale_color_cb_friendly() + - ylim(c(0, 100)) + - geom_hline(yintercept=15, color = "grey", linewidth=2) -``` - -## tRNA/rRNA mapping rate - -Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10% (the grey line). - -```{r plot_rrna_mapping_rate} - -rrna_ylim <- max(round(metrics$r_and_t_rna_rate*100, 2)) + 10 -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = r_and_t_rna_rate * 100, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - ylab("tRNA/rRNA rate, %")+ - ylim(0, rrna_ylim) + - ggtitle("tRNA/rRNA mapping rate") + - coord_flip() + - scale_x_discrete(limits = rev) + - scale_color_cb_friendly() + - ylim(c(0, 100)) + xlab("") + - geom_hline(yintercept=10, color = "grey", linewidth=2) -``` - -## 5'->3' bias - -There should be little bias, i.e. the values should be close to 1, or at least consistent among samples - -```{r plot_53_bias} -metrics %>% - ggplot(aes(x = factor(sample, level = order), - y = x5_3_bias, - color = .data[[factor_of_interest]])) + - geom_point(alpha = 0.5, size=4) + - ggtitle("5'-3' bias") + - coord_flip() + - scale_x_discrete(limits = rev) + - ylim(c(0.5,1.5)) + xlab("") + ylab("5'-3' bias") + - scale_color_cb_friendly()+ - geom_hline(yintercept=1, color = "grey", linewidth=2) -``` - -## Counts per gene - all genes - -We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar - -```{r plot_counts_per_gene} -metrics_small <- metrics %>% dplyr::select(sample, .data[[factor_of_interest]]) -metrics_small <- left_join(sample_names, metrics_small) - -counts <- - assays(se)[["counts"]] %>% - as_tibble() %>% - filter(rowSums(.)!=0) %>% - gather(name, counts) - -counts <- left_join(counts, metrics_small, by = c("name" = "sample")) - -ggplot(counts, aes(factor(name, level = order), - log2(counts+1), - fill = .data[[factor_of_interest]])) + - geom_boxplot() + - scale_x_discrete(limits = rev) + - scale_fill_cb_friendly() + - coord_flip() + xlab("") + - ggtitle("Counts per gene, all non-zero genes") + - scale_color_cb_friendly() -``` - - -# Sample similarity analysis - -In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA). - -## Principal component analysis (PCA) - -Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells). - - -```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} - -vst <- vst(raw_counts) - -coldat_for_pca <- as.data.frame(metrics) -rownames(coldat_for_pca) <- coldat_for_pca$sample -coldat_for_pca <- coldat_for_pca[colnames(raw_counts),] -pca1 <- degPCA(vst, coldat_for_pca, - condition = factor_of_interest, data = T)[["plot"]] -pca2 <- degPCA(vst, coldat_for_pca, - condition = factor_of_interest, data = T, pc1="PC3", pc2="PC4")[["plot"]] - - - -pca1 + scale_color_cb_friendly() -pca2 + scale_color_cb_friendly() - -``` - -# Covariates analysis - -When there are multiple factors that can influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. This method adapts the method described by Daily et al. for which they integrated a method to correlate covariates with principal components values to determine the importance of each factor. - -```{r covariate-plot,fig.height=12, fig.width=10} -## Remove non-useful columns output by nf-core -coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness", "qualimap_5_3_bias"))]) - -# Remove missing data -coldat_2 <- na.omit(coldat_2) -degCovariates(vst, metadata = coldat_2) -``` - -## Hierarchical clustering - -Inter-correlation analysis (ICA) is another way to look at how well samples -cluster by plotting the correlation between the expression profiles of the -samples. - -```{r clustering fig, fig.width = 10, fig.asp = .62} - -vst_cor <- cor(vst) - -colma=meta_df %>% as.data.frame() -rownames(colma) <- colma$sample -colma <- colma[rownames(vst_cor), ] -colma <- colma %>% dplyr::select(.data[[factor_of_interest]]) -anno_colors=lapply(colnames(colma), function(c){ - l.col=cb_friendly_pal('grey')(length(unique(colma[[c]]))) - names(l.col)=unique(colma[[c]]) - l.col -}) -names(anno_colors)=colnames(colma) - -p <- pheatmap(vst_cor, - annotation = colma, - annotation_colors = anno_colors, - show_rownames = T, - show_colnames = T, - color = cb_friendly_pal('heatmap')(15) - ) -p -``` - -# R session - -List and version of tools used for the QC report generation. - -```{r} -sessionInfo() -``` diff --git a/inst/templates/rnaseq/qc/params_qc_nf-core-example.R b/inst/templates/rnaseq/qc/params_qc_nf-core-example.R deleted file mode 100644 index dae62ce..0000000 --- a/inst/templates/rnaseq/qc/params_qc_nf-core-example.R +++ /dev/null @@ -1,9 +0,0 @@ -# info params - -# Example data: COMMENT THESE LINE IF YOU ARE USING YOUR DATA -metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' -se_object=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.rds') -# This folder is in the output directory inside multiqc folder -multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' -# This file is inside the genome folder in the output directory -gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' From b9a3c0556be045231b2f2bce32095a0694b53b95 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 31 Jul 2024 18:05:29 -0400 Subject: [PATCH 34/93] try to fix names in rnaseq --- .../rnaseq/DE/Comparison-intersections.Rmd | 258 ++++++++ .../rnaseq/DE/Cross-comparison-analysis.Rmd | 236 +++++++ inst/templates/rnaseq/DE/DEG.Rmd | 581 +++++++++++++++++ inst/templates/rnaseq/DE/load_data.R | 146 +++++ inst/templates/rnaseq/DE/params_de-example.R | 18 + inst/templates/rnaseq/DE/params_de.R | 22 + inst/templates/rnaseq/DE/run_markdown.R | 32 + inst/templates/rnaseq/QC/QC-bcbio.Rmd | 409 ++++++++++++ inst/templates/rnaseq/QC/QC_nf-core.Rmd | 585 ++++++++++++++++++ inst/templates/rnaseq/QC/params_qc.R | 4 + .../rnaseq/QC/params_qc_nf-core-example.R | 9 + inst/templates/rnaseq/QC/params_qc_nf-core.R | 11 + inst/templates/rnaseq/QC/run_markdown.R | 13 + inst/templates/rnaseq/README.md | 66 ++ inst/templates/rnaseq/information.R | 6 + inst/templates/rnaseq/org/hcbc_README.md | 75 +++ 16 files changed, 2471 insertions(+) create mode 100644 inst/templates/rnaseq/DE/Comparison-intersections.Rmd create mode 100644 inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd create mode 100644 inst/templates/rnaseq/DE/DEG.Rmd create mode 100644 inst/templates/rnaseq/DE/load_data.R create mode 100644 inst/templates/rnaseq/DE/params_de-example.R create mode 100644 inst/templates/rnaseq/DE/params_de.R create mode 100644 inst/templates/rnaseq/DE/run_markdown.R create mode 100644 inst/templates/rnaseq/QC/QC-bcbio.Rmd create mode 100644 inst/templates/rnaseq/QC/QC_nf-core.Rmd create mode 100644 inst/templates/rnaseq/QC/params_qc.R create mode 100644 inst/templates/rnaseq/QC/params_qc_nf-core-example.R create mode 100644 inst/templates/rnaseq/QC/params_qc_nf-core.R create mode 100644 inst/templates/rnaseq/QC/run_markdown.R create mode 100644 inst/templates/rnaseq/README.md create mode 100644 inst/templates/rnaseq/information.R create mode 100644 inst/templates/rnaseq/org/hcbc_README.md diff --git a/inst/templates/rnaseq/DE/Comparison-intersections.Rmd b/inst/templates/rnaseq/DE/Comparison-intersections.Rmd new file mode 100644 index 0000000..b073469 --- /dev/null +++ b/inst/templates/rnaseq/DE/Comparison-intersections.Rmd @@ -0,0 +1,258 @@ +--- +title: "Comparing DE Results - Multiple Contrasts" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + project_file: ../information.R +--- + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +## Adjusted P-value used for significance +padj_co <- 0.05 +## Log2FC used for significance. If no cutoff used put 0 +LFC <- 0.5 +## Normalized counts for ALL samples +# norm <- "/Users/emb016/Documents/comparisons_templates/norm_counts.csv" +# Load the count data, for this example it is the last columns of the DE table +norm_counts <- read.csv("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/norm_counts.csv.gz", + row.names = 1) + +# Load the meta data, here we are making one for the exmaple +metadata <- read_csv("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/meta.csv.gz") %>% as.data.frame() + +## Full results file (all genes) for contrast 1 +files=c("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group1.csv.gz", + "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group2.csv.gz", + "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group3.csv.gz") + +``` + + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(rtracklayer) +library(tidyverse) +library(stringr) +library(ggpubr) +library(knitr) +library(bcbioR) +library(ggprism) +library(viridis) +library(pheatmap) +library(janitor) +library(ggvenn) +library(ggplot2) +library(UpSetR) +library(ggprism) +#library(org.Ce.eg.db) +library(org.Hs.eg.db) +#library(org.Mm.eg.db) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + + +# Load Data + +We load our dataset + +```{r load_data} + +## Name of contrast. This will be displayed on the figures. +# you can manually indicate a list of names as comp_names=c("name1","name2"...) +comp_names = basename(files) %>% + str_remove_all("all_results_|.csv|.gz") %>% + str_replace_all("_", " ") +names(files)=comp_names +N=length(files) +stopifnot(length(files)==length(comp_names)) + +## Make sure you have set up N above +all_genes=lapply(names(files), function(name){ + data <- read_csv(files[name]) %>% + dplyr::filter(padj <= 1) +}) +sign_genes=lapply(names(files), function(name){ + data <- read_csv(files[name]) %>% + dplyr::filter(padj <= 1) + data %>% + dplyr::filter(padj < padj_co, abs(lfc) > LFC) +}) +``` + + + +# Make list of comparisons + + +```{r, fig.height=8, fig.width=8, warning=FALSE, error=FALSE, message=FALSE} +de=lapply(sign_genes, function(x){ + x$gene_id +}) +names(de) <- comp_names +``` + +## Make an upset plot + +Because we have done so many tests venn diagrams no longer work for our data. Instead we will use upset plots. *These plots are relatively intuitive for 2 or 3 categories, but can tend to get more complex for >3 categories. In all cases, you will find the categories being compared and their size listed below the bar plots on the left. As you look to the right (directly below each bar) there are dots with connecting lines that denote which categories the overlap is between, or if there is no overlap (just a dot). The numbers at the top of the bars denote the size of the overlap.* + + +```{r, fig.height=8, fig.width=12} +upset(fromList(de), order.by = "freq", nsets=N) + +``` + +## Pull intersect(s) of interest + +After identifying intersect(s) of interest we can determine which genes are found in which intersections + + +```{r, warning=FALSE, error=FALSE, message=FALSE} +## Grab intersection +gene_names <- data.frame(gene=unique(unlist(de))) + +df1 <- lapply(de,function(x){ + data.frame(gene = x) +}) %>% + bind_rows(.id = "path") + +df_int <- lapply(gene_names$gene,function(x){ + # pull the name of the intersections + intersection <- df1 %>% + dplyr::filter(gene==x) %>% + arrange(path) %>% + pull("path") %>% + paste0(collapse = "|") + # build the dataframe + data.frame(gene = x,int = intersection) +}) %>% bind_rows() +``` + + +```{r, eval=F} +## Run this code to find the name of your intersect of interest. You will use this in the next code chunk +table(df_int$int) +``` + +```{r, warning=FALSE, error=FALSE, message=FALSE} +## subset interaction of interest replace the intersect name with the name of the intersect from above. You can copy and paste the below commands to grab multiple intersects. + +Intersect1 <- subset(df_int, df_int$int=="DMSO vs Group2|DMSO vs Group3") +``` + +## Get annotation data +```{r, warning=FALSE, error=FALSE, message=FALSE} + +# edit this to be the correct organism. One set of annotations per intersect. +# rdata = AnnotationDbi::select(org.Hs.eg.db, Intersect1$gene, 'SYMBOL', 'ENSEMBL') %>% +# dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) %>% distinct(gene_id, .keep_all = T) + +# FIX: following code is only for test data, use the above with real data +rdata=data.frame(gene_id=row.names(norm_counts), gene_name=row.names(norm_counts)) +``` + + + +## Heatmap of intersect + +We generate a heatmap with all samples to see the patterns contained in this intersect. + +```{r, fig.height=6, warning=FALSE, error=FALSE, message=FALSE} +## Assign factors of interest. These need to correspond to columns in your metadata. + +factor1 <- "Treatment" +factor2 <- "Cell_line" + +# Extract significant genes +stopifnot(all(Intersect1$gene %in% row.names(norm_counts))) +sigGenes <- Intersect1$gene + +### Extract normalized expression for significant genes +norm_sig <- norm_counts[sigGenes,] +meta <- data.frame(metadata[,print(factor1)],metadata[,print(factor1)]) +colnames(meta) <- c(print(factor1),print(factor2)) +rownames(meta) <- colnames(norm_sig) +### Set a color palette +heat_colors <- lapply(colnames(norm_sig), function(c){ + l.col=colors[1:length(unique(norm_sig[[c]]))] + names(l.col)=unique(norm_sig[[c]]) + l.col +}) + +### Run pheatmap using the metadata data frame for the annotation (11 x 5) +pheatmap(norm_sig, + color = inferno(10), + cluster_rows = T, + show_rownames = F, + annotation = meta, + annotation_colors = heat_colors, + border_color = NA, + fontsize = 10, + scale = "row", + fontsize_row = 10, + height = 20) +``` + + +## Graph all genes in intersect + +```{r, warning=FALSE, error=FALSE, message=FALSE} +Intersect1_annot <- Intersect1 %>% left_join(rdata, by=c("gene"="gene_id")) +# REMOVE to plot all +Intersect1_annot <- Intersect1_annot[1:10] + +graphs <- length(Intersect1_annot$gene) +to_test <- t(norm_counts) +rna = Intersect1_annot$gene +names = Intersect1_annot$gene_name + +to_graph = data.frame(to_test[,rna]) +to_graph = to_graph[Intersect1_annot$gene] +to_graph$Factor1 <- metadata[,factor1] +to_graph$Factor2 <- metadata[,factor2] + +#out <- vector("list", length = graphs) +for (i in seq(1,graphs)) { + to_graph$temp=to_graph[[i]] + print(ggplot(to_graph,aes(x=Factor1,y=temp,color=Factor2)) + + geom_boxplot() + geom_point(alpha=0.5) + ylab(paste0(names[[i]])) + xlab(factor1) + scale_color_discrete(name = "Covariate")) +} +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd b/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd new file mode 100644 index 0000000..4e78ea6 --- /dev/null +++ b/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd @@ -0,0 +1,236 @@ +--- +title: "Comparing DE Results - Pairwise" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + project_file: ../information.R +--- + + + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. Set up input files in this R file (params_pairwisecomp.R) +## Full results file (all genes) for contrastt 1 +comp1_fn <- 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group1.csv.gz' +## Name of contrast 1. This will be displayed on the figures +comp1_name <- "DMSO vs. Group1" +## Full results file (all genes) for contrast 2 +comp2_fn <- 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group2.csv.gz' +## Name of contrast 2. This will be displayed on the figures +comp2_name <- "DMSO vs. Group2" +## Adjusted P-value used for significance +padj_co <- 0.05 +## Log2FC used for significance. If no cutoff used put 0 +LFC <- 0.5 + +comp1 <- read_csv(comp1_fn) %>% + dplyr::filter(padj <= 1) +comp2 <- read_csv(comp2_fn) %>% + dplyr::filter(padj <= 1) +``` + + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(rtracklayer) +library(tidyverse) +library(stringr) +library(ggpubr) +library(knitr) +library(bcbioR) +library(ggprism) +library(viridis) +library(pheatmap) +library(janitor) +library(ggvenn) +library(ggplot2) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + + +# Load Data + +We load our dataset + +```{r load_data} +# this code will load from bcbio or nf-core folder +# NOTE make sure to set numerator and denominator + +comp1_sig <- comp1 %>% + dplyr::filter(padj < padj_co, abs(lfc) > LFC) + +comp2_sig <- comp2 %>% + dplyr::filter(padj < padj_co, abs(lfc) > LFC) +``` + + + +# Comparisons + +We start with a venn diagram looking at the overlap between our two contrasts + +```{r, fig.height=8, fig.width=8} +name1 <- rlang::ensym(comp1_name) +name2 <- rlang::ensym(comp2_name) +names <- c(name1, name2) + +full <- list(comp1_sig$gene_id,comp2_sig$gene_id) +names(full) <-names + +ggvenn(full, show_percentage = F) + +``` + +## Compare effect sizes and direction + +We plot Log2FC for our contrasts and color points by whether or not they are significant in our contrasts. The black line is 1:1. + + +```{r fig.height=6, fig.width=8} +# Edit based on the data you are using + +#make sure to only use genes present in both results files +test_intersect <- intersect(comp1$gene_id, comp2$gene_id) +comp1_sub <- subset(comp1, comp1$gene_id %in% test_intersect) +comp2_sub <- subset(comp2, comp2$gene_id %in% test_intersect) + +## Check that gene names match +all(comp1_sub$gene_id== comp2_sub$gene_id) + +## Gather necessary data +lfc <- data.frame(comp1_sub$gene_id, comp1_sub$gene_name, comp1_sub$lfc, comp2_sub$lfc) +colnames(lfc) <- c("gene_id","gene_name", "comp1", "comp2") + +# subset to only include genes in both datasets and color by grouping +DE_comp1 <- setdiff(comp1_sig$gene_id, comp2_sig$gene_id) +DE_comp2 <- setdiff(comp2_sig$gene_id, comp1_sig$gene_id) +DE_both <- intersect(comp2_sig$gene_id, comp1_sig$gene_id) +not_sig <- comp1_sub$gene_id[!(comp1_sub$gene_id %in% c(DE_comp1,DE_comp2,DE_both))] + + +col <- rep(4, nrow(lfc)) +col[lfc$gene_id %in% not_sig] <- 1 +col[lfc$gene_id %in% DE_comp1] <- 2 +col[lfc$gene_id %in% DE_comp2] <- 3 +col[lfc$gene_id %in% DE_both] <- 4 + + +lfc$col <- lfc %>% + dplyr::mutate(color = case_when( + gene_id %in% DE_both ~ 3, + gene_id %in% DE_comp1 ~ 1, + gene_id %in% DE_comp2 ~ 2, + gene_id %in% not_sig ~ 8 + )) %>% pull(color) +lfc$col <- as.factor(lfc$col) + + +ggplot(lfc, aes(x=comp1, y=comp2, color=col)) + geom_point() + + labs(color="Group") + + scale_color_discrete(name = "Group", labels = c(paste0("Only DE in ",paste0(comp1_name)), paste0("Only DE in ",paste0(comp2_name)),"DE in both comparisons", "Not Significant")) + + geom_abline(intercept=0, slope=1) + + geom_hline(aes(yintercept=0)) + + geom_vline(aes(xintercept=0)) + + scale_color_cb_friendly() + + xlab(paste0("Log2FC in ",paste0(comp1_name))) + + ylab(paste0("Log2FC in ",paste0(comp2_name))) + +``` + + + +## Compare ajusted P-values + +We plot adjusted P-values for our contrasts and color points by whether or not they are significant in our contrasts. The black line is 1:1. + + +```{r fig.height=6, fig.width=8} +# Edit based on the data you are using + +#make sure to only use genes present in both results files +test_intersect <- intersect(comp1$gene_id, comp2$gene_id) +comp1_sub <- subset(comp1, comp1$gene_id %in% test_intersect) +comp2_sub <- subset(comp2, comp2$gene_id %in% test_intersect) + +## Check that gene names match +all(comp1_sub$gene_id== comp2_sub$gene_id) + +## Gather necessary data +lfc <- data.frame(comp1_sub$gene_id, comp1_sub$gene_name, comp1_sub$padj, comp2_sub$padj) +colnames(lfc) <- c("gene_id","gene_name", "comp1", "comp2") + +# subset to only include genes in both datasets and color by grouping +DE_comp1 <- setdiff(comp1_sig$gene_id, comp2_sig$gene_id) +DE_comp2 <- setdiff(comp2_sig$gene_id, comp1_sig$gene_id) +DE_both <- intersect(comp2_sig$gene_id, comp1_sig$gene_id) +not_sig <- comp1_sub$gene_id[!(comp1_sub$gene_id %in% c(DE_comp1,DE_comp2,DE_both))] + + +col <- rep(4, nrow(lfc)) +col[lfc$gene_id %in% not_sig] <- 1 +col[lfc$gene_id %in% DE_comp1] <- 2 +col[lfc$gene_id %in% DE_comp2] <- 3 +col[lfc$gene_id %in% DE_both] <- 4 + + +lfc$col <- lfc %>% + dplyr::mutate(color = case_when( + gene_id %in% DE_both ~ 3, + gene_id %in% DE_comp1 ~ 1, + gene_id %in% DE_comp2 ~ 2, + gene_id %in% not_sig ~ 8 + )) %>% pull(color) +lfc$col <- as.factor(lfc$col) + + +ggplot(lfc, aes(x=-log10(comp1), y=-log10(comp2), color=col)) + + geom_point() + labs(color="Group") + + scale_color_discrete(name = "Group", labels = c(paste0("-Log10 adjusted p-value ",paste0(comp1_name)), paste0("-Log10 adjusted p-value ",paste0(comp2_name)),"DE in both comparisons", "Not Significant")) + + geom_abline(intercept=0, slope=1) + + geom_hline(aes(yintercept=0)) + + geom_vline(aes(xintercept=0)) + + scale_color_cb_friendly() + + xlab(paste0("Log2FC in ",paste0(comp1_name))) + + ylab(paste0("Log2FC in ",paste0(comp2_name))) + +``` + + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd new file mode 100644 index 0000000..c2069be --- /dev/null +++ b/inst/templates/rnaseq/DE/DEG.Rmd @@ -0,0 +1,581 @@ +--- +title: "Differential Expression" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + # Put hg38, mm10, mm39, or other + + ## Combatseq and ruv can both be false or ONLY ONE can be true + ## Both cannot be true + numerator: tumor + denominator: normal + column: sample_type + subset_column: null + subset_value: null + genome: hg38 + ruv: false + combatseq: false + params_file: params_de-example.R + project_file: ../information.R + functions_file: load_data.R +--- + + +```{r} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +``` + + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. Set up input files in this R file (params_de.R) +source(params$params_file) +# 2. Set up project file (already done from QC probably) +source(params$project_file) +# 3. Load custom functions to load data from coldata/metrics/counts +source(params$functions_file) +# IMPORTANT set these values if you are not using the parameters in the header (lines 22-31) +genome=params$genome +column=params$column +numerator=params$numerator +denominator=params$denominator +subset_column=params$subset_column +subset_value=params$subset_value +run_ruv=params$ruv +run_combatseq=params$combatseq +factor_of_interest <- column +``` + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(rtracklayer) +library(DESeq2) +library(tidyverse) +library(stringr) +library(DEGreport) +library(ggpubr) +library(msigdbr) +library(fgsea) +library(org.Hs.eg.db) +library(knitr) +library(EnhancedVolcano) +library(bcbioR) +library(ggprism) +library(viridis) +library(pheatmap) +library(janitor) +library(ggforce) +library(vegan) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + +```{r sanitize_datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + + + +```{r load_data, message=F, warning=F} +# This code will load from bcbio or nf-core folder +# NOTE make sure to set numerator and denominator +coldata <- load_coldata(coldata_fn, column, + numerator, denominator, + subset_column, subset_value) +coldata$sample=row.names(coldata) + +counts <- load_counts(counts_fn) +counts <- counts[,colnames(counts) %in% coldata$sample] + +metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% + left_join(coldata, by = c('sample')) %>% + as.data.frame() +rownames(metrics) <- metrics$sample +# if the names don't match in order or string check files names and coldata information +counts = counts[,rownames(metrics)] +coldata = coldata[rownames(metrics),] +stopifnot(all(names(counts) == rownames(metrics))) +``` + + + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` +- Comparison: `r ifelse(is.null(subset_value), paste0(numerator, ' vs. ', denominator), paste0(subset_value, ': ', numerator, ' vs. ', denominator))` + +```{r create_filenames} + +if (!is.null(subset_value) & !is.null(subset_value)){ + filenames = str_interp("${subset_value}_${numerator}_vs_${denominator}") +} else { + filenames = str_interp("${numerator}_vs_${denominator}") +} + +contrasts = c(column,numerator,denominator) +coef=paste0(column,"_",numerator,"_vs_",denominator) + +name_expression_fn=file.path( + basedir, + str_interp("${filenames}_expression.csv")) +name_deg_fn=file.path( + basedir, + str_interp("${filenames}_deg.csv")) +name_pathways_fn=file.path( + basedir, + str_interp("${filenames}_pathways.csv")) + +``` + +```{r load_counts_data} + +rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL') %>% + dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) %>% + distinct(gene_id, .keep_all = TRUE) + +``` + +```{r setup_RUV} +dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1) + +vsd_before <- vst(dds_to_use) +norm_matrix = assay(vsd_before) +``` + + +# PCA and group level variance. + +**Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).** + +Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm. + +**It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.** + +**This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.** + +```{r set group, eval=FALSE, echo=FALSE} +## Example of creating a group covariate + +meta$group <- paste0(meta$sex,"_", meta$age,"_",meta$treatment) + +factor_of_interest <- "insert column name for covariate of interest" +``` + + +```{r PCA} +pca <- degPCA(norm_matrix, metrics, + condition = factor_of_interest, name = "sample", data = T) + +pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) + + theme(plot.title=element_text(hjust=0.5)) + + geom_mark_ellipse(aes(color = sample_type)) + scale_color_cb_friendly() +``` + +## PERMDISP + +Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) + +Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison. + +```{r PERMDISP} +vare.disa <- vegdist(t(assay(vsd_before))) + +mod = betadisper(vare.disa, metrics[[factor_of_interest]]) +anova(mod) +permutest(mod, pairwise = TRUE) + +``` + + + +# Covariate analysis + +Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values. + +Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.** + + +```{r covariates, fig.height = 6, fig.width = 10} +degCovariates( + norm_matrix, + metrics, +) +``` + + + +```{r init_DESEQ} +formula <- as.formula(paste0("~ ", " + ", column)) +## Check if sample name matches +stopifnot(all(names(counts) == rownames(coldata))) + +dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = formula) + +vsd_before <- vst(dds_to_use) +norm_matrix = assay(vsd_before) +new_cdata <- coldata +``` + + +```{r, eval=F, echo=FALSE} +#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION + +### RUV - LINES 261-296 +### COMBATSEQ - LINES 303-369 +``` + + + +```{r, eval=run_ruv, results='asis', echo=run_ruv} +cat("# Remove Unwanted Variability + +When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") +``` + +```{r do_RUV, eval=run_ruv, echo=run_ruv} +library(RUVSeq) + +# If you want to skip the code, just set up formula to be your model in the next chunk of code +design <- coldata[[column]] +diffs <- makeGroups(design) +dat <- norm_matrix +# by default is running one variable, +# change K parameter to other number to find more unknown covariates +ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F) +vars <- ruvset$W + +new_cdata <- cbind(coldata, vars) + +formula <- as.formula(paste0("~ ", + paste0( + colnames(new_cdata)[grepl("W", colnames(new_cdata))], + collapse = " + " + ), " + ", column) +) +norm_matrix=ruvset$normalizedCounts +pca2 <- degPCA(norm_matrix, new_cdata, + condition = column) + ggtitle('After RUV') +pca2 + scale_color_cb_friendly() + +``` + +```{r after_RUV, eval=run_ruv} + +dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) +vsd_to_use<- vst(dds_to_use, blind=FALSE) + +``` + +```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq} +library(sva) + +cat("# Remove Batch Effects + +Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. + +Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).") + +``` + + +```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq} + +## FILL OUT THIS CHUNK OF CODE IF YOU WANT TO RUN COMBATSEQ + +## Set your batch effect variable here this is the variable that combatseq will try to remove + +## Column name of your batch variable +to_remove = "batch" + +## Column name of of your variable(s) of interest + +to_keep = "sample_type" + + +coldata[[to_remove]] <- as.factor(coldata[[to_remove]]) +coldata[[to_keep]] <- as.factor(coldata[[to_keep]]) + + +batch = coldata[[to_remove]] +treatment = coldata[[to_keep]] + +## If you have multiple variables of interest you will need to cbind them into one variable + +#treatment1 = metrics[[to_keep]] +#treatment2 = metrics[[to_keep]] +#treatment3 = metrics[[to_keep]] + + +# imp = cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3))) + +``` + + +```{r do_combatseq, eval=run_combatseq} +adjusted_counts <- ComBat_seq(as.matrix(counts), batch=batch, group = treatment) + +## For multiple variables of interest + +# adjusted_counts <- ComBat_seq(as.matrix(counts2), batch=batch, covar_mod = imp) + +``` + +```{r after_combatseq, eval=run_combatseq} +# NOTE: Make sure the formula doens't contain the covariates used in combatseq above +dds_to_use <- DESeqDataSetFromMatrix(adjusted_counts, coldata, design = formula) +vsd_combat<- vst(dds_to_use, blind=FALSE) + +combat_matrix = assay(vsd_combat) + +pca_combat <- degPCA(combat_matrix, coldata, + condition = column) + ggtitle('After Combatseq') +pca_combat + scale_color_cb_friendly() + +``` + + +# Differential Expression + +Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model. + +Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit. + +We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. + +```{r DE} +de <- DESeq(dds_to_use) + +DESeq2::plotDispEsts(de) +``` + +Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level. + +```{r lfc_shrink} +# resultsNames(de) # check the order is right +resLFC = results(de, contrast=contrasts) +resLFCS <- lfcShrink(de, coef=coef, type="apeglm") + +res <- as.data.frame(resLFCS) %>% + rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% + relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>% + mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi) + +## Filter out genes that have no expression or were filtered out by DESEQ2 +res <- res[res$baseMean>0,] %>% drop_na(padj) %>% drop_na(pvalue) + +res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj) %>% + mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name)) + +res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5)) +show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")]) + +degMA(as.DEGSet(resLFC)) + ggtitle('Before LFC Shrinking') +``` + +## MA plot + +```{r after_lfc_shrink} +degMA(as.DEGSet(resLFCS), limit = 2) + ggtitle('After LFC Shrinking') + +``` + +## Volcano plot + +This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in red are genes that have padj < 0.05 and a log2-fold change > 1. Points in blue have a padj < 0.05 and a log2-fold change < 1 and points in green have a padj > 0.05 and a log2-fold change > 2. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2 foldchance and padj that we have chosen. + +```{r volcano_plot, fig.height=6} +# degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) +EnhancedVolcano(res_mod, + lab= res_mod$gene_name, + pCutoff = 0.05, + selectLab = c(res_sig$gene_name[1:15]), + FCcutoff = 0.5, + x = 'lfc', + y = 'padj', + title="Volcano Tumor vs. Normal", + col=as.vector(colors[c("dark_grey", "light_blue", + "purple", "purple")]), + subtitle = "", drawConnectors = T, max.overlaps = Inf) +``` + +## Heatmap + +```{r heapmap} +### Run pheatmap using the metadata data frame for the annotation +ma=norm_matrix[res_sig$gene_id,] +colma=coldata[,c(column), drop=FALSE] +colors=lapply(colnames(colma), function(c){ + l.col=colors[1:length(unique(colma[[c]]))] + names(l.col)=unique(colma[[c]]) + l.col +}) +names(colors)=colnames(colma) +pheatmap(ma, + color = inferno(10), + cluster_rows = T, + show_rownames = F, + annotation = colma, + annotation_colors = colors, + border_color = NA, + fontsize = 10, + scale = "row", + fontsize_row = 10, + height = 20) +``` + + +## Differentially Expressed Genes + +```{r sig_genes_table} +res_sig %>% sanitize_datatable +``` + +## Plot top 16 genes + +```{r top n DEGs, fig.height = 6, fig.width = 8} +n = 16 +top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% + dplyr::select(gene_name, gene_id) +top_n_exp <- norm_matrix %>% as.data.frame() %>% + rownames_to_column('gene_id') %>% + # dplyr::select(-group, -group_name) %>% + pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>% + right_join(top_n, relationship = "many-to-many") %>% + left_join(coldata, by = 'sample') + +ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + + geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + + geom_point() + + facet_wrap(~gene_name) + + ggtitle(str_interp('Expression of Top ${n} DEGs')) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + +``` + +# Pathway Enrichment + +From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment. + +```{r} +universe=res %>% + filter(!is.na(padj)) %>% pull(gene_id) +mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENTREZID', 'ENSEMBL') + +all_in_life=list( + msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"), + msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), + msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG"), + msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), + msigdbr(species = "human", category = "C5", subcategory = "GO:BP"), + msigdbr(species = "human", category = "C5", subcategory = "GO:MF"), + msigdbr(species = "human", category = "C5", subcategory = "HPO"), + msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), + msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") +) + +ora_input = res %>% filter(!is.na(padj), padj<0.01, abs(lfc)>0.3) %>% pull(gene_id) +input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) + +total_deg=length(unique(ora_input))/length(unique(mapping$ENTREZID)) +pathways_ora_all = lapply(all_in_life, function(p){ + pathway = split(x = p$entrez_gene, f = p$gs_name) + db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") + respath <- fora(pathways = pathway, + genes = unique(input_entrezid$ENTREZID), + universe = unique(mapping$ENTREZID), + minSize = 15, + maxSize = 500) + coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], + pathway, unique(input_entrezid$ENTREZID), unique(mapping$ENTREZID)) + as_tibble(respath[pathway %in% coll_respath$mainPathways]) %>% + mutate(database=db_name, NES=(overlap/size)/(total_deg)) +}) %>% bind_rows() %>% + mutate(analysis="ORA") + +ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% + group_by(pathway) %>% + left_join(mapping, by =c("overlapGenes"="ENTREZID")) %>% + dplyr::select(pathway, padj, NES, ENSEMBL, analysis, + database) + +pathways_long = ora_tb + +``` + + +```{r pathaways_table} +pathways_ora_all %>% sanitize_datatable() +``` + + +```{r write-files} +counts_norm=norm_matrix %>% as.data.frame() %>% + rownames_to_column("gene_id") %>% + mutate(comparison = str_interp("${numerator}_vs_${denominator}")) + +res_for_writing <- res %>% + mutate(comparison = str_interp("${numerator}_vs_${denominator}")) + +pathways_for_writing <- pathways_long %>% + mutate(comparison = str_interp("${numerator}_vs_${denominator}")) + +if (!is.null(subset_value)){ + counts_norm <- counts_norm %>% + mutate(subset = subset_value) + res_for_writing <- res_for_writing %>% + mutate(subset = subset_value) + pathways_for_writing <- pathways_for_writing %>% + mutate(subset = subset_value) +} + +write_csv(counts_norm, name_expression_fn) +write_csv(res_for_writing, name_deg_fn) +write_csv(pathways_for_writing, name_pathways_fn) +``` + +# R session + +List and version of tools used for the DE report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/rnaseq/DE/load_data.R b/inst/templates/rnaseq/DE/load_data.R new file mode 100644 index 0000000..8a1d297 --- /dev/null +++ b/inst/templates/rnaseq/DE/load_data.R @@ -0,0 +1,146 @@ +library(tidyverse) +library(SummarizedExperiment) +library(janitor) +load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){ + + # bcbio input + if (!is.na(se_object)){ + + se <- readRDS(se_object) + metrics <- metadata(se)$metrics %>% as.data.frame() + # left_join(coldata %>% rownames_to_column('sample')) %>% column_to_rownames('sample') + } else { #nf-core input + + # Get metrics from nf-core into bcbio like table + # many metrics are already in the Genereal Table of MultiQC, this reads the file + metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) + + # we get some more metrics from Qualimap and rename columns + metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) + metrics <- metrics %>% full_join(metrics_qualimap) + metrics <- metrics %>% + clean_names() %>% + dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) + + # This uses the fastqc metrics to get total reads + total_reads <- metrics %>% + dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>% + remove_empty(which = 'cols') %>% + dplyr::rename(single_sample = sample) %>% + mutate(sample = gsub('_[12]+$', '', single_sample)) %>% + group_by(sample) %>% + summarize(total_reads = sum(fastqc_raw_total_sequences)) + + # This renames to user-friendly names the metrics columns + metrics <- metrics %>% + dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% + remove_empty(which = 'cols') %>% + full_join(total_reads) %>% + mutate(mapped_reads = samtools_reads_mapped) %>% + mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% + mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% + mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% + mutate(x5_3_bias = qualimap_5_3_bias) + + # Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case + gtf <- NULL + if (genome =="other"){ + gtf <- gtf_fn + }else{ + if (genome == "hg38") { + gtf <- "hg38.rna.gtf.gz" + } else if (genome == "mm10") { + gtf <- "mm10.rna.gtf.gz" + } else if (genome == "mm39") { + gtf <- "mm39.rna.gtf.gz" + } + gtf <- system.file("extdata", "annotation", + gtf, + package="bcbioR") + } + if (is.null(gtf)) { + print("No genome provided! Please add it at the top of this Rmd") + } + + gtf=rtracklayer::import(gtf) + + + one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) + another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) + biotype=NULL + if(length(one)==1){ + biotype=one + }else if(length(another)==1){ + biotype=another + }else{ + warning("No gene biotype founded") + } + + if (!is.null(biotype)){ + annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] + rRNA=grepl("rRNA|tRNA",annotation[[biotype]]) + genes=intersect(annotation[rRNA,"gene_id"],row.names(counts)) + ratio=data.frame(sample=colnames(counts), + r_and_t_rna_rate=colSums(counts[genes,])/colSums(counts)) + metrics = left_join(metrics, ratio, by="sample") + }else{ + metrics[["r_and_t_rna_rate"]] <- NA + } + + # if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){ + # metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna) + # }else{ + # metrics[["r_rna_rate"]] <- NA + # } + metrics=metrics[,c("sample","mapped_reads","exonic_rate","intronic_rate", + "total_reads", + "x5_3_bias", "r_and_t_rna_rate","intergenic_rate")] + } + metrics$sample <- make.names(metrics$sample) + rownames(metrics) <- metrics$sample + return(metrics) +} + +load_coldata <- function(coldata_fn, column, numerator, denominator, subset_column = NULL, subset_value = NULL){ + coldata=read.csv(coldata_fn) %>% + dplyr::select(!matches("fastq") & !matches("strandness")) %>% + distinct() + if('description' %in% names(coldata)){ + coldata$sample <- tolower(coldata$description) + } + coldata <- coldata %>% distinct(sample, .keep_all = T) + stopifnot(column %in% names(coldata)) + + # use only some samples, by default use all + if (!is.null(subset_column)){ + coldata <- coldata[coldata[[paste(subset_column)]] == subset_value, ] + } + #coldata <- coldata[coldata[[paste(column)]] %in% c(numerator, denominator), ] + #browser() + coldata$sample <- make.names(coldata$sample) + rownames(coldata) <- coldata$sample + coldata$description <- coldata$sample + + coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) + + return(coldata) +} + +load_counts <- function(counts_fn){ + + # bcbio input + if(grepl('csv', counts_fn)){ + counts <- read_csv(counts_fn) %>% + mutate(gene = str_replace(gene, pattern = "\\.[0-9]+$", "")) %>% + column_to_rownames('gene') + colnames(counts) = tolower(colnames(counts)) + return(counts) + } else { # nf-core input + counts <- read_tsv(counts_fn) %>% dplyr::select(-gene_name) %>% + mutate(gene_id = str_replace(gene_id, pattern = "\\.[0-9]+$", "")) %>% + column_to_rownames('gene_id') %>% round + + return(counts) + } + +} diff --git a/inst/templates/rnaseq/DE/params_de-example.R b/inst/templates/rnaseq/DE/params_de-example.R new file mode 100644 index 0000000..cc75ad2 --- /dev/null +++ b/inst/templates/rnaseq/DE/params_de-example.R @@ -0,0 +1,18 @@ +# project params +date = "YYYYMMDD" +basedir <- './' # where to write down output files + +# params for bcbio +# coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" +# counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' +# se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") +# + +# Example data +coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' +counts_fn=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv') +# This folder is in the output directory inside multiqc folder +multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' +# This file is inside the genome folder in the output directory +gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' +se_object = NA diff --git a/inst/templates/rnaseq/DE/params_de.R b/inst/templates/rnaseq/DE/params_de.R new file mode 100644 index 0000000..8426428 --- /dev/null +++ b/inst/templates/rnaseq/DE/params_de.R @@ -0,0 +1,22 @@ +# project params +date = "YYYYMMDD" +basedir <- './' # where to write down output files + +# params for bcbio +# coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" +# counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' +# se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") +# + +# params for nfcore +# Your data +# This is the file used to run nf-core or compatible to that +coldata_fn='/Path/to/metadata/meta.csv' +# This file is inside star_salmon/ folder +counts_fn='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv' +# This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder +multiqc_data_dir='/path/to/nf-core/output/star_salmon/multiqc_report_data' +# This file is inside the genome folder in the output directory, use this only non-model organism +# gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' +se_object = NA + diff --git a/inst/templates/rnaseq/DE/run_markdown.R b/inst/templates/rnaseq/DE/run_markdown.R new file mode 100644 index 0000000..79e15a0 --- /dev/null +++ b/inst/templates/rnaseq/DE/run_markdown.R @@ -0,0 +1,32 @@ +library(rmarkdown) +# set working directory to this file before using the function + + +# set directory to this file folder +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +# example running with test data +render_de <- function(column, numerator, denominator, subset_value = NULL, + params_file = 'params_de-testdata.R'){ + + rmarkdown::render(input = "DEG.Rmd", + output_dir = ".", + output_format = "html_document", + output_file = ifelse(!is.null(subset_value), + paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), + paste0('DE_', numerator, '_vs_', denominator, '.html') + ), + clean = TRUE, + envir = new.env(), + params = list( + column = column, + subset_value = subset_value, + numerator = numerator, + denominator = denominator, + params_file = params_file, + project_file = '../information.R', + functions_file = 'load_data.R' + ) + ) +} +#Example data +render_de("sample_type","tumor", "normal") diff --git a/inst/templates/rnaseq/QC/QC-bcbio.Rmd b/inst/templates/rnaseq/QC/QC-bcbio.Rmd new file mode 100644 index 0000000..aca91c1 --- /dev/null +++ b/inst/templates/rnaseq/QC/QC-bcbio.Rmd @@ -0,0 +1,409 @@ +--- +title: "Quality Control" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + params_file: params_qc.R +--- + + +```{r source_params, echo = F} +metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv' +se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") +``` + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) +library(knitr) +library(DESeq2) +library(DEGreport) +library(ggrepel) +library(pheatmap) +# library(RColorBrewer) +library(DT) +library(pheatmap) +library(bcbioR) +ggplot2::theme_set(theme_light(base_size = 14)) +opts_chunk[["set"]]( + cache = FALSE, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4) +``` + + +```{r subchunkify, echo=FALSE, eval=FALSE} +#' Create sub-chunks for plots +#' +#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots +#' +#' @param pl a plot object +#' @param fig.height figure height +#' @param fig.width figure width +#' @param chunk_name name of the chunk +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' +subchunkify = function(pl, + fig.height = 7, + fig.width = 5, + chunk_name = 'plot') { + pl_deparsed = paste0(deparse(function() { + pl + }), collapse = '') + + sub_chunk = paste0( + "```{r ", + chunk_name, + ", fig.height=", + fig.height, + ", fig.width=", + fig.width, + ", dpi=72", + ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}", + "\n(", + pl_deparsed, + ")()", + "\n```" + ) + + cat(knitr::knit( + text = knitr::knit_expand(text = sub_chunk), + quiet = TRUE + )) +} + +``` + + +```{r sanitize-datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + + +# Samples and metadata + +```{r load_metadata} +meta_df=read_csv(metadata_fn) %>% mutate(sample = tolower(description)) %>% + dplyr::select(-description) + +ggplot(meta_df, aes(sample_type, fill = sample_type)) + + geom_bar() + ylab("") + xlab("") + + scale_fill_cb_friendly() +``` + + +```{r show-metadata} +se <- readRDS(se_object) #local + + +metrics <- metadata(se)$metrics %>% + full_join(meta_df , by = c("sample" = "sample")) + +meta_sm <- meta_df %>% + as.data.frame() %>% + column_to_rownames("sample") + +meta_sm %>% sanitize_datatable() + +``` + +# Read metrics {.tabset} + +## Total reads + +Here, we want to see consistency and a minimum of 20 million reads. + +```{r plot_total_reads} +metrics %>% + ggplot(aes(x = sample_type, + y = total_reads, + color = sample_type)) + + geom_point(alpha=0.5) + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_color_cb_friendly() + + ggtitle("Total reads") + +``` + +```{r calc_min_max_pct_mapped} +#get min percent mapped reads for reference +min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1) +max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) +``` + +## Mapping rate + +The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping. These samples have mapping rates (`r min_pct_mapped` - `r max_pct_mapped`%). + +```{r plot_mapping_rate} +metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1) +metrics %>% + ggplot(aes(x = sample_type, + y = mapped_reads_pct, + color = sample_type)) + + geom_point() + + coord_flip() + + scale_color_cb_friendly() + + ylim(0, 100) + + ggtitle("Mapping rate") + + geom_hline(yintercept=70, color = cb_friendly_cols('blue')) +``` + + +## Number of genes detected + +The number of genes represented in every sample is expected to be consistent and over 20K (blue line). + +```{r plot_genes_detected} +genes_detected <- colSums(assays(se)[["raw"]] > 0) %>% enframe() +sample_names <- metrics[,c("sample"), drop=F] +genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample")) +genes_detected <- genes_detected %>% group_by(name) +genes_detected <- summarise(genes_detected, + n_genes = max(value)) + +metrics <- metrics %>% + left_join(genes_detected, by = c("sample" = "name")) +ggplot(metrics,aes(x = sample_type, + y = n_genes, color = sample_type)) + + geom_point() + + coord_flip() + + scale_color_cb_friendly() + + ggtitle("Number of genes") + + ylab("Number of genes") + + xlab("") + + geom_hline(yintercept=20000, color = cb_friendly_cols('blue')) +``` + + +## Gene detection saturation + +This plot shows how complex the samples are. We expect samples with more reads to detect more genes. + +```{r plot_gene_saturation} +metrics %>% + ggplot(aes(x = total_reads, + y = n_genes, + color = sample_type)) + + geom_point()+ + scale_x_log10() + + scale_color_cb_friendly() + + ggtitle("Gene saturation") + + ylab("Number of genes") +``` + +## Exonic mapping rate + +Here we are looking for consistency, and exonic mapping rates around 70% or 75% (blue and red lines, respectively). + +```{r plot_exonic_mapping_rate} +metrics %>% + ggplot(aes(x = sample_type, + y = exonic_rate * 100, + color = sample_type)) + + geom_point() + + ylab("Exonic rate %") + + ggtitle("Exonic mapping rate") + + scale_color_cb_friendly() + + coord_flip() + + xlab("") + + ylim(c(0,100)) + + geom_hline(yintercept=70, color = cb_friendly_cols('blue')) + + geom_hline(yintercept=75, color = cb_friendly_cols('brown')) +``` + +## Intronic mapping rate + +Here, we expect a low intronic mapping rate (≤ 15% - 20%) + +```{r plot_intronic_mapping_rate} +metrics %>% + ggplot(aes(x = sample_type, + y = intronic_rate * 100, + color = sample_type)) + + geom_point() + + ylab("Intronic rate %") + + ggtitle("Intronic mapping rate") + + scale_color_cb_friendly() + + coord_flip() + + xlab("") + + ylim(c(0,100)) + + geom_hline(yintercept=20, color = cb_friendly_cols('blue')) + + geom_hline(yintercept=15, color = cb_friendly_cols('brown')) +``` + +## Intergenic mapping rate + +Here, we expect a low intergenic mapping rate, which is true for all samples. + +```{r plot_intergenic_mapping_rate} +metrics %>% + ggplot(aes(x = sample_type, + y = intergenic_rate * 100, + color = sample_type)) + + geom_point() + + ylab("Intergenic rate %") + + ggtitle("Intergenic mapping rate") + + coord_flip() + + scale_color_cb_friendly() + + ylim(c(0, 100)) +``` + +## rRNA mapping rate + +Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10% + +```{r plot_rrna_mapping_rate} +# for some bad samples it could be > 50% +rrna_ylim <- max(round(metrics$r_rna_rate*100, 2)) + 10 +metrics %>% + ggplot(aes(x = sample_type, + y = r_rna_rate * 100, + color = sample_type)) + + geom_point() + + ylab("rRNA rate, %")+ + ylim(0, rrna_ylim) + + ggtitle("rRNA mapping rate") + + coord_flip() + + scale_color_cb_friendly() +``` + +## 5'->3' bias + +There should be little bias, i.e. the values should be close to 1, or at least consistent among samples + +```{r plot_53_bias} +metrics %>% + ggplot(aes(x = sample_type, + y = x5_3_bias, + color = sample_type)) + + geom_point() + + ggtitle("5'-3' bias") + + coord_flip() + + ylim(c(0.5,1.5)) + + scale_color_cb_friendly()+ + geom_hline(yintercept=1, color = cb_friendly_cols('blue')) +``` + +## Counts per gene - all genes + +We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar + +```{r plot_counts_per_gene} +metrics_small <- metrics %>% dplyr::select(sample, sample_type) +metrics_small <- left_join(sample_names, metrics_small) + +counts <- + assays(se)[["raw"]] %>% + as_tibble() %>% + filter(rowSums(.)!=0) %>% + gather(name, counts) + +counts <- left_join(counts, metrics, by = c("name" = "sample")) + +ggplot(counts, aes(sample_type, + log2(counts+1), + fill = sample_type)) + + geom_boxplot() + + scale_fill_cb_friendly() + + ggtitle("Counts per gene, all non-zero genes") + + scale_color_cb_friendly() +``` + + +# Sample similarity analysis + +In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA). + +## Principal component analysis (PCA) {.tabset} + +Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells). + + +```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} +raw_counts <- assays(se)[["raw"]] %>% + as_tibble() %>% + filter(rowSums(.)!=0) %>% + as.matrix() + +vst <- vst(raw_counts) + +#fix samples names +coldat_for_pca <- as.data.frame(metrics) +rownames(coldat_for_pca) <- coldat_for_pca$sample +coldat_for_pca <- coldat_for_pca[colnames(raw_counts),] +pca1 <- degPCA(vst, coldat_for_pca, + condition = "sample_type", data = T)[["plot"]] +pca2 <- degPCA(vst, coldat_for_pca, + condition = "sample_type", data = T, pc1="PC3", pc2="PC4")[["plot"]] + +pca1 + scale_color_cb_friendly() +pca2 + scale_color_cb_friendly() +``` + + +```{r, eval=FALSE} +variables=degCovariates(vst, coldat_for_pca) +``` + + +```{r clustering fig, fig.width = 10, fig.asp = .62} +## Hierarchical clustering + +vst_cor <- cor(vst) + +annotation_cols <- cb_friendly_pal('grey')(length(unique(coldat_for_pca$sample_type))) +names(annotation_cols) <- unique(coldat_for_pca$sample_type) + +p <- pheatmap(vst_cor, + annotation = coldat_for_pca %>% select(sample_type) %>% mutate(sample_type = as.factor(sample_type)), + show_rownames = T, + show_colnames = T, + color = cb_friendly_pal('heatmap')(15), + annotation_colors = list(sample_type = annotation_cols) +) +p + +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/rnaseq/QC/QC_nf-core.Rmd b/inst/templates/rnaseq/QC/QC_nf-core.Rmd new file mode 100644 index 0000000..dee4712 --- /dev/null +++ b/inst/templates/rnaseq/QC/QC_nf-core.Rmd @@ -0,0 +1,585 @@ +--- +title: "Quality Control" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + # params_file: params_qc_nf-core-example.R # example data + # Fill this file with the right paths to nfcore output + params_file: params_qc_nf-core.R + # Put hg38, mm10, mm39, or other + genome: hg38 + project_file: ../information.R + factor_of_interest: sample_type +--- + +```{r} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +``` + + +```{r source_params, echo = F} +# 1. set up factor_of_interest parameter from parameter above or manually +# this is used to color plots, it needs to be part of the metadata +factor_of_interest=params$factor_of_interest +genome=params$genome +# 2. Set input files in this file +source(params$params_file) +# 3. If you set up this file, project information will be printed below and +#. it can be reused for other Rmd files. +source(params$project_file) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) +library(knitr) +library(rtracklayer) +library(DESeq2) +library(DEGreport) +library(ggrepel) +# library(RColorBrewer) +library(DT) +library(pheatmap) +library(bcbioR) +library(janitor) +ggplot2::theme_set(theme_light(base_size = 14)) +opts_chunk[["set"]]( + cache = FALSE, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4) +``` + + +```{r subchunkify, echo=FALSE, eval=FALSE} +#' Create sub-chunks for plots +#' +#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots +#' +#' @param pl a plot object +#' @param fig.height figure height +#' @param fig.width figure width +#' @param chunk_name name of the chunk +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' +subchunkify = function(pl, + fig.height = 7, + fig.width = 5, + chunk_name = 'plot') { + pl_deparsed = paste0(deparse(function() { + pl + }), collapse = '') + + sub_chunk = paste0( + "```{r ", + chunk_name, + ", fig.height=", + fig.height, + ", fig.width=", + fig.width, + ", dpi=72", + ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}", + "\n(", + pl_deparsed, + ")()", + "\n```" + ) + + cat(knitr::knit( + text = knitr::knit_expand(text = sub_chunk), + quiet = TRUE + )) +} + +``` + + +```{r sanitize-datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + + +# Samples and metadata + + +```{r load_metadata} + +meta_df=read_csv(metadata_fn) %>% + arrange(.data[[factor_of_interest]]) %>% + distinct(sample, .keep_all = T) %>% + dplyr::select(!matches("fastq"), !matches("strandness")) +meta_df$sample <- make.names(meta_df$sample) +order <- meta_df$sample + +ggplot(meta_df, aes(.data[[factor_of_interest]], + fill = .data[[factor_of_interest]])) + + geom_bar() + ylab("") + xlab("") + ylab("# of samples") + + scale_fill_cb_friendly() + theme(axis.text.x=element_text(angle = 90, vjust = 0.5), legend.position = "none") +``` + + +```{r load_data} +# read counts from SE object +se <- readRDS(se_object) +raw_counts <- assays(se)[["counts"]] %>% round() %>% + as.matrix() +raw_counts=raw_counts[rowSums(raw_counts)!=0,] +``` + +```{r prepare_metrics} +# Get metrics from nf-core into bcbio like table +# many metrics are already in the General Table of MultiQC, this reads the file +metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) + +# we get some more metrics from Qualimap and rename columns +metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) +metrics <- metrics %>% full_join(metrics_qualimap) +metrics <- metrics %>% + clean_names() %>% + dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) + +# This uses the fastqc metrics to get total reads +total_reads <- metrics %>% + dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>% + remove_empty(which = 'cols') %>% + dplyr::rename(single_sample = sample) %>% + mutate(sample = gsub('_[12]+$', '', single_sample)) %>% + group_by(sample) %>% + summarize(total_reads = sum(fastqc_raw_total_sequences)) + +# This renames to user-friendly names the metrics columns +metrics <- metrics %>% + dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% + remove_empty(which = 'cols') %>% + full_join(total_reads) %>% + mutate(mapped_reads = samtools_reads_mapped) %>% + mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% + mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% + mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% + mutate(x5_3_bias = qualimap_5_3_bias) + +# Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case +gtf <- NULL +if (genome =="other"){ + gtf <- gtf_fn +}else{ + if (genome == "hg38") { + gtf <- "hg38.rna.gtf.gz" + } else if (genome == "mm10") { + gtf <- "mm10.rna.gtf.gz" + } else if (genome == "mm39") { + gtf <- "mm39.rna.gtf.gz" + } + gtf <- system.file("extdata", "annotation", + gtf, + package="bcbioR") +} +if (is.null(gtf)) { + print("No genome provided! Please add it at the top of this Rmd") +} + +gtf=rtracklayer::import(gtf) + +one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) +another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) +biotype=NULL +if(length(one)==1){ + biotype=one +}else if(length(another)==1){ + biotype=another +}else{ + warning("No gene biotype founded") +} + +if (!is.null(biotype)){ + annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] + rRNA=grepl("rRNA|tRNA",annotation[[biotype]]) + genes=intersect(annotation[rRNA,"gene_id"],row.names(raw_counts)) + ratio=data.frame(sample=colnames(raw_counts), + r_and_t_rna_rate=colSums(raw_counts[genes,])/colSums(raw_counts)) + metrics = left_join(metrics, ratio, by="sample") +}else{ + metrics[["r_and_t_rna_rate"]] <- NA +} + +# if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){ +# metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna) +# }else{ +# metrics[["r_rna_rate"]] <- NA +# } +metrics=metrics[,c("sample","mapped_reads","exonic_rate","intronic_rate", + "total_reads", + "x5_3_bias", "r_and_t_rna_rate","intergenic_rate")] +metrics$sample <- make.names(metrics$sample) +metrics <- metrics %>% + full_join(meta_df , by = c("sample" = "sample")) %>% + dplyr::select(where(~!all(is.na(.)))) + +``` + +```{r show_metadata} +meta_sm <- meta_df %>% + as.data.frame() %>% + column_to_rownames("sample") + +meta_sm %>% sanitize_datatable() + +``` + +# Read metrics {.tabset} + +## Total reads + +Here, we want to see consistency and a minimum of 20 million reads (the grey line). + +```{r plot_total_reads} +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = total_reads, + fill = .data[[factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Total reads") + + geom_hline(yintercept=20000000, color = "grey", linewidth=2) + +metrics %>% + ggplot(aes(x = .data[[factor_of_interest]], + y = total_reads, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_color_cb_friendly() + xlab("") + + ggtitle("Total reads") +``` + +```{r calc_min_max_pct_mapped} +#get min percent mapped reads for reference +min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1) +max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) +``` + +## Mapping rate + +The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping (the grey line). These samples have mapping rates: `r min_pct_mapped` - `r max_pct_mapped`%. + +```{r plot_mapping_rate} +metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1) +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = mapped_reads_pct, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + coord_flip() + + scale_x_discrete(limits = rev) + + scale_color_cb_friendly() + + ylim(0, 100) + + ggtitle("Mapping rate") + xlab("") + + geom_hline(yintercept=70, color = "grey", linewidth=2) +``` + + +## Number of genes detected + +The number of genes represented in every sample is expected to be consistent and over 20K (grey line). + +```{r calc_genes_detected} +genes_detected <- colSums(assays(se)[["counts"]] > 0) %>% enframe() +sample_names <- metrics[,c("sample"), drop=F] +genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample")) +genes_detected <- genes_detected %>% group_by(name) +genes_detected <- summarise(genes_detected, + n_genes = max(value)) + +metrics <- metrics %>% + left_join(genes_detected, by = c("sample" = "name")) + +``` + + +```{r plot_genes_detected} +ggplot(metrics,aes(x = factor(sample, level = order), + y = n_genes, fill = .data[[factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_fill_cb_friendly() + + scale_x_discrete(limits = rev) + + ggtitle("Number of genes") + + ylab("Number of genes") + + xlab("") + + geom_hline(yintercept=20000, color = "grey", linewidth=2) + +metrics %>% + ggplot(aes(x = .data[[factor_of_interest]], + y = n_genes, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + coord_flip() + + scale_x_discrete(limits = rev) + + scale_y_continuous(name = "million reads") + + scale_color_cb_friendly() + xlab("") + + ggtitle("Number of Genes") + +``` + + + +## Gene detection saturation + +This plot shows how complex the samples are. We expect samples with more reads to detect more genes. + +```{r plot_gene_saturation} +metrics %>% + ggplot(aes(x = total_reads, + y = n_genes, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + scale_x_log10() + + scale_color_cb_friendly() + + ggtitle("Gene saturation") + + ylab("Number of genes") +``` + +## Exonic mapping rate + +Here we are looking for consistency, and exonic mapping rates around or above 70% (grey line). + +```{r plot_exonic_mapping_rate} +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = exonic_rate * 100, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + ylab("Exonic rate %") + + ggtitle("Exonic mapping rate") + + scale_x_discrete(limits = rev) + + scale_color_cb_friendly() + + coord_flip() + + xlab("") + + ylim(c(0,100)) + + geom_hline(yintercept=70, color = "grey", linewidth=2) +``` + +## Intronic mapping rate + +Here, we expect a low intronic mapping rate (≤ 15% - 20%). The grey line indicates 20%. + +```{r plot_intronic_mapping_rate} +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = intronic_rate * 100, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + ylab("Intronic rate %") + + ggtitle("Intronic mapping rate") + + scale_x_discrete(limits = rev) + + scale_color_cb_friendly() + + coord_flip() + + xlab("") + + ylim(c(0,100)) + + geom_hline(yintercept=20, color = "grey", linewidth=2) +``` + +## Intergenic mapping rate + +Here, we expect a low intergenic mapping rate, which is true for all samples. The grey line indicates 15% + +```{r plot_intergenic_mapping_rate} +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = intergenic_rate * 100, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + ylab("Intergenic rate %") + + ggtitle("Intergenic mapping rate") + + coord_flip() + xlab("") + + scale_x_discrete(limits = rev) + + scale_color_cb_friendly() + + ylim(c(0, 100)) + + geom_hline(yintercept=15, color = "grey", linewidth=2) +``` + +## tRNA/rRNA mapping rate + +Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10% (the grey line). + +```{r plot_rrna_mapping_rate} + +rrna_ylim <- max(round(metrics$r_and_t_rna_rate*100, 2)) + 10 +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = r_and_t_rna_rate * 100, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + ylab("tRNA/rRNA rate, %")+ + ylim(0, rrna_ylim) + + ggtitle("tRNA/rRNA mapping rate") + + coord_flip() + + scale_x_discrete(limits = rev) + + scale_color_cb_friendly() + + ylim(c(0, 100)) + xlab("") + + geom_hline(yintercept=10, color = "grey", linewidth=2) +``` + +## 5'->3' bias + +There should be little bias, i.e. the values should be close to 1, or at least consistent among samples + +```{r plot_53_bias} +metrics %>% + ggplot(aes(x = factor(sample, level = order), + y = x5_3_bias, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + ggtitle("5'-3' bias") + + coord_flip() + + scale_x_discrete(limits = rev) + + ylim(c(0.5,1.5)) + xlab("") + ylab("5'-3' bias") + + scale_color_cb_friendly()+ + geom_hline(yintercept=1, color = "grey", linewidth=2) +``` + +## Counts per gene - all genes + +We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar + +```{r plot_counts_per_gene} +metrics_small <- metrics %>% dplyr::select(sample, .data[[factor_of_interest]]) +metrics_small <- left_join(sample_names, metrics_small) + +counts <- + assays(se)[["counts"]] %>% + as_tibble() %>% + filter(rowSums(.)!=0) %>% + gather(name, counts) + +counts <- left_join(counts, metrics_small, by = c("name" = "sample")) + +ggplot(counts, aes(factor(name, level = order), + log2(counts+1), + fill = .data[[factor_of_interest]])) + + geom_boxplot() + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + + coord_flip() + xlab("") + + ggtitle("Counts per gene, all non-zero genes") + + scale_color_cb_friendly() +``` + + +# Sample similarity analysis + +In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA). + +## Principal component analysis (PCA) + +Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells). + + +```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} + +vst <- vst(raw_counts) + +coldat_for_pca <- as.data.frame(metrics) +rownames(coldat_for_pca) <- coldat_for_pca$sample +coldat_for_pca <- coldat_for_pca[colnames(raw_counts),] +pca1 <- degPCA(vst, coldat_for_pca, + condition = factor_of_interest, data = T)[["plot"]] +pca2 <- degPCA(vst, coldat_for_pca, + condition = factor_of_interest, data = T, pc1="PC3", pc2="PC4")[["plot"]] + + + +pca1 + scale_color_cb_friendly() +pca2 + scale_color_cb_friendly() + +``` + +# Covariates analysis + +When there are multiple factors that can influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. This method adapts the method described by Daily et al. for which they integrated a method to correlate covariates with principal components values to determine the importance of each factor. + +```{r covariate-plot,fig.height=12, fig.width=10} +## Remove non-useful columns output by nf-core +coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness", "qualimap_5_3_bias"))]) + +# Remove missing data +coldat_2 <- na.omit(coldat_2) +degCovariates(vst, metadata = coldat_2) +``` + +## Hierarchical clustering + +Inter-correlation analysis (ICA) is another way to look at how well samples +cluster by plotting the correlation between the expression profiles of the +samples. + +```{r clustering fig, fig.width = 10, fig.asp = .62} + +vst_cor <- cor(vst) + +colma=meta_df %>% as.data.frame() +rownames(colma) <- colma$sample +colma <- colma[rownames(vst_cor), ] +colma <- colma %>% dplyr::select(.data[[factor_of_interest]]) +anno_colors=lapply(colnames(colma), function(c){ + l.col=cb_friendly_pal('grey')(length(unique(colma[[c]]))) + names(l.col)=unique(colma[[c]]) + l.col +}) +names(anno_colors)=colnames(colma) + +p <- pheatmap(vst_cor, + annotation = colma, + annotation_colors = anno_colors, + show_rownames = T, + show_colnames = T, + color = cb_friendly_pal('heatmap')(15) + ) +p +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/rnaseq/QC/params_qc.R b/inst/templates/rnaseq/QC/params_qc.R new file mode 100644 index 0000000..84dabfa --- /dev/null +++ b/inst/templates/rnaseq/QC/params_qc.R @@ -0,0 +1,4 @@ +# info params + +metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv' +se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") diff --git a/inst/templates/rnaseq/QC/params_qc_nf-core-example.R b/inst/templates/rnaseq/QC/params_qc_nf-core-example.R new file mode 100644 index 0000000..dae62ce --- /dev/null +++ b/inst/templates/rnaseq/QC/params_qc_nf-core-example.R @@ -0,0 +1,9 @@ +# info params + +# Example data: COMMENT THESE LINE IF YOU ARE USING YOUR DATA +metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' +se_object=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.rds') +# This folder is in the output directory inside multiqc folder +multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' +# This file is inside the genome folder in the output directory +gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' diff --git a/inst/templates/rnaseq/QC/params_qc_nf-core.R b/inst/templates/rnaseq/QC/params_qc_nf-core.R new file mode 100644 index 0000000..08b3ec0 --- /dev/null +++ b/inst/templates/rnaseq/QC/params_qc_nf-core.R @@ -0,0 +1,11 @@ +# info params + +# Your data +# This is the file used to run nf-core or compatible to that +metadata_fn='/Path/to/metadata/meta.csv' +# This file is inside star_salmon/ folder +se_object='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.rds' +# This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder +multiqc_data_dir='/path/to/nf-core/output/multiqc/star_salmon/multiqc_report_data' +# This file is inside the genome folder in the output directory, use this only for non-model organism +# gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' diff --git a/inst/templates/rnaseq/QC/run_markdown.R b/inst/templates/rnaseq/QC/run_markdown.R new file mode 100644 index 0000000..51acbef --- /dev/null +++ b/inst/templates/rnaseq/QC/run_markdown.R @@ -0,0 +1,13 @@ +library(rmarkdown) + +# set directory to this file folder +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +# example running with test data +rmarkdown::render("QC_nf-core.Rmd", + output_dir = ".", + clean = TRUE, + output_format = "html_document", + params = list( + params_file = 'params_qc_nf-core-testdata.R', + project_file = '../information.R') + ) diff --git a/inst/templates/rnaseq/README.md b/inst/templates/rnaseq/README.md new file mode 100644 index 0000000..6de2117 --- /dev/null +++ b/inst/templates/rnaseq/README.md @@ -0,0 +1,66 @@ +# Guideline for RNAseq downstream analysis + +Make sure there is a project name for this. + +## Run data with nf-core rnaseq + +- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) +- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input/rawdata` folder +- Prepare the CSV file according this [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: + +```csv +sample,fastq_1,fastq_2,strandedness +CONTROL_REP1,s3path/AEG588A1_S1_L002_R1_001.fastq.gz,s3path/AEG588A1_S1_L002_R2_001.fastq.gz,auto +CONTROL_REP1,s3path/AEG588A1_S1_L003_R1_001.fastq.gz,s3path/AEG588A1_S1_L003_R2_001.fastq.gz,auto +CONTROL_REP1,s3path/AEG588A1_S1_L004_R1_001.fastq.gz,s3path/AEG588A1_S1_L004_R2_001.fastq.gz,auto +``` + +Use `bcbio_nfcore_check(csv_file)` to check the file is correct. + +You can add more columns to this file with more metadata, and use this file as the `coldata` file in the templates. + +- Upload file to our `Datasets` in Seqera using the name of the project but starting with `rnaseq-pi_lastname-hbc_code` +- Go to `Launchpad`, select `nf-core_rnaseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` + - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 +- When pipeline is down, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder + +## Downstream analysis + +Please, modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. + +### QC + +`QC/QC.Rmd` is a template for QC metrics. Use `params_qc.R` for `bcbio` + or `QC/QC_nf-core.Rmd` `params_qc_nf-core.R` for `nf-core/rnaseq` outputs. + +Read instruction in the R and Rmd scripts to render it. + +### DE + +`DE/DEG.Rmd` is a template for two groups comparison. `params_de.R` has the information of the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. + +On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: + +- sub-setting data +- two groups comparison +- volcano plot +- MA plot +- Pathway analysis +- Tables + +## DropBox + +- In `reports/QC` + - [ ] copy `bcbio-se.rds` and `tximport-counts.csv` + - [ ] copy QC `Rmd/R/html/figures` +- In `reports/DE` + - [ ] Normalized counts for all genes x all samples (csv format) +- In `reports/DE`, for *each analysis*: + - **Note** For multiple comparisons/analysis, do a single report/template if possible in the parent folder using parameters whenever possible. + - Create a folder with the comparison names in the files. Numbering by comparison (`01.1_DE_comp1`, `01.2_DE_comp2`, etc.). If you’re running multiple models for the same comparison, append `_M#`. Add the following files under each folder: + - [ ] Normalized count table with the samples used in this analysis/comparison. + - [ ] Full results `DESeq2` for all genes (csv format) with annotation columns appended. + - [ ] Significant genes results file (subset of annotated full results by chosen p-value and LFC). Separate files will be created for each individual contrast. + - [ ] Significant genes results file as described above, but additionally append columns containing normalized count values for each sample. + - Make sure to append the gene symbols to these tables so the researcher can interpret the results. + diff --git a/inst/templates/rnaseq/information.R b/inst/templates/rnaseq/information.R new file mode 100644 index 0000000..6e15eef --- /dev/null +++ b/inst/templates/rnaseq/information.R @@ -0,0 +1,6 @@ +# info params +project = "name_hbcXXXXX" +PI = 'person name' +experiment = 'short description' +aim = 'short description' +analyst = 'person in the core' diff --git a/inst/templates/rnaseq/org/hcbc_README.md b/inst/templates/rnaseq/org/hcbc_README.md new file mode 100644 index 0000000..50f8f16 --- /dev/null +++ b/inst/templates/rnaseq/org/hcbc_README.md @@ -0,0 +1,75 @@ +# Guideline for RNAseq downstream analysis + +Make sure there is a project name for this. + +## Run data with nf-core rnaseq + +- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) +- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input/rawdata` folder +- Prepare the CSV file according this [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: + +```csv +sample,fastq_1,fastq_2,strandedness +CONTROL_REP1,s3path/AEG588A1_S1_L002_R1_001.fastq.gz,s3path/AEG588A1_S1_L002_R2_001.fastq.gz,auto +CONTROL_REP1,s3path/AEG588A1_S1_L003_R1_001.fastq.gz,s3path/AEG588A1_S1_L003_R2_001.fastq.gz,auto +CONTROL_REP1,s3path/AEG588A1_S1_L004_R1_001.fastq.gz,s3path/AEG588A1_S1_L004_R2_001.fastq.gz,auto +``` + +Use `bcbio_nfcore_check(csv_file)` to check the file is correct. + +You can add more columns to this file with more metadata, and use this file as the `coldata` file the templates. + +- Upload file to our `Datasets` in Seqera using the name of the project but starting with `nfcore-rnaseq` +- Go to `Launchpad`, select `nf-core_rnaseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` + - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 +- When pipeline is down, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder + +## Downstream analysis + +Please, modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. + +### QC + +`QC/QC.Rmd` is a template for QC metrics. Use `params_qc.R` for `bcbio` + or `QC/QC_nf-core.Rmd` `params_qc_nf-core.R` for `nf-core/rnaseq` outputs. + +Read instruction in the R and Rmd scripts to render it. + +### DE + +`DE/DEG.Rmd` is a template for two groups comparison. `params_de.R` has the information of the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. + +On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: + +- sub-setting data +- two groups comparison +- volcano plot +- MA plot +- Pathway analysis +- Tables + +There are some code related to alternative analysis: + +- `DE/Multiplicative_DE_docs.md` that shows some cases when there is multiple variables in the model with multiple levels: sex (2 levels) and genotype (4 levels) + +## DropBox + +- In `reports/QC` + - [ ] copy `bcbio-se.rds` and `tximport-counts.csv` + - [ ] copy QC `Rmd/R/html/figures` +- In `reports/DE` + - [ ] Normalized counts for all genes x all samples (csv format) +- In `reports/DE`, for *each analysis*: + - **Note** For multiple comparisons/analysis, do a single report/template if possible in the parent folder using parameters whenever possible. + - Create a folder with the comparison names in the files. Numbering by comparison (`01.1_DE_comp1`, `01.2_DE_comp2`, etc.). If you’re running multiple models for the same comparison, append `_M#`. Add the following files under each folder: + - [ ] Normalized count table with the samples used in this analysis/comparison. + - [ ] Full results `DESeq2` for all genes (csv format) with annotation columns appended. + - [ ] Significant genes results file (subset of annotated full results by chosen p-value and LFC). Separate files will be created for each individual contrast. + - [ ] Significant genes results file as described above, but additionally append columns containing normalized count values for each sample. + - Make sure to append the gene symbols to these tables so the researcher can interpret the results. + +## GitHub + +- [ ] Push all `*Rmd` `*R` files used for the *QC* and *DE* analysis respecting folder structure. + +Please, ignore `*html/figures/csv` and any output of the code. From 3a42cffeaca69229fce01a1f5eda5186be06d6eb Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 31 Jul 2024 18:11:25 -0400 Subject: [PATCH 35/93] fix folders --- R/helpers.R | 26 +- .../singlecell_delux/CellToCell/cellchat.Rmd | 440 ++++++++++++++++++ 2 files changed, 460 insertions(+), 6 deletions(-) create mode 100644 inst/templates/singlecell_delux/CellToCell/cellchat.Rmd diff --git a/R/helpers.R b/R/helpers.R index 799ae39..aba6f42 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -54,12 +54,24 @@ bcbio_templates <- function(type="rnaseq", outpath){ #file.copy(fpath, outpath, recursive = T) copy_templates(outpath, "nf-core/rnaseq") }, - scrnaseq={ + singlecell={ #file.copy(fpath, outpath, recursive = T) copy_templates(outpath, "singlecell") }, + singlecell_delux={ + #file.copy(fpath, outpath, recursive = T) + copy_templates(outpath, "singlecell_delux") + }, + spatial={ + #file.copy(fpath, outpath, recursive = T) + copy_templates(outpath, "spatial") + }, + multiomics={ + #file.copy(fpath, outpath, recursive = T) + copy_templates(outpath, "multiomics") + }, { - stop('project type not recognize, please choose: ', 'rnaseq', 'singlecell') + stop('project type not recognize, please choose: ', 'rnaseq', 'singlecell','singlecell_delux','spatial') } ) } @@ -130,10 +142,12 @@ copy_templates <- function(path, pipeline){ parts = c("templates/rnaseq") }else if(pipeline=="singlecell"){ parts = c("templates/singlecell") - }else if(pipeline=="teaseq"){ - parts = c("templates/teaseq") - }else if(pipeline=="cosmx"){ - parts = c("templates/cosmx") + }else if(pipeline=="singlecell_delux"){ + parts = c("templates/singlecell_delux") + }else if(pipeline=="multiomics"){ + parts = c("templates/multiomics") + }else if(pipeline=="spatial"){ + parts = c("templates/spatial") } analysis_template <- fs::path_package(base, parts) ui_info("Getting templates from {ui_value(analysis_template)}") diff --git a/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd b/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd new file mode 100644 index 0000000..b9da33e --- /dev/null +++ b/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd @@ -0,0 +1,440 @@ +--- +title: "CellChat" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: false + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + project_file: information.R + seurat_fn: ../data/fDat_sn_RC.rds + cellchat_fn: ../data/snrna_cellchat.qs + cellchat_grade2_fn: ../data/snrna_cellchat_grade2.qs + cellchat_grade0_fn: ../data/snrna_cellchat_grade0.qs +--- + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,} + +reticulate::use_virtualenv("/n/app/bcbio/R4.3.1_python_cellchat") +reticulate::py_config() # should show v3.9.14 +Sys.getenv("PYTHONPATH") # should be empty + +current_libs <- .libPaths() +.libPaths(c('/n/app/bcbio/R4.3.1_cellchat/', current_libs)) +library(CellChat) + +library(tidyverse) +library(Seurat) +library(bcbioR) +library(ggprism) +library(knitr) +library(tools) +library(qs) +library(patchwork) +library(ComplexHeatmap) + +options(stringsAsFactors = FALSE) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) + +cellchat_ran <- file.exists(params$cellchat_fn) +cellchat_rejection_ran <- file.exists(params$cellchat_grade2_fn) & file.exists(params$cellchat_grade0_fn) + +``` + +# Clustering + +```{r load_data } + +snrna <- readRDS(params$seurat_fn) + +# in this case, Chris_annot = cell_type +DimPlot(snrna, reduction = 'umap', group.by = 'Chris_annot') + +``` + +```{r prep cellchat inputs, eval = !cellchat_ran } + +# need to use normalized counts as input +data.input <- snrna[["SCT"]]@data +labels <- snrna$Chris_annot +meta <- data.frame(labels = labels, row.names = names(labels), samples = snrna$orig.ident) + +``` + +```{r create cellchat object, eval = !cellchat_ran } +cellchat <- createCellChat(object = data.input, meta = meta, group.by = "labels") + +``` + +```{r set cellchat db, eval = !cellchat_ran} +CellChatDB <- CellChatDB.human +CellChatDB.use <- subsetDB(CellChatDB) +cellchat@DB <- CellChatDB.use + +``` + +```{r subset and preprocess data, eval = !cellchat_ran } + +cellchat <- subsetData(cellchat) +cellchat <- updateCellChat(cellchat) +future::plan("multisession", workers = 8) # recommend running with at 8-16 cores +cellchat <- identifyOverExpressedGenes(cellchat) # may take a couple minutes +cellchat <- identifyOverExpressedInteractions(cellchat) # may take a couple minutes + +``` + +```{r compute communication prob, eval = !cellchat_ran} + +# Not recommended: project gene expression data onto protein-protein interaction network. +# Useful with shallow sequencing depth but introduces many weak communications. +# If used, must set raw.use = FALSE when running computeCommunProb +# cellchat <- projectData(cellchat, PPI.human) + + +# this next command takes 0.5-2+ hours +# can choose various methods for caculating average gene exp per group, +# 'triMean' allegedly produces fewer but stronger interactions +cellchat <- computeCommunProb(cellchat, type = "triMean") + +# filter out the cell-cell communication if < 50 cells per group +cellchat <- filterCommunication(cellchat, min.cells = 50) + +qsave(cellchat, '../data/snrna_cellchat.qs', preset = 'fast') + +``` + +# Overall Results + +```{r load cellchat, eval = cellchat_ran} +cellchat <- qread(params$cellchat_fn) + +df.net <- subsetCommunication(cellchat) %>% dplyr::arrange(pval) +df.net %>% sanitize_datatable() + +``` + +## Top interactions {.tabset} +```{r check pairs, results = 'asis', fig.width = 8, fig.height = 12} + +top_ints <- (df.net %>% pull(interaction_name) %>% unique)[1:10] +for (interaction in top_ints){ + cat('\n') + cat('### ', as.character(interaction), '\n') + interactors <- unlist(strsplit(as.character(interaction), '_')) + p1 <- VlnPlot(snrna, features = interactors, group.by = 'Chris_annot', + pt.size = 0.1, log = T, ncol = 1) + print(p1) + cat('\n') +} + +``` + +```{r compute pathway communication probs, eval = cellchat_ran} +cellchat <- computeCommunProbPathway(cellchat) +cellchat <- aggregateNet(cellchat) + +``` + +## Visualize Cell-Cell Communication Networks + +```{r chord plots, fig.width = 10, fig.height = 8, eval = cellchat_ran} + +groupSize <- as.numeric(table(cellchat@idents)) +par(mfrow = c(1,2), xpd=TRUE) +netVisual_circle(cellchat@net$count, vertex.weight = rowSums(cellchat@net$count), + weight.scale = T, label.edge= F, title.name = "Number of interactions") +netVisual_circle(cellchat@net$weight, vertex.weight = rowSums(cellchat@net$weight), + weight.scale = T, label.edge= F, title.name = "Interaction weights/strength") + +``` + +```{r heatmaps, eval = cellchat_ran} + +netVisual_heatmap(cellchat, measure = "count", color.heatmap = "Blues") +netVisual_heatmap(cellchat, measure = "weight", color.heatmap = "Blues") + +``` + +# Comparison Results + +Here we run the CellChat analysis twice, once on the Grade 2 rejection samples and once on the Grade 0 rejection samples. We compare the significant signaling interactions and investigate changes in them between rejection grades. + +```{r prep inputs rejection, eval=!cellchat_rejection_ran} + +grade2 <- subset(snrna, orig.ident %in% c('BRI-2396', 'BRI-2402')) +grade0 <- subset(snrna, orig.ident %in% c('BRI-2395', 'BRI-2411')) + +data.input_grade2 <- grade2[["SCT"]]@data +labels_grade2 <- grade2$Chris_annot +meta_grade2 <- data.frame(labels = labels_grade2, row.names = names(labels_grade2), samples = grade2$orig.ident) + +data.input_grade0 <- grade0[["SCT"]]@data +labels_grade0 <- grade0$Chris_annot +meta_grade0 <- data.frame(labels = labels_grade0, row.names = names(labels_grade0), samples = grade0$orig.ident) + +``` + +```{r create cellchat object rejection, eval=!cellchat_rejection_ran} +cellchat_grade2 <- createCellChat(object = data.input_grade2, meta = meta_grade2, group.by = "labels") +cellchat_grade0 <- createCellChat(object = data.input_grade0, meta = meta_grade0, group.by = "labels") + +``` + +```{r subset and preprocess data rejection, eval=!cellchat_rejection_ran} + +future::plan("multisession", workers = 8) # recommend running with at 8-16 cores + +cellchat_grade2@DB <- CellChatDB.use +cellchat_grade0@DB <- CellChatDB.use + +cellchat_grade2 <- subsetData(cellchat_grade2) +cellchat_grade2 <- updateCellChat(cellchat_grade2) +cellchat_grade2 <- identifyOverExpressedGenes(cellchat_grade2) # may take a couple minutes +cellchat_grade2 <- identifyOverExpressedInteractions(cellchat_grade2) # may take a couple minutes + +cellchat_grade0 <- subsetData(cellchat_grade0) +cellchat_grade0 <- updateCellChat(cellchat_grade0) +cellchat_grade0 <- identifyOverExpressedGenes(cellchat_grade0) # may take a couple minutes +cellchat_grade0 <- identifyOverExpressedInteractions(cellchat_grade0) # may take a couple minutes + +``` + +```{r compute communication prob rejection, eval=!cellchat_rejection_ran} +cellchat_grade2 <- computeCommunProb(cellchat_grade2, type = "triMean") # command takes 0.5-2+ hours +cellchat_grade2 <- filterCommunication(cellchat_grade2, min.cells = 50) +qsave(cellchat_grade2, params$cellchat_grade2_fn, preset = 'fast') + +cellchat_grade0 <- computeCommunProb(cellchat_grade0, type = "triMean") # command takes 0.5-2+ hours +cellchat_grade0 <- filterCommunication(cellchat_grade0, min.cells = 50) +qsave(cellchat_grade0, params$cellchat_grade0_fn, preset = 'fast') + +``` + +```{r load cellchat rejection, eval = cellchat_rejection_ran} + +cellchat_grade2 <- qread(params$cellchat_grade2_fn) +cellchat_grade0 <- qread(params$cellchat_grade0_fn) + +cellchat_grade2 <- filterCommunication(cellchat_grade2, min.cells = 50) +cellchat_grade0 <- filterCommunication(cellchat_grade0, min.cells = 50) + +df.net_grade2 <- subsetCommunication(cellchat_grade2)%>% dplyr::arrange(pval) +df.net_grade0 <- subsetCommunication(cellchat_grade0)%>% dplyr::arrange(pval) + +``` + +## Grade 2 + +```{r datatable grade 2, eval = cellchat_rejection_ran} +df.net_grade2 %>% sanitize_datatable() + +``` + +### Top interactions {.tabset} +```{r check pairs grade 2, results = 'asis', fig.width = 8, fig.height = 12} + +top_ints <- (df.net_grade2 %>% pull(interaction_name) %>% unique)[1:10] +for (interaction in top_ints){ + cat('\n') + cat('#### ', as.character(interaction), '\n') + interactors <- unlist(strsplit(as.character(interaction), '_')) + p1 <- VlnPlot(snrna, features = interactors, group.by = 'Chris_annot', pt.size = 0.1, log = T, ncol = 1) + print(p1) + cat('\n') +} + +``` + + +## Grade 0 + +```{r datatable grade 0, eval = cellchat_rejection_ran} +df.net_grade0 %>% sanitize_datatable() + +``` + +### Top interactions {.tabset} +```{r check pairs grade 0, results = 'asis', fig.width = 8, fig.height = 12} + +top_ints <- (df.net_grade0 %>% pull(interaction_name) %>% unique)[1:10] +for (interaction in top_ints){ + cat('\n') + cat('#### ', as.character(interaction), '\n') + interactors <- unlist(strsplit(as.character(interaction), '_')) + p1 <- VlnPlot(snrna, features = interactors, group.by = 'Chris_annot', pt.size = 0.1, log = T, ncol = 1) + print(p1) + cat('\n') +} + +``` + +```{r merge rejection objects, eval = cellchat_rejection_ran} + +cellchat_grade2 <- computeCommunProbPathway(cellchat_grade2) +cellchat_grade2 <- aggregateNet(cellchat_grade2) +cellchat_grade2 <- netAnalysis_computeCentrality(cellchat_grade2) +cellchat_grade0 <- computeCommunProbPathway(cellchat_grade0) +cellchat_grade0 <- aggregateNet(cellchat_grade0) +cellchat_grade0 <- netAnalysis_computeCentrality(cellchat_grade0) + +object.list <- list(grade0 = cellchat_grade0, grade2 = cellchat_grade2) +cellchat_merged <- mergeCellChat(object.list, add.names = names(object.list)) + +df.net_merged <- subsetCommunication(cellchat_merged) + +``` + +## Compare Interactions/Interaction Strength + +```{r compare interactions, eval = cellchat_rejection_ran} + +gg1 <- compareInteractions(cellchat_merged, show.legend = F, group = c(1,2)) +gg2 <- compareInteractions(cellchat_merged, show.legend = F, group = c(1,2), measure = "weight") +gg1 + gg2 + +``` + +```{r chord plots merged, eval = cellchat_rejection_ran, fig.width = 10, fig.height = 8} +par(mfrow = c(1,2), xpd=TRUE) +netVisual_diffInteraction(cellchat_merged, weight.scale = T) +netVisual_diffInteraction(cellchat_merged, weight.scale = T, measure = "weight") + +``` + +```{r heatmaps merged, eval = cellchat_rejection_ran, fig.width = 10, fig.height = 8} + +gg1 <- netVisual_heatmap(cellchat_merged) +gg2 <- netVisual_heatmap(cellchat_merged, measure = "weight") +gg1 + gg2 + +``` + +## Compare Major Pathway Sources and Targets + +From the CellChat documentation: "Comparing the outgoing and incoming interaction strength in a 2D space allows ready identification of the cell populations with significant changes in sending or receiving signals between different datasets." + +```{r compare send/receive changes, eval = cellchat_rejection_ran, fig.width = 10, fig.height = 6} + +num.link <- sapply(object.list, function(x) {rowSums(x@net$count) + colSums(x@net$count)-diag(x@net$count)}) +weight.MinMax <- c(min(num.link), max(num.link)) # control the dot size in the different datasets +gg <- list() +for (i in 1:length(object.list)) { + gg[[i]] <- netAnalysis_signalingRole_scatter(object.list[[i]], title = names(object.list)[i], weight.MinMax = weight.MinMax) +} +patchwork::wrap_plots(plots = gg) +``` + + +```{r identify signaling changes, eval = cellchat_rejection_ran, fig.width = 12, fig.height = 12} +gg1 <- netAnalysis_signalingChanges_scatter(cellchat_merged, idents.use = "Vascular_EC") +gg2 <- netAnalysis_signalingChanges_scatter(cellchat_merged, idents.use = "Lymphatic_EC") +gg3 <- netAnalysis_signalingChanges_scatter(cellchat_merged, idents.use = "Pericyte") +patchwork::wrap_plots(plots = list(gg1,gg2,gg3), nrow = 3, ncol = 1) + +``` + +## Cluster Altered Signaling Interactions + +From the CellChat documentation: "CellChat performs joint manifold learning and classification of the inferred communication networks based on their functional and topological similarity across different conditions. + +By quantifying the similarity between the cellular communication networks of signaling pathways across conditions, this analysis highlights the potentially altered signaling pathways. CellChat adopts the concept of network rewiring from network biology and hypothesized that the difference between different communication networks may affect biological processes across conditions. UMAP is used for visualizing signaling relationship and interpreting our signaling outputs in an intuitive way without involving the classification of conditions. + +Functional similarity: High degree of functional similarity indicates major senders and receivers are similar, and it can be interpreted as the two signaling pathways or two ligand-receptor pairs exhibit similar and/or redundant roles. + +Structural similarity: A structural similarity was used to compare their signaling network structure, without considering the similarity of senders and receivers." + + +### Based on Functional Similarity + +```{r identify signaling groups functional, eval = cellchat_rejection_ran} + +cellchat_merged <- computeNetSimilarityPairwise(cellchat_merged, type = "functional") +cellchat_merged <- netEmbedding(cellchat_merged, type = "functional") +cellchat_merged <- netClustering(cellchat_merged, type = "functional") +netVisual_embeddingPairwise(cellchat_merged, type = "functional", label.size = 3.5) + +``` + +### Based on Structural Similarity + +```{r identify signaling groups structural, eval = cellchat_rejection_ran} +cellchat_merged <- computeNetSimilarityPairwise(cellchat_merged, type = "structural") +cellchat_merged <- netEmbedding(cellchat_merged, type = "structural") +cellchat_merged <- netClustering(cellchat_merged, type = "structural") +netVisual_embeddingPairwise(cellchat_merged, type = "structural", label.size = 3.5) +``` + +## Compare Overall Signaling Information Flow + +"CellChat can identify the conserved and context-specific signaling pathways by simply comparing the information flow for each signaling pathway, which is defined by the sum of communication probability among all pairs of cell groups in the inferred network (i.e., the total weights in the network)." + +```{r info flow, fig.height = 9, eval = cellchat_rejection_ran} + +rankNet(cellchat_merged, mode = "comparison", measure = "weight", sources.use = NULL, targets.use = NULL, stacked = F, do.stat = TRUE) + +``` + +## Compare Signaling Patterns Across Cell Populations + +"In this heatmap, colobar represents the relative signaling strength of a signaling pathway across cell groups (Note that values are row-scaled). The top colored bar plot shows the total signaling strength of a cell group by summarizing all signaling pathways displayed in the heatmap. The right grey bar plot shows the total signaling strength of a signaling pathway by summarizing all cell groups displayed in the heatmap." + + +```{r outgoing signaling, fig.height = 9, eval = cellchat_rejection_ran} + +i = 1 +pathway.union <- union(object.list[[i]]@netP$pathways, object.list[[i+1]]@netP$pathways) +ht1 = netAnalysis_signalingRole_heatmap(object.list[[i]], pattern = "outgoing", signaling = pathway.union, title = names(object.list)[i], width = 5, height = 16, cluster.cols = T) +ht2 = netAnalysis_signalingRole_heatmap(object.list[[i+1]], pattern = "outgoing", signaling = pathway.union, title = names(object.list)[i+1], width = 5, height = 16, cluster.cols = T) +draw(ht1 + ht2, ht_gap = unit(0.5, "cm")) +``` + +```{r incoming signaling, fig.height = 9, eval = cellchat_rejection_ran} +ht1 = netAnalysis_signalingRole_heatmap(object.list[[i]], pattern = "incoming", signaling = pathway.union, title = names(object.list)[i], width = 5, height = 16, cluster.cols = T) +ht2 = netAnalysis_signalingRole_heatmap(object.list[[i+1]], pattern = "incoming", signaling = pathway.union, title = names(object.list)[i+1], width = 5, height = 16, cluster.cols = T) +draw(ht1 + ht2, ht_gap = unit(0.5, "cm")) +``` + +## Identify Dysfunctional Interaction Signaling Using Communication Probabilities + +"CellChat can identify the up-regulated (increased) and down-regulated (decreased) signaling ligand-receptor pairs in one dataset compared to the other dataset by comparing the communication probability between two datasets for each L-R pair and each pair of cell groups" + +```{r compare signaling, fig.height = 12, fig.width = 8, eval = cellchat_rejection_ran} + +gg1 <- netVisual_bubble(cellchat_merged, + # sources.use = c('Vascular_EC', 'Lymphatic_EC', 'Pericyte'), + # targets.use = c('Vascular_EC', 'Lymphatic_EC', 'Pericyte'), + comparison = c(1, 2), + max.dataset = 2, + title.name = "Increased signaling in Grade 2", + angle.x = 45, + remove.isolate = T) +gg1 +signaling.grade2_increased = gg1$data From b3fb4af425361286f8d66a9cb7483cab47627b0f Mon Sep 17 00:00:00 2001 From: Alex Bartlett <74612800+abartlett004@users.noreply.github.com> Date: Thu, 1 Aug 2024 15:55:45 -0400 Subject: [PATCH 36/93] Create readme.md --- inst/templates/spatial/cosmx/readme.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 inst/templates/spatial/cosmx/readme.md diff --git a/inst/templates/spatial/cosmx/readme.md b/inst/templates/spatial/cosmx/readme.md new file mode 100644 index 0000000..637c16b --- /dev/null +++ b/inst/templates/spatial/cosmx/readme.md @@ -0,0 +1,16 @@ +# Guidelines for analysis + +Make sure there is a valid project name, and modify `information.R` with the right information for your project. You can use this file with any other Rmd to include the project/analysis information. + +## QC + +`QC/QC.Rmd` is a template for QC metrics. It plots the locations of cells on the slide, filters cells using the number of genes and AtoMX quality flags, and normalizes the data. It also provides sample code for clustering and cell type identification. + +Read instruction in the R and Rmd scripts to render it. + +Note that future versions of this template will include code for building your own RDS object out of .csv.gz files produced by the AtoMX software instead of loading an RDS directly, as this allows the analyst to access information about transcript locations and cell sementation that is not available in the pre-made RDS objects from AtoMx or BWH. + +## DropBox + +- In `reports/QC` + - [ ] copy QC `Rmd/R/html/figures` From 25f15bf255aea54f9489f95e1671968e43815a84 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 1 Aug 2024 16:44:26 -0400 Subject: [PATCH 37/93] remove dependencies, fix base deployment --- DESCRIPTION | 10 +++--- R/helpers.R | 8 ++++- README.md | 1 - vignettes/PCA_variance_analysis.Rmd | 54 ----------------------------- 4 files changed, 12 insertions(+), 61 deletions(-) delete mode 100644 vignettes/PCA_variance_analysis.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index e7d76dc..d4995f8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,12 +11,7 @@ Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.1 Imports: - DESeq2, - stringr, - ggplot2, magrittr, - hues, - ggprism, grDevices, R.utils, readr, @@ -28,6 +23,11 @@ Imports: whisker, rlang Suggests: + hues, + ggprism, + stringr, + ggplot2, + DESeq2, knitr, rmarkdown, testthat (>= 3.0.0) diff --git a/R/helpers.R b/R/helpers.R index aba6f42..49f534a 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -50,6 +50,10 @@ bcbio_nfcore_check <- function(file){ bcbio_templates <- function(type="rnaseq", outpath){ fs::dir_create(outpath) switch(type, + base={ + #file.copy(fpath, outpath, recursive = T) + copy_templates(outpath, "base") + }, rnaseq={ #file.copy(fpath, outpath, recursive = T) copy_templates(outpath, "nf-core/rnaseq") @@ -138,7 +142,9 @@ copy_files_in_folder<- function(origin, remote){ copy_templates <- function(path, pipeline){ base = c("bcbioR") - if (pipeline=="nf-core/rnaseq"){ + if (pipeline=="base"){ + parts = c("templates/base") + }else if(pipeline=="nf-core/rnaseq"){ parts = c("templates/rnaseq") }else if(pipeline=="singlecell"){ parts = c("templates/singlecell") diff --git a/README.md b/README.md index 649e1f6..ef3583a 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,6 @@ The following code will pop up a Rmd template will populate that folder with HCB ``` use_bcbio_projects(path,pipeline="nf-core/rnaseq") use_bcbio_projects(path,pipeline="singlecell") - ``` ### Set RNAseq report folder diff --git a/vignettes/PCA_variance_analysis.Rmd b/vignettes/PCA_variance_analysis.Rmd deleted file mode 100644 index 4074693..0000000 --- a/vignettes/PCA_variance_analysis.Rmd +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "PCA with variance analysis" -author: "Harvard Chan Bioinformatics Core" ---- - -Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm. - - -**Manually add in your covariate of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.** - -```{r } -## Example of creating a group covariate - -# meta$group <- paste0(meta$sex,"_", meta$age,"_",meta$treatment) - -factor_of_interest <- "insert column name for covariate of interest" -``` - - -```{r } -library(DEGreport) -library(ggplot2) -library(ggforce) - -data("bcbio_vsd_data") - -colors=cb_friendly_cols(1:15) -ggplot2::theme_set(theme_prism(base_size = 14)) - -pca <- degPCA(assay(bcbio_vsd_data), colData(bcbio_vsd_data), - condition = factor_of_interest, name = "sample", data = T) - -pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vst), " genes")) + - theme(plot.title=element_text(hjust=0.5)) + - geom_mark_ellipse(aes(color = sample_type)) -``` - -## PERMDISP - -Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) - -Here we apply this test to our variance stabilized data. We calculate distances betwen samples and then use the betadisper() function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the anova() function and the second from the permutest() function. We also get pairwise p-values for every group-group comparison. - -```{r} -library(vegan) -vare.disa <- vegdist(t(assay(bcbio_vsd_data))) - -mod = betadisper(vare.disa, colData(bcbio_vsd_data)[['sample_type']]) -anova(mod) -permutest(mod, pairwise = TRUE) - -``` - - From 8954ba2d11583bc89e357a68f64125808aaadb81 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 1 Aug 2024 16:49:10 -0400 Subject: [PATCH 38/93] remove deseq2 as dep --- DESCRIPTION | 1 - NAMESPACE | 1 - R/bcbioR-package.R | 1 - 3 files changed, 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d4995f8..71aa3e2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,7 +27,6 @@ Suggests: ggprism, stringr, ggplot2, - DESeq2, knitr, rmarkdown, testthat (>= 3.0.0) diff --git a/NAMESPACE b/NAMESPACE index 00ac2cc..0540b95 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,7 +9,6 @@ export(scale_color_cb_friendly) export(scale_fill_cb_friendly) export(use_bcbio_analysis) export(use_bcbio_projects) -import(DESeq2) import(R.utils) import(fs) import(ggplot2) diff --git a/R/bcbioR-package.R b/R/bcbioR-package.R index 903be7b..c8363e6 100644 --- a/R/bcbioR-package.R +++ b/R/bcbioR-package.R @@ -7,7 +7,6 @@ #' @importFrom readr read_csv #' @importFrom stringr str_replace_all ## usethis namespace: end -#' @import DESeq2 #' @import ggplot2 #' @import usethis #' @import fs From e143dd7439a11966bf981683b086b6ec2fe7de91 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Wed, 7 Aug 2024 16:50:38 -0400 Subject: [PATCH 39/93] rough draft of methylation template --- inst/templates/methylation/QC/QC.Rmd | 600 +++++++++++++++++++++++ inst/templates/methylation/information.R | 6 + 2 files changed, 606 insertions(+) create mode 100644 inst/templates/methylation/QC/QC.Rmd create mode 100644 inst/templates/methylation/information.R diff --git a/inst/templates/methylation/QC/QC.Rmd b/inst/templates/methylation/QC/QC.Rmd new file mode 100644 index 0000000..689e322 --- /dev/null +++ b/inst/templates/methylation/QC/QC.Rmd @@ -0,0 +1,600 @@ +--- +title: "Quality Control" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + params_file: ../information.R + meta_fn: ../meta/methylation_mucci_hbc04926.csv +--- + +```{r echo = F} +source(params$params_file) +``` + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,} + +library(tidyverse) +library(bcbioR) +library(ggprism) +library(knitr) +library(tools) +library(qs) +library(janitor) +library(DEGreport) +library(ggrepel) +library(pheatmap) +library(minfi) +library(IlluminaHumanMethylationEPICv2manifest) +library(IlluminaHumanMethylationEPICv2anno.20a1.hg38) +library(methylclock) +options(stringsAsFactors = FALSE) +options(future.globals.maxSize= 891289600) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) + +``` + +```{r sanitize_datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df)), + filter = 'top') +} +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + +The workflow followed in this report is descriped in the documentation [here](https://nbis-workshop-epigenomics.readthedocs.io/en/latest/content/tutorials/methylationArray/Array_Tutorial.html) + +# Stratify samples by DNA quantity + + +```{r examine_sample_quant} + +metadata <- read.csv(params$meta_fn, row.names = 1) %>% clean_names() + +# TODO remove from HERE to next TODO if quantile classification not relevant +thirds_blood <- quantile( + metadata %>% filter(tissue == 'blood') %>% pull(total_ng), + c(0.33, 0.67) +) + +thirds_tissue <- quantile( + metadata %>% filter(tissue != 'blood') %>% pull(ng_dna), + c(0.33, 0.67) +) + +``` + +For blood samples, input DNA quantity was considered low if <= `r round(thirds_blood[1], 2)` ng, high if it was >= `r round(thirds_blood[2], 2)` ng, and medium if in between. + +For tissue samples, input DNA quantity was considered low if <= `r round(thirds_tissue[1], 2)` ng, high if it was >= `r round(thirds_tissue[2], 2)` ng, and medium if in between. + +```{r plot_sample_quant} + +ggplot(metadata, aes(x = ng_dna, fill = tissue)) + + geom_histogram() + + geom_vline(aes(xintercept = thirds_tissue[1])) + + geom_vline(aes(xintercept = thirds_tissue[2])) + + scale_fill_cb_friendly() + +ggplot(metadata, aes(x = total_ng, fill = tissue)) + + geom_histogram() + + geom_vline(aes(xintercept = thirds_blood[1])) + + geom_vline(aes(xintercept = thirds_blood[2])) + + scale_fill_cb_friendly() + + +``` + + +## Metadata by quantity classification + +```{r stratify_samples} + +metadata <- metadata %>% + mutate(dna_quant_level = case_when( + tissue == 'blood' ~ case_when( + total_ng <= thirds_blood[1] ~ 'Low', + total_ng >= thirds_blood[2] ~ 'High', + TRUE ~ 'Medium' + ), + TRUE ~ case_when( + ng_dna <= thirds_tissue[1] ~ 'Low', + ng_dna >= thirds_tissue[2] ~ 'High', + TRUE ~ 'Medium' + ) + )) %>% + mutate(subject_id = ifelse(is.na(subject_id), id, subject_id)) + +# TODO + +metadata <- metadata %>% group_by(dna_quant_level, subject_id) %>% + mutate(sample_name = paste('ID', subject_id, dna_quant_level, row_number(), sep = '_')) +names_vec <- metadata %>% + pull(sample_name) +names(names_vec) <- metadata$ch_ip_id + +metadata %>% sanitize_datatable() +``` + +# Minfi QC {.tabset} + +```{r load methyl data} + +rgSet <- read.metharray.exp(base = "../data/", recursive = TRUE) +colnames(rgSet) <- names_vec[colnames(rgSet)] +rgSet@annotation <- c(array = "IlluminaHumanMethylationEPICv2", annotation = "20a1.hg38") +rownames(metadata) <- metadata$sample_name +rgSet@colData <- DataFrame(metadata) + +MSet <- preprocessRaw(rgSet) +ratioSet <- ratioConvert(MSet, what = "both", keepCN = TRUE) +gset <- mapToGenome(ratioSet) + +beta <- getBeta(gset) +m <- getM(gset) + +``` + +## Intensity + +From the documentation: "minfi provides a simple quality control plot that uses the log median intensity in both the methylated (M) and unmethylated (U) channels. When plotting these two medians against each other, good samples tend to cluster together, while failed samples tend to separate and have lower median intensities" + +```{r intensity} +qc <- getQC(MSet) +# plotQC(qc) +badSampleCutoff <- 10.5 +qc_df <- as.data.frame(qc) %>% rownames_to_column('sample_name') %>% + mutate(sample_qual = ifelse((mMed + uMed) / 2 < badSampleCutoff, 'bad', 'good')) + + +ggplot(qc_df, aes(x = mMed, y = uMed, color = sample_qual)) + + geom_point() + + geom_text_repel(data = qc_df %>% filter(sample_qual == 'bad'), + aes(x = mMed, y = uMed, color = sample_qual, label = sample_name)) + + xlab('Meth median intensity (log2)') + ylab('Unmeth median intensity (log2)') + + xlim(c(8,14)) + ylim(c(8,14)) + scale_color_cb_friendly() +``` + +## Detection + +From the documentation: "We can additionally look at the detection p-values for every CpG in every sample, which is indicative of the quality of the signal. The method used by minfi to calculate detection p-values compares the total signal (M+U) for each probe to the background signal level, which is estimated from the negative control probes. Very small p-values are indicative of a reliable signal whilst large p-values generally indicate a poor quality signal." + + +```{r detection, fig.width=10, fig.height = 6} + +detP <- detectionP(rgSet) +# barplot(colMeans(detP), las=2, cex.names=0.8, ylab="Mean detection p-values") +# abline(h=0.05,col="red") + +det_df <- data.frame(sample_name = colnames(detP), detection_p_val = colMeans(detP)) %>% + left_join(metadata) +ggplot(det_df, aes(x = sample_name, y = detection_p_val, fill = tissue)) + geom_col() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + + geom_hline(yintercept = 0.01) + scale_fill_cb_friendly() + +det_frac_df <- data.frame(sample_name = colnames(detP), frac_probes_detected = 1 - colSums(detP > 0.05) / nrow(detP)) %>% + left_join(metadata) +ggplot(det_frac_df, aes(x = sample_name, y = frac_probes_detected, fill = tissue)) + geom_col() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + scale_fill_cb_friendly() + +``` + +## Control Probes {.tabset} + +From the documentation: "The array contains several internal control probes that can be used to assess the quality control of different sample preparation steps (bisulfite conversion, hybridization, etc.)" + + +```{r control probes, results = 'asis', fig.height = 8} + +control_probes <- c("BISULFITE CONVERSION I", "BISULFITE CONVERSION II", "HYBRIDIZATION", "NON-POLYMORPHIC", "SPECIFICITY I", "SPECIFICITY II", "TARGET REMOVAL", "NEGATIVE") + +for (probe in control_probes){ + cat('\n') + cat('### ', probe, '\n') + controlStripPlot(rgSet, controls=probe) + cat('\n') +} +``` + +# Minfi Normalization (Funnorm) {.tabset} + +From the documentation: "If there exist global biological methylation differences between your samples, as for instance a dataset with cancer and normal samples, or a dataset with different tissues/cell types, use the preprocessFunnorm function as it is aimed for such datasets" + +## Between Arrays + +From the documentation: "The overall density distribution of Beta values for each sample is another useful metric to determine sample quality. Usually, one would expect to see most Beta values to be either close to 0 or 1, indicating most of the CpG sites in the sample are unmethylated or methylated" + + +```{r funnorm between} + +ggdat=as.data.frame(getBeta(MSet)) %>% rownames_to_column("cpgs") %>% + pivot_longer(cols = !matches("cpgs")) %>% + inner_join(metadata[,c("sample_name","tissue")], by=c("name"="sample_name")) + +MSet_funnorm <- preprocessFunnorm(rgSet) + +ggdat_funnorm=as.data.frame(getBeta(MSet_funnorm)) %>% rownames_to_column("cpgs") %>% + pivot_longer(cols = !matches("cpgs")) %>% + inner_join(metadata[,c("sample_name","tissue")], by=c("name"="sample_name")) + +par(mfrow=c(1,2)) +ggplot(ggdat,aes(value, group=name,color=tissue))+ + geom_density(alpha=0.6, size=1) + + scale_color_cb_friendly() + ggtitle('Raw') +ggplot(ggdat_funnorm, aes(value, group=name,color=tissue))+ + geom_density(alpha=0.6, size=1) + + scale_color_cb_friendly() + ggtitle('Funnorm') + +``` + +## Within Arrays + +From the documentation: "A comparison of the Beta distributions for the different probe designs. This will give an indication of the effectiveness of the within-array normalization." + + +```{r funnorm within, message = F, echo = F, results = 'hide'} + +typeI <- getProbeInfo(MSet, type = "I")[, c("Name","nCpG")] +typeII <- getProbeInfo(MSet, type = "II")[, c("Name","nCpG")] +probeTypes <- rbind(typeI, typeII) +probeTypes$Type <- rep(x = c("I", "II"), times = c(nrow(typeI), nrow(typeII))) + +lapply(colnames(MSet), function(sample){ + par(mfrow=c(1,2)) + plotBetasByType(MSet[, sample], main = paste('Raw', sample)) + plotBetasByType(getBeta(MSet_funnorm)[, sample], probeTypes = probeTypes, + main = paste('Funnorm', sample)) +}) + +``` + +# Sample Clustering {.tabset} + +Post-normalization, we are interested to look at the similarities or differences between various samples. One way to do this is by creating PCA plots, which graphically represent the relationships between objects in multidimensional space into 2 dimensional space, where the two dimensions are chosen so that they capture the greatest sources of variation in the data. + +Another way to do this is by calculating the correlation between beta values for samples, clustering the samples based on these correlations, and visualizing the information in a heatmap. + + +## All samples + +```{r pca_all_samples} +beta_funnorm <- getBeta(MSet_funnorm) +metadata <- as.data.frame(metadata) +rownames(metadata) <- metadata$sample_name +degPCA(beta_funnorm, metadata, condition = 'tissue', shape = 'dna_quant_level') + + scale_color_cb_friendly() + +``` + +```{r heatmap_all_samples, fig.width = 12, fig.height = 10} +beta_funnorm_cor <- cor(beta_funnorm) + +colma <- metadata %>% select(dna_quant_level, tissue) + +anno_colors=lapply(colnames(colma), function(c){ + l.col=c(cb_friendly_pal('grey')(length(unique(colma[[c]])) - 1), 'white') + names(l.col)=unique(colma[[c]]) + l.col +}) +names(anno_colors)=colnames(colma) +p <- pheatmap(beta_funnorm_cor, + annotation = colma, + annotation_colors = anno_colors, + # show_rownames = F, + # show_colnames = F, + color = cb_friendly_pal('heatmap')(15) +) +p +``` + +## Blood +```{r pca_blood_samples, fig.height = 7} + +# TODO remove from here to next TODO if PCA per tissue type is not needed +metadata_blood <- metadata %>% filter(tissue == 'blood') +beta_funnorm_blood <- beta_funnorm[, colnames(beta_funnorm) %in% rownames(metadata_blood)] + +degPCA(beta_funnorm_blood, metadata_blood, condition = 'subject_id', shape = 'dna_quant_level') + + scale_color_cb_friendly() + + +``` + +```{r heatmap_blood_samples, fig.width = 10, fig.height = 8} +beta_funnorm_cor_blood <- cor(beta_funnorm_blood) + +colma_blood <- metadata_blood %>% select(dna_quant_level) + +anno_colors_blood=lapply(colnames(colma_blood), function(c){ + l.col=c(cb_friendly_pal('grey')(length(unique(colma_blood[[c]])) - 1), 'white') + names(l.col)=unique(colma_blood[[c]]) + l.col +}) +names(anno_colors_blood)=colnames(colma_blood) +p <- pheatmap(beta_funnorm_cor_blood, + annotation = colma_blood, + annotation_colors = anno_colors_blood, + # show_rownames = F, + # show_colnames = F, + color = cb_friendly_pal('heatmap')(15) +) +p + +``` + +## Prostate +```{r pca_prostate_samples} +metadata_prostate <- metadata %>% filter(tissue != 'blood') +beta_funnorm_prostate <- beta_funnorm[, colnames(beta_funnorm) %in% rownames(metadata_prostate)] + +degPCA(beta_funnorm_prostate, metadata_prostate, condition = 'subject_id', shape = 'dna_quant_level') + + scale_color_cb_friendly() + +``` + +```{r heatmap_prostate_samples, fig.width = 10, fig.height = 8} +beta_funnorm_cor_prostate <- cor(beta_funnorm_prostate) + +colma_prostate <- metadata_prostate %>% select(dna_quant_level) + +anno_colors_prostate=lapply(colnames(colma_prostate), function(c){ + l.col=c(cb_friendly_pal('grey')(length(unique(colma_prostate[[c]])) - 1), 'white') + names(l.col)=unique(colma_prostate[[c]]) + l.col +}) +names(anno_colors_prostate)=colnames(colma_prostate) +p <- pheatmap(beta_funnorm_cor_prostate, + annotation = colma_prostate, + annotation_colors = anno_colors_prostate, + # show_rownames = F, + # show_colnames = F, + color = cb_friendly_pal('heatmap')(15) +) +p + +# TODO +``` + + +# Filtered Sample Clustering {.tabset} + +Poor performing probes as well as probes with SNPs were filtered from the data. + +```{r filter probes} + +detP <- detectionP(rgSet) +detP <- detP[match(featureNames(MSet_funnorm), rownames(detP)),] +keep <- rowSums(detP < 0.01) == ncol(MSet_funnorm) +MSet_funnorm_filt <- MSet_funnorm[keep,] +MSet_funnorm_filt <- dropLociWithSnps(MSet_funnorm_filt) + +``` + +## All samples + +```{r pca_all_samples_filt} +beta_funnorm_filt <- getBeta(MSet_funnorm_filt) +metadata <- as.data.frame(metadata) +rownames(metadata) <- metadata$sample_name +degPCA(beta_funnorm_filt, metadata, condition = 'tissue', shape = 'dna_quant_level') + + scale_color_cb_friendly() + +``` + +```{r heatmap_all_samples_filt, fig.width = 12, fig.height = 10} +beta_funnorm_filt_cor <- cor(beta_funnorm_filt) + +colma <- metadata %>% select(dna_quant_level, tissue) + +anno_colors=lapply(colnames(colma), function(c){ + l.col=c(cb_friendly_pal('grey')(length(unique(colma[[c]])) - 1), 'white') + names(l.col)=unique(colma[[c]]) + l.col +}) +names(anno_colors)=colnames(colma) +p <- pheatmap(beta_funnorm_filt_cor, + annotation = colma, + annotation_colors = anno_colors, + # show_rownames = F, + # show_colnames = F, + color = cb_friendly_pal('heatmap')(15) +) +p +``` + +## Blood +```{r pca_blood_samples_filt, fig.height = 7} + +# TODO remove from here to next TODO if PCA per tissue type is not relevant + +metadata_blood <- metadata %>% filter(tissue == 'blood') +beta_funnorm_filt_blood <- beta_funnorm_filt[, colnames(beta_funnorm_filt) %in% rownames(metadata_blood)] + +degPCA(beta_funnorm_filt_blood, metadata_blood, condition = 'subject_id', shape = 'dna_quant_level') + + scale_color_cb_friendly() + + +``` + +```{r heatmap_blood_samples_filt, fig.width = 10, fig.height = 8} +beta_funnorm_filt_cor_blood <- cor(beta_funnorm_filt_blood) + +colma_blood <- metadata_blood %>% select(dna_quant_level) + +anno_colors_blood=lapply(colnames(colma_blood), function(c){ + l.col=c(cb_friendly_pal('grey')(length(unique(colma_blood[[c]])) - 1), 'white') + names(l.col)=unique(colma_blood[[c]]) + l.col +}) +names(anno_colors_blood)=colnames(colma_blood) +p <- pheatmap(beta_funnorm_filt_cor_blood, + annotation = colma_blood, + annotation_colors = anno_colors_blood, + # show_rownames = F, + # show_colnames = F, + color = cb_friendly_pal('heatmap')(15) +) +p + +``` + +## Prostate +```{r pca_prostate_samples_filt} +metadata_prostate <- metadata %>% filter(tissue != 'blood') +beta_funnorm_filt_prostate <- beta_funnorm_filt[, colnames(beta_funnorm_filt) %in% rownames(metadata_prostate)] + +degPCA(beta_funnorm_filt_prostate, metadata_prostate, condition = 'subject_id', shape = 'dna_quant_level') + + scale_color_cb_friendly() + +``` + +```{r heatmap_prostate_samples_filt, fig.width = 10, fig.height = 8} +beta_funnorm_filt_cor_prostate <- cor(beta_funnorm_filt_prostate) + +colma_prostate <- metadata_prostate %>% select(dna_quant_level) + +anno_colors_prostate=lapply(colnames(colma_prostate), function(c){ + l.col=c(cb_friendly_pal('grey')(length(unique(colma_prostate[[c]])) - 1), 'white') + names(l.col)=unique(colma_prostate[[c]]) + l.col +}) +names(anno_colors_prostate)=colnames(colma_prostate) +p <- pheatmap(beta_funnorm_filt_cor_prostate, + annotation = colma_prostate, + annotation_colors = anno_colors_prostate, + # show_rownames = F, + # show_colnames = F, + color = cb_friendly_pal('heatmap')(15) +) +p + +# TODO + +``` + +# Epigenetic Clocks {.tabset} + + +```{r calculate epi age} +clock_rownames <- gsub('_[TB]+C[12]+1$', '', rownames(MSet_funnorm_filt)) +dups <- clock_rownames[duplicated(clock_rownames)] +keep <- !(clock_rownames %in% dups) + +MSet_funnorm_clock <- MSet_funnorm_filt[keep, ] +rownames(MSet_funnorm_clock) <- gsub('_[TB]+C[12]+1$', '', rownames(MSet_funnorm_clock)) +# cpgs.missing <- checkClocks(MSet_funnorm_clock) + + +age <- DNAmAge(MSet_funnorm_clock) +age_merged <- age %>% left_join(metadata, by = c('id' = 'sample_name')) %>% + mutate(age = ifelse(is.na(age), agedx, age)) + +# plotDNAmAge(age_merged$Horvath, age_merged$age) +``` + +## Horvath +```{r horvath} +ggplot(age_merged, aes(x = age, y = Horvath, color = tissue, shape = dna_quant_level)) + + geom_point() + + geom_text_repel(data = age_merged %>% filter(Horvath < 50), + aes(x = age, y = Horvath, color = tissue, label = id)) + + scale_color_cb_friendly() + +``` + +## Levine +```{r levine} +ggplot(age_merged, aes(x = age, y = Levine, color = tissue, shape = dna_quant_level)) + + geom_point() + + geom_text_repel(data = age_merged %>% filter(Levine < 40), + aes(x = age, y = Levine, color = tissue, label = id)) + + scale_color_cb_friendly() + +``` + +# Markers + +We observe that the CpG sites closest to the canonical TSS of GSTP1 are more methylated in malignant samples than other samples, although we see the opposite effect or little difference for sites farther from the TSS. + +```{r markers, fig.width = 12} + +# TODO replace with markers of interest, or remove this chunk if not relevant +annotation <- getAnnotation(MSet_funnorm_filt) + +gstp1 <- annotation %>% as.data.frame() %>% + filter(chr == 'chr11', pos < 67590000, pos > 67580000, strand == '+') + +gstp1_beta <- beta_funnorm_filt %>% as.data.frame() %>% rownames_to_column('Name') %>% + filter(Name %in% gstp1$Name) %>% + pivot_longer(!Name, names_to = 'sample_name', values_to = 'beta') %>% + left_join(metadata) %>% + left_join(gstp1) %>% + mutate(dist_from_TSS = pos - 67583812) + +ggplot(gstp1_beta, aes(x = as.factor(dist_from_TSS), y = beta, fill = tissue)) + geom_boxplot() + + facet_wrap(~tissue) + scale_fill_cb_friendly() + xlab('distance from TSS') + ggtitle('GSTP1 probes') + +``` + + +```{r, eval = F} +cn <- getCN(MSet_funnorm_filt) + +cn_sample <- cn %>% as.data.frame() %>% rownames_to_column('Name') %>% + select(Name, `ID_170-11-002_Medium_1`) %>% + left_join(annotation %>% as.data.frame() %>% select(Name, chr, strand, pos)) %>% + arrange(chr, pos) %>% mutate(xpos = row_number()) + +ggplot(cn_sample, aes(x = pos, y = `ID_170-11-002_Medium_1`)) + geom_point() + facet_wrap(~chr) +``` + +# Conclusions + +TODO write here + +# R session + +List and version of tools used for the report generation. + +```{r} +sessionInfo() +``` \ No newline at end of file diff --git a/inst/templates/methylation/information.R b/inst/templates/methylation/information.R new file mode 100644 index 0000000..6e15eef --- /dev/null +++ b/inst/templates/methylation/information.R @@ -0,0 +1,6 @@ +# info params +project = "name_hbcXXXXX" +PI = 'person name' +experiment = 'short description' +aim = 'short description' +analyst = 'person in the core' From 3336b5b32de3412123f4e9682c9d3e783f171695 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 8 Aug 2024 16:17:34 -0400 Subject: [PATCH 40/93] add test of copied files --- DESCRIPTION | 6 ++--- NAMESPACE | 7 ++---- R/bcbioR-package.R | 5 ++-- R/helpers.R | 11 +++++---- man/bcbio_templates.Rd | 4 +++- tests/testthat/rnaseq.R | 51 ++++++++++++++++++++++++----------------- 6 files changed, 46 insertions(+), 38 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 71aa3e2..ebbce77 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,11 +21,11 @@ Imports: jsonlite, yaml, whisker, - rlang + rlang, + stringr Suggests: hues, ggprism, - stringr, ggplot2, knitr, rmarkdown, @@ -34,4 +34,4 @@ VignetteBuilder: knitr URL: http://bcb.io/bcbioR/ Config/testthat/edition: 3 Depends: - R (>= 4.3.1) + R (>= 4.2.0) diff --git a/NAMESPACE b/NAMESPACE index 0540b95..63cd351 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,14 +7,11 @@ export(cb_friendly_pal) export(list_cb_friendly_cols) export(scale_color_cb_friendly) export(scale_fill_cb_friendly) -export(use_bcbio_analysis) -export(use_bcbio_projects) import(R.utils) import(fs) -import(ggplot2) -import(ggprism) -import(hues) import(usethis) +import(whisker) +import(withr) importFrom(grDevices,colorRampPalette) importFrom(magrittr,"%>%") importFrom(readr,read_csv) diff --git a/R/bcbioR-package.R b/R/bcbioR-package.R index c8363e6..46c710b 100644 --- a/R/bcbioR-package.R +++ b/R/bcbioR-package.R @@ -7,10 +7,9 @@ #' @importFrom readr read_csv #' @importFrom stringr str_replace_all ## usethis namespace: end -#' @import ggplot2 #' @import usethis #' @import fs -#' @import hues -#' @import ggprism #' @import R.utils +#' @import withr +#' @import whisker NULL diff --git a/R/helpers.R b/R/helpers.R index 49f534a..2ac6160 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -44,7 +44,9 @@ bcbio_nfcore_check <- function(file){ #' @param outpath string path indicating where to copy all the files to #' @examples #' \dontrun{ -#' bcbio_templates("rnaseq", "path_to_projects/project1/reports") +#' path <- withr::local_tempdir() +#' bcbio_templates(type="base",outpath=path) +#' fs::dir_ls(path,all=T) #' } #' @export bcbio_templates <- function(type="rnaseq", outpath){ @@ -123,7 +125,7 @@ bcbio_params <-function(nfcore_path, pipeline, metadata, copy){ } copy_files_in_folder<- function(origin, remote){ - to_copy <- fs::dir_ls(origin) + to_copy <- fs::dir_ls(origin,all = TRUE) to_copy <- grep("org", to_copy, value = TRUE, invert = TRUE) for (element in to_copy){ @@ -189,7 +191,7 @@ bcbio_render <- function(path, pipeline, data){ } } -#' @export +#' help with bcbio analysis setup use_bcbio_analysis <- function(path, pipeline, copy=TRUE, metadata=NULL){ if (copy){ @@ -238,8 +240,7 @@ use_bcbio_analysis <- function(path, pipeline, copy=TRUE, metadata=NULL){ } -#' @export -#' @examples +#' Pilot to deploy full projects at once #' path <- withr::local_tempdir() #' use_bcbio_projects(path,pipeline="nf-core/rnaseq",copy=TRUE) #' fs::dir_ls(path) diff --git a/man/bcbio_templates.Rd b/man/bcbio_templates.Rd index 217ea0c..c5a2f7b 100644 --- a/man/bcbio_templates.Rd +++ b/man/bcbio_templates.Rd @@ -23,6 +23,8 @@ project folder. } \examples{ \dontrun{ - bcbio_templates("rnaseq", "path_to_projects/project1/reports") + path <- withr::local_tempdir() + bcbio_templates(type="base",outpath=path) + fs::dir_ls(path,all=T) } } diff --git a/tests/testthat/rnaseq.R b/tests/testthat/rnaseq.R index 4b1967a..00a77a8 100644 --- a/tests/testthat/rnaseq.R +++ b/tests/testthat/rnaseq.R @@ -1,30 +1,39 @@ library(bcbioR) + +test_that("base copy",{ + path <- withr::local_tempdir() + print(path) + bcbio_templates(type="base", outpath=path) + expect_length(fs::dir_ls(path,all=T),9) +}) + test_that("rnaseq copy",{ path <- withr::local_tempdir() print(path) bcbio_templates(type="rnaseq", outpath=path) - numerator="tumor" - denominator="normal" - subset_value=NA - rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), - output_dir = file.path(path,"DE"), - output_format = "html_document", - output_file = ifelse(!is.na(subset_value), - paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), - paste0('DE_', numerator, '_vs_', denominator, '.html') - ), - clean = TRUE, - envir = new.env(), - params = list( - subset_value = subset_value, - numerator = numerator, - denominator = denominator, - params_file = file.path(path,'DE/params_de-example.R'), - project_file = file.path(path,'information.R'), - functions_file = file.path(path,'DE/load_data.R') - ) - ) + expect_length(fs::dir_ls(path,all=T),4) + # numerator="tumor" + # denominator="normal" + # subset_value=NA + # rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), + # output_dir = file.path(path,"DE"), + # output_format = "html_document", + # output_file = ifelse(!is.na(subset_value), + # paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), + # paste0('DE_', numerator, '_vs_', denominator, '.html') + # ), + # clean = TRUE, + # envir = new.env(), + # params = list( + # subset_value = subset_value, + # numerator = numerator, + # denominator = denominator, + # params_file = file.path(path,'DE/params_de-example.R'), + # project_file = file.path(path,'information.R'), + # functions_file = file.path(path,'DE/load_data.R') + # ) + # ) # use_bcbio_projects(path, nfcore="nf-core/rnaseq", copy=TRUE, git=FALSE) }) From 022ccc5d59edec6a8c787ab27b576be5d1814af2 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 8 Aug 2024 16:21:47 -0400 Subject: [PATCH 41/93] roll back function name --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ef3583a..2ba7800 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,8 @@ setwd("/path/to/analysis/folder") The following code will pop up a Rmd template will populate that folder with HCBC data structure guidelines ``` -use_bcbio_projects(path,pipeline="nf-core/rnaseq") -use_bcbio_projects(path,pipeline="singlecell") +bcbio_templates(path,pipeline="nf-core/rnaseq") +bcbio_templates(path,pipeline="singlecell") ``` ### Set RNAseq report folder From 006aa7c38f7faaad03cb19439c5c48894cb6d5db Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Fri, 9 Aug 2024 10:39:21 -0400 Subject: [PATCH 42/93] add test of sc folders and app --- R/helpers.R | 14 ++++++++++++++ tests/testthat/rnaseq.R | 9 +++++++++ 2 files changed, 23 insertions(+) diff --git a/R/helpers.R b/R/helpers.R index 2ac6160..1fcde18 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -142,7 +142,19 @@ copy_files_in_folder<- function(origin, remote){ } } +deploy_apps <- function(apps, path){ + fs::dir_create(file.path(path, "apps")) + sapply(names(apps), function(app){ + dest_file=file.path(path, "apps", paste0(app, ".zip")) + download.file(url = apps[[app]], + destfile = dest_file) + unzip(zipfile = dest_file, exdir = dirname(dest_file)) + fs::file_delete(dest_file) + }) +} + copy_templates <- function(path, pipeline){ + apps=list() base = c("bcbioR") if (pipeline=="base"){ parts = c("templates/base") @@ -150,6 +162,7 @@ copy_templates <- function(path, pipeline){ parts = c("templates/rnaseq") }else if(pipeline=="singlecell"){ parts = c("templates/singlecell") + apps=c(apps, scRNAseq_qc="https://github.com/hbc/scRNAseq_qc_app/archive/refs/heads/main.zip") }else if(pipeline=="singlecell_delux"){ parts = c("templates/singlecell_delux") }else if(pipeline=="multiomics"){ @@ -163,6 +176,7 @@ copy_templates <- function(path, pipeline){ # value = TRUE, invert = TRUE) # ui_info("{ui_value(length(ls_files))} amount of files to copy") copy_files_in_folder(analysis_template, path) + deploy_apps(apps, path) } bcbio_render <- function(path, pipeline, data){ diff --git a/tests/testthat/rnaseq.R b/tests/testthat/rnaseq.R index 00a77a8..8c82339 100644 --- a/tests/testthat/rnaseq.R +++ b/tests/testthat/rnaseq.R @@ -1,6 +1,15 @@ library(bcbioR) +test_that("scrnaseq",{ + path <- withr::local_tempdir() + print(path) + copy_templates(path, "singlecell") + expect_length(fs::dir_ls(path,all=T),8) + expect_true(grepl("scRNAseq_qc_app", + fs::dir_ls(file.path(path, "apps"), recurse=T, all=T)[2])) +}) + test_that("base copy",{ path <- withr::local_tempdir() print(path) From 3f658608ed37b42e25b2b761ee713d916a7da3d6 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Fri, 9 Aug 2024 10:40:19 -0400 Subject: [PATCH 43/93] make comments until ready to export --- R/helpers.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/helpers.R b/R/helpers.R index 1fcde18..d3cb841 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -205,7 +205,7 @@ bcbio_render <- function(path, pipeline, data){ } } -#' help with bcbio analysis setup +# help with bcbio analysis setup use_bcbio_analysis <- function(path, pipeline, copy=TRUE, metadata=NULL){ if (copy){ @@ -254,10 +254,10 @@ use_bcbio_analysis <- function(path, pipeline, copy=TRUE, metadata=NULL){ } -#' Pilot to deploy full projects at once -#' path <- withr::local_tempdir() -#' use_bcbio_projects(path,pipeline="nf-core/rnaseq",copy=TRUE) -#' fs::dir_ls(path) +# Pilot to deploy full projects at once +# path <- withr::local_tempdir() +# use_bcbio_projects(path,pipeline="nf-core/rnaseq",copy=TRUE) +# fs::dir_ls(path) use_bcbio_projects <- function(path, pipeline=NULL, metadata=NULL, git=TRUE, gh=FALSE, org=NULL, copy=TRUE) { From 10ef7adb46a0d330f70565d384ee575b1af6bb08 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Fri, 9 Aug 2024 15:58:18 -0400 Subject: [PATCH 44/93] get all files in templates --- inst/templates/base/code/placeholder.R | 0 inst/templates/base/data/readme | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 inst/templates/base/code/placeholder.R create mode 100644 inst/templates/base/data/readme diff --git a/inst/templates/base/code/placeholder.R b/inst/templates/base/code/placeholder.R new file mode 100644 index 0000000..e69de29 diff --git a/inst/templates/base/data/readme b/inst/templates/base/data/readme new file mode 100644 index 0000000..e69de29 From 342d205a93ec9186752e6c24258fa8b9bb3b00f1 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Fri, 9 Aug 2024 16:24:38 -0400 Subject: [PATCH 45/93] add gitignore template --- inst/templates/base/{.gitignore => gitignore} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename inst/templates/base/{.gitignore => gitignore} (100%) diff --git a/inst/templates/base/.gitignore b/inst/templates/base/gitignore similarity index 100% rename from inst/templates/base/.gitignore rename to inst/templates/base/gitignore From 61e114cb784fd2aa31bd82c5695dac13dfe579b9 Mon Sep 17 00:00:00 2001 From: Alex Bartlett <74612800+abartlett004@users.noreply.github.com> Date: Fri, 9 Aug 2024 16:32:34 -0400 Subject: [PATCH 46/93] Update README.md --- inst/templates/base/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/templates/base/README.md b/inst/templates/base/README.md index d286e6a..a7c4fa3 100644 --- a/inst/templates/base/README.md +++ b/inst/templates/base/README.md @@ -43,7 +43,7 @@ - [ ] Track in *Git* this `README` file - [ ] Track in *Git* files in `scripts`, `meta`, and `reports` that belongs to these type: - - **Note** Git add `*.Rmd *.R *ipynb *.sh *.yaml`. (feel free use `.gitignore` if you use a GUI for non-tracked files). *DO NOT* use `git add *`. *DO NOT* track `html/csv/figures` + - **Note** Git add `*.Rmd *.R *ipynb *.sh *.yaml`. (feel free use `.gitignore` if you use a GUI for non-tracked files). *DO NOT* use `git add *`. *DO NOT* track `html/csv/figures`. *DO NOT* track files that you did not use for this project (i.e. irrelevant templates, placeholders) - [ ] Commit files and push to *Github* as necessary throughout the project, but especially when work is complete ## Dropbox From 7ac5d1aaa9fd934fdcdc2b6a48159adbcede27ab Mon Sep 17 00:00:00 2001 From: Alex Bartlett <74612800+abartlett004@users.noreply.github.com> Date: Fri, 9 Aug 2024 16:33:54 -0400 Subject: [PATCH 47/93] Update gitignore --- inst/templates/base/gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/templates/base/gitignore b/inst/templates/base/gitignore index a8c1ef8..5f86d62 100644 --- a/inst/templates/base/gitignore +++ b/inst/templates/base/gitignore @@ -15,3 +15,4 @@ docs/* **/*rda **/*csv **/*tsv +**/*app.R From 07bde9a9b2b17c92a5e61c717e2426b663e64c57 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Tue, 13 Aug 2024 14:43:56 -0400 Subject: [PATCH 48/93] make load_data common to qc and de --- inst/templates/rnaseq/DE/DEG.Rmd | 7 +- inst/templates/rnaseq/QC/QC_nf-core.Rmd | 204 ++++-------------- .../rnaseq/QC/params_qc_nf-core-example.R | 7 +- inst/templates/rnaseq/QC/params_qc_nf-core.R | 5 +- .../templates/rnaseq/{DE => libs}/load_data.R | 15 +- tests/testthat/rnaseq.R | 103 ++++----- 6 files changed, 97 insertions(+), 244 deletions(-) rename inst/templates/rnaseq/{DE => libs}/load_data.R (90%) diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd index c2069be..814b4fe 100644 --- a/inst/templates/rnaseq/DE/DEG.Rmd +++ b/inst/templates/rnaseq/DE/DEG.Rmd @@ -17,21 +17,20 @@ output: editor_options: chunk_output_type: console params: - # Put hg38, mm10, mm39, or other - - ## Combatseq and ruv can both be false or ONLY ONE can be true + ## Combatseq and ruv can both be false or ONLY ONE can be true ## Both cannot be true numerator: tumor denominator: normal column: sample_type subset_column: null subset_value: null + # Put hg38, mm10, mm39, or other genome: hg38 ruv: false combatseq: false params_file: params_de-example.R project_file: ../information.R - functions_file: load_data.R + functions_file: ../libs/load_data.R --- diff --git a/inst/templates/rnaseq/QC/QC_nf-core.Rmd b/inst/templates/rnaseq/QC/QC_nf-core.Rmd index dee4712..2bfeff9 100644 --- a/inst/templates/rnaseq/QC/QC_nf-core.Rmd +++ b/inst/templates/rnaseq/QC/QC_nf-core.Rmd @@ -17,23 +17,24 @@ output: editor_options: chunk_output_type: console params: - # params_file: params_qc_nf-core-example.R # example data # Fill this file with the right paths to nfcore output - params_file: params_qc_nf-core.R # Put hg38, mm10, mm39, or other + # params_file: params_qc_nf-core-example.R # example data + params_file: params_qc_nf-core-example.R genome: hg38 project_file: ../information.R + functions_file: ../libs/load_data.R factor_of_interest: sample_type --- -```{r} +```{r, cache = FALSE, message = FALSE, warning=FALSE} # This set up the working directory to this file so all files can be found library(rstudioapi) setwd(fs::path_dir(getSourceEditorContext()$path)) ``` -```{r source_params, echo = F} +```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} # 1. set up factor_of_interest parameter from parameter above or manually # this is used to color plots, it needs to be part of the metadata factor_of_interest=params$factor_of_interest @@ -43,6 +44,8 @@ source(params$params_file) # 3. If you set up this file, project information will be printed below and #. it can be reused for other Rmd files. source(params$project_file) +# 4. Load custom functions to load data from coldata/metrics/counts +source(params$functions_file) ``` # Overview @@ -80,50 +83,6 @@ opts_chunk[["set"]]( ``` -```{r subchunkify, echo=FALSE, eval=FALSE} -#' Create sub-chunks for plots -#' -#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots -#' -#' @param pl a plot object -#' @param fig.height figure height -#' @param fig.width figure width -#' @param chunk_name name of the chunk -#' -#' @author Andreas Scharmueller \email{andschar@@protonmail.com} -#' -subchunkify = function(pl, - fig.height = 7, - fig.width = 5, - chunk_name = 'plot') { - pl_deparsed = paste0(deparse(function() { - pl - }), collapse = '') - - sub_chunk = paste0( - "```{r ", - chunk_name, - ", fig.height=", - fig.height, - ", fig.width=", - fig.width, - ", dpi=72", - ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}", - "\n(", - pl_deparsed, - ")()", - "\n```" - ) - - cat(knitr::knit( - text = knitr::knit_expand(text = sub_chunk), - quiet = TRUE - )) -} - -``` - - ```{r sanitize-datatable} sanitize_datatable = function(df, ...) { # remove dashes which cause wrapping @@ -135,16 +94,29 @@ sanitize_datatable = function(df, ...) { # Samples and metadata +```{r load_data, message=F, warning=F} +# This code will load from bcbio or nf-core folder +# NOTE make sure to set numerator and denominator +coldata <- load_coldata(coldata_fn) +coldata$sample=row.names(coldata) + +counts <- load_counts(counts_fn) +counts <- counts[,colnames(counts) %in% coldata$sample] + +metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% + left_join(coldata, by = c('sample')) %>% + as.data.frame() +# TODO: change order as needed +order <- unique(metrics[["sample"]]) +rownames(metrics) <- metrics$sample +# if the names don't match in order or string check files names and coldata information +counts = counts[,rownames(metrics)] +coldata = coldata[rownames(metrics),] +stopifnot(all(names(counts) == rownames(metrics))) +``` ```{r load_metadata} - -meta_df=read_csv(metadata_fn) %>% - arrange(.data[[factor_of_interest]]) %>% - distinct(sample, .keep_all = T) %>% - dplyr::select(!matches("fastq"), !matches("strandness")) -meta_df$sample <- make.names(meta_df$sample) -order <- meta_df$sample - +meta_df=coldata ggplot(meta_df, aes(.data[[factor_of_interest]], fill = .data[[factor_of_interest]])) + geom_bar() + ylab("") + xlab("") + ylab("# of samples") + @@ -152,112 +124,11 @@ ggplot(meta_df, aes(.data[[factor_of_interest]], ``` -```{r load_data} -# read counts from SE object -se <- readRDS(se_object) -raw_counts <- assays(se)[["counts"]] %>% round() %>% - as.matrix() -raw_counts=raw_counts[rowSums(raw_counts)!=0,] -``` - -```{r prepare_metrics} -# Get metrics from nf-core into bcbio like table -# many metrics are already in the General Table of MultiQC, this reads the file -metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) - -# we get some more metrics from Qualimap and rename columns -metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) -metrics <- metrics %>% full_join(metrics_qualimap) -metrics <- metrics %>% - clean_names() %>% - dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) - -# This uses the fastqc metrics to get total reads -total_reads <- metrics %>% - dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>% - remove_empty(which = 'cols') %>% - dplyr::rename(single_sample = sample) %>% - mutate(sample = gsub('_[12]+$', '', single_sample)) %>% - group_by(sample) %>% - summarize(total_reads = sum(fastqc_raw_total_sequences)) - -# This renames to user-friendly names the metrics columns -metrics <- metrics %>% - dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% - remove_empty(which = 'cols') %>% - full_join(total_reads) %>% - mutate(mapped_reads = samtools_reads_mapped) %>% - mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% - mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% - mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% - mutate(x5_3_bias = qualimap_5_3_bias) - -# Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case -gtf <- NULL -if (genome =="other"){ - gtf <- gtf_fn -}else{ - if (genome == "hg38") { - gtf <- "hg38.rna.gtf.gz" - } else if (genome == "mm10") { - gtf <- "mm10.rna.gtf.gz" - } else if (genome == "mm39") { - gtf <- "mm39.rna.gtf.gz" - } - gtf <- system.file("extdata", "annotation", - gtf, - package="bcbioR") -} -if (is.null(gtf)) { - print("No genome provided! Please add it at the top of this Rmd") -} - -gtf=rtracklayer::import(gtf) - -one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) -another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) -biotype=NULL -if(length(one)==1){ - biotype=one -}else if(length(another)==1){ - biotype=another -}else{ - warning("No gene biotype founded") -} - -if (!is.null(biotype)){ - annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] - rRNA=grepl("rRNA|tRNA",annotation[[biotype]]) - genes=intersect(annotation[rRNA,"gene_id"],row.names(raw_counts)) - ratio=data.frame(sample=colnames(raw_counts), - r_and_t_rna_rate=colSums(raw_counts[genes,])/colSums(raw_counts)) - metrics = left_join(metrics, ratio, by="sample") -}else{ - metrics[["r_and_t_rna_rate"]] <- NA -} - -# if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){ -# metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna) -# }else{ -# metrics[["r_rna_rate"]] <- NA -# } -metrics=metrics[,c("sample","mapped_reads","exonic_rate","intronic_rate", - "total_reads", - "x5_3_bias", "r_and_t_rna_rate","intergenic_rate")] -metrics$sample <- make.names(metrics$sample) -metrics <- metrics %>% - full_join(meta_df , by = c("sample" = "sample")) %>% - dplyr::select(where(~!all(is.na(.)))) - -``` - ```{r show_metadata} meta_sm <- meta_df %>% - as.data.frame() %>% - column_to_rownames("sample") + as.data.frame() meta_sm %>% sanitize_datatable() - ``` # Read metrics {.tabset} @@ -321,7 +192,7 @@ metrics %>% The number of genes represented in every sample is expected to be consistent and over 20K (grey line). ```{r calc_genes_detected} -genes_detected <- colSums(assays(se)[["counts"]] > 0) %>% enframe() +genes_detected <- colSums(counts > 0) %>% enframe() sample_names <- metrics[,c("sample"), drop=F] genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample")) genes_detected <- genes_detected %>% group_by(name) @@ -484,15 +355,15 @@ We expect consistency in the box plots here between the samples, i.e. the distri metrics_small <- metrics %>% dplyr::select(sample, .data[[factor_of_interest]]) metrics_small <- left_join(sample_names, metrics_small) -counts <- - assays(se)[["counts"]] %>% +counts_lng <- + counts %>% as_tibble() %>% filter(rowSums(.)!=0) %>% gather(name, counts) -counts <- left_join(counts, metrics_small, by = c("name" = "sample")) +counts_lng <- left_join(counts_lng, metrics_small, by = c("name" = "sample")) -ggplot(counts, aes(factor(name, level = order), +ggplot(counts_lng, aes(factor(name, level = order), log2(counts+1), fill = .data[[factor_of_interest]])) + geom_boxplot() + @@ -515,11 +386,11 @@ Principal Component Analysis (PCA) is a statistical technique used to simplify h ```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} -vst <- vst(raw_counts) +vst <- vst(counts) coldat_for_pca <- as.data.frame(metrics) rownames(coldat_for_pca) <- coldat_for_pca$sample -coldat_for_pca <- coldat_for_pca[colnames(raw_counts),] +coldat_for_pca <- coldat_for_pca[colnames(counts),] pca1 <- degPCA(vst, coldat_for_pca, condition = factor_of_interest, data = T)[["plot"]] pca2 <- degPCA(vst, coldat_for_pca, @@ -552,10 +423,9 @@ cluster by plotting the correlation between the expression profiles of the samples. ```{r clustering fig, fig.width = 10, fig.asp = .62} - vst_cor <- cor(vst) -colma=meta_df %>% as.data.frame() +colma=coldata %>% as.data.frame() rownames(colma) <- colma$sample colma <- colma[rownames(vst_cor), ] colma <- colma %>% dplyr::select(.data[[factor_of_interest]]) diff --git a/inst/templates/rnaseq/QC/params_qc_nf-core-example.R b/inst/templates/rnaseq/QC/params_qc_nf-core-example.R index dae62ce..b364967 100644 --- a/inst/templates/rnaseq/QC/params_qc_nf-core-example.R +++ b/inst/templates/rnaseq/QC/params_qc_nf-core-example.R @@ -1,9 +1,12 @@ # info params -# Example data: COMMENT THESE LINE IF YOU ARE USING YOUR DATA -metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' + +# Example data +coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' +counts_fn=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv') se_object=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.rds') # This folder is in the output directory inside multiqc folder multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' # This file is inside the genome folder in the output directory gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' +se_object = NA diff --git a/inst/templates/rnaseq/QC/params_qc_nf-core.R b/inst/templates/rnaseq/QC/params_qc_nf-core.R index 08b3ec0..897b6b0 100644 --- a/inst/templates/rnaseq/QC/params_qc_nf-core.R +++ b/inst/templates/rnaseq/QC/params_qc_nf-core.R @@ -2,10 +2,11 @@ # Your data # This is the file used to run nf-core or compatible to that -metadata_fn='/Path/to/metadata/meta.csv' +coldata_fn='/Path/to/metadata/meta.csv' # This file is inside star_salmon/ folder -se_object='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.rds' +counts_fn='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv' # This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder multiqc_data_dir='/path/to/nf-core/output/multiqc/star_salmon/multiqc_report_data' # This file is inside the genome folder in the output directory, use this only for non-model organism # gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' +se_object = NA diff --git a/inst/templates/rnaseq/DE/load_data.R b/inst/templates/rnaseq/libs/load_data.R similarity index 90% rename from inst/templates/rnaseq/DE/load_data.R rename to inst/templates/rnaseq/libs/load_data.R index 8a1d297..6462edb 100644 --- a/inst/templates/rnaseq/DE/load_data.R +++ b/inst/templates/rnaseq/libs/load_data.R @@ -101,15 +101,17 @@ load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){ return(metrics) } -load_coldata <- function(coldata_fn, column, numerator, denominator, subset_column = NULL, subset_value = NULL){ +load_coldata <- function(coldata_fn, column=NULL, numerator=NULL, denominator=NULL, subset_column = NULL, subset_value = NULL){ coldata=read.csv(coldata_fn) %>% - dplyr::select(!matches("fastq") & !matches("strandness")) %>% + dplyr::distinct(sample, .keep_all = T) %>% + dplyr::select(!matches("fastq"), !matches("strandness")) %>% distinct() if('description' %in% names(coldata)){ coldata$sample <- tolower(coldata$description) } coldata <- coldata %>% distinct(sample, .keep_all = T) - stopifnot(column %in% names(coldata)) + if (!is.null(column)) + stopifnot(column %in% names(coldata)) # use only some samples, by default use all if (!is.null(subset_column)){ @@ -121,7 +123,8 @@ load_coldata <- function(coldata_fn, column, numerator, denominator, subset_colu rownames(coldata) <- coldata$sample coldata$description <- coldata$sample - coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) + if (!is.null(denominator)) + coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) return(coldata) } @@ -138,8 +141,8 @@ load_counts <- function(counts_fn){ } else { # nf-core input counts <- read_tsv(counts_fn) %>% dplyr::select(-gene_name) %>% mutate(gene_id = str_replace(gene_id, pattern = "\\.[0-9]+$", "")) %>% - column_to_rownames('gene_id') %>% round - + column_to_rownames('gene_id') %>% round %>% as.matrix() + counts=counts[rowSums(counts)!=0,] return(counts) } diff --git a/tests/testthat/rnaseq.R b/tests/testthat/rnaseq.R index 8c82339..bdebb74 100644 --- a/tests/testthat/rnaseq.R +++ b/tests/testthat/rnaseq.R @@ -1,74 +1,51 @@ library(bcbioR) -test_that("scrnaseq",{ +test_that("rnaseq deg",{ path <- withr::local_tempdir() print(path) - copy_templates(path, "singlecell") - expect_length(fs::dir_ls(path,all=T),8) - expect_true(grepl("scRNAseq_qc_app", - fs::dir_ls(file.path(path, "apps"), recurse=T, all=T)[2])) -}) - -test_that("base copy",{ - path <- withr::local_tempdir() - print(path) - bcbio_templates(type="base", outpath=path) - expect_length(fs::dir_ls(path,all=T),9) + bcbio_templates(type="rnaseq", outpath=path) + fs::dir_ls(path,all=T) + numerator="tumor" + denominator="normal" + subset_value=NA + rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), + output_dir = file.path(path,"DE"), + output_format = "html_document", + output_file = ifelse(!is.na(subset_value), + paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), + paste0('DE_', numerator, '_vs_', denominator, '.html') + ), + clean = TRUE, + # envir = new.env(), + params = list( + subset_value = subset_value, + numerator = numerator, + denominator = denominator, + params_file = file.path(path,'DE/params_de-example.R'), + project_file = file.path(path,'information.R'), + functions_file = file.path(path,'libs/load_data.R') + ) + ) + # browseURL(file.path(path, "DE/DE_tumor_vs_normal.html")) + # usethis::proj_activate(path) }) -test_that("rnaseq copy",{ +test_that("rnaseq qc",{ path <- withr::local_tempdir() print(path) bcbio_templates(type="rnaseq", outpath=path) - expect_length(fs::dir_ls(path,all=T),4) - # numerator="tumor" - # denominator="normal" - # subset_value=NA - # rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), - # output_dir = file.path(path,"DE"), - # output_format = "html_document", - # output_file = ifelse(!is.na(subset_value), - # paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), - # paste0('DE_', numerator, '_vs_', denominator, '.html') - # ), - # clean = TRUE, - # envir = new.env(), - # params = list( - # subset_value = subset_value, - # numerator = numerator, - # denominator = denominator, - # params_file = file.path(path,'DE/params_de-example.R'), - # project_file = file.path(path,'information.R'), - # functions_file = file.path(path,'DE/load_data.R') - # ) - # ) - # use_bcbio_projects(path, nfcore="nf-core/rnaseq", copy=TRUE, git=FALSE) + fs::dir_ls(path,all=T) + rmarkdown::render(input = file.path(path,"QC/QC_nf-core.Rmd"), + output_dir = file.path(path,"QC"), + output_format = "html_document", + clean = TRUE, + params = list( + params_file = file.path(path,'QC/params_qc_nf-core-example.R'), + project_file = file.path(path,'information.R'), + functions_file = file.path(path,'libs/load_data.R') + ) + ) + # browseURL(file.path(path, "QC/QC_nf-core.html")) + # usethis::proj_activate(path) }) - -# test_that("rnaseq testing", { -# path <- withr::local_tempdir() -# print(path) -# bcbio_templates(type="rnaseq", outpath=path) -# numerator="tumor" -# denominator="normal" -# subset_value=NA -# rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), -# output_dir = file.path(path,"DE"), -# output_format = "html_document", -# output_file = ifelse(!is.na(subset_value), -# paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), -# paste0('DE_', numerator, '_vs_', denominator, '.html') -# ), -# clean = TRUE, -# envir = new.env(), -# params = list( -# subset_value = subset_value, -# numerator = numerator, -# denominator = denominator, -# params_file = file.path(path,'DE/params_de.R'), -# project_file = file.path(path,'information.R'), -# functions_file = file.path(path,'DE/load_data.R') -# ) -# ) -# }) From 44ac670f1ab98c8b989990fe34e3039d1ff798e2 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Tue, 13 Aug 2024 14:50:30 -0400 Subject: [PATCH 49/93] dont run in macosx --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 8e9e284..f01ef9b 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: config: - - {os: macos-latest, r: 'release'} + # - {os: macos-latest, r: 'release'} #- {os: windows-latest, r: 'release'} - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - {os: ubuntu-latest, r: 'release'} From e96d3b47fbe0d59cefa579226e70472afcac5975 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Fri, 16 Aug 2024 10:38:48 -0400 Subject: [PATCH 50/93] no special chars in sample names --- R/helpers.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/helpers.R b/R/helpers.R index d3cb841..a78ecb6 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -22,6 +22,8 @@ bcbio_nfcore_check <- function(file){ stop("Missing required columns ", paste(required, collapse = " ")) }else if (any(grepl("^[1-9]", samplesheet[["sample"]]))){ stop("Avoid samples starting with numbers ") + }else if (any(grep("[^a-zA-Z0-9_]", samplesheet[["sample"]]))){ + stop("Sample names should contain only letters, numbers, and underscores") }else if (any(is.na(samplesheet))){ warning("Columns with missing values") }else{ From 64a1e992ddbb6e5618b07191435118a994819f22 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Fri, 23 Aug 2024 15:35:33 -0400 Subject: [PATCH 51/93] 0.3.0alpha release --- DESCRIPTION | 2 +- NEWS.md | 8 + inst/templates/rnaseq/DE/DEG.Rmd | 456 +++++++++++++++---------- inst/templates/rnaseq/libs/FA.R | 90 +++++ inst/templates/rnaseq/libs/load_data.R | 6 +- tests/testthat/rnaseq.R | 18 +- 6 files changed, 373 insertions(+), 207 deletions(-) create mode 100644 inst/templates/rnaseq/libs/FA.R diff --git a/DESCRIPTION b/DESCRIPTION index ebbce77..e2efa40 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: bcbioR Type: Package Title: Templates and functions to guide downstream analysis and data interpretation -Version: 0.2.0 +Version: 0.3.0 Authors@R: person("Pantano", "Lorena", , "lorena.pantano@gmail.com", role = c("aut", "cre")) Description: Collaborative code repository at the Harvard Chan Bioinformatics Core. diff --git a/NEWS.md b/NEWS.md index dd18251..c2b24bf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# bcbioR 0.3.0 + +* Support multiple comparisons +* re-structure templates +* Add text with best practices +* Add draft for methylation and spatial +* Add scQC shiny app + # bcbioR 0.1.3 * fix duplicated gene names diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd index 814b4fe..39c1e2e 100644 --- a/inst/templates/rnaseq/DE/DEG.Rmd +++ b/inst/templates/rnaseq/DE/DEG.Rmd @@ -15,22 +15,21 @@ output: collapsed: true smooth_scroll: true editor_options: - chunk_output_type: console + chunk_output_type: inline params: ## Combatseq and ruv can both be false or ONLY ONE can be true - ## Both cannot be true - numerator: tumor - denominator: normal - column: sample_type + # numerator: tumor + # denominator: normal + column: "sample_type" + contrasts: !r list(c("sample_type", "tumor", "normal")) subset_column: null subset_value: null - # Put hg38, mm10, mm39, or other genome: hg38 ruv: false combatseq: false params_file: params_de-example.R project_file: ../information.R - functions_file: ../libs/load_data.R + functions_file: ../libs --- @@ -42,17 +41,19 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) ```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) # 1. Set up input files in this R file (params_de.R) source(params$params_file) # 2. Set up project file (already done from QC probably) source(params$project_file) # 3. Load custom functions to load data from coldata/metrics/counts -source(params$functions_file) +map(list.files(params$functions_file,pattern = "*.R$",full.names = T),source) %>% invisible() # IMPORTANT set these values if you are not using the parameters in the header (lines 22-31) genome=params$genome column=params$column -numerator=params$numerator -denominator=params$denominator +contrasts=params$contrasts +# numerator=params$numerator +# denominator=params$denominator subset_column=params$subset_column subset_value=params$subset_value run_ruv=params$ruv @@ -80,6 +81,7 @@ library(pheatmap) library(janitor) library(ggforce) library(vegan) +library(htmltools) colors=cb_friendly_cols(1:15) ggplot2::theme_set(theme_prism(base_size = 14)) @@ -108,13 +110,10 @@ sanitize_datatable = function(df, ...) { } ``` - - ```{r load_data, message=F, warning=F} # This code will load from bcbio or nf-core folder # NOTE make sure to set numerator and denominator coldata <- load_coldata(coldata_fn, column, - numerator, denominator, subset_column, subset_value) coldata$sample=row.names(coldata) @@ -128,11 +127,10 @@ rownames(metrics) <- metrics$sample # if the names don't match in order or string check files names and coldata information counts = counts[,rownames(metrics)] coldata = coldata[rownames(metrics),] +coldata[[contrasts[[1]][1]]] = relevel(as.factor(coldata[[contrasts[[1]][1]]]), contrasts[[1]][3]) stopifnot(all(names(counts) == rownames(metrics))) ``` - - # Overview - Project: `r project` @@ -140,30 +138,6 @@ stopifnot(all(names(counts) == rownames(metrics))) - Analyst: `r analyst` - Experiment: `r experiment` - Aim: `r aim` -- Comparison: `r ifelse(is.null(subset_value), paste0(numerator, ' vs. ', denominator), paste0(subset_value, ': ', numerator, ' vs. ', denominator))` - -```{r create_filenames} - -if (!is.null(subset_value) & !is.null(subset_value)){ - filenames = str_interp("${subset_value}_${numerator}_vs_${denominator}") -} else { - filenames = str_interp("${numerator}_vs_${denominator}") -} - -contrasts = c(column,numerator,denominator) -coef=paste0(column,"_",numerator,"_vs_",denominator) - -name_expression_fn=file.path( - basedir, - str_interp("${filenames}_expression.csv")) -name_deg_fn=file.path( - basedir, - str_interp("${filenames}_deg.csv")) -name_pathways_fn=file.path( - basedir, - str_interp("${filenames}_pathways.csv")) - -``` ```{r load_counts_data} @@ -173,6 +147,15 @@ rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL ``` +# Set up + +We recommend not to filter before DESeq2 since it will be handling by it. There are cases where pre-filtering could be good: + +- large number of drop-outs, to reduce computation +- large number of samples, to reduce computation +- unbalanced groups, many less samples for one group than another, maybe filtering by group is an option. + + ```{r setup_RUV} dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1) @@ -209,7 +192,7 @@ pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " gen geom_mark_ellipse(aes(color = sample_type)) + scale_color_cb_friendly() ``` -## PERMDISP +## Analysis of the variance by group Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) @@ -225,7 +208,6 @@ permutest(mod, pairwise = TRUE) ``` - # Covariate analysis Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values. @@ -240,6 +222,7 @@ degCovariates( ) ``` +# Data modeling ```{r init_DESEQ} @@ -254,23 +237,28 @@ norm_matrix = assay(vsd_before) new_cdata <- coldata ``` +For this study, this formula is recommended: `r as.character(formula)` -```{r, eval=F, echo=FALSE} -#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION -### RUV - LINES 261-296 -### COMBATSEQ - LINES 303-369 +```{r, eval=F, echo=FALSE} +#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION or remove this section ``` - +## Remove unwanted variation ```{r, eval=run_ruv, results='asis', echo=run_ruv} -cat("# Remove Unwanted Variability +cat("### Assessing unknown factors When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") ``` +```{r, eval=!run_ruv, results='asis', echo=run_ruv} +cat("There is no need to assess unknown factor for this study.") +``` + ```{r do_RUV, eval=run_ruv, echo=run_ruv} +# TOFIX Add to template: check correlation of dummy variables produced by ruvseq with existing covariates in metadata +# NOTE ruvseq (used when you don’t know where the unwanted variation is coming from. Package utilizes dummy variable(s), 1-5 used, start with 1, look at PCA, decide if you want more separation) Add any known-created RUV variables to DESeq2 formula. Normalized matrix produced – only for visualization, not for input into DESeq2 library(RUVSeq) # If you want to skip the code, just set up formula to be your model in the next chunk of code @@ -290,7 +278,7 @@ formula <- as.formula(paste0("~ ", collapse = " + " ), " + ", column) ) -norm_matrix=ruvset$normalizedCounts +norm_matrix=ruvset$normalizedCounts # NOTE use this for visualization pca2 <- degPCA(norm_matrix, new_cdata, condition = column) + ggtitle('After RUV') pca2 + scale_color_cb_friendly() @@ -298,16 +286,16 @@ pca2 + scale_color_cb_friendly() ``` ```{r after_RUV, eval=run_ruv} - dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) vsd_to_use<- vst(dds_to_use, blind=FALSE) - ``` + ```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq} +# NOTE Combatseq (part of the SVA package) - corrected count, removing the effects while retaining the structure of the data. Used in a scenario where you know what covariate/batch is. Do not add know-removed known covariates to DESeq2 formula. Also, don’t attempt to remove biological effect (e.g. donor), this is not conceptually valid; best for technical variation. library(sva) -cat("# Remove Batch Effects +cat("### Remove Batch Effects Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. @@ -315,6 +303,9 @@ Combat-seq uses a negative binomial regression to model batch effects, providing ``` +```{r, eval=!run_combatseq, results='asis', echo=run_combatseq} +cat("There is no need to remove known factors like batch effect in this study.") +``` ```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq} @@ -348,7 +339,6 @@ treatment = coldata[[to_keep]] ``` - ```{r do_combatseq, eval=run_combatseq} adjusted_counts <- ComBat_seq(as.matrix(counts), batch=batch, group = treatment) @@ -363,12 +353,11 @@ adjusted_counts <- ComBat_seq(as.matrix(counts), batch=batch, group = treatment) dds_to_use <- DESeqDataSetFromMatrix(adjusted_counts, coldata, design = formula) vsd_combat<- vst(dds_to_use, blind=FALSE) -combat_matrix = assay(vsd_combat) +norm_matrix = assay(vsd_combat) -pca_combat <- degPCA(combat_matrix, coldata, +pca_combat <- degPCA(norm_matrix, coldata, condition = column) + ggtitle('After Combatseq') pca_combat + scale_color_cb_friendly() - ``` @@ -381,113 +370,166 @@ Before fitting the model, we often look at a metric called dispersion, which is We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model. ```{r DE} +# NOTE Note VST won’t regress out this when normalizing de <- DESeq(dds_to_use) - DESeq2::plotDispEsts(de) ``` Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level. -```{r lfc_shrink} -# resultsNames(de) # check the order is right -resLFC = results(de, contrast=contrasts) -resLFCS <- lfcShrink(de, coef=coef, type="apeglm") +In cases there are multiple groups and conditions across groups is recommended to use dummy variables instead of interaction terms: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#interactions. -res <- as.data.frame(resLFCS) %>% - rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% - relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>% - mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi) +The LRT is useful for testing multiple terms at once, for example testing 3 or more levels of a factor at once, or all interactions between two variables. The LRT for count data is conceptually similar to an analysis of variance (ANOVA) calculation in linear regression, except that in the case of the Negative Binomial GLM, we use an analysis of deviance (ANODEV), where the deviance captures the difference in likelihood between a full and a reduced model. -## Filter out genes that have no expression or were filtered out by DESEQ2 -res <- res[res$baseMean>0,] %>% drop_na(padj) %>% drop_na(pvalue) +```{r lfc_shrink} +# NOTE As a note: Use `ashr` for comparisons with many groups to be able to pull out all the contrasts; otherwise `apeglm` is fine. It shrinks less. -res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj) %>% - mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name)) +# NOTE We recommend LRT for time series -res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5)) -show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")]) +# resultsNames(de) # check the order is right +names_to_use=lapply(contrasts, function(contrast){coef = paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3])}) +names(contrasts)=names_to_use +de_list=lapply(contrasts, function(contrast){ + resLFC = results(de, contrast=contrast) + coef = paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3]) + resLFCS <- lfcShrink(de, coef=coef, type="apeglm") + # resLFCS <- lfcShrink(de, contrast=contrast, type="ash") + + res <- as.data.frame(resLFCS) %>% + rownames_to_column('gene_id') %>% left_join(rdata, by = 'gene_id') %>% + relocate(gene_name) %>% dplyr::rename(lfc = log2FoldChange) %>% + mutate(pi = abs(lfc) * -log10(padj)) %>% arrange(-pi) + + ## Filter out genes that have no expression or were filtered out by DESEQ2 + res <- res[res$baseMean>0,] %>% drop_na(padj) %>% drop_na(pvalue) + + res_sig <- res %>% filter(padj < 0.05) %>% arrange(padj) %>% + mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name)) + results=list(lfc=resLFC, lfcs=resLFCS, all=res, sig=res_sig) + return(results) +}) -degMA(as.DEGSet(resLFC)) + ggtitle('Before LFC Shrinking') +# NOTE if you add manually any other comparison to the list with the following variables, +# the code below will make the plots for those as wells: +# de_list=c(de_list, new_comparison=list(lfc=resLFC, lfcs=resLFCS, all=res, sig=res_sig)) ``` -## MA plot +## MA plot {.tabset} -```{r after_lfc_shrink} -degMA(as.DEGSet(resLFCS), limit = 2) + ggtitle('After LFC Shrinking') +This plot can help to: +- Identify Differential Expression: Genes that show a significant log-fold change (M value away from 0) indicate changes in expression between conditions. +- Assess Data Quality: The plot can help in identifying biases or systematic errors in the data. Ideally, most points should scatter around the M=0 line, indicating that there is no significant systematic difference between the conditions. +- Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of expression levels and any patterns or anomalies in the dataset. +```{r after_lfc_shrink, results='asis', message=F, warning=F} +for (contrast in names(de_list)){ + cat("### ", contrast, "\n\n") + p1=degMA(as.DEGSet(de_list[[contrast]]$lfc)) + ggtitle('Before LFC Shrinking') + print(p1) + p2=degMA(as.DEGSet(de_list[[contrast]]$lfcs), limit = 2) + ggtitle('After LFC Shrinking') + print(p2) + cat("\n\n") +} ``` -## Volcano plot +## Volcano plot {.tabset} This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in red are genes that have padj < 0.05 and a log2-fold change > 1. Points in blue have a padj < 0.05 and a log2-fold change < 1 and points in green have a padj > 0.05 and a log2-fold change > 2. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2 foldchance and padj that we have chosen. -```{r volcano_plot, fig.height=6} +```{r volcano_plot, fig.height=6, results='asis'} # degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) -EnhancedVolcano(res_mod, - lab= res_mod$gene_name, - pCutoff = 0.05, - selectLab = c(res_sig$gene_name[1:15]), - FCcutoff = 0.5, - x = 'lfc', - y = 'padj', - title="Volcano Tumor vs. Normal", - col=as.vector(colors[c("dark_grey", "light_blue", +for (contrast in names(de_list)){ + cat("### ", contrast, "\n\n") + res <- de_list[[contrast]][["all"]] + res_mod <- res %>% mutate(lfc = replace(lfc, lfc < -5, -5)) %>% mutate(lfc = replace(lfc, lfc > 5, 5)) + show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")]) + p1=EnhancedVolcano(res_mod, + lab= res_mod$gene_name, + pCutoff = 0.05, + selectLab = c(show$gene_name), + FCcutoff = 0.5, + x = 'lfc', + y = 'padj', + title=contrast, + col=as.vector(colors[c("dark_grey", "light_blue", "purple", "purple")]), - subtitle = "", drawConnectors = T, max.overlaps = Inf) + subtitle = "", drawConnectors = T, max.overlaps = Inf) + print(p1) + cat("\n\n") +} ``` -## Heatmap +## Heatmap {.tabset} -```{r heapmap} +```{r heapmap, results='asis'} ### Run pheatmap using the metadata data frame for the annotation -ma=norm_matrix[res_sig$gene_id,] -colma=coldata[,c(column), drop=FALSE] -colors=lapply(colnames(colma), function(c){ - l.col=colors[1:length(unique(colma[[c]]))] - names(l.col)=unique(colma[[c]]) - l.col -}) -names(colors)=colnames(colma) -pheatmap(ma, - color = inferno(10), - cluster_rows = T, - show_rownames = F, - annotation = colma, - annotation_colors = colors, - border_color = NA, - fontsize = 10, - scale = "row", - fontsize_row = 10, - height = 20) +for (contrast in names(de_list)){ + cat("### ", contrast, "\n\n") + res_sig = de_list[[contrast]][["sig"]] + ma=norm_matrix[res_sig$gene_id,] + colma=coldata[,c(column), drop=FALSE] + ma_colors=lapply(colnames(colma), function(c){ + l.col=colors[1:length(unique(colma[[c]]))] + names(l.col)=unique(colma[[c]]) + l.col + }) + names(ma_colors)=colnames(colma) + p1=pheatmap(ma, + color = inferno(10), + cluster_rows = T, + show_rownames = F, + annotation = colma, + annotation_colors = ma_colors, + border_color = NA, + fontsize = 10, + scale = "row", + fontsize_row = 10, + height = 20) + print(p1) + cat("\n\n") +} ``` -## Differentially Expressed Genes +## Differentially Expressed Genes {.tabset} -```{r sig_genes_table} -res_sig %>% sanitize_datatable +```{r sig_genes_table, results='asis'} +dt_list=list() +for (contrast in names(de_list)){ + res_sig=de_list[[contrast]][["sig"]] + dt_list=c(dt_list, + list(h3(contrast)), + list(DT::datatable(res_sig))) +} +tagList(dt_list) ``` -## Plot top 16 genes +## Plot top 16 genes {.tabset} -```{r top n DEGs, fig.height = 6, fig.width = 8} +```{r top n DEGs, fig.height = 6, fig.width = 8, results='asis'} n = 16 -top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% - dplyr::select(gene_name, gene_id) -top_n_exp <- norm_matrix %>% as.data.frame() %>% - rownames_to_column('gene_id') %>% - # dplyr::select(-group, -group_name) %>% - pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>% - right_join(top_n, relationship = "many-to-many") %>% - left_join(coldata, by = 'sample') - -ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + - geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + - geom_point() + - facet_wrap(~gene_name) + - ggtitle(str_interp('Expression of Top ${n} DEGs')) + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +for (contrast in names(de_list)){ + cat("### ", contrast, "\n\n") + res_sig = de_list[[contrast]][["sig"]] + top_n <- res_sig %>% slice_min(order_by = padj, n = n, with_ties = F) %>% + dplyr::select(gene_name, gene_id) + top_n_exp <- norm_matrix %>% as.data.frame() %>% + rownames_to_column('gene_id') %>% + # dplyr::select(-group, -group_name) %>% + pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>% + right_join(top_n, relationship = "many-to-many") %>% + left_join(coldata, by = 'sample') + + p1=ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + + geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + + geom_point() + + facet_wrap(~gene_name) + + ggtitle(str_interp('Expression of Top ${n} DEGs')) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + print(p1) + cat("\n\n") +} ``` # Pathway Enrichment @@ -495,80 +537,120 @@ ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment. ```{r} -universe=res %>% - filter(!is.na(padj)) %>% pull(gene_id) -mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENTREZID', 'ENSEMBL') - -all_in_life=list( - msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"), - msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), - msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG"), - msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), - msigdbr(species = "human", category = "C5", subcategory = "GO:BP"), - msigdbr(species = "human", category = "C5", subcategory = "GO:MF"), - msigdbr(species = "human", category = "C5", subcategory = "HPO"), - msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), - msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") -) -ora_input = res %>% filter(!is.na(padj), padj<0.01, abs(lfc)>0.3) %>% pull(gene_id) -input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) - -total_deg=length(unique(ora_input))/length(unique(mapping$ENTREZID)) -pathways_ora_all = lapply(all_in_life, function(p){ - pathway = split(x = p$entrez_gene, f = p$gs_name) - db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") - respath <- fora(pathways = pathway, - genes = unique(input_entrezid$ENTREZID), - universe = unique(mapping$ENTREZID), - minSize = 15, - maxSize = 500) - coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], - pathway, unique(input_entrezid$ENTREZID), unique(mapping$ENTREZID)) - as_tibble(respath[pathway %in% coll_respath$mainPathways]) %>% - mutate(database=db_name, NES=(overlap/size)/(total_deg)) -}) %>% bind_rows() %>% - mutate(analysis="ORA") +all_in_life=get_databases() +``` + +# Pathway Analysis + +```{r, warning=F, message=F} +fa_list=lapply(de_list,function(contrast){ + + res=contrast[["all"]] + universe=res %>% + filter(!is.na(padj)) %>% pull(gene_id) + universe_mapping = AnnotationDbi::select(org.Hs.eg.db, universe, 'ENSEMBL', columns=c('ENTREZID', 'SYMBOL')) -ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% - group_by(pathway) %>% - left_join(mapping, by =c("overlapGenes"="ENTREZID")) %>% - dplyr::select(pathway, padj, NES, ENSEMBL, analysis, - database) + ora_input = res %>% filter(!is.na(padj), padj<0.01, abs(lfc)>0.3) %>% pull(gene_id) + #change to the right species + input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) + all=run_fora(input_entrezid, universe_mapping,all_in_life) + + ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc>0.3) %>% pull(gene_id) + #change to the right species + input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) + up=run_fora(input_entrezid, universe_mapping,all_in_life) + + ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc<0.3) %>% pull(gene_id) + #change to the right species + input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) + down=run_fora(input_entrezid, universe_mapping,all_in_life) -pathways_long = ora_tb + list(all=all,up=up,down=down) +}) +``` + +## All significant genes {.tabset} +```{r, results='asis'} +# NOTE DT::datatables doesn't work with tabset and for loops +# You can use the following code to print dynamically or call manually sanitize_datatable() +# multiple times +dt_list=list() +for (contrast in names(de_list)){ + res_sig=fa_list[[contrast]][["all"]] + dt_list=c(dt_list, + list(h3(contrast)), + list(DT::datatable(res_sig))) +} +tagList(dt_list) +``` + + +## Down-regulated genes {.tabset} + +```{r, results='asis'} +dt_list=list() +for (contrast in names(de_list)){ + res_sig=fa_list[[contrast]][["down"]] + dt_list=c(dt_list, + list(h3(contrast)), + list(DT::datatable(res_sig))) +} +tagList(dt_list) ``` -```{r pathaways_table} -pathways_ora_all %>% sanitize_datatable() +## Up-regulated genes {.tabset} + +```{r, results='asis'} +dt_list=list() +for (contrast in names(de_list)){ + res_sig=fa_list[[contrast]][["up"]] + dt_list=c(dt_list, + list(h3(contrast)), + list(DT::datatable(res_sig))) +} +tagList(dt_list) ``` -```{r write-files} -counts_norm=norm_matrix %>% as.data.frame() %>% - rownames_to_column("gene_id") %>% - mutate(comparison = str_interp("${numerator}_vs_${denominator}")) +# Save files -res_for_writing <- res %>% - mutate(comparison = str_interp("${numerator}_vs_${denominator}")) +```{r write_files} +if (!is.null(subset_value) & !is.null(subset_value)){ + filenames = str_interp("${subset_value}") +} else { + filenames = "" +} +for (contrast in names(contrasts)){ + filenames = paste0(filenames, "_", contrast) + name_expression_fn=file.path( + basedir, + str_interp("${filenames}_expression.csv")) + + name_deg_fn=file.path( + basedir, + str_interp("${filenames}_deg.csv")) + + name_pathways_fn=file.path( + basedir, + str_interp("${filenames}_pathways.csv")) -pathways_for_writing <- pathways_long %>% - mutate(comparison = str_interp("${numerator}_vs_${denominator}")) - -if (!is.null(subset_value)){ - counts_norm <- counts_norm %>% - mutate(subset = subset_value) - res_for_writing <- res_for_writing %>% - mutate(subset = subset_value) - pathways_for_writing <- pathways_for_writing %>% - mutate(subset = subset_value) + counts_norm=norm_matrix %>% as.data.frame() %>% + rownames_to_column("gene_id") %>% + mutate(comparison = contrast) + + res_for_writing <- de_list[[contrast]][["all"]] %>% + mutate(comparison = contrast) + + pathways_for_writing <- fa_list[[contrast]][["all"]] %>% + mutate(comparison = contrast) + write_csv(counts_norm, name_expression_fn) + write_csv(res_for_writing, name_deg_fn) + write_csv(pathways_for_writing, name_pathways_fn) } -write_csv(counts_norm, name_expression_fn) -write_csv(res_for_writing, name_deg_fn) -write_csv(pathways_for_writing, name_pathways_fn) ``` # R session diff --git a/inst/templates/rnaseq/libs/FA.R b/inst/templates/rnaseq/libs/FA.R new file mode 100644 index 0000000..d1d2a02 --- /dev/null +++ b/inst/templates/rnaseq/libs/FA.R @@ -0,0 +1,90 @@ +# library(msigdb) +# msigdb.hs = getMsigdb(org = 'hs', id = 'SYM', version = '7.5') +# +# get_databases_v2=function(){ +# all_in_life=list( +# GOBP=subsetCollection(msigdb.hs, 'c5', 'GO:BP'), +# GOMF=subsetCollection(msigdb.hs, 'c5', 'GO:MF'), +# HALLMARK=subsetCollection(msigdb.hs, 'h'), +# KEGG=subsetCollection(msigdb.hs, 'c2', 'CP:KEGG') +# ) %>% lapply(., function(geneset){ +# gs=lapply(geneset, function(x){ +# geneIds(x) +# }) +# names(gs)=sapply(geneset, setName) +# gs +# }) +# all_in_life +# } + +get_databases=function(){ + all_in_life=list( + msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"), + # msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), + msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG"), + # msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), + msigdbr(species = "human", category = "C5", subcategory = "GO:BP"), + msigdbr(species = "human", category = "C5", subcategory = "GO:MF") + # msigdbr(species = "human", category = "C5", subcategory = "HPO"), + # msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), + # msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") + ) +all_in_life +} + +run_fora_v2=function(input, uni, all_in_life){ + # browser() + total_deg=length(unique(input$SYMBOL))/length(unique(uni$SYMBOL)) + pathways_ora_all = lapply(names(all_in_life), function(database){ + pathway = all_in_life[[database]] + #pathway = split(x = p$entrez_gene, f = p$gs_name) + #db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") + respath <- fora(pathways = pathway, + genes = unique(input$SYMBOL), + universe = unique(uni$SYMBOL), + minSize = 15, + maxSize = 500) + coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], + pathway, unique(input$SYMBOL), unique(uni$SYMBOL)) + as_tibble(respath[pathway %in% coll_respath$mainPathways]) %>% + mutate(database=db_name, NES=(overlap/size)/(total_deg)) + }) %>% bind_rows() %>% + mutate(analysis="ORA") + ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% + group_by(pathway) %>% + left_join(uni, by =c("overlapGenes"="SYMBOL")) %>% + dplyr::select(pathway, padj, NES, SYMBOL, analysis, + database) %>% + group_by(pathway,padj,NES,database,analysis) %>% + summarise(genes=paste(SYMBOL,collapse = ",")) + ora_tb + +} + +run_fora=function(input, uni,all_in_life){ + # browser() + total_deg=length(unique(input))/length(unique(uni$ENTREZID)) + pathways_ora_all = lapply(all_in_life, function(p){ + pathway = split(x = p$entrez_gene, f = p$gs_name) + db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") + respath <- fora(pathways = pathway, + genes = unique(input$ENTREZID), + universe = unique(uni$ENTREZID), + minSize = 15, + maxSize = 500) + coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], + pathway, unique(input$ENTREZID), unique(uni$ENTREZID)) + as_tibble(respath[pathway %in% coll_respath$mainPathways]) %>% + mutate(database=db_name, NES=(overlap/size)/(total_deg)) + }) %>% bind_rows() %>% + mutate(analysis="ORA") + ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% + group_by(pathway) %>% + left_join(uni, by =c("overlapGenes"="ENTREZID")) %>% + dplyr::select(pathway, padj, NES, SYMBOL, analysis, + database) %>% + group_by(pathway,padj,NES,database,analysis) %>% + summarise(genes=paste(SYMBOL,collapse = ",")) + ora_tb + +} diff --git a/inst/templates/rnaseq/libs/load_data.R b/inst/templates/rnaseq/libs/load_data.R index 6462edb..aaa32af 100644 --- a/inst/templates/rnaseq/libs/load_data.R +++ b/inst/templates/rnaseq/libs/load_data.R @@ -101,7 +101,7 @@ load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){ return(metrics) } -load_coldata <- function(coldata_fn, column=NULL, numerator=NULL, denominator=NULL, subset_column = NULL, subset_value = NULL){ +load_coldata <- function(coldata_fn, column=NULL, subset_column = NULL, subset_value = NULL){ coldata=read.csv(coldata_fn) %>% dplyr::distinct(sample, .keep_all = T) %>% dplyr::select(!matches("fastq"), !matches("strandness")) %>% @@ -123,8 +123,8 @@ load_coldata <- function(coldata_fn, column=NULL, numerator=NULL, denominator=NU rownames(coldata) <- coldata$sample coldata$description <- coldata$sample - if (!is.null(denominator)) - coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) + # if (!is.null(denominator)) + # coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) return(coldata) } diff --git a/tests/testthat/rnaseq.R b/tests/testthat/rnaseq.R index bdebb74..42dfd1f 100644 --- a/tests/testthat/rnaseq.R +++ b/tests/testthat/rnaseq.R @@ -6,27 +6,13 @@ test_that("rnaseq deg",{ print(path) bcbio_templates(type="rnaseq", outpath=path) fs::dir_ls(path,all=T) - numerator="tumor" - denominator="normal" - subset_value=NA rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), output_dir = file.path(path,"DE"), output_format = "html_document", - output_file = ifelse(!is.na(subset_value), - paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), - paste0('DE_', numerator, '_vs_', denominator, '.html') - ), - clean = TRUE, + clean = TRUE # envir = new.env(), - params = list( - subset_value = subset_value, - numerator = numerator, - denominator = denominator, - params_file = file.path(path,'DE/params_de-example.R'), - project_file = file.path(path,'information.R'), - functions_file = file.path(path,'libs/load_data.R') - ) ) + # browseURL(file.path(path, "DE/DE_tumor_vs_normal.html")) # usethis::proj_activate(path) }) From b216020f8b7042fb9293bc7406457b54d5db0dfd Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Fri, 23 Aug 2024 15:39:21 -0400 Subject: [PATCH 52/93] add credit --- inst/templates/rnaseq/DE/DEG.Rmd | 3 ++- inst/templates/rnaseq/QC/QC_nf-core.Rmd | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd index 39c1e2e..798ecd8 100644 --- a/inst/templates/rnaseq/DE/DEG.Rmd +++ b/inst/templates/rnaseq/DE/DEG.Rmd @@ -32,6 +32,7 @@ params: functions_file: ../libs --- +Template developed with materials from https://hbctraining.github.io/main/. ```{r} # This set up the working directory to this file so all files can be found @@ -538,7 +539,7 @@ From the set of differentially expressed genes and using publicly available info ```{r} -all_in_life=get_databases() +wiall_in_life=get_databases() ``` # Pathway Analysis diff --git a/inst/templates/rnaseq/QC/QC_nf-core.Rmd b/inst/templates/rnaseq/QC/QC_nf-core.Rmd index 2bfeff9..22841fd 100644 --- a/inst/templates/rnaseq/QC/QC_nf-core.Rmd +++ b/inst/templates/rnaseq/QC/QC_nf-core.Rmd @@ -27,6 +27,8 @@ params: factor_of_interest: sample_type --- +Template developed with materials from https://hbctraining.github.io/main/. + ```{r, cache = FALSE, message = FALSE, warning=FALSE} # This set up the working directory to this file so all files can be found library(rstudioapi) From 08af050d7d420814a39f7298994705a525f5f7df Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Fri, 23 Aug 2024 16:59:50 -0400 Subject: [PATCH 53/93] first draft --- .gitignore | 1 + inst/templates/chipseq/QC/QC.Rmd | 358 ++++++++++++++++++ inst/templates/chipseq/QC/params_qc-example.R | 11 + inst/templates/chipseq/information.R | 6 + inst/templates/chipseq/libs/load_data.R | 83 ++++ inst/templates/chipseq/readme.md | 12 + 6 files changed, 471 insertions(+) create mode 100644 inst/templates/chipseq/QC/QC.Rmd create mode 100644 inst/templates/chipseq/QC/params_qc-example.R create mode 100644 inst/templates/chipseq/information.R create mode 100644 inst/templates/chipseq/libs/load_data.R create mode 100644 inst/templates/chipseq/readme.md diff --git a/.gitignore b/.gitignore index 3302caf..0da43d7 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ tests/* .httr-oauth .DS_Store .quarto +inst/templates/chipseq/QC/QC.html diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd new file mode 100644 index 0000000..cb6b71d --- /dev/null +++ b/inst/templates/chipseq/QC/QC.Rmd @@ -0,0 +1,358 @@ +--- +title: "Quality Control" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + # Fill this file with the right paths to nfcore output + # params_file: params_qc_nf-core-example.R # example data + params_file: params_qc-example.R + project_file: ../information.R + functions_file: ../libs/load_data.R + factor_of_interest: genotype + +--- + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +# library(rstudioapi) +# setwd(fs::path_dir(getSourceEditorContext()$path)) +``` + + +```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. set up factor_of_interest parameter from parameter above or manually +# this is used to color plots, it needs to be part of the metadata +# 2. Set input files in this file +source(params$params_file) +# 3. If you set up this file, project information will be printed below and +#. it can be reused for other Rmd files. +source(params$project_file) +# 4. Load custom functions to load data from coldata/metrics/counts +source(params$functions_file) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) +library(knitr) +library(rtracklayer) +library(DESeq2) +library(DEGreport) +library(ggrepel) +# library(RColorBrewer) +library(DT) +library(pheatmap) +library(bcbioR) +library(janitor) +ggplot2::theme_set(theme_light(base_size = 14)) +opts_chunk[["set"]]( + cache = FALSE, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4) +``` + + +```{r sanitize-datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + +# Samples and metadata + +```{r load_data, message=F, warning=F} +# This code will load from bcbio or nf-core folder +# NOTE make sure to set numerator and denominator +coldata <- load_coldata(coldata_fn) +coldata$sample=row.names(coldata) + +metrics <- load_metrics(multiqc_data_dir) + +metrics <- full_join(coldata, metrics) +rownames(metrics) <- metrics$sample +dds <- load_counts(counts_fn) + +coldata_for_dds = metrics[colnames(dds),] +stopifnot(all(colnames(dds) == rownames(coldata_for_dds))) + +peaks <- load_peaks(peaks_dir) %>% left_join(coldata) +``` + +```{r show_metadata} +metrics_lite <- metrics %>% dplyr::select(sample, total_reads, mapped_reads_pct, frip, peak_count) +full_join(coldata, metrics_lite) %>% sanitize_datatable() +``` + +# Read metrics {.tabset} + +## Total reads + +Here, we want to see consistency and a minimum of 20 million reads (the grey line). + +```{r plot_total_reads} +metrics %>% + ggplot(aes(x = sample, + y = total_reads, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Total reads")+ + geom_hline(yintercept=20000000, color = "grey", linewidth=2) + +``` + +## Mapping rate + +```{r calc_min_max_pct_mapped} +#get min percent mapped reads for reference +min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1) +max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) +``` + +The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping (the grey line). These samples have mapping rates: `r min_pct_mapped` - `r max_pct_mapped`%. + +```{r plot_mapping_rate} +metrics %>% + ggplot(aes(x = sample, + y = mapped_reads_pct, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "% reads mapped") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Mapping rate") + xlab("") + + geom_hline(yintercept=70, color = "grey", linewidth=2) + +``` + +## Mapped Reads + +```{r plot_mapped_reads} +metrics %>% + ggplot(aes(x = sample, + y = mapped_reads, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Mapped reads") + +``` + +## NSC + +```{r plot_nsc} +metrics %>% + ggplot(aes(x = sample, + y = nsc, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "NSC coefficient") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Normalized Strand Cross-Correlation") + +``` + + +## RSC + +```{r plot_rsc} +metrics %>% + ggplot(aes(x = sample, + y = rsc, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "NSC coefficient") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Relative Strand Cross-Correlation") + +``` + + +## Fraction of reads in peaks + +This figure shows what percentage of reads are mapping to regions within peaks called by macs2. The expected fraction of reads in peaks will vary by protein. + +```{r plot_frip} +metrics %>% filter(!is.na(frip)) %>% + ggplot(aes(x = sample, + y = frip, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "FRiP") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Fraction of reads in peaks") + +``` + +## Number of peaks + +Ideally, we will see a similar number of peaks between replicates. + +```{r plot_peak_count} +metrics %>% filter(!is.na(peak_count)) %>% + ggplot(aes(x = sample, + y = peak_count, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "Number of Peaks") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Number of Peaks") + +``` + +## Non-Redundant Fraction + +The NRF is the number of uniquely mapping reads divided by the total number of reads. The ENCODE website also sets out standardized thresholds for this as well and those are summarized in the table below. + +```{r nrf table} +NRF <- c("NRF < 0.5", "0.5 < NRF < 0.8", "0.8 < NRF < 0.9", "NRF > 0.9") +NRF_level <- c("Concerning", "Acceptable", "Compliant", "Ideal") + +NRF_df <- data.frame(NRF, NRF_level) + +colnames(NRF_df) <- c("NRF", "NRF Level") +NRF_df %>% sanitize_datatable() + +``` + +```{r plot_nrf} +metrics %>% + ggplot(aes(x = sample, + y = nrf, + fill = antibody)) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "Non-Redundant Fraction") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Non-Redundant Fraction")+ + geom_hline(yintercept = 0.9, linetype = "dashed", color="green") + + geom_hline(yintercept = 0.8, linetype = "dashed", color="orange") + + geom_hline(yintercept = 0.5, linetype = "dashed", color="red") + +``` + +# Correlation Heatmap + +Inter-correlation analysis (ICA) is another way to look at how well samples +cluster by plotting the correlation between the peak regions of the +samples. + +```{r clustering fig, fig.width = 10, fig.asp = .62} +vst_cor <- cor(assays(dds)$vst) + +colma=coldata_for_dds %>% as.data.frame() +rownames(colma) <- colma$sample +colma <- colma[rownames(vst_cor), ] +colma <- colma %>% dplyr::select(.data[[params$factor_of_interest]]) +anno_colors=lapply(colnames(colma), function(c){ + l.col=cb_friendly_pal('grey')(length(unique(colma[[c]]))) + names(l.col)=unique(colma[[c]]) + l.col +}) +names(anno_colors)=colnames(colma) + +p <- pheatmap(vst_cor, + annotation = colma, + annotation_colors = anno_colors, + show_rownames = T, + show_colnames = T, + color = cb_friendly_pal('heatmap')(15) + ) +p +``` + +# PCA + +We can run PCA to evaluate the variation amongst our samples and whether or not the greatest sources of variation in the data (PC1 and PC2) can be attributed to the factors of interest in this experiment. + +```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} + +pca1 <- degPCA(assays(dds)$vst, coldata_for_dds, + condition = params$factor_of_interest, data = T)[["plot"]] +pca2 <- degPCA(assays(dds)$vst, coldata_for_dds, + condition = params$factor_of_interest, data = T, pc1="PC3", pc2="PC4")[["plot"]] + +pca1 + scale_color_cb_friendly() +pca2 + scale_color_cb_friendly() + +``` + +# Sample Concordance {.tabset} + +## Peak enrichment vs. Peak rank + +In this plot, we are looking at each individual replicates to evaluate what number of peaks we would retain if threshholding by peak enrichment. It is also valuable to see how this differs between replicates within a sample group. + +```{r peak enrichment vs rank} +ggplot(peaks, aes(x = peak_rank, y = peak_enrichment, color = sample)) + + geom_line() + + scale_color_cb_friendly() + + xlab("Peak rank") + ylab("Peak enrichment") + +``` + +## Peak signal distribution + +Here, we plot a histogram of peak signal values for each sample. This plot can be used to help determine a minimum value for peak enrichment that can be used for filtering. + +```{r peak signal distribution} +ggplot(peaks, aes(x = peak_enrichment, fill = .data[[params$factor_of_interest]])) + + geom_histogram(aes(peak_enrichment)) + + scale_fill_cb_friendly() + + xlab("Peak enrichment") + +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/inst/templates/chipseq/QC/params_qc-example.R b/inst/templates/chipseq/QC/params_qc-example.R new file mode 100644 index 0000000..e622725 --- /dev/null +++ b/inst/templates/chipseq/QC/params_qc-example.R @@ -0,0 +1,11 @@ +# info params + + +# Example data +coldata_fn='~/Downloads/chipseq_peakanalysis_H3K4Me.csv' +# This folder is in the output directory inside multiqc folder +multiqc_data_dir='~/O2/s3_results/chipseq_peakanalysis_h3k4me/multiqc/broadPeak/multiqc_data/' +# This folder is in the output director +peaks_dir = '~/O2/s3_results/chipseq_peakanalysis_h3k4me/bowtie2/mergedLibrary/macs2/broadPeak/' +# This folder is in the output directory +counts_fn = '~/O2/s3_results/chipseq_peakanalysis_h3k4me/bowtie2/mergedLibrary/macs2/broadPeak/consensus/H3K4me/deseq2/H3K4me.consensus_peaks.rds' diff --git a/inst/templates/chipseq/information.R b/inst/templates/chipseq/information.R new file mode 100644 index 0000000..6e15eef --- /dev/null +++ b/inst/templates/chipseq/information.R @@ -0,0 +1,6 @@ +# info params +project = "name_hbcXXXXX" +PI = 'person name' +experiment = 'short description' +aim = 'short description' +analyst = 'person in the core' diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R new file mode 100644 index 0000000..d44436d --- /dev/null +++ b/inst/templates/chipseq/libs/load_data.R @@ -0,0 +1,83 @@ +library(tidyverse) +library(SummarizedExperiment) +library(janitor) +load_metrics <- function(multiqc_data_dir){ + + # the reading-in of these next two files needs changing in order to correctly + # account for samples that have been sequenced multiple times. + # simply removing T1 is not the correct way to do it + fastqc <- read_tsv(file.path(multiqc_data_dir, 'multiqc_fastqc.txt')) %>% clean_names() %>% + dplyr::select(sample, total_reads = total_sequences) %>% + mutate(sample = gsub('_T1', '', sample)) + samtools <- read_tsv(file.path(multiqc_data_dir, 'multiqc_samtools_stats.txt')) %>% clean_names() %>% + dplyr::select(sample, mapped_reads = reads_mapped) %>% + mutate(sample = gsub('_T1', '', sample)) + + + phantom <- read_tsv(file.path(multiqc_data_dir, 'multiqc_phantompeakqualtools.txt')) %>% clean_names() %>% + dplyr::select(sample, nsc, rsc) + frip <- read_tsv(file.path(multiqc_data_dir, 'multiqc_frip_score-plot.txt')) %>% select(-Sample) %>% + pivot_longer(everything(), names_to = 'sample', values_to = 'frip') %>% filter(!is.na(frip)) + peak_count <- read_tsv(file.path(multiqc_data_dir, 'multiqc_peak_count-plot.txt')) %>% select(-Sample) %>% + pivot_longer(everything(), names_to = 'sample', values_to = 'peak_count') %>% filter(!is.na(peak_count)) + nrf <- read_tsv(file.path(multiqc_data_dir, 'mqc_picard_deduplication_1.txt')) %>% clean_names() %>% + mutate(nrf = unique_unpaired / (unique_unpaired + duplicate_unpaired)) %>% + dplyr::select(sample, nrf) + + metrics <- full_join(fastqc, samtools) %>% full_join(phantom) %>% full_join(frip) %>% + full_join(peak_count) %>% full_join(nrf) %>% + mutate(mapped_reads_pct = round(mapped_reads/total_reads*100,1)) + + metrics$sample <- make.names(metrics$sample) + rownames(metrics) <- metrics$sample + return(metrics) +} + +load_coldata <- function(coldata_fn, column=NULL, numerator=NULL, denominator=NULL, subset_column = NULL, subset_value = NULL){ + coldata=read.csv(coldata_fn) %>% + dplyr::distinct(sample, .keep_all = T) %>% + dplyr::select(!matches("fastq")) %>% + distinct() + if('description' %in% names(coldata)){ + coldata$sample <- tolower(coldata$description) + } + coldata <- coldata %>% distinct(sample, .keep_all = T) + if (!is.null(column)) + stopifnot(column %in% names(coldata)) + + # use only some samples, by default use all + if (!is.null(subset_column)){ + coldata <- coldata[coldata[[paste(subset_column)]] == subset_value, ] + } + #coldata <- coldata[coldata[[paste(column)]] %in% c(numerator, denominator), ] + #browser() + coldata$sample <- make.names(coldata$sample) + rownames(coldata) <- coldata$sample + coldata$description <- coldata$sample + coldata$antibody <- ifelse(coldata$antibody == '', 'input', coldata$antibody) + coldata$type <- ifelse(coldata$antibody == 'input', 'input', 'chip') + + if (!is.null(denominator)) + coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) + + return(coldata) +} + +load_counts <- function(counts_fn){ + + counts <- readRDS(counts_fn) + return(counts) + +} + +load_peaks <- function(peaks_dir){ + peaks_fns <- list.files(peaks_dir, pattern = '_peaks.broadPeak') + names(peaks_fns) <- gsub('_peaks.broadPeak', '', peaks_fns) + peaks_all <- lapply(peaks_fns, function(fn) { + peaks <- read_delim(file.path(peaks_dir, fn), col_names = F) + peaks_df <- data.frame(peak_enrichment = peaks$X7, peak_rank = rank(dplyr::desc(peaks$X7))) %>% + dplyr::arrange(peak_rank) + return(peaks_df) + }) %>% bind_rows(.id = 'sample') + return(peaks_all) +} diff --git a/inst/templates/chipseq/readme.md b/inst/templates/chipseq/readme.md new file mode 100644 index 0000000..112ddac --- /dev/null +++ b/inst/templates/chipseq/readme.md @@ -0,0 +1,12 @@ +# Guidelines for analysis + +Make sure there is a valid project name, and modify `information.R` with the right information for your project. You can use this file with any other Rmd to include the project/analysis information. + +## QC + +`QC/QC.Rmd` is a template for QC metrics. It includes basic read-level statistics, peak quality information, sample correlation analysis, and PCA. + +## DropBox + +- In `reports/QC` + - [ ] copy QC `Rmd/R/html/figures` From 99724317d1d27682d24eb6a1aa966a7d73c63eca Mon Sep 17 00:00:00 2001 From: eberdan Date: Mon, 26 Aug 2024 10:47:19 -0400 Subject: [PATCH 54/93] Update QC.rmd added parameters file from shiny app and a few other edits. --- inst/templates/singlecell/QC/QC.rmd | 58 +++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/inst/templates/singlecell/QC/QC.rmd b/inst/templates/singlecell/QC/QC.rmd index 60bb9de..1294608 100644 --- a/inst/templates/singlecell/QC/QC.rmd +++ b/inst/templates/singlecell/QC/QC.rmd @@ -5,13 +5,16 @@ date: "`r Sys.Date()`" params: ## If you have Ribosomal ratio in your raw seurat object put this as TRUE otherwise leave as FALSE ribosomal: FALSE + params_file: parameters.R + project_file: ../information.R --- # Overview -- Project: project -- PI: PI -- Analyst: analyst +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` ```{r, eval=FALSE} @@ -19,18 +22,27 @@ params: # This is a template for scRNA QC to present to your client. The actual QC can be done using our rshiny app: -# After you have decided on your QC metrics load your raw object (i.e. right after you first read data into seurat) and create your QC object by editing lines 49-67. +# After you have decided on your QC metrics load your raw object (i.e. right after you first read data into seurat) and put the parameters.R file you got from the shiny app in the same folder as this rmd. -# Edit text line 246 with your chosen QC cutoffs! ``` +```{r} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +``` + + ```{r setup, include=FALSE} library(Seurat) library(tidyverse) library(ggplot2) +# 1. Set up input files in this R file (params_de.R) +source(params$params_file) + knitr::opts_chunk[["set"]]( cache = FALSE, dev = c("png", "pdf"), @@ -46,7 +58,7 @@ knitr::opts_chunk[["set"]]( -```{r load and filter} +```{r load and filter no ribo} ## Load data seurat_raw <- seurat_clust <- readRDS("seurat_pre-filtered.rds") @@ -54,21 +66,37 @@ seurat_raw <- seurat_clust <- readRDS("seurat_pre-filtered.rds") ## Creat QC object USE METRICS YOU CHOSE IN THE RSHINY APP seurat_qc <- subset(x = seurat_raw, - subset = (nCount_RNA >= 1500) - & (nFeature_RNA >= 2200) - & (mitoRatio < 0.1) - ## & (riboRatio < 0.4) - & (Log10GenesPerUMI > 0.80) + subset = (nCount_RNA >= nCount_RNA_cutoff) + & (nFeature_RNA >= nFeature_RNA_cutoff) + & (mitoRatio < mitoRatio_cutoff) + ## & (riboRatio < riboRatio_cutoff) + & (Log10GenesPerUMI > Log10GenesPerUMI_cutoff) ) -## Save QC object +``` -saveRDS(seurat_qc, file = "seurat_post-QC.rds") + +```{r load and filter ribo, eval=ribosomal, warning=FALSE, results='asis'} + +seurat_qc <- subset(x = seurat_raw, + subset = (nCount_RNA >= nCount_RNA_cutoff) + & (nFeature_RNA >= nFeature_RNA_cutoff) + & (mitoRatio < mitoRatio_cutoff) + & (riboRatio < riboRatio_cutoff) + & (Log10GenesPerUMI > Log10GenesPerUMI_cutoff) + ) ``` +```{r} + +## Save QC object +saveRDS(seurat_qc, file = "seurat_post-QC.rds") + +``` + ```{r prep-info} @@ -243,7 +271,7 @@ metadata0 %>% # QC metrics: Filtered data {.tabset} -Based on the above QC metrics, we filtered the dataset to isolate cells passing the following thresholds: **>250 UMIs, >250 genes, <0.2 mitochondrial gene ratio, and >0.8 complexity**. +Based on the above QC metrics, we filtered the dataset to isolate cells passing the following thresholds: >`nCount_RNA_cutoff` UMIs, >`nFeature_RNA_cutoff` genes, <`mitoRatio_cutoff` mitochondrial gene ratio, and >`Log10GenesPerUMI_cutoff` complexity. In this section, we review QC metrics for our filtered dataset. @@ -391,10 +419,8 @@ metadata1 %>% ``` - # R session ```{r} sessionInfo() ``` - From 727bff9a2b0df47723f118fe60def8371aeffa74 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Mon, 26 Aug 2024 10:56:42 -0400 Subject: [PATCH 55/93] modify for multiple lanes per sample --- inst/templates/chipseq/QC/params_qc-example.R | 8 ++++---- inst/templates/chipseq/libs/load_data.R | 11 ++++++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/inst/templates/chipseq/QC/params_qc-example.R b/inst/templates/chipseq/QC/params_qc-example.R index e622725..5d10d43 100644 --- a/inst/templates/chipseq/QC/params_qc-example.R +++ b/inst/templates/chipseq/QC/params_qc-example.R @@ -2,10 +2,10 @@ # Example data -coldata_fn='~/Downloads/chipseq_peakanalysis_H3K4Me.csv' +coldata_fn='~/Downloads/chipseq_peakanalysis_H3K27Ac.csv' # This folder is in the output directory inside multiqc folder -multiqc_data_dir='~/O2/s3_results/chipseq_peakanalysis_h3k4me/multiqc/broadPeak/multiqc_data/' +multiqc_data_dir='~/O2/s3_results/chipseq_peakanalysis_h3k27ac/multiqc/broadPeak/multiqc_data/' # This folder is in the output director -peaks_dir = '~/O2/s3_results/chipseq_peakanalysis_h3k4me/bowtie2/mergedLibrary/macs2/broadPeak/' +peaks_dir = '~/O2/s3_results/chipseq_peakanalysis_h3k27ac/bowtie2/mergedLibrary/macs2/broadPeak/' # This folder is in the output directory -counts_fn = '~/O2/s3_results/chipseq_peakanalysis_h3k4me/bowtie2/mergedLibrary/macs2/broadPeak/consensus/H3K4me/deseq2/H3K4me.consensus_peaks.rds' +counts_fn = '~/O2/s3_results/chipseq_peakanalysis_h3k27ac/bowtie2/mergedLibrary/macs2/broadPeak/consensus/H3K27ac/deseq2/H3K27ac.consensus_peaks.rds' diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R index d44436d..ace70ef 100644 --- a/inst/templates/chipseq/libs/load_data.R +++ b/inst/templates/chipseq/libs/load_data.R @@ -8,11 +8,16 @@ load_metrics <- function(multiqc_data_dir){ # simply removing T1 is not the correct way to do it fastqc <- read_tsv(file.path(multiqc_data_dir, 'multiqc_fastqc.txt')) %>% clean_names() %>% dplyr::select(sample, total_reads = total_sequences) %>% - mutate(sample = gsub('_T1', '', sample)) + mutate(new_sample = gsub('_T[0-9]+', '', sample)) %>% + group_by(new_sample) %>% + summarize(new_total_reads = sum(total_reads)) %>% + dplyr::select(sample = new_sample, total_reads = new_total_reads) samtools <- read_tsv(file.path(multiqc_data_dir, 'multiqc_samtools_stats.txt')) %>% clean_names() %>% dplyr::select(sample, mapped_reads = reads_mapped) %>% - mutate(sample = gsub('_T1', '', sample)) - + mutate(new_sample = gsub('_T[0-9]+', '', sample)) %>% + group_by(new_sample) %>% + summarize(new_mapped_reads = sum(mapped_reads)) %>% + dplyr::select(sample = new_sample, mapped_reads = new_mapped_reads) phantom <- read_tsv(file.path(multiqc_data_dir, 'multiqc_phantompeakqualtools.txt')) %>% clean_names() %>% dplyr::select(sample, nsc, rsc) From d6624c58dc9616933d03725a41c9b7715d478df7 Mon Sep 17 00:00:00 2001 From: eberdan Date: Wed, 28 Aug 2024 13:01:47 -0400 Subject: [PATCH 56/93] Update seurat_init.R fixed crucial error --- inst/templates/singlecell/scripts/seurat_init.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/inst/templates/singlecell/scripts/seurat_init.R b/inst/templates/singlecell/scripts/seurat_init.R index d345556..079a451 100644 --- a/inst/templates/singlecell/scripts/seurat_init.R +++ b/inst/templates/singlecell/scripts/seurat_init.R @@ -45,6 +45,10 @@ seurat_merge$mitoRatio <- seurat_merge@meta.data$mitoRatio/100 # Divide by 100 f # Number of genes per UMI for each cell seurat_merge$Log10GenesPerUMI <- log10(seurat_merge$nFeature_RNA) / log10(seurat_merge$nCount_RNA) +# Extract cell level metadata +metadata <- seurat_merge@meta.data +metadata$barcode <- rownames(metadata) + # Import experimental metadata metaexp <- read.csv("/path/to/experimental/metadata/meta.csv") From 5e1ae0057178695693423c4ebd17e83e790c7f2b Mon Sep 17 00:00:00 2001 From: eberdan Date: Thu, 29 Aug 2024 10:10:35 -0400 Subject: [PATCH 57/93] Update QC.rmd fixed typo --- inst/templates/singlecell/QC/QC.rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/templates/singlecell/QC/QC.rmd b/inst/templates/singlecell/QC/QC.rmd index 1294608..b7c5d5b 100644 --- a/inst/templates/singlecell/QC/QC.rmd +++ b/inst/templates/singlecell/QC/QC.rmd @@ -61,7 +61,7 @@ knitr::opts_chunk[["set"]]( ```{r load and filter no ribo} ## Load data -seurat_raw <- seurat_clust <- readRDS("seurat_pre-filtered.rds") +seurat_raw <- readRDS("seurat_pre-filtered.rds") ## Creat QC object USE METRICS YOU CHOSE IN THE RSHINY APP From f3fbeb69a26cc650beaca0d209c937bfd1311fbd Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Fri, 30 Aug 2024 12:45:48 -0400 Subject: [PATCH 58/93] upset plot for overlap between peaks --- inst/templates/chipseq/QC/QC.Rmd | 49 ++++++++++++++++++++++++- inst/templates/chipseq/libs/load_data.R | 5 ++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd index cb6b71d..64495c5 100644 --- a/inst/templates/chipseq/QC/QC.Rmd +++ b/inst/templates/chipseq/QC/QC.Rmd @@ -65,6 +65,8 @@ library(DT) library(pheatmap) library(bcbioR) library(janitor) +library(ChIPpeakAnno) +library(UpSetR) ggplot2::theme_set(theme_light(base_size = 14)) opts_chunk[["set"]]( cache = FALSE, @@ -323,7 +325,7 @@ pca2 + scale_color_cb_friendly() ``` -# Sample Concordance {.tabset} +# Peak Signal Concordance {.tabset} ## Peak enrichment vs. Peak rank @@ -349,6 +351,51 @@ ggplot(peaks, aes(x = peak_enrichment, fill = .data[[params$factor_of_interest]] ``` +# Peak Overlap {.tabset} + +We examine the amount of overlap between peaks in replicates of the same experimental condition. + +``` {r peak overlap, results = 'asis', fig.width = 8} + +for (current_sample_group in unique(peaks$sample_group)){ + cat("## ", current_sample_group, "\n") + + peaks_sample_group <- peaks %>% filter(sample_group == current_sample_group) + + peaks_sample_group_granges <- sapply( + unique(peaks_sample_group$sample), + function(current_sample) { + ChIPpeakAnno::toGRanges( + peaks_sample_group %>% filter(sample == current_sample), + format = ifelse(grepl('broadPeak', peaks_dir), 'broadPeak', 'narroPeak') + ) + } + ) + + # maxgap defaults to -1 which means that two peaks will be merged if they overlap by at least 1 bp + # connectedpeaks examples (https://support.bioconductor.org/p/133486/#133603), if 5 peaks in group1 overlap with 2 peaks in group 2, setting connectedPeaks to "merge" will add 1 to the overlapping counts + overlaps <- findOverlapsOfPeaks(peaks_sample_group_granges, connectedPeaks = 'merge') + + set_counts <- overlaps$venn_cnt[, colnames(overlaps$venn_cnt)] %>% + as.data.frame() %>% + mutate(group_number = row_number()) %>% + pivot_longer(!Counts & !group_number, names_to = 'sample', values_to = 'member') %>% + filter(member > 0) %>% + group_by(Counts, group_number) %>% + summarize(group = paste(sample, collapse = '&')) + + set_counts_upset <- set_counts$Counts + names(set_counts_upset) <- set_counts$group + + p <- upset(fromExpression(set_counts_upset), order.by = "freq", text.scale = 1.5) + print(p) + + cat('\n\n') + +} + +``` + # R session List and version of tools used for the QC report generation. diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R index ace70ef..112e272 100644 --- a/inst/templates/chipseq/libs/load_data.R +++ b/inst/templates/chipseq/libs/load_data.R @@ -80,9 +80,12 @@ load_peaks <- function(peaks_dir){ names(peaks_fns) <- gsub('_peaks.broadPeak', '', peaks_fns) peaks_all <- lapply(peaks_fns, function(fn) { peaks <- read_delim(file.path(peaks_dir, fn), col_names = F) - peaks_df <- data.frame(peak_enrichment = peaks$X7, peak_rank = rank(dplyr::desc(peaks$X7))) %>% + peaks_df <- data.frame(seqnames = peaks$X1, start = peaks$X2, end = peaks$X3, + peak_enrichment = peaks$X7, peak_rank = rank(dplyr::desc(peaks$X7))) %>% dplyr::arrange(peak_rank) return(peaks_df) }) %>% bind_rows(.id = 'sample') + peaks_all$sample_group <- gsub('_REP[0-9]+', '', peaks_all$sample) + return(peaks_all) } From 84d93df0260a64259f11d96893d52eee0ab38b06 Mon Sep 17 00:00:00 2001 From: Alex Bartlett <74612800+abartlett004@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:17:46 -0400 Subject: [PATCH 59/93] Update load_data.R --- inst/templates/chipseq/libs/load_data.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R index 112e272..e109d73 100644 --- a/inst/templates/chipseq/libs/load_data.R +++ b/inst/templates/chipseq/libs/load_data.R @@ -3,9 +3,7 @@ library(SummarizedExperiment) library(janitor) load_metrics <- function(multiqc_data_dir){ - # the reading-in of these next two files needs changing in order to correctly - # account for samples that have been sequenced multiple times. - # simply removing T1 is not the correct way to do it + fastqc <- read_tsv(file.path(multiqc_data_dir, 'multiqc_fastqc.txt')) %>% clean_names() %>% dplyr::select(sample, total_reads = total_sequences) %>% mutate(new_sample = gsub('_T[0-9]+', '', sample)) %>% From 23b01c45a6480f35c1415279b029cd911576b7ea Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Wed, 4 Sep 2024 11:58:41 -0400 Subject: [PATCH 60/93] fix narrowpeak bugs --- inst/templates/chipseq/QC/QC.Rmd | 2 +- inst/templates/chipseq/QC/params_qc-example.R | 9 +++++---- inst/templates/chipseq/libs/load_data.R | 9 +++++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd index 64495c5..0ddf5eb 100644 --- a/inst/templates/chipseq/QC/QC.Rmd +++ b/inst/templates/chipseq/QC/QC.Rmd @@ -367,7 +367,7 @@ for (current_sample_group in unique(peaks$sample_group)){ function(current_sample) { ChIPpeakAnno::toGRanges( peaks_sample_group %>% filter(sample == current_sample), - format = ifelse(grepl('broadPeak', peaks_dir), 'broadPeak', 'narroPeak') + format = ifelse(grepl('broadPeak', peaks_dir), 'broadPeak', 'narrowPeak') ) } ) diff --git a/inst/templates/chipseq/QC/params_qc-example.R b/inst/templates/chipseq/QC/params_qc-example.R index 5d10d43..eeb3ce5 100644 --- a/inst/templates/chipseq/QC/params_qc-example.R +++ b/inst/templates/chipseq/QC/params_qc-example.R @@ -2,10 +2,11 @@ # Example data -coldata_fn='~/Downloads/chipseq_peakanalysis_H3K27Ac.csv' +coldata_fn='~/Downloads/chipseq_peakanalysis_PRDM16.csv' # This folder is in the output directory inside multiqc folder -multiqc_data_dir='~/O2/s3_results/chipseq_peakanalysis_h3k27ac/multiqc/broadPeak/multiqc_data/' +multiqc_data_dir='~/O2/s3_results/chipseq_peakanalysis_prdm16/multiqc/narrowPeak/multiqc_data/' # This folder is in the output director -peaks_dir = '~/O2/s3_results/chipseq_peakanalysis_h3k27ac/bowtie2/mergedLibrary/macs2/broadPeak/' +peaks_dir = '~/O2/s3_results/chipseq_peakanalysis_prdm16/bowtie2/mergedLibrary/macs2/narrowPeak/' # This folder is in the output directory -counts_fn = '~/O2/s3_results/chipseq_peakanalysis_h3k27ac/bowtie2/mergedLibrary/macs2/broadPeak/consensus/H3K27ac/deseq2/H3K27ac.consensus_peaks.rds' +# counts_fn = '~/O2/s3_results/chipseq_peakanalysis_h3k27ac/bowtie2/mergedLibrary/macs2/broadPeak/consensus/H3K27ac/deseq2/H3K27ac.consensus_peaks.rds' +counts_fn = '~/O2/s3_results/chipseq_peakanalysis_prdm16/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/PRDM16/deseq2/PRDM16.consensus_peaks.rds' diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R index 112e272..7ef45ad 100644 --- a/inst/templates/chipseq/libs/load_data.R +++ b/inst/templates/chipseq/libs/load_data.R @@ -76,8 +76,13 @@ load_counts <- function(counts_fn){ } load_peaks <- function(peaks_dir){ - peaks_fns <- list.files(peaks_dir, pattern = '_peaks.broadPeak') - names(peaks_fns) <- gsub('_peaks.broadPeak', '', peaks_fns) + if(grepl('broadPeak', peaks_dir)){ + peaks_fns <- list.files(peaks_dir, pattern = '_peaks.broadPeak') + names(peaks_fns) <- gsub('_peaks.broadPeak', '', peaks_fns) + } else { + peaks_fns <- list.files(peaks_dir, pattern = '_peaks.narrowPeak') + names(peaks_fns) <- gsub('_peaks.narrowPeak', '', peaks_fns) + } peaks_all <- lapply(peaks_fns, function(fn) { peaks <- read_delim(file.path(peaks_dir, fn), col_names = F) peaks_df <- data.frame(seqnames = peaks$X1, start = peaks$X2, end = peaks$X3, From 26240c4b09b16044295a177cdde1bf6d02170b72 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Fri, 6 Sep 2024 15:18:25 -0400 Subject: [PATCH 61/93] HTO demux --- .../templates/singlecell/scripts/demux_HTOs.R | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 inst/templates/singlecell/scripts/demux_HTOs.R diff --git a/inst/templates/singlecell/scripts/demux_HTOs.R b/inst/templates/singlecell/scripts/demux_HTOs.R new file mode 100644 index 0000000..70ba210 --- /dev/null +++ b/inst/templates/singlecell/scripts/demux_HTOs.R @@ -0,0 +1,44 @@ +library(Seurat) +library(qs) + +############################ perform demultiplexing ############################ + +# replace these paths with ones to your data (cellranger outputs) +sample_matrix <- ReadMtx('data/tumor_4068_ovarian_cdki/processed_data/matrix.mtx.gz', + cells = 'data/tumor_4068_ovarian_cdki/processed_data/barcodes.tsv.gz', + features = 'data/tumor_4068_ovarian_cdki/processed_data/features.tsv.gz') + +# create two matrices of counts: one of for hashtag oligo counts, and one for counts for actual genes +# include in the HTO count matrix only those HTOs that are actually assigned to samples in your dataset (in this case, Hashtag1 and Hashtag2) +hto_matrix <- full_matrix[grepl('Hashtag[12]+', rownames(full_matrix)), ] +expression_matrix <- full_matrix[!grepl('Hashtag', rownames(full_matrix)), ] + +# create a Seurat object from the raw data, including a slot for HTO counts +sample_seurat <- CreateSeuratObject( + counts = Matrix::Matrix(as.matrix(expression_matrix), sparse = T)) +sample_seurat[["HTO"]] <- CreateAssayObject(counts = hto_matrix) + +# normalize both slots in the Seurat object +sample_seurat <- NormalizeData(sample_seurat) +sample_seurat <- NormalizeData(sample_seurat, assay = "HTO", normalization.method = "CLR") + +# perform demultiplexing. adjust positive.quantile as necessary to call more/fewer cells as hashtag-positive +sample_seurat <- HTODemux(sample_seurat, assay = "HTO", positive.quantile = 0.99) + +qsave(sample_seurat, 'data/processed/hto_demux_seurat.qs') +# saveRDS(sample_seurat, 'data/processed/hto_demux_seurat.rds') + + +################## evaluate demultiplexing performance ######################### + +# distributions of expression of hashtags should make sense considering hashtag assigned +RidgePlot(sample_seurat, assay = "HTO", features = c("Hashtag1", 'Hashtag2'), ncol = 2) + +# evaluate expression of hashtags vs calls for singlet, doublet, and unassigned +FeatureScatter(sample_seurat, feature1 = "Hashtag1", feature2 = "Hashtag2") +HTOHeatmap(sample_seurat, assay = "HTO") + +# evaluate nCount_RNA of cells classified as doublets +Idents(sample_seurat) <- "HTO_classification.global" +VlnPlot(sample_seurat, features = "nCount_RNA", pt.size = 0.1, log = TRUE) + From 695962cc107308bb0b25b637be84022932a2f72a Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Tue, 10 Sep 2024 17:47:20 -0400 Subject: [PATCH 62/93] gsva template --- .gitignore | 1 + inst/templates/rnaseq/DE/GSVA.Rmd | 216 ++++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100644 inst/templates/rnaseq/DE/GSVA.Rmd diff --git a/.gitignore b/.gitignore index 0da43d7..184104b 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ tests/* .DS_Store .quarto inst/templates/chipseq/QC/QC.html +*.html \ No newline at end of file diff --git a/inst/templates/rnaseq/DE/GSVA.Rmd b/inst/templates/rnaseq/DE/GSVA.Rmd new file mode 100644 index 0000000..ebf22e5 --- /dev/null +++ b/inst/templates/rnaseq/DE/GSVA.Rmd @@ -0,0 +1,216 @@ +--- +title: "GSVA" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: inline +params: + # set column name and contrasts to be factors of interest + column: "sample_type" + contrasts: !r list(c("sample_type", "tumor", "normal")) + project_file: ../information.R + params_file: params_de-example.R + functions_file: ../libs + # if working on o2, select from gene set repository at /n/app/bcbio/platform/gene_sets/20240904 + geneset_fn: ~/Downloads/h.all.v2024.1.Hs.entrez.gmt +--- +```{r libraries, message = FALSE, warning=FALSE} +# path to libraries if working on O2 +# .libPaths("/n/app/bcbio/R4.3.1_rnaseq/") + +## load libraries +library(GSVA) +library(GSEABase) +library(reshape2) +library(ChIPpeakAnno) +library(org.Hs.eg.db) +# library(org.Mm.eg.db) +library(AnnotationDbi) +library(DESeq2) +library(limma) +library(gridExtra) +library(bcbioR) +library(ggprism) +library(knitr) +library(rstudioapi) +library(tidyverse) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + +```{r} +# This set up the working directory to this file so all files can be found +setwd(fs::path_dir(getSourceEditorContext()$path)) +``` + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +source(params$project_file) +source(params$params_file) +map(list.files(params$functions_file,pattern = "*.R$",full.names = T),source) %>% invisible() +column=params$column +contrasts=params$contrasts +subset_column=params$subset_column +subset_value=params$subset_value + +``` + +```{r sanitize_datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + +```{r load_data} + +coldata <- load_coldata(coldata_fn, column, + subset_column, subset_value) +coldata[[contrasts[[1]][1]]] = relevel(as.factor(coldata[[contrasts[[1]][1]]]), contrasts[[1]][3]) +coldata$sample=row.names(coldata) + +counts <- load_counts(counts_fn) +counts <- counts[,colnames(counts) %in% coldata$sample] + +``` + +# Data + +```{r show_coldata} +coldata %>% sanitize_datatable() +``` + +```{r normalize_data} +dds <- DESeqDataSetFromMatrix(counts, + colData = coldata, + design = ~ 1) + +dds <- DESeq(dds) +norm_counts <- counts(dds, normalized=TRUE) + +``` + + +```{r ensembl_to_entrez} +## convert ensembl to entrez + +entrezIDs_all = convert2EntrezID(IDs=rownames(norm_counts), orgAnn="org.Hs.eg.db", + ID_type="ensembl_gene_id") + +entrezid <- mapIds(org.Hs.eg.db, keys = rownames(norm_counts), keytype="ENSEMBL", column = "ENTREZID") + +counts_entrez <- norm_counts +stopifnot(nrow(counts_entrez) == length(entrezid)) +rownames(counts_entrez) <- entrezid + +``` + + +# Prep and run GSVA + +```{r load_genesets} + +gene_sets = read_table(params$geneset_fn, col_names = F) +names(gene_sets)[1:2] <- c('pathway', 'url') + +gene_sets_long = gene_sets %>% + dplyr::select(-url) %>% + pivot_longer(!pathway, names_to = 'column_num', values_to = 'entrez_id') %>% + filter(!is.na(entrez_id)) + +genes_by_pathway <- split(gene_sets_long$entrez_id, gene_sets_long$pathway) + +``` + + +```{r GSVA, message = F, warning = F} + +gsvaPar <- GSVA::gsvaParam(counts_entrez, genes_by_pathway, kcdf = "Poisson") + +gsva.es <- gsva(gsvaPar, verbose = F) + +``` + +## Test for Significance + +```{r limma} + +mod <- model.matrix(~ factor(coldata[[column]])) +fit <- lmFit(gsva.es, mod) +fit <- eBayes(fit) +res <- topTable(fit, coef=paste0("factor(coldata[[column]])",contrasts[[1]][2]),number=Inf,sort.by="P") + +res %>% sanitize_datatable() +``` + +## Graph top 5 pathways{.tabset} + +```{r graph pathways, results = 'asis'} +scores <- t(gsva.es) + +sig <- subset(res, res$adj.P.Val < 0.1) +pathways <- rownames(sig)[1:5] + +to_graph = data.frame(scores[,pathways]) %>% rownames_to_column('sample') %>% + pivot_longer(!sample, names_to = 'pathway', values_to = 'enrichment_score') +to_graph <- left_join(to_graph, coldata) + +for (single_pathway in pathways) { + + cat('### ', single_pathway, '\n') + + to_graph_single_pathway <- to_graph %>% filter(pathway == single_pathway) + p <- ggplot(to_graph_single_pathway, aes(x = .data[[column]], y = enrichment_score)) + geom_boxplot() + + geom_point(alpha=0.5) + ggtitle(single_pathway) + print(p) + + cat('\n\n') + +} + +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` + From 8f9b2aa2efdbd36a6d3e4b124e99e16801103e4e Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 11 Sep 2024 09:52:45 -0400 Subject: [PATCH 63/93] finish comparision intersection code --- .../rnaseq/DE/Comparison-intersections.Rmd | 71 ++++++++----------- 1 file changed, 30 insertions(+), 41 deletions(-) diff --git a/inst/templates/rnaseq/DE/Comparison-intersections.Rmd b/inst/templates/rnaseq/DE/Comparison-intersections.Rmd index b073469..cd88800 100644 --- a/inst/templates/rnaseq/DE/Comparison-intersections.Rmd +++ b/inst/templates/rnaseq/DE/Comparison-intersections.Rmd @@ -20,29 +20,6 @@ params: project_file: ../information.R --- -```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} -## Adjusted P-value used for significance -padj_co <- 0.05 -## Log2FC used for significance. If no cutoff used put 0 -LFC <- 0.5 -## Normalized counts for ALL samples -# norm <- "/Users/emb016/Documents/comparisons_templates/norm_counts.csv" -# Load the count data, for this example it is the last columns of the DE table -norm_counts <- read.csv("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/norm_counts.csv.gz", - row.names = 1) - -# Load the meta data, here we are making one for the exmaple -metadata <- read_csv("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/meta.csv.gz") %>% as.data.frame() - -## Full results file (all genes) for contrast 1 -files=c("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group1.csv.gz", - "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group2.csv.gz", - "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group3.csv.gz") - -``` - - - ```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} library(rtracklayer) library(tidyverse) @@ -82,6 +59,27 @@ set.seed(1234567890L) ``` +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +## Adjusted P-value used for significance +padj_co <- 0.05 +## Log2FC used for significance. If no cutoff used put 0 +LFC <- 0.5 +## Normalized counts for ALL samples +# norm <- "/Users/emb016/Documents/comparisons_templates/norm_counts.csv" +# Load the count data, for this example it is the last columns of the DE table +norm_counts <- read.csv("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/norm_counts.csv.gz", + row.names = 1) + +# Load the meta data, here we are making one for the exmaple +metadata <- read_csv("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/meta.csv.gz") %>% as.data.frame() + +## Full results file (all genes) for contrast 1 +files=c("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group1.csv.gz", + "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group2.csv.gz", + "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group3.csv.gz") + +``` + # Load Data We load our dataset @@ -111,10 +109,8 @@ sign_genes=lapply(names(files), function(name){ ``` - # Make list of comparisons - ```{r, fig.height=8, fig.width=8, warning=FALSE, error=FALSE, message=FALSE} de=lapply(sign_genes, function(x){ x$gene_id @@ -126,17 +122,14 @@ names(de) <- comp_names Because we have done so many tests venn diagrams no longer work for our data. Instead we will use upset plots. *These plots are relatively intuitive for 2 or 3 categories, but can tend to get more complex for >3 categories. In all cases, you will find the categories being compared and their size listed below the bar plots on the left. As you look to the right (directly below each bar) there are dots with connecting lines that denote which categories the overlap is between, or if there is no overlap (just a dot). The numbers at the top of the bars denote the size of the overlap.* - ```{r, fig.height=8, fig.width=12} upset(fromList(de), order.by = "freq", nsets=N) - ``` ## Pull intersect(s) of interest After identifying intersect(s) of interest we can determine which genes are found in which intersections - ```{r, warning=FALSE, error=FALSE, message=FALSE} ## Grab intersection gene_names <- data.frame(gene=unique(unlist(de))) @@ -158,38 +151,34 @@ df_int <- lapply(gene_names$gene,function(x){ }) %>% bind_rows() ``` - ```{r, eval=F} ## Run this code to find the name of your intersect of interest. You will use this in the next code chunk table(df_int$int) ``` ```{r, warning=FALSE, error=FALSE, message=FALSE} -## subset interaction of interest replace the intersect name with the name of the intersect from above. You can copy and paste the below commands to grab multiple intersects. +## NOTE: subset interaction of interest replace the intersect name with the name of the intersect from above. You can copy and paste the below commands to grab multiple intersects. Intersect1 <- subset(df_int, df_int$int=="DMSO vs Group2|DMSO vs Group3") ``` ## Get annotation data -```{r, warning=FALSE, error=FALSE, message=FALSE} -# edit this to be the correct organism. One set of annotations per intersect. +```{r, warning=FALSE, error=FALSE, message=FALSE} +# NOTE: edit this to be the correct organism. One set of annotations per intersect. # rdata = AnnotationDbi::select(org.Hs.eg.db, Intersect1$gene, 'SYMBOL', 'ENSEMBL') %>% # dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) %>% distinct(gene_id, .keep_all = T) -# FIX: following code is only for test data, use the above with real data +# NOTE: following code is only for test data, use the above with real data rdata=data.frame(gene_id=row.names(norm_counts), gene_name=row.names(norm_counts)) ``` - - ## Heatmap of intersect We generate a heatmap with all samples to see the patterns contained in this intersect. ```{r, fig.height=6, warning=FALSE, error=FALSE, message=FALSE} -## Assign factors of interest. These need to correspond to columns in your metadata. - +## NOTE: Assign factors of interest. These need to correspond to columns in your metadata. factor1 <- "Treatment" factor2 <- "Cell_line" @@ -223,13 +212,12 @@ pheatmap(norm_sig, height = 20) ``` - -## Graph all genes in intersect +## Graph genes in intersect ```{r, warning=FALSE, error=FALSE, message=FALSE} Intersect1_annot <- Intersect1 %>% left_join(rdata, by=c("gene"="gene_id")) # REMOVE to plot all -Intersect1_annot <- Intersect1_annot[1:10] +Intersect1_annot <- Intersect1_annot[1:10,] graphs <- length(Intersect1_annot$gene) to_test <- t(norm_counts) @@ -245,7 +233,8 @@ to_graph$Factor2 <- metadata[,factor2] for (i in seq(1,graphs)) { to_graph$temp=to_graph[[i]] print(ggplot(to_graph,aes(x=Factor1,y=temp,color=Factor2)) + - geom_boxplot() + geom_point(alpha=0.5) + ylab(paste0(names[[i]])) + xlab(factor1) + scale_color_discrete(name = "Covariate")) + geom_boxplot() + geom_point(alpha=0.5, position = position_dodge(width = .75)) + + ylab(paste0(names[[i]])) + xlab(factor1) + scale_color_discrete(name = "Covariate")) } ``` From eec545b1de19f8cc289a52a7fab5fba1d4b54ddf Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Wed, 11 Sep 2024 10:16:28 -0400 Subject: [PATCH 64/93] geneset from github, gmt from clusterprofiler, failsafe for no sig pathways --- inst/templates/rnaseq/DE/GSVA.Rmd | 62 +++++++++++++++++++------------ 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/inst/templates/rnaseq/DE/GSVA.Rmd b/inst/templates/rnaseq/DE/GSVA.Rmd index ebf22e5..bcb0290 100644 --- a/inst/templates/rnaseq/DE/GSVA.Rmd +++ b/inst/templates/rnaseq/DE/GSVA.Rmd @@ -23,8 +23,9 @@ params: project_file: ../information.R params_file: params_de-example.R functions_file: ../libs - # if working on o2, select from gene set repository at /n/app/bcbio/platform/gene_sets/20240904 - geneset_fn: ~/Downloads/h.all.v2024.1.Hs.entrez.gmt + # select from gene set repository at https://github.com/bcbio/resources/tree/main/gene_sets/gene_sets + # choose geneset, click "Raw', and copy url + geneset_fn: https://raw.githubusercontent.com/bcbio/resources/main/gene_sets/gene_sets/20240904/human/h.all.v2024.1.Hs.entrez.gmt --- ```{r libraries, message = FALSE, warning=FALSE} # path to libraries if working on O2 @@ -46,6 +47,7 @@ library(ggprism) library(knitr) library(rstudioapi) library(tidyverse) +library(clusterProfiler) colors=cb_friendly_cols(1:15) ggplot2::theme_set(theme_prism(base_size = 14)) @@ -110,6 +112,12 @@ counts <- counts[,colnames(counts) %in% coldata$sample] ``` +# Method + +Gene Set Variation Analysis (GSVA) is a non-parametric, unsupervised method for estimating variation of gene set enrichment through the samples of a expression data set. GSVA performs a change in coordinate systems, transforming the data from a gene by sample matrix to a gene-set by sample matrix, thereby allowing the evaluation of pathway enrichment for each sample. This new matrix of GSVA enrichment scores facilitates applying standard analytical methods like functional enrichment, survival analysis, clustering, CNV-pathway analysis or cross-tissue pathway analysis, in a pathway-centric manner. More info in the vignette [here](https://bioconductor.org/packages/release/bioc/vignettes/GSVA/inst/doc/GSVA.html). + +Hänzelmann S, Castelo R, Guinney J (2013). “GSVA: gene set variation analysis for microarray and RNA-Seq data.” BMC Bioinformatics, 14, 7. doi:10.1186/1471-2105-14-7, [https://doi.org/10.1186/1471-2105-14-7](https://doi.org/10.1186/1471-2105-14-7) + # Data ```{r show_coldata} @@ -146,15 +154,10 @@ rownames(counts_entrez) <- entrezid ```{r load_genesets} -gene_sets = read_table(params$geneset_fn, col_names = F) -names(gene_sets)[1:2] <- c('pathway', 'url') - -gene_sets_long = gene_sets %>% - dplyr::select(-url) %>% - pivot_longer(!pathway, names_to = 'column_num', values_to = 'entrez_id') %>% - filter(!is.na(entrez_id)) +# gene_sets = read_table(params$geneset_fn, col_names = F) +gene_sets <- read.gmt(params$geneset_fn) -genes_by_pathway <- split(gene_sets_long$entrez_id, gene_sets_long$pathway) +genes_by_pathway <- split(gene_sets$gene, gene_sets$term) ``` @@ -185,23 +188,34 @@ res %>% sanitize_datatable() scores <- t(gsva.es) sig <- subset(res, res$adj.P.Val < 0.1) -pathways <- rownames(sig)[1:5] -to_graph = data.frame(scores[,pathways]) %>% rownames_to_column('sample') %>% - pivot_longer(!sample, names_to = 'pathway', values_to = 'enrichment_score') -to_graph <- left_join(to_graph, coldata) +if(nrow(sig) >= 5){ + pathways <- rownames(sig)[1:5] +} else if(nrow(sig) == 0) { + pathways <- c() +} else { + pathways <- rownames(sig) +} -for (single_pathway in pathways) { - - cat('### ', single_pathway, '\n') - - to_graph_single_pathway <- to_graph %>% filter(pathway == single_pathway) - p <- ggplot(to_graph_single_pathway, aes(x = .data[[column]], y = enrichment_score)) + geom_boxplot() + - geom_point(alpha=0.5) + ggtitle(single_pathway) - print(p) - - cat('\n\n') +if (length(pathways) > 0){ + to_graph = data.frame(scores[,pathways]) %>% rownames_to_column('sample') %>% + pivot_longer(!sample, names_to = 'pathway', values_to = 'enrichment_score') + to_graph <- left_join(to_graph, coldata) + for (single_pathway in pathways) { + + cat('### ', single_pathway, '\n') + + to_graph_single_pathway <- to_graph %>% filter(pathway == single_pathway) + p <- ggplot(to_graph_single_pathway, aes(x = .data[[column]], y = enrichment_score)) + geom_boxplot() + + geom_point(alpha=0.5) + ggtitle(single_pathway) + print(p) + + cat('\n\n') + + } +} else { + cat('No pathways were detected as significantly enriched') } ``` From ee83582357fa8c90e9b39d1a82994b3fe4adf266 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 11 Sep 2024 16:34:26 -0400 Subject: [PATCH 65/93] finish two way comparison --- .../rnaseq/DE/Cross-comparison-analysis.Rmd | 51 +++++++++---------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd b/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd index 4e78ea6..275fbb2 100644 --- a/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd +++ b/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd @@ -20,31 +20,6 @@ params: project_file: ../information.R --- - - -```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} -# 1. Set up input files in this R file (params_pairwisecomp.R) -## Full results file (all genes) for contrastt 1 -comp1_fn <- 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group1.csv.gz' -## Name of contrast 1. This will be displayed on the figures -comp1_name <- "DMSO vs. Group1" -## Full results file (all genes) for contrast 2 -comp2_fn <- 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group2.csv.gz' -## Name of contrast 2. This will be displayed on the figures -comp2_name <- "DMSO vs. Group2" -## Adjusted P-value used for significance -padj_co <- 0.05 -## Log2FC used for significance. If no cutoff used put 0 -LFC <- 0.5 - -comp1 <- read_csv(comp1_fn) %>% - dplyr::filter(padj <= 1) -comp2 <- read_csv(comp2_fn) %>% - dplyr::filter(padj <= 1) -``` - - - ```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} library(rtracklayer) library(tidyverse) @@ -78,8 +53,30 @@ opts_chunk[["set"]]( set.seed(1234567890L) ``` +# Compare two differential expression analysis + +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. Set up input files in this R file (params_pairwisecomp.R) +## Full results file (all genes) for contrastt 1 +comp1_fn <- 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group1.csv.gz' +## Name of contrast 1. This will be displayed on the figures +comp1_name <- "DMSO vs. Group1" +## Full results file (all genes) for contrast 2 +comp2_fn <- 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/cross-comparison/all_results_DMSO_vs_Group2.csv.gz' +## Name of contrast 2. This will be displayed on the figures +comp2_name <- "DMSO vs. Group2" +## Adjusted P-value used for significance +padj_co <- 0.05 +## Log2FC used for significance. If no cutoff used put 0 +LFC <- 0.5 + +comp1 <- read_csv(comp1_fn) %>% + dplyr::filter(padj <= 1) +comp2 <- read_csv(comp2_fn) %>% + dplyr::filter(padj <= 1) +``` -# Load Data +## Load Data We load our dataset @@ -95,7 +92,6 @@ comp2_sig <- comp2 %>% ``` - # Comparisons We start with a venn diagram looking at the overlap between our two contrasts @@ -169,7 +165,6 @@ ggplot(lfc, aes(x=comp1, y=comp2, color=col)) + geom_point() + ``` - ## Compare ajusted P-values We plot adjusted P-values for our contrasts and color points by whether or not they are significant in our contrasts. The black line is 1:1. From 8050b4fd7e1a0af794eec8869d7efb51ce9e19e8 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 11 Sep 2024 16:46:18 -0400 Subject: [PATCH 66/93] adapt GSEA and ORA --- inst/templates/rnaseq/DE/DEG.Rmd | 57 +++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd index 798ecd8..dffa6dd 100644 --- a/inst/templates/rnaseq/DE/DEG.Rmd +++ b/inst/templates/rnaseq/DE/DEG.Rmd @@ -538,11 +538,55 @@ for (contrast in names(de_list)){ From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment. ```{r} +all_in_life=get_databases() +``` + +# Pathway Analysis- GSEA + +Gene Set Enrichment Analysis (GSEA) is a computational method used to determine whether a predefined set of genes shows statistically significant, concordant differences between two biological states (e.g., disease vs. normal) in RNA-seq data or other types of gene expression data. Advantages of GSEA. + +- Biological Insight: Helps in understanding the underlying biological processes and pathways affected, rather than focusing on individual genes. +- Incorporation of Prior Knowledge: Utilizes predefined gene sets, allowing integration of existing biological knowledge. +- Contextual Relevance: Can reveal subtle but coordinated changes in biologically meaningful gene sets that might not be apparent when looking at individual genes. + +```{r, warning=F, message=F} +fa_gsea_list=lapply(de_list,function(contrast){ + + res=contrast[["all"]] + gsea_input = res %>% filter(!is.na(padj)) %>% dplyr::select(gene_id, lfc) + #change to the right species + input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, gsea_input$gene_id, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) + input_entrezid <- inner_join(gsea_input, input_entrezid, by=c("gene_id"="ENSEMBL")) %>% + filter(!is.na(ENTREZID)) %>% + distinct(ENTREZID, .keep_all=TRUE) + + tb = run_fgsea(input_entrezid, all_in_life) + tb %>% filter(padj<0.05) + }) +``` -wiall_in_life=get_databases() +```{r, results='asis'} +# NOTE DT::datatables doesn't work with tabset and for loops +# You can use the following code to print dynamically or call manually sanitize_datatable() +# multiple times +dt_list=list() +for (contrast in names(de_list)){ + res_sig=fa_gsea_list[[contrast]] + dt_list=c(dt_list, + list(h3(contrast)), + list(sanitize_datatable(res_sig))) +} +tagList(dt_list) ``` -# Pathway Analysis + +# Pathway Analysis- Over-representation + +Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially expressed genes (DEGs) from RNA-seq data. Adventages of ORA: + +- Simplicity: Easy to perform and interpret. +- Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied. +- Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets. ```{r, warning=F, message=F} fa_list=lapply(de_list,function(contrast){ @@ -582,7 +626,7 @@ for (contrast in names(de_list)){ res_sig=fa_list[[contrast]][["all"]] dt_list=c(dt_list, list(h3(contrast)), - list(DT::datatable(res_sig))) + list(sanitize_datatable(res_sig))) } tagList(dt_list) ``` @@ -596,7 +640,7 @@ for (contrast in names(de_list)){ res_sig=fa_list[[contrast]][["down"]] dt_list=c(dt_list, list(h3(contrast)), - list(DT::datatable(res_sig))) + list(sanitize_datatable(res_sig))) } tagList(dt_list) ``` @@ -610,7 +654,7 @@ for (contrast in names(de_list)){ res_sig=fa_list[[contrast]][["up"]] dt_list=c(dt_list, list(h3(contrast)), - list(DT::datatable(res_sig))) + list(sanitize_datatable(res_sig))) } tagList(dt_list) ``` @@ -645,8 +689,11 @@ for (contrast in names(contrasts)){ res_for_writing <- de_list[[contrast]][["all"]] %>% mutate(comparison = contrast) + # NOTE choose what pathway to save, all, down or up, or everything, just + #. need to add more lines pathways_for_writing <- fa_list[[contrast]][["all"]] %>% mutate(comparison = contrast) + write_csv(counts_norm, name_expression_fn) write_csv(res_for_writing, name_deg_fn) write_csv(pathways_for_writing, name_pathways_fn) From 1d4406e69192372215414073973473634c15b181 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 11 Sep 2024 16:46:36 -0400 Subject: [PATCH 67/93] start library functions for FA --- inst/templates/rnaseq/libs/FA.R | 35 +++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/inst/templates/rnaseq/libs/FA.R b/inst/templates/rnaseq/libs/FA.R index d1d2a02..8829550 100644 --- a/inst/templates/rnaseq/libs/FA.R +++ b/inst/templates/rnaseq/libs/FA.R @@ -29,7 +29,7 @@ get_databases=function(){ # msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), # msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") ) -all_in_life + all_in_life } run_fora_v2=function(input, uni, all_in_life){ @@ -72,9 +72,9 @@ run_fora=function(input, uni,all_in_life){ universe = unique(uni$ENTREZID), minSize = 15, maxSize = 500) - coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], - pathway, unique(input$ENTREZID), unique(uni$ENTREZID)) - as_tibble(respath[pathway %in% coll_respath$mainPathways]) %>% + # coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], + # pathway, unique(input$ENTREZID), unique(uni$ENTREZID)) + as_tibble(respath) %>% mutate(database=db_name, NES=(overlap/size)/(total_deg)) }) %>% bind_rows() %>% mutate(analysis="ORA") @@ -88,3 +88,30 @@ run_fora=function(input, uni,all_in_life){ ora_tb } + +run_fgsea=function(input, all_in_life){ + # browser() + input_gsea <- input$lfc + names(input_gsea) <- input$ENTREZID + pathways_all = lapply(all_in_life, function(p){ + pathway = split(x = p$entrez_gene, f = p$gs_name) + db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") + respath <- fgsea(pathways = pathway, + stats = input_gsea, + minSize = 15, + maxSize = 500) + + as_tibble(respath) %>% + mutate(database=db_name) + }) %>% bind_rows() %>% + mutate(analysis="GSEA") + tb = pathways_all %>% unnest(leadingEdge) %>% + group_by(pathway) %>% + left_join(input, by =c("leadingEdge"="ENTREZID")) %>% + dplyr::select(pathway, padj, size, NES, SYMBOL, analysis, + database) %>% + group_by(pathway, padj, size, NES, database, analysis) %>% + summarise(genes=paste(SYMBOL,collapse = ",")) + tb + +} From cea26620469b3e4a82ec932d0c7fa6ee50f5699d Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 11 Sep 2024 17:00:32 -0400 Subject: [PATCH 68/93] first attempt to use controlled gmt files --- inst/templates/rnaseq/libs/FA.R | 54 +++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/inst/templates/rnaseq/libs/FA.R b/inst/templates/rnaseq/libs/FA.R index 8829550..5ccaf8c 100644 --- a/inst/templates/rnaseq/libs/FA.R +++ b/inst/templates/rnaseq/libs/FA.R @@ -1,30 +1,38 @@ -# library(msigdb) -# msigdb.hs = getMsigdb(org = 'hs', id = 'SYM', version = '7.5') -# -# get_databases_v2=function(){ -# all_in_life=list( -# GOBP=subsetCollection(msigdb.hs, 'c5', 'GO:BP'), -# GOMF=subsetCollection(msigdb.hs, 'c5', 'GO:MF'), -# HALLMARK=subsetCollection(msigdb.hs, 'h'), -# KEGG=subsetCollection(msigdb.hs, 'c2', 'CP:KEGG') -# ) %>% lapply(., function(geneset){ -# gs=lapply(geneset, function(x){ -# geneIds(x) -# }) -# names(gs)=sapply(geneset, setName) -# gs -# }) -# all_in_life -# } +library(msigdbr) +library(clusterProfiler) -get_databases=function(){ +get_databases_v2=function(sps=human){ + gmt.files=list(human=c("h.all.v2024.1.Hs.entrez.gmt", + "c5.go.v2024.1.Hs.entrez.gmt", + "c5.go.mf.v2024.1.Hs.entrez.gmt", + "c5.go.cc.v2024.1.Hs.entrez.gmt", + "c5.go.bp.v2024.1.Hs.entrez.gmt", + "c2.cp.reactome.v2024.1.Hs.entrez.gmt", + "c2.cp.kegg_legacy.v2024.1.Hs.entrez.gmt"), + mouse=c("mh.all.v2024.1.Mm.entrez.gmt", + "m5.go.v2024.1.Mm.entrez.gmt", + "m5.go.mf.v2024.1.Mm.entrez.gmt", + "m5.go.cc.v2024.1.Mm.entrez.gmt", + "m5.go.bp.v2024.1.Mm.entrez.gmt", + "m2.cp.reactome.v2024.1.Mm.entrez.gmt", + "m2.cp.kegg_legacy.v2024.1.Mm.entrez.gmt")) + all_in_life=lapply(gmt.files[[sps]], function(gmt){ + df=read.gmt(file.path(source,gmt)) + names(df)=c("gs_name", "entrez_gene") + df + }) + names(all_in_life) = str_remove(gmt.files[[sps]], ".v2024.*$") + all_in_life +} + +get_databases=function(sps="human"){ all_in_life=list( - msigdbr(species = "human", category = "H") %>% mutate(gs_subcat="Hallmark"), + msigdbr(species = sps, category = "H") %>% mutate(gs_subcat="Hallmark"), # msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), - msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG"), + msigdbr(species = sps, category = "C2", subcategory = "CP:KEGG"), # msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), - msigdbr(species = "human", category = "C5", subcategory = "GO:BP"), - msigdbr(species = "human", category = "C5", subcategory = "GO:MF") + msigdbr(species = sps, category = "C5", subcategory = "GO:BP"), + msigdbr(species = sps, category = "C5", subcategory = "GO:MF") # msigdbr(species = "human", category = "C5", subcategory = "HPO"), # msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), # msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") From e15f25386c7d9de2fcd0dd1e125be3c09cc4451b Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 12 Sep 2024 16:20:24 -0400 Subject: [PATCH 69/93] get gitignores --- R/helpers.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/R/helpers.R b/R/helpers.R index a78ecb6..8cf029a 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -126,6 +126,14 @@ bcbio_params <-function(nfcore_path, pipeline, metadata, copy){ } +detect_gitignores <- function(path){ + gits <- fs::dir_ls(path, recurse = TRUE, regexp = 'gitignore') + sapply(gits, function(fn){ + hidden <- file.path(dirname(fn), paste0(".", basename(fn))) + fs::file_move(fn, hidden) + }) +} + copy_files_in_folder<- function(origin, remote){ to_copy <- fs::dir_ls(origin,all = TRUE) to_copy <- grep("org", to_copy, @@ -142,6 +150,7 @@ copy_files_in_folder<- function(origin, remote){ fs::file_copy(element, full_new_path) } } + detect_gitignores(remote) } deploy_apps <- function(apps, path){ From c4d5c322fa7b5f0fbb42f63769b6db4799a301f4 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 12 Sep 2024 16:26:11 -0400 Subject: [PATCH 70/93] update readme with right commands --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2ba7800..4957bf7 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,10 @@ setwd("/path/to/analysis/folder") The following code will pop up a Rmd template will populate that folder with HCBC data structure guidelines ``` -bcbio_templates(path,pipeline="nf-core/rnaseq") -bcbio_templates(path,pipeline="singlecell") +path="/path/to/analysis/folder" +bcbio_templates(type="base", outpath=path) +bcbio_templates(type="rnaseq", outpath=path) +bcbio_templates(type="singlecell", outpath=path) ``` ### Set RNAseq report folder @@ -79,3 +81,8 @@ Some best practices when developing: - James Billingsley - Zhu Zhuo - Elizabeth Partan +- Noor Sohail +- Meeta Mistry +- Will Gammerdinger +- Upen Bhattarai +- Shannan Ho Sui From 00dcfd75e9cd67e1e8e0adc73411c1a251ef218d Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 12 Sep 2024 16:35:05 -0400 Subject: [PATCH 71/93] fix example of command line --- R/helpers.R | 3 +++ inst/templates/base/README.md | 4 ++-- inst/templates/base/reports/placeholder | 0 3 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 inst/templates/base/reports/placeholder diff --git a/R/helpers.R b/R/helpers.R index 8cf029a..c2d7509 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -187,6 +187,9 @@ copy_templates <- function(path, pipeline){ # value = TRUE, invert = TRUE) # ui_info("{ui_value(length(ls_files))} amount of files to copy") copy_files_in_folder(analysis_template, path) + # check org folder is in there + # search for param + _README.md + # concat file to README.md deploy_apps(apps, path) } diff --git a/inst/templates/base/README.md b/inst/templates/base/README.md index a7c4fa3..15dee0e 100644 --- a/inst/templates/base/README.md +++ b/inst/templates/base/README.md @@ -17,8 +17,8 @@ - [ ] Use the same project name to create a folder in *Dropbox* and a repo in *GitHub* - [ ] If you didn't provide the pipeline when creating this project: Use the function `bcbio_templates` to create templates inside `reports` for each type of analysis. For instance, for *RNAseq*: - - `use_bcbio_analysis(".", 'nf-core/rnaseq', copy = TRUE)` or - - `use_bcbio_analysis(".", 'singlecell', copy = TRUE)` + - `bcbio_templates(type="rnaseq", outpath=path)` or + - `bcbio_templates(type="singlecell", outpath=path)` - Then go to that folder and read the `README.md` ## Folders diff --git a/inst/templates/base/reports/placeholder b/inst/templates/base/reports/placeholder new file mode 100644 index 0000000..e69de29 From fdfdfca4bee33c6c561e8b0d7a55a0bd5fdae43f Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Thu, 12 Sep 2024 16:38:49 -0400 Subject: [PATCH 72/93] update tests --- .gitignore | 3 +- tests/testthat/test-deploy.R | 75 ++++++++++++++++++++++++++ tests/testthat/{misc.R => test-misc.R} | 0 3 files changed, 76 insertions(+), 2 deletions(-) create mode 100644 tests/testthat/test-deploy.R rename tests/testthat/{misc.R => test-misc.R} (100%) diff --git a/.gitignore b/.gitignore index 184104b..6af5e77 100644 --- a/.gitignore +++ b/.gitignore @@ -14,10 +14,9 @@ docs /Meta/ .DS* inst/rmarkdown/templates/rnaseq/skeleton/DE/Multiplicative_DGE_Analysis.Rmd -tests/* .Rdata .httr-oauth .DS_Store .quarto inst/templates/chipseq/QC/QC.html -*.html \ No newline at end of file +*.html diff --git a/tests/testthat/test-deploy.R b/tests/testthat/test-deploy.R new file mode 100644 index 0000000..049480a --- /dev/null +++ b/tests/testthat/test-deploy.R @@ -0,0 +1,75 @@ +library(bcbioR) + + +test_that("scrnaseq",{ + path <- withr::local_tempdir() + print(path) + copy_templates(path, "singlecell") + expect_length(fs::dir_ls(path,all=T),8) + expect_true(grepl("scRNAseq_qc_app", + fs::dir_ls(file.path(path, "apps"), recurse=T, all=T)[2])) +}) + +test_that("base copy",{ + path <- withr::local_tempdir() + print(path) + bcbio_templates(type="base", outpath=path) + expect_length(fs::dir_ls(path,all=T),10) + expect_true(file.exists(file.path(path,".gitignore"))) +}) + +test_that("rnaseq copy",{ + path <- withr::local_tempdir() + print(path) + bcbio_templates(type="rnaseq", outpath=path) + expect_length(fs::dir_ls(path,all=T),6) + # numerator="tumor" + # denominator="normal" + # subset_value=NA + # rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), + # output_dir = file.path(path,"DE"), + # output_format = "html_document", + # output_file = ifelse(!is.na(subset_value), + # paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), + # paste0('DE_', numerator, '_vs_', denominator, '.html') + # ), + # clean = TRUE, + # envir = new.env(), + # params = list( + # subset_value = subset_value, + # numerator = numerator, + # denominator = denominator, + # params_file = file.path(path,'DE/params_de-example.R'), + # project_file = file.path(path,'information.R'), + # functions_file = file.path(path,'DE/load_data.R') + # ) + # ) + # use_bcbio_projects(path, nfcore="nf-core/rnaseq", copy=TRUE, git=FALSE) +}) + +# test_that("rnaseq testing", { +# path <- withr::local_tempdir() +# print(path) +# bcbio_templates(type="rnaseq", outpath=path) +# numerator="tumor" +# denominator="normal" +# subset_value=NA +# rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), +# output_dir = file.path(path,"DE"), +# output_format = "html_document", +# output_file = ifelse(!is.na(subset_value), +# paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), +# paste0('DE_', numerator, '_vs_', denominator, '.html') +# ), +# clean = TRUE, +# envir = new.env(), +# params = list( +# subset_value = subset_value, +# numerator = numerator, +# denominator = denominator, +# params_file = file.path(path,'DE/params_de.R'), +# project_file = file.path(path,'information.R'), +# functions_file = file.path(path,'DE/load_data.R') +# ) +# ) +# }) diff --git a/tests/testthat/misc.R b/tests/testthat/test-misc.R similarity index 100% rename from tests/testthat/misc.R rename to tests/testthat/test-misc.R From 04f41f4f6f6d27cce7b51013567422062367cb85 Mon Sep 17 00:00:00 2001 From: Alex Bartlett <74612800+abartlett004@users.noreply.github.com> Date: Fri, 13 Sep 2024 12:52:08 -0400 Subject: [PATCH 73/93] Update QC.Rmd --- inst/templates/chipseq/QC/QC.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd index 0ddf5eb..50ab08e 100644 --- a/inst/templates/chipseq/QC/QC.Rmd +++ b/inst/templates/chipseq/QC/QC.Rmd @@ -203,7 +203,7 @@ metrics %>% fill = antibody)) + geom_bar(stat = "identity") + coord_flip() + - scale_y_continuous(name = "NSC coefficient") + + scale_y_continuous(name = "RSC coefficient") + scale_x_discrete(limits = rev) + scale_fill_cb_friendly() + xlab("") + ggtitle("Relative Strand Cross-Correlation") From f517d42d7c4fc4080f7a0fb45bc131fdb659f100 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Thu, 19 Sep 2024 13:56:33 -0400 Subject: [PATCH 74/93] begin diffbind templates --- DESCRIPTION | 0 LICENSE | 0 LICENSE.md | 0 NAMESPACE | 0 NEWS.md | 0 README.md | 0 _pkgdown.yml | 0 .../diffbind/chipseq_peakanalysis_H3K27Ac.csv | 25 ++++++++++++++++++ .../diffbind/params_diffbind-example.R | 8 ++++++ inst/templates/chipseq/information.R | 0 inst/templates/chipseq/libs/load_data.R | 0 .../chipseq/libs/make_diffbind_samplesheet.R | 26 +++++++++++++++++++ inst/templates/chipseq/readme.md | 0 13 files changed, 59 insertions(+) mode change 100644 => 100755 DESCRIPTION mode change 100644 => 100755 LICENSE mode change 100644 => 100755 LICENSE.md mode change 100644 => 100755 NAMESPACE mode change 100644 => 100755 NEWS.md mode change 100644 => 100755 README.md mode change 100644 => 100755 _pkgdown.yml create mode 100644 inst/templates/chipseq/diffbind/chipseq_peakanalysis_H3K27Ac.csv create mode 100644 inst/templates/chipseq/diffbind/params_diffbind-example.R mode change 100644 => 100755 inst/templates/chipseq/information.R mode change 100644 => 100755 inst/templates/chipseq/libs/load_data.R create mode 100755 inst/templates/chipseq/libs/make_diffbind_samplesheet.R mode change 100644 => 100755 inst/templates/chipseq/readme.md diff --git a/DESCRIPTION b/DESCRIPTION old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/LICENSE.md b/LICENSE.md old mode 100644 new mode 100755 diff --git a/NAMESPACE b/NAMESPACE old mode 100644 new mode 100755 diff --git a/NEWS.md b/NEWS.md old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/_pkgdown.yml b/_pkgdown.yml old mode 100644 new mode 100755 diff --git a/inst/templates/chipseq/diffbind/chipseq_peakanalysis_H3K27Ac.csv b/inst/templates/chipseq/diffbind/chipseq_peakanalysis_H3K27Ac.csv new file mode 100644 index 0000000..9686e20 --- /dev/null +++ b/inst/templates/chipseq/diffbind/chipseq_peakanalysis_H3K27Ac.csv @@ -0,0 +1,25 @@ +sample,fastq_1,fastq_2,antibody,control,genotype +WT_H3K27ac_ChIPseq_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780292_SRR6823782.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP1,WT +WT_H3K27ac_ChIPseq_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780292_SRR6823783.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP1,WT +WT_H3K27ac_ChIPseq_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780292_SRR6823784.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP1,WT +WT_H3K27ac_input_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780293_SRR6823785.fastq.gz,,,,WT +WT_H3K27ac_input_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780293_SRR6823786.fastq.gz,,,,WT +WT_H3K27ac_ChIPseq_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780294_SRR6823787.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP2,WT +WT_H3K27ac_ChIPseq_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780294_SRR6823788.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP2,WT +WT_H3K27ac_ChIPseq_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780294_SRR6823789.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP2,WT +WT_H3K27ac_input_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780295_SRR6823790.fastq.gz,,,,WT +WT_H3K27ac_input_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780295_SRR6823791.fastq.gz,,,,WT +WT_H3K27ac_ChIPseq_REP3,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780296_SRR6823792.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP3,WT +WT_H3K27ac_input_REP3,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780297_SRR6823793.fastq.gz,,,,WT +cKO_H3K27ac_ChIPseq_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780298_SRR6823794.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP1,cKO +cKO_H3K27ac_ChIPseq_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780298_SRR6823795.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP1,cKO +cKO_H3K27ac_ChIPseq_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780298_SRR6823796.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP1,cKO +cKO_H3K27ac_input_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780299_SRR6823797.fastq.gz,,,,cKO +cKO_H3K27ac_input_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780299_SRR6823798.fastq.gz,,,,cKO +cKO_H3K27ac_ChIPseq_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780300_SRR6823799.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP2,cKO +cKO_H3K27ac_ChIPseq_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780300_SRR6823800.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP2,cKO +cKO_H3K27ac_ChIPseq_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780300_SRR6823801.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP2,cKO +cKO_H3K27ac_input_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780301_SRR6823802.fastq.gz,,,,cKO +cKO_H3K27ac_input_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780301_SRR6823803.fastq.gz,,,,cKO +cKO_H3K27ac_ChIPseq_REP3,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780302_SRR6823804.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP3,cKO +cKO_H3K27ac_input_REP3,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780303_SRR6823805.fastq.gz,,,,cKO \ No newline at end of file diff --git a/inst/templates/chipseq/diffbind/params_diffbind-example.R b/inst/templates/chipseq/diffbind/params_diffbind-example.R new file mode 100644 index 0000000..3ddfd56 --- /dev/null +++ b/inst/templates/chipseq/diffbind/params_diffbind-example.R @@ -0,0 +1,8 @@ +# info params + + +# Example data +coldata_fn='chipseq_peakanalysis_H3K27Ac.csv' +peaks_dir = '~/s3_results/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/' +bam_dir = '~/s3_results/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/' +peak_caller = 'macs' \ No newline at end of file diff --git a/inst/templates/chipseq/information.R b/inst/templates/chipseq/information.R old mode 100644 new mode 100755 diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R old mode 100644 new mode 100755 diff --git a/inst/templates/chipseq/libs/make_diffbind_samplesheet.R b/inst/templates/chipseq/libs/make_diffbind_samplesheet.R new file mode 100755 index 0000000..7a68bd3 --- /dev/null +++ b/inst/templates/chipseq/libs/make_diffbind_samplesheet.R @@ -0,0 +1,26 @@ +library(tidyverse) +library(tools) + +source('../libs/load_data.R') +source('params_diffbind-example.R') + +coldata <- load_coldata(coldata_fn) + +bam_files <- data.frame(bam = list.files(bam_dir, pattern = '.bam$', full.names = T)) %>% + mutate(sample = sub("\\..*", "",basename(bam))) + +peak_files <- data.frame(Peaks = list.files(peaks_dir, pattern = 'Peak$', full.names = T)) %>% + mutate(SampleID = sub("\\..*", "",basename(Peaks))) %>% + mutate(SampleID = gsub('_peaks', '', SampleID)) + +coldata_for_diffbind <- coldata %>% + filter(!is.na(control) & control != '') %>% + # select(-description) %>% + dplyr::rename(ControlID = control, SampleID = sample) %>% + separate(SampleID, into = c('sample', 'Replicate'), remove = F, sep = '_REP') %>% + mutate(peakCaller = peak_caller) + +samplesheet <- coldata_for_diffbind %>% + left_join(bam_files %>% select(SampleID = sample, bamReads = bam), by = 'SampleID') %>% + left_join(bam_files %>% select(ControlID = sample, bamControl = bam), by = 'ControlID') %>% + left_join(peak_files, by = 'SampleID') diff --git a/inst/templates/chipseq/readme.md b/inst/templates/chipseq/readme.md old mode 100644 new mode 100755 From 8937f2245e293b41289fb4488c6cb47335fc30d5 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Thu, 19 Sep 2024 19:55:51 +0000 Subject: [PATCH 75/93] working samplesheet, produced count matrix --- inst/templates/chipseq/diffbind/params_diffbind-example.R | 7 +++---- inst/templates/chipseq/libs/make_diffbind_samplesheet.R | 8 +++++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/inst/templates/chipseq/diffbind/params_diffbind-example.R b/inst/templates/chipseq/diffbind/params_diffbind-example.R index 3ddfd56..655d32c 100644 --- a/inst/templates/chipseq/diffbind/params_diffbind-example.R +++ b/inst/templates/chipseq/diffbind/params_diffbind-example.R @@ -2,7 +2,6 @@ # Example data -coldata_fn='chipseq_peakanalysis_H3K27Ac.csv' -peaks_dir = '~/s3_results/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/' -bam_dir = '~/s3_results/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/' -peak_caller = 'macs' \ No newline at end of file +coldata_fn='/workspace/s3_hcbc_seqera/test-data/chipseq_peakanalysis_h3k27ac_narrow/chipseq_peakanalysis_H3K27Ac.csv' +peaks_dir = '/workspace/s3_hcbc_seqera/test-data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/' +bam_dir = '/workspace/s3_hcbc_seqera/test-data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/' diff --git a/inst/templates/chipseq/libs/make_diffbind_samplesheet.R b/inst/templates/chipseq/libs/make_diffbind_samplesheet.R index 7a68bd3..b69e7ce 100755 --- a/inst/templates/chipseq/libs/make_diffbind_samplesheet.R +++ b/inst/templates/chipseq/libs/make_diffbind_samplesheet.R @@ -1,6 +1,7 @@ library(tidyverse) library(tools) - +library(DiffBind) +library(qs) source('../libs/load_data.R') source('params_diffbind-example.R') @@ -24,3 +25,8 @@ samplesheet <- coldata_for_diffbind %>% left_join(bam_files %>% select(SampleID = sample, bamReads = bam), by = 'SampleID') %>% left_join(bam_files %>% select(ControlID = sample, bamControl = bam), by = 'ControlID') %>% left_join(peak_files, by = 'SampleID') + +diffbind_obj <- dba(sampleSheet = samplesheet, scoreCol = 5) +diffbind_count <- dba.count(diffbind_obj, bUseSummarizeOverlaps = TRUE) +qsave(diffbind_count, 'diffbind_count.qs') + From d6402e5060789085682acbc6b47dc0cee04960ac Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Fri, 20 Sep 2024 16:13:22 +0000 Subject: [PATCH 76/93] begin diffbind rmd template --- inst/templates/chipseq/QC/QC.Rmd | 2 - .../diffbind/chipseq_peakanalysis_H3K27Ac.csv | 25 ---- inst/templates/chipseq/diffbind/diffbind.Rmd | 127 ++++++++++++++++++ inst/templates/chipseq/libs/load_data.R | 24 +++- .../chipseq/libs/make_diffbind_samplesheet.R | 32 ----- 5 files changed, 150 insertions(+), 60 deletions(-) delete mode 100644 inst/templates/chipseq/diffbind/chipseq_peakanalysis_H3K27Ac.csv create mode 100644 inst/templates/chipseq/diffbind/diffbind.Rmd delete mode 100755 inst/templates/chipseq/libs/make_diffbind_samplesheet.R diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd index 50ab08e..4ba539c 100644 --- a/inst/templates/chipseq/QC/QC.Rmd +++ b/inst/templates/chipseq/QC/QC.Rmd @@ -23,7 +23,6 @@ params: project_file: ../information.R functions_file: ../libs/load_data.R factor_of_interest: genotype - --- ```{r, cache = FALSE, message = FALSE, warning=FALSE} @@ -94,7 +93,6 @@ sanitize_datatable = function(df, ...) { ```{r load_data, message=F, warning=F} # This code will load from bcbio or nf-core folder -# NOTE make sure to set numerator and denominator coldata <- load_coldata(coldata_fn) coldata$sample=row.names(coldata) diff --git a/inst/templates/chipseq/diffbind/chipseq_peakanalysis_H3K27Ac.csv b/inst/templates/chipseq/diffbind/chipseq_peakanalysis_H3K27Ac.csv deleted file mode 100644 index 9686e20..0000000 --- a/inst/templates/chipseq/diffbind/chipseq_peakanalysis_H3K27Ac.csv +++ /dev/null @@ -1,25 +0,0 @@ -sample,fastq_1,fastq_2,antibody,control,genotype -WT_H3K27ac_ChIPseq_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780292_SRR6823782.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP1,WT -WT_H3K27ac_ChIPseq_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780292_SRR6823783.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP1,WT -WT_H3K27ac_ChIPseq_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780292_SRR6823784.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP1,WT -WT_H3K27ac_input_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780293_SRR6823785.fastq.gz,,,,WT -WT_H3K27ac_input_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780293_SRR6823786.fastq.gz,,,,WT -WT_H3K27ac_ChIPseq_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780294_SRR6823787.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP2,WT -WT_H3K27ac_ChIPseq_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780294_SRR6823788.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP2,WT -WT_H3K27ac_ChIPseq_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780294_SRR6823789.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP2,WT -WT_H3K27ac_input_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780295_SRR6823790.fastq.gz,,,,WT -WT_H3K27ac_input_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780295_SRR6823791.fastq.gz,,,,WT -WT_H3K27ac_ChIPseq_REP3,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780296_SRR6823792.fastq.gz,,H3K27ac,WT_H3K27ac_input_REP3,WT -WT_H3K27ac_input_REP3,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780297_SRR6823793.fastq.gz,,,,WT -cKO_H3K27ac_ChIPseq_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780298_SRR6823794.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP1,cKO -cKO_H3K27ac_ChIPseq_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780298_SRR6823795.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP1,cKO -cKO_H3K27ac_ChIPseq_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780298_SRR6823796.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP1,cKO -cKO_H3K27ac_input_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780299_SRR6823797.fastq.gz,,,,cKO -cKO_H3K27ac_input_REP1,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780299_SRR6823798.fastq.gz,,,,cKO -cKO_H3K27ac_ChIPseq_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780300_SRR6823799.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP2,cKO -cKO_H3K27ac_ChIPseq_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780300_SRR6823800.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP2,cKO -cKO_H3K27ac_ChIPseq_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780300_SRR6823801.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP2,cKO -cKO_H3K27ac_input_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780301_SRR6823802.fastq.gz,,,,cKO -cKO_H3K27ac_input_REP2,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780301_SRR6823803.fastq.gz,,,,cKO -cKO_H3K27ac_ChIPseq_REP3,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780302_SRR6823804.fastq.gz,,H3K27ac,cKO_H3K27ac_input_REP3,cKO -cKO_H3K27ac_input_REP3,s3://hcbc-seqera/results/sra_peakanalysis_GSE111661/fastq/SRX3780303_SRR6823805.fastq.gz,,,,cKO \ No newline at end of file diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd new file mode 100644 index 0000000..3479f17 --- /dev/null +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -0,0 +1,127 @@ +--- +title: "ChIPSeq DiffBind" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + # Fill this file with the right paths to nfcore output + params_file: params_diffbind-example.R + project_file: ../information.R + functions_file: ../libs/load_data.R + # .qs file name for saving DiffBind Counts object + diffbind_counts_file: diffbind_counts.qs + factor_of_interest: genotype +--- + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +# library(rstudioapi) +# setwd(fs::path_dir(getSourceEditorContext()$path)) +``` + + +```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. set up factor_of_interest parameter from parameter above or manually +# this is used to color plots, it needs to be part of the metadata +# 2. Set input files in this file +source(params$params_file) +# 3. If you set up this file, project information will be printed below and +#. it can be reused for other Rmd files. +source(params$project_file) +# 4. Load custom functions to load data from coldata/metrics/counts +source(params$functions_file) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) +library(knitr) +# library(rtracklayer) +library(DESeq2) +library(DEGreport) +library(ggrepel) +# library(RColorBrewer) +library(DT) +library(pheatmap) +library(bcbioR) +library(janitor) +# library(ChIPpeakAnno) +library(UpSetR) +library(DiffBind) +ggplot2::theme_set(theme_light(base_size = 14)) +opts_chunk[["set"]]( + cache = FALSE, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4) +``` + + +```{r sanitize-datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + +# Samples and metadata + +```{r load_data, message=F, warning=F} +coldata <- load_coldata(coldata_fn) + +# DiffBind requires a very specific samplesheet in order to create the peak counts object, see https://www.rdocumentation.org/packages/DiffBind/versions/2.0.2/topics/dba for further details + +# make_diffbind_samplesheet is a function provided by bcbioR to help assemble DiffBind's samplesheet +# using the nf-core samplesheet and output. In the resulting DiffBind counts object, it +# encodes your factor of interest as "Factor" and the antibody as "Condition" + +samplesheet <- make_diffbind_samplesheet(coldata, bam_dir, peaks_dir, params$factor_of_interest) + +# if necessary, one additional covariate of interest can be encoded as "Tissue" + +``` + +```{r show_metadata} +samplesheet %>% select(SampleID, Replicate, Condition, Factor, ControlID) %>% sanitize_datatable() +``` + +# DiffBind + +```{r create diffbind counts object, eval = !file.exists()} + +diffbind_obj <- dba(sampleSheet = samplesheet, scoreCol = 5) + +# This command may take several minutes. Recommend using multiple cores and lots of memory +diffbind_counts <- dba.count(diffbind_obj, bUseSummarizeOverlaps = TRUE, bParallel = T) + +# save object when time-intensive command is finished, so that this cell only need run once +qsave(diffbind_counts, params$diffbind_counts_file) + +``` diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R index bc3b744..b54b6a7 100755 --- a/inst/templates/chipseq/libs/load_data.R +++ b/inst/templates/chipseq/libs/load_data.R @@ -1,9 +1,9 @@ library(tidyverse) library(SummarizedExperiment) library(janitor) + load_metrics <- function(multiqc_data_dir){ - fastqc <- read_tsv(file.path(multiqc_data_dir, 'multiqc_fastqc.txt')) %>% clean_names() %>% dplyr::select(sample, total_reads = total_sequences) %>% mutate(new_sample = gsub('_T[0-9]+', '', sample)) %>% @@ -92,3 +92,25 @@ load_peaks <- function(peaks_dir){ return(peaks_all) } + +make_diffbind_samplesheet <- function(coldata, bam_dir, peaks_dir, column = NULL){ + bam_files <- data.frame(bam = list.files(bam_dir, pattern = '.bam$', full.names = T)) %>% + mutate(sample = sub("\\..*", "",basename(bam))) + + peak_files <- data.frame(Peaks = list.files(peaks_dir, pattern = 'Peak$', full.names = T)) %>% + mutate(SampleID = sub("\\..*", "",basename(Peaks))) %>% + mutate(SampleID = gsub('_peaks', '', SampleID)) + + coldata_for_diffbind <- coldata %>% + filter(!is.na(control) & control != '') %>% + dplyr::rename(ControlID = control, SampleID = sample, Condition = antibody) %>% + separate(SampleID, into = c('sample', 'Replicate'), remove = F, sep = '_REP') + coldata_for_diffbind$Factor <- coldata_for_diffbind[[column]] + + samplesheet <- coldata_for_diffbind %>% + left_join(bam_files %>% select(SampleID = sample, bamReads = bam), by = 'SampleID') %>% + left_join(bam_files %>% select(ControlID = sample, bamControl = bam), by = 'ControlID') %>% + left_join(peak_files, by = 'SampleID') + + return(samplesheet) +} diff --git a/inst/templates/chipseq/libs/make_diffbind_samplesheet.R b/inst/templates/chipseq/libs/make_diffbind_samplesheet.R deleted file mode 100755 index b69e7ce..0000000 --- a/inst/templates/chipseq/libs/make_diffbind_samplesheet.R +++ /dev/null @@ -1,32 +0,0 @@ -library(tidyverse) -library(tools) -library(DiffBind) -library(qs) -source('../libs/load_data.R') -source('params_diffbind-example.R') - -coldata <- load_coldata(coldata_fn) - -bam_files <- data.frame(bam = list.files(bam_dir, pattern = '.bam$', full.names = T)) %>% - mutate(sample = sub("\\..*", "",basename(bam))) - -peak_files <- data.frame(Peaks = list.files(peaks_dir, pattern = 'Peak$', full.names = T)) %>% - mutate(SampleID = sub("\\..*", "",basename(Peaks))) %>% - mutate(SampleID = gsub('_peaks', '', SampleID)) - -coldata_for_diffbind <- coldata %>% - filter(!is.na(control) & control != '') %>% - # select(-description) %>% - dplyr::rename(ControlID = control, SampleID = sample) %>% - separate(SampleID, into = c('sample', 'Replicate'), remove = F, sep = '_REP') %>% - mutate(peakCaller = peak_caller) - -samplesheet <- coldata_for_diffbind %>% - left_join(bam_files %>% select(SampleID = sample, bamReads = bam), by = 'SampleID') %>% - left_join(bam_files %>% select(ControlID = sample, bamControl = bam), by = 'ControlID') %>% - left_join(peak_files, by = 'SampleID') - -diffbind_obj <- dba(sampleSheet = samplesheet, scoreCol = 5) -diffbind_count <- dba.count(diffbind_obj, bUseSummarizeOverlaps = TRUE) -qsave(diffbind_count, 'diffbind_count.qs') - From b7201baff5dfb965dfc8a221ec9c8303b8dbc148 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Thu, 26 Sep 2024 17:36:14 +0000 Subject: [PATCH 77/93] full report --- inst/templates/chipseq/diffbind/diffbind.Rmd | 92 +++++++++++++++++-- .../diffbind/params_diffbind-example.R | 6 +- 2 files changed, 88 insertions(+), 10 deletions(-) diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd index 3479f17..f0ef924 100644 --- a/inst/templates/chipseq/diffbind/diffbind.Rmd +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -18,18 +18,20 @@ editor_options: chunk_output_type: console params: # Fill this file with the right paths to nfcore output + # .qs file name for saving DiffBind Counts object params_file: params_diffbind-example.R project_file: ../information.R functions_file: ../libs/load_data.R - # .qs file name for saving DiffBind Counts object diffbind_counts_file: diffbind_counts.qs factor_of_interest: genotype + numerator: cKO + denominator: WT --- ```{r, cache = FALSE, message = FALSE, warning=FALSE} # This set up the working directory to this file so all files can be found -# library(rstudioapi) -# setwd(fs::path_dir(getSourceEditorContext()$path)) +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) ``` @@ -68,7 +70,11 @@ library(janitor) # library(ChIPpeakAnno) library(UpSetR) library(DiffBind) -ggplot2::theme_set(theme_light(base_size = 14)) +library(qs) +library(EnhancedVolcano) +library(ggprism) +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) opts_chunk[["set"]]( cache = FALSE, cache.lazy = FALSE, @@ -80,6 +86,9 @@ opts_chunk[["set"]]( tidy = FALSE, warning = FALSE, fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) ``` @@ -112,9 +121,7 @@ samplesheet <- make_diffbind_samplesheet(coldata, bam_dir, peaks_dir, params$fac samplesheet %>% select(SampleID, Replicate, Condition, Factor, ControlID) %>% sanitize_datatable() ``` -# DiffBind - -```{r create diffbind counts object, eval = !file.exists()} +```{r create diffbind counts object, eval = !file.exists(params$diffbind_counts_file)} diffbind_obj <- dba(sampleSheet = samplesheet, scoreCol = 5) @@ -125,3 +132,74 @@ diffbind_counts <- dba.count(diffbind_obj, bUseSummarizeOverlaps = TRUE, bParall qsave(diffbind_counts, params$diffbind_counts_file) ``` + +# PCA +```{r PCA} + +diffbind_counts <- qread(params$diffbind_counts_file) + +diffbind_norm <- dba.normalize(diffbind_counts) + +norm_counts <- dba.peakset(diffbind_norm, bRetrieve=TRUE, DataType=DBA_DATA_FRAME) %>% + mutate(peak = paste(CHR, START, END, sep = '_')) %>% + select(-CHR, -START, -END) +rownames(norm_counts) <- norm_counts$peak +norm_counts <- norm_counts %>% select(-peak) %>% as.matrix() +norm_counts_log <- log2(norm_counts + 1) + +coldata_for_pca <- coldata[colnames(norm_counts), ] + +stopifnot(all(colnames(norm_counts) == rownames(coldata_for_pca))) + +degPCA(norm_counts_log, coldata_for_pca, condition = params$factor_of_interest) + + scale_color_cb_friendly() + +``` + +# Differentially Bound Peaks + +## Table +```{r DE analysis} +diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Factor', params$numerator, params$denominator)) +results <- dba.analyze(diffbind_norm, bGreylist = F) + +results_report <- dba.report(results, th = 1) %>% as.data.frame() +results_report_sig <- results_report %>% filter(FDR < 0.05) + +results_report_sig %>% sanitize_datatable() + +``` + +## Volcano plot +```{r volcano, fig.height = 8} +results_report_mod <- results_report %>% + mutate(Fold = replace(Fold, Fold < -5, -5)) %>% + mutate(Fold = replace(Fold, Fold > 5, 5)) %>% + mutate(peak = paste(seqnames, start, end, sep = '_')) +show <- as.data.frame(results_report_mod[1:6, c("Fold", "FDR", "peak")]) +EnhancedVolcano(results_report_mod, + lab= results_report_mod$peak, + pCutoff = 0.05, + selectLab = c(show$peak), + FCcutoff = 0.5, + x = 'Fold', + y = 'FDR', + title = paste(params$factor_of_interest, ':', params$numerator, 'vs', params$denominator), + col=as.vector(colors[c("dark_grey", "light_blue", + "purple", "purple")]), + subtitle = "", drawConnectors = T, max.overlaps = Inf) + +``` + +## Plot top peaks +```{r plot top peaks, fig.width = 8, fig.height = 6} +norm_counts_log_long <- norm_counts_log %>% as.data.frame() %>% + rownames_to_column('peak') %>% + pivot_longer(!peak, names_to = 'sample', values_to = 'norm_counts_log2') %>% + left_join(coldata) + +norm_counts_log_long_top <- norm_counts_log_long %>% filter(peak %in% show$peak) + +ggplot(norm_counts_log_long_top, aes(x = .data[[params$factor_of_interest]], y = norm_counts_log2)) + + facet_wrap(~peak, scale = 'free_y') + geom_boxplot() +``` diff --git a/inst/templates/chipseq/diffbind/params_diffbind-example.R b/inst/templates/chipseq/diffbind/params_diffbind-example.R index 655d32c..6db638d 100644 --- a/inst/templates/chipseq/diffbind/params_diffbind-example.R +++ b/inst/templates/chipseq/diffbind/params_diffbind-example.R @@ -2,6 +2,6 @@ # Example data -coldata_fn='/workspace/s3_hcbc_seqera/test-data/chipseq_peakanalysis_h3k27ac_narrow/chipseq_peakanalysis_H3K27Ac.csv' -peaks_dir = '/workspace/s3_hcbc_seqera/test-data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/' -bam_dir = '/workspace/s3_hcbc_seqera/test-data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/' +coldata_fn='/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/chipseq_peakanalysis_H3K27Ac.csv' +peaks_dir = '/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/' +bam_dir = '/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/' From 46da11643e15585fd318fc77e917abf3135bd6bb Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Thu, 26 Sep 2024 21:38:38 +0000 Subject: [PATCH 78/93] begin adding annotation and functional enrichment --- inst/templates/chipseq/diffbind/diffbind.Rmd | 65 ++++++++++++++++---- inst/templates/chipseq/libs/load_data.R | 8 +-- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd index f0ef924..9a6402d 100644 --- a/inst/templates/chipseq/diffbind/diffbind.Rmd +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -26,6 +26,8 @@ params: factor_of_interest: genotype numerator: cKO denominator: WT + # species = mouse or human + species: mouse --- ```{r, cache = FALSE, message = FALSE, warning=FALSE} @@ -73,6 +75,19 @@ library(DiffBind) library(qs) library(EnhancedVolcano) library(ggprism) +library(ChIPseeker) + +if (params$species == 'mouse'){ + library(TxDb.Mmusculus.UCSC.mm10.knownGene) + txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene + anno_db <- 'org.Mm.eg.db' +} else if (params$species == human){ + library(TxDb.Hsapiens.UCSC.hg38.knownGene) + txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene + anno_db <- 'org.Hs.eg.db' +} + + colors=cb_friendly_cols(1:15) ggplot2::theme_set(theme_prism(base_size = 14)) opts_chunk[["set"]]( @@ -89,6 +104,7 @@ opts_chunk[["set"]]( # set seed for reproducibility set.seed(1234567890L) + ``` @@ -118,7 +134,7 @@ samplesheet <- make_diffbind_samplesheet(coldata, bam_dir, peaks_dir, params$fac ``` ```{r show_metadata} -samplesheet %>% select(SampleID, Replicate, Condition, Factor, ControlID) %>% sanitize_datatable() +samplesheet %>% dplyr::select(SampleID, Replicate, Condition, Factor, ControlID) %>% sanitize_datatable() ``` ```{r create diffbind counts object, eval = !file.exists(params$diffbind_counts_file)} @@ -142,9 +158,9 @@ diffbind_norm <- dba.normalize(diffbind_counts) norm_counts <- dba.peakset(diffbind_norm, bRetrieve=TRUE, DataType=DBA_DATA_FRAME) %>% mutate(peak = paste(CHR, START, END, sep = '_')) %>% - select(-CHR, -START, -END) + dplyr::select(-CHR, -START, -END) rownames(norm_counts) <- norm_counts$peak -norm_counts <- norm_counts %>% select(-peak) %>% as.matrix() +norm_counts <- norm_counts %>% dplyr::select(-peak) %>% as.matrix() norm_counts_log <- log2(norm_counts + 1) coldata_for_pca <- coldata[colnames(norm_counts), ] @@ -161,24 +177,27 @@ degPCA(norm_counts_log, coldata_for_pca, condition = params$factor_of_interest) ## Table ```{r DE analysis} diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Factor', params$numerator, params$denominator)) -results <- dba.analyze(diffbind_norm, bGreylist = F) +results_obj <- dba.analyze(diffbind_norm, bGreylist = F) + +results_report <- dba.report(results_obj, th = 1) +results_report_sig <- dba.report(results_obj) -results_report <- dba.report(results, th = 1) %>% as.data.frame() -results_report_sig <- results_report %>% filter(FDR < 0.05) +results <- results_report %>% as.data.frame() +results_sig <- results_report_sig %>% as.data.frame() -results_report_sig %>% sanitize_datatable() +results_sig %>% sanitize_datatable() ``` ## Volcano plot ```{r volcano, fig.height = 8} -results_report_mod <- results_report %>% +results_mod <- results %>% mutate(Fold = replace(Fold, Fold < -5, -5)) %>% mutate(Fold = replace(Fold, Fold > 5, 5)) %>% mutate(peak = paste(seqnames, start, end, sep = '_')) -show <- as.data.frame(results_report_mod[1:6, c("Fold", "FDR", "peak")]) -EnhancedVolcano(results_report_mod, - lab= results_report_mod$peak, +show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "peak")]) +EnhancedVolcano(results_mod, + lab= results_mod$peak, pCutoff = 0.05, selectLab = c(show$peak), FCcutoff = 0.5, @@ -203,3 +222,27 @@ norm_counts_log_long_top <- norm_counts_log_long %>% filter(peak %in% show$peak) ggplot(norm_counts_log_long_top, aes(x = .data[[params$factor_of_interest]], y = norm_counts_log2)) + facet_wrap(~peak, scale = 'free_y') + geom_boxplot() ``` + +## Annotate DB peaks + +```{r annotate, echo = F} + +results_sig_anno <- annotatePeak(results_report_sig, + tssRegion = c(-2000, 2000), + TxDb = txdb, + annoDb = params$anno_db, + verbose = F) +results_sig_anno_df <- results_sig_anno %>% as.data.frame() + +plotAnnoPie(results_sig_anno) + +plotDistToTSS(results_sig_anno) + +anno_data <- toGRanges(txdb, feature = 'gene') +results_sig_anno_batch <- annotatePeakInBatch(results_report_sig, + AnnotationData = anno_data, + output = 'overlapping', + maxgap = 1000) + +results_sig_anno_batch_df <- results_sig_anno_batch %>% as.data.frame() +``` \ No newline at end of file diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R index b54b6a7..e233ec4 100755 --- a/inst/templates/chipseq/libs/load_data.R +++ b/inst/templates/chipseq/libs/load_data.R @@ -19,9 +19,9 @@ load_metrics <- function(multiqc_data_dir){ phantom <- read_tsv(file.path(multiqc_data_dir, 'multiqc_phantompeakqualtools.txt')) %>% clean_names() %>% dplyr::select(sample, nsc, rsc) - frip <- read_tsv(file.path(multiqc_data_dir, 'multiqc_frip_score-plot.txt')) %>% select(-Sample) %>% + frip <- read_tsv(file.path(multiqc_data_dir, 'multiqc_frip_score-plot.txt')) %>% dplyr::select(-Sample) %>% pivot_longer(everything(), names_to = 'sample', values_to = 'frip') %>% filter(!is.na(frip)) - peak_count <- read_tsv(file.path(multiqc_data_dir, 'multiqc_peak_count-plot.txt')) %>% select(-Sample) %>% + peak_count <- read_tsv(file.path(multiqc_data_dir, 'multiqc_peak_count-plot.txt')) %>% dplyr::select(-Sample) %>% pivot_longer(everything(), names_to = 'sample', values_to = 'peak_count') %>% filter(!is.na(peak_count)) nrf <- read_tsv(file.path(multiqc_data_dir, 'mqc_picard_deduplication_1.txt')) %>% clean_names() %>% mutate(nrf = unique_unpaired / (unique_unpaired + duplicate_unpaired)) %>% @@ -108,8 +108,8 @@ make_diffbind_samplesheet <- function(coldata, bam_dir, peaks_dir, column = NULL coldata_for_diffbind$Factor <- coldata_for_diffbind[[column]] samplesheet <- coldata_for_diffbind %>% - left_join(bam_files %>% select(SampleID = sample, bamReads = bam), by = 'SampleID') %>% - left_join(bam_files %>% select(ControlID = sample, bamControl = bam), by = 'ControlID') %>% + left_join(bam_files %>% dplyr::select(SampleID = sample, bamReads = bam), by = 'SampleID') %>% + left_join(bam_files %>% dplyr::select(ControlID = sample, bamControl = bam), by = 'ControlID') %>% left_join(peak_files, by = 'SampleID') return(samplesheet) From fb97f8e99fab81b38ddb2ea59726278f64e91d1a Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Fri, 27 Sep 2024 14:58:54 +0000 Subject: [PATCH 79/93] investigate peak anno functions --- inst/templates/chipseq/diffbind/diffbind.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd index 9a6402d..9487e1d 100644 --- a/inst/templates/chipseq/diffbind/diffbind.Rmd +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -69,7 +69,7 @@ library(DT) library(pheatmap) library(bcbioR) library(janitor) -# library(ChIPpeakAnno) +library(ChIPpeakAnno) library(UpSetR) library(DiffBind) library(qs) From d30c42e7ed0a05545c22e9d623b1fa565e2f46be Mon Sep 17 00:00:00 2001 From: eberdan Date: Tue, 1 Oct 2024 09:40:31 -0400 Subject: [PATCH 80/93] Add files via upload --- .../scRNA_normalization_template.rmd | 646 ++++++++++++++++++ 1 file changed, 646 insertions(+) create mode 100644 inst/templates/singlecell/Integration/scRNA_normalization_template.rmd diff --git a/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd b/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd new file mode 100644 index 0000000..a8f3c49 --- /dev/null +++ b/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd @@ -0,0 +1,646 @@ +--- +title: "scRNA normalization and clustering" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: inline +params: + params_file: parameters.R + project_file: ../information.R +--- + +Template developed with materials from https://hbctraining.github.io/main/. + +```{r} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +``` + + + +```{r setup, cache=FALSE, message=FALSE, warning=FALSE, echo=FALSE} +knitr::opts_chunk$set(echo = TRUE) +# Load libraries +library(Seurat) +library(harmony) +library(knitr) +library(rmarkdown) +library(tidyverse) +library(data.table) +library(DT) +library(patchwork) +library(clustree) +#library(future) +# Set seed for reproducibility +set.seed(1454944673L) +opts_chunk[["set"]]( + audodep = TRUE, + cache = FALSE, + cache.lazy = FALSE, + error = TRUE, + echo = FALSE, + fig.height = 5L, + fig.retina = 2L, + fig.width = 9.6, + message = FALSE, + tidy = TRUE, + warning = TRUE +) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` + + +## Dataset + +The Seurat object used as input for this report was prepared with the thresholds detailed below applied. + +-nGenes > `nFeature_RNA_cutoff` +-nUMI > `nCount_RNA_cutoff` +-complexity > `Log10GenesPerUMI_cutoff` +-percent mitochondrial reads < `mitoRatio_cutoff` + + +```{r load_data, cache = TRUE} + +# Loading QC'd object +seurat_qc <- readRDS( "seurat_post-QC.rds") +DefaultAssay(seurat_qc) <- "RNA" + +# Fix number of PCs for RNA assay +rpcs <- 50 + + +# Define color scales for up to 24 clusters/samples +colsD <- RColorBrewer::brewer.pal(8, "Dark2") +colsM <- RColorBrewer::brewer.pal(8, "Set2") +colsL <- RColorBrewer::brewer.pal(8, "Pastel2") +# Stack same colors from dark to pastel +cols3 <- unlist(strsplit(paste(colsD, colsM, colsL, sep = "_"), "_")) +cols2 <- c(unlist(strsplit(paste(colsD, colsM, sep = "_"), "_")), "deepskyblue2") + +``` + + +After filtering, each sample contributed the following number of cells to the analysis: + +```{r meta pre doub} +table(seurat_qc$orig.ident) +``` + + +# Sources of variability Log normalization {.tabset} + +In this section, we look at potential confounding variables in our (post-QC) dataset, to determine whether their effect needs to be accounted for before normalizing and integrating the data. + +To enable meaningful visualization of the data, we apply a minimal normalization to our raw data (log-normalization). We then identify the top 2000 most variable genes across the log-normalized data, i.e. those with the greatest variability in expression level from one cell to the next. Finally, we calculate principal components (PCs) based on these top 2000 most variable genes, and use the first 50 PCs to derive reduced UMAP (Uniform Manifold Approximation and Projection) components. + + +**We start with log normalization because it is good to observe the data and any trends using a simple transformation. More complex methods like SCT can alter the data in a way that is not as intuitive to interpret.** + +```{r rna_norm0, warning=FALSE, message=FALSE} + #Normalize data +seurat_lognorm <- NormalizeData(seurat_qc, + normalization.method = "LogNormalize", + scale.factor = 10000) + +# Find variable genes (largest dispersion in expression across cells) +seurat_lognorm <- FindVariableFeatures(seurat_lognorm, nfeatures = 2000) + + +# Scale and center data +seurat_lognorm <- ScaleData(seurat_lognorm, model.use = "linear") + +# Calculate PCs and UMAP +seurat_lognorm <- RunPCA(seurat_lognorm) +seurat_lognorm <- RunUMAP(seurat_lognorm, 1:50) + + +``` + + +## Examine highly variable genes + +Highly variable gene selection is extremely important since many downstream steps are computed only on these genes. Seurat allows us to access the ranked highly variable genes with the VariableFeatures() function. We can additionally visualize the dispersion of all genes using Seurat’s VariableFeaturePlot(), which shows a gene’s average expression across all cells on the x-axis and variance on the y-axis. Ideally we want to use genes that have high variance since this can indicate a change in expression depending on populations of cells. Adding labels using the LabelPoints() helps us understand which genes will be driving shape of our data. + +```{r} +# Identify the 15 most highly variable genes +ranked_variable_genes <- VariableFeatures(seurat_lognorm) +top_genes <- ranked_variable_genes[1:15] + +# Plot the average expression and variance of these genes +# With labels to indicate which genes are in the top 15 +p <- VariableFeaturePlot(seurat_lognorm) +LabelPoints(plot = p, points = top_genes, repel = TRUE) +``` + + + + + +## Sample x covariates + +We then use the UMAP reduction to explore our dataset and assess how different variables influence cell clustering. Throughout this report, **UMAP representations are split by various covariates**, to enable checking for potential phenotype-specific clustering. + + +```{r} + +## Below is an example plot, change the group.by and split.by parameters to make plots with your own covariates. + + +UMAPPlot(seurat_norm, group.by = "orig.ident", split.by = "Sex") + ggtitle("UMAP (split by Surgery)") + + +``` + + +## Cell cycle + +The phase of the cell cycle that cells are in at the time of sample preparation can introduce some variability in the transcriptome that we are not interested in exploring. + +To examine cell cycle variation in our data, we assign a score to each cell, derived from the overall expression level of known markers of the G2/M and S phase in that cell. We then display the cells, color-coded by inferred cell cycle phase, on our UMAP. + +Unless cells very strongly cluster by phase of the cell cycle (which is not the case here), we do not recommend to regress out the effect of the cell cycle. + +```{r cell_cycle_scoring, message=FALSE, warning=FALSE} +# Step 1 - Get cell cycle markers +## Cell cycle markers for c.elegans, human, mouse, D. rerio, and D. melanogaster can be found here: https://github.com/hbc/tinyatlas/tree/1e2136a35e773f14d97ae9cbdb6c375327b2dd2b/cell_cycle + + +# Source cell cycle markers +cc_markers = read.csv("/path/to/cell/cycle/markers", sep=",") + + + +# Compute cell cycle score for each cell +seurat_norm <- CellCycleScoring(seurat_norm, + g2m.features = cc_markers$external_gene_name[cc_markers$phase=="G2/M"], + s.features = cc_markers$external_gene_name[cc_markers$phase=="S"]) + + +## Plot cell cycle (grouped by) along with covariates (split.by). Add in your covariates of interest + +UMAPPlot(seurat_norm, group.by = "Phase", split.by = "sex") + ggtitle("UMAP (split by sex)") + + +``` + + + +## mitoRatio + +The mitochondrial to nuclear gene ratio (mitoRatio) is a marker of cellular stress and might also affect cell clustering. For this dataset, we have seen during QC that the fraction of mitochondrial genes was negligible (which is good). Therefore, we do not expect the need to regress out this variable for normalization purposes, but it's always good to check. + +```{r mito_ratio} + + +## This custom function by Amelie Jule creates great plots for looking at different QC parameters across the UMAP + +signaturePlot <- function(seurat_object, + gene_signature, + reduction = "umap", + split_var = NULL, + pt_size = 0.5) { + + g1 <- FeaturePlot(seurat_object, + features = gene_signature, + reduction = reduction, + split.by = split_var, + order = TRUE, + pt.size = pt_size, + combine = FALSE) + + min_val <- min(pull(seurat_object@meta.data, gene_signature)) + max_val <- max(pull(seurat_object@meta.data, gene_signature)) + fix_params <- scale_color_gradientn(colours = c("grey80", "blue"), + limits = c(min_val, max_val)) + + g2 <- lapply(g1, function (x) { x + fix_params + + theme_minimal() + # theme_void() + theme(legend.position = "bottom", + plot.title = element_text(hjust = 0.5)) + + ggtitle("") }) + + g2 + +} + + + +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + +g <- signaturePlot(seurat_norm, + gene_signature = "mitoRatio", + split_var = "sex") +g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") + +``` + + +## nUMIs (nCount) + + +```{r} + +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + +g <- signaturePlot(seurat_norm, + gene_signature = "nCount_RNA", + split_var = "sex") +g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") + + + +``` + + + +## nGenes (nFeature) + + +```{r} + +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + + +g <- signaturePlot(seurat_norm, + gene_signature = "nFeature_RNA", + split_var = "sex") +g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") + + +``` + + +## Complexity + + +```{r} + +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + + +g <- signaturePlot(seurat_norm, + gene_signature = "Log10GenesPerUMI", + split_var = "sex") +g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") + + + +``` + + + +# SCT Normalization {.tabset} + +Now that we have established which effects are observed in our data, we can use the SCTransform method to regress out these effects. The SCTransform method was proposed as a better alternative to the log transform normalization method that we used for exploring sources of unwanted variation. The method not only normalizes data, but it also performs a variance stabilization and allows for additional covariates to be regressed out. + +All genes cannot be treated the same, as such, the SCTransform method constructs a generalized linear model (GLM) for each gene with UMI counts as the response and sequencing depth as the explanatory variable. Information is pooled across genes with similar abundances, to regularize parameter estimates and obtain residuals which represent effectively normalized data values which are no longer correlated with sequencing depth. + +We searched for the top 3000 genes with the largest variability in expression level from cell to cell after SCT-normalization, and re-calculated our principal and UMAP components based on the SCT-normalized data for these top genes. + + +**We keep each sample separate for SCT normalization.** + +```{r rna_norm1, warning=FALSE, message=FALSE, eval=FALSE} + +## Note that this single command replaces NormalizeData(), ScaleData(), and FindVariableFeatures() + +## SCT can be run with and without regressing out variables. Generally we do not regress out covariates. However, we provide both options below. + + +## To properly integrate with harmony we split our object by sample first. + +seurat_sctnorm <- SplitObject(seurat_norm, split.by = "orig.ident") + +for (i in 1:length(split_sctnorm)) { + split_sctnorm[[i]] <- SCTransform(split_sctnorm[[i]], vst.flavor = "v2", variable.features.n = 3000) +} + + +#for (i in 1:length(split_seurat)) { +# split_seurat[[i]] <- SCTransform(split_seurat[[i]], vars.to.regress = c("mitoRatio"), vst.flavor = "v2", variable.features.n = 3000) +} + + + +``` + + +## Look at UMAPs post SCT + +The plots below show the same variables as before, this time **displayed on the UMAP calculated after applying SCT-normalization**. + +We qualitatively reviewed the "structure" in our normalized data projection . We were particularly interested in seeing whether similar cell populations across samples clustered together (i.e. overlapped on the UMAP). + + + +```{r, fig.width=10} + + +DefaultAssay(seurat_sctnorm) <- "SCT" + + +UMAPPlot(seurat_sctnorm, group.by = "sex") + ggtitle("UMAP by sex") + +``` + + + + +### Cell cycle + +The phase of the cell cycle that cells are in at the time of sample preparation can introduce some variability in the transcriptome that we are not interested in exploring. + +To examine cell cycle variation in our data, we assign a score to each cell, derived from the overall expression level of known markers of the G2/M and S phase in that cell. We then display the cells, color-coded by inferred cell cycle phase, on our UMAP. + +Unless cells very strongly cluster by phase of the cell cycle (which is not the case here), we do not recommend to regress out the effect of the cell cycle. + +```{r cell_cycle_scoring, message=FALSE, warning=FALSE} +# Step 1 - Get cell cycle markers +## Cell cycle markers for c.elegans, human, mouse, D. rerio, and D. melanogaster can be found here: https://github.com/hbc/tinyatlas/tree/1e2136a35e773f14d97ae9cbdb6c375327b2dd2b/cell_cycle + + +# Source cell cycle markers +cc_markers = read.csv("/path/to/cell/cycle/markers", sep=",") + + + +# Compute cell cycle score for each cell +seurat_sctnorm <- CellCycleScoring(seurat_sctnorm, + g2m.features = cc_markers$external_gene_name[cc_markers$phase=="G2/M"], + s.features = cc_markers$external_gene_name[cc_markers$phase=="S"]) + + +## Plot cell cycle (grouped by) along with covariates (split.by). Add in your covariates of interest + +UMAPPlot(seurat_sctnorm, group.by = "Phase", split.by = "sex") + ggtitle("UMAP (split by sex)") + + +``` + + + +### mitoRatio + +The mitochondrial to nuclear gene ratio (mitoRatio) is a marker of cellular stress and might also affect cell clustering. For this dataset, we have seen during QC that the fraction of mitochondrial genes was negligible (which is good). Therefore, we do not expect the need to regress out this variable for normalization purposes, but it's always good to check. + +```{r mito_ratio} + + +## This custom function by Amelie Jule creates great plots for looking at different QC parameters across the UMAP + +signaturePlot <- function(seurat_object, + gene_signature, + reduction = "umap", + split_var = NULL, + pt_size = 0.5) { + + g1 <- FeaturePlot(seurat_object, + features = gene_signature, + reduction = reduction, + split.by = split_var, + order = TRUE, + pt.size = pt_size, + combine = FALSE) + + min_val <- min(pull(seurat_object@meta.data, gene_signature)) + max_val <- max(pull(seurat_object@meta.data, gene_signature)) + fix_params <- scale_color_gradientn(colours = c("grey80", "blue"), + limits = c(min_val, max_val)) + + g2 <- lapply(g1, function (x) { x + fix_params + + theme_minimal() + # theme_void() + theme(legend.position = "bottom", + plot.title = element_text(hjust = 0.5)) + + ggtitle("") }) + + g2 + +} + + + +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + +g <- signaturePlot(seurat_sctnorm, + gene_signature = "mitoRatio", + split_var = "sex") +g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") + +``` + + +### nUMIs (nCount) + + +```{r} + +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + +g <- signaturePlot(seurat_sctnorm, + gene_signature = "nCount_RNA", + split_var = "sex") +g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") + + + +``` + + + +### nGenes (nFeature) + + +```{r} + +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + + +g <- signaturePlot(seurat_sctnorm, + gene_signature = "nFeature_RNA", + split_var = "sex") +g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") + + +``` + + +## Complexity + + +```{r} + +### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] + + +g <- signaturePlot(seurat_sctnorm, + gene_signature = "Log10GenesPerUMI", + split_var = "sex") +g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") + + + +``` + +## Run harmony and umaps again + +If cells cluster by sample, condition, batch, dataset, modality, this integration step can greatly improve the clustering and the downstream analyses. + +To integrate, we will use the shared highly variable genes (identified using SCTransform) from each group, then, we will “integrate” or “harmonize” the groups to overlay cells that are similar or have a “common set of biological features” between groups. + +We use [`Harmony`](https://portals.broadinstitute.org/harmony/articles/quickstart.html), which is based on a transformation of principal components (PCs) to find similarities across datasets. Here we group samples by the original sample id. + +```{r} + +## Here seurat will integrate on the level of sample id. If you want to integrate on other aspects the SCT normalization will need to be done with all of the data together. + +seurat_harmony <- IntegrateLayers(object = seurat_sctnorm, method = HarmonyIntegration, + orig.reduction = "pca", new.reduction = 'harmony', + assay = "SCT", verbose = FALSE) +seurat_harmony <- RunUMAP(seurat_harmony, reduction = "harmony", dims = 1:40, reduction.name = "umap.harmony") + +``` + + +## Pre vs. Post integration + +```{r dimplot_both all, echo=FALSE} +p1 <- DimPlot(seurat_sctnorm, group.by = "orig.ident", + reduction = "umap") + + theme(legend.position = "bottom") + + ggtitle("pre-integration") +p2 <- DimPlot(seurat_harmony, group.by = "orig.ident", + reduction = "umap.harmony") + + theme(legend.position = "bottom") + + ggtitle("post-integration") + + +p1 | p2 +``` + + +## Clustering + +For single-modality scRNA-seq analysis, `Seurat` clusters the cells using a Louvain clustering approach. First, a K-nearest neighbor (KNN) graph is built, where cells are connected if they have a similar transcriptome, as determined from their scores on the first `r rpcs` PCs. Then, the graph is partitioned into "communities" or "clusters" of interconnected cells that are more tightly connected with each other than with cells outside of the corresponding cluster. + +A limitation of this approach is that the number of identified clusters depends on the chosen resolution, a parameter that must be set by the user and does not necessarily reflect the underlying biology of the dataset. For most single-cell datasets, a resolution of 0.1 to 1 will provide a reasonable number of clusters. Complex datasets with multiple cell types may require a larger resolution, and vice versa. + + + +```{r find_neighbors all, echo=TRUE} +seurat_harmony <- FindNeighbors(seurat_harmony, reduction = "harmony", dims = 1:40) + + +seurat_clust <- FindClusters(object = seurat_harmony, + resolution = c(0.1, 0.2, 0.4, 0.6, 0.8, 1.0), verbose = FALSE) + +``` + + + +## Clustering Tree + + +We build a clustering tree using the [clustree](https://lazappi.github.io/clustree/articles/clustree.html) package to show how cells move as the clustering resolution is increased. Each cluster forms a node in the tree and edges are constructed by considering the cells in a cluster at a lower resolution (say 𝑘=2) that end up in a cluster at the next highest resolution (say 𝑘=3). By connecting clusters in this way we can see how clusters are related to each other, which are clearly distinct and which are unstable. The size of each node is related to the number of samples in each cluster and the color indicates the clustering resolution. Edges are colored according to the number of samples they represent and the transparency shows the incoming node proportion, the number of samples in the edge divided by the number of samples in the node it points to. + + + +```{r, fig.height=10, fig.width=8} +library(clustree) + +meta = seurat_clust@meta.data +meta = na.omit(meta) + + +## Change the prefix to match your clusters + +clustree(meta, prefix = "SCT_snn_res.") + + +``` + +## Visualize clusters {.tabset} + +We take a look at how the clusters look at resolutions 0.1, 0.2,0.4, and 0.6 + +### 0.1 + +```{r umap_0.1} +cluster_res <- 0.1 +Idents(object = seurat_clust) <- paste0("SCT_snn_res.", cluster_res) +DimPlot(seurat_clust, + reduction = "umap", + split.by = "surgery", + label = TRUE) +``` + +* * * + + +### 0.2 + +```{r umap_0.2} +cluster_res <- 0.2 +Idents(object = seurat_clust) <- paste0("SCT_snn_res.", cluster_res) +DimPlot(seurat_clust, + reduction = "umap", + split.by = "surgery", + label = TRUE) +``` + +* * * + + + +### 0.4 + +```{r umap_0.4} +cluster_res <- 0.4 +Idents(object = seurat_clust) <- paste0("SCT_snn_res.", cluster_res) +DimPlot(seurat_clust, + reduction = "umap", + split.by = "surgery", + label = TRUE) +``` + +* * * + + +### 0.6 + +```{r umap_0.6} +cluster_res <- 0.6 +Idents(object = seurat_clust) <- paste0("SCT_snn_res.", cluster_res) +DimPlot(seurat_clust, + reduction = "umap", + split.by = "surgery", + label = TRUE) +``` + +* * * + + +```{r} +saveRDS(seurat_clust, "seurat_clust.RDs") +``` + From 33f64dc3e1ed223eff0eaa6a121bf19673a2865c Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Tue, 1 Oct 2024 18:00:40 +0000 Subject: [PATCH 81/93] diffbind text, write files --- .gitignore | 2 + inst/templates/chipseq/diffbind/diffbind.Rmd | 82 +++++++++++++++++++- 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6af5e77..a6d9e0e 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,5 @@ inst/rmarkdown/templates/rnaseq/skeleton/DE/Multiplicative_DGE_Analysis.Rmd .quarto inst/templates/chipseq/QC/QC.html *.html +*.csv +*.qs \ No newline at end of file diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd index 9487e1d..ad51426 100644 --- a/inst/templates/chipseq/diffbind/diffbind.Rmd +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -28,7 +28,10 @@ params: denominator: WT # species = mouse or human species: mouse + counts_fn: diffbind_counts.csv + results_sig_anno_fn: diffbind_results_sig_anno.csv --- +Template developed with materials from https://hbctraining.github.io/main/. ```{r, cache = FALSE, message = FALSE, warning=FALSE} # This set up the working directory to this file so all files can be found @@ -56,6 +59,22 @@ source(params$functions_file) - Analyst: `r analyst` - Experiment: `r experiment` +# Methodology + +[DiffBind](https://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf) +is an R Bioconductor package which provides functions for processing +DNA data enriched for genomic loci, including ChIPseq data enriched for sites +where specific protein/DNA binding occurs or histone marks are enriched. + +DiffBind is mainly used for identifying sites that are differentially enriched +between two or more sample groups. It works primarily with sets of peak calls +('peaksets'), which are sets of genomic intervals representing candidate protein +binding sites for each sample. It includes functions that support the processing +of peaksets, including overlapping and merging peak sets across an entire dataset, +counting sequencing reads in overlapping intervals in peak sets, and identifying +statistically significantly differentially bound sites based on evidence of +binding affinity (measured by differences in read densities). To this end it uses +statistical routines developed in an RNA-Seq context (primarily the Bioconductor packages [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) and [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)). ```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} library(tidyverse) @@ -137,6 +156,27 @@ samplesheet <- make_diffbind_samplesheet(coldata, bam_dir, peaks_dir, params$fac samplesheet %>% dplyr::select(SampleID, Replicate, Condition, Factor, ControlID) %>% sanitize_datatable() ``` +# Calculate counts matrix + +The first step is to read in a set of peaksets and associated metadata. +This is done using the DiffBind sample sheet. Once the peaksets are read in, +a merging function finds all overlapping peaks and derives a single set of unique +genomic intervals covering all the supplied peaks (a consensus peakset for +the experiment). A region is considered for the consensus set if it appears in +more than two of the samples. This consensus set represents the overall set of +candidate binding sites to be used in further analysis. + +The next step is to take the alignment files and compute count information for +each of the peaks/regions in the consensus set. In this step, for each of the +consensus regions, DiffBind uses the number of aligned reads in the ChIP sample +and the input sample to compute a normalized read count for each sample at every +potential binding site. The peaks in the consensus peakset may be re-centered and +trimmed based on calculating their summits (point of greatest read overlap) in +order to provide more standardized peak intervals. + +We then normalize the count matrix to adjust for varying +library size, and we use the normalized counts for further analysis including PCA. + ```{r create diffbind counts object, eval = !file.exists(params$diffbind_counts_file)} diffbind_obj <- dba(sampleSheet = samplesheet, scoreCol = 5) @@ -150,6 +190,12 @@ qsave(diffbind_counts, params$diffbind_counts_file) ``` # PCA + +Principal Component Analysis (PCA) is a statistical technique used to simplify +high-dimensional data by identifying patterns and reducing the number of variables. +In the context of ChIPseq, PCA helps analyze large datasets containing information +about thousands of binding locations across different samples (e.g., tissues, cells). + ```{r PCA} diffbind_counts <- qread(params$diffbind_counts_file) @@ -162,6 +208,10 @@ norm_counts <- dba.peakset(diffbind_norm, bRetrieve=TRUE, DataType=DBA_DATA_FRAM rownames(norm_counts) <- norm_counts$peak norm_counts <- norm_counts %>% dplyr::select(-peak) %>% as.matrix() norm_counts_log <- log2(norm_counts + 1) +norm_counts_log_df <- norm_counts_log %>% as.data.frame() %>% + rownames_to_column('peak') + +write_csv(norm_counts_log_df, params$counts_fn) coldata_for_pca <- coldata[colnames(norm_counts), ] @@ -174,6 +224,11 @@ degPCA(norm_counts_log, coldata_for_pca, condition = params$factor_of_interest) # Differentially Bound Peaks +A standardized differential analysis is performed using DiffBind and the DESeq2 package, +including estimation of size factors and dispersions, fitting and testing the +model, evaluating the supplied contrast, and shrinking the LFCs. A p-value and FDR +is assigned to each candidate binding site indicating confidence that they are differentially bound. + ## Table ```{r DE analysis} diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Factor', params$numerator, params$denominator)) @@ -190,6 +245,9 @@ results_sig %>% sanitize_datatable() ``` ## Volcano plot + +This volcano plot shows the binding sites that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are sites that have padj < 0.05 and a log2-fold change magnitude > 0.5. Points in blue have a padj > 0.05 and a log2-fold change magnitude > 0.5. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2-fold change and padj that we have chosen. + ```{r volcano, fig.height = 8} results_mod <- results %>% mutate(Fold = replace(Fold, Fold < -5, -5)) %>% @@ -211,6 +269,9 @@ EnhancedVolcano(results_mod, ``` ## Plot top peaks + +We visualize the log2 normalized read counts at a few of the most differentially +bound sites. ```{r plot top peaks, fig.width = 8, fig.height = 6} norm_counts_log_long <- norm_counts_log %>% as.data.frame() %>% rownames_to_column('peak') %>% @@ -225,6 +286,13 @@ ggplot(norm_counts_log_long_top, aes(x = .data[[params$factor_of_interest]], y = ## Annotate DB peaks +We use the [ChIPseeker](https://www.bioconductor.org/packages/release/bioc/html/ChIPseeker.html) +package to determine the genomic context of the differentially bound peaks and +visualize these annotations. We consider the promoter region to be within 2000 bp in either direction of the TSS. + +We also use [ChIPpeakAnno](https://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html) +to identify any gene features within 1000 bp of a differentially bound site. + ```{r annotate, echo = F} results_sig_anno <- annotatePeak(results_report_sig, @@ -245,4 +313,16 @@ results_sig_anno_batch <- annotatePeakInBatch(results_report_sig, maxgap = 1000) results_sig_anno_batch_df <- results_sig_anno_batch %>% as.data.frame() -``` \ No newline at end of file + +write_csv(results_sig_anno_batch_df, params$results_sig_anno_fn) +``` + +# Functional Enrichment + +# R session + +List and version of tools used for the report generation. + +```{r} +sessionInfo() +``` From 3ec2536b4447eaa4e2b48b33822cd39d91eeb5bc Mon Sep 17 00:00:00 2001 From: Alex Bartlett <74612800+abartlett004@users.noreply.github.com> Date: Tue, 1 Oct 2024 14:15:31 -0400 Subject: [PATCH 82/93] Update README.md --- inst/templates/rnaseq/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inst/templates/rnaseq/README.md b/inst/templates/rnaseq/README.md index 6de2117..e6e6cbc 100644 --- a/inst/templates/rnaseq/README.md +++ b/inst/templates/rnaseq/README.md @@ -5,8 +5,8 @@ Make sure there is a project name for this. ## Run data with nf-core rnaseq - Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) -- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input/rawdata` folder -- Prepare the CSV file according this [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: +- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input` folder +- Prepare the CSV file according these [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: ```csv sample,fastq_1,fastq_2,strandedness @@ -22,11 +22,11 @@ You can add more columns to this file with more metadata, and use this file as t - Upload file to our `Datasets` in Seqera using the name of the project but starting with `rnaseq-pi_lastname-hbc_code` - Go to `Launchpad`, select `nf-core_rnaseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 -- When pipeline is down, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder +- When pipeline is done, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder ## Downstream analysis -Please, modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. +Modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. ### QC @@ -37,7 +37,7 @@ Read instruction in the R and Rmd scripts to render it. ### DE -`DE/DEG.Rmd` is a template for two groups comparison. `params_de.R` has the information of the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. +`DE/DEG.Rmd` is a template for comparison between two groups. `params_de.R` has the information for the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: From 8181c8ac96e8b48b5f255e12859304be9a1d1e96 Mon Sep 17 00:00:00 2001 From: Alex Bartlett <74612800+abartlett004@users.noreply.github.com> Date: Tue, 1 Oct 2024 14:27:03 -0400 Subject: [PATCH 83/93] Update readme.md --- inst/templates/chipseq/readme.md | 41 ++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/inst/templates/chipseq/readme.md b/inst/templates/chipseq/readme.md index 112ddac..efa37c1 100755 --- a/inst/templates/chipseq/readme.md +++ b/inst/templates/chipseq/readme.md @@ -2,11 +2,44 @@ Make sure there is a valid project name, and modify `information.R` with the right information for your project. You can use this file with any other Rmd to include the project/analysis information. +## Run data with nf-core rnaseq + +- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) +- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input` folder +- Prepare the CSV file according these [instructions](https://nf-co.re/chipseq/2.0.0/docs/usage/). File should look like this: + +```csv +sample,fastq_1,fastq_2,antibody,control +WT_BCATENIN_IP_REP1,BLA203A1_S27_L006_R1_001.fastq.gz,,BCATENIN,WT_INPUT +WT_BCATENIN_IP_REP2,BLA203A25_S16_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT +WT_BCATENIN_IP_REP2,BLA203A25_S16_L002_R1_001.fastq.gz,,BCATENIN,WT_INPUT +WT_BCATENIN_IP_REP2,BLA203A25_S16_L003_R1_001.fastq.gz,,BCATENIN,WT_INPUT +WT_BCATENIN_IP_REP3,BLA203A49_S40_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT +WT_INPUT_REP1,BLA203A6_S32_L006_R1_001.fastq.gz,,, +WT_INPUT_REP2,BLA203A30_S21_L001_R1_001.fastq.gz,,, +WT_INPUT_REP2,BLA203A30_S21_L002_R1_001.fastq.gz,,, +WT_INPUT_REP3,BLA203A31_S21_L003_R1_001.fastq.gz,,, +``` + +You can add more columns to this file with more metadata, and use this file as the `coldata` file in the templates. + +- Upload file to our `Datasets` in Seqera using the name of the project but starting with `chipseq-pi_lastname-hbc_code` +- Go to `Launchpad`, select `nf-core_chipseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` + - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 +- When pipeline is done, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder + ## QC -`QC/QC.Rmd` is a template for QC metrics. It includes basic read-level statistics, peak quality information, sample correlation analysis, and PCA. +`QC/QC.Rmd` is a template for QC metrics. It includes basic read-level statistics, peak quality information, sample correlation analysis, and PCA that it produces using the above samplesheet and output from the nf-core pipeline. Use `params_qc.R` to provide the required input files. + +## DiffBind + +`diffbind/diffbind.Rmd` is a template for comparing peak binding betweeen two groups. Use `params_diffbind.R` to provide the required input files. -## DropBox +On the YAML header file of the Rmd you can specify some parameters including the conditions to be compared, the genome used, and the desired output file names. This template has examples of: +* calculating a peak counts matrix +* PCA +* differential binding analyiss +* peak annotation +* functional analysis (coming soon) -- In `reports/QC` - - [ ] copy QC `Rmd/R/html/figures` From 255147933c10f0ed27f545b5cbd1461b114dcc83 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Tue, 1 Oct 2024 18:27:23 +0000 Subject: [PATCH 84/93] add non-example params files --- inst/templates/chipseq/QC/params_qc.R | 10 ++++++++++ inst/templates/chipseq/diffbind/params_diffbind.R | 9 +++++++++ 2 files changed, 19 insertions(+) create mode 100644 inst/templates/chipseq/QC/params_qc.R create mode 100644 inst/templates/chipseq/diffbind/params_diffbind.R diff --git a/inst/templates/chipseq/QC/params_qc.R b/inst/templates/chipseq/QC/params_qc.R new file mode 100644 index 0000000..125baf5 --- /dev/null +++ b/inst/templates/chipseq/QC/params_qc.R @@ -0,0 +1,10 @@ +# info params + + +coldata_fn='/path/to/nf-core/samplesheet.csv' +# This folder is in the nf-core output directory inside multiqc folder +multiqc_data_dir='/path/to/nf-core/output/multiqc/narrowPeak/multiqc_data/' +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak +peaks_dir = '/path/to/nf-core/output/bowtie2/mergedLibrary/macs2/narrowPeak/' +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak, also includes antibody name +counts_fn = '/path/to/nf-core/output/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/antibody/deseq2/antibody.consensus_peaks.rds' diff --git a/inst/templates/chipseq/diffbind/params_diffbind.R b/inst/templates/chipseq/diffbind/params_diffbind.R new file mode 100644 index 0000000..0a35a81 --- /dev/null +++ b/inst/templates/chipseq/diffbind/params_diffbind.R @@ -0,0 +1,9 @@ +# info params + +coldata_fn='/path/to/nf-core/samplesheet.csv' + +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak +peaks_dir = '/path/to/nf-core/output/bowtie2/mergedLibrary/macs2/narrowPeak/' + +# This folder is in the nf-core output directory +bam_dir = '/path/to/nf-core/output/bowtie2/mergedLibrary/' From 5c912d7170aba34b0214c952a2a21646e488e062 Mon Sep 17 00:00:00 2001 From: Alex Bartlett <74612800+abartlett004@users.noreply.github.com> Date: Tue, 1 Oct 2024 16:01:55 -0400 Subject: [PATCH 85/93] Update readme.md --- inst/templates/chipseq/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/templates/chipseq/readme.md b/inst/templates/chipseq/readme.md index efa37c1..f0e88b3 100755 --- a/inst/templates/chipseq/readme.md +++ b/inst/templates/chipseq/readme.md @@ -43,3 +43,4 @@ On the YAML header file of the Rmd you can specify some parameters including the * peak annotation * functional analysis (coming soon) +This template writes to CSV a log2 normalized counts matrix of peaks x samples as well as the annotated significant results of the differential binding analysis. From c9d6339fbae729ddc7938a727f543fe44eb75e67 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Thu, 3 Oct 2024 21:37:09 +0000 Subject: [PATCH 86/93] begin feedback from meeting --- inst/templates/chipseq/diffbind/diffbind.Rmd | 44 +++++++++++++++----- inst/templates/chipseq/libs/load_data.R | 4 +- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd index ad51426..422faf5 100644 --- a/inst/templates/chipseq/diffbind/diffbind.Rmd +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -23,7 +23,7 @@ params: project_file: ../information.R functions_file: ../libs/load_data.R diffbind_counts_file: diffbind_counts.qs - factor_of_interest: genotype + condition_of_interest: genotype numerator: cKO denominator: WT # species = mouse or human @@ -41,7 +41,7 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) ```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} -# 1. set up factor_of_interest parameter from parameter above or manually +# 1. set up condition_of_interest parameter from parameter above or manually # this is used to color plots, it needs to be part of the metadata # 2. Set input files in this file source(params$params_file) @@ -144,9 +144,9 @@ coldata <- load_coldata(coldata_fn) # make_diffbind_samplesheet is a function provided by bcbioR to help assemble DiffBind's samplesheet # using the nf-core samplesheet and output. In the resulting DiffBind counts object, it -# encodes your factor of interest as "Factor" and the antibody as "Condition" +# encodes your condition of interest as "Condition" and the antibody as "Factor" -samplesheet <- make_diffbind_samplesheet(coldata, bam_dir, peaks_dir, params$factor_of_interest) +samplesheet <- make_diffbind_samplesheet(coldata, bam_dir, peaks_dir, params$condition_of_interest) # if necessary, one additional covariate of interest can be encoded as "Tissue" @@ -217,11 +217,12 @@ coldata_for_pca <- coldata[colnames(norm_counts), ] stopifnot(all(colnames(norm_counts) == rownames(coldata_for_pca))) -degPCA(norm_counts_log, coldata_for_pca, condition = params$factor_of_interest) + +degPCA(norm_counts_log, coldata_for_pca, condition = params$condition_of_interest) + scale_color_cb_friendly() ``` + # Differentially Bound Peaks A standardized differential analysis is performed using DiffBind and the DESeq2 package, @@ -229,21 +230,44 @@ including estimation of size factors and dispersions, fitting and testing the model, evaluating the supplied contrast, and shrinking the LFCs. A p-value and FDR is assigned to each candidate binding site indicating confidence that they are differentially bound. -## Table -```{r DE analysis} -diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Factor', params$numerator, params$denominator)) +```{r DB analysis} +diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Condition', params$numerator, params$denominator)) results_obj <- dba.analyze(diffbind_norm, bGreylist = F) results_report <- dba.report(results_obj, th = 1) results_report_sig <- dba.report(results_obj) + results <- results_report %>% as.data.frame() results_sig <- results_report_sig %>% as.data.frame() +``` + +## MA plot + +This plot can help to: +- Identify Differential Binding: Sites that show a significant log-fold change (M value away from 0) indicate changes in binding between conditions. +- Assess Data Quality: The plot can help in identifying biases or systematic errors in the data. Ideally, most points should scatter around the M=0 line, indicating that there is no significant systematic difference between the conditions. +- Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of binding levels and any patterns or anomalies in the dataset. + +```{r MA plot} +results_for_ma <- results%>% + mutate(peak = paste(seqnames, start, end, sep = '_')) %>% + mutate(t = 0) %>% + dplyr::select(peak, AveExpr = Conc, logFC = Fold, P.Value = p.value, adj.P.Val = FDR, t) +rownames(results_for_ma) <- results_for_ma$peak +degMA(as.DEGSet(results_for_ma, contrast = paste(params$numerator, params$denominator, sep = ' vs. '))) + +``` + +## Table of differentially bound peaks + +```{r DB table} results_sig %>% sanitize_datatable() ``` + ## Volcano plot This volcano plot shows the binding sites that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are sites that have padj < 0.05 and a log2-fold change magnitude > 0.5. Points in blue have a padj > 0.05 and a log2-fold change magnitude > 0.5. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2-fold change and padj that we have chosen. @@ -261,7 +285,7 @@ EnhancedVolcano(results_mod, FCcutoff = 0.5, x = 'Fold', y = 'FDR', - title = paste(params$factor_of_interest, ':', params$numerator, 'vs', params$denominator), + title = paste(params$condition_of_interest, ':', params$numerator, 'vs', params$denominator), col=as.vector(colors[c("dark_grey", "light_blue", "purple", "purple")]), subtitle = "", drawConnectors = T, max.overlaps = Inf) @@ -280,7 +304,7 @@ norm_counts_log_long <- norm_counts_log %>% as.data.frame() %>% norm_counts_log_long_top <- norm_counts_log_long %>% filter(peak %in% show$peak) -ggplot(norm_counts_log_long_top, aes(x = .data[[params$factor_of_interest]], y = norm_counts_log2)) + +ggplot(norm_counts_log_long_top, aes(x = .data[[params$condition_of_interest]], y = norm_counts_log2)) + facet_wrap(~peak, scale = 'free_y') + geom_boxplot() ``` diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R index e233ec4..fec3d4a 100755 --- a/inst/templates/chipseq/libs/load_data.R +++ b/inst/templates/chipseq/libs/load_data.R @@ -103,9 +103,9 @@ make_diffbind_samplesheet <- function(coldata, bam_dir, peaks_dir, column = NULL coldata_for_diffbind <- coldata %>% filter(!is.na(control) & control != '') %>% - dplyr::rename(ControlID = control, SampleID = sample, Condition = antibody) %>% + dplyr::rename(ControlID = control, SampleID = sample, Factor = antibody) %>% separate(SampleID, into = c('sample', 'Replicate'), remove = F, sep = '_REP') - coldata_for_diffbind$Factor <- coldata_for_diffbind[[column]] + coldata_for_diffbind$Condition <- coldata_for_diffbind[[column]] samplesheet <- coldata_for_diffbind %>% left_join(bam_files %>% dplyr::select(SampleID = sample, bamReads = bam), by = 'SampleID') %>% From 98ac197a640c9ce59e70be66a7c63b4041c94bc2 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Fri, 4 Oct 2024 15:22:41 -0400 Subject: [PATCH 87/93] fix rnaseq readme and add sc test dataset --- inst/templates/rnaseq/DE/DEG.Rmd | 38 +-- ...on-intersections.Rmd => Intersections.Rmd} | 0 inst/templates/rnaseq/README.md | 40 +-- .../scRNA_normalization_template.rmd | 315 +++++++----------- inst/templates/singlecell/information.R | 14 +- 5 files changed, 144 insertions(+), 263 deletions(-) rename inst/templates/rnaseq/DE/{Comparison-intersections.Rmd => Intersections.Rmd} (100%) diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd index dffa6dd..2e9c469 100644 --- a/inst/templates/rnaseq/DE/DEG.Rmd +++ b/inst/templates/rnaseq/DE/DEG.Rmd @@ -25,7 +25,7 @@ params: subset_column: null subset_value: null genome: hg38 - ruv: false + ruv: true combatseq: false params_file: params_de-example.R project_file: ../information.R @@ -59,6 +59,7 @@ subset_column=params$subset_column subset_value=params$subset_value run_ruv=params$ruv run_combatseq=params$combatseq +run_rmv=run_ruv | run_combatseq factor_of_interest <- column ``` @@ -240,21 +241,22 @@ new_cdata <- coldata For this study, this formula is recommended: `r as.character(formula)` - ```{r, eval=F, echo=FALSE} #### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION or remove this section ``` ## Remove unwanted variation -```{r, eval=run_ruv, results='asis', echo=run_ruv} -cat("### Assessing unknown factors +Removing unwanted variation from RNA-seq analysis is essential to ensure that the results reflect biological rather than technical differences. Methods like ComBat, RUVseq, or surrogate variable analysis (SVA) can be applied to adjust for batch effects, library preparation discrepancies, or other confounders. These techniques model and subtract the unwanted variation, enhancing the ability to detect true biological signals in the data. Proper normalization and careful experimental design are also crucial steps to mitigate such unwanted variation. -When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") -``` +### Assessing unknown factors -```{r, eval=!run_ruv, results='asis', echo=run_ruv} -cat("There is no need to assess unknown factor for this study.") +```{r, results='asis'} +if (run_ruv){ + cat("When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") +}else{ + cat("There is no need to assess unknown factor for this study.") +} ``` ```{r do_RUV, eval=run_ruv, echo=run_ruv} @@ -291,15 +293,13 @@ dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) vsd_to_use<- vst(dds_to_use, blind=FALSE) ``` +### Remove Batch Effects ```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq} # NOTE Combatseq (part of the SVA package) - corrected count, removing the effects while retaining the structure of the data. Used in a scenario where you know what covariate/batch is. Do not add know-removed known covariates to DESeq2 formula. Also, don’t attempt to remove biological effect (e.g. donor), this is not conceptually valid; best for technical variation. library(sva) -cat("### Remove Batch Effects - -Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. - +cat("Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).") ``` @@ -309,10 +309,9 @@ cat("There is no need to remove known factors like batch effect in this study.") ``` ```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq} +# NOTE work on this code if you need to run combatseq -## FILL OUT THIS CHUNK OF CODE IF YOU WANT TO RUN COMBATSEQ - -## Set your batch effect variable here this is the variable that combatseq will try to remove +# Set your batch effect variable here this is the variable that combatseq will try to remove ## Column name of your batch variable to_remove = "batch" @@ -321,11 +320,9 @@ to_remove = "batch" to_keep = "sample_type" - coldata[[to_remove]] <- as.factor(coldata[[to_remove]]) coldata[[to_keep]] <- as.factor(coldata[[to_keep]]) - batch = coldata[[to_remove]] treatment = coldata[[to_keep]] @@ -335,18 +332,15 @@ treatment = coldata[[to_keep]] #treatment2 = metrics[[to_keep]] #treatment3 = metrics[[to_keep]] - # imp = cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3))) - ``` ```{r do_combatseq, eval=run_combatseq} adjusted_counts <- ComBat_seq(as.matrix(counts), batch=batch, group = treatment) -## For multiple variables of interest +#NOTE For multiple variables of interest # adjusted_counts <- ComBat_seq(as.matrix(counts2), batch=batch, covar_mod = imp) - ``` ```{r after_combatseq, eval=run_combatseq} @@ -361,7 +355,6 @@ pca_combat <- degPCA(norm_matrix, coldata, pca_combat + scale_color_cb_friendly() ``` - # Differential Expression Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model. @@ -579,7 +572,6 @@ for (contrast in names(de_list)){ tagList(dt_list) ``` - # Pathway Analysis- Over-representation Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially expressed genes (DEGs) from RNA-seq data. Adventages of ORA: diff --git a/inst/templates/rnaseq/DE/Comparison-intersections.Rmd b/inst/templates/rnaseq/DE/Intersections.Rmd similarity index 100% rename from inst/templates/rnaseq/DE/Comparison-intersections.Rmd rename to inst/templates/rnaseq/DE/Intersections.Rmd diff --git a/inst/templates/rnaseq/README.md b/inst/templates/rnaseq/README.md index 6de2117..42ffbd9 100644 --- a/inst/templates/rnaseq/README.md +++ b/inst/templates/rnaseq/README.md @@ -4,25 +4,8 @@ Make sure there is a project name for this. ## Run data with nf-core rnaseq -- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) -- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input/rawdata` folder -- Prepare the CSV file according this [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: - -```csv -sample,fastq_1,fastq_2,strandedness -CONTROL_REP1,s3path/AEG588A1_S1_L002_R1_001.fastq.gz,s3path/AEG588A1_S1_L002_R2_001.fastq.gz,auto -CONTROL_REP1,s3path/AEG588A1_S1_L003_R1_001.fastq.gz,s3path/AEG588A1_S1_L003_R2_001.fastq.gz,auto -CONTROL_REP1,s3path/AEG588A1_S1_L004_R1_001.fastq.gz,s3path/AEG588A1_S1_L004_R2_001.fastq.gz,auto -``` - -Use `bcbio_nfcore_check(csv_file)` to check the file is correct. - -You can add more columns to this file with more metadata, and use this file as the `coldata` file in the templates. - -- Upload file to our `Datasets` in Seqera using the name of the project but starting with `rnaseq-pi_lastname-hbc_code` -- Go to `Launchpad`, select `nf-core_rnaseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` - - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 -- When pipeline is down, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder +This templates assume data has been processed by [nf-core/rnasea](https://nf-co.re/rnaseq/3.14.0/docs/usage). +We recommend to use the samplesheet.csv used with nf-core as metadata file, where other relevant columns can be there even if they are not used by the pipeline. ## Downstream analysis @@ -48,19 +31,10 @@ On the `YAML` header file of the `Rmd` you can specify some parameters or just s - Pathway analysis - Tables -## DropBox +### Other templates + +- `DE/GSVA.Rmd` shows an example on how to use [GSVA package](https://bioconductor.org/packages/release/bioc/html/GSVA.html) for estimating variation of gene set enrichment through the samples of a expression data set +- `DE/Cross-comparison-analysis.Rmd` shows an exmaple on how to compare two differential expression analysis from the `DEG.Rmd` template. +- `DE/Intersections.Rmd` shows an example on how to compare multiple differential expression analyses from `DE/DEG.Rmd` and find intersections. -- In `reports/QC` - - [ ] copy `bcbio-se.rds` and `tximport-counts.csv` - - [ ] copy QC `Rmd/R/html/figures` -- In `reports/DE` - - [ ] Normalized counts for all genes x all samples (csv format) -- In `reports/DE`, for *each analysis*: - - **Note** For multiple comparisons/analysis, do a single report/template if possible in the parent folder using parameters whenever possible. - - Create a folder with the comparison names in the files. Numbering by comparison (`01.1_DE_comp1`, `01.2_DE_comp2`, etc.). If you’re running multiple models for the same comparison, append `_M#`. Add the following files under each folder: - - [ ] Normalized count table with the samples used in this analysis/comparison. - - [ ] Full results `DESeq2` for all genes (csv format) with annotation columns appended. - - [ ] Significant genes results file (subset of annotated full results by chosen p-value and LFC). Separate files will be created for each individual contrast. - - [ ] Significant genes results file as described above, but additionally append columns containing normalized count values for each sample. - - Make sure to append the gene symbols to these tables so the researcher can interpret the results. diff --git a/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd b/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd index a8f3c49..2082b9d 100644 --- a/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd +++ b/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd @@ -17,18 +17,33 @@ output: editor_options: chunk_output_type: inline params: - params_file: parameters.R project_file: ../information.R --- Template developed with materials from https://hbctraining.github.io/main/. -```{r} +```{r, message=FALSE, warning=FALSE} # This set up the working directory to this file so all files can be found library(rstudioapi) +library(tidyverse) setwd(fs::path_dir(getSourceEditorContext()$path)) ``` +```{r} +# parameters +## Cell cycle markers for c.elegans, human, mouse, D. rerio, and D. melanogaster can be found here: https://github.com/hbc/tinyatlas/tree/1e2136a35e773f14d97ae9cbdb6c375327b2dd2b/cell_cycle +## This files needs gene_name and phase columns to work with this template + +cell_cycle_file="https://github.com/bcbio/resources/raw/refs/heads/main/singlecell/human_cell_cycle.csv" +# Source cell cycle markers +cc_markers = read_csv(cell_cycle_file) +stopifnot(c("gene_name", "phase") %in% colnames(cc_markers)) + +seurat_obj="https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/tiny.rds" +seurat_output="/tmp/seurat_clust.rds" + +source(params$project_file) +``` ```{r setup, cache=FALSE, message=FALSE, warning=FALSE, echo=FALSE} @@ -38,7 +53,6 @@ library(Seurat) library(harmony) library(knitr) library(rmarkdown) -library(tidyverse) library(data.table) library(DT) library(patchwork) @@ -81,14 +95,14 @@ The Seurat object used as input for this report was prepared with the thresholds ```{r load_data, cache = TRUE} - # Loading QC'd object -seurat_qc <- readRDS( "seurat_post-QC.rds") -DefaultAssay(seurat_qc) <- "RNA" - -# Fix number of PCs for RNA assay -rpcs <- 50 +if (isUrl(seurat_obj)){ + seurat_qc <- readRDS(url(seurat_obj)) +}else{ + seurat_qc <- readRDS(seurat_obj) +} +DefaultAssay(seurat_qc) <- "RNA" # Define color scales for up to 24 clusters/samples colsD <- RColorBrewer::brewer.pal(8, "Dark2") @@ -97,7 +111,6 @@ colsL <- RColorBrewer::brewer.pal(8, "Pastel2") # Stack same colors from dark to pastel cols3 <- unlist(strsplit(paste(colsD, colsM, colsL, sep = "_"), "_")) cols2 <- c(unlist(strsplit(paste(colsD, colsM, sep = "_"), "_")), "deepskyblue2") - ``` @@ -114,11 +127,10 @@ In this section, we look at potential confounding variables in our (post-QC) dat To enable meaningful visualization of the data, we apply a minimal normalization to our raw data (log-normalization). We then identify the top 2000 most variable genes across the log-normalized data, i.e. those with the greatest variability in expression level from one cell to the next. Finally, we calculate principal components (PCs) based on these top 2000 most variable genes, and use the first 50 PCs to derive reduced UMAP (Uniform Manifold Approximation and Projection) components. - **We start with log normalization because it is good to observe the data and any trends using a simple transformation. More complex methods like SCT can alter the data in a way that is not as intuitive to interpret.** ```{r rna_norm0, warning=FALSE, message=FALSE} - #Normalize data +#Normalize data seurat_lognorm <- NormalizeData(seurat_qc, normalization.method = "LogNormalize", scale.factor = 10000) @@ -133,11 +145,8 @@ seurat_lognorm <- ScaleData(seurat_lognorm, model.use = "linear") # Calculate PCs and UMAP seurat_lognorm <- RunPCA(seurat_lognorm) seurat_lognorm <- RunUMAP(seurat_lognorm, 1:50) - - ``` - ## Examine highly variable genes Highly variable gene selection is extremely important since many downstream steps are computed only on these genes. Seurat allows us to access the ranked highly variable genes with the VariableFeatures() function. We can additionally visualize the dispersion of all genes using Seurat’s VariableFeaturePlot(), which shows a gene’s average expression across all cells on the x-axis and variance on the y-axis. Ideally we want to use genes that have high variance since this can indicate a change in expression depending on populations of cells. Adding labels using the LabelPoints() helps us understand which genes will be driving shape of our data. @@ -153,26 +162,16 @@ p <- VariableFeaturePlot(seurat_lognorm) LabelPoints(plot = p, points = top_genes, repel = TRUE) ``` - - - - ## Sample x covariates We then use the UMAP reduction to explore our dataset and assess how different variables influence cell clustering. Throughout this report, **UMAP representations are split by various covariates**, to enable checking for potential phenotype-specific clustering. - ```{r} - ## Below is an example plot, change the group.by and split.by parameters to make plots with your own covariates. - -UMAPPlot(seurat_norm, group.by = "orig.ident", split.by = "Sex") + ggtitle("UMAP (split by Surgery)") - - +UMAPPlot(seurat_lognorm, group.by = "orig.ident") + ggtitle("UMAP") ``` - ## Cell cycle The phase of the cell cycle that cells are in at the time of sample preparation can introduce some variability in the transcriptome that we are not interested in exploring. @@ -183,36 +182,24 @@ Unless cells very strongly cluster by phase of the cell cycle (which is not the ```{r cell_cycle_scoring, message=FALSE, warning=FALSE} # Step 1 - Get cell cycle markers -## Cell cycle markers for c.elegans, human, mouse, D. rerio, and D. melanogaster can be found here: https://github.com/hbc/tinyatlas/tree/1e2136a35e773f14d97ae9cbdb6c375327b2dd2b/cell_cycle - - -# Source cell cycle markers -cc_markers = read.csv("/path/to/cell/cycle/markers", sep=",") - - # Compute cell cycle score for each cell -seurat_norm <- CellCycleScoring(seurat_norm, - g2m.features = cc_markers$external_gene_name[cc_markers$phase=="G2/M"], - s.features = cc_markers$external_gene_name[cc_markers$phase=="S"]) - +## NOTE use the right column names for cc_markers if they are different than +# external_gene_name and phase +seurat_lognorm <- CellCycleScoring(seurat_lognorm, + g2m.features = cc_markers$gene_name[cc_markers$phase=="G2/M"], + s.features = cc_markers$gene_name[cc_markers$phase=="S"]) ## Plot cell cycle (grouped by) along with covariates (split.by). Add in your covariates of interest -UMAPPlot(seurat_norm, group.by = "Phase", split.by = "sex") + ggtitle("UMAP (split by sex)") - - +UMAPPlot(seurat_lognorm, group.by = "Phase") + ggtitle("UMAP") ``` - - ## mitoRatio The mitochondrial to nuclear gene ratio (mitoRatio) is a marker of cellular stress and might also affect cell clustering. For this dataset, we have seen during QC that the fraction of mitochondrial genes was negligible (which is good). Therefore, we do not expect the need to regress out this variable for normalization purposes, but it's always good to check. ```{r mito_ratio} - - ## This custom function by Amelie Jule creates great plots for looking at different QC parameters across the UMAP signaturePlot <- function(seurat_object, @@ -244,72 +231,47 @@ signaturePlot <- function(seurat_object, } - - ### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] -g <- signaturePlot(seurat_norm, +g <- signaturePlot(seurat_lognorm, gene_signature = "mitoRatio", - split_var = "sex") -g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") - + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]]+ ggtitle("S2") ``` - ## nUMIs (nCount) - ```{r} - ### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] -g <- signaturePlot(seurat_norm, +g <- signaturePlot(seurat_lognorm, gene_signature = "nCount_RNA", - split_var = "sex") -g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") - - - + split_var = "subj") +g[[1]] + ggtitle("S1") | g[[2]]+ ggtitle("S2") ``` - - ## nGenes (nFeature) - ```{r} - ### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] - -g <- signaturePlot(seurat_norm, +g <- signaturePlot(seurat_lognorm, gene_signature = "nFeature_RNA", - split_var = "sex") -g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") - - + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]]+ ggtitle("S2") ``` - ## Complexity - ```{r} - ### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] - -g <- signaturePlot(seurat_norm, +g <- signaturePlot(seurat_lognorm, gene_signature = "Log10GenesPerUMI", - split_var = "sex") -g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") - - - + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]]+ ggtitle("S2") ``` - - # SCT Normalization {.tabset} Now that we have established which effects are observed in our data, we can use the SCTransform method to regress out these effects. The SCTransform method was proposed as a better alternative to the log transform normalization method that we used for exploring sources of unwanted variation. The method not only normalizes data, but it also performs a variance stabilization and allows for additional covariates to be regressed out. @@ -318,56 +280,59 @@ All genes cannot be treated the same, as such, the SCTransform method constructs We searched for the top 3000 genes with the largest variability in expression level from cell to cell after SCT-normalization, and re-calculated our principal and UMAP components based on the SCT-normalized data for these top genes. - **We keep each sample separate for SCT normalization.** -```{r rna_norm1, warning=FALSE, message=FALSE, eval=FALSE} +```{r, eval=file.exists("seurat_sct.rds")} +#NOTE run the chunck below to create this object, and loading will be used while +# knitting to speed up the rendering +seurat_sctnorm = readRDS("seurat_sct.rds") +``` + +```{r rna_norm1, warning=FALSE, message=FALSE, eval=!exists("seurat_sctnorm")} +#NOTE: this should be ran previous rendering to prepare the object ## Note that this single command replaces NormalizeData(), ScaleData(), and FindVariableFeatures() ## SCT can be run with and without regressing out variables. Generally we do not regress out covariates. However, we provide both options below. - ## To properly integrate with harmony we split our object by sample first. -seurat_sctnorm <- SplitObject(seurat_norm, split.by = "orig.ident") - -for (i in 1:length(split_sctnorm)) { - split_sctnorm[[i]] <- SCTransform(split_sctnorm[[i]], vst.flavor = "v2", variable.features.n = 3000) -} - - +# split_sctnorm <- SplitObject(seurat_lognorm, split.by = "orig.ident") +# +# for (i in 1:length(split_sctnorm)) { +# split_sctnorm[[i]] <- SCTransform(split_sctnorm[[i]], +# vst.flavor = "v2", +# variable.features.n = 3000) +# } +# +# integ_features <- SelectIntegrationFeatures(object.list = split_seurat, +# nfeatures = 3000) +# split_seurat <- PrepSCTIntegration(object.list = split_seurat, +# anchor.features = integ_features) #for (i in 1:length(split_seurat)) { # split_seurat[[i]] <- SCTransform(split_seurat[[i]], vars.to.regress = c("mitoRatio"), vst.flavor = "v2", variable.features.n = 3000) -} - - +# } +seurat_lognorm[["RNA"]] <- split(seurat_lognorm[["RNA"]], f= seurat_lognorm$orig.ident) +seurat_sctnorm <- SCTransform(seurat_lognorm, + vst.flavor = "v2", + variable.features.n = 3000) +saveRDS(seurat_sctnorm, file="seurat_sct.rds") ``` - ## Look at UMAPs post SCT The plots below show the same variables as before, this time **displayed on the UMAP calculated after applying SCT-normalization**. We qualitatively reviewed the "structure" in our normalized data projection . We were particularly interested in seeing whether similar cell populations across samples clustered together (i.e. overlapped on the UMAP). - - ```{r, fig.width=10} - - DefaultAssay(seurat_sctnorm) <- "SCT" - - -UMAPPlot(seurat_sctnorm, group.by = "sex") + ggtitle("UMAP by sex") - +seurat_sctnorm <- RunPCA(seurat_sctnorm) +UMAPPlot(seurat_sctnorm, group.by = "orig.ident") + ggtitle("UMAP") ``` - - - -### Cell cycle +## Cell cycle The phase of the cell cycle that cells are in at the time of sample preparation can introduce some variability in the transcriptome that we are not interested in exploring. @@ -375,38 +340,23 @@ To examine cell cycle variation in our data, we assign a score to each cell, der Unless cells very strongly cluster by phase of the cell cycle (which is not the case here), we do not recommend to regress out the effect of the cell cycle. -```{r cell_cycle_scoring, message=FALSE, warning=FALSE} +```{r cell_cycle_scoring2, message=FALSE, warning=FALSE} # Step 1 - Get cell cycle markers ## Cell cycle markers for c.elegans, human, mouse, D. rerio, and D. melanogaster can be found here: https://github.com/hbc/tinyatlas/tree/1e2136a35e773f14d97ae9cbdb6c375327b2dd2b/cell_cycle - - -# Source cell cycle markers -cc_markers = read.csv("/path/to/cell/cycle/markers", sep=",") - - - # Compute cell cycle score for each cell seurat_sctnorm <- CellCycleScoring(seurat_sctnorm, - g2m.features = cc_markers$external_gene_name[cc_markers$phase=="G2/M"], - s.features = cc_markers$external_gene_name[cc_markers$phase=="S"]) - + g2m.features = cc_markers$gene_name[cc_markers$phase=="G2/M"], + s.features = cc_markers$gene_name[cc_markers$phase=="S"]) ## Plot cell cycle (grouped by) along with covariates (split.by). Add in your covariates of interest - -UMAPPlot(seurat_sctnorm, group.by = "Phase", split.by = "sex") + ggtitle("UMAP (split by sex)") - - +UMAPPlot(seurat_sctnorm, group.by = "Phase", split.by = "orig.ident") + ggtitle("UMAP") ``` - - -### mitoRatio +## mitoRatio The mitochondrial to nuclear gene ratio (mitoRatio) is a marker of cellular stress and might also affect cell clustering. For this dataset, we have seen during QC that the fraction of mitochondrial genes was negligible (which is good). Therefore, we do not expect the need to regress out this variable for normalization purposes, but it's always good to check. -```{r mito_ratio} - - +```{r mito_ratio2} ## This custom function by Amelie Jule creates great plots for looking at different QC parameters across the UMAP signaturePlot <- function(seurat_object, @@ -438,71 +388,49 @@ signaturePlot <- function(seurat_object, } - - ### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] g <- signaturePlot(seurat_sctnorm, gene_signature = "mitoRatio", - split_var = "sex") -g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") - + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]] + ggtitle("S2") ``` - -### nUMIs (nCount) - +## nUMIs (nCount) ```{r} - ### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] g <- signaturePlot(seurat_sctnorm, gene_signature = "nCount_RNA", - split_var = "sex") -g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") - - - + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]] + ggtitle("S2") ``` - - -### nGenes (nFeature) - +## nGenes (nFeature) ```{r} - ### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] - g <- signaturePlot(seurat_sctnorm, gene_signature = "nFeature_RNA", - split_var = "sex") -g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") - - + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]] + ggtitle("S2") ``` - ## Complexity - ```{r} ### Here we apply the function. gene_signature is whatever qc metric you care about and split_var should be a covariate of interest. Adjust the titles to match the covariate groups. If you have more than 2 covariate groups then you will have multiple plots g[[3]]....g[[n]] - g <- signaturePlot(seurat_sctnorm, gene_signature = "Log10GenesPerUMI", - split_var = "sex") -g[[1]] + ggtitle("Female") | g[[2]] + ggtitle("Male") - - - + split_var = "orig.ident") +g[[1]] + ggtitle("S1") | g[[2]] + ggtitle("S2") ``` -## Run harmony and umaps again +# Run harmony and umaps again If cells cluster by sample, condition, batch, dataset, modality, this integration step can greatly improve the clustering and the downstream analyses. @@ -510,15 +438,21 @@ To integrate, we will use the shared highly variable genes (identified using SCT We use [`Harmony`](https://portals.broadinstitute.org/harmony/articles/quickstart.html), which is based on a transformation of principal components (PCs) to find similarities across datasets. Here we group samples by the original sample id. -```{r} +```{r, eval=file.exists("seurat_sct.rds")} +#NOTE run the chunck below to create this object, and loading will be used while +# knitting to speed up the rendering +seurat_harmony = readRDS("seurat_sct.rds") +``` +```{r, eval=exists("seurat_harmony"), warning=FALSE, message=FALSE} ## Here seurat will integrate on the level of sample id. If you want to integrate on other aspects the SCT normalization will need to be done with all of the data together. - -seurat_harmony <- IntegrateLayers(object = seurat_sctnorm, method = HarmonyIntegration, +seurat_harmony <- IntegrateLayers(object = seurat_sctnorm, + method = HarmonyIntegration, orig.reduction = "pca", new.reduction = 'harmony', assay = "SCT", verbose = FALSE) +seurat_sctnorm <- RunPCA(seurat_sctnorm) seurat_harmony <- RunUMAP(seurat_harmony, reduction = "harmony", dims = 1:40, reduction.name = "umap.harmony") - +saveRDS(seurat_harmony, file="seurat_harmony.rds") ``` @@ -534,49 +468,37 @@ p2 <- DimPlot(seurat_harmony, group.by = "orig.ident", theme(legend.position = "bottom") + ggtitle("post-integration") - p1 | p2 ``` - ## Clustering -For single-modality scRNA-seq analysis, `Seurat` clusters the cells using a Louvain clustering approach. First, a K-nearest neighbor (KNN) graph is built, where cells are connected if they have a similar transcriptome, as determined from their scores on the first `r rpcs` PCs. Then, the graph is partitioned into "communities" or "clusters" of interconnected cells that are more tightly connected with each other than with cells outside of the corresponding cluster. +For single-modality scRNA-seq analysis, `Seurat` clusters the cells using a Louvain clustering approach. First, a K-nearest neighbor (KNN) graph is built, where cells are connected if they have a similar transcriptome, as determined from their scores on the first PCs. Then, the graph is partitioned into "communities" or "clusters" of interconnected cells that are more tightly connected with each other than with cells outside of the corresponding cluster. A limitation of this approach is that the number of identified clusters depends on the chosen resolution, a parameter that must be set by the user and does not necessarily reflect the underlying biology of the dataset. For most single-cell datasets, a resolution of 0.1 to 1 will provide a reasonable number of clusters. Complex datasets with multiple cell types may require a larger resolution, and vice versa. - - ```{r find_neighbors all, echo=TRUE} -seurat_harmony <- FindNeighbors(seurat_harmony, reduction = "harmony", dims = 1:40) - - +seurat_harmony <- FindNeighbors(seurat_harmony, assay = "SCT", + reduction = "harmony", dims = 1:40) +# check graph names names(seurat_harmony@graphs) +# DefaultAssay(object = seurat_harmony[["pca"]]) seurat_clust <- FindClusters(object = seurat_harmony, - resolution = c(0.1, 0.2, 0.4, 0.6, 0.8, 1.0), verbose = FALSE) - + resolution = c(0.1, 0.2, 0.4, 0.6, 0.8, 1.0), + verbose = FALSE) ``` - - ## Clustering Tree - We build a clustering tree using the [clustree](https://lazappi.github.io/clustree/articles/clustree.html) package to show how cells move as the clustering resolution is increased. Each cluster forms a node in the tree and edges are constructed by considering the cells in a cluster at a lower resolution (say 𝑘=2) that end up in a cluster at the next highest resolution (say 𝑘=3). By connecting clusters in this way we can see how clusters are related to each other, which are clearly distinct and which are unstable. The size of each node is related to the number of samples in each cluster and the color indicates the clustering resolution. Edges are colored according to the number of samples they represent and the transparency shows the incoming node proportion, the number of samples in the edge divided by the number of samples in the node it points to. - - ```{r, fig.height=10, fig.width=8} library(clustree) meta = seurat_clust@meta.data meta = na.omit(meta) - ## Change the prefix to match your clusters - clustree(meta, prefix = "SCT_snn_res.") - - ``` ## Visualize clusters {.tabset} @@ -586,54 +508,49 @@ We take a look at how the clusters look at resolutions 0.1, 0.2,0.4, and 0.6 ### 0.1 ```{r umap_0.1} -cluster_res <- 0.1 +cluster_res <- 0.2 Idents(object = seurat_clust) <- paste0("SCT_snn_res.", cluster_res) DimPlot(seurat_clust, - reduction = "umap", - split.by = "surgery", + reduction = "umap.harmony", + split.by = "orig.ident", label = TRUE) ``` * * * - ### 0.2 ```{r umap_0.2} cluster_res <- 0.2 Idents(object = seurat_clust) <- paste0("SCT_snn_res.", cluster_res) DimPlot(seurat_clust, - reduction = "umap", - split.by = "surgery", + reduction = "umap.harmony", + split.by = "orig.ident", label = TRUE) ``` * * * - - ### 0.4 ```{r umap_0.4} cluster_res <- 0.4 Idents(object = seurat_clust) <- paste0("SCT_snn_res.", cluster_res) DimPlot(seurat_clust, - reduction = "umap", - split.by = "surgery", + reduction = "umap.harmony", + split.by = "orig.ident", label = TRUE) ``` * * * - - ### 0.6 ```{r umap_0.6} cluster_res <- 0.6 Idents(object = seurat_clust) <- paste0("SCT_snn_res.", cluster_res) DimPlot(seurat_clust, - reduction = "umap", - split.by = "surgery", + reduction = "umap.harmony", + split.by = "orig.ident", label = TRUE) ``` @@ -641,6 +558,6 @@ DimPlot(seurat_clust, ```{r} -saveRDS(seurat_clust, "seurat_clust.RDs") +saveRDS(seurat_clust, file=seurat_output) ``` diff --git a/inst/templates/singlecell/information.R b/inst/templates/singlecell/information.R index 3444196..6e15eef 100644 --- a/inst/templates/singlecell/information.R +++ b/inst/templates/singlecell/information.R @@ -1,8 +1,6 @@ -# project params -root = "../" -date = "YYYYMMDD" -column = "treatment" -subset_column = 'cell' -metadata_fn = "../meta/samplesheet.csv" -counts_fn = '../data/tximport-counts.csv' -basedir <- 'reports' +# info params +project = "name_hbcXXXXX" +PI = 'person name' +experiment = 'short description' +aim = 'short description' +analyst = 'person in the core' From 1b7dc268dd8594c1b29627211030342cd9b798c6 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 9 Oct 2024 10:20:36 -0400 Subject: [PATCH 88/93] adding checks and status to all rmd --- R/helpers.R | 46 +- inst/templates/base/README.md | 21 +- inst/templates/base/reports/example.Rmd | 45 ++ inst/templates/chipseq/QC/QC.Rmd | 11 +- inst/templates/chipseq/diffbind/diffbind.Rmd | 9 +- inst/templates/chipseq/readme.md | 27 +- inst/templates/methylation/QC/QC.Rmd | 15 +- inst/templates/multiomics/teaseq/QC/QC.Rmd | 13 + .../rnaseq/DE/Cross-comparison-analysis.Rmd | 14 + inst/templates/rnaseq/DE/DEG.Rmd | 6 + inst/templates/rnaseq/DE/GSVA.Rmd | 20 +- inst/templates/rnaseq/DE/Intersections.Rmd | 15 + inst/templates/rnaseq/QC/QC_nf-core.Rmd | 6 + inst/templates/rnaseq/README.md | 2 +- inst/templates/rnaseq/org/hcbc/README.md | 75 +++ .../singlecell/CellToCell/cellchat.Rmd | 440 ------------------ .../scRNA_normalization_template.rmd | 8 + inst/templates/singlecell/QC/QC.rmd | 51 +- inst/templates/singlecell/README.md | 9 +- .../singlecell/scripts/seurat_init.R | 9 + .../singlecell_delux/CellToCell/cellchat.Rmd | 22 +- inst/templates/spatial/cosmx/QC/QC.Rmd | 17 +- 22 files changed, 340 insertions(+), 541 deletions(-) create mode 100644 inst/templates/base/reports/example.Rmd create mode 100644 inst/templates/rnaseq/org/hcbc/README.md delete mode 100644 inst/templates/singlecell/CellToCell/cellchat.Rmd diff --git a/R/helpers.R b/R/helpers.R index c2d7509..db1e9b4 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -44,6 +44,7 @@ bcbio_nfcore_check <- function(file){ #' @param type string indicating the type of analysis, supported: rnaseq. #' #' @param outpath string path indicating where to copy all the files to +#' @param org string with the organization name. To deploy specific files. #' @examples #' \dontrun{ #' path <- withr::local_tempdir() @@ -51,32 +52,40 @@ bcbio_nfcore_check <- function(file){ #' fs::dir_ls(path,all=T) #' } #' @export -bcbio_templates <- function(type="rnaseq", outpath){ +bcbio_templates <- function(type="rnaseq", outpath=NULL, org=NULL){ + if (type=="all"){ + usethis::ui_info("Showing analysis:") + msg <- basename(fs::dir_ls(fs::path_package("bcbioR", "templates"))) + return(msg) + } + if (is.null(outpath)){ + usethis::ui_stop("outpath needs to be defined.") + } fs::dir_create(outpath) switch(type, base={ #file.copy(fpath, outpath, recursive = T) - copy_templates(outpath, "base") + copy_templates(outpath, "base", org) }, rnaseq={ #file.copy(fpath, outpath, recursive = T) - copy_templates(outpath, "nf-core/rnaseq") + copy_templates(outpath, "nf-core/rnaseq", org) }, singlecell={ #file.copy(fpath, outpath, recursive = T) - copy_templates(outpath, "singlecell") + copy_templates(outpath, "singlecell", org) }, singlecell_delux={ #file.copy(fpath, outpath, recursive = T) - copy_templates(outpath, "singlecell_delux") + copy_templates(outpath, "singlecell_delux", org) }, spatial={ #file.copy(fpath, outpath, recursive = T) - copy_templates(outpath, "spatial") + copy_templates(outpath, "spatial", org) }, multiomics={ #file.copy(fpath, outpath, recursive = T) - copy_templates(outpath, "multiomics") + copy_templates(outpath, "multiomics", org) }, { stop('project type not recognize, please choose: ', 'rnaseq', 'singlecell','singlecell_delux','spatial') @@ -134,20 +143,22 @@ detect_gitignores <- function(path){ }) } -copy_files_in_folder<- function(origin, remote){ +copy_files_in_folder<- function(origin, remote, is_org=FALSE){ to_copy <- fs::dir_ls(origin,all = TRUE) - to_copy <- grep("org", to_copy, - value = TRUE, invert = TRUE) + if (!is_org) { + to_copy <- grep("org", to_copy, + value = TRUE, invert = TRUE) + } for (element in to_copy){ full_new_path <- fs::path_join(c(remote, fs::path_file(element))) if (fs::is_dir(element)){ - if (!(fs::dir_exists(full_new_path))) - fs::dir_copy(element, full_new_path) + if (!(fs::dir_exists(full_new_path)) | is_org) + fs::dir_copy(element, full_new_path, overwrite = is_org) } if (fs::is_file(element)){ - if (!(fs::file_exists(full_new_path))) - fs::file_copy(element, full_new_path) + if (!(fs::file_exists(full_new_path)) | is_org) + fs::file_copy(element, full_new_path, overwrite = is_org) } } detect_gitignores(remote) @@ -164,7 +175,7 @@ deploy_apps <- function(apps, path){ }) } -copy_templates <- function(path, pipeline){ +copy_templates <- function(path, pipeline, org=NULL){ apps=list() base = c("bcbioR") if (pipeline=="base"){ @@ -182,11 +193,16 @@ copy_templates <- function(path, pipeline){ parts = c("templates/spatial") } analysis_template <- fs::path_package(base, parts) + org_template <- fs::path_package(base, parts, "org", org) + ui_info("Getting templates from {ui_value(analysis_template)}") # ls_files <- grep("org", list.files(analysis_template, full.names = TRUE), # value = TRUE, invert = TRUE) # ui_info("{ui_value(length(ls_files))} amount of files to copy") copy_files_in_folder(analysis_template, path) + ui_info("Getting templates from {ui_value(org_template)}") + copy_files_in_folder(org_template, path, is_org=TRUE) + # check org folder is in there # search for param + _README.md # concat file to README.md diff --git a/inst/templates/base/README.md b/inst/templates/base/README.md index 15dee0e..0159d62 100644 --- a/inst/templates/base/README.md +++ b/inst/templates/base/README.md @@ -14,7 +14,6 @@ - [ ] Replace the title in this file to match the project's title - [ ] Modify `information.R` with the right text for this project, it can be used to source in other `Rmd` files. The main `Rmd` file in this directory can be used to show general information of the project if needed. -- [ ] Use the same project name to create a folder in *Dropbox* and a repo in *GitHub* - [ ] If you didn't provide the pipeline when creating this project: Use the function `bcbio_templates` to create templates inside `reports` for each type of analysis. For instance, for *RNAseq*: - `bcbio_templates(type="rnaseq", outpath=path)` or @@ -23,21 +22,13 @@ ## Folders -- `meta` should contain the CSV/YAML files used by *bcbio* or *nextflow* +- `meta` should contain the CSV/YAML files used by *nextflow* or your pipelines - `scripts` should contain `sbatch` scripts or any custom scripts used in this project - `data` contains raw data, it can contains big data objects -- `reports` contains `Rmd` and `html` together with their files that will be added to *DropBox*. Each type of project have different guidelines. -- `final` contains the output of *nextflow/bcbio* +- `reports` contains `Rmd` and `html` together with their files that will be published and shared. +- `final` contains the output of *nextflow* - `code` contains any other files that support custom analysis and don't generate a report -- For any relevant client files or papers use the `docs` folder on *DropBox* - -## Download - -- [ ] Download data to the `data` directory on O2. Check the md5 checksums if available. - -## Analysis - -- [ ] Make sure that final folder is copied from *scratch* or *S3* to `/n/data1/cores/bcbio/PIs/` +- For any relevant client files or papers use the `docs` folder ## GitHub @@ -45,7 +36,3 @@ - [ ] Track in *Git* files in `scripts`, `meta`, and `reports` that belongs to these type: - **Note** Git add `*.Rmd *.R *ipynb *.sh *.yaml`. (feel free use `.gitignore` if you use a GUI for non-tracked files). *DO NOT* use `git add *`. *DO NOT* track `html/csv/figures`. *DO NOT* track files that you did not use for this project (i.e. irrelevant templates, placeholders) - [ ] Commit files and push to *Github* as necessary throughout the project, but especially when work is complete - -## Dropbox - -- [ ] Add to the *DropBox* folder all files in `reports` diff --git a/inst/templates/base/reports/example.Rmd b/inst/templates/base/reports/example.Rmd new file mode 100644 index 0000000..83df986 --- /dev/null +++ b/inst/templates/base/reports/example.Rmd @@ -0,0 +1,45 @@ +--- +title: "Example" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + project_file: ../information.R +--- + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-stable-green) revision. + +```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} +source(params$project_file) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd index 4ba539c..f57c8d6 100644 --- a/inst/templates/chipseq/QC/QC.Rmd +++ b/inst/templates/chipseq/QC/QC.Rmd @@ -27,10 +27,17 @@ params: ```{r, cache = FALSE, message = FALSE, warning=FALSE} # This set up the working directory to this file so all files can be found -# library(rstudioapi) -# setwd(fs::path_dir(getSourceEditorContext()$path)) +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) ``` +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + ```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} # 1. set up factor_of_interest parameter from parameter above or manually diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd index 422faf5..6e7eb46 100644 --- a/inst/templates/chipseq/diffbind/diffbind.Rmd +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -19,6 +19,7 @@ editor_options: params: # Fill this file with the right paths to nfcore output # .qs file name for saving DiffBind Counts object + # species = mouse or human params_file: params_diffbind-example.R project_file: ../information.R functions_file: ../libs/load_data.R @@ -26,7 +27,6 @@ params: condition_of_interest: genotype numerator: cKO denominator: WT - # species = mouse or human species: mouse counts_fn: diffbind_counts.csv results_sig_anno_fn: diffbind_results_sig_anno.csv @@ -37,8 +37,15 @@ Template developed with materials from https://hbctraining.github.io/main/. # This set up the working directory to this file so all files can be found library(rstudioapi) setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) ``` +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + ```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} # 1. set up condition_of_interest parameter from parameter above or manually diff --git a/inst/templates/chipseq/readme.md b/inst/templates/chipseq/readme.md index f0e88b3..c1a53d1 100755 --- a/inst/templates/chipseq/readme.md +++ b/inst/templates/chipseq/readme.md @@ -4,29 +4,8 @@ Make sure there is a valid project name, and modify `information.R` with the rig ## Run data with nf-core rnaseq -- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) -- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input` folder -- Prepare the CSV file according these [instructions](https://nf-co.re/chipseq/2.0.0/docs/usage/). File should look like this: - -```csv -sample,fastq_1,fastq_2,antibody,control -WT_BCATENIN_IP_REP1,BLA203A1_S27_L006_R1_001.fastq.gz,,BCATENIN,WT_INPUT -WT_BCATENIN_IP_REP2,BLA203A25_S16_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT -WT_BCATENIN_IP_REP2,BLA203A25_S16_L002_R1_001.fastq.gz,,BCATENIN,WT_INPUT -WT_BCATENIN_IP_REP2,BLA203A25_S16_L003_R1_001.fastq.gz,,BCATENIN,WT_INPUT -WT_BCATENIN_IP_REP3,BLA203A49_S40_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT -WT_INPUT_REP1,BLA203A6_S32_L006_R1_001.fastq.gz,,, -WT_INPUT_REP2,BLA203A30_S21_L001_R1_001.fastq.gz,,, -WT_INPUT_REP2,BLA203A30_S21_L002_R1_001.fastq.gz,,, -WT_INPUT_REP3,BLA203A31_S21_L003_R1_001.fastq.gz,,, -``` - -You can add more columns to this file with more metadata, and use this file as the `coldata` file in the templates. - -- Upload file to our `Datasets` in Seqera using the name of the project but starting with `chipseq-pi_lastname-hbc_code` -- Go to `Launchpad`, select `nf-core_chipseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` - - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 -- When pipeline is done, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder +This templates assume data has been processed by [nf-core/chipseq](https://nf-co.re/chipseq/2.1.0/docs/usage/). +We recommend to use the samplesheet.csv used with nf-core as metadata file, where other relevant columns can be there even if they are not used by the pipeline. ## QC @@ -39,7 +18,7 @@ You can add more columns to this file with more metadata, and use this file as t On the YAML header file of the Rmd you can specify some parameters including the conditions to be compared, the genome used, and the desired output file names. This template has examples of: * calculating a peak counts matrix * PCA -* differential binding analyiss +* differential binding analysis * peak annotation * functional analysis (coming soon) diff --git a/inst/templates/methylation/QC/QC.Rmd b/inst/templates/methylation/QC/QC.Rmd index 689e322..bb912a0 100644 --- a/inst/templates/methylation/QC/QC.Rmd +++ b/inst/templates/methylation/QC/QC.Rmd @@ -21,6 +21,19 @@ params: meta_fn: ../meta/methylation_mucci_hbc04926.csv --- +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-draft-grey) revision. + ```{r echo = F} source(params$params_file) ``` @@ -597,4 +610,4 @@ List and version of tools used for the report generation. ```{r} sessionInfo() -``` \ No newline at end of file +``` diff --git a/inst/templates/multiomics/teaseq/QC/QC.Rmd b/inst/templates/multiomics/teaseq/QC/QC.Rmd index 9f1a302..85d0e44 100644 --- a/inst/templates/multiomics/teaseq/QC/QC.Rmd +++ b/inst/templates/multiomics/teaseq/QC/QC.Rmd @@ -70,6 +70,19 @@ opts_chunk[["set"]]( ) ``` +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-draft-grey) revision. + ```{r setup, cache=FALSE, message=FALSE} library(Seurat) library(tidyverse) diff --git a/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd b/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd index 275fbb2..0d4ac49 100644 --- a/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd +++ b/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd @@ -19,6 +19,20 @@ editor_options: params: project_file: ../information.R --- +```{r, message=FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +library(tidyverse) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. ```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} library(rtracklayer) diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd index 2e9c469..0384f70 100644 --- a/inst/templates/rnaseq/DE/DEG.Rmd +++ b/inst/templates/rnaseq/DE/DEG.Rmd @@ -38,8 +38,14 @@ Template developed with materials from https://hbctraining.github.io/main/. # This set up the working directory to this file so all files can be found library(rstudioapi) setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) ``` +This code is in this ![](https://img.shields.io/badge/status-stable-green) revision. ```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} library(tidyverse) diff --git a/inst/templates/rnaseq/DE/GSVA.Rmd b/inst/templates/rnaseq/DE/GSVA.Rmd index bcb0290..f8339a6 100644 --- a/inst/templates/rnaseq/DE/GSVA.Rmd +++ b/inst/templates/rnaseq/DE/GSVA.Rmd @@ -27,6 +27,22 @@ params: # choose geneset, click "Raw', and copy url geneset_fn: https://raw.githubusercontent.com/bcbio/resources/main/gene_sets/gene_sets/20240904/human/h.all.v2024.1.Hs.entrez.gmt --- + +```{r, message=FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +library(tidyverse) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + ```{r libraries, message = FALSE, warning=FALSE} # path to libraries if working on O2 # .libPaths("/n/app/bcbio/R4.3.1_rnaseq/") @@ -68,10 +84,6 @@ opts_chunk[["set"]]( set.seed(1234567890L) ``` -```{r} -# This set up the working directory to this file so all files can be found -setwd(fs::path_dir(getSourceEditorContext()$path)) -``` ```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} source(params$project_file) diff --git a/inst/templates/rnaseq/DE/Intersections.Rmd b/inst/templates/rnaseq/DE/Intersections.Rmd index cd88800..fe7816c 100644 --- a/inst/templates/rnaseq/DE/Intersections.Rmd +++ b/inst/templates/rnaseq/DE/Intersections.Rmd @@ -20,6 +20,21 @@ params: project_file: ../information.R --- +```{r, message=FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +library(tidyverse) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + ```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} library(rtracklayer) library(tidyverse) diff --git a/inst/templates/rnaseq/QC/QC_nf-core.Rmd b/inst/templates/rnaseq/QC/QC_nf-core.Rmd index 22841fd..32032ab 100644 --- a/inst/templates/rnaseq/QC/QC_nf-core.Rmd +++ b/inst/templates/rnaseq/QC/QC_nf-core.Rmd @@ -33,8 +33,14 @@ Template developed with materials from https://hbctraining.github.io/main/. # This set up the working directory to this file so all files can be found library(rstudioapi) setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) ``` +This code is in this ![](https://img.shields.io/badge/status-stable-green) revision. ```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} # 1. set up factor_of_interest parameter from parameter above or manually diff --git a/inst/templates/rnaseq/README.md b/inst/templates/rnaseq/README.md index b8c6535..114d2bf 100644 --- a/inst/templates/rnaseq/README.md +++ b/inst/templates/rnaseq/README.md @@ -4,7 +4,7 @@ Make sure there is a project name for this. ## Run data with nf-core rnaseq -This templates assume data has been processed by [nf-core/rnasea](https://nf-co.re/rnaseq/3.14.0/docs/usage). +This templates assume data has been processed by [nf-core/rnaseq](https://nf-co.re/rnaseq/3.14.0/docs/usage). We recommend to use the samplesheet.csv used with nf-core as metadata file, where other relevant columns can be there even if they are not used by the pipeline. ## Downstream analysis diff --git a/inst/templates/rnaseq/org/hcbc/README.md b/inst/templates/rnaseq/org/hcbc/README.md new file mode 100644 index 0000000..50f8f16 --- /dev/null +++ b/inst/templates/rnaseq/org/hcbc/README.md @@ -0,0 +1,75 @@ +# Guideline for RNAseq downstream analysis + +Make sure there is a project name for this. + +## Run data with nf-core rnaseq + +- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) +- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input/rawdata` folder +- Prepare the CSV file according this [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: + +```csv +sample,fastq_1,fastq_2,strandedness +CONTROL_REP1,s3path/AEG588A1_S1_L002_R1_001.fastq.gz,s3path/AEG588A1_S1_L002_R2_001.fastq.gz,auto +CONTROL_REP1,s3path/AEG588A1_S1_L003_R1_001.fastq.gz,s3path/AEG588A1_S1_L003_R2_001.fastq.gz,auto +CONTROL_REP1,s3path/AEG588A1_S1_L004_R1_001.fastq.gz,s3path/AEG588A1_S1_L004_R2_001.fastq.gz,auto +``` + +Use `bcbio_nfcore_check(csv_file)` to check the file is correct. + +You can add more columns to this file with more metadata, and use this file as the `coldata` file the templates. + +- Upload file to our `Datasets` in Seqera using the name of the project but starting with `nfcore-rnaseq` +- Go to `Launchpad`, select `nf-core_rnaseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` + - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 +- When pipeline is down, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder + +## Downstream analysis + +Please, modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. + +### QC + +`QC/QC.Rmd` is a template for QC metrics. Use `params_qc.R` for `bcbio` + or `QC/QC_nf-core.Rmd` `params_qc_nf-core.R` for `nf-core/rnaseq` outputs. + +Read instruction in the R and Rmd scripts to render it. + +### DE + +`DE/DEG.Rmd` is a template for two groups comparison. `params_de.R` has the information of the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. + +On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: + +- sub-setting data +- two groups comparison +- volcano plot +- MA plot +- Pathway analysis +- Tables + +There are some code related to alternative analysis: + +- `DE/Multiplicative_DE_docs.md` that shows some cases when there is multiple variables in the model with multiple levels: sex (2 levels) and genotype (4 levels) + +## DropBox + +- In `reports/QC` + - [ ] copy `bcbio-se.rds` and `tximport-counts.csv` + - [ ] copy QC `Rmd/R/html/figures` +- In `reports/DE` + - [ ] Normalized counts for all genes x all samples (csv format) +- In `reports/DE`, for *each analysis*: + - **Note** For multiple comparisons/analysis, do a single report/template if possible in the parent folder using parameters whenever possible. + - Create a folder with the comparison names in the files. Numbering by comparison (`01.1_DE_comp1`, `01.2_DE_comp2`, etc.). If you’re running multiple models for the same comparison, append `_M#`. Add the following files under each folder: + - [ ] Normalized count table with the samples used in this analysis/comparison. + - [ ] Full results `DESeq2` for all genes (csv format) with annotation columns appended. + - [ ] Significant genes results file (subset of annotated full results by chosen p-value and LFC). Separate files will be created for each individual contrast. + - [ ] Significant genes results file as described above, but additionally append columns containing normalized count values for each sample. + - Make sure to append the gene symbols to these tables so the researcher can interpret the results. + +## GitHub + +- [ ] Push all `*Rmd` `*R` files used for the *QC* and *DE* analysis respecting folder structure. + +Please, ignore `*html/figures/csv` and any output of the code. diff --git a/inst/templates/singlecell/CellToCell/cellchat.Rmd b/inst/templates/singlecell/CellToCell/cellchat.Rmd deleted file mode 100644 index b9da33e..0000000 --- a/inst/templates/singlecell/CellToCell/cellchat.Rmd +++ /dev/null @@ -1,440 +0,0 @@ ---- -title: "CellChat" -author: "Harvard Chan Bioinformatics Core" -date: "`r Sys.Date()`" -output: - html_document: - code_folding: hide - df_print: paged - highlights: pygments - number_sections: false - self_contained: true - theme: default - toc: true - toc_float: - collapsed: true - smooth_scroll: true -editor_options: - chunk_output_type: console -params: - project_file: information.R - seurat_fn: ../data/fDat_sn_RC.rds - cellchat_fn: ../data/snrna_cellchat.qs - cellchat_grade2_fn: ../data/snrna_cellchat_grade2.qs - cellchat_grade0_fn: ../data/snrna_cellchat_grade0.qs ---- - -```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,} - -reticulate::use_virtualenv("/n/app/bcbio/R4.3.1_python_cellchat") -reticulate::py_config() # should show v3.9.14 -Sys.getenv("PYTHONPATH") # should be empty - -current_libs <- .libPaths() -.libPaths(c('/n/app/bcbio/R4.3.1_cellchat/', current_libs)) -library(CellChat) - -library(tidyverse) -library(Seurat) -library(bcbioR) -library(ggprism) -library(knitr) -library(tools) -library(qs) -library(patchwork) -library(ComplexHeatmap) - -options(stringsAsFactors = FALSE) - -colors=cb_friendly_cols(1:15) -ggplot2::theme_set(theme_prism(base_size = 14)) -opts_chunk[["set"]]( - cache = F, - cache.lazy = FALSE, - dev = c("png", "pdf"), - error = TRUE, - highlight = TRUE, - message = FALSE, - prompt = FALSE, - tidy = FALSE, - warning = FALSE, - echo = T, - fig.height = 4) - -# set seed for reproducibility -set.seed(1234567890L) - -cellchat_ran <- file.exists(params$cellchat_fn) -cellchat_rejection_ran <- file.exists(params$cellchat_grade2_fn) & file.exists(params$cellchat_grade0_fn) - -``` - -# Clustering - -```{r load_data } - -snrna <- readRDS(params$seurat_fn) - -# in this case, Chris_annot = cell_type -DimPlot(snrna, reduction = 'umap', group.by = 'Chris_annot') - -``` - -```{r prep cellchat inputs, eval = !cellchat_ran } - -# need to use normalized counts as input -data.input <- snrna[["SCT"]]@data -labels <- snrna$Chris_annot -meta <- data.frame(labels = labels, row.names = names(labels), samples = snrna$orig.ident) - -``` - -```{r create cellchat object, eval = !cellchat_ran } -cellchat <- createCellChat(object = data.input, meta = meta, group.by = "labels") - -``` - -```{r set cellchat db, eval = !cellchat_ran} -CellChatDB <- CellChatDB.human -CellChatDB.use <- subsetDB(CellChatDB) -cellchat@DB <- CellChatDB.use - -``` - -```{r subset and preprocess data, eval = !cellchat_ran } - -cellchat <- subsetData(cellchat) -cellchat <- updateCellChat(cellchat) -future::plan("multisession", workers = 8) # recommend running with at 8-16 cores -cellchat <- identifyOverExpressedGenes(cellchat) # may take a couple minutes -cellchat <- identifyOverExpressedInteractions(cellchat) # may take a couple minutes - -``` - -```{r compute communication prob, eval = !cellchat_ran} - -# Not recommended: project gene expression data onto protein-protein interaction network. -# Useful with shallow sequencing depth but introduces many weak communications. -# If used, must set raw.use = FALSE when running computeCommunProb -# cellchat <- projectData(cellchat, PPI.human) - - -# this next command takes 0.5-2+ hours -# can choose various methods for caculating average gene exp per group, -# 'triMean' allegedly produces fewer but stronger interactions -cellchat <- computeCommunProb(cellchat, type = "triMean") - -# filter out the cell-cell communication if < 50 cells per group -cellchat <- filterCommunication(cellchat, min.cells = 50) - -qsave(cellchat, '../data/snrna_cellchat.qs', preset = 'fast') - -``` - -# Overall Results - -```{r load cellchat, eval = cellchat_ran} -cellchat <- qread(params$cellchat_fn) - -df.net <- subsetCommunication(cellchat) %>% dplyr::arrange(pval) -df.net %>% sanitize_datatable() - -``` - -## Top interactions {.tabset} -```{r check pairs, results = 'asis', fig.width = 8, fig.height = 12} - -top_ints <- (df.net %>% pull(interaction_name) %>% unique)[1:10] -for (interaction in top_ints){ - cat('\n') - cat('### ', as.character(interaction), '\n') - interactors <- unlist(strsplit(as.character(interaction), '_')) - p1 <- VlnPlot(snrna, features = interactors, group.by = 'Chris_annot', - pt.size = 0.1, log = T, ncol = 1) - print(p1) - cat('\n') -} - -``` - -```{r compute pathway communication probs, eval = cellchat_ran} -cellchat <- computeCommunProbPathway(cellchat) -cellchat <- aggregateNet(cellchat) - -``` - -## Visualize Cell-Cell Communication Networks - -```{r chord plots, fig.width = 10, fig.height = 8, eval = cellchat_ran} - -groupSize <- as.numeric(table(cellchat@idents)) -par(mfrow = c(1,2), xpd=TRUE) -netVisual_circle(cellchat@net$count, vertex.weight = rowSums(cellchat@net$count), - weight.scale = T, label.edge= F, title.name = "Number of interactions") -netVisual_circle(cellchat@net$weight, vertex.weight = rowSums(cellchat@net$weight), - weight.scale = T, label.edge= F, title.name = "Interaction weights/strength") - -``` - -```{r heatmaps, eval = cellchat_ran} - -netVisual_heatmap(cellchat, measure = "count", color.heatmap = "Blues") -netVisual_heatmap(cellchat, measure = "weight", color.heatmap = "Blues") - -``` - -# Comparison Results - -Here we run the CellChat analysis twice, once on the Grade 2 rejection samples and once on the Grade 0 rejection samples. We compare the significant signaling interactions and investigate changes in them between rejection grades. - -```{r prep inputs rejection, eval=!cellchat_rejection_ran} - -grade2 <- subset(snrna, orig.ident %in% c('BRI-2396', 'BRI-2402')) -grade0 <- subset(snrna, orig.ident %in% c('BRI-2395', 'BRI-2411')) - -data.input_grade2 <- grade2[["SCT"]]@data -labels_grade2 <- grade2$Chris_annot -meta_grade2 <- data.frame(labels = labels_grade2, row.names = names(labels_grade2), samples = grade2$orig.ident) - -data.input_grade0 <- grade0[["SCT"]]@data -labels_grade0 <- grade0$Chris_annot -meta_grade0 <- data.frame(labels = labels_grade0, row.names = names(labels_grade0), samples = grade0$orig.ident) - -``` - -```{r create cellchat object rejection, eval=!cellchat_rejection_ran} -cellchat_grade2 <- createCellChat(object = data.input_grade2, meta = meta_grade2, group.by = "labels") -cellchat_grade0 <- createCellChat(object = data.input_grade0, meta = meta_grade0, group.by = "labels") - -``` - -```{r subset and preprocess data rejection, eval=!cellchat_rejection_ran} - -future::plan("multisession", workers = 8) # recommend running with at 8-16 cores - -cellchat_grade2@DB <- CellChatDB.use -cellchat_grade0@DB <- CellChatDB.use - -cellchat_grade2 <- subsetData(cellchat_grade2) -cellchat_grade2 <- updateCellChat(cellchat_grade2) -cellchat_grade2 <- identifyOverExpressedGenes(cellchat_grade2) # may take a couple minutes -cellchat_grade2 <- identifyOverExpressedInteractions(cellchat_grade2) # may take a couple minutes - -cellchat_grade0 <- subsetData(cellchat_grade0) -cellchat_grade0 <- updateCellChat(cellchat_grade0) -cellchat_grade0 <- identifyOverExpressedGenes(cellchat_grade0) # may take a couple minutes -cellchat_grade0 <- identifyOverExpressedInteractions(cellchat_grade0) # may take a couple minutes - -``` - -```{r compute communication prob rejection, eval=!cellchat_rejection_ran} -cellchat_grade2 <- computeCommunProb(cellchat_grade2, type = "triMean") # command takes 0.5-2+ hours -cellchat_grade2 <- filterCommunication(cellchat_grade2, min.cells = 50) -qsave(cellchat_grade2, params$cellchat_grade2_fn, preset = 'fast') - -cellchat_grade0 <- computeCommunProb(cellchat_grade0, type = "triMean") # command takes 0.5-2+ hours -cellchat_grade0 <- filterCommunication(cellchat_grade0, min.cells = 50) -qsave(cellchat_grade0, params$cellchat_grade0_fn, preset = 'fast') - -``` - -```{r load cellchat rejection, eval = cellchat_rejection_ran} - -cellchat_grade2 <- qread(params$cellchat_grade2_fn) -cellchat_grade0 <- qread(params$cellchat_grade0_fn) - -cellchat_grade2 <- filterCommunication(cellchat_grade2, min.cells = 50) -cellchat_grade0 <- filterCommunication(cellchat_grade0, min.cells = 50) - -df.net_grade2 <- subsetCommunication(cellchat_grade2)%>% dplyr::arrange(pval) -df.net_grade0 <- subsetCommunication(cellchat_grade0)%>% dplyr::arrange(pval) - -``` - -## Grade 2 - -```{r datatable grade 2, eval = cellchat_rejection_ran} -df.net_grade2 %>% sanitize_datatable() - -``` - -### Top interactions {.tabset} -```{r check pairs grade 2, results = 'asis', fig.width = 8, fig.height = 12} - -top_ints <- (df.net_grade2 %>% pull(interaction_name) %>% unique)[1:10] -for (interaction in top_ints){ - cat('\n') - cat('#### ', as.character(interaction), '\n') - interactors <- unlist(strsplit(as.character(interaction), '_')) - p1 <- VlnPlot(snrna, features = interactors, group.by = 'Chris_annot', pt.size = 0.1, log = T, ncol = 1) - print(p1) - cat('\n') -} - -``` - - -## Grade 0 - -```{r datatable grade 0, eval = cellchat_rejection_ran} -df.net_grade0 %>% sanitize_datatable() - -``` - -### Top interactions {.tabset} -```{r check pairs grade 0, results = 'asis', fig.width = 8, fig.height = 12} - -top_ints <- (df.net_grade0 %>% pull(interaction_name) %>% unique)[1:10] -for (interaction in top_ints){ - cat('\n') - cat('#### ', as.character(interaction), '\n') - interactors <- unlist(strsplit(as.character(interaction), '_')) - p1 <- VlnPlot(snrna, features = interactors, group.by = 'Chris_annot', pt.size = 0.1, log = T, ncol = 1) - print(p1) - cat('\n') -} - -``` - -```{r merge rejection objects, eval = cellchat_rejection_ran} - -cellchat_grade2 <- computeCommunProbPathway(cellchat_grade2) -cellchat_grade2 <- aggregateNet(cellchat_grade2) -cellchat_grade2 <- netAnalysis_computeCentrality(cellchat_grade2) -cellchat_grade0 <- computeCommunProbPathway(cellchat_grade0) -cellchat_grade0 <- aggregateNet(cellchat_grade0) -cellchat_grade0 <- netAnalysis_computeCentrality(cellchat_grade0) - -object.list <- list(grade0 = cellchat_grade0, grade2 = cellchat_grade2) -cellchat_merged <- mergeCellChat(object.list, add.names = names(object.list)) - -df.net_merged <- subsetCommunication(cellchat_merged) - -``` - -## Compare Interactions/Interaction Strength - -```{r compare interactions, eval = cellchat_rejection_ran} - -gg1 <- compareInteractions(cellchat_merged, show.legend = F, group = c(1,2)) -gg2 <- compareInteractions(cellchat_merged, show.legend = F, group = c(1,2), measure = "weight") -gg1 + gg2 - -``` - -```{r chord plots merged, eval = cellchat_rejection_ran, fig.width = 10, fig.height = 8} -par(mfrow = c(1,2), xpd=TRUE) -netVisual_diffInteraction(cellchat_merged, weight.scale = T) -netVisual_diffInteraction(cellchat_merged, weight.scale = T, measure = "weight") - -``` - -```{r heatmaps merged, eval = cellchat_rejection_ran, fig.width = 10, fig.height = 8} - -gg1 <- netVisual_heatmap(cellchat_merged) -gg2 <- netVisual_heatmap(cellchat_merged, measure = "weight") -gg1 + gg2 - -``` - -## Compare Major Pathway Sources and Targets - -From the CellChat documentation: "Comparing the outgoing and incoming interaction strength in a 2D space allows ready identification of the cell populations with significant changes in sending or receiving signals between different datasets." - -```{r compare send/receive changes, eval = cellchat_rejection_ran, fig.width = 10, fig.height = 6} - -num.link <- sapply(object.list, function(x) {rowSums(x@net$count) + colSums(x@net$count)-diag(x@net$count)}) -weight.MinMax <- c(min(num.link), max(num.link)) # control the dot size in the different datasets -gg <- list() -for (i in 1:length(object.list)) { - gg[[i]] <- netAnalysis_signalingRole_scatter(object.list[[i]], title = names(object.list)[i], weight.MinMax = weight.MinMax) -} -patchwork::wrap_plots(plots = gg) -``` - - -```{r identify signaling changes, eval = cellchat_rejection_ran, fig.width = 12, fig.height = 12} -gg1 <- netAnalysis_signalingChanges_scatter(cellchat_merged, idents.use = "Vascular_EC") -gg2 <- netAnalysis_signalingChanges_scatter(cellchat_merged, idents.use = "Lymphatic_EC") -gg3 <- netAnalysis_signalingChanges_scatter(cellchat_merged, idents.use = "Pericyte") -patchwork::wrap_plots(plots = list(gg1,gg2,gg3), nrow = 3, ncol = 1) - -``` - -## Cluster Altered Signaling Interactions - -From the CellChat documentation: "CellChat performs joint manifold learning and classification of the inferred communication networks based on their functional and topological similarity across different conditions. - -By quantifying the similarity between the cellular communication networks of signaling pathways across conditions, this analysis highlights the potentially altered signaling pathways. CellChat adopts the concept of network rewiring from network biology and hypothesized that the difference between different communication networks may affect biological processes across conditions. UMAP is used for visualizing signaling relationship and interpreting our signaling outputs in an intuitive way without involving the classification of conditions. - -Functional similarity: High degree of functional similarity indicates major senders and receivers are similar, and it can be interpreted as the two signaling pathways or two ligand-receptor pairs exhibit similar and/or redundant roles. - -Structural similarity: A structural similarity was used to compare their signaling network structure, without considering the similarity of senders and receivers." - - -### Based on Functional Similarity - -```{r identify signaling groups functional, eval = cellchat_rejection_ran} - -cellchat_merged <- computeNetSimilarityPairwise(cellchat_merged, type = "functional") -cellchat_merged <- netEmbedding(cellchat_merged, type = "functional") -cellchat_merged <- netClustering(cellchat_merged, type = "functional") -netVisual_embeddingPairwise(cellchat_merged, type = "functional", label.size = 3.5) - -``` - -### Based on Structural Similarity - -```{r identify signaling groups structural, eval = cellchat_rejection_ran} -cellchat_merged <- computeNetSimilarityPairwise(cellchat_merged, type = "structural") -cellchat_merged <- netEmbedding(cellchat_merged, type = "structural") -cellchat_merged <- netClustering(cellchat_merged, type = "structural") -netVisual_embeddingPairwise(cellchat_merged, type = "structural", label.size = 3.5) -``` - -## Compare Overall Signaling Information Flow - -"CellChat can identify the conserved and context-specific signaling pathways by simply comparing the information flow for each signaling pathway, which is defined by the sum of communication probability among all pairs of cell groups in the inferred network (i.e., the total weights in the network)." - -```{r info flow, fig.height = 9, eval = cellchat_rejection_ran} - -rankNet(cellchat_merged, mode = "comparison", measure = "weight", sources.use = NULL, targets.use = NULL, stacked = F, do.stat = TRUE) - -``` - -## Compare Signaling Patterns Across Cell Populations - -"In this heatmap, colobar represents the relative signaling strength of a signaling pathway across cell groups (Note that values are row-scaled). The top colored bar plot shows the total signaling strength of a cell group by summarizing all signaling pathways displayed in the heatmap. The right grey bar plot shows the total signaling strength of a signaling pathway by summarizing all cell groups displayed in the heatmap." - - -```{r outgoing signaling, fig.height = 9, eval = cellchat_rejection_ran} - -i = 1 -pathway.union <- union(object.list[[i]]@netP$pathways, object.list[[i+1]]@netP$pathways) -ht1 = netAnalysis_signalingRole_heatmap(object.list[[i]], pattern = "outgoing", signaling = pathway.union, title = names(object.list)[i], width = 5, height = 16, cluster.cols = T) -ht2 = netAnalysis_signalingRole_heatmap(object.list[[i+1]], pattern = "outgoing", signaling = pathway.union, title = names(object.list)[i+1], width = 5, height = 16, cluster.cols = T) -draw(ht1 + ht2, ht_gap = unit(0.5, "cm")) -``` - -```{r incoming signaling, fig.height = 9, eval = cellchat_rejection_ran} -ht1 = netAnalysis_signalingRole_heatmap(object.list[[i]], pattern = "incoming", signaling = pathway.union, title = names(object.list)[i], width = 5, height = 16, cluster.cols = T) -ht2 = netAnalysis_signalingRole_heatmap(object.list[[i+1]], pattern = "incoming", signaling = pathway.union, title = names(object.list)[i+1], width = 5, height = 16, cluster.cols = T) -draw(ht1 + ht2, ht_gap = unit(0.5, "cm")) -``` - -## Identify Dysfunctional Interaction Signaling Using Communication Probabilities - -"CellChat can identify the up-regulated (increased) and down-regulated (decreased) signaling ligand-receptor pairs in one dataset compared to the other dataset by comparing the communication probability between two datasets for each L-R pair and each pair of cell groups" - -```{r compare signaling, fig.height = 12, fig.width = 8, eval = cellchat_rejection_ran} - -gg1 <- netVisual_bubble(cellchat_merged, - # sources.use = c('Vascular_EC', 'Lymphatic_EC', 'Pericyte'), - # targets.use = c('Vascular_EC', 'Lymphatic_EC', 'Pericyte'), - comparison = c(1, 2), - max.dataset = 2, - title.name = "Increased signaling in Grade 2", - angle.x = 45, - remove.isolate = T) -gg1 -signaling.grade2_increased = gg1$data diff --git a/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd b/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd index 2082b9d..148a9ce 100644 --- a/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd +++ b/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd @@ -27,8 +27,16 @@ Template developed with materials from https://hbctraining.github.io/main/. library(rstudioapi) library(tidyverse) setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) ``` +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + ```{r} # parameters ## Cell cycle markers for c.elegans, human, mouse, D. rerio, and D. melanogaster can be found here: https://github.com/hbc/tinyatlas/tree/1e2136a35e773f14d97ae9cbdb6c375327b2dd2b/cell_cycle diff --git a/inst/templates/singlecell/QC/QC.rmd b/inst/templates/singlecell/QC/QC.rmd index b7c5d5b..f2de4eb 100644 --- a/inst/templates/singlecell/QC/QC.rmd +++ b/inst/templates/singlecell/QC/QC.rmd @@ -5,17 +5,23 @@ date: "`r Sys.Date()`" params: ## If you have Ribosomal ratio in your raw seurat object put this as TRUE otherwise leave as FALSE ribosomal: FALSE - params_file: parameters.R + params_file: https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/parameters.R project_file: ../information.R --- -# Overview - -- Project: `r project` -- PI: `r PI` -- Analyst: `r analyst` -- Experiment: `r experiment` +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +``` +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. ```{r, eval=FALSE} ### READ ME FIRST @@ -26,14 +32,12 @@ params: ``` +# Overview - -```{r} -# This set up the working directory to this file so all files can be found -library(rstudioapi) -setwd(fs::path_dir(getSourceEditorContext()$path)) -``` - +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` ```{r setup, include=FALSE} library(Seurat) @@ -41,7 +45,15 @@ library(tidyverse) library(ggplot2) # 1. Set up input files in this R file (params_de.R) -source(params$params_file) +# Loading QC'd object +parameters = params$params_file +if (isUrl(seurat_obj)){ + source(url(parameters)) +}else{ + source(parameters) +} + +seurat_obj = "~/Downloads/filtered_gene_bc_matrices/tiny.rds" knitr::opts_chunk[["set"]]( cache = FALSE, @@ -60,8 +72,7 @@ knitr::opts_chunk[["set"]]( ```{r load and filter no ribo} ## Load data - -seurat_raw <- readRDS("seurat_pre-filtered.rds") +seurat_raw <- readRDS(seurat_obj) ## Creat QC object USE METRICS YOU CHOSE IN THE RSHINY APP @@ -94,21 +105,15 @@ seurat_qc <- subset(x = seurat_raw, ## Save QC object saveRDS(seurat_qc, file = "seurat_post-QC.rds") - ``` ```{r prep-info} - ## Prep information for plotting metadata0 <- seurat_raw@meta.data - metadata0 = metadata0 %>% dplyr::rename(nUMI = nCount_RNA, nGene = nFeature_RNA) - metadata1 <- seurat_qc@meta.data - - metadata1 = metadata1 %>% dplyr::rename(nUMI = nCount_RNA, nGene = nFeature_RNA) ``` diff --git a/inst/templates/singlecell/README.md b/inst/templates/singlecell/README.md index 9680182..5c7dc35 100644 --- a/inst/templates/singlecell/README.md +++ b/inst/templates/singlecell/README.md @@ -2,7 +2,7 @@ # Start with cell-ranger -`pre-process-w-cellranger.md` contains step by step guidelines on how to run cellranger in O2 and load data into R. This `scripts/seurat_init.R` script contains all the pieces to go from cellranger output to Seurat obj. It is assuming a mouse genome. +`pre-process-w-cellranger.md` contains step by step guidelines on how to run cellranger and load data into R. This `scripts/seurat_init.R` script contains all the pieces to go from cellranger output to Seurat obj. It is assuming a mouse genome. # QC @@ -12,10 +12,3 @@ Currently we are working on deploying a shiny app to inspect the single cell obj Currently we are working on guidelines and templates for this step. There is some draft under *Integration** folder. -# Cell to cell communication - -CellChat template is at `CellToCell/cellchat.Rmd`. We have built a stable environment in O2 using the following modules: - -``` -# gcc/9.2.0 imageMagick/7.1.0 geos/3.10.2 cmake/3.22.2 R/4.3.1 fftw/3.3.10 gdal/3.1.4 udunits/2.2.28 boost/1.75.0 python/3.9.14 -``` diff --git a/inst/templates/singlecell/scripts/seurat_init.R b/inst/templates/singlecell/scripts/seurat_init.R index 079a451..b2b64c2 100644 --- a/inst/templates/singlecell/scripts/seurat_init.R +++ b/inst/templates/singlecell/scripts/seurat_init.R @@ -1,3 +1,12 @@ +library(rstudioapi) +library(tidyverse) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) library(Seurat) library(data.table) diff --git a/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd b/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd index b9da33e..330a049 100644 --- a/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd +++ b/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd @@ -24,14 +24,31 @@ params: cellchat_grade0_fn: ../data/snrna_cellchat_grade0.qs --- -```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,} - +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-draft-grey) revision. + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE} +# NOTE change or remove according your environment reticulate::use_virtualenv("/n/app/bcbio/R4.3.1_python_cellchat") reticulate::py_config() # should show v3.9.14 Sys.getenv("PYTHONPATH") # should be empty current_libs <- .libPaths() .libPaths(c('/n/app/bcbio/R4.3.1_cellchat/', current_libs)) +``` + +```{r} library(CellChat) library(tidyverse) @@ -66,7 +83,6 @@ set.seed(1234567890L) cellchat_ran <- file.exists(params$cellchat_fn) cellchat_rejection_ran <- file.exists(params$cellchat_grade2_fn) & file.exists(params$cellchat_grade0_fn) - ``` # Clustering diff --git a/inst/templates/spatial/cosmx/QC/QC.Rmd b/inst/templates/spatial/cosmx/QC/QC.Rmd index 4271822..bff6628 100644 --- a/inst/templates/spatial/cosmx/QC/QC.Rmd +++ b/inst/templates/spatial/cosmx/QC/QC.Rmd @@ -26,6 +26,20 @@ params: umap_dim: approximateumap_8c6f278e.b9f4.4535.aeca.8955c1dff614_1 --- +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-draft-grey) revision. + ```{r load_params, echo = F} source(params$project_file) ``` @@ -75,7 +89,6 @@ sanitize_datatable = function(df, ...) { - Analyst: `r analyst` - Experiment: `r experiment` - Aim: `r aim` -- Sample: `r params$seurat_fn` ```{r read rds} @@ -314,4 +327,4 @@ rownames(seurat_filtered@meta.data) <- seurat_filtered@meta.data$cell_id ImageDimPlot(seurat_filtered, group.by = 'cell_type', axes = TRUE, crop = TRUE) -``` \ No newline at end of file +``` From 1dad379c51b3b1b3855b14e5ca2fbc9e89c9178c Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Wed, 9 Oct 2024 10:20:49 -0400 Subject: [PATCH 89/93] remove former hcbc readme --- inst/templates/rnaseq/org/hcbc_README.md | 75 ------------------------ 1 file changed, 75 deletions(-) delete mode 100644 inst/templates/rnaseq/org/hcbc_README.md diff --git a/inst/templates/rnaseq/org/hcbc_README.md b/inst/templates/rnaseq/org/hcbc_README.md deleted file mode 100644 index 50f8f16..0000000 --- a/inst/templates/rnaseq/org/hcbc_README.md +++ /dev/null @@ -1,75 +0,0 @@ -# Guideline for RNAseq downstream analysis - -Make sure there is a project name for this. - -## Run data with nf-core rnaseq - -- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) -- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input/rawdata` folder -- Prepare the CSV file according this [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: - -```csv -sample,fastq_1,fastq_2,strandedness -CONTROL_REP1,s3path/AEG588A1_S1_L002_R1_001.fastq.gz,s3path/AEG588A1_S1_L002_R2_001.fastq.gz,auto -CONTROL_REP1,s3path/AEG588A1_S1_L003_R1_001.fastq.gz,s3path/AEG588A1_S1_L003_R2_001.fastq.gz,auto -CONTROL_REP1,s3path/AEG588A1_S1_L004_R1_001.fastq.gz,s3path/AEG588A1_S1_L004_R2_001.fastq.gz,auto -``` - -Use `bcbio_nfcore_check(csv_file)` to check the file is correct. - -You can add more columns to this file with more metadata, and use this file as the `coldata` file the templates. - -- Upload file to our `Datasets` in Seqera using the name of the project but starting with `nfcore-rnaseq` -- Go to `Launchpad`, select `nf-core_rnaseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` - - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 -- When pipeline is down, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder - -## Downstream analysis - -Please, modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. - -### QC - -`QC/QC.Rmd` is a template for QC metrics. Use `params_qc.R` for `bcbio` - or `QC/QC_nf-core.Rmd` `params_qc_nf-core.R` for `nf-core/rnaseq` outputs. - -Read instruction in the R and Rmd scripts to render it. - -### DE - -`DE/DEG.Rmd` is a template for two groups comparison. `params_de.R` has the information of the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. - -On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: - -- sub-setting data -- two groups comparison -- volcano plot -- MA plot -- Pathway analysis -- Tables - -There are some code related to alternative analysis: - -- `DE/Multiplicative_DE_docs.md` that shows some cases when there is multiple variables in the model with multiple levels: sex (2 levels) and genotype (4 levels) - -## DropBox - -- In `reports/QC` - - [ ] copy `bcbio-se.rds` and `tximport-counts.csv` - - [ ] copy QC `Rmd/R/html/figures` -- In `reports/DE` - - [ ] Normalized counts for all genes x all samples (csv format) -- In `reports/DE`, for *each analysis*: - - **Note** For multiple comparisons/analysis, do a single report/template if possible in the parent folder using parameters whenever possible. - - Create a folder with the comparison names in the files. Numbering by comparison (`01.1_DE_comp1`, `01.2_DE_comp2`, etc.). If you’re running multiple models for the same comparison, append `_M#`. Add the following files under each folder: - - [ ] Normalized count table with the samples used in this analysis/comparison. - - [ ] Full results `DESeq2` for all genes (csv format) with annotation columns appended. - - [ ] Significant genes results file (subset of annotated full results by chosen p-value and LFC). Separate files will be created for each individual contrast. - - [ ] Significant genes results file as described above, but additionally append columns containing normalized count values for each sample. - - Make sure to append the gene symbols to these tables so the researcher can interpret the results. - -## GitHub - -- [ ] Push all `*Rmd` `*R` files used for the *QC* and *DE* analysis respecting folder structure. - -Please, ignore `*html/figures/csv` and any output of the code. From 282c7302e487c95481ab7e288bef64cfd545d6a2 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Thu, 10 Oct 2024 18:35:26 +0000 Subject: [PATCH 90/93] add venn diagram to chipseq qc --- inst/templates/chipseq/QC/QC.Rmd | 41 +++++++++++-------- inst/templates/chipseq/QC/params_qc-example.R | 9 ++-- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd index 4ba539c..7dbfa68 100644 --- a/inst/templates/chipseq/QC/QC.Rmd +++ b/inst/templates/chipseq/QC/QC.Rmd @@ -27,8 +27,8 @@ params: ```{r, cache = FALSE, message = FALSE, warning=FALSE} # This set up the working directory to this file so all files can be found -# library(rstudioapi) -# setwd(fs::path_dir(getSourceEditorContext()$path)) +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) ``` @@ -66,6 +66,8 @@ library(bcbioR) library(janitor) library(ChIPpeakAnno) library(UpSetR) + +colors=cb_friendly_cols(1:15) ggplot2::theme_set(theme_light(base_size = 14)) opts_chunk[["set"]]( cache = FALSE, @@ -353,8 +355,7 @@ ggplot(peaks, aes(x = peak_enrichment, fill = .data[[params$factor_of_interest]] We examine the amount of overlap between peaks in replicates of the same experimental condition. -``` {r peak overlap, results = 'asis', fig.width = 8} - +``` {r peak overlap, results = 'asis', fig.width = 8, fig.height = 6} for (current_sample_group in unique(peaks$sample_group)){ cat("## ", current_sample_group, "\n") @@ -374,19 +375,27 @@ for (current_sample_group in unique(peaks$sample_group)){ # connectedpeaks examples (https://support.bioconductor.org/p/133486/#133603), if 5 peaks in group1 overlap with 2 peaks in group 2, setting connectedPeaks to "merge" will add 1 to the overlapping counts overlaps <- findOverlapsOfPeaks(peaks_sample_group_granges, connectedPeaks = 'merge') - set_counts <- overlaps$venn_cnt[, colnames(overlaps$venn_cnt)] %>% - as.data.frame() %>% - mutate(group_number = row_number()) %>% - pivot_longer(!Counts & !group_number, names_to = 'sample', values_to = 'member') %>% - filter(member > 0) %>% - group_by(Counts, group_number) %>% - summarize(group = paste(sample, collapse = '&')) + n_samples <- length(names(overlaps$overlappingPeaks)) + + if (n_samples > 3){ + set_counts <- overlaps$venn_cnt[, colnames(overlaps$venn_cnt)] %>% + as.data.frame() %>% + mutate(group_number = row_number()) %>% + pivot_longer(!Counts & !group_number, names_to = 'sample', values_to = 'member') %>% + filter(member > 0) %>% + group_by(Counts, group_number) %>% + summarize(group = paste(sample, collapse = '&')) + + set_counts_upset <- set_counts$Counts + names(set_counts_upset) <- set_counts$group - set_counts_upset <- set_counts$Counts - names(set_counts_upset) <- set_counts$group - - p <- upset(fromExpression(set_counts_upset), order.by = "freq", text.scale = 1.5) - print(p) + p <- upset(fromExpression(set_counts_upset), order.by = "freq", text.scale = 1.5) + print(p) + } else{ + venn_sample_names <- gsub(paste0(current_sample_group, '_'), '', names(overlaps$all.peaks)) + invisible(capture.output(makeVennDiagram(overlaps, connectedPeaks = "merge", fill = colors[1:n_samples], + NameOfPeaks = venn_sample_names))) + } cat('\n\n') diff --git a/inst/templates/chipseq/QC/params_qc-example.R b/inst/templates/chipseq/QC/params_qc-example.R index eeb3ce5..7f2da63 100644 --- a/inst/templates/chipseq/QC/params_qc-example.R +++ b/inst/templates/chipseq/QC/params_qc-example.R @@ -2,11 +2,10 @@ # Example data -coldata_fn='~/Downloads/chipseq_peakanalysis_PRDM16.csv' +coldata_fn='/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/chipseq_peakanalysis_H3K27Ac.csv' # This folder is in the output directory inside multiqc folder -multiqc_data_dir='~/O2/s3_results/chipseq_peakanalysis_prdm16/multiqc/narrowPeak/multiqc_data/' +multiqc_data_dir='/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/multiqc/narrowPeak/multiqc_data/' # This folder is in the output director -peaks_dir = '~/O2/s3_results/chipseq_peakanalysis_prdm16/bowtie2/mergedLibrary/macs2/narrowPeak/' +peaks_dir = '/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/' # This folder is in the output directory -# counts_fn = '~/O2/s3_results/chipseq_peakanalysis_h3k27ac/bowtie2/mergedLibrary/macs2/broadPeak/consensus/H3K27ac/deseq2/H3K27ac.consensus_peaks.rds' -counts_fn = '~/O2/s3_results/chipseq_peakanalysis_prdm16/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/PRDM16/deseq2/PRDM16.consensus_peaks.rds' +counts_fn = '/workspace/data/chipseq_peakanalysis_h3k27ac_narrow/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/H3K27ac/deseq2/H3K27ac.consensus_peaks.rds' From a0ad6c92cd73c37eae481f987cc041cbe417e1cd Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Thu, 10 Oct 2024 20:02:23 +0000 Subject: [PATCH 91/93] start edits to diffbind --- inst/templates/chipseq/diffbind/diffbind.Rmd | 73 ++++++++++++-------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd index 6e7eb46..f71a20d 100644 --- a/inst/templates/chipseq/diffbind/diffbind.Rmd +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -29,7 +29,7 @@ params: denominator: WT species: mouse counts_fn: diffbind_counts.csv - results_sig_anno_fn: diffbind_results_sig_anno.csv + results_sig_anno_fn: diffbind_results_anno.csv --- Template developed with materials from https://hbctraining.github.io/main/. @@ -37,11 +37,12 @@ Template developed with materials from https://hbctraining.github.io/main/. # This set up the working directory to this file so all files can be found library(rstudioapi) setwd(fs::path_dir(getSourceEditorContext()$path)) + # NOTE: This code will check version, this is our recommendation, it may work #. other versions -stopifnot(R.version$major>= 4) # requires R4 -stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 -stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +# stopifnot(R.version$major>= 4) # requires R4 +# stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 +# stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) ``` This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. @@ -107,10 +108,12 @@ if (params$species == 'mouse'){ library(TxDb.Mmusculus.UCSC.mm10.knownGene) txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene anno_db <- 'org.Mm.eg.db' + library(org.Mm.eg.db) } else if (params$species == human){ library(TxDb.Hsapiens.UCSC.hg38.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene anno_db <- 'org.Hs.eg.db' + library(org.Hs.eg.db) } @@ -237,6 +240,10 @@ including estimation of size factors and dispersions, fitting and testing the model, evaluating the supplied contrast, and shrinking the LFCs. A p-value and FDR is assigned to each candidate binding site indicating confidence that they are differentially bound. +We use [ChIPpeakAnno](https://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html) +to identify any gene features within 1000 bp of a differentially bound site. + + ```{r DB analysis} diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Condition', params$numerator, params$denominator)) results_obj <- dba.analyze(diffbind_norm, bGreylist = F) @@ -244,11 +251,32 @@ results_obj <- dba.analyze(diffbind_norm, bGreylist = F) results_report <- dba.report(results_obj, th = 1) results_report_sig <- dba.report(results_obj) - results <- results_report %>% as.data.frame() -results_sig <- results_report_sig %>% as.data.frame() + +``` + +```{r annotate DB peaks} + +anno_data <- toGRanges(txdb, feature = 'gene') +results_anno_batch <- annotatePeakInBatch(results_report, + AnnotationData = anno_data, + output = 'overlapping', + maxgap = 1000) + +results_anno_batch_df <- results_anno_batch %>% as.data.frame() + +entrez_to_symbol <- AnnotationDbi::select(org.Mm.eg.db, results_anno_batch_df$feature, + "ENTREZID", columns = 'SYMBOL') %>% + filter(!is.na(ENTREZID)) %>% distinct() + +results_anno_batch_df <- results_anno_batch_df %>% + left_join(entrez_to_symbol %>% dplyr::select(feature = ENTREZID, gene_name = SYMBOL)) + +write_csv(results_anno_batch_df, params$results_sig_anno_fn) + ``` + ## MA plot This plot can help to: @@ -257,11 +285,10 @@ This plot can help to: - Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of binding levels and any patterns or anomalies in the dataset. ```{r MA plot} -results_for_ma <- results%>% +results_for_ma <- results_anno_batch_df%>% mutate(peak = paste(seqnames, start, end, sep = '_')) %>% mutate(t = 0) %>% dplyr::select(peak, AveExpr = Conc, logFC = Fold, P.Value = p.value, adj.P.Val = FDR, t) -rownames(results_for_ma) <- results_for_ma$peak degMA(as.DEGSet(results_for_ma, contrast = paste(params$numerator, params$denominator, sep = ' vs. '))) ``` @@ -270,7 +297,9 @@ degMA(as.DEGSet(results_for_ma, contrast = paste(params$numerator, params$denomi ```{r DB table} -results_sig %>% sanitize_datatable() +results_sig_anno_batch_df <- results_anno_batch_df %>% filter(FDR < 0.05) +results_sig_anno_batch_df %>% dplyr::select(names(results), feature, gene_name) %>% + sanitize_datatable() ``` @@ -280,15 +309,17 @@ results_sig %>% sanitize_datatable() This volcano plot shows the binding sites that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are sites that have padj < 0.05 and a log2-fold change magnitude > 0.5. Points in blue have a padj > 0.05 and a log2-fold change magnitude > 0.5. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2-fold change and padj that we have chosen. ```{r volcano, fig.height = 8} -results_mod <- results %>% +results_mod <- results_sig_anno_batch_df %>% mutate(Fold = replace(Fold, Fold < -5, -5)) %>% mutate(Fold = replace(Fold, Fold > 5, 5)) %>% - mutate(peak = paste(seqnames, start, end, sep = '_')) -show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "peak")]) + mutate(peak = paste(seqnames, start, end, sep = '_')) +# show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "gene_name")]) + +show <- results_mod %>% filter(!is.na(gene_name)) %>% slice_min(n = 6, order_by = FDR) EnhancedVolcano(results_mod, - lab= results_mod$peak, + lab= results_mod$gene_name, pCutoff = 0.05, - selectLab = c(show$peak), + # selectLab = c(show$gene_name), FCcutoff = 0.5, x = 'Fold', y = 'FDR', @@ -321,15 +352,12 @@ We use the [ChIPseeker](https://www.bioconductor.org/packages/release/bioc/html/ package to determine the genomic context of the differentially bound peaks and visualize these annotations. We consider the promoter region to be within 2000 bp in either direction of the TSS. -We also use [ChIPpeakAnno](https://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html) -to identify any gene features within 1000 bp of a differentially bound site. - ```{r annotate, echo = F} results_sig_anno <- annotatePeak(results_report_sig, tssRegion = c(-2000, 2000), TxDb = txdb, - annoDb = params$anno_db, + annoDb = anno_db, verbose = F) results_sig_anno_df <- results_sig_anno %>% as.data.frame() @@ -337,15 +365,6 @@ plotAnnoPie(results_sig_anno) plotDistToTSS(results_sig_anno) -anno_data <- toGRanges(txdb, feature = 'gene') -results_sig_anno_batch <- annotatePeakInBatch(results_report_sig, - AnnotationData = anno_data, - output = 'overlapping', - maxgap = 1000) - -results_sig_anno_batch_df <- results_sig_anno_batch %>% as.data.frame() - -write_csv(results_sig_anno_batch_df, params$results_sig_anno_fn) ``` # Functional Enrichment From 1e718b3b4bfa5249adcf1198585d7bc45aea06e2 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Fri, 11 Oct 2024 16:37:00 +0000 Subject: [PATCH 92/93] annotation and ora for diffbind, fix typo in DEG.Rmd --- inst/templates/chipseq/diffbind/diffbind.Rmd | 76 ++++++++++++++++++-- inst/templates/chipseq/libs/load_data.R | 44 ++++++++++++ inst/templates/rnaseq/DE/DEG.Rmd | 2 +- 3 files changed, 117 insertions(+), 5 deletions(-) diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd index f71a20d..ef7f88a 100644 --- a/inst/templates/chipseq/diffbind/diffbind.Rmd +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -103,6 +103,8 @@ library(qs) library(EnhancedVolcano) library(ggprism) library(ChIPseeker) +library(msigdbr) +library(fgsea) if (params$species == 'mouse'){ library(TxDb.Mmusculus.UCSC.mm10.knownGene) @@ -265,9 +267,15 @@ results_anno_batch <- annotatePeakInBatch(results_report, results_anno_batch_df <- results_anno_batch %>% as.data.frame() -entrez_to_symbol <- AnnotationDbi::select(org.Mm.eg.db, results_anno_batch_df$feature, - "ENTREZID", columns = 'SYMBOL') %>% - filter(!is.na(ENTREZID)) %>% distinct() +if(params$species == 'mouse'){ + entrez_to_symbol <- AnnotationDbi::select(org.Mm.eg.db, results_anno_batch_df$feature, + "ENTREZID", columns = 'SYMBOL') %>% + filter(!is.na(ENTREZID)) %>% distinct() +} else if (params$species == 'human'){ + entrez_to_symbol <- AnnotationDbi::select(org.Hs.eg.db, results_anno_batch_df$feature, + "ENTREZID", columns = 'SYMBOL') %>% + filter(!is.na(ENTREZID)) %>% distinct() +} results_anno_batch_df <- results_anno_batch_df %>% left_join(entrez_to_symbol %>% dplyr::select(feature = ENTREZID, gene_name = SYMBOL)) @@ -316,10 +324,12 @@ results_mod <- results_sig_anno_batch_df %>% # show <- as.data.frame(results_mod[1:6, c("Fold", "FDR", "gene_name")]) show <- results_mod %>% filter(!is.na(gene_name)) %>% slice_min(n = 6, order_by = FDR) + +results_mod <- results_mod %>% mutate(gene_name = ifelse(peak %in% show$peak , gene_name, NA)) EnhancedVolcano(results_mod, lab= results_mod$gene_name, pCutoff = 0.05, - # selectLab = c(show$gene_name), + selectLab = c(show$gene_name), FCcutoff = 0.5, x = 'Fold', y = 'FDR', @@ -369,6 +379,64 @@ plotDistToTSS(results_sig_anno) # Functional Enrichment +Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially bound genes (DEGs) from ChIP-seq. Adventages of ORA: + +- Simplicity: Easy to perform and interpret. +- Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied. +- Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets. + +```{r get databases} +if(params$species == 'human'){ + all_in_life=get_databases() +} else if (params$species == 'mouse'){ + all_in_life = get_databases('Mus musculus') +} +``` + +```{r ora} + +universe_mapping = results_anno_batch_df %>% + filter(!is.na(FDR), !is.na(feature)) %>% + dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct() + +ora_input = results_anno_batch_df %>% + filter(!is.na(FDR), FDR < 0.01, abs(Fold) > 0.3, !is.na(feature)) %>% + dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct() +all = run_fora(ora_input, universe_mapping, all_in_life) + +ora_input = results_anno_batch_df %>% + filter(!is.na(FDR), FDR < 0.01, Fold > 0.3, !is.na(feature)) %>% + dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct() +up = run_fora(ora_input, universe_mapping, all_in_life) + +ora_input = results_anno_batch_df %>% + filter(!is.na(FDR), FDR < 0.01, Fold < -0.3, !is.na(feature)) %>% + dplyr::select(ENTREZID = feature, SYMBOL = gene_name) %>% distinct() +down = run_fora(ora_input, universe_mapping, all_in_life) + +``` + + +## Significant pathways using all DB genes + +```{r all pathways} +all %>% sanitize_datatable() +``` + + +## Significant pathways using increased DB genes + +```{r up pathways} +up %>% sanitize_datatable() +``` + + +## Significant pathways using decreased DB genes + +```{r down pathways, results='asis'} +down %>% sanitize_datatable() +``` + # R session List and version of tools used for the report generation. diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R index fec3d4a..324e808 100755 --- a/inst/templates/chipseq/libs/load_data.R +++ b/inst/templates/chipseq/libs/load_data.R @@ -114,3 +114,47 @@ make_diffbind_samplesheet <- function(coldata, bam_dir, peaks_dir, column = NULL return(samplesheet) } + +get_databases=function(sps="human"){ + all_in_life=list( + msigdbr(species = sps, category = "H") %>% mutate(gs_subcat="Hallmark"), + # msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), + msigdbr(species = sps, category = "C2", subcategory = "CP:KEGG"), + # msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), + msigdbr(species = sps, category = "C5", subcategory = "GO:BP"), + msigdbr(species = sps, category = "C5", subcategory = "GO:MF") + # msigdbr(species = "human", category = "C5", subcategory = "HPO"), + # msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), + # msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") + ) + all_in_life +} + +run_fora=function(input, uni,all_in_life){ + # browser() + total_deg=length(unique(input))/length(unique(uni$ENTREZID)) + pathways_ora_all = lapply(all_in_life, function(p){ + pathway = split(x = p$entrez_gene, f = p$gs_name) + db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") + respath <- fora(pathways = pathway, + genes = unique(input$ENTREZID), + universe = unique(uni$ENTREZID), + minSize = 15, + maxSize = 500) + # coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], + # pathway, unique(input$ENTREZID), unique(uni$ENTREZID)) + as_tibble(respath) %>% + mutate(database=db_name, NES=(overlap/size)/(total_deg)) + }) %>% bind_rows() %>% + mutate(analysis="ORA") + ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% + group_by(pathway) %>% + left_join(uni, by =c("overlapGenes"="ENTREZID")) %>% + dplyr::select(pathway, padj, NES, SYMBOL, analysis, + database) %>% + group_by(pathway,padj,NES,database,analysis) %>% + summarise(genes=paste(SYMBOL,collapse = ",")) + ora_tb + +} + diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd index 0384f70..129e328 100644 --- a/inst/templates/rnaseq/DE/DEG.Rmd +++ b/inst/templates/rnaseq/DE/DEG.Rmd @@ -604,7 +604,7 @@ fa_list=lapply(de_list,function(contrast){ input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) up=run_fora(input_entrezid, universe_mapping,all_in_life) - ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc<0.3) %>% pull(gene_id) + ora_input = res %>% filter(!is.na(padj), padj<0.01, lfc<-0.3) %>% pull(gene_id) #change to the right species input_entrezid <- AnnotationDbi::select(org.Hs.eg.db, ora_input, 'ENSEMBL', columns = c('ENTREZID', 'SYMBOL')) down=run_fora(input_entrezid, universe_mapping,all_in_life) From c1aa8fcf07b4fc9747a2c82c1ec9179cf94d59d6 Mon Sep 17 00:00:00 2001 From: Lorena Pantano Date: Fri, 11 Oct 2024 14:31:57 -0400 Subject: [PATCH 93/93] fix checking version text and finish sc integration --- inst/templates/base/reports/example.Rmd | 2 +- inst/templates/chipseq/QC/QC.Rmd | 2 +- inst/templates/chipseq/diffbind/diffbind.Rmd | 2 +- .../rnaseq/DE/Cross-comparison-analysis.Rmd | 3 +- inst/templates/rnaseq/DE/DEG.Rmd | 2 +- inst/templates/rnaseq/DE/GSVA.Rmd | 3 +- inst/templates/rnaseq/DE/Intersections.Rmd | 3 +- inst/templates/rnaseq/QC/QC_nf-core.Rmd | 2 +- .../scRNA_normalization_template.rmd | 199 ++++++++++++------ inst/templates/singlecell/QC/QC.rmd | 4 +- .../singlecell_delux/CellToCell/cellchat.Rmd | 4 +- 11 files changed, 149 insertions(+), 77 deletions(-) diff --git a/inst/templates/base/reports/example.Rmd b/inst/templates/base/reports/example.Rmd index 83df986..e771265 100644 --- a/inst/templates/base/reports/example.Rmd +++ b/inst/templates/base/reports/example.Rmd @@ -28,7 +28,7 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) #. other versions stopifnot(R.version$major>= 4) # requires R4 stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 -stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) ``` This code is in this ![](https://img.shields.io/badge/status-stable-green) revision. diff --git a/inst/templates/chipseq/QC/QC.Rmd b/inst/templates/chipseq/QC/QC.Rmd index f57c8d6..8c21781 100644 --- a/inst/templates/chipseq/QC/QC.Rmd +++ b/inst/templates/chipseq/QC/QC.Rmd @@ -33,7 +33,7 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) #. other versions stopifnot(R.version$major>= 4) # requires R4 stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 -stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) ``` This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd index 6e7eb46..af93a01 100644 --- a/inst/templates/chipseq/diffbind/diffbind.Rmd +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -41,7 +41,7 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) #. other versions stopifnot(R.version$major>= 4) # requires R4 stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 -stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) ``` This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. diff --git a/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd b/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd index 0d4ac49..3a6674d 100644 --- a/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd +++ b/inst/templates/rnaseq/DE/Cross-comparison-analysis.Rmd @@ -28,8 +28,7 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) #. other versions stopifnot(R.version$major>= 4) # requires R4 stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 -stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) -stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) ``` This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. diff --git a/inst/templates/rnaseq/DE/DEG.Rmd b/inst/templates/rnaseq/DE/DEG.Rmd index 0384f70..aae0efc 100644 --- a/inst/templates/rnaseq/DE/DEG.Rmd +++ b/inst/templates/rnaseq/DE/DEG.Rmd @@ -42,7 +42,7 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) #. other versions stopifnot(R.version$major>= 4) # requires R4 stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 -stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) ``` This code is in this ![](https://img.shields.io/badge/status-stable-green) revision. diff --git a/inst/templates/rnaseq/DE/GSVA.Rmd b/inst/templates/rnaseq/DE/GSVA.Rmd index f8339a6..c7028a3 100644 --- a/inst/templates/rnaseq/DE/GSVA.Rmd +++ b/inst/templates/rnaseq/DE/GSVA.Rmd @@ -37,8 +37,7 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) #. other versions stopifnot(R.version$major>= 4) # requires R4 stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 -stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) -stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) ``` This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. diff --git a/inst/templates/rnaseq/DE/Intersections.Rmd b/inst/templates/rnaseq/DE/Intersections.Rmd index fe7816c..fc1778b 100644 --- a/inst/templates/rnaseq/DE/Intersections.Rmd +++ b/inst/templates/rnaseq/DE/Intersections.Rmd @@ -29,8 +29,7 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) #. other versions stopifnot(R.version$major>= 4) # requires R4 stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 -stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) -stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) ``` This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. diff --git a/inst/templates/rnaseq/QC/QC_nf-core.Rmd b/inst/templates/rnaseq/QC/QC_nf-core.Rmd index 32032ab..dd91975 100644 --- a/inst/templates/rnaseq/QC/QC_nf-core.Rmd +++ b/inst/templates/rnaseq/QC/QC_nf-core.Rmd @@ -37,7 +37,7 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) #. other versions stopifnot(R.version$major>= 4) # requires R4 stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 -stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) ``` This code is in this ![](https://img.shields.io/badge/status-stable-green) revision. diff --git a/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd b/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd index 148a9ce..05a9349 100644 --- a/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd +++ b/inst/templates/singlecell/Integration/scRNA_normalization_template.rmd @@ -31,8 +31,8 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) #. other versions stopifnot(R.version$major>= 4) # requires R4 stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 -stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) -stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +stopifnot(compareVersion(as.character(packageVersion("Seurat")), "5.0.0")>=0) ``` This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. @@ -43,17 +43,12 @@ This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revis ## This files needs gene_name and phase columns to work with this template cell_cycle_file="https://github.com/bcbio/resources/raw/refs/heads/main/singlecell/human_cell_cycle.csv" -# Source cell cycle markers -cc_markers = read_csv(cell_cycle_file) -stopifnot(c("gene_name", "phase") %in% colnames(cc_markers)) - seurat_obj="https://github.com/bcbio/bcbioR-test-data/raw/refs/heads/main/singlecell/tiny.rds" seurat_output="/tmp/seurat_clust.rds" source(params$project_file) ``` - ```{r setup, cache=FALSE, message=FALSE, warning=FALSE, echo=FALSE} knitr::opts_chunk$set(echo = TRUE) # Load libraries @@ -65,7 +60,19 @@ library(data.table) library(DT) library(patchwork) library(clustree) +library(ggprism) +library(grafify) +library(R.utils) #library(future) + +ggplot2::theme_set(theme_prism(base_size = 12)) +# https://grafify-vignettes.netlify.app/colour_palettes.html +# NOTE change colors here if you wish +scale_colour_discrete <- function(...) + scale_colour_manual(..., values = as.vector(grafify:::graf_palettes[["kelly"]])) + +#options(ggplot2.discrete.colour= ) + # Set seed for reproducibility set.seed(1454944673L) opts_chunk[["set"]]( @@ -73,7 +80,7 @@ opts_chunk[["set"]]( cache = FALSE, cache.lazy = FALSE, error = TRUE, - echo = FALSE, + echo = TRUE, fig.height = 5L, fig.retina = 2L, fig.width = 9.6, @@ -91,7 +98,6 @@ opts_chunk[["set"]]( - Experiment: `r experiment` - Aim: `r aim` - ## Dataset The Seurat object used as input for this report was prepared with the thresholds detailed below applied. @@ -103,6 +109,10 @@ The Seurat object used as input for this report was prepared with the thresholds ```{r load_data, cache = TRUE} +# Source cell cycle markers +cc_markers = read_csv(cell_cycle_file) +stopifnot(c("gene_name", "phase") %in% colnames(cc_markers)) + # Loading QC'd object if (isUrl(seurat_obj)){ seurat_qc <- readRDS(url(seurat_obj)) @@ -146,13 +156,12 @@ seurat_lognorm <- NormalizeData(seurat_qc, # Find variable genes (largest dispersion in expression across cells) seurat_lognorm <- FindVariableFeatures(seurat_lognorm, nfeatures = 2000) - # Scale and center data seurat_lognorm <- ScaleData(seurat_lognorm, model.use = "linear") # Calculate PCs and UMAP seurat_lognorm <- RunPCA(seurat_lognorm) -seurat_lognorm <- RunUMAP(seurat_lognorm, 1:50) +seurat_lognorm <- RunUMAP(seurat_lognorm, 1:40) ``` ## Examine highly variable genes @@ -290,45 +299,28 @@ We searched for the top 3000 genes with the largest variability in expression le **We keep each sample separate for SCT normalization.** - -```{r, eval=file.exists("seurat_sct.rds")} +```{r, eval=file.exists("seurat_sctnorm.rds")} #NOTE run the chunck below to create this object, and loading will be used while # knitting to speed up the rendering -seurat_sctnorm = readRDS("seurat_sct.rds") +seurat_sctnorm = readRDS("seurat_sctnorm.rds") ``` -```{r rna_norm1, warning=FALSE, message=FALSE, eval=!exists("seurat_sctnorm")} +```{r eval=!exists("seurat_sctnorm"), warning=FALSE, message=FALSE} #NOTE: this should be ran previous rendering to prepare the object ## Note that this single command replaces NormalizeData(), ScaleData(), and FindVariableFeatures() ## SCT can be run with and without regressing out variables. Generally we do not regress out covariates. However, we provide both options below. +seurat_lognorm[["RNA"]] <- split(seurat_lognorm[["RNA"]], + f = seurat_lognorm$orig.ident) -## To properly integrate with harmony we split our object by sample first. - -# split_sctnorm <- SplitObject(seurat_lognorm, split.by = "orig.ident") -# -# for (i in 1:length(split_sctnorm)) { -# split_sctnorm[[i]] <- SCTransform(split_sctnorm[[i]], -# vst.flavor = "v2", -# variable.features.n = 3000) -# } -# -# integ_features <- SelectIntegrationFeatures(object.list = split_seurat, -# nfeatures = 3000) -# split_seurat <- PrepSCTIntegration(object.list = split_seurat, -# anchor.features = integ_features) -#for (i in 1:length(split_seurat)) { -# split_seurat[[i]] <- SCTransform(split_seurat[[i]], vars.to.regress = c("mitoRatio"), vst.flavor = "v2", variable.features.n = 3000) -# } - -seurat_lognorm[["RNA"]] <- split(seurat_lognorm[["RNA"]], f= seurat_lognorm$orig.ident) seurat_sctnorm <- SCTransform(seurat_lognorm, - vst.flavor = "v2", - variable.features.n = 3000) -saveRDS(seurat_sctnorm, file="seurat_sct.rds") + vst.flavor = "v2", + # vars.to.regress = c("mitoRatio") + variable.features.n = 3000) +saveRDS(seurat_sctnorm, file="seurat_sctnorm.rds") ``` -## Look at UMAPs post SCT +### Look at UMAPs post SCT The plots below show the same variables as before, this time **displayed on the UMAP calculated after applying SCT-normalization**. @@ -438,7 +430,75 @@ g <- signaturePlot(seurat_sctnorm, g[[1]] + ggtitle("S1") | g[[2]] + ggtitle("S2") ``` -# Run harmony and umaps again + +# Integration + +## CCA integration + +```{r, eval=file.exists("seurat_cca.rds")} +#NOTE run the chunck below to create this object, and loading will be used while +# knitting to speed up the rendering +seurat_cca= readRDS("seurat_cca.rds") +``` + +```{r rna_cca, warning=FALSE, message=FALSE, eval=!exists("seurat_cca")} +#NOTE: this should be ran previous rendering to prepare the object +## Note that this single command replaces NormalizeData(), ScaleData(), and FindVariableFeatures() + +## SCT can be run with and without regressing out variables. Generally we do not regress out covariates. However, we provide both options below. + +## To properly integrate with harmony we split our object by sample first. +split_sctnorm <- SplitObject(seurat_lognorm, split.by = "orig.ident") +# NOTE If we have a large dataset, then we might need to adjust the limit for allowable object sizes within R +# options(future.globals.maxSize = 4000 * 1024^2) +for (i in 1:length(split_sctnorm)) { + split_sctnorm[[i]] <- SCTransform(split_sctnorm[[i]], + vst.flavor = "v2", + # vars.to.regress = c("mitoRatio") + variable.features.n = 3000) +} + +integ_features <- SelectIntegrationFeatures(object.list = split_sctnorm, + nfeatures = 3000) +split_sctnorm <- PrepSCTIntegration(object.list = split_sctnorm, + anchor.features = integ_features) + +# Find best buddies - can take a while to run +integ_anchors <- FindIntegrationAnchors(object.list = split_sctnorm, + normalization.method = "SCT", + anchor.features = integ_features) +# Integrate across conditions +seurat_cca <- IntegrateData(anchorset = integ_anchors, + normalization.method = "SCT") + +# Rejoin the layers in the RNA assay that we split earlier +seurat_cca[["RNA"]] <- JoinLayers(seurat_cca[["RNA"]]) + +# Run PCA +seurat_cca <- RunPCA(object = seurat_cca) + +# Run UMAP +seurat_cca <- RunUMAP(seurat_cca, + reduction.name = "umap.cca", + dims = 1:40) + +saveRDS(seurat_cca, file="seurat_cca.rds") +``` + +```{r} +p1 <- DimPlot(seurat_lognorm, group.by = "orig.ident", + reduction = "umap") + + theme(legend.position = "bottom") + + ggtitle("pre-integration") +p2 <- DimPlot(seurat_cca, group.by = "orig.ident", + reduction = "umap.cca") + + theme(legend.position = "bottom") + + ggtitle("post-integration") + +p1 | p2 +``` + +## Harmony If cells cluster by sample, condition, batch, dataset, modality, this integration step can greatly improve the clustering and the downstream analyses. @@ -446,24 +506,27 @@ To integrate, we will use the shared highly variable genes (identified using SCT We use [`Harmony`](https://portals.broadinstitute.org/harmony/articles/quickstart.html), which is based on a transformation of principal components (PCs) to find similarities across datasets. Here we group samples by the original sample id. -```{r, eval=file.exists("seurat_sct.rds")} +```{r rna_hrmny, eval=file.exists("seurat_harmony.rds")} #NOTE run the chunck below to create this object, and loading will be used while # knitting to speed up the rendering -seurat_harmony = readRDS("seurat_sct.rds") +seurat_harmony = readRDS("seurat_harmony.rds") ``` -```{r, eval=exists("seurat_harmony"), warning=FALSE, message=FALSE} +```{r, eval=!exists("seurat_harmony"), warning=FALSE, message=FALSE} ## Here seurat will integrate on the level of sample id. If you want to integrate on other aspects the SCT normalization will need to be done with all of the data together. +# seurat_sctnorm[["RNA"]] <- split(seurat_sctnorm[["RNA"]], f = seurat_sctnorm$orig.ident) seurat_harmony <- IntegrateLayers(object = seurat_sctnorm, method = HarmonyIntegration, - orig.reduction = "pca", new.reduction = 'harmony', - assay = "SCT", verbose = FALSE) -seurat_sctnorm <- RunPCA(seurat_sctnorm) -seurat_harmony <- RunUMAP(seurat_harmony, reduction = "harmony", dims = 1:40, reduction.name = "umap.harmony") + orig.reduction = "pca", + new.reduction = 'harmony', + assay = "SCT", verbose = FALSE) +seurat_harmony <- RunPCA(seurat_harmony) +seurat_harmony <- RunUMAP(seurat_harmony, reduction = "harmony", + dims = 1:40, + reduction.name = "umap.harmony") saveRDS(seurat_harmony, file="seurat_harmony.rds") ``` - ## Pre vs. Post integration ```{r dimplot_both all, echo=FALSE} @@ -471,12 +534,16 @@ p1 <- DimPlot(seurat_sctnorm, group.by = "orig.ident", reduction = "umap") + theme(legend.position = "bottom") + ggtitle("pre-integration") -p2 <- DimPlot(seurat_harmony, group.by = "orig.ident", +p2 <- DimPlot(seurat_cca, group.by = "orig.ident", + reduction = "umap.cca") + + theme(legend.position = "bottom") + + ggtitle("post-integration CCA") +p3 <- DimPlot(seurat_harmony, group.by = "orig.ident", reduction = "umap.harmony") + theme(legend.position = "bottom") + - ggtitle("post-integration") + ggtitle("post-integration Harmony") -p1 | p2 +p1 | p2 | p3 ``` ## Clustering @@ -486,11 +553,13 @@ For single-modality scRNA-seq analysis, `Seurat` clusters the cells using a Louv A limitation of this approach is that the number of identified clusters depends on the chosen resolution, a parameter that must be set by the user and does not necessarily reflect the underlying biology of the dataset. For most single-cell datasets, a resolution of 0.1 to 1 will provide a reasonable number of clusters. Complex datasets with multiple cell types may require a larger resolution, and vice versa. ```{r find_neighbors all, echo=TRUE} -seurat_harmony <- FindNeighbors(seurat_harmony, assay = "SCT", - reduction = "harmony", dims = 1:40) +# NOTE use seurat_harmony or seurat_cca +# seurat_clust <- FindNeighbors(seurat_harmony, assay = "SCT", +# reduction = "harmony", dims = 1:40) +seurat_clust <- FindNeighbors(seurat_cca, assay = "SCT", dims = 1:40) # check graph names names(seurat_harmony@graphs) # DefaultAssay(object = seurat_harmony[["pca"]]) -seurat_clust <- FindClusters(object = seurat_harmony, +seurat_clust <- FindClusters(object = seurat_clust, resolution = c(0.1, 0.2, 0.4, 0.6, 0.8, 1.0), verbose = FALSE) ``` @@ -506,7 +575,13 @@ meta = seurat_clust@meta.data meta = na.omit(meta) ## Change the prefix to match your clusters -clustree(meta, prefix = "SCT_snn_res.") +# NOTE: this if you have run HARMONY +# prefix_clu <- "SNN_res." +# show_this <- "umap.harmony" +# NOTE: this if you have run CCA +prefix_clu <- "integrated_snn_res." +show_this <- "umap.cca" +clustree(meta, prefix = prefix_clu) ``` ## Visualize clusters {.tabset} @@ -517,9 +592,9 @@ We take a look at how the clusters look at resolutions 0.1, 0.2,0.4, and 0.6 ```{r umap_0.1} cluster_res <- 0.2 -Idents(object = seurat_clust) <- paste0("SCT_snn_res.", cluster_res) +Idents(object = seurat_clust) <- paste0(prefix_clu, cluster_res) DimPlot(seurat_clust, - reduction = "umap.harmony", + reduction = show_this, split.by = "orig.ident", label = TRUE) ``` @@ -530,9 +605,9 @@ DimPlot(seurat_clust, ```{r umap_0.2} cluster_res <- 0.2 -Idents(object = seurat_clust) <- paste0("SCT_snn_res.", cluster_res) +Idents(object = seurat_clust) <- paste0(prefix_clu, cluster_res) DimPlot(seurat_clust, - reduction = "umap.harmony", + reduction = show_this, split.by = "orig.ident", label = TRUE) ``` @@ -543,9 +618,9 @@ DimPlot(seurat_clust, ```{r umap_0.4} cluster_res <- 0.4 -Idents(object = seurat_clust) <- paste0("SCT_snn_res.", cluster_res) +Idents(object = seurat_clust) <- paste0(prefix_clu, cluster_res) DimPlot(seurat_clust, - reduction = "umap.harmony", + reduction = show_this, split.by = "orig.ident", label = TRUE) ``` @@ -555,9 +630,9 @@ DimPlot(seurat_clust, ```{r umap_0.6} cluster_res <- 0.6 -Idents(object = seurat_clust) <- paste0("SCT_snn_res.", cluster_res) +Idents(object = seurat_clust) <- paste0(prefix_clu, cluster_res) DimPlot(seurat_clust, - reduction = "umap.harmony", + reduction = show_this, split.by = "orig.ident", label = TRUE) ``` diff --git a/inst/templates/singlecell/QC/QC.rmd b/inst/templates/singlecell/QC/QC.rmd index f2de4eb..6c24e4c 100644 --- a/inst/templates/singlecell/QC/QC.rmd +++ b/inst/templates/singlecell/QC/QC.rmd @@ -17,8 +17,8 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) #. other versions stopifnot(R.version$major>= 4) # requires R4 stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 -stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) -stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +stopifnot(compareVersion(as.character(packageVersion("Seurat")), "5.0.0")>=0) ``` This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. diff --git a/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd b/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd index 330a049..6e49644 100644 --- a/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd +++ b/inst/templates/singlecell_delux/CellToCell/cellchat.Rmd @@ -32,8 +32,8 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) #. other versions stopifnot(R.version$major>= 4) # requires R4 stopifnot(compareVersion(R.version$minor,"3.3")==0) # requires >=4.3.3 -stopifnot(compareVersion(BiocManager::version(), "3.18")>=0) -stopifnot(compareVersion(package.version("Seurat"), "5.0.0")>=0) +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +stopifnot(compareVersion(as.character(packageVersion("Seurat")), "5.0.0")>=0) ``` This code is in this ![](https://img.shields.io/badge/status-draft-grey) revision.