From 33f64dc3e1ed223eff0eaa6a121bf19673a2865c Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Tue, 1 Oct 2024 18:00:40 +0000 Subject: [PATCH 1/6] diffbind text, write files --- .gitignore | 2 + inst/templates/chipseq/diffbind/diffbind.Rmd | 82 +++++++++++++++++++- 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6af5e77..a6d9e0e 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,5 @@ inst/rmarkdown/templates/rnaseq/skeleton/DE/Multiplicative_DGE_Analysis.Rmd .quarto inst/templates/chipseq/QC/QC.html *.html +*.csv +*.qs \ No newline at end of file diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd index 9487e1d..ad51426 100644 --- a/inst/templates/chipseq/diffbind/diffbind.Rmd +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -28,7 +28,10 @@ params: denominator: WT # species = mouse or human species: mouse + counts_fn: diffbind_counts.csv + results_sig_anno_fn: diffbind_results_sig_anno.csv --- +Template developed with materials from https://hbctraining.github.io/main/. ```{r, cache = FALSE, message = FALSE, warning=FALSE} # This set up the working directory to this file so all files can be found @@ -56,6 +59,22 @@ source(params$functions_file) - Analyst: `r analyst` - Experiment: `r experiment` +# Methodology + +[DiffBind](https://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf) +is an R Bioconductor package which provides functions for processing +DNA data enriched for genomic loci, including ChIPseq data enriched for sites +where specific protein/DNA binding occurs or histone marks are enriched. + +DiffBind is mainly used for identifying sites that are differentially enriched +between two or more sample groups. It works primarily with sets of peak calls +('peaksets'), which are sets of genomic intervals representing candidate protein +binding sites for each sample. It includes functions that support the processing +of peaksets, including overlapping and merging peak sets across an entire dataset, +counting sequencing reads in overlapping intervals in peak sets, and identifying +statistically significantly differentially bound sites based on evidence of +binding affinity (measured by differences in read densities). To this end it uses +statistical routines developed in an RNA-Seq context (primarily the Bioconductor packages [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) and [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)). ```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} library(tidyverse) @@ -137,6 +156,27 @@ samplesheet <- make_diffbind_samplesheet(coldata, bam_dir, peaks_dir, params$fac samplesheet %>% dplyr::select(SampleID, Replicate, Condition, Factor, ControlID) %>% sanitize_datatable() ``` +# Calculate counts matrix + +The first step is to read in a set of peaksets and associated metadata. +This is done using the DiffBind sample sheet. Once the peaksets are read in, +a merging function finds all overlapping peaks and derives a single set of unique +genomic intervals covering all the supplied peaks (a consensus peakset for +the experiment). A region is considered for the consensus set if it appears in +more than two of the samples. This consensus set represents the overall set of +candidate binding sites to be used in further analysis. + +The next step is to take the alignment files and compute count information for +each of the peaks/regions in the consensus set. In this step, for each of the +consensus regions, DiffBind uses the number of aligned reads in the ChIP sample +and the input sample to compute a normalized read count for each sample at every +potential binding site. The peaks in the consensus peakset may be re-centered and +trimmed based on calculating their summits (point of greatest read overlap) in +order to provide more standardized peak intervals. + +We then normalize the count matrix to adjust for varying +library size, and we use the normalized counts for further analysis including PCA. + ```{r create diffbind counts object, eval = !file.exists(params$diffbind_counts_file)} diffbind_obj <- dba(sampleSheet = samplesheet, scoreCol = 5) @@ -150,6 +190,12 @@ qsave(diffbind_counts, params$diffbind_counts_file) ``` # PCA + +Principal Component Analysis (PCA) is a statistical technique used to simplify +high-dimensional data by identifying patterns and reducing the number of variables. +In the context of ChIPseq, PCA helps analyze large datasets containing information +about thousands of binding locations across different samples (e.g., tissues, cells). + ```{r PCA} diffbind_counts <- qread(params$diffbind_counts_file) @@ -162,6 +208,10 @@ norm_counts <- dba.peakset(diffbind_norm, bRetrieve=TRUE, DataType=DBA_DATA_FRAM rownames(norm_counts) <- norm_counts$peak norm_counts <- norm_counts %>% dplyr::select(-peak) %>% as.matrix() norm_counts_log <- log2(norm_counts + 1) +norm_counts_log_df <- norm_counts_log %>% as.data.frame() %>% + rownames_to_column('peak') + +write_csv(norm_counts_log_df, params$counts_fn) coldata_for_pca <- coldata[colnames(norm_counts), ] @@ -174,6 +224,11 @@ degPCA(norm_counts_log, coldata_for_pca, condition = params$factor_of_interest) # Differentially Bound Peaks +A standardized differential analysis is performed using DiffBind and the DESeq2 package, +including estimation of size factors and dispersions, fitting and testing the +model, evaluating the supplied contrast, and shrinking the LFCs. A p-value and FDR +is assigned to each candidate binding site indicating confidence that they are differentially bound. + ## Table ```{r DE analysis} diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Factor', params$numerator, params$denominator)) @@ -190,6 +245,9 @@ results_sig %>% sanitize_datatable() ``` ## Volcano plot + +This volcano plot shows the binding sites that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are sites that have padj < 0.05 and a log2-fold change magnitude > 0.5. Points in blue have a padj > 0.05 and a log2-fold change magnitude > 0.5. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2-fold change and padj that we have chosen. + ```{r volcano, fig.height = 8} results_mod <- results %>% mutate(Fold = replace(Fold, Fold < -5, -5)) %>% @@ -211,6 +269,9 @@ EnhancedVolcano(results_mod, ``` ## Plot top peaks + +We visualize the log2 normalized read counts at a few of the most differentially +bound sites. ```{r plot top peaks, fig.width = 8, fig.height = 6} norm_counts_log_long <- norm_counts_log %>% as.data.frame() %>% rownames_to_column('peak') %>% @@ -225,6 +286,13 @@ ggplot(norm_counts_log_long_top, aes(x = .data[[params$factor_of_interest]], y = ## Annotate DB peaks +We use the [ChIPseeker](https://www.bioconductor.org/packages/release/bioc/html/ChIPseeker.html) +package to determine the genomic context of the differentially bound peaks and +visualize these annotations. We consider the promoter region to be within 2000 bp in either direction of the TSS. + +We also use [ChIPpeakAnno](https://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html) +to identify any gene features within 1000 bp of a differentially bound site. + ```{r annotate, echo = F} results_sig_anno <- annotatePeak(results_report_sig, @@ -245,4 +313,16 @@ results_sig_anno_batch <- annotatePeakInBatch(results_report_sig, maxgap = 1000) results_sig_anno_batch_df <- results_sig_anno_batch %>% as.data.frame() -``` \ No newline at end of file + +write_csv(results_sig_anno_batch_df, params$results_sig_anno_fn) +``` + +# Functional Enrichment + +# R session + +List and version of tools used for the report generation. + +```{r} +sessionInfo() +``` From 3ec2536b4447eaa4e2b48b33822cd39d91eeb5bc Mon Sep 17 00:00:00 2001 From: Alex Bartlett <74612800+abartlett004@users.noreply.github.com> Date: Tue, 1 Oct 2024 14:15:31 -0400 Subject: [PATCH 2/6] Update README.md --- inst/templates/rnaseq/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inst/templates/rnaseq/README.md b/inst/templates/rnaseq/README.md index 6de2117..e6e6cbc 100644 --- a/inst/templates/rnaseq/README.md +++ b/inst/templates/rnaseq/README.md @@ -5,8 +5,8 @@ Make sure there is a project name for this. ## Run data with nf-core rnaseq - Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) -- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input/rawdata` folder -- Prepare the CSV file according this [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: +- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input` folder +- Prepare the CSV file according these [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: ```csv sample,fastq_1,fastq_2,strandedness @@ -22,11 +22,11 @@ You can add more columns to this file with more metadata, and use this file as t - Upload file to our `Datasets` in Seqera using the name of the project but starting with `rnaseq-pi_lastname-hbc_code` - Go to `Launchpad`, select `nf-core_rnaseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 -- When pipeline is down, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder +- When pipeline is done, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder ## Downstream analysis -Please, modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. +Modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. ### QC @@ -37,7 +37,7 @@ Read instruction in the R and Rmd scripts to render it. ### DE -`DE/DEG.Rmd` is a template for two groups comparison. `params_de.R` has the information of the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. +`DE/DEG.Rmd` is a template for comparison between two groups. `params_de.R` has the information for the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: From 8181c8ac96e8b48b5f255e12859304be9a1d1e96 Mon Sep 17 00:00:00 2001 From: Alex Bartlett <74612800+abartlett004@users.noreply.github.com> Date: Tue, 1 Oct 2024 14:27:03 -0400 Subject: [PATCH 3/6] Update readme.md --- inst/templates/chipseq/readme.md | 41 ++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/inst/templates/chipseq/readme.md b/inst/templates/chipseq/readme.md index 112ddac..efa37c1 100755 --- a/inst/templates/chipseq/readme.md +++ b/inst/templates/chipseq/readme.md @@ -2,11 +2,44 @@ Make sure there is a valid project name, and modify `information.R` with the right information for your project. You can use this file with any other Rmd to include the project/analysis information. +## Run data with nf-core rnaseq + +- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) +- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input` folder +- Prepare the CSV file according these [instructions](https://nf-co.re/chipseq/2.0.0/docs/usage/). File should look like this: + +```csv +sample,fastq_1,fastq_2,antibody,control +WT_BCATENIN_IP_REP1,BLA203A1_S27_L006_R1_001.fastq.gz,,BCATENIN,WT_INPUT +WT_BCATENIN_IP_REP2,BLA203A25_S16_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT +WT_BCATENIN_IP_REP2,BLA203A25_S16_L002_R1_001.fastq.gz,,BCATENIN,WT_INPUT +WT_BCATENIN_IP_REP2,BLA203A25_S16_L003_R1_001.fastq.gz,,BCATENIN,WT_INPUT +WT_BCATENIN_IP_REP3,BLA203A49_S40_L001_R1_001.fastq.gz,,BCATENIN,WT_INPUT +WT_INPUT_REP1,BLA203A6_S32_L006_R1_001.fastq.gz,,, +WT_INPUT_REP2,BLA203A30_S21_L001_R1_001.fastq.gz,,, +WT_INPUT_REP2,BLA203A30_S21_L002_R1_001.fastq.gz,,, +WT_INPUT_REP3,BLA203A31_S21_L003_R1_001.fastq.gz,,, +``` + +You can add more columns to this file with more metadata, and use this file as the `coldata` file in the templates. + +- Upload file to our `Datasets` in Seqera using the name of the project but starting with `chipseq-pi_lastname-hbc_code` +- Go to `Launchpad`, select `nf-core_chipseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` + - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 +- When pipeline is done, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder + ## QC -`QC/QC.Rmd` is a template for QC metrics. It includes basic read-level statistics, peak quality information, sample correlation analysis, and PCA. +`QC/QC.Rmd` is a template for QC metrics. It includes basic read-level statistics, peak quality information, sample correlation analysis, and PCA that it produces using the above samplesheet and output from the nf-core pipeline. Use `params_qc.R` to provide the required input files. + +## DiffBind + +`diffbind/diffbind.Rmd` is a template for comparing peak binding betweeen two groups. Use `params_diffbind.R` to provide the required input files. -## DropBox +On the YAML header file of the Rmd you can specify some parameters including the conditions to be compared, the genome used, and the desired output file names. This template has examples of: +* calculating a peak counts matrix +* PCA +* differential binding analyiss +* peak annotation +* functional analysis (coming soon) -- In `reports/QC` - - [ ] copy QC `Rmd/R/html/figures` From 255147933c10f0ed27f545b5cbd1461b114dcc83 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Tue, 1 Oct 2024 18:27:23 +0000 Subject: [PATCH 4/6] add non-example params files --- inst/templates/chipseq/QC/params_qc.R | 10 ++++++++++ inst/templates/chipseq/diffbind/params_diffbind.R | 9 +++++++++ 2 files changed, 19 insertions(+) create mode 100644 inst/templates/chipseq/QC/params_qc.R create mode 100644 inst/templates/chipseq/diffbind/params_diffbind.R diff --git a/inst/templates/chipseq/QC/params_qc.R b/inst/templates/chipseq/QC/params_qc.R new file mode 100644 index 0000000..125baf5 --- /dev/null +++ b/inst/templates/chipseq/QC/params_qc.R @@ -0,0 +1,10 @@ +# info params + + +coldata_fn='/path/to/nf-core/samplesheet.csv' +# This folder is in the nf-core output directory inside multiqc folder +multiqc_data_dir='/path/to/nf-core/output/multiqc/narrowPeak/multiqc_data/' +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak +peaks_dir = '/path/to/nf-core/output/bowtie2/mergedLibrary/macs2/narrowPeak/' +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak, also includes antibody name +counts_fn = '/path/to/nf-core/output/bowtie2/mergedLibrary/macs2/narrowPeak/consensus/antibody/deseq2/antibody.consensus_peaks.rds' diff --git a/inst/templates/chipseq/diffbind/params_diffbind.R b/inst/templates/chipseq/diffbind/params_diffbind.R new file mode 100644 index 0000000..0a35a81 --- /dev/null +++ b/inst/templates/chipseq/diffbind/params_diffbind.R @@ -0,0 +1,9 @@ +# info params + +coldata_fn='/path/to/nf-core/samplesheet.csv' + +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak +peaks_dir = '/path/to/nf-core/output/bowtie2/mergedLibrary/macs2/narrowPeak/' + +# This folder is in the nf-core output directory +bam_dir = '/path/to/nf-core/output/bowtie2/mergedLibrary/' From 5c912d7170aba34b0214c952a2a21646e488e062 Mon Sep 17 00:00:00 2001 From: Alex Bartlett <74612800+abartlett004@users.noreply.github.com> Date: Tue, 1 Oct 2024 16:01:55 -0400 Subject: [PATCH 5/6] Update readme.md --- inst/templates/chipseq/readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/templates/chipseq/readme.md b/inst/templates/chipseq/readme.md index efa37c1..f0e88b3 100755 --- a/inst/templates/chipseq/readme.md +++ b/inst/templates/chipseq/readme.md @@ -43,3 +43,4 @@ On the YAML header file of the Rmd you can specify some parameters including the * peak annotation * functional analysis (coming soon) +This template writes to CSV a log2 normalized counts matrix of peaks x samples as well as the annotated significant results of the differential binding analysis. From c9d6339fbae729ddc7938a727f543fe44eb75e67 Mon Sep 17 00:00:00 2001 From: Alex Bartlett Date: Thu, 3 Oct 2024 21:37:09 +0000 Subject: [PATCH 6/6] begin feedback from meeting --- inst/templates/chipseq/diffbind/diffbind.Rmd | 44 +++++++++++++++----- inst/templates/chipseq/libs/load_data.R | 4 +- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/inst/templates/chipseq/diffbind/diffbind.Rmd b/inst/templates/chipseq/diffbind/diffbind.Rmd index ad51426..422faf5 100644 --- a/inst/templates/chipseq/diffbind/diffbind.Rmd +++ b/inst/templates/chipseq/diffbind/diffbind.Rmd @@ -23,7 +23,7 @@ params: project_file: ../information.R functions_file: ../libs/load_data.R diffbind_counts_file: diffbind_counts.qs - factor_of_interest: genotype + condition_of_interest: genotype numerator: cKO denominator: WT # species = mouse or human @@ -41,7 +41,7 @@ setwd(fs::path_dir(getSourceEditorContext()$path)) ```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} -# 1. set up factor_of_interest parameter from parameter above or manually +# 1. set up condition_of_interest parameter from parameter above or manually # this is used to color plots, it needs to be part of the metadata # 2. Set input files in this file source(params$params_file) @@ -144,9 +144,9 @@ coldata <- load_coldata(coldata_fn) # make_diffbind_samplesheet is a function provided by bcbioR to help assemble DiffBind's samplesheet # using the nf-core samplesheet and output. In the resulting DiffBind counts object, it -# encodes your factor of interest as "Factor" and the antibody as "Condition" +# encodes your condition of interest as "Condition" and the antibody as "Factor" -samplesheet <- make_diffbind_samplesheet(coldata, bam_dir, peaks_dir, params$factor_of_interest) +samplesheet <- make_diffbind_samplesheet(coldata, bam_dir, peaks_dir, params$condition_of_interest) # if necessary, one additional covariate of interest can be encoded as "Tissue" @@ -217,11 +217,12 @@ coldata_for_pca <- coldata[colnames(norm_counts), ] stopifnot(all(colnames(norm_counts) == rownames(coldata_for_pca))) -degPCA(norm_counts_log, coldata_for_pca, condition = params$factor_of_interest) + +degPCA(norm_counts_log, coldata_for_pca, condition = params$condition_of_interest) + scale_color_cb_friendly() ``` + # Differentially Bound Peaks A standardized differential analysis is performed using DiffBind and the DESeq2 package, @@ -229,21 +230,44 @@ including estimation of size factors and dispersions, fitting and testing the model, evaluating the supplied contrast, and shrinking the LFCs. A p-value and FDR is assigned to each candidate binding site indicating confidence that they are differentially bound. -## Table -```{r DE analysis} -diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Factor', params$numerator, params$denominator)) +```{r DB analysis} +diffbind_norm <- dba.contrast(diffbind_norm, contrast = c('Condition', params$numerator, params$denominator)) results_obj <- dba.analyze(diffbind_norm, bGreylist = F) results_report <- dba.report(results_obj, th = 1) results_report_sig <- dba.report(results_obj) + results <- results_report %>% as.data.frame() results_sig <- results_report_sig %>% as.data.frame() +``` + +## MA plot + +This plot can help to: +- Identify Differential Binding: Sites that show a significant log-fold change (M value away from 0) indicate changes in binding between conditions. +- Assess Data Quality: The plot can help in identifying biases or systematic errors in the data. Ideally, most points should scatter around the M=0 line, indicating that there is no significant systematic difference between the conditions. +- Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of binding levels and any patterns or anomalies in the dataset. + +```{r MA plot} +results_for_ma <- results%>% + mutate(peak = paste(seqnames, start, end, sep = '_')) %>% + mutate(t = 0) %>% + dplyr::select(peak, AveExpr = Conc, logFC = Fold, P.Value = p.value, adj.P.Val = FDR, t) +rownames(results_for_ma) <- results_for_ma$peak +degMA(as.DEGSet(results_for_ma, contrast = paste(params$numerator, params$denominator, sep = ' vs. '))) + +``` + +## Table of differentially bound peaks + +```{r DB table} results_sig %>% sanitize_datatable() ``` + ## Volcano plot This volcano plot shows the binding sites that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are sites that have padj < 0.05 and a log2-fold change magnitude > 0.5. Points in blue have a padj > 0.05 and a log2-fold change magnitude > 0.5. Grey points are non-significant. The dashed lines correspond to the cutoff values of log2-fold change and padj that we have chosen. @@ -261,7 +285,7 @@ EnhancedVolcano(results_mod, FCcutoff = 0.5, x = 'Fold', y = 'FDR', - title = paste(params$factor_of_interest, ':', params$numerator, 'vs', params$denominator), + title = paste(params$condition_of_interest, ':', params$numerator, 'vs', params$denominator), col=as.vector(colors[c("dark_grey", "light_blue", "purple", "purple")]), subtitle = "", drawConnectors = T, max.overlaps = Inf) @@ -280,7 +304,7 @@ norm_counts_log_long <- norm_counts_log %>% as.data.frame() %>% norm_counts_log_long_top <- norm_counts_log_long %>% filter(peak %in% show$peak) -ggplot(norm_counts_log_long_top, aes(x = .data[[params$factor_of_interest]], y = norm_counts_log2)) + +ggplot(norm_counts_log_long_top, aes(x = .data[[params$condition_of_interest]], y = norm_counts_log2)) + facet_wrap(~peak, scale = 'free_y') + geom_boxplot() ``` diff --git a/inst/templates/chipseq/libs/load_data.R b/inst/templates/chipseq/libs/load_data.R index e233ec4..fec3d4a 100755 --- a/inst/templates/chipseq/libs/load_data.R +++ b/inst/templates/chipseq/libs/load_data.R @@ -103,9 +103,9 @@ make_diffbind_samplesheet <- function(coldata, bam_dir, peaks_dir, column = NULL coldata_for_diffbind <- coldata %>% filter(!is.na(control) & control != '') %>% - dplyr::rename(ControlID = control, SampleID = sample, Condition = antibody) %>% + dplyr::rename(ControlID = control, SampleID = sample, Factor = antibody) %>% separate(SampleID, into = c('sample', 'Replicate'), remove = F, sep = '_REP') - coldata_for_diffbind$Factor <- coldata_for_diffbind[[column]] + coldata_for_diffbind$Condition <- coldata_for_diffbind[[column]] samplesheet <- coldata_for_diffbind %>% left_join(bam_files %>% dplyr::select(SampleID = sample, bamReads = bam), by = 'SampleID') %>%