Skip to content

Commit

Permalink
save QC
Browse files Browse the repository at this point in the history
  • Loading branch information
lpantano committed Jul 25, 2024
1 parent 4c723f1 commit 2a4cf2f
Showing 1 changed file with 43 additions and 41 deletions.
84 changes: 43 additions & 41 deletions inst/templates/rnaseq/qc/QC_nf-core.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -17,45 +17,41 @@ output:
editor_options:
chunk_output_type: console
params:
# params_file: params_qc_nf-core-example.R # example data
# Fill this file with the right paths to nfcore output
params_file: params_qc_nf-core.R
# Put hg38, mm10, mm39, or other
genome: hg38
project_file: ../information.R
factor_of_interest: sample_type
---

```{r, echo = F}
```{r}
# This set up the working directory to this file so all files can be found
library(rstudioapi)
setwd(fs::path_dir(getSourceEditorContext()$path))
```

```{r}
metadata_fn="{{metadata_fn}}"
se_object="{{se_object}}"
# This folder is in the output directory inside multiqc folder
multiqc_data_dir="{{multiqc_data_dir}}"
# This file is inside the genome folder in the output directory
gtf_fn="{{gtf_fn}}"
```

```{r source_params, echo = F}
#knitr::opts_knit$set(root.dir = getSourceEditorContext()$path)
# 1. set up factor_of_interest parameter from parameter above or manualy
# 1. set up factor_of_interest parameter from parameter above or manually
# this is used to color plots, it needs to be part of the metadata
factor_of_interest=params$factor_of_interest
genome=params$genome
# 2. Set input files in this file
# This is the file used to run nf-core or compatible to that
metadata_fn='/Path/to/metadata/meta.csv'
# This file is inside star_salmon/ folder
se_object='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.rds'
# This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder
multiqc_data_dir='/path/to/nf-core/output/star_salmon/multiqc_report_data'
# This file is inside the genome folder in the output directory, use this only for non-model organism
# gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf'
source(params$params_file)
# 3. If you set up this file, project information will be printed below and
#. it can be reused for other Rmd files.
source(params$project_file)
```

# Overview

{{ project }}
- Project: `r project`
- PI: `r PI`
- Analyst: `r analyst`
- Experiment: `r experiment`


```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE}
library(tidyverse)
Expand All @@ -80,7 +76,7 @@ opts_chunk[["set"]](
prompt = FALSE,
tidy = FALSE,
warning = FALSE,
fig.height = 6)
fig.height = 4)
```


Expand Down Expand Up @@ -149,28 +145,24 @@ meta_df=read_csv(metadata_fn) %>%
meta_df$sample <- make.names(meta_df$sample)
order <- meta_df$sample
# remove some columns
meta_df <- data.frame(meta_df[,!(colnames(meta_df) %in% c("fastq_1", "fastq_2", "strandedness"))])
ggplot(meta_df, aes(.data[[factor_of_interest]],
fill = .data[[factor_of_interest]])) +
geom_bar() + ylab("") + xlab("") + ylab("# of samples") +
scale_fill_cb_friendly()
scale_fill_cb_friendly() + theme(axis.text.x=element_text(angle = 90, vjust = 0.5), legend.position = "none")
```


```{r}
```{r load_data}
# read counts from SE object
se <- readRDS(se_object)
raw_counts <- assays(se)[["counts"]] %>% round() %>%
as.matrix()
raw_counts=raw_counts[rowSums(raw_counts)!=0,]
```

```{r prepare metrics}
```{r prepare_metrics}
# Get metrics from nf-core into bcbio like table
# many metrics are already in the Genereal Table of MultiQC, this reads the file
# many metrics are already in the General Table of MultiQC, this reads the file
metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt'))
# we get some more metrics from Qualimap and rename columns
Expand Down Expand Up @@ -259,7 +251,7 @@ metrics <- metrics %>%
```

```{r show-metadata}
```{r show_metadata}
meta_sm <- meta_df %>%
as.data.frame() %>%
column_to_rownames("sample")
Expand All @@ -282,9 +274,10 @@ metrics %>%
geom_bar(stat = "identity") +
coord_flip() +
scale_y_continuous(name = "million reads") +
scale_x_discrete(limits = rev) +
scale_fill_cb_friendly() + xlab("") +
ggtitle("Total reads") +
geom_hline(yintercept=20000000, color = "grey", size=2)
geom_hline(yintercept=20000000, color = "grey", linewidth=2)
metrics %>%
ggplot(aes(x = .data[[factor_of_interest]],
Expand Down Expand Up @@ -315,10 +308,11 @@ metrics %>%
color = .data[[factor_of_interest]])) +
geom_point(alpha = 0.5, size=4) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_color_cb_friendly() +
ylim(0, 100) +
ggtitle("Mapping rate") + xlab("") +
geom_hline(yintercept=70, color = "grey", size=2)
geom_hline(yintercept=70, color = "grey", linewidth=2)
```


Expand Down Expand Up @@ -346,20 +340,22 @@ ggplot(metrics,aes(x = factor(sample, level = order),
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_cb_friendly() +
scale_x_discrete(limits = rev) +
ggtitle("Number of genes") +
ylab("Number of genes") +
xlab("") +
geom_hline(yintercept=20000, color = "grey", size=2)
geom_hline(yintercept=20000, color = "grey", linewidth=2)
metrics %>%
ggplot(aes(x = .data[[factor_of_interest]],
y = n_genes,
color = .data[[factor_of_interest]])) +
geom_point(alpha = 0.5, size=4) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(name = "million reads") +
scale_color_cb_friendly() + xlab("") +
ggtitle("Total reads")
ggtitle("Number of Genes")
```

Expand Down Expand Up @@ -393,11 +389,12 @@ metrics %>%
geom_point(alpha = 0.5, size=4) +
ylab("Exonic rate %") +
ggtitle("Exonic mapping rate") +
scale_x_discrete(limits = rev) +
scale_color_cb_friendly() +
coord_flip() +
xlab("") +
ylim(c(0,100)) +
geom_hline(yintercept=70, color = "grey", size=2)
geom_hline(yintercept=70, color = "grey", linewidth=2)
```

## Intronic mapping rate
Expand All @@ -412,11 +409,12 @@ metrics %>%
geom_point(alpha = 0.5, size=4) +
ylab("Intronic rate %") +
ggtitle("Intronic mapping rate") +
scale_x_discrete(limits = rev) +
scale_color_cb_friendly() +
coord_flip() +
xlab("") +
ylim(c(0,100)) +
geom_hline(yintercept=20, color = "grey", size=2)
geom_hline(yintercept=20, color = "grey", linewidth=2)
```

## Intergenic mapping rate
Expand All @@ -432,9 +430,10 @@ metrics %>%
ylab("Intergenic rate %") +
ggtitle("Intergenic mapping rate") +
coord_flip() + xlab("") +
scale_x_discrete(limits = rev) +
scale_color_cb_friendly() +
ylim(c(0, 100)) +
geom_hline(yintercept=15, color = "grey", size=2)
geom_hline(yintercept=15, color = "grey", linewidth=2)
```

## tRNA/rRNA mapping rate
Expand All @@ -448,14 +447,15 @@ metrics %>%
ggplot(aes(x = factor(sample, level = order),
y = r_and_t_rna_rate * 100,
color = .data[[factor_of_interest]])) +
geom_point(alpha = 0.5) +
geom_point(alpha = 0.5, size=4) +
ylab("tRNA/rRNA rate, %")+
ylim(0, rrna_ylim) +
ggtitle("tRNA/rRNA mapping rate") +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_color_cb_friendly() +
ylim(c(0, 100)) + xlab("") +
geom_hline(yintercept=10, color = "grey", size=2)
geom_hline(yintercept=10, color = "grey", linewidth=2)
```

## 5'->3' bias
Expand All @@ -470,9 +470,10 @@ metrics %>%
geom_point(alpha = 0.5, size=4) +
ggtitle("5'-3' bias") +
coord_flip() +
scale_x_discrete(limits = rev) +
ylim(c(0.5,1.5)) + xlab("") + ylab("5'-3' bias") +
scale_color_cb_friendly()+
geom_hline(yintercept=1, color = "grey", size=2)
geom_hline(yintercept=1, color = "grey", linewidth=2)
```

## Counts per gene - all genes
Expand All @@ -495,6 +496,7 @@ ggplot(counts, aes(factor(name, level = order),
log2(counts+1),
fill = .data[[factor_of_interest]])) +
geom_boxplot() +
scale_x_discrete(limits = rev) +
scale_fill_cb_friendly() +
coord_flip() + xlab("") +
ggtitle("Counts per gene, all non-zero genes") +
Expand All @@ -510,7 +512,7 @@ In this section, we look at how well the different groups in the dataset cluster

Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).

<!-- ### PCA, PCs 1-5, (labled) -->
<!-- ### PCA, PCs 1-4, (labled) -->
```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5}
vst <- vst(raw_counts)
Expand Down

0 comments on commit 2a4cf2f

Please sign in to comment.