Skip to content

Commit

Permalink
feat: deduplicate annotations
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriellovate committed Nov 30, 2023
1 parent baea66e commit 754240f
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 45 deletions.
1 change: 0 additions & 1 deletion bin/annotate_interactions.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,6 @@ def fit_optimal_gmm(
return gmm_dict[max_components] # Return the model with max_components if no optimal found



def calculate_individual_pdfs(gmm, density_array, weights=None):
"""
Calculate the probability density function for each component of the Gaussian Mixture Model.
Expand Down
150 changes: 150 additions & 0 deletions bin/deduplicate_annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#!/usr/bin/env python3

"""deduplicate_annotations.py
Takes a count table and an annotation table and removes all components which overlap with the component with the most reads.
Usage:
deduplicate_annotations.py -a <annotation_table> -c <count_table> -o <output_folder>
deduplicate_annotations.py -h | --help
Options:
-h --help Show this screen.
-a --annotation_table=<annotation_table> The annotation table.
-c --count_table=<count_table> The count table.
-o --output=<output_folder> The output folder.
"""

# takes the 'square' with the most reads
# remove all the squares which overlaps with that one from the list of possible interactions
# take the square with most reads, repeat 2/3 until components are done

from docopt import docopt
import pandas as pd
import os

def parse_annotation_table(annotation_table):
"""Parses the annotation table into a pandas dataframe.
Args:
annotation_table (str): The annotation table.
Returns:
pandas.DataFrame: The annotation table as a dataframe.
"""

df = pd.read_csv(annotation_table, sep='\t', header=0)
return df


def parse_count_table(count_table, calculate_means=True, sort_by_mean=True):
"""Parses the count table into a pandas dataframe.
Args:
count_table (str): The count table.
calculate_means (bool): Whether to calculate the means of the count table.
sort_by_mean (bool): Whether to sort the count table by the mean.
Returns:
pandas.DataFrame: The count table as a dataframe.
"""
# first line is the header, first (unamed) column is the id
df = pd.read_csv(count_table, sep='\t', header=0, index_col=0)
if calculate_means:
# for each line in the count table, calculate the mean of the counts (ignoring the id column)
df['mean'] = df.mean(axis=1)
if sort_by_mean:
df = df.sort_values(by='mean', ascending=False)
elif not calculate_means and sort_by_mean:
# cannot sort by mean if it has not been calculated
raise ValueError('Cannot sort by mean if it has not been calculated.')
return df


def check_if_overlap(annotation1, annotation2):
"""Checks if two annotations overlap.
An annotation is a pandas.Series with the following columns:
- id
- segment01
- start01
- end01
- segment02
- start02
- end02
Args:
annotation1 (pandas.Series): The first annotation.
annotation2 (pandas.Series): The second annotation.
Returns:
bool: True if the annotations overlap, False otherwise.
"""
# check if the segments are the same
if annotation1['segment01'] != annotation2['segment01'] or annotation1['segment02'] != annotation2['segment02']:
return False
# check if the start of the first annotation is between the start and end of the second annotation
if annotation1['start01'] >= annotation2['start01'] and annotation1['start01'] <= annotation2['end01']:
return True
# check if the end of the first annotation is between the start and end of the second annotation
if annotation1['end01'] >= annotation2['start01'] and annotation1['end01'] <= annotation2['end01']:
return True
# check if the start of the second annotation is between the start and end of the first annotation
if annotation2['start02'] >= annotation1['start02'] and annotation2['start02'] <= annotation1['end02']:
return True
# check if the end of the second annotation is between the start and end of the first annotation
if annotation2['end02'] >= annotation1['start02'] and annotation2['end02'] <= annotation1['end02']:
return True
# if none of the above, there is no overlap
return False


def main():
"""Main function.
"""

# parse the arguments
arguments = docopt(__doc__)
annotation_table = arguments['--annotation_table']
count_table = arguments['--count_table']
output_folder = arguments['--output']

# parse the annotation table
annotation_table_df = parse_annotation_table(annotation_table)

# parse the count table
count_table_df = parse_count_table(count_table)

# Convert annotation table to a dictionary for faster access
annotation_dict = annotation_table_df.set_index('id').to_dict('index')

# List to store rows for the deduplicated DataFrame
deduplicated_rows = []

# Iterate over the count table
for index, row in count_table_df.iterrows():
# Get the annotation
annotation = annotation_dict[index]

# Check if the annotation overlaps with any of the following annotations
if not any(check_if_overlap(annotation, annotation_dict[idx]) for idx in deduplicated_rows):
deduplicated_rows.append(index)

# Create deduplicated DataFrame
# set index to the id column
# count_table_deduplicated_df = count_table_df.loc[deduplicated_rows]
count_table_deduplicated_df = count_table_df.loc[deduplicated_rows]
annotation_table_deduplicated_df = annotation_table_df.loc[deduplicated_rows]

# Write the deduplicated count table to a file
output_files = [os.path.join(output_folder, 'deduplicated_count_table.tsv'),
os.path.join(output_folder, 'deduplicated_annotation_table.tsv')]
count_table_deduplicated_df.to_csv(output_files[0], sep='\t', header=True)
annotation_table_deduplicated_df.to_csv(output_files[1], sep='\t', header=True)


if __name__ == '__main__':
main()
25 changes: 0 additions & 25 deletions bin/deduplicate_components.py

This file was deleted.

42 changes: 27 additions & 15 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,12 @@ include { plotHeatmapsAnnotated } from './modules/data_visualization.nf'
include { annotateArrays; mergeAnnotations } from './modules/annotate_interactions.nf'
// differential analysis
include { generateCountTables; mergeCountTables; runDESeq2 } from './modules/differential_analysis.nf'
// make alias for mergeCountTables to merge all count tables
include { mergeCountTables as mergeAllCountTables } from './modules/differential_analysis.nf'
// deduplicate annotations
include { deduplicateAnnotations } from './modules/annotate_interactions.nf'
// generate circos plots
include { makeCircosTable_deseq2; makeCircosTable_count_table; makeCircosTable_count_table_30; makeCircosTable_count_table_40; makeCircosTable_count_table_50; runCircos_single; runCircos_comb } from './modules/data_visualization.nf'
include { makeCircosTable_deseq2; makeCircosTable_count_table; runCircos_single; runCircos_comb } from './modules/data_visualization.nf'
workflow {
if ( params.help ) {
println("""
Expand Down Expand Up @@ -111,17 +115,7 @@ workflow {
--output <OUTDIR> Output directory
Optional arguments:
--annotation_table <ANNOTATION_TABLE> CSV, TSV or XLSX file containing the annotations to be used. The file must have the following format:
<GENE_NAME>,<GENE_START>,<GENE_END>,<GENE_STRAND>,<GENE_CHROMOSOME>
<GENE_NAME>,<GENE_START>,<GENE_END>,<GENE_STRAND>,<GENE_CHROMOSOME>
...
<GENE_NAME>,<GENE_START>,<GENE_END>,<GENE_STRAND>,<GENE_CHROMOSOME>
where:
- <GENE_NAME> is the name of the gene
- <GENE_START> is the start position of the gene
- <GENE_END> is the end position of the gene
- <GENE_STRAND> is the strand of the gene
- <GENE_CHROMOSOME> is the chromosome of the gene
--annotation_table <ANNOTATION_TABLE> CSV, TSV or XLSX file containing the annotations to be used.
--test Run in test mode. This will run the pipeline with a small subset of the data
--help Print this help message
""")
Expand Down Expand Up @@ -215,7 +209,9 @@ workflow {
}

// Plot the annotations on the heatmaps
plotHeatmapsAnnotated( annotated_arrays_ch )
plotHeatmapsAnnotated(
annotated_arrays_ch.map( it -> [ it[0], it[1], it[2], it[3] ] ) // sample name, genome, array, annotations
)

// Generate count tables
count_tables_ch = generateCountTables( annotated_trns_ch )
Expand All @@ -225,6 +221,22 @@ workflow {
.map( it -> [ it[2], it[1] ] ) // group name, count tables
)

// Merge all count tables independently of the group by collecting all count tables
merged_count_tables_all_ch = mergeAllCountTables(
count_tables_ch
.map( it -> [ it[1] ] ) // count tables
.collect()
.map( it -> [ "all", it ] ) // group name, count tables
)

// Deduplicate annotations
deduplicateAnnotations(
merged_count_tables_all_ch
.combine( mergeAnnotations.out )
.map( it -> [ it[0], it[2], it[3] ] ) // group name, count table, annotations
.view()
)

// Run differential analysis with DESeq2
samples_input_ch = Channel
.fromPath( params.comparisons, checkIfExists: true )
Expand Down Expand Up @@ -254,11 +266,11 @@ workflow {
.map( it -> [ it[1], it[0], it[3], it[2] ] )
.combine( genomes_ch, by: 0 )
.map( it -> [ it[1], it[2], it[0], it[4], it[3] ] )
.combine( mergeAnnotations.out )
.combine( deduplicateAnnotations.out )
circos_count_table_ch = merged_count_tables_ch
.combine( genomes_ch, by: 0 )
.map( it -> [ it[0], it[2], it[1] ] )
.combine( mergeAnnotations.out )
.combine( deduplicateAnnotations.out )
}

// Create circos tables
Expand Down
21 changes: 21 additions & 0 deletions modules/annotate_interactions.nf
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,25 @@ process mergeAnnotations {
echo "id\tsegment01\tstart01\tend01\tsegment02\tstart02\tend02" >> merged_annotations.tsv
merge_annotation_tables.py ${annotations} -o merged_annotations.tsv
"""
}

/*************************************************************************
* deduplicate annotations
*************************************************************************/
process deduplicateAnnotations {
label 'RNAswarm'

input:
tuple val(group), path(annotation_table), path(count_table)

output:
tuple path("deduplicated_annotations/deduplicated_annotation_table.tsv"), path("deduplicated_annotations/deduplicated_count_table.tsv")

publishDir "${params.output}/06-annotations" , mode: 'copy'

script:
"""
mkdir deduplicated_annotations
deduplicate_annotations.py -a ${annotation_table} -c ${count_table} -o deduplicated_annotations
"""
}
2 changes: 1 addition & 1 deletion modules/differential_analysis.nf
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ process mergeCountTables {
output:
tuple val(group_name), path("${group_name}_count_table.tsv")

publishDir "${params.output}/06-count_analysis/count_tables", mode: 'copy'
publishDir "${params.output}/07-count_analysis/count_tables", mode: 'copy'

script:
"""
Expand Down
7 changes: 4 additions & 3 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ params {
segemehl_threads = 48

// annotation
min_components = 70
min_components = 80
max_components = 80
step_size = 5
sigma = 0.5
step_size = 1
sigma = 0.65

// help
help = false
Expand Down Expand Up @@ -107,6 +107,7 @@ profiles {
apptainer {
conda.enabled = true
conda.createTimeout = '2 h'
// conda.useMicromamba = true
apptainer.enabled = true
apptainer.autoMounts = true
// apptainer.runOptions = '--unsquash'
Expand Down

0 comments on commit 754240f

Please sign in to comment.