From 754240f6e4e96fe729c2fed53eec9af0e7ca223c Mon Sep 17 00:00:00 2001 From: Gabriel Lencioni Lovate <39889038+gabriellovate@users.noreply.github.com> Date: Thu, 30 Nov 2023 17:13:56 +0100 Subject: [PATCH] feat: deduplicate annotations --- bin/annotate_interactions.py | 1 - bin/deduplicate_annotations.py | 150 +++++++++++++++++++++++++++++++ bin/deduplicate_components.py | 25 ------ main.nf | 42 +++++---- modules/annotate_interactions.nf | 21 +++++ modules/differential_analysis.nf | 2 +- nextflow.config | 7 +- 7 files changed, 203 insertions(+), 45 deletions(-) create mode 100644 bin/deduplicate_annotations.py delete mode 100644 bin/deduplicate_components.py diff --git a/bin/annotate_interactions.py b/bin/annotate_interactions.py index e86868a..91a6ac9 100755 --- a/bin/annotate_interactions.py +++ b/bin/annotate_interactions.py @@ -228,7 +228,6 @@ def fit_optimal_gmm( return gmm_dict[max_components] # Return the model with max_components if no optimal found - def calculate_individual_pdfs(gmm, density_array, weights=None): """ Calculate the probability density function for each component of the Gaussian Mixture Model. diff --git a/bin/deduplicate_annotations.py b/bin/deduplicate_annotations.py new file mode 100644 index 0000000..17ec3d6 --- /dev/null +++ b/bin/deduplicate_annotations.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 + +"""deduplicate_annotations.py + +Takes a count table and an annotation table and removes all components which overlap with the component with the most reads. + +Usage: + deduplicate_annotations.py -a -c -o + deduplicate_annotations.py -h | --help + +Options: + -h --help Show this screen. + -a --annotation_table= The annotation table. + -c --count_table= The count table. + -o --output= The output folder. + +""" + +# takes the 'square' with the most reads +# remove all the squares which overlaps with that one from the list of possible interactions +# take the square with most reads, repeat 2/3 until components are done + +from docopt import docopt +import pandas as pd +import os + +def parse_annotation_table(annotation_table): + """Parses the annotation table into a pandas dataframe. + + Args: + annotation_table (str): The annotation table. + + Returns: + pandas.DataFrame: The annotation table as a dataframe. + """ + + df = pd.read_csv(annotation_table, sep='\t', header=0) + return df + + +def parse_count_table(count_table, calculate_means=True, sort_by_mean=True): + """Parses the count table into a pandas dataframe. + + Args: + count_table (str): The count table. + calculate_means (bool): Whether to calculate the means of the count table. + sort_by_mean (bool): Whether to sort the count table by the mean. + + Returns: + pandas.DataFrame: The count table as a dataframe. + """ + # first line is the header, first (unamed) column is the id + df = pd.read_csv(count_table, sep='\t', header=0, index_col=0) + if calculate_means: + # for each line in the count table, calculate the mean of the counts (ignoring the id column) + df['mean'] = df.mean(axis=1) + if sort_by_mean: + df = df.sort_values(by='mean', ascending=False) + elif not calculate_means and sort_by_mean: + # cannot sort by mean if it has not been calculated + raise ValueError('Cannot sort by mean if it has not been calculated.') + return df + + +def check_if_overlap(annotation1, annotation2): + """Checks if two annotations overlap. + + An annotation is a pandas.Series with the following columns: + - id + - segment01 + - start01 + - end01 + - segment02 + - start02 + - end02 + + Args: + annotation1 (pandas.Series): The first annotation. + annotation2 (pandas.Series): The second annotation. + + Returns: + bool: True if the annotations overlap, False otherwise. + + + """ + # check if the segments are the same + if annotation1['segment01'] != annotation2['segment01'] or annotation1['segment02'] != annotation2['segment02']: + return False + # check if the start of the first annotation is between the start and end of the second annotation + if annotation1['start01'] >= annotation2['start01'] and annotation1['start01'] <= annotation2['end01']: + return True + # check if the end of the first annotation is between the start and end of the second annotation + if annotation1['end01'] >= annotation2['start01'] and annotation1['end01'] <= annotation2['end01']: + return True + # check if the start of the second annotation is between the start and end of the first annotation + if annotation2['start02'] >= annotation1['start02'] and annotation2['start02'] <= annotation1['end02']: + return True + # check if the end of the second annotation is between the start and end of the first annotation + if annotation2['end02'] >= annotation1['start02'] and annotation2['end02'] <= annotation1['end02']: + return True + # if none of the above, there is no overlap + return False + + +def main(): + """Main function. + """ + + # parse the arguments + arguments = docopt(__doc__) + annotation_table = arguments['--annotation_table'] + count_table = arguments['--count_table'] + output_folder = arguments['--output'] + + # parse the annotation table + annotation_table_df = parse_annotation_table(annotation_table) + + # parse the count table + count_table_df = parse_count_table(count_table) + + # Convert annotation table to a dictionary for faster access + annotation_dict = annotation_table_df.set_index('id').to_dict('index') + + # List to store rows for the deduplicated DataFrame + deduplicated_rows = [] + + # Iterate over the count table + for index, row in count_table_df.iterrows(): + # Get the annotation + annotation = annotation_dict[index] + + # Check if the annotation overlaps with any of the following annotations + if not any(check_if_overlap(annotation, annotation_dict[idx]) for idx in deduplicated_rows): + deduplicated_rows.append(index) + + # Create deduplicated DataFrame + # set index to the id column + # count_table_deduplicated_df = count_table_df.loc[deduplicated_rows] + count_table_deduplicated_df = count_table_df.loc[deduplicated_rows] + annotation_table_deduplicated_df = annotation_table_df.loc[deduplicated_rows] + + # Write the deduplicated count table to a file + output_files = [os.path.join(output_folder, 'deduplicated_count_table.tsv'), + os.path.join(output_folder, 'deduplicated_annotation_table.tsv')] + count_table_deduplicated_df.to_csv(output_files[0], sep='\t', header=True) + annotation_table_deduplicated_df.to_csv(output_files[1], sep='\t', header=True) + + +if __name__ == '__main__': + main() diff --git a/bin/deduplicate_components.py b/bin/deduplicate_components.py deleted file mode 100644 index 8c8780c..0000000 --- a/bin/deduplicate_components.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 - -"""deduplicate_components.py - -Takes an arbitrary number of trns files, finds and merges interactions, outputing an annotation table and the GMMs for each combination. - -Usage: - deduplicate_components.py -a -c -o - deduplicate_components.py -h | --help - -Options: - -h --help Show this screen. - -a --annotation_table= The annotation table. - -c --count_table= The count table. - -o --output= The output file. - -""" - -# takes the 'square' with the most reads -# remove all the squares which overlaps with that one from the list of possible interactions -# take the square with most reads, repeat 2/3 until components are done - -from docopt import docopt -import pandas as pd - diff --git a/main.nf b/main.nf index f1a130c..904c25f 100644 --- a/main.nf +++ b/main.nf @@ -76,8 +76,12 @@ include { plotHeatmapsAnnotated } from './modules/data_visualization.nf' include { annotateArrays; mergeAnnotations } from './modules/annotate_interactions.nf' // differential analysis include { generateCountTables; mergeCountTables; runDESeq2 } from './modules/differential_analysis.nf' +// make alias for mergeCountTables to merge all count tables +include { mergeCountTables as mergeAllCountTables } from './modules/differential_analysis.nf' +// deduplicate annotations +include { deduplicateAnnotations } from './modules/annotate_interactions.nf' // generate circos plots -include { makeCircosTable_deseq2; makeCircosTable_count_table; makeCircosTable_count_table_30; makeCircosTable_count_table_40; makeCircosTable_count_table_50; runCircos_single; runCircos_comb } from './modules/data_visualization.nf' +include { makeCircosTable_deseq2; makeCircosTable_count_table; runCircos_single; runCircos_comb } from './modules/data_visualization.nf' workflow { if ( params.help ) { println(""" @@ -111,17 +115,7 @@ workflow { --output Output directory Optional arguments: - --annotation_table CSV, TSV or XLSX file containing the annotations to be used. The file must have the following format: - ,,,, - ,,,, - ... - ,,,, - where: - - is the name of the gene - - is the start position of the gene - - is the end position of the gene - - is the strand of the gene - - is the chromosome of the gene + --annotation_table CSV, TSV or XLSX file containing the annotations to be used. --test Run in test mode. This will run the pipeline with a small subset of the data --help Print this help message """) @@ -215,7 +209,9 @@ workflow { } // Plot the annotations on the heatmaps - plotHeatmapsAnnotated( annotated_arrays_ch ) + plotHeatmapsAnnotated( + annotated_arrays_ch.map( it -> [ it[0], it[1], it[2], it[3] ] ) // sample name, genome, array, annotations + ) // Generate count tables count_tables_ch = generateCountTables( annotated_trns_ch ) @@ -225,6 +221,22 @@ workflow { .map( it -> [ it[2], it[1] ] ) // group name, count tables ) + // Merge all count tables independently of the group by collecting all count tables + merged_count_tables_all_ch = mergeAllCountTables( + count_tables_ch + .map( it -> [ it[1] ] ) // count tables + .collect() + .map( it -> [ "all", it ] ) // group name, count tables + ) + + // Deduplicate annotations + deduplicateAnnotations( + merged_count_tables_all_ch + .combine( mergeAnnotations.out ) + .map( it -> [ it[0], it[2], it[3] ] ) // group name, count table, annotations + .view() + ) + // Run differential analysis with DESeq2 samples_input_ch = Channel .fromPath( params.comparisons, checkIfExists: true ) @@ -254,11 +266,11 @@ workflow { .map( it -> [ it[1], it[0], it[3], it[2] ] ) .combine( genomes_ch, by: 0 ) .map( it -> [ it[1], it[2], it[0], it[4], it[3] ] ) - .combine( mergeAnnotations.out ) + .combine( deduplicateAnnotations.out ) circos_count_table_ch = merged_count_tables_ch .combine( genomes_ch, by: 0 ) .map( it -> [ it[0], it[2], it[1] ] ) - .combine( mergeAnnotations.out ) + .combine( deduplicateAnnotations.out ) } // Create circos tables diff --git a/modules/annotate_interactions.nf b/modules/annotate_interactions.nf index 14bb43c..faf92aa 100644 --- a/modules/annotate_interactions.nf +++ b/modules/annotate_interactions.nf @@ -38,4 +38,25 @@ process mergeAnnotations { echo "id\tsegment01\tstart01\tend01\tsegment02\tstart02\tend02" >> merged_annotations.tsv merge_annotation_tables.py ${annotations} -o merged_annotations.tsv """ +} + +/************************************************************************* +* deduplicate annotations +*************************************************************************/ +process deduplicateAnnotations { + label 'RNAswarm' + + input: + tuple val(group), path(annotation_table), path(count_table) + + output: + tuple path("deduplicated_annotations/deduplicated_annotation_table.tsv"), path("deduplicated_annotations/deduplicated_count_table.tsv") + + publishDir "${params.output}/06-annotations" , mode: 'copy' + + script: + """ + mkdir deduplicated_annotations + deduplicate_annotations.py -a ${annotation_table} -c ${count_table} -o deduplicated_annotations + """ } \ No newline at end of file diff --git a/modules/differential_analysis.nf b/modules/differential_analysis.nf index fc9b279..fda83b4 100644 --- a/modules/differential_analysis.nf +++ b/modules/differential_analysis.nf @@ -31,7 +31,7 @@ process mergeCountTables { output: tuple val(group_name), path("${group_name}_count_table.tsv") - publishDir "${params.output}/06-count_analysis/count_tables", mode: 'copy' + publishDir "${params.output}/07-count_analysis/count_tables", mode: 'copy' script: """ diff --git a/nextflow.config b/nextflow.config index fd34666..3f15af6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -37,10 +37,10 @@ params { segemehl_threads = 48 // annotation - min_components = 70 + min_components = 80 max_components = 80 - step_size = 5 - sigma = 0.5 + step_size = 1 + sigma = 0.65 // help help = false @@ -107,6 +107,7 @@ profiles { apptainer { conda.enabled = true conda.createTimeout = '2 h' + // conda.useMicromamba = true apptainer.enabled = true apptainer.autoMounts = true // apptainer.runOptions = '--unsquash'