From 754240f6e4e96fe729c2fed53eec9af0e7ca223c Mon Sep 17 00:00:00 2001
From: Gabriel Lencioni Lovate
 <39889038+gabriellovate@users.noreply.github.com>
Date: Thu, 30 Nov 2023 17:13:56 +0100
Subject: [PATCH] feat: deduplicate annotations

---
 bin/annotate_interactions.py     |   1 -
 bin/deduplicate_annotations.py   | 150 +++++++++++++++++++++++++++++++
 bin/deduplicate_components.py    |  25 ------
 main.nf                          |  42 +++++----
 modules/annotate_interactions.nf |  21 +++++
 modules/differential_analysis.nf |   2 +-
 nextflow.config                  |   7 +-
 7 files changed, 203 insertions(+), 45 deletions(-)
 create mode 100644 bin/deduplicate_annotations.py
 delete mode 100644 bin/deduplicate_components.py

diff --git a/bin/annotate_interactions.py b/bin/annotate_interactions.py
index e86868a..91a6ac9 100755
--- a/bin/annotate_interactions.py
+++ b/bin/annotate_interactions.py
@@ -228,7 +228,6 @@ def fit_optimal_gmm(
         return gmm_dict[max_components]  # Return the model with max_components if no optimal found
 
 
-
 def calculate_individual_pdfs(gmm, density_array, weights=None):
     """
     Calculate the probability density function for each component of the Gaussian Mixture Model.
diff --git a/bin/deduplicate_annotations.py b/bin/deduplicate_annotations.py
new file mode 100644
index 0000000..17ec3d6
--- /dev/null
+++ b/bin/deduplicate_annotations.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+
+"""deduplicate_annotations.py
+
+Takes a count table and an annotation table and removes all components which overlap with the component with the most reads.
+
+Usage:
+    deduplicate_annotations.py -a <annotation_table> -c <count_table> -o <output_folder>
+    deduplicate_annotations.py -h | --help
+
+Options:
+    -h --help                                    Show this screen.
+    -a --annotation_table=<annotation_table>     The annotation table.
+    -c --count_table=<count_table>               The count table.
+    -o --output=<output_folder>                  The output folder.
+
+"""
+
+# takes the 'square' with the most reads
+# remove all the squares which overlaps with that one from the list of possible interactions
+# take the square with most reads, repeat 2/3 until components are done
+
+from docopt import docopt
+import pandas as pd
+import os
+
+def parse_annotation_table(annotation_table):
+    """Parses the annotation table into a pandas dataframe.
+
+    Args:
+        annotation_table (str): The annotation table.
+
+    Returns:
+        pandas.DataFrame: The annotation table as a dataframe.
+    """
+
+    df = pd.read_csv(annotation_table, sep='\t', header=0)
+    return df
+
+
+def parse_count_table(count_table, calculate_means=True, sort_by_mean=True):
+    """Parses the count table into a pandas dataframe.
+
+    Args:
+        count_table (str): The count table.
+        calculate_means (bool): Whether to calculate the means of the count table.
+        sort_by_mean (bool): Whether to sort the count table by the mean.
+
+    Returns:
+        pandas.DataFrame: The count table as a dataframe.
+    """
+    # first line is the header, first (unamed) column is the id
+    df = pd.read_csv(count_table, sep='\t', header=0, index_col=0)
+    if calculate_means:
+        # for each line in the count table, calculate the mean of the counts (ignoring the id column)
+        df['mean'] = df.mean(axis=1)
+        if sort_by_mean:
+            df = df.sort_values(by='mean', ascending=False)
+    elif not calculate_means and sort_by_mean:
+        # cannot sort by mean if it has not been calculated
+        raise ValueError('Cannot sort by mean if it has not been calculated.')
+    return df
+
+
+def check_if_overlap(annotation1, annotation2):
+    """Checks if two annotations overlap.
+
+    An annotation is a pandas.Series with the following columns:
+    - id
+    - segment01
+    - start01
+    - end01
+    - segment02
+    - start02
+    - end02
+
+    Args:
+        annotation1 (pandas.Series): The first annotation.
+        annotation2 (pandas.Series): The second annotation.
+
+    Returns:
+        bool: True if the annotations overlap, False otherwise.
+
+
+    """
+    # check if the segments are the same
+    if annotation1['segment01'] != annotation2['segment01'] or annotation1['segment02'] != annotation2['segment02']:
+        return False
+    # check if the start of the first annotation is between the start and end of the second annotation
+    if annotation1['start01'] >= annotation2['start01'] and annotation1['start01'] <= annotation2['end01']:
+        return True
+    # check if the end of the first annotation is between the start and end of the second annotation
+    if annotation1['end01'] >= annotation2['start01'] and annotation1['end01'] <= annotation2['end01']:
+        return True
+    # check if the start of the second annotation is between the start and end of the first annotation
+    if annotation2['start02'] >= annotation1['start02'] and annotation2['start02'] <= annotation1['end02']:
+        return True
+    # check if the end of the second annotation is between the start and end of the first annotation
+    if annotation2['end02'] >= annotation1['start02'] and annotation2['end02'] <= annotation1['end02']:
+        return True
+    # if none of the above, there is no overlap
+    return False
+
+
+def main():
+    """Main function.
+    """
+
+    # parse the arguments
+    arguments = docopt(__doc__)
+    annotation_table = arguments['--annotation_table']
+    count_table = arguments['--count_table']
+    output_folder = arguments['--output']
+
+    # parse the annotation table
+    annotation_table_df = parse_annotation_table(annotation_table)
+
+    # parse the count table
+    count_table_df = parse_count_table(count_table)
+
+    # Convert annotation table to a dictionary for faster access
+    annotation_dict = annotation_table_df.set_index('id').to_dict('index')
+
+    # List to store rows for the deduplicated DataFrame
+    deduplicated_rows = []
+
+    # Iterate over the count table
+    for index, row in count_table_df.iterrows():
+        # Get the annotation
+        annotation = annotation_dict[index]
+
+        # Check if the annotation overlaps with any of the following annotations
+        if not any(check_if_overlap(annotation, annotation_dict[idx]) for idx in deduplicated_rows):
+            deduplicated_rows.append(index)
+
+    # Create deduplicated DataFrame
+    # set index to the id column
+    # count_table_deduplicated_df = count_table_df.loc[deduplicated_rows]
+    count_table_deduplicated_df = count_table_df.loc[deduplicated_rows]
+    annotation_table_deduplicated_df = annotation_table_df.loc[deduplicated_rows]
+
+    # Write the deduplicated count table to a file
+    output_files = [os.path.join(output_folder, 'deduplicated_count_table.tsv'),
+                   os.path.join(output_folder, 'deduplicated_annotation_table.tsv')]
+    count_table_deduplicated_df.to_csv(output_files[0], sep='\t', header=True)
+    annotation_table_deduplicated_df.to_csv(output_files[1], sep='\t', header=True)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/bin/deduplicate_components.py b/bin/deduplicate_components.py
deleted file mode 100644
index 8c8780c..0000000
--- a/bin/deduplicate_components.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env python3
-
-"""deduplicate_components.py
-
-Takes an arbitrary number of trns files, finds and merges interactions, outputing an annotation table and the GMMs for each combination.
-
-Usage:
-    deduplicate_components.py -a <annotation_table> -c <count_table> -o <output_file>
-    deduplicate_components.py -h | --help
-
-Options:
-    -h --help                                    Show this screen.
-    -a --annotation_table=<annotation_table>     The annotation table.
-    -c --count_table=<count_table>               The count table.
-    -o --output=<output_file>                    The output file.
-
-"""
-
-# takes the 'square' with the most reads
-# remove all the squares which overlaps with that one from the list of possible interactions
-# take the square with most reads, repeat 2/3 until components are done
-
-from docopt import docopt
-import pandas as pd
-
diff --git a/main.nf b/main.nf
index f1a130c..904c25f 100644
--- a/main.nf
+++ b/main.nf
@@ -76,8 +76,12 @@ include { plotHeatmapsAnnotated } from './modules/data_visualization.nf'
 include { annotateArrays; mergeAnnotations } from './modules/annotate_interactions.nf'
 // differential analysis
 include { generateCountTables; mergeCountTables; runDESeq2 } from './modules/differential_analysis.nf'
+// make alias for mergeCountTables to merge all count tables
+include { mergeCountTables as mergeAllCountTables } from './modules/differential_analysis.nf'
+// deduplicate annotations
+include { deduplicateAnnotations } from './modules/annotate_interactions.nf'
 // generate circos plots
-include { makeCircosTable_deseq2; makeCircosTable_count_table; makeCircosTable_count_table_30; makeCircosTable_count_table_40; makeCircosTable_count_table_50; runCircos_single; runCircos_comb } from './modules/data_visualization.nf'
+include { makeCircosTable_deseq2; makeCircosTable_count_table; runCircos_single; runCircos_comb } from './modules/data_visualization.nf'
 workflow {
     if ( params.help ) {
         println("""
@@ -111,17 +115,7 @@ workflow {
             --output <OUTDIR>                     Output directory
 
     Optional arguments:
-            --annotation_table <ANNOTATION_TABLE> CSV, TSV or XLSX file containing the annotations to be used. The file must have the following format:
-                                                    <GENE_NAME>,<GENE_START>,<GENE_END>,<GENE_STRAND>,<GENE_CHROMOSOME>
-                                                    <GENE_NAME>,<GENE_START>,<GENE_END>,<GENE_STRAND>,<GENE_CHROMOSOME>
-                                                    ...
-                                                    <GENE_NAME>,<GENE_START>,<GENE_END>,<GENE_STRAND>,<GENE_CHROMOSOME>
-                                                    where:
-                                                    - <GENE_NAME> is the name of the gene
-                                                    - <GENE_START> is the start position of the gene
-                                                    - <GENE_END> is the end position of the gene
-                                                    - <GENE_STRAND> is the strand of the gene
-                                                    - <GENE_CHROMOSOME> is the chromosome of the gene
+            --annotation_table <ANNOTATION_TABLE> CSV, TSV or XLSX file containing the annotations to be used.
             --test                                Run in test mode. This will run the pipeline with a small subset of the data
             --help                                Print this help message
             """)
@@ -215,7 +209,9 @@ workflow {
     }
 
     // Plot the annotations on the heatmaps
-    plotHeatmapsAnnotated( annotated_arrays_ch )
+    plotHeatmapsAnnotated( 
+        annotated_arrays_ch.map( it -> [ it[0], it[1], it[2], it[3] ] ) // sample name, genome, array, annotations
+    )
 
     // Generate count tables
     count_tables_ch = generateCountTables( annotated_trns_ch )
@@ -225,6 +221,22 @@ workflow {
             .map( it -> [ it[2], it[1] ] ) // group name, count tables
     )
 
+    // Merge all count tables independently of the group by collecting all count tables
+    merged_count_tables_all_ch = mergeAllCountTables(
+        count_tables_ch
+            .map( it -> [ it[1] ] ) // count tables
+            .collect()
+            .map( it -> [ "all", it ] ) // group name, count tables
+    )
+
+    // Deduplicate annotations
+    deduplicateAnnotations(
+        merged_count_tables_all_ch
+            .combine( mergeAnnotations.out )
+            .map( it -> [ it[0], it[2], it[3] ] ) // group name, count table, annotations
+            .view()
+    )
+
     // Run differential analysis with DESeq2
     samples_input_ch = Channel
             .fromPath( params.comparisons, checkIfExists: true )
@@ -254,11 +266,11 @@ workflow {
                         .map( it -> [ it[1], it[0], it[3], it[2] ] )
                         .combine( genomes_ch, by: 0 )
                         .map( it -> [ it[1], it[2], it[0], it[4], it[3] ] )
-                        .combine( mergeAnnotations.out )
+                        .combine( deduplicateAnnotations.out )
         circos_count_table_ch = merged_count_tables_ch
                         .combine( genomes_ch, by: 0 )
                         .map( it -> [ it[0], it[2], it[1] ] )
-                        .combine( mergeAnnotations.out )
+                        .combine( deduplicateAnnotations.out )
     }
 
     // Create circos tables
diff --git a/modules/annotate_interactions.nf b/modules/annotate_interactions.nf
index 14bb43c..faf92aa 100644
--- a/modules/annotate_interactions.nf
+++ b/modules/annotate_interactions.nf
@@ -38,4 +38,25 @@ process mergeAnnotations {
     echo "id\tsegment01\tstart01\tend01\tsegment02\tstart02\tend02" >> merged_annotations.tsv
     merge_annotation_tables.py ${annotations} -o merged_annotations.tsv
     """
+}
+
+/*************************************************************************
+* deduplicate annotations
+*************************************************************************/
+process deduplicateAnnotations {
+    label 'RNAswarm'
+
+    input:
+    tuple val(group), path(annotation_table), path(count_table)
+
+    output:
+    tuple path("deduplicated_annotations/deduplicated_annotation_table.tsv"), path("deduplicated_annotations/deduplicated_count_table.tsv")
+
+    publishDir "${params.output}/06-annotations" , mode: 'copy'
+
+    script:
+    """
+    mkdir deduplicated_annotations
+    deduplicate_annotations.py -a ${annotation_table} -c ${count_table} -o deduplicated_annotations
+    """
 }
\ No newline at end of file
diff --git a/modules/differential_analysis.nf b/modules/differential_analysis.nf
index fc9b279..fda83b4 100644
--- a/modules/differential_analysis.nf
+++ b/modules/differential_analysis.nf
@@ -31,7 +31,7 @@ process mergeCountTables {
     output:
     tuple val(group_name), path("${group_name}_count_table.tsv")
 
-    publishDir "${params.output}/06-count_analysis/count_tables", mode: 'copy'
+    publishDir "${params.output}/07-count_analysis/count_tables", mode: 'copy'
 
     script:
     """
diff --git a/nextflow.config b/nextflow.config
index fd34666..3f15af6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -37,10 +37,10 @@ params {
     segemehl_threads = 48
 
     // annotation
-    min_components = 70
+    min_components = 80
     max_components = 80
-    step_size = 5
-    sigma = 0.5
+    step_size = 1
+    sigma = 0.65
 
     // help
     help = false
@@ -107,6 +107,7 @@ profiles {
     apptainer {
         conda.enabled = true
         conda.createTimeout = '2 h'
+        // conda.useMicromamba = true
         apptainer.enabled = true
         apptainer.autoMounts = true
         // apptainer.runOptions = '--unsquash'