Skip to content

Commit

Permalink
Merge pull request #31 from rnajena:30-error-with-deduplicateannotations
Browse files Browse the repository at this point in the history
Main
  • Loading branch information
gabriellovate authored Apr 30, 2024
2 parents 236263d + eaa0d1e commit 6829cd1
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 111 deletions.
Empty file modified bin/deduplicate_annotations.py
100644 → 100755
Empty file.
71 changes: 22 additions & 49 deletions bin/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from matplotlib.colors import LogNorm
import pandas as pd


def parse_fasta(fasta_file):
"""Parses a fasta file and returns a dictionary of segment names and sequences.
Expand All @@ -27,24 +28,25 @@ def parse_fasta(fasta_file):
with open(fasta_file) as file: # read genome file
for line in file: # parse genome file
if line.startswith(">"): # parse header
#* if there is a header already, we store the current sequence
#* to this header.
# * if there is a header already, we store the current sequence
# * to this header.
if header:
fasta_dict[header] = seq
#* then we flush the sequence
# * then we flush the sequence
seq = ""
#* and parse the new header
# * and parse the new header
header = line.strip()[1:]
else:
#* if no header is detected, we append the new line
#* to the current sequence
# * if no header is detected, we append the new line
# * to the current sequence
seq += line.strip()
#* after the last line, we have to store
#* the last sequence as well. Since no new line with ">" occurs, we
#* do this manually
# * after the last line, we have to store
# * the last sequence as well. Since no new line with ">" occurs, we
# * do this manually
fasta_dict[header] = seq
return fasta_dict


def make_combination_array(genome_dict, intra_only=False):
"""
Creates a dictionary of numpy array of all possible genome segment combinations.
Expand Down Expand Up @@ -72,21 +74,24 @@ def make_combination_array(genome_dict, intra_only=False):
# * the iterator to a list. Actually, we also could just put the iterator in the for loop.
# * should work as well. Is a tad more memory efficient.
if intra_only:
segment_combinations = list(itertools.combinations_with_replacement(segments, 2))
segment_combinations = list(
itertools.combinations_with_replacement(segments, 2)
)
segment_combinations = [
segment_combination
for segment_combination in segment_combinations
if segment_combination[0] == segment_combination[1]
]
else:
segment_combinations = list(itertools.combinations_with_replacement(segments, 2))
segment_combinations = list(
itertools.combinations_with_replacement(segments, 2)
)
segment_combinations = [
segment_combination
for segment_combination in segment_combinations
if segment_combination[0] != segment_combination[1]
]


for segment_combination in segment_combinations:
# for segment_combination in itertools.combinations_with_replacement(segments,2): # * this should work as well
combination_arrays[segment_combination] = np.zeros(
Expand All @@ -98,38 +103,6 @@ def make_combination_array(genome_dict, intra_only=False):
return combination_arrays


def parse_annotation_table(annotation_table):
"""
Parses an annotation table and returns a dictionary of segment names and annotations.
The annotation table must have the following columns: id,segment01,start01,end01,segment02,start02,end02
Parameters
----------
annotation_table : str
Path to annotation table.
Returns
-------
annotation_dict : dict
Dictionary of segment names and annotations.
"""
annotation_dict = {}
with open(annotation_table) as file:
for line in file:
if not line.startswith("id"):
line = line.strip().split("\t")
annotation_dict[line[0]] = {
"segment01": line[1],
"start01": int(line[2]),
"end01": int(line[3]),
"segment02": line[4],
"start02": int(line[5]),
"end02": int(line[6]),
}
return annotation_dict


def positive_to_negative_strand_point(genome_dict, segment, position):
"""Transpose a point from the positive to the negative strand.
Expand All @@ -147,6 +120,7 @@ def positive_to_negative_strand_point(genome_dict, segment, position):
# Get the position on the negative strand
return aLen - position + 1


def negative_to_positive_strand(genome_dict, aSeq, cai, caj, bSeq, cbi, cbj):
"""Transpose the region of interest from the negative to the positive strand.
Expand Down Expand Up @@ -178,6 +152,7 @@ def negative_to_positive_strand(genome_dict, aSeq, cai, caj, bSeq, cbi, cbj):

return aSeq, cai_pos, caj_pos, bSeq, cbi_pos, cbj_pos


def positive_to_negative_strand(genome_dict, aSeq, cai, caj, bSeq, cbi, cbj):
"""Transpose the region of interest from the positive to the negative strand.
Expand Down Expand Up @@ -238,12 +213,10 @@ def parse_annotation_table(annotation_table):
"end02",
]
regions = pd.read_csv(annotation_table, names=header)
# add id column
# add id column
regions["id"] = regions.index
elif annotation_table.lower().endswith(".tsv"):
regions = pd.read_csv(annotation_table, sep="\t")
else:
raise ValueError(
"Annotation table must be either a csv file or a xlsx file."
)
return regions
raise ValueError("Annotation table must be either a csv file or a xlsx file.")
return regions
5 changes: 4 additions & 1 deletion bin/make_circos_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,10 @@ def main():
DESeq2_results = DESeq2_results.rename(columns={"Unnamed: 0": "id"})

# Read annotation table
annotation_table = hp.parse_annotation_table(annotation_table)
annotation_table = pd.read_csv(annotation_table, header=0, index_col=0, sep="\t")

# Remove rows in DESeq2 results that are not in the annotation table
DESeq2_results = DESeq2_results[DESeq2_results["id"].isin(annotation_table.index)]

# Parse genome segment names and sequences
genome_dict = hp.parse_fasta(genome_file)
Expand Down
16 changes: 13 additions & 3 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ include { fillArrays; mergeArrays } from './modules/handle_arrays.nf'
include { plotHeatmaps as plotHeatmapsRaw } from './modules/data_visualization.nf'
include { plotHeatmaps as plotHeatmapsMerged } from './modules/data_visualization.nf'
include { plotHeatmapsAnnotated } from './modules/data_visualization.nf'
include { plotHeatmapsAnnotated as plotHeatmapsAnnotatedDedup } from './modules/data_visualization.nf'
include { annotateArrays; mergeAnnotations } from './modules/annotate_interactions.nf'
// differential analysis
include { generateCountTables; mergeCountTables; runDESeq2 } from './modules/differential_analysis.nf'
Expand Down Expand Up @@ -214,6 +215,7 @@ workflow {
)

// Generate count tables
annotated_trns_ch.view()
count_tables_ch = generateCountTables( annotated_trns_ch )
merged_count_tables_ch = mergeCountTables(
count_tables_ch
Expand All @@ -232,10 +234,17 @@ workflow {
// Deduplicate annotations
deduplicate_annotations_input_ch = merged_count_tables_all_ch // group_name, merged_count_table
.combine( mergeAnnotations.out ) // merged_annotations
.map( it -> [ it[0], it[2], it[3] ] ) // group name, count table, annotations
deduplicate_annotations_input_ch.view()
.map( it -> [ it[0], it[2], it[1] ] ) // group name, count table, annotations
deduplicate_annotations_input_ch
deduplicateAnnotations( deduplicate_annotations_input_ch )

// Plot deduplicated annotations on the heatmaps
dedup_heatmaps_ch = annotated_arrays_ch
.map( it -> [ it[0], it[1], it[2], it[3] ] ) // sample name, genome, array, annotations
.combine( deduplicateAnnotations.out )
dedup_heatmaps_ch.view()
plotHeatmapsAnnotatedDedup( dedup_heatmaps_ch )

// Run differential analysis with DESeq2
samples_input_ch = Channel
.fromPath( params.comparisons, checkIfExists: true )
Expand All @@ -244,6 +253,7 @@ workflow {
.map( it -> [ it[1], it[0], it[2] ] )
.combine( merged_count_tables_ch, by: 0 )
.map( it -> [ it[1], it[2], it[0], it[3] ] )
.view()

runDESeq2( samples_input_ch )

Expand Down Expand Up @@ -271,7 +281,7 @@ workflow {
.map( it -> [ it[0], it[2], it[1] ] )
.combine( deduplicateAnnotations.out )
}

circos_deseq2_ch.view()
// Create circos tables
makeCircosTable_deseq2( circos_deseq2_ch )
makeCircosTable_count_table( circos_count_table_ch )
Expand Down
2 changes: 1 addition & 1 deletion modules/annotate_interactions.nf
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ process deduplicateAnnotations {
tuple val(group), path(annotation_table), path(count_table)

output:
tuple path("deduplicated_annotations/deduplicated_annotation_table.tsv"), path("deduplicated_annotations/deduplicated_count_table.tsv")
tuple val(group), path("deduplicated_annotations/annotation_table_deduplicated.tsv"), path("deduplicated_annotations/count_table_deduplicated.tsv")

publishDir "${params.output}/06-annotations" , mode: 'copy'

Expand Down
60 changes: 3 additions & 57 deletions modules/data_visualization.nf
Original file line number Diff line number Diff line change
Expand Up @@ -50,68 +50,14 @@ process makeCircosTable_count_table {
label 'RNAswarm_small'

input:
tuple val(genome_name), path(genome), path(count_table), path(annotation_table)
tuple val(genome_name), path(genome), path(genome_count_table), val(group_name), path(annotation_table), path(global_count_table)

output:
tuple val(genome_name), path("${genome_name}_circos")

script:
"""
make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos
"""
}

/*************************************************************************
* make circos table for count_table results
*************************************************************************/
process makeCircosTable_count_table_30 {
label 'RNAswarm_small'

input:
tuple val(genome_name), path(genome), path(count_table), path(annotation_table)

output:
tuple val(genome_name), path("${genome_name}_circos_30")

script:
"""
make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos_30 --number_of_top_hits 30
"""
}

/*************************************************************************
* make circos table for count_table results
*************************************************************************/
process makeCircosTable_count_table_40 {
label 'RNAswarm_small'

input:
tuple val(genome_name), path(genome), path(count_table), path(annotation_table)

output:
tuple val(genome_name), path("${genome_name}_circos_40")

script:
"""
make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos_40 --number_of_top_hits 40
"""
}

/*************************************************************************
* make circos table for count_table results
*************************************************************************/
process makeCircosTable_count_table_50 {
label 'RNAswarm'

input:
tuple val(genome_name), path(genome), path(count_table), path(annotation_table)

output:
tuple val(genome_name), path("${genome_name}_circos_50")

script:
"""
make_circos_files.py -c ${count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos_50 --number_of_top_hits 50
make_circos_files.py -c ${genome_count_table} -a ${annotation_table} -g ${genome} -o ${genome_name}_circos
"""
}

Expand All @@ -122,7 +68,7 @@ process makeCircosTable_deseq2 {
label 'RNAswarm_small'

input:
tuple val(genome_name_01), path(genome_01), val(genome_name_02), path(genome_02), path(results_DESeq2), path(annotation_table)
tuple val(genome_name_01), path(genome_01), val(genome_name_02), path(genome_02), path(results_DESeq2), val(group_name), path(annotation_table), path(global_count_table)

output:
tuple val(genome_name_01), val(genome_name_02), path("${genome_name_01}_${genome_name_02}_circos")
Expand Down
1 change: 1 addition & 0 deletions modules/differential_analysis.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ process generateCountTables {
"""
}


/*************************************************************************
# Merge count tables
*************************************************************************/
Expand Down

0 comments on commit 6829cd1

Please sign in to comment.