Skip to content

Commit

Permalink
Merge pull request #6 from nebiolabs/picard_calculation
Browse files Browse the repository at this point in the history
Picard calculation merged into master includes basal correction, ONT and PWM
  • Loading branch information
aerijman authored Jun 22, 2021
2 parents 7f06f2d + 3cf2ef1 commit b48e3b2
Show file tree
Hide file tree
Showing 96 changed files with 1,120 additions and 15 deletions.
Empty file modified .circleci/config.yml
100644 → 100755
Empty file.
Empty file modified .definitions.py
100644 → 100755
Empty file.
9 changes: 8 additions & 1 deletion .gitignore
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
**/.DS_Store
.*pyc
*pyc
/dist/
/*.egg-info
*.swp
tasmanian/tests/*html
./tasmanian_venv/
tasmanian_venv/*
*pyc
*table
*metrics
*html
*summary
*bam
tasmanian/test_suite/Damage-estimator
Empty file modified .manuscript.txt.swp
100644 → 100755
Empty file.
Empty file modified LICENCE.txt
100644 → 100755
Empty file.
Empty file modified MANIFEST.in
100644 → 100755
Empty file.
Empty file modified README.md
100644 → 100755
Empty file.
Binary file removed bin/._intersections
Binary file not shown.
Empty file modified conf/conda.yaml
100644 → 100755
Empty file.
Binary file removed figures/._intersections_tasmanian.jpg
Binary file not shown.
Empty file modified figures/intersections_tasmanian.jpg
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified figures/snapshot.jpg
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified figures/snapshot_good.jpg
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
27 changes: 27 additions & 0 deletions find_pairs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env python

import sys

def find_difference(s1, s2):
l1=len(s1)
l2=len(s2)

if len(s1) != len(s2):
return 0

for l in range(l1):
if s1[l] != s2[l]:
return [s1,s2] if s1[l]=='1' else [s2,s1]

sys.stderr.write("\nNo complementary read found for {}\n".format(s1))
return 0


def main():
reads = find_difference(sys.argv[1], sys.argv[2])

if reads:
sys.stdout.write("{}\t{}\n".format(reads[0], reads[1]))

if __name__ == '__main__':
main()
Empty file modified main.nf
100644 → 100755
Empty file.
Empty file modified nextflow.config
100644 → 100755
Empty file.
Empty file modified process.yml
100644 → 100755
Empty file.
Empty file modified requirements.txt
100644 → 100755
Empty file.
Empty file modified setup.py
100644 → 100755
Empty file.
Empty file modified tasmanian/.DS_Store
100644 → 100755
Empty file.
Empty file modified tasmanian/__init__.py
100644 → 100755
Empty file.
Empty file modified tasmanian/__pycache__/__init__.cpython-37.pyc
100644 → 100755
Empty file.
Empty file modified tasmanian/__pycache__/intersections.cpython-37.pyc
100644 → 100755
Empty file.
Empty file modified tasmanian/__pycache__/tasmanian.cpython-37.pyc
100644 → 100755
Empty file.
Empty file modified tasmanian/galaxy_wrapper/Tasmanian.xml
100644 → 100755
Empty file.
Empty file.
Empty file modified tasmanian/galaxy_wrapper/test-data/small_region.fa
100644 → 100755
Empty file.
Empty file modified tasmanian/galaxy_wrapper/test-data/small_region.fa.fai
100644 → 100755
Empty file.
Empty file modified tasmanian/galaxy_wrapper/test-data/tasmanian_test_output.table
100644 → 100755
Empty file.
Empty file modified tasmanian/galaxy_wrapper/test-data/test.bam
100644 → 100755
Empty file.
Empty file modified tasmanian/galaxy_wrapper/test-data/test.output
100644 → 100755
Empty file.
Empty file modified tasmanian/galaxy_wrapper/test-data/test2.bam
100644 → 100755
Empty file.
Empty file modified tasmanian/galaxy_wrapper/test-data/test2.output
100644 → 100755
Empty file.
Empty file modified tasmanian/galaxy_wrapper/tool-data/tasmanian_index.loc.sample
100644 → 100755
Empty file.
Empty file modified tasmanian/galaxy_wrapper/tool-data/tasmanian_indexes.loc
100644 → 100755
Empty file.
Empty file modified tasmanian/galaxy_wrapper/tool_test_output.html
100644 → 100755
Empty file.
Empty file modified tasmanian/galaxy_wrapper/tool_test_output.json
100644 → 100755
Empty file.
Empty file modified tasmanian/manuscript.txt
100644 → 100755
Empty file.
Empty file modified tasmanian/manuscript_figures.pptx
100644 → 100755
Empty file.
Empty file modified tasmanian/scripts/collect_data.py
100644 → 100755
Empty file.
Empty file modified tasmanian/scripts/count_CpG_GC_repeat.py
100644 → 100755
Empty file.
Empty file modified tasmanian/scripts/hist.py
100644 → 100755
Empty file.
114 changes: 100 additions & 14 deletions tasmanian/tasmanian_script.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
#!/bin/python
#!/usr/bin/env python

# TODO: add context nucleotides?

import sys, os, re, time
import sys, os, re, time, pickle
import numpy as np
import pandas as pd
from itertools import product
from itertools import product, permutations
import logging, uuid
from scipy.stats import mode

try:
from tasmanian.utils.utils import revcomp, simple_deltas_is_this_garbage, init_artifacts_table, load_reference, trim_table
from tasmanian.utils.utils import fill_PFM, initialize_PFM, pfm2ppm, ppm2pwm
from tasmanian.utils.plot import plot_html
except Exception as e: #ImportError: #ModuleNotFoundError:
# Either tests or base_dir, it's downstream of ../tasmanian/tasmanian/
Expand All @@ -21,7 +22,7 @@
utils_path = p + '/utils'
sys.path = [utils_path] + sys.path

from utils import revcomp, simple_deltas_is_this_garbage, init_artifacts_table, load_reference, trim_table
from utils import revcomp, simple_deltas_is_this_garbage, init_artifacts_table, load_reference, trim_table, fill_PFM, initialize_PFM, pfm2ppm, ppm2pwm
from plot import plot_html

###############################################################################
Expand Down Expand Up @@ -49,7 +50,10 @@ def analyze_artifacts(Input, Args):
-g|--fragment-length (use fragments withi these lengths ONLY)
-o|--output-prefix (use this prefix for the output and logging files)
-c|--confidence (number of bases in the complement region of the read)
-d|--debug (create a log file)
-d|--debug (create a log file)
-O|--ont (this is ONT data)
-p|--picard-logic (normalize tables based on picard CollectSequencingArtifactMetrics logic)
-P|--include-pwm
"""

SKIP_READS = {
Expand Down Expand Up @@ -80,6 +84,9 @@ def analyze_artifacts(Input, Args):
check_lengths_counter = 0
confidence = 20
debug = False
ONT = False
picard = False
PWM, flanking_n = False, 5

# if there are arguments get them
for n,i in enumerate(Args):
Expand Down Expand Up @@ -115,6 +122,19 @@ def analyze_artifacts(Input, Args):
confidence = int(sys.argv[n+1])
if i in ['-d','--debug']:
debug = True
if i in ['-O','--ont']:
ONT = True
READ_LENGTH=100000
MAX_LENGTH=100000
TLEN=np.array([0,100000])
check_lengths = READ_LENGTH
if i in ['-p','--picard-logic']:
picard = True
if i in ['-P','--include-pwm']:
PWM = True
if len(sys.argv)>n+1:
flanking_n = int(sys.argv[n+1]) if sys.argv[n+1].isnumeric() else flanking_n


if debug:
# if debugging create this logfile
Expand Down Expand Up @@ -153,7 +173,14 @@ def analyze_artifacts(Input, Args):
# in thee tables the CONFIDENCE reads ONLY. These are reads with >= CONFIDENCE bases in the complement
errors_intersectionB = init_artifacts_table(READ_LENGTH)
errors_complementB = init_artifacts_table(READ_LENGTH)


# initialize PFM (later on converted to PWM)
if PWM:
All_combinations = [
''.join(i) for i in permutations(['A','C','T','G'], 2)] +\
['AA','CC','GG','TT']

PFM = {i: initialize_PFM(flanking_n=flanking_n) for i in All_combinations}

# to check that there are tasmanian flags in sam file and if not, use unrelateds table only.
bed_tag = False
Expand All @@ -166,7 +193,7 @@ def analyze_artifacts(Input, Args):
if line[0]=="@":
continue

_, flag, chrom, start, mapq, cigar, _, _, tlen, seq, phred = line.split('\t')[:11]
read_id, flag, chrom, start, mapq, cigar, _, _, tlen, seq, phred = line.split('\t')[:11]

# If sam data does not have the tasmanian tag, there is no intersection with bed and proceed to a single output table
if bed_tag_counter<10 and bed_tag==False:
Expand Down Expand Up @@ -262,6 +289,10 @@ def analyze_artifacts(Input, Args):

elif i=='D':
ref_idx[position:position+number]=0

elif i=='H':
ref_idx[position:position+number]=0
seq_idx[position:position+number]=0

position += number
last = current+1
Expand All @@ -277,20 +308,23 @@ def analyze_artifacts(Input, Args):
pass

# if bin(int(flag))[2:][-5]=='1' or make it easy for now...
if flag==99:
if flag==99 or (ONT and flag in [0, 2048]): # for ont, not considering "secondary alignments", only "supp"
strand='fwd'; read=1
elif flag==163:
strand='fwd'; read=2
elif flag==83:
elif flag==83 or (ONT and flag in [16, 2064]):
strand='rev'; read=1
elif flag==147:
strand='rev'; read=2
else: continue

# At this point only accepted reads are analyzed. incorporate the fist X read length and later get mode
if check_lengths_counter < 100:
if check_lengths_counter<100 and not ONT:
check_lengths.append(seq_len)
check_lengths_counter +=1
elif ONT:
if seq_len > check_lengths: # If ONT, check_lengths is a number not a list
check_lengths = seq_len # THIS IS SAFE: if read length is > TLEN, it will be discarded

if _UNMASK_GENOME:
ref = ''.join(ref).upper() # re-think before doing this.
Expand Down Expand Up @@ -350,14 +384,49 @@ def analyze_artifacts(Input, Args):
errors_complementB[read][read_pos][ref_pos][Base.upper()] += 1
else:
errors_complement[read][read_pos][ref_pos][Base] += 1

# write somewhere read_id, flag(read-number + strand),
# read_position, chrom, genomic-position, mismatch-type.
if debug:
if ref_pos != Base and Base in ['A','C','T','G']:
sys.stderr.write('{},{},{},{},{},{},{}\n'.format(read_id, flag, read_pos, chrom, pos+start, ref_pos, Base))

if PWM:
# We have to fix this. For now, avoid positions too close to the ends of the reads
#if pos <=flanking_n:
# this_seq = ref[0:pos*2+1] # keep it symetrical (I am not sure though)
#elif pos+flanking_n > len(ref):
# continue
#else:
this_seq = ref[pos-flanking_n:pos+flanking_n+1] # Assuming 0-based index

if strand == 'rev':
this_seq = [revcomp(b) for b in this_seq][::-1]

# This will be replaced with a smarter option.
if len(this_seq) < flanking_n*2+1:
continue

this_seq = ''.join(this_seq)
fill_PFM(this_seq, PFM[''.join([ref_pos,Base])])

except Exception as e:
if debug:
logger.warning('error:{} in chr:{}, position:{}, read:{}, base:{}, seq:{}, start:{} and ref_pos:{}'.format(\
str(e), chrom, pos, read, base, ''.join(seq), start, ref[pos]))
PWM = {}
for k,v in PFM.items():
PWM[k] = pfm2ppm(v)

for k,v in PWM.items():
if k[0]==k[1]: # careful dont make CC, GG, TT, AA to matrices of ones!!
continue
else:
base_dist = PWM[k[0]+k[0]]
PWM[k] = ppm2pwm(v, base_dist)

# fix tables on length
READ_LENGTH = mode(check_lengths)[0][0]
READ_LENGTH = mode(check_lengths)[0][0] if not ONT else check_lengths
#READ_LENGTH = np.max(check_lengths)

if debug:
Expand All @@ -372,7 +441,7 @@ def analyze_artifacts(Input, Args):
errors_complementB = trim_table(errors_complementB, READ_LENGTH)

#######################
## REPORTING SECTION ##
# REPORTING SECTION ##
#######################

# Report performance and less relevant statistics
Expand Down Expand Up @@ -434,7 +503,7 @@ def analyze_artifacts(Input, Args):
#f.write('\n'.join(logging))
#f.close()

return table
return table, PWM



Expand All @@ -447,7 +516,18 @@ def analyze_artifacts(Input, Args):
Args = sys.argv

# run tasmanian
table = analyze_artifacts(sys.stdin, Args)
table, PWM = analyze_artifacts(sys.stdin, Args)

# print results
table_print = pd.concat(
[i if n==0 else i.iloc[:,2:] for n,i in enumerate(table.values())]
, axis=1)

table_print.columns = ['read','position'] + ','.join([
','.join([t+i for i in table['intersection'].columns[2:]]) for t in ['I','C','N','cI','cC']
]).split(',')

table_print.to_csv(sys.stdout, index=False)

# avoid overrighting report file before saving.
report_filename = 'Tasmanian_artifact_report.html'
Expand All @@ -466,3 +546,9 @@ def analyze_artifacts(Input, Args):
with open(report_filename, 'w') as f:
f.write(report_html)

# save PWM into pickle
pwm_filename = 'Tasmanian_pwm_file' + str(file_num)
with open(pwm_filename, 'wb') as f:
pickle.dump(PWM, f, protocol=pickle.HIGHEST_PROTOCOL)

sys.stderr.write('\n' + report_filename + " and " + pwm_filename + " related files created\n")
Empty file modified tasmanian/test_suite/.DS_Store
100644 → 100755
Empty file.
Loading

0 comments on commit b48e3b2

Please sign in to comment.