-
Notifications
You must be signed in to change notification settings - Fork 1
/
make-pseudoreps.sh
executable file
·57 lines (46 loc) · 1.93 KB
/
make-pseudoreps.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#! /usr/bin/env bash
# For creating pseudoreplicates from file
#make-pseudoreps.sh #{{{
usage() { echo "Usage: $0 [-b <bam file to be split>] [-o <output directory>] [-n <number of pseudoreplicates>]" 1>&2; exit 1; }
# : means takes an argument but not mandatory (if mandatory will have to check after)
options=':b:o:n:h'
while getopts $options option
do
case $option in
b ) bam=(${OPTARG}) ;;
o ) output_path=${OPTARG} ;;
n ) NUM=${OPTARG} ;;
h ) usage ;;
: ) echo "Missing option argument for -$OPTARG" >&2; usage ;;
\?) echo "Unknown option: -$OPTARG" >&2; usage ;;
* ) usage ;;
esac
done
shift $((OPTIND-1))
if [ -z "$bam" ] || [ -z "$output_path" ]
then
usage
fi
if [ -z "$NUM" ]
then
NUM=2
fi
#}}}
target=$(basename ${bam/.bam/})
# This will shuffle the lines in the file and split it into two parts
total_reads=$(samtools view -c ${bam})
subset_reads=$(( total_reads / ${NUM} ))
command="samtools view ${bam} | shuf | split -d -l ${subset_reads} - ${output_path}/${target}"
eval ${command}
samtools view -H ${bam} > ${output_path}/${target}.sam_header
# In the last sed command using @ as the delimiter instead of / because $command (which is expanded in the substitution) contains / and so sed thinks the command ends before it actually does
tail -n 1 ${output_path}/${target}.sam_header | grep '@PG' | sed 's/ID:.*\tVN/ID:shuf-split\tVN/g' | sed 's/VN:.*CL/VN:unk\tCL/' | sed "s@CL:.*@CL:\"$command\"@g" > ${output_path}/${target}.sam_PG_line
for i in `seq 0 $((NUM - 1))`
do
cat ${output_path}/${target}.sam_header ${output_path}/${target}.sam_PG_line ${output_path}/${target}0${i} > ${output_path}/${target}0${i}.sam
samtools view -bSo ${output_path}/${target}_pseudoreps0${i}.bam ${output_path}/${target}0${i}.sam
rm ${output_path}/${target}0${i}
rm ${output_path}/${target}0${i}.sam
done
rm ${output_path}/${target}.sam_PG_line
rm ${output_path}/${target}.sam_header