From 2b6ddf50e5addf1b6d3b827bcb86f48d152733cb Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 16 Jun 2022 01:13:41 +0200 Subject: [PATCH 01/13] small padding, new tools --- Dockerfile | 4 ++-- pggb | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index e3415db..a9eeb2f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,7 +39,7 @@ RUN apt-get update \ RUN git clone --recursive https://github.com/waveygang/wfmash \ && cd wfmash \ && git pull \ - && git checkout 19234a4a153ea3f3acce6f4ac192afc70bbb7fda \ + && git checkout f3b149b3cf894c6a2d148a868b9d8a94e7d16c40 \ && git submodule update --init --recursive \ && cmake -H. -DCMAKE_BUILD_TYPE=Generic -Bbuild && cmake --build build -- -j $(nproc) \ && cp build/bin/wfmash /usr/local/bin/wfmash \ @@ -57,7 +57,7 @@ RUN git clone --recursive https://github.com/ekg/seqwish \ RUN git clone --recursive https://github.com/pangenome/smoothxg \ && cd smoothxg \ && git pull \ - && git checkout 87264898fa81fbac4602d51a9840186bb82f4da1 \ + && git checkout d32db9543427adcb9b762aced8ea3478cc6a4455 \ && git submodule update --init --recursive \ && sed -i 's/-march=native/-march=haswell/g' deps/spoa/CMakeLists.txt \ && sed -i 's/-march=native/-march=haswell/g' deps/abPOA/CMakeLists.txt \ diff --git a/pggb b/pggb index 088b8a9..cd49cd1 100755 --- a/pggb +++ b/pggb @@ -25,7 +25,7 @@ max_path_jump=0 max_edge_jump=0 target_poa_length=4001,4507 poa_params=false -poa_padding=0.03 +poa_padding=0.001 run_abpoa=false run_global_poa=false do_viz=true @@ -200,7 +200,7 @@ if [ $show_help ]; then echo " -P, --poa-params PARAMS score parameters for POA in the form of match,mismatch,gap1,ext1,gap2,ext2" echo " may also be given as presets: asm5, asm10, asm15, asm20" echo " [default: 1,19,39,3,81,1 = asm5]" - echo " -O, --poa-padding N pad each end of each sequence in POA with N*(longest_poa_seq) bp [default: 0.03]" + echo " -O, --poa-padding N pad each end of each sequence in POA with 
N*(longest_poa_seq) bp [default: 0.001]" echo " -b, --run-abpoa run abPOA [default: SPOA]" echo " -z, --global-poa run the POA in global mode [default: local mode]" echo " -d, --pad-max-depth N depth/haplotype at which we don't pad the POA problem [default: 100]" @@ -546,7 +546,7 @@ if [[ $normalize == true ]]; then resume=false # The PG-SGD sorting (`-pY`) is not deterministic, then all subsequent steps need to be rerun for consistency fi - + if [[ ! -s "$prefix_smoothed_output".final.gfa || $resume == false ]]; then ( $timer -f "$fmt" odgi view -i "$prefix_smoothed_output".final.og -g > "$prefix_smoothed_output".final.gfa ) 2> >(tee -a "$log_file") fi @@ -554,7 +554,7 @@ else if [[ ! -s "$prefix_smoothed_output".final.gfa || $resume == false ]]; then mv "$prefix_smoothed".gfa "$prefix_smoothed_output".final.gfa fi - + if [[ ! -s "$prefix_smoothed_output".final.og || $resume == false ]]; then $timer -f "$fmt" odgi build -t $threads -P -g "$prefix_smoothed_output".final.gfa -o "$prefix_smoothed_output".final.og 2> >(tee -a "$log_file") fi From b72f611dec3fa53041ad5aff287955e8e84dacbf Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 16 Jun 2022 09:53:22 +0200 Subject: [PATCH 02/13] triple smoothing with shorter target lengths provides good accuracy --- pggb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pggb b/pggb index cd49cd1..27e582c 100755 --- a/pggb +++ b/pggb @@ -23,7 +23,7 @@ block_ratio_min=0 pad_max_depth=100 max_path_jump=0 max_edge_jump=0 -target_poa_length=4001,4507 +target_poa_length=700,900,1100 poa_params=false poa_padding=0.001 run_abpoa=false From 93e78a50a2c28605b7ea66429fee8b2e4f606d54 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 16 Jun 2022 10:29:07 +0200 Subject: [PATCH 03/13] min match filter of 19bp working well in tests --- pggb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pggb b/pggb index 27e582c..29eef70 100755 --- a/pggb +++ b/pggb @@ -15,7 +15,7 @@ block_length=false 
sparse_map=false mash_kmer=false mash_kmer_thres=false -min_match_length=47 +min_match_length=19 sparse_factor=0 transclose_batch=10000000 n_haps=false @@ -189,7 +189,7 @@ if [ $show_help ]; then echo " -Y, --exclude-delim C skip mappings between sequences with the same name prefix before" echo " the given delimiter character [default: all-vs-all and !self]" echo " [seqwish]" - echo " -k, --min-match-len N filter exact matches below this length [default: 47]" + echo " -k, --min-match-len N filter exact matches below this length [default: 19]" echo " -f, --sparse-factor N keep this randomly selected fraction of input matches [default: no sparsification]" echo " -B, --transclose-batch number of bp to use for transitive closure batch [default: 10000000]" echo " [smoothxg]" From 37ad45a2781dea9136f795790fe86a484376797a Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 16 Jun 2022 10:29:47 +0200 Subject: [PATCH 04/13] correct help text --- pggb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pggb b/pggb index 29eef70..b19ea08 100755 --- a/pggb +++ b/pggb @@ -200,7 +200,7 @@ if [ $show_help ]; then echo " -P, --poa-params PARAMS score parameters for POA in the form of match,mismatch,gap1,ext1,gap2,ext2" echo " may also be given as presets: asm5, asm10, asm15, asm20" echo " [default: 1,19,39,3,81,1 = asm5]" - echo " -O, --poa-padding N pad each end of each sequence in POA with N*(longest_poa_seq) bp [default: 0.001]" + echo " -O, --poa-padding N pad each end of each sequence in POA with N*(mean_seq_len) bp [default: 0.001]" echo " -b, --run-abpoa run abPOA [default: SPOA]" echo " -z, --global-poa run the POA in global mode [default: local mode]" echo " -d, --pad-max-depth N depth/haplotype at which we don't pad the POA problem [default: 100]" From 0d328c08aa31d26e4c714ec549de77bc37a009d5 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 16 Jun 2022 10:30:35 +0200 Subject: [PATCH 05/13] set -p 90 -s 10k as default --- pggb | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) diff --git a/pggb b/pggb index b19ea08..6c37eaf 100755 --- a/pggb +++ b/pggb @@ -8,9 +8,9 @@ output_dir="" temp_dir="" input_paf=false resume=false -map_pct_id=95 +map_pct_id=90 n_mappings=false -segment_length=5000 +segment_length=10000 block_length=false sparse_map=false mash_kmer=false @@ -178,10 +178,10 @@ if [ $show_help ]; then echo "options:" echo " [wfmash]" echo " -i, --input-fasta FILE input FASTA/FASTQ file" - echo " -s, --segment-length N segment length for mapping [default: 5k]" + echo " -s, --segment-length N segment length for mapping [default: 10k]" echo " -l, --block-length N minimum block length filter for mapping [default: 5*segment-length]" echo " -N, --no-split disable splitting of input sequences during mapping [enabled by default]" - echo " -p, --map-pct-id PCT percent identity for mapping/alignment [default: 95]" + echo " -p, --map-pct-id PCT percent identity for mapping/alignment [default: 90]" echo " -n, --n-mappings N number of mappings to retain for each segment" echo " -x, --sparse-map N keep this fraction of mappings ('auto' for giant component heuristic) [default: 1.0]" echo " -K, --mash-kmer N kmer size for mapping [default: 19]" From 2d9d64ac1b3e1f3710aa9549f36a9ed0e6774ded Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Fri, 17 Jun 2022 00:05:43 +0200 Subject: [PATCH 06/13] update wfmash --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a9eeb2f..6daa4c6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,7 +39,7 @@ RUN apt-get update \ RUN git clone --recursive https://github.com/waveygang/wfmash \ && cd wfmash \ && git pull \ - && git checkout f3b149b3cf894c6a2d148a868b9d8a94e7d16c40 \ + && git checkout 5321ea68e8216e835aa1f70361d2ee24700e9a43 \ && git submodule update --init --recursive \ && cmake -H. 
-DCMAKE_BUILD_TYPE=Generic -Bbuild && cmake --build build -- -j $(nproc) \ && cp build/bin/wfmash /usr/local/bin/wfmash \ From 061e2327c88079d4208bfff899f2dddebc3a50d9 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Fri, 17 Jun 2022 13:04:09 +0200 Subject: [PATCH 07/13] correct docs --- pggb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pggb b/pggb index 6c37eaf..9da54c0 100755 --- a/pggb +++ b/pggb @@ -196,7 +196,7 @@ if [ $show_help ]; then echo " -H, --n-haps N number of haplotypes, if different than that set with -n [default: n-mappings]" echo " -j, --path-jump-max maximum path jump to include in block [default: 0]" echo " -e, --edge-jump-max N maximum edge jump before breaking [default: 0 / off]" - echo " -G, --poa-length-target N,M target sequence length for POA, first pass = N, second pass = M [default: 4001,4507]" + echo " -G, --poa-length-target N,M target sequence length for POA, one per pass [default: 700,900,1100]" echo " -P, --poa-params PARAMS score parameters for POA in the form of match,mismatch,gap1,ext1,gap2,ext2" echo " may also be given as presets: asm5, asm10, asm15, asm20" echo " [default: 1,19,39,3,81,1 = asm5]" From a8f46f39faaf11833943407bf7e68ad74dc38d1c Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 18 Jun 2022 00:28:28 +0200 Subject: [PATCH 08/13] correct doc string --- pggb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pggb b/pggb index 9da54c0..1669f46 100755 --- a/pggb +++ b/pggb @@ -185,7 +185,7 @@ if [ $show_help ]; then echo " -n, --n-mappings N number of mappings to retain for each segment" echo " -x, --sparse-map N keep this fraction of mappings ('auto' for giant component heuristic) [default: 1.0]" echo " -K, --mash-kmer N kmer size for mapping [default: 19]" - echo " -F, --mash-kmer-thres N ignore the top % most-frequent kmers [default: 0.5]" + echo " -F, --mash-kmer-thres N ignore the top % most-frequent kmers [default: 0.1]" echo " -Y, --exclude-delim C skip mappings between 
sequences with the same name prefix before" echo " the given delimiter character [default: all-vs-all and !self]" echo " [seqwish]" From d3be438b8a0540fc4c99c81b5a0df43153d9db9b Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 18 Jun 2022 00:28:54 +0200 Subject: [PATCH 09/13] update wfmash --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 6daa4c6..1f9bc6b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,7 +39,7 @@ RUN apt-get update \ RUN git clone --recursive https://github.com/waveygang/wfmash \ && cd wfmash \ && git pull \ - && git checkout 5321ea68e8216e835aa1f70361d2ee24700e9a43 \ + && git checkout e65ba1c17f35763943d92dffa5dcb1caafcd5c44 \ && git submodule update --init --recursive \ && cmake -H. -DCMAKE_BUILD_TYPE=Generic -Bbuild && cmake --build build -- -j $(nproc) \ && cp build/bin/wfmash /usr/local/bin/wfmash \ From e8849815abb90a7c388c726b23eb4d70a63908c8 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 18 Jun 2022 01:17:06 +0200 Subject: [PATCH 10/13] update wfmash to not get corrupted traceback --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 1f9bc6b..b8103f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,7 +39,7 @@ RUN apt-get update \ RUN git clone --recursive https://github.com/waveygang/wfmash \ && cd wfmash \ && git pull \ - && git checkout e65ba1c17f35763943d92dffa5dcb1caafcd5c44 \ + && git checkout 8900299ebf1b731bbe9e669e8a37e5e52112666b \ && git submodule update --init --recursive \ && cmake -H. 
-DCMAKE_BUILD_TYPE=Generic -Bbuild && cmake --build build -- -j $(nproc) \ && cp build/bin/wfmash /usr/local/bin/wfmash \ From 237bf05daa186424ecf639dbb956364e245cd9e9 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Mon, 27 Jun 2022 16:28:00 +0200 Subject: [PATCH 11/13] update wfmash with a softer default high-freq filter --- Dockerfile | 2 +- pggb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index b8103f3..76198dd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,7 +39,7 @@ RUN apt-get update \ RUN git clone --recursive https://github.com/waveygang/wfmash \ && cd wfmash \ && git pull \ - && git checkout 8900299ebf1b731bbe9e669e8a37e5e52112666b \ + && git checkout bee8b1b1368d22b6509b31ef4262141849794796 \ && git submodule update --init --recursive \ && cmake -H. -DCMAKE_BUILD_TYPE=Generic -Bbuild && cmake --build build -- -j $(nproc) \ && cp build/bin/wfmash /usr/local/bin/wfmash \ diff --git a/pggb b/pggb index 1669f46..cd72617 100755 --- a/pggb +++ b/pggb @@ -185,7 +185,7 @@ if [ $show_help ]; then echo " -n, --n-mappings N number of mappings to retain for each segment" echo " -x, --sparse-map N keep this fraction of mappings ('auto' for giant component heuristic) [default: 1.0]" echo " -K, --mash-kmer N kmer size for mapping [default: 19]" - echo " -F, --mash-kmer-thres N ignore the top % most-frequent kmers [default: 0.1]" + echo " -F, --mash-kmer-thres N ignore the top % most-frequent kmers [default: 0.001]" echo " -Y, --exclude-delim C skip mappings between sequences with the same name prefix before" echo " the given delimiter character [default: all-vs-all and !self]" echo " [seqwish]" From b19dd0e7f74fbd9a3d2286d7b916e5c12cb8554b Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 3 Jul 2022 09:56:19 +0200 Subject: [PATCH 12/13] 5k segment length default --- pggb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pggb b/pggb index a7d076f..a872aef 100755 --- a/pggb +++ b/pggb @@ -10,7 
+10,7 @@ input_paf=false resume=false map_pct_id=90 n_mappings=false -segment_length=10000 +segment_length=5000 block_length=false sparse_map=false mash_kmer=false @@ -178,7 +178,7 @@ if [ $show_help ]; then echo "options:" echo " [wfmash]" echo " -i, --input-fasta FILE input FASTA/FASTQ file" - echo " -s, --segment-length N segment length for mapping [default: 10k]" + echo " -s, --segment-length N segment length for mapping [default: 5k]" echo " -l, --block-length N minimum block length filter for mapping [default: 5*segment-length]" echo " -N, --no-split disable splitting of input sequences during mapping [enabled by default]" echo " -p, --map-pct-id PCT percent identity for mapping/alignment [default: 90]" From 15a42e2ac114efffa6c9d9f9a0c1a063f327d7b7 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 3 Jul 2022 10:53:03 +0200 Subject: [PATCH 13/13] update wfmash to v0.9.1 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 1683069..45af0ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,7 +41,7 @@ RUN apt-get update \ RUN git clone --recursive https://github.com/waveygang/wfmash \ && cd wfmash \ && git pull \ - && git checkout bee8b1b1368d22b6509b31ef4262141849794796 \ + && git checkout e969972c2896146250cad7af60bdb8418ca6148d \ && git submodule update --init --recursive \ && cmake -H. -DCMAKE_BUILD_TYPE=Generic -Bbuild && cmake --build build -- -j $(nproc) \ && cp build/bin/wfmash /usr/local/bin/wfmash \