From 4babc8641666fcceba1c95fd377f4c734eafe175 Mon Sep 17 00:00:00 2001 From: Qiang Wang Date: Mon, 17 Jul 2023 02:47:07 +0800 Subject: [PATCH] `fasr link --best` --- README.md | 5 +++-- src/cmd_fasr/link.rs | 33 +++++++++++++++++++++++++++++++ src/fasr.rs | 1 - src/lib.rs | 1 + src/libs/alignment.rs | 45 +++++++++++++++++++++++++++++++++++++++++++ src/libs/mod.rs | 1 + tests/cli_fasr.rs | 29 +++++++++++++++++++++------- 7 files changed, 105 insertions(+), 10 deletions(-) create mode 100644 src/libs/alignment.rs diff --git a/README.md b/README.md index 60898af..5e8203d 100644 --- a/README.md +++ b/README.md @@ -528,15 +528,16 @@ fasr axt2fas tests/fasr/RM11_1a.chr.sizes tests/fasr/example.axt --qname RM11_1a fasr name tests/fasr/example.fas --count -cargo run --bin fasr cover tests/fasr/example.fas +fasr cover tests/fasr/example.fas -cargo run --bin fasr cover tests/fasr/example.fas --name S288c --trim 10 +fasr cover tests/fasr/example.fas --name S288c --trim 10 fasr concat tests/fasr/name.lst tests/fasr/example.fas fasr subset tests/fasr/name.lst tests/fasr/example.fas fasr link tests/fasr/example.fas --pair +cargo run --bin fasr link tests/fasr/example.fas --best samtools faidx tests/fasr/NC_000932.fa NC_000932:1-10 diff --git a/src/cmd_fasr/link.rs b/src/cmd_fasr/link.rs index 85aa3e7..110d87c 100644 --- a/src/cmd_fasr/link.rs +++ b/src/cmd_fasr/link.rs @@ -1,5 +1,6 @@ use clap::*; use intspan::*; +use itertools::Itertools; // Create clap subcommand arguments pub fn make_subcommand() -> Command { @@ -25,6 +26,12 @@ pub fn make_subcommand() -> Command { .action(ArgAction::SetTrue) .help("Bilateral (pairwise) links"), ) + .arg( + Arg::new("best") + .long("best") + .action(ArgAction::SetTrue) + .help("best-to-best bilateral links"), + ) .arg( Arg::new("outfile") .long("outfile") @@ -42,6 +49,7 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> { //---------------------------- let mut writer = writer(args.get_one::("outfile").unwrap()); let is_pair = args.get_flag("pair"); + let is_best = args.get_flag("best"); for infile in args.get_many::("infiles").unwrap() { let mut reader = reader(infile); @@ -61,6 +69,31 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> { writer.write_all(format!("{}\t{}\n", headers[i], headers[j]).as_ref())?; } } + } else if is_best { + let mut best_pair: Vec<(usize, usize)> = vec![]; + for i in 0..headers.len() { + let mut dist_idx: (f32, usize) = (1.0, headers.len() - 1); + for j in 0..headers.len() { + if i == j { + continue; + } + let dist = pair_d(block.entries[i].seq(), block.entries[j].seq()); + if dist < dist_idx.0 { + dist_idx = (dist, j); + } + } + if i < dist_idx.1 { + best_pair.push((i, dist_idx.1)); + } else { + best_pair.push((dist_idx.1, i)); + } + } + // from itertools + let best_pair: Vec<(usize, usize)> = best_pair.into_iter().unique().collect(); + + for (i, j) in best_pair { + writer.write_all(format!("{}\t{}\n", headers[i], headers[j]).as_ref())?; + } } else { writer.write_all(format!("{}\n", headers.join("\t")).as_ref())?; } diff --git a/src/fasr.rs b/src/fasr.rs index f18f99c..87f0253 100644 --- a/src/fasr.rs +++ b/src/fasr.rs @@ -47,7 +47,6 @@ fn main() -> anyhow::Result<()> { Ok(()) } -// TODO: fasr link --best // TODO: replace samtools // TODO: add more tools // TODO: simple - replace, slice diff --git a/src/lib.rs b/src/lib.rs index 5137e25..600fb0e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ extern crate lazy_static; mod libs; mod utils; +pub use crate::libs::alignment::*; pub use crate::libs::coverage::*; pub use crate::libs::fas::*; pub use crate::libs::intspan::*; diff --git a/src/libs/alignment.rs b/src/libs/alignment.rs new file mode 100644 index 0000000..939b32f --- /dev/null +++ b/src/libs/alignment.rs @@ -0,0 +1,45 @@ +use std::collections::HashSet; + +lazy_static! { + static ref BASES: HashSet = vec![b'a', b'g', b'c', b't', b'A', b'G', b'C', b'T',] + .into_iter() + .collect(); +} + +/// Distance (D) between two sequences +/// +/// ``` +/// // * ** ** +/// let seq1 = b"GTCTGCATGCN"; +/// let seq2 = b"TTTAGCTAgc-"; +/// // difference 5 +/// // comparable 10 +/// assert_eq!(intspan::pair_d(seq1, seq2), 0.5); +/// ``` +pub fn pair_d(seq1: &[u8], seq2: &[u8]) -> f32 { + assert_eq!( + seq1.len(), + seq2.len(), + "Two sequences of different length ({}!={})", + seq1.len(), + seq2.len() + ); + + let mut comparable = 0; + let mut difference = 0; + + for (base1, base2) in seq1.iter().zip(seq2) { + if BASES.contains(base1) && BASES.contains(base2) { + comparable += 1; + if base1.to_ascii_uppercase() != base2.to_ascii_uppercase() { + difference += 1; + } + } + } + + assert_ne!(comparable, 0, "Comparable bases shouldn't be zero"); + + // eprintln!("{} {}", difference, comparable); + + difference as f32 / comparable as f32 +} diff --git a/src/libs/mod.rs b/src/libs/mod.rs index a808804..125f268 100644 --- a/src/libs/mod.rs +++ b/src/libs/mod.rs @@ -1,3 +1,4 @@ +pub mod alignment; pub mod coverage; pub mod fas; pub mod intspan; diff --git a/tests/cli_fasr.rs b/tests/cli_fasr.rs index fa8d6fa..4854274 100644 --- a/tests/cli_fasr.rs +++ b/tests/cli_fasr.rs @@ -178,11 +178,7 @@ fn command_link() -> anyhow::Result<()> { assert_eq!(stdout.lines().count(), 3); assert_eq!(stdout.lines().next().unwrap().split_whitespace().count(), 4); - Ok(()) -} - -#[test] -fn command_link_pair() -> anyhow::Result<()> { + // --pair let mut cmd = Command::cargo_bin("fasr")?; let output = cmd .arg("link") @@ -195,6 +191,19 @@ fn command_link_pair() -> anyhow::Result<()> { assert_eq!(stdout.lines().count(), 18); assert_eq!(stdout.lines().next().unwrap().split_whitespace().count(), 2); + // --best + let mut cmd = Command::cargo_bin("fasr")?; + let output = cmd + .arg("link") + .arg("tests/fasr/example.fas") + .arg("--best") + .output() + .unwrap(); + let stdout = String::from_utf8(output.stdout).unwrap(); + + assert_eq!(stdout.lines().count(), 9); + assert_eq!(stdout.lines().next().unwrap().split_whitespace().count(), 2); + Ok(()) } @@ -394,7 +403,10 @@ fn command_join() -> anyhow::Result<()> { let stdout = String::from_utf8(output.stdout).unwrap(); assert_eq!(stdout.lines().count(), 5); - assert!(stdout.lines().next().unwrap().contains(">Spar"), "Selected name first"); + assert!( + stdout.lines().next().unwrap().contains(">Spar"), + "Selected name first" + ); let mut cmd = Command::cargo_bin("fasr")?; let output = cmd @@ -407,7 +419,10 @@ fn command_join() -> anyhow::Result<()> { let stdout = String::from_utf8(output.stdout).unwrap(); assert_eq!(stdout.lines().count(), 9); - assert!(stdout.lines().next().unwrap().contains(">S288c."), "First name first"); + assert!( + stdout.lines().next().unwrap().contains(">S288c."), + "First name first" + ); Ok(()) }