Skip to content

Commit

Permalink
Add align_stat()
Browse files Browse the repository at this point in the history
  • Loading branch information
wang-q committed Jul 16, 2023
1 parent 4babc86 commit 3406da6
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/fasr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,6 @@ fn main() -> anyhow::Result<()> {

// TODO: replace samtools
// TODO: add more tools
// TODO: simple - replace, slice
// TODO: hard - refine, stat, vars, xlsx
// TODO: simple - replace, stat, slice
// TODO: hard - refine, vars, xlsx
// TODO: fasr kb - scripts for join pairwise alignments p2m
79 changes: 79 additions & 0 deletions src/libs/alignment.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use itertools::Itertools;
use std::collections::HashSet;

lazy_static! {
Expand Down Expand Up @@ -43,3 +44,81 @@ pub fn pair_d(seq1: &[u8], seq2: &[u8]) -> f32 {

difference as f32 / comparable as f32
}

/// Basic stats on alignments
///
/// ```
/// let seqs = vec![
/// // *
/// b"AAAATTTTGG".as_ref(),
/// b"aaaatttttg".as_ref(),
/// ];
/// assert_eq!(intspan::align_stat(&seqs), (10, 10, 1, 0, 0, 0.1,));
///
/// let seqs = vec![
/// //* * *
/// b"TTAGCCGCTGAGAAGCC".as_ref(),
/// b"GTAGCCGCTGA-AGGCC".as_ref(),
/// ];
/// assert_eq!(intspan::align_stat(&seqs), (17, 16, 2, 1, 0, 0.125,));
///
/// let seqs = vec![
/// // * ** * ** * *
/// b"GATTATCATCACCCCAGCCACATW".as_ref(),
/// b"GATTTT--TCACTCCATTCGCATA".as_ref(),
/// ];
/// assert_eq!(intspan::align_stat(&seqs), (24, 21, 5, 2, 1, 0.238,));
///
/// ```
pub fn align_stat(seqs: &[&[u8]]) -> (i32, i32, i32, i32, i32, f32) {
let seq_count = seqs.len();
assert_ne!(seq_count, 0, "Need sequences");

let length = seqs[0].len();

let mut comparable = 0;
let mut difference = 0;
let mut gap = 0;
let mut ambiguous = 0;

// For each position, search for polymorphic sites
for pos in 0..length {
let mut column = vec![];
for i in 0..seq_count {
column.push(seqs[i][pos].to_ascii_uppercase());
}
column = column.into_iter().unique().collect();

if column.clone().into_iter().all(|e| BASES.contains(&e)) {
comparable += 1;
if column.clone().into_iter().any(|e| e != column[0]) {
difference += 1;
}
} else if column.clone().into_iter().any(|e| e == b'-') {
gap += 1;
} else {
ambiguous += 1;
}
}

assert_ne!(comparable, 0, "Comparable bases shouldn't be zero");

let mut dists = vec![];
for i in 0..seq_count {
for j in i + 1..seq_count {
let dist = pair_d(seqs[i], seqs[j]);
dists.push(dist);
}
}

let mean_d = f32::trunc(dists.iter().sum::<f32>() / dists.len() as f32 * 1000.0) / 1000.0;

(
length as i32,
comparable,
difference,
gap,
ambiguous,
mean_d,
)
}

0 comments on commit 3406da6

Please sign in to comment.