Skip to content

Commit

Permalink
Add fasr concat
Browse files Browse the repository at this point in the history
  • Loading branch information
wang-q committed Jul 12, 2023
1 parent d3e9726 commit fc4ebb2
Show file tree
Hide file tree
Showing 8 changed files with 164 additions and 4 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,8 @@ cargo run --bin fasr maf2fas tests/fasr/example.maf

cargo run --bin fasr name tests/fasr/example.fas -c

cargo run --bin fasr concat tests/fasr/example.fas tests/fasr/name.lst

```


Expand Down
107 changes: 107 additions & 0 deletions src/cmd_fasr/concat.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
use clap::*;
use intspan::*;
use std::collections::BTreeMap;

/// Builds the clap definition of the `concat` subcommand.
///
/// Two required positionals are declared, `infile` (index 1) and `name.lst`
/// (index 2), followed by the `--phylip` flag and the `-o/--outfile` option
/// whose default value is `stdout`.
pub fn make_subcommand() -> Command {
    // Extended help printed after the generated argument list.
    const AFTER_HELP: &str = r###"
* <infile> is path to block fasta file, .fas.gz is supported
* infile == stdin means reading from STDIN
* <name.lst> is a file with a list of names to keep, one per line
* Orders in the output file will following the ones in <name.lst>
"###;

    // Positional arguments
    let infile = Arg::new("infile")
        .required(true)
        .num_args(1)
        .index(1)
        .help("Sets the input file to use");
    let name_list = Arg::new("name.lst")
        .required(true)
        .num_args(1)
        .index(2)
        .help("Path to name.lst");

    // Options
    let phylip = Arg::new("phylip")
        .long("phylip")
        .action(ArgAction::SetTrue)
        .help("Output relaxed phylip instead of fasta");
    let outfile = Arg::new("outfile")
        .long("outfile")
        .short('o')
        .num_args(1)
        .default_value("stdout")
        .help("Output filename. [stdout] for screen");

    // Arguments are registered in the same order as before.
    Command::new("concat")
        .about("Concatenate sequence pieces of same species")
        .after_help(AFTER_HELP)
        .arg(infile)
        .arg(name_list)
        .arg(phylip)
        .arg(outfile)
}

/// Runs the `concat` subcommand.
///
/// For every name listed in `<name.lst>` the sequences of the matching
/// entries of each block are appended to one growing string; a name that is
/// absent from a block is padded with `-` repeated to the length of the
/// block's first entry. The result is written as fasta, or as relaxed
/// phylip when `--phylip` is given.
///
/// Records are written in the order the names appear in `<name.lst>`, as the
/// subcommand's help text promises. A name listed more than once is used
/// only once, so its sequence is not appended repeatedly.
///
/// # Errors
///
/// Returns an error when a block contains no entries, when a sequence is not
/// valid UTF-8, or when writing the output fails.
pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
    //----------------------------
    // Loading
    //----------------------------
    let mut writer = writer(args.get_one::<String>("outfile").unwrap());
    let mut reader = reader(args.get_one::<String>("infile").unwrap());
    let is_phylip = args.get_flag("phylip");

    //----------------------------
    // Load names
    //----------------------------
    let names = read_first_column(args.get_one::<String>("name.lst").unwrap());

    // `seq_of` accumulates the concatenated sequence of each name, while
    // `ordered` remembers the first-seen order of the distinct names so the
    // output does not depend on the map's sorted iteration order.
    let mut seq_of: BTreeMap<String, String> = BTreeMap::new();
    let mut ordered = Vec::new();
    for name in &names {
        if seq_of.insert(name.to_string(), String::new()).is_none() {
            ordered.push(name);
        }
    }

    //----------------------------
    // Concatenate
    //----------------------------
    // NOTE(review): the loop stops at the first `Err` from `next_fas_block`;
    // presumably that is how end of input is signalled - confirm that real
    // parse errors should not be reported to the user.
    while let Ok(block) = next_fas_block(&mut reader) {
        // Width of this block, used to pad the names it does not contain.
        let length = block
            .entries
            .first()
            .ok_or_else(|| anyhow::anyhow!("alignment block without entries"))?
            .seq()
            .len();

        for name in &ordered {
            // Every key in `ordered` was inserted above, so the lookup succeeds.
            let Some(joined) = seq_of.get_mut(*name) else {
                continue;
            };

            if block.names.contains(*name) {
                // As before, every entry carrying this name is appended.
                for entry in &block.entries {
                    let entry_name = entry.range().name();
                    if entry_name == *name {
                        joined.push_str(std::str::from_utf8(entry.seq())?);
                    }
                }
            } else {
                // fill absent names with ------
                joined.push_str("-".repeat(length).as_str());
            }
        }
    }

    //----------------------------
    // Output
    //----------------------------
    if is_phylip {
        // Header line: number of sequences and alignment length.
        let count = ordered.len();
        let length = ordered
            .first()
            .and_then(|name| seq_of.get(*name))
            .map_or(0, String::len);
        writer.write_all(format!("{} {}\n", count, length).as_ref())?;
    }

    for name in &ordered {
        let Some(seq) = seq_of.get(*name) else {
            continue;
        };
        let record = if is_phylip {
            format!("{} {}\n", name, seq)
        } else {
            format!(">{}\n{}\n", name, seq)
        };
        writer.write_all(record.as_ref())?;
    }

    Ok(())
}
1 change: 1 addition & 0 deletions src/cmd_fasr/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
//! Subcommand modules for the `fasr` binary.

pub mod concat;
pub mod maf2fas;
pub mod name;
9 changes: 5 additions & 4 deletions src/cmd_fasr/name.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ use std::collections::BTreeMap;
// Create clap subcommand arguments
pub fn make_subcommand() -> Command {
Command::new("name")
.about("Scan block fasta files and output all species names")
.about("Output all species names")
.after_help(
r###"
* <infiles> are paths to fas files, .fas.gz is supported
* <infiles> are paths to block fasta files, .fas.gz is supported
* infile == stdin means reading from STDIN
"###,
Expand Down Expand Up @@ -51,10 +51,11 @@ pub fn execute(args: &ArgMatches) -> anyhow::Result<()> {
let mut reader = reader(infile);

while let Ok(block) = next_fas_block(&mut reader) {
for entry in block.entries {
for entry in &block.entries {
let range = entry.range();

count_of.entry(range.name().to_string())
count_of
.entry(range.name().to_string())
.and_modify(|e| *e += 1)
.or_insert(1);
}
Expand Down
2 changes: 2 additions & 0 deletions src/fasr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@ fn main() -> anyhow::Result<()> {
.propagate_version(true)
.arg_required_else_help(true)
.color(ColorChoice::Auto)
.subcommand(cmd_fasr::concat::make_subcommand())
.subcommand(cmd_fasr::name::make_subcommand())
.subcommand(cmd_fasr::maf2fas::make_subcommand());

// Check which subcommand the user ran...
match app.get_matches().subcommand() {
Some(("concat", sub_matches)) => cmd_fasr::concat::execute(sub_matches),
Some(("maf2fas", sub_matches)) => cmd_fasr::maf2fas::execute(sub_matches),
Some(("name", sub_matches)) => cmd_fasr::name::execute(sub_matches),
_ => unreachable!(),
Expand Down
4 changes: 4 additions & 0 deletions src/libs/fas.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ impl FasEntry {
/// A Fas alignment block.
///
/// Built by `parse_fas_block`, which pushes one element onto each of the two
/// vectors for every header/sequence pair it reads.
pub struct FasBlock {
/// Parsed entries of the block, in the order they were read.
pub entries: Vec<FasEntry>,
/// Range name of each entry, pushed in the same order as `entries`.
pub names: Vec<String>,
}

/// Get the next FasBlock out of the input.
Expand Down Expand Up @@ -131,17 +132,20 @@ pub fn parse_fas_block(
block_lines.push_back(line);
}
let mut block_entries: Vec<FasEntry> = vec![];
let mut block_names: Vec<String> = vec![];

while let Some(header) = block_lines.pop_front() {
let range = Range::from_str(header.as_str());
let seq = block_lines.pop_front().unwrap().as_bytes().to_vec();

let entry = FasEntry::from(&range, &seq);
block_entries.push(entry);
block_names.push(range.name().to_string());
}

Ok(FasBlock {
entries: block_entries,
names: block_names,
})
}

Expand Down
41 changes: 41 additions & 0 deletions tests/cli_fasr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,44 @@ fn command_maf2fas() -> anyhow::Result<()> {

Ok(())
}

/// `fasr concat` with the two-name list must emit two fasta records, restricted
/// to the listed names.
#[test]
fn command_concat() -> anyhow::Result<()> {
    let mut cmd = Command::cargo_bin("fasr")?;
    // Propagate spawn/decoding failures through the test's `Result` instead of panicking.
    let output = cmd
        .arg("concat")
        .arg("tests/fasr/example.fas")
        .arg("tests/fasr/name.lst")
        .output()?;
    let stdout = String::from_utf8(output.stdout)?;

    // Two names -> two records, each a header line plus a sequence line.
    let lines: Vec<&str> = stdout.lines().collect();
    assert_eq!(lines.len(), 4);
    assert_eq!(lines.first().unwrap().len(), 5); // >Spar
    assert_eq!(lines.last().unwrap().len(), 239); // concatenated sequence of the last record
    assert!(stdout.contains("Spar"), "name list");
    assert!(!stdout.contains("S288c"), "name list");

    Ok(())
}

/// `fasr concat --phylip` must emit a header line followed by one
/// `name sequence` line per listed name.
#[test]
fn command_concat_phylip() -> anyhow::Result<()> {
    let mut cmd = Command::cargo_bin("fasr")?;
    // Propagate spawn/decoding failures through the test's `Result` instead of panicking.
    let output = cmd
        .arg("concat")
        .arg("tests/fasr/example.fas")
        .arg("tests/fasr/name.lst")
        .arg("--phylip")
        .output()?;
    let stdout = String::from_utf8(output.stdout)?;

    // One header line plus one line for each of the two names.
    assert_eq!(stdout.lines().count(), 3);
    // Last line: name, a single space, then the 239-column sequence.
    assert_eq!(
        stdout.lines().last().unwrap().len(),
        "YJM789".len() + 1 + 239
    );

    Ok(())
}
2 changes: 2 additions & 0 deletions tests/fasr/name.lst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Spar
YJM789

0 comments on commit fc4ebb2

Please sign in to comment.