Skip to content

Commit 8515a65

Browse files
authored
Merge pull request #37 from bacpop/i35_cli_update
CLI improvements
2 parents 948d240 + 95be75e commit 8515a65

16 files changed

+161
-158
lines changed

Cargo.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
[package]
22
name = "sketchlib"
3-
version = "0.1.1"
3+
version = "0.1.2"
44
authors = [
55
"John Lees <[email protected]>",
66
"Nicholas Croucher <[email protected]>",
77
"Johanna von Wachsmann <[email protected]>",
88
"Victor Rodriguez Bouza <[email protected]>",
9+
"Joel Hellewell <[email protected]>",
910
]
1011
edition = "2021"
1112
description = "Genome and amino-acid sketching"

README.md

+5-3
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@ With all options we typically recommend using `-v` to see all progress during th
2222

2323
Using input fasta/fastq files, create a sketch database. Run `sketchlib sketch -h` to see the help.
2424

25-
- List .fasta files on the command line, or use `-f` to provide a file(s). From file,
26-
these are one line per sample listing the name and fasta file, or name and two read files
27-
(fastq). Inputs can be gzipped or not, this is automatically detected.
25+
- List .fasta files on the command line, or use `-f` to provide a file listing the inputs. Inputs may be gzipped or uncompressed; this is detected automatically.
26+
When using `-f`, the file has one line per sample, containing:
27+
- One column (fasta input): file name, which is also used as the sample name
28+
- Two columns (fasta input): sample name and file name
29+
- Three columns (fastq input): sample name and two read files
2830
- To set the k-mer size in the sketch database you can either give a list of sizes with `--k-vals`
2931
or a sequence `--k-seq` with start,stop,step. e.g. `--k-seq 17,29,4` would sketch at k=17, 21, 25 and 29.
3032
- Set the sketch size with `-s`. Typically 1000 is enough for species level resolution, 10000 for within-species/strain

src/cli.rs

+26-15
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
//! Command line interface, built using [`crate::clap` with `Derive`](https://docs.rs/clap/latest/clap/_derive/_tutorial/index.html)
2-
use clap::{ArgGroup, Parser, Subcommand};
2+
use clap::{ArgGroup, Args, Parser, Subcommand};
33

44
use super::hashing::{AaLevel, HashType, DEFAULT_LEVEL};
55

@@ -40,14 +40,30 @@ pub fn check_threads(threads: usize) {
4040
#[derive(Parser)]
4141
#[command(author, version, about, long_about = None)]
4242
#[command(propagate_version = true)]
43-
pub struct Args {
43+
pub struct MainArgs {
4444
#[doc(hidden)]
4545
#[command(subcommand)]
4646
pub command: Commands,
4747

4848
/// Show progress messages
4949
#[arg(short, long, global = true)]
5050
pub verbose: bool,
51+
52+
/// Don't show any messages
53+
#[arg(long, global = true)]
54+
pub quiet: bool,
55+
}
56+
57+
#[derive(Args)]
58+
#[group(required = true, multiple = false)]
59+
pub struct Kmers {
60+
/// K-mer list (comma separated k-mer values to sketch at)
61+
#[arg(short, long, required = true, value_delimiter = ',')]
62+
pub k_vals: Option<Vec<usize>>,
63+
64+
/// K-mer linear sequence (start,end,step)
65+
#[arg(long, required = true, value_delimiter = ',')]
66+
pub k_seq: Option<Vec<usize>>,
5167
}
5268

5369
/// Subcommands and their specific options
@@ -61,10 +77,10 @@ pub enum Commands {
6177
/// Create sketches from input data
6278
Sketch {
6379
/// List of input FASTA files
64-
#[arg(long, group = "input", num_args = 1.., value_delimiter = ',')]
80+
#[arg(group = "input")]
6581
seq_files: Option<Vec<String>>,
6682

67-
/// File listing input files (tab separated name, sequences)
83+
/// File listing input files (tab separated name, sequences, see README)
6884
#[arg(short, group = "input")]
6985
file_list: Option<String>,
7086

@@ -82,13 +98,8 @@ pub enum Commands {
8298
#[arg(short)]
8399
output: String,
84100

85-
/// K-mer list
86-
#[arg(short, long, group = "kmer", required = true, num_args = 1.., value_delimiter = ',')]
87-
k_vals: Option<Vec<usize>>,
88-
89-
/// K-mer sequence: start end step
90-
#[arg(long, group = "kmer", required = true, num_args = 3)]
91-
k_seq: Option<Vec<usize>>,
101+
#[command(flatten)]
102+
kmers: Kmers,
92103

93104
/// Sketch size
94105
#[arg(short, long, default_value_t = DEFAULT_SKETCHSIZE)]
@@ -173,7 +184,7 @@ pub enum Commands {
173184
db: String,
174185

175186
/// List of input FASTA files
176-
#[arg(long, group = "input", num_args = 1.., value_delimiter = ',')]
187+
#[arg(group = "input")]
177188
seq_files: Option<Vec<String>>,
178189

179190
/// File listing input files (tab separated name, sequences)
@@ -236,7 +247,7 @@ pub enum Commands {
236247
},
237248
}
238249

239-
/// Function to parse command line args into [`Args`] struct
240-
pub fn cli_args() -> Args {
241-
Args::parse()
250+
/// Function to parse command line args into [`MainArgs`] struct
251+
pub fn cli_args() -> MainArgs {
252+
MainArgs::parse()
242253
}

src/hashing/aahash_iterator.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,10 @@ impl RollHash for AaHashIterator {
3434
self.fh = new_it.0;
3535
self.index = new_it.1;
3636
} else {
37-
panic!("K-mer larger than smallest valid sequence, which is:\n{}", std::str::from_utf8(&self.seq).unwrap());
37+
panic!(
38+
"K-mer larger than smallest valid sequence, which is:\n{}",
39+
std::str::from_utf8(&self.seq).unwrap()
40+
);
3841
}
3942
}
4043

src/io.rs

+20-14
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
//! Functions to read input fasta/fastq files
2+
use crate::cli::Kmers;
3+
24
use std::fs::File;
35
use std::io::{stdout, BufRead, BufReader, BufWriter, Write};
46
use std::path::Path;
@@ -10,9 +12,9 @@ pub type InputFastx = (String, String, Option<String>);
1012
pub fn read_input_fastas(seq_files: &[String]) -> Vec<InputFastx> {
1113
let mut input_files = Vec::new();
1214
// matches the file name (no extension) in a full path
13-
let re_path = Regex::new(r"^.+/(.+)\.(?i:fa|fasta|fastq|fastq\.gz)$").unwrap();
15+
let re_path = Regex::new(r"^.+/(.+\.?i:fa|fasta|fastq|fastq\.gz)$").unwrap();
1416
// matches the file name (no extension) with no path
15-
let re_name = Regex::new(r"^(.+)\.(?i:fa|fasta|fastq|fastq\.gz)$").unwrap();
17+
let re_name = Regex::new(r"^(.+\.?i:fa|fasta|fastq|fastq\.gz)$").unwrap();
1618
for file in seq_files {
1719
let caps = re_path.captures(file).or(re_name.captures(file));
1820
let name = match caps {
@@ -24,14 +26,14 @@ pub fn read_input_fastas(seq_files: &[String]) -> Vec<InputFastx> {
2426
input_files
2527
}
2628

27-
pub fn parse_kmers(k_list: &Option<Vec<usize>>, k_seq: &Option<Vec<usize>>) -> Vec<usize> {
28-
if k_list.is_some() && k_seq.is_some() {
29+
pub fn parse_kmers(k: &Kmers) -> Vec<usize> {
30+
if k.k_vals.is_some() && k.k_seq.is_some() {
2931
panic!("Only one of --k-vals or --k-seq should be specified");
3032
}
3133

32-
let mut kmers = if let Some(k) = k_list {
34+
let mut kmers = if let Some(k) = &k.k_vals {
3335
k.clone().to_vec()
34-
} else if let Some(k) = k_seq {
36+
} else if let Some(k) = &k.k_seq {
3537
(k[0]..=k[1]).step_by(k[2]).collect()
3638
} else {
3739
panic!("Must specify --k-vals or --k-seq");
@@ -79,18 +81,22 @@ pub fn get_input_list(
7981
for line in f.lines() {
8082
let line = line.expect("Unable to read line in file_list");
8183
let fields: Vec<&str> = line.split_whitespace().collect();
82-
// Should be 2 entries for fasta, 3 for fastq
83-
let second_file = match fields.len() {
84-
0..=1 => {
85-
panic!("Unable to parse line in file_list")
86-
}
87-
2 => None,
88-
3 => Some(fields[2].to_string()),
84+
// 1 entry: fasta with name = file
85+
// 2 entries: fasta name, file
86+
// 3 entries: fastq name, file1, file2
87+
let parsed_input = match fields.len() {
88+
1 => ((fields[0].to_string()), fields[0].to_string(), None),
89+
2 => ((fields[0].to_string()), fields[1].to_string(), None),
90+
3 => (
91+
(fields[0].to_string()),
92+
fields[1].to_string(),
93+
Some(fields[2].to_string()),
94+
),
8995
_ => {
9096
panic!("Unable to parse line in file_list")
9197
}
9298
};
93-
input_files.push((fields[0].to_string(), fields[1].to_string(), second_file));
99+
input_files.push(parsed_input);
94100
}
95101
input_files
96102
}

src/lib.rs

+35-16
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use std::time::Instant;
1616
extern crate arrayref;
1717
extern crate num_cpus;
1818
use anyhow::Error;
19-
use indicatif::{ParallelProgressIterator, ProgressStyle};
19+
use indicatif::ParallelProgressIterator;
2020
use rayon::prelude::*;
2121

2222
pub mod cli;
@@ -45,6 +45,8 @@ pub mod bloom_filter;
4545
pub mod hashing;
4646

4747
pub mod utils;
48+
use crate::utils::get_progress_bar;
49+
4850
use std::fs::{File, OpenOptions};
4951
use std::io::copy;
5052

@@ -59,7 +61,9 @@ pub const CHUNK_SIZE: usize = 1000;
5961
#[doc(hidden)]
6062
pub fn main() -> Result<(), Error> {
6163
let args = cli_args();
62-
if args.verbose {
64+
if args.quiet {
65+
simple_logger::init_with_level(log::Level::Error).unwrap();
66+
} else if args.verbose {
6367
simple_logger::init_with_level(log::Level::Info).unwrap();
6468
// simple_logger::init_with_level(log::Level::Trace).unwrap();
6569
} else {
@@ -76,8 +80,7 @@ pub fn main() -> Result<(), Error> {
7680
#[cfg(feature = "3di")]
7781
convert_pdb,
7882
output,
79-
k_vals,
80-
k_seq,
83+
kmers,
8184
mut sketch_size,
8285
seq_type,
8386
level,
@@ -97,7 +100,7 @@ pub fn main() -> Result<(), Error> {
97100
log::info!("Getting input files");
98101
let input_files = get_input_list(file_list, seq_files);
99102
log::info!("Parsed {} samples in input list", input_files.len());
100-
let kmers = parse_kmers(k_vals, k_seq);
103+
let kmers = parse_kmers(kmers);
101104
// Build, merge
102105
let rc = !*single_strand;
103106
// Set expected sketchsize
@@ -128,6 +131,7 @@ pub fn main() -> Result<(), Error> {
128131
rc,
129132
*min_count,
130133
*min_qual,
134+
args.quiet,
131135
);
132136
let sketch_vec = MultiSketch::new(&mut sketches, sketch_size, &kmers, seq_type);
133137
sketch_vec
@@ -196,8 +200,8 @@ pub fn main() -> Result<(), Error> {
196200
};
197201
log::info!("{dist_type}");
198202

199-
let bar_style =
200-
ProgressStyle::with_template("{percent}% {bar:80.cyan/blue} eta:{eta}").unwrap();
203+
let percent = true; // In progress bar, don't show total number as huge
204+
201205
// TODO: possible improvement would be to load sketch slices when i, j change
202206
// This would require a change to core_acc where multiple k-mer lengths are loaded at once
203207
// Overall this would be nicer I think (not sure about speed)
@@ -210,10 +214,11 @@ pub fn main() -> Result<(), Error> {
210214
log::info!("Calculating all ref vs ref distances");
211215
let mut distances = DistanceMatrix::new(&references, None, dist_type);
212216
let par_chunk = CHUNK_SIZE * distances.n_dist_cols();
217+
let progress_bar = get_progress_bar(par_chunk, percent, args.quiet);
213218
distances
214219
.dists_mut()
215220
.par_chunks_mut(par_chunk)
216-
.progress_with_style(bar_style)
221+
.progress_with(progress_bar)
217222
.enumerate()
218223
.for_each(|(chunk_idx, dist_slice)| {
219224
// Get first i, j index for the chunk
@@ -228,7 +233,11 @@ pub fn main() -> Result<(), Error> {
228233
references.get_sketch_slice(j, k),
229234
references.sketch_size,
230235
);
231-
dist = if *ani { ani_pois(dist, k_f32) } else { 1.0_f32 - dist };
236+
dist = if *ani {
237+
ani_pois(dist, k_f32)
238+
} else {
239+
1.0_f32 - dist
240+
};
232241
dist_slice[dist_idx] = dist;
233242
} else {
234243
let dist =
@@ -259,13 +268,14 @@ pub fn main() -> Result<(), Error> {
259268
log::info!("Calculating sparse ref vs ref distances with {nn} nearest neighbours");
260269
let mut sp_distances =
261270
SparseDistanceMatrix::new(&references, nn, dist_type);
271+
let progress_bar = get_progress_bar(nn, percent, args.quiet);
262272
// TODO is it possible to add a template to the trait so this code is only written once? Maybe not
263273
match sp_distances.dists_mut() {
264274
DistVec::Jaccard(distances) => {
265275
let k = k_idx.unwrap();
266276
distances
267277
.par_chunks_mut(nn)
268-
.progress_with_style(bar_style)
278+
.progress_with(progress_bar)
269279
.enumerate()
270280
.for_each(|(i, row_dist_slice)| {
271281
let mut heap = BinaryHeap::with_capacity(nn);
@@ -279,8 +289,11 @@ pub fn main() -> Result<(), Error> {
279289
references.get_sketch_slice(j, k),
280290
references.sketch_size,
281291
);
282-
dist =
283-
if *ani { ani_pois(dist, k_f32) } else { 1.0_f32 - dist };
292+
dist = if *ani {
293+
ani_pois(dist, k_f32)
294+
} else {
295+
1.0_f32 - dist
296+
};
284297
let dist_item = SparseJaccard(j, dist);
285298
if heap.len() < nn
286299
|| dist_item < *heap.peek().unwrap()
@@ -299,7 +312,7 @@ pub fn main() -> Result<(), Error> {
299312
DistVec::CoreAcc(distances) => {
300313
distances
301314
.par_chunks_mut(nn)
302-
.progress_with_style(bar_style)
315+
.progress_with(progress_bar)
303316
.enumerate()
304317
.for_each(|(i, row_dist_slice)| {
305318
let mut heap = BinaryHeap::with_capacity(nn);
@@ -339,10 +352,11 @@ pub fn main() -> Result<(), Error> {
339352
DistanceMatrix::new(&references, Some(&query_db), dist_type);
340353
let par_chunk = CHUNK_SIZE * distances.n_dist_cols();
341354
let nq = query_db.number_samples_loaded();
355+
let progress_bar = get_progress_bar(par_chunk, percent, args.quiet);
342356
distances
343357
.dists_mut()
344358
.par_chunks_mut(par_chunk)
345-
.progress_with_style(bar_style)
359+
.progress_with(progress_bar)
346360
.enumerate()
347361
.for_each(|(chunk_idx, dist_slice)| {
348362
// Get first i, j index for the chunk
@@ -355,7 +369,11 @@ pub fn main() -> Result<(), Error> {
355369
query_db.get_sketch_slice(j, k),
356370
references.sketch_size,
357371
);
358-
dist = if *ani { ani_pois(dist, k_f32) } else { 1.0_f32 - dist };
372+
dist = if *ani {
373+
ani_pois(dist, k_f32)
374+
} else {
375+
1.0_f32 - dist
376+
};
359377
dist_slice[dist_idx] = dist;
360378
} else {
361379
let dist = core_acc_dist(&references, &query_db, i, j);
@@ -474,6 +492,7 @@ pub fn main() -> Result<(), Error> {
474492
rc,
475493
*min_count,
476494
*min_qual,
495+
args.quiet,
477496
);
478497
let mut db2_metadata =
479498
MultiSketch::new(&mut db2_sketches, sketch_size, kmers, seq_type);
@@ -554,7 +573,7 @@ pub fn main() -> Result<(), Error> {
554573
let end = Instant::now();
555574

556575
log::info!("Complete");
557-
if print_success {
576+
if print_success && !args.quiet {
558577
eprintln!(
559578
"🧬🖋️ sketchlib done in {}s",
560579
end.duration_since(start).as_secs()

0 commit comments

Comments
 (0)