Skip to content

Commit e051db4

Browse files
author
Johanna
committed
add append (previously concat) with most comments resolved
1 parent a3b2f4d commit e051db4

File tree

7 files changed

+26
-62
lines changed

7 files changed

+26
-62
lines changed

src/cli.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,9 @@ pub enum Commands {
161161
#[arg(required = true, short)]
162162
output: String,
163163
},
164-
/// Concat one sketch file (.skm and .skd pair) with new genomes (seq_files or files_list)
165-
Concat {
166-
/// The first .skd (sketch data) file
164+
/// Append new genomes to be sketched to an existing sketch database
165+
Append {
166+
/// Sketching database basename (so without .skm or .skd)
167167
#[arg(required = true)]
168168
db: String,
169169

src/lib.rs

+8-7
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,7 @@ pub fn main() -> Result<(), Error> {
397397
utils::save_sketch_data(ref_db_name1, ref_db_name2, output)
398398
}
399399

400-
Commands::Concat {
400+
Commands::Append {
401401
db,
402402
seq_files,
403403
file_list,
@@ -409,6 +409,8 @@ pub fn main() -> Result<(), Error> {
409409
threads,
410410
level,
411411
} => {
412+
// An extra thread is needed for the writer. This doesn't 'overuse' CPU
413+
check_threads(*threads + 1);
412414
//get input files
413415
log::info!("Getting input files");
414416
let input_files: Vec<(String, String, Option<String>)> =
@@ -417,10 +419,9 @@ pub fn main() -> Result<(), Error> {
417419

418420
// check whether any of the new files already exist in the db
419421
let db_metadata: MultiSketch = MultiSketch::load(db)
420-
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", db));
421-
println!("{:?}", db_metadata);
422+
.expect(&format!("Could not read sketch metadata from .skm: {}", db));
422423

423-
if !db_metadata.concat_competibility(&input_files) {
424+
if !db_metadata.append_compatibility(&input_files) {
424425
panic!("Databases are not compatible for merging.")
425426
}
426427
log::info!("Passed concat check");
@@ -436,7 +437,7 @@ pub fn main() -> Result<(), Error> {
436437

437438
log::info!(
438439
"Running sketching: k:{:?}; sketch_size:{}; seq:{:?}; threads:{}",
439-
&kmers,
440+
kmers,
440441
sketch_size * u64::BITS as u64,
441442
seq_type,
442443
threads,
@@ -452,7 +453,7 @@ pub fn main() -> Result<(), Error> {
452453
output,
453454
&input_files,
454455
*concat_fasta,
455-
&kmers,
456+
kmers,
456457
sketch_size,
457458
&seq_type,
458459
rc,
@@ -479,7 +480,7 @@ pub fn main() -> Result<(), Error> {
479480
let concat_metadata = db2_metadata.merge_sketches(&db_metadata);
480481
concat_metadata
481482
.save_metadata(output)
482-
.unwrap_or_else(|_| panic!("Couldn't save metadata to {}", output));
483+
.expect(&format!("Could not save metadata to {}", output));
483484
Ok(())
484485
}
485486

src/multisketch.rs

+11-4
Original file line numberDiff line numberDiff line change
@@ -158,14 +158,21 @@ impl MultiSketch {
158158
&& self.get_hash_type() == sketch2.get_hash_type()
159159
}
160160

161-
pub fn concat_competibility(&self, name_vec: &[(String, String, Option<String>)]) -> bool {
161+
pub fn append_compatibility(&self, name_vec: &[(String, String, Option<String>)]) -> bool {
162+
let mut compatibility = true;
163+
let mut duplicate_list = Vec::new();
162164
for (id, _, _) in name_vec.iter() {
163165
if self.name_map.contains_key(id) {
164-
println!("{} is found in both database and input fasta", id);
165-
return false;
166+
duplicate_list.push(id);
167+
compatibility = false;
166168
}
167169
}
168-
true
170+
171+
if !duplicate_list.is_empty() {
172+
println!("Duplicates found: {:?}", duplicate_list);
173+
}
174+
175+
compatibility
169176
}
170177

171178
pub fn merge_sketches(&mut self, sketch2: &Self) -> &mut Self {

tests/concat.rs

+4-40
Original file line numberDiff line numberDiff line change
@@ -4,48 +4,16 @@ use std::path::Path;
44

55
pub mod common;
66
use crate::common::*;
7-
8-
#[path = "../src/io.rs"]
9-
pub mod io;
7+
use sketchlib::io::*;
108

119
use sketchlib::multisketch::MultiSketch;
1210

1311
#[cfg(test)]
1412

1513
mod tests {
14+
use sketchlib::io;
1615
use super::*;
17-
18-
#[test]
19-
fn concat_competibility_test() {
20-
let sandbox = TestSetup::setup();
21-
let ref_db1 = sandbox.file_string("sketches1", TestDir::Input);
22-
let ref_db2 = sandbox.file_string("sketches2", TestDir::Input);
23-
let file_list_name = sandbox.file_string("fasta.txt", TestDir::Input);
24-
let non_seq_files: Option<Vec<String>> = None;
25-
26-
27-
let file_list: Option<String> = Some(file_list_name);
28-
//get input files with file_list
29-
log::info!("Getting input files for test");
30-
let input_file_list = io::get_input_list(&file_list, &non_seq_files);
31-
log::info!("Parsed {} samples in input list", input_file_list.len());
32-
33-
//check if any of the new files are already existant in the db
34-
let db1_metadata: MultiSketch = MultiSketch::load(&ref_db1)
35-
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", ref_db1));
36-
println!("{:?}", db1_metadata);
37-
38-
//check if any of the new files are already existant in the db
39-
let db2_metadata: MultiSketch = MultiSketch::load(&ref_db2)
40-
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", ref_db2));
41-
println!("{:?}", db2_metadata);
42-
43-
// Test case 1:
44-
assert!(
45-
!db1_metadata.concat_competibility(&input_file_list),
46-
"Sketches should not be compatible"
47-
);
48-
}
16+
4917

5018
#[test]
5119
fn test_concat_sketches() {
@@ -62,7 +30,6 @@ mod tests {
6230
.args(&["-o", "part1"])
6331
.assert()
6432
.success();
65-
log::info!("Part 1 Sketched");
6633

6734
Command::new(cargo_bin("sketchlib"))
6835
.current_dir(sandbox.get_wd())
@@ -75,7 +42,6 @@ mod tests {
7542
.args(&["-o", "part2"])
7643
.assert()
7744
.success();
78-
log::info!("Part 2 Sketched");
7945

8046
Command::new(cargo_bin("sketchlib"))
8147
.current_dir(sandbox.get_wd())
@@ -90,7 +56,6 @@ mod tests {
9056
.args(&["-o", "concat_ref"])
9157
.assert()
9258
.success();
93-
log::info!("concat_ref Sketched");
9459

9560
// Overlapping labels fails
9661
Command::new(cargo_bin("sketchlib"))
@@ -107,7 +72,7 @@ mod tests {
10772

10873
Command::new(cargo_bin("sketchlib"))
10974
.current_dir(sandbox.get_wd())
110-
.arg("concat")
75+
.arg("append")
11176
.arg("part1")
11277
.arg("--seq-files")
11378
// .arg(sandbox.file_string("fasta_part2.txt", TestDir::Input))
@@ -117,7 +82,6 @@ mod tests {
11782
.args(&["-o", "concat_test"])
11883
.assert()
11984
.success();
120-
log::info!("concat_test Sketched");
12185

12286
// Check .skm the same
12387
let concat_sketch: MultiSketch =

tests/test_files_in/fasta.txt

-4
This file was deleted.

tests/test_files_in/fasta_part1.txt

-2
This file was deleted.

tests/test_files_in/fasta_part2.txt

-2
This file was deleted.

0 commit comments

Comments
 (0)