Skip to content

Commit a3b2f4d

Browse files
author
Johanna
committed
add concat functionality with tests
1 parent 4cf4faf commit a3b2f4d

File tree

7 files changed

+190
-30
lines changed

7 files changed

+190
-30
lines changed

src/cli.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ pub enum Commands {
161161
#[arg(required = true, short)]
162162
output: String,
163163
},
164-
/// Concat one sketch file (.skm and .skd pair) with new genomes
164+
/// Concat one sketch file (.skm and .skd pair) with new genomes (seq_files or files_list)
165165
Concat {
166166
/// The first .skd (sketch data) file
167167
#[arg(required = true)]
@@ -200,7 +200,7 @@ pub enum Commands {
200200
#[arg(long, value_parser = valid_cpus, default_value_t = 1)]
201201
threads: usize,
202202

203-
/// aaHash 'level'
203+
/// aaHash 'level'
204204
#[arg(long, value_enum, default_value_t = DEFAULT_LEVEL)]
205205
level: AaLevel,
206206
},

src/lib.rs

+28-19
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ pub mod bloom_filter;
3939
pub mod hashing;
4040

4141
pub mod utils;
42+
use std::fs::{File, OpenOptions};
43+
use std::io::copy;
4244

4345
/// Default k-mer size for (genome) sketching
4446
pub const DEFAULT_KMER: usize = 17;
@@ -407,7 +409,6 @@ pub fn main() -> Result<(), Error> {
407409
threads,
408410
level,
409411
} => {
410-
411412
//get input files
412413
log::info!("Getting input files");
413414
let input_files: Vec<(String, String, Option<String>)> =
@@ -419,25 +420,23 @@ pub fn main() -> Result<(), Error> {
419420
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", db));
420421
println!("{:?}", db_metadata);
421422

422-
println!("{:?}", db_metadata.kmer_lengths());
423-
db_metadata.concat_competibility(&input_files);
423+
if !db_metadata.concat_competibility(&input_files) {
424+
panic!("Databases are not compatible for merging.")
425+
}
424426
log::info!("Passed concat check");
425427

426428
// read out sketching information needed to sketch the new files
427429
let kmers = db_metadata.kmer_lengths();
428-
// Build, merge
429430
let rc = !*single_strand;
430-
// Set expected sketchsize
431431
let sketch_size = db_metadata.sketch_size;
432-
// Set aa level
433432
let seq_type = db_metadata.get_hash_type();
434-
435433
if *concat_fasta && matches!(*seq_type, HashType::DNA | HashType::PDB) {
436434
panic!("--concat-fasta currently only supported with --seq-type aa");
437435
}
436+
438437
log::info!(
439438
"Running sketching: k:{:?}; sketch_size:{}; seq:{:?}; threads:{}",
440-
kmers,
439+
&kmers,
441440
sketch_size * u64::BITS as u64,
442441
seq_type,
443442
threads,
@@ -448,7 +447,7 @@ pub fn main() -> Result<(), Error> {
448447
} else {
449448
seq_type.clone()
450449
};
451-
// sketch freshly incoming files
450+
// sketch genomes and save them to concat output file
452451
let mut db2_sketches = sketch_files(
453452
output,
454453
&input_files,
@@ -460,17 +459,27 @@ pub fn main() -> Result<(), Error> {
460459
*min_count,
461460
*min_qual,
462461
);
463-
let db2_metadata = MultiSketch::new(&mut db2_sketches, sketch_size, &kmers, seq_type);
464-
db2_metadata
465-
.save_metadata(output)
466-
.expect("Error saving metadata");
467-
468-
// // save skd data from db1 and from freshly sketched input files
469-
// log::info!("Merging and saving sketch data to {}.skd", output);
470-
// utils::save_sketch_data(db_metadata, db2, output);
462+
let mut db2_metadata =
463+
MultiSketch::new(&mut db2_sketches, sketch_size, kmers, seq_type);
471464

472-
// // read in skm from db1
473-
// // merge and update skm from db1 and the new just sketched sketch
465+
// save skd data from db1 and from freshly sketched input files
466+
log::info!("Merging and saving sketch data to {}.skd", output);
467+
// let mut output_file = File::create(format!("{}.skd", output))?;
468+
let mut output_file = OpenOptions::new()
469+
.create(true)
470+
.append(true)
471+
.open(format!("{}.skd", output))?;
472+
// stream sketch data directly to concat output file
473+
let mut db_sketch = File::open(format!("{}.skd", db))?;
474+
println!("{:?}", db_sketch);
475+
println!("{:?}", output_file);
476+
copy(&mut db_sketch, &mut output_file)?;
477+
478+
// merge the db1 metadata into the metadata of the freshly sketched files and save the combined .skm
479+
let concat_metadata = db2_metadata.merge_sketches(&db_metadata);
480+
concat_metadata
481+
.save_metadata(output)
482+
.unwrap_or_else(|_| panic!("Couldn't save metadata to {}", output));
474483
Ok(())
475484
}
476485

src/multisketch.rs

+4-5
Original file line numberDiff line numberDiff line change
@@ -158,15 +158,14 @@ impl MultiSketch {
158158
&& self.get_hash_type() == sketch2.get_hash_type()
159159
}
160160

161-
pub fn concat_competibility(&self, name_vec: &[(String, String, Option<String>)]) {
161+
pub fn concat_competibility(&self, name_vec: &[(String, String, Option<String>)]) -> bool {
162162
for (id, _, _) in name_vec.iter() {
163163
if self.name_map.contains_key(id) {
164-
panic!(
165-
"{} appears in both the database and the provided files. Cannot concat files.",
166-
id
167-
);
164+
println!("{} is found in both database and input fasta", id);
165+
return false;
168166
}
169167
}
168+
true
170169
}
171170

172171
pub fn merge_sketches(&mut self, sketch2: &Self) -> &mut Self {

tests/concat.rs

+148
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
use predicates::prelude::*;
2+
use snapbox::cmd::{cargo_bin, Command};
3+
use std::path::Path;
4+
5+
pub mod common;
6+
use crate::common::*;
7+
8+
#[path = "../src/io.rs"]
9+
pub mod io;
10+
11+
use sketchlib::multisketch::MultiSketch;
12+
13+
#[cfg(test)]
14+
15+
mod tests {
16+
use super::*;
17+
18+
#[test]
19+
fn concat_competibility_test() {
20+
let sandbox = TestSetup::setup();
21+
let ref_db1 = sandbox.file_string("sketches1", TestDir::Input);
22+
let ref_db2 = sandbox.file_string("sketches2", TestDir::Input);
23+
let file_list_name = sandbox.file_string("fasta.txt", TestDir::Input);
24+
let non_seq_files: Option<Vec<String>> = None;
25+
26+
27+
let file_list: Option<String> = Some(file_list_name);
28+
//get input files with file_list
29+
log::info!("Getting input files for test");
30+
let input_file_list = io::get_input_list(&file_list, &non_seq_files);
31+
log::info!("Parsed {} samples in input list", input_file_list.len());
32+
33+
// load metadata of the first database; used below to check whether any of the new files already exist in it
34+
let db1_metadata: MultiSketch = MultiSketch::load(&ref_db1)
35+
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", ref_db1));
36+
println!("{:?}", db1_metadata);
37+
38+
// load metadata of the second database
39+
let db2_metadata: MultiSketch = MultiSketch::load(&ref_db2)
40+
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", ref_db2));
41+
println!("{:?}", db2_metadata);
42+
43+
// Test case 1:
44+
assert!(
45+
!db1_metadata.concat_competibility(&input_file_list),
46+
"Sketches should not be compatible"
47+
);
48+
}
49+
50+
#[test]
51+
fn test_concat_sketches() {
52+
let sandbox = TestSetup::setup();
53+
54+
Command::new(cargo_bin("sketchlib"))
55+
.current_dir(sandbox.get_wd())
56+
.arg("sketch")
57+
.args(&["--k-vals", "17"])
58+
.arg("--seq-files")
59+
.arg(sandbox.file_string("14412_3#82.contigs_velvet.fa.gz", TestDir::Input))
60+
.arg(sandbox.file_string("14412_3#84.contigs_velvet.fa.gz", TestDir::Input))
61+
.arg("-v")
62+
.args(&["-o", "part1"])
63+
.assert()
64+
.success();
65+
log::info!("Part 1 Sketched");
66+
67+
Command::new(cargo_bin("sketchlib"))
68+
.current_dir(sandbox.get_wd())
69+
.arg("sketch")
70+
.args(&["--k-vals", "17"])
71+
.arg("--seq-files")
72+
.arg(sandbox.file_string("R6.fa.gz", TestDir::Input))
73+
.arg(sandbox.file_string("TIGR4.fa.gz", TestDir::Input))
74+
.arg("-v")
75+
.args(&["-o", "part2"])
76+
.assert()
77+
.success();
78+
log::info!("Part 2 Sketched");
79+
80+
Command::new(cargo_bin("sketchlib"))
81+
.current_dir(sandbox.get_wd())
82+
.arg("sketch")
83+
.args(&["--k-vals", "17"])
84+
.arg("--seq-files")
85+
.arg(sandbox.file_string("R6.fa.gz", TestDir::Input))
86+
.arg(sandbox.file_string("TIGR4.fa.gz", TestDir::Input))
87+
.arg(sandbox.file_string("14412_3#82.contigs_velvet.fa.gz", TestDir::Input))
88+
.arg(sandbox.file_string("14412_3#84.contigs_velvet.fa.gz", TestDir::Input))
89+
.arg("-v")
90+
.args(&["-o", "concat_ref"])
91+
.assert()
92+
.success();
93+
log::info!("concat_ref Sketched");
94+
95+
// Concat must fail when input labels overlap with those already in the database
96+
Command::new(cargo_bin("sketchlib"))
97+
.current_dir(sandbox.get_wd())
98+
.arg("concat")
99+
.arg("part1")
100+
.arg("--seq-files")
101+
.arg(sandbox.file_string("14412_3#82.contigs_velvet.fa.gz", TestDir::Input))
102+
.arg(sandbox.file_string("14412_3#84.contigs_velvet.fa.gz", TestDir::Input))
103+
.arg("-v")
104+
.args(&["-o", "concat_test"])
105+
.assert()
106+
.failure();
107+
108+
Command::new(cargo_bin("sketchlib"))
109+
.current_dir(sandbox.get_wd())
110+
.arg("concat")
111+
.arg("part1")
112+
.arg("--seq-files")
113+
// .arg(sandbox.file_string("fasta_part2.txt", TestDir::Input))
114+
.arg(sandbox.file_string("R6.fa.gz", TestDir::Input))
115+
.arg(sandbox.file_string("TIGR4.fa.gz", TestDir::Input))
116+
.arg("-v")
117+
.args(&["-o", "concat_test"])
118+
.assert()
119+
.success();
120+
log::info!("concat_test Sketched");
121+
122+
// Check that the .skm metadata is the same
123+
let concat_sketch: MultiSketch =
124+
MultiSketch::load(&sandbox.file_string("concat_test", TestDir::Output))
125+
.expect("Failed to load output merged sketch");
126+
let expected_sketch =
127+
MultiSketch::load(&sandbox.file_string("concat_ref", TestDir::Output))
128+
.expect("Failed to load expected merged sketch");
129+
println!("{}", concat_sketch);
130+
println!("{}", expected_sketch);
131+
assert_eq!(
132+
concat_sketch, expected_sketch,
133+
"Concat sketch metadata does not match"
134+
);
135+
136+
// Check that the .skd sketch data is the same
137+
let predicate_file = predicate::path::eq_file(Path::new(
138+
&sandbox.file_string("concat_test.skd", TestDir::Output),
139+
));
140+
assert_eq!(
141+
true,
142+
predicate_file.eval(Path::new(
143+
&sandbox.file_string("concat_ref.skd", TestDir::Output)
144+
)),
145+
"Concat sketch data does not match"
146+
);
147+
}
148+
}

tests/test_files_in/fasta.txt

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
14412_3#82.contigs_velvet.fa tests/test_files_in/14412_3#82.contigs_velvet.fa
2-
14412_3#84.contigs_velvet.fa tests/test_files_in/14412_3#84.contigs_velvet.fa
3-
R6.fa tests/test_files_in/R6.fa
4-
TIGR4.fa tests/test_files_in/TIGR4.fa
1+
14412_3#82.contigs_velvet.fa tests/test_files_in/14412_3#82.contigs_velvet.fa.gz
2+
14412_3#84.contigs_velvet.fa tests/test_files_in/14412_3#84.contigs_velvet.fa.gz
3+
R6.fa tests/test_files_in/R6.fa.gz
4+
TIGR4.fa tests/test_files_in/TIGR4.fa.gz

tests/test_files_in/fasta_part1.txt

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
14412_3#82.contigs_velvet.fa tests/test_files_in/14412_3#82.contigs_velvet.fa.gz
2+
14412_3#84.contigs_velvet.fa tests/test_files_in/14412_3#84.contigs_velvet.fa.gz

tests/test_files_in/fasta_part2.txt

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
R6.fa /Users/wachsmannj/Documents/work/development/sketchlib.rust/tests/test_files_in/R6.fa.gz
2+
TIGR4.fa /Users/wachsmannj/Documents/work/development/sketchlib.rust/tests/test_files_in/TIGR4.fa.gz

0 commit comments

Comments
 (0)