Skip to content

Commit b1646f9

Browse files
author
Johanna
committed
cleaned up code
1 parent 40d4f86 commit b1646f9

File tree

7 files changed

+172
-231
lines changed

7 files changed

+172
-231
lines changed

src/cli.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -207,13 +207,13 @@ pub enum Commands {
207207

208208
/// Delete genome(s) from a database (input: one id per line)
209209
Delete {
210-
/// Sketching database basename (so without .skm or .skd)
210+
/// Sketching database
211211
#[arg(required = true)]
212212
db: String,
213213

214214
/// Input file with IDs to delete (one ID per line)
215215
#[arg(required = true)]
216-
genome_ids: String,
216+
samples: String,
217217

218218
/// output file name
219219
#[arg(required = true)]

src/lib.rs

+4-3
Original file line numberDiff line numberDiff line change
@@ -486,13 +486,13 @@ pub fn main() -> Result<(), Error> {
486486

487487
Commands::Delete {
488488
db,
489-
genome_ids,
489+
samples,
490490
output_file,
491491
} => {
492492
let ref_db = utils::strip_sketch_extension(db);
493493

494494
log::info!("Reading input genomes");
495-
let path = Path::new(genome_ids);
495+
let path = Path::new(samples);
496496
let file = File::open(path)?;
497497
let reader = std::io::BufReader::new(file);
498498

@@ -503,8 +503,9 @@ pub fn main() -> Result<(), Error> {
503503
let mut sketches: MultiSketch = MultiSketch::load(ref_db)
504504
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", ref_db));
505505

506+
//TODO: check if all genome IDs have been found in meta and sketchdata
506507
// write new .skm
507-
let _ = sketches.remove_metadata(output_file, &ids);
508+
sketches.remove_metadata(output_file, &ids)?;
508509

509510
// remove samples from .skd file
510511
log::info!("Remove genomes and writing output");

src/multisketch.rs

+30-15
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
use anyhow::bail;
12
use anyhow::Error;
3+
use anyhow::{Result, anyhow};
4+
// use thiserror::Error;
25
use core::panic;
36
use std::fmt;
47
use std::fs::File;
@@ -12,6 +15,7 @@ use crate::hashing::HashType;
1215
use crate::sketch::{Sketch, BBITS};
1316
use crate::sketch_datafile::SketchArrayFile;
1417

18+
use std::collections::HashSet;
1519
#[derive(Serialize, Deserialize)]
1620
pub struct MultiSketch {
1721
pub sketch_size: u64,
@@ -195,17 +199,32 @@ impl MultiSketch {
195199
&mut self,
196200
output_file_name: &str,
197201
genome_ids_to_remove: &[String],
198-
) -> std::io::Result<()> {
199-
println!("{}", self);
200-
let mut new_sketch_metadata: Vec<Sketch> = Vec::with_capacity(self.sketch_metadata.len());
202+
) -> anyhow::Result<()> {
203+
// error for write batch
204+
// error for if IDs are missing
205+
let mut new_sketch_metadata: Vec<Sketch> =
206+
Vec::with_capacity(self.sketch_metadata.len() - genome_ids_to_remove.len());
207+
let mut removed_samples = Vec::new();
201208

202209
for sketch in &self.sketch_metadata {
210+
203211
if !genome_ids_to_remove.contains(&(*sketch.name()).to_string()) {
204212
new_sketch_metadata.push(sketch.clone());
213+
214+
} else {
215+
removed_samples.push(sketch.name());
205216
}
206217
}
218+
219+
let set1: HashSet<&str> = removed_samples.iter().map(AsRef::as_ref).collect();
220+
let set2: HashSet<&str> = genome_ids_to_remove.iter().map(AsRef::as_ref).collect();
221+
let missing: Vec<&&str> = set2.difference(&set1).collect();
222+
if !missing.is_empty() {
223+
bail!("Elements in set2 not found in set1: {:?}", missing);
224+
}
225+
207226
self.sketch_metadata = new_sketch_metadata;
208-
let _ = self.save_metadata(output_file_name);
227+
self.save_metadata(output_file_name)?;
209228
Ok(())
210229
}
211230

@@ -214,21 +233,20 @@ impl MultiSketch {
214233
input_prefix: &str,
215234
output_file: &str,
216235
genome_ids_to_remove: &[String],
217-
) -> std::io::Result<()> {
218-
// Check if all genome IDs to remove exist and get their positions
236+
) -> anyhow::Result<()> {
219237
let mut positions_to_remove = Vec::new();
220238
let mut missing_ids = Vec::new();
221239

222240
for id in genome_ids_to_remove {
223-
println!("{}", id);
224241
if let Some(&position) = self.name_map.get(id) {
225242
positions_to_remove.push(position);
226243
} else {
227244
missing_ids.push(id);
228245
}
229246
}
247+
230248
if !missing_ids.is_empty() {
231-
panic!("The following genome IDs were not found: {:?}", missing_ids);
249+
bail!("The following genome IDs were not found: {:?}", missing_ids);
232250
}
233251

234252
// Create a list of indices to keep
@@ -238,17 +256,14 @@ impl MultiSketch {
238256

239257
let input_filename = format!("{}.skd", input_prefix);
240258
let output_filename = format!("{}.skd", output_file);
241-
SketchArrayFile::write_batch(
259+
if let Err(e) = SketchArrayFile::write_batch(
242260
&input_filename,
243261
&output_filename,
244262
&indices_to_keep,
245263
self.sample_stride,
246-
)
247-
.unwrap_or_else(|e| {
248-
eprintln!("Error during batch write: {}", e);
249-
std::process::exit(1);
250-
});
251-
println!("Output sketch data written to: {output_filename}",);
264+
) {
265+
return Err(anyhow!("Error during batch write: {}", e).into());
266+
}
252267

253268
Ok(())
254269
}

src/sketch.rs

+2-3
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ impl Sketch {
150150
}
151151
}
152152

153-
fn clean_name(n: &str) -> String {
153+
fn remove_extension(n: &str) -> String {
154154
let stem = Path::new(n).file_stem().unwrap().to_str().unwrap();
155155
stem.strip_suffix(".fa")
156156
.or_else(|| stem.strip_suffix(".fasta"))
@@ -269,11 +269,10 @@ pub fn sketch_files(
269269
.iter_mut()
270270
.enumerate()
271271
.map(|(idx, hash_it)| {
272-
// changed how the sample name is processed
273272
let sample_name = if concat_fasta {
274273
format!("{name}_{}", idx + 1)
275274
} else {
276-
Sketch::clean_name(name)
275+
Sketch::remove_extension(name)
277276
};
278277
if hash_it.seq_len() == 0 {
279278
panic!("{sample_name} has no valid sequence");

tests/delete.rs

+80-2
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,13 @@ use std::path::Path;
44

55
pub mod common;
66
use crate::common::*;
7-
use sketchlib::io::*;
87

98
use sketchlib::multisketch::MultiSketch;
109

1110
#[cfg(test)]
1211

1312
mod tests {
1413
use super::*;
15-
use sketchlib::io;
1614

1715
#[test]
1816
fn test_delete_sketches() {
@@ -78,3 +76,83 @@ mod tests {
7876
);
7977
}
8078
}
79+
80+
// use predicates::prelude::*;
81+
// use snapbox::cmd::{cargo_bin, Command};
82+
// use std::path::Path;
83+
84+
// pub mod common;
85+
// use crate::common::*;
86+
// use sketchlib::io::*;
87+
88+
// use sketchlib::multisketch::MultiSketch;
89+
90+
// #[cfg(test)]
91+
92+
// mod tests {
93+
// use super::*;
94+
// use sketchlib::io;
95+
96+
// #[test]
97+
// fn test_delete_sketches() {
98+
// let sandbox = TestSetup::setup();
99+
100+
// Command::new(cargo_bin("sketchlib"))
101+
// .current_dir(sandbox.get_wd())
102+
// .arg("sketch")
103+
// .args(&["--k-vals", "17"])
104+
// .arg("--seq-files")
105+
// .arg(sandbox.file_string("14412_3#82.contigs_velvet.fa.gz", TestDir::Input))
106+
// .arg(sandbox.file_string("14412_3#84.contigs_velvet.fa.gz", TestDir::Input))
107+
// .arg(sandbox.file_string("R6.fa.gz", TestDir::Input))
108+
// .arg(sandbox.file_string("TIGR4.fa.gz", TestDir::Input))
109+
// .arg("-v")
110+
// .args(&["-o", "full_db"])
111+
// .assert()
112+
// .success();
113+
114+
// Command::new(cargo_bin("sketchlib"))
115+
// .current_dir(sandbox.get_wd())
116+
// .arg("sketch")
117+
// .args(&["--k-vals", "17"])
118+
// .arg("--seq-files")
119+
// .arg(sandbox.file_string("14412_3#82.contigs_velvet.fa.gz", TestDir::Input))
120+
// .arg(sandbox.file_string("14412_3#84.contigs_velvet.fa.gz", TestDir::Input))
121+
// .arg("-v")
122+
// .args(&["-o", "result_db"])
123+
// .assert()
124+
// .success();
125+
126+
// // samples in delete_test.txt are TIGR4, and R6
127+
// Command::new(cargo_bin("sketchlib"))
128+
// .current_dir(sandbox.get_wd())
129+
// .arg("delete")
130+
// .arg("full_db")
131+
// .arg(sandbox.file_string("delete_test.txt", TestDir::Input))
132+
// .arg("deleted_db")
133+
// .assert()
134+
// .success();
135+
136+
// let predicate_file = predicate::path::eq_file(Path::new(
137+
// &sandbox.file_string("deleted_db.skd", TestDir::Output),
138+
// ));
139+
// assert_eq!(
140+
// true,
141+
// predicate_file.eval(Path::new(
142+
// &sandbox.file_string("result_db.skd", TestDir::Output)
143+
// )),
144+
// "Sketch data after deletion incorrect"
145+
// );
146+
147+
// // Check .skm the same
148+
// let merged_sketch: MultiSketch =
149+
// MultiSketch::load(&sandbox.file_string("deleted_db", TestDir::Output))
150+
// .expect("Failed to load output merged sketch");
151+
// let expected_sketch = MultiSketch::load(&sandbox.file_string("result_db", TestDir::Output))
152+
// .expect("Failed to load expected merged sketch");
153+
// assert_eq!(
154+
// merged_sketch, expected_sketch,
155+
// "Metadata data after deletion incorrect"
156+
// );
157+
// }
158+
// }

0 commit comments

Comments
 (0)