Skip to content

Commit 4cf4faf

Browse files
author
Johanna
committed
add concat functionality (no tests yet)
1 parent 35e4258 commit 4cf4faf

File tree

4 files changed

+135
-2
lines changed

4 files changed

+135
-2
lines changed

src/cli.rs

+43
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,49 @@ pub enum Commands {
161161
#[arg(required = true, short)]
162162
output: String,
163163
},
164+
/// Concat one sketch file (.skm and .skd pair) with new genomes
165+
Concat {
166+
/// The first .skd (sketch data) file
167+
#[arg(required = true)]
168+
db: String,
169+
170+
/// List of input FASTA files
171+
#[arg(long, group = "input", num_args = 1.., value_delimiter = ',')]
172+
seq_files: Option<Vec<String>>,
173+
174+
/// File listing input files (tab separated name, sequences)
175+
#[arg(short, group = "input")]
176+
file_list: Option<String>,
177+
178+
/// Output filename for the merged sketch
179+
#[arg(required = true, short)]
180+
output: String,
181+
182+
/// Ignore reverse complement (all contigs are oriented along same strand)
183+
#[arg(long, default_value_t = DEFAULT_STRAND)]
184+
single_strand: bool,
185+
186+
/// Minimum k-mer count (with reads)
187+
#[arg(long, default_value_t = DEFAULT_MINCOUNT)]
188+
min_count: u16,
189+
190+
/// Minimum k-mer quality (with reads)
191+
#[arg(long, default_value_t = DEFAULT_MINQUAL)]
192+
min_qual: u8,
193+
194+
/// Treat every sequence in an input file as a new sample (aa only)
195+
// TODO: for now, could be extended to dna, but probably no need
196+
#[arg(long, default_value_t = false)]
197+
concat_fasta: bool,
198+
199+
/// Number of CPU threads
200+
#[arg(long, value_parser = valid_cpus, default_value_t = 1)]
201+
threads: usize,
202+
203+
/// aaHash 'level'
204+
#[arg(long, value_enum, default_value_t = DEFAULT_LEVEL)]
205+
level: AaLevel,
206+
},
164207
/// Print information about a .skm file
165208
Info {
166209
/// Sketch metadata file (.skm) to describe

src/lib.rs

+79
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,85 @@ pub fn main() -> Result<(), Error> {
395395
utils::save_sketch_data(ref_db_name1, ref_db_name2, output)
396396
}
397397

398+
Commands::Concat {
399+
db,
400+
seq_files,
401+
file_list,
402+
output,
403+
single_strand,
404+
min_count,
405+
min_qual,
406+
concat_fasta,
407+
threads,
408+
level,
409+
} => {
410+
411+
//get input files
412+
log::info!("Getting input files");
413+
let input_files: Vec<(String, String, Option<String>)> =
414+
get_input_list(file_list, seq_files);
415+
log::info!("Parsed {} samples in input list", input_files.len());
416+
417+
//check if any of the new files are already existant in the db
418+
let db_metadata: MultiSketch = MultiSketch::load(db)
419+
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", db));
420+
println!("{:?}", db_metadata);
421+
422+
println!("{:?}", db_metadata.kmer_lengths());
423+
db_metadata.concat_competibility(&input_files);
424+
log::info!("Passed concat check");
425+
426+
// read out sketching information needed to sketch the new files
427+
let kmers = db_metadata.kmer_lengths();
428+
// Build, merge
429+
let rc = !*single_strand;
430+
// Set expected sketchsize
431+
let sketch_size = db_metadata.sketch_size;
432+
// Set aa level
433+
let seq_type = db_metadata.get_hash_type();
434+
435+
if *concat_fasta && matches!(*seq_type, HashType::DNA | HashType::PDB) {
436+
panic!("--concat-fasta currently only supported with --seq-type aa");
437+
}
438+
log::info!(
439+
"Running sketching: k:{:?}; sketch_size:{}; seq:{:?}; threads:{}",
440+
kmers,
441+
sketch_size * u64::BITS as u64,
442+
seq_type,
443+
threads,
444+
);
445+
446+
let seq_type = if let HashType::AA(_) = seq_type {
447+
HashType::AA(level.clone())
448+
} else {
449+
seq_type.clone()
450+
};
451+
// sketch freshly incoming files
452+
let mut db2_sketches = sketch_files(
453+
output,
454+
&input_files,
455+
*concat_fasta,
456+
&kmers,
457+
sketch_size,
458+
&seq_type,
459+
rc,
460+
*min_count,
461+
*min_qual,
462+
);
463+
let db2_metadata = MultiSketch::new(&mut db2_sketches, sketch_size, &kmers, seq_type);
464+
db2_metadata
465+
.save_metadata(output)
466+
.expect("Error saving metadata");
467+
468+
// // save skd data from db1 and from freshly sketched input files
469+
// log::info!("Merging and saving sketch data to {}.skd", output);
470+
// utils::save_sketch_data(db_metadata, db2, output);
471+
472+
// // read in skm from db1
473+
// // merge and update skm from db1 and the new just sketched sketch
474+
Ok(())
475+
}
476+
398477
Commands::Info {
399478
skm_file,
400479
sample_info,

src/multisketch.rs

+11
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,17 @@ impl MultiSketch {
158158
&& self.get_hash_type() == sketch2.get_hash_type()
159159
}
160160

161+
pub fn concat_competibility(&self, name_vec: &[(String, String, Option<String>)]) {
162+
for (id, _, _) in name_vec.iter() {
163+
if self.name_map.contains_key(id) {
164+
panic!(
165+
"{} appears in both the database and the provided files. Cannot concat files.",
166+
id
167+
);
168+
}
169+
}
170+
}
171+
161172
pub fn merge_sketches(&mut self, sketch2: &Self) -> &mut Self {
162173
// First metadata
163174
let offset = self.sketch_metadata.len();

tests/merge.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -130,8 +130,8 @@ mod tests {
130130

131131
// Check .skm the same
132132
let merged_sketch: MultiSketch =
133-
MultiSketch::load(&sandbox.file_string("merged_test", TestDir::Output))
134-
.expect("Failed to load output merged sketch");
133+
MultiSketch::load(&sandbox.file_string("merged_test", TestDir::Output))
134+
.expect("Failed to load output merged sketch");
135135
let expected_sketch =
136136
MultiSketch::load(&sandbox.file_string("merged_ref", TestDir::Output))
137137
.expect("Failed to load expected merged sketch");

0 commit comments

Comments
 (0)