1
1
use anyhow:: bail;
2
2
use anyhow:: Error ;
3
- use anyhow:: { Result , anyhow } ;
3
+ use anyhow:: { anyhow , Result } ;
4
4
// use thiserror::Error;
5
5
use core:: panic;
6
6
use std:: fmt;
@@ -15,7 +15,11 @@ use crate::hashing::HashType;
15
15
use crate :: sketch:: { Sketch , BBITS } ;
16
16
use crate :: sketch_datafile:: SketchArrayFile ;
17
17
18
+ use rayon:: prelude:: * ;
18
19
use std:: collections:: HashSet ;
20
+
21
+ use std:: io:: Write ;
22
+ use std:: path:: Path ;
19
23
#[ derive( Serialize , Deserialize ) ]
20
24
pub struct MultiSketch {
21
25
pub sketch_size : u64 ,
@@ -41,6 +45,7 @@ impl MultiSketch {
41
45
sketch_size : u64 ,
42
46
kmer_lengths : & [ usize ] ,
43
47
hash_type : HashType ,
48
+ inverted : bool ,
44
49
) -> Self {
45
50
let mut name_map = HashMap :: with_capacity ( sketches. len ( ) ) ;
46
51
for sketch in sketches. iter ( ) {
@@ -63,6 +68,8 @@ impl MultiSketch {
63
68
}
64
69
}
65
70
71
+
72
+
66
73
/// Saves the metadata
67
74
pub fn save_metadata ( & self , file_prefix : & str ) -> Result < ( ) , Error > {
68
75
let filename = format ! ( "{}.skm" , file_prefix) ;
@@ -205,10 +212,8 @@ impl MultiSketch {
205
212
let mut removed_samples = Vec :: new ( ) ;
206
213
207
214
for sketch in & self . sketch_metadata {
208
-
209
215
if !genome_ids_to_remove. contains ( & ( * sketch. name ( ) ) . to_string ( ) ) {
210
216
new_sketch_metadata. push ( sketch. clone ( ) ) ;
211
-
212
217
} else {
213
218
removed_samples. push ( sketch. name ( ) ) ;
214
219
}
@@ -218,8 +223,11 @@ impl MultiSketch {
218
223
let set2: HashSet < & str > = genome_ids_to_remove. iter ( ) . map ( AsRef :: as_ref) . collect ( ) ;
219
224
let missing: Vec < & & str > = set2. difference ( & set1) . collect ( ) ;
220
225
if !missing. is_empty ( ) {
221
- bail ! ( "The following samples have not been found in the database: {:?}" , missing) ;
222
- }
226
+ bail ! (
227
+ "The following samples have not been found in the database: {:?}" ,
228
+ missing
229
+ ) ;
230
+ }
223
231
224
232
self . sketch_metadata = new_sketch_metadata;
225
233
self . save_metadata ( output_file_name) ?;
@@ -231,7 +239,7 @@ impl MultiSketch {
231
239
input_prefix : & str ,
232
240
output_file : & str ,
233
241
genome_ids_to_remove : & [ String ] ,
234
- ) -> anyhow:: Result < ( ) > {
242
+ ) -> anyhow:: Result < ( ) > {
235
243
let mut positions_to_remove = Vec :: new ( ) ;
236
244
let mut missing_ids = Vec :: new ( ) ;
237
245
@@ -266,10 +274,85 @@ impl MultiSketch {
266
274
Ok ( ( ) )
267
275
}
268
276
269
- // This function is called when sketches are merged, not when they are
270
- // first sketched (this is handled by sketch::sketch_files())
277
+ pub fn invert_index ( sketches : & MultiSketch ) -> HashMap < u64 , HashSet < usize > > {
278
+ println ! ( "Debug: Starting invert_index function" ) ;
279
+ println ! ( "Debug: Sample stride: {}" , sketches. sample_stride) ;
280
+ println ! ( "Debug: Sketch bins length: {}" , sketches. sketch_bins. len( ) ) ;
281
+
282
+ // HashMap storing the inverted index
283
+ let mut inverted_index: HashMap < u64 , HashSet < usize > > = HashMap :: default ( ) ;
284
+
285
+ // Parallise the inversion for each sample
286
+ let local_indices: Vec < HashMap < u64 , HashSet < usize > > > = sketches
287
+ . sketch_bins
288
+ . par_chunks ( sketches. sample_stride )
289
+ . enumerate ( )
290
+ . map ( |( genome_id, sample_hash) | {
291
+ println ! ( "Debug: Processing genome_id: {}" , genome_id) ;
292
+ println ! ( "Debug: Sample hash length: {}" , sample_hash. len( ) ) ;
293
+
294
+ // Print all hashes for this genome
295
+ println ! ( "Debug: Hashes for genome {}: {:?}" , genome_id, sample_hash) ;
296
+
297
+ let mut local_index: HashMap < u64 , HashSet < usize > > = HashMap :: default ( ) ;
298
+ for & hash in sample_hash {
299
+ local_index
300
+ . entry ( hash)
301
+ . or_insert_with ( HashSet :: default)
302
+ . insert ( genome_id) ;
303
+ }
304
+ println ! ( "Debug: Local index for genome {} size: {}" , genome_id, local_index. len( ) ) ;
305
+ // Print the hash-to-genome mappings for this local index
306
+ println ! ( "Debug: Local index contents for genome {}: {:?}" , genome_id, local_index) ;
307
+ local_index
308
+ } )
309
+ . collect ( ) ;
310
+
311
+ println ! ( "Debug: Number of local indices: {}" , local_indices. len( ) ) ;
312
+
313
+ // Merge all local inverted indices into a global one
314
+ for ( i, local) in local_indices. iter ( ) . enumerate ( ) {
315
+ println ! ( "Debug: Merging local index {}, size: {}" , i, local. len( ) ) ;
316
+ for ( hash, genome_set) in local {
317
+ inverted_index
318
+ . entry ( * hash)
319
+ . or_insert_with ( HashSet :: default)
320
+ . extend ( genome_set) ;
321
+ }
322
+ }
323
+
324
+ println ! ( "Debug: Final inverted index size: {}" , inverted_index. len( ) ) ;
325
+ println ! ( "Debug: Inverted index contents: {:?}" , inverted_index) ;
326
+
327
+ inverted_index
328
+ }
329
+ // remove, only need this for debugging
330
+ pub fn get_sketch_bins_len ( & self ) -> usize {
331
+ self . sketch_bins . len ( )
332
+ }
333
+
334
/// Writes an inverted index to `file_path` as plain text.
///
/// One line is written per hash, in the form `<hash> <id> <id> ...`, with the
/// sample IDs separated by single spaces. Hashes are written in ascending
/// order and the IDs on each line are sorted ascending, so the same index
/// always produces the same file.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be created, written
/// to, or flushed.
pub fn write_inverted_index_to_file<P: AsRef<Path>>(
    inverted_index: &HashMap<u64, HashSet<usize>>,
    file_path: P,
) -> std::io::Result<()> {
    // Buffer the output: an unbuffered `File` issues one write call per line.
    let mut writer = std::io::BufWriter::new(File::create(file_path)?);

    // Hash-table iteration order is arbitrary; sort for reproducible output.
    let mut entries: Vec<(&u64, &HashSet<usize>)> = inverted_index.iter().collect();
    entries.sort_unstable_by_key(|entry| *entry.0);

    for (hash, genome_set) in entries {
        let mut genome_ids: Vec<usize> = genome_set.iter().copied().collect();
        genome_ids.sort_unstable();

        let joined_ids = genome_ids
            .iter()
            .map(|id| id.to_string())
            .collect::<Vec<_>>()
            .join(" ");
        writeln!(writer, "{} {}", hash, joined_ids)?;
    }

    // Flush explicitly: dropping a `BufWriter` silently discards flush errors.
    writer.flush()
}
271
351
}
272
352
353
+ // This function is called when sketches are merged, not when they are
354
+ // first sketched (this is handled by sketch::sketch_files())
355
+
273
356
impl fmt:: Debug for MultiSketch {
274
357
fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
275
358
write ! (
0 commit comments