@@ -2,12 +2,14 @@ mod merge_dict_column;
2
2
mod merge_mapping;
3
3
mod term_merger;
4
4
5
- use std:: collections:: { BTreeMap , HashMap , HashSet } ;
5
+ use std:: collections:: { HashMap , HashSet } ;
6
6
use std:: io;
7
7
use std:: net:: Ipv6Addr ;
8
+ use std:: rc:: Rc ;
8
9
use std:: sync:: Arc ;
9
10
10
- use itertools:: Itertools ;
11
+ use common:: GroupByIteratorExtended ;
12
+ use itertools:: { EitherOrBoth , Itertools } ;
11
13
pub use merge_mapping:: { MergeRowOrder , ShuffleMergeOrder , StackMergeOrder } ;
12
14
13
15
use super :: writer:: ColumnarSerializer ;
@@ -18,7 +20,8 @@ use crate::columnar::writer::CompatibleNumericalTypes;
18
20
use crate :: columnar:: ColumnarReader ;
19
21
use crate :: dynamic_column:: DynamicColumn ;
20
22
use crate :: {
21
- BytesColumn , Column , ColumnIndex , ColumnType , ColumnValues , NumericalType , NumericalValue ,
23
+ BytesColumn , Column , ColumnIndex , ColumnType , ColumnValues , DynamicColumnHandle , NumericalType ,
24
+ NumericalValue ,
22
25
} ;
23
26
24
27
/// Column types are grouped into different categories.
@@ -28,7 +31,7 @@ use crate::{
28
31
/// In practice, only Numerical columns are coerced into one type today.
29
32
///
30
33
/// See also [README.md].
31
- #[ derive( Copy , Clone , Eq , PartialEq , Hash , Debug ) ]
34
+ #[ derive( Copy , Clone , Eq , PartialEq , PartialOrd , Ord , Hash , Debug ) ]
32
35
pub ( crate ) enum ColumnTypeCategory {
33
36
Bool ,
34
37
Str ,
@@ -83,9 +86,13 @@ pub fn merge_columnar(
83
86
. iter ( )
84
87
. map ( |reader| reader. num_rows ( ) )
85
88
. collect :: < Vec < u32 > > ( ) ;
86
- let columns_to_merge =
87
- group_columns_for_merge ( columnar_readers, required_columns, & merge_row_order) ?;
88
- for ( ( column_name, column_type) , columns) in columns_to_merge {
89
+
90
+ let columns_to_merge_iter =
91
+ group_columns_for_merge_iter ( columnar_readers, required_columns, & merge_row_order) ?;
92
+ for res in columns_to_merge_iter {
93
+ let ( column_name, column_type, grouped_columns) = res?;
94
+ let columns = grouped_columns. columns ;
95
+
89
96
let mut column_serializer =
90
97
serializer. start_serialize_column ( column_name. as_bytes ( ) , column_type) ;
91
98
merge_column (
@@ -97,6 +104,7 @@ pub fn merge_columnar(
97
104
) ?;
98
105
column_serializer. finalize ( ) ?;
99
106
}
107
+
100
108
serializer. finalize ( merge_row_order. num_rows ( ) ) ?;
101
109
Ok ( ( ) )
102
110
}
@@ -214,11 +222,11 @@ struct GroupedColumns {
214
222
}
215
223
216
224
impl GroupedColumns {
217
- fn for_category ( column_category : ColumnTypeCategory , num_columnars : usize ) -> Self {
225
+ fn new ( num_columnars : usize ) -> Self {
218
226
GroupedColumns {
219
227
required_column_type : None ,
220
228
columns : vec ! [ None ; num_columnars] ,
221
- column_category,
229
+ column_category : ColumnTypeCategory :: Numerical ,
222
230
}
223
231
}
224
232
@@ -293,7 +301,7 @@ fn merged_numerical_columns_type<'a>(
293
301
fn is_empty_after_merge (
294
302
merge_row_order : & MergeRowOrder ,
295
303
column : & DynamicColumn ,
296
- columnar_id : usize ,
304
+ columnar_ord : usize ,
297
305
) -> bool {
298
306
if column. num_values ( ) == 0u32 {
299
307
// It was empty before the merge.
@@ -305,7 +313,7 @@ fn is_empty_after_merge(
305
313
false
306
314
}
307
315
MergeRowOrder :: Shuffled ( shuffled) => {
308
- if let Some ( alive_bitset) = & shuffled. alive_bitsets [ columnar_id ] {
316
+ if let Some ( alive_bitset) = & shuffled. alive_bitsets [ columnar_ord ] {
309
317
let column_index = column. column_index ( ) ;
310
318
match column_index {
311
319
ColumnIndex :: Empty { .. } => true ,
@@ -348,56 +356,115 @@ fn is_empty_after_merge(
348
356
}
349
357
}
350
358
351
- #[ allow( clippy:: type_complexity) ]
352
- fn group_columns_for_merge (
353
- columnar_readers : & [ & ColumnarReader ] ,
354
- required_columns : & [ ( String , ColumnType ) ] ,
355
- merge_row_order : & MergeRowOrder ,
356
- ) -> io:: Result < BTreeMap < ( String , ColumnType ) , Vec < Option < DynamicColumn > > > > {
357
- // Each column name may have multiple types of column associated.
358
- // For merging we are interested in the same column type category since they can be merged.
359
- let mut columns_grouped: HashMap < ( String , ColumnTypeCategory ) , GroupedColumns > = HashMap :: new ( ) ;
359
+ type MergeIter < ' a > =
360
+ Box < dyn Iterator < Item = io:: Result < ( Rc < String > , ColumnType , GroupedColumns ) > > + ' a > ;
360
361
361
- for & ( ref column_name, column_type) in required_columns {
362
- columns_grouped
363
- . entry ( ( column_name. clone ( ) , column_type. into ( ) ) )
364
- . or_insert_with ( || {
365
- GroupedColumns :: for_category ( column_type. into ( ) , columnar_readers. len ( ) )
366
- } )
367
- . require_type ( column_type) ?;
368
- }
362
+ /// Iterates over the columns of the columnar readers, grouped by column name.
363
+ /// Key functionality is that `open` of the Columns is done lazy per group.
364
+ fn group_columns_for_merge_iter < ' a > (
365
+ columnar_readers : & ' a [ & ' a ColumnarReader ] ,
366
+ required_columns : & ' a [ ( String , ColumnType ) ] ,
367
+ merge_row_order : & ' a MergeRowOrder ,
368
+ ) -> io:: Result < impl Iterator < Item = io:: Result < ( Rc < String > , ColumnType , GroupedColumns ) > > + ' a > {
369
+ let column_iters: Vec < _ > = columnar_readers
370
+ . iter ( )
371
+ . enumerate ( )
372
+ . map ( |( reader_ord, reader) | {
373
+ Ok ( reader
374
+ . iter_columns ( ) ?
375
+ . map ( move |el| ( Rc :: new ( el. 0 ) , reader_ord, el. 1 ) ) )
376
+ } )
377
+ . collect :: < io:: Result < _ > > ( ) ?;
378
+ let required_columns_map: HashMap < String , _ > = required_columns
379
+ . iter ( )
380
+ . map ( |( col_name, typ) | ( col_name. to_string ( ) , typ) )
381
+ . collect :: < HashMap < String , _ > > ( ) ;
382
+ let mut required_columns_list: Vec < String > = required_columns
383
+ . iter ( )
384
+ . map ( |( col_name, _) | col_name. to_string ( ) )
385
+ . collect ( ) ;
386
+ required_columns_list. sort ( ) ;
369
387
370
- for ( columnar_id , columnar_reader ) in columnar_readers . iter ( ) . enumerate ( ) {
371
- let column_name_and_handle = columnar_reader . list_columns ( ) ? ;
372
- // We skip columns that end up with 0 documents.
373
- // That way, we make sure they don't end up influencing the merge type or
374
- // creating empty columns.
388
+ // Kmerge and group on the column_name.
389
+ let group_iter = GroupByIteratorExtended :: group_by (
390
+ column_iters . into_iter ( ) . kmerge_by ( |a , b| a . 0 < b . 0 ) ,
391
+ |el| el . 0 . clone ( ) ,
392
+ ) ;
375
393
376
- for ( column_name, handle) in column_name_and_handle {
377
- let column_category: ColumnTypeCategory = handle. column_type ( ) . into ( ) ;
378
- let column = handle. open ( ) ?;
379
- if is_empty_after_merge ( merge_row_order, & column, columnar_id) {
380
- continue ;
381
- }
382
- columns_grouped
383
- . entry ( ( column_name, column_category) )
384
- . or_insert_with ( || {
385
- GroupedColumns :: for_category ( column_category, columnar_readers. len ( ) )
386
- } )
387
- . set_column ( columnar_id, column) ;
388
- }
389
- }
394
+ // Weave in the required columns into the sorted by column name iterator.
395
+ let groups_with_required = required_columns_list
396
+ . into_iter ( )
397
+ . merge_join_by ( group_iter, |a, b| a. cmp ( & b. 0 ) ) ;
390
398
391
- let mut merge_columns: BTreeMap < ( String , ColumnType ) , Vec < Option < DynamicColumn > > > =
392
- Default :: default ( ) ;
399
+ Ok ( groups_with_required. flat_map ( move |either| {
400
+ // It should be possible to do the grouping also on the column type in one pass, but some
401
+ // tests are failing.
402
+ let mut force_type: Option < ColumnType > = None ;
403
+ let ( key, group) = match either {
404
+ // set required column
405
+ EitherOrBoth :: Both ( _required, ( key, group) ) => {
406
+ force_type = required_columns_map. get ( & * key) . map ( |el| ( * * el) . into ( ) ) ;
407
+ ( key, group)
408
+ }
409
+ // Only required - Return artificial empty column
410
+ EitherOrBoth :: Left ( key) => {
411
+ let mut grouped_columns = GroupedColumns :: new ( columnar_readers. len ( ) ) ;
412
+ let force_type: Option < ColumnType > =
413
+ required_columns_map. get ( & * key) . map ( |el| ( * * el) . into ( ) ) ;
414
+ if let Some ( force_type) = force_type {
415
+ grouped_columns. require_type ( force_type) . unwrap ( ) ; // Can't panic
416
+ }
417
+ return Box :: new ( std:: iter:: once ( Ok ( (
418
+ Rc :: new ( key) ,
419
+ force_type. unwrap ( ) ,
420
+ grouped_columns,
421
+ ) ) ) ) as MergeIter < ' a > ;
422
+ }
423
+ // no required column
424
+ EitherOrBoth :: Right ( ( key, group) ) => ( key, group) ,
425
+ } ;
426
+ let mut group: Vec < ( Rc < String > , usize , DynamicColumnHandle ) > = group. collect ( ) ;
427
+ group. sort_by_key ( |el| el. 2 . column_type ) ;
428
+ let group_iter = GroupByIteratorExtended :: group_by ( group. into_iter ( ) , |el| {
429
+ let cat_type: ColumnTypeCategory = el. 2 . column_type ( ) . into ( ) ;
430
+ cat_type
431
+ } ) ;
432
+ let key = key. clone ( ) ;
433
+ Box :: new (
434
+ group_iter
435
+ . map ( move |( _cat, group) | {
436
+ let mut grouped_columns = GroupedColumns :: new ( columnar_readers. len ( ) ) ;
437
+ if let Some ( force_type) = force_type {
438
+ grouped_columns. require_type ( force_type) ?;
439
+ }
440
+ for col in group {
441
+ let columnar_ord = col. 1 ;
442
+ let column = col. 2 . open ( ) ?;
443
+ if !is_empty_after_merge ( merge_row_order, & column, columnar_ord) {
444
+ grouped_columns. set_column ( col. 1 , column) ;
445
+ }
446
+ }
393
447
394
- for ( ( column_name, _) , mut grouped_columns) in columns_grouped {
395
- let column_type = grouped_columns. column_type_after_merge ( ) ;
396
- coerce_columns ( column_type, & mut grouped_columns. columns ) ?;
397
- merge_columns. insert ( ( column_name, column_type) , grouped_columns. columns ) ;
398
- }
448
+ let column_type = grouped_columns. column_type_after_merge ( ) ;
449
+ coerce_columns ( column_type, & mut grouped_columns. columns ) ?;
399
450
400
- Ok ( merge_columns)
451
+ Ok ( ( key. clone ( ) , column_type, grouped_columns) )
452
+ } )
453
+ . filter ( |res| {
454
+ // Filter out empty columns.
455
+ if let Ok ( ( _, _, grouped_columns) ) = res {
456
+ if grouped_columns
457
+ . columns
458
+ . iter ( )
459
+ . all ( |column| column. is_none ( ) )
460
+ {
461
+ return false ;
462
+ }
463
+ }
464
+ true
465
+ } ) ,
466
+ )
467
+ } ) )
401
468
}
402
469
403
470
fn coerce_columns (
0 commit comments