Skip to content

Commit 9ebc5ed

Browse files
use fst for sstable index (#2268)
* read path for new fst based index * implement BlockAddrStoreWriter * extract slop/derivation computation * use better linear approximator and allow negative correction to approximator * document format and reorder some fields * optimize single block sstable size * plug backward compat
1 parent 0b56c88 commit 9ebc5ed

File tree

10 files changed

+1086
-323
lines changed

10 files changed

+1086
-323
lines changed

columnar/src/tests.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ fn test_dataframe_writer_str() {
2626
assert_eq!(columnar.num_columns(), 1);
2727
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
2828
assert_eq!(cols.len(), 1);
29-
assert_eq!(cols[0].num_bytes(), 87);
29+
assert_eq!(cols[0].num_bytes(), 73);
3030
}
3131

3232
#[test]
@@ -40,7 +40,7 @@ fn test_dataframe_writer_bytes() {
4040
assert_eq!(columnar.num_columns(), 1);
4141
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
4242
assert_eq!(cols.len(), 1);
43-
assert_eq!(cols[0].num_bytes(), 87);
43+
assert_eq!(cols[0].num_bytes(), 73);
4444
}
4545

4646
#[test]

src/fastfield/mod.rs

+8-8
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ mod tests {
131131
}
132132
let file = directory.open_read(path).unwrap();
133133

134-
assert_eq!(file.len(), 93);
134+
assert_eq!(file.len(), 80);
135135
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
136136
let column = fast_field_readers
137137
.u64("field")
@@ -181,7 +181,7 @@ mod tests {
181181
write.terminate().unwrap();
182182
}
183183
let file = directory.open_read(path).unwrap();
184-
assert_eq!(file.len(), 121);
184+
assert_eq!(file.len(), 108);
185185
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
186186
let col = fast_field_readers
187187
.u64("field")
@@ -214,7 +214,7 @@ mod tests {
214214
write.terminate().unwrap();
215215
}
216216
let file = directory.open_read(path).unwrap();
217-
assert_eq!(file.len(), 94);
217+
assert_eq!(file.len(), 81);
218218
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
219219
let fast_field_reader = fast_field_readers
220220
.u64("field")
@@ -246,7 +246,7 @@ mod tests {
246246
write.terminate().unwrap();
247247
}
248248
let file = directory.open_read(path).unwrap();
249-
assert_eq!(file.len(), 4489);
249+
assert_eq!(file.len(), 4476);
250250
{
251251
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
252252
let col = fast_field_readers
@@ -279,7 +279,7 @@ mod tests {
279279
write.terminate().unwrap();
280280
}
281281
let file = directory.open_read(path).unwrap();
282-
assert_eq!(file.len(), 265);
282+
assert_eq!(file.len(), 252);
283283

284284
{
285285
let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
@@ -773,7 +773,7 @@ mod tests {
773773
write.terminate().unwrap();
774774
}
775775
let file = directory.open_read(path).unwrap();
776-
assert_eq!(file.len(), 102);
776+
assert_eq!(file.len(), 84);
777777
let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
778778
let bool_col = fast_field_readers.bool("field_bool").unwrap();
779779
assert_eq!(bool_col.first(0), Some(true));
@@ -805,7 +805,7 @@ mod tests {
805805
write.terminate().unwrap();
806806
}
807807
let file = directory.open_read(path).unwrap();
808-
assert_eq!(file.len(), 114);
808+
assert_eq!(file.len(), 96);
809809
let readers = FastFieldReaders::open(file, schema).unwrap();
810810
let bool_col = readers.bool("field_bool").unwrap();
811811
for i in 0..25 {
@@ -830,7 +830,7 @@ mod tests {
830830
write.terminate().unwrap();
831831
}
832832
let file = directory.open_read(path).unwrap();
833-
assert_eq!(file.len(), 104);
833+
assert_eq!(file.len(), 86);
834834
let fastfield_readers = FastFieldReaders::open(file, schema).unwrap();
835835
let col = fastfield_readers.bool("field_bool").unwrap();
836836
assert_eq!(col.first(0), None);

sstable/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ description = "sstables for tantivy"
1111

1212
[dependencies]
1313
common = {version= "0.6", path="../common", package="tantivy-common"}
14+
tantivy-bitpacker = { version= "0.5", path="../bitpacker" }
1415
tantivy-fst = "0.5"
1516
# experimental gives us access to Decompressor::upper_bound
1617
zstd = { version = "0.13", features = ["experimental"] }

sstable/README.md

+62-24
Original file line numberDiff line numberDiff line change
@@ -89,33 +89,71 @@ Note: as the SSTable does not support redundant keys, there is no ambiguity betw
8989

9090
### SSTFooter
9191
```
92-
+-------+-------+-----+-------------+---------+---------+
93-
| Block | Block | ... | IndexOffset | NumTerm | Version |
94-
+-------+-------+-----+-------------+---------+---------+
95-
|----( # of blocks)---|
92+
+-----+----------------+-------------+-------------+---------+---------+
93+
| Fst | BlockAddrStore | StoreOffset | IndexOffset | NumTerm | Version |
94+
+-----+----------------+-------------+-------------+---------+---------+
9695
```
97-
- Block(SSTBlock): uses IndexValue for its Values format
96+
- Fst(Fst): finite state transducer mapping keys to a block number
97+
- BlockAddrStore(BlockAddrStore): store mapping a block number to its BlockAddr
98+
- StoreOffset(u64): Offset to start of the BlockAddrStore. If zero, see the SingleBlockSStable section
9899
- IndexOffset(u64): Offset to the start of the SSTFooter
99100
- NumTerm(u64): number of terms in the sstable
100-
- Version(u32): Currently equal to 2
101+
- Version(u32): Currently equal to 3
101102

102-
### IndexValue
103-
```
104-
+------------+----------+-------+-------+-----+
105-
| EntryCount | StartPos | Entry | Entry | ... |
106-
+------------+----------+-------+-------+-----+
107-
|---( # of entries)---|
108-
```
103+
### Fst
109104

110-
- EntryCount(VInt): number of entries
111-
- StartPos(VInt): the start pos of the first (data) block referenced by this (index) block
112-
- Entry (IndexEntry)
105+
Fst is in the format of tantivy\_fst
113106

114-
### Entry
115-
```
116-
+----------+--------------+
117-
| BlockLen | FirstOrdinal |
118-
+----------+--------------+
119-
```
120-
- BlockLen(VInt): length of the block
121-
- FirstOrdinal(VInt): ordinal of the first element in the given block
107+
### BlockAddrStore
108+
109+
+---------+-----------+-----------+-----+-----------+-----------+-----+
110+
| MetaLen | BlockMeta | BlockMeta | ... | BlockData | BlockData | ... |
111+
+---------+-----------+-----------+-----+-----------+-----------+-----+
112+
|---------(N blocks)----------|---------(N blocks)----------|
113+
114+
- MetaLen(u64): length of the BlockMeta section
115+
- BlockMeta(BlockAddrBlockMetadata): metadata to seek through BlockData
116+
- BlockData(CompactedBlockAddr): bitpacked per block metadata
117+
118+
### BlockAddrBlockMetadata
119+
120+
+--------+------------+--------------+------------+--------------+-------------------+-----------------+----------+
121+
| Offset | RangeStart | FirstOrdinal | RangeSlope | OrdinalSlope | FirstOrdinalNBits | RangeStartNBits | BlockLen |
122+
+--------+------------+--------------+------------+--------------+-------------------+-----------------+----------+
123+
124+
- Offset(u64): offset of the corresponding BlockData in the datastream
125+
- RangeStart(u64): the start position of the first block
126+
- FirstOrdinal(u64): the first ordinal of the first block
127+
- RangeSlope(u32): slope predicted for start range evolution (see computation in BlockData)
128+
- OrdinalSlope(u64): slope predicted for first ordinal evolution (see computation in BlockData)
129+
- FirstOrdinalNBits(u8): number of bits per ordinal in datastream (see computation in BlockData)
130+
- RangeStartNBits(u8): number of bits per range start in datastream (see computation in BlockData)
131+
132+
### BlockData
133+
134+
+-----------------+-------------------+---------------+
135+
| RangeStartDelta | FirstOrdinalDelta | FinalRangeEnd |
136+
+-----------------+-------------------+---------------+
137+
|------(BlockLen repetitions)---------|
138+
139+
- RangeStartDelta(var): RangeStartNBits *bits* of little endian number. See below for decoding
140+
- FirstOrdinalDelta(var): FirstOrdinalNBits *bits* of little endian number. See below for decoding
141+
- FinalRangeEnd(var): RangeStartNBits *bits* of integer. See below for decoding
142+
143+
converting a BlockData of index Index and a BlockAddrBlockMetadata to an actual block address is done as follows:
144+
range\_prediction := RangeStart + Index * RangeSlope;
145+
range\_derivation := RangeStartDelta - (1 << (RangeStartNBits-1));
146+
range\_start := range\_prediction + range\_derivation
147+
148+
The same computation can be done for ordinal.
149+
150+
Note that `range_derivation` can take negative value. `RangeStartDelta` is just its translation to a positive range.
151+
152+
153+
## SingleBlockSStable
154+
155+
The format used for the index is meant to be compact, however it has a constant cost of around 70
156+
bytes, which isn't negligible for a table containing very few keys.
157+
To limit the impact of that constant cost, single block sstables omit the Fst and BlockAddrStore from
158+
their index. Instead a block with first ordinal of 0, range start of 0 and range end of IndexOffset
159+
is implicitly used for every operation.

sstable/benches/ord_to_term.rs

+44
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,31 @@ pub fn criterion_benchmark(c: &mut Criterion) {
4040
assert!(dict.ord_to_term(19_000_000, &mut res).unwrap());
4141
})
4242
});
43+
c.bench_function("term_ord_suffix", |b| {
44+
b.iter(|| {
45+
assert_eq!(
46+
dict.term_ord(b"prefix.00186A0.suffix").unwrap().unwrap(),
47+
100_000
48+
);
49+
assert_eq!(
50+
dict.term_ord(b"prefix.121EAC0.suffix").unwrap().unwrap(),
51+
19_000_000
52+
);
53+
})
54+
});
55+
c.bench_function("open_and_term_ord_suffix", |b| {
56+
b.iter(|| {
57+
let dict = Dictionary::<MonotonicU64SSTable>::open(slice.clone()).unwrap();
58+
assert_eq!(
59+
dict.term_ord(b"prefix.00186A0.suffix").unwrap().unwrap(),
60+
100_000
61+
);
62+
assert_eq!(
63+
dict.term_ord(b"prefix.121EAC0.suffix").unwrap().unwrap(),
64+
19_000_000
65+
);
66+
})
67+
});
4368
}
4469
{
4570
let slice = make_test_sstable("");
@@ -59,6 +84,25 @@ pub fn criterion_benchmark(c: &mut Criterion) {
5984
assert!(dict.ord_to_term(19_000_000, &mut res).unwrap());
6085
})
6186
});
87+
c.bench_function("term_ord", |b| {
88+
b.iter(|| {
89+
assert_eq!(dict.term_ord(b"prefix.00186A0").unwrap().unwrap(), 100_000);
90+
assert_eq!(
91+
dict.term_ord(b"prefix.121EAC0").unwrap().unwrap(),
92+
19_000_000
93+
);
94+
})
95+
});
96+
c.bench_function("open_and_term_ord", |b| {
97+
b.iter(|| {
98+
let dict = Dictionary::<MonotonicU64SSTable>::open(slice.clone()).unwrap();
99+
assert_eq!(dict.term_ord(b"prefix.00186A0").unwrap().unwrap(), 100_000);
100+
assert_eq!(
101+
dict.term_ord(b"prefix.121EAC0").unwrap().unwrap(),
102+
19_000_000
103+
);
104+
})
105+
});
62106
}
63107
}
64108

sstable/src/dictionary.rs

+34-14
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,11 @@ use common::{BinarySerializable, OwnedBytes};
99
use tantivy_fst::automaton::AlwaysMatch;
1010
use tantivy_fst::Automaton;
1111

12+
use crate::sstable_index_v3::SSTableIndexV3Empty;
1213
use crate::streamer::{Streamer, StreamerBuilder};
13-
use crate::{BlockAddr, DeltaReader, Reader, SSTable, SSTableIndex, TermOrdinal, VoidSSTable};
14+
use crate::{
15+
BlockAddr, DeltaReader, Reader, SSTable, SSTableIndex, SSTableIndexV3, TermOrdinal, VoidSSTable,
16+
};
1417

1518
/// An SSTable is a sorted map that associates sorted `&[u8]` keys
1619
/// to any kind of typed values.
@@ -180,24 +183,41 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
180183
pub fn open(term_dictionary_file: FileSlice) -> io::Result<Self> {
181184
let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(20);
182185
let mut footer_len_bytes: OwnedBytes = footer_len_slice.read_bytes()?;
183-
184186
let index_offset = u64::deserialize(&mut footer_len_bytes)?;
185187
let num_terms = u64::deserialize(&mut footer_len_bytes)?;
186188
let version = u32::deserialize(&mut footer_len_bytes)?;
187-
if version != crate::SSTABLE_VERSION {
188-
return Err(io::Error::new(
189-
io::ErrorKind::Other,
190-
format!(
191-
"Unsuported sstable version, expected {version}, found {}",
192-
crate::SSTABLE_VERSION,
193-
),
194-
));
195-
}
196-
197189
let (sstable_slice, index_slice) = main_slice.split(index_offset as usize);
198190
let sstable_index_bytes = index_slice.read_bytes()?;
199-
let sstable_index = SSTableIndex::load(sstable_index_bytes)
200-
.map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption"))?;
191+
192+
let sstable_index = match version {
193+
2 => SSTableIndex::V2(
194+
crate::sstable_index_v2::SSTableIndex::load(sstable_index_bytes).map_err(|_| {
195+
io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption")
196+
})?,
197+
),
198+
3 => {
199+
let (sstable_index_bytes, mut footerv3_len_bytes) = sstable_index_bytes.rsplit(8);
200+
let store_offset = u64::deserialize(&mut footerv3_len_bytes)?;
201+
if store_offset != 0 {
202+
SSTableIndex::V3(
203+
SSTableIndexV3::load(sstable_index_bytes, store_offset).map_err(|_| {
204+
io::Error::new(io::ErrorKind::InvalidData, "SSTable corruption")
205+
})?,
206+
)
207+
} else {
208+
// if store_offset is zero, there is no index, so we build a pseudo-index
209+
// assuming a single block of sstable covering everything.
210+
SSTableIndex::V3Empty(SSTableIndexV3Empty::load(index_offset as usize))
211+
}
212+
}
213+
_ => {
214+
return Err(io::Error::new(
215+
io::ErrorKind::Other,
216+
format!("Unsuported sstable version, expected one of [2, 3], found {version}"),
217+
))
218+
}
219+
};
220+
201221
Ok(Dictionary {
202222
sstable_slice,
203223
sstable_index,

sstable/src/lib.rs

+8-9
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@ pub mod merge;
1010
mod streamer;
1111
pub mod value;
1212

13-
mod sstable_index;
14-
pub use sstable_index::{BlockAddr, SSTableIndex, SSTableIndexBuilder};
13+
mod sstable_index_v3;
14+
pub use sstable_index_v3::{BlockAddr, SSTableIndex, SSTableIndexBuilder, SSTableIndexV3};
15+
mod sstable_index_v2;
1516
pub(crate) mod vint;
1617
pub use dictionary::Dictionary;
1718
pub use streamer::{Streamer, StreamerBuilder};
@@ -28,7 +29,7 @@ use crate::value::{RangeValueReader, RangeValueWriter};
2829
pub type TermOrdinal = u64;
2930

3031
const DEFAULT_KEY_CAPACITY: usize = 50;
31-
const SSTABLE_VERSION: u32 = 2;
32+
const SSTABLE_VERSION: u32 = 3;
3233

3334
/// Given two byte string returns the length of
3435
/// the longest common prefix.
@@ -304,7 +305,8 @@ where
304305

305306
let offset = wrt.written_bytes();
306307

307-
self.index_builder.serialize(&mut wrt)?;
308+
let fst_len: u64 = self.index_builder.serialize(&mut wrt)?;
309+
wrt.write_all(&fst_len.to_le_bytes())?;
308310
wrt.write_all(&offset.to_le_bytes())?;
309311
wrt.write_all(&self.num_terms.to_le_bytes())?;
310312

@@ -385,13 +387,10 @@ mod test {
385387
16, 17, 33, 18, 19, 17, 20, // data block
386388
0, 0, 0, 0, // no more block
387389
// index
388-
8, 0, 0, 0, // size of index block
389-
0, // compression
390-
1, 0, 12, 0, 32, 17, 20, // index block
391-
0, 0, 0, 0, // no more index block
390+
0, 0, 0, 0, 0, 0, 0, 0, // fst lenght
392391
16, 0, 0, 0, 0, 0, 0, 0, // index start offset
393392
3, 0, 0, 0, 0, 0, 0, 0, // num term
394-
2, 0, 0, 0, // version
393+
3, 0, 0, 0, // version
395394
]
396395
);
397396
let buffer = OwnedBytes::new(buffer);

0 commit comments

Comments
 (0)