Skip to content

Commit 8bd2acf

Browse files
authored
Raise diskann maximum dimension from 2K to 16K (#181)
This PR fixes #100 and raises the dimension limit for pgvectorscale's diskann index from 2000 to 16000, which is the maximum supported by the underlying pgvector `vector` type. The previous limit of 2000 was needed to ensure that each data structure could be serialized onto a single 8K page. When going beyond 2000 dimensions, so long as SBQ is used for storage, quantized vectors, neighbor lists, and other data structures will still fit on a single page; the only thing that grows too large is `SbqMeans`. (The raw vectors used for reranking remain in the source relation, where standard Postgres TOAST machinery is used to read/write them). If plain storage is used, the old limit of 2000 remains in place. To deal with `SbqMeans`, we introduce a `ChainTape` data structure that is similar to `Tape` but supports reads/writes of large buffers across pages. The chained representation is considered a property of the `PageType`, and we introduce a new `PageType` for `SbqMeans` along with upgrade machinery from the old version. As with the versioned `MetaPage`, there are no unit tests for this, but I did ad-hoc testing to confirm that the upgrade path works.
1 parent 8836117 commit 8bd2acf

File tree

6 files changed

+476
-56
lines changed

6 files changed

+476
-56
lines changed

pgvectorscale/src/access_method/build.rs

+75-8
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,14 @@ impl<'a, 'b> BuildState<'a, 'b> {
6262
}
6363
}
6464

65+
/// Maximum number of dimensions supported by pgvector's vector type. Also
66+
/// the maximum number of dimensions that can be indexed with diskann.
67+
pub const MAX_DIMENSION: u32 = 16000;
68+
69+
/// Maximum number of dimensions that can be indexed with diskann without
70+
/// using the SBQ storage type.
71+
pub const MAX_DIMENSION_NO_SBQ: u32 = 2000;
72+
6573
#[pg_guard]
6674
pub extern "C" fn ambuild(
6775
heaprel: pg_sys::Relation,
@@ -73,7 +81,7 @@ pub extern "C" fn ambuild(
7381
let opt = TSVIndexOptions::from_relation(&index_relation);
7482

7583
notice!(
76-
"Starting index build. num_neighbors={} search_list_size={}, max_alpha={}, storage_layout={:?}",
84+
"Starting index build with num_neighbors={}, search_list_size={}, max_alpha={}, storage_layout={:?}.",
7785
opt.get_num_neighbors(),
7886
opt.search_list_size,
7987
opt.max_alpha,
@@ -98,10 +106,22 @@ pub extern "C" fn ambuild(
98106
let meta_page =
99107
unsafe { MetaPage::create(&index_relation, dimensions as _, distance_type, opt) };
100108

101-
assert!(
102-
meta_page.get_num_dimensions_to_index() > 0
103-
&& meta_page.get_num_dimensions_to_index() <= 2000
104-
);
109+
if meta_page.get_num_dimensions_to_index() == 0 {
110+
error!("No dimensions to index");
111+
}
112+
113+
if meta_page.get_num_dimensions_to_index() > MAX_DIMENSION {
114+
error!("Too many dimensions to index (max is {})", MAX_DIMENSION);
115+
}
116+
117+
if meta_page.get_num_dimensions_to_index() > MAX_DIMENSION_NO_SBQ
118+
&& meta_page.get_storage_type() == StorageType::Plain
119+
{
120+
error!(
121+
"Too many dimensions to index with plain storage (max is {}). Use storage_layout=memory_optimized instead.",
122+
MAX_DIMENSION_NO_SBQ
123+
);
124+
}
105125

106126
let ntuples = do_heap_scan(index_info, &heap_relation, &index_relation, meta_page);
107127

@@ -878,7 +898,7 @@ pub mod tests {
878898
);
879899
880900
select setseed(0.5);
881-
-- generate 300 vectors
901+
-- generate {expected_cnt} vectors
882902
INSERT INTO {table_name} (id, embedding)
883903
SELECT
884904
*
@@ -1036,7 +1056,7 @@ pub mod tests {
10361056
);
10371057
10381058
select setseed(0.5);
1039-
-- generate 300 vectors
1059+
-- generate {expected_cnt} vectors
10401060
INSERT INTO test_data (id, embedding)
10411061
SELECT
10421062
*
@@ -1086,7 +1106,7 @@ pub mod tests {
10861106
CREATE INDEX idx_diskann_bq ON test_data USING diskann (embedding) WITH ({index_options});
10871107
10881108
select setseed(0.5);
1089-
-- generate 300 vectors
1109+
-- generate {expected_cnt} vectors
10901110
INSERT INTO test_data (id, embedding)
10911111
SELECT
10921112
*
@@ -1114,4 +1134,51 @@ pub mod tests {
11141134
verify_index_accuracy(expected_cnt, dimensions)?;
11151135
Ok(())
11161136
}
1137+
1138+
#[pg_test]
1139+
pub unsafe fn test_high_dimension_index() -> spi::Result<()> {
1140+
let index_options = "num_neighbors=10, search_list_size=10";
1141+
let expected_cnt = 1000;
1142+
1143+
for dimensions in [4000, 8000, 12000, 16000] {
1144+
Spi::run(&format!(
1145+
"CREATE TABLE test_data (
1146+
id int,
1147+
embedding vector ({dimensions})
1148+
);
1149+
1150+
CREATE INDEX idx_diskann_bq ON test_data USING diskann (embedding) WITH ({index_options});
1151+
1152+
select setseed(0.5);
1153+
-- generate {expected_cnt} vectors
1154+
INSERT INTO test_data (id, embedding)
1155+
SELECT
1156+
*
1157+
FROM (
1158+
SELECT
1159+
i % {expected_cnt},
1160+
('[' || array_to_string(array_agg(random()), ',', '0') || ']')::vector AS embedding
1161+
FROM
1162+
generate_series(1, {dimensions} * {expected_cnt}) i
1163+
GROUP BY
1164+
i % {expected_cnt}) g;
1165+
1166+
SET enable_seqscan = 0;
1167+
-- perform index scans on the vectors
1168+
SELECT
1169+
*
1170+
FROM
1171+
test_data
1172+
ORDER BY
1173+
embedding <=> (
1174+
SELECT
1175+
('[' || array_to_string(array_agg(random()), ',', '0') || ']')::vector AS embedding
1176+
FROM generate_series(1, {dimensions}));"))?;
1177+
1178+
verify_index_accuracy(expected_cnt, dimensions)?;
1179+
1180+
Spi::run("DROP TABLE test_data CASCADE;")?;
1181+
}
1182+
Ok(())
1183+
}
11171184
}

pgvectorscale/src/access_method/sbq.rs

+79-21
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,11 @@ use pgrx::{
2020
use rkyv::{vec::ArchivedVec, Archive, Deserialize, Serialize};
2121

2222
use crate::util::{
23-
page::PageType, table_slot::TableSlot, tape::Tape, ArchivedItemPointer, HeapPointer,
24-
IndexPointer, ItemPointer, ReadableBuffer,
23+
chain::{ChainItemReader, ChainTapeWriter},
24+
page::{PageType, ReadablePage},
25+
table_slot::TableSlot,
26+
tape::Tape,
27+
ArchivedItemPointer, HeapPointer, IndexPointer, ItemPointer, ReadableBuffer,
2528
};
2629

2730
use super::{meta_page::MetaPage, neighbor_with_distance::NeighborWithDistance};
@@ -33,33 +36,28 @@ const BITS_STORE_TYPE_SIZE: usize = 64;
3336
#[derive(Archive, Deserialize, Serialize, Readable, Writeable)]
3437
#[archive(check_bytes)]
3538
#[repr(C)]
36-
pub struct SbqMeans {
39+
pub struct SbqMeansV1 {
3740
count: u64,
3841
means: Vec<f32>,
3942
m2: Vec<f32>,
4043
}
4144

42-
impl SbqMeans {
45+
impl SbqMeansV1 {
4346
pub unsafe fn load<S: StatsNodeRead>(
4447
index: &PgRelation,
45-
meta_page: &super::meta_page::MetaPage,
48+
mut quantizer: SbqQuantizer,
49+
qip: ItemPointer,
4650
stats: &mut S,
4751
) -> SbqQuantizer {
48-
let mut quantizer = SbqQuantizer::new(meta_page);
49-
if quantizer.use_mean {
50-
if meta_page.get_quantizer_metadata_pointer().is_none() {
51-
pgrx::error!("No SBQ pointer found in meta page");
52-
}
53-
let quantizer_item_pointer = meta_page.get_quantizer_metadata_pointer().unwrap();
54-
let bq = SbqMeans::read(index, quantizer_item_pointer, stats);
55-
let archived = bq.get_archived_node();
56-
57-
quantizer.load(
58-
archived.count,
59-
archived.means.to_vec(),
60-
archived.m2.to_vec(),
61-
);
62-
}
52+
assert!(quantizer.use_mean);
53+
let bq = SbqMeansV1::read(index, qip, stats);
54+
let archived = bq.get_archived_node();
55+
56+
quantizer.load(
57+
archived.count,
58+
archived.means.to_vec(),
59+
archived.m2.to_vec(),
60+
);
6361
quantizer
6462
}
6563

@@ -69,7 +67,7 @@ impl SbqMeans {
6967
stats: &mut S,
7068
) -> ItemPointer {
7169
let mut tape = Tape::new(index, PageType::SbqMeans);
72-
let node = SbqMeans {
70+
let node = SbqMeansV1 {
7371
count: quantizer.count,
7472
means: quantizer.mean.to_vec(),
7573
m2: quantizer.m2.to_vec(),
@@ -80,6 +78,66 @@ impl SbqMeans {
8078
}
8179
}
8280

81+
#[derive(Archive, Deserialize, Serialize)]
82+
#[archive(check_bytes)]
83+
#[repr(C)]
84+
pub struct SbqMeans {
85+
count: u64,
86+
means: Vec<f32>,
87+
m2: Vec<f32>,
88+
}
89+
90+
impl SbqMeans {
91+
pub unsafe fn load<S: StatsNodeRead>(
92+
index: &PgRelation,
93+
meta_page: &super::meta_page::MetaPage,
94+
stats: &mut S,
95+
) -> SbqQuantizer {
96+
let mut quantizer = SbqQuantizer::new(meta_page);
97+
if !quantizer.use_mean {
98+
return quantizer;
99+
}
100+
let qip = meta_page
101+
.get_quantizer_metadata_pointer()
102+
.unwrap_or_else(|| pgrx::error!("No SBQ pointer found in meta page"));
103+
104+
let page = ReadablePage::read(index, qip.block_number);
105+
let page_type = page.get_type();
106+
match page_type {
107+
PageType::SbqMeansV1 => SbqMeansV1::load(index, quantizer, qip, stats),
108+
PageType::SbqMeans => {
109+
let mut tape_reader = ChainItemReader::new(index, PageType::SbqMeans, stats);
110+
let mut buf: Vec<u8> = Vec::new();
111+
for item in tape_reader.read(qip) {
112+
buf.extend_from_slice(item.get_data_slice());
113+
}
114+
115+
let means = rkyv::from_bytes::<SbqMeans>(buf.as_slice()).unwrap();
116+
quantizer.load(means.count, means.means, means.m2);
117+
quantizer
118+
}
119+
_ => {
120+
pgrx::error!("Invalid page type {} for SbqMeans", page_type as u8);
121+
}
122+
}
123+
}
124+
125+
pub unsafe fn store<S: StatsNodeWrite>(
126+
index: &PgRelation,
127+
quantizer: &SbqQuantizer,
128+
stats: &mut S,
129+
) -> ItemPointer {
130+
let bq = SbqMeans {
131+
count: quantizer.count,
132+
means: quantizer.mean.clone(),
133+
m2: quantizer.m2.clone(),
134+
};
135+
let mut tape = ChainTapeWriter::new(index, PageType::SbqMeans, stats);
136+
let buf = rkyv::to_bytes::<_, 1024>(&bq).unwrap();
137+
tape.write(&buf)
138+
}
139+
}
140+
83141
#[derive(Clone)]
84142
pub struct SbqQuantizer {
85143
pub use_mean: bool,

0 commit comments

Comments
 (0)