Skip to content

Commit 5996209

Browse files
authored
fix: #2078 return error when tokenizer not found while indexing (#2093)
* fix: #2078 return error when tokenizer not found while indexing
* chore: formatting issues
* chore: fix review comments
1 parent ebc7812 commit 5996209

File tree

1 file changed

+46
-10
lines changed

1 file changed

+46
-10
lines changed

src/indexer/segment_writer.rs

+46-10
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ use crate::postings::{
1515
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value, DATE_TIME_PRECISION_INDEXED};
1616
use crate::store::{StoreReader, StoreWriter};
1717
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
18-
use crate::{DocId, Document, Opstamp, SegmentComponent};
18+
use crate::{DocId, Document, Opstamp, SegmentComponent, TantivyError};
1919

2020
/// Computes the initial size of the hash table.
2121
///
@@ -98,14 +98,18 @@ impl SegmentWriter {
9898
}
9999
_ => None,
100100
};
101-
text_options
102-
.and_then(|text_index_option| {
103-
let tokenizer_name = &text_index_option.tokenizer();
104-
tokenizer_manager.get(tokenizer_name)
105-
})
106-
.unwrap_or_default()
101+
let tokenizer_name = text_options
102+
.and_then(|text_index_option| Some(text_index_option.tokenizer()))
103+
.unwrap_or("default");
104+
105+
tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
106+
TantivyError::SchemaError(format!(
107+
"Error getting tokenizer for field: {}",
108+
field_entry.name()
109+
))
110+
})
107111
})
108-
.collect();
112+
.collect::<Result<Vec<_>, _>>()?;
109113
Ok(SegmentWriter {
110114
max_doc: 0,
111115
ctx: IndexingContext::new(table_size),
@@ -438,15 +442,19 @@ fn remap_and_write(
438442

439443
#[cfg(test)]
440444
mod tests {
441-
use std::path::Path;
445+
use std::path::{Path, PathBuf};
446+
447+
use tempfile::TempDir;
442448

443449
use super::compute_initial_table_size;
444450
use crate::collector::Count;
445451
use crate::core::json_utils::JsonTermWriter;
446452
use crate::directory::RamDirectory;
447453
use crate::postings::TermInfo;
448454
use crate::query::PhraseQuery;
449-
use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT};
455+
use crate::schema::{
456+
IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Type, STORED, STRING, TEXT,
457+
};
450458
use crate::store::{Compressor, StoreReader, StoreWriter};
451459
use crate::time::format_description::well_known::Rfc3339;
452460
use crate::time::OffsetDateTime;
@@ -900,4 +908,32 @@ mod tests {
900908
postings.positions(&mut positions);
901909
assert_eq!(positions, &[4]); //< as opposed to 3 if we had a position length of 1.
902910
}
911+
912+
#[test]
fn test_show_error_when_tokenizer_not_registered() {
    // Build a schema whose "title" field is configured with a tokenizer
    // name ("custom_en") that will never be registered on the index.
    let indexing = TextFieldIndexing::default()
        .set_tokenizer("custom_en")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let options = TextOptions::default()
        .set_indexing_options(indexing)
        .set_stored();
    let mut builder = Schema::builder();
    builder.add_text_field("title", options);
    let schema = builder.build();

    // Create the index on disk and reopen it; the reopened index only
    // carries the default tokenizer set, so "custom_en" is unresolved.
    let tempdir = TempDir::new().unwrap();
    let tempdir_path = PathBuf::from(tempdir.path());
    Index::create_in_dir(&tempdir_path, schema).unwrap();
    let index = Index::open_in_dir(tempdir_path).unwrap();
    let schema = index.schema();
    let title = schema.get_field("title").unwrap();

    // Committing a document must surface a schema error instead of
    // silently falling back to another tokenizer (regression for #2078).
    let mut writer = index.writer(50_000_000).unwrap();
    let mut doc = Document::default();
    doc.add_text(title, "The Old Man and the Sea");
    writer.add_document(doc).unwrap();
    let error = writer.commit().unwrap_err();
    assert_eq!(
        error.to_string(),
        "Schema error: 'Error getting tokenizer for field: title'"
    );
}
903939
}

0 commit comments

Comments (0)