@@ -1,7 +1,7 @@
 use criterion::{criterion_group, criterion_main, Criterion, Throughput};
 use pprof::criterion::{Output, PProfProfiler};
 use tantivy::schema::{TantivyDocument, FAST, INDEXED, STORED, STRING, TEXT};
-use tantivy::{Index, IndexWriter};
+use tantivy::{tokenizer, Index, IndexWriter};
 
 const HDFS_LOGS: &str = include_str!("hdfs.json");
 const GH_LOGS: &str = include_str!("gh.json");
@@ -19,6 +19,13 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
         schema_builder.add_text_field("severity", STRING);
         schema_builder.build()
     };
+    let schema_only_fast = {
+        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
+        schema_builder.add_u64_field("timestamp", FAST);
+        schema_builder.add_text_field("body", FAST);
+        schema_builder.add_text_field("severity", FAST);
+        schema_builder.build()
+    };
     let schema_with_store = {
         let mut schema_builder = tantivy::schema::SchemaBuilder::new();
         schema_builder.add_u64_field("timestamp", INDEXED | STORED);
@@ -83,6 +90,30 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
             index_writer.commit().unwrap();
         })
     });
+    group.bench_function("index-hdfs-no-commit-fastfield", |b| {
+        let lines = get_lines(HDFS_LOGS);
+        b.iter(|| {
+            let index = Index::create_in_ram(schema_only_fast.clone());
+            let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
+            }
+        })
+    });
+    group.bench_function("index-hdfs-with-commit-fastfield", |b| {
+        let lines = get_lines(HDFS_LOGS);
+        b.iter(|| {
+            let index = Index::create_in_ram(schema_only_fast.clone());
+            let mut index_writer: IndexWriter =
+                index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
+                index_writer.add_document(doc).unwrap();
+            }
+            index_writer.commit().unwrap();
+        })
+    });
     group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
         let lines = get_lines(HDFS_LOGS);
         b.iter(|| {
@@ -107,6 +138,18 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
         schema_builder.add_json_field("json", TEXT | FAST);
         schema_builder.build()
     };
+    let dynamic_schema_fast = {
+        let mut schema_builder = tantivy::schema::SchemaBuilder::new();
+        schema_builder.add_json_field("json", FAST);
+        schema_builder.build()
+    };
+    let ff_tokenizer_manager = tokenizer::TokenizerManager::default();
+    ff_tokenizer_manager.register(
+        "raw",
+        tokenizer::TextAnalyzer::builder(tokenizer::RawTokenizer::default())
+            .filter(tokenizer::RemoveLongFilter::limit(255))
+            .build(),
+    );
 
     let mut group = c.benchmark_group("index-gh");
     group.throughput(Throughput::Bytes(GH_LOGS.len() as u64));
@@ -115,7 +158,23 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
         let lines = get_lines(GH_LOGS);
         b.iter(|| {
             let json_field = dynamic_schema.get_field("json").unwrap();
-            let index = Index::create_in_ram(dynamic_schema.clone());
+            let mut index = Index::create_in_ram(dynamic_schema.clone());
+            index.set_fast_field_tokenizers(ff_tokenizer_manager.clone());
+            let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
+            for doc_json in &lines {
+                let json_val: serde_json::Map<String, serde_json::Value> =
+                    serde_json::from_str(doc_json).unwrap();
+                let doc = tantivy::doc!(json_field=>json_val);
+                index_writer.add_document(doc).unwrap();
+            }
+        })
+    });
+    group.bench_function("index-gh-fast", |b| {
+        let lines = get_lines(GH_LOGS);
+        b.iter(|| {
+            let json_field = dynamic_schema_fast.get_field("json").unwrap();
+            let mut index = Index::create_in_ram(dynamic_schema_fast.clone());
+            index.set_fast_field_tokenizers(ff_tokenizer_manager.clone());
             let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
             for doc_json in &lines {
                 let json_val: serde_json::Map<String, serde_json::Value> =
@@ -125,11 +184,13 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
             }
         })
     });
+
     group.bench_function("index-gh-with-commit", |b| {
         let lines = get_lines(GH_LOGS);
        b.iter(|| {
             let json_field = dynamic_schema.get_field("json").unwrap();
-            let index = Index::create_in_ram(dynamic_schema.clone());
+            let mut index = Index::create_in_ram(dynamic_schema.clone());
+            index.set_fast_field_tokenizers(ff_tokenizer_manager.clone());
             let mut index_writer: IndexWriter =
                 index.writer_with_num_threads(1, 100_000_000).unwrap();
             for doc_json in &lines {