Skip to content

Commit 4bf568f

Browse files
Enable Fuzzy codec for doc id fields using a bloom filter (opensearch-project#11027) (opensearch-project#12171)
* Enable Fuzzy codec for doc id fields using a bloom filter (cherry picked from commit 0a88963) Signed-off-by: mgodwan <[email protected]> Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 39fa660 commit 4bf568f

24 files changed

+1538
-2
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
5050
- Add support for Google Application Default Credentials in repository-gcs ([#8394](https://github.com/opensearch-project/OpenSearch/pull/8394))
5151
- New DateTime format for RFC3339 compatible date fields ([#11465](https://github.com/opensearch-project/OpenSearch/pull/11465))
5252
- Remove concurrent segment search feature flag for GA launch ([#12074](https://github.com/opensearch-project/OpenSearch/pull/12074))
53+
- Enable Fuzzy codec for doc id fields using a bloom filter ([#11022](https://github.com/opensearch-project/OpenSearch/pull/11022))
5354

5455
### Dependencies
5556
- Bumps jetty version to 9.4.52.v20230823 to fix GMS-2023-1857 ([#9822](https://github.com/opensearch-project/OpenSearch/pull/9822))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
package org.opensearch.benchmark.index.codec.fuzzy;
10+
11+
import org.apache.lucene.util.BytesRef;
12+
import org.opensearch.common.UUIDs;
13+
import org.opensearch.index.codec.fuzzy.FuzzySet;
14+
import org.opensearch.index.codec.fuzzy.FuzzySetFactory;
15+
import org.opensearch.index.codec.fuzzy.FuzzySetParameters;
16+
import org.opensearch.index.mapper.IdFieldMapper;
17+
import org.openjdk.jmh.annotations.Benchmark;
18+
import org.openjdk.jmh.annotations.BenchmarkMode;
19+
import org.openjdk.jmh.annotations.Fork;
20+
import org.openjdk.jmh.annotations.Measurement;
21+
import org.openjdk.jmh.annotations.Mode;
22+
import org.openjdk.jmh.annotations.OutputTimeUnit;
23+
import org.openjdk.jmh.annotations.Param;
24+
import org.openjdk.jmh.annotations.Scope;
25+
import org.openjdk.jmh.annotations.Setup;
26+
import org.openjdk.jmh.annotations.State;
27+
import org.openjdk.jmh.annotations.Warmup;
28+
29+
import java.io.IOException;
30+
import java.util.List;
31+
import java.util.Map;
32+
import java.util.concurrent.TimeUnit;
33+
import java.util.stream.Collectors;
34+
import java.util.stream.IntStream;
35+
36+
@Fork(3)
37+
@Warmup(iterations = 2)
38+
@Measurement(iterations = 5, time = 60, timeUnit = TimeUnit.SECONDS)
39+
@BenchmarkMode(Mode.AverageTime)
40+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
41+
@State(Scope.Benchmark)
42+
public class FilterConstructionBenchmark {
43+
44+
private List<BytesRef> items;
45+
46+
@Param({ "1000000", "10000000", "50000000" })
47+
private int numIds;
48+
49+
@Param({ "0.0511", "0.1023", "0.2047" })
50+
private double fpp;
51+
52+
private FuzzySetFactory fuzzySetFactory;
53+
private String fieldName;
54+
55+
@Setup
56+
public void setupIds() {
57+
this.fieldName = IdFieldMapper.NAME;
58+
this.items = IntStream.range(0, numIds).mapToObj(i -> new BytesRef(UUIDs.base64UUID())).collect(Collectors.toList());
59+
FuzzySetParameters parameters = new FuzzySetParameters(() -> fpp);
60+
this.fuzzySetFactory = new FuzzySetFactory(Map.of(fieldName, parameters));
61+
}
62+
63+
@Benchmark
64+
public FuzzySet buildFilter() throws IOException {
65+
return fuzzySetFactory.createFuzzySet(items.size(), fieldName, () -> items.iterator());
66+
}
67+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
package org.opensearch.benchmark.index.codec.fuzzy;
10+
11+
import org.apache.lucene.util.BytesRef;
12+
import org.opensearch.common.UUIDs;
13+
import org.opensearch.index.codec.fuzzy.FuzzySet;
14+
import org.opensearch.index.codec.fuzzy.FuzzySetFactory;
15+
import org.opensearch.index.codec.fuzzy.FuzzySetParameters;
16+
import org.opensearch.index.mapper.IdFieldMapper;
17+
import org.openjdk.jmh.annotations.Benchmark;
18+
import org.openjdk.jmh.annotations.BenchmarkMode;
19+
import org.openjdk.jmh.annotations.Fork;
20+
import org.openjdk.jmh.annotations.Measurement;
21+
import org.openjdk.jmh.annotations.Mode;
22+
import org.openjdk.jmh.annotations.OutputTimeUnit;
23+
import org.openjdk.jmh.annotations.Param;
24+
import org.openjdk.jmh.annotations.Scope;
25+
import org.openjdk.jmh.annotations.Setup;
26+
import org.openjdk.jmh.annotations.State;
27+
import org.openjdk.jmh.annotations.Warmup;
28+
import org.openjdk.jmh.infra.Blackhole;
29+
30+
import java.io.IOException;
31+
import java.util.List;
32+
import java.util.Map;
33+
import java.util.Random;
34+
import java.util.concurrent.TimeUnit;
35+
import java.util.stream.Collectors;
36+
import java.util.stream.IntStream;
37+
38+
@Fork(3)
39+
@Warmup(iterations = 2)
40+
@Measurement(iterations = 5, time = 60, timeUnit = TimeUnit.SECONDS)
41+
@BenchmarkMode(Mode.AverageTime)
42+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
43+
@State(Scope.Benchmark)
44+
public class FilterLookupBenchmark {
45+
46+
@Param({ "50000000", "1000000" })
47+
private int numItems;
48+
49+
@Param({ "1000000" })
50+
private int searchKeyCount;
51+
52+
@Param({ "0.0511", "0.1023", "0.2047" })
53+
private double fpp;
54+
55+
private FuzzySet fuzzySet;
56+
private List<BytesRef> items;
57+
private Random random = new Random();
58+
59+
@Setup
60+
public void setupFilter() throws IOException {
61+
String fieldName = IdFieldMapper.NAME;
62+
items = IntStream.range(0, numItems).mapToObj(i -> new BytesRef(UUIDs.base64UUID())).collect(Collectors.toList());
63+
FuzzySetParameters parameters = new FuzzySetParameters(() -> fpp);
64+
fuzzySet = new FuzzySetFactory(Map.of(fieldName, parameters)).createFuzzySet(numItems, fieldName, () -> items.iterator());
65+
}
66+
67+
@Benchmark
68+
public void contains_withExistingKeys(Blackhole blackhole) throws IOException {
69+
for (int i = 0; i < searchKeyCount; i++) {
70+
blackhole.consume(fuzzySet.contains(items.get(random.nextInt(items.size()))) == FuzzySet.Result.MAYBE);
71+
}
72+
}
73+
74+
@Benchmark
75+
public void contains_withRandomKeys(Blackhole blackhole) throws IOException {
76+
for (int i = 0; i < searchKeyCount; i++) {
77+
blackhole.consume(fuzzySet.contains(new BytesRef(UUIDs.base64UUID())));
78+
}
79+
}
80+
}

qa/rolling-upgrade/build.gradle

+1
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ for (Version bwcVersion : BuildParams.bwcVersions.wireCompatible) {
6262
setting 'repositories.url.allowed_urls', 'http://snapshot.test*'
6363
setting 'path.repo', "${buildDir}/cluster/shared/repo/${baseName}"
6464
setting 'http.content_type.required', 'true'
65+
systemProperty 'opensearch.experimental.optimize_doc_id_lookup.fuzzy_set.enabled', 'true'
6566
}
6667
}
6768

qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java

+87
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import org.opensearch.common.Booleans;
4444
import org.opensearch.common.io.Streams;
4545
import org.opensearch.common.settings.Settings;
46+
import org.opensearch.index.IndexSettings;
4647
import org.opensearch.index.codec.CodecService;
4748
import org.opensearch.index.engine.EngineConfig;
4849
import org.opensearch.indices.replication.common.ReplicationType;
@@ -349,6 +350,92 @@ public void testIndexingWithSegRep() throws Exception {
349350
}
350351
}
351352

353+
public void testIndexingWithFuzzyFilterPostings() throws Exception {
354+
if (UPGRADE_FROM_VERSION.onOrBefore(Version.V_2_11_1)) {
355+
logger.info("--> Skip test for version {} where fuzzy filter postings format feature is not available", UPGRADE_FROM_VERSION);
356+
return;
357+
}
358+
final String indexName = "test-index-fuzzy-set";
359+
final int shardCount = 3;
360+
final int replicaCount = 1;
361+
logger.info("--> Case {}", CLUSTER_TYPE);
362+
printClusterNodes();
363+
logger.info("--> _cat/shards before test execution \n{}", EntityUtils.toString(client().performRequest(new Request("GET", "/_cat/shards?v")).getEntity()));
364+
switch (CLUSTER_TYPE) {
365+
case OLD:
366+
Settings.Builder settings = Settings.builder()
367+
.put(IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), shardCount)
368+
.put(IndexMetadata.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), replicaCount)
369+
.put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT)
370+
.put(
371+
EngineConfig.INDEX_CODEC_SETTING.getKey(),
372+
randomFrom(new ArrayList<>(CODECS) {
373+
{
374+
add(CodecService.LUCENE_DEFAULT_CODEC);
375+
}
376+
})
377+
)
378+
.put(INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), "100ms");
379+
createIndex(indexName, settings.build());
380+
waitForClusterHealthWithNoShardMigration(indexName, "green");
381+
bulk(indexName, "_OLD", 5);
382+
break;
383+
case MIXED:
384+
waitForClusterHealthWithNoShardMigration(indexName, "yellow");
385+
break;
386+
case UPGRADED:
387+
Settings.Builder settingsBuilder = Settings.builder()
388+
.put(IndexSettings.INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING.getKey(), true);
389+
updateIndexSettings(indexName, settingsBuilder);
390+
waitForClusterHealthWithNoShardMigration(indexName, "green");
391+
break;
392+
default:
393+
throw new UnsupportedOperationException("Unknown cluster type [" + CLUSTER_TYPE + "]");
394+
}
395+
396+
int expectedCount;
397+
switch (CLUSTER_TYPE) {
398+
case OLD:
399+
expectedCount = 5;
400+
break;
401+
case MIXED:
402+
if (Booleans.parseBoolean(System.getProperty("tests.first_round"))) {
403+
expectedCount = 5;
404+
} else {
405+
expectedCount = 10;
406+
}
407+
break;
408+
case UPGRADED:
409+
expectedCount = 15;
410+
break;
411+
default:
412+
throw new UnsupportedOperationException("Unknown cluster type [" + CLUSTER_TYPE + "]");
413+
}
414+
415+
waitForSearchableDocs(indexName, shardCount, replicaCount);
416+
assertCount(indexName, expectedCount);
417+
418+
if (CLUSTER_TYPE != ClusterType.OLD) {
419+
bulk(indexName, "_" + CLUSTER_TYPE, 5);
420+
logger.info("--> Index one doc (to be deleted next) and verify doc count");
421+
Request toBeDeleted = new Request("PUT", "/" + indexName + "/_doc/to_be_deleted");
422+
toBeDeleted.addParameter("refresh", "true");
423+
toBeDeleted.setJsonEntity("{\"f1\": \"delete-me\"}");
424+
client().performRequest(toBeDeleted);
425+
waitForSearchableDocs(indexName, shardCount, replicaCount);
426+
assertCount(indexName, expectedCount + 6);
427+
428+
logger.info("--> Delete previously added doc and verify doc count");
429+
Request delete = new Request("DELETE", "/" + indexName + "/_doc/to_be_deleted");
430+
delete.addParameter("refresh", "true");
431+
client().performRequest(delete);
432+
waitForSearchableDocs(indexName, shardCount, replicaCount);
433+
assertCount(indexName, expectedCount + 5);
434+
435+
//forceMergeAndVerify(indexName, shardCount * (1 + replicaCount));
436+
}
437+
}
438+
352439
public void testAutoIdWithOpTypeCreate() throws IOException {
353440
final String indexName = "auto_id_and_op_type_create_index";
354441
StringBuilder b = new StringBuilder();

server/src/main/java/org/opensearch/common/settings/FeatureFlagSettings.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ protected FeatureFlagSettings(
3434
FeatureFlags.IDENTITY_SETTING,
3535
FeatureFlags.TELEMETRY_SETTING,
3636
FeatureFlags.DATETIME_FORMATTER_CACHING_SETTING,
37-
FeatureFlags.WRITEABLE_REMOTE_INDEX_SETTING
37+
FeatureFlags.WRITEABLE_REMOTE_INDEX_SETTING,
38+
FeatureFlags.DOC_ID_FUZZY_SET_SETTING
3839
);
3940
}

server/src/main/java/org/opensearch/common/settings/IndexScopedSettings.java

+3
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,9 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
229229
IndexMetadata.INDEX_REMOTE_SEGMENT_STORE_REPOSITORY_SETTING,
230230
IndexMetadata.INDEX_REMOTE_TRANSLOG_REPOSITORY_SETTING,
231231

232+
IndexSettings.INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING,
233+
IndexSettings.INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING,
234+
232235
// Settings for concurrent segment search
233236
IndexSettings.INDEX_CONCURRENT_SEGMENT_SEARCH_SETTING,
234237

server/src/main/java/org/opensearch/common/util/FeatureFlags.java

+7
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ public class FeatureFlags {
5454
*/
5555
public static final String WRITEABLE_REMOTE_INDEX = "opensearch.experimental.feature.writeable_remote_index.enabled";
5656

57+
/**
58+
* Gates the optimization to enable bloom filters for doc id lookup.
59+
*/
60+
public static final String DOC_ID_FUZZY_SET = "opensearch.experimental.optimize_doc_id_lookup.fuzzy_set.enabled";
61+
5762
/**
5863
* Should store the settings from opensearch.yml.
5964
*/
@@ -110,4 +115,6 @@ public static boolean isEnabled(Setting<Boolean> featureFlag) {
110115
false,
111116
Property.NodeScope
112117
);
118+
119+
public static final Setting<Boolean> DOC_ID_FUZZY_SET_SETTING = Setting.boolSetting(DOC_ID_FUZZY_SET, false, Property.NodeScope);
113120
}

0 commit comments

Comments
 (0)