Skip to content

Commit a4b759d

Browse files
authored
Include stop word lists from Lucene and the Snowball project (#1666)
1 parent 3e9c806 commit a4b759d

File tree

6 files changed

+2243
-6
lines changed

6 files changed

+2243
-6
lines changed

.github/workflows/test.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ jobs:
4848
strategy:
4949
matrix:
5050
features: [
51-
{ label: "all", flags: "mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
51+
{ label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
5252
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
5353
]
5454

Cargo.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,9 @@ debug-assertions = true
9191
overflow-checks = true
9292

9393
[features]
94-
default = ["mmap", "lz4-compression" ]
94+
default = ["mmap", "stopwords", "lz4-compression"]
9595
mmap = ["fs2", "tempfile", "memmap2"]
96+
stopwords = []
9697

9798
brotli-compression = ["brotli"]
9899
lz4-compression = ["lz4_flex"]

src/fieldnorm/writer.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use crate::DocId;
99
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
1010
/// of each document for each field with field norms.
1111
///
12-
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
12+
/// `FieldNormsWriter` stores a `Vec<u8>` for each tracked field, using a
1313
/// byte per document per field.
1414
pub struct FieldNormsWriter {
1515
fieldnorms_buffers: Vec<Option<Vec<u8>>>,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import requests
2+
3+
LANGUAGES = [
4+
"danish",
5+
"dutch",
6+
"finnish",
7+
"french",
8+
"german",
9+
"italian",
10+
"norwegian",
11+
"portuguese",
12+
"russian",
13+
"spanish",
14+
"swedish",
15+
]
16+
17+
with requests.Session() as sess, open("stopwords.rs", "w") as mod:
18+
mod.write("/*\n")
19+
mod.write(
20+
"These stop word lists are from the Snowball project (https://snowballstem.org/)\nwhich carries the following copyright and license:\n\n"
21+
)
22+
23+
resp = sess.get(
24+
"https://raw.githubusercontent.com/snowballstem/snowball/master/COPYING"
25+
)
26+
resp.raise_for_status()
27+
mod.write(resp.text)
28+
mod.write("*/\n\n")
29+
30+
for lang in LANGUAGES:
31+
resp = sess.get(f"https://snowballstem.org/algorithms/{lang}/stop.txt")
32+
resp.raise_for_status()
33+
34+
mod.write(f"pub const {lang.upper()}: &[&str] = &[\n")
35+
36+
for line in resp.text.splitlines():
37+
line, _, _ = line.partition("|")
38+
39+
for word in line.split():
40+
mod.write(f' "{word}",\n')
41+
42+
mod.write("];\n\n")

src/tokenizer/stop_word_filter.rs src/tokenizer/stop_word_filter/mod.rs

+80-3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
//! assert_eq!(stream.next().unwrap().text, "crafty");
1111
//! assert!(stream.next().is_none());
1212
//! ```
13+
#[cfg(feature = "stopwords")]
14+
#[rustfmt::skip]
15+
mod stopwords;
16+
1317
use std::sync::Arc;
1418

1519
use rustc_hash::FxHashSet;
@@ -31,14 +35,87 @@ impl StopWordFilter {
3135
}
3236
}
3337

34-
fn english() -> StopWordFilter {
35-
let words: [&'static str; 33] = [
38+
fn from_word_list(words: &[&str]) -> Self {
39+
Self::remove(words.iter().map(|&word| word.to_owned()))
40+
}
41+
42+
#[cfg(feature = "stopwords")]
43+
/// Create a `StopWordFilter` for the Danish language
44+
pub fn danish() -> Self {
45+
Self::from_word_list(stopwords::DANISH)
46+
}
47+
48+
#[cfg(feature = "stopwords")]
49+
/// Create a `StopWordFilter` for the Dutch language
50+
pub fn dutch() -> Self {
51+
Self::from_word_list(stopwords::DUTCH)
52+
}
53+
54+
/// Create a `StopWordFilter` for the English language
55+
pub fn english() -> Self {
56+
// This is the same list of words used by the Apache-licensed Lucene project,
57+
// cf. https://github.com/apache/lucene/blob/d5d6dc079395c47cd6d12dcce3bcfdd2c7d9dc63/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java#L46
58+
const WORDS: &[&str] = &[
3659
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
3760
"is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then",
3861
"there", "these", "they", "this", "to", "was", "will", "with",
3962
];
4063

41-
StopWordFilter::remove(words.iter().map(|&s| s.to_string()))
64+
Self::from_word_list(WORDS)
65+
}
66+
67+
#[cfg(feature = "stopwords")]
68+
/// Create a `StopWordFilter` for the Finnish language
69+
pub fn finnish() -> Self {
70+
Self::from_word_list(stopwords::FINNISH)
71+
}
72+
73+
#[cfg(feature = "stopwords")]
74+
/// Create a `StopWordFilter` for the French language
75+
pub fn french() -> Self {
76+
Self::from_word_list(stopwords::FRENCH)
77+
}
78+
79+
#[cfg(feature = "stopwords")]
80+
/// Create a `StopWordFilter` for the German language
81+
pub fn german() -> Self {
82+
Self::from_word_list(stopwords::GERMAN)
83+
}
84+
85+
#[cfg(feature = "stopwords")]
86+
/// Create a `StopWordFilter` for the Italian language
87+
pub fn italian() -> Self {
88+
Self::from_word_list(stopwords::ITALIAN)
89+
}
90+
91+
#[cfg(feature = "stopwords")]
92+
/// Create a `StopWordFilter` for the Norwegian language
93+
pub fn norwegian() -> Self {
94+
Self::from_word_list(stopwords::NORWEGIAN)
95+
}
96+
97+
#[cfg(feature = "stopwords")]
98+
/// Create a `StopWordFilter` for the Portuguese language
99+
pub fn portuguese() -> Self {
100+
Self::from_word_list(stopwords::PORTUGUESE)
101+
}
102+
103+
#[cfg(feature = "stopwords")]
104+
/// Create a `StopWordFilter` for the Russian language
105+
pub fn russian() -> Self {
106+
Self::from_word_list(stopwords::RUSSIAN)
107+
}
108+
109+
#[cfg(feature = "stopwords")]
110+
/// Create a `StopWordFilter` for the Spanish language
111+
pub fn spanish() -> Self {
112+
Self::from_word_list(stopwords::SPANISH)
113+
}
114+
115+
#[cfg(feature = "stopwords")]
116+
/// Create a `StopWordFilter` for the Swedish language
117+
pub fn swedish() -> Self {
118+
Self::from_word_list(stopwords::SWEDISH)
42119
}
43120
}
44121

0 commit comments

Comments
 (0)