From 1f8c090a5f0797e6b0d593ea43ce9e1e71de51e6 Mon Sep 17 00:00:00 2001
From: Matthew Woodcraft
Date: Sun, 22 May 2022 13:37:19 +0100
Subject: [PATCH 1/2] When creating the search index, omit words longer than 80 characters

This avoids creating deeply nested objects in searchindex.json
---
 src/renderer/html_handlebars/search.rs | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/renderer/html_handlebars/search.rs b/src/renderer/html_handlebars/search.rs
index 0a59ffe9f9..b5f8c4a2ac 100644
--- a/src/renderer/html_handlebars/search.rs
+++ b/src/renderer/html_handlebars/search.rs
@@ -2,7 +2,7 @@ use std::borrow::Cow;
 use std::collections::{HashMap, HashSet};
 use std::path::Path;
 
-use elasticlunr::Index;
+use elasticlunr::{Index, IndexBuilder};
 use pulldown_cmark::*;
 
 use crate::book::{Book, BookItem};
@@ -13,9 +13,25 @@ use crate::utils;
 
 use serde::Serialize;
 
+const MAX_WORD_LENGTH_TO_INDEX: usize = 80;
+
+/// Tokenizes in the same way as elasticlunr-rs (for English), but also drops long tokens.
+fn tokenize(text: &str) -> Vec<String> {
+    text.split(|c: char| c.is_whitespace() || c == '-')
+        .filter(|s| !s.is_empty())
+        .map(|s| s.trim().to_lowercase())
+        .filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX)
+        .collect()
+}
+
 /// Creates all files required for search.
 pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> {
-    let mut index = Index::new(&["title", "body", "breadcrumbs"]);
+    let mut index = IndexBuilder::new()
+        .add_field_with_tokenizer("title", Box::new(&tokenize))
+        .add_field_with_tokenizer("body", Box::new(&tokenize))
+        .add_field_with_tokenizer("breadcrumbs", Box::new(&tokenize))
+        .build();
+
     let mut doc_urls = Vec::with_capacity(book.sections.len());
 
     for item in book.iter() {

From 000a93dc777fe856bd93ccb29616250d05f99c67 Mon Sep 17 00:00:00 2001
From: Matthew Woodcraft
Date: Sun, 22 May 2022 13:57:09 +0100
Subject: [PATCH 2/2] Test that long words are omitted from the search index.

Note they do appear in the 'docs' part of searchindex.json (so they
will be visible in search teasers).
---
 tests/dummy_book/src/first/no-headers.md | 4 +++-
 tests/rendered_output.rs                 | 2 +-
 tests/searchindex_fixture.json           | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/dummy_book/src/first/no-headers.md b/tests/dummy_book/src/first/no-headers.md
index 8f9a6d17ef..5d799aa684 100644
--- a/tests/dummy_book/src/first/no-headers.md
+++ b/tests/dummy_book/src/first/no-headers.md
@@ -1,3 +1,5 @@
 Capybara capybara capybara.
 
-Capybara capybara capybara.
\ No newline at end of file
+Capybara capybara capybara.
+
+ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex.

diff --git a/tests/rendered_output.rs b/tests/rendered_output.rs
index 873a622df5..c6267830e3 100644
--- a/tests/rendered_output.rs
+++ b/tests/rendered_output.rs
@@ -772,7 +772,7 @@ mod search {
         );
         assert_eq!(
             docs[&no_headers]["body"],
-            "Capybara capybara capybara. Capybara capybara capybara."
+            "Capybara capybara capybara. Capybara capybara capybara. ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex."
         );
     }
 
diff --git a/tests/searchindex_fixture.json b/tests/searchindex_fixture.json
index 9c349b6b20..3d7062d237 100644
--- a/tests/searchindex_fixture.json
+++ b/tests/searchindex_fixture.json
@@ -229,7 +229,7 @@
             "title": "Unicode stress tests"
         },
         "18": {
-            "body": "Capybara capybara capybara. Capybara capybara capybara.",
+            "body": "Capybara capybara capybara. Capybara capybara capybara. ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex.",
             "breadcrumbs": "First Chapter » No Headers",
             "id": "18",
             "title": "First Chapter"
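
Note for reviewers (not part of the patches): below is a minimal,
self-contained sketch of the behaviour that PATCH 2/2 asserts. The
`tokenize` function and `MAX_WORD_LENGTH_TO_INDEX` are copied verbatim
from PATCH 1/2; `main` and its sample input are illustrative only.

    const MAX_WORD_LENGTH_TO_INDEX: usize = 80;

    /// Same tokenizer as in PATCH 1/2: split on whitespace and hyphens,
    /// lowercase, and drop any token longer than 80 characters.
    fn tokenize(text: &str) -> Vec<String> {
        text.split(|c: char| c.is_whitespace() || c == '-')
            .filter(|s| !s.is_empty())
            .map(|s| s.trim().to_lowercase())
            .filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX)
            .collect()
    }

    fn main() {
        let body = "Capybara capybara capybara. \
                    ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex.";
        // The 88-character word exceeds MAX_WORD_LENGTH_TO_INDEX, so it
        // produces no token; the short words survive, lowercased (trailing
        // punctuation is kept, since only whitespace and '-' split tokens).
        assert_eq!(tokenize(body), vec!["capybara", "capybara", "capybara."]);
    }

This also illustrates the point in the second commit message: the tokenizer
only limits what enters the inverted index, so the long word still appears
in the stored "body" strings of searchindex.json and hence in search teasers.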