Skip to content

Commit 92ad814

Browse files
authored
Merge pull request #5488 from BookStackApp/search_index_updates
Search index improvements
2 parents 2291d78 + f1b8e85 commit 92ad814

7 files changed

+333
-189
lines changed

app/Search/SearchIndex.php

+34-7
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,13 @@ class SearchIndex
1616
/**
1717
* A list of delimiter characters used to break-up parsed content into terms for indexing.
1818
*/
19-
public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
19+
public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»";
20+
21+
/**
22+
* A list of delimiters which are commonly used within a single term while also indicating a break between terms.
23+
* The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
24+
*/
25+
public static string $softDelimiters = ".-";
2026

2127
public function __construct(
2228
protected EntityProvider $entityProvider
@@ -196,15 +202,36 @@ protected function generateTermScoreMapFromTags(array $tags): array
196202
protected function textToTermCountMap(string $text): array
197203
{
198204
$tokenMap = []; // {TextToken => OccurrenceCount}
199-
$splitChars = static::$delimiters;
200-
$token = strtok($text, $splitChars);
205+
$softDelims = static::$softDelimiters;
206+
$tokenizer = new SearchTextTokenizer($text, static::$delimiters);
207+
$extendedToken = '';
208+
$extendedLen = 0;
209+
210+
$token = $tokenizer->next();
201211

202212
while ($token !== false) {
203-
if (!isset($tokenMap[$token])) {
204-
$tokenMap[$token] = 0;
213+
$delim = $tokenizer->previousDelimiter();
214+
215+
if ($delim && str_contains($softDelims, $delim) && $token !== '') {
216+
$extendedToken .= $delim . $token;
217+
$extendedLen++;
218+
} else {
219+
if ($extendedLen > 1) {
220+
$tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
221+
}
222+
$extendedToken = $token;
223+
$extendedLen = 1;
205224
}
206-
$tokenMap[$token]++;
207-
$token = strtok($splitChars);
225+
226+
if ($token) {
227+
$tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
228+
}
229+
230+
$token = $tokenizer->next();
231+
}
232+
233+
if ($extendedLen > 1) {
234+
$tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
208235
}
209236

210237
return $tokenMap;

app/Search/SearchOptions.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ protected static function decodeEscapes(string $input): string
181181
protected static function parseStandardTermString(string $termString): array
182182
{
183183
$terms = explode(' ', $termString);
184-
$indexDelimiters = SearchIndex::$delimiters;
184+
$indexDelimiters = implode('', array_diff(str_split(SearchIndex::$delimiters), str_split(SearchIndex::$softDelimiters)));
185185
$parsed = [
186186
'terms' => [],
187187
'exacts' => [],

app/Search/SearchTextTokenizer.php

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
<?php
2+
3+
namespace BookStack\Search;
4+
5+
/**
 * A custom text tokenizer which records & provides insight needed for our search indexing.
 * Compared to a basic strtok() call, this class additionally:
 * - Tracks and provides the current/previous delimiter that we've stopped at.
 * - Returns empty tokens upon parsing a delimiter.
 * - Reads both the text and the delimiter list as UTF-8 characters rather than single bytes,
 *   so a multi-byte delimiter is matched as a whole character and its individual bytes
 *   never split some other multi-byte character apart.
 */
class SearchTextTokenizer
{
    protected int $currentIndex = 0;
    protected int $length;
    protected string $currentDelimiter = '';
    protected string $previousDelimiter = '';

    /**
     * Set of delimiter characters, keyed by the (possibly multi-byte) character.
     *
     * @var array<string|int, true>
     */
    protected array $delimiterLookup = [];

    /**
     * @param string $text       The text to split into tokens.
     * @param string $delimiters Every character in this string is treated as a delimiter.
     */
    public function __construct(
        protected string $text,
        protected string $delimiters = ' '
    ) {
        $this->length = strlen($this->text);

        // Break the delimiter list into whole characters once, up-front,
        // so each lookup while scanning is a simple array key check.
        $delimiterBytes = strlen($this->delimiters);
        $offset = 0;
        while ($offset < $delimiterBytes) {
            $char = self::characterAt($this->delimiters, $offset);
            $this->delimiterLookup[$char] = true;
            $offset += strlen($char);
        }
    }

    /**
     * Get the delimiter that ended the most recently returned token.
     * This is an empty string before any token has been read, and when
     * the last returned token was ended by the end of the text.
     */
    public function currentDelimiter(): string
    {
        return $this->currentDelimiter;
    }

    /**
     * Get the delimiter found before the current one, which is the
     * delimiter that preceded the most recently returned token.
     */
    public function previousDelimiter(): string
    {
        return $this->previousDelimiter;
    }

    /**
     * Get the next token between delimiters.
     * An empty string is returned where a delimiter is found with no content before it.
     * Returns false if there's no further tokens.
     */
    public function next(): string|false
    {
        $start = $this->currentIndex;
        $index = $start;

        while ($index < $this->length) {
            $char = self::characterAt($this->text, $index);

            if (isset($this->delimiterLookup[$char])) {
                $this->previousDelimiter = $this->currentDelimiter;
                $this->currentDelimiter = $char;
                $this->currentIndex = $index + strlen($char);

                return substr($this->text, $start, $index - $start);
            }

            $index += strlen($char);
        }

        // Compare positions instead of testing the token for truthiness,
        // since a final token of "0" is falsy in PHP but is still a valid token.
        if ($index > $start) {
            $this->currentIndex = $this->length;
            $this->previousDelimiter = $this->currentDelimiter;
            $this->currentDelimiter = '';

            return substr($this->text, $start);
        }

        return false;
    }

    /**
     * Read the single UTF-8 character which starts at the given byte offset.
     * Bytes which do not form a valid sequence (a stray continuation byte, or a
     * lead byte without enough continuation bytes following) are returned on their
     * own, so malformed input is still consumed byte by byte instead of swallowing
     * the characters which follow it.
     */
    private static function characterAt(string $text, int $offset): string
    {
        $lead = ord($text[$offset]);
        if ($lead < 0x80) {
            return $text[$offset];
        }

        // Expected sequence length as indicated by the lead byte.
        $expected = match (true) {
            ($lead & 0xE0) === 0xC0 => 2,
            ($lead & 0xF0) === 0xE0 => 3,
            ($lead & 0xF8) === 0xF0 => 4,
            default => 1,
        };

        // Only extend over bytes which really are continuation bytes (10xxxxxx).
        $limit = min($offset + $expected, strlen($text));
        $end = $offset + 1;
        while ($end < $limit && (ord($text[$end]) & 0xC0) === 0x80) {
            $end++;
        }

        return substr($text, $offset, $end - $offset);
    }
}

tests/Entity/EntitySearchTest.php tests/Search/EntitySearchTest.php

+1-180
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
11
<?php
22

3-
namespace Tests\Entity;
3+
namespace Search;
44

55
use BookStack\Activity\Models\Tag;
66
use BookStack\Entities\Models\Book;
7-
use BookStack\Entities\Models\Bookshelf;
8-
use BookStack\Entities\Models\Chapter;
9-
use Illuminate\Support\Str;
107
use Tests\TestCase;
118

129
class EntitySearchTest extends TestCase
@@ -312,113 +309,6 @@ public function test_entity_template_selector_search()
312309
$defaultListTest->assertDontSee($templatePage->name);
313310
}
314311

315-
public function test_sibling_search_for_pages()
316-
{
317-
$chapter = $this->entities->chapterHasPages();
318-
$this->assertGreaterThan(2, count($chapter->pages), 'Ensure we\'re testing with at least 1 sibling');
319-
$page = $chapter->pages->first();
320-
321-
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$page->id}&entity_type=page");
322-
$search->assertSuccessful();
323-
foreach ($chapter->pages as $page) {
324-
$search->assertSee($page->name);
325-
}
326-
327-
$search->assertDontSee($chapter->name);
328-
}
329-
330-
public function test_sibling_search_for_pages_without_chapter()
331-
{
332-
$page = $this->entities->pageNotWithinChapter();
333-
$bookChildren = $page->book->getDirectVisibleChildren();
334-
$this->assertGreaterThan(2, count($bookChildren), 'Ensure we\'re testing with at least 1 sibling');
335-
336-
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$page->id}&entity_type=page");
337-
$search->assertSuccessful();
338-
foreach ($bookChildren as $child) {
339-
$search->assertSee($child->name);
340-
}
341-
342-
$search->assertDontSee($page->book->name);
343-
}
344-
345-
public function test_sibling_search_for_chapters()
346-
{
347-
$chapter = $this->entities->chapter();
348-
$bookChildren = $chapter->book->getDirectVisibleChildren();
349-
$this->assertGreaterThan(2, count($bookChildren), 'Ensure we\'re testing with at least 1 sibling');
350-
351-
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$chapter->id}&entity_type=chapter");
352-
$search->assertSuccessful();
353-
foreach ($bookChildren as $child) {
354-
$search->assertSee($child->name);
355-
}
356-
357-
$search->assertDontSee($chapter->book->name);
358-
}
359-
360-
public function test_sibling_search_for_books()
361-
{
362-
$books = Book::query()->take(10)->get();
363-
$book = $books->first();
364-
$this->assertGreaterThan(2, count($books), 'Ensure we\'re testing with at least 1 sibling');
365-
366-
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$book->id}&entity_type=book");
367-
$search->assertSuccessful();
368-
foreach ($books as $expectedBook) {
369-
$search->assertSee($expectedBook->name);
370-
}
371-
}
372-
373-
public function test_sibling_search_for_shelves()
374-
{
375-
$shelves = Bookshelf::query()->take(10)->get();
376-
$shelf = $shelves->first();
377-
$this->assertGreaterThan(2, count($shelves), 'Ensure we\'re testing with at least 1 sibling');
378-
379-
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$shelf->id}&entity_type=bookshelf");
380-
$search->assertSuccessful();
381-
foreach ($shelves as $expectedShelf) {
382-
$search->assertSee($expectedShelf->name);
383-
}
384-
}
385-
386-
public function test_sibling_search_for_books_provides_results_in_alphabetical_order()
387-
{
388-
$contextBook = $this->entities->book();
389-
$searchBook = $this->entities->book();
390-
391-
$searchBook->name = 'Zebras';
392-
$searchBook->save();
393-
394-
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextBook->id}&entity_type=book");
395-
$this->withHtml($search)->assertElementNotContains('a:first-child', 'Zebras');
396-
397-
$searchBook->name = '1AAAAAAArdvarks';
398-
$searchBook->save();
399-
400-
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextBook->id}&entity_type=book");
401-
$this->withHtml($search)->assertElementContains('a:first-child', '1AAAAAAArdvarks');
402-
}
403-
404-
public function test_sibling_search_for_shelves_provides_results_in_alphabetical_order()
405-
{
406-
$contextShelf = $this->entities->shelf();
407-
$searchShelf = $this->entities->shelf();
408-
409-
$searchShelf->name = 'Zebras';
410-
$searchShelf->save();
411-
412-
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextShelf->id}&entity_type=bookshelf");
413-
$this->withHtml($search)->assertElementNotContains('a:first-child', 'Zebras');
414-
415-
$searchShelf->name = '1AAAAAAArdvarks';
416-
$searchShelf->save();
417-
418-
$search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextShelf->id}&entity_type=bookshelf");
419-
$this->withHtml($search)->assertElementContains('a:first-child', '1AAAAAAArdvarks');
420-
}
421-
422312
public function test_search_works_on_updated_page_content()
423313
{
424314
$page = $this->entities->page();
@@ -453,75 +343,6 @@ public function test_search_ranks_common_words_lower()
453343
$this->withHtml($search)->assertElementContains('.entity-list > .page:nth-child(2)', 'Test page A');
454344
}
455345

456-
public function test_terms_in_headers_have_an_adjusted_index_score()
457-
{
458-
$page = $this->entities->newPage(['name' => 'Test page A', 'html' => '
459-
<p>TermA</p>
460-
<h1>TermB <strong>TermNested</strong></h1>
461-
<h2>TermC</h2>
462-
<h3>TermD</h3>
463-
<h4>TermE</h4>
464-
<h5>TermF</h5>
465-
<h6>TermG</h6>
466-
']);
467-
468-
$scoreByTerm = $page->searchTerms()->pluck('score', 'term');
469-
470-
$this->assertEquals(1, $scoreByTerm->get('TermA'));
471-
$this->assertEquals(10, $scoreByTerm->get('TermB'));
472-
$this->assertEquals(10, $scoreByTerm->get('TermNested'));
473-
$this->assertEquals(5, $scoreByTerm->get('TermC'));
474-
$this->assertEquals(4, $scoreByTerm->get('TermD'));
475-
$this->assertEquals(3, $scoreByTerm->get('TermE'));
476-
$this->assertEquals(2, $scoreByTerm->get('TermF'));
477-
// Is 1.5 but stored as integer, rounding up
478-
$this->assertEquals(2, $scoreByTerm->get('TermG'));
479-
}
480-
481-
public function test_indexing_works_as_expected_for_page_with_lots_of_terms()
482-
{
483-
$this->markTestSkipped('Time consuming test');
484-
485-
$count = 100000;
486-
$text = '';
487-
$chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_#';
488-
for ($i = 0; $i < $count; $i++) {
489-
$text .= substr(str_shuffle($chars), 0, 5) . ' ';
490-
}
491-
492-
$page = $this->entities->newPage(['name' => 'Test page A', 'html' => '<p>' . $text . '</p>']);
493-
494-
$termCount = $page->searchTerms()->count();
495-
496-
// Expect at least 90% unique rate
497-
$this->assertGreaterThan($count * 0.9, $termCount);
498-
}
499-
500-
public function test_name_and_content_terms_are_merged_to_single_score()
501-
{
502-
$page = $this->entities->newPage(['name' => 'TermA', 'html' => '
503-
<p>TermA</p>
504-
']);
505-
506-
$scoreByTerm = $page->searchTerms()->pluck('score', 'term');
507-
508-
// Scores 40 for being in the name then 1 for being in the content
509-
$this->assertEquals(41, $scoreByTerm->get('TermA'));
510-
}
511-
512-
public function test_tag_names_and_values_are_indexed_for_search()
513-
{
514-
$page = $this->entities->newPage(['name' => 'PageA', 'html' => '<p>content</p>', 'tags' => [
515-
['name' => 'Animal', 'value' => 'MeowieCat'],
516-
['name' => 'SuperImportant'],
517-
]]);
518-
519-
$scoreByTerm = $page->searchTerms()->pluck('score', 'term');
520-
$this->assertEquals(5, $scoreByTerm->get('MeowieCat'));
521-
$this->assertEquals(3, $scoreByTerm->get('Animal'));
522-
$this->assertEquals(3, $scoreByTerm->get('SuperImportant'));
523-
}
524-
525346
public function test_matching_terms_in_search_results_are_highlighted()
526347
{
527348
$this->entities->newPage(['name' => 'My Meowie Cat', 'html' => '<p>A superimportant page about meowieable animals</p>', 'tags' => [

0 commit comments

Comments
 (0)