Skip to content

Commit f444992

Browse files
committed
Searching: Added custom tokenizer that considers soft delimiters.
This changes indexing so that, for periods and hyphens, "a.b" is now indexed as "a", "b" AND "a.b" instead of just the first two, allowing terms that contain those characters to be searched in their full form. Also adds hyphens as a delimiter - #2095
1 parent 45a15b4 commit f444992

File tree

4 files changed

+121
-8
lines changed

4 files changed

+121
-8
lines changed

app/Search/SearchIndex.php

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,13 @@ class SearchIndex
1616
/**
1717
* A list of delimiter characters used to break-up parsed content into terms for indexing.
1818
*/
19-
public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
19+
public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"";
20+
21+
/**
22+
* A list of delimiters which are commonly used within a single term but can also indicate a break between terms.
23+
* The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
24+
*/
25+
public static string $softDelimiters = ".-";
2026

2127
public function __construct(
2228
protected EntityProvider $entityProvider
@@ -196,15 +202,36 @@ protected function generateTermScoreMapFromTags(array $tags): array
196202
protected function textToTermCountMap(string $text): array
197203
{
198204
$tokenMap = []; // {TextToken => OccurrenceCount}
199-
$splitChars = static::$delimiters;
200-
$token = strtok($text, $splitChars);
205+
$softDelims = static::$softDelimiters;
206+
$tokenizer = new SearchTextTokenizer($text, static::$delimiters);
207+
$extendedToken = '';
208+
$extendedLen = 0;
209+
210+
$token = $tokenizer->next();
201211

202212
while ($token !== false) {
203-
if (!isset($tokenMap[$token])) {
204-
$tokenMap[$token] = 0;
213+
$delim = $tokenizer->previousDelimiter();
214+
215+
if ($delim && str_contains($softDelims, $delim) && $token !== '') {
216+
$extendedToken .= $delim . $token;
217+
$extendedLen++;
218+
} else {
219+
if ($extendedLen > 1) {
220+
$tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
221+
}
222+
$extendedToken = $token;
223+
$extendedLen = 1;
205224
}
206-
$tokenMap[$token]++;
207-
$token = strtok($splitChars);
225+
226+
if ($token) {
227+
$tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
228+
}
229+
230+
$token = $tokenizer->next();
231+
}
232+
233+
if ($extendedLen > 1) {
234+
$tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
208235
}
209236

210237
return $tokenMap;

app/Search/SearchOptions.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ protected static function decodeEscapes(string $input): string
181181
protected static function parseStandardTermString(string $termString): array
182182
{
183183
$terms = explode(' ', $termString);
184-
$indexDelimiters = SearchIndex::$delimiters;
184+
$indexDelimiters = implode('', array_diff(str_split(SearchIndex::$delimiters), str_split(SearchIndex::$softDelimiters)));
185185
$parsed = [
186186
'terms' => [],
187187
'exacts' => [],

app/Search/SearchTextTokenizer.php

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
<?php
2+
3+
namespace BookStack\Search;
4+
5+
/**
 * A custom text tokenizer which records & provides insight needed for our search indexing.
 * We used to use basic strtok() but this class does the following which that lacked:
 * - Tracks and provides the current/previous delimiter that we've stopped at.
 * - Returns empty tokens upon parsing a delimiter.
 *
 * The text is scanned byte-by-byte, so delimiters are expected to be single-byte characters.
 */
class SearchTextTokenizer
{
    /** Byte offset in the text at which the next token starts. */
    protected int $currentIndex = 0;

    /** Length of the text, in bytes. */
    protected int $length;

    /** Delimiter that terminated the most recently returned token ('' if it ran to the end of the text). */
    protected string $currentDelimiter = '';

    /** Delimiter that immediately preceded the most recently returned token ('' if there was none). */
    protected string $previousDelimiter = '';

    /**
     * @param string $text       The text to split into tokens.
     * @param string $delimiters Each character of this string is treated as a token delimiter.
     */
    public function __construct(
        protected string $text,
        protected string $delimiters = ' '
    ) {
        $this->length = strlen($this->text);
    }

    /**
     * Get the delimiter which ended the most recently returned token.
     * This is an empty string before any token has been read, or when
     * the last token was ended by the end of the text instead of a delimiter.
     */
    public function currentDelimiter(): string
    {
        return $this->currentDelimiter;
    }

    /**
     * Get the delimiter which sat directly before the most recently returned token.
     * This is an empty string if no delimiter came before that token.
     */
    public function previousDelimiter(): string
    {
        return $this->previousDelimiter;
    }

    /**
     * Get the next token between delimiters.
     * An empty string is returned where a delimiter is found with no content before it.
     * Returns false if there's no further tokens.
     */
    public function next(): string|false
    {
        // Only the end of the text signals that we're out of tokens. This must not be
        // decided via the truthiness of the token since a token of "0" is falsy in PHP.
        if ($this->currentIndex >= $this->length) {
            return false;
        }

        // Measure the run of non-delimiter bytes from our current position.
        $tokenLength = strcspn($this->text, $this->delimiters, $this->currentIndex);
        $token = substr($this->text, $this->currentIndex, $tokenLength);
        $delimiterIndex = $this->currentIndex + $tokenLength;

        $this->previousDelimiter = $this->currentDelimiter;

        if ($delimiterIndex < $this->length) {
            // Stopped on a delimiter; record it and continue after it on the next call.
            $this->currentDelimiter = $this->text[$delimiterIndex];
            $this->currentIndex = $delimiterIndex + 1;
        } else {
            // Token ran up to the end of the text, with no closing delimiter.
            $this->currentDelimiter = '';
            $this->currentIndex = $this->length;
        }

        return $token;
    }
}

tests/Search/SearchIndexingTest.php

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,20 @@ public function test_tag_names_and_values_are_indexed_for_search()
7474
$this->assertEquals(3, $scoreByTerm->get('Animal'));
7575
$this->assertEquals(3, $scoreByTerm->get('SuperImportant'));
7676
}
77+
78+
/**
 * Terms joined by a soft delimiter (period or hyphen) should be indexed both in
 * their full original form and as their individual parts, while a dangling soft
 * delimiter should not create a combined or delimiter-suffixed term.
 */
public function test_terms_containing_punctuation_within_retain_original_form_and_split_form_in_index()
{
    $html = '<p>super.duper awesome-beans big- barry cheese.</p><p>biscuits</p><p>a-bs</p>';
    $page = $this->entities->newPage(['html' => $html]);
    $scoreByTerm = $page->searchTerms()->pluck('score', 'term');

    // Terms which must exist in the index: both the joined and the split forms.
    $indexedTerms = [
        'super', 'duper', 'super.duper',
        'awesome-beans', 'awesome', 'beans',
        'big', 'barry', 'cheese', 'biscuits',
        'a-bs', 'a', 'bs',
    ];

    // Terms which must not exist: trailing delimiters & joins across whitespace/elements.
    $absentTerms = [
        'big-',
        'big-barry',
        'cheese.',
        'cheese.biscuits',
    ];

    foreach ($indexedTerms as $term) {
        $score = $scoreByTerm->get($term);
        $this->assertNotNull($score, "Failed asserting that \"$term\" is indexed");
    }

    foreach ($absentTerms as $term) {
        $score = $scoreByTerm->get($term);
        $this->assertNull($score, "Failed asserting that \"$term\" is not indexed");
    }
}
7793
}

0 commit comments

Comments
 (0)