Skip to content

Commit e8a8a9a

Browse files
fix: improve Unigram Tokenizer handling of multibyte strings
1 parent 7e880dd commit e8a8a9a

File tree

4 files changed

+88
-89
lines changed

4 files changed

+88
-89
lines changed

Diff for: src/DataStructures/CharTrie.php

+8-4
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,10 @@ public function extend(array $texts): void
4040
public function push(string $text): void
4141
{
4242
$node = $this->root;
43+
$length = mb_strlen($text);
4344

44-
for ($i = 0, $length = strlen($text); $i < $length; $i++) {
45-
$ch = $text[$i];
45+
for ($i = 0; $i < $length; $i++) {
46+
$ch = mb_substr($text, $i, 1);
4647
$node = $node->getChild($ch);
4748
}
4849

@@ -59,10 +60,13 @@ public function commonPrefixSearch(string $text): Generator
5960
{
6061
$node = $this->root;
6162
$prefix = "";
62-
for ($i = 0; $i < strlen($text) && $node != null; $i++) {
63-
$ch = $text[$i];
63+
$length = mb_strlen($text);
64+
65+
for ($i = 0; $i < $length && $node != null; $i++) {
66+
$ch = mb_substr($text, $i, 1);
6467
$prefix .= $ch;
6568
$node = $node->getChild($ch);
69+
6670
if ($node?->isLeaf) {
6771
yield $prefix;
6872
}

Diff for: src/DataStructures/TokenLattice.php

+2-2
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public function __construct(
3636
public ?int $bosTokenId,
3737
public ?int $eosTokenId)
3838
{
39-
$this->len = strlen($sentence);
39+
$this->len = mb_strlen($sentence);
4040
$this->beginNodes = array_fill(0, $this->len + 1, []);
4141
$this->endNodes = array_fill(0, $this->len + 1, []);
4242

@@ -124,7 +124,7 @@ public function viterbi(): array
124124
*/
125125
public function piece(TokenLatticeNode $node): string
126126
{
127-
return substr($this->sentence, $node->pos, $node->length);
127+
return mb_substr($this->sentence, $node->pos, $node->length);
128128
}
129129

130130
/**

0 commit comments

Comments
 (0)