
Commit e42ea75

larryliu0820 authored and facebook-github-bot committed
Add a tokenizer python script (pytorch#1611)
Summary: Add a tokenizer Python script that adds some post-processing to the vanilla `sentencepiece` tokenizer model. This comes in handy when we want to consume it in C++.

Pull Request resolved: pytorch#1611
Differential Revision: D52821402
Pulled By: larryliu0820
fbshipit-source-id: a9b10b37a3157f00983c7ce0f0badeefbee1aa4a
1 parent fa50ded commit e42ea75
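For context, the new script can be run from the repository root, e.g. `python examples/models/llama2/tokenizer/tokenizer.py -t tokenizer.model -o tokenizer.bin` (add `-p` to prepend a padding token); if `-o` is omitted, the output path is derived by replacing `.model` with `.bin`. A minimal programmatic equivalent, assuming a valid SentencePiece model file named `tokenizer.model` in the working directory, is sketched below; it is not part of this commit.

# Sketch of programmatic use; assumes a real SentencePiece "tokenizer.model"
# exists in the working directory (not part of this commit).
from executorch.examples.models.llama2.tokenizer.tokenizer import Tokenizer

tok = Tokenizer("tokenizer.model")
tok.export("tokenizer.bin", prepend_padding=False)  # writes the post-processed binary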

File tree

4 files changed: +192, -0 lines changed


examples/models/llama2/tokenizer/__init__.py

Whitespace-only changes.

examples/models/llama2/tokenizer/test/__init__.py

Whitespace-only changes.
examples/models/llama2/tokenizer/test/test_tokenizer.py
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


import struct
import tempfile
import unittest
from unittest.mock import patch

from executorch.examples.models.llama2.tokenizer.tokenizer import Tokenizer


class TestTokenizer(unittest.TestCase):
    @patch(
        "executorch.examples.models.llama2.tokenizer.tokenizer.SentencePieceProcessor"
    )
    def test_export(self, mock_sp):
        # Set up the mock SentencePieceProcessor
        mock_sp.return_value.vocab_size.return_value = 0
        mock_sp.return_value.bos_id.return_value = 1
        mock_sp.return_value.eos_id.return_value = 2
        mock_sp.return_value.get_piece_size.return_value = 0
        # Create a temporary file
        with tempfile.NamedTemporaryFile(delete=True) as temp:
            # Initialize the tokenizer with the temporary file as the model
            tokenizer = Tokenizer(temp.name)
            # Export the tokenizer to another temporary file
            with open("/tmp/test.bin", "wb") as output:
                tokenizer.export(output.name)
                # Open the output file in binary mode and read the first 16 bytes
                with open(output.name, "rb") as f:
                    data = f.read(16)
                    # Unpack the data as 4 integers
                    vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
                        "IIII", data
                    )
                    # Check that the integers match the properties of the tokenizer
                    self.assertEqual(vocab_size, 0)
                    self.assertEqual(bos_id, 1)
                    self.assertEqual(eos_id, 2)
                    # Check that the max token length is correct
                    self.assertEqual(max_token_length, 0)
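The test above only validates the 16-byte header. For reference, a minimal reader for the full binary that `Tokenizer.export` produces (header plus per-token records, documented in the `export` docstring in the next file) might look like the sketch below; `load_tokenizer_bin` is a hypothetical helper and not part of this commit.

# Sketch of a reader for the exported binary; `load_tokenizer_bin` is hypothetical.
import struct
from typing import List, Tuple


def load_tokenizer_bin(path: str) -> Tuple[int, int, int, int, List[Tuple[float, bytes]]]:
    with open(path, "rb") as f:
        # Header: vocab size, bos id, eos id, max token length (four uint32 values).
        vocab_size, bos_id, eos_id, max_token_length = struct.unpack("IIII", f.read(16))
        entries = []
        # Each record: score (float32) and byte length (uint32), then the token bytes.
        # Read until EOF instead of looping `vocab_size` times, because
        # --prepend-padding adds one extra record without changing the stored vocab size.
        while True:
            record_header = f.read(8)
            if len(record_header) < 8:
                break
            score, length = struct.unpack("fI", record_header)
            entries.append((score, f.read(length)))
    return vocab_size, bos_id, eos_id, max_token_length, entries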
examples/models/llama2/tokenizer/tokenizer.py
Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


# Script to rewrite tokenizer model given by sentencepiece, with lightweight
# postprocessing logic.

import argparse
import logging
import os
import struct
from typing import List

from sentencepiece import SentencePieceProcessor as SentencePieceProcessor


class Tokenizer:
    def __init__(self, model_path: str):
        assert os.path.isfile(
            model_path
        ), f"Need a valid tokenizer model path but got {model_path}"
        # pyre-fixme[28]: Unexpected keyword argument `model_file` to call `SentencePieceProcessor.__init__`.
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        self.model_path = model_path

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        logging.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `get_piece_size`.
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `encode`.
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `decode`.
        return self.sp_model.decode(t)

    def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
        """
        Export tokenizer.model to another serialization format. Here we did some lightweight
        processing such as supporting prepend padding token, prepend max token length and
        replace '_' back to empty space.

        The binary format is:
        1. vocab size: int32
        2. bos id: int32
        3. eos id: int32
        4. max token length: int32
        5. score: float32, len of bytes: int32, token bytes: [byte] for each token

        :param output_path: output path of the new binary.
        :param prepend_padding: a boolean to control if we want to prepend a padding token.

        :return: None
        """

        # get all the tokens (postprocessed) and their scores as floats
        tokens, scores = [], []

        if prepend_padding:
            # Here we use the default padding token and its score.
            tokens.append("<pad>".encode("utf-8"))
            scores.append(-1)

        for i in range(self.n_words):

            # decode the token and light postprocessing
            # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `id_to_piece`.
            t = self.sp_model.id_to_piece(i)
            # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `get_score`.
            s = self.sp_model.get_score(i)
            # sentencepiece uses '<s>' as BOS and '</s>' as EOS
            if i == self.bos_id:
                t = "<s>"
            elif i == self.eos_id:
                t = "</s>"
            t = t.replace("▁", " ")  # sentencepiece uses this character as whitespace
            b = t.encode("utf-8")  # bytes of this token, utf-8 encoded

            tokens.append(b)
            scores.append(s)

        # record the max token length
        max_token_length = 0 if not tokens else max(len(t) for t in tokens)

        # write to a binary file
        with open(output_path, "wb") as f:
            # write the vocab size, bos/eos ids and max token length
            f.write(
                struct.pack(
                    "IIII", self.n_words, self.bos_id, self.eos_id, max_token_length
                )
            )
            for bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(bytes)))
                f.write(bytes)
        logging.info(f"Wrote tokenizer to {output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        "--tokenizer-model",
        type=str,
        default="tokenizer.model",
        help="path to tokenizer model, given by sentencepiece",
    )
    parser.add_argument(
        "-o",
        "--output-path",
        type=str,
        default=None,
        help="output path of postprocessed tokenizer model",
    )
    parser.add_argument(
        "-p",
        "--prepend-padding",
        action="store_true",
        help="whether to prepend a padding token to the beginning of the tokenizer",
    )

    args = parser.parse_args()

    t = Tokenizer(args.tokenizer_model)

    output_path = (
        args.output_path
        if args.output_path
        else args.tokenizer_model.replace(".model", ".bin")
    )
    t.export(output_path, prepend_padding=args.prepend_padding)
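Beyond `export`, the class also wraps SentencePiece's encode/decode. A short round-trip sketch, again assuming a real `tokenizer.model` on disk (not part of this commit):

# Sketch of encode/decode usage; assumes a real SentencePiece "tokenizer.model".
from executorch.examples.models.llama2.tokenizer.tokenizer import Tokenizer

tok = Tokenizer("tokenizer.model")
ids = tok.encode("Hello world", bos=True, eos=True)  # prepends bos_id, appends eos_id
text = tok.decode(ids)  # SentencePiece maps the ids back to text
print(ids, text)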

0 commit comments
