
Commit 00587b5

[llm] Add a tokenizer python script
Summary: Add a tokenizer Python script that adds some post-processing to the vanilla `sentencepiece` tokenizer model. This comes in handy when we want to consume it in C++.

Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
1 parent 91a2310 commit 00587b5

1 file changed: +132 −0 lines changed

@@ -0,0 +1,132 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


# Script to rewrite tokenizer model given by sentencepiece, with lightweight
# postprocessing logic.

import argparse
import logging
import os
import struct
from typing import List

from sentencepiece import SentencePieceProcessor


class Tokenizer:
    def __init__(self, model_path: str):
        assert os.path.isfile(
            model_path
        ), f"Need a valid tokenizer model path but got {model_path}"
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        self.model_path = model_path

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        logging.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `encode`.
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `decode`.
        return self.sp_model.decode(t)

    def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
        """
        Export tokenizer.model to another serialization format. Here we do some lightweight
        processing, such as optionally prepending a padding token, prepending the max token
        length, and replacing '▁' with a plain space.

        :param output_path: output path of the new binary.
        :param prepend_padding: a boolean to control if we want to prepend a padding token.

        :return: None
        """

        # get all the tokens (postprocessed) and their scores as floats
        tokens, scores = [], []

        if prepend_padding:
            # Here we use the default padding token and its score.
            tokens.append("<pad>".encode("utf-8"))
            scores.append(-1)

        for i in range(self.n_words):

            # decode the token and do light postprocessing
            # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `id_to_piece`.
            t = self.sp_model.id_to_piece(i)
            # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `get_score`.
            s = self.sp_model.get_score(i)
            if i == self.bos_id:
                t = "\n<s>\n"
            elif i == self.eos_id:
                t = "\n</s>\n"
            t = t.replace("▁", " ")  # sentencepiece uses this character as whitespace
            b = t.encode("utf-8")  # bytes of this token, utf-8 encoded

            tokens.append(b)
            scores.append(s)

        # record the max token length
        max_token_length = max(len(t) for t in tokens)

        # write to a binary file
        with open(output_path, "wb") as f:
            f.write(struct.pack("I", max_token_length))
            for token_bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(token_bytes)))
                f.write(token_bytes)
        logging.info(f"Wrote tokenizer to {output_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        "--tokenizer-model",
        type=str,
        default="tokenizer.model",
        help="path to tokenizer model, given by sentencepiece",
    )
    parser.add_argument(
        "-o",
        "--output-path",
        type=str,
        default=None,
        help="output path of postprocessed tokenizer model",
    )
    parser.add_argument(
        "-p",
        "--prepend-padding",
        action="store_true",
        help="whether to prepend a padding token to the beginning of the tokenizer",
    )

    args = parser.parse_args()

    t = Tokenizer(args.tokenizer_model)

    output_path = (
        args.output_path
        if args.output_path
        else args.tokenizer_model.replace(".model", ".bin")
    )
    t.export(output_path, prepend_padding=args.prepend_padding)
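The export() method above fixes the on-disk layout: a uint32 max token length, followed by one record per token consisting of a float32 score, a uint32 byte length, and the raw UTF-8 bytes. Note that struct.pack without a byte-order prefix uses native byte order and sizes, so any C++ reader has to match the writing platform. As a minimal sketch of what a consumer of that layout looks like (the commit targets a C++ reader; this Python version is only illustrative, and the name load_tokenizer_bin is hypothetical, not part of the commit):

# Minimal illustrative sketch (not part of the commit): read back the .bin
# written by Tokenizer.export above. The field layout mirrors the struct.pack
# calls: a uint32 max token length, then per token a float32 score, a uint32
# byte length, and the raw UTF-8 bytes of the token.
import struct
from typing import List, Tuple


def load_tokenizer_bin(path: str) -> Tuple[int, List[Tuple[float, bytes]]]:
    entries: List[Tuple[float, bytes]] = []
    with open(path, "rb") as f:
        # uint32 written first by export(): the maximum token length
        (max_token_length,) = struct.unpack("I", f.read(struct.calcsize("I")))
        while True:
            header = f.read(struct.calcsize("fI"))
            if not header:
                break
            # float32 score followed by uint32 byte length of the token
            score, length = struct.unpack("fI", header)
            entries.append((score, f.read(length)))
    return max_token_length, entries

Run with the defaults, the script reads tokenizer.model and writes the postprocessed vocabulary next to it as tokenizer.bin; pass -p / --prepend-padding to emit the <pad> entry before the regular vocabulary.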
