pytorch · larryliu0820 · Jan 18, 2024
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import struct
+import tempfile
+import unittest
+from unittest.mock import patch
+
+from executorch.examples.models.llama2.tokenizer.tokenizer import Tokenizer
+
+
+class TestTokenizer(unittest.TestCase):
+    """Checks the fixed-size header that ``Tokenizer.export`` writes."""
+
+    @patch(
+        "executorch.examples.models.llama2.tokenizer.tokenizer.SentencePieceProcessor"
+    )
+    def test_export(self, mock_sp):
+        """Export a mocked, empty vocabulary and verify the four header fields."""
+        # Stub the SentencePiece model: empty vocabulary, BOS id 1, EOS id 2.
+        fake_model = mock_sp.return_value
+        fake_model.vocab_size.return_value = 0
+        fake_model.bos_id.return_value = 1
+        fake_model.eos_id.return_value = 2
+        fake_model.get_piece_size.return_value = 0
+
+        # A temporary file stands in for the model path handed to Tokenizer;
+        # SentencePieceProcessor itself is patched out above.
+        with tempfile.NamedTemporaryFile(delete=True) as model_file:
+            tokenizer = Tokenizer(model_file.name)
+
+            # A second temporary file receives the exported binary.
+            with tempfile.NamedTemporaryFile(delete=True) as exported:
+                tokenizer.export(exported.name)
+
+                # The header is the first 16 bytes: four unsigned ints.
+                with open(exported.name, "rb") as stream:
+                    header = stream.read(16)
+                fields = struct.unpack("IIII", header)
+                vocab_size, bos_id, eos_id, max_token_length = fields
+
+                # The header must echo the values the mocked model reported.
+                self.assertEqual(vocab_size, 0)
+                self.assertEqual(bos_id, 1)
+                self.assertEqual(eos_id, 2)
+                # No tokens were exported, so the longest token has length 0.
+                self.assertEqual(max_token_length, 0)
@@ -0,0 +1,145 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+# Script to rewrite tokenizer model given by sentencepiece, with lightweight
+# postprocessing logic.
+
+import argparse
+import logging
+import os
+import struct
+from typing import List
+
+from sentencepiece import SentencePieceProcessor as SentencePieceProcessor
+
+
+class Tokenizer:
+    """Wrapper around a SentencePiece model that can re-serialize its vocabulary.
+
+    After construction the instance exposes ``sp_model`` (the loaded
+    processor), ``model_path``, ``n_words`` (vocabulary size), ``bos_id`` and
+    ``eos_id``.
+    """
+
+    def __init__(self, model_path: str):
+        """
+        Load the SentencePiece model stored at ``model_path``.
+
+        :param model_path: path to an existing tokenizer model file.
+
+        :raises AssertionError: if ``model_path`` is not a file, or if the model's
+            ``vocab_size()`` and ``get_piece_size()`` disagree.
+        """
+        assert os.path.isfile(
+            model_path
+        ), f"Need a valid tokenizer model path but got {model_path}"
+        self.sp_model = SentencePieceProcessor(model_file=model_path)
+        self.model_path = model_path
+
+        # Vocabulary size and BOS / EOS token IDs
+        self.n_words: int = self.sp_model.vocab_size()
+        self.bos_id: int = self.sp_model.bos_id()
+        self.eos_id: int = self.sp_model.eos_id()
+        logging.info(
+            "#words: %s - BOS ID: %s - EOS ID: %s",
+            self.n_words,
+            self.bos_id,
+            self.eos_id,
+        )
+        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
+
+    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
+        """
+        Encode ``s`` into token ids with the underlying SentencePiece model.
+
+        :param s: text to encode; must be a ``str``.
+        :param bos: if true, put ``self.bos_id`` in front of the ids.
+        :param eos: if true, put ``self.eos_id`` after the ids.
+
+        :return: the list of token ids.
+        """
+        assert isinstance(s, str)
+        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `encode`.
+        ids = self.sp_model.encode(s)
+        if bos:
+            ids = [self.bos_id] + ids
+        if eos:
+            ids = ids + [self.eos_id]
+        return ids
+
+    def decode(self, t: List[int]) -> str:
+        """
+        Decode the token ids ``t`` with the underlying SentencePiece model.
+
+        :param t: token ids to decode.
+
+        :return: whatever ``sp_model.decode`` produces for ``t``.
+        """
+        # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `decode`.
+        return self.sp_model.decode(t)
+
+    def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
+        """
+        Export tokenizer.model to another serialization format. Some lightweight
+        processing is applied: optionally prepending a padding token, recording
+        the max token length, and replacing '▁' (U+2581) with a plain space.
+
+        The binary format is:
+        1. vocab size: int32
+        2. bos id: int32
+        3. eos id: int32
+        4. max token length: int32
+        5. score: float32, len of bytes: int32, token bytes: [byte] for each token
+
+        The fields are packed with ``struct`` formats "IIII" and "fI", i.e.
+        without an explicit byte-order prefix.
+
+        NOTE(review): the header always records ``self.n_words`` as the vocab
+        size, so with ``prepend_padding=True`` the file holds one more token
+        record than the header states. Confirm this is what readers expect.
+
+        NOTE(review): "I" is an unsigned format, so a negative ``bos_id`` or
+        ``eos_id`` cannot be packed. Confirm whether such ids can occur.
+
+        :param output_path: output path of the new binary.
+        :param prepend_padding: a boolean to control if we want to prepend a padding token.
+
+        :return: None
+        """
+
+        # get all the tokens (postprocessed) and their scores as floats
+        tokens, scores = [], []
+
+        if prepend_padding:
+            # Here we use the default padding token and its score.
+            tokens.append("<pad>".encode("utf-8"))
+            scores.append(-1)
+
+        for i in range(self.n_words):
+            # decode the token and light postprocessing
+            # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `id_to_piece`.
+            piece = self.sp_model.id_to_piece(i)
+            # pyre-fixme[16]: `SentencePieceProcessor` has no attribute `get_score`.
+            score = self.sp_model.get_score(i)
+            # sentencepiece use '<s>' as BOS and '</s>' for EOS
+            if i == self.bos_id:
+                piece = "<s>"
+            elif i == self.eos_id:
+                piece = "</s>"
+            # sentencepiece uses this character as whitespace
+            piece = piece.replace("▁", " ")
+
+            tokens.append(piece.encode("utf-8"))  # bytes of this token
+            scores.append(score)
+
+        # record the max token length (in bytes); 0 when nothing is exported
+        max_token_length = max((len(token) for token in tokens), default=0)
+
+        # write to a binary file
+        with open(output_path, "wb") as f:
+            # write the vocab size, bos/eos ids and max token length
+            f.write(
+                struct.pack(
+                    "IIII", self.n_words, self.bos_id, self.eos_id, max_token_length
+                )
+            )
+            # one record per token: score, byte length, then the raw bytes
+            for token_bytes, score in zip(tokens, scores):
+                f.write(struct.pack("fI", score, len(token_bytes)))
+                f.write(token_bytes)
+        logging.info("Wrote tokenizer to %s", output_path)
+
+
+def _default_output_path(model_path: str) -> str:
+    """Derive the default export path: a trailing ".model" becomes ".bin"."""
+    suffix = ".model"
+    # Only a trailing ".model" is swapped. A plain str.replace would also
+    # rewrite ".model" inside directory names, and would return the input path
+    # unchanged when it has no ".model", making the export overwrite the model.
+    if model_path.endswith(suffix):
+        return model_path[: -len(suffix)] + ".bin"
+    return model_path + ".bin"
+
+
+if __name__ == "__main__":
+    # Command-line entry point: load a sentencepiece model and export it.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-t",
+        "--tokenizer-model",
+        type=str,
+        default="tokenizer.model",
+        help="path to tokenizer model, given by sentencepiece",
+    )
+    parser.add_argument(
+        "-o",
+        "--output-path",
+        type=str,
+        default=None,
+        help="output path of postprocessed tokenizer model",
+    )
+    parser.add_argument(
+        "-p",
+        "--prepend-padding",
+        action="store_true",
+        help="whether to prepend a padding token to the beginning of the tokenizer",
+    )
+
+    args = parser.parse_args()
+
+    tokenizer = Tokenizer(args.tokenizer_model)
+
+    # An explicit -o wins; otherwise derive the path from the model path.
+    output_path = args.output_path or _default_output_path(args.tokenizer_model)
+    tokenizer.export(output_path, prepend_padding=args.prepend_padding)