pytorch · larryliu0820 · Jan 19, 2024 · Jan 19, 2024
@@ -0,0 +1,33 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines the tokenizer library and, outside of OSS builds, its C++ test.
+
+    `tokenizer_lib` is always declared. When `runtime.is_oss` is false, this
+    also declares `tokenizer_file` (an export of `test/test.bin`) and the
+    `test_tokenizer_cpp` test, which consumes that file as a resource.
+    """
+    runtime.cxx_library(
+        name = "tokenizer_lib",
+        srcs = ["tokenizer.cpp"],
+        # Exported so that dependents (e.g. the test below, which includes
+        # tokenizer.h) can see the header; plain `headers` are private to
+        # this target.
+        exported_headers = ["tokenizer.h"],
+        exported_deps = [
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/kernel:kernel_includes",
+        ],
+        visibility = [
+            "//executorch/...",
+        ],
+    )
+
+    if not runtime.is_oss:
+        # The test below relies on `resources`, for which there is no support
+        # in OSS builds, so these targets are only declared internally.
+        runtime.export_file(
+            name = "tokenizer_file",
+            src = "test/test.bin",
+        )
+
+        runtime.cxx_test(
+            name = "test_tokenizer_cpp",
+            srcs = ["test/test_tokenizer.cpp"],
+            deps = [
+                ":tokenizer_lib",
+                "//executorch/codegen:macros",
+                "fbsource//xplat/tools/cxx:resources",
+            ],
+            resources = [":tokenizer_file"],
+        )
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <gtest/gtest.h>
+#include "tools/cxx/Resources.h"
+
+using namespace ::testing;
+
+namespace torch {
+namespace executor {
+
+// Test fixture: before each test it initializes the runtime, resolves the
+// path of the bundled test.bin resource, and creates a fresh Tokenizer
+// constructed with 32000.
+class TokenizerExtensionTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    torch::executor::runtime_init();
+
+    // Look up the on-disk location of the tokenizer resource.
+    auto resourcePath = build::getResourcePath(
+        "executorch/examples/models/llama2/tokenizer/test/test.bin");
+    modelPath_ = resourcePath.string();
+
+    tokenizer_.reset(new Tokenizer(32000));
+  }
+
+  // Tokenizer under test; recreated by SetUp().
+  std::unique_ptr<Tokenizer> tokenizer_;
+  // Path of the test.bin resource, as a string.
+  std::string modelPath_;
+};
+
+// encode() on a tokenizer that has not been loaded reports NotSupported.
+TEST_F(TokenizerExtensionTest, EncodeWithoutLoadFails) {
+  const Error status =
+      tokenizer_->encode("hello world", 0, 0, nullptr, nullptr);
+  EXPECT_EQ(status, Error::NotSupported);
+}
+
+// decode() on a tokenizer that has not been loaded reports NotSupported.
+TEST_F(TokenizerExtensionTest, DecodeWithoutLoadFails) {
+  auto decoded = tokenizer_->decode(0, 0);
+  const auto status = decoded.error();
+  EXPECT_EQ(status, Error::NotSupported);
+}
+
+// Loading test.bin succeeds, and the loaded tokenizer reports the expected
+// vocab size and BOS/EOS token ids.
+TEST_F(TokenizerExtensionTest, TokenizerVocabSizeIsExpected) {
+  Error res = tokenizer_->load(modelPath_.c_str());
+  // Abort the test on a failed load: the checks below are only meaningful
+  // for a tokenizer that loaded successfully.
+  ASSERT_EQ(res, Error::Ok);
+  // test.bin has vocab size 0, but the tokenizer respects the vocab size
+  // passed to its constructor and adds placeholder tokens.
+  EXPECT_EQ(tokenizer_->vocab_size(), 32000);
+  EXPECT_EQ(tokenizer_->bos_tok(), 1);
+  EXPECT_EQ(tokenizer_->eos_tok(), 2);
+}
+
+} // namespace executor
+} // namespace torch
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import struct
+import tempfile
+import unittest
+from unittest.mock import patch
+
+from executorch.examples.models.llama2.tokenizer.tokenizer import Tokenizer
+
+
+class TestTokenizer(unittest.TestCase):
+    @patch(
+        "executorch.examples.models.llama2.tokenizer.tokenizer.SentencePieceProcessor"
+    )
+    def test_export(self, mock_sp):
+        # Set up the mock SentencePieceProcessor
+        mock_sp.return_value.vocab_size.return_value = 0
+        mock_sp.return_value.bos_id.return_value = 1
+        mock_sp.return_value.eos_id.return_value = 2
+        mock_sp.return_value.get_piece_size.return_value = 0
+        # Create a temporary file
+        with tempfile.NamedTemporaryFile(delete=True) as temp:
+            # Initialize the tokenizer with the temporary file as the model
+            tokenizer = Tokenizer(temp.name)
+            # Export the tokenizer to another temporary file
+            with open("/tmp/test.bin", "wb") as output:
+                tokenizer.export(output.name)
+                # Open the output file in binary mode and read the first 16 bytes
+                with open(output.name, "rb") as f:
+                    data = f.read(16)
+                # Unpack the data as 4 integers
+                vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
+                    "IIII", data
+                )
+                # Check that the integers match the properties of the tokenizer
+                self.assertEqual(vocab_size, 0)
+                self.assertEqual(bos_id, 1)
+                self.assertEqual(eos_id, 2)
+                # Check that the max token length is correct
+                self.assertEqual(max_token_length, 0)