
Commit 6aab49e

larryliu0820 authored and facebook-github-bot committed
Add a tokenizer (#1641)
Summary:
Pull Request resolved: #1641

Add a tokenizer in examples. This needs to consume the artifact generated by `tokenizer.py`.

Reviewed By: mikekgfb

Differential Revision: D52894344

fbshipit-source-id: e3f03fa5cd2d2607ef2bfd371125de1f72adc968
1 parent 78ccd2e commit 6aab49e

5 files changed (+494, -0)
Buck build targets for the tokenizer library and its test:

@@ -0,0 +1,33 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

def define_common_targets():
    runtime.cxx_library(
        name = "tokenizer_lib",
        srcs = ["tokenizer.cpp"],
        headers = ["tokenizer.h"],
        exported_deps = [
            "//executorch/runtime/core/exec_aten:lib",
            "//executorch/runtime/kernel:kernel_includes",
        ],
        visibility = [
            "//executorch/...",
        ],
    )

    if not runtime.is_oss:
        # no resources support
        runtime.export_file(
            name = "tokenizer_file",
            src = "test/test.bin",
        )

        runtime.cxx_test(
            name = "test_tokenizer_cpp",
            srcs = ["test/test_tokenizer.cpp"],
            deps = [
                ":tokenizer_lib",
                "//executorch/codegen:macros",
                "fbsource//xplat/tools/cxx:resources",
            ],
            resources = [":tokenizer_file"],
        )
test/test.bin: 16 bytes, binary file not shown.
test/test_tokenizer.cpp:

@@ -0,0 +1,55 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/runtime/platform/runtime.h>
#include <gtest/gtest.h>
#include "tools/cxx/Resources.h"

using namespace ::testing;

namespace torch {
namespace executor {

class TokenizerExtensionTest : public ::testing::Test {
 public:
  void SetUp() override {
    torch::executor::runtime_init();
    modelPath_ =
        build::getResourcePath(
            "executorch/examples/models/llama2/tokenizer/test/test.bin")
            .string();
    tokenizer_ = std::make_unique<Tokenizer>(32000);
  }

  std::unique_ptr<Tokenizer> tokenizer_;
  std::string modelPath_;
};

TEST_F(TokenizerExtensionTest, EncodeWithoutLoadFails) {
  Error error = tokenizer_->encode("hello world", 0, 0, nullptr, nullptr);
  EXPECT_EQ(error, Error::NotSupported);
}

TEST_F(TokenizerExtensionTest, DecodeWithoutLoadFails) {
  auto result = tokenizer_->decode(0, 0);
  EXPECT_EQ(result.error(), Error::NotSupported);
}

TEST_F(TokenizerExtensionTest, TokenizerVocabSizeIsExpected) {
  Error res = tokenizer_->load(modelPath_.c_str());
  EXPECT_EQ(res, Error::Ok);
  // test.bin has vocab size 0, but the tokenizer respects the vocab size
  // passed in and adds placeholder tokens.
  EXPECT_EQ(tokenizer_->vocab_size(), 32000);
  EXPECT_EQ(tokenizer_->bos_tok(), 1);
  EXPECT_EQ(tokenizer_->eos_tok(), 2);
}

} // namespace executor
} // namespace torch
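
The commit ships no example runner, but the test above pins down the minimal call sequence: construct Tokenizer with an explicit vocab size, load() the artifact produced by tokenizer.py, and only then call encode()/decode(). Below is a small sketch of that sequence; the artifact filename ("tokenizer.bin") and the meaning of encode()'s trailing arguments (the test passes 0, 0, nullptr, nullptr) are assumptions, not something this diff specifies.

// Hypothetical usage sketch; only calls exercised by test_tokenizer.cpp are
// used, and the artifact path below is an assumption.
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/runtime/platform/runtime.h>

#include <memory>

using namespace torch::executor;

int main() {
  // Same setup order as the test fixture: init the runtime, then construct
  // the tokenizer with an explicit vocab size (the test uses 32000).
  runtime_init();
  auto tokenizer = std::make_unique<Tokenizer>(32000);

  // Before load(), encode()/decode() return Error::NotSupported (see the
  // EncodeWithoutLoadFails / DecodeWithoutLoadFails tests above).
  Error err = tokenizer->load("tokenizer.bin");  // assumed artifact path
  if (err != Error::Ok) {
    return 1;
  }

  // Once loaded, the accessors are valid; for the 16-byte test.bin the test
  // expects vocab_size() == 32000, bos_tok() == 1, eos_tok() == 2.
  (void)tokenizer->vocab_size();
  (void)tokenizer->bos_tok();
  (void)tokenizer->eos_tok();

  // encode() takes the text plus four more arguments; their exact types and
  // meanings (likely BOS/EOS flags and out-buffers) are not shown in this
  // diff, so they are passed exactly as the test passes them.
  err = tokenizer->encode("hello world", 0, 0, nullptr, nullptr);
  return err == Error::Ok ? 0 : 1;
}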
