Skip to content

Commit ba117d2

Browse files
larryliu0820 authored and facebook-github-bot committed
Add pytorch-labs/tokenizers into ET submodules (#9074)
Summary: Pull Request resolved: #9074. This stack aims to replace `extension/llm/tokenizer` with the `pytorch-labs/tokenizers` repo. This PR adds `pytorch-labs/tokenizers` to the ET submodules (at `extension/llm/tokenizers`), removes `extension/llm/third-party/`, and uses `extension/llm/tokenizers/third-party/` instead. Differential Revision: D70862880
1 parent 51901f3 commit ba117d2

File tree

15 files changed

+33
-81
lines changed

15 files changed

+33
-81
lines changed

.gitmodules

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,9 @@
2828
[submodule "backends/xnnpack/third-party/pthreadpool"]
2929
path = backends/xnnpack/third-party/pthreadpool
3030
url = https://github.com/Maratyszcza/pthreadpool.git
31-
[submodule "extension/llm/third-party/abseil-cpp"]
32-
path = extension/llm/third-party/abseil-cpp
33-
url = https://github.com/abseil/abseil-cpp.git
34-
[submodule "extension/llm/third-party/re2"]
35-
path = extension/llm/third-party/re2
36-
url = https://github.com/google/re2.git
37-
[submodule "extension/llm/third-party/sentencepiece"]
38-
path = extension/llm/third-party/sentencepiece
39-
url = https://github.com/google/sentencepiece.git
31+
[submodule "extension/llm/tokenizers"]
32+
path = extension/llm/tokenizers
33+
url = https://github.com/pytorch-labs/tokenizers.git
4034
[submodule "kernels/optimized/third-party/eigen"]
4135
path = kernels/optimized/third-party/eigen
4236
url = https://gitlab.com/libeigen/eigen.git

examples/mediatek/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,17 +122,17 @@ if(${ANDROID})
122122
)
123123
# Build ABSL and RE2
124124
set(EXTENSIONS_LLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm)
125-
set(THIRD_PARTY_ABSL_DIR ${EXTENSIONS_LLM_DIR}/third-party/abseil-cpp)
126-
set(THIRD_PARTY_RE2_DIR ${EXTENSIONS_LLM_DIR}/third-party/re2)
125+
set(THIRD_PARTY_ABSL_DIR ${EXTENSIONS_LLM_DIR}/tokenizers/third-party/abseil-cpp)
126+
set(THIRD_PARTY_RE2_DIR ${EXTENSIONS_LLM_DIR}/tokenizers/third-party/re2)
127127
set(ABSL_ENABLE_INSTALL ON)
128128
set(ABSL_PROPAGATE_CXX_STD ON)
129129
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
130130
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
131131
add_subdirectory(
132-
${THIRD_PARTY_ABSL_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third-party/abseil
132+
${THIRD_PARTY_ABSL_DIR} ${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/abseil
133133
)
134134
add_subdirectory(
135-
${THIRD_PARTY_RE2_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third-party/re2
135+
${THIRD_PARTY_RE2_DIR} ${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/re2
136136
)
137137
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
138138

examples/models/llama/runner/CMakeLists.txt

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,11 @@ set(ABSL_PROPAGATE_CXX_STD ON)
6666
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
6767
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
6868
add_subdirectory(
69-
${EXECUTORCH_ROOT}/extension/llm/third-party/abseil-cpp
69+
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/abseil-cpp
7070
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
7171
)
7272
add_subdirectory(
73-
${EXECUTORCH_ROOT}/extension/llm/third-party/re2
73+
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/re2
7474
${CMAKE_CURRENT_BINARY_DIR}/re2
7575
)
7676
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
@@ -82,6 +82,8 @@ set(llama_runner_deps executorch extension_data_loader extension_module
8282
target_link_libraries(llama_runner PUBLIC ${llama_runner_deps})
8383

8484
target_include_directories(
85-
llama_runner INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}
85+
llama_runner
86+
INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}
87+
${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
8688
)
8789
target_compile_options(llama_runner PUBLIC ${_preprocessor_flag})

examples/models/llama/tokenizer/llama_tiktoken.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
namespace example {
1212

13-
using ::executorch::extension::llm::Tiktoken;
13+
using ::tokenizers::Tiktoken;
1414

1515
namespace {
1616
static constexpr int32_t kSpecialTokensSize = 256;

examples/models/llama/tokenizer/llama_tiktoken.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
#pragma once
1010

11-
#include <executorch/extension/llm/tokenizer/tiktoken.h>
11+
#include <pytorch/tokenizers/tiktoken.h>
1212

1313
namespace example {
1414

@@ -17,7 +17,7 @@ enum class Version {
1717
Multimodal,
1818
};
1919

20-
std::unique_ptr<::executorch::extension::llm::Tiktoken> get_tiktoken_for_llama(
20+
std::unique_ptr<::tokenizers::Tiktoken> get_tiktoken_for_llama(
2121
Version version = Version::Default);
2222

2323
} // namespace example

examples/models/llama/tokenizer/targets.bzl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def define_common_targets():
1515
"llama_tiktoken.h",
1616
],
1717
exported_deps = [
18-
"//executorch/extension/llm/tokenizer:tiktoken",
18+
"//pytorch/tokenizers:tiktoken",
1919
],
2020
visibility = [
2121
"@EXECUTORCH_CLIENTS",

examples/models/llama/tokenizer/test/CMakeLists.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ include(${EXECUTORCH_ROOT}/build/Test.cmake)
1919

2020
set(_tokenizer_test_srcs
2121
test_tiktoken.cpp
22-
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizer/tiktoken.cpp
22+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/src/tiktoken.cpp
2323
${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp
2424
)
2525

@@ -29,11 +29,11 @@ set(ABSL_PROPAGATE_CXX_STD ON)
2929
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
3030
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
3131
add_subdirectory(
32-
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp
32+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/third-party/abseil-cpp
3333
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
3434
)
3535
add_subdirectory(
36-
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/re2
36+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/third-party/re2
3737
${CMAKE_CURRENT_BINARY_DIR}/re2
3838
)
3939
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
@@ -43,5 +43,6 @@ target_include_directories(
4343
tokenizer_test
4444
PRIVATE
4545
${CMAKE_INSTALL_PREFIX}/include
46-
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp
46+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/include
47+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/third-party/abseil-cpp
4748
)

examples/qualcomm/CMakeLists.txt

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,10 @@ target_compile_options(
6363
full_portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED
6464
)
6565
target_include_directories(
66-
full_portable_ops_lib PUBLIC ${_common_include_directories}
66+
full_portable_ops_lib
67+
PUBLIC
68+
${_common_include_directories}
69+
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/include
6770
)
6871

6972
# find RE2 for tokenizer
@@ -72,11 +75,11 @@ set(ABSL_PROPAGATE_CXX_STD ON)
7275
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
7376
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
7477
add_subdirectory(
75-
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/abseil-cpp
78+
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/abseil-cpp
7679
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
7780
)
7881
add_subdirectory(
79-
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/re2
82+
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/re2
8083
${CMAKE_CURRENT_BINARY_DIR}/re2
8184
)
8285
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

extension/llm/third-party/TARGETS

Lines changed: 0 additions & 47 deletions
This file was deleted.

extension/llm/third-party/abseil-cpp

Lines changed: 0 additions & 1 deletion
This file was deleted.

extension/llm/third-party/re2

Lines changed: 0 additions & 1 deletion
This file was deleted.
extension/llm/third-party/sentencepiece

Lines changed: 0 additions & 1 deletion
This file was deleted.

extension/llm/tokenizer/CMakeLists.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ set(ABSL_PROPAGATE_CXX_STD ON)
2121
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
2222
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
2323
add_subdirectory(
24-
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp
24+
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/third-party/abseil-cpp
2525
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
2626
)
2727
add_subdirectory(
28-
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/re2
28+
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/third-party/re2
2929
${CMAKE_CURRENT_BINARY_DIR}/re2
3030
)
3131
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
@@ -35,6 +35,7 @@ add_library(extension_llm_tokenizer ${_extension_llm_tokenizer__srcs})
3535
target_include_directories(
3636
extension_llm_tokenizer PUBLIC ${EXECUTORCH_ROOT}/..
3737
${_common_include_directories}
38+
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/include
3839
)
3940

4041
target_link_libraries(extension_llm_tokenizer re2::re2)
@@ -53,7 +54,7 @@ install(
5354
target_include_directories(
5455
extension_llm_tokenizer
5556
PRIVATE ${CMAKE_INSTALL_PREFIX}/include
56-
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp
57+
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/third-party/abseil-cpp
5758
)
5859

5960
if(BUILD_TESTING)

extension/llm/tokenizers

Submodule tokenizers added at 4da2387

shim_et/xplat/executorch/build/env_interface.bzl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ _EXTERNAL_DEPS = {
4343
"nlohmann_json": [], # Intentionally not supporting OSS buck build HF tokenizer.
4444
"prettytable": "//third-party:prettytable",
4545
"pybind11": "//third-party:pybind11",
46-
"re2": "//extension/llm/third-party:re2",
46+
"re2": "//extension/llm/tokenizers/third-party:re2",
4747
"sentencepiece": [], # Intentionally not supporting OSS buck build of sentencepiece.
4848
"sentencepiece-py": [],
4949
# Core C++ PyTorch functionality like Tensor and ScalarType.

0 commit comments

Comments
 (0)