Skip to content

Add pytorch-labs/tokenizers into ET submodules #9074

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .ci/scripts/build_llama_android.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ install_executorch_and_backend_lib() {

build_llama_runner() {
echo "Building llama runner for Android..."
pushd extension/llm/tokenizers
echo "Updating tokenizers submodule"
git submodule update --init
popd
ANDROID_ABI=arm64-v8a
cmake -DBUCK2="${BUCK2}" \
-DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK"/build/cmake/android.toolchain.cmake \
Expand Down
6 changes: 6 additions & 0 deletions .ci/scripts/test_ane_static_llama.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ fi

which "${PYTHON_EXECUTABLE}"

# Update tokenizers submodule
pushd $EXECUTORCH_ROOT/extension/llm/tokenizers
echo "Update tokenizers submodule"
git submodule update --init
popd

pushd $EXECUTORCH_ROOT/examples/apple/coreml/llama

# Download stories llama110m artifacts
Expand Down
4 changes: 4 additions & 0 deletions .ci/scripts/test_llama.sh
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,10 @@ cmake_install_executorch_libraries() {

cmake_build_llama_runner() {
echo "Building llama runner"
pushd extension/llm/tokenizers
echo "Updating tokenizers submodule"
git submodule update --init
popd
dir="examples/models/llama"
retry cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
Expand Down
12 changes: 3 additions & 9 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,9 @@
[submodule "backends/xnnpack/third-party/pthreadpool"]
path = backends/xnnpack/third-party/pthreadpool
url = https://github.com/Maratyszcza/pthreadpool.git
[submodule "extension/llm/third-party/abseil-cpp"]
path = extension/llm/third-party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
[submodule "extension/llm/third-party/re2"]
path = extension/llm/third-party/re2
url = https://github.com/google/re2.git
[submodule "extension/llm/third-party/sentencepiece"]
path = extension/llm/third-party/sentencepiece
url = https://github.com/google/sentencepiece.git
[submodule "extension/llm/tokenizers"]
path = extension/llm/tokenizers
url = https://github.com/pytorch-labs/tokenizers.git
[submodule "kernels/optimized/third-party/eigen"]
path = kernels/optimized/third-party/eigen
url = https://gitlab.com/libeigen/eigen.git
Expand Down
4 changes: 4 additions & 0 deletions backends/qualcomm/scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ if [ "$BUILD_X86_64" = true ]; then
EXAMPLE_ROOT=examples/qualcomm
CMAKE_PREFIX_PATH="${BUILD_ROOT}/lib/cmake/ExecuTorch;${BUILD_ROOT}/third-party/gflags;"

echo "Update tokenizers submodule..."
pushd $PRJ_ROOT/extension/llm/tokenizers
git submodule update --init
popd
cmake $PRJ_ROOT/$EXAMPLE_ROOT \
-DCMAKE_BUILD_TYPE=$BUILD_TYPE \
-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
Expand Down
5 changes: 5 additions & 0 deletions build/build_android_library.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ build_android_native_library() {
fi
cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config "${EXECUTORCH_CMAKE_BUILD_TYPE}"

# Update tokenizers submodule
pushd extension/llm/tokenizers
echo "Update tokenizers submodule"
git submodule update --init
popd
cmake extension/android \
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
-DANDROID_ABI="${ANDROID_ABI}" \
Expand Down
8 changes: 4 additions & 4 deletions examples/mediatek/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,17 +122,17 @@ if(${ANDROID})
)
# Build ABSL and RE2
set(EXTENSIONS_LLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm)
set(THIRD_PARTY_ABSL_DIR ${EXTENSIONS_LLM_DIR}/third-party/abseil-cpp)
set(THIRD_PARTY_RE2_DIR ${EXTENSIONS_LLM_DIR}/third-party/re2)
set(THIRD_PARTY_ABSL_DIR ${EXTENSIONS_LLM_DIR}/tokenizers/third-party/abseil-cpp)
set(THIRD_PARTY_RE2_DIR ${EXTENSIONS_LLM_DIR}/tokenizers/third-party/re2)
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(
${THIRD_PARTY_ABSL_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third-party/abseil
${THIRD_PARTY_ABSL_DIR} ${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/abseil
)
add_subdirectory(
${THIRD_PARTY_RE2_DIR} ${CMAKE_CURRENT_BINARY_DIR}/third-party/re2
${THIRD_PARTY_RE2_DIR} ${CMAKE_CURRENT_BINARY_DIR}/tokenizers/third-party/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

Expand Down
8 changes: 5 additions & 3 deletions examples/models/llama/runner/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,11 @@ set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(
${EXECUTORCH_ROOT}/extension/llm/third-party/abseil-cpp
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${EXECUTORCH_ROOT}/extension/llm/third-party/re2
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
Expand All @@ -82,6 +82,8 @@ set(llama_runner_deps executorch extension_data_loader extension_module
target_link_libraries(llama_runner PUBLIC ${llama_runner_deps})

target_include_directories(
llama_runner INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}
llama_runner
INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}
${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
)
target_compile_options(llama_runner PUBLIC ${_preprocessor_flag})
9 changes: 5 additions & 4 deletions examples/models/llama/tokenizer/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ include(${EXECUTORCH_ROOT}/build/Test.cmake)

set(_tokenizer_test_srcs
test_tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizer/tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/src/tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp
)

Expand All @@ -29,11 +29,11 @@ set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/re2
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
Expand All @@ -43,5 +43,6 @@ target_include_directories(
tokenizer_test
PRIVATE
${CMAKE_INSTALL_PREFIX}/include
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/include
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizers/third-party/abseil-cpp
)
9 changes: 6 additions & 3 deletions examples/qualcomm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ target_compile_options(
full_portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED
)
target_include_directories(
full_portable_ops_lib PUBLIC ${_common_include_directories}
full_portable_ops_lib
PUBLIC
${_common_include_directories}
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/include
)

# find RE2 for tokenizer
Expand All @@ -72,11 +75,11 @@ set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/re2
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
Expand Down
47 changes: 0 additions & 47 deletions extension/llm/third-party/TARGETS

This file was deleted.

1 change: 0 additions & 1 deletion extension/llm/third-party/abseil-cpp
Submodule abseil-cpp deleted from eb8522
1 change: 0 additions & 1 deletion extension/llm/third-party/re2
Submodule re2 deleted from 6dcd83
1 change: 0 additions & 1 deletion extension/llm/third-party/sentencepiece
Submodule sentencepiece deleted from 6225e0
7 changes: 4 additions & 3 deletions extension/llm/tokenizer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/re2
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
Expand All @@ -35,6 +35,7 @@ add_library(extension_llm_tokenizer ${_extension_llm_tokenizer__srcs})
target_include_directories(
extension_llm_tokenizer PUBLIC ${EXECUTORCH_ROOT}/..
${_common_include_directories}
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/include
)

target_link_libraries(extension_llm_tokenizer re2::re2)
Expand All @@ -53,7 +54,7 @@ install(
target_include_directories(
extension_llm_tokenizer
PRIVATE ${CMAKE_INSTALL_PREFIX}/include
${CMAKE_CURRENT_SOURCE_DIR}/../third-party/abseil-cpp
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizers/third-party/abseil-cpp
)

if(BUILD_TESTING)
Expand Down
1 change: 1 addition & 0 deletions extension/llm/tokenizers
Submodule tokenizers added at 4da238
3 changes: 2 additions & 1 deletion shim_et/xplat/executorch/build/env_interface.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,10 @@ _EXTERNAL_DEPS = {
"nlohmann_json": [], # Intentionally not supporting OSS buck build HF tokenizer.
"prettytable": "//third-party:prettytable",
"pybind11": "//third-party:pybind11",
"re2": "//extension/llm/third-party:re2",
"re2": "//extension/llm/tokenizers/third-party:re2",
"sentencepiece": [], # Intentionally not supporting OSS buck build of sentencepiece.
"sentencepiece-py": [],
"tiktoken": "//extension/llm/tokenizers:tiktoken",
# Core C++ PyTorch functionality like Tensor and ScalarType.
"torch-core-cpp": "//third-party:libtorch",
"torchgen": "//third-party:torchgen",
Expand Down
Loading