
Move examples/models/... out of the torch namespace #5318


Closed
wants to merge 1 commit
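For orientation, the rename pattern applied throughout this diff is sketched below, with `Runner` as a representative class; the same move applies to `LlavaRunner`, `cross_attention_mask`, and `get_tiktoken_for_llama`. This is an illustrative sketch, not code from the PR itself.

```cpp
// Before: example code lived inside ExecuTorch's core namespace.
namespace torch::executor {
class Runner {};
} // namespace torch::executor

// After: example code moves to its own namespace; core ExecuTorch types are
// then named explicitly (::executorch::runtime::Error, llm::Stats, ...) or
// pulled in with using-declarations, as the files below do.
namespace example {
class Runner {};
} // namespace example

int main() {
  // Call sites switch from torch::executor:: to example::.
  example::Runner runner;
  (void)runner;
  return 0;
}
```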
@@ -21,15 +21,15 @@ @interface LLaMARunner ()<ExecuTorchLogSink>
@end

@implementation LLaMARunner {
std::unique_ptr<Runner> _runner;
std::unique_ptr<example::Runner> _runner;
}

- (instancetype)initWithModelPath:(NSString*)modelPath
tokenizerPath:(NSString*)tokenizerPath {
self = [super init];
if (self) {
[ExecuTorchLog.sharedLog addSink:self];
_runner = std::make_unique<Runner>(
_runner = std::make_unique<example::Runner>(
modelPath.UTF8String, tokenizerPath.UTF8String);
}
return self;
@@ -109,15 +109,15 @@ @interface LLaVARunner ()<ExecuTorchLogSink>
@end

@implementation LLaVARunner {
std::unique_ptr<LlavaRunner> _runner;
std::unique_ptr<example::LlavaRunner> _runner;
}

- (instancetype)initWithModelPath:(NSString*)modelPath
tokenizerPath:(NSString*)tokenizerPath {
self = [super init];
if (self) {
[ExecuTorchLog.sharedLog addSink:self];
_runner = std::make_unique<LlavaRunner>(
_runner = std::make_unique<example::LlavaRunner>(
modelPath.UTF8String, tokenizerPath.UTF8String);
}
return self;
@@ -316,7 +316,7 @@ std::unique_ptr<Tokenizer> load_tokenizer() {
if (FLAGS_tokenizer_type == "bpe") {
tokenizer = std::make_unique<torch::executor::BPETokenizer>();
} else if (FLAGS_tokenizer_type == "tiktoken") {
tokenizer = torch::executor::get_tiktoken_for_llama();
tokenizer = example::get_tiktoken_for_llama();
}
ET_CHECK_MSG(
tokenizer, "Invalid tokenizer type: %s", FLAGS_tokenizer_type.c_str());
@@ -11,7 +11,11 @@
#include <algorithm>
#include <string>

namespace torch::executor {
namespace example {

using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::aten::TensorImpl;

// Forward declaration needed for ARM compilers.
int32_t safe_size_t_to_sizes_type(size_t value);
@@ -166,4 +170,4 @@ std::vector<executorch::extension::TensorPtr> cross_attention_mask(
return cross_attention_masks;
}

} // namespace torch::executor
} // namespace example
@@ -13,8 +13,7 @@
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>

namespace torch {
namespace executor {
namespace example {

/**
* Computes the cross-attention mask for text + image inputs. Text tokens that
@@ -61,11 +60,10 @@ namespace executor {
*/
std::vector<::executorch::extension::TensorPtr> cross_attention_mask(
const std::vector<int>& tokens,
const std::vector<Tensor>& images,
const std::vector<::executorch::aten::Tensor>& images,
size_t tile_size,
size_t patch_size,
int image_token_id,
std::vector<std::vector<int>>& out);

} // namespace executor
} // namespace torch
} // namespace example
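For downstream callers, a minimal sketch of invoking the relocated helper is shown below. The `build_masks` wrapper name and the tile/patch values are illustrative (the values mirror the unit test further down), and the declaration is repeated inline only because this capture does not show the header's include path.

```cpp
#include <vector>

#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>

// Declaration mirrored from the header above; real callers would include the
// cross_attention_mask header instead of repeating it.
namespace example {
std::vector<::executorch::extension::TensorPtr> cross_attention_mask(
    const std::vector<int>& tokens,
    const std::vector<::executorch::aten::Tensor>& images,
    size_t tile_size,
    size_t patch_size,
    int image_token_id,
    std::vector<std::vector<int>>& out);
} // namespace example

// Hypothetical wrapper: builds masks for already-tokenized text and image
// tensors. Note the example:: qualifier where torch::executor:: used to be.
std::vector<::executorch::extension::TensorPtr> build_masks(
    const std::vector<int>& tokens,
    const std::vector<::executorch::aten::Tensor>& images,
    int image_token_id) {
  std::vector<std::vector<int>> out; // receives per-image mask intervals
  return example::cross_attention_mask(
      tokens,
      images,
      /*tile_size=*/1, // illustrative geometry, as in the test below
      /*patch_size=*/1,
      image_token_id,
      out);
}
```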
@@ -41,7 +41,7 @@ TEST(CrossAttentxnMaskTest, TestCrossAttentionMask) {

std::vector<Tensor> images = {a, b, c};
std::vector<std::vector<int>> mask_data;
auto output_masks = torch::executor::cross_attention_mask(
auto output_masks = example::cross_attention_mask(
tokens,
images,
/*tile_size=*/1,
2 changes: 1 addition & 1 deletion examples/models/llama2/main.cpp
@@ -69,7 +69,7 @@ int32_t main(int32_t argc, char** argv) {
}
#endif
// create llama runner
::torch::executor::Runner runner(model_path, tokenizer_path, temperature);
example::Runner runner(model_path, tokenizer_path, temperature);

// generate
runner.generate(prompt, seq_len);
41 changes: 24 additions & 17 deletions examples/models/llama2/runner/runner.cpp
@@ -18,7 +18,14 @@
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>

namespace torch::executor {
namespace example {

using ::executorch::extension::Module;
using ::executorch::runtime::Error;
using ::executorch::runtime::Result;

namespace llm = ::executorch::extension::llm;

namespace {
static constexpr auto kAppendEosToPrompt = "append_eos_to_prompt";
static constexpr auto kEnableDynamicShape = "enable_dynamic_shape";
@@ -80,7 +87,7 @@ Error Runner::load() {
"Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
tokenizer_path_.c_str());
tokenizer_.reset();
tokenizer_ = std::make_unique<BPETokenizer>();
tokenizer_ = std::make_unique<llm::BPETokenizer>();
tokenizer_->load(tokenizer_path_);
}

@@ -119,17 +126,17 @@ Error Runner::load() {
ET_LOG(Info, "eos_id = %" PRId64, value);
}
}
text_decoder_runner_ = std::make_unique<TextDecoderRunner>(
text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
module_.get(),
metadata_.at(kUseKVCache),
metadata_.at(kVocabSize),
temperature_);
text_prefiller_ = std::make_unique<TextPrefiller>(
text_prefiller_ = std::make_unique<llm::TextPrefiller>(
text_decoder_runner_.get(),
metadata_.at(kUseKVCache),
metadata_.at(kEnableDynamicShape));

text_token_generator_ = std::make_unique<TextTokenGenerator>(
text_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
tokenizer_.get(),
text_decoder_runner_.get(),
metadata_.at(kUseKVCache),
@@ -143,26 +150,26 @@ Error Runner::generate(
const std::string& prompt,
int32_t seq_len,
std::function<void(const std::string&)> token_callback,
std::function<void(const Stats&)> stats_callback,
std::function<void(const llm::Stats&)> stats_callback,
bool echo) {
// Prepare the inputs.
// Use ones-initialized inputs.
ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
if (!is_loaded()) {
stats_.model_load_start_ms = util::time_in_ms();
stats_.model_load_start_ms = llm::time_in_ms();
ET_CHECK_OK_OR_RETURN_ERROR(load());
stats_.model_load_end_ms = util::time_in_ms();
stats_.model_load_end_ms = llm::time_in_ms();
}

ET_LOG(
Info,
"RSS after loading model: %f MiB (0 if unsupported)",
util::get_rss_bytes() / 1024.0 / 1024.0);
llm::get_rss_bytes() / 1024.0 / 1024.0);

// Wrap the token_callback with print function
std::function<void(const std::string&)> wrapped_callback =
[token_callback](const std::string& piece) {
util::safe_printf(piece.c_str());
llm::safe_printf(piece.c_str());
fflush(stdout);
if (token_callback) {
token_callback(piece);
@@ -171,7 +178,7 @@ Error Runner::generate(
// First token time only measures the time it takes to encode the prompt and
// return a response token.

stats_.inference_start_ms = util::time_in_ms();
stats_.inference_start_ms = llm::time_in_ms();
shouldStop_ = false;

// Set the sequence length to the max seq length if not provided
@@ -214,8 +221,8 @@ Error Runner::generate(
}
int64_t pos = 0;
auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos);
stats_.first_token_ms = util::time_in_ms();
stats_.prompt_eval_end_ms = util::time_in_ms();
stats_.first_token_ms = llm::time_in_ms();
stats_.prompt_eval_end_ms = llm::time_in_ms();
ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
uint64_t cur_token = prefill_res.get();

@@ -224,19 +231,19 @@
ET_LOG(
Info,
"RSS after prompt prefill: %f MiB (0 if unsupported)",
util::get_rss_bytes() / 1024.0 / 1024.0);
llm::get_rss_bytes() / 1024.0 / 1024.0);

// start the main loop
prompt_tokens.push_back(cur_token);
int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
prompt_tokens, num_prompt_tokens, seq_len, wrapped_callback));

stats_.inference_end_ms = util::time_in_ms();
stats_.inference_end_ms = llm::time_in_ms();
printf("\n");
ET_LOG(
Info,
"RSS after finishing text generation: %f MiB (0 if unsupported)",
util::get_rss_bytes() / 1024.0 / 1024.0);
llm::get_rss_bytes() / 1024.0 / 1024.0);

if (num_prompt_tokens + num_generated_tokens == seq_len) {
ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
@@ -259,4 +266,4 @@ void Runner::stop() {
ET_LOG(Error, "Token generator is not loaded, cannot stop");
}
}
} // namespace torch::executor
} // namespace example
26 changes: 14 additions & 12 deletions examples/models/llama2/runner/runner.h
@@ -24,8 +24,7 @@
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>

namespace torch::executor {
using Stats = ::executorch::llm::Stats;
namespace example {

class Runner {
public:
@@ -35,12 +34,13 @@ class Runner {
const float temperature = 0.8f);

bool is_loaded() const;
Error load();
Error generate(
::executorch::runtime::Error load();
::executorch::runtime::Error generate(
const std::string& prompt,
int32_t seq_len = 128,
std::function<void(const std::string&)> token_callback = {},
std::function<void(const Stats&)> stats_callback = {},
std::function<void(const ::executorch::extension::llm::Stats&)>
stats_callback = {},
bool echo = true);
void stop();

@@ -49,16 +49,18 @@ class Runner {
bool shouldStop_{false};

// model
std::unique_ptr<Module> module_;
std::unique_ptr<::executorch::extension::Module> module_;
std::string tokenizer_path_;
std::unique_ptr<Tokenizer> tokenizer_;
std::unique_ptr<::executorch::extension::llm::Tokenizer> tokenizer_;
std::unordered_map<std::string, int64_t> metadata_;
std::unique_ptr<TextDecoderRunner> text_decoder_runner_;
std::unique_ptr<TextPrefiller> text_prefiller_;
std::unique_ptr<TextTokenGenerator> text_token_generator_;
std::unique_ptr<::executorch::extension::llm::TextDecoderRunner>
text_decoder_runner_;
std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller_;
std::unique_ptr<::executorch::extension::llm::TextTokenGenerator>
text_token_generator_;

// stats
Stats stats_;
::executorch::extension::llm::Stats stats_;
};

} // namespace torch::executor
} // namespace example
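To show what the rename means for client code, here is a minimal sketch of driving the runner from its new namespace; the file paths are placeholders, and the token callback is optional per the declaration above.

```cpp
#include <executorch/examples/models/llama2/runner/runner.h>

int main() {
  // Placeholder paths; temperature mirrors the header's 0.8f default.
  example::Runner runner(
      "/path/to/llama2.pte", "/path/to/tokenizer.model", /*temperature=*/0.8f);

  // load() and generate() now spell out ::executorch::runtime::Error rather
  // than relying on names injected by the old torch::executor namespace.
  if (runner.load() != ::executorch::runtime::Error::Ok) {
    return 1;
  }
  runner.generate("Tell me a story", /*seq_len=*/128,
                  [](const std::string& piece) {
                    // Receive each decoded piece; the runner also prints via
                    // its own wrapped callback.
                  });
  return 0;
}
```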
11 changes: 6 additions & 5 deletions examples/models/llama2/tokenizer/llama_tiktoken.cpp
@@ -8,8 +8,10 @@

#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>

namespace torch {
namespace executor {
namespace example {

using ::executorch::extension::llm::Tiktoken;

namespace {
static constexpr int32_t kSpecialTokensSize = 256;
static constexpr size_t kBOSTokenIndex = 0;
@@ -72,7 +74,7 @@ _get_multimodal_special_tokens() {

std::unique_ptr<std::vector<std::string>> _get_special_tokens(Version version) {
switch (version) {
case MULTIMODAL:
case Version::Multimodal:
return _get_multimodal_special_tokens();
default:
return _get_default_special_tokens();
@@ -86,5 +88,4 @@ std::unique_ptr<Tiktoken> get_tiktoken_for_llama(Version version) {
_get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex);
}

} // namespace executor
} // namespace torch
} // namespace example
15 changes: 7 additions & 8 deletions examples/models/llama2/tokenizer/llama_tiktoken.h
@@ -10,15 +10,14 @@

#include <executorch/extension/llm/tokenizer/tiktoken.h>

namespace torch {
namespace executor {
namespace example {

enum Version {
DEFAULT,
MULTIMODAL,
enum class Version {
Default,
Multimodal,
};

std::unique_ptr<Tiktoken> get_tiktoken_for_llama(Version version = DEFAULT);
std::unique_ptr<::executorch::extension::llm::Tiktoken> get_tiktoken_for_llama(
Version version = Version::Default);

} // namespace executor
} // namespace torch
} // namespace example
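A quick sketch of the updated tokenizer factory call, matching the test file below; the artifact path is a placeholder, and load() comes from the llm::Tokenizer interface the runner relies on above.

```cpp
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>

#include <executorch/runtime/core/error.h>

int main() {
  // The factory now lives in example::, and the enumerators are scoped:
  // DEFAULT / MULTIMODAL become Version::Default / Version::Multimodal.
  auto tokenizer =
      example::get_tiktoken_for_llama(example::Version::Multimodal);

  // Placeholder path to a tiktoken artifact (compare the
  // test_tiktoken_tokenizer.model resource used in the test below).
  const auto err = tokenizer->load("/path/to/tokenizer.model");
  return err == ::executorch::runtime::Error::Ok ? 0 : 1;
}
```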
17 changes: 10 additions & 7 deletions examples/models/llama2/tokenizer/test/test_tiktoken.cpp
@@ -7,20 +7,25 @@
*/

#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>

#include <vector>

#include <executorch/runtime/platform/runtime.h>

#include <gtest/gtest.h>
#include <vector>

using namespace ::testing;

namespace torch {
namespace executor {
using ::example::Version;
using ::executorch::extension::llm::Tokenizer;
using ::executorch::runtime::Error;
using ::executorch::runtime::Result;

class MultimodalTiktokenV5ExtensionTest : public Test {
public:
void SetUp() override {
torch::executor::runtime_init();
tokenizer_ = get_tiktoken_for_llama(MULTIMODAL);
executorch::runtime::runtime_init();
tokenizer_ = get_tiktoken_for_llama(Version::Multimodal);
modelPath_ = std::getenv("RESOURCES_PATH") +
std::string("/test_tiktoken_tokenizer.model");
}
@@ -79,5 +84,3 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
EXPECT_EQ(out.get(), expected[i]);
}
}
} // namespace executor
} // namespace torch
2 changes: 1 addition & 1 deletion examples/models/llava/main.cpp
@@ -80,7 +80,7 @@ int32_t main(int32_t argc, char** argv) {
}
#endif
// create llama runner
torch::executor::LlavaRunner runner(model_path, tokenizer_path, temperature);
example::LlavaRunner runner(model_path, tokenizer_path, temperature);

// read image and resize the longest edge to 336
std::vector<uint8_t> image_data;