
Move examples/models/... out of the torch namespace #5318


Closed
wants to merge 1 commit
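For orientation, the rename pattern applied throughout this diff is sketched below, with `Runner` as a representative class; the same move applies to `LlavaRunner`, `cross_attention_mask`, and `get_tiktoken_for_llama`. This is an illustrative sketch, not code from the PR itself.

```cpp
// Before: example code lived inside ExecuTorch's core namespace.
namespace torch::executor {
class Runner {};
} // namespace torch::executor

// After: example code moves to its own namespace; core ExecuTorch types are
// then named explicitly (::executorch::runtime::Error, llm::Stats, ...) or
// pulled in with using-declarations, as the files below do.
namespace example {
class Runner {};
} // namespace example

int main() {
  // Call sites switch from torch::executor:: to example::.
  example::Runner runner;
  (void)runner;
  return 0;
}
```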
@@ -21,15 +21,15 @@ @interface LLaMARunner ()<ExecuTorchLogSink>
@end

@implementation LLaMARunner {
std::unique_ptr<Runner> _runner;
std::unique_ptr<example::Runner> _runner;
}

- (instancetype)initWithModelPath:(NSString*)modelPath
tokenizerPath:(NSString*)tokenizerPath {
self = [super init];
if (self) {
[ExecuTorchLog.sharedLog addSink:self];
_runner = std::make_unique<Runner>(
_runner = std::make_unique<example::Runner>(
modelPath.UTF8String, tokenizerPath.UTF8String);
}
return self;
@@ -109,15 +109,15 @@ @interface LLaVARunner ()<ExecuTorchLogSink>
@end

@implementation LLaVARunner {
std::unique_ptr<LlavaRunner> _runner;
std::unique_ptr<example::LlavaRunner> _runner;
}

- (instancetype)initWithModelPath:(NSString*)modelPath
tokenizerPath:(NSString*)tokenizerPath {
self = [super init];
if (self) {
[ExecuTorchLog.sharedLog addSink:self];
_runner = std::make_unique<LlavaRunner>(
_runner = std::make_unique<example::LlavaRunner>(
modelPath.UTF8String, tokenizerPath.UTF8String);
}
return self;
@@ -316,7 +316,7 @@ std::unique_ptr<Tokenizer> load_tokenizer() {
if (FLAGS_tokenizer_type == "bpe") {
tokenizer = std::make_unique<torch::executor::BPETokenizer>();
} else if (FLAGS_tokenizer_type == "tiktoken") {
tokenizer = torch::executor::get_tiktoken_for_llama();
tokenizer = example::get_tiktoken_for_llama();
}
ET_CHECK_MSG(
tokenizer, "Invalid tokenizer type: %s", FLAGS_tokenizer_type.c_str());
@@ -11,7 +11,11 @@
#include <algorithm>
#include <string>

namespace torch::executor {
namespace example {

using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::aten::TensorImpl;

// Forward declaration needed for ARM compilers.
int32_t safe_size_t_to_sizes_type(size_t value);
@@ -166,4 +170,4 @@ std::vector<executorch::extension::TensorPtr> cross_attention_mask(
return cross_attention_masks;
}

} // namespace torch::executor
} // namespace example
@@ -13,8 +13,7 @@
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>

namespace torch {
namespace executor {
namespace example {

/**
* Computes the cross-attention mask for text + image inputs. Text tokens that
@@ -61,11 +60,10 @@ namespace executor {
*/
std::vector<::executorch::extension::TensorPtr> cross_attention_mask(
const std::vector<int>& tokens,
const std::vector<Tensor>& images,
const std::vector<::executorch::aten::Tensor>& images,
size_t tile_size,
size_t patch_size,
int image_token_id,
std::vector<std::vector<int>>& out);

} // namespace executor
} // namespace torch
} // namespace example
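For downstream callers, a minimal sketch of invoking the relocated helper is shown below. The `build_masks` wrapper name and the tile/patch values are illustrative (the values mirror the unit test further down), and the declaration is repeated inline only because this capture does not show the header's include path.

```cpp
#include <vector>

#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>

// Declaration mirrored from the header above; real callers would include the
// cross_attention_mask header instead of repeating it.
namespace example {
std::vector<::executorch::extension::TensorPtr> cross_attention_mask(
    const std::vector<int>& tokens,
    const std::vector<::executorch::aten::Tensor>& images,
    size_t tile_size,
    size_t patch_size,
    int image_token_id,
    std::vector<std::vector<int>>& out);
} // namespace example

// Hypothetical wrapper: builds masks for already-tokenized text and image
// tensors. Note the example:: qualifier where torch::executor:: used to be.
std::vector<::executorch::extension::TensorPtr> build_masks(
    const std::vector<int>& tokens,
    const std::vector<::executorch::aten::Tensor>& images,
    int image_token_id) {
  std::vector<std::vector<int>> out; // receives per-image mask intervals
  return example::cross_attention_mask(
      tokens,
      images,
      /*tile_size=*/1, // illustrative geometry, as in the test below
      /*patch_size=*/1,
      image_token_id,
      out);
}
```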
@@ -41,7 +41,7 @@ TEST(CrossAttentxnMaskTest, TestCrossAttentionMask) {

std::vector<Tensor> images = {a, b, c};
std::vector<std::vector<int>> mask_data;
auto output_masks = torch::executor::cross_attention_mask(
auto output_masks = example::cross_attention_mask(
tokens,
images,
/*tile_size=*/1,
2 changes: 1 addition & 1 deletion examples/models/llama2/main.cpp
@@ -69,7 +69,7 @@ int32_t main(int32_t argc, char** argv) {
}
#endif
// create llama runner
::torch::executor::Runner runner(model_path, tokenizer_path, temperature);
example::Runner runner(model_path, tokenizer_path, temperature);

// generate
runner.generate(prompt, seq_len);
41 changes: 24 additions & 17 deletions examples/models/llama2/runner/runner.cpp
@@ -18,7 +18,14 @@
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>

namespace torch::executor {
namespace example {

using ::executorch::extension::Module;
using ::executorch::runtime::Error;
using ::executorch::runtime::Result;

namespace llm = ::executorch::extension::llm;

namespace {
static constexpr auto kAppendEosToPrompt = "append_eos_to_prompt";
static constexpr auto kEnableDynamicShape = "enable_dynamic_shape";
@@ -80,7 +87,7 @@ Error Runner::load() {
"Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
tokenizer_path_.c_str());
tokenizer_.reset();
tokenizer_ = std::make_unique<BPETokenizer>();
tokenizer_ = std::make_unique<llm::BPETokenizer>();
tokenizer_->load(tokenizer_path_);
}

@@ -119,17 +126,17 @@ Error Runner::load() {
ET_LOG(Info, "eos_id = %" PRId64, value);
}
}
text_decoder_runner_ = std::make_unique<TextDecoderRunner>(
text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
module_.get(),
metadata_.at(kUseKVCache),
metadata_.at(kVocabSize),
temperature_);
text_prefiller_ = std::make_unique<TextPrefiller>(
text_prefiller_ = std::make_unique<llm::TextPrefiller>(
text_decoder_runner_.get(),
metadata_.at(kUseKVCache),
metadata_.at(kEnableDynamicShape));

text_token_generator_ = std::make_unique<TextTokenGenerator>(
text_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
tokenizer_.get(),
text_decoder_runner_.get(),
metadata_.at(kUseKVCache),
@@ -143,26 +150,26 @@ Error Runner::generate(
const std::string& prompt,
int32_t seq_len,
std::function<void(const std::string&)> token_callback,
std::function<void(const Stats&)> stats_callback,
std::function<void(const llm::Stats&)> stats_callback,
bool echo) {
// Prepare the inputs.
// Use ones-initialized inputs.
ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
if (!is_loaded()) {
stats_.model_load_start_ms = util::time_in_ms();
stats_.model_load_start_ms = llm::time_in_ms();
ET_CHECK_OK_OR_RETURN_ERROR(load());
stats_.model_load_end_ms = util::time_in_ms();
stats_.model_load_end_ms = llm::time_in_ms();
}

ET_LOG(
Info,
"RSS after loading model: %f MiB (0 if unsupported)",
util::get_rss_bytes() / 1024.0 / 1024.0);
llm::get_rss_bytes() / 1024.0 / 1024.0);

// Wrap the token_callback with print function
std::function<void(const std::string&)> wrapped_callback =
[token_callback](const std::string& piece) {
util::safe_printf(piece.c_str());
llm::safe_printf(piece.c_str());
fflush(stdout);
if (token_callback) {
token_callback(piece);
@@ -171,7 +178,7 @@ Error Runner::generate(
// First token time only measures the time it takes to encode the prompt and
// return a response token.

stats_.inference_start_ms = util::time_in_ms();
stats_.inference_start_ms = llm::time_in_ms();
shouldStop_ = false;

// Set the sequence length to the max seq length if not provided
@@ -214,8 +221,8 @@ Error Runner::generate(
}
int64_t pos = 0;
auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos);
stats_.first_token_ms = util::time_in_ms();
stats_.prompt_eval_end_ms = util::time_in_ms();
stats_.first_token_ms = llm::time_in_ms();
stats_.prompt_eval_end_ms = llm::time_in_ms();
ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
uint64_t cur_token = prefill_res.get();

@@ -224,19 +231,19 @@
ET_LOG(
Info,
"RSS after prompt prefill: %f MiB (0 if unsupported)",
util::get_rss_bytes() / 1024.0 / 1024.0);
llm::get_rss_bytes() / 1024.0 / 1024.0);

// start the main loop
prompt_tokens.push_back(cur_token);
int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
prompt_tokens, num_prompt_tokens, seq_len, wrapped_callback));

stats_.inference_end_ms = util::time_in_ms();
stats_.inference_end_ms = llm::time_in_ms();
printf("\n");
ET_LOG(
Info,
"RSS after finishing text generation: %f MiB (0 if unsupported)",
util::get_rss_bytes() / 1024.0 / 1024.0);
llm::get_rss_bytes() / 1024.0 / 1024.0);

if (num_prompt_tokens + num_generated_tokens == seq_len) {
ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
@@ -259,4 +266,4 @@ void Runner::stop() {
ET_LOG(Error, "Token generator is not loaded, cannot stop");
}
}
} // namespace torch::executor
} // namespace example
26 changes: 14 additions & 12 deletions examples/models/llama2/runner/runner.h
@@ -24,8 +24,7 @@
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>

namespace torch::executor {
using Stats = ::executorch::llm::Stats;
namespace example {

class Runner {
public:
@@ -35,12 +34,13 @@ class Runner {
const float temperature = 0.8f);

bool is_loaded() const;
Error load();
Error generate(
::executorch::runtime::Error load();
::executorch::runtime::Error generate(
const std::string& prompt,
int32_t seq_len = 128,
std::function<void(const std::string&)> token_callback = {},
std::function<void(const Stats&)> stats_callback = {},
std::function<void(const ::executorch::extension::llm::Stats&)>
stats_callback = {},
bool echo = true);
void stop();

@@ -49,16 +49,18 @@ class Runner {
bool shouldStop_{false};

// model
std::unique_ptr<Module> module_;
std::unique_ptr<::executorch::extension::Module> module_;
std::string tokenizer_path_;
std::unique_ptr<Tokenizer> tokenizer_;
std::unique_ptr<::executorch::extension::llm::Tokenizer> tokenizer_;
std::unordered_map<std::string, int64_t> metadata_;
std::unique_ptr<TextDecoderRunner> text_decoder_runner_;
std::unique_ptr<TextPrefiller> text_prefiller_;
std::unique_ptr<TextTokenGenerator> text_token_generator_;
std::unique_ptr<::executorch::extension::llm::TextDecoderRunner>
text_decoder_runner_;
std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller_;
std::unique_ptr<::executorch::extension::llm::TextTokenGenerator>
text_token_generator_;

// stats
Stats stats_;
::executorch::extension::llm::Stats stats_;
};

} // namespace torch::executor
} // namespace example
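To show what the rename means for client code, here is a minimal sketch of driving the runner from its new namespace; the file paths are placeholders, and the token callback is optional per the declaration above.

```cpp
#include <executorch/examples/models/llama2/runner/runner.h>

int main() {
  // Placeholder paths; temperature mirrors the header's 0.8f default.
  example::Runner runner(
      "/path/to/llama2.pte", "/path/to/tokenizer.model", /*temperature=*/0.8f);

  // load() and generate() now spell out ::executorch::runtime::Error rather
  // than relying on names injected by the old torch::executor namespace.
  if (runner.load() != ::executorch::runtime::Error::Ok) {
    return 1;
  }
  runner.generate("Tell me a story", /*seq_len=*/128,
                  [](const std::string& piece) {
                    // Receive each decoded piece; the runner also prints via
                    // its own wrapped callback.
                  });
  return 0;
}
```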
11 changes: 6 additions & 5 deletions examples/models/llama2/tokenizer/llama_tiktoken.cpp
@@ -8,8 +8,10 @@

#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>

namespace torch {
namespace executor {
namespace example {

using ::executorch::extension::llm::Tiktoken;

namespace {
static constexpr int32_t kSpecialTokensSize = 256;
static constexpr size_t kBOSTokenIndex = 0;
@@ -72,7 +74,7 @@ _get_multimodal_special_tokens() {

std::unique_ptr<std::vector<std::string>> _get_special_tokens(Version version) {
switch (version) {
case MULTIMODAL:
case Version::Multimodal:
return _get_multimodal_special_tokens();
default:
return _get_default_special_tokens();
@@ -86,5 +88,4 @@ std::unique_ptr<Tiktoken> get_tiktoken_for_llama(Version version) {
_get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex);
}

} // namespace executor
} // namespace torch
} // namespace example
15 changes: 7 additions & 8 deletions examples/models/llama2/tokenizer/llama_tiktoken.h
@@ -10,15 +10,14 @@

#include <executorch/extension/llm/tokenizer/tiktoken.h>

namespace torch {
namespace executor {
namespace example {

enum Version {
DEFAULT,
MULTIMODAL,
enum class Version {
Default,
Multimodal,
};

std::unique_ptr<Tiktoken> get_tiktoken_for_llama(Version version = DEFAULT);
std::unique_ptr<::executorch::extension::llm::Tiktoken> get_tiktoken_for_llama(
Version version = Version::Default);

} // namespace executor
} // namespace torch
} // namespace example
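A quick sketch of the updated tokenizer factory call, matching the test file below; the artifact path is a placeholder, and load() comes from the llm::Tokenizer interface the runner relies on above.

```cpp
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>

#include <executorch/runtime/core/error.h>

int main() {
  // The factory now lives in example::, and the enumerators are scoped:
  // DEFAULT / MULTIMODAL become Version::Default / Version::Multimodal.
  auto tokenizer =
      example::get_tiktoken_for_llama(example::Version::Multimodal);

  // Placeholder path to a tiktoken artifact (compare the
  // test_tiktoken_tokenizer.model resource used in the test below).
  const auto err = tokenizer->load("/path/to/tokenizer.model");
  return err == ::executorch::runtime::Error::Ok ? 0 : 1;
}
```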
17 changes: 10 additions & 7 deletions examples/models/llama2/tokenizer/test/test_tiktoken.cpp
@@ -7,20 +7,25 @@
*/

#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>

#include <vector>

#include <executorch/runtime/platform/runtime.h>

#include <gtest/gtest.h>
#include <vector>

using namespace ::testing;

namespace torch {
namespace executor {
using ::example::Version;
using ::executorch::extension::llm::Tokenizer;
using ::executorch::runtime::Error;
using ::executorch::runtime::Result;

class MultimodalTiktokenV5ExtensionTest : public Test {
public:
void SetUp() override {
torch::executor::runtime_init();
tokenizer_ = get_tiktoken_for_llama(MULTIMODAL);
executorch::runtime::runtime_init();
tokenizer_ = get_tiktoken_for_llama(Version::Multimodal);
modelPath_ = std::getenv("RESOURCES_PATH") +
std::string("/test_tiktoken_tokenizer.model");
}
@@ -79,5 +84,3 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
EXPECT_EQ(out.get(), expected[i]);
}
}
} // namespace executor
} // namespace torch
2 changes: 1 addition & 1 deletion examples/models/llava/main.cpp
@@ -80,7 +80,7 @@ int32_t main(int32_t argc, char** argv) {
}
#endif
// create llama runner
torch::executor::LlavaRunner runner(model_path, tokenizer_path, temperature);
example::LlavaRunner runner(model_path, tokenizer_path, temperature);

// read image and resize the longest edge to 336
std::vector<uint8_t> image_data;