Skip to content

Commit c358b86

Browse files
Raahul Kalyaan Jakkafacebook-github-bot
Raahul Kalyaan Jakka
authored and committed
Creating ReadOnlyEmbeddingKVDB class and necessary functions (#4225)
Summary: Pull Request resolved: #4225 X-link: facebookresearch/FBGEMM#1301 Design doc: https://docs.google.com/document/d/149LdAEHOLP7ei4hwVVkAFXGa4N9uLs1J7efxfBZp3dY/edit?tab=t.0#heading=h.49t3yfaqmt54 Context: We are enabling the usage of the RocksDB checkpoint feature in KVTensorWrapper. This allows us to create checkpoints of the embedding tables in SSD. Later, these checkpoints are used by the checkpointing component to create a checkpoint and upload it to the Manifold. In this diff: The primary objective of adding the checkpoint handle is to allow multiple processes to read through the KVTensor. To enable this, we need to create a read-only KVTensor object that can be read concurrently. To support this, we introduce a ReadOnlyEmbeddingKVDB class, which is a read-only implementation of the EmbeddingKVDB class. We have added a new constructor to KVTensorWrapper which takes in serialized KVTensor metadata. When deserializing, we create a ReadOnlyEmbeddingKVDB for the KVTensorWrapper object. Reviewed By: duduyi2013 Differential Revision: D75489873
1 parent 14cef3f commit c358b86

File tree

3 files changed

+475
-2
lines changed

3 files changed

+475
-2
lines changed

fbgemm_gpu/src/ssd_split_embeddings_cache/kv_tensor_wrapper.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ class KVTensorWrapper : public torch::jit::CustomClassHolder {
6666
c10::intrusive_ptr<RocksdbCheckpointHandleWrapper> checkpoint_handle =
6767
c10::intrusive_ptr<RocksdbCheckpointHandleWrapper>(nullptr));
6868

69+
explicit KVTensorWrapper(const std::string& serialized);
70+
6971
at::Tensor narrow(int64_t dim, int64_t start, int64_t length);
7072

7173
/// @brief if the backend storage is SSD, use this function
@@ -108,6 +110,16 @@ class KVTensorWrapper : public torch::jit::CustomClassHolder {
108110

109111
std::string layout_str();
110112

113+
std::string serialize() const;
114+
115+
// ONLY FOR DEBUGGING PURPOSES, Please don't use this function in production
116+
std::string logs() const;
117+
118+
void deserialize(const std::string& serialized);
119+
120+
friend void to_json(json& j, const KVTensorWrapper& kvt);
121+
friend void from_json(const json& j, KVTensorWrapper& kvt);
122+
111123
private:
112124
std::shared_ptr<kv_db::EmbeddingKVDB> db_;
113125
c10::intrusive_ptr<EmbeddingSnapshotHandleWrapper> snapshot_handle_;
@@ -119,6 +131,26 @@ class KVTensorWrapper : public torch::jit::CustomClassHolder {
119131
int64_t width_offset_;
120132
std::mutex mtx;
121133
c10::intrusive_ptr<RocksdbCheckpointHandleWrapper> checkpoint_handle_;
134+
// Used for initializing a readonly rocksdb instance, that we will use for
135+
// cross process async read
136+
std::shared_ptr<ReadOnlyEmbeddingKVDB> readonly_db_;
137+
// below are variables that are used to hold ReadOnlyEmbeddingKVDB constructor
138+
// arguments, they will be filled up when serialize happens and will be used
139+
// to construct ReadOnlyEmbeddingKVDB instance later after deserialization
140+
//
141+
// we don't do ReadOnlyEmbeddingKVDB construction upon KVTensorWrapper
142+
// construction, because one ReadOnlyEmbeddingKVDB(rdb checkpoint) could store
143+
// table shards for multiple tables, they should share the same underlying
144+
// ReadOnlyEmbeddingKVDB instance to easily manage rdb checkpoint lifetime.
145+
std::vector<std::string> rdb_shard_checkpoint_paths;
146+
std::string tbe_uuid;
147+
int64_t num_shards{};
148+
int64_t num_threads{};
149+
int64_t max_D{};
150+
std::string checkpoint_uuid;
122151
};
123152

153+
void to_json(json& j, const KVTensorWrapper& kvt);
154+
void from_json(const json& j, KVTensorWrapper& kvt);
155+
124156
} // namespace ssd

fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <c10/core/ScalarTypeToTypeMeta.h>
1212
#include <torch/library.h>
1313

14+
#include <nlohmann/json.hpp>
1415
#include <torch/custom_class.h>
1516
#include <mutex>
1617
#include "../dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h"
@@ -324,6 +325,10 @@ CheckpointHandle::CheckpointHandle(
324325
}
325326
}
326327

328+
std::vector<std::string> CheckpointHandle::get_shard_checkpoints() const {
329+
return shard_checkpoints_;
330+
}
331+
327332
EmbeddingSnapshotHandleWrapper::EmbeddingSnapshotHandleWrapper(
328333
const SnapshotHandle* handle,
329334
std::shared_ptr<EmbeddingRocksDB> db)
@@ -377,6 +382,64 @@ KVTensorWrapper::KVTensorWrapper(
377382
checkpoint_handle_ = checkpoint_handle;
378383
}
379384

385+
std::string KVTensorWrapper::serialize() const {
386+
// auto call to_json()
387+
ssd::json json_serialized = *this;
388+
return json_serialized.dump();
389+
}
390+
391+
std::string KVTensorWrapper::logs() const {
392+
std::stringstream ss;
393+
if (db_) {
394+
CHECK(readonly_db_ == nullptr) << "rdb logs, ro_rdb must be nullptr";
395+
ss << "from ckpt paths: " << std::endl;
396+
// Required to cast as the KVTensorWrapper.db_ is a pointer for the
397+
// EmbeddingKVDB class which is inherited by the EmbeddingRocksDB class
398+
auto* db = dynamic_cast<EmbeddingRocksDB*>(db_.get());
399+
auto ckpts = db->get_checkpoints(checkpoint_handle_->uuid);
400+
for (int i = 0; i < ckpts.size(); i++) {
401+
ss << " shard:" << i << ", ckpt_path:" << ckpts[i] << std::endl;
402+
}
403+
ss << " tbe_uuid: " << db->get_tbe_uuid() << std::endl;
404+
ss << " num_shards: " << db->num_shards() << std::endl;
405+
ss << " num_threads: " << db->num_threads() << std::endl;
406+
ss << " max_D: " << db->get_max_D() << std::endl;
407+
ss << " row_offset: " << row_offset_ << std::endl;
408+
ss << " shape: " << shape_ << std::endl;
409+
ss << " dtype: " << static_cast<int64_t>(options_.dtype().toScalarType())
410+
<< std::endl;
411+
ss << " checkpoint_uuid: " << checkpoint_handle_->uuid << std::endl;
412+
} else {
413+
CHECK(readonly_db_) << "ro_rdb logs, ro_rdb must be valid";
414+
ss << "from ckpt paths: " << std::endl;
415+
auto* db = dynamic_cast<ReadOnlyEmbeddingKVDB*>(readonly_db_.get());
416+
auto rdb_shard_checkpoint_paths = db->get_rdb_shard_checkpoint_paths();
417+
for (int i = 0; i < rdb_shard_checkpoint_paths.size(); i++) {
418+
ss << " shard:" << i << ", ckpt_path:" << rdb_shard_checkpoint_paths[i]
419+
<< std::endl;
420+
}
421+
ss << " tbe_uuid: " << db->get_tbe_uuid() << std::endl;
422+
ss << " num_shards: " << db->num_shards() << std::endl;
423+
ss << " num_threads: " << db->num_threads() << std::endl;
424+
ss << " max_D: " << db->get_max_D() << std::endl;
425+
ss << " row_offset: " << row_offset_ << std::endl;
426+
ss << " shape: " << shape_ << std::endl;
427+
ss << " dtype: " << static_cast<int64_t>(options_.dtype().toScalarType())
428+
<< std::endl;
429+
ss << " checkpoint_uuid: " << checkpoint_uuid << std::endl;
430+
}
431+
return ss.str();
432+
}
433+
434+
// Parses the JSON payload produced by serialize() and repopulates this
// wrapper (including constructing its ReadOnlyEmbeddingKVDB) via from_json().
void KVTensorWrapper::deserialize(const std::string& serialized) {
  from_json(ssd::json::parse(serialized), *this);
}
438+
439+
// Rebuilds a KVTensorWrapper from the JSON payload produced by serialize().
// All member initialization (including readonly_db_ construction) is
// delegated to deserialize() / from_json().
KVTensorWrapper::KVTensorWrapper(const std::string& serialized) {
  deserialize(serialized);
}
442+
380443
void KVTensorWrapper::set_embedding_rocks_dp_wrapper(
381444
c10::intrusive_ptr<EmbeddingRocksDBWrapper> db) {
382445
db_ = db->impl_;
@@ -454,6 +517,55 @@ void KVTensorWrapper::set_weights_and_ids(
454517
}
455518
}
456519

520+
// Serializes everything from_json() needs to rebuild a read-only view of
// this tensor in another process: the per-shard rocksdb checkpoint paths,
// the DB construction parameters, and the wrapper's own tensor metadata.
void to_json(ssd::json& j, const KVTensorWrapper& kvt) {
  // Required to cast as the KVTensorWrapper.db_ is a pointer for the
  // EmbeddingKVDB class which is inherited by the EmbeddingRocksDB class.
  // The cast yields nullptr for non-rocksdb backends, so verify before use.
  std::shared_ptr<EmbeddingRocksDB> db =
      std::dynamic_pointer_cast<EmbeddingRocksDB>(kvt.db_);
  CHECK(db) << "to_json requires a writable EmbeddingRocksDB backend";
  CHECK(kvt.checkpoint_handle_)
      << "to_json requires a valid rocksdb checkpoint handle";
  j = ssd::json{
      {"rdb_shard_checkpoint_paths",
       db->get_checkpoints(kvt.checkpoint_handle_->uuid)},
      {"tbe_uuid", db->get_tbe_uuid()},
      {"num_shards", db->num_shards()},
      {"num_threads", db->num_threads()},
      {"max_D", db->get_max_D()},
      {"row_offset", kvt.row_offset_},
      {"shape", kvt.shape_},
      {"dtype", static_cast<int64_t>(kvt.options_.dtype().toScalarType())},
      {"checkpoint_uuid", kvt.checkpoint_handle_->uuid}};
}
537+
538+
// Rebuilds a KVTensorWrapper from the JSON produced by to_json(): extracts
// the ReadOnlyEmbeddingKVDB constructor arguments, constructs the read-only
// DB, then restores the wrapper's tensor metadata. Throws (nlohmann
// json::out_of_range) if any expected key is missing.
void from_json(const ssd::json& j, KVTensorWrapper& kvt) {
  // Brace-init so the locals are never read uninitialized.
  std::vector<std::string> rdb_shard_checkpoint_paths;
  std::string tbe_uuid;
  int64_t num_shards{};
  int64_t num_threads{};
  int64_t max_D{};
  int64_t dtype{};
  j.at("rdb_shard_checkpoint_paths").get_to(rdb_shard_checkpoint_paths);
  j.at("tbe_uuid").get_to(tbe_uuid);
  j.at("num_shards").get_to(num_shards);
  j.at("num_threads").get_to(num_threads);
  j.at("max_D").get_to(max_D);
  j.at("dtype").get_to(dtype);

  // initialize ro rdb during KV tensor deserialization
  // one rdb checkpoint is related to # tables of KVT, this way each KVT will
  // hold their own rdb instance link to the same checkpoint during
  // destruction, they will delete the same checkpoint, but since ckpt path
  // has been opened during ro rdb init, OS will not delete the file until
  // all file handles are closed
  kvt.readonly_db_ = std::make_shared<ReadOnlyEmbeddingKVDB>(
      rdb_shard_checkpoint_paths, tbe_uuid, num_shards, num_threads, max_D);
  j.at("checkpoint_uuid").get_to(kvt.checkpoint_uuid);
  j.at("row_offset").get_to(kvt.row_offset_);
  j.at("shape").get_to(kvt.shape_);
  kvt.options_ = at::TensorOptions()
                     .dtype(static_cast<at::ScalarType>(dtype))
                     .device(at::kCPU)
                     .layout(at::kStrided);
}
568+
457569
at::Tensor KVTensorWrapper::get_weights_by_ids(const at::Tensor& ids) {
458570
CHECK_TRUE(db_ != nullptr);
459571
CHECK_GE(db_->get_max_D(), shape_[1]);

0 commit comments

Comments
 (0)