Commit 4180ef9

Merge branch 'master' into manually_register_einsum_XLA
2 parents: 7d3dd38 + 9b61c1a

17 files changed: +217 -34 lines

WORKSPACE

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ new_local_repository(
 
 # To build PyTorch/XLA with OpenXLA to a new revision, update following xla_hash to
 # the openxla git commit hash.
-xla_hash = 'd091218ab839d35c541e9c683767b7d8034cadf8'
+xla_hash = '0622372b580e16fd84930c2f6a184a7559428309'
 
 http_archive(
     name = "xla",

setup.py

Lines changed: 4 additions & 6 deletions
@@ -66,12 +66,10 @@
 
 USE_NIGHTLY = True  # whether to use nightly or stable libtpu and jax
 
-_date = '20250210'
-
-# Note: jax/jaxlib 20250115 build will fail. Check https://github.com/pytorch/xla/pull/8621#issuecomment-2616564634 for more details.
-_libtpu_version = '0.0.10'
-_jax_version = '0.5.1'
-_jaxlib_version = '0.5.1'
+_date = '20250303'
+_libtpu_version = '0.0.11'
+_jax_version = '0.5.2'
+_jaxlib_version = '0.5.2'
 
 _libtpu_wheel_name = f'libtpu-{_libtpu_version}'
 _libtpu_storage_directory = 'libtpu-lts-releases'

test/run_tests.sh

Lines changed: 1 addition & 0 deletions
@@ -199,6 +199,7 @@ function run_xla_op_tests2 {
   run_test "$CDIR/scan/test_scan_layers.py"
   run_test "$CDIR/test_gru.py"
   run_test "$CDIR/test_as_stride_use_slice.py"
+  run_test "$CDIR/test_placeholder.py"
   run_xla_hlo_debug run_test "$CDIR/scan/test_scan_debug.py"
   run_test "$CDIR/test_autocast.py"
   run_test "$CDIR/eager/test_eager.py"

test/scan/test_scan.py

Lines changed: 33 additions & 0 deletions
@@ -613,6 +613,39 @@ def compute_outputs_and_gradients(carry, x):
     self.compare_pytree(grad_init, expected_grads['init'])
     self.compare_pytree(grad_x, expected_grads['x'])
 
+  def test_scan_tracing_does_not_allocate_device_memory(self):
+    """
+    When scan is tracing the function to obtain an HLO, it should not allocate
+    device memory.
+    """
+
+    def fn1(carry, x):
+      carry = torch.sin(carry)
+      x = torch.sin(x)
+      return carry, x
+
+    def fn2(carry, x):
+      """
+      Test cases where input/outputs are aliased.
+      """
+      return carry, x
+
+    for fn in [fn1, fn2]:
+      init = torch.tensor([0.0, 0.0], requires_grad=True, device=self.device)
+      xs = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
+                        requires_grad=True,
+                        device=self.device)
+      torch_xla.sync(wait=True)
+      met.clear_all()
+      self.assertFalse(met.metric_data("TransferToDeviceTime"))
+      # Use `scan` to lower `fn` into HLO and run it. Doing so should not
+      # transfer anything from host to device since `init` and `xs` are
+      # already on the device.
+      # In practice, `carry` and `x` will be placeholder tensors in `fn`.
+      _ = scan(fn, init, xs)
+      torch_xla.sync(wait=True)
+      self.assertFalse(met.metric_data("TransferToDeviceTime"))
+
 
 if __name__ == '__main__':
   test = unittest.main()
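
For reference, the property this new test asserts can be reproduced standalone: once `init` and `xs` already live on the XLA device, lowering a step function through `scan` should leave the `TransferToDeviceTime` metric unset. A minimal sketch, assuming `scan` is importable from `torch_xla.experimental.scan` (the import is not shown in this hunk):

import torch
import torch_xla
import torch_xla.debug.metrics as met
from torch_xla.experimental.scan import scan  # assumed import path

def step(carry, x):
  # Same shape-preserving body as fn1 in the test above.
  return torch.sin(carry), torch.sin(x)

device = torch_xla.device()
init = torch.tensor([0.0, 0.0], requires_grad=True, device=device)
xs = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
                  requires_grad=True,
                  device=device)
torch_xla.sync(wait=True)  # flush the uploads of init/xs before measuring
met.clear_all()

final_carry, ys = scan(step, init, xs)  # tracing sees placeholder tensors
torch_xla.sync(wait=True)
assert not met.metric_data("TransferToDeviceTime")  # nothing re-uploaded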

test/spmd/test_train_spmd_linear_model.py

Lines changed: 18 additions & 2 deletions
@@ -74,10 +74,26 @@ def test_gradient_accumulation_matches(self):
     # Verify that the model losses are not zero, and that the runs match.
     assert all(loss != 0 for loss in baseline_grad_acc_losses)
     assert all(
-        torch.allclose(baseline_loss, checkpointing_loss, rtol=1e-4, atol=1e-8)
-        for baseline_loss, checkpointing_loss in zip(baseline_grad_acc_losses,
+        torch.allclose(baseline_loss, loop_grad_acc_loss, rtol=1e-4, atol=1e-8)
+        for baseline_loss, loop_grad_acc_loss in zip(baseline_grad_acc_losses,
                                                      loop_grad_acc_losses))
 
+    if not SKIP_GRADIENT_CHECKPOINTING:
+      print('Training loop with XLA\'s `While` gradient accumulation and '
+            'gradient checkpointing.')
+      with extended_argv(
+          COMMON_GRAD_ACC_ARGS +
+          ["--use_gradient_accumulation_loop", "--use_gradient_checkpointing"]):
+        loop_grad_acc_grad_chkpt_losses = train_and_evaluate_grad_acc()
+      assert all(
+          torch.allclose(
+              baseline_loss,
+              loop_grad_acc_grad_chkpt_loss,
+              rtol=1e-4,
+              atol=1e-8)
+          for baseline_loss, loop_grad_acc_grad_chkpt_loss in zip(
+              baseline_grad_acc_losses, loop_grad_acc_grad_chkpt_losses))
+
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()

test/test_placeholder.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+from absl.testing import absltest
+import torch
+import torch_xla
+from torch_xla.core.xla_builder import create_placeholder_tensor
+import torch_xla.debug.metrics as met
+import re
+
+
+class TestPlaceholder(absltest.TestCase):
+
+  def setUp(self):
+    super().setUp()
+    torch_xla._XLAC._xla_set_enable_alias_with_buffer_donor_config(True)
+
+  def test_create_placeholder(self):
+    for shape, dtype in zip(
+        ((1, 2), (2, 3, 4), (3, 4, 5, 6)),
+        (torch.float32, torch.bfloat16, torch.int8),
+    ):
+      p = create_placeholder_tensor(shape, dtype)
+      assert isinstance(p, torch.Tensor)
+      assert p.device == torch_xla.device()
+      self.assertEqual(p.dtype, dtype)
+      self.assertEqual(p.shape, shape)
+      self.assertTrue(torch_xla._XLAC._is_placecholder(p))
+
+  def test_read_value_crashes(self):
+    p = create_placeholder_tensor((1,), torch.bfloat16)
+    with self.assertRaises(RuntimeError):
+      p.cpu()
+
+  def test_trace_graph(self):
+    met.clear_all()
+    self.assertFalse(met.metric_data("TransferToDeviceTime"))
+
+    p1 = create_placeholder_tensor((2, 3), torch.bfloat16)
+    a = torch.sin(p1)
+    p2 = create_placeholder_tensor((3, 4), torch.bfloat16)
+    # We use p1 once and p2 twice. But the graph should still only have two parameters.
+    b = (a @ p2) @ p2.T
+    ir: str = torch_xla._XLAC._get_xla_tensors_text([b])
+    self.assertEqual(ir.count("xla::device_data()"), 2)
+    self.assertEqual(ir.count("bf16[3,4]{1,0} xla::device_data()"), 1)
+    self.assertEqual(ir.count("bf16[2,3]{1,0} xla::device_data()"), 1)
+    hlo: str = torch_xla._XLAC._get_xla_tensors_hlo([b])
+    regex = r'\(p.*: bf16\[3,4\], p.*: bf16\[2,3\]\) -> \(bf16\[2,3\]\)'
+    assert re.search(regex, hlo) is not None
+
+    # There should be no buffers transferred to the device during tracing
+    self.assertFalse(met.metric_data("TransferToDeviceTime"))
+
+  def test_placeholder_handle_unique(self):
+    p1 = create_placeholder_tensor((1,), torch.bfloat16)
+    p2 = create_placeholder_tensor((1,), torch.bfloat16)
+    h1, h2 = torch_xla._XLAC._get_tensors_handle([p1, p2])
+    self.assertNotEqual(h1, h2)
+
+  def test_cannot_get_handle_from_deleted_pjrt_buffer(self):
+    xla_device = torch_xla.device()
+    t0 = torch.randn(4, 2, 2).to(xla_device)
+    t1 = torch.randn(4, 2, 2).to(xla_device)
+    self.assertTrue(torch_xla._XLAC._set_buffer_donation(t0, True))
+    self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+    _ = t0 + t1
+    torch_xla.sync(wait=True)
+
+    self.assertTrue(torch_xla._XLAC._is_placecholder(t0))
+    with self.assertRaises(RuntimeError, msg='is deleted'):
+      torch_xla._XLAC._get_tensors_handle([t0])
+
+
+if __name__ == "__main__":
+  absltest.main()

test/test_python_ops.py

Lines changed: 4 additions & 0 deletions
@@ -27,6 +27,10 @@ def test_put(self, dtype):
     if dtype in self.unsupported_dtypes:
       raise unittest.SkipTest("Dtype {0} is unsupported by XLA".format(
          str(dtype)))
+    if dtype == torch.uint8:
+      raise unittest.SkipTest(
+          'TODO(https://github.com/pytorch/xla/issues/8799): Re-enable uint8 test'
+      )
 
     device = xm.xla_device()
     real_device_type = xm.xla_device_hw(str(xm.xla_device()))

torch_xla/core/xla_builder.py

Lines changed: 33 additions & 0 deletions
@@ -40,6 +40,24 @@ class Type:
     Type.PRED: torch.bool,
 }
 
+_PT_XLA_TYPE_MAP = {
+    torch.float32: Type.F32,
+    torch.float64: Type.F64,
+    torch.bfloat16: Type.BF16,
+    torch.float16: Type.F16,
+    torch.uint8: Type.U8,
+    torch.int8: Type.S8,
+    torch.uint16: Type.U16,
+    torch.int16: Type.S16,
+    torch.uint32: Type.U32,
+    torch.int32: Type.S32,
+    torch.uint64: Type.U64,
+    torch.int64: Type.S64,
+    torch.complex64: Type.C64,
+    torch.complex128: Type.C128,
+    torch.bool: Type.PRED,
+}
+
 
 class Shape(object):
   """Wraps a core XLA shape object to provide a more friendly API."""
@@ -751,6 +769,10 @@ def map(cls, ops, computation, dimensions, static_operands=(), builder=None):
   def to_torch_type(cls, dtype):
     return _XLA_PT_TYPE_MAP[dtype] if dtype else torch.float32
 
+  @classmethod
+  def from_torch_type(cls, dtype):
+    return _PT_XLA_TYPE_MAP[dtype]
+
 
 def create_builder(name):
   return torch_xla._XLAC._xla_op_create_builder(name)
@@ -846,3 +868,14 @@ def fn_flattened_inputs(*flattened):
   if isinstance(result, list) and len(result) == 1:
     return result[0]
   return result
+
+
+def create_placeholder_tensor(shape, dtype):
+  """
+  Creates a placeholder tensor that does not hold any device buffer.
+  This is primarily useful for staging out the HLO of a user computation.
+  Accessing the value of the tensor will panic.
+  """
+  dtype = Op.from_torch_type(dtype)
+  shape = mkshape(dtype, shape)
+  return torch_xla._XLAC._xla_create_placeholder_tensor(shape.shape)
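
A short usage sketch of the helper added above, drawn from the new test/test_placeholder.py in this commit (not part of this file's diff): placeholders participate in lazy tracing and HLO dumping without ever allocating a device buffer, and only materializing their values fails.

import torch
import torch_xla
import torch_xla.debug.metrics as met
from torch_xla.core.xla_builder import create_placeholder_tensor

met.clear_all()
p1 = create_placeholder_tensor((2, 3), torch.bfloat16)
p2 = create_placeholder_tensor((3, 4), torch.bfloat16)
b = torch.sin(p1) @ p2  # lazy ops trace normally on placeholders
hlo = torch_xla._XLAC._get_xla_tensors_hlo([b])  # two bf16 parameters in the HLO
assert not met.metric_data("TransferToDeviceTime")  # nothing was uploaded
# b.cpu() would raise, since a placeholder holds no device buffer.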

torch_xla/csrc/init_python_bindings.cpp

Lines changed: 10 additions & 0 deletions
@@ -1866,6 +1866,16 @@ void InitXlaModuleBindings(py::module m) {
   });
   m.def("_xla_optimization_barrier_",
        [](std::vector<at::Tensor>& inputs) { OptimizationBarrier_(inputs); });
+  // Creates a placeholder tensor that does not hold any device buffer.
+  // This is primarily useful for staging out the HLO of a user computation.
+  // Accessing the value of the tensor will panic.
+  m.def("_xla_create_placeholder_tensor", [](py::object py_shape) {
+    xla::Shape shape = op_builder::PyShapeToShape(py_shape);
+    auto xla_tensor = XLATensor::Create(
+        torch_xla::runtime::GetComputationClient()->CreateDataPlaceholder(
+            bridge::GetCurrentDevice().toString(), std::move(shape)));
+    return bridge::AtenFromXlaTensor(xla_tensor);
+  });
   m.def("_xla_set_default_device", [](const std::string& device) {
     return SetCurrentThreadDevice(device);
   });

torch_xla/csrc/ops/embedding_bag.cpp

Lines changed: 2 additions & 3 deletions
@@ -116,8 +116,7 @@ std::vector<xla::XlaOp> BuildEmbeddingBag(xla::XlaOp weight, xla::XlaOp indices,
   // Create a While node with computations for the condition and the body.
   auto init_tuple = xla::Tuple(
       offsets.builder(),
-      {xla::Reshape(start, {0}, {}), xla::Reshape(end, {0}, {}),
-       embeddings_weighted,
+      {xla::Reshape(start, {}), xla::Reshape(end, {}), embeddings_weighted,
        xla::ConvertElementType(
            xla::ConstantFromArray<float>(offsets.builder(), initial_vector),
            weight_shape.element_type())});
@@ -189,4 +188,4 @@ XlaOpVector EmbeddingBag::Lower(LoweringContext* loctx) const {
   return ReturnOps(absl::MakeSpan(ops), loctx);
 }
 
-}  // namespace torch_xla
+}  // namespace torch_xla

torch_xla/csrc/runtime/ifrt_computation_client.cc

Lines changed: 9 additions & 9 deletions
@@ -25,7 +25,9 @@
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/python/ifrt/attribute_map.h"
+#include "xla/python/ifrt/basic_device_list.h"
 #include "xla/python/ifrt/compiler.h"
+#include "xla/python/ifrt/device_list.h"
 #include "xla/python/ifrt/memory.h"
 #include "xla/python/ifrt/sharding.h"
 #include "xla/python/pjrt_ifrt/pjrt_array.h"
@@ -76,7 +78,7 @@ torch::lazy::hash_t hash_comp_env(
     ifrt_devices.push_back(device);
   }
 
-  tsl::RCReference<xla::ifrt::DeviceList> device_list =
+  xla::ifrt::DeviceListRef device_list =
       xla::ifrt::BasicDeviceList::Create(std::move(ifrt_devices));
 
   auto topology_desc = client->GetTopologyForDevices(device_list);
@@ -235,10 +237,9 @@ ComputationClient::DataPtr IfrtComputationClient::WrapDataShards(
     shard_shapes.push_back(ifrt_shard->buffer->shape());
   }
   xla::ifrt::Shape ifrt_shape(shape.dimensions());
-  tsl::RCReference<xla::ifrt::DeviceList> devices_list =
-      xla::ifrt::BasicDeviceList::Create(
-          {client_->addressable_devices().begin(),
-           client_->addressable_devices().end()});
+  xla::ifrt::DeviceListRef devices_list = xla::ifrt::BasicDeviceList::Create(
+      {client_->addressable_devices().begin(),
+       client_->addressable_devices().end()});
 
   XLA_CHECK_EQ(shard_shapes.size(), devices_list->size());
   std::unique_ptr<xla::ifrt::Sharding> ifrt_sharding =
@@ -324,10 +325,9 @@ ComputationClient::DataPtr IfrtComputationClient::TransferShardsToDevice(
     shard_shapes.push_back(ifrt_shard->buffer->shape());
   }
   xla::ifrt::Shape ifrt_shape(shape.dimensions());
-  tsl::RCReference<xla::ifrt::DeviceList> devices_list =
-      xla::ifrt::BasicDeviceList::Create(
-          {client_->addressable_devices().begin(),
-           client_->addressable_devices().end()});
+  xla::ifrt::DeviceListRef devices_list = xla::ifrt::BasicDeviceList::Create(
+      {client_->addressable_devices().begin(),
+       client_->addressable_devices().end()});
   std::unique_ptr<xla::ifrt::Sharding> ifrt_sharding =
       xla::ifrt::ConcreteSharding::Create(devices_list, xla::ifrt::MemoryKind(),
                                           ifrt_shape, shard_shapes);

torch_xla/csrc/runtime/ifrt_computation_client.h

Lines changed: 5 additions & 3 deletions
@@ -203,9 +203,11 @@ class IfrtComputationClient : public ComputationClient {
           sharding_(sharding) {}
 
     Handle GetHandle() override {
-      XLA_CHECK(HasValue())
-          << "buffer with shape " << shape().ToString() << " on device "
-          << device() << (buffer == nullptr ? " is null" : " is deleted");
+      // If the data is a placeholder, use the address of this object as the
+      // handle.
+      if (buffer == nullptr) {
+        return reinterpret_cast<std::uintptr_t>(this);
+      }
       return reinterpret_cast<std::uintptr_t>(buffer.get());
     };
    void Assign(const torch::lazy::BackendData& data) override;

torch_xla/csrc/runtime/pjrt_computation_client.h

Lines changed: 8 additions & 2 deletions
@@ -191,9 +191,15 @@ class PjRtComputationClient : public ComputationClient {
           buffer(buffer) {}
 
    Handle GetHandle() override {
-      XLA_CHECK(HasValue())
+      // If the data is a placeholder, use the address of this object as the
+      // handle.
+      if (buffer == nullptr) {
+        return reinterpret_cast<std::uintptr_t>(this);
+      }
+
+      XLA_CHECK(!buffer->IsDeleted())
          << "buffer with shape " << shape().ToString() << " on device "
-          << device() << (buffer == nullptr ? " is null" : " is deleted");
+          << device() << " is deleted";
      return reinterpret_cast<std::uintptr_t>(buffer.get());
    };
    void Assign(const torch::lazy::BackendData& data) override;
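
The user-visible effect of the two GetHandle() changes is mirrored in the new test/test_placeholder.py; a sketch of that behavior (not part of this diff): placeholder data now yields a stable per-object handle instead of crashing, while a donated and therefore deleted PjRt buffer still raises.

import torch
import torch_xla
from torch_xla.core.xla_builder import create_placeholder_tensor

torch_xla._XLAC._xla_set_enable_alias_with_buffer_donor_config(True)

p1 = create_placeholder_tensor((1,), torch.bfloat16)
p2 = create_placeholder_tensor((1,), torch.bfloat16)
h1, h2 = torch_xla._XLAC._get_tensors_handle([p1, p2])
assert h1 != h2  # each placeholder is keyed by its own Data object address

t0 = torch.randn(4, 2, 2).to(torch_xla.device())
t1 = torch.randn(4, 2, 2).to(torch_xla.device())
torch_xla._XLAC._set_buffer_donation(t0, True)
_ = t0 + t1
torch_xla.sync(wait=True)  # donation leaves t0 backed by a deleted buffer
try:
  torch_xla._XLAC._get_tensors_handle([t0])
except RuntimeError as err:
  assert "is deleted" in str(err)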

torch_xla/csrc/runtime/xla_util.cc

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ absl::StatusOr<std::string> GetComputationHloText(
     const xla::XlaComputation& computation) {
   TF_ASSIGN_OR_RETURN(auto hlo_module,
                       CreateModuleFromProto(computation.proto()));
-  return hlo_module->ToString();
+  return hlo_module->ToString(xla::HloPrintOptions());
 }
 
 void ReportComputationError(

torch_xla/csrc/xla_op_builder.cpp

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ xla::XlaOp Reshape(const BuilderPtr& builder,
       ArgOptional<py::tuple>(args, "dimensions");
   if (arg_dimensions) {
     std::vector<int64_t> dimensions = GetTupleVector<int64_t>(*arg_dimensions);
-    return xla::Reshape(operands.at(0)->op, dimensions, sizes);
+    return xla::Reshape(xla::Transpose(operands.at(0)->op, dimensions), sizes);
   }
   int64_t inferred_dimension =
       ArgOrDefault<int64_t>(args, "inferred_dimension", -1);
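
For clarity: the removed xla::Reshape overload reordered the operand's dimensions before reshaping, and the replacement spells out that same composition as an explicit Transpose followed by a plain Reshape, presumably because the pinned OpenXLA revision no longer provides the three-argument form. A rough torch analogue of the builder's "dimensions" semantics (illustrative only):

import torch

x = torch.arange(6).reshape(2, 3)
dimensions, sizes = (1, 0), (3, 2)
# Old Reshape(op, dimensions, sizes): permute by `dimensions`, then reshape to `sizes`.
y = x.permute(dimensions).reshape(sizes)
assert y.shape == torch.Size(sizes)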
