Extend buffer donation aliasing APIs

rpsilva-aws · rpsilva-aws · commit b6a9371287e0 · 2025-02-20T22:29:09.000Z
diff --git a/test/dynamo/test_dynamo_aliasing.py b/test/dynamo/test_dynamo_aliasing.py
@@ -1,3 +1,4 @@
+import sys
 import unittest
 
 import torch
@@ -14,7 +15,7 @@ def test_hash_with_buffer_donor(self):
     input = torch.randn(5, 5).to(device)
     res = torch.cos(input)
     hash_no_donor = torch_xla._XLAC._get_graph_hash([res])
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, True))
+    self.assertTrue(torch_xla._XLAC._set_buffer_donation([input], True))
     # without the alias_with_buffer_donor_config context, buffer donor will be ignored,
     # so we still expect the hash to be the same.
     hash_with_donor = torch_xla._XLAC._get_graph_hash([res])
@@ -116,7 +117,7 @@ def test_manual_buffer_donation(self):
 
     met.clear_all()
     # input is a device_data, we should be able to set the buffer donation field.
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, True))
+    self.assertTrue(torch_xla._XLAC._set_buffer_donation([input], True))
     # make sure buffer donation setting is correctly updated
     self.assertTrue(torch_xla._XLAC._get_buffer_donation(input))
     self.assertIn('XlaSetBufferDonation', met.counter_names())
@@ -133,7 +134,7 @@ def test_manual_buffer_donation_for_non_inplce_op(self):
 
     met.clear_all()
     # input is a device_data, we should be able to set the buffer donation field.
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, True))
+    self.assertTrue(torch_xla._XLAC._set_buffer_donation([input], True))
     # make sure buffer donation setting is correctly updated
     self.assertTrue(torch_xla._XLAC._get_buffer_donation(input))
     self.assertIn('XlaSetBufferDonation', met.counter_names())
@@ -158,7 +159,7 @@ def dummy_inplace(input):
     xm.mark_step()
     met.clear_all()
     # input is a device_data, we should be able to set the buffer donation field.
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, True))
+    self.assertTrue(torch_xla._XLAC._set_buffer_donation([input], True))
     # make sure buffer donation setting is correctly updated
     self.assertTrue(torch_xla._XLAC._get_buffer_donation(input))
 
@@ -179,7 +180,7 @@ def test_buffer_donation_on_non_data_tensor(self):
 
     met.clear_all()
     # res now points to a `Add` IR, only data's buffer can be aliased
-    self.assertFalse(torch_xla._XLAC._set_buffer_donation(res, True))
+    self.assertFalse(torch_xla._XLAC._set_buffer_donation([res], True))
     self.assertFalse(torch_xla._XLAC._get_buffer_donation(res))
     self.assertNotIn('XlaSetBufferDonation', met.counter_names())
 
@@ -198,12 +199,12 @@ def test_buffer_donation_skip_for_non_dynamo(self):
 
     # We should be able to set buffer donation for input tensor, but when mark_step
     # triggered, the buffer donation should be ignored.
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, True))
+    self.assertTrue(torch_xla._XLAC._set_buffer_donation([input], True))
     res = self.dummy_fn(input)
     xm.mark_step()
     # Make sure that input buffer is not aliased and can be used for other compuations.
     # Also make sure that buffer_donation will not trigger recompilation in non-dynamo.
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, False))
+    self.assertTrue(torch_xla._XLAC._set_buffer_donation([input], False))
     res2 = self.dummy_fn(input)
     xm.mark_step()
     torch.allclose(res.cpu(), res2.cpu())
@@ -212,7 +213,7 @@ def test_buffer_donation_skip_for_non_dynamo(self):
   def test_no_op_mark_step_keep_buffer_donation(self):
     device = xm.xla_device()
     input = torch.randn(5, 5).to(device)
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, True))
+    self.assertTrue(torch_xla._XLAC._set_buffer_donation([input], True))
     xm.mark_step()
     self.assertTrue(torch_xla._XLAC._get_buffer_donation(input))
     xm.mark_step()
diff --git a/test/neuron/run_tests.sh b/test/neuron/run_tests.sh
@@ -218,6 +218,7 @@ function run_xla_op_tests3 {
   run_test "$CDIR/spmd/test_fsdp_v2.py"
   run_test "$CDIR/test_operations_hlo.py" "$@" --verbosity=$VERBOSITY
   run_test "$CDIR/test_input_output_aliases.py"
+  run_test_without_functionalization "$CDIR/test_input_output_aliases.py"
   run_test "$CDIR/test_torch_distributed_xla_backend.py"
   run_torchrun "$CDIR/pjrt/test_torchrun.py"
   run_test "$CDIR/test_persistent_cache.py"
diff --git a/test/run_tests.sh b/test/run_tests.sh
@@ -238,6 +238,7 @@ function run_xla_op_tests3 {
   run_save_tensor_hlo run_test "$CDIR/spmd/test_spmd_lowering_context.py"
   run_test "$CDIR/test_operations_hlo.py" "$@" --verbosity=$VERBOSITY
   run_test "$CDIR/test_input_output_aliases.py"
+  run_test_without_functionalization "$CDIR/test_input_output_aliases.py"
   run_test "$CDIR/test_torch_distributed_xla_backend.py"
   run_torchrun "$CDIR/pjrt/test_torchrun.py"
   run_test "$CDIR/test_persistent_cache.py"
diff --git a/test/test_input_output_aliases.py b/test/test_input_output_aliases.py
@@ -8,9 +8,31 @@
 import torch_xla.core.xla_model as xm
 import torch_xla.debug.metrics as met
 import unittest
+import contextlib
 import copy
 
 
+def create_xla_config_context(set_func, get_func):
+
+  @contextlib.contextmanager
+  def config_context(value):
+    original_value = get_func()
+    set_func(value)
+    try:
+      assert get_func() == value
+      yield
+    finally:
+      set_func(original_value)
+
+  return config_context
+
+
+alias_with_buffer_donor_config_context = create_xla_config_context(
+    torch_xla._XLAC._xla_set_enable_alias_with_buffer_donor_config,
+    torch_xla._XLAC._xla_get_enable_alias_with_buffer_donor_config,
+)
+
+
 # TODO(alanwaketan): add test for views.
 class InputOutputAliasesTest(unittest.TestCase):
 
@@ -210,6 +232,69 @@ def test_device_data_cache_no_aliasing(self):
     # ...if it doesn't crash, the value here would be 44.
     self.assertEqual(t1.item(), 43)
 
+  def test_user_config_donation_with_ltc_donation(self):
+    with alias_with_buffer_donor_config_context(True):
+      met.clear_all()
+      xla_device = xm.xla_device()
+      t0 = torch.randn(4, 2, 2).to(xla_device)
+      t1 = torch.randn(4, 2, 2).to(xla_device)
+      self.assertTrue(torch_xla._XLAC._set_buffer_donation(t0, True))
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+      self.assertFalse(torch_xla._XLAC._get_buffer_donation(t1))
+      t3 = t0 + t1
+      t1 += 2
+      xm.mark_step()
+
+      self.assertEqual(met.metric_data("InputOutputAliasCount")[1], 2.0)
+
+  def test_user_config_donation_with_ltc_donation_overlap(self):
+    with alias_with_buffer_donor_config_context(True):
+      met.clear_all()
+      xla_device = xm.xla_device()
+      t0 = torch.randn(4, 2, 2).to(xla_device)
+      self.assertTrue(torch_xla._XLAC._set_buffer_donation(t0, True))
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+      t0 += 2
+      xm.mark_step()
+
+      self.assertEqual(met.metric_data("InputOutputAliasCount")[1], 1.0)
+
+  def test_user_config_donation(self):
+    with alias_with_buffer_donor_config_context(True):
+      met.clear_all()
+      xla_device = xm.xla_device()
+      t0 = torch.randn(4, 2, 2).to(xla_device)
+      self.assertTrue(torch_xla._XLAC._set_buffer_donation(t0, True))
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+      self.assertIn('XlaSetBufferDonation', met.counter_names())
+      self.assertEqual(met.counter_value('XlaSetBufferDonation'), 1)
+      t1 = t0 + 1
+      torch_xla._XLAC._xla_sync_multi([t0, t1], [str(xla_device)], True, False)
+
+      self.assertEqual(met.metric_data("InputOutputAliasCount")[1], 1.0)
+
+  def test_user_config_donation_inplace_aliasing(self):
+    with alias_with_buffer_donor_config_context(True):
+      met.clear_all()
+      xla_device = xm.xla_device()
+      t0 = torch.randn(4, 2, 2).to(xla_device)
+      self.assertTrue(torch_xla._XLAC._set_buffer_donation(t0, True))
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+      t0 *= 2
+      torch_xla._XLAC._xla_sync_multi([t0], [str(xla_device)], True, False)
+
+      self.assertEqual(met.metric_data("InputOutputAliasCount")[1], 1.0)
+
+  def test_user_config_donation_no_op_mark_step(self):
+    with alias_with_buffer_donor_config_context(True):
+      xla_device = xm.xla_device()
+      t0 = torch.randn(4, 2, 2).to(xla_device)
+      self.assertTrue(torch_xla._XLAC._set_buffer_donation(t0, True))
+      xm.mark_step()
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+      xm.mark_step()
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+
 
 if __name__ == '__main__':
   test = unittest.main()
diff --git a/torch_xla/_dynamo/dynamo_bridge.py b/torch_xla/_dynamo/dynamo_bridge.py
@@ -36,13 +36,13 @@
 
 @contextmanager
 def alias_with_buffer_donor_config(should_alias: bool = True):
-  saved_config = torch_xla._XLAC._xla_get_should_alias_with_buffer_donor_config(
+  saved_config = torch_xla._XLAC._xla_get_enable_alias_with_buffer_donor_config(
   )
-  torch_xla._XLAC._xla_set_should_alias_with_buffer_donor_config(should_alias)
+  torch_xla._XLAC._xla_set_enable_alias_with_buffer_donor_config(should_alias)
   try:
     yield saved_config
   finally:
-    torch_xla._XLAC._xla_set_should_alias_with_buffer_donor_config(saved_config)
+    torch_xla._XLAC._xla_set_enable_alias_with_buffer_donor_config(saved_config)
 
 
 @dataclasses.dataclass
diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
@@ -1906,14 +1906,15 @@ void InitXlaModuleBindings(py::module m) {
       [](const std::string& device) { return GetRngSeed(device); },
       py::arg("device") = "");
   m.def(
-      "_xla_set_should_alias_with_buffer_donor_config",
-      [](bool should_alias, const std::string& device_str) {
+      "_xla_set_enable_alias_with_buffer_donor_config",
+      [](bool enable_user_config_alias, const std::string& device_str) {
         torch::lazy::BackendDevice device = GetDeviceOrCurrent(device_str);
-        XLAGraphExecutor::Get()->SetAliasWithBufferDonorConfig(should_alias);
+        XLAGraphExecutor::Get()->SetAliasWithBufferDonorConfig(
+            enable_user_config_alias);
       },
-      py::arg("should_alias") = false, py::arg("device") = "");
+      py::arg("enable_user_config_alias") = false, py::arg("device") = "");
   m.def(
-      "_xla_get_should_alias_with_buffer_donor_config",
+      "_xla_get_enable_alias_with_buffer_donor_config",
       [](const std::string& device_str) {
         torch::lazy::BackendDevice device = GetDeviceOrCurrent(device_str);
         return XLAGraphExecutor::Get()->GetAliasWithBufferDonorConfig();
@@ -2737,19 +2738,19 @@ void InitXlaModuleBindings(py::module m) {
 
   // This api will set the `should_donate_buffer_` field in the
   // ComputationClient::Data. This api is currently only useful if you are
-  // running with `torch.compile`. Buffer assocaited with data with
-  // `should_donate_buffer_` set to true will be donated to the output, You
-  // should only use this api if
-  // 1. You are using torch.compile
-  // 2. You will inplace update a tensor in the `torch.compiled` function(so the
-  //    currnet buffer can be donated after compuation)
+  // running with `torch.compile`. A buffer associated with data that has
+  // `should_donate_buffer_` set to true will be donated to the output. This
+  // can be used if:
+  // 1. You are using torch.compile, and there is an inplace update of a tensor
+  //    so that the current buffer can be donated after computation.
+  // 2. You want to explicitly donate a tensor because it is not necessary
+  //    after the current computation.
+  // Note that a buffer cannot be used after it has been donated.
   m.def("_set_buffer_donation",
-        [](at::Tensor& input, bool should_donate) -> bool {
-          XLATensorPtr xtensor = bridge::GetXlaTensor(input);
+        [](at::Tensor& tensor, bool should_donate) -> bool {
+          XLATensorPtr xtensor = bridge::GetXlaTensor(tensor);
           bool buffer_donation_updated = false;
-          if (!xtensor) {
-            // input tensor is not a XLATensor, return here.
-          } else if (xtensor->CurrentDataHandle() != nullptr) {
+          if (xtensor->CurrentDataHandle() != nullptr) {
             auto data =
                 std::dynamic_pointer_cast<runtime::ComputationClient::Data>(
                     xtensor->CurrentDataHandle());
diff --git a/torch_xla/csrc/xla_graph_executor.cpp b/torch_xla/csrc/xla_graph_executor.cpp
@@ -366,8 +366,8 @@ torch::lazy::BackendDataPtr XLAGraphExecutor::GetBaseSeedData(
   return DeviceContextArena::Get()->GetBaseSeedData(device);
 }
 
-void XLAGraphExecutor::SetAliasWithBufferDonorConfig(bool should_alias) {
-  DeviceContextArena::Get()->SetAliasWithBufferDonorConfig(should_alias);
+void XLAGraphExecutor::SetAliasWithBufferDonorConfig(bool enable_alias) {
+  DeviceContextArena::Get()->SetAliasWithBufferDonorConfig(enable_alias);
 }
 
 bool XLAGraphExecutor::GetAliasWithBufferDonorConfig() {
@@ -1290,53 +1290,73 @@ std::vector<size_t> XLAGraphExecutor::GetBufferDonors(
   static const bool enable_aliasing =
       runtime::sys_util::GetEnvBool("XLA_ENABLE_PARAM_ALIASING", true);
   static const bool use_autosharding = ShardingUtil::GetAutoSharding();
-
-  std::vector<size_t> buffer_donor_indices;
   // TODO(yeounoh) enable aliasing is disabled for partitioned computation,
   // since the current aliasing compares the unpartitioned input and output
   // shapes which can lead to an incorrect aliasing pairs if sharded.
-  if (enable_aliasing && !use_autosharding) {
-    if (coll.config.sync_ltc_data && coll.config.force_ltc_data) {
-      // We can only alias at the step barrier, when force_ltc_data is true.
-      // Consider the case:
-      //   1. Tensor A(DEVICE_DATA)
-      //   2. Tensor B = A + 0.9
-      //   3. A += 0.4
-      // If we activate aliasing for A's graph, and we do:
-      //   print(A)
-      //   print(A)
-      // The first print will update DEVICE_DATA' with DEVICE_DATA+0.4, and the
-      // second print will again update DEVICE_DATA" with DEVICE_DATA'+0.4,
-      // which will lead to incorrect results. We cannot normally turn A's state
-      // into DEVICE_DATA, as if any of the sources is a view, this will not
-      // lead to correct results (as A's value taken at different times need to
-      // reflect view source changes):
-      //   1. Tensor A = some_graph_with_view_source(V)
-      //   2. print(A)
-      //   3. V += 1
-      //   4. print(A)
-      // The second print should reflect the new value due to V's changes.
-      // Also in the first example, unless we are doing a step barrier and hence
-      // include all live tensors, if the B value is not part of the graph, it
-      // will later fetch the new value of A, which is incorrect.
-      // But, when we issue a step barrier (force_ltc_data == true) we have to
-      // turn everything into DEVICE_DATA, so we can activate aliasing.
-      buffer_donor_indices = GetBufferDonorIndexForStepMarker(
-          tensors, coll.indices, parameters_data);
-    } else if (GetAliasWithBufferDonorConfig()) {
-      // only alias based on buffer donor if LTC can't auto infer the input
-      // output aliasing.
-      buffer_donor_indices = GetBufferDonorIndexFromUserConfig(parameters_data);
-    }
+  if (use_autosharding) {
+    return {};
   }
+
+  if (!enable_aliasing) {
+    return {};
+  }
+
+  std::vector<size_t> ltc_buffer_donor_indices;
+  if (coll.config.sync_ltc_data && coll.config.force_ltc_data) {
+    // We can only alias at the step barrier, when force_ltc_data is true.
+    // Consider the case:
+    //   1. Tensor A(DEVICE_DATA)
+    //   2. Tensor B = A + 0.9
+    //   3. A += 0.4
+    // If we activate aliasing for A's graph, and we do:
+    //   print(A)
+    //   print(A)
+    // The first print will update DEVICE_DATA' with DEVICE_DATA+0.4, and the
+    // second print will again update DEVICE_DATA" with DEVICE_DATA'+0.4,
+    // which will lead to incorrect results. We cannot normally turn A's state
+    // into DEVICE_DATA, as if any of the sources is a view, this will not
+    // lead to correct results (as A's value taken at different times need to
+    // reflect view source changes):
+    //   1. Tensor A = some_graph_with_view_source(V)
+    //   2. print(A)
+    //   3. V += 1
+    //   4. print(A)
+    // The second print should reflect the new value due to V's changes.
+    // Also in the first example, unless we are doing a step barrier and hence
+    // include all live tensors, if the B value is not part of the graph, it
+    // will later fetch the new value of A, which is incorrect.
+    // But, when we issue a step barrier (force_ltc_data == true) we have to
+    // turn everything into DEVICE_DATA, so we can activate aliasing.
+    ltc_buffer_donor_indices = GetBufferDonorIndexForStepMarker(
+        tensors, coll.indices, parameters_data);
+  }
+
+  std::vector<size_t> user_config_buffer_donor_indices;
+  if (GetAliasWithBufferDonorConfig()) {
+    user_config_buffer_donor_indices =
+        GetBufferDonorIndexFromUserConfig(parameters_data);
+  }
+
+  // Both the LTC and the user config buffer donor index vectors are already
+  // sorted. To keep the hash deterministic across runs when LTC-inferred
+  // and user-specified donor indices interleave, merge the two vectors with a
+  // sorted union, which preserves the ordering and removes any duplicate
+  // indices.
+  std::vector<size_t> buffer_donor_indices;
+  buffer_donor_indices.reserve(
+      std::max(ltc_buffer_donor_indices.size(),
+               user_config_buffer_donor_indices.size()));
+  std::set_union(ltc_buffer_donor_indices.cbegin(),
+                 ltc_buffer_donor_indices.cend(),
+                 user_config_buffer_donor_indices.cbegin(),
+                 user_config_buffer_donor_indices.cend(),
+                 std::back_inserter(buffer_donor_indices));
   return buffer_donor_indices;
 }
 
 void XLAGraphExecutor::SetBufferDonors(
     LoweringContext* lowering_ctx,
     const std::vector<size_t>& buffer_donor_indexs) {
-  const std::vector<torch::lazy::BackendDataPtr>& parameters_data =
-      lowering_ctx->GetParametersData();
   for (size_t i : buffer_donor_indexs) {
     lowering_ctx->builder()->AddBufferDonor(/*param_number=*/i,
                                             /*param_index=*/{});
diff --git a/torch_xla/csrc/xla_graph_executor.h b/torch_xla/csrc/xla_graph_executor.h