Commit 03e26ac

Extend buffer donation aliasing APIs
1 parent 6016023 commit 03e26ac

11 files changed: +291 −98
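At a glance, the commit does three things: it renames the buffer-donor config bindings, adds an on/off switch for parameter aliasing, and batches `_set_buffer_donation` so it takes a list of tensors and returns one success flag per tensor. A minimal before/after sketch of the batched call, assuming an XLA-backed tensor `t` (illustrative only, not part of the diff):

import torch
import torch_xla
import torch_xla.core.xla_model as xm

t = torch.randn(4, 4).to(xm.xla_device())

# Old shape (removed in this commit): one tensor in, one bool out.
# ok = torch_xla._XLAC._set_buffer_donation(t, True)

# New shape: a list of tensors in, a list of per-tensor bools out.
flags = torch_xla._XLAC._set_buffer_donation([t], True)
assert all(flags)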

test/dynamo/test_dynamo_aliasing.py (9 additions, 8 deletions)
@@ -1,3 +1,4 @@
+import sys
 import unittest
 
 import torch
@@ -14,7 +15,7 @@ def test_hash_with_buffer_donor(self):
     input = torch.randn(5, 5).to(device)
     res = torch.cos(input)
     hash_no_donor = torch_xla._XLAC._get_graph_hash([res])
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, True))
+    self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([input], True)))
     # without the alias_with_buffer_donor_config context, buffer donor will be ignored,
     # so we still expect the hash to be the same.
     hash_with_donor = torch_xla._XLAC._get_graph_hash([res])
@@ -116,7 +117,7 @@ def test_manual_buffer_donation(self):
 
     met.clear_all()
     # input is a device_data, we should be able to set the buffer donation field.
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, True))
+    self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([input], True)))
     # make sure buffer donation setting is correctly updated
     self.assertTrue(torch_xla._XLAC._get_buffer_donation(input))
     self.assertIn('XlaSetBufferDonation', met.counter_names())
@@ -133,7 +134,7 @@ def test_manual_buffer_donation_for_non_inplce_op(self):
 
     met.clear_all()
     # input is a device_data, we should be able to set the buffer donation field.
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, True))
+    self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([input], True)))
     # make sure buffer donation setting is correctly updated
     self.assertTrue(torch_xla._XLAC._get_buffer_donation(input))
     self.assertIn('XlaSetBufferDonation', met.counter_names())
@@ -158,7 +159,7 @@ def dummy_inplace(input):
     xm.mark_step()
     met.clear_all()
     # input is a device_data, we should be able to set the buffer donation field.
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, True))
+    self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([input], True)))
     # make sure buffer donation setting is correctly updated
     self.assertTrue(torch_xla._XLAC._get_buffer_donation(input))
 
@@ -179,7 +180,7 @@ def test_buffer_donation_on_non_data_tensor(self):
 
     met.clear_all()
     # res now points to a `Add` IR, only data's buffer can be aliased
-    self.assertFalse(torch_xla._XLAC._set_buffer_donation(res, True))
+    self.assertFalse(all(torch_xla._XLAC._set_buffer_donation([res], True)))
     self.assertFalse(torch_xla._XLAC._get_buffer_donation(res))
     self.assertNotIn('XlaSetBufferDonation', met.counter_names())
 
@@ -198,12 +199,12 @@ def test_buffer_donation_skip_for_non_dynamo(self):
 
     # We should be able to set buffer donation for input tensor, but when mark_step
     # triggered, the buffer donation should be ignored.
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, True))
+    self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([input], True)))
     res = self.dummy_fn(input)
     xm.mark_step()
     # Make sure that input buffer is not aliased and can be used for other compuations.
    # Also make sure that buffer_donation will not trigger recompilation in non-dynamo.
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, False))
+    self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([input], False)))
     res2 = self.dummy_fn(input)
     xm.mark_step()
     torch.allclose(res.cpu(), res2.cpu())
@@ -212,7 +213,7 @@ def test_buffer_donation_skip_for_non_dynamo(self):
   def test_no_op_mark_step_keep_buffer_donation(self):
     device = xm.xla_device()
     input = torch.randn(5, 5).to(device)
-    self.assertTrue(torch_xla._XLAC._set_buffer_donation(input, True))
+    self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([input], True)))
     xm.mark_step()
     self.assertTrue(torch_xla._XLAC._get_buffer_donation(input))
     xm.mark_step()
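Every assertion above gains an `all(...)` wrapper because the binding now reports success per tensor rather than as a single bool. A short sketch of the per-tensor semantics implied by these tests, with hypothetical tensor names (`data_tensor`, `ir_tensor`):

import torch
import torch_xla
import torch_xla.core.xla_model as xm

device = xm.xla_device()
data_tensor = torch.randn(2, 2).to(device)  # backed by device data
ir_tensor = data_tensor + 1  # backed by an `Add` IR node, not donatable

flags = torch_xla._XLAC._set_buffer_donation([data_tensor, ir_tensor], True)
# Expect [True, False]: donation can only be recorded on device-data tensors.
for t, ok in zip([data_tensor, ir_tensor], flags):
  print(ok, torch_xla._XLAC._get_buffer_donation(t))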

test/neuron/run_tests.sh (1 addition, 0 deletions)
@@ -218,6 +218,7 @@ function run_xla_op_tests3 {
   run_test "$CDIR/spmd/test_fsdp_v2.py"
   run_test "$CDIR/test_operations_hlo.py" "$@" --verbosity=$VERBOSITY
   run_test "$CDIR/test_input_output_aliases.py"
+  run_test_without_functionalization "$CDIR/test_input_output_aliases.py"
   run_test "$CDIR/test_torch_distributed_xla_backend.py"
   run_torchrun "$CDIR/pjrt/test_torchrun.py"
   run_test "$CDIR/test_persistent_cache.py"

test/run_tests.sh (1 addition, 0 deletions)
@@ -238,6 +238,7 @@ function run_xla_op_tests3 {
   run_save_tensor_hlo run_test "$CDIR/spmd/test_spmd_lowering_context.py"
   run_test "$CDIR/test_operations_hlo.py" "$@" --verbosity=$VERBOSITY
   run_test "$CDIR/test_input_output_aliases.py"
+  run_test_without_functionalization "$CDIR/test_input_output_aliases.py"
   run_test "$CDIR/test_torch_distributed_xla_backend.py"
   run_torchrun "$CDIR/pjrt/test_torchrun.py"
   run_test "$CDIR/test_persistent_cache.py"

test/test_input_output_aliases.py (128 additions, 4 deletions)
@@ -8,9 +8,38 @@
 import torch_xla.core.xla_model as xm
 import torch_xla.debug.metrics as met
 import unittest
+import contextlib
 import copy
 
 
+def create_xla_config_context(set_func, get_func):
+
+  @contextlib.contextmanager
+  def config_context(value):
+    original_value = get_func()
+    set_func(value)
+    try:
+      assert get_func() == value
+      yield
+    finally:
+      set_func(original_value)
+
+  return config_context
+
+
+# Create context managers to simplify the test setup and cleanup with different
+# aliasing configurations.
+parameter_aliasing_context = create_xla_config_context(
+    torch_xla._XLAC._xla_set_enable_parameter_aliasing,
+    torch_xla._XLAC._xla_get_enable_parameter_aliasing,
+)
+
+alias_with_buffer_donor_config_context = create_xla_config_context(
+    torch_xla._XLAC._xla_set_enable_alias_with_buffer_donor_config,
+    torch_xla._XLAC._xla_get_enable_alias_with_buffer_donor_config,
+)
+
+
 # TODO(alanwaketan): add test for views.
 class InputOutputAliasesTest(unittest.TestCase):
 
@@ -210,7 +239,102 @@ def test_device_data_cache_no_aliasing(self):
     # ...if it doesn't crash, the value here would be 44.
     self.assertEqual(t1.item(), 43)
 
-
-if __name__ == '__main__':
-  test = unittest.main()
-  sys.exit(0 if test.result.wasSuccessful() else 1)
+  def test_disable_param_aliasing(self):
+    with parameter_aliasing_context(False):
+      xla_device = xm.xla_device()
+      t = torch.tensor(42, device=xla_device)
+      xm.mark_step()
+
+      met.clear_all()
+      t.add_(1)
+      xm.mark_step()
+
+      self.assertEqual(met.metric_data("InputOutputAliasCount"), None)
+
+  def test_user_config_donation_with_ltc_donation(self):
+    with alias_with_buffer_donor_config_context(True):
+      met.clear_all()
+      xla_device = xm.xla_device()
+      t0 = torch.randn(4, 2, 2).to(xla_device)
+      t1 = torch.randn(4, 2, 2).to(xla_device)
+      self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([t0], True)))
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+      self.assertFalse(torch_xla._XLAC._get_buffer_donation(t1))
+      t3 = t0 + t1
+      t1 += 2
+      xm.mark_step()
+
+      self.assertEqual(met.metric_data("InputOutputAliasCount")[1], 2.0)
+
+  def test_user_config_donation_with_ltc_donation_overlap(self):
+    with alias_with_buffer_donor_config_context(True):
+      met.clear_all()
+      xla_device = xm.xla_device()
+      t0 = torch.randn(4, 2, 2).to(xla_device)
+      self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([t0], True)))
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+      t0 += 2
+      xm.mark_step()
+
+      self.assertEqual(met.metric_data("InputOutputAliasCount")[1], 1.0)
+
+  def test_user_config_donation(self):
+    with alias_with_buffer_donor_config_context(True):
+      met.clear_all()
+      xla_device = xm.xla_device()
+      t0 = torch.randn(4, 2, 2).to(xla_device)
+      self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([t0], True)))
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+      t1 = t0 + 1
+      torch_xla._XLAC._xla_sync_multi([t0, t1], [str(xla_device)], True, False)
+
+      self.assertEqual(met.metric_data("InputOutputAliasCount")[1], 1.0)
+
+  def test_user_config_donation_inplace_aliasing(self):
+    with alias_with_buffer_donor_config_context(True):
+      met.clear_all()
+      xla_device = xm.xla_device()
+      t0 = torch.randn(4, 2, 2).to(xla_device)
+      self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([t0], True)))
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+      t0 *= 2
+      torch_xla._XLAC._xla_sync_multi([t0], [str(xla_device)], True, False)
+
+      self.assertEqual(met.metric_data("InputOutputAliasCount")[1], 1.0)
+
+  def test_user_config_donation_with_disable_param_aliasing(self):
+    with alias_with_buffer_donor_config_context(
+        True), parameter_aliasing_context(False):
+      met.clear_all()
+      xla_device = xm.xla_device()
+      t0 = torch.randn(4, 2, 2).to(xla_device)
+      self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([t0], True)))
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+
+      xm.mark_step()
+
+      self.assertEqual(met.metric_data("InputOutputAliasCount"), None)
+
+  def test_user_config_donation_no_op_mark_step(self):
+    with alias_with_buffer_donor_config_context(True):
+      xla_device = xm.xla_device()
+      t0 = torch.randn(4, 2, 2).to(xla_device)
+      self.assertTrue(all(torch_xla._XLAC._set_buffer_donation([t0], True)))
+      xm.mark_step()
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+      xm.mark_step()
+      self.assertTrue(torch_xla._XLAC._get_buffer_donation(t0))
+
+
+if __name__ == "__main__":
+  loader = unittest.TestLoader()
+  test_cases = loader.getTestCaseNames(InputOutputAliasesTest)
+  failed = False
+  for test_name in test_cases:
+    test = InputOutputAliasesTest(test_name)
+    runner = unittest.TextTestRunner(failfast=True)
+    result = runner.run(test)
+    if not result.wasSuccessful():
+      failed = True
+
+  sys.exit(1 if failed else 0)
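The `create_xla_config_context` factory packages the save/set/assert/restore dance once and reuses it for both flags. A brief sketch of how the two resulting context managers compose, mirroring test_user_config_donation_with_disable_param_aliasing above (runs inside this test module, where both names are defined):

# User donations are recorded, but no aliasing is installed because
# parameter aliasing is globally off; both flags are restored on exit.
with alias_with_buffer_donor_config_context(True), \
     parameter_aliasing_context(False):
  pass  # body elided; see the tests above for real workloads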

torch_xla/__init__.py (3 additions, 0 deletions)
@@ -195,6 +195,9 @@ def _check_deprecated_env_var():
 if os.environ.get('TF_CPP_MIN_LOG_LEVEL') == '0':
   logger.setLevel(logging.INFO)
 
+if 'XLA_ENABLE_PARAM_ALIASING' in os.environ:
+  _XLAC.set_enable_parameter_aliasing(os.environ['XLA_ENABLE_PARAM_ALIASING'])
+
 import atexit
 from ._patched_functions import _apply_patches
 
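This hook presumably lets users toggle parameter aliasing from the environment without code changes. Note that the diff forwards the raw environment string to the setter, so the accepted values depend on the binding's bool conversion; a hedged sketch of the intended usage:

import os

# Assumption: set before `import torch_xla`, since the hook runs at import time.
os.environ['XLA_ENABLE_PARAM_ALIASING'] = '1'

import torch_xla  # the hook above reads the variable here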
torch_xla/_dynamo/dynamo_bridge.py (3 additions, 3 deletions)
@@ -36,13 +36,13 @@
 
 @contextmanager
 def alias_with_buffer_donor_config(should_alias: bool = True):
-  saved_config = torch_xla._XLAC._xla_get_should_alias_with_buffer_donor_config(
+  saved_config = torch_xla._XLAC._xla_get_enable_alias_with_buffer_donor_config(
   )
-  torch_xla._XLAC._xla_set_should_alias_with_buffer_donor_config(should_alias)
+  torch_xla._XLAC._xla_set_enable_alias_with_buffer_donor_config(should_alias)
   try:
     yield saved_config
   finally:
-    torch_xla._XLAC._xla_set_should_alias_with_buffer_donor_config(saved_config)
+    torch_xla._XLAC._xla_set_enable_alias_with_buffer_donor_config(saved_config)
 
 
 @dataclasses.dataclass
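Only the binding names change here; the save/restore behavior is identical. A sketch of a caller opting into donor-config aliasing around a traced step (the call site is an assumption, the names come from the diff):

from torch_xla._dynamo.dynamo_bridge import alias_with_buffer_donor_config

with alias_with_buffer_donor_config(True) as previous:
  # Within the block, user-marked buffer donors participate in
  # input/output aliasing; `previous` holds the prior setting and is
  # restored automatically on exit.
  ...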

torch_xla/_internal/custom_kernel.py (1 addition, 1 deletion)
@@ -38,7 +38,7 @@ def dynamo_mark_sharding(input: torch.Tensor, device_ids: List[int],
 
 @impl(XLA_LIB, "dynamo_set_buffer_donor_", "XLA")
 def dynamo_set_buffer_donor_xla_(t: torch.Tensor, should_donoate: bool):
-  torch_xla._XLAC._set_buffer_donation(t, should_donoate)
+  torch_xla._XLAC._set_buffer_donation([t], should_donoate)
   return t
 
 
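The custom op wraps the tensor in a one-element list to match the new binding, so callers are unaffected. A hedged sketch of marking a donor from compiled code via this op (the training function and backend choice are assumptions):

import torch
import torch_xla
import torch_xla.core.xla_model as xm

def step(t):
  # Mark `t`'s buffer donatable before the in-place update so XLA may
  # reuse it for the output.
  torch.ops.xla.dynamo_set_buffer_donor_(t, True)
  t.add_(1)
  return t

compiled = torch.compile(step, backend='openxla')
out = compiled(torch.randn(4, 4).to(xm.xla_device()))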
torch_xla/csrc/init_python_bindings.cpp (55 additions, 32 deletions)
@@ -1906,14 +1906,30 @@ void InitXlaModuleBindings(py::module m) {
       [](const std::string& device) { return GetRngSeed(device); },
       py::arg("device") = "");
   m.def(
-      "_xla_set_should_alias_with_buffer_donor_config",
-      [](bool should_alias, const std::string& device_str) {
+      "_xla_set_enable_parameter_aliasing",
+      [](bool enable_parameter_aliasing, const std::string& device_str) {
         torch::lazy::BackendDevice device = GetDeviceOrCurrent(device_str);
-        XLAGraphExecutor::Get()->SetAliasWithBufferDonorConfig(should_alias);
+        XLAGraphExecutor::Get()->SetEnableParameterAliasing(
+            enable_parameter_aliasing);
       },
-      py::arg("should_alias") = false, py::arg("device") = "");
+      py::arg("enable_parameter_aliasing") = true, py::arg("device") = "");
   m.def(
-      "_xla_get_should_alias_with_buffer_donor_config",
+      "_xla_get_enable_parameter_aliasing",
+      [](const std::string& device_str) {
+        torch::lazy::BackendDevice device = GetDeviceOrCurrent(device_str);
+        return XLAGraphExecutor::Get()->GetEnableParameterAliasing();
+      },
+      py::arg("device") = "");
+  m.def(
+      "_xla_set_enable_alias_with_buffer_donor_config",
+      [](bool enable_user_config_alias, const std::string& device_str) {
+        torch::lazy::BackendDevice device = GetDeviceOrCurrent(device_str);
+        XLAGraphExecutor::Get()->SetAliasWithBufferDonorConfig(
+            enable_user_config_alias);
+      },
+      py::arg("enable_user_config_alias") = false, py::arg("device") = "");
+  m.def(
+      "_xla_get_enable_alias_with_buffer_donor_config",
       [](const std::string& device_str) {
         torch::lazy::BackendDevice device = GetDeviceOrCurrent(device_str);
         return XLAGraphExecutor::Get()->GetAliasWithBufferDonorConfig();
@@ -2737,36 +2753,43 @@ void InitXlaModuleBindings(py::module m) {
 
   // This api will set the `should_donate_buffer_` field in the
   // ComputationClient::Data. This api is currently only useful if you are
-  // running with `torch.compile`. Buffer assocaited with data with
-  // `should_donate_buffer_` set to true will be donated to the output, You
-  // should only use this api if
-  // 1. You are using torch.compile
-  // 2. You will inplace update a tensor in the `torch.compiled` function(so the
-  // currnet buffer can be donated after compuation)
+  // running with `torch.compile`. The buffer associated with data that has
+  // `should_donate_buffer_` set to true will be donated to the output. This
+  // can be used if:
+  // 1. You are using torch.compile, and there is an inplace update of a tensor
+  // so that the current buffer can be donated after computation.
+  // 2. You want to explicitly donate a tensor because it is not necessary
+  // after the current computation.
+  // Note that donated buffers cannot be used after being donated.
   m.def("_set_buffer_donation",
-        [](at::Tensor& input, bool should_donate) -> bool {
-          XLATensorPtr xtensor = bridge::GetXlaTensor(input);
-          bool buffer_donation_updated = false;
-          if (!xtensor) {
-            // input tensor is not a XLATensor, return here.
-          } else if (xtensor->CurrentDataHandle() != nullptr) {
-            auto data =
-                std::dynamic_pointer_cast<runtime::ComputationClient::Data>(
-                    xtensor->CurrentDataHandle());
-            data->set_should_donate_buffer(should_donate);
-            buffer_donation_updated = true;
-          } else if (xtensor->CurrentIrValue().node != nullptr) {
-            torch::lazy::NodePtr node = xtensor->CurrentIrValue().node;
-            auto device_data = torch_xla::DeviceData::Cast(node.get());
-            if (device_data != nullptr) {
-              device_data->set_buffer_donation(should_donate);
-              buffer_donation_updated = true;
+        [](const std::vector<at::Tensor>& tensors,
+           bool should_donate) -> std::vector<bool> {
+          std::vector<bool> buffer_donations_updated;
+          for (const at::Tensor& tensor : tensors) {
+            XLATensorPtr xtensor = bridge::GetXlaTensor(tensor);
+            bool donation_updated = false;
+            if (!xtensor) {
+              // input tensor is not a XLATensor, skip it.
+            } else if (xtensor->CurrentDataHandle() != nullptr) {
+              auto data =
+                  std::dynamic_pointer_cast<runtime::ComputationClient::Data>(
+                      xtensor->CurrentDataHandle());
+              data->set_should_donate_buffer(should_donate);
+              donation_updated = true;
+            } else if (xtensor->CurrentIrValue().node != nullptr) {
+              torch::lazy::NodePtr node = xtensor->CurrentIrValue().node;
+              auto device_data = torch_xla::DeviceData::Cast(node.get());
+              if (device_data != nullptr) {
+                device_data->set_buffer_donation(should_donate);
+                donation_updated = true;
+              }
             }
+            if (donation_updated) {
+              TORCH_LAZY_COUNTER("XlaSetBufferDonation", 1);
+            }
+            buffer_donations_updated.push_back(donation_updated);
           }
-          if (buffer_donation_updated) {
-            TORCH_LAZY_COUNTER("XlaSetBufferDonation", 1);
-          }
-          return buffer_donation_updated;
+          return buffer_donations_updated;
         });
 
   m.def("_get_buffer_donation", [](const at::Tensor& input) -> bool {
