@@ -680,7 +680,7 @@ def _einsum_linear_forward(input: Tensor, weight: Tensor,
   # decomposed when inside a custom op. This C++ op is an escape hatch to call
   # XLA einsum without going through torch.einsum. We should remove this
   # _einsum escape hatch when the linked bug is fixed.
-  product = torch_xla._XLAC._xla_einsum('...n,mn->...m', (input, weight))
+  product = torch.einsum('...n,mn->...m', (input, weight))
   if bias is not None:
     return product + bias
   return product
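Note (illustrative, not part of the diff): the '...n,mn->...m' contraction above is just the usual linear-layer product, so it should agree with torch.nn.functional.linear. A minimal CPU sanity check with made-up shapes:

  import torch
  import torch.nn.functional as F

  x = torch.randn(2, 3, 5)   # (..., n)
  w = torch.randn(7, 5)      # (m, n)
  b = torch.randn(7)         # (m,)
  assert torch.allclose(torch.einsum('...n,mn->...m', x, w) + b,
                        F.linear(x, w, b), atol=1e-5)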
@@ -708,19 +708,17 @@ def _einsum_linear_backward(grad_output: Tensor, input: Tensor, weight: Tensor,
   grad_input = grad_weight = grad_bias = None

   if needs_input_grad_input:
-    grad_input = torch_xla._XLAC._xla_einsum('...m,mn->...n',
-                                              (grad_output, weight))
+    grad_input = torch.einsum('...m,mn->...n', (grad_output, weight))
   else:
     grad_input = None

   if needs_input_grad_weight:
-    grad_weight = torch_xla._XLAC._xla_einsum('...m,...n->mn',
-                                              (grad_output, input))
+    grad_weight = torch.einsum('...m,...n->mn', (grad_output, input))
   else:
     grad_weight = None

   if bias is not None and needs_input_grad_bias:
-    grad_bias = torch_xla._XLAC._xla_einsum('...m->m', (grad_output,))
+    grad_bias = torch.einsum('...m->m', (grad_output,))
   else:
     grad_bias = None

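Note (illustrative, not part of the diff): the three einsum equations above are the standard linear-layer gradients (grad_input contracts grad_output with the weight, grad_weight accumulates the outer product of grad_output and input over the batch dims, grad_bias sums grad_output over the batch dims). A CPU sanity check against autograd on F.linear, with made-up shapes:

  import torch
  import torch.nn.functional as F

  x = torch.randn(4, 3, 5, requires_grad=True)   # (..., n)
  w = torch.randn(7, 5, requires_grad=True)      # (m, n)
  b = torch.randn(7, requires_grad=True)         # (m,)
  g = torch.randn(4, 3, 7)                       # upstream grad, shape (..., m)

  F.linear(x, w, b).backward(g)
  assert torch.allclose(x.grad, torch.einsum('...m,mn->...n', g, w), atol=1e-4)
  assert torch.allclose(w.grad, torch.einsum('...m,...n->mn', g, x), atol=1e-4)
  assert torch.allclose(b.grad, torch.einsum('...m->m', g), atol=1e-4)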
@@ -765,8 +763,8 @@ class XLAPatchedLinear(torch.autograd.Function):
   autocast context, when autocast is enabled.
   torch.get_autocast_dtype() fetches datatype for ops run in autocast [2], with the specified device (here, 'xla').

-  References:
-  [1] https://pytorch.org/docs/stable/notes/amp_examples.html#functions-with-multiple-inputs-or-autocastable-ops
+  References:
+  [1] https://pytorch.org/docs/stable/notes/amp_examples.html#functions-with-multiple-inputs-or-autocastable-ops
   [2] https://github.com/pytorch/pytorch/blob/2cc01cc6d3ad2aff47e8460667ba654b2e4c9f21/torch/amp/autocast_mode.py#L500

   TODO (alanwaketan): Let's patch it on the dispatcher level.
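For context, reference [1] above describes the custom_fwd/custom_bwd pattern for autocast-aware autograd Functions. Below is a minimal sketch of that pattern on the 'xla' device, assuming PyTorch >= 2.4 (where torch.amp.custom_fwd/custom_bwd accept device_type and torch.get_autocast_dtype takes a device string); it is an illustration, not the library's exact implementation:

  import torch
  from torch.amp import custom_fwd, custom_bwd

  class EinsumLinear(torch.autograd.Function):
    # Illustrative only: cast inputs to the XLA autocast dtype under autocast.

    @staticmethod
    @custom_fwd(device_type='xla', cast_inputs=torch.get_autocast_dtype('xla'))
    def forward(ctx, input, weight, bias=None):
      ctx.save_for_backward(input, weight, bias)
      product = torch.einsum('...n,mn->...m', input, weight)
      return product + bias if bias is not None else product

    @staticmethod
    @custom_bwd(device_type='xla')
    def backward(ctx, grad_output):
      input, weight, bias = ctx.saved_tensors
      grad_input = torch.einsum('...m,mn->...n', grad_output, weight)
      grad_weight = torch.einsum('...m,...n->mn', grad_output, input)
      grad_bias = torch.einsum('...m->m', grad_output) if bias is not None else None
      return grad_input, grad_weight, grad_bias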
@@ -1260,8 +1258,8 @@ class MarkShardingFunction(torch.autograd.Function):
   Usage:
   new_tensor = MarkShardingFunction.apply(tensor, mesh, ('axis_1', 'axis_2'))

-  This is required to guide GSPMD sharding propagation better during the
-  backward pass as during complicated workloads the compiler can introduce extra
+  This is required to guide GSPMD sharding propagation better during the
+  backward pass as during complicated workloads the compiler can introduce extra
   collectives that can hurt performance.
   """

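For context on the Usage line above, here is a minimal sketch of building a mesh whose axes are named 'axis_1' and 'axis_2' with the torch_xla SPMD helpers; the device count, mesh shape, tensor shape, and the import path for MarkShardingFunction are illustrative assumptions, not prescribed by the diff:

  import numpy as np
  import torch
  import torch_xla.core.xla_model as xm
  import torch_xla.runtime as xr
  from torch_xla.distributed.spmd import Mesh
  from torch_xla.distributed.spmd.xla_sharding import MarkShardingFunction

  xr.use_spmd()
  num_devices = xr.global_runtime_device_count()
  # 2D mesh: shard along 'axis_1' across all devices, replicate along 'axis_2'.
  mesh = Mesh(np.arange(num_devices), (num_devices, 1), ('axis_1', 'axis_2'))

  tensor = torch.randn(8, 128, device=xm.xla_device())
  new_tensor = MarkShardingFunction.apply(tensor, mesh, ('axis_1', 'axis_2'))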