Commit 15790ef: Add optimization barrier
1 parent 7dbb584

8 files changed (+42, -0 lines)

torch_xla/core/xla_model.py

Lines changed: 11 additions & 0 deletions
@@ -1024,3 +1024,14 @@ def get_memory_info(device):
       memory in KB) keys.
   """
   return torch_xla._XLAC._xla_memory_info(str(device))
+
+
+def optimization_barrier(tensors):
+  """Blocks the XLA compiler from moving computations across this barrier. The
+  common use case is to keep the XLA common-subexpression-elimination pass from
+  undoing gradient checkpointing.
+
+  Args:
+    tensors (torch.Tensor): `torch.Tensor`s to add the barrier to.
+  """
+  return torch_xla._XLAC._xla_optimization_barrier(tensors)
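
For context, a minimal usage sketch of the new API. This is illustrative only: it assumes an XLA device is available and, since the `_xla_optimization_barrier` binding added below takes a single tensor, it passes one tensor at a time.

import torch
import torch_xla
import torch_xla.core.xla_model as xm

device = xm.xla_device()
x = torch.randn(4, 4, device=device, requires_grad=True)

# Pin a recomputed activation behind the barrier so the XLA CSE pass
# cannot fold it back into the original forward computation.
y = torch.sin(x)
y = xm.optimization_barrier(y)
y.sum().backward()

Without the barrier, CSE can deduplicate the recomputation against the value already produced in the forward pass, defeating the memory savings that checkpointing is meant to provide.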

torch_xla/csrc/init_python_bindings.cpp

Lines changed: 9 additions & 0 deletions
@@ -276,6 +276,13 @@ std::pair<at::Tensor, std::shared_ptr<ir::Value>> CollectivePermute(
       std::make_shared<ir::Value>(new_token));
 }
 
+at::Tensor OptimizationBarrier(const at::Tensor& input) {
+  at::Tensor result = bridge::AtenFromXlaTensor(
+      XLATensor::optimization_barrier(bridge::GetXlaTensor(input)));
+  return torch::autograd::make_variable(
+      result, /*requires_grad=*/input.requires_grad());
+}
+
 void SyncTensors(const std::vector<at::Tensor>& tensors,
                  const std::vector<std::string>& devices, bool wait,
                  bool sync_xla_data) {
@@ -1028,6 +1035,8 @@ void InitXlaModuleBindings(py::module m) {
     }
     return new_token;
   });
+  m.def("_xla_optimization_barrier",
+        [](const at::Tensor& input) { return OptimizationBarrier(input); });
   m.def("_xla_set_default_device", [](const std::string& device) {
     return SetCurrentThreadDevice(device);
   });
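
A quick sanity check of the binding's autograd behavior; a sketch assuming an XLA device is reachable, calling the `_xla_optimization_barrier` binding registered above directly:

import torch
import torch_xla
import torch_xla.core.xla_model as xm

t = torch.ones(2, 2, device=xm.xla_device(), requires_grad=True)
out = torch_xla._XLAC._xla_optimization_barrier(t)
# make_variable re-wraps the result, carrying over requires_grad from the input.
assert out.requires_grad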

torch_xla/csrc/ops/ops.cpp

Lines changed: 12 additions & 0 deletions
@@ -25,6 +25,7 @@
 #include "torch_xla/csrc/ops/permute.h"
 #include "torch_xla/csrc/ops/softmax_backward.h"
 #include "torch_xla/csrc/ops/sum.h"
+#include "torch_xla/csrc/ops/xla_ops.h"
 #include "torch_xla/csrc/pooling.h"
 #include "torch_xla/csrc/tensor_util.h"
 #include "torch_xla/csrc/torch_util.h"
@@ -1031,6 +1032,17 @@ NodePtr Softplus(const Value& input, const Value& beta,
                  std::move(lower_fn));
 }
 
+NodePtr OptimizationBarrier(const Value& input) {
+  auto lower_fn = [](const Node& node, LoweringContext* loctx) -> XlaOpVector {
+    xla::XlaOp xla_input = loctx->GetOutputOp(node.operand(0));
+    xla::XlaOp xla_output = xla::OptimizationBarrier(xla_input);
+    return node.ReturnOp(xla_output, loctx);
+  };
+
+  return GenericOp(xla_optimization_barrier, {input}, input.xla_shape(),
+                   std::move(lower_fn));
+}
+
 }  // namespace ops
 }  // namespace ir
 }  // namespace torch_xla
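
One way to see this lowering in action is to dump the pending IR for a tensor that has the barrier applied; a sketch assuming the standard `_get_xla_tensors_text` debug helper:

import torch
import torch_xla
import torch_xla.core.xla_model as xm

t = torch.ones(2, 2, device=xm.xla_device())
t = xm.optimization_barrier(t)
# The graph dump should contain a node with the registered op kind
# xla::optimization_barrier, which the lower_fn above maps to
# xla::OptimizationBarrier in the emitted XLA computation.
print(torch_xla._XLAC._get_xla_tensors_text([t]))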

torch_xla/csrc/ops/ops.h

Lines changed: 2 additions & 0 deletions
@@ -247,6 +247,8 @@ NodePtr SLogDet(const Value& input);
 
 NodePtr Softplus(const Value& input, const Value& beta, const Value& threshold);
 
+NodePtr OptimizationBarrier(const Value& input);
+
 }  // namespace ops
 }  // namespace ir
 }  // namespace torch_xla

torch_xla/csrc/ops/xla_ops.cpp

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ const OpKindWrapper xla_get_dimensions_size("xla::xla_get_dimensions_size");
 const OpKindWrapper xla_moving_average("xla::moving_average");
 const OpKindWrapper xla_nms("xla::nms");
 const OpKindWrapper xla_not_supported("xla::not_supported");
+const OpKindWrapper xla_optimization_barrier("xla::optimization_barrier");
 const OpKindWrapper xla_reduce_scatter("xla::reduce_scatter");
 const OpKindWrapper xla_replication_pad("xla::replication_pad");
 const OpKindWrapper xla_replication_pad_backward(

torch_xla/csrc/ops/xla_ops.h

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ extern const OpKindWrapper xla_get_dimensions_size;
 extern const OpKindWrapper xla_moving_average;
 extern const OpKindWrapper xla_nms;
 extern const OpKindWrapper xla_not_supported;
+extern const OpKindWrapper xla_optimization_barrier;
 extern const OpKindWrapper xla_reduce_scatter;
 extern const OpKindWrapper xla_replication_pad;
 extern const OpKindWrapper xla_replication_pad_backward;

torch_xla/csrc/tensor.h

Lines changed: 2 additions & 0 deletions
@@ -918,6 +918,8 @@ class XLATensor {
   static XLATensor not_supported(std::string description, xla::Shape shape,
                                  const Device& device);
 
+  static XLATensor optimization_barrier(const XLATensor& input);
+
   // Permute the dimensions of this tensor according to the given permutation.
   static XLATensor permute(const XLATensor& input,
                            absl::Span<const int64_t> dims);

torch_xla/csrc/tensor_methods.cpp

Lines changed: 4 additions & 0 deletions
@@ -2182,6 +2182,10 @@ XLATensor XLATensor::not_supported(std::string description, xla::Shape shape,
                                    device);
 }
 
+XLATensor XLATensor::optimization_barrier(const XLATensor& input) {
+  return input.CreateFrom(ir::ops::OptimizationBarrier(input.GetIrValue()));
+}
+
 XLATensor XLATensor::permute(const XLATensor& input,
                              absl::Span<const int64_t> dims) {
   auto input_shape = input.shape();
