pytorch · yaochengji · Feb 6, 2025 · yaochengji · Feb 11, 2025
diff --git a/torchax/test/test_functions.py b/torchax/test/test_functions.py
@@ -66,6 +66,13 @@ def test_flatten(self):
       a = a.flatten(0, 1)
       self.assertEqual(tuple(a.shape), (6, 4))
 
+  def test_copy_(self):  # checks in-place Tensor.copy_ from an explicit-CPU source
+    with self.env:
+      a = torch.zeros((2, 3), device="cpu")  # source: explicitly placed on CPU
+      b = torch.ones((2, 3))  # destination: no device given; presumably the env's default device -- TODO confirm
+      b.copy_(a)  # overwrite b in place with a's values
+      self.assertTrue(torch.allclose(a, b.cpu()))  # move b to CPU and compare; b should now equal a
+
   def test_rnn(self):
     model = SeqModel()
     x = torch.randn((2, 100, 20))

diff --git a/torchax/torchax/ops/jaten.py b/torchax/torchax/ops/jaten.py
@@ -122,6 +122,8 @@ def _aten_add(x, y, *, alpha=1):
 
 @op(torch.ops.aten.copy_, is_jax_function=False)
 def _aten_copy(x, y, memory_format=None):
+  if y.device.type == "cpu":
+    y = y.to(x.device)
   if x.ndim == 1 and y.ndim == 0:
     # case of torch.empty((1,)).copy_(tensor(N))
     # we need to return 0D tensor([N]) and not scalar tensor(N)