
Commit da0356e

Add checkpoint code
1 parent 1a0ae30 commit da0356e

2 files changed: 38 additions & 0 deletions

test/run_tests.sh

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@ function run_all_tests {
   run_opbyop python3 "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
   run_eager_debug python3 "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
   run_async_rng python3 "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
+  run_test python3 "$CDIR/test_checkpoint.py"
   run_test python3 "$CDIR/test_mp_replication.py"
   run_test python3 "$CDIR/test_mp_all_to_all.py"
   run_test python3 "$CDIR/test_mp_collective_permute.py"
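
The new test is invoked through the same run_test helper as the other scripts in run_all_tests. Note that test_checkpoint.py (added below) declares --grad_checkpoint as a required argument, so invoking the script directly needs the flag, e.g. python3 test/test_checkpoint.py --grad_checkpoint=1.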

test/test_checkpoint.py

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+import argparse
+import torch
+import torch_xla.core.xla_model as xm
+import torch_xla.debug.metrics as met
+import torch_xla
+import torch_xla.utils.checkpoint as checkpoint
+
+
+def run(grad_checkpoint):
+  device = xm.xla_device()
+  model = torch.nn.ModuleList([
+      torch.nn.Sequential(
+          torch.nn.Conv2d(1024, 1024, 1),
+          torch.nn.ReLU(),
+          torch.nn.Conv2d(1024, 1024, 1),
+          torch.nn.ReLU(),
+      ) for _ in range(2)
+  ]).to(device)
+  optimizer = torch.optim.SGD(model.parameters(), lr=0.0)
+
+  for step in range(20):
+    dummy_data = torch.zeros(64, 1024, 14, 14, device=device)
+    optimizer.zero_grad()
+    x = dummy_data
+    for n_l, layer in enumerate(model):
+      x = checkpoint.checkpoint(layer, x)
+    dummy_loss = x.sum()
+    dummy_loss.backward()
+    optimizer.step()
+    xm.mark_step()
+    xm.wait_device_ops()
+
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--grad_checkpoint", type=int, required=True)
+  args = parser.parse_args()
+  run(args.grad_checkpoint)
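
As committed, run() parses a --grad_checkpoint flag but wraps every layer in checkpoint.checkpoint regardless of the flag's value. The helper below is a minimal sketch, not part of this commit, of how the flag could gate gradient checkpointing; only torch_xla.utils.checkpoint.checkpoint is taken from the diff above, the forward_layers helper itself is hypothetical.

import torch_xla.utils.checkpoint as checkpoint


def forward_layers(model, x, grad_checkpoint):
  # Hypothetical helper: with checkpointing enabled, each layer's intermediate
  # activations are recomputed during the backward pass instead of being
  # stored, trading extra compute for lower memory use.
  for layer in model:
    if grad_checkpoint:
      x = checkpoint.checkpoint(layer, x)
    else:
      x = layer(x)
  return x

With lr=0.0 and no assertions on the output, the test's loop mainly exercises graph construction and execution with checkpointed layers on the XLA device, via xm.mark_step() followed by xm.wait_device_ops().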
