[fbsync] add crestereo implementation (#6310)

YosuaMichael · jdsgomes · facebook-github-bot · commit 09779fdee726 · 2022-09-15T12:11:08.000-07:00
Summary:
* crestereo draft implementation

* minor model fixes. positional embedding changes.

* aligned base configuration with paper

* Addressing comments

* Broke down Adaptive Correlation Layer. Addressed some other comments.

* addressed some nits

* changed search size, added output channels to model attrs

* changed weights naming

* changed from iterations to num_iters

* removed _make_coords, addressed comments

* fixed jit test

* config nit

* Changed device arg to str

Reviewed By: jdsgomes

Differential Revision: D39543279

fbshipit-source-id: c6101958588eb43201f92ff4f687bd32cbbcbbd1

Co-authored-by: Joao Gomes <jdsgomes@fb.com>
Co-authored-by: YosuaMichael <yosuamichaelm@gmail.com>
diff --git a/test/expect/ModelTester.test_crestereo_base_expect.pkl b/test/expect/ModelTester.test_crestereo_base_expect.pkl
diff --git a/test/test_prototype_models.py b/test/test_prototype_models.py
@@ -5,7 +5,7 @@
 from torchvision.prototype import models
 
 
-@pytest.mark.parametrize("model_fn", TM.list_model_fns(models.depth.stereo))
+@pytest.mark.parametrize("model_fn", (models.depth.stereo.raft_stereo_base,))
 @pytest.mark.parametrize("model_mode", ("standard", "scripted"))
 @pytest.mark.parametrize("dev", cpu_and_gpu())
 def test_raft_stereo(model_fn, model_mode, dev):
@@ -35,4 +35,50 @@ def test_raft_stereo(model_fn, model_mode, dev):
     ), f"The output shape of depth_pred should be [1, 1, 64, 64] but instead it is {preds[0].shape}"
 
     # Test against expected file output
-    TM._assert_expected(depth_pred.cpu(), name=model_fn.__name__, atol=1e-2, rtol=1e-2)
+    TM._assert_expected(depth_pred, name=model_fn.__name__, atol=1e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize("model_fn", (models.depth.stereo.crestereo_base,))
+@pytest.mark.parametrize("model_mode", ("standard", "scripted"))
+@pytest.mark.parametrize("dev", cpu_and_gpu())
+def test_crestereo(model_fn, model_mode, dev):
+    set_rng_seed(0)
+
+    model = model_fn().eval().to(dev)
+
+    if model_mode == "scripted":
+        model = torch.jit.script(model)
+
+    img1 = torch.rand(1, 3, 64, 64).to(dev)
+    img2 = torch.rand(1, 3, 64, 64).to(dev)
+    iterations = 3
+
+    preds = model(img1, img2, flow_init=None, num_iters=iterations)
+    disparity_pred = preds[-1]
+
+    # all the pyramid levels except the highest res make only half the number of iterations
+    expected_iterations = (iterations // 2) * (len(model.resolutions) - 1)
+    expected_iterations += iterations
+    assert (
+        len(preds) == expected_iterations
+    ), "Number of predictions should be the number of iterations multiplied by the number of pyramid levels"
+
+    assert disparity_pred.shape == torch.Size(
+        [1, 2, 64, 64]
+    ), f"Predicted disparity should have the same spatial shape as the input. Inputs shape {img1.shape[2:]}, Prediction shape {disparity_pred.shape[2:]}"
+
+    assert all(
+        d.shape == torch.Size([1, 2, 64, 64]) for d in preds
+    ), "All predicted disparities are expected to have the same shape"
+
+    # test a backward pass with a dummy loss as well
+    preds = torch.stack(preds, dim=0)
+    targets = torch.ones_like(preds, requires_grad=False)
+    loss = torch.nn.functional.mse_loss(preds, targets)
+
+    try:
+        loss.backward()
+    except Exception as e:
+        assert False, f"Backward pass failed with an unexpected exception: {e.__class__.__name__} {e}"
+
+    TM._assert_expected(disparity_pred, name=model_fn.__name__, atol=1e-2, rtol=1e-2)
diff --git a/torchvision/models/optical_flow/_utils.py b/torchvision/models/optical_flow/_utils.py
@@ -19,8 +19,9 @@ def grid_sample(img: Tensor, absolute_grid: Tensor, mode: str = "bilinear", alig
     return F.grid_sample(img, normalized_grid, mode=mode, align_corners=align_corners)
 
 
-def make_coords_grid(batch_size: int, h: int, w: int):
-    coords = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
+def make_coords_grid(batch_size: int, h: int, w: int, device: str = "cpu"):
+    device = torch.device(device)
+    coords = torch.meshgrid(torch.arange(h, device=device), torch.arange(w, device=device), indexing="ij")
     coords = torch.stack(coords[::-1], dim=0).float()
     return coords[None].repeat(batch_size, 1, 1, 1)
 
diff --git a/torchvision/models/optical_flow/raft.py b/torchvision/models/optical_flow/raft.py
@@ -27,7 +27,7 @@
 class ResidualBlock(nn.Module):
     """Slightly modified Residual block with extra relu and biases."""
 
-    def __init__(self, in_channels, out_channels, *, norm_layer, stride=1):
+    def __init__(self, in_channels, out_channels, *, norm_layer, stride=1, always_project: bool = False):
         super().__init__()
 
         # Note regarding bias=True:
@@ -43,7 +43,10 @@ def __init__(self, in_channels, out_channels, *, norm_layer, stride=1):
             out_channels, out_channels, norm_layer=norm_layer, kernel_size=3, bias=True
         )
 
-        if stride == 1:
+        # make mypy happy
+        self.downsample: nn.Module
+
+        if stride == 1 and not always_project:
             self.downsample = nn.Identity()
         else:
             self.downsample = Conv2dNormActivation(
@@ -144,6 +147,10 @@ def __init__(
                 if m.bias is not None:
                     nn.init.constant_(m.bias, 0)
 
+        num_downsamples = len(list(filter(lambda s: s == 2, strides)))
+        self.output_dim = layers[-1]
+        self.downsample_factor = 2**num_downsamples
+
     def _make_2_blocks(self, block, in_channels, out_channels, norm_layer, first_stride):
         block1 = block(in_channels, out_channels, norm_layer=norm_layer, stride=first_stride)
         block2 = block(out_channels, out_channels, norm_layer=norm_layer, stride=1)
diff --git a/torchvision/prototype/models/depth/stereo/__init__.py b/torchvision/prototype/models/depth/stereo/__init__.py
@@ -1 +1,2 @@
 from .raft_stereo import *
+from .crestereo import *
diff --git a/torchvision/prototype/models/depth/stereo/crestereo.py b/torchvision/prototype/models/depth/stereo/crestereo.py

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`	`1`	`from .raft_stereo import *`
	`2`	`+from .crestereo import *`