Commit 7a9a7c2

Fix AutoencoderTiny encoder scaling convention
* Add [-1, 1] -> [0, 1] rescaling to EncoderTiny (this fixes huggingface#4676)
* Move [0, 1] -> [-1, 1] rescaling from AutoencoderTiny.decode to DecoderTiny (i.e. immediately after the final conv, as early as possible)
* Fix missing [0, 255] -> [0, 1] rescaling in AutoencoderTiny.forward
* Update AutoencoderTinyIntegrationTests to protect against scaling issues. The new test constructs a simple image, round-trips it through AutoencoderTiny (with and without tiling enabled), and confirms the decoded result is approximately equal to the source image, so it will fail if new AutoencoderTiny scaling issues are introduced.
* Context: raw TAESD weights expect images in [0, 1], but diffusers' convention represents images with zero-centered values in [-1, 1], so AutoencoderTiny needs to scale / unscale images at the start of encoding and at the end of decoding in order to work with diffusers (see the sketch below).
1 parent 74d902e commit 7a9a7c2
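To make the convention mismatch concrete, here is a minimal sketch of the two image ranges and the rescaling expressions this commit adds (standalone PyTorch; the tensor names are made up for illustration):

```python
import torch

# diffusers convention: images are zero-centered in [-1, 1]
diffusers_image = torch.rand(1, 3, 64, 64) * 2 - 1

# raw TAESD weights expect images in [0, 1], so EncoderTiny now rescales on the way in ...
taesd_image = diffusers_image.add(1).div(2)   # [-1, 1] -> [0, 1]

# ... and DecoderTiny rescales back on the way out
roundtrip = taesd_image.mul(2).sub(1)         # [0, 1] -> [-1, 1]

# the two rescalings are inverses, so the image range survives a round trip
assert torch.allclose(roundtrip, diffusers_image, atol=1e-6)
```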

3 files changed: +20 -21 lines
src/diffusers/models/autoencoder_tiny.py (+1 -4)

@@ -312,9 +312,6 @@ def decode(self, x: torch.FloatTensor, return_dict: bool = True) -> Union[Decode
             output = torch.cat(output)
         else:
             output = self._tiled_decode(x) if self.use_tiling else self.decoder(x)
-        # Refer to the following discussion to know why this is needed.
-        # https://github.com/huggingface/diffusers/pull/4384#discussion_r1279401854
-        output = output.mul_(2).sub_(1)
 
         if not return_dict:
             return (output,)
@@ -334,7 +331,7 @@ def forward(
         """
         enc = self.encode(sample).latents
         scaled_enc = self.scale_latents(enc).mul_(255).round_().byte()
-        unscaled_enc = self.unscale_latents(scaled_enc)
+        unscaled_enc = self.unscale_latents(scaled_enc / 255.0)
         dec = self.decode(unscaled_enc)
 
         if not return_dict:
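The forward() change is subtle: scale_latents maps the latents into [0, 1] and they are then quantized to uint8 in [0, 255], so the bytes need to be divided by 255 before unscale_latents sees them. A rough standalone sketch of that round trip (scale_latents / unscale_latents below are simplified stand-ins with assumed constants, not the model's actual implementation):

```python
import torch

# simplified stand-ins for AutoencoderTiny.scale_latents / unscale_latents:
# an affine map into [0, 1] and its inverse (constants assumed for illustration)
latent_magnitude, latent_shift = 3.0, 0.5

def scale_latents(x):
    return x.div(2 * latent_magnitude).add(latent_shift).clamp(0, 1)

def unscale_latents(x):
    return x.sub(latent_shift).mul(2 * latent_magnitude)

enc = torch.rand(1, 4, 64, 64) * 6 - 3  # fake latents in [-3, 3]

# forward() quantizes the scaled latents to uint8 ...
scaled_enc = scale_latents(enc).mul_(255).round_().byte()

# ... so the bytes must be brought back into [0, 1] before unscaling (the fix above);
# without the / 255.0, unscale_latents would receive values in [0, 255]
unscaled_enc = unscale_latents(scaled_enc / 255.0)

# what remains is just uint8 quantization noise
assert (unscaled_enc - enc).abs().max() <= latent_magnitude / 255 + 1e-5
```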

src/diffusers/models/vae.py (+4 -2)

@@ -732,7 +732,8 @@ def custom_forward(*inputs):
                 x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x)
 
         else:
-            x = self.layers(x)
+            # scale image from [-1, 1] to [0, 1] to match TAESD convention
+            x = self.layers(x.add(1).div(2))
 
         return x
 
@@ -790,4 +791,5 @@ def custom_forward(*inputs):
         else:
             x = self.layers(x)
 
-        return x
+        # scale image from [0, 1] to [-1, 1] to match diffusers convention
+        return x.mul(2).sub(1)
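Taken together, the vae.py changes put both rescalings at the module boundaries: the encoder rescales its input before the first layer, and the decoder rescales its output immediately after the last one. A simplified sketch of the resulting structure (the gradient-checkpointing branch and the real conv stacks are omitted; `layers` is a placeholder module):

```python
import torch
from torch import nn

class TinyEncoderSketch(nn.Module):
    """Accepts diffusers-style images in [-1, 1]; the wrapped layers see [0, 1]."""

    def __init__(self, layers: nn.Module):
        super().__init__()
        self.layers = layers

    def forward(self, x):
        # scale image from [-1, 1] to [0, 1] to match TAESD convention
        return self.layers(x.add(1).div(2))


class TinyDecoderSketch(nn.Module):
    """The wrapped layers produce [0, 1]; callers get diffusers-style [-1, 1] images."""

    def __init__(self, layers: nn.Module):
        super().__init__()
        self.layers = layers

    def forward(self, x):
        x = self.layers(x)
        # scale image from [0, 1] to [-1, 1] to match diffusers convention
        return x.mul(2).sub(1)


# quick smoke test with identity "layers": a [-1, 1] image survives the round trip
encoder, decoder = TinyEncoderSketch(nn.Identity()), TinyDecoderSketch(nn.Identity())
image = torch.rand(1, 3, 8, 8) * 2 - 1
assert torch.allclose(decoder(encoder(image)), image, atol=1e-6)
```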

tests/models/test_models_vae.py (+15 -15)

@@ -270,14 +270,6 @@ def tearDown(self):
         gc.collect()
         torch.cuda.empty_cache()
 
-    def get_file_format(self, seed, shape):
-        return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
-
-    def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False):
-        dtype = torch.float16 if fp16 else torch.float32
-        image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
-        return image
-
     def get_sd_vae_model(self, model_id="hf-internal-testing/taesd-diffusers", fp16=False):
         torch_dtype = torch.float16 if fp16 else torch.float32
 
@@ -302,19 +294,27 @@ def test_tae_tiling(self, in_shape, out_shape):
             dec = model.decode(zeros).sample
         assert dec.shape == out_shape
 
-    def test_stable_diffusion(self):
+    @parameterized.expand([True, False])
+    def test_tae_roundtrip(self, enable_tiling):
+        # load the autoencoder
         model = self.get_sd_vae_model()
-        image = self.get_sd_image(seed=33)
+        if enable_tiling:
+            model.enable_tiling()
+
+        # make a black image with a white square in the middle,
+        # which is large enough to split across multiple tiles
+        image = -torch.ones(1, 3, 1024, 1024, device=torch_device)
+        image[..., 256:768, 256:768] = 1.0
 
+        # round-trip the image through the autoencoder
         with torch.no_grad():
             sample = model(image).sample
 
-        assert sample.shape == image.shape
-
-        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
-        expected_output_slice = torch.tensor([0.9858, 0.9262, 0.8629, 1.0974, -0.091, -0.2485, 0.0936, 0.0604])
+        # the autoencoder reconstruction should match original image, sorta
+        def downscale(x):
+            return torch.nn.functional.avg_pool2d(x, model.spatial_scale_factor)
+        assert torch_all_close(downscale(sample), downscale(image), atol=0.125)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=3e-3)
 
 @slow
