diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py
index 2a433ee14d98..6ccf9b465ebd 100644
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -312,12 +312,17 @@ def numpy_to_depth(self, images):
         """
         if images.ndim == 3:
             images = images[None, ...]
-        images = (images * 255).round().astype("uint8")
-        if images.shape[-1] == 1:
-            # special case for grayscale (single channel) images
-            raise Exception("Not supported")
+        images_depth = images[:, :, :, 3:]
+        if images.shape[-1] == 6:
+            images_depth = (images_depth * 255).round().astype("uint8")
+            pil_images = [
+                Image.fromarray(self.rgblike_to_depthmap(image_depth), mode="I;16") for image_depth in images_depth
+            ]
+        elif images.shape[-1] == 4:
+            images_depth = (images_depth * 65535.0).astype(np.uint16)
+            pil_images = [Image.fromarray(image_depth, mode="I;16") for image_depth in images_depth]
         else:
-            pil_images = [Image.fromarray(self.rgblike_to_depthmap(image[:, :, 3:]), mode="I;16") for image in images]
+            raise Exception("Not supported")

         return pil_images
@@ -349,7 +354,11 @@ def postprocess(
             image = self.pt_to_numpy(image)

         if output_type == "np":
-            return image[:, :, :, :3], np.stack([self.rgblike_to_depthmap(im[:, :, 3:]) for im in image], axis=0)
+            if image.shape[-1] == 6:
+                image_depth = np.stack([self.rgblike_to_depthmap(im[:, :, 3:]) for im in image], axis=0)
+            else:
+                image_depth = image[:, :, :, 3:]
+            return image[:, :, :, :3], image_depth

         if output_type == "pil":
             return self.numpy_to_pil(image), self.numpy_to_depth(image)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py
index 933e4307a41b..e2164e8117ad 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py
@@ -130,9 +130,9 @@ def test_stable_diffusion_ddim(self):
         assert depth.shape == (1, 64, 64)

         expected_slice_rgb = np.array(
-            [0.37301102, 0.7023895, 0.7418312, 0.5163375, 0.5825485, 0.60929704, 0.4188174, 0.48407027, 0.46555096]
+            [0.37338176, 0.70247, 0.74203193, 0.51643604, 0.58256793, 0.60932136, 0.4181095, 0.48355877, 0.46535262]
         )
-        expected_slice_depth = np.array([103.4673, 85.81202, 87.84926])
+        expected_slice_depth = np.array([103.46727, 85.812004, 87.849236])

         assert np.abs(image_slice_rgb.flatten() - expected_slice_rgb).max() < 1e-2
         assert np.abs(image_slice_depth.flatten() - expected_slice_depth).max() < 1e-2
@@ -280,10 +280,30 @@ def test_ldm3d(self):
         output = ldm3d_pipe(**inputs)
         rgb, depth = output.rgb, output.depth

-        expected_rgb_mean = 0.54461557
-        expected_rgb_std = 0.2806707
-        expected_depth_mean = 143.64595
-        expected_depth_std = 83.491776
+        expected_rgb_mean = 0.495586
+        expected_rgb_std = 0.33795515
+        expected_depth_mean = 112.48518
+        expected_depth_std = 98.489746
+        assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3
+        assert np.abs(expected_rgb_std - rgb.std()) < 1e-3
+        assert np.abs(expected_depth_mean - depth.mean()) < 1e-3
+        assert np.abs(expected_depth_std - depth.std()) < 1e-3
+
+    def test_ldm3d_v2(self):
+        ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c").to(torch_device)
+        ldm3d_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        output = ldm3d_pipe(**inputs)
+        rgb, depth = output.rgb, output.depth
+
+        expected_rgb_mean = 0.4194127
+        expected_rgb_std = 0.35375586
+        expected_depth_mean = 0.5638502
+        expected_depth_std = 0.34686103
+
+        assert rgb.shape == (1, 512, 512, 3)
+        assert depth.shape == (1, 512, 512, 1)
         assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3
         assert np.abs(expected_rgb_std - rgb.std()) < 1e-3
         assert np.abs(expected_depth_mean - depth.mean()) < 1e-3
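
Reviewer note: a minimal standalone sketch (not the diffusers implementation) of the channel-count branching added to `numpy_to_depth` above. A 6-channel LDM3D output carries an RGB-like depth encoding in its last three channels, which is quantized to uint8 and decoded into a 16-bit map, while a 4-channel output (e.g. the `Intel/ldm3d-4c` checkpoint exercised by `test_ldm3d_v2`) carries a single normalized depth channel that is rescaled directly to uint16. The helper names and the exact RGB-like decoding here are illustrative assumptions, not the library API.

import numpy as np
from PIL import Image


def rgblike_to_depthmap_sketch(image_depth: np.ndarray) -> np.ndarray:
    # Illustrative decoding of an RGB-like depth encoding into a 16-bit map
    # (high byte assumed in the G channel, low byte in the B channel); the real
    # helper lives on the image processor as `self.rgblike_to_depthmap`.
    return image_depth[:, :, 1].astype(np.uint16) * 2**8 + image_depth[:, :, 2]


def numpy_to_depth_sketch(images: np.ndarray) -> list:
    # `images` is assumed to be a float array in [0, 1] of shape (N, H, W, 4 or 6).
    if images.ndim == 3:
        images = images[None, ...]
    images_depth = images[:, :, :, 3:]
    if images.shape[-1] == 6:
        # RGB + RGB-like depth: quantize the depth channels to uint8, then decode to 16-bit.
        images_depth = (images_depth * 255).round().astype("uint8")
        return [Image.fromarray(rgblike_to_depthmap_sketch(d), mode="I;16") for d in images_depth]
    if images.shape[-1] == 4:
        # RGB + single depth channel: rescale the last channel straight to uint16.
        images_depth = (images_depth * 65535.0).astype(np.uint16)
        return [Image.fromarray(d[:, :, 0], mode="I;16") for d in images_depth]
    raise Exception("Not supported")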