Skip to content

Commit 3fab662

Browse files
toshas, yiyixuxu, stevhliu
authored
Marigold Update: v1-1 models, Intrinsic Image Decomposition pipeline, documentation (#10884)
* minor documentation fixes of the depth and normals pipelines * update license headers * update model checkpoints in examples fix missing prediction_type in register_to_config in the normals pipeline * add initial marigold intrinsics pipeline update comments about num_inference_steps and ensemble_size minor fixes in comments of marigold normals and depth pipelines * update uncertainty visualization to work with intrinsics * integrate iid --------- Co-authored-by: YiYi Xu <[email protected]> Co-authored-by: Steven Liu <[email protected]>
1 parent f0ac7aa commit 3fab662

14 files changed

+1886
-258
lines changed

docs/source/en/api/pipelines/marigold.md

+89-34
Large diffs are not rendered by default.

docs/source/en/api/pipelines/overview.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
6565
| [Latte](latte) | text2image |
6666
| [LEDITS++](ledits_pp) | image editing |
6767
| [Lumina-T2X](lumina) | text2image |
68-
| [Marigold](marigold) | depth |
68+
| [Marigold](marigold) | depth-estimation, normals-estimation, intrinsic-decomposition |
6969
| [MultiDiffusion](panorama) | text2image |
7070
| [MusicLDM](musicldm) | text2audio |
7171
| [PAG](pag) | text2image |

docs/source/en/using-diffusers/marigold_usage.md

+312-173
Large diffs are not rendered by default.

src/diffusers/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,7 @@
345345
"Lumina2Text2ImgPipeline",
346346
"LuminaText2ImgPipeline",
347347
"MarigoldDepthPipeline",
348+
"MarigoldIntrinsicsPipeline",
348349
"MarigoldNormalsPipeline",
349350
"MochiPipeline",
350351
"MusicLDMPipeline",
@@ -845,6 +846,7 @@
845846
Lumina2Text2ImgPipeline,
846847
LuminaText2ImgPipeline,
847848
MarigoldDepthPipeline,
849+
MarigoldIntrinsicsPipeline,
848850
MarigoldNormalsPipeline,
849851
MochiPipeline,
850852
MusicLDMPipeline,

src/diffusers/pipelines/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,7 @@
261261
_import_structure["marigold"].extend(
262262
[
263263
"MarigoldDepthPipeline",
264+
"MarigoldIntrinsicsPipeline",
264265
"MarigoldNormalsPipeline",
265266
]
266267
)
@@ -603,6 +604,7 @@
603604
from .lumina2 import Lumina2Text2ImgPipeline
604605
from .marigold import (
605606
MarigoldDepthPipeline,
607+
MarigoldIntrinsicsPipeline,
606608
MarigoldNormalsPipeline,
607609
)
608610
from .mochi import MochiPipeline

src/diffusers/pipelines/marigold/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
else:
2424
_import_structure["marigold_image_processing"] = ["MarigoldImageProcessor"]
2525
_import_structure["pipeline_marigold_depth"] = ["MarigoldDepthOutput", "MarigoldDepthPipeline"]
26+
_import_structure["pipeline_marigold_intrinsics"] = ["MarigoldIntrinsicsOutput", "MarigoldIntrinsicsPipeline"]
2627
_import_structure["pipeline_marigold_normals"] = ["MarigoldNormalsOutput", "MarigoldNormalsPipeline"]
2728

2829
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -35,6 +36,7 @@
3536
else:
3637
from .marigold_image_processing import MarigoldImageProcessor
3738
from .pipeline_marigold_depth import MarigoldDepthOutput, MarigoldDepthPipeline
39+
from .pipeline_marigold_intrinsics import MarigoldIntrinsicsOutput, MarigoldIntrinsicsPipeline
3840
from .pipeline_marigold_normals import MarigoldNormalsOutput, MarigoldNormalsPipeline
3941

4042
else:

src/diffusers/pipelines/marigold/marigold_image_processing.py

+127-14
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,22 @@
1-
from typing import List, Optional, Tuple, Union
1+
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
2+
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# --------------------------------------------------------------------------
16+
# More information and citation instructions are available on the
17+
# Marigold project website: https://marigoldcomputervision.github.io
18+
# --------------------------------------------------------------------------
19+
from typing import Any, Dict, List, Optional, Tuple, Union
220

321
import numpy as np
422
import PIL
@@ -379,7 +397,7 @@ def visualize_depth(
379397
val_min: float = 0.0,
380398
val_max: float = 1.0,
381399
color_map: str = "Spectral",
382-
) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
400+
) -> List[PIL.Image.Image]:
383401
"""
384402
Visualizes depth maps, such as predictions of the `MarigoldDepthPipeline`.
385403
@@ -391,7 +409,7 @@ def visualize_depth(
391409
color_map (`str`, *optional*, defaults to `"Spectral"`): Color map used to convert a single-channel
392410
depth prediction into colored representation.
393411
394-
Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with depth maps visualization.
412+
Returns: `List[PIL.Image.Image]` with depth maps visualization.
395413
"""
396414
if val_max <= val_min:
397415
raise ValueError(f"Invalid values range: [{val_min}, {val_max}].")
@@ -436,7 +454,7 @@ def export_depth_to_16bit_png(
436454
depth: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
437455
val_min: float = 0.0,
438456
val_max: float = 1.0,
439-
) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
457+
) -> List[PIL.Image.Image]:
440458
def export_depth_to_16bit_png_one(img, idx=None):
441459
prefix = "Depth" + (f"[{idx}]" if idx else "")
442460
if not isinstance(img, np.ndarray) and not torch.is_tensor(img):
@@ -478,7 +496,7 @@ def visualize_normals(
478496
flip_x: bool = False,
479497
flip_y: bool = False,
480498
flip_z: bool = False,
481-
) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
499+
) -> List[PIL.Image.Image]:
482500
"""
483501
Visualizes surface normals, such as predictions of the `MarigoldNormalsPipeline`.
484502
@@ -492,7 +510,7 @@ def visualize_normals(
492510
flip_z (`bool`, *optional*, defaults to `False`): Flips the Z axis of the normals frame of reference.
493511
Default direction is facing the observer.
494512
495-
Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with surface normals visualization.
513+
Returns: `List[PIL.Image.Image]` with surface normals visualization.
496514
"""
497515
flip_vec = None
498516
if any((flip_x, flip_y, flip_z)):
@@ -528,6 +546,99 @@ def visualize_normals_one(img, idx=None):
528546
else:
529547
raise ValueError(f"Unexpected input type: {type(normals)}")
530548

549+
@staticmethod
550+
def visualize_intrinsics(
551+
prediction: Union[
552+
np.ndarray,
553+
torch.Tensor,
554+
List[np.ndarray],
555+
List[torch.Tensor],
556+
],
557+
target_properties: Dict[str, Any],
558+
color_map: Union[str, Dict[str, str]] = "binary",
559+
) -> List[Dict[str, PIL.Image.Image]]:
560+
"""
561+
Visualizes intrinsic image decomposition, such as predictions of the `MarigoldIntrinsicsPipeline`.
562+
563+
Args:
564+
prediction (`Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]`):
565+
Intrinsic image decomposition.
566+
target_properties (`Dict[str, Any]`):
567+
Decomposition properties. Expected entries: `target_names: List[str]` and a dictionary with keys
568+
`prediction_space: str`, `sub_target_names: List[Optional[str]]` (must have 3 entries, `None` for
569+
missing modalities), `up_to_scale: bool`, one for each target and sub-target.
570+
color_map (`Union[str, Dict[str, str]]`, *optional*, defaults to `"binary"`):
571+
Color map used to convert single-channel predictions into colored representations. When a dictionary
572+
is passed, each modality can be colored with its own color map.
573+
574+
Returns: `List[Dict[str, PIL.Image.Image]]` with intrinsic image decomposition visualization.
575+
"""
576+
if "target_names" not in target_properties:
577+
raise ValueError("Missing `target_names` in target_properties")
578+
if not isinstance(color_map, str) and not (
579+
isinstance(color_map, dict)
580+
and all(isinstance(k, str) and isinstance(v, str) for k, v in color_map.items())
581+
):
582+
raise ValueError("`color_map` must be a string or a dictionary of strings")
583+
n_targets = len(target_properties["target_names"])
584+
585+
def visualize_targets_one(images, idx=None):
586+
# images: [T, 3, H, W]
587+
out = {}
588+
for target_name, img in zip(target_properties["target_names"], images):
589+
img = img.permute(1, 2, 0) # [H, W, 3]
590+
prediction_space = target_properties[target_name].get("prediction_space", "srgb")
591+
if prediction_space == "stack":
592+
sub_target_names = target_properties[target_name]["sub_target_names"]
593+
if len(sub_target_names) != 3 or any(
594+
not (isinstance(s, str) or s is None) for s in sub_target_names
595+
):
596+
raise ValueError(f"Unexpected target sub-names {sub_target_names} in {target_name}")
597+
for i, sub_target_name in enumerate(sub_target_names):
598+
if sub_target_name is None:
599+
continue
600+
sub_img = img[:, :, i]
601+
sub_prediction_space = target_properties[sub_target_name].get("prediction_space", "srgb")
602+
if sub_prediction_space == "linear":
603+
sub_up_to_scale = target_properties[sub_target_name].get("up_to_scale", False)
604+
if sub_up_to_scale:
605+
sub_img = sub_img / max(sub_img.max().item(), 1e-6)
606+
sub_img = sub_img ** (1 / 2.2)
607+
cmap_name = (
608+
color_map if isinstance(color_map, str) else color_map.get(sub_target_name, "binary")
609+
)
610+
sub_img = MarigoldImageProcessor.colormap(sub_img, cmap=cmap_name, bytes=True)
611+
sub_img = PIL.Image.fromarray(sub_img.cpu().numpy())
612+
out[sub_target_name] = sub_img
613+
elif prediction_space == "linear":
614+
up_to_scale = target_properties[target_name].get("up_to_scale", False)
615+
if up_to_scale:
616+
img = img / max(img.max().item(), 1e-6)
617+
img = img ** (1 / 2.2)
618+
elif prediction_space == "srgb":
619+
pass
620+
img = (img * 255).to(dtype=torch.uint8, device="cpu").numpy()
621+
img = PIL.Image.fromarray(img)
622+
out[target_name] = img
623+
return out
624+
625+
if prediction is None or isinstance(prediction, list) and any(o is None for o in prediction):
626+
raise ValueError("Input prediction is `None`")
627+
if isinstance(prediction, (np.ndarray, torch.Tensor)):
628+
prediction = MarigoldImageProcessor.expand_tensor_or_array(prediction)
629+
if isinstance(prediction, np.ndarray):
630+
prediction = MarigoldImageProcessor.numpy_to_pt(prediction) # [N*T,3,H,W]
631+
if not (prediction.ndim == 4 and prediction.shape[1] == 3 and prediction.shape[0] % n_targets == 0):
632+
raise ValueError(f"Unexpected input shape={prediction.shape}, expecting [N*T,3,H,W].")
633+
N_T, _, H, W = prediction.shape
634+
N = N_T // n_targets
635+
prediction = prediction.reshape(N, n_targets, 3, H, W)
636+
return [visualize_targets_one(img, idx) for idx, img in enumerate(prediction)]
637+
elif isinstance(prediction, list):
638+
return [visualize_targets_one(img, idx) for idx, img in enumerate(prediction)]
639+
else:
640+
raise ValueError(f"Unexpected input type: {type(prediction)}")
641+
531642
@staticmethod
532643
def visualize_uncertainty(
533644
uncertainty: Union[
@@ -537,24 +648,26 @@ def visualize_uncertainty(
537648
List[torch.Tensor],
538649
],
539650
saturation_percentile=95,
540-
) -> Union[PIL.Image.Image, List[PIL.Image.Image]]:
651+
) -> List[PIL.Image.Image]:
541652
"""
542-
Visualizes dense uncertainties, such as produced by `MarigoldDepthPipeline` or `MarigoldNormalsPipeline`.
653+
Visualizes dense uncertainties, such as produced by `MarigoldDepthPipeline`, `MarigoldNormalsPipeline`, or
654+
`MarigoldIntrinsicsPipeline`.
543655
544656
Args:
545657
uncertainty (`Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]`):
546658
Uncertainty maps.
547659
saturation_percentile (`int`, *optional*, defaults to `95`):
548660
Specifies the percentile uncertainty value visualized with maximum intensity.
549661
550-
Returns: `PIL.Image.Image` or `List[PIL.Image.Image]` with uncertainty visualization.
662+
Returns: `List[PIL.Image.Image]` with uncertainty visualization.
551663
"""
552664

553665
def visualize_uncertainty_one(img, idx=None):
554666
prefix = "Uncertainty" + (f"[{idx}]" if idx else "")
555667
if img.min() < 0:
556-
raise ValueError(f"{prefix}: unexected data range, min={img.min()}.")
557-
img = img.squeeze(0).cpu().numpy()
668+
raise ValueError(f"{prefix}: unexpected data range, min={img.min()}.")
669+
img = img.permute(1, 2, 0) # [H,W,C]
670+
img = img.squeeze(2).cpu().numpy() # [H,W] or [H,W,3]
558671
saturation_value = np.percentile(img, saturation_percentile)
559672
img = np.clip(img * 255 / saturation_value, 0, 255)
560673
img = img.astype(np.uint8)
@@ -566,9 +679,9 @@ def visualize_uncertainty_one(img, idx=None):
566679
if isinstance(uncertainty, (np.ndarray, torch.Tensor)):
567680
uncertainty = MarigoldImageProcessor.expand_tensor_or_array(uncertainty)
568681
if isinstance(uncertainty, np.ndarray):
569-
uncertainty = MarigoldImageProcessor.numpy_to_pt(uncertainty) # [N,1,H,W]
570-
if not (uncertainty.ndim == 4 and uncertainty.shape[1] == 1):
571-
raise ValueError(f"Unexpected input shape={uncertainty.shape}, expecting [N,1,H,W].")
682+
uncertainty = MarigoldImageProcessor.numpy_to_pt(uncertainty) # [N,C,H,W]
683+
if not (uncertainty.ndim == 4 and uncertainty.shape[1] in (1, 3)):
684+
raise ValueError(f"Unexpected input shape={uncertainty.shape}, expecting [N,C,H,W] with C in (1,3).")
572685
return [visualize_uncertainty_one(img, idx) for idx, img in enumerate(uncertainty)]
573686
elif isinstance(uncertainty, list):
574687
return [visualize_uncertainty_one(img, idx) for idx, img in enumerate(uncertainty)]

src/diffusers/pipelines/marigold/pipeline_marigold_depth.py

+19-15
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
# Copyright 2024 Marigold authors, PRS ETH Zurich. All rights reserved.
2-
# Copyright 2024 The HuggingFace Team. All rights reserved.
1+
# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
2+
# Copyright 2024-2025 The HuggingFace Team. All rights reserved.
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
55
# you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
1414
# limitations under the License.
1515
# --------------------------------------------------------------------------
1616
# More information and citation instructions are available on the
17-
# Marigold project website: https://marigoldmonodepth.github.io
17+
# Marigold project website: https://marigoldcomputervision.github.io
1818
# --------------------------------------------------------------------------
1919
from dataclasses import dataclass
2020
from functools import partial
@@ -64,7 +64,7 @@
6464
>>> import torch
6565
6666
>>> pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
67-
... "prs-eth/marigold-depth-lcm-v1-0", variant="fp16", torch_dtype=torch.float16
67+
... "prs-eth/marigold-depth-v1-1", variant="fp16", torch_dtype=torch.float16
6868
... ).to("cuda")
6969
7070
>>> image = diffusers.utils.load_image("https://marigoldmonodepth.github.io/images/einstein.jpg")
@@ -86,11 +86,12 @@ class MarigoldDepthOutput(BaseOutput):
8686
8787
Args:
8888
prediction (`np.ndarray`, `torch.Tensor`):
89-
Predicted depth maps with values in the range [0, 1]. The shape is always $numimages \times 1 \times height
90-
\times width$, regardless of whether the images were passed as a 4D array or a list.
89+
Predicted depth maps with values in the range [0, 1]. The shape is $numimages \times 1 \times height \times
90+
width$ for `torch.Tensor` or $numimages \times height \times width \times 1$ for `np.ndarray`.
9191
uncertainty (`None`, `np.ndarray`, `torch.Tensor`):
9292
Uncertainty maps computed from the ensemble, with values in the range [0, 1]. The shape is $numimages
93-
\times 1 \times height \times width$.
93+
\times 1 \times height \times width$ for `torch.Tensor` or $numimages \times height \times width \times 1$
94+
for `np.ndarray`.
9495
latent (`None`, `torch.Tensor`):
9596
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
9697
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
@@ -208,6 +209,11 @@ def check_inputs(
208209
output_type: str,
209210
output_uncertainty: bool,
210211
) -> int:
212+
actual_vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
213+
if actual_vae_scale_factor != self.vae_scale_factor:
214+
raise ValueError(
215+
f"`vae_scale_factor` computed at initialization ({self.vae_scale_factor}) differs from the actual one ({actual_vae_scale_factor})."
216+
)
211217
if num_inference_steps is None:
212218
raise ValueError("`num_inference_steps` is not specified and could not be resolved from the model config.")
213219
if num_inference_steps < 1:
@@ -320,6 +326,7 @@ def check_inputs(
320326

321327
return num_images
322328

329+
@torch.compiler.disable
323330
def progress_bar(self, iterable=None, total=None, desc=None, leave=True):
324331
if not hasattr(self, "_progress_bar_config"):
325332
self._progress_bar_config = {}
@@ -370,11 +377,9 @@ def __call__(
370377
same width and height.
371378
num_inference_steps (`int`, *optional*, defaults to `None`):
372379
Number of denoising diffusion steps during inference. The default value `None` results in automatic
373-
selection. The number of steps should be at least 10 with the full Marigold models, and between 1 and 4
374-
for Marigold-LCM models.
380+
selection.
375381
ensemble_size (`int`, defaults to `1`):
376-
Number of ensemble predictions. Recommended values are 5 and higher for better precision, or 1 for
377-
faster inference.
382+
Number of ensemble predictions. Higher values result in measurable improvements, but also in visual degradation.
378383
processing_resolution (`int`, *optional*, defaults to `None`):
379384
Effective processing resolution. When set to `0`, matches the larger input image dimension. This
380385
produces crisper predictions, but may also lead to the overall loss of global context. The default
@@ -486,9 +491,7 @@ def __call__(
486491
# `pred_latent` variable. The variable `image_latent` is of the same shape: it contains each input image encoded
487492
# into latent space and replicated `E` times. The latents can be either generated (see `generator` to ensure
488493
# reproducibility), or passed explicitly via the `latents` argument. The latter can be set outside the pipeline
489-
# code. For example, in the Marigold-LCM video processing demo, the latents initialization of a frame is taken
490-
# as a convex combination of the latents output of the pipeline for the previous frame and a newly-sampled
491-
# noise. This behavior can be achieved by setting the `output_latent` argument to `True`. The latent space
494+
# code. To obtain latents that can be passed back this way, set the `output_latent` argument to `True`. The latent space
492495
# dimensions are `(h, w)`. Encoding into latent space happens in batches of size `batch_size`.
493496
# Model invocation: self.vae.encoder.
494497
image_latent, pred_latent = self.prepare_latents(
@@ -733,6 +736,7 @@ def init_param(depth: torch.Tensor):
733736
param = init_s.cpu().numpy()
734737
else:
735738
raise ValueError("Unrecognized alignment.")
739+
param = param.astype(np.float64)
736740

737741
return param
738742

@@ -775,7 +779,7 @@ def cost_fn(param: np.ndarray, depth: torch.Tensor) -> float:
775779

776780
if regularizer_strength > 0:
777781
prediction, _ = ensemble(depth_aligned, return_uncertainty=False)
778-
err_near = (0.0 - prediction.min()).abs().item()
782+
err_near = prediction.min().abs().item()
779783
err_far = (1.0 - prediction.max()).abs().item()
780784
cost += (err_near + err_far) * regularizer_strength
781785

0 commit comments

Comments
 (0)