<!-- Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. -->
# AutoencoderKLMochi
The 3D variational autoencoder (VAE) model with KL loss used in [Mochi](https://github.com/genmoai/models) was introduced in [Mochi 1 Preview](https://huggingface.co/genmo/mochi-1-preview) by Genmo.
The model can be loaded with the following code snippet.
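A minimal loading sketch is shown below; the original snippet is not reproduced here, so the `genmo/mochi-1-preview` repository id and the `vae` subfolder are assumptions based on the Mochi release on the Hub.

```python
import torch

from diffusers import AutoencoderKLMochi

# Load the Mochi VAE weights from the Hub (repo id and subfolder are assumed, not taken from the original page).
vae = AutoencoderKLMochi.from_pretrained(
    "genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32
).to("cuda")
```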
# CogVideoX

Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality.
This pipeline was contributed by [zRzRzRzRzRzRzR](https://github.com/zRzRzRzRzRzRzR). The original codebase can be found [here](https://huggingface.co/THUDM). The original weights can be found under [hf.co/THUDM](https://huggingface.co/THUDM).
There are three official CogVideoX checkpoints for text-to-video and video-to-video, including:

- [`THUDM/CogVideoX-2b`](https://huggingface.co/THUDM/CogVideoX-2b): the recommended dtype for running this model is `fp16`.
- [`THUDM/CogVideoX-5b`](https://huggingface.co/THUDM/CogVideoX-5b): the recommended dtype for running this model is `bf16`.
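As a rough sketch of how these checkpoints are typically used with the text-to-video pipeline, the example below loads `THUDM/CogVideoX-5b` in its recommended `bf16` dtype; the prompt, frame count, and output path are illustrative placeholders rather than values from this guide.

```python
import torch

from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

# Load the text-to-video checkpoint in its recommended inference dtype (bf16 for the 5b model).
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16).to("cuda")

# Generate a short clip; the prompt and frame count are placeholders (adjust them to your checkpoint).
video = pipe(prompt="A panda strumming a guitar in a bamboo forest", num_frames=81).frames[0]

# Export the frames as a video file; 16 FPS matches the recommendation given further below.
export_to_video(video, "output.mp4", fps=16)
```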
There are two official CogVideoX checkpoints available for image-to-video, including:

- [`THUDM/CogVideoX-5b-I2V`](https://huggingface.co/THUDM/CogVideoX-5b-I2V): the recommended dtype for running this model is `bf16`.
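A similar sketch for the image-to-video pipeline is shown below, again assuming the recommended `bf16` dtype; the conditioning image URL, prompt, and frame count are placeholders.

```python
import torch

from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

# Load the image-to-video checkpoint in its recommended inference dtype.
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16
).to("cuda")

# The conditioning image here is a placeholder; replace it with your own image path or URL.
image = load_image("https://example.com/input.png")

# Generate a clip conditioned on the image; the prompt and frame count are placeholders.
video = pipe(prompt="The scene slowly comes to life", image=image, num_frames=81).frames[0]

export_to_video(video, "output_i2v.mp4", fps=16)
```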
- Text-to-video (T2V) works best at a resolution of 1360x768 because it was trained with that specific resolution.
- Image-to-video (I2V) works for multiple resolutions. The width can vary from 768 to 1360, but the height must be 768. The height and width must be divisible by 16.
- Both T2V and I2V models support generation with 81 and 161 frames and work best at these values. Exporting videos at 16 FPS is recommended.
There are two official CogVideoX checkpoints that support pose controllable generation (by the [Alibaba-PAI](https://huggingface.co/alibaba-pai) team):

- [`alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose`](https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-2b-Pose): the recommended dtype for running this model is `bf16`.
- [`alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose`](https://huggingface.co/alibaba-pai/CogVideoX-Fun-V1.1-5b-Pose): the recommended dtype for running this model is `bf16`.