PaddlePaddle
diff --git a/‎ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py
+23 b/‎ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py
+23
diff --git a/‎ppdiffusers/examples/inference/image_variation-unidiffuser.py
+23 b/‎ppdiffusers/examples/inference/image_variation-unidiffuser.py
+23
diff --git a/‎ppdiffusers/examples/inference/text_to_image_generation-unidiffuser.py
+22 b/‎ppdiffusers/examples/inference/text_to_image_generation-unidiffuser.py
+22
diff --git a/‎ppdiffusers/examples/inference/text_variation-unidiffuser.py
+23 b/‎ppdiffusers/examples/inference/text_variation-unidiffuser.py
+23
diff --git a/‎ppdiffusers/examples/inference/unconditional_image_generation-unidiffuser.py
+21 b/‎ppdiffusers/examples/inference/unconditional_image_generation-unidiffuser.py
+21
diff --git a/‎ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py
+24 b/‎ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py
+24
diff --git a/‎ppdiffusers/examples/inference/unconditional_text_generation-unidiffuser.py
+22 b/‎ppdiffusers/examples/inference/unconditional_text_generation-unidiffuser.py
+22
diff --git a/‎ppdiffusers/ppdiffusers/__init__.py
+21 b/‎ppdiffusers/ppdiffusers/__init__.py
+21
diff --git a/‎ppdiffusers/ppdiffusers/models/__init__.py
+13-1 b/‎ppdiffusers/ppdiffusers/models/__init__.py
+13-1
diff --git a/‎ppdiffusers/ppdiffusers/models/attention.py
+53 b/‎ppdiffusers/ppdiffusers/models/attention.py
+53
diff --git a/‎ppdiffusers/ppdiffusers/models/autoencoder_kl.py
+2 b/‎ppdiffusers/ppdiffusers/models/autoencoder_kl.py
+2
diff --git a/‎ppdiffusers/ppdiffusers/models/embeddings.py
+11-3 b/‎ppdiffusers/ppdiffusers/models/embeddings.py
+11-3
@@ -0,0 +1,23 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ppdiffusers import UniDiffuserPipeline
+from ppdiffusers.utils import load_image
+
+# Image-to-text example: load the pretrained UniDiffuser pipeline, run it in
+# "i2t" mode on a sample image fetched by URL, and store the first returned text.
+pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser")
+image = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg")
+result = pipe(mode="i2t", image=image, prompt=None)
+text = result.texts[0]
+# Use an explicit encoding so writing the generated text does not depend on the
+# platform's default locale encoding.
+with open("image_to_text_generation-unidiffuser-result.txt", "w", encoding="utf-8") as f:
+    print("{}\n".format(text), file=f)
@@ -0,0 +1,23 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ppdiffusers import UniDiffuserPipeline
+from ppdiffusers.utils import load_image
+
+# Image-variation example: pass a reference image through the pipeline in
+# "i2t2i" mode and save the first image it returns.
+pipeline = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser")
+source_image = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg")
+output = pipeline(mode="i2t2i", image=source_image, prompt=None)
+variation = output.images[0]
+variation.save("image_variation-unidiffuser-result.png")
@@ -0,0 +1,22 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ppdiffusers import UniDiffuserPipeline
+
+# Text-to-image example: run the pipeline in "t2i" mode on a fixed prompt and
+# save the first image it returns.
+pipeline = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser")
+output = pipeline(mode="t2i", image=None, prompt="an elephant under the sea")
+generated = output.images[0]
+generated.save("text_to_image_generation-unidiffuser-result.png")
@@ -0,0 +1,23 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ppdiffusers import UniDiffuserPipeline
+
+# Text-variation example: run the pipeline in "t2i2t" mode on a fixed prompt
+# and store the first returned text.
+pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser")
+prompt = "an elephant under the sea"
+result = pipe(mode="t2i2t", image=None, prompt=prompt)
+text = result.texts[0]
+# Use an explicit encoding so writing the generated text does not depend on the
+# platform's default locale encoding.
+with open("text_variation-unidiffuser-result.txt", "w", encoding="utf-8") as f:
+    print("{}\n".format(text), file=f)
@@ -0,0 +1,21 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ppdiffusers import UniDiffuserPipeline
+
+# Unconditional image example: run the pipeline in "i" mode with neither an
+# image nor a prompt and save the first image it returns.
+pipeline = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser")
+output = pipeline(mode="i", image=None, prompt=None)
+generated = output.images[0]
+generated.save("unconditional_image_generation-unidiffuser-result.png")
@@ -0,0 +1,24 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ppdiffusers import UniDiffuserPipeline
+
+# Joint example: run the pipeline in "joint" mode with neither an image nor a
+# prompt, then save the first returned image and the first returned text.
+pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser")
+result = pipe(mode="joint", image=None, prompt=None)
+image = result.images[0]
+image.save("unconditional_image_text_generation-unidiffuser-result.png")
+text = result.texts[0]
+# Use an explicit encoding so writing the generated text does not depend on the
+# platform's default locale encoding.
+with open("unconditional_image_text_generation-unidiffuser-result.txt", "w", encoding="utf-8") as f:
+    print("{}\n".format(text), file=f)
@@ -0,0 +1,22 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ppdiffusers import UniDiffuserPipeline
+
+# Unconditional text example: run the pipeline in "t" mode with neither an
+# image nor a prompt and store the first returned text.
+pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser")
+result = pipe(mode="t", image=None, prompt=None)
+text = result.texts[0]
+# Use an explicit encoding so writing the generated text does not depend on the
+# platform's default locale encoding.
+with open("unconditional_text_generation-unidiffuser-result.txt", "w", encoding="utf-8") as f:
+    print("{}\n".format(text), file=f)
@@ -19,6 +19,7 @@
 from .configuration_utils import ConfigMixin
 from .utils import (
     OptionalDependencyNotAvailable,
+    is_einops_available,
     is_fastdeploy_available,
     is_inflect_available,
     is_k_diffusion_available,
@@ -88,6 +89,7 @@
         PNDMPipeline,
         RePaintPipeline,
         ScoreSdeVePipeline,
+        TextPipelineOutput,
     )
     from .schedulers import (
         DDIMInverseScheduler,
@@ -96,6 +98,7 @@
         DEISMultistepScheduler,
         DPMSolverMultistepScheduler,
         DPMSolverSinglestepScheduler,
+        DPMSolverUniDiffuserScheduler,
         EulerAncestralDiscreteScheduler,
         EulerDiscreteScheduler,
         HeunDiscreteScheduler,
@@ -161,13 +164,15 @@
         TextToVideoSDPipeline,
         UnCLIPImageVariationPipeline,
         UnCLIPPipeline,
+        UniDiffuserPipeline,
         VersatileDiffusionDualGuidedPipeline,
         VersatileDiffusionImageVariationPipeline,
         VersatileDiffusionPipeline,
         VersatileDiffusionTextToImagePipeline,
         VQDiffusionPipeline,
     )
     from .pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel
+    from .pipelines.unidiffuser.caption_decoder import CaptionDecoder
 
 try:
     if not (is_paddle_available() and is_paddlenlp_available() and is_k_diffusion_available()):
@@ -200,3 +205,19 @@
     from .utils.dummy_paddle_and_librosa_objects import *  # noqa F403
 else:
     from .pipelines import AudioDiffusionPipeline, Mel
+
+# Export UniDiffuserPipeline only when paddle, paddlenlp and einops are all
+# available; otherwise star-import the matching dummy-objects module instead.
+# NOTE(review): UniDiffuserPipeline is also listed in an earlier
+# `from .pipelines import (...)` group in this file whose guard is not visible
+# here - confirm that import does not fail when einops is missing.
+try:
+    if not (is_paddle_available() and is_paddlenlp_available() and is_einops_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils.dummy_paddle_and_paddlenlp_and_einops_objects import *  # noqa F403
+else:
+    from .pipelines import UniDiffuserPipeline
+
+# Export UViTModel only when both paddle and einops are available; otherwise
+# star-import the matching dummy-objects module instead.
+try:
+    if not (is_paddle_available() and is_einops_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils.dummy_paddle_and_einops_objects import *  # noqa F403
+else:
+    from .models import UViTModel
@@ -15,7 +15,11 @@
 # flake8: noqa
 
 
-from ..utils.import_utils import is_paddle_available
+from ..utils.import_utils import (
+    OptionalDependencyNotAvailable,
+    is_einops_available,
+    is_paddle_available,
+)
 
 if is_paddle_available():
     from .autoencoder_kl import AutoencoderKL
@@ -30,3 +34,11 @@
     from .unet_2d_condition import UNet2DConditionModel
     from .unet_3d_condition import UNet3DConditionModel
     from .vq_model import VQModel
+
+# Import UViTModel from .uvit only when both paddle and einops are available;
+# otherwise star-import the paddle+einops dummy-objects module instead.
+try:
+    if not (is_paddle_available() and is_einops_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils.dummy_paddle_and_einops_objects import *  # noqa F403
+else:
+    from .uvit import UViTModel
@@ -24,6 +24,59 @@
 from .embeddings import CombinedTimestepLabelEmbeddings
 
 
+def drop_path(input, drop_prob: float = 0.0, training: bool = False):
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+
+    Args:
+        input: Tensor to process; one keep/drop decision is drawn per entry along its first dimension.
+        drop_prob: Probability of dropping an entry. `None` and `0.0` both mean "never drop".
+        training: When False, `input` is returned unchanged.
+
+    Returns:
+        `input` itself when nothing is dropped, otherwise `input / keep_prob` multiplied by a 0/1 mask that
+        broadcasts over every dimension except the first.
+    """
+    # `DropPath` defaults `drop_prob` to None; treat that as "no drop" instead of
+    # failing on `1 - None` below when the layer is in training mode.
+    if drop_prob is None or drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    # keep_prob + U[0, 1) floors to 1 with probability keep_prob and to 0 otherwise.
+    random_tensor = keep_prob + paddle.rand(shape, dtype=input.dtype)
+    random_tensor = paddle.floor(random_tensor)  # binarize
+    # Rescale by 1 / keep_prob before masking.
+    output = (input / keep_prob) * random_tensor
+    return output
+
+
+class DropPath(nn.Layer):
+    """Layer form of `drop_path`: applies it using the stored probability and this layer's training flag."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        # Stored as given (may be None) and forwarded to `drop_path` on every call.
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
+        dropped = drop_path(hidden_states, drop_prob=self.drop_prob, training=self.training)
+        return dropped
+
+    def extra_repr(self) -> str:
+        # Extra text for the layer's repr: the configured probability.
+        return f"p={self.drop_prob}"
+
+
+class Mlp(nn.Layer):
+    """Feed-forward block: Linear -> activation -> dropout -> Linear -> dropout."""
+
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0):
+        super().__init__()
+        # Widths that are unset (falsy) fall back to the input width.
+        if not out_features:
+            out_features = in_features
+        if not hidden_features:
+            hidden_features = in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        # A single dropout layer, applied after each of the two stages in forward().
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+
+
 class AttentionBlock(nn.Layer):
     """
     An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
 
@@ -160,6 +160,8 @@ def disable_slicing(self):
 
     @apply_forward_hook
     def encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput:
+        # TODO junnyu, support float16
+        x = x.cast(self.dtype)
         if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
             return self.tiled_encode(x, return_dict=return_dict)
 
 
@@ -125,6 +125,7 @@ def __init__(
         layer_norm=False,
         flatten=True,
         bias=True,
+        add_pos_embed=True,
     ):
         super().__init__()
 
@@ -141,16 +142,23 @@ def __init__(
         else:
             self.norm = None
 
-        pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5))
-        self.register_buffer("pos_embed", paddle.to_tensor(pos_embed).cast("float32").unsqueeze(0), persistable=False)
+        self.add_pos_embed = add_pos_embed
+        if add_pos_embed:
+            pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5))
+            self.register_buffer(
+                "pos_embed", paddle.to_tensor(pos_embed).cast("float32").unsqueeze(0), persistable=False
+            )
 
     def forward(self, latent):
+        """
+        Project `latent` with `self.proj`, optionally flatten it from BCHW to BNC, optionally apply `self.norm`,
+        and add `self.pos_embed` only when `self.add_pos_embed` is set.
+        """
         latent = self.proj(latent)
         if self.flatten:
             latent = latent.flatten(2).transpose([0, 2, 1])  # BCHW -> BNC
         if self.layer_norm:
             latent = self.norm(latent)
-        return latent + self.pos_embed
+        if self.add_pos_embed:
+            return latent + self.pos_embed
+        else:
+            return latent
 
 
 class TimestepEmbedding(nn.Layer):