diff --git a/Package.swift b/Package.swift index de151f4f..3288400e 100644 --- a/Package.swift +++ b/Package.swift @@ -6,8 +6,8 @@ import PackageDescription let package = Package( name: "stable-diffusion", platforms: [ - .macOS(.v11), - .iOS(.v14), + .macOS(.v13), + .iOS(.v16), ], products: [ .library( @@ -18,12 +18,15 @@ let package = Package( targets: ["StableDiffusionCLI"]) ], dependencies: [ - .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.2.3") + .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.2.3"), + .package(url: "https://github.com/huggingface/swift-transformers.git", exact: "0.1.8"), ], targets: [ .target( name: "StableDiffusion", - dependencies: [], + dependencies: [ + .product(name: "Transformers", package: "swift-transformers"), + ], path: "swift/StableDiffusion"), .executableTarget( name: "StableDiffusionCLI", diff --git a/README.md b/README.md index 45414e9d..0cfe9275 100644 --- a/README.md +++ b/README.md @@ -246,6 +246,52 @@ An example `` would be `"recipe_4.50_bit_mixedpalett + +## Using Stable Diffusion 3 + +
+<details>
+  <summary> Details (Click to expand) </summary>
+
+### Model Conversion
+
+Stable Diffusion 3 combines new model components with existing ones. For the text encoders, the conversion can be done with a similar command as before, adding the `--sd3-version` flag:
+
+```bash
+python -m python_coreml_stable_diffusion.torch2coreml --model-version stabilityai/stable-diffusion-3-medium --bundle-resources-for-swift-cli --convert-text-encoder --sd3-version -o <output-mlpackages-directory>
+```
+
+For the new components (MMDiT, a new VAE with 16 channels, and the T5 text encoder), there are a number of new CLI flags that utilize the [DiffusionKit](https://www.github.com/argmaxinc/DiffusionKit) repo:
+
+- `--sd3-version`: Indicates to the converter to treat this as a Stable Diffusion 3 model
+- `--convert-mmdit`: Convert the MMDiT model
+- `--convert-vae-decoder`: Convert the new VAE model (the 16-channel version is used when `--sd3-version` is set)
+- `--include-t5`: Download and include a pre-converted T5 text encoder in the conversion
+
+e.g.:
+```bash
+python -m python_coreml_stable_diffusion.torch2coreml --model-version stabilityai/stable-diffusion-3-medium --bundle-resources-for-swift-cli --convert-vae-decoder --convert-mmdit --include-t5 --sd3-version -o <output-mlpackages-directory>
+```
+
+To convert the full pipeline at 1024x1024 resolution, the following command may be used (the latent resolution is 1/8 of the output resolution, hence `--latent-h 128 --latent-w 128`):
+
+```bash
+python -m python_coreml_stable_diffusion.torch2coreml --model-version stabilityai/stable-diffusion-3-medium --bundle-resources-for-swift-cli --convert-text-encoder --convert-vae-decoder --convert-mmdit --include-t5 --sd3-version --latent-h 128 --latent-w 128 -o <output-mlpackages-directory>
+```
+
+Keep in mind that the MMDiT model is quite large and requires increasingly more memory and time to convert as the latent resolution increases.
+
+Also note that the MMDiT model currently requires float32 precision, so it only supports `CPU_AND_GPU` compute units and the `ORIGINAL` attention implementation (the default for this pipeline).
+
+### Swift Inference
+
+Swift inference for Stable Diffusion 3 is similar to previous versions. The only difference is the `--sd3` flag, which indicates that the model is a Stable Diffusion 3 model:
+
+```bash
+swift run StableDiffusionSample <prompt> --resource-path <output-mlpackages-directory>/Resources --output-path <output-dir> --compute-units cpuAndGPU --sd3
+```
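+
+For programmatic use, the Swift call pattern mirrors the `StableDiffusionPipeline` example earlier in this README. The sketch below is illustrative only: `StableDiffusion3Pipeline` and its configuration are assumed names, so check the Swift sources for the exact SD3 API:
+
+```swift
+import CoreML
+import StableDiffusion
+
+let config = MLModelConfiguration()
+config.computeUnits = .cpuAndGPU // MMDiT currently requires float32, so avoid the Neural Engine
+
+let resourceURL = URL(fileURLWithPath: "<output-mlpackages-directory>/Resources")
+
+// `StableDiffusion3Pipeline` is a placeholder name for the SD3-capable pipeline type
+let pipeline = try StableDiffusion3Pipeline(resourcesAt: resourceURL, configuration: config)
+try pipeline.loadResources()
+
+let image = try pipeline.generateImages(
+    configuration: .init(prompt: "a photo of an astronaut riding a horse on mars")
+).first
+```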
+</details>

 ## Using Stable Diffusion XL
@@ -356,6 +402,7 @@ Resources:
 - [`stabilityai/stable-diffusion-2-1-base`](https://huggingface.co/apple/coreml-stable-diffusion-2-1-base)
 - [`stabilityai/stable-diffusion-xl-base-1.0`](https://huggingface.co/apple/coreml-stable-diffusion-xl-base)
 - [`stabilityai/stable-diffusion-xl-{base+refiner}-1.0`](https://huggingface.co/apple/coreml-stable-diffusion-xl-base-with-refiner)
+ - [`stabilityai/stable-diffusion-3-medium`](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
 
 If you want to use any of those models you may download the weights and proceed to [generate images with Python](#image-generation-with-python) or [Swift](#image-generation-with-swift).
 
diff --git a/python_coreml_stable_diffusion/torch2coreml.py b/python_coreml_stable_diffusion/torch2coreml.py
index 602f78e7..a85c2cd0 100644
--- a/python_coreml_stable_diffusion/torch2coreml.py
+++ b/python_coreml_stable_diffusion/torch2coreml.py
@@ -16,7 +16,12 @@
     DiffusionPipeline,
     ControlNetModel
 )
+from diffusionkit.tests.torch2coreml import (
+    convert_mmdit_to_mlpackage,
+    convert_vae_to_mlpackage
+)
 import gc
+from huggingface_hub import snapshot_download
 
 import logging
 
@@ -207,6 +212,26 @@ def _compile_coreml_model(source_model_path, output_dir, final_name):
 
     return target_path
 
+def _download_t5_model(args, t5_save_path):
+    """ Downloads the pre-converted T5 text encoder referenced by
+    `args.text_encoder_t5_url` and copies it into the Resources directory.
+    """
+    t5_url = args.text_encoder_t5_url
+    match = re.match(r'https://huggingface.co/(.+)/resolve/main/(.+)', t5_url)
+    if not match:
+        raise ValueError(f"Invalid Hugging Face URL: {t5_url}")
+    repo_id, model_subpath = match.groups()
+
+    download_path = snapshot_download(
+        repo_id=repo_id,
+        revision="main",
+        allow_patterns=[f"{model_subpath}/*"]
+    )
+    logger.info(f"Downloaded T5 model to {download_path}")
+
+    # Copy the downloaded model to the top level of the Resources directory
+    logger.info(f"Copying T5 model from {download_path} to {t5_save_path}")
+    cache_path = os.path.join(download_path, model_subpath)
+    shutil.copytree(cache_path, t5_save_path)
+
+
 def bundle_resources_for_swift_cli(args):
     """
     - Compiles Core ML models from mlpackage into mlmodelc format
@@ -228,6 +253,7 @@ def bundle_resources_for_swift_cli(args):
         ("refiner", "UnetRefiner"),
         ("refiner_chunk1", "UnetRefinerChunk1"),
         ("refiner_chunk2", "UnetRefinerChunk2"),
+        ("mmdit", "MultiModalDiffusionTransformer"),
         ("control-unet", "ControlledUnet"),
         ("control-unet_chunk1", "ControlledUnetChunk1"),
         ("control-unet_chunk2", "ControlledUnetChunk2"),
@@ -241,7 +267,7 @@ def bundle_resources_for_swift_cli(args):
             logger.warning(
                 f"{source_path} not found, skipping compilation to {target_name}.mlmodelc"
             )
-    
+
     if args.convert_controlnet:
         for controlnet_model_version in args.convert_controlnet:
             controlnet_model_name = controlnet_model_version.replace("/", "_")
@@ -271,6 +297,25 @@ def bundle_resources_for_swift_cli(args):
             f.write(requests.get(args.text_encoder_merges_url).content)
         logger.info("Done")
 
+    # Fetch and save the pre-converted T5 text encoder model
+    t5_model_name = "TextEncoderT5.mlmodelc"
+    t5_save_path = os.path.join(resources_dir, t5_model_name)
+    if args.include_t5:
+        if not os.path.exists(t5_save_path):
+            logger.info("Downloading pre-converted T5 encoder model TextEncoderT5.mlmodelc")
+            _download_t5_model(args, t5_save_path)
+            logger.info("Done")
+        else:
+            logger.info(f"Skipping T5 download as {t5_save_path} already exists")
+
+        # Fetch and save the T5 text tokenizer JSON files
+        logger.info("Downloading and saving T5 tokenizer files tokenizer_config.json and tokenizer.json")
+        with open(os.path.join(resources_dir, "tokenizer_config.json"), "wb") as f:
+            f.write(requests.get(args.text_encoder_t5_config_url).content)
+        with open(os.path.join(resources_dir, "tokenizer.json"), "wb") as f:
+            f.write(requests.get(args.text_encoder_t5_data_url).content)
+        logger.info("Done")
+
     return resources_dir
 
@@ -557,6 +602,61 @@ def forward(self, z):
     del traced_vae_decoder, pipe.vae.decoder, coreml_vae_decoder
     gc.collect()
 
+def convert_vae_decoder_sd3(args):
+    """ Converts the VAE Decoder component of Stable Diffusion 3
+    """
+    out_path = _get_out_path(args, "vae_decoder")
+    if os.path.exists(out_path):
+        logger.info(
+            f"`vae_decoder` already exists at {out_path}, skipping conversion."
+        )
+        return
+
+    # Convert the VAE Decoder model via DiffusionKit
+    converted_vae_path = convert_vae_to_mlpackage(
+        model_version=args.model_version,
+        latent_h=args.latent_h,
+        latent_w=args.latent_w,
+        output_dir=args.o,
+    )
+
+    # Load the converted model
+    coreml_vae_decoder = ct.models.MLModel(converted_vae_path)
+
+    # Set model metadata
+    coreml_vae_decoder.author = f"Please refer to the Model Card available at huggingface.co/{args.model_version}"
+    coreml_vae_decoder.license = "Stability AI Community License (https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/LICENSE.md)"
+    coreml_vae_decoder.version = args.model_version
+    coreml_vae_decoder.short_description = \
+        "Stable Diffusion 3 generates images conditioned on text or other images as input through the diffusion process. " \
+        "Please refer to https://arxiv.org/pdf/2403.03206 for details."
+
+    # Set the input descriptions
+    coreml_vae_decoder.input_description["z"] = \
+        "The denoised latent embeddings from the denoiser model (MMDiT) after the last step of reverse diffusion"
+
+    # Set the output descriptions
+    coreml_vae_decoder.output_description[
+        "image"] = "Generated image normalized to range [-1, 1]"
+
+    # Set package version metadata
+    from python_coreml_stable_diffusion._version import __version__
+    coreml_vae_decoder.user_defined_metadata["com.github.apple.ml-stable-diffusion.version"] = __version__
+    from diffusionkit.version import __version__
+    coreml_vae_decoder.user_defined_metadata["com.github.argmax.diffusionkit.version"] = __version__
+
+    # Save the updated model
+    coreml_vae_decoder.save(out_path)
+
+    logger.info(f"Saved vae_decoder into {out_path}")
+
+    # Delete the intermediate mlpackage produced by DiffusionKit
+    if os.path.exists(converted_vae_path):
+        shutil.rmtree(converted_vae_path)
+
+    del coreml_vae_decoder
+    gc.collect()
+
 def convert_vae_encoder(pipe, args):
     """ Converts the VAE Encoder component of Stable Diffusion
     """
@@ -909,6 +1009,72 @@ def convert_unet(pipe, args, model_name = None):
         chunk_mlprogram.main(args)
 
+def convert_mmdit(args):
+    """ Converts the MMDiT component of Stable Diffusion 3
+    """
+    out_path = _get_out_path(args, "mmdit")
+    if os.path.exists(out_path):
+        logger.info(
+            f"`mmdit` already exists at {out_path}, skipping conversion."
+        )
+        return
+
+    # Convert the MMDiT model via DiffusionKit
+    converted_mmdit_path = convert_mmdit_to_mlpackage(
+        model_version=args.model_version,
+        latent_h=args.latent_h,
+        latent_w=args.latent_w,
+        output_dir=args.o,
+        # FIXME: Hardcoding to CPU_AND_GPU since ANE doesn't support FLOAT32
+        compute_precision=ct.precision.FLOAT32,
+        compute_unit=ct.ComputeUnit.CPU_AND_GPU,
+    )
+
+    # Load the converted model
+    coreml_mmdit = ct.models.MLModel(converted_mmdit_path)
+
+    # Set model metadata
+    coreml_mmdit.author = f"Please refer to the Model Card available at huggingface.co/{args.model_version}"
+    coreml_mmdit.license = "Stability AI Community License (https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/LICENSE.md)"
+    coreml_mmdit.version = args.model_version
+    coreml_mmdit.short_description = \
+        "Stable Diffusion 3 generates images conditioned on text or other images as input through the diffusion process. " \
+        "Please refer to https://arxiv.org/pdf/2403.03206 for details."
+
+    # Set the input descriptions
+    coreml_mmdit.input_description["latent_image_embeddings"] = \
+        "The low resolution latent feature maps being denoised through reverse diffusion"
+    coreml_mmdit.input_description["token_level_text_embeddings"] = \
+        "Output embeddings from the associated text_encoder model to condition the generated image on text. " \
+        "A maximum of 77 tokens (~40 words) are allowed. Longer text is truncated."
+    coreml_mmdit.input_description["pooled_text_embeddings"] = \
+        "Pooled output embeddings from the associated text encoders that provide additional conditioning to the MMDiT model."
+    coreml_mmdit.input_description["timestep"] = \
+        "A value emitted by the associated scheduler object to condition the model on a given noise schedule"
+
+    # Set the output descriptions
+    coreml_mmdit.output_description["denoiser_output"] = \
+        "Same shape and dtype as the `latent_image_embeddings` input. " \
" \ + "The predicted noise to facilitate the reverse diffusion (denoising) process" + + # Set package version metadata + from python_coreml_stable_diffusion._version import __version__ + coreml_mmdit.user_defined_metadata["com.github.apple.ml-stable-diffusion.version"] = __version__ + from diffusionkit.version import __version__ + coreml_mmdit.user_defined_metadata["com.github.argmax.diffusionkit.version"] = __version__ + + # Save the updated model + coreml_mmdit.save(out_path) + + logger.info(f"Saved vae_decoder into {out_path}") + + # Delete the original file + if os.path.exists(converted_mmdit_path): + shutil.rmtree(converted_mmdit_path) + + del coreml_mmdit + gc.collect() + def convert_safety_checker(pipe, args): """ Converts the Safety Checker component of Stable Diffusion """ @@ -1288,6 +1454,16 @@ def get_pipeline(args): use_safetensors=True, vae=vae, use_auth_token=True) + elif args.sd3_version: + # SD3 uses standard SDXL diffusers pipeline besides the vae, denoiser, and T5 text encoder + sdxl_base_version = "stabilityai/stable-diffusion-xl-base-1.0" + args.xl_version = True + logger.info(f"SD3 version specified, initializing DiffusionPipeline with {sdxl_base_version} for non-SD3 components..") + pipe = DiffusionPipeline.from_pretrained(sdxl_base_version, + torch_dtype=torch.float16, + variant="fp16", + use_safetensors=True, + use_auth_token=True) else: pipe = DiffusionPipeline.from_pretrained(model_version, torch_dtype=torch.float16, @@ -1316,7 +1492,10 @@ def main(args): # Convert models if args.convert_vae_decoder: logger.info("Converting vae_decoder") - convert_vae_decoder(pipe, args) + if args.sd3_version: + convert_vae_decoder_sd3(args) + else: + convert_vae_decoder(pipe, args) logger.info("Converted vae_decoder") if args.convert_vae_encoder: @@ -1363,6 +1542,11 @@ def main(args): del pipe gc.collect() logger.info(f"Converted refiner") + + if args.convert_mmdit: + logger.info("Converting mmdit") + convert_mmdit(args) + logger.info("Converted mmdit") if args.quantize_nbits is not None: logger.info(f"Quantizing weights to {args.quantize_nbits}-bit precision") @@ -1383,6 +1567,7 @@ def parser_spec(): parser.add_argument("--convert-vae-decoder", action="store_true") parser.add_argument("--convert-vae-encoder", action="store_true") parser.add_argument("--convert-unet", action="store_true") + parser.add_argument("--convert-mmdit", action="store_true") parser.add_argument("--convert-safety-checker", action="store_true") parser.add_argument( "--convert-controlnet", @@ -1489,6 +1674,7 @@ def parser_spec(): "If specified, enable unet to receive additional inputs from controlnet. " "Each input added to corresponding resnet output." 
    )
+    parser.add_argument(
+        "--include-t5",
+        action="store_true",
+        help="If specified, download and bundle a pre-converted T5 text encoder when bundling resources for the Swift CLI.")
 
     # Swift CLI Resource Bundling
     parser.add_argument(
@@ -1508,11 +1694,30 @@ def parser_spec():
         default=
         "https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/merges.txt",
         help="The URL to the merged pairs used in by the text tokenizer.")
+    parser.add_argument(
+        "--text-encoder-t5-url",
+        default=
+        "https://huggingface.co/argmaxinc/coreml-stable-diffusion-3-medium/resolve/main/TextEncoderT5.mlmodelc",
+        help="The URL to the pre-converted T5 encoder model.")
+    parser.add_argument(
+        "--text-encoder-t5-config-url",
+        default=
+        "https://huggingface.co/google-t5/t5-small/resolve/main/tokenizer_config.json",
+        help="The URL to the T5 tokenizer configuration file (tokenizer_config.json).")
+    parser.add_argument(
+        "--text-encoder-t5-data-url",
+        default=
+        "https://huggingface.co/google-t5/t5-small/resolve/main/tokenizer.json",
+        help="The URL to the T5 tokenizer data file (tokenizer.json).")
     parser.add_argument(
         "--xl-version",
         action="store_true",
         help=("If specified, the pre-trained model will be treated as an instantiation of "
               "`diffusers.pipelines.StableDiffusionXLPipeline` instead of `diffusers.pipelines.StableDiffusionPipeline`"))
+    parser.add_argument(
+        "--sd3-version",
+        action="store_true",
+        help=("If specified, the pre-trained model will be treated as a Stable Diffusion 3 model."))
 
     return parser
 
diff --git a/requirements.txt b/requirements.txt
index 8fa7b07a..bd8bb117 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 coremltools>=7.0
 diffusers[torch]
+diffusionkit
 torch
 transformers==4.29.2
 scipy
diff --git a/swift/StableDiffusion/pipeline/Decoder.swift b/swift/StableDiffusion/pipeline/Decoder.swift
index cc8b9b9d..1d39aa32 100644
--- a/swift/StableDiffusion/pipeline/Decoder.swift
+++ b/swift/StableDiffusion/pipeline/Decoder.swift
@@ -1,5 +1,5 @@
 // For licensing see accompanying LICENSE.md file.
-// Copyright (C) 2022 Apple Inc. All Rights Reserved.
+// Copyright (C) 2024 Apple Inc. All Rights Reserved.
 
 import Foundation
 import CoreML
@@ -28,7 +28,7 @@ public struct Decoder: ResourceManaging {
 
     /// Unload the underlying model to free up memory
     public func unloadResources() {
-        model.unloadResources()
+        model.unloadResources()
     }
 
     /// Batch decode latent samples into images
@@ -39,14 +39,15 @@
     /// - Returns: decoded images
     public func decode(
         _ latents: [MLShapedArray<Float32>],
-        scaleFactor: Float32
+        scaleFactor: Float32,
+        shiftFactor: Float32 = 0.0
     ) throws -> [CGImage] {
 
         // Form batch inputs for model
         let inputs: [MLFeatureProvider] = try latents.map { sample in
             // Reference pipeline scales the latent samples before decoding
             let sampleScaled = MLShapedArray<Float32>(
-                scalars: sample.scalars.map { $0 / scaleFactor },
+                scalars: sample.scalars.map { $0 / scaleFactor + shiftFactor },
                 shape: sample.shape)
 
             let dict = [inputName: MLMultiArray(sampleScaled)]
diff --git a/swift/StableDiffusion/pipeline/DiscreteFlowScheduler.swift b/swift/StableDiffusion/pipeline/DiscreteFlowScheduler.swift
new file mode 100644
index 00000000..59e3ea4a
--- /dev/null
+++ b/swift/StableDiffusion/pipeline/DiscreteFlowScheduler.swift
@@ -0,0 +1,123 @@
+// For licensing see accompanying LICENSE.md file.
+// Copyright (C) 2024 Apple Inc. All Rights Reserved.
+
+import CoreML
+
+/// A scheduler used to compute a de-noised image
+@available(iOS 16.2, macOS 13.1, *)
+public final class DiscreteFlowScheduler: Scheduler {
+    public let trainStepCount: Int
+    public let inferenceStepCount: Int
+    public var timeSteps = [Int]()
+    public var betas = [Float]()
+    public var alphas = [Float]()
+    public var alphasCumProd = [Float]()
+
+    public private(set) var modelOutputs: [MLShapedArray<Float32>] = []
+
+    var trainSteps: Float
+    var shift: Float
+    var counter: Int
+    var sigmas = [Float]()
+
+    /// Create a discrete flow-matching scheduler that denoises with Euler steps
+    /// along a shifted sigma schedule.
+    ///
+    /// - Parameters:
+    ///   - stepCount: Number of inference steps to schedule
+    ///   - trainStepCount: Number of training diffusion steps
+    ///   - timeStepShift: Amount to shift the timestep schedule
+    /// - Returns: A scheduler ready for its first step
+    public init(
+        stepCount: Int = 50,
+        trainStepCount: Int = 1000,
+        timeStepShift: Float = 3.0
+    ) {
+        self.trainStepCount = trainStepCount
+        self.inferenceStepCount = stepCount
+        self.trainSteps = Float(trainStepCount)
+        self.shift = timeStepShift
+        self.counter = 0
+
+        let sigmaDistribution = linspace(1, trainSteps, Int(trainSteps)).map { sigmaFromTimestep($0) }
+        let timeStepDistribution = linspace(sigmaDistribution.first!, sigmaDistribution.last!, stepCount).reversed()
+        self.timeSteps = timeStepDistribution.map { Int($0 * trainSteps) }
+        self.sigmas = timeStepDistribution.map { sigmaFromTimestep($0 * trainSteps) }
+    }
+
+    func sigmaFromTimestep(_ timestep: Float) -> Float {
+        if shift == 1.0 {
+            return timestep / trainSteps
+        } else {
+            // shift * timestep / (1 + (shift - 1) * timestep)
+            let t = timestep / trainSteps
+            return shift * t / (1 + (shift - 1) * t)
+        }
+    }
+
+    func timestepsFromSigmas() -> [Float] {
+        return sigmas.map { $0 * trainSteps }
+    }
+
+    /// Convert the model output to the corresponding type the algorithm needs.
+    func convertModelOutput(modelOutput: MLShapedArray<Float32>, timestep: Int, sample: MLShapedArray<Float32>) -> MLShapedArray<Float32> {
+        assert(modelOutput.scalarCount == sample.scalarCount)
+        let stepIndex = timeSteps.firstIndex(of: timestep) ?? counter
+        let sigma = sigmas[stepIndex]
+
+        return MLShapedArray(unsafeUninitializedShape: modelOutput.shape) { result, _ in
+            modelOutput.withUnsafeShapedBufferPointer { noiseScalars, _, _ in
+                sample.withUnsafeShapedBufferPointer { latentScalars, _, _ in
+                    for i in 0..<result.count {
+                        // Convert the velocity prediction into a denoised sample: x0 = x_t - sigma * v
+                        result.initializeElement(at: i, to: latentScalars[i] - noiseScalars[i] * sigma)
+                    }
+                }
+            }
+        }
+    }
+
+    public func calculateTimesteps(strength: Float?) -> [Float] {
+        guard let strength else { return timestepsFromSigmas() }
+        let startStep = max(inferenceStepCount - Int(Float(inferenceStepCount) * strength), 0)
+        let actualTimesteps = Array(timestepsFromSigmas()[startStep...])
+        return actualTimesteps
+    }
+
+    public func step(output: MLShapedArray<Float32>, timeStep t: Int, sample: MLShapedArray<Float32>) -> MLShapedArray<Float32> {
+        let stepIndex = timeSteps.firstIndex(of: t) ?? counter // TODO: allow float timesteps in scheduler step protocol
+        let modelOutput = convertModelOutput(modelOutput: output, timestep: t, sample: sample)
+        modelOutputs.append(modelOutput)
+
+        let sigma = sigmas[stepIndex]
+        var dt = sigma
+        var prevSigma: Float = 0
+        if stepIndex < sigmas.count - 1 {
+            prevSigma = sigmas[stepIndex + 1]
+            dt = prevSigma - sigma
+        }
+
+        let prevSample: MLShapedArray<Float32> = MLShapedArray(unsafeUninitializedShape: modelOutput.shape) { result, _ in
+            modelOutput.withUnsafeShapedBufferPointer { noiseScalars, _, _ in
+                sample.withUnsafeShapedBufferPointer { latentScalars, _, _ in
+                    for i in 0..<result.count {
+                        // `noiseScalars` holds the denoised prediction (x0) from convertModelOutput;
+                        // take an Euler step along the flow: x_prev = x_t + dt * (x_t - x0) / sigma
+                        let derivative = (latentScalars[i] - noiseScalars[i]) / sigma
+                        result.initializeElement(at: i, to: latentScalars[i] + derivative * dt)
+                    }
+                }
+            }
+        }
+
+        counter += 1
+        return prevSample
+    }
+}
diff --git a/swift/StableDiffusion/pipeline/MultiModalDiffusionTransformer.swift b/swift/StableDiffusion/pipeline/MultiModalDiffusionTransformer.swift
new file mode 100644
--- /dev/null
+++ b/swift/StableDiffusion/pipeline/MultiModalDiffusionTransformer.swift
+// For licensing see accompanying LICENSE.md file.
+// Copyright (C) 2024 Apple Inc. All Rights Reserved.
+
+import Foundation
+import CoreML
+
+extension Array where Element == ManagedMLModel {
+    func predictions(from batch: MLBatchProvider) throws -> MLBatchProvider {
+        var results = try self.first!.perform { model in
+            try model.predictions(fromBatch: batch)
+        }
+
+        if self.count == 1 {
+            return results
+        }
+
+        // Manual pipeline batch prediction
+        let inputs = batch.arrayOfFeatureValueDictionaries
+        for stage in self.dropFirst() {
+            // Combine the original inputs with the outputs of the last stage
+            let next = try results.arrayOfFeatureValueDictionaries
+                .enumerated().map { index, dict in
+                    let nextDict = dict.merging(inputs[index]) { out, _ in out }
+                    return try MLDictionaryFeatureProvider(dictionary: nextDict)
+                }
+            let nextBatch = MLArrayBatchProvider(array: next)
+
+            // Predict
+            results = try stage.perform { model in
+                try model.predictions(fromBatch: nextBatch)
+            }
+        }
+        return results
+    }
+}
+
+extension MLFeatureProvider {
+    var featureValueDictionary: [String : MLFeatureValue] {
+        self.featureNames.reduce(into: [String : MLFeatureValue]()) { result, name in
+            result[name] = self.featureValue(for: name)
+        }
+    }
+}
+
+extension MLBatchProvider {
+    var arrayOfFeatureValueDictionaries: [[String : MLFeatureValue]] {
+        (0..<self.count).map {
+            self.features(at: $0).featureValueDictionary
+        }
+    }
+}
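+
+// Usage sketch (illustrative only): pipelining a batch across chunked models.
+// `chunk1URL`, `chunk2URL`, and `inputFeatureProviders` are hypothetical; `ManagedMLModel`
+// is assumed to take a compiled model URL plus an MLModelConfiguration, as elsewhere in this package.
+//
+//     let config = MLModelConfiguration()
+//     config.computeUnits = .cpuAndGPU
+//     let models = [
+//         ManagedMLModel(modelAt: chunk1URL, configuration: config),
+//         ManagedMLModel(modelAt: chunk2URL, configuration: config),
+//     ]
+//     let batch = MLArrayBatchProvider(array: inputFeatureProviders)
+//     // Runs the first chunk, then feeds each later chunk the previous stage's
+//     // outputs merged with the original inputs.
+//     let outputs = try models.predictions(from: batch)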