// For licensing see accompanying LICENSE.md file.
// Copyright (C) 2022 Apple Inc. and The HuggingFace Team. All Rights Reserved.

import Accelerate
import CoreML

/// A scheduler used to compute a de-noised image
///
/// This implementation matches:
/// [Hugging Face Diffusers DPMSolverMultistepScheduler](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py)
///
/// It uses the DPM-Solver++ algorithm: [code](https://github.com/LuChengTHU/dpm-solver) [paper](https://arxiv.org/abs/2211.01095).
/// Limitations:
/// - Only implemented for DPM-Solver++ algorithm (not DPM-Solver).
/// - Second order only.
/// - Assumes the model predicts epsilon.
/// - No dynamic thresholding.
/// - `midpoint` solver algorithm.
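///
/// Example (a minimal usage sketch; `unet` and `initialLatents` are hypothetical
/// stand-ins for the caller's epsilon-predicting model and starting noise):
/// ```swift
/// let scheduler = DPMSolverMultistepScheduler(stepCount: 25)
/// var sample = initialLatents
/// for t in scheduler.timeSteps {
///     let noisePrediction = unet(sample, t) // predicted epsilon at timestep t
///     sample = scheduler.step(output: noisePrediction, timeStep: t, sample: sample)
/// }
/// ```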
@available(iOS 16.2, macOS 13.1, *)
public final class DPMSolverMultistepScheduler: Scheduler {
    public let trainStepCount: Int
    public let inferenceStepCount: Int
    public let betas: [Float]
    public let alphas: [Float]
    public let alphasCumProd: [Float]
    public let timeSteps: [Int]

    public let alpha_t: [Float]
    public let sigma_t: [Float]
    public let lambda_t: [Float]

    public let solverOrder = 2
    private(set) var lowerOrderStepped = 0

    /// Whether to use lower-order solvers in the final steps. Only applied when using fewer than 15 inference steps.
    /// We empirically find this trick can stabilize the sampling of DPM-Solver, especially with 10 or fewer steps.
    public let useLowerOrderFinal = true

    // Stores solverOrder (2) items
    private(set) var modelOutputs: [MLShapedArray<Float32>] = []

    /// Create a scheduler that uses a second-order DPM-Solver++ algorithm.
    ///
    /// - Parameters:
    ///   - stepCount: Number of inference steps to schedule
    ///   - trainStepCount: Number of training diffusion steps
    ///   - betaSchedule: Method to schedule betas from betaStart to betaEnd
    ///   - betaStart: The starting value of beta for inference
    ///   - betaEnd: The end value for beta for inference
    /// - Returns: A scheduler ready for its first step
    public init(
        stepCount: Int = 50,
        trainStepCount: Int = 1000,
        betaSchedule: BetaSchedule = .scaledLinear,
        betaStart: Float = 0.00085,
        betaEnd: Float = 0.012
    ) {
        self.trainStepCount = trainStepCount
        self.inferenceStepCount = stepCount

        switch betaSchedule {
        case .linear:
            self.betas = linspace(betaStart, betaEnd, trainStepCount)
        case .scaledLinear:
            self.betas = linspace(pow(betaStart, 0.5), pow(betaEnd, 0.5), trainStepCount).map({ $0 * $0 })
        }

        self.alphas = betas.map({ 1.0 - $0 })
        var alphasCumProd = self.alphas
        for i in 1..<alphasCumProd.count {
            alphasCumProd[i] *= alphasCumProd[i - 1]
        }
        self.alphasCumProd = alphasCumProd

        // Currently we only support VP-type noise schedule, where alpha_t^2 + sigma_t^2 = 1
        // and lambda_t = log(alpha_t) - log(sigma_t) is one half of the log-SNR
        self.alpha_t = vForce.sqrt(self.alphasCumProd)
        self.sigma_t = vForce.sqrt(vDSP.subtract([Float](repeating: 1, count: self.alphasCumProd.count), self.alphasCumProd))
        self.lambda_t = zip(self.alpha_t, self.sigma_t).map { α, σ in log(α) - log(σ) }

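        // Evenly spaced timesteps from high noise to low noise; for example,
        // trainStepCount = 1000 with stepCount = 5 yields [999, 749, 500, 250, 0]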
        self.timeSteps = linspace(0, Float(self.trainStepCount - 1), stepCount).reversed().map { Int(round($0)) }
    }

    /// Convert the model output to the corresponding type the algorithm needs.
    /// This implementation is for second-order DPM-Solver++ assuming epsilon prediction.
    func convertModelOutput(modelOutput: MLShapedArray<Float32>, timestep: Int, sample: MLShapedArray<Float32>) -> MLShapedArray<Float32> {
        assert(modelOutput.scalars.count == sample.scalars.count)
        let (alpha_t, sigma_t) = (self.alpha_t[timestep], self.sigma_t[timestep])

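        // Epsilon prediction -> data prediction used by DPM-Solver++:
        // x0 = (x_t - sigma_t * eps) / alpha_t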
        // This could be optimized with a Metal kernel if we find we need to
        let x0_scalars = zip(modelOutput.scalars, sample.scalars).map { m, s in
            (s - m * sigma_t) / alpha_t
        }
        return MLShapedArray(scalars: x0_scalars, shape: modelOutput.shape)
    }

    /// One step for the first-order DPM-Solver (equivalent to DDIM).
    /// See https://arxiv.org/abs/2206.00927 for the detailed derivation.
    /// Variable names and code structure mostly follow https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
    func firstOrderUpdate(
        modelOutput: MLShapedArray<Float32>,
        timestep: Int,
        prevTimestep: Int,
        sample: MLShapedArray<Float32>
    ) -> MLShapedArray<Float32> {
        let (p_lambda_t, lambda_s) = (Double(lambda_t[prevTimestep]), Double(lambda_t[timestep]))
        let p_alpha_t = Double(alpha_t[prevTimestep])
        let (p_sigma_t, sigma_s) = (Double(sigma_t[prevTimestep]), Double(sigma_t[timestep]))
        let h = p_lambda_t - lambda_s
        // x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output
        let x_t = weightedSum(
            [p_sigma_t / sigma_s, -p_alpha_t * (exp(-h) - 1)],
            [sample, modelOutput]
        )
        return x_t
    }

    /// One step for the second-order multistep DPM-Solver++ algorithm, using the midpoint method.
    /// Variable names and code structure mostly follow https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
    func secondOrderUpdate(
        modelOutputs: [MLShapedArray<Float32>],
        timesteps: [Int],
        prevTimestep t: Int,
        sample: MLShapedArray<Float32>
    ) -> MLShapedArray<Float32> {
        let (s0, s1) = (timesteps[back: 1], timesteps[back: 2])
        let (m0, m1) = (modelOutputs[back: 1], modelOutputs[back: 2])
        let (p_lambda_t, lambda_s0, lambda_s1) = (Double(lambda_t[t]), Double(lambda_t[s0]), Double(lambda_t[s1]))
        let p_alpha_t = Double(alpha_t[t])
        let (p_sigma_t, sigma_s0) = (Double(sigma_t[t]), Double(sigma_t[s0]))
        let (h, h_0) = (p_lambda_t - lambda_s0, lambda_s0 - lambda_s1)
        let r0 = h_0 / h
        let D0 = m0

        // D1 = (1.0 / r0) * (m0 - m1), a finite-difference estimate of the
        // derivative of the data prediction, rescaled to the current step size
        let D1 = weightedSum(
            [1 / r0, -1 / r0],
            [m0, m1]
        )

        // See https://arxiv.org/abs/2211.01095 for detailed derivations
        // x_t = (
        //    (sigma_t / sigma_s0) * sample
        //    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
        //    - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1
        // )
        let x_t = weightedSum(
            [p_sigma_t / sigma_s0, -p_alpha_t * (exp(-h) - 1), -0.5 * p_alpha_t * (exp(-h) - 1)],
            [sample, D0, D1]
        )
        return x_t
    }

    public func step(output: MLShapedArray<Float32>, timeStep t: Int, sample: MLShapedArray<Float32>) -> MLShapedArray<Float32> {
        let stepIndex = timeSteps.firstIndex(of: t) ?? timeSteps.count - 1
        let prevTimestep = stepIndex == timeSteps.count - 1 ? 0 : timeSteps[stepIndex + 1]

        // Use the first-order update on the very first step (no previous model
        // output exists yet) and, when fewer than 15 inference steps are used,
        // also on the last two steps to stabilize sampling
        let lowerOrderFinal = useLowerOrderFinal && stepIndex == timeSteps.count - 1 && timeSteps.count < 15
        let lowerOrderSecond = useLowerOrderFinal && stepIndex == timeSteps.count - 2 && timeSteps.count < 15
        let lowerOrder = lowerOrderStepped < 1 || lowerOrderFinal || lowerOrderSecond

        let modelOutput = convertModelOutput(modelOutput: output, timestep: t, sample: sample)
        if modelOutputs.count == solverOrder { modelOutputs.removeFirst() }
        modelOutputs.append(modelOutput)

        let prevSample: MLShapedArray<Float32>
        if lowerOrder {
            prevSample = firstOrderUpdate(modelOutput: modelOutput, timestep: t, prevTimestep: prevTimestep, sample: sample)
        } else {
            prevSample = secondOrderUpdate(
                modelOutputs: modelOutputs,
                timesteps: [timeSteps[stepIndex - 1], t],
                prevTimestep: prevTimestep,
                sample: sample
            )
        }
        if lowerOrderStepped < solverOrder {
            lowerOrderStepped += 1
        }

        return prevSample
    }
}
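
// NOTE: `linspace`, `weightedSum`, and the `[back:]` subscript used above are
// assumed to be defined elsewhere in the package (for example alongside the
// `Scheduler` protocol). The sketches below are hypothetical stand-ins that
// match their use in this file, not the package's actual implementations.

/// Evenly spaced values from `start` to `end`, inclusive (assumes `count` >= 2)
func linspace(_ start: Float, _ end: Float, _ count: Int) -> [Float] {
    let scale = (end - start) / Float(count - 1)
    return (0..<count).map { start + Float($0) * scale }
}

extension Array {
    /// Index from the back: `array[back: 1]` is the last element
    subscript(back i: Int) -> Element {
        return self[count - i]
    }
}

@available(iOS 16.2, macOS 13.1, *)
extension DPMSolverMultistepScheduler {
    /// Elementwise weighted sum: result[i] = sum over j of weights[j] * values[j][i]
    func weightedSum(_ weights: [Double], _ values: [MLShapedArray<Float32>]) -> MLShapedArray<Float32> {
        assert(!values.isEmpty && weights.count == values.count)
        var scalars = [Float](repeating: 0, count: values[0].scalars.count)
        for (weight, value) in zip(weights, values) {
            let w = Float(weight)
            for (i, x) in value.scalars.enumerated() {
                scalars[i] += w * x
            }
        }
        return MLShapedArray(scalars: scalars, shape: values[0].shape)
    }
}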