VLM support for image and video processing with SmolVLM support #206
@@ -15,6 +15,11 @@ import SwiftUI
     typealias PlatformImage = NSImage
 #endif

+let videoSystemPrompt =
+    "Focus only on describing the key dramatic action or notable event occurring in this video segment. Skip general context or scene-setting details unless they are crucial to understanding the main action."
+let imageSystemPrompt =
+    "You are an image understanding model capable of describing the salient features of any image."
+
 struct ContentView: View {
     @State var prompt = ""
     @State var llm = VLMEvaluator()

@@ -28,7 +33,7 @@ struct ContentView: View {
            }
        }
    }
-    @State private var selectedVideoURL: URL? = nil {
+    @State private var selectedVideoURL: URL? {
         didSet {
             if let selectedVideoURL {
                 player = AVPlayer(url: selectedVideoURL)

@@ -61,7 +66,11 @@
        }

        VStack {
-           if let selectedImage {
+           if let player {
+               VideoPlayer(player: player)
+                   .frame(height: 300)
+                   .cornerRadius(12)
+           } else if let selectedImage {
                Group {
                    #if os(iOS) || os(visionOS)
                        Image(uiImage: selectedImage)

@@ -91,11 +100,6 @@
                        EmptyView()
                    }
                }
-           } else if let player {
-               VideoPlayer(player: player)
-                   .scaledToFit()
-                   .frame(maxHeight: 300)
-                   .cornerRadius(12)
            }

            HStack {

@@ -193,6 +197,7 @@
                    .id("bottom")
                }
            }
+           .frame(minHeight: 200)

            HStack {
                TextField("prompt", text: $prompt)

@@ -205,6 +210,11 @@
                    .disabled(llm.running)
            }
        }
+       .onAppear {
+           selectedVideoURL = URL(
+               string:
+                   "https://videos.pexels.com/video-files/4066325/4066325-uhd_2560_1440_24fps.mp4")!
+       }
Review comment: This is nice for testing, but I think we should probably remove the example asset from the example -- force people to use their own images & videos. Also, I don't know the license on this video :-) On the other hand, this is meant as an example for developers to build on, and maybe it is good to have something ready to go? Anyone have any thoughts on this?

Review comment: I'm fine either way. cc @cyrilzakka on the video rights (but also for opinion) :)

Review comment: Maybe we could do something like this?

    selectedVideoURL = URL(string: "https://videos.pexels.com/video-files/4066325/4066325-uhd_2560_1440_24fps.mp4")!

These videos on https://www.pexels.com/search/videos/public%20domain/ are marked as "free to use" and many are pretty short. This way we don't need to include the resource (size + license) but we can still give an example for people to use on first launch.
 #if os(visionOS)
     .padding(40)
 #else
@@ -320,12 +330,12 @@ class VLMEvaluator {
     var modelInfo = ""
     var stat = ""

-    /// This controls which model loads. `qwen2VL2BInstruct4Bit` is one of the smaller ones, so this will fit on
+    /// This controls which model loads. `smolvlm` is very small even unquantized, so it will fit on
     /// more devices.
-    let modelConfiguration = ModelRegistry.qwen2VL2BInstruct4Bit
+    let modelConfiguration = VLMRegistry.smolvlm

-    /// parameters controlling the output
-    let generateParameters = MLXLMCommon.GenerateParameters(temperature: 0.6)
+    /// parameters controlling the output – use values appropriate for the model selected above
+    let generateParameters = MLXLMCommon.GenerateParameters(temperature: 0.7, topP: 0.9)
Review comment: These parameters are also smolvlm-specific.
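Since the defaults are now model-specific, one way to keep the sampling settings in sync with the model choice is to derive both together. A minimal sketch, assuming a hypothetical helper keyed on a model name; the helper and its key are illustrative, but the two parameter sets are the ones this diff uses for SmolVLM and previously used for Qwen 2 VL:

```swift
import MLXLMCommon

// Hypothetical helper, not part of this PR: pair each model with the
// sampling parameters this example app uses for it.
func defaultGenerateParameters(forModelNamed name: String) -> GenerateParameters {
    switch name {
    case "smolvlm":
        return GenerateParameters(temperature: 0.7, topP: 0.9)  // SmolVLM values above
    default:
        return GenerateParameters(temperature: 0.6)  // previous qwen2VL2BInstruct4Bit value
    }
}
```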
     let maxTokens = 800

     /// update the display every N tokens -- 4 looks like it updates continuously

@@ -401,7 +411,11 @@
             [
                 "role": "user",
                 "content": [
-                    ["type": "text", "text": prompt]
+                    [
+                        "type": "text",
+                        "text": videoURL != nil
+                            ? videoSystemPrompt : imageSystemPrompt,
+                    ]
                 ]
                 // Messages format for Qwen 2 VL, Qwen 2.5 VL. May need to be adapted for other models.
                 + images.map { _ in
@@ -5,6 +5,18 @@ import CoreImage.CIFilterBuiltins
 import MLX
 import MLXLMCommon

+public struct VideoFrame {
+    let frame: CIImage
+    let timeStamp: CMTime
+}
+
+public struct ProcessedFrames {
+    let frames: [MLXArray]
+    let timestamps: [CMTime]
+    let totalDuration: CMTime
+}
+
 // TODO: verify working color space, rendering color space
Review comment: This is a good idea. I think the Python processing code is roughly equivalent to: the colorspace of the input, no conversion to linear, and what is called "device RGB" (don't touch my colors). In other words it isn't managed color, but that is what we have here. We could certainly do something like use the non-linear form of the input colorspace and output to the same. In practice I am not sure it matters that much. These models are probably trained on consistent colorspace inputs (though sRGB is likely, displayP3 from iPhone images is pretty likely, and videos are much more diverse). Maybe this should turn into an issue?

Review comment: That said: I don't think we should try to replicate the unmanaged colorspace of the Python version. I think we should pick a colorspace (sRGB or displayP3) and be consistent.

Review comment: Yes, makes sense to turn into an issue. I also think that Python pre-processing is mostly oblivious to colorspace.
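For reference, a minimal sketch of what "pick a colorspace and be consistent" could look like with Core Image: pin both the working and output colorspaces of the shared context. Choosing sRGB here is an assumption for illustration; the PR itself leaves the context unconfigured (hence the TODO above):

```swift
import CoreImage

// Sketch, not part of the PR: a CIContext pinned to sRGB so every input
// (sRGB images, displayP3 iPhone photos, assorted video frames) is filtered
// and rendered in one consistent colorspace.
let srgb = CGColorSpace(name: CGColorSpace.sRGB)!
let managedContext = CIContext(options: [
    .workingColorSpace: srgb,  // filter math happens in sRGB
    .outputColorSpace: srgb,  // rendered pixels come back out in sRGB
])
```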
 private let context = CIContext()

 /// Collection of methods for processing media (images, video, etc.).

@@ -58,6 +70,12 @@ public enum MediaProcessing {
         min(other.width / size.width, other.height / size.height)
     }

+    static public func aspectRatioForResample(_ image: CIImage, size: CGSize) -> Float {
+        let inputAspectRatio = image.extent.width / image.extent.height
+        let desiredAspectRatio = size.width / size.height
+        return Float(1 / inputAspectRatio * desiredAspectRatio)
+    }
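To make the new helper concrete: on Core Image's scale-transform filters, `aspectRatio` is the extra horizontal squeeze or stretch applied on top of the uniform scale. A quick worked example, purely illustrative with made-up dimensions:

```swift
import CoreImage

// A 16:9 frame resampled toward a square target:
//   inputAspectRatio   = 1920 / 1080 = 16/9
//   desiredAspectRatio = 512 / 512   = 1
//   aspectRatio        = (1 / (16/9)) * 1 = 0.5625
let frame = CIImage(color: .gray).cropped(
    to: CGRect(x: 0, y: 0, width: 1920, height: 1080))
let ratio = MediaProcessing.aspectRatioForResample(
    frame, size: CGSize(width: 512, height: 512))
// ratio == 0.5625
```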
     /// Resample the image using bicubic interpolation.
     public static func resampleBicubic(_ image: CIImage, to size: CGSize) -> CIImage {
         let filter = CIFilter.bicubicScaleTransform()

@@ -66,9 +84,34 @@
         filter.inputImage = image

         // set the aspect ratio to match the aspect ratio of the target
-        let inputAspectRatio = extent.width / extent.height
-        let desiredAspectRatio = size.width / size.height
-        filter.aspectRatio = Float(1 / inputAspectRatio * desiredAspectRatio)
+        filter.aspectRatio = aspectRatioForResample(image, size: size)

+        // that image is now the aspect ratio of the target and the size
+        // of the shorter dimension
+        let scale: CGFloat
+        if extent.width < extent.height {
+            scale = size.width / extent.width
+        } else {
+            scale = size.height / extent.height
+        }
+        filter.scale = Float(scale)
+
+        let rescaled = filter.outputImage!
+
+        // the image has a DoD larger than the requested size so crop
+        // it to the desired size
+        return rescaled.cropped(to: CGRect(origin: .zero, size: size))
+    }
+
+    /// Resample the image using Lanczos interpolation.
+    static public func resampleLanczos(_ image: CIImage, to size: CGSize) -> CIImage {
Review comment: Smol uses Lanczos? I agree it is the better resampling method for humans, but the sinc it simulates has an edge-strengthening effect -- I am surprised to see it used here.

Review comment: Yes, it does. I was surprised too when I saw it but didn't follow up with the team. cc @mfarre, just curious if there's any insight :)

Review comment: This is inherited from the Idefics3 image processor :)
+        let filter = CIFilter.lanczosScaleTransform()
+        let extent = image.extent.size
+
+        filter.inputImage = image
+
+        // set the aspect ratio to match the aspect ratio of the target
+        filter.aspectRatio = aspectRatioForResample(image, size: size)

         // that image is now the aspect ratio of the target and the size
         // of the shorter dimension
@@ -264,4 +307,105 @@

         return ciImages
     }

+    static public func asProcessedSequence(
+        _ asset: AVAsset, samplesPerSecond: Int,
+        frameProcessing: (VideoFrame) throws -> VideoFrame = { $0 }
+    ) async throws -> ProcessedFrames {
+        return try await asProcessedSequence(
+            asset, maxFrames: Int.max, targetFPS: { _ in Double(samplesPerSecond) },
+            frameProcessing: frameProcessing)
+    }
+
+    static public func asProcessedSequence(
+        _ asset: AVAsset, maxFrames: Int, targetFPS: (CMTime) -> Double,
+        frameProcessing: (VideoFrame) throws -> VideoFrame = { $0 }
+    ) async throws -> ProcessedFrames {
+        // Use AVAssetImageGenerator to extract frames
+        let generator = AVAssetImageGenerator(asset: asset)
+        generator.appliesPreferredTrackTransform = true
+        generator.requestedTimeToleranceBefore = .zero
+        generator.requestedTimeToleranceAfter = .zero
+
+        guard let duration = try? await asset.load(.duration) else {
+            throw NSError(
+                domain: "MediaProcessing", code: -1,
+                userInfo: [NSLocalizedDescriptionKey: "Failed to load the asset's duration"])
+        }
+        let fps = targetFPS(duration)
+        // Note: the round was not present in `asCIImageSequence`, so we may now be passing 1 more frame to Qwen depending on video duration.
Review comment: As noted in the comment, this may result in an additional frame being extracted for users of the previous `asCIImageSequence`.
+        let estimatedFrames = Int(round(fps * duration.seconds))
+        var desiredFrames = min(estimatedFrames, maxFrames)
+        let finalFrameCount = max(desiredFrames, 1)
+
+        let sampledTimeValues = MLXArray.linspace(
+            0, duration.value, count: Int(finalFrameCount)
+        ).asArray(Int64.self)
+
+        // Construct a CMTime using the sampled CMTime values and the asset's timescale
+        let timescale = duration.timescale
+        let sampledTimes = sampledTimeValues.map { CMTime(value: $0, timescale: timescale) }
+
+        // Collect the frames
+        var ciImages: [CIImage] = []
+        var timestamps: [CMTime] = []
+
+        var frames: [VideoFrame] = []
+
+        for await result in await generator.images(for: sampledTimes) {
+            switch result {
+            case .success(requestedTime: let requested, let image, actualTime: let actual):
+                let ciImage = CIImage(
+                    cgImage: image, options: [.colorSpace: CGColorSpace(name: CGColorSpace.sRGB)!])
+                let frame = try frameProcessing(.init(frame: ciImage, timeStamp: actual))
+                ciImages.append(frame.frame)
+                timestamps.append(frame.timeStamp)
+            case .failure(requestedTime: let requested, let error):
+                break
+            }
+        }
+
+        let framesAsArrays = ciImages.map { $0.asMLXArray() }
+        return ProcessedFrames(
+            frames: framesAsArrays,
+            timestamps: timestamps,
+            totalDuration: duration
+        )
+    }
 }
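A quick sketch of how the new API might be driven; the file path and target size are placeholders, and since `VideoFrame`'s memberwise initializer and `ProcessedFrames`' stored properties are internal, as written this only compiles inside the same module:

```swift
import AVFoundation
import CoreImage

// Sketch: sample a local video at 1 frame per second, resizing each frame
// before it is converted to an MLXArray.
func sampleVideo() async throws {
    let asset = AVURLAsset(url: URL(fileURLWithPath: "/tmp/example.mp4"))  // placeholder path
    let processed = try await MediaProcessing.asProcessedSequence(
        asset, samplesPerSecond: 1
    ) { videoFrame in
        VideoFrame(
            frame: videoFrame.frame.resampled(to: CGSize(width: 384, height: 384)),
            timeStamp: videoFrame.timeStamp)
    }
    print("sampled \(processed.frames.count) frames over \(processed.totalDuration.seconds)s")
}
```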
+
+// MARK: - Convenience
+
+extension CIImage {
+    public enum ResamplingMethod {
+        case bicubic
+        case lanczos
+    }
+
+    public func resampled(to size: CGSize, method: ResamplingMethod = .bicubic) -> CIImage {
+        switch method {
+        case .bicubic:
+            return MediaProcessing.resampleBicubic(self, to: size)
+        case .lanczos:
+            return MediaProcessing.resampleLanczos(self, to: size)
+        }
+    }
+
+    public func toSRGB() -> CIImage {
+        return MediaProcessing.inSRGBToneCurveSpace(self)
+    }
+
+    public func toLinear() -> CIImage {
+        return MediaProcessing.inLinearToneCurveSpace(self)
+    }
+
+    public func normalized(mean: (CGFloat, CGFloat, CGFloat), std: (CGFloat, CGFloat, CGFloat))
+        -> CIImage
+    {
+        return MediaProcessing.normalize(self, mean: mean, std: std)
+    }
+
+    public func asMLXArray(colorSpace: CGColorSpace? = nil) -> MLXArray {
+        return MediaProcessing.asMLXArray(self, colorSpace: colorSpace)
+    }
+}
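Taken together, these conveniences read as a small preprocessing pipeline. A sketch of typical use; the target size and the mean/std normalization stats are illustrative values, not ones prescribed by this PR:

```swift
import CoreImage
import MLX

// Sketch: CIImage -> normalized MLXArray, using the conveniences above.
func preprocess(_ image: CIImage) -> MLXArray {
    image
        .toSRGB()  // settle on the sRGB tone curve before resampling
        .resampled(to: CGSize(width: 384, height: 384), method: .lanczos)
        .normalized(mean: (0.5, 0.5, 0.5), std: (0.5, 0.5, 0.5))  // illustrative stats
        .asMLXArray()
}
```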
Review comment: I think these are fine for now, but this + the message formatting needs to be figured out (later :-) )

Review comment: Agreed!