ml-explore
diff --git a/Libraries/MLXVLM/MediaProcessing.swift b/Libraries/MLXVLM/MediaProcessing.swift
Lines changed: 38 additions & 16 deletions
diff --git a/Libraries/MLXVLM/Models/Qwen25VL.swift b/Libraries/MLXVLM/Models/Qwen25VL.swift
Lines changed: 76 additions & 27 deletions
diff --git a/Libraries/MLXVLM/Models/Qwen2VL.swift b/Libraries/MLXVLM/Models/Qwen2VL.swift
Lines changed: 76 additions & 27 deletions
@@ -63,26 +63,48 @@ public enum MediaProcessing {
     ///   - image: The image to resample
     ///   - size: The target size
     /// - Returns: The resampled image
-    static public func resampleBicubic(_ image: CIImage, to size: CGSize) -> CIImage {
-        let filter = CIFilter.bicubicScaleTransform()
-        let extent = image.extent.size
-
-        filter.inputImage = image
+    public static func resampleBicubic(_ image: CIImage, to size: CGSize) -> CIImage {
+        let extent = image.extent
+
+        // Scale factors are undefined for an empty or infinite source extent and for an
+        // empty target size; in those cases just crop to the requested rectangle.
+        guard
+            !extent.isInfinite, extent.width > 0, extent.height > 0,
+            size.width > 0, size.height > 0
+        else {
+            return image.cropped(to: CGRect(origin: .zero, size: size))
+        }
 
-        // set the aspect ratio to match the aspect ratio of the target
-        let inputAspectRatio = extent.width / extent.height
-        let desiredAspectRatio = size.width / size.height
-        filter.aspectRatio = Float(1 / inputAspectRatio * desiredAspectRatio)
+        // CIBicubicScaleTransform scales both axes by `scale` and then multiplies the
+        // width by `aspectRatio`. Taking `scale` from the height and `aspectRatio` from
+        // the ratio of the two per-axis factors reaches the requested size in a single
+        // bicubic pass for every orientation. Using min(xScale, yScale) as the scale
+        // left the output smaller than requested whenever the width factor was the
+        // smaller one, and fixing that with a second affine pass resamples twice.
+        let yScale = size.height / extent.height
+        let xScale = size.width / extent.width
 
-        // Use the same scaling approach regardless of orientation
-        let scale = min(size.width / extent.width, size.height / extent.height)
-        filter.scale = Float(scale)
+        let filter = CIFilter.bicubicScaleTransform()
+        filter.inputImage = image
+        filter.scale = Float(yScale)
+        filter.aspectRatio = Float(xScale / yScale)
 
-        let rescaled = filter.outputImage!
+        // `outputImage` is optional; fall back to a plain affine scale rather than
+        // force-unwrapping if the filter produces nothing.
+        let rescaled =
+            filter.outputImage
+            ?? image.transformed(by: CGAffineTransform(scaleX: xScale, y: yScale))
 
-        // The image has a DoD larger than the requested size, so crop
-        // it to the desired size
-        return rescaled.cropped(to: CGRect(origin: .zero, size: size))
+        // Create a rect with the exact dimensions we want
+        let exactRect = CGRect(
+            x: 0,
+            y: 0,
+            width: size.width,
+            height: size.height
+        )
+
+        // Crop to ensure exact dimensions
+        return rescaled.cropped(to: exactRect)
     }
 
     /// Normalize the image using the given mean and standard deviation parameters.
 
@@ -668,18 +668,18 @@ public class Qwen25VLProcessor: UserInputProcessor {
     public func preprocess(images: [CIImage], processing: UserInput.Processing?) throws -> (
         MLXArray, THW
     ) {
-        // first apply the user requested resizing, etc. if any
+        // First apply the user requested resizing, etc. if any
         let images = images.map { MediaProcessing.apply($0, processing: processing) }
 
         // image_processing_qwen2_vl._preprocess
-
         let size = images[0].extent.size
         let (resizedHeight, resizedWidth) = try QwenVL.targetSize(
             height: Int(size.height), width: Int(size.width),
             factor: config.patchSize * config.mergeSize,
             minPixels: config.size.minPixels, maxPixels: config.size.maxPixels)
         let resizedSize = CGSize(width: resizedWidth, height: resizedHeight)
 
+        // Process images
         let processedImages =
             try images
             .map {
@@ -696,42 +696,79 @@ public class Qwen25VLProcessor: UserInputProcessor {
                 MediaProcessing.asMLXArray($0)
             }
 
+        // Spatial grid dimensions in patches; the temporal one is derived after padding
+        let gridH = resizedHeight / config.patchSize
+        let gridW = resizedWidth / config.patchSize
+
+        // Ensure dimensions are valid
+        guard
+            resizedHeight % config.patchSize == 0 && resizedWidth % config.patchSize == 0
+                && gridH % config.mergeSize == 0 && gridW % config.mergeSize == 0
+        else {
+            throw VLMError.imageProcessingFailure(
+                "Image dimensions must be divisible by patch size and merge size")
+        }
+
+        // Concatenate images and handle temporal patch size
         var patches = concatenated(processedImages)
+        let channel = patches.dim(1)
+
+        // Pad to match temporal patch size if needed
         let mod = patches.dim(0) % config.temporalPatchSize
         if mod != 0 {
             let lastPatch = patches[-1, .ellipsis]
             let lastPatchRepeated = tiled(
                 lastPatch, repetitions: [config.temporalPatchSize - mod, 1, 1, 1])
             patches = concatenated([patches, lastPatchRepeated])
         }
-        let channel = patches.dim(1)
-        let gridT = patches.dim(0) / self.config.temporalPatchSize
-        let gridH = resizedHeight / self.config.patchSize
-        let gridW = resizedWidth / self.config.patchSize
-
-        patches = patches.reshaped(
-            gridT,
-            config.temporalPatchSize,
-            channel,
-            gridH / config.mergeSize,
-            config.mergeSize,
-            config.patchSize,
-            gridW / config.mergeSize,
-            config.mergeSize,
-            config.patchSize
-        )
+
+        // Temporal grid size, measured after padding to a multiple of temporalPatchSize
+        let actualGridT = patches.dim(0) / config.temporalPatchSize
+
+        // `reshaped` is not a throwing call: a shape mismatch goes to MLX's own error
+        // handler (fatal by default), so a do/catch around it can never run. Validate
+        // the element count up front and report a recoverable, descriptive error.
+        let totalElements = patches.size
+        let expectedElements =
+            actualGridT * config.temporalPatchSize * channel * resizedHeight * resizedWidth
+        guard totalElements == expectedElements else {
+            throw VLMError.imageProcessingFailure(
+                "Failed to reshape patches: expected \(expectedElements) elements, "
+                    + "found \(totalElements). Patches shape: \(patches.shape), "
+                    + "Target shape: (\(actualGridT), \(config.temporalPatchSize), \(channel), "
+                    + "\(gridH / config.mergeSize), \(config.mergeSize), \(config.patchSize), "
+                    + "\(gridW / config.mergeSize), \(config.mergeSize), \(config.patchSize))"
+            )
+        }
+
+        // Split each frame group into mergeSize x mergeSize blocks of patchSize tiles
+        patches = patches.reshaped(
+            actualGridT,
+            config.temporalPatchSize,
+            channel,
+            gridH / config.mergeSize,
+            config.mergeSize,
+            config.patchSize,
+            gridW / config.mergeSize,
+            config.mergeSize,
+            config.patchSize
+        )
+
+        // Continue with transpose and final reshape
         patches = patches.transposed(0, 3, 6, 4, 7, 2, 1, 5, 8)
 
         let flattenedPatches = patches.reshaped(
-            gridT * gridH * gridW,
-            channel * config.temporalPatchSize * config.patchSize * config.patchSize
+            actualGridT * (gridH / config.mergeSize) * (gridW / config.mergeSize),
+            channel * config.temporalPatchSize * (config.mergeSize * config.patchSize)
+                * (config.mergeSize * config.patchSize)
         )
 
-        return (flattenedPatches, .init(gridT, gridH, gridW))
+        return (flattenedPatches, .init(actualGridT, gridH, gridW))
     }
 
     public func prepare(input: UserInput) async throws -> LMInput {
         let messages = input.prompt.asMessages()
+
         var promptTokens = try tokenizer.applyChatTemplate(messages: messages)
 
         // Text-only input
@@ -748,10 +785,16 @@ public class Qwen25VLProcessor: UserInputProcessor {
             let imagePixelsConcatenated = concatenated(imagePixelsAndFrames.map { $0.0 })
             processedImage = LMInput.ProcessedImage(
                 pixels: imagePixelsConcatenated, frames: imagePixelsAndFrames.map { $0.1 })
+
             if let imageFrames = processedImage?.frames {
-                promptTokens = try QwenVL.replacePaddingTokens(
-                    in: promptTokens, frames: imageFrames, paddingToken: "<|image_pad|>",
-                    mergeSize: config.mergeSize, tokenizer: tokenizer)
+                do {
+                    promptTokens = try QwenVL.replacePaddingTokens(
+                        in: promptTokens, frames: imageFrames, paddingToken: "<|image_pad|>",
+                        mergeSize: config.mergeSize, tokenizer: tokenizer)
+                } catch {
+                    print("Error in replacePaddingTokens: \(error)")
+                    throw error
+                }
             }
         }
 
@@ -772,10 +815,16 @@ public class Qwen25VLProcessor: UserInputProcessor {
             let videoPixelsConcatenated = concatenated(videoPixelsAndFrames.map { $0.0 })
             processedVideo = LMInput.ProcessedVideo(
                 pixels: videoPixelsConcatenated, frames: videoPixelsAndFrames.map { $0.1 })
+
             if let videoFrames = processedVideo?.frames {
-                promptTokens = try QwenVL.replacePaddingTokens(
-                    in: promptTokens, frames: videoFrames, paddingToken: "<|video_pad|>",
-                    mergeSize: config.mergeSize, tokenizer: tokenizer)
+                do {
+                    promptTokens = try QwenVL.replacePaddingTokens(
+                        in: promptTokens, frames: videoFrames, paddingToken: "<|video_pad|>",
+                        mergeSize: config.mergeSize, tokenizer: tokenizer)
+                } catch {
+                    print("Error in video replacePaddingTokens: \(error)")
+                    throw error
+                }
             }
         }
 
 
@@ -531,18 +531,18 @@ public class Qwen2VLProcessor: UserInputProcessor {
     public func preprocess(images: [CIImage], processing: UserInput.Processing?) throws -> (
         MLXArray, THW
     ) {
-        // first apply the user requested resizing, etc. if any
+        // First apply the user requested resizing, etc. if any
         let images = images.map { MediaProcessing.apply($0, processing: processing) }
 
         // image_processing_qwen2_vl._preprocess
-
         let size = images[0].extent.size
         let (resizedHeight, resizedWidth) = try QwenVL.targetSize(
             height: Int(size.height), width: Int(size.width),
             factor: config.patchSize * config.mergeSize,
             minPixels: config.size.minPixels, maxPixels: config.size.maxPixels)
         let resizedSize = CGSize(width: resizedWidth, height: resizedHeight)
 
+        // Process images
         let processedImages =
             try images
             .map {
@@ -559,42 +559,79 @@ public class Qwen2VLProcessor: UserInputProcessor {
                 MediaProcessing.asMLXArray($0)
             }
 
+        // Spatial grid dimensions in patches; the temporal one is derived after padding
+        let gridH = resizedHeight / config.patchSize
+        let gridW = resizedWidth / config.patchSize
+
+        // Ensure dimensions are valid
+        guard
+            resizedHeight % config.patchSize == 0 && resizedWidth % config.patchSize == 0
+                && gridH % config.mergeSize == 0 && gridW % config.mergeSize == 0
+        else {
+            throw VLMError.imageProcessingFailure(
+                "Image dimensions must be divisible by patch size and merge size")
+        }
+
+        // Concatenate images and handle temporal patch size
         var patches = concatenated(processedImages)
+        let channel = patches.dim(1)
+
+        // Pad to match temporal patch size if needed
         let mod = patches.dim(0) % config.temporalPatchSize
         if mod != 0 {
             let lastPatch = patches[-1, .ellipsis]
             let lastPatchRepeated = tiled(
                 lastPatch, repetitions: [config.temporalPatchSize - mod, 1, 1, 1])
             patches = concatenated([patches, lastPatchRepeated])
         }
-        let channel = patches.dim(1)
-        let gridT = patches.dim(0) / self.config.temporalPatchSize
-        let gridH = resizedHeight / self.config.patchSize
-        let gridW = resizedWidth / self.config.patchSize
-
-        patches = patches.reshaped(
-            gridT,
-            config.temporalPatchSize,
-            channel,
-            gridH / config.mergeSize,
-            config.mergeSize,
-            config.patchSize,
-            gridW / config.mergeSize,
-            config.mergeSize,
-            config.patchSize
-        )
+
+        // Temporal grid size, measured after padding to a multiple of temporalPatchSize
+        let actualGridT = patches.dim(0) / config.temporalPatchSize
+
+        // `reshaped` is not a throwing call: a shape mismatch goes to MLX's own error
+        // handler (fatal by default), so a do/catch around it can never run. Validate
+        // the element count up front and report a recoverable, descriptive error.
+        let totalElements = patches.size
+        let expectedElements =
+            actualGridT * config.temporalPatchSize * channel * resizedHeight * resizedWidth
+        guard totalElements == expectedElements else {
+            throw VLMError.imageProcessingFailure(
+                "Failed to reshape patches: expected \(expectedElements) elements, "
+                    + "found \(totalElements). Patches shape: \(patches.shape), "
+                    + "Target shape: (\(actualGridT), \(config.temporalPatchSize), \(channel), "
+                    + "\(gridH / config.mergeSize), \(config.mergeSize), \(config.patchSize), "
+                    + "\(gridW / config.mergeSize), \(config.mergeSize), \(config.patchSize))"
+            )
+        }
+
+        // Split each frame group into mergeSize x mergeSize blocks of patchSize tiles
+        patches = patches.reshaped(
+            actualGridT,
+            config.temporalPatchSize,
+            channel,
+            gridH / config.mergeSize,
+            config.mergeSize,
+            config.patchSize,
+            gridW / config.mergeSize,
+            config.mergeSize,
+            config.patchSize
+        )
+
+        // Continue with transpose and final reshape
         patches = patches.transposed(0, 3, 6, 4, 7, 2, 1, 5, 8)
 
         let flattenedPatches = patches.reshaped(
-            gridT * gridH * gridW,
-            channel * config.temporalPatchSize * config.patchSize * config.patchSize
+            actualGridT * (gridH / config.mergeSize) * (gridW / config.mergeSize),
+            channel * config.temporalPatchSize * (config.mergeSize * config.patchSize)
+                * (config.mergeSize * config.patchSize)
         )
 
-        return (flattenedPatches, .init(gridT, gridH, gridW))
+        return (flattenedPatches, .init(actualGridT, gridH, gridW))
     }
 
     public func prepare(input: UserInput) async throws -> LMInput {
         let messages = input.prompt.asMessages()
+
         var promptTokens = try tokenizer.applyChatTemplate(messages: messages)
 
         // Text-only input
@@ -611,10 +648,16 @@ public class Qwen2VLProcessor: UserInputProcessor {
             let imagePixelsConcatenated = concatenated(imagePixelsAndFrames.map { $0.0 })
             processedImage = LMInput.ProcessedImage(
                 pixels: imagePixelsConcatenated, frames: imagePixelsAndFrames.map { $0.1 })
+
             if let imageFrames = processedImage?.frames {
-                promptTokens = try QwenVL.replacePaddingTokens(
-                    in: promptTokens, frames: imageFrames, paddingToken: "<|image_pad|>",
-                    mergeSize: config.mergeSize, tokenizer: tokenizer)
+                do {
+                    promptTokens = try QwenVL.replacePaddingTokens(
+                        in: promptTokens, frames: imageFrames, paddingToken: "<|image_pad|>",
+                        mergeSize: config.mergeSize, tokenizer: tokenizer)
+                } catch {
+                    print("Error in replacePaddingTokens: \(error)")
+                    throw error
+                }
             }
         }
 
@@ -635,10 +678,16 @@ public class Qwen2VLProcessor: UserInputProcessor {
             let videoPixelsConcatenated = concatenated(videoPixelsAndFrames.map { $0.0 })
             processedVideo = LMInput.ProcessedVideo(
                 pixels: videoPixelsConcatenated, frames: videoPixelsAndFrames.map { $0.1 })
+
             if let videoFrames = processedVideo?.frames {
-                promptTokens = try QwenVL.replacePaddingTokens(
-                    in: promptTokens, frames: videoFrames, paddingToken: "<|video_pad|>",
-                    mergeSize: config.mergeSize, tokenizer: tokenizer)
+                do {
+                    promptTokens = try QwenVL.replacePaddingTokens(
+                        in: promptTokens, frames: videoFrames, paddingToken: "<|video_pad|>",
+                        mergeSize: config.mergeSize, tokenizer: tokenizer)
+                } catch {
+                    print("Error in video replacePaddingTokens: \(error)")
+                    throw error
+                }
             }
         }