@@ -9,16 +9,6 @@ import MLXLMCommon
import MLXNN
import Tokenizers

-// MARK: - Common
-
-/// Rotates half the hidden dims of the input
-private func rotateHalf(_ x: MLXArray) -> MLXArray {
-    let index = x.dim(-1) / 2
-    let x1 = x[.ellipsis, 0 ..< index]
-    let x2 = x[.ellipsis, index...]
-    return concatenated([-x2, x1], axis: -1)
-}
-
// MARK: - Language

private enum Language {
@@ -45,8 +35,8 @@ private enum Language {
        )[0..., .newAxis, 0..., 0...]

        // Apply rotary embedding
-        let qEmbed = (q * cos) + (rotateHalf(q) * sin)
-        let kEmbed = (k * cos) + (rotateHalf(k) * sin)
+        let qEmbed = (q * cos) + (QwenVL.rotateHalf(q) * sin)
+        let kEmbed = (k * cos) + (QwenVL.rotateHalf(k) * sin)
        return (qEmbed, kEmbed)
    }

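For reference, a minimal sketch of the shared helper these call sites now resolve to. The QwenVL container and its location are assumptions (the common file is not part of this diff); the body is assumed to mirror the rotateHalf free function removed above.

import MLX

// Hypothetical shared namespace; only the part needed by the call sites above.
enum QwenVL {
    /// Rotates half the hidden dims of the input (body copied from the removed helper).
    static func rotateHalf(_ x: MLXArray) -> MLXArray {
        let index = x.dim(-1) / 2
        let x1 = x[.ellipsis, 0 ..< index]
        let x2 = x[.ellipsis, index...]
        return concatenated([-x2, x1], axis: -1)
    }
}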
@@ -264,64 +254,10 @@ private enum Vision {
        sin = tiled(sin, repetitions: [1, 1, 2])
        sin = expandedDimensions(sin, axis: 0)

-        let output = (tensor * cos) + (rotateHalf(tensor) * sin)
+        let output = (tensor * cos) + (QwenVL.rotateHalf(tensor) * sin)
        return output.asType(tensor.dtype)
    }

-    fileprivate class VisionRotaryEmbedding {
-        let dimensions: Int
-        let theta: Float
-        let inverseFreq: MLXArray
-
-        init(dimensions: Int, theta: Float) {
-            self.dimensions = dimensions
-            self.theta = theta
-            let p = MLXArray(stride(from: 0, to: dimensions, by: 2)).asType(.float32) / dimensions
-            self.inverseFreq = 1.0 / pow(theta, p)
-        }
-
-        func callAsFunction(sequenceLength: Int) -> MLXArray {
-            let seq = MLXArray(0 ..< sequenceLength).asType(inverseFreq.dtype)
-            let freqs = outer(seq, inverseFreq)
-            return freqs
-        }
-    }
-
-    fileprivate class PatchEmbed: Module, UnaryLayer {
-        @ModuleInfo var proj: Conv3d
-
-        let patchSize: Int
-        let temporalPatchSize: Int
-        let inChannels: Int
-        let hiddenSize: Int
-
-        init(patchSize: Int, temporalPatchSize: Int, inChannels: Int, hiddenSize: Int) {
-            self.patchSize = patchSize
-            self.temporalPatchSize = temporalPatchSize
-            self.inChannels = inChannels
-            self.hiddenSize = hiddenSize
-
-            let kernelSize = IntOrTriple([temporalPatchSize, patchSize, patchSize])
-            self._proj.wrappedValue = Conv3d(
-                inputChannels: inChannels,
-                outputChannels: hiddenSize,
-                kernelSize: kernelSize,
-                stride: kernelSize,
-                bias: false
-            )
-        }
-
-        func callAsFunction(_ hiddenStates: MLXArray) -> MLXArray {
-            var hiddenStates = hiddenStates.reshaped(
-                -1, inChannels, temporalPatchSize, patchSize, patchSize
-            ).movedAxis(source: 1, destination: 4)
-
-            hiddenStates = proj(hiddenStates)
-            hiddenStates = hiddenStates.reshaped(-1, hiddenSize)
-            return hiddenStates
-        }
-    }
-
    fileprivate class PatchMerger: Module, UnaryLayer {
        let hiddenSize: Int
        @ModuleInfo(key: "ln_q") var layerNormQ: RMSNorm
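The two vision helpers removed here presumably move into the same shared QwenVL container referenced by the later hunks. A sketch of the rotary-embedding class, assuming its body is unchanged from the code removed above (PatchEmbed would move the same way); the usage lines at the end use illustrative values, not values taken from this diff.

import MLX

extension QwenVL {  // extends the hypothetical shared enum sketched earlier
    class VisionRotaryEmbedding {
        let dimensions: Int
        let theta: Float
        let inverseFreq: MLXArray

        init(dimensions: Int, theta: Float) {
            self.dimensions = dimensions
            self.theta = theta
            // Inverse frequencies for every second dimension: 1 / theta^(2i / dimensions)
            let p = MLXArray(stride(from: 0, to: dimensions, by: 2)).asType(.float32) / dimensions
            self.inverseFreq = 1.0 / pow(theta, p)
        }

        // Outer product of patch positions and inverse frequencies,
        // shape [sequenceLength, dimensions / 2].
        func callAsFunction(sequenceLength: Int) -> MLXArray {
            let seq = MLXArray(0 ..< sequenceLength).asType(inverseFreq.dtype)
            return outer(seq, inverseFreq)
        }
    }
}

// Illustrative only: dimensions is headDimensions / 2, as in the VisionModel init later in this diff.
let rope = QwenVL.VisionRotaryEmbedding(dimensions: 40, theta: 10_000)
let freqs = rope(sequenceLength: 4)  // shape [4, 20]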
@@ -457,8 +393,8 @@ private enum Vision {

    fileprivate class VisionModel: Module {

-        @ModuleInfo(key: "patch_embed") var patchEmbed: PatchEmbed
-        @ModuleInfo(key: "rotary_pos_emb") var rotaryPositionEmbedding: VisionRotaryEmbedding
+        @ModuleInfo(key: "patch_embed") var patchEmbed: QwenVL.PatchEmbed
+        @ModuleInfo(key: "rotary_pos_emb") var rotaryPositionEmbedding: QwenVL.VisionRotaryEmbedding
        @ModuleInfo(key: "blocks") var blocks: [Qwen25VLVisionBlock]
        @ModuleInfo(key: "merger") var patchMerger: PatchMerger

@@ -475,14 +411,14 @@
            self.spatialMergeUnit = config.spatialMergeSize * config.spatialMergeSize
            self.fullattBlockIndexes = config.fullattBlockIndexes

-            self._patchEmbed.wrappedValue = PatchEmbed(
+            self._patchEmbed.wrappedValue = QwenVL.PatchEmbed(
                patchSize: config.patchSize,
                temporalPatchSize: config.temporalPatchSize,
                inChannels: config.inChannels,
                hiddenSize: config.hiddenSize)

            let headDimensions = config.hiddenSize / config.numHeads
-            self._rotaryPositionEmbedding.wrappedValue = VisionRotaryEmbedding(
+            self._rotaryPositionEmbedding.wrappedValue = QwenVL.VisionRotaryEmbedding(
                dimensions: headDimensions / 2, theta: 10_000)

            self._blocks.wrappedValue = (0 ..< config.depth).map { _ in
@@ -729,38 +665,6 @@ public class Qwen25VLProcessor: UserInputProcessor {
        self.tokenizer = tokenizer
    }

-    // image_processing_qwen2_vl.smart_resize
-    private func targetSize(height: Int, width: Int, factor: Int, minPixels: Int, maxPixels: Int)
-        throws -> (Int, Int)
-    {
-        if height < factor {
-            throw VLMError.imageProcessingFailure(
-                "height: \(height) must be larger than factor: \(factor)")
-        }
-        if width < factor {
-            throw VLMError.imageProcessingFailure(
-                "width: \(width) must be larger than factor: \(factor)")
-        }
-        if max(height, width) / min(height, width) > 200 {
-            throw VLMError.imageProcessingFailure(
-                "absolute aspect ratio must be smaller than 200: \(width) x \(height)")
-        }
-
-        var hBar = max(factor, Int(round(Float(height) / Float(factor))) * factor)
-        var wBar = max(factor, Int(round(Float(width) / Float(factor))) * factor)
-
-        if hBar * wBar > maxPixels {
-            let beta = sqrt(Float(height * width) / Float(maxPixels))
-            hBar = Int(floor(Float(height) / beta / Float(factor))) * factor
-            wBar = Int(floor(Float(width) / beta / Float(factor))) * factor
-        } else if hBar * wBar < minPixels {
-            let beta = sqrt(Float(minPixels) / Float(height * width))
-            hBar = Int(ceil(Float(height) * beta / Float(factor))) * factor
-            wBar = Int(ceil(Float(width) * beta / Float(factor))) * factor
-        }
-        return (hBar, wBar)
-    }
-
    public func preprocess(images: [CIImage], processing: UserInput.Processing?) throws -> (
        MLXArray, THW
    ) {
@@ -770,7 +674,7 @@ public class Qwen25VLProcessor: UserInputProcessor {
        // image_processing_qwen2_vl._preprocess

        let size = images[0].extent.size
-        let (resizedHeight, resizedWidth) = try targetSize(
+        let (resizedHeight, resizedWidth) = try QwenVL.targetSize(
            height: Int(size.height), width: Int(size.width),
            factor: config.patchSize * config.mergeSize,
            minPixels: config.size.minPixels, maxPixels: config.size.maxPixels)
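The shared QwenVL.targetSize presumably keeps the smart_resize behaviour of the method removed earlier: snap height and width to multiples of factor = patchSize * mergeSize, then rescale if the resulting area falls outside [minPixels, maxPixels]. A hypothetical call with concrete numbers (factor 28 corresponds to the common Qwen2.5-VL defaults patchSize = 14, mergeSize = 2; the pixel bounds below are likewise assumed values, not taken from this diff):

// 700 x 1000 image, factor 28
let (resizedHeight, resizedWidth) = try QwenVL.targetSize(
    height: 700, width: 1000, factor: 28,
    minPixels: 3136, maxPixels: 1_003_520)
// round(700 / 28) * 28 = 700 and round(1000 / 28) * 28 = 1008; 700 * 1008 pixels
// already lies inside [minPixels, maxPixels], so the result is (700, 1008).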
@@ -845,8 +749,9 @@ public class Qwen25VLProcessor: UserInputProcessor {
            processedImage = LMInput.ProcessedImage(
                pixels: imagePixelsConcatenated, frames: imagePixelsAndFrames.map { $0.1 })
            if let imageFrames = processedImage?.frames {
-                promptTokens = try replacePaddingTokens(
-                    in: promptTokens, frames: imageFrames, paddingToken: "<|image_pad|>")
+                promptTokens = try QwenVL.replacePaddingTokens(
+                    in: promptTokens, frames: imageFrames, paddingToken: "<|image_pad|>",
+                    mergeSize: config.mergeSize, tokenizer: tokenizer)
            }
        }

@@ -868,8 +773,9 @@ public class Qwen25VLProcessor: UserInputProcessor {
            processedVideo = LMInput.ProcessedVideo(
                pixels: videoPixelsConcatenated, frames: videoPixelsAndFrames.map { $0.1 })
            if let videoFrames = processedVideo?.frames {
-                promptTokens = try replacePaddingTokens(
-                    in: promptTokens, frames: videoFrames, paddingToken: "<|video_pad|>")
+                promptTokens = try QwenVL.replacePaddingTokens(
+                    in: promptTokens, frames: videoFrames, paddingToken: "<|video_pad|>",
+                    mergeSize: config.mergeSize, tokenizer: tokenizer)
            }
        }

@@ -880,42 +786,6 @@ public class Qwen25VLProcessor: UserInputProcessor {
            image: processedImage,
            video: processedVideo)
    }
-
-    func replacePaddingTokens(in promptTokens: [Int], frames: [THW], paddingToken: String)
-        throws -> [Int]
-    {
-        // Replace single padding token with correct number for each image or video frame
-        let placeholderTokens = try tokenizer.encode(
-            text: "<|vision_start|>\(paddingToken)<|vision_end|>")
-        let placeholderRanges = promptTokens.ranges(of: placeholderTokens)
-        guard placeholderRanges.count == frames.count else {
-            throw VLMError.processing(
-                "Number of placeholder tokens does not match number of frames")
-        }
-        let mergeLength = config.mergeSize * config.mergeSize
-        let replacementSequences = try frames.map { frame in
-            let paddingCount = frame.product / mergeLength
-            return try tokenizer.encode(
-                text:
-                    "<|vision_start|>\(Array(repeating: paddingToken, count: paddingCount).joined())<|vision_end|>"
-            )
-        }
-        // Build the final array
-        var result: [Int] = []
-        var currentIndex = promptTokens.startIndex
-        for (range, replacement) in zip(placeholderRanges, replacementSequences) {
-            // Add tokens before the placeholder
-            result.append(contentsOf: promptTokens[currentIndex ..< range.lowerBound])
-            // Add replacement sequence
-            result.append(contentsOf: replacement)
-            currentIndex = range.upperBound
-        }
-        // Add any remaining tokens after the last replacement
-        if currentIndex < promptTokens.endIndex {
-            result.append(contentsOf: promptTokens[currentIndex...])
-        }
-        return result
-    }
}

// MARK: - Model
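The replacement helper now takes mergeSize and a tokenizer explicitly (see the call sites above), since the shared implementation can no longer read them from this processor's config and tokenizer properties. The padding arithmetic itself appears unchanged; a small illustrative check of the count it produces per frame:

// Hypothetical frame grid: 1 temporal x 36 x 36 patches, mergeSize 2.
let frame = THW(1, 36, 36)
let mergeSize = 2
let paddingCount = frame.product / (mergeSize * mergeSize)
// 1 * 36 * 36 / 4 = 324 <|image_pad|> tokens emitted between <|vision_start|> and <|vision_end|>.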
@@ -961,37 +831,10 @@ public class Qwen25VL: Module, VLMModel, KVCacheDimensionProvider {
        }

        // Insert special image tokens in the input_ids
-        return mergeInputIdsWithImageFeatures(
-            inputIds: inputIds, inputEmbeds: inputEmbeds, imageFeatures: hiddenStates)
-    }
-
-    private func mergeInputIdsWithImageFeatures(
-        inputIds: MLXArray, inputEmbeds: MLXArray, imageFeatures: MLXArray
-    ) -> MLXArray {
-        let imageTokenId = config.baseConfiguration.imageTokenId
-        let videoTokenId = config.baseConfiguration.videoTokenId
-
-        var imageIndices = [Int]()
-        for (i, v) in inputIds.asArray(Int.self).enumerated() {
-            if v == imageTokenId || v == videoTokenId {
-                imageIndices.append(i)
-            }
-        }
-
-        // Make sure shapes match before assignment
-        var result = inputEmbeds
-        if result.ndim == 2 {
-            result = result[.newAxis, 0..., 0...]
-        }
-
-        if imageFeatures.ndim == 2 {
-            let reshapedFeatures = imageFeatures[.newAxis, 0..., 0...]
-            result[0..., MLXArray(imageIndices), 0...] = reshapedFeatures
-        } else {
-            result[0..., MLXArray(imageIndices), 0...] = imageFeatures
-        }
-
-        return result
+        return QwenVL.mergeInputIdsWithImageFeatures(
+            inputIds: inputIds, inputEmbeds: inputEmbeds, imageFeatures: hiddenStates,
+            imageTokenId: config.baseConfiguration.imageTokenId,
+            videoTokenId: config.baseConfiguration.videoTokenId)
    }

    public func prepare(_ input: LMInput, cache: [any KVCache], windowSize: Int?) throws
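Similarly, the merge helper now receives the image and video token ids as parameters instead of reading them from this model's config. A sketch of the shared function the new call resolves to; the signature is inferred from the call site and the body is assumed to match the private method removed above.

import MLX

extension QwenVL {
    static func mergeInputIdsWithImageFeatures(
        inputIds: MLXArray, inputEmbeds: MLXArray, imageFeatures: MLXArray,
        imageTokenId: Int, videoTokenId: Int
    ) -> MLXArray {
        // Positions of <|image_pad|> / <|video_pad|> tokens in the prompt
        var imageIndices = [Int]()
        for (i, v) in inputIds.asArray(Int.self).enumerated() {
            if v == imageTokenId || v == videoTokenId {
                imageIndices.append(i)
            }
        }

        // Make sure shapes match before assignment
        var result = inputEmbeds
        if result.ndim == 2 {
            result = result[.newAxis, 0..., 0...]
        }

        // Scatter the vision features into the embedding rows of the pad tokens
        if imageFeatures.ndim == 2 {
            result[0..., MLXArray(imageIndices), 0...] = imageFeatures[.newAxis, 0..., 0...]
        } else {
            result[0..., MLXArray(imageIndices), 0...] = imageFeatures
        }

        return result
    }
}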