Add Qwen 2.5 VL #222

Merged · 21 commits · Apr 14, 2025
Changes from 16 commits
4 changes: 2 additions & 2 deletions Applications/VLMEval/ContentView.swift
@@ -322,7 +322,7 @@ class VLMEvaluator {

/// This controls which model loads. `qwen2VL2BInstruct4Bit` is one of the smaller ones, so this will fit on
/// more devices.
-    let modelConfiguration = ModelRegistry.qwen2VL2BInstruct4Bit
+    let modelConfiguration = ModelRegistry.qwen2_5VL3BInstruct4Bit

/// parameters controlling the output
let generateParameters = MLXLMCommon.GenerateParameters(temperature: 0.6)
@@ -421,7 +421,7 @@ class VLMEvaluator {
]
}
var userInput = UserInput(messages: messages, images: images, videos: videos)
-        userInput.processing.resize = .init(width: 448, height: 448)
+        userInput.processing.resize = .init(width: 1344, height: 1344)
Collaborator:

Two thoughts:

  • Maybe the default processing should be defined on line 326, next to the ModelConfiguration -- I think they go together.
  • Should the VLMs provide defaults inside the UserInputProcessor? For example, if the Processing is not specified, they could provide a default.

I think maybe the latter, as it is closer to the VLM and better suited to know what size inputs the model needs.

Collaborator:

And indeed the size may also depend on the config, so that is a second point in favor of the second approach; a minimal sketch follows.
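
A minimal sketch of that second approach, with hypothetical names: `defaultProcessing` is not part of the current `UserInputProcessor` API, and this assumes `Processing.resize` is optional.

```swift
import CoreGraphics
import MLXLMCommon

// Sketch only: a hypothetical way for each VLM processor to supply the
// defaults it prefers; `defaultProcessing` is not part of the current API.
protocol ProvidesDefaultProcessing {
    /// The processing this model prefers when the caller specifies none.
    var defaultProcessing: UserInput.Processing { get }
}

extension ProvidesDefaultProcessing {
    /// Caller-supplied values win; unspecified fields fall back to the
    /// model's defaults (which could in turn be derived from the config).
    func resolve(_ processing: UserInput.Processing) -> UserInput.Processing {
        var resolved = processing
        if resolved.resize == nil {
            resolved.resize = defaultProcessing.resize
        }
        return resolved
    }
}
```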

let input = try await context.processor.prepare(input: userInput)
return try MLXLMCommon.generate(
input: input,
68 changes: 41 additions & 27 deletions Libraries/MLXVLM/MediaProcessing.swift
@@ -15,7 +15,7 @@ private let context = CIContext()
/// var image: CIImage
/// image = MediaProcessing.inSRGBToneCurveSpace(image)
///
-/// // apply user instructions
+/// // Apply user instructions
/// image = MediaProcessing.apply(image, processing: processing)
///
/// image = MediaProcessing.resampleBicubic(image, to: config.size.cgSize)
@@ -58,33 +58,43 @@ public enum MediaProcessing {
        min(other.width / size.width, other.height / size.height)
    }

-    /// Resample the image using bicubic interpolation.
-    static public func resampleBicubic(_ image: CIImage, to size: CGSize) -> CIImage {
-        let filter = CIFilter.bicubicScaleTransform()
-        let extent = image.extent.size
-
-        filter.inputImage = image
-
-        // set the aspect ratio to match the aspect ratio of the target
-        let inputAspectRatio = extent.width / extent.height
-        let desiredAspectRatio = size.width / size.height
-        filter.aspectRatio = Float(1 / inputAspectRatio * desiredAspectRatio)
-
-        // that image is now the aspect ratio of the target and the size
-        // of the shorter dimension
-        let scale: CGFloat
-        if extent.width < extent.height {
-            scale = size.width / extent.width
-        } else {
-            scale = size.height / extent.height
-        }
-        filter.scale = Float(scale)
-
-        let rescaled = filter.outputImage!
-
-        // the image has a DoD larger than the requested size so crop
-        // it to the desired size
-        return rescaled.cropped(to: CGRect(origin: .zero, size: size))
+    enum MediaProcessingError: LocalizedError {
+        case transformFailed
+
+        var errorDescription: String? {
+            switch self {
+            case .transformFailed: "Failed to transform image"
+            }
+        }
+    }
+
+    /// Resample the image using bicubic interpolation.
+    /// - Parameters:
+    ///   - image: The image to resample
+    ///   - size: The target size
+    /// - Returns: The resampled image
+    public static func resampleBicubic(_ image: CIImage, to size: CGSize) throws -> CIImage {
+        // Create a bicubic scale filter
+
+        let yScale = size.height / image.extent.height
+        let xScale = size.width / image.extent.width
+
+        let filter = CIFilter.bicubicScaleTransform()
+        filter.inputImage = image
+        filter.scale = Float(yScale)
+        filter.aspectRatio = Float(xScale / yScale)
+        guard let scaledImage = filter.outputImage else {
Collaborator:

I would probably use `!` here too -- it will be nil if we have an invalid scale. Perhaps these should throw so we can indicate failure rather than having fallbacks like this.
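
For illustration, a hypothetical call site once `resampleBicubic` is marked `throws` (not part of this diff):

```swift
import CoreImage
import MLXVLM

// Hypothetical caller, illustration only: with `throws`, a failed
// transform can be surfaced instead of crashing on a force-unwrap.
let image = CIImage(color: CIColor(red: 0.5, green: 0.5, blue: 0.5))
    .cropped(to: CGRect(x: 0, y: 0, width: 640, height: 480))
do {
    let resized = try MediaProcessing.resampleBicubic(
        image, to: CGSize(width: 1344, height: 1344))
    print("resampled to \(resized.extent.size)")
} catch {
    print("resample failed: \(error.localizedDescription)")
}
```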

+            throw MediaProcessingError.transformFailed
+        }
+        // Create a rect with the exact dimensions we want
+        let exactRect = CGRect(
+            x: 0,
+            y: 0,
+            width: size.width,
+            height: size.height
+        )
+        // Crop to ensure exact dimensions
+        return scaledImage.cropped(to: exactRect)
    }

/// Normalize the image using the given mean and standard deviation parameters.
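
As an aside, a small sketch of why the `scale` and `aspectRatio` chosen above land exactly on the target size, assuming CIBicubicScaleTransform's documented behavior (output height = height * `scale`, output width = width * `scale` * `aspectRatio`); the numbers are illustrative:

```swift
import CoreGraphics

// Illustrative numbers only. The filter scales height by `scale` and
// width by `scale * aspectRatio`, so scale = yScale and
// aspectRatio = xScale / yScale hit the target exactly.
let source = CGSize(width: 640, height: 480)
let target = CGSize(width: 1344, height: 1344)

let yScale = target.height / source.height      // 2.8
let xScale = target.width / source.width        // 2.1
let aspectRatio = xScale / yScale               // 0.75

let outHeight = source.height * yScale                // 480 * 2.8 = 1344
let outWidth = source.width * yScale * aspectRatio    // 640 * 2.1 = 1344
print(outWidth, outHeight)                            // 1344.0 1344.0
```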
@@ -94,7 +104,7 @@ public enum MediaProcessing {
let filter = CIFilter.colorMatrix()
filter.inputImage = image

-        // this should match
+        // This should match
// https://pytorch.org/vision/main/generated/torchvision.transforms.Normalize.html
//
// output[channel] = (input[channel] - mean[channel]) / std[channel]
@@ -113,6 +123,10 @@
}

/// Convert the CIImage into a planar 3 channel MLXArray `[1, C, H, W]`
+    /// - Parameters:
+    ///   - image: The image to convert
+    ///   - colorSpace: Optional color space for rendering
+    /// - Returns: The MLXArray representation of the image
static public func asMLXArray(_ image: CIImage, colorSpace: CGColorSpace? = nil) -> MLXArray {
let size = image.extent.size
let w = Int(size.width.rounded())
@@ -135,10 +149,10 @@

var array = MLXArray(data, [h, w, 4], type: Float32.self)

-        // drop 4th channel
+        // Drop 4th channel
array = array[0..., 0..., ..<3]

-        // convert to 1, C, H, W
+        // Convert to 1, C, H, W
array = array.reshaped(1, h, w, 3).transposed(0, 3, 1, 2)

return array
2 changes: 1 addition & 1 deletion Libraries/MLXVLM/Models/Idefics3.swift
@@ -837,7 +837,7 @@ public class Idefics3Processor: UserInputProcessor {
height: fixedImageSize
)
image = MediaProcessing.apply(image, processing: input.processing)
-        image = MediaProcessing.resampleBicubic(image, to: targetSize)
+        image = try MediaProcessing.resampleBicubic(image, to: targetSize)
image = MediaProcessing.normalize(
image,
mean: config.imageMeanTuple,
8 changes: 4 additions & 4 deletions Libraries/MLXVLM/Models/Paligemma.swift
@@ -441,7 +441,7 @@ private enum Vision {
/// PaliGemma VLM `UserInputProcessor`.
///
/// This is meant to be used with ``PaliGemma`` and is typically created by ``VLMModelFactory``.
-public class PaligGemmaProcessor: UserInputProcessor {
+public class PaliGemmaProcessor: UserInputProcessor {
Collaborator:

Whoops


private let config: PaliGemmaProcessorConfiguration
private let tokenizer: any Tokenizer
@@ -451,7 +451,7 @@ public class PaligGemmaProcessor: UserInputProcessor {
self.tokenizer = tokenizer
}

-    private func prepare(image: CIImage, processing: UserInput.Processing?) -> MLXArray {
+    private func prepare(image: CIImage, processing: UserInput.Processing?) throws -> MLXArray {
// based on image_processing_siglip from transformers
var image = image

@@ -463,7 +463,7 @@
// apply user instructions
image = MediaProcessing.apply(image, processing: processing)

-        image = MediaProcessing.resampleBicubic(image, to: config.size.cgSize)
+        image = try MediaProcessing.resampleBicubic(image, to: config.size.cgSize)
image = MediaProcessing.normalize(
image, mean: config.imageMeanTuple, std: config.imageStdTuple)

@@ -705,7 +705,7 @@ public struct PaliGemmaConfiguration: Codable, Sendable {
}
}

-/// Configuration for ``PaligGemmaProcessor``
+/// Configuration for ``PaliGemmaProcessor``
public struct PaliGemmaProcessorConfiguration: Codable, Sendable {

public struct Size: Codable, Sendable {