Skip to content

Add Qwen 2.5 VL #222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Apr 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 52 additions & 49 deletions Libraries/MLXVLM/MediaProcessing.swift
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ public struct ProcessedFrames {
let totalDuration: CMTime
}

// TODO: verify working color space, rendering color space
private let context = CIContext()

/// Collection of methods for processing media (images, video, etc.).
Expand All @@ -27,7 +26,7 @@ private let context = CIContext()
/// var image: CIImage
/// image = MediaProcessing.inSRGBToneCurveSpace(image)
///
/// // apply user instructions
/// // Apply user instructions
/// image = MediaProcessing.apply(image, processing: processing)
///
/// image = MediaProcessing.resampleBicubic(image, to: config.size.cgSize)
Expand Down Expand Up @@ -76,58 +75,58 @@ public enum MediaProcessing {
return Float(1 / inputAspectRatio * desiredAspectRatio)
}

/// Resample the image to exactly `size` using bicubic interpolation.
///
/// The image is stretched independently along each axis so that its extent
/// matches `size`; the aspect ratio of the input is not preserved.
///
/// - Parameters:
///   - image: The image to resample.
///   - size: The target size.
/// - Returns: The resampled image, cropped to a `size` rectangle at the origin.
public static func resampleBicubic(_ image: CIImage, to size: CGSize) -> CIImage {
    let extent = image.extent.size

    // `scale` is applied to both axes and `aspectRatio` is an additional
    // horizontal factor, so the filter output is
    //   width  = extent.width  * scale * aspectRatio
    //   height = extent.height * scale
    // Scaling by the height ratio and correcting the width through the
    // aspect ratio lands on `size` regardless of whether the input is
    // portrait or landscape. (Picking the width ratio for portrait inputs,
    // as before, produced an image of the wrong size prior to the crop.)
    let yScale = size.height / extent.height
    let xScale = size.width / extent.width

    let filter = CIFilter.bicubicScaleTransform()
    filter.inputImage = image
    filter.scale = Float(yScale)
    filter.aspectRatio = Float(xScale / yScale)

    // Force-unwrap kept from the original; `inputImage` is set above.
    let rescaled = filter.outputImage!

    // The filter output can have a domain of definition larger than the
    // requested size, so crop it to the exact target rectangle.
    return rescaled.cropped(to: CGRect(origin: .zero, size: size))
}

/// Resample the image using Lanczos interpolation.
static public func resampleLanczos(_ image: CIImage, to size: CGSize) -> CIImage {
let filter = CIFilter.lanczosScaleTransform()
let extent = image.extent.size
// Create a Lanczos scale filter

let yScale = size.height / image.extent.height
let xScale = size.width / image.extent.width

let filter = CIFilter.lanczosScaleTransform()
filter.inputImage = image
filter.scale = Float(yScale)
filter.aspectRatio = Float(xScale / yScale)
let scaledImage = filter.outputImage!

// Create a rect with the exact dimensions we want
let exactRect = CGRect(
x: 0,
y: 0,
width: size.width,
height: size.height
)

// set the aspect ratio to match the aspect ratio of the target
filter.aspectRatio = aspectRatioForResample(image, size: size)
// Crop to ensure exact dimensions
return scaledImage.cropped(to: exactRect)
}

// that image is now the aspect ratio of the target and the size
// of the shorter dimension
let scale: CGFloat
if extent.width < extent.height {
scale = size.width / extent.width
} else {
scale = size.height / extent.height
}
filter.scale = Float(scale)
/// Resample the image using bicubic interpolation.
/// - Parameters:
/// - image: The image to resample
/// - size: The target size
/// - Returns: The resampled image
public static func resampleBicubic(_ image: CIImage, to size: CGSize) -> CIImage {
// Create a bicubic scale filter

let yScale = size.height / image.extent.height
let xScale = size.width / image.extent.width

let rescaled = filter.outputImage!
let filter = CIFilter.bicubicScaleTransform()
filter.inputImage = image
filter.scale = Float(yScale)
filter.aspectRatio = Float(xScale / yScale)
let scaledImage = filter.outputImage!

// Create a rect with the exact dimensions we want
let exactRect = CGRect(
x: 0,
y: 0,
width: size.width,
height: size.height
)

// the image has a DoD larger than the requested size so crop
// it to the desired size
return rescaled.cropped(to: CGRect(origin: .zero, size: size))
// Crop to ensure exact dimensions
return scaledImage.cropped(to: exactRect)
}

/// Normalize the image using the given mean and standard deviation parameters.
Expand All @@ -137,7 +136,7 @@ public enum MediaProcessing {
let filter = CIFilter.colorMatrix()
filter.inputImage = image

// this should match
// This should match
// https://pytorch.org/vision/main/generated/torchvision.transforms.Normalize.html
//
// output[channel] = (input[channel] - mean[channel]) / std[channel]
Expand All @@ -156,6 +155,10 @@ public enum MediaProcessing {
}

/// Convert the CIImage into a planar 3 channel MLXArray `[1, C, H, W]`
/// - Parameters:
/// - image: The image to convert
/// - colorSpace: Optional color space for rendering
/// - Returns: The MLXArray representation of the image
public static func asMLXArray(_ image: CIImage, colorSpace: CGColorSpace? = nil) -> MLXArray {
let size = image.extent.size
let w = Int(size.width.rounded())
Expand All @@ -178,10 +181,10 @@ public enum MediaProcessing {

var array = MLXArray(data, [h, w, 4], type: Float32.self)

// drop 4th channel
// Drop 4th channel
array = array[0..., 0..., ..<3]

// convert to 1, C, H, W
// Convert to 1, C, H, W
array = array.reshaped(1, h, w, 3).transposed(0, 3, 1, 2)

return array
Expand Down
2 changes: 1 addition & 1 deletion Libraries/MLXVLM/Models/Idefics3.swift
Original file line number Diff line number Diff line change
Expand Up @@ -851,7 +851,7 @@ public class Idefics3Processor: UserInputProcessor {
height: fixedImageSize
)
image = MediaProcessing.apply(image, processing: input.processing)
image = MediaProcessing.resampleBicubic(image, to: targetSize)
image = try MediaProcessing.resampleBicubic(image, to: targetSize)
image = MediaProcessing.normalize(
image,
mean: config.imageMeanTuple,
Expand Down
8 changes: 4 additions & 4 deletions Libraries/MLXVLM/Models/Paligemma.swift
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ private enum Vision {
/// PaliGemma VLM `UserInputProcessor`.
///
/// This is meant to be used with ``PaliGemma`` and is typically created by ``VLMModelFactory``.
public class PaligGemmaProcessor: UserInputProcessor {
public class PaliGemmaProcessor: UserInputProcessor {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoops


private let config: PaliGemmaProcessorConfiguration
private let tokenizer: any Tokenizer
Expand All @@ -451,7 +451,7 @@ public class PaligGemmaProcessor: UserInputProcessor {
self.tokenizer = tokenizer
}

private func prepare(image: CIImage, processing: UserInput.Processing?) -> MLXArray {
private func prepare(image: CIImage, processing: UserInput.Processing?) throws -> MLXArray {
// based on image_processing_siglip from transformers
var image = image

Expand All @@ -463,7 +463,7 @@ public class PaligGemmaProcessor: UserInputProcessor {
// apply user instructions
image = MediaProcessing.apply(image, processing: processing)

image = MediaProcessing.resampleBicubic(image, to: config.size.cgSize)
image = try MediaProcessing.resampleBicubic(image, to: config.size.cgSize)
image = MediaProcessing.normalize(
image, mean: config.imageMeanTuple, std: config.imageStdTuple)

Expand Down Expand Up @@ -705,7 +705,7 @@ public struct PaliGemmaConfiguration: Codable, Sendable {
}
}

/// Configuration for ``PaligGemmaProcessor``
/// Configuration for ``PaliGemmaProcessor``
public struct PaliGemmaProcessorConfiguration: Codable, Sendable {

public struct Size: Codable, Sendable {
Expand Down
Loading