-
Notifications
You must be signed in to change notification settings - Fork 240
Add Qwen 2.5 VL #222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Qwen 2.5 VL #222
Changes from 16 commits
a6f552b
b6532fa
e2ef119
788fffa
9356549
6a917fe
008d804
362939f
19e2aa8
4add690
aa59c6e
7124499
d60188a
d37f85b
20ac074
ee978b0
5261072
fbc88c0
e70190a
97f6ba1
184d0b9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,7 @@ private let context = CIContext() | |
/// var image: CIImage | ||
/// image = MediaProcessing.inSRGBToneCurveSpace(image) | ||
/// | ||
/// // apply user instructions | ||
/// // Apply user instructions | ||
/// image = MediaProcessing.apply(image, processing: processing) | ||
/// | ||
/// image = MediaProcessing.resampleBicubic(image, to: config.size.cgSize) | ||
|
@@ -58,33 +58,43 @@ public enum MediaProcessing { | |
min(other.width / size.width, other.height / size.height) | ||
} | ||
|
||
/// Resample the image using bicubic interpolation. | ||
static public func resampleBicubic(_ image: CIImage, to size: CGSize) -> CIImage { | ||
let filter = CIFilter.bicubicScaleTransform() | ||
let extent = image.extent.size | ||
|
||
filter.inputImage = image | ||
enum MediaProcessingError: LocalizedError { | ||
case transformFailed | ||
|
||
// set the aspect ratio to match the aspect ratio of the target | ||
let inputAspectRatio = extent.width / extent.height | ||
let desiredAspectRatio = size.width / size.height | ||
filter.aspectRatio = Float(1 / inputAspectRatio * desiredAspectRatio) | ||
|
||
// that image is now the aspect ratio of the target and the size | ||
// of the shorter dimension | ||
let scale: CGFloat | ||
if extent.width < extent.height { | ||
scale = size.width / extent.width | ||
} else { | ||
scale = size.height / extent.height | ||
var errorDescription: String? { | ||
switch self { | ||
case .transformFailed: "Failed to transform image" | ||
} | ||
} | ||
filter.scale = Float(scale) | ||
} | ||
|
||
let rescaled = filter.outputImage! | ||
/// Resample the image using bicubic interpolation. | ||
/// - Parameters: | ||
/// - image: The image to resample | ||
/// - size: The target size | ||
/// - Returns: The resampled image | ||
public static func resampleBicubic(_ image: CIImage, to size: CGSize) throws -> CIImage { | ||
// Create a bicubic scale filter | ||
|
||
let yScale = size.height / image.extent.height | ||
let xScale = size.width / image.extent.width | ||
|
||
// the image has a DoD larger than the requested size so crop | ||
// it to the desired size | ||
return rescaled.cropped(to: CGRect(origin: .zero, size: size)) | ||
let filter = CIFilter.bicubicScaleTransform() | ||
filter.inputImage = image | ||
filter.scale = Float(yScale) | ||
filter.aspectRatio = Float(xScale / yScale) | ||
guard let scaledImage = filter.outputImage else { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would probably use |
||
throw MediaProcessingError.transformFailed | ||
} | ||
// Create a rect with the exact dimensions we want | ||
let exactRect = CGRect( | ||
x: 0, | ||
y: 0, | ||
width: size.width, | ||
height: size.height | ||
) | ||
// Crop to ensure exact dimensions | ||
return scaledImage.cropped(to: exactRect) | ||
} | ||
|
||
/// Normalize the image using the given mean and standard deviation parameters. | ||
|
@@ -94,7 +104,7 @@ public enum MediaProcessing { | |
let filter = CIFilter.colorMatrix() | ||
filter.inputImage = image | ||
|
||
// this should match | ||
// This should match | ||
// https://pytorch.org/vision/main/generated/torchvision.transforms.Normalize.html | ||
// | ||
// output[channel] = (input[channel] - mean[channel]) / std[channel] | ||
|
@@ -113,6 +123,10 @@ public enum MediaProcessing { | |
} | ||
|
||
/// Convert the CIImage into a planar 3 channel MLXArray `[1, C, H, W]` | ||
/// - Parameters: | ||
/// - image: The image to convert | ||
/// - colorSpace: Optional color space for rendering | ||
/// - Returns: The MLXArray representation of the image | ||
static public func asMLXArray(_ image: CIImage, colorSpace: CGColorSpace? = nil) -> MLXArray { | ||
let size = image.extent.size | ||
let w = Int(size.width.rounded()) | ||
|
@@ -135,10 +149,10 @@ public enum MediaProcessing { | |
|
||
var array = MLXArray(data, [h, w, 4], type: Float32.self) | ||
|
||
// drop 4th channel | ||
// Drop 4th channel | ||
array = array[0..., 0..., ..<3] | ||
|
||
// convert to 1, C, H, W | ||
// Convert to 1, C, H, W | ||
array = array.reshaped(1, h, w, 3).transposed(0, 3, 1, 2) | ||
|
||
return array | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -441,7 +441,7 @@ private enum Vision { | |
/// PaliGemma VLM `UserInputProcessor`. | ||
/// | ||
/// This is meant to be used with ``PaliGemma`` and is typically created by ``VLMModelFactory``. | ||
public class PaligGemmaProcessor: UserInputProcessor { | ||
public class PaliGemmaProcessor: UserInputProcessor { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Whoops |
||
|
||
private let config: PaliGemmaProcessorConfiguration | ||
private let tokenizer: any Tokenizer | ||
|
@@ -451,7 +451,7 @@ public class PaligGemmaProcessor: UserInputProcessor { | |
self.tokenizer = tokenizer | ||
} | ||
|
||
private func prepare(image: CIImage, processing: UserInput.Processing?) -> MLXArray { | ||
private func prepare(image: CIImage, processing: UserInput.Processing?) throws -> MLXArray { | ||
// based on image_processing_siglip from transformers | ||
var image = image | ||
|
||
|
@@ -463,7 +463,7 @@ public class PaligGemmaProcessor: UserInputProcessor { | |
// apply user instructions | ||
image = MediaProcessing.apply(image, processing: processing) | ||
|
||
image = MediaProcessing.resampleBicubic(image, to: config.size.cgSize) | ||
image = try MediaProcessing.resampleBicubic(image, to: config.size.cgSize) | ||
image = MediaProcessing.normalize( | ||
image, mean: config.imageMeanTuple, std: config.imageStdTuple) | ||
|
||
|
@@ -705,7 +705,7 @@ public struct PaliGemmaConfiguration: Codable, Sendable { | |
} | ||
} | ||
|
||
/// Configuration for ``PaligGemmaProcessor`` | ||
/// Configuration for ``PaliGemmaProcessor`` | ||
public struct PaliGemmaProcessorConfiguration: Codable, Sendable { | ||
|
||
public struct Size: Codable, Sendable { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Two thoughts:
I lean toward the latter, since it sits closer to the VLM and is better positioned to know what input sizes it needs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And indeed the size may also depend on the config, which is a second point in favor of the latter approach.