VLM support for image and video processing with SmolVLM support #206
@@ -15,6 +15,11 @@ import SwiftUI
     typealias PlatformImage = NSImage
 #endif

+let videoSystemPrompt =
+    "Focus only on describing the key dramatic action or notable event occurring in this video segment. Skip general context or scene-setting details unless they are crucial to understanding the main action."
+let imageSystemPrompt =
+    "You are an image understanding model capable of describing the salient features of any image."
+
 struct ContentView: View {
     @State var prompt = ""
     @State var llm = VLMEvaluator()

@@ -28,7 +33,7 @@ struct ContentView: View {
            }
        }
    }
-    @State private var selectedVideoURL: URL? = nil {
+    @State private var selectedVideoURL: URL? {
         didSet {
             if let selectedVideoURL {
                 player = AVPlayer(url: selectedVideoURL)

@@ -61,7 +66,11 @@
        }

        VStack {
-           if let selectedImage {
+           if let player {
+               VideoPlayer(player: player)
+                   .frame(height: 300)
+                   .cornerRadius(12)
+           } else if let selectedImage {
                Group {
                    #if os(iOS) || os(visionOS)
                        Image(uiImage: selectedImage)

@@ -91,11 +100,6 @@
                        EmptyView()
                    }
                }
-           } else if let player {
-               VideoPlayer(player: player)
-                   .scaledToFit()
-                   .frame(maxHeight: 300)
-                   .cornerRadius(12)
            }

            HStack {

@@ -193,6 +197,7 @@
                    .id("bottom")
                }
            }
+           .frame(minHeight: 200)

            HStack {
                TextField("prompt", text: $prompt)

@@ -205,6 +210,11 @@
                    .disabled(llm.running)
            }
        }
+       .onAppear {
+           selectedVideoURL = URL(
+               string:
+                   "https://videos.pexels.com/video-files/4066325/4066325-uhd_2560_1440_24fps.mp4")!
+       }
Review comment: This is nice for testing, but I think we should probably remove the example asset from the example -- force people to use their own images & videos. Also, I don't know the license on this video :-) On the other hand, this is meant as an example for developers to build on, and maybe it is good to have something ready to go? Anyone have any thoughts on this?

Review comment: I'm fine either way. cc @cyrilzakka on the video rights (but also for opinion) :)

Review comment: Maybe we could do something like this?

    selectedVideoURL = URL(string: "https://videos.pexels.com/video-files/4066325/4066325-uhd_2560_1440_24fps.mp4")!

These videos on https://www.pexels.com/search/videos/public%20domain/ are marked as "free to use" and many are pretty short. This way we don't need to include the resource (size + license) but we can still give an example for people to use on first launch.
 #if os(visionOS)
     .padding(40)
 #else
@@ -320,12 +330,12 @@ class VLMEvaluator {
     var modelInfo = ""
     var stat = ""

-    /// This controls which model loads. `qwen2VL2BInstruct4Bit` is one of the smaller ones, so this will fit on
+    /// This controls which model loads. `smolvlm` is very small even unquantized, so it will fit on
     /// more devices.
-    let modelConfiguration = ModelRegistry.qwen2VL2BInstruct4Bit
+    let modelConfiguration = VLMRegistry.smolvlm

-    /// parameters controlling the output
-    let generateParameters = MLXLMCommon.GenerateParameters(temperature: 0.6)
+    /// parameters controlling the output – use values appropriate for the model selected above
+    let generateParameters = MLXLMCommon.GenerateParameters(temperature: 0.7, topP: 0.9)
Review comment: These parameters are also smolvlm-specific.
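Since the defaults are now model-specific, one way to keep the sampling settings in sync with the model choice is to derive both together. A minimal sketch, assuming a hypothetical helper keyed on a model name; the helper and its key are illustrative, but the two parameter sets are the ones this diff uses for SmolVLM and previously used for Qwen 2 VL:

```swift
import MLXLMCommon

// Hypothetical helper, not part of this PR: pair each model with the
// sampling parameters this example app uses for it.
func defaultGenerateParameters(forModelNamed name: String) -> GenerateParameters {
    switch name {
    case "smolvlm":
        return GenerateParameters(temperature: 0.7, topP: 0.9)  // SmolVLM values above
    default:
        return GenerateParameters(temperature: 0.6)  // previous qwen2VL2BInstruct4Bit value
    }
}
```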
     let maxTokens = 800

     /// update the display every N tokens -- 4 looks like it updates continuously

@@ -401,7 +411,11 @@
             [
                 "role": "user",
                 "content": [
-                    ["type": "text", "text": prompt]
+                    [
+                        "type": "text",
+                        "text": videoURL != nil
+                            ? videoSystemPrompt : imageSystemPrompt,
+                    ]
                 ]
                 // Messages format for Qwen 2 VL, Qwen 2.5 VL. May need to be adapted for other models.
                 + images.map { _ in
@@ -5,6 +5,18 @@ import CoreImage.CIFilterBuiltins
 import MLX
 import MLXLMCommon

+public struct VideoFrame {
+    let frame: CIImage
+    let timeStamp: CMTime
+}
+
+public struct ProcessedFrames {
+    let frames: [MLXArray]
+    let timestamps: [CMTime]
+    let totalDuration: CMTime
+}
+
 // TODO: verify working color space, rendering color space
Review comment: This is a good idea. I think the Python processing code is roughly equivalent to: the colorspace of the input, no conversion to linear, and what is called "device RGB" (don't touch my colors). In other words it isn't managed color, but that is what we have here. We could certainly do something like use the non-linear form of the input colorspace and output to the same. In practice I am not sure it matters that much. These models are probably trained on consistent colorspace inputs (though sRGB is likely, displayP3 from iPhone images is pretty likely, and videos are much more diverse). Maybe this should turn into an issue?

Review comment: That said: I don't think we should try to replicate the unmanaged colorspace of the Python version. I think we should pick a colorspace (sRGB or displayP3) and be consistent.

Review comment: Yes, makes sense to turn into an issue. I also think that Python pre-processing is mostly oblivious to colorspace.
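For reference, a minimal sketch of what "pick a colorspace and be consistent" could look like with Core Image: pin both the working and output colorspaces of the shared context. Choosing sRGB here is an assumption for illustration; the PR itself leaves the context unconfigured (hence the TODO above):

```swift
import CoreImage

// Sketch, not part of the PR: a CIContext pinned to sRGB so every input
// (sRGB images, displayP3 iPhone photos, assorted video frames) is filtered
// and rendered in one consistent colorspace.
let srgb = CGColorSpace(name: CGColorSpace.sRGB)!
let managedContext = CIContext(options: [
    .workingColorSpace: srgb,  // filter math happens in sRGB
    .outputColorSpace: srgb,  // rendered pixels come back out in sRGB
])
```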
 private let context = CIContext()

 /// Collection of methods for processing media (images, video, etc.).

@@ -58,6 +70,12 @@ public enum MediaProcessing {
         min(other.width / size.width, other.height / size.height)
     }

+    static public func aspectRatioForResample(_ image: CIImage, size: CGSize) -> Float {
+        let inputAspectRatio = image.extent.width / image.extent.height
+        let desiredAspectRatio = size.width / size.height
+        return Float(1 / inputAspectRatio * desiredAspectRatio)
+    }
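To make the new helper concrete: on Core Image's scale-transform filters, `aspectRatio` is the extra horizontal squeeze or stretch applied on top of the uniform scale. A quick worked example, purely illustrative with made-up dimensions:

```swift
import CoreImage

// A 16:9 frame resampled toward a square target:
//   inputAspectRatio   = 1920 / 1080 = 16/9
//   desiredAspectRatio = 512 / 512   = 1
//   aspectRatio        = (1 / (16/9)) * 1 = 0.5625
let frame = CIImage(color: .gray).cropped(
    to: CGRect(x: 0, y: 0, width: 1920, height: 1080))
let ratio = MediaProcessing.aspectRatioForResample(
    frame, size: CGSize(width: 512, height: 512))
// ratio == 0.5625
```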
     /// Resample the image using bicubic interpolation.
     public static func resampleBicubic(_ image: CIImage, to size: CGSize) -> CIImage {
         let filter = CIFilter.bicubicScaleTransform()

@@ -66,9 +84,34 @@
         filter.inputImage = image

         // set the aspect ratio to match the aspect ratio of the target
-        let inputAspectRatio = extent.width / extent.height
-        let desiredAspectRatio = size.width / size.height
-        filter.aspectRatio = Float(1 / inputAspectRatio * desiredAspectRatio)
+        filter.aspectRatio = aspectRatioForResample(image, size: size)

+        // that image is now the aspect ratio of the target and the size
+        // of the shorter dimension
+        let scale: CGFloat
+        if extent.width < extent.height {
+            scale = size.width / extent.width
+        } else {
+            scale = size.height / extent.height
+        }
+        filter.scale = Float(scale)
+
+        let rescaled = filter.outputImage!
+
+        // the image has a DoD larger than the requested size so crop
+        // it to the desired size
+        return rescaled.cropped(to: CGRect(origin: .zero, size: size))
+    }
+
+    /// Resample the image using Lanczos interpolation.
+    static public func resampleLanczos(_ image: CIImage, to size: CGSize) -> CIImage {
Review comment: Smol uses Lanczos? I agree it is the better resampling method for humans, but the sinc it simulates has an edge-strengthening effect -- I am surprised to see it used here.

Review comment: Yes, it does. I was surprised too when I saw it but didn't follow up with the team. cc @mfarre, just curious if there's any insight :)

Review comment: This is inherited from the Idefics3 image processor :)
+        let filter = CIFilter.lanczosScaleTransform()
+        let extent = image.extent.size
+
+        filter.inputImage = image
+
+        // set the aspect ratio to match the aspect ratio of the target
+        filter.aspectRatio = aspectRatioForResample(image, size: size)

         // that image is now the aspect ratio of the target and the size
         // of the shorter dimension
@@ -264,4 +307,105 @@

         return ciImages
     }

+    static public func asProcessedSequence(
+        _ asset: AVAsset, samplesPerSecond: Int,
+        frameProcessing: (VideoFrame) throws -> VideoFrame = { $0 }
+    ) async throws -> ProcessedFrames {
+        return try await asProcessedSequence(
+            asset, maxFrames: Int.max, targetFPS: { _ in Double(samplesPerSecond) },
+            frameProcessing: frameProcessing)
+    }
+
+    static public func asProcessedSequence(
+        _ asset: AVAsset, maxFrames: Int, targetFPS: (CMTime) -> Double,
+        frameProcessing: (VideoFrame) throws -> VideoFrame = { $0 }
+    ) async throws -> ProcessedFrames {
+        // Use AVAssetImageGenerator to extract frames
+        let generator = AVAssetImageGenerator(asset: asset)
+        generator.appliesPreferredTrackTransform = true
+        generator.requestedTimeToleranceBefore = .zero
+        generator.requestedTimeToleranceAfter = .zero
+
+        guard let duration = try? await asset.load(.duration) else {
+            throw NSError(
+                domain: "MediaProcessing", code: -1,
+                userInfo: [NSLocalizedDescriptionKey: "Failed to load the asset's duration"])
+        }
+        let fps = targetFPS(duration)
+        // Note: the round was not present in `asCIImageSequence`, so we may now be passing 1 more frame to Qwen depending on video duration.
Review comment: As noted in the comment, this may result in an additional frame being extracted for users of the previous `asCIImageSequence`.
+        let estimatedFrames = Int(round(fps * duration.seconds))
+        var desiredFrames = min(estimatedFrames, maxFrames)
+        let finalFrameCount = max(desiredFrames, 1)
+
+        let sampledTimeValues = MLXArray.linspace(
+            0, duration.value, count: Int(finalFrameCount)
+        ).asArray(Int64.self)
+
+        // Construct a CMTime using the sampled CMTime values and the asset's timescale
+        let timescale = duration.timescale
+        let sampledTimes = sampledTimeValues.map { CMTime(value: $0, timescale: timescale) }
+
+        // Collect the frames
+        var ciImages: [CIImage] = []
+        var timestamps: [CMTime] = []
+
+        var frames: [VideoFrame] = []
+
+        for await result in await generator.images(for: sampledTimes) {
+            switch result {
+            case .success(requestedTime: let requested, let image, actualTime: let actual):
+                let ciImage = CIImage(
+                    cgImage: image, options: [.colorSpace: CGColorSpace(name: CGColorSpace.sRGB)!])
+                let frame = try frameProcessing(.init(frame: ciImage, timeStamp: actual))
+                ciImages.append(frame.frame)
+                timestamps.append(frame.timeStamp)
+            case .failure(requestedTime: let requested, let error):
+                break
+            }
+        }
+
+        let framesAsArrays = ciImages.map { $0.asMLXArray() }
+        return ProcessedFrames(
+            frames: framesAsArrays,
+            timestamps: timestamps,
+            totalDuration: duration
+        )
+    }
 }
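A quick sketch of how the new API might be driven; the file path and target size are placeholders, and since `VideoFrame`'s memberwise initializer and `ProcessedFrames`' stored properties are internal, as written this only compiles inside the same module:

```swift
import AVFoundation
import CoreImage

// Sketch: sample a local video at 1 frame per second, resizing each frame
// before it is converted to an MLXArray.
func sampleVideo() async throws {
    let asset = AVURLAsset(url: URL(fileURLWithPath: "/tmp/example.mp4"))  // placeholder path
    let processed = try await MediaProcessing.asProcessedSequence(
        asset, samplesPerSecond: 1
    ) { videoFrame in
        VideoFrame(
            frame: videoFrame.frame.resampled(to: CGSize(width: 384, height: 384)),
            timeStamp: videoFrame.timeStamp)
    }
    print("sampled \(processed.frames.count) frames over \(processed.totalDuration.seconds)s")
}
```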
+
+// MARK: - Convenience
+
+extension CIImage {
+    public enum ResamplingMethod {
+        case bicubic
+        case lanczos
+    }
+
+    public func resampled(to size: CGSize, method: ResamplingMethod = .bicubic) -> CIImage {
+        switch method {
+        case .bicubic:
+            return MediaProcessing.resampleBicubic(self, to: size)
+        case .lanczos:
+            return MediaProcessing.resampleLanczos(self, to: size)
+        }
+    }
+
+    public func toSRGB() -> CIImage {
+        return MediaProcessing.inSRGBToneCurveSpace(self)
+    }
+
+    public func toLinear() -> CIImage {
+        return MediaProcessing.inLinearToneCurveSpace(self)
+    }
+
+    public func normalized(mean: (CGFloat, CGFloat, CGFloat), std: (CGFloat, CGFloat, CGFloat))
+        -> CIImage
+    {
+        return MediaProcessing.normalize(self, mean: mean, std: std)
+    }
+
+    public func asMLXArray(colorSpace: CGColorSpace? = nil) -> MLXArray {
+        return MediaProcessing.asMLXArray(self, colorSpace: colorSpace)
+    }
+}
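Taken together, these conveniences read as a small preprocessing pipeline. A sketch of typical use; the target size and the mean/std normalization stats are illustrative values, not ones prescribed by this PR:

```swift
import CoreImage
import MLX

// Sketch: CIImage -> normalized MLXArray, using the conveniences above.
func preprocess(_ image: CIImage) -> MLXArray {
    image
        .toSRGB()  // settle on the sRGB tone curve before resampling
        .resampled(to: CGSize(width: 384, height: 384), method: .lanczos)
        .normalized(mean: (0.5, 0.5, 0.5), std: (0.5, 0.5, 0.5))  // illustrative stats
        .asMLXArray()
}
```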
Review comment: I think these are fine for now, but this + the message formatting needs to be figured out (later :-) )

Review comment: Agreed!