@@ -5,6 +5,18 @@ import CoreImage.CIFilterBuiltins
import MLX
import MLXLMCommon

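+/// A single frame extracted from a video asset, paired with its presentation timestamp.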
+public struct VideoFrame {
+    let frame: CIImage
+    let timeStamp: CMTime
+}
+
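+/// Sampled frames converted to `MLXArray`s, their timestamps, and the total duration of the source asset.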
+public struct ProcessedFrames {
+    let frames: [MLXArray]
+    let timestamps: [CMTime]
+    let totalDuration: CMTime
+}
+
+// TODO: verify working color space, rendering color space
private let context = CIContext()

/// Collection of methods for processing media (images, video, etc.).
@@ -58,6 +70,12 @@ public enum MediaProcessing {
        min(other.width / size.width, other.height / size.height)
    }

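+    /// Aspect-ratio correction factor applied by the resampling filters so that `image` matches the aspect ratio of `size`.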
+    static public func aspectRatioForResample(_ image: CIImage, size: CGSize) -> Float {
+        let inputAspectRatio = image.extent.width / image.extent.height
+        let desiredAspectRatio = size.width / size.height
+        return Float(1 / inputAspectRatio * desiredAspectRatio)
+    }
+
    /// Resample the image using bicubic interpolation.
    public static func resampleBicubic(_ image: CIImage, to size: CGSize) -> CIImage {
        let filter = CIFilter.bicubicScaleTransform()
@@ -66,9 +84,34 @@ public enum MediaProcessing {
        filter.inputImage = image

        // set the aspect ratio to match the aspect ratio of the target
-        let inputAspectRatio = extent.width / extent.height
-        let desiredAspectRatio = size.width / size.height
-        filter.aspectRatio = Float(1 / inputAspectRatio * desiredAspectRatio)
+        filter.aspectRatio = aspectRatioForResample(image, size: size)
+
+        // that image is now the aspect ratio of the target and the size
+        // of the shorter dimension
+        let scale: CGFloat
+        if extent.width < extent.height {
+            scale = size.width / extent.width
+        } else {
+            scale = size.height / extent.height
+        }
+        filter.scale = Float(scale)
+
+        let rescaled = filter.outputImage!
+
+        // the image has a DoD larger than the requested size so crop
+        // it to the desired size
+        return rescaled.cropped(to: CGRect(origin: .zero, size: size))
+    }
+
+    /// Resample the image using Lanczos interpolation.
+    static public func resampleLanczos(_ image: CIImage, to size: CGSize) -> CIImage {
+        let filter = CIFilter.lanczosScaleTransform()
+        let extent = image.extent.size
+
+        filter.inputImage = image
+
+        // set the aspect ratio to match the aspect ratio of the target
+        filter.aspectRatio = aspectRatioForResample(image, size: size)

        // that image is now the aspect ratio of the target and the size
        // of the shorter dimension
@@ -264,4 +307,105 @@ public enum MediaProcessing {

        return ciImages
    }
+
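+    /// Extract frames from `asset` at a fixed rate of `samplesPerSecond` and return them as `ProcessedFrames`.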
+    static public func asProcessedSequence(
+        _ asset: AVAsset, samplesPerSecond: Int,
+        frameProcessing: (VideoFrame) throws -> VideoFrame = { $0 }
+    ) async throws -> ProcessedFrames {
+        return try await asProcessedSequence(
+            asset, maxFrames: Int.max, targetFPS: { _ in Double(samplesPerSecond) },
+            frameProcessing: frameProcessing)
+    }
+
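+    /// Extract up to `maxFrames` frames from `asset`, sampled uniformly across its duration at the rate
+    /// returned by `targetFPS`, optionally transforming each frame with `frameProcessing`.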
+    static public func asProcessedSequence(
+        _ asset: AVAsset, maxFrames: Int, targetFPS: (CMTime) -> Double,
+        frameProcessing: (VideoFrame) throws -> VideoFrame = { $0 }
+    ) async throws -> ProcessedFrames {
+        // Use AVAssetImageGenerator to extract frames
+        let generator = AVAssetImageGenerator(asset: asset)
+        generator.appliesPreferredTrackTransform = true
+        generator.requestedTimeToleranceBefore = .zero
+        generator.requestedTimeToleranceAfter = .zero
+
+        guard let duration = try? await asset.load(.duration) else {
+            throw NSError(
+                domain: "MediaProcessing", code: -1,
+                userInfo: [NSLocalizedDescriptionKey: "Failed to load the asset's duration"])
+        }
+        let fps = targetFPS(duration)
+        // Note: the round was not present in `asCIImageSequence`, so we may now be passing 1 more frame to Qwen depending on video duration.
+        let estimatedFrames = Int(round(fps * duration.seconds))
+        let desiredFrames = min(estimatedFrames, maxFrames)
+        let finalFrameCount = max(desiredFrames, 1)
+
+        let sampledTimeValues = MLXArray.linspace(
+            0, duration.value, count: finalFrameCount
+        ).asArray(Int64.self)
+
+        // Construct CMTimes from the sampled CMTimeValues and the asset's timescale
+        let timescale = duration.timescale
+        let sampledTimes = sampledTimeValues.map { CMTime(value: $0, timescale: timescale) }
+
+        // Collect the frames
+        var ciImages: [CIImage] = []
+        var timestamps: [CMTime] = []
+
+        for await result in await generator.images(for: sampledTimes) {
+            switch result {
+            case .success(requestedTime: _, let image, actualTime: let actual):
+                let ciImage = CIImage(
+                    cgImage: image, options: [.colorSpace: CGColorSpace(name: CGColorSpace.sRGB)!])
+                let frame = try frameProcessing(.init(frame: ciImage, timeStamp: actual))
+                ciImages.append(frame.frame)
+                timestamps.append(frame.timeStamp)
+            case .failure:
+                // Skip frames that could not be generated
+                break
+            }
+        }
+
+        let framesAsArrays = ciImages.map { $0.asMLXArray() }
+        return ProcessedFrames(
+            frames: framesAsArrays,
+            timestamps: timestamps,
+            totalDuration: duration
+        )
+    }
+}
+
+// MARK: - Convenience
+
+extension CIImage {
+    public enum ResamplingMethod {
+        case bicubic
+        case lanczos
+    }
+
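+    /// Resample the image to `size` using the given interpolation method.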
+    public func resampled(to size: CGSize, method: ResamplingMethod = .bicubic) -> CIImage {
+        switch method {
+        case .bicubic:
+            return MediaProcessing.resampleBicubic(self, to: size)
+        case .lanczos:
+            return MediaProcessing.resampleLanczos(self, to: size)
+        }
+    }
+
+    public func toSRGB() -> CIImage {
+        return MediaProcessing.inSRGBToneCurveSpace(self)
+    }
+
+    public func toLinear() -> CIImage {
+        return MediaProcessing.inLinearToneCurveSpace(self)
+    }
+
+    public func normalized(mean: (CGFloat, CGFloat, CGFloat), std: (CGFloat, CGFloat, CGFloat))
+        -> CIImage
+    {
+        return MediaProcessing.normalize(self, mean: mean, std: std)
+    }
+
+    public func asMLXArray(colorSpace: CGColorSpace? = nil) -> MLXArray {
+        return MediaProcessing.asMLXArray(self, colorSpace: colorSpace)
+    }
 }
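
A minimal usage sketch of the new API (not part of the diff above). It assumes the caller lives in the same module as MediaProcessing, since the VideoFrame members and memberwise initializer are internal; the sampling rate, target size, and normalization constants are illustrative placeholders rather than values from this change.

import AVFoundation
import CoreImage

// Hypothetical helper: sample a video at 2 fps, resampling and normalizing each
// frame before it is converted to an MLXArray.
func sampleFrames(from url: URL) async throws -> ProcessedFrames {
    let asset = AVURLAsset(url: url)
    return try await MediaProcessing.asProcessedSequence(asset, samplesPerSecond: 2) { frame in
        let image = frame.frame
            .toSRGB()
            .resampled(to: CGSize(width: 336, height: 336), method: .bicubic)
            .normalized(mean: (0.5, 0.5, 0.5), std: (0.5, 0.5, 0.5))
        return VideoFrame(frame: image, timeStamp: frame.timeStamp)
    }
}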