add documentation, images/video are live

davidkoski · davidkoski · commit c15e408d2040 · 2025-04-17T13:44:51.000-07:00
diff --git a/Libraries/MLXLMCommon/Chat.swift b/Libraries/MLXLMCommon/Chat.swift
@@ -3,16 +3,26 @@
 public enum Chat {
     public struct Message {
         /// The role of the message sender.
-        public let role: Role
+        public var role: Role
 
         /// The content of the message.
-        public let content: String
+        public var content: String
 
         /// Array of image data associated with the message.
-        public let images: [UserInput.Image]
+        public var images: [UserInput.Image]
 
         /// Array of video data associated with the message.
-        public let videos: [UserInput.Video]
+        public var videos: [UserInput.Video]
+
+        public init(
+            role: Role, content: String, images: [UserInput.Image] = [],
+            videos: [UserInput.Video] = []
+        ) {
+            self.role = role
+            self.content = content
+            self.images = images
+            self.videos = videos
+        }
 
         public static func system(
             _ content: String, images: [UserInput.Image] = [], videos: [UserInput.Video] = []
@@ -40,13 +50,25 @@ public enum Chat {
     }
 }
 
+/// Protocol for something that can convert structured
+/// ``Chat.Message`` into model specific ``Message``
+/// (raw dictionary) format.
+///
+/// Typically this is owned and used by a ``UserInputProcessor``:
+///
+/// ```swift
+/// public func prepare(input: UserInput) async throws -> LMInput {
+///     let messages = Qwen2VLMessageGenerator().generate(from: input)
+///     ...
+/// ```
 public protocol MessageGenerator {
-    /// Returns [String: Any] aka Message
+
+    /// Returns `[String: Any]` aka ``Message``.
     func generate(message: Chat.Message) -> Message
 }
 
 extension MessageGenerator {
-    /// Returns array of [String: Any] aka Message
+    /// Returns array of `[String: Any]` aka ``Message``
     public func generate(messages: [Chat.Message]) -> [Message] {
         var rawMessages: [Message] = []
 
@@ -71,6 +93,15 @@ extension MessageGenerator {
     }
 }
 
+/// Default implementation of ``MessageGenerator`` that produces a
+/// `role` and `content`.
+///
+/// ```swift
+/// [
+///     "role": message.role.rawValue,
+///     "content": message.content,
+/// ]
+/// ```
 public struct DefaultMessageGenerator: MessageGenerator {
     public init() {}
 
diff --git a/Libraries/MLXLMCommon/UserInput.swift b/Libraries/MLXLMCommon/UserInput.swift
@@ -13,10 +13,19 @@ public typealias Message = [String: Any]
 /// A ``UserInputProcessor`` can convert this to ``LMInput``.
 /// See also ``ModelContext``.
 public struct UserInput: Sendable {
+
     /// Representation of a prompt or series of messages (conversation).
+    ///
+    /// This may be a single string with a user prompt or a series of back
+    /// and forth responses representing a conversation.
     public enum Prompt: Sendable, CustomStringConvertible {
+        /// a single string
         case text(String)
+
+        /// model specific array of dictionaries
         case messages([Message])
+
+        /// model agnostic structured chat (series of messages)
         case chat([Chat.Message])
 
         public var description: String {
@@ -31,6 +40,7 @@ public struct UserInput: Sendable {
         }
     }
 
+    /// Representation of a video resource.
     public enum Video: Sendable {
         case avAsset(AVAsset)
         case url(URL)
@@ -45,7 +55,7 @@ public struct UserInput: Sendable {
         }
     }
 
-    /// Representation of a single image.
+    /// Representation of an image resource.
     public enum Image: Sendable {
         case ciImage(CIImage)
         case url(URL)
@@ -118,60 +128,196 @@ public struct UserInput: Sendable {
         }
     }
 
+    /// The prompt to evaluate.
     public var prompt: Prompt
-    public var images = [Image]()
-    public var videos = [Video]()
+
+    /// The images associated with the `UserInput`.
+    ///
+    /// If the ``prompt-swift.property`` is a ``Prompt-swift.enum/chat(_:)`` this will collect the images
+    /// from the chat messages (and assignments are ignored), otherwise these are the stored images with the ``UserInput``.
+    public var images: [Image] {
+        get {
+            switch prompt {
+            case .text: _images
+            case .messages: _images
+            case .chat(let messages):
+                messages.reduce(into: []) { result, message in
+                    result.append(contentsOf: message.images)
+                }
+            }
+        }
+        set {
+            switch prompt {
+            case .text, .messages:
+                _images = newValue
+            case .chat:
+                break
+            }
+        }
+    }
+
+    private var _images = [Image]()
+
+    /// The videos associated with the `UserInput`.
+    ///
+    /// If the ``prompt-swift.property`` is a ``Prompt-swift.enum/chat(_:)`` this will collect the videos
+    /// from the chat messages (and assignments are ignored), otherwise these are the stored videos with the ``UserInput``.
+    public var videos: [Video] {
+        get {
+            switch prompt {
+            case .text: _videos
+            case .messages: _videos
+            case .chat(let messages):
+                messages.reduce(into: []) { result, message in
+                    result.append(contentsOf: message.videos)
+                }
+            }
+        }
+        set {
+            switch prompt {
+            case .text, .messages:
+                _videos = newValue
+            case .chat:
+                break
+            }
+        }
+    }
+
+    private var _videos = [Video]()
+
     public var tools: [ToolSpec]?
+
     /// Additional values provided for the chat template rendering context
     public var additionalContext: [String: Any]?
     public var processing: Processing = .init()
 
+    /// Initialize the `UserInput` with a single text prompt.
+    ///
+    /// - Parameters:
+    ///   - prompt: text prompt
+    ///   - images: optional images
+    ///   - videos: optional videos
+    ///   - tools: optional tool specifications
+    ///   - additionalContext: optional context (model specific)
+    /// ### See Also
+    /// - ``Prompt-swift.enum/text(_:)``
+    /// - ``init(chat:tools:additionalContext:)``
     public init(
         prompt: String, images: [Image] = [Image](), videos: [Video] = [Video](),
         tools: [ToolSpec]? = nil,
         additionalContext: [String: Any]? = nil
     ) {
-        self.prompt = .text(prompt)
-        self.images = images
-        self.videos = videos
+        self.prompt = .chat([
+            .user(prompt, images: images, videos: videos)
+        ])
         self.tools = tools
         self.additionalContext = additionalContext
     }
 
+    /// Initialize the `UserInput` with model specific message structures.
+    ///
+    /// For example, the Qwen2VL model wants input in this format:
+    ///
+    /// ```
+    /// [
+    ///     [
+    ///         "role": "user",
+    ///         "content": [
+    ///             ["type": "text", "text": "What is this?"],
+    ///             ["type": "image"],
+    ///         ]
+    ///     ]
+    /// ]
+    /// ```
+    ///
+    /// The supplied `images` and `videos` are stored with the `UserInput`
+    /// and are returned by ``images`` and ``videos``.
+    ///
+    /// Typically the ``init(chat:tools:additionalContext:)`` should be used instead
+    /// along with a model specific ``MessageGenerator`` (supplied by the ``UserInputProcessor``).
+    ///
+    /// - Parameters:
+    ///   - messages: array of dictionaries representing the prompt in a model specific format
+    ///   - images: optional images
+    ///   - videos: optional videos
+    ///   - tools: optional tool specifications
+    ///   - additionalContext: optional context (model specific)
+    /// ### See Also
+    /// - ``Prompt-swift.enum/messages(_:)``
+    /// - ``init(chat:tools:additionalContext:)``
     public init(
         messages: [Message], images: [Image] = [Image](), videos: [Video] = [Video](),
         tools: [ToolSpec]? = nil,
         additionalContext: [String: Any]? = nil
     ) {
         self.prompt = .messages(messages)
         self.images = images
         self.videos = videos
         self.tools = tools
         self.additionalContext = additionalContext
     }
 
+    /// Initialize the `UserInput` with a model agnostic structured context.
+    ///
+    /// For example:
+    ///
+    /// ```
+    /// let chat: [Chat.Message] = [
+    ///     .system("You are a helpful photographic assistant."),
+    ///     .user("Please describe the photo.", images: [image1]),
+    /// ]
+    /// let userInput = UserInput(chat: chat)
+    /// ```
+    ///
+    /// A model specific ``MessageGenerator`` (supplied by the ``UserInputProcessor``)
+    /// is used to convert this into a model specific format.
+    ///
+    /// - Parameters:
+    ///   - chat: structured content
+    ///   - tools: optional tool specifications
+    ///   - additionalContext: optional context (model specific)
+    /// ### See Also
+    /// - ``Prompt-swift.enum/chat(_:)``
+    /// - ``init(chat:tools:additionalContext:)``
     public init(
-        messages: [Chat.Message],
+        chat: [Chat.Message],
         tools: [ToolSpec]? = nil,
         additionalContext: [String: Any]? = nil
     ) {
-        self.prompt = .chat(messages)
-        self.images = messages.reduce(into: []) { result, message in
-            result.append(contentsOf: message.images)
-        }
-        self.videos = messages.reduce(into: []) { result, message in
-            result.append(contentsOf: message.videos)
-        }
+        self.prompt = .chat(chat)
         self.tools = tools
         self.additionalContext = additionalContext
     }
 
+    /// Initialize the `UserInput` with a preconfigured ``Prompt-swift.enum``.
+    ///
+    /// ``init(chat:tools:additionalContext:)`` is the preferred mechanism.
+    ///
+    /// - Parameters:
+    ///   - prompt: the prompt
+    ///   - images: optional images
+    ///   - videos: optional videos
+    ///   - tools: optional tool specifications
+    ///   - processing: optional processing to be applied to media
+    ///   - additionalContext: optional context (model specific)
+    /// ### See Also
+    /// - ``Prompt-swift.enum/text(_:)``
+    /// - ``init(chat:tools:additionalContext:)``
     public init(
-        prompt: Prompt, images: [Image] = [Image](), processing: Processing = .init(),
+        prompt: Prompt,
+        images: [Image] = [Image](),
+        videos: [Video] = [Video](),
+        processing: Processing = .init(),
         tools: [ToolSpec]? = nil, additionalContext: [String: Any]? = nil
     ) {
         self.prompt = prompt
-        self.images = images
+        switch prompt {
+        case .text, .messages:
+            _images = images
+            _videos = videos
+        case .chat:
+            break
+        }
         self.processing = processing
         self.tools = tools
         self.additionalContext = additionalContext