Skip to content

Commit c15e408

Browse files
committed
add documentation, images/video are live
1 parent 596f129 commit c15e408

File tree

2 files changed

+201
-24
lines changed

2 files changed

+201
-24
lines changed

Libraries/MLXLMCommon/Chat.swift

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,26 @@
33
public enum Chat {
44
public struct Message {
55
/// The role of the message sender.
6-
public let role: Role
6+
public var role: Role
77

88
/// The content of the message.
9-
public let content: String
9+
public var content: String
1010

1111
/// Array of image data associated with the message.
12-
public let images: [UserInput.Image]
12+
public var images: [UserInput.Image]
1313

1414
/// Array of video data associated with the message.
15-
public let videos: [UserInput.Video]
15+
public var videos: [UserInput.Video]
16+
17+
/// Creates a chat message.
///
/// - Parameters:
///   - role: the role of the message sender
///   - content: the text content of the message
///   - images: images attached to the message (empty by default)
///   - videos: videos attached to the message (empty by default)
public init(
    role: Role,
    content: String,
    images: [UserInput.Image] = [],
    videos: [UserInput.Video] = []
) {
    // Plain memberwise copy of the arguments into the stored properties.
    self.role = role
    self.content = content
    self.images = images
    self.videos = videos
}
1626

1727
public static func system(
1828
_ content: String, images: [UserInput.Image] = [], videos: [UserInput.Video] = []
@@ -40,13 +50,25 @@ public enum Chat {
4050
}
4151
}
4252

53+
/// Protocol for something that can convert structured
54+
/// ``Chat.Message`` into model specific ``Message``
55+
/// (raw dictionary) format.
56+
///
57+
/// Typically this is owned and used by a ``UserInputProcessor``:
58+
///
59+
/// ```swift
60+
/// public func prepare(input: UserInput) async throws -> LMInput {
61+
/// let messages = Qwen2VLMessageGenerator().generate(from: input)
62+
/// ...
63+
/// ```
4364
public protocol MessageGenerator {
44-
/// Returns [String: Any] aka Message
65+
66+
/// Returns `[String: Any]` aka ``Message``.
4567
func generate(message: Chat.Message) -> Message
4668
}
4769

4870
extension MessageGenerator {
49-
/// Returns array of [String: Any] aka Message
71+
/// Returns array of `[String: Any]` aka ``Message``
5072
public func generate(messages: [Chat.Message]) -> [Message] {
5173
var rawMessages: [Message] = []
5274

@@ -71,6 +93,15 @@ extension MessageGenerator {
7193
}
7294
}
7395

96+
/// Default implementation of ``MessageGenerator`` that produces a
97+
/// `role` and `content`.
98+
///
99+
/// ```swift
100+
/// [
101+
/// "role": message.role.rawValue,
102+
/// "content": message.content,
103+
/// ]
104+
/// ```
74105
public struct DefaultMessageGenerator: MessageGenerator {
75106
public init() {}
76107

Libraries/MLXLMCommon/UserInput.swift

Lines changed: 164 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,19 @@ public typealias Message = [String: Any]
1313
/// A ``UserInputProcessor`` can convert this to ``LMInput``.
1414
/// See also ``ModelContext``.
1515
public struct UserInput: Sendable {
16+
1617
/// Representation of a prompt or series of messages (conversation).
18+
///
19+
/// This may be a single string with a user prompt or a series of back
20+
/// and forth responses representing a conversation.
1721
public enum Prompt: Sendable, CustomStringConvertible {
22+
/// a single string
1823
case text(String)
24+
25+
/// model specific array of dictionaries
1926
case messages([Message])
27+
28+
/// model agnostic structured chat (series of messages)
2029
case chat([Chat.Message])
2130

2231
public var description: String {
@@ -31,6 +40,7 @@ public struct UserInput: Sendable {
3140
}
3241
}
3342

43+
/// Representation of a video resource.
3444
public enum Video: Sendable {
3545
case avAsset(AVAsset)
3646
case url(URL)
@@ -45,7 +55,7 @@ public struct UserInput: Sendable {
4555
}
4656
}
4757

48-
/// Representation of a single image.
58+
/// Representation of an image resource.
4959
public enum Image: Sendable {
5060
case ciImage(CIImage)
5161
case url(URL)
@@ -118,60 +128,196 @@ public struct UserInput: Sendable {
118128
}
119129
}
120130

131+
/// The prompt to evaluate.
121132
public var prompt: Prompt
122-
public var images = [Image]()
123-
public var videos = [Video]()
133+
134+
/// Images that accompany this `UserInput`.
///
/// For a ``Prompt-swift.enum/chat(_:)`` prompt the value is gathered from the
/// `images` of every chat message, in message order. For
/// ``Prompt-swift.enum/text(_:)`` and ``Prompt-swift.enum/messages(_:)``
/// prompts it is the array stored on the ``UserInput`` itself.
///
/// Assigning to this property replaces the stored array for text and raw
/// message prompts. For a chat prompt the assignment is ignored, since the
/// getter reads the images from the chat messages instead.
public var images: [Image] {
    get {
        switch prompt {
        case .text, .messages:
            return _images
        case .chat(let chatMessages):
            // Concatenate the per-message image arrays.
            return chatMessages.flatMap(\.images)
        }
    }
    set {
        switch prompt {
        case .chat:
            // Nothing to store: chat prompts never read `_images`.
            break
        case .text, .messages:
            _images = newValue
        }
    }
}
158+
159+
private var _images = [Image]()
160+
161+
/// Videos that accompany this `UserInput`.
///
/// For a ``Prompt-swift.enum/chat(_:)`` prompt the value is gathered from the
/// `videos` of every chat message, in message order. For
/// ``Prompt-swift.enum/text(_:)`` and ``Prompt-swift.enum/messages(_:)``
/// prompts it is the array stored on the ``UserInput`` itself.
///
/// Assigning to this property replaces the stored array for text and raw
/// message prompts. For a chat prompt the assignment is ignored, since the
/// getter reads the videos from the chat messages instead.
public var videos: [Video] {
    get {
        switch prompt {
        case .text, .messages:
            return _videos
        case .chat(let chatMessages):
            // Concatenate the per-message video arrays.
            return chatMessages.flatMap(\.videos)
        }
    }
    set {
        switch prompt {
        case .chat:
            // Nothing to store: chat prompts never read `_videos`.
            break
        case .text, .messages:
            _videos = newValue
        }
    }
}
185+
186+
private var _videos = [Video]()
187+
124188
public var tools: [ToolSpec]?
189+
125190
/// Additional values provided for the chat template rendering context
126191
public var additionalContext: [String: Any]?
127192
public var processing: Processing = .init()
128193

194+
/// Initialize the `UserInput` with a single text prompt.
195+
///
196+
/// - Parameters:
197+
/// - prompt: text prompt
198+
/// - images: optional images
199+
/// - videos: optional videos
200+
/// - tools: optional tool specifications
201+
/// - additionalContext: optional context (model specific)
202+
/// ### See Also
203+
/// - ``Prompt-swift.enum/text(_:)``
204+
/// - ``init(chat:tools:additionalContext:)``
129205
public init(
    prompt: String, images: [Image] = [Image](), videos: [Video] = [Video](),
    tools: [ToolSpec]? = nil,
    additionalContext: [String: Any]? = nil
) {
    // A plain text prompt is stored as a one-message chat: the text and its
    // media become a single user message, so `images` / `videos` are read
    // back from that message rather than from the stored arrays.
    let userMessage: Chat.Message = .user(prompt, images: images, videos: videos)
    self.prompt = .chat([userMessage])

    self.tools = tools
    self.additionalContext = additionalContext
}
140216

217+
/// Initialize the `UserInput` with model specific message structures.
218+
///
219+
/// For example, the Qwen2VL model wants input in this format:
220+
///
221+
/// ```
222+
/// [
223+
/// [
224+
/// "role": "user",
225+
/// "content": [
226+
/// [
227+
/// "type": "text",
228+
/// "text": "What is this?"
229+
/// ],
230+
/// [
231+
/// "type": "image",
232+
/// ],
233+
/// ]
234+
/// ]
235+
/// ]
236+
/// ```
237+
///
238+
/// Typically the ``init(chat:tools:additionalContext:)`` should be used instead
239+
/// along with a model specific ``MessageGenerator`` (supplied by the ``UserInputProcessor``).
240+
///
241+
/// - Parameters:
242+
/// - messages: array of dictionaries representing the prompt in a model specific format
243+
/// - images: optional images
244+
/// - videos: optional videos
245+
/// - tools: optional tool specifications
246+
/// - additionalContext: optional context (model specific)
247+
/// ### See Also
248+
/// - ``Prompt-swift.enum/messages(_:)``
249+
/// - ``init(chat:tools:additionalContext:)``
141250
public init(
    messages: [Message], images: [Image] = [Image](), videos: [Video] = [Video](),
    tools: [ToolSpec]? = nil,
    additionalContext: [String: Any]? = nil
) {
    self.prompt = .messages(messages)

    // For a `.messages` prompt the `images` / `videos` accessors read the
    // stored backing arrays, so the arguments have to be saved here.
    // Without these assignments the media passed by the caller would be
    // silently dropped.
    _images = images
    _videos = videos

    self.tools = tools
    self.additionalContext = additionalContext
}
152259

260+
/// Initialize the `UserInput` with a model agnostic structured context.
261+
///
262+
/// For example:
263+
///
264+
/// ```
265+
/// let chat: [Chat.Message] = [
266+
/// .system("You are a helpful photographic assistant."),
267+
/// .user("Please describe the photo.", images: [image1]),
268+
/// ]
269+
/// let userInput = UserInput(chat: chat)
270+
/// ```
271+
///
272+
/// A model specific ``MessageGenerator`` (supplied by the ``UserInputProcessor``)
273+
/// is used to convert this into a model specific format.
274+
///
275+
/// - Parameters:
276+
/// - chat: structured content
277+
/// - tools: optional tool specifications
278+
/// - additionalContext: optional context (model specific)
279+
/// ### See Also
280+
/// - ``Prompt-swift.enum/chat(_:)``
281+
/// - ``init(chat:tools:additionalContext:)``
153282
public init(
    chat: [Chat.Message],
    tools: [ToolSpec]? = nil,
    additionalContext: [String: Any]? = nil
) {
    // No media is stored here: for a `.chat` prompt the `images` and
    // `videos` accessors collect them from the chat messages on demand.
    self.prompt = Prompt.chat(chat)
    self.additionalContext = additionalContext
    self.tools = tools
}
168291

292+
/// Initialize the `UserInput` with a preconfigured ``Prompt-swift.enum``.
293+
///
294+
/// ``init(chat:tools:additionalContext:)`` is the preferred mechanism.
295+
///
296+
/// - Parameters:
297+
/// - prompt: the prompt
298+
/// - images: optional images
299+
/// - videos: optional videos
300+
/// - tools: optional tool specifications
301+
/// - processing: optional processing to be applied to media
302+
/// - additionalContext: optional context (model specific)
303+
/// ### See Also
304+
/// - ``Prompt-swift.enum/text(_:)``
305+
/// - ``init(chat:tools:additionalContext:)``
169306
public init(
170-
prompt: Prompt, images: [Image] = [Image](), processing: Processing = .init(),
307+
prompt: Prompt,
308+
images: [Image] = [Image](),
309+
videos: [Video] = [Video](),
310+
processing: Processing = .init(),
171311
tools: [ToolSpec]? = nil, additionalContext: [String: Any]? = nil
172312
) {
173313
self.prompt = prompt
174-
self.images = images
314+
switch prompt {
315+
case .text, .messages:
316+
_images = images
317+
_videos = videos
318+
case .chat:
319+
break
320+
}
175321
self.processing = processing
176322
self.tools = tools
177323
self.additionalContext = additionalContext

0 commit comments

Comments
 (0)