Skip to content

Commit 2b78ff9

Browse files
Implement Structured Chat Messages (#257)
* Initial implementation for structured chat messages * Do not require images and videos in init(messages:) * Refactor message generation to use unified generate(from:) method * fix #270 * implement tests Co-authored-by: David Koski <[email protected]>
1 parent b53b34a commit 2b78ff9

File tree

17 files changed

+881
-56
lines changed

17 files changed

+881
-56
lines changed

.circleci/config.yml

+8
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ jobs:
2828
brew install swift-format
2929
pre-commit run --all
3030
if ! git diff --quiet; then echo 'Style checks failed, please install pre-commit and run pre-commit run --all and push the change'; exit 1; fi
31+
- run:
32+
name: Run Tests (Xcode, macOS)
33+
command: |
34+
xcodebuild -version
35+
xcrun --show-sdk-build-version
36+
swift --version
37+
find . -name Package.resolved -exec rm {} \;
38+
xcodebuild test -scheme mlx-libraries-Package -destination 'platform=OS X'
3139
- run:
3240
name: Build Examples
3341
command: |

Applications/VLMEval/ContentView.swift

+11-2
Original file line numberDiff line numberDiff line change
@@ -412,13 +412,22 @@ class VLMEvaluator {
412412
if !images.isEmpty || !videos.isEmpty {
413413
[
414414
[
415-
"role": "user",
415+
"role": "system",
416416
"content": [
417417
[
418418
"type": "text",
419419
"text": videoURL != nil
420420
? videoSystemPrompt : imageSystemPrompt,
421421
]
422+
],
423+
],
424+
[
425+
"role": "user",
426+
"content": [
427+
[
428+
"type": "text",
429+
"text": prompt,
430+
]
422431
]
423432
// Messages format for Qwen 2 VL, Qwen 2.5 VL. May need to be adapted for other models.
424433
+ images.map { _ in
@@ -427,7 +436,7 @@ class VLMEvaluator {
427436
+ videos.map { _ in
428437
["type": "video"]
429438
},
430-
]
439+
],
431440
]
432441
} else {
433442
[

Libraries/MLXLLM/LLMModelFactory.swift

+13-5
Original file line numberDiff line numberDiff line change
@@ -199,23 +199,29 @@ private struct LLMUserInputProcessor: UserInputProcessor {
199199

200200
let tokenizer: Tokenizer
201201
let configuration: ModelConfiguration
202+
let messageGenerator: MessageGenerator
202203

203-
internal init(tokenizer: any Tokenizer, configuration: ModelConfiguration) {
204+
internal init(
205+
tokenizer: any Tokenizer, configuration: ModelConfiguration,
206+
messageGenerator: MessageGenerator
207+
) {
204208
self.tokenizer = tokenizer
205209
self.configuration = configuration
210+
self.messageGenerator = messageGenerator
206211
}
207212

208213
func prepare(input: UserInput) throws -> LMInput {
214+
let messages = messageGenerator.generate(from: input)
215+
209216
do {
210-
let messages = input.prompt.asMessages()
211217
let promptTokens = try tokenizer.applyChatTemplate(
212218
messages: messages, tools: input.tools, additionalContext: input.additionalContext)
213219
return LMInput(tokens: MLXArray(promptTokens))
214220
} catch {
215221
// #150 -- it might be a TokenizerError.chatTemplate("No chat template was specified")
216222
// but that is not public so just fall back to text
217-
let prompt = input.prompt
218-
.asMessages()
223+
let prompt =
224+
messages
219225
.compactMap { $0["content"] as? String }
220226
.joined(separator: ". ")
221227
let promptTokens = tokenizer.encode(text: prompt)
@@ -273,7 +279,9 @@ public class LLMModelFactory: ModelFactory {
273279

274280
return .init(
275281
configuration: configuration, model: model,
276-
processor: LLMUserInputProcessor(tokenizer: tokenizer, configuration: configuration),
282+
processor: LLMUserInputProcessor(
283+
tokenizer: tokenizer, configuration: configuration,
284+
messageGenerator: DefaultMessageGenerator()),
277285
tokenizer: tokenizer)
278286
}
279287

Libraries/MLXLMCommon/Chat.swift

+114
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
// Copyright © 2025 Apple Inc.
2+
3+
public enum Chat {
    /// A single structured chat message: a role plus text content and
    /// any attached image/video media.
    public struct Message {

        /// Role a message sender can take in a conversation.
        public enum Role: String {
            case user
            case assistant
            case system
        }

        /// The role of the message sender.
        public var role: Role

        /// The content of the message.
        public var content: String

        /// Array of image data associated with the message.
        public var images: [UserInput.Image]

        /// Array of video data associated with the message.
        public var videos: [UserInput.Video]

        public init(
            role: Role, content: String, images: [UserInput.Image] = [],
            videos: [UserInput.Video] = []
        ) {
            self.role = role
            self.content = content
            self.images = images
            self.videos = videos
        }

        /// Convenience factory for a `.user` message.
        public static func user(
            _ content: String, images: [UserInput.Image] = [], videos: [UserInput.Video] = []
        ) -> Self {
            Self(role: .user, content: content, images: images, videos: videos)
        }

        /// Convenience factory for an `.assistant` message.
        public static func assistant(
            _ content: String, images: [UserInput.Image] = [], videos: [UserInput.Video] = []
        ) -> Self {
            Self(role: .assistant, content: content, images: images, videos: videos)
        }

        /// Convenience factory for a `.system` message.
        public static func system(
            _ content: String, images: [UserInput.Image] = [], videos: [UserInput.Video] = []
        ) -> Self {
            Self(role: .system, content: content, images: images, videos: videos)
        }
    }
}
52+
53+
/// Protocol for something that can convert structured
/// ``Chat.Message`` values into the model specific ``Message``
/// (raw `[String: Any]` dictionary) format.
///
/// Typically this is owned and used by a ``UserInputProcessor``:
///
/// ```swift
/// public func prepare(input: UserInput) async throws -> LMInput {
///     let messages = Qwen2VLMessageGenerator().generate(from: input)
///     ...
/// ```
public protocol MessageGenerator {

    /// Converts a single structured message into `[String: Any]` aka ``Message``.
    ///
    /// Implementations decide which fields of ``Chat.Message`` (role, content,
    /// media) appear in the raw dictionary and under which keys.
    func generate(message: Chat.Message) -> Message
}
69+
70+
extension MessageGenerator {
    /// Converts an array of structured messages into raw `[String: Any]`
    /// (``Message``) dictionaries, preserving input order.
    public func generate(messages: [Chat.Message]) -> [Message] {
        // map replaces the manual var/for/append loop -- same order, same results
        messages.map { generate(message: $0) }
    }

    /// Generates raw messages from the prompt carried by `input`.
    ///
    /// - plain `.text` becomes a single `.user` message
    /// - already-raw `.messages` are passed through unchanged
    /// - structured `.chat` messages are converted one by one
    public func generate(from input: UserInput) -> [Message] {
        switch input.prompt {
        case .text(let text):
            generate(messages: [.user(text)])
        case .messages(let messages):
            messages
        case .chat(let messages):
            generate(messages: messages)
        }
    }
}
95+
96+
/// Default implementation of ``MessageGenerator`` that produces a
/// `role` and `content`.
///
/// ```swift
/// [
///     "role": message.role.rawValue,
///     "content": message.content,
/// ]
/// ```
public struct DefaultMessageGenerator: MessageGenerator {
    public init() {}

    public func generate(message: Chat.Message) -> Message {
        // Media (images/videos) is intentionally ignored here; model-specific
        // generators add those fields as needed.
        let raw: Message = [
            "role": message.role.rawValue,
            "content": message.content,
        ]
        return raw
    }
}

0 commit comments

Comments
 (0)