Commit ea86d94

Add argument for extra EOS token to llm-tool (#217)
* Improve llm-tool
  - make ModelConfiguration and ModelContext properties mutable
  - update context/configuration with extra EOS tokens

Co-authored-by: David Koski <[email protected]>
Parent: fc9dfc1

6 files changed: +44 −24 lines

Libraries/MLXLMCommon/ModelConfiguration.swift

Lines changed: 2 additions & 2 deletions

```diff
@@ -31,10 +31,10 @@ public struct ModelConfiguration: Sendable {
     public let overrideTokenizer: String?
 
     /// A reasonable default prompt for the model
-    public let defaultPrompt: String
+    public var defaultPrompt: String
 
     /// Additional tokens to use for end of string
-    public let extraEOSTokens: Set<String>
+    public var extraEOSTokens: Set<String>
 
     public init(
         id: String, tokenizerId: String? = nil, overrideTokenizer: String? = nil,
```
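Making these two properties `var` is what lets downstream code adjust a configuration after it is created, rather than rebuilding it. A minimal sketch of the new flexibility (the model id and `<|end|>` token simply mirror the scheme arguments later in this commit):

```swift
import MLXLMCommon

// With extraEOSTokens now a var, a caller can widen the stop set
// after the configuration exists.
var configuration = ModelConfiguration(id: "microsoft/Phi-4-mini-instruct")
configuration.extraEOSTokens.insert("<|end|>")
configuration.defaultPrompt = "Why is the sky blue?"
```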

Libraries/MLXLMCommon/ModelContainer.swift

Lines changed: 7 additions & 3 deletions

```diff
@@ -32,12 +32,11 @@ import Tokenizers
 /// }
 /// ```
 public actor ModelContainer {
-    let context: ModelContext
-    nonisolated public let configuration: ModelConfiguration
+    var context: ModelContext
+    public var configuration: ModelConfiguration { context.configuration }
 
     public init(context: ModelContext) {
         self.context = context
-        self.configuration = context.configuration
     }
 
     /// Perform an action on the model and/or tokenizer. Callers _must_ eval any `MLXArray` before returning as
@@ -75,4 +74,9 @@ public actor ModelContainer {
         try await action(context, values)
     }
 
+    /// Update the owned `ModelContext`.
+    /// - Parameter action: update action
+    public func update(_ action: @Sendable (inout ModelContext) -> Void) {
+        action(&context)
+    }
 }
```
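`configuration` changes from a stored `nonisolated` property to a computed one backed by the actor-isolated `context`, so reads from outside the actor now require `await`, and the new `update(_:)` method becomes the sanctioned way to mutate the context. A small usage sketch, assuming a `modelContainer` has already been loaded:

```swift
// Mutate the actor-owned ModelContext in place; the closure is
// @Sendable because it runs on the ModelContainer actor.
await modelContainer.update { context in
    context.configuration.extraEOSTokens.insert("<|end|>")
}

// Reading configuration is now actor-isolated, hence the await.
let configuration = await modelContainer.configuration
```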

Libraries/MLXLMCommon/ModelFactory.swift

Lines changed: 4 additions & 4 deletions

```diff
@@ -22,10 +22,10 @@ public enum ModelFactoryError: Error {
 /// See also ``ModelFactory/loadContainer(hub:configuration:progressHandler:)`` and
 /// ``ModelContainer``.
 public struct ModelContext {
-    public let configuration: ModelConfiguration
-    public let model: any LanguageModel
-    public let processor: any UserInputProcessor
-    public let tokenizer: Tokenizer
+    public var configuration: ModelConfiguration
+    public var model: any LanguageModel
+    public var processor: any UserInputProcessor
+    public var tokenizer: Tokenizer
 
     public init(
         configuration: ModelConfiguration, model: any LanguageModel,
```

Tools/llm-tool/LLMTool.swift

Lines changed: 21 additions & 6 deletions

```diff
@@ -31,6 +31,8 @@ struct ModelArguments: ParsableArguments, Sendable {
 
         let modelName = self.model ?? defaultModel
 
+        print("Loading \(modelName)...")
+
         if modelName.hasPrefix("/") {
             // path
             modelConfiguration = ModelConfiguration(directory: URL(filePath: modelName))
@@ -67,6 +69,9 @@ struct GenerateArguments: ParsableArguments, Sendable {
     @Option(name: .long, help: "The number of tokens to consider for repetition penalty")
     var repetitionContextSize: Int = 20
 
+    @Option(name: .long, help: "Additional end-of-sequence token to stop generation")
+    var extraEosToken: String?
+
     @Option(name: .long, help: "The PRNG seed")
     var seed: UInt64 = 0
 
@@ -89,17 +94,22 @@ struct GenerateArguments: ParsableArguments, Sendable {
         }
     }
 
+    func prepare(
+        _ context: inout ModelContext
+    ) {
+        if let extraEosToken {
+            context.configuration.extraEOSTokens.insert(extraEosToken)
+        }
+    }
+
     func generate(
         input: LMInput, context: ModelContext
-    )
-        throws -> GenerateResult
-    {
+    ) throws -> GenerateResult {
         var detokenizer = NaiveStreamingDetokenizer(tokenizer: context.tokenizer)
 
         return try MLXLMCommon.generate(
             input: input, parameters: generateParameters, context: context
         ) { tokens in
-
             if let last = tokens.last {
                 detokenizer.append(token: last)
             }
@@ -276,11 +286,16 @@ struct EvaluateCommand: AsyncParsableCommand {
             try await args.load(defaultModel: defaultModel.name, modelFactory: modelFactory)
         }
 
+        // update the context/configuration with any command line parameters
+        await modelContainer.update { [generate] context in
+            generate.prepare(&context)
+        }
+
         // Get the resolved configuration (this has the default prompt)
-        let modelConfiguration = modelContainer.configuration
+        let modelConfiguration = await modelContainer.configuration
 
         if !generate.quiet {
-            print("Model loaded -> \(modelConfiguration.id)")
+            print("Loaded \(modelConfiguration.name)")
         }
 
         let userInput = self.userInput(modelConfiguration: modelConfiguration)
```
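Taken together: `GenerateArguments` gains an `--extra-eos-token` option, `prepare(_:)` folds it into the configuration, and `EvaluateCommand` applies it through `ModelContainer.update` before generation begins. Assuming a built `llm-tool`, an invocation along the lines of `llm-tool eval --model microsoft/Phi-4-mini-instruct --prompt 'Why is the sky blue?' --extra-eos-token '<|end|>'` (inferred from the scheme arguments below) should then stop generation when the model emits `<|end|>`.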

Tools/llm-tool/LoraCommands.swift

Lines changed: 5 additions & 4 deletions

```diff
@@ -48,7 +48,7 @@ struct LoRAModelArguments: ParsableArguments, Sendable {
         // convert some of the Linear layers to LoRALinear
         await modelContainer.perform { context in
             guard let lora = context.model as? LoRAModel else {
-                fatalError("Model \(modelContainer.configuration.name) is not a LoRAModel")
+                fatalError("Model \(context.configuration.name) is not a LoRAModel")
             }
             LoRATrain.convert(model: context.model, layers: lora.loraLinearLayers(loraLayers))
         }
@@ -197,7 +197,7 @@ struct LoRAFuseCommand: AsyncParsableCommand {
         // fuse them back into Linear/QuantizedLinear
         await modelContainer.perform { [args, deQuantize] context in
             guard let lora = context.model as? LoRAModel else {
-                fatalError("Model \(modelContainer.configuration.name) is not a LoRAModel")
+                fatalError("Model \(context.configuration.name) is not a LoRAModel")
             }
 
             LoRATrain.fuse(
@@ -207,7 +207,7 @@ struct LoRAFuseCommand: AsyncParsableCommand {
 
         // make the new directory and copy files from source model
         try FileManager.default.createDirectory(at: outputURL, withIntermediateDirectories: true)
-        let inputURL = modelContainer.configuration.modelDirectory()
+        let inputURL = await modelContainer.configuration.modelDirectory()
         let enumerator = FileManager.default.enumerator(
             at: inputURL, includingPropertiesForKeys: nil)!
         for case let url as URL in enumerator {
@@ -296,7 +296,8 @@ struct LoRAEvalCommand: AsyncParsableCommand {
 
         memory.start()
 
-        let prompt = generate.prompt ?? modelContainer.configuration.defaultPrompt
+        let defaultPrompt = await modelContainer.configuration.defaultPrompt
+        let prompt = generate.prompt ?? defaultPrompt
 
         if !generate.quiet {
             print("Starting generation ...")
```

mlx-swift-examples.xcodeproj/xcshareddata/xcschemes/llm-tool.xcscheme

Lines changed: 5 additions & 5 deletions

```diff
@@ -56,12 +56,12 @@
          isEnabled = "NO">
       </CommandLineArgument>
       <CommandLineArgument
-         argument = "--prompt &apos;Describe the image in English.&apos; --image https://www.gstatic.com/webp/gallery/1.webp"
+         argument = "--model microsoft/Phi-4-mini-instruct --prompt &quot;Why is the sky blue?&quot; --extra-eos-token &quot;&lt;|end|&gt;&quot;"
          isEnabled = "NO">
       </CommandLineArgument>
       <CommandLineArgument
-         argument = "--model mlx-community/Qwen2-VL-2B-Instruct-4bit"
-         isEnabled = "NO">
+         argument = "--model mlx-community/Qwen2-VL-2B-Instruct-4bit --prompt &apos;Describe the image in English.&apos; --image https://www.gstatic.com/webp/gallery/1.webp"
+         isEnabled = "YES">
       </CommandLineArgument>
       <CommandLineArgument
          argument = "--repetition-penalty 1.2"
@@ -89,15 +89,15 @@
       </CommandLineArgument>
       <CommandLineArgument
          argument = "--prompt &apos;Why is the sky blue?&apos;"
-         isEnabled = "YES">
+         isEnabled = "NO">
       </CommandLineArgument>
       <CommandLineArgument
          argument = "--model mlx-community/Mistral-7B-v0.1-hf-4bit-mlx"
          isEnabled = "NO">
       </CommandLineArgument>
       <CommandLineArgument
          argument = "--model mlx-community/Llama-3.2-1B-Instruct-4bit"
-         isEnabled = "YES">
+         isEnabled = "NO">
       </CommandLineArgument>
       <CommandLineArgument
          argument = "--model mlx-community/phi-2-hf-4bit-mlx"
```
