Commit 194bb10

Add text-only model
1 parent 283ddea

3 files changed (+339, −1 lines)

Applications/LLMEval/ContentView.swift (+1, −1)

@@ -165,7 +165,7 @@ class LLMEvaluator {
 
     /// This controls which model loads. `qwen2_5_1_5b` is one of the smaller ones, so this will fit on
     /// more devices.
-    let modelConfiguration = ModelRegistry.qwen2_5_1_5b
+    let modelConfiguration = LLMRegistry.gemma3_1B_4bit
 
     /// parameters controlling the output
     let generateParameters = GenerateParameters(temperature: 0.6)
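
For reference, the configuration the app now loads by default is the `gemma3_1B_4bit` registry entry added in LLMModelFactory.swift below. A minimal loading sketch, assuming the factory's async `loadContainer(configuration:)` entry point (that call is an assumption based on how the example apps load models, not part of this diff):

import MLXLLM
import MLXLMCommon

// Assumed API: LLMModelFactory.shared.loadContainer(configuration:) fetches the
// checkpoint and returns a ModelContainer ready for generation.
func loadDefaultModel() async throws -> ModelContainer {
    try await LLMModelFactory.shared.loadContainer(
        configuration: LLMRegistry.gemma3_1B_4bit)
}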

Libraries/MLXLLM/LLMModelFactory.swift (+7)

@@ -40,6 +40,7 @@ public class LLMTypeRegistry: ModelTypeRegistry, @unchecked Sendable {
             "cohere": create(CohereConfiguration.self, CohereModel.init),
             "openelm": create(OpenElmConfiguration.self, OpenELMModel.init),
             "internlm2": create(InternLM2Configuration.self, InternLM2Model.init),
+            "gemma3_text": create(Gemma3TextConfiguration.self, Gemma3TextModel.init),
         ]
     }

@@ -166,6 +167,11 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {
         defaultPrompt: "What is the difference between a fruit and a vegetable?"
     )
 
+    static public let gemma3_1B_4bit = ModelConfiguration(
+        id: "mlx-community/gemma-3-1b-it-4bit",
+        defaultPrompt: "What is the difference between a fruit and a vegetable?"
+    )
+
     private static func all() -> [ModelConfiguration] {
         [
             codeLlama13b4bit,
@@ -187,6 +193,7 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {
             qwen2_5_7b,
             qwen2_5_1_5b,
             smolLM_135M_4bit,
+            gemma3_1B_4bit,
         ]
     }
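
The `gemma3_text` creator registered above is what lets the factory match a checkpoint whose config.json declares `"model_type": "gemma3_text"` to the new model class. A minimal decoding sketch, using only the keys that Gemma3TextConfiguration requires; the numeric values are placeholders for illustration, not values read from the published gemma-3-1b-it-4bit checkpoint, and every omitted key falls back to the defaults hard-coded in Gemma3Text.swift below:

import Foundation
import MLXLLM

// Placeholder config: only model_type, hidden_size, num_hidden_layers and
// intermediate_size are required; attentionHeads (4), headDim (256), kvHeads (1),
// slidingWindowPattern (6) and the rest come from the decoder's defaults.
func decodeExampleConfiguration() throws -> Gemma3TextConfiguration {
    let json = """
        {
            "model_type": "gemma3_text",
            "hidden_size": 1152,
            "num_hidden_layers": 26,
            "intermediate_size": 6912
        }
        """.data(using: .utf8)!
    return try JSONDecoder().decode(Gemma3TextConfiguration.self, from: json)
}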

Gemma3Text.swift (new file, +331)

//
//  Gemma3Text.swift
//  mlx-swift-examples
//
//  Created by Anthony DePasquale on 14.03.2025.
//

import Foundation
import MLX
import MLXFast
import MLXLMCommon
import MLXNN

public struct Gemma3TextConfiguration: Codable {
    let modelType: String
    let hiddenSize: Int
    let hiddenLayers: Int
    let intermediateSize: Int
    let attentionHeads: Int
    let headDim: Int
    let rmsNormEps: Float
    let vocabularySize: Int
    let kvHeads: Int
    let ropeGlobalBaseFreq: Float
    let ropeLocalBaseFreq: Float
    let ropeTraditional: Bool
    let queryPreAttnScalar: Float
    let slidingWindow: Int
    let slidingWindowPattern: Int

    enum CodingKeys: String, CodingKey {
        case modelType = "model_type"
        case hiddenSize = "hidden_size"
        case hiddenLayers = "num_hidden_layers"
        case intermediateSize = "intermediate_size"
        case attentionHeads = "num_attention_heads"
        case headDim = "head_dim"
        case rmsNormEps = "rms_norm_eps"
        case vocabularySize = "vocab_size"
        case kvHeads = "num_key_value_heads"
        case ropeGlobalBaseFreq = "rope_global_base_freq"
        case ropeLocalBaseFreq = "rope_local_base_freq"
        case ropeTraditional = "rope_traditional"
        case queryPreAttnScalar = "query_pre_attn_scalar"
        case slidingWindow = "sliding_window"
        case slidingWindowPattern = "sliding_window_pattern"
    }

    public init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)

        modelType = try container.decode(String.self, forKey: .modelType)
        hiddenSize = try container.decode(Int.self, forKey: .hiddenSize)
        hiddenLayers = try container.decode(Int.self, forKey: .hiddenLayers)
        intermediateSize = try container.decode(Int.self, forKey: .intermediateSize)

        // Default values with optional decoding
        attentionHeads = try container.decodeIfPresent(Int.self, forKey: .attentionHeads) ?? 4
        headDim = try container.decodeIfPresent(Int.self, forKey: .headDim) ?? 256
        rmsNormEps = try container.decodeIfPresent(Float.self, forKey: .rmsNormEps) ?? 1.0e-6
        vocabularySize = try container.decodeIfPresent(Int.self, forKey: .vocabularySize) ?? 262144
        kvHeads = try container.decodeIfPresent(Int.self, forKey: .kvHeads) ?? 1
        ropeGlobalBaseFreq =
            try container.decodeIfPresent(Float.self, forKey: .ropeGlobalBaseFreq) ?? 1_000_000.0
        ropeLocalBaseFreq =
            try container.decodeIfPresent(Float.self, forKey: .ropeLocalBaseFreq) ?? 10_000.0
        ropeTraditional =
            try container.decodeIfPresent(Bool.self, forKey: .ropeTraditional) ?? false
        queryPreAttnScalar =
            try container.decodeIfPresent(Float.self, forKey: .queryPreAttnScalar) ?? 256
        slidingWindow = try container.decodeIfPresent(Int.self, forKey: .slidingWindow) ?? 512
        slidingWindowPattern =
            try container.decodeIfPresent(Int.self, forKey: .slidingWindowPattern) ?? 6
    }
}

private class Attention: Module {
    let nHeads: Int
    let nKVHeads: Int
    let repeats: Int
    let headDim: Int
    let layerIdx: Int
    let scale: Float
    let isSliding: Bool

    @ModuleInfo(key: "q_proj") var queryProj: Linear
    @ModuleInfo(key: "k_proj") var keyProj: Linear
    @ModuleInfo(key: "v_proj") var valueProj: Linear
    @ModuleInfo(key: "o_proj") var outputProj: Linear

    @ModuleInfo(key: "q_norm") var queryNorm: GemmaUtils.RMSNorm
    @ModuleInfo(key: "k_norm") var keyNorm: GemmaUtils.RMSNorm

    @ModuleInfo var rope: RoPE

    init(_ config: Gemma3TextConfiguration, layerIdx: Int) {
        let dim = config.hiddenSize
        self.nHeads = config.attentionHeads
        self.nKVHeads = config.kvHeads
        self.repeats = nHeads / nKVHeads
        self.headDim = config.headDim
        self.layerIdx = layerIdx

        self.scale = pow(config.queryPreAttnScalar, -0.5)

        self._queryProj.wrappedValue = Linear(dim, nHeads * headDim, bias: false)
        self._keyProj.wrappedValue = Linear(dim, nKVHeads * headDim, bias: false)
        self._valueProj.wrappedValue = Linear(dim, nKVHeads * headDim, bias: false)
        self._outputProj.wrappedValue = Linear(nHeads * headDim, dim, bias: false)

        self._queryNorm.wrappedValue = GemmaUtils.RMSNorm(
            dimensions: headDim, eps: config.rmsNormEps)
        self._keyNorm.wrappedValue = GemmaUtils.RMSNorm(dimensions: headDim, eps: config.rmsNormEps)

        self.isSliding = (layerIdx + 1) % config.slidingWindowPattern != 0

        let baseFreq = isSliding ? config.ropeLocalBaseFreq : config.ropeGlobalBaseFreq
        self._rope.wrappedValue = RoPE(
            dimensions: headDim,
            traditional: config.ropeTraditional,
            base: baseFreq
        )

        super.init()
    }

    func callAsFunction(
        _ x: MLXArray,
        mask: MLXArray? = nil,
        cache: KVCache? = nil
    ) -> MLXArray {
        let (B, L, _) = (x.dim(0), x.dim(1), x.dim(2))

        var queries = queryProj(x)
        var keys = keyProj(x)
        var values = valueProj(x)

        queries = queries.reshaped(B, L, nHeads, -1).transposed(0, 2, 1, 3)
        keys = keys.reshaped(B, L, nKVHeads, -1).transposed(0, 2, 1, 3)
        values = values.reshaped(B, L, nKVHeads, -1).transposed(0, 2, 1, 3)

        queries = queryNorm(queries)
        keys = keyNorm(keys)

        var localMask = mask

        if let cache = cache {
            queries = rope(queries, offset: cache.offset)
            keys = rope(keys, offset: cache.offset)
            (keys, values) = cache.update(keys: keys, values: values)
        } else {
            queries = rope(queries)
            keys = rope(keys)
        }

        // Sliding window mask adjustment
        if localMask != nil && localMask!.dim(-1) != keys.dim(-2) {
            let keyLen = keys.dim(-2)
            localMask = localMask![0..., 0..., 0..., (localMask!.dim(-1) - keyLen)...]
        }

        let output = MLXFast.scaledDotProductAttention(
            queries: queries,
            keys: keys,
            values: values,
            scale: scale,
            mask: localMask
        )
        .transposed(0, 2, 1, 3)
        .reshaped(B, L, -1)

        return outputProj(output)
    }
}

private class MLP: Module {
    @ModuleInfo(key: "gate_proj") var gateProj: Linear
    @ModuleInfo(key: "down_proj") var downProj: Linear
    @ModuleInfo(key: "up_proj") var upProj: Linear

    init(dimensions: Int, hiddenDimensions: Int) {
        self._gateProj.wrappedValue = Linear(dimensions, hiddenDimensions, bias: false)
        self._downProj.wrappedValue = Linear(hiddenDimensions, dimensions, bias: false)
        self._upProj.wrappedValue = Linear(dimensions, hiddenDimensions, bias: false)
        super.init()
    }

    func callAsFunction(_ x: MLXArray) -> MLXArray {
        return downProj(geluApproximate(gateProj(x)) * upProj(x))
    }
}

private class TransformerBlock: Module {
    @ModuleInfo(key: "self_attn") var selfAttention: Attention
    @ModuleInfo var mlp: MLP
    @ModuleInfo(key: "input_layernorm") var inputLayerNorm: RMSNorm
    @ModuleInfo(key: "post_attention_layernorm") var postAttentionLayerNorm: RMSNorm
    @ModuleInfo(key: "pre_feedforward_layernorm") var preFeedforwardLayerNorm: RMSNorm
    @ModuleInfo(key: "post_feedforward_layernorm") var postFeedforwardLayerNorm: RMSNorm

    let numAttentionHeads: Int
    let hiddenSize: Int

    init(_ config: Gemma3TextConfiguration, layerIdx: Int) {
        self.numAttentionHeads = config.attentionHeads
        self.hiddenSize = config.hiddenSize

        self._selfAttention.wrappedValue = Attention(config, layerIdx: layerIdx)
        self.mlp = MLP(dimensions: config.hiddenSize, hiddenDimensions: config.intermediateSize)

        self._inputLayerNorm.wrappedValue = RMSNorm(
            dimensions: config.hiddenSize, eps: config.rmsNormEps)
        self._postAttentionLayerNorm.wrappedValue = RMSNorm(
            dimensions: config.hiddenSize, eps: config.rmsNormEps)
        self._preFeedforwardLayerNorm.wrappedValue = RMSNorm(
            dimensions: config.hiddenSize, eps: config.rmsNormEps)
        self._postFeedforwardLayerNorm.wrappedValue = RMSNorm(
            dimensions: config.hiddenSize, eps: config.rmsNormEps)

        super.init()
    }

    func callAsFunction(
        _ x: MLXArray,
        mask: MLXArray? = nil
    ) -> MLXArray {
        let r = selfAttention(inputLayerNorm(x), mask: mask, cache: nil)
        let h = x + postAttentionLayerNorm(r)
        let r2 = mlp(preFeedforwardLayerNorm(h))
        let out = h + postFeedforwardLayerNorm(r2)
        return out
    }
}

private class Gemma3Model: Module {
    @ModuleInfo(key: "embed_tokens") var embedTokens: Embedding
    @ModuleInfo var layers: [TransformerBlock]
    @ModuleInfo var norm: RMSNorm

    let config: Gemma3TextConfiguration

    init(_ config: Gemma3TextConfiguration) {
        self.config = config

        self._embedTokens.wrappedValue = Embedding(
            embeddingCount: config.vocabularySize,
            dimensions: config.hiddenSize
        )

        self._layers.wrappedValue = (0 ..< config.hiddenLayers).map { layerIdx in
            TransformerBlock(config, layerIdx: layerIdx)
        }

        self.norm = RMSNorm(dimensions: config.hiddenSize, eps: config.rmsNormEps)

        super.init()
    }

    func callAsFunction(_ inputs: MLXArray, mask: MLXArray? = nil) -> MLXArray {
        var h = embedTokens(inputs)
        h = h * sqrt(Float(config.hiddenSize))

        var fullMask: MLXArray? = nil
        var slidingWindowMask: MLXArray? = nil

        if mask == nil {
            let j = config.slidingWindowPattern
            slidingWindowMask = createAttentionMask(h: h, cache: nil)
        }

        for (i, layer) in layers.enumerated() {
            let isSliding = (i % config.slidingWindowPattern == config.slidingWindowPattern - 1)

            var layerMask = mask
            if mask == nil {
                layerMask = isSliding ? slidingWindowMask : fullMask
            }

            h = layer(h, mask: layerMask)
        }

        return norm(h)
    }
}

public class Gemma3TextModel: Module, LLMModel, KVCacheDimensionProvider {
    @ModuleInfo private var model: Gemma3Model
    @ModuleInfo(key: "lm_head") var lmHead: Linear

    public let config: Gemma3TextConfiguration
    public var vocabularySize: Int { config.vocabularySize }
    public var kvHeads: [Int]

    public init(_ config: Gemma3TextConfiguration) {
        self.config = config
        self.model = Gemma3Model(config)
        self._lmHead.wrappedValue = Linear(config.hiddenSize, config.vocabularySize, bias: false)

        // Set up KV heads array based on sliding window pattern
        var heads: [Int] = []
        for i in 0 ..< config.hiddenLayers {
            heads.append(config.kvHeads)
        }
        self.kvHeads = heads

        super.init()
    }

    public func callAsFunction(_ inputs: MLXArray, cache: [KVCache]?) -> MLXArray {
        let out = model(inputs, mask: nil)
        return lmHead(out)
    }

    public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] {
        var sanitizedWeights = weights

        if sanitizedWeights["lm_head.weight"] == nil {
            sanitizedWeights["lm_head.weight"] = sanitizedWeights["model.embed_tokens.weight"]
        }

        return sanitizedWeights.filter { key, _ in
            !key.contains("self_attn.rotary_emb.inv_freq")
        }
    }
}

extension Gemma3TextModel: LoRAModel {
    public func loraLinearLayers() -> LoRALinearLayers {
        model.layers.map { ($0.selfAttention, ["q_proj", "v_proj"]) }
    }
}
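
One detail worth noting in the new file: Attention.init marks a layer as sliding when `(layerIdx + 1) % slidingWindowPattern != 0`, so with the default pattern of 6 every sixth layer is built with the global RoPE base frequency and all other layers use the local one. A small standalone sketch of that schedule (plain Swift, no MLX dependencies):

// Reproduces the layer schedule from Attention.init:
//   isSliding = (layerIdx + 1) % slidingWindowPattern != 0
let slidingWindowPattern = 6  // default from Gemma3TextConfiguration
for layerIdx in 0 ..< 12 {
    let isSliding = (layerIdx + 1) % slidingWindowPattern != 0
    let kind = isSliding ? "sliding window (rope_local_base_freq)" : "global (rope_global_base_freq)"
    print("layer \(layerIdx): \(kind)")
}
// With 12 layers, indices 5 and 11 get the global base; the rest get the local base.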
