Skip to content

Commit eb7ccf5

Browse files
authored
Merge pull request #1 from huggingface/hub-tokenizers
Download tokenizers data from the Hub
2 parents 209f0c5 + cbde4aa commit eb7ccf5

15 files changed

+371
-67
lines changed

Package.swift

+4-2
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@ let package = Package(
1010
.library(name: "Transformers", targets: ["Tokenizers", "Generation", "Models"]),
1111
],
1212
targets: [
13-
.target(name: "Tokenizers", resources: [.process("Vocabs")]),
13+
.target(name: "Hub"),
14+
.target(name: "Tokenizers", dependencies: ["Hub"]),
1415
.target(name: "TensorUtils"),
1516
.target(name: "Generation", dependencies: ["Tokenizers", "TensorUtils"]),
1617
.target(name: "Models", dependencies: ["Tokenizers", "Generation", "TensorUtils"]),
17-
.testTarget(name: "TokenizersTests", dependencies: ["Tokenizers"], resources: [.process("Resources")]),
18+
.testTarget(name: "TokenizersTests", dependencies: ["Tokenizers"], resources: [.process("Resources"), .process("Vocabs")]),
19+
.testTarget(name: "HubTests", dependencies: ["Hub"]),
1820
]
1921
)

Sources/Generation/Generation.swift

+3-3
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ public typealias InputTokens = [Int]
2222
public typealias GenerationOutput = [Int]
2323

2424
/// A callable (a model, usually), that predicts the next token after a given sequence
25-
public typealias NextTokenModel = (InputTokens) -> any MLShapedArrayProtocol
25+
public typealias NextTokenModel = (InputTokens, GenerationConfig) -> any MLShapedArrayProtocol
2626

2727
public typealias PredictionTokensCallback = (GenerationOutput) -> Void
2828
public typealias PredictionStringCallback = (String) -> Void
@@ -40,7 +40,7 @@ public extension Generation {
4040
// TODO: additional stopping criteria
4141
var outputTokens = tokens
4242
while outputTokens.count < config.maxLength {
43-
let logits = model(outputTokens)
43+
let logits = model(outputTokens, config)
4444
let (nextToken, _) = Math.argmax(logits)
4545
if nextToken == config.eosTokenId { break }
4646
outputTokens.append(nextToken)
@@ -55,7 +55,7 @@ public extension Generation {
5555
// TODO: additional stopping criteria
5656
var outputTokens = tokens
5757
while outputTokens.count < config.maxLength {
58-
let outputs = model(outputTokens)
58+
let outputs = model(outputTokens, config)
5959

6060
/// `floats` can be much faster than `scalars` for a vector with stride 1, as it uses `memcpy` in that case
6161
var logits = (outputs as? MLShapedArraySlice<Float>)?.floats ?? outputs.scalars as! [Float]

Sources/Hub/Hub.swift

+97
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
//
2+
// Hub.swift
3+
//
4+
//
5+
// Created by Pedro Cuenca on 18/5/23.
6+
//
7+
8+
import Foundation
9+
10+
/// Namespace for Hugging Face Hub download helpers.
public struct Hub {}

public extension Hub {
    /// Errors surfaced by the Hub download helpers.
    enum HubClientError: Error {
        case download
        case parse
    }

    /// Downloads the contents of `url` and returns the raw bytes.
    static func download(url: URL) async throws -> Data {
        let (data, _) = try await URLSession.shared.data(from: url)
        return data
    }

    /// String-based convenience overload.
    /// - Throws: `HubClientError.download` when `url` is not a valid URL.
    static func download(url: String) async throws -> Data {
        guard let realUrl = URL(string: url) else { throw HubClientError.download }
        // Delegate to the URL overload instead of duplicating the request logic.
        return try await download(url: realUrl)
    }

    /// Downloads `filename` from the given repo, and JSON-decodes it.
    /// Returns a `Config` (just a dictionary wrapper) as the object structure
    /// differs between tokenizers and models.
    /// - Throws: `HubClientError.parse` when the payload is not a JSON object.
    static func downloadConfig(repoId: String, filename: String) async throws -> Config {
        let url = "https://huggingface.co/\(repoId)/resolve/main/\(filename)"
        let data = try await download(url: url)

        let parsed = try JSONSerialization.jsonObject(with: data, options: [])
        guard let dictionary = parsed as? [String: Any] else { throw HubClientError.parse }
        return Config(dictionary)
    }
}
40+
41+
/// A lightweight `@dynamicMemberLookup` wrapper around a JSON dictionary.
/// Members may be written in camelCase even when the underlying keys use snake_case.
@dynamicMemberLookup
public struct Config {
    public private(set) var dictionary: [String: Any]

    init(_ dictionary: [String: Any]) {
        self.dictionary = dictionary
    }

    /// Converts snake_case to camelCase, e.g. "eos_token_id" -> "eosTokenId".
    func camelCase(_ string: String) -> String {
        var pieces: [String] = []
        for (index, part) in string.split(separator: "_").enumerated() {
            pieces.append(index == 0 ? part.lowercased() : part.capitalized)
        }
        return pieces.joined()
    }

    /// Converts camelCase to snake_case, e.g. "eosTokenId" -> "eos_token_id".
    func uncamelCase(_ string: String) -> String {
        var converted = ""
        // Tracks whether the previous scalar was NOT uppercase, so an underscore
        // is inserted only at a lower→upper boundary (no leading underscore).
        var boundaryPending = false

        for scalar in string.unicodeScalars {
            if CharacterSet.uppercaseLetters.contains(scalar) {
                if boundaryPending { converted += "_" }
                converted += Character(scalar).lowercased()
                boundaryPending = false
            } else {
                converted += String(scalar)
                boundaryPending = true
            }
        }

        return converted
    }

    /// Looks `member` up verbatim first, then snake_cased.
    /// Nested dictionaries become nested `Config`s; leaf values are wrapped under "value".
    public subscript(dynamicMember member: String) -> Config? {
        let key = dictionary[member] != nil ? member : uncamelCase(member)
        guard let raw = dictionary[key] else { return nil }
        if let nested = raw as? [String: Any] {
            return Config(nested)
        }
        return Config(["value": raw])
    }

    /// The wrapped leaf value, if this `Config` wraps one.
    public var value: Any? {
        dictionary["value"]
    }

    public var intValue: Int? { value as? Int }
    public var boolValue: Bool? { value as? Bool }
    public var stringValue: String? { value as? String }
}

Sources/Models/LanguageModel.swift

+112-15
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import CoreML
99
import Tokenizers
1010
import Generation
11+
import Hub
1112

1213
public class LanguageModel {
1314
public let model: MLModel
@@ -17,10 +18,15 @@ public class LanguageModel {
1718

1819
let input_ids = "input_ids"
1920
let attention_mask = "attention_mask"
20-
21-
lazy public var tokenizer: Tokenizer = {
22-
return architecture.tokenizerClass.init()
23-
}()
21+
22+
/// Bundle of the three JSON configuration files downloaded from the Hub
/// by `loadConfig()` for this model.
struct Configurations {
    /// Parsed `config.json` from the model repo.
    var modelConfig: Config
    /// Parsed `tokenizer_config.json`; nil because not all repos include this file.
    var tokenizerConfig: Config?
    /// Parsed `tokenizer.json` (vocab and optional merges).
    var tokenizerData: Config
}
27+
28+
private var configPromise: Task<Configurations, Error>? = nil
29+
private var _tokenizer: Tokenizer? = nil
2430

2531
public required init(model: MLModel) {
2632
self.model = model
@@ -49,6 +55,10 @@ public class LanguageModel {
4955
minContextLength = 128
5056
maxContextLength = 128
5157
}
58+
59+
self.configPromise = Task.init {
60+
return try await self.loadConfig()
61+
}
5262
}
5363
}
5464

@@ -71,16 +81,7 @@ public extension LanguageModel {
7181
guard let modelName = model.configuration.modelDisplayName else { fatalError("Models must have a name that identifies them") }
7282
return modelName
7383
}
74-
75-
var architecture: Architecture {
76-
guard let architecture = Architecture.from(modelName: modelName) else { fatalError("Cannot obtain model architecture") }
77-
return architecture
78-
}
7984

80-
var padTokenId: Int? { architecture.padTokenId ?? architecture.eosTokenId }
81-
var bosTokenId: Int? { architecture.bosTokenId }
82-
var eosTokenId: Int? { architecture.eosTokenId }
83-
8485
var inputIdsDescription: MLFeatureDescription {
8586
model.modelDescription.inputDescriptionsByName[input_ids]!
8687
}
@@ -99,13 +100,13 @@ public extension LanguageModel {
99100
}
100101

101102
// MLShapedArrayProtocol is either a MLShapedArray or a MLShapedArraySlice
102-
func predictNextTokenScores(_ tokens: InputTokens) -> any MLShapedArrayProtocol {
103+
func predictNextTokenScores(_ tokens: InputTokens, config: GenerationConfig) -> any MLShapedArrayProtocol {
103104
// TODO: exceptions
104105

105106
// Maybe pad or truncate
106107
let maxTokens = min(tokens.count, maxContextLength)
107108
let padLength = maxTokens >= minContextLength ? 0 : minContextLength-maxTokens
108-
let inputTokens = Array(tokens[0..<maxTokens]) + Array(repeating: padTokenId ?? 0, count: padLength)
109+
let inputTokens = Array(tokens[0..<maxTokens]) + Array(repeating: config.padTokenId ?? 0, count: padLength)
109110

110111
let inputIds = MLMultiArray.from(inputTokens, dims: inputIdsShape.count)
111112
var inputDictionary = [inputIdsName: inputIds]
@@ -126,6 +127,100 @@ public extension LanguageModel {
126127
}
127128
}
128129

130+
extension LanguageModel {
    /// Fetches the model and tokenizer configuration files for `modelName` from the Hub.
    /// The three downloads run concurrently.
    func loadConfig() async throws -> Configurations {
        // TODO: caching
        async let model = Hub.downloadConfig(repoId: modelName, filename: "config.json")
        async let tokenizer = Hub.downloadConfig(repoId: modelName, filename: "tokenizer_config.json")
        async let vocab = Hub.downloadConfig(repoId: modelName, filename: "tokenizer.json")

        // tokenizer_config.json does not exist in all repos, so its failure is swallowed.
        return Configurations(
            modelConfig: try await model,
            tokenizerConfig: try? await tokenizer,
            tokenizerData: try await vocab
        )
    }
}
142+
143+
/// Async properties resolved from the configuration files downloaded in `init`.
public extension LanguageModel {
    /// Parsed `config.json` for this model.
    var modelConfig: Config {
        get async throws {
            // `configPromise` is always created in init, hence the force unwrap.
            try await configPromise!.value.modelConfig
        }
    }

    /// Parsed `tokenizer_config.json`, when the repo provides one.
    var tokenizerConfig: Config? {
        get async throws {
            try await configPromise!.value.tokenizerConfig
        }
    }

    /// Parsed `tokenizer.json` (vocab and optional merges).
    var tokenizerData: Config {
        get async throws {
            try await configPromise!.value.tokenizerData
        }
    }

    /// The `model_type` field of the model config, e.g. "gpt2".
    var modelType: String? {
        get async throws {
            try await modelConfig.modelType?.stringValue
        }
    }

    /// The `task_specific_params.text_generation` section, if present.
    var textGenerationParameters: Config? {
        get async throws {
            try await modelConfig.taskSpecificParams?.textGeneration
        }
    }

    /// Default sampling setting for text generation; true when unspecified.
    var defaultDoSample: Bool {
        get async throws {
            try await textGenerationParameters?.doSample?.boolValue ?? true
        }
    }

    /// Architecture inferred from `model_type`; nil when it cannot be determined.
    var architecture: Architecture? {
        get async throws {
            if let modelType = try await modelType {
                return Architecture.from(modelType: modelType)
            }
            return nil
        }
    }

    /// Pad token id, falling back to the architecture's EOS token.
    var padTokenId: Int? {
        get async throws {
            if let architecture = try await architecture {
                return architecture.padTokenId ?? architecture.eosTokenId
            }
            return nil
        }
    }

    /// BOS token id from the model config, falling back to the architecture default.
    var bosTokenId: Int? {
        get async throws {
            if let fromConfig = try await modelConfig.bosTokenId?.intValue {
                return fromConfig
            }
            return try await architecture?.bosTokenId
        }
    }

    /// EOS token id from the model config, falling back to the architecture default.
    var eosTokenId: Int? {
        get async throws {
            if let fromConfig = try await modelConfig.eosTokenId?.intValue {
                return fromConfig
            }
            return try await architecture?.eosTokenId
        }
    }

    /// Tokenizer built lazily from the downloaded tokenizer data, then cached.
    var tokenizer: Tokenizer {
        get async throws {
            if let tokenizer = _tokenizer { return tokenizer }
            guard let architecture = try await architecture else { throw "Cannot retrieve Tokenizer" }
            let tokenizerData = try await tokenizerData
            guard let vocab = tokenizerData.model?.vocab?.dictionary as? [String: Int] else {
                throw "Cannot find vocab in tokenizer JSON"
            }
            let merges = tokenizerData.model?.merges?.value as? [String]
            let built = architecture.tokenizerClass.init(vocab: vocab, merges: merges)
            _tokenizer = built
            return built
        }
    }
}
223+
129224
extension LanguageModel: TextGenerationModel {
130225
//TODO: retrieve from the json: https://huggingface.co/nlpcloud/instruct-gpt-j-fp16/blob/main/config.json#L26
131226
public var defaultGenerationConfig: GenerationConfig {
@@ -139,3 +234,5 @@ extension LanguageModel: TextGenerationModel {
139234
return config
140235
}
141236
}
237+
238+
// Convenience: lets plain strings be thrown as errors (used by the `tokenizer` getter).
// NOTE(review): retroactive conformance on a standard-library type is fragile;
// a dedicated Error enum would be safer if this grows beyond ad-hoc messages.
extension String: Error {}

Sources/Models/LanguageModelTypes.swift

+8-8
Original file line numberDiff line numberDiff line change
@@ -13,29 +13,29 @@ public protocol LanguageModelProtocol {
1313
/// `name_or_path` in the Python world
1414
var modelName: String { get }
1515

16-
var tokenizer: Tokenizer { get }
16+
var tokenizer: Tokenizer { get async throws }
1717
var model: MLModel { get }
1818

1919
init(model: MLModel)
2020

2121
/// Make prediction callable (this works like __call__ in Python)
22-
func predictNextTokenScores(_ tokens: InputTokens) -> any MLShapedArrayProtocol //MLShapedArray<Float>
23-
func callAsFunction(_ tokens: InputTokens) -> any MLShapedArrayProtocol //MLShapedArray<Float>
22+
func predictNextTokenScores(_ tokens: InputTokens, config: GenerationConfig) -> any MLShapedArrayProtocol //MLShapedArray<Float>
23+
func callAsFunction(_ tokens: InputTokens, config: GenerationConfig) -> any MLShapedArrayProtocol //MLShapedArray<Float>
2424
}
2525

2626
public extension LanguageModelProtocol {
    /// Default conformance: calling the model directly forwards to `predictNextTokenScores`.
    func callAsFunction(_ tokens: InputTokens, config: GenerationConfig) -> any MLShapedArrayProtocol {
        return predictNextTokenScores(tokens, config: config)
    }
}
3131

3232
public protocol TextGenerationModel: Generation, LanguageModelProtocol {
3333
var defaultGenerationConfig: GenerationConfig { get }
34-
func generate(config: GenerationConfig, prompt: String, callback: PredictionStringCallback?) async -> String
34+
func generate(config: GenerationConfig, prompt: String, callback: PredictionStringCallback?) async throws -> String
3535
}
3636

3737
public extension TextGenerationModel {
    /// Default conformance: generates text by driving the shared `Generation` loop
    /// with this model's own forward pass and tokenizer.
    func generate(config: GenerationConfig, prompt: String, callback: PredictionStringCallback? = nil) async throws -> String {
        return try await generate(config: config, prompt: prompt, model: callAsFunction, tokenizer: tokenizer, callback: callback)
    }
}

Sources/Tokenizers/Architecture.swift

+9
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,13 @@ extension Architecture {
4545
}
4646
return nil
4747
}
48+
49+
/// Returns the first supported architecture whose raw value appears in `modelType`,
/// or nil when none matches.
public static func from(modelType: String) -> Architecture? {
    return SupportedArchitecture.allCases
        .first { modelType.contains($0.rawValue) }?
        .architecture
}
4857
}

Sources/Tokenizers/BertTokenizer.swift

+2-11
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,9 @@ class BertTokenizer {
2020
private let vocab: [String: Int]
2121
private let ids_to_tokens: [Int: String]
2222

23-
required init() {
24-
let url = Bundle.module.url(forResource: "bert-vocab", withExtension: "txt")!
25-
let vocabTxt = try! String(contentsOf: url)
26-
let tokens = vocabTxt.split(separator: "\n").map { String($0) }
27-
var vocab: [String: Int] = [:]
28-
var ids_to_tokens: [Int: String] = [:]
29-
for (i, token) in tokens.enumerated() {
30-
vocab[token] = i
31-
ids_to_tokens[i] = token
32-
}
23+
/// Builds the tokenizer from an in-memory vocab.
/// `merges` is unused here; the parameter exists to satisfy the shared
/// tokenizer initializer signature.
required init(vocab: [String: Int], merges: [String]?) {
    self.vocab = vocab
    ids_to_tokens = Utils.invert(vocab)
    wordpieceTokenizer = WordpieceTokenizer(vocab: vocab)
}
3728

0 commit comments

Comments
 (0)