From 4743f582dfc3287813ba161cd523508996f406e5 Mon Sep 17 00:00:00 2001 From: shavit Date: Sat, 22 Mar 2025 12:15:11 -0400 Subject: [PATCH] Add BART tokenizer --- Sources/Tokenizers/BartTokenizer.swift | 47 +++++++++++++++++++ Sources/Tokenizers/Tokenizer.swift | 1 + .../Resources/bart_large_mnli.json | 1 + Tests/TokenizersTests/TokenizerTests.swift | 17 +++++++ 4 files changed, 66 insertions(+) create mode 100644 Sources/Tokenizers/BartTokenizer.swift create mode 100644 Tests/TokenizersTests/Resources/bart_large_mnli.json diff --git a/Sources/Tokenizers/BartTokenizer.swift b/Sources/Tokenizers/BartTokenizer.swift new file mode 100644 index 0000000..555d7dc --- /dev/null +++ b/Sources/Tokenizers/BartTokenizer.swift @@ -0,0 +1,47 @@ +import Hub + +class BartTokenizer { /* Tokenizer model for BART: stores the special tokens and the vocab lookup tables, and delegates splitting text into tokens to a wrapped BPETokenizer. */ + public var bosToken: String? + public var bosTokenId: Int? + public var eosToken: String? + public var eosTokenId: Int? + public var unknownToken: String? + public var unknownTokenId: Int? + public var fuseUnknownTokens: Bool + + public let padToken: String + public let sepToken: String + public let clsToken: String + public let maskToken: String + + private let vocab: [String: Int] + private let ids_to_tokens: [Int: String] + private let bpe: BPETokenizer + + required public init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String : Int]) throws { /* Throws TokenizerError.missingVocab when tokenizerData.model.vocab is absent or is not a [String: Int] dictionary; errors from BPETokenizer.init are propagated. NOTE(review): the special-token fallbacks below were empty strings in this copy of the patch, apparently because angle-bracketed text was stripped in transit; they are restored to BART's published defaults - confirm against the upstream commit. */ + guard let vocab = tokenizerData.model?.vocab?.dictionary as? [String: Int] else { throw TokenizerError.missingVocab } + self.bosToken = tokenizerConfig.bosToken?.stringValue ?? "<s>" + self.bosTokenId = bosToken.flatMap { vocab[$0] } /* nil when the token is missing from the vocab; flatMap replaces the former nil check plus force-unwrap */ + self.eosToken = tokenizerConfig.eosToken?.stringValue ?? "</s>" + self.eosTokenId = eosToken.flatMap { vocab[$0] } + self.unknownToken = tokenizerConfig.unkToken?.stringValue ?? "<unk>" + self.unknownTokenId = unknownToken.flatMap { vocab[$0] } + self.fuseUnknownTokens = tokenizerConfig.fuseUnk?.boolValue ?? false + self.padToken = tokenizerConfig.padToken?.stringValue ?? 
"<pad>" + self.sepToken = tokenizerConfig.sepToken?.stringValue ?? "</s>" + self.clsToken = tokenizerConfig.clsToken?.stringValue ?? "<s>" + self.maskToken = tokenizerConfig.maskToken?.stringValue ?? "<mask>" + self.vocab = vocab + self.ids_to_tokens = Utils.invert(vocab) + self.bpe = try BPETokenizer(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens) + } + + func callAsFunction(_ text: String) -> [String] { bpe.tokenize(text: text) } /* Same result as tokenize(text:). */ + func unTokenize(tokens: [Int]) -> [String] { tokens.compactMap { ids_to_tokens[$0] } } /* Maps ids back to token strings; ids that are not in the vocab are dropped, so the result may be shorter than the input. */ +} + +extension BartTokenizer: PreTrainedTokenizerModel { + func convertTokenToId(_ token: String) -> Int? { vocab[token] ?? unknownTokenId } /* Falls back to the unknown-token id, which may itself be nil. */ + func convertIdToToken(_ id: Int) -> String? { ids_to_tokens[id] } + func tokenize(text: String) -> [String] { bpe.tokenize(text: text) } +} diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index db53337..0e73de2 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -77,6 +77,7 @@ public protocol PreTrainedTokenizerModel: TokenizingModel { struct TokenizerModel { static let knownTokenizers: [String : PreTrainedTokenizerModel.Type] = [ + "BartTokenizer" : BartTokenizer.self, "BertTokenizer" : BertTokenizer.self, "DistilbertTokenizer": BertTokenizer.self, "DistilBertTokenizer": BertTokenizer.self, diff --git a/Tests/TokenizersTests/Resources/bart_large_mnli.json b/Tests/TokenizersTests/Resources/bart_large_mnli.json new file mode 100644 index 0000000..7e7e9fd --- /dev/null +++ b/Tests/TokenizersTests/Resources/bart_large_mnli.json @@ -0,0 +1 @@ +{"text": "Justin Timberlake and Jessica Biel, welcome to parenthood.\n The celebrity couple announced the arrival of their son, Silas Randall Timberlake, in statements to People.\n \"Silas was the middle name of Timberlake's maternal grandfather Bill Bomar, who died in 2012, while Randall is the musician's own middle name, as well as his father's first,\" People reports.\n 
The couple announced the pregnancy in January, with an Instagram post. It is the first baby for both.", "token_ids": [0, 29466, 19440, 11985, 8, 7103, 163, 5255, 6, 2814, 7, 181, 40532, 4, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 20, 6794, 891, 585, 5, 5237, 9, 49, 979, 6, 8897, 281, 16222, 19440, 11985, 6, 11, 1997, 7, 1806, 4, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 22, 23719, 281, 21, 5, 1692, 766, 9, 19440, 11985, 18, 22835, 10642, 1585, 23707, 271, 6, 54, 962, 11, 1125, 6, 150, 16222, 16, 5, 9613, 18, 308, 1692, 766, 6, 25, 157, 25, 39, 1150, 18, 78, 60, 1806, 690, 4, 50118, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 20, 891, 585, 5, 6690, 11, 644, 6, 19, 41, 1838, 618, 4, 85, 16, 5, 78, 1928, 13, 258, 4, 2], "bpe_tokens": ["<s>", "Justin", "\u0120Timber", "lake", "\u0120and", "\u0120Jessica", "\u0120B", "iel", ",", "\u0120welcome", "\u0120to", "\u0120p", "arenthood", ".", "\u010a", "\u0120", "\u0120", "\u0120", "\u0120", "\u0120", "\u0120", "\u0120", "\u0120The", "\u0120celebrity", "\u0120couple", "\u0120announced", "\u0120the", "\u0120arrival", "\u0120of", "\u0120their", "\u0120son", ",", "\u0120Sil", "as", "\u0120Randall", "\u0120Timber", "lake", ",", "\u0120in", "\u0120statements", "\u0120to", "\u0120People", ".", "\u010a", "\u0120", "\u0120", "\u0120", "\u0120", "\u0120", "\u0120", "\u0120", "\u0120\"", "Sil", "as", "\u0120was", "\u0120the", "\u0120middle", "\u0120name", "\u0120of", "\u0120Timber", "lake", "'s", "\u0120maternal", "\u0120grandfather", "\u0120Bill", "\u0120Bom", "ar", ",", "\u0120who", "\u0120died", "\u0120in", "\u01202012", ",", "\u0120while", "\u0120Randall", "\u0120is", "\u0120the", "\u0120musician", "'s", "\u0120own", "\u0120middle", "\u0120name", ",", "\u0120as", "\u0120well", "\u0120as", "\u0120his", "\u0120father", "'s", "\u0120first", ",\"", "\u0120People", "\u0120reports", ".", "\u010a", "\u0120", "\u0120", "\u0120", "\u0120", "\u0120", "\u0120", "\u0120", "\u0120The", "\u0120couple", "\u0120announced", "\u0120the", 
"\u0120pregnancy", "\u0120in", "\u0120January", ",", "\u0120with", "\u0120an", "\u0120Instagram", "\u0120post", ".", "\u0120It", "\u0120is", "\u0120the", "\u0120first", "\u0120baby", "\u0120for", "\u0120both", ".", "</s>"], "decoded_text": "<s>Justin Timberlake and Jessica Biel, welcome to parenthood.\n The celebrity couple announced the arrival of their son, Silas Randall Timberlake, in statements to People.\n \"Silas was the middle name of Timberlake's maternal grandfather Bill Bomar, who died in 2012, while Randall is the musician's own middle name, as well as his father's first,\" People reports.\n The couple announced the pregnancy in January, with an Instagram post. It is the first baby for both.</s>"} diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift index eae7003..6561c9a 100644 --- a/Tests/TokenizersTests/TokenizerTests.swift +++ b/Tests/TokenizersTests/TokenizerTests.swift @@ -72,6 +72,23 @@ class BertUncasedTokenizerTests: TokenizerTests { override class var unknownTokenId: Int? { 100 } } +class BartLargeMnliTokenizerTests: TokenizerTests { /* Runs the shared TokenizerTests suite against facebook/bart-large-mnli using the bart_large_mnli fixture. NOTE(review): the fixture's special-token markers looked stripped in this copy of the patch and were restored from token ids 0 and 2 - confirm against the upstream fixture. */ + override class var hubModelName: String? { "facebook/bart-large-mnli" } + override class var encodedSamplesFilename: String? { "bart_large_mnli" } + override class var unknownTokenId: Int? { 3 } + + override func testTokenize() async { /* Overrides the inherited test with inline cases; the expected token lists contain no special tokens. */ + let testCases: [(String, [String])] = [ + ("Justin Timberlake and Jessica Biel, welcome to parenthood.", ["Justin", "\u{0120}Timber", "lake", "\u{0120}and", "\u{0120}Jessica", "\u{0120}B", "iel", ",", "\u{0120}welcome", "\u{0120}to", "\u{0120}p", "arenthood", "."]), + ("Silas Randall Timberlake, in statements to People. 
\"Silas was the middle name of Timberlake's maternal grandfather Bill Bomar, who died in 2012, while Randall is the musician's own middle name, as well as his father's first,\" People reports.", ["Sil", "as", "\u{0120}Randall", "\u{0120}Timber", "lake", ",", "\u{0120}in", "\u{0120}statements", "\u{0120}to", "\u{0120}People", ".", "\u{0120}\"", "Sil", "as", "\u{0120}was", "\u{0120}the", "\u{0120}middle", "\u{0120}name", "\u{0120}of", "\u{0120}Timber", "lake", "'s", "\u{0120}maternal", "\u{0120}grandfather", "\u{0120}Bill", "\u{0120}Bom", "ar", ",", "\u{0120}who", "\u{0120}died", "\u{0120}in", "\u{0120}2012", ",", "\u{0120}while", "\u{0120}Randall", "\u{0120}is", "\u{0120}the", "\u{0120}musician", "'s", "\u{0120}own", "\u{0120}middle", "\u{0120}name", ",", "\u{0120}as", "\u{0120}well", "\u{0120}as", "\u{0120}his", "\u{0120}father", "'s", "\u{0120}first", ",\"", "\u{0120}People", "\u{0120}reports", "."]), + ] + let tokenizer = await BartLargeMnliTokenizerTests._tester!.tokenizer! + for (text, expectTokens) in testCases { + XCTAssertEqual(tokenizer.tokenize(text: text), expectTokens) + } + } +} + class GemmaTokenizerTests: TokenizerTests { override class var hubModelName: String? { "pcuenq/gemma-tokenizer" } override class var encodedSamplesFilename: String? { "gemma_encoded" }