@@ -255,7 +255,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens

-    def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
+    def bpe_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.bpe_tokenizer
         from transformers.models.gpt2 import tokenization_gpt2
         byte_encoder = tokenization_gpt2.bytes_to_unicode()
@@ -265,12 +265,12 @@ def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
             score: float = -i
             yield text, score, gguf.TokenType.USER_DEFINED

-    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+    def added_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
         for text in self.added_tokens_list:
             score = -1000.0
             yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

-    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+    def all_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
         yield from self.bpe_tokens()
         yield from self.added_tokens()

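With gguf.TokenType added to each yielded tuple, anything that consumes all_tokens() now unpacks three values per token instead of two. A minimal sketch of that pattern, assuming a vocab object such as the BpeVocab above (the variable names and the parallel-list layout are illustrative, not part of this diff):

# Sketch: gather the (text, score, toktype) triples yielded by all_tokens()
# into index-aligned lists, the layout a GGUF writer typically takes.
tokens:   list[bytes] = []
scores:   list[float] = []
toktypes: list = []   # gguf.TokenType per token

for text, score, toktype in vocab.all_tokens():   # vocab is assumed to be a BpeVocab
    tokens.append(text)
    scores.append(score)
    toktypes.append(toktype)

# The lists stay aligned, so token i keeps its score and its type.
assert len(tokens) == len(scores) == len(toktypes)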
@@ -286,6 +286,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
+
         vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
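The expected_ids / actual_ids comparison in this hunk enforces that any tokens from added_tokens.json occupy the ID range directly after the base SentencePiece vocabulary. A self-contained illustration of that check with made-up numbers (the exception message is only indicative):

# Illustration: added-token IDs must be exactly
# vocab_size .. vocab_size + len(added_tokens) - 1.
vocab_size = 32000                                    # assumed base vocab size
added_tokens = {"<pad>": 32000, "<extra_0>": 32001}   # hypothetical added tokens

expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
    raise Exception(f"Expected added token IDs {expected_ids}, got {actual_ids}")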
@@ -299,7 +300,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens

-    def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
+    def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.sentencepiece_tokenizer
         for i in range(tokenizer.vocab_size()):
             piece = tokenizer.id_to_piece(i)
@@ -323,12 +324,12 @@ def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:

             yield text, score, toktype

-    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+    def added_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
         for text in self.added_tokens_list:
             score = -1000.0
             yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

-    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+    def all_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
         yield from self.sentencepiece_tokens()
         yield from self.added_tokens()

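sentencepiece_tokens() now declares that it yields a toktype for every piece as well. A sketch of how such a per-piece type can be derived from SentencePieceProcessor flags and mapped onto gguf.TokenType; the exact mapping used by convert.py may differ in detail:

import gguf
from sentencepiece import SentencePieceProcessor

def piece_toktype(tokenizer: SentencePieceProcessor, i: int) -> gguf.TokenType:
    # Map SentencePiece piece flags onto gguf.TokenType values (sketch only).
    if tokenizer.is_unknown(i):
        return gguf.TokenType.UNKNOWN
    if tokenizer.is_control(i):
        return gguf.TokenType.CONTROL
    if tokenizer.is_unused(i):
        return gguf.TokenType.UNUSED
    if tokenizer.is_byte(i):
        return gguf.TokenType.BYTE
    return gguf.TokenType.NORMAL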
@@ -727,7 +728,7 @@ def __init__(self, fname_out: Path) -> None:
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

     def add_meta_arch(self, params: Params) -> None:
-        self.gguf.add_name("llama")
+        self.gguf.add_name("LLaMA")
         self.gguf.add_context_length(params.n_ctx)
         self.gguf.add_embedding_length(params.n_embd)
         self.gguf.add_block_count(params.n_layer)