Skip to content

Commit 019ba1d

Browse files
authored
convert : fix Baichuan2 models by using vocab size in config.json (#3299)
Use local GGUF package when possible in Baichuan converter
1 parent beabc8c commit 019ba1d

File tree

1 file changed

+8
-2
lines changed

1 file changed

+8
-2
lines changed

convert-baichuan-hf-to-gguf.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,14 @@
1111
from pathlib import Path
1212
from typing import TYPE_CHECKING, Any
1313
import itertools
14-
import gguf
1514
import numpy as np
1615
import torch
1716
from sentencepiece import SentencePieceProcessor # type: ignore[import]
1817

18+
if 'NO_LOCAL_GGUF' not in os.environ:
19+
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
20+
import gguf
21+
1922

2023
if TYPE_CHECKING:
2124
from typing import TypeAlias
@@ -174,8 +177,11 @@ def parse_args() -> argparse.Namespace:
174177
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
175178

176179
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
180+
vocab_size = hparams.get('vocab_size')
181+
if vocab_size is None:
182+
vocab_size = tokenizer.vocab_size()
177183

178-
for i in range(tokenizer.vocab_size()):
184+
for i in range(vocab_size):
179185
text: bytes
180186
score: float
181187

0 commit comments

Comments (0)