
Commit 1107194

Merge 'origin/master' into hipblas

2 parents: 04c0d48 + a3b85b2
File tree: 11 files changed, +374 -85 lines

.github/workflows/build.yml (+77)
@@ -210,6 +210,82 @@ jobs:
           path: |
             llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
 
+  windows-latest-cmake-cublas:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        cuda: ['12.1.0', '11.7.1']
+        build: ['cublas']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - uses: Jimver/[email protected]
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda }}
+          # TODO(green-sky): _dev seems to fail, and non dev are not enought
+          #sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_CUBLAS=ON
+          cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+
+      - name: Copy and pack Cuda runtime
+        if: ${{ matrix.cuda == '12.1.0' }}
+        # TODO(green-sky): paths are cuda 12 specific
+        run: |
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          mkdir '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+
+      - name: Copy and pack Cuda runtime
+        if: ${{ matrix.cuda == '11.7.1' }}
+        # TODO(green-sky): paths are cuda 11 specific
+        run: |
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          mkdir '.\build\bin\cudart\'
+          ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+
+      - name: Upload Cuda runtime
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@@ -221,6 +297,7 @@ jobs:
       - macOS-latest-make
      - macOS-latest-cmake
       - windows-latest-cmake
+      - windows-latest-cmake-cublas
 
     steps:
       - name: Download artifacts
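
Note on the two version-specific "Copy and pack Cuda runtime" steps: the bundled DLL names embed the CUDA major version (cudart64_12.dll vs cudart64_110.dll), so each matrix entry needs its own copy step. The sketch below is purely illustrative and is not part of the workflow; the file names are taken from the diff above, everything else is an assumption.

# Illustrative sketch: which runtime DLLs get packed for each matrix.cuda entry.
CUDA_RUNTIME_DLLS = {
    "12.1.0": ["cudart64_12.dll", "cublas64_12.dll", "cublasLt64_12.dll"],
    "11.7.1": ["cudart64_110.dll", "cublas64_11.dll", "cublasLt64_11.dll"],
}

def runtime_dlls(cuda_version: str) -> list:
    """Return the DLLs bundled alongside the binaries for a given CUDA version."""
    return CUDA_RUNTIME_DLLS[cuda_version]

print(runtime_dlls("11.7.1"))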

Makefile (+6 -1)
@@ -121,7 +121,12 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 endif
 ifdef LLAMA_CLBLAST
 	CFLAGS += -DGGML_USE_CLBLAST
-	LDFLAGS += -lclblast -lOpenCL
+	# Mac provides OpenCL as a framework
+	ifeq ($(UNAME_S),Darwin)
+		LDFLAGS += -lclblast -framework OpenCL
+	else
+		LDFLAGS += -lclblast -lOpenCL
+	endif
 	OBJS += ggml-opencl.o
 ggml-opencl.o: ggml-opencl.c ggml-opencl.h
 	$(CC) $(CFLAGS) -c $< -o $@
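
The CLBlast change accounts for macOS shipping OpenCL as a system framework rather than a plain library, so the link flags differ by platform. A minimal Python sketch of the same decision, for illustration only (the real logic lives in the Makefile above):

import platform

def clblast_ldflags() -> list:
    # Mirrors the Makefile branch: macOS links OpenCL as a framework,
    # other platforms link the plain OpenCL library.
    if platform.system() == "Darwin":
        return ["-lclblast", "-framework", "OpenCL"]
    return ["-lclblast", "-lOpenCL"]

print(" ".join(clblast_ldflags()))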

README.md (+5 -2)
@@ -18,10 +18,12 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
 
 - Plain C/C++ implementation without dependencies
 - Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
-- AVX2 support for x86 architectures
+- AVX, AVX2 and AVX512 support for x86 architectures
 - Mixed F16 / F32 precision
-- 4-bit integer quantization support
+- 4-bit, 5-bit and 8-bit integer quantization support
 - Runs on the CPU
+- OpenBLAS support
+- cuBLAS and CLBlast support
 
 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
 Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves

@@ -43,6 +45,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
+- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
 
 **Bindings:**
 

convert.py (+17 -1)
@@ -67,6 +67,7 @@ class QuantizedDataType:
     {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}
 
 DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
+    DT_BF16: np.dtype(np.uint16),
     DT_F16: np.dtype(np.float16),
     DT_F32: np.dtype(np.float32),
     DT_I32: np.dtype(np.int32),

@@ -276,6 +277,12 @@ def permute(self, n_head: int) -> 'Tensor': ...
     def to_ggml(self) -> 'GGMLCompatibleTensor': ...
 
 
+def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
+    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
+    fp32_arr = bf16_arr.astype(np.uint32) << 16
+    return fp32_arr.view(np.float32)
+
+
 class UnquantizedTensor(Tensor):
     def __init__(self, ndarray: NDArray) -> None:
         assert isinstance(ndarray, np.ndarray)

@@ -284,6 +291,8 @@ def __init__(self, ndarray: NDArray) -> None:
 
     def astype(self, data_type: DataType) -> Tensor:
         dtype = DATA_TYPE_TO_NUMPY[data_type]
+        if self.data_type == DT_BF16:
+            self.ndarray = bf16_to_fp32(self.ndarray)
         return UnquantizedTensor(self.ndarray.astype(dtype))
 
     def to_ggml(self) -> 'UnquantizedTensor':

@@ -686,6 +695,7 @@ def load(offset: int, elm_count: int) -> NDArray:
             description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
             return LazyStorage(load=load, kind=pid[1], description=description)
 
+    # @staticmethod
     def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,  # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)

@@ -696,12 +706,18 @@ def load() -> UnquantizedTensor:
             description = f'pickled storage_offset={storage_offset} in {storage.description}'
         return LazyTensor(load, list(size), storage.kind.data_type, description)
 
+    # @staticmethod
+    def rebuild_from_type_v2(func, new_type, args, state):
+        return func(*args)
+
     CLASSES: Dict[Any, Any] = {
+        ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
         ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
         ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
         ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
         ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
         ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
+        ('torch', 'Tensor'): LazyTensor,
     }
 
     def find_class(self, module: str, name: str) -> Any:

@@ -961,7 +977,7 @@ def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
 
 def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
     wq_type = model["layers.0.attention.wq.weight"].data_type
-    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
+    if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
         return GGMLFileType.AllF32
     if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
         return GGMLFileType.MostlyF16
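
The new bf16_to_fp32 helper relies on bfloat16 being exactly the top 16 bits of an IEEE-754 float32: widening the raw uint16 payload to uint32 and shifting it left by 16 reconstructs the original value with no loss. That is presumably also why pick_output_type now maps BF16 checkpoints to AllF32 by default rather than MostlyF16, since narrowing bfloat16 to float16 can overflow its wider exponent range. A small standalone check of the bit trick (a sketch, not part of convert.py):

import numpy as np

def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
    # bfloat16 stores the upper 16 bits of a float32, so shifting the raw
    # uint16 payload into the high half of a uint32 recovers the float exactly.
    assert bf16_arr.dtype == np.uint16
    return (bf16_arr.astype(np.uint32) << 16).view(np.float32)

# 1.0 in bfloat16 is 0x3F80 (the top half of the float32 bit pattern 0x3F800000).
raw = np.array([0x3F80, 0xC000, 0x0000], dtype=np.uint16)  # 1.0, -2.0, 0.0
print(bf16_to_fp32(raw))  # -> [ 1. -2.  0.]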

examples/common.cpp (+32 -21)
@@ -66,35 +66,33 @@ int32_t get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
-std::string process_escapes(const char* input) {
-    std::string output;
+void process_escapes(std::string& input) {
+    std::size_t input_len = input.length();
+    std::size_t output_idx = 0;
 
-    if (input != nullptr) {
-        std::size_t input_len = std::strlen(input);
-        output.reserve(input_len);
-
-        for (std::size_t i = 0; i < input_len; ++i) {
-            if (input[i] == '\\' && i + 1 < input_len) {
-                switch (input[++i]) {
-                    case 'n': output.push_back('\n'); break;
-                    case 't': output.push_back('\t'); break;
-                    case '\'': output.push_back('\''); break;
-                    case '\"': output.push_back('\"'); break;
-                    case '\\': output.push_back('\\'); break;
-                    default: output.push_back('\\');
-                             output.push_back(input[i]); break;
-                }
-            } else {
-                output.push_back(input[i]);
+    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
+        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
+            switch (input[++input_idx]) {
+                case 'n':  input[output_idx++] = '\n'; break;
+                case 'r':  input[output_idx++] = '\r'; break;
+                case 't':  input[output_idx++] = '\t'; break;
+                case '\'': input[output_idx++] = '\''; break;
+                case '\"': input[output_idx++] = '\"'; break;
+                case '\\': input[output_idx++] = '\\'; break;
+                default:   input[output_idx++] = '\\';
+                           input[output_idx++] = input[input_idx]; break;
             }
+        } else {
+            input[output_idx++] = input[input_idx];
         }
     }
 
-    return output;
+    input.resize(output_idx);
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
+    bool escape_prompt = false;
     std::string arg;
     gpt_params default_params;
 

@@ -118,7 +116,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.prompt = process_escapes(argv[i]);
+            params.prompt = argv[i];
+        } else if (arg == "-e") {
+            escape_prompt = true;
         } else if (arg == "--session") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -324,6 +324,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.input_prefix = argv[i];
+        } else if (arg == "--in-suffix") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.input_suffix = argv[i];
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, default_params);

@@ -335,6 +341,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         gpt_print_usage(argc, argv, default_params);
         exit(1);
     }
+    if (escape_prompt) {
+        process_escapes(params.prompt);
+    }
 
     return true;
 }

@@ -355,9 +364,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
+    fprintf(stderr, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
     fprintf(stderr, "  --session FNAME       file to cache model state in (may be large!) (default: none)\n");
     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+    fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
     fprintf(stderr, "  -f FNAME, --file FNAME\n");
     fprintf(stderr, "                        prompt file to start generation.\n");
     fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
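
With this change, escape handling becomes opt-in: -p stores the prompt verbatim, and the new -e flag decodes \n, \r, \t, \', \" and \\ in place before the prompt is used. A rough Python sketch of the same decoding rules, for illustration only (the real implementation is the in-place C++ rewrite above):

_ESCAPES = {"n": "\n", "r": "\r", "t": "\t", "'": "'", '"': '"', "\\": "\\"}

def process_escapes(text: str) -> str:
    # Same rules as the C++ version: a backslash followed by a known character
    # becomes the escape; an unknown pair is kept verbatim.
    out, i = [], 0
    while i < len(text):
        if text[i] == "\\" and i + 1 < len(text):
            nxt = text[i + 1]
            out.append(_ESCAPES.get(nxt, "\\" + nxt))
            i += 2
        else:
            out.append(text[i])
            i += 1
    return "".join(out)

print(process_escapes(r"User:\n\tHello"))  # prints "User:", then a tab plus "Hello" on the next line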

examples/common.h (+1)
@@ -43,6 +43,7 @@ struct gpt_params {
     std::string prompt = "";
     std::string path_session = "";       // path to file for saving/loading model eval state
     std::string input_prefix = "";       // string to prefix user inputs with
+    std::string input_suffix = "";       // string to suffix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
     std::string lora_adapter = "";  // lora adapter path