Commit 222cbbb

add additional hipblas conditions for cublas
1 parent e1f9581 commit 222cbbb
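
The change is mechanical: every preprocessor guard that previously enabled the GPU code path only for cuBLAS builds (#ifdef GGML_USE_CUBLAS) is widened so the same path also compiles when the project is built against hipBLAS (ROCm), which reuses the CUDA backend through HIP. A minimal, self-contained sketch of the pattern follows; only the two GGML_USE_* macros come from the tree, and the surrounding program is a hypothetical stand-in for the guarded llama.cpp code.

    /* guard_demo.c -- build with `cc guard_demo.c`, optionally adding
       -DGGML_USE_CUBLAS or -DGGML_USE_HIPBLAS to flip the compile-time branch. */
    #include <stdio.h>

    static int gpu_blas_path_compiled(void) {
    #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
        /* CUDA (cuBLAS) and ROCm (hipBLAS) builds now share this branch. */
        return 1;
    #else
        /* CPU-only build: the GPU-specific options fall through to warnings. */
        return 0;
    #endif
    }

    int main(void) {
        printf("GPU BLAS path compiled in: %s\n",
               gpu_blas_path_compiled() ? "yes" : "no");
        return 0;
    }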

File tree

examples/common.cpp
examples/server/server.cpp
ggml.c
llama-util.h
llama.cpp
llama.h
otherarch/ggml_v2.c
otherarch/llama_v2-util.h
otherarch/llama_v2.cpp

9 files changed: +36, -36 lines changed


examples/common.cpp (+4, -4)
@@ -304,7 +304,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 invalid_param = true;
 break;
 }
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 params.main_gpu = std::stoi(argv[i]);
 #else
 fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
@@ -314,7 +314,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 invalid_param = true;
 break;
 }
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 std::string arg_next = argv[i];

 // split string by , and /
@@ -334,7 +334,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
 } else if (arg == "--low-vram" || arg == "-lv") {
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 params.low_vram = true;
 #else
 fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
@@ -414,7 +414,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 exit(1);
 }

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
 fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
 exit(1);

examples/server/server.cpp (+3, -3)
@@ -560,7 +560,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 invalid_param = true;
 break;
 }
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 std::string arg_next = argv[i];

 // split string by , and /
@@ -583,7 +583,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 }
 else if (arg == "--low-vram" || arg == "-lv")
 {
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 params.low_vram = true;
 #else
 fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
@@ -594,7 +594,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 invalid_param = true;
 break;
 }
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 params.main_gpu = std::stoi(argv[i]);
 #else
 LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});

ggml.c (+5, -5)
@@ -161,7 +161,7 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
-#elif defined(GGML_USE_CUBLAS) | defined(GGML_USE_HIPBLAS)
+#elif defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
@@ -4116,7 +4116,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
 }

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 ggml_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
 ggml_cl_init();
@@ -14875,7 +14875,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
 GGML_ASSERT(params);

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
 if (skip_cpu) {
 return;
@@ -16362,7 +16362,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)

 size_t cur = 0;

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
 node->n_tasks = 1; // TODO: this actually is doing nothing
 // the threads are still spinning
@@ -18637,7 +18637,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }

 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
 return 1;
 #else
 return 0;
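
Note that the first ggml.c hunk does slightly more than add a macro: the pre-existing condition joined the two defined() checks with a bitwise |, which this commit replaces with the logical ||. Since defined() expands to 0 or 1, both operators select the same branch here, so the change is about idiom rather than behavior. A small stand-alone sketch, assuming no particular build flags:

    /* or_demo.c -- both conditions below always agree, whether the file is
       compiled with -DGGML_USE_CUBLAS, -DGGML_USE_HIPBLAS, both, or neither. */
    #include <stdio.h>

    int main(void) {
    #if defined(GGML_USE_CUBLAS) | defined(GGML_USE_HIPBLAS)   /* old form: bitwise OR on 0/1 */
        const int old_form = 1;
    #else
        const int old_form = 0;
    #endif

    #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)  /* new form: logical OR */
        const int new_form = 1;
    #else
        const int new_form = 0;
    #endif

        printf("old form selects GPU branch: %d, new form: %d\n", old_form, new_form);
        return 0;
    }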

llama-util.h (+1, -1)
@@ -441,7 +441,7 @@ struct llama_buffer {
 llama_buffer& operator=(llama_buffer&&) = delete;
 };

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
 uint8_t * addr = NULL;

llama.cpp (+11, -11)
@@ -10,7 +10,7 @@
 #include "llama.h"

 #include "ggml.h"
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
@@ -175,7 +175,7 @@ struct llama_kv_cache {
 ggml_free(ctx);
 }

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 ggml_cuda_free_data(k);
 ggml_cuda_free_data(v);
 #endif // GGML_USE_CUBLAS
@@ -220,7 +220,7 @@ struct llama_model {
 ggml_free(ctx);
 }

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 for (size_t i = 0; i < tensors_by_name.size(); ++i) {
 ggml_cuda_free_data(tensors_by_name[i].second);
 }
@@ -791,7 +791,7 @@ struct llama_model_loader {
 lmlock->grow_to(lock_size);
 }
 break;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 case GGML_BACKEND_GPU:
 case GGML_BACKEND_GPU_SPLIT:
 ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
@@ -911,7 +911,7 @@ static bool kv_cache_init(
 ggml_set_name(cache.v, "cache_v");

 (void) n_gpu_layers;
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 if (n_gpu_layers > n_layer + 1) {
 ggml_cuda_assign_buffers_no_scratch(cache.v);
 }
@@ -1141,7 +1141,7 @@ static void llama_model_load_internal(
 }

 (void) main_gpu;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
 ggml_cuda_set_main_device(main_gpu);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
@@ -1252,7 +1252,7 @@ static void llama_model_load_internal(

 (void) vram_scratch;
 (void) n_batch;
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 if (low_vram) {
 fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
 ggml_cuda_set_scratch_size(0); // disable scratch
@@ -1265,7 +1265,7 @@ static void llama_model_load_internal(
 }
 }
 #endif // GGML_USE_CUBLAS
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
 const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

 fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -1305,7 +1305,7 @@ static void llama_model_load_internal(
 }

 (void) tensor_split;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 {
 ggml_cuda_set_tensor_split(tensor_split);
 }
@@ -1425,7 +1425,7 @@ static bool llama_eval_internal(
 offload_func_t offload_func_kq = llama_nop;
 offload_func_t offload_func_v = llama_nop;

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 if (n_gpu_layers > n_layer) {
 offload_func_nr = ggml_cuda_assign_buffers;
 }
@@ -1440,7 +1440,7 @@ static bool llama_eval_internal(
 for (int il = 0; il < n_layer; ++il) {
 offload_func_t offload_func = llama_nop;

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 if (il >= i_gpu_start) {
 offload_func = ggml_cuda_assign_buffers;
 }

llama.h (+2, -2)
@@ -2,7 +2,7 @@
 #define LLAMA_H

 #include "ggml.h"
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
 #else
@@ -38,7 +38,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1

-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
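
A user-visible side effect of the llama.h change is that hipBLAS builds now define LLAMA_SUPPORTS_GPU_OFFLOAD, so application code that gates its GPU options on that macro picks up ROCm builds automatically. A hedged sketch of such a check; the helper below is hypothetical, and only the macro itself comes from llama.h:

    /* offload_hint.c -- compile against the llama.cpp headers with the same
       -DGGML_USE_* flags used to build the library, e.g. cc -I. -c offload_hint.c */
    #include <stdio.h>
    #include "llama.h"

    static void print_gpu_hint(void) {
    #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
        /* Defined for cuBLAS, hipBLAS (after this commit), CLBlast and Metal builds. */
        printf("GPU offload is available; layers can be placed on the GPU.\n");
    #else
        printf("This build is CPU-only; GPU offload options will be ignored.\n");
    #endif
    }

    int main(void) {
        print_gpu_hint();
        return 0;
    }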

otherarch/ggml_v2.c (+7, -7)
@@ -139,7 +139,7 @@ inline static void* ggml_v2_aligned_malloc(size_t size) {
 #include <Accelerate/Accelerate.h>
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 #include "ggml_v2-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -3894,7 +3894,7 @@ struct ggml_v2_context * ggml_v2_init(struct ggml_v2_init_params params) {
 GGML_V2_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
 }

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 ggml_v2_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
 if(quants_unshuffled)
@@ -9448,7 +9448,7 @@ static void ggml_v2_compute_forward_mul_mat_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
 if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
 ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9642,7 +9642,7 @@ static void ggml_v2_compute_forward_mul_mat_f16_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
 if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
 ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9881,7 +9881,7 @@ static void ggml_v2_compute_forward_mul_mat_q_f32(
 // nb01 >= nb00 - src0 is not transposed
 // compute by src0 rows

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
 if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
 ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -14061,7 +14061,7 @@ void ggml_v2_graph_compute(struct ggml_v2_context * ctx, struct ggml_v2_cgraph *

 size_t cur = 0;

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 if (ggml_v2_cuda_can_mul_mat(node->src0, node->src1, node)) {
 node->n_tasks = 1; // TODO: this actually is doing nothing
 // the threads are still spinning
@@ -15559,7 +15559,7 @@ int ggml_v2_cpu_has_wasm_simd(void) {
 }

 int ggml_v2_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
 return 1;
 #else
 return 0;

otherarch/llama_v2-util.h (+1, -1)
@@ -415,7 +415,7 @@ struct llama_v2_buffer {
 llama_v2_buffer& operator=(llama_v2_buffer&&) = delete;
 };

-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 #include "ggml_v2-cuda.h"
 struct llama_v2_ctx_buffer {
 uint8_t * addr = NULL;

otherarch/llama_v2.cpp (+2, -2)
@@ -9,7 +9,7 @@
 #include "llama_v2.h"

 #include "ggml_v2.h"
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 #include "ggml_v2-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml_v2-opencl.h"
@@ -3088,4 +3088,4 @@ std::vector<llama_token> llama_v2_tokenize(struct llama_v2_context * ctx, const
 res.resize(n);

 return res;
-}
+}
