
Commit 67e229b

Merge 'origin/master' into hipblas
2 parents 6f7c156 + b241649

11 files changed: 264 additions, 50 deletions

.gitignore (+1)

```diff
@@ -34,6 +34,7 @@ models/*
 /perplexity
 /embedding
 /train-text-from-scratch
+/simple
 /benchmark-matmult
 /vdot
 /server
```

Makefile (+1, -8)

```diff
@@ -144,11 +144,7 @@ endif # LLAMA_NO_ACCELERATE
 
 ifdef LLAMA_OPENBLAS
 	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
-	ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
-		LDFLAGS += -lopenblas -lcblas
-	else
-		LDFLAGS += -lopenblas
-	endif
+	LDFLAGS += -lopenblas
 endif # LLAMA_OPENBLAS
 
 ifdef LLAMA_BLIS
@@ -298,9 +294,6 @@ main: examples/main/main.cpp build-info.h ggml.
 
 simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-	@echo
-	@echo '==== Run ./simple -h for help. ===='
-	@echo
 
 quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
```

README.md (-1)

````diff
@@ -336,7 +336,6 @@ Building the program with BLAS support may lead to some performance improvements
   cmake .. -DLLAMA_CUBLAS=ON
   cmake --build . --config Release
   ```
-Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
 
 The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
 
````

examples/CMakeLists.txt (+1)

```diff
@@ -38,6 +38,7 @@ else()
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
     add_subdirectory(train-text-from-scratch)
+    add_subdirectory(simple)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()
```

examples/common.cpp (-3)

```diff
@@ -106,9 +106,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         }
 
         if (arg == "-s" || arg == "--seed") {
-#if defined(GGML_USE_CUBLAS)
-            fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n");
-#endif
             if (++i >= argc) {
                 invalid_param = true;
                 break;
```

examples/main/main.cpp (+1, -1)

```diff
@@ -354,7 +354,7 @@ int main(int argc, char ** argv) {
         if ((int)embd.size() > max_embd_size) {
             auto skipped_tokens = embd.size() - max_embd_size;
             console_set_color(con_st, CONSOLE_COLOR_ERROR);
-            printf("<<input too long: skipped %" PRIu64 " token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+            printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
             console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
             fflush(stdout);
             embd.resize(max_embd_size);
```
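
Why the one-character format change matters: `skipped_tokens` is deduced as `size_t` (the result type of `embd.size()`), while `%" PRIu64 "` matches only `uint64_t`, so on targets where `size_t` is not 64 bits wide (32-bit builds, for instance) the old call was undefined behavior. A minimal standalone illustration of the portable pattern; the buffer and limit values here are made up:

```cpp
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> embd(2048);       // stand-in for the token buffer
    const size_t max_embd_size = 1024; // hypothetical context limit

    if (embd.size() > max_embd_size) {
        // embd.size() returns size_t, so %zu is the matching specifier
        // on every platform; PRIu64 is only correct where size_t
        // happens to be uint64_t
        const size_t skipped_tokens = embd.size() - max_embd_size;
        std::printf("<<input too long: skipped %zu token%s>>\n",
                    skipped_tokens, skipped_tokens != 1 ? "s" : "");
    }
    return 0;
}
```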

ggml-cuda.cu (+24, -34)

```diff
@@ -68,6 +68,10 @@
 #include "ggml-cuda.h"
 #include "ggml.h"
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 #define CUDA_CHECK(err) \
@@ -1518,19 +1522,13 @@ static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
 static size_t g_scratch_offset = 0;
 
-#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
-#define GGML_CUDA_MAX_EVENTS 64
-
 static int g_device_count = -1;
 static int g_main_device = 0;
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
-
-static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
-static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
 
 void ggml_init_cublas() {
     static bool initialized = false;
@@ -1554,15 +1552,8 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(cudaSetDevice(id));
 
-        // create streams
-        for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
-            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
-            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
-        }
-        // create events
-        for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
-            CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
-        }
+        // create main stream
+        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
 
         // create cublas handle
         CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -2029,6 +2020,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
 
+    // if multiple GPUs are used they need to wait for the main GPU to finish
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         if (!split && id != g_main_device) {
             continue;
@@ -2127,9 +2124,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             }
             const int64_t i11 = i13*ne12 + i12;
 
-            cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
-            cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
-            cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
+            cudaStream_t cudaStream_main = g_cudaStreams_main[id];
 
             // for split tensors the data begins at i0 == i0_offset_low
             char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
@@ -2157,14 +2152,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             if (src1->backend == GGML_BACKEND_CPU) {
                 GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
                 int64_t nrows1 = flatten_rows ? nrows0 : ne11;
-                CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
+                CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
             } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
                 if (id != g_main_device) {
                     GGML_ASSERT(!flatten_rows);
                     float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
                     src1_ddf_i_source += i11*src1_stride;
                     CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
-                                               cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
+                                               cudaMemcpyDeviceToDevice, cudaStream_main));
                 }
             } else if (src1_on_device && !src1_is_contiguous) {
                 GGML_ASSERT(!split);
@@ -2173,7 +2168,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     GGML_ASSERT(false);
                 }
             }
-            CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
 
             if (!src0_on_device || !src0_is_contiguous) {
                 if (src0_is_f32) {
@@ -2189,9 +2183,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 CUDA_CHECK(cudaGetLastError());
             }
 
-            // wait with main stream until src1 memcpy is done
-            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
-
             // do the computation
             op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
 
```
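
Context for the stream consolidation above (an editorial sketch, not code from this commit): CUDA guarantees that work submitted to a single stream executes in submission order, so once the src1 copy and the compute kernel both run on `cudaStream_main`, the old `cudaEventRecord`/`cudaStreamWaitEvent` handshake between a copy stream and a compute stream has nothing left to synchronize. A minimal sketch of the pattern, with illustrative kernel and buffer names:

```cuda
#include <cuda_runtime.h>

// toy stand-in for a ggml compute kernel
__global__ void scale(float * x, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 2.0f;
}

void copy_then_compute(const float * src_host, float * dst_dev, int n) {
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    // both operations are queued on the same stream, so the kernel
    // cannot start before the async copy finishes -- no event needed
    cudaMemcpyAsync(dst_dev, src_host, n*sizeof(float), cudaMemcpyHostToDevice, stream);
    scale<<<(n + 255)/256, 256, 0, stream>>>(dst_dev, n);

    cudaStreamSynchronize(stream); // block the host only when the result is needed
    cudaStreamDestroy(stream);
}
```

The trade-off is less copy/compute overlap within a single op. The `cudaDeviceSynchronize()` added before split multi-GPU ops plays the same role across devices, since streams on different GPUs have no implicit ordering between them.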

ggml-cuda.cu (continued):

```diff
@@ -2229,8 +2220,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
     // wait until each device is finished, then free their buffers
     for (int id = 0; id < g_device_count; ++id) {
+        if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
+            continue;
+        }
+
         CUDA_CHECK(cudaSetDevice(id));
         CUDA_CHECK(cudaDeviceSynchronize());
+
         if (src0_asq[id] > 0) {
             ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
         }
@@ -2296,7 +2292,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     const int64_t ne02 = src0->ne[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2308,8 +2304,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
-
-    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -2327,7 +2321,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb02 = src0->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2342,8 +2336,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int channel_stride_x = nb02 / sizeof(half);
 
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
-
-    CUDA_CHECK(cudaDeviceSynchronize());
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2399,7 +2391,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb12 = src1->nb[2];
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
-    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
+    cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -2417,8 +2409,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
         GGML_ASSERT(false);
     }
 
-    CUDA_CHECK(cudaDeviceSynchronize());
-
     (void) dst;
 }
 
```
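
The trailing `cudaDeviceSynchronize()` calls deleted from `ggml_cuda_mul_mat_vec_p021`, `ggml_cuda_mul_mat_vec_nc`, and `ggml_cuda_cpy` follow from the same ordering guarantee: later work queued on the same main stream lines up behind these launches anyway, so blocking the host after every op only adds latency. A rough sketch of the resulting style, assuming (as an illustration, not something this diff shows) that the host synchronizes once when it finally reads results back:

```cuda
#include <cuda_runtime.h>

__global__ void step_a(float * x, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) x[i] += 1.0f;
}

__global__ void step_b(float * x, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 0.5f;
}

void forward(float * dev_x, float * host_out, int n, cudaStream_t stream) {
    step_a<<<(n + 255)/256, 256, 0, stream>>>(dev_x, n);
    // no cudaDeviceSynchronize() between ops: step_b queues behind step_a
    step_b<<<(n + 255)/256, 256, 0, stream>>>(dev_x, n);
    // the copy-back also queues in order; sync once, when the data is needed
    cudaMemcpyAsync(host_out, dev_x, n*sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
}
```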

ggml-metal.m (+80, -2)

```diff
@@ -57,6 +57,7 @@
     GGML_METAL_DECL_KERNEL(get_rows_q5_k);
     GGML_METAL_DECL_KERNEL(get_rows_q6_k);
     GGML_METAL_DECL_KERNEL(rms_norm);
+    GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
@@ -66,8 +67,10 @@
     GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
     GGML_METAL_DECL_KERNEL(rope);
+    GGML_METAL_DECL_KERNEL(alibi_f32);
     GGML_METAL_DECL_KERNEL(cpy_f32_f16);
     GGML_METAL_DECL_KERNEL(cpy_f32_f32);
+    GGML_METAL_DECL_KERNEL(cpy_f16_f16);
 
 #undef GGML_METAL_DECL_KERNEL
 };
@@ -162,6 +165,7 @@ @implementation GGMLMetalClass
         GGML_METAL_ADD_KERNEL(get_rows_q5_k);
         GGML_METAL_ADD_KERNEL(get_rows_q6_k);
         GGML_METAL_ADD_KERNEL(rms_norm);
+        GGML_METAL_ADD_KERNEL(norm);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
@@ -171,8 +175,10 @@ @implementation GGMLMetalClass
         GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
         GGML_METAL_ADD_KERNEL(rope);
+        GGML_METAL_ADD_KERNEL(alibi_f32);
         GGML_METAL_ADD_KERNEL(cpy_f32_f16);
         GGML_METAL_ADD_KERNEL(cpy_f32_f32);
+        GGML_METAL_ADD_KERNEL(cpy_f16_f16);
 
 #undef GGML_METAL_ADD_KERNEL
     }
@@ -250,10 +256,10 @@ bool ggml_metal_add_buffer(
         if (ctx->buffers[ctx->n_buffers].metal == nil) {
             fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
             return false;
-        } else {
-            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
         }
 
+        fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+
         ++ctx->n_buffers;
     }
 
```

```diff
@@ -735,6 +741,70 @@ void ggml_metal_graph_compute(
 
                         [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                     } break;
+                case GGML_OP_NORM:
+                    {
+                        if (encoder == nil) {
+                            encoder = [command_buffer computeCommandEncoder];
+                        }
+
+                        const float eps = 1e-5f;
+
+                        const int nth = 256;
+
+                        [encoder setComputePipelineState:ctx->pipeline_norm];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
+                        [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
+                        [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+
+                        const int64_t nrows = ggml_nrows(src0);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_ALIBI:
+                    {
+                        if (encoder == nil) {
+                            encoder = [command_buffer computeCommandEncoder];
+                        }
+
+                        GGML_ASSERT((src0t == GGML_TYPE_F32));
+
+                        const int   n_past   = ((int32_t *) src1->data)[0]; UNUSED(n_past);
+                        const int   n_head   = ((int32_t *) src1->data)[1];
+                        const float max_bias = ((float *)   src1->data)[2];
+
+                        if (__builtin_popcount(n_head) != 1) {
+                            GGML_ASSERT(false && "only power-of-two n_head implemented");
+                        }
+
+                        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+                        const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+
+                        [encoder setComputePipelineState:ctx->pipeline_alibi_f32];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
+                        [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
+                        [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
+                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
+                        [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
+                        [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
+                        [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
+                        [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
+                        [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
+                        [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
+                        [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
+                        [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
+                        [encoder setBytes:&m0   length:sizeof(   float) atIndex:18];
+                        const int nth = 32;
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
                 case GGML_OP_ROPE:
                     {
                         if (encoder == nil) {
```
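
A note on the `m0` computed in the GGML_OP_ALIBI case above: in the usual ALiBi formulation (with the conventional `max_bias` = 8; that convention is an assumption here, not stated in the diff), a model whose head count $n_\text{head}$ is a power of two assigns head $i$ (0-indexed) a bias slope from a geometric sequence whose base is exactly this `m0`:

$$
m_0 = 2^{-\frac{\text{max\_bias}}{n_\text{head}}},
\qquad
m_i = m_0^{\,i+1} = 2^{-\frac{\text{max\_bias}\,(i+1)}{n_\text{head}}},
\qquad i = 0, \dots, n_\text{head} - 1
$$

The kernel can therefore derive every head's slope from `m0` alone, which is why only `m0` is passed (at index 18); the `__builtin_popcount` check matches its own error message, since the interpolated slopes that ALiBi defines for non-power-of-two head counts are not implemented here.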
ggml-metal.m (continued):

```diff
@@ -788,6 +858,14 @@ void ggml_metal_graph_compute(
                         default: GGML_ASSERT(false && "not implemented");
                     };
                 } break;
+            case GGML_TYPE_F16:
+                {
+                    switch (dstt) {
+                        case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
+                        case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break;
+                        default: GGML_ASSERT(false && "not implemented");
+                    };
+                } break;
             default: GGML_ASSERT(false && "not implemented");
         }
```
