
Commit 770e674

Merge remote-tracking branch 'upstream/concedo'

2 parents: 2b289cd + 5941514


44 files changed: +1751 −903 lines

.devops/full-cuda.Dockerfile (new file, +33)

@@ -0,0 +1,33 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/main-cuda.Dockerfile (new file, +32)

@@ -0,0 +1,32 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+    apt-get install -y build-essential
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1
+
+RUN make
+
+FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+
+COPY --from=build /app/main /main
+
+ENTRYPOINT [ "/main" ]

.gitignore (+4, -1)

@@ -20,6 +20,7 @@ build-static/
build-cublas/
build-opencl/
build-metal/
+build-mpi/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/

@@ -67,4 +68,6 @@ koboldcpp_failsafe.dll
koboldcpp_openblas.dll
koboldcpp_openblas_noavx2.dll
koboldcpp_clblast.dll
-koboldcpp_cublas.dll
+koboldcpp_cublas.dll
+cublas64_11.dll
+cublasLt64_11.dll

CMakeLists.txt (+23, -11)

@@ -28,6 +28,8 @@ set(LLAMA_SANITIZE_THREAD OFF)
set(LLAMA_SANITIZE_ADDRESS OFF)
set(LLAMA_SANITIZE_UNDEFINED OFF)

+option(MAKE_MISC_FILES "MAKE_MISC_FILES" OFF)
+
# instruction set specific
option(LLAMA_AVX "llama: enable AVX" ON)
option(LLAMA_AVX2 "llama: enable AVX2" ON)

@@ -73,16 +75,16 @@ if (LLAMA_CUBLAS)

enable_language(CUDA)

-set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)

add_compile_definitions(GGML_USE_CUBLAS)
-add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
-
+#add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
+
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
-add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
if (LLAMA_CUDA_DMMV_F16)
add_compile_definitions(GGML_CUDA_DMMV_F16)
endif()

@@ -292,7 +294,7 @@ add_library(ggml OBJECT
ggml.h
k_quants.h
k_quants.c
-${GGML_CUDA_SOURCES})
+${GGML_SOURCES_CUDA})
target_include_directories(ggml PUBLIC . ./otherarch ./otherarch/tools)
target_compile_features(ggml PUBLIC c_std_11) # don't bump
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})

@@ -332,12 +334,6 @@ target_link_libraries(gpttype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)


-if (GGML_CUDA_SOURCES)
-message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
-set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-endif()
-
set(TARGET koboldcpp_cublas)
add_library(${TARGET} SHARED expose.cpp expose.h)
target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples)

@@ -347,3 +343,19 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${TARGET} PUBLIC ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+
+if (MAKE_MISC_FILES)
+add_library(llama
+llama.cpp
+llama.h
+llama-util.h
+)
+target_include_directories(llama PUBLIC .)
+target_compile_features(llama PUBLIC cxx_std_11) # don't bump
+target_link_libraries(llama PRIVATE
+ggml
+${LLAMA_EXTRA_LIBS}
+)
+add_subdirectory(examples)
+endif()
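
Configuration sketch (not part of this commit): the new MAKE_MISC_FILES option gates the optional llama library and the examples subdirectory, so a CUDA build that also wants those targets could be configured roughly as follows; the build directory name is an assumption.

    # Illustrative invocation only; the build directory name is an assumption.
    cmake -B build -DLLAMA_CUBLAS=ON -DMAKE_MISC_FILES=ON
    cmake --build build --config Release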

Makefile (+13, -5)

@@ -144,19 +144,27 @@ ifdef LLAMA_CUBLAS
CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
NVCC = nvcc
-NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_FORCE_DMMV
+NVCCFLAGS = --forward-unknown-to-host-compiler
+ifdef CUDA_DOCKER_ARCH
+NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else
+NVCCFLAGS += -arch=native
+endif # CUDA_DOCKER_ARCH
+ifdef LLAMA_CUDA_FORCE_DMMV
+NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
+endif # LLAMA_CUDA_FORCE_DMMV
ifdef LLAMA_CUDA_DMMV_X
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
else
NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
endif # LLAMA_CUDA_DMMV_X
-ifdef LLAMA_CUDA_DMMV_Y
+ifdef LLAMA_CUDA_MMV_Y
NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
-NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
+else ifdef LLAMA_CUDA_DMMV_Y
+NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
else
-NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
-endif # LLAMA_CUDA_DMMV_Y
+endif # LLAMA_CUDA_MMV_Y
ifdef LLAMA_CUDA_DMMV_F16
NVCCFLAGS += -DGGML_CUDA_DMMV_F16
endif # LLAMA_CUDA_DMMV_F16
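
Build sketch (not part of this commit): these hunks let a cuBLAS build pin the nvcc architecture and tune the mat-vec kernel from the make command line, with LLAMA_CUDA_DMMV_Y still honoured as a backwards-compatible alias for LLAMA_CUDA_MMV_Y; the specific values below are assumptions.

    # Illustrative invocations only; the specific values are assumptions.
    make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=all
    make LLAMA_CUBLAS=1 LLAMA_CUDA_FORCE_DMMV=1 LLAMA_CUDA_MMV_Y=2
    make LLAMA_CUBLAS=1 LLAMA_CUDA_DMMV_Y=2    # old spelling, still accepted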

convert.py (+1)

@@ -828,6 +828,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:


SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
+    'BF16': DT_BF16,
    'F16': DT_F16,
    'F32': DT_F32,
    'I32': DT_I32,

examples/baby-llama/baby-llama.cpp (+18, -6)

@@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) {
    return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
}

+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
struct ggml_tensor * randomize_tensor(
        struct ggml_tensor * tensor,
        int ndims,

@@ -1569,6 +1580,8 @@ int main(int argc, char ** argv) {
    int n_tokens = model.hparams.n_ctx;
    int n_vocab = model.hparams.n_vocab;

+    std::vector<uint8_t> work_buffer;
+
    for (int ex=0; ex<n_examples; ++ex) {
        struct ggml_init_params params = {
            /*.mem_size =*/ compute_size,

@@ -1586,7 +1599,6 @@
        int n_past = 0;

        ggml_cgraph gf = {};
-        gf.n_threads = 1;

        get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

@@ -1595,7 +1607,7 @@
        struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

        ggml_build_forward_expand(&gf, e);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

        float error_before_opt = ggml_get_f32_1d(e, 0);

@@ -1611,7 +1623,7 @@
        ggml_opt(ctx0, opt_params_lbfgs, e);
        //
        ggml_build_forward_expand(&gf, e);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

        float error_after_opt = ggml_get_f32_1d(e, 0);

@@ -1659,13 +1671,12 @@
        struct ggml_context * ctx0 = ggml_init(params);

        ggml_cgraph gf = {};
-        gf.n_threads = 1;

        int n_past = 0;
        struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);

        ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

        struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
        struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);

@@ -1687,10 +1698,11 @@
    }

    print_matrix(model.tok_embeddings);
-
    printf("done\n");
+
    // ggml_free(kv_self.ctx);
    // ggml_free(model_lora.ctx);
    ggml_free(model.ctx);
+
    return 0;
}

examples/benchmark/benchmark-matmult.cpp (+20, -9)

@@ -20,6 +20,17 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
float tensor_sum_elements(const ggml_tensor * tensor) {
    float sum = 0;
    if (tensor->type==GGML_TYPE_F32) {

@@ -159,13 +170,14 @@ int main(int argc, char ** argv) {
    // printf("Creating compute graph\n");
    struct ggml_cgraph gf = ggml_build_forward(m11xm2);

-    gf.n_threads=benchmark_params.n_threads;
-    printf("cgraph->n_threads=%i\n",gf.n_threads);
+    printf("n_threads=%i\n", benchmark_params.n_threads);

    TENSOR_DUMP(m11);
    TENSOR_DUMP(m2);

-    ggml_graph_compute(ctx, &gf);
+    std::vector<uint8_t> work_buffer;
+
+    ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);

    TENSOR_DUMP(gf.nodes[0]);

@@ -187,7 +199,6 @@

    // printf("Creating compute graph\n");
    struct ggml_cgraph gf31 = ggml_build_forward(q31);
-    gf31.n_threads=benchmark_params.n_threads;

    // Set up a second graph computation to make sure we override the CPU cache lines
    // printf("Creating new tensor q12 & Running quantize\n");

@@ -199,8 +210,7 @@

    //printf("Creating compute graph\n");
    struct ggml_cgraph gf32 = ggml_build_forward(q32);
-    gf32.n_threads=benchmark_params.n_threads;
-    printf("cgraph->n_threads=%i\n",gf31.n_threads);
+    printf("n_threads=%i\n", benchmark_params.n_threads);

    const int dimx = sizex;
    const int dimy = sizey;

@@ -221,14 +231,15 @@

        long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute(ctx, &gf31);
+        ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
+
        long long int stop = ggml_time_us();
        long long int usec = stop-start;
        double gflops = (double)(flops_per_matrix)/usec/1000.0;
        gflops_sum += gflops;
        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
            i,
-            gf31.n_threads,
+            benchmark_params.n_threads,
            sizex, sizey, sizez, flops_per_matrix,
            usec,gflops);

@@ -253,7 +264,7 @@
        }

        // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute(ctx, &gf32);
+        ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
    }
    printf("\n");
    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
