
Commit 8ee4082

joeatodd authored and Alcpz committed
Revert "llama : offload to RPC in addition to other backends (ggml-org#7640)" (ggml-org#7981)
This reverts commit bde7cd3.
1 parent a4467e0 · commit 8ee4082

6 files changed, +53 -86 lines changed


Makefile (+5 -24)

@@ -69,10 +69,6 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif
 
-ifdef LLAMA_RPC
-	BUILD_TARGETS += rpc-server
-endif
-
 default: $(BUILD_TARGETS)
 
 test: $(TEST_TARGETS)
@@ -433,11 +429,6 @@ ifdef LLAMA_BLIS
 	MK_LDFLAGS += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS
 
-ifdef LLAMA_RPC
-	MK_CPPFLAGS += -DGGML_USE_RPC
-	OBJS += ggml-rpc.o
-endif # LLAMA_RPC
-
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
 	LLAMA_CUDA := 1
@@ -647,26 +638,11 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
-
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif
 
-ifdef LLAMA_RPC
-ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-endif # LLAMA_RPC
-
 GF_CC := $(CC)
 include scripts/get-flags.mk
 
@@ -746,9 +722,14 @@ unicode.o: unicode.cpp unicode.h
 unicode-data.o: unicode-data.cpp unicode-data.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@

ggml-alloc.c (+3 -3)

@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(tensor);
+            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
         }
     } else {
         if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
-                ggml_backend_view_init(t);
+                ggml_backend_view_init(buffer, t);
             }
         } else {
             if (t->view_src != NULL && t->buffer == NULL) {
                 // view of a pre-allocated tensor
-                ggml_backend_view_init(t);
+                ggml_backend_view_init(buffer, t);
             }
         }
     }

ggml-backend.c (+5 -5)

@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
 bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
     ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
     if (dst_buf->iface.cpy_tensor) {
-        return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
+        return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
     }
     return false;
 }
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
 
 // utils
 
-void ggml_backend_view_init(struct ggml_tensor * tensor) {
+void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
 
-    tensor->buffer = tensor->view_src->buffer;
+    tensor->buffer = buffer;
     tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
+    ggml_backend_buffer_init_tensor(buffer, tensor);
 }
 
 void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
         struct ggml_tensor * dst = node_copies[id];
         if (dst->view_src != NULL) {
             graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
-            ggml_backend_view_init(dst);
+            ggml_backend_view_init(dst->view_src->buffer, dst);
         }
         else {
             ggml_backend_tensor_copy(src, dst);

ggml-backend.h (+1 -1)

@@ -225,7 +225,7 @@ extern "C" {
 
     // Tensor initialization
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
+    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
 
 
 #ifdef __cplusplus
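For reference, a minimal sketch of how the restored two-argument ggml_backend_view_init is called: the buffer that owns the view's storage is passed explicitly instead of being taken from view_src. The CPU backend, the sizes, and the variable names below are illustrative assumptions, not code from this commit.

// illustrative only; assumes the ggml headers from this tree
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main(void) {
    struct ggml_init_params ip = {
        /* .mem_size   = */ 16*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true,   // tensor data will live in a backend buffer
    };
    struct ggml_context * ctx = ggml_init(ip);
    ggml_backend_t backend = ggml_backend_cpu_init();

    // a regular tensor, allocated into a backend buffer
    struct ggml_tensor * base = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    // a view created afterwards has no buffer yet; after this revert the owning
    // buffer is passed explicitly when the view is initialized
    struct ggml_tensor * view = ggml_view_1d(ctx, base, 512, 0);
    ggml_backend_view_init(buf, view);   // previously: ggml_backend_view_init(view);

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}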

ggml-rpc.cpp (+2 -2)

@@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
     if (remote_ptr != 0) {
         ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
             ggml_backend_rpc_buffer_interface,
-            new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
+            new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
             remote_size);
         return buffer;
     } else {
@@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
 GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
     ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
         /* .endpoint = */ endpoint,
-        /* .name     = */ "RPC[" + std::string(endpoint) + "]",
+        /* .name     = */ "RPC",
     };
 
     ggml_backend_t backend = new ggml_backend {
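The name change above only affects how RPC buffers and backends report themselves; the entry points keep the signatures that appear in this diff. A minimal sketch of connecting to a running rpc-server, with a placeholder endpoint (a server must already be listening there and the build must include the RPC backend):

// illustrative only; not part of this commit
#include "ggml-backend.h"
#include "ggml-rpc.h"
#include <cstdio>

int main(void) {
    const char * endpoint = "127.0.0.1:50052";   // hypothetical rpc-server address

    size_t free_mem = 0, total_mem = 0;
    ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
    std::printf("remote device: %zu free / %zu total bytes\n", free_mem, total_mem);

    ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
    if (backend == nullptr) {
        std::fprintf(stderr, "failed to connect to %s\n", endpoint);
        return 1;
    }
    // ... allocate tensors with ggml_backend_rpc_buffer_type(endpoint) and run
    //     graphs with ggml_backend_graph_compute(backend, graph) ...
    ggml_backend_free(backend);
    return 0;
}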

llama.cpp (+37 -51)

@@ -2370,34 +2370,13 @@ struct llama_context {
     struct llama_control_vector cvec;
 };
 
-static size_t llama_get_device_count(const llama_model & model) {
-    size_t count = 1;
-#if defined(GGML_USE_CUDA)
-    count = ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    count = ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    count = ggml_backend_vk_get_device_count();
-#endif
-#if defined(GGML_USE_RPC)
-    count += model.rpc_servers.size();
-#endif
-    return count;
-    GGML_UNUSED(model);
-}
-
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;
 
-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
-    int rpc_count = (int)model.rpc_servers.size();
-    if (gpu >= dev_count - rpc_count) {
-        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
-        return ggml_backend_rpc_buffer_type(endpoint);
-    }
-#endif
-#if defined(GGML_USE_METAL)
+#ifdef GGML_USE_RPC
+    std::string endpoint = model.rpc_servers[gpu];
+    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+#elif defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2443,19 +2422,29 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
     GGML_UNUSED(tensor_split);
 }
 
-static size_t llama_get_device_memory(const llama_model & model, int device) {
+static size_t llama_get_device_count(const llama_model & model) {
 #if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
-    int rpc_count = (int)model.rpc_servers.size();
-    if (device >= dev_count - rpc_count) {
-        size_t total;
-        size_t free;
-        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
-        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
-        return free;
-    }
+    return model.rpc_servers.size();
+#elif defined(GGML_USE_CUDA)
+    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    return ggml_backend_vk_get_device_count();
+#else
+    return 1;
 #endif
-#if defined(GGML_USE_CUDA)
+    GGML_UNUSED(model);
+}
+
+static size_t llama_get_device_memory(const llama_model & model, int device) {
+#if defined(GGML_USE_RPC)
+    size_t total;
+    size_t free;
+    std::string endpoint = model.rpc_servers[device];
+    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+    return free;
+#elif defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &free, &total);
@@ -15995,7 +15984,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
+    if (params.rpc_servers != nullptr) {
        // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
@@ -16158,7 +16147,17 @@ struct llama_context * llama_new_context_with_model(
 
     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+        for (auto & server : model->rpc_servers) {
+            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
+#elif defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
@@ -16250,19 +16249,6 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
-#endif
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
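With the revert, GGML_USE_RPC takes over the whole backend-selection chain above (RPC is selected instead of, not in addition to, Metal/CUDA/SYCL/Vulkan), and the endpoint list is consumed exactly as parsed in llama_load_model_from_file. A minimal sketch of the calling side, with placeholder endpoints and model path; the helpers used are the standard llama.h API, nothing here is added by this commit.

// illustrative only; assumes a build with the RPC backend enabled
#include "llama.h"
#include <cstdio>

int main(void) {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.rpc_servers = "192.168.1.10:50052,192.168.1.11:50052"; // parsed into model->rpc_servers
    mparams.n_gpu_layers = 99;                                     // layers offloaded to the RPC devices

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {   // e.g. an RPC endpoint could not be reached
        llama_free_model(model);
        return 1;
    }

    // ... tokenize and decode as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}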
