Commit cfa9539

ggerganov authored and iThalay committed

whisper : load the model into multiple buffers of max size 1GB (ggml-org#1763)

1 parent dd2beeb

File tree

1 file changed: +53 −9 lines changed


whisper.cpp (+53 −9)
@@ -701,7 +701,7 @@ struct whisper_model {
     struct ggml_context * ctx;
 
     // the model backend data is read-only and can be shared between processors
-    struct ggml_backend_buffer * buffer;
+    std::vector<struct ggml_backend_buffer *> buffers;
 
     // tensors
     int n_loaded;
@@ -1514,24 +1514,64 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     wctx.backend = whisper_backend_init(wctx.params);
 
+    // some devices have a limit on the maximum size of single memory buffer
+    // for example, iPhones are limited to 1GB per buffer
+    // to workaround this, we will allocate multiple buffers of smaller size and will split the tensors with the
+    // model weights between them
+    //
+    // the map_t2b maps tensor names to buffer indices
+    // as we iterate over the tensors, we will allocate new buffers when the current one is full
+    //
+    // finally, we create a separate allocator for each buffer and use it to allocate the tensors
+    // we keep the allocators alive until all the tensors are loaded
+
+    GGML_ASSERT(model.buffers.empty());
+
+    std::map<std::string, int> map_t2b;
+
     {
         size_t size_main = 0;
+        size_t size_cur  = 0;
+
+        static const size_t GB = 1024ull*1024ull*1024ull;
 
         for (const auto & t : model.tensors) {
-            size_main += ggml_nbytes(t.second) + ggml_tensor_overhead();
+            const size_t cur = ggml_nbytes(t.second) + ggml_tensor_overhead();
+
+            // adding the tensor to the current buffer will exceed the limit, so we need to allocate a new buffer
+            if (size_cur + cur > GB) {
+                GGML_ASSERT(size_cur > 0 && "A tensor is too large to fit in a single buffer");
+
+                model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
+
+                size_cur = cur;
+            }
+
+            map_t2b[t.first] = model.buffers.size();
+
+            size_cur  += cur;
+            size_main += cur;
+        }
+
+        // allocate the last buffer if needed
+        if (size_cur > 0) {
+            model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
         }
 
-        model.buffer = ggml_backend_alloc_buffer(wctx.backend, size_main);
+        GGML_ASSERT(model.buffers.size() > 0);
 
-        WHISPER_LOG_INFO("%s: %8s buffer size = %8.2f MB\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6);
+        WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB (%d buffers)\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6, (int) model.buffers.size());
     }
 
-    ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+    std::vector<ggml_allocr *> allocs(model.buffers.size());
+    for (size_t i = 0; i < allocs.size(); ++i) {
+        allocs[i] = ggml_allocr_new_from_buffer(model.buffers[i]);
+    }
 
     // allocate tensors in the backend buffers
     {
         for (const auto & t : model.tensors) {
-            ggml_allocr_alloc(alloc, t.second);
+            ggml_allocr_alloc(allocs[map_t2b[t.first]], t.second);
         }
     }
 
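The heart of the change is the greedy split in the loop above: walk the tensors in order, close the current buffer whenever the next tensor would push it past the limit, and remember which buffer each tensor landed in. Below is a minimal standalone sketch of that bookkeeping, detached from ggml; `TensorInfo` and `plan_buffers` are hypothetical names for illustration, not part of whisper.cpp, and the sketch counts each tensor once rather than reproducing the loader's exact accounting.

```cpp
// Hypothetical sketch of the greedy buffer-splitting bookkeeping used above.
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct TensorInfo {
    std::string name;
    size_t      nbytes; // stands in for ggml_nbytes(t) + ggml_tensor_overhead()
};

// walk the tensors in order; whenever the next tensor would exceed `limit`,
// close the current buffer and start a new one
// returns the planned size of each buffer and fills `map_t2b` with
// tensor name -> buffer index (the role of map_t2b in the diff)
static std::vector<size_t> plan_buffers(
        const std::vector<TensorInfo> & tensors,
        std::map<std::string, int>    & map_t2b,
        size_t limit) {
    std::vector<size_t> buffer_sizes;

    size_t size_cur = 0;

    for (const auto & t : tensors) {
        if (size_cur + t.nbytes > limit) {
            // a single tensor larger than `limit` cannot be split across
            // buffers - the real code asserts on the same condition
            assert(size_cur > 0 && "a tensor is too large to fit in a single buffer");

            buffer_sizes.push_back(size_cur);
            size_cur = 0;
        }

        map_t2b[t.name] = (int) buffer_sizes.size();
        size_cur += t.nbytes;
    }

    // close the last, partially-filled buffer
    if (size_cur > 0) {
        buffer_sizes.push_back(size_cur);
    }

    return buffer_sizes;
}

int main() {
    // made-up sizes, scaled down to a 1 MB limit to keep the example small
    const size_t MB = 1024ull*1024ull;

    const std::vector<TensorInfo> tensors = {
        { "encoder.w", 600*1024 },
        { "decoder.w", 600*1024 },
        { "head.w",    300*1024 },
    };

    std::map<std::string, int> map_t2b;
    const std::vector<size_t> sizes = plan_buffers(tensors, map_t2b, 1*MB);

    for (size_t i = 0; i < sizes.size(); ++i) {
        printf("buffer %d: %zu bytes\n", (int) i, sizes[i]);
    }
    for (const auto & kv : map_t2b) {
        printf("%s -> buffer %d\n", kv.first.c_str(), kv.second);
    }

    return 0;
}
```

In the actual loader the same walk also accumulates `size_main` for the log line, and each planned size is passed directly to `ggml_backend_alloc_buffer` as the buffers are closed.
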
@@ -1632,7 +1672,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         }
     }
 
-    ggml_allocr_free(alloc);
+    for (auto & alloc : allocs) {
+        ggml_allocr_free(alloc);
+    }
 
     wctx.t_load_us = ggml_time_us() - t_start_us;
 
@@ -3376,8 +3418,10 @@ void whisper_free(struct whisper_context * ctx) {
         ggml_free(ctx->model.ctx);
     }
 
-    if (ctx->model.buffer) {
-        ggml_backend_buffer_free(ctx->model.buffer);
+    for (auto & buffer : ctx->model.buffers) {
+        if (buffer) {
+            ggml_backend_buffer_free(buffer);
+        }
     }
 
     whisper_free_state(ctx->state);
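With the buffers held in a plain vector of raw pointers, every teardown site needs a loop like the one above. As a hedged aside: if `ggml_backend_buffer_free` (the call used in this diff) is the only per-buffer cleanup required, the vector could instead own the buffers through `std::unique_ptr` with a custom deleter, making the loop implicit. `buffer_ptr` below is a hypothetical alias, not part of whisper.cpp, and the snippet assumes linking against ggml.

```cpp
// Hypothetical RAII ownership for the backend buffers; assumes
// ggml_backend_buffer_free() is the complete per-buffer cleanup,
// as in the loop above.
#include <memory>
#include <vector>

struct ggml_backend_buffer; // opaque ggml type

extern "C" void ggml_backend_buffer_free(struct ggml_backend_buffer * buffer);

struct buffer_deleter {
    void operator()(struct ggml_backend_buffer * buffer) const {
        // defensive null check, mirroring the diff; unique_ptr only invokes
        // the deleter for non-null pointers anyway
        if (buffer) {
            ggml_backend_buffer_free(buffer);
        }
    }
};

using buffer_ptr = std::unique_ptr<struct ggml_backend_buffer, buffer_deleter>;

// with `std::vector<buffer_ptr> buffers;` in whisper_model, destroying the
// model (or calling buffers.clear()) frees every buffer exactly once
```

The commit keeps raw pointers, which matches the surrounding C-style codebase; the explicit loop in `whisper_free` is equivalent.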
