@@ -701,7 +701,7 @@ struct whisper_model {
     struct ggml_context * ctx;
 
     // the model backend data is read-only and can be shared between processors
-    struct ggml_backend_buffer * buffer;
+    std::vector<struct ggml_backend_buffer *> buffers;
 
     // tensors
     int n_loaded;
@@ -1514,24 +1514,64 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     wctx.backend = whisper_backend_init(wctx.params);
 
+    // some devices have a limit on the maximum size of single memory buffer
+    // for example, iPhones are limited to 1GB per buffer
+    // to workaround this, we will allocate multiple buffers of smaller size and will split the tensors with the
+    // model weights between them
+    //
+    // the map_t2b maps tensor names to buffer indices
+    // as we iterate over the tensors, we will allocate new buffers when the current one is full
+    //
+    // finally, we create a separate allocator for each buffer and use it to allocate the tensors
+    // we keep the allocators alive until all the tensors are loaded
+
+    GGML_ASSERT(model.buffers.empty());
+
+    std::map<std::string, int> map_t2b;
+
     {
         size_t size_main = 0;
+        size_t size_cur  = 0;
+
+        static const size_t GB = 1024ull*1024ull*1024ull;
 
         for (const auto & t : model.tensors) {
-            size_main += ggml_nbytes(t.second) + ggml_tensor_overhead();
+            const size_t cur = ggml_nbytes(t.second) + ggml_tensor_overhead();
+
+            // adding the tensor to the current buffer will exceed the limit, so we need to allocate a new buffer
+            if (size_cur + cur > GB) {
+                GGML_ASSERT(size_cur > 0 && "A tensor is too large to fit in a single buffer");
+
+                model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
+
+                size_cur = cur;
+            }
+
+            map_t2b[t.first] = model.buffers.size();
+
+            size_cur  += cur;
+            size_main += cur;
+        }
+
+        // allocate the last buffer if needed
+        if (size_cur > 0) {
+            model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
         }
 
-        model.buffer = ggml_backend_alloc_buffer(wctx.backend, size_main);
+        GGML_ASSERT(model.buffers.size() > 0);
 
-        WHISPER_LOG_INFO("%s: %8s buffer size = %8.2f MB\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6);
+        WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB (%d buffers)\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6, (int) model.buffers.size());
     }
 
-    ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+    std::vector<ggml_allocr *> allocs(model.buffers.size());
+    for (size_t i = 0; i < allocs.size(); ++i) {
+        allocs[i] = ggml_allocr_new_from_buffer(model.buffers[i]);
+    }
 
     // allocate tensors in the backend buffers
     {
         for (const auto & t : model.tensors) {
-            ggml_allocr_alloc(alloc, t.second);
+            ggml_allocr_alloc(allocs[map_t2b[t.first]], t.second);
         }
     }
 
@@ -1632,7 +1672,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         }
     }
 
-    ggml_allocr_free(alloc);
+    for (auto & alloc : allocs) {
+        ggml_allocr_free(alloc);
+    }
 
     wctx.t_load_us = ggml_time_us() - t_start_us;
 
@@ -3376,8 +3418,10 @@ void whisper_free(struct whisper_context * ctx) {
         ggml_free(ctx->model.ctx);
     }
 
-    if (ctx->model.buffer) {
-        ggml_backend_buffer_free(ctx->model.buffer);
+    for (auto & buffer : ctx->model.buffers) {
+        if (buffer) {
+            ggml_backend_buffer_free(buffer);
+        }
    }
 
     whisper_free_state(ctx->state);
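The core of the change is the greedy bucketing in whisper_model_load: tensor sizes are accumulated into the current buffer until adding the next tensor would push it past the 1 GB cap, at which point the current buffer is sealed and a new one is started, while map_t2b records which buffer each tensor landed in. Below is a minimal standalone sketch of that idea, independent of ggml; the plan_buffers helper and the plain name-to-size map are illustrative only and are not code from this commit.

// Standalone sketch of the greedy buffer-splitting idea from this commit.
// Names and inputs are illustrative; the real code works on ggml tensors
// and allocates ggml backend buffers instead of collecting sizes.
#include <cstddef>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

static const size_t GB = 1024ull*1024ull*1024ull;

// Returns the planned size of each buffer and fills name -> buffer index.
static std::vector<size_t> plan_buffers(
        const std::map<std::string, size_t> & tensor_sizes,
        std::map<std::string, int> & map_t2b) {
    std::vector<size_t> buffer_sizes;

    size_t size_cur = 0;

    for (const auto & t : tensor_sizes) {
        const size_t cur = t.second;

        // the next tensor would exceed the per-buffer cap -> seal the current buffer
        if (size_cur + cur > GB) {
            buffer_sizes.push_back(size_cur);
            size_cur = 0;
        }

        // the tensor goes into the buffer that is currently being filled
        map_t2b[t.first] = (int) buffer_sizes.size();

        size_cur += cur;
    }

    // seal the last, partially filled buffer
    if (size_cur > 0) {
        buffer_sizes.push_back(size_cur);
    }

    return buffer_sizes;
}

int main() {
    const std::map<std::string, size_t> sizes = {
        {"decoder.weight", 600*1024*1024ull},
        {"encoder.weight", 700*1024*1024ull},
        {"proj.weight",    200*1024*1024ull},
    };

    std::map<std::string, int> map_t2b;
    const std::vector<size_t> buffers = plan_buffers(sizes, map_t2b);

    for (const auto & kv : map_t2b) {
        printf("%s -> buffer %d\n", kv.first.c_str(), kv.second);
    }
    printf("%zu buffers planned\n", buffers.size());
}

Because tensors are visited in order, each buffer ends up holding a contiguous run of tensors, which keeps the per-buffer allocators simple: every tensor is later allocated from the one allocator that owns its buffer.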