@@ -1583,91 +1583,6 @@ struct llama_mmap {
     llama_mmap() {}
 };

-struct llama_anonymous_mmap : llama_mmap {
-    llama_file * file;
-
-    llama_anonymous_mmap(const llama_anonymous_mmap &) = delete;
-
-#ifdef _POSIX_MAPPED_FILES
-#ifndef MAP_ANONYMOUS
-#define MAP_ANONYMOUS MAP_ANON
-#endif
-    llama_anonymous_mmap(struct llama_file * file) {
-        this->file = file;
-        size = file->size;
-        addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-        if (addr == MAP_FAILED) { // NOLINT
-            throw std::runtime_error(format("mmap(.., MAP_ANONYMOUS) failed: %s", strerror(errno)));
-        }
-#ifdef __linux__
-        // THP is enabled by default for anonymous memory mappings on madvise
-        if (madvise(addr, size, MADV_HUGEPAGE)) {
-            LLAMA_LOG_WARN("warning: madvise(.., MADV_HUGEPAGE) failed: %s\n", strerror(errno));
-        }
-#endif
-        mapped_fragments.emplace_back(0, size);
-    }
-
-    void populate(size_t first, size_t last) const override {
-        int page_size = sysconf(_SC_PAGESIZE);
-
-        align_to_previous_page(&first, page_size);
-        align_to_next_page(&last, page_size);
-
-        size_t bytes_read = file->read_direct((char *) addr + first, last - first, first);
-        if (bytes_read != std::min(last, file->size) - first) {
-            throw std::runtime_error("unexpectedly reached end of file");
-        }
-    }
-#elif defined(_WIN32)
-    llama_anonymous_mmap(struct llama_file * file) {
-        this->file = file;
-        size = file->size;
-
-        HANDLE hMapping = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, size >> 32, size, NULL);
-        if (hMapping == NULL) {
-            throw std::runtime_error(format("CreateFileMapping failed: %s", llama_format_win_err(GetLastError()).c_str()));
-        }
-
-        addr = MapViewOfFile(hMapping, FILE_MAP_ALL_ACCESS, 0, 0, size);
-        DWORD dwError = GetLastError();
-
-        CloseHandle(hMapping);
-
-        if (addr == NULL) {
-            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(dwError).c_str()));
-        }
-    }
-
-    void populate(size_t first, size_t last) const override {
-        SYSTEM_INFO siSysInfo;
-        GetSystemInfo(&siSysInfo);
-        DWORD dwPageSize = siSysInfo.dwPageSize;
-
-        align_to_previous_page(&first, dwPageSize);
-        align_to_next_page(&last, dwPageSize);
-
-        size_t bytes_read = file->read_direct((char *) addr + first, last - first, first);
-        if (bytes_read != std::min(last, file->size) - first) {
-            throw std::runtime_error("unexpectedly reached end of file");
-        }
-    }
-#else
-    llama_anonymous_mmap(struct llama_file * file) {
-        GGML_UNUSED(file);
-
-        throw std::runtime_error("mmap not supported");
-    }
-
-    void populate(size_t first, size_t last) const override {
-        GGML_UNUSED(first);
-        GGML_UNUSED(last);
-
-        throw std::runtime_error("mmap not supported");
-    }
-#endif
-};
-
 using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;

 // Represents some region of memory being locked using mlock or VirtualLock;
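The `populate()` overrides removed above call `align_to_previous_page` and `align_to_next_page`, which are defined elsewhere in the PR and not shown in this excerpt. As a hedged sketch of the page rounding they presumably perform (assuming power-of-two page sizes):

    #include <cstddef>

    // Sketch only: the real helpers live outside this diff. They round an
    // offset down/up to a page boundary; page_size must be a power of two.
    static void align_to_previous_page(size_t * offset, size_t page_size) {
        *offset &= ~(page_size - 1);
    }

    static void align_to_next_page(size_t * offset, size_t page_size) {
        *offset = (*offset + page_size - 1) & ~(page_size - 1);
    }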
@@ -3470,13 +3385,16 @@ struct llama_model_loader {
         }

-        // either file or anonymous mappings
-        this->use_mmap = use_mmap || use_direct_io;
+        // file mmap and direct I/O are mutually exclusive
+        this->use_mmap = use_mmap && !use_direct_io;
         this->use_direct_io = use_direct_io;

         this->check_tensors = check_tensors;
     }

     ~llama_model_loader() {
+        // wait for any in-flight direct-I/O load tasks before the loader
+        // (and the files they read from) are torn down
+        for (auto & task : load_tasks) {
+            task.wait();
+        }
         if (meta) {
             gguf_free(meta);
         }
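Draining `load_tasks` in the destructor guarantees no asynchronous task is still writing into tensor memory when the loader's members are destroyed. A future returned by `std::async` does block in its own destructor, but waiting explicitly makes the ordering deterministic. A minimal standalone illustration of the pattern (not llama.cpp code):

    #include <future>
    #include <vector>

    struct loader_like {
        std::vector<std::future<void>> tasks;

        ~loader_like() {
            // drain all workers before members (and the buffers they write to)
            // go away; wait() does not rethrow, so task errors must be observed
            // via get() elsewhere
            for (auto & t : tasks) {
                if (t.valid()) {
                    t.wait();
                }
            }
        }
    };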
@@ -3668,12 +3586,12 @@ struct llama_model_loader {
         }
     }

-    void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr, bool anonymous = false) {
+    void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
         if (use_mmap) {
             mappings.reserve(files.size());
             mmaps_used.reserve(files.size());
             for (const auto & file : files) {
-                std::unique_ptr<llama_mmap> mapping(anonymous ? new llama_anonymous_mmap(file.get()) : new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+                std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
                 mmaps_used.emplace_back(mapping->size, 0);
                 if (mlock_mmaps) {
                     std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
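For context, the second `llama_mmap` argument is a prefetch byte count, so `prefetch ? -1 : 0` means "read ahead over the whole file" or "no readahead". An abridged sketch of what the pre-existing POSIX constructor does with it (the real code also handles NUMA and `MAP_POPULATE`; the `file->fp` usage is assumed from the surrounding source):

    // Abridged sketch of the POSIX llama_mmap constructor's prefetch handling.
    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
        size = file->size;
        int fd = fileno(file->fp);
        addr = mmap(NULL, file->size, PROT_READ, MAP_SHARED, fd, 0);
        if (prefetch > 0) {
            // advise the kernel to preload up to `prefetch` bytes of the mapping
            posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED);
        }
    }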
@@ -3741,6 +3659,26 @@ struct llama_model_loader {
     size_t size_done = 0;
     size_t size_data = 0;
     std::vector<std::pair<size_t, size_t>> mmaps_used;
+    std::vector<std::future<void>> load_tasks;
+
+    void add_load_task(llama_file * file, const llama_tensor_weight * weight, ggml_tensor * cur) {
+        load_tasks.emplace_back(std::async(std::launch::async, [file, cur, weight] {
+            // O_DIRECT requires the file offset, buffer address and read size
+            // to be page-aligned: round the offset down and pad the size up
+            size_t page_size = sysconf(_SC_PAGESIZE);
+            size_t aligned_offset = weight->offs & ~(page_size - 1);
+            size_t diff = weight->offs - aligned_offset;
+            size_t read_size = GGML_PAD(ggml_nbytes(cur) + diff, page_size);
+            // over-allocate by one page so the buffer itself can be aligned
+            std::vector<no_init<uint8_t>> read_buf(read_size + page_size);
+            uint8_t * read_buf_dio;
+            size_t page_offset = (uintptr_t) read_buf.data() % page_size;
+            if (page_offset > 0) {
+                read_buf_dio = (uint8_t *) read_buf.data() + page_size - page_offset;
+            } else {
+                read_buf_dio = (uint8_t *) read_buf.data();
+            }
+            size_t bytes_read = file->read_direct(read_buf_dio, read_size, aligned_offset);
+            if (bytes_read < ggml_nbytes(cur) + diff) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+            ggml_backend_tensor_set(cur, read_buf_dio + diff, 0, ggml_nbytes(cur));
+        }));
+    }

     // Returns false if cancelled by progress_callback
     bool load_all_data(
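To make the alignment arithmetic in `add_load_task` concrete: `GGML_PAD(x, n)` in ggml rounds `x` up to a multiple of `n`. A self-contained check with illustrative values (not taken from the model format):

    #include <cassert>
    #include <cstddef>

    // Worked example of the O_DIRECT alignment math, assuming a 4 KiB page
    // and a 10000-byte tensor stored at file offset 5000.
    int main() {
        const size_t page_size      = 4096;
        const size_t offs           = 5000;   // weight->offs
        const size_t nbytes         = 10000;  // ggml_nbytes(cur)

        const size_t aligned_offset = offs & ~(page_size - 1);  // 4096
        const size_t diff           = offs - aligned_offset;    // 904
        const size_t read_size      =                           // GGML_PAD -> 12288
            (nbytes + diff + page_size - 1) & ~(page_size - 1);

        assert(aligned_offset % page_size == 0);
        assert(read_size % page_size == 0);
        assert(aligned_offset + read_size >= offs + nbytes); // read covers the tensor
        return 0;
    }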
@@ -3751,15 +3689,6 @@ struct llama_model_loader {
             void * progress_callback_user_data) {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");

-        if (use_mmap) {
-            for (uint32_t idx = 0; idx < files.size(); idx++) {
-                void * addr = nullptr;
-                size_t first, last;
-                get_mapping_range(&first, &last, &addr, idx, ctx);
-                mappings.at(idx)->populate(first, last);
-            }
-        }
-
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
@@ -3811,7 +3740,11 @@ struct llama_model_loader {
             const auto & file = files.at(weight->idx);
             if (ggml_backend_buffer_is_host(cur->buffer)) {
                 file->seek(weight->offs, SEEK_SET);
-                file->read_raw(cur->data, n_size);
-                if (check_tensors) {
+                if (use_direct_io) {
+                    add_load_task(file.get(), weight, cur);
+                } else {
+                    file->read_raw(cur->data, n_size);
+                }
+                // direct-I/O reads complete asynchronously, so only data that
+                // was read synchronously can be validated here
+                if (check_tensors && !use_direct_io) {
                     validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                         return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -3820,14 +3753,31 @@ struct llama_model_loader {
             } else {
                 read_buf.resize(n_size);
                 file->seek(weight->offs, SEEK_SET);
-                file->read_raw(read_buf.data(), n_size);
-                ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
-                if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                if (use_direct_io) {
+                    add_load_task(file.get(), weight, cur);
+                } else {
+                    file->read_raw(read_buf.data(), n_size);
+                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                }
+                // read_buf is only filled on the synchronous path, so skip
+                // validation when the tensor is loaded via direct I/O
+                if (check_tensors && !use_direct_io && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                     throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
                 }
             }
         }

+            // cap the number of in-flight direct-I/O tasks, reaping whichever
+            // future becomes ready first before admitting more work
+            const size_t max_load_tasks = 8;
+            while (load_tasks.size() >= max_load_tasks) {
+                size_t n_tasks = load_tasks.size();
+                for (size_t i = 0; i < n_tasks; i++) {
+                    auto & future = load_tasks.at(i);
+                    if (future.wait_for(std::chrono::milliseconds(10)) == std::future_status::ready) {
+                        future.get(); // rethrows any error from the load task
+                        load_tasks.erase(load_tasks.begin() + i);
+                        break;
+                    }
+                }
+            }
+
             size_done += n_size;
         }
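The drain loop above is a simple polling bound on concurrency. A compact standalone version of the same pattern (not llama.cpp code) looks like this:

    #include <chrono>
    #include <future>
    #include <utility>
    #include <vector>

    // Bounded task pool: block whenever `tasks` is full, reap the first future
    // that becomes ready, then admit the next unit of work.
    template <typename F>
    void submit_bounded(std::vector<std::future<void>> & tasks, size_t limit, F && fn) {
        while (tasks.size() >= limit) {
            for (size_t i = 0; i < tasks.size(); i++) {
                if (tasks[i].wait_for(std::chrono::milliseconds(10)) == std::future_status::ready) {
                    tasks[i].get(); // propagate exceptions from the worker
                    tasks.erase(tasks.begin() + i);
                    break;
                }
            }
        }
        tasks.emplace_back(std::async(std::launch::async, std::forward<F>(fn)));
    }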
@@ -6219,7 +6169,7 @@ static bool llm_load_tensors(
     ml.done_getting_tensors();

-    ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr, /* anonymous */ ml.use_direct_io);
+    ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
     model.mappings.reserve(ml.mappings.size());

     // create the backend buffers
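How `use_direct_io` reaches `llama_model_loader` is outside this diff; presumably it is threaded through the model parameters much like `use_mmap`. A hypothetical sketch of that wiring (the field name is an assumption, not confirmed by this commit):

    // Hypothetical: mirrors how use_mmap is plumbed through the public params.
    struct llama_model_params_sketch {
        bool use_mmap;      // map the file and let the page cache back the weights
        bool use_direct_io; // bypass the page cache with O_DIRECT reads instead
    };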