@@ -1374,6 +1374,7 @@ using llama_files = std::vector<std::unique_ptr<llama_file>>;
1374
1374
struct llama_mmap {
1375
1375
void * addr;
1376
1376
size_t size;
1377
+ bool prefetch;
1377
1378
1378
1379
llama_mmap(const llama_mmap &) = delete;
1379
1380
@@ -1402,6 +1403,7 @@ struct llama_mmap {
1402
1403
1403
1404
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
1404
1405
size = file->size;
1406
+ this->prefetch = prefetch > 0;
1405
1407
int fd = fileno(file->fp);
1406
1408
int flags = MAP_SHARED;
1407
1409
// prefetch/readahead impairs performance on NUMA systems
@@ -1503,6 +1505,7 @@ struct llama_mmap {
1503
1505
GGML_UNUSED(numa);
1504
1506
1505
1507
size = file->size;
1508
+ this->prefetch = prefetch > 0;
1506
1509
1507
1510
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
1508
1511
@@ -1592,9 +1595,10 @@ struct llama_anonymous_mmap : llama_mmap {
1592
1595
#ifndef MAP_ANONYMOUS
1593
1596
#define MAP_ANONYMOUS MAP_ANON
1594
1597
#endif
1595
- llama_anonymous_mmap(struct llama_file * file) {
1598
+ llama_anonymous_mmap(struct llama_file * file, bool prefetch ) {
1596
1599
this->file = file;
1597
1600
size = file->size;
1601
+ this->prefetch = prefetch;
1598
1602
addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
1599
1603
if (addr == MAP_FAILED) { // NOLINT
1600
1604
throw std::runtime_error(format("mmap(.., MAP_ANONYMOUS) failed: %s", strerror(errno)));
@@ -1620,9 +1624,10 @@ struct llama_anonymous_mmap : llama_mmap {
1620
1624
}
1621
1625
}
1622
1626
#elif defined(_WIN32)
1623
- llama_anonymous_mmap(struct llama_file * file) {
1627
+ llama_anonymous_mmap(struct llama_file * file, bool prefetch ) {
1624
1628
this->file = file;
1625
1629
size = file->size;
1630
+ this->prefetch = prefetch;
1626
1631
1627
1632
HANDLE hMapping = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, size >> 32, size, NULL);
1628
1633
if (hMapping == NULL) {
@@ -3699,7 +3704,7 @@ struct llama_model_loader {
3699
3704
if (use_mmap) {
3700
3705
mappings.reserve(files.size());
3701
3706
for (const auto & file : files) {
3702
- std::unique_ptr<llama_mmap> mapping(anonymous ? new llama_anonymous_mmap(file.get()) : new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
3707
+ std::unique_ptr<llama_mmap> mapping(anonymous ? new llama_anonymous_mmap(file.get(), prefetch ) : new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
3703
3708
if (mlock_mmaps) {
3704
3709
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
3705
3710
mlock_mmap->init(mapping->addr);
@@ -3799,8 +3804,11 @@ struct llama_model_loader {
3799
3804
3800
3805
if (use_mmap) {
3801
3806
for (uint32_t idx = 0; idx < files.size(); idx++) {
3802
- for (auto range : get_mapping_ranges(idx, ctx)) {
3803
- mappings.at(idx)->populate(range.first, range.second);
3807
+ const auto & mapping = mappings.at(idx);
3808
+ if (mapping->prefetch) {
3809
+ for (auto range : get_mapping_ranges(idx, ctx)) {
3810
+ mapping->populate(range.first, range.second);
3811
+ }
3804
3812
}
3805
3813
}
3806
3814
}
@@ -3838,6 +3846,9 @@ struct llama_model_loader {
3838
3846
}
3839
3847
3840
3848
GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
3849
+ if (!mapping->prefetch) {
3850
+ mapping->populate(weight->offs, weight->offs + n_size);
3851
+ }
3841
3852
if (buf_mmap && cur->data == nullptr) {
3842
3853
ggml_backend_tensor_alloc(buf_mmap, cur, data);
3843
3854
if (lmlocks) {
@@ -3846,7 +3857,7 @@ struct llama_model_loader {
3846
3857
}
3847
3858
} else {
3848
3859
ggml_backend_tensor_set(cur, data, 0, n_size);
3849
- mappings.at(weight->idx) ->unmap_fragment(weight->offs, weight->offs + n_size);
3860
+ mapping ->unmap_fragment(weight->offs, weight->offs + n_size);
3850
3861
}
3851
3862
} else {
3852
3863
GGML_ASSERT(weight->idx < files.size());
0 commit comments