Commit d129edb

Fine-grained mapping/unmapping
1 parent 46db350 commit d129edb

File tree

1 file changed: +33 −30 lines


llama.cpp

Lines changed: 33 additions & 30 deletions
@@ -3671,10 +3671,8 @@ struct llama_model_loader {
     void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr, bool anonymous = false) {
         if (use_mmap) {
             mappings.reserve(files.size());
-            mmaps_used.reserve(files.size());
             for (const auto & file : files) {
                 std::unique_ptr<llama_mmap> mapping(anonymous ? new llama_anonymous_mmap(file.get()) : new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
-                mmaps_used.emplace_back(mapping->size, 0);
                 if (mlock_mmaps) {
                     std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
                     mlock_mmap->init(mapping->addr);
@@ -3690,13 +3688,10 @@ struct llama_model_loader {
         }
     }
 
-    void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+    std::vector<std::pair<size_t, size_t>> get_mapping_ranges(int idx, ggml_context * ctx) const {
         GGML_ASSERT(!mappings.empty());
-        const auto & mapping = mappings.at(idx);
 
-        *first = mapping->size;
-        *last = 0;
-        *addr = mapping->addr;
+        std::vector<std::pair<size_t, size_t>> sorted;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
             try {
                 const auto * weight = get_weight(ggml_get_name(tensor));
@@ -3706,12 +3701,37 @@ struct llama_model_loader {
                 if (weight->idx != idx) {
                     continue;
                 }
-                *first = std::min(*first, weight->offs);
-                *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
+                sorted.emplace_back(weight->offs, weight->offs + ggml_nbytes(tensor));
             } catch(...) {
                 // the tensor is not in the model
             }
         }
+
+        std::sort(sorted.begin(), sorted.end(), [](std::pair<size_t, size_t> a, std::pair<size_t, size_t> b) { return a.first < b.first; });
+
+        std::vector<std::pair<size_t, size_t>> merged;
+        for (auto range : sorted) {
+            if (!merged.empty() && merged.back().second == range.first) {
+                auto last = merged.back();
+                merged.pop_back();
+                merged.emplace_back(last.first, range.second);
+            } else {
+                merged.emplace_back(range.first, range.second);
+            }
+        }
+
+        return merged;
+    }
+
+    void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+        GGML_ASSERT(!mappings.empty());
+
+        *addr = mappings.at(idx)->addr;
+
+        auto ranges = get_mapping_ranges(idx, ctx);
+        GGML_ASSERT(!ranges.empty());
+        *first = ranges.front().first;
+        *last = ranges.back().second;
     }
 
     // for backwards compatibility, does not support ggml-backend
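
The new get_mapping_ranges collects one [begin, end) byte range per tensor, sorts the ranges by start offset, and coalesces ranges that touch exactly. A minimal standalone sketch of that sort-and-merge step (the offsets below are made up for illustration, not taken from any real model):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
        // hypothetical per-tensor [begin, end) byte ranges within one mapping
        std::vector<std::pair<size_t, size_t>> sorted = {
            {4096, 8192}, {0, 4096}, {16384, 20480}, {8192, 12288},
        };
        std::sort(sorted.begin(), sorted.end(),
                  [](std::pair<size_t, size_t> a, std::pair<size_t, size_t> b) { return a.first < b.first; });

        // same rule as get_mapping_ranges: merge a range into the previous one
        // only when the two touch exactly (back().second == range.first)
        std::vector<std::pair<size_t, size_t>> merged;
        for (auto range : sorted) {
            if (!merged.empty() && merged.back().second == range.first) {
                merged.back().second = range.second;
            } else {
                merged.push_back(range);
            }
        }
        for (auto r : merged) {
            std::printf("[%zu, %zu)\n", r.first, r.second); // prints [0, 12288) then [16384, 20480)
        }
    }

Only exactly adjacent fragments coalesce: a gap between two tensors (for example alignment padding) keeps their ranges separate, so populate and unmap_fragment later operate on each fragment on its own.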
@@ -3740,7 +3760,6 @@ struct llama_model_loader {
 
     size_t size_done = 0;
     size_t size_data = 0;
-    std::vector<std::pair<size_t, size_t>> mmaps_used;
 
     // Returns false if cancelled by progress_callback
     bool load_all_data(
@@ -3753,10 +3772,9 @@ struct llama_model_loader {
 
         if (use_mmap) {
             for (uint32_t idx = 0; idx < files.size(); idx++) {
-                void * addr = nullptr;
-                size_t first, last;
-                get_mapping_range(&first, &last, &addr, idx, ctx);
-                mappings.at(idx)->populate(first, last);
+                for (auto range : get_mapping_ranges(idx, ctx)) {
+                    mappings.at(idx)->populate(range.first, range.second);
+                }
             }
         }
 
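load_all_data now populates each merged fragment of a mapping individually instead of one [first, last) span per file. populate itself is not part of this diff; purely as a sketch, for an anonymous mapping it could read each byte range from the backing file into the mapped region (hypothetical code, assuming a llama_file-style handle with seek/read_raw and the mapping's base address addr):

    // hypothetical sketch, not the actual llama_anonymous_mmap implementation:
    // copy the byte range [first, last) from the backing file into the
    // anonymous mapping at the same offset
    void populate(size_t first, size_t last) {
        file->seek(first, SEEK_SET);
        file->read_raw((char *) addr + first, last - first);
    }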

@@ -3799,12 +3817,9 @@ struct llama_model_loader {
                         const auto & lmlock = lmlocks->at(weight->idx);
                         lmlock->grow_to(weight->offs + n_size);
                     }
-
-                    auto & mmap_used = mmaps_used[weight->idx];
-                    mmap_used.first = std::min(mmap_used.first, weight->offs);
-                    mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                 } else {
                     ggml_backend_tensor_set(cur, data, 0, n_size);
+                    mappings.at(weight->idx)->unmap_fragment(weight->offs, weight->offs + n_size);
                 }
             } else {
                 GGML_ASSERT(weight->idx < files.size());
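
In the non-mmap-backed branch, the fragment that backed the tensor is now unmapped as soon as ggml_backend_tensor_set has copied it out, which is what makes the mmaps_used bookkeeping and the final cleanup pass (removed below) unnecessary. unmap_fragment is defined elsewhere in llama.cpp; the idea, sketched here for POSIX (the free-standing signature and exact rounding are assumptions, not this diff's code):

    #include <sys/mman.h>
    #include <unistd.h>

    // sketch: release the whole pages inside [first, last) of a mapping at addr;
    // partial pages at either edge stay mapped, since munmap(2) works on page
    // granularity (page_size is assumed to be a power of two)
    static void unmap_fragment_sketch(void * addr, size_t first, size_t last) {
        const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
        first = (first + page_size - 1) & ~(page_size - 1); // round begin up to a page
        last  = last & ~(page_size - 1);                    // round end down to a page
        if (last > first) {
            munmap((char *) addr + first, last - first);
        }
    }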
@@ -3844,19 +3859,7 @@ struct llama_model_loader {
             throw std::runtime_error("found tensors with invalid data");
         }
 
-        // check if this is the last call and do final cleanup
         if (size_done >= size_data) {
-            // unmap offloaded tensors and metadata
-            if (use_mmap) {
-                for (uint32_t idx = 0; idx < mappings.size(); idx++) {
-                    const auto & mmap_used = mmaps_used.at(idx);
-                    auto & mapping = mappings.at(idx);
-                    mapping->unmap_fragment(0, mmap_used.first);
-                    if (mmap_used.second != 0) {
-                        mapping->unmap_fragment(mmap_used.second, mapping->size);
-                    }
-                }
-            }
             if (progress_callback) {
                 // Even though the model is done loading, we still honor
                 // cancellation since we need to free allocations.
