@@ -3671,10 +3671,8 @@ struct llama_model_loader {
     void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr, bool anonymous = false) {
         if (use_mmap) {
             mappings.reserve(files.size());
-            mmaps_used.reserve(files.size());
             for (const auto & file : files) {
                 std::unique_ptr<llama_mmap> mapping(anonymous ? new llama_anonymous_mmap(file.get()) : new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
-                mmaps_used.emplace_back(mapping->size, 0);
                 if (mlock_mmaps) {
                     std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
                     mlock_mmap->init(mapping->addr);
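llama_anonymous_mmap appears here as an alternative to mapping the file directly: when anonymous is true, the tensors are presumably backed by an anonymous mapping that the loader fills from the file explicitly. As a rough POSIX sketch of reserving such a region (illustrative only, not this patch's implementation):

// Illustrative sketch, not llama.cpp code: reserve a private, zero-filled
// region of the file's size; its pages are not backed by the file, so the
// loader must copy ("populate") file contents into them itself.
#include <sys/mman.h>
#include <cstddef>

static void * reserve_anonymous_region(size_t size) {
    void * addr = mmap(nullptr, size, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, /*fd=*/-1, /*offset=*/0);
    return addr == MAP_FAILED ? nullptr : addr;
}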
@@ -3690,13 +3688,10 @@ struct llama_model_loader {
         }
     }

-    void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+    std::vector<std::pair<size_t, size_t>> get_mapping_ranges(int idx, ggml_context * ctx) const {
         GGML_ASSERT(!mappings.empty());
-        const auto & mapping = mappings.at(idx);

-        *first = mapping->size;
-        *last  = 0;
-        *addr  = mapping->addr;
+        std::vector<std::pair<size_t, size_t>> sorted;

         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
             try {
                 const auto * weight = get_weight(ggml_get_name(tensor));
@@ -3706,12 +3701,37 @@ struct llama_model_loader {
                 if (weight->idx != idx) {
                     continue;
                 }
-                *first = std::min(*first, weight->offs);
-                *last  = std::max(*last,  weight->offs + ggml_nbytes(tensor));
+                sorted.emplace_back(weight->offs, weight->offs + ggml_nbytes(tensor));
             } catch(...) {
                 // the tensor is not in the model
             }
         }
+
+        std::sort(sorted.begin(), sorted.end(), [](std::pair<size_t, size_t> a, std::pair<size_t, size_t> b) { return a.first < b.first; });
+
+        std::vector<std::pair<size_t, size_t>> merged;
+        for (auto range : sorted) {
+            if (!merged.empty() && merged.back().second == range.first) {
+                auto last = merged.back();
+                merged.pop_back();
+                merged.emplace_back(last.first, range.second);
+            } else {
+                merged.emplace_back(range.first, range.second);
+            }
+        }
+
+        return merged;
+    }
+
+    void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
+        GGML_ASSERT(!mappings.empty());
+
+        *addr = mappings.at(idx)->addr;
+
+        auto ranges = get_mapping_ranges(idx, ctx);
+        GGML_ASSERT(!ranges.empty());
+        *first = ranges.front().first;
+        *last  = ranges.back().second;
     }

     // for backwards compatibility, does not support ggml-backend
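get_mapping_ranges above collects one [offs, offs + nbytes) interval per tensor used by ctx, sorts the intervals by start offset, and coalesces those that touch end-to-start; tensor data regions in the file should not overlap, so exact adjacency is the only merge case. A self-contained illustration of that coalescing step (not llama.cpp code):

// Standalone illustration: ranges that touch end-to-start are merged,
// gaps between ranges are preserved.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    std::vector<std::pair<size_t, size_t>> r = {{0, 64}, {128, 256}, {64, 128}};
    std::sort(r.begin(), r.end());    // lexicographic sort, i.e. by start offset
    std::vector<std::pair<size_t, size_t>> merged;
    for (auto range : r) {
        if (!merged.empty() && merged.back().second == range.first) {
            merged.back().second = range.second;   // extend the previous range
        } else {
            merged.push_back(range);
        }
    }
    for (auto m : merged) {
        std::printf("[%zu, %zu)\n", m.first, m.second);  // prints "[0, 256)"
    }
}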
@@ -3740,7 +3760,6 @@ struct llama_model_loader {

     size_t size_done = 0;
     size_t size_data = 0;
-    std::vector<std::pair<size_t, size_t>> mmaps_used;

     // Returns false if cancelled by progress_callback
     bool load_all_data(
@@ -3753,10 +3772,9 @@ struct llama_model_loader {

         if (use_mmap) {
             for (uint32_t idx = 0; idx < files.size(); idx++) {
-                void * addr = nullptr;
-                size_t first, last;
-                get_mapping_range(&first, &last, &addr, idx, ctx);
-                mappings.at(idx)->populate(first, last);
+                for (auto range : get_mapping_ranges(idx, ctx)) {
+                    mappings.at(idx)->populate(range.first, range.second);
+                }
             }
         }

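load_all_data now populates each merged range individually instead of one span from the lowest to the highest used offset, so holes between used tensors are never read from disk. A hypothetical sketch of what populate might do for an anonymous mapping, using plain stdio in place of the real file object (the actual llama_anonymous_mmap implementation may differ):

// Hypothetical sketch: copy bytes [first, last) from the open file into
// the anonymous region at the same offsets.
#include <cstddef>
#include <cstdint>
#include <cstdio>

static bool populate_sketch(std::FILE * f, uint8_t * base, size_t first, size_t last) {
    if (std::fseek(f, (long) first, SEEK_SET) != 0) {
        return false;
    }
    return std::fread(base + first, 1, last - first, f) == last - first;
}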
@@ -3799,12 +3817,9 @@ struct llama_model_loader {
                         const auto & lmlock = lmlocks->at(weight->idx);
                         lmlock->grow_to(weight->offs + n_size);
                     }
-
-                    auto & mmap_used = mmaps_used[weight->idx];
-                    mmap_used.first  = std::min(mmap_used.first,  weight->offs);
-                    mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                 } else {
                     ggml_backend_tensor_set(cur, data, 0, n_size);
+                    mappings.at(weight->idx)->unmap_fragment(weight->offs, weight->offs + n_size);
                 }
             } else {
                 GGML_ASSERT(weight->idx < files.size());
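Because a tensor copied out with ggml_backend_tensor_set no longer needs its bytes in the mapping, the new code releases them immediately with unmap_fragment, replacing the end-of-load sweep removed below. A sketch of page-granular release in the spirit of llama_mmap::unmap_fragment (assumptions: POSIX, power-of-two page size):

// Sketch: shrink [first, last) to whole pages, then hand those pages back
// to the OS. Partial pages at either end are kept, since neighbouring
// tensors may still need them.
#include <sys/mman.h>
#include <unistd.h>
#include <cstddef>
#include <cstdint>

static void unmap_fragment_sketch(uint8_t * addr, size_t first, size_t last) {
    const size_t page = (size_t) sysconf(_SC_PAGESIZE);
    first = (first + page - 1) & ~(page - 1);  // align start up
    last  = last & ~(page - 1);                // align end down
    if (first < last) {
        munmap(addr + first, last - first);
    }
}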
@@ -3844,19 +3859,7 @@ struct llama_model_loader {
             throw std::runtime_error("found tensors with invalid data");
         }

-        // check if this is the last call and do final cleanup
         if (size_done >= size_data) {
-            // unmap offloaded tensors and metadata
-            if (use_mmap) {
-                for (uint32_t idx = 0; idx < mappings.size(); idx++) {
-                    const auto & mmap_used = mmaps_used.at(idx);
-                    auto & mapping = mappings.at(idx);
-                    mapping->unmap_fragment(0, mmap_used.first);
-                    if (mmap_used.second != 0) {
-                        mapping->unmap_fragment(mmap_used.second, mapping->size);
-                    }
-                }
-            }
             if (progress_callback) {
                 // Even though the model is done loading, we still honor
                 // cancellation since we need to free allocations.