
Commit e9095e6

async direct io per tensor test
1 parent 46db350

1 file changed

llama.cpp (+51 -101)
@@ -1583,91 +1583,6 @@ struct llama_mmap {
     llama_mmap() {}
 };
 
-struct llama_anonymous_mmap : llama_mmap {
-    llama_file * file;
-
-    llama_anonymous_mmap(const llama_anonymous_mmap &) = delete;
-
-#ifdef _POSIX_MAPPED_FILES
-#ifndef MAP_ANONYMOUS
-#define MAP_ANONYMOUS MAP_ANON
-#endif
-    llama_anonymous_mmap(struct llama_file * file) {
-        this->file = file;
-        size = file->size;
-        addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-        if (addr == MAP_FAILED) { // NOLINT
-            throw std::runtime_error(format("mmap(.., MAP_ANONYMOUS) failed: %s", strerror(errno)));
-        }
-#ifdef __linux__
-        // THP is enabled by default for anonymous memory mappings on madvise
-        if (madvise(addr, size, MADV_HUGEPAGE)) {
-            LLAMA_LOG_WARN("warning: madvise(.., MADV_HUGEPAGE) failed: %s\n", strerror(errno));
-        }
-#endif
-        mapped_fragments.emplace_back(0, size);
-    }
-
-    void populate(size_t first, size_t last) const override {
-        int page_size = sysconf(_SC_PAGESIZE);
-
-        align_to_previous_page(&first, page_size);
-        align_to_next_page(&last, page_size);
-
-        size_t bytes_read = file->read_direct((char *) addr + first, last - first, first);
-        if (bytes_read != std::min(last, file->size) - first) {
-            throw std::runtime_error("unexpectedly reached end of file");
-        }
-    }
-#elif defined(_WIN32)
-    llama_anonymous_mmap(struct llama_file * file) {
-        this->file = file;
-        size = file->size;
-
-        HANDLE hMapping = CreateFileMapping(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, size >> 32, size, NULL);
-        if (hMapping == NULL) {
-            throw std::runtime_error(format("CreateFileMapping failed: %s", llama_format_win_err(GetLastError()).c_str()));
-        }
-
-        addr = MapViewOfFile(hMapping, FILE_MAP_ALL_ACCESS, 0, 0, size);
-        DWORD dwError = GetLastError();
-
-        CloseHandle(hMapping);
-
-        if (addr == NULL) {
-            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(dwError).c_str()));
-        }
-    }
-
-    void populate(size_t first, size_t last) const override {
-        SYSTEM_INFO siSysInfo;
-        GetSystemInfo(&siSysInfo);
-        DWORD dwPageSize = siSysInfo.dwPageSize;
-
-        align_to_previous_page(&first, dwPageSize);
-        align_to_next_page(&last, dwPageSize);
-
-        size_t bytes_read = file->read_direct((char *) addr + first, last - first, first);
-        if (bytes_read != std::min(last, file->size) - first) {
-            throw std::runtime_error("unexpectedly reached end of file");
-        }
-    }
-#else
-    llama_anonymous_mmap(struct llama_file * file) {
-        GGML_UNUSED(file);
-
-        throw std::runtime_error("mmap not supported");
-    }
-
-    void populate(size_t first, size_t last) const override {
-        GGML_UNUSED(first);
-        GGML_UNUSED(last);
-
-        throw std::runtime_error("mmap not supported");
-    }
-#endif
-};
-
 using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
 
 // Represents some region of memory being locked using mlock or VirtualLock;
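For reference, the deleted llama_anonymous_mmap backed the tensor data with pages that had no backing file and filled them later through read_direct(). A minimal sketch of just the allocation step on POSIX systems, restricted to the calls visible in the removed code (error handling trimmed):

    #include <sys/mman.h>   // mmap, madvise, MAP_ANONYMOUS
    #include <cerrno>
    #include <cstring>
    #include <stdexcept>

    void * alloc_anonymous(size_t size) {
        // anonymous mapping: zero-filled pages with no backing file; the loader
        // later populates them with direct reads from the model file
        void * addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        if (addr == MAP_FAILED) {
            throw std::runtime_error(strerror(errno));
        }
    #ifdef __linux__
        // hint the kernel to back the range with transparent huge pages
        madvise(addr, size, MADV_HUGEPAGE);
    #endif
        return addr;
    }

This commit drops that machinery: instead of one anonymous mapping populated per file, each tensor is read asynchronously with direct I/O (see add_load_task below).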
@@ -3470,13 +3385,16 @@ struct llama_model_loader {
         }
 
         // either file or anonymous mappings
-        this->use_mmap = use_mmap || use_direct_io;
+        this->use_mmap = use_mmap && !use_direct_io;
         this->use_direct_io = use_direct_io;
 
         this->check_tensors = check_tensors;
     }
 
     ~llama_model_loader() {
+        for (auto & task : load_tasks) {
+            task.wait();
+        }
         if (meta) {
             gguf_free(meta);
         }
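The destructor change pairs with the async loading added below: each queued tensor read is a std::async task holding pointers into the loader's files, so the loader drains load_tasks before anything else is torn down. A minimal sketch of the pattern (struct name hypothetical):

    #include <future>
    #include <vector>

    struct loader {
        std::vector<std::future<void>> load_tasks;   // one future per queued tensor read

        ~loader() {
            // block until every outstanding read has finished (or thrown) ...
            for (auto & task : load_tasks) {
                if (task.valid()) {
                    task.wait();
                }
            }
            // ... only then free metadata and close the files the tasks were reading
        }
    };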
@@ -3668,12 +3586,12 @@ struct llama_model_loader {
         }
     }
 
-    void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr, bool anonymous = false) {
+    void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
         if (use_mmap) {
             mappings.reserve(files.size());
             mmaps_used.reserve(files.size());
             for (const auto & file : files) {
-                std::unique_ptr<llama_mmap> mapping(anonymous ? new llama_anonymous_mmap(file.get()) : new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+                std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
                 mmaps_used.emplace_back(mapping->size, 0);
                 if (mlock_mmaps) {
                     std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@@ -3741,6 +3659,26 @@
     size_t size_done = 0;
     size_t size_data = 0;
     std::vector<std::pair<size_t, size_t>> mmaps_used;
+    std::vector<std::future<void>> load_tasks;
+
+    void add_load_task(llama_file * file, const llama_tensor_weight * weight, ggml_tensor * cur) {
+        load_tasks.emplace_back(std::async(std::launch::async, [file, cur, weight] {
+            size_t page_size = sysconf(_SC_PAGESIZE);
+            size_t aligned_offset = weight->offs & ~(page_size - 1);
+            size_t diff = weight->offs - aligned_offset;
+            size_t read_size = GGML_PAD(ggml_nbytes(cur) + diff, page_size);
+            std::vector<no_init<uint8_t>> read_buf(read_size + page_size);
+            uint8_t * read_buf_dio;
+            size_t page_offset = (uintptr_t) read_buf.data() % page_size;
+            if (page_offset > 0) {
+                read_buf_dio = (uint8_t *) read_buf.data() + page_size - page_offset;
+            } else {
+                read_buf_dio = (uint8_t *) read_buf.data();
+            }
+            file->read_direct(read_buf_dio, read_size, aligned_offset);
+            ggml_backend_tensor_set(cur, read_buf_dio + diff, 0, ggml_nbytes(cur));
+        }));
+    }
 
     // Returns false if cancelled by progress_callback
     bool load_all_data(
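add_load_task performs the classic O_DIRECT alignment dance: round the file offset down to a page boundary, round the length up, and over-allocate the bounce buffer so a page-aligned pointer can be carved out of it. The same arithmetic as a standalone sketch, using posix_memalign instead of the manual carve-out (hypothetical helper; assumes fd was opened with O_DIRECT and that the page size meets the filesystem's alignment requirement):

    #include <fcntl.h>      // O_DIRECT (with _GNU_SOURCE on Linux)
    #include <unistd.h>     // pread, sysconf
    #include <cstdlib>      // posix_memalign, free
    #include <cstring>      // memcpy
    #include <stdexcept>

    // read n_bytes at byte offset offs from fd into dst, honouring O_DIRECT rules:
    // file offset, transfer size and buffer address must all be aligned
    void read_tensor_direct(int fd, size_t offs, size_t n_bytes, void * dst) {
        size_t page_size      = (size_t) sysconf(_SC_PAGESIZE);
        size_t aligned_offset = offs & ~(page_size - 1);              // offset rounded down
        size_t diff           = offs - aligned_offset;                // slack before the payload
        size_t read_size      = (diff + n_bytes + page_size - 1) & ~(page_size - 1); // length rounded up

        void * buf = nullptr;
        if (posix_memalign(&buf, page_size, read_size) != 0) {        // page-aligned bounce buffer
            throw std::runtime_error("posix_memalign failed");
        }

        ssize_t n = pread(fd, buf, read_size, (off_t) aligned_offset);
        if (n < 0 || (size_t) n < diff + n_bytes) {
            free(buf);
            throw std::runtime_error("unexpectedly reached end of file");
        }

        memcpy(dst, (char *) buf + diff, n_bytes);                    // skip the alignment slack
        free(buf);
    }

The committed code gets the same alignment by allocating read_size + page_size bytes in a std::vector and advancing the pointer to the next page boundary, which avoids a custom allocator for the no_init buffer.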
@@ -3751,15 +3689,6 @@
             void * progress_callback_user_data) {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
-        if (use_mmap) {
-            for (uint32_t idx = 0; idx < files.size(); idx++) {
-                void * addr = nullptr;
-                size_t first, last;
-                get_mapping_range(&first, &last, &addr, idx, ctx);
-                mappings.at(idx)->populate(first, last);
-            }
-        }
-
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
 
@@ -3811,7 +3740,11 @@
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(cur->data, n_size);
+                    if (use_direct_io) {
+                        add_load_task(file.get(), weight, cur);
+                    } else {
+                        file->read_raw(cur->data, n_size);
+                    }
                     if (check_tensors) {
                         validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                             return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -3820,14 +3753,31 @@
                 } else {
                     read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(read_buf.data(), n_size);
-                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                    if (use_direct_io) {
+                        add_load_task(file.get(), weight, cur);
+                    } else {
+                        file->read_raw(read_buf.data(), n_size);
+                        ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                    }
                     if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                         throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
                     }
                 }
             }
 
+            const int max_load_tasks = 8;
+            while (load_tasks.size() >= max_load_tasks) {
+                size_t n_size = load_tasks.size();
+                for (size_t i = 0; i < n_size; i++) {
+                    auto & future = load_tasks.at(i);
+                    if (future.wait_for(std::chrono::milliseconds(10)) == std::future_status::ready) {
+                        future.get();
+                        load_tasks.erase(load_tasks.begin() + i);
+                        break;
+                    }
+                }
+            }
+
             size_done += n_size;
         }
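The new block at the end of the per-tensor loop is a back-pressure throttle: at most eight direct-I/O reads stay in flight, and the loop polls the outstanding futures (10 ms at a time) until one finishes before the next tensor is queued. The same pattern as a free-standing sketch (helper name hypothetical):

    #include <chrono>
    #include <future>
    #include <vector>

    void throttle(std::vector<std::future<void>> & tasks, size_t max_in_flight) {
        while (tasks.size() >= max_in_flight) {
            for (size_t i = 0; i < tasks.size(); i++) {
                // wait_for() leaves the future valid, so we can keep scanning
                if (tasks[i].wait_for(std::chrono::milliseconds(10)) == std::future_status::ready) {
                    tasks[i].get();                  // rethrows any exception from the task
                    tasks.erase(tasks.begin() + i);  // drop the finished slot
                    break;                           // restart the scan; indices shifted
                }
            }
        }
    }

Note that the committed loop reuses the name n_size for the task count; that shadow is scoped to the while body, so the size_done += n_size after it still refers to the tensor's byte count.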

@@ -6219,7 +6169,7 @@ static bool llm_load_tensors(
     ml.done_getting_tensors();
 
-    ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr, /* anonymous */ ml.use_direct_io);
+    ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
     model.mappings.reserve(ml.mappings.size());
 
     // create the backend buffers
