Commit 03d89a2
io_uring: support for user allocated memory for rings/sqes
Currently io_uring applications must call mmap(2) twice to map the rings themselves and the sqes array. This works fine, but it does not support using huge pages to back the rings/sqes. Provide a way for the application to pass in pre-allocated memory for the rings/sqes, which can then suitably be allocated from shmfs or via mmap to get huge page support. Particularly for larger rings, this reduces the number of TLB entries needed.

If an application wishes to take advantage of that, it must pre-allocate the memory needed for the sq/cq ring, and the sqes. The former must be passed in via the io_uring_params->cq_off.user_addr field, while the latter is passed in via the io_uring_params->sq_off.user_addr field. It must then set IORING_SETUP_NO_MMAP in the io_uring_params->flags field, and io_uring will map the existing memory into the kernel for shared use. The application must not call mmap(2) to map the rings as it otherwise would have; that will now fail with -EINVAL if this setup flag was used.

The pages used for the rings and sqes must be contiguous. The intent here is clearly that huge pages should be used; otherwise the normal setup procedure works fine as-is. The application may use one huge page for both the rings and the sqes.

Outside of those initialization changes, everything works like it did before.

Signed-off-by: Jens Axboe <[email protected]>
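As an illustration of the userspace side (not part of this commit), the setup flow might look like the sketch below. It uses the raw io_uring_setup(2) syscall; the helper name, the fixed 2 MB huge page size, and the assumption that each region fits in a single huge page are illustrative only, and building it requires uapi headers that already contain this change.

/* Hedged sketch: create a ring backed by caller-provided huge pages. */
#include <linux/io_uring.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define HUGE_SZ (2UL * 1024 * 1024) /* assumes 2MB huge pages are available */

int setup_no_mmap_ring(unsigned entries)
{
        struct io_uring_params p;
        void *ring_mem, *sqe_mem;

        /*
         * One huge page per region keeps each physically contiguous, as
         * required above. A single huge page could also hold both.
         */
        ring_mem = mmap(NULL, HUGE_SZ, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        sqe_mem = mmap(NULL, HUGE_SZ, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (ring_mem == MAP_FAILED || sqe_mem == MAP_FAILED)
                return -1;

        memset(&p, 0, sizeof(p));
        p.flags = IORING_SETUP_NO_MMAP;
        /* rings go via cq_off.user_addr, the sqe array via sq_off.user_addr */
        p.cq_off.user_addr = (uint64_t)(uintptr_t)ring_mem;
        p.sq_off.user_addr = (uint64_t)(uintptr_t)sqe_mem;

        /*
         * No mmap(2) of the returned fd follows; the rings already live
         * in ring_mem and sqe_mem.
         */
        return (int)syscall(__NR_io_uring_setup, entries, &p);
}

Later liburing releases added a helper wrapping this flow; the raw syscall is used here to keep the mechanism visible.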
Parent: 9c189ee

3 files changed, 114 insertions(+), 11 deletions(-)
include/linux/io_uring_types.h
Lines changed: 10 additions & 0 deletions

@@ -211,6 +211,16 @@ struct io_ring_ctx {
         unsigned int            compat: 1;
 
         enum task_work_notify_mode      notify_method;
+
+       /*
+        * If IORING_SETUP_NO_MMAP is used, then the below holds
+        * the gup'ed pages for the two rings, and the sqes.
+        */
+       unsigned short          n_ring_pages;
+       unsigned short          n_sqe_pages;
+       struct page             **ring_pages;
+       struct page             **sqe_pages;
+
         struct io_rings         *rings;
         struct task_struct      *submitter_task;
         struct percpu_ref       refs;

include/uapi/linux/io_uring.h
Lines changed: 7 additions & 2 deletions

@@ -173,6 +173,11 @@ enum {
  */
 #define IORING_SETUP_DEFER_TASKRUN      (1U << 13)
 
+/*
+ * Application provides the memory for the rings
+ */
+#define IORING_SETUP_NO_MMAP            (1U << 14)
+
 enum io_uring_op {
         IORING_OP_NOP,
         IORING_OP_READV,

@@ -406,7 +411,7 @@ struct io_sqring_offsets {
         __u32 dropped;
         __u32 array;
         __u32 resv1;
-       __u64 resv2;
+       __u64 user_addr;
 };
 
 /*

@@ -425,7 +430,7 @@ struct io_cqring_offsets {
         __u32 cqes;
         __u32 flags;
         __u32 resv1;
-       __u64 resv2;
+       __u64 user_addr;
 };
 
 /*

io_uring/io_uring.c
Lines changed: 97 additions & 9 deletions

@@ -2688,12 +2688,85 @@ static void io_mem_free(void *ptr)
                 free_compound_page(page);
 }
 
+static void io_pages_free(struct page ***pages, int npages)
+{
+       struct page **page_array;
+       int i;
+
+       if (!pages)
+               return;
+       page_array = *pages;
+       for (i = 0; i < npages; i++)
+               unpin_user_page(page_array[i]);
+       kvfree(page_array);
+       *pages = NULL;
+}
+
+static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
+                           unsigned long uaddr, size_t size)
+{
+       struct page **page_array;
+       unsigned int nr_pages;
+       int ret;
+
+       *npages = 0;
+
+       if (uaddr & (PAGE_SIZE - 1) || !size)
+               return ERR_PTR(-EINVAL);
+
+       nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (nr_pages > USHRT_MAX)
+               return ERR_PTR(-EINVAL);
+       page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+       if (!page_array)
+               return ERR_PTR(-ENOMEM);
+
+       ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+                                       page_array);
+       if (ret != nr_pages) {
+err:
+               io_pages_free(&page_array, ret > 0 ? ret : 0);
+               return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
+       }
+       /*
+        * Should be a single page. If the ring is small enough that we can
+        * use a normal page, that is fine. If we need multiple pages, then
+        * userspace should use a huge page. That's the only way to guarantee
+        * that we get contiguous memory, outside of just being lucky or
+        * (currently) having low memory fragmentation.
+        */
+       if (page_array[0] != page_array[ret - 1])
+               goto err;
+       *pages = page_array;
+       *npages = nr_pages;
+       return page_to_virt(page_array[0]);
+}
+
+static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
+                         size_t size)
+{
+       return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
+                               size);
+}
+
+static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
+                        size_t size)
+{
+       return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
+                               size);
+}
+
 static void io_rings_free(struct io_ring_ctx *ctx)
 {
-       io_mem_free(ctx->rings);
-       io_mem_free(ctx->sq_sqes);
-       ctx->rings = NULL;
-       ctx->sq_sqes = NULL;
+       if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
+               io_mem_free(ctx->rings);
+               io_mem_free(ctx->sq_sqes);
+               ctx->rings = NULL;
+               ctx->sq_sqes = NULL;
+       } else {
+               io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
+               io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
+       }
 }
 
 static void *io_mem_alloc(size_t size)

@@ -3338,6 +3411,10 @@ static void *io_uring_validate_mmap_request(struct file *file,
         struct page *page;
         void *ptr;
 
+       /* Don't allow mmap if the ring was setup without it */
+       if (ctx->flags & IORING_SETUP_NO_MMAP)
+               return ERR_PTR(-EINVAL);
+
         switch (offset & IORING_OFF_MMAP_MASK) {
         case IORING_OFF_SQ_RING:
         case IORING_OFF_CQ_RING:

@@ -3673,7 +3750,11 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
         if (size == SIZE_MAX)
                 return -EOVERFLOW;
 
-       rings = io_mem_alloc(size);
+       if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+               rings = io_mem_alloc(size);
+       else
+               rings = io_rings_map(ctx, p->cq_off.user_addr, size);
+
         if (IS_ERR(rings))
                 return PTR_ERR(rings);
 

@@ -3693,7 +3774,11 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
                 return -EOVERFLOW;
         }
 
-       ptr = io_mem_alloc(size);
+       if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+               ptr = io_mem_alloc(size);
+       else
+               ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
+
         if (IS_ERR(ptr)) {
                 io_rings_free(ctx);
                 return PTR_ERR(ptr);

@@ -3885,7 +3970,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
         p->sq_off.resv1 = 0;
-       p->sq_off.resv2 = 0;
+       if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+               p->sq_off.user_addr = 0;
 
         p->cq_off.head = offsetof(struct io_rings, cq.head);
         p->cq_off.tail = offsetof(struct io_rings, cq.tail);

@@ -3895,7 +3981,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
         p->cq_off.cqes = offsetof(struct io_rings, cqes);
         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
         p->cq_off.resv1 = 0;
-       p->cq_off.resv2 = 0;
+       if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+               p->cq_off.user_addr = 0;
 
         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
                       IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |

@@ -3961,7 +4048,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
               IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
               IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
               IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
-             IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN))
+             IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
+             IORING_SETUP_NO_MMAP))
                 return -EINVAL;
 
         return io_uring_create(entries, &p, params);
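Since io_uring_validate_mmap_request() now rejects ring mmaps for such contexts, an application or test can verify the -EINVAL behavior with a hedged sketch like the one below; ring_fd and ring_size are assumed to come from a setup as in the sketch above.

#include <assert.h>
#include <errno.h>
#include <linux/io_uring.h>
#include <sys/mman.h>

/* With IORING_SETUP_NO_MMAP, the legacy ring mmap must now fail. */
static void check_mmap_rejected(int ring_fd, size_t ring_size)
{
        void *ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
                         MAP_SHARED | MAP_POPULATE, ring_fd,
                         IORING_OFF_SQ_RING);
        assert(ptr == MAP_FAILED && errno == EINVAL);
}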
