Commit e6130eb

io_uring: add support for passing fixed file descriptors
With IORING_OP_MSG_RING, one ring can send a message to another ring. Extend that support to also allow sending a fixed file descriptor to that ring, enabling one ring to pass a registered descriptor to another one.

Arguments are extended to pass in:

	sqe->addr3	fixed file slot in source ring
	sqe->file_index	fixed file slot in destination ring

IORING_OP_MSG_RING is extended to take a command argument in sqe->addr. If set to zero (or IORING_MSG_DATA), it sends just a message like before. If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according to the above arguments.

Two common use cases for this are:

1) Server needs to be shut down or restarted, pass file descriptors to another one

2) Backend is split, and one accepts connections, while others then get the fd passed and handle the actual connection.

Both of those are classic SCM_RIGHTS use cases, and it's not possible to support them with direct descriptors today.

By default, this will post a CQE to the target ring, similarly to how IORING_MSG_DATA does it. If IORING_MSG_RING_CQE_SKIP is set, no message is posted to the target ring. The issuer is expected to notify the receiver side separately.

Signed-off-by: Jens Axboe <[email protected]>
1 parent f110ed8 commit e6130eb
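
As a rough illustration of the new interface from the userspace side, here is a minimal sketch assuming liburing's setup/submission helpers and uapi headers that include this commit. The helper name send_fixed_fd is hypothetical, and the exact encoding of the destination slot in sqe->file_index should be checked against the usual fixed-file install convention:

	#include <string.h>
	#include <liburing.h>

	/*
	 * Hypothetical helper: ask the 'src' ring to install the file
	 * registered in its fixed slot 'src_slot' into the fixed file table
	 * of the ring behind 'target_ring_fd' (per the commit message:
	 * sqe->addr3 is the source slot, sqe->file_index the destination).
	 */
	static int send_fixed_fd(struct io_uring *src, int target_ring_fd,
				 unsigned int src_slot, unsigned int dst_slot)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(src);

		if (!sqe)
			return -EBUSY;
		memset(sqe, 0, sizeof(*sqe));
		sqe->opcode = IORING_OP_MSG_RING;
		sqe->fd = target_ring_fd;	/* ring receiving the descriptor */
		sqe->addr = IORING_MSG_SEND_FD;	/* command, read from sqe->addr */
		sqe->addr3 = src_slot;		/* fixed file slot in source ring */
		sqe->file_index = dst_slot;	/* fixed file slot in destination ring */
		/* sqe->msg_ring_flags = IORING_MSG_RING_CQE_SKIP; suppresses the CQE */
		return io_uring_submit(src);
	}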

2 files changed: 140 additions & 7 deletions

include/uapi/linux/io_uring.h

Lines changed: 17 additions & 0 deletions
@@ -51,6 +51,7 @@ struct io_uring_sqe {
 		__u32		unlink_flags;
 		__u32		hardlink_flags;
 		__u32		xattr_flags;
+		__u32		msg_ring_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	/* pack this to avoid bogus arm OABI complaints */
@@ -270,6 +271,22 @@ enum io_uring_op {
  */
 #define IORING_ACCEPT_MULTISHOT	(1U << 0)
 
+/*
+ * IORING_OP_MSG_RING command types, stored in sqe->addr
+ */
+enum {
+	IORING_MSG_DATA,	/* pass sqe->len as 'res' and off as user_data */
+	IORING_MSG_SEND_FD,	/* send a registered fd to another ring */
+};
+
+/*
+ * IORING_OP_MSG_RING flags (sqe->msg_ring_flags)
+ *
+ * IORING_MSG_RING_CQE_SKIP	Don't post a CQE to the target ring. Not
+ *				applicable for IORING_MSG_DATA, obviously.
+ */
+#define IORING_MSG_RING_CQE_SKIP	(1U << 0)
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */
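
For reference, the target ring observes these messages as ordinary CQEs. A minimal sketch of the receiving side, assuming a liburing-managed ring named target_ring (handle_message is a hypothetical application callback):

	struct io_uring_cqe *cqe;

	if (io_uring_wait_cqe(&target_ring, &cqe) == 0) {
		/*
		 * For IORING_MSG_DATA, cqe->user_data carries the sender's
		 * sqe->off and cqe->res the sender's sqe->len; the same CQE
		 * is posted for IORING_MSG_SEND_FD unless CQE_SKIP was set.
		 */
		handle_message(cqe->user_data, cqe->res);
		io_uring_cqe_seen(&target_ring, cqe);
	}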

io_uring/msg_ring.c

Lines changed: 123 additions & 7 deletions
@@ -3,46 +3,162 @@
 #include <linux/errno.h>
 #include <linux/file.h>
 #include <linux/slab.h>
+#include <linux/nospec.h>
 #include <linux/io_uring.h>
 
 #include <uapi/linux/io_uring.h>
 
 #include "io_uring.h"
+#include "rsrc.h"
+#include "filetable.h"
 #include "msg_ring.h"
 
 struct io_msg {
 	struct file			*file;
 	u64 user_data;
 	u32 len;
+	u32 cmd;
+	u32 src_fd;
+	u32 dst_fd;
+	u32 flags;
 };
 
+static int io_msg_ring_data(struct io_kiocb *req)
+{
+	struct io_ring_ctx *target_ctx = req->file->private_data;
+	struct io_msg *msg = io_kiocb_to_cmd(req);
+
+	if (msg->src_fd || msg->dst_fd || msg->flags)
+		return -EINVAL;
+
+	if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+		return 0;
+
+	return -EOVERFLOW;
+}
+
+static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
+				 struct io_ring_ctx *octx,
+				 unsigned int issue_flags)
+{
+	if (issue_flags & IO_URING_F_UNLOCKED)
+		mutex_unlock(&ctx->uring_lock);
+	mutex_unlock(&octx->uring_lock);
+}
+
+static int io_double_lock_ctx(struct io_ring_ctx *ctx,
+			      struct io_ring_ctx *octx,
+			      unsigned int issue_flags)
+{
+	/*
+	 * To ensure proper ordering between the two ctxs, we can only
+	 * attempt a trylock on the target. If that fails and we already have
+	 * the source ctx lock, punt to io-wq.
+	 */
+	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+		if (!mutex_trylock(&octx->uring_lock))
+			return -EAGAIN;
+		return 0;
+	}
+
+	/* Always grab smallest value ctx first. We know ctx != octx. */
+	if (ctx < octx) {
+		mutex_lock(&ctx->uring_lock);
+		mutex_lock(&octx->uring_lock);
+	} else {
+		mutex_lock(&octx->uring_lock);
+		mutex_lock(&ctx->uring_lock);
+	}
+
+	return 0;
+}
+
+static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_ring_ctx *target_ctx = req->file->private_data;
+	struct io_msg *msg = io_kiocb_to_cmd(req);
+	struct io_ring_ctx *ctx = req->ctx;
+	unsigned long file_ptr;
+	struct file *src_file;
+	int ret;
+
+	if (target_ctx == ctx)
+		return -EINVAL;
+
+	ret = io_double_lock_ctx(ctx, target_ctx, issue_flags);
+	if (unlikely(ret))
+		return ret;
+
+	ret = -EBADF;
+	if (unlikely(msg->src_fd >= ctx->nr_user_files))
+		goto out_unlock;
+
+	msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files);
+	file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr;
+	src_file = (struct file *) (file_ptr & FFS_MASK);
+	get_file(src_file);
+
+	ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
+	if (ret < 0) {
+		fput(src_file);
+		goto out_unlock;
+	}
+
+	if (msg->flags & IORING_MSG_RING_CQE_SKIP)
+		goto out_unlock;
+
+	/*
+	 * If this fails, the target still received the file descriptor but
+	 * wasn't notified of the fact. This means that if this request
+	 * completes with -EOVERFLOW, then the sender must ensure that a
+	 * later IORING_OP_MSG_RING delivers the message.
+	 */
+	if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+		ret = -EOVERFLOW;
+out_unlock:
+	io_double_unlock_ctx(ctx, target_ctx, issue_flags);
+	return ret;
+}
+
 int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_msg *msg = io_kiocb_to_cmd(req);
 
-	if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
-		     sqe->buf_index || sqe->personality))
+	if (unlikely(sqe->buf_index || sqe->personality))
 		return -EINVAL;
 
 	msg->user_data = READ_ONCE(sqe->off);
 	msg->len = READ_ONCE(sqe->len);
+	msg->cmd = READ_ONCE(sqe->addr);
+	msg->src_fd = READ_ONCE(sqe->addr3);
+	msg->dst_fd = READ_ONCE(sqe->file_index);
+	msg->flags = READ_ONCE(sqe->msg_ring_flags);
+	if (msg->flags & ~IORING_MSG_RING_CQE_SKIP)
+		return -EINVAL;
+
 	return 0;
 }
 
 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_msg *msg = io_kiocb_to_cmd(req);
-	struct io_ring_ctx *target_ctx;
 	int ret;
 
 	ret = -EBADFD;
 	if (!io_is_uring_fops(req->file))
 		goto done;
 
-	ret = -EOVERFLOW;
-	target_ctx = req->file->private_data;
-	if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
-		ret = 0;
+	switch (msg->cmd) {
+	case IORING_MSG_DATA:
+		ret = io_msg_ring_data(req);
+		break;
+	case IORING_MSG_SEND_FD:
+		ret = io_msg_send_fd(req, issue_flags);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
 
 done:
 	if (ret < 0)
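
The io_double_lock_ctx() helper above uses the classic address-ordering discipline so that two rings passing descriptors to each other concurrently cannot ABBA-deadlock on their uring_lock mutexes. A standalone sketch of the same idea, in plain pthreads with illustrative names:

	#include <pthread.h>

	/*
	 * Lock two mutexes in a globally consistent order (by address), so
	 * two threads locking the same pair with swapped arguments acquire
	 * them in the same order and cannot deadlock - the same scheme
	 * io_double_lock_ctx() applies to the two ctxs' uring_lock.
	 */
	static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		if (a < b) {
			pthread_mutex_lock(a);
			pthread_mutex_lock(b);
		} else {
			pthread_mutex_lock(b);
			pthread_mutex_lock(a);
		}
	}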
