Skip to content

Commit df09a27

Browse files
Darksonnfbq
authored andcommitted
rust: file: add DeferredFdCloser
To close an fd from kernel space, we could call `ksys_close`. However, if we do this to an fd that is held using `fdget`, then we may trigger a use-after-free. Introduce a helper that can be used to close an fd even if the fd is currently held with `fdget`. This is done by grabbing an extra refcount to the file and dropping it in a task work once we return to userspace. This is necessary for Rust Binder because otherwise the user might try to have Binder close its fd for /dev/binder, which would cause problems as this happens inside an ioctl on /dev/binder, and ioctls hold the fd using `fdget`. Additional motivation can be found in commit 80cd795 ("binder: fix use-after-free due to ksys_close() during fdget()") and in the comments on `binder_do_fd_close`. If there is some way to detect whether an fd is currently held with `fdget`, then this could be optimized to skip the allocation and task work when this is not the case. Another possible optimization would be to combine several fds into a single task work, since this is used with fd arrays that might hold several fds. That said, it might not be necessary to optimize it, because Rust Binder has two ways to send fds: BINDER_TYPE_FD and BINDER_TYPE_FDA. With BINDER_TYPE_FD, it is userspace's responsibility to close the fd, so this mechanism is used only by BINDER_TYPE_FDA, but fd arrays are used rarely these days. Reviewed-by: Benno Lossin <[email protected]> Reviewed-by: Martin Rodriguez Reboredo <[email protected]> Signed-off-by: Alice Ryhl <[email protected]> Reviewed-by: Trevor Gross <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent ec2d517 commit df09a27

File tree

4 files changed

+207
-1
lines changed

4 files changed

+207
-1
lines changed

rust/bindings/bindings_helper.h

+2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <linux/cred.h>
1111
#include <linux/errname.h>
1212
#include <linux/ethtool.h>
13+
#include <linux/fdtable.h>
1314
#include <linux/file.h>
1415
#include <linux/fs.h>
1516
#include <linux/jiffies.h>
@@ -20,6 +21,7 @@
2021
#include <linux/sched.h>
2122
#include <linux/security.h>
2223
#include <linux/slab.h>
24+
#include <linux/task_work.h>
2325
#include <linux/wait.h>
2426
#include <linux/workqueue.h>
2527

rust/helpers.c

+8
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include <linux/sched/signal.h>
3333
#include <linux/security.h>
3434
#include <linux/spinlock.h>
35+
#include <linux/task_work.h>
3536
#include <linux/wait.h>
3637
#include <linux/workqueue.h>
3738

@@ -243,6 +244,13 @@ void rust_helper_security_release_secctx(char *secdata, u32 seclen)
243244
EXPORT_SYMBOL_GPL(rust_helper_security_release_secctx);
244245
#endif
245246

247+
void rust_helper_init_task_work(struct callback_head *twork,
248+
task_work_func_t func)
249+
{
250+
init_task_work(twork, func);
251+
}
252+
EXPORT_SYMBOL_GPL(rust_helper_init_task_work);
253+
246254
/*
247255
* `bindgen` binds the C `size_t` type as the Rust `usize` type, so we can
248256
* use it in contexts where Rust expects a `usize` like slice (array) indices.

rust/kernel/file.rs

+183-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ use crate::{
1111
error::{code::*, Error, Result},
1212
types::{ARef, AlwaysRefCounted, NotThreadSafe, Opaque},
1313
};
14-
use core::ptr;
14+
use alloc::boxed::Box;
15+
use core::{alloc::AllocError, mem, ptr};
1516

1617
/// Flags associated with a [`File`].
1718
pub mod flags {
@@ -318,6 +319,187 @@ impl Drop for FileDescriptorReservation {
318319
}
319320
}
320321

322+
/// Helper used for closing file descriptors in a way that is safe even if the file is currently
323+
/// held using `fdget`.
324+
///
325+
/// Additional motivation can be found in commit 80cd795630d6 ("binder: fix use-after-free due to
326+
/// ksys_close() during fdget()") and in the comments on `binder_do_fd_close`.
327+
pub struct DeferredFdCloser {
328+
inner: Box<DeferredFdCloserInner>,
329+
}
330+
331+
/// SAFETY: This just holds an allocation with no real content, so there's no safety issue with
332+
/// moving it across threads.
333+
unsafe impl Send for DeferredFdCloser {}
334+
unsafe impl Sync for DeferredFdCloser {}
335+
336+
/// # Invariants
337+
///
338+
/// If the `file` pointer is non-null, then it points at a `struct file` and owns a refcount to
339+
/// that file.
340+
#[repr(C)]
341+
struct DeferredFdCloserInner {
342+
twork: mem::MaybeUninit<bindings::callback_head>,
343+
file: *mut bindings::file,
344+
}
345+
346+
impl DeferredFdCloser {
347+
/// Create a new [`DeferredFdCloser`].
348+
pub fn new() -> Result<Self, AllocError> {
349+
Ok(Self {
350+
// INVARIANT: The `file` pointer is null, so the type invariant does not apply.
351+
inner: Box::try_new(DeferredFdCloserInner {
352+
twork: mem::MaybeUninit::uninit(),
353+
file: core::ptr::null_mut(),
354+
})?,
355+
})
356+
}
357+
358+
/// Schedule a task work that closes the file descriptor when this task returns to userspace.
359+
///
360+
/// Fails if this is called from a context where we cannot run work when returning to
361+
/// userspace. (E.g., from a kthread.)
362+
pub fn close_fd(self, fd: u32) -> Result<(), DeferredFdCloseError> {
363+
use bindings::task_work_notify_mode_TWA_RESUME as TWA_RESUME;
364+
365+
// In this method, we schedule the task work before closing the file. This is because
366+
// scheduling a task work is fallible, and we need to know whether it will fail before we
367+
// attempt to close the file.
368+
369+
// Task works are not available on kthreads.
370+
let current = crate::current!();
371+
if current.is_kthread() {
372+
return Err(DeferredFdCloseError::TaskWorkUnavailable);
373+
}
374+
375+
// Transfer ownership of the box's allocation to a raw pointer. This disables the
376+
// destructor, so we must manually convert it back to a Box to drop it.
377+
//
378+
// Until we convert it back to a `Box`, there are no aliasing requirements on this
379+
// pointer.
380+
let inner = Box::into_raw(self.inner);
381+
382+
// The `callback_head` field is first in the struct, so this cast correctly gives us a
383+
// pointer to the field.
384+
let callback_head = inner.cast::<bindings::callback_head>();
385+
// SAFETY: This pointer offset operation does not go out-of-bounds.
386+
let file_field = unsafe { core::ptr::addr_of_mut!((*inner).file) };
387+
388+
let current = current.as_raw();
389+
390+
// SAFETY: This function currently has exclusive access to the `DeferredFdCloserInner`, so
391+
// it is okay for us to perform unsynchronized writes to its `callback_head` field.
392+
unsafe { bindings::init_task_work(callback_head, Some(Self::do_close_fd)) };
393+
394+
// SAFETY: This inserts the `DeferredFdCloserInner` into the task workqueue for the current
395+
// task. If this operation is successful, then this transfers exclusive ownership of the
396+
// `callback_head` field to the C side until it calls `do_close_fd`, and we don't touch or
397+
// invalidate the field during that time.
398+
//
399+
// When the C side calls `do_close_fd`, the safety requirements of that method are
400+
// satisfied because when a task work is executed, the callback is given ownership of the
401+
// pointer.
402+
//
403+
// The file pointer is currently null. If it is changed to be non-null before `do_close_fd`
404+
// is called, then that change happens due to the write at the end of this function, and
405+
// that write has a safety comment that explains why the refcount can be dropped when
406+
// `do_close_fd` runs.
407+
let res = unsafe { bindings::task_work_add(current, callback_head, TWA_RESUME) };
408+
409+
if res != 0 {
410+
// SAFETY: Scheduling the task work failed, so we still have ownership of the box, so
411+
// we may destroy it.
412+
unsafe { drop(Box::from_raw(inner)) };
413+
414+
return Err(DeferredFdCloseError::TaskWorkUnavailable);
415+
}
416+
417+
// This removes the fd from the fd table in `current`. The file is not fully closed until
418+
// `filp_close` is called. We are given ownership of one refcount to the file.
419+
//
420+
// SAFETY: This is safe no matter what `fd` is. If the `fd` is valid (that is, if the
421+
// pointer is non-null), then we call `filp_close` on the returned pointer as required by
422+
// `file_close_fd`.
423+
let file = unsafe { bindings::file_close_fd(fd) };
424+
if file.is_null() {
425+
// We don't clean up the task work since that might be expensive if the task work queue
426+
// is long. Just let it execute and let it clean up for itself.
427+
return Err(DeferredFdCloseError::BadFd);
428+
}
429+
430+
// Acquire a second refcount to the file.
431+
//
432+
// SAFETY: The `file` pointer points at a file with a non-zero refcount.
433+
unsafe { bindings::get_file(file) };
434+
435+
// This method closes the fd, consuming one of our two refcounts. There could be active
436+
// light refcounts created from that fd, so we must ensure that the file has a positive
437+
// refcount for the duration of those active light refcounts. We do that by holding on to
438+
// the second refcount until the current task returns to userspace.
439+
//
440+
// SAFETY: The `file` pointer is valid. Passing `current->files` as the file table to close
441+
// it in is correct, since we just got the `fd` from `file_close_fd` which also uses
442+
// `current->files`.
443+
//
444+
// Note: fl_owner_t is currently a void pointer.
445+
unsafe { bindings::filp_close(file, (*current).files as bindings::fl_owner_t) };
446+
447+
// We update the file pointer that the task work is supposed to fput. This transfers
448+
// ownership of our last refcount.
449+
//
450+
// INVARIANT: This changes the `file` field of a `DeferredFdCloserInner` from null to
451+
// non-null. This doesn't break the type invariant for `DeferredFdCloserInner` because we
452+
// still own a refcount to the file, so we can pass ownership of that refcount to the
453+
// `DeferredFdCloserInner`.
454+
//
455+
// When `do_close_fd` runs, it must be safe for it to `fput` the refcount. However, this is
456+
// the case because all light refcounts that are associated with the fd we closed
457+
// previously must be dropped when `do_close_fd`, since light refcounts must be dropped
458+
// before returning to userspace.
459+
//
460+
// SAFETY: Task works are executed on the current thread right before we return to
461+
// userspace, so this write is guaranteed to happen before `do_close_fd` is called, which
462+
// means that a race is not possible here.
463+
unsafe { *file_field = file };
464+
465+
Ok(())
466+
}
467+
468+
/// # Safety
469+
///
470+
/// The provided pointer must point at the `twork` field of a `DeferredFdCloserInner` stored in
471+
/// a `Box`, and the caller must pass exclusive ownership of that `Box`. Furthermore, if the
472+
/// file pointer is non-null, then it must be okay to release the refcount by calling `fput`.
473+
unsafe extern "C" fn do_close_fd(inner: *mut bindings::callback_head) {
474+
// SAFETY: The caller just passed us ownership of this box.
475+
let inner = unsafe { Box::from_raw(inner.cast::<DeferredFdCloserInner>()) };
476+
if !inner.file.is_null() {
477+
// SAFETY: By the type invariants, we own a refcount to this file, and the caller
478+
// guarantees that dropping the refcount now is okay.
479+
unsafe { bindings::fput(inner.file) };
480+
}
481+
// The allocation is freed when `inner` goes out of scope.
482+
}
483+
}
484+
485+
/// Represents a failure to close an fd in a deferred manner.
486+
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
487+
pub enum DeferredFdCloseError {
488+
/// Closing the fd failed because we were unable to schedule a task work.
489+
TaskWorkUnavailable,
490+
/// Closing the fd failed because the fd does not exist.
491+
BadFd,
492+
}
493+
494+
impl From<DeferredFdCloseError> for Error {
495+
fn from(err: DeferredFdCloseError) -> Error {
496+
match err {
497+
DeferredFdCloseError::TaskWorkUnavailable => ESRCH,
498+
DeferredFdCloseError::BadFd => EBADF,
499+
}
500+
}
501+
}
502+
321503
/// Represents the `EBADF` error code.
322504
///
323505
/// Used for methods that can only fail with `EBADF`.

rust/kernel/task.rs

+14
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,12 @@ impl Task {
145145
}
146146
}
147147

148+
/// Returns a raw pointer to the task.
149+
#[inline]
150+
pub fn as_raw(&self) -> *mut bindings::task_struct {
151+
self.0.get()
152+
}
153+
148154
/// Returns the group leader of the given task.
149155
pub fn group_leader(&self) -> &Task {
150156
// SAFETY: By the type invariant, we know that `self.0` is a valid task. Valid tasks always
@@ -189,6 +195,14 @@ impl Task {
189195
unsafe { bindings::task_tgid_nr_ns(self.0.get(), ptr::null_mut()) }
190196
}
191197

198+
/// Returns whether this task corresponds to a kernel thread.
199+
pub fn is_kthread(&self) -> bool {
200+
// SAFETY: By the type invariant, we know that `self.0.get()` is non-null and valid. There
201+
// are no further requirements to read the task's flags.
202+
let flags = unsafe { (*self.0.get()).flags };
203+
(flags & bindings::PF_KTHREAD) != 0
204+
}
205+
192206
/// Wakes up the task.
193207
pub fn wake_up(&self) {
194208
// SAFETY: By the type invariant, we know that `self.0.get()` is non-null and valid.

0 commit comments

Comments
 (0)