Skip to content

Commit fbeb558

Browse files
author
Peter Zijlstra
committed
futex/pi: Fix recursive rt_mutex waiter state
Some new assertions pointed out that the existing code has nested rt_mutex wait state in the futex code. Specifically, the futex_lock_pi() cancel case uses spin_lock() while there still is a rt_waiter enqueued for this task, resulting in a state where there are two waiters for the same task (and task_struct::pi_blocked_on gets scrambled). The reason to take hb->lock at this point is to avoid the wake_futex_pi() EAGAIN case. This happens when futex_top_waiter() and rt_mutex_top_waiter() state becomes inconsistent. The current rules are such that this inconsistency will not be observed. Notably the case that needs to be avoided is where futex_lock_pi() and futex_unlock_pi() interleave such that unlock will fail to observe a new waiter. *However* the case at hand is where a waiter is leaving, in this case the race means a waiter that is going away is not observed -- which is harmless, provided this race is explicitly handled. This is a somewhat dangerous proposition because the converse race is not observing a new waiter, which must absolutely not happen. But since the race is valid this cannot be asserted. Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Reviewed-by: Thomas Gleixner <[email protected]> Reviewed-by: Sebastian Andrzej Siewior <[email protected]> Tested-by: Sebastian Andrzej Siewior <[email protected]> Link: https://lkml.kernel.org/r/[email protected]
1 parent 45f67f3 commit fbeb558

File tree

2 files changed

+52
-30
lines changed

2 files changed

+52
-30
lines changed

kernel/futex/pi.c

Lines changed: 48 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -611,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
611611
/*
612612
* Caller must hold a reference on @pi_state.
613613
*/
614-
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
614+
static int wake_futex_pi(u32 __user *uaddr, u32 uval,
615+
struct futex_pi_state *pi_state,
616+
struct rt_mutex_waiter *top_waiter)
615617
{
616-
struct rt_mutex_waiter *top_waiter;
617618
struct task_struct *new_owner;
618619
bool postunlock = false;
619620
DEFINE_RT_WAKE_Q(wqh);
620621
u32 curval, newval;
621622
int ret = 0;
622623

623-
top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
624-
if (WARN_ON_ONCE(!top_waiter)) {
625-
/*
626-
* As per the comment in futex_unlock_pi() this should not happen.
627-
*
628-
* When this happens, give up our locks and try again, giving
629-
* the futex_lock_pi() instance time to complete, either by
630-
* waiting on the rtmutex or removing itself from the futex
631-
* queue.
632-
*/
633-
ret = -EAGAIN;
634-
goto out_unlock;
635-
}
636-
637624
new_owner = top_waiter->task;
638625

639626
/*
@@ -1046,19 +1033,33 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
10461033
ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
10471034

10481035
cleanup:
1049-
spin_lock(q.lock_ptr);
10501036
/*
10511037
* If we failed to acquire the lock (deadlock/signal/timeout), we must
1052-
* first acquire the hb->lock before removing the lock from the
1053-
* rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
1054-
* lists consistent.
1038+
* unwind the above; however, we cannot lock hb->lock because
1039+
* rt_mutex already has a waiter enqueued and hb->lock can itself try
1040+
* and enqueue an rt_waiter through rtlock.
1041+
*
1042+
* Doing the cleanup without holding hb->lock can cause inconsistent
1043+
* state between hb and pi_state, but only in the direction of not
1044+
* seeing a waiter that is leaving.
1045+
*
1046+
* See futex_unlock_pi(), it deals with this inconsistency.
10551047
*
1056-
* In particular; it is important that futex_unlock_pi() can not
1057-
* observe this inconsistency.
1048+
* There be dragons here, since we must deal with the inconsistency on
1049+
* the way out (here), it is impossible to detect/warn about the race
1050+
* the other way around (missing an incoming waiter).
1051+
*
1052+
* What could possibly go wrong...
10581053
*/
10591054
if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
10601055
ret = 0;
10611056

1057+
/*
1058+
* Now that the rt_waiter has been dequeued, it is safe to use
1059+
* spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
1060+
the hb/pi_state bookkeeping.
1061+
*/
1062+
spin_lock(q.lock_ptr);
10621063
/*
10631064
* Waiter is unqueued.
10641065
*/
@@ -1143,6 +1144,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
11431144
top_waiter = futex_top_waiter(hb, &key);
11441145
if (top_waiter) {
11451146
struct futex_pi_state *pi_state = top_waiter->pi_state;
1147+
struct rt_mutex_waiter *rt_waiter;
11461148

11471149
ret = -EINVAL;
11481150
if (!pi_state)
@@ -1155,22 +1157,39 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
11551157
if (pi_state->owner != current)
11561158
goto out_unlock;
11571159

1158-
get_pi_state(pi_state);
11591160
/*
11601161
* By taking wait_lock while still holding hb->lock, we ensure
1161-
* there is no point where we hold neither; and therefore
1162-
* wake_futex_p() must observe a state consistent with what we
1163-
* observed.
1162+
* there is no point where we hold neither; and thereby
1163+
* wake_futex_pi() must observe any new waiters.
1164+
*
1165+
* Since the cleanup: case in futex_lock_pi() removes the
1166+
* rt_waiter without holding hb->lock, it is possible for
1167+
* wake_futex_pi() to not find a waiter while the above does,
1168+
* in this case the waiter is on the way out and it can be
1169+
* ignored.
11641170
*
11651171
* In particular; this forces __rt_mutex_start_proxy() to
11661172
* complete such that we're guaranteed to observe the
1167-
* rt_waiter. Also see the WARN in wake_futex_pi().
1173+
* rt_waiter.
11681174
*/
11691175
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1176+
1177+
/*
1178+
* Futex vs rt_mutex waiter state -- if there are no rt_mutex
1179+
* waiters even though futex thinks there are, then the waiter
1180+
* is leaving and the uncontended path is safe to take.
1181+
*/
1182+
rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
1183+
if (!rt_waiter) {
1184+
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1185+
goto do_uncontended;
1186+
}
1187+
1188+
get_pi_state(pi_state);
11701189
spin_unlock(&hb->lock);
11711190

11721191
/* drops pi_state->pi_mutex.wait_lock */
1173-
ret = wake_futex_pi(uaddr, uval, pi_state);
1192+
ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
11741193

11751194
put_pi_state(pi_state);
11761195

@@ -1198,6 +1217,7 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
11981217
return ret;
11991218
}
12001219

1220+
do_uncontended:
12011221
/*
12021222
* We have no kernel internal state, i.e. no waiters in the
12031223
* kernel. Waiters which are about to queue themselves are stuck

kernel/futex/requeue.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -850,11 +850,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
850850
pi_mutex = &q.pi_state->pi_mutex;
851851
ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
852852

853-
/* Current is not longer pi_blocked_on */
854-
spin_lock(q.lock_ptr);
853+
/*
854+
* See futex_unlock_pi()'s cleanup: comment.
855+
*/
855856
if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
856857
ret = 0;
857858

859+
spin_lock(q.lock_ptr);
858860
debug_rt_mutex_free_waiter(&rt_waiter);
859861
/*
860862
* Fixup the pi_state owner and possibly acquire the lock if we

0 commit comments

Comments
 (0)