aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/futex.c
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2019-01-29 17:15:12 -0500
committerThomas Gleixner <tglx@linutronix.de>2019-02-08 07:00:36 -0500
commit1a1fb985f2e2b85ec0d3dc2e519ee48389ec2434 (patch)
treedb20b498bbd047982a926c49bc419096cf268955 /kernel/futex.c
parent6f568ebe2afefdc33a6fb06ef20a94f8b96455f1 (diff)
futex: Handle early deadlock return correctly
commit 56222b212e8e ("futex: Drop hb->lock before enqueueing on the rtmutex") changed the locking rules in the futex code so that the hash bucket lock is not longer held while the waiter is enqueued into the rtmutex wait list. This made the lock and the unlock path symmetric, but unfortunately the possible early exit from __rt_mutex_proxy_start() due to a detected deadlock was not updated accordingly. That allows a concurrent unlocker to observe inconsitent state which triggers the warning in the unlock path. futex_lock_pi() futex_unlock_pi() lock(hb->lock) queue(hb_waiter) lock(hb->lock) lock(rtmutex->wait_lock) unlock(hb->lock) // acquired hb->lock hb_waiter = futex_top_waiter() lock(rtmutex->wait_lock) __rt_mutex_proxy_start() ---> fail remove(rtmutex_waiter); ---> returns -EDEADLOCK unlock(rtmutex->wait_lock) // acquired wait_lock wake_futex_pi() rt_mutex_next_owner() --> returns NULL --> WARN lock(hb->lock) unqueue(hb_waiter) The problem is caused by the remove(rtmutex_waiter) in the failure case of __rt_mutex_proxy_start() as this lets the unlocker observe a waiter in the hash bucket but no waiter on the rtmutex, i.e. inconsistent state. The original commit handles this correctly for the other early return cases (timeout, signal) by delaying the removal of the rtmutex waiter until the returning task reacquired the hash bucket lock. Treat the failure case of __rt_mutex_proxy_start() in the same way and let the existing cleanup code handle the eventual handover of the rtmutex gracefully. The regular rt_mutex_proxy_start() gains the rtmutex waiter removal for the failure case, so that the other callsites are still operating correctly. Add proper comments to the code so all these details are fully documented. Thanks to Peter for helping with the analysis and writing the really valuable code comments. Fixes: 56222b212e8e ("futex: Drop hb->lock before enqueueing on the rtmutex") Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com> Co-developed-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: linux-s390@vger.kernel.org Cc: Stefan Liebler <stli@linux.ibm.com> Cc: Sebastian Sewior <bigeasy@linutronix.de> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1901292311410.1950@nanos.tec.linutronix.de
Diffstat (limited to 'kernel/futex.c')
-rw-r--r--kernel/futex.c28
1 files changed, 18 insertions, 10 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index 5ec2473a3497..a0514e01c3eb 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2861,35 +2861,39 @@ retry_private:
2861 * and BUG when futex_unlock_pi() interleaves with this. 2861 * and BUG when futex_unlock_pi() interleaves with this.
2862 * 2862 *
2863 * Therefore acquire wait_lock while holding hb->lock, but drop the 2863 * Therefore acquire wait_lock while holding hb->lock, but drop the
2864 * latter before calling rt_mutex_start_proxy_lock(). This still fully 2864 * latter before calling __rt_mutex_start_proxy_lock(). This
2865 * serializes against futex_unlock_pi() as that does the exact same 2865 * interleaves with futex_unlock_pi() -- which does a similar lock
2866 * lock handoff sequence. 2866 * handoff -- such that the latter can observe the futex_q::pi_state
2867 * before __rt_mutex_start_proxy_lock() is done.
2867 */ 2868 */
2868 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); 2869 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
2869 spin_unlock(q.lock_ptr); 2870 spin_unlock(q.lock_ptr);
2871 /*
2872 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
2873 * such that futex_unlock_pi() is guaranteed to observe the waiter when
2874 * it sees the futex_q::pi_state.
2875 */
2870 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); 2876 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
2871 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); 2877 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
2872 2878
2873 if (ret) { 2879 if (ret) {
2874 if (ret == 1) 2880 if (ret == 1)
2875 ret = 0; 2881 ret = 0;
2876 2882 goto cleanup;
2877 spin_lock(q.lock_ptr);
2878 goto no_block;
2879 } 2883 }
2880 2884
2881
2882 if (unlikely(to)) 2885 if (unlikely(to))
2883 hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); 2886 hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
2884 2887
2885 ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); 2888 ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
2886 2889
2890cleanup:
2887 spin_lock(q.lock_ptr); 2891 spin_lock(q.lock_ptr);
2888 /* 2892 /*
2889 * If we failed to acquire the lock (signal/timeout), we must 2893 * If we failed to acquire the lock (deadlock/signal/timeout), we must
2890 * first acquire the hb->lock before removing the lock from the 2894 * first acquire the hb->lock before removing the lock from the
2891 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex 2895 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
2892 * wait lists consistent. 2896 * lists consistent.
2893 * 2897 *
2894 * In particular; it is important that futex_unlock_pi() can not 2898 * In particular; it is important that futex_unlock_pi() can not
2895 * observe this inconsistency. 2899 * observe this inconsistency.
@@ -3013,6 +3017,10 @@ retry:
3013 * there is no point where we hold neither; and therefore 3017 * there is no point where we hold neither; and therefore
3014 * wake_futex_pi() must observe a state consistent with what we 3018 * wake_futex_pi() must observe a state consistent with what we
3015 * observed. 3019 * observed.
3020 *
3021 * In particular; this forces __rt_mutex_start_proxy() to
3022 * complete such that we're guaranteed to observe the
3023 * rt_waiter. Also see the WARN in wake_futex_pi().
3016 */ 3024 */
3017 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 3025 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
3018 spin_unlock(&hb->lock); 3026 spin_unlock(&hb->lock);