diff options
author | Peter Zijlstra <peterz@infradead.org> | 2017-03-22 06:35:55 -0400 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2017-03-23 14:10:08 -0400 |
commit | 16ffa12d742534d4ff73e8b3a4e81c1de39196f0 (patch) | |
tree | 2c19026cc476ea8f7bf12540788bfac1c0b41611 /kernel/futex.c | |
parent | 73d786bd043ebc855f349c81ea805f6b11cbf2aa (diff) |
futex: Pull rt_mutex_futex_unlock() out from under hb->lock
There's a number of 'interesting' problems, all caused by holding
hb->lock while doing the rt_mutex_unlock() equivalent.
Notably:
- a PI inversion on hb->lock; and,
- a SCHED_DEADLINE crash because of pointer instability.
The previous changes:
- changed the locking rules to cover {uval,pi_state} with wait_lock.
- allow doing rt_mutex_futex_unlock() without dropping wait_lock; which in
turn allows relying on wait_lock atomicity completely.
- simplified the waiter conundrum.
It's now sufficient to hold rtmutex::wait_lock and a reference on the
pi_state to protect the state consistency, so hb->lock can be dropped
before calling rt_mutex_futex_unlock().
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.900002056@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel/futex.c')
-rw-r--r-- | kernel/futex.c | 154 |
1 files changed, 100 insertions, 54 deletions
diff --git a/kernel/futex.c b/kernel/futex.c index 51a248af1db9..3b0aace334a8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -921,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr) | |||
921 | pi_state->owner = NULL; | 921 | pi_state->owner = NULL; |
922 | raw_spin_unlock_irq(&curr->pi_lock); | 922 | raw_spin_unlock_irq(&curr->pi_lock); |
923 | 923 | ||
924 | rt_mutex_futex_unlock(&pi_state->pi_mutex); | 924 | get_pi_state(pi_state); |
925 | |||
926 | spin_unlock(&hb->lock); | 925 | spin_unlock(&hb->lock); |
927 | 926 | ||
927 | rt_mutex_futex_unlock(&pi_state->pi_mutex); | ||
928 | put_pi_state(pi_state); | ||
929 | |||
928 | raw_spin_lock_irq(&curr->pi_lock); | 930 | raw_spin_lock_irq(&curr->pi_lock); |
929 | } | 931 | } |
930 | raw_spin_unlock_irq(&curr->pi_lock); | 932 | raw_spin_unlock_irq(&curr->pi_lock); |
@@ -1037,6 +1039,11 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval, | |||
1037 | * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), | 1039 | * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), |
1038 | * which in turn means that futex_lock_pi() still has a reference on | 1040 | * which in turn means that futex_lock_pi() still has a reference on |
1039 | * our pi_state. | 1041 | * our pi_state. |
1042 | * | ||
1043 | * The waiter holding a reference on @pi_state also protects against | ||
1044 | * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() | ||
1045 | * and futex_wait_requeue_pi() as it cannot go to 0 and consequently | ||
1046 | * free pi_state before we can take a reference ourselves. | ||
1040 | */ | 1047 | */ |
1041 | WARN_ON(!atomic_read(&pi_state->refcount)); | 1048 | WARN_ON(!atomic_read(&pi_state->refcount)); |
1042 | 1049 | ||
@@ -1380,48 +1387,40 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) | |||
1380 | smp_store_release(&q->lock_ptr, NULL); | 1387 | smp_store_release(&q->lock_ptr, NULL); |
1381 | } | 1388 | } |
1382 | 1389 | ||
1383 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter, | 1390 | /* |
1384 | struct futex_hash_bucket *hb) | 1391 | * Caller must hold a reference on @pi_state. |
1392 | */ | ||
1393 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) | ||
1385 | { | 1394 | { |
1386 | struct task_struct *new_owner; | ||
1387 | struct futex_pi_state *pi_state = top_waiter->pi_state; | ||
1388 | u32 uninitialized_var(curval), newval; | 1395 | u32 uninitialized_var(curval), newval; |
1396 | struct task_struct *new_owner; | ||
1397 | bool deboost = false; | ||
1389 | DEFINE_WAKE_Q(wake_q); | 1398 | DEFINE_WAKE_Q(wake_q); |
1390 | bool deboost; | ||
1391 | int ret = 0; | 1399 | int ret = 0; |
1392 | 1400 | ||
1393 | if (!pi_state) | ||
1394 | return -EINVAL; | ||
1395 | |||
1396 | /* | ||
1397 | * If current does not own the pi_state then the futex is | ||
1398 | * inconsistent and user space fiddled with the futex value. | ||
1399 | */ | ||
1400 | if (pi_state->owner != current) | ||
1401 | return -EINVAL; | ||
1402 | |||
1403 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); | 1401 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); |
1404 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | 1402 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); |
1405 | |||
1406 | /* | ||
1407 | * When we interleave with futex_lock_pi() where it does | ||
1408 | * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter, | ||
1409 | * but the rt_mutex's wait_list can be empty (either still, or again, | ||
1410 | * depending on which side we land). | ||
1411 | * | ||
1412 | * When this happens, give up our locks and try again, giving the | ||
1413 | * futex_lock_pi() instance time to complete, either by waiting on the | ||
1414 | * rtmutex or removing itself from the futex queue. | ||
1415 | */ | ||
1416 | if (!new_owner) { | 1403 | if (!new_owner) { |
1417 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | 1404 | /* |
1418 | return -EAGAIN; | 1405 | * Since we held neither hb->lock nor wait_lock when coming |
1406 | * into this function, we could have raced with futex_lock_pi() | ||
1407 | * such that we might observe @this futex_q waiter, but the | ||
1408 | * rt_mutex's wait_list can be empty (either still, or again, | ||
1409 | * depending on which side we land). | ||
1410 | * | ||
1411 | * When this happens, give up our locks and try again, giving | ||
1412 | * the futex_lock_pi() instance time to complete, either by | ||
1413 | * waiting on the rtmutex or removing itself from the futex | ||
1414 | * queue. | ||
1415 | */ | ||
1416 | ret = -EAGAIN; | ||
1417 | goto out_unlock; | ||
1419 | } | 1418 | } |
1420 | 1419 | ||
1421 | /* | 1420 | /* |
1422 | * We pass it to the next owner. The WAITERS bit is always | 1421 | * We pass it to the next owner. The WAITERS bit is always kept |
1423 | * kept enabled while there is PI state around. We cleanup the | 1422 | * enabled while there is PI state around. We cleanup the owner |
1424 | * owner died bit, because we are the owner. | 1423 | * died bit, because we are the owner. |
1425 | */ | 1424 | */ |
1426 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); | 1425 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); |
1427 | 1426 | ||
@@ -1444,10 +1443,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter | |||
1444 | ret = -EINVAL; | 1443 | ret = -EINVAL; |
1445 | } | 1444 | } |
1446 | 1445 | ||
1447 | if (ret) { | 1446 | if (ret) |
1448 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | 1447 | goto out_unlock; |
1449 | return ret; | ||
1450 | } | ||
1451 | 1448 | ||
1452 | raw_spin_lock(&pi_state->owner->pi_lock); | 1449 | raw_spin_lock(&pi_state->owner->pi_lock); |
1453 | WARN_ON(list_empty(&pi_state->list)); | 1450 | WARN_ON(list_empty(&pi_state->list)); |
@@ -1465,15 +1462,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter | |||
1465 | */ | 1462 | */ |
1466 | deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); | 1463 | deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); |
1467 | 1464 | ||
1465 | out_unlock: | ||
1468 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | 1466 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); |
1469 | spin_unlock(&hb->lock); | ||
1470 | 1467 | ||
1471 | if (deboost) { | 1468 | if (deboost) { |
1472 | wake_up_q(&wake_q); | 1469 | wake_up_q(&wake_q); |
1473 | rt_mutex_adjust_prio(current); | 1470 | rt_mutex_adjust_prio(current); |
1474 | } | 1471 | } |
1475 | 1472 | ||
1476 | return 0; | 1473 | return ret; |
1477 | } | 1474 | } |
1478 | 1475 | ||
1479 | /* | 1476 | /* |
@@ -2232,7 +2229,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
2232 | /* | 2229 | /* |
2233 | * We are here either because we stole the rtmutex from the | 2230 | * We are here either because we stole the rtmutex from the |
2234 | * previous highest priority waiter or we are the highest priority | 2231 | * previous highest priority waiter or we are the highest priority |
2235 | * waiter but failed to get the rtmutex the first time. | 2232 | * waiter but have failed to get the rtmutex the first time. |
2233 | * | ||
2236 | * We have to replace the newowner TID in the user space variable. | 2234 | * We have to replace the newowner TID in the user space variable. |
2237 | * This must be atomic as we have to preserve the owner died bit here. | 2235 | * This must be atomic as we have to preserve the owner died bit here. |
2238 | * | 2236 | * |
@@ -2249,7 +2247,7 @@ retry: | |||
2249 | if (get_futex_value_locked(&uval, uaddr)) | 2247 | if (get_futex_value_locked(&uval, uaddr)) |
2250 | goto handle_fault; | 2248 | goto handle_fault; |
2251 | 2249 | ||
2252 | while (1) { | 2250 | for (;;) { |
2253 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | 2251 | newval = (uval & FUTEX_OWNER_DIED) | newtid; |
2254 | 2252 | ||
2255 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) | 2253 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
@@ -2345,6 +2343,10 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) | |||
2345 | /* | 2343 | /* |
2346 | * Got the lock. We might not be the anticipated owner if we | 2344 | * Got the lock. We might not be the anticipated owner if we |
2347 | * did a lock-steal - fix up the PI-state in that case: | 2345 | * did a lock-steal - fix up the PI-state in that case: |
2346 | * | ||
2347 | * We can safely read pi_state->owner without holding wait_lock | ||
2348 | * because we now own the rt_mutex, only the owner will attempt | ||
2349 | * to change it. | ||
2348 | */ | 2350 | */ |
2349 | if (q->pi_state->owner != current) | 2351 | if (q->pi_state->owner != current) |
2350 | ret = fixup_pi_state_owner(uaddr, q, current); | 2352 | ret = fixup_pi_state_owner(uaddr, q, current); |
@@ -2584,6 +2586,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, | |||
2584 | ktime_t *time, int trylock) | 2586 | ktime_t *time, int trylock) |
2585 | { | 2587 | { |
2586 | struct hrtimer_sleeper timeout, *to = NULL; | 2588 | struct hrtimer_sleeper timeout, *to = NULL; |
2589 | struct futex_pi_state *pi_state = NULL; | ||
2587 | struct futex_hash_bucket *hb; | 2590 | struct futex_hash_bucket *hb; |
2588 | struct futex_q q = futex_q_init; | 2591 | struct futex_q q = futex_q_init; |
2589 | int res, ret; | 2592 | int res, ret; |
@@ -2670,12 +2673,19 @@ retry_private: | |||
2670 | * If fixup_owner() faulted and was unable to handle the fault, unlock | 2673 | * If fixup_owner() faulted and was unable to handle the fault, unlock |
2671 | * it and return the fault to userspace. | 2674 | * it and return the fault to userspace. |
2672 | */ | 2675 | */ |
2673 | if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) | 2676 | if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { |
2674 | rt_mutex_futex_unlock(&q.pi_state->pi_mutex); | 2677 | pi_state = q.pi_state; |
2678 | get_pi_state(pi_state); | ||
2679 | } | ||
2675 | 2680 | ||
2676 | /* Unqueue and drop the lock */ | 2681 | /* Unqueue and drop the lock */ |
2677 | unqueue_me_pi(&q); | 2682 | unqueue_me_pi(&q); |
2678 | 2683 | ||
2684 | if (pi_state) { | ||
2685 | rt_mutex_futex_unlock(&pi_state->pi_mutex); | ||
2686 | put_pi_state(pi_state); | ||
2687 | } | ||
2688 | |||
2679 | goto out_put_key; | 2689 | goto out_put_key; |
2680 | 2690 | ||
2681 | out_unlock_put_key: | 2691 | out_unlock_put_key: |
@@ -2738,10 +2748,36 @@ retry: | |||
2738 | */ | 2748 | */ |
2739 | top_waiter = futex_top_waiter(hb, &key); | 2749 | top_waiter = futex_top_waiter(hb, &key); |
2740 | if (top_waiter) { | 2750 | if (top_waiter) { |
2741 | ret = wake_futex_pi(uaddr, uval, top_waiter, hb); | 2751 | struct futex_pi_state *pi_state = top_waiter->pi_state; |
2752 | |||
2753 | ret = -EINVAL; | ||
2754 | if (!pi_state) | ||
2755 | goto out_unlock; | ||
2756 | |||
2757 | /* | ||
2758 | * If current does not own the pi_state then the futex is | ||
2759 | * inconsistent and user space fiddled with the futex value. | ||
2760 | */ | ||
2761 | if (pi_state->owner != current) | ||
2762 | goto out_unlock; | ||
2763 | |||
2742 | /* | 2764 | /* |
2743 | * In case of success wake_futex_pi dropped the hash | 2765 | * Grab a reference on the pi_state and drop hb->lock. |
2744 | * bucket lock. | 2766 | * |
2767 | * The reference ensures pi_state lives, dropping the hb->lock | ||
2768 | * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to | ||
2769 | * close the races against futex_lock_pi(), but in case of | ||
2770 | * _any_ fail we'll abort and retry the whole deal. | ||
2771 | */ | ||
2772 | get_pi_state(pi_state); | ||
2773 | spin_unlock(&hb->lock); | ||
2774 | |||
2775 | ret = wake_futex_pi(uaddr, uval, pi_state); | ||
2776 | |||
2777 | put_pi_state(pi_state); | ||
2778 | |||
2779 | /* | ||
2780 | * Success, we're done! No tricky corner cases. | ||
2745 | */ | 2781 | */ |
2746 | if (!ret) | 2782 | if (!ret) |
2747 | goto out_putkey; | 2783 | goto out_putkey; |
@@ -2756,7 +2792,6 @@ retry: | |||
2756 | * setting the FUTEX_WAITERS bit. Try again. | 2792 | * setting the FUTEX_WAITERS bit. Try again. |
2757 | */ | 2793 | */ |
2758 | if (ret == -EAGAIN) { | 2794 | if (ret == -EAGAIN) { |
2759 | spin_unlock(&hb->lock); | ||
2760 | put_futex_key(&key); | 2795 | put_futex_key(&key); |
2761 | goto retry; | 2796 | goto retry; |
2762 | } | 2797 | } |
@@ -2764,7 +2799,7 @@ retry: | |||
2764 | * wake_futex_pi has detected invalid state. Tell user | 2799 | * wake_futex_pi has detected invalid state. Tell user |
2765 | * space. | 2800 | * space. |
2766 | */ | 2801 | */ |
2767 | goto out_unlock; | 2802 | goto out_putkey; |
2768 | } | 2803 | } |
2769 | 2804 | ||
2770 | /* | 2805 | /* |
@@ -2774,8 +2809,10 @@ retry: | |||
2774 | * preserve the WAITERS bit not the OWNER_DIED one. We are the | 2809 | * preserve the WAITERS bit not the OWNER_DIED one. We are the |
2775 | * owner. | 2810 | * owner. |
2776 | */ | 2811 | */ |
2777 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) | 2812 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { |
2813 | spin_unlock(&hb->lock); | ||
2778 | goto pi_faulted; | 2814 | goto pi_faulted; |
2815 | } | ||
2779 | 2816 | ||
2780 | /* | 2817 | /* |
2781 | * If uval has changed, let user space handle it. | 2818 | * If uval has changed, let user space handle it. |
@@ -2789,7 +2826,6 @@ out_putkey: | |||
2789 | return ret; | 2826 | return ret; |
2790 | 2827 | ||
2791 | pi_faulted: | 2828 | pi_faulted: |
2792 | spin_unlock(&hb->lock); | ||
2793 | put_futex_key(&key); | 2829 | put_futex_key(&key); |
2794 | 2830 | ||
2795 | ret = fault_in_user_writeable(uaddr); | 2831 | ret = fault_in_user_writeable(uaddr); |
@@ -2893,6 +2929,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2893 | u32 __user *uaddr2) | 2929 | u32 __user *uaddr2) |
2894 | { | 2930 | { |
2895 | struct hrtimer_sleeper timeout, *to = NULL; | 2931 | struct hrtimer_sleeper timeout, *to = NULL; |
2932 | struct futex_pi_state *pi_state = NULL; | ||
2896 | struct rt_mutex_waiter rt_waiter; | 2933 | struct rt_mutex_waiter rt_waiter; |
2897 | struct futex_hash_bucket *hb; | 2934 | struct futex_hash_bucket *hb; |
2898 | union futex_key key2 = FUTEX_KEY_INIT; | 2935 | union futex_key key2 = FUTEX_KEY_INIT; |
@@ -2977,8 +3014,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2977 | if (q.pi_state && (q.pi_state->owner != current)) { | 3014 | if (q.pi_state && (q.pi_state->owner != current)) { |
2978 | spin_lock(q.lock_ptr); | 3015 | spin_lock(q.lock_ptr); |
2979 | ret = fixup_pi_state_owner(uaddr2, &q, current); | 3016 | ret = fixup_pi_state_owner(uaddr2, &q, current); |
2980 | if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) | 3017 | if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { |
2981 | rt_mutex_futex_unlock(&q.pi_state->pi_mutex); | 3018 | pi_state = q.pi_state; |
3019 | get_pi_state(pi_state); | ||
3020 | } | ||
2982 | /* | 3021 | /* |
2983 | * Drop the reference to the pi state which | 3022 | * Drop the reference to the pi state which |
2984 | * the requeue_pi() code acquired for us. | 3023 | * the requeue_pi() code acquired for us. |
@@ -3017,13 +3056,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
3017 | * the fault, unlock the rt_mutex and return the fault to | 3056 | * the fault, unlock the rt_mutex and return the fault to |
3018 | * userspace. | 3057 | * userspace. |
3019 | */ | 3058 | */ |
3020 | if (ret && rt_mutex_owner(pi_mutex) == current) | 3059 | if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { |
3021 | rt_mutex_futex_unlock(pi_mutex); | 3060 | pi_state = q.pi_state; |
3061 | get_pi_state(pi_state); | ||
3062 | } | ||
3022 | 3063 | ||
3023 | /* Unqueue and drop the lock. */ | 3064 | /* Unqueue and drop the lock. */ |
3024 | unqueue_me_pi(&q); | 3065 | unqueue_me_pi(&q); |
3025 | } | 3066 | } |
3026 | 3067 | ||
3068 | if (pi_state) { | ||
3069 | rt_mutex_futex_unlock(&pi_state->pi_mutex); | ||
3070 | put_pi_state(pi_state); | ||
3071 | } | ||
3072 | |||
3027 | if (ret == -EINTR) { | 3073 | if (ret == -EINTR) { |
3028 | /* | 3074 | /* |
3029 | * We've already been requeued, but cannot restart by calling | 3075 | * We've already been requeued, but cannot restart by calling |