aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/futex.c
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2012-10-23 16:29:38 -0400
committerThomas Gleixner <tglx@linutronix.de>2012-11-01 07:06:54 -0400
commit59fa6245192159ab5e1e17b8e31f15afa9cff4bf (patch)
treeead58f102889a65adc7f9f10eb4bb666321f5581 /kernel/futex.c
parent1e207eb1c3f0e8b690401f02fe08e7b53903f010 (diff)
futex: Handle futex_pi OWNER_DIED take over correctly
Siddhesh analyzed a failure in the take over of pi futexes in case the owner died and provided a workaround. See: http://sourceware.org/bugzilla/show_bug.cgi?id=14076 The detailed problem analysis shows: Futex F is initialized with PTHREAD_PRIO_INHERIT and PTHREAD_MUTEX_ROBUST_NP attributes. T1 lock_futex_pi(F); T2 lock_futex_pi(F); --> T2 blocks on the futex and creates pi_state which is associated to T1. T1 exits --> exit_robust_list() runs --> Futex F userspace value TID field is set to 0 and FUTEX_OWNER_DIED bit is set. T3 lock_futex_pi(F); --> Succeeds due to the check for F's userspace TID field == 0 --> Claims ownership of the futex and sets its own TID into the userspace TID field of futex F --> returns to user space T1 --> exit_pi_state_list() --> Transfers pi_state to waiter T2 and wakes T2 via rt_mutex_unlock(&pi_state->mutex) T2 --> acquires pi_state->mutex and gains real ownership of the pi_state --> Claims ownership of the futex and sets its own TID into the userspace TID field of futex F --> returns to user space T3 --> observes inconsistent state This problem is independent of UP/SMP, preemptible/non preemptible kernels, or process shared vs. private. The only difference is that certain configurations are more likely to expose it. So as Siddhesh correctly analyzed the following check in futex_lock_pi_atomic() is the culprit: if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { We check the userspace value for a TID value of 0 and take over the futex unconditionally if that's true. AFAICT this check is there as it is correct for a different corner case of futexes: the WAITERS bit became stale. Now the proposed change - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { + if (unlikely(ownerdied || + !(curval & (FUTEX_TID_MASK | FUTEX_WAITERS)))) { solves the problem, but it's not obvious why and it wreckages the "stale WAITERS bit" case. What happens is, that due to the WAITERS bit being set (T2 is blocked on that futex) it enforces T3 to go through lookup_pi_state(), which in the above case returns an existing pi_state and therefor forces T3 to legitimately fight with T2 over the ownership of the pi_state (via pi_state->mutex). Probelm solved! Though that does not work for the "WAITERS bit is stale" problem because if lookup_pi_state() does not find existing pi_state it returns -ERSCH (due to TID == 0) which causes futex_lock_pi() to return -ESRCH to user space because the OWNER_DIED bit is not set. Now there is a different solution to that problem. Do not look at the user space value at all and enforce a lookup of possibly available pi_state. If pi_state can be found, then the new incoming locker T3 blocks on that pi_state and legitimately races with T2 to acquire the rt_mutex and the pi_state and therefor the proper ownership of the user space futex. lookup_pi_state() has the correct order of checks. It first tries to find a pi_state associated with the user space futex and only if that fails it checks for futex TID value = 0. If no pi_state is available nothing can create new state at that point because this happens with the hash bucket lock held. So the above scenario changes to: T1 lock_futex_pi(F); T2 lock_futex_pi(F); --> T2 blocks on the futex and creates pi_state which is associated to T1. T1 exits --> exit_robust_list() runs --> Futex F userspace value TID field is set to 0 and FUTEX_OWNER_DIED bit is set. T3 lock_futex_pi(F); --> Finds pi_state and blocks on pi_state->rt_mutex T1 --> exit_pi_state_list() --> Transfers pi_state to waiter T2 and wakes it via rt_mutex_unlock(&pi_state->mutex) T2 --> acquires pi_state->mutex and gains ownership of the pi_state --> Claims ownership of the futex and sets its own TID into the userspace TID field of futex F --> returns to user space This covers all gazillion points on which T3 might come in between T1's exit_robust_list() clearing the TID field and T2 fixing it up. It also solves the "WAITERS bit stale" problem by forcing the take over. Another benefit of changing the code this way is that it makes it less dependent on untrusted user space values and therefor minimizes the possible wreckage which might be inflicted. As usual after staring for too long at the futex code my brain hurts so much that I really want to ditch that whole optimization of avoiding the syscall for the non contended case for PI futexes and rip out the maze of corner case handling code. Unfortunately we can't as user space relies on that existing behaviour, but at least thinking about it helps me to preserve my mental sanity. Maybe we should nevertheless :) Reported-and-tested-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com> Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1210232138540.2756@ionos Acked-by: Darren Hart <dvhart@linux.intel.com> Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel/futex.c')
-rw-r--r--kernel/futex.c41
1 files changed, 22 insertions, 19 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index 3717e7b306e0..20ef219bbe9b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -716,7 +716,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
716 struct futex_pi_state **ps, 716 struct futex_pi_state **ps,
717 struct task_struct *task, int set_waiters) 717 struct task_struct *task, int set_waiters)
718{ 718{
719 int lock_taken, ret, ownerdied = 0; 719 int lock_taken, ret, force_take = 0;
720 u32 uval, newval, curval, vpid = task_pid_vnr(task); 720 u32 uval, newval, curval, vpid = task_pid_vnr(task);
721 721
722retry: 722retry:
@@ -755,17 +755,15 @@ retry:
755 newval = curval | FUTEX_WAITERS; 755 newval = curval | FUTEX_WAITERS;
756 756
757 /* 757 /*
758 * There are two cases, where a futex might have no owner (the 758 * Should we force take the futex? See below.
759 * owner TID is 0): OWNER_DIED. We take over the futex in this
760 * case. We also do an unconditional take over, when the owner
761 * of the futex died.
762 *
763 * This is safe as we are protected by the hash bucket lock !
764 */ 759 */
765 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 760 if (unlikely(force_take)) {
766 /* Keep the OWNER_DIED bit */ 761 /*
762 * Keep the OWNER_DIED and the WAITERS bit and set the
763 * new TID value.
764 */
767 newval = (curval & ~FUTEX_TID_MASK) | vpid; 765 newval = (curval & ~FUTEX_TID_MASK) | vpid;
768 ownerdied = 0; 766 force_take = 0;
769 lock_taken = 1; 767 lock_taken = 1;
770 } 768 }
771 769
@@ -775,7 +773,7 @@ retry:
775 goto retry; 773 goto retry;
776 774
777 /* 775 /*
778 * We took the lock due to owner died take over. 776 * We took the lock due to forced take over.
779 */ 777 */
780 if (unlikely(lock_taken)) 778 if (unlikely(lock_taken))
781 return 1; 779 return 1;
@@ -790,20 +788,25 @@ retry:
790 switch (ret) { 788 switch (ret) {
791 case -ESRCH: 789 case -ESRCH:
792 /* 790 /*
793 * No owner found for this futex. Check if the 791 * We failed to find an owner for this
794 * OWNER_DIED bit is set to figure out whether 792 * futex. So we have no pi_state to block
795 * this is a robust futex or not. 793 * on. This can happen in two cases:
794 *
795 * 1) The owner died
796 * 2) A stale FUTEX_WAITERS bit
797 *
798 * Re-read the futex value.
796 */ 799 */
797 if (get_futex_value_locked(&curval, uaddr)) 800 if (get_futex_value_locked(&curval, uaddr))
798 return -EFAULT; 801 return -EFAULT;
799 802
800 /* 803 /*
801 * We simply start over in case of a robust 804 * If the owner died or we have a stale
802 * futex. The code above will take the futex 805 * WAITERS bit the owner TID in the user space
803 * and return happy. 806 * futex is 0.
804 */ 807 */
805 if (curval & FUTEX_OWNER_DIED) { 808 if (!(curval & FUTEX_TID_MASK)) {
806 ownerdied = 1; 809 force_take = 1;
807 goto retry; 810 goto retry;
808 } 811 }
809 default: 812 default: