Diffstat (limited to 'kernel/futex.c')
| -rw-r--r-- | kernel/futex.c | 1229 |
1 files changed, 911 insertions, 318 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index d546b2d53a62..1c337112335c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
| @@ -19,6 +19,10 @@ | |||
| 19 | * PRIVATE futexes by Eric Dumazet | 19 | * PRIVATE futexes by Eric Dumazet |
| 20 | * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> | 20 | * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> |
| 21 | * | 21 | * |
| 22 | * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> | ||
| 23 | * Copyright (C) IBM Corporation, 2009 | ||
| 24 | * Thanks to Thomas Gleixner for conceptual design and careful reviews. | ||
| 25 | * | ||
| 22 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 26 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
| 23 | * enough at me, Linus for the original (flawed) idea, Matthew | 27 | * enough at me, Linus for the original (flawed) idea, Matthew |
| 24 | * Kirkwood for proof-of-concept implementation. | 28 | * Kirkwood for proof-of-concept implementation. |
| @@ -96,8 +100,8 @@ struct futex_pi_state { | |||
| 96 | */ | 100 | */ |
| 97 | struct futex_q { | 101 | struct futex_q { |
| 98 | struct plist_node list; | 102 | struct plist_node list; |
| 99 | /* There can only be a single waiter */ | 103 | /* Waiter reference */ |
| 100 | wait_queue_head_t waiter; | 104 | struct task_struct *task; |
| 101 | 105 | ||
| 102 | /* Which hash list lock to use: */ | 106 | /* Which hash list lock to use: */ |
| 103 | spinlock_t *lock_ptr; | 107 | spinlock_t *lock_ptr; |
| @@ -107,7 +111,9 @@ struct futex_q { | |||
| 107 | 111 | ||
| 108 | /* Optional priority inheritance state: */ | 112 | /* Optional priority inheritance state: */ |
| 109 | struct futex_pi_state *pi_state; | 113 | struct futex_pi_state *pi_state; |
| 110 | struct task_struct *task; | 114 | |
| 115 | /* rt_waiter storage for requeue_pi: */ | ||
| 116 | struct rt_mutex_waiter *rt_waiter; | ||
| 111 | 117 | ||
| 112 | /* Bitset for the optional bitmasked wakeup */ | 118 | /* Bitset for the optional bitmasked wakeup */ |
| 113 | u32 bitset; | 119 | u32 bitset; |
| @@ -278,6 +284,44 @@ void put_futex_key(int fshared, union futex_key *key) | |||
| 278 | drop_futex_key_refs(key); | 284 | drop_futex_key_refs(key); |
| 279 | } | 285 | } |
| 280 | 286 | ||
| 287 | /* | ||
| 288 | * fault_in_user_writeable - fault in user address and verify RW access | ||
| 289 | * @uaddr: pointer to faulting user space address | ||
| 290 | * | ||
| 291 | * Slow path to fix up the fault we just took in the atomic write | ||
| 292 | * access to @uaddr. | ||
| 293 | * | ||
| 294 | * We have no generic implementation of a non-destructive write to the | ||
| 295 | * user address. We know that we faulted in the atomic, pagefault- | ||
| 296 | * disabled section, so we might as well avoid the #PF overhead by | ||
| 297 | * calling get_user_pages() right away. | ||
| 298 | */ | ||
| 299 | static int fault_in_user_writeable(u32 __user *uaddr) | ||
| 300 | { | ||
| 301 | int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, | ||
| 302 | 1, 1, 0, NULL, NULL); | ||
| 303 | return ret < 0 ? ret : 0; | ||
| 304 | } | ||
| 305 | |||
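The case this slow path cannot repair is a futex word in a mapping with no write permission: get_user_pages() with write access fails and the syscall returns -EFAULT to userspace. A minimal userspace probe of that path, as a hedged sketch (assumes Linux and the raw SYS_futex interface; the program is illustrative and not part of this patch):

    /* FUTEX_LOCK_PI must write the owner TID into the futex word,
     * which a PROT_READ-only mapping cannot permit. */
    #include <errno.h>
    #include <linux/futex.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
        uint32_t *futex = mmap(NULL, 4096, PROT_READ,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (futex == MAP_FAILED)
            return 1;
        long ret = syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0,
                           NULL, NULL, 0);
        printf("ret=%ld errno=%d (EFAULT is %d)\n", ret, errno, EFAULT);
        return 0;
    }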
| 306 | /** | ||
| 307 | * futex_top_waiter() - Return the highest priority waiter on a futex | ||
| 308 | * @hb: the hash bucket the futex_q's reside in | ||
| 309 | * @key: the futex key (to distinguish it from other futex_q's) | ||
| 310 | * | ||
| 311 | * Must be called with the hb lock held. | ||
| 312 | */ | ||
| 313 | static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, | ||
| 314 | union futex_key *key) | ||
| 315 | { | ||
| 316 | struct futex_q *this; | ||
| 317 | |||
| 318 | plist_for_each_entry(this, &hb->chain, list) { | ||
| 319 | if (match_futex(&this->key, key)) | ||
| 320 | return this; | ||
| 321 | } | ||
| 322 | return NULL; | ||
| 323 | } | ||
| 324 | |||
| 281 | static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) | 325 | static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) |
| 282 | { | 326 | { |
| 283 | u32 curval; | 327 | u32 curval; |
| @@ -539,28 +583,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
| 539 | return 0; | 583 | return 0; |
| 540 | } | 584 | } |
| 541 | 585 | ||
| 586 | /** | ||
| 587 | * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex | ||
| 588 | * @uaddr: the pi futex user address | ||
| 589 | * @hb: the pi futex hash bucket | ||
| 590 | * @key: the futex key associated with uaddr and hb | ||
| 591 | * @ps: the pi_state pointer where we store the result of the | ||
| 592 | * lookup | ||
| 593 | * @task: the task to perform the atomic lock work for. This will | ||
| 594 | * be "current" except in the case of requeue pi. | ||
| 595 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) | ||
| 596 | * | ||
| 597 | * Returns: | ||
| 598 | * 0 - ready to wait | ||
| 599 | * 1 - acquired the lock | ||
| 600 | * <0 - error | ||
| 601 | * | ||
| 602 | * The hb->lock and futex_key refs shall be held by the caller. | ||
| 603 | */ | ||
| 604 | static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | ||
| 605 | union futex_key *key, | ||
| 606 | struct futex_pi_state **ps, | ||
| 607 | struct task_struct *task, int set_waiters) | ||
| 608 | { | ||
| 609 | int lock_taken, ret, ownerdied = 0; | ||
| 610 | u32 uval, newval, curval; | ||
| 611 | |||
| 612 | retry: | ||
| 613 | ret = lock_taken = 0; | ||
| 614 | |||
| 615 | /* | ||
| 616 | * To avoid races, we attempt to take the lock here again | ||
| 617 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | ||
| 618 | * the locks. It will most likely not succeed. | ||
| 619 | */ | ||
| 620 | newval = task_pid_vnr(task); | ||
| 621 | if (set_waiters) | ||
| 622 | newval |= FUTEX_WAITERS; | ||
| 623 | |||
| 624 | curval = cmpxchg_futex_value_locked(uaddr, 0, newval); | ||
| 625 | |||
| 626 | if (unlikely(curval == -EFAULT)) | ||
| 627 | return -EFAULT; | ||
| 628 | |||
| 629 | /* | ||
| 630 | * Detect deadlocks. | ||
| 631 | */ | ||
| 632 | if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))) | ||
| 633 | return -EDEADLK; | ||
| 634 | |||
| 635 | /* | ||
| 636 | * Surprise - we got the lock. Just return to userspace: | ||
| 637 | */ | ||
| 638 | if (unlikely(!curval)) | ||
| 639 | return 1; | ||
| 640 | |||
| 641 | uval = curval; | ||
| 642 | |||
| 643 | /* | ||
| 644 | * Set the FUTEX_WAITERS flag, so the owner will know it has someone | ||
| 645 | * to wake at the next unlock. | ||
| 646 | */ | ||
| 647 | newval = curval | FUTEX_WAITERS; | ||
| 648 | |||
| 649 | /* | ||
| 650 | * There are two cases where we take over the futex: the owner | ||
| 651 | * TID is 0 (the futex currently has no owner), or ownerdied is | ||
| 652 | * set because we detected that the previous owner of the futex | ||
| 653 | * died. | ||
| 654 | * | ||
| 655 | * This is safe as we are protected by the hash bucket lock! | ||
| 656 | */ | ||
| 657 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | ||
| 658 | /* Keep the OWNER_DIED bit */ | ||
| 659 | newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); | ||
| 660 | ownerdied = 0; | ||
| 661 | lock_taken = 1; | ||
| 662 | } | ||
| 663 | |||
| 664 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | ||
| 665 | |||
| 666 | if (unlikely(curval == -EFAULT)) | ||
| 667 | return -EFAULT; | ||
| 668 | if (unlikely(curval != uval)) | ||
| 669 | goto retry; | ||
| 670 | |||
| 671 | /* | ||
| 672 | * We took the lock due to owner died take over. | ||
| 673 | */ | ||
| 674 | if (unlikely(lock_taken)) | ||
| 675 | return 1; | ||
| 676 | |||
| 677 | /* | ||
| 678 | * We don't have the lock. Look up the PI state (or create it if | ||
| 679 | * we are the first waiter): | ||
| 680 | */ | ||
| 681 | ret = lookup_pi_state(uval, hb, key, ps); | ||
| 682 | |||
| 683 | if (unlikely(ret)) { | ||
| 684 | switch (ret) { | ||
| 685 | case -ESRCH: | ||
| 686 | /* | ||
| 687 | * No owner found for this futex. Check if the | ||
| 688 | * OWNER_DIED bit is set to figure out whether | ||
| 689 | * this is a robust futex or not. | ||
| 690 | */ | ||
| 691 | if (get_futex_value_locked(&curval, uaddr)) | ||
| 692 | return -EFAULT; | ||
| 693 | |||
| 694 | /* | ||
| 695 | * We simply start over in case of a robust | ||
| 696 | * futex. The code above will take the futex | ||
| 697 | * and return happy. | ||
| 698 | */ | ||
| 699 | if (curval & FUTEX_OWNER_DIED) { | ||
| 700 | ownerdied = 1; | ||
| 701 | goto retry; | ||
| 702 | } | ||
| 703 | default: | ||
| 704 | break; | ||
| 705 | } | ||
| 706 | } | ||
| 707 | |||
| 708 | return ret; | ||
| 709 | } | ||
| 710 | |||
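futex_lock_pi_atomic() is the kernel half of the PI futex protocol; the userspace half is the mirror-image cmpxchg fast path that only enters the kernel on contention. A hedged sketch of that userspace side (helper names are illustrative; error handling elided):

    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static void pi_lock(uint32_t *futex)
    {
        uint32_t zero = 0;
        uint32_t tid = (uint32_t)syscall(SYS_gettid);

        /* Uncontended fast path: 0 -> TID, no syscall at all. */
        if (__atomic_compare_exchange_n(futex, &zero, tid, 0,
                                        __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
            return;
        /* Contended: the kernel retries the cmpxchg under the hash
         * bucket lock, sets FUTEX_WAITERS, and blocks on the rt_mutex. */
        syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
    }

    static void pi_unlock(uint32_t *futex)
    {
        uint32_t tid = (uint32_t)syscall(SYS_gettid);

        /* Fast path: TID -> 0 succeeds only if FUTEX_WAITERS is clear. */
        if (__atomic_compare_exchange_n(futex, &tid, 0, 0,
                                        __ATOMIC_RELEASE, __ATOMIC_RELAXED))
            return;
        /* Waiters exist: let the kernel hand over to the top waiter. */
        syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
    }

Note how this lines up with the deadlock check above: if a thread's own TID is already in the word, a second pi_lock() from that thread gets -EDEADLK from the kernel rather than hanging.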
| 542 | /* | 711 | /* |
| 543 | * The hash bucket lock must be held when this is called. | 712 | * The hash bucket lock must be held when this is called. |
| 544 | * Afterwards, the futex_q must not be accessed. | 713 | * Afterwards, the futex_q must not be accessed. |
| 545 | */ | 714 | */ |
| 546 | static void wake_futex(struct futex_q *q) | 715 | static void wake_futex(struct futex_q *q) |
| 547 | { | 716 | { |
| 548 | plist_del(&q->list, &q->list.plist); | 717 | struct task_struct *p = q->task; |
| 718 | |||
| 549 | /* | 719 | /* |
| 550 | * The lock in wake_up_all() is a crucial memory barrier after the | 720 | * We set q->lock_ptr = NULL _before_ we wake up the task. If |
| 551 | * plist_del() and also before assigning to q->lock_ptr. | 721 | * a non-futex wakeup happens on another CPU then the task |
| 722 | * might exit and p would dereference a nonexistent task | ||
| 723 | * struct. Prevent this by holding a reference on p across the | ||
| 724 | * wake up. | ||
| 552 | */ | 725 | */ |
| 553 | wake_up(&q->waiter); | 726 | get_task_struct(p); |
| 727 | |||
| 728 | plist_del(&q->list, &q->list.plist); | ||
| 554 | /* | 729 | /* |
| 555 | * The waiting task can free the futex_q as soon as this is written, | 730 | * The waiting task can free the futex_q as soon as |
| 556 | * without taking any locks. This must come last. | 731 | * q->lock_ptr = NULL is written, without taking any locks. A |
| 557 | * | 732 | * memory barrier is required here to prevent the following |
| 558 | * A memory barrier is required here to prevent the following store to | 733 | * store to lock_ptr from getting ahead of the plist_del. |
| 559 | * lock_ptr from getting ahead of the wakeup. Clearing the lock at the | ||
| 560 | * end of wake_up() does not prevent this store from moving. | ||
| 561 | */ | 734 | */ |
| 562 | smp_wmb(); | 735 | smp_wmb(); |
| 563 | q->lock_ptr = NULL; | 736 | q->lock_ptr = NULL; |
| 737 | |||
| 738 | wake_up_state(p, TASK_NORMAL); | ||
| 739 | put_task_struct(p); | ||
| 564 | } | 740 | } |
| 565 | 741 | ||
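The ordering argument in wake_futex() maps onto a release/acquire pairing: the store of NULL to lock_ptr must publish the completed plist_del(), because the waiter treats lock_ptr == NULL as licence to free the futex_q. A simplified C11 model of just that pairing (names are illustrative; in the kernel, smp_wmb() plus the locked wakeup is what actually provides the ordering):

    #include <stdatomic.h>
    #include <stddef.h>

    struct futex_q_model {
        int queued;                   /* stands in for the plist linkage */
        _Atomic(void *) lock_ptr;     /* NULL: waiter may free the q     */
    };

    static void waker(struct futex_q_model *q)
    {
        q->queued = 0;                              /* plist_del()      */
        atomic_store_explicit(&q->lock_ptr, NULL,   /* smp_wmb(); store */
                              memory_order_release);
        /* wake_up_state(p, TASK_NORMAL) happens after the publish, with
         * a task reference held so p cannot vanish underneath us. */
    }

    static int waiter_unqueued(struct futex_q_model *q)
    {
        /* Acquire pairs with the release: seeing NULL guarantees the
         * dequeue (queued == 0) is visible as well. */
        return atomic_load_explicit(&q->lock_ptr,
                                    memory_order_acquire) == NULL;
    }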
| 566 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | 742 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) |
| @@ -689,7 +865,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | |||
| 689 | 865 | ||
| 690 | plist_for_each_entry_safe(this, next, head, list) { | 866 | plist_for_each_entry_safe(this, next, head, list) { |
| 691 | if (match_futex (&this->key, &key)) { | 867 | if (match_futex (&this->key, &key)) { |
| 692 | if (this->pi_state) { | 868 | if (this->pi_state || this->rt_waiter) { |
| 693 | ret = -EINVAL; | 869 | ret = -EINVAL; |
| 694 | break; | 870 | break; |
| 695 | } | 871 | } |
| @@ -739,7 +915,6 @@ retry: | |||
| 739 | retry_private: | 915 | retry_private: |
| 740 | op_ret = futex_atomic_op_inuser(op, uaddr2); | 916 | op_ret = futex_atomic_op_inuser(op, uaddr2); |
| 741 | if (unlikely(op_ret < 0)) { | 917 | if (unlikely(op_ret < 0)) { |
| 742 | u32 dummy; | ||
| 743 | 918 | ||
| 744 | double_unlock_hb(hb1, hb2); | 919 | double_unlock_hb(hb1, hb2); |
| 745 | 920 | ||
| @@ -757,7 +932,7 @@ retry_private: | |||
| 757 | goto out_put_keys; | 932 | goto out_put_keys; |
| 758 | } | 933 | } |
| 759 | 934 | ||
| 760 | ret = get_user(dummy, uaddr2); | 935 | ret = fault_in_user_writeable(uaddr2); |
| 761 | if (ret) | 936 | if (ret) |
| 762 | goto out_put_keys; | 937 | goto out_put_keys; |
| 763 | 938 | ||
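For context: the operation being patched here is FUTEX_WAKE_OP, whose op argument makes the kernel write to *uaddr2, which is why the fault recovery now needs a writable fault-in rather than a read via get_user(). A hedged usage sketch (the helper name and the add/compare choice are illustrative):

    #include <limits.h>
    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static long wake_op(uint32_t *uaddr1, uint32_t *uaddr2)
    {
        /* Atomically: oldval = *uaddr2; *uaddr2 = oldval + 1; then wake
         * one waiter on uaddr1, and all waiters on uaddr2 if oldval > 0. */
        return syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, 1, INT_MAX,
                       uaddr2, FUTEX_OP(FUTEX_OP_ADD, 1,
                                        FUTEX_OP_CMP_GT, 0));
    }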
| @@ -802,24 +977,185 @@ out: | |||
| 802 | return ret; | 977 | return ret; |
| 803 | } | 978 | } |
| 804 | 979 | ||
| 805 | /* | 980 | /** |
| 806 | * Requeue all waiters hashed on one physical page to another | 981 | * requeue_futex() - Requeue a futex_q from one hb to another |
| 807 | * physical page. | 982 | * @q: the futex_q to requeue |
| 983 | * @hb1: the source hash_bucket | ||
| 984 | * @hb2: the target hash_bucket | ||
| 985 | * @key2: the new key for the requeued futex_q | ||
| 986 | */ | ||
| 987 | static inline | ||
| 988 | void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, | ||
| 989 | struct futex_hash_bucket *hb2, union futex_key *key2) | ||
| 990 | { | ||
| 991 | |||
| 992 | /* | ||
| 993 | * If key1 and key2 hash to the same bucket, no need to | ||
| 994 | * requeue. | ||
| 995 | */ | ||
| 996 | if (likely(&hb1->chain != &hb2->chain)) { | ||
| 997 | plist_del(&q->list, &hb1->chain); | ||
| 998 | plist_add(&q->list, &hb2->chain); | ||
| 999 | q->lock_ptr = &hb2->lock; | ||
| 1000 | #ifdef CONFIG_DEBUG_PI_LIST | ||
| 1001 | q->list.plist.lock = &hb2->lock; | ||
| 1002 | #endif | ||
| 1003 | } | ||
| 1004 | get_futex_key_refs(key2); | ||
| 1005 | q->key = *key2; | ||
| 1006 | } | ||
| 1007 | |||
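requeue_futex() is the mechanism behind the classic FUTEX_CMP_REQUEUE broadcast pattern: wake one waiter and migrate the rest to the mutex word so they do not all stampede out of the kernel at once. A hedged userspace sketch (the condvar layout and helper name are illustrative):

    #include <limits.h>
    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static long cond_broadcast(uint32_t *cond, uint32_t *mutex)
    {
        uint32_t seq = __atomic_load_n(cond, __ATOMIC_RELAXED);

        /* Wake 1 waiter on cond, requeue up to INT_MAX of the rest onto
         * mutex; the kernel re-checks *cond == seq under the hb locks
         * and returns -EAGAIN if the word changed in the meantime. */
        return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE, 1, INT_MAX,
                       mutex, seq);
    }

On success the syscall's return value counts woken plus requeued tasks, which is what the new "return ret ? ret : task_count" accounting at the end of futex_requeue() below feeds.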
| 1008 | /** | ||
| 1009 | * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue | ||
| 1010 | * @q: the futex_q | ||
| 1011 | * @key: the key of the requeue target futex | ||
| 1012 | * | ||
| 1013 | * During futex_requeue, with requeue_pi=1, it is possible to acquire the | ||
| 1014 | * target futex if it is uncontended or via a lock steal. Set the futex_q key | ||
| 1015 | * to the requeue target futex so the waiter can detect the wakeup on the right | ||
| 1016 | * futex, but remove it from the hb and NULL the rt_waiter so it can detect | ||
| 1017 | * atomic lock acquisition. Must be called with the q->lock_ptr held. | ||
| 1018 | */ | ||
| 1019 | static inline | ||
| 1020 | void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) | ||
| 1021 | { | ||
| 1022 | drop_futex_key_refs(&q->key); | ||
| 1023 | get_futex_key_refs(key); | ||
| 1024 | q->key = *key; | ||
| 1025 | |||
| 1026 | WARN_ON(plist_node_empty(&q->list)); | ||
| 1027 | plist_del(&q->list, &q->list.plist); | ||
| 1028 | |||
| 1029 | WARN_ON(!q->rt_waiter); | ||
| 1030 | q->rt_waiter = NULL; | ||
| 1031 | |||
| 1032 | wake_up_state(q->task, TASK_NORMAL); | ||
| 1033 | } | ||
| 1034 | |||
| 1035 | /** | ||
| 1036 | * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter | ||
| 1037 | * @pifutex: the user address of the to futex | ||
| 1038 | * @hb1: the from futex hash bucket, must be locked by the caller | ||
| 1039 | * @hb2: the to futex hash bucket, must be locked by the caller | ||
| 1040 | * @key1: the from futex key | ||
| 1041 | * @key2: the to futex key | ||
| 1042 | * @ps: address to store the pi_state pointer | ||
| 1043 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) | ||
| 1044 | * | ||
| 1045 | * Try and get the lock on behalf of the top waiter if we can do it atomically. | ||
| 1046 | * Wake the top waiter if we succeed. If the caller specified set_waiters, | ||
| 1047 | * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. | ||
| 1048 | * hb1 and hb2 must be held by the caller. | ||
| 1049 | * | ||
| 1050 | * Returns: | ||
| 1051 | * 0 - failed to acquire the lock atomically | ||
| 1052 | * 1 - acquired the lock | ||
| 1053 | * <0 - error | ||
| 1054 | */ | ||
| 1055 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, | ||
| 1056 | struct futex_hash_bucket *hb1, | ||
| 1057 | struct futex_hash_bucket *hb2, | ||
| 1058 | union futex_key *key1, union futex_key *key2, | ||
| 1059 | struct futex_pi_state **ps, int set_waiters) | ||
| 1060 | { | ||
| 1061 | struct futex_q *top_waiter = NULL; | ||
| 1062 | u32 curval; | ||
| 1063 | int ret; | ||
| 1064 | |||
| 1065 | if (get_futex_value_locked(&curval, pifutex)) | ||
| 1066 | return -EFAULT; | ||
| 1067 | |||
| 1068 | /* | ||
| 1069 | * Find the top_waiter and determine if there are additional waiters. | ||
| 1070 | * If the caller intends to requeue more than 1 waiter to pifutex, | ||
| 1071 | * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, | ||
| 1072 | * as we have means to handle the possible fault. If not, don't set | ||
| 1073 | * the bit unnecessarily as it will force the subsequent unlock to enter | ||
| 1074 | * the kernel. | ||
| 1075 | */ | ||
| 1076 | top_waiter = futex_top_waiter(hb1, key1); | ||
| 1077 | |||
| 1078 | /* There are no waiters, nothing for us to do. */ | ||
| 1079 | if (!top_waiter) | ||
| 1080 | return 0; | ||
| 1081 | |||
| 1082 | /* | ||
| 1083 | * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in | ||
| 1084 | * the contended case or if set_waiters is 1. The pi_state is returned | ||
| 1085 | * in ps in contended cases. | ||
| 1086 | */ | ||
| 1087 | ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, | ||
| 1088 | set_waiters); | ||
| 1089 | if (ret == 1) | ||
| 1090 | requeue_pi_wake_futex(top_waiter, key2); | ||
| 1091 | |||
| 1092 | return ret; | ||
| 1093 | } | ||
| 1094 | |||
| 1095 | /** | ||
| 1096 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 | ||
| 1097 | * @uaddr1: source futex user address | ||
| 1098 | * @uaddr2: target futex user address | ||
| 1099 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) | ||
| 1100 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) | ||
| 1101 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a | ||
| 1102 | * pi futex (pi to pi requeue is not supported) | ||
| 1103 | * | ||
| 1104 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | ||
| 1105 | * uaddr2 atomically on behalf of the top waiter. | ||
| 1106 | * | ||
| 1107 | * Returns: | ||
| 1108 | * >=0 - on success, the number of tasks requeued or woken | ||
| 1109 | * <0 - on error | ||
| 808 | */ | 1110 | */ |
| 809 | static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | 1111 | static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, |
| 810 | int nr_wake, int nr_requeue, u32 *cmpval) | 1112 | int nr_wake, int nr_requeue, u32 *cmpval, |
| 1113 | int requeue_pi) | ||
| 811 | { | 1114 | { |
| 812 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 1115 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
| 1116 | int drop_count = 0, task_count = 0, ret; | ||
| 1117 | struct futex_pi_state *pi_state = NULL; | ||
| 813 | struct futex_hash_bucket *hb1, *hb2; | 1118 | struct futex_hash_bucket *hb1, *hb2; |
| 814 | struct plist_head *head1; | 1119 | struct plist_head *head1; |
| 815 | struct futex_q *this, *next; | 1120 | struct futex_q *this, *next; |
| 816 | int ret, drop_count = 0; | 1121 | u32 curval2; |
| 1122 | |||
| 1123 | if (requeue_pi) { | ||
| 1124 | /* | ||
| 1125 | * requeue_pi requires a pi_state, try to allocate it now | ||
| 1126 | * without any locks in case it fails. | ||
| 1127 | */ | ||
| 1128 | if (refill_pi_state_cache()) | ||
| 1129 | return -ENOMEM; | ||
| 1130 | /* | ||
| 1131 | * requeue_pi must wake as many tasks as it can, up to nr_wake | ||
| 1132 | * + nr_requeue, since it acquires the rt_mutex prior to | ||
| 1133 | * returning to userspace, so as to not leave the rt_mutex with | ||
| 1134 | * waiters and no owner. However, second and third wake-ups | ||
| 1135 | * cannot be predicted as they involve race conditions with the | ||
| 1136 | * first wake and a fault while looking up the pi_state. Both | ||
| 1137 | * pthread_cond_signal() and pthread_cond_broadcast() should | ||
| 1138 | * use nr_wake=1. | ||
| 1139 | */ | ||
| 1140 | if (nr_wake != 1) | ||
| 1141 | return -EINVAL; | ||
| 1142 | } | ||
| 817 | 1143 | ||
| 818 | retry: | 1144 | retry: |
| 1145 | if (pi_state != NULL) { | ||
| 1146 | /* | ||
| 1147 | * We will have to lookup the pi_state again, so free this one | ||
| 1148 | * to keep the accounting correct. | ||
| 1149 | */ | ||
| 1150 | free_pi_state(pi_state); | ||
| 1151 | pi_state = NULL; | ||
| 1152 | } | ||
| 1153 | |||
| 819 | ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); | 1154 | ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); |
| 820 | if (unlikely(ret != 0)) | 1155 | if (unlikely(ret != 0)) |
| 821 | goto out; | 1156 | goto out; |
| 822 | ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ); | 1157 | ret = get_futex_key(uaddr2, fshared, &key2, |
| 1158 | requeue_pi ? VERIFY_WRITE : VERIFY_READ); | ||
| 823 | if (unlikely(ret != 0)) | 1159 | if (unlikely(ret != 0)) |
| 824 | goto out_put_key1; | 1160 | goto out_put_key1; |
| 825 | 1161 | ||
| @@ -854,32 +1190,99 @@ retry_private: | |||
| 854 | } | 1190 | } |
| 855 | } | 1191 | } |
| 856 | 1192 | ||
| 1193 | if (requeue_pi && (task_count - nr_wake < nr_requeue)) { | ||
| 1194 | /* | ||
| 1195 | * Attempt to acquire uaddr2 and wake the top waiter. If we | ||
| 1196 | * intend to requeue waiters, force setting the FUTEX_WAITERS | ||
| 1197 | * bit. We force this here where we are able to easily handle | ||
| 1198 | * faults rather than in the requeue loop below. | ||
| 1199 | */ | ||
| 1200 | ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, | ||
| 1201 | &key2, &pi_state, nr_requeue); | ||
| 1202 | |||
| 1203 | /* | ||
| 1204 | * At this point the top_waiter has either taken uaddr2 or is | ||
| 1205 | * waiting on it. If the former, then the pi_state will not | ||
| 1206 | * exist yet, look it up one more time to ensure we have a | ||
| 1207 | * reference to it. | ||
| 1208 | */ | ||
| 1209 | if (ret == 1) { | ||
| 1210 | WARN_ON(pi_state); | ||
| 1211 | task_count++; | ||
| 1212 | ret = get_futex_value_locked(&curval2, uaddr2); | ||
| 1213 | if (!ret) | ||
| 1214 | ret = lookup_pi_state(curval2, hb2, &key2, | ||
| 1215 | &pi_state); | ||
| 1216 | } | ||
| 1217 | |||
| 1218 | switch (ret) { | ||
| 1219 | case 0: | ||
| 1220 | break; | ||
| 1221 | case -EFAULT: | ||
| 1222 | double_unlock_hb(hb1, hb2); | ||
| 1223 | put_futex_key(fshared, &key2); | ||
| 1224 | put_futex_key(fshared, &key1); | ||
| 1225 | ret = fault_in_user_writeable(uaddr2); | ||
| 1226 | if (!ret) | ||
| 1227 | goto retry; | ||
| 1228 | goto out; | ||
| 1229 | case -EAGAIN: | ||
| 1230 | /* The owner was exiting, try again. */ | ||
| 1231 | double_unlock_hb(hb1, hb2); | ||
| 1232 | put_futex_key(fshared, &key2); | ||
| 1233 | put_futex_key(fshared, &key1); | ||
| 1234 | cond_resched(); | ||
| 1235 | goto retry; | ||
| 1236 | default: | ||
| 1237 | goto out_unlock; | ||
| 1238 | } | ||
| 1239 | } | ||
| 1240 | |||
| 857 | head1 = &hb1->chain; | 1241 | head1 = &hb1->chain; |
| 858 | plist_for_each_entry_safe(this, next, head1, list) { | 1242 | plist_for_each_entry_safe(this, next, head1, list) { |
| 859 | if (!match_futex (&this->key, &key1)) | 1243 | if (task_count - nr_wake >= nr_requeue) |
| 1244 | break; | ||
| 1245 | |||
| 1246 | if (!match_futex(&this->key, &key1)) | ||
| 860 | continue; | 1247 | continue; |
| 861 | if (++ret <= nr_wake) { | 1248 | |
| 1249 | WARN_ON(!requeue_pi && this->rt_waiter); | ||
| 1250 | WARN_ON(requeue_pi && !this->rt_waiter); | ||
| 1251 | |||
| 1252 | /* | ||
| 1253 | * Wake nr_wake waiters. For requeue_pi, if we acquired the | ||
| 1254 | * lock, we already woke the top_waiter. If not, it will be | ||
| 1255 | * woken by futex_unlock_pi(). | ||
| 1256 | */ | ||
| 1257 | if (++task_count <= nr_wake && !requeue_pi) { | ||
| 862 | wake_futex(this); | 1258 | wake_futex(this); |
| 863 | } else { | 1259 | continue; |
| 864 | /* | 1260 | } |
| 865 | * If key1 and key2 hash to the same bucket, no need to | ||
| 866 | * requeue. | ||
| 867 | */ | ||
| 868 | if (likely(head1 != &hb2->chain)) { | ||
| 869 | plist_del(&this->list, &hb1->chain); | ||
| 870 | plist_add(&this->list, &hb2->chain); | ||
| 871 | this->lock_ptr = &hb2->lock; | ||
| 872 | #ifdef CONFIG_DEBUG_PI_LIST | ||
| 873 | this->list.plist.lock = &hb2->lock; | ||
| 874 | #endif | ||
| 875 | } | ||
| 876 | this->key = key2; | ||
| 877 | get_futex_key_refs(&key2); | ||
| 878 | drop_count++; | ||
| 879 | 1261 | ||
| 880 | if (ret - nr_wake >= nr_requeue) | 1262 | /* |
| 881 | break; | 1263 | * Requeue nr_requeue waiters and possibly one more in the case |
| 1264 | * of requeue_pi if we couldn't acquire the lock atomically. | ||
| 1265 | */ | ||
| 1266 | if (requeue_pi) { | ||
| 1267 | /* Prepare the waiter to take the rt_mutex. */ | ||
| 1268 | atomic_inc(&pi_state->refcount); | ||
| 1269 | this->pi_state = pi_state; | ||
| 1270 | ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, | ||
| 1271 | this->rt_waiter, | ||
| 1272 | this->task, 1); | ||
| 1273 | if (ret == 1) { | ||
| 1274 | /* We got the lock. */ | ||
| 1275 | requeue_pi_wake_futex(this, &key2); | ||
| 1276 | continue; | ||
| 1277 | } else if (ret) { | ||
| 1278 | /* -EDEADLK */ | ||
| 1279 | this->pi_state = NULL; | ||
| 1280 | free_pi_state(pi_state); | ||
| 1281 | goto out_unlock; | ||
| 1282 | } | ||
| 882 | } | 1283 | } |
| 1284 | requeue_futex(this, hb1, hb2, &key2); | ||
| 1285 | drop_count++; | ||
| 883 | } | 1286 | } |
| 884 | 1287 | ||
| 885 | out_unlock: | 1288 | out_unlock: |
| @@ -899,7 +1302,9 @@ out_put_keys: | |||
| 899 | out_put_key1: | 1302 | out_put_key1: |
| 900 | put_futex_key(fshared, &key1); | 1303 | put_futex_key(fshared, &key1); |
| 901 | out: | 1304 | out: |
| 902 | return ret; | 1305 | if (pi_state != NULL) |
| 1306 | free_pi_state(pi_state); | ||
| 1307 | return ret ? ret : task_count; | ||
| 903 | } | 1308 | } |
| 904 | 1309 | ||
| 905 | /* The key must be already stored in q->key. */ | 1310 | /* The key must be already stored in q->key. */ |
| @@ -907,8 +1312,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | |||
| 907 | { | 1312 | { |
| 908 | struct futex_hash_bucket *hb; | 1313 | struct futex_hash_bucket *hb; |
| 909 | 1314 | ||
| 910 | init_waitqueue_head(&q->waiter); | ||
| 911 | |||
| 912 | get_futex_key_refs(&q->key); | 1315 | get_futex_key_refs(&q->key); |
| 913 | hb = hash_futex(&q->key); | 1316 | hb = hash_futex(&q->key); |
| 914 | q->lock_ptr = &hb->lock; | 1317 | q->lock_ptr = &hb->lock; |
| @@ -1097,7 +1500,7 @@ retry: | |||
| 1097 | handle_fault: | 1500 | handle_fault: |
| 1098 | spin_unlock(q->lock_ptr); | 1501 | spin_unlock(q->lock_ptr); |
| 1099 | 1502 | ||
| 1100 | ret = get_user(uval, uaddr); | 1503 | ret = fault_in_user_writeable(uaddr); |
| 1101 | 1504 | ||
| 1102 | spin_lock(q->lock_ptr); | 1505 | spin_lock(q->lock_ptr); |
| 1103 | 1506 | ||
| @@ -1119,35 +1522,149 @@ handle_fault: | |||
| 1119 | */ | 1522 | */ |
| 1120 | #define FLAGS_SHARED 0x01 | 1523 | #define FLAGS_SHARED 0x01 |
| 1121 | #define FLAGS_CLOCKRT 0x02 | 1524 | #define FLAGS_CLOCKRT 0x02 |
| 1525 | #define FLAGS_HAS_TIMEOUT 0x04 | ||
| 1122 | 1526 | ||
| 1123 | static long futex_wait_restart(struct restart_block *restart); | 1527 | static long futex_wait_restart(struct restart_block *restart); |
| 1124 | 1528 | ||
| 1125 | static int futex_wait(u32 __user *uaddr, int fshared, | 1529 | /** |
| 1126 | u32 val, ktime_t *abs_time, u32 bitset, int clockrt) | 1530 | * fixup_owner() - Post lock pi_state and corner case management |
| 1531 | * @uaddr: user address of the futex | ||
| 1532 | * @fshared: whether the futex is shared (1) or not (0) | ||
| 1533 | * @q: futex_q (contains pi_state and access to the rt_mutex) | ||
| 1534 | * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) | ||
| 1535 | * | ||
| 1536 | * After attempting to lock an rt_mutex, this function is called to cleanup | ||
| 1537 | * the pi_state owner as well as handle race conditions that may allow us to | ||
| 1538 | * acquire the lock. Must be called with the hb lock held. | ||
| 1539 | * | ||
| 1540 | * Returns: | ||
| 1541 | * 1 - success, lock taken | ||
| 1542 | * 0 - success, lock not taken | ||
| 1543 | * <0 - on error (-EFAULT) | ||
| 1544 | */ | ||
| 1545 | static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, | ||
| 1546 | int locked) | ||
| 1127 | { | 1547 | { |
| 1128 | struct task_struct *curr = current; | 1548 | struct task_struct *owner; |
| 1129 | struct restart_block *restart; | 1549 | int ret = 0; |
| 1130 | DECLARE_WAITQUEUE(wait, curr); | ||
| 1131 | struct futex_hash_bucket *hb; | ||
| 1132 | struct futex_q q; | ||
| 1133 | u32 uval; | ||
| 1134 | int ret; | ||
| 1135 | struct hrtimer_sleeper t; | ||
| 1136 | int rem = 0; | ||
| 1137 | 1550 | ||
| 1138 | if (!bitset) | 1551 | if (locked) { |
| 1139 | return -EINVAL; | 1552 | /* |
| 1553 | * Got the lock. We might not be the anticipated owner if we | ||
| 1554 | * did a lock-steal - fix up the PI-state in that case: | ||
| 1555 | */ | ||
| 1556 | if (q->pi_state->owner != current) | ||
| 1557 | ret = fixup_pi_state_owner(uaddr, q, current, fshared); | ||
| 1558 | goto out; | ||
| 1559 | } | ||
| 1140 | 1560 | ||
| 1141 | q.pi_state = NULL; | 1561 | /* |
| 1142 | q.bitset = bitset; | 1562 | * Catch the rare case, where the lock was released when we were on the |
| 1143 | retry: | 1563 | * way back before we locked the hash bucket. |
| 1144 | q.key = FUTEX_KEY_INIT; | 1564 | */ |
| 1145 | ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ); | 1565 | if (q->pi_state->owner == current) { |
| 1146 | if (unlikely(ret != 0)) | 1566 | /* |
| 1567 | * Try to get the rt_mutex now. This might fail as some other | ||
| 1568 | * task acquired the rt_mutex after we removed ourself from the | ||
| 1569 | * rt_mutex waiters list. | ||
| 1570 | */ | ||
| 1571 | if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { | ||
| 1572 | locked = 1; | ||
| 1573 | goto out; | ||
| 1574 | } | ||
| 1575 | |||
| 1576 | /* | ||
| 1577 | * pi_state is incorrect, some other task did a lock steal and | ||
| 1578 | * we returned due to timeout or signal without taking the | ||
| 1579 | * rt_mutex. Too late. We can access the rt_mutex_owner without | ||
| 1580 | * locking, as the other task is now blocked on the hash bucket | ||
| 1581 | * lock. Fix the state up. | ||
| 1582 | */ | ||
| 1583 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); | ||
| 1584 | ret = fixup_pi_state_owner(uaddr, q, owner, fshared); | ||
| 1147 | goto out; | 1585 | goto out; |
| 1586 | } | ||
| 1148 | 1587 | ||
| 1149 | retry_private: | 1588 | /* |
| 1150 | hb = queue_lock(&q); | 1589 | * Paranoia check. If we did not take the lock, then we should not be |
| 1590 | * the owner, nor the pending owner, of the rt_mutex. | ||
| 1591 | */ | ||
| 1592 | if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) | ||
| 1593 | printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " | ||
| 1594 | "pi-state %p\n", ret, | ||
| 1595 | q->pi_state->pi_mutex.owner, | ||
| 1596 | q->pi_state->owner); | ||
| 1597 | |||
| 1598 | out: | ||
| 1599 | return ret ? ret : locked; | ||
| 1600 | } | ||
| 1601 | |||
| 1602 | /** | ||
| 1603 | * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal | ||
| 1604 | * @hb: the futex hash bucket, must be locked by the caller | ||
| 1605 | * @q: the futex_q to queue up on | ||
| 1606 | * @timeout: the prepared hrtimer_sleeper, or null for no timeout | ||
| 1607 | */ | ||
| 1608 | static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | ||
| 1609 | struct hrtimer_sleeper *timeout) | ||
| 1610 | { | ||
| 1611 | queue_me(q, hb); | ||
| 1612 | |||
| 1613 | /* | ||
| 1614 | * There might have been scheduling since the queue_me(), as we | ||
| 1615 | * cannot hold a spinlock across the get_user() in case it | ||
| 1616 | * faults, and we cannot just set TASK_INTERRUPTIBLE state when | ||
| 1617 | * queueing ourselves into the futex hash. This code thus has to | ||
| 1618 | * rely on the futex_wake() code removing us from hash when it | ||
| 1619 | * wakes us up. | ||
| 1620 | */ | ||
| 1621 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 1622 | |||
| 1623 | /* Arm the timer */ | ||
| 1624 | if (timeout) { | ||
| 1625 | hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); | ||
| 1626 | if (!hrtimer_active(&timeout->timer)) | ||
| 1627 | timeout->task = NULL; | ||
| 1628 | } | ||
| 1629 | |||
| 1630 | /* | ||
| 1631 | * !plist_node_empty() is safe here without any lock. | ||
| 1632 | * q.lock_ptr != 0 is not safe, because of ordering against wakeup. | ||
| 1633 | */ | ||
| 1634 | if (likely(!plist_node_empty(&q->list))) { | ||
| 1635 | /* | ||
| 1636 | * If the timer has already expired, current will already be | ||
| 1637 | * flagged for rescheduling. Only call schedule if there | ||
| 1638 | * is no timeout, or if it has yet to expire. | ||
| 1639 | */ | ||
| 1640 | if (!timeout || timeout->task) | ||
| 1641 | schedule(); | ||
| 1642 | } | ||
| 1643 | __set_current_state(TASK_RUNNING); | ||
| 1644 | } | ||
| 1645 | |||
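The contract this queue-then-sleep sequence upholds for userspace is the standard compare-and-block loop: FUTEX_WAIT sleeps only while the futex word still holds the expected value, so a wake can be absorbed spuriously but never lost. A hedged sketch of the caller's side (illustrative helper, minimal error handling):

    #include <errno.h>
    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static void wait_for_change(uint32_t *futex, uint32_t val)
    {
        while (__atomic_load_n(futex, __ATOMIC_ACQUIRE) == val) {
            /* The kernel re-reads *futex under the hash bucket lock
             * (futex_wait_setup() below) and returns EWOULDBLOCK if
             * it no longer equals val. */
            if (syscall(SYS_futex, futex, FUTEX_WAIT, val,
                        NULL, NULL, 0) == -1 &&
                errno != EAGAIN && errno != EINTR)
                break;      /* EFAULT or similar: give up */
        }
    }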
| 1646 | /** | ||
| 1647 | * futex_wait_setup() - Prepare to wait on a futex | ||
| 1648 | * @uaddr: the futex userspace address | ||
| 1649 | * @val: the expected value | ||
| 1650 | * @fshared: whether the futex is shared (1) or not (0) | ||
| 1651 | * @q: the associated futex_q | ||
| 1652 | * @hb: storage for hash_bucket pointer to be returned to caller | ||
| 1653 | * | ||
| 1654 | * Setup the futex_q and locate the hash_bucket. Get the futex value and | ||
| 1655 | * compare it with the expected value. Handle atomic faults internally. | ||
| 1656 | * Return with the hb lock held and a q.key reference on success, and unlocked | ||
| 1657 | * with no q.key reference on failure. | ||
| 1658 | * | ||
| 1659 | * Returns: | ||
| 1660 | * 0 - uaddr contains val and hb has been locked | ||
| 1661 | * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked | ||
| 1662 | */ | ||
| 1663 | static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, | ||
| 1664 | struct futex_q *q, struct futex_hash_bucket **hb) | ||
| 1665 | { | ||
| 1666 | u32 uval; | ||
| 1667 | int ret; | ||
| 1151 | 1668 | ||
| 1152 | /* | 1669 | /* |
| 1153 | * Access the page AFTER the hash-bucket is locked. | 1670 | * Access the page AFTER the hash-bucket is locked. |
| @@ -1165,95 +1682,83 @@ retry_private: | |||
| 1165 | * A consequence is that futex_wait() can return zero and absorb | 1682 | * A consequence is that futex_wait() can return zero and absorb |
| 1166 | * a wakeup when *uaddr != val on entry to the syscall. This is | 1683 | * a wakeup when *uaddr != val on entry to the syscall. This is |
| 1167 | * rare, but normal. | 1684 | * rare, but normal. |
| 1168 | * | ||
| 1169 | * For shared futexes, we hold the mmap semaphore, so the mapping | ||
| 1170 | * cannot have changed since we looked it up in get_futex_key. | ||
| 1171 | */ | 1685 | */ |
| 1686 | retry: | ||
| 1687 | q->key = FUTEX_KEY_INIT; | ||
| 1688 | ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); | ||
| 1689 | if (unlikely(ret != 0)) | ||
| 1690 | return ret; | ||
| 1691 | |||
| 1692 | retry_private: | ||
| 1693 | *hb = queue_lock(q); | ||
| 1694 | |||
| 1172 | ret = get_futex_value_locked(&uval, uaddr); | 1695 | ret = get_futex_value_locked(&uval, uaddr); |
| 1173 | 1696 | ||
| 1174 | if (unlikely(ret)) { | 1697 | if (ret) { |
| 1175 | queue_unlock(&q, hb); | 1698 | queue_unlock(q, *hb); |
| 1176 | 1699 | ||
| 1177 | ret = get_user(uval, uaddr); | 1700 | ret = get_user(uval, uaddr); |
| 1178 | if (ret) | 1701 | if (ret) |
| 1179 | goto out_put_key; | 1702 | goto out; |
| 1180 | 1703 | ||
| 1181 | if (!fshared) | 1704 | if (!fshared) |
| 1182 | goto retry_private; | 1705 | goto retry_private; |
| 1183 | 1706 | ||
| 1184 | put_futex_key(fshared, &q.key); | 1707 | put_futex_key(fshared, &q->key); |
| 1185 | goto retry; | 1708 | goto retry; |
| 1186 | } | 1709 | } |
| 1187 | ret = -EWOULDBLOCK; | ||
| 1188 | if (unlikely(uval != val)) { | ||
| 1189 | queue_unlock(&q, hb); | ||
| 1190 | goto out_put_key; | ||
| 1191 | } | ||
| 1192 | 1710 | ||
| 1193 | /* Only actually queue if *uaddr contained val. */ | 1711 | if (uval != val) { |
| 1194 | queue_me(&q, hb); | 1712 | queue_unlock(q, *hb); |
| 1713 | ret = -EWOULDBLOCK; | ||
| 1714 | } | ||
| 1195 | 1715 | ||
| 1196 | /* | 1716 | out: |
| 1197 | * There might have been scheduling since the queue_me(), as we | 1717 | if (ret) |
| 1198 | * cannot hold a spinlock across the get_user() in case it | 1718 | put_futex_key(fshared, &q->key); |
| 1199 | * faults, and we cannot just set TASK_INTERRUPTIBLE state when | 1719 | return ret; |
| 1200 | * queueing ourselves into the futex hash. This code thus has to | 1720 | } |
| 1201 | * rely on the futex_wake() code removing us from hash when it | ||
| 1202 | * wakes us up. | ||
| 1203 | */ | ||
| 1204 | 1721 | ||
| 1205 | /* add_wait_queue is the barrier after __set_current_state. */ | 1722 | static int futex_wait(u32 __user *uaddr, int fshared, |
| 1206 | __set_current_state(TASK_INTERRUPTIBLE); | 1723 | u32 val, ktime_t *abs_time, u32 bitset, int clockrt) |
| 1207 | add_wait_queue(&q.waiter, &wait); | 1724 | { |
| 1208 | /* | 1725 | struct hrtimer_sleeper timeout, *to = NULL; |
| 1209 | * !plist_node_empty() is safe here without any lock. | 1726 | struct restart_block *restart; |
| 1210 | * q.lock_ptr != 0 is not safe, because of ordering against wakeup. | 1727 | struct futex_hash_bucket *hb; |
| 1211 | */ | 1728 | struct futex_q q; |
| 1212 | if (likely(!plist_node_empty(&q.list))) { | 1729 | int ret; |
| 1213 | if (!abs_time) | ||
| 1214 | schedule(); | ||
| 1215 | else { | ||
| 1216 | hrtimer_init_on_stack(&t.timer, | ||
| 1217 | clockrt ? CLOCK_REALTIME : | ||
| 1218 | CLOCK_MONOTONIC, | ||
| 1219 | HRTIMER_MODE_ABS); | ||
| 1220 | hrtimer_init_sleeper(&t, current); | ||
| 1221 | hrtimer_set_expires_range_ns(&t.timer, *abs_time, | ||
| 1222 | current->timer_slack_ns); | ||
| 1223 | |||
| 1224 | hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); | ||
| 1225 | if (!hrtimer_active(&t.timer)) | ||
| 1226 | t.task = NULL; | ||
| 1227 | 1730 | ||
| 1228 | /* | 1731 | if (!bitset) |
| 1229 | * the timer could have already expired, in which | 1732 | return -EINVAL; |
| 1230 | * case current would be flagged for rescheduling. | ||
| 1231 | * Don't bother calling schedule. | ||
| 1232 | */ | ||
| 1233 | if (likely(t.task)) | ||
| 1234 | schedule(); | ||
| 1235 | 1733 | ||
| 1236 | hrtimer_cancel(&t.timer); | 1734 | q.pi_state = NULL; |
| 1735 | q.bitset = bitset; | ||
| 1736 | q.rt_waiter = NULL; | ||
| 1237 | 1737 | ||
| 1238 | /* Flag if a timeout occured */ | 1738 | if (abs_time) { |
| 1239 | rem = (t.task == NULL); | 1739 | to = &timeout; |
| 1240 | 1740 | ||
| 1241 | destroy_hrtimer_on_stack(&t.timer); | 1741 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : |
| 1242 | } | 1742 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
| 1743 | hrtimer_init_sleeper(to, current); | ||
| 1744 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | ||
| 1745 | current->timer_slack_ns); | ||
| 1243 | } | 1746 | } |
| 1244 | __set_current_state(TASK_RUNNING); | ||
| 1245 | 1747 | ||
| 1246 | /* | 1748 | /* Prepare to wait on uaddr. */ |
| 1247 | * NOTE: we don't remove ourselves from the waitqueue because | 1749 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); |
| 1248 | * we are the only user of it. | 1750 | if (ret) |
| 1249 | */ | 1751 | goto out; |
| 1752 | |||
| 1753 | /* queue_me and wait for wakeup, timeout, or a signal. */ | ||
| 1754 | futex_wait_queue_me(hb, &q, to); | ||
| 1250 | 1755 | ||
| 1251 | /* If we were woken (and unqueued), we succeeded, whatever. */ | 1756 | /* If we were woken (and unqueued), we succeeded, whatever. */ |
| 1252 | ret = 0; | 1757 | ret = 0; |
| 1253 | if (!unqueue_me(&q)) | 1758 | if (!unqueue_me(&q)) |
| 1254 | goto out_put_key; | 1759 | goto out_put_key; |
| 1255 | ret = -ETIMEDOUT; | 1760 | ret = -ETIMEDOUT; |
| 1256 | if (rem) | 1761 | if (to && !to->task) |
| 1257 | goto out_put_key; | 1762 | goto out_put_key; |
| 1258 | 1763 | ||
| 1259 | /* | 1764 | /* |
| @@ -1270,7 +1775,7 @@ retry_private: | |||
| 1270 | restart->futex.val = val; | 1775 | restart->futex.val = val; |
| 1271 | restart->futex.time = abs_time->tv64; | 1776 | restart->futex.time = abs_time->tv64; |
| 1272 | restart->futex.bitset = bitset; | 1777 | restart->futex.bitset = bitset; |
| 1273 | restart->futex.flags = 0; | 1778 | restart->futex.flags = FLAGS_HAS_TIMEOUT; |
| 1274 | 1779 | ||
| 1275 | if (fshared) | 1780 | if (fshared) |
| 1276 | restart->futex.flags |= FLAGS_SHARED; | 1781 | restart->futex.flags |= FLAGS_SHARED; |
| @@ -1282,6 +1787,10 @@ retry_private: | |||
| 1282 | out_put_key: | 1787 | out_put_key: |
| 1283 | put_futex_key(fshared, &q.key); | 1788 | put_futex_key(fshared, &q.key); |
| 1284 | out: | 1789 | out: |
| 1790 | if (to) { | ||
| 1791 | hrtimer_cancel(&to->timer); | ||
| 1792 | destroy_hrtimer_on_stack(&to->timer); | ||
| 1793 | } | ||
| 1285 | return ret; | 1794 | return ret; |
| 1286 | } | 1795 | } |
| 1287 | 1796 | ||
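The bitset and clockrt parameters of futex_wait() surface in userspace as FUTEX_WAIT_BITSET, which takes an absolute timeout and can be combined with FUTEX_CLOCK_REALTIME; the new FLAGS_HAS_TIMEOUT flag ensures a restarted wait keeps (or keeps lacking) its timeout rather than inheriting a stale one. A hedged sketch (assumes headers that define these flags):

    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    /* Wait until *futex != val or an absolute CLOCK_REALTIME deadline
     * passes; FUTEX_BITSET_MATCH_ANY makes this equivalent to a plain
     * wait apart from the absolute-deadline semantics. */
    static long wait_deadline(uint32_t *futex, uint32_t val,
                              const struct timespec *abs_deadline)
    {
        return syscall(SYS_futex, futex,
                       FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME,
                       val, abs_deadline, NULL, FUTEX_BITSET_MATCH_ANY);
    }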
| @@ -1290,13 +1799,16 @@ static long futex_wait_restart(struct restart_block *restart) | |||
| 1290 | { | 1799 | { |
| 1291 | u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; | 1800 | u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; |
| 1292 | int fshared = 0; | 1801 | int fshared = 0; |
| 1293 | ktime_t t; | 1802 | ktime_t t, *tp = NULL; |
| 1294 | 1803 | ||
| 1295 | t.tv64 = restart->futex.time; | 1804 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { |
| 1805 | t.tv64 = restart->futex.time; | ||
| 1806 | tp = &t; | ||
| 1807 | } | ||
| 1296 | restart->fn = do_no_restart_syscall; | 1808 | restart->fn = do_no_restart_syscall; |
| 1297 | if (restart->futex.flags & FLAGS_SHARED) | 1809 | if (restart->futex.flags & FLAGS_SHARED) |
| 1298 | fshared = 1; | 1810 | fshared = 1; |
| 1299 | return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, | 1811 | return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, |
| 1300 | restart->futex.bitset, | 1812 | restart->futex.bitset, |
| 1301 | restart->futex.flags & FLAGS_CLOCKRT); | 1813 | restart->futex.flags & FLAGS_CLOCKRT); |
| 1302 | } | 1814 | } |
| @@ -1312,11 +1824,9 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
| 1312 | int detect, ktime_t *time, int trylock) | 1824 | int detect, ktime_t *time, int trylock) |
| 1313 | { | 1825 | { |
| 1314 | struct hrtimer_sleeper timeout, *to = NULL; | 1826 | struct hrtimer_sleeper timeout, *to = NULL; |
| 1315 | struct task_struct *curr = current; | ||
| 1316 | struct futex_hash_bucket *hb; | 1827 | struct futex_hash_bucket *hb; |
| 1317 | u32 uval, newval, curval; | ||
| 1318 | struct futex_q q; | 1828 | struct futex_q q; |
| 1319 | int ret, lock_taken, ownerdied = 0; | 1829 | int res, ret; |
| 1320 | 1830 | ||
| 1321 | if (refill_pi_state_cache()) | 1831 | if (refill_pi_state_cache()) |
| 1322 | return -ENOMEM; | 1832 | return -ENOMEM; |
| @@ -1330,6 +1840,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
| 1330 | } | 1840 | } |
| 1331 | 1841 | ||
| 1332 | q.pi_state = NULL; | 1842 | q.pi_state = NULL; |
| 1843 | q.rt_waiter = NULL; | ||
| 1333 | retry: | 1844 | retry: |
| 1334 | q.key = FUTEX_KEY_INIT; | 1845 | q.key = FUTEX_KEY_INIT; |
| 1335 | ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); | 1846 | ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); |
| @@ -1339,81 +1850,15 @@ retry: | |||
| 1339 | retry_private: | 1850 | retry_private: |
| 1340 | hb = queue_lock(&q); | 1851 | hb = queue_lock(&q); |
| 1341 | 1852 | ||
| 1342 | retry_locked: | 1853 | ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); |
| 1343 | ret = lock_taken = 0; | ||
| 1344 | |||
| 1345 | /* | ||
| 1346 | * To avoid races, we attempt to take the lock here again | ||
| 1347 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | ||
| 1348 | * the locks. It will most likely not succeed. | ||
| 1349 | */ | ||
| 1350 | newval = task_pid_vnr(current); | ||
| 1351 | |||
| 1352 | curval = cmpxchg_futex_value_locked(uaddr, 0, newval); | ||
| 1353 | |||
| 1354 | if (unlikely(curval == -EFAULT)) | ||
| 1355 | goto uaddr_faulted; | ||
| 1356 | |||
| 1357 | /* | ||
| 1358 | * Detect deadlocks. In case of REQUEUE_PI this is a valid | ||
| 1359 | * situation and we return success to user space. | ||
| 1360 | */ | ||
| 1361 | if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { | ||
| 1362 | ret = -EDEADLK; | ||
| 1363 | goto out_unlock_put_key; | ||
| 1364 | } | ||
| 1365 | |||
| 1366 | /* | ||
| 1367 | * Surprise - we got the lock. Just return to userspace: | ||
| 1368 | */ | ||
| 1369 | if (unlikely(!curval)) | ||
| 1370 | goto out_unlock_put_key; | ||
| 1371 | |||
| 1372 | uval = curval; | ||
| 1373 | |||
| 1374 | /* | ||
| 1375 | * Set the WAITERS flag, so the owner will know it has someone | ||
| 1376 | * to wake at next unlock | ||
| 1377 | */ | ||
| 1378 | newval = curval | FUTEX_WAITERS; | ||
| 1379 | |||
| 1380 | /* | ||
| 1381 | * There are two cases, where a futex might have no owner (the | ||
| 1382 | * owner TID is 0): OWNER_DIED. We take over the futex in this | ||
| 1383 | * case. We also do an unconditional take over, when the owner | ||
| 1384 | * of the futex died. | ||
| 1385 | * | ||
| 1386 | * This is safe as we are protected by the hash bucket lock ! | ||
| 1387 | */ | ||
| 1388 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | ||
| 1389 | /* Keep the OWNER_DIED bit */ | ||
| 1390 | newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); | ||
| 1391 | ownerdied = 0; | ||
| 1392 | lock_taken = 1; | ||
| 1393 | } | ||
| 1394 | |||
| 1395 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | ||
| 1396 | |||
| 1397 | if (unlikely(curval == -EFAULT)) | ||
| 1398 | goto uaddr_faulted; | ||
| 1399 | if (unlikely(curval != uval)) | ||
| 1400 | goto retry_locked; | ||
| 1401 | |||
| 1402 | /* | ||
| 1403 | * We took the lock due to owner died take over. | ||
| 1404 | */ | ||
| 1405 | if (unlikely(lock_taken)) | ||
| 1406 | goto out_unlock_put_key; | ||
| 1407 | |||
| 1408 | /* | ||
| 1409 | * We dont have the lock. Look up the PI state (or create it if | ||
| 1410 | * we are the first waiter): | ||
| 1411 | */ | ||
| 1412 | ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); | ||
| 1413 | |||
| 1414 | if (unlikely(ret)) { | 1854 | if (unlikely(ret)) { |
| 1415 | switch (ret) { | 1855 | switch (ret) { |
| 1416 | 1856 | case 1: | |
| 1857 | /* We got the lock. */ | ||
| 1858 | ret = 0; | ||
| 1859 | goto out_unlock_put_key; | ||
| 1860 | case -EFAULT: | ||
| 1861 | goto uaddr_faulted; | ||
| 1417 | case -EAGAIN: | 1862 | case -EAGAIN: |
| 1418 | /* | 1863 | /* |
| 1419 | * Task is exiting and we just wait for the | 1864 | * Task is exiting and we just wait for the |
| @@ -1423,25 +1868,6 @@ retry_locked: | |||
| 1423 | put_futex_key(fshared, &q.key); | 1868 | put_futex_key(fshared, &q.key); |
| 1424 | cond_resched(); | 1869 | cond_resched(); |
| 1425 | goto retry; | 1870 | goto retry; |
| 1426 | |||
| 1427 | case -ESRCH: | ||
| 1428 | /* | ||
| 1429 | * No owner found for this futex. Check if the | ||
| 1430 | * OWNER_DIED bit is set to figure out whether | ||
| 1431 | * this is a robust futex or not. | ||
| 1432 | */ | ||
| 1433 | if (get_futex_value_locked(&curval, uaddr)) | ||
| 1434 | goto uaddr_faulted; | ||
| 1435 | |||
| 1436 | /* | ||
| 1437 | * We simply start over in case of a robust | ||
| 1438 | * futex. The code above will take the futex | ||
| 1439 | * and return happy. | ||
| 1440 | */ | ||
| 1441 | if (curval & FUTEX_OWNER_DIED) { | ||
| 1442 | ownerdied = 1; | ||
| 1443 | goto retry_locked; | ||
| 1444 | } | ||
| 1445 | default: | 1871 | default: |
| 1446 | goto out_unlock_put_key; | 1872 | goto out_unlock_put_key; |
| 1447 | } | 1873 | } |
| @@ -1465,71 +1891,21 @@ retry_locked: | |||
| 1465 | } | 1891 | } |
| 1466 | 1892 | ||
| 1467 | spin_lock(q.lock_ptr); | 1893 | spin_lock(q.lock_ptr); |
| 1468 | 1894 | /* | |
| 1469 | if (!ret) { | 1895 | * Fixup the pi_state owner and possibly acquire the lock if we |
| 1470 | /* | 1896 | * haven't already. |
| 1471 | * Got the lock. We might not be the anticipated owner | 1897 | */ |
| 1472 | * if we did a lock-steal - fix up the PI-state in | 1898 | res = fixup_owner(uaddr, fshared, &q, !ret); |
| 1473 | * that case: | 1899 | /* |
| 1474 | */ | 1900 | * If fixup_owner() returned an error, proprogate that. If it acquired |
| 1475 | if (q.pi_state->owner != curr) | 1901 | * the lock, clear our -ETIMEDOUT or -EINTR. |
| 1476 | ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); | 1902 | */ |
| 1477 | } else { | 1901 | * If fixup_owner() returned an error, propagate that. If it acquired |
| 1478 | /* | 1904 | ret = (res < 0) ? res : 0; |
| 1479 | * Catch the rare case, where the lock was released | ||
| 1480 | * when we were on the way back before we locked the | ||
| 1481 | * hash bucket. | ||
| 1482 | */ | ||
| 1483 | if (q.pi_state->owner == curr) { | ||
| 1484 | /* | ||
| 1485 | * Try to get the rt_mutex now. This might | ||
| 1486 | * fail as some other task acquired the | ||
| 1487 | * rt_mutex after we removed ourself from the | ||
| 1488 | * rt_mutex waiters list. | ||
| 1489 | */ | ||
| 1490 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | ||
| 1491 | ret = 0; | ||
| 1492 | else { | ||
| 1493 | /* | ||
| 1494 | * pi_state is incorrect, some other | ||
| 1495 | * task did a lock steal and we | ||
| 1496 | * returned due to timeout or signal | ||
| 1497 | * without taking the rt_mutex. Too | ||
| 1498 | * late. We can access the | ||
| 1499 | * rt_mutex_owner without locking, as | ||
| 1500 | * the other task is now blocked on | ||
| 1501 | * the hash bucket lock. Fix the state | ||
| 1502 | * up. | ||
| 1503 | */ | ||
| 1504 | struct task_struct *owner; | ||
| 1505 | int res; | ||
| 1506 | |||
| 1507 | owner = rt_mutex_owner(&q.pi_state->pi_mutex); | ||
| 1508 | res = fixup_pi_state_owner(uaddr, &q, owner, | ||
| 1509 | fshared); | ||
| 1510 | |||
| 1511 | /* propagate -EFAULT, if the fixup failed */ | ||
| 1512 | if (res) | ||
| 1513 | ret = res; | ||
| 1514 | } | ||
| 1515 | } else { | ||
| 1516 | /* | ||
| 1517 | * Paranoia check. If we did not take the lock | ||
| 1518 | * in the trylock above, then we should not be | ||
| 1519 | * the owner of the rtmutex, neither the real | ||
| 1520 | * nor the pending one: | ||
| 1521 | */ | ||
| 1522 | if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) | ||
| 1523 | printk(KERN_ERR "futex_lock_pi: ret = %d " | ||
| 1524 | "pi-mutex: %p pi-state %p\n", ret, | ||
| 1525 | q.pi_state->pi_mutex.owner, | ||
| 1526 | q.pi_state->owner); | ||
| 1527 | } | ||
| 1528 | } | ||
| 1529 | 1905 | ||
| 1530 | /* | 1906 | /* |
| 1531 | * If fixup_pi_state_owner() faulted and was unable to handle the | 1907 | * If fixup_owner() faulted and was unable to handle the fault, unlock |
| 1532 | * fault, unlock it and return the fault to userspace. | 1908 | * it and return the fault to userspace. |
| 1533 | */ | 1909 | */ |
| 1534 | if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) | 1910 | if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) |
| 1535 | rt_mutex_unlock(&q.pi_state->pi_mutex); | 1911 | rt_mutex_unlock(&q.pi_state->pi_mutex); |
| @@ -1537,9 +1913,7 @@ retry_locked: | |||
| 1537 | /* Unqueue and drop the lock */ | 1913 | /* Unqueue and drop the lock */ |
| 1538 | unqueue_me_pi(&q); | 1914 | unqueue_me_pi(&q); |
| 1539 | 1915 | ||
| 1540 | if (to) | 1916 | goto out; |
| 1541 | destroy_hrtimer_on_stack(&to->timer); | ||
| 1542 | return ret != -EINTR ? ret : -ERESTARTNOINTR; | ||
| 1543 | 1917 | ||
| 1544 | out_unlock_put_key: | 1918 | out_unlock_put_key: |
| 1545 | queue_unlock(&q, hb); | 1919 | queue_unlock(&q, hb); |
| @@ -1549,19 +1923,12 @@ out_put_key: | |||
| 1549 | out: | 1923 | out: |
| 1550 | if (to) | 1924 | if (to) |
| 1551 | destroy_hrtimer_on_stack(&to->timer); | 1925 | destroy_hrtimer_on_stack(&to->timer); |
| 1552 | return ret; | 1926 | return ret != -EINTR ? ret : -ERESTARTNOINTR; |
| 1553 | 1927 | ||
| 1554 | uaddr_faulted: | 1928 | uaddr_faulted: |
| 1555 | /* | ||
| 1556 | * We have to r/w *(int __user *)uaddr, and we have to modify it | ||
| 1557 | * atomically. Therefore, if we continue to fault after get_user() | ||
| 1558 | * below, we need to handle the fault ourselves, while still holding | ||
| 1559 | * the mmap_sem. This can occur if the uaddr is under contention as | ||
| 1560 | * we have to drop the mmap_sem in order to call get_user(). | ||
| 1561 | */ | ||
| 1562 | queue_unlock(&q, hb); | 1929 | queue_unlock(&q, hb); |
| 1563 | 1930 | ||
| 1564 | ret = get_user(uval, uaddr); | 1931 | ret = fault_in_user_writeable(uaddr); |
| 1565 | if (ret) | 1932 | if (ret) |
| 1566 | goto out_put_key; | 1933 | goto out_put_key; |
| 1567 | 1934 | ||
| @@ -1572,7 +1939,6 @@ uaddr_faulted: | |||
| 1572 | goto retry; | 1939 | goto retry; |
| 1573 | } | 1940 | } |
| 1574 | 1941 | ||
| 1575 | |||
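The ownerdied takeover that futex_lock_pi() now delegates to futex_lock_pi_atomic() stays visible to userspace: the kernel deliberately keeps the FUTEX_OWNER_DIED bit across the takeover ("Keep the OWNER_DIED bit" above), and the new owner is expected to check it and recover the protected state (pthread mutexes report this as EOWNERDEAD). A hedged sketch of that check (illustrative helper, no error handling):

    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Returns 1 if the lock was inherited from a dead owner. */
    static int pi_lock_check_dead(uint32_t *futex)
    {
        syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
        return (__atomic_load_n(futex, __ATOMIC_RELAXED) &
                FUTEX_OWNER_DIED) != 0;
    }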
| 1576 | /* | 1942 | /* |
| 1577 | * Userspace attempted a TID -> 0 atomic transition, and failed. | 1943 | * Userspace attempted a TID -> 0 atomic transition, and failed. |
| 1578 | * This is the in-kernel slowpath: we look up the PI state (if any), | 1944 | * This is the in-kernel slowpath: we look up the PI state (if any), |
| @@ -1657,23 +2023,239 @@ out: | |||
| 1657 | return ret; | 2023 | return ret; |
| 1658 | 2024 | ||
| 1659 | pi_faulted: | 2025 | pi_faulted: |
| 1660 | /* | ||
| 1661 | * We have to r/w *(int __user *)uaddr, and we have to modify it | ||
| 1662 | * atomically. Therefore, if we continue to fault after get_user() | ||
| 1663 | * below, we need to handle the fault ourselves, while still holding | ||
| 1664 | * the mmap_sem. This can occur if the uaddr is under contention as | ||
| 1665 | * we have to drop the mmap_sem in order to call get_user(). | ||
| 1666 | */ | ||
| 1667 | spin_unlock(&hb->lock); | 2026 | spin_unlock(&hb->lock); |
| 1668 | put_futex_key(fshared, &key); | 2027 | put_futex_key(fshared, &key); |
| 1669 | 2028 | ||
| 1670 | ret = get_user(uval, uaddr); | 2029 | ret = fault_in_user_writeable(uaddr); |
| 1671 | if (!ret) | 2030 | if (!ret) |
| 1672 | goto retry; | 2031 | goto retry; |
| 1673 | 2032 | ||
| 1674 | return ret; | 2033 | return ret; |
| 1675 | } | 2034 | } |
| 1676 | 2035 | ||
| 2036 | /** | ||
| 2037 | * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex | ||
| 2038 | * @hb: the hash_bucket the futex_q was originally enqueued on | ||
| 2039 | * @q: the futex_q woken while waiting to be requeued | ||
| 2040 | * @key2: the futex_key of the requeue target futex | ||
| 2041 | * @timeout: the timeout associated with the wait (NULL if none) | ||
| 2042 | * | ||
| 2043 | * Detect if the task was woken on the initial futex as opposed to the requeue | ||
| 2044 | * target futex. If so, determine if it was a timeout or a signal that caused | ||
| 2045 | * the wakeup and return the appropriate error code to the caller. Must be | ||
| 2046 | * called with the hb lock held. | ||
| 2047 | * | ||
| 2048 | * Returns: | ||
| 2049 | * 0 - no early wakeup detected | ||
| 2050 | * <0 - -ETIMEDOUT or -ERESTARTNOINTR | ||
| 2051 | */ | ||
| 2052 | static inline | ||
| 2053 | int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | ||
| 2054 | struct futex_q *q, union futex_key *key2, | ||
| 2055 | struct hrtimer_sleeper *timeout) | ||
| 2056 | { | ||
| 2057 | int ret = 0; | ||
| 2058 | |||
| 2059 | /* | ||
| 2060 | * With the hb lock held, we avoid races while we process the wakeup. | ||
| 2061 | * We only need to hold hb (and not hb2) to ensure atomicity as the | ||
| 2062 | * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. | ||
| 2063 | * It can't be requeued from uaddr2 to something else since we don't | ||
| 2064 | * support a PI aware source futex for requeue. | ||
| 2065 | */ | ||
| 2066 | if (!match_futex(&q->key, key2)) { | ||
| 2067 | WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); | ||
| 2068 | /* | ||
| 2069 | * We were woken prior to requeue by a timeout or a signal. | ||
| 2070 | * Unqueue the futex_q and determine which it was. | ||
| 2071 | */ | ||
| 2072 | plist_del(&q->list, &q->list.plist); | ||
| 2073 | drop_futex_key_refs(&q->key); | ||
| 2074 | |||
| 2075 | if (timeout && !timeout->task) | ||
| 2076 | ret = -ETIMEDOUT; | ||
| 2077 | else | ||
| 2078 | ret = -ERESTARTNOINTR; | ||
| 2079 | } | ||
| 2080 | return ret; | ||
| 2081 | } | ||
| 2082 | |||
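The two operations this patch introduces pair up as follows: a waiter names both futex words up front with FUTEX_WAIT_REQUEUE_PI, and the signaler moves it with FUTEX_CMP_REQUEUE_PI using nr_wake == 1, per the check added in futex_requeue() above. A hedged sketch of the raw pairing (assumes headers defining the two new op codes; glibc's PI condvars are the intended consumer):

    #include <limits.h>
    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Waiter: sleep on cond, get requeued onto mutex, and return
     * owning the PI mutex (or an error). */
    static long cond_wait_pi(uint32_t *cond, uint32_t seq,
                             uint32_t *mutex)
    {
        return syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI, seq,
                       NULL /* no timeout */, mutex, 0);
    }

    /* Signaler: wake exactly one waiter (nr_wake must be 1), requeue
     * the rest onto the PI mutex word. */
    static long cond_broadcast_pi(uint32_t *cond, uint32_t seq,
                                  uint32_t *mutex)
    {
        return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI, 1,
                       INT_MAX /* nr_requeue */, mutex, seq);
    }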
| 2083 | /** | ||
| 2084 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 | ||
| 2085 | * @uaddr: the futex we initially wait on (non-pi) | ||
| 2086 | * @fshared: whether the futexes are shared (1) or not (0). They must be | ||
| 2087 | * the same type, no requeueing from private to shared, etc. | ||
| 2088 | * @val: the expected value of uaddr | ||
| 2089 | * @abs_time: absolute timeout | ||
| 2090 | * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. | ||
| 2091 | * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) | ||
| 2092 | * @uaddr2: the pi futex we will take prior to returning to user-space | ||
| 2093 | * | ||
| 2094 | * The caller will wait on uaddr and will be requeued by futex_requeue() to | ||
| 2095 | * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and | ||
| 2096 | * complete the acquisition of the rt_mutex prior to returning to userspace. | ||
| 2097 | * This ensures the rt_mutex maintains an owner when it has waiters; without | ||
| 2098 | * one, the pi logic wouldn't know which task to boost/deboost, if there was a | ||
| 2099 | * need to. | ||
| 2100 | * | ||
| 2101 | * We call schedule in futex_wait_queue_me() when we enqueue and return there | ||
| 2102 | * via the following: | ||
| 2103 | * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() | ||
| 2104 | * 2) wakeup on uaddr2 after a requeue and subsequent unlock | ||
| 2105 | * 3) signal (before or after requeue) | ||
| 2106 | * 4) timeout (before or after requeue) | ||
| 2107 | * | ||
| 2108 | * If 3, we set up a restart_block with futex_wait_requeue_pi() as the function. | ||
| 2109 | * | ||
| 2110 | * If 2, we may then block on trying to take the rt_mutex and return via: | ||
| 2111 | * 5) successful lock | ||
| 2112 | * 6) signal | ||
| 2113 | * 7) timeout | ||
| 2114 | * 8) other lock acquisition failure | ||
| 2115 | * | ||
| 2116 | * If 6, we set up a restart_block with futex_lock_pi() as the function. | ||
| 2117 | * | ||
| 2118 | * If 4 or 7, we clean up and return with -ETIMEDOUT. | ||
| 2119 | * | ||
| 2120 | * Returns: | ||
| 2121 | * 0 - On success | ||
| 2122 | * <0 - On error | ||
| 2123 | */ | ||
| 2124 | static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | ||
| 2125 | u32 val, ktime_t *abs_time, u32 bitset, | ||
| 2126 | int clockrt, u32 __user *uaddr2) | ||
| 2127 | { | ||
| 2128 | struct hrtimer_sleeper timeout, *to = NULL; | ||
| 2129 | struct rt_mutex_waiter rt_waiter; | ||
| 2130 | struct rt_mutex *pi_mutex = NULL; | ||
| 2131 | struct futex_hash_bucket *hb; | ||
| 2132 | union futex_key key2; | ||
| 2133 | struct futex_q q; | ||
| 2134 | int res, ret; | ||
| 2135 | |||
| 2136 | if (!bitset) | ||
| 2137 | return -EINVAL; | ||
| 2138 | |||
| 2139 | if (abs_time) { | ||
| 2140 | to = &timeout; | ||
| 2141 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : | ||
| 2142 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
| 2143 | hrtimer_init_sleeper(to, current); | ||
| 2144 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | ||
| 2145 | current->timer_slack_ns); | ||
| 2146 | } | ||
| 2147 | |||
| 2148 | /* | ||
| 2149 | * The waiter is allocated on our stack, manipulated by the requeue | ||
| 2150 | * code while we sleep on uaddr. | ||
| 2151 | */ | ||
| 2152 | debug_rt_mutex_init_waiter(&rt_waiter); | ||
| 2153 | rt_waiter.task = NULL; | ||
| 2154 | |||
| 2155 | q.pi_state = NULL; | ||
| 2156 | q.bitset = bitset; | ||
| 2157 | q.rt_waiter = &rt_waiter; | ||
| 2158 | |||
| 2159 | key2 = FUTEX_KEY_INIT; | ||
| 2160 | ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); | ||
| 2161 | if (unlikely(ret != 0)) | ||
| 2162 | goto out; | ||
| 2163 | |||
| 2164 | /* Prepare to wait on uaddr. */ | ||
| 2165 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | ||
| 2166 | if (ret) | ||
| 2167 | goto out_key2; | ||
| 2168 | |||
| 2169 | /* Queue the futex_q, drop the hb lock, wait for wakeup. */ | ||
| 2170 | futex_wait_queue_me(hb, &q, to); | ||
| 2171 | |||
| 2172 | spin_lock(&hb->lock); | ||
| 2173 | ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); | ||
| 2174 | spin_unlock(&hb->lock); | ||
| 2175 | if (ret) | ||
| 2176 | goto out_put_keys; | ||
| 2177 | |||
| 2178 | /* | ||
| 2179 | * In order for us to be here, we know our q.key == key2, and since | ||
| 2180 | * we took the hb->lock above, we also know that futex_requeue() has | ||
| 2181 | * completed and we no longer have to concern ourselves with a wakeup | ||
| 2182 | * race with the atomic proxy lock acquisition by the requeue code. | ||
| 2183 | */ | ||
| 2184 | |||
| 2185 | /* Check if the requeue code acquired the second futex for us. */ | ||
| 2186 | if (!q.rt_waiter) { | ||
| 2187 | /* | ||
| 2188 | * Got the lock. We might not be the anticipated owner if we | ||
| 2189 | * did a lock-steal - fix up the PI-state in that case. | ||
| 2190 | */ | ||
| 2191 | if (q.pi_state && (q.pi_state->owner != current)) { | ||
| 2192 | spin_lock(q.lock_ptr); | ||
| 2193 | ret = fixup_pi_state_owner(uaddr2, &q, current, | ||
| 2194 | fshared); | ||
| 2195 | spin_unlock(q.lock_ptr); | ||
| 2196 | } | ||
| 2197 | } else { | ||
| 2198 | /* | ||
| 2199 | * We have been woken up by futex_unlock_pi(), a timeout, or a | ||
| 2200 | * signal. futex_unlock_pi() will not destroy the lock_ptr nor | ||
| 2201 | * the pi_state. | ||
| 2202 | */ | ||
| 2203 | WARN_ON(!q.pi_state); | ||
| 2204 | pi_mutex = &q.pi_state->pi_mutex; | ||
| 2205 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); | ||
| 2206 | debug_rt_mutex_free_waiter(&rt_waiter); | ||
| 2207 | |||
| 2208 | spin_lock(q.lock_ptr); | ||
| 2209 | /* | ||
| 2210 | * Fixup the pi_state owner and possibly acquire the lock if we | ||
| 2211 | * haven't already. | ||
| 2212 | */ | ||
| 2213 | res = fixup_owner(uaddr2, fshared, &q, !ret); | ||
| 2214 | /* | ||
| 2215 | * If fixup_owner() returned an error, propagate that. If it | ||
| 2216 | * acquired the lock, clear our -ETIMEDOUT or -EINTR. | ||
| 2217 | */ | ||
| 2218 | if (res) | ||
| 2219 | ret = (res < 0) ? res : 0; | ||
| 2220 | |||
| 2221 | /* Unqueue and drop the lock. */ | ||
| 2222 | unqueue_me_pi(&q); | ||
| 2223 | } | ||
| 2224 | |||
| 2225 | /* | ||
| 2226 | * If fixup_pi_state_owner() faulted and was unable to handle the | ||
| 2227 | * fault, unlock the rt_mutex and return the fault to userspace. | ||
| 2228 | */ | ||
| 2229 | if (ret == -EFAULT) { | ||
| 2230 | if (pi_mutex && rt_mutex_owner(pi_mutex) == current) | ||
| 2231 | rt_mutex_unlock(pi_mutex); | ||
| 2232 | } else if (ret == -EINTR) { | ||
| 2233 | /* | ||
| 2234 | * We've already been requeued, but we have no way to | ||
| 2235 | * restart by calling futex_lock_pi() directly. We | ||
| 2236 | * could restart the syscall, but that will look at | ||
| 2237 | * the user space value and return right away. So we | ||
| 2238 | * drop back with EWOULDBLOCK to tell user space that | ||
| 2239 | * "val" has been changed. That's the same what the | ||
| 2240 | * restart of the syscall would do in | ||
| 2241 | * futex_wait_setup(). | ||
| 2242 | */ | ||
| 2243 | ret = -EWOULDBLOCK; | ||
| 2244 | } | ||
| 2245 | |||
| 2246 | out_put_keys: | ||
| 2247 | put_futex_key(fshared, &q.key); | ||
| 2248 | out_key2: | ||
| 2249 | put_futex_key(fshared, &key2); | ||
| 2250 | |||
| 2251 | out: | ||
| 2252 | if (to) { | ||
| 2253 | hrtimer_cancel(&to->timer); | ||
| 2254 | destroy_hrtimer_on_stack(&to->timer); | ||
| 2255 | } | ||
| 2256 | return ret; | ||
| 2257 | } | ||
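For orientation, the user-space pairing this function enables resembles pthread_cond_wait() over a PI mutex: the waiter blocks on the condition futex, a signaler requeues it to the PI futex, and the kernel acquires the rt_mutex on the waiter's behalf before the wait returns. The sketch below is illustrative only: the futex() wrapper and the cond_*_pi() helpers are hypothetical, it assumes FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI are exposed by <linux/futex.h>, and error handling is elided:

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>

    static long futex(uint32_t *uaddr, int op, uint32_t val, uintptr_t val2,
    		  uint32_t *uaddr2, uint32_t val3)
    {
    	return syscall(SYS_futex, uaddr, op, val, val2, uaddr2, val3);
    }

    /* Waiter: block on *cond; on a normal wakeup the kernel has already
     * requeued us to *mutex and taken the rt_mutex for us. */
    static long cond_wait_pi(uint32_t *cond, uint32_t *mutex, uint32_t seq)
    {
    	return futex(cond, FUTEX_WAIT_REQUEUE_PI, seq, 0 /* no timeout */,
    		     mutex, 0 /* bitset forced to MATCH_ANY in kernel */);
    }

    /* Signaler: wake one waiter by acquiring the PI futex on its behalf;
     * nr_requeue rides in the timeout slot, val3 is the expected *cond. */
    static long cond_signal_pi(uint32_t *cond, uint32_t *mutex, uint32_t seq)
    {
    	return futex(cond, FUTEX_CMP_REQUEUE_PI, 1 /* nr_wake */,
    		     0 /* nr_requeue */, mutex, seq);
    }

A broadcast-style signaler would instead pass a large nr_requeue: the remaining waiters are moved to the PI futex without being woken, then block on the rt_mutex and return from cond_wait_pi() one at a time as the lock is handed on.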
| 2258 | |||
| 1677 | /* | 2259 | /* |
| 1678 | * Support for robust futexes: the kernel cleans up held futexes at | 2260 | * Support for robust futexes: the kernel cleans up held futexes at |
| 1679 | * thread exit time. | 2261 | * thread exit time. |
| @@ -1896,7 +2478,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
| 1896 | fshared = 1; | 2478 | fshared = 1; |
| 1897 | 2479 | ||
| 1898 | clockrt = op & FUTEX_CLOCK_REALTIME; | 2480 | clockrt = op & FUTEX_CLOCK_REALTIME; |
| 1899 | if (clockrt && cmd != FUTEX_WAIT_BITSET) | 2481 | if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) |
| 1900 | return -ENOSYS; | 2482 | return -ENOSYS; |
| 1901 | 2483 | ||
| 1902 | switch (cmd) { | 2484 | switch (cmd) { |
| @@ -1911,10 +2493,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
| 1911 | ret = futex_wake(uaddr, fshared, val, val3); | 2493 | ret = futex_wake(uaddr, fshared, val, val3); |
| 1912 | break; | 2494 | break; |
| 1913 | case FUTEX_REQUEUE: | 2495 | case FUTEX_REQUEUE: |
| 1914 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); | 2496 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); |
| 1915 | break; | 2497 | break; |
| 1916 | case FUTEX_CMP_REQUEUE: | 2498 | case FUTEX_CMP_REQUEUE: |
| 1917 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); | 2499 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, |
| 2500 | 0); | ||
| 1918 | break; | 2501 | break; |
| 1919 | case FUTEX_WAKE_OP: | 2502 | case FUTEX_WAKE_OP: |
| 1920 | ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); | 2503 | ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); |
| @@ -1931,6 +2514,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
| 1931 | if (futex_cmpxchg_enabled) | 2514 | if (futex_cmpxchg_enabled) |
| 1932 | ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); | 2515 | ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); |
| 1933 | break; | 2516 | break; |
| 2517 | case FUTEX_WAIT_REQUEUE_PI: | ||
| 2518 | val3 = FUTEX_BITSET_MATCH_ANY; | ||
| 2519 | ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, | ||
| 2520 | clockrt, uaddr2); | ||
| 2521 | break; | ||
| 2522 | case FUTEX_CMP_REQUEUE_PI: | ||
| 2523 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, | ||
| 2524 | 1); | ||
| 2525 | break; | ||
| 1934 | default: | 2526 | default: |
| 1935 | ret = -ENOSYS; | 2527 | ret = -ENOSYS; |
| 1936 | } | 2528 | } |
| @@ -1948,7 +2540,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, | |||
| 1948 | int cmd = op & FUTEX_CMD_MASK; | 2540 | int cmd = op & FUTEX_CMD_MASK; |
| 1949 | 2541 | ||
| 1950 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || | 2542 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || |
| 1951 | cmd == FUTEX_WAIT_BITSET)) { | 2543 | cmd == FUTEX_WAIT_BITSET || |
| 2544 | cmd == FUTEX_WAIT_REQUEUE_PI)) { | ||
| 1952 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) | 2545 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) |
| 1953 | return -EFAULT; | 2546 | return -EFAULT; |
| 1954 | if (!timespec_valid(&ts)) | 2547 | if (!timespec_valid(&ts)) |
| @@ -1960,11 +2553,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, | |||
| 1960 | tp = &t; | 2553 | tp = &t; |
| 1961 | } | 2554 | } |
| 1962 | /* | 2555 | /* |
| 1963 | * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. | 2556 | * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. |
| 1964 | * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. | 2557 | * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. |
| 1965 | */ | 2558 | */ |
| 1966 | if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || | 2559 | if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || |
| 1967 | cmd == FUTEX_WAKE_OP) | 2560 | cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) |
| 1968 | val2 = (u32) (unsigned long) utime; | 2561 | val2 = (u32) (unsigned long) utime; |
| 1969 | 2562 | ||
| 1970 | return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); | 2563 | return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); |
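One subtlety worth making explicit: as the val2 assignment above shows, for the requeue-family commands the fourth syscall argument is not a struct timespec pointer at all; the waiter count is cast through it. A hypothetical raw broadcast-style call for illustration (cond, mutex and expected are placeholders):

    /* Requeue all remaining waiters after waking one: nr_requeue is
     * smuggled through the utime argument, expected *cond in val3. */
    long ret = syscall(SYS_futex, &cond, FUTEX_CMP_REQUEUE_PI,
    		   1,                               /* nr_wake (1 for requeue_pi) */
    		   (struct timespec *)(long)INT_MAX, /* nr_requeue */
    		   &mutex, expected);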
