diff options
-rw-r--r-- | Documentation/futex-requeue-pi.txt | 131 | ||||
-rw-r--r-- | arch/sparc/include/asm/thread_info_64.h | 4 | ||||
-rw-r--r-- | include/linux/futex.h | 6 | ||||
-rw-r--r-- | include/linux/thread_info.h | 3 | ||||
-rw-r--r-- | kernel/futex.c | 1188 | ||||
-rw-r--r-- | kernel/rtmutex.c | 240 | ||||
-rw-r--r-- | kernel/rtmutex_common.h | 8 |
7 files changed, 1225 insertions, 355 deletions
diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt new file mode 100644 index 000000000000..9dc1ff4fd536 --- /dev/null +++ b/Documentation/futex-requeue-pi.txt | |||
@@ -0,0 +1,131 @@ | |||
1 | Futex Requeue PI | ||
2 | ---------------- | ||
3 | |||
4 | Requeueing of tasks from a non-PI futex to a PI futex requires | ||
5 | special handling in order to ensure the underlying rt_mutex is never | ||
6 | left without an owner if it has waiters; doing so would break the PI | ||
7 | boosting logic [see rt-mutex-design.txt]. For the purposes of | ||
8 | brevity, this action will be referred to as "requeue_pi" throughout | ||
9 | this document. Priority inheritance is abbreviated throughout as | ||
10 | "PI". | ||
11 | |||
12 | Motivation | ||
13 | ---------- | ||
14 | |||
15 | Without requeue_pi, the glibc implementation of | ||
16 | pthread_cond_broadcast() must resort to waking all the tasks waiting | ||
17 | on a pthread_condvar and letting them try to sort out which task | ||
18 | gets to run first in classic thundering-herd formation. An ideal | ||
19 | implementation would wake the highest-priority waiter, and leave the | ||
20 | rest to the natural wakeup inherent in unlocking the mutex | ||
21 | associated with the condvar. | ||
22 | |||
23 | Consider the simplified glibc calls: | ||
24 | |||
25 | /* caller must lock mutex */ | ||
26 | pthread_cond_wait(cond, mutex) | ||
27 | { | ||
28 | lock(cond->__data.__lock); | ||
29 | unlock(mutex); | ||
30 | do { | ||
31 | unlock(cond->__data.__lock); | ||
32 | futex_wait(cond->__data.__futex); | ||
33 | lock(cond->__data.__lock); | ||
34 | } while(...) | ||
35 | unlock(cond->__data.__lock); | ||
36 | lock(mutex); | ||
37 | } | ||
38 | |||
39 | pthread_cond_broadcast(cond) | ||
40 | { | ||
41 | lock(cond->__data.__lock); | ||
42 | unlock(cond->__data.__lock); | ||
43 | futex_requeue(cond->__data.__futex, cond->mutex); | ||
44 | } | ||
45 | |||
46 | Once pthread_cond_broadcast() requeues the tasks, the cond->mutex | ||
47 | has waiters. Note that pthread_cond_wait() attempts to lock the | ||
48 | mutex only after it has returned to user space. This will leave the | ||
49 | underlying rt_mutex with waiters, and no owner, breaking the | ||
50 | previously mentioned PI-boosting algorithms. | ||
51 | |||
52 | In order to support PI-aware pthread_condvar's, the kernel needs to | ||
53 | be able to requeue tasks to PI futexes. This support implies that | ||
54 | upon a successful futex_wait system call, the caller would return to | ||
55 | user space already holding the PI futex. The glibc implementation | ||
56 | would be modified as follows: | ||
57 | |||
58 | |||
59 | /* caller must lock mutex */ | ||
60 | pthread_cond_wait_pi(cond, mutex) | ||
61 | { | ||
62 | lock(cond->__data.__lock); | ||
63 | unlock(mutex); | ||
64 | do { | ||
65 | unlock(cond->__data.__lock); | ||
66 | futex_wait_requeue_pi(cond->__data.__futex); | ||
67 | lock(cond->__data.__lock); | ||
68 | } while(...) | ||
69 | unlock(cond->__data.__lock); | ||
70 | /* the kernel acquired the mutex for us */ | ||
71 | } | ||
72 | |||
73 | pthread_cond_broadcast_pi(cond) | ||
74 | { | ||
75 | lock(cond->__data.__lock); | ||
76 | unlock(cond->__data.__lock); | ||
77 | futex_requeue_pi(cond->__data.__futex, cond->mutex); | ||
78 | } | ||
79 | |||
80 | The actual glibc implementation will likely test for PI and make the | ||
81 | necessary changes inside the existing calls rather than creating new | ||
82 | calls for the PI cases. Similar changes are needed for | ||
83 | pthread_cond_timedwait() and pthread_cond_signal(). | ||
84 | |||
85 | Implementation | ||
86 | -------------- | ||
87 | |||
88 | In order to ensure the rt_mutex has an owner if it has waiters, it | ||
89 | is necessary for both the requeue code, as well as the waiting code, | ||
90 | to be able to acquire the rt_mutex before returning to user space. | ||
91 | The requeue code cannot simply wake the waiter and leave it to | ||
92 | acquire the rt_mutex as it would open a race window between the | ||
93 | requeue call returning to user space and the waiter waking and | ||
94 | starting to run. This is especially true in the uncontended case. | ||
95 | |||
96 | The solution involves two new rt_mutex helper routines, | ||
97 | rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which | ||
98 | allow the requeue code to acquire an uncontended rt_mutex on behalf | ||
99 | of the waiter and to enqueue the waiter on a contended rt_mutex. | ||
100 | Two new system calls provide the kernel<->user interface to | ||
101 | requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI. | ||
102 | |||
103 | FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait() | ||
104 | and pthread_cond_timedwait()) to block on the initial futex and wait | ||
105 | to be requeued to a PI-aware futex. The implementation is the | ||
106 | result of a high-speed collision between futex_wait() and | ||
107 | futex_lock_pi(), with some extra logic to check for the additional | ||
108 | wake-up scenarios. | ||
109 | |||
110 | FUTEX_CMP_REQUEUE_PI is called by the waker | ||
111 | (pthread_cond_broadcast() and pthread_cond_signal()) to requeue and | ||
112 | possibly wake the waiting tasks. Internally, this system call is | ||
113 | still handled by futex_requeue (by passing requeue_pi=1). Before | ||
114 | requeueing, futex_requeue() attempts to acquire the requeue target | ||
115 | PI futex on behalf of the top waiter. If it can, this waiter is | ||
116 | woken. futex_requeue() then proceeds to requeue the remaining | ||
117 | nr_wake+nr_requeue tasks to the PI futex, calling | ||
118 | rt_mutex_start_proxy_lock() prior to each requeue to prepare the | ||
119 | task as a waiter on the underlying rt_mutex. It is possible that | ||
120 | the lock can be acquired at this stage as well, if so, the next | ||
121 | waiter is woken to finish the acquisition of the lock. | ||
122 | |||
123 | FUTEX_CMP_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but | ||
124 | their sum is all that really matters. futex_requeue() will wake or | ||
125 | requeue up to nr_wake + nr_requeue tasks. It will wake only as many | ||
126 | tasks as it can acquire the lock for, which in the majority of cases | ||
127 | should be 0 as good programming practice dictates that the caller of | ||
128 | either pthread_cond_broadcast() or pthread_cond_signal() acquire the | ||
129 | mutex prior to making the call. FUTEX_CMP_REQUEUE_PI requires that | ||
130 | nr_wake=1. nr_requeue should be INT_MAX for broadcast and 0 for | ||
131 | signal. | ||
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 639ac805448a..65865726b283 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h | |||
@@ -102,8 +102,8 @@ struct thread_info { | |||
102 | #define TI_KERN_CNTD1 0x00000488 | 102 | #define TI_KERN_CNTD1 0x00000488 |
103 | #define TI_PCR 0x00000490 | 103 | #define TI_PCR 0x00000490 |
104 | #define TI_RESTART_BLOCK 0x00000498 | 104 | #define TI_RESTART_BLOCK 0x00000498 |
105 | #define TI_KUNA_REGS 0x000004c0 | 105 | #define TI_KUNA_REGS 0x000004c8 |
106 | #define TI_KUNA_INSN 0x000004c8 | 106 | #define TI_KUNA_INSN 0x000004d0 |
107 | #define TI_FPREGS 0x00000500 | 107 | #define TI_FPREGS 0x00000500 |
108 | 108 | ||
109 | /* We embed this in the uppermost byte of thread_info->flags */ | 109 | /* We embed this in the uppermost byte of thread_info->flags */ |
diff --git a/include/linux/futex.h b/include/linux/futex.h index 3bf5bb5a34f9..34956c8fdebf 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h | |||
@@ -23,6 +23,8 @@ union ktime; | |||
23 | #define FUTEX_TRYLOCK_PI 8 | 23 | #define FUTEX_TRYLOCK_PI 8 |
24 | #define FUTEX_WAIT_BITSET 9 | 24 | #define FUTEX_WAIT_BITSET 9 |
25 | #define FUTEX_WAKE_BITSET 10 | 25 | #define FUTEX_WAKE_BITSET 10 |
26 | #define FUTEX_WAIT_REQUEUE_PI 11 | ||
27 | #define FUTEX_CMP_REQUEUE_PI 12 | ||
26 | 28 | ||
27 | #define FUTEX_PRIVATE_FLAG 128 | 29 | #define FUTEX_PRIVATE_FLAG 128 |
28 | #define FUTEX_CLOCK_REALTIME 256 | 30 | #define FUTEX_CLOCK_REALTIME 256 |
@@ -38,6 +40,10 @@ union ktime; | |||
38 | #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) | 40 | #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) |
39 | #define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG) | 41 | #define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG) |
40 | #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) | 42 | #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) |
43 | #define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ | ||
44 | FUTEX_PRIVATE_FLAG) | ||
45 | #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ | ||
46 | FUTEX_PRIVATE_FLAG) | ||
41 | 47 | ||
42 | /* | 48 | /* |
43 | * Support for robust futexes: the kernel cleans up held futexes at | 49 | * Support for robust futexes: the kernel cleans up held futexes at |
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e6b820f8b56b..a8cc4e13434c 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h | |||
@@ -21,13 +21,14 @@ struct restart_block { | |||
21 | struct { | 21 | struct { |
22 | unsigned long arg0, arg1, arg2, arg3; | 22 | unsigned long arg0, arg1, arg2, arg3; |
23 | }; | 23 | }; |
24 | /* For futex_wait */ | 24 | /* For futex_wait and futex_wait_requeue_pi */ |
25 | struct { | 25 | struct { |
26 | u32 *uaddr; | 26 | u32 *uaddr; |
27 | u32 val; | 27 | u32 val; |
28 | u32 flags; | 28 | u32 flags; |
29 | u32 bitset; | 29 | u32 bitset; |
30 | u64 time; | 30 | u64 time; |
31 | u32 *uaddr2; | ||
31 | } futex; | 32 | } futex; |
32 | /* For nanosleep */ | 33 | /* For nanosleep */ |
33 | struct { | 34 | struct { |
diff --git a/kernel/futex.c b/kernel/futex.c index d546b2d53a62..80b5ce716596 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -19,6 +19,10 @@ | |||
19 | * PRIVATE futexes by Eric Dumazet | 19 | * PRIVATE futexes by Eric Dumazet |
20 | * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> | 20 | * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> |
21 | * | 21 | * |
22 | * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> | ||
23 | * Copyright (C) IBM Corporation, 2009 | ||
24 | * Thanks to Thomas Gleixner for conceptual design and careful reviews. | ||
25 | * | ||
22 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 26 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
23 | * enough at me, Linus for the original (flawed) idea, Matthew | 27 | * enough at me, Linus for the original (flawed) idea, Matthew |
24 | * Kirkwood for proof-of-concept implementation. | 28 | * Kirkwood for proof-of-concept implementation. |
@@ -96,8 +100,8 @@ struct futex_pi_state { | |||
96 | */ | 100 | */ |
97 | struct futex_q { | 101 | struct futex_q { |
98 | struct plist_node list; | 102 | struct plist_node list; |
99 | /* There can only be a single waiter */ | 103 | /* Waiter reference */ |
100 | wait_queue_head_t waiter; | 104 | struct task_struct *task; |
101 | 105 | ||
102 | /* Which hash list lock to use: */ | 106 | /* Which hash list lock to use: */ |
103 | spinlock_t *lock_ptr; | 107 | spinlock_t *lock_ptr; |
@@ -107,7 +111,9 @@ struct futex_q { | |||
107 | 111 | ||
108 | /* Optional priority inheritance state: */ | 112 | /* Optional priority inheritance state: */ |
109 | struct futex_pi_state *pi_state; | 113 | struct futex_pi_state *pi_state; |
110 | struct task_struct *task; | 114 | |
115 | /* rt_waiter storage for requeue_pi: */ | ||
116 | struct rt_mutex_waiter *rt_waiter; | ||
111 | 117 | ||
112 | /* Bitset for the optional bitmasked wakeup */ | 118 | /* Bitset for the optional bitmasked wakeup */ |
113 | u32 bitset; | 119 | u32 bitset; |
@@ -278,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key) | |||
278 | drop_futex_key_refs(key); | 284 | drop_futex_key_refs(key); |
279 | } | 285 | } |
280 | 286 | ||
287 | /** | ||
288 | * futex_top_waiter() - Return the highest priority waiter on a futex | ||
289 | * @hb: the hash bucket the futex_q's reside in | ||
290 | * @key: the futex key (to distinguish it from other futex futex_q's) | ||
291 | * | ||
292 | * Must be called with the hb lock held. | ||
293 | */ | ||
294 | static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, | ||
295 | union futex_key *key) | ||
296 | { | ||
297 | struct futex_q *this; | ||
298 | |||
299 | plist_for_each_entry(this, &hb->chain, list) { | ||
300 | if (match_futex(&this->key, key)) | ||
301 | return this; | ||
302 | } | ||
303 | return NULL; | ||
304 | } | ||
305 | |||
281 | static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) | 306 | static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) |
282 | { | 307 | { |
283 | u32 curval; | 308 | u32 curval; |
@@ -539,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
539 | return 0; | 564 | return 0; |
540 | } | 565 | } |
541 | 566 | ||
567 | /** | ||
568 | * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex | ||
569 | * @uaddr: the pi futex user address | ||
570 | * @hb: the pi futex hash bucket | ||
571 | * @key: the futex key associated with uaddr and hb | ||
572 | * @ps: the pi_state pointer where we store the result of the | ||
573 | * lookup | ||
574 | * @task: the task to perform the atomic lock work for. This will | ||
575 | * be "current" except in the case of requeue pi. | ||
576 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) | ||
577 | * | ||
578 | * Returns: | ||
579 | * 0 - ready to wait | ||
580 | * 1 - acquired the lock | ||
581 | * <0 - error | ||
582 | * | ||
583 | * The hb->lock and futex_key refs shall be held by the caller. | ||
584 | */ | ||
585 | static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | ||
586 | union futex_key *key, | ||
587 | struct futex_pi_state **ps, | ||
588 | struct task_struct *task, int set_waiters) | ||
589 | { | ||
590 | int lock_taken, ret, ownerdied = 0; | ||
591 | u32 uval, newval, curval; | ||
592 | |||
593 | retry: | ||
594 | ret = lock_taken = 0; | ||
595 | |||
596 | /* | ||
597 | * To avoid races, we attempt to take the lock here again | ||
598 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | ||
599 | * the locks. It will most likely not succeed. | ||
600 | */ | ||
601 | newval = task_pid_vnr(task); | ||
602 | if (set_waiters) | ||
603 | newval |= FUTEX_WAITERS; | ||
604 | |||
605 | curval = cmpxchg_futex_value_locked(uaddr, 0, newval); | ||
606 | |||
607 | if (unlikely(curval == -EFAULT)) | ||
608 | return -EFAULT; | ||
609 | |||
610 | /* | ||
611 | * Detect deadlocks. | ||
612 | */ | ||
613 | if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) | ||
614 | return -EDEADLK; | ||
615 | |||
616 | /* | ||
617 | * Surprise - we got the lock. Just return to userspace: | ||
618 | */ | ||
619 | if (unlikely(!curval)) | ||
620 | return 1; | ||
621 | |||
622 | uval = curval; | ||
623 | |||
624 | /* | ||
625 | * Set the FUTEX_WAITERS flag, so the owner will know it has someone | ||
626 | * to wake at the next unlock. | ||
627 | */ | ||
628 | newval = curval | FUTEX_WAITERS; | ||
629 | |||
630 | /* | ||
631 | * There are two cases, where a futex might have no owner (the | ||
632 | * owner TID is 0): OWNER_DIED. We take over the futex in this | ||
633 | * case. We also do an unconditional take over, when the owner | ||
634 | * of the futex died. | ||
635 | * | ||
636 | * This is safe as we are protected by the hash bucket lock ! | ||
637 | */ | ||
638 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | ||
639 | /* Keep the OWNER_DIED bit */ | ||
640 | newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); | ||
641 | ownerdied = 0; | ||
642 | lock_taken = 1; | ||
643 | } | ||
644 | |||
645 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | ||
646 | |||
647 | if (unlikely(curval == -EFAULT)) | ||
648 | return -EFAULT; | ||
649 | if (unlikely(curval != uval)) | ||
650 | goto retry; | ||
651 | |||
652 | /* | ||
653 | * We took the lock due to owner died take over. | ||
654 | */ | ||
655 | if (unlikely(lock_taken)) | ||
656 | return 1; | ||
657 | |||
658 | /* | ||
659 | * We dont have the lock. Look up the PI state (or create it if | ||
660 | * we are the first waiter): | ||
661 | */ | ||
662 | ret = lookup_pi_state(uval, hb, key, ps); | ||
663 | |||
664 | if (unlikely(ret)) { | ||
665 | switch (ret) { | ||
666 | case -ESRCH: | ||
667 | /* | ||
668 | * No owner found for this futex. Check if the | ||
669 | * OWNER_DIED bit is set to figure out whether | ||
670 | * this is a robust futex or not. | ||
671 | */ | ||
672 | if (get_futex_value_locked(&curval, uaddr)) | ||
673 | return -EFAULT; | ||
674 | |||
675 | /* | ||
676 | * We simply start over in case of a robust | ||
677 | * futex. The code above will take the futex | ||
678 | * and return happy. | ||
679 | */ | ||
680 | if (curval & FUTEX_OWNER_DIED) { | ||
681 | ownerdied = 1; | ||
682 | goto retry; | ||
683 | } | ||
684 | default: | ||
685 | break; | ||
686 | } | ||
687 | } | ||
688 | |||
689 | return ret; | ||
690 | } | ||
691 | |||
542 | /* | 692 | /* |
543 | * The hash bucket lock must be held when this is called. | 693 | * The hash bucket lock must be held when this is called. |
544 | * Afterwards, the futex_q must not be accessed. | 694 | * Afterwards, the futex_q must not be accessed. |
545 | */ | 695 | */ |
546 | static void wake_futex(struct futex_q *q) | 696 | static void wake_futex(struct futex_q *q) |
547 | { | 697 | { |
548 | plist_del(&q->list, &q->list.plist); | 698 | struct task_struct *p = q->task; |
699 | |||
549 | /* | 700 | /* |
550 | * The lock in wake_up_all() is a crucial memory barrier after the | 701 | * We set q->lock_ptr = NULL _before_ we wake up the task. If |
551 | * plist_del() and also before assigning to q->lock_ptr. | 702 | * a non futex wake up happens on another CPU then the task |
703 | * might exit and p would dereference a non existing task | ||
704 | * struct. Prevent this by holding a reference on p across the | ||
705 | * wake up. | ||
552 | */ | 706 | */ |
553 | wake_up(&q->waiter); | 707 | get_task_struct(p); |
708 | |||
709 | plist_del(&q->list, &q->list.plist); | ||
554 | /* | 710 | /* |
555 | * The waiting task can free the futex_q as soon as this is written, | 711 | * The waiting task can free the futex_q as soon as |
556 | * without taking any locks. This must come last. | 712 | * q->lock_ptr = NULL is written, without taking any locks. A |
557 | * | 713 | * memory barrier is required here to prevent the following |
558 | * A memory barrier is required here to prevent the following store to | 714 | * store to lock_ptr from getting ahead of the plist_del. |
559 | * lock_ptr from getting ahead of the wakeup. Clearing the lock at the | ||
560 | * end of wake_up() does not prevent this store from moving. | ||
561 | */ | 715 | */ |
562 | smp_wmb(); | 716 | smp_wmb(); |
563 | q->lock_ptr = NULL; | 717 | q->lock_ptr = NULL; |
718 | |||
719 | wake_up_state(p, TASK_NORMAL); | ||
720 | put_task_struct(p); | ||
564 | } | 721 | } |
565 | 722 | ||
566 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | 723 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) |
@@ -689,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | |||
689 | 846 | ||
690 | plist_for_each_entry_safe(this, next, head, list) { | 847 | plist_for_each_entry_safe(this, next, head, list) { |
691 | if (match_futex (&this->key, &key)) { | 848 | if (match_futex (&this->key, &key)) { |
692 | if (this->pi_state) { | 849 | if (this->pi_state || this->rt_waiter) { |
693 | ret = -EINVAL; | 850 | ret = -EINVAL; |
694 | break; | 851 | break; |
695 | } | 852 | } |
@@ -802,24 +959,185 @@ out: | |||
802 | return ret; | 959 | return ret; |
803 | } | 960 | } |
804 | 961 | ||
805 | /* | 962 | /** |
806 | * Requeue all waiters hashed on one physical page to another | 963 | * requeue_futex() - Requeue a futex_q from one hb to another |
807 | * physical page. | 964 | * @q: the futex_q to requeue |
965 | * @hb1: the source hash_bucket | ||
966 | * @hb2: the target hash_bucket | ||
967 | * @key2: the new key for the requeued futex_q | ||
968 | */ | ||
969 | static inline | ||
970 | void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, | ||
971 | struct futex_hash_bucket *hb2, union futex_key *key2) | ||
972 | { | ||
973 | |||
974 | /* | ||
975 | * If key1 and key2 hash to the same bucket, no need to | ||
976 | * requeue. | ||
977 | */ | ||
978 | if (likely(&hb1->chain != &hb2->chain)) { | ||
979 | plist_del(&q->list, &hb1->chain); | ||
980 | plist_add(&q->list, &hb2->chain); | ||
981 | q->lock_ptr = &hb2->lock; | ||
982 | #ifdef CONFIG_DEBUG_PI_LIST | ||
983 | q->list.plist.lock = &hb2->lock; | ||
984 | #endif | ||
985 | } | ||
986 | get_futex_key_refs(key2); | ||
987 | q->key = *key2; | ||
988 | } | ||
989 | |||
990 | /** | ||
991 | * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue | ||
992 | * @q: the futex_q | ||
993 | * @key: the key of the requeue target futex | ||
994 | * | ||
995 | * During futex_requeue, with requeue_pi=1, it is possible to acquire the | ||
996 | * target futex if it is uncontended or via a lock steal. Set the futex_q key | ||
997 | * to the requeue target futex so the waiter can detect the wakeup on the right | ||
998 | * futex, but remove it from the hb and NULL the rt_waiter so it can detect | ||
999 | * atomic lock acquisition. Must be called with the q->lock_ptr held. | ||
1000 | */ | ||
1001 | static inline | ||
1002 | void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) | ||
1003 | { | ||
1004 | drop_futex_key_refs(&q->key); | ||
1005 | get_futex_key_refs(key); | ||
1006 | q->key = *key; | ||
1007 | |||
1008 | WARN_ON(plist_node_empty(&q->list)); | ||
1009 | plist_del(&q->list, &q->list.plist); | ||
1010 | |||
1011 | WARN_ON(!q->rt_waiter); | ||
1012 | q->rt_waiter = NULL; | ||
1013 | |||
1014 | wake_up_state(q->task, TASK_NORMAL); | ||
1015 | } | ||
1016 | |||
1017 | /** | ||
1018 | * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter | ||
1019 | * @pifutex: the user address of the to futex | ||
1020 | * @hb1: the from futex hash bucket, must be locked by the caller | ||
1021 | * @hb2: the to futex hash bucket, must be locked by the caller | ||
1022 | * @key1: the from futex key | ||
1023 | * @key2: the to futex key | ||
1024 | * @ps: address to store the pi_state pointer | ||
1025 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) | ||
1026 | * | ||
1027 | * Try and get the lock on behalf of the top waiter if we can do it atomically. | ||
1028 | * Wake the top waiter if we succeed. If the caller specified set_waiters, | ||
1029 | * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. | ||
1030 | * hb1 and hb2 must be held by the caller. | ||
1031 | * | ||
1032 | * Returns: | ||
1033 | * 0 - failed to acquire the lock atomically | ||
1034 | * 1 - acquired the lock | ||
1035 | * <0 - error | ||
1036 | */ | ||
1037 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, | ||
1038 | struct futex_hash_bucket *hb1, | ||
1039 | struct futex_hash_bucket *hb2, | ||
1040 | union futex_key *key1, union futex_key *key2, | ||
1041 | struct futex_pi_state **ps, int set_waiters) | ||
1042 | { | ||
1043 | struct futex_q *top_waiter = NULL; | ||
1044 | u32 curval; | ||
1045 | int ret; | ||
1046 | |||
1047 | if (get_futex_value_locked(&curval, pifutex)) | ||
1048 | return -EFAULT; | ||
1049 | |||
1050 | /* | ||
1051 | * Find the top_waiter and determine if there are additional waiters. | ||
1052 | * If the caller intends to requeue more than 1 waiter to pifutex, | ||
1053 | * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, | ||
1054 | * as we have means to handle the possible fault. If not, don't set | ||
1055 | * the bit unnecessarily as it will force the subsequent unlock to enter | ||
1056 | * the kernel. | ||
1057 | */ | ||
1058 | top_waiter = futex_top_waiter(hb1, key1); | ||
1059 | |||
1060 | /* There are no waiters, nothing for us to do. */ | ||
1061 | if (!top_waiter) | ||
1062 | return 0; | ||
1063 | |||
1064 | /* | ||
1065 | * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in | ||
1066 | * the contended case or if set_waiters is 1. The pi_state is returned | ||
1067 | * in ps in contended cases. | ||
1068 | */ | ||
1069 | ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, | ||
1070 | set_waiters); | ||
1071 | if (ret == 1) | ||
1072 | requeue_pi_wake_futex(top_waiter, key2); | ||
1073 | |||
1074 | return ret; | ||
1075 | } | ||
1076 | |||
1077 | /** | ||
1078 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 | ||
1079 | * @uaddr1: source futex user address | ||
1080 | * @uaddr2: target futex user address | ||
1081 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) | ||
1082 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) | ||
1083 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a | ||
1084 | * pi futex (pi to pi requeue is not supported) | ||
1085 | * | ||
1086 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | ||
1087 | * uaddr2 atomically on behalf of the top waiter. | ||
1088 | * | ||
1089 | * Returns: | ||
1090 | * >=0 - on success, the number of tasks requeued or woken | ||
1091 | * <0 - on error | ||
808 | */ | 1092 | */ |
809 | static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | 1093 | static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, |
810 | int nr_wake, int nr_requeue, u32 *cmpval) | 1094 | int nr_wake, int nr_requeue, u32 *cmpval, |
1095 | int requeue_pi) | ||
811 | { | 1096 | { |
812 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 1097 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
1098 | int drop_count = 0, task_count = 0, ret; | ||
1099 | struct futex_pi_state *pi_state = NULL; | ||
813 | struct futex_hash_bucket *hb1, *hb2; | 1100 | struct futex_hash_bucket *hb1, *hb2; |
814 | struct plist_head *head1; | 1101 | struct plist_head *head1; |
815 | struct futex_q *this, *next; | 1102 | struct futex_q *this, *next; |
816 | int ret, drop_count = 0; | 1103 | u32 curval2; |
1104 | |||
1105 | if (requeue_pi) { | ||
1106 | /* | ||
1107 | * requeue_pi requires a pi_state, try to allocate it now | ||
1108 | * without any locks in case it fails. | ||
1109 | */ | ||
1110 | if (refill_pi_state_cache()) | ||
1111 | return -ENOMEM; | ||
1112 | /* | ||
1113 | * requeue_pi must wake as many tasks as it can, up to nr_wake | ||
1114 | * + nr_requeue, since it acquires the rt_mutex prior to | ||
1115 | * returning to userspace, so as to not leave the rt_mutex with | ||
1116 | * waiters and no owner. However, second and third wake-ups | ||
1117 | * cannot be predicted as they involve race conditions with the | ||
1118 | * first wake and a fault while looking up the pi_state. Both | ||
1119 | * pthread_cond_signal() and pthread_cond_broadcast() should | ||
1120 | * use nr_wake=1. | ||
1121 | */ | ||
1122 | if (nr_wake != 1) | ||
1123 | return -EINVAL; | ||
1124 | } | ||
817 | 1125 | ||
818 | retry: | 1126 | retry: |
1127 | if (pi_state != NULL) { | ||
1128 | /* | ||
1129 | * We will have to lookup the pi_state again, so free this one | ||
1130 | * to keep the accounting correct. | ||
1131 | */ | ||
1132 | free_pi_state(pi_state); | ||
1133 | pi_state = NULL; | ||
1134 | } | ||
1135 | |||
819 | ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); | 1136 | ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); |
820 | if (unlikely(ret != 0)) | 1137 | if (unlikely(ret != 0)) |
821 | goto out; | 1138 | goto out; |
822 | ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ); | 1139 | ret = get_futex_key(uaddr2, fshared, &key2, |
1140 | requeue_pi ? VERIFY_WRITE : VERIFY_READ); | ||
823 | if (unlikely(ret != 0)) | 1141 | if (unlikely(ret != 0)) |
824 | goto out_put_key1; | 1142 | goto out_put_key1; |
825 | 1143 | ||
@@ -854,32 +1172,99 @@ retry_private: | |||
854 | } | 1172 | } |
855 | } | 1173 | } |
856 | 1174 | ||
1175 | if (requeue_pi && (task_count - nr_wake < nr_requeue)) { | ||
1176 | /* | ||
1177 | * Attempt to acquire uaddr2 and wake the top waiter. If we | ||
1178 | * intend to requeue waiters, force setting the FUTEX_WAITERS | ||
1179 | * bit. We force this here where we are able to easily handle | ||
1180 | * faults rather in the requeue loop below. | ||
1181 | */ | ||
1182 | ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, | ||
1183 | &key2, &pi_state, nr_requeue); | ||
1184 | |||
1185 | /* | ||
1186 | * At this point the top_waiter has either taken uaddr2 or is | ||
1187 | * waiting on it. If the former, then the pi_state will not | ||
1188 | * exist yet, look it up one more time to ensure we have a | ||
1189 | * reference to it. | ||
1190 | */ | ||
1191 | if (ret == 1) { | ||
1192 | WARN_ON(pi_state); | ||
1193 | task_count++; | ||
1194 | ret = get_futex_value_locked(&curval2, uaddr2); | ||
1195 | if (!ret) | ||
1196 | ret = lookup_pi_state(curval2, hb2, &key2, | ||
1197 | &pi_state); | ||
1198 | } | ||
1199 | |||
1200 | switch (ret) { | ||
1201 | case 0: | ||
1202 | break; | ||
1203 | case -EFAULT: | ||
1204 | double_unlock_hb(hb1, hb2); | ||
1205 | put_futex_key(fshared, &key2); | ||
1206 | put_futex_key(fshared, &key1); | ||
1207 | ret = get_user(curval2, uaddr2); | ||
1208 | if (!ret) | ||
1209 | goto retry; | ||
1210 | goto out; | ||
1211 | case -EAGAIN: | ||
1212 | /* The owner was exiting, try again. */ | ||
1213 | double_unlock_hb(hb1, hb2); | ||
1214 | put_futex_key(fshared, &key2); | ||
1215 | put_futex_key(fshared, &key1); | ||
1216 | cond_resched(); | ||
1217 | goto retry; | ||
1218 | default: | ||
1219 | goto out_unlock; | ||
1220 | } | ||
1221 | } | ||
1222 | |||
857 | head1 = &hb1->chain; | 1223 | head1 = &hb1->chain; |
858 | plist_for_each_entry_safe(this, next, head1, list) { | 1224 | plist_for_each_entry_safe(this, next, head1, list) { |
859 | if (!match_futex (&this->key, &key1)) | 1225 | if (task_count - nr_wake >= nr_requeue) |
1226 | break; | ||
1227 | |||
1228 | if (!match_futex(&this->key, &key1)) | ||
860 | continue; | 1229 | continue; |
861 | if (++ret <= nr_wake) { | 1230 | |
1231 | WARN_ON(!requeue_pi && this->rt_waiter); | ||
1232 | WARN_ON(requeue_pi && !this->rt_waiter); | ||
1233 | |||
1234 | /* | ||
1235 | * Wake nr_wake waiters. For requeue_pi, if we acquired the | ||
1236 | * lock, we already woke the top_waiter. If not, it will be | ||
1237 | * woken by futex_unlock_pi(). | ||
1238 | */ | ||
1239 | if (++task_count <= nr_wake && !requeue_pi) { | ||
862 | wake_futex(this); | 1240 | wake_futex(this); |
863 | } else { | 1241 | continue; |
864 | /* | 1242 | } |
865 | * If key1 and key2 hash to the same bucket, no need to | ||
866 | * requeue. | ||
867 | */ | ||
868 | if (likely(head1 != &hb2->chain)) { | ||
869 | plist_del(&this->list, &hb1->chain); | ||
870 | plist_add(&this->list, &hb2->chain); | ||
871 | this->lock_ptr = &hb2->lock; | ||
872 | #ifdef CONFIG_DEBUG_PI_LIST | ||
873 | this->list.plist.lock = &hb2->lock; | ||
874 | #endif | ||
875 | } | ||
876 | this->key = key2; | ||
877 | get_futex_key_refs(&key2); | ||
878 | drop_count++; | ||
879 | 1243 | ||
880 | if (ret - nr_wake >= nr_requeue) | 1244 | /* |
881 | break; | 1245 | * Requeue nr_requeue waiters and possibly one more in the case |
1246 | * of requeue_pi if we couldn't acquire the lock atomically. | ||
1247 | */ | ||
1248 | if (requeue_pi) { | ||
1249 | /* Prepare the waiter to take the rt_mutex. */ | ||
1250 | atomic_inc(&pi_state->refcount); | ||
1251 | this->pi_state = pi_state; | ||
1252 | ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, | ||
1253 | this->rt_waiter, | ||
1254 | this->task, 1); | ||
1255 | if (ret == 1) { | ||
1256 | /* We got the lock. */ | ||
1257 | requeue_pi_wake_futex(this, &key2); | ||
1258 | continue; | ||
1259 | } else if (ret) { | ||
1260 | /* -EDEADLK */ | ||
1261 | this->pi_state = NULL; | ||
1262 | free_pi_state(pi_state); | ||
1263 | goto out_unlock; | ||
1264 | } | ||
882 | } | 1265 | } |
1266 | requeue_futex(this, hb1, hb2, &key2); | ||
1267 | drop_count++; | ||
883 | } | 1268 | } |
884 | 1269 | ||
885 | out_unlock: | 1270 | out_unlock: |
@@ -899,7 +1284,9 @@ out_put_keys: | |||
899 | out_put_key1: | 1284 | out_put_key1: |
900 | put_futex_key(fshared, &key1); | 1285 | put_futex_key(fshared, &key1); |
901 | out: | 1286 | out: |
902 | return ret; | 1287 | if (pi_state != NULL) |
1288 | free_pi_state(pi_state); | ||
1289 | return ret ? ret : task_count; | ||
903 | } | 1290 | } |
904 | 1291 | ||
905 | /* The key must be already stored in q->key. */ | 1292 | /* The key must be already stored in q->key. */ |
@@ -907,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | |||
907 | { | 1294 | { |
908 | struct futex_hash_bucket *hb; | 1295 | struct futex_hash_bucket *hb; |
909 | 1296 | ||
910 | init_waitqueue_head(&q->waiter); | ||
911 | |||
912 | get_futex_key_refs(&q->key); | 1297 | get_futex_key_refs(&q->key); |
913 | hb = hash_futex(&q->key); | 1298 | hb = hash_futex(&q->key); |
914 | q->lock_ptr = &hb->lock; | 1299 | q->lock_ptr = &hb->lock; |
@@ -1119,35 +1504,149 @@ handle_fault: | |||
1119 | */ | 1504 | */ |
1120 | #define FLAGS_SHARED 0x01 | 1505 | #define FLAGS_SHARED 0x01 |
1121 | #define FLAGS_CLOCKRT 0x02 | 1506 | #define FLAGS_CLOCKRT 0x02 |
1507 | #define FLAGS_HAS_TIMEOUT 0x04 | ||
1122 | 1508 | ||
1123 | static long futex_wait_restart(struct restart_block *restart); | 1509 | static long futex_wait_restart(struct restart_block *restart); |
1124 | 1510 | ||
1125 | static int futex_wait(u32 __user *uaddr, int fshared, | 1511 | /** |
1126 | u32 val, ktime_t *abs_time, u32 bitset, int clockrt) | 1512 | * fixup_owner() - Post lock pi_state and corner case management |
1513 | * @uaddr: user address of the futex | ||
1514 | * @fshared: whether the futex is shared (1) or not (0) | ||
1515 | * @q: futex_q (contains pi_state and access to the rt_mutex) | ||
1516 | * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) | ||
1517 | * | ||
1518 | * After attempting to lock an rt_mutex, this function is called to cleanup | ||
1519 | * the pi_state owner as well as handle race conditions that may allow us to | ||
1520 | * acquire the lock. Must be called with the hb lock held. | ||
1521 | * | ||
1522 | * Returns: | ||
1523 | * 1 - success, lock taken | ||
1524 | * 0 - success, lock not taken | ||
1525 | * <0 - on error (-EFAULT) | ||
1526 | */ | ||
1527 | static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, | ||
1528 | int locked) | ||
1127 | { | 1529 | { |
1128 | struct task_struct *curr = current; | 1530 | struct task_struct *owner; |
1129 | struct restart_block *restart; | 1531 | int ret = 0; |
1130 | DECLARE_WAITQUEUE(wait, curr); | ||
1131 | struct futex_hash_bucket *hb; | ||
1132 | struct futex_q q; | ||
1133 | u32 uval; | ||
1134 | int ret; | ||
1135 | struct hrtimer_sleeper t; | ||
1136 | int rem = 0; | ||
1137 | 1532 | ||
1138 | if (!bitset) | 1533 | if (locked) { |
1139 | return -EINVAL; | 1534 | /* |
1535 | * Got the lock. We might not be the anticipated owner if we | ||
1536 | * did a lock-steal - fix up the PI-state in that case: | ||
1537 | */ | ||
1538 | if (q->pi_state->owner != current) | ||
1539 | ret = fixup_pi_state_owner(uaddr, q, current, fshared); | ||
1540 | goto out; | ||
1541 | } | ||
1140 | 1542 | ||
1141 | q.pi_state = NULL; | 1543 | /* |
1142 | q.bitset = bitset; | 1544 | * Catch the rare case, where the lock was released when we were on the |
1143 | retry: | 1545 | * way back before we locked the hash bucket. |
1144 | q.key = FUTEX_KEY_INIT; | 1546 | */ |
1145 | ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ); | 1547 | if (q->pi_state->owner == current) { |
1146 | if (unlikely(ret != 0)) | 1548 | /* |
1549 | * Try to get the rt_mutex now. This might fail as some other | ||
1550 | * task acquired the rt_mutex after we removed ourself from the | ||
1551 | * rt_mutex waiters list. | ||
1552 | */ | ||
1553 | if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { | ||
1554 | locked = 1; | ||
1555 | goto out; | ||
1556 | } | ||
1557 | |||
1558 | /* | ||
1559 | * pi_state is incorrect, some other task did a lock steal and | ||
1560 | * we returned due to timeout or signal without taking the | ||
1561 | * rt_mutex. Too late. We can access the rt_mutex_owner without | ||
1562 | * locking, as the other task is now blocked on the hash bucket | ||
1563 | * lock. Fix the state up. | ||
1564 | */ | ||
1565 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); | ||
1566 | ret = fixup_pi_state_owner(uaddr, q, owner, fshared); | ||
1147 | goto out; | 1567 | goto out; |
1568 | } | ||
1148 | 1569 | ||
1149 | retry_private: | 1570 | /* |
1150 | hb = queue_lock(&q); | 1571 | * Paranoia check. If we did not take the lock, then we should not be |
1572 | * the owner, nor the pending owner, of the rt_mutex. | ||
1573 | */ | ||
1574 | if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) | ||
1575 | printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " | ||
1576 | "pi-state %p\n", ret, | ||
1577 | q->pi_state->pi_mutex.owner, | ||
1578 | q->pi_state->owner); | ||
1579 | |||
1580 | out: | ||
1581 | return ret ? ret : locked; | ||
1582 | } | ||
1583 | |||
1584 | /** | ||
1585 | * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal | ||
1586 | * @hb: the futex hash bucket, must be locked by the caller | ||
1587 | * @q: the futex_q to queue up on | ||
1588 | * @timeout: the prepared hrtimer_sleeper, or null for no timeout | ||
1589 | */ | ||
1590 | static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | ||
1591 | struct hrtimer_sleeper *timeout) | ||
1592 | { | ||
1593 | queue_me(q, hb); | ||
1594 | |||
1595 | /* | ||
1596 | * There might have been scheduling since the queue_me(), as we | ||
1597 | * cannot hold a spinlock across the get_user() in case it | ||
1598 | * faults, and we cannot just set TASK_INTERRUPTIBLE state when | ||
1599 | * queueing ourselves into the futex hash. This code thus has to | ||
1600 | * rely on the futex_wake() code removing us from hash when it | ||
1601 | * wakes us up. | ||
1602 | */ | ||
1603 | set_current_state(TASK_INTERRUPTIBLE); | ||
1604 | |||
1605 | /* Arm the timer */ | ||
1606 | if (timeout) { | ||
1607 | hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); | ||
1608 | if (!hrtimer_active(&timeout->timer)) | ||
1609 | timeout->task = NULL; | ||
1610 | } | ||
1611 | |||
1612 | /* | ||
1613 | * !plist_node_empty() is safe here without any lock. | ||
1614 | * q.lock_ptr != 0 is not safe, because of ordering against wakeup. | ||
1615 | */ | ||
1616 | if (likely(!plist_node_empty(&q->list))) { | ||
1617 | /* | ||
1618 | * If the timer has already expired, current will already be | ||
1619 | * flagged for rescheduling. Only call schedule if there | ||
1620 | * is no timeout, or if it has yet to expire. | ||
1621 | */ | ||
1622 | if (!timeout || timeout->task) | ||
1623 | schedule(); | ||
1624 | } | ||
1625 | __set_current_state(TASK_RUNNING); | ||
1626 | } | ||
1627 | |||
1628 | /** | ||
1629 | * futex_wait_setup() - Prepare to wait on a futex | ||
1630 | * @uaddr: the futex userspace address | ||
1631 | * @val: the expected value | ||
1632 | * @fshared: whether the futex is shared (1) or not (0) | ||
1633 | * @q: the associated futex_q | ||
1634 | * @hb: storage for hash_bucket pointer to be returned to caller | ||
1635 | * | ||
1636 | * Setup the futex_q and locate the hash_bucket. Get the futex value and | ||
1637 | * compare it with the expected value. Handle atomic faults internally. | ||
1638 | * Return with the hb lock held and a q.key reference on success, and unlocked | ||
1639 | * with no q.key reference on failure. | ||
1640 | * | ||
1641 | * Returns: | ||
1642 | * 0 - uaddr contains val and hb has been locked | ||
1643 | * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked | ||
1644 | */ | ||
1645 | static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, | ||
1646 | struct futex_q *q, struct futex_hash_bucket **hb) | ||
1647 | { | ||
1648 | u32 uval; | ||
1649 | int ret; | ||
1151 | 1650 | ||
1152 | /* | 1651 | /* |
1153 | * Access the page AFTER the hash-bucket is locked. | 1652 | * Access the page AFTER the hash-bucket is locked. |
@@ -1165,95 +1664,83 @@ retry_private: | |||
1165 | * A consequence is that futex_wait() can return zero and absorb | 1664 | * A consequence is that futex_wait() can return zero and absorb |
1166 | * a wakeup when *uaddr != val on entry to the syscall. This is | 1665 | * a wakeup when *uaddr != val on entry to the syscall. This is |
1167 | * rare, but normal. | 1666 | * rare, but normal. |
1168 | * | ||
1169 | * For shared futexes, we hold the mmap semaphore, so the mapping | ||
1170 | * cannot have changed since we looked it up in get_futex_key. | ||
1171 | */ | 1667 | */ |
1668 | retry: | ||
1669 | q->key = FUTEX_KEY_INIT; | ||
1670 | ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); | ||
1671 | if (unlikely(ret != 0)) | ||
1672 | return ret; | ||
1673 | |||
1674 | retry_private: | ||
1675 | *hb = queue_lock(q); | ||
1676 | |||
1172 | ret = get_futex_value_locked(&uval, uaddr); | 1677 | ret = get_futex_value_locked(&uval, uaddr); |
1173 | 1678 | ||
1174 | if (unlikely(ret)) { | 1679 | if (ret) { |
1175 | queue_unlock(&q, hb); | 1680 | queue_unlock(q, *hb); |
1176 | 1681 | ||
1177 | ret = get_user(uval, uaddr); | 1682 | ret = get_user(uval, uaddr); |
1178 | if (ret) | 1683 | if (ret) |
1179 | goto out_put_key; | 1684 | goto out; |
1180 | 1685 | ||
1181 | if (!fshared) | 1686 | if (!fshared) |
1182 | goto retry_private; | 1687 | goto retry_private; |
1183 | 1688 | ||
1184 | put_futex_key(fshared, &q.key); | 1689 | put_futex_key(fshared, &q->key); |
1185 | goto retry; | 1690 | goto retry; |
1186 | } | 1691 | } |
1187 | ret = -EWOULDBLOCK; | ||
1188 | if (unlikely(uval != val)) { | ||
1189 | queue_unlock(&q, hb); | ||
1190 | goto out_put_key; | ||
1191 | } | ||
1192 | 1692 | ||
1193 | /* Only actually queue if *uaddr contained val. */ | 1693 | if (uval != val) { |
1194 | queue_me(&q, hb); | 1694 | queue_unlock(q, *hb); |
1695 | ret = -EWOULDBLOCK; | ||
1696 | } | ||
1195 | 1697 | ||
1196 | /* | 1698 | out: |
1197 | * There might have been scheduling since the queue_me(), as we | 1699 | if (ret) |
1198 | * cannot hold a spinlock across the get_user() in case it | 1700 | put_futex_key(fshared, &q->key); |
1199 | * faults, and we cannot just set TASK_INTERRUPTIBLE state when | 1701 | return ret; |
1200 | * queueing ourselves into the futex hash. This code thus has to | 1702 | } |
1201 | * rely on the futex_wake() code removing us from hash when it | ||
1202 | * wakes us up. | ||
1203 | */ | ||
1204 | 1703 | ||
1205 | /* add_wait_queue is the barrier after __set_current_state. */ | 1704 | static int futex_wait(u32 __user *uaddr, int fshared, |
1206 | __set_current_state(TASK_INTERRUPTIBLE); | 1705 | u32 val, ktime_t *abs_time, u32 bitset, int clockrt) |
1207 | add_wait_queue(&q.waiter, &wait); | 1706 | { |
1208 | /* | 1707 | struct hrtimer_sleeper timeout, *to = NULL; |
1209 | * !plist_node_empty() is safe here without any lock. | 1708 | struct restart_block *restart; |
1210 | * q.lock_ptr != 0 is not safe, because of ordering against wakeup. | 1709 | struct futex_hash_bucket *hb; |
1211 | */ | 1710 | struct futex_q q; |
1212 | if (likely(!plist_node_empty(&q.list))) { | 1711 | int ret; |
1213 | if (!abs_time) | ||
1214 | schedule(); | ||
1215 | else { | ||
1216 | hrtimer_init_on_stack(&t.timer, | ||
1217 | clockrt ? CLOCK_REALTIME : | ||
1218 | CLOCK_MONOTONIC, | ||
1219 | HRTIMER_MODE_ABS); | ||
1220 | hrtimer_init_sleeper(&t, current); | ||
1221 | hrtimer_set_expires_range_ns(&t.timer, *abs_time, | ||
1222 | current->timer_slack_ns); | ||
1223 | |||
1224 | hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); | ||
1225 | if (!hrtimer_active(&t.timer)) | ||
1226 | t.task = NULL; | ||
1227 | 1712 | ||
1228 | /* | 1713 | if (!bitset) |
1229 | * the timer could have already expired, in which | 1714 | return -EINVAL; |
1230 | * case current would be flagged for rescheduling. | ||
1231 | * Don't bother calling schedule. | ||
1232 | */ | ||
1233 | if (likely(t.task)) | ||
1234 | schedule(); | ||
1235 | 1715 | ||
1236 | hrtimer_cancel(&t.timer); | 1716 | q.pi_state = NULL; |
1717 | q.bitset = bitset; | ||
1718 | q.rt_waiter = NULL; | ||
1237 | 1719 | ||
1238 | /* Flag if a timeout occured */ | 1720 | if (abs_time) { |
1239 | rem = (t.task == NULL); | 1721 | to = &timeout; |
1240 | 1722 | ||
1241 | destroy_hrtimer_on_stack(&t.timer); | 1723 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : |
1242 | } | 1724 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
1725 | hrtimer_init_sleeper(to, current); | ||
1726 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | ||
1727 | current->timer_slack_ns); | ||
1243 | } | 1728 | } |
1244 | __set_current_state(TASK_RUNNING); | ||
1245 | 1729 | ||
1246 | /* | 1730 | /* Prepare to wait on uaddr. */ |
1247 | * NOTE: we don't remove ourselves from the waitqueue because | 1731 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); |
1248 | * we are the only user of it. | 1732 | if (ret) |
1249 | */ | 1733 | goto out; |
1734 | |||
1735 | /* queue_me and wait for wakeup, timeout, or a signal. */ | ||
1736 | futex_wait_queue_me(hb, &q, to); | ||
1250 | 1737 | ||
1251 | /* If we were woken (and unqueued), we succeeded, whatever. */ | 1738 | /* If we were woken (and unqueued), we succeeded, whatever. */ |
1252 | ret = 0; | 1739 | ret = 0; |
1253 | if (!unqueue_me(&q)) | 1740 | if (!unqueue_me(&q)) |
1254 | goto out_put_key; | 1741 | goto out_put_key; |
1255 | ret = -ETIMEDOUT; | 1742 | ret = -ETIMEDOUT; |
1256 | if (rem) | 1743 | if (to && !to->task) |
1257 | goto out_put_key; | 1744 | goto out_put_key; |
1258 | 1745 | ||
1259 | /* | 1746 | /* |
@@ -1270,7 +1757,7 @@ retry_private: | |||
1270 | restart->futex.val = val; | 1757 | restart->futex.val = val; |
1271 | restart->futex.time = abs_time->tv64; | 1758 | restart->futex.time = abs_time->tv64; |
1272 | restart->futex.bitset = bitset; | 1759 | restart->futex.bitset = bitset; |
1273 | restart->futex.flags = 0; | 1760 | restart->futex.flags = FLAGS_HAS_TIMEOUT; |
1274 | 1761 | ||
1275 | if (fshared) | 1762 | if (fshared) |
1276 | restart->futex.flags |= FLAGS_SHARED; | 1763 | restart->futex.flags |= FLAGS_SHARED; |
@@ -1282,6 +1769,10 @@ retry_private: | |||
1282 | out_put_key: | 1769 | out_put_key: |
1283 | put_futex_key(fshared, &q.key); | 1770 | put_futex_key(fshared, &q.key); |
1284 | out: | 1771 | out: |
1772 | if (to) { | ||
1773 | hrtimer_cancel(&to->timer); | ||
1774 | destroy_hrtimer_on_stack(&to->timer); | ||
1775 | } | ||
1285 | return ret; | 1776 | return ret; |
1286 | } | 1777 | } |
1287 | 1778 | ||
@@ -1290,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart) | |||
1290 | { | 1781 | { |
1291 | u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; | 1782 | u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; |
1292 | int fshared = 0; | 1783 | int fshared = 0; |
1293 | ktime_t t; | 1784 | ktime_t t, *tp = NULL; |
1294 | 1785 | ||
1295 | t.tv64 = restart->futex.time; | 1786 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { |
1787 | t.tv64 = restart->futex.time; | ||
1788 | tp = &t; | ||
1789 | } | ||
1296 | restart->fn = do_no_restart_syscall; | 1790 | restart->fn = do_no_restart_syscall; |
1297 | if (restart->futex.flags & FLAGS_SHARED) | 1791 | if (restart->futex.flags & FLAGS_SHARED) |
1298 | fshared = 1; | 1792 | fshared = 1; |
1299 | return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, | 1793 | return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, |
1300 | restart->futex.bitset, | 1794 | restart->futex.bitset, |
1301 | restart->futex.flags & FLAGS_CLOCKRT); | 1795 | restart->futex.flags & FLAGS_CLOCKRT); |
1302 | } | 1796 | } |
@@ -1312,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1312 | int detect, ktime_t *time, int trylock) | 1806 | int detect, ktime_t *time, int trylock) |
1313 | { | 1807 | { |
1314 | struct hrtimer_sleeper timeout, *to = NULL; | 1808 | struct hrtimer_sleeper timeout, *to = NULL; |
1315 | struct task_struct *curr = current; | ||
1316 | struct futex_hash_bucket *hb; | 1809 | struct futex_hash_bucket *hb; |
1317 | u32 uval, newval, curval; | 1810 | u32 uval; |
1318 | struct futex_q q; | 1811 | struct futex_q q; |
1319 | int ret, lock_taken, ownerdied = 0; | 1812 | int res, ret; |
1320 | 1813 | ||
1321 | if (refill_pi_state_cache()) | 1814 | if (refill_pi_state_cache()) |
1322 | return -ENOMEM; | 1815 | return -ENOMEM; |
@@ -1330,6 +1823,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1330 | } | 1823 | } |
1331 | 1824 | ||
1332 | q.pi_state = NULL; | 1825 | q.pi_state = NULL; |
1826 | q.rt_waiter = NULL; | ||
1333 | retry: | 1827 | retry: |
1334 | q.key = FUTEX_KEY_INIT; | 1828 | q.key = FUTEX_KEY_INIT; |
1335 | ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); | 1829 | ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); |
@@ -1339,81 +1833,15 @@ retry: | |||
1339 | retry_private: | 1833 | retry_private: |
1340 | hb = queue_lock(&q); | 1834 | hb = queue_lock(&q); |
1341 | 1835 | ||
1342 | retry_locked: | 1836 | ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); |
1343 | ret = lock_taken = 0; | ||
1344 | |||
1345 | /* | ||
1346 | * To avoid races, we attempt to take the lock here again | ||
1347 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | ||
1348 | * the locks. It will most likely not succeed. | ||
1349 | */ | ||
1350 | newval = task_pid_vnr(current); | ||
1351 | |||
1352 | curval = cmpxchg_futex_value_locked(uaddr, 0, newval); | ||
1353 | |||
1354 | if (unlikely(curval == -EFAULT)) | ||
1355 | goto uaddr_faulted; | ||
1356 | |||
1357 | /* | ||
1358 | * Detect deadlocks. In case of REQUEUE_PI this is a valid | ||
1359 | * situation and we return success to user space. | ||
1360 | */ | ||
1361 | if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { | ||
1362 | ret = -EDEADLK; | ||
1363 | goto out_unlock_put_key; | ||
1364 | } | ||
1365 | |||
1366 | /* | ||
1367 | * Surprise - we got the lock. Just return to userspace: | ||
1368 | */ | ||
1369 | if (unlikely(!curval)) | ||
1370 | goto out_unlock_put_key; | ||
1371 | |||
1372 | uval = curval; | ||
1373 | |||
1374 | /* | ||
1375 | * Set the WAITERS flag, so the owner will know it has someone | ||
1376 | * to wake at next unlock | ||
1377 | */ | ||
1378 | newval = curval | FUTEX_WAITERS; | ||
1379 | |||
1380 | /* | ||
1381 | * There are two cases, where a futex might have no owner (the | ||
1382 | * owner TID is 0): OWNER_DIED. We take over the futex in this | ||
1383 | * case. We also do an unconditional take over, when the owner | ||
1384 | * of the futex died. | ||
1385 | * | ||
1386 | * This is safe as we are protected by the hash bucket lock ! | ||
1387 | */ | ||
1388 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | ||
1389 | /* Keep the OWNER_DIED bit */ | ||
1390 | newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); | ||
1391 | ownerdied = 0; | ||
1392 | lock_taken = 1; | ||
1393 | } | ||
1394 | |||
1395 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | ||
1396 | |||
1397 | if (unlikely(curval == -EFAULT)) | ||
1398 | goto uaddr_faulted; | ||
1399 | if (unlikely(curval != uval)) | ||
1400 | goto retry_locked; | ||
1401 | |||
1402 | /* | ||
1403 | * We took the lock due to owner died take over. | ||
1404 | */ | ||
1405 | if (unlikely(lock_taken)) | ||
1406 | goto out_unlock_put_key; | ||
1407 | |||
1408 | /* | ||
1409 | * We dont have the lock. Look up the PI state (or create it if | ||
1410 | * we are the first waiter): | ||
1411 | */ | ||
1412 | ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); | ||
1413 | |||
1414 | if (unlikely(ret)) { | 1837 | if (unlikely(ret)) { |
1415 | switch (ret) { | 1838 | switch (ret) { |
1416 | 1839 | case 1: | |
1840 | /* We got the lock. */ | ||
1841 | ret = 0; | ||
1842 | goto out_unlock_put_key; | ||
1843 | case -EFAULT: | ||
1844 | goto uaddr_faulted; | ||
1417 | case -EAGAIN: | 1845 | case -EAGAIN: |
1418 | /* | 1846 | /* |
1419 | * Task is exiting and we just wait for the | 1847 | * Task is exiting and we just wait for the |
@@ -1423,25 +1851,6 @@ retry_locked: | |||
1423 | put_futex_key(fshared, &q.key); | 1851 | put_futex_key(fshared, &q.key); |
1424 | cond_resched(); | 1852 | cond_resched(); |
1425 | goto retry; | 1853 | goto retry; |
1426 | |||
1427 | case -ESRCH: | ||
1428 | /* | ||
1429 | * No owner found for this futex. Check if the | ||
1430 | * OWNER_DIED bit is set to figure out whether | ||
1431 | * this is a robust futex or not. | ||
1432 | */ | ||
1433 | if (get_futex_value_locked(&curval, uaddr)) | ||
1434 | goto uaddr_faulted; | ||
1435 | |||
1436 | /* | ||
1437 | * We simply start over in case of a robust | ||
1438 | * futex. The code above will take the futex | ||
1439 | * and return happy. | ||
1440 | */ | ||
1441 | if (curval & FUTEX_OWNER_DIED) { | ||
1442 | ownerdied = 1; | ||
1443 | goto retry_locked; | ||
1444 | } | ||
1445 | default: | 1854 | default: |
1446 | goto out_unlock_put_key; | 1855 | goto out_unlock_put_key; |
1447 | } | 1856 | } |
@@ -1465,71 +1874,21 @@ retry_locked: | |||
1465 | } | 1874 | } |
1466 | 1875 | ||
1467 | spin_lock(q.lock_ptr); | 1876 | spin_lock(q.lock_ptr); |
1468 | 1877 | /* | |
1469 | if (!ret) { | 1878 | * Fixup the pi_state owner and possibly acquire the lock if we |
1470 | /* | 1879 | * haven't already. |
1471 | * Got the lock. We might not be the anticipated owner | 1880 | */ |
1472 | * if we did a lock-steal - fix up the PI-state in | 1881 | res = fixup_owner(uaddr, fshared, &q, !ret); |
1473 | * that case: | 1882 | /* |
1474 | */ | 1883 | * If fixup_owner() returned an error, propagate that. If it acquired |
1475 | if (q.pi_state->owner != curr) | 1884 | * the lock, clear our -ETIMEDOUT or -EINTR. |
1476 | ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); | 1885 | */ |
1477 | } else { | 1886 | if (res) |
1478 | /* | 1887 | ret = (res < 0) ? res : 0; |
1479 | * Catch the rare case, where the lock was released | ||
1480 | * when we were on the way back before we locked the | ||
1481 | * hash bucket. | ||
1482 | */ | ||
1483 | if (q.pi_state->owner == curr) { | ||
1484 | /* | ||
1485 | * Try to get the rt_mutex now. This might | ||
1486 | * fail as some other task acquired the | ||
1487 | * rt_mutex after we removed ourself from the | ||
1488 | * rt_mutex waiters list. | ||
1489 | */ | ||
1490 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | ||
1491 | ret = 0; | ||
1492 | else { | ||
1493 | /* | ||
1494 | * pi_state is incorrect, some other | ||
1495 | * task did a lock steal and we | ||
1496 | * returned due to timeout or signal | ||
1497 | * without taking the rt_mutex. Too | ||
1498 | * late. We can access the | ||
1499 | * rt_mutex_owner without locking, as | ||
1500 | * the other task is now blocked on | ||
1501 | * the hash bucket lock. Fix the state | ||
1502 | * up. | ||
1503 | */ | ||
1504 | struct task_struct *owner; | ||
1505 | int res; | ||
1506 | |||
1507 | owner = rt_mutex_owner(&q.pi_state->pi_mutex); | ||
1508 | res = fixup_pi_state_owner(uaddr, &q, owner, | ||
1509 | fshared); | ||
1510 | |||
1511 | /* propagate -EFAULT, if the fixup failed */ | ||
1512 | if (res) | ||
1513 | ret = res; | ||
1514 | } | ||
1515 | } else { | ||
1516 | /* | ||
1517 | * Paranoia check. If we did not take the lock | ||
1518 | * in the trylock above, then we should not be | ||
1519 | * the owner of the rtmutex, neither the real | ||
1520 | * nor the pending one: | ||
1521 | */ | ||
1522 | if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) | ||
1523 | printk(KERN_ERR "futex_lock_pi: ret = %d " | ||
1524 | "pi-mutex: %p pi-state %p\n", ret, | ||
1525 | q.pi_state->pi_mutex.owner, | ||
1526 | q.pi_state->owner); | ||
1527 | } | ||
1528 | } | ||
1529 | 1888 | ||
1530 | /* | 1889 | /* |
1531 | * If fixup_pi_state_owner() faulted and was unable to handle the | 1890 | * If fixup_owner() faulted and was unable to handle the fault, unlock |
1532 | * fault, unlock it and return the fault to userspace. | 1891 | * it and return the fault to userspace. |
1533 | */ | 1892 | */ |
1534 | if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) | 1893 | if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) |
1535 | rt_mutex_unlock(&q.pi_state->pi_mutex); | 1894 | rt_mutex_unlock(&q.pi_state->pi_mutex); |
@@ -1537,9 +1896,7 @@ retry_locked: | |||
1537 | /* Unqueue and drop the lock */ | 1896 | /* Unqueue and drop the lock */ |
1538 | unqueue_me_pi(&q); | 1897 | unqueue_me_pi(&q); |
1539 | 1898 | ||
1540 | if (to) | 1899 | goto out; |
1541 | destroy_hrtimer_on_stack(&to->timer); | ||
1542 | return ret != -EINTR ? ret : -ERESTARTNOINTR; | ||
1543 | 1900 | ||
1544 | out_unlock_put_key: | 1901 | out_unlock_put_key: |
1545 | queue_unlock(&q, hb); | 1902 | queue_unlock(&q, hb); |
@@ -1549,7 +1906,7 @@ out_put_key: | |||
1549 | out: | 1906 | out: |
1550 | if (to) | 1907 | if (to) |
1551 | destroy_hrtimer_on_stack(&to->timer); | 1908 | destroy_hrtimer_on_stack(&to->timer); |
1552 | return ret; | 1909 | return ret != -EINTR ? ret : -ERESTARTNOINTR; |
1553 | 1910 | ||
1554 | uaddr_faulted: | 1911 | uaddr_faulted: |
1555 | /* | 1912 | /* |
@@ -1572,7 +1929,6 @@ uaddr_faulted: | |||
1572 | goto retry; | 1929 | goto retry; |
1573 | } | 1930 | } |
1574 | 1931 | ||
1575 | |||
1576 | /* | 1932 | /* |
1577 | * Userspace attempted a TID -> 0 atomic transition, and failed. | 1933 | * Userspace attempted a TID -> 0 atomic transition, and failed. |
1578 | * This is the in-kernel slowpath: we look up the PI state (if any), | 1934 | * This is the in-kernel slowpath: we look up the PI state (if any), |
@@ -1674,6 +2030,229 @@ pi_faulted: | |||
1674 | return ret; | 2030 | return ret; |
1675 | } | 2031 | } |
1676 | 2032 | ||
2033 | /** | ||
2034 | * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex | ||
2035 | * @hb: the hash_bucket futex_q was original enqueued on | ||
2036 | * @q: the futex_q woken while waiting to be requeued | ||
2037 | * @key2: the futex_key of the requeue target futex | ||
2038 | * @timeout: the timeout associated with the wait (NULL if none) | ||
2039 | * | ||
2040 | * Detect if the task was woken on the initial futex as opposed to the requeue | ||
2041 | * target futex. If so, determine if it was a timeout or a signal that caused | ||
2042 | * the wakeup and return the appropriate error code to the caller. Must be | ||
2043 | * called with the hb lock held. | ||
2044 | * | ||
2045 | * Returns | ||
2046 | * 0 - no early wakeup detected | ||
2047 | * <0 - -ETIMEDOUT or -ERESTARTNOINTR | ||
2048 | */ | ||
2049 | static inline | ||
2050 | int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | ||
2051 | struct futex_q *q, union futex_key *key2, | ||
2052 | struct hrtimer_sleeper *timeout) | ||
2053 | { | ||
2054 | int ret = 0; | ||
2055 | |||
2056 | /* | ||
2057 | * With the hb lock held, we avoid races while we process the wakeup. | ||
2058 | * We only need to hold hb (and not hb2) to ensure atomicity as the | ||
2059 | * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. | ||
2060 | * It can't be requeued from uaddr2 to something else since we don't | ||
2061 | * support a PI aware source futex for requeue. | ||
2062 | */ | ||
2063 | if (!match_futex(&q->key, key2)) { | ||
2064 | WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); | ||
2065 | /* | ||
2066 | * We were woken prior to requeue by a timeout or a signal. | ||
2067 | * Unqueue the futex_q and determine which it was. | ||
2068 | */ | ||
2069 | plist_del(&q->list, &q->list.plist); | ||
2070 | drop_futex_key_refs(&q->key); | ||
2071 | |||
2072 | if (timeout && !timeout->task) | ||
2073 | ret = -ETIMEDOUT; | ||
2074 | else | ||
2075 | ret = -ERESTARTNOINTR; | ||
2076 | } | ||
2077 | return ret; | ||
2078 | } | ||
2079 | |||
2080 | /** | ||
2081 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 | ||
2082 | * @uaddr: the futex we initially wait on (non-pi) | ||
2083 | * @fshared: whether the futexes are shared (1) or not (0). They must be | ||
2084 | * the same type, no requeueing from private to shared, etc. | ||
2085 | * @val: the expected value of uaddr | ||
2086 | * @abs_time: absolute timeout | ||
2087 | * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. | ||
2088 | * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) | ||
2089 | * @uaddr2: the pi futex we will take prior to returning to user-space | ||
2090 | * | ||
2091 | * The caller will wait on uaddr and will be requeued by futex_requeue() to | ||
2092 | * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and | ||
2093 | * complete the acquisition of the rt_mutex prior to returning to userspace. | ||
2094 | * This ensures the rt_mutex maintains an owner when it has waiters; without | ||
2095 | * one, the pi logic wouldn't know which task to boost/deboost, if there was a | ||
2096 | * need to. | ||
2097 | * | ||
2098 | * We call schedule in futex_wait_queue_me() when we enqueue and return there | ||
2099 | * via the following: | ||
2100 | * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() | ||
2101 | * 2) wakeup on uaddr2 after a requeue and subsequent unlock | ||
2102 | * 3) signal (before or after requeue) | ||
2103 | * 4) timeout (before or after requeue) | ||
2104 | * | ||
2105 | * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. | ||
2106 | * | ||
2107 | * If 2, we may then block on trying to take the rt_mutex and return via: | ||
2108 | * 5) successful lock | ||
2109 | * 6) signal | ||
2110 | * 7) timeout | ||
2111 | * 8) other lock acquisition failure | ||
2112 | * | ||
2113 | * If 6, we setup a restart_block with futex_lock_pi() as the function. | ||
2114 | * | ||
2115 | * If 4 or 7, we cleanup and return with -ETIMEDOUT. | ||
2116 | * | ||
2117 | * Returns: | ||
2118 | * 0 - On success | ||
2119 | * <0 - On error | ||
2120 | */ | ||
2121 | static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | ||
2122 | u32 val, ktime_t *abs_time, u32 bitset, | ||
2123 | int clockrt, u32 __user *uaddr2) | ||
2124 | { | ||
2125 | struct hrtimer_sleeper timeout, *to = NULL; | ||
2126 | struct rt_mutex_waiter rt_waiter; | ||
2127 | struct rt_mutex *pi_mutex = NULL; | ||
2128 | struct futex_hash_bucket *hb; | ||
2129 | union futex_key key2; | ||
2130 | struct futex_q q; | ||
2131 | int res, ret; | ||
2132 | |||
2133 | if (!bitset) | ||
2134 | return -EINVAL; | ||
2135 | |||
2136 | if (abs_time) { | ||
2137 | to = &timeout; | ||
2138 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : | ||
2139 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
2140 | hrtimer_init_sleeper(to, current); | ||
2141 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | ||
2142 | current->timer_slack_ns); | ||
2143 | } | ||
2144 | |||
2145 | /* | ||
2146 | * The waiter is allocated on our stack, manipulated by the requeue | ||
2147 | * code while we sleep on uaddr. | ||
2148 | */ | ||
2149 | debug_rt_mutex_init_waiter(&rt_waiter); | ||
2150 | rt_waiter.task = NULL; | ||
2151 | |||
2152 | q.pi_state = NULL; | ||
2153 | q.bitset = bitset; | ||
2154 | q.rt_waiter = &rt_waiter; | ||
2155 | |||
2156 | key2 = FUTEX_KEY_INIT; | ||
2157 | ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); | ||
2158 | if (unlikely(ret != 0)) | ||
2159 | goto out; | ||
2160 | |||
2161 | /* Prepare to wait on uaddr. */ | ||
2162 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | ||
2163 | if (ret) | ||
2164 | goto out_key2; | ||
2165 | |||
2166 | /* Queue the futex_q, drop the hb lock, wait for wakeup. */ | ||
2167 | futex_wait_queue_me(hb, &q, to); | ||
2168 | |||
2169 | spin_lock(&hb->lock); | ||
2170 | ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); | ||
2171 | spin_unlock(&hb->lock); | ||
2172 | if (ret) | ||
2173 | goto out_put_keys; | ||
2174 | |||
2175 | /* | ||
2176 | * In order for us to be here, we know our q.key == key2, and since | ||
2177 | * we took the hb->lock above, we also know that futex_requeue() has | ||
2178 | * completed and we no longer have to concern ourselves with a wakeup | ||
2179 | * race with the atomic proxy lock acquisition by the requeue code. | ||
2180 | */ | ||
2181 | |||
2182 | /* Check if the requeue code acquired the second futex for us. */ | ||
2183 | if (!q.rt_waiter) { | ||
2184 | /* | ||
2185 | * Got the lock. We might not be the anticipated owner if we | ||
2186 | * did a lock-steal - fix up the PI-state in that case. | ||
2187 | */ | ||
2188 | if (q.pi_state && (q.pi_state->owner != current)) { | ||
2189 | spin_lock(q.lock_ptr); | ||
2190 | ret = fixup_pi_state_owner(uaddr2, &q, current, | ||
2191 | fshared); | ||
2192 | spin_unlock(q.lock_ptr); | ||
2193 | } | ||
2194 | } else { | ||
2195 | /* | ||
2196 | * We have been woken up by futex_unlock_pi(), a timeout, or a | ||
2197 | * signal. futex_unlock_pi() will not destroy the lock_ptr nor | ||
2198 | * the pi_state. | ||
2199 | */ | ||
2200 | WARN_ON(!&q.pi_state); | ||
2201 | pi_mutex = &q.pi_state->pi_mutex; | ||
2202 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); | ||
2203 | debug_rt_mutex_free_waiter(&rt_waiter); | ||
2204 | |||
2205 | spin_lock(q.lock_ptr); | ||
2206 | /* | ||
2207 | * Fixup the pi_state owner and possibly acquire the lock if we | ||
2208 | * haven't already. | ||
2209 | */ | ||
2210 | res = fixup_owner(uaddr2, fshared, &q, !ret); | ||
2211 | /* | ||
2212 | * If fixup_owner() returned an error, propagate that. If it | ||
2213 | * acquired the lock, clear our -ETIMEDOUT or -EINTR. | ||
2214 | */ | ||
2215 | if (res) | ||
2216 | ret = (res < 0) ? res : 0; | ||
2217 | |||
2218 | /* Unqueue and drop the lock. */ | ||
2219 | unqueue_me_pi(&q); | ||
2220 | } | ||
2221 | |||
2222 | /* | ||
2223 | * If fixup_pi_state_owner() faulted and was unable to handle the | ||
2224 | * fault, unlock the rt_mutex and return the fault to userspace. | ||
2225 | */ | ||
2226 | if (ret == -EFAULT) { | ||
2227 | if (rt_mutex_owner(pi_mutex) == current) | ||
2228 | rt_mutex_unlock(pi_mutex); | ||
2229 | } else if (ret == -EINTR) { | ||
2230 | /* | ||
2231 | * We've already been requeued, but we have no way to | ||
2232 | * restart by calling futex_lock_pi() directly. We | ||
2233 | * could restart the syscall, but that will look at | ||
2234 | * the user space value and return right away. So we | ||
2235 | * drop back with EWOULDBLOCK to tell user space that | ||
2236 | * "val" has been changed. That's the same what the | ||
2237 | * restart of the syscall would do in | ||
2238 | * futex_wait_setup(). | ||
2239 | */ | ||
2240 | ret = -EWOULDBLOCK; | ||
2241 | } | ||
2242 | |||
2243 | out_put_keys: | ||
2244 | put_futex_key(fshared, &q.key); | ||
2245 | out_key2: | ||
2246 | put_futex_key(fshared, &key2); | ||
2247 | |||
2248 | out: | ||
2249 | if (to) { | ||
2250 | hrtimer_cancel(&to->timer); | ||
2251 | destroy_hrtimer_on_stack(&to->timer); | ||
2252 | } | ||
2253 | return ret; | ||
2254 | } | ||
2255 | |||
1677 | /* | 2256 | /* |
1678 | * Support for robust futexes: the kernel cleans up held futexes at | 2257 | * Support for robust futexes: the kernel cleans up held futexes at |
1679 | * thread exit time. | 2258 | * thread exit time. |
@@ -1896,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
1896 | fshared = 1; | 2475 | fshared = 1; |
1897 | 2476 | ||
1898 | clockrt = op & FUTEX_CLOCK_REALTIME; | 2477 | clockrt = op & FUTEX_CLOCK_REALTIME; |
1899 | if (clockrt && cmd != FUTEX_WAIT_BITSET) | 2478 | if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) |
1900 | return -ENOSYS; | 2479 | return -ENOSYS; |
1901 | 2480 | ||
1902 | switch (cmd) { | 2481 | switch (cmd) { |
@@ -1911,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
1911 | ret = futex_wake(uaddr, fshared, val, val3); | 2490 | ret = futex_wake(uaddr, fshared, val, val3); |
1912 | break; | 2491 | break; |
1913 | case FUTEX_REQUEUE: | 2492 | case FUTEX_REQUEUE: |
1914 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); | 2493 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); |
1915 | break; | 2494 | break; |
1916 | case FUTEX_CMP_REQUEUE: | 2495 | case FUTEX_CMP_REQUEUE: |
1917 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); | 2496 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, |
2497 | 0); | ||
1918 | break; | 2498 | break; |
1919 | case FUTEX_WAKE_OP: | 2499 | case FUTEX_WAKE_OP: |
1920 | ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); | 2500 | ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); |
@@ -1931,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
1931 | if (futex_cmpxchg_enabled) | 2511 | if (futex_cmpxchg_enabled) |
1932 | ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); | 2512 | ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); |
1933 | break; | 2513 | break; |
2514 | case FUTEX_WAIT_REQUEUE_PI: | ||
2515 | val3 = FUTEX_BITSET_MATCH_ANY; | ||
2516 | ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, | ||
2517 | clockrt, uaddr2); | ||
2518 | break; | ||
2519 | case FUTEX_CMP_REQUEUE_PI: | ||
2520 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, | ||
2521 | 1); | ||
2522 | break; | ||
1934 | default: | 2523 | default: |
1935 | ret = -ENOSYS; | 2524 | ret = -ENOSYS; |
1936 | } | 2525 | } |
@@ -1948,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, | |||
1948 | int cmd = op & FUTEX_CMD_MASK; | 2537 | int cmd = op & FUTEX_CMD_MASK; |
1949 | 2538 | ||
1950 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || | 2539 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || |
1951 | cmd == FUTEX_WAIT_BITSET)) { | 2540 | cmd == FUTEX_WAIT_BITSET || |
2541 | cmd == FUTEX_WAIT_REQUEUE_PI)) { | ||
1952 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) | 2542 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) |
1953 | return -EFAULT; | 2543 | return -EFAULT; |
1954 | if (!timespec_valid(&ts)) | 2544 | if (!timespec_valid(&ts)) |
@@ -1960,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, | |||
1960 | tp = &t; | 2550 | tp = &t; |
1961 | } | 2551 | } |
1962 | /* | 2552 | /* |
1963 | * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. | 2553 | * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. |
1964 | * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. | 2554 | * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. |
1965 | */ | 2555 | */ |
1966 | if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || | 2556 | if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || |
1967 | cmd == FUTEX_WAKE_OP) | 2557 | cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) |
1968 | val2 = (u32) (unsigned long) utime; | 2558 | val2 = (u32) (unsigned long) utime; |
1969 | 2559 | ||
1970 | return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); | 2560 | return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); |
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 69d9cb921ffa..fec77e7e0562 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
300 | * assigned pending owner [which might not have taken the | 300 | * assigned pending owner [which might not have taken the |
301 | * lock yet]: | 301 | * lock yet]: |
302 | */ | 302 | */ |
303 | static inline int try_to_steal_lock(struct rt_mutex *lock) | 303 | static inline int try_to_steal_lock(struct rt_mutex *lock, |
304 | struct task_struct *task) | ||
304 | { | 305 | { |
305 | struct task_struct *pendowner = rt_mutex_owner(lock); | 306 | struct task_struct *pendowner = rt_mutex_owner(lock); |
306 | struct rt_mutex_waiter *next; | 307 | struct rt_mutex_waiter *next; |
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) | |||
309 | if (!rt_mutex_owner_pending(lock)) | 310 | if (!rt_mutex_owner_pending(lock)) |
310 | return 0; | 311 | return 0; |
311 | 312 | ||
312 | if (pendowner == current) | 313 | if (pendowner == task) |
313 | return 1; | 314 | return 1; |
314 | 315 | ||
315 | spin_lock_irqsave(&pendowner->pi_lock, flags); | 316 | spin_lock_irqsave(&pendowner->pi_lock, flags); |
316 | if (current->prio >= pendowner->prio) { | 317 | if (task->prio >= pendowner->prio) { |
317 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | 318 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); |
318 | return 0; | 319 | return 0; |
319 | } | 320 | } |
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) | |||
338 | * We are going to steal the lock and a waiter was | 339 | * We are going to steal the lock and a waiter was |
339 | * enqueued on the pending owners pi_waiters queue. So | 340 | * enqueued on the pending owners pi_waiters queue. So |
340 | * we have to enqueue this waiter into | 341 | * we have to enqueue this waiter into |
341 | * current->pi_waiters list. This covers the case, | 342 | * task->pi_waiters list. This covers the case, |
342 | * where current is boosted because it holds another | 343 | * where task is boosted because it holds another |
343 | * lock and gets unboosted because the booster is | 344 | * lock and gets unboosted because the booster is |
344 | * interrupted, so we would delay a waiter with higher | 345 | * interrupted, so we would delay a waiter with higher |
345 | * priority as current->normal_prio. | 346 | * priority as task->normal_prio. |
346 | * | 347 | * |
347 | * Note: in the rare case of a SCHED_OTHER task changing | 348 | * Note: in the rare case of a SCHED_OTHER task changing |
348 | * its priority and thus stealing the lock, next->task | 349 | * its priority and thus stealing the lock, next->task |
349 | * might be current: | 350 | * might be task: |
350 | */ | 351 | */ |
351 | if (likely(next->task != current)) { | 352 | if (likely(next->task != task)) { |
352 | spin_lock_irqsave(¤t->pi_lock, flags); | 353 | spin_lock_irqsave(&task->pi_lock, flags); |
353 | plist_add(&next->pi_list_entry, ¤t->pi_waiters); | 354 | plist_add(&next->pi_list_entry, &task->pi_waiters); |
354 | __rt_mutex_adjust_prio(current); | 355 | __rt_mutex_adjust_prio(task); |
355 | spin_unlock_irqrestore(¤t->pi_lock, flags); | 356 | spin_unlock_irqrestore(&task->pi_lock, flags); |
356 | } | 357 | } |
357 | return 1; | 358 | return 1; |
358 | } | 359 | } |
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) | |||
389 | */ | 390 | */ |
390 | mark_rt_mutex_waiters(lock); | 391 | mark_rt_mutex_waiters(lock); |
391 | 392 | ||
392 | if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) | 393 | if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) |
393 | return 0; | 394 | return 0; |
394 | 395 | ||
395 | /* We got the lock. */ | 396 | /* We got the lock. */ |
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) | |||
411 | */ | 412 | */ |
412 | static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | 413 | static int task_blocks_on_rt_mutex(struct rt_mutex *lock, |
413 | struct rt_mutex_waiter *waiter, | 414 | struct rt_mutex_waiter *waiter, |
415 | struct task_struct *task, | ||
414 | int detect_deadlock) | 416 | int detect_deadlock) |
415 | { | 417 | { |
416 | struct task_struct *owner = rt_mutex_owner(lock); | 418 | struct task_struct *owner = rt_mutex_owner(lock); |
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
418 | unsigned long flags; | 420 | unsigned long flags; |
419 | int chain_walk = 0, res; | 421 | int chain_walk = 0, res; |
420 | 422 | ||
421 | spin_lock_irqsave(¤t->pi_lock, flags); | 423 | spin_lock_irqsave(&task->pi_lock, flags); |
422 | __rt_mutex_adjust_prio(current); | 424 | __rt_mutex_adjust_prio(task); |
423 | waiter->task = current; | 425 | waiter->task = task; |
424 | waiter->lock = lock; | 426 | waiter->lock = lock; |
425 | plist_node_init(&waiter->list_entry, current->prio); | 427 | plist_node_init(&waiter->list_entry, task->prio); |
426 | plist_node_init(&waiter->pi_list_entry, current->prio); | 428 | plist_node_init(&waiter->pi_list_entry, task->prio); |
427 | 429 | ||
428 | /* Get the top priority waiter on the lock */ | 430 | /* Get the top priority waiter on the lock */ |
429 | if (rt_mutex_has_waiters(lock)) | 431 | if (rt_mutex_has_waiters(lock)) |
430 | top_waiter = rt_mutex_top_waiter(lock); | 432 | top_waiter = rt_mutex_top_waiter(lock); |
431 | plist_add(&waiter->list_entry, &lock->wait_list); | 433 | plist_add(&waiter->list_entry, &lock->wait_list); |
432 | 434 | ||
433 | current->pi_blocked_on = waiter; | 435 | task->pi_blocked_on = waiter; |
434 | 436 | ||
435 | spin_unlock_irqrestore(¤t->pi_lock, flags); | 437 | spin_unlock_irqrestore(&task->pi_lock, flags); |
436 | 438 | ||
437 | if (waiter == rt_mutex_top_waiter(lock)) { | 439 | if (waiter == rt_mutex_top_waiter(lock)) { |
438 | spin_lock_irqsave(&owner->pi_lock, flags); | 440 | spin_lock_irqsave(&owner->pi_lock, flags); |
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
460 | spin_unlock(&lock->wait_lock); | 462 | spin_unlock(&lock->wait_lock); |
461 | 463 | ||
462 | res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, | 464 | res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, |
463 | current); | 465 | task); |
464 | 466 | ||
465 | spin_lock(&lock->wait_lock); | 467 | spin_lock(&lock->wait_lock); |
466 | 468 | ||
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task) | |||
605 | rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); | 607 | rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); |
606 | } | 608 | } |
607 | 609 | ||
608 | /* | 610 | /** |
609 | * Slow path lock function: | 611 | * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop |
612 | * @lock: the rt_mutex to take | ||
613 | * @state: the state the task should block in (TASK_INTERRUPTIBLE | ||
614 | * or TASK_UNINTERRUPTIBLE) | ||
615 | * @timeout: the pre-initialized and started timer, or NULL for none | ||
616 | * @waiter: the pre-initialized rt_mutex_waiter | ||
617 | * @detect_deadlock: passed to task_blocks_on_rt_mutex | ||
618 | * | ||
619 | * lock->wait_lock must be held by the caller. | ||
610 | */ | 620 | */ |
611 | static int __sched | 621 | static int __sched |
612 | rt_mutex_slowlock(struct rt_mutex *lock, int state, | 622 | __rt_mutex_slowlock(struct rt_mutex *lock, int state, |
613 | struct hrtimer_sleeper *timeout, | 623 | struct hrtimer_sleeper *timeout, |
614 | int detect_deadlock) | 624 | struct rt_mutex_waiter *waiter, |
625 | int detect_deadlock) | ||
615 | { | 626 | { |
616 | struct rt_mutex_waiter waiter; | ||
617 | int ret = 0; | 627 | int ret = 0; |
618 | 628 | ||
619 | debug_rt_mutex_init_waiter(&waiter); | ||
620 | waiter.task = NULL; | ||
621 | |||
622 | spin_lock(&lock->wait_lock); | ||
623 | |||
624 | /* Try to acquire the lock again: */ | ||
625 | if (try_to_take_rt_mutex(lock)) { | ||
626 | spin_unlock(&lock->wait_lock); | ||
627 | return 0; | ||
628 | } | ||
629 | |||
630 | set_current_state(state); | ||
631 | |||
632 | /* Setup the timer, when timeout != NULL */ | ||
633 | if (unlikely(timeout)) { | ||
634 | hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); | ||
635 | if (!hrtimer_active(&timeout->timer)) | ||
636 | timeout->task = NULL; | ||
637 | } | ||
638 | |||
639 | for (;;) { | 629 | for (;;) { |
640 | /* Try to acquire the lock: */ | 630 | /* Try to acquire the lock: */ |
641 | if (try_to_take_rt_mutex(lock)) | 631 | if (try_to_take_rt_mutex(lock)) |
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
656 | } | 646 | } |
657 | 647 | ||
658 | /* | 648 | /* |
659 | * waiter.task is NULL the first time we come here and | 649 | * waiter->task is NULL the first time we come here and |
660 | * when we have been woken up by the previous owner | 650 | * when we have been woken up by the previous owner |
661 | * but the lock got stolen by a higher prio task. | 651 | * but the lock got stolen by a higher prio task. |
662 | */ | 652 | */ |
663 | if (!waiter.task) { | 653 | if (!waiter->task) { |
664 | ret = task_blocks_on_rt_mutex(lock, &waiter, | 654 | ret = task_blocks_on_rt_mutex(lock, waiter, current, |
665 | detect_deadlock); | 655 | detect_deadlock); |
666 | /* | 656 | /* |
667 | * If we got woken up by the owner then start loop | 657 | * If we got woken up by the owner then start loop |
668 | * all over without going into schedule to try | 658 | * all over without going into schedule to try |
669 | * to get the lock now: | 659 | * to get the lock now: |
670 | */ | 660 | */ |
671 | if (unlikely(!waiter.task)) { | 661 | if (unlikely(!waiter->task)) { |
672 | /* | 662 | /* |
673 | * Reset the return value. We might | 663 | * Reset the return value. We might |
674 | * have returned with -EDEADLK and the | 664 | * have returned with -EDEADLK and the |
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
684 | 674 | ||
685 | spin_unlock(&lock->wait_lock); | 675 | spin_unlock(&lock->wait_lock); |
686 | 676 | ||
687 | debug_rt_mutex_print_deadlock(&waiter); | 677 | debug_rt_mutex_print_deadlock(waiter); |
688 | 678 | ||
689 | if (waiter.task) | 679 | if (waiter->task) |
690 | schedule_rt_mutex(lock); | 680 | schedule_rt_mutex(lock); |
691 | 681 | ||
692 | spin_lock(&lock->wait_lock); | 682 | spin_lock(&lock->wait_lock); |
693 | set_current_state(state); | 683 | set_current_state(state); |
694 | } | 684 | } |
695 | 685 | ||
686 | return ret; | ||
687 | } | ||
688 | |||
689 | /* | ||
690 | * Slow path lock function: | ||
691 | */ | ||
692 | static int __sched | ||
693 | rt_mutex_slowlock(struct rt_mutex *lock, int state, | ||
694 | struct hrtimer_sleeper *timeout, | ||
695 | int detect_deadlock) | ||
696 | { | ||
697 | struct rt_mutex_waiter waiter; | ||
698 | int ret = 0; | ||
699 | |||
700 | debug_rt_mutex_init_waiter(&waiter); | ||
701 | waiter.task = NULL; | ||
702 | |||
703 | spin_lock(&lock->wait_lock); | ||
704 | |||
705 | /* Try to acquire the lock again: */ | ||
706 | if (try_to_take_rt_mutex(lock)) { | ||
707 | spin_unlock(&lock->wait_lock); | ||
708 | return 0; | ||
709 | } | ||
710 | |||
711 | set_current_state(state); | ||
712 | |||
713 | /* Setup the timer, when timeout != NULL */ | ||
714 | if (unlikely(timeout)) { | ||
715 | hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); | ||
716 | if (!hrtimer_active(&timeout->timer)) | ||
717 | timeout->task = NULL; | ||
718 | } | ||
719 | |||
720 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, | ||
721 | detect_deadlock); | ||
722 | |||
696 | set_current_state(TASK_RUNNING); | 723 | set_current_state(TASK_RUNNING); |
697 | 724 | ||
698 | if (unlikely(waiter.task)) | 725 | if (unlikely(waiter.task)) |
@@ -986,6 +1013,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, | |||
986 | } | 1013 | } |
987 | 1014 | ||
988 | /** | 1015 | /** |
1016 | * rt_mutex_start_proxy_lock() - Start lock acquisition for another task | ||
1017 | * @lock: the rt_mutex to take | ||
1018 | * @waiter: the pre-initialized rt_mutex_waiter | ||
1019 | * @task: the task to prepare | ||
1020 | * @detect_deadlock: perform deadlock detection (1) or not (0) | ||
1021 | * | ||
1022 | * Returns: | ||
1023 | * 0 - task blocked on lock | ||
1024 | * 1 - acquired the lock for task, caller should wake it up | ||
1025 | * <0 - error | ||
1026 | * | ||
1027 | * Special API call for FUTEX_REQUEUE_PI support. | ||
1028 | */ | ||
1029 | int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | ||
1030 | struct rt_mutex_waiter *waiter, | ||
1031 | struct task_struct *task, int detect_deadlock) | ||
1032 | { | ||
1033 | int ret; | ||
1034 | |||
1035 | spin_lock(&lock->wait_lock); | ||
1036 | |||
1037 | mark_rt_mutex_waiters(lock); | ||
1038 | |||
1039 | if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { | ||
1040 | /* We got the lock for task. */ | ||
1041 | debug_rt_mutex_lock(lock); | ||
1042 | |||
1043 | rt_mutex_set_owner(lock, task, 0); | ||
1044 | |||
1045 | rt_mutex_deadlock_account_lock(lock, task); | ||
1046 | return 1; | ||
1047 | } | ||
1048 | |||
1049 | ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); | ||
1050 | |||
1051 | |||
1052 | if (ret && !waiter->task) { | ||
1053 | /* | ||
1054 | * Reset the return value. We might have | ||
1055 | * returned with -EDEADLK and the owner | ||
1056 | * released the lock while we were walking the | ||
1057 | * pi chain. Let the waiter sort it out. | ||
1058 | */ | ||
1059 | ret = 0; | ||
1060 | } | ||
1061 | spin_unlock(&lock->wait_lock); | ||
1062 | |||
1063 | debug_rt_mutex_print_deadlock(waiter); | ||
1064 | |||
1065 | return ret; | ||
1066 | } | ||
1067 | |||
1068 | /** | ||
989 | * rt_mutex_next_owner - return the next owner of the lock | 1069 | * rt_mutex_next_owner - return the next owner of the lock |
990 | * | 1070 | * |
991 | * @lock: the rt lock query | 1071 | * @lock: the rt lock query |
@@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) | |||
1004 | 1084 | ||
1005 | return rt_mutex_top_waiter(lock)->task; | 1085 | return rt_mutex_top_waiter(lock)->task; |
1006 | } | 1086 | } |
1087 | |||
1088 | /** | ||
1089 | * rt_mutex_finish_proxy_lock() - Complete lock acquisition | ||
1090 | * @lock: the rt_mutex we were woken on | ||
1091 | * @to: the timeout, null if none. hrtimer should already have | ||
1092 | * been started. | ||
1093 | * @waiter: the pre-initialized rt_mutex_waiter | ||
1094 | * @detect_deadlock: perform deadlock detection (1) or not (0) | ||
1095 | * | ||
1846 | 1096 | * Complete the lock acquisition started on our behalf by another thread. | ||
1097 | * | ||
1098 | * Returns: | ||
1099 | * 0 - success | ||
1100 | * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK | ||
1101 | * | ||
1102 | * Special API call for PI-futex requeue support | ||
1103 | */ | ||
1104 | int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | ||
1105 | struct hrtimer_sleeper *to, | ||
1106 | struct rt_mutex_waiter *waiter, | ||
1107 | int detect_deadlock) | ||
1108 | { | ||
1109 | int ret; | ||
1110 | |||
1111 | spin_lock(&lock->wait_lock); | ||
1112 | |||
1113 | set_current_state(TASK_INTERRUPTIBLE); | ||
1114 | |||
1115 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, | ||
1116 | detect_deadlock); | ||
1117 | |||
1118 | set_current_state(TASK_RUNNING); | ||
1119 | |||
1120 | if (unlikely(waiter->task)) | ||
1121 | remove_waiter(lock, waiter); | ||
1122 | |||
1123 | /* | ||
1124 | * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might | ||
1125 | * have to fix that up. | ||
1126 | */ | ||
1127 | fixup_rt_mutex_waiters(lock); | ||
1128 | |||
1129 | spin_unlock(&lock->wait_lock); | ||
1130 | |||
1131 | /* | ||
1132 | * Readjust priority, when we did not get the lock. We might have been | ||
1133 | * the pending owner and boosted. Since we did not take the lock, the | ||
1134 | * PI boost has to go. | ||
1135 | */ | ||
1136 | if (unlikely(ret)) | ||
1137 | rt_mutex_adjust_prio(current); | ||
1138 | |||
1139 | return ret; | ||
1140 | } | ||
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index e124bf5800ea..97a2f81866af 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h | |||
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | |||
120 | struct task_struct *proxy_owner); | 120 | struct task_struct *proxy_owner); |
121 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, | 121 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, |
122 | struct task_struct *proxy_owner); | 122 | struct task_struct *proxy_owner); |
123 | extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | ||
124 | struct rt_mutex_waiter *waiter, | ||
125 | struct task_struct *task, | ||
126 | int detect_deadlock); | ||
127 | extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | ||
128 | struct hrtimer_sleeper *to, | ||
129 | struct rt_mutex_waiter *waiter, | ||
130 | int detect_deadlock); | ||
123 | 131 | ||
124 | #ifdef CONFIG_DEBUG_RT_MUTEXES | 132 | #ifdef CONFIG_DEBUG_RT_MUTEXES |
125 | # include "rtmutex-debug.h" | 133 | # include "rtmutex-debug.h" |