about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Darren Hart <dvhltc@us.ibm.com> 2009-04-03 16:40:49 -0400
committer Thomas Gleixner <tglx@linutronix.de> 2009-04-06 05:14:03 -0400
commit 52400ba946759af28442dee6265c5c0180ac7122 (patch)
tree 4c9abe885b3cae3cb47b33826b3c5838fc9761b7
parent f801073f87aa22ddf0e9146355fec3993163790f (diff)
futex: add requeue_pi functionality
PI Futexes and their underlying rt_mutex cannot be left ownerless if there are pending waiters as this will break the PI boosting logic, so the standard requeue commands aren't sufficient. The new commands properly manage pi futex ownership by ensuring a futex with waiters has an owner at all times. This will allow glibc to properly handle pi mutexes with pthread_condvars. The approach taken here is to create two new futex op codes: FUTEX_WAIT_REQUEUE_PI: Tasks will use this op code to wait on a futex (such as a non-pi waitqueue) and wake after they have been requeued to a pi futex. Prior to returning to userspace, they will acquire this pi futex (and the underlying rt_mutex). futex_wait_requeue_pi() is the result of a high speed collision between futex_wait() and futex_lock_pi() (with the first part of futex_lock_pi() being done by futex_proxy_trylock_atomic() on behalf of the top_waiter). FUTEX_REQUEUE_PI (and FUTEX_CMP_REQUEUE_PI): This call must be used to wake tasks waiting with FUTEX_WAIT_REQUEUE_PI, regardless of how many tasks the caller intends to wake or requeue. pthread_cond_broadcast() should call this with nr_wake=1 and nr_requeue=INT_MAX. pthread_cond_signal() should call this with nr_wake=1 and nr_requeue=0. The reason being we need both callers to get the benefit of the futex_proxy_trylock_atomic() routine. futex_requeue() also enqueues the top_waiter on the rt_mutex via rt_mutex_start_proxy_lock(). Signed-off-by: Darren Hart <dvhltc@us.ibm.com> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-rw-r--r-- include/linux/futex.h 8
-rw-r--r-- include/linux/thread_info.h 3
-rw-r--r-- kernel/futex.c 519
3 files changed, 510 insertions, 20 deletions
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 3bf5bb5a34f..b05519ca9e5 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -23,6 +23,9 @@ union ktime;
23#define FUTEX_TRYLOCK_PI 8 23#define FUTEX_TRYLOCK_PI 8
24#define FUTEX_WAIT_BITSET 9 24#define FUTEX_WAIT_BITSET 9
25#define FUTEX_WAKE_BITSET 10 25#define FUTEX_WAKE_BITSET 10
26#define FUTEX_WAIT_REQUEUE_PI 11
27#define FUTEX_REQUEUE_PI 12
28#define FUTEX_CMP_REQUEUE_PI 13
26 29
27#define FUTEX_PRIVATE_FLAG 128 30#define FUTEX_PRIVATE_FLAG 128
28#define FUTEX_CLOCK_REALTIME 256 31#define FUTEX_CLOCK_REALTIME 256
@@ -38,6 +41,11 @@ union ktime;
38#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) 41#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
39#define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG) 42#define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG)
40#define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) 43#define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG)
44#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \
45 FUTEX_PRIVATE_FLAG)
46#define FUTEX_REQUEUE_PI_PRIVATE (FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG)
47#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
48 FUTEX_PRIVATE_FLAG)
41 49
42/* 50/*
43 * Support for robust futexes: the kernel cleans up held futexes at 51 * Support for robust futexes: the kernel cleans up held futexes at
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index e6b820f8b56..a8cc4e13434 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -21,13 +21,14 @@ struct restart_block {
21 struct { 21 struct {
22 unsigned long arg0, arg1, arg2, arg3; 22 unsigned long arg0, arg1, arg2, arg3;
23 }; 23 };
24 /* For futex_wait */ 24 /* For futex_wait and futex_wait_requeue_pi */
25 struct { 25 struct {
26 u32 *uaddr; 26 u32 *uaddr;
27 u32 val; 27 u32 val;
28 u32 flags; 28 u32 flags;
29 u32 bitset; 29 u32 bitset;
30 u64 time; 30 u64 time;
31 u32 *uaddr2;
31 } futex; 32 } futex;
32 /* For nanosleep */ 33 /* For nanosleep */
33 struct { 34 struct {
diff --git a/kernel/futex.c b/kernel/futex.c
index dbe857aa438..185c981d89e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
19 * PRIVATE futexes by Eric Dumazet 19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 * 21 *
22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23 * Copyright (C) IBM Corporation, 2009
24 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
25 *
22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
23 * enough at me, Linus for the original (flawed) idea, Matthew 27 * enough at me, Linus for the original (flawed) idea, Matthew
24 * Kirkwood for proof-of-concept implementation. 28 * Kirkwood for proof-of-concept implementation.
@@ -109,6 +113,9 @@ struct futex_q {
109 struct futex_pi_state *pi_state; 113 struct futex_pi_state *pi_state;
110 struct task_struct *task; 114 struct task_struct *task;
111 115
116 /* rt_waiter storage for requeue_pi: */
117 struct rt_mutex_waiter *rt_waiter;
118
112 /* Bitset for the optional bitmasked wakeup */ 119 /* Bitset for the optional bitmasked wakeup */
113 u32 bitset; 120 u32 bitset;
114}; 121};
@@ -827,7 +834,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
827 834
828 plist_for_each_entry_safe(this, next, head, list) { 835 plist_for_each_entry_safe(this, next, head, list) {
829 if (match_futex (&this->key, &key)) { 836 if (match_futex (&this->key, &key)) {
830 if (this->pi_state) { 837 if (this->pi_state || this->rt_waiter) {
831 ret = -EINVAL; 838 ret = -EINVAL;
832 break; 839 break;
833 } 840 }
@@ -968,20 +975,138 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
968 q->key = *key2; 975 q->key = *key2;
969} 976}
970 977
971/* 978/**
972 * Requeue all waiters hashed on one physical page to another 979 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
973 * physical page. 980 * @q: the futex_q
981 * @key: the key of the requeue target futex
982 *
983 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
984 * target futex if it is uncontended or via a lock steal. Set the futex_q key
985 * to the requeue target futex so the waiter can detect the wakeup on the right
986 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
987 * atomic lock acquisition. Must be called with the q->lock_ptr held.
988 */
989static inline
990void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
991{
992 drop_futex_key_refs(&q->key);
993 get_futex_key_refs(key);
994 q->key = *key;
995
996 WARN_ON(plist_node_empty(&q->list));
997 plist_del(&q->list, &q->list.plist);
998
999 WARN_ON(!q->rt_waiter);
1000 q->rt_waiter = NULL;
1001
1002 wake_up(&q->waiter);
1003}
1004
1005/**
1006 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1007 * @pifutex: the user address of the to futex
1008 * @hb1: the from futex hash bucket, must be locked by the caller
1009 * @hb2: the to futex hash bucket, must be locked by the caller
1010 * @key1: the from futex key
1011 * @key2: the to futex key
1012 *
1013 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1014 * Wake the top waiter if we succeed. hb1 and hb2 must be held by the caller.
1015 *
1016 * Returns:
1017 * 0 - failed to acquire the lock atomically
1018 * 1 - acquired the lock
1019 * <0 - error
1020 */
1021static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1022 struct futex_hash_bucket *hb1,
1023 struct futex_hash_bucket *hb2,
1024 union futex_key *key1, union futex_key *key2,
1025 struct futex_pi_state **ps)
1026{
1027 struct futex_q *top_waiter;
1028 u32 curval;
1029 int ret;
1030
1031 if (get_futex_value_locked(&curval, pifutex))
1032 return -EFAULT;
1033
1034 top_waiter = futex_top_waiter(hb1, key1);
1035
1036 /* There are no waiters, nothing for us to do. */
1037 if (!top_waiter)
1038 return 0;
1039
1040 /*
1041 * Either take the lock for top_waiter or set the FUTEX_WAITERS bit.
1042 * The pi_state is returned in ps in contended cases.
1043 */
1044 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task);
1045 if (ret == 1)
1046 requeue_pi_wake_futex(top_waiter, key2);
1047
1048 return ret;
1049}
1050
1051/**
1052 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1053 * @uaddr1: source futex user address
1054 * @uaddr2: target futex user address
1055 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1056 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1057 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1058 * pi futex (pi to pi requeue is not supported)
1059 *
1060 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1061 * uaddr2 atomically on behalf of the top waiter.
1062 *
1063 * Returns:
1064 * >=0 - on success, the number of tasks requeued or woken
1065 * <0 - on error
974 */ 1066 */
975static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1067static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
976 int nr_wake, int nr_requeue, u32 *cmpval) 1068 int nr_wake, int nr_requeue, u32 *cmpval,
1069 int requeue_pi)
977{ 1070{
978 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1071 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1072 int drop_count = 0, task_count = 0, ret;
1073 struct futex_pi_state *pi_state = NULL;
979 struct futex_hash_bucket *hb1, *hb2; 1074 struct futex_hash_bucket *hb1, *hb2;
980 struct plist_head *head1; 1075 struct plist_head *head1;
981 struct futex_q *this, *next; 1076 struct futex_q *this, *next;
982 int ret, drop_count = 0; 1077 u32 curval2;
1078
1079 if (requeue_pi) {
1080 /*
1081 * requeue_pi requires a pi_state, try to allocate it now
1082 * without any locks in case it fails.
1083 */
1084 if (refill_pi_state_cache())
1085 return -ENOMEM;
1086 /*
1087 * requeue_pi must wake as many tasks as it can, up to nr_wake
1088 * + nr_requeue, since it acquires the rt_mutex prior to
1089 * returning to userspace, so as to not leave the rt_mutex with
1090 * waiters and no owner. However, second and third wake-ups
1091 * cannot be predicted as they involve race conditions with the
1092 * first wake and a fault while looking up the pi_state. Both
1093 * pthread_cond_signal() and pthread_cond_broadcast() should
1094 * use nr_wake=1.
1095 */
1096 if (nr_wake != 1)
1097 return -EINVAL;
1098 }
983 1099
984retry: 1100retry:
1101 if (pi_state != NULL) {
1102 /*
1103 * We will have to lookup the pi_state again, so free this one
1104 * to keep the accounting correct.
1105 */
1106 free_pi_state(pi_state);
1107 pi_state = NULL;
1108 }
1109
985 ret = get_futex_key(uaddr1, fshared, &key1); 1110 ret = get_futex_key(uaddr1, fshared, &key1);
986 if (unlikely(ret != 0)) 1111 if (unlikely(ret != 0))
987 goto out; 1112 goto out;
@@ -1020,19 +1145,94 @@ retry_private:
1020 } 1145 }
1021 } 1146 }
1022 1147
1148 if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1149 /* Attempt to acquire uaddr2 and wake the top_waiter. */
1150 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1151 &key2, &pi_state);
1152
1153 /*
1154 * At this point the top_waiter has either taken uaddr2 or is
1155 * waiting on it. If the former, then the pi_state will not
1156 * exist yet, look it up one more time to ensure we have a
1157 * reference to it.
1158 */
1159 if (ret == 1) {
1160 WARN_ON(pi_state);
1161 task_count++;
1162 ret = get_futex_value_locked(&curval2, uaddr2);
1163 if (!ret)
1164 ret = lookup_pi_state(curval2, hb2, &key2,
1165 &pi_state);
1166 }
1167
1168 switch (ret) {
1169 case 0:
1170 break;
1171 case -EFAULT:
1172 double_unlock_hb(hb1, hb2);
1173 put_futex_key(fshared, &key2);
1174 put_futex_key(fshared, &key1);
1175 ret = get_user(curval2, uaddr2);
1176 if (!ret)
1177 goto retry;
1178 goto out;
1179 case -EAGAIN:
1180 /* The owner was exiting, try again. */
1181 double_unlock_hb(hb1, hb2);
1182 put_futex_key(fshared, &key2);
1183 put_futex_key(fshared, &key1);
1184 cond_resched();
1185 goto retry;
1186 default:
1187 goto out_unlock;
1188 }
1189 }
1190
1023 head1 = &hb1->chain; 1191 head1 = &hb1->chain;
1024 plist_for_each_entry_safe(this, next, head1, list) { 1192 plist_for_each_entry_safe(this, next, head1, list) {
1025 if (!match_futex (&this->key, &key1)) 1193 if (task_count - nr_wake >= nr_requeue)
1194 break;
1195
1196 if (!match_futex(&this->key, &key1))
1026 continue; 1197 continue;
1027 if (++ret <= nr_wake) { 1198
1199 WARN_ON(!requeue_pi && this->rt_waiter);
1200 WARN_ON(requeue_pi && !this->rt_waiter);
1201
1202 /*
1203 * Wake nr_wake waiters. For requeue_pi, if we acquired the
1204 * lock, we already woke the top_waiter. If not, it will be
1205 * woken by futex_unlock_pi().
1206 */
1207 if (++task_count <= nr_wake && !requeue_pi) {
1028 wake_futex(this); 1208 wake_futex(this);
1029 } else { 1209 continue;
1030 requeue_futex(this, hb1, hb2, &key2); 1210 }
1031 drop_count++;
1032 1211
1033 if (ret - nr_wake >= nr_requeue) 1212 /*
1034 break; 1213 * Requeue nr_requeue waiters and possibly one more in the case
1214 * of requeue_pi if we couldn't acquire the lock atomically.
1215 */
1216 if (requeue_pi) {
1217 /* Prepare the waiter to take the rt_mutex. */
1218 atomic_inc(&pi_state->refcount);
1219 this->pi_state = pi_state;
1220 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1221 this->rt_waiter,
1222 this->task, 1);
1223 if (ret == 1) {
1224 /* We got the lock. */
1225 requeue_pi_wake_futex(this, &key2);
1226 continue;
1227 } else if (ret) {
1228 /* -EDEADLK */
1229 this->pi_state = NULL;
1230 free_pi_state(pi_state);
1231 goto out_unlock;
1232 }
1035 } 1233 }
1234 requeue_futex(this, hb1, hb2, &key2);
1235 drop_count++;
1036 } 1236 }
1037 1237
1038out_unlock: 1238out_unlock:
@@ -1047,7 +1247,9 @@ out_put_keys:
1047out_put_key1: 1247out_put_key1:
1048 put_futex_key(fshared, &key1); 1248 put_futex_key(fshared, &key1);
1049out: 1249out:
1050 return ret; 1250 if (pi_state != NULL)
1251 free_pi_state(pi_state);
1252 return ret ? ret : task_count;
1051} 1253}
1052 1254
1053/* The key must be already stored in q->key. */ 1255/* The key must be already stored in q->key. */
@@ -1270,6 +1472,7 @@ handle_fault:
1270#define FLAGS_HAS_TIMEOUT 0x04 1472#define FLAGS_HAS_TIMEOUT 0x04
1271 1473
1272static long futex_wait_restart(struct restart_block *restart); 1474static long futex_wait_restart(struct restart_block *restart);
1475static long futex_lock_pi_restart(struct restart_block *restart);
1273 1476
1274/** 1477/**
1275 * fixup_owner() - Post lock pi_state and corner case management 1478 * fixup_owner() - Post lock pi_state and corner case management
@@ -1489,6 +1692,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1489 1692
1490 q.pi_state = NULL; 1693 q.pi_state = NULL;
1491 q.bitset = bitset; 1694 q.bitset = bitset;
1695 q.rt_waiter = NULL;
1492 1696
1493 if (abs_time) { 1697 if (abs_time) {
1494 to = &timeout; 1698 to = &timeout;
@@ -1596,6 +1800,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1596 } 1800 }
1597 1801
1598 q.pi_state = NULL; 1802 q.pi_state = NULL;
1803 q.rt_waiter = NULL;
1599retry: 1804retry:
1600 q.key = FUTEX_KEY_INIT; 1805 q.key = FUTEX_KEY_INIT;
1601 ret = get_futex_key(uaddr, fshared, &q.key); 1806 ret = get_futex_key(uaddr, fshared, &q.key);
@@ -1701,6 +1906,20 @@ uaddr_faulted:
1701 goto retry; 1906 goto retry;
1702} 1907}
1703 1908
1909static long futex_lock_pi_restart(struct restart_block *restart)
1910{
1911 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1912 ktime_t t, *tp = NULL;
1913 int fshared = restart->futex.flags & FLAGS_SHARED;
1914
1915 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
1916 t.tv64 = restart->futex.time;
1917 tp = &t;
1918 }
1919 restart->fn = do_no_restart_syscall;
1920
1921 return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0);
1922}
1704 1923
1705/* 1924/*
1706 * Userspace attempted a TID -> 0 atomic transition, and failed. 1925 * Userspace attempted a TID -> 0 atomic transition, and failed.
@@ -1803,6 +2022,253 @@ pi_faulted:
1803 return ret; 2022 return ret;
1804} 2023}
1805 2024
2025/**
2026 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2027 * @hb: the hash_bucket futex_q was originally enqueued on
2028 * @q: the futex_q woken while waiting to be requeued
2029 * @key2: the futex_key of the requeue target futex
2030 * @timeout: the timeout associated with the wait (NULL if none)
2031 *
2032 * Detect if the task was woken on the initial futex as opposed to the requeue
2033 * target futex. If so, determine if it was a timeout or a signal that caused
2034 * the wakeup and return the appropriate error code to the caller. Must be
2035 * called with the hb lock held.
2036 *
2037 * Returns
2038 * 0 - no early wakeup detected
2039 * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?)
2040 */
2041static inline
2042int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2043 struct futex_q *q, union futex_key *key2,
2044 struct hrtimer_sleeper *timeout)
2045{
2046 int ret = 0;
2047
2048 /*
2049 * With the hb lock held, we avoid races while we process the wakeup.
2050 * We only need to hold hb (and not hb2) to ensure atomicity as the
2051 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2052 * It can't be requeued from uaddr2 to something else since we don't
2053 * support a PI aware source futex for requeue.
2054 */
2055 if (!match_futex(&q->key, key2)) {
2056 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2057 /*
2058 * We were woken prior to requeue by a timeout or a signal.
2059 * Unqueue the futex_q and determine which it was.
2060 */
2061 plist_del(&q->list, &q->list.plist);
2062 drop_futex_key_refs(&q->key);
2063
2064 if (timeout && !timeout->task)
2065 ret = -ETIMEDOUT;
2066 else {
2067 /*
2068 * We expect signal_pending(current), but another
2069 * thread may have handled it for us already.
2070 */
2071 /* FIXME: ERESTARTSYS or ERESTARTNOINTR? Do we care if
2072 * the user specified SA_RESTART or not? */
2073 ret = -ERESTARTSYS;
2074 }
2075 }
2076 return ret;
2077}
2078
2079/**
2080 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2081 * @uaddr: the futex we initially wait on (non-pi)
2082 * @fshared: whether the futexes are shared (1) or not (0). They must be
2083 * the same type, no requeueing from private to shared, etc.
2084 * @val: the expected value of uaddr
2085 * @abs_time: absolute timeout
2086 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
2087 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2088 * @uaddr2: the pi futex we will take prior to returning to user-space
2089 *
2090 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2091 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
2092 * complete the acquisition of the rt_mutex prior to returning to userspace.
2093 * This ensures the rt_mutex maintains an owner when it has waiters; without
2094 * one, the pi logic wouldn't know which task to boost/deboost, if there was a
2095 * need to.
2096 *
2097 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2098 * via the following:
2099 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2100 * 2) wakeup on uaddr2 after a requeue and subsequent unlock
2101 * 3) signal (before or after requeue)
2102 * 4) timeout (before or after requeue)
2103 *
2104 * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
2105 *
2106 * If 2, we may then block on trying to take the rt_mutex and return via:
2107 * 5) successful lock
2108 * 6) signal
2109 * 7) timeout
2110 * 8) other lock acquisition failure
2111 *
2112 * If 6, we setup a restart_block with futex_lock_pi() as the function.
2113 *
2114 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2115 *
2116 * Returns:
2117 * 0 - On success
2118 * <0 - On error
2119 */
2120static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2121 u32 val, ktime_t *abs_time, u32 bitset,
2122 int clockrt, u32 __user *uaddr2)
2123{
2124 struct hrtimer_sleeper timeout, *to = NULL;
2125 struct rt_mutex_waiter rt_waiter;
2126 struct rt_mutex *pi_mutex = NULL;
2127 DECLARE_WAITQUEUE(wait, current);
2128 struct restart_block *restart;
2129 struct futex_hash_bucket *hb;
2130 union futex_key key2;
2131 struct futex_q q;
2132 int res, ret;
2133 u32 uval;
2134
2135 if (!bitset)
2136 return -EINVAL;
2137
2138 if (abs_time) {
2139 to = &timeout;
2140 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
2141 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2142 hrtimer_init_sleeper(to, current);
2143 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2144 current->timer_slack_ns);
2145 }
2146
2147 /*
2148 * The waiter is allocated on our stack, manipulated by the requeue
2149 * code while we sleep on uaddr.
2150 */
2151 debug_rt_mutex_init_waiter(&rt_waiter);
2152 rt_waiter.task = NULL;
2153
2154 q.pi_state = NULL;
2155 q.bitset = bitset;
2156 q.rt_waiter = &rt_waiter;
2157
2158 key2 = FUTEX_KEY_INIT;
2159 ret = get_futex_key(uaddr2, fshared, &key2);
2160 if (unlikely(ret != 0))
2161 goto out;
2162
2163 /* Prepare to wait on uaddr. */
2164 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2165 if (ret) {
2166 put_futex_key(fshared, &key2);
2167 goto out;
2168 }
2169
2170 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2171 futex_wait_queue_me(hb, &q, to, &wait);
2172
2173 spin_lock(&hb->lock);
2174 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2175 spin_unlock(&hb->lock);
2176 if (ret)
2177 goto out_put_keys;
2178
2179 /*
2180 * In order for us to be here, we know our q.key == key2, and since
2181 * we took the hb->lock above, we also know that futex_requeue() has
2182 * completed and we no longer have to concern ourselves with a wakeup
2183 * race with the atomic proxy lock acquisition by the requeue code.
2184 */
2185
2186 /* Check if the requeue code acquired the second futex for us. */
2187 if (!q.rt_waiter) {
2188 /*
2189 * Got the lock. We might not be the anticipated owner if we
2190 * did a lock-steal - fix up the PI-state in that case.
2191 */
2192 if (q.pi_state && (q.pi_state->owner != current)) {
2193 spin_lock(q.lock_ptr);
2194 ret = fixup_pi_state_owner(uaddr2, &q, current,
2195 fshared);
2196 spin_unlock(q.lock_ptr);
2197 }
2198 } else {
2199 /*
2200 * We have been woken up by futex_unlock_pi(), a timeout, or a
2201 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2202 * the pi_state.
2203 */
2204 WARN_ON(!&q.pi_state);
2205 pi_mutex = &q.pi_state->pi_mutex;
2206 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2207 debug_rt_mutex_free_waiter(&rt_waiter);
2208
2209 spin_lock(q.lock_ptr);
2210 /*
2211 * Fixup the pi_state owner and possibly acquire the lock if we
2212 * haven't already.
2213 */
2214 res = fixup_owner(uaddr2, fshared, &q, !ret);
2215 /*
2216 * If fixup_owner() returned an error, propagate that. If it
2217 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
2218 */
2219 if (res)
2220 ret = (res < 0) ? res : 0;
2221
2222 /* Unqueue and drop the lock. */
2223 unqueue_me_pi(&q);
2224 }
2225
2226 /*
2227 * If fixup_pi_state_owner() faulted and was unable to handle the
2228 * fault, unlock the rt_mutex and return the fault to userspace.
2229 */
2230 if (ret == -EFAULT) {
2231 if (rt_mutex_owner(pi_mutex) == current)
2232 rt_mutex_unlock(pi_mutex);
2233 } else if (ret == -EINTR) {
2234 ret = -EFAULT;
2235 if (get_user(uval, uaddr2))
2236 goto out_put_keys;
2237
2238 /*
2239 * We've already been requeued, so restart by calling
2240 * futex_lock_pi() directly, rather than returning to this
2241 * function.
2242 */
2243 ret = -ERESTART_RESTARTBLOCK;
2244 restart = &current_thread_info()->restart_block;
2245 restart->fn = futex_lock_pi_restart;
2246 restart->futex.uaddr = (u32 *)uaddr2;
2247 restart->futex.val = uval;
2248 restart->futex.flags = 0;
2249 if (abs_time) {
2250 restart->futex.flags |= FLAGS_HAS_TIMEOUT;
2251 restart->futex.time = abs_time->tv64;
2252 }
2253
2254 if (fshared)
2255 restart->futex.flags |= FLAGS_SHARED;
2256 if (clockrt)
2257 restart->futex.flags |= FLAGS_CLOCKRT;
2258 }
2259
2260out_put_keys:
2261 put_futex_key(fshared, &q.key);
2262 put_futex_key(fshared, &key2);
2263
2264out:
2265 if (to) {
2266 hrtimer_cancel(&to->timer);
2267 destroy_hrtimer_on_stack(&to->timer);
2268 }
2269 return ret;
2270}
2271
1806/* 2272/*
1807 * Support for robust futexes: the kernel cleans up held futexes at 2273 * Support for robust futexes: the kernel cleans up held futexes at
1808 * thread exit time. 2274 * thread exit time.
@@ -2025,7 +2491,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2025 fshared = 1; 2491 fshared = 1;
2026 2492
2027 clockrt = op & FUTEX_CLOCK_REALTIME; 2493 clockrt = op & FUTEX_CLOCK_REALTIME;
2028 if (clockrt && cmd != FUTEX_WAIT_BITSET) 2494 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2029 return -ENOSYS; 2495 return -ENOSYS;
2030 2496
2031 switch (cmd) { 2497 switch (cmd) {
@@ -2040,10 +2506,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2040 ret = futex_wake(uaddr, fshared, val, val3); 2506 ret = futex_wake(uaddr, fshared, val, val3);
2041 break; 2507 break;
2042 case FUTEX_REQUEUE: 2508 case FUTEX_REQUEUE:
2043 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); 2509 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
2044 break; 2510 break;
2045 case FUTEX_CMP_REQUEUE: 2511 case FUTEX_CMP_REQUEUE:
2046 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); 2512 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2513 0);
2047 break; 2514 break;
2048 case FUTEX_WAKE_OP: 2515 case FUTEX_WAKE_OP:
2049 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2516 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -2060,6 +2527,18 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2060 if (futex_cmpxchg_enabled) 2527 if (futex_cmpxchg_enabled)
2061 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2528 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
2062 break; 2529 break;
2530 case FUTEX_WAIT_REQUEUE_PI:
2531 val3 = FUTEX_BITSET_MATCH_ANY;
2532 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
2533 clockrt, uaddr2);
2534 break;
2535 case FUTEX_REQUEUE_PI:
2536 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1);
2537 break;
2538 case FUTEX_CMP_REQUEUE_PI:
2539 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2540 1);
2541 break;
2063 default: 2542 default:
2064 ret = -ENOSYS; 2543 ret = -ENOSYS;
2065 } 2544 }
@@ -2077,7 +2556,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2077 int cmd = op & FUTEX_CMD_MASK; 2556 int cmd = op & FUTEX_CMD_MASK;
2078 2557
2079 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 2558 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2080 cmd == FUTEX_WAIT_BITSET)) { 2559 cmd == FUTEX_WAIT_BITSET ||
2560 cmd == FUTEX_WAIT_REQUEUE_PI)) {
2081 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 2561 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2082 return -EFAULT; 2562 return -EFAULT;
2083 if (!timespec_valid(&ts)) 2563 if (!timespec_valid(&ts))
@@ -2089,10 +2569,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2089 tp = &t; 2569 tp = &t;
2090 } 2570 }
2091 /* 2571 /*
2092 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. 2572 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
2093 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 2573 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
2094 */ 2574 */
2095 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || 2575 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
2576 cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI ||
2096 cmd == FUTEX_WAKE_OP) 2577 cmd == FUTEX_WAKE_OP)
2097 val2 = (u32) (unsigned long) utime; 2578 val2 = (u32) (unsigned long) utime;
2098 2579