diff options
author | Darren Hart <dvhltc@us.ibm.com> | 2009-04-03 16:40:49 -0400 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2009-04-06 05:14:03 -0400 |
commit | 52400ba946759af28442dee6265c5c0180ac7122 (patch) | |
tree | 4c9abe885b3cae3cb47b33826b3c5838fc9761b7 | |
parent | f801073f87aa22ddf0e9146355fec3993163790f (diff) |
futex: add requeue_pi functionality
PI Futexes and their underlying rt_mutex cannot be left ownerless if
there are pending waiters as this will break the PI boosting logic, so
the standard requeue commands aren't sufficient. The new commands
properly manage pi futex ownership by ensuring a futex with waiters
has an owner at all times. This will allow glibc to properly handle
pi mutexes with pthread_condvars.
The approach taken here is to create two new futex op codes:
FUTEX_WAIT_REQUEUE_PI:
Tasks will use this op code to wait on a futex (such as a non-pi waitqueue)
and wake after they have been requeued to a pi futex. Prior to returning to
userspace, they will acquire this pi futex (and the underlying rt_mutex).
futex_wait_requeue_pi() is the result of a high speed collision between
futex_wait() and futex_lock_pi() (with the first part of futex_lock_pi() being
done by futex_proxy_trylock_atomic() on behalf of the top_waiter).
FUTEX_REQUEUE_PI (and FUTEX_CMP_REQUEUE_PI):
This call must be used to wake tasks waiting with FUTEX_WAIT_REQUEUE_PI,
regardless of how many tasks the caller intends to wake or requeue.
pthread_cond_broadcast() should call this with nr_wake=1 and
nr_requeue=INT_MAX. pthread_cond_signal() should call this with nr_wake=1 and
nr_requeue=0. This is because we need both callers to get the benefit of the
futex_proxy_trylock_atomic() routine. futex_requeue() also enqueues the
top_waiter on the rt_mutex via rt_mutex_start_proxy_lock().
Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-rw-r--r-- | include/linux/futex.h | 8 | ||||
-rw-r--r-- | include/linux/thread_info.h | 3 | ||||
-rw-r--r-- | kernel/futex.c | 519 |
3 files changed, 510 insertions, 20 deletions
diff --git a/include/linux/futex.h b/include/linux/futex.h index 3bf5bb5a34f..b05519ca9e5 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h | |||
@@ -23,6 +23,9 @@ union ktime; | |||
23 | #define FUTEX_TRYLOCK_PI 8 | 23 | #define FUTEX_TRYLOCK_PI 8 |
24 | #define FUTEX_WAIT_BITSET 9 | 24 | #define FUTEX_WAIT_BITSET 9 |
25 | #define FUTEX_WAKE_BITSET 10 | 25 | #define FUTEX_WAKE_BITSET 10 |
26 | #define FUTEX_WAIT_REQUEUE_PI 11 | ||
27 | #define FUTEX_REQUEUE_PI 12 | ||
28 | #define FUTEX_CMP_REQUEUE_PI 13 | ||
26 | 29 | ||
27 | #define FUTEX_PRIVATE_FLAG 128 | 30 | #define FUTEX_PRIVATE_FLAG 128 |
28 | #define FUTEX_CLOCK_REALTIME 256 | 31 | #define FUTEX_CLOCK_REALTIME 256 |
@@ -38,6 +41,11 @@ union ktime; | |||
38 | #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) | 41 | #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) |
39 | #define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG) | 42 | #define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG) |
40 | #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) | 43 | #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) |
44 | #define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ | ||
45 | FUTEX_PRIVATE_FLAG) | ||
46 | #define FUTEX_REQUEUE_PI_PRIVATE (FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG) | ||
47 | #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ | ||
48 | FUTEX_PRIVATE_FLAG) | ||
41 | 49 | ||
42 | /* | 50 | /* |
43 | * Support for robust futexes: the kernel cleans up held futexes at | 51 | * Support for robust futexes: the kernel cleans up held futexes at |
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e6b820f8b56..a8cc4e13434 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h | |||
@@ -21,13 +21,14 @@ struct restart_block { | |||
21 | struct { | 21 | struct { |
22 | unsigned long arg0, arg1, arg2, arg3; | 22 | unsigned long arg0, arg1, arg2, arg3; |
23 | }; | 23 | }; |
24 | /* For futex_wait */ | 24 | /* For futex_wait and futex_wait_requeue_pi */ |
25 | struct { | 25 | struct { |
26 | u32 *uaddr; | 26 | u32 *uaddr; |
27 | u32 val; | 27 | u32 val; |
28 | u32 flags; | 28 | u32 flags; |
29 | u32 bitset; | 29 | u32 bitset; |
30 | u64 time; | 30 | u64 time; |
31 | u32 *uaddr2; | ||
31 | } futex; | 32 | } futex; |
32 | /* For nanosleep */ | 33 | /* For nanosleep */ |
33 | struct { | 34 | struct { |
diff --git a/kernel/futex.c b/kernel/futex.c index dbe857aa438..185c981d89e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -19,6 +19,10 @@ | |||
19 | * PRIVATE futexes by Eric Dumazet | 19 | * PRIVATE futexes by Eric Dumazet |
20 | * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> | 20 | * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> |
21 | * | 21 | * |
22 | * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> | ||
23 | * Copyright (C) IBM Corporation, 2009 | ||
24 | * Thanks to Thomas Gleixner for conceptual design and careful reviews. | ||
25 | * | ||
22 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 26 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
23 | * enough at me, Linus for the original (flawed) idea, Matthew | 27 | * enough at me, Linus for the original (flawed) idea, Matthew |
24 | * Kirkwood for proof-of-concept implementation. | 28 | * Kirkwood for proof-of-concept implementation. |
@@ -109,6 +113,9 @@ struct futex_q { | |||
109 | struct futex_pi_state *pi_state; | 113 | struct futex_pi_state *pi_state; |
110 | struct task_struct *task; | 114 | struct task_struct *task; |
111 | 115 | ||
116 | /* rt_waiter storage for requeue_pi: */ | ||
117 | struct rt_mutex_waiter *rt_waiter; | ||
118 | |||
112 | /* Bitset for the optional bitmasked wakeup */ | 119 | /* Bitset for the optional bitmasked wakeup */ |
113 | u32 bitset; | 120 | u32 bitset; |
114 | }; | 121 | }; |
@@ -827,7 +834,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | |||
827 | 834 | ||
828 | plist_for_each_entry_safe(this, next, head, list) { | 835 | plist_for_each_entry_safe(this, next, head, list) { |
829 | if (match_futex (&this->key, &key)) { | 836 | if (match_futex (&this->key, &key)) { |
830 | if (this->pi_state) { | 837 | if (this->pi_state || this->rt_waiter) { |
831 | ret = -EINVAL; | 838 | ret = -EINVAL; |
832 | break; | 839 | break; |
833 | } | 840 | } |
@@ -968,20 +975,138 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, | |||
968 | q->key = *key2; | 975 | q->key = *key2; |
969 | } | 976 | } |
970 | 977 | ||
971 | /* | 978 | /** |
972 | * Requeue all waiters hashed on one physical page to another | 979 | * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue |
973 | * physical page. | 980 | * q: the futex_q |
981 | * key: the key of the requeue target futex | ||
982 | * | ||
983 | * During futex_requeue, with requeue_pi=1, it is possible to acquire the | ||
984 | * target futex if it is uncontended or via a lock steal. Set the futex_q key | ||
985 | * to the requeue target futex so the waiter can detect the wakeup on the right | ||
986 | * futex, but remove it from the hb and NULL the rt_waiter so it can detect | ||
987 | * atomic lock acquisition. Must be called with the q->lock_ptr held. | ||
988 | */ | ||
989 | static inline | ||
990 | void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) | ||
991 | { | ||
992 | drop_futex_key_refs(&q->key); | ||
993 | get_futex_key_refs(key); | ||
994 | q->key = *key; | ||
995 | |||
996 | WARN_ON(plist_node_empty(&q->list)); | ||
997 | plist_del(&q->list, &q->list.plist); | ||
998 | |||
999 | WARN_ON(!q->rt_waiter); | ||
1000 | q->rt_waiter = NULL; | ||
1001 | |||
1002 | wake_up(&q->waiter); | ||
1003 | } | ||
1004 | |||
1005 | /** | ||
1006 | * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter | ||
1007 | * @pifutex: the user address of the to futex | ||
1008 | * @hb1: the from futex hash bucket, must be locked by the caller | ||
1009 | * @hb2: the to futex hash bucket, must be locked by the caller | ||
1010 | * @key1: the from futex key | ||
1011 | * @key2: the to futex key | ||
1012 | * | ||
1013 | * Try and get the lock on behalf of the top waiter if we can do it atomically. | ||
1014 | * Wake the top waiter if we succeed. hb1 and hb2 must be held by the caller. | ||
1015 | * | ||
1016 | * Returns: | ||
1017 | * 0 - failed to acquire the lock atomically | ||
1018 | * 1 - acquired the lock | ||
1019 | * <0 - error | ||
1020 | */ | ||
1021 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, | ||
1022 | struct futex_hash_bucket *hb1, | ||
1023 | struct futex_hash_bucket *hb2, | ||
1024 | union futex_key *key1, union futex_key *key2, | ||
1025 | struct futex_pi_state **ps) | ||
1026 | { | ||
1027 | struct futex_q *top_waiter; | ||
1028 | u32 curval; | ||
1029 | int ret; | ||
1030 | |||
1031 | if (get_futex_value_locked(&curval, pifutex)) | ||
1032 | return -EFAULT; | ||
1033 | |||
1034 | top_waiter = futex_top_waiter(hb1, key1); | ||
1035 | |||
1036 | /* There are no waiters, nothing for us to do. */ | ||
1037 | if (!top_waiter) | ||
1038 | return 0; | ||
1039 | |||
1040 | /* | ||
1041 | * Either take the lock for top_waiter or set the FUTEX_WAITERS bit. | ||
1042 | * The pi_state is returned in ps in contended cases. | ||
1043 | */ | ||
1044 | ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task); | ||
1045 | if (ret == 1) | ||
1046 | requeue_pi_wake_futex(top_waiter, key2); | ||
1047 | |||
1048 | return ret; | ||
1049 | } | ||
1050 | |||
1051 | /** | ||
1052 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 | ||
1053 | * uaddr1: source futex user address | ||
1054 | * uaddr2: target futex user address | ||
1055 | * nr_wake: number of waiters to wake (must be 1 for requeue_pi) | ||
1056 | * nr_requeue: number of waiters to requeue (0-INT_MAX) | ||
1057 | * requeue_pi: if we are attempting to requeue from a non-pi futex to a | ||
1058 | * pi futex (pi to pi requeue is not supported) | ||
1059 | * | ||
1060 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | ||
1061 | * uaddr2 atomically on behalf of the top waiter. | ||
1062 | * | ||
1063 | * Returns: | ||
1064 | * >=0 - on success, the number of tasks requeued or woken | ||
1065 | * <0 - on error | ||
974 | */ | 1066 | */ |
975 | static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | 1067 | static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, |
976 | int nr_wake, int nr_requeue, u32 *cmpval) | 1068 | int nr_wake, int nr_requeue, u32 *cmpval, |
1069 | int requeue_pi) | ||
977 | { | 1070 | { |
978 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 1071 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
1072 | int drop_count = 0, task_count = 0, ret; | ||
1073 | struct futex_pi_state *pi_state = NULL; | ||
979 | struct futex_hash_bucket *hb1, *hb2; | 1074 | struct futex_hash_bucket *hb1, *hb2; |
980 | struct plist_head *head1; | 1075 | struct plist_head *head1; |
981 | struct futex_q *this, *next; | 1076 | struct futex_q *this, *next; |
982 | int ret, drop_count = 0; | 1077 | u32 curval2; |
1078 | |||
1079 | if (requeue_pi) { | ||
1080 | /* | ||
1081 | * requeue_pi requires a pi_state, try to allocate it now | ||
1082 | * without any locks in case it fails. | ||
1083 | */ | ||
1084 | if (refill_pi_state_cache()) | ||
1085 | return -ENOMEM; | ||
1086 | /* | ||
1087 | * requeue_pi must wake as many tasks as it can, up to nr_wake | ||
1088 | * + nr_requeue, since it acquires the rt_mutex prior to | ||
1089 | * returning to userspace, so as to not leave the rt_mutex with | ||
1090 | * waiters and no owner. However, second and third wake-ups | ||
1091 | * cannot be predicted as they involve race conditions with the | ||
1092 | * first wake and a fault while looking up the pi_state. Both | ||
1093 | * pthread_cond_signal() and pthread_cond_broadcast() should | ||
1094 | * use nr_wake=1. | ||
1095 | */ | ||
1096 | if (nr_wake != 1) | ||
1097 | return -EINVAL; | ||
1098 | } | ||
983 | 1099 | ||
984 | retry: | 1100 | retry: |
1101 | if (pi_state != NULL) { | ||
1102 | /* | ||
1103 | * We will have to lookup the pi_state again, so free this one | ||
1104 | * to keep the accounting correct. | ||
1105 | */ | ||
1106 | free_pi_state(pi_state); | ||
1107 | pi_state = NULL; | ||
1108 | } | ||
1109 | |||
985 | ret = get_futex_key(uaddr1, fshared, &key1); | 1110 | ret = get_futex_key(uaddr1, fshared, &key1); |
986 | if (unlikely(ret != 0)) | 1111 | if (unlikely(ret != 0)) |
987 | goto out; | 1112 | goto out; |
@@ -1020,19 +1145,94 @@ retry_private: | |||
1020 | } | 1145 | } |
1021 | } | 1146 | } |
1022 | 1147 | ||
1148 | if (requeue_pi && (task_count - nr_wake < nr_requeue)) { | ||
1149 | /* Attempt to acquire uaddr2 and wake the top_waiter. */ | ||
1150 | ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, | ||
1151 | &key2, &pi_state); | ||
1152 | |||
1153 | /* | ||
1154 | * At this point the top_waiter has either taken uaddr2 or is | ||
1155 | * waiting on it. If the former, then the pi_state will not | ||
1156 | * exist yet, look it up one more time to ensure we have a | ||
1157 | * reference to it. | ||
1158 | */ | ||
1159 | if (ret == 1) { | ||
1160 | WARN_ON(pi_state); | ||
1161 | task_count++; | ||
1162 | ret = get_futex_value_locked(&curval2, uaddr2); | ||
1163 | if (!ret) | ||
1164 | ret = lookup_pi_state(curval2, hb2, &key2, | ||
1165 | &pi_state); | ||
1166 | } | ||
1167 | |||
1168 | switch (ret) { | ||
1169 | case 0: | ||
1170 | break; | ||
1171 | case -EFAULT: | ||
1172 | double_unlock_hb(hb1, hb2); | ||
1173 | put_futex_key(fshared, &key2); | ||
1174 | put_futex_key(fshared, &key1); | ||
1175 | ret = get_user(curval2, uaddr2); | ||
1176 | if (!ret) | ||
1177 | goto retry; | ||
1178 | goto out; | ||
1179 | case -EAGAIN: | ||
1180 | /* The owner was exiting, try again. */ | ||
1181 | double_unlock_hb(hb1, hb2); | ||
1182 | put_futex_key(fshared, &key2); | ||
1183 | put_futex_key(fshared, &key1); | ||
1184 | cond_resched(); | ||
1185 | goto retry; | ||
1186 | default: | ||
1187 | goto out_unlock; | ||
1188 | } | ||
1189 | } | ||
1190 | |||
1023 | head1 = &hb1->chain; | 1191 | head1 = &hb1->chain; |
1024 | plist_for_each_entry_safe(this, next, head1, list) { | 1192 | plist_for_each_entry_safe(this, next, head1, list) { |
1025 | if (!match_futex (&this->key, &key1)) | 1193 | if (task_count - nr_wake >= nr_requeue) |
1194 | break; | ||
1195 | |||
1196 | if (!match_futex(&this->key, &key1)) | ||
1026 | continue; | 1197 | continue; |
1027 | if (++ret <= nr_wake) { | 1198 | |
1199 | WARN_ON(!requeue_pi && this->rt_waiter); | ||
1200 | WARN_ON(requeue_pi && !this->rt_waiter); | ||
1201 | |||
1202 | /* | ||
1203 | * Wake nr_wake waiters. For requeue_pi, if we acquired the | ||
1204 | * lock, we already woke the top_waiter. If not, it will be | ||
1205 | * woken by futex_unlock_pi(). | ||
1206 | */ | ||
1207 | if (++task_count <= nr_wake && !requeue_pi) { | ||
1028 | wake_futex(this); | 1208 | wake_futex(this); |
1029 | } else { | 1209 | continue; |
1030 | requeue_futex(this, hb1, hb2, &key2); | 1210 | } |
1031 | drop_count++; | ||
1032 | 1211 | ||
1033 | if (ret - nr_wake >= nr_requeue) | 1212 | /* |
1034 | break; | 1213 | * Requeue nr_requeue waiters and possibly one more in the case |
1214 | * of requeue_pi if we couldn't acquire the lock atomically. | ||
1215 | */ | ||
1216 | if (requeue_pi) { | ||
1217 | /* Prepare the waiter to take the rt_mutex. */ | ||
1218 | atomic_inc(&pi_state->refcount); | ||
1219 | this->pi_state = pi_state; | ||
1220 | ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, | ||
1221 | this->rt_waiter, | ||
1222 | this->task, 1); | ||
1223 | if (ret == 1) { | ||
1224 | /* We got the lock. */ | ||
1225 | requeue_pi_wake_futex(this, &key2); | ||
1226 | continue; | ||
1227 | } else if (ret) { | ||
1228 | /* -EDEADLK */ | ||
1229 | this->pi_state = NULL; | ||
1230 | free_pi_state(pi_state); | ||
1231 | goto out_unlock; | ||
1232 | } | ||
1035 | } | 1233 | } |
1234 | requeue_futex(this, hb1, hb2, &key2); | ||
1235 | drop_count++; | ||
1036 | } | 1236 | } |
1037 | 1237 | ||
1038 | out_unlock: | 1238 | out_unlock: |
@@ -1047,7 +1247,9 @@ out_put_keys: | |||
1047 | out_put_key1: | 1247 | out_put_key1: |
1048 | put_futex_key(fshared, &key1); | 1248 | put_futex_key(fshared, &key1); |
1049 | out: | 1249 | out: |
1050 | return ret; | 1250 | if (pi_state != NULL) |
1251 | free_pi_state(pi_state); | ||
1252 | return ret ? ret : task_count; | ||
1051 | } | 1253 | } |
1052 | 1254 | ||
1053 | /* The key must be already stored in q->key. */ | 1255 | /* The key must be already stored in q->key. */ |
@@ -1270,6 +1472,7 @@ handle_fault: | |||
1270 | #define FLAGS_HAS_TIMEOUT 0x04 | 1472 | #define FLAGS_HAS_TIMEOUT 0x04 |
1271 | 1473 | ||
1272 | static long futex_wait_restart(struct restart_block *restart); | 1474 | static long futex_wait_restart(struct restart_block *restart); |
1475 | static long futex_lock_pi_restart(struct restart_block *restart); | ||
1273 | 1476 | ||
1274 | /** | 1477 | /** |
1275 | * fixup_owner() - Post lock pi_state and corner case management | 1478 | * fixup_owner() - Post lock pi_state and corner case management |
@@ -1489,6 +1692,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, | |||
1489 | 1692 | ||
1490 | q.pi_state = NULL; | 1693 | q.pi_state = NULL; |
1491 | q.bitset = bitset; | 1694 | q.bitset = bitset; |
1695 | q.rt_waiter = NULL; | ||
1492 | 1696 | ||
1493 | if (abs_time) { | 1697 | if (abs_time) { |
1494 | to = &timeout; | 1698 | to = &timeout; |
@@ -1596,6 +1800,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1596 | } | 1800 | } |
1597 | 1801 | ||
1598 | q.pi_state = NULL; | 1802 | q.pi_state = NULL; |
1803 | q.rt_waiter = NULL; | ||
1599 | retry: | 1804 | retry: |
1600 | q.key = FUTEX_KEY_INIT; | 1805 | q.key = FUTEX_KEY_INIT; |
1601 | ret = get_futex_key(uaddr, fshared, &q.key); | 1806 | ret = get_futex_key(uaddr, fshared, &q.key); |
@@ -1701,6 +1906,20 @@ uaddr_faulted: | |||
1701 | goto retry; | 1906 | goto retry; |
1702 | } | 1907 | } |
1703 | 1908 | ||
1909 | static long futex_lock_pi_restart(struct restart_block *restart) | ||
1910 | { | ||
1911 | u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; | ||
1912 | ktime_t t, *tp = NULL; | ||
1913 | int fshared = restart->futex.flags & FLAGS_SHARED; | ||
1914 | |||
1915 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { | ||
1916 | t.tv64 = restart->futex.time; | ||
1917 | tp = &t; | ||
1918 | } | ||
1919 | restart->fn = do_no_restart_syscall; | ||
1920 | |||
1921 | return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0); | ||
1922 | } | ||
1704 | 1923 | ||
1705 | /* | 1924 | /* |
1706 | * Userspace attempted a TID -> 0 atomic transition, and failed. | 1925 | * Userspace attempted a TID -> 0 atomic transition, and failed. |
@@ -1803,6 +2022,253 @@ pi_faulted: | |||
1803 | return ret; | 2022 | return ret; |
1804 | } | 2023 | } |
1805 | 2024 | ||
2025 | /** | ||
2026 | * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex | ||
2027 | * @hb: the hash_bucket futex_q was originally enqueued on | ||
2028 | * @q: the futex_q woken while waiting to be requeued | ||
2029 | * @key2: the futex_key of the requeue target futex | ||
2030 | * @timeout: the timeout associated with the wait (NULL if none) | ||
2031 | * | ||
2032 | * Detect if the task was woken on the initial futex as opposed to the requeue | ||
2033 | * target futex. If so, determine if it was a timeout or a signal that caused | ||
2034 | * the wakeup and return the appropriate error code to the caller. Must be | ||
2035 | * called with the hb lock held. | ||
2036 | * | ||
2037 | * Returns | ||
2038 | * 0 - no early wakeup detected | ||
2039 | * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?) | ||
2040 | */ | ||
2041 | static inline | ||
2042 | int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | ||
2043 | struct futex_q *q, union futex_key *key2, | ||
2044 | struct hrtimer_sleeper *timeout) | ||
2045 | { | ||
2046 | int ret = 0; | ||
2047 | |||
2048 | /* | ||
2049 | * With the hb lock held, we avoid races while we process the wakeup. | ||
2050 | * We only need to hold hb (and not hb2) to ensure atomicity as the | ||
2051 | * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. | ||
2052 | * It can't be requeued from uaddr2 to something else since we don't | ||
2053 | * support a PI aware source futex for requeue. | ||
2054 | */ | ||
2055 | if (!match_futex(&q->key, key2)) { | ||
2056 | WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); | ||
2057 | /* | ||
2058 | * We were woken prior to requeue by a timeout or a signal. | ||
2059 | * Unqueue the futex_q and determine which it was. | ||
2060 | */ | ||
2061 | plist_del(&q->list, &q->list.plist); | ||
2062 | drop_futex_key_refs(&q->key); | ||
2063 | |||
2064 | if (timeout && !timeout->task) | ||
2065 | ret = -ETIMEDOUT; | ||
2066 | else { | ||
2067 | /* | ||
2068 | * We expect signal_pending(current), but another | ||
2069 | * thread may have handled it for us already. | ||
2070 | */ | ||
2071 | /* FIXME: ERESTARTSYS or ERESTARTNOINTR? Do we care if | ||
2072 | * the user specified SA_RESTART or not? */ | ||
2073 | ret = -ERESTARTSYS; | ||
2074 | } | ||
2075 | } | ||
2076 | return ret; | ||
2077 | } | ||
2078 | |||
2079 | /** | ||
2080 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 | ||
2081 | * @uaddr: the futex we initially wait on (non-pi) | ||
2082 | * @fshared: whether the futexes are shared (1) or not (0). They must be | ||
2083 | * the same type, no requeueing from private to shared, etc. | ||
2084 | * @val: the expected value of uaddr | ||
2085 | * @abs_time: absolute timeout | ||
2086 | * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. | ||
2087 | * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) | ||
2088 | * @uaddr2: the pi futex we will take prior to returning to user-space | ||
2089 | * | ||
2090 | * The caller will wait on uaddr and will be requeued by futex_requeue() to | ||
2091 | * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and | ||
2092 | * complete the acquisition of the rt_mutex prior to returning to userspace. | ||
2093 | * This ensures the rt_mutex maintains an owner when it has waiters; without | ||
2094 | * one, the pi logic wouldn't know which task to boost/deboost, if there was a | ||
2095 | * need to. | ||
2096 | * | ||
2097 | * We call schedule in futex_wait_queue_me() when we enqueue and return there | ||
2098 | * via the following: | ||
2099 | * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() | ||
2100 | * 2) wakeup on uaddr2 after a requeue and subsequent unlock | ||
2101 | * 3) signal (before or after requeue) | ||
2102 | * 4) timeout (before or after requeue) | ||
2103 | * | ||
2104 | * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. | ||
2105 | * | ||
2106 | * If 2, we may then block on trying to take the rt_mutex and return via: | ||
2107 | * 5) successful lock | ||
2108 | * 6) signal | ||
2109 | * 7) timeout | ||
2110 | * 8) other lock acquisition failure | ||
2111 | * | ||
2112 | * If 6, we setup a restart_block with futex_lock_pi() as the function. | ||
2113 | * | ||
2114 | * If 4 or 7, we cleanup and return with -ETIMEDOUT. | ||
2115 | * | ||
2116 | * Returns: | ||
2117 | * 0 - On success | ||
2118 | * <0 - On error | ||
2119 | */ | ||
2120 | static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | ||
2121 | u32 val, ktime_t *abs_time, u32 bitset, | ||
2122 | int clockrt, u32 __user *uaddr2) | ||
2123 | { | ||
2124 | struct hrtimer_sleeper timeout, *to = NULL; | ||
2125 | struct rt_mutex_waiter rt_waiter; | ||
2126 | struct rt_mutex *pi_mutex = NULL; | ||
2127 | DECLARE_WAITQUEUE(wait, current); | ||
2128 | struct restart_block *restart; | ||
2129 | struct futex_hash_bucket *hb; | ||
2130 | union futex_key key2; | ||
2131 | struct futex_q q; | ||
2132 | int res, ret; | ||
2133 | u32 uval; | ||
2134 | |||
2135 | if (!bitset) | ||
2136 | return -EINVAL; | ||
2137 | |||
2138 | if (abs_time) { | ||
2139 | to = &timeout; | ||
2140 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : | ||
2141 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
2142 | hrtimer_init_sleeper(to, current); | ||
2143 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | ||
2144 | current->timer_slack_ns); | ||
2145 | } | ||
2146 | |||
2147 | /* | ||
2148 | * The waiter is allocated on our stack, manipulated by the requeue | ||
2149 | * code while we sleep on uaddr. | ||
2150 | */ | ||
2151 | debug_rt_mutex_init_waiter(&rt_waiter); | ||
2152 | rt_waiter.task = NULL; | ||
2153 | |||
2154 | q.pi_state = NULL; | ||
2155 | q.bitset = bitset; | ||
2156 | q.rt_waiter = &rt_waiter; | ||
2157 | |||
2158 | key2 = FUTEX_KEY_INIT; | ||
2159 | ret = get_futex_key(uaddr2, fshared, &key2); | ||
2160 | if (unlikely(ret != 0)) | ||
2161 | goto out; | ||
2162 | |||
2163 | /* Prepare to wait on uaddr. */ | ||
2164 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | ||
2165 | if (ret) { | ||
2166 | put_futex_key(fshared, &key2); | ||
2167 | goto out; | ||
2168 | } | ||
2169 | |||
2170 | /* Queue the futex_q, drop the hb lock, wait for wakeup. */ | ||
2171 | futex_wait_queue_me(hb, &q, to, &wait); | ||
2172 | |||
2173 | spin_lock(&hb->lock); | ||
2174 | ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); | ||
2175 | spin_unlock(&hb->lock); | ||
2176 | if (ret) | ||
2177 | goto out_put_keys; | ||
2178 | |||
2179 | /* | ||
2180 | * In order for us to be here, we know our q.key == key2, and since | ||
2181 | * we took the hb->lock above, we also know that futex_requeue() has | ||
2182 | * completed and we no longer have to concern ourselves with a wakeup | ||
2183 | * race with the atomic proxy lock acquisition by the requeue code. | ||
2184 | */ | ||
2185 | |||
2186 | /* Check if the requeue code acquired the second futex for us. */ | ||
2187 | if (!q.rt_waiter) { | ||
2188 | /* | ||
2189 | * Got the lock. We might not be the anticipated owner if we | ||
2190 | * did a lock-steal - fix up the PI-state in that case. | ||
2191 | */ | ||
2192 | if (q.pi_state && (q.pi_state->owner != current)) { | ||
2193 | spin_lock(q.lock_ptr); | ||
2194 | ret = fixup_pi_state_owner(uaddr2, &q, current, | ||
2195 | fshared); | ||
2196 | spin_unlock(q.lock_ptr); | ||
2197 | } | ||
2198 | } else { | ||
2199 | /* | ||
2200 | * We have been woken up by futex_unlock_pi(), a timeout, or a | ||
2201 | * signal. futex_unlock_pi() will not destroy the lock_ptr nor | ||
2202 | * the pi_state. | ||
2203 | */ | ||
2204 | WARN_ON(!&q.pi_state); | ||
2205 | pi_mutex = &q.pi_state->pi_mutex; | ||
2206 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); | ||
2207 | debug_rt_mutex_free_waiter(&rt_waiter); | ||
2208 | |||
2209 | spin_lock(q.lock_ptr); | ||
2210 | /* | ||
2211 | * Fixup the pi_state owner and possibly acquire the lock if we | ||
2212 | * haven't already. | ||
2213 | */ | ||
2214 | res = fixup_owner(uaddr2, fshared, &q, !ret); | ||
2215 | /* | ||
2216 | * If fixup_owner() returned an error, propagate that. If it | ||
2217 | * acquired the lock, clear our -ETIMEDOUT or -EINTR. | ||
2218 | */ | ||
2219 | if (res) | ||
2220 | ret = (res < 0) ? res : 0; | ||
2221 | |||
2222 | /* Unqueue and drop the lock. */ | ||
2223 | unqueue_me_pi(&q); | ||
2224 | } | ||
2225 | |||
2226 | /* | ||
2227 | * If fixup_pi_state_owner() faulted and was unable to handle the | ||
2228 | * fault, unlock the rt_mutex and return the fault to userspace. | ||
2229 | */ | ||
2230 | if (ret == -EFAULT) { | ||
2231 | if (rt_mutex_owner(pi_mutex) == current) | ||
2232 | rt_mutex_unlock(pi_mutex); | ||
2233 | } else if (ret == -EINTR) { | ||
2234 | ret = -EFAULT; | ||
2235 | if (get_user(uval, uaddr2)) | ||
2236 | goto out_put_keys; | ||
2237 | |||
2238 | /* | ||
2239 | * We've already been requeued, so restart by calling | ||
2240 | * futex_lock_pi() directly, rather than returning to this | ||
2241 | * function. | ||
2242 | */ | ||
2243 | ret = -ERESTART_RESTARTBLOCK; | ||
2244 | restart = &current_thread_info()->restart_block; | ||
2245 | restart->fn = futex_lock_pi_restart; | ||
2246 | restart->futex.uaddr = (u32 *)uaddr2; | ||
2247 | restart->futex.val = uval; | ||
2248 | restart->futex.flags = 0; | ||
2249 | if (abs_time) { | ||
2250 | restart->futex.flags |= FLAGS_HAS_TIMEOUT; | ||
2251 | restart->futex.time = abs_time->tv64; | ||
2252 | } | ||
2253 | |||
2254 | if (fshared) | ||
2255 | restart->futex.flags |= FLAGS_SHARED; | ||
2256 | if (clockrt) | ||
2257 | restart->futex.flags |= FLAGS_CLOCKRT; | ||
2258 | } | ||
2259 | |||
2260 | out_put_keys: | ||
2261 | put_futex_key(fshared, &q.key); | ||
2262 | put_futex_key(fshared, &key2); | ||
2263 | |||
2264 | out: | ||
2265 | if (to) { | ||
2266 | hrtimer_cancel(&to->timer); | ||
2267 | destroy_hrtimer_on_stack(&to->timer); | ||
2268 | } | ||
2269 | return ret; | ||
2270 | } | ||
2271 | |||
1806 | /* | 2272 | /* |
1807 | * Support for robust futexes: the kernel cleans up held futexes at | 2273 | * Support for robust futexes: the kernel cleans up held futexes at |
1808 | * thread exit time. | 2274 | * thread exit time. |
@@ -2025,7 +2491,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
2025 | fshared = 1; | 2491 | fshared = 1; |
2026 | 2492 | ||
2027 | clockrt = op & FUTEX_CLOCK_REALTIME; | 2493 | clockrt = op & FUTEX_CLOCK_REALTIME; |
2028 | if (clockrt && cmd != FUTEX_WAIT_BITSET) | 2494 | if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) |
2029 | return -ENOSYS; | 2495 | return -ENOSYS; |
2030 | 2496 | ||
2031 | switch (cmd) { | 2497 | switch (cmd) { |
@@ -2040,10 +2506,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
2040 | ret = futex_wake(uaddr, fshared, val, val3); | 2506 | ret = futex_wake(uaddr, fshared, val, val3); |
2041 | break; | 2507 | break; |
2042 | case FUTEX_REQUEUE: | 2508 | case FUTEX_REQUEUE: |
2043 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); | 2509 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); |
2044 | break; | 2510 | break; |
2045 | case FUTEX_CMP_REQUEUE: | 2511 | case FUTEX_CMP_REQUEUE: |
2046 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); | 2512 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, |
2513 | 0); | ||
2047 | break; | 2514 | break; |
2048 | case FUTEX_WAKE_OP: | 2515 | case FUTEX_WAKE_OP: |
2049 | ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); | 2516 | ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); |
@@ -2060,6 +2527,18 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
2060 | if (futex_cmpxchg_enabled) | 2527 | if (futex_cmpxchg_enabled) |
2061 | ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); | 2528 | ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); |
2062 | break; | 2529 | break; |
2530 | case FUTEX_WAIT_REQUEUE_PI: | ||
2531 | val3 = FUTEX_BITSET_MATCH_ANY; | ||
2532 | ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, | ||
2533 | clockrt, uaddr2); | ||
2534 | break; | ||
2535 | case FUTEX_REQUEUE_PI: | ||
2536 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1); | ||
2537 | break; | ||
2538 | case FUTEX_CMP_REQUEUE_PI: | ||
2539 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, | ||
2540 | 1); | ||
2541 | break; | ||
2063 | default: | 2542 | default: |
2064 | ret = -ENOSYS; | 2543 | ret = -ENOSYS; |
2065 | } | 2544 | } |
@@ -2077,7 +2556,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, | |||
2077 | int cmd = op & FUTEX_CMD_MASK; | 2556 | int cmd = op & FUTEX_CMD_MASK; |
2078 | 2557 | ||
2079 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || | 2558 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || |
2080 | cmd == FUTEX_WAIT_BITSET)) { | 2559 | cmd == FUTEX_WAIT_BITSET || |
2560 | cmd == FUTEX_WAIT_REQUEUE_PI)) { | ||
2081 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) | 2561 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) |
2082 | return -EFAULT; | 2562 | return -EFAULT; |
2083 | if (!timespec_valid(&ts)) | 2563 | if (!timespec_valid(&ts)) |
@@ -2089,10 +2569,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, | |||
2089 | tp = &t; | 2569 | tp = &t; |
2090 | } | 2570 | } |
2091 | /* | 2571 | /* |
2092 | * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. | 2572 | * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. |
2093 | * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. | 2573 | * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. |
2094 | */ | 2574 | */ |
2095 | if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || | 2575 | if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || |
2576 | cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI || | ||
2096 | cmd == FUTEX_WAKE_OP) | 2577 | cmd == FUTEX_WAKE_OP) |
2097 | val2 = (u32) (unsigned long) utime; | 2578 | val2 = (u32) (unsigned long) utime; |
2098 | 2579 | ||