author		Peter Zijlstra <peterz@infradead.org>	2017-03-22 06:35:52 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2017-03-23 14:10:07 -0400
commit		734009e96d1983ad739e5b656e03430b3660c913
tree		6260eb33e594ffbf4cbed8b050eafd4b9c9f2c05	/kernel/futex.c
parent		5293c2efda37775346885c7e924d4ef7018ea60b
futex: Change locking rules
Currently futex-pi relies on hb->lock to serialize everything. But hb->lock
creates another set of problems, especially priority inversions on RT where
hb->lock becomes an rt_mutex itself.
The rt_mutex::wait_lock is the most obvious protection for keeping the
futex user space value and the kernel internal pi_state in sync.
Rework and document the locking so rt_mutex::wait_lock is held across all
operations which modify the user space value and the pi state.
As a next step, this allows rt_mutex_unlock() (including deboost) to be
invoked without holding hb->lock.
Nothing yet relies on the new locking rules.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.751993333@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel/futex.c')
-rw-r--r--	kernel/futex.c	165
1 file changed, 132 insertions(+), 33 deletions(-)
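Editor's note: the comment block added by this patch (first hunk below) pins down the nesting the new rules rely on: hb->lock is outermost, pi_mutex->wait_lock nests under it, and p->pi_lock is innermost. As a rough userspace illustration of that discipline only (plain pthread mutexes standing in for the kernel's spinlock_t/raw_spinlock_t; every name here is a stand-in, not kernel API), a sketch might look like:

/* Illustrative only: userspace analogue of the lock nesting documented
 * in the patch below. pthread_mutex_t stands in for the kernel locks;
 * the real hb->lock is a spinlock_t, the other two are raw spinlocks. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hb_lock = PTHREAD_MUTEX_INITIALIZER;            /* hb->lock */
static pthread_mutex_t pi_mutex_wait_lock = PTHREAD_MUTEX_INITIALIZER; /* pi_mutex->wait_lock */
static pthread_mutex_t task_pi_lock = PTHREAD_MUTEX_INITIALIZER;       /* p->pi_lock */

static unsigned int uval;       /* stands in for the user space futex word */
static void *pi_state_owner;    /* stands in for pi_state->owner */

/* Every path that modifies {uval, pi_state} takes the locks in the same
 * order: hb->lock, then wait_lock, then pi_lock. A single consistent
 * ordering is what rules out ABBA deadlocks between these paths. */
static void fixup_owner_sketch(void *newowner, unsigned int newval)
{
	pthread_mutex_lock(&hb_lock);
	pthread_mutex_lock(&pi_mutex_wait_lock);

	uval = newval;                /* user space value ...            */
	pthread_mutex_lock(&task_pi_lock);
	pi_state_owner = newowner;    /* ... and kernel pi state in sync */
	pthread_mutex_unlock(&task_pi_lock);

	pthread_mutex_unlock(&pi_mutex_wait_lock);
	pthread_mutex_unlock(&hb_lock);
}

int main(void)
{
	int owner;

	fixup_owner_sketch(&owner, 42);
	printf("uval=%u owner=%p\n", uval, pi_state_owner);
	return 0;
}

Because wait_lock alone already serializes the {uval, pi_state} pair, a later patch can drop hb->lock around rt_mutex_unlock() without reopening a race.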
diff --git a/kernel/futex.c b/kernel/futex.c
index af022919933a..3e71d66cb788 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -973,6 +973,39 @@ void exit_pi_state_list(struct task_struct *curr)
  *
  * [10] There is no transient state which leaves owner and user space
  * TID out of sync.
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ *	hb -> futex_q, relation
+ *	futex_q -> pi_state, relation
+ *
+ *	(cannot be raw because hb can contain arbitrary amount
+ *	 of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ *	{uval, pi_state}
+ *
+ *	(and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ *	p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ *	pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ *   hb->lock
+ *     pi_mutex->wait_lock
+ *       p->pi_lock
+ *
  */

 /*
@@ -980,10 +1013,12 @@ void exit_pi_state_list(struct task_struct *curr)
  * the pi_state against the user space value. If correct, attach to
  * it.
  */
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+			      struct futex_pi_state *pi_state,
 			      struct futex_pi_state **ps)
 {
 	pid_t pid = uval & FUTEX_TID_MASK;
+	int ret, uval2;

 	/*
 	 * Userspace might have messed up non-PI and PI futexes [3]
@@ -991,9 +1026,34 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
 	if (unlikely(!pi_state))
 		return -EINVAL;

+	/*
+	 * We get here with hb->lock held, and having found a
+	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+	 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+	 * which in turn means that futex_lock_pi() still has a reference on
+	 * our pi_state.
+	 */
 	WARN_ON(!atomic_read(&pi_state->refcount));

 	/*
+	 * Now that we have a pi_state, we can acquire wait_lock
+	 * and do the state validation.
+	 */
+	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+	/*
+	 * Since {uval, pi_state} is serialized by wait_lock, and our current
+	 * uval was read without holding it, it can have changed. Verify it
+	 * still is what we expect it to be, otherwise retry the entire
+	 * operation.
+	 */
+	if (get_futex_value_locked(&uval2, uaddr))
+		goto out_efault;
+
+	if (uval != uval2)
+		goto out_eagain;
+
+	/*
 	 * Handle the owner died case:
 	 */
 	if (uval & FUTEX_OWNER_DIED) {
@@ -1008,11 +1068,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
 			 * is not 0. Inconsistent state. [5]
 			 */
 			if (pid)
-				return -EINVAL;
+				goto out_einval;
 			/*
 			 * Take a ref on the state and return success. [4]
 			 */
-			goto out_state;
+			goto out_attach;
 		}

 		/*
@@ -1024,14 +1084,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
 		 * Take a ref on the state and return success. [6]
 		 */
 		if (!pid)
-			goto out_state;
+			goto out_attach;
 	} else {
 		/*
 		 * If the owner died bit is not set, then the pi_state
 		 * must have an owner. [7]
 		 */
 		if (!pi_state->owner)
-			return -EINVAL;
+			goto out_einval;
 	}

 	/*
@@ -1040,11 +1100,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
 	 * user space TID. [9/10]
 	 */
 	if (pid != task_pid_vnr(pi_state->owner))
-		return -EINVAL;
-out_state:
+		goto out_einval;
+
+out_attach:
 	atomic_inc(&pi_state->refcount);
+	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 	*ps = pi_state;
 	return 0;
+
+out_einval:
+	ret = -EINVAL;
+	goto out_error;
+
+out_eagain:
+	ret = -EAGAIN;
+	goto out_error;
+
+out_efault:
+	ret = -EFAULT;
+	goto out_error;
+
+out_error:
+	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+	return ret;
 }

 /*
@@ -1095,6 +1173,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,

 	/*
 	 * No existing pi state. First waiter. [2]
+	 *
+	 * This creates pi_state, we have hb->lock held, this means nothing can
+	 * observe this state, wait_lock is irrelevant.
 	 */
 	pi_state = alloc_pi_state();

@@ -1119,7 +1200,8 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 	return 0;
 }

-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+			   struct futex_hash_bucket *hb,
 			   union futex_key *key, struct futex_pi_state **ps)
 {
 	struct futex_q *top_waiter = futex_top_waiter(hb, key);
@@ -1129,7 +1211,7 @@ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 	 * attach to the pi_state when the validation succeeds.
 	 */
 	if (top_waiter)
-		return attach_to_pi_state(uval, top_waiter->pi_state, ps);
+		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

 	/*
 	 * We are the first waiter - try to look up the owner based on
@@ -1148,7 +1230,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
 	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
 		return -EFAULT;

-	/*If user space value changed, let the caller retry */
+	/* If user space value changed, let the caller retry */
 	return curval != uval ? -EAGAIN : 0;
 }

@@ -1204,7 +1286,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 	 */
 	top_waiter = futex_top_waiter(hb, key);
 	if (top_waiter)
-		return attach_to_pi_state(uval, top_waiter->pi_state, ps);
+		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

 	/*
 	 * No waiter and user TID is 0. We are here because the
@@ -1336,6 +1418,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter

 	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
 		ret = -EFAULT;
+
 	} else if (curval != uval) {
 		/*
 		 * If a unconditional UNLOCK_PI operation (user space did not
@@ -1348,6 +1431,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
 		else
 			ret = -EINVAL;
 	}
+
 	if (ret) {
 		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 		return ret;
@@ -1823,7 +1907,7 @@ retry_private:
 			 * If that call succeeds then we have pi_state and an
 			 * initial refcount on it.
 			 */
-			ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+			ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
 		}

 		switch (ret) {
@@ -2122,10 +2206,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 {
 	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
 	struct futex_pi_state *pi_state = q->pi_state;
-	struct task_struct *oldowner = pi_state->owner;
 	u32 uval, uninitialized_var(curval), newval;
+	struct task_struct *oldowner;
 	int ret;

+	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+	oldowner = pi_state->owner;
 	/* Owner died? */
 	if (!pi_state->owner)
 		newtid |= FUTEX_OWNER_DIED;
@@ -2141,11 +2228,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 	 * because we can fault here. Imagine swapped out pages or a fork
 	 * that marked all the anonymous memory readonly for cow.
 	 *
-	 * Modifying pi_state _before_ the user space value would
-	 * leave the pi_state in an inconsistent state when we fault
-	 * here, because we need to drop the hash bucket lock to
-	 * handle the fault. This might be observed in the PID check
-	 * in lookup_pi_state.
+	 * Modifying pi_state _before_ the user space value would leave the
+	 * pi_state in an inconsistent state when we fault here, because we
+	 * need to drop the locks to handle the fault. This might be observed
+	 * in the PID check in lookup_pi_state.
 	 */
 retry:
 	if (get_futex_value_locked(&uval, uaddr))
@@ -2166,47 +2252,60 @@ retry:
 	 * itself.
 	 */
 	if (pi_state->owner != NULL) {
-		raw_spin_lock_irq(&pi_state->owner->pi_lock);
+		raw_spin_lock(&pi_state->owner->pi_lock);
 		WARN_ON(list_empty(&pi_state->list));
 		list_del_init(&pi_state->list);
-		raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+		raw_spin_unlock(&pi_state->owner->pi_lock);
 	}

 	pi_state->owner = newowner;

-	raw_spin_lock_irq(&newowner->pi_lock);
+	raw_spin_lock(&newowner->pi_lock);
 	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &newowner->pi_state_list);
-	raw_spin_unlock_irq(&newowner->pi_lock);
+	raw_spin_unlock(&newowner->pi_lock);
+	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
 	return 0;

 	/*
-	 * To handle the page fault we need to drop the hash bucket
-	 * lock here. That gives the other task (either the highest priority
-	 * waiter itself or the task which stole the rtmutex) the
-	 * chance to try the fixup of the pi_state. So once we are
-	 * back from handling the fault we need to check the pi_state
-	 * after reacquiring the hash bucket lock and before trying to
-	 * do another fixup. When the fixup has been done already we
-	 * simply return.
+	 * To handle the page fault we need to drop the locks here. That gives
+	 * the other task (either the highest priority waiter itself or the
+	 * task which stole the rtmutex) the chance to try the fixup of the
+	 * pi_state. So once we are back from handling the fault we need to
+	 * check the pi_state after reacquiring the locks and before trying to
+	 * do another fixup. When the fixup has been done already we simply
+	 * return.
+	 *
+	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+	 * drop hb->lock since the caller owns the hb -> futex_q relation.
+	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
 	 */
 handle_fault:
+	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 	spin_unlock(q->lock_ptr);

 	ret = fault_in_user_writeable(uaddr);

 	spin_lock(q->lock_ptr);
+	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

 	/*
 	 * Check if someone else fixed it for us:
 	 */
-	if (pi_state->owner != oldowner)
-		return 0;
+	if (pi_state->owner != oldowner) {
+		ret = 0;
+		goto out_unlock;
+	}

 	if (ret)
-		return ret;
+		goto out_unlock;

 	goto retry;
+
+out_unlock:
+	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+	return ret;
 }

 static long futex_wait_restart(struct restart_block *restart);
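Editor's note: the validation dance attach_to_pi_state() gains above — sample the user space value without wait_lock, take wait_lock, re-read, and bounce with -EAGAIN if the value moved — reduces to the following userspace sketch. It is illustrative only, assuming pthread and C11 atomics as stand-ins for the kernel primitives; attach_sketch() and the variable names are invented for the example, not kernel API:

/* Illustrative only: the validate-after-locking pattern from
 * attach_to_pi_state(), transplanted to userspace. */
#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_uint futex_word;   /* stands in for the user space value   */
static unsigned int pi_state;    /* with futex_word, serialized by wait_lock */

/* 'uval' was sampled before wait_lock was taken, so it may be stale.
 * Re-read under the lock; on mismatch, back out with -EAGAIN so the
 * caller restarts the whole operation with a fresh value. */
static int attach_sketch(unsigned int uval)
{
	pthread_mutex_lock(&wait_lock);

	unsigned int uval2 = atomic_load(&futex_word);
	if (uval != uval2) {
		pthread_mutex_unlock(&wait_lock);
		return -EAGAIN;
	}

	pi_state = uval;	/* safe: {uval, pi_state} stay consistent */
	pthread_mutex_unlock(&wait_lock);
	return 0;
}

int main(void)
{
	int ret;

	atomic_store(&futex_word, 1);
	do {
		unsigned int uval = atomic_load(&futex_word); /* unlocked read */
		ret = attach_sketch(uval);
	} while (ret == -EAGAIN);

	printf("attached, pi_state=%u ret=%d\n", pi_state, ret);
	return 0;
}

fixup_pi_state_owner() follows the same shape around page faults: drop wait_lock (and q->lock_ptr) to fault the page in, retake both, then revalidate pi_state->owner before retrying, since another task may have completed the fixup in the window.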