aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/futex.c
diff options
context:
space:
mode:
authorPeter Zijlstra <peterz@infradead.org>2017-03-22 06:35:52 -0400
committerThomas Gleixner <tglx@linutronix.de>2017-03-23 14:10:07 -0400
commit734009e96d1983ad739e5b656e03430b3660c913 (patch)
tree6260eb33e594ffbf4cbed8b050eafd4b9c9f2c05 /kernel/futex.c
parent5293c2efda37775346885c7e924d4ef7018ea60b (diff)
futex: Change locking rules
Currently futex-pi relies on hb->lock to serialize everything. But hb->lock creates another set of problems, especially priority inversions on RT where hb->lock becomes a rt_mutex itself. The rt_mutex::wait_lock is the most obvious protection for keeping the futex user space value and the kernel internal pi_state in sync. Rework and document the locking so rt_mutex::wait_lock is held across all operations which modify the user space value and the pi state. This allows invoking rt_mutex_unlock() (including deboost) without holding hb->lock as a next step. Nothing yet relies on the new locking rules. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: juri.lelli@arm.com Cc: bigeasy@linutronix.de Cc: xlpang@redhat.com Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: jdesfossez@efficios.com Cc: dvhart@infradead.org Cc: bristot@redhat.com Link: http://lkml.kernel.org/r/20170322104151.751993333@infradead.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel/futex.c')
-rw-r--r--kernel/futex.c165
1 file changed, 132 insertions, 33 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index af022919933a..3e71d66cb788 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -973,6 +973,39 @@ void exit_pi_state_list(struct task_struct *curr)
973 * 973 *
974 * [10] There is no transient state which leaves owner and user space 974 * [10] There is no transient state which leaves owner and user space
975 * TID out of sync. 975 * TID out of sync.
976 *
977 *
978 * Serialization and lifetime rules:
979 *
980 * hb->lock:
981 *
982 * hb -> futex_q, relation
983 * futex_q -> pi_state, relation
984 *
985 * (cannot be raw because hb can contain arbitrary amount
986 * of futex_q's)
987 *
988 * pi_mutex->wait_lock:
989 *
990 * {uval, pi_state}
991 *
992 * (and pi_mutex 'obviously')
993 *
994 * p->pi_lock:
995 *
996 * p->pi_state_list -> pi_state->list, relation
997 *
998 * pi_state->refcount:
999 *
1000 * pi_state lifetime
1001 *
1002 *
1003 * Lock order:
1004 *
1005 * hb->lock
1006 * pi_mutex->wait_lock
1007 * p->pi_lock
1008 *
976 */ 1009 */
977 1010
978/* 1011/*
@@ -980,10 +1013,12 @@ void exit_pi_state_list(struct task_struct *curr)
980 * the pi_state against the user space value. If correct, attach to 1013 * the pi_state against the user space value. If correct, attach to
981 * it. 1014 * it.
982 */ 1015 */
983static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, 1016static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
1017 struct futex_pi_state *pi_state,
984 struct futex_pi_state **ps) 1018 struct futex_pi_state **ps)
985{ 1019{
986 pid_t pid = uval & FUTEX_TID_MASK; 1020 pid_t pid = uval & FUTEX_TID_MASK;
1021 int ret, uval2;
987 1022
988 /* 1023 /*
989 * Userspace might have messed up non-PI and PI futexes [3] 1024 * Userspace might have messed up non-PI and PI futexes [3]
@@ -991,9 +1026,34 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
991 if (unlikely(!pi_state)) 1026 if (unlikely(!pi_state))
992 return -EINVAL; 1027 return -EINVAL;
993 1028
1029 /*
1030 * We get here with hb->lock held, and having found a
1031 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
1032 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
1033 * which in turn means that futex_lock_pi() still has a reference on
1034 * our pi_state.
1035 */
994 WARN_ON(!atomic_read(&pi_state->refcount)); 1036 WARN_ON(!atomic_read(&pi_state->refcount));
995 1037
996 /* 1038 /*
1039 * Now that we have a pi_state, we can acquire wait_lock
1040 * and do the state validation.
1041 */
1042 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1043
1044 /*
1045 * Since {uval, pi_state} is serialized by wait_lock, and our current
1046 * uval was read without holding it, it can have changed. Verify it
1047 * still is what we expect it to be, otherwise retry the entire
1048 * operation.
1049 */
1050 if (get_futex_value_locked(&uval2, uaddr))
1051 goto out_efault;
1052
1053 if (uval != uval2)
1054 goto out_eagain;
1055
1056 /*
997 * Handle the owner died case: 1057 * Handle the owner died case:
998 */ 1058 */
999 if (uval & FUTEX_OWNER_DIED) { 1059 if (uval & FUTEX_OWNER_DIED) {
@@ -1008,11 +1068,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
1008 * is not 0. Inconsistent state. [5] 1068 * is not 0. Inconsistent state. [5]
1009 */ 1069 */
1010 if (pid) 1070 if (pid)
1011 return -EINVAL; 1071 goto out_einval;
1012 /* 1072 /*
1013 * Take a ref on the state and return success. [4] 1073 * Take a ref on the state and return success. [4]
1014 */ 1074 */
1015 goto out_state; 1075 goto out_attach;
1016 } 1076 }
1017 1077
1018 /* 1078 /*
@@ -1024,14 +1084,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
1024 * Take a ref on the state and return success. [6] 1084 * Take a ref on the state and return success. [6]
1025 */ 1085 */
1026 if (!pid) 1086 if (!pid)
1027 goto out_state; 1087 goto out_attach;
1028 } else { 1088 } else {
1029 /* 1089 /*
1030 * If the owner died bit is not set, then the pi_state 1090 * If the owner died bit is not set, then the pi_state
1031 * must have an owner. [7] 1091 * must have an owner. [7]
1032 */ 1092 */
1033 if (!pi_state->owner) 1093 if (!pi_state->owner)
1034 return -EINVAL; 1094 goto out_einval;
1035 } 1095 }
1036 1096
1037 /* 1097 /*
@@ -1040,11 +1100,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
1040 * user space TID. [9/10] 1100 * user space TID. [9/10]
1041 */ 1101 */
1042 if (pid != task_pid_vnr(pi_state->owner)) 1102 if (pid != task_pid_vnr(pi_state->owner))
1043 return -EINVAL; 1103 goto out_einval;
1044out_state: 1104
1105out_attach:
1045 atomic_inc(&pi_state->refcount); 1106 atomic_inc(&pi_state->refcount);
1107 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1046 *ps = pi_state; 1108 *ps = pi_state;
1047 return 0; 1109 return 0;
1110
1111out_einval:
1112 ret = -EINVAL;
1113 goto out_error;
1114
1115out_eagain:
1116 ret = -EAGAIN;
1117 goto out_error;
1118
1119out_efault:
1120 ret = -EFAULT;
1121 goto out_error;
1122
1123out_error:
1124 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1125 return ret;
1048} 1126}
1049 1127
1050/* 1128/*
@@ -1095,6 +1173,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
1095 1173
1096 /* 1174 /*
1097 * No existing pi state. First waiter. [2] 1175 * No existing pi state. First waiter. [2]
1176 *
1177 * This creates pi_state, we have hb->lock held, this means nothing can
1178 * observe this state, wait_lock is irrelevant.
1098 */ 1179 */
1099 pi_state = alloc_pi_state(); 1180 pi_state = alloc_pi_state();
1100 1181
@@ -1119,7 +1200,8 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
1119 return 0; 1200 return 0;
1120} 1201}
1121 1202
1122static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 1203static int lookup_pi_state(u32 __user *uaddr, u32 uval,
1204 struct futex_hash_bucket *hb,
1123 union futex_key *key, struct futex_pi_state **ps) 1205 union futex_key *key, struct futex_pi_state **ps)
1124{ 1206{
1125 struct futex_q *top_waiter = futex_top_waiter(hb, key); 1207 struct futex_q *top_waiter = futex_top_waiter(hb, key);
@@ -1129,7 +1211,7 @@ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
1129 * attach to the pi_state when the validation succeeds. 1211 * attach to the pi_state when the validation succeeds.
1130 */ 1212 */
1131 if (top_waiter) 1213 if (top_waiter)
1132 return attach_to_pi_state(uval, top_waiter->pi_state, ps); 1214 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1133 1215
1134 /* 1216 /*
1135 * We are the first waiter - try to look up the owner based on 1217 * We are the first waiter - try to look up the owner based on
@@ -1148,7 +1230,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
1148 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 1230 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
1149 return -EFAULT; 1231 return -EFAULT;
1150 1232
1151 /*If user space value changed, let the caller retry */ 1233 /* If user space value changed, let the caller retry */
1152 return curval != uval ? -EAGAIN : 0; 1234 return curval != uval ? -EAGAIN : 0;
1153} 1235}
1154 1236
@@ -1204,7 +1286,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1204 */ 1286 */
1205 top_waiter = futex_top_waiter(hb, key); 1287 top_waiter = futex_top_waiter(hb, key);
1206 if (top_waiter) 1288 if (top_waiter)
1207 return attach_to_pi_state(uval, top_waiter->pi_state, ps); 1289 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1208 1290
1209 /* 1291 /*
1210 * No waiter and user TID is 0. We are here because the 1292 * No waiter and user TID is 0. We are here because the
@@ -1336,6 +1418,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
1336 1418
1337 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { 1419 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
1338 ret = -EFAULT; 1420 ret = -EFAULT;
1421
1339 } else if (curval != uval) { 1422 } else if (curval != uval) {
1340 /* 1423 /*
1341 * If a unconditional UNLOCK_PI operation (user space did not 1424 * If a unconditional UNLOCK_PI operation (user space did not
@@ -1348,6 +1431,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
1348 else 1431 else
1349 ret = -EINVAL; 1432 ret = -EINVAL;
1350 } 1433 }
1434
1351 if (ret) { 1435 if (ret) {
1352 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1436 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1353 return ret; 1437 return ret;
@@ -1823,7 +1907,7 @@ retry_private:
1823 * If that call succeeds then we have pi_state and an 1907 * If that call succeeds then we have pi_state and an
1824 * initial refcount on it. 1908 * initial refcount on it.
1825 */ 1909 */
1826 ret = lookup_pi_state(ret, hb2, &key2, &pi_state); 1910 ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
1827 } 1911 }
1828 1912
1829 switch (ret) { 1913 switch (ret) {
@@ -2122,10 +2206,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2122{ 2206{
2123 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 2207 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2124 struct futex_pi_state *pi_state = q->pi_state; 2208 struct futex_pi_state *pi_state = q->pi_state;
2125 struct task_struct *oldowner = pi_state->owner;
2126 u32 uval, uninitialized_var(curval), newval; 2209 u32 uval, uninitialized_var(curval), newval;
2210 struct task_struct *oldowner;
2127 int ret; 2211 int ret;
2128 2212
2213 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2214
2215 oldowner = pi_state->owner;
2129 /* Owner died? */ 2216 /* Owner died? */
2130 if (!pi_state->owner) 2217 if (!pi_state->owner)
2131 newtid |= FUTEX_OWNER_DIED; 2218 newtid |= FUTEX_OWNER_DIED;
@@ -2141,11 +2228,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2141 * because we can fault here. Imagine swapped out pages or a fork 2228 * because we can fault here. Imagine swapped out pages or a fork
2142 * that marked all the anonymous memory readonly for cow. 2229 * that marked all the anonymous memory readonly for cow.
2143 * 2230 *
2144 * Modifying pi_state _before_ the user space value would 2231 * Modifying pi_state _before_ the user space value would leave the
2145 * leave the pi_state in an inconsistent state when we fault 2232 * pi_state in an inconsistent state when we fault here, because we
2146 * here, because we need to drop the hash bucket lock to 2233 * need to drop the locks to handle the fault. This might be observed
2147 * handle the fault. This might be observed in the PID check 2234 * in the PID check in lookup_pi_state.
2148 * in lookup_pi_state.
2149 */ 2235 */
2150retry: 2236retry:
2151 if (get_futex_value_locked(&uval, uaddr)) 2237 if (get_futex_value_locked(&uval, uaddr))
@@ -2166,47 +2252,60 @@ retry:
2166 * itself. 2252 * itself.
2167 */ 2253 */
2168 if (pi_state->owner != NULL) { 2254 if (pi_state->owner != NULL) {
2169 raw_spin_lock_irq(&pi_state->owner->pi_lock); 2255 raw_spin_lock(&pi_state->owner->pi_lock);
2170 WARN_ON(list_empty(&pi_state->list)); 2256 WARN_ON(list_empty(&pi_state->list));
2171 list_del_init(&pi_state->list); 2257 list_del_init(&pi_state->list);
2172 raw_spin_unlock_irq(&pi_state->owner->pi_lock); 2258 raw_spin_unlock(&pi_state->owner->pi_lock);
2173 } 2259 }
2174 2260
2175 pi_state->owner = newowner; 2261 pi_state->owner = newowner;
2176 2262
2177 raw_spin_lock_irq(&newowner->pi_lock); 2263 raw_spin_lock(&newowner->pi_lock);
2178 WARN_ON(!list_empty(&pi_state->list)); 2264 WARN_ON(!list_empty(&pi_state->list));
2179 list_add(&pi_state->list, &newowner->pi_state_list); 2265 list_add(&pi_state->list, &newowner->pi_state_list);
2180 raw_spin_unlock_irq(&newowner->pi_lock); 2266 raw_spin_unlock(&newowner->pi_lock);
2267 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2268
2181 return 0; 2269 return 0;
2182 2270
2183 /* 2271 /*
2184 * To handle the page fault we need to drop the hash bucket 2272 * To handle the page fault we need to drop the locks here. That gives
2185 * lock here. That gives the other task (either the highest priority 2273 * the other task (either the highest priority waiter itself or the
2186 * waiter itself or the task which stole the rtmutex) the 2274 * task which stole the rtmutex) the chance to try the fixup of the
2187 * chance to try the fixup of the pi_state. So once we are 2275 * pi_state. So once we are back from handling the fault we need to
2188 * back from handling the fault we need to check the pi_state 2276 * check the pi_state after reacquiring the locks and before trying to
2189 * after reacquiring the hash bucket lock and before trying to 2277 * do another fixup. When the fixup has been done already we simply
2190 * do another fixup. When the fixup has been done already we 2278 * return.
2191 * simply return. 2279 *
2280 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
2281 * drop hb->lock since the caller owns the hb -> futex_q relation.
2282 * Dropping the pi_mutex->wait_lock requires the state revalidate.
2192 */ 2283 */
2193handle_fault: 2284handle_fault:
2285 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2194 spin_unlock(q->lock_ptr); 2286 spin_unlock(q->lock_ptr);
2195 2287
2196 ret = fault_in_user_writeable(uaddr); 2288 ret = fault_in_user_writeable(uaddr);
2197 2289
2198 spin_lock(q->lock_ptr); 2290 spin_lock(q->lock_ptr);
2291 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2199 2292
2200 /* 2293 /*
2201 * Check if someone else fixed it for us: 2294 * Check if someone else fixed it for us:
2202 */ 2295 */
2203 if (pi_state->owner != oldowner) 2296 if (pi_state->owner != oldowner) {
2204 return 0; 2297 ret = 0;
2298 goto out_unlock;
2299 }
2205 2300
2206 if (ret) 2301 if (ret)
2207 return ret; 2302 goto out_unlock;
2208 2303
2209 goto retry; 2304 goto retry;
2305
2306out_unlock:
2307 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2308 return ret;
2210} 2309}
2211 2310
2212static long futex_wait_restart(struct restart_block *restart); 2311static long futex_wait_restart(struct restart_block *restart);