diff options
| author | Manfred Spraul <manfred@colorfullife.com> | 2016-10-11 16:54:50 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-11 18:06:33 -0400 |
| commit | 5864a2fd3088db73d47942370d0f7210a807b9bc (patch) | |
| tree | f985a10bc1459348f13e77820ca8a3b60296d2b5 /ipc | |
| parent | 65deb8af76defeae4b114a75242ed15b0bcba173 (diff) | |
ipc/sem.c: fix complex_count vs. simple op race
Commit 6d07b68ce16a ("ipc/sem.c: optimize sem_lock()") introduced a
race:
sem_lock has a fast path that allows parallel simple operations.
There are two reasons why a simple operation cannot run in parallel:
- a non-simple operations is ongoing (sma->sem_perm.lock held)
- a complex operation is sleeping (sma->complex_count != 0)
As both facts are stored independently, a thread can bypass the current
checks by sleeping in the right positions. See below for more details
(or kernel bugzilla 105651).
The patch fixes that by creating one variable (complex_mode)
that tracks both reasons why parallel operations are not possible.
The patch also updates stale documentation regarding the locking.
With regards to stable kernels:
The patch is required for all kernels that include the
commit 6d07b68ce16a ("ipc/sem.c: optimize sem_lock()") (3.10?)
The alternative is to revert the patch that introduced the race.
The patch is safe for backporting, i.e. it makes no assumptions
about memory barriers in spin_unlock_wait().
Background:
Here is the race of the current implementation:
Thread A: (simple op)
- does the first "sma->complex_count == 0" test
Thread B: (complex op)
- does sem_lock(): This includes an array scan. But the scan can't
find Thread A, because Thread A does not own sem->lock yet.
- the thread does the operation, increases complex_count,
drops sem_lock, sleeps
Thread A:
- spin_lock(&sem->lock), spin_is_locked(sma->sem_perm.lock)
- sleeps before the complex_count test
Thread C: (complex op)
- does sem_lock (no array scan, complex_count==1)
- wakes up Thread B.
- decrements complex_count
Thread A:
- does the complex_count test
Bug:
Now both thread A and thread C operate on the same array, without
any synchronization.
Fixes: 6d07b68ce16a ("ipc/sem.c: optimize sem_lock()")
Link: http://lkml.kernel.org/r/1469123695-5661-1-git-send-email-manfred@colorfullife.com
Reported-by: <felixh@informatik.uni-bremen.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: <1vier1@web.de>
Cc: <stable@vger.kernel.org> [3.10+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'ipc')
| -rw-r--r-- | ipc/sem.c | 138 |
1 files changed, 83 insertions, 55 deletions
| @@ -162,14 +162,21 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); | |||
| 162 | 162 | ||
| 163 | /* | 163 | /* |
| 164 | * Locking: | 164 | * Locking: |
| 165 | * a) global sem_lock() for read/write | ||
| 165 | * sem_undo.id_next, | 166 | * sem_undo.id_next, |
| 166 | * sem_array.complex_count, | 167 | * sem_array.complex_count, |
| 167 | * sem_array.pending{_alter,_cont}, | 168 | * sem_array.complex_mode |
| 168 | * sem_array.sem_undo: global sem_lock() for read/write | 169 | * sem_array.pending{_alter,_const}, |
| 169 | * sem_undo.proc_next: only "current" is allowed to read/write that field. | 170 | * sem_array.sem_undo |
| 170 | * | 171 | * |
| 172 | * b) global or semaphore sem_lock() for read/write: | ||
| 171 | * sem_array.sem_base[i].pending_{const,alter}: | 173 | * sem_array.sem_base[i].pending_{const,alter}: |
| 172 | * global or semaphore sem_lock() for read/write | 174 | * sem_array.complex_mode (for read) |
| 175 | * | ||
| 176 | * c) special: | ||
| 177 | * sem_undo_list.list_proc: | ||
| 178 | * * undo_list->lock for write | ||
| 179 | * * rcu for read | ||
| 173 | */ | 180 | */ |
| 174 | 181 | ||
| 175 | #define sc_semmsl sem_ctls[0] | 182 | #define sc_semmsl sem_ctls[0] |
| @@ -260,30 +267,61 @@ static void sem_rcu_free(struct rcu_head *head) | |||
| 260 | } | 267 | } |
| 261 | 268 | ||
| 262 | /* | 269 | /* |
| 263 | * Wait until all currently ongoing simple ops have completed. | 270 | * Enter the mode suitable for non-simple operations: |
| 264 | * Caller must own sem_perm.lock. | 271 | * Caller must own sem_perm.lock. |
| 265 | * New simple ops cannot start, because simple ops first check | ||
| 266 | * that sem_perm.lock is free. | ||
| 267 | * that a) sem_perm.lock is free and b) complex_count is 0. | ||
| 268 | */ | 272 | */ |
| 269 | static void sem_wait_array(struct sem_array *sma) | 273 | static void complexmode_enter(struct sem_array *sma) |
| 270 | { | 274 | { |
| 271 | int i; | 275 | int i; |
| 272 | struct sem *sem; | 276 | struct sem *sem; |
| 273 | 277 | ||
| 274 | if (sma->complex_count) { | 278 | if (sma->complex_mode) { |
| 275 | /* The thread that increased sma->complex_count waited on | 279 | /* We are already in complex_mode. Nothing to do */ |
| 276 | * all sem->lock locks. Thus we don't need to wait again. | ||
| 277 | */ | ||
| 278 | return; | 280 | return; |
| 279 | } | 281 | } |
| 280 | 282 | ||
| 283 | /* We need a full barrier after seting complex_mode: | ||
| 284 | * The write to complex_mode must be visible | ||
| 285 | * before we read the first sem->lock spinlock state. | ||
| 286 | */ | ||
| 287 | smp_store_mb(sma->complex_mode, true); | ||
| 288 | |||
| 281 | for (i = 0; i < sma->sem_nsems; i++) { | 289 | for (i = 0; i < sma->sem_nsems; i++) { |
| 282 | sem = sma->sem_base + i; | 290 | sem = sma->sem_base + i; |
| 283 | spin_unlock_wait(&sem->lock); | 291 | spin_unlock_wait(&sem->lock); |
| 284 | } | 292 | } |
| 293 | /* | ||
| 294 | * spin_unlock_wait() is not a memory barriers, it is only a | ||
| 295 | * control barrier. The code must pair with spin_unlock(&sem->lock), | ||
| 296 | * thus just the control barrier is insufficient. | ||
| 297 | * | ||
| 298 | * smp_rmb() is sufficient, as writes cannot pass the control barrier. | ||
| 299 | */ | ||
| 300 | smp_rmb(); | ||
| 301 | } | ||
| 302 | |||
| 303 | /* | ||
| 304 | * Try to leave the mode that disallows simple operations: | ||
| 305 | * Caller must own sem_perm.lock. | ||
| 306 | */ | ||
| 307 | static void complexmode_tryleave(struct sem_array *sma) | ||
| 308 | { | ||
| 309 | if (sma->complex_count) { | ||
| 310 | /* Complex ops are sleeping. | ||
| 311 | * We must stay in complex mode | ||
| 312 | */ | ||
| 313 | return; | ||
| 314 | } | ||
| 315 | /* | ||
| 316 | * Immediately after setting complex_mode to false, | ||
| 317 | * a simple op can start. Thus: all memory writes | ||
| 318 | * performed by the current operation must be visible | ||
| 319 | * before we set complex_mode to false. | ||
| 320 | */ | ||
| 321 | smp_store_release(&sma->complex_mode, false); | ||
| 285 | } | 322 | } |
| 286 | 323 | ||
| 324 | #define SEM_GLOBAL_LOCK (-1) | ||
| 287 | /* | 325 | /* |
| 288 | * If the request contains only one semaphore operation, and there are | 326 | * If the request contains only one semaphore operation, and there are |
| 289 | * no complex transactions pending, lock only the semaphore involved. | 327 | * no complex transactions pending, lock only the semaphore involved. |
| @@ -300,56 +338,42 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, | |||
| 300 | /* Complex operation - acquire a full lock */ | 338 | /* Complex operation - acquire a full lock */ |
| 301 | ipc_lock_object(&sma->sem_perm); | 339 | ipc_lock_object(&sma->sem_perm); |
| 302 | 340 | ||
| 303 | /* And wait until all simple ops that are processed | 341 | /* Prevent parallel simple ops */ |
| 304 | * right now have dropped their locks. | 342 | complexmode_enter(sma); |
| 305 | */ | 343 | return SEM_GLOBAL_LOCK; |
| 306 | sem_wait_array(sma); | ||
| 307 | return -1; | ||
| 308 | } | 344 | } |
| 309 | 345 | ||
| 310 | /* | 346 | /* |
| 311 | * Only one semaphore affected - try to optimize locking. | 347 | * Only one semaphore affected - try to optimize locking. |
| 312 | * The rules are: | 348 | * Optimized locking is possible if no complex operation |
| 313 | * - optimized locking is possible if no complex operation | 349 | * is either enqueued or processed right now. |
| 314 | * is either enqueued or processed right now. | 350 | * |
| 315 | * - The test for enqueued complex ops is simple: | 351 | * Both facts are tracked by complex_mode. |
| 316 | * sma->complex_count != 0 | ||
| 317 | * - Testing for complex ops that are processed right now is | ||
| 318 | * a bit more difficult. Complex ops acquire the full lock | ||
| 319 | * and first wait that the running simple ops have completed. | ||
| 320 | * (see above) | ||
| 321 | * Thus: If we own a simple lock and the global lock is free | ||
| 322 | * and complex_count is now 0, then it will stay 0 and | ||
| 323 | * thus just locking sem->lock is sufficient. | ||
| 324 | */ | 352 | */ |
| 325 | sem = sma->sem_base + sops->sem_num; | 353 | sem = sma->sem_base + sops->sem_num; |
| 326 | 354 | ||
| 327 | if (sma->complex_count == 0) { | 355 | /* |
| 356 | * Initial check for complex_mode. Just an optimization, | ||
| 357 | * no locking, no memory barrier. | ||
| 358 | */ | ||
| 359 | if (!sma->complex_mode) { | ||
| 328 | /* | 360 | /* |
| 329 | * It appears that no complex operation is around. | 361 | * It appears that no complex operation is around. |
| 330 | * Acquire the per-semaphore lock. | 362 | * Acquire the per-semaphore lock. |
| 331 | */ | 363 | */ |
| 332 | spin_lock(&sem->lock); | 364 | spin_lock(&sem->lock); |
| 333 | 365 | ||
| 334 | /* Then check that the global lock is free */ | 366 | /* |
| 335 | if (!spin_is_locked(&sma->sem_perm.lock)) { | 367 | * See 51d7d5205d33 |
| 336 | /* | 368 | * ("powerpc: Add smp_mb() to arch_spin_is_locked()"): |
| 337 | * We need a memory barrier with acquire semantics, | 369 | * A full barrier is required: the write of sem->lock |
| 338 | * otherwise we can race with another thread that does: | 370 | * must be visible before the read is executed |
| 339 | * complex_count++; | 371 | */ |
| 340 | * spin_unlock(sem_perm.lock); | 372 | smp_mb(); |
| 341 | */ | ||
| 342 | smp_acquire__after_ctrl_dep(); | ||
| 343 | 373 | ||
| 344 | /* | 374 | if (!smp_load_acquire(&sma->complex_mode)) { |
| 345 | * Now repeat the test of complex_count: | 375 | /* fast path successful! */ |
| 346 | * It can't change anymore until we drop sem->lock. | 376 | return sops->sem_num; |
| 347 | * Thus: if is now 0, then it will stay 0. | ||
| 348 | */ | ||
| 349 | if (sma->complex_count == 0) { | ||
| 350 | /* fast path successful! */ | ||
| 351 | return sops->sem_num; | ||
| 352 | } | ||
| 353 | } | 377 | } |
| 354 | spin_unlock(&sem->lock); | 378 | spin_unlock(&sem->lock); |
| 355 | } | 379 | } |
| @@ -369,15 +393,16 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, | |||
| 369 | /* Not a false alarm, thus complete the sequence for a | 393 | /* Not a false alarm, thus complete the sequence for a |
| 370 | * full lock. | 394 | * full lock. |
| 371 | */ | 395 | */ |
| 372 | sem_wait_array(sma); | 396 | complexmode_enter(sma); |
| 373 | return -1; | 397 | return SEM_GLOBAL_LOCK; |
| 374 | } | 398 | } |
| 375 | } | 399 | } |
| 376 | 400 | ||
| 377 | static inline void sem_unlock(struct sem_array *sma, int locknum) | 401 | static inline void sem_unlock(struct sem_array *sma, int locknum) |
| 378 | { | 402 | { |
| 379 | if (locknum == -1) { | 403 | if (locknum == SEM_GLOBAL_LOCK) { |
| 380 | unmerge_queues(sma); | 404 | unmerge_queues(sma); |
| 405 | complexmode_tryleave(sma); | ||
| 381 | ipc_unlock_object(&sma->sem_perm); | 406 | ipc_unlock_object(&sma->sem_perm); |
| 382 | } else { | 407 | } else { |
| 383 | struct sem *sem = sma->sem_base + locknum; | 408 | struct sem *sem = sma->sem_base + locknum; |
| @@ -529,6 +554,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) | |||
| 529 | } | 554 | } |
| 530 | 555 | ||
| 531 | sma->complex_count = 0; | 556 | sma->complex_count = 0; |
| 557 | sma->complex_mode = true; /* dropped by sem_unlock below */ | ||
| 532 | INIT_LIST_HEAD(&sma->pending_alter); | 558 | INIT_LIST_HEAD(&sma->pending_alter); |
| 533 | INIT_LIST_HEAD(&sma->pending_const); | 559 | INIT_LIST_HEAD(&sma->pending_const); |
| 534 | INIT_LIST_HEAD(&sma->list_id); | 560 | INIT_LIST_HEAD(&sma->list_id); |
| @@ -2184,10 +2210,10 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) | |||
| 2184 | /* | 2210 | /* |
| 2185 | * The proc interface isn't aware of sem_lock(), it calls | 2211 | * The proc interface isn't aware of sem_lock(), it calls |
| 2186 | * ipc_lock_object() directly (in sysvipc_find_ipc). | 2212 | * ipc_lock_object() directly (in sysvipc_find_ipc). |
| 2187 | * In order to stay compatible with sem_lock(), we must wait until | 2213 | * In order to stay compatible with sem_lock(), we must |
| 2188 | * all simple semop() calls have left their critical regions. | 2214 | * enter / leave complex_mode. |
| 2189 | */ | 2215 | */ |
| 2190 | sem_wait_array(sma); | 2216 | complexmode_enter(sma); |
| 2191 | 2217 | ||
| 2192 | sem_otime = get_semotime(sma); | 2218 | sem_otime = get_semotime(sma); |
| 2193 | 2219 | ||
| @@ -2204,6 +2230,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) | |||
| 2204 | sem_otime, | 2230 | sem_otime, |
| 2205 | sma->sem_ctime); | 2231 | sma->sem_ctime); |
| 2206 | 2232 | ||
| 2233 | complexmode_tryleave(sma); | ||
| 2234 | |||
| 2207 | return 0; | 2235 | return 0; |
| 2208 | } | 2236 | } |
| 2209 | #endif | 2237 | #endif |
