diff options
| author | Manfred Spraul <manfred@colorfullife.com> | 2010-05-26 17:43:40 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-05-27 12:12:49 -0400 |
| commit | fd5db42254518fbf241dc454e918598fbe494fa2 (patch) | |
| tree | 356c8098f7f706a0e0396476fb5ccb924568eea1 /ipc | |
| parent | 2dcb22b346be7b7b7e630a8970d69cf3f1111ec1 (diff) | |
ipc/sem.c: optimize update_queue() for bulk wakeup calls
The following series of patches tries to fix the spinlock contention
reported by Chris Mason - his benchmark exposes problems of the current
code:
- In the worst case, the algorithm used by update_queue() is O(N^2).
Bulk wake-up calls can enter this worst case. The patch series fix
that.
Note that the benchmark app doesn't expose the problem, it just should
be fixed: Real world apps might do the wake-ups in another order than
perfect FIFO.
- The part of the code that runs within the semaphore array spinlock is
significantly larger than necessary.
The patch series fixes that. This change is responsible for the main
improvement.
- The cacheline with the spinlock is also used for a variable that is
read in the hot path (sem_base) and for a variable that is unnecessarily
written to multiple times (sem_otime). The last step of the series
cacheline-aligns the spinlock.
This patch:
The SysV semaphore code allows to perform multiple operations on all
semaphores in the array as atomic operations. After a modification,
update_queue() checks which of the waiting tasks can complete.
The algorithm that is used to identify the tasks is O(N^2) in the worst
case. For some cases, it is simple to avoid the O(N^2).
The patch adds a detection logic for some cases, especially for the case
of an array where all sleeping tasks are single sembuf operations and a
multi-sembuf operation is used to wake up multiple tasks.
A big database application uses that approach.
The patch fixes wakeup due to semctl(,,SETALL,) - the initial version of
the patch breaks that.
[akpm@linux-foundation.org: make do_smart_update() static]
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'ipc')
| -rw-r--r-- | ipc/sem.c | 110 |
1 files changed, 97 insertions, 13 deletions
| @@ -434,6 +434,69 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q) | |||
| 434 | sma->complex_count--; | 434 | sma->complex_count--; |
| 435 | } | 435 | } |
| 436 | 436 | ||
| 437 | /** check_restart(sma, q) | ||
| 438 | * @sma: semaphore array | ||
| 439 | * @q: the operation that just completed | ||
| 440 | * | ||
| 441 | * update_queue is O(N^2) when it restarts scanning the whole queue of | ||
| 442 | * waiting operations. Therefore this function checks if the restart is | ||
| 443 | * really necessary. It is called after a previously waiting operation | ||
| 444 | * was completed. | ||
| 445 | */ | ||
| 446 | static int check_restart(struct sem_array *sma, struct sem_queue *q) | ||
| 447 | { | ||
| 448 | struct sem *curr; | ||
| 449 | struct sem_queue *h; | ||
| 450 | |||
| 451 | /* if the operation didn't modify the array, then no restart */ | ||
| 452 | if (q->alter == 0) | ||
| 453 | return 0; | ||
| 454 | |||
| 455 | /* pending complex operations are too difficult to analyse */ | ||
| 456 | if (sma->complex_count) | ||
| 457 | return 1; | ||
| 458 | |||
| 459 | /* we were a sleeping complex operation. Too difficult */ | ||
| 460 | if (q->nsops > 1) | ||
| 461 | return 1; | ||
| 462 | |||
| 463 | curr = sma->sem_base + q->sops[0].sem_num; | ||
| 464 | |||
| 465 | /* No-one waits on this queue */ | ||
| 466 | if (list_empty(&curr->sem_pending)) | ||
| 467 | return 0; | ||
| 468 | |||
| 469 | /* the new semaphore value */ | ||
| 470 | if (curr->semval) { | ||
| 471 | /* It is impossible that someone waits for the new value: | ||
| 472 | * - q is a previously sleeping simple operation that | ||
| 473 | * altered the array. It must be a decrement, because | ||
| 474 | * simple increments never sleep. | ||
| 475 | * - The value is not 0, thus wait-for-zero won't proceed. | ||
| 476 | * - If there are older (higher priority) decrements | ||
| 477 | * in the queue, then they have observed the original | ||
| 478 | * semval value and couldn't proceed. The operation | ||
| 479 | * decremented to value - thus they won't proceed either. | ||
| 480 | */ | ||
| 481 | BUG_ON(q->sops[0].sem_op >= 0); | ||
| 482 | return 0; | ||
| 483 | } | ||
| 484 | /* | ||
| 485 | * semval is 0. Check if there are wait-for-zero semops. | ||
| 486 | * They must be the first entries in the per-semaphore simple queue | ||
| 487 | */ | ||
| 488 | h = list_first_entry(&curr->sem_pending, struct sem_queue, simple_list); | ||
| 489 | BUG_ON(h->nsops != 1); | ||
| 490 | BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num); | ||
| 491 | |||
| 492 | /* Yes, there is a wait-for-zero semop. Restart */ | ||
| 493 | if (h->sops[0].sem_op == 0) | ||
| 494 | return 1; | ||
| 495 | |||
| 496 | /* Again - no-one is waiting for the new value. */ | ||
| 497 | return 0; | ||
| 498 | } | ||
| 499 | |||
| 437 | 500 | ||
| 438 | /** | 501 | /** |
| 439 | * update_queue(sma, semnum): Look for tasks that can be completed. | 502 | * update_queue(sma, semnum): Look for tasks that can be completed. |
| @@ -469,7 +532,7 @@ static void update_queue(struct sem_array *sma, int semnum) | |||
| 469 | again: | 532 | again: |
| 470 | walk = pending_list->next; | 533 | walk = pending_list->next; |
| 471 | while (walk != pending_list) { | 534 | while (walk != pending_list) { |
| 472 | int error, alter; | 535 | int error, restart; |
| 473 | 536 | ||
| 474 | q = (struct sem_queue *)((char *)walk - offset); | 537 | q = (struct sem_queue *)((char *)walk - offset); |
| 475 | walk = walk->next; | 538 | walk = walk->next; |
| @@ -494,22 +557,43 @@ again: | |||
| 494 | 557 | ||
| 495 | unlink_queue(sma, q); | 558 | unlink_queue(sma, q); |
| 496 | 559 | ||
| 497 | /* | 560 | if (error) |
| 498 | * The next operation that must be checked depends on the type | 561 | restart = 0; |
| 499 | * of the completed operation: | 562 | else |
| 500 | * - if the operation modified the array, then restart from the | 563 | restart = check_restart(sma, q); |
| 501 | * head of the queue and check for threads that might be | 564 | |
| 502 | * waiting for the new semaphore values. | ||
| 503 | * - if the operation didn't modify the array, then just | ||
| 504 | * continue. | ||
| 505 | */ | ||
| 506 | alter = q->alter; | ||
| 507 | wake_up_sem_queue(q, error); | 565 | wake_up_sem_queue(q, error); |
| 508 | if (alter && !error) | 566 | if (restart) |
| 509 | goto again; | 567 | goto again; |
| 510 | } | 568 | } |
| 511 | } | 569 | } |
| 512 | 570 | ||
| 571 | /** do_smart_update(sma, sops, nsops): Optimized update_queue | ||
| 572 | * @sma: semaphore array | ||
| 573 | * @sops: operations that were performed | ||
| 574 | * @nsops: number of operations | ||
| 575 | * | ||
| 576 | * do_smart_update() does the required called to update_queue, based on the | ||
| 577 | * actual changes that were performed on the semaphore array. | ||
| 578 | */ | ||
| 579 | static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops) | ||
| 580 | { | ||
| 581 | int i; | ||
| 582 | |||
| 583 | if (sma->complex_count || sops == NULL) { | ||
| 584 | update_queue(sma, -1); | ||
| 585 | return; | ||
| 586 | } | ||
| 587 | |||
| 588 | for (i = 0; i < nsops; i++) { | ||
| 589 | if (sops[i].sem_op > 0 || | ||
| 590 | (sops[i].sem_op < 0 && | ||
| 591 | sma->sem_base[sops[i].sem_num].semval == 0)) | ||
| 592 | update_queue(sma, sops[i].sem_num); | ||
| 593 | } | ||
| 594 | } | ||
| 595 | |||
| 596 | |||
| 513 | /* The following counts are associated to each semaphore: | 597 | /* The following counts are associated to each semaphore: |
| 514 | * semncnt number of tasks waiting on semval being nonzero | 598 | * semncnt number of tasks waiting on semval being nonzero |
| 515 | * semzcnt number of tasks waiting on semval being zero | 599 | * semzcnt number of tasks waiting on semval being zero |
| @@ -1225,7 +1309,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1225 | error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current)); | 1309 | error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current)); |
| 1226 | if (error <= 0) { | 1310 | if (error <= 0) { |
| 1227 | if (alter && error == 0) | 1311 | if (alter && error == 0) |
| 1228 | update_queue(sma, (nsops == 1) ? sops[0].sem_num : -1); | 1312 | do_smart_update(sma, sops, nsops); |
| 1229 | 1313 | ||
| 1230 | goto out_unlock_free; | 1314 | goto out_unlock_free; |
| 1231 | } | 1315 | } |
