diff options
| author | Davidlohr Bueso <dave@stgolabs.net> | 2016-12-14 18:06:34 -0500 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-14 19:04:08 -0500 |
| commit | 9ae949fa382b080170f9d3c8bd9dea951cf52ee7 (patch) | |
| tree | af701403e6bb61d49b67533beff9239ed0bec1b2 /ipc | |
| parent | 248e7357cf8ec50bdaca68dce7c488ce843b6b93 (diff) | |
ipc/sem: rework task wakeups
Our sysv sems have been using the notion of lockless wakeups for a
while, ever since commit 0a2b9d4c7967 ("ipc/sem.c: move wake_up_process
out of the spinlock section"), in order to reduce the sem_lock hold
times. This in-house pending queue can be replaced by wake_q (just like
all the rest of ipc now), in that it provides the following advantages:
o Simplifies and gets rid of unnecessary code.
o We get rid of the IN_WAKEUP complexities. Given that wake_q_add()
grabs reference to the task, if awoken due to an unrelated event,
between the wake_q_add() and wake_up_q() window, we cannot race with
sys_exit and the imminent call to wake_up_process().
o By not spinning IN_WAKEUP, we no longer need to disable preemption.
In consequence, the wakeup paths (after schedule(), that is) must
acknowledge an external signal/event, as well spurious wakeup occurring
during the pending wakeup window. Obviously no changes in semantics
that could be visible to the user. The fastpath is _only_ for when we
know for sure that we were awoken due to a the waker's successful semop
call (queue.status is not -EINTR).
On a 48-core Haswell, running the ipcscale 'waitforzero' test, the
following is seen with increasing thread counts:
v4.8-rc5 v4.8-rc5
semopv2
Hmean sembench-sem-2 574733.00 ( 0.00%) 578322.00 ( 0.62%)
Hmean sembench-sem-8 811708.00 ( 0.00%) 824689.00 ( 1.59%)
Hmean sembench-sem-12 842448.00 ( 0.00%) 845409.00 ( 0.35%)
Hmean sembench-sem-21 933003.00 ( 0.00%) 977748.00 ( 4.80%)
Hmean sembench-sem-48 935910.00 ( 0.00%) 1004759.00 ( 7.36%)
Hmean sembench-sem-79 937186.00 ( 0.00%) 983976.00 ( 4.99%)
Hmean sembench-sem-234 974256.00 ( 0.00%) 1060294.00 ( 8.83%)
Hmean sembench-sem-265 975468.00 ( 0.00%) 1016243.00 ( 4.18%)
Hmean sembench-sem-296 991280.00 ( 0.00%) 1042659.00 ( 5.18%)
Hmean sembench-sem-327 975415.00 ( 0.00%) 1029977.00 ( 5.59%)
Hmean sembench-sem-358 1014286.00 ( 0.00%) 1049624.00 ( 3.48%)
Hmean sembench-sem-389 972939.00 ( 0.00%) 1043127.00 ( 7.21%)
Hmean sembench-sem-420 981909.00 ( 0.00%) 1056747.00 ( 7.62%)
Hmean sembench-sem-451 990139.00 ( 0.00%) 1051609.00 ( 6.21%)
Hmean sembench-sem-482 965735.00 ( 0.00%) 1040313.00 ( 7.72%)
[akpm@linux-foundation.org: coding-style fixes]
[sfr@canb.auug.org.au: merge fix for WAKE_Q to DEFINE_WAKE_Q rename]
Link: http://lkml.kernel.org/r/20161122210410.5eca9fc2@canb.auug.org.au
Link: http://lkml.kernel.org/r/1474225896-10066-3-git-send-email-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Acked-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'ipc')
| -rw-r--r-- | ipc/sem.c | 266 |
1 files changed, 86 insertions, 180 deletions
| @@ -11,6 +11,7 @@ | |||
| 11 | * (c) 2001 Red Hat Inc | 11 | * (c) 2001 Red Hat Inc |
| 12 | * Lockless wakeup | 12 | * Lockless wakeup |
| 13 | * (c) 2003 Manfred Spraul <manfred@colorfullife.com> | 13 | * (c) 2003 Manfred Spraul <manfred@colorfullife.com> |
| 14 | * (c) 2016 Davidlohr Bueso <dave@stgolabs.net> | ||
| 14 | * Further wakeup optimizations, documentation | 15 | * Further wakeup optimizations, documentation |
| 15 | * (c) 2010 Manfred Spraul <manfred@colorfullife.com> | 16 | * (c) 2010 Manfred Spraul <manfred@colorfullife.com> |
| 16 | * | 17 | * |
| @@ -53,15 +54,11 @@ | |||
| 53 | * Semaphores are actively given to waiting tasks (necessary for FIFO). | 54 | * Semaphores are actively given to waiting tasks (necessary for FIFO). |
| 54 | * (see update_queue()) | 55 | * (see update_queue()) |
| 55 | * - To improve the scalability, the actual wake-up calls are performed after | 56 | * - To improve the scalability, the actual wake-up calls are performed after |
| 56 | * dropping all locks. (see wake_up_sem_queue_prepare(), | 57 | * dropping all locks. (see wake_up_sem_queue_prepare()) |
| 57 | * wake_up_sem_queue_do()) | ||
| 58 | * - All work is done by the waker, the woken up task does not have to do | 58 | * - All work is done by the waker, the woken up task does not have to do |
| 59 | * anything - not even acquiring a lock or dropping a refcount. | 59 | * anything - not even acquiring a lock or dropping a refcount. |
| 60 | * - A woken up task may not even touch the semaphore array anymore, it may | 60 | * - A woken up task may not even touch the semaphore array anymore, it may |
| 61 | * have been destroyed already by a semctl(RMID). | 61 | * have been destroyed already by a semctl(RMID). |
| 62 | * - The synchronizations between wake-ups due to a timeout/signal and a | ||
| 63 | * wake-up due to a completed semaphore operation is achieved by using an | ||
| 64 | * intermediate state (IN_WAKEUP). | ||
| 65 | * - UNDO values are stored in an array (one per process and per | 62 | * - UNDO values are stored in an array (one per process and per |
| 66 | * semaphore array, lazily allocated). For backwards compatibility, multiple | 63 | * semaphore array, lazily allocated). For backwards compatibility, multiple |
| 67 | * modes for the UNDO variables are supported (per process, per thread) | 64 | * modes for the UNDO variables are supported (per process, per thread) |
| @@ -471,40 +468,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) | |||
| 471 | ipc_rmid(&sem_ids(ns), &s->sem_perm); | 468 | ipc_rmid(&sem_ids(ns), &s->sem_perm); |
| 472 | } | 469 | } |
| 473 | 470 | ||
| 474 | /* | ||
| 475 | * Lockless wakeup algorithm: | ||
| 476 | * Without the check/retry algorithm a lockless wakeup is possible: | ||
| 477 | * - queue.status is initialized to -EINTR before blocking. | ||
| 478 | * - wakeup is performed by | ||
| 479 | * * unlinking the queue entry from the pending list | ||
| 480 | * * setting queue.status to IN_WAKEUP | ||
| 481 | * This is the notification for the blocked thread that a | ||
| 482 | * result value is imminent. | ||
| 483 | * * call wake_up_process | ||
| 484 | * * set queue.status to the final value. | ||
| 485 | * - the previously blocked thread checks queue.status: | ||
| 486 | * * if it's IN_WAKEUP, then it must wait until the value changes | ||
| 487 | * * if it's not -EINTR, then the operation was completed by | ||
| 488 | * update_queue. semtimedop can return queue.status without | ||
| 489 | * performing any operation on the sem array. | ||
| 490 | * * otherwise it must acquire the spinlock and check what's up. | ||
| 491 | * | ||
| 492 | * The two-stage algorithm is necessary to protect against the following | ||
| 493 | * races: | ||
| 494 | * - if queue.status is set after wake_up_process, then the woken up idle | ||
| 495 | * thread could race forward and try (and fail) to acquire sma->lock | ||
| 496 | * before update_queue had a chance to set queue.status | ||
| 497 | * - if queue.status is written before wake_up_process and if the | ||
| 498 | * blocked process is woken up by a signal between writing | ||
| 499 | * queue.status and the wake_up_process, then the woken up | ||
| 500 | * process could return from semtimedop and die by calling | ||
| 501 | * sys_exit before wake_up_process is called. Then wake_up_process | ||
| 502 | * will oops, because the task structure is already invalid. | ||
| 503 | * (yes, this happened on s390 with sysv msg). | ||
| 504 | * | ||
| 505 | */ | ||
| 506 | #define IN_WAKEUP 1 | ||
| 507 | |||
| 508 | /** | 471 | /** |
| 509 | * newary - Create a new semaphore set | 472 | * newary - Create a new semaphore set |
| 510 | * @ns: namespace | 473 | * @ns: namespace |
| @@ -703,51 +666,18 @@ undo: | |||
| 703 | return result; | 666 | return result; |
| 704 | } | 667 | } |
| 705 | 668 | ||
| 706 | /** wake_up_sem_queue_prepare(q, error): Prepare wake-up | 669 | static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, |
| 707 | * @q: queue entry that must be signaled | 670 | struct wake_q_head *wake_q) |
| 708 | * @error: Error value for the signal | ||
| 709 | * | ||
| 710 | * Prepare the wake-up of the queue entry q. | ||
| 711 | */ | ||
| 712 | static void wake_up_sem_queue_prepare(struct list_head *pt, | ||
| 713 | struct sem_queue *q, int error) | ||
| 714 | { | ||
| 715 | if (list_empty(pt)) { | ||
| 716 | /* | ||
| 717 | * Hold preempt off so that we don't get preempted and have the | ||
| 718 | * wakee busy-wait until we're scheduled back on. | ||
| 719 | */ | ||
| 720 | preempt_disable(); | ||
| 721 | } | ||
| 722 | q->status = IN_WAKEUP; | ||
| 723 | q->pid = error; | ||
| 724 | |||
| 725 | list_add_tail(&q->list, pt); | ||
| 726 | } | ||
| 727 | |||
| 728 | /** | ||
| 729 | * wake_up_sem_queue_do - do the actual wake-up | ||
| 730 | * @pt: list of tasks to be woken up | ||
| 731 | * | ||
| 732 | * Do the actual wake-up. | ||
| 733 | * The function is called without any locks held, thus the semaphore array | ||
| 734 | * could be destroyed already and the tasks can disappear as soon as the | ||
| 735 | * status is set to the actual return code. | ||
| 736 | */ | ||
| 737 | static void wake_up_sem_queue_do(struct list_head *pt) | ||
| 738 | { | 671 | { |
| 739 | struct sem_queue *q, *t; | 672 | wake_q_add(wake_q, q->sleeper); |
| 740 | int did_something; | 673 | /* |
| 741 | 674 | * Rely on the above implicit barrier, such that we can | |
| 742 | did_something = !list_empty(pt); | 675 | * ensure that we hold reference to the task before setting |
| 743 | list_for_each_entry_safe(q, t, pt, list) { | 676 | * q->status. Otherwise we could race with do_exit if the |
| 744 | wake_up_process(q->sleeper); | 677 | * task is awoken by an external event before calling |
| 745 | /* q can disappear immediately after writing q->status. */ | 678 | * wake_up_process(). |
| 746 | smp_wmb(); | 679 | */ |
| 747 | q->status = q->pid; | 680 | WRITE_ONCE(q->status, error); |
| 748 | } | ||
| 749 | if (did_something) | ||
| 750 | preempt_enable(); | ||
| 751 | } | 681 | } |
| 752 | 682 | ||
| 753 | static void unlink_queue(struct sem_array *sma, struct sem_queue *q) | 683 | static void unlink_queue(struct sem_array *sma, struct sem_queue *q) |
| @@ -795,18 +725,18 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q) | |||
| 795 | * wake_const_ops - wake up non-alter tasks | 725 | * wake_const_ops - wake up non-alter tasks |
| 796 | * @sma: semaphore array. | 726 | * @sma: semaphore array. |
| 797 | * @semnum: semaphore that was modified. | 727 | * @semnum: semaphore that was modified. |
| 798 | * @pt: list head for the tasks that must be woken up. | 728 | * @wake_q: lockless wake-queue head. |
| 799 | * | 729 | * |
| 800 | * wake_const_ops must be called after a semaphore in a semaphore array | 730 | * wake_const_ops must be called after a semaphore in a semaphore array |
| 801 | * was set to 0. If complex const operations are pending, wake_const_ops must | 731 | * was set to 0. If complex const operations are pending, wake_const_ops must |
| 802 | * be called with semnum = -1, as well as with the number of each modified | 732 | * be called with semnum = -1, as well as with the number of each modified |
| 803 | * semaphore. | 733 | * semaphore. |
| 804 | * The tasks that must be woken up are added to @pt. The return code | 734 | * The tasks that must be woken up are added to @wake_q. The return code |
| 805 | * is stored in q->pid. | 735 | * is stored in q->pid. |
| 806 | * The function returns 1 if at least one operation was completed successfully. | 736 | * The function returns 1 if at least one operation was completed successfully. |
| 807 | */ | 737 | */ |
| 808 | static int wake_const_ops(struct sem_array *sma, int semnum, | 738 | static int wake_const_ops(struct sem_array *sma, int semnum, |
| 809 | struct list_head *pt) | 739 | struct wake_q_head *wake_q) |
| 810 | { | 740 | { |
| 811 | struct sem_queue *q; | 741 | struct sem_queue *q; |
| 812 | struct list_head *walk; | 742 | struct list_head *walk; |
| @@ -832,7 +762,7 @@ static int wake_const_ops(struct sem_array *sma, int semnum, | |||
| 832 | 762 | ||
| 833 | unlink_queue(sma, q); | 763 | unlink_queue(sma, q); |
| 834 | 764 | ||
| 835 | wake_up_sem_queue_prepare(pt, q, error); | 765 | wake_up_sem_queue_prepare(q, error, wake_q); |
| 836 | if (error == 0) | 766 | if (error == 0) |
| 837 | semop_completed = 1; | 767 | semop_completed = 1; |
| 838 | } | 768 | } |
| @@ -845,14 +775,14 @@ static int wake_const_ops(struct sem_array *sma, int semnum, | |||
| 845 | * @sma: semaphore array | 775 | * @sma: semaphore array |
| 846 | * @sops: operations that were performed | 776 | * @sops: operations that were performed |
| 847 | * @nsops: number of operations | 777 | * @nsops: number of operations |
| 848 | * @pt: list head of the tasks that must be woken up. | 778 | * @wake_q: lockless wake-queue head |
| 849 | * | 779 | * |
| 850 | * Checks all required queue for wait-for-zero operations, based | 780 | * Checks all required queue for wait-for-zero operations, based |
| 851 | * on the actual changes that were performed on the semaphore array. | 781 | * on the actual changes that were performed on the semaphore array. |
| 852 | * The function returns 1 if at least one operation was completed successfully. | 782 | * The function returns 1 if at least one operation was completed successfully. |
| 853 | */ | 783 | */ |
| 854 | static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | 784 | static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, |
| 855 | int nsops, struct list_head *pt) | 785 | int nsops, struct wake_q_head *wake_q) |
| 856 | { | 786 | { |
| 857 | int i; | 787 | int i; |
| 858 | int semop_completed = 0; | 788 | int semop_completed = 0; |
| @@ -865,7 +795,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | |||
| 865 | 795 | ||
| 866 | if (sma->sem_base[num].semval == 0) { | 796 | if (sma->sem_base[num].semval == 0) { |
| 867 | got_zero = 1; | 797 | got_zero = 1; |
| 868 | semop_completed |= wake_const_ops(sma, num, pt); | 798 | semop_completed |= wake_const_ops(sma, num, wake_q); |
| 869 | } | 799 | } |
| 870 | } | 800 | } |
| 871 | } else { | 801 | } else { |
| @@ -876,7 +806,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | |||
| 876 | for (i = 0; i < sma->sem_nsems; i++) { | 806 | for (i = 0; i < sma->sem_nsems; i++) { |
| 877 | if (sma->sem_base[i].semval == 0) { | 807 | if (sma->sem_base[i].semval == 0) { |
| 878 | got_zero = 1; | 808 | got_zero = 1; |
| 879 | semop_completed |= wake_const_ops(sma, i, pt); | 809 | semop_completed |= wake_const_ops(sma, i, wake_q); |
| 880 | } | 810 | } |
| 881 | } | 811 | } |
| 882 | } | 812 | } |
| @@ -885,7 +815,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | |||
| 885 | * then check the global queue, too. | 815 | * then check the global queue, too. |
| 886 | */ | 816 | */ |
| 887 | if (got_zero) | 817 | if (got_zero) |
| 888 | semop_completed |= wake_const_ops(sma, -1, pt); | 818 | semop_completed |= wake_const_ops(sma, -1, wake_q); |
| 889 | 819 | ||
| 890 | return semop_completed; | 820 | return semop_completed; |
| 891 | } | 821 | } |
| @@ -895,19 +825,19 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | |||
| 895 | * update_queue - look for tasks that can be completed. | 825 | * update_queue - look for tasks that can be completed. |
| 896 | * @sma: semaphore array. | 826 | * @sma: semaphore array. |
| 897 | * @semnum: semaphore that was modified. | 827 | * @semnum: semaphore that was modified. |
| 898 | * @pt: list head for the tasks that must be woken up. | 828 | * @wake_q: lockless wake-queue head. |
| 899 | * | 829 | * |
| 900 | * update_queue must be called after a semaphore in a semaphore array | 830 | * update_queue must be called after a semaphore in a semaphore array |
| 901 | * was modified. If multiple semaphores were modified, update_queue must | 831 | * was modified. If multiple semaphores were modified, update_queue must |
| 902 | * be called with semnum = -1, as well as with the number of each modified | 832 | * be called with semnum = -1, as well as with the number of each modified |
| 903 | * semaphore. | 833 | * semaphore. |
| 904 | * The tasks that must be woken up are added to @pt. The return code | 834 | * The tasks that must be woken up are added to @wake_q. The return code |
| 905 | * is stored in q->pid. | 835 | * is stored in q->pid. |
| 906 | * The function internally checks if const operations can now succeed. | 836 | * The function internally checks if const operations can now succeed. |
| 907 | * | 837 | * |
| 908 | * The function return 1 if at least one semop was completed successfully. | 838 | * The function return 1 if at least one semop was completed successfully. |
| 909 | */ | 839 | */ |
| 910 | static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) | 840 | static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q) |
| 911 | { | 841 | { |
| 912 | struct sem_queue *q; | 842 | struct sem_queue *q; |
| 913 | struct list_head *walk; | 843 | struct list_head *walk; |
| @@ -949,11 +879,11 @@ again: | |||
| 949 | restart = 0; | 879 | restart = 0; |
| 950 | } else { | 880 | } else { |
| 951 | semop_completed = 1; | 881 | semop_completed = 1; |
| 952 | do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); | 882 | do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q); |
| 953 | restart = check_restart(sma, q); | 883 | restart = check_restart(sma, q); |
| 954 | } | 884 | } |
| 955 | 885 | ||
| 956 | wake_up_sem_queue_prepare(pt, q, error); | 886 | wake_up_sem_queue_prepare(q, error, wake_q); |
| 957 | if (restart) | 887 | if (restart) |
| 958 | goto again; | 888 | goto again; |
| 959 | } | 889 | } |
| @@ -984,24 +914,24 @@ static void set_semotime(struct sem_array *sma, struct sembuf *sops) | |||
| 984 | * @sops: operations that were performed | 914 | * @sops: operations that were performed |
| 985 | * @nsops: number of operations | 915 | * @nsops: number of operations |
| 986 | * @otime: force setting otime | 916 | * @otime: force setting otime |
| 987 | * @pt: list head of the tasks that must be woken up. | 917 | * @wake_q: lockless wake-queue head |
| 988 | * | 918 | * |
| 989 | * do_smart_update() does the required calls to update_queue and wakeup_zero, | 919 | * do_smart_update() does the required calls to update_queue and wakeup_zero, |
| 990 | * based on the actual changes that were performed on the semaphore array. | 920 | * based on the actual changes that were performed on the semaphore array. |
| 991 | * Note that the function does not do the actual wake-up: the caller is | 921 | * Note that the function does not do the actual wake-up: the caller is |
| 992 | * responsible for calling wake_up_sem_queue_do(@pt). | 922 | * responsible for calling wake_up_q(). |
| 993 | * It is safe to perform this call after dropping all locks. | 923 | * It is safe to perform this call after dropping all locks. |
| 994 | */ | 924 | */ |
| 995 | static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, | 925 | static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, |
| 996 | int otime, struct list_head *pt) | 926 | int otime, struct wake_q_head *wake_q) |
| 997 | { | 927 | { |
| 998 | int i; | 928 | int i; |
| 999 | 929 | ||
| 1000 | otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); | 930 | otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q); |
| 1001 | 931 | ||
| 1002 | if (!list_empty(&sma->pending_alter)) { | 932 | if (!list_empty(&sma->pending_alter)) { |
| 1003 | /* semaphore array uses the global queue - just process it. */ | 933 | /* semaphore array uses the global queue - just process it. */ |
| 1004 | otime |= update_queue(sma, -1, pt); | 934 | otime |= update_queue(sma, -1, wake_q); |
| 1005 | } else { | 935 | } else { |
| 1006 | if (!sops) { | 936 | if (!sops) { |
| 1007 | /* | 937 | /* |
| @@ -1009,7 +939,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop | |||
| 1009 | * known. Check all. | 939 | * known. Check all. |
| 1010 | */ | 940 | */ |
| 1011 | for (i = 0; i < sma->sem_nsems; i++) | 941 | for (i = 0; i < sma->sem_nsems; i++) |
| 1012 | otime |= update_queue(sma, i, pt); | 942 | otime |= update_queue(sma, i, wake_q); |
| 1013 | } else { | 943 | } else { |
| 1014 | /* | 944 | /* |
| 1015 | * Check the semaphores that were increased: | 945 | * Check the semaphores that were increased: |
| @@ -1023,7 +953,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop | |||
| 1023 | for (i = 0; i < nsops; i++) { | 953 | for (i = 0; i < nsops; i++) { |
| 1024 | if (sops[i].sem_op > 0) { | 954 | if (sops[i].sem_op > 0) { |
| 1025 | otime |= update_queue(sma, | 955 | otime |= update_queue(sma, |
| 1026 | sops[i].sem_num, pt); | 956 | sops[i].sem_num, wake_q); |
| 1027 | } | 957 | } |
| 1028 | } | 958 | } |
| 1029 | } | 959 | } |
| @@ -1111,8 +1041,8 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | |||
| 1111 | struct sem_undo *un, *tu; | 1041 | struct sem_undo *un, *tu; |
| 1112 | struct sem_queue *q, *tq; | 1042 | struct sem_queue *q, *tq; |
| 1113 | struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); | 1043 | struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); |
| 1114 | struct list_head tasks; | ||
| 1115 | int i; | 1044 | int i; |
| 1045 | DEFINE_WAKE_Q(wake_q); | ||
| 1116 | 1046 | ||
| 1117 | /* Free the existing undo structures for this semaphore set. */ | 1047 | /* Free the existing undo structures for this semaphore set. */ |
| 1118 | ipc_assert_locked_object(&sma->sem_perm); | 1048 | ipc_assert_locked_object(&sma->sem_perm); |
| @@ -1126,25 +1056,24 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | |||
| 1126 | } | 1056 | } |
| 1127 | 1057 | ||
| 1128 | /* Wake up all pending processes and let them fail with EIDRM. */ | 1058 | /* Wake up all pending processes and let them fail with EIDRM. */ |
| 1129 | INIT_LIST_HEAD(&tasks); | ||
| 1130 | list_for_each_entry_safe(q, tq, &sma->pending_const, list) { | 1059 | list_for_each_entry_safe(q, tq, &sma->pending_const, list) { |
| 1131 | unlink_queue(sma, q); | 1060 | unlink_queue(sma, q); |
| 1132 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 1061 | wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); |
| 1133 | } | 1062 | } |
| 1134 | 1063 | ||
| 1135 | list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { | 1064 | list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { |
| 1136 | unlink_queue(sma, q); | 1065 | unlink_queue(sma, q); |
| 1137 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 1066 | wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); |
| 1138 | } | 1067 | } |
| 1139 | for (i = 0; i < sma->sem_nsems; i++) { | 1068 | for (i = 0; i < sma->sem_nsems; i++) { |
| 1140 | struct sem *sem = sma->sem_base + i; | 1069 | struct sem *sem = sma->sem_base + i; |
| 1141 | list_for_each_entry_safe(q, tq, &sem->pending_const, list) { | 1070 | list_for_each_entry_safe(q, tq, &sem->pending_const, list) { |
| 1142 | unlink_queue(sma, q); | 1071 | unlink_queue(sma, q); |
| 1143 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 1072 | wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); |
| 1144 | } | 1073 | } |
| 1145 | list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { | 1074 | list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { |
| 1146 | unlink_queue(sma, q); | 1075 | unlink_queue(sma, q); |
| 1147 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 1076 | wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); |
| 1148 | } | 1077 | } |
| 1149 | } | 1078 | } |
| 1150 | 1079 | ||
| @@ -1153,7 +1082,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | |||
| 1153 | sem_unlock(sma, -1); | 1082 | sem_unlock(sma, -1); |
| 1154 | rcu_read_unlock(); | 1083 | rcu_read_unlock(); |
| 1155 | 1084 | ||
| 1156 | wake_up_sem_queue_do(&tasks); | 1085 | wake_up_q(&wake_q); |
| 1157 | ns->used_sems -= sma->sem_nsems; | 1086 | ns->used_sems -= sma->sem_nsems; |
| 1158 | ipc_rcu_putref(sma, sem_rcu_free); | 1087 | ipc_rcu_putref(sma, sem_rcu_free); |
| 1159 | } | 1088 | } |
| @@ -1292,9 +1221,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1292 | struct sem_undo *un; | 1221 | struct sem_undo *un; |
| 1293 | struct sem_array *sma; | 1222 | struct sem_array *sma; |
| 1294 | struct sem *curr; | 1223 | struct sem *curr; |
| 1295 | int err; | 1224 | int err, val; |
| 1296 | struct list_head tasks; | 1225 | DEFINE_WAKE_Q(wake_q); |
| 1297 | int val; | 1226 | |
| 1298 | #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) | 1227 | #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) |
| 1299 | /* big-endian 64bit */ | 1228 | /* big-endian 64bit */ |
| 1300 | val = arg >> 32; | 1229 | val = arg >> 32; |
| @@ -1306,8 +1235,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1306 | if (val > SEMVMX || val < 0) | 1235 | if (val > SEMVMX || val < 0) |
| 1307 | return -ERANGE; | 1236 | return -ERANGE; |
| 1308 | 1237 | ||
| 1309 | INIT_LIST_HEAD(&tasks); | ||
| 1310 | |||
| 1311 | rcu_read_lock(); | 1238 | rcu_read_lock(); |
| 1312 | sma = sem_obtain_object_check(ns, semid); | 1239 | sma = sem_obtain_object_check(ns, semid); |
| 1313 | if (IS_ERR(sma)) { | 1240 | if (IS_ERR(sma)) { |
| @@ -1350,10 +1277,10 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1350 | curr->sempid = task_tgid_vnr(current); | 1277 | curr->sempid = task_tgid_vnr(current); |
| 1351 | sma->sem_ctime = get_seconds(); | 1278 | sma->sem_ctime = get_seconds(); |
| 1352 | /* maybe some queued-up processes were waiting for this */ | 1279 | /* maybe some queued-up processes were waiting for this */ |
| 1353 | do_smart_update(sma, NULL, 0, 0, &tasks); | 1280 | do_smart_update(sma, NULL, 0, 0, &wake_q); |
| 1354 | sem_unlock(sma, -1); | 1281 | sem_unlock(sma, -1); |
| 1355 | rcu_read_unlock(); | 1282 | rcu_read_unlock(); |
| 1356 | wake_up_sem_queue_do(&tasks); | 1283 | wake_up_q(&wake_q); |
| 1357 | return 0; | 1284 | return 0; |
| 1358 | } | 1285 | } |
| 1359 | 1286 | ||
| @@ -1365,9 +1292,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1365 | int err, nsems; | 1292 | int err, nsems; |
| 1366 | ushort fast_sem_io[SEMMSL_FAST]; | 1293 | ushort fast_sem_io[SEMMSL_FAST]; |
| 1367 | ushort *sem_io = fast_sem_io; | 1294 | ushort *sem_io = fast_sem_io; |
| 1368 | struct list_head tasks; | 1295 | DEFINE_WAKE_Q(wake_q); |
| 1369 | |||
| 1370 | INIT_LIST_HEAD(&tasks); | ||
| 1371 | 1296 | ||
| 1372 | rcu_read_lock(); | 1297 | rcu_read_lock(); |
| 1373 | sma = sem_obtain_object_check(ns, semid); | 1298 | sma = sem_obtain_object_check(ns, semid); |
| @@ -1478,7 +1403,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1478 | } | 1403 | } |
| 1479 | sma->sem_ctime = get_seconds(); | 1404 | sma->sem_ctime = get_seconds(); |
| 1480 | /* maybe some queued-up processes were waiting for this */ | 1405 | /* maybe some queued-up processes were waiting for this */ |
| 1481 | do_smart_update(sma, NULL, 0, 0, &tasks); | 1406 | do_smart_update(sma, NULL, 0, 0, &wake_q); |
| 1482 | err = 0; | 1407 | err = 0; |
| 1483 | goto out_unlock; | 1408 | goto out_unlock; |
| 1484 | } | 1409 | } |
| @@ -1514,7 +1439,7 @@ out_unlock: | |||
| 1514 | sem_unlock(sma, -1); | 1439 | sem_unlock(sma, -1); |
| 1515 | out_rcu_wakeup: | 1440 | out_rcu_wakeup: |
| 1516 | rcu_read_unlock(); | 1441 | rcu_read_unlock(); |
| 1517 | wake_up_sem_queue_do(&tasks); | 1442 | wake_up_q(&wake_q); |
| 1518 | out_free: | 1443 | out_free: |
| 1519 | if (sem_io != fast_sem_io) | 1444 | if (sem_io != fast_sem_io) |
| 1520 | ipc_free(sem_io); | 1445 | ipc_free(sem_io); |
| @@ -1787,32 +1712,6 @@ out: | |||
| 1787 | return un; | 1712 | return un; |
| 1788 | } | 1713 | } |
| 1789 | 1714 | ||
| 1790 | |||
| 1791 | /** | ||
| 1792 | * get_queue_result - retrieve the result code from sem_queue | ||
| 1793 | * @q: Pointer to queue structure | ||
| 1794 | * | ||
| 1795 | * Retrieve the return code from the pending queue. If IN_WAKEUP is found in | ||
| 1796 | * q->status, then we must loop until the value is replaced with the final | ||
| 1797 | * value: This may happen if a task is woken up by an unrelated event (e.g. | ||
| 1798 | * signal) and in parallel the task is woken up by another task because it got | ||
| 1799 | * the requested semaphores. | ||
| 1800 | * | ||
| 1801 | * The function can be called with or without holding the semaphore spinlock. | ||
| 1802 | */ | ||
| 1803 | static int get_queue_result(struct sem_queue *q) | ||
| 1804 | { | ||
| 1805 | int error; | ||
| 1806 | |||
| 1807 | error = q->status; | ||
| 1808 | while (unlikely(error == IN_WAKEUP)) { | ||
| 1809 | cpu_relax(); | ||
| 1810 | error = q->status; | ||
| 1811 | } | ||
| 1812 | |||
| 1813 | return error; | ||
| 1814 | } | ||
| 1815 | |||
| 1816 | SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | 1715 | SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, |
| 1817 | unsigned, nsops, const struct timespec __user *, timeout) | 1716 | unsigned, nsops, const struct timespec __user *, timeout) |
| 1818 | { | 1717 | { |
| @@ -1825,7 +1724,6 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1825 | struct sem_queue queue; | 1724 | struct sem_queue queue; |
| 1826 | unsigned long jiffies_left = 0; | 1725 | unsigned long jiffies_left = 0; |
| 1827 | struct ipc_namespace *ns; | 1726 | struct ipc_namespace *ns; |
| 1828 | struct list_head tasks; | ||
| 1829 | 1727 | ||
| 1830 | ns = current->nsproxy->ipc_ns; | 1728 | ns = current->nsproxy->ipc_ns; |
| 1831 | 1729 | ||
| @@ -1865,7 +1763,6 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1865 | alter = 1; | 1763 | alter = 1; |
| 1866 | } | 1764 | } |
| 1867 | 1765 | ||
| 1868 | INIT_LIST_HEAD(&tasks); | ||
| 1869 | 1766 | ||
| 1870 | if (undos) { | 1767 | if (undos) { |
| 1871 | /* On success, find_alloc_undo takes the rcu_read_lock */ | 1768 | /* On success, find_alloc_undo takes the rcu_read_lock */ |
| @@ -1933,22 +1830,31 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1933 | queue.alter = alter; | 1830 | queue.alter = alter; |
| 1934 | 1831 | ||
| 1935 | error = perform_atomic_semop(sma, &queue); | 1832 | error = perform_atomic_semop(sma, &queue); |
| 1936 | if (error == 0) { | 1833 | if (error == 0) { /* non-blocking succesfull path */ |
| 1937 | /* If the operation was successful, then do | 1834 | DEFINE_WAKE_Q(wake_q); |
| 1835 | |||
| 1836 | /* | ||
| 1837 | * If the operation was successful, then do | ||
| 1938 | * the required updates. | 1838 | * the required updates. |
| 1939 | */ | 1839 | */ |
| 1940 | if (alter) | 1840 | if (alter) |
| 1941 | do_smart_update(sma, sops, nsops, 1, &tasks); | 1841 | do_smart_update(sma, sops, nsops, 1, &wake_q); |
| 1942 | else | 1842 | else |
| 1943 | set_semotime(sma, sops); | 1843 | set_semotime(sma, sops); |
| 1844 | |||
| 1845 | sem_unlock(sma, locknum); | ||
| 1846 | rcu_read_unlock(); | ||
| 1847 | wake_up_q(&wake_q); | ||
| 1848 | |||
| 1849 | goto out_free; | ||
| 1944 | } | 1850 | } |
| 1945 | if (error <= 0) | 1851 | if (error < 0) /* non-blocking error path */ |
| 1946 | goto out_unlock_free; | 1852 | goto out_unlock_free; |
| 1947 | 1853 | ||
| 1948 | /* We need to sleep on this operation, so we put the current | 1854 | /* |
| 1855 | * We need to sleep on this operation, so we put the current | ||
| 1949 | * task into the pending queue and go to sleep. | 1856 | * task into the pending queue and go to sleep. |
| 1950 | */ | 1857 | */ |
| 1951 | |||
| 1952 | if (nsops == 1) { | 1858 | if (nsops == 1) { |
| 1953 | struct sem *curr; | 1859 | struct sem *curr; |
| 1954 | curr = &sma->sem_base[sops->sem_num]; | 1860 | curr = &sma->sem_base[sops->sem_num]; |
| @@ -1977,10 +1883,10 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1977 | sma->complex_count++; | 1883 | sma->complex_count++; |
| 1978 | } | 1884 | } |
| 1979 | 1885 | ||
| 1886 | sleep_again: | ||
| 1980 | queue.status = -EINTR; | 1887 | queue.status = -EINTR; |
| 1981 | queue.sleeper = current; | 1888 | queue.sleeper = current; |
| 1982 | 1889 | ||
| 1983 | sleep_again: | ||
| 1984 | __set_current_state(TASK_INTERRUPTIBLE); | 1890 | __set_current_state(TASK_INTERRUPTIBLE); |
| 1985 | sem_unlock(sma, locknum); | 1891 | sem_unlock(sma, locknum); |
| 1986 | rcu_read_unlock(); | 1892 | rcu_read_unlock(); |
| @@ -1990,28 +1896,31 @@ sleep_again: | |||
| 1990 | else | 1896 | else |
| 1991 | schedule(); | 1897 | schedule(); |
| 1992 | 1898 | ||
| 1993 | error = get_queue_result(&queue); | 1899 | /* |
| 1994 | 1900 | * fastpath: the semop has completed, either successfully or not, from | |
| 1901 | * the syscall pov, is quite irrelevant to us at this point; we're done. | ||
| 1902 | * | ||
| 1903 | * We _do_ care, nonetheless, about being awoken by a signal or | ||
| 1904 | * spuriously. The queue.status is checked again in the slowpath (aka | ||
| 1905 | * after taking sem_lock), such that we can detect scenarios where we | ||
| 1906 | * were awakened externally, during the window between wake_q_add() and | ||
| 1907 | * wake_up_q(). | ||
| 1908 | */ | ||
| 1909 | error = READ_ONCE(queue.status); | ||
| 1995 | if (error != -EINTR) { | 1910 | if (error != -EINTR) { |
| 1996 | /* fast path: update_queue already obtained all requested | 1911 | /* |
| 1997 | * resources. | 1912 | * User space could assume that semop() is a memory barrier: |
| 1998 | * Perform a smp_mb(): User space could assume that semop() | 1913 | * Without the mb(), the cpu could speculatively read in user |
| 1999 | * is a memory barrier: Without the mb(), the cpu could | 1914 | * space stale data that was overwritten by the previous owner |
| 2000 | * speculatively read in user space stale data that was | 1915 | * of the semaphore. |
| 2001 | * overwritten by the previous owner of the semaphore. | ||
| 2002 | */ | 1916 | */ |
| 2003 | smp_mb(); | 1917 | smp_mb(); |
| 2004 | |||
| 2005 | goto out_free; | 1918 | goto out_free; |
| 2006 | } | 1919 | } |
| 2007 | 1920 | ||
| 2008 | rcu_read_lock(); | 1921 | rcu_read_lock(); |
| 2009 | sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum); | 1922 | sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum); |
| 2010 | 1923 | error = READ_ONCE(queue.status); | |
| 2011 | /* | ||
| 2012 | * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing. | ||
| 2013 | */ | ||
| 2014 | error = get_queue_result(&queue); | ||
| 2015 | 1924 | ||
| 2016 | /* | 1925 | /* |
| 2017 | * Array removed? If yes, leave without sem_unlock(). | 1926 | * Array removed? If yes, leave without sem_unlock(). |
| @@ -2021,7 +1930,6 @@ sleep_again: | |||
| 2021 | goto out_free; | 1930 | goto out_free; |
| 2022 | } | 1931 | } |
| 2023 | 1932 | ||
| 2024 | |||
| 2025 | /* | 1933 | /* |
| 2026 | * If queue.status != -EINTR we are woken up by another process. | 1934 | * If queue.status != -EINTR we are woken up by another process. |
| 2027 | * Leave without unlink_queue(), but with sem_unlock(). | 1935 | * Leave without unlink_queue(), but with sem_unlock(). |
| @@ -2030,13 +1938,13 @@ sleep_again: | |||
| 2030 | goto out_unlock_free; | 1938 | goto out_unlock_free; |
| 2031 | 1939 | ||
| 2032 | /* | 1940 | /* |
| 2033 | * If an interrupt occurred we have to clean up the queue | 1941 | * If an interrupt occurred we have to clean up the queue. |
| 2034 | */ | 1942 | */ |
| 2035 | if (timeout && jiffies_left == 0) | 1943 | if (timeout && jiffies_left == 0) |
| 2036 | error = -EAGAIN; | 1944 | error = -EAGAIN; |
| 2037 | 1945 | ||
| 2038 | /* | 1946 | /* |
| 2039 | * If the wakeup was spurious, just retry | 1947 | * If the wakeup was spurious, just retry. |
| 2040 | */ | 1948 | */ |
| 2041 | if (error == -EINTR && !signal_pending(current)) | 1949 | if (error == -EINTR && !signal_pending(current)) |
| 2042 | goto sleep_again; | 1950 | goto sleep_again; |
| @@ -2046,7 +1954,6 @@ sleep_again: | |||
| 2046 | out_unlock_free: | 1954 | out_unlock_free: |
| 2047 | sem_unlock(sma, locknum); | 1955 | sem_unlock(sma, locknum); |
| 2048 | rcu_read_unlock(); | 1956 | rcu_read_unlock(); |
| 2049 | wake_up_sem_queue_do(&tasks); | ||
| 2050 | out_free: | 1957 | out_free: |
| 2051 | if (sops != fast_sops) | 1958 | if (sops != fast_sops) |
| 2052 | kfree(sops); | 1959 | kfree(sops); |
| @@ -2107,8 +2014,8 @@ void exit_sem(struct task_struct *tsk) | |||
| 2107 | for (;;) { | 2014 | for (;;) { |
| 2108 | struct sem_array *sma; | 2015 | struct sem_array *sma; |
| 2109 | struct sem_undo *un; | 2016 | struct sem_undo *un; |
| 2110 | struct list_head tasks; | ||
| 2111 | int semid, i; | 2017 | int semid, i; |
| 2018 | DEFINE_WAKE_Q(wake_q); | ||
| 2112 | 2019 | ||
| 2113 | cond_resched(); | 2020 | cond_resched(); |
| 2114 | 2021 | ||
| @@ -2196,11 +2103,10 @@ void exit_sem(struct task_struct *tsk) | |||
| 2196 | } | 2103 | } |
| 2197 | } | 2104 | } |
| 2198 | /* maybe some queued-up processes were waiting for this */ | 2105 | /* maybe some queued-up processes were waiting for this */ |
| 2199 | INIT_LIST_HEAD(&tasks); | 2106 | do_smart_update(sma, NULL, 0, 1, &wake_q); |
| 2200 | do_smart_update(sma, NULL, 0, 1, &tasks); | ||
| 2201 | sem_unlock(sma, -1); | 2107 | sem_unlock(sma, -1); |
| 2202 | rcu_read_unlock(); | 2108 | rcu_read_unlock(); |
| 2203 | wake_up_sem_queue_do(&tasks); | 2109 | wake_up_q(&wake_q); |
| 2204 | 2110 | ||
| 2205 | kfree_rcu(un, rcu); | 2111 | kfree_rcu(un, rcu); |
| 2206 | } | 2112 | } |
