aboutsummaryrefslogtreecommitdiffstats
path: root/ipc
diff options
context:
space:
mode:
authorDavidlohr Bueso <dave@stgolabs.net>2016-12-14 18:06:34 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2016-12-14 19:04:08 -0500
commit9ae949fa382b080170f9d3c8bd9dea951cf52ee7 (patch)
treeaf701403e6bb61d49b67533beff9239ed0bec1b2 /ipc
parent248e7357cf8ec50bdaca68dce7c488ce843b6b93 (diff)
ipc/sem: rework task wakeups
Our sysv sems have been using the notion of lockless wakeups for a while, ever since commit 0a2b9d4c7967 ("ipc/sem.c: move wake_up_process out of the spinlock section"), in order to reduce the sem_lock hold times. This in-house pending queue can be replaced by wake_q (just like all the rest of ipc now), in that it provides the following advantages: o Simplifies and gets rid of unnecessary code. o We get rid of the IN_WAKEUP complexities. Given that wake_q_add() grabs reference to the task, if awoken due to an unrelated event, between the wake_q_add() and wake_up_q() window, we cannot race with sys_exit and the imminent call to wake_up_process(). o By not spinning IN_WAKEUP, we no longer need to disable preemption. In consequence, the wakeup paths (after schedule(), that is) must acknowledge an external signal/event, as well spurious wakeup occurring during the pending wakeup window. Obviously no changes in semantics that could be visible to the user. The fastpath is _only_ for when we know for sure that we were awoken due to a the waker's successful semop call (queue.status is not -EINTR). On a 48-core Haswell, running the ipcscale 'waitforzero' test, the following is seen with increasing thread counts: v4.8-rc5 v4.8-rc5 semopv2 Hmean sembench-sem-2 574733.00 ( 0.00%) 578322.00 ( 0.62%) Hmean sembench-sem-8 811708.00 ( 0.00%) 824689.00 ( 1.59%) Hmean sembench-sem-12 842448.00 ( 0.00%) 845409.00 ( 0.35%) Hmean sembench-sem-21 933003.00 ( 0.00%) 977748.00 ( 4.80%) Hmean sembench-sem-48 935910.00 ( 0.00%) 1004759.00 ( 7.36%) Hmean sembench-sem-79 937186.00 ( 0.00%) 983976.00 ( 4.99%) Hmean sembench-sem-234 974256.00 ( 0.00%) 1060294.00 ( 8.83%) Hmean sembench-sem-265 975468.00 ( 0.00%) 1016243.00 ( 4.18%) Hmean sembench-sem-296 991280.00 ( 0.00%) 1042659.00 ( 5.18%) Hmean sembench-sem-327 975415.00 ( 0.00%) 1029977.00 ( 5.59%) Hmean sembench-sem-358 1014286.00 ( 0.00%) 1049624.00 ( 3.48%) Hmean sembench-sem-389 972939.00 ( 0.00%) 1043127.00 ( 7.21%) Hmean sembench-sem-420 981909.00 ( 0.00%) 1056747.00 ( 7.62%) Hmean sembench-sem-451 990139.00 ( 0.00%) 1051609.00 ( 6.21%) Hmean sembench-sem-482 965735.00 ( 0.00%) 1040313.00 ( 7.72%) [akpm@linux-foundation.org: coding-style fixes] [sfr@canb.auug.org.au: merge fix for WAKE_Q to DEFINE_WAKE_Q rename] Link: http://lkml.kernel.org/r/20161122210410.5eca9fc2@canb.auug.org.au Link: http://lkml.kernel.org/r/1474225896-10066-3-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso <dbueso@suse.de> Acked-by: Manfred Spraul <manfred@colorfullife.com> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'ipc')
-rw-r--r--ipc/sem.c266
1 files changed, 86 insertions, 180 deletions
diff --git a/ipc/sem.c b/ipc/sem.c
index b33dd98fde69..1ff103299c28 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -11,6 +11,7 @@
11 * (c) 2001 Red Hat Inc 11 * (c) 2001 Red Hat Inc
12 * Lockless wakeup 12 * Lockless wakeup
13 * (c) 2003 Manfred Spraul <manfred@colorfullife.com> 13 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
14 * (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
14 * Further wakeup optimizations, documentation 15 * Further wakeup optimizations, documentation
15 * (c) 2010 Manfred Spraul <manfred@colorfullife.com> 16 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
16 * 17 *
@@ -53,15 +54,11 @@
53 * Semaphores are actively given to waiting tasks (necessary for FIFO). 54 * Semaphores are actively given to waiting tasks (necessary for FIFO).
54 * (see update_queue()) 55 * (see update_queue())
55 * - To improve the scalability, the actual wake-up calls are performed after 56 * - To improve the scalability, the actual wake-up calls are performed after
56 * dropping all locks. (see wake_up_sem_queue_prepare(), 57 * dropping all locks. (see wake_up_sem_queue_prepare())
57 * wake_up_sem_queue_do())
58 * - All work is done by the waker, the woken up task does not have to do 58 * - All work is done by the waker, the woken up task does not have to do
59 * anything - not even acquiring a lock or dropping a refcount. 59 * anything - not even acquiring a lock or dropping a refcount.
60 * - A woken up task may not even touch the semaphore array anymore, it may 60 * - A woken up task may not even touch the semaphore array anymore, it may
61 * have been destroyed already by a semctl(RMID). 61 * have been destroyed already by a semctl(RMID).
62 * - The synchronizations between wake-ups due to a timeout/signal and a
63 * wake-up due to a completed semaphore operation is achieved by using an
64 * intermediate state (IN_WAKEUP).
65 * - UNDO values are stored in an array (one per process and per 62 * - UNDO values are stored in an array (one per process and per
66 * semaphore array, lazily allocated). For backwards compatibility, multiple 63 * semaphore array, lazily allocated). For backwards compatibility, multiple
67 * modes for the UNDO variables are supported (per process, per thread) 64 * modes for the UNDO variables are supported (per process, per thread)
@@ -471,40 +468,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
471 ipc_rmid(&sem_ids(ns), &s->sem_perm); 468 ipc_rmid(&sem_ids(ns), &s->sem_perm);
472} 469}
473 470
474/*
475 * Lockless wakeup algorithm:
476 * Without the check/retry algorithm a lockless wakeup is possible:
477 * - queue.status is initialized to -EINTR before blocking.
478 * - wakeup is performed by
479 * * unlinking the queue entry from the pending list
480 * * setting queue.status to IN_WAKEUP
481 * This is the notification for the blocked thread that a
482 * result value is imminent.
483 * * call wake_up_process
484 * * set queue.status to the final value.
485 * - the previously blocked thread checks queue.status:
486 * * if it's IN_WAKEUP, then it must wait until the value changes
487 * * if it's not -EINTR, then the operation was completed by
488 * update_queue. semtimedop can return queue.status without
489 * performing any operation on the sem array.
490 * * otherwise it must acquire the spinlock and check what's up.
491 *
492 * The two-stage algorithm is necessary to protect against the following
493 * races:
494 * - if queue.status is set after wake_up_process, then the woken up idle
495 * thread could race forward and try (and fail) to acquire sma->lock
496 * before update_queue had a chance to set queue.status
497 * - if queue.status is written before wake_up_process and if the
498 * blocked process is woken up by a signal between writing
499 * queue.status and the wake_up_process, then the woken up
500 * process could return from semtimedop and die by calling
501 * sys_exit before wake_up_process is called. Then wake_up_process
502 * will oops, because the task structure is already invalid.
503 * (yes, this happened on s390 with sysv msg).
504 *
505 */
506#define IN_WAKEUP 1
507
508/** 471/**
509 * newary - Create a new semaphore set 472 * newary - Create a new semaphore set
510 * @ns: namespace 473 * @ns: namespace
@@ -703,51 +666,18 @@ undo:
703 return result; 666 return result;
704} 667}
705 668
706/** wake_up_sem_queue_prepare(q, error): Prepare wake-up 669static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
707 * @q: queue entry that must be signaled 670 struct wake_q_head *wake_q)
708 * @error: Error value for the signal
709 *
710 * Prepare the wake-up of the queue entry q.
711 */
712static void wake_up_sem_queue_prepare(struct list_head *pt,
713 struct sem_queue *q, int error)
714{
715 if (list_empty(pt)) {
716 /*
717 * Hold preempt off so that we don't get preempted and have the
718 * wakee busy-wait until we're scheduled back on.
719 */
720 preempt_disable();
721 }
722 q->status = IN_WAKEUP;
723 q->pid = error;
724
725 list_add_tail(&q->list, pt);
726}
727
728/**
729 * wake_up_sem_queue_do - do the actual wake-up
730 * @pt: list of tasks to be woken up
731 *
732 * Do the actual wake-up.
733 * The function is called without any locks held, thus the semaphore array
734 * could be destroyed already and the tasks can disappear as soon as the
735 * status is set to the actual return code.
736 */
737static void wake_up_sem_queue_do(struct list_head *pt)
738{ 671{
739 struct sem_queue *q, *t; 672 wake_q_add(wake_q, q->sleeper);
740 int did_something; 673 /*
741 674 * Rely on the above implicit barrier, such that we can
742 did_something = !list_empty(pt); 675 * ensure that we hold reference to the task before setting
743 list_for_each_entry_safe(q, t, pt, list) { 676 * q->status. Otherwise we could race with do_exit if the
744 wake_up_process(q->sleeper); 677 * task is awoken by an external event before calling
745 /* q can disappear immediately after writing q->status. */ 678 * wake_up_process().
746 smp_wmb(); 679 */
747 q->status = q->pid; 680 WRITE_ONCE(q->status, error);
748 }
749 if (did_something)
750 preempt_enable();
751} 681}
752 682
753static void unlink_queue(struct sem_array *sma, struct sem_queue *q) 683static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
@@ -795,18 +725,18 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q)
795 * wake_const_ops - wake up non-alter tasks 725 * wake_const_ops - wake up non-alter tasks
796 * @sma: semaphore array. 726 * @sma: semaphore array.
797 * @semnum: semaphore that was modified. 727 * @semnum: semaphore that was modified.
798 * @pt: list head for the tasks that must be woken up. 728 * @wake_q: lockless wake-queue head.
799 * 729 *
800 * wake_const_ops must be called after a semaphore in a semaphore array 730 * wake_const_ops must be called after a semaphore in a semaphore array
801 * was set to 0. If complex const operations are pending, wake_const_ops must 731 * was set to 0. If complex const operations are pending, wake_const_ops must
802 * be called with semnum = -1, as well as with the number of each modified 732 * be called with semnum = -1, as well as with the number of each modified
803 * semaphore. 733 * semaphore.
804 * The tasks that must be woken up are added to @pt. The return code 734 * The tasks that must be woken up are added to @wake_q. The return code
805 * is stored in q->pid. 735 * is stored in q->pid.
806 * The function returns 1 if at least one operation was completed successfully. 736 * The function returns 1 if at least one operation was completed successfully.
807 */ 737 */
808static int wake_const_ops(struct sem_array *sma, int semnum, 738static int wake_const_ops(struct sem_array *sma, int semnum,
809 struct list_head *pt) 739 struct wake_q_head *wake_q)
810{ 740{
811 struct sem_queue *q; 741 struct sem_queue *q;
812 struct list_head *walk; 742 struct list_head *walk;
@@ -832,7 +762,7 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
832 762
833 unlink_queue(sma, q); 763 unlink_queue(sma, q);
834 764
835 wake_up_sem_queue_prepare(pt, q, error); 765 wake_up_sem_queue_prepare(q, error, wake_q);
836 if (error == 0) 766 if (error == 0)
837 semop_completed = 1; 767 semop_completed = 1;
838 } 768 }
@@ -845,14 +775,14 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
845 * @sma: semaphore array 775 * @sma: semaphore array
846 * @sops: operations that were performed 776 * @sops: operations that were performed
847 * @nsops: number of operations 777 * @nsops: number of operations
848 * @pt: list head of the tasks that must be woken up. 778 * @wake_q: lockless wake-queue head
849 * 779 *
850 * Checks all required queue for wait-for-zero operations, based 780 * Checks all required queue for wait-for-zero operations, based
851 * on the actual changes that were performed on the semaphore array. 781 * on the actual changes that were performed on the semaphore array.
852 * The function returns 1 if at least one operation was completed successfully. 782 * The function returns 1 if at least one operation was completed successfully.
853 */ 783 */
854static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, 784static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
855 int nsops, struct list_head *pt) 785 int nsops, struct wake_q_head *wake_q)
856{ 786{
857 int i; 787 int i;
858 int semop_completed = 0; 788 int semop_completed = 0;
@@ -865,7 +795,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
865 795
866 if (sma->sem_base[num].semval == 0) { 796 if (sma->sem_base[num].semval == 0) {
867 got_zero = 1; 797 got_zero = 1;
868 semop_completed |= wake_const_ops(sma, num, pt); 798 semop_completed |= wake_const_ops(sma, num, wake_q);
869 } 799 }
870 } 800 }
871 } else { 801 } else {
@@ -876,7 +806,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
876 for (i = 0; i < sma->sem_nsems; i++) { 806 for (i = 0; i < sma->sem_nsems; i++) {
877 if (sma->sem_base[i].semval == 0) { 807 if (sma->sem_base[i].semval == 0) {
878 got_zero = 1; 808 got_zero = 1;
879 semop_completed |= wake_const_ops(sma, i, pt); 809 semop_completed |= wake_const_ops(sma, i, wake_q);
880 } 810 }
881 } 811 }
882 } 812 }
@@ -885,7 +815,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
885 * then check the global queue, too. 815 * then check the global queue, too.
886 */ 816 */
887 if (got_zero) 817 if (got_zero)
888 semop_completed |= wake_const_ops(sma, -1, pt); 818 semop_completed |= wake_const_ops(sma, -1, wake_q);
889 819
890 return semop_completed; 820 return semop_completed;
891} 821}
@@ -895,19 +825,19 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
895 * update_queue - look for tasks that can be completed. 825 * update_queue - look for tasks that can be completed.
896 * @sma: semaphore array. 826 * @sma: semaphore array.
897 * @semnum: semaphore that was modified. 827 * @semnum: semaphore that was modified.
898 * @pt: list head for the tasks that must be woken up. 828 * @wake_q: lockless wake-queue head.
899 * 829 *
900 * update_queue must be called after a semaphore in a semaphore array 830 * update_queue must be called after a semaphore in a semaphore array
901 * was modified. If multiple semaphores were modified, update_queue must 831 * was modified. If multiple semaphores were modified, update_queue must
902 * be called with semnum = -1, as well as with the number of each modified 832 * be called with semnum = -1, as well as with the number of each modified
903 * semaphore. 833 * semaphore.
904 * The tasks that must be woken up are added to @pt. The return code 834 * The tasks that must be woken up are added to @wake_q. The return code
905 * is stored in q->pid. 835 * is stored in q->pid.
906 * The function internally checks if const operations can now succeed. 836 * The function internally checks if const operations can now succeed.
907 * 837 *
908 * The function return 1 if at least one semop was completed successfully. 838 * The function return 1 if at least one semop was completed successfully.
909 */ 839 */
910static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) 840static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q)
911{ 841{
912 struct sem_queue *q; 842 struct sem_queue *q;
913 struct list_head *walk; 843 struct list_head *walk;
@@ -949,11 +879,11 @@ again:
949 restart = 0; 879 restart = 0;
950 } else { 880 } else {
951 semop_completed = 1; 881 semop_completed = 1;
952 do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); 882 do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q);
953 restart = check_restart(sma, q); 883 restart = check_restart(sma, q);
954 } 884 }
955 885
956 wake_up_sem_queue_prepare(pt, q, error); 886 wake_up_sem_queue_prepare(q, error, wake_q);
957 if (restart) 887 if (restart)
958 goto again; 888 goto again;
959 } 889 }
@@ -984,24 +914,24 @@ static void set_semotime(struct sem_array *sma, struct sembuf *sops)
984 * @sops: operations that were performed 914 * @sops: operations that were performed
985 * @nsops: number of operations 915 * @nsops: number of operations
986 * @otime: force setting otime 916 * @otime: force setting otime
987 * @pt: list head of the tasks that must be woken up. 917 * @wake_q: lockless wake-queue head
988 * 918 *
989 * do_smart_update() does the required calls to update_queue and wakeup_zero, 919 * do_smart_update() does the required calls to update_queue and wakeup_zero,
990 * based on the actual changes that were performed on the semaphore array. 920 * based on the actual changes that were performed on the semaphore array.
991 * Note that the function does not do the actual wake-up: the caller is 921 * Note that the function does not do the actual wake-up: the caller is
992 * responsible for calling wake_up_sem_queue_do(@pt). 922 * responsible for calling wake_up_q().
993 * It is safe to perform this call after dropping all locks. 923 * It is safe to perform this call after dropping all locks.
994 */ 924 */
995static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, 925static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
996 int otime, struct list_head *pt) 926 int otime, struct wake_q_head *wake_q)
997{ 927{
998 int i; 928 int i;
999 929
1000 otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); 930 otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q);
1001 931
1002 if (!list_empty(&sma->pending_alter)) { 932 if (!list_empty(&sma->pending_alter)) {
1003 /* semaphore array uses the global queue - just process it. */ 933 /* semaphore array uses the global queue - just process it. */
1004 otime |= update_queue(sma, -1, pt); 934 otime |= update_queue(sma, -1, wake_q);
1005 } else { 935 } else {
1006 if (!sops) { 936 if (!sops) {
1007 /* 937 /*
@@ -1009,7 +939,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
1009 * known. Check all. 939 * known. Check all.
1010 */ 940 */
1011 for (i = 0; i < sma->sem_nsems; i++) 941 for (i = 0; i < sma->sem_nsems; i++)
1012 otime |= update_queue(sma, i, pt); 942 otime |= update_queue(sma, i, wake_q);
1013 } else { 943 } else {
1014 /* 944 /*
1015 * Check the semaphores that were increased: 945 * Check the semaphores that were increased:
@@ -1023,7 +953,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
1023 for (i = 0; i < nsops; i++) { 953 for (i = 0; i < nsops; i++) {
1024 if (sops[i].sem_op > 0) { 954 if (sops[i].sem_op > 0) {
1025 otime |= update_queue(sma, 955 otime |= update_queue(sma,
1026 sops[i].sem_num, pt); 956 sops[i].sem_num, wake_q);
1027 } 957 }
1028 } 958 }
1029 } 959 }
@@ -1111,8 +1041,8 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1111 struct sem_undo *un, *tu; 1041 struct sem_undo *un, *tu;
1112 struct sem_queue *q, *tq; 1042 struct sem_queue *q, *tq;
1113 struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); 1043 struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
1114 struct list_head tasks;
1115 int i; 1044 int i;
1045 DEFINE_WAKE_Q(wake_q);
1116 1046
1117 /* Free the existing undo structures for this semaphore set. */ 1047 /* Free the existing undo structures for this semaphore set. */
1118 ipc_assert_locked_object(&sma->sem_perm); 1048 ipc_assert_locked_object(&sma->sem_perm);
@@ -1126,25 +1056,24 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1126 } 1056 }
1127 1057
1128 /* Wake up all pending processes and let them fail with EIDRM. */ 1058 /* Wake up all pending processes and let them fail with EIDRM. */
1129 INIT_LIST_HEAD(&tasks);
1130 list_for_each_entry_safe(q, tq, &sma->pending_const, list) { 1059 list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
1131 unlink_queue(sma, q); 1060 unlink_queue(sma, q);
1132 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1061 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1133 } 1062 }
1134 1063
1135 list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { 1064 list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
1136 unlink_queue(sma, q); 1065 unlink_queue(sma, q);
1137 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1066 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1138 } 1067 }
1139 for (i = 0; i < sma->sem_nsems; i++) { 1068 for (i = 0; i < sma->sem_nsems; i++) {
1140 struct sem *sem = sma->sem_base + i; 1069 struct sem *sem = sma->sem_base + i;
1141 list_for_each_entry_safe(q, tq, &sem->pending_const, list) { 1070 list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
1142 unlink_queue(sma, q); 1071 unlink_queue(sma, q);
1143 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1072 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1144 } 1073 }
1145 list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { 1074 list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
1146 unlink_queue(sma, q); 1075 unlink_queue(sma, q);
1147 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1076 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1148 } 1077 }
1149 } 1078 }
1150 1079
@@ -1153,7 +1082,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1153 sem_unlock(sma, -1); 1082 sem_unlock(sma, -1);
1154 rcu_read_unlock(); 1083 rcu_read_unlock();
1155 1084
1156 wake_up_sem_queue_do(&tasks); 1085 wake_up_q(&wake_q);
1157 ns->used_sems -= sma->sem_nsems; 1086 ns->used_sems -= sma->sem_nsems;
1158 ipc_rcu_putref(sma, sem_rcu_free); 1087 ipc_rcu_putref(sma, sem_rcu_free);
1159} 1088}
@@ -1292,9 +1221,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1292 struct sem_undo *un; 1221 struct sem_undo *un;
1293 struct sem_array *sma; 1222 struct sem_array *sma;
1294 struct sem *curr; 1223 struct sem *curr;
1295 int err; 1224 int err, val;
1296 struct list_head tasks; 1225 DEFINE_WAKE_Q(wake_q);
1297 int val; 1226
1298#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) 1227#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
1299 /* big-endian 64bit */ 1228 /* big-endian 64bit */
1300 val = arg >> 32; 1229 val = arg >> 32;
@@ -1306,8 +1235,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1306 if (val > SEMVMX || val < 0) 1235 if (val > SEMVMX || val < 0)
1307 return -ERANGE; 1236 return -ERANGE;
1308 1237
1309 INIT_LIST_HEAD(&tasks);
1310
1311 rcu_read_lock(); 1238 rcu_read_lock();
1312 sma = sem_obtain_object_check(ns, semid); 1239 sma = sem_obtain_object_check(ns, semid);
1313 if (IS_ERR(sma)) { 1240 if (IS_ERR(sma)) {
@@ -1350,10 +1277,10 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1350 curr->sempid = task_tgid_vnr(current); 1277 curr->sempid = task_tgid_vnr(current);
1351 sma->sem_ctime = get_seconds(); 1278 sma->sem_ctime = get_seconds();
1352 /* maybe some queued-up processes were waiting for this */ 1279 /* maybe some queued-up processes were waiting for this */
1353 do_smart_update(sma, NULL, 0, 0, &tasks); 1280 do_smart_update(sma, NULL, 0, 0, &wake_q);
1354 sem_unlock(sma, -1); 1281 sem_unlock(sma, -1);
1355 rcu_read_unlock(); 1282 rcu_read_unlock();
1356 wake_up_sem_queue_do(&tasks); 1283 wake_up_q(&wake_q);
1357 return 0; 1284 return 0;
1358} 1285}
1359 1286
@@ -1365,9 +1292,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
1365 int err, nsems; 1292 int err, nsems;
1366 ushort fast_sem_io[SEMMSL_FAST]; 1293 ushort fast_sem_io[SEMMSL_FAST];
1367 ushort *sem_io = fast_sem_io; 1294 ushort *sem_io = fast_sem_io;
1368 struct list_head tasks; 1295 DEFINE_WAKE_Q(wake_q);
1369
1370 INIT_LIST_HEAD(&tasks);
1371 1296
1372 rcu_read_lock(); 1297 rcu_read_lock();
1373 sma = sem_obtain_object_check(ns, semid); 1298 sma = sem_obtain_object_check(ns, semid);
@@ -1478,7 +1403,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
1478 } 1403 }
1479 sma->sem_ctime = get_seconds(); 1404 sma->sem_ctime = get_seconds();
1480 /* maybe some queued-up processes were waiting for this */ 1405 /* maybe some queued-up processes were waiting for this */
1481 do_smart_update(sma, NULL, 0, 0, &tasks); 1406 do_smart_update(sma, NULL, 0, 0, &wake_q);
1482 err = 0; 1407 err = 0;
1483 goto out_unlock; 1408 goto out_unlock;
1484 } 1409 }
@@ -1514,7 +1439,7 @@ out_unlock:
1514 sem_unlock(sma, -1); 1439 sem_unlock(sma, -1);
1515out_rcu_wakeup: 1440out_rcu_wakeup:
1516 rcu_read_unlock(); 1441 rcu_read_unlock();
1517 wake_up_sem_queue_do(&tasks); 1442 wake_up_q(&wake_q);
1518out_free: 1443out_free:
1519 if (sem_io != fast_sem_io) 1444 if (sem_io != fast_sem_io)
1520 ipc_free(sem_io); 1445 ipc_free(sem_io);
@@ -1787,32 +1712,6 @@ out:
1787 return un; 1712 return un;
1788} 1713}
1789 1714
1790
1791/**
1792 * get_queue_result - retrieve the result code from sem_queue
1793 * @q: Pointer to queue structure
1794 *
1795 * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
1796 * q->status, then we must loop until the value is replaced with the final
1797 * value: This may happen if a task is woken up by an unrelated event (e.g.
1798 * signal) and in parallel the task is woken up by another task because it got
1799 * the requested semaphores.
1800 *
1801 * The function can be called with or without holding the semaphore spinlock.
1802 */
1803static int get_queue_result(struct sem_queue *q)
1804{
1805 int error;
1806
1807 error = q->status;
1808 while (unlikely(error == IN_WAKEUP)) {
1809 cpu_relax();
1810 error = q->status;
1811 }
1812
1813 return error;
1814}
1815
1816SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, 1715SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1817 unsigned, nsops, const struct timespec __user *, timeout) 1716 unsigned, nsops, const struct timespec __user *, timeout)
1818{ 1717{
@@ -1825,7 +1724,6 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1825 struct sem_queue queue; 1724 struct sem_queue queue;
1826 unsigned long jiffies_left = 0; 1725 unsigned long jiffies_left = 0;
1827 struct ipc_namespace *ns; 1726 struct ipc_namespace *ns;
1828 struct list_head tasks;
1829 1727
1830 ns = current->nsproxy->ipc_ns; 1728 ns = current->nsproxy->ipc_ns;
1831 1729
@@ -1865,7 +1763,6 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1865 alter = 1; 1763 alter = 1;
1866 } 1764 }
1867 1765
1868 INIT_LIST_HEAD(&tasks);
1869 1766
1870 if (undos) { 1767 if (undos) {
1871 /* On success, find_alloc_undo takes the rcu_read_lock */ 1768 /* On success, find_alloc_undo takes the rcu_read_lock */
@@ -1933,22 +1830,31 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1933 queue.alter = alter; 1830 queue.alter = alter;
1934 1831
1935 error = perform_atomic_semop(sma, &queue); 1832 error = perform_atomic_semop(sma, &queue);
1936 if (error == 0) { 1833 if (error == 0) { /* non-blocking succesfull path */
1937 /* If the operation was successful, then do 1834 DEFINE_WAKE_Q(wake_q);
1835
1836 /*
1837 * If the operation was successful, then do
1938 * the required updates. 1838 * the required updates.
1939 */ 1839 */
1940 if (alter) 1840 if (alter)
1941 do_smart_update(sma, sops, nsops, 1, &tasks); 1841 do_smart_update(sma, sops, nsops, 1, &wake_q);
1942 else 1842 else
1943 set_semotime(sma, sops); 1843 set_semotime(sma, sops);
1844
1845 sem_unlock(sma, locknum);
1846 rcu_read_unlock();
1847 wake_up_q(&wake_q);
1848
1849 goto out_free;
1944 } 1850 }
1945 if (error <= 0) 1851 if (error < 0) /* non-blocking error path */
1946 goto out_unlock_free; 1852 goto out_unlock_free;
1947 1853
1948 /* We need to sleep on this operation, so we put the current 1854 /*
1855 * We need to sleep on this operation, so we put the current
1949 * task into the pending queue and go to sleep. 1856 * task into the pending queue and go to sleep.
1950 */ 1857 */
1951
1952 if (nsops == 1) { 1858 if (nsops == 1) {
1953 struct sem *curr; 1859 struct sem *curr;
1954 curr = &sma->sem_base[sops->sem_num]; 1860 curr = &sma->sem_base[sops->sem_num];
@@ -1977,10 +1883,10 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1977 sma->complex_count++; 1883 sma->complex_count++;
1978 } 1884 }
1979 1885
1886sleep_again:
1980 queue.status = -EINTR; 1887 queue.status = -EINTR;
1981 queue.sleeper = current; 1888 queue.sleeper = current;
1982 1889
1983sleep_again:
1984 __set_current_state(TASK_INTERRUPTIBLE); 1890 __set_current_state(TASK_INTERRUPTIBLE);
1985 sem_unlock(sma, locknum); 1891 sem_unlock(sma, locknum);
1986 rcu_read_unlock(); 1892 rcu_read_unlock();
@@ -1990,28 +1896,31 @@ sleep_again:
1990 else 1896 else
1991 schedule(); 1897 schedule();
1992 1898
1993 error = get_queue_result(&queue); 1899 /*
1994 1900 * fastpath: the semop has completed, either successfully or not, from
1901 * the syscall pov, is quite irrelevant to us at this point; we're done.
1902 *
1903 * We _do_ care, nonetheless, about being awoken by a signal or
1904 * spuriously. The queue.status is checked again in the slowpath (aka
1905 * after taking sem_lock), such that we can detect scenarios where we
1906 * were awakened externally, during the window between wake_q_add() and
1907 * wake_up_q().
1908 */
1909 error = READ_ONCE(queue.status);
1995 if (error != -EINTR) { 1910 if (error != -EINTR) {
1996 /* fast path: update_queue already obtained all requested 1911 /*
1997 * resources. 1912 * User space could assume that semop() is a memory barrier:
1998 * Perform a smp_mb(): User space could assume that semop() 1913 * Without the mb(), the cpu could speculatively read in user
1999 * is a memory barrier: Without the mb(), the cpu could 1914 * space stale data that was overwritten by the previous owner
2000 * speculatively read in user space stale data that was 1915 * of the semaphore.
2001 * overwritten by the previous owner of the semaphore.
2002 */ 1916 */
2003 smp_mb(); 1917 smp_mb();
2004
2005 goto out_free; 1918 goto out_free;
2006 } 1919 }
2007 1920
2008 rcu_read_lock(); 1921 rcu_read_lock();
2009 sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum); 1922 sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum);
2010 1923 error = READ_ONCE(queue.status);
2011 /*
2012 * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing.
2013 */
2014 error = get_queue_result(&queue);
2015 1924
2016 /* 1925 /*
2017 * Array removed? If yes, leave without sem_unlock(). 1926 * Array removed? If yes, leave without sem_unlock().
@@ -2021,7 +1930,6 @@ sleep_again:
2021 goto out_free; 1930 goto out_free;
2022 } 1931 }
2023 1932
2024
2025 /* 1933 /*
2026 * If queue.status != -EINTR we are woken up by another process. 1934 * If queue.status != -EINTR we are woken up by another process.
2027 * Leave without unlink_queue(), but with sem_unlock(). 1935 * Leave without unlink_queue(), but with sem_unlock().
@@ -2030,13 +1938,13 @@ sleep_again:
2030 goto out_unlock_free; 1938 goto out_unlock_free;
2031 1939
2032 /* 1940 /*
2033 * If an interrupt occurred we have to clean up the queue 1941 * If an interrupt occurred we have to clean up the queue.
2034 */ 1942 */
2035 if (timeout && jiffies_left == 0) 1943 if (timeout && jiffies_left == 0)
2036 error = -EAGAIN; 1944 error = -EAGAIN;
2037 1945
2038 /* 1946 /*
2039 * If the wakeup was spurious, just retry 1947 * If the wakeup was spurious, just retry.
2040 */ 1948 */
2041 if (error == -EINTR && !signal_pending(current)) 1949 if (error == -EINTR && !signal_pending(current))
2042 goto sleep_again; 1950 goto sleep_again;
@@ -2046,7 +1954,6 @@ sleep_again:
2046out_unlock_free: 1954out_unlock_free:
2047 sem_unlock(sma, locknum); 1955 sem_unlock(sma, locknum);
2048 rcu_read_unlock(); 1956 rcu_read_unlock();
2049 wake_up_sem_queue_do(&tasks);
2050out_free: 1957out_free:
2051 if (sops != fast_sops) 1958 if (sops != fast_sops)
2052 kfree(sops); 1959 kfree(sops);
@@ -2107,8 +2014,8 @@ void exit_sem(struct task_struct *tsk)
2107 for (;;) { 2014 for (;;) {
2108 struct sem_array *sma; 2015 struct sem_array *sma;
2109 struct sem_undo *un; 2016 struct sem_undo *un;
2110 struct list_head tasks;
2111 int semid, i; 2017 int semid, i;
2018 DEFINE_WAKE_Q(wake_q);
2112 2019
2113 cond_resched(); 2020 cond_resched();
2114 2021
@@ -2196,11 +2103,10 @@ void exit_sem(struct task_struct *tsk)
2196 } 2103 }
2197 } 2104 }
2198 /* maybe some queued-up processes were waiting for this */ 2105 /* maybe some queued-up processes were waiting for this */
2199 INIT_LIST_HEAD(&tasks); 2106 do_smart_update(sma, NULL, 0, 1, &wake_q);
2200 do_smart_update(sma, NULL, 0, 1, &tasks);
2201 sem_unlock(sma, -1); 2107 sem_unlock(sma, -1);
2202 rcu_read_unlock(); 2108 rcu_read_unlock();
2203 wake_up_sem_queue_do(&tasks); 2109 wake_up_q(&wake_q);
2204 2110
2205 kfree_rcu(un, rcu); 2111 kfree_rcu(un, rcu);
2206 } 2112 }