path: root/ipc/sem.c
author    Linus Torvalds <torvalds@linux-foundation.org>  2016-12-14 20:25:18 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-12-14 20:25:18 -0500
commit    a57cb1c1d7974c62a5c80f7869e35b492ace12cd (patch)
tree      5a42ee9a668f171143464bc86013954c1bbe94ad /ipc/sem.c
parent    cf1b3341afab9d3ad02a76b3a619ea027dcf4e28 (diff)
parent    e1e14ab8411df344a17687821f8f78f0a1e73cbb (diff)
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:

 - a few misc things
 - kexec updates
 - DMA-mapping updates to better support networking DMA operations
 - IPC updates
 - various MM changes to improve DAX fault handling
 - lots of radix-tree changes, mainly to the test suite. All leading up
   to reimplementing the IDA/IDR code to be a wrapper layer over the
   radix-tree. However the final trigger-pulling patch is held off for 4.11.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (114 commits)
  radix tree test suite: delete unused rcupdate.c
  radix tree test suite: add new tag check
  radix-tree: ensure counts are initialised
  radix tree test suite: cache recently freed objects
  radix tree test suite: add some more functionality
  idr: reduce the number of bits per level from 8 to 6
  rxrpc: abstract away knowledge of IDR internals
  tpm: use idr_find(), not idr_find_slowpath()
  idr: add ida_is_empty
  radix tree test suite: check multiorder iteration
  radix-tree: fix replacement for multiorder entries
  radix-tree: add radix_tree_split_preload()
  radix-tree: add radix_tree_split
  radix-tree: add radix_tree_join
  radix-tree: delete radix_tree_range_tag_if_tagged()
  radix-tree: delete radix_tree_locate_item()
  radix-tree: improve multiorder iterators
  btrfs: fix race in btrfs_free_dummy_fs_info()
  radix-tree: improve dump output
  radix-tree: make radix_tree_find_next_bit more useful
  ...
Diffstat (limited to 'ipc/sem.c')
-rw-r--r--  ipc/sem.c  512
1 file changed, 236 insertions, 276 deletions
diff --git a/ipc/sem.c b/ipc/sem.c
index 10b94bc59d4a..e08b94851922 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -11,6 +11,7 @@
11 * (c) 2001 Red Hat Inc 11 * (c) 2001 Red Hat Inc
12 * Lockless wakeup 12 * Lockless wakeup
13 * (c) 2003 Manfred Spraul <manfred@colorfullife.com> 13 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
14 * (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
14 * Further wakeup optimizations, documentation 15 * Further wakeup optimizations, documentation
15 * (c) 2010 Manfred Spraul <manfred@colorfullife.com> 16 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
16 * 17 *
@@ -53,15 +54,11 @@
53 * Semaphores are actively given to waiting tasks (necessary for FIFO). 54 * Semaphores are actively given to waiting tasks (necessary for FIFO).
54 * (see update_queue()) 55 * (see update_queue())
55 * - To improve the scalability, the actual wake-up calls are performed after 56 * - To improve the scalability, the actual wake-up calls are performed after
56 * dropping all locks. (see wake_up_sem_queue_prepare(), 57 * dropping all locks. (see wake_up_sem_queue_prepare())
57 * wake_up_sem_queue_do())
58 * - All work is done by the waker, the woken up task does not have to do 58 * - All work is done by the waker, the woken up task does not have to do
59 * anything - not even acquiring a lock or dropping a refcount. 59 * anything - not even acquiring a lock or dropping a refcount.
60 * - A woken up task may not even touch the semaphore array anymore, it may 60 * - A woken up task may not even touch the semaphore array anymore, it may
61 * have been destroyed already by a semctl(RMID). 61 * have been destroyed already by a semctl(RMID).
62 * - The synchronizations between wake-ups due to a timeout/signal and a
63 * wake-up due to a completed semaphore operation is achieved by using an
64 * intermediate state (IN_WAKEUP).
65 * - UNDO values are stored in an array (one per process and per 62 * - UNDO values are stored in an array (one per process and per
66 * semaphore array, lazily allocated). For backwards compatibility, multiple 63 * semaphore array, lazily allocated). For backwards compatibility, multiple
67 * modes for the UNDO variables are supported (per process, per thread) 64 * modes for the UNDO variables are supported (per process, per thread)
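The reworded comment above reflects the new wake-up scheme: waiters are collected while the semaphore lock is held and only woken after every lock has been dropped. As a minimal sketch of that pattern, using the generic wake_q primitives the patch switches to (example_deferred_wakeup(), lock and sleeper are made-up names, not sem.c code):

	static void example_deferred_wakeup(spinlock_t *lock,
					    struct task_struct *sleeper)
	{
		DEFINE_WAKE_Q(wake_q);

		spin_lock(lock);
		/* decide, under the lock, that 'sleeper' may proceed */
		wake_q_add(&wake_q, sleeper);	/* takes a task reference; no wakeup yet */
		spin_unlock(lock);

		wake_up_q(&wake_q);	/* the real wake_up_process() calls, lock-free */
	}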
@@ -118,7 +115,8 @@ struct sem_queue {
118 struct sembuf *sops; /* array of pending operations */ 115 struct sembuf *sops; /* array of pending operations */
119 struct sembuf *blocking; /* the operation that blocked */ 116 struct sembuf *blocking; /* the operation that blocked */
120 int nsops; /* number of operations */ 117 int nsops; /* number of operations */
121 int alter; /* does *sops alter the array? */ 118 bool alter; /* does *sops alter the array? */
119 bool dupsop; /* sops on more than one sem_num */
122}; 120};
123 121
124/* Each task has a list of undo requests. They are executed automatically 122/* Each task has a list of undo requests. They are executed automatically
@@ -416,29 +414,6 @@ static inline void sem_unlock(struct sem_array *sma, int locknum)
416 * 414 *
417 * The caller holds the RCU read lock. 415 * The caller holds the RCU read lock.
418 */ 416 */
419static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
420 int id, struct sembuf *sops, int nsops, int *locknum)
421{
422 struct kern_ipc_perm *ipcp;
423 struct sem_array *sma;
424
425 ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
426 if (IS_ERR(ipcp))
427 return ERR_CAST(ipcp);
428
429 sma = container_of(ipcp, struct sem_array, sem_perm);
430 *locknum = sem_lock(sma, sops, nsops);
431
432 /* ipc_rmid() may have already freed the ID while sem_lock
433 * was spinning: verify that the structure is still valid
434 */
435 if (ipc_valid_object(ipcp))
436 return container_of(ipcp, struct sem_array, sem_perm);
437
438 sem_unlock(sma, *locknum);
439 return ERR_PTR(-EINVAL);
440}
441
442static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) 417static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
443{ 418{
444 struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); 419 struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
@@ -471,40 +446,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
471 ipc_rmid(&sem_ids(ns), &s->sem_perm); 446 ipc_rmid(&sem_ids(ns), &s->sem_perm);
472} 447}
473 448
474/*
475 * Lockless wakeup algorithm:
476 * Without the check/retry algorithm a lockless wakeup is possible:
477 * - queue.status is initialized to -EINTR before blocking.
478 * - wakeup is performed by
479 * * unlinking the queue entry from the pending list
480 * * setting queue.status to IN_WAKEUP
481 * This is the notification for the blocked thread that a
482 * result value is imminent.
483 * * call wake_up_process
484 * * set queue.status to the final value.
485 * - the previously blocked thread checks queue.status:
486 * * if it's IN_WAKEUP, then it must wait until the value changes
487 * * if it's not -EINTR, then the operation was completed by
488 * update_queue. semtimedop can return queue.status without
489 * performing any operation on the sem array.
490 * * otherwise it must acquire the spinlock and check what's up.
491 *
492 * The two-stage algorithm is necessary to protect against the following
493 * races:
494 * - if queue.status is set after wake_up_process, then the woken up idle
495 * thread could race forward and try (and fail) to acquire sma->lock
496 * before update_queue had a chance to set queue.status
497 * - if queue.status is written before wake_up_process and if the
498 * blocked process is woken up by a signal between writing
499 * queue.status and the wake_up_process, then the woken up
500 * process could return from semtimedop and die by calling
501 * sys_exit before wake_up_process is called. Then wake_up_process
502 * will oops, because the task structure is already invalid.
503 * (yes, this happened on s390 with sysv msg).
504 *
505 */
506#define IN_WAKEUP 1
507
508/** 449/**
509 * newary - Create a new semaphore set 450 * newary - Create a new semaphore set
510 * @ns: namespace 451 * @ns: namespace
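The hunk above also drops the long comment describing the old check/retry wake-up protocol and the IN_WAKEUP marker it relied on. For reference, the deleted two-stage handshake worked roughly as sketched below; this is a paraphrase of the removed comment, with waker_side() and sleeper_side() as invented names:

	/* Waker, called with sma->lock held: */
	static void waker_side(struct sem_queue *q, int final_status)
	{
		list_del(&q->list);		/* unlink from the pending list */
		q->status = IN_WAKEUP;		/* tell the sleeper a result is imminent */
		wake_up_process(q->sleeper);
		q->status = final_status;	/* only now may the sleeper consume it */
	}

	/* Sleeper, after returning from schedule(): */
	static int sleeper_side(struct sem_queue *q)
	{
		int error = q->status;

		while (error == IN_WAKEUP) {	/* waker is between its two writes */
			cpu_relax();
			error = q->status;
		}
		return error;			/* still -EINTR: nobody completed the op */
	}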
@@ -624,15 +565,23 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
624} 565}
625 566
626/** 567/**
627 * perform_atomic_semop - Perform (if possible) a semaphore operation 568 * perform_atomic_semop[_slow] - Attempt to perform semaphore
569 * operations on a given array.
628 * @sma: semaphore array 570 * @sma: semaphore array
629 * @q: struct sem_queue that describes the operation 571 * @q: struct sem_queue that describes the operation
630 * 572 *
573 * Caller blocking is as follows, based on the value
574 * indicated by the semaphore operation (sem_op):
575 *
576 * (1) >0 never blocks.
577 * (2) 0 (wait-for-zero operation): semval is non-zero.
578 * (3) <0 attempting to decrement semval to a value smaller than zero.
579 *
631 * Returns 0 if the operation was possible. 580 * Returns 0 if the operation was possible.
632 * Returns 1 if the operation is impossible, the caller must sleep. 581 * Returns 1 if the operation is impossible, the caller must sleep.
633 * Negative values are error codes. 582 * Returns <0 for error codes.
634 */ 583 */
635static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) 584static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q)
636{ 585{
637 int result, sem_op, nsops, pid; 586 int result, sem_op, nsops, pid;
638 struct sembuf *sop; 587 struct sembuf *sop;
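The new kerneldoc above spells out when a caller blocks, and it maps one-to-one onto the user-visible semop() rules. A small userspace illustration (sem_id is a placeholder for an existing semaphore set):

	#include <sys/types.h>
	#include <sys/ipc.h>
	#include <sys/sem.h>

	/* sem_op > 0: never blocks, simply adds to semval. */
	static int sem_post_example(int sem_id)
	{
		struct sembuf op = { .sem_num = 0, .sem_op = +1, .sem_flg = 0 };
		return semop(sem_id, &op, 1);
	}

	/* sem_op == 0: wait-for-zero, blocks while semval is non-zero. */
	static int sem_wait_zero_example(int sem_id)
	{
		struct sembuf op = { .sem_num = 0, .sem_op = 0, .sem_flg = 0 };
		return semop(sem_id, &op, 1);
	}

	/* sem_op < 0: blocks while the decrement would push semval below zero. */
	static int sem_wait_example(int sem_id)
	{
		struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };
		return semop(sem_id, &op, 1);
	}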
@@ -703,51 +652,84 @@ undo:
703 return result; 652 return result;
704} 653}
705 654
706/** wake_up_sem_queue_prepare(q, error): Prepare wake-up 655static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
707 * @q: queue entry that must be signaled
708 * @error: Error value for the signal
709 *
710 * Prepare the wake-up of the queue entry q.
711 */
712static void wake_up_sem_queue_prepare(struct list_head *pt,
713 struct sem_queue *q, int error)
714{ 656{
715 if (list_empty(pt)) { 657 int result, sem_op, nsops;
716 /* 658 struct sembuf *sop;
717 * Hold preempt off so that we don't get preempted and have the 659 struct sem *curr;
718 * wakee busy-wait until we're scheduled back on. 660 struct sembuf *sops;
719 */ 661 struct sem_undo *un;
720 preempt_disable(); 662
663 sops = q->sops;
664 nsops = q->nsops;
665 un = q->undo;
666
667 if (unlikely(q->dupsop))
668 return perform_atomic_semop_slow(sma, q);
669
670 /*
671 * We scan the semaphore set twice, first to ensure that the entire
672 * operation can succeed, therefore avoiding any pointless writes
673 * to shared memory and having to undo such changes in order to block
674 * until the operations can go through.
675 */
676 for (sop = sops; sop < sops + nsops; sop++) {
677 curr = sma->sem_base + sop->sem_num;
678 sem_op = sop->sem_op;
679 result = curr->semval;
680
681 if (!sem_op && result)
682 goto would_block; /* wait-for-zero */
683
684 result += sem_op;
685 if (result < 0)
686 goto would_block;
687
688 if (result > SEMVMX)
689 return -ERANGE;
690
691 if (sop->sem_flg & SEM_UNDO) {
692 int undo = un->semadj[sop->sem_num] - sem_op;
693
694 /* Exceeding the undo range is an error. */
695 if (undo < (-SEMAEM - 1) || undo > SEMAEM)
696 return -ERANGE;
697 }
698 }
699
700 for (sop = sops; sop < sops + nsops; sop++) {
701 curr = sma->sem_base + sop->sem_num;
702 sem_op = sop->sem_op;
703 result = curr->semval;
704
705 if (sop->sem_flg & SEM_UNDO) {
706 int undo = un->semadj[sop->sem_num] - sem_op;
707
708 un->semadj[sop->sem_num] = undo;
709 }
710 curr->semval += sem_op;
711 curr->sempid = q->pid;
721 } 712 }
722 q->status = IN_WAKEUP;
723 q->pid = error;
724 713
725 list_add_tail(&q->list, pt); 714 return 0;
715
716would_block:
717 q->blocking = sop;
718 return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1;
726} 719}
727 720
728/** 721static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
729 * wake_up_sem_queue_do - do the actual wake-up 722 struct wake_q_head *wake_q)
730 * @pt: list of tasks to be woken up
731 *
732 * Do the actual wake-up.
733 * The function is called without any locks held, thus the semaphore array
734 * could be destroyed already and the tasks can disappear as soon as the
735 * status is set to the actual return code.
736 */
737static void wake_up_sem_queue_do(struct list_head *pt)
738{ 723{
739 struct sem_queue *q, *t; 724 wake_q_add(wake_q, q->sleeper);
740 int did_something; 725 /*
741 726 * Rely on the above implicit barrier, such that we can
742 did_something = !list_empty(pt); 727 * ensure that we hold reference to the task before setting
743 list_for_each_entry_safe(q, t, pt, list) { 728 * q->status. Otherwise we could race with do_exit if the
744 wake_up_process(q->sleeper); 729 * task is awoken by an external event before calling
745 /* q can disappear immediately after writing q->status. */ 730 * wake_up_process().
746 smp_wmb(); 731 */
747 q->status = q->pid; 732 WRITE_ONCE(q->status, error);
748 }
749 if (did_something)
750 preempt_enable();
751} 733}
752 734
753static void unlink_queue(struct sem_array *sma, struct sem_queue *q) 735static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
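In the rewritten fast path above, the would_block exit is where IPC_NOWAIT takes effect: the caller gets -EAGAIN instead of being queued, and nothing in the array has been modified thanks to the check-before-modify scan. A hedged userspace sketch of that contract (sem_trywait_example() and sem_id are illustrative names):

	#include <errno.h>
	#include <sys/types.h>
	#include <sys/ipc.h>
	#include <sys/sem.h>

	/* Try to decrement semaphore 0 without ever sleeping. */
	static int sem_trywait_example(int sem_id)
	{
		struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = IPC_NOWAIT };

		if (semop(sem_id, &op, 1) == 0)
			return 0;	/* the whole operation went through atomically */
		if (errno == EAGAIN)
			return 1;	/* would have blocked; the array is unchanged */
		return -1;		/* some other error: EIDRM, EINVAL, ... */
	}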
@@ -767,7 +749,7 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
767 * modified the array. 749 * modified the array.
768 * Note that wait-for-zero operations are handled without restart. 750 * Note that wait-for-zero operations are handled without restart.
769 */ 751 */
770static int check_restart(struct sem_array *sma, struct sem_queue *q) 752static inline int check_restart(struct sem_array *sma, struct sem_queue *q)
771{ 753{
772 /* pending complex alter operations are too difficult to analyse */ 754 /* pending complex alter operations are too difficult to analyse */
773 if (!list_empty(&sma->pending_alter)) 755 if (!list_empty(&sma->pending_alter))
@@ -795,21 +777,20 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q)
795 * wake_const_ops - wake up non-alter tasks 777 * wake_const_ops - wake up non-alter tasks
796 * @sma: semaphore array. 778 * @sma: semaphore array.
797 * @semnum: semaphore that was modified. 779 * @semnum: semaphore that was modified.
798 * @pt: list head for the tasks that must be woken up. 780 * @wake_q: lockless wake-queue head.
799 * 781 *
800 * wake_const_ops must be called after a semaphore in a semaphore array 782 * wake_const_ops must be called after a semaphore in a semaphore array
801 * was set to 0. If complex const operations are pending, wake_const_ops must 783 * was set to 0. If complex const operations are pending, wake_const_ops must
802 * be called with semnum = -1, as well as with the number of each modified 784 * be called with semnum = -1, as well as with the number of each modified
803 * semaphore. 785 * semaphore.
804 * The tasks that must be woken up are added to @pt. The return code 786 * The tasks that must be woken up are added to @wake_q. The return code
805 * is stored in q->pid. 787 * is stored in q->pid.
806 * The function returns 1 if at least one operation was completed successfully. 788 * The function returns 1 if at least one operation was completed successfully.
807 */ 789 */
808static int wake_const_ops(struct sem_array *sma, int semnum, 790static int wake_const_ops(struct sem_array *sma, int semnum,
809 struct list_head *pt) 791 struct wake_q_head *wake_q)
810{ 792{
811 struct sem_queue *q; 793 struct sem_queue *q, *tmp;
812 struct list_head *walk;
813 struct list_head *pending_list; 794 struct list_head *pending_list;
814 int semop_completed = 0; 795 int semop_completed = 0;
815 796
@@ -818,25 +799,19 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
818 else 799 else
819 pending_list = &sma->sem_base[semnum].pending_const; 800 pending_list = &sma->sem_base[semnum].pending_const;
820 801
821 walk = pending_list->next; 802 list_for_each_entry_safe(q, tmp, pending_list, list) {
822 while (walk != pending_list) { 803 int error = perform_atomic_semop(sma, q);
823 int error;
824
825 q = container_of(walk, struct sem_queue, list);
826 walk = walk->next;
827
828 error = perform_atomic_semop(sma, q);
829
830 if (error <= 0) {
831 /* operation completed, remove from queue & wakeup */
832 804
833 unlink_queue(sma, q); 805 if (error > 0)
806 continue;
807 /* operation completed, remove from queue & wakeup */
808 unlink_queue(sma, q);
834 809
835 wake_up_sem_queue_prepare(pt, q, error); 810 wake_up_sem_queue_prepare(q, error, wake_q);
836 if (error == 0) 811 if (error == 0)
837 semop_completed = 1; 812 semop_completed = 1;
838 }
839 } 813 }
814
840 return semop_completed; 815 return semop_completed;
841} 816}
842 817
@@ -845,14 +820,14 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
845 * @sma: semaphore array 820 * @sma: semaphore array
846 * @sops: operations that were performed 821 * @sops: operations that were performed
847 * @nsops: number of operations 822 * @nsops: number of operations
848 * @pt: list head of the tasks that must be woken up. 823 * @wake_q: lockless wake-queue head
849 * 824 *
850 * Checks all required queue for wait-for-zero operations, based 825 * Checks all required queue for wait-for-zero operations, based
851 * on the actual changes that were performed on the semaphore array. 826 * on the actual changes that were performed on the semaphore array.
852 * The function returns 1 if at least one operation was completed successfully. 827 * The function returns 1 if at least one operation was completed successfully.
853 */ 828 */
854static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, 829static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
855 int nsops, struct list_head *pt) 830 int nsops, struct wake_q_head *wake_q)
856{ 831{
857 int i; 832 int i;
858 int semop_completed = 0; 833 int semop_completed = 0;
@@ -865,7 +840,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
865 840
866 if (sma->sem_base[num].semval == 0) { 841 if (sma->sem_base[num].semval == 0) {
867 got_zero = 1; 842 got_zero = 1;
868 semop_completed |= wake_const_ops(sma, num, pt); 843 semop_completed |= wake_const_ops(sma, num, wake_q);
869 } 844 }
870 } 845 }
871 } else { 846 } else {
@@ -876,7 +851,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
876 for (i = 0; i < sma->sem_nsems; i++) { 851 for (i = 0; i < sma->sem_nsems; i++) {
877 if (sma->sem_base[i].semval == 0) { 852 if (sma->sem_base[i].semval == 0) {
878 got_zero = 1; 853 got_zero = 1;
879 semop_completed |= wake_const_ops(sma, i, pt); 854 semop_completed |= wake_const_ops(sma, i, wake_q);
880 } 855 }
881 } 856 }
882 } 857 }
@@ -885,7 +860,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
885 * then check the global queue, too. 860 * then check the global queue, too.
886 */ 861 */
887 if (got_zero) 862 if (got_zero)
888 semop_completed |= wake_const_ops(sma, -1, pt); 863 semop_completed |= wake_const_ops(sma, -1, wake_q);
889 864
890 return semop_completed; 865 return semop_completed;
891} 866}
@@ -895,22 +870,21 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
895 * update_queue - look for tasks that can be completed. 870 * update_queue - look for tasks that can be completed.
896 * @sma: semaphore array. 871 * @sma: semaphore array.
897 * @semnum: semaphore that was modified. 872 * @semnum: semaphore that was modified.
898 * @pt: list head for the tasks that must be woken up. 873 * @wake_q: lockless wake-queue head.
899 * 874 *
900 * update_queue must be called after a semaphore in a semaphore array 875 * update_queue must be called after a semaphore in a semaphore array
901 * was modified. If multiple semaphores were modified, update_queue must 876 * was modified. If multiple semaphores were modified, update_queue must
902 * be called with semnum = -1, as well as with the number of each modified 877 * be called with semnum = -1, as well as with the number of each modified
903 * semaphore. 878 * semaphore.
904 * The tasks that must be woken up are added to @pt. The return code 879 * The tasks that must be woken up are added to @wake_q. The return code
905 * is stored in q->pid. 880 * is stored in q->pid.
906 * The function internally checks if const operations can now succeed. 881 * The function internally checks if const operations can now succeed.
907 * 882 *
908 * The function return 1 if at least one semop was completed successfully. 883 * The function return 1 if at least one semop was completed successfully.
909 */ 884 */
910static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) 885static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q)
911{ 886{
912 struct sem_queue *q; 887 struct sem_queue *q, *tmp;
913 struct list_head *walk;
914 struct list_head *pending_list; 888 struct list_head *pending_list;
915 int semop_completed = 0; 889 int semop_completed = 0;
916 890
@@ -920,13 +894,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
920 pending_list = &sma->sem_base[semnum].pending_alter; 894 pending_list = &sma->sem_base[semnum].pending_alter;
921 895
922again: 896again:
923 walk = pending_list->next; 897 list_for_each_entry_safe(q, tmp, pending_list, list) {
924 while (walk != pending_list) {
925 int error, restart; 898 int error, restart;
926 899
927 q = container_of(walk, struct sem_queue, list);
928 walk = walk->next;
929
930 /* If we are scanning the single sop, per-semaphore list of 900 /* If we are scanning the single sop, per-semaphore list of
931 * one semaphore and that semaphore is 0, then it is not 901 * one semaphore and that semaphore is 0, then it is not
932 * necessary to scan further: simple increments 902 * necessary to scan further: simple increments
@@ -949,11 +919,11 @@ again:
949 restart = 0; 919 restart = 0;
950 } else { 920 } else {
951 semop_completed = 1; 921 semop_completed = 1;
952 do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); 922 do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q);
953 restart = check_restart(sma, q); 923 restart = check_restart(sma, q);
954 } 924 }
955 925
956 wake_up_sem_queue_prepare(pt, q, error); 926 wake_up_sem_queue_prepare(q, error, wake_q);
957 if (restart) 927 if (restart)
958 goto again; 928 goto again;
959 } 929 }
@@ -984,24 +954,24 @@ static void set_semotime(struct sem_array *sma, struct sembuf *sops)
984 * @sops: operations that were performed 954 * @sops: operations that were performed
985 * @nsops: number of operations 955 * @nsops: number of operations
986 * @otime: force setting otime 956 * @otime: force setting otime
987 * @pt: list head of the tasks that must be woken up. 957 * @wake_q: lockless wake-queue head
988 * 958 *
989 * do_smart_update() does the required calls to update_queue and wakeup_zero, 959 * do_smart_update() does the required calls to update_queue and wakeup_zero,
990 * based on the actual changes that were performed on the semaphore array. 960 * based on the actual changes that were performed on the semaphore array.
991 * Note that the function does not do the actual wake-up: the caller is 961 * Note that the function does not do the actual wake-up: the caller is
992 * responsible for calling wake_up_sem_queue_do(@pt). 962 * responsible for calling wake_up_q().
993 * It is safe to perform this call after dropping all locks. 963 * It is safe to perform this call after dropping all locks.
994 */ 964 */
995static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, 965static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
996 int otime, struct list_head *pt) 966 int otime, struct wake_q_head *wake_q)
997{ 967{
998 int i; 968 int i;
999 969
1000 otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); 970 otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q);
1001 971
1002 if (!list_empty(&sma->pending_alter)) { 972 if (!list_empty(&sma->pending_alter)) {
1003 /* semaphore array uses the global queue - just process it. */ 973 /* semaphore array uses the global queue - just process it. */
1004 otime |= update_queue(sma, -1, pt); 974 otime |= update_queue(sma, -1, wake_q);
1005 } else { 975 } else {
1006 if (!sops) { 976 if (!sops) {
1007 /* 977 /*
@@ -1009,7 +979,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
1009 * known. Check all. 979 * known. Check all.
1010 */ 980 */
1011 for (i = 0; i < sma->sem_nsems; i++) 981 for (i = 0; i < sma->sem_nsems; i++)
1012 otime |= update_queue(sma, i, pt); 982 otime |= update_queue(sma, i, wake_q);
1013 } else { 983 } else {
1014 /* 984 /*
1015 * Check the semaphores that were increased: 985 * Check the semaphores that were increased:
@@ -1023,7 +993,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
1023 for (i = 0; i < nsops; i++) { 993 for (i = 0; i < nsops; i++) {
1024 if (sops[i].sem_op > 0) { 994 if (sops[i].sem_op > 0) {
1025 otime |= update_queue(sma, 995 otime |= update_queue(sma,
1026 sops[i].sem_num, pt); 996 sops[i].sem_num, wake_q);
1027 } 997 }
1028 } 998 }
1029 } 999 }
@@ -1111,8 +1081,8 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1111 struct sem_undo *un, *tu; 1081 struct sem_undo *un, *tu;
1112 struct sem_queue *q, *tq; 1082 struct sem_queue *q, *tq;
1113 struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); 1083 struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
1114 struct list_head tasks;
1115 int i; 1084 int i;
1085 DEFINE_WAKE_Q(wake_q);
1116 1086
1117 /* Free the existing undo structures for this semaphore set. */ 1087 /* Free the existing undo structures for this semaphore set. */
1118 ipc_assert_locked_object(&sma->sem_perm); 1088 ipc_assert_locked_object(&sma->sem_perm);
@@ -1126,25 +1096,24 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1126 } 1096 }
1127 1097
1128 /* Wake up all pending processes and let them fail with EIDRM. */ 1098 /* Wake up all pending processes and let them fail with EIDRM. */
1129 INIT_LIST_HEAD(&tasks);
1130 list_for_each_entry_safe(q, tq, &sma->pending_const, list) { 1099 list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
1131 unlink_queue(sma, q); 1100 unlink_queue(sma, q);
1132 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1101 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1133 } 1102 }
1134 1103
1135 list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { 1104 list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
1136 unlink_queue(sma, q); 1105 unlink_queue(sma, q);
1137 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1106 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1138 } 1107 }
1139 for (i = 0; i < sma->sem_nsems; i++) { 1108 for (i = 0; i < sma->sem_nsems; i++) {
1140 struct sem *sem = sma->sem_base + i; 1109 struct sem *sem = sma->sem_base + i;
1141 list_for_each_entry_safe(q, tq, &sem->pending_const, list) { 1110 list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
1142 unlink_queue(sma, q); 1111 unlink_queue(sma, q);
1143 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1112 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1144 } 1113 }
1145 list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { 1114 list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
1146 unlink_queue(sma, q); 1115 unlink_queue(sma, q);
1147 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1116 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1148 } 1117 }
1149 } 1118 }
1150 1119
@@ -1153,7 +1122,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1153 sem_unlock(sma, -1); 1122 sem_unlock(sma, -1);
1154 rcu_read_unlock(); 1123 rcu_read_unlock();
1155 1124
1156 wake_up_sem_queue_do(&tasks); 1125 wake_up_q(&wake_q);
1157 ns->used_sems -= sma->sem_nsems; 1126 ns->used_sems -= sma->sem_nsems;
1158 ipc_rcu_putref(sma, sem_rcu_free); 1127 ipc_rcu_putref(sma, sem_rcu_free);
1159} 1128}
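freeary() above now queues every pending sleeper on the wake_q with -EIDRM and wakes them only after the array has been unlocked. From userspace this is the familiar effect of removing a set out from under a blocked semop(); a small sketch (function names are illustrative):

	#include <errno.h>
	#include <sys/types.h>
	#include <sys/ipc.h>
	#include <sys/sem.h>

	/* In one process: remove the whole semaphore set. */
	static int destroy_set_example(int sem_id)
	{
		return semctl(sem_id, 0, IPC_RMID);
	}

	/* In another process: a blocked decrement then fails with EIDRM. */
	static int blocked_waiter_example(int sem_id)
	{
		struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };

		if (semop(sem_id, &op, 1) < 0 && errno == EIDRM)
			return 1;	/* woken by freeary(): the set was removed */
		return 0;
	}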
@@ -1292,9 +1261,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1292 struct sem_undo *un; 1261 struct sem_undo *un;
1293 struct sem_array *sma; 1262 struct sem_array *sma;
1294 struct sem *curr; 1263 struct sem *curr;
1295 int err; 1264 int err, val;
1296 struct list_head tasks; 1265 DEFINE_WAKE_Q(wake_q);
1297 int val; 1266
1298#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) 1267#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
1299 /* big-endian 64bit */ 1268 /* big-endian 64bit */
1300 val = arg >> 32; 1269 val = arg >> 32;
@@ -1306,8 +1275,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1306 if (val > SEMVMX || val < 0) 1275 if (val > SEMVMX || val < 0)
1307 return -ERANGE; 1276 return -ERANGE;
1308 1277
1309 INIT_LIST_HEAD(&tasks);
1310
1311 rcu_read_lock(); 1278 rcu_read_lock();
1312 sma = sem_obtain_object_check(ns, semid); 1279 sma = sem_obtain_object_check(ns, semid);
1313 if (IS_ERR(sma)) { 1280 if (IS_ERR(sma)) {
@@ -1350,10 +1317,10 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1350 curr->sempid = task_tgid_vnr(current); 1317 curr->sempid = task_tgid_vnr(current);
1351 sma->sem_ctime = get_seconds(); 1318 sma->sem_ctime = get_seconds();
1352 /* maybe some queued-up processes were waiting for this */ 1319 /* maybe some queued-up processes were waiting for this */
1353 do_smart_update(sma, NULL, 0, 0, &tasks); 1320 do_smart_update(sma, NULL, 0, 0, &wake_q);
1354 sem_unlock(sma, -1); 1321 sem_unlock(sma, -1);
1355 rcu_read_unlock(); 1322 rcu_read_unlock();
1356 wake_up_sem_queue_do(&tasks); 1323 wake_up_q(&wake_q);
1357 return 0; 1324 return 0;
1358} 1325}
1359 1326
@@ -1365,9 +1332,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
1365 int err, nsems; 1332 int err, nsems;
1366 ushort fast_sem_io[SEMMSL_FAST]; 1333 ushort fast_sem_io[SEMMSL_FAST];
1367 ushort *sem_io = fast_sem_io; 1334 ushort *sem_io = fast_sem_io;
1368 struct list_head tasks; 1335 DEFINE_WAKE_Q(wake_q);
1369
1370 INIT_LIST_HEAD(&tasks);
1371 1336
1372 rcu_read_lock(); 1337 rcu_read_lock();
1373 sma = sem_obtain_object_check(ns, semid); 1338 sma = sem_obtain_object_check(ns, semid);
@@ -1478,7 +1443,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
1478 } 1443 }
1479 sma->sem_ctime = get_seconds(); 1444 sma->sem_ctime = get_seconds();
1480 /* maybe some queued-up processes were waiting for this */ 1445 /* maybe some queued-up processes were waiting for this */
1481 do_smart_update(sma, NULL, 0, 0, &tasks); 1446 do_smart_update(sma, NULL, 0, 0, &wake_q);
1482 err = 0; 1447 err = 0;
1483 goto out_unlock; 1448 goto out_unlock;
1484 } 1449 }
@@ -1514,7 +1479,7 @@ out_unlock:
1514 sem_unlock(sma, -1); 1479 sem_unlock(sma, -1);
1515out_rcu_wakeup: 1480out_rcu_wakeup:
1516 rcu_read_unlock(); 1481 rcu_read_unlock();
1517 wake_up_sem_queue_do(&tasks); 1482 wake_up_q(&wake_q);
1518out_free: 1483out_free:
1519 if (sem_io != fast_sem_io) 1484 if (sem_io != fast_sem_io)
1520 ipc_free(sem_io); 1485 ipc_free(sem_io);
@@ -1787,32 +1752,6 @@ out:
1787 return un; 1752 return un;
1788} 1753}
1789 1754
1790
1791/**
1792 * get_queue_result - retrieve the result code from sem_queue
1793 * @q: Pointer to queue structure
1794 *
1795 * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
1796 * q->status, then we must loop until the value is replaced with the final
1797 * value: This may happen if a task is woken up by an unrelated event (e.g.
1798 * signal) and in parallel the task is woken up by another task because it got
1799 * the requested semaphores.
1800 *
1801 * The function can be called with or without holding the semaphore spinlock.
1802 */
1803static int get_queue_result(struct sem_queue *q)
1804{
1805 int error;
1806
1807 error = q->status;
1808 while (unlikely(error == IN_WAKEUP)) {
1809 cpu_relax();
1810 error = q->status;
1811 }
1812
1813 return error;
1814}
1815
1816SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, 1755SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1817 unsigned, nsops, const struct timespec __user *, timeout) 1756 unsigned, nsops, const struct timespec __user *, timeout)
1818{ 1757{
@@ -1821,11 +1760,11 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1821 struct sembuf fast_sops[SEMOPM_FAST]; 1760 struct sembuf fast_sops[SEMOPM_FAST];
1822 struct sembuf *sops = fast_sops, *sop; 1761 struct sembuf *sops = fast_sops, *sop;
1823 struct sem_undo *un; 1762 struct sem_undo *un;
1824 int undos = 0, alter = 0, max, locknum; 1763 int max, locknum;
1764 bool undos = false, alter = false, dupsop = false;
1825 struct sem_queue queue; 1765 struct sem_queue queue;
1826 unsigned long jiffies_left = 0; 1766 unsigned long dup = 0, jiffies_left = 0;
1827 struct ipc_namespace *ns; 1767 struct ipc_namespace *ns;
1828 struct list_head tasks;
1829 1768
1830 ns = current->nsproxy->ipc_ns; 1769 ns = current->nsproxy->ipc_ns;
1831 1770
@@ -1838,10 +1777,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1838 if (sops == NULL) 1777 if (sops == NULL)
1839 return -ENOMEM; 1778 return -ENOMEM;
1840 } 1779 }
1780
1841 if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { 1781 if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
1842 error = -EFAULT; 1782 error = -EFAULT;
1843 goto out_free; 1783 goto out_free;
1844 } 1784 }
1785
1845 if (timeout) { 1786 if (timeout) {
1846 struct timespec _timeout; 1787 struct timespec _timeout;
1847 if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { 1788 if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) {
@@ -1855,18 +1796,30 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1855 } 1796 }
1856 jiffies_left = timespec_to_jiffies(&_timeout); 1797 jiffies_left = timespec_to_jiffies(&_timeout);
1857 } 1798 }
1799
1858 max = 0; 1800 max = 0;
1859 for (sop = sops; sop < sops + nsops; sop++) { 1801 for (sop = sops; sop < sops + nsops; sop++) {
1802 unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG);
1803
1860 if (sop->sem_num >= max) 1804 if (sop->sem_num >= max)
1861 max = sop->sem_num; 1805 max = sop->sem_num;
1862 if (sop->sem_flg & SEM_UNDO) 1806 if (sop->sem_flg & SEM_UNDO)
1863 undos = 1; 1807 undos = true;
1864 if (sop->sem_op != 0) 1808 if (dup & mask) {
1865 alter = 1; 1809 /*
1810 * There was a previous alter access that appears
1811 * to have accessed the same semaphore, thus use
1812 * the dupsop logic. "appears", because the detection
1813 * can only check % BITS_PER_LONG.
1814 */
1815 dupsop = true;
1816 }
1817 if (sop->sem_op != 0) {
1818 alter = true;
1819 dup |= mask;
1820 }
1866 } 1821 }
1867 1822
1868 INIT_LIST_HEAD(&tasks);
1869
1870 if (undos) { 1823 if (undos) {
1871 /* On success, find_alloc_undo takes the rcu_read_lock */ 1824 /* On success, find_alloc_undo takes the rcu_read_lock */
1872 un = find_alloc_undo(ns, semid); 1825 un = find_alloc_undo(ns, semid);
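The new dup bitmap above is deliberately conservative: sem_num is reduced with % BITS_PER_LONG, so two different semaphores can collide and be flagged as duplicates, which merely forces the slower perform_atomic_semop_slow() path. A stand-alone sketch of the same heuristic (may_have_dup_semnum() is an invented helper, not kernel code):

	#include <stdbool.h>
	#include <stddef.h>
	#include <sys/sem.h>

	#define EXAMPLE_BITS_PER_LONG (8 * sizeof(unsigned long))

	/* True if some op may hit a semaphore already touched by an earlier altering op. */
	static bool may_have_dup_semnum(const struct sembuf *sops, size_t nsops)
	{
		unsigned long dup = 0;
		bool dupsop = false;
		size_t i;

		for (i = 0; i < nsops; i++) {
			unsigned long mask = 1UL << (sops[i].sem_num % EXAMPLE_BITS_PER_LONG);

			if (dup & mask)
				dupsop = true;	/* possible collision, be conservative */
			if (sops[i].sem_op != 0)
				dup |= mask;	/* only altering ops are recorded */
		}
		return dupsop;
	}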
@@ -1887,16 +1840,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1887 } 1840 }
1888 1841
1889 error = -EFBIG; 1842 error = -EFBIG;
1890 if (max >= sma->sem_nsems) 1843 if (max >= sma->sem_nsems) {
1891 goto out_rcu_wakeup; 1844 rcu_read_unlock();
1845 goto out_free;
1846 }
1892 1847
1893 error = -EACCES; 1848 error = -EACCES;
1894 if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) 1849 if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) {
1895 goto out_rcu_wakeup; 1850 rcu_read_unlock();
1851 goto out_free;
1852 }
1896 1853
1897 error = security_sem_semop(sma, sops, nsops, alter); 1854 error = security_sem_semop(sma, sops, nsops, alter);
1898 if (error) 1855 if (error) {
1899 goto out_rcu_wakeup; 1856 rcu_read_unlock();
1857 goto out_free;
1858 }
1900 1859
1901 error = -EIDRM; 1860 error = -EIDRM;
1902 locknum = sem_lock(sma, sops, nsops); 1861 locknum = sem_lock(sma, sops, nsops);
@@ -1925,24 +1884,34 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1925 queue.undo = un; 1884 queue.undo = un;
1926 queue.pid = task_tgid_vnr(current); 1885 queue.pid = task_tgid_vnr(current);
1927 queue.alter = alter; 1886 queue.alter = alter;
1887 queue.dupsop = dupsop;
1928 1888
1929 error = perform_atomic_semop(sma, &queue); 1889 error = perform_atomic_semop(sma, &queue);
1930 if (error == 0) { /* If the operation was successful, then do 1890 if (error == 0) { /* non-blocking successful path */
1931 /* If the operation was successful, then do 1891 DEFINE_WAKE_Q(wake_q);
1892
1893 /*
1894 * If the operation was successful, then do
1932 * the required updates. 1895 * the required updates.
1933 */ 1896 */
1934 if (alter) 1897 if (alter)
1935 do_smart_update(sma, sops, nsops, 1, &tasks); 1898 do_smart_update(sma, sops, nsops, 1, &wake_q);
1936 else 1899 else
1937 set_semotime(sma, sops); 1900 set_semotime(sma, sops);
1901
1902 sem_unlock(sma, locknum);
1903 rcu_read_unlock();
1904 wake_up_q(&wake_q);
1905
1906 goto out_free;
1938 } 1907 }
1939 if (error <= 0) 1908 if (error < 0) /* non-blocking error path */
1940 goto out_unlock_free; 1909 goto out_unlock_free;
1941 1910
1942 /* We need to sleep on this operation, so we put the current 1911 /*
1912 * We need to sleep on this operation, so we put the current
1943 * task into the pending queue and go to sleep. 1913 * task into the pending queue and go to sleep.
1944 */ 1914 */
1945
1946 if (nsops == 1) { 1915 if (nsops == 1) {
1947 struct sem *curr; 1916 struct sem *curr;
1948 curr = &sma->sem_base[sops->sem_num]; 1917 curr = &sma->sem_base[sops->sem_num];
@@ -1971,77 +1940,69 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1971 sma->complex_count++; 1940 sma->complex_count++;
1972 } 1941 }
1973 1942
1974 queue.status = -EINTR; 1943 do {
1975 queue.sleeper = current; 1944 queue.status = -EINTR;
1945 queue.sleeper = current;
1976 1946
1977sleep_again: 1947 __set_current_state(TASK_INTERRUPTIBLE);
1978 __set_current_state(TASK_INTERRUPTIBLE); 1948 sem_unlock(sma, locknum);
1979 sem_unlock(sma, locknum); 1949 rcu_read_unlock();
1980 rcu_read_unlock();
1981
1982 if (timeout)
1983 jiffies_left = schedule_timeout(jiffies_left);
1984 else
1985 schedule();
1986 1950
1987 error = get_queue_result(&queue); 1951 if (timeout)
1952 jiffies_left = schedule_timeout(jiffies_left);
1953 else
1954 schedule();
1988 1955
1989 if (error != -EINTR) { 1956 /*
1990 /* fast path: update_queue already obtained all requested 1957 * fastpath: the semop has completed, either successfully or
1991 * resources. 1958 * not, from the syscall pov, is quite irrelevant to us at this
1992 * Perform a smp_mb(): User space could assume that semop() 1959 * point; we're done.
1993 * is a memory barrier: Without the mb(), the cpu could 1960 *
1994 * speculatively read in user space stale data that was 1961 * We _do_ care, nonetheless, about being awoken by a signal or
1995 * overwritten by the previous owner of the semaphore. 1962 * spuriously. The queue.status is checked again in the
1963 * slowpath (aka after taking sem_lock), such that we can detect
1964 * scenarios where we were awakened externally, during the
1965 * window between wake_q_add() and wake_up_q().
1996 */ 1966 */
1997 smp_mb(); 1967 error = READ_ONCE(queue.status);
1998 1968 if (error != -EINTR) {
1999 goto out_free; 1969 /*
2000 } 1970 * User space could assume that semop() is a memory
2001 1971 * barrier: Without the mb(), the cpu could
2002 rcu_read_lock(); 1972 * speculatively read in userspace stale data that was
2003 sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum); 1973 * overwritten by the previous owner of the semaphore.
2004 1974 */
2005 /* 1975 smp_mb();
2006 * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing. 1976 goto out_free;
2007 */ 1977 }
2008 error = get_queue_result(&queue);
2009 1978
2010 /* 1979 rcu_read_lock();
2011 * Array removed? If yes, leave without sem_unlock(). 1980 sem_lock(sma, sops, nsops);
2012 */
2013 if (IS_ERR(sma)) {
2014 rcu_read_unlock();
2015 goto out_free;
2016 }
2017 1981
1982 if (!ipc_valid_object(&sma->sem_perm))
1983 goto out_unlock_free;
2018 1984
2019 /* 1985 error = READ_ONCE(queue.status);
2020 * If queue.status != -EINTR we are woken up by another process.
2021 * Leave without unlink_queue(), but with sem_unlock().
2022 */
2023 if (error != -EINTR)
2024 goto out_unlock_free;
2025 1986
2026 /* 1987 /*
2027 * If an interrupt occurred we have to clean up the queue 1988 * If queue.status != -EINTR we are woken up by another process.
2028 */ 1989 * Leave without unlink_queue(), but with sem_unlock().
2029 if (timeout && jiffies_left == 0) 1990 */
2030 error = -EAGAIN; 1991 if (error != -EINTR)
1992 goto out_unlock_free;
2031 1993
2032 /* 1994 /*
2033 * If the wakeup was spurious, just retry 1995 * If an interrupt occurred we have to clean up the queue.
2034 */ 1996 */
2035 if (error == -EINTR && !signal_pending(current)) 1997 if (timeout && jiffies_left == 0)
2036 goto sleep_again; 1998 error = -EAGAIN;
1999 } while (error == -EINTR && !signal_pending(current)); /* spurious */
2037 2000
2038 unlink_queue(sma, &queue); 2001 unlink_queue(sma, &queue);
2039 2002
2040out_unlock_free: 2003out_unlock_free:
2041 sem_unlock(sma, locknum); 2004 sem_unlock(sma, locknum);
2042out_rcu_wakeup:
2043 rcu_read_unlock(); 2005 rcu_read_unlock();
2044 wake_up_sem_queue_do(&tasks);
2045out_free: 2006out_free:
2046 if (sops != fast_sops) 2007 if (sops != fast_sops)
2047 kfree(sops); 2008 kfree(sops);
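The retry loop above keeps -EINTR for genuine signals, converts an expired timeout into -EAGAIN, and simply loops again on a spurious wakeup. The timeout half of that is the documented semtimedop() behaviour; a short userspace sketch (sem_wait_timed_example() and sem_id are placeholders; glibc exposes semtimedop() under _GNU_SOURCE):

	#define _GNU_SOURCE
	#include <errno.h>
	#include <time.h>
	#include <sys/types.h>
	#include <sys/ipc.h>
	#include <sys/sem.h>

	/* Wait up to one second for semaphore 0 to become decrementable. */
	static int sem_wait_timed_example(int sem_id)
	{
		struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };
		struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

		if (semtimedop(sem_id, &op, 1, &ts) == 0)
			return 0;	/* got the semaphore */
		if (errno == EAGAIN)
			return 1;	/* timed out; the operation was cancelled */
		return -1;		/* EINTR (signal), EIDRM (set removed), ... */
	}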
@@ -2102,8 +2063,8 @@ void exit_sem(struct task_struct *tsk)
2102 for (;;) { 2063 for (;;) {
2103 struct sem_array *sma; 2064 struct sem_array *sma;
2104 struct sem_undo *un; 2065 struct sem_undo *un;
2105 struct list_head tasks;
2106 int semid, i; 2066 int semid, i;
2067 DEFINE_WAKE_Q(wake_q);
2107 2068
2108 cond_resched(); 2069 cond_resched();
2109 2070
@@ -2191,11 +2152,10 @@ void exit_sem(struct task_struct *tsk)
2191 } 2152 }
2192 } 2153 }
2193 /* maybe some queued-up processes were waiting for this */ 2154 /* maybe some queued-up processes were waiting for this */
2194 INIT_LIST_HEAD(&tasks); 2155 do_smart_update(sma, NULL, 0, 1, &wake_q);
2195 do_smart_update(sma, NULL, 0, 1, &tasks);
2196 sem_unlock(sma, -1); 2156 sem_unlock(sma, -1);
2197 rcu_read_unlock(); 2157 rcu_read_unlock();
2198 wake_up_sem_queue_do(&tasks); 2158 wake_up_q(&wake_q);
2199 2159
2200 kfree_rcu(un, rcu); 2160 kfree_rcu(un, rcu);
2201 } 2161 }