diff options
author | Manfred Spraul <manfred@colorfullife.com> | 2013-07-08 19:01:23 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-07-09 13:33:28 -0400 |
commit | 1a82e9e1d0f1b45f47a97c9e2349020536ff8987 (patch) | |
tree | 956810c32e02f6ae1527db015c6ae622800bd720 /ipc | |
parent | f5c936c0f267ec58641451cf8b8d39b4c207ee4d (diff) |
ipc/sem: separate wait-for-zero and alter tasks into seperate queues
Introduce separate queues for operations that do not modify the
semaphore values. Advantages:
- Simpler logic in check_restart().
- Faster update_queue(): Right now, all wait-for-zero operations are
always tested, even if the semaphore value is not 0.
- wait-for-zero gets again priority, as in linux <=3.0.9
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'ipc')
-rw-r--r-- | ipc/sem.c | 211 |
1 files changed, 151 insertions, 60 deletions
@@ -95,7 +95,10 @@ struct sem { | |||
95 | int semval; /* current value */ | 95 | int semval; /* current value */ |
96 | int sempid; /* pid of last operation */ | 96 | int sempid; /* pid of last operation */ |
97 | spinlock_t lock; /* spinlock for fine-grained semtimedop */ | 97 | spinlock_t lock; /* spinlock for fine-grained semtimedop */ |
98 | struct list_head sem_pending; /* pending single-sop operations */ | 98 | struct list_head pending_alter; /* pending single-sop operations */ |
99 | /* that alter the semaphore */ | ||
100 | struct list_head pending_const; /* pending single-sop operations */ | ||
101 | /* that do not alter the semaphore*/ | ||
99 | } ____cacheline_aligned_in_smp; | 102 | } ____cacheline_aligned_in_smp; |
100 | 103 | ||
101 | /* One queue for each sleeping process in the system. */ | 104 | /* One queue for each sleeping process in the system. */ |
@@ -152,7 +155,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); | |||
152 | /* | 155 | /* |
153 | * linked list protection: | 156 | * linked list protection: |
154 | * sem_undo.id_next, | 157 | * sem_undo.id_next, |
155 | * sem_array.sem_pending{,last}, | 158 | * sem_array.pending{_alter,_cont}, |
156 | * sem_array.sem_undo: sem_lock() for read/write | 159 | * sem_array.sem_undo: sem_lock() for read/write |
157 | * sem_undo.proc_next: only "current" is allowed to read/write that field. | 160 | * sem_undo.proc_next: only "current" is allowed to read/write that field. |
158 | * | 161 | * |
@@ -337,7 +340,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) | |||
337 | * Without the check/retry algorithm a lockless wakeup is possible: | 340 | * Without the check/retry algorithm a lockless wakeup is possible: |
338 | * - queue.status is initialized to -EINTR before blocking. | 341 | * - queue.status is initialized to -EINTR before blocking. |
339 | * - wakeup is performed by | 342 | * - wakeup is performed by |
340 | * * unlinking the queue entry from sma->sem_pending | 343 | * * unlinking the queue entry from the pending list |
341 | * * setting queue.status to IN_WAKEUP | 344 | * * setting queue.status to IN_WAKEUP |
342 | * This is the notification for the blocked thread that a | 345 | * This is the notification for the blocked thread that a |
343 | * result value is imminent. | 346 | * result value is imminent. |
@@ -418,12 +421,14 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) | |||
418 | sma->sem_base = (struct sem *) &sma[1]; | 421 | sma->sem_base = (struct sem *) &sma[1]; |
419 | 422 | ||
420 | for (i = 0; i < nsems; i++) { | 423 | for (i = 0; i < nsems; i++) { |
421 | INIT_LIST_HEAD(&sma->sem_base[i].sem_pending); | 424 | INIT_LIST_HEAD(&sma->sem_base[i].pending_alter); |
425 | INIT_LIST_HEAD(&sma->sem_base[i].pending_const); | ||
422 | spin_lock_init(&sma->sem_base[i].lock); | 426 | spin_lock_init(&sma->sem_base[i].lock); |
423 | } | 427 | } |
424 | 428 | ||
425 | sma->complex_count = 0; | 429 | sma->complex_count = 0; |
426 | INIT_LIST_HEAD(&sma->sem_pending); | 430 | INIT_LIST_HEAD(&sma->pending_alter); |
431 | INIT_LIST_HEAD(&sma->pending_const); | ||
427 | INIT_LIST_HEAD(&sma->list_id); | 432 | INIT_LIST_HEAD(&sma->list_id); |
428 | sma->sem_nsems = nsems; | 433 | sma->sem_nsems = nsems; |
429 | sma->sem_ctime = get_seconds(); | 434 | sma->sem_ctime = get_seconds(); |
@@ -609,60 +614,132 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q) | |||
609 | * update_queue is O(N^2) when it restarts scanning the whole queue of | 614 | * update_queue is O(N^2) when it restarts scanning the whole queue of |
610 | * waiting operations. Therefore this function checks if the restart is | 615 | * waiting operations. Therefore this function checks if the restart is |
611 | * really necessary. It is called after a previously waiting operation | 616 | * really necessary. It is called after a previously waiting operation |
612 | * was completed. | 617 | * modified the array. |
618 | * Note that wait-for-zero operations are handled without restart. | ||
613 | */ | 619 | */ |
614 | static int check_restart(struct sem_array *sma, struct sem_queue *q) | 620 | static int check_restart(struct sem_array *sma, struct sem_queue *q) |
615 | { | 621 | { |
616 | struct sem *curr; | 622 | /* pending complex alter operations are too difficult to analyse */ |
617 | struct sem_queue *h; | 623 | if (!list_empty(&sma->pending_alter)) |
618 | |||
619 | /* if the operation didn't modify the array, then no restart */ | ||
620 | if (q->alter == 0) | ||
621 | return 0; | ||
622 | |||
623 | /* pending complex operations are too difficult to analyse */ | ||
624 | if (sma->complex_count) | ||
625 | return 1; | 624 | return 1; |
626 | 625 | ||
627 | /* we were a sleeping complex operation. Too difficult */ | 626 | /* we were a sleeping complex operation. Too difficult */ |
628 | if (q->nsops > 1) | 627 | if (q->nsops > 1) |
629 | return 1; | 628 | return 1; |
630 | 629 | ||
631 | curr = sma->sem_base + q->sops[0].sem_num; | 630 | /* It is impossible that someone waits for the new value: |
631 | * - complex operations always restart. | ||
632 | * - wait-for-zero are handled seperately. | ||
633 | * - q is a previously sleeping simple operation that | ||
634 | * altered the array. It must be a decrement, because | ||
635 | * simple increments never sleep. | ||
636 | * - If there are older (higher priority) decrements | ||
637 | * in the queue, then they have observed the original | ||
638 | * semval value and couldn't proceed. The operation | ||
639 | * decremented to value - thus they won't proceed either. | ||
640 | */ | ||
641 | return 0; | ||
642 | } | ||
632 | 643 | ||
633 | /* No-one waits on this queue */ | 644 | /** |
634 | if (list_empty(&curr->sem_pending)) | 645 | * wake_const_ops(sma, semnum, pt) - Wake up non-alter tasks |
635 | return 0; | 646 | * @sma: semaphore array. |
647 | * @semnum: semaphore that was modified. | ||
648 | * @pt: list head for the tasks that must be woken up. | ||
649 | * | ||
650 | * wake_const_ops must be called after a semaphore in a semaphore array | ||
651 | * was set to 0. If complex const operations are pending, wake_const_ops must | ||
652 | * be called with semnum = -1, as well as with the number of each modified | ||
653 | * semaphore. | ||
654 | * The tasks that must be woken up are added to @pt. The return code | ||
655 | * is stored in q->pid. | ||
656 | * The function returns 1 if at least one operation was completed successfully. | ||
657 | */ | ||
658 | static int wake_const_ops(struct sem_array *sma, int semnum, | ||
659 | struct list_head *pt) | ||
660 | { | ||
661 | struct sem_queue *q; | ||
662 | struct list_head *walk; | ||
663 | struct list_head *pending_list; | ||
664 | int semop_completed = 0; | ||
665 | |||
666 | if (semnum == -1) | ||
667 | pending_list = &sma->pending_const; | ||
668 | else | ||
669 | pending_list = &sma->sem_base[semnum].pending_const; | ||
636 | 670 | ||
637 | /* the new semaphore value */ | 671 | walk = pending_list->next; |
638 | if (curr->semval) { | 672 | while (walk != pending_list) { |
639 | /* It is impossible that someone waits for the new value: | 673 | int error; |
640 | * - q is a previously sleeping simple operation that | 674 | |
641 | * altered the array. It must be a decrement, because | 675 | q = container_of(walk, struct sem_queue, list); |
642 | * simple increments never sleep. | 676 | walk = walk->next; |
643 | * - The value is not 0, thus wait-for-zero won't proceed. | 677 | |
644 | * - If there are older (higher priority) decrements | 678 | error = try_atomic_semop(sma, q->sops, q->nsops, |
645 | * in the queue, then they have observed the original | 679 | q->undo, q->pid); |
646 | * semval value and couldn't proceed. The operation | 680 | |
647 | * decremented to value - thus they won't proceed either. | 681 | if (error <= 0) { |
682 | /* operation completed, remove from queue & wakeup */ | ||
683 | |||
684 | unlink_queue(sma, q); | ||
685 | |||
686 | wake_up_sem_queue_prepare(pt, q, error); | ||
687 | if (error == 0) | ||
688 | semop_completed = 1; | ||
689 | } | ||
690 | } | ||
691 | return semop_completed; | ||
692 | } | ||
693 | |||
694 | /** | ||
695 | * do_smart_wakeup_zero(sma, sops, nsops, pt) - wakeup all wait for zero tasks | ||
696 | * @sma: semaphore array | ||
697 | * @sops: operations that were performed | ||
698 | * @nsops: number of operations | ||
699 | * @pt: list head of the tasks that must be woken up. | ||
700 | * | ||
701 | * do_smart_wakeup_zero() checks all required queue for wait-for-zero | ||
702 | * operations, based on the actual changes that were performed on the | ||
703 | * semaphore array. | ||
704 | * The function returns 1 if at least one operation was completed successfully. | ||
705 | */ | ||
706 | static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | ||
707 | int nsops, struct list_head *pt) | ||
708 | { | ||
709 | int i; | ||
710 | int semop_completed = 0; | ||
711 | int got_zero = 0; | ||
712 | |||
713 | /* first: the per-semaphore queues, if known */ | ||
714 | if (sops) { | ||
715 | for (i = 0; i < nsops; i++) { | ||
716 | int num = sops[i].sem_num; | ||
717 | |||
718 | if (sma->sem_base[num].semval == 0) { | ||
719 | got_zero = 1; | ||
720 | semop_completed |= wake_const_ops(sma, num, pt); | ||
721 | } | ||
722 | } | ||
723 | } else { | ||
724 | /* | ||
725 | * No sops means modified semaphores not known. | ||
726 | * Assume all were changed. | ||
648 | */ | 727 | */ |
649 | BUG_ON(q->sops[0].sem_op >= 0); | 728 | for (i = 0; i < sma->sem_nsems; i++) { |
650 | return 0; | 729 | if (sma->sem_base[i].semval == 0) { |
730 | got_zero = 1; | ||
731 | semop_completed |= wake_const_ops(sma, i, pt); | ||
732 | } | ||
733 | } | ||
651 | } | 734 | } |
652 | /* | 735 | /* |
653 | * semval is 0. Check if there are wait-for-zero semops. | 736 | * If one of the modified semaphores got 0, |
654 | * They must be the first entries in the per-semaphore queue | 737 | * then check the global queue, too. |
655 | */ | 738 | */ |
656 | h = list_first_entry(&curr->sem_pending, struct sem_queue, list); | 739 | if (got_zero) |
657 | BUG_ON(h->nsops != 1); | 740 | semop_completed |= wake_const_ops(sma, -1, pt); |
658 | BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num); | ||
659 | 741 | ||
660 | /* Yes, there is a wait-for-zero semop. Restart */ | 742 | return semop_completed; |
661 | if (h->sops[0].sem_op == 0) | ||
662 | return 1; | ||
663 | |||
664 | /* Again - no-one is waiting for the new value. */ | ||
665 | return 0; | ||
666 | } | 743 | } |
667 | 744 | ||
668 | 745 | ||
@@ -678,6 +755,8 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q) | |||
678 | * semaphore. | 755 | * semaphore. |
679 | * The tasks that must be woken up are added to @pt. The return code | 756 | * The tasks that must be woken up are added to @pt. The return code |
680 | * is stored in q->pid. | 757 | * is stored in q->pid. |
758 | * The function internally checks if const operations can now succeed. | ||
759 | * | ||
681 | * The function return 1 if at least one semop was completed successfully. | 760 | * The function return 1 if at least one semop was completed successfully. |
682 | */ | 761 | */ |
683 | static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) | 762 | static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) |
@@ -688,9 +767,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) | |||
688 | int semop_completed = 0; | 767 | int semop_completed = 0; |
689 | 768 | ||
690 | if (semnum == -1) | 769 | if (semnum == -1) |
691 | pending_list = &sma->sem_pending; | 770 | pending_list = &sma->pending_alter; |
692 | else | 771 | else |
693 | pending_list = &sma->sem_base[semnum].sem_pending; | 772 | pending_list = &sma->sem_base[semnum].pending_alter; |
694 | 773 | ||
695 | again: | 774 | again: |
696 | walk = pending_list->next; | 775 | walk = pending_list->next; |
@@ -702,13 +781,12 @@ again: | |||
702 | 781 | ||
703 | /* If we are scanning the single sop, per-semaphore list of | 782 | /* If we are scanning the single sop, per-semaphore list of |
704 | * one semaphore and that semaphore is 0, then it is not | 783 | * one semaphore and that semaphore is 0, then it is not |
705 | * necessary to scan the "alter" entries: simple increments | 784 | * necessary to scan further: simple increments |
706 | * that affect only one entry succeed immediately and cannot | 785 | * that affect only one entry succeed immediately and cannot |
707 | * be in the per semaphore pending queue, and decrements | 786 | * be in the per semaphore pending queue, and decrements |
708 | * cannot be successful if the value is already 0. | 787 | * cannot be successful if the value is already 0. |
709 | */ | 788 | */ |
710 | if (semnum != -1 && sma->sem_base[semnum].semval == 0 && | 789 | if (semnum != -1 && sma->sem_base[semnum].semval == 0) |
711 | q->alter) | ||
712 | break; | 790 | break; |
713 | 791 | ||
714 | error = try_atomic_semop(sma, q->sops, q->nsops, | 792 | error = try_atomic_semop(sma, q->sops, q->nsops, |
@@ -724,6 +802,7 @@ again: | |||
724 | restart = 0; | 802 | restart = 0; |
725 | } else { | 803 | } else { |
726 | semop_completed = 1; | 804 | semop_completed = 1; |
805 | do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); | ||
727 | restart = check_restart(sma, q); | 806 | restart = check_restart(sma, q); |
728 | } | 807 | } |
729 | 808 | ||
@@ -742,8 +821,8 @@ again: | |||
742 | * @otime: force setting otime | 821 | * @otime: force setting otime |
743 | * @pt: list head of the tasks that must be woken up. | 822 | * @pt: list head of the tasks that must be woken up. |
744 | * | 823 | * |
745 | * do_smart_update() does the required called to update_queue, based on the | 824 | * do_smart_update() does the required calls to update_queue and wakeup_zero, |
746 | * actual changes that were performed on the semaphore array. | 825 | * based on the actual changes that were performed on the semaphore array. |
747 | * Note that the function does not do the actual wake-up: the caller is | 826 | * Note that the function does not do the actual wake-up: the caller is |
748 | * responsible for calling wake_up_sem_queue_do(@pt). | 827 | * responsible for calling wake_up_sem_queue_do(@pt). |
749 | * It is safe to perform this call after dropping all locks. | 828 | * It is safe to perform this call after dropping all locks. |
@@ -754,6 +833,8 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop | |||
754 | int i; | 833 | int i; |
755 | int progress; | 834 | int progress; |
756 | 835 | ||
836 | otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); | ||
837 | |||
757 | progress = 1; | 838 | progress = 1; |
758 | retry_global: | 839 | retry_global: |
759 | if (sma->complex_count) { | 840 | if (sma->complex_count) { |
@@ -813,14 +894,14 @@ static int count_semncnt (struct sem_array * sma, ushort semnum) | |||
813 | struct sem_queue * q; | 894 | struct sem_queue * q; |
814 | 895 | ||
815 | semncnt = 0; | 896 | semncnt = 0; |
816 | list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) { | 897 | list_for_each_entry(q, &sma->sem_base[semnum].pending_alter, list) { |
817 | struct sembuf * sops = q->sops; | 898 | struct sembuf * sops = q->sops; |
818 | BUG_ON(sops->sem_num != semnum); | 899 | BUG_ON(sops->sem_num != semnum); |
819 | if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT)) | 900 | if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT)) |
820 | semncnt++; | 901 | semncnt++; |
821 | } | 902 | } |
822 | 903 | ||
823 | list_for_each_entry(q, &sma->sem_pending, list) { | 904 | list_for_each_entry(q, &sma->pending_alter, list) { |
824 | struct sembuf * sops = q->sops; | 905 | struct sembuf * sops = q->sops; |
825 | int nsops = q->nsops; | 906 | int nsops = q->nsops; |
826 | int i; | 907 | int i; |
@@ -839,14 +920,14 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) | |||
839 | struct sem_queue * q; | 920 | struct sem_queue * q; |
840 | 921 | ||
841 | semzcnt = 0; | 922 | semzcnt = 0; |
842 | list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) { | 923 | list_for_each_entry(q, &sma->sem_base[semnum].pending_const, list) { |
843 | struct sembuf * sops = q->sops; | 924 | struct sembuf * sops = q->sops; |
844 | BUG_ON(sops->sem_num != semnum); | 925 | BUG_ON(sops->sem_num != semnum); |
845 | if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT)) | 926 | if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT)) |
846 | semzcnt++; | 927 | semzcnt++; |
847 | } | 928 | } |
848 | 929 | ||
849 | list_for_each_entry(q, &sma->sem_pending, list) { | 930 | list_for_each_entry(q, &sma->pending_const, list) { |
850 | struct sembuf * sops = q->sops; | 931 | struct sembuf * sops = q->sops; |
851 | int nsops = q->nsops; | 932 | int nsops = q->nsops; |
852 | int i; | 933 | int i; |
@@ -884,13 +965,22 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | |||
884 | 965 | ||
885 | /* Wake up all pending processes and let them fail with EIDRM. */ | 966 | /* Wake up all pending processes and let them fail with EIDRM. */ |
886 | INIT_LIST_HEAD(&tasks); | 967 | INIT_LIST_HEAD(&tasks); |
887 | list_for_each_entry_safe(q, tq, &sma->sem_pending, list) { | 968 | list_for_each_entry_safe(q, tq, &sma->pending_const, list) { |
969 | unlink_queue(sma, q); | ||
970 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | ||
971 | } | ||
972 | |||
973 | list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { | ||
888 | unlink_queue(sma, q); | 974 | unlink_queue(sma, q); |
889 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 975 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); |
890 | } | 976 | } |
891 | for (i = 0; i < sma->sem_nsems; i++) { | 977 | for (i = 0; i < sma->sem_nsems; i++) { |
892 | struct sem *sem = sma->sem_base + i; | 978 | struct sem *sem = sma->sem_base + i; |
893 | list_for_each_entry_safe(q, tq, &sem->sem_pending, list) { | 979 | list_for_each_entry_safe(q, tq, &sem->pending_const, list) { |
980 | unlink_queue(sma, q); | ||
981 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | ||
982 | } | ||
983 | list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { | ||
894 | unlink_queue(sma, q); | 984 | unlink_queue(sma, q); |
895 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 985 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); |
896 | } | 986 | } |
@@ -1658,14 +1748,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
1658 | curr = &sma->sem_base[sops->sem_num]; | 1748 | curr = &sma->sem_base[sops->sem_num]; |
1659 | 1749 | ||
1660 | if (alter) | 1750 | if (alter) |
1661 | list_add_tail(&queue.list, &curr->sem_pending); | 1751 | list_add_tail(&queue.list, &curr->pending_alter); |
1662 | else | 1752 | else |
1663 | list_add(&queue.list, &curr->sem_pending); | 1753 | list_add_tail(&queue.list, &curr->pending_const); |
1664 | } else { | 1754 | } else { |
1665 | if (alter) | 1755 | if (alter) |
1666 | list_add_tail(&queue.list, &sma->sem_pending); | 1756 | list_add_tail(&queue.list, &sma->pending_alter); |
1667 | else | 1757 | else |
1668 | list_add(&queue.list, &sma->sem_pending); | 1758 | list_add_tail(&queue.list, &sma->pending_const); |
1759 | |||
1669 | sma->complex_count++; | 1760 | sma->complex_count++; |
1670 | } | 1761 | } |
1671 | 1762 | ||