diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/futex.c | 1188 | ||||
-rw-r--r-- | kernel/irq/Makefile | 2 | ||||
-rw-r--r-- | kernel/irq/chip.c | 12 | ||||
-rw-r--r-- | kernel/irq/handle.c | 58 | ||||
-rw-r--r-- | kernel/irq/internals.h | 5 | ||||
-rw-r--r-- | kernel/irq/manage.c | 17 | ||||
-rw-r--r-- | kernel/irq/migration.c | 14 | ||||
-rw-r--r-- | kernel/irq/numa_migrate.c | 38 | ||||
-rw-r--r-- | kernel/mutex.c | 29 | ||||
-rw-r--r-- | kernel/rtmutex.c | 248 | ||||
-rw-r--r-- | kernel/rtmutex_common.h | 8 | ||||
-rw-r--r-- | kernel/sched.c | 304 | ||||
-rw-r--r-- | kernel/sched_cpupri.c | 2 | ||||
-rw-r--r-- | kernel/sched_fair.c | 13 | ||||
-rw-r--r-- | kernel/sched_idletask.c | 3 | ||||
-rw-r--r-- | kernel/sched_rt.c | 2 | ||||
-rw-r--r-- | kernel/smp.c | 2 | ||||
-rw-r--r-- | kernel/softirq.c | 2 | ||||
-rw-r--r-- | kernel/sysctl.c | 8 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 2 | ||||
-rw-r--r-- | kernel/timer.c | 86 | ||||
-rw-r--r-- | kernel/wait.c | 2 |
22 files changed, 1479 insertions, 566 deletions
diff --git a/kernel/futex.c b/kernel/futex.c index d546b2d53a62..80b5ce716596 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -19,6 +19,10 @@ | |||
19 | * PRIVATE futexes by Eric Dumazet | 19 | * PRIVATE futexes by Eric Dumazet |
20 | * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> | 20 | * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> |
21 | * | 21 | * |
22 | * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> | ||
23 | * Copyright (C) IBM Corporation, 2009 | ||
24 | * Thanks to Thomas Gleixner for conceptual design and careful reviews. | ||
25 | * | ||
22 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 26 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
23 | * enough at me, Linus for the original (flawed) idea, Matthew | 27 | * enough at me, Linus for the original (flawed) idea, Matthew |
24 | * Kirkwood for proof-of-concept implementation. | 28 | * Kirkwood for proof-of-concept implementation. |
@@ -96,8 +100,8 @@ struct futex_pi_state { | |||
96 | */ | 100 | */ |
97 | struct futex_q { | 101 | struct futex_q { |
98 | struct plist_node list; | 102 | struct plist_node list; |
99 | /* There can only be a single waiter */ | 103 | /* Waiter reference */ |
100 | wait_queue_head_t waiter; | 104 | struct task_struct *task; |
101 | 105 | ||
102 | /* Which hash list lock to use: */ | 106 | /* Which hash list lock to use: */ |
103 | spinlock_t *lock_ptr; | 107 | spinlock_t *lock_ptr; |
@@ -107,7 +111,9 @@ struct futex_q { | |||
107 | 111 | ||
108 | /* Optional priority inheritance state: */ | 112 | /* Optional priority inheritance state: */ |
109 | struct futex_pi_state *pi_state; | 113 | struct futex_pi_state *pi_state; |
110 | struct task_struct *task; | 114 | |
115 | /* rt_waiter storage for requeue_pi: */ | ||
116 | struct rt_mutex_waiter *rt_waiter; | ||
111 | 117 | ||
112 | /* Bitset for the optional bitmasked wakeup */ | 118 | /* Bitset for the optional bitmasked wakeup */ |
113 | u32 bitset; | 119 | u32 bitset; |
@@ -278,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key) | |||
278 | drop_futex_key_refs(key); | 284 | drop_futex_key_refs(key); |
279 | } | 285 | } |
280 | 286 | ||
287 | /** | ||
288 | * futex_top_waiter() - Return the highest priority waiter on a futex | ||
289 | * @hb: the hash bucket the futex_q's reside in | ||
290 | * @key: the futex key (to distinguish it from other futex_q's) | ||
291 | * | ||
292 | * Must be called with the hb lock held. | ||
293 | */ | ||
294 | static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, | ||
295 | union futex_key *key) | ||
296 | { | ||
297 | struct futex_q *this; | ||
298 | |||
299 | plist_for_each_entry(this, &hb->chain, list) { | ||
300 | if (match_futex(&this->key, key)) | ||
301 | return this; | ||
302 | } | ||
303 | return NULL; | ||
304 | } | ||
305 | |||
281 | static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) | 306 | static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) |
282 | { | 307 | { |
283 | u32 curval; | 308 | u32 curval; |
@@ -539,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
539 | return 0; | 564 | return 0; |
540 | } | 565 | } |
541 | 566 | ||
567 | /** | ||
568 | * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex | ||
569 | * @uaddr: the pi futex user address | ||
570 | * @hb: the pi futex hash bucket | ||
571 | * @key: the futex key associated with uaddr and hb | ||
572 | * @ps: the pi_state pointer where we store the result of the | ||
573 | * lookup | ||
574 | * @task: the task to perform the atomic lock work for. This will | ||
575 | * be "current" except in the case of requeue pi. | ||
576 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) | ||
577 | * | ||
578 | * Returns: | ||
579 | * 0 - ready to wait | ||
580 | * 1 - acquired the lock | ||
581 | * <0 - error | ||
582 | * | ||
583 | * The hb->lock and futex_key refs shall be held by the caller. | ||
584 | */ | ||
585 | static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | ||
586 | union futex_key *key, | ||
587 | struct futex_pi_state **ps, | ||
588 | struct task_struct *task, int set_waiters) | ||
589 | { | ||
590 | int lock_taken, ret, ownerdied = 0; | ||
591 | u32 uval, newval, curval; | ||
592 | |||
593 | retry: | ||
594 | ret = lock_taken = 0; | ||
595 | |||
596 | /* | ||
597 | * To avoid races, we attempt to take the lock here again | ||
598 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | ||
599 | * the locks. It will most likely not succeed. | ||
600 | */ | ||
601 | newval = task_pid_vnr(task); | ||
602 | if (set_waiters) | ||
603 | newval |= FUTEX_WAITERS; | ||
604 | |||
605 | curval = cmpxchg_futex_value_locked(uaddr, 0, newval); | ||
606 | |||
607 | if (unlikely(curval == -EFAULT)) | ||
608 | return -EFAULT; | ||
609 | |||
610 | /* | ||
611 | * Detect deadlocks. | ||
612 | */ | ||
613 | if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) | ||
614 | return -EDEADLK; | ||
615 | |||
616 | /* | ||
617 | * Surprise - we got the lock. Just return to userspace: | ||
618 | */ | ||
619 | if (unlikely(!curval)) | ||
620 | return 1; | ||
621 | |||
622 | uval = curval; | ||
623 | |||
624 | /* | ||
625 | * Set the FUTEX_WAITERS flag, so the owner will know it has someone | ||
626 | * to wake at the next unlock. | ||
627 | */ | ||
628 | newval = curval | FUTEX_WAITERS; | ||
629 | |||
630 | /* | ||
631 | * There are two cases, where a futex might have no owner (the | ||
632 | * owner TID is 0): OWNER_DIED. We take over the futex in this | ||
633 | * case. We also do an unconditional take over, when the owner | ||
634 | * of the futex died. | ||
635 | * | ||
636 | * This is safe as we are protected by the hash bucket lock ! | ||
637 | */ | ||
638 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | ||
639 | /* Keep the OWNER_DIED bit */ | ||
640 | newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); | ||
641 | ownerdied = 0; | ||
642 | lock_taken = 1; | ||
643 | } | ||
644 | |||
645 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | ||
646 | |||
647 | if (unlikely(curval == -EFAULT)) | ||
648 | return -EFAULT; | ||
649 | if (unlikely(curval != uval)) | ||
650 | goto retry; | ||
651 | |||
652 | /* | ||
653 | * We took the lock due to owner died take over. | ||
654 | */ | ||
655 | if (unlikely(lock_taken)) | ||
656 | return 1; | ||
657 | |||
658 | /* | ||
659 | * We don't have the lock. Look up the PI state (or create it if | ||
660 | * we are the first waiter): | ||
661 | */ | ||
662 | ret = lookup_pi_state(uval, hb, key, ps); | ||
663 | |||
664 | if (unlikely(ret)) { | ||
665 | switch (ret) { | ||
666 | case -ESRCH: | ||
667 | /* | ||
668 | * No owner found for this futex. Check if the | ||
669 | * OWNER_DIED bit is set to figure out whether | ||
670 | * this is a robust futex or not. | ||
671 | */ | ||
672 | if (get_futex_value_locked(&curval, uaddr)) | ||
673 | return -EFAULT; | ||
674 | |||
675 | /* | ||
676 | * We simply start over in case of a robust | ||
677 | * futex. The code above will take the futex | ||
678 | * and return happy. | ||
679 | */ | ||
680 | if (curval & FUTEX_OWNER_DIED) { | ||
681 | ownerdied = 1; | ||
682 | goto retry; | ||
683 | } | ||
684 | default: | ||
685 | break; | ||
686 | } | ||
687 | } | ||
688 | |||
689 | return ret; | ||
690 | } | ||
691 | |||
542 | /* | 692 | /* |
543 | * The hash bucket lock must be held when this is called. | 693 | * The hash bucket lock must be held when this is called. |
544 | * Afterwards, the futex_q must not be accessed. | 694 | * Afterwards, the futex_q must not be accessed. |
545 | */ | 695 | */ |
546 | static void wake_futex(struct futex_q *q) | 696 | static void wake_futex(struct futex_q *q) |
547 | { | 697 | { |
548 | plist_del(&q->list, &q->list.plist); | 698 | struct task_struct *p = q->task; |
699 | |||
549 | /* | 700 | /* |
550 | * The lock in wake_up_all() is a crucial memory barrier after the | 701 | * We set q->lock_ptr = NULL _before_ we wake up the task. If |
551 | * plist_del() and also before assigning to q->lock_ptr. | 702 | * a non futex wake up happens on another CPU then the task |
703 | * might exit and p would dereference a non existing task | ||
704 | * struct. Prevent this by holding a reference on p across the | ||
705 | * wake up. | ||
552 | */ | 706 | */ |
553 | wake_up(&q->waiter); | 707 | get_task_struct(p); |
708 | |||
709 | plist_del(&q->list, &q->list.plist); | ||
554 | /* | 710 | /* |
555 | * The waiting task can free the futex_q as soon as this is written, | 711 | * The waiting task can free the futex_q as soon as |
556 | * without taking any locks. This must come last. | 712 | * q->lock_ptr = NULL is written, without taking any locks. A |
557 | * | 713 | * memory barrier is required here to prevent the following |
558 | * A memory barrier is required here to prevent the following store to | 714 | * store to lock_ptr from getting ahead of the plist_del. |
559 | * lock_ptr from getting ahead of the wakeup. Clearing the lock at the | ||
560 | * end of wake_up() does not prevent this store from moving. | ||
561 | */ | 715 | */ |
562 | smp_wmb(); | 716 | smp_wmb(); |
563 | q->lock_ptr = NULL; | 717 | q->lock_ptr = NULL; |
718 | |||
719 | wake_up_state(p, TASK_NORMAL); | ||
720 | put_task_struct(p); | ||
564 | } | 721 | } |
565 | 722 | ||
566 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | 723 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) |
@@ -689,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | |||
689 | 846 | ||
690 | plist_for_each_entry_safe(this, next, head, list) { | 847 | plist_for_each_entry_safe(this, next, head, list) { |
691 | if (match_futex (&this->key, &key)) { | 848 | if (match_futex (&this->key, &key)) { |
692 | if (this->pi_state) { | 849 | if (this->pi_state || this->rt_waiter) { |
693 | ret = -EINVAL; | 850 | ret = -EINVAL; |
694 | break; | 851 | break; |
695 | } | 852 | } |
@@ -802,24 +959,185 @@ out: | |||
802 | return ret; | 959 | return ret; |
803 | } | 960 | } |
804 | 961 | ||
805 | /* | 962 | /** |
806 | * Requeue all waiters hashed on one physical page to another | 963 | * requeue_futex() - Requeue a futex_q from one hb to another |
807 | * physical page. | 964 | * @q: the futex_q to requeue |
965 | * @hb1: the source hash_bucket | ||
966 | * @hb2: the target hash_bucket | ||
967 | * @key2: the new key for the requeued futex_q | ||
968 | */ | ||
969 | static inline | ||
970 | void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, | ||
971 | struct futex_hash_bucket *hb2, union futex_key *key2) | ||
972 | { | ||
973 | |||
974 | /* | ||
975 | * If key1 and key2 hash to the same bucket, no need to | ||
976 | * requeue. | ||
977 | */ | ||
978 | if (likely(&hb1->chain != &hb2->chain)) { | ||
979 | plist_del(&q->list, &hb1->chain); | ||
980 | plist_add(&q->list, &hb2->chain); | ||
981 | q->lock_ptr = &hb2->lock; | ||
982 | #ifdef CONFIG_DEBUG_PI_LIST | ||
983 | q->list.plist.lock = &hb2->lock; | ||
984 | #endif | ||
985 | } | ||
986 | get_futex_key_refs(key2); | ||
987 | q->key = *key2; | ||
988 | } | ||
989 | |||
990 | /** | ||
991 | * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue | ||
992 | * @q: the futex_q | ||
993 | * @key: the key of the requeue target futex | ||
994 | * | ||
995 | * During futex_requeue, with requeue_pi=1, it is possible to acquire the | ||
996 | * target futex if it is uncontended or via a lock steal. Set the futex_q key | ||
997 | * to the requeue target futex so the waiter can detect the wakeup on the right | ||
998 | * futex, but remove it from the hb and NULL the rt_waiter so it can detect | ||
999 | * atomic lock acquisition. Must be called with the q->lock_ptr held. | ||
1000 | */ | ||
1001 | static inline | ||
1002 | void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) | ||
1003 | { | ||
1004 | drop_futex_key_refs(&q->key); | ||
1005 | get_futex_key_refs(key); | ||
1006 | q->key = *key; | ||
1007 | |||
1008 | WARN_ON(plist_node_empty(&q->list)); | ||
1009 | plist_del(&q->list, &q->list.plist); | ||
1010 | |||
1011 | WARN_ON(!q->rt_waiter); | ||
1012 | q->rt_waiter = NULL; | ||
1013 | |||
1014 | wake_up_state(q->task, TASK_NORMAL); | ||
1015 | } | ||
1016 | |||
1017 | /** | ||
1018 | * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter | ||
1019 | * @pifutex: the user address of the to futex | ||
1020 | * @hb1: the from futex hash bucket, must be locked by the caller | ||
1021 | * @hb2: the to futex hash bucket, must be locked by the caller | ||
1022 | * @key1: the from futex key | ||
1023 | * @key2: the to futex key | ||
1024 | * @ps: address to store the pi_state pointer | ||
1025 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) | ||
1026 | * | ||
1027 | * Try and get the lock on behalf of the top waiter if we can do it atomically. | ||
1028 | * Wake the top waiter if we succeed. If the caller specified set_waiters, | ||
1029 | * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. | ||
1030 | * hb1 and hb2 must be held by the caller. | ||
1031 | * | ||
1032 | * Returns: | ||
1033 | * 0 - failed to acquire the lock atomically | ||
1034 | * 1 - acquired the lock | ||
1035 | * <0 - error | ||
1036 | */ | ||
1037 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, | ||
1038 | struct futex_hash_bucket *hb1, | ||
1039 | struct futex_hash_bucket *hb2, | ||
1040 | union futex_key *key1, union futex_key *key2, | ||
1041 | struct futex_pi_state **ps, int set_waiters) | ||
1042 | { | ||
1043 | struct futex_q *top_waiter = NULL; | ||
1044 | u32 curval; | ||
1045 | int ret; | ||
1046 | |||
1047 | if (get_futex_value_locked(&curval, pifutex)) | ||
1048 | return -EFAULT; | ||
1049 | |||
1050 | /* | ||
1051 | * Find the top_waiter and determine if there are additional waiters. | ||
1052 | * If the caller intends to requeue more than 1 waiter to pifutex, | ||
1053 | * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, | ||
1054 | * as we have means to handle the possible fault. If not, don't set | ||
1055 | * the bit unnecessarily as it will force the subsequent unlock to enter | ||
1056 | * the kernel. | ||
1057 | */ | ||
1058 | top_waiter = futex_top_waiter(hb1, key1); | ||
1059 | |||
1060 | /* There are no waiters, nothing for us to do. */ | ||
1061 | if (!top_waiter) | ||
1062 | return 0; | ||
1063 | |||
1064 | /* | ||
1065 | * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in | ||
1066 | * the contended case or if set_waiters is 1. The pi_state is returned | ||
1067 | * in ps in contended cases. | ||
1068 | */ | ||
1069 | ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, | ||
1070 | set_waiters); | ||
1071 | if (ret == 1) | ||
1072 | requeue_pi_wake_futex(top_waiter, key2); | ||
1073 | |||
1074 | return ret; | ||
1075 | } | ||
1076 | |||
1077 | /** | ||
1078 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 | ||
1079 | * @uaddr1: source futex user address | ||
1080 | * @uaddr2: target futex user address | ||
1081 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) | ||
1082 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) | ||
1083 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a | ||
1084 | * pi futex (pi to pi requeue is not supported) | ||
1085 | * | ||
1086 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | ||
1087 | * uaddr2 atomically on behalf of the top waiter. | ||
1088 | * | ||
1089 | * Returns: | ||
1090 | * >=0 - on success, the number of tasks requeued or woken | ||
1091 | * <0 - on error | ||
808 | */ | 1092 | */ |
809 | static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | 1093 | static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, |
810 | int nr_wake, int nr_requeue, u32 *cmpval) | 1094 | int nr_wake, int nr_requeue, u32 *cmpval, |
1095 | int requeue_pi) | ||
811 | { | 1096 | { |
812 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 1097 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; |
1098 | int drop_count = 0, task_count = 0, ret; | ||
1099 | struct futex_pi_state *pi_state = NULL; | ||
813 | struct futex_hash_bucket *hb1, *hb2; | 1100 | struct futex_hash_bucket *hb1, *hb2; |
814 | struct plist_head *head1; | 1101 | struct plist_head *head1; |
815 | struct futex_q *this, *next; | 1102 | struct futex_q *this, *next; |
816 | int ret, drop_count = 0; | 1103 | u32 curval2; |
1104 | |||
1105 | if (requeue_pi) { | ||
1106 | /* | ||
1107 | * requeue_pi requires a pi_state, try to allocate it now | ||
1108 | * without any locks in case it fails. | ||
1109 | */ | ||
1110 | if (refill_pi_state_cache()) | ||
1111 | return -ENOMEM; | ||
1112 | /* | ||
1113 | * requeue_pi must wake as many tasks as it can, up to nr_wake | ||
1114 | * + nr_requeue, since it acquires the rt_mutex prior to | ||
1115 | * returning to userspace, so as to not leave the rt_mutex with | ||
1116 | * waiters and no owner. However, second and third wake-ups | ||
1117 | * cannot be predicted as they involve race conditions with the | ||
1118 | * first wake and a fault while looking up the pi_state. Both | ||
1119 | * pthread_cond_signal() and pthread_cond_broadcast() should | ||
1120 | * use nr_wake=1. | ||
1121 | */ | ||
1122 | if (nr_wake != 1) | ||
1123 | return -EINVAL; | ||
1124 | } | ||
817 | 1125 | ||
818 | retry: | 1126 | retry: |
1127 | if (pi_state != NULL) { | ||
1128 | /* | ||
1129 | * We will have to lookup the pi_state again, so free this one | ||
1130 | * to keep the accounting correct. | ||
1131 | */ | ||
1132 | free_pi_state(pi_state); | ||
1133 | pi_state = NULL; | ||
1134 | } | ||
1135 | |||
819 | ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); | 1136 | ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); |
820 | if (unlikely(ret != 0)) | 1137 | if (unlikely(ret != 0)) |
821 | goto out; | 1138 | goto out; |
822 | ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ); | 1139 | ret = get_futex_key(uaddr2, fshared, &key2, |
1140 | requeue_pi ? VERIFY_WRITE : VERIFY_READ); | ||
823 | if (unlikely(ret != 0)) | 1141 | if (unlikely(ret != 0)) |
824 | goto out_put_key1; | 1142 | goto out_put_key1; |
825 | 1143 | ||
@@ -854,32 +1172,99 @@ retry_private: | |||
854 | } | 1172 | } |
855 | } | 1173 | } |
856 | 1174 | ||
1175 | if (requeue_pi && (task_count - nr_wake < nr_requeue)) { | ||
1176 | /* | ||
1177 | * Attempt to acquire uaddr2 and wake the top waiter. If we | ||
1178 | * intend to requeue waiters, force setting the FUTEX_WAITERS | ||
1179 | * bit. We force this here where we are able to easily handle | ||
1180 | * faults rather than in the requeue loop below. | ||
1181 | */ | ||
1182 | ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, | ||
1183 | &key2, &pi_state, nr_requeue); | ||
1184 | |||
1185 | /* | ||
1186 | * At this point the top_waiter has either taken uaddr2 or is | ||
1187 | * waiting on it. If the former, then the pi_state will not | ||
1188 | * exist yet, look it up one more time to ensure we have a | ||
1189 | * reference to it. | ||
1190 | */ | ||
1191 | if (ret == 1) { | ||
1192 | WARN_ON(pi_state); | ||
1193 | task_count++; | ||
1194 | ret = get_futex_value_locked(&curval2, uaddr2); | ||
1195 | if (!ret) | ||
1196 | ret = lookup_pi_state(curval2, hb2, &key2, | ||
1197 | &pi_state); | ||
1198 | } | ||
1199 | |||
1200 | switch (ret) { | ||
1201 | case 0: | ||
1202 | break; | ||
1203 | case -EFAULT: | ||
1204 | double_unlock_hb(hb1, hb2); | ||
1205 | put_futex_key(fshared, &key2); | ||
1206 | put_futex_key(fshared, &key1); | ||
1207 | ret = get_user(curval2, uaddr2); | ||
1208 | if (!ret) | ||
1209 | goto retry; | ||
1210 | goto out; | ||
1211 | case -EAGAIN: | ||
1212 | /* The owner was exiting, try again. */ | ||
1213 | double_unlock_hb(hb1, hb2); | ||
1214 | put_futex_key(fshared, &key2); | ||
1215 | put_futex_key(fshared, &key1); | ||
1216 | cond_resched(); | ||
1217 | goto retry; | ||
1218 | default: | ||
1219 | goto out_unlock; | ||
1220 | } | ||
1221 | } | ||
1222 | |||
857 | head1 = &hb1->chain; | 1223 | head1 = &hb1->chain; |
858 | plist_for_each_entry_safe(this, next, head1, list) { | 1224 | plist_for_each_entry_safe(this, next, head1, list) { |
859 | if (!match_futex (&this->key, &key1)) | 1225 | if (task_count - nr_wake >= nr_requeue) |
1226 | break; | ||
1227 | |||
1228 | if (!match_futex(&this->key, &key1)) | ||
860 | continue; | 1229 | continue; |
861 | if (++ret <= nr_wake) { | 1230 | |
1231 | WARN_ON(!requeue_pi && this->rt_waiter); | ||
1232 | WARN_ON(requeue_pi && !this->rt_waiter); | ||
1233 | |||
1234 | /* | ||
1235 | * Wake nr_wake waiters. For requeue_pi, if we acquired the | ||
1236 | * lock, we already woke the top_waiter. If not, it will be | ||
1237 | * woken by futex_unlock_pi(). | ||
1238 | */ | ||
1239 | if (++task_count <= nr_wake && !requeue_pi) { | ||
862 | wake_futex(this); | 1240 | wake_futex(this); |
863 | } else { | 1241 | continue; |
864 | /* | 1242 | } |
865 | * If key1 and key2 hash to the same bucket, no need to | ||
866 | * requeue. | ||
867 | */ | ||
868 | if (likely(head1 != &hb2->chain)) { | ||
869 | plist_del(&this->list, &hb1->chain); | ||
870 | plist_add(&this->list, &hb2->chain); | ||
871 | this->lock_ptr = &hb2->lock; | ||
872 | #ifdef CONFIG_DEBUG_PI_LIST | ||
873 | this->list.plist.lock = &hb2->lock; | ||
874 | #endif | ||
875 | } | ||
876 | this->key = key2; | ||
877 | get_futex_key_refs(&key2); | ||
878 | drop_count++; | ||
879 | 1243 | ||
880 | if (ret - nr_wake >= nr_requeue) | 1244 | /* |
881 | break; | 1245 | * Requeue nr_requeue waiters and possibly one more in the case |
1246 | * of requeue_pi if we couldn't acquire the lock atomically. | ||
1247 | */ | ||
1248 | if (requeue_pi) { | ||
1249 | /* Prepare the waiter to take the rt_mutex. */ | ||
1250 | atomic_inc(&pi_state->refcount); | ||
1251 | this->pi_state = pi_state; | ||
1252 | ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, | ||
1253 | this->rt_waiter, | ||
1254 | this->task, 1); | ||
1255 | if (ret == 1) { | ||
1256 | /* We got the lock. */ | ||
1257 | requeue_pi_wake_futex(this, &key2); | ||
1258 | continue; | ||
1259 | } else if (ret) { | ||
1260 | /* -EDEADLK */ | ||
1261 | this->pi_state = NULL; | ||
1262 | free_pi_state(pi_state); | ||
1263 | goto out_unlock; | ||
1264 | } | ||
882 | } | 1265 | } |
1266 | requeue_futex(this, hb1, hb2, &key2); | ||
1267 | drop_count++; | ||
883 | } | 1268 | } |
884 | 1269 | ||
885 | out_unlock: | 1270 | out_unlock: |
@@ -899,7 +1284,9 @@ out_put_keys: | |||
899 | out_put_key1: | 1284 | out_put_key1: |
900 | put_futex_key(fshared, &key1); | 1285 | put_futex_key(fshared, &key1); |
901 | out: | 1286 | out: |
902 | return ret; | 1287 | if (pi_state != NULL) |
1288 | free_pi_state(pi_state); | ||
1289 | return ret ? ret : task_count; | ||
903 | } | 1290 | } |
904 | 1291 | ||
905 | /* The key must be already stored in q->key. */ | 1292 | /* The key must be already stored in q->key. */ |
@@ -907,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | |||
907 | { | 1294 | { |
908 | struct futex_hash_bucket *hb; | 1295 | struct futex_hash_bucket *hb; |
909 | 1296 | ||
910 | init_waitqueue_head(&q->waiter); | ||
911 | |||
912 | get_futex_key_refs(&q->key); | 1297 | get_futex_key_refs(&q->key); |
913 | hb = hash_futex(&q->key); | 1298 | hb = hash_futex(&q->key); |
914 | q->lock_ptr = &hb->lock; | 1299 | q->lock_ptr = &hb->lock; |
@@ -1119,35 +1504,149 @@ handle_fault: | |||
1119 | */ | 1504 | */ |
1120 | #define FLAGS_SHARED 0x01 | 1505 | #define FLAGS_SHARED 0x01 |
1121 | #define FLAGS_CLOCKRT 0x02 | 1506 | #define FLAGS_CLOCKRT 0x02 |
1507 | #define FLAGS_HAS_TIMEOUT 0x04 | ||
1122 | 1508 | ||
1123 | static long futex_wait_restart(struct restart_block *restart); | 1509 | static long futex_wait_restart(struct restart_block *restart); |
1124 | 1510 | ||
1125 | static int futex_wait(u32 __user *uaddr, int fshared, | 1511 | /** |
1126 | u32 val, ktime_t *abs_time, u32 bitset, int clockrt) | 1512 | * fixup_owner() - Post lock pi_state and corner case management |
1513 | * @uaddr: user address of the futex | ||
1514 | * @fshared: whether the futex is shared (1) or not (0) | ||
1515 | * @q: futex_q (contains pi_state and access to the rt_mutex) | ||
1516 | * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) | ||
1517 | * | ||
1518 | * After attempting to lock an rt_mutex, this function is called to cleanup | ||
1519 | * the pi_state owner as well as handle race conditions that may allow us to | ||
1520 | * acquire the lock. Must be called with the hb lock held. | ||
1521 | * | ||
1522 | * Returns: | ||
1523 | * 1 - success, lock taken | ||
1524 | * 0 - success, lock not taken | ||
1525 | * <0 - on error (-EFAULT) | ||
1526 | */ | ||
1527 | static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, | ||
1528 | int locked) | ||
1127 | { | 1529 | { |
1128 | struct task_struct *curr = current; | 1530 | struct task_struct *owner; |
1129 | struct restart_block *restart; | 1531 | int ret = 0; |
1130 | DECLARE_WAITQUEUE(wait, curr); | ||
1131 | struct futex_hash_bucket *hb; | ||
1132 | struct futex_q q; | ||
1133 | u32 uval; | ||
1134 | int ret; | ||
1135 | struct hrtimer_sleeper t; | ||
1136 | int rem = 0; | ||
1137 | 1532 | ||
1138 | if (!bitset) | 1533 | if (locked) { |
1139 | return -EINVAL; | 1534 | /* |
1535 | * Got the lock. We might not be the anticipated owner if we | ||
1536 | * did a lock-steal - fix up the PI-state in that case: | ||
1537 | */ | ||
1538 | if (q->pi_state->owner != current) | ||
1539 | ret = fixup_pi_state_owner(uaddr, q, current, fshared); | ||
1540 | goto out; | ||
1541 | } | ||
1140 | 1542 | ||
1141 | q.pi_state = NULL; | 1543 | /* |
1142 | q.bitset = bitset; | 1544 | * Catch the rare case, where the lock was released when we were on the |
1143 | retry: | 1545 | * way back before we locked the hash bucket. |
1144 | q.key = FUTEX_KEY_INIT; | 1546 | */ |
1145 | ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ); | 1547 | if (q->pi_state->owner == current) { |
1146 | if (unlikely(ret != 0)) | 1548 | /* |
1549 | * Try to get the rt_mutex now. This might fail as some other | ||
1550 | * task acquired the rt_mutex after we removed ourself from the | ||
1551 | * rt_mutex waiters list. | ||
1552 | */ | ||
1553 | if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { | ||
1554 | locked = 1; | ||
1555 | goto out; | ||
1556 | } | ||
1557 | |||
1558 | /* | ||
1559 | * pi_state is incorrect, some other task did a lock steal and | ||
1560 | * we returned due to timeout or signal without taking the | ||
1561 | * rt_mutex. Too late. We can access the rt_mutex_owner without | ||
1562 | * locking, as the other task is now blocked on the hash bucket | ||
1563 | * lock. Fix the state up. | ||
1564 | */ | ||
1565 | owner = rt_mutex_owner(&q->pi_state->pi_mutex); | ||
1566 | ret = fixup_pi_state_owner(uaddr, q, owner, fshared); | ||
1147 | goto out; | 1567 | goto out; |
1568 | } | ||
1148 | 1569 | ||
1149 | retry_private: | 1570 | /* |
1150 | hb = queue_lock(&q); | 1571 | * Paranoia check. If we did not take the lock, then we should not be |
1572 | * the owner, nor the pending owner, of the rt_mutex. | ||
1573 | */ | ||
1574 | if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) | ||
1575 | printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " | ||
1576 | "pi-state %p\n", ret, | ||
1577 | q->pi_state->pi_mutex.owner, | ||
1578 | q->pi_state->owner); | ||
1579 | |||
1580 | out: | ||
1581 | return ret ? ret : locked; | ||
1582 | } | ||
1583 | |||
1584 | /** | ||
1585 | * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal | ||
1586 | * @hb: the futex hash bucket, must be locked by the caller | ||
1587 | * @q: the futex_q to queue up on | ||
1588 | * @timeout: the prepared hrtimer_sleeper, or null for no timeout | ||
1589 | */ | ||
1590 | static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | ||
1591 | struct hrtimer_sleeper *timeout) | ||
1592 | { | ||
1593 | queue_me(q, hb); | ||
1594 | |||
1595 | /* | ||
1596 | * There might have been scheduling since the queue_me(), as we | ||
1597 | * cannot hold a spinlock across the get_user() in case it | ||
1598 | * faults, and we cannot just set TASK_INTERRUPTIBLE state when | ||
1599 | * queueing ourselves into the futex hash. This code thus has to | ||
1600 | * rely on the futex_wake() code removing us from hash when it | ||
1601 | * wakes us up. | ||
1602 | */ | ||
1603 | set_current_state(TASK_INTERRUPTIBLE); | ||
1604 | |||
1605 | /* Arm the timer */ | ||
1606 | if (timeout) { | ||
1607 | hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); | ||
1608 | if (!hrtimer_active(&timeout->timer)) | ||
1609 | timeout->task = NULL; | ||
1610 | } | ||
1611 | |||
1612 | /* | ||
1613 | * !plist_node_empty() is safe here without any lock. | ||
1614 | * q.lock_ptr != 0 is not safe, because of ordering against wakeup. | ||
1615 | */ | ||
1616 | if (likely(!plist_node_empty(&q->list))) { | ||
1617 | /* | ||
1618 | * If the timer has already expired, current will already be | ||
1619 | * flagged for rescheduling. Only call schedule if there | ||
1620 | * is no timeout, or if it has yet to expire. | ||
1621 | */ | ||
1622 | if (!timeout || timeout->task) | ||
1623 | schedule(); | ||
1624 | } | ||
1625 | __set_current_state(TASK_RUNNING); | ||
1626 | } | ||
1627 | |||
1628 | /** | ||
1629 | * futex_wait_setup() - Prepare to wait on a futex | ||
1630 | * @uaddr: the futex userspace address | ||
1631 | * @val: the expected value | ||
1632 | * @fshared: whether the futex is shared (1) or not (0) | ||
1633 | * @q: the associated futex_q | ||
1634 | * @hb: storage for hash_bucket pointer to be returned to caller | ||
1635 | * | ||
1636 | * Setup the futex_q and locate the hash_bucket. Get the futex value and | ||
1637 | * compare it with the expected value. Handle atomic faults internally. | ||
1638 | * Return with the hb lock held and a q.key reference on success, and unlocked | ||
1639 | * with no q.key reference on failure. | ||
1640 | * | ||
1641 | * Returns: | ||
1642 | * 0 - uaddr contains val and hb has been locked | ||
1643 | * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked | ||
1644 | */ | ||
1645 | static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, | ||
1646 | struct futex_q *q, struct futex_hash_bucket **hb) | ||
1647 | { | ||
1648 | u32 uval; | ||
1649 | int ret; | ||
1151 | 1650 | ||
1152 | /* | 1651 | /* |
1153 | * Access the page AFTER the hash-bucket is locked. | 1652 | * Access the page AFTER the hash-bucket is locked. |
@@ -1165,95 +1664,83 @@ retry_private: | |||
1165 | * A consequence is that futex_wait() can return zero and absorb | 1664 | * A consequence is that futex_wait() can return zero and absorb |
1166 | * a wakeup when *uaddr != val on entry to the syscall. This is | 1665 | * a wakeup when *uaddr != val on entry to the syscall. This is |
1167 | * rare, but normal. | 1666 | * rare, but normal. |
1168 | * | ||
1169 | * For shared futexes, we hold the mmap semaphore, so the mapping | ||
1170 | * cannot have changed since we looked it up in get_futex_key. | ||
1171 | */ | 1667 | */ |
1668 | retry: | ||
1669 | q->key = FUTEX_KEY_INIT; | ||
1670 | ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); | ||
1671 | if (unlikely(ret != 0)) | ||
1672 | return ret; | ||
1673 | |||
1674 | retry_private: | ||
1675 | *hb = queue_lock(q); | ||
1676 | |||
1172 | ret = get_futex_value_locked(&uval, uaddr); | 1677 | ret = get_futex_value_locked(&uval, uaddr); |
1173 | 1678 | ||
1174 | if (unlikely(ret)) { | 1679 | if (ret) { |
1175 | queue_unlock(&q, hb); | 1680 | queue_unlock(q, *hb); |
1176 | 1681 | ||
1177 | ret = get_user(uval, uaddr); | 1682 | ret = get_user(uval, uaddr); |
1178 | if (ret) | 1683 | if (ret) |
1179 | goto out_put_key; | 1684 | goto out; |
1180 | 1685 | ||
1181 | if (!fshared) | 1686 | if (!fshared) |
1182 | goto retry_private; | 1687 | goto retry_private; |
1183 | 1688 | ||
1184 | put_futex_key(fshared, &q.key); | 1689 | put_futex_key(fshared, &q->key); |
1185 | goto retry; | 1690 | goto retry; |
1186 | } | 1691 | } |
1187 | ret = -EWOULDBLOCK; | ||
1188 | if (unlikely(uval != val)) { | ||
1189 | queue_unlock(&q, hb); | ||
1190 | goto out_put_key; | ||
1191 | } | ||
1192 | 1692 | ||
1193 | /* Only actually queue if *uaddr contained val. */ | 1693 | if (uval != val) { |
1194 | queue_me(&q, hb); | 1694 | queue_unlock(q, *hb); |
1695 | ret = -EWOULDBLOCK; | ||
1696 | } | ||
1195 | 1697 | ||
1196 | /* | 1698 | out: |
1197 | * There might have been scheduling since the queue_me(), as we | 1699 | if (ret) |
1198 | * cannot hold a spinlock across the get_user() in case it | 1700 | put_futex_key(fshared, &q->key); |
1199 | * faults, and we cannot just set TASK_INTERRUPTIBLE state when | 1701 | return ret; |
1200 | * queueing ourselves into the futex hash. This code thus has to | 1702 | } |
1201 | * rely on the futex_wake() code removing us from hash when it | ||
1202 | * wakes us up. | ||
1203 | */ | ||
1204 | 1703 | ||
1205 | /* add_wait_queue is the barrier after __set_current_state. */ | 1704 | static int futex_wait(u32 __user *uaddr, int fshared, |
1206 | __set_current_state(TASK_INTERRUPTIBLE); | 1705 | u32 val, ktime_t *abs_time, u32 bitset, int clockrt) |
1207 | add_wait_queue(&q.waiter, &wait); | 1706 | { |
1208 | /* | 1707 | struct hrtimer_sleeper timeout, *to = NULL; |
1209 | * !plist_node_empty() is safe here without any lock. | 1708 | struct restart_block *restart; |
1210 | * q.lock_ptr != 0 is not safe, because of ordering against wakeup. | 1709 | struct futex_hash_bucket *hb; |
1211 | */ | 1710 | struct futex_q q; |
1212 | if (likely(!plist_node_empty(&q.list))) { | 1711 | int ret; |
1213 | if (!abs_time) | ||
1214 | schedule(); | ||
1215 | else { | ||
1216 | hrtimer_init_on_stack(&t.timer, | ||
1217 | clockrt ? CLOCK_REALTIME : | ||
1218 | CLOCK_MONOTONIC, | ||
1219 | HRTIMER_MODE_ABS); | ||
1220 | hrtimer_init_sleeper(&t, current); | ||
1221 | hrtimer_set_expires_range_ns(&t.timer, *abs_time, | ||
1222 | current->timer_slack_ns); | ||
1223 | |||
1224 | hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); | ||
1225 | if (!hrtimer_active(&t.timer)) | ||
1226 | t.task = NULL; | ||
1227 | 1712 | ||
1228 | /* | 1713 | if (!bitset) |
1229 | * the timer could have already expired, in which | 1714 | return -EINVAL; |
1230 | * case current would be flagged for rescheduling. | ||
1231 | * Don't bother calling schedule. | ||
1232 | */ | ||
1233 | if (likely(t.task)) | ||
1234 | schedule(); | ||
1235 | 1715 | ||
1236 | hrtimer_cancel(&t.timer); | 1716 | q.pi_state = NULL; |
1717 | q.bitset = bitset; | ||
1718 | q.rt_waiter = NULL; | ||
1237 | 1719 | ||
1238 | /* Flag if a timeout occured */ | 1720 | if (abs_time) { |
1239 | rem = (t.task == NULL); | 1721 | to = &timeout; |
1240 | 1722 | ||
1241 | destroy_hrtimer_on_stack(&t.timer); | 1723 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : |
1242 | } | 1724 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
1725 | hrtimer_init_sleeper(to, current); | ||
1726 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | ||
1727 | current->timer_slack_ns); | ||
1243 | } | 1728 | } |
1244 | __set_current_state(TASK_RUNNING); | ||
1245 | 1729 | ||
1246 | /* | 1730 | /* Prepare to wait on uaddr. */ |
1247 | * NOTE: we don't remove ourselves from the waitqueue because | 1731 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); |
1248 | * we are the only user of it. | 1732 | if (ret) |
1249 | */ | 1733 | goto out; |
1734 | |||
1735 | /* queue_me and wait for wakeup, timeout, or a signal. */ | ||
1736 | futex_wait_queue_me(hb, &q, to); | ||
1250 | 1737 | ||
1251 | /* If we were woken (and unqueued), we succeeded, whatever. */ | 1738 | /* If we were woken (and unqueued), we succeeded, whatever. */ |
1252 | ret = 0; | 1739 | ret = 0; |
1253 | if (!unqueue_me(&q)) | 1740 | if (!unqueue_me(&q)) |
1254 | goto out_put_key; | 1741 | goto out_put_key; |
1255 | ret = -ETIMEDOUT; | 1742 | ret = -ETIMEDOUT; |
1256 | if (rem) | 1743 | if (to && !to->task) |
1257 | goto out_put_key; | 1744 | goto out_put_key; |
1258 | 1745 | ||
1259 | /* | 1746 | /* |
@@ -1270,7 +1757,7 @@ retry_private: | |||
1270 | restart->futex.val = val; | 1757 | restart->futex.val = val; |
1271 | restart->futex.time = abs_time->tv64; | 1758 | restart->futex.time = abs_time->tv64; |
1272 | restart->futex.bitset = bitset; | 1759 | restart->futex.bitset = bitset; |
1273 | restart->futex.flags = 0; | 1760 | restart->futex.flags = FLAGS_HAS_TIMEOUT; |
1274 | 1761 | ||
1275 | if (fshared) | 1762 | if (fshared) |
1276 | restart->futex.flags |= FLAGS_SHARED; | 1763 | restart->futex.flags |= FLAGS_SHARED; |
@@ -1282,6 +1769,10 @@ retry_private: | |||
1282 | out_put_key: | 1769 | out_put_key: |
1283 | put_futex_key(fshared, &q.key); | 1770 | put_futex_key(fshared, &q.key); |
1284 | out: | 1771 | out: |
1772 | if (to) { | ||
1773 | hrtimer_cancel(&to->timer); | ||
1774 | destroy_hrtimer_on_stack(&to->timer); | ||
1775 | } | ||
1285 | return ret; | 1776 | return ret; |
1286 | } | 1777 | } |
1287 | 1778 | ||
@@ -1290,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart) | |||
1290 | { | 1781 | { |
1291 | u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; | 1782 | u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; |
1292 | int fshared = 0; | 1783 | int fshared = 0; |
1293 | ktime_t t; | 1784 | ktime_t t, *tp = NULL; |
1294 | 1785 | ||
1295 | t.tv64 = restart->futex.time; | 1786 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { |
1787 | t.tv64 = restart->futex.time; | ||
1788 | tp = &t; | ||
1789 | } | ||
1296 | restart->fn = do_no_restart_syscall; | 1790 | restart->fn = do_no_restart_syscall; |
1297 | if (restart->futex.flags & FLAGS_SHARED) | 1791 | if (restart->futex.flags & FLAGS_SHARED) |
1298 | fshared = 1; | 1792 | fshared = 1; |
1299 | return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, | 1793 | return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, |
1300 | restart->futex.bitset, | 1794 | restart->futex.bitset, |
1301 | restart->futex.flags & FLAGS_CLOCKRT); | 1795 | restart->futex.flags & FLAGS_CLOCKRT); |
1302 | } | 1796 | } |
@@ -1312,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1312 | int detect, ktime_t *time, int trylock) | 1806 | int detect, ktime_t *time, int trylock) |
1313 | { | 1807 | { |
1314 | struct hrtimer_sleeper timeout, *to = NULL; | 1808 | struct hrtimer_sleeper timeout, *to = NULL; |
1315 | struct task_struct *curr = current; | ||
1316 | struct futex_hash_bucket *hb; | 1809 | struct futex_hash_bucket *hb; |
1317 | u32 uval, newval, curval; | 1810 | u32 uval; |
1318 | struct futex_q q; | 1811 | struct futex_q q; |
1319 | int ret, lock_taken, ownerdied = 0; | 1812 | int res, ret; |
1320 | 1813 | ||
1321 | if (refill_pi_state_cache()) | 1814 | if (refill_pi_state_cache()) |
1322 | return -ENOMEM; | 1815 | return -ENOMEM; |
@@ -1330,6 +1823,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1330 | } | 1823 | } |
1331 | 1824 | ||
1332 | q.pi_state = NULL; | 1825 | q.pi_state = NULL; |
1826 | q.rt_waiter = NULL; | ||
1333 | retry: | 1827 | retry: |
1334 | q.key = FUTEX_KEY_INIT; | 1828 | q.key = FUTEX_KEY_INIT; |
1335 | ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); | 1829 | ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); |
@@ -1339,81 +1833,15 @@ retry: | |||
1339 | retry_private: | 1833 | retry_private: |
1340 | hb = queue_lock(&q); | 1834 | hb = queue_lock(&q); |
1341 | 1835 | ||
1342 | retry_locked: | 1836 | ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); |
1343 | ret = lock_taken = 0; | ||
1344 | |||
1345 | /* | ||
1346 | * To avoid races, we attempt to take the lock here again | ||
1347 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | ||
1348 | * the locks. It will most likely not succeed. | ||
1349 | */ | ||
1350 | newval = task_pid_vnr(current); | ||
1351 | |||
1352 | curval = cmpxchg_futex_value_locked(uaddr, 0, newval); | ||
1353 | |||
1354 | if (unlikely(curval == -EFAULT)) | ||
1355 | goto uaddr_faulted; | ||
1356 | |||
1357 | /* | ||
1358 | * Detect deadlocks. In case of REQUEUE_PI this is a valid | ||
1359 | * situation and we return success to user space. | ||
1360 | */ | ||
1361 | if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { | ||
1362 | ret = -EDEADLK; | ||
1363 | goto out_unlock_put_key; | ||
1364 | } | ||
1365 | |||
1366 | /* | ||
1367 | * Surprise - we got the lock. Just return to userspace: | ||
1368 | */ | ||
1369 | if (unlikely(!curval)) | ||
1370 | goto out_unlock_put_key; | ||
1371 | |||
1372 | uval = curval; | ||
1373 | |||
1374 | /* | ||
1375 | * Set the WAITERS flag, so the owner will know it has someone | ||
1376 | * to wake at next unlock | ||
1377 | */ | ||
1378 | newval = curval | FUTEX_WAITERS; | ||
1379 | |||
1380 | /* | ||
1381 | * There are two cases, where a futex might have no owner (the | ||
1382 | * owner TID is 0): OWNER_DIED. We take over the futex in this | ||
1383 | * case. We also do an unconditional take over, when the owner | ||
1384 | * of the futex died. | ||
1385 | * | ||
1386 | * This is safe as we are protected by the hash bucket lock ! | ||
1387 | */ | ||
1388 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | ||
1389 | /* Keep the OWNER_DIED bit */ | ||
1390 | newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); | ||
1391 | ownerdied = 0; | ||
1392 | lock_taken = 1; | ||
1393 | } | ||
1394 | |||
1395 | curval = cmpxchg_futex_value_locked(uaddr, uval, newval); | ||
1396 | |||
1397 | if (unlikely(curval == -EFAULT)) | ||
1398 | goto uaddr_faulted; | ||
1399 | if (unlikely(curval != uval)) | ||
1400 | goto retry_locked; | ||
1401 | |||
1402 | /* | ||
1403 | * We took the lock due to owner died take over. | ||
1404 | */ | ||
1405 | if (unlikely(lock_taken)) | ||
1406 | goto out_unlock_put_key; | ||
1407 | |||
1408 | /* | ||
1409 | * We dont have the lock. Look up the PI state (or create it if | ||
1410 | * we are the first waiter): | ||
1411 | */ | ||
1412 | ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); | ||
1413 | |||
1414 | if (unlikely(ret)) { | 1837 | if (unlikely(ret)) { |
1415 | switch (ret) { | 1838 | switch (ret) { |
1416 | 1839 | case 1: | |
1840 | /* We got the lock. */ | ||
1841 | ret = 0; | ||
1842 | goto out_unlock_put_key; | ||
1843 | case -EFAULT: | ||
1844 | goto uaddr_faulted; | ||
1417 | case -EAGAIN: | 1845 | case -EAGAIN: |
1418 | /* | 1846 | /* |
1419 | * Task is exiting and we just wait for the | 1847 | * Task is exiting and we just wait for the |
@@ -1423,25 +1851,6 @@ retry_locked: | |||
1423 | put_futex_key(fshared, &q.key); | 1851 | put_futex_key(fshared, &q.key); |
1424 | cond_resched(); | 1852 | cond_resched(); |
1425 | goto retry; | 1853 | goto retry; |
1426 | |||
1427 | case -ESRCH: | ||
1428 | /* | ||
1429 | * No owner found for this futex. Check if the | ||
1430 | * OWNER_DIED bit is set to figure out whether | ||
1431 | * this is a robust futex or not. | ||
1432 | */ | ||
1433 | if (get_futex_value_locked(&curval, uaddr)) | ||
1434 | goto uaddr_faulted; | ||
1435 | |||
1436 | /* | ||
1437 | * We simply start over in case of a robust | ||
1438 | * futex. The code above will take the futex | ||
1439 | * and return happy. | ||
1440 | */ | ||
1441 | if (curval & FUTEX_OWNER_DIED) { | ||
1442 | ownerdied = 1; | ||
1443 | goto retry_locked; | ||
1444 | } | ||
1445 | default: | 1854 | default: |
1446 | goto out_unlock_put_key; | 1855 | goto out_unlock_put_key; |
1447 | } | 1856 | } |
@@ -1465,71 +1874,21 @@ retry_locked: | |||
1465 | } | 1874 | } |
1466 | 1875 | ||
1467 | spin_lock(q.lock_ptr); | 1876 | spin_lock(q.lock_ptr); |
1468 | 1877 | /* | |
1469 | if (!ret) { | 1878 | * Fixup the pi_state owner and possibly acquire the lock if we |
1470 | /* | 1879 | * haven't already. |
1471 | * Got the lock. We might not be the anticipated owner | 1880 | */ |
1472 | * if we did a lock-steal - fix up the PI-state in | 1881 | res = fixup_owner(uaddr, fshared, &q, !ret); |
1473 | * that case: | 1882 | /* |
1474 | */ | 1883 | * If fixup_owner() returned an error, proprogate that. If it acquired |
1475 | if (q.pi_state->owner != curr) | 1884 | * the lock, clear our -ETIMEDOUT or -EINTR. |
1476 | ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); | 1885 | */ |
1477 | } else { | 1886 | if (res) |
1478 | /* | 1887 | ret = (res < 0) ? res : 0; |
1479 | * Catch the rare case, where the lock was released | ||
1480 | * when we were on the way back before we locked the | ||
1481 | * hash bucket. | ||
1482 | */ | ||
1483 | if (q.pi_state->owner == curr) { | ||
1484 | /* | ||
1485 | * Try to get the rt_mutex now. This might | ||
1486 | * fail as some other task acquired the | ||
1487 | * rt_mutex after we removed ourself from the | ||
1488 | * rt_mutex waiters list. | ||
1489 | */ | ||
1490 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | ||
1491 | ret = 0; | ||
1492 | else { | ||
1493 | /* | ||
1494 | * pi_state is incorrect, some other | ||
1495 | * task did a lock steal and we | ||
1496 | * returned due to timeout or signal | ||
1497 | * without taking the rt_mutex. Too | ||
1498 | * late. We can access the | ||
1499 | * rt_mutex_owner without locking, as | ||
1500 | * the other task is now blocked on | ||
1501 | * the hash bucket lock. Fix the state | ||
1502 | * up. | ||
1503 | */ | ||
1504 | struct task_struct *owner; | ||
1505 | int res; | ||
1506 | |||
1507 | owner = rt_mutex_owner(&q.pi_state->pi_mutex); | ||
1508 | res = fixup_pi_state_owner(uaddr, &q, owner, | ||
1509 | fshared); | ||
1510 | |||
1511 | /* propagate -EFAULT, if the fixup failed */ | ||
1512 | if (res) | ||
1513 | ret = res; | ||
1514 | } | ||
1515 | } else { | ||
1516 | /* | ||
1517 | * Paranoia check. If we did not take the lock | ||
1518 | * in the trylock above, then we should not be | ||
1519 | * the owner of the rtmutex, neither the real | ||
1520 | * nor the pending one: | ||
1521 | */ | ||
1522 | if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) | ||
1523 | printk(KERN_ERR "futex_lock_pi: ret = %d " | ||
1524 | "pi-mutex: %p pi-state %p\n", ret, | ||
1525 | q.pi_state->pi_mutex.owner, | ||
1526 | q.pi_state->owner); | ||
1527 | } | ||
1528 | } | ||
1529 | 1888 | ||
1530 | /* | 1889 | /* |
1531 | * If fixup_pi_state_owner() faulted and was unable to handle the | 1890 | * If fixup_owner() faulted and was unable to handle the fault, unlock |
1532 | * fault, unlock it and return the fault to userspace. | 1891 | * it and return the fault to userspace. |
1533 | */ | 1892 | */ |
1534 | if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) | 1893 | if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) |
1535 | rt_mutex_unlock(&q.pi_state->pi_mutex); | 1894 | rt_mutex_unlock(&q.pi_state->pi_mutex); |
@@ -1537,9 +1896,7 @@ retry_locked: | |||
1537 | /* Unqueue and drop the lock */ | 1896 | /* Unqueue and drop the lock */ |
1538 | unqueue_me_pi(&q); | 1897 | unqueue_me_pi(&q); |
1539 | 1898 | ||
1540 | if (to) | 1899 | goto out; |
1541 | destroy_hrtimer_on_stack(&to->timer); | ||
1542 | return ret != -EINTR ? ret : -ERESTARTNOINTR; | ||
1543 | 1900 | ||
1544 | out_unlock_put_key: | 1901 | out_unlock_put_key: |
1545 | queue_unlock(&q, hb); | 1902 | queue_unlock(&q, hb); |
@@ -1549,7 +1906,7 @@ out_put_key: | |||
1549 | out: | 1906 | out: |
1550 | if (to) | 1907 | if (to) |
1551 | destroy_hrtimer_on_stack(&to->timer); | 1908 | destroy_hrtimer_on_stack(&to->timer); |
1552 | return ret; | 1909 | return ret != -EINTR ? ret : -ERESTARTNOINTR; |
1553 | 1910 | ||
1554 | uaddr_faulted: | 1911 | uaddr_faulted: |
1555 | /* | 1912 | /* |
@@ -1572,7 +1929,6 @@ uaddr_faulted: | |||
1572 | goto retry; | 1929 | goto retry; |
1573 | } | 1930 | } |
1574 | 1931 | ||
1575 | |||
1576 | /* | 1932 | /* |
1577 | * Userspace attempted a TID -> 0 atomic transition, and failed. | 1933 | * Userspace attempted a TID -> 0 atomic transition, and failed. |
1578 | * This is the in-kernel slowpath: we look up the PI state (if any), | 1934 | * This is the in-kernel slowpath: we look up the PI state (if any), |
@@ -1674,6 +2030,229 @@ pi_faulted: | |||
1674 | return ret; | 2030 | return ret; |
1675 | } | 2031 | } |
1676 | 2032 | ||
2033 | /** | ||
2034 | * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex | ||
2035 | * @hb: the hash_bucket futex_q was originally enqueued on | ||
2036 | * @q: the futex_q woken while waiting to be requeued | ||
2037 | * @key2: the futex_key of the requeue target futex | ||
2038 | * @timeout: the timeout associated with the wait (NULL if none) | ||
2039 | * | ||
2040 | * Detect if the task was woken on the initial futex as opposed to the requeue | ||
2041 | * target futex. If so, determine if it was a timeout or a signal that caused | ||
2042 | * the wakeup and return the appropriate error code to the caller. Must be | ||
2043 | * called with the hb lock held. | ||
2044 | * | ||
2045 | * Returns | ||
2046 | * 0 - no early wakeup detected | ||
2047 | * <0 - -ETIMEDOUT or -ERESTARTNOINTR | ||
2048 | */ | ||
2049 | static inline | ||
2050 | int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | ||
2051 | struct futex_q *q, union futex_key *key2, | ||
2052 | struct hrtimer_sleeper *timeout) | ||
2053 | { | ||
2054 | int ret = 0; | ||
2055 | |||
2056 | /* | ||
2057 | * With the hb lock held, we avoid races while we process the wakeup. | ||
2058 | * We only need to hold hb (and not hb2) to ensure atomicity as the | ||
2059 | * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. | ||
2060 | * It can't be requeued from uaddr2 to something else since we don't | ||
2061 | * support a PI aware source futex for requeue. | ||
2062 | */ | ||
2063 | if (!match_futex(&q->key, key2)) { | ||
2064 | WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); | ||
2065 | /* | ||
2066 | * We were woken prior to requeue by a timeout or a signal. | ||
2067 | * Unqueue the futex_q and determine which it was. | ||
2068 | */ | ||
2069 | plist_del(&q->list, &q->list.plist); | ||
2070 | drop_futex_key_refs(&q->key); | ||
2071 | |||
2072 | if (timeout && !timeout->task) | ||
2073 | ret = -ETIMEDOUT; | ||
2074 | else | ||
2075 | ret = -ERESTARTNOINTR; | ||
2076 | } | ||
2077 | return ret; | ||
2078 | } | ||
2079 | |||
2080 | /** | ||
2081 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 | ||
2082 | * @uaddr: the futex we initially wait on (non-pi) | ||
2083 | * @fshared: whether the futexes are shared (1) or not (0). They must be | ||
2084 | * the same type, no requeueing from private to shared, etc. | ||
2085 | * @val: the expected value of uaddr | ||
2086 | * @abs_time: absolute timeout | ||
2087 | * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. | ||
2088 | * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) | ||
2089 | * @uaddr2: the pi futex we will take prior to returning to user-space | ||
2090 | * | ||
2091 | * The caller will wait on uaddr and will be requeued by futex_requeue() to | ||
2092 | * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and | ||
2093 | * complete the acquisition of the rt_mutex prior to returning to userspace. | ||
2094 | * This ensures the rt_mutex maintains an owner when it has waiters; without | ||
2095 | * one, the pi logic wouldn't know which task to boost/deboost, if there was a | ||
2096 | * need to. | ||
2097 | * | ||
2098 | * We call schedule in futex_wait_queue_me() when we enqueue and return there | ||
2099 | * via the following: | ||
2100 | * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() | ||
2101 | * 2) wakeup on uaddr2 after a requeue and subsequent unlock | ||
2102 | * 3) signal (before or after requeue) | ||
2103 | * 4) timeout (before or after requeue) | ||
2104 | * | ||
2105 | * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. | ||
2106 | * | ||
2107 | * If 2, we may then block on trying to take the rt_mutex and return via: | ||
2108 | * 5) successful lock | ||
2109 | * 6) signal | ||
2110 | * 7) timeout | ||
2111 | * 8) other lock acquisition failure | ||
2112 | * | ||
2113 | * If 6, we setup a restart_block with futex_lock_pi() as the function. | ||
2114 | * | ||
2115 | * If 4 or 7, we cleanup and return with -ETIMEDOUT. | ||
2116 | * | ||
2117 | * Returns: | ||
2118 | * 0 - On success | ||
2119 | * <0 - On error | ||
2120 | */ | ||
2121 | static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | ||
2122 | u32 val, ktime_t *abs_time, u32 bitset, | ||
2123 | int clockrt, u32 __user *uaddr2) | ||
2124 | { | ||
2125 | struct hrtimer_sleeper timeout, *to = NULL; | ||
2126 | struct rt_mutex_waiter rt_waiter; | ||
2127 | struct rt_mutex *pi_mutex = NULL; | ||
2128 | struct futex_hash_bucket *hb; | ||
2129 | union futex_key key2; | ||
2130 | struct futex_q q; | ||
2131 | int res, ret; | ||
2132 | |||
2133 | if (!bitset) | ||
2134 | return -EINVAL; | ||
2135 | |||
2136 | if (abs_time) { | ||
2137 | to = &timeout; | ||
2138 | hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : | ||
2139 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
2140 | hrtimer_init_sleeper(to, current); | ||
2141 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | ||
2142 | current->timer_slack_ns); | ||
2143 | } | ||
2144 | |||
2145 | /* | ||
2146 | * The waiter is allocated on our stack, manipulated by the requeue | ||
2147 | * code while we sleep on uaddr. | ||
2148 | */ | ||
2149 | debug_rt_mutex_init_waiter(&rt_waiter); | ||
2150 | rt_waiter.task = NULL; | ||
2151 | |||
2152 | q.pi_state = NULL; | ||
2153 | q.bitset = bitset; | ||
2154 | q.rt_waiter = &rt_waiter; | ||
2155 | |||
2156 | key2 = FUTEX_KEY_INIT; | ||
2157 | ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); | ||
2158 | if (unlikely(ret != 0)) | ||
2159 | goto out; | ||
2160 | |||
2161 | /* Prepare to wait on uaddr. */ | ||
2162 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | ||
2163 | if (ret) | ||
2164 | goto out_key2; | ||
2165 | |||
2166 | /* Queue the futex_q, drop the hb lock, wait for wakeup. */ | ||
2167 | futex_wait_queue_me(hb, &q, to); | ||
2168 | |||
2169 | spin_lock(&hb->lock); | ||
2170 | ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); | ||
2171 | spin_unlock(&hb->lock); | ||
2172 | if (ret) | ||
2173 | goto out_put_keys; | ||
2174 | |||
2175 | /* | ||
2176 | * In order for us to be here, we know our q.key == key2, and since | ||
2177 | * we took the hb->lock above, we also know that futex_requeue() has | ||
2178 | * completed and we no longer have to concern ourselves with a wakeup | ||
2179 | * race with the atomic proxy lock acquisition by the requeue code. | ||
2180 | */ | ||
2181 | |||
2182 | /* Check if the requeue code acquired the second futex for us. */ | ||
2183 | if (!q.rt_waiter) { | ||
2184 | /* | ||
2185 | * Got the lock. We might not be the anticipated owner if we | ||
2186 | * did a lock-steal - fix up the PI-state in that case. | ||
2187 | */ | ||
2188 | if (q.pi_state && (q.pi_state->owner != current)) { | ||
2189 | spin_lock(q.lock_ptr); | ||
2190 | ret = fixup_pi_state_owner(uaddr2, &q, current, | ||
2191 | fshared); | ||
2192 | spin_unlock(q.lock_ptr); | ||
2193 | } | ||
2194 | } else { | ||
2195 | /* | ||
2196 | * We have been woken up by futex_unlock_pi(), a timeout, or a | ||
2197 | * signal. futex_unlock_pi() will not destroy the lock_ptr nor | ||
2198 | * the pi_state. | ||
2199 | */ | ||
2200 | WARN_ON(!&q.pi_state); | ||
2201 | pi_mutex = &q.pi_state->pi_mutex; | ||
2202 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); | ||
2203 | debug_rt_mutex_free_waiter(&rt_waiter); | ||
2204 | |||
2205 | spin_lock(q.lock_ptr); | ||
2206 | /* | ||
2207 | * Fixup the pi_state owner and possibly acquire the lock if we | ||
2208 | * haven't already. | ||
2209 | */ | ||
2210 | res = fixup_owner(uaddr2, fshared, &q, !ret); | ||
2211 | /* | ||
2212 | * If fixup_owner() returned an error, propagate that. If it | ||
2213 | * acquired the lock, clear our -ETIMEDOUT or -EINTR. | ||
2214 | */ | ||
2215 | if (res) | ||
2216 | ret = (res < 0) ? res : 0; | ||
2217 | |||
2218 | /* Unqueue and drop the lock. */ | ||
2219 | unqueue_me_pi(&q); | ||
2220 | } | ||
2221 | |||
2222 | /* | ||
2223 | * If fixup_pi_state_owner() faulted and was unable to handle the | ||
2224 | * fault, unlock the rt_mutex and return the fault to userspace. | ||
2225 | */ | ||
2226 | if (ret == -EFAULT) { | ||
2227 | if (rt_mutex_owner(pi_mutex) == current) | ||
2228 | rt_mutex_unlock(pi_mutex); | ||
2229 | } else if (ret == -EINTR) { | ||
2230 | /* | ||
2231 | * We've already been requeued, but we have no way to | ||
2232 | * restart by calling futex_lock_pi() directly. We | ||
2233 | * could restart the syscall, but that will look at | ||
2234 | * the user space value and return right away. So we | ||
2235 | * drop back with EWOULDBLOCK to tell user space that | ||
2236 | * "val" has been changed. That's the same what the | ||
2237 | * restart of the syscall would do in | ||
2238 | * futex_wait_setup(). | ||
2239 | */ | ||
2240 | ret = -EWOULDBLOCK; | ||
2241 | } | ||
2242 | |||
2243 | out_put_keys: | ||
2244 | put_futex_key(fshared, &q.key); | ||
2245 | out_key2: | ||
2246 | put_futex_key(fshared, &key2); | ||
2247 | |||
2248 | out: | ||
2249 | if (to) { | ||
2250 | hrtimer_cancel(&to->timer); | ||
2251 | destroy_hrtimer_on_stack(&to->timer); | ||
2252 | } | ||
2253 | return ret; | ||
2254 | } | ||
2255 | |||
1677 | /* | 2256 | /* |
1678 | * Support for robust futexes: the kernel cleans up held futexes at | 2257 | * Support for robust futexes: the kernel cleans up held futexes at |
1679 | * thread exit time. | 2258 | * thread exit time. |
@@ -1896,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
1896 | fshared = 1; | 2475 | fshared = 1; |
1897 | 2476 | ||
1898 | clockrt = op & FUTEX_CLOCK_REALTIME; | 2477 | clockrt = op & FUTEX_CLOCK_REALTIME; |
1899 | if (clockrt && cmd != FUTEX_WAIT_BITSET) | 2478 | if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) |
1900 | return -ENOSYS; | 2479 | return -ENOSYS; |
1901 | 2480 | ||
1902 | switch (cmd) { | 2481 | switch (cmd) { |
@@ -1911,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
1911 | ret = futex_wake(uaddr, fshared, val, val3); | 2490 | ret = futex_wake(uaddr, fshared, val, val3); |
1912 | break; | 2491 | break; |
1913 | case FUTEX_REQUEUE: | 2492 | case FUTEX_REQUEUE: |
1914 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); | 2493 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); |
1915 | break; | 2494 | break; |
1916 | case FUTEX_CMP_REQUEUE: | 2495 | case FUTEX_CMP_REQUEUE: |
1917 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); | 2496 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, |
2497 | 0); | ||
1918 | break; | 2498 | break; |
1919 | case FUTEX_WAKE_OP: | 2499 | case FUTEX_WAKE_OP: |
1920 | ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); | 2500 | ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); |
@@ -1931,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
1931 | if (futex_cmpxchg_enabled) | 2511 | if (futex_cmpxchg_enabled) |
1932 | ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); | 2512 | ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); |
1933 | break; | 2513 | break; |
2514 | case FUTEX_WAIT_REQUEUE_PI: | ||
2515 | val3 = FUTEX_BITSET_MATCH_ANY; | ||
2516 | ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, | ||
2517 | clockrt, uaddr2); | ||
2518 | break; | ||
2519 | case FUTEX_CMP_REQUEUE_PI: | ||
2520 | ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, | ||
2521 | 1); | ||
2522 | break; | ||
1934 | default: | 2523 | default: |
1935 | ret = -ENOSYS; | 2524 | ret = -ENOSYS; |
1936 | } | 2525 | } |
@@ -1948,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, | |||
1948 | int cmd = op & FUTEX_CMD_MASK; | 2537 | int cmd = op & FUTEX_CMD_MASK; |
1949 | 2538 | ||
1950 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || | 2539 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || |
1951 | cmd == FUTEX_WAIT_BITSET)) { | 2540 | cmd == FUTEX_WAIT_BITSET || |
2541 | cmd == FUTEX_WAIT_REQUEUE_PI)) { | ||
1952 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) | 2542 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) |
1953 | return -EFAULT; | 2543 | return -EFAULT; |
1954 | if (!timespec_valid(&ts)) | 2544 | if (!timespec_valid(&ts)) |
@@ -1960,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, | |||
1960 | tp = &t; | 2550 | tp = &t; |
1961 | } | 2551 | } |
1962 | /* | 2552 | /* |
1963 | * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. | 2553 | * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. |
1964 | * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. | 2554 | * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. |
1965 | */ | 2555 | */ |
1966 | if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || | 2556 | if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || |
1967 | cmd == FUTEX_WAKE_OP) | 2557 | cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) |
1968 | val2 = (u32) (unsigned long) utime; | 2558 | val2 = (u32) (unsigned long) utime; |
1969 | 2559 | ||
1970 | return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); | 2560 | return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); |
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 3394f8f52964..7d047808419d 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o | |||
3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
4 | obj-$(CONFIG_PROC_FS) += proc.o | 4 | obj-$(CONFIG_PROC_FS) += proc.o |
5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
6 | obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o | 6 | obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o |
7 | obj-$(CONFIG_PM_SLEEP) += pm.o | 7 | obj-$(CONFIG_PM_SLEEP) += pm.o |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c687ba4363f2..13c68e71b726 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
359 | 359 | ||
360 | spin_lock(&desc->lock); | 360 | spin_lock(&desc->lock); |
361 | mask_ack_irq(desc, irq); | 361 | mask_ack_irq(desc, irq); |
362 | desc = irq_remap_to_desc(irq, desc); | ||
363 | 362 | ||
364 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 363 | if (unlikely(desc->status & IRQ_INPROGRESS)) |
365 | goto out_unlock; | 364 | goto out_unlock; |
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
438 | desc->status &= ~IRQ_INPROGRESS; | 437 | desc->status &= ~IRQ_INPROGRESS; |
439 | out: | 438 | out: |
440 | desc->chip->eoi(irq); | 439 | desc->chip->eoi(irq); |
441 | desc = irq_remap_to_desc(irq, desc); | ||
442 | 440 | ||
443 | spin_unlock(&desc->lock); | 441 | spin_unlock(&desc->lock); |
444 | } | 442 | } |
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
475 | !desc->action)) { | 473 | !desc->action)) { |
476 | desc->status |= (IRQ_PENDING | IRQ_MASKED); | 474 | desc->status |= (IRQ_PENDING | IRQ_MASKED); |
477 | mask_ack_irq(desc, irq); | 475 | mask_ack_irq(desc, irq); |
478 | desc = irq_remap_to_desc(irq, desc); | ||
479 | goto out_unlock; | 476 | goto out_unlock; |
480 | } | 477 | } |
481 | kstat_incr_irqs_this_cpu(irq, desc); | 478 | kstat_incr_irqs_this_cpu(irq, desc); |
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
483 | /* Start handling the irq */ | 480 | /* Start handling the irq */ |
484 | if (desc->chip->ack) | 481 | if (desc->chip->ack) |
485 | desc->chip->ack(irq); | 482 | desc->chip->ack(irq); |
486 | desc = irq_remap_to_desc(irq, desc); | ||
487 | 483 | ||
488 | /* Mark the IRQ currently in progress.*/ | 484 | /* Mark the IRQ currently in progress.*/ |
489 | desc->status |= IRQ_INPROGRESS; | 485 | desc->status |= IRQ_INPROGRESS; |
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) | |||
544 | if (!noirqdebug) | 540 | if (!noirqdebug) |
545 | note_interrupt(irq, desc, action_ret); | 541 | note_interrupt(irq, desc, action_ret); |
546 | 542 | ||
547 | if (desc->chip->eoi) { | 543 | if (desc->chip->eoi) |
548 | desc->chip->eoi(irq); | 544 | desc->chip->eoi(irq); |
549 | desc = irq_remap_to_desc(irq, desc); | ||
550 | } | ||
551 | } | 545 | } |
552 | 546 | ||
553 | void | 547 | void |
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
582 | 576 | ||
583 | /* Uninstall? */ | 577 | /* Uninstall? */ |
584 | if (handle == handle_bad_irq) { | 578 | if (handle == handle_bad_irq) { |
585 | if (desc->chip != &no_irq_chip) { | 579 | if (desc->chip != &no_irq_chip) |
586 | mask_ack_irq(desc, irq); | 580 | mask_ack_irq(desc, irq); |
587 | desc = irq_remap_to_desc(irq, desc); | ||
588 | } | ||
589 | desc->status |= IRQ_DISABLED; | 581 | desc->status |= IRQ_DISABLED; |
590 | desc->depth = 1; | 582 | desc->depth = 1; |
591 | } | 583 | } |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 26e08754744f..18041a254d32 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -11,6 +11,7 @@ | |||
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/irq.h> | 13 | #include <linux/irq.h> |
14 | #include <linux/slab.h> | ||
14 | #include <linux/module.h> | 15 | #include <linux/module.h> |
15 | #include <linux/random.h> | 16 | #include <linux/random.h> |
16 | #include <linux/interrupt.h> | 17 | #include <linux/interrupt.h> |
@@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = { | |||
81 | .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), | 82 | .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), |
82 | }; | 83 | }; |
83 | 84 | ||
84 | void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) | 85 | void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) |
85 | { | 86 | { |
86 | int node; | ||
87 | void *ptr; | 87 | void *ptr; |
88 | 88 | ||
89 | node = cpu_to_node(cpu); | 89 | if (slab_is_available()) |
90 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); | 90 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), |
91 | GFP_ATOMIC, node); | ||
92 | else | ||
93 | ptr = alloc_bootmem_node(NODE_DATA(node), | ||
94 | nr * sizeof(*desc->kstat_irqs)); | ||
91 | 95 | ||
92 | /* | 96 | /* |
93 | * don't overwite if can not get new one | 97 | * don't overwite if can not get new one |
94 | * init_copy_kstat_irqs() could still use old one | 98 | * init_copy_kstat_irqs() could still use old one |
95 | */ | 99 | */ |
96 | if (ptr) { | 100 | if (ptr) { |
97 | printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", | 101 | printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); |
98 | cpu, node); | ||
99 | desc->kstat_irqs = ptr; | 102 | desc->kstat_irqs = ptr; |
100 | } | 103 | } |
101 | } | 104 | } |
102 | 105 | ||
103 | static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) | 106 | static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) |
104 | { | 107 | { |
105 | memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); | 108 | memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); |
106 | 109 | ||
107 | spin_lock_init(&desc->lock); | 110 | spin_lock_init(&desc->lock); |
108 | desc->irq = irq; | 111 | desc->irq = irq; |
109 | #ifdef CONFIG_SMP | 112 | #ifdef CONFIG_SMP |
110 | desc->cpu = cpu; | 113 | desc->node = node; |
111 | #endif | 114 | #endif |
112 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | 115 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); |
113 | init_kstat_irqs(desc, cpu, nr_cpu_ids); | 116 | init_kstat_irqs(desc, node, nr_cpu_ids); |
114 | if (!desc->kstat_irqs) { | 117 | if (!desc->kstat_irqs) { |
115 | printk(KERN_ERR "can not alloc kstat_irqs\n"); | 118 | printk(KERN_ERR "can not alloc kstat_irqs\n"); |
116 | BUG_ON(1); | 119 | BUG_ON(1); |
117 | } | 120 | } |
118 | if (!init_alloc_desc_masks(desc, cpu, false)) { | 121 | if (!alloc_desc_masks(desc, node, false)) { |
119 | printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); | 122 | printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); |
120 | BUG_ON(1); | 123 | BUG_ON(1); |
121 | } | 124 | } |
122 | arch_init_chip_data(desc, cpu); | 125 | init_desc_masks(desc); |
126 | arch_init_chip_data(desc, node); | ||
123 | } | 127 | } |
124 | 128 | ||
125 | /* | 129 | /* |
@@ -169,7 +173,8 @@ int __init early_irq_init(void) | |||
169 | desc[i].irq = i; | 173 | desc[i].irq = i; |
170 | desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; | 174 | desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; |
171 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 175 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
172 | init_alloc_desc_masks(&desc[i], 0, true); | 176 | alloc_desc_masks(&desc[i], 0, true); |
177 | init_desc_masks(&desc[i]); | ||
173 | irq_desc_ptrs[i] = desc + i; | 178 | irq_desc_ptrs[i] = desc + i; |
174 | } | 179 | } |
175 | 180 | ||
@@ -187,11 +192,10 @@ struct irq_desc *irq_to_desc(unsigned int irq) | |||
187 | return NULL; | 192 | return NULL; |
188 | } | 193 | } |
189 | 194 | ||
190 | struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) | 195 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) |
191 | { | 196 | { |
192 | struct irq_desc *desc; | 197 | struct irq_desc *desc; |
193 | unsigned long flags; | 198 | unsigned long flags; |
194 | int node; | ||
195 | 199 | ||
196 | if (irq >= nr_irqs) { | 200 | if (irq >= nr_irqs) { |
197 | WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", | 201 | WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", |
@@ -210,15 +214,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) | |||
210 | if (desc) | 214 | if (desc) |
211 | goto out_unlock; | 215 | goto out_unlock; |
212 | 216 | ||
213 | node = cpu_to_node(cpu); | 217 | if (slab_is_available()) |
214 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); | 218 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); |
215 | printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", | 219 | else |
216 | irq, cpu, node); | 220 | desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); |
221 | |||
222 | printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); | ||
217 | if (!desc) { | 223 | if (!desc) { |
218 | printk(KERN_ERR "can not alloc irq_desc\n"); | 224 | printk(KERN_ERR "can not alloc irq_desc\n"); |
219 | BUG_ON(1); | 225 | BUG_ON(1); |
220 | } | 226 | } |
221 | init_one_irq_desc(irq, desc, cpu); | 227 | init_one_irq_desc(irq, desc, node); |
222 | 228 | ||
223 | irq_desc_ptrs[irq] = desc; | 229 | irq_desc_ptrs[irq] = desc; |
224 | 230 | ||
@@ -256,7 +262,8 @@ int __init early_irq_init(void) | |||
256 | 262 | ||
257 | for (i = 0; i < count; i++) { | 263 | for (i = 0; i < count; i++) { |
258 | desc[i].irq = i; | 264 | desc[i].irq = i; |
259 | init_alloc_desc_masks(&desc[i], 0, true); | 265 | alloc_desc_masks(&desc[i], 0, true); |
266 | init_desc_masks(&desc[i]); | ||
260 | desc[i].kstat_irqs = kstat_irqs_all[i]; | 267 | desc[i].kstat_irqs = kstat_irqs_all[i]; |
261 | } | 268 | } |
262 | return arch_early_irq_init(); | 269 | return arch_early_irq_init(); |
@@ -267,7 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) | |||
267 | return (irq < NR_IRQS) ? irq_desc + irq : NULL; | 274 | return (irq < NR_IRQS) ? irq_desc + irq : NULL; |
268 | } | 275 | } |
269 | 276 | ||
270 | struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) | 277 | struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) |
271 | { | 278 | { |
272 | return irq_to_desc(irq); | 279 | return irq_to_desc(irq); |
273 | } | 280 | } |
@@ -453,11 +460,8 @@ unsigned int __do_IRQ(unsigned int irq) | |||
453 | /* | 460 | /* |
454 | * No locking required for CPU-local interrupts: | 461 | * No locking required for CPU-local interrupts: |
455 | */ | 462 | */ |
456 | if (desc->chip->ack) { | 463 | if (desc->chip->ack) |
457 | desc->chip->ack(irq); | 464 | desc->chip->ack(irq); |
458 | /* get new one */ | ||
459 | desc = irq_remap_to_desc(irq, desc); | ||
460 | } | ||
461 | if (likely(!(desc->status & IRQ_DISABLED))) { | 465 | if (likely(!(desc->status & IRQ_DISABLED))) { |
462 | action_ret = handle_IRQ_event(irq, desc->action); | 466 | action_ret = handle_IRQ_event(irq, desc->action); |
463 | if (!noirqdebug) | 467 | if (!noirqdebug) |
@@ -468,10 +472,8 @@ unsigned int __do_IRQ(unsigned int irq) | |||
468 | } | 472 | } |
469 | 473 | ||
470 | spin_lock(&desc->lock); | 474 | spin_lock(&desc->lock); |
471 | if (desc->chip->ack) { | 475 | if (desc->chip->ack) |
472 | desc->chip->ack(irq); | 476 | desc->chip->ack(irq); |
473 | desc = irq_remap_to_desc(irq, desc); | ||
474 | } | ||
475 | /* | 477 | /* |
476 | * REPLAY is when Linux resends an IRQ that was dropped earlier | 478 | * REPLAY is when Linux resends an IRQ that was dropped earlier |
477 | * WAITING is used by probe to mark irqs that are being tested | 479 | * WAITING is used by probe to mark irqs that are being tested |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 01ce20eab38f..73468253143b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); | |||
16 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); | 16 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); |
17 | 17 | ||
18 | extern struct lock_class_key irq_desc_lock_class; | 18 | extern struct lock_class_key irq_desc_lock_class; |
19 | extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); | 19 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
20 | extern void clear_kstat_irqs(struct irq_desc *desc); | 20 | extern void clear_kstat_irqs(struct irq_desc *desc); |
21 | extern spinlock_t sparse_irq_lock; | 21 | extern spinlock_t sparse_irq_lock; |
22 | 22 | ||
@@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq, | |||
42 | 42 | ||
43 | extern int irq_select_affinity_usr(unsigned int irq); | 43 | extern int irq_select_affinity_usr(unsigned int irq); |
44 | 44 | ||
45 | extern void | ||
46 | irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); | ||
47 | |||
45 | /* | 48 | /* |
46 | * Debugging printout: | 49 | * Debugging printout: |
47 | */ | 50 | */ |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2734eca59243..aaf5c9d05770 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq) | |||
80 | return 1; | 80 | return 1; |
81 | } | 81 | } |
82 | 82 | ||
83 | static void | 83 | void |
84 | irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) | 84 | irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) |
85 | { | 85 | { |
86 | struct irqaction *action = desc->action; | 86 | struct irqaction *action = desc->action; |
@@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) | |||
109 | spin_lock_irqsave(&desc->lock, flags); | 109 | spin_lock_irqsave(&desc->lock, flags); |
110 | 110 | ||
111 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 111 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
112 | if (desc->status & IRQ_MOVE_PCNTXT) | 112 | if (desc->status & IRQ_MOVE_PCNTXT) { |
113 | desc->chip->set_affinity(irq, cpumask); | 113 | if (!desc->chip->set_affinity(irq, cpumask)) { |
114 | cpumask_copy(desc->affinity, cpumask); | ||
115 | irq_set_thread_affinity(desc, cpumask); | ||
116 | } | ||
117 | } | ||
114 | else { | 118 | else { |
115 | desc->status |= IRQ_MOVE_PENDING; | 119 | desc->status |= IRQ_MOVE_PENDING; |
116 | cpumask_copy(desc->pending_mask, cpumask); | 120 | cpumask_copy(desc->pending_mask, cpumask); |
117 | } | 121 | } |
118 | #else | 122 | #else |
119 | cpumask_copy(desc->affinity, cpumask); | 123 | if (!desc->chip->set_affinity(irq, cpumask)) { |
120 | desc->chip->set_affinity(irq, cpumask); | 124 | cpumask_copy(desc->affinity, cpumask); |
125 | irq_set_thread_affinity(desc, cpumask); | ||
126 | } | ||
121 | #endif | 127 | #endif |
122 | irq_set_thread_affinity(desc, cpumask); | ||
123 | desc->status |= IRQ_AFFINITY_SET; | 128 | desc->status |= IRQ_AFFINITY_SET; |
124 | spin_unlock_irqrestore(&desc->lock, flags); | 129 | spin_unlock_irqrestore(&desc->lock, flags); |
125 | return 0; | 130 | return 0; |
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index e05ad9be43b7..cfe767ca1545 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -1,5 +1,8 @@ | |||
1 | 1 | ||
2 | #include <linux/irq.h> | 2 | #include <linux/irq.h> |
3 | #include <linux/interrupt.h> | ||
4 | |||
5 | #include "internals.h" | ||
3 | 6 | ||
4 | void move_masked_irq(int irq) | 7 | void move_masked_irq(int irq) |
5 | { | 8 | { |
@@ -39,11 +42,12 @@ void move_masked_irq(int irq) | |||
39 | * masking the irqs. | 42 | * masking the irqs. |
40 | */ | 43 | */ |
41 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) | 44 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) |
42 | < nr_cpu_ids)) { | 45 | < nr_cpu_ids)) |
43 | cpumask_and(desc->affinity, | 46 | if (!desc->chip->set_affinity(irq, desc->pending_mask)) { |
44 | desc->pending_mask, cpu_online_mask); | 47 | cpumask_copy(desc->affinity, desc->pending_mask); |
45 | desc->chip->set_affinity(irq, desc->affinity); | 48 | irq_set_thread_affinity(desc, desc->pending_mask); |
46 | } | 49 | } |
50 | |||
47 | cpumask_clear(desc->pending_mask); | 51 | cpumask_clear(desc->pending_mask); |
48 | } | 52 | } |
49 | 53 | ||
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 44bbdcbaf8d2..2f69bee57bf2 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c | |||
@@ -15,9 +15,9 @@ | |||
15 | 15 | ||
16 | static void init_copy_kstat_irqs(struct irq_desc *old_desc, | 16 | static void init_copy_kstat_irqs(struct irq_desc *old_desc, |
17 | struct irq_desc *desc, | 17 | struct irq_desc *desc, |
18 | int cpu, int nr) | 18 | int node, int nr) |
19 | { | 19 | { |
20 | init_kstat_irqs(desc, cpu, nr); | 20 | init_kstat_irqs(desc, node, nr); |
21 | 21 | ||
22 | if (desc->kstat_irqs != old_desc->kstat_irqs) | 22 | if (desc->kstat_irqs != old_desc->kstat_irqs) |
23 | memcpy(desc->kstat_irqs, old_desc->kstat_irqs, | 23 | memcpy(desc->kstat_irqs, old_desc->kstat_irqs, |
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) | |||
34 | } | 34 | } |
35 | 35 | ||
36 | static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, | 36 | static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, |
37 | struct irq_desc *desc, int cpu) | 37 | struct irq_desc *desc, int node) |
38 | { | 38 | { |
39 | memcpy(desc, old_desc, sizeof(struct irq_desc)); | 39 | memcpy(desc, old_desc, sizeof(struct irq_desc)); |
40 | if (!init_alloc_desc_masks(desc, cpu, false)) { | 40 | if (!alloc_desc_masks(desc, node, false)) { |
41 | printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " | 41 | printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " |
42 | "for migration.\n", irq); | 42 | "for migration.\n", irq); |
43 | return false; | 43 | return false; |
44 | } | 44 | } |
45 | spin_lock_init(&desc->lock); | 45 | spin_lock_init(&desc->lock); |
46 | desc->cpu = cpu; | 46 | desc->node = node; |
47 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | 47 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); |
48 | init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); | 48 | init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); |
49 | init_copy_desc_masks(old_desc, desc); | 49 | init_copy_desc_masks(old_desc, desc); |
50 | arch_init_copy_chip_data(old_desc, desc, cpu); | 50 | arch_init_copy_chip_data(old_desc, desc, node); |
51 | return true; | 51 | return true; |
52 | } | 52 | } |
53 | 53 | ||
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) | |||
59 | } | 59 | } |
60 | 60 | ||
61 | static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | 61 | static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, |
62 | int cpu) | 62 | int node) |
63 | { | 63 | { |
64 | struct irq_desc *desc; | 64 | struct irq_desc *desc; |
65 | unsigned int irq; | 65 | unsigned int irq; |
66 | unsigned long flags; | 66 | unsigned long flags; |
67 | int node; | ||
68 | 67 | ||
69 | irq = old_desc->irq; | 68 | irq = old_desc->irq; |
70 | 69 | ||
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | |||
76 | if (desc && old_desc != desc) | 75 | if (desc && old_desc != desc) |
77 | goto out_unlock; | 76 | goto out_unlock; |
78 | 77 | ||
79 | node = cpu_to_node(cpu); | ||
80 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); | 78 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); |
81 | if (!desc) { | 79 | if (!desc) { |
82 | printk(KERN_ERR "irq %d: can not get new irq_desc " | 80 | printk(KERN_ERR "irq %d: can not get new irq_desc " |
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | |||
85 | desc = old_desc; | 83 | desc = old_desc; |
86 | goto out_unlock; | 84 | goto out_unlock; |
87 | } | 85 | } |
88 | if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { | 86 | if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { |
89 | /* still use old one */ | 87 | /* still use old one */ |
90 | kfree(desc); | 88 | kfree(desc); |
91 | desc = old_desc; | 89 | desc = old_desc; |
@@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | |||
97 | 95 | ||
98 | /* free the old one */ | 96 | /* free the old one */ |
99 | free_one_irq_desc(old_desc, desc); | 97 | free_one_irq_desc(old_desc, desc); |
100 | spin_unlock(&old_desc->lock); | ||
101 | kfree(old_desc); | 98 | kfree(old_desc); |
102 | spin_lock(&desc->lock); | ||
103 | 99 | ||
104 | return desc; | 100 | return desc; |
105 | 101 | ||
@@ -109,24 +105,14 @@ out_unlock: | |||
109 | return desc; | 105 | return desc; |
110 | } | 106 | } |
111 | 107 | ||
112 | struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) | 108 | struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) |
113 | { | 109 | { |
114 | int old_cpu; | ||
115 | int node, old_node; | ||
116 | |||
117 | /* those all static, do move them */ | 110 | /* those all static, do move them */ |
118 | if (desc->irq < NR_IRQS_LEGACY) | 111 | if (desc->irq < NR_IRQS_LEGACY) |
119 | return desc; | 112 | return desc; |
120 | 113 | ||
121 | old_cpu = desc->cpu; | 114 | if (desc->node != node) |
122 | if (old_cpu != cpu) { | 115 | desc = __real_move_irq_desc(desc, node); |
123 | node = cpu_to_node(cpu); | ||
124 | old_node = cpu_to_node(old_cpu); | ||
125 | if (old_node != node) | ||
126 | desc = __real_move_irq_desc(desc, cpu); | ||
127 | else | ||
128 | desc->cpu = cpu; | ||
129 | } | ||
130 | 116 | ||
131 | return desc; | 117 | return desc; |
132 | } | 118 | } |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 507cf2b5e9f1..e5cc0cd28d54 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
249 | 249 | ||
250 | /* didnt get the lock, go to sleep: */ | 250 | /* didnt get the lock, go to sleep: */ |
251 | spin_unlock_mutex(&lock->wait_lock, flags); | 251 | spin_unlock_mutex(&lock->wait_lock, flags); |
252 | __schedule(); | 252 | preempt_enable_no_resched(); |
253 | schedule(); | ||
254 | preempt_disable(); | ||
253 | spin_lock_mutex(&lock->wait_lock, flags); | 255 | spin_lock_mutex(&lock->wait_lock, flags); |
254 | } | 256 | } |
255 | 257 | ||
@@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock) | |||
471 | 473 | ||
472 | return ret; | 474 | return ret; |
473 | } | 475 | } |
474 | |||
475 | EXPORT_SYMBOL(mutex_trylock); | 476 | EXPORT_SYMBOL(mutex_trylock); |
477 | |||
478 | /** | ||
479 | * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 | ||
480 | * @cnt: the atomic which we are to dec | ||
481 | * @lock: the mutex to return holding if we dec to 0 | ||
482 | * | ||
483 | * return true and hold lock if we dec to 0, return false otherwise | ||
484 | */ | ||
485 | int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) | ||
486 | { | ||
487 | /* dec if we can't possibly hit 0 */ | ||
488 | if (atomic_add_unless(cnt, -1, 1)) | ||
489 | return 0; | ||
490 | /* we might hit 0, so take the lock */ | ||
491 | mutex_lock(lock); | ||
492 | if (!atomic_dec_and_test(cnt)) { | ||
493 | /* when we actually did the dec, we didn't hit 0 */ | ||
494 | mutex_unlock(lock); | ||
495 | return 0; | ||
496 | } | ||
497 | /* we hit 0, and we hold the lock */ | ||
498 | return 1; | ||
499 | } | ||
500 | EXPORT_SYMBOL(atomic_dec_and_mutex_lock); | ||
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 69d9cb921ffa..820c5af44f3e 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
300 | * assigned pending owner [which might not have taken the | 300 | * assigned pending owner [which might not have taken the |
301 | * lock yet]: | 301 | * lock yet]: |
302 | */ | 302 | */ |
303 | static inline int try_to_steal_lock(struct rt_mutex *lock) | 303 | static inline int try_to_steal_lock(struct rt_mutex *lock, |
304 | struct task_struct *task) | ||
304 | { | 305 | { |
305 | struct task_struct *pendowner = rt_mutex_owner(lock); | 306 | struct task_struct *pendowner = rt_mutex_owner(lock); |
306 | struct rt_mutex_waiter *next; | 307 | struct rt_mutex_waiter *next; |
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) | |||
309 | if (!rt_mutex_owner_pending(lock)) | 310 | if (!rt_mutex_owner_pending(lock)) |
310 | return 0; | 311 | return 0; |
311 | 312 | ||
312 | if (pendowner == current) | 313 | if (pendowner == task) |
313 | return 1; | 314 | return 1; |
314 | 315 | ||
315 | spin_lock_irqsave(&pendowner->pi_lock, flags); | 316 | spin_lock_irqsave(&pendowner->pi_lock, flags); |
316 | if (current->prio >= pendowner->prio) { | 317 | if (task->prio >= pendowner->prio) { |
317 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | 318 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); |
318 | return 0; | 319 | return 0; |
319 | } | 320 | } |
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) | |||
338 | * We are going to steal the lock and a waiter was | 339 | * We are going to steal the lock and a waiter was |
339 | * enqueued on the pending owners pi_waiters queue. So | 340 | * enqueued on the pending owners pi_waiters queue. So |
340 | * we have to enqueue this waiter into | 341 | * we have to enqueue this waiter into |
341 | * current->pi_waiters list. This covers the case, | 342 | * task->pi_waiters list. This covers the case, |
342 | * where current is boosted because it holds another | 343 | * where task is boosted because it holds another |
343 | * lock and gets unboosted because the booster is | 344 | * lock and gets unboosted because the booster is |
344 | * interrupted, so we would delay a waiter with higher | 345 | * interrupted, so we would delay a waiter with higher |
345 | * priority as current->normal_prio. | 346 | * priority as task->normal_prio. |
346 | * | 347 | * |
347 | * Note: in the rare case of a SCHED_OTHER task changing | 348 | * Note: in the rare case of a SCHED_OTHER task changing |
348 | * its priority and thus stealing the lock, next->task | 349 | * its priority and thus stealing the lock, next->task |
349 | * might be current: | 350 | * might be task: |
350 | */ | 351 | */ |
351 | if (likely(next->task != current)) { | 352 | if (likely(next->task != task)) { |
352 | spin_lock_irqsave(&current->pi_lock, flags); | 353 | spin_lock_irqsave(&task->pi_lock, flags); |
353 | plist_add(&next->pi_list_entry, &current->pi_waiters); | 354 | plist_add(&next->pi_list_entry, &task->pi_waiters); |
354 | __rt_mutex_adjust_prio(current); | 355 | __rt_mutex_adjust_prio(task); |
355 | spin_unlock_irqrestore(&current->pi_lock, flags); | 356 | spin_unlock_irqrestore(&task->pi_lock, flags); |
356 | } | 357 | } |
357 | return 1; | 358 | return 1; |
358 | } | 359 | } |
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) | |||
389 | */ | 390 | */ |
390 | mark_rt_mutex_waiters(lock); | 391 | mark_rt_mutex_waiters(lock); |
391 | 392 | ||
392 | if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) | 393 | if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) |
393 | return 0; | 394 | return 0; |
394 | 395 | ||
395 | /* We got the lock. */ | 396 | /* We got the lock. */ |
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) | |||
411 | */ | 412 | */ |
412 | static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | 413 | static int task_blocks_on_rt_mutex(struct rt_mutex *lock, |
413 | struct rt_mutex_waiter *waiter, | 414 | struct rt_mutex_waiter *waiter, |
415 | struct task_struct *task, | ||
414 | int detect_deadlock) | 416 | int detect_deadlock) |
415 | { | 417 | { |
416 | struct task_struct *owner = rt_mutex_owner(lock); | 418 | struct task_struct *owner = rt_mutex_owner(lock); |
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
418 | unsigned long flags; | 420 | unsigned long flags; |
419 | int chain_walk = 0, res; | 421 | int chain_walk = 0, res; |
420 | 422 | ||
421 | spin_lock_irqsave(&current->pi_lock, flags); | 423 | spin_lock_irqsave(&task->pi_lock, flags); |
422 | __rt_mutex_adjust_prio(current); | 424 | __rt_mutex_adjust_prio(task); |
423 | waiter->task = current; | 425 | waiter->task = task; |
424 | waiter->lock = lock; | 426 | waiter->lock = lock; |
425 | plist_node_init(&waiter->list_entry, current->prio); | 427 | plist_node_init(&waiter->list_entry, task->prio); |
426 | plist_node_init(&waiter->pi_list_entry, current->prio); | 428 | plist_node_init(&waiter->pi_list_entry, task->prio); |
427 | 429 | ||
428 | /* Get the top priority waiter on the lock */ | 430 | /* Get the top priority waiter on the lock */ |
429 | if (rt_mutex_has_waiters(lock)) | 431 | if (rt_mutex_has_waiters(lock)) |
430 | top_waiter = rt_mutex_top_waiter(lock); | 432 | top_waiter = rt_mutex_top_waiter(lock); |
431 | plist_add(&waiter->list_entry, &lock->wait_list); | 433 | plist_add(&waiter->list_entry, &lock->wait_list); |
432 | 434 | ||
433 | current->pi_blocked_on = waiter; | 435 | task->pi_blocked_on = waiter; |
434 | 436 | ||
435 | spin_unlock_irqrestore(&current->pi_lock, flags); | 437 | spin_unlock_irqrestore(&task->pi_lock, flags); |
436 | 438 | ||
437 | if (waiter == rt_mutex_top_waiter(lock)) { | 439 | if (waiter == rt_mutex_top_waiter(lock)) { |
438 | spin_lock_irqsave(&owner->pi_lock, flags); | 440 | spin_lock_irqsave(&owner->pi_lock, flags); |
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
460 | spin_unlock(&lock->wait_lock); | 462 | spin_unlock(&lock->wait_lock); |
461 | 463 | ||
462 | res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, | 464 | res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, |
463 | current); | 465 | task); |
464 | 466 | ||
465 | spin_lock(&lock->wait_lock); | 467 | spin_lock(&lock->wait_lock); |
466 | 468 | ||
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task) | |||
605 | rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); | 607 | rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); |
606 | } | 608 | } |
607 | 609 | ||
608 | /* | 610 | /** |
609 | * Slow path lock function: | 611 | * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop |
612 | * @lock: the rt_mutex to take | ||
613 | * @state: the state the task should block in (TASK_INTERRUPTIBLE | ||
614 | * or TASK_UNINTERRUPTIBLE) | ||
615 | * @timeout: the pre-initialized and started timer, or NULL for none | ||
616 | * @waiter: the pre-initialized rt_mutex_waiter | ||
617 | * @detect_deadlock: passed to task_blocks_on_rt_mutex | ||
618 | * | ||
619 | * lock->wait_lock must be held by the caller. | ||
610 | */ | 620 | */ |
611 | static int __sched | 621 | static int __sched |
612 | rt_mutex_slowlock(struct rt_mutex *lock, int state, | 622 | __rt_mutex_slowlock(struct rt_mutex *lock, int state, |
613 | struct hrtimer_sleeper *timeout, | 623 | struct hrtimer_sleeper *timeout, |
614 | int detect_deadlock) | 624 | struct rt_mutex_waiter *waiter, |
625 | int detect_deadlock) | ||
615 | { | 626 | { |
616 | struct rt_mutex_waiter waiter; | ||
617 | int ret = 0; | 627 | int ret = 0; |
618 | 628 | ||
619 | debug_rt_mutex_init_waiter(&waiter); | ||
620 | waiter.task = NULL; | ||
621 | |||
622 | spin_lock(&lock->wait_lock); | ||
623 | |||
624 | /* Try to acquire the lock again: */ | ||
625 | if (try_to_take_rt_mutex(lock)) { | ||
626 | spin_unlock(&lock->wait_lock); | ||
627 | return 0; | ||
628 | } | ||
629 | |||
630 | set_current_state(state); | ||
631 | |||
632 | /* Setup the timer, when timeout != NULL */ | ||
633 | if (unlikely(timeout)) { | ||
634 | hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); | ||
635 | if (!hrtimer_active(&timeout->timer)) | ||
636 | timeout->task = NULL; | ||
637 | } | ||
638 | |||
639 | for (;;) { | 629 | for (;;) { |
640 | /* Try to acquire the lock: */ | 630 | /* Try to acquire the lock: */ |
641 | if (try_to_take_rt_mutex(lock)) | 631 | if (try_to_take_rt_mutex(lock)) |
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
656 | } | 646 | } |
657 | 647 | ||
658 | /* | 648 | /* |
659 | * waiter.task is NULL the first time we come here and | 649 | * waiter->task is NULL the first time we come here and |
660 | * when we have been woken up by the previous owner | 650 | * when we have been woken up by the previous owner |
661 | * but the lock got stolen by a higher prio task. | 651 | * but the lock got stolen by a higher prio task. |
662 | */ | 652 | */ |
663 | if (!waiter.task) { | 653 | if (!waiter->task) { |
664 | ret = task_blocks_on_rt_mutex(lock, &waiter, | 654 | ret = task_blocks_on_rt_mutex(lock, waiter, current, |
665 | detect_deadlock); | 655 | detect_deadlock); |
666 | /* | 656 | /* |
667 | * If we got woken up by the owner then start loop | 657 | * If we got woken up by the owner then start loop |
668 | * all over without going into schedule to try | 658 | * all over without going into schedule to try |
669 | * to get the lock now: | 659 | * to get the lock now: |
670 | */ | 660 | */ |
671 | if (unlikely(!waiter.task)) { | 661 | if (unlikely(!waiter->task)) { |
672 | /* | 662 | /* |
673 | * Reset the return value. We might | 663 | * Reset the return value. We might |
674 | * have returned with -EDEADLK and the | 664 | * have returned with -EDEADLK and the |
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
684 | 674 | ||
685 | spin_unlock(&lock->wait_lock); | 675 | spin_unlock(&lock->wait_lock); |
686 | 676 | ||
687 | debug_rt_mutex_print_deadlock(&waiter); | 677 | debug_rt_mutex_print_deadlock(waiter); |
688 | 678 | ||
689 | if (waiter.task) | 679 | if (waiter->task) |
690 | schedule_rt_mutex(lock); | 680 | schedule_rt_mutex(lock); |
691 | 681 | ||
692 | spin_lock(&lock->wait_lock); | 682 | spin_lock(&lock->wait_lock); |
693 | set_current_state(state); | 683 | set_current_state(state); |
694 | } | 684 | } |
695 | 685 | ||
686 | return ret; | ||
687 | } | ||
688 | |||
689 | /* | ||
690 | * Slow path lock function: | ||
691 | */ | ||
692 | static int __sched | ||
693 | rt_mutex_slowlock(struct rt_mutex *lock, int state, | ||
694 | struct hrtimer_sleeper *timeout, | ||
695 | int detect_deadlock) | ||
696 | { | ||
697 | struct rt_mutex_waiter waiter; | ||
698 | int ret = 0; | ||
699 | |||
700 | debug_rt_mutex_init_waiter(&waiter); | ||
701 | waiter.task = NULL; | ||
702 | |||
703 | spin_lock(&lock->wait_lock); | ||
704 | |||
705 | /* Try to acquire the lock again: */ | ||
706 | if (try_to_take_rt_mutex(lock)) { | ||
707 | spin_unlock(&lock->wait_lock); | ||
708 | return 0; | ||
709 | } | ||
710 | |||
711 | set_current_state(state); | ||
712 | |||
713 | /* Setup the timer, when timeout != NULL */ | ||
714 | if (unlikely(timeout)) { | ||
715 | hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); | ||
716 | if (!hrtimer_active(&timeout->timer)) | ||
717 | timeout->task = NULL; | ||
718 | } | ||
719 | |||
720 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, | ||
721 | detect_deadlock); | ||
722 | |||
696 | set_current_state(TASK_RUNNING); | 723 | set_current_state(TASK_RUNNING); |
697 | 724 | ||
698 | if (unlikely(waiter.task)) | 725 | if (unlikely(waiter.task)) |
@@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, | |||
864 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); | 891 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); |
865 | 892 | ||
866 | /** | 893 | /** |
867 | * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible | 894 | * rt_mutex_timed_lock - lock a rt_mutex interruptible |
868 | * the timeout structure is provided | 895 | * the timeout structure is provided |
869 | * by the caller | 896 | * by the caller |
870 | * | 897 | * |
871 | * @lock: the rt_mutex to be locked | 898 | * @lock: the rt_mutex to be locked |
872 | * @timeout: timeout structure or NULL (no timeout) | 899 | * @timeout: timeout structure or NULL (no timeout) |
@@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) | |||
913 | } | 940 | } |
914 | EXPORT_SYMBOL_GPL(rt_mutex_unlock); | 941 | EXPORT_SYMBOL_GPL(rt_mutex_unlock); |
915 | 942 | ||
916 | /*** | 943 | /** |
917 | * rt_mutex_destroy - mark a mutex unusable | 944 | * rt_mutex_destroy - mark a mutex unusable |
918 | * @lock: the mutex to be destroyed | 945 | * @lock: the mutex to be destroyed |
919 | * | 946 | * |
@@ -986,6 +1013,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, | |||
986 | } | 1013 | } |
987 | 1014 | ||
988 | /** | 1015 | /** |
1016 | * rt_mutex_start_proxy_lock() - Start lock acquisition for another task | ||
1017 | * @lock: the rt_mutex to take | ||
1018 | * @waiter: the pre-initialized rt_mutex_waiter | ||
1019 | * @task: the task to prepare | ||
1020 | * @detect_deadlock: perform deadlock detection (1) or not (0) | ||
1021 | * | ||
1022 | * Returns: | ||
1023 | * 0 - task blocked on lock | ||
1024 | * 1 - acquired the lock for task, caller should wake it up | ||
1025 | * <0 - error | ||
1026 | * | ||
1027 | * Special API call for FUTEX_REQUEUE_PI support. | ||
1028 | */ | ||
1029 | int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |
1030 | struct rt_mutex_waiter *waiter, | |
1031 | struct task_struct *task, int detect_deadlock) | |
1032 | { | |
1033 | int ret; | |
1034 | |||
1035 | spin_lock(&lock->wait_lock); | |
1036 | |||
1037 | mark_rt_mutex_waiters(lock); | |
1038 | |||
1039 | if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { | |
1040 | /* We got the lock for task; drop wait_lock before the early return. */ | |
1041 | debug_rt_mutex_lock(lock); | |
1042 | |||
1043 | rt_mutex_set_owner(lock, task, 0); | |
1044 | spin_unlock(&lock->wait_lock); | |
1045 | rt_mutex_deadlock_account_lock(lock, task); | |
1046 | return 1; | |
1047 | } | |
1048 | |||
1049 | ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); | |
1050 | |||
1051 | |||
1052 | if (ret && !waiter->task) { | |
1053 | /* | |
1054 | * Reset the return value. We might have | |
1055 | * returned with -EDEADLK and the owner | |
1056 | * released the lock while we were walking the | |
1057 | * pi chain. Let the waiter sort it out. | |
1058 | */ | |
1059 | ret = 0; | |
1060 | } | |
1061 | spin_unlock(&lock->wait_lock); | |
1062 | |||
1063 | debug_rt_mutex_print_deadlock(waiter); | |
1064 | |||
1065 | return ret; | |
1066 | } | |
1067 | |||
1068 | /** | ||
989 | * rt_mutex_next_owner - return the next owner of the lock | 1069 | * rt_mutex_next_owner - return the next owner of the lock |
990 | * | 1070 | * |
991 | * @lock: the rt lock query | 1071 | * @lock: the rt lock query |
@@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) | |||
1004 | 1084 | ||
1005 | return rt_mutex_top_waiter(lock)->task; | 1085 | return rt_mutex_top_waiter(lock)->task; |
1006 | } | 1086 | } |
1087 | |||
1088 | /** | ||
1089 | * rt_mutex_finish_proxy_lock() - Complete lock acquisition | ||
1090 | * @lock: the rt_mutex we were woken on | ||
1091 | * @to: the timeout, null if none. hrtimer should already have | ||
1092 | * been started. | ||
1093 | * @waiter: the pre-initialized rt_mutex_waiter | ||
1094 | * @detect_deadlock: perform deadlock detection (1) or not (0) | ||
1095 | * | ||
1096 | * Complete the lock acquisition started on our behalf by another thread. | |
1097 | * | ||
1098 | * Returns: | ||
1099 | * 0 - success | ||
1100 | * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK | ||
1101 | * | ||
1102 | * Special API call for PI-futex requeue support | ||
1103 | */ | ||
1104 | int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | ||
1105 | struct hrtimer_sleeper *to, | ||
1106 | struct rt_mutex_waiter *waiter, | ||
1107 | int detect_deadlock) | ||
1108 | { | ||
1109 | int ret; | ||
1110 | |||
1111 | spin_lock(&lock->wait_lock); | ||
1112 | |||
1113 | set_current_state(TASK_INTERRUPTIBLE); | ||
1114 | |||
1115 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, | ||
1116 | detect_deadlock); | ||
1117 | |||
1118 | set_current_state(TASK_RUNNING); | ||
1119 | |||
1120 | if (unlikely(waiter->task)) | ||
1121 | remove_waiter(lock, waiter); | ||
1122 | |||
1123 | /* | ||
1124 | * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might | ||
1125 | * have to fix that up. | ||
1126 | */ | ||
1127 | fixup_rt_mutex_waiters(lock); | ||
1128 | |||
1129 | spin_unlock(&lock->wait_lock); | ||
1130 | |||
1131 | /* | ||
1132 | * Readjust priority, when we did not get the lock. We might have been | ||
1133 | * the pending owner and boosted. Since we did not take the lock, the | ||
1134 | * PI boost has to go. | ||
1135 | */ | ||
1136 | if (unlikely(ret)) | ||
1137 | rt_mutex_adjust_prio(current); | ||
1138 | |||
1139 | return ret; | ||
1140 | } | ||
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index e124bf5800ea..97a2f81866af 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h | |||
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | |||
120 | struct task_struct *proxy_owner); | 120 | struct task_struct *proxy_owner); |
121 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, | 121 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, |
122 | struct task_struct *proxy_owner); | 122 | struct task_struct *proxy_owner); |
123 | extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | ||
124 | struct rt_mutex_waiter *waiter, | ||
125 | struct task_struct *task, | ||
126 | int detect_deadlock); | ||
127 | extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | ||
128 | struct hrtimer_sleeper *to, | ||
129 | struct rt_mutex_waiter *waiter, | ||
130 | int detect_deadlock); | ||
123 | 131 | ||
124 | #ifdef CONFIG_DEBUG_RT_MUTEXES | 132 | #ifdef CONFIG_DEBUG_RT_MUTEXES |
125 | # include "rtmutex-debug.h" | 133 | # include "rtmutex-debug.h" |
diff --git a/kernel/sched.c b/kernel/sched.c index 26efa475bdc1..076e403b9c88 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -630,6 +630,10 @@ struct rq { | |||
630 | struct list_head migration_queue; | 630 | struct list_head migration_queue; |
631 | #endif | 631 | #endif |
632 | 632 | ||
633 | /* calc_load related fields */ | ||
634 | unsigned long calc_load_update; | ||
635 | long calc_load_active; | ||
636 | |||
633 | #ifdef CONFIG_SCHED_HRTICK | 637 | #ifdef CONFIG_SCHED_HRTICK |
634 | #ifdef CONFIG_SMP | 638 | #ifdef CONFIG_SMP |
635 | int hrtick_csd_pending; | 639 | int hrtick_csd_pending; |
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
1728 | } | 1732 | } |
1729 | #endif | 1733 | #endif |
1730 | 1734 | ||
1735 | static void calc_load_account_active(struct rq *this_rq); | ||
1736 | |||
1731 | #include "sched_stats.h" | 1737 | #include "sched_stats.h" |
1732 | #include "sched_idletask.c" | 1738 | #include "sched_idletask.c" |
1733 | #include "sched_fair.c" | 1739 | #include "sched_fair.c" |
@@ -2458,6 +2464,17 @@ out: | |||
2458 | return success; | 2464 | return success; |
2459 | } | 2465 | } |
2460 | 2466 | ||
2467 | /** | ||
2468 | * wake_up_process - Wake up a specific process | ||
2469 | * @p: The process to be woken up. | ||
2470 | * | ||
2471 | * Attempt to wake up the nominated process and move it to the set of runnable | ||
2472 | * processes. Returns 1 if the process was woken up, 0 if it was already | ||
2473 | * running. | ||
2474 | * | ||
2475 | * It may be assumed that this function implies a write memory barrier before | ||
2476 | * changing the task state if and only if any tasks are woken up. | ||
2477 | */ | ||
2461 | int wake_up_process(struct task_struct *p) | 2478 | int wake_up_process(struct task_struct *p) |
2462 | { | 2479 | { |
2463 | return try_to_wake_up(p, TASK_ALL, 0); | 2480 | return try_to_wake_up(p, TASK_ALL, 0); |
@@ -2766,7 +2783,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2766 | * combine the page table reload and the switch backend into | 2783 | * combine the page table reload and the switch backend into |
2767 | * one hypercall. | 2784 | * one hypercall. |
2768 | */ | 2785 | */ |
2769 | arch_enter_lazy_cpu_mode(); | 2786 | arch_start_context_switch(prev); |
2770 | 2787 | ||
2771 | if (unlikely(!mm)) { | 2788 | if (unlikely(!mm)) { |
2772 | next->active_mm = oldmm; | 2789 | next->active_mm = oldmm; |
@@ -2856,19 +2873,72 @@ unsigned long nr_iowait(void) | |||
2856 | return sum; | 2873 | return sum; |
2857 | } | 2874 | } |
2858 | 2875 | ||
2859 | unsigned long nr_active(void) | 2876 | /* Variables and functions for calc_load */ |
2877 | static atomic_long_t calc_load_tasks; | ||
2878 | static unsigned long calc_load_update; | ||
2879 | unsigned long avenrun[3]; | ||
2880 | EXPORT_SYMBOL(avenrun); | ||
2881 | |||
2882 | /** | ||
2883 | * get_avenrun - get the load average array | ||
2884 | * @loads: pointer to dest load array | ||
2885 | * @offset: offset to add | ||
2886 | * @shift: shift count to shift the result left | ||
2887 | * | ||
2888 | * These values are estimates at best, so no need for locking. | ||
2889 | */ | ||
2890 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2860 | { | 2891 | { |
2861 | unsigned long i, running = 0, uninterruptible = 0; | 2892 | loads[0] = (avenrun[0] + offset) << shift; |
2893 | loads[1] = (avenrun[1] + offset) << shift; | ||
2894 | loads[2] = (avenrun[2] + offset) << shift; | ||
2895 | } | ||
2862 | 2896 | ||
2863 | for_each_online_cpu(i) { | 2897 | static unsigned long |
2864 | running += cpu_rq(i)->nr_running; | 2898 | calc_load(unsigned long load, unsigned long exp, unsigned long active) |
2865 | uninterruptible += cpu_rq(i)->nr_uninterruptible; | 2899 | { |
2866 | } | 2900 | load *= exp; |
2901 | load += active * (FIXED_1 - exp); | ||
2902 | return load >> FSHIFT; | ||
2903 | } | ||
2867 | 2904 | ||
2868 | if (unlikely((long)uninterruptible < 0)) | 2905 | /* |
2869 | uninterruptible = 0; | 2906 | * calc_global_load - update the avenrun load estimates 10 ticks after the |
2907 | * CPUs have updated calc_load_tasks. | ||
2908 | */ | ||
2909 | void calc_global_load(void) | ||
2910 | { | ||
2911 | unsigned long upd = calc_load_update + 10; | ||
2912 | long active; | ||
2913 | |||
2914 | if (time_before(jiffies, upd)) | ||
2915 | return; | ||
2916 | |||
2917 | active = atomic_long_read(&calc_load_tasks); | ||
2918 | active = active > 0 ? active * FIXED_1 : 0; | ||
2870 | 2919 | ||
2871 | return running + uninterruptible; | 2920 | avenrun[0] = calc_load(avenrun[0], EXP_1, active); |
2921 | avenrun[1] = calc_load(avenrun[1], EXP_5, active); | ||
2922 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); | ||
2923 | |||
2924 | calc_load_update += LOAD_FREQ; | ||
2925 | } | ||
2926 | |||
2927 | /* | ||
2928 | * Either called from update_cpu_load() or from a cpu going idle | ||
2929 | */ | ||
2930 | static void calc_load_account_active(struct rq *this_rq) | ||
2931 | { | ||
2932 | long nr_active, delta; | ||
2933 | |||
2934 | nr_active = this_rq->nr_running; | ||
2935 | nr_active += (long) this_rq->nr_uninterruptible; | ||
2936 | |||
2937 | if (nr_active != this_rq->calc_load_active) { | ||
2938 | delta = nr_active - this_rq->calc_load_active; | ||
2939 | this_rq->calc_load_active = nr_active; | ||
2940 | atomic_long_add(delta, &calc_load_tasks); | ||
2941 | } | ||
2872 | } | 2942 | } |
2873 | 2943 | ||
2874 | /* | 2944 | /* |
@@ -2899,6 +2969,11 @@ static void update_cpu_load(struct rq *this_rq) | |||
2899 | new_load += scale-1; | 2969 | new_load += scale-1; |
2900 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 2970 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; |
2901 | } | 2971 | } |
2972 | |||
2973 | if (time_after_eq(jiffies, this_rq->calc_load_update)) { | ||
2974 | this_rq->calc_load_update += LOAD_FREQ; | ||
2975 | calc_load_account_active(this_rq); | ||
2976 | } | ||
2902 | } | 2977 | } |
2903 | 2978 | ||
2904 | #ifdef CONFIG_SMP | 2979 | #ifdef CONFIG_SMP |
@@ -4240,10 +4315,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
4240 | static struct { | 4315 | static struct { |
4241 | atomic_t load_balancer; | 4316 | atomic_t load_balancer; |
4242 | cpumask_var_t cpu_mask; | 4317 | cpumask_var_t cpu_mask; |
4318 | cpumask_var_t ilb_grp_nohz_mask; | ||
4243 | } nohz ____cacheline_aligned = { | 4319 | } nohz ____cacheline_aligned = { |
4244 | .load_balancer = ATOMIC_INIT(-1), | 4320 | .load_balancer = ATOMIC_INIT(-1), |
4245 | }; | 4321 | }; |
4246 | 4322 | ||
4323 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
4324 | /** | ||
4325 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
4326 | * @cpu: The cpu whose lowest level of sched domain is to | ||
4327 | * be returned. | ||
4328 | * @flag: The flag to check for the lowest sched_domain | ||
4329 | * for the given cpu. | ||
4330 | * | ||
4331 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
4332 | */ | ||
4333 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
4334 | { | ||
4335 | struct sched_domain *sd; | ||
4336 | |||
4337 | for_each_domain(cpu, sd) | ||
4338 | if (sd && (sd->flags & flag)) | ||
4339 | break; | ||
4340 | |||
4341 | return sd; | ||
4342 | } | ||
4343 | |||
4344 | /** | ||
4345 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
4346 | * @cpu: The cpu whose domains we're iterating over. | ||
4347 | * @sd: variable holding the value of the power_savings_sd | ||
4348 | * for cpu. | ||
4349 | * @flag: The flag to filter the sched_domains to be iterated. | ||
4350 | * | ||
4351 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
4352 | * set, starting from the lowest sched_domain to the highest. | ||
4353 | */ | ||
4354 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
4355 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
4356 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
4357 | |||
4358 | /** | ||
4359 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
4360 | * @ilb_group: group to be checked for semi-idleness | ||
4361 | * | ||
4362 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
4363 | * | ||
4364 | * We define a sched_group to be semi-idle if it has at least one idle CPU | |
4365 | * and at least one non-idle CPU. This helper function checks if the given | |
4366 | * sched_group is semi-idle or not. | ||
4367 | */ | ||
4368 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
4369 | { | ||
4370 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | ||
4371 | sched_group_cpus(ilb_group)); | ||
4372 | |||
4373 | /* | ||
4374 | * A sched_group is semi-idle when it has at least one busy cpu | |
4375 | * and at least one idle cpu. | |
4376 | */ | ||
4377 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | ||
4378 | return 0; | ||
4379 | |||
4380 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | ||
4381 | return 0; | ||
4382 | |||
4383 | return 1; | ||
4384 | } | ||
4385 | /** | ||
4386 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
4387 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
4388 | * | ||
4389 | * Returns: the id of the idle load balancer if it exists, | |
4390 | * else a value >= nr_cpu_ids. | |
4391 | * | ||
4392 | * This algorithm picks the idle load balancer such that it belongs to a | ||
4393 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
4394 | * completely idle packages/cores just for the purpose of idle load balancing | ||
4395 | * when there are other idle cpus which are better suited for that job. | |
4396 | */ | ||
4397 | static int find_new_ilb(int cpu) | ||
4398 | { | ||
4399 | struct sched_domain *sd; | ||
4400 | struct sched_group *ilb_group; | ||
4401 | |||
4402 | /* | ||
4403 | * Have idle load balancer selection from semi-idle packages only | ||
4404 | * when power-aware load balancing is enabled | ||
4405 | */ | ||
4406 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
4407 | goto out_done; | ||
4408 | |||
4409 | /* | ||
4410 | * Optimize for the case when we have no idle CPUs or only one | ||
4411 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
4412 | */ | ||
4413 | if (cpumask_weight(nohz.cpu_mask) < 2) | ||
4414 | goto out_done; | ||
4415 | |||
4416 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
4417 | ilb_group = sd->groups; | ||
4418 | |||
4419 | do { | ||
4420 | if (is_semi_idle_group(ilb_group)) | ||
4421 | return cpumask_first(nohz.ilb_grp_nohz_mask); | ||
4422 | |||
4423 | ilb_group = ilb_group->next; | ||
4424 | |||
4425 | } while (ilb_group != sd->groups); | ||
4426 | } | ||
4427 | |||
4428 | out_done: | ||
4429 | return cpumask_first(nohz.cpu_mask); | ||
4430 | } | ||
4431 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
4432 | static inline int find_new_ilb(int call_cpu) | ||
4433 | { | ||
4434 | return cpumask_first(nohz.cpu_mask); | ||
4435 | } | ||
4436 | #endif | ||
4437 | |||
4247 | /* | 4438 | /* |
4248 | * This routine will try to nominate the ilb (idle load balancing) | 4439 | * This routine will try to nominate the ilb (idle load balancing) |
4249 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | 4440 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle |
@@ -4298,8 +4489,24 @@ int select_nohz_load_balancer(int stop_tick) | |||
4298 | /* make me the ilb owner */ | 4489 | /* make me the ilb owner */ |
4299 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | 4490 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) |
4300 | return 1; | 4491 | return 1; |
4301 | } else if (atomic_read(&nohz.load_balancer) == cpu) | 4492 | } else if (atomic_read(&nohz.load_balancer) == cpu) { |
4493 | int new_ilb; | ||
4494 | |||
4495 | if (!(sched_smt_power_savings || | ||
4496 | sched_mc_power_savings)) | ||
4497 | return 1; | ||
4498 | /* | ||
4499 | * Check to see if there is a more power-efficient | ||
4500 | * ilb. | ||
4501 | */ | ||
4502 | new_ilb = find_new_ilb(cpu); | ||
4503 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
4504 | atomic_set(&nohz.load_balancer, -1); | ||
4505 | resched_cpu(new_ilb); | ||
4506 | return 0; | ||
4507 | } | ||
4302 | return 1; | 4508 | return 1; |
4509 | } | ||
4303 | } else { | 4510 | } else { |
4304 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | 4511 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) |
4305 | return 0; | 4512 | return 0; |
@@ -4468,15 +4675,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) | |||
4468 | } | 4675 | } |
4469 | 4676 | ||
4470 | if (atomic_read(&nohz.load_balancer) == -1) { | 4677 | if (atomic_read(&nohz.load_balancer) == -1) { |
4471 | /* | 4678 | int ilb = find_new_ilb(cpu); |
4472 | * simple selection for now: Nominate the | ||
4473 | * first cpu in the nohz list to be the next | ||
4474 | * ilb owner. | ||
4475 | * | ||
4476 | * TBD: Traverse the sched domains and nominate | ||
4477 | * the nearest cpu in the nohz.cpu_mask. | ||
4478 | */ | ||
4479 | int ilb = cpumask_first(nohz.cpu_mask); | ||
4480 | 4679 | ||
4481 | if (ilb < nr_cpu_ids) | 4680 | if (ilb < nr_cpu_ids) |
4482 | resched_cpu(ilb); | 4681 | resched_cpu(ilb); |
@@ -5007,13 +5206,15 @@ pick_next_task(struct rq *rq) | |||
5007 | /* | 5206 | /* |
5008 | * schedule() is the main scheduler function. | 5207 | * schedule() is the main scheduler function. |
5009 | */ | 5208 | */ |
5010 | asmlinkage void __sched __schedule(void) | 5209 | asmlinkage void __sched schedule(void) |
5011 | { | 5210 | { |
5012 | struct task_struct *prev, *next; | 5211 | struct task_struct *prev, *next; |
5013 | unsigned long *switch_count; | 5212 | unsigned long *switch_count; |
5014 | struct rq *rq; | 5213 | struct rq *rq; |
5015 | int cpu; | 5214 | int cpu; |
5016 | 5215 | ||
5216 | need_resched: | ||
5217 | preempt_disable(); | ||
5017 | cpu = smp_processor_id(); | 5218 | cpu = smp_processor_id(); |
5018 | rq = cpu_rq(cpu); | 5219 | rq = cpu_rq(cpu); |
5019 | rcu_qsctr_inc(cpu); | 5220 | rcu_qsctr_inc(cpu); |
@@ -5070,15 +5271,9 @@ need_resched_nonpreemptible: | |||
5070 | 5271 | ||
5071 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 5272 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
5072 | goto need_resched_nonpreemptible; | 5273 | goto need_resched_nonpreemptible; |
5073 | } | ||
5074 | 5274 | ||
5075 | asmlinkage void __sched schedule(void) | ||
5076 | { | ||
5077 | need_resched: | ||
5078 | preempt_disable(); | ||
5079 | __schedule(); | ||
5080 | preempt_enable_no_resched(); | 5275 | preempt_enable_no_resched(); |
5081 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 5276 | if (need_resched()) |
5082 | goto need_resched; | 5277 | goto need_resched; |
5083 | } | 5278 | } |
5084 | EXPORT_SYMBOL(schedule); | 5279 | EXPORT_SYMBOL(schedule); |
@@ -5221,7 +5416,7 @@ EXPORT_SYMBOL(default_wake_function); | |||
5221 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | 5416 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns |
5222 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 5417 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
5223 | */ | 5418 | */ |
5224 | void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 5419 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
5225 | int nr_exclusive, int sync, void *key) | 5420 | int nr_exclusive, int sync, void *key) |
5226 | { | 5421 | { |
5227 | wait_queue_t *curr, *next; | 5422 | wait_queue_t *curr, *next; |
@@ -5241,6 +5436,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | |||
5241 | * @mode: which threads | 5436 | * @mode: which threads |
5242 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 5437 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
5243 | * @key: is directly passed to the wakeup function | 5438 | * @key: is directly passed to the wakeup function |
5439 | * | ||
5440 | * It may be assumed that this function implies a write memory barrier before | ||
5441 | * changing the task state if and only if any tasks are woken up. | ||
5244 | */ | 5442 | */ |
5245 | void __wake_up(wait_queue_head_t *q, unsigned int mode, | 5443 | void __wake_up(wait_queue_head_t *q, unsigned int mode, |
5246 | int nr_exclusive, void *key) | 5444 | int nr_exclusive, void *key) |
@@ -5279,6 +5477,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | |||
5279 | * with each other. This can prevent needless bouncing between CPUs. | 5477 | * with each other. This can prevent needless bouncing between CPUs. |
5280 | * | 5478 | * |
5281 | * On UP it can prevent extra preemption. | 5479 | * On UP it can prevent extra preemption. |
5480 | * | ||
5481 | * It may be assumed that this function implies a write memory barrier before | ||
5482 | * changing the task state if and only if any tasks are woken up. | ||
5282 | */ | 5483 | */ |
5283 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | 5484 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, |
5284 | int nr_exclusive, void *key) | 5485 | int nr_exclusive, void *key) |
@@ -5315,6 +5516,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | |||
5315 | * awakened in the same order in which they were queued. | 5516 | * awakened in the same order in which they were queued. |
5316 | * | 5517 | * |
5317 | * See also complete_all(), wait_for_completion() and related routines. | 5518 | * See also complete_all(), wait_for_completion() and related routines. |
5519 | * | ||
5520 | * It may be assumed that this function implies a write memory barrier before | ||
5521 | * changing the task state if and only if any tasks are woken up. | ||
5318 | */ | 5522 | */ |
5319 | void complete(struct completion *x) | 5523 | void complete(struct completion *x) |
5320 | { | 5524 | { |
@@ -5332,6 +5536,9 @@ EXPORT_SYMBOL(complete); | |||
5332 | * @x: holds the state of this particular completion | 5536 | * @x: holds the state of this particular completion |
5333 | * | 5537 | * |
5334 | * This will wake up all threads waiting on this particular completion event. | 5538 | * This will wake up all threads waiting on this particular completion event. |
5539 | * | ||
5540 | * It may be assumed that this function implies a write memory barrier before | ||
5541 | * changing the task state if and only if any tasks are woken up. | ||
5335 | */ | 5542 | */ |
5336 | void complete_all(struct completion *x) | 5543 | void complete_all(struct completion *x) |
5337 | { | 5544 | { |
@@ -6490,8 +6697,9 @@ void sched_show_task(struct task_struct *p) | |||
6490 | #ifdef CONFIG_DEBUG_STACK_USAGE | 6697 | #ifdef CONFIG_DEBUG_STACK_USAGE |
6491 | free = stack_not_used(p); | 6698 | free = stack_not_used(p); |
6492 | #endif | 6699 | #endif |
6493 | printk(KERN_CONT "%5lu %5d %6d\n", free, | 6700 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
6494 | task_pid_nr(p), task_pid_nr(p->real_parent)); | 6701 | task_pid_nr(p), task_pid_nr(p->real_parent), |
6702 | (unsigned long)task_thread_info(p)->flags); | ||
6495 | 6703 | ||
6496 | show_stack(p, NULL); | 6704 | show_stack(p, NULL); |
6497 | } | 6705 | } |
@@ -6970,6 +7178,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
6970 | 7178 | ||
6971 | } | 7179 | } |
6972 | } | 7180 | } |
7181 | |||
7182 | /* | ||
7183 | * remove the tasks which were accounted by rq from calc_load_tasks. | ||
7184 | */ | ||
7185 | static void calc_global_load_remove(struct rq *rq) | ||
7186 | { | ||
7187 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | ||
7188 | } | ||
6973 | #endif /* CONFIG_HOTPLUG_CPU */ | 7189 | #endif /* CONFIG_HOTPLUG_CPU */ |
6974 | 7190 | ||
6975 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 7191 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
@@ -7204,6 +7420,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
7204 | /* Update our root-domain */ | 7420 | /* Update our root-domain */ |
7205 | rq = cpu_rq(cpu); | 7421 | rq = cpu_rq(cpu); |
7206 | spin_lock_irqsave(&rq->lock, flags); | 7422 | spin_lock_irqsave(&rq->lock, flags); |
7423 | rq->calc_load_update = calc_load_update; | ||
7424 | rq->calc_load_active = 0; | ||
7207 | if (rq->rd) { | 7425 | if (rq->rd) { |
7208 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 7426 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
7209 | 7427 | ||
@@ -7243,7 +7461,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
7243 | cpuset_unlock(); | 7461 | cpuset_unlock(); |
7244 | migrate_nr_uninterruptible(rq); | 7462 | migrate_nr_uninterruptible(rq); |
7245 | BUG_ON(rq->nr_running != 0); | 7463 | BUG_ON(rq->nr_running != 0); |
7246 | 7464 | calc_global_load_remove(rq); | |
7247 | /* | 7465 | /* |
7248 | * No need to migrate the tasks: it was best-effort if | 7466 | * No need to migrate the tasks: it was best-effort if |
7249 | * they didn't take sched_hotcpu_mutex. Just wake up | 7467 | * they didn't take sched_hotcpu_mutex. Just wake up |
@@ -7753,8 +7971,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | |||
7753 | 7971 | ||
7754 | /* | 7972 | /* |
7755 | * The cpus mask in sched_group and sched_domain hangs off the end. | 7973 | * The cpus mask in sched_group and sched_domain hangs off the end. |
7756 | * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space | 7974 | * |
7757 | * for nr_cpu_ids < CONFIG_NR_CPUS. | 7975 | * ( See the comments in include/linux/sched.h:struct sched_group |
7976 | * and struct sched_domain. ) | ||
7758 | */ | 7977 | */ |
7759 | struct static_sched_group { | 7978 | struct static_sched_group { |
7760 | struct sched_group sg; | 7979 | struct sched_group sg; |
@@ -7875,7 +8094,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
7875 | struct sched_domain *sd; | 8094 | struct sched_domain *sd; |
7876 | 8095 | ||
7877 | sd = &per_cpu(phys_domains, j).sd; | 8096 | sd = &per_cpu(phys_domains, j).sd; |
7878 | if (j != cpumask_first(sched_group_cpus(sd->groups))) { | 8097 | if (j != group_first_cpu(sd->groups)) { |
7879 | /* | 8098 | /* |
7880 | * Only add "power" once for each | 8099 | * Only add "power" once for each |
7881 | * physical package. | 8100 | * physical package. |
@@ -7953,7 +8172,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
7953 | 8172 | ||
7954 | WARN_ON(!sd || !sd->groups); | 8173 | WARN_ON(!sd || !sd->groups); |
7955 | 8174 | ||
7956 | if (cpu != cpumask_first(sched_group_cpus(sd->groups))) | 8175 | if (cpu != group_first_cpu(sd->groups)) |
7957 | return; | 8176 | return; |
7958 | 8177 | ||
7959 | child = sd->child; | 8178 | child = sd->child; |
@@ -8938,6 +9157,8 @@ void __init sched_init(void) | |||
8938 | rq = cpu_rq(i); | 9157 | rq = cpu_rq(i); |
8939 | spin_lock_init(&rq->lock); | 9158 | spin_lock_init(&rq->lock); |
8940 | rq->nr_running = 0; | 9159 | rq->nr_running = 0; |
9160 | rq->calc_load_active = 0; | ||
9161 | rq->calc_load_update = jiffies + LOAD_FREQ; | ||
8941 | init_cfs_rq(&rq->cfs, rq); | 9162 | init_cfs_rq(&rq->cfs, rq); |
8942 | init_rt_rq(&rq->rt, rq); | 9163 | init_rt_rq(&rq->rt, rq); |
8943 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9164 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -9045,6 +9266,9 @@ void __init sched_init(void) | |||
9045 | * when this runqueue becomes "idle". | 9266 | * when this runqueue becomes "idle". |
9046 | */ | 9267 | */ |
9047 | init_idle(current, smp_processor_id()); | 9268 | init_idle(current, smp_processor_id()); |
9269 | |||
9270 | calc_load_update = jiffies + LOAD_FREQ; | ||
9271 | |||
9048 | /* | 9272 | /* |
9049 | * During early bootup we pretend to be a normal task: | 9273 | * During early bootup we pretend to be a normal task: |
9050 | */ | 9274 | */ |
@@ -9055,6 +9279,7 @@ void __init sched_init(void) | |||
9055 | #ifdef CONFIG_SMP | 9279 | #ifdef CONFIG_SMP |
9056 | #ifdef CONFIG_NO_HZ | 9280 | #ifdef CONFIG_NO_HZ |
9057 | alloc_bootmem_cpumask_var(&nohz.cpu_mask); | 9281 | alloc_bootmem_cpumask_var(&nohz.cpu_mask); |
9282 | alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask); | ||
9058 | #endif | 9283 | #endif |
9059 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | 9284 | alloc_bootmem_cpumask_var(&cpu_isolated_map); |
9060 | #endif /* SMP */ | 9285 | #endif /* SMP */ |
@@ -9800,6 +10025,13 @@ static int sched_rt_global_constraints(void) | |||
9800 | if (sysctl_sched_rt_period <= 0) | 10025 | if (sysctl_sched_rt_period <= 0) |
9801 | return -EINVAL; | 10026 | return -EINVAL; |
9802 | 10027 | ||
10028 | /* | ||
10029 | * There are always some RT tasks in the root group | ||
10030 | * -- migration, kstopmachine etc. | ||
10031 | */ | ||
10032 | if (sysctl_sched_rt_runtime == 0) | ||
10033 | return -EBUSY; | ||
10034 | |||
9803 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | 10035 | spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); |
9804 | for_each_possible_cpu(i) { | 10036 | for_each_possible_cpu(i) { |
9805 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | 10037 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; |
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index cdd3c89574cd..344712a5e3ed 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
@@ -165,7 +165,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) | |||
165 | vec->count = 0; | 165 | vec->count = 0; |
166 | if (bootmem) | 166 | if (bootmem) |
167 | alloc_bootmem_cpumask_var(&vec->mask); | 167 | alloc_bootmem_cpumask_var(&vec->mask); |
168 | else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) | 168 | else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) |
169 | goto cleanup; | 169 | goto cleanup; |
170 | } | 170 | } |
171 | 171 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3816f217f119..5f9650e8fe75 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) | |||
1487 | 1487 | ||
1488 | find_matching_se(&se, &pse); | 1488 | find_matching_se(&se, &pse); |
1489 | 1489 | ||
1490 | while (se) { | 1490 | BUG_ON(!pse); |
1491 | BUG_ON(!pse); | ||
1492 | 1491 | ||
1493 | if (wakeup_preempt_entity(se, pse) == 1) { | 1492 | if (wakeup_preempt_entity(se, pse) == 1) |
1494 | resched_task(curr); | 1493 | resched_task(curr); |
1495 | break; | ||
1496 | } | ||
1497 | |||
1498 | se = parent_entity(se); | ||
1499 | pse = parent_entity(pse); | ||
1500 | } | ||
1501 | } | 1494 | } |
1502 | 1495 | ||
1503 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 1496 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 8a21a2e28c13..499672c10cbd 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy | |||
22 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 22 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
23 | { | 23 | { |
24 | schedstat_inc(rq, sched_goidle); | 24 | schedstat_inc(rq, sched_goidle); |
25 | 25 | /* adjust the active tasks as we might go into a long sleep */ | |
26 | calc_load_account_active(rq); | ||
26 | return rq->idle; | 27 | return rq->idle; |
27 | } | 28 | } |
28 | 29 | ||
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f2c66f8f9712..9bf0d2a73045 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void) | |||
1591 | unsigned int i; | 1591 | unsigned int i; |
1592 | 1592 | ||
1593 | for_each_possible_cpu(i) | 1593 | for_each_possible_cpu(i) |
1594 | alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), | 1594 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), |
1595 | GFP_KERNEL, cpu_to_node(i)); | 1595 | GFP_KERNEL, cpu_to_node(i)); |
1596 | } | 1596 | } |
1597 | #endif /* CONFIG_SMP */ | 1597 | #endif /* CONFIG_SMP */ |
diff --git a/kernel/smp.c b/kernel/smp.c index 858baac568ee..ad63d8501207 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
52 | switch (action) { | 52 | switch (action) { |
53 | case CPU_UP_PREPARE: | 53 | case CPU_UP_PREPARE: |
54 | case CPU_UP_PREPARE_FROZEN: | 54 | case CPU_UP_PREPARE_FROZEN: |
55 | if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, | 55 | if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, |
56 | cpu_to_node(cpu))) | 56 | cpu_to_node(cpu))) |
57 | return NOTIFY_BAD; | 57 | return NOTIFY_BAD; |
58 | break; | 58 | break; |
diff --git a/kernel/softirq.c b/kernel/softirq.c index b525dd348511..f674f332a024 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -828,7 +828,7 @@ int __init __weak arch_early_irq_init(void) | |||
828 | return 0; | 828 | return 0; |
829 | } | 829 | } |
830 | 830 | ||
831 | int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) | 831 | int __weak arch_init_chip_data(struct irq_desc *desc, int node) |
832 | { | 832 | { |
833 | return 0; | 833 | return 0; |
834 | } | 834 | } |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 45bd711a242e..944ba03cae19 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -743,6 +743,14 @@ static struct ctl_table kern_table[] = { | |||
743 | }, | 743 | }, |
744 | { | 744 | { |
745 | .ctl_name = CTL_UNNUMBERED, | 745 | .ctl_name = CTL_UNNUMBERED, |
746 | .procname = "bootloader_version", | ||
747 | .data = &bootloader_version, | ||
748 | .maxlen = sizeof (int), | ||
749 | .mode = 0444, | ||
750 | .proc_handler = &proc_dointvec, | ||
751 | }, | ||
752 | { | ||
753 | .ctl_name = CTL_UNNUMBERED, | ||
746 | .procname = "kstack_depth_to_print", | 754 | .procname = "kstack_depth_to_print", |
747 | .data = &kstack_depth_to_print, | 755 | .data = &kstack_depth_to_print, |
748 | .maxlen = sizeof(int), | 756 | .maxlen = sizeof(int), |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff49f6e7..52a8bf8931f3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -22,7 +22,7 @@ | |||
22 | 22 | ||
23 | /* | 23 | /* |
24 | * This read-write spinlock protects us from races in SMP while | 24 | * This read-write spinlock protects us from races in SMP while |
25 | * playing with xtime and avenrun. | 25 | * playing with xtime. |
26 | */ | 26 | */ |
27 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | 27 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); |
28 | 28 | ||
diff --git a/kernel/timer.c b/kernel/timer.c index cffffad01c31..a26ed294f938 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -1123,47 +1123,6 @@ void update_process_times(int user_tick) | |||
1123 | } | 1123 | } |
1124 | 1124 | ||
1125 | /* | 1125 | /* |
1126 | * Nr of active tasks - counted in fixed-point numbers | ||
1127 | */ | ||
1128 | static unsigned long count_active_tasks(void) | ||
1129 | { | ||
1130 | return nr_active() * FIXED_1; | ||
1131 | } | ||
1132 | |||
1133 | /* | ||
1134 | * Hmm.. Changed this, as the GNU make sources (load.c) seems to | ||
1135 | * imply that avenrun[] is the standard name for this kind of thing. | ||
1136 | * Nothing else seems to be standardized: the fractional size etc | ||
1137 | * all seem to differ on different machines. | ||
1138 | * | ||
1139 | * Requires xtime_lock to access. | ||
1140 | */ | ||
1141 | unsigned long avenrun[3]; | ||
1142 | |||
1143 | EXPORT_SYMBOL(avenrun); | ||
1144 | |||
1145 | /* | ||
1146 | * calc_load - given tick count, update the avenrun load estimates. | ||
1147 | * This is called while holding a write_lock on xtime_lock. | ||
1148 | */ | ||
1149 | static inline void calc_load(unsigned long ticks) | ||
1150 | { | ||
1151 | unsigned long active_tasks; /* fixed-point */ | ||
1152 | static int count = LOAD_FREQ; | ||
1153 | |||
1154 | count -= ticks; | ||
1155 | if (unlikely(count < 0)) { | ||
1156 | active_tasks = count_active_tasks(); | ||
1157 | do { | ||
1158 | CALC_LOAD(avenrun[0], EXP_1, active_tasks); | ||
1159 | CALC_LOAD(avenrun[1], EXP_5, active_tasks); | ||
1160 | CALC_LOAD(avenrun[2], EXP_15, active_tasks); | ||
1161 | count += LOAD_FREQ; | ||
1162 | } while (count < 0); | ||
1163 | } | ||
1164 | } | ||
1165 | |||
1166 | /* | ||
1167 | * This function runs timers and the timer-tq in bottom half context. | 1126 | * This function runs timers and the timer-tq in bottom half context. |
1168 | */ | 1127 | */ |
1169 | static void run_timer_softirq(struct softirq_action *h) | 1128 | static void run_timer_softirq(struct softirq_action *h) |
@@ -1187,16 +1146,6 @@ void run_local_timers(void) | |||
1187 | } | 1146 | } |
1188 | 1147 | ||
1189 | /* | 1148 | /* |
1190 | * Called by the timer interrupt. xtime_lock must already be taken | ||
1191 | * by the timer IRQ! | ||
1192 | */ | ||
1193 | static inline void update_times(unsigned long ticks) | ||
1194 | { | ||
1195 | update_wall_time(); | ||
1196 | calc_load(ticks); | ||
1197 | } | ||
1198 | |||
1199 | /* | ||
1200 | * The 64-bit jiffies value is not atomic - you MUST NOT read it | 1149 | * The 64-bit jiffies value is not atomic - you MUST NOT read it |
1201 | * without sampling the sequence number in xtime_lock. | 1150 | * without sampling the sequence number in xtime_lock. |
1202 | * jiffies is defined in the linker script... | 1151 | * jiffies is defined in the linker script... |
@@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks) | |||
1205 | void do_timer(unsigned long ticks) | 1154 | void do_timer(unsigned long ticks) |
1206 | { | 1155 | { |
1207 | jiffies_64 += ticks; | 1156 | jiffies_64 += ticks; |
1208 | update_times(ticks); | 1157 | update_wall_time(); |
1158 | calc_global_load(); | ||
1209 | } | 1159 | } |
1210 | 1160 | ||
1211 | #ifdef __ARCH_WANT_SYS_ALARM | 1161 | #ifdef __ARCH_WANT_SYS_ALARM |
@@ -1406,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info) | |||
1406 | { | 1356 | { |
1407 | unsigned long mem_total, sav_total; | 1357 | unsigned long mem_total, sav_total; |
1408 | unsigned int mem_unit, bitcount; | 1358 | unsigned int mem_unit, bitcount; |
1409 | unsigned long seq; | 1359 | struct timespec tp; |
1410 | 1360 | ||
1411 | memset(info, 0, sizeof(struct sysinfo)); | 1361 | memset(info, 0, sizeof(struct sysinfo)); |
1412 | 1362 | ||
1413 | do { | 1363 | ktime_get_ts(&tp); |
1414 | struct timespec tp; | 1364 | monotonic_to_bootbased(&tp); |
1415 | seq = read_seqbegin(&xtime_lock); | 1365 | info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); |
1416 | |||
1417 | /* | ||
1418 | * This is annoying. The below is the same thing | ||
1419 | * posix_get_clock_monotonic() does, but it wants to | ||
1420 | * take the lock which we want to cover the loads stuff | ||
1421 | * too. | ||
1422 | */ | ||
1423 | |||
1424 | getnstimeofday(&tp); | ||
1425 | tp.tv_sec += wall_to_monotonic.tv_sec; | ||
1426 | tp.tv_nsec += wall_to_monotonic.tv_nsec; | ||
1427 | monotonic_to_bootbased(&tp); | ||
1428 | if (tp.tv_nsec - NSEC_PER_SEC >= 0) { | ||
1429 | tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; | ||
1430 | tp.tv_sec++; | ||
1431 | } | ||
1432 | info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); | ||
1433 | 1366 | ||
1434 | info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); | 1367 | get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); |
1435 | info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); | ||
1436 | info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); | ||
1437 | 1368 | ||
1438 | info->procs = nr_threads; | 1369 | info->procs = nr_threads; |
1439 | } while (read_seqretry(&xtime_lock, seq)); | ||
1440 | 1370 | ||
1441 | si_meminfo(info); | 1371 | si_meminfo(info); |
1442 | si_swapinfo(info); | 1372 | si_swapinfo(info); |
diff --git a/kernel/wait.c b/kernel/wait.c index 42a2dbc181c8..ea7c3b4275cf 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, | |||
154 | if (!list_empty(&wait->task_list)) | 154 | if (!list_empty(&wait->task_list)) |
155 | list_del_init(&wait->task_list); | 155 | list_del_init(&wait->task_list); |
156 | else if (waitqueue_active(q)) | 156 | else if (waitqueue_active(q)) |
157 | __wake_up_common(q, mode, 1, 0, key); | 157 | __wake_up_locked_key(q, mode, key); |
158 | spin_unlock_irqrestore(&q->lock, flags); | 158 | spin_unlock_irqrestore(&q->lock, flags); |
159 | } | 159 | } |
160 | EXPORT_SYMBOL(abort_exclusive_wait); | 160 | EXPORT_SYMBOL(abort_exclusive_wait); |