-rw-r--r--  arch/alpha/kernel/smp.c             |   3
-rw-r--r--  arch/arm/kernel/smp.c               |   5
-rw-r--r--  arch/blackfin/mach-common/smp.c     |   3
-rw-r--r--  arch/cris/arch-v32/kernel/smp.c     |  13
-rw-r--r--  arch/ia64/kernel/irq_ia64.c         |   2
-rw-r--r--  arch/ia64/xen/irq_xen.c             |  10
-rw-r--r--  arch/m32r/kernel/smp.c              |   4
-rw-r--r--  arch/mips/cavium-octeon/smp.c       |   2
-rw-r--r--  arch/mips/kernel/smtc.c             |   2
-rw-r--r--  arch/mips/mti-malta/malta-int.c     |   2
-rw-r--r--  arch/mips/pmc-sierra/yosemite/smp.c |   4
-rw-r--r--  arch/mips/sgi-ip27/ip27-irq.c       |   2
-rw-r--r--  arch/mips/sibyte/bcm1480/smp.c      |   7
-rw-r--r--  arch/mips/sibyte/sb1250/smp.c       |   7
-rw-r--r--  arch/mn10300/kernel/smp.c           |   5
-rw-r--r--  arch/parisc/kernel/smp.c            |   5
-rw-r--r--  arch/powerpc/kernel/smp.c           |   4
-rw-r--r--  arch/s390/kernel/smp.c              |   6
-rw-r--r--  arch/sh/kernel/smp.c                |   2
-rw-r--r--  arch/sparc/kernel/smp_32.c          |   8
-rw-r--r--  arch/sparc/kernel/smp_64.c          |   1
-rw-r--r--  arch/tile/kernel/smp.c              |   6
-rw-r--r--  arch/um/kernel/smp.c                |   2
-rw-r--r--  arch/x86/kernel/smp.c               |   5
-rw-r--r--  arch/x86/xen/smp.c                  |   5
-rw-r--r--  include/linux/mutex.h               |   2
-rw-r--r--  include/linux/sched.h               |  24
-rw-r--r--  init/Kconfig                        |   5
-rw-r--r--  kernel/mutex-debug.c                |   2
-rw-r--r--  kernel/mutex-debug.h                |   2
-rw-r--r--  kernel/mutex.c                      |   2
-rw-r--r--  kernel/mutex.h                      |   2
-rw-r--r--  kernel/sched.c                      | 627
-rw-r--r--  kernel/sched_debug.c                |   2
-rw-r--r--  kernel/sched_fair.c                 |  28
-rw-r--r--  kernel/sched_features.h             |   6
-rw-r--r--  kernel/sched_idletask.c             |   2
-rw-r--r--  kernel/sched_rt.c                   |  54
-rw-r--r--  kernel/sched_stoptask.c             |   5
39 files changed, 501 insertions, 377 deletions
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 42aa078a5e4d..5a621c6d22ab 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs)
585 585
586 switch (which) { 586 switch (which) {
587 case IPI_RESCHEDULE: 587 case IPI_RESCHEDULE:
588 /* Reschedule callback. Everything to be done 588 scheduler_ipi();
589 is done by the interrupt return path. */
590 break; 589 break;
591 590
592 case IPI_CALL_FUNC: 591 case IPI_CALL_FUNC:
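
The Alpha hunk above sets the pattern repeated for every architecture in this series: the reschedule IPI case used to be empty, relying on the interrupt return path to act on TIF_NEED_RESCHED, and now it calls scheduler_ipi() so the scheduler gets a hook to run remote-wakeup work in IPI context. A minimal stand-alone sketch of the shape such a handler takes (the IPI names and the stubbed scheduler_ipi() here are illustrative, not any particular architecture's code):

#include <stdio.h>

/* Stand-ins for the arch IPI message types (IPI_RESCHEDULE etc.). */
enum ipi_msg { IPI_RESCHEDULE, IPI_CALL_FUNC, IPI_CPU_STOP };

/* Stub: the real version (added in kernel/sched.c below) drains rq->wake_list. */
static void scheduler_ipi(void)
{
        printf("scheduler_ipi(): run queued remote-wakeup work\n");
}

/* Shape of an arch IPI dispatcher after this series. */
static void handle_ipi(enum ipi_msg which)
{
        switch (which) {
        case IPI_RESCHEDULE:
                scheduler_ipi();        /* previously an empty case */
                break;
        case IPI_CALL_FUNC:
                printf("run smp_call_function() callbacks\n");
                break;
        case IPI_CPU_STOP:
                printf("stop this CPU\n");
                break;
        }
}

int main(void)
{
        handle_ipi(IPI_RESCHEDULE);
        return 0;
}

The same one-line substitution shows up below for ARM, Blackfin, CRIS, ia64, m32r, the MIPS platforms, mn10300, parisc, powerpc, s390, sh, sparc, tile, UML, x86 and Xen.
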
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 8fe05ad932e4..7a561eb731ea 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -560,10 +560,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs)
560 break; 560 break;
561 561
562 case IPI_RESCHEDULE: 562 case IPI_RESCHEDULE:
563 /* 563 scheduler_ipi();
564 * nothing more to do - eveything is
565 * done on the interrupt return path
566 */
567 break; 564 break;
568 565
569 case IPI_CALL_FUNC: 566 case IPI_CALL_FUNC:
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index 8bce5ed031e4..1fbd94c44457 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -177,6 +177,9 @@ static irqreturn_t ipi_handler_int1(int irq, void *dev_instance)
177 while (msg_queue->count) { 177 while (msg_queue->count) {
178 msg = &msg_queue->ipi_message[msg_queue->head]; 178 msg = &msg_queue->ipi_message[msg_queue->head];
179 switch (msg->type) { 179 switch (msg->type) {
180 case BFIN_IPI_RESCHEDULE:
181 scheduler_ipi();
182 break;
180 case BFIN_IPI_CALL_FUNC: 183 case BFIN_IPI_CALL_FUNC:
181 spin_unlock_irqrestore(&msg_queue->lock, flags); 184 spin_unlock_irqrestore(&msg_queue->lock, flags);
182 ipi_call_function(cpu, msg); 185 ipi_call_function(cpu, msg);
diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c
index 4c9e3e1ba5d1..66cc75657e2f 100644
--- a/arch/cris/arch-v32/kernel/smp.c
+++ b/arch/cris/arch-v32/kernel/smp.c
@@ -342,15 +342,18 @@ irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id)
342 342
343 ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); 343 ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi);
344 344
345 if (ipi.vector & IPI_SCHEDULE) {
346 scheduler_ipi();
347 }
345 if (ipi.vector & IPI_CALL) { 348 if (ipi.vector & IPI_CALL) {
346 func(info); 349 func(info);
347 } 350 }
348 if (ipi.vector & IPI_FLUSH_TLB) { 351 if (ipi.vector & IPI_FLUSH_TLB) {
349 if (flush_mm == FLUSH_ALL) 352 if (flush_mm == FLUSH_ALL)
350 __flush_tlb_all(); 353 __flush_tlb_all();
351 else if (flush_vma == FLUSH_ALL) 354 else if (flush_vma == FLUSH_ALL)
352 __flush_tlb_mm(flush_mm); 355 __flush_tlb_mm(flush_mm);
353 else 356 else
354 __flush_tlb_page(flush_vma, flush_addr); 357 __flush_tlb_page(flush_vma, flush_addr);
355 } 358 }
356 359
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 5b704740f160..782c3a357f24 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -31,6 +31,7 @@
31#include <linux/irq.h> 31#include <linux/irq.h>
32#include <linux/ratelimit.h> 32#include <linux/ratelimit.h>
33#include <linux/acpi.h> 33#include <linux/acpi.h>
34#include <linux/sched.h>
34 35
35#include <asm/delay.h> 36#include <asm/delay.h>
36#include <asm/intrinsics.h> 37#include <asm/intrinsics.h>
@@ -496,6 +497,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
496 smp_local_flush_tlb(); 497 smp_local_flush_tlb();
497 kstat_incr_irqs_this_cpu(irq, desc); 498 kstat_incr_irqs_this_cpu(irq, desc);
498 } else if (unlikely(IS_RESCHEDULE(vector))) { 499 } else if (unlikely(IS_RESCHEDULE(vector))) {
500 scheduler_ipi();
499 kstat_incr_irqs_this_cpu(irq, desc); 501 kstat_incr_irqs_this_cpu(irq, desc);
500 } else { 502 } else {
501 ia64_setreg(_IA64_REG_CR_TPR, vector); 503 ia64_setreg(_IA64_REG_CR_TPR, vector);
diff --git a/arch/ia64/xen/irq_xen.c b/arch/ia64/xen/irq_xen.c
index 108bb858acf2..b279e142c633 100644
--- a/arch/ia64/xen/irq_xen.c
+++ b/arch/ia64/xen/irq_xen.c
@@ -92,6 +92,8 @@ static unsigned short saved_irq_cnt;
92static int xen_slab_ready; 92static int xen_slab_ready;
93 93
94#ifdef CONFIG_SMP 94#ifdef CONFIG_SMP
95#include <linux/sched.h>
96
95/* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ, 97/* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ,
96 * it ends up to issue several memory accesses upon percpu data and 98 * it ends up to issue several memory accesses upon percpu data and
97 * thus adds unnecessary traffic to other paths. 99 * thus adds unnecessary traffic to other paths.
@@ -99,7 +101,13 @@ static int xen_slab_ready;
99static irqreturn_t 101static irqreturn_t
100xen_dummy_handler(int irq, void *dev_id) 102xen_dummy_handler(int irq, void *dev_id)
101{ 103{
104 return IRQ_HANDLED;
105}
102 106
107static irqreturn_t
108xen_resched_handler(int irq, void *dev_id)
109{
110 scheduler_ipi();
103 return IRQ_HANDLED; 111 return IRQ_HANDLED;
104} 112}
105 113
@@ -110,7 +118,7 @@ static struct irqaction xen_ipi_irqaction = {
110}; 118};
111 119
112static struct irqaction xen_resched_irqaction = { 120static struct irqaction xen_resched_irqaction = {
113 .handler = xen_dummy_handler, 121 .handler = xen_resched_handler,
114 .flags = IRQF_DISABLED, 122 .flags = IRQF_DISABLED,
115 .name = "resched" 123 .name = "resched"
116}; 124};
diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c
index 31cef20b2996..fc10b39893d4 100644
--- a/arch/m32r/kernel/smp.c
+++ b/arch/m32r/kernel/smp.c
@@ -122,8 +122,6 @@ void smp_send_reschedule(int cpu_id)
122 * 122 *
123 * Description: This routine executes on CPU which received 123 * Description: This routine executes on CPU which received
124 * 'RESCHEDULE_IPI'. 124 * 'RESCHEDULE_IPI'.
125 * Rescheduling is processed at the exit of interrupt
126 * operation.
127 * 125 *
128 * Born on Date: 2002.02.05 126 * Born on Date: 2002.02.05
129 * 127 *
@@ -138,7 +136,7 @@ void smp_send_reschedule(int cpu_id)
138 *==========================================================================*/ 136 *==========================================================================*/
139void smp_reschedule_interrupt(void) 137void smp_reschedule_interrupt(void)
140{ 138{
141 /* nothing to do */ 139 scheduler_ipi();
142} 140}
143 141
144/*==========================================================================* 142/*==========================================================================*
diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c
index ba78b21cc8d0..76923eeb58b9 100644
--- a/arch/mips/cavium-octeon/smp.c
+++ b/arch/mips/cavium-octeon/smp.c
@@ -44,6 +44,8 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id)
44 44
45 if (action & SMP_CALL_FUNCTION) 45 if (action & SMP_CALL_FUNCTION)
46 smp_call_function_interrupt(); 46 smp_call_function_interrupt();
47 if (action & SMP_RESCHEDULE_YOURSELF)
48 scheduler_ipi();
47 49
48 /* Check if we've been told to flush the icache */ 50 /* Check if we've been told to flush the icache */
49 if (action & SMP_ICACHE_FLUSH) 51 if (action & SMP_ICACHE_FLUSH)
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index 5a88cc4ccd5a..cedac4633741 100644
--- a/arch/mips/kernel/smtc.c
+++ b/arch/mips/kernel/smtc.c
@@ -929,7 +929,7 @@ static void post_direct_ipi(int cpu, struct smtc_ipi *pipi)
929 929
930static void ipi_resched_interrupt(void) 930static void ipi_resched_interrupt(void)
931{ 931{
932 /* Return from interrupt should be enough to cause scheduler check */ 932 scheduler_ipi();
933} 933}
934 934
935static void ipi_call_interrupt(void) 935static void ipi_call_interrupt(void)
diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c
index 9027061f0ead..7d93e6fbfa5a 100644
--- a/arch/mips/mti-malta/malta-int.c
+++ b/arch/mips/mti-malta/malta-int.c
@@ -309,6 +309,8 @@ static void ipi_call_dispatch(void)
309 309
310static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) 310static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id)
311{ 311{
312 scheduler_ipi();
313
312 return IRQ_HANDLED; 314 return IRQ_HANDLED;
313} 315}
314 316
diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c
index efc9e889b349..2608752898c0 100644
--- a/arch/mips/pmc-sierra/yosemite/smp.c
+++ b/arch/mips/pmc-sierra/yosemite/smp.c
@@ -55,6 +55,8 @@ void titan_mailbox_irq(void)
55 55
56 if (status & 0x2) 56 if (status & 0x2)
57 smp_call_function_interrupt(); 57 smp_call_function_interrupt();
58 if (status & 0x4)
59 scheduler_ipi();
58 break; 60 break;
59 61
60 case 1: 62 case 1:
@@ -63,6 +65,8 @@ void titan_mailbox_irq(void)
63 65
64 if (status & 0x2) 66 if (status & 0x2)
65 smp_call_function_interrupt(); 67 smp_call_function_interrupt();
68 if (status & 0x4)
69 scheduler_ipi();
66 break; 70 break;
67 } 71 }
68} 72}
diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c
index 0a04603d577c..b18b04e48577 100644
--- a/arch/mips/sgi-ip27/ip27-irq.c
+++ b/arch/mips/sgi-ip27/ip27-irq.c
@@ -147,8 +147,10 @@ static void ip27_do_irq_mask0(void)
147#ifdef CONFIG_SMP 147#ifdef CONFIG_SMP
148 if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) { 148 if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) {
149 LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ); 149 LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ);
150 scheduler_ipi();
150 } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) { 151 } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) {
151 LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ); 152 LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ);
153 scheduler_ipi();
152 } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { 154 } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) {
153 LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); 155 LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ);
154 smp_call_function_interrupt(); 156 smp_call_function_interrupt();
diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c
index 47b347c992ea..d667875be564 100644
--- a/arch/mips/sibyte/bcm1480/smp.c
+++ b/arch/mips/sibyte/bcm1480/smp.c
@@ -20,6 +20,7 @@
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/kernel_stat.h> 22#include <linux/kernel_stat.h>
23#include <linux/sched.h>
23 24
24#include <asm/mmu_context.h> 25#include <asm/mmu_context.h>
25#include <asm/io.h> 26#include <asm/io.h>
@@ -189,10 +190,8 @@ void bcm1480_mailbox_interrupt(void)
189 /* Clear the mailbox to clear the interrupt */ 190 /* Clear the mailbox to clear the interrupt */
190 __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]); 191 __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]);
191 192
192 /* 193 if (action & SMP_RESCHEDULE_YOURSELF)
193 * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the 194 scheduler_ipi();
194 * interrupt will do the reschedule for us
195 */
196 195
197 if (action & SMP_CALL_FUNCTION) 196 if (action & SMP_CALL_FUNCTION)
198 smp_call_function_interrupt(); 197 smp_call_function_interrupt();
diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c
index c00a5cb1128d..38e7f6bd7922 100644
--- a/arch/mips/sibyte/sb1250/smp.c
+++ b/arch/mips/sibyte/sb1250/smp.c
@@ -21,6 +21,7 @@
21#include <linux/interrupt.h> 21#include <linux/interrupt.h>
22#include <linux/smp.h> 22#include <linux/smp.h>
23#include <linux/kernel_stat.h> 23#include <linux/kernel_stat.h>
24#include <linux/sched.h>
24 25
25#include <asm/mmu_context.h> 26#include <asm/mmu_context.h>
26#include <asm/io.h> 27#include <asm/io.h>
@@ -177,10 +178,8 @@ void sb1250_mailbox_interrupt(void)
177 /* Clear the mailbox to clear the interrupt */ 178 /* Clear the mailbox to clear the interrupt */
178 ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]); 179 ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]);
179 180
180 /* 181 if (action & SMP_RESCHEDULE_YOURSELF)
181 * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the 182 scheduler_ipi();
182 * interrupt will do the reschedule for us
183 */
184 183
185 if (action & SMP_CALL_FUNCTION) 184 if (action & SMP_CALL_FUNCTION)
186 smp_call_function_interrupt(); 185 smp_call_function_interrupt();
diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c
index 226c826a2194..83fb27912231 100644
--- a/arch/mn10300/kernel/smp.c
+++ b/arch/mn10300/kernel/smp.c
@@ -494,14 +494,11 @@ void smp_send_stop(void)
494 * @irq: The interrupt number. 494 * @irq: The interrupt number.
495 * @dev_id: The device ID. 495 * @dev_id: The device ID.
496 * 496 *
497 * We need do nothing here, since the scheduling will be effected on our way
498 * back through entry.S.
499 *
500 * Returns IRQ_HANDLED to indicate we handled the interrupt successfully. 497 * Returns IRQ_HANDLED to indicate we handled the interrupt successfully.
501 */ 498 */
502static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) 499static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
503{ 500{
504 /* do nothing */ 501 scheduler_ipi();
505 return IRQ_HANDLED; 502 return IRQ_HANDLED;
506} 503}
507 504
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 69d63d354ef0..828305f19cff 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -155,10 +155,7 @@ ipi_interrupt(int irq, void *dev_id)
155 155
156 case IPI_RESCHEDULE: 156 case IPI_RESCHEDULE:
157 smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); 157 smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu);
158 /* 158 scheduler_ipi();
159 * Reschedule callback. Everything to be
160 * done is done by the interrupt return path.
161 */
162 break; 159 break;
163 160
164 case IPI_CALL_FUNC: 161 case IPI_CALL_FUNC:
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index cbdbb14be4b0..9f9c204bef69 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -116,7 +116,7 @@ void smp_message_recv(int msg)
116 generic_smp_call_function_interrupt(); 116 generic_smp_call_function_interrupt();
117 break; 117 break;
118 case PPC_MSG_RESCHEDULE: 118 case PPC_MSG_RESCHEDULE:
119 /* we notice need_resched on exit */ 119 scheduler_ipi();
120 break; 120 break;
121 case PPC_MSG_CALL_FUNC_SINGLE: 121 case PPC_MSG_CALL_FUNC_SINGLE:
122 generic_smp_call_function_single_interrupt(); 122 generic_smp_call_function_single_interrupt();
@@ -146,7 +146,7 @@ static irqreturn_t call_function_action(int irq, void *data)
146 146
147static irqreturn_t reschedule_action(int irq, void *data) 147static irqreturn_t reschedule_action(int irq, void *data)
148{ 148{
149 /* we just need the return path side effect of checking need_resched */ 149 scheduler_ipi();
150 return IRQ_HANDLED; 150 return IRQ_HANDLED;
151} 151}
152 152
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 63a97db83f96..63c7d9ff220d 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -165,12 +165,12 @@ static void do_ext_call_interrupt(unsigned int ext_int_code,
165 kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++; 165 kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++;
166 /* 166 /*
167 * handle bit signal external calls 167 * handle bit signal external calls
168 *
169 * For the ec_schedule signal we have to do nothing. All the work
170 * is done automatically when we return from the interrupt.
171 */ 168 */
172 bits = xchg(&S390_lowcore.ext_call_fast, 0); 169 bits = xchg(&S390_lowcore.ext_call_fast, 0);
173 170
171 if (test_bit(ec_schedule, &bits))
172 scheduler_ipi();
173
174 if (test_bit(ec_call_function, &bits)) 174 if (test_bit(ec_call_function, &bits))
175 generic_smp_call_function_interrupt(); 175 generic_smp_call_function_interrupt();
176 176
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 509b36b45115..6207561ea34a 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -20,6 +20,7 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/cpu.h> 21#include <linux/cpu.h>
22#include <linux/interrupt.h> 22#include <linux/interrupt.h>
23#include <linux/sched.h>
23#include <asm/atomic.h> 24#include <asm/atomic.h>
24#include <asm/processor.h> 25#include <asm/processor.h>
25#include <asm/system.h> 26#include <asm/system.h>
@@ -323,6 +324,7 @@ void smp_message_recv(unsigned int msg)
323 generic_smp_call_function_interrupt(); 324 generic_smp_call_function_interrupt();
324 break; 325 break;
325 case SMP_MSG_RESCHEDULE: 326 case SMP_MSG_RESCHEDULE:
327 scheduler_ipi();
326 break; 328 break;
327 case SMP_MSG_FUNCTION_SINGLE: 329 case SMP_MSG_FUNCTION_SINGLE:
328 generic_smp_call_function_single_interrupt(); 330 generic_smp_call_function_single_interrupt();
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 41102c5a6702..d5b3958be0b4 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -156,11 +156,11 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
156 156
157void smp_resched_interrupt(void) 157void smp_resched_interrupt(void)
158{ 158{
159 irq_enter();
160 scheduler_ipi();
159 local_cpu_data().irq_resched_count++; 161 local_cpu_data().irq_resched_count++;
160 /* 162 irq_exit();
161 * do nothing, since it all was about calling re-schedule 163 /* re-schedule routine called by interrupt return code. */
162 * routine called by interrupt return code.
163 */
164} 164}
165 165
166void smp_call_function_single_interrupt(void) 166void smp_call_function_single_interrupt(void)
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index c274a30c3cbf..99cb17251bb5 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1368,6 +1368,7 @@ void smp_send_reschedule(int cpu)
1368void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) 1368void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs)
1369{ 1369{
1370 clear_softint(1 << irq); 1370 clear_softint(1 << irq);
1371 scheduler_ipi();
1371} 1372}
1372 1373
1373/* This is a nop because we capture all other cpus 1374/* This is a nop because we capture all other cpus
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index a4293102ef81..c52224d5ed45 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -189,12 +189,8 @@ void flush_icache_range(unsigned long start, unsigned long end)
189/* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */ 189/* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */
190static irqreturn_t handle_reschedule_ipi(int irq, void *token) 190static irqreturn_t handle_reschedule_ipi(int irq, void *token)
191{ 191{
192 /*
193 * Nothing to do here; when we return from interrupt, the
194 * rescheduling will occur there. But do bump the interrupt
195 * profiler count in the meantime.
196 */
197 __get_cpu_var(irq_stat).irq_resched_count++; 192 __get_cpu_var(irq_stat).irq_resched_count++;
193 scheduler_ipi();
198 194
199 return IRQ_HANDLED; 195 return IRQ_HANDLED;
200} 196}
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index 106bf27e2a9a..eefb107d2d73 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -173,7 +173,7 @@ void IPI_handler(int cpu)
173 break; 173 break;
174 174
175 case 'R': 175 case 'R':
176 set_tsk_need_resched(current); 176 scheduler_ipi();
177 break; 177 break;
178 178
179 case 'S': 179 case 'S':
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 513deac7228d..013e7eba83bb 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait)
194} 194}
195 195
196/* 196/*
197 * Reschedule call back. Nothing to do, 197 * Reschedule call back.
198 * all the work is done automatically when
199 * we return from the interrupt.
200 */ 198 */
201void smp_reschedule_interrupt(struct pt_regs *regs) 199void smp_reschedule_interrupt(struct pt_regs *regs)
202{ 200{
203 ack_APIC_irq(); 201 ack_APIC_irq();
204 inc_irq_stat(irq_resched_count); 202 inc_irq_stat(irq_resched_count);
203 scheduler_ipi();
205 /* 204 /*
206 * KVM uses this interrupt to force a cpu out of guest mode 205 * KVM uses this interrupt to force a cpu out of guest mode
207 */ 206 */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 30612441ed99..762b46ab14d5 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -46,13 +46,12 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
46static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 46static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
47 47
48/* 48/*
49 * Reschedule call back. Nothing to do, 49 * Reschedule call back.
50 * all the work is done automatically when
51 * we return from the interrupt.
52 */ 50 */
53static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
54{ 52{
55 inc_irq_stat(irq_resched_count); 53 inc_irq_stat(irq_resched_count);
54 scheduler_ipi();
56 55
57 return IRQ_HANDLED; 56 return IRQ_HANDLED;
58} 57}
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 94b48bd40dd7..c75471db576e 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -51,7 +51,7 @@ struct mutex {
51 spinlock_t wait_lock; 51 spinlock_t wait_lock;
52 struct list_head wait_list; 52 struct list_head wait_list;
53#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) 53#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
54 struct thread_info *owner; 54 struct task_struct *owner;
55#endif 55#endif
56#ifdef CONFIG_DEBUG_MUTEXES 56#ifdef CONFIG_DEBUG_MUTEXES
57 const char *name; 57 const char *name;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 18d63cea2848..94107a2c2840 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -360,7 +360,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout);
360extern signed long schedule_timeout_killable(signed long timeout); 360extern signed long schedule_timeout_killable(signed long timeout);
361extern signed long schedule_timeout_uninterruptible(signed long timeout); 361extern signed long schedule_timeout_uninterruptible(signed long timeout);
362asmlinkage void schedule(void); 362asmlinkage void schedule(void);
363extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); 363extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
364 364
365struct nsproxy; 365struct nsproxy;
366struct user_namespace; 366struct user_namespace;
@@ -1048,8 +1048,12 @@ struct sched_domain;
1048#define WF_FORK 0x02 /* child wakeup after fork */ 1048#define WF_FORK 0x02 /* child wakeup after fork */
1049 1049
1050#define ENQUEUE_WAKEUP 1 1050#define ENQUEUE_WAKEUP 1
1051#define ENQUEUE_WAKING 2 1051#define ENQUEUE_HEAD 2
1052#define ENQUEUE_HEAD 4 1052#ifdef CONFIG_SMP
1053#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
1054#else
1055#define ENQUEUE_WAKING 0
1056#endif
1053 1057
1054#define DEQUEUE_SLEEP 1 1058#define DEQUEUE_SLEEP 1
1055 1059
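
ENQUEUE_WAKING now has an SMP-only meaning and is defined to 0 on uniprocessor builds, so common code can set or test the flag unconditionally; on !SMP the test constant-folds to false and the compiler drops the branch. A small illustration of that idiom (the SMP macro and enqueue() below are invented for the sketch):

#include <stdio.h>

#define SMP 1                   /* flip to 0 to see the flag compile away */

#define ENQUEUE_WAKEUP  1
#define ENQUEUE_HEAD    2
#if SMP
#define ENQUEUE_WAKING  4       /* only meaningful after task_waking() ran */
#else
#define ENQUEUE_WAKING  0       /* (flags & ENQUEUE_WAKING) folds to 0 */
#endif

static void enqueue(int flags)
{
        if (flags & ENQUEUE_WAKING)     /* dead code when the flag is 0 */
                printf("adjust vruntime for the new runqueue\n");
        printf("enqueue, flags=%#x\n", flags);
}

int main(void)
{
        enqueue(ENQUEUE_WAKEUP | ENQUEUE_WAKING);
        return 0;
}
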
@@ -1067,12 +1071,11 @@ struct sched_class {
1067 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1071 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1068 1072
1069#ifdef CONFIG_SMP 1073#ifdef CONFIG_SMP
1070 int (*select_task_rq)(struct rq *rq, struct task_struct *p, 1074 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
1071 int sd_flag, int flags);
1072 1075
1073 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1076 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1074 void (*post_schedule) (struct rq *this_rq); 1077 void (*post_schedule) (struct rq *this_rq);
1075 void (*task_waking) (struct rq *this_rq, struct task_struct *task); 1078 void (*task_waking) (struct task_struct *task);
1076 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1079 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
1077 1080
1078 void (*set_cpus_allowed)(struct task_struct *p, 1081 void (*set_cpus_allowed)(struct task_struct *p,
@@ -1200,10 +1203,10 @@ struct task_struct {
1200 int lock_depth; /* BKL lock depth */ 1203 int lock_depth; /* BKL lock depth */
1201 1204
1202#ifdef CONFIG_SMP 1205#ifdef CONFIG_SMP
1203#ifdef __ARCH_WANT_UNLOCKED_CTXSW 1206 struct task_struct *wake_entry;
1204 int oncpu; 1207 int on_cpu;
1205#endif
1206#endif 1208#endif
1209 int on_rq;
1207 1210
1208 int prio, static_prio, normal_prio; 1211 int prio, static_prio, normal_prio;
1209 unsigned int rt_priority; 1212 unsigned int rt_priority;
@@ -1274,6 +1277,7 @@ struct task_struct {
1274 1277
1275 /* Revert to default priority/policy when forking */ 1278 /* Revert to default priority/policy when forking */
1276 unsigned sched_reset_on_fork:1; 1279 unsigned sched_reset_on_fork:1;
1280 unsigned sched_contributes_to_load:1;
1277 1281
1278 pid_t pid; 1282 pid_t pid;
1279 pid_t tgid; 1283 pid_t tgid;
@@ -2192,8 +2196,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
2192extern char *get_task_comm(char *to, struct task_struct *tsk); 2196extern char *get_task_comm(char *to, struct task_struct *tsk);
2193 2197
2194#ifdef CONFIG_SMP 2198#ifdef CONFIG_SMP
2199void scheduler_ipi(void);
2195extern unsigned long wait_task_inactive(struct task_struct *, long match_state); 2200extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
2196#else 2201#else
2202static inline void scheduler_ipi(void) { }
2197static inline unsigned long wait_task_inactive(struct task_struct *p, 2203static inline unsigned long wait_task_inactive(struct task_struct *p,
2198 long match_state) 2204 long match_state)
2199{ 2205{
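
This sched.h hunk pairs the SMP declaration of scheduler_ipi() with a static inline no-op for !SMP, which is what lets the architecture handlers above call it without wrapping the call in #ifdef CONFIG_SMP. The idiom, reduced to one self-contained file (CONFIG_SMP is modeled as a plain macro here):

#include <stdio.h>

#define CONFIG_SMP 1            /* build-time switch, modeled as a macro */

#if CONFIG_SMP
void scheduler_ipi(void);       /* real implementation lives in the scheduler */
#else
static inline void scheduler_ipi(void) { }      /* compiles to nothing on UP */
#endif

#if CONFIG_SMP
void scheduler_ipi(void)        /* stand-in body so this sketch links */
{
        printf("drain the remote wakeup queue\n");
}
#endif

int main(void)
{
        scheduler_ipi();        /* callers look the same in both configurations */
        return 0;
}

The empty inline costs nothing on UP kernels and keeps every call site identical across configurations.
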
diff --git a/init/Kconfig b/init/Kconfig
index 56240e724d9a..32745bfe059e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -827,6 +827,11 @@ config SCHED_AUTOGROUP
827 desktop applications. Task group autogeneration is currently based 827 desktop applications. Task group autogeneration is currently based
828 upon task session. 828 upon task session.
829 829
830config SCHED_TTWU_QUEUE
831 bool
832 depends on !SPARC32
833 default y
834
830config MM_OWNER 835config MM_OWNER
831 bool 836 bool
832 837
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index ec815a960b5d..73da83aff418 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 78 DEBUG_LOCKS_WARN_ON(lock->owner != current);
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 80 mutex_clear_owner(lock);
81} 81}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 57d527a16f9d..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
29 29
30static inline void mutex_set_owner(struct mutex *lock) 30static inline void mutex_set_owner(struct mutex *lock)
31{ 31{
32 lock->owner = current_thread_info(); 32 lock->owner = current;
33} 33}
34 34
35static inline void mutex_clear_owner(struct mutex *lock) 35static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index c4195fa98900..fe4706cb0c5b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -160,7 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
160 */ 160 */
161 161
162 for (;;) { 162 for (;;) {
163 struct thread_info *owner; 163 struct task_struct *owner;
164 164
165 /* 165 /*
166 * If we own the BKL, then don't spin. The owner of 166 * If we own the BKL, then don't spin. The owner of
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 67578ca48f94..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -19,7 +19,7 @@
19#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current_thread_info(); 22 lock->owner = current;
23} 23}
24 24
25static inline void mutex_clear_owner(struct mutex *lock) 25static inline void mutex_clear_owner(struct mutex *lock)
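
All four mutex hunks are the same type change: lock->owner now points at the owning task_struct rather than its thread_info. The point of the switch becomes clear in kernel/sched.c below, where the per-task on_cpu field is introduced: the optimistic-spin path can then keep spinning only while the owner task is actually running on a CPU. A user-space model of that policy, written with C11 atomics instead of the kernel's primitives (struct task, struct mutex and spin_on_owner() here are stand-ins, not the kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct task {                           /* stand-in for struct task_struct */
        atomic_int on_cpu;              /* 1 while the task runs on some CPU */
};

struct mutex {                          /* stand-in for struct mutex */
        struct task *_Atomic owner;     /* NULL when unlocked */
};

/*
 * Keep spinning only while the lock is still held by @owner and @owner
 * is running; once the owner is preempted, give up and sleep instead.
 */
static bool spin_on_owner(struct mutex *lock, struct task *owner)
{
        while (atomic_load(&lock->owner) == owner) {
                if (!atomic_load(&owner->on_cpu))
                        return false;   /* owner went off-CPU: stop spinning */
                /* the kernel would cpu_relax() here */
        }
        return true;                    /* lock was released while we spun */
}

int main(void)
{
        struct task t = { .on_cpu = 1 };
        struct mutex m = { .owner = &t };

        atomic_store(&t.on_cpu, 0);     /* the owner blocks... */
        printf("worth spinning: %d\n", spin_on_owner(&m, &t));
        return 0;
}

The real mutex_spin_on_owner() also gives up when the spinner itself needs to reschedule; that detail is omitted from the sketch.
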
diff --git a/kernel/sched.c b/kernel/sched.c
index 312f8b95c2d4..8c9d804dc07d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -312,6 +312,9 @@ struct cfs_rq {
312 312
313 u64 exec_clock; 313 u64 exec_clock;
314 u64 min_vruntime; 314 u64 min_vruntime;
315#ifndef CONFIG_64BIT
316 u64 min_vruntime_copy;
317#endif
315 318
316 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
317 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
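
min_vruntime_copy exists because a u64 load is not atomic on 32-bit targets: a reader racing an update could otherwise see a half-updated value. The intended protocol is that the writer stores min_vruntime, issues smp_wmb(), then stores the copy, while a reader loads the copy, issues smp_rmb(), loads the value and retries until the two agree. A compact user-space sketch of that protocol using C11 fences (didactic only; a real concurrent C program would also need atomic accesses to the two fields):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Two plain (possibly-torn) 64-bit slots, as on a 32-bit machine. */
static uint64_t min_vruntime;
static uint64_t min_vruntime_copy;

static void writer(uint64_t v)
{
        min_vruntime = v;
        atomic_thread_fence(memory_order_release);      /* smp_wmb() */
        min_vruntime_copy = v;
}

static uint64_t reader(void)
{
        uint64_t copy, val;

        do {
                copy = min_vruntime_copy;
                atomic_thread_fence(memory_order_acquire);      /* smp_rmb() */
                val = min_vruntime;
        } while (val != copy);          /* torn read detected: try again */

        return val;
}

int main(void)
{
        writer(0x123456789abcdef0ULL);
        printf("%llx\n", (unsigned long long)reader());
        return 0;
}
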
@@ -553,6 +556,10 @@ struct rq {
553 unsigned int ttwu_count; 556 unsigned int ttwu_count;
554 unsigned int ttwu_local; 557 unsigned int ttwu_local;
555#endif 558#endif
559
560#ifdef CONFIG_SMP
561 struct task_struct *wake_list;
562#endif
556}; 563};
557 564
558static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 565static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -596,7 +603,7 @@ static inline int cpu_of(struct rq *rq)
596 * Return the group to which this tasks belongs. 603 * Return the group to which this tasks belongs.
597 * 604 *
598 * We use task_subsys_state_check() and extend the RCU verification 605 * We use task_subsys_state_check() and extend the RCU verification
599 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 606 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
600 * holds that lock for each task it moves into the cgroup. Therefore 607 * holds that lock for each task it moves into the cgroup. Therefore
601 * by holding that lock, we pin the task to the current cgroup. 608 * by holding that lock, we pin the task to the current cgroup.
602 */ 609 */
@@ -606,7 +613,7 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct cgroup_subsys_state *css; 613 struct cgroup_subsys_state *css;
607 614
608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 615 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
609 lockdep_is_held(&task_rq(p)->lock)); 616 lockdep_is_held(&p->pi_lock));
610 tg = container_of(css, struct task_group, css); 617 tg = container_of(css, struct task_group, css);
611 618
612 return autogroup_task_group(p, tg); 619 return autogroup_task_group(p, tg);
@@ -838,18 +845,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
838 return rq->curr == p; 845 return rq->curr == p;
839} 846}
840 847
841#ifndef __ARCH_WANT_UNLOCKED_CTXSW
842static inline int task_running(struct rq *rq, struct task_struct *p) 848static inline int task_running(struct rq *rq, struct task_struct *p)
843{ 849{
850#ifdef CONFIG_SMP
851 return p->on_cpu;
852#else
844 return task_current(rq, p); 853 return task_current(rq, p);
854#endif
845} 855}
846 856
857#ifndef __ARCH_WANT_UNLOCKED_CTXSW
847static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 858static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
848{ 859{
860#ifdef CONFIG_SMP
861 /*
862 * We can optimise this out completely for !SMP, because the
863 * SMP rebalancing from interrupt is the only thing that cares
864 * here.
865 */
866 next->on_cpu = 1;
867#endif
849} 868}
850 869
851static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 870static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
852{ 871{
872#ifdef CONFIG_SMP
873 /*
874 * After ->on_cpu is cleared, the task can be moved to a different CPU.
875 * We must ensure this doesn't happen until the switch is completely
876 * finished.
877 */
878 smp_wmb();
879 prev->on_cpu = 0;
880#endif
853#ifdef CONFIG_DEBUG_SPINLOCK 881#ifdef CONFIG_DEBUG_SPINLOCK
854 /* this is a valid case when another task releases the spinlock */ 882 /* this is a valid case when another task releases the spinlock */
855 rq->lock.owner = current; 883 rq->lock.owner = current;
@@ -865,15 +893,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
865} 893}
866 894
867#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 895#else /* __ARCH_WANT_UNLOCKED_CTXSW */
868static inline int task_running(struct rq *rq, struct task_struct *p)
869{
870#ifdef CONFIG_SMP
871 return p->oncpu;
872#else
873 return task_current(rq, p);
874#endif
875}
876
877static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 896static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
878{ 897{
879#ifdef CONFIG_SMP 898#ifdef CONFIG_SMP
@@ -882,7 +901,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
882 * SMP rebalancing from interrupt is the only thing that cares 901 * SMP rebalancing from interrupt is the only thing that cares
883 * here. 902 * here.
884 */ 903 */
885 next->oncpu = 1; 904 next->on_cpu = 1;
886#endif 905#endif
887#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 906#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
888 raw_spin_unlock_irq(&rq->lock); 907 raw_spin_unlock_irq(&rq->lock);
@@ -895,12 +914,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
895{ 914{
896#ifdef CONFIG_SMP 915#ifdef CONFIG_SMP
897 /* 916 /*
898 * After ->oncpu is cleared, the task can be moved to a different CPU. 917 * After ->on_cpu is cleared, the task can be moved to a different CPU.
899 * We must ensure this doesn't happen until the switch is completely 918 * We must ensure this doesn't happen until the switch is completely
900 * finished. 919 * finished.
901 */ 920 */
902 smp_wmb(); 921 smp_wmb();
903 prev->oncpu = 0; 922 prev->on_cpu = 0;
904#endif 923#endif
905#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 924#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
906 local_irq_enable(); 925 local_irq_enable();
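
With task_running() now reading p->on_cpu directly, the ordering in finish_lock_switch() carries real weight: everything a later user of the task might look at is written before the smp_wmb()/on_cpu = 0 pair, and a waker polling on_cpu (try_to_wake_up() further down does exactly this, followed by smp_rmb()) is guaranteed to see that state once it observes the flag clear. The handshake expressed as a small C11 program, with the barrier pair written as a release store and an acquire loop (variable names are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static long saved_state;                /* stands in for the switched-out task's state */
static atomic_int on_cpu = 1;           /* stands in for prev->on_cpu */

static void *prev_cpu(void *arg)        /* the CPU finishing the context switch */
{
        (void)arg;
        saved_state = 42;               /* ...all per-task state is written first... */
        /* smp_wmb(); prev->on_cpu = 0;  ==> a release store */
        atomic_store_explicit(&on_cpu, 0, memory_order_release);
        return NULL;
}

static void *waker(void *arg)           /* a remote CPU trying to wake the task */
{
        (void)arg;
        /* while (p->on_cpu) cpu_relax(); then smp_rmb();  ==> acquire loop */
        while (atomic_load_explicit(&on_cpu, memory_order_acquire))
                ;                       /* spin until the task is fully off the CPU */
        printf("saved_state=%ld\n", saved_state);       /* guaranteed to see 42 */
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, prev_cpu, NULL);
        pthread_create(&b, NULL, waker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}
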
@@ -909,23 +928,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
909#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 928#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
910 929
911/* 930/*
912 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 931 * __task_rq_lock - lock the rq @p resides on.
913 * against ttwu().
914 */
915static inline int task_is_waking(struct task_struct *p)
916{
917 return unlikely(p->state == TASK_WAKING);
918}
919
920/*
921 * __task_rq_lock - lock the runqueue a given task resides on.
922 * Must be called interrupts disabled.
923 */ 932 */
924static inline struct rq *__task_rq_lock(struct task_struct *p) 933static inline struct rq *__task_rq_lock(struct task_struct *p)
925 __acquires(rq->lock) 934 __acquires(rq->lock)
926{ 935{
927 struct rq *rq; 936 struct rq *rq;
928 937
938 lockdep_assert_held(&p->pi_lock);
939
929 for (;;) { 940 for (;;) {
930 rq = task_rq(p); 941 rq = task_rq(p);
931 raw_spin_lock(&rq->lock); 942 raw_spin_lock(&rq->lock);
@@ -936,22 +947,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936} 947}
937 948
938/* 949/*
939 * task_rq_lock - lock the runqueue a given task resides on and disable 950 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
940 * interrupts. Note the ordering: we can safely lookup the task_rq without
941 * explicitly disabling preemption.
942 */ 951 */
943static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 952static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
953 __acquires(p->pi_lock)
944 __acquires(rq->lock) 954 __acquires(rq->lock)
945{ 955{
946 struct rq *rq; 956 struct rq *rq;
947 957
948 for (;;) { 958 for (;;) {
949 local_irq_save(*flags); 959 raw_spin_lock_irqsave(&p->pi_lock, *flags);
950 rq = task_rq(p); 960 rq = task_rq(p);
951 raw_spin_lock(&rq->lock); 961 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 962 if (likely(rq == task_rq(p)))
953 return rq; 963 return rq;
954 raw_spin_unlock_irqrestore(&rq->lock, *flags); 964 raw_spin_unlock(&rq->lock);
965 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
955 } 966 }
956} 967}
957 968
@@ -961,10 +972,13 @@ static void __task_rq_unlock(struct rq *rq)
961 raw_spin_unlock(&rq->lock); 972 raw_spin_unlock(&rq->lock);
962} 973}
963 974
964static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 975static inline void
976task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
965 __releases(rq->lock) 977 __releases(rq->lock)
978 __releases(p->pi_lock)
966{ 979{
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 980 raw_spin_unlock(&rq->lock);
981 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
968} 982}
969 983
970/* 984/*
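
task_rq_lock() now nests two locks in one fixed order: p->pi_lock first, which pins the task's placement and cpus_allowed against concurrent wakeups, then the runqueue lock; task_rq_unlock() releases them in reverse. Keeping that single order everywhere is what makes the reworked wakeup path below deadlock-free. A bare-bones pthread sketch of the discipline (it omits the retry loop the real function needs when the task migrates between the two acquisitions):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;     /* per-task in the kernel */
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;     /* per-runqueue in the kernel */

/* task_rq_lock(): always pi_lock -> rq_lock */
static void task_rq_lock_model(void)
{
        pthread_mutex_lock(&pi_lock);
        pthread_mutex_lock(&rq_lock);
}

/* task_rq_unlock(): release in the reverse order */
static void task_rq_unlock_model(void)
{
        pthread_mutex_unlock(&rq_lock);
        pthread_mutex_unlock(&pi_lock);
}

int main(void)
{
        task_rq_lock_model();
        printf("pi_lock and rq_lock both held, in that order\n");
        task_rq_unlock_model();
        return 0;
}
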
@@ -1773,7 +1787,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1773 update_rq_clock(rq); 1787 update_rq_clock(rq);
1774 sched_info_queued(p); 1788 sched_info_queued(p);
1775 p->sched_class->enqueue_task(rq, p, flags); 1789 p->sched_class->enqueue_task(rq, p, flags);
1776 p->se.on_rq = 1;
1777} 1790}
1778 1791
1779static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1792static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1781,7 +1794,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1781 update_rq_clock(rq); 1794 update_rq_clock(rq);
1782 sched_info_dequeued(p); 1795 sched_info_dequeued(p);
1783 p->sched_class->dequeue_task(rq, p, flags); 1796 p->sched_class->dequeue_task(rq, p, flags);
1784 p->se.on_rq = 0;
1785} 1797}
1786 1798
1787/* 1799/*
@@ -2116,7 +2128,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2116 * A queue event has occurred, and we're going to schedule. In 2128 * A queue event has occurred, and we're going to schedule. In
2117 * this case, we can save a useless back to back clock update. 2129 * this case, we can save a useless back to back clock update.
2118 */ 2130 */
2119 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) 2131 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2120 rq->skip_clock_update = 1; 2132 rq->skip_clock_update = 1;
2121} 2133}
2122 2134
@@ -2162,6 +2174,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2162 */ 2174 */
2163 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2175 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2164 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2176 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2177
2178#ifdef CONFIG_LOCKDEP
2179 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2180 lockdep_is_held(&task_rq(p)->lock)));
2181#endif
2165#endif 2182#endif
2166 2183
2167 trace_sched_migrate_task(p, new_cpu); 2184 trace_sched_migrate_task(p, new_cpu);
@@ -2185,13 +2202,15 @@ static int migration_cpu_stop(void *data);
2185 * The task's runqueue lock must be held. 2202 * The task's runqueue lock must be held.
2186 * Returns true if you have to wait for migration thread. 2203 * Returns true if you have to wait for migration thread.
2187 */ 2204 */
2188static bool migrate_task(struct task_struct *p, struct rq *rq) 2205static bool need_migrate_task(struct task_struct *p)
2189{ 2206{
2190 /* 2207 /*
2191 * If the task is not on a runqueue (and not running), then 2208 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task. 2209 * the next wake-up will properly place the task.
2193 */ 2210 */
2194 return p->se.on_rq || task_running(rq, p); 2211 bool running = p->on_rq || p->on_cpu;
2212 smp_rmb(); /* finish_lock_switch() */
2213 return running;
2195} 2214}
2196 2215
2197/* 2216/*
@@ -2251,11 +2270,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2251 rq = task_rq_lock(p, &flags); 2270 rq = task_rq_lock(p, &flags);
2252 trace_sched_wait_task(p); 2271 trace_sched_wait_task(p);
2253 running = task_running(rq, p); 2272 running = task_running(rq, p);
2254 on_rq = p->se.on_rq; 2273 on_rq = p->on_rq;
2255 ncsw = 0; 2274 ncsw = 0;
2256 if (!match_state || p->state == match_state) 2275 if (!match_state || p->state == match_state)
2257 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2276 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2258 task_rq_unlock(rq, &flags); 2277 task_rq_unlock(rq, p, &flags);
2259 2278
2260 /* 2279 /*
2261 * If it changed from the expected state, bail out now. 2280 * If it changed from the expected state, bail out now.
@@ -2330,7 +2349,7 @@ EXPORT_SYMBOL_GPL(kick_process);
2330 2349
2331#ifdef CONFIG_SMP 2350#ifdef CONFIG_SMP
2332/* 2351/*
2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2352 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2334 */ 2353 */
2335static int select_fallback_rq(int cpu, struct task_struct *p) 2354static int select_fallback_rq(int cpu, struct task_struct *p)
2336{ 2355{
@@ -2363,12 +2382,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2363} 2382}
2364 2383
2365/* 2384/*
2366 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2385 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2367 */ 2386 */
2368static inline 2387static inline
2369int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2388int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2370{ 2389{
2371 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2390 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2372 2391
2373 /* 2392 /*
2374 * In order not to call set_task_cpu() on a blocking task we need 2393 * In order not to call set_task_cpu() on a blocking task we need
@@ -2394,27 +2413,60 @@ static void update_avg(u64 *avg, u64 sample)
2394} 2413}
2395#endif 2414#endif
2396 2415
2397static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2416static void
2398 bool is_sync, bool is_migrate, bool is_local, 2417ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2399 unsigned long en_flags)
2400{ 2418{
2419#ifdef CONFIG_SCHEDSTATS
2420 struct rq *rq = this_rq();
2421
2422#ifdef CONFIG_SMP
2423 int this_cpu = smp_processor_id();
2424
2425 if (cpu == this_cpu) {
2426 schedstat_inc(rq, ttwu_local);
2427 schedstat_inc(p, se.statistics.nr_wakeups_local);
2428 } else {
2429 struct sched_domain *sd;
2430
2431 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2432 for_each_domain(this_cpu, sd) {
2433 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2434 schedstat_inc(sd, ttwu_wake_remote);
2435 break;
2436 }
2437 }
2438 }
2439#endif /* CONFIG_SMP */
2440
2441 schedstat_inc(rq, ttwu_count);
2401 schedstat_inc(p, se.statistics.nr_wakeups); 2442 schedstat_inc(p, se.statistics.nr_wakeups);
2402 if (is_sync) 2443
2444 if (wake_flags & WF_SYNC)
2403 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2445 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2404 if (is_migrate) 2446
2447 if (cpu != task_cpu(p))
2405 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2448 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2406 if (is_local)
2407 schedstat_inc(p, se.statistics.nr_wakeups_local);
2408 else
2409 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2410 2449
2450#endif /* CONFIG_SCHEDSTATS */
2451}
2452
2453static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2454{
2411 activate_task(rq, p, en_flags); 2455 activate_task(rq, p, en_flags);
2456 p->on_rq = 1;
2457
2458 /* if a worker is waking up, notify workqueue */
2459 if (p->flags & PF_WQ_WORKER)
2460 wq_worker_waking_up(p, cpu_of(rq));
2412} 2461}
2413 2462
2414static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2463/*
2415 int wake_flags, bool success) 2464 * Mark the task runnable and perform wakeup-preemption.
2465 */
2466static void
2467ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2416{ 2468{
2417 trace_sched_wakeup(p, success); 2469 trace_sched_wakeup(p, true);
2418 check_preempt_curr(rq, p, wake_flags); 2470 check_preempt_curr(rq, p, wake_flags);
2419 2471
2420 p->state = TASK_RUNNING; 2472 p->state = TASK_RUNNING;
@@ -2433,9 +2485,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2433 rq->idle_stamp = 0; 2485 rq->idle_stamp = 0;
2434 } 2486 }
2435#endif 2487#endif
2436 /* if a worker is waking up, notify workqueue */ 2488}
2437 if ((p->flags & PF_WQ_WORKER) && success) 2489
2438 wq_worker_waking_up(p, cpu_of(rq)); 2490static void
2491ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2492{
2493#ifdef CONFIG_SMP
2494 if (p->sched_contributes_to_load)
2495 rq->nr_uninterruptible--;
2496#endif
2497
2498 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2499 ttwu_do_wakeup(rq, p, wake_flags);
2500}
2501
2502/*
2503 * Called in case the task @p isn't fully descheduled from its runqueue,
2504 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2505 * since all we need to do is flip p->state to TASK_RUNNING, since
2506 * the task is still ->on_rq.
2507 */
2508static int ttwu_remote(struct task_struct *p, int wake_flags)
2509{
2510 struct rq *rq;
2511 int ret = 0;
2512
2513 rq = __task_rq_lock(p);
2514 if (p->on_rq) {
2515 ttwu_do_wakeup(rq, p, wake_flags);
2516 ret = 1;
2517 }
2518 __task_rq_unlock(rq);
2519
2520 return ret;
2521}
2522
2523#ifdef CONFIG_SMP
2524static void sched_ttwu_pending(void)
2525{
2526 struct rq *rq = this_rq();
2527 struct task_struct *list = xchg(&rq->wake_list, NULL);
2528
2529 if (!list)
2530 return;
2531
2532 raw_spin_lock(&rq->lock);
2533
2534 while (list) {
2535 struct task_struct *p = list;
2536 list = list->wake_entry;
2537 ttwu_do_activate(rq, p, 0);
2538 }
2539
2540 raw_spin_unlock(&rq->lock);
2541}
2542
2543void scheduler_ipi(void)
2544{
2545 sched_ttwu_pending();
2546}
2547
2548static void ttwu_queue_remote(struct task_struct *p, int cpu)
2549{
2550 struct rq *rq = cpu_rq(cpu);
2551 struct task_struct *next = rq->wake_list;
2552
2553 for (;;) {
2554 struct task_struct *old = next;
2555
2556 p->wake_entry = next;
2557 next = cmpxchg(&rq->wake_list, old, p);
2558 if (next == old)
2559 break;
2560 }
2561
2562 if (!next)
2563 smp_send_reschedule(cpu);
2564}
2565#endif
2566
2567static void ttwu_queue(struct task_struct *p, int cpu)
2568{
2569 struct rq *rq = cpu_rq(cpu);
2570
2571#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE)
2572 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2573 ttwu_queue_remote(p, cpu);
2574 return;
2575 }
2576#endif
2577
2578 raw_spin_lock(&rq->lock);
2579 ttwu_do_activate(rq, p, 0);
2580 raw_spin_unlock(&rq->lock);
2439} 2581}
2440 2582
2441/** 2583/**
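
This hunk is the core of the change: ttwu_queue_remote() pushes the task onto the target runqueue's wake_list with a lock-free cmpxchg loop and sends the reschedule IPI only when the list was previously empty, and scheduler_ipi() drains the whole list with a single xchg() before activating each task under the local rq->lock. The push and take-all operations map directly onto C11 atomics; the sketch below models just that pair (struct task, queue_remote() and drain() are stand-ins for the kernel functions):

#include <stdatomic.h>
#include <stdio.h>

struct task {
        const char *name;
        struct task *wake_entry;        /* singly linked, like p->wake_entry */
};

static struct task *_Atomic wake_list = NULL;   /* like rq->wake_list */

/* ttwu_queue_remote(): lock-free push; returns 1 if an IPI should be sent. */
static int queue_remote(struct task *p)
{
        struct task *next = atomic_load(&wake_list);

        do {
                p->wake_entry = next;
        } while (!atomic_compare_exchange_weak(&wake_list, &next, p));

        return next == NULL;            /* only the first enqueuer kicks the CPU */
}

/* sched_ttwu_pending(), as called from scheduler_ipi(): take and drain the list. */
static void drain(void)
{
        struct task *list = atomic_exchange(&wake_list, (struct task *)NULL);

        while (list) {
                struct task *p = list;

                list = list->wake_entry;
                printf("activate %s on the local runqueue\n", p->name);
        }
}

int main(void)
{
        struct task a = { .name = "a" }, b = { .name = "b" };

        if (queue_remote(&a))
                printf("send reschedule IPI\n");
        queue_remote(&b);               /* list non-empty: no second IPI needed */
        drain();
        return 0;
}

Because only the transition from an empty list triggers an IPI, a burst of wakeups aimed at one CPU costs a single interrupt, and the waker never touches the remote rq->lock.
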
@@ -2453,92 +2595,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2453 * Returns %true if @p was woken up, %false if it was already running 2595 * Returns %true if @p was woken up, %false if it was already running
2454 * or @state didn't match @p's state. 2596 * or @state didn't match @p's state.
2455 */ 2597 */
2456static int try_to_wake_up(struct task_struct *p, unsigned int state, 2598static int
2457 int wake_flags) 2599try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2458{ 2600{
2459 int cpu, orig_cpu, this_cpu, success = 0;
2460 unsigned long flags; 2601 unsigned long flags;
2461 unsigned long en_flags = ENQUEUE_WAKEUP; 2602 int cpu, success = 0;
2462 struct rq *rq;
2463
2464 this_cpu = get_cpu();
2465 2603
2466 smp_wmb(); 2604 smp_wmb();
2467 rq = task_rq_lock(p, &flags); 2605 raw_spin_lock_irqsave(&p->pi_lock, flags);
2468 if (!(p->state & state)) 2606 if (!(p->state & state))
2469 goto out; 2607 goto out;
2470 2608
2471 if (p->se.on_rq) 2609 success = 1; /* we're going to change ->state */
2472 goto out_running;
2473
2474 cpu = task_cpu(p); 2610 cpu = task_cpu(p);
2475 orig_cpu = cpu;
2476 2611
2477#ifdef CONFIG_SMP 2612 if (p->on_rq && ttwu_remote(p, wake_flags))
2478 if (unlikely(task_running(rq, p))) 2613 goto stat;
2479 goto out_activate;
2480 2614
2615#ifdef CONFIG_SMP
2481 /* 2616 /*
2482 * In order to handle concurrent wakeups and release the rq->lock 2617 * If the owning (remote) cpu is still in the middle of schedule() with
2483 * we put the task in TASK_WAKING state. 2618 * this task as prev, wait until its done referencing the task.
2484 *
2485 * First fix up the nr_uninterruptible count:
2486 */ 2619 */
2487 if (task_contributes_to_load(p)) { 2620 while (p->on_cpu) {
2488 if (likely(cpu_online(orig_cpu))) 2621#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2489 rq->nr_uninterruptible--; 2622 /*
2490 else 2623 * If called from interrupt context we could have landed in the
2491 this_rq()->nr_uninterruptible--; 2624 * middle of schedule(), in this case we should take care not
2492 } 2625 * to spin on ->on_cpu if p is current, since that would
2493 p->state = TASK_WAKING; 2626 * deadlock.
2494 2627 */
2495 if (p->sched_class->task_waking) { 2628 if (p == current) {
2496 p->sched_class->task_waking(rq, p); 2629 ttwu_queue(p, cpu);
2497 en_flags |= ENQUEUE_WAKING; 2630 goto stat;
2631 }
2632#endif
2633 cpu_relax();
2498 } 2634 }
2499
2500 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2501 if (cpu != orig_cpu)
2502 set_task_cpu(p, cpu);
2503 __task_rq_unlock(rq);
2504
2505 rq = cpu_rq(cpu);
2506 raw_spin_lock(&rq->lock);
2507
2508 /* 2635 /*
2509 * We migrated the task without holding either rq->lock, however 2636 * Pairs with the smp_wmb() in finish_lock_switch().
2510 * since the task is not on the task list itself, nobody else
2511 * will try and migrate the task, hence the rq should match the
2512 * cpu we just moved it to.
2513 */ 2637 */
2514 WARN_ON(task_cpu(p) != cpu); 2638 smp_rmb();
2515 WARN_ON(p->state != TASK_WAKING);
2516 2639
2517#ifdef CONFIG_SCHEDSTATS 2640 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2518 schedstat_inc(rq, ttwu_count); 2641 p->state = TASK_WAKING;
2519 if (cpu == this_cpu) 2642
2520 schedstat_inc(rq, ttwu_local); 2643 if (p->sched_class->task_waking)
2521 else { 2644 p->sched_class->task_waking(p);
2522 struct sched_domain *sd;
2523 for_each_domain(this_cpu, sd) {
2524 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2525 schedstat_inc(sd, ttwu_wake_remote);
2526 break;
2527 }
2528 }
2529 }
2530#endif /* CONFIG_SCHEDSTATS */
2531 2645
2532out_activate: 2646 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2647 if (task_cpu(p) != cpu)
2648 set_task_cpu(p, cpu);
2533#endif /* CONFIG_SMP */ 2649#endif /* CONFIG_SMP */
2534 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2650
2535 cpu == this_cpu, en_flags); 2651 ttwu_queue(p, cpu);
2536 success = 1; 2652stat:
2537out_running: 2653 ttwu_stat(p, cpu, wake_flags);
2538 ttwu_post_activation(p, rq, wake_flags, success);
2539out: 2654out:
2540 task_rq_unlock(rq, &flags); 2655 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2541 put_cpu();
2542 2656
2543 return success; 2657 return success;
2544} 2658}
@@ -2547,31 +2661,34 @@ out:
2547 * try_to_wake_up_local - try to wake up a local task with rq lock held 2661 * try_to_wake_up_local - try to wake up a local task with rq lock held
2548 * @p: the thread to be awakened 2662 * @p: the thread to be awakened
2549 * 2663 *
2550 * Put @p on the run-queue if it's not already there. The caller must 2664 * Put @p on the run-queue if it's not already there. The caller must
2551 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2665 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2552 * the current task. this_rq() stays locked over invocation. 2666 * the current task.
2553 */ 2667 */
2554static void try_to_wake_up_local(struct task_struct *p) 2668static void try_to_wake_up_local(struct task_struct *p)
2555{ 2669{
2556 struct rq *rq = task_rq(p); 2670 struct rq *rq = task_rq(p);
2557 bool success = false;
2558 2671
2559 BUG_ON(rq != this_rq()); 2672 BUG_ON(rq != this_rq());
2560 BUG_ON(p == current); 2673 BUG_ON(p == current);
2561 lockdep_assert_held(&rq->lock); 2674 lockdep_assert_held(&rq->lock);
2562 2675
2676 if (!raw_spin_trylock(&p->pi_lock)) {
2677 raw_spin_unlock(&rq->lock);
2678 raw_spin_lock(&p->pi_lock);
2679 raw_spin_lock(&rq->lock);
2680 }
2681
2563 if (!(p->state & TASK_NORMAL)) 2682 if (!(p->state & TASK_NORMAL))
2564 return; 2683 goto out;
2565 2684
2566 if (!p->se.on_rq) { 2685 if (!p->on_rq)
2567 if (likely(!task_running(rq, p))) { 2686 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2568 schedstat_inc(rq, ttwu_count); 2687
2569 schedstat_inc(rq, ttwu_local); 2688 ttwu_do_wakeup(rq, p, 0);
2570 } 2689 ttwu_stat(p, smp_processor_id(), 0);
2571 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2690out:
2572 success = true; 2691 raw_spin_unlock(&p->pi_lock);
2573 }
2574 ttwu_post_activation(p, rq, 0, success);
2575} 2692}
2576 2693
2577/** 2694/**
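
try_to_wake_up_local() is entered with rq->lock already held, but the lock order established earlier is pi_lock before rq->lock, so it cannot simply take pi_lock on top. The hunk uses the standard maneuver: trylock pi_lock, and if that fails, drop rq->lock and take both locks again in the canonical order. A pthread sketch of the same move (the real code must also revalidate the task's state after rq->lock was dropped, which is omitted here):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;     /* outer lock */
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;     /* inner lock */

/* Called with rq_lock held; returns with both locks held, order respected. */
static void lock_pi_while_holding_rq(void)
{
        if (pthread_mutex_trylock(&pi_lock) != 0) {
                /*
                 * Can't take the outer lock in place: back off and redo it
                 * in the canonical pi_lock -> rq_lock order.
                 */
                pthread_mutex_unlock(&rq_lock);
                pthread_mutex_lock(&pi_lock);
                pthread_mutex_lock(&rq_lock);
        }
}

int main(void)
{
        pthread_mutex_lock(&rq_lock);           /* caller context: rq->lock held */
        lock_pi_while_holding_rq();
        printf("both locks held without inverting the lock order\n");
        pthread_mutex_unlock(&rq_lock);
        pthread_mutex_unlock(&pi_lock);
        return 0;
}
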
@@ -2604,19 +2721,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2604 */ 2721 */
2605static void __sched_fork(struct task_struct *p) 2722static void __sched_fork(struct task_struct *p)
2606{ 2723{
2724 p->on_rq = 0;
2725
2726 p->se.on_rq = 0;
2607 p->se.exec_start = 0; 2727 p->se.exec_start = 0;
2608 p->se.sum_exec_runtime = 0; 2728 p->se.sum_exec_runtime = 0;
2609 p->se.prev_sum_exec_runtime = 0; 2729 p->se.prev_sum_exec_runtime = 0;
2610 p->se.nr_migrations = 0; 2730 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0; 2731 p->se.vruntime = 0;
2732 INIT_LIST_HEAD(&p->se.group_node);
2612 2733
2613#ifdef CONFIG_SCHEDSTATS 2734#ifdef CONFIG_SCHEDSTATS
2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2735 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2615#endif 2736#endif
2616 2737
2617 INIT_LIST_HEAD(&p->rt.run_list); 2738 INIT_LIST_HEAD(&p->rt.run_list);
2618 p->se.on_rq = 0;
2619 INIT_LIST_HEAD(&p->se.group_node);
2620 2739
2621#ifdef CONFIG_PREEMPT_NOTIFIERS 2740#ifdef CONFIG_PREEMPT_NOTIFIERS
2622 INIT_HLIST_HEAD(&p->preempt_notifiers); 2741 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2628,6 +2747,7 @@ static void __sched_fork(struct task_struct *p)
2628 */ 2747 */
2629void sched_fork(struct task_struct *p, int clone_flags) 2748void sched_fork(struct task_struct *p, int clone_flags)
2630{ 2749{
2750 unsigned long flags;
2631 int cpu = get_cpu(); 2751 int cpu = get_cpu();
2632 2752
2633 __sched_fork(p); 2753 __sched_fork(p);
@@ -2678,16 +2798,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2678 * 2798 *
2679 * Silence PROVE_RCU. 2799 * Silence PROVE_RCU.
2680 */ 2800 */
2681 rcu_read_lock(); 2801 raw_spin_lock_irqsave(&p->pi_lock, flags);
2682 set_task_cpu(p, cpu); 2802 set_task_cpu(p, cpu);
2683 rcu_read_unlock(); 2803 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2684 2804
2685#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2805#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2686 if (likely(sched_info_on())) 2806 if (likely(sched_info_on()))
2687 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2807 memset(&p->sched_info, 0, sizeof(p->sched_info));
2688#endif 2808#endif
2689#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2809#if defined(CONFIG_SMP)
2690 p->oncpu = 0; 2810 p->on_cpu = 0;
2691#endif 2811#endif
2692#ifdef CONFIG_PREEMPT 2812#ifdef CONFIG_PREEMPT
2693 /* Want to start with kernel preemption disabled. */ 2813 /* Want to start with kernel preemption disabled. */
@@ -2711,37 +2831,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2711{ 2831{
2712 unsigned long flags; 2832 unsigned long flags;
2713 struct rq *rq; 2833 struct rq *rq;
2714 int cpu __maybe_unused = get_cpu();
2715 2834
2835 raw_spin_lock_irqsave(&p->pi_lock, flags);
2716#ifdef CONFIG_SMP 2836#ifdef CONFIG_SMP
2717 rq = task_rq_lock(p, &flags);
2718 p->state = TASK_WAKING;
2719
2720 /* 2837 /*
2721 * Fork balancing, do it here and not earlier because: 2838 * Fork balancing, do it here and not earlier because:
2722 * - cpus_allowed can change in the fork path 2839 * - cpus_allowed can change in the fork path
2723 * - any previously selected cpu might disappear through hotplug 2840 * - any previously selected cpu might disappear through hotplug
2724 *
2725 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2726 * without people poking at ->cpus_allowed.
2727 */ 2841 */
2728 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2842 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2729 set_task_cpu(p, cpu);
2730
2731 p->state = TASK_RUNNING;
2732 task_rq_unlock(rq, &flags);
2733#endif 2843#endif
2734 2844
2735 rq = task_rq_lock(p, &flags); 2845 rq = __task_rq_lock(p);
2736 activate_task(rq, p, 0); 2846 activate_task(rq, p, 0);
2737 trace_sched_wakeup_new(p, 1); 2847 p->on_rq = 1;
2848 trace_sched_wakeup_new(p, true);
2738 check_preempt_curr(rq, p, WF_FORK); 2849 check_preempt_curr(rq, p, WF_FORK);
2739#ifdef CONFIG_SMP 2850#ifdef CONFIG_SMP
2740 if (p->sched_class->task_woken) 2851 if (p->sched_class->task_woken)
2741 p->sched_class->task_woken(rq, p); 2852 p->sched_class->task_woken(rq, p);
2742#endif 2853#endif
2743 task_rq_unlock(rq, &flags); 2854 task_rq_unlock(rq, p, &flags);
2744 put_cpu();
2745} 2855}
2746 2856
2747#ifdef CONFIG_PREEMPT_NOTIFIERS 2857#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3450,27 +3560,22 @@ void sched_exec(void)
3450{ 3560{
3451 struct task_struct *p = current; 3561 struct task_struct *p = current;
3452 unsigned long flags; 3562 unsigned long flags;
3453 struct rq *rq;
3454 int dest_cpu; 3563 int dest_cpu;
3455 3564
3456 rq = task_rq_lock(p, &flags); 3565 raw_spin_lock_irqsave(&p->pi_lock, flags);
3457 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3566 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3458 if (dest_cpu == smp_processor_id()) 3567 if (dest_cpu == smp_processor_id())
3459 goto unlock; 3568 goto unlock;
3460 3569
3461 /* 3570 if (likely(cpu_active(dest_cpu))) {
3462 * select_task_rq() can race against ->cpus_allowed
3463 */
3464 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3465 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3466 struct migration_arg arg = { p, dest_cpu }; 3571 struct migration_arg arg = { p, dest_cpu };
3467 3572
3468 task_rq_unlock(rq, &flags); 3573 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3469 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3574 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3470 return; 3575 return;
3471 } 3576 }
3472unlock: 3577unlock:
3473 task_rq_unlock(rq, &flags); 3578 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3474} 3579}
3475 3580
3476#endif 3581#endif
@@ -3507,7 +3612,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3507 3612
3508 rq = task_rq_lock(p, &flags); 3613 rq = task_rq_lock(p, &flags);
3509 ns = do_task_delta_exec(p, rq); 3614 ns = do_task_delta_exec(p, rq);
3510 task_rq_unlock(rq, &flags); 3615 task_rq_unlock(rq, p, &flags);
3511 3616
3512 return ns; 3617 return ns;
3513} 3618}
@@ -3525,7 +3630,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3525 3630
3526 rq = task_rq_lock(p, &flags); 3631 rq = task_rq_lock(p, &flags);
3527 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3632 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3528 task_rq_unlock(rq, &flags); 3633 task_rq_unlock(rq, p, &flags);
3529 3634
3530 return ns; 3635 return ns;
3531} 3636}
@@ -3549,7 +3654,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3549 rq = task_rq_lock(p, &flags); 3654 rq = task_rq_lock(p, &flags);
3550 thread_group_cputime(p, &totals); 3655 thread_group_cputime(p, &totals);
3551 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3656 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3552 task_rq_unlock(rq, &flags); 3657 task_rq_unlock(rq, p, &flags);
3553 3658
3554 return ns; 3659 return ns;
3555} 3660}
@@ -4035,7 +4140,7 @@ static inline void schedule_debug(struct task_struct *prev)
4035 4140
4036static void put_prev_task(struct rq *rq, struct task_struct *prev) 4141static void put_prev_task(struct rq *rq, struct task_struct *prev)
4037{ 4142{
4038 if (prev->se.on_rq) 4143 if (prev->on_rq)
4039 update_rq_clock(rq); 4144 update_rq_clock(rq);
4040 prev->sched_class->put_prev_task(rq, prev); 4145 prev->sched_class->put_prev_task(rq, prev);
4041} 4146}
@@ -4097,11 +4202,13 @@ need_resched:
4097 if (unlikely(signal_pending_state(prev->state, prev))) { 4202 if (unlikely(signal_pending_state(prev->state, prev))) {
4098 prev->state = TASK_RUNNING; 4203 prev->state = TASK_RUNNING;
4099 } else { 4204 } else {
4205 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4206 prev->on_rq = 0;
4207
4100 /* 4208 /*
4101 * If a worker is going to sleep, notify and 4209 * If a worker went to sleep, notify and ask workqueue
4102 * ask workqueue whether it wants to wake up a 4210 * whether it wants to wake up a task to maintain
4103 * task to maintain concurrency. If so, wake 4211 * concurrency.
4104 * up the task.
4105 */ 4212 */
4106 if (prev->flags & PF_WQ_WORKER) { 4213 if (prev->flags & PF_WQ_WORKER) {
4107 struct task_struct *to_wakeup; 4214 struct task_struct *to_wakeup;
@@ -4110,11 +4217,10 @@ need_resched:
4110 if (to_wakeup) 4217 if (to_wakeup)
4111 try_to_wake_up_local(to_wakeup); 4218 try_to_wake_up_local(to_wakeup);
4112 } 4219 }
4113 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4114 4220
4115 /* 4221 /*
4116 * If we are going to sleep and we have plugged IO queued, make 4222 * If we are going to sleep and we have plugged IO
4117 * sure to submit it to avoid deadlocks. 4223 * queued, make sure to submit it to avoid deadlocks.
4118 */ 4224 */
4119 if (blk_needs_flush_plug(prev)) { 4225 if (blk_needs_flush_plug(prev)) {
4120 raw_spin_unlock(&rq->lock); 4226 raw_spin_unlock(&rq->lock);
@@ -4161,70 +4267,53 @@ need_resched:
4161EXPORT_SYMBOL(schedule); 4267EXPORT_SYMBOL(schedule);
4162 4268
4163#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4269#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4164/*
4165 * Look out! "owner" is an entirely speculative pointer
4166 * access and not reliable.
4167 */
4168int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4169{
4170 unsigned int cpu;
4171 struct rq *rq;
4172 4270
4173 if (!sched_feat(OWNER_SPIN)) 4271static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4174 return 0; 4272{
4273 bool ret = false;
4175 4274
4176#ifdef CONFIG_DEBUG_PAGEALLOC 4275 rcu_read_lock();
4177 /* 4276 if (lock->owner != owner)
4178 * Need to access the cpu field knowing that 4277 goto fail;
4179 * DEBUG_PAGEALLOC could have unmapped it if
4180 * the mutex owner just released it and exited.
4181 */
4182 if (probe_kernel_address(&owner->cpu, cpu))
4183 return 0;
4184#else
4185 cpu = owner->cpu;
4186#endif
4187 4278
4188 /* 4279 /*
 4189 * Even if the access succeeded (likely case), 4280 * Ensure we emit the owner->on_cpu dereference _after_ checking
 4190 * the cpu field may no longer be valid. 4281 * that lock->owner still matches owner; if that fails, owner might
 4282 * point to free()d memory; if it still matches, the rcu_read_lock()
 4283 * ensures the memory stays valid.
4191 */ 4284 */
4192 if (cpu >= nr_cpumask_bits) 4285 barrier();
4193 return 0;
4194 4286
4195 /* 4287 ret = owner->on_cpu;
4196 * We need to validate that we can do a 4288fail:
4197 * get_cpu() and that we have the percpu area. 4289 rcu_read_unlock();
4198 */
4199 if (!cpu_online(cpu))
4200 return 0;
4201 4290
4202 rq = cpu_rq(cpu); 4291 return ret;
4292}
4203 4293
4204 for (;;) { 4294/*
4205 /* 4295 * Look out! "owner" is an entirely speculative pointer
4206 * Owner changed, break to re-assess state. 4296 * access and not reliable.
4207 */ 4297 */
4208 if (lock->owner != owner) { 4298int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4209 /* 4299{
4210 * If the lock has switched to a different owner, 4300 if (!sched_feat(OWNER_SPIN))
4211 * we likely have heavy contention. Return 0 to quit 4301 return 0;
4212 * optimistic spinning and not contend further:
4213 */
4214 if (lock->owner)
4215 return 0;
4216 break;
4217 }
4218 4302
4219 /* 4303 while (owner_running(lock, owner)) {
4220 * Is that owner really running on that cpu? 4304 if (need_resched())
4221 */
4222 if (task_thread_info(rq->curr) != owner || need_resched())
4223 return 0; 4305 return 0;
4224 4306
4225 arch_mutex_cpu_relax(); 4307 arch_mutex_cpu_relax();
4226 } 4308 }
4227 4309
4310 /*
4311 * If the owner changed to another task there is likely
4312 * heavy contention, stop spinning.
4313 */
4314 if (lock->owner)
4315 return 0;
4316
4228 return 1; 4317 return 1;
4229} 4318}
4230#endif 4319#endif
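For the rewritten spin loop above: owner_running() keys off owner->on_cpu under rcu_read_lock(), and the barrier() keeps the on_cpu read from being hoisted above the lock->owner recheck, since a stale owner pointer may refer to freed memory once that recheck fails. Below is a rough userspace model of the resulting policy only (spin while the same owner is on a CPU, stop on need_resched() or when the owner changes or sleeps), using C11 atomics and eliding the RCU lifetime handling; all names are made up for illustration:

#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

struct owner { _Atomic bool on_cpu; };
struct lock  { _Atomic(struct owner *) owner; };

/* 1 = lock now looks free, worth retrying the fast path; 0 = give up. */
int spin_on_owner(struct lock *lock, struct owner *owner,
		  _Atomic bool *need_resched)
{
	while (atomic_load(&lock->owner) == owner &&
	       atomic_load(&owner->on_cpu)) {
		if (atomic_load(need_resched))
			return 0;
		sched_yield();	/* stand-in for arch_mutex_cpu_relax() */
	}

	/* Owner changed, or went to sleep still holding the lock. */
	if (atomic_load(&lock->owner))
		return 0;

	return 1;
}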
@@ -4684,19 +4773,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4684 */ 4773 */
4685void rt_mutex_setprio(struct task_struct *p, int prio) 4774void rt_mutex_setprio(struct task_struct *p, int prio)
4686{ 4775{
4687 unsigned long flags;
4688 int oldprio, on_rq, running; 4776 int oldprio, on_rq, running;
4689 struct rq *rq; 4777 struct rq *rq;
4690 const struct sched_class *prev_class; 4778 const struct sched_class *prev_class;
4691 4779
4692 BUG_ON(prio < 0 || prio > MAX_PRIO); 4780 BUG_ON(prio < 0 || prio > MAX_PRIO);
4693 4781
4694 rq = task_rq_lock(p, &flags); 4782 rq = __task_rq_lock(p);
4695 4783
4696 trace_sched_pi_setprio(p, prio); 4784 trace_sched_pi_setprio(p, prio);
4697 oldprio = p->prio; 4785 oldprio = p->prio;
4698 prev_class = p->sched_class; 4786 prev_class = p->sched_class;
4699 on_rq = p->se.on_rq; 4787 on_rq = p->on_rq;
4700 running = task_current(rq, p); 4788 running = task_current(rq, p);
4701 if (on_rq) 4789 if (on_rq)
4702 dequeue_task(rq, p, 0); 4790 dequeue_task(rq, p, 0);
@@ -4716,7 +4804,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4804 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4717 4805
4718 check_class_changed(rq, p, prev_class, oldprio); 4806 check_class_changed(rq, p, prev_class, oldprio);
4719 task_rq_unlock(rq, &flags); 4807 __task_rq_unlock(rq);
4720} 4808}
4721 4809
4722#endif 4810#endif
@@ -4744,7 +4832,7 @@ void set_user_nice(struct task_struct *p, long nice)
4744 p->static_prio = NICE_TO_PRIO(nice); 4832 p->static_prio = NICE_TO_PRIO(nice);
4745 goto out_unlock; 4833 goto out_unlock;
4746 } 4834 }
4747 on_rq = p->se.on_rq; 4835 on_rq = p->on_rq;
4748 if (on_rq) 4836 if (on_rq)
4749 dequeue_task(rq, p, 0); 4837 dequeue_task(rq, p, 0);
4750 4838
@@ -4764,7 +4852,7 @@ void set_user_nice(struct task_struct *p, long nice)
4764 resched_task(rq->curr); 4852 resched_task(rq->curr);
4765 } 4853 }
4766out_unlock: 4854out_unlock:
4767 task_rq_unlock(rq, &flags); 4855 task_rq_unlock(rq, p, &flags);
4768} 4856}
4769EXPORT_SYMBOL(set_user_nice); 4857EXPORT_SYMBOL(set_user_nice);
4770 4858
@@ -4878,8 +4966,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4878static void 4966static void
4879__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4967__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4880{ 4968{
4881 BUG_ON(p->se.on_rq);
4882
4883 p->policy = policy; 4969 p->policy = policy;
4884 p->rt_priority = prio; 4970 p->rt_priority = prio;
4885 p->normal_prio = normal_prio(p); 4971 p->normal_prio = normal_prio(p);
@@ -4994,20 +5080,17 @@ recheck:
4994 /* 5080 /*
4995 * make sure no PI-waiters arrive (or leave) while we are 5081 * make sure no PI-waiters arrive (or leave) while we are
4996 * changing the priority of the task: 5082 * changing the priority of the task:
4997 */ 5083 *
4998 raw_spin_lock_irqsave(&p->pi_lock, flags);
4999 /*
5000 * To be able to change p->policy safely, the appropriate 5084 * To be able to change p->policy safely, the appropriate
5001 * runqueue lock must be held. 5085 * runqueue lock must be held.
5002 */ 5086 */
5003 rq = __task_rq_lock(p); 5087 rq = task_rq_lock(p, &flags);
5004 5088
5005 /* 5089 /*
 5006 * Changing the policy of the stop threads is a very bad idea 5090
5007 */ 5091 */
5008 if (p == rq->stop) { 5092 if (p == rq->stop) {
5009 __task_rq_unlock(rq); 5093 task_rq_unlock(rq, p, &flags);
5010 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5011 return -EINVAL; 5094 return -EINVAL;
5012 } 5095 }
5013 5096
@@ -5031,8 +5114,7 @@ recheck:
5031 if (rt_bandwidth_enabled() && rt_policy(policy) && 5114 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5032 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5115 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5033 !task_group_is_autogroup(task_group(p))) { 5116 !task_group_is_autogroup(task_group(p))) {
5034 __task_rq_unlock(rq); 5117 task_rq_unlock(rq, p, &flags);
5035 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5036 return -EPERM; 5118 return -EPERM;
5037 } 5119 }
5038 } 5120 }
@@ -5041,11 +5123,10 @@ recheck:
5041 /* recheck policy now with rq lock held */ 5123 /* recheck policy now with rq lock held */
5042 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5124 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5043 policy = oldpolicy = -1; 5125 policy = oldpolicy = -1;
5044 __task_rq_unlock(rq); 5126 task_rq_unlock(rq, p, &flags);
5045 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5046 goto recheck; 5127 goto recheck;
5047 } 5128 }
5048 on_rq = p->se.on_rq; 5129 on_rq = p->on_rq;
5049 running = task_current(rq, p); 5130 running = task_current(rq, p);
5050 if (on_rq) 5131 if (on_rq)
5051 deactivate_task(rq, p, 0); 5132 deactivate_task(rq, p, 0);
@@ -5064,8 +5145,7 @@ recheck:
5064 activate_task(rq, p, 0); 5145 activate_task(rq, p, 0);
5065 5146
5066 check_class_changed(rq, p, prev_class, oldprio); 5147 check_class_changed(rq, p, prev_class, oldprio);
5067 __task_rq_unlock(rq); 5148 task_rq_unlock(rq, p, &flags);
5068 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5069 5149
5070 rt_mutex_adjust_pi(p); 5150 rt_mutex_adjust_pi(p);
5071 5151
@@ -5316,7 +5396,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5316{ 5396{
5317 struct task_struct *p; 5397 struct task_struct *p;
5318 unsigned long flags; 5398 unsigned long flags;
5319 struct rq *rq;
5320 int retval; 5399 int retval;
5321 5400
5322 get_online_cpus(); 5401 get_online_cpus();
@@ -5331,9 +5410,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5331 if (retval) 5410 if (retval)
5332 goto out_unlock; 5411 goto out_unlock;
5333 5412
5334 rq = task_rq_lock(p, &flags); 5413 raw_spin_lock_irqsave(&p->pi_lock, flags);
5335 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5414 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5336 task_rq_unlock(rq, &flags); 5415 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5337 5416
5338out_unlock: 5417out_unlock:
5339 rcu_read_unlock(); 5418 rcu_read_unlock();
@@ -5658,7 +5737,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5658 5737
5659 rq = task_rq_lock(p, &flags); 5738 rq = task_rq_lock(p, &flags);
5660 time_slice = p->sched_class->get_rr_interval(rq, p); 5739 time_slice = p->sched_class->get_rr_interval(rq, p);
5661 task_rq_unlock(rq, &flags); 5740 task_rq_unlock(rq, p, &flags);
5662 5741
5663 rcu_read_unlock(); 5742 rcu_read_unlock();
5664 jiffies_to_timespec(time_slice, &t); 5743 jiffies_to_timespec(time_slice, &t);
@@ -5776,8 +5855,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5776 rcu_read_unlock(); 5855 rcu_read_unlock();
5777 5856
5778 rq->curr = rq->idle = idle; 5857 rq->curr = rq->idle = idle;
5779#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5858#if defined(CONFIG_SMP)
5780 idle->oncpu = 1; 5859 idle->on_cpu = 1;
5781#endif 5860#endif
5782 raw_spin_unlock_irqrestore(&rq->lock, flags); 5861 raw_spin_unlock_irqrestore(&rq->lock, flags);
5783 5862
@@ -5881,18 +5960,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5881 unsigned int dest_cpu; 5960 unsigned int dest_cpu;
5882 int ret = 0; 5961 int ret = 0;
5883 5962
5884 /*
5885 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5886 * drop the rq->lock and still rely on ->cpus_allowed.
5887 */
5888again:
5889 while (task_is_waking(p))
5890 cpu_relax();
5891 rq = task_rq_lock(p, &flags); 5963 rq = task_rq_lock(p, &flags);
5892 if (task_is_waking(p)) {
5893 task_rq_unlock(rq, &flags);
5894 goto again;
5895 }
5896 5964
5897 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5965 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5898 ret = -EINVAL; 5966 ret = -EINVAL;
@@ -5917,16 +5985,16 @@ again:
5917 goto out; 5985 goto out;
5918 5986
5919 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5987 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5920 if (migrate_task(p, rq)) { 5988 if (need_migrate_task(p)) {
5921 struct migration_arg arg = { p, dest_cpu }; 5989 struct migration_arg arg = { p, dest_cpu };
5922 /* Need help from migration thread: drop lock and wait. */ 5990 /* Need help from migration thread: drop lock and wait. */
5923 task_rq_unlock(rq, &flags); 5991 task_rq_unlock(rq, p, &flags);
5924 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5992 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5925 tlb_migrate_finish(p->mm); 5993 tlb_migrate_finish(p->mm);
5926 return 0; 5994 return 0;
5927 } 5995 }
5928out: 5996out:
5929 task_rq_unlock(rq, &flags); 5997 task_rq_unlock(rq, p, &flags);
5930 5998
5931 return ret; 5999 return ret;
5932} 6000}
@@ -5954,6 +6022,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5954 rq_src = cpu_rq(src_cpu); 6022 rq_src = cpu_rq(src_cpu);
5955 rq_dest = cpu_rq(dest_cpu); 6023 rq_dest = cpu_rq(dest_cpu);
5956 6024
6025 raw_spin_lock(&p->pi_lock);
5957 double_rq_lock(rq_src, rq_dest); 6026 double_rq_lock(rq_src, rq_dest);
5958 /* Already moved. */ 6027 /* Already moved. */
5959 if (task_cpu(p) != src_cpu) 6028 if (task_cpu(p) != src_cpu)
@@ -5966,7 +6035,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5966 * If we're not on a rq, the next wake-up will ensure we're 6035 * If we're not on a rq, the next wake-up will ensure we're
5967 * placed properly. 6036 * placed properly.
5968 */ 6037 */
5969 if (p->se.on_rq) { 6038 if (p->on_rq) {
5970 deactivate_task(rq_src, p, 0); 6039 deactivate_task(rq_src, p, 0);
5971 set_task_cpu(p, dest_cpu); 6040 set_task_cpu(p, dest_cpu);
5972 activate_task(rq_dest, p, 0); 6041 activate_task(rq_dest, p, 0);
@@ -5976,6 +6045,7 @@ done:
5976 ret = 1; 6045 ret = 1;
5977fail: 6046fail:
5978 double_rq_unlock(rq_src, rq_dest); 6047 double_rq_unlock(rq_src, rq_dest);
6048 raw_spin_unlock(&p->pi_lock);
5979 return ret; 6049 return ret;
5980} 6050}
5981 6051
@@ -6316,6 +6386,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6316 6386
6317#ifdef CONFIG_HOTPLUG_CPU 6387#ifdef CONFIG_HOTPLUG_CPU
6318 case CPU_DYING: 6388 case CPU_DYING:
6389 sched_ttwu_pending();
6319 /* Update our root-domain */ 6390 /* Update our root-domain */
6320 raw_spin_lock_irqsave(&rq->lock, flags); 6391 raw_spin_lock_irqsave(&rq->lock, flags);
6321 if (rq->rd) { 6392 if (rq->rd) {
@@ -8340,7 +8411,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8340 int old_prio = p->prio; 8411 int old_prio = p->prio;
8341 int on_rq; 8412 int on_rq;
8342 8413
8343 on_rq = p->se.on_rq; 8414 on_rq = p->on_rq;
8344 if (on_rq) 8415 if (on_rq)
8345 deactivate_task(rq, p, 0); 8416 deactivate_task(rq, p, 0);
8346 __setscheduler(rq, p, SCHED_NORMAL, 0); 8417 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8683,7 +8754,7 @@ void sched_move_task(struct task_struct *tsk)
8683 rq = task_rq_lock(tsk, &flags); 8754 rq = task_rq_lock(tsk, &flags);
8684 8755
8685 running = task_current(rq, tsk); 8756 running = task_current(rq, tsk);
8686 on_rq = tsk->se.on_rq; 8757 on_rq = tsk->on_rq;
8687 8758
8688 if (on_rq) 8759 if (on_rq)
8689 dequeue_task(rq, tsk, 0); 8760 dequeue_task(rq, tsk, 0);
@@ -8702,7 +8773,7 @@ void sched_move_task(struct task_struct *tsk)
8702 if (on_rq) 8773 if (on_rq)
8703 enqueue_task(rq, tsk, 0); 8774 enqueue_task(rq, tsk, 0);
8704 8775
8705 task_rq_unlock(rq, &flags); 8776 task_rq_unlock(rq, tsk, &flags);
8706} 8777}
8707#endif /* CONFIG_CGROUP_SCHED */ 8778#endif /* CONFIG_CGROUP_SCHED */
8708 8779
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 7bacd83a4158..3669bec6e130 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
152 read_lock_irqsave(&tasklist_lock, flags); 152 read_lock_irqsave(&tasklist_lock, flags);
153 153
154 do_each_thread(g, p) { 154 do_each_thread(g, p) {
155 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 155 if (!p->on_rq || task_cpu(p) != rq_cpu)
156 continue; 156 continue;
157 157
158 print_task(m, rq, p); 158 print_task(m, rq, p);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6fa833ab2cb8..054cebb81f7b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
358 } 358 }
359 359
360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
361#ifndef CONFIG_64BIT
362 smp_wmb();
363 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
364#endif
361} 365}
362 366
363/* 367/*
@@ -1372,12 +1376,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1372 1376
1373#ifdef CONFIG_SMP 1377#ifdef CONFIG_SMP
1374 1378
1375static void task_waking_fair(struct rq *rq, struct task_struct *p) 1379static void task_waking_fair(struct task_struct *p)
1376{ 1380{
1377 struct sched_entity *se = &p->se; 1381 struct sched_entity *se = &p->se;
1378 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1382 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1383 u64 min_vruntime;
1379 1384
1380 se->vruntime -= cfs_rq->min_vruntime; 1385#ifndef CONFIG_64BIT
1386 u64 min_vruntime_copy;
1387
1388 do {
1389 min_vruntime_copy = cfs_rq->min_vruntime_copy;
1390 smp_rmb();
1391 min_vruntime = cfs_rq->min_vruntime;
1392 } while (min_vruntime != min_vruntime_copy);
1393#else
1394 min_vruntime = cfs_rq->min_vruntime;
1395#endif
1396
1397 se->vruntime -= min_vruntime;
1381} 1398}
1382 1399
1383#ifdef CONFIG_FAIR_GROUP_SCHED 1400#ifdef CONFIG_FAIR_GROUP_SCHED
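The min_vruntime_copy pair above exists because task_waking_fair() no longer gets the runqueue passed in and can run without rq->lock held: on 32-bit builds a 64-bit min_vruntime load may tear, so the writer publishes a shadow copy after an smp_wmb() and the lockless reader retries until the copy and the value agree, much like a tiny seqlock. A userspace model of that publication protocol, using C11 fences in place of the kernel barriers and relaxed atomics to keep the model race-free (names are illustrative):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t value, value_copy;

/* Writer side (cf. update_min_vruntime): value first, then the copy. */
void publish(uint64_t v)
{
	atomic_store_explicit(&value, v, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */
	atomic_store_explicit(&value_copy, v, memory_order_relaxed);
}

/* Lockless reader side (cf. task_waking_fair): retry until consistent. */
uint64_t snapshot(void)
{
	uint64_t v, copy;

	do {
		copy = atomic_load_explicit(&value_copy, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
		v = atomic_load_explicit(&value, memory_order_relaxed);
	} while (v != copy);

	return v;
}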
@@ -1657,7 +1674,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1657 * preempt must be disabled. 1674 * preempt must be disabled.
1658 */ 1675 */
1659static int 1676static int
1660select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) 1677select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1661{ 1678{
1662 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1679 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1663 int cpu = smp_processor_id(); 1680 int cpu = smp_processor_id();
@@ -1789,10 +1806,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1789 * This is especially important for buddies when the leftmost 1806 * This is especially important for buddies when the leftmost
1790 * task is higher priority than the buddy. 1807 * task is higher priority than the buddy.
1791 */ 1808 */
1792 if (unlikely(se->load.weight != NICE_0_LOAD)) 1809 return calc_delta_fair(gran, se);
1793 gran = calc_delta_fair(gran, se);
1794
1795 return gran;
1796} 1810}
1797 1811
1798/* 1812/*
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 68e69acc29b9..be40f7371ee1 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1)
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on irq activity
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONIRQ_POWER, 1)
67
68/*
69 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */
72SCHED_FEAT(TTWU_QUEUE, 1)
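TTWU_QUEUE gates the new remote-wakeup path: rather than having the waking CPU acquire the target runqueue's lock, the task is queued on a per-CPU wake list and the target CPU is poked with the scheduler IPI, whose handler drains the list locally (the same pending-wakeup drain that CPU_DYING now flushes via sched_ttwu_pending() in the sched.c hunk above). A loose userspace model of that shape, with a lock-free push on the sender side and a single atomic exchange to detach the list on the receiver side; the struct and function names here are invented for illustration and are not the kernel's:

#include <stdatomic.h>
#include <stddef.h>

struct wakeup {
	struct wakeup *next;
	int tid;			/* which task to wake */
};

struct cpu_wake_list {
	_Atomic(struct wakeup *) head;
};

/* Waking CPU: push without touching any remote lock; in the kernel this
 * is where the reschedule IPI to the target CPU would be sent. */
void queue_remote_wakeup(struct cpu_wake_list *q, struct wakeup *w)
{
	struct wakeup *old = atomic_load(&q->head);

	do {
		w->next = old;
	} while (!atomic_compare_exchange_weak(&q->head, &old, w));
}

/* Target CPU, from its IPI handler: detach the whole list in one shot
 * and wake each entry while holding only its own runqueue lock. */
struct wakeup *drain_wakeups(struct cpu_wake_list *q)
{
	return atomic_exchange(&q->head, (struct wakeup *)NULL);
}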
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a776a6396427..0a51882534ea 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
11{ 11{
 12 return task_cpu(p); /* IDLE tasks are never migrated */ 12 return task_cpu(p); /* IDLE tasks are never migrated */
13} 13}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e7cebdc65f82..19ecb3127379 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -977,13 +977,23 @@ static void yield_task_rt(struct rq *rq)
977static int find_lowest_rq(struct task_struct *task); 977static int find_lowest_rq(struct task_struct *task);
978 978
979static int 979static int
980select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 980select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
981{ 981{
982 struct task_struct *curr;
983 struct rq *rq;
984 int cpu;
985
982 if (sd_flag != SD_BALANCE_WAKE) 986 if (sd_flag != SD_BALANCE_WAKE)
983 return smp_processor_id(); 987 return smp_processor_id();
984 988
989 cpu = task_cpu(p);
990 rq = cpu_rq(cpu);
991
992 rcu_read_lock();
993 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
994
985 /* 995 /*
986 * If the current task is an RT task, then 996 * If the current task on @p's runqueue is an RT task, then
987 * try to see if we can wake this RT task up on another 997 * try to see if we can wake this RT task up on another
988 * runqueue. Otherwise simply start this RT task 998 * runqueue. Otherwise simply start this RT task
989 * on its current runqueue. 999 * on its current runqueue.
@@ -997,21 +1007,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
997 * lock? 1007 * lock?
998 * 1008 *
999 * For equal prio tasks, we just let the scheduler sort it out. 1009 * For equal prio tasks, we just let the scheduler sort it out.
1010 *
1011 * Otherwise, just let it ride on the affined RQ and the
1012 * post-schedule router will push the preempted task away
1013 *
 1014 * This test is optimistic; if we get it wrong the load-balancer
1015 * will have to sort it out.
1000 */ 1016 */
1001 if (unlikely(rt_task(rq->curr)) && 1017 if (curr && unlikely(rt_task(curr)) &&
1002 (rq->curr->rt.nr_cpus_allowed < 2 || 1018 (curr->rt.nr_cpus_allowed < 2 ||
1003 rq->curr->prio < p->prio) && 1019 curr->prio < p->prio) &&
1004 (p->rt.nr_cpus_allowed > 1)) { 1020 (p->rt.nr_cpus_allowed > 1)) {
1005 int cpu = find_lowest_rq(p); 1021 int target = find_lowest_rq(p);
1006 1022
1007 return (cpu == -1) ? task_cpu(p) : cpu; 1023 if (target != -1)
1024 cpu = target;
1008 } 1025 }
1026 rcu_read_unlock();
1009 1027
1010 /* 1028 return cpu;
1011 * Otherwise, just let it ride on the affined RQ and the
1012 * post-schedule router will push the preempted task away
1013 */
1014 return task_cpu(p);
1015} 1029}
1016 1030
1017static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1031static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
@@ -1136,7 +1150,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1136 * The previous task needs to be made eligible for pushing 1150 * The previous task needs to be made eligible for pushing
1137 * if it is still active 1151 * if it is still active
1138 */ 1152 */
1139 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) 1153 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1140 enqueue_pushable_task(rq, p); 1154 enqueue_pushable_task(rq, p);
1141} 1155}
1142 1156
@@ -1287,7 +1301,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1287 !cpumask_test_cpu(lowest_rq->cpu, 1301 !cpumask_test_cpu(lowest_rq->cpu,
1288 &task->cpus_allowed) || 1302 &task->cpus_allowed) ||
1289 task_running(rq, task) || 1303 task_running(rq, task) ||
1290 !task->se.on_rq)) { 1304 !task->on_rq)) {
1291 1305
1292 raw_spin_unlock(&lowest_rq->lock); 1306 raw_spin_unlock(&lowest_rq->lock);
1293 lowest_rq = NULL; 1307 lowest_rq = NULL;
@@ -1321,7 +1335,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1321 BUG_ON(task_current(rq, p)); 1335 BUG_ON(task_current(rq, p));
1322 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1336 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1323 1337
1324 BUG_ON(!p->se.on_rq); 1338 BUG_ON(!p->on_rq);
1325 BUG_ON(!rt_task(p)); 1339 BUG_ON(!rt_task(p));
1326 1340
1327 return p; 1341 return p;
@@ -1467,7 +1481,7 @@ static int pull_rt_task(struct rq *this_rq)
1467 */ 1481 */
1468 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1482 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1469 WARN_ON(p == src_rq->curr); 1483 WARN_ON(p == src_rq->curr);
1470 WARN_ON(!p->se.on_rq); 1484 WARN_ON(!p->on_rq);
1471 1485
1472 /* 1486 /*
1473 * There's a chance that p is higher in priority 1487 * There's a chance that p is higher in priority
@@ -1538,7 +1552,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1538 * Update the migration status of the RQ if we have an RT task 1552 * Update the migration status of the RQ if we have an RT task
1539 * which is running AND changing its weight value. 1553 * which is running AND changing its weight value.
1540 */ 1554 */
1541 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1555 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1542 struct rq *rq = task_rq(p); 1556 struct rq *rq = task_rq(p);
1543 1557
1544 if (!task_current(rq, p)) { 1558 if (!task_current(rq, p)) {
@@ -1608,7 +1622,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1608 * we may need to handle the pulling of RT tasks 1622 * we may need to handle the pulling of RT tasks
1609 * now. 1623 * now.
1610 */ 1624 */
1611 if (p->se.on_rq && !rq->rt.rt_nr_running) 1625 if (p->on_rq && !rq->rt.rt_nr_running)
1612 pull_rt_task(rq); 1626 pull_rt_task(rq);
1613} 1627}
1614 1628
@@ -1638,7 +1652,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1638 * If that current running task is also an RT task 1652 * If that current running task is also an RT task
1639 * then see if we can move to another run queue. 1653 * then see if we can move to another run queue.
1640 */ 1654 */
1641 if (p->se.on_rq && rq->curr != p) { 1655 if (p->on_rq && rq->curr != p) {
1642#ifdef CONFIG_SMP 1656#ifdef CONFIG_SMP
1643 if (rq->rt.overloaded && push_rt_task(rq) && 1657 if (rq->rt.overloaded && push_rt_task(rq) &&
1644 /* Don't resched if we changed runqueues */ 1658 /* Don't resched if we changed runqueues */
@@ -1657,7 +1671,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1657static void 1671static void
1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1672prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1659{ 1673{
1660 if (!p->se.on_rq) 1674 if (!p->on_rq)
1661 return; 1675 return;
1662 1676
1663 if (rq->curr == p) { 1677 if (rq->curr == p) {
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 1ba2bd40fdac..6f437632afab 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -9,8 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p, 12select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
13 int sd_flag, int flags)
14{ 13{
 15 return task_cpu(p); /* stop tasks never migrate */ 14 return task_cpu(p); /* stop tasks never migrate */
16} 15}
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 25{
27 struct task_struct *stop = rq->stop; 26 struct task_struct *stop = rq->stop;
28 27
29 if (stop && stop->se.on_rq) 28 if (stop && stop->on_rq)
30 return stop; 29 return stop;
31 30
32 return NULL; 31 return NULL;