-rw-r--r--  Documentation/trace/kprobetrace.txt | 1
-rw-r--r--  arch/alpha/kernel/smp.c | 3
-rw-r--r--  arch/arm/kernel/smp.c | 5
-rw-r--r--  arch/blackfin/mach-common/smp.c | 3
-rw-r--r--  arch/cris/arch-v32/kernel/smp.c | 13
-rw-r--r--  arch/ia64/kernel/irq_ia64.c | 2
-rw-r--r--  arch/ia64/xen/irq_xen.c | 10
-rw-r--r--  arch/m32r/kernel/smp.c | 4
-rw-r--r--  arch/mips/cavium-octeon/smp.c | 2
-rw-r--r--  arch/mips/kernel/smtc.c | 2
-rw-r--r--  arch/mips/mti-malta/malta-int.c | 2
-rw-r--r--  arch/mips/pmc-sierra/yosemite/smp.c | 4
-rw-r--r--  arch/mips/sgi-ip27/ip27-irq.c | 2
-rw-r--r--  arch/mips/sibyte/bcm1480/smp.c | 7
-rw-r--r--  arch/mips/sibyte/sb1250/smp.c | 7
-rw-r--r--  arch/mn10300/kernel/smp.c | 5
-rw-r--r--  arch/parisc/kernel/smp.c | 5
-rw-r--r--  arch/powerpc/kernel/smp.c | 4
-rw-r--r--  arch/s390/kernel/smp.c | 6
-rw-r--r--  arch/sh/kernel/smp.c | 2
-rw-r--r--  arch/sparc/include/asm/topology_64.h | 6
-rw-r--r--  arch/sparc/kernel/smp_32.c | 4
-rw-r--r--  arch/sparc/kernel/smp_64.c | 1
-rw-r--r--  arch/tile/kernel/smp.c | 6
-rw-r--r--  arch/um/kernel/smp.c | 2
-rw-r--r--  arch/x86/kernel/smp.c | 5
-rw-r--r--  arch/x86/xen/smp.c | 5
-rw-r--r--  include/linux/init_task.h | 1
-rw-r--r--  include/linux/mutex.h | 2
-rw-r--r--  include/linux/sched.h | 61
-rw-r--r--  init/Kconfig | 5
-rw-r--r--  kernel/cpuset.c | 2
-rw-r--r--  kernel/fork.c | 5
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex-debug.h | 2
-rw-r--r--  kernel/mutex.c | 9
-rw-r--r--  kernel/mutex.h | 2
-rw-r--r--  kernel/sched.c | 1658
-rw-r--r--  kernel/sched_debug.c | 6
-rw-r--r--  kernel/sched_fair.c | 126
-rw-r--r--  kernel/sched_features.h | 6
-rw-r--r--  kernel/sched_idletask.c | 2
-rw-r--r--  kernel/sched_rt.c | 83
-rw-r--r--  kernel/sched_stoptask.c | 5
-rw-r--r--  kernel/trace/trace_kprobe.c | 1
-rw-r--r--  tools/perf/Documentation/perf-script-perl.txt | 1
-rw-r--r--  tools/perf/Documentation/perf-script-python.txt | 1
47 files changed, 928 insertions, 1170 deletions
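
The per-architecture hunks below all follow the same pattern: the reschedule-IPI handler, which previously did nothing and relied on the interrupt return path to notice need_resched, now calls scheduler_ipi() explicitly (declared in linux/sched.h by this series). As a minimal sketch of the shape the converted irqaction-style handlers take (the handler name here is illustrative, not taken from any one architecture in the patch):

        #include <linux/interrupt.h>
        #include <linux/sched.h>        /* scheduler_ipi() */

        /* Illustrative post-patch reschedule-IPI handler: hand the
         * remote-wakeup work to the scheduler instead of relying on
         * the interrupt return path. */
        static irqreturn_t resched_ipi_handler(int irq, void *dev_id)
        {
                scheduler_ipi();
                return IRQ_HANDLED;
        }

Architectures whose IPI handling is a message/switch dispatcher rather than a separate irqaction (alpha, parisc, powerpc's smp_message_recv, the MIPS mailbox handlers, and so on) make the equivalent one-line scheduler_ipi() call inside their existing dispatch code, as the hunks show.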
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 6d27ab8d6e9f..c83bd6b4e6e8 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -120,7 +120,6 @@ format:
120 field:unsigned char common_flags; offset:2; size:1; signed:0; 120 field:unsigned char common_flags; offset:2; size:1; signed:0;
121 field:unsigned char common_preempt_count; offset:3; size:1;signed:0; 121 field:unsigned char common_preempt_count; offset:3; size:1;signed:0;
122 field:int common_pid; offset:4; size:4; signed:1; 122 field:int common_pid; offset:4; size:4; signed:1;
123 field:int common_lock_depth; offset:8; size:4; signed:1;
124 123
125 field:unsigned long __probe_ip; offset:12; size:4; signed:0; 124 field:unsigned long __probe_ip; offset:12; size:4; signed:0;
126 field:int __probe_nargs; offset:16; size:4; signed:1; 125 field:int __probe_nargs; offset:16; size:4; signed:1;
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 42aa078a5e4d..5a621c6d22ab 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs)
585 585
586 switch (which) { 586 switch (which) {
587 case IPI_RESCHEDULE: 587 case IPI_RESCHEDULE:
588 /* Reschedule callback. Everything to be done 588 scheduler_ipi();
589 is done by the interrupt return path. */
590 break; 589 break;
591 590
592 case IPI_CALL_FUNC: 591 case IPI_CALL_FUNC:
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index f29b8a29b174..007a0a950e75 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -560,10 +560,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs)
560 break; 560 break;
561 561
562 case IPI_RESCHEDULE: 562 case IPI_RESCHEDULE:
563 /* 563 scheduler_ipi();
564 * nothing more to do - eveything is
565 * done on the interrupt return path
566 */
567 break; 564 break;
568 565
569 case IPI_CALL_FUNC: 566 case IPI_CALL_FUNC:
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index 8bce5ed031e4..1fbd94c44457 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -177,6 +177,9 @@ static irqreturn_t ipi_handler_int1(int irq, void *dev_instance)
177 while (msg_queue->count) { 177 while (msg_queue->count) {
178 msg = &msg_queue->ipi_message[msg_queue->head]; 178 msg = &msg_queue->ipi_message[msg_queue->head];
179 switch (msg->type) { 179 switch (msg->type) {
180 case BFIN_IPI_RESCHEDULE:
181 scheduler_ipi();
182 break;
180 case BFIN_IPI_CALL_FUNC: 183 case BFIN_IPI_CALL_FUNC:
181 spin_unlock_irqrestore(&msg_queue->lock, flags); 184 spin_unlock_irqrestore(&msg_queue->lock, flags);
182 ipi_call_function(cpu, msg); 185 ipi_call_function(cpu, msg);
diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c
index 4c9e3e1ba5d1..66cc75657e2f 100644
--- a/arch/cris/arch-v32/kernel/smp.c
+++ b/arch/cris/arch-v32/kernel/smp.c
@@ -342,15 +342,18 @@ irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id)
342 342
343 ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); 343 ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi);
344 344
345 if (ipi.vector & IPI_SCHEDULE) {
346 scheduler_ipi();
347 }
345 if (ipi.vector & IPI_CALL) { 348 if (ipi.vector & IPI_CALL) {
346 func(info); 349 func(info);
347 } 350 }
348 if (ipi.vector & IPI_FLUSH_TLB) { 351 if (ipi.vector & IPI_FLUSH_TLB) {
349 if (flush_mm == FLUSH_ALL) 352 if (flush_mm == FLUSH_ALL)
350 __flush_tlb_all(); 353 __flush_tlb_all();
351 else if (flush_vma == FLUSH_ALL) 354 else if (flush_vma == FLUSH_ALL)
352 __flush_tlb_mm(flush_mm); 355 __flush_tlb_mm(flush_mm);
353 else 356 else
354 __flush_tlb_page(flush_vma, flush_addr); 357 __flush_tlb_page(flush_vma, flush_addr);
355 } 358 }
356 359
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 5b704740f160..782c3a357f24 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -31,6 +31,7 @@
31#include <linux/irq.h> 31#include <linux/irq.h>
32#include <linux/ratelimit.h> 32#include <linux/ratelimit.h>
33#include <linux/acpi.h> 33#include <linux/acpi.h>
34#include <linux/sched.h>
34 35
35#include <asm/delay.h> 36#include <asm/delay.h>
36#include <asm/intrinsics.h> 37#include <asm/intrinsics.h>
@@ -496,6 +497,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
496 smp_local_flush_tlb(); 497 smp_local_flush_tlb();
497 kstat_incr_irqs_this_cpu(irq, desc); 498 kstat_incr_irqs_this_cpu(irq, desc);
498 } else if (unlikely(IS_RESCHEDULE(vector))) { 499 } else if (unlikely(IS_RESCHEDULE(vector))) {
500 scheduler_ipi();
499 kstat_incr_irqs_this_cpu(irq, desc); 501 kstat_incr_irqs_this_cpu(irq, desc);
500 } else { 502 } else {
501 ia64_setreg(_IA64_REG_CR_TPR, vector); 503 ia64_setreg(_IA64_REG_CR_TPR, vector);
diff --git a/arch/ia64/xen/irq_xen.c b/arch/ia64/xen/irq_xen.c
index 108bb858acf2..b279e142c633 100644
--- a/arch/ia64/xen/irq_xen.c
+++ b/arch/ia64/xen/irq_xen.c
@@ -92,6 +92,8 @@ static unsigned short saved_irq_cnt;
92static int xen_slab_ready; 92static int xen_slab_ready;
93 93
94#ifdef CONFIG_SMP 94#ifdef CONFIG_SMP
95#include <linux/sched.h>
96
95/* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ, 97/* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ,
96 * it ends up to issue several memory accesses upon percpu data and 98 * it ends up to issue several memory accesses upon percpu data and
97 * thus adds unnecessary traffic to other paths. 99 * thus adds unnecessary traffic to other paths.
@@ -99,7 +101,13 @@ static int xen_slab_ready;
99static irqreturn_t 101static irqreturn_t
100xen_dummy_handler(int irq, void *dev_id) 102xen_dummy_handler(int irq, void *dev_id)
101{ 103{
104 return IRQ_HANDLED;
105}
102 106
107static irqreturn_t
108xen_resched_handler(int irq, void *dev_id)
109{
110 scheduler_ipi();
103 return IRQ_HANDLED; 111 return IRQ_HANDLED;
104} 112}
105 113
@@ -110,7 +118,7 @@ static struct irqaction xen_ipi_irqaction = {
110}; 118};
111 119
112static struct irqaction xen_resched_irqaction = { 120static struct irqaction xen_resched_irqaction = {
113 .handler = xen_dummy_handler, 121 .handler = xen_resched_handler,
114 .flags = IRQF_DISABLED, 122 .flags = IRQF_DISABLED,
115 .name = "resched" 123 .name = "resched"
116}; 124};
diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c
index 31cef20b2996..fc10b39893d4 100644
--- a/arch/m32r/kernel/smp.c
+++ b/arch/m32r/kernel/smp.c
@@ -122,8 +122,6 @@ void smp_send_reschedule(int cpu_id)
122 * 122 *
123 * Description: This routine executes on CPU which received 123 * Description: This routine executes on CPU which received
124 * 'RESCHEDULE_IPI'. 124 * 'RESCHEDULE_IPI'.
125 * Rescheduling is processed at the exit of interrupt
126 * operation.
127 * 125 *
128 * Born on Date: 2002.02.05 126 * Born on Date: 2002.02.05
129 * 127 *
@@ -138,7 +136,7 @@ void smp_send_reschedule(int cpu_id)
138 *==========================================================================*/ 136 *==========================================================================*/
139void smp_reschedule_interrupt(void) 137void smp_reschedule_interrupt(void)
140{ 138{
141 /* nothing to do */ 139 scheduler_ipi();
142} 140}
143 141
144/*==========================================================================* 142/*==========================================================================*
diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c
index 716fae6f941a..8b606423bbd7 100644
--- a/arch/mips/cavium-octeon/smp.c
+++ b/arch/mips/cavium-octeon/smp.c
@@ -44,6 +44,8 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id)
44 44
45 if (action & SMP_CALL_FUNCTION) 45 if (action & SMP_CALL_FUNCTION)
46 smp_call_function_interrupt(); 46 smp_call_function_interrupt();
47 if (action & SMP_RESCHEDULE_YOURSELF)
48 scheduler_ipi();
47 49
48 /* Check if we've been told to flush the icache */ 50 /* Check if we've been told to flush the icache */
49 if (action & SMP_ICACHE_FLUSH) 51 if (action & SMP_ICACHE_FLUSH)
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index 5a88cc4ccd5a..cedac4633741 100644
--- a/arch/mips/kernel/smtc.c
+++ b/arch/mips/kernel/smtc.c
@@ -929,7 +929,7 @@ static void post_direct_ipi(int cpu, struct smtc_ipi *pipi)
929 929
930static void ipi_resched_interrupt(void) 930static void ipi_resched_interrupt(void)
931{ 931{
932 /* Return from interrupt should be enough to cause scheduler check */ 932 scheduler_ipi();
933} 933}
934 934
935static void ipi_call_interrupt(void) 935static void ipi_call_interrupt(void)
diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c
index e85c977328da..1d36c511a7a5 100644
--- a/arch/mips/mti-malta/malta-int.c
+++ b/arch/mips/mti-malta/malta-int.c
@@ -308,6 +308,8 @@ static void ipi_call_dispatch(void)
308 308
309static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) 309static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id)
310{ 310{
311 scheduler_ipi();
312
311 return IRQ_HANDLED; 313 return IRQ_HANDLED;
312} 314}
313 315
diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c
index efc9e889b349..2608752898c0 100644
--- a/arch/mips/pmc-sierra/yosemite/smp.c
+++ b/arch/mips/pmc-sierra/yosemite/smp.c
@@ -55,6 +55,8 @@ void titan_mailbox_irq(void)
55 55
56 if (status & 0x2) 56 if (status & 0x2)
57 smp_call_function_interrupt(); 57 smp_call_function_interrupt();
58 if (status & 0x4)
59 scheduler_ipi();
58 break; 60 break;
59 61
60 case 1: 62 case 1:
@@ -63,6 +65,8 @@ void titan_mailbox_irq(void)
63 65
64 if (status & 0x2) 66 if (status & 0x2)
65 smp_call_function_interrupt(); 67 smp_call_function_interrupt();
68 if (status & 0x4)
69 scheduler_ipi();
66 break; 70 break;
67 } 71 }
68} 72}
diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c
index 0a04603d577c..b18b04e48577 100644
--- a/arch/mips/sgi-ip27/ip27-irq.c
+++ b/arch/mips/sgi-ip27/ip27-irq.c
@@ -147,8 +147,10 @@ static void ip27_do_irq_mask0(void)
147#ifdef CONFIG_SMP 147#ifdef CONFIG_SMP
148 if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) { 148 if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) {
149 LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ); 149 LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ);
150 scheduler_ipi();
150 } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) { 151 } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) {
151 LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ); 152 LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ);
153 scheduler_ipi();
152 } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { 154 } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) {
153 LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); 155 LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ);
154 smp_call_function_interrupt(); 156 smp_call_function_interrupt();
diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c
index 47b347c992ea..d667875be564 100644
--- a/arch/mips/sibyte/bcm1480/smp.c
+++ b/arch/mips/sibyte/bcm1480/smp.c
@@ -20,6 +20,7 @@
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/kernel_stat.h> 22#include <linux/kernel_stat.h>
23#include <linux/sched.h>
23 24
24#include <asm/mmu_context.h> 25#include <asm/mmu_context.h>
25#include <asm/io.h> 26#include <asm/io.h>
@@ -189,10 +190,8 @@ void bcm1480_mailbox_interrupt(void)
189 /* Clear the mailbox to clear the interrupt */ 190 /* Clear the mailbox to clear the interrupt */
190 __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]); 191 __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]);
191 192
192 /* 193 if (action & SMP_RESCHEDULE_YOURSELF)
193 * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the 194 scheduler_ipi();
194 * interrupt will do the reschedule for us
195 */
196 195
197 if (action & SMP_CALL_FUNCTION) 196 if (action & SMP_CALL_FUNCTION)
198 smp_call_function_interrupt(); 197 smp_call_function_interrupt();
diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c
index c00a5cb1128d..38e7f6bd7922 100644
--- a/arch/mips/sibyte/sb1250/smp.c
+++ b/arch/mips/sibyte/sb1250/smp.c
@@ -21,6 +21,7 @@
21#include <linux/interrupt.h> 21#include <linux/interrupt.h>
22#include <linux/smp.h> 22#include <linux/smp.h>
23#include <linux/kernel_stat.h> 23#include <linux/kernel_stat.h>
24#include <linux/sched.h>
24 25
25#include <asm/mmu_context.h> 26#include <asm/mmu_context.h>
26#include <asm/io.h> 27#include <asm/io.h>
@@ -177,10 +178,8 @@ void sb1250_mailbox_interrupt(void)
177 /* Clear the mailbox to clear the interrupt */ 178 /* Clear the mailbox to clear the interrupt */
178 ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]); 179 ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]);
179 180
180 /* 181 if (action & SMP_RESCHEDULE_YOURSELF)
181 * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the 182 scheduler_ipi();
182 * interrupt will do the reschedule for us
183 */
184 183
185 if (action & SMP_CALL_FUNCTION) 184 if (action & SMP_CALL_FUNCTION)
186 smp_call_function_interrupt(); 185 smp_call_function_interrupt();
diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c
index 226c826a2194..83fb27912231 100644
--- a/arch/mn10300/kernel/smp.c
+++ b/arch/mn10300/kernel/smp.c
@@ -494,14 +494,11 @@ void smp_send_stop(void)
494 * @irq: The interrupt number. 494 * @irq: The interrupt number.
495 * @dev_id: The device ID. 495 * @dev_id: The device ID.
496 * 496 *
497 * We need do nothing here, since the scheduling will be effected on our way
498 * back through entry.S.
499 *
500 * Returns IRQ_HANDLED to indicate we handled the interrupt successfully. 497 * Returns IRQ_HANDLED to indicate we handled the interrupt successfully.
501 */ 498 */
502static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) 499static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
503{ 500{
504 /* do nothing */ 501 scheduler_ipi();
505 return IRQ_HANDLED; 502 return IRQ_HANDLED;
506} 503}
507 504
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 69d63d354ef0..828305f19cff 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -155,10 +155,7 @@ ipi_interrupt(int irq, void *dev_id)
155 155
156 case IPI_RESCHEDULE: 156 case IPI_RESCHEDULE:
157 smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); 157 smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu);
158 /* 158 scheduler_ipi();
159 * Reschedule callback. Everything to be
160 * done is done by the interrupt return path.
161 */
162 break; 159 break;
163 160
164 case IPI_CALL_FUNC: 161 case IPI_CALL_FUNC:
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index cbdbb14be4b0..9f9c204bef69 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -116,7 +116,7 @@ void smp_message_recv(int msg)
116 generic_smp_call_function_interrupt(); 116 generic_smp_call_function_interrupt();
117 break; 117 break;
118 case PPC_MSG_RESCHEDULE: 118 case PPC_MSG_RESCHEDULE:
119 /* we notice need_resched on exit */ 119 scheduler_ipi();
120 break; 120 break;
121 case PPC_MSG_CALL_FUNC_SINGLE: 121 case PPC_MSG_CALL_FUNC_SINGLE:
122 generic_smp_call_function_single_interrupt(); 122 generic_smp_call_function_single_interrupt();
@@ -146,7 +146,7 @@ static irqreturn_t call_function_action(int irq, void *data)
146 146
147static irqreturn_t reschedule_action(int irq, void *data) 147static irqreturn_t reschedule_action(int irq, void *data)
148{ 148{
149 /* we just need the return path side effect of checking need_resched */ 149 scheduler_ipi();
150 return IRQ_HANDLED; 150 return IRQ_HANDLED;
151} 151}
152 152
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 63a97db83f96..63c7d9ff220d 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -165,12 +165,12 @@ static void do_ext_call_interrupt(unsigned int ext_int_code,
165 kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++; 165 kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++;
166 /* 166 /*
167 * handle bit signal external calls 167 * handle bit signal external calls
168 *
169 * For the ec_schedule signal we have to do nothing. All the work
170 * is done automatically when we return from the interrupt.
171 */ 168 */
172 bits = xchg(&S390_lowcore.ext_call_fast, 0); 169 bits = xchg(&S390_lowcore.ext_call_fast, 0);
173 170
171 if (test_bit(ec_schedule, &bits))
172 scheduler_ipi();
173
174 if (test_bit(ec_call_function, &bits)) 174 if (test_bit(ec_call_function, &bits))
175 generic_smp_call_function_interrupt(); 175 generic_smp_call_function_interrupt();
176 176
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 509b36b45115..6207561ea34a 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -20,6 +20,7 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/cpu.h> 21#include <linux/cpu.h>
22#include <linux/interrupt.h> 22#include <linux/interrupt.h>
23#include <linux/sched.h>
23#include <asm/atomic.h> 24#include <asm/atomic.h>
24#include <asm/processor.h> 25#include <asm/processor.h>
25#include <asm/system.h> 26#include <asm/system.h>
@@ -323,6 +324,7 @@ void smp_message_recv(unsigned int msg)
323 generic_smp_call_function_interrupt(); 324 generic_smp_call_function_interrupt();
324 break; 325 break;
325 case SMP_MSG_RESCHEDULE: 326 case SMP_MSG_RESCHEDULE:
327 scheduler_ipi();
326 break; 328 break;
327 case SMP_MSG_FUNCTION_SINGLE: 329 case SMP_MSG_FUNCTION_SINGLE:
328 generic_smp_call_function_single_interrupt(); 330 generic_smp_call_function_single_interrupt();
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 1c79f32734a0..8b9c556d630b 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -65,6 +65,10 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
65#define smt_capable() (sparc64_multi_core) 65#define smt_capable() (sparc64_multi_core)
66#endif /* CONFIG_SMP */ 66#endif /* CONFIG_SMP */
67 67
68#define cpu_coregroup_mask(cpu) (&cpu_core_map[cpu]) 68extern cpumask_t cpu_core_map[NR_CPUS];
69static inline const struct cpumask *cpu_coregroup_mask(int cpu)
70{
71 return &cpu_core_map[cpu];
72}
69 73
70#endif /* _ASM_SPARC64_TOPOLOGY_H */ 74#endif /* _ASM_SPARC64_TOPOLOGY_H */
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index 850a1360c0d6..442286d83435 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -129,7 +129,9 @@ struct linux_prom_registers smp_penguin_ctable __cpuinitdata = { 0 };
129 129
130void smp_send_reschedule(int cpu) 130void smp_send_reschedule(int cpu)
131{ 131{
132 /* See sparc64 */ 132 /*
133 * XXX missing reschedule IPI, see scheduler_ipi()
134 */
133} 135}
134 136
135void smp_send_stop(void) 137void smp_send_stop(void)
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 3e94a8c23238..9478da7fdb3e 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1368,6 +1368,7 @@ void smp_send_reschedule(int cpu)
1368void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) 1368void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs)
1369{ 1369{
1370 clear_softint(1 << irq); 1370 clear_softint(1 << irq);
1371 scheduler_ipi();
1371} 1372}
1372 1373
1373/* This is a nop because we capture all other cpus 1374/* This is a nop because we capture all other cpus
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index a4293102ef81..c52224d5ed45 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -189,12 +189,8 @@ void flush_icache_range(unsigned long start, unsigned long end)
189/* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */ 189/* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */
190static irqreturn_t handle_reschedule_ipi(int irq, void *token) 190static irqreturn_t handle_reschedule_ipi(int irq, void *token)
191{ 191{
192 /*
193 * Nothing to do here; when we return from interrupt, the
194 * rescheduling will occur there. But do bump the interrupt
195 * profiler count in the meantime.
196 */
197 __get_cpu_var(irq_stat).irq_resched_count++; 192 __get_cpu_var(irq_stat).irq_resched_count++;
193 scheduler_ipi();
198 194
199 return IRQ_HANDLED; 195 return IRQ_HANDLED;
200} 196}
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index 106bf27e2a9a..eefb107d2d73 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -173,7 +173,7 @@ void IPI_handler(int cpu)
173 break; 173 break;
174 174
175 case 'R': 175 case 'R':
176 set_tsk_need_resched(current); 176 scheduler_ipi();
177 break; 177 break;
178 178
179 case 'S': 179 case 'S':
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 513deac7228d..013e7eba83bb 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait)
194} 194}
195 195
196/* 196/*
197 * Reschedule call back. Nothing to do, 197 * Reschedule call back.
198 * all the work is done automatically when
199 * we return from the interrupt.
200 */ 198 */
201void smp_reschedule_interrupt(struct pt_regs *regs) 199void smp_reschedule_interrupt(struct pt_regs *regs)
202{ 200{
203 ack_APIC_irq(); 201 ack_APIC_irq();
204 inc_irq_stat(irq_resched_count); 202 inc_irq_stat(irq_resched_count);
203 scheduler_ipi();
205 /* 204 /*
206 * KVM uses this interrupt to force a cpu out of guest mode 205 * KVM uses this interrupt to force a cpu out of guest mode
207 */ 206 */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 194a3edef5cb..41038c01de40 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -46,13 +46,12 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
46static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 46static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
47 47
48/* 48/*
49 * Reschedule call back. Nothing to do, 49 * Reschedule call back.
50 * all the work is done automatically when
51 * we return from the interrupt.
52 */ 50 */
53static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
54{ 52{
55 inc_irq_stat(irq_resched_count); 53 inc_irq_stat(irq_resched_count);
54 scheduler_ipi();
56 55
57 return IRQ_HANDLED; 56 return IRQ_HANDLED;
58} 57}
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index caa151fbebb7..689496bb6654 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -134,7 +134,6 @@ extern struct cred init_cred;
134 .stack = &init_thread_info, \ 134 .stack = &init_thread_info, \
135 .usage = ATOMIC_INIT(2), \ 135 .usage = ATOMIC_INIT(2), \
136 .flags = PF_KTHREAD, \ 136 .flags = PF_KTHREAD, \
137 .lock_depth = -1, \
138 .prio = MAX_PRIO-20, \ 137 .prio = MAX_PRIO-20, \
139 .static_prio = MAX_PRIO-20, \ 138 .static_prio = MAX_PRIO-20, \
140 .normal_prio = MAX_PRIO-20, \ 139 .normal_prio = MAX_PRIO-20, \
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 94b48bd40dd7..c75471db576e 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -51,7 +51,7 @@ struct mutex {
51 spinlock_t wait_lock; 51 spinlock_t wait_lock;
52 struct list_head wait_list; 52 struct list_head wait_list;
53#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) 53#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
54 struct thread_info *owner; 54 struct task_struct *owner;
55#endif 55#endif
56#ifdef CONFIG_DEBUG_MUTEXES 56#ifdef CONFIG_DEBUG_MUTEXES
57 const char *name; 57 const char *name;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 781abd137673..12211e1666e2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -360,7 +360,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout);
360extern signed long schedule_timeout_killable(signed long timeout); 360extern signed long schedule_timeout_killable(signed long timeout);
361extern signed long schedule_timeout_uninterruptible(signed long timeout); 361extern signed long schedule_timeout_uninterruptible(signed long timeout);
362asmlinkage void schedule(void); 362asmlinkage void schedule(void);
363extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); 363extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
364 364
365struct nsproxy; 365struct nsproxy;
366struct user_namespace; 366struct user_namespace;
@@ -731,10 +731,6 @@ struct sched_info {
731 /* timestamps */ 731 /* timestamps */
732 unsigned long long last_arrival,/* when we last ran on a cpu */ 732 unsigned long long last_arrival,/* when we last ran on a cpu */
733 last_queued; /* when we were last queued to run */ 733 last_queued; /* when we were last queued to run */
734#ifdef CONFIG_SCHEDSTATS
735 /* BKL stats */
736 unsigned int bkl_count;
737#endif
738}; 734};
739#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ 735#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
740 736
@@ -868,6 +864,7 @@ static inline int sd_power_saving_flags(void)
868 864
869struct sched_group { 865struct sched_group {
870 struct sched_group *next; /* Must be a circular list */ 866 struct sched_group *next; /* Must be a circular list */
867 atomic_t ref;
871 868
872 /* 869 /*
873 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 870 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@ -882,9 +879,6 @@ struct sched_group {
882 * NOTE: this field is variable length. (Allocated dynamically 879 * NOTE: this field is variable length. (Allocated dynamically
883 * by attaching extra space to the end of the structure, 880 * by attaching extra space to the end of the structure,
884 * depending on how many CPUs the kernel has booted up with) 881 * depending on how many CPUs the kernel has booted up with)
885 *
886 * It is also be embedded into static data structures at build
887 * time. (See 'struct static_sched_group' in kernel/sched.c)
888 */ 882 */
889 unsigned long cpumask[0]; 883 unsigned long cpumask[0];
890}; 884};
@@ -894,17 +888,6 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
894 return to_cpumask(sg->cpumask); 888 return to_cpumask(sg->cpumask);
895} 889}
896 890
897enum sched_domain_level {
898 SD_LV_NONE = 0,
899 SD_LV_SIBLING,
900 SD_LV_MC,
901 SD_LV_BOOK,
902 SD_LV_CPU,
903 SD_LV_NODE,
904 SD_LV_ALLNODES,
905 SD_LV_MAX
906};
907
908struct sched_domain_attr { 891struct sched_domain_attr {
909 int relax_domain_level; 892 int relax_domain_level;
910}; 893};
@@ -913,6 +896,8 @@ struct sched_domain_attr {
913 .relax_domain_level = -1, \ 896 .relax_domain_level = -1, \
914} 897}
915 898
899extern int sched_domain_level_max;
900
916struct sched_domain { 901struct sched_domain {
917 /* These fields must be setup */ 902 /* These fields must be setup */
918 struct sched_domain *parent; /* top domain must be null terminated */ 903 struct sched_domain *parent; /* top domain must be null terminated */
@@ -930,7 +915,7 @@ struct sched_domain {
930 unsigned int forkexec_idx; 915 unsigned int forkexec_idx;
931 unsigned int smt_gain; 916 unsigned int smt_gain;
932 int flags; /* See SD_* */ 917 int flags; /* See SD_* */
933 enum sched_domain_level level; 918 int level;
934 919
935 /* Runtime fields. */ 920 /* Runtime fields. */
936 unsigned long last_balance; /* init to jiffies. units in jiffies */ 921 unsigned long last_balance; /* init to jiffies. units in jiffies */
@@ -973,6 +958,10 @@ struct sched_domain {
973#ifdef CONFIG_SCHED_DEBUG 958#ifdef CONFIG_SCHED_DEBUG
974 char *name; 959 char *name;
975#endif 960#endif
961 union {
962 void *private; /* used during construction */
963 struct rcu_head rcu; /* used during destruction */
964 };
976 965
977 unsigned int span_weight; 966 unsigned int span_weight;
978 /* 967 /*
@@ -981,9 +970,6 @@ struct sched_domain {
981 * NOTE: this field is variable length. (Allocated dynamically 970 * NOTE: this field is variable length. (Allocated dynamically
982 * by attaching extra space to the end of the structure, 971 * by attaching extra space to the end of the structure,
983 * depending on how many CPUs the kernel has booted up with) 972 * depending on how many CPUs the kernel has booted up with)
984 *
985 * It is also be embedded into static data structures at build
986 * time. (See 'struct static_sched_domain' in kernel/sched.c)
987 */ 973 */
988 unsigned long span[0]; 974 unsigned long span[0];
989}; 975};
@@ -1048,8 +1034,12 @@ struct sched_domain;
1048#define WF_FORK 0x02 /* child wakeup after fork */ 1034#define WF_FORK 0x02 /* child wakeup after fork */
1049 1035
1050#define ENQUEUE_WAKEUP 1 1036#define ENQUEUE_WAKEUP 1
1051#define ENQUEUE_WAKING 2 1037#define ENQUEUE_HEAD 2
1052#define ENQUEUE_HEAD 4 1038#ifdef CONFIG_SMP
1039#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
1040#else
1041#define ENQUEUE_WAKING 0
1042#endif
1053 1043
1054#define DEQUEUE_SLEEP 1 1044#define DEQUEUE_SLEEP 1
1055 1045
@@ -1067,12 +1057,11 @@ struct sched_class {
1067 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1057 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1068 1058
1069#ifdef CONFIG_SMP 1059#ifdef CONFIG_SMP
1070 int (*select_task_rq)(struct rq *rq, struct task_struct *p, 1060 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
1071 int sd_flag, int flags);
1072 1061
1073 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1062 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1074 void (*post_schedule) (struct rq *this_rq); 1063 void (*post_schedule) (struct rq *this_rq);
1075 void (*task_waking) (struct rq *this_rq, struct task_struct *task); 1064 void (*task_waking) (struct task_struct *task);
1076 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1065 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
1077 1066
1078 void (*set_cpus_allowed)(struct task_struct *p, 1067 void (*set_cpus_allowed)(struct task_struct *p,
@@ -1197,13 +1186,11 @@ struct task_struct {
1197 unsigned int flags; /* per process flags, defined below */ 1186 unsigned int flags; /* per process flags, defined below */
1198 unsigned int ptrace; 1187 unsigned int ptrace;
1199 1188
1200 int lock_depth; /* BKL lock depth */
1201
1202#ifdef CONFIG_SMP 1189#ifdef CONFIG_SMP
1203#ifdef __ARCH_WANT_UNLOCKED_CTXSW 1190 struct task_struct *wake_entry;
1204 int oncpu; 1191 int on_cpu;
1205#endif
1206#endif 1192#endif
1193 int on_rq;
1207 1194
1208 int prio, static_prio, normal_prio; 1195 int prio, static_prio, normal_prio;
1209 unsigned int rt_priority; 1196 unsigned int rt_priority;
@@ -1274,6 +1261,7 @@ struct task_struct {
1274 1261
1275 /* Revert to default priority/policy when forking */ 1262 /* Revert to default priority/policy when forking */
1276 unsigned sched_reset_on_fork:1; 1263 unsigned sched_reset_on_fork:1;
1264 unsigned sched_contributes_to_load:1;
1277 1265
1278 pid_t pid; 1266 pid_t pid;
1279 pid_t tgid; 1267 pid_t tgid;
@@ -2063,14 +2051,13 @@ extern void xtime_update(unsigned long ticks);
2063 2051
2064extern int wake_up_state(struct task_struct *tsk, unsigned int state); 2052extern int wake_up_state(struct task_struct *tsk, unsigned int state);
2065extern int wake_up_process(struct task_struct *tsk); 2053extern int wake_up_process(struct task_struct *tsk);
2066extern void wake_up_new_task(struct task_struct *tsk, 2054extern void wake_up_new_task(struct task_struct *tsk);
2067 unsigned long clone_flags);
2068#ifdef CONFIG_SMP 2055#ifdef CONFIG_SMP
2069 extern void kick_process(struct task_struct *tsk); 2056 extern void kick_process(struct task_struct *tsk);
2070#else 2057#else
2071 static inline void kick_process(struct task_struct *tsk) { } 2058 static inline void kick_process(struct task_struct *tsk) { }
2072#endif 2059#endif
2073extern void sched_fork(struct task_struct *p, int clone_flags); 2060extern void sched_fork(struct task_struct *p);
2074extern void sched_dead(struct task_struct *p); 2061extern void sched_dead(struct task_struct *p);
2075 2062
2076extern void proc_caches_init(void); 2063extern void proc_caches_init(void);
@@ -2195,8 +2182,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
2195extern char *get_task_comm(char *to, struct task_struct *tsk); 2182extern char *get_task_comm(char *to, struct task_struct *tsk);
2196 2183
2197#ifdef CONFIG_SMP 2184#ifdef CONFIG_SMP
2185void scheduler_ipi(void);
2198extern unsigned long wait_task_inactive(struct task_struct *, long match_state); 2186extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
2199#else 2187#else
2188static inline void scheduler_ipi(void) { }
2200static inline unsigned long wait_task_inactive(struct task_struct *p, 2189static inline unsigned long wait_task_inactive(struct task_struct *p,
2201 long match_state) 2190 long match_state)
2202{ 2191{
diff --git a/init/Kconfig b/init/Kconfig
index 7a71e0a9992a..af958ad26d60 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -827,6 +827,11 @@ config SCHED_AUTOGROUP
827 desktop applications. Task group autogeneration is currently based 827 desktop applications. Task group autogeneration is currently based
828 upon task session. 828 upon task session.
829 829
830config SCHED_TTWU_QUEUE
831 bool
832 depends on !SPARC32
833 default y
834
830config MM_OWNER 835config MM_OWNER
831 bool 836 bool
832 837
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 33eee16addb8..2bb8c2e98fff 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
1159static int update_relax_domain_level(struct cpuset *cs, s64 val) 1159static int update_relax_domain_level(struct cpuset *cs, s64 val)
1160{ 1160{
1161#ifdef CONFIG_SMP 1161#ifdef CONFIG_SMP
1162 if (val < -1 || val >= SD_LV_MAX) 1162 if (val < -1 || val >= sched_domain_level_max)
1163 return -EINVAL; 1163 return -EINVAL;
1164#endif 1164#endif
1165 1165
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548dee636b..2b44d82b8237 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1103,7 +1103,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1103 1103
1104 posix_cpu_timers_init(p); 1104 posix_cpu_timers_init(p);
1105 1105
1106 p->lock_depth = -1; /* -1 = no lock */
1107 do_posix_clock_monotonic_gettime(&p->start_time); 1106 do_posix_clock_monotonic_gettime(&p->start_time);
1108 p->real_start_time = p->start_time; 1107 p->real_start_time = p->start_time;
1109 monotonic_to_bootbased(&p->real_start_time); 1108 monotonic_to_bootbased(&p->real_start_time);
@@ -1153,7 +1152,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1153#endif 1152#endif
1154 1153
1155 /* Perform scheduler related setup. Assign this task to a CPU. */ 1154 /* Perform scheduler related setup. Assign this task to a CPU. */
1156 sched_fork(p, clone_flags); 1155 sched_fork(p);
1157 1156
1158 retval = perf_event_init_task(p); 1157 retval = perf_event_init_task(p);
1159 if (retval) 1158 if (retval)
@@ -1464,7 +1463,7 @@ long do_fork(unsigned long clone_flags,
1464 */ 1463 */
1465 p->flags &= ~PF_STARTING; 1464 p->flags &= ~PF_STARTING;
1466 1465
1467 wake_up_new_task(p, clone_flags); 1466 wake_up_new_task(p);
1468 1467
1469 tracehook_report_clone_complete(trace, regs, 1468 tracehook_report_clone_complete(trace, regs,
1470 clone_flags, nr, p); 1469 clone_flags, nr, p);
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index ec815a960b5d..73da83aff418 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 78 DEBUG_LOCKS_WARN_ON(lock->owner != current);
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 80 mutex_clear_owner(lock);
81} 81}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 57d527a16f9d..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
29 29
30static inline void mutex_set_owner(struct mutex *lock) 30static inline void mutex_set_owner(struct mutex *lock)
31{ 31{
32 lock->owner = current_thread_info(); 32 lock->owner = current;
33} 33}
34 34
35static inline void mutex_clear_owner(struct mutex *lock) 35static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index c4195fa98900..2c938e2337cd 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
160 */ 160 */
161 161
162 for (;;) { 162 for (;;) {
163 struct thread_info *owner; 163 struct task_struct *owner;
164
165 /*
166 * If we own the BKL, then don't spin. The owner of
167 * the mutex might be waiting on us to release the BKL.
168 */
169 if (unlikely(current->lock_depth >= 0))
170 break;
171 164
172 /* 165 /*
173 * If there's an owner, wait for it to either 166 * If there's an owner, wait for it to either
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 67578ca48f94..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -19,7 +19,7 @@
19#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current_thread_info(); 22 lock->owner = current;
23} 23}
24 24
25static inline void mutex_clear_owner(struct mutex *lock) 25static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/sched.c b/kernel/sched.c
index 312f8b95c2d4..c62acf45d3b9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
231#endif 231#endif
232 232
233/* 233/*
234 * sched_domains_mutex serializes calls to arch_init_sched_domains, 234 * sched_domains_mutex serializes calls to init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 235 * detach_destroy_domains and partition_sched_domains.
236 */ 236 */
237static DEFINE_MUTEX(sched_domains_mutex); 237static DEFINE_MUTEX(sched_domains_mutex);
@@ -312,6 +312,9 @@ struct cfs_rq {
312 312
313 u64 exec_clock; 313 u64 exec_clock;
314 u64 min_vruntime; 314 u64 min_vruntime;
315#ifndef CONFIG_64BIT
316 u64 min_vruntime_copy;
317#endif
315 318
316 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
317 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
@@ -325,7 +328,9 @@ struct cfs_rq {
325 */ 328 */
326 struct sched_entity *curr, *next, *last, *skip; 329 struct sched_entity *curr, *next, *last, *skip;
327 330
331#ifdef CONFIG_SCHED_DEBUG
328 unsigned int nr_spread_over; 332 unsigned int nr_spread_over;
333#endif
329 334
330#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
331 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 336 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -417,6 +422,7 @@ struct rt_rq {
417 */ 422 */
418struct root_domain { 423struct root_domain {
419 atomic_t refcount; 424 atomic_t refcount;
425 struct rcu_head rcu;
420 cpumask_var_t span; 426 cpumask_var_t span;
421 cpumask_var_t online; 427 cpumask_var_t online;
422 428
@@ -460,7 +466,7 @@ struct rq {
460 u64 nohz_stamp; 466 u64 nohz_stamp;
461 unsigned char nohz_balance_kick; 467 unsigned char nohz_balance_kick;
462#endif 468#endif
463 unsigned int skip_clock_update; 469 int skip_clock_update;
464 470
465 /* capture load from *all* tasks on this cpu: */ 471 /* capture load from *all* tasks on this cpu: */
466 struct load_weight load; 472 struct load_weight load;
@@ -553,6 +559,10 @@ struct rq {
553 unsigned int ttwu_count; 559 unsigned int ttwu_count;
554 unsigned int ttwu_local; 560 unsigned int ttwu_local;
555#endif 561#endif
562
563#ifdef CONFIG_SMP
564 struct task_struct *wake_list;
565#endif
556}; 566};
557 567
558static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 568static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -571,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
571 581
572#define rcu_dereference_check_sched_domain(p) \ 582#define rcu_dereference_check_sched_domain(p) \
573 rcu_dereference_check((p), \ 583 rcu_dereference_check((p), \
574 rcu_read_lock_sched_held() || \ 584 rcu_read_lock_held() || \
575 lockdep_is_held(&sched_domains_mutex)) 585 lockdep_is_held(&sched_domains_mutex))
576 586
577/* 587/*
@@ -596,7 +606,7 @@ static inline int cpu_of(struct rq *rq)
596 * Return the group to which this tasks belongs. 606 * Return the group to which this tasks belongs.
597 * 607 *
598 * We use task_subsys_state_check() and extend the RCU verification 608 * We use task_subsys_state_check() and extend the RCU verification
599 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 609 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
600 * holds that lock for each task it moves into the cgroup. Therefore 610 * holds that lock for each task it moves into the cgroup. Therefore
601 * by holding that lock, we pin the task to the current cgroup. 611 * by holding that lock, we pin the task to the current cgroup.
602 */ 612 */
@@ -606,7 +616,7 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct cgroup_subsys_state *css; 616 struct cgroup_subsys_state *css;
607 617
608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
609 lockdep_is_held(&task_rq(p)->lock)); 619 lockdep_is_held(&p->pi_lock));
610 tg = container_of(css, struct task_group, css); 620 tg = container_of(css, struct task_group, css);
611 621
612 return autogroup_task_group(p, tg); 622 return autogroup_task_group(p, tg);
@@ -642,7 +652,7 @@ static void update_rq_clock(struct rq *rq)
642{ 652{
643 s64 delta; 653 s64 delta;
644 654
645 if (rq->skip_clock_update) 655 if (rq->skip_clock_update > 0)
646 return; 656 return;
647 657
648 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 658 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -838,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
838 return rq->curr == p; 848 return rq->curr == p;
839} 849}
840 850
841#ifndef __ARCH_WANT_UNLOCKED_CTXSW
842static inline int task_running(struct rq *rq, struct task_struct *p) 851static inline int task_running(struct rq *rq, struct task_struct *p)
843{ 852{
853#ifdef CONFIG_SMP
854 return p->on_cpu;
855#else
844 return task_current(rq, p); 856 return task_current(rq, p);
857#endif
845} 858}
846 859
860#ifndef __ARCH_WANT_UNLOCKED_CTXSW
847static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 861static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
848{ 862{
863#ifdef CONFIG_SMP
864 /*
865 * We can optimise this out completely for !SMP, because the
866 * SMP rebalancing from interrupt is the only thing that cares
867 * here.
868 */
869 next->on_cpu = 1;
870#endif
849} 871}
850 872
851static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 873static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
852{ 874{
875#ifdef CONFIG_SMP
876 /*
877 * After ->on_cpu is cleared, the task can be moved to a different CPU.
878 * We must ensure this doesn't happen until the switch is completely
879 * finished.
880 */
881 smp_wmb();
882 prev->on_cpu = 0;
883#endif
853#ifdef CONFIG_DEBUG_SPINLOCK 884#ifdef CONFIG_DEBUG_SPINLOCK
854 /* this is a valid case when another task releases the spinlock */ 885 /* this is a valid case when another task releases the spinlock */
855 rq->lock.owner = current; 886 rq->lock.owner = current;
@@ -865,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
865} 896}
866 897
867#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 898#else /* __ARCH_WANT_UNLOCKED_CTXSW */
868static inline int task_running(struct rq *rq, struct task_struct *p)
869{
870#ifdef CONFIG_SMP
871 return p->oncpu;
872#else
873 return task_current(rq, p);
874#endif
875}
876
877static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 899static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
878{ 900{
879#ifdef CONFIG_SMP 901#ifdef CONFIG_SMP
@@ -882,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
882 * SMP rebalancing from interrupt is the only thing that cares 904 * SMP rebalancing from interrupt is the only thing that cares
883 * here. 905 * here.
884 */ 906 */
885 next->oncpu = 1; 907 next->on_cpu = 1;
886#endif 908#endif
887#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 909#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
888 raw_spin_unlock_irq(&rq->lock); 910 raw_spin_unlock_irq(&rq->lock);
@@ -895,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
895{ 917{
896#ifdef CONFIG_SMP 918#ifdef CONFIG_SMP
897 /* 919 /*
898 * After ->oncpu is cleared, the task can be moved to a different CPU. 920 * After ->on_cpu is cleared, the task can be moved to a different CPU.
899 * We must ensure this doesn't happen until the switch is completely 921 * We must ensure this doesn't happen until the switch is completely
900 * finished. 922 * finished.
901 */ 923 */
902 smp_wmb(); 924 smp_wmb();
903 prev->oncpu = 0; 925 prev->on_cpu = 0;
904#endif 926#endif
905#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 927#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
906 local_irq_enable(); 928 local_irq_enable();
@@ -909,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
909#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 931#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
910 932
911/* 933/*
912 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 934 * __task_rq_lock - lock the rq @p resides on.
913 * against ttwu().
914 */
915static inline int task_is_waking(struct task_struct *p)
916{
917 return unlikely(p->state == TASK_WAKING);
918}
919
920/*
921 * __task_rq_lock - lock the runqueue a given task resides on.
922 * Must be called interrupts disabled.
923 */ 935 */
924static inline struct rq *__task_rq_lock(struct task_struct *p) 936static inline struct rq *__task_rq_lock(struct task_struct *p)
925 __acquires(rq->lock) 937 __acquires(rq->lock)
926{ 938{
927 struct rq *rq; 939 struct rq *rq;
928 940
941 lockdep_assert_held(&p->pi_lock);
942
929 for (;;) { 943 for (;;) {
930 rq = task_rq(p); 944 rq = task_rq(p);
931 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
@@ -936,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936} 950}
937 951
938/* 952/*
939 * task_rq_lock - lock the runqueue a given task resides on and disable 953 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
940 * interrupts. Note the ordering: we can safely lookup the task_rq without
941 * explicitly disabling preemption.
942 */ 954 */
943static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 955static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
956 __acquires(p->pi_lock)
944 __acquires(rq->lock) 957 __acquires(rq->lock)
945{ 958{
946 struct rq *rq; 959 struct rq *rq;
947 960
948 for (;;) { 961 for (;;) {
949 local_irq_save(*flags); 962 raw_spin_lock_irqsave(&p->pi_lock, *flags);
950 rq = task_rq(p); 963 rq = task_rq(p);
951 raw_spin_lock(&rq->lock); 964 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 965 if (likely(rq == task_rq(p)))
953 return rq; 966 return rq;
954 raw_spin_unlock_irqrestore(&rq->lock, *flags); 967 raw_spin_unlock(&rq->lock);
968 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
955 } 969 }
956} 970}
957 971
@@ -961,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq)
961 raw_spin_unlock(&rq->lock); 975 raw_spin_unlock(&rq->lock);
962} 976}
963 977
964static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 978static inline void
979task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
965 __releases(rq->lock) 980 __releases(rq->lock)
981 __releases(p->pi_lock)
966{ 982{
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 983 raw_spin_unlock(&rq->lock);
984 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
968} 985}
969 986
970/* 987/*
@@ -1193,11 +1210,17 @@ int get_nohz_timer_target(void)
1193 int i; 1210 int i;
1194 struct sched_domain *sd; 1211 struct sched_domain *sd;
1195 1212
1213 rcu_read_lock();
1196 for_each_domain(cpu, sd) { 1214 for_each_domain(cpu, sd) {
1197 for_each_cpu(i, sched_domain_span(sd)) 1215 for_each_cpu(i, sched_domain_span(sd)) {
1198 if (!idle_cpu(i)) 1216 if (!idle_cpu(i)) {
1199 return i; 1217 cpu = i;
1218 goto unlock;
1219 }
1220 }
1200 } 1221 }
1222unlock:
1223 rcu_read_unlock();
1201 return cpu; 1224 return cpu;
1202} 1225}
1203/* 1226/*
@@ -1307,15 +1330,15 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1307{ 1330{
1308 u64 tmp; 1331 u64 tmp;
1309 1332
1333 tmp = (u64)delta_exec * weight;
1334
1310 if (!lw->inv_weight) { 1335 if (!lw->inv_weight) {
1311 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1336 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1312 lw->inv_weight = 1; 1337 lw->inv_weight = 1;
1313 else 1338 else
1314 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1339 lw->inv_weight = WMULT_CONST / lw->weight;
1315 / (lw->weight+1);
1316 } 1340 }
1317 1341
1318 tmp = (u64)delta_exec * weight;
1319 /* 1342 /*
1320 * Check whether we'd overflow the 64-bit multiplication: 1343 * Check whether we'd overflow the 64-bit multiplication:
1321 */ 1344 */
@@ -1773,7 +1796,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1773 update_rq_clock(rq); 1796 update_rq_clock(rq);
1774 sched_info_queued(p); 1797 sched_info_queued(p);
1775 p->sched_class->enqueue_task(rq, p, flags); 1798 p->sched_class->enqueue_task(rq, p, flags);
1776 p->se.on_rq = 1;
1777} 1799}
1778 1800
1779static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1801static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1781,7 +1803,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1781 update_rq_clock(rq); 1803 update_rq_clock(rq);
1782 sched_info_dequeued(p); 1804 sched_info_dequeued(p);
1783 p->sched_class->dequeue_task(rq, p, flags); 1805 p->sched_class->dequeue_task(rq, p, flags);
1784 p->se.on_rq = 0;
1785} 1806}
1786 1807
1787/* 1808/*
@@ -2116,7 +2137,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2116 * A queue event has occurred, and we're going to schedule. In 2137 * A queue event has occurred, and we're going to schedule. In
2117 * this case, we can save a useless back to back clock update. 2138 * this case, we can save a useless back to back clock update.
2118 */ 2139 */
2119 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) 2140 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2120 rq->skip_clock_update = 1; 2141 rq->skip_clock_update = 1;
2121} 2142}
2122 2143
@@ -2162,6 +2183,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2162 */ 2183 */
2163 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2184 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2164 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2185 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2186
2187#ifdef CONFIG_LOCKDEP
2188 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2189 lockdep_is_held(&task_rq(p)->lock)));
2190#endif
2165#endif 2191#endif
2166 2192
2167 trace_sched_migrate_task(p, new_cpu); 2193 trace_sched_migrate_task(p, new_cpu);
@@ -2182,19 +2208,6 @@ struct migration_arg {
2182static int migration_cpu_stop(void *data); 2208static int migration_cpu_stop(void *data);
2183 2209
2184/* 2210/*
2185 * The task's runqueue lock must be held.
2186 * Returns true if you have to wait for migration thread.
2187 */
2188static bool migrate_task(struct task_struct *p, struct rq *rq)
2189{
2190 /*
2191 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task.
2193 */
2194 return p->se.on_rq || task_running(rq, p);
2195}
2196
2197/*
2198 * wait_task_inactive - wait for a thread to unschedule. 2211 * wait_task_inactive - wait for a thread to unschedule.
2199 * 2212 *
2200 * If @match_state is nonzero, it's the @p->state value just checked and 2213 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2251,11 +2264,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2251 rq = task_rq_lock(p, &flags); 2264 rq = task_rq_lock(p, &flags);
2252 trace_sched_wait_task(p); 2265 trace_sched_wait_task(p);
2253 running = task_running(rq, p); 2266 running = task_running(rq, p);
2254 on_rq = p->se.on_rq; 2267 on_rq = p->on_rq;
2255 ncsw = 0; 2268 ncsw = 0;
2256 if (!match_state || p->state == match_state) 2269 if (!match_state || p->state == match_state)
2257 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2270 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2258 task_rq_unlock(rq, &flags); 2271 task_rq_unlock(rq, p, &flags);
2259 2272
2260 /* 2273 /*
2261 * If it changed from the expected state, bail out now. 2274 * If it changed from the expected state, bail out now.
@@ -2330,7 +2343,7 @@ EXPORT_SYMBOL_GPL(kick_process);
2330 2343
2331#ifdef CONFIG_SMP 2344#ifdef CONFIG_SMP
2332/* 2345/*
2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2346 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2334 */ 2347 */
2335static int select_fallback_rq(int cpu, struct task_struct *p) 2348static int select_fallback_rq(int cpu, struct task_struct *p)
2336{ 2349{
@@ -2363,12 +2376,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2363} 2376}
2364 2377
2365/* 2378/*
2366 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2379 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2367 */ 2380 */
2368static inline 2381static inline
2369int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2382int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2370{ 2383{
2371 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2384 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2372 2385
2373 /* 2386 /*
2374 * In order not to call set_task_cpu() on a blocking task we need 2387 * In order not to call set_task_cpu() on a blocking task we need
@@ -2394,27 +2407,62 @@ static void update_avg(u64 *avg, u64 sample)
2394} 2407}
2395#endif 2408#endif
2396 2409
2397static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2410static void
2398 bool is_sync, bool is_migrate, bool is_local, 2411ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2399 unsigned long en_flags)
2400{ 2412{
2413#ifdef CONFIG_SCHEDSTATS
2414 struct rq *rq = this_rq();
2415
2416#ifdef CONFIG_SMP
2417 int this_cpu = smp_processor_id();
2418
2419 if (cpu == this_cpu) {
2420 schedstat_inc(rq, ttwu_local);
2421 schedstat_inc(p, se.statistics.nr_wakeups_local);
2422 } else {
2423 struct sched_domain *sd;
2424
2425 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2426 rcu_read_lock();
2427 for_each_domain(this_cpu, sd) {
2428 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2429 schedstat_inc(sd, ttwu_wake_remote);
2430 break;
2431 }
2432 }
2433 rcu_read_unlock();
2434 }
2435#endif /* CONFIG_SMP */
2436
2437 schedstat_inc(rq, ttwu_count);
2401 schedstat_inc(p, se.statistics.nr_wakeups); 2438 schedstat_inc(p, se.statistics.nr_wakeups);
2402 if (is_sync) 2439
2440 if (wake_flags & WF_SYNC)
2403 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2441 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2404 if (is_migrate) 2442
2443 if (cpu != task_cpu(p))
2405 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2444 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2406 if (is_local)
2407 schedstat_inc(p, se.statistics.nr_wakeups_local);
2408 else
2409 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2410 2445
2446#endif /* CONFIG_SCHEDSTATS */
2447}
2448
2449static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2450{
2411 activate_task(rq, p, en_flags); 2451 activate_task(rq, p, en_flags);
2452 p->on_rq = 1;
2453
2454 /* if a worker is waking up, notify workqueue */
2455 if (p->flags & PF_WQ_WORKER)
2456 wq_worker_waking_up(p, cpu_of(rq));
2412} 2457}
2413 2458
2414static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2459/*
2415 int wake_flags, bool success) 2460 * Mark the task runnable and perform wakeup-preemption.
2461 */
2462static void
2463ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2416{ 2464{
2417 trace_sched_wakeup(p, success); 2465 trace_sched_wakeup(p, true);
2418 check_preempt_curr(rq, p, wake_flags); 2466 check_preempt_curr(rq, p, wake_flags);
2419 2467
2420 p->state = TASK_RUNNING; 2468 p->state = TASK_RUNNING;
@@ -2433,9 +2481,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2433 rq->idle_stamp = 0; 2481 rq->idle_stamp = 0;
2434 } 2482 }
2435#endif 2483#endif
2436 /* if a worker is waking up, notify workqueue */ 2484}
2437 if ((p->flags & PF_WQ_WORKER) && success) 2485
2438 wq_worker_waking_up(p, cpu_of(rq)); 2486static void
2487ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2488{
2489#ifdef CONFIG_SMP
2490 if (p->sched_contributes_to_load)
2491 rq->nr_uninterruptible--;
2492#endif
2493
2494 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2495 ttwu_do_wakeup(rq, p, wake_flags);
2496}
2497
2498/*
2499 * Called in case the task @p isn't fully descheduled from its runqueue;
2500 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2501 * since all we need to do is flip p->state to TASK_RUNNING; the task
2502 * is still ->on_rq.
2503 */
2504static int ttwu_remote(struct task_struct *p, int wake_flags)
2505{
2506 struct rq *rq;
2507 int ret = 0;
2508
2509 rq = __task_rq_lock(p);
2510 if (p->on_rq) {
2511 ttwu_do_wakeup(rq, p, wake_flags);
2512 ret = 1;
2513 }
2514 __task_rq_unlock(rq);
2515
2516 return ret;
2517}
2518
2519#ifdef CONFIG_SMP
2520static void sched_ttwu_pending(void)
2521{
2522 struct rq *rq = this_rq();
2523 struct task_struct *list = xchg(&rq->wake_list, NULL);
2524
2525 if (!list)
2526 return;
2527
2528 raw_spin_lock(&rq->lock);
2529
2530 while (list) {
2531 struct task_struct *p = list;
2532 list = list->wake_entry;
2533 ttwu_do_activate(rq, p, 0);
2534 }
2535
2536 raw_spin_unlock(&rq->lock);
2537}
2538
2539void scheduler_ipi(void)
2540{
2541 sched_ttwu_pending();
2542}
2543
2544static void ttwu_queue_remote(struct task_struct *p, int cpu)
2545{
2546 struct rq *rq = cpu_rq(cpu);
2547 struct task_struct *next = rq->wake_list;
2548
2549 for (;;) {
2550 struct task_struct *old = next;
2551
2552 p->wake_entry = next;
2553 next = cmpxchg(&rq->wake_list, old, p);
2554 if (next == old)
2555 break;
2556 }
2557
2558 if (!next)
2559 smp_send_reschedule(cpu);
2560}
2561#endif
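
/*
 * A minimal user-space sketch of the lock-free wake-list scheme above,
 * using C11 atomics: producers push with a cmpxchg loop and only need
 * to send an IPI when the list was previously empty, while the consumer
 * drains everything with a single xchg.  "struct task" and
 * "struct runqueue" are invented stand-ins and the demo is
 * single-threaded for brevity.
 */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct task {
	int id;
	struct task *wake_entry;                /* next pointer while queued */
};

struct runqueue {
	_Atomic(struct task *) wake_list;       /* LIFO list of tasks to wake */
};

/* Push @p onto @rq->wake_list; returns 1 if the list was empty (IPI needed). */
static int wake_list_push(struct runqueue *rq, struct task *p)
{
	struct task *next = atomic_load(&rq->wake_list);

	for (;;) {
		p->wake_entry = next;
		/* like cmpxchg(): only succeed if nobody pushed meanwhile */
		if (atomic_compare_exchange_weak(&rq->wake_list, &next, p))
			return next == NULL;
	}
}

/* Drain the whole list at once, like the xchg() in sched_ttwu_pending(). */
static void wake_list_drain(struct runqueue *rq)
{
	struct task *list = atomic_exchange(&rq->wake_list, NULL);

	while (list) {
		struct task *p = list;

		list = list->wake_entry;
		printf("waking task %d\n", p->id);
	}
}

int main(void)
{
	struct runqueue rq = { .wake_list = NULL };
	struct task a = { .id = 1 }, b = { .id = 2 };

	if (wake_list_push(&rq, &a))
		printf("list was empty: would send reschedule IPI\n");
	wake_list_push(&rq, &b);
	wake_list_drain(&rq);
	return 0;
}
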
2562
2563static void ttwu_queue(struct task_struct *p, int cpu)
2564{
2565 struct rq *rq = cpu_rq(cpu);
2566
2567#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE)
2568 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2569 ttwu_queue_remote(p, cpu);
2570 return;
2571 }
2572#endif
2573
2574 raw_spin_lock(&rq->lock);
2575 ttwu_do_activate(rq, p, 0);
2576 raw_spin_unlock(&rq->lock);
2439} 2577}
2440 2578
2441/** 2579/**
@@ -2453,92 +2591,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2453 * Returns %true if @p was woken up, %false if it was already running 2591 * Returns %true if @p was woken up, %false if it was already running
2454 * or @state didn't match @p's state. 2592 * or @state didn't match @p's state.
2455 */ 2593 */
2456static int try_to_wake_up(struct task_struct *p, unsigned int state, 2594static int
2457 int wake_flags) 2595try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2458{ 2596{
2459 int cpu, orig_cpu, this_cpu, success = 0;
2460 unsigned long flags; 2597 unsigned long flags;
2461 unsigned long en_flags = ENQUEUE_WAKEUP; 2598 int cpu, success = 0;
2462 struct rq *rq;
2463
2464 this_cpu = get_cpu();
2465 2599
2466 smp_wmb(); 2600 smp_wmb();
2467 rq = task_rq_lock(p, &flags); 2601 raw_spin_lock_irqsave(&p->pi_lock, flags);
2468 if (!(p->state & state)) 2602 if (!(p->state & state))
2469 goto out; 2603 goto out;
2470 2604
2471 if (p->se.on_rq) 2605 success = 1; /* we're going to change ->state */
2472 goto out_running;
2473
2474 cpu = task_cpu(p); 2606 cpu = task_cpu(p);
2475 orig_cpu = cpu;
2476 2607
2477#ifdef CONFIG_SMP 2608 if (p->on_rq && ttwu_remote(p, wake_flags))
2478 if (unlikely(task_running(rq, p))) 2609 goto stat;
2479 goto out_activate;
2480 2610
2611#ifdef CONFIG_SMP
2481 /* 2612 /*
2482 * In order to handle concurrent wakeups and release the rq->lock 2613 * this task as prev, wait until it's done referencing the task.
2483 * we put the task in TASK_WAKING state. 2614 * this task as prev, wait until its done referencing the task.
2484 *
2485 * First fix up the nr_uninterruptible count:
2486 */ 2615 */
2487 if (task_contributes_to_load(p)) { 2616 while (p->on_cpu) {
2488 if (likely(cpu_online(orig_cpu))) 2617#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2489 rq->nr_uninterruptible--; 2618 /*
2490 else 2619 * If called from interrupt context we could have landed in the
2491 this_rq()->nr_uninterruptible--; 2620 * middle of schedule(); in this case we should take care not
2492 } 2621 * to spin on ->on_cpu if p is current, since that would
2493 p->state = TASK_WAKING; 2622 * deadlock.
2494 2623 */
2495 if (p->sched_class->task_waking) { 2624 if (p == current) {
2496 p->sched_class->task_waking(rq, p); 2625 ttwu_queue(p, cpu);
2497 en_flags |= ENQUEUE_WAKING; 2626 goto stat;
2627 }
2628#endif
2629 cpu_relax();
2498 } 2630 }
2499
2500 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2501 if (cpu != orig_cpu)
2502 set_task_cpu(p, cpu);
2503 __task_rq_unlock(rq);
2504
2505 rq = cpu_rq(cpu);
2506 raw_spin_lock(&rq->lock);
2507
2508 /* 2631 /*
2509 * We migrated the task without holding either rq->lock, however 2632 * Pairs with the smp_wmb() in finish_lock_switch().
2510 * since the task is not on the task list itself, nobody else
2511 * will try and migrate the task, hence the rq should match the
2512 * cpu we just moved it to.
2513 */ 2633 */
2514 WARN_ON(task_cpu(p) != cpu); 2634 smp_rmb();
2515 WARN_ON(p->state != TASK_WAKING);
2516 2635
2517#ifdef CONFIG_SCHEDSTATS 2636 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2518 schedstat_inc(rq, ttwu_count); 2637 p->state = TASK_WAKING;
2519 if (cpu == this_cpu) 2638
2520 schedstat_inc(rq, ttwu_local); 2639 if (p->sched_class->task_waking)
2521 else { 2640 p->sched_class->task_waking(p);
2522 struct sched_domain *sd;
2523 for_each_domain(this_cpu, sd) {
2524 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2525 schedstat_inc(sd, ttwu_wake_remote);
2526 break;
2527 }
2528 }
2529 }
2530#endif /* CONFIG_SCHEDSTATS */
2531 2641
2532out_activate: 2642 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2643 if (task_cpu(p) != cpu)
2644 set_task_cpu(p, cpu);
2533#endif /* CONFIG_SMP */ 2645#endif /* CONFIG_SMP */
2534 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2646
2535 cpu == this_cpu, en_flags); 2647 ttwu_queue(p, cpu);
2536 success = 1; 2648stat:
2537out_running: 2649 ttwu_stat(p, cpu, wake_flags);
2538 ttwu_post_activation(p, rq, wake_flags, success);
2539out: 2650out:
2540 task_rq_unlock(rq, &flags); 2651 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2541 put_cpu();
2542 2652
2543 return success; 2653 return success;
2544} 2654}
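
/*
 * A sketch of the ->on_cpu handshake relied on above: the descheduling
 * side publishes its updates and then clears on_cpu with release
 * semantics (the kernel's smp_wmb() in finish_lock_switch()), while the
 * waker spins with acquire loads before touching the task (the
 * smp_rmb() above).  Uses C11 atomics and pthreads with an invented
 * task layout; build with -pthread.
 */
#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct task {
	atomic_int on_cpu;      /* still running on its old CPU? */
	int saved_state;        /* written by the "scheduler" before clearing on_cpu */
};

static struct task t = { .on_cpu = 1 };

/* Models finish_lock_switch(): publish the task state, then drop on_cpu. */
static void *deschedule(void *arg)
{
	(void)arg;
	t.saved_state = 42;
	atomic_store_explicit(&t.on_cpu, 0, memory_order_release);
	return NULL;
}

/* Models the ttwu() spin: wait until the old CPU is done with the task. */
static void *waker(void *arg)
{
	(void)arg;
	while (atomic_load_explicit(&t.on_cpu, memory_order_acquire))
		;                               /* cpu_relax() */
	assert(t.saved_state == 42);            /* guaranteed by the ordering */
	printf("task fully descheduled, safe to wake\n");
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, waker, NULL);
	pthread_create(&b, NULL, deschedule, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}
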
@@ -2547,31 +2657,34 @@ out:
2547 * try_to_wake_up_local - try to wake up a local task with rq lock held 2657 * try_to_wake_up_local - try to wake up a local task with rq lock held
2548 * @p: the thread to be awakened 2658 * @p: the thread to be awakened
2549 * 2659 *
2550 * Put @p on the run-queue if it's not already there. The caller must 2660 * Put @p on the run-queue if it's not already there. The caller must
2551 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2661 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2552 * the current task. this_rq() stays locked over invocation. 2662 * the current task.
2553 */ 2663 */
2554static void try_to_wake_up_local(struct task_struct *p) 2664static void try_to_wake_up_local(struct task_struct *p)
2555{ 2665{
2556 struct rq *rq = task_rq(p); 2666 struct rq *rq = task_rq(p);
2557 bool success = false;
2558 2667
2559 BUG_ON(rq != this_rq()); 2668 BUG_ON(rq != this_rq());
2560 BUG_ON(p == current); 2669 BUG_ON(p == current);
2561 lockdep_assert_held(&rq->lock); 2670 lockdep_assert_held(&rq->lock);
2562 2671
2672 if (!raw_spin_trylock(&p->pi_lock)) {
2673 raw_spin_unlock(&rq->lock);
2674 raw_spin_lock(&p->pi_lock);
2675 raw_spin_lock(&rq->lock);
2676 }
2677
2563 if (!(p->state & TASK_NORMAL)) 2678 if (!(p->state & TASK_NORMAL))
2564 return; 2679 goto out;
2565 2680
2566 if (!p->se.on_rq) { 2681 if (!p->on_rq)
2567 if (likely(!task_running(rq, p))) { 2682 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2568 schedstat_inc(rq, ttwu_count); 2683
2569 schedstat_inc(rq, ttwu_local); 2684 ttwu_do_wakeup(rq, p, 0);
2570 } 2685 ttwu_stat(p, smp_processor_id(), 0);
2571 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2686out:
2572 success = true; 2687 raw_spin_unlock(&p->pi_lock);
2573 }
2574 ttwu_post_activation(p, rq, 0, success);
2575} 2688}
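
/*
 * A sketch of the trylock dance above, assuming a pi_lock -> rq_lock
 * lock order: with rq_lock already held, try-lock pi_lock and, on
 * failure, drop rq_lock and reacquire both in the canonical order.
 * pthread mutexes stand in for the raw spinlocks; build with -pthread.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller holds rq_lock and also wants pi_lock without deadlocking. */
static void lock_both_from_rq(void)
{
	if (pthread_mutex_trylock(&pi_lock) != 0) {
		/* contention: back off to the canonical order */
		pthread_mutex_unlock(&rq_lock);
		pthread_mutex_lock(&pi_lock);
		pthread_mutex_lock(&rq_lock);
		/* note: protected state may have changed while unlocked */
	}
}

int main(void)
{
	pthread_mutex_lock(&rq_lock);
	lock_both_from_rq();
	printf("holding both locks without violating the lock order\n");
	pthread_mutex_unlock(&rq_lock);
	pthread_mutex_unlock(&pi_lock);
	return 0;
}
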
2576 2689
2577/** 2690/**
@@ -2604,19 +2717,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2604 */ 2717 */
2605static void __sched_fork(struct task_struct *p) 2718static void __sched_fork(struct task_struct *p)
2606{ 2719{
2720 p->on_rq = 0;
2721
2722 p->se.on_rq = 0;
2607 p->se.exec_start = 0; 2723 p->se.exec_start = 0;
2608 p->se.sum_exec_runtime = 0; 2724 p->se.sum_exec_runtime = 0;
2609 p->se.prev_sum_exec_runtime = 0; 2725 p->se.prev_sum_exec_runtime = 0;
2610 p->se.nr_migrations = 0; 2726 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0; 2727 p->se.vruntime = 0;
2728 INIT_LIST_HEAD(&p->se.group_node);
2612 2729
2613#ifdef CONFIG_SCHEDSTATS 2730#ifdef CONFIG_SCHEDSTATS
2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2731 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2615#endif 2732#endif
2616 2733
2617 INIT_LIST_HEAD(&p->rt.run_list); 2734 INIT_LIST_HEAD(&p->rt.run_list);
2618 p->se.on_rq = 0;
2619 INIT_LIST_HEAD(&p->se.group_node);
2620 2735
2621#ifdef CONFIG_PREEMPT_NOTIFIERS 2736#ifdef CONFIG_PREEMPT_NOTIFIERS
2622 INIT_HLIST_HEAD(&p->preempt_notifiers); 2737 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2626,8 +2741,9 @@ static void __sched_fork(struct task_struct *p)
2626/* 2741/*
2627 * fork()/clone()-time setup: 2742 * fork()/clone()-time setup:
2628 */ 2743 */
2629void sched_fork(struct task_struct *p, int clone_flags) 2744void sched_fork(struct task_struct *p)
2630{ 2745{
2746 unsigned long flags;
2631 int cpu = get_cpu(); 2747 int cpu = get_cpu();
2632 2748
2633 __sched_fork(p); 2749 __sched_fork(p);
@@ -2678,16 +2794,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2678 * 2794 *
2679 * Silence PROVE_RCU. 2795 * Silence PROVE_RCU.
2680 */ 2796 */
2681 rcu_read_lock(); 2797 raw_spin_lock_irqsave(&p->pi_lock, flags);
2682 set_task_cpu(p, cpu); 2798 set_task_cpu(p, cpu);
2683 rcu_read_unlock(); 2799 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2684 2800
2685#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2801#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2686 if (likely(sched_info_on())) 2802 if (likely(sched_info_on()))
2687 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2803 memset(&p->sched_info, 0, sizeof(p->sched_info));
2688#endif 2804#endif
2689#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2805#if defined(CONFIG_SMP)
2690 p->oncpu = 0; 2806 p->on_cpu = 0;
2691#endif 2807#endif
2692#ifdef CONFIG_PREEMPT 2808#ifdef CONFIG_PREEMPT
2693 /* Want to start with kernel preemption disabled. */ 2809 /* Want to start with kernel preemption disabled. */
@@ -2707,41 +2823,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2707 * that must be done for every newly created context, then puts the task 2823 * that must be done for every newly created context, then puts the task
2708 * on the runqueue and wakes it. 2824 * on the runqueue and wakes it.
2709 */ 2825 */
2710void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2826void wake_up_new_task(struct task_struct *p)
2711{ 2827{
2712 unsigned long flags; 2828 unsigned long flags;
2713 struct rq *rq; 2829 struct rq *rq;
2714 int cpu __maybe_unused = get_cpu();
2715 2830
2831 raw_spin_lock_irqsave(&p->pi_lock, flags);
2716#ifdef CONFIG_SMP 2832#ifdef CONFIG_SMP
2717 rq = task_rq_lock(p, &flags);
2718 p->state = TASK_WAKING;
2719
2720 /* 2833 /*
2721 * Fork balancing, do it here and not earlier because: 2834 * Fork balancing, do it here and not earlier because:
2722 * - cpus_allowed can change in the fork path 2835 * - cpus_allowed can change in the fork path
2723 * - any previously selected cpu might disappear through hotplug 2836 * - any previously selected cpu might disappear through hotplug
2724 *
2725 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2726 * without people poking at ->cpus_allowed.
2727 */ 2837 */
2728 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2838 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2729 set_task_cpu(p, cpu);
2730
2731 p->state = TASK_RUNNING;
2732 task_rq_unlock(rq, &flags);
2733#endif 2839#endif
2734 2840
2735 rq = task_rq_lock(p, &flags); 2841 rq = __task_rq_lock(p);
2736 activate_task(rq, p, 0); 2842 activate_task(rq, p, 0);
2737 trace_sched_wakeup_new(p, 1); 2843 p->on_rq = 1;
2844 trace_sched_wakeup_new(p, true);
2738 check_preempt_curr(rq, p, WF_FORK); 2845 check_preempt_curr(rq, p, WF_FORK);
2739#ifdef CONFIG_SMP 2846#ifdef CONFIG_SMP
2740 if (p->sched_class->task_woken) 2847 if (p->sched_class->task_woken)
2741 p->sched_class->task_woken(rq, p); 2848 p->sched_class->task_woken(rq, p);
2742#endif 2849#endif
2743 task_rq_unlock(rq, &flags); 2850 task_rq_unlock(rq, p, &flags);
2744 put_cpu();
2745} 2851}
2746 2852
2747#ifdef CONFIG_PREEMPT_NOTIFIERS 2853#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3450,27 +3556,22 @@ void sched_exec(void)
3450{ 3556{
3451 struct task_struct *p = current; 3557 struct task_struct *p = current;
3452 unsigned long flags; 3558 unsigned long flags;
3453 struct rq *rq;
3454 int dest_cpu; 3559 int dest_cpu;
3455 3560
3456 rq = task_rq_lock(p, &flags); 3561 raw_spin_lock_irqsave(&p->pi_lock, flags);
3457 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3562 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3458 if (dest_cpu == smp_processor_id()) 3563 if (dest_cpu == smp_processor_id())
3459 goto unlock; 3564 goto unlock;
3460 3565
3461 /* 3566 if (likely(cpu_active(dest_cpu))) {
3462 * select_task_rq() can race against ->cpus_allowed
3463 */
3464 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3465 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3466 struct migration_arg arg = { p, dest_cpu }; 3567 struct migration_arg arg = { p, dest_cpu };
3467 3568
3468 task_rq_unlock(rq, &flags); 3569 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3469 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3570 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3470 return; 3571 return;
3471 } 3572 }
3472unlock: 3573unlock:
3473 task_rq_unlock(rq, &flags); 3574 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3474} 3575}
3475 3576
3476#endif 3577#endif
@@ -3507,7 +3608,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3507 3608
3508 rq = task_rq_lock(p, &flags); 3609 rq = task_rq_lock(p, &flags);
3509 ns = do_task_delta_exec(p, rq); 3610 ns = do_task_delta_exec(p, rq);
3510 task_rq_unlock(rq, &flags); 3611 task_rq_unlock(rq, p, &flags);
3511 3612
3512 return ns; 3613 return ns;
3513} 3614}
@@ -3525,7 +3626,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3525 3626
3526 rq = task_rq_lock(p, &flags); 3627 rq = task_rq_lock(p, &flags);
3527 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3628 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3528 task_rq_unlock(rq, &flags); 3629 task_rq_unlock(rq, p, &flags);
3529 3630
3530 return ns; 3631 return ns;
3531} 3632}
@@ -3549,7 +3650,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3549 rq = task_rq_lock(p, &flags); 3650 rq = task_rq_lock(p, &flags);
3550 thread_group_cputime(p, &totals); 3651 thread_group_cputime(p, &totals);
3551 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3652 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3552 task_rq_unlock(rq, &flags); 3653 task_rq_unlock(rq, p, &flags);
3553 3654
3554 return ns; 3655 return ns;
3555} 3656}
@@ -3903,9 +4004,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3903/* 4004/*
3904 * This function gets called by the timer code, with HZ frequency. 4005 * This function gets called by the timer code, with HZ frequency.
3905 * We call it with interrupts disabled. 4006 * We call it with interrupts disabled.
3906 *
3907 * It also gets called by the fork code, when changing the parent's
3908 * timeslices.
3909 */ 4007 */
3910void scheduler_tick(void) 4008void scheduler_tick(void)
3911{ 4009{
@@ -4025,17 +4123,11 @@ static inline void schedule_debug(struct task_struct *prev)
4025 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4123 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4026 4124
4027 schedstat_inc(this_rq(), sched_count); 4125 schedstat_inc(this_rq(), sched_count);
4028#ifdef CONFIG_SCHEDSTATS
4029 if (unlikely(prev->lock_depth >= 0)) {
4030 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
4031 schedstat_inc(prev, sched_info.bkl_count);
4032 }
4033#endif
4034} 4126}
4035 4127
4036static void put_prev_task(struct rq *rq, struct task_struct *prev) 4128static void put_prev_task(struct rq *rq, struct task_struct *prev)
4037{ 4129{
4038 if (prev->se.on_rq) 4130 if (prev->on_rq || rq->skip_clock_update < 0)
4039 update_rq_clock(rq); 4131 update_rq_clock(rq);
4040 prev->sched_class->put_prev_task(rq, prev); 4132 prev->sched_class->put_prev_task(rq, prev);
4041} 4133}
@@ -4097,11 +4189,13 @@ need_resched:
4097 if (unlikely(signal_pending_state(prev->state, prev))) { 4189 if (unlikely(signal_pending_state(prev->state, prev))) {
4098 prev->state = TASK_RUNNING; 4190 prev->state = TASK_RUNNING;
4099 } else { 4191 } else {
4192 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4193 prev->on_rq = 0;
4194
4100 /* 4195 /*
4101 * If a worker is going to sleep, notify and 4196 * If a worker went to sleep, notify and ask workqueue
4102 * ask workqueue whether it wants to wake up a 4197 * whether it wants to wake up a task to maintain
4103 * task to maintain concurrency. If so, wake 4198 * concurrency.
4104 * up the task.
4105 */ 4199 */
4106 if (prev->flags & PF_WQ_WORKER) { 4200 if (prev->flags & PF_WQ_WORKER) {
4107 struct task_struct *to_wakeup; 4201 struct task_struct *to_wakeup;
@@ -4110,11 +4204,10 @@ need_resched:
4110 if (to_wakeup) 4204 if (to_wakeup)
4111 try_to_wake_up_local(to_wakeup); 4205 try_to_wake_up_local(to_wakeup);
4112 } 4206 }
4113 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4114 4207
4115 /* 4208 /*
4116 * If we are going to sleep and we have plugged IO queued, make 4209 * If we are going to sleep and we have plugged IO
4117 * sure to submit it to avoid deadlocks. 4210 * queued, make sure to submit it to avoid deadlocks.
4118 */ 4211 */
4119 if (blk_needs_flush_plug(prev)) { 4212 if (blk_needs_flush_plug(prev)) {
4120 raw_spin_unlock(&rq->lock); 4213 raw_spin_unlock(&rq->lock);
@@ -4161,70 +4254,53 @@ need_resched:
4161EXPORT_SYMBOL(schedule); 4254EXPORT_SYMBOL(schedule);
4162 4255
4163#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4256#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4164/*
4165 * Look out! "owner" is an entirely speculative pointer
4166 * access and not reliable.
4167 */
4168int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4169{
4170 unsigned int cpu;
4171 struct rq *rq;
4172 4257
4173 if (!sched_feat(OWNER_SPIN)) 4258static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4174 return 0; 4259{
4260 bool ret = false;
4175 4261
4176#ifdef CONFIG_DEBUG_PAGEALLOC 4262 rcu_read_lock();
4177 /* 4263 if (lock->owner != owner)
4178 * Need to access the cpu field knowing that 4264 goto fail;
4179 * DEBUG_PAGEALLOC could have unmapped it if
4180 * the mutex owner just released it and exited.
4181 */
4182 if (probe_kernel_address(&owner->cpu, cpu))
4183 return 0;
4184#else
4185 cpu = owner->cpu;
4186#endif
4187 4265
4188 /* 4266 /*
4189 * Even if the access succeeded (likely case), 4267 * Ensure we emit the owner->on_cpu dereference _after_ checking that
4190 * the cpu field may no longer be valid. 4268 * lock->owner still matches owner. If that fails, owner might
4269 * point to free()d memory; if it still matches, the rcu_read_lock()
4270 * ensures the memory stays valid.
4191 */ 4271 */
4192 if (cpu >= nr_cpumask_bits) 4272 barrier();
4193 return 0;
4194 4273
4195 /* 4274 ret = owner->on_cpu;
4196 * We need to validate that we can do a 4275fail:
4197 * get_cpu() and that we have the percpu area. 4276 rcu_read_unlock();
4198 */
4199 if (!cpu_online(cpu))
4200 return 0;
4201 4277
4202 rq = cpu_rq(cpu); 4278 return ret;
4279}
4203 4280
4204 for (;;) { 4281/*
4205 /* 4282 * Look out! "owner" is an entirely speculative pointer
4206 * Owner changed, break to re-assess state. 4283 * access and not reliable.
4207 */ 4284 */
4208 if (lock->owner != owner) { 4285int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4209 /* 4286{
4210 * If the lock has switched to a different owner, 4287 if (!sched_feat(OWNER_SPIN))
4211 * we likely have heavy contention. Return 0 to quit 4288 return 0;
4212 * optimistic spinning and not contend further:
4213 */
4214 if (lock->owner)
4215 return 0;
4216 break;
4217 }
4218 4289
4219 /* 4290 while (owner_running(lock, owner)) {
4220 * Is that owner really running on that cpu? 4291 if (need_resched())
4221 */
4222 if (task_thread_info(rq->curr) != owner || need_resched())
4223 return 0; 4292 return 0;
4224 4293
4225 arch_mutex_cpu_relax(); 4294 arch_mutex_cpu_relax();
4226 } 4295 }
4227 4296
4297 /*
4298 * If the owner changed to another task, there is likely
4299 * heavy contention; stop spinning.
4300 */
4301 if (lock->owner)
4302 return 0;
4303
4228 return 1; 4304 return 1;
4229} 4305}
4230#endif 4306#endif
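
/*
 * A simplified user-space sketch of the optimistic owner-spin above:
 * keep spinning only while the same owner still holds the lock and is
 * still on a CPU.  The rcu_read_lock()/barrier() protection that makes
 * the stale owner dereference safe in the kernel is deliberately
 * omitted; all types and the need_resched callback are invented.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct task {
	atomic_int on_cpu;
};

struct mutex_s {
	_Atomic(struct task *) owner;
};

static bool owner_running_sketch(struct mutex_s *lock, struct task *owner)
{
	if (atomic_load(&lock->owner) != owner)
		return false;
	return atomic_load(&owner->on_cpu) != 0;
}

/* Returns 1 if it is still worth trying for the lock, 0 to go sleep. */
static int spin_on_owner_sketch(struct mutex_s *lock, struct task *owner,
				bool (*need_resched)(void))
{
	while (owner_running_sketch(lock, owner)) {
		if (need_resched())
			return 0;
		/* cpu_relax() */
	}
	/* owner changed: a non-NULL owner means heavy contention */
	return atomic_load(&lock->owner) == NULL;
}

static bool never_resched(void) { return false; }

int main(void)
{
	struct task owner_task = { .on_cpu = 0 };       /* owner already off-cpu */
	struct mutex_s lock = { .owner = &owner_task };

	if (spin_on_owner_sketch(&lock, &owner_task, never_resched))
		printf("lock released while spinning: try to take it\n");
	else
		printf("owner blocked or lock contended: stop spinning\n");
	return 0;
}
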
@@ -4684,19 +4760,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4684 */ 4760 */
4685void rt_mutex_setprio(struct task_struct *p, int prio) 4761void rt_mutex_setprio(struct task_struct *p, int prio)
4686{ 4762{
4687 unsigned long flags;
4688 int oldprio, on_rq, running; 4763 int oldprio, on_rq, running;
4689 struct rq *rq; 4764 struct rq *rq;
4690 const struct sched_class *prev_class; 4765 const struct sched_class *prev_class;
4691 4766
4692 BUG_ON(prio < 0 || prio > MAX_PRIO); 4767 BUG_ON(prio < 0 || prio > MAX_PRIO);
4693 4768
4694 rq = task_rq_lock(p, &flags); 4769 rq = __task_rq_lock(p);
4695 4770
4696 trace_sched_pi_setprio(p, prio); 4771 trace_sched_pi_setprio(p, prio);
4697 oldprio = p->prio; 4772 oldprio = p->prio;
4698 prev_class = p->sched_class; 4773 prev_class = p->sched_class;
4699 on_rq = p->se.on_rq; 4774 on_rq = p->on_rq;
4700 running = task_current(rq, p); 4775 running = task_current(rq, p);
4701 if (on_rq) 4776 if (on_rq)
4702 dequeue_task(rq, p, 0); 4777 dequeue_task(rq, p, 0);
@@ -4716,7 +4791,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4791 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4717 4792
4718 check_class_changed(rq, p, prev_class, oldprio); 4793 check_class_changed(rq, p, prev_class, oldprio);
4719 task_rq_unlock(rq, &flags); 4794 __task_rq_unlock(rq);
4720} 4795}
4721 4796
4722#endif 4797#endif
@@ -4744,7 +4819,7 @@ void set_user_nice(struct task_struct *p, long nice)
4744 p->static_prio = NICE_TO_PRIO(nice); 4819 p->static_prio = NICE_TO_PRIO(nice);
4745 goto out_unlock; 4820 goto out_unlock;
4746 } 4821 }
4747 on_rq = p->se.on_rq; 4822 on_rq = p->on_rq;
4748 if (on_rq) 4823 if (on_rq)
4749 dequeue_task(rq, p, 0); 4824 dequeue_task(rq, p, 0);
4750 4825
@@ -4764,7 +4839,7 @@ void set_user_nice(struct task_struct *p, long nice)
4764 resched_task(rq->curr); 4839 resched_task(rq->curr);
4765 } 4840 }
4766out_unlock: 4841out_unlock:
4767 task_rq_unlock(rq, &flags); 4842 task_rq_unlock(rq, p, &flags);
4768} 4843}
4769EXPORT_SYMBOL(set_user_nice); 4844EXPORT_SYMBOL(set_user_nice);
4770 4845
@@ -4878,8 +4953,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4878static void 4953static void
4879__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4954__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4880{ 4955{
4881 BUG_ON(p->se.on_rq);
4882
4883 p->policy = policy; 4956 p->policy = policy;
4884 p->rt_priority = prio; 4957 p->rt_priority = prio;
4885 p->normal_prio = normal_prio(p); 4958 p->normal_prio = normal_prio(p);
@@ -4994,20 +5067,17 @@ recheck:
4994 /* 5067 /*
4995 * make sure no PI-waiters arrive (or leave) while we are 5068 * make sure no PI-waiters arrive (or leave) while we are
4996 * changing the priority of the task: 5069 * changing the priority of the task:
4997 */ 5070 *
4998 raw_spin_lock_irqsave(&p->pi_lock, flags);
4999 /*
5000 * To be able to change p->policy safely, the appropriate 5071 * To be able to change p->policy safely, the appropriate
5001 * runqueue lock must be held. 5072 * runqueue lock must be held.
5002 */ 5073 */
5003 rq = __task_rq_lock(p); 5074 rq = task_rq_lock(p, &flags);
5004 5075
5005 /* 5076 /*
5006 * Changing the policy of the stop threads is a very bad idea 5077 * Changing the policy of the stop threads is a very bad idea
5007 */ 5078 */
5008 if (p == rq->stop) { 5079 if (p == rq->stop) {
5009 __task_rq_unlock(rq); 5080 task_rq_unlock(rq, p, &flags);
5010 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5011 return -EINVAL; 5081 return -EINVAL;
5012 } 5082 }
5013 5083
@@ -5031,8 +5101,7 @@ recheck:
5031 if (rt_bandwidth_enabled() && rt_policy(policy) && 5101 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5032 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5102 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5033 !task_group_is_autogroup(task_group(p))) { 5103 !task_group_is_autogroup(task_group(p))) {
5034 __task_rq_unlock(rq); 5104 task_rq_unlock(rq, p, &flags);
5035 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5036 return -EPERM; 5105 return -EPERM;
5037 } 5106 }
5038 } 5107 }
@@ -5041,11 +5110,10 @@ recheck:
5041 /* recheck policy now with rq lock held */ 5110 /* recheck policy now with rq lock held */
5042 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5111 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5043 policy = oldpolicy = -1; 5112 policy = oldpolicy = -1;
5044 __task_rq_unlock(rq); 5113 task_rq_unlock(rq, p, &flags);
5045 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5046 goto recheck; 5114 goto recheck;
5047 } 5115 }
5048 on_rq = p->se.on_rq; 5116 on_rq = p->on_rq;
5049 running = task_current(rq, p); 5117 running = task_current(rq, p);
5050 if (on_rq) 5118 if (on_rq)
5051 deactivate_task(rq, p, 0); 5119 deactivate_task(rq, p, 0);
@@ -5064,8 +5132,7 @@ recheck:
5064 activate_task(rq, p, 0); 5132 activate_task(rq, p, 0);
5065 5133
5066 check_class_changed(rq, p, prev_class, oldprio); 5134 check_class_changed(rq, p, prev_class, oldprio);
5067 __task_rq_unlock(rq); 5135 task_rq_unlock(rq, p, &flags);
5068 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5069 5136
5070 rt_mutex_adjust_pi(p); 5137 rt_mutex_adjust_pi(p);
5071 5138
@@ -5316,7 +5383,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5316{ 5383{
5317 struct task_struct *p; 5384 struct task_struct *p;
5318 unsigned long flags; 5385 unsigned long flags;
5319 struct rq *rq;
5320 int retval; 5386 int retval;
5321 5387
5322 get_online_cpus(); 5388 get_online_cpus();
@@ -5331,9 +5397,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5331 if (retval) 5397 if (retval)
5332 goto out_unlock; 5398 goto out_unlock;
5333 5399
5334 rq = task_rq_lock(p, &flags); 5400 raw_spin_lock_irqsave(&p->pi_lock, flags);
5335 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5401 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5336 task_rq_unlock(rq, &flags); 5402 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5337 5403
5338out_unlock: 5404out_unlock:
5339 rcu_read_unlock(); 5405 rcu_read_unlock();
@@ -5658,7 +5724,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5658 5724
5659 rq = task_rq_lock(p, &flags); 5725 rq = task_rq_lock(p, &flags);
5660 time_slice = p->sched_class->get_rr_interval(rq, p); 5726 time_slice = p->sched_class->get_rr_interval(rq, p);
5661 task_rq_unlock(rq, &flags); 5727 task_rq_unlock(rq, p, &flags);
5662 5728
5663 rcu_read_unlock(); 5729 rcu_read_unlock();
5664 jiffies_to_timespec(time_slice, &t); 5730 jiffies_to_timespec(time_slice, &t);
@@ -5776,17 +5842,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5776 rcu_read_unlock(); 5842 rcu_read_unlock();
5777 5843
5778 rq->curr = rq->idle = idle; 5844 rq->curr = rq->idle = idle;
5779#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5845#if defined(CONFIG_SMP)
5780 idle->oncpu = 1; 5846 idle->on_cpu = 1;
5781#endif 5847#endif
5782 raw_spin_unlock_irqrestore(&rq->lock, flags); 5848 raw_spin_unlock_irqrestore(&rq->lock, flags);
5783 5849
5784 /* Set the preempt count _outside_ the spinlocks! */ 5850 /* Set the preempt count _outside_ the spinlocks! */
5785#if defined(CONFIG_PREEMPT)
5786 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5787#else
5788 task_thread_info(idle)->preempt_count = 0; 5851 task_thread_info(idle)->preempt_count = 0;
5789#endif 5852
5790 /* 5853 /*
5791 * The idle tasks have their own, simple scheduling class: 5854 * The idle tasks have their own, simple scheduling class:
5792 */ 5855 */
@@ -5881,26 +5944,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5881 unsigned int dest_cpu; 5944 unsigned int dest_cpu;
5882 int ret = 0; 5945 int ret = 0;
5883 5946
5884 /*
5885 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5886 * drop the rq->lock and still rely on ->cpus_allowed.
5887 */
5888again:
5889 while (task_is_waking(p))
5890 cpu_relax();
5891 rq = task_rq_lock(p, &flags); 5947 rq = task_rq_lock(p, &flags);
5892 if (task_is_waking(p)) { 5948
5893 task_rq_unlock(rq, &flags); 5949 if (cpumask_equal(&p->cpus_allowed, new_mask))
5894 goto again; 5950 goto out;
5895 }
5896 5951
5897 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5952 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5898 ret = -EINVAL; 5953 ret = -EINVAL;
5899 goto out; 5954 goto out;
5900 } 5955 }
5901 5956
5902 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5957 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5903 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5904 ret = -EINVAL; 5958 ret = -EINVAL;
5905 goto out; 5959 goto out;
5906 } 5960 }
@@ -5917,16 +5971,16 @@ again:
5917 goto out; 5971 goto out;
5918 5972
5919 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5973 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5920 if (migrate_task(p, rq)) { 5974 if (p->on_rq) {
5921 struct migration_arg arg = { p, dest_cpu }; 5975 struct migration_arg arg = { p, dest_cpu };
5922 /* Need help from migration thread: drop lock and wait. */ 5976 /* Need help from migration thread: drop lock and wait. */
5923 task_rq_unlock(rq, &flags); 5977 task_rq_unlock(rq, p, &flags);
5924 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5978 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5925 tlb_migrate_finish(p->mm); 5979 tlb_migrate_finish(p->mm);
5926 return 0; 5980 return 0;
5927 } 5981 }
5928out: 5982out:
5929 task_rq_unlock(rq, &flags); 5983 task_rq_unlock(rq, p, &flags);
5930 5984
5931 return ret; 5985 return ret;
5932} 5986}
@@ -5954,6 +6008,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5954 rq_src = cpu_rq(src_cpu); 6008 rq_src = cpu_rq(src_cpu);
5955 rq_dest = cpu_rq(dest_cpu); 6009 rq_dest = cpu_rq(dest_cpu);
5956 6010
6011 raw_spin_lock(&p->pi_lock);
5957 double_rq_lock(rq_src, rq_dest); 6012 double_rq_lock(rq_src, rq_dest);
5958 /* Already moved. */ 6013 /* Already moved. */
5959 if (task_cpu(p) != src_cpu) 6014 if (task_cpu(p) != src_cpu)
@@ -5966,7 +6021,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5966 * If we're not on a rq, the next wake-up will ensure we're 6021 * If we're not on a rq, the next wake-up will ensure we're
5967 * placed properly. 6022 * placed properly.
5968 */ 6023 */
5969 if (p->se.on_rq) { 6024 if (p->on_rq) {
5970 deactivate_task(rq_src, p, 0); 6025 deactivate_task(rq_src, p, 0);
5971 set_task_cpu(p, dest_cpu); 6026 set_task_cpu(p, dest_cpu);
5972 activate_task(rq_dest, p, 0); 6027 activate_task(rq_dest, p, 0);
@@ -5976,6 +6031,7 @@ done:
5976 ret = 1; 6031 ret = 1;
5977fail: 6032fail:
5978 double_rq_unlock(rq_src, rq_dest); 6033 double_rq_unlock(rq_src, rq_dest);
6034 raw_spin_unlock(&p->pi_lock);
5979 return ret; 6035 return ret;
5980} 6036}
5981 6037
@@ -6316,6 +6372,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6316 6372
6317#ifdef CONFIG_HOTPLUG_CPU 6373#ifdef CONFIG_HOTPLUG_CPU
6318 case CPU_DYING: 6374 case CPU_DYING:
6375 sched_ttwu_pending();
6319 /* Update our root-domain */ 6376 /* Update our root-domain */
6320 raw_spin_lock_irqsave(&rq->lock, flags); 6377 raw_spin_lock_irqsave(&rq->lock, flags);
6321 if (rq->rd) { 6378 if (rq->rd) {
@@ -6394,6 +6451,8 @@ early_initcall(migration_init);
6394 6451
6395#ifdef CONFIG_SMP 6452#ifdef CONFIG_SMP
6396 6453
6454static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6455
6397#ifdef CONFIG_SCHED_DEBUG 6456#ifdef CONFIG_SCHED_DEBUG
6398 6457
6399static __read_mostly int sched_domain_debug_enabled; 6458static __read_mostly int sched_domain_debug_enabled;
@@ -6489,7 +6548,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6489 6548
6490static void sched_domain_debug(struct sched_domain *sd, int cpu) 6549static void sched_domain_debug(struct sched_domain *sd, int cpu)
6491{ 6550{
6492 cpumask_var_t groupmask;
6493 int level = 0; 6551 int level = 0;
6494 6552
6495 if (!sched_domain_debug_enabled) 6553 if (!sched_domain_debug_enabled)
@@ -6502,20 +6560,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6502 6560
6503 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6561 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6504 6562
6505 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6506 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6507 return;
6508 }
6509
6510 for (;;) { 6563 for (;;) {
6511 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6564 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6512 break; 6565 break;
6513 level++; 6566 level++;
6514 sd = sd->parent; 6567 sd = sd->parent;
6515 if (!sd) 6568 if (!sd)
6516 break; 6569 break;
6517 } 6570 }
6518 free_cpumask_var(groupmask);
6519} 6571}
6520#else /* !CONFIG_SCHED_DEBUG */ 6572#else /* !CONFIG_SCHED_DEBUG */
6521# define sched_domain_debug(sd, cpu) do { } while (0) 6573# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6572,12 +6624,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6572 return 1; 6624 return 1;
6573} 6625}
6574 6626
6575static void free_rootdomain(struct root_domain *rd) 6627static void free_rootdomain(struct rcu_head *rcu)
6576{ 6628{
6577 synchronize_sched(); 6629 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6578 6630
6579 cpupri_cleanup(&rd->cpupri); 6631 cpupri_cleanup(&rd->cpupri);
6580
6581 free_cpumask_var(rd->rto_mask); 6632 free_cpumask_var(rd->rto_mask);
6582 free_cpumask_var(rd->online); 6633 free_cpumask_var(rd->online);
6583 free_cpumask_var(rd->span); 6634 free_cpumask_var(rd->span);
@@ -6618,7 +6669,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6618 raw_spin_unlock_irqrestore(&rq->lock, flags); 6669 raw_spin_unlock_irqrestore(&rq->lock, flags);
6619 6670
6620 if (old_rd) 6671 if (old_rd)
6621 free_rootdomain(old_rd); 6672 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6622} 6673}
6623 6674
6624static int init_rootdomain(struct root_domain *rd) 6675static int init_rootdomain(struct root_domain *rd)
@@ -6669,6 +6720,25 @@ static struct root_domain *alloc_rootdomain(void)
6669 return rd; 6720 return rd;
6670} 6721}
6671 6722
6723static void free_sched_domain(struct rcu_head *rcu)
6724{
6725 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6726 if (atomic_dec_and_test(&sd->groups->ref))
6727 kfree(sd->groups);
6728 kfree(sd);
6729}
6730
6731static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6732{
6733 call_rcu(&sd->rcu, free_sched_domain);
6734}
6735
6736static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6737{
6738 for (; sd; sd = sd->parent)
6739 destroy_sched_domain(sd, cpu);
6740}
6741
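
/*
 * A sketch of the deferred-free pattern used by free_rootdomain() and
 * free_sched_domain() above: embed a callback head in the object and
 * recover the object with container_of() when the callback fires.
 * call_deferred() merely stands in for call_rcu(); no grace period is
 * modeled here.
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct cb_head {
	void (*func)(struct cb_head *);
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct root_domain_sketch {
	int refcount;
	struct cb_head rcu;     /* embedded callback head, like rd->rcu */
};

/* The callback only sees the embedded head and recovers the outer object. */
static void free_rootdomain_sketch(struct cb_head *rcu)
{
	struct root_domain_sketch *rd =
		container_of(rcu, struct root_domain_sketch, rcu);

	printf("freeing root_domain at %p\n", (void *)rd);
	free(rd);
}

/* Stand-in for call_rcu(): a real one runs the callback after a grace period. */
static void call_deferred(struct cb_head *head, void (*func)(struct cb_head *))
{
	head->func = func;
	head->func(head);
}

int main(void)
{
	struct root_domain_sketch *rd = calloc(1, sizeof(*rd));

	call_deferred(&rd->rcu, free_rootdomain_sketch);
	return 0;
}
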
6672/* 6742/*
6673 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6743 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6674 * hold the hotplug lock. 6744 * hold the hotplug lock.
@@ -6679,9 +6749,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6679 struct rq *rq = cpu_rq(cpu); 6749 struct rq *rq = cpu_rq(cpu);
6680 struct sched_domain *tmp; 6750 struct sched_domain *tmp;
6681 6751
6682 for (tmp = sd; tmp; tmp = tmp->parent)
6683 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6684
6685 /* Remove the sched domains which do not contribute to scheduling. */ 6752 /* Remove the sched domains which do not contribute to scheduling. */
6686 for (tmp = sd; tmp; ) { 6753 for (tmp = sd; tmp; ) {
6687 struct sched_domain *parent = tmp->parent; 6754 struct sched_domain *parent = tmp->parent;
@@ -6692,12 +6759,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6692 tmp->parent = parent->parent; 6759 tmp->parent = parent->parent;
6693 if (parent->parent) 6760 if (parent->parent)
6694 parent->parent->child = tmp; 6761 parent->parent->child = tmp;
6762 destroy_sched_domain(parent, cpu);
6695 } else 6763 } else
6696 tmp = tmp->parent; 6764 tmp = tmp->parent;
6697 } 6765 }
6698 6766
6699 if (sd && sd_degenerate(sd)) { 6767 if (sd && sd_degenerate(sd)) {
6768 tmp = sd;
6700 sd = sd->parent; 6769 sd = sd->parent;
6770 destroy_sched_domain(tmp, cpu);
6701 if (sd) 6771 if (sd)
6702 sd->child = NULL; 6772 sd->child = NULL;
6703 } 6773 }
@@ -6705,7 +6775,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6705 sched_domain_debug(sd, cpu); 6775 sched_domain_debug(sd, cpu);
6706 6776
6707 rq_attach_root(rq, rd); 6777 rq_attach_root(rq, rd);
6778 tmp = rq->sd;
6708 rcu_assign_pointer(rq->sd, sd); 6779 rcu_assign_pointer(rq->sd, sd);
6780 destroy_sched_domains(tmp, cpu);
6709} 6781}
6710 6782
6711/* cpus with isolated domains */ 6783/* cpus with isolated domains */
@@ -6721,56 +6793,6 @@ static int __init isolated_cpu_setup(char *str)
6721 6793
6722__setup("isolcpus=", isolated_cpu_setup); 6794__setup("isolcpus=", isolated_cpu_setup);
6723 6795
6724/*
6725 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6726 * to a function which identifies what group(along with sched group) a CPU
6727 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6728 * (due to the fact that we keep track of groups covered with a struct cpumask).
6729 *
6730 * init_sched_build_groups will build a circular linked list of the groups
6731 * covered by the given span, and will set each group's ->cpumask correctly,
6732 * and ->cpu_power to 0.
6733 */
6734static void
6735init_sched_build_groups(const struct cpumask *span,
6736 const struct cpumask *cpu_map,
6737 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6738 struct sched_group **sg,
6739 struct cpumask *tmpmask),
6740 struct cpumask *covered, struct cpumask *tmpmask)
6741{
6742 struct sched_group *first = NULL, *last = NULL;
6743 int i;
6744
6745 cpumask_clear(covered);
6746
6747 for_each_cpu(i, span) {
6748 struct sched_group *sg;
6749 int group = group_fn(i, cpu_map, &sg, tmpmask);
6750 int j;
6751
6752 if (cpumask_test_cpu(i, covered))
6753 continue;
6754
6755 cpumask_clear(sched_group_cpus(sg));
6756 sg->cpu_power = 0;
6757
6758 for_each_cpu(j, span) {
6759 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6760 continue;
6761
6762 cpumask_set_cpu(j, covered);
6763 cpumask_set_cpu(j, sched_group_cpus(sg));
6764 }
6765 if (!first)
6766 first = sg;
6767 if (last)
6768 last->next = sg;
6769 last = sg;
6770 }
6771 last->next = first;
6772}
6773
6774#define SD_NODES_PER_DOMAIN 16 6796#define SD_NODES_PER_DOMAIN 16
6775 6797
6776#ifdef CONFIG_NUMA 6798#ifdef CONFIG_NUMA
@@ -6787,7 +6809,7 @@ init_sched_build_groups(const struct cpumask *span,
6787 */ 6809 */
6788static int find_next_best_node(int node, nodemask_t *used_nodes) 6810static int find_next_best_node(int node, nodemask_t *used_nodes)
6789{ 6811{
6790 int i, n, val, min_val, best_node = 0; 6812 int i, n, val, min_val, best_node = -1;
6791 6813
6792 min_val = INT_MAX; 6814 min_val = INT_MAX;
6793 6815
@@ -6811,7 +6833,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6811 } 6833 }
6812 } 6834 }
6813 6835
6814 node_set(best_node, *used_nodes); 6836 if (best_node != -1)
6837 node_set(best_node, *used_nodes);
6815 return best_node; 6838 return best_node;
6816} 6839}
6817 6840
@@ -6837,315 +6860,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6837 6860
6838 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6861 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6839 int next_node = find_next_best_node(node, &used_nodes); 6862 int next_node = find_next_best_node(node, &used_nodes);
6840 6863 if (next_node < 0)
6864 break;
6841 cpumask_or(span, span, cpumask_of_node(next_node)); 6865 cpumask_or(span, span, cpumask_of_node(next_node));
6842 } 6866 }
6843} 6867}
6868
6869static const struct cpumask *cpu_node_mask(int cpu)
6870{
6871 lockdep_assert_held(&sched_domains_mutex);
6872
6873 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6874
6875 return sched_domains_tmpmask;
6876}
6877
6878static const struct cpumask *cpu_allnodes_mask(int cpu)
6879{
6880 return cpu_possible_mask;
6881}
6844#endif /* CONFIG_NUMA */ 6882#endif /* CONFIG_NUMA */
6845 6883
6846int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6884static const struct cpumask *cpu_cpu_mask(int cpu)
6885{
6886 return cpumask_of_node(cpu_to_node(cpu));
6887}
6847 6888
6848/* 6889int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6849 * The cpus mask in sched_group and sched_domain hangs off the end.
6850 *
6851 * ( See the the comments in include/linux/sched.h:struct sched_group
6852 * and struct sched_domain. )
6853 */
6854struct static_sched_group {
6855 struct sched_group sg;
6856 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6857};
6858 6890
6859struct static_sched_domain { 6891struct sd_data {
6860 struct sched_domain sd; 6892 struct sched_domain **__percpu sd;
6861 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6893 struct sched_group **__percpu sg;
6862}; 6894};
6863 6895
6864struct s_data { 6896struct s_data {
6865#ifdef CONFIG_NUMA 6897 struct sched_domain ** __percpu sd;
6866 int sd_allnodes;
6867 cpumask_var_t domainspan;
6868 cpumask_var_t covered;
6869 cpumask_var_t notcovered;
6870#endif
6871 cpumask_var_t nodemask;
6872 cpumask_var_t this_sibling_map;
6873 cpumask_var_t this_core_map;
6874 cpumask_var_t this_book_map;
6875 cpumask_var_t send_covered;
6876 cpumask_var_t tmpmask;
6877 struct sched_group **sched_group_nodes;
6878 struct root_domain *rd; 6898 struct root_domain *rd;
6879}; 6899};
6880 6900
6881enum s_alloc { 6901enum s_alloc {
6882 sa_sched_groups = 0,
6883 sa_rootdomain, 6902 sa_rootdomain,
6884 sa_tmpmask, 6903 sa_sd,
6885 sa_send_covered, 6904 sa_sd_storage,
6886 sa_this_book_map,
6887 sa_this_core_map,
6888 sa_this_sibling_map,
6889 sa_nodemask,
6890 sa_sched_group_nodes,
6891#ifdef CONFIG_NUMA
6892 sa_notcovered,
6893 sa_covered,
6894 sa_domainspan,
6895#endif
6896 sa_none, 6905 sa_none,
6897}; 6906};
6898 6907
6899/* 6908struct sched_domain_topology_level;
6900 * SMT sched-domains:
6901 */
6902#ifdef CONFIG_SCHED_SMT
6903static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6904static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6905 6909
6906static int 6910typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6907cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6911typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6908 struct sched_group **sg, struct cpumask *unused)
6909{
6910 if (sg)
6911 *sg = &per_cpu(sched_groups, cpu).sg;
6912 return cpu;
6913}
6914#endif /* CONFIG_SCHED_SMT */
6915 6912
6916/* 6913struct sched_domain_topology_level {
6917 * multi-core sched-domains: 6914 sched_domain_init_f init;
6918 */ 6915 sched_domain_mask_f mask;
6919#ifdef CONFIG_SCHED_MC 6916 struct sd_data data;
6920static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6917};
6921static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6922
6923static int
6924cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6925 struct sched_group **sg, struct cpumask *mask)
6926{
6927 int group;
6928#ifdef CONFIG_SCHED_SMT
6929 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6930 group = cpumask_first(mask);
6931#else
6932 group = cpu;
6933#endif
6934 if (sg)
6935 *sg = &per_cpu(sched_group_core, group).sg;
6936 return group;
6937}
6938#endif /* CONFIG_SCHED_MC */
6939 6918
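
/*
 * A sketch of the table-driven topology introduced here: each level
 * supplies a mask function describing which cpus a given cpu shares
 * that level with, and building domains becomes a walk over the table.
 * The masks below (2 threads per core, 4 threads per package) are
 * invented purely for illustration.
 */
#include <stdio.h>

#define NR_CPUS 8

struct topology_level {
	const char *name;
	unsigned long (*mask)(int cpu);
};

static unsigned long smt_mask(int cpu)  { return 3UL << (cpu & ~1); }
static unsigned long core_mask(int cpu) { return 0xfUL << (cpu & ~3); }
static unsigned long cpu_mask(int cpu)  { (void)cpu; return (1UL << NR_CPUS) - 1; }

static const struct topology_level levels[] = {
	{ "SMT", smt_mask  },
	{ "MC",  core_mask },
	{ "CPU", cpu_mask  },
};

int main(void)
{
	int cpu = 5;

	for (unsigned int i = 0; i < sizeof(levels) / sizeof(levels[0]); i++)
		printf("cpu%d %-3s span: 0x%02lx\n", cpu, levels[i].name,
		       levels[i].mask(cpu));
	return 0;
}
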
6940/* 6919/*
6941 * book sched-domains: 6920 * Assumes the sched_domain tree is fully constructed
6942 */ 6921 */
6943#ifdef CONFIG_SCHED_BOOK 6922static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6944static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6945static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6946
6947static int
6948cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6949 struct sched_group **sg, struct cpumask *mask)
6950{ 6923{
6951 int group = cpu; 6924 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6952#ifdef CONFIG_SCHED_MC 6925 struct sched_domain *child = sd->child;
6953 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6954 group = cpumask_first(mask);
6955#elif defined(CONFIG_SCHED_SMT)
6956 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6957 group = cpumask_first(mask);
6958#endif
6959 if (sg)
6960 *sg = &per_cpu(sched_group_book, group).sg;
6961 return group;
6962}
6963#endif /* CONFIG_SCHED_BOOK */
6964 6926
6965static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6927 if (child)
6966static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6928 cpu = cpumask_first(sched_domain_span(child));
6967 6929
6968static int
6969cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6970 struct sched_group **sg, struct cpumask *mask)
6971{
6972 int group;
6973#ifdef CONFIG_SCHED_BOOK
6974 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6975 group = cpumask_first(mask);
6976#elif defined(CONFIG_SCHED_MC)
6977 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6978 group = cpumask_first(mask);
6979#elif defined(CONFIG_SCHED_SMT)
6980 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6981 group = cpumask_first(mask);
6982#else
6983 group = cpu;
6984#endif
6985 if (sg) 6930 if (sg)
6986 *sg = &per_cpu(sched_group_phys, group).sg; 6931 *sg = *per_cpu_ptr(sdd->sg, cpu);
6987 return group; 6932
6933 return cpu;
6988} 6934}
6989 6935
6990#ifdef CONFIG_NUMA
6991/* 6936/*
6992 * The init_sched_build_groups can't handle what we want to do with node 6937 * build_sched_groups takes the cpumask we wish to span, and a pointer
6993 * groups, so roll our own. Now each node has its own list of groups which 6938 * to a function which identifies what group (along with sched group) a CPU
6994 * gets dynamically allocated. 6939 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6940 * (due to the fact that we keep track of groups covered with a struct cpumask).
6941 *
6942 * build_sched_groups will build a circular linked list of the groups
6943 * covered by the given span, and will set each group's ->cpumask correctly,
6944 * and ->cpu_power to 0.
6995 */ 6945 */
6996static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6946static void
6997static struct sched_group ***sched_group_nodes_bycpu; 6947build_sched_groups(struct sched_domain *sd)
6998
6999static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
7000static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7001
7002static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7003 struct sched_group **sg,
7004 struct cpumask *nodemask)
7005{
7006 int group;
7007
7008 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7009 group = cpumask_first(nodemask);
7010
7011 if (sg)
7012 *sg = &per_cpu(sched_group_allnodes, group).sg;
7013 return group;
7014}
7015
7016static void init_numa_sched_groups_power(struct sched_group *group_head)
7017{
7018 struct sched_group *sg = group_head;
7019 int j;
7020
7021 if (!sg)
7022 return;
7023 do {
7024 for_each_cpu(j, sched_group_cpus(sg)) {
7025 struct sched_domain *sd;
7026
7027 sd = &per_cpu(phys_domains, j).sd;
7028 if (j != group_first_cpu(sd->groups)) {
7029 /*
7030 * Only add "power" once for each
7031 * physical package.
7032 */
7033 continue;
7034 }
7035
7036 sg->cpu_power += sd->groups->cpu_power;
7037 }
7038 sg = sg->next;
7039 } while (sg != group_head);
7040}
7041
7042static int build_numa_sched_groups(struct s_data *d,
7043 const struct cpumask *cpu_map, int num)
7044{ 6948{
7045 struct sched_domain *sd; 6949 struct sched_group *first = NULL, *last = NULL;
7046 struct sched_group *sg, *prev; 6950 struct sd_data *sdd = sd->private;
7047 int n, j; 6951 const struct cpumask *span = sched_domain_span(sd);
7048 6952 struct cpumask *covered;
7049 cpumask_clear(d->covered); 6953 int i;
7050 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
7051 if (cpumask_empty(d->nodemask)) {
7052 d->sched_group_nodes[num] = NULL;
7053 goto out;
7054 }
7055
7056 sched_domain_node_span(num, d->domainspan);
7057 cpumask_and(d->domainspan, d->domainspan, cpu_map);
7058
7059 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7060 GFP_KERNEL, num);
7061 if (!sg) {
7062 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
7063 num);
7064 return -ENOMEM;
7065 }
7066 d->sched_group_nodes[num] = sg;
7067
7068 for_each_cpu(j, d->nodemask) {
7069 sd = &per_cpu(node_domains, j).sd;
7070 sd->groups = sg;
7071 }
7072
7073 sg->cpu_power = 0;
7074 cpumask_copy(sched_group_cpus(sg), d->nodemask);
7075 sg->next = sg;
7076 cpumask_or(d->covered, d->covered, d->nodemask);
7077 6954
7078 prev = sg; 6955 lockdep_assert_held(&sched_domains_mutex);
7079 for (j = 0; j < nr_node_ids; j++) { 6956 covered = sched_domains_tmpmask;
7080 n = (num + j) % nr_node_ids;
7081 cpumask_complement(d->notcovered, d->covered);
7082 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
7083 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
7084 if (cpumask_empty(d->tmpmask))
7085 break;
7086 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
7087 if (cpumask_empty(d->tmpmask))
7088 continue;
7089 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7090 GFP_KERNEL, num);
7091 if (!sg) {
7092 printk(KERN_WARNING
7093 "Can not alloc domain group for node %d\n", j);
7094 return -ENOMEM;
7095 }
7096 sg->cpu_power = 0;
7097 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
7098 sg->next = prev->next;
7099 cpumask_or(d->covered, d->covered, d->tmpmask);
7100 prev->next = sg;
7101 prev = sg;
7102 }
7103out:
7104 return 0;
7105}
7106#endif /* CONFIG_NUMA */
7107 6957
7108#ifdef CONFIG_NUMA 6958 cpumask_clear(covered);
7109/* Free memory allocated for various sched_group structures */
7110static void free_sched_groups(const struct cpumask *cpu_map,
7111 struct cpumask *nodemask)
7112{
7113 int cpu, i;
7114 6959
7115 for_each_cpu(cpu, cpu_map) { 6960 for_each_cpu(i, span) {
7116 struct sched_group **sched_group_nodes 6961 struct sched_group *sg;
7117 = sched_group_nodes_bycpu[cpu]; 6962 int group = get_group(i, sdd, &sg);
6963 int j;
7118 6964
7119 if (!sched_group_nodes) 6965 if (cpumask_test_cpu(i, covered))
7120 continue; 6966 continue;
7121 6967
7122 for (i = 0; i < nr_node_ids; i++) { 6968 cpumask_clear(sched_group_cpus(sg));
7123 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6969 sg->cpu_power = 0;
7124 6970
7125 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 6971 for_each_cpu(j, span) {
7126 if (cpumask_empty(nodemask)) 6972 if (get_group(j, sdd, NULL) != group)
7127 continue; 6973 continue;
7128 6974
7129 if (sg == NULL) 6975 cpumask_set_cpu(j, covered);
7130 continue; 6976 cpumask_set_cpu(j, sched_group_cpus(sg));
7131 sg = sg->next;
7132next_sg:
7133 oldsg = sg;
7134 sg = sg->next;
7135 kfree(oldsg);
7136 if (oldsg != sched_group_nodes[i])
7137 goto next_sg;
7138 } 6977 }
7139 kfree(sched_group_nodes); 6978
7140 sched_group_nodes_bycpu[cpu] = NULL; 6979 if (!first)
6980 first = sg;
6981 if (last)
6982 last->next = sg;
6983 last = sg;
7141 } 6984 }
6985 last->next = first;
7142} 6986}
7143#else /* !CONFIG_NUMA */
7144static void free_sched_groups(const struct cpumask *cpu_map,
7145 struct cpumask *nodemask)
7146{
7147}
7148#endif /* CONFIG_NUMA */
7149 6987
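
/*
 * A user-space sketch of the group construction above: walk the span,
 * start a new group for each cpu whose group isn't covered yet, and
 * link the groups into a circular list.  group_of() is an invented
 * stand-in for get_group(); the groups are leaked at exit for brevity.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 8

struct group {
	unsigned long cpus;     /* bitmask of member cpus */
	struct group *next;     /* circular list */
};

/* Which group a cpu belongs to, e.g. its core. */
static int group_of(int cpu)
{
	return cpu / 2;
}

static struct group *build_groups(unsigned long span)
{
	struct group *first = NULL, *last = NULL;
	unsigned long covered = 0;

	for (int i = 0; i < NR_CPUS; i++) {
		struct group *sg;

		if (!(span & (1UL << i)) || (covered & (1UL << i)))
			continue;

		sg = calloc(1, sizeof(*sg));
		for (int j = i; j < NR_CPUS; j++) {
			if ((span & (1UL << j)) && group_of(j) == group_of(i)) {
				covered |= 1UL << j;
				sg->cpus |= 1UL << j;
			}
		}

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	if (last)
		last->next = first;     /* close the circle */
	return first;
}

int main(void)
{
	struct group *start = build_groups(0xff);
	struct group *g = start;

	do {
		printf("group cpus: 0x%02lx\n", g->cpus);
		g = g->next;
	} while (g != start);
	return 0;
}
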
7150/* 6988/*
7151 * Initialize sched groups cpu_power. 6989 * Initialize sched groups cpu_power.
@@ -7159,11 +6997,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
7159 */ 6997 */
7160static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6998static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7161{ 6999{
7162 struct sched_domain *child;
7163 struct sched_group *group;
7164 long power;
7165 int weight;
7166
7167 WARN_ON(!sd || !sd->groups); 7000 WARN_ON(!sd || !sd->groups);
7168 7001
7169 if (cpu != group_first_cpu(sd->groups)) 7002 if (cpu != group_first_cpu(sd->groups))
@@ -7171,36 +7004,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7171 7004
7172 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7005 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7173 7006
7174 child = sd->child; 7007 update_group_power(sd, cpu);
7175
7176 sd->groups->cpu_power = 0;
7177
7178 if (!child) {
7179 power = SCHED_LOAD_SCALE;
7180 weight = cpumask_weight(sched_domain_span(sd));
7181 /*
7182 * SMT siblings share the power of a single core.
7183 * Usually multiple threads get a better yield out of
7184 * that one core than a single thread would have,
7185 * reflect that in sd->smt_gain.
7186 */
7187 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
7188 power *= sd->smt_gain;
7189 power /= weight;
7190 power >>= SCHED_LOAD_SHIFT;
7191 }
7192 sd->groups->cpu_power += power;
7193 return;
7194 }
7195
7196 /*
7197 * Add cpu_power of each child group to this groups cpu_power.
7198 */
7199 group = child->groups;
7200 do {
7201 sd->groups->cpu_power += group->cpu_power;
7202 group = group->next;
7203 } while (group != child->groups);
7204} 7008}
7205 7009
7206/* 7010/*
@@ -7214,15 +7018,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7214# define SD_INIT_NAME(sd, type) do { } while (0) 7018# define SD_INIT_NAME(sd, type) do { } while (0)
7215#endif 7019#endif
7216 7020
7217#define SD_INIT(sd, type) sd_init_##type(sd) 7021#define SD_INIT_FUNC(type) \
7218 7022static noinline struct sched_domain * \
7219#define SD_INIT_FUNC(type) \ 7023sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7220static noinline void sd_init_##type(struct sched_domain *sd) \ 7024{ \
7221{ \ 7025 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7222 memset(sd, 0, sizeof(*sd)); \ 7026 *sd = SD_##type##_INIT; \
7223 *sd = SD_##type##_INIT; \ 7027 SD_INIT_NAME(sd, type); \
7224 sd->level = SD_LV_##type; \ 7028 sd->private = &tl->data; \
7225 SD_INIT_NAME(sd, type); \ 7029 return sd; \
7226} 7030}
7227 7031
7228SD_INIT_FUNC(CPU) 7032SD_INIT_FUNC(CPU)
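
The SD_INIT_FUNC() rewrite above no longer memset()s a caller-supplied sched_domain; each generated sd_init_<type>() now pulls the pre-allocated domain out of the topology level's per-cpu storage and returns it. A standalone sketch of the same token-pasting generator idiom, with made-up storage and flag values rather than the kernel's:

#include <stdio.h>
#include <string.h>

/* Simplified stand-ins for the kernel types used above. */
struct sched_domain {
	const char *name;
	int flags;
};

/* Pretend per-"cpu" storage: one preallocated domain per type per cpu. */
#define NR_CPUS 4
static struct sched_domain storage_CPU[NR_CPUS];
static struct sched_domain storage_MC[NR_CPUS];

/* Generate one initializer per type; each works on preallocated storage. */
#define SD_INIT_FUNC(type, dflags)					\
static struct sched_domain *sd_init_##type(int cpu)			\
{									\
	struct sched_domain *sd = &storage_##type[cpu];			\
	memset(sd, 0, sizeof(*sd));					\
	sd->name = #type;						\
	sd->flags = (dflags);						\
	return sd;							\
}

SD_INIT_FUNC(CPU, 0x01)
SD_INIT_FUNC(MC,  0x02)

int main(void)
{
	struct sched_domain *mc = sd_init_MC(2);
	struct sched_domain *cpu = sd_init_CPU(2);

	printf("%s flags=%#x, %s flags=%#x\n",
	       mc->name, mc->flags, cpu->name, cpu->flags);
	return 0;
}
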
@@ -7241,13 +7045,14 @@ SD_INIT_FUNC(CPU)
7241#endif 7045#endif
7242 7046
7243static int default_relax_domain_level = -1; 7047static int default_relax_domain_level = -1;
7048int sched_domain_level_max;
7244 7049
7245static int __init setup_relax_domain_level(char *str) 7050static int __init setup_relax_domain_level(char *str)
7246{ 7051{
7247 unsigned long val; 7052 unsigned long val;
7248 7053
7249 val = simple_strtoul(str, NULL, 0); 7054 val = simple_strtoul(str, NULL, 0);
7250 if (val < SD_LV_MAX) 7055 if (val < sched_domain_level_max)
7251 default_relax_domain_level = val; 7056 default_relax_domain_level = val;
7252 7057
7253 return 1; 7058 return 1;
@@ -7275,37 +7080,20 @@ static void set_domain_attribute(struct sched_domain *sd,
7275 } 7080 }
7276} 7081}
7277 7082
7083static void __sdt_free(const struct cpumask *cpu_map);
7084static int __sdt_alloc(const struct cpumask *cpu_map);
7085
7278static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7086static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7279 const struct cpumask *cpu_map) 7087 const struct cpumask *cpu_map)
7280{ 7088{
7281 switch (what) { 7089 switch (what) {
7282 case sa_sched_groups:
7283 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7284 d->sched_group_nodes = NULL;
7285 case sa_rootdomain: 7090 case sa_rootdomain:
7286 free_rootdomain(d->rd); /* fall through */ 7091 if (!atomic_read(&d->rd->refcount))
7287 case sa_tmpmask: 7092 free_rootdomain(&d->rd->rcu); /* fall through */
7288 free_cpumask_var(d->tmpmask); /* fall through */ 7093 case sa_sd:
7289 case sa_send_covered: 7094 free_percpu(d->sd); /* fall through */
7290 free_cpumask_var(d->send_covered); /* fall through */ 7095 case sa_sd_storage:
7291 case sa_this_book_map: 7096 __sdt_free(cpu_map); /* fall through */
7292 free_cpumask_var(d->this_book_map); /* fall through */
7293 case sa_this_core_map:
7294 free_cpumask_var(d->this_core_map); /* fall through */
7295 case sa_this_sibling_map:
7296 free_cpumask_var(d->this_sibling_map); /* fall through */
7297 case sa_nodemask:
7298 free_cpumask_var(d->nodemask); /* fall through */
7299 case sa_sched_group_nodes:
7300#ifdef CONFIG_NUMA
7301 kfree(d->sched_group_nodes); /* fall through */
7302 case sa_notcovered:
7303 free_cpumask_var(d->notcovered); /* fall through */
7304 case sa_covered:
7305 free_cpumask_var(d->covered); /* fall through */
7306 case sa_domainspan:
7307 free_cpumask_var(d->domainspan); /* fall through */
7308#endif
7309 case sa_none: 7097 case sa_none:
7310 break; 7098 break;
7311 } 7099 }
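
The slimmed-down __free_domain_allocs() keeps the staged-teardown idiom: an enum records how far allocation progressed, and a switch with intentional fall-through unwinds from that stage downwards. A self-contained sketch of the idiom with invented stage names:

#include <stdio.h>
#include <stdlib.h>

/* Stages, ordered from "most recently allocated" down to "nothing". */
enum s_alloc { sa_rootdomain, sa_sd, sa_sd_storage, sa_none };

struct s_data { void *rd, *sd, *storage; };

static void free_allocs(struct s_data *d, enum s_alloc what)
{
	switch (what) {
	case sa_rootdomain:
		free(d->rd);		/* fall through */
	case sa_sd:
		free(d->sd);		/* fall through */
	case sa_sd_storage:
		free(d->storage);	/* fall through */
	case sa_none:
		break;
	}
}

/* Returns the highest stage that needs freeing if we stopped early. */
static enum s_alloc do_allocs(struct s_data *d)
{
	d->storage = malloc(64);
	if (!d->storage)
		return sa_none;
	d->sd = malloc(64);
	if (!d->sd)
		return sa_sd_storage;
	d->rd = malloc(64);
	if (!d->rd)
		return sa_sd;
	return sa_rootdomain;
}

int main(void)
{
	struct s_data d = { 0 };
	enum s_alloc got = do_allocs(&d);

	if (got != sa_rootdomain) {		/* partial failure: unwind */
		free_allocs(&d, got);
		return 1;
	}
	free_allocs(&d, sa_rootdomain);		/* normal teardown */
	return 0;
}
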
@@ -7314,308 +7102,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7314static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7102static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7315 const struct cpumask *cpu_map) 7103 const struct cpumask *cpu_map)
7316{ 7104{
7317#ifdef CONFIG_NUMA 7105 memset(d, 0, sizeof(*d));
7318 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7106
7319 return sa_none; 7107 if (__sdt_alloc(cpu_map))
7320 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7108 return sa_sd_storage;
7321 return sa_domainspan; 7109 d->sd = alloc_percpu(struct sched_domain *);
7322 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7110 if (!d->sd)
7323 return sa_covered; 7111 return sa_sd_storage;
7324 /* Allocate the per-node list of sched groups */
7325 d->sched_group_nodes = kcalloc(nr_node_ids,
7326 sizeof(struct sched_group *), GFP_KERNEL);
7327 if (!d->sched_group_nodes) {
7328 printk(KERN_WARNING "Can not alloc sched group node list\n");
7329 return sa_notcovered;
7330 }
7331 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7332#endif
7333 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7334 return sa_sched_group_nodes;
7335 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7336 return sa_nodemask;
7337 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7338 return sa_this_sibling_map;
7339 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7340 return sa_this_core_map;
7341 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7342 return sa_this_book_map;
7343 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7344 return sa_send_covered;
7345 d->rd = alloc_rootdomain(); 7112 d->rd = alloc_rootdomain();
7346 if (!d->rd) { 7113 if (!d->rd)
7347 printk(KERN_WARNING "Cannot alloc root domain\n"); 7114 return sa_sd;
7348 return sa_tmpmask;
7349 }
7350 return sa_rootdomain; 7115 return sa_rootdomain;
7351} 7116}
7352 7117
7353static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7118/*
7354 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7119 * NULL the sd_data elements we've used to build the sched_domain and
7120 * sched_group structure so that the subsequent __free_domain_allocs()
7121 * will not free the data we're using.
7122 */
7123static void claim_allocations(int cpu, struct sched_domain *sd)
7355{ 7124{
7356 struct sched_domain *sd = NULL; 7125 struct sd_data *sdd = sd->private;
7357#ifdef CONFIG_NUMA 7126 struct sched_group *sg = sd->groups;
7358 struct sched_domain *parent;
7359
7360 d->sd_allnodes = 0;
7361 if (cpumask_weight(cpu_map) >
7362 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7363 sd = &per_cpu(allnodes_domains, i).sd;
7364 SD_INIT(sd, ALLNODES);
7365 set_domain_attribute(sd, attr);
7366 cpumask_copy(sched_domain_span(sd), cpu_map);
7367 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7368 d->sd_allnodes = 1;
7369 }
7370 parent = sd;
7371
7372 sd = &per_cpu(node_domains, i).sd;
7373 SD_INIT(sd, NODE);
7374 set_domain_attribute(sd, attr);
7375 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7376 sd->parent = parent;
7377 if (parent)
7378 parent->child = sd;
7379 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7380#endif
7381 return sd;
7382}
7383 7127
7384static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7128 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7385 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7129 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7386 struct sched_domain *parent, int i)
7387{
7388 struct sched_domain *sd;
7389 sd = &per_cpu(phys_domains, i).sd;
7390 SD_INIT(sd, CPU);
7391 set_domain_attribute(sd, attr);
7392 cpumask_copy(sched_domain_span(sd), d->nodemask);
7393 sd->parent = parent;
7394 if (parent)
7395 parent->child = sd;
7396 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7397 return sd;
7398}
7399 7130
7400static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7131 if (cpu == cpumask_first(sched_group_cpus(sg))) {
7401 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7132 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7402 struct sched_domain *parent, int i) 7133 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7403{ 7134 }
7404 struct sched_domain *sd = parent;
7405#ifdef CONFIG_SCHED_BOOK
7406 sd = &per_cpu(book_domains, i).sd;
7407 SD_INIT(sd, BOOK);
7408 set_domain_attribute(sd, attr);
7409 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7410 sd->parent = parent;
7411 parent->child = sd;
7412 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7413#endif
7414 return sd;
7415} 7135}
7416 7136
7417static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7137#ifdef CONFIG_SCHED_SMT
7418 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7138static const struct cpumask *cpu_smt_mask(int cpu)
7419 struct sched_domain *parent, int i)
7420{ 7139{
7421 struct sched_domain *sd = parent; 7140 return topology_thread_cpumask(cpu);
7422#ifdef CONFIG_SCHED_MC
7423 sd = &per_cpu(core_domains, i).sd;
7424 SD_INIT(sd, MC);
7425 set_domain_attribute(sd, attr);
7426 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7427 sd->parent = parent;
7428 parent->child = sd;
7429 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7430#endif
7431 return sd;
7432} 7141}
7433
7434static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7435 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7436 struct sched_domain *parent, int i)
7437{
7438 struct sched_domain *sd = parent;
7439#ifdef CONFIG_SCHED_SMT
7440 sd = &per_cpu(cpu_domains, i).sd;
7441 SD_INIT(sd, SIBLING);
7442 set_domain_attribute(sd, attr);
7443 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7444 sd->parent = parent;
7445 parent->child = sd;
7446 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7447#endif 7142#endif
7448 return sd;
7449}
7450 7143
7451static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7144/*
7452 const struct cpumask *cpu_map, int cpu) 7145 * Topology list, bottom-up.
7453{ 7146 */
7454 switch (l) { 7147static struct sched_domain_topology_level default_topology[] = {
7455#ifdef CONFIG_SCHED_SMT 7148#ifdef CONFIG_SCHED_SMT
7456 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7149 { sd_init_SIBLING, cpu_smt_mask, },
7457 cpumask_and(d->this_sibling_map, cpu_map,
7458 topology_thread_cpumask(cpu));
7459 if (cpu == cpumask_first(d->this_sibling_map))
7460 init_sched_build_groups(d->this_sibling_map, cpu_map,
7461 &cpu_to_cpu_group,
7462 d->send_covered, d->tmpmask);
7463 break;
7464#endif 7150#endif
7465#ifdef CONFIG_SCHED_MC 7151#ifdef CONFIG_SCHED_MC
7466 case SD_LV_MC: /* set up multi-core groups */ 7152 { sd_init_MC, cpu_coregroup_mask, },
7467 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7468 if (cpu == cpumask_first(d->this_core_map))
7469 init_sched_build_groups(d->this_core_map, cpu_map,
7470 &cpu_to_core_group,
7471 d->send_covered, d->tmpmask);
7472 break;
7473#endif 7153#endif
7474#ifdef CONFIG_SCHED_BOOK 7154#ifdef CONFIG_SCHED_BOOK
7475 case SD_LV_BOOK: /* set up book groups */ 7155 { sd_init_BOOK, cpu_book_mask, },
7476 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7477 if (cpu == cpumask_first(d->this_book_map))
7478 init_sched_build_groups(d->this_book_map, cpu_map,
7479 &cpu_to_book_group,
7480 d->send_covered, d->tmpmask);
7481 break;
7482#endif 7156#endif
7483 case SD_LV_CPU: /* set up physical groups */ 7157 { sd_init_CPU, cpu_cpu_mask, },
7484 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7485 if (!cpumask_empty(d->nodemask))
7486 init_sched_build_groups(d->nodemask, cpu_map,
7487 &cpu_to_phys_group,
7488 d->send_covered, d->tmpmask);
7489 break;
7490#ifdef CONFIG_NUMA 7158#ifdef CONFIG_NUMA
7491 case SD_LV_ALLNODES: 7159 { sd_init_NODE, cpu_node_mask, },
7492 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7160 { sd_init_ALLNODES, cpu_allnodes_mask, },
7493 d->send_covered, d->tmpmask);
7494 break;
7495#endif 7161#endif
7496 default: 7162 { NULL, },
7497 break; 7163};
7164
7165static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7166
7167static int __sdt_alloc(const struct cpumask *cpu_map)
7168{
7169 struct sched_domain_topology_level *tl;
7170 int j;
7171
7172 for (tl = sched_domain_topology; tl->init; tl++) {
7173 struct sd_data *sdd = &tl->data;
7174
7175 sdd->sd = alloc_percpu(struct sched_domain *);
7176 if (!sdd->sd)
7177 return -ENOMEM;
7178
7179 sdd->sg = alloc_percpu(struct sched_group *);
7180 if (!sdd->sg)
7181 return -ENOMEM;
7182
7183 for_each_cpu(j, cpu_map) {
7184 struct sched_domain *sd;
7185 struct sched_group *sg;
7186
7187 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7188 GFP_KERNEL, cpu_to_node(j));
7189 if (!sd)
7190 return -ENOMEM;
7191
7192 *per_cpu_ptr(sdd->sd, j) = sd;
7193
7194 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7195 GFP_KERNEL, cpu_to_node(j));
7196 if (!sg)
7197 return -ENOMEM;
7198
7199 *per_cpu_ptr(sdd->sg, j) = sg;
7200 }
7201 }
7202
7203 return 0;
7204}
7205
7206static void __sdt_free(const struct cpumask *cpu_map)
7207{
7208 struct sched_domain_topology_level *tl;
7209 int j;
7210
7211 for (tl = sched_domain_topology; tl->init; tl++) {
7212 struct sd_data *sdd = &tl->data;
7213
7214 for_each_cpu(j, cpu_map) {
7215 kfree(*per_cpu_ptr(sdd->sd, j));
7216 kfree(*per_cpu_ptr(sdd->sg, j));
7217 }
7218 free_percpu(sdd->sd);
7219 free_percpu(sdd->sg);
7220 }
7221}
7222
7223struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7224 struct s_data *d, const struct cpumask *cpu_map,
7225 struct sched_domain_attr *attr, struct sched_domain *child,
7226 int cpu)
7227{
7228 struct sched_domain *sd = tl->init(tl, cpu);
7229 if (!sd)
7230 return child;
7231
7232 set_domain_attribute(sd, attr);
7233 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7234 if (child) {
7235 sd->level = child->level + 1;
7236 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7237 child->parent = sd;
7498 } 7238 }
7239 sd->child = child;
7240
7241 return sd;
7499} 7242}
7500 7243
7501/* 7244/*
7502 * Build sched domains for a given set of cpus and attach the sched domains 7245 * Build sched domains for a given set of cpus and attach the sched domains
7503 * to the individual cpus 7246 * to the individual cpus
7504 */ 7247 */
7505static int __build_sched_domains(const struct cpumask *cpu_map, 7248static int build_sched_domains(const struct cpumask *cpu_map,
7506 struct sched_domain_attr *attr) 7249 struct sched_domain_attr *attr)
7507{ 7250{
7508 enum s_alloc alloc_state = sa_none; 7251 enum s_alloc alloc_state = sa_none;
7509 struct s_data d;
7510 struct sched_domain *sd; 7252 struct sched_domain *sd;
7511 int i; 7253 struct s_data d;
7512#ifdef CONFIG_NUMA 7254 int i, ret = -ENOMEM;
7513 d.sd_allnodes = 0;
7514#endif
7515 7255
7516 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7256 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7517 if (alloc_state != sa_rootdomain) 7257 if (alloc_state != sa_rootdomain)
7518 goto error; 7258 goto error;
7519 alloc_state = sa_sched_groups;
7520 7259
7521 /* 7260 /* Set up domains for cpus specified by the cpu_map. */
7522 * Set up domains for cpus specified by the cpu_map.
7523 */
7524 for_each_cpu(i, cpu_map) { 7261 for_each_cpu(i, cpu_map) {
7525 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7262 struct sched_domain_topology_level *tl;
7526 cpu_map);
7527 7263
7528 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7264 sd = NULL;
7529 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7265 for (tl = sched_domain_topology; tl->init; tl++)
7530 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); 7266 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7531 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7532 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7533 }
7534
7535 for_each_cpu(i, cpu_map) {
7536 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7537 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7538 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7539 }
7540
7541 /* Set up physical groups */
7542 for (i = 0; i < nr_node_ids; i++)
7543 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7544 7267
7545#ifdef CONFIG_NUMA 7268 while (sd->child)
7546 /* Set up node groups */ 7269 sd = sd->child;
7547 if (d.sd_allnodes)
7548 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7549
7550 for (i = 0; i < nr_node_ids; i++)
7551 if (build_numa_sched_groups(&d, cpu_map, i))
7552 goto error;
7553#endif
7554 7270
7555 /* Calculate CPU power for physical packages and nodes */ 7271 *per_cpu_ptr(d.sd, i) = sd;
7556#ifdef CONFIG_SCHED_SMT
7557 for_each_cpu(i, cpu_map) {
7558 sd = &per_cpu(cpu_domains, i).sd;
7559 init_sched_groups_power(i, sd);
7560 }
7561#endif
7562#ifdef CONFIG_SCHED_MC
7563 for_each_cpu(i, cpu_map) {
7564 sd = &per_cpu(core_domains, i).sd;
7565 init_sched_groups_power(i, sd);
7566 } 7272 }
7567#endif
7568#ifdef CONFIG_SCHED_BOOK
7569 for_each_cpu(i, cpu_map) {
7570 sd = &per_cpu(book_domains, i).sd;
7571 init_sched_groups_power(i, sd);
7572 }
7573#endif
7574 7273
7274 /* Build the groups for the domains */
7575 for_each_cpu(i, cpu_map) { 7275 for_each_cpu(i, cpu_map) {
7576 sd = &per_cpu(phys_domains, i).sd; 7276 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7577 init_sched_groups_power(i, sd); 7277 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7578 } 7278 get_group(i, sd->private, &sd->groups);
7279 atomic_inc(&sd->groups->ref);
7579 7280
7580#ifdef CONFIG_NUMA 7281 if (i != cpumask_first(sched_domain_span(sd)))
7581 for (i = 0; i < nr_node_ids; i++) 7282 continue;
7582 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7583 7283
7584 if (d.sd_allnodes) { 7284 build_sched_groups(sd);
7585 struct sched_group *sg; 7285 }
7286 }
7586 7287
7587 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7288 /* Calculate CPU power for physical packages and nodes */
7588 d.tmpmask); 7289 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7589 init_numa_sched_groups_power(sg); 7290 if (!cpumask_test_cpu(i, cpu_map))
7291 continue;
7292
7293 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7294 claim_allocations(i, sd);
7295 init_sched_groups_power(i, sd);
7296 }
7590 } 7297 }
7591#endif
7592 7298
7593 /* Attach the domains */ 7299 /* Attach the domains */
7300 rcu_read_lock();
7594 for_each_cpu(i, cpu_map) { 7301 for_each_cpu(i, cpu_map) {
7595#ifdef CONFIG_SCHED_SMT 7302 sd = *per_cpu_ptr(d.sd, i);
7596 sd = &per_cpu(cpu_domains, i).sd;
7597#elif defined(CONFIG_SCHED_MC)
7598 sd = &per_cpu(core_domains, i).sd;
7599#elif defined(CONFIG_SCHED_BOOK)
7600 sd = &per_cpu(book_domains, i).sd;
7601#else
7602 sd = &per_cpu(phys_domains, i).sd;
7603#endif
7604 cpu_attach_domain(sd, d.rd, i); 7303 cpu_attach_domain(sd, d.rd, i);
7605 } 7304 }
7305 rcu_read_unlock();
7606 7306
7607 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7307 ret = 0;
7608 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7609 return 0;
7610
7611error: 7308error:
7612 __free_domain_allocs(&d, alloc_state, cpu_map); 7309 __free_domain_allocs(&d, alloc_state, cpu_map);
7613 return -ENOMEM; 7310 return ret;
7614}
7615
7616static int build_sched_domains(const struct cpumask *cpu_map)
7617{
7618 return __build_sched_domains(cpu_map, NULL);
7619} 7311}
7620 7312
7621static cpumask_var_t *doms_cur; /* current sched domains */ 7313static cpumask_var_t *doms_cur; /* current sched domains */
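
build_sched_domains() now drives construction from the sched_domain_topology_level table: every level contributes an init function and a cpumask function, and per CPU the resulting domains are chained child -> parent from the bottom up while sched_domain_level_max tracks the deepest level seen. A reduced, compilable sketch of that table-driven chaining; the level names and helpers below are invented:

#include <stdio.h>
#include <stdlib.h>

struct domain {
	const char    *name;
	struct domain *parent, *child;
	int	       level;
};

struct topology_level {
	const char *(*name)(int cpu);	/* NULL name terminates the table */
};

static const char *smt_name(int cpu) { (void)cpu; return "SMT"; }
static const char *mc_name(int cpu)  { (void)cpu; return "MC";  }
static const char *cpu_name(int cpu) { (void)cpu; return "CPU"; }

/* Bottom-up, like default_topology[] in the patch. */
static struct topology_level topology[] = {
	{ smt_name }, { mc_name }, { cpu_name }, { NULL },
};

static int domain_level_max;

static struct domain *build_domain(struct topology_level *tl,
				   struct domain *child, int cpu)
{
	struct domain *d = calloc(1, sizeof(*d));

	if (!d)
		exit(1);
	d->name = tl->name(cpu);
	if (child) {
		d->level = child->level + 1;
		if (d->level > domain_level_max)
			domain_level_max = d->level;
		child->parent = d;
	}
	d->child = child;
	return d;
}

int main(void)
{
	struct topology_level *tl;
	struct domain *d = NULL, *it;
	int cpu = 0;

	for (tl = topology; tl->name; tl++)
		d = build_domain(tl, d, cpu);

	/* Walk back down to the lowest level, as the patch does. */
	while (d->child)
		d = d->child;

	for (it = d; it; it = it->parent)
		printf("level %d: %s\n", it->level, it->name);
	printf("max level = %d\n", domain_level_max);

	while (d) {				/* free bottom-up */
		struct domain *parent = d->parent;
		free(d);
		d = parent;
	}
	return 0;
}
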
@@ -7670,7 +7362,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7670 * For now this just excludes isolated cpus, but could be used to 7362 * For now this just excludes isolated cpus, but could be used to
7671 * exclude other special cases in the future. 7363 * exclude other special cases in the future.
7672 */ 7364 */
7673static int arch_init_sched_domains(const struct cpumask *cpu_map) 7365static int init_sched_domains(const struct cpumask *cpu_map)
7674{ 7366{
7675 int err; 7367 int err;
7676 7368
@@ -7681,32 +7373,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7681 doms_cur = &fallback_doms; 7373 doms_cur = &fallback_doms;
7682 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7374 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7683 dattr_cur = NULL; 7375 dattr_cur = NULL;
7684 err = build_sched_domains(doms_cur[0]); 7376 err = build_sched_domains(doms_cur[0], NULL);
7685 register_sched_domain_sysctl(); 7377 register_sched_domain_sysctl();
7686 7378
7687 return err; 7379 return err;
7688} 7380}
7689 7381
7690static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7691 struct cpumask *tmpmask)
7692{
7693 free_sched_groups(cpu_map, tmpmask);
7694}
7695
7696/* 7382/*
7697 * Detach sched domains from a group of cpus specified in cpu_map 7383 * Detach sched domains from a group of cpus specified in cpu_map
7698 * These cpus will now be attached to the NULL domain 7384 * These cpus will now be attached to the NULL domain
7699 */ 7385 */
7700static void detach_destroy_domains(const struct cpumask *cpu_map) 7386static void detach_destroy_domains(const struct cpumask *cpu_map)
7701{ 7387{
7702 /* Save because hotplug lock held. */
7703 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7704 int i; 7388 int i;
7705 7389
7390 rcu_read_lock();
7706 for_each_cpu(i, cpu_map) 7391 for_each_cpu(i, cpu_map)
7707 cpu_attach_domain(NULL, &def_root_domain, i); 7392 cpu_attach_domain(NULL, &def_root_domain, i);
7708 synchronize_sched(); 7393 rcu_read_unlock();
7709 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7710} 7394}
7711 7395
7712/* handle null as "default" */ 7396/* handle null as "default" */
@@ -7795,8 +7479,7 @@ match1:
7795 goto match2; 7479 goto match2;
7796 } 7480 }
7797 /* no match - add a new doms_new */ 7481 /* no match - add a new doms_new */
7798 __build_sched_domains(doms_new[i], 7482 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7799 dattr_new ? dattr_new + i : NULL);
7800match2: 7483match2:
7801 ; 7484 ;
7802 } 7485 }
@@ -7815,7 +7498,7 @@ match2:
7815} 7498}
7816 7499
7817#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7500#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7818static void arch_reinit_sched_domains(void) 7501static void reinit_sched_domains(void)
7819{ 7502{
7820 get_online_cpus(); 7503 get_online_cpus();
7821 7504
@@ -7848,7 +7531,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7848 else 7531 else
7849 sched_mc_power_savings = level; 7532 sched_mc_power_savings = level;
7850 7533
7851 arch_reinit_sched_domains(); 7534 reinit_sched_domains();
7852 7535
7853 return count; 7536 return count;
7854} 7537}
@@ -7967,14 +7650,9 @@ void __init sched_init_smp(void)
7967 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7650 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7968 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7651 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7969 7652
7970#if defined(CONFIG_NUMA)
7971 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7972 GFP_KERNEL);
7973 BUG_ON(sched_group_nodes_bycpu == NULL);
7974#endif
7975 get_online_cpus(); 7653 get_online_cpus();
7976 mutex_lock(&sched_domains_mutex); 7654 mutex_lock(&sched_domains_mutex);
7977 arch_init_sched_domains(cpu_active_mask); 7655 init_sched_domains(cpu_active_mask);
7978 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7656 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7979 if (cpumask_empty(non_isolated_cpus)) 7657 if (cpumask_empty(non_isolated_cpus))
7980 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7658 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -8281,6 +7959,7 @@ void __init sched_init(void)
8281 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7959 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8282 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7960 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8283#ifdef CONFIG_SMP 7961#ifdef CONFIG_SMP
7962 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8284#ifdef CONFIG_NO_HZ 7963#ifdef CONFIG_NO_HZ
8285 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 7964 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8286 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 7965 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8340,7 +8019,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8340 int old_prio = p->prio; 8019 int old_prio = p->prio;
8341 int on_rq; 8020 int on_rq;
8342 8021
8343 on_rq = p->se.on_rq; 8022 on_rq = p->on_rq;
8344 if (on_rq) 8023 if (on_rq)
8345 deactivate_task(rq, p, 0); 8024 deactivate_task(rq, p, 0);
8346 __setscheduler(rq, p, SCHED_NORMAL, 0); 8025 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8553,7 +8232,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8553{ 8232{
8554 struct rt_rq *rt_rq; 8233 struct rt_rq *rt_rq;
8555 struct sched_rt_entity *rt_se; 8234 struct sched_rt_entity *rt_se;
8556 struct rq *rq;
8557 int i; 8235 int i;
8558 8236
8559 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8237 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8567,8 +8245,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8567 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8245 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8568 8246
8569 for_each_possible_cpu(i) { 8247 for_each_possible_cpu(i) {
8570 rq = cpu_rq(i);
8571
8572 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8248 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8573 GFP_KERNEL, cpu_to_node(i)); 8249 GFP_KERNEL, cpu_to_node(i));
8574 if (!rt_rq) 8250 if (!rt_rq)
@@ -8683,7 +8359,7 @@ void sched_move_task(struct task_struct *tsk)
8683 rq = task_rq_lock(tsk, &flags); 8359 rq = task_rq_lock(tsk, &flags);
8684 8360
8685 running = task_current(rq, tsk); 8361 running = task_current(rq, tsk);
8686 on_rq = tsk->se.on_rq; 8362 on_rq = tsk->on_rq;
8687 8363
8688 if (on_rq) 8364 if (on_rq)
8689 dequeue_task(rq, tsk, 0); 8365 dequeue_task(rq, tsk, 0);
@@ -8702,7 +8378,7 @@ void sched_move_task(struct task_struct *tsk)
8702 if (on_rq) 8378 if (on_rq)
8703 enqueue_task(rq, tsk, 0); 8379 enqueue_task(rq, tsk, 0);
8704 8380
8705 task_rq_unlock(rq, &flags); 8381 task_rq_unlock(rq, tsk, &flags);
8706} 8382}
8707#endif /* CONFIG_CGROUP_SCHED */ 8383#endif /* CONFIG_CGROUP_SCHED */
8708 8384
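
Worth noting from the sched.c hunks above: __sdt_alloc() pre-allocates every sched_domain and sched_group into per-cpu pointer arrays, and claim_allocations() later NULLs the slots that were actually used, so the generic __sdt_free() teardown only releases what nobody claimed. A small, purely illustrative sketch of that ownership hand-off using a plain array instead of per-cpu data:

#include <stdio.h>
#include <stdlib.h>

#define NR 4

/* Pre-allocated pool: each slot is owned by the pool until claimed. */
static int *pool[NR];

static int pool_alloc(void)
{
	int i;

	for (i = 0; i < NR; i++) {
		pool[i] = malloc(sizeof(int));
		if (!pool[i])
			return -1;
		*pool[i] = i;
	}
	return 0;
}

/* Free only what is still owned by the pool; free(NULL) is a no-op. */
static void pool_free(void)
{
	int i;

	for (i = 0; i < NR; i++) {
		free(pool[i]);
		pool[i] = NULL;
	}
}

/* Take ownership of slot i: NULL it so pool_free() skips it. */
static int *pool_claim(int i)
{
	int *p = pool[i];

	pool[i] = NULL;
	return p;
}

int main(void)
{
	int *mine;

	if (pool_alloc()) {
		pool_free();		/* partial failure: unwind */
		return 1;
	}

	mine = pool_claim(2);		/* slot 2 is now ours */
	pool_free();			/* releases slots 0, 1 and 3 only */

	printf("claimed value: %d\n", *mine);
	free(mine);
	return 0;
}
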
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 7bacd83a4158..a6710a112b4f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
152 read_lock_irqsave(&tasklist_lock, flags); 152 read_lock_irqsave(&tasklist_lock, flags);
153 153
154 do_each_thread(g, p) { 154 do_each_thread(g, p) {
155 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 155 if (!p->on_rq || task_cpu(p) != rq_cpu)
156 continue; 156 continue;
157 157
158 print_task(m, rq, p); 158 print_task(m, rq, p);
@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu)
296 P(ttwu_count); 296 P(ttwu_count);
297 P(ttwu_local); 297 P(ttwu_local);
298 298
299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
301
302#undef P 299#undef P
303#undef P64 300#undef P64
304#endif 301#endif
@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
441 P(se.statistics.wait_count); 438 P(se.statistics.wait_count);
442 PN(se.statistics.iowait_sum); 439 PN(se.statistics.iowait_sum);
443 P(se.statistics.iowait_count); 440 P(se.statistics.iowait_count);
444 P(sched_info.bkl_count);
445 P(se.nr_migrations); 441 P(se.nr_migrations);
446 P(se.statistics.nr_migrations_cold); 442 P(se.statistics.nr_migrations_cold);
447 P(se.statistics.nr_failed_migrations_affine); 443 P(se.statistics.nr_failed_migrations_affine);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6fa833ab2cb8..37f22626225e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
358 } 358 }
359 359
360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
361#ifndef CONFIG_64BIT
362 smp_wmb();
363 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
364#endif
361} 365}
362 366
363/* 367/*
@@ -1340,6 +1344,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1340 hrtick_update(rq); 1344 hrtick_update(rq);
1341} 1345}
1342 1346
1347static void set_next_buddy(struct sched_entity *se);
1348
1343/* 1349/*
1344 * The dequeue_task method is called before nr_running is 1350 * The dequeue_task method is called before nr_running is
1345 * decreased. We remove the task from the rbtree and 1351 * decreased. We remove the task from the rbtree and
@@ -1349,14 +1355,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1349{ 1355{
1350 struct cfs_rq *cfs_rq; 1356 struct cfs_rq *cfs_rq;
1351 struct sched_entity *se = &p->se; 1357 struct sched_entity *se = &p->se;
1358 int task_sleep = flags & DEQUEUE_SLEEP;
1352 1359
1353 for_each_sched_entity(se) { 1360 for_each_sched_entity(se) {
1354 cfs_rq = cfs_rq_of(se); 1361 cfs_rq = cfs_rq_of(se);
1355 dequeue_entity(cfs_rq, se, flags); 1362 dequeue_entity(cfs_rq, se, flags);
1356 1363
1357 /* Don't dequeue parent if it has other entities besides us */ 1364 /* Don't dequeue parent if it has other entities besides us */
1358 if (cfs_rq->load.weight) 1365 if (cfs_rq->load.weight) {
1366 /*
1367 * Bias pick_next to pick a task from this cfs_rq, as
1368 * p is sleeping when it is within its sched_slice.
1369 */
1370 if (task_sleep && parent_entity(se))
1371 set_next_buddy(parent_entity(se));
1359 break; 1372 break;
1373 }
1360 flags |= DEQUEUE_SLEEP; 1374 flags |= DEQUEUE_SLEEP;
1361 } 1375 }
1362 1376
@@ -1372,12 +1386,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1372 1386
1373#ifdef CONFIG_SMP 1387#ifdef CONFIG_SMP
1374 1388
1375static void task_waking_fair(struct rq *rq, struct task_struct *p) 1389static void task_waking_fair(struct task_struct *p)
1376{ 1390{
1377 struct sched_entity *se = &p->se; 1391 struct sched_entity *se = &p->se;
1378 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1392 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1393 u64 min_vruntime;
1379 1394
1380 se->vruntime -= cfs_rq->min_vruntime; 1395#ifndef CONFIG_64BIT
1396 u64 min_vruntime_copy;
1397
1398 do {
1399 min_vruntime_copy = cfs_rq->min_vruntime_copy;
1400 smp_rmb();
1401 min_vruntime = cfs_rq->min_vruntime;
1402 } while (min_vruntime != min_vruntime_copy);
1403#else
1404 min_vruntime = cfs_rq->min_vruntime;
1405#endif
1406
1407 se->vruntime -= min_vruntime;
1381} 1408}
1382 1409
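
The min_vruntime_copy pair added above gives 32-bit kernels a seqlock-like way to read a 64-bit value that is now updated without the runqueue lock held: the writer stores the value, issues smp_wmb(), then stores the copy; the reader loads the copy, issues smp_rmb(), loads the value and retries until the two agree. A user-space approximation of that retry protocol (compiler fences stand in for the kernel barriers; build with -pthread):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Shared 64-bit value plus its published copy, mirroring the patch's
 * min_vruntime / min_vruntime_copy pair. */
static volatile uint64_t value;
static volatile uint64_t value_copy;

static void write_value(uint64_t v)
{
	value = v;
	__atomic_thread_fence(__ATOMIC_RELEASE);	/* ~ smp_wmb() */
	value_copy = v;
}

static uint64_t read_value(void)
{
	uint64_t v, copy;

	do {
		copy = value_copy;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);	/* ~ smp_rmb() */
		v = value;
	} while (v != copy);		/* update in flight: retry */

	return v;
}

static void *writer(void *arg)
{
	uint64_t i;

	(void)arg;
	for (i = 1; i <= 1000000; i++)
		write_value(i << 32 | i);	/* both halves always match */
	return NULL;
}

int main(void)
{
	pthread_t t;
	uint64_t v;

	pthread_create(&t, NULL, writer, NULL);
	v = read_value();
	pthread_join(t, NULL);

	/* On a consistent read the two 32-bit halves agree. */
	printf("read %#llx, halves %s\n", (unsigned long long)v,
	       (uint32_t)(v >> 32) == (uint32_t)v ? "match" : "MISMATCH");
	return 0;
}
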
1383#ifdef CONFIG_FAIR_GROUP_SCHED 1410#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1622,6 +1649,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1622 /* 1649 /*
1623 * Otherwise, iterate the domains and find an elegible idle cpu. 1650 * Otherwise, iterate the domains and find an elegible idle cpu.
1624 */ 1651 */
1652 rcu_read_lock();
1625 for_each_domain(target, sd) { 1653 for_each_domain(target, sd) {
1626 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 1654 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1627 break; 1655 break;
@@ -1641,6 +1669,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1641 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 1669 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1642 break; 1670 break;
1643 } 1671 }
1672 rcu_read_unlock();
1644 1673
1645 return target; 1674 return target;
1646} 1675}
@@ -1657,7 +1686,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1657 * preempt must be disabled. 1686 * preempt must be disabled.
1658 */ 1687 */
1659static int 1688static int
1660select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) 1689select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1661{ 1690{
1662 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1691 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1663 int cpu = smp_processor_id(); 1692 int cpu = smp_processor_id();
@@ -1673,6 +1702,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1673 new_cpu = prev_cpu; 1702 new_cpu = prev_cpu;
1674 } 1703 }
1675 1704
1705 rcu_read_lock();
1676 for_each_domain(cpu, tmp) { 1706 for_each_domain(cpu, tmp) {
1677 if (!(tmp->flags & SD_LOAD_BALANCE)) 1707 if (!(tmp->flags & SD_LOAD_BALANCE))
1678 continue; 1708 continue;
@@ -1723,9 +1753,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1723 1753
1724 if (affine_sd) { 1754 if (affine_sd) {
1725 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1755 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1726 return select_idle_sibling(p, cpu); 1756 prev_cpu = cpu;
1727 else 1757
1728 return select_idle_sibling(p, prev_cpu); 1758 new_cpu = select_idle_sibling(p, prev_cpu);
1759 goto unlock;
1729 } 1760 }
1730 1761
1731 while (sd) { 1762 while (sd) {
@@ -1766,6 +1797,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1766 } 1797 }
1767 /* while loop will break here if sd == NULL */ 1798 /* while loop will break here if sd == NULL */
1768 } 1799 }
1800unlock:
1801 rcu_read_unlock();
1769 1802
1770 return new_cpu; 1803 return new_cpu;
1771} 1804}
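
A pattern repeated throughout this patch is wrapping for_each_domain() walks in rcu_read_lock()/rcu_read_unlock(), with early returns converted into a single goto unlock exit so the read-side critical section can never be leaked. The sketch below shows only that single-exit structure; a plain mutex stands in for the RCU read lock, and the domain layout is invented:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

struct domain {
	int flags;
	struct domain *parent;
};

#define WANT_FLAG 0x4

static int pick_flags(struct domain *lowest, int fallback)
{
	struct domain *d;
	int result = fallback;

	pthread_mutex_lock(&lock);		/* ~ rcu_read_lock() */

	for (d = lowest; d; d = d->parent) {
		if (!(d->flags & WANT_FLAG))
			continue;
		result = d->flags;
		goto unlock;			/* early exit, one unlock site */
	}

unlock:
	pthread_mutex_unlock(&lock);		/* ~ rcu_read_unlock() */
	return result;
}

int main(void)
{
	struct domain top = { WANT_FLAG, NULL };
	struct domain mid = { 0x2, &top };
	struct domain bot = { 0x1, &mid };

	printf("picked %#x\n", pick_flags(&bot, -1));
	return 0;
}
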
@@ -1789,10 +1822,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1789 * This is especially important for buddies when the leftmost 1822 * This is especially important for buddies when the leftmost
1790 * task is higher priority than the buddy. 1823 * task is higher priority than the buddy.
1791 */ 1824 */
1792 if (unlikely(se->load.weight != NICE_0_LOAD)) 1825 return calc_delta_fair(gran, se);
1793 gran = calc_delta_fair(gran, se);
1794
1795 return gran;
1796} 1826}
1797 1827
1798/* 1828/*
@@ -1826,26 +1856,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1826 1856
1827static void set_last_buddy(struct sched_entity *se) 1857static void set_last_buddy(struct sched_entity *se)
1828{ 1858{
1829 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1859 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1830 for_each_sched_entity(se) 1860 return;
1831 cfs_rq_of(se)->last = se; 1861
1832 } 1862 for_each_sched_entity(se)
1863 cfs_rq_of(se)->last = se;
1833} 1864}
1834 1865
1835static void set_next_buddy(struct sched_entity *se) 1866static void set_next_buddy(struct sched_entity *se)
1836{ 1867{
1837 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1868 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1838 for_each_sched_entity(se) 1869 return;
1839 cfs_rq_of(se)->next = se; 1870
1840 } 1871 for_each_sched_entity(se)
1872 cfs_rq_of(se)->next = se;
1841} 1873}
1842 1874
1843static void set_skip_buddy(struct sched_entity *se) 1875static void set_skip_buddy(struct sched_entity *se)
1844{ 1876{
1845 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1877 for_each_sched_entity(se)
1846 for_each_sched_entity(se) 1878 cfs_rq_of(se)->skip = se;
1847 cfs_rq_of(se)->skip = se;
1848 }
1849} 1879}
1850 1880
1851/* 1881/*
@@ -1857,12 +1887,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1857 struct sched_entity *se = &curr->se, *pse = &p->se; 1887 struct sched_entity *se = &curr->se, *pse = &p->se;
1858 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1888 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1859 int scale = cfs_rq->nr_running >= sched_nr_latency; 1889 int scale = cfs_rq->nr_running >= sched_nr_latency;
1890 int next_buddy_marked = 0;
1860 1891
1861 if (unlikely(se == pse)) 1892 if (unlikely(se == pse))
1862 return; 1893 return;
1863 1894
1864 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) 1895 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1865 set_next_buddy(pse); 1896 set_next_buddy(pse);
1897 next_buddy_marked = 1;
1898 }
1866 1899
1867 /* 1900 /*
1868 * We can come here with TIF_NEED_RESCHED already set from new task 1901 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1890,8 +1923,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1890 update_curr(cfs_rq); 1923 update_curr(cfs_rq);
1891 find_matching_se(&se, &pse); 1924 find_matching_se(&se, &pse);
1892 BUG_ON(!pse); 1925 BUG_ON(!pse);
1893 if (wakeup_preempt_entity(se, pse) == 1) 1926 if (wakeup_preempt_entity(se, pse) == 1) {
1927 /*
1928 * Bias pick_next to pick the sched entity that is
1929 * triggering this preemption.
1930 */
1931 if (!next_buddy_marked)
1932 set_next_buddy(pse);
1894 goto preempt; 1933 goto preempt;
1934 }
1895 1935
1896 return; 1936 return;
1897 1937
@@ -2102,7 +2142,7 @@ static unsigned long
2102balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2142balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2103 unsigned long max_load_move, struct sched_domain *sd, 2143 unsigned long max_load_move, struct sched_domain *sd,
2104 enum cpu_idle_type idle, int *all_pinned, 2144 enum cpu_idle_type idle, int *all_pinned,
2105 int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2145 struct cfs_rq *busiest_cfs_rq)
2106{ 2146{
2107 int loops = 0, pulled = 0; 2147 int loops = 0, pulled = 0;
2108 long rem_load_move = max_load_move; 2148 long rem_load_move = max_load_move;
@@ -2140,9 +2180,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2140 */ 2180 */
2141 if (rem_load_move <= 0) 2181 if (rem_load_move <= 0)
2142 break; 2182 break;
2143
2144 if (p->prio < *this_best_prio)
2145 *this_best_prio = p->prio;
2146 } 2183 }
2147out: 2184out:
2148 /* 2185 /*
@@ -2202,7 +2239,7 @@ static unsigned long
2202load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2239load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2203 unsigned long max_load_move, 2240 unsigned long max_load_move,
2204 struct sched_domain *sd, enum cpu_idle_type idle, 2241 struct sched_domain *sd, enum cpu_idle_type idle,
2205 int *all_pinned, int *this_best_prio) 2242 int *all_pinned)
2206{ 2243{
2207 long rem_load_move = max_load_move; 2244 long rem_load_move = max_load_move;
2208 int busiest_cpu = cpu_of(busiest); 2245 int busiest_cpu = cpu_of(busiest);
@@ -2227,7 +2264,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2227 rem_load = div_u64(rem_load, busiest_h_load + 1); 2264 rem_load = div_u64(rem_load, busiest_h_load + 1);
2228 2265
2229 moved_load = balance_tasks(this_rq, this_cpu, busiest, 2266 moved_load = balance_tasks(this_rq, this_cpu, busiest,
2230 rem_load, sd, idle, all_pinned, this_best_prio, 2267 rem_load, sd, idle, all_pinned,
2231 busiest_cfs_rq); 2268 busiest_cfs_rq);
2232 2269
2233 if (!moved_load) 2270 if (!moved_load)
@@ -2253,11 +2290,11 @@ static unsigned long
2253load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2290load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2254 unsigned long max_load_move, 2291 unsigned long max_load_move,
2255 struct sched_domain *sd, enum cpu_idle_type idle, 2292 struct sched_domain *sd, enum cpu_idle_type idle,
2256 int *all_pinned, int *this_best_prio) 2293 int *all_pinned)
2257{ 2294{
2258 return balance_tasks(this_rq, this_cpu, busiest, 2295 return balance_tasks(this_rq, this_cpu, busiest,
2259 max_load_move, sd, idle, all_pinned, 2296 max_load_move, sd, idle, all_pinned,
2260 this_best_prio, &busiest->cfs); 2297 &busiest->cfs);
2261} 2298}
2262#endif 2299#endif
2263 2300
@@ -2274,12 +2311,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2274 int *all_pinned) 2311 int *all_pinned)
2275{ 2312{
2276 unsigned long total_load_moved = 0, load_moved; 2313 unsigned long total_load_moved = 0, load_moved;
2277 int this_best_prio = this_rq->curr->prio;
2278 2314
2279 do { 2315 do {
2280 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 2316 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2281 max_load_move - total_load_moved, 2317 max_load_move - total_load_moved,
2282 sd, idle, all_pinned, &this_best_prio); 2318 sd, idle, all_pinned);
2283 2319
2284 total_load_moved += load_moved; 2320 total_load_moved += load_moved;
2285 2321
@@ -2648,7 +2684,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2648 /* 2684 /*
2649 * Only siblings can have significantly less than SCHED_LOAD_SCALE 2685 * Only siblings can have significantly less than SCHED_LOAD_SCALE
2650 */ 2686 */
2651 if (sd->level != SD_LV_SIBLING) 2687 if (!(sd->flags & SD_SHARE_CPUPOWER))
2652 return 0; 2688 return 0;
2653 2689
2654 /* 2690 /*
@@ -3465,6 +3501,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3465 raw_spin_unlock(&this_rq->lock); 3501 raw_spin_unlock(&this_rq->lock);
3466 3502
3467 update_shares(this_cpu); 3503 update_shares(this_cpu);
3504 rcu_read_lock();
3468 for_each_domain(this_cpu, sd) { 3505 for_each_domain(this_cpu, sd) {
3469 unsigned long interval; 3506 unsigned long interval;
3470 int balance = 1; 3507 int balance = 1;
@@ -3486,6 +3523,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3486 break; 3523 break;
3487 } 3524 }
3488 } 3525 }
3526 rcu_read_unlock();
3489 3527
3490 raw_spin_lock(&this_rq->lock); 3528 raw_spin_lock(&this_rq->lock);
3491 3529
@@ -3534,6 +3572,7 @@ static int active_load_balance_cpu_stop(void *data)
3534 double_lock_balance(busiest_rq, target_rq); 3572 double_lock_balance(busiest_rq, target_rq);
3535 3573
3536 /* Search for an sd spanning us and the target CPU. */ 3574 /* Search for an sd spanning us and the target CPU. */
3575 rcu_read_lock();
3537 for_each_domain(target_cpu, sd) { 3576 for_each_domain(target_cpu, sd) {
3538 if ((sd->flags & SD_LOAD_BALANCE) && 3577 if ((sd->flags & SD_LOAD_BALANCE) &&
3539 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 3578 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3549,6 +3588,7 @@ static int active_load_balance_cpu_stop(void *data)
3549 else 3588 else
3550 schedstat_inc(sd, alb_failed); 3589 schedstat_inc(sd, alb_failed);
3551 } 3590 }
3591 rcu_read_unlock();
3552 double_unlock_balance(busiest_rq, target_rq); 3592 double_unlock_balance(busiest_rq, target_rq);
3553out_unlock: 3593out_unlock:
3554 busiest_rq->active_balance = 0; 3594 busiest_rq->active_balance = 0;
@@ -3675,6 +3715,7 @@ static int find_new_ilb(int cpu)
3675{ 3715{
3676 struct sched_domain *sd; 3716 struct sched_domain *sd;
3677 struct sched_group *ilb_group; 3717 struct sched_group *ilb_group;
3718 int ilb = nr_cpu_ids;
3678 3719
3679 /* 3720 /*
3680 * Have idle load balancer selection from semi-idle packages only 3721 * Have idle load balancer selection from semi-idle packages only
@@ -3690,20 +3731,25 @@ static int find_new_ilb(int cpu)
3690 if (cpumask_weight(nohz.idle_cpus_mask) < 2) 3731 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3691 goto out_done; 3732 goto out_done;
3692 3733
3734 rcu_read_lock();
3693 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3735 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3694 ilb_group = sd->groups; 3736 ilb_group = sd->groups;
3695 3737
3696 do { 3738 do {
3697 if (is_semi_idle_group(ilb_group)) 3739 if (is_semi_idle_group(ilb_group)) {
3698 return cpumask_first(nohz.grp_idle_mask); 3740 ilb = cpumask_first(nohz.grp_idle_mask);
3741 goto unlock;
3742 }
3699 3743
3700 ilb_group = ilb_group->next; 3744 ilb_group = ilb_group->next;
3701 3745
3702 } while (ilb_group != sd->groups); 3746 } while (ilb_group != sd->groups);
3703 } 3747 }
3748unlock:
3749 rcu_read_unlock();
3704 3750
3705out_done: 3751out_done:
3706 return nr_cpu_ids; 3752 return ilb;
3707} 3753}
3708#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3754#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3709static inline int find_new_ilb(int call_cpu) 3755static inline int find_new_ilb(int call_cpu)
@@ -3848,6 +3894,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3848 3894
3849 update_shares(cpu); 3895 update_shares(cpu);
3850 3896
3897 rcu_read_lock();
3851 for_each_domain(cpu, sd) { 3898 for_each_domain(cpu, sd) {
3852 if (!(sd->flags & SD_LOAD_BALANCE)) 3899 if (!(sd->flags & SD_LOAD_BALANCE))
3853 continue; 3900 continue;
@@ -3893,6 +3940,7 @@ out:
3893 if (!balance) 3940 if (!balance)
3894 break; 3941 break;
3895 } 3942 }
3943 rcu_read_unlock();
3896 3944
3897 /* 3945 /*
3898 * next_balance will be updated only when there is a need. 3946 * next_balance will be updated only when there is a need.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 68e69acc29b9..be40f7371ee1 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1)
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on irq activity
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONIRQ_POWER, 1)
67
68/*
69 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */
72SCHED_FEAT(TTWU_QUEUE, 1)
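
The TTWU_QUEUE feature described above queues remote wakeups on the target CPU and lets the scheduler IPI drain them, rather than having the waker take the remote runqueue lock. A rough user-space sketch of the push-then-drain idea; the kernel uses its own llist primitives and smp_send_reschedule(), whereas the structures below are invented for illustration:

#include <stdatomic.h>
#include <stdio.h>

/* A wakeup request queued for a remote "CPU". */
struct wake_entry {
	int task_id;
	struct wake_entry *next;
};

/* One pending-wakeup list per target; pushed lock-free, drained locally. */
static _Atomic(struct wake_entry *) wake_list;

static void queue_remote_wakeup(struct wake_entry *e)
{
	struct wake_entry *head = atomic_load(&wake_list);

	do {
		e->next = head;
	} while (!atomic_compare_exchange_weak(&wake_list, &head, e));
	/* In the kernel, this is where an IPI would be sent to the target
	 * if its list was previously empty. */
}

/* Runs on the target CPU, e.g. from the scheduler IPI handler. */
static void drain_wakeups(void)
{
	struct wake_entry *e = atomic_exchange(&wake_list, NULL);

	while (e) {
		struct wake_entry *next = e->next;

		printf("activating task %d locally\n", e->task_id);
		e = next;
	}
}

int main(void)
{
	struct wake_entry a = { 1, NULL }, b = { 2, NULL };

	queue_remote_wakeup(&a);
	queue_remote_wakeup(&b);
	drain_wakeups();
	return 0;
}
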
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a776a6396427..0a51882534ea 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
11{ 11{
12 return task_cpu(p); /* IDLE tasks as never migrated */ 12 return task_cpu(p); /* IDLE tasks as never migrated */
13} 13}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e7cebdc65f82..64b2a37c07d0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186typedef struct task_group *rt_rq_iter_t;
187
188#define for_each_rt_rq(rt_rq, iter, rq) \
189 for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
190 (&iter->list != &task_groups) && \
191 (rt_rq = iter->rt_rq[cpu_of(rq)]); \
192 iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
193
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 194static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{ 195{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list, 196 list_add_rcu(&rt_rq->leaf_rt_rq_list,
@@ -288,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
288 return ktime_to_ns(def_rt_bandwidth.rt_period); 296 return ktime_to_ns(def_rt_bandwidth.rt_period);
289} 297}
290 298
299typedef struct rt_rq *rt_rq_iter_t;
300
301#define for_each_rt_rq(rt_rq, iter, rq) \
302 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
303
291static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 304static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
292{ 305{
293} 306}
@@ -402,12 +415,13 @@ next:
402static void __disable_runtime(struct rq *rq) 415static void __disable_runtime(struct rq *rq)
403{ 416{
404 struct root_domain *rd = rq->rd; 417 struct root_domain *rd = rq->rd;
418 rt_rq_iter_t iter;
405 struct rt_rq *rt_rq; 419 struct rt_rq *rt_rq;
406 420
407 if (unlikely(!scheduler_running)) 421 if (unlikely(!scheduler_running))
408 return; 422 return;
409 423
410 for_each_leaf_rt_rq(rt_rq, rq) { 424 for_each_rt_rq(rt_rq, iter, rq) {
411 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 425 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
412 s64 want; 426 s64 want;
413 int i; 427 int i;
@@ -487,6 +501,7 @@ static void disable_runtime(struct rq *rq)
487 501
488static void __enable_runtime(struct rq *rq) 502static void __enable_runtime(struct rq *rq)
489{ 503{
504 rt_rq_iter_t iter;
490 struct rt_rq *rt_rq; 505 struct rt_rq *rt_rq;
491 506
492 if (unlikely(!scheduler_running)) 507 if (unlikely(!scheduler_running))
@@ -495,7 +510,7 @@ static void __enable_runtime(struct rq *rq)
495 /* 510 /*
496 * Reset each runqueue's bandwidth settings 511 * Reset each runqueue's bandwidth settings
497 */ 512 */
498 for_each_leaf_rt_rq(rt_rq, rq) { 513 for_each_rt_rq(rt_rq, iter, rq) {
499 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 514 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
500 515
501 raw_spin_lock(&rt_b->rt_runtime_lock); 516 raw_spin_lock(&rt_b->rt_runtime_lock);
@@ -562,6 +577,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
562 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 577 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
563 rt_rq->rt_throttled = 0; 578 rt_rq->rt_throttled = 0;
564 enqueue = 1; 579 enqueue = 1;
580
581 /*
582 * Force a clock update if the CPU was idle,
583 * lest wakeup -> unthrottle time accumulate.
584 */
585 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
586 rq->skip_clock_update = -1;
565 } 587 }
566 if (rt_rq->rt_time || rt_rq->rt_nr_running) 588 if (rt_rq->rt_time || rt_rq->rt_nr_running)
567 idle = 0; 589 idle = 0;
@@ -977,13 +999,23 @@ static void yield_task_rt(struct rq *rq)
977static int find_lowest_rq(struct task_struct *task); 999static int find_lowest_rq(struct task_struct *task);
978 1000
979static int 1001static int
980select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 1002select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
981{ 1003{
1004 struct task_struct *curr;
1005 struct rq *rq;
1006 int cpu;
1007
982 if (sd_flag != SD_BALANCE_WAKE) 1008 if (sd_flag != SD_BALANCE_WAKE)
983 return smp_processor_id(); 1009 return smp_processor_id();
984 1010
1011 cpu = task_cpu(p);
1012 rq = cpu_rq(cpu);
1013
1014 rcu_read_lock();
1015 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1016
985 /* 1017 /*
986 * If the current task is an RT task, then 1018 * If the current task on @p's runqueue is an RT task, then
987 * try to see if we can wake this RT task up on another 1019 * try to see if we can wake this RT task up on another
988 * runqueue. Otherwise simply start this RT task 1020 * runqueue. Otherwise simply start this RT task
989 * on its current runqueue. 1021 * on its current runqueue.
@@ -997,21 +1029,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
997 * lock? 1029 * lock?
998 * 1030 *
999 * For equal prio tasks, we just let the scheduler sort it out. 1031 * For equal prio tasks, we just let the scheduler sort it out.
1032 *
1033 * Otherwise, just let it ride on the affined RQ and the
1034 * post-schedule router will push the preempted task away
1035 *
1036 * This test is optimistic, if we get it wrong the load-balancer
1037 * will have to sort it out.
1000 */ 1038 */
1001 if (unlikely(rt_task(rq->curr)) && 1039 if (curr && unlikely(rt_task(curr)) &&
1002 (rq->curr->rt.nr_cpus_allowed < 2 || 1040 (curr->rt.nr_cpus_allowed < 2 ||
1003 rq->curr->prio < p->prio) && 1041 curr->prio < p->prio) &&
1004 (p->rt.nr_cpus_allowed > 1)) { 1042 (p->rt.nr_cpus_allowed > 1)) {
1005 int cpu = find_lowest_rq(p); 1043 int target = find_lowest_rq(p);
1006 1044
1007 return (cpu == -1) ? task_cpu(p) : cpu; 1045 if (target != -1)
1046 cpu = target;
1008 } 1047 }
1048 rcu_read_unlock();
1009 1049
1010 /* 1050 return cpu;
1011 * Otherwise, just let it ride on the affined RQ and the
1012 * post-schedule router will push the preempted task away
1013 */
1014 return task_cpu(p);
1015} 1051}
1016 1052
1017static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1053static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
@@ -1136,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1136 * The previous task needs to be made eligible for pushing 1172 * The previous task needs to be made eligible for pushing
1137 * if it is still active 1173 * if it is still active
1138 */ 1174 */
1139 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) 1175 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1140 enqueue_pushable_task(rq, p); 1176 enqueue_pushable_task(rq, p);
1141} 1177}
1142 1178
@@ -1287,7 +1323,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1287 !cpumask_test_cpu(lowest_rq->cpu, 1323 !cpumask_test_cpu(lowest_rq->cpu,
1288 &task->cpus_allowed) || 1324 &task->cpus_allowed) ||
1289 task_running(rq, task) || 1325 task_running(rq, task) ||
1290 !task->se.on_rq)) { 1326 !task->on_rq)) {
1291 1327
1292 raw_spin_unlock(&lowest_rq->lock); 1328 raw_spin_unlock(&lowest_rq->lock);
1293 lowest_rq = NULL; 1329 lowest_rq = NULL;
@@ -1321,7 +1357,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1321 BUG_ON(task_current(rq, p)); 1357 BUG_ON(task_current(rq, p));
1322 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1358 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1323 1359
1324 BUG_ON(!p->se.on_rq); 1360 BUG_ON(!p->on_rq);
1325 BUG_ON(!rt_task(p)); 1361 BUG_ON(!rt_task(p));
1326 1362
1327 return p; 1363 return p;
@@ -1467,7 +1503,7 @@ static int pull_rt_task(struct rq *this_rq)
1467 */ 1503 */
1468 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1504 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1469 WARN_ON(p == src_rq->curr); 1505 WARN_ON(p == src_rq->curr);
1470 WARN_ON(!p->se.on_rq); 1506 WARN_ON(!p->on_rq);
1471 1507
1472 /* 1508 /*
1473 * There's a chance that p is higher in priority 1509 * There's a chance that p is higher in priority
@@ -1538,7 +1574,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1538 * Update the migration status of the RQ if we have an RT task 1574 * Update the migration status of the RQ if we have an RT task
1539 * which is running AND changing its weight value. 1575 * which is running AND changing its weight value.
1540 */ 1576 */
1541 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1577 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1542 struct rq *rq = task_rq(p); 1578 struct rq *rq = task_rq(p);
1543 1579
1544 if (!task_current(rq, p)) { 1580 if (!task_current(rq, p)) {
@@ -1608,7 +1644,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1608 * we may need to handle the pulling of RT tasks 1644 * we may need to handle the pulling of RT tasks
1609 * now. 1645 * now.
1610 */ 1646 */
1611 if (p->se.on_rq && !rq->rt.rt_nr_running) 1647 if (p->on_rq && !rq->rt.rt_nr_running)
1612 pull_rt_task(rq); 1648 pull_rt_task(rq);
1613} 1649}
1614 1650
@@ -1638,7 +1674,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1638 * If that current running task is also an RT task 1674 * If that current running task is also an RT task
1639 * then see if we can move to another run queue. 1675 * then see if we can move to another run queue.
1640 */ 1676 */
1641 if (p->se.on_rq && rq->curr != p) { 1677 if (p->on_rq && rq->curr != p) {
1642#ifdef CONFIG_SMP 1678#ifdef CONFIG_SMP
1643 if (rq->rt.overloaded && push_rt_task(rq) && 1679 if (rq->rt.overloaded && push_rt_task(rq) &&
1644 /* Don't resched if we changed runqueues */ 1680 /* Don't resched if we changed runqueues */
@@ -1657,7 +1693,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1657static void 1693static void
1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1694prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1659{ 1695{
1660 if (!p->se.on_rq) 1696 if (!p->on_rq)
1661 return; 1697 return;
1662 1698
1663 if (rq->curr == p) { 1699 if (rq->curr == p) {
@@ -1796,10 +1832,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1796 1832
1797static void print_rt_stats(struct seq_file *m, int cpu) 1833static void print_rt_stats(struct seq_file *m, int cpu)
1798{ 1834{
1835 rt_rq_iter_t iter;
1799 struct rt_rq *rt_rq; 1836 struct rt_rq *rt_rq;
1800 1837
1801 rcu_read_lock(); 1838 rcu_read_lock();
1802 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) 1839 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
1803 print_rt_rq(m, cpu, rt_rq); 1840 print_rt_rq(m, cpu, rt_rq);
1804 rcu_read_unlock(); 1841 rcu_read_unlock();
1805} 1842}
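
The for_each_rt_rq() iterator introduced above lets one loop body serve both configurations: with RT group scheduling it walks each task group's rt_rq for the given runqueue, and without it it degenerates to visiting the single rq->rt. A compile-time sketch of the typedef-the-iterator technique, using simplified, non-kernel types:

#include <stdio.h>

/* Toggle to see both expansions of the same loop body. */
#define GROUPED 1

struct rt_rq { int id; };

#if GROUPED
/* Grouped config: iterate an array of runqueues via an index iterator. */
static struct rt_rq groups[3] = { { 0 }, { 1 }, { 2 } };

typedef int rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter)				\
	for ((iter) = 0;					\
	     (iter) < 3 && ((rt_rq) = &groups[(iter)], 1);	\
	     (iter)++)
#else
/* Flat config: exactly one rt_rq; the iterator is unused but kept so
 * callers compile unchanged, mirroring the "(void) iter" trick above. */
static struct rt_rq only = { 42 };

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter)				\
	for ((void)(iter), (rt_rq) = &only; (rt_rq); (rt_rq) = NULL)
#endif

int main(void)
{
	rt_rq_iter_t iter = 0;
	struct rt_rq *rt_rq;

	for_each_rt_rq(rt_rq, iter)
		printf("rt_rq %d\n", rt_rq->id);
	return 0;
}
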
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 1ba2bd40fdac..6f437632afab 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -9,8 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p, 12select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
13 int sd_flag, int flags)
14{ 13{
15 return task_cpu(p); /* stop tasks as never migrate */ 14 return task_cpu(p); /* stop tasks as never migrate */
16} 15}
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 25{
27 struct task_struct *stop = rq->stop; 26 struct task_struct *stop = rq->stop;
28 27
29 if (stop && stop->se.on_rq) 28 if (stop && stop->on_rq)
30 return stop; 29 return stop;
31 30
32 return NULL; 31 return NULL;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 35d55a386145..f925c45f0afa 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -53,7 +53,6 @@ const char *reserved_field_names[] = {
53 "common_preempt_count", 53 "common_preempt_count",
54 "common_pid", 54 "common_pid",
55 "common_tgid", 55 "common_tgid",
56 "common_lock_depth",
57 FIELD_STRING_IP, 56 FIELD_STRING_IP,
58 FIELD_STRING_RETIP, 57 FIELD_STRING_RETIP,
59 FIELD_STRING_FUNC, 58 FIELD_STRING_FUNC,
diff --git a/tools/perf/Documentation/perf-script-perl.txt b/tools/perf/Documentation/perf-script-perl.txt
index 5bb41e55a3ac..3152cca15501 100644
--- a/tools/perf/Documentation/perf-script-perl.txt
+++ b/tools/perf/Documentation/perf-script-perl.txt
@@ -63,7 +63,6 @@ The format file for the sched_wakep event defines the following fields
63 field:unsigned char common_flags; 63 field:unsigned char common_flags;
64 field:unsigned char common_preempt_count; 64 field:unsigned char common_preempt_count;
65 field:int common_pid; 65 field:int common_pid;
66 field:int common_lock_depth;
67 66
68 field:char comm[TASK_COMM_LEN]; 67 field:char comm[TASK_COMM_LEN];
69 field:pid_t pid; 68 field:pid_t pid;
diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt
index 36b38277422c..471022069119 100644
--- a/tools/perf/Documentation/perf-script-python.txt
+++ b/tools/perf/Documentation/perf-script-python.txt
@@ -463,7 +463,6 @@ The format file for the sched_wakep event defines the following fields
463 field:unsigned char common_flags; 463 field:unsigned char common_flags;
464 field:unsigned char common_preempt_count; 464 field:unsigned char common_preempt_count;
465 field:int common_pid; 465 field:int common_pid;
466 field:int common_lock_depth;
467 466
468 field:char comm[TASK_COMM_LEN]; 467 field:char comm[TASK_COMM_LEN];
469 field:pid_t pid; 468 field:pid_t pid;