diff options
47 files changed, 928 insertions, 1170 deletions
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 6d27ab8d6e9f..c83bd6b4e6e8 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt | |||
@@ -120,7 +120,6 @@ format: | |||
120 | field:unsigned char common_flags; offset:2; size:1; signed:0; | 120 | field:unsigned char common_flags; offset:2; size:1; signed:0; |
121 | field:unsigned char common_preempt_count; offset:3; size:1;signed:0; | 121 | field:unsigned char common_preempt_count; offset:3; size:1;signed:0; |
122 | field:int common_pid; offset:4; size:4; signed:1; | 122 | field:int common_pid; offset:4; size:4; signed:1; |
123 | field:int common_lock_depth; offset:8; size:4; signed:1; | ||
124 | 123 | ||
125 | field:unsigned long __probe_ip; offset:12; size:4; signed:0; | 124 | field:unsigned long __probe_ip; offset:12; size:4; signed:0; |
126 | field:int __probe_nargs; offset:16; size:4; signed:1; | 125 | field:int __probe_nargs; offset:16; size:4; signed:1; |
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 42aa078a5e4d..5a621c6d22ab 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c | |||
@@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs) | |||
585 | 585 | ||
586 | switch (which) { | 586 | switch (which) { |
587 | case IPI_RESCHEDULE: | 587 | case IPI_RESCHEDULE: |
588 | /* Reschedule callback. Everything to be done | 588 | scheduler_ipi(); |
589 | is done by the interrupt return path. */ | ||
590 | break; | 589 | break; |
591 | 590 | ||
592 | case IPI_CALL_FUNC: | 591 | case IPI_CALL_FUNC: |
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index f29b8a29b174..007a0a950e75 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c | |||
@@ -560,10 +560,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs) | |||
560 | break; | 560 | break; |
561 | 561 | ||
562 | case IPI_RESCHEDULE: | 562 | case IPI_RESCHEDULE: |
563 | /* | 563 | scheduler_ipi(); |
564 | * nothing more to do - eveything is | ||
565 | * done on the interrupt return path | ||
566 | */ | ||
567 | break; | 564 | break; |
568 | 565 | ||
569 | case IPI_CALL_FUNC: | 566 | case IPI_CALL_FUNC: |
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 8bce5ed031e4..1fbd94c44457 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c | |||
@@ -177,6 +177,9 @@ static irqreturn_t ipi_handler_int1(int irq, void *dev_instance) | |||
177 | while (msg_queue->count) { | 177 | while (msg_queue->count) { |
178 | msg = &msg_queue->ipi_message[msg_queue->head]; | 178 | msg = &msg_queue->ipi_message[msg_queue->head]; |
179 | switch (msg->type) { | 179 | switch (msg->type) { |
180 | case BFIN_IPI_RESCHEDULE: | ||
181 | scheduler_ipi(); | ||
182 | break; | ||
180 | case BFIN_IPI_CALL_FUNC: | 183 | case BFIN_IPI_CALL_FUNC: |
181 | spin_unlock_irqrestore(&msg_queue->lock, flags); | 184 | spin_unlock_irqrestore(&msg_queue->lock, flags); |
182 | ipi_call_function(cpu, msg); | 185 | ipi_call_function(cpu, msg); |
diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 4c9e3e1ba5d1..66cc75657e2f 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c | |||
@@ -342,15 +342,18 @@ irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id) | |||
342 | 342 | ||
343 | ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); | 343 | ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); |
344 | 344 | ||
345 | if (ipi.vector & IPI_SCHEDULE) { | ||
346 | scheduler_ipi(); | ||
347 | } | ||
345 | if (ipi.vector & IPI_CALL) { | 348 | if (ipi.vector & IPI_CALL) { |
346 | func(info); | 349 | func(info); |
347 | } | 350 | } |
348 | if (ipi.vector & IPI_FLUSH_TLB) { | 351 | if (ipi.vector & IPI_FLUSH_TLB) { |
349 | if (flush_mm == FLUSH_ALL) | 352 | if (flush_mm == FLUSH_ALL) |
350 | __flush_tlb_all(); | 353 | __flush_tlb_all(); |
351 | else if (flush_vma == FLUSH_ALL) | 354 | else if (flush_vma == FLUSH_ALL) |
352 | __flush_tlb_mm(flush_mm); | 355 | __flush_tlb_mm(flush_mm); |
353 | else | 356 | else |
354 | __flush_tlb_page(flush_vma, flush_addr); | 357 | __flush_tlb_page(flush_vma, flush_addr); |
355 | } | 358 | } |
356 | 359 | ||
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 5b704740f160..782c3a357f24 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/irq.h> | 31 | #include <linux/irq.h> |
32 | #include <linux/ratelimit.h> | 32 | #include <linux/ratelimit.h> |
33 | #include <linux/acpi.h> | 33 | #include <linux/acpi.h> |
34 | #include <linux/sched.h> | ||
34 | 35 | ||
35 | #include <asm/delay.h> | 36 | #include <asm/delay.h> |
36 | #include <asm/intrinsics.h> | 37 | #include <asm/intrinsics.h> |
@@ -496,6 +497,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) | |||
496 | smp_local_flush_tlb(); | 497 | smp_local_flush_tlb(); |
497 | kstat_incr_irqs_this_cpu(irq, desc); | 498 | kstat_incr_irqs_this_cpu(irq, desc); |
498 | } else if (unlikely(IS_RESCHEDULE(vector))) { | 499 | } else if (unlikely(IS_RESCHEDULE(vector))) { |
500 | scheduler_ipi(); | ||
499 | kstat_incr_irqs_this_cpu(irq, desc); | 501 | kstat_incr_irqs_this_cpu(irq, desc); |
500 | } else { | 502 | } else { |
501 | ia64_setreg(_IA64_REG_CR_TPR, vector); | 503 | ia64_setreg(_IA64_REG_CR_TPR, vector); |
diff --git a/arch/ia64/xen/irq_xen.c b/arch/ia64/xen/irq_xen.c index 108bb858acf2..b279e142c633 100644 --- a/arch/ia64/xen/irq_xen.c +++ b/arch/ia64/xen/irq_xen.c | |||
@@ -92,6 +92,8 @@ static unsigned short saved_irq_cnt; | |||
92 | static int xen_slab_ready; | 92 | static int xen_slab_ready; |
93 | 93 | ||
94 | #ifdef CONFIG_SMP | 94 | #ifdef CONFIG_SMP |
95 | #include <linux/sched.h> | ||
96 | |||
95 | /* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ, | 97 | /* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ, |
96 | * it ends up to issue several memory accesses upon percpu data and | 98 | * it ends up to issue several memory accesses upon percpu data and |
97 | * thus adds unnecessary traffic to other paths. | 99 | * thus adds unnecessary traffic to other paths. |
@@ -99,7 +101,13 @@ static int xen_slab_ready; | |||
99 | static irqreturn_t | 101 | static irqreturn_t |
100 | xen_dummy_handler(int irq, void *dev_id) | 102 | xen_dummy_handler(int irq, void *dev_id) |
101 | { | 103 | { |
104 | return IRQ_HANDLED; | ||
105 | } | ||
102 | 106 | ||
107 | static irqreturn_t | ||
108 | xen_resched_handler(int irq, void *dev_id) | ||
109 | { | ||
110 | scheduler_ipi(); | ||
103 | return IRQ_HANDLED; | 111 | return IRQ_HANDLED; |
104 | } | 112 | } |
105 | 113 | ||
@@ -110,7 +118,7 @@ static struct irqaction xen_ipi_irqaction = { | |||
110 | }; | 118 | }; |
111 | 119 | ||
112 | static struct irqaction xen_resched_irqaction = { | 120 | static struct irqaction xen_resched_irqaction = { |
113 | .handler = xen_dummy_handler, | 121 | .handler = xen_resched_handler, |
114 | .flags = IRQF_DISABLED, | 122 | .flags = IRQF_DISABLED, |
115 | .name = "resched" | 123 | .name = "resched" |
116 | }; | 124 | }; |
diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 31cef20b2996..fc10b39893d4 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c | |||
@@ -122,8 +122,6 @@ void smp_send_reschedule(int cpu_id) | |||
122 | * | 122 | * |
123 | * Description: This routine executes on CPU which received | 123 | * Description: This routine executes on CPU which received |
124 | * 'RESCHEDULE_IPI'. | 124 | * 'RESCHEDULE_IPI'. |
125 | * Rescheduling is processed at the exit of interrupt | ||
126 | * operation. | ||
127 | * | 125 | * |
128 | * Born on Date: 2002.02.05 | 126 | * Born on Date: 2002.02.05 |
129 | * | 127 | * |
@@ -138,7 +136,7 @@ void smp_send_reschedule(int cpu_id) | |||
138 | *==========================================================================*/ | 136 | *==========================================================================*/ |
139 | void smp_reschedule_interrupt(void) | 137 | void smp_reschedule_interrupt(void) |
140 | { | 138 | { |
141 | /* nothing to do */ | 139 | scheduler_ipi(); |
142 | } | 140 | } |
143 | 141 | ||
144 | /*==========================================================================* | 142 | /*==========================================================================* |
diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index 716fae6f941a..8b606423bbd7 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c | |||
@@ -44,6 +44,8 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id) | |||
44 | 44 | ||
45 | if (action & SMP_CALL_FUNCTION) | 45 | if (action & SMP_CALL_FUNCTION) |
46 | smp_call_function_interrupt(); | 46 | smp_call_function_interrupt(); |
47 | if (action & SMP_RESCHEDULE_YOURSELF) | ||
48 | scheduler_ipi(); | ||
47 | 49 | ||
48 | /* Check if we've been told to flush the icache */ | 50 | /* Check if we've been told to flush the icache */ |
49 | if (action & SMP_ICACHE_FLUSH) | 51 | if (action & SMP_ICACHE_FLUSH) |
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index 5a88cc4ccd5a..cedac4633741 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c | |||
@@ -929,7 +929,7 @@ static void post_direct_ipi(int cpu, struct smtc_ipi *pipi) | |||
929 | 929 | ||
930 | static void ipi_resched_interrupt(void) | 930 | static void ipi_resched_interrupt(void) |
931 | { | 931 | { |
932 | /* Return from interrupt should be enough to cause scheduler check */ | 932 | scheduler_ipi(); |
933 | } | 933 | } |
934 | 934 | ||
935 | static void ipi_call_interrupt(void) | 935 | static void ipi_call_interrupt(void) |
diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c index e85c977328da..1d36c511a7a5 100644 --- a/arch/mips/mti-malta/malta-int.c +++ b/arch/mips/mti-malta/malta-int.c | |||
@@ -308,6 +308,8 @@ static void ipi_call_dispatch(void) | |||
308 | 308 | ||
309 | static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) | 309 | static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) |
310 | { | 310 | { |
311 | scheduler_ipi(); | ||
312 | |||
311 | return IRQ_HANDLED; | 313 | return IRQ_HANDLED; |
312 | } | 314 | } |
313 | 315 | ||
diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c index efc9e889b349..2608752898c0 100644 --- a/arch/mips/pmc-sierra/yosemite/smp.c +++ b/arch/mips/pmc-sierra/yosemite/smp.c | |||
@@ -55,6 +55,8 @@ void titan_mailbox_irq(void) | |||
55 | 55 | ||
56 | if (status & 0x2) | 56 | if (status & 0x2) |
57 | smp_call_function_interrupt(); | 57 | smp_call_function_interrupt(); |
58 | if (status & 0x4) | ||
59 | scheduler_ipi(); | ||
58 | break; | 60 | break; |
59 | 61 | ||
60 | case 1: | 62 | case 1: |
@@ -63,6 +65,8 @@ void titan_mailbox_irq(void) | |||
63 | 65 | ||
64 | if (status & 0x2) | 66 | if (status & 0x2) |
65 | smp_call_function_interrupt(); | 67 | smp_call_function_interrupt(); |
68 | if (status & 0x4) | ||
69 | scheduler_ipi(); | ||
66 | break; | 70 | break; |
67 | } | 71 | } |
68 | } | 72 | } |
diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c index 0a04603d577c..b18b04e48577 100644 --- a/arch/mips/sgi-ip27/ip27-irq.c +++ b/arch/mips/sgi-ip27/ip27-irq.c | |||
@@ -147,8 +147,10 @@ static void ip27_do_irq_mask0(void) | |||
147 | #ifdef CONFIG_SMP | 147 | #ifdef CONFIG_SMP |
148 | if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) { | 148 | if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) { |
149 | LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ); | 149 | LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ); |
150 | scheduler_ipi(); | ||
150 | } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) { | 151 | } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) { |
151 | LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ); | 152 | LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ); |
153 | scheduler_ipi(); | ||
152 | } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { | 154 | } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { |
153 | LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); | 155 | LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); |
154 | smp_call_function_interrupt(); | 156 | smp_call_function_interrupt(); |
diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c index 47b347c992ea..d667875be564 100644 --- a/arch/mips/sibyte/bcm1480/smp.c +++ b/arch/mips/sibyte/bcm1480/smp.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/delay.h> | 20 | #include <linux/delay.h> |
21 | #include <linux/smp.h> | 21 | #include <linux/smp.h> |
22 | #include <linux/kernel_stat.h> | 22 | #include <linux/kernel_stat.h> |
23 | #include <linux/sched.h> | ||
23 | 24 | ||
24 | #include <asm/mmu_context.h> | 25 | #include <asm/mmu_context.h> |
25 | #include <asm/io.h> | 26 | #include <asm/io.h> |
@@ -189,10 +190,8 @@ void bcm1480_mailbox_interrupt(void) | |||
189 | /* Clear the mailbox to clear the interrupt */ | 190 | /* Clear the mailbox to clear the interrupt */ |
190 | __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]); | 191 | __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]); |
191 | 192 | ||
192 | /* | 193 | if (action & SMP_RESCHEDULE_YOURSELF) |
193 | * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the | 194 | scheduler_ipi(); |
194 | * interrupt will do the reschedule for us | ||
195 | */ | ||
196 | 195 | ||
197 | if (action & SMP_CALL_FUNCTION) | 196 | if (action & SMP_CALL_FUNCTION) |
198 | smp_call_function_interrupt(); | 197 | smp_call_function_interrupt(); |
diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c index c00a5cb1128d..38e7f6bd7922 100644 --- a/arch/mips/sibyte/sb1250/smp.c +++ b/arch/mips/sibyte/sb1250/smp.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/interrupt.h> | 21 | #include <linux/interrupt.h> |
22 | #include <linux/smp.h> | 22 | #include <linux/smp.h> |
23 | #include <linux/kernel_stat.h> | 23 | #include <linux/kernel_stat.h> |
24 | #include <linux/sched.h> | ||
24 | 25 | ||
25 | #include <asm/mmu_context.h> | 26 | #include <asm/mmu_context.h> |
26 | #include <asm/io.h> | 27 | #include <asm/io.h> |
@@ -177,10 +178,8 @@ void sb1250_mailbox_interrupt(void) | |||
177 | /* Clear the mailbox to clear the interrupt */ | 178 | /* Clear the mailbox to clear the interrupt */ |
178 | ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]); | 179 | ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]); |
179 | 180 | ||
180 | /* | 181 | if (action & SMP_RESCHEDULE_YOURSELF) |
181 | * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the | 182 | scheduler_ipi(); |
182 | * interrupt will do the reschedule for us | ||
183 | */ | ||
184 | 183 | ||
185 | if (action & SMP_CALL_FUNCTION) | 184 | if (action & SMP_CALL_FUNCTION) |
186 | smp_call_function_interrupt(); | 185 | smp_call_function_interrupt(); |
diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index 226c826a2194..83fb27912231 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c | |||
@@ -494,14 +494,11 @@ void smp_send_stop(void) | |||
494 | * @irq: The interrupt number. | 494 | * @irq: The interrupt number. |
495 | * @dev_id: The device ID. | 495 | * @dev_id: The device ID. |
496 | * | 496 | * |
497 | * We need do nothing here, since the scheduling will be effected on our way | ||
498 | * back through entry.S. | ||
499 | * | ||
500 | * Returns IRQ_HANDLED to indicate we handled the interrupt successfully. | 497 | * Returns IRQ_HANDLED to indicate we handled the interrupt successfully. |
501 | */ | 498 | */ |
502 | static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) | 499 | static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) |
503 | { | 500 | { |
504 | /* do nothing */ | 501 | scheduler_ipi(); |
505 | return IRQ_HANDLED; | 502 | return IRQ_HANDLED; |
506 | } | 503 | } |
507 | 504 | ||
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 69d63d354ef0..828305f19cff 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c | |||
@@ -155,10 +155,7 @@ ipi_interrupt(int irq, void *dev_id) | |||
155 | 155 | ||
156 | case IPI_RESCHEDULE: | 156 | case IPI_RESCHEDULE: |
157 | smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); | 157 | smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); |
158 | /* | 158 | scheduler_ipi(); |
159 | * Reschedule callback. Everything to be | ||
160 | * done is done by the interrupt return path. | ||
161 | */ | ||
162 | break; | 159 | break; |
163 | 160 | ||
164 | case IPI_CALL_FUNC: | 161 | case IPI_CALL_FUNC: |
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index cbdbb14be4b0..9f9c204bef69 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c | |||
@@ -116,7 +116,7 @@ void smp_message_recv(int msg) | |||
116 | generic_smp_call_function_interrupt(); | 116 | generic_smp_call_function_interrupt(); |
117 | break; | 117 | break; |
118 | case PPC_MSG_RESCHEDULE: | 118 | case PPC_MSG_RESCHEDULE: |
119 | /* we notice need_resched on exit */ | 119 | scheduler_ipi(); |
120 | break; | 120 | break; |
121 | case PPC_MSG_CALL_FUNC_SINGLE: | 121 | case PPC_MSG_CALL_FUNC_SINGLE: |
122 | generic_smp_call_function_single_interrupt(); | 122 | generic_smp_call_function_single_interrupt(); |
@@ -146,7 +146,7 @@ static irqreturn_t call_function_action(int irq, void *data) | |||
146 | 146 | ||
147 | static irqreturn_t reschedule_action(int irq, void *data) | 147 | static irqreturn_t reschedule_action(int irq, void *data) |
148 | { | 148 | { |
149 | /* we just need the return path side effect of checking need_resched */ | 149 | scheduler_ipi(); |
150 | return IRQ_HANDLED; | 150 | return IRQ_HANDLED; |
151 | } | 151 | } |
152 | 152 | ||
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 63a97db83f96..63c7d9ff220d 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c | |||
@@ -165,12 +165,12 @@ static void do_ext_call_interrupt(unsigned int ext_int_code, | |||
165 | kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++; | 165 | kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++; |
166 | /* | 166 | /* |
167 | * handle bit signal external calls | 167 | * handle bit signal external calls |
168 | * | ||
169 | * For the ec_schedule signal we have to do nothing. All the work | ||
170 | * is done automatically when we return from the interrupt. | ||
171 | */ | 168 | */ |
172 | bits = xchg(&S390_lowcore.ext_call_fast, 0); | 169 | bits = xchg(&S390_lowcore.ext_call_fast, 0); |
173 | 170 | ||
171 | if (test_bit(ec_schedule, &bits)) | ||
172 | scheduler_ipi(); | ||
173 | |||
174 | if (test_bit(ec_call_function, &bits)) | 174 | if (test_bit(ec_call_function, &bits)) |
175 | generic_smp_call_function_interrupt(); | 175 | generic_smp_call_function_interrupt(); |
176 | 176 | ||
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 509b36b45115..6207561ea34a 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/module.h> | 20 | #include <linux/module.h> |
21 | #include <linux/cpu.h> | 21 | #include <linux/cpu.h> |
22 | #include <linux/interrupt.h> | 22 | #include <linux/interrupt.h> |
23 | #include <linux/sched.h> | ||
23 | #include <asm/atomic.h> | 24 | #include <asm/atomic.h> |
24 | #include <asm/processor.h> | 25 | #include <asm/processor.h> |
25 | #include <asm/system.h> | 26 | #include <asm/system.h> |
@@ -323,6 +324,7 @@ void smp_message_recv(unsigned int msg) | |||
323 | generic_smp_call_function_interrupt(); | 324 | generic_smp_call_function_interrupt(); |
324 | break; | 325 | break; |
325 | case SMP_MSG_RESCHEDULE: | 326 | case SMP_MSG_RESCHEDULE: |
327 | scheduler_ipi(); | ||
326 | break; | 328 | break; |
327 | case SMP_MSG_FUNCTION_SINGLE: | 329 | case SMP_MSG_FUNCTION_SINGLE: |
328 | generic_smp_call_function_single_interrupt(); | 330 | generic_smp_call_function_single_interrupt(); |
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 1c79f32734a0..8b9c556d630b 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h | |||
@@ -65,6 +65,10 @@ static inline int pcibus_to_node(struct pci_bus *pbus) | |||
65 | #define smt_capable() (sparc64_multi_core) | 65 | #define smt_capable() (sparc64_multi_core) |
66 | #endif /* CONFIG_SMP */ | 66 | #endif /* CONFIG_SMP */ |
67 | 67 | ||
68 | #define cpu_coregroup_mask(cpu) (&cpu_core_map[cpu]) | 68 | extern cpumask_t cpu_core_map[NR_CPUS]; |
69 | static inline const struct cpumask *cpu_coregroup_mask(int cpu) | ||
70 | { | ||
71 | return &cpu_core_map[cpu]; | ||
72 | } | ||
69 | 73 | ||
70 | #endif /* _ASM_SPARC64_TOPOLOGY_H */ | 74 | #endif /* _ASM_SPARC64_TOPOLOGY_H */ |
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 850a1360c0d6..442286d83435 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c | |||
@@ -129,7 +129,9 @@ struct linux_prom_registers smp_penguin_ctable __cpuinitdata = { 0 }; | |||
129 | 129 | ||
130 | void smp_send_reschedule(int cpu) | 130 | void smp_send_reschedule(int cpu) |
131 | { | 131 | { |
132 | /* See sparc64 */ | 132 | /* |
133 | * XXX missing reschedule IPI, see scheduler_ipi() | ||
134 | */ | ||
133 | } | 135 | } |
134 | 136 | ||
135 | void smp_send_stop(void) | 137 | void smp_send_stop(void) |
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 3e94a8c23238..9478da7fdb3e 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c | |||
@@ -1368,6 +1368,7 @@ void smp_send_reschedule(int cpu) | |||
1368 | void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) | 1368 | void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) |
1369 | { | 1369 | { |
1370 | clear_softint(1 << irq); | 1370 | clear_softint(1 << irq); |
1371 | scheduler_ipi(); | ||
1371 | } | 1372 | } |
1372 | 1373 | ||
1373 | /* This is a nop because we capture all other cpus | 1374 | /* This is a nop because we capture all other cpus |
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index a4293102ef81..c52224d5ed45 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c | |||
@@ -189,12 +189,8 @@ void flush_icache_range(unsigned long start, unsigned long end) | |||
189 | /* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */ | 189 | /* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */ |
190 | static irqreturn_t handle_reschedule_ipi(int irq, void *token) | 190 | static irqreturn_t handle_reschedule_ipi(int irq, void *token) |
191 | { | 191 | { |
192 | /* | ||
193 | * Nothing to do here; when we return from interrupt, the | ||
194 | * rescheduling will occur there. But do bump the interrupt | ||
195 | * profiler count in the meantime. | ||
196 | */ | ||
197 | __get_cpu_var(irq_stat).irq_resched_count++; | 192 | __get_cpu_var(irq_stat).irq_resched_count++; |
193 | scheduler_ipi(); | ||
198 | 194 | ||
199 | return IRQ_HANDLED; | 195 | return IRQ_HANDLED; |
200 | } | 196 | } |
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 106bf27e2a9a..eefb107d2d73 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c | |||
@@ -173,7 +173,7 @@ void IPI_handler(int cpu) | |||
173 | break; | 173 | break; |
174 | 174 | ||
175 | case 'R': | 175 | case 'R': |
176 | set_tsk_need_resched(current); | 176 | scheduler_ipi(); |
177 | break; | 177 | break; |
178 | 178 | ||
179 | case 'S': | 179 | case 'S': |
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 513deac7228d..013e7eba83bb 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c | |||
@@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait) | |||
194 | } | 194 | } |
195 | 195 | ||
196 | /* | 196 | /* |
197 | * Reschedule call back. Nothing to do, | 197 | * Reschedule call back. |
198 | * all the work is done automatically when | ||
199 | * we return from the interrupt. | ||
200 | */ | 198 | */ |
201 | void smp_reschedule_interrupt(struct pt_regs *regs) | 199 | void smp_reschedule_interrupt(struct pt_regs *regs) |
202 | { | 200 | { |
203 | ack_APIC_irq(); | 201 | ack_APIC_irq(); |
204 | inc_irq_stat(irq_resched_count); | 202 | inc_irq_stat(irq_resched_count); |
203 | scheduler_ipi(); | ||
205 | /* | 204 | /* |
206 | * KVM uses this interrupt to force a cpu out of guest mode | 205 | * KVM uses this interrupt to force a cpu out of guest mode |
207 | */ | 206 | */ |
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 194a3edef5cb..41038c01de40 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
@@ -46,13 +46,12 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); | |||
46 | static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); | 46 | static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * Reschedule call back. Nothing to do, | 49 | * Reschedule call back. |
50 | * all the work is done automatically when | ||
51 | * we return from the interrupt. | ||
52 | */ | 50 | */ |
53 | static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) | 51 | static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) |
54 | { | 52 | { |
55 | inc_irq_stat(irq_resched_count); | 53 | inc_irq_stat(irq_resched_count); |
54 | scheduler_ipi(); | ||
56 | 55 | ||
57 | return IRQ_HANDLED; | 56 | return IRQ_HANDLED; |
58 | } | 57 | } |
diff --git a/include/linux/init_task.h b/include/linux/init_task.h index caa151fbebb7..689496bb6654 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h | |||
@@ -134,7 +134,6 @@ extern struct cred init_cred; | |||
134 | .stack = &init_thread_info, \ | 134 | .stack = &init_thread_info, \ |
135 | .usage = ATOMIC_INIT(2), \ | 135 | .usage = ATOMIC_INIT(2), \ |
136 | .flags = PF_KTHREAD, \ | 136 | .flags = PF_KTHREAD, \ |
137 | .lock_depth = -1, \ | ||
138 | .prio = MAX_PRIO-20, \ | 137 | .prio = MAX_PRIO-20, \ |
139 | .static_prio = MAX_PRIO-20, \ | 138 | .static_prio = MAX_PRIO-20, \ |
140 | .normal_prio = MAX_PRIO-20, \ | 139 | .normal_prio = MAX_PRIO-20, \ |
diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 94b48bd40dd7..c75471db576e 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h | |||
@@ -51,7 +51,7 @@ struct mutex { | |||
51 | spinlock_t wait_lock; | 51 | spinlock_t wait_lock; |
52 | struct list_head wait_list; | 52 | struct list_head wait_list; |
53 | #if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) | 53 | #if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) |
54 | struct thread_info *owner; | 54 | struct task_struct *owner; |
55 | #endif | 55 | #endif |
56 | #ifdef CONFIG_DEBUG_MUTEXES | 56 | #ifdef CONFIG_DEBUG_MUTEXES |
57 | const char *name; | 57 | const char *name; |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 781abd137673..12211e1666e2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -360,7 +360,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout); | |||
360 | extern signed long schedule_timeout_killable(signed long timeout); | 360 | extern signed long schedule_timeout_killable(signed long timeout); |
361 | extern signed long schedule_timeout_uninterruptible(signed long timeout); | 361 | extern signed long schedule_timeout_uninterruptible(signed long timeout); |
362 | asmlinkage void schedule(void); | 362 | asmlinkage void schedule(void); |
363 | extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); | 363 | extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); |
364 | 364 | ||
365 | struct nsproxy; | 365 | struct nsproxy; |
366 | struct user_namespace; | 366 | struct user_namespace; |
@@ -731,10 +731,6 @@ struct sched_info { | |||
731 | /* timestamps */ | 731 | /* timestamps */ |
732 | unsigned long long last_arrival,/* when we last ran on a cpu */ | 732 | unsigned long long last_arrival,/* when we last ran on a cpu */ |
733 | last_queued; /* when we were last queued to run */ | 733 | last_queued; /* when we were last queued to run */ |
734 | #ifdef CONFIG_SCHEDSTATS | ||
735 | /* BKL stats */ | ||
736 | unsigned int bkl_count; | ||
737 | #endif | ||
738 | }; | 734 | }; |
739 | #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ | 735 | #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ |
740 | 736 | ||
@@ -868,6 +864,7 @@ static inline int sd_power_saving_flags(void) | |||
868 | 864 | ||
869 | struct sched_group { | 865 | struct sched_group { |
870 | struct sched_group *next; /* Must be a circular list */ | 866 | struct sched_group *next; /* Must be a circular list */ |
867 | atomic_t ref; | ||
871 | 868 | ||
872 | /* | 869 | /* |
873 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a | 870 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a |
@@ -882,9 +879,6 @@ struct sched_group { | |||
882 | * NOTE: this field is variable length. (Allocated dynamically | 879 | * NOTE: this field is variable length. (Allocated dynamically |
883 | * by attaching extra space to the end of the structure, | 880 | * by attaching extra space to the end of the structure, |
884 | * depending on how many CPUs the kernel has booted up with) | 881 | * depending on how many CPUs the kernel has booted up with) |
885 | * | ||
886 | * It is also be embedded into static data structures at build | ||
887 | * time. (See 'struct static_sched_group' in kernel/sched.c) | ||
888 | */ | 882 | */ |
889 | unsigned long cpumask[0]; | 883 | unsigned long cpumask[0]; |
890 | }; | 884 | }; |
@@ -894,17 +888,6 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) | |||
894 | return to_cpumask(sg->cpumask); | 888 | return to_cpumask(sg->cpumask); |
895 | } | 889 | } |
896 | 890 | ||
897 | enum sched_domain_level { | ||
898 | SD_LV_NONE = 0, | ||
899 | SD_LV_SIBLING, | ||
900 | SD_LV_MC, | ||
901 | SD_LV_BOOK, | ||
902 | SD_LV_CPU, | ||
903 | SD_LV_NODE, | ||
904 | SD_LV_ALLNODES, | ||
905 | SD_LV_MAX | ||
906 | }; | ||
907 | |||
908 | struct sched_domain_attr { | 891 | struct sched_domain_attr { |
909 | int relax_domain_level; | 892 | int relax_domain_level; |
910 | }; | 893 | }; |
@@ -913,6 +896,8 @@ struct sched_domain_attr { | |||
913 | .relax_domain_level = -1, \ | 896 | .relax_domain_level = -1, \ |
914 | } | 897 | } |
915 | 898 | ||
899 | extern int sched_domain_level_max; | ||
900 | |||
916 | struct sched_domain { | 901 | struct sched_domain { |
917 | /* These fields must be setup */ | 902 | /* These fields must be setup */ |
918 | struct sched_domain *parent; /* top domain must be null terminated */ | 903 | struct sched_domain *parent; /* top domain must be null terminated */ |
@@ -930,7 +915,7 @@ struct sched_domain { | |||
930 | unsigned int forkexec_idx; | 915 | unsigned int forkexec_idx; |
931 | unsigned int smt_gain; | 916 | unsigned int smt_gain; |
932 | int flags; /* See SD_* */ | 917 | int flags; /* See SD_* */ |
933 | enum sched_domain_level level; | 918 | int level; |
934 | 919 | ||
935 | /* Runtime fields. */ | 920 | /* Runtime fields. */ |
936 | unsigned long last_balance; /* init to jiffies. units in jiffies */ | 921 | unsigned long last_balance; /* init to jiffies. units in jiffies */ |
@@ -973,6 +958,10 @@ struct sched_domain { | |||
973 | #ifdef CONFIG_SCHED_DEBUG | 958 | #ifdef CONFIG_SCHED_DEBUG |
974 | char *name; | 959 | char *name; |
975 | #endif | 960 | #endif |
961 | union { | ||
962 | void *private; /* used during construction */ | ||
963 | struct rcu_head rcu; /* used during destruction */ | ||
964 | }; | ||
976 | 965 | ||
977 | unsigned int span_weight; | 966 | unsigned int span_weight; |
978 | /* | 967 | /* |
@@ -981,9 +970,6 @@ struct sched_domain { | |||
981 | * NOTE: this field is variable length. (Allocated dynamically | 970 | * NOTE: this field is variable length. (Allocated dynamically |
982 | * by attaching extra space to the end of the structure, | 971 | * by attaching extra space to the end of the structure, |
983 | * depending on how many CPUs the kernel has booted up with) | 972 | * depending on how many CPUs the kernel has booted up with) |
984 | * | ||
985 | * It is also be embedded into static data structures at build | ||
986 | * time. (See 'struct static_sched_domain' in kernel/sched.c) | ||
987 | */ | 973 | */ |
988 | unsigned long span[0]; | 974 | unsigned long span[0]; |
989 | }; | 975 | }; |
@@ -1048,8 +1034,12 @@ struct sched_domain; | |||
1048 | #define WF_FORK 0x02 /* child wakeup after fork */ | 1034 | #define WF_FORK 0x02 /* child wakeup after fork */ |
1049 | 1035 | ||
1050 | #define ENQUEUE_WAKEUP 1 | 1036 | #define ENQUEUE_WAKEUP 1 |
1051 | #define ENQUEUE_WAKING 2 | 1037 | #define ENQUEUE_HEAD 2 |
1052 | #define ENQUEUE_HEAD 4 | 1038 | #ifdef CONFIG_SMP |
1039 | #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ | ||
1040 | #else | ||
1041 | #define ENQUEUE_WAKING 0 | ||
1042 | #endif | ||
1053 | 1043 | ||
1054 | #define DEQUEUE_SLEEP 1 | 1044 | #define DEQUEUE_SLEEP 1 |
1055 | 1045 | ||
@@ -1067,12 +1057,11 @@ struct sched_class { | |||
1067 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1057 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
1068 | 1058 | ||
1069 | #ifdef CONFIG_SMP | 1059 | #ifdef CONFIG_SMP |
1070 | int (*select_task_rq)(struct rq *rq, struct task_struct *p, | 1060 | int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); |
1071 | int sd_flag, int flags); | ||
1072 | 1061 | ||
1073 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | 1062 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); |
1074 | void (*post_schedule) (struct rq *this_rq); | 1063 | void (*post_schedule) (struct rq *this_rq); |
1075 | void (*task_waking) (struct rq *this_rq, struct task_struct *task); | 1064 | void (*task_waking) (struct task_struct *task); |
1076 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | 1065 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); |
1077 | 1066 | ||
1078 | void (*set_cpus_allowed)(struct task_struct *p, | 1067 | void (*set_cpus_allowed)(struct task_struct *p, |
@@ -1197,13 +1186,11 @@ struct task_struct { | |||
1197 | unsigned int flags; /* per process flags, defined below */ | 1186 | unsigned int flags; /* per process flags, defined below */ |
1198 | unsigned int ptrace; | 1187 | unsigned int ptrace; |
1199 | 1188 | ||
1200 | int lock_depth; /* BKL lock depth */ | ||
1201 | |||
1202 | #ifdef CONFIG_SMP | 1189 | #ifdef CONFIG_SMP |
1203 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 1190 | struct task_struct *wake_entry; |
1204 | int oncpu; | 1191 | int on_cpu; |
1205 | #endif | ||
1206 | #endif | 1192 | #endif |
1193 | int on_rq; | ||
1207 | 1194 | ||
1208 | int prio, static_prio, normal_prio; | 1195 | int prio, static_prio, normal_prio; |
1209 | unsigned int rt_priority; | 1196 | unsigned int rt_priority; |
@@ -1274,6 +1261,7 @@ struct task_struct { | |||
1274 | 1261 | ||
1275 | /* Revert to default priority/policy when forking */ | 1262 | /* Revert to default priority/policy when forking */ |
1276 | unsigned sched_reset_on_fork:1; | 1263 | unsigned sched_reset_on_fork:1; |
1264 | unsigned sched_contributes_to_load:1; | ||
1277 | 1265 | ||
1278 | pid_t pid; | 1266 | pid_t pid; |
1279 | pid_t tgid; | 1267 | pid_t tgid; |
@@ -2063,14 +2051,13 @@ extern void xtime_update(unsigned long ticks); | |||
2063 | 2051 | ||
2064 | extern int wake_up_state(struct task_struct *tsk, unsigned int state); | 2052 | extern int wake_up_state(struct task_struct *tsk, unsigned int state); |
2065 | extern int wake_up_process(struct task_struct *tsk); | 2053 | extern int wake_up_process(struct task_struct *tsk); |
2066 | extern void wake_up_new_task(struct task_struct *tsk, | 2054 | extern void wake_up_new_task(struct task_struct *tsk); |
2067 | unsigned long clone_flags); | ||
2068 | #ifdef CONFIG_SMP | 2055 | #ifdef CONFIG_SMP |
2069 | extern void kick_process(struct task_struct *tsk); | 2056 | extern void kick_process(struct task_struct *tsk); |
2070 | #else | 2057 | #else |
2071 | static inline void kick_process(struct task_struct *tsk) { } | 2058 | static inline void kick_process(struct task_struct *tsk) { } |
2072 | #endif | 2059 | #endif |
2073 | extern void sched_fork(struct task_struct *p, int clone_flags); | 2060 | extern void sched_fork(struct task_struct *p); |
2074 | extern void sched_dead(struct task_struct *p); | 2061 | extern void sched_dead(struct task_struct *p); |
2075 | 2062 | ||
2076 | extern void proc_caches_init(void); | 2063 | extern void proc_caches_init(void); |
@@ -2195,8 +2182,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); | |||
2195 | extern char *get_task_comm(char *to, struct task_struct *tsk); | 2182 | extern char *get_task_comm(char *to, struct task_struct *tsk); |
2196 | 2183 | ||
2197 | #ifdef CONFIG_SMP | 2184 | #ifdef CONFIG_SMP |
2185 | void scheduler_ipi(void); | ||
2198 | extern unsigned long wait_task_inactive(struct task_struct *, long match_state); | 2186 | extern unsigned long wait_task_inactive(struct task_struct *, long match_state); |
2199 | #else | 2187 | #else |
2188 | static inline void scheduler_ipi(void) { } | ||
2200 | static inline unsigned long wait_task_inactive(struct task_struct *p, | 2189 | static inline unsigned long wait_task_inactive(struct task_struct *p, |
2201 | long match_state) | 2190 | long match_state) |
2202 | { | 2191 | { |
diff --git a/init/Kconfig b/init/Kconfig index 7a71e0a9992a..af958ad26d60 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -827,6 +827,11 @@ config SCHED_AUTOGROUP | |||
827 | desktop applications. Task group autogeneration is currently based | 827 | desktop applications. Task group autogeneration is currently based |
828 | upon task session. | 828 | upon task session. |
829 | 829 | ||
830 | config SCHED_TTWU_QUEUE | ||
831 | bool | ||
832 | depends on !SPARC32 | ||
833 | default y | ||
834 | |||
830 | config MM_OWNER | 835 | config MM_OWNER |
831 | bool | 836 | bool |
832 | 837 | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 33eee16addb8..2bb8c2e98fff 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void) | |||
1159 | static int update_relax_domain_level(struct cpuset *cs, s64 val) | 1159 | static int update_relax_domain_level(struct cpuset *cs, s64 val) |
1160 | { | 1160 | { |
1161 | #ifdef CONFIG_SMP | 1161 | #ifdef CONFIG_SMP |
1162 | if (val < -1 || val >= SD_LV_MAX) | 1162 | if (val < -1 || val >= sched_domain_level_max) |
1163 | return -EINVAL; | 1163 | return -EINVAL; |
1164 | #endif | 1164 | #endif |
1165 | 1165 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index e7548dee636b..2b44d82b8237 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1103,7 +1103,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1103 | 1103 | ||
1104 | posix_cpu_timers_init(p); | 1104 | posix_cpu_timers_init(p); |
1105 | 1105 | ||
1106 | p->lock_depth = -1; /* -1 = no lock */ | ||
1107 | do_posix_clock_monotonic_gettime(&p->start_time); | 1106 | do_posix_clock_monotonic_gettime(&p->start_time); |
1108 | p->real_start_time = p->start_time; | 1107 | p->real_start_time = p->start_time; |
1109 | monotonic_to_bootbased(&p->real_start_time); | 1108 | monotonic_to_bootbased(&p->real_start_time); |
@@ -1153,7 +1152,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1153 | #endif | 1152 | #endif |
1154 | 1153 | ||
1155 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1154 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
1156 | sched_fork(p, clone_flags); | 1155 | sched_fork(p); |
1157 | 1156 | ||
1158 | retval = perf_event_init_task(p); | 1157 | retval = perf_event_init_task(p); |
1159 | if (retval) | 1158 | if (retval) |
@@ -1464,7 +1463,7 @@ long do_fork(unsigned long clone_flags, | |||
1464 | */ | 1463 | */ |
1465 | p->flags &= ~PF_STARTING; | 1464 | p->flags &= ~PF_STARTING; |
1466 | 1465 | ||
1467 | wake_up_new_task(p, clone_flags); | 1466 | wake_up_new_task(p); |
1468 | 1467 | ||
1469 | tracehook_report_clone_complete(trace, regs, | 1468 | tracehook_report_clone_complete(trace, regs, |
1470 | clone_flags, nr, p); | 1469 | clone_flags, nr, p); |
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index ec815a960b5d..73da83aff418 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock) | |||
75 | return; | 75 | return; |
76 | 76 | ||
77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); | 77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); |
78 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); | 78 | DEBUG_LOCKS_WARN_ON(lock->owner != current); |
79 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 79 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
80 | mutex_clear_owner(lock); | 80 | mutex_clear_owner(lock); |
81 | } | 81 | } |
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 57d527a16f9d..0799fd3e4cfa 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h | |||
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, | |||
29 | 29 | ||
30 | static inline void mutex_set_owner(struct mutex *lock) | 30 | static inline void mutex_set_owner(struct mutex *lock) |
31 | { | 31 | { |
32 | lock->owner = current_thread_info(); | 32 | lock->owner = current; |
33 | } | 33 | } |
34 | 34 | ||
35 | static inline void mutex_clear_owner(struct mutex *lock) | 35 | static inline void mutex_clear_owner(struct mutex *lock) |
diff --git a/kernel/mutex.c b/kernel/mutex.c index c4195fa98900..2c938e2337cd 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
160 | */ | 160 | */ |
161 | 161 | ||
162 | for (;;) { | 162 | for (;;) { |
163 | struct thread_info *owner; | 163 | struct task_struct *owner; |
164 | |||
165 | /* | ||
166 | * If we own the BKL, then don't spin. The owner of | ||
167 | * the mutex might be waiting on us to release the BKL. | ||
168 | */ | ||
169 | if (unlikely(current->lock_depth >= 0)) | ||
170 | break; | ||
171 | 164 | ||
172 | /* | 165 | /* |
173 | * If there's an owner, wait for it to either | 166 | * If there's an owner, wait for it to either |
diff --git a/kernel/mutex.h b/kernel/mutex.h index 67578ca48f94..4115fbf83b12 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h | |||
@@ -19,7 +19,7 @@ | |||
19 | #ifdef CONFIG_SMP | 19 | #ifdef CONFIG_SMP |
20 | static inline void mutex_set_owner(struct mutex *lock) | 20 | static inline void mutex_set_owner(struct mutex *lock) |
21 | { | 21 | { |
22 | lock->owner = current_thread_info(); | 22 | lock->owner = current; |
23 | } | 23 | } |
24 | 24 | ||
25 | static inline void mutex_clear_owner(struct mutex *lock) | 25 | static inline void mutex_clear_owner(struct mutex *lock) |
diff --git a/kernel/sched.c b/kernel/sched.c index 312f8b95c2d4..c62acf45d3b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
231 | #endif | 231 | #endif |
232 | 232 | ||
233 | /* | 233 | /* |
234 | * sched_domains_mutex serializes calls to arch_init_sched_domains, | 234 | * sched_domains_mutex serializes calls to init_sched_domains, |
235 | * detach_destroy_domains and partition_sched_domains. | 235 | * detach_destroy_domains and partition_sched_domains. |
236 | */ | 236 | */ |
237 | static DEFINE_MUTEX(sched_domains_mutex); | 237 | static DEFINE_MUTEX(sched_domains_mutex); |
@@ -312,6 +312,9 @@ struct cfs_rq { | |||
312 | 312 | ||
313 | u64 exec_clock; | 313 | u64 exec_clock; |
314 | u64 min_vruntime; | 314 | u64 min_vruntime; |
315 | #ifndef CONFIG_64BIT | ||
316 | u64 min_vruntime_copy; | ||
317 | #endif | ||
315 | 318 | ||
316 | struct rb_root tasks_timeline; | 319 | struct rb_root tasks_timeline; |
317 | struct rb_node *rb_leftmost; | 320 | struct rb_node *rb_leftmost; |
@@ -325,7 +328,9 @@ struct cfs_rq { | |||
325 | */ | 328 | */ |
326 | struct sched_entity *curr, *next, *last, *skip; | 329 | struct sched_entity *curr, *next, *last, *skip; |
327 | 330 | ||
331 | #ifdef CONFIG_SCHED_DEBUG | ||
328 | unsigned int nr_spread_over; | 332 | unsigned int nr_spread_over; |
333 | #endif | ||
329 | 334 | ||
330 | #ifdef CONFIG_FAIR_GROUP_SCHED | 335 | #ifdef CONFIG_FAIR_GROUP_SCHED |
331 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 336 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
@@ -417,6 +422,7 @@ struct rt_rq { | |||
417 | */ | 422 | */ |
418 | struct root_domain { | 423 | struct root_domain { |
419 | atomic_t refcount; | 424 | atomic_t refcount; |
425 | struct rcu_head rcu; | ||
420 | cpumask_var_t span; | 426 | cpumask_var_t span; |
421 | cpumask_var_t online; | 427 | cpumask_var_t online; |
422 | 428 | ||
@@ -460,7 +466,7 @@ struct rq { | |||
460 | u64 nohz_stamp; | 466 | u64 nohz_stamp; |
461 | unsigned char nohz_balance_kick; | 467 | unsigned char nohz_balance_kick; |
462 | #endif | 468 | #endif |
463 | unsigned int skip_clock_update; | 469 | int skip_clock_update; |
464 | 470 | ||
465 | /* capture load from *all* tasks on this cpu: */ | 471 | /* capture load from *all* tasks on this cpu: */ |
466 | struct load_weight load; | 472 | struct load_weight load; |
@@ -553,6 +559,10 @@ struct rq { | |||
553 | unsigned int ttwu_count; | 559 | unsigned int ttwu_count; |
554 | unsigned int ttwu_local; | 560 | unsigned int ttwu_local; |
555 | #endif | 561 | #endif |
562 | |||
563 | #ifdef CONFIG_SMP | ||
564 | struct task_struct *wake_list; | ||
565 | #endif | ||
556 | }; | 566 | }; |
557 | 567 | ||
558 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 568 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
@@ -571,7 +581,7 @@ static inline int cpu_of(struct rq *rq) | |||
571 | 581 | ||
572 | #define rcu_dereference_check_sched_domain(p) \ | 582 | #define rcu_dereference_check_sched_domain(p) \ |
573 | rcu_dereference_check((p), \ | 583 | rcu_dereference_check((p), \ |
574 | rcu_read_lock_sched_held() || \ | 584 | rcu_read_lock_held() || \ |
575 | lockdep_is_held(&sched_domains_mutex)) | 585 | lockdep_is_held(&sched_domains_mutex)) |
576 | 586 | ||
577 | /* | 587 | /* |
@@ -596,7 +606,7 @@ static inline int cpu_of(struct rq *rq) | |||
596 | * Return the group to which this tasks belongs. | 606 | * Return the group to which this tasks belongs. |
597 | * | 607 | * |
598 | * We use task_subsys_state_check() and extend the RCU verification | 608 | * We use task_subsys_state_check() and extend the RCU verification |
599 | * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() | 609 | * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() |
600 | * holds that lock for each task it moves into the cgroup. Therefore | 610 | * holds that lock for each task it moves into the cgroup. Therefore |
601 | * by holding that lock, we pin the task to the current cgroup. | 611 | * by holding that lock, we pin the task to the current cgroup. |
602 | */ | 612 | */ |
@@ -606,7 +616,7 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
606 | struct cgroup_subsys_state *css; | 616 | struct cgroup_subsys_state *css; |
607 | 617 | ||
608 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 618 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
609 | lockdep_is_held(&task_rq(p)->lock)); | 619 | lockdep_is_held(&p->pi_lock)); |
610 | tg = container_of(css, struct task_group, css); | 620 | tg = container_of(css, struct task_group, css); |
611 | 621 | ||
612 | return autogroup_task_group(p, tg); | 622 | return autogroup_task_group(p, tg); |
@@ -642,7 +652,7 @@ static void update_rq_clock(struct rq *rq) | |||
642 | { | 652 | { |
643 | s64 delta; | 653 | s64 delta; |
644 | 654 | ||
645 | if (rq->skip_clock_update) | 655 | if (rq->skip_clock_update > 0) |
646 | return; | 656 | return; |
647 | 657 | ||
648 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | 658 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
@@ -838,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) | |||
838 | return rq->curr == p; | 848 | return rq->curr == p; |
839 | } | 849 | } |
840 | 850 | ||
841 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
842 | static inline int task_running(struct rq *rq, struct task_struct *p) | 851 | static inline int task_running(struct rq *rq, struct task_struct *p) |
843 | { | 852 | { |
853 | #ifdef CONFIG_SMP | ||
854 | return p->on_cpu; | ||
855 | #else | ||
844 | return task_current(rq, p); | 856 | return task_current(rq, p); |
857 | #endif | ||
845 | } | 858 | } |
846 | 859 | ||
860 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
847 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 861 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
848 | { | 862 | { |
863 | #ifdef CONFIG_SMP | ||
864 | /* | ||
865 | * We can optimise this out completely for !SMP, because the | ||
866 | * SMP rebalancing from interrupt is the only thing that cares | ||
867 | * here. | ||
868 | */ | ||
869 | next->on_cpu = 1; | ||
870 | #endif | ||
849 | } | 871 | } |
850 | 872 | ||
851 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 873 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
852 | { | 874 | { |
875 | #ifdef CONFIG_SMP | ||
876 | /* | ||
877 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
878 | * We must ensure this doesn't happen until the switch is completely | ||
879 | * finished. | ||
880 | */ | ||
881 | smp_wmb(); | ||
882 | prev->on_cpu = 0; | ||
883 | #endif | ||
853 | #ifdef CONFIG_DEBUG_SPINLOCK | 884 | #ifdef CONFIG_DEBUG_SPINLOCK |
854 | /* this is a valid case when another task releases the spinlock */ | 885 | /* this is a valid case when another task releases the spinlock */ |
855 | rq->lock.owner = current; | 886 | rq->lock.owner = current; |
@@ -865,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
865 | } | 896 | } |
866 | 897 | ||
867 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 898 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
868 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
869 | { | ||
870 | #ifdef CONFIG_SMP | ||
871 | return p->oncpu; | ||
872 | #else | ||
873 | return task_current(rq, p); | ||
874 | #endif | ||
875 | } | ||
876 | |||
877 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 899 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
878 | { | 900 | { |
879 | #ifdef CONFIG_SMP | 901 | #ifdef CONFIG_SMP |
@@ -882,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
882 | * SMP rebalancing from interrupt is the only thing that cares | 904 | * SMP rebalancing from interrupt is the only thing that cares |
883 | * here. | 905 | * here. |
884 | */ | 906 | */ |
885 | next->oncpu = 1; | 907 | next->on_cpu = 1; |
886 | #endif | 908 | #endif |
887 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 909 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
888 | raw_spin_unlock_irq(&rq->lock); | 910 | raw_spin_unlock_irq(&rq->lock); |
@@ -895,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
895 | { | 917 | { |
896 | #ifdef CONFIG_SMP | 918 | #ifdef CONFIG_SMP |
897 | /* | 919 | /* |
898 | * After ->oncpu is cleared, the task can be moved to a different CPU. | 920 | * After ->on_cpu is cleared, the task can be moved to a different CPU. |
899 | * We must ensure this doesn't happen until the switch is completely | 921 | * We must ensure this doesn't happen until the switch is completely |
900 | * finished. | 922 | * finished. |
901 | */ | 923 | */ |
902 | smp_wmb(); | 924 | smp_wmb(); |
903 | prev->oncpu = 0; | 925 | prev->on_cpu = 0; |
904 | #endif | 926 | #endif |
905 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 927 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
906 | local_irq_enable(); | 928 | local_irq_enable(); |
@@ -909,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
909 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 931 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
910 | 932 | ||
911 | /* | 933 | /* |
912 | * Check whether the task is waking, we use this to synchronize ->cpus_allowed | 934 | * __task_rq_lock - lock the rq @p resides on. |
913 | * against ttwu(). | ||
914 | */ | ||
915 | static inline int task_is_waking(struct task_struct *p) | ||
916 | { | ||
917 | return unlikely(p->state == TASK_WAKING); | ||
918 | } | ||
919 | |||
920 | /* | ||
921 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
922 | * Must be called interrupts disabled. | ||
923 | */ | 935 | */ |
924 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 936 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
925 | __acquires(rq->lock) | 937 | __acquires(rq->lock) |
926 | { | 938 | { |
927 | struct rq *rq; | 939 | struct rq *rq; |
928 | 940 | ||
941 | lockdep_assert_held(&p->pi_lock); | ||
942 | |||
929 | for (;;) { | 943 | for (;;) { |
930 | rq = task_rq(p); | 944 | rq = task_rq(p); |
931 | raw_spin_lock(&rq->lock); | 945 | raw_spin_lock(&rq->lock); |
@@ -936,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
936 | } | 950 | } |
937 | 951 | ||
938 | /* | 952 | /* |
939 | * task_rq_lock - lock the runqueue a given task resides on and disable | 953 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. |
940 | * interrupts. Note the ordering: we can safely lookup the task_rq without | ||
941 | * explicitly disabling preemption. | ||
942 | */ | 954 | */ |
943 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 955 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
956 | __acquires(p->pi_lock) | ||
944 | __acquires(rq->lock) | 957 | __acquires(rq->lock) |
945 | { | 958 | { |
946 | struct rq *rq; | 959 | struct rq *rq; |
947 | 960 | ||
948 | for (;;) { | 961 | for (;;) { |
949 | local_irq_save(*flags); | 962 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
950 | rq = task_rq(p); | 963 | rq = task_rq(p); |
951 | raw_spin_lock(&rq->lock); | 964 | raw_spin_lock(&rq->lock); |
952 | if (likely(rq == task_rq(p))) | 965 | if (likely(rq == task_rq(p))) |
953 | return rq; | 966 | return rq; |
954 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 967 | raw_spin_unlock(&rq->lock); |
968 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
955 | } | 969 | } |
956 | } | 970 | } |
957 | 971 | ||
@@ -961,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq) | |||
961 | raw_spin_unlock(&rq->lock); | 975 | raw_spin_unlock(&rq->lock); |
962 | } | 976 | } |
963 | 977 | ||
964 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | 978 | static inline void |
979 | task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) | ||
965 | __releases(rq->lock) | 980 | __releases(rq->lock) |
981 | __releases(p->pi_lock) | ||
966 | { | 982 | { |
967 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 983 | raw_spin_unlock(&rq->lock); |
984 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
968 | } | 985 | } |
969 | 986 | ||
970 | /* | 987 | /* |
@@ -1193,11 +1210,17 @@ int get_nohz_timer_target(void) | |||
1193 | int i; | 1210 | int i; |
1194 | struct sched_domain *sd; | 1211 | struct sched_domain *sd; |
1195 | 1212 | ||
1213 | rcu_read_lock(); | ||
1196 | for_each_domain(cpu, sd) { | 1214 | for_each_domain(cpu, sd) { |
1197 | for_each_cpu(i, sched_domain_span(sd)) | 1215 | for_each_cpu(i, sched_domain_span(sd)) { |
1198 | if (!idle_cpu(i)) | 1216 | if (!idle_cpu(i)) { |
1199 | return i; | 1217 | cpu = i; |
1218 | goto unlock; | ||
1219 | } | ||
1220 | } | ||
1200 | } | 1221 | } |
1222 | unlock: | ||
1223 | rcu_read_unlock(); | ||
1201 | return cpu; | 1224 | return cpu; |
1202 | } | 1225 | } |
1203 | /* | 1226 | /* |
@@ -1307,15 +1330,15 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1307 | { | 1330 | { |
1308 | u64 tmp; | 1331 | u64 tmp; |
1309 | 1332 | ||
1333 | tmp = (u64)delta_exec * weight; | ||
1334 | |||
1310 | if (!lw->inv_weight) { | 1335 | if (!lw->inv_weight) { |
1311 | if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) | 1336 | if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) |
1312 | lw->inv_weight = 1; | 1337 | lw->inv_weight = 1; |
1313 | else | 1338 | else |
1314 | lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) | 1339 | lw->inv_weight = WMULT_CONST / lw->weight; |
1315 | / (lw->weight+1); | ||
1316 | } | 1340 | } |
1317 | 1341 | ||
1318 | tmp = (u64)delta_exec * weight; | ||
1319 | /* | 1342 | /* |
1320 | * Check whether we'd overflow the 64-bit multiplication: | 1343 | * Check whether we'd overflow the 64-bit multiplication: |
1321 | */ | 1344 | */ |
@@ -1773,7 +1796,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1773 | update_rq_clock(rq); | 1796 | update_rq_clock(rq); |
1774 | sched_info_queued(p); | 1797 | sched_info_queued(p); |
1775 | p->sched_class->enqueue_task(rq, p, flags); | 1798 | p->sched_class->enqueue_task(rq, p, flags); |
1776 | p->se.on_rq = 1; | ||
1777 | } | 1799 | } |
1778 | 1800 | ||
1779 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 1801 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1781,7 +1803,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1781 | update_rq_clock(rq); | 1803 | update_rq_clock(rq); |
1782 | sched_info_dequeued(p); | 1804 | sched_info_dequeued(p); |
1783 | p->sched_class->dequeue_task(rq, p, flags); | 1805 | p->sched_class->dequeue_task(rq, p, flags); |
1784 | p->se.on_rq = 0; | ||
1785 | } | 1806 | } |
1786 | 1807 | ||
1787 | /* | 1808 | /* |
@@ -2116,7 +2137,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
2116 | * A queue event has occurred, and we're going to schedule. In | 2137 | * A queue event has occurred, and we're going to schedule. In |
2117 | * this case, we can save a useless back to back clock update. | 2138 | * this case, we can save a useless back to back clock update. |
2118 | */ | 2139 | */ |
2119 | if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) | 2140 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) |
2120 | rq->skip_clock_update = 1; | 2141 | rq->skip_clock_update = 1; |
2121 | } | 2142 | } |
2122 | 2143 | ||
@@ -2162,6 +2183,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
2162 | */ | 2183 | */ |
2163 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 2184 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
2164 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 2185 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); |
2186 | |||
2187 | #ifdef CONFIG_LOCKDEP | ||
2188 | WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || | ||
2189 | lockdep_is_held(&task_rq(p)->lock))); | ||
2190 | #endif | ||
2165 | #endif | 2191 | #endif |
2166 | 2192 | ||
2167 | trace_sched_migrate_task(p, new_cpu); | 2193 | trace_sched_migrate_task(p, new_cpu); |
@@ -2182,19 +2208,6 @@ struct migration_arg { | |||
2182 | static int migration_cpu_stop(void *data); | 2208 | static int migration_cpu_stop(void *data); |
2183 | 2209 | ||
2184 | /* | 2210 | /* |
2185 | * The task's runqueue lock must be held. | ||
2186 | * Returns true if you have to wait for migration thread. | ||
2187 | */ | ||
2188 | static bool migrate_task(struct task_struct *p, struct rq *rq) | ||
2189 | { | ||
2190 | /* | ||
2191 | * If the task is not on a runqueue (and not running), then | ||
2192 | * the next wake-up will properly place the task. | ||
2193 | */ | ||
2194 | return p->se.on_rq || task_running(rq, p); | ||
2195 | } | ||
2196 | |||
2197 | /* | ||
2198 | * wait_task_inactive - wait for a thread to unschedule. | 2211 | * wait_task_inactive - wait for a thread to unschedule. |
2199 | * | 2212 | * |
2200 | * If @match_state is nonzero, it's the @p->state value just checked and | 2213 | * If @match_state is nonzero, it's the @p->state value just checked and |
@@ -2251,11 +2264,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2251 | rq = task_rq_lock(p, &flags); | 2264 | rq = task_rq_lock(p, &flags); |
2252 | trace_sched_wait_task(p); | 2265 | trace_sched_wait_task(p); |
2253 | running = task_running(rq, p); | 2266 | running = task_running(rq, p); |
2254 | on_rq = p->se.on_rq; | 2267 | on_rq = p->on_rq; |
2255 | ncsw = 0; | 2268 | ncsw = 0; |
2256 | if (!match_state || p->state == match_state) | 2269 | if (!match_state || p->state == match_state) |
2257 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 2270 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
2258 | task_rq_unlock(rq, &flags); | 2271 | task_rq_unlock(rq, p, &flags); |
2259 | 2272 | ||
2260 | /* | 2273 | /* |
2261 | * If it changed from the expected state, bail out now. | 2274 | * If it changed from the expected state, bail out now. |
@@ -2330,7 +2343,7 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
2330 | 2343 | ||
2331 | #ifdef CONFIG_SMP | 2344 | #ifdef CONFIG_SMP |
2332 | /* | 2345 | /* |
2333 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. | 2346 | * ->cpus_allowed is protected by both rq->lock and p->pi_lock |
2334 | */ | 2347 | */ |
2335 | static int select_fallback_rq(int cpu, struct task_struct *p) | 2348 | static int select_fallback_rq(int cpu, struct task_struct *p) |
2336 | { | 2349 | { |
@@ -2363,12 +2376,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2363 | } | 2376 | } |
2364 | 2377 | ||
2365 | /* | 2378 | /* |
2366 | * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. | 2379 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
2367 | */ | 2380 | */ |
2368 | static inline | 2381 | static inline |
2369 | int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) | 2382 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) |
2370 | { | 2383 | { |
2371 | int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); | 2384 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); |
2372 | 2385 | ||
2373 | /* | 2386 | /* |
2374 | * In order not to call set_task_cpu() on a blocking task we need | 2387 | * In order not to call set_task_cpu() on a blocking task we need |
@@ -2394,27 +2407,62 @@ static void update_avg(u64 *avg, u64 sample) | |||
2394 | } | 2407 | } |
2395 | #endif | 2408 | #endif |
2396 | 2409 | ||
2397 | static inline void ttwu_activate(struct task_struct *p, struct rq *rq, | 2410 | static void |
2398 | bool is_sync, bool is_migrate, bool is_local, | 2411 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
2399 | unsigned long en_flags) | ||
2400 | { | 2412 | { |
2413 | #ifdef CONFIG_SCHEDSTATS | ||
2414 | struct rq *rq = this_rq(); | ||
2415 | |||
2416 | #ifdef CONFIG_SMP | ||
2417 | int this_cpu = smp_processor_id(); | ||
2418 | |||
2419 | if (cpu == this_cpu) { | ||
2420 | schedstat_inc(rq, ttwu_local); | ||
2421 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
2422 | } else { | ||
2423 | struct sched_domain *sd; | ||
2424 | |||
2425 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
2426 | rcu_read_lock(); | ||
2427 | for_each_domain(this_cpu, sd) { | ||
2428 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2429 | schedstat_inc(sd, ttwu_wake_remote); | ||
2430 | break; | ||
2431 | } | ||
2432 | } | ||
2433 | rcu_read_unlock(); | ||
2434 | } | ||
2435 | #endif /* CONFIG_SMP */ | ||
2436 | |||
2437 | schedstat_inc(rq, ttwu_count); | ||
2401 | schedstat_inc(p, se.statistics.nr_wakeups); | 2438 | schedstat_inc(p, se.statistics.nr_wakeups); |
2402 | if (is_sync) | 2439 | |
2440 | if (wake_flags & WF_SYNC) | ||
2403 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 2441 | schedstat_inc(p, se.statistics.nr_wakeups_sync); |
2404 | if (is_migrate) | 2442 | |
2443 | if (cpu != task_cpu(p)) | ||
2405 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 2444 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); |
2406 | if (is_local) | ||
2407 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
2408 | else | ||
2409 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
2410 | 2445 | ||
2446 | #endif /* CONFIG_SCHEDSTATS */ | ||
2447 | } | ||
2448 | |||
2449 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | ||
2450 | { | ||
2411 | activate_task(rq, p, en_flags); | 2451 | activate_task(rq, p, en_flags); |
2452 | p->on_rq = 1; | ||
2453 | |||
2454 | /* if a worker is waking up, notify workqueue */ | ||
2455 | if (p->flags & PF_WQ_WORKER) | ||
2456 | wq_worker_waking_up(p, cpu_of(rq)); | ||
2412 | } | 2457 | } |
2413 | 2458 | ||
2414 | static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | 2459 | /* |
2415 | int wake_flags, bool success) | 2460 | * Mark the task runnable and perform wakeup-preemption. |
2461 | */ | ||
2462 | static void | ||
2463 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2416 | { | 2464 | { |
2417 | trace_sched_wakeup(p, success); | 2465 | trace_sched_wakeup(p, true); |
2418 | check_preempt_curr(rq, p, wake_flags); | 2466 | check_preempt_curr(rq, p, wake_flags); |
2419 | 2467 | ||
2420 | p->state = TASK_RUNNING; | 2468 | p->state = TASK_RUNNING; |
@@ -2433,9 +2481,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2433 | rq->idle_stamp = 0; | 2481 | rq->idle_stamp = 0; |
2434 | } | 2482 | } |
2435 | #endif | 2483 | #endif |
2436 | /* if a worker is waking up, notify workqueue */ | 2484 | } |
2437 | if ((p->flags & PF_WQ_WORKER) && success) | 2485 | |
2438 | wq_worker_waking_up(p, cpu_of(rq)); | 2486 | static void |
2487 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2488 | { | ||
2489 | #ifdef CONFIG_SMP | ||
2490 | if (p->sched_contributes_to_load) | ||
2491 | rq->nr_uninterruptible--; | ||
2492 | #endif | ||
2493 | |||
2494 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); | ||
2495 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2496 | } | ||
2497 | |||
2498 | /* | ||
2499 | * Called in case the task @p isn't fully descheduled from its runqueue, | ||
2500 | * in this case we must do a remote wakeup. It's a 'light' wakeup though, | ||
2501 | * since all we need to do is flip p->state to TASK_RUNNING, since | ||
2502 | * the task is still ->on_rq. | ||
2503 | */ | ||
2504 | static int ttwu_remote(struct task_struct *p, int wake_flags) | ||
2505 | { | ||
2506 | struct rq *rq; | ||
2507 | int ret = 0; | ||
2508 | |||
2509 | rq = __task_rq_lock(p); | ||
2510 | if (p->on_rq) { | ||
2511 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2512 | ret = 1; | ||
2513 | } | ||
2514 | __task_rq_unlock(rq); | ||
2515 | |||
2516 | return ret; | ||
2517 | } | ||
2518 | |||
2519 | #ifdef CONFIG_SMP | ||
2520 | static void sched_ttwu_pending(void) | ||
2521 | { | ||
2522 | struct rq *rq = this_rq(); | ||
2523 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2524 | |||
2525 | if (!list) | ||
2526 | return; | ||
2527 | |||
2528 | raw_spin_lock(&rq->lock); | ||
2529 | |||
2530 | while (list) { | ||
2531 | struct task_struct *p = list; | ||
2532 | list = list->wake_entry; | ||
2533 | ttwu_do_activate(rq, p, 0); | ||
2534 | } | ||
2535 | |||
2536 | raw_spin_unlock(&rq->lock); | ||
2537 | } | ||
2538 | |||
2539 | void scheduler_ipi(void) | ||
2540 | { | ||
2541 | sched_ttwu_pending(); | ||
2542 | } | ||
2543 | |||
2544 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | ||
2545 | { | ||
2546 | struct rq *rq = cpu_rq(cpu); | ||
2547 | struct task_struct *next = rq->wake_list; | ||
2548 | |||
2549 | for (;;) { | ||
2550 | struct task_struct *old = next; | ||
2551 | |||
2552 | p->wake_entry = next; | ||
2553 | next = cmpxchg(&rq->wake_list, old, p); | ||
2554 | if (next == old) | ||
2555 | break; | ||
2556 | } | ||
2557 | |||
2558 | if (!next) | ||
2559 | smp_send_reschedule(cpu); | ||
2560 | } | ||
2561 | #endif | ||
2562 | |||
2563 | static void ttwu_queue(struct task_struct *p, int cpu) | ||
2564 | { | ||
2565 | struct rq *rq = cpu_rq(cpu); | ||
2566 | |||
2567 | #if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) | ||
2568 | if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { | ||
2569 | ttwu_queue_remote(p, cpu); | ||
2570 | return; | ||
2571 | } | ||
2572 | #endif | ||
2573 | |||
2574 | raw_spin_lock(&rq->lock); | ||
2575 | ttwu_do_activate(rq, p, 0); | ||
2576 | raw_spin_unlock(&rq->lock); | ||
2439 | } | 2577 | } |
2440 | 2578 | ||
2441 | /** | 2579 | /** |
@@ -2453,92 +2591,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2453 | * Returns %true if @p was woken up, %false if it was already running | 2591 | * Returns %true if @p was woken up, %false if it was already running |
2454 | * or @state didn't match @p's state. | 2592 | * or @state didn't match @p's state. |
2455 | */ | 2593 | */ |
2456 | static int try_to_wake_up(struct task_struct *p, unsigned int state, | 2594 | static int |
2457 | int wake_flags) | 2595 | try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
2458 | { | 2596 | { |
2459 | int cpu, orig_cpu, this_cpu, success = 0; | ||
2460 | unsigned long flags; | 2597 | unsigned long flags; |
2461 | unsigned long en_flags = ENQUEUE_WAKEUP; | 2598 | int cpu, success = 0; |
2462 | struct rq *rq; | ||
2463 | |||
2464 | this_cpu = get_cpu(); | ||
2465 | 2599 | ||
2466 | smp_wmb(); | 2600 | smp_wmb(); |
2467 | rq = task_rq_lock(p, &flags); | 2601 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2468 | if (!(p->state & state)) | 2602 | if (!(p->state & state)) |
2469 | goto out; | 2603 | goto out; |
2470 | 2604 | ||
2471 | if (p->se.on_rq) | 2605 | success = 1; /* we're going to change ->state */ |
2472 | goto out_running; | ||
2473 | |||
2474 | cpu = task_cpu(p); | 2606 | cpu = task_cpu(p); |
2475 | orig_cpu = cpu; | ||
2476 | 2607 | ||
2477 | #ifdef CONFIG_SMP | 2608 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
2478 | if (unlikely(task_running(rq, p))) | 2609 | goto stat; |
2479 | goto out_activate; | ||
2480 | 2610 | ||
2611 | #ifdef CONFIG_SMP | ||
2481 | /* | 2612 | /* |
2482 | * In order to handle concurrent wakeups and release the rq->lock | 2613 | * If the owning (remote) cpu is still in the middle of schedule() with |
2483 | * we put the task in TASK_WAKING state. | 2614 | * this task as prev, wait until it's done referencing the task. |
2484 | * | ||
2485 | * First fix up the nr_uninterruptible count: | ||
2486 | */ | 2615 | */ |
2487 | if (task_contributes_to_load(p)) { | 2616 | while (p->on_cpu) { |
2488 | if (likely(cpu_online(orig_cpu))) | 2617 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
2489 | rq->nr_uninterruptible--; | 2618 | /* |
2490 | else | 2619 | * If called from interrupt context we could have landed in the |
2491 | this_rq()->nr_uninterruptible--; | 2620 | * middle of schedule(), in this case we should take care not |
2492 | } | 2621 | * to spin on ->on_cpu if p is current, since that would |
2493 | p->state = TASK_WAKING; | 2622 | * deadlock. |
2494 | 2623 | */ | |
2495 | if (p->sched_class->task_waking) { | 2624 | if (p == current) { |
2496 | p->sched_class->task_waking(rq, p); | 2625 | ttwu_queue(p, cpu); |
2497 | en_flags |= ENQUEUE_WAKING; | 2626 | goto stat; |
2627 | } | ||
2628 | #endif | ||
2629 | cpu_relax(); | ||
2498 | } | 2630 | } |
2499 | |||
2500 | cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); | ||
2501 | if (cpu != orig_cpu) | ||
2502 | set_task_cpu(p, cpu); | ||
2503 | __task_rq_unlock(rq); | ||
2504 | |||
2505 | rq = cpu_rq(cpu); | ||
2506 | raw_spin_lock(&rq->lock); | ||
2507 | |||
2508 | /* | 2631 | /* |
2509 | * We migrated the task without holding either rq->lock, however | 2632 | * Pairs with the smp_wmb() in finish_lock_switch(). |
2510 | * since the task is not on the task list itself, nobody else | ||
2511 | * will try and migrate the task, hence the rq should match the | ||
2512 | * cpu we just moved it to. | ||
2513 | */ | 2633 | */ |
2514 | WARN_ON(task_cpu(p) != cpu); | 2634 | smp_rmb(); |
2515 | WARN_ON(p->state != TASK_WAKING); | ||
2516 | 2635 | ||
2517 | #ifdef CONFIG_SCHEDSTATS | 2636 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
2518 | schedstat_inc(rq, ttwu_count); | 2637 | p->state = TASK_WAKING; |
2519 | if (cpu == this_cpu) | 2638 | |
2520 | schedstat_inc(rq, ttwu_local); | 2639 | if (p->sched_class->task_waking) |
2521 | else { | 2640 | p->sched_class->task_waking(p); |
2522 | struct sched_domain *sd; | ||
2523 | for_each_domain(this_cpu, sd) { | ||
2524 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2525 | schedstat_inc(sd, ttwu_wake_remote); | ||
2526 | break; | ||
2527 | } | ||
2528 | } | ||
2529 | } | ||
2530 | #endif /* CONFIG_SCHEDSTATS */ | ||
2531 | 2641 | ||
2532 | out_activate: | 2642 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
2643 | if (task_cpu(p) != cpu) | ||
2644 | set_task_cpu(p, cpu); | ||
2533 | #endif /* CONFIG_SMP */ | 2645 | #endif /* CONFIG_SMP */ |
2534 | ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, | 2646 | |
2535 | cpu == this_cpu, en_flags); | 2647 | ttwu_queue(p, cpu); |
2536 | success = 1; | 2648 | stat: |
2537 | out_running: | 2649 | ttwu_stat(p, cpu, wake_flags); |
2538 | ttwu_post_activation(p, rq, wake_flags, success); | ||
2539 | out: | 2650 | out: |
2540 | task_rq_unlock(rq, &flags); | 2651 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2541 | put_cpu(); | ||
2542 | 2652 | ||
2543 | return success; | 2653 | return success; |
2544 | } | 2654 | } |
@@ -2547,31 +2657,34 @@ out: | |||
2547 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2657 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2548 | * @p: the thread to be awakened | 2658 | * @p: the thread to be awakened |
2549 | * | 2659 | * |
2550 | * Put @p on the run-queue if it's not already there. The caller must | 2660 | * Put @p on the run-queue if it's not already there. The caller must |
2551 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2661 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2552 | * the current task. this_rq() stays locked over invocation. | 2662 | * the current task. |
2553 | */ | 2663 | */ |
2554 | static void try_to_wake_up_local(struct task_struct *p) | 2664 | static void try_to_wake_up_local(struct task_struct *p) |
2555 | { | 2665 | { |
2556 | struct rq *rq = task_rq(p); | 2666 | struct rq *rq = task_rq(p); |
2557 | bool success = false; | ||
2558 | 2667 | ||
2559 | BUG_ON(rq != this_rq()); | 2668 | BUG_ON(rq != this_rq()); |
2560 | BUG_ON(p == current); | 2669 | BUG_ON(p == current); |
2561 | lockdep_assert_held(&rq->lock); | 2670 | lockdep_assert_held(&rq->lock); |
2562 | 2671 | ||
2672 | if (!raw_spin_trylock(&p->pi_lock)) { | ||
2673 | raw_spin_unlock(&rq->lock); | ||
2674 | raw_spin_lock(&p->pi_lock); | ||
2675 | raw_spin_lock(&rq->lock); | ||
2676 | } | ||
2677 | |||
2563 | if (!(p->state & TASK_NORMAL)) | 2678 | if (!(p->state & TASK_NORMAL)) |
2564 | return; | 2679 | goto out; |
2565 | 2680 | ||
2566 | if (!p->se.on_rq) { | 2681 | if (!p->on_rq) |
2567 | if (likely(!task_running(rq, p))) { | 2682 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2568 | schedstat_inc(rq, ttwu_count); | 2683 | |
2569 | schedstat_inc(rq, ttwu_local); | 2684 | ttwu_do_wakeup(rq, p, 0); |
2570 | } | 2685 | ttwu_stat(p, smp_processor_id(), 0); |
2571 | ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); | 2686 | out: |
2572 | success = true; | 2687 | raw_spin_unlock(&p->pi_lock); |
2573 | } | ||
2574 | ttwu_post_activation(p, rq, 0, success); | ||
2575 | } | 2688 | } |
2576 | 2689 | ||
2577 | /** | 2690 | /** |
@@ -2604,19 +2717,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
2604 | */ | 2717 | */ |
2605 | static void __sched_fork(struct task_struct *p) | 2718 | static void __sched_fork(struct task_struct *p) |
2606 | { | 2719 | { |
2720 | p->on_rq = 0; | ||
2721 | |||
2722 | p->se.on_rq = 0; | ||
2607 | p->se.exec_start = 0; | 2723 | p->se.exec_start = 0; |
2608 | p->se.sum_exec_runtime = 0; | 2724 | p->se.sum_exec_runtime = 0; |
2609 | p->se.prev_sum_exec_runtime = 0; | 2725 | p->se.prev_sum_exec_runtime = 0; |
2610 | p->se.nr_migrations = 0; | 2726 | p->se.nr_migrations = 0; |
2611 | p->se.vruntime = 0; | 2727 | p->se.vruntime = 0; |
2728 | INIT_LIST_HEAD(&p->se.group_node); | ||
2612 | 2729 | ||
2613 | #ifdef CONFIG_SCHEDSTATS | 2730 | #ifdef CONFIG_SCHEDSTATS |
2614 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2731 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
2615 | #endif | 2732 | #endif |
2616 | 2733 | ||
2617 | INIT_LIST_HEAD(&p->rt.run_list); | 2734 | INIT_LIST_HEAD(&p->rt.run_list); |
2618 | p->se.on_rq = 0; | ||
2619 | INIT_LIST_HEAD(&p->se.group_node); | ||
2620 | 2735 | ||
2621 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2736 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2622 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2737 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
@@ -2626,8 +2741,9 @@ static void __sched_fork(struct task_struct *p) | |||
2626 | /* | 2741 | /* |
2627 | * fork()/clone()-time setup: | 2742 | * fork()/clone()-time setup: |
2628 | */ | 2743 | */ |
2629 | void sched_fork(struct task_struct *p, int clone_flags) | 2744 | void sched_fork(struct task_struct *p) |
2630 | { | 2745 | { |
2746 | unsigned long flags; | ||
2631 | int cpu = get_cpu(); | 2747 | int cpu = get_cpu(); |
2632 | 2748 | ||
2633 | __sched_fork(p); | 2749 | __sched_fork(p); |
@@ -2678,16 +2794,16 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2678 | * | 2794 | * |
2679 | * Silence PROVE_RCU. | 2795 | * Silence PROVE_RCU. |
2680 | */ | 2796 | */ |
2681 | rcu_read_lock(); | 2797 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2682 | set_task_cpu(p, cpu); | 2798 | set_task_cpu(p, cpu); |
2683 | rcu_read_unlock(); | 2799 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2684 | 2800 | ||
2685 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2801 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
2686 | if (likely(sched_info_on())) | 2802 | if (likely(sched_info_on())) |
2687 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2803 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
2688 | #endif | 2804 | #endif |
2689 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 2805 | #if defined(CONFIG_SMP) |
2690 | p->oncpu = 0; | 2806 | p->on_cpu = 0; |
2691 | #endif | 2807 | #endif |
2692 | #ifdef CONFIG_PREEMPT | 2808 | #ifdef CONFIG_PREEMPT |
2693 | /* Want to start with kernel preemption disabled. */ | 2809 | /* Want to start with kernel preemption disabled. */ |
@@ -2707,41 +2823,31 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2707 | * that must be done for every newly created context, then puts the task | 2823 | * that must be done for every newly created context, then puts the task |
2708 | * on the runqueue and wakes it. | 2824 | * on the runqueue and wakes it. |
2709 | */ | 2825 | */ |
2710 | void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 2826 | void wake_up_new_task(struct task_struct *p) |
2711 | { | 2827 | { |
2712 | unsigned long flags; | 2828 | unsigned long flags; |
2713 | struct rq *rq; | 2829 | struct rq *rq; |
2714 | int cpu __maybe_unused = get_cpu(); | ||
2715 | 2830 | ||
2831 | raw_spin_lock_irqsave(&p->pi_lock, flags); | ||
2716 | #ifdef CONFIG_SMP | 2832 | #ifdef CONFIG_SMP |
2717 | rq = task_rq_lock(p, &flags); | ||
2718 | p->state = TASK_WAKING; | ||
2719 | |||
2720 | /* | 2833 | /* |
2721 | * Fork balancing, do it here and not earlier because: | 2834 | * Fork balancing, do it here and not earlier because: |
2722 | * - cpus_allowed can change in the fork path | 2835 | * - cpus_allowed can change in the fork path |
2723 | * - any previously selected cpu might disappear through hotplug | 2836 | * - any previously selected cpu might disappear through hotplug |
2724 | * | ||
2725 | * We set TASK_WAKING so that select_task_rq() can drop rq->lock | ||
2726 | * without people poking at ->cpus_allowed. | ||
2727 | */ | 2837 | */ |
2728 | cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); | 2838 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); |
2729 | set_task_cpu(p, cpu); | ||
2730 | |||
2731 | p->state = TASK_RUNNING; | ||
2732 | task_rq_unlock(rq, &flags); | ||
2733 | #endif | 2839 | #endif |
2734 | 2840 | ||
2735 | rq = task_rq_lock(p, &flags); | 2841 | rq = __task_rq_lock(p); |
2736 | activate_task(rq, p, 0); | 2842 | activate_task(rq, p, 0); |
2737 | trace_sched_wakeup_new(p, 1); | 2843 | p->on_rq = 1; |
2844 | trace_sched_wakeup_new(p, true); | ||
2738 | check_preempt_curr(rq, p, WF_FORK); | 2845 | check_preempt_curr(rq, p, WF_FORK); |
2739 | #ifdef CONFIG_SMP | 2846 | #ifdef CONFIG_SMP |
2740 | if (p->sched_class->task_woken) | 2847 | if (p->sched_class->task_woken) |
2741 | p->sched_class->task_woken(rq, p); | 2848 | p->sched_class->task_woken(rq, p); |
2742 | #endif | 2849 | #endif |
2743 | task_rq_unlock(rq, &flags); | 2850 | task_rq_unlock(rq, p, &flags); |
2744 | put_cpu(); | ||
2745 | } | 2851 | } |
2746 | 2852 | ||
2747 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2853 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -3450,27 +3556,22 @@ void sched_exec(void) | |||
3450 | { | 3556 | { |
3451 | struct task_struct *p = current; | 3557 | struct task_struct *p = current; |
3452 | unsigned long flags; | 3558 | unsigned long flags; |
3453 | struct rq *rq; | ||
3454 | int dest_cpu; | 3559 | int dest_cpu; |
3455 | 3560 | ||
3456 | rq = task_rq_lock(p, &flags); | 3561 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
3457 | dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); | 3562 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); |
3458 | if (dest_cpu == smp_processor_id()) | 3563 | if (dest_cpu == smp_processor_id()) |
3459 | goto unlock; | 3564 | goto unlock; |
3460 | 3565 | ||
3461 | /* | 3566 | if (likely(cpu_active(dest_cpu))) { |
3462 | * select_task_rq() can race against ->cpus_allowed | ||
3463 | */ | ||
3464 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | ||
3465 | likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { | ||
3466 | struct migration_arg arg = { p, dest_cpu }; | 3567 | struct migration_arg arg = { p, dest_cpu }; |
3467 | 3568 | ||
3468 | task_rq_unlock(rq, &flags); | 3569 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3469 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 3570 | stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); |
3470 | return; | 3571 | return; |
3471 | } | 3572 | } |
3472 | unlock: | 3573 | unlock: |
3473 | task_rq_unlock(rq, &flags); | 3574 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3474 | } | 3575 | } |
3475 | 3576 | ||
3476 | #endif | 3577 | #endif |
@@ -3507,7 +3608,7 @@ unsigned long long task_delta_exec(struct task_struct *p) | |||
3507 | 3608 | ||
3508 | rq = task_rq_lock(p, &flags); | 3609 | rq = task_rq_lock(p, &flags); |
3509 | ns = do_task_delta_exec(p, rq); | 3610 | ns = do_task_delta_exec(p, rq); |
3510 | task_rq_unlock(rq, &flags); | 3611 | task_rq_unlock(rq, p, &flags); |
3511 | 3612 | ||
3512 | return ns; | 3613 | return ns; |
3513 | } | 3614 | } |
@@ -3525,7 +3626,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3525 | 3626 | ||
3526 | rq = task_rq_lock(p, &flags); | 3627 | rq = task_rq_lock(p, &flags); |
3527 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); | 3628 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); |
3528 | task_rq_unlock(rq, &flags); | 3629 | task_rq_unlock(rq, p, &flags); |
3529 | 3630 | ||
3530 | return ns; | 3631 | return ns; |
3531 | } | 3632 | } |
@@ -3549,7 +3650,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) | |||
3549 | rq = task_rq_lock(p, &flags); | 3650 | rq = task_rq_lock(p, &flags); |
3550 | thread_group_cputime(p, &totals); | 3651 | thread_group_cputime(p, &totals); |
3551 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); | 3652 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); |
3552 | task_rq_unlock(rq, &flags); | 3653 | task_rq_unlock(rq, p, &flags); |
3553 | 3654 | ||
3554 | return ns; | 3655 | return ns; |
3555 | } | 3656 | } |
@@ -3903,9 +4004,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3903 | /* | 4004 | /* |
3904 | * This function gets called by the timer code, with HZ frequency. | 4005 | * This function gets called by the timer code, with HZ frequency. |
3905 | * We call it with interrupts disabled. | 4006 | * We call it with interrupts disabled. |
3906 | * | ||
3907 | * It also gets called by the fork code, when changing the parent's | ||
3908 | * timeslices. | ||
3909 | */ | 4007 | */ |
3910 | void scheduler_tick(void) | 4008 | void scheduler_tick(void) |
3911 | { | 4009 | { |
@@ -4025,17 +4123,11 @@ static inline void schedule_debug(struct task_struct *prev) | |||
4025 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 4123 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
4026 | 4124 | ||
4027 | schedstat_inc(this_rq(), sched_count); | 4125 | schedstat_inc(this_rq(), sched_count); |
4028 | #ifdef CONFIG_SCHEDSTATS | ||
4029 | if (unlikely(prev->lock_depth >= 0)) { | ||
4030 | schedstat_inc(this_rq(), rq_sched_info.bkl_count); | ||
4031 | schedstat_inc(prev, sched_info.bkl_count); | ||
4032 | } | ||
4033 | #endif | ||
4034 | } | 4126 | } |
4035 | 4127 | ||
4036 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 4128 | static void put_prev_task(struct rq *rq, struct task_struct *prev) |
4037 | { | 4129 | { |
4038 | if (prev->se.on_rq) | 4130 | if (prev->on_rq || rq->skip_clock_update < 0) |
4039 | update_rq_clock(rq); | 4131 | update_rq_clock(rq); |
4040 | prev->sched_class->put_prev_task(rq, prev); | 4132 | prev->sched_class->put_prev_task(rq, prev); |
4041 | } | 4133 | } |
@@ -4097,11 +4189,13 @@ need_resched: | |||
4097 | if (unlikely(signal_pending_state(prev->state, prev))) { | 4189 | if (unlikely(signal_pending_state(prev->state, prev))) { |
4098 | prev->state = TASK_RUNNING; | 4190 | prev->state = TASK_RUNNING; |
4099 | } else { | 4191 | } else { |
4192 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | ||
4193 | prev->on_rq = 0; | ||
4194 | |||
4100 | /* | 4195 | /* |
4101 | * If a worker is going to sleep, notify and | 4196 | * If a worker went to sleep, notify and ask workqueue |
4102 | * ask workqueue whether it wants to wake up a | 4197 | * whether it wants to wake up a task to maintain |
4103 | * task to maintain concurrency. If so, wake | 4198 | * concurrency. |
4104 | * up the task. | ||
4105 | */ | 4199 | */ |
4106 | if (prev->flags & PF_WQ_WORKER) { | 4200 | if (prev->flags & PF_WQ_WORKER) { |
4107 | struct task_struct *to_wakeup; | 4201 | struct task_struct *to_wakeup; |
@@ -4110,11 +4204,10 @@ need_resched: | |||
4110 | if (to_wakeup) | 4204 | if (to_wakeup) |
4111 | try_to_wake_up_local(to_wakeup); | 4205 | try_to_wake_up_local(to_wakeup); |
4112 | } | 4206 | } |
4113 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | ||
4114 | 4207 | ||
4115 | /* | 4208 | /* |
4116 | * If we are going to sleep and we have plugged IO queued, make | 4209 | * If we are going to sleep and we have plugged IO |
4117 | * sure to submit it to avoid deadlocks. | 4210 | * queued, make sure to submit it to avoid deadlocks. |
4118 | */ | 4211 | */ |
4119 | if (blk_needs_flush_plug(prev)) { | 4212 | if (blk_needs_flush_plug(prev)) { |
4120 | raw_spin_unlock(&rq->lock); | 4213 | raw_spin_unlock(&rq->lock); |
@@ -4161,70 +4254,53 @@ need_resched: | |||
4161 | EXPORT_SYMBOL(schedule); | 4254 | EXPORT_SYMBOL(schedule); |
4162 | 4255 | ||
4163 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 4256 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
4164 | /* | ||
4165 | * Look out! "owner" is an entirely speculative pointer | ||
4166 | * access and not reliable. | ||
4167 | */ | ||
4168 | int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | ||
4169 | { | ||
4170 | unsigned int cpu; | ||
4171 | struct rq *rq; | ||
4172 | 4257 | ||
4173 | if (!sched_feat(OWNER_SPIN)) | 4258 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) |
4174 | return 0; | 4259 | { |
4260 | bool ret = false; | ||
4175 | 4261 | ||
4176 | #ifdef CONFIG_DEBUG_PAGEALLOC | 4262 | rcu_read_lock(); |
4177 | /* | 4263 | if (lock->owner != owner) |
4178 | * Need to access the cpu field knowing that | 4264 | goto fail; |
4179 | * DEBUG_PAGEALLOC could have unmapped it if | ||
4180 | * the mutex owner just released it and exited. | ||
4181 | */ | ||
4182 | if (probe_kernel_address(&owner->cpu, cpu)) | ||
4183 | return 0; | ||
4184 | #else | ||
4185 | cpu = owner->cpu; | ||
4186 | #endif | ||
4187 | 4265 | ||
4188 | /* | 4266 | /* |
4189 | * Even if the access succeeded (likely case), | 4267 | * Ensure we emit the owner->on_cpu, dereference _after_ checking |
4190 | * the cpu field may no longer be valid. | 4268 | * lock->owner still matches owner, if that fails, owner might |
4269 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
4270 | * ensures the memory stays valid. | ||
4191 | */ | 4271 | */ |
4192 | if (cpu >= nr_cpumask_bits) | 4272 | barrier(); |
4193 | return 0; | ||
4194 | 4273 | ||
4195 | /* | 4274 | ret = owner->on_cpu; |
4196 | * We need to validate that we can do a | 4275 | fail: |
4197 | * get_cpu() and that we have the percpu area. | 4276 | rcu_read_unlock(); |
4198 | */ | ||
4199 | if (!cpu_online(cpu)) | ||
4200 | return 0; | ||
4201 | 4277 | ||
4202 | rq = cpu_rq(cpu); | 4278 | return ret; |
4279 | } | ||
4203 | 4280 | ||
4204 | for (;;) { | 4281 | /* |
4205 | /* | 4282 | * Look out! "owner" is an entirely speculative pointer |
4206 | * Owner changed, break to re-assess state. | 4283 | * access and not reliable. |
4207 | */ | 4284 | */ |
4208 | if (lock->owner != owner) { | 4285 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) |
4209 | /* | 4286 | { |
4210 | * If the lock has switched to a different owner, | 4287 | if (!sched_feat(OWNER_SPIN)) |
4211 | * we likely have heavy contention. Return 0 to quit | 4288 | return 0; |
4212 | * optimistic spinning and not contend further: | ||
4213 | */ | ||
4214 | if (lock->owner) | ||
4215 | return 0; | ||
4216 | break; | ||
4217 | } | ||
4218 | 4289 | ||
4219 | /* | 4290 | while (owner_running(lock, owner)) { |
4220 | * Is that owner really running on that cpu? | 4291 | if (need_resched()) |
4221 | */ | ||
4222 | if (task_thread_info(rq->curr) != owner || need_resched()) | ||
4223 | return 0; | 4292 | return 0; |
4224 | 4293 | ||
4225 | arch_mutex_cpu_relax(); | 4294 | arch_mutex_cpu_relax(); |
4226 | } | 4295 | } |
4227 | 4296 | ||
4297 | /* | ||
4298 | * If the owner changed to another task there is likely | ||
4299 | * heavy contention, stop spinning. | ||
4300 | */ | ||
4301 | if (lock->owner) | ||
4302 | return 0; | ||
4303 | |||
4228 | return 1; | 4304 | return 1; |
4229 | } | 4305 | } |
4230 | #endif | 4306 | #endif |
@@ -4684,19 +4760,18 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
4684 | */ | 4760 | */ |
4685 | void rt_mutex_setprio(struct task_struct *p, int prio) | 4761 | void rt_mutex_setprio(struct task_struct *p, int prio) |
4686 | { | 4762 | { |
4687 | unsigned long flags; | ||
4688 | int oldprio, on_rq, running; | 4763 | int oldprio, on_rq, running; |
4689 | struct rq *rq; | 4764 | struct rq *rq; |
4690 | const struct sched_class *prev_class; | 4765 | const struct sched_class *prev_class; |
4691 | 4766 | ||
4692 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4767 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4693 | 4768 | ||
4694 | rq = task_rq_lock(p, &flags); | 4769 | rq = __task_rq_lock(p); |
4695 | 4770 | ||
4696 | trace_sched_pi_setprio(p, prio); | 4771 | trace_sched_pi_setprio(p, prio); |
4697 | oldprio = p->prio; | 4772 | oldprio = p->prio; |
4698 | prev_class = p->sched_class; | 4773 | prev_class = p->sched_class; |
4699 | on_rq = p->se.on_rq; | 4774 | on_rq = p->on_rq; |
4700 | running = task_current(rq, p); | 4775 | running = task_current(rq, p); |
4701 | if (on_rq) | 4776 | if (on_rq) |
4702 | dequeue_task(rq, p, 0); | 4777 | dequeue_task(rq, p, 0); |
@@ -4716,7 +4791,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4716 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 4791 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
4717 | 4792 | ||
4718 | check_class_changed(rq, p, prev_class, oldprio); | 4793 | check_class_changed(rq, p, prev_class, oldprio); |
4719 | task_rq_unlock(rq, &flags); | 4794 | __task_rq_unlock(rq); |
4720 | } | 4795 | } |
4721 | 4796 | ||
4722 | #endif | 4797 | #endif |
@@ -4744,7 +4819,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4744 | p->static_prio = NICE_TO_PRIO(nice); | 4819 | p->static_prio = NICE_TO_PRIO(nice); |
4745 | goto out_unlock; | 4820 | goto out_unlock; |
4746 | } | 4821 | } |
4747 | on_rq = p->se.on_rq; | 4822 | on_rq = p->on_rq; |
4748 | if (on_rq) | 4823 | if (on_rq) |
4749 | dequeue_task(rq, p, 0); | 4824 | dequeue_task(rq, p, 0); |
4750 | 4825 | ||
@@ -4764,7 +4839,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4764 | resched_task(rq->curr); | 4839 | resched_task(rq->curr); |
4765 | } | 4840 | } |
4766 | out_unlock: | 4841 | out_unlock: |
4767 | task_rq_unlock(rq, &flags); | 4842 | task_rq_unlock(rq, p, &flags); |
4768 | } | 4843 | } |
4769 | EXPORT_SYMBOL(set_user_nice); | 4844 | EXPORT_SYMBOL(set_user_nice); |
4770 | 4845 | ||
@@ -4878,8 +4953,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
4878 | static void | 4953 | static void |
4879 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 4954 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) |
4880 | { | 4955 | { |
4881 | BUG_ON(p->se.on_rq); | ||
4882 | |||
4883 | p->policy = policy; | 4956 | p->policy = policy; |
4884 | p->rt_priority = prio; | 4957 | p->rt_priority = prio; |
4885 | p->normal_prio = normal_prio(p); | 4958 | p->normal_prio = normal_prio(p); |
@@ -4994,20 +5067,17 @@ recheck: | |||
4994 | /* | 5067 | /* |
4995 | * make sure no PI-waiters arrive (or leave) while we are | 5068 | * make sure no PI-waiters arrive (or leave) while we are |
4996 | * changing the priority of the task: | 5069 | * changing the priority of the task: |
4997 | */ | 5070 | * |
4998 | raw_spin_lock_irqsave(&p->pi_lock, flags); | ||
4999 | /* | ||
5000 | * To be able to change p->policy safely, the appropriate | 5071 | * To be able to change p->policy safely, the appropriate |
5001 | * runqueue lock must be held. | 5072 | * runqueue lock must be held. |
5002 | */ | 5073 | */ |
5003 | rq = __task_rq_lock(p); | 5074 | rq = task_rq_lock(p, &flags); |
5004 | 5075 | ||
5005 | /* | 5076 | /* |
5006 | * Changing the policy of the stop threads its a very bad idea | 5077 | * Changing the policy of the stop threads its a very bad idea |
5007 | */ | 5078 | */ |
5008 | if (p == rq->stop) { | 5079 | if (p == rq->stop) { |
5009 | __task_rq_unlock(rq); | 5080 | task_rq_unlock(rq, p, &flags); |
5010 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5011 | return -EINVAL; | 5081 | return -EINVAL; |
5012 | } | 5082 | } |
5013 | 5083 | ||
@@ -5031,8 +5101,7 @@ recheck: | |||
5031 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 5101 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
5032 | task_group(p)->rt_bandwidth.rt_runtime == 0 && | 5102 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
5033 | !task_group_is_autogroup(task_group(p))) { | 5103 | !task_group_is_autogroup(task_group(p))) { |
5034 | __task_rq_unlock(rq); | 5104 | task_rq_unlock(rq, p, &flags); |
5035 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5036 | return -EPERM; | 5105 | return -EPERM; |
5037 | } | 5106 | } |
5038 | } | 5107 | } |
@@ -5041,11 +5110,10 @@ recheck: | |||
5041 | /* recheck policy now with rq lock held */ | 5110 | /* recheck policy now with rq lock held */ |
5042 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 5111 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
5043 | policy = oldpolicy = -1; | 5112 | policy = oldpolicy = -1; |
5044 | __task_rq_unlock(rq); | 5113 | task_rq_unlock(rq, p, &flags); |
5045 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5046 | goto recheck; | 5114 | goto recheck; |
5047 | } | 5115 | } |
5048 | on_rq = p->se.on_rq; | 5116 | on_rq = p->on_rq; |
5049 | running = task_current(rq, p); | 5117 | running = task_current(rq, p); |
5050 | if (on_rq) | 5118 | if (on_rq) |
5051 | deactivate_task(rq, p, 0); | 5119 | deactivate_task(rq, p, 0); |
@@ -5064,8 +5132,7 @@ recheck: | |||
5064 | activate_task(rq, p, 0); | 5132 | activate_task(rq, p, 0); |
5065 | 5133 | ||
5066 | check_class_changed(rq, p, prev_class, oldprio); | 5134 | check_class_changed(rq, p, prev_class, oldprio); |
5067 | __task_rq_unlock(rq); | 5135 | task_rq_unlock(rq, p, &flags); |
5068 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5069 | 5136 | ||
5070 | rt_mutex_adjust_pi(p); | 5137 | rt_mutex_adjust_pi(p); |
5071 | 5138 | ||
@@ -5316,7 +5383,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5316 | { | 5383 | { |
5317 | struct task_struct *p; | 5384 | struct task_struct *p; |
5318 | unsigned long flags; | 5385 | unsigned long flags; |
5319 | struct rq *rq; | ||
5320 | int retval; | 5386 | int retval; |
5321 | 5387 | ||
5322 | get_online_cpus(); | 5388 | get_online_cpus(); |
@@ -5331,9 +5397,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5331 | if (retval) | 5397 | if (retval) |
5332 | goto out_unlock; | 5398 | goto out_unlock; |
5333 | 5399 | ||
5334 | rq = task_rq_lock(p, &flags); | 5400 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
5335 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 5401 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); |
5336 | task_rq_unlock(rq, &flags); | 5402 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
5337 | 5403 | ||
5338 | out_unlock: | 5404 | out_unlock: |
5339 | rcu_read_unlock(); | 5405 | rcu_read_unlock(); |
@@ -5658,7 +5724,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
5658 | 5724 | ||
5659 | rq = task_rq_lock(p, &flags); | 5725 | rq = task_rq_lock(p, &flags); |
5660 | time_slice = p->sched_class->get_rr_interval(rq, p); | 5726 | time_slice = p->sched_class->get_rr_interval(rq, p); |
5661 | task_rq_unlock(rq, &flags); | 5727 | task_rq_unlock(rq, p, &flags); |
5662 | 5728 | ||
5663 | rcu_read_unlock(); | 5729 | rcu_read_unlock(); |
5664 | jiffies_to_timespec(time_slice, &t); | 5730 | jiffies_to_timespec(time_slice, &t); |
@@ -5776,17 +5842,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5776 | rcu_read_unlock(); | 5842 | rcu_read_unlock(); |
5777 | 5843 | ||
5778 | rq->curr = rq->idle = idle; | 5844 | rq->curr = rq->idle = idle; |
5779 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5845 | #if defined(CONFIG_SMP) |
5780 | idle->oncpu = 1; | 5846 | idle->on_cpu = 1; |
5781 | #endif | 5847 | #endif |
5782 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5848 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
5783 | 5849 | ||
5784 | /* Set the preempt count _outside_ the spinlocks! */ | 5850 | /* Set the preempt count _outside_ the spinlocks! */ |
5785 | #if defined(CONFIG_PREEMPT) | ||
5786 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
5787 | #else | ||
5788 | task_thread_info(idle)->preempt_count = 0; | 5851 | task_thread_info(idle)->preempt_count = 0; |
5789 | #endif | 5852 | |
5790 | /* | 5853 | /* |
5791 | * The idle tasks have their own, simple scheduling class: | 5854 | * The idle tasks have their own, simple scheduling class: |
5792 | */ | 5855 | */ |
@@ -5881,26 +5944,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
5881 | unsigned int dest_cpu; | 5944 | unsigned int dest_cpu; |
5882 | int ret = 0; | 5945 | int ret = 0; |
5883 | 5946 | ||
5884 | /* | ||
5885 | * Serialize against TASK_WAKING so that ttwu() and wunt() can | ||
5886 | * drop the rq->lock and still rely on ->cpus_allowed. | ||
5887 | */ | ||
5888 | again: | ||
5889 | while (task_is_waking(p)) | ||
5890 | cpu_relax(); | ||
5891 | rq = task_rq_lock(p, &flags); | 5947 | rq = task_rq_lock(p, &flags); |
5892 | if (task_is_waking(p)) { | 5948 | |
5893 | task_rq_unlock(rq, &flags); | 5949 | if (cpumask_equal(&p->cpus_allowed, new_mask)) |
5894 | goto again; | 5950 | goto out; |
5895 | } | ||
5896 | 5951 | ||
5897 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 5952 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
5898 | ret = -EINVAL; | 5953 | ret = -EINVAL; |
5899 | goto out; | 5954 | goto out; |
5900 | } | 5955 | } |
5901 | 5956 | ||
5902 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | 5957 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { |
5903 | !cpumask_equal(&p->cpus_allowed, new_mask))) { | ||
5904 | ret = -EINVAL; | 5958 | ret = -EINVAL; |
5905 | goto out; | 5959 | goto out; |
5906 | } | 5960 | } |
@@ -5917,16 +5971,16 @@ again: | |||
5917 | goto out; | 5971 | goto out; |
5918 | 5972 | ||
5919 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 5973 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
5920 | if (migrate_task(p, rq)) { | 5974 | if (p->on_rq) { |
5921 | struct migration_arg arg = { p, dest_cpu }; | 5975 | struct migration_arg arg = { p, dest_cpu }; |
5922 | /* Need help from migration thread: drop lock and wait. */ | 5976 | /* Need help from migration thread: drop lock and wait. */ |
5923 | task_rq_unlock(rq, &flags); | 5977 | task_rq_unlock(rq, p, &flags); |
5924 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 5978 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
5925 | tlb_migrate_finish(p->mm); | 5979 | tlb_migrate_finish(p->mm); |
5926 | return 0; | 5980 | return 0; |
5927 | } | 5981 | } |
5928 | out: | 5982 | out: |
5929 | task_rq_unlock(rq, &flags); | 5983 | task_rq_unlock(rq, p, &flags); |
5930 | 5984 | ||
5931 | return ret; | 5985 | return ret; |
5932 | } | 5986 | } |
@@ -5954,6 +6008,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5954 | rq_src = cpu_rq(src_cpu); | 6008 | rq_src = cpu_rq(src_cpu); |
5955 | rq_dest = cpu_rq(dest_cpu); | 6009 | rq_dest = cpu_rq(dest_cpu); |
5956 | 6010 | ||
6011 | raw_spin_lock(&p->pi_lock); | ||
5957 | double_rq_lock(rq_src, rq_dest); | 6012 | double_rq_lock(rq_src, rq_dest); |
5958 | /* Already moved. */ | 6013 | /* Already moved. */ |
5959 | if (task_cpu(p) != src_cpu) | 6014 | if (task_cpu(p) != src_cpu) |
@@ -5966,7 +6021,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5966 | * If we're not on a rq, the next wake-up will ensure we're | 6021 | * If we're not on a rq, the next wake-up will ensure we're |
5967 | * placed properly. | 6022 | * placed properly. |
5968 | */ | 6023 | */ |
5969 | if (p->se.on_rq) { | 6024 | if (p->on_rq) { |
5970 | deactivate_task(rq_src, p, 0); | 6025 | deactivate_task(rq_src, p, 0); |
5971 | set_task_cpu(p, dest_cpu); | 6026 | set_task_cpu(p, dest_cpu); |
5972 | activate_task(rq_dest, p, 0); | 6027 | activate_task(rq_dest, p, 0); |
@@ -5976,6 +6031,7 @@ done: | |||
5976 | ret = 1; | 6031 | ret = 1; |
5977 | fail: | 6032 | fail: |
5978 | double_rq_unlock(rq_src, rq_dest); | 6033 | double_rq_unlock(rq_src, rq_dest); |
6034 | raw_spin_unlock(&p->pi_lock); | ||
5979 | return ret; | 6035 | return ret; |
5980 | } | 6036 | } |
5981 | 6037 | ||
@@ -6316,6 +6372,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6316 | 6372 | ||
6317 | #ifdef CONFIG_HOTPLUG_CPU | 6373 | #ifdef CONFIG_HOTPLUG_CPU |
6318 | case CPU_DYING: | 6374 | case CPU_DYING: |
6375 | sched_ttwu_pending(); | ||
6319 | /* Update our root-domain */ | 6376 | /* Update our root-domain */ |
6320 | raw_spin_lock_irqsave(&rq->lock, flags); | 6377 | raw_spin_lock_irqsave(&rq->lock, flags); |
6321 | if (rq->rd) { | 6378 | if (rq->rd) { |
@@ -6394,6 +6451,8 @@ early_initcall(migration_init); | |||
6394 | 6451 | ||
6395 | #ifdef CONFIG_SMP | 6452 | #ifdef CONFIG_SMP |
6396 | 6453 | ||
6454 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | ||
6455 | |||
6397 | #ifdef CONFIG_SCHED_DEBUG | 6456 | #ifdef CONFIG_SCHED_DEBUG |
6398 | 6457 | ||
6399 | static __read_mostly int sched_domain_debug_enabled; | 6458 | static __read_mostly int sched_domain_debug_enabled; |
@@ -6489,7 +6548,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6489 | 6548 | ||
6490 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6549 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
6491 | { | 6550 | { |
6492 | cpumask_var_t groupmask; | ||
6493 | int level = 0; | 6551 | int level = 0; |
6494 | 6552 | ||
6495 | if (!sched_domain_debug_enabled) | 6553 | if (!sched_domain_debug_enabled) |
@@ -6502,20 +6560,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6502 | 6560 | ||
6503 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6561 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
6504 | 6562 | ||
6505 | if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { | ||
6506 | printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); | ||
6507 | return; | ||
6508 | } | ||
6509 | |||
6510 | for (;;) { | 6563 | for (;;) { |
6511 | if (sched_domain_debug_one(sd, cpu, level, groupmask)) | 6564 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) |
6512 | break; | 6565 | break; |
6513 | level++; | 6566 | level++; |
6514 | sd = sd->parent; | 6567 | sd = sd->parent; |
6515 | if (!sd) | 6568 | if (!sd) |
6516 | break; | 6569 | break; |
6517 | } | 6570 | } |
6518 | free_cpumask_var(groupmask); | ||
6519 | } | 6571 | } |
6520 | #else /* !CONFIG_SCHED_DEBUG */ | 6572 | #else /* !CONFIG_SCHED_DEBUG */ |
6521 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6573 | # define sched_domain_debug(sd, cpu) do { } while (0) |
@@ -6572,12 +6624,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
6572 | return 1; | 6624 | return 1; |
6573 | } | 6625 | } |
6574 | 6626 | ||
6575 | static void free_rootdomain(struct root_domain *rd) | 6627 | static void free_rootdomain(struct rcu_head *rcu) |
6576 | { | 6628 | { |
6577 | synchronize_sched(); | 6629 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
6578 | 6630 | ||
6579 | cpupri_cleanup(&rd->cpupri); | 6631 | cpupri_cleanup(&rd->cpupri); |
6580 | |||
6581 | free_cpumask_var(rd->rto_mask); | 6632 | free_cpumask_var(rd->rto_mask); |
6582 | free_cpumask_var(rd->online); | 6633 | free_cpumask_var(rd->online); |
6583 | free_cpumask_var(rd->span); | 6634 | free_cpumask_var(rd->span); |
@@ -6618,7 +6669,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6618 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6669 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6619 | 6670 | ||
6620 | if (old_rd) | 6671 | if (old_rd) |
6621 | free_rootdomain(old_rd); | 6672 | call_rcu_sched(&old_rd->rcu, free_rootdomain); |
6622 | } | 6673 | } |
6623 | 6674 | ||
6624 | static int init_rootdomain(struct root_domain *rd) | 6675 | static int init_rootdomain(struct root_domain *rd) |
@@ -6669,6 +6720,25 @@ static struct root_domain *alloc_rootdomain(void) | |||
6669 | return rd; | 6720 | return rd; |
6670 | } | 6721 | } |
6671 | 6722 | ||
6723 | static void free_sched_domain(struct rcu_head *rcu) | ||
6724 | { | ||
6725 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
6726 | if (atomic_dec_and_test(&sd->groups->ref)) | ||
6727 | kfree(sd->groups); | ||
6728 | kfree(sd); | ||
6729 | } | ||
6730 | |||
6731 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | ||
6732 | { | ||
6733 | call_rcu(&sd->rcu, free_sched_domain); | ||
6734 | } | ||
6735 | |||
6736 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | ||
6737 | { | ||
6738 | for (; sd; sd = sd->parent) | ||
6739 | destroy_sched_domain(sd, cpu); | ||
6740 | } | ||
6741 | |||
6672 | /* | 6742 | /* |
6673 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6743 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
6674 | * hold the hotplug lock. | 6744 | * hold the hotplug lock. |
@@ -6679,9 +6749,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6679 | struct rq *rq = cpu_rq(cpu); | 6749 | struct rq *rq = cpu_rq(cpu); |
6680 | struct sched_domain *tmp; | 6750 | struct sched_domain *tmp; |
6681 | 6751 | ||
6682 | for (tmp = sd; tmp; tmp = tmp->parent) | ||
6683 | tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); | ||
6684 | |||
6685 | /* Remove the sched domains which do not contribute to scheduling. */ | 6752 | /* Remove the sched domains which do not contribute to scheduling. */ |
6686 | for (tmp = sd; tmp; ) { | 6753 | for (tmp = sd; tmp; ) { |
6687 | struct sched_domain *parent = tmp->parent; | 6754 | struct sched_domain *parent = tmp->parent; |
@@ -6692,12 +6759,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6692 | tmp->parent = parent->parent; | 6759 | tmp->parent = parent->parent; |
6693 | if (parent->parent) | 6760 | if (parent->parent) |
6694 | parent->parent->child = tmp; | 6761 | parent->parent->child = tmp; |
6762 | destroy_sched_domain(parent, cpu); | ||
6695 | } else | 6763 | } else |
6696 | tmp = tmp->parent; | 6764 | tmp = tmp->parent; |
6697 | } | 6765 | } |
6698 | 6766 | ||
6699 | if (sd && sd_degenerate(sd)) { | 6767 | if (sd && sd_degenerate(sd)) { |
6768 | tmp = sd; | ||
6700 | sd = sd->parent; | 6769 | sd = sd->parent; |
6770 | destroy_sched_domain(tmp, cpu); | ||
6701 | if (sd) | 6771 | if (sd) |
6702 | sd->child = NULL; | 6772 | sd->child = NULL; |
6703 | } | 6773 | } |
@@ -6705,7 +6775,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6705 | sched_domain_debug(sd, cpu); | 6775 | sched_domain_debug(sd, cpu); |
6706 | 6776 | ||
6707 | rq_attach_root(rq, rd); | 6777 | rq_attach_root(rq, rd); |
6778 | tmp = rq->sd; | ||
6708 | rcu_assign_pointer(rq->sd, sd); | 6779 | rcu_assign_pointer(rq->sd, sd); |
6780 | destroy_sched_domains(tmp, cpu); | ||
6709 | } | 6781 | } |
6710 | 6782 | ||
6711 | /* cpus with isolated domains */ | 6783 | /* cpus with isolated domains */ |
@@ -6721,56 +6793,6 @@ static int __init isolated_cpu_setup(char *str) | |||
6721 | 6793 | ||
6722 | __setup("isolcpus=", isolated_cpu_setup); | 6794 | __setup("isolcpus=", isolated_cpu_setup); |
6723 | 6795 | ||
6724 | /* | ||
6725 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | ||
6726 | * to a function which identifies what group(along with sched group) a CPU | ||
6727 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids | ||
6728 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
6729 | * | ||
6730 | * init_sched_build_groups will build a circular linked list of the groups | ||
6731 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6732 | * and ->cpu_power to 0. | ||
6733 | */ | ||
6734 | static void | ||
6735 | init_sched_build_groups(const struct cpumask *span, | ||
6736 | const struct cpumask *cpu_map, | ||
6737 | int (*group_fn)(int cpu, const struct cpumask *cpu_map, | ||
6738 | struct sched_group **sg, | ||
6739 | struct cpumask *tmpmask), | ||
6740 | struct cpumask *covered, struct cpumask *tmpmask) | ||
6741 | { | ||
6742 | struct sched_group *first = NULL, *last = NULL; | ||
6743 | int i; | ||
6744 | |||
6745 | cpumask_clear(covered); | ||
6746 | |||
6747 | for_each_cpu(i, span) { | ||
6748 | struct sched_group *sg; | ||
6749 | int group = group_fn(i, cpu_map, &sg, tmpmask); | ||
6750 | int j; | ||
6751 | |||
6752 | if (cpumask_test_cpu(i, covered)) | ||
6753 | continue; | ||
6754 | |||
6755 | cpumask_clear(sched_group_cpus(sg)); | ||
6756 | sg->cpu_power = 0; | ||
6757 | |||
6758 | for_each_cpu(j, span) { | ||
6759 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | ||
6760 | continue; | ||
6761 | |||
6762 | cpumask_set_cpu(j, covered); | ||
6763 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
6764 | } | ||
6765 | if (!first) | ||
6766 | first = sg; | ||
6767 | if (last) | ||
6768 | last->next = sg; | ||
6769 | last = sg; | ||
6770 | } | ||
6771 | last->next = first; | ||
6772 | } | ||
6773 | |||
6774 | #define SD_NODES_PER_DOMAIN 16 | 6796 | #define SD_NODES_PER_DOMAIN 16 |
6775 | 6797 | ||
6776 | #ifdef CONFIG_NUMA | 6798 | #ifdef CONFIG_NUMA |
@@ -6787,7 +6809,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
6787 | */ | 6809 | */ |
6788 | static int find_next_best_node(int node, nodemask_t *used_nodes) | 6810 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
6789 | { | 6811 | { |
6790 | int i, n, val, min_val, best_node = 0; | 6812 | int i, n, val, min_val, best_node = -1; |
6791 | 6813 | ||
6792 | min_val = INT_MAX; | 6814 | min_val = INT_MAX; |
6793 | 6815 | ||
@@ -6811,7 +6833,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) | |||
6811 | } | 6833 | } |
6812 | } | 6834 | } |
6813 | 6835 | ||
6814 | node_set(best_node, *used_nodes); | 6836 | if (best_node != -1) |
6837 | node_set(best_node, *used_nodes); | ||
6815 | return best_node; | 6838 | return best_node; |
6816 | } | 6839 | } |
6817 | 6840 | ||
@@ -6837,315 +6860,130 @@ static void sched_domain_node_span(int node, struct cpumask *span) | |||
6837 | 6860 | ||
6838 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 6861 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
6839 | int next_node = find_next_best_node(node, &used_nodes); | 6862 | int next_node = find_next_best_node(node, &used_nodes); |
6840 | 6863 | if (next_node < 0) | |
6864 | break; | ||
6841 | cpumask_or(span, span, cpumask_of_node(next_node)); | 6865 | cpumask_or(span, span, cpumask_of_node(next_node)); |
6842 | } | 6866 | } |
6843 | } | 6867 | } |
6868 | |||
6869 | static const struct cpumask *cpu_node_mask(int cpu) | ||
6870 | { | ||
6871 | lockdep_assert_held(&sched_domains_mutex); | ||
6872 | |||
6873 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
6874 | |||
6875 | return sched_domains_tmpmask; | ||
6876 | } | ||
6877 | |||
6878 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
6879 | { | ||
6880 | return cpu_possible_mask; | ||
6881 | } | ||
6844 | #endif /* CONFIG_NUMA */ | 6882 | #endif /* CONFIG_NUMA */ |
6845 | 6883 | ||
6846 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 6884 | static const struct cpumask *cpu_cpu_mask(int cpu) |
6885 | { | ||
6886 | return cpumask_of_node(cpu_to_node(cpu)); | ||
6887 | } | ||
6847 | 6888 | ||
6848 | /* | 6889 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6849 | * The cpus mask in sched_group and sched_domain hangs off the end. | ||
6850 | * | ||
6851 | * ( See the the comments in include/linux/sched.h:struct sched_group | ||
6852 | * and struct sched_domain. ) | ||
6853 | */ | ||
6854 | struct static_sched_group { | ||
6855 | struct sched_group sg; | ||
6856 | DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); | ||
6857 | }; | ||
6858 | 6890 | ||
6859 | struct static_sched_domain { | 6891 | struct sd_data { |
6860 | struct sched_domain sd; | 6892 | struct sched_domain **__percpu sd; |
6861 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 6893 | struct sched_group **__percpu sg; |
6862 | }; | 6894 | }; |
6863 | 6895 | ||
6864 | struct s_data { | 6896 | struct s_data { |
6865 | #ifdef CONFIG_NUMA | 6897 | struct sched_domain ** __percpu sd; |
6866 | int sd_allnodes; | ||
6867 | cpumask_var_t domainspan; | ||
6868 | cpumask_var_t covered; | ||
6869 | cpumask_var_t notcovered; | ||
6870 | #endif | ||
6871 | cpumask_var_t nodemask; | ||
6872 | cpumask_var_t this_sibling_map; | ||
6873 | cpumask_var_t this_core_map; | ||
6874 | cpumask_var_t this_book_map; | ||
6875 | cpumask_var_t send_covered; | ||
6876 | cpumask_var_t tmpmask; | ||
6877 | struct sched_group **sched_group_nodes; | ||
6878 | struct root_domain *rd; | 6898 | struct root_domain *rd; |
6879 | }; | 6899 | }; |
6880 | 6900 | ||
6881 | enum s_alloc { | 6901 | enum s_alloc { |
6882 | sa_sched_groups = 0, | ||
6883 | sa_rootdomain, | 6902 | sa_rootdomain, |
6884 | sa_tmpmask, | 6903 | sa_sd, |
6885 | sa_send_covered, | 6904 | sa_sd_storage, |
6886 | sa_this_book_map, | ||
6887 | sa_this_core_map, | ||
6888 | sa_this_sibling_map, | ||
6889 | sa_nodemask, | ||
6890 | sa_sched_group_nodes, | ||
6891 | #ifdef CONFIG_NUMA | ||
6892 | sa_notcovered, | ||
6893 | sa_covered, | ||
6894 | sa_domainspan, | ||
6895 | #endif | ||
6896 | sa_none, | 6905 | sa_none, |
6897 | }; | 6906 | }; |
6898 | 6907 | ||
6899 | /* | 6908 | struct sched_domain_topology_level; |
6900 | * SMT sched-domains: | ||
6901 | */ | ||
6902 | #ifdef CONFIG_SCHED_SMT | ||
6903 | static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); | ||
6904 | static DEFINE_PER_CPU(struct static_sched_group, sched_groups); | ||
6905 | 6909 | ||
6906 | static int | 6910 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); |
6907 | cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | 6911 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); |
6908 | struct sched_group **sg, struct cpumask *unused) | ||
6909 | { | ||
6910 | if (sg) | ||
6911 | *sg = &per_cpu(sched_groups, cpu).sg; | ||
6912 | return cpu; | ||
6913 | } | ||
6914 | #endif /* CONFIG_SCHED_SMT */ | ||
6915 | 6912 | ||
6916 | /* | 6913 | struct sched_domain_topology_level { |
6917 | * multi-core sched-domains: | 6914 | sched_domain_init_f init; |
6918 | */ | 6915 | sched_domain_mask_f mask; |
6919 | #ifdef CONFIG_SCHED_MC | 6916 | struct sd_data data; |
6920 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | 6917 | }; |
6921 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | ||
6922 | |||
6923 | static int | ||
6924 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | ||
6925 | struct sched_group **sg, struct cpumask *mask) | ||
6926 | { | ||
6927 | int group; | ||
6928 | #ifdef CONFIG_SCHED_SMT | ||
6929 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6930 | group = cpumask_first(mask); | ||
6931 | #else | ||
6932 | group = cpu; | ||
6933 | #endif | ||
6934 | if (sg) | ||
6935 | *sg = &per_cpu(sched_group_core, group).sg; | ||
6936 | return group; | ||
6937 | } | ||
6938 | #endif /* CONFIG_SCHED_MC */ | ||
6939 | 6918 | ||
6940 | /* | 6919 | /* |
6941 | * book sched-domains: | 6920 | * Assumes the sched_domain tree is fully constructed |
6942 | */ | 6921 | */ |
6943 | #ifdef CONFIG_SCHED_BOOK | 6922 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) |
6944 | static DEFINE_PER_CPU(struct static_sched_domain, book_domains); | ||
6945 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); | ||
6946 | |||
6947 | static int | ||
6948 | cpu_to_book_group(int cpu, const struct cpumask *cpu_map, | ||
6949 | struct sched_group **sg, struct cpumask *mask) | ||
6950 | { | 6923 | { |
6951 | int group = cpu; | 6924 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
6952 | #ifdef CONFIG_SCHED_MC | 6925 | struct sched_domain *child = sd->child; |
6953 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6954 | group = cpumask_first(mask); | ||
6955 | #elif defined(CONFIG_SCHED_SMT) | ||
6956 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6957 | group = cpumask_first(mask); | ||
6958 | #endif | ||
6959 | if (sg) | ||
6960 | *sg = &per_cpu(sched_group_book, group).sg; | ||
6961 | return group; | ||
6962 | } | ||
6963 | #endif /* CONFIG_SCHED_BOOK */ | ||
6964 | 6926 | ||
6965 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 6927 | if (child) |
6966 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | 6928 | cpu = cpumask_first(sched_domain_span(child)); |
6967 | 6929 | ||
6968 | static int | ||
6969 | cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | ||
6970 | struct sched_group **sg, struct cpumask *mask) | ||
6971 | { | ||
6972 | int group; | ||
6973 | #ifdef CONFIG_SCHED_BOOK | ||
6974 | cpumask_and(mask, cpu_book_mask(cpu), cpu_map); | ||
6975 | group = cpumask_first(mask); | ||
6976 | #elif defined(CONFIG_SCHED_MC) | ||
6977 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6978 | group = cpumask_first(mask); | ||
6979 | #elif defined(CONFIG_SCHED_SMT) | ||
6980 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6981 | group = cpumask_first(mask); | ||
6982 | #else | ||
6983 | group = cpu; | ||
6984 | #endif | ||
6985 | if (sg) | 6930 | if (sg) |
6986 | *sg = &per_cpu(sched_group_phys, group).sg; | 6931 | *sg = *per_cpu_ptr(sdd->sg, cpu); |
6987 | return group; | 6932 | |
6933 | return cpu; | ||
6988 | } | 6934 | } |
6989 | 6935 | ||
6990 | #ifdef CONFIG_NUMA | ||
6991 | /* | 6936 | /* |
6992 | * The init_sched_build_groups can't handle what we want to do with node | 6937 | * build_sched_groups takes the cpumask we wish to span, and a pointer |
6993 | * groups, so roll our own. Now each node has its own list of groups which | 6938 | * to a function which identifies what group(along with sched group) a CPU |
6994 | * gets dynamically allocated. | 6939 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids |
6940 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
6941 | * | ||
6942 | * build_sched_groups will build a circular linked list of the groups | ||
6943 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6944 | * and ->cpu_power to 0. | ||
6995 | */ | 6945 | */ |
6996 | static DEFINE_PER_CPU(struct static_sched_domain, node_domains); | 6946 | static void |
6997 | static struct sched_group ***sched_group_nodes_bycpu; | 6947 | build_sched_groups(struct sched_domain *sd) |
6998 | |||
6999 | static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); | ||
7000 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); | ||
7001 | |||
7002 | static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, | ||
7003 | struct sched_group **sg, | ||
7004 | struct cpumask *nodemask) | ||
7005 | { | ||
7006 | int group; | ||
7007 | |||
7008 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); | ||
7009 | group = cpumask_first(nodemask); | ||
7010 | |||
7011 | if (sg) | ||
7012 | *sg = &per_cpu(sched_group_allnodes, group).sg; | ||
7013 | return group; | ||
7014 | } | ||
7015 | |||
7016 | static void init_numa_sched_groups_power(struct sched_group *group_head) | ||
7017 | { | ||
7018 | struct sched_group *sg = group_head; | ||
7019 | int j; | ||
7020 | |||
7021 | if (!sg) | ||
7022 | return; | ||
7023 | do { | ||
7024 | for_each_cpu(j, sched_group_cpus(sg)) { | ||
7025 | struct sched_domain *sd; | ||
7026 | |||
7027 | sd = &per_cpu(phys_domains, j).sd; | ||
7028 | if (j != group_first_cpu(sd->groups)) { | ||
7029 | /* | ||
7030 | * Only add "power" once for each | ||
7031 | * physical package. | ||
7032 | */ | ||
7033 | continue; | ||
7034 | } | ||
7035 | |||
7036 | sg->cpu_power += sd->groups->cpu_power; | ||
7037 | } | ||
7038 | sg = sg->next; | ||
7039 | } while (sg != group_head); | ||
7040 | } | ||
7041 | |||
7042 | static int build_numa_sched_groups(struct s_data *d, | ||
7043 | const struct cpumask *cpu_map, int num) | ||
7044 | { | 6948 | { |
7045 | struct sched_domain *sd; | 6949 | struct sched_group *first = NULL, *last = NULL; |
7046 | struct sched_group *sg, *prev; | 6950 | struct sd_data *sdd = sd->private; |
7047 | int n, j; | 6951 | const struct cpumask *span = sched_domain_span(sd); |
7048 | 6952 | struct cpumask *covered; | |
7049 | cpumask_clear(d->covered); | 6953 | int i; |
7050 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
7051 | if (cpumask_empty(d->nodemask)) { | ||
7052 | d->sched_group_nodes[num] = NULL; | ||
7053 | goto out; | ||
7054 | } | ||
7055 | |||
7056 | sched_domain_node_span(num, d->domainspan); | ||
7057 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
7058 | |||
7059 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7060 | GFP_KERNEL, num); | ||
7061 | if (!sg) { | ||
7062 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
7063 | num); | ||
7064 | return -ENOMEM; | ||
7065 | } | ||
7066 | d->sched_group_nodes[num] = sg; | ||
7067 | |||
7068 | for_each_cpu(j, d->nodemask) { | ||
7069 | sd = &per_cpu(node_domains, j).sd; | ||
7070 | sd->groups = sg; | ||
7071 | } | ||
7072 | |||
7073 | sg->cpu_power = 0; | ||
7074 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | ||
7075 | sg->next = sg; | ||
7076 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
7077 | 6954 | ||
7078 | prev = sg; | 6955 | lockdep_assert_held(&sched_domains_mutex); |
7079 | for (j = 0; j < nr_node_ids; j++) { | 6956 | covered = sched_domains_tmpmask; |
7080 | n = (num + j) % nr_node_ids; | ||
7081 | cpumask_complement(d->notcovered, d->covered); | ||
7082 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
7083 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
7084 | if (cpumask_empty(d->tmpmask)) | ||
7085 | break; | ||
7086 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
7087 | if (cpumask_empty(d->tmpmask)) | ||
7088 | continue; | ||
7089 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7090 | GFP_KERNEL, num); | ||
7091 | if (!sg) { | ||
7092 | printk(KERN_WARNING | ||
7093 | "Can not alloc domain group for node %d\n", j); | ||
7094 | return -ENOMEM; | ||
7095 | } | ||
7096 | sg->cpu_power = 0; | ||
7097 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
7098 | sg->next = prev->next; | ||
7099 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
7100 | prev->next = sg; | ||
7101 | prev = sg; | ||
7102 | } | ||
7103 | out: | ||
7104 | return 0; | ||
7105 | } | ||
7106 | #endif /* CONFIG_NUMA */ | ||
7107 | 6957 | ||
7108 | #ifdef CONFIG_NUMA | 6958 | cpumask_clear(covered); |
7109 | /* Free memory allocated for various sched_group structures */ | ||
7110 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
7111 | struct cpumask *nodemask) | ||
7112 | { | ||
7113 | int cpu, i; | ||
7114 | 6959 | ||
7115 | for_each_cpu(cpu, cpu_map) { | 6960 | for_each_cpu(i, span) { |
7116 | struct sched_group **sched_group_nodes | 6961 | struct sched_group *sg; |
7117 | = sched_group_nodes_bycpu[cpu]; | 6962 | int group = get_group(i, sdd, &sg); |
6963 | int j; | ||
7118 | 6964 | ||
7119 | if (!sched_group_nodes) | 6965 | if (cpumask_test_cpu(i, covered)) |
7120 | continue; | 6966 | continue; |
7121 | 6967 | ||
7122 | for (i = 0; i < nr_node_ids; i++) { | 6968 | cpumask_clear(sched_group_cpus(sg)); |
7123 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 6969 | sg->cpu_power = 0; |
7124 | 6970 | ||
7125 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 6971 | for_each_cpu(j, span) { |
7126 | if (cpumask_empty(nodemask)) | 6972 | if (get_group(j, sdd, NULL) != group) |
7127 | continue; | 6973 | continue; |
7128 | 6974 | ||
7129 | if (sg == NULL) | 6975 | cpumask_set_cpu(j, covered); |
7130 | continue; | 6976 | cpumask_set_cpu(j, sched_group_cpus(sg)); |
7131 | sg = sg->next; | ||
7132 | next_sg: | ||
7133 | oldsg = sg; | ||
7134 | sg = sg->next; | ||
7135 | kfree(oldsg); | ||
7136 | if (oldsg != sched_group_nodes[i]) | ||
7137 | goto next_sg; | ||
7138 | } | 6977 | } |
7139 | kfree(sched_group_nodes); | 6978 | |
7140 | sched_group_nodes_bycpu[cpu] = NULL; | 6979 | if (!first) |
6980 | first = sg; | ||
6981 | if (last) | ||
6982 | last->next = sg; | ||
6983 | last = sg; | ||
7141 | } | 6984 | } |
6985 | last->next = first; | ||
7142 | } | 6986 | } |
7143 | #else /* !CONFIG_NUMA */ | ||
7144 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
7145 | struct cpumask *nodemask) | ||
7146 | { | ||
7147 | } | ||
7148 | #endif /* CONFIG_NUMA */ | ||
7149 | 6987 | ||
7150 | /* | 6988 | /* |
7151 | * Initialize sched groups cpu_power. | 6989 | * Initialize sched groups cpu_power. |
@@ -7159,11 +6997,6 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
7159 | */ | 6997 | */ |
7160 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 6998 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
7161 | { | 6999 | { |
7162 | struct sched_domain *child; | ||
7163 | struct sched_group *group; | ||
7164 | long power; | ||
7165 | int weight; | ||
7166 | |||
7167 | WARN_ON(!sd || !sd->groups); | 7000 | WARN_ON(!sd || !sd->groups); |
7168 | 7001 | ||
7169 | if (cpu != group_first_cpu(sd->groups)) | 7002 | if (cpu != group_first_cpu(sd->groups)) |
@@ -7171,36 +7004,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
7171 | 7004 | ||
7172 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); | 7005 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); |
7173 | 7006 | ||
7174 | child = sd->child; | 7007 | update_group_power(sd, cpu); |
7175 | |||
7176 | sd->groups->cpu_power = 0; | ||
7177 | |||
7178 | if (!child) { | ||
7179 | power = SCHED_LOAD_SCALE; | ||
7180 | weight = cpumask_weight(sched_domain_span(sd)); | ||
7181 | /* | ||
7182 | * SMT siblings share the power of a single core. | ||
7183 | * Usually multiple threads get a better yield out of | ||
7184 | * that one core than a single thread would have, | ||
7185 | * reflect that in sd->smt_gain. | ||
7186 | */ | ||
7187 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
7188 | power *= sd->smt_gain; | ||
7189 | power /= weight; | ||
7190 | power >>= SCHED_LOAD_SHIFT; | ||
7191 | } | ||
7192 | sd->groups->cpu_power += power; | ||
7193 | return; | ||
7194 | } | ||
7195 | |||
7196 | /* | ||
7197 | * Add cpu_power of each child group to this groups cpu_power. | ||
7198 | */ | ||
7199 | group = child->groups; | ||
7200 | do { | ||
7201 | sd->groups->cpu_power += group->cpu_power; | ||
7202 | group = group->next; | ||
7203 | } while (group != child->groups); | ||
7204 | } | 7008 | } |
7205 | 7009 | ||
7206 | /* | 7010 | /* |
@@ -7214,15 +7018,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
7214 | # define SD_INIT_NAME(sd, type) do { } while (0) | 7018 | # define SD_INIT_NAME(sd, type) do { } while (0) |
7215 | #endif | 7019 | #endif |
7216 | 7020 | ||
7217 | #define SD_INIT(sd, type) sd_init_##type(sd) | 7021 | #define SD_INIT_FUNC(type) \ |
7218 | 7022 | static noinline struct sched_domain * \ | |
7219 | #define SD_INIT_FUNC(type) \ | 7023 | sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ |
7220 | static noinline void sd_init_##type(struct sched_domain *sd) \ | 7024 | { \ |
7221 | { \ | 7025 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ |
7222 | memset(sd, 0, sizeof(*sd)); \ | 7026 | *sd = SD_##type##_INIT; \ |
7223 | *sd = SD_##type##_INIT; \ | 7027 | SD_INIT_NAME(sd, type); \ |
7224 | sd->level = SD_LV_##type; \ | 7028 | sd->private = &tl->data; \ |
7225 | SD_INIT_NAME(sd, type); \ | 7029 | return sd; \ |
7226 | } | 7030 | } |
7227 | 7031 | ||
7228 | SD_INIT_FUNC(CPU) | 7032 | SD_INIT_FUNC(CPU) |
@@ -7241,13 +7045,14 @@ SD_INIT_FUNC(CPU) | |||
7241 | #endif | 7045 | #endif |
7242 | 7046 | ||
7243 | static int default_relax_domain_level = -1; | 7047 | static int default_relax_domain_level = -1; |
7048 | int sched_domain_level_max; | ||
7244 | 7049 | ||
7245 | static int __init setup_relax_domain_level(char *str) | 7050 | static int __init setup_relax_domain_level(char *str) |
7246 | { | 7051 | { |
7247 | unsigned long val; | 7052 | unsigned long val; |
7248 | 7053 | ||
7249 | val = simple_strtoul(str, NULL, 0); | 7054 | val = simple_strtoul(str, NULL, 0); |
7250 | if (val < SD_LV_MAX) | 7055 | if (val < sched_domain_level_max) |
7251 | default_relax_domain_level = val; | 7056 | default_relax_domain_level = val; |
7252 | 7057 | ||
7253 | return 1; | 7058 | return 1; |
@@ -7275,37 +7080,20 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
7275 | } | 7080 | } |
7276 | } | 7081 | } |
7277 | 7082 | ||
7083 | static void __sdt_free(const struct cpumask *cpu_map); | ||
7084 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
7085 | |||
7278 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | 7086 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
7279 | const struct cpumask *cpu_map) | 7087 | const struct cpumask *cpu_map) |
7280 | { | 7088 | { |
7281 | switch (what) { | 7089 | switch (what) { |
7282 | case sa_sched_groups: | ||
7283 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
7284 | d->sched_group_nodes = NULL; | ||
7285 | case sa_rootdomain: | 7090 | case sa_rootdomain: |
7286 | free_rootdomain(d->rd); /* fall through */ | 7091 | if (!atomic_read(&d->rd->refcount)) |
7287 | case sa_tmpmask: | 7092 | free_rootdomain(&d->rd->rcu); /* fall through */ |
7288 | free_cpumask_var(d->tmpmask); /* fall through */ | 7093 | case sa_sd: |
7289 | case sa_send_covered: | 7094 | free_percpu(d->sd); /* fall through */ |
7290 | free_cpumask_var(d->send_covered); /* fall through */ | 7095 | case sa_sd_storage: |
7291 | case sa_this_book_map: | 7096 | __sdt_free(cpu_map); /* fall through */ |
7292 | free_cpumask_var(d->this_book_map); /* fall through */ | ||
7293 | case sa_this_core_map: | ||
7294 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
7295 | case sa_this_sibling_map: | ||
7296 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
7297 | case sa_nodemask: | ||
7298 | free_cpumask_var(d->nodemask); /* fall through */ | ||
7299 | case sa_sched_group_nodes: | ||
7300 | #ifdef CONFIG_NUMA | ||
7301 | kfree(d->sched_group_nodes); /* fall through */ | ||
7302 | case sa_notcovered: | ||
7303 | free_cpumask_var(d->notcovered); /* fall through */ | ||
7304 | case sa_covered: | ||
7305 | free_cpumask_var(d->covered); /* fall through */ | ||
7306 | case sa_domainspan: | ||
7307 | free_cpumask_var(d->domainspan); /* fall through */ | ||
7308 | #endif | ||
7309 | case sa_none: | 7097 | case sa_none: |
7310 | break; | 7098 | break; |
7311 | } | 7099 | } |
@@ -7314,308 +7102,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
7314 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | 7102 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
7315 | const struct cpumask *cpu_map) | 7103 | const struct cpumask *cpu_map) |
7316 | { | 7104 | { |
7317 | #ifdef CONFIG_NUMA | 7105 | memset(d, 0, sizeof(*d)); |
7318 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) | 7106 | |
7319 | return sa_none; | 7107 | if (__sdt_alloc(cpu_map)) |
7320 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) | 7108 | return sa_sd_storage; |
7321 | return sa_domainspan; | 7109 | d->sd = alloc_percpu(struct sched_domain *); |
7322 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) | 7110 | if (!d->sd) |
7323 | return sa_covered; | 7111 | return sa_sd_storage; |
7324 | /* Allocate the per-node list of sched groups */ | ||
7325 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
7326 | sizeof(struct sched_group *), GFP_KERNEL); | ||
7327 | if (!d->sched_group_nodes) { | ||
7328 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | ||
7329 | return sa_notcovered; | ||
7330 | } | ||
7331 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; | ||
7332 | #endif | ||
7333 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) | ||
7334 | return sa_sched_group_nodes; | ||
7335 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
7336 | return sa_nodemask; | ||
7337 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
7338 | return sa_this_sibling_map; | ||
7339 | if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) | ||
7340 | return sa_this_core_map; | ||
7341 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
7342 | return sa_this_book_map; | ||
7343 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
7344 | return sa_send_covered; | ||
7345 | d->rd = alloc_rootdomain(); | 7112 | d->rd = alloc_rootdomain(); |
7346 | if (!d->rd) { | 7113 | if (!d->rd) |
7347 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 7114 | return sa_sd; |
7348 | return sa_tmpmask; | ||
7349 | } | ||
7350 | return sa_rootdomain; | 7115 | return sa_rootdomain; |
7351 | } | 7116 | } |
7352 | 7117 | ||
7353 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | 7118 | /* |
7354 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | 7119 | * NULL the sd_data elements we've used to build the sched_domain and |
7120 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
7121 | * will not free the data we're using. | ||
7122 | */ | ||
7123 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
7355 | { | 7124 | { |
7356 | struct sched_domain *sd = NULL; | 7125 | struct sd_data *sdd = sd->private; |
7357 | #ifdef CONFIG_NUMA | 7126 | struct sched_group *sg = sd->groups; |
7358 | struct sched_domain *parent; | ||
7359 | |||
7360 | d->sd_allnodes = 0; | ||
7361 | if (cpumask_weight(cpu_map) > | ||
7362 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
7363 | sd = &per_cpu(allnodes_domains, i).sd; | ||
7364 | SD_INIT(sd, ALLNODES); | ||
7365 | set_domain_attribute(sd, attr); | ||
7366 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
7367 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7368 | d->sd_allnodes = 1; | ||
7369 | } | ||
7370 | parent = sd; | ||
7371 | |||
7372 | sd = &per_cpu(node_domains, i).sd; | ||
7373 | SD_INIT(sd, NODE); | ||
7374 | set_domain_attribute(sd, attr); | ||
7375 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
7376 | sd->parent = parent; | ||
7377 | if (parent) | ||
7378 | parent->child = sd; | ||
7379 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
7380 | #endif | ||
7381 | return sd; | ||
7382 | } | ||
7383 | 7127 | ||
7384 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | 7128 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
7385 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7129 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
7386 | struct sched_domain *parent, int i) | ||
7387 | { | ||
7388 | struct sched_domain *sd; | ||
7389 | sd = &per_cpu(phys_domains, i).sd; | ||
7390 | SD_INIT(sd, CPU); | ||
7391 | set_domain_attribute(sd, attr); | ||
7392 | cpumask_copy(sched_domain_span(sd), d->nodemask); | ||
7393 | sd->parent = parent; | ||
7394 | if (parent) | ||
7395 | parent->child = sd; | ||
7396 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7397 | return sd; | ||
7398 | } | ||
7399 | 7130 | ||
7400 | static struct sched_domain *__build_book_sched_domain(struct s_data *d, | 7131 | if (cpu == cpumask_first(sched_group_cpus(sg))) { |
7401 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7132 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); |
7402 | struct sched_domain *parent, int i) | 7133 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
7403 | { | 7134 | } |
7404 | struct sched_domain *sd = parent; | ||
7405 | #ifdef CONFIG_SCHED_BOOK | ||
7406 | sd = &per_cpu(book_domains, i).sd; | ||
7407 | SD_INIT(sd, BOOK); | ||
7408 | set_domain_attribute(sd, attr); | ||
7409 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); | ||
7410 | sd->parent = parent; | ||
7411 | parent->child = sd; | ||
7412 | cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7413 | #endif | ||
7414 | return sd; | ||
7415 | } | 7135 | } |
7416 | 7136 | ||
7417 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7137 | #ifdef CONFIG_SCHED_SMT |
7418 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7138 | static const struct cpumask *cpu_smt_mask(int cpu) |
7419 | struct sched_domain *parent, int i) | ||
7420 | { | 7139 | { |
7421 | struct sched_domain *sd = parent; | 7140 | return topology_thread_cpumask(cpu); |
7422 | #ifdef CONFIG_SCHED_MC | ||
7423 | sd = &per_cpu(core_domains, i).sd; | ||
7424 | SD_INIT(sd, MC); | ||
7425 | set_domain_attribute(sd, attr); | ||
7426 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); | ||
7427 | sd->parent = parent; | ||
7428 | parent->child = sd; | ||
7429 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7430 | #endif | ||
7431 | return sd; | ||
7432 | } | 7141 | } |
7433 | |||
7434 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
7435 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
7436 | struct sched_domain *parent, int i) | ||
7437 | { | ||
7438 | struct sched_domain *sd = parent; | ||
7439 | #ifdef CONFIG_SCHED_SMT | ||
7440 | sd = &per_cpu(cpu_domains, i).sd; | ||
7441 | SD_INIT(sd, SIBLING); | ||
7442 | set_domain_attribute(sd, attr); | ||
7443 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); | ||
7444 | sd->parent = parent; | ||
7445 | parent->child = sd; | ||
7446 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7447 | #endif | 7142 | #endif |
7448 | return sd; | ||
7449 | } | ||
7450 | 7143 | ||
7451 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | 7144 | /* |
7452 | const struct cpumask *cpu_map, int cpu) | 7145 | * Topology list, bottom-up. |
7453 | { | 7146 | */ |
7454 | switch (l) { | 7147 | static struct sched_domain_topology_level default_topology[] = { |
7455 | #ifdef CONFIG_SCHED_SMT | 7148 | #ifdef CONFIG_SCHED_SMT |
7456 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ | 7149 | { sd_init_SIBLING, cpu_smt_mask, }, |
7457 | cpumask_and(d->this_sibling_map, cpu_map, | ||
7458 | topology_thread_cpumask(cpu)); | ||
7459 | if (cpu == cpumask_first(d->this_sibling_map)) | ||
7460 | init_sched_build_groups(d->this_sibling_map, cpu_map, | ||
7461 | &cpu_to_cpu_group, | ||
7462 | d->send_covered, d->tmpmask); | ||
7463 | break; | ||
7464 | #endif | 7150 | #endif |
7465 | #ifdef CONFIG_SCHED_MC | 7151 | #ifdef CONFIG_SCHED_MC |
7466 | case SD_LV_MC: /* set up multi-core groups */ | 7152 | { sd_init_MC, cpu_coregroup_mask, }, |
7467 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); | ||
7468 | if (cpu == cpumask_first(d->this_core_map)) | ||
7469 | init_sched_build_groups(d->this_core_map, cpu_map, | ||
7470 | &cpu_to_core_group, | ||
7471 | d->send_covered, d->tmpmask); | ||
7472 | break; | ||
7473 | #endif | 7153 | #endif |
7474 | #ifdef CONFIG_SCHED_BOOK | 7154 | #ifdef CONFIG_SCHED_BOOK |
7475 | case SD_LV_BOOK: /* set up book groups */ | 7155 | { sd_init_BOOK, cpu_book_mask, }, |
7476 | cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); | ||
7477 | if (cpu == cpumask_first(d->this_book_map)) | ||
7478 | init_sched_build_groups(d->this_book_map, cpu_map, | ||
7479 | &cpu_to_book_group, | ||
7480 | d->send_covered, d->tmpmask); | ||
7481 | break; | ||
7482 | #endif | 7156 | #endif |
7483 | case SD_LV_CPU: /* set up physical groups */ | 7157 | { sd_init_CPU, cpu_cpu_mask, }, |
7484 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | ||
7485 | if (!cpumask_empty(d->nodemask)) | ||
7486 | init_sched_build_groups(d->nodemask, cpu_map, | ||
7487 | &cpu_to_phys_group, | ||
7488 | d->send_covered, d->tmpmask); | ||
7489 | break; | ||
7490 | #ifdef CONFIG_NUMA | 7158 | #ifdef CONFIG_NUMA |
7491 | case SD_LV_ALLNODES: | 7159 | { sd_init_NODE, cpu_node_mask, }, |
7492 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, | 7160 | { sd_init_ALLNODES, cpu_allnodes_mask, }, |
7493 | d->send_covered, d->tmpmask); | ||
7494 | break; | ||
7495 | #endif | 7161 | #endif |
7496 | default: | 7162 | { NULL, }, |
7497 | break; | 7163 | }; |
7164 | |||
7165 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | ||
7166 | |||
7167 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
7168 | { | ||
7169 | struct sched_domain_topology_level *tl; | ||
7170 | int j; | ||
7171 | |||
7172 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7173 | struct sd_data *sdd = &tl->data; | ||
7174 | |||
7175 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
7176 | if (!sdd->sd) | ||
7177 | return -ENOMEM; | ||
7178 | |||
7179 | sdd->sg = alloc_percpu(struct sched_group *); | ||
7180 | if (!sdd->sg) | ||
7181 | return -ENOMEM; | ||
7182 | |||
7183 | for_each_cpu(j, cpu_map) { | ||
7184 | struct sched_domain *sd; | ||
7185 | struct sched_group *sg; | ||
7186 | |||
7187 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
7188 | GFP_KERNEL, cpu_to_node(j)); | ||
7189 | if (!sd) | ||
7190 | return -ENOMEM; | ||
7191 | |||
7192 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
7193 | |||
7194 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7195 | GFP_KERNEL, cpu_to_node(j)); | ||
7196 | if (!sg) | ||
7197 | return -ENOMEM; | ||
7198 | |||
7199 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
7200 | } | ||
7201 | } | ||
7202 | |||
7203 | return 0; | ||
7204 | } | ||
7205 | |||
7206 | static void __sdt_free(const struct cpumask *cpu_map) | ||
7207 | { | ||
7208 | struct sched_domain_topology_level *tl; | ||
7209 | int j; | ||
7210 | |||
7211 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7212 | struct sd_data *sdd = &tl->data; | ||
7213 | |||
7214 | for_each_cpu(j, cpu_map) { | ||
7215 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
7216 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
7217 | } | ||
7218 | free_percpu(sdd->sd); | ||
7219 | free_percpu(sdd->sg); | ||
7220 | } | ||
7221 | } | ||
7222 | |||
7223 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
7224 | struct s_data *d, const struct cpumask *cpu_map, | ||
7225 | struct sched_domain_attr *attr, struct sched_domain *child, | ||
7226 | int cpu) | ||
7227 | { | ||
7228 | struct sched_domain *sd = tl->init(tl, cpu); | ||
7229 | if (!sd) | ||
7230 | return child; | ||
7231 | |||
7232 | set_domain_attribute(sd, attr); | ||
7233 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
7234 | if (child) { | ||
7235 | sd->level = child->level + 1; | ||
7236 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
7237 | child->parent = sd; | ||
7498 | } | 7238 | } |
7239 | sd->child = child; | ||
7240 | |||
7241 | return sd; | ||
7499 | } | 7242 | } |
7500 | 7243 | ||
7501 | /* | 7244 | /* |
7502 | * Build sched domains for a given set of cpus and attach the sched domains | 7245 | * Build sched domains for a given set of cpus and attach the sched domains |
7503 | * to the individual cpus | 7246 | * to the individual cpus |
7504 | */ | 7247 | */ |
7505 | static int __build_sched_domains(const struct cpumask *cpu_map, | 7248 | static int build_sched_domains(const struct cpumask *cpu_map, |
7506 | struct sched_domain_attr *attr) | 7249 | struct sched_domain_attr *attr) |
7507 | { | 7250 | { |
7508 | enum s_alloc alloc_state = sa_none; | 7251 | enum s_alloc alloc_state = sa_none; |
7509 | struct s_data d; | ||
7510 | struct sched_domain *sd; | 7252 | struct sched_domain *sd; |
7511 | int i; | 7253 | struct s_data d; |
7512 | #ifdef CONFIG_NUMA | 7254 | int i, ret = -ENOMEM; |
7513 | d.sd_allnodes = 0; | ||
7514 | #endif | ||
7515 | 7255 | ||
7516 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7256 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
7517 | if (alloc_state != sa_rootdomain) | 7257 | if (alloc_state != sa_rootdomain) |
7518 | goto error; | 7258 | goto error; |
7519 | alloc_state = sa_sched_groups; | ||
7520 | 7259 | ||
7521 | /* | 7260 | /* Set up domains for cpus specified by the cpu_map. */ |
7522 | * Set up domains for cpus specified by the cpu_map. | ||
7523 | */ | ||
7524 | for_each_cpu(i, cpu_map) { | 7261 | for_each_cpu(i, cpu_map) { |
7525 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), | 7262 | struct sched_domain_topology_level *tl; |
7526 | cpu_map); | ||
7527 | 7263 | ||
7528 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | 7264 | sd = NULL; |
7529 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | 7265 | for (tl = sched_domain_topology; tl->init; tl++) |
7530 | sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); | 7266 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); |
7531 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | ||
7532 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | ||
7533 | } | ||
7534 | |||
7535 | for_each_cpu(i, cpu_map) { | ||
7536 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | ||
7537 | build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); | ||
7538 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
7539 | } | ||
7540 | |||
7541 | /* Set up physical groups */ | ||
7542 | for (i = 0; i < nr_node_ids; i++) | ||
7543 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); | ||
7544 | 7267 | ||
7545 | #ifdef CONFIG_NUMA | 7268 | while (sd->child) |
7546 | /* Set up node groups */ | 7269 | sd = sd->child; |
7547 | if (d.sd_allnodes) | ||
7548 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
7549 | |||
7550 | for (i = 0; i < nr_node_ids; i++) | ||
7551 | if (build_numa_sched_groups(&d, cpu_map, i)) | ||
7552 | goto error; | ||
7553 | #endif | ||
7554 | 7270 | ||
7555 | /* Calculate CPU power for physical packages and nodes */ | 7271 | *per_cpu_ptr(d.sd, i) = sd; |
7556 | #ifdef CONFIG_SCHED_SMT | ||
7557 | for_each_cpu(i, cpu_map) { | ||
7558 | sd = &per_cpu(cpu_domains, i).sd; | ||
7559 | init_sched_groups_power(i, sd); | ||
7560 | } | ||
7561 | #endif | ||
7562 | #ifdef CONFIG_SCHED_MC | ||
7563 | for_each_cpu(i, cpu_map) { | ||
7564 | sd = &per_cpu(core_domains, i).sd; | ||
7565 | init_sched_groups_power(i, sd); | ||
7566 | } | 7272 | } |
7567 | #endif | ||
7568 | #ifdef CONFIG_SCHED_BOOK | ||
7569 | for_each_cpu(i, cpu_map) { | ||
7570 | sd = &per_cpu(book_domains, i).sd; | ||
7571 | init_sched_groups_power(i, sd); | ||
7572 | } | ||
7573 | #endif | ||
7574 | 7273 | ||
7274 | /* Build the groups for the domains */ | ||
7575 | for_each_cpu(i, cpu_map) { | 7275 | for_each_cpu(i, cpu_map) { |
7576 | sd = &per_cpu(phys_domains, i).sd; | 7276 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7577 | init_sched_groups_power(i, sd); | 7277 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); |
7578 | } | 7278 | get_group(i, sd->private, &sd->groups); |
7279 | atomic_inc(&sd->groups->ref); | ||
7579 | 7280 | ||
7580 | #ifdef CONFIG_NUMA | 7281 | if (i != cpumask_first(sched_domain_span(sd))) |
7581 | for (i = 0; i < nr_node_ids; i++) | 7282 | continue; |
7582 | init_numa_sched_groups_power(d.sched_group_nodes[i]); | ||
7583 | 7283 | ||
7584 | if (d.sd_allnodes) { | 7284 | build_sched_groups(sd); |
7585 | struct sched_group *sg; | 7285 | } |
7286 | } | ||
7586 | 7287 | ||
7587 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 7288 | /* Calculate CPU power for physical packages and nodes */ |
7588 | d.tmpmask); | 7289 | for (i = nr_cpumask_bits-1; i >= 0; i--) { |
7589 | init_numa_sched_groups_power(sg); | 7290 | if (!cpumask_test_cpu(i, cpu_map)) |
7291 | continue; | ||
7292 | |||
7293 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
7294 | claim_allocations(i, sd); | ||
7295 | init_sched_groups_power(i, sd); | ||
7296 | } | ||
7590 | } | 7297 | } |
7591 | #endif | ||
7592 | 7298 | ||
7593 | /* Attach the domains */ | 7299 | /* Attach the domains */ |
7300 | rcu_read_lock(); | ||
7594 | for_each_cpu(i, cpu_map) { | 7301 | for_each_cpu(i, cpu_map) { |
7595 | #ifdef CONFIG_SCHED_SMT | 7302 | sd = *per_cpu_ptr(d.sd, i); |
7596 | sd = &per_cpu(cpu_domains, i).sd; | ||
7597 | #elif defined(CONFIG_SCHED_MC) | ||
7598 | sd = &per_cpu(core_domains, i).sd; | ||
7599 | #elif defined(CONFIG_SCHED_BOOK) | ||
7600 | sd = &per_cpu(book_domains, i).sd; | ||
7601 | #else | ||
7602 | sd = &per_cpu(phys_domains, i).sd; | ||
7603 | #endif | ||
7604 | cpu_attach_domain(sd, d.rd, i); | 7303 | cpu_attach_domain(sd, d.rd, i); |
7605 | } | 7304 | } |
7305 | rcu_read_unlock(); | ||
7606 | 7306 | ||
7607 | d.sched_group_nodes = NULL; /* don't free this we still need it */ | 7307 | ret = 0; |
7608 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | ||
7609 | return 0; | ||
7610 | |||
7611 | error: | 7308 | error: |
7612 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7309 | __free_domain_allocs(&d, alloc_state, cpu_map); |
7613 | return -ENOMEM; | 7310 | return ret; |
7614 | } | ||
7615 | |||
7616 | static int build_sched_domains(const struct cpumask *cpu_map) | ||
7617 | { | ||
7618 | return __build_sched_domains(cpu_map, NULL); | ||
7619 | } | 7311 | } |
7620 | 7312 | ||
7621 | static cpumask_var_t *doms_cur; /* current sched domains */ | 7313 | static cpumask_var_t *doms_cur; /* current sched domains */ |
@@ -7670,7 +7362,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | |||
7670 | * For now this just excludes isolated cpus, but could be used to | 7362 | * For now this just excludes isolated cpus, but could be used to |
7671 | * exclude other special cases in the future. | 7363 | * exclude other special cases in the future. |
7672 | */ | 7364 | */ |
7673 | static int arch_init_sched_domains(const struct cpumask *cpu_map) | 7365 | static int init_sched_domains(const struct cpumask *cpu_map) |
7674 | { | 7366 | { |
7675 | int err; | 7367 | int err; |
7676 | 7368 | ||
@@ -7681,32 +7373,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) | |||
7681 | doms_cur = &fallback_doms; | 7373 | doms_cur = &fallback_doms; |
7682 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 7374 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
7683 | dattr_cur = NULL; | 7375 | dattr_cur = NULL; |
7684 | err = build_sched_domains(doms_cur[0]); | 7376 | err = build_sched_domains(doms_cur[0], NULL); |
7685 | register_sched_domain_sysctl(); | 7377 | register_sched_domain_sysctl(); |
7686 | 7378 | ||
7687 | return err; | 7379 | return err; |
7688 | } | 7380 | } |
7689 | 7381 | ||
7690 | static void arch_destroy_sched_domains(const struct cpumask *cpu_map, | ||
7691 | struct cpumask *tmpmask) | ||
7692 | { | ||
7693 | free_sched_groups(cpu_map, tmpmask); | ||
7694 | } | ||
7695 | |||
7696 | /* | 7382 | /* |
7697 | * Detach sched domains from a group of cpus specified in cpu_map | 7383 | * Detach sched domains from a group of cpus specified in cpu_map |
7698 | * These cpus will now be attached to the NULL domain | 7384 | * These cpus will now be attached to the NULL domain |
7699 | */ | 7385 | */ |
7700 | static void detach_destroy_domains(const struct cpumask *cpu_map) | 7386 | static void detach_destroy_domains(const struct cpumask *cpu_map) |
7701 | { | 7387 | { |
7702 | /* Save because hotplug lock held. */ | ||
7703 | static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); | ||
7704 | int i; | 7388 | int i; |
7705 | 7389 | ||
7390 | rcu_read_lock(); | ||
7706 | for_each_cpu(i, cpu_map) | 7391 | for_each_cpu(i, cpu_map) |
7707 | cpu_attach_domain(NULL, &def_root_domain, i); | 7392 | cpu_attach_domain(NULL, &def_root_domain, i); |
7708 | synchronize_sched(); | 7393 | rcu_read_unlock(); |
7709 | arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); | ||
7710 | } | 7394 | } |
7711 | 7395 | ||
7712 | /* handle null as "default" */ | 7396 | /* handle null as "default" */ |
@@ -7795,8 +7479,7 @@ match1: | |||
7795 | goto match2; | 7479 | goto match2; |
7796 | } | 7480 | } |
7797 | /* no match - add a new doms_new */ | 7481 | /* no match - add a new doms_new */ |
7798 | __build_sched_domains(doms_new[i], | 7482 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); |
7799 | dattr_new ? dattr_new + i : NULL); | ||
7800 | match2: | 7483 | match2: |
7801 | ; | 7484 | ; |
7802 | } | 7485 | } |
@@ -7815,7 +7498,7 @@ match2: | |||
7815 | } | 7498 | } |
7816 | 7499 | ||
7817 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 7500 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
7818 | static void arch_reinit_sched_domains(void) | 7501 | static void reinit_sched_domains(void) |
7819 | { | 7502 | { |
7820 | get_online_cpus(); | 7503 | get_online_cpus(); |
7821 | 7504 | ||
@@ -7848,7 +7531,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
7848 | else | 7531 | else |
7849 | sched_mc_power_savings = level; | 7532 | sched_mc_power_savings = level; |
7850 | 7533 | ||
7851 | arch_reinit_sched_domains(); | 7534 | reinit_sched_domains(); |
7852 | 7535 | ||
7853 | return count; | 7536 | return count; |
7854 | } | 7537 | } |
@@ -7967,14 +7650,9 @@ void __init sched_init_smp(void) | |||
7967 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7650 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
7968 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7651 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
7969 | 7652 | ||
7970 | #if defined(CONFIG_NUMA) | ||
7971 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
7972 | GFP_KERNEL); | ||
7973 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
7974 | #endif | ||
7975 | get_online_cpus(); | 7653 | get_online_cpus(); |
7976 | mutex_lock(&sched_domains_mutex); | 7654 | mutex_lock(&sched_domains_mutex); |
7977 | arch_init_sched_domains(cpu_active_mask); | 7655 | init_sched_domains(cpu_active_mask); |
7978 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 7656 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
7979 | if (cpumask_empty(non_isolated_cpus)) | 7657 | if (cpumask_empty(non_isolated_cpus)) |
7980 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 7658 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
@@ -8281,6 +7959,7 @@ void __init sched_init(void) | |||
8281 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 7959 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
8282 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 7960 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
8283 | #ifdef CONFIG_SMP | 7961 | #ifdef CONFIG_SMP |
7962 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | ||
8284 | #ifdef CONFIG_NO_HZ | 7963 | #ifdef CONFIG_NO_HZ |
8285 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 7964 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
8286 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | 7965 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
@@ -8340,7 +8019,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
8340 | int old_prio = p->prio; | 8019 | int old_prio = p->prio; |
8341 | int on_rq; | 8020 | int on_rq; |
8342 | 8021 | ||
8343 | on_rq = p->se.on_rq; | 8022 | on_rq = p->on_rq; |
8344 | if (on_rq) | 8023 | if (on_rq) |
8345 | deactivate_task(rq, p, 0); | 8024 | deactivate_task(rq, p, 0); |
8346 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 8025 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
@@ -8553,7 +8232,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8553 | { | 8232 | { |
8554 | struct rt_rq *rt_rq; | 8233 | struct rt_rq *rt_rq; |
8555 | struct sched_rt_entity *rt_se; | 8234 | struct sched_rt_entity *rt_se; |
8556 | struct rq *rq; | ||
8557 | int i; | 8235 | int i; |
8558 | 8236 | ||
8559 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | 8237 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8567,8 +8245,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8567 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | 8245 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); |
8568 | 8246 | ||
8569 | for_each_possible_cpu(i) { | 8247 | for_each_possible_cpu(i) { |
8570 | rq = cpu_rq(i); | ||
8571 | |||
8572 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | 8248 | rt_rq = kzalloc_node(sizeof(struct rt_rq), |
8573 | GFP_KERNEL, cpu_to_node(i)); | 8249 | GFP_KERNEL, cpu_to_node(i)); |
8574 | if (!rt_rq) | 8250 | if (!rt_rq) |
@@ -8683,7 +8359,7 @@ void sched_move_task(struct task_struct *tsk) | |||
8683 | rq = task_rq_lock(tsk, &flags); | 8359 | rq = task_rq_lock(tsk, &flags); |
8684 | 8360 | ||
8685 | running = task_current(rq, tsk); | 8361 | running = task_current(rq, tsk); |
8686 | on_rq = tsk->se.on_rq; | 8362 | on_rq = tsk->on_rq; |
8687 | 8363 | ||
8688 | if (on_rq) | 8364 | if (on_rq) |
8689 | dequeue_task(rq, tsk, 0); | 8365 | dequeue_task(rq, tsk, 0); |
@@ -8702,7 +8378,7 @@ void sched_move_task(struct task_struct *tsk) | |||
8702 | if (on_rq) | 8378 | if (on_rq) |
8703 | enqueue_task(rq, tsk, 0); | 8379 | enqueue_task(rq, tsk, 0); |
8704 | 8380 | ||
8705 | task_rq_unlock(rq, &flags); | 8381 | task_rq_unlock(rq, tsk, &flags); |
8706 | } | 8382 | } |
8707 | #endif /* CONFIG_CGROUP_SCHED */ | 8383 | #endif /* CONFIG_CGROUP_SCHED */ |
8708 | 8384 | ||
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7bacd83a4158..a6710a112b4f 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
152 | read_lock_irqsave(&tasklist_lock, flags); | 152 | read_lock_irqsave(&tasklist_lock, flags); |
153 | 153 | ||
154 | do_each_thread(g, p) { | 154 | do_each_thread(g, p) { |
155 | if (!p->se.on_rq || task_cpu(p) != rq_cpu) | 155 | if (!p->on_rq || task_cpu(p) != rq_cpu) |
156 | continue; | 156 | continue; |
157 | 157 | ||
158 | print_task(m, rq, p); | 158 | print_task(m, rq, p); |
@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
296 | P(ttwu_count); | 296 | P(ttwu_count); |
297 | P(ttwu_local); | 297 | P(ttwu_local); |
298 | 298 | ||
299 | SEQ_printf(m, " .%-30s: %d\n", "bkl_count", | ||
300 | rq->rq_sched_info.bkl_count); | ||
301 | |||
302 | #undef P | 299 | #undef P |
303 | #undef P64 | 300 | #undef P64 |
304 | #endif | 301 | #endif |
@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
441 | P(se.statistics.wait_count); | 438 | P(se.statistics.wait_count); |
442 | PN(se.statistics.iowait_sum); | 439 | PN(se.statistics.iowait_sum); |
443 | P(se.statistics.iowait_count); | 440 | P(se.statistics.iowait_count); |
444 | P(sched_info.bkl_count); | ||
445 | P(se.nr_migrations); | 441 | P(se.nr_migrations); |
446 | P(se.statistics.nr_migrations_cold); | 442 | P(se.statistics.nr_migrations_cold); |
447 | P(se.statistics.nr_failed_migrations_affine); | 443 | P(se.statistics.nr_failed_migrations_affine); |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6fa833ab2cb8..37f22626225e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) | |||
358 | } | 358 | } |
359 | 359 | ||
360 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); | 360 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); |
361 | #ifndef CONFIG_64BIT | ||
362 | smp_wmb(); | ||
363 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
364 | #endif | ||
361 | } | 365 | } |
362 | 366 | ||
363 | /* | 367 | /* |
@@ -1340,6 +1344,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1340 | hrtick_update(rq); | 1344 | hrtick_update(rq); |
1341 | } | 1345 | } |
1342 | 1346 | ||
1347 | static void set_next_buddy(struct sched_entity *se); | ||
1348 | |||
1343 | /* | 1349 | /* |
1344 | * The dequeue_task method is called before nr_running is | 1350 | * The dequeue_task method is called before nr_running is |
1345 | * decreased. We remove the task from the rbtree and | 1351 | * decreased. We remove the task from the rbtree and |
@@ -1349,14 +1355,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1349 | { | 1355 | { |
1350 | struct cfs_rq *cfs_rq; | 1356 | struct cfs_rq *cfs_rq; |
1351 | struct sched_entity *se = &p->se; | 1357 | struct sched_entity *se = &p->se; |
1358 | int task_sleep = flags & DEQUEUE_SLEEP; | ||
1352 | 1359 | ||
1353 | for_each_sched_entity(se) { | 1360 | for_each_sched_entity(se) { |
1354 | cfs_rq = cfs_rq_of(se); | 1361 | cfs_rq = cfs_rq_of(se); |
1355 | dequeue_entity(cfs_rq, se, flags); | 1362 | dequeue_entity(cfs_rq, se, flags); |
1356 | 1363 | ||
1357 | /* Don't dequeue parent if it has other entities besides us */ | 1364 | /* Don't dequeue parent if it has other entities besides us */ |
1358 | if (cfs_rq->load.weight) | 1365 | if (cfs_rq->load.weight) { |
1366 | /* | ||
1367 | * Bias pick_next to pick a task from this cfs_rq, as | ||
1368 | * p is sleeping when it is within its sched_slice. | ||
1369 | */ | ||
1370 | if (task_sleep && parent_entity(se)) | ||
1371 | set_next_buddy(parent_entity(se)); | ||
1359 | break; | 1372 | break; |
1373 | } | ||
1360 | flags |= DEQUEUE_SLEEP; | 1374 | flags |= DEQUEUE_SLEEP; |
1361 | } | 1375 | } |
1362 | 1376 | ||
@@ -1372,12 +1386,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1372 | 1386 | ||
1373 | #ifdef CONFIG_SMP | 1387 | #ifdef CONFIG_SMP |
1374 | 1388 | ||
1375 | static void task_waking_fair(struct rq *rq, struct task_struct *p) | 1389 | static void task_waking_fair(struct task_struct *p) |
1376 | { | 1390 | { |
1377 | struct sched_entity *se = &p->se; | 1391 | struct sched_entity *se = &p->se; |
1378 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1392 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1393 | u64 min_vruntime; | ||
1379 | 1394 | ||
1380 | se->vruntime -= cfs_rq->min_vruntime; | 1395 | #ifndef CONFIG_64BIT |
1396 | u64 min_vruntime_copy; | ||
1397 | |||
1398 | do { | ||
1399 | min_vruntime_copy = cfs_rq->min_vruntime_copy; | ||
1400 | smp_rmb(); | ||
1401 | min_vruntime = cfs_rq->min_vruntime; | ||
1402 | } while (min_vruntime != min_vruntime_copy); | ||
1403 | #else | ||
1404 | min_vruntime = cfs_rq->min_vruntime; | ||
1405 | #endif | ||
1406 | |||
1407 | se->vruntime -= min_vruntime; | ||
1381 | } | 1408 | } |
1382 | 1409 | ||
1383 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1410 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -1622,6 +1649,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1622 | /* | 1649 | /* |
1623 | * Otherwise, iterate the domains and find an elegible idle cpu. | 1650 | * Otherwise, iterate the domains and find an elegible idle cpu. |
1624 | */ | 1651 | */ |
1652 | rcu_read_lock(); | ||
1625 | for_each_domain(target, sd) { | 1653 | for_each_domain(target, sd) { |
1626 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) | 1654 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) |
1627 | break; | 1655 | break; |
@@ -1641,6 +1669,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1641 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) | 1669 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) |
1642 | break; | 1670 | break; |
1643 | } | 1671 | } |
1672 | rcu_read_unlock(); | ||
1644 | 1673 | ||
1645 | return target; | 1674 | return target; |
1646 | } | 1675 | } |
@@ -1657,7 +1686,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1657 | * preempt must be disabled. | 1686 | * preempt must be disabled. |
1658 | */ | 1687 | */ |
1659 | static int | 1688 | static int |
1660 | select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) | 1689 | select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) |
1661 | { | 1690 | { |
1662 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 1691 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
1663 | int cpu = smp_processor_id(); | 1692 | int cpu = smp_processor_id(); |
@@ -1673,6 +1702,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1673 | new_cpu = prev_cpu; | 1702 | new_cpu = prev_cpu; |
1674 | } | 1703 | } |
1675 | 1704 | ||
1705 | rcu_read_lock(); | ||
1676 | for_each_domain(cpu, tmp) { | 1706 | for_each_domain(cpu, tmp) { |
1677 | if (!(tmp->flags & SD_LOAD_BALANCE)) | 1707 | if (!(tmp->flags & SD_LOAD_BALANCE)) |
1678 | continue; | 1708 | continue; |
@@ -1723,9 +1753,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1723 | 1753 | ||
1724 | if (affine_sd) { | 1754 | if (affine_sd) { |
1725 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 1755 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) |
1726 | return select_idle_sibling(p, cpu); | 1756 | prev_cpu = cpu; |
1727 | else | 1757 | |
1728 | return select_idle_sibling(p, prev_cpu); | 1758 | new_cpu = select_idle_sibling(p, prev_cpu); |
1759 | goto unlock; | ||
1729 | } | 1760 | } |
1730 | 1761 | ||
1731 | while (sd) { | 1762 | while (sd) { |
@@ -1766,6 +1797,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1766 | } | 1797 | } |
1767 | /* while loop will break here if sd == NULL */ | 1798 | /* while loop will break here if sd == NULL */ |
1768 | } | 1799 | } |
1800 | unlock: | ||
1801 | rcu_read_unlock(); | ||
1769 | 1802 | ||
1770 | return new_cpu; | 1803 | return new_cpu; |
1771 | } | 1804 | } |
@@ -1789,10 +1822,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) | |||
1789 | * This is especially important for buddies when the leftmost | 1822 | * This is especially important for buddies when the leftmost |
1790 | * task is higher priority than the buddy. | 1823 | * task is higher priority than the buddy. |
1791 | */ | 1824 | */ |
1792 | if (unlikely(se->load.weight != NICE_0_LOAD)) | 1825 | return calc_delta_fair(gran, se); |
1793 | gran = calc_delta_fair(gran, se); | ||
1794 | |||
1795 | return gran; | ||
1796 | } | 1826 | } |
1797 | 1827 | ||
1798 | /* | 1828 | /* |
@@ -1826,26 +1856,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | |||
1826 | 1856 | ||
1827 | static void set_last_buddy(struct sched_entity *se) | 1857 | static void set_last_buddy(struct sched_entity *se) |
1828 | { | 1858 | { |
1829 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1859 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
1830 | for_each_sched_entity(se) | 1860 | return; |
1831 | cfs_rq_of(se)->last = se; | 1861 | |
1832 | } | 1862 | for_each_sched_entity(se) |
1863 | cfs_rq_of(se)->last = se; | ||
1833 | } | 1864 | } |
1834 | 1865 | ||
1835 | static void set_next_buddy(struct sched_entity *se) | 1866 | static void set_next_buddy(struct sched_entity *se) |
1836 | { | 1867 | { |
1837 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1868 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
1838 | for_each_sched_entity(se) | 1869 | return; |
1839 | cfs_rq_of(se)->next = se; | 1870 | |
1840 | } | 1871 | for_each_sched_entity(se) |
1872 | cfs_rq_of(se)->next = se; | ||
1841 | } | 1873 | } |
1842 | 1874 | ||
1843 | static void set_skip_buddy(struct sched_entity *se) | 1875 | static void set_skip_buddy(struct sched_entity *se) |
1844 | { | 1876 | { |
1845 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1877 | for_each_sched_entity(se) |
1846 | for_each_sched_entity(se) | 1878 | cfs_rq_of(se)->skip = se; |
1847 | cfs_rq_of(se)->skip = se; | ||
1848 | } | ||
1849 | } | 1879 | } |
1850 | 1880 | ||
1851 | /* | 1881 | /* |
@@ -1857,12 +1887,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1857 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1887 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1858 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1888 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1859 | int scale = cfs_rq->nr_running >= sched_nr_latency; | 1889 | int scale = cfs_rq->nr_running >= sched_nr_latency; |
1890 | int next_buddy_marked = 0; | ||
1860 | 1891 | ||
1861 | if (unlikely(se == pse)) | 1892 | if (unlikely(se == pse)) |
1862 | return; | 1893 | return; |
1863 | 1894 | ||
1864 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) | 1895 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { |
1865 | set_next_buddy(pse); | 1896 | set_next_buddy(pse); |
1897 | next_buddy_marked = 1; | ||
1898 | } | ||
1866 | 1899 | ||
1867 | /* | 1900 | /* |
1868 | * We can come here with TIF_NEED_RESCHED already set from new task | 1901 | * We can come here with TIF_NEED_RESCHED already set from new task |
@@ -1890,8 +1923,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1890 | update_curr(cfs_rq); | 1923 | update_curr(cfs_rq); |
1891 | find_matching_se(&se, &pse); | 1924 | find_matching_se(&se, &pse); |
1892 | BUG_ON(!pse); | 1925 | BUG_ON(!pse); |
1893 | if (wakeup_preempt_entity(se, pse) == 1) | 1926 | if (wakeup_preempt_entity(se, pse) == 1) { |
1927 | /* | ||
1928 | * Bias pick_next to pick the sched entity that is | ||
1929 | * triggering this preemption. | ||
1930 | */ | ||
1931 | if (!next_buddy_marked) | ||
1932 | set_next_buddy(pse); | ||
1894 | goto preempt; | 1933 | goto preempt; |
1934 | } | ||
1895 | 1935 | ||
1896 | return; | 1936 | return; |
1897 | 1937 | ||
@@ -2102,7 +2142,7 @@ static unsigned long | |||
2102 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2142 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2103 | unsigned long max_load_move, struct sched_domain *sd, | 2143 | unsigned long max_load_move, struct sched_domain *sd, |
2104 | enum cpu_idle_type idle, int *all_pinned, | 2144 | enum cpu_idle_type idle, int *all_pinned, |
2105 | int *this_best_prio, struct cfs_rq *busiest_cfs_rq) | 2145 | struct cfs_rq *busiest_cfs_rq) |
2106 | { | 2146 | { |
2107 | int loops = 0, pulled = 0; | 2147 | int loops = 0, pulled = 0; |
2108 | long rem_load_move = max_load_move; | 2148 | long rem_load_move = max_load_move; |
@@ -2140,9 +2180,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2140 | */ | 2180 | */ |
2141 | if (rem_load_move <= 0) | 2181 | if (rem_load_move <= 0) |
2142 | break; | 2182 | break; |
2143 | |||
2144 | if (p->prio < *this_best_prio) | ||
2145 | *this_best_prio = p->prio; | ||
2146 | } | 2183 | } |
2147 | out: | 2184 | out: |
2148 | /* | 2185 | /* |
@@ -2202,7 +2239,7 @@ static unsigned long | |||
2202 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2239 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2203 | unsigned long max_load_move, | 2240 | unsigned long max_load_move, |
2204 | struct sched_domain *sd, enum cpu_idle_type idle, | 2241 | struct sched_domain *sd, enum cpu_idle_type idle, |
2205 | int *all_pinned, int *this_best_prio) | 2242 | int *all_pinned) |
2206 | { | 2243 | { |
2207 | long rem_load_move = max_load_move; | 2244 | long rem_load_move = max_load_move; |
2208 | int busiest_cpu = cpu_of(busiest); | 2245 | int busiest_cpu = cpu_of(busiest); |
@@ -2227,7 +2264,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2227 | rem_load = div_u64(rem_load, busiest_h_load + 1); | 2264 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
2228 | 2265 | ||
2229 | moved_load = balance_tasks(this_rq, this_cpu, busiest, | 2266 | moved_load = balance_tasks(this_rq, this_cpu, busiest, |
2230 | rem_load, sd, idle, all_pinned, this_best_prio, | 2267 | rem_load, sd, idle, all_pinned, |
2231 | busiest_cfs_rq); | 2268 | busiest_cfs_rq); |
2232 | 2269 | ||
2233 | if (!moved_load) | 2270 | if (!moved_load) |
@@ -2253,11 +2290,11 @@ static unsigned long | |||
2253 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2290 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2254 | unsigned long max_load_move, | 2291 | unsigned long max_load_move, |
2255 | struct sched_domain *sd, enum cpu_idle_type idle, | 2292 | struct sched_domain *sd, enum cpu_idle_type idle, |
2256 | int *all_pinned, int *this_best_prio) | 2293 | int *all_pinned) |
2257 | { | 2294 | { |
2258 | return balance_tasks(this_rq, this_cpu, busiest, | 2295 | return balance_tasks(this_rq, this_cpu, busiest, |
2259 | max_load_move, sd, idle, all_pinned, | 2296 | max_load_move, sd, idle, all_pinned, |
2260 | this_best_prio, &busiest->cfs); | 2297 | &busiest->cfs); |
2261 | } | 2298 | } |
2262 | #endif | 2299 | #endif |
2263 | 2300 | ||
@@ -2274,12 +2311,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2274 | int *all_pinned) | 2311 | int *all_pinned) |
2275 | { | 2312 | { |
2276 | unsigned long total_load_moved = 0, load_moved; | 2313 | unsigned long total_load_moved = 0, load_moved; |
2277 | int this_best_prio = this_rq->curr->prio; | ||
2278 | 2314 | ||
2279 | do { | 2315 | do { |
2280 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, | 2316 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, |
2281 | max_load_move - total_load_moved, | 2317 | max_load_move - total_load_moved, |
2282 | sd, idle, all_pinned, &this_best_prio); | 2318 | sd, idle, all_pinned); |
2283 | 2319 | ||
2284 | total_load_moved += load_moved; | 2320 | total_load_moved += load_moved; |
2285 | 2321 | ||
@@ -2648,7 +2684,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
2648 | /* | 2684 | /* |
2649 | * Only siblings can have significantly less than SCHED_LOAD_SCALE | 2685 | * Only siblings can have significantly less than SCHED_LOAD_SCALE |
2650 | */ | 2686 | */ |
2651 | if (sd->level != SD_LV_SIBLING) | 2687 | if (!(sd->flags & SD_SHARE_CPUPOWER)) |
2652 | return 0; | 2688 | return 0; |
2653 | 2689 | ||
2654 | /* | 2690 | /* |
@@ -3465,6 +3501,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3465 | raw_spin_unlock(&this_rq->lock); | 3501 | raw_spin_unlock(&this_rq->lock); |
3466 | 3502 | ||
3467 | update_shares(this_cpu); | 3503 | update_shares(this_cpu); |
3504 | rcu_read_lock(); | ||
3468 | for_each_domain(this_cpu, sd) { | 3505 | for_each_domain(this_cpu, sd) { |
3469 | unsigned long interval; | 3506 | unsigned long interval; |
3470 | int balance = 1; | 3507 | int balance = 1; |
@@ -3486,6 +3523,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3486 | break; | 3523 | break; |
3487 | } | 3524 | } |
3488 | } | 3525 | } |
3526 | rcu_read_unlock(); | ||
3489 | 3527 | ||
3490 | raw_spin_lock(&this_rq->lock); | 3528 | raw_spin_lock(&this_rq->lock); |
3491 | 3529 | ||
@@ -3534,6 +3572,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
3534 | double_lock_balance(busiest_rq, target_rq); | 3572 | double_lock_balance(busiest_rq, target_rq); |
3535 | 3573 | ||
3536 | /* Search for an sd spanning us and the target CPU. */ | 3574 | /* Search for an sd spanning us and the target CPU. */ |
3575 | rcu_read_lock(); | ||
3537 | for_each_domain(target_cpu, sd) { | 3576 | for_each_domain(target_cpu, sd) { |
3538 | if ((sd->flags & SD_LOAD_BALANCE) && | 3577 | if ((sd->flags & SD_LOAD_BALANCE) && |
3539 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | 3578 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) |
@@ -3549,6 +3588,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
3549 | else | 3588 | else |
3550 | schedstat_inc(sd, alb_failed); | 3589 | schedstat_inc(sd, alb_failed); |
3551 | } | 3590 | } |
3591 | rcu_read_unlock(); | ||
3552 | double_unlock_balance(busiest_rq, target_rq); | 3592 | double_unlock_balance(busiest_rq, target_rq); |
3553 | out_unlock: | 3593 | out_unlock: |
3554 | busiest_rq->active_balance = 0; | 3594 | busiest_rq->active_balance = 0; |
@@ -3675,6 +3715,7 @@ static int find_new_ilb(int cpu) | |||
3675 | { | 3715 | { |
3676 | struct sched_domain *sd; | 3716 | struct sched_domain *sd; |
3677 | struct sched_group *ilb_group; | 3717 | struct sched_group *ilb_group; |
3718 | int ilb = nr_cpu_ids; | ||
3678 | 3719 | ||
3679 | /* | 3720 | /* |
3680 | * Have idle load balancer selection from semi-idle packages only | 3721 | * Have idle load balancer selection from semi-idle packages only |
@@ -3690,20 +3731,25 @@ static int find_new_ilb(int cpu) | |||
3690 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) | 3731 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) |
3691 | goto out_done; | 3732 | goto out_done; |
3692 | 3733 | ||
3734 | rcu_read_lock(); | ||
3693 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 3735 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
3694 | ilb_group = sd->groups; | 3736 | ilb_group = sd->groups; |
3695 | 3737 | ||
3696 | do { | 3738 | do { |
3697 | if (is_semi_idle_group(ilb_group)) | 3739 | if (is_semi_idle_group(ilb_group)) { |
3698 | return cpumask_first(nohz.grp_idle_mask); | 3740 | ilb = cpumask_first(nohz.grp_idle_mask); |
3741 | goto unlock; | ||
3742 | } | ||
3699 | 3743 | ||
3700 | ilb_group = ilb_group->next; | 3744 | ilb_group = ilb_group->next; |
3701 | 3745 | ||
3702 | } while (ilb_group != sd->groups); | 3746 | } while (ilb_group != sd->groups); |
3703 | } | 3747 | } |
3748 | unlock: | ||
3749 | rcu_read_unlock(); | ||
3704 | 3750 | ||
3705 | out_done: | 3751 | out_done: |
3706 | return nr_cpu_ids; | 3752 | return ilb; |
3707 | } | 3753 | } |
3708 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 3754 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
3709 | static inline int find_new_ilb(int call_cpu) | 3755 | static inline int find_new_ilb(int call_cpu) |
@@ -3848,6 +3894,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3848 | 3894 | ||
3849 | update_shares(cpu); | 3895 | update_shares(cpu); |
3850 | 3896 | ||
3897 | rcu_read_lock(); | ||
3851 | for_each_domain(cpu, sd) { | 3898 | for_each_domain(cpu, sd) { |
3852 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3899 | if (!(sd->flags & SD_LOAD_BALANCE)) |
3853 | continue; | 3900 | continue; |
@@ -3893,6 +3940,7 @@ out: | |||
3893 | if (!balance) | 3940 | if (!balance) |
3894 | break; | 3941 | break; |
3895 | } | 3942 | } |
3943 | rcu_read_unlock(); | ||
3896 | 3944 | ||
3897 | /* | 3945 | /* |
3898 | * next_balance will be updated only when there is a need. | 3946 | * next_balance will be updated only when there is a need. |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 68e69acc29b9..be40f7371ee1 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1) | |||
64 | * Decrement CPU power based on irq activity | 64 | * Decrement CPU power based on irq activity |
65 | */ | 65 | */ |
66 | SCHED_FEAT(NONIRQ_POWER, 1) | 66 | SCHED_FEAT(NONIRQ_POWER, 1) |
67 | |||
68 | /* | ||
69 | * Queue remote wakeups on the target CPU and process them | ||
70 | * using the scheduler IPI. Reduces rq->lock contention/bounces. | ||
71 | */ | ||
72 | SCHED_FEAT(TTWU_QUEUE, 1) | ||
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index a776a6396427..0a51882534ea 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -7,7 +7,7 @@ | |||
7 | 7 | ||
8 | #ifdef CONFIG_SMP | 8 | #ifdef CONFIG_SMP |
9 | static int | 9 | static int |
10 | select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | 10 | select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) |
11 | { | 11 | { |
12 | return task_cpu(p); /* IDLE tasks as never migrated */ | 12 | return task_cpu(p); /* IDLE tasks as never migrated */ |
13 | } | 13 | } |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e7cebdc65f82..64b2a37c07d0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | 183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); |
184 | } | 184 | } |
185 | 185 | ||
186 | typedef struct task_group *rt_rq_iter_t; | ||
187 | |||
188 | #define for_each_rt_rq(rt_rq, iter, rq) \ | ||
189 | for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ | ||
190 | (&iter->list != &task_groups) && \ | ||
191 | (rt_rq = iter->rt_rq[cpu_of(rq)]); \ | ||
192 | iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) | ||
193 | |||
186 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | 194 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) |
187 | { | 195 | { |
188 | list_add_rcu(&rt_rq->leaf_rt_rq_list, | 196 | list_add_rcu(&rt_rq->leaf_rt_rq_list, |
@@ -288,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
288 | return ktime_to_ns(def_rt_bandwidth.rt_period); | 296 | return ktime_to_ns(def_rt_bandwidth.rt_period); |
289 | } | 297 | } |
290 | 298 | ||
299 | typedef struct rt_rq *rt_rq_iter_t; | ||
300 | |||
301 | #define for_each_rt_rq(rt_rq, iter, rq) \ | ||
302 | for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | ||
303 | |||
291 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | 304 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) |
292 | { | 305 | { |
293 | } | 306 | } |
@@ -402,12 +415,13 @@ next: | |||
402 | static void __disable_runtime(struct rq *rq) | 415 | static void __disable_runtime(struct rq *rq) |
403 | { | 416 | { |
404 | struct root_domain *rd = rq->rd; | 417 | struct root_domain *rd = rq->rd; |
418 | rt_rq_iter_t iter; | ||
405 | struct rt_rq *rt_rq; | 419 | struct rt_rq *rt_rq; |
406 | 420 | ||
407 | if (unlikely(!scheduler_running)) | 421 | if (unlikely(!scheduler_running)) |
408 | return; | 422 | return; |
409 | 423 | ||
410 | for_each_leaf_rt_rq(rt_rq, rq) { | 424 | for_each_rt_rq(rt_rq, iter, rq) { |
411 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 425 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
412 | s64 want; | 426 | s64 want; |
413 | int i; | 427 | int i; |
@@ -487,6 +501,7 @@ static void disable_runtime(struct rq *rq) | |||
487 | 501 | ||
488 | static void __enable_runtime(struct rq *rq) | 502 | static void __enable_runtime(struct rq *rq) |
489 | { | 503 | { |
504 | rt_rq_iter_t iter; | ||
490 | struct rt_rq *rt_rq; | 505 | struct rt_rq *rt_rq; |
491 | 506 | ||
492 | if (unlikely(!scheduler_running)) | 507 | if (unlikely(!scheduler_running)) |
@@ -495,7 +510,7 @@ static void __enable_runtime(struct rq *rq) | |||
495 | /* | 510 | /* |
496 | * Reset each runqueue's bandwidth settings | 511 | * Reset each runqueue's bandwidth settings |
497 | */ | 512 | */ |
498 | for_each_leaf_rt_rq(rt_rq, rq) { | 513 | for_each_rt_rq(rt_rq, iter, rq) { |
499 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 514 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
500 | 515 | ||
501 | raw_spin_lock(&rt_b->rt_runtime_lock); | 516 | raw_spin_lock(&rt_b->rt_runtime_lock); |
@@ -562,6 +577,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
562 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | 577 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { |
563 | rt_rq->rt_throttled = 0; | 578 | rt_rq->rt_throttled = 0; |
564 | enqueue = 1; | 579 | enqueue = 1; |
580 | |||
581 | /* | ||
582 | * Force a clock update if the CPU was idle, | ||
583 | * lest wakeup -> unthrottle time accumulate. | ||
584 | */ | ||
585 | if (rt_rq->rt_nr_running && rq->curr == rq->idle) | ||
586 | rq->skip_clock_update = -1; | ||
565 | } | 587 | } |
566 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | 588 | if (rt_rq->rt_time || rt_rq->rt_nr_running) |
567 | idle = 0; | 589 | idle = 0; |
@@ -977,13 +999,23 @@ static void yield_task_rt(struct rq *rq) | |||
977 | static int find_lowest_rq(struct task_struct *task); | 999 | static int find_lowest_rq(struct task_struct *task); |
978 | 1000 | ||
979 | static int | 1001 | static int |
980 | select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | 1002 | select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) |
981 | { | 1003 | { |
1004 | struct task_struct *curr; | ||
1005 | struct rq *rq; | ||
1006 | int cpu; | ||
1007 | |||
982 | if (sd_flag != SD_BALANCE_WAKE) | 1008 | if (sd_flag != SD_BALANCE_WAKE) |
983 | return smp_processor_id(); | 1009 | return smp_processor_id(); |
984 | 1010 | ||
1011 | cpu = task_cpu(p); | ||
1012 | rq = cpu_rq(cpu); | ||
1013 | |||
1014 | rcu_read_lock(); | ||
1015 | curr = ACCESS_ONCE(rq->curr); /* unlocked access */ | ||
1016 | |||
985 | /* | 1017 | /* |
986 | * If the current task is an RT task, then | 1018 | * If the current task on @p's runqueue is an RT task, then |
987 | * try to see if we can wake this RT task up on another | 1019 | * try to see if we can wake this RT task up on another |
988 | * runqueue. Otherwise simply start this RT task | 1020 | * runqueue. Otherwise simply start this RT task |
989 | * on its current runqueue. | 1021 | * on its current runqueue. |
@@ -997,21 +1029,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | |||
997 | * lock? | 1029 | * lock? |
998 | * | 1030 | * |
999 | * For equal prio tasks, we just let the scheduler sort it out. | 1031 | * For equal prio tasks, we just let the scheduler sort it out. |
1032 | * | ||
1033 | * Otherwise, just let it ride on the affined RQ and the | ||
1034 | * post-schedule router will push the preempted task away | ||
1035 | * | ||
1036 | * This test is optimistic, if we get it wrong the load-balancer | ||
1037 | * will have to sort it out. | ||
1000 | */ | 1038 | */ |
1001 | if (unlikely(rt_task(rq->curr)) && | 1039 | if (curr && unlikely(rt_task(curr)) && |
1002 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1040 | (curr->rt.nr_cpus_allowed < 2 || |
1003 | rq->curr->prio < p->prio) && | 1041 | curr->prio < p->prio) && |
1004 | (p->rt.nr_cpus_allowed > 1)) { | 1042 | (p->rt.nr_cpus_allowed > 1)) { |
1005 | int cpu = find_lowest_rq(p); | 1043 | int target = find_lowest_rq(p); |
1006 | 1044 | ||
1007 | return (cpu == -1) ? task_cpu(p) : cpu; | 1045 | if (target != -1) |
1046 | cpu = target; | ||
1008 | } | 1047 | } |
1048 | rcu_read_unlock(); | ||
1009 | 1049 | ||
1010 | /* | 1050 | return cpu; |
1011 | * Otherwise, just let it ride on the affined RQ and the | ||
1012 | * post-schedule router will push the preempted task away | ||
1013 | */ | ||
1014 | return task_cpu(p); | ||
1015 | } | 1051 | } |
1016 | 1052 | ||
1017 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1053 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
@@ -1136,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1136 | * The previous task needs to be made eligible for pushing | 1172 | * The previous task needs to be made eligible for pushing |
1137 | * if it is still active | 1173 | * if it is still active |
1138 | */ | 1174 | */ |
1139 | if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) | 1175 | if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) |
1140 | enqueue_pushable_task(rq, p); | 1176 | enqueue_pushable_task(rq, p); |
1141 | } | 1177 | } |
1142 | 1178 | ||
@@ -1287,7 +1323,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1287 | !cpumask_test_cpu(lowest_rq->cpu, | 1323 | !cpumask_test_cpu(lowest_rq->cpu, |
1288 | &task->cpus_allowed) || | 1324 | &task->cpus_allowed) || |
1289 | task_running(rq, task) || | 1325 | task_running(rq, task) || |
1290 | !task->se.on_rq)) { | 1326 | !task->on_rq)) { |
1291 | 1327 | ||
1292 | raw_spin_unlock(&lowest_rq->lock); | 1328 | raw_spin_unlock(&lowest_rq->lock); |
1293 | lowest_rq = NULL; | 1329 | lowest_rq = NULL; |
@@ -1321,7 +1357,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
1321 | BUG_ON(task_current(rq, p)); | 1357 | BUG_ON(task_current(rq, p)); |
1322 | BUG_ON(p->rt.nr_cpus_allowed <= 1); | 1358 | BUG_ON(p->rt.nr_cpus_allowed <= 1); |
1323 | 1359 | ||
1324 | BUG_ON(!p->se.on_rq); | 1360 | BUG_ON(!p->on_rq); |
1325 | BUG_ON(!rt_task(p)); | 1361 | BUG_ON(!rt_task(p)); |
1326 | 1362 | ||
1327 | return p; | 1363 | return p; |
@@ -1467,7 +1503,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
1467 | */ | 1503 | */ |
1468 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { | 1504 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { |
1469 | WARN_ON(p == src_rq->curr); | 1505 | WARN_ON(p == src_rq->curr); |
1470 | WARN_ON(!p->se.on_rq); | 1506 | WARN_ON(!p->on_rq); |
1471 | 1507 | ||
1472 | /* | 1508 | /* |
1473 | * There's a chance that p is higher in priority | 1509 | * There's a chance that p is higher in priority |
@@ -1538,7 +1574,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
1538 | * Update the migration status of the RQ if we have an RT task | 1574 | * Update the migration status of the RQ if we have an RT task |
1539 | * which is running AND changing its weight value. | 1575 | * which is running AND changing its weight value. |
1540 | */ | 1576 | */ |
1541 | if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { | 1577 | if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { |
1542 | struct rq *rq = task_rq(p); | 1578 | struct rq *rq = task_rq(p); |
1543 | 1579 | ||
1544 | if (!task_current(rq, p)) { | 1580 | if (!task_current(rq, p)) { |
@@ -1608,7 +1644,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
1608 | * we may need to handle the pulling of RT tasks | 1644 | * we may need to handle the pulling of RT tasks |
1609 | * now. | 1645 | * now. |
1610 | */ | 1646 | */ |
1611 | if (p->se.on_rq && !rq->rt.rt_nr_running) | 1647 | if (p->on_rq && !rq->rt.rt_nr_running) |
1612 | pull_rt_task(rq); | 1648 | pull_rt_task(rq); |
1613 | } | 1649 | } |
1614 | 1650 | ||
@@ -1638,7 +1674,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
1638 | * If that current running task is also an RT task | 1674 | * If that current running task is also an RT task |
1639 | * then see if we can move to another run queue. | 1675 | * then see if we can move to another run queue. |
1640 | */ | 1676 | */ |
1641 | if (p->se.on_rq && rq->curr != p) { | 1677 | if (p->on_rq && rq->curr != p) { |
1642 | #ifdef CONFIG_SMP | 1678 | #ifdef CONFIG_SMP |
1643 | if (rq->rt.overloaded && push_rt_task(rq) && | 1679 | if (rq->rt.overloaded && push_rt_task(rq) && |
1644 | /* Don't resched if we changed runqueues */ | 1680 | /* Don't resched if we changed runqueues */ |
@@ -1657,7 +1693,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
1657 | static void | 1693 | static void |
1658 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | 1694 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
1659 | { | 1695 | { |
1660 | if (!p->se.on_rq) | 1696 | if (!p->on_rq) |
1661 | return; | 1697 | return; |
1662 | 1698 | ||
1663 | if (rq->curr == p) { | 1699 | if (rq->curr == p) { |
@@ -1796,10 +1832,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); | |||
1796 | 1832 | ||
1797 | static void print_rt_stats(struct seq_file *m, int cpu) | 1833 | static void print_rt_stats(struct seq_file *m, int cpu) |
1798 | { | 1834 | { |
1835 | rt_rq_iter_t iter; | ||
1799 | struct rt_rq *rt_rq; | 1836 | struct rt_rq *rt_rq; |
1800 | 1837 | ||
1801 | rcu_read_lock(); | 1838 | rcu_read_lock(); |
1802 | for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) | 1839 | for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) |
1803 | print_rt_rq(m, cpu, rt_rq); | 1840 | print_rt_rq(m, cpu, rt_rq); |
1804 | rcu_read_unlock(); | 1841 | rcu_read_unlock(); |
1805 | } | 1842 | } |
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 1ba2bd40fdac..6f437632afab 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c | |||
@@ -9,8 +9,7 @@ | |||
9 | 9 | ||
10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
11 | static int | 11 | static int |
12 | select_task_rq_stop(struct rq *rq, struct task_struct *p, | 12 | select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) |
13 | int sd_flag, int flags) | ||
14 | { | 13 | { |
15 | return task_cpu(p); /* stop tasks as never migrate */ | 14 | return task_cpu(p); /* stop tasks as never migrate */ |
16 | } | 15 | } |
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
26 | { | 25 | { |
27 | struct task_struct *stop = rq->stop; | 26 | struct task_struct *stop = rq->stop; |
28 | 27 | ||
29 | if (stop && stop->se.on_rq) | 28 | if (stop && stop->on_rq) |
30 | return stop; | 29 | return stop; |
31 | 30 | ||
32 | return NULL; | 31 | return NULL; |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35d55a386145..f925c45f0afa 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -53,7 +53,6 @@ const char *reserved_field_names[] = { | |||
53 | "common_preempt_count", | 53 | "common_preempt_count", |
54 | "common_pid", | 54 | "common_pid", |
55 | "common_tgid", | 55 | "common_tgid", |
56 | "common_lock_depth", | ||
57 | FIELD_STRING_IP, | 56 | FIELD_STRING_IP, |
58 | FIELD_STRING_RETIP, | 57 | FIELD_STRING_RETIP, |
59 | FIELD_STRING_FUNC, | 58 | FIELD_STRING_FUNC, |
diff --git a/tools/perf/Documentation/perf-script-perl.txt b/tools/perf/Documentation/perf-script-perl.txt index 5bb41e55a3ac..3152cca15501 100644 --- a/tools/perf/Documentation/perf-script-perl.txt +++ b/tools/perf/Documentation/perf-script-perl.txt | |||
@@ -63,7 +63,6 @@ The format file for the sched_wakep event defines the following fields | |||
63 | field:unsigned char common_flags; | 63 | field:unsigned char common_flags; |
64 | field:unsigned char common_preempt_count; | 64 | field:unsigned char common_preempt_count; |
65 | field:int common_pid; | 65 | field:int common_pid; |
66 | field:int common_lock_depth; | ||
67 | 66 | ||
68 | field:char comm[TASK_COMM_LEN]; | 67 | field:char comm[TASK_COMM_LEN]; |
69 | field:pid_t pid; | 68 | field:pid_t pid; |
diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt index 36b38277422c..471022069119 100644 --- a/tools/perf/Documentation/perf-script-python.txt +++ b/tools/perf/Documentation/perf-script-python.txt | |||
@@ -463,7 +463,6 @@ The format file for the sched_wakep event defines the following fields | |||
463 | field:unsigned char common_flags; | 463 | field:unsigned char common_flags; |
464 | field:unsigned char common_preempt_count; | 464 | field:unsigned char common_preempt_count; |
465 | field:int common_pid; | 465 | field:int common_pid; |
466 | field:int common_lock_depth; | ||
467 | 466 | ||
468 | field:char comm[TASK_COMM_LEN]; | 467 | field:char comm[TASK_COMM_LEN]; |
469 | field:pid_t pid; | 468 | field:pid_t pid; |