-rw-r--r--  Documentation/scheduler/sched-design-CFS.txt  |   6
-rw-r--r--  arch/ia64/include/asm/topology.h              |  25
-rw-r--r--  arch/mips/include/asm/mach-ip27/topology.h    |  17
-rw-r--r--  arch/powerpc/include/asm/topology.h           |  36
-rw-r--r--  arch/sh/include/asm/topology.h                |  25
-rw-r--r--  arch/sparc/include/asm/topology_64.h          |  19
-rw-r--r--  arch/tile/include/asm/topology.h              |  26
-rw-r--r--  arch/x86/include/asm/topology.h               |  38
-rw-r--r--  arch/x86/kernel/process.c                     |   8
-rw-r--r--  arch/x86/kernel/smpboot.c                     | 105
-rw-r--r--  arch/x86/mm/numa_emulation.c                  |   8
-rw-r--r--  include/linux/sched.h                         |   2
-rw-r--r--  include/linux/topology.h                      |  37
-rw-r--r--  kernel/sched/core.c                           | 327
-rw-r--r--  kernel/sched/debug.c                          |  12
-rw-r--r--  kernel/sched/fair.c                           | 203
-rw-r--r--  kernel/sched/idle_task.c                      |   2
-rw-r--r--  kernel/sched/rt.c                             |  56
-rw-r--r--  kernel/sched/sched.h                          |   8
19 files changed, 444 insertions, 516 deletions
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 91ecff07cede..d529e02d928d 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -130,7 +130,7 @@ CFS implements three scheduling policies:
130 idle timer scheduler in order to avoid to get into priority 130 idle timer scheduler in order to avoid to get into priority
131 inversion problems which would deadlock the machine. 131 inversion problems which would deadlock the machine.
132 132
133SCHED_FIFO/_RR are implemented in sched_rt.c and are as specified by 133SCHED_FIFO/_RR are implemented in sched/rt.c and are as specified by
134POSIX. 134POSIX.
135 135
136The command chrt from util-linux-ng 2.13.1.1 can set all of these except 136The command chrt from util-linux-ng 2.13.1.1 can set all of these except
@@ -145,9 +145,9 @@ Classes," an extensible hierarchy of scheduler modules. These modules
145encapsulate scheduling policy details and are handled by the scheduler core 145encapsulate scheduling policy details and are handled by the scheduler core
146without the core code assuming too much about them. 146without the core code assuming too much about them.
147 147
148sched_fair.c implements the CFS scheduler described above. 148sched/fair.c implements the CFS scheduler described above.
149 149
150sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than 150sched/rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than
151the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT 151the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT
152priority levels, instead of 140 in the previous scheduler) and it needs no 152priority levels, instead of 140 in the previous scheduler) and it needs no
153expired array. 153expired array.
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 09f646753d1a..a2496e449b75 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -70,31 +70,6 @@ void build_cpu_to_node_map(void);
70 .nr_balance_failed = 0, \ 70 .nr_balance_failed = 0, \
71} 71}
72 72
73/* sched_domains SD_NODE_INIT for IA64 NUMA machines */
74#define SD_NODE_INIT (struct sched_domain) { \
75 .parent = NULL, \
76 .child = NULL, \
77 .groups = NULL, \
78 .min_interval = 8, \
79 .max_interval = 8*(min(num_online_cpus(), 32U)), \
80 .busy_factor = 64, \
81 .imbalance_pct = 125, \
82 .cache_nice_tries = 2, \
83 .busy_idx = 3, \
84 .idle_idx = 2, \
85 .newidle_idx = 0, \
86 .wake_idx = 0, \
87 .forkexec_idx = 0, \
88 .flags = SD_LOAD_BALANCE \
89 | SD_BALANCE_NEWIDLE \
90 | SD_BALANCE_EXEC \
91 | SD_BALANCE_FORK \
92 | SD_SERIALIZE, \
93 .last_balance = jiffies, \
94 .balance_interval = 64, \
95 .nr_balance_failed = 0, \
96}
97
98#endif /* CONFIG_NUMA */ 73#endif /* CONFIG_NUMA */
99 74
100#ifdef CONFIG_SMP 75#ifdef CONFIG_SMP
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 1b1a7d1632b9..b2cf641f206f 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -36,23 +36,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
36 36
37#define node_distance(from, to) (__node_distances[(from)][(to)]) 37#define node_distance(from, to) (__node_distances[(from)][(to)])
38 38
39/* sched_domains SD_NODE_INIT for SGI IP27 machines */
40#define SD_NODE_INIT (struct sched_domain) { \
41 .parent = NULL, \
42 .child = NULL, \
43 .groups = NULL, \
44 .min_interval = 8, \
45 .max_interval = 32, \
46 .busy_factor = 32, \
47 .imbalance_pct = 125, \
48 .cache_nice_tries = 1, \
49 .flags = SD_LOAD_BALANCE | \
50 SD_BALANCE_EXEC, \
51 .last_balance = jiffies, \
52 .balance_interval = 1, \
53 .nr_balance_failed = 0, \
54}
55
56#include <asm-generic/topology.h> 39#include <asm-generic/topology.h>
57 40
58#endif /* _ASM_MACH_TOPOLOGY_H */ 41#endif /* _ASM_MACH_TOPOLOGY_H */
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index c97185885c6d..852ed1b384f6 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -18,12 +18,6 @@ struct device_node;
18 */ 18 */
19#define RECLAIM_DISTANCE 10 19#define RECLAIM_DISTANCE 10
20 20
21/*
22 * Avoid creating an extra level of balancing (SD_ALLNODES) on the largest
23 * POWER7 boxes which have a maximum of 32 nodes.
24 */
25#define SD_NODES_PER_DOMAIN 32
26
27#include <asm/mmzone.h> 21#include <asm/mmzone.h>
28 22
29static inline int cpu_to_node(int cpu) 23static inline int cpu_to_node(int cpu)
@@ -51,36 +45,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
51 cpu_all_mask : \ 45 cpu_all_mask : \
52 cpumask_of_node(pcibus_to_node(bus))) 46 cpumask_of_node(pcibus_to_node(bus)))
53 47
54/* sched_domains SD_NODE_INIT for PPC64 machines */
55#define SD_NODE_INIT (struct sched_domain) { \
56 .min_interval = 8, \
57 .max_interval = 32, \
58 .busy_factor = 32, \
59 .imbalance_pct = 125, \
60 .cache_nice_tries = 1, \
61 .busy_idx = 3, \
62 .idle_idx = 1, \
63 .newidle_idx = 0, \
64 .wake_idx = 0, \
65 .forkexec_idx = 0, \
66 \
67 .flags = 1*SD_LOAD_BALANCE \
68 | 0*SD_BALANCE_NEWIDLE \
69 | 1*SD_BALANCE_EXEC \
70 | 1*SD_BALANCE_FORK \
71 | 0*SD_BALANCE_WAKE \
72 | 1*SD_WAKE_AFFINE \
73 | 0*SD_PREFER_LOCAL \
74 | 0*SD_SHARE_CPUPOWER \
75 | 0*SD_POWERSAVINGS_BALANCE \
76 | 0*SD_SHARE_PKG_RESOURCES \
77 | 1*SD_SERIALIZE \
78 | 0*SD_PREFER_SIBLING \
79 , \
80 .last_balance = jiffies, \
81 .balance_interval = 1, \
82}
83
84extern int __node_distance(int, int); 48extern int __node_distance(int, int);
85#define node_distance(a, b) __node_distance(a, b) 49#define node_distance(a, b) __node_distance(a, b)
86 50
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index 88e734069fa6..b0a282d65f6a 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -3,31 +3,6 @@
3 3
4#ifdef CONFIG_NUMA 4#ifdef CONFIG_NUMA
5 5
6/* sched_domains SD_NODE_INIT for sh machines */
7#define SD_NODE_INIT (struct sched_domain) { \
8 .parent = NULL, \
9 .child = NULL, \
10 .groups = NULL, \
11 .min_interval = 8, \
12 .max_interval = 32, \
13 .busy_factor = 32, \
14 .imbalance_pct = 125, \
15 .cache_nice_tries = 2, \
16 .busy_idx = 3, \
17 .idle_idx = 2, \
18 .newidle_idx = 0, \
19 .wake_idx = 0, \
20 .forkexec_idx = 0, \
21 .flags = SD_LOAD_BALANCE \
22 | SD_BALANCE_FORK \
23 | SD_BALANCE_EXEC \
24 | SD_BALANCE_NEWIDLE \
25 | SD_SERIALIZE, \
26 .last_balance = jiffies, \
27 .balance_interval = 1, \
28 .nr_balance_failed = 0, \
29}
30
31#define cpu_to_node(cpu) ((void)(cpu),0) 6#define cpu_to_node(cpu) ((void)(cpu),0)
32#define parent_node(node) ((void)(node),0) 7#define parent_node(node) ((void)(node),0)
33 8
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 8b9c556d630b..1754390a426f 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -31,25 +31,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
31 cpu_all_mask : \ 31 cpu_all_mask : \
32 cpumask_of_node(pcibus_to_node(bus))) 32 cpumask_of_node(pcibus_to_node(bus)))
33 33
34#define SD_NODE_INIT (struct sched_domain) { \
35 .min_interval = 8, \
36 .max_interval = 32, \
37 .busy_factor = 32, \
38 .imbalance_pct = 125, \
39 .cache_nice_tries = 2, \
40 .busy_idx = 3, \
41 .idle_idx = 2, \
42 .newidle_idx = 0, \
43 .wake_idx = 0, \
44 .forkexec_idx = 0, \
45 .flags = SD_LOAD_BALANCE \
46 | SD_BALANCE_FORK \
47 | SD_BALANCE_EXEC \
48 | SD_SERIALIZE, \
49 .last_balance = jiffies, \
50 .balance_interval = 1, \
51}
52
53#else /* CONFIG_NUMA */ 34#else /* CONFIG_NUMA */
54 35
55#include <asm-generic/topology.h> 36#include <asm-generic/topology.h>
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 6fdd0c860193..7a7ce390534f 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -78,32 +78,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
78 .balance_interval = 32, \ 78 .balance_interval = 32, \
79} 79}
80 80
81/* sched_domains SD_NODE_INIT for TILE architecture */
82#define SD_NODE_INIT (struct sched_domain) { \
83 .min_interval = 16, \
84 .max_interval = 512, \
85 .busy_factor = 32, \
86 .imbalance_pct = 125, \
87 .cache_nice_tries = 1, \
88 .busy_idx = 3, \
89 .idle_idx = 1, \
90 .newidle_idx = 2, \
91 .wake_idx = 1, \
92 .flags = 1*SD_LOAD_BALANCE \
93 | 1*SD_BALANCE_NEWIDLE \
94 | 1*SD_BALANCE_EXEC \
95 | 1*SD_BALANCE_FORK \
96 | 0*SD_BALANCE_WAKE \
97 | 0*SD_WAKE_AFFINE \
98 | 0*SD_PREFER_LOCAL \
99 | 0*SD_SHARE_CPUPOWER \
100 | 0*SD_SHARE_PKG_RESOURCES \
101 | 1*SD_SERIALIZE \
102 , \
103 .last_balance = jiffies, \
104 .balance_interval = 128, \
105}
106
107/* By definition, we create nodes based on online memory. */ 81/* By definition, we create nodes based on online memory. */
108#define node_has_online_mem(nid) 1 82#define node_has_online_mem(nid) 1
109 83
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index b9676ae37ada..095b21507b6a 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -92,44 +92,6 @@ extern void setup_node_to_cpumask_map(void);
92 92
93#define pcibus_to_node(bus) __pcibus_to_node(bus) 93#define pcibus_to_node(bus) __pcibus_to_node(bus)
94 94
95#ifdef CONFIG_X86_32
96# define SD_CACHE_NICE_TRIES 1
97# define SD_IDLE_IDX 1
98#else
99# define SD_CACHE_NICE_TRIES 2
100# define SD_IDLE_IDX 2
101#endif
102
103/* sched_domains SD_NODE_INIT for NUMA machines */
104#define SD_NODE_INIT (struct sched_domain) { \
105 .min_interval = 8, \
106 .max_interval = 32, \
107 .busy_factor = 32, \
108 .imbalance_pct = 125, \
109 .cache_nice_tries = SD_CACHE_NICE_TRIES, \
110 .busy_idx = 3, \
111 .idle_idx = SD_IDLE_IDX, \
112 .newidle_idx = 0, \
113 .wake_idx = 0, \
114 .forkexec_idx = 0, \
115 \
116 .flags = 1*SD_LOAD_BALANCE \
117 | 1*SD_BALANCE_NEWIDLE \
118 | 1*SD_BALANCE_EXEC \
119 | 1*SD_BALANCE_FORK \
120 | 0*SD_BALANCE_WAKE \
121 | 1*SD_WAKE_AFFINE \
122 | 0*SD_PREFER_LOCAL \
123 | 0*SD_SHARE_CPUPOWER \
124 | 0*SD_POWERSAVINGS_BALANCE \
125 | 0*SD_SHARE_PKG_RESOURCES \
126 | 1*SD_SERIALIZE \
127 | 0*SD_PREFER_SIBLING \
128 , \
129 .last_balance = jiffies, \
130 .balance_interval = 1, \
131}
132
133extern int __node_distance(int, int); 95extern int __node_distance(int, int);
134#define node_distance(a, b) __node_distance(a, b) 96#define node_distance(a, b) __node_distance(a, b)
135 97
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d92a5ab6e8b..ad57d832d96f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -594,9 +594,17 @@ int mwait_usable(const struct cpuinfo_x86 *c)
594{ 594{
595 u32 eax, ebx, ecx, edx; 595 u32 eax, ebx, ecx, edx;
596 596
597 /* Use mwait if idle=mwait boot option is given */
597 if (boot_option_idle_override == IDLE_FORCE_MWAIT) 598 if (boot_option_idle_override == IDLE_FORCE_MWAIT)
598 return 1; 599 return 1;
599 600
601 /*
602 * Any idle= boot option other than idle=mwait means that we must not
603 * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
604 */
605 if (boot_option_idle_override != IDLE_NO_OVERRIDE)
606 return 0;
607
600 if (c->cpuid_level < MWAIT_INFO) 608 if (c->cpuid_level < MWAIT_INFO)
601 return 0; 609 return 0;
602 610
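
The two added checks above make the mwait_usable() policy "idle=mwait forces MWAIT on, any other idle= option forces it off, otherwise fall back to the CPUID checks". Below is a small standalone model of that decision, not kernel code: the enum is simplified and mwait_usable_model()/cpu_supports_mwait are invented for illustration.

/* Illustrative model of the mwait_usable() policy above; not kernel code. */
#include <stdio.h>

enum idle_boot_override {          /* mirrors the kernel's names, simplified */
	IDLE_NO_OVERRIDE = 0,      /* no idle= option on the command line    */
	IDLE_FORCE_MWAIT,          /* idle=mwait                             */
	IDLE_HALT,                 /* idle=halt                              */
	IDLE_POLL,                 /* idle=poll                              */
	IDLE_NOMWAIT,              /* idle=nomwait                           */
};

/* cpu_supports_mwait stands in for the CPUID checks done by the real code. */
static int mwait_usable_model(enum idle_boot_override override,
			      int cpu_supports_mwait)
{
	if (override == IDLE_FORCE_MWAIT)
		return 1;                  /* user forced mwait: trust them   */
	if (override != IDLE_NO_OVERRIDE)
		return 0;                  /* halt/poll/nomwait: never mwait  */
	return cpu_supports_mwait;         /* otherwise defer to the CPU      */
}

int main(void)
{
	printf("idle=halt  -> %d\n", mwait_usable_model(IDLE_HALT, 1));
	printf("no option  -> %d\n", mwait_usable_model(IDLE_NO_OVERRIDE, 1));
	return 0;
}
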
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6e1e406038c2..e84c1bbea339 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -315,59 +315,90 @@ void __cpuinit smp_store_cpu_info(int id)
315 identify_secondary_cpu(c); 315 identify_secondary_cpu(c);
316} 316}
317 317
318static void __cpuinit link_thread_siblings(int cpu1, int cpu2) 318static bool __cpuinit
319topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
319{ 320{
320 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); 321 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
321 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); 322
322 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); 323 return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
323 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); 324 "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
324 cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); 325 "[node: %d != %d]. Ignoring dependency.\n",
325 cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); 326 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
326} 327}
327 328
329#define link_mask(_m, c1, c2) \
330do { \
331 cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \
332 cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \
333} while (0)
334
335static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
336{
337 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
338 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
339
340 if (c->phys_proc_id == o->phys_proc_id &&
341 per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
342 c->compute_unit_id == o->compute_unit_id)
343 return topology_sane(c, o, "smt");
344
345 } else if (c->phys_proc_id == o->phys_proc_id &&
346 c->cpu_core_id == o->cpu_core_id) {
347 return topology_sane(c, o, "smt");
348 }
349
350 return false;
351}
352
353static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
354{
355 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
356
357 if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
358 per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
359 return topology_sane(c, o, "llc");
360
361 return false;
362}
363
364static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
365{
366 if (c->phys_proc_id == o->phys_proc_id)
367 return topology_sane(c, o, "mc");
368
369 return false;
370}
328 371
329void __cpuinit set_cpu_sibling_map(int cpu) 372void __cpuinit set_cpu_sibling_map(int cpu)
330{ 373{
331 int i; 374 bool has_mc = boot_cpu_data.x86_max_cores > 1;
375 bool has_smt = smp_num_siblings > 1;
332 struct cpuinfo_x86 *c = &cpu_data(cpu); 376 struct cpuinfo_x86 *c = &cpu_data(cpu);
377 struct cpuinfo_x86 *o;
378 int i;
333 379
334 cpumask_set_cpu(cpu, cpu_sibling_setup_mask); 380 cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
335 381
336 if (smp_num_siblings > 1) { 382 if (!has_smt && !has_mc) {
337 for_each_cpu(i, cpu_sibling_setup_mask) {
338 struct cpuinfo_x86 *o = &cpu_data(i);
339
340 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
341 if (c->phys_proc_id == o->phys_proc_id &&
342 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
343 c->compute_unit_id == o->compute_unit_id)
344 link_thread_siblings(cpu, i);
345 } else if (c->phys_proc_id == o->phys_proc_id &&
346 c->cpu_core_id == o->cpu_core_id) {
347 link_thread_siblings(cpu, i);
348 }
349 }
350 } else {
351 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 383 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
352 } 384 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
353 385 cpumask_set_cpu(cpu, cpu_core_mask(cpu));
354 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
355
356 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
357 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
358 c->booted_cores = 1; 386 c->booted_cores = 1;
359 return; 387 return;
360 } 388 }
361 389
362 for_each_cpu(i, cpu_sibling_setup_mask) { 390 for_each_cpu(i, cpu_sibling_setup_mask) {
363 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 391 o = &cpu_data(i);
364 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 392
365 cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); 393 if ((i == cpu) || (has_smt && match_smt(c, o)))
366 cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); 394 link_mask(sibling, cpu, i);
367 } 395
368 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 396 if ((i == cpu) || (has_mc && match_llc(c, o)))
369 cpumask_set_cpu(i, cpu_core_mask(cpu)); 397 link_mask(llc_shared, cpu, i);
370 cpumask_set_cpu(cpu, cpu_core_mask(i)); 398
399 if ((i == cpu) || (has_mc && match_mc(c, o))) {
400 link_mask(core, cpu, i);
401
371 /* 402 /*
372 * Does this new cpu bringup a new core? 403 * Does this new cpu bringup a new core?
373 */ 404 */
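
The rewritten set_cpu_sibling_map() above leans on the link_mask() macro, which pastes the mask family name into the matching cpu_*_mask() accessor. A self-contained sketch of that token-pasting pattern follows; the toy cpumask type and the per-cpu arrays are stand-ins invented for the example, only the macro shape comes from the patch.

/* Standalone sketch of the link_mask() token-pasting pattern; stub types. */
#include <stdio.h>

#define NR_CPUS 4

struct cpumask { unsigned long bits; };           /* toy one-word cpumask  */

static struct cpumask sibling_masks[NR_CPUS];     /* stand-ins for the     */
static struct cpumask core_masks[NR_CPUS];        /* per-cpu masks that    */
static struct cpumask llc_shared_masks[NR_CPUS];  /* the kernel maintains  */

static void cpumask_set_cpu(int cpu, struct cpumask *m)
{
	m->bits |= 1UL << cpu;
}

/* Toy accessors named so that cpu_##_m##_mask(c) resolves to them. */
static struct cpumask *cpu_sibling_mask(int cpu)    { return &sibling_masks[cpu]; }
static struct cpumask *cpu_core_mask(int cpu)       { return &core_masks[cpu]; }
static struct cpumask *cpu_llc_shared_mask(int cpu) { return &llc_shared_masks[cpu]; }

/* Same shape as the patch: symmetrically link c1 and c2 in mask family _m. */
#define link_mask(_m, c1, c2)                           \
do {                                                    \
	cpumask_set_cpu((c1), cpu_##_m##_mask(c2));     \
	cpumask_set_cpu((c2), cpu_##_m##_mask(c1));     \
} while (0)

int main(void)
{
	link_mask(sibling, 0, 1);      /* expands to cpu_sibling_mask()    */
	link_mask(llc_shared, 0, 1);   /* expands to cpu_llc_shared_mask() */
	link_mask(core, 0, 2);         /* expands to cpu_core_mask()       */

	printf("cpu0 sibling bits: %#lx\n", sibling_masks[0].bits);
	printf("cpu0 core bits:    %#lx\n", core_masks[0].bits);
	return 0;
}
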
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 53489ff6bf82..871dd8868170 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -339,9 +339,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
339 } else { 339 } else {
340 unsigned long n; 340 unsigned long n;
341 341
342 n = simple_strtoul(emu_cmdline, NULL, 0); 342 n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
343 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); 343 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
344 } 344 }
345 if (*emu_cmdline == ':')
346 emu_cmdline++;
345 347
346 if (ret < 0) 348 if (ret < 0)
347 goto no_emu; 349 goto no_emu;
@@ -418,7 +420,9 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
418 int physj = emu_nid_to_phys[j]; 420 int physj = emu_nid_to_phys[j];
419 int dist; 421 int dist;
420 422
421 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) 423 if (get_option(&emu_cmdline, &dist) == 2)
424 ;
425 else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
422 dist = physi == physj ? 426 dist = physi == physj ?
423 LOCAL_DISTANCE : REMOTE_DISTANCE; 427 LOCAL_DISTANCE : REMOTE_DISTANCE;
424 else 428 else
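
The numa_emulation() hunks above let a single emulation option string carry first the node count and then, after a ':', further values that are consumed incrementally as the string is walked; the patch relies on get_option() returning 2 to mean "an integer followed by a comma was consumed". The userspace sketch below mimics that style of parsing; get_int_option() is a hand-rolled stand-in for the kernel helper and the exact numa=fake grammar is not reproduced.

/* Userspace sketch of incrementally consuming "N:d1,d2,..." style options. */
#include <stdio.h>
#include <stdlib.h>

/*
 * Minimal stand-in for the kernel's get_option():
 * returns 0 if no integer was found, 1 if an integer was parsed,
 * 2 if an integer was parsed and a comma followed (cursor moves past it).
 */
static int get_int_option(char **str, int *val)
{
	char *end;
	long v = strtol(*str, &end, 0);

	if (end == *str)
		return 0;
	*val = (int)v;
	*str = end;
	if (**str == ',') {
		(*str)++;
		return 2;
	}
	return 1;
}

int main(void)
{
	char buf[] = "4:10,20,20,10";
	char *cmdline = buf;
	int nr_nodes, dist;

	get_int_option(&cmdline, &nr_nodes);   /* node count, as in the patch */
	if (*cmdline == ':')                   /* optional trailing values    */
		cmdline++;

	printf("emulating %d nodes\n", nr_nodes);
	while (get_int_option(&cmdline, &dist))
		printf("next distance: %d\n", dist);
	return 0;
}
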
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 81a173c0897d..4a559bf0622f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1950,7 +1950,7 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1950 */ 1950 */
1951extern unsigned long long notrace sched_clock(void); 1951extern unsigned long long notrace sched_clock(void);
1952/* 1952/*
1953 * See the comment in kernel/sched_clock.c 1953 * See the comment in kernel/sched/clock.c
1954 */ 1954 */
1955extern u64 cpu_clock(int cpu); 1955extern u64 cpu_clock(int cpu);
1956extern u64 local_clock(void); 1956extern u64 local_clock(void);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e26db031303b..4f59bf36f0af 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -70,7 +70,6 @@ int arch_update_cpu_topology(void);
70 * Below are the 3 major initializers used in building sched_domains: 70 * Below are the 3 major initializers used in building sched_domains:
71 * SD_SIBLING_INIT, for SMT domains 71 * SD_SIBLING_INIT, for SMT domains
72 * SD_CPU_INIT, for SMP domains 72 * SD_CPU_INIT, for SMP domains
73 * SD_NODE_INIT, for NUMA domains
74 * 73 *
75 * Any architecture that cares to do any tuning to these values should do so 74 * Any architecture that cares to do any tuning to these values should do so
76 * by defining their own arch-specific initializer in include/asm/topology.h. 75 * by defining their own arch-specific initializer in include/asm/topology.h.
@@ -176,48 +175,12 @@ int arch_update_cpu_topology(void);
176} 175}
177#endif 176#endif
178 177
179/* sched_domains SD_ALLNODES_INIT for NUMA machines */
180#define SD_ALLNODES_INIT (struct sched_domain) { \
181 .min_interval = 64, \
182 .max_interval = 64*num_online_cpus(), \
183 .busy_factor = 128, \
184 .imbalance_pct = 133, \
185 .cache_nice_tries = 1, \
186 .busy_idx = 3, \
187 .idle_idx = 3, \
188 .flags = 1*SD_LOAD_BALANCE \
189 | 1*SD_BALANCE_NEWIDLE \
190 | 0*SD_BALANCE_EXEC \
191 | 0*SD_BALANCE_FORK \
192 | 0*SD_BALANCE_WAKE \
193 | 0*SD_WAKE_AFFINE \
194 | 0*SD_SHARE_CPUPOWER \
195 | 0*SD_POWERSAVINGS_BALANCE \
196 | 0*SD_SHARE_PKG_RESOURCES \
197 | 1*SD_SERIALIZE \
198 | 0*SD_PREFER_SIBLING \
199 , \
200 .last_balance = jiffies, \
201 .balance_interval = 64, \
202}
203
204#ifndef SD_NODES_PER_DOMAIN
205#define SD_NODES_PER_DOMAIN 16
206#endif
207
208#ifdef CONFIG_SCHED_BOOK 178#ifdef CONFIG_SCHED_BOOK
209#ifndef SD_BOOK_INIT 179#ifndef SD_BOOK_INIT
210#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!! 180#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
211#endif 181#endif
212#endif /* CONFIG_SCHED_BOOK */ 182#endif /* CONFIG_SCHED_BOOK */
213 183
214#ifdef CONFIG_NUMA
215#ifndef SD_NODE_INIT
216#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
217#endif
218
219#endif /* CONFIG_NUMA */
220
221#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 184#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
222DECLARE_PER_CPU(int, numa_node); 185DECLARE_PER_CPU(int, numa_node);
223 186
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5212ae294f6..bd314d7cd9f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data)
692} 692}
693#endif 693#endif
694 694
695void update_cpu_load(struct rq *this_rq);
696
697static void set_load_weight(struct task_struct *p) 695static void set_load_weight(struct task_struct *p)
698{ 696{
699 int prio = p->static_prio - MAX_RT_PRIO; 697 int prio = p->static_prio - MAX_RT_PRIO;
@@ -2486,22 +2484,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2486 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2484 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2487 * every tick. We fix it up based on jiffies. 2485 * every tick. We fix it up based on jiffies.
2488 */ 2486 */
2489void update_cpu_load(struct rq *this_rq) 2487static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2488 unsigned long pending_updates)
2490{ 2489{
2491 unsigned long this_load = this_rq->load.weight;
2492 unsigned long curr_jiffies = jiffies;
2493 unsigned long pending_updates;
2494 int i, scale; 2490 int i, scale;
2495 2491
2496 this_rq->nr_load_updates++; 2492 this_rq->nr_load_updates++;
2497 2493
2498 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2499 if (curr_jiffies == this_rq->last_load_update_tick)
2500 return;
2501
2502 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2503 this_rq->last_load_update_tick = curr_jiffies;
2504
2505 /* Update our load: */ 2494 /* Update our load: */
2506 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2495 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2507 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2496 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2515,45 @@ void update_cpu_load(struct rq *this_rq)
2526 sched_avg_update(this_rq); 2515 sched_avg_update(this_rq);
2527} 2516}
2528 2517
2518/*
2519 * Called from nohz_idle_balance() to update the load ratings before doing the
2520 * idle balance.
2521 */
2522void update_idle_cpu_load(struct rq *this_rq)
2523{
2524 unsigned long curr_jiffies = jiffies;
2525 unsigned long load = this_rq->load.weight;
2526 unsigned long pending_updates;
2527
2528 /*
2529 * Bloody broken means of dealing with nohz, but better than nothing..
2530 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
2531 * update and see 0 difference the one time and 2 the next, even though
2532 * we ticked at roughtly the same rate.
2533 *
2534 * Hence we only use this from nohz_idle_balance() and skip this
2535 * nonsense when called from the scheduler_tick() since that's
2536 * guaranteed a stable rate.
2537 */
2538 if (load || curr_jiffies == this_rq->last_load_update_tick)
2539 return;
2540
2541 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2542 this_rq->last_load_update_tick = curr_jiffies;
2543
2544 __update_cpu_load(this_rq, load, pending_updates);
2545}
2546
2547/*
2548 * Called from scheduler_tick()
2549 */
2529static void update_cpu_load_active(struct rq *this_rq) 2550static void update_cpu_load_active(struct rq *this_rq)
2530{ 2551{
2531 update_cpu_load(this_rq); 2552 /*
2553 * See the mess in update_idle_cpu_load().
2554 */
2555 this_rq->last_load_update_tick = jiffies;
2556 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2532 2557
2533 calc_load_account_active(this_rq); 2558 calc_load_account_active(this_rq);
2534} 2559}
@@ -5560,7 +5585,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5560 break; 5585 break;
5561 } 5586 }
5562 5587
5563 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5588 if (!(sd->flags & SD_OVERLAP) &&
5589 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5564 printk(KERN_CONT "\n"); 5590 printk(KERN_CONT "\n");
5565 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5591 printk(KERN_ERR "ERROR: repeated CPUs\n");
5566 break; 5592 break;
@@ -5898,92 +5924,6 @@ static int __init isolated_cpu_setup(char *str)
5898 5924
5899__setup("isolcpus=", isolated_cpu_setup); 5925__setup("isolcpus=", isolated_cpu_setup);
5900 5926
5901#ifdef CONFIG_NUMA
5902
5903/**
5904 * find_next_best_node - find the next node to include in a sched_domain
5905 * @node: node whose sched_domain we're building
5906 * @used_nodes: nodes already in the sched_domain
5907 *
5908 * Find the next node to include in a given scheduling domain. Simply
5909 * finds the closest node not already in the @used_nodes map.
5910 *
5911 * Should use nodemask_t.
5912 */
5913static int find_next_best_node(int node, nodemask_t *used_nodes)
5914{
5915 int i, n, val, min_val, best_node = -1;
5916
5917 min_val = INT_MAX;
5918
5919 for (i = 0; i < nr_node_ids; i++) {
5920 /* Start at @node */
5921 n = (node + i) % nr_node_ids;
5922
5923 if (!nr_cpus_node(n))
5924 continue;
5925
5926 /* Skip already used nodes */
5927 if (node_isset(n, *used_nodes))
5928 continue;
5929
5930 /* Simple min distance search */
5931 val = node_distance(node, n);
5932
5933 if (val < min_val) {
5934 min_val = val;
5935 best_node = n;
5936 }
5937 }
5938
5939 if (best_node != -1)
5940 node_set(best_node, *used_nodes);
5941 return best_node;
5942}
5943
5944/**
5945 * sched_domain_node_span - get a cpumask for a node's sched_domain
5946 * @node: node whose cpumask we're constructing
5947 * @span: resulting cpumask
5948 *
5949 * Given a node, construct a good cpumask for its sched_domain to span. It
5950 * should be one that prevents unnecessary balancing, but also spreads tasks
5951 * out optimally.
5952 */
5953static void sched_domain_node_span(int node, struct cpumask *span)
5954{
5955 nodemask_t used_nodes;
5956 int i;
5957
5958 cpumask_clear(span);
5959 nodes_clear(used_nodes);
5960
5961 cpumask_or(span, span, cpumask_of_node(node));
5962 node_set(node, used_nodes);
5963
5964 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5965 int next_node = find_next_best_node(node, &used_nodes);
5966 if (next_node < 0)
5967 break;
5968 cpumask_or(span, span, cpumask_of_node(next_node));
5969 }
5970}
5971
5972static const struct cpumask *cpu_node_mask(int cpu)
5973{
5974 lockdep_assert_held(&sched_domains_mutex);
5975
5976 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5977
5978 return sched_domains_tmpmask;
5979}
5980
5981static const struct cpumask *cpu_allnodes_mask(int cpu)
5982{
5983 return cpu_possible_mask;
5984}
5985#endif /* CONFIG_NUMA */
5986
5987static const struct cpumask *cpu_cpu_mask(int cpu) 5927static const struct cpumask *cpu_cpu_mask(int cpu)
5988{ 5928{
5989 return cpumask_of_node(cpu_to_node(cpu)); 5929 return cpumask_of_node(cpu_to_node(cpu));
@@ -6020,6 +5960,7 @@ struct sched_domain_topology_level {
6020 sched_domain_init_f init; 5960 sched_domain_init_f init;
6021 sched_domain_mask_f mask; 5961 sched_domain_mask_f mask;
6022 int flags; 5962 int flags;
5963 int numa_level;
6023 struct sd_data data; 5964 struct sd_data data;
6024}; 5965};
6025 5966
@@ -6211,10 +6152,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6211} 6152}
6212 6153
6213SD_INIT_FUNC(CPU) 6154SD_INIT_FUNC(CPU)
6214#ifdef CONFIG_NUMA
6215 SD_INIT_FUNC(ALLNODES)
6216 SD_INIT_FUNC(NODE)
6217#endif
6218#ifdef CONFIG_SCHED_SMT 6155#ifdef CONFIG_SCHED_SMT
6219 SD_INIT_FUNC(SIBLING) 6156 SD_INIT_FUNC(SIBLING)
6220#endif 6157#endif
@@ -6336,15 +6273,185 @@ static struct sched_domain_topology_level default_topology[] = {
6336 { sd_init_BOOK, cpu_book_mask, }, 6273 { sd_init_BOOK, cpu_book_mask, },
6337#endif 6274#endif
6338 { sd_init_CPU, cpu_cpu_mask, }, 6275 { sd_init_CPU, cpu_cpu_mask, },
6339#ifdef CONFIG_NUMA
6340 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6341 { sd_init_ALLNODES, cpu_allnodes_mask, },
6342#endif
6343 { NULL, }, 6276 { NULL, },
6344}; 6277};
6345 6278
6346static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6279static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6347 6280
6281#ifdef CONFIG_NUMA
6282
6283static int sched_domains_numa_levels;
6284static int sched_domains_numa_scale;
6285static int *sched_domains_numa_distance;
6286static struct cpumask ***sched_domains_numa_masks;
6287static int sched_domains_curr_level;
6288
6289static inline int sd_local_flags(int level)
6290{
6291 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
6292 return 0;
6293
6294 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6295}
6296
6297static struct sched_domain *
6298sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6299{
6300 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6301 int level = tl->numa_level;
6302 int sd_weight = cpumask_weight(
6303 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6304
6305 *sd = (struct sched_domain){
6306 .min_interval = sd_weight,
6307 .max_interval = 2*sd_weight,
6308 .busy_factor = 32,
6309 .imbalance_pct = 125,
6310 .cache_nice_tries = 2,
6311 .busy_idx = 3,
6312 .idle_idx = 2,
6313 .newidle_idx = 0,
6314 .wake_idx = 0,
6315 .forkexec_idx = 0,
6316
6317 .flags = 1*SD_LOAD_BALANCE
6318 | 1*SD_BALANCE_NEWIDLE
6319 | 0*SD_BALANCE_EXEC
6320 | 0*SD_BALANCE_FORK
6321 | 0*SD_BALANCE_WAKE
6322 | 0*SD_WAKE_AFFINE
6323 | 0*SD_PREFER_LOCAL
6324 | 0*SD_SHARE_CPUPOWER
6325 | 0*SD_POWERSAVINGS_BALANCE
6326 | 0*SD_SHARE_PKG_RESOURCES
6327 | 1*SD_SERIALIZE
6328 | 0*SD_PREFER_SIBLING
6329 | sd_local_flags(level)
6330 ,
6331 .last_balance = jiffies,
6332 .balance_interval = sd_weight,
6333 };
6334 SD_INIT_NAME(sd, NUMA);
6335 sd->private = &tl->data;
6336
6337 /*
6338 * Ugly hack to pass state to sd_numa_mask()...
6339 */
6340 sched_domains_curr_level = tl->numa_level;
6341
6342 return sd;
6343}
6344
6345static const struct cpumask *sd_numa_mask(int cpu)
6346{
6347 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6348}
6349
6350static void sched_init_numa(void)
6351{
6352 int next_distance, curr_distance = node_distance(0, 0);
6353 struct sched_domain_topology_level *tl;
6354 int level = 0;
6355 int i, j, k;
6356
6357 sched_domains_numa_scale = curr_distance;
6358 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6359 if (!sched_domains_numa_distance)
6360 return;
6361
6362 /*
6363 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6364 * unique distances in the node_distance() table.
6365 *
6366 * Assumes node_distance(0,j) includes all distances in
6367 * node_distance(i,j) in order to avoid cubic time.
6368 *
6369 * XXX: could be optimized to O(n log n) by using sort()
6370 */
6371 next_distance = curr_distance;
6372 for (i = 0; i < nr_node_ids; i++) {
6373 for (j = 0; j < nr_node_ids; j++) {
6374 int distance = node_distance(0, j);
6375 if (distance > curr_distance &&
6376 (distance < next_distance ||
6377 next_distance == curr_distance))
6378 next_distance = distance;
6379 }
6380 if (next_distance != curr_distance) {
6381 sched_domains_numa_distance[level++] = next_distance;
6382 sched_domains_numa_levels = level;
6383 curr_distance = next_distance;
6384 } else break;
6385 }
6386 /*
6387 * 'level' contains the number of unique distances, excluding the
6388 * identity distance node_distance(i,i).
6389 *
6390 * The sched_domains_nume_distance[] array includes the actual distance
6391 * numbers.
6392 */
6393
6394 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6395 if (!sched_domains_numa_masks)
6396 return;
6397
6398 /*
6399 * Now for each level, construct a mask per node which contains all
6400 * cpus of nodes that are that many hops away from us.
6401 */
6402 for (i = 0; i < level; i++) {
6403 sched_domains_numa_masks[i] =
6404 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6405 if (!sched_domains_numa_masks[i])
6406 return;
6407
6408 for (j = 0; j < nr_node_ids; j++) {
6409 struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
6410 if (!mask)
6411 return;
6412
6413 sched_domains_numa_masks[i][j] = mask;
6414
6415 for (k = 0; k < nr_node_ids; k++) {
6416 if (node_distance(j, k) > sched_domains_numa_distance[i])
6417 continue;
6418
6419 cpumask_or(mask, mask, cpumask_of_node(k));
6420 }
6421 }
6422 }
6423
6424 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6425 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6426 if (!tl)
6427 return;
6428
6429 /*
6430 * Copy the default topology bits..
6431 */
6432 for (i = 0; default_topology[i].init; i++)
6433 tl[i] = default_topology[i];
6434
6435 /*
6436 * .. and append 'j' levels of NUMA goodness.
6437 */
6438 for (j = 0; j < level; i++, j++) {
6439 tl[i] = (struct sched_domain_topology_level){
6440 .init = sd_numa_init,
6441 .mask = sd_numa_mask,
6442 .flags = SDTL_OVERLAP,
6443 .numa_level = j,
6444 };
6445 }
6446
6447 sched_domain_topology = tl;
6448}
6449#else
6450static inline void sched_init_numa(void)
6451{
6452}
6453#endif /* CONFIG_NUMA */
6454
6348static int __sdt_alloc(const struct cpumask *cpu_map) 6455static int __sdt_alloc(const struct cpumask *cpu_map)
6349{ 6456{
6350 struct sched_domain_topology_level *tl; 6457 struct sched_domain_topology_level *tl;
@@ -6840,6 +6947,8 @@ void __init sched_init_smp(void)
6840 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6947 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6841 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6948 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6842 6949
6950 sched_init_numa();
6951
6843 get_online_cpus(); 6952 get_online_cpus();
6844 mutex_lock(&sched_domains_mutex); 6953 mutex_lock(&sched_domains_mutex);
6845 init_sched_domains(cpu_active_mask); 6954 init_sched_domains(cpu_active_mask);
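
The core of the new sched_init_numa() above is the pass that extracts the unique node distances -- one scheduling level per distance -- before building a per-level cpumask for every node. The standalone sketch below reproduces just that deduplicating scan on a made-up 4-node distance table; the table values and output are illustrative only.

/* Toy version of the "unique NUMA distance" pass in sched_init_numa(). */
#include <stdio.h>

#define NR_NODES 4

/* Example SLIT-style table: 10 = local, larger = further away. */
static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int distances[NR_NODES];        /* one scheduling level per entry */
	int curr = node_distance[0][0]; /* start from the local distance  */
	int levels = 0;
	int i, j;

	/*
	 * Repeatedly pick the smallest distance strictly greater than the
	 * current one, scanning only row 0 -- the same assumption the patch
	 * makes, namely that node_distance(0, j) contains every distance.
	 */
	for (;;) {
		int next = curr;

		for (j = 0; j < NR_NODES; j++) {
			int d = node_distance[0][j];

			if (d > curr && (d < next || next == curr))
				next = d;
		}
		if (next == curr)
			break;          /* no larger distance left        */
		distances[levels++] = next;
		curr = next;
	}

	for (i = 0; i < levels; i++)
		printf("NUMA level %d: nodes within distance %d\n",
		       i, distances[i]);
	return 0;
}
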
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 09acaa15161d..6f79596e0ea9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 SPLIT_NS(spread0)); 202 SPLIT_NS(spread0));
203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
204 cfs_rq->nr_spread_over); 204 cfs_rq->nr_spread_over);
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 205 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 207#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu)
260 SEQ_printf(m, "\ncpu#%d\n", cpu); 260 SEQ_printf(m, "\ncpu#%d\n", cpu);
261#endif 261#endif
262 262
263#define P(x) \ 263#define P(x) \
264 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 264do { \
265 if (sizeof(rq->x) == 4) \
266 SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
267 else \
268 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
269} while (0)
270
265#define PN(x) \ 271#define PN(x) \
266 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 272 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
267 273
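
The reworked P() macro above picks its print format from sizeof the runqueue field, so 32-bit members such as the now 'unsigned int' nr_running are no longer pushed through a 64-bit conversion. A minimal userspace illustration of the same sizeof dispatch follows; plain printf() and the rq_sample struct stand in for SEQ_printf() and struct rq, and %lld replaces the kernel's %Ld.

/* Standalone demo of choosing a printf conversion via sizeof(). */
#include <stdio.h>

struct rq_sample {
	unsigned int  nr_running;       /* 4 bytes on common ABIs */
	long long     clock;            /* 8 bytes                */
};

#define P(rq, x)                                                        \
do {                                                                    \
	if (sizeof((rq)->x) == 4)                                       \
		printf("  .%-30s: %ld\n", #x, (long)(rq)->x);           \
	else                                                            \
		printf("  .%-30s: %lld\n", #x, (long long)(rq)->x);     \
} while (0)

int main(void)
{
	struct rq_sample rq = { .nr_running = 3, .clock = 123456789LL };

	P(&rq, nr_running);    /* takes the 4-byte branch */
	P(&rq, clock);         /* takes the 8-byte branch */
	return 0;
}
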
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9553640c1c3..0b42f4487329 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3082,7 +3082,7 @@ struct lb_env {
3082 struct rq *dst_rq; 3082 struct rq *dst_rq;
3083 3083
3084 enum cpu_idle_type idle; 3084 enum cpu_idle_type idle;
3085 long load_move; 3085 long imbalance;
3086 unsigned int flags; 3086 unsigned int flags;
3087 3087
3088 unsigned int loop; 3088 unsigned int loop;
@@ -3218,7 +3218,7 @@ static unsigned long task_h_load(struct task_struct *p);
3218static const unsigned int sched_nr_migrate_break = 32; 3218static const unsigned int sched_nr_migrate_break = 32;
3219 3219
3220/* 3220/*
3221 * move_tasks tries to move up to load_move weighted load from busiest to 3221 * move_tasks tries to move up to imbalance weighted load from busiest to
3222 * this_rq, as part of a balancing operation within domain "sd". 3222 * this_rq, as part of a balancing operation within domain "sd".
3223 * Returns 1 if successful and 0 otherwise. 3223 * Returns 1 if successful and 0 otherwise.
3224 * 3224 *
@@ -3231,7 +3231,7 @@ static int move_tasks(struct lb_env *env)
3231 unsigned long load; 3231 unsigned long load;
3232 int pulled = 0; 3232 int pulled = 0;
3233 3233
3234 if (env->load_move <= 0) 3234 if (env->imbalance <= 0)
3235 return 0; 3235 return 0;
3236 3236
3237 while (!list_empty(tasks)) { 3237 while (!list_empty(tasks)) {
@@ -3257,7 +3257,7 @@ static int move_tasks(struct lb_env *env)
3257 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) 3257 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3258 goto next; 3258 goto next;
3259 3259
3260 if ((load / 2) > env->load_move) 3260 if ((load / 2) > env->imbalance)
3261 goto next; 3261 goto next;
3262 3262
3263 if (!can_migrate_task(p, env)) 3263 if (!can_migrate_task(p, env))
@@ -3265,7 +3265,7 @@ static int move_tasks(struct lb_env *env)
3265 3265
3266 move_task(p, env); 3266 move_task(p, env);
3267 pulled++; 3267 pulled++;
3268 env->load_move -= load; 3268 env->imbalance -= load;
3269 3269
3270#ifdef CONFIG_PREEMPT 3270#ifdef CONFIG_PREEMPT
3271 /* 3271 /*
@@ -3281,7 +3281,7 @@ static int move_tasks(struct lb_env *env)
3281 * We only want to steal up to the prescribed amount of 3281 * We only want to steal up to the prescribed amount of
3282 * weighted load. 3282 * weighted load.
3283 */ 3283 */
3284 if (env->load_move <= 0) 3284 if (env->imbalance <= 0)
3285 break; 3285 break;
3286 3286
3287 continue; 3287 continue;
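
A large share of the fair.c changes replace long parameter lists (sd, this_cpu, idle, *imbalance, ...) with the struct lb_env pointer that move_tasks() already carries, so helpers read and update fields like env->imbalance directly. The toy program below sketches that "environment struct" pattern; struct balance_env and try_pull_one() are invented names, only the shape of the refactor mirrors the patch.

/* Sketch of the lb_env-style refactor: bundle balance state in one struct. */
#include <stdio.h>

/* Toy stand-in for struct lb_env; field names are illustrative only. */
struct balance_env {
	int  dst_cpu;           /* cpu we are pulling work towards        */
	long imbalance;         /* weighted load still left to move       */
};

/* Before: helpers took (sd, this_cpu, &imbalance, ...) individually.   */
/* After: they take the env and update env->imbalance in place.         */
static int try_pull_one(struct balance_env *env, long task_load)
{
	if (env->imbalance <= 0)
		return 0;                       /* nothing left to move   */
	if (task_load / 2 > env->imbalance)
		return 0;                       /* task is too heavy      */

	env->imbalance -= task_load;            /* account the pulled load */
	return 1;
}

int main(void)
{
	struct balance_env env = { .dst_cpu = 0, .imbalance = 300 };
	long loads[] = { 100, 250, 80, 120 };
	int i, pulled = 0;

	for (i = 0; i < 4; i++)
		pulled += try_pull_one(&env, loads[i]);

	printf("pulled %d tasks, remaining imbalance %ld\n",
	       pulled, env.imbalance);
	return 0;
}
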
@@ -3578,10 +3578,9 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3578 3578
3579/** 3579/**
3580 * check_power_save_busiest_group - see if there is potential for some power-savings balance 3580 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3581 * @env: load balance environment
3581 * @sds: Variable containing the statistics of the sched_domain 3582 * @sds: Variable containing the statistics of the sched_domain
3582 * under consideration. 3583 * under consideration.
3583 * @this_cpu: Cpu at which we're currently performing load-balancing.
3584 * @imbalance: Variable to store the imbalance.
3585 * 3584 *
3586 * Description: 3585 * Description:
3587 * Check if we have potential to perform some power-savings balance. 3586 * Check if we have potential to perform some power-savings balance.
@@ -3591,8 +3590,8 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3591 * Returns 1 if there is potential to perform power-savings balance. 3590 * Returns 1 if there is potential to perform power-savings balance.
3592 * Else returns 0. 3591 * Else returns 0.
3593 */ 3592 */
3594static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, 3593static inline
3595 int this_cpu, unsigned long *imbalance) 3594int check_power_save_busiest_group(struct lb_env *env, struct sd_lb_stats *sds)
3596{ 3595{
3597 if (!sds->power_savings_balance) 3596 if (!sds->power_savings_balance)
3598 return 0; 3597 return 0;
@@ -3601,7 +3600,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3601 sds->group_leader == sds->group_min) 3600 sds->group_leader == sds->group_min)
3602 return 0; 3601 return 0;
3603 3602
3604 *imbalance = sds->min_load_per_task; 3603 env->imbalance = sds->min_load_per_task;
3605 sds->busiest = sds->group_min; 3604 sds->busiest = sds->group_min;
3606 3605
3607 return 1; 3606 return 1;
@@ -3620,8 +3619,8 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3620 return; 3619 return;
3621} 3620}
3622 3621
3623static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, 3622static inline
3624 int this_cpu, unsigned long *imbalance) 3623int check_power_save_busiest_group(struct lb_env *env, struct sd_lb_stats *sds)
3625{ 3624{
3626 return 0; 3625 return 0;
3627} 3626}
@@ -3765,24 +3764,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3765 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3764 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3766 * @sd: The sched_domain whose statistics are to be updated. 3765 * @sd: The sched_domain whose statistics are to be updated.
3767 * @group: sched_group whose statistics are to be updated. 3766 * @group: sched_group whose statistics are to be updated.
3768 * @this_cpu: Cpu for which load balance is currently performed.
3769 * @idle: Idle status of this_cpu
3770 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3767 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3771 * @local_group: Does group contain this_cpu. 3768 * @local_group: Does group contain this_cpu.
3772 * @cpus: Set of cpus considered for load balancing. 3769 * @cpus: Set of cpus considered for load balancing.
3773 * @balance: Should we balance. 3770 * @balance: Should we balance.
3774 * @sgs: variable to hold the statistics for this group. 3771 * @sgs: variable to hold the statistics for this group.
3775 */ 3772 */
3776static inline void update_sg_lb_stats(struct sched_domain *sd, 3773static inline void update_sg_lb_stats(struct lb_env *env,
3777 struct sched_group *group, int this_cpu, 3774 struct sched_group *group, int load_idx,
3778 enum cpu_idle_type idle, int load_idx,
3779 int local_group, const struct cpumask *cpus, 3775 int local_group, const struct cpumask *cpus,
3780 int *balance, struct sg_lb_stats *sgs) 3776 int *balance, struct sg_lb_stats *sgs)
3781{ 3777{
3782 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; 3778 unsigned long nr_running, max_nr_running, min_nr_running;
3783 int i; 3779 unsigned long load, max_cpu_load, min_cpu_load;
3784 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3780 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3785 unsigned long avg_load_per_task = 0; 3781 unsigned long avg_load_per_task = 0;
3782 int i;
3786 3783
3787 if (local_group) 3784 if (local_group)
3788 balance_cpu = group_first_cpu(group); 3785 balance_cpu = group_first_cpu(group);
@@ -3791,10 +3788,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3791 max_cpu_load = 0; 3788 max_cpu_load = 0;
3792 min_cpu_load = ~0UL; 3789 min_cpu_load = ~0UL;
3793 max_nr_running = 0; 3790 max_nr_running = 0;
3791 min_nr_running = ~0UL;
3794 3792
3795 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3793 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3796 struct rq *rq = cpu_rq(i); 3794 struct rq *rq = cpu_rq(i);
3797 3795
3796 nr_running = rq->nr_running;
3797
3798 /* Bias balancing toward cpus of our domain */ 3798 /* Bias balancing toward cpus of our domain */
3799 if (local_group) { 3799 if (local_group) {
3800 if (idle_cpu(i) && !first_idle_cpu) { 3800 if (idle_cpu(i) && !first_idle_cpu) {
@@ -3805,16 +3805,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3805 load = target_load(i, load_idx); 3805 load = target_load(i, load_idx);
3806 } else { 3806 } else {
3807 load = source_load(i, load_idx); 3807 load = source_load(i, load_idx);
3808 if (load > max_cpu_load) { 3808 if (load > max_cpu_load)
3809 max_cpu_load = load; 3809 max_cpu_load = load;
3810 max_nr_running = rq->nr_running;
3811 }
3812 if (min_cpu_load > load) 3810 if (min_cpu_load > load)
3813 min_cpu_load = load; 3811 min_cpu_load = load;
3812
3813 if (nr_running > max_nr_running)
3814 max_nr_running = nr_running;
3815 if (min_nr_running > nr_running)
3816 min_nr_running = nr_running;
3814 } 3817 }
3815 3818
3816 sgs->group_load += load; 3819 sgs->group_load += load;
3817 sgs->sum_nr_running += rq->nr_running; 3820 sgs->sum_nr_running += nr_running;
3818 sgs->sum_weighted_load += weighted_cpuload(i); 3821 sgs->sum_weighted_load += weighted_cpuload(i);
3819 if (idle_cpu(i)) 3822 if (idle_cpu(i))
3820 sgs->idle_cpus++; 3823 sgs->idle_cpus++;
@@ -3827,14 +3830,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3827 * to do the newly idle load balance. 3830 * to do the newly idle load balance.
3828 */ 3831 */
3829 if (local_group) { 3832 if (local_group) {
3830 if (idle != CPU_NEWLY_IDLE) { 3833 if (env->idle != CPU_NEWLY_IDLE) {
3831 if (balance_cpu != this_cpu) { 3834 if (balance_cpu != env->dst_cpu) {
3832 *balance = 0; 3835 *balance = 0;
3833 return; 3836 return;
3834 } 3837 }
3835 update_group_power(sd, this_cpu); 3838 update_group_power(env->sd, env->dst_cpu);
3836 } else if (time_after_eq(jiffies, group->sgp->next_update)) 3839 } else if (time_after_eq(jiffies, group->sgp->next_update))
3837 update_group_power(sd, this_cpu); 3840 update_group_power(env->sd, env->dst_cpu);
3838 } 3841 }
3839 3842
3840 /* Adjust by relative CPU power of the group */ 3843 /* Adjust by relative CPU power of the group */
@@ -3852,13 +3855,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3852 if (sgs->sum_nr_running) 3855 if (sgs->sum_nr_running)
3853 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 3856 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3854 3857
3855 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 3858 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3859 (max_nr_running - min_nr_running) > 1)
3856 sgs->group_imb = 1; 3860 sgs->group_imb = 1;
3857 3861
3858 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, 3862 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3859 SCHED_POWER_SCALE); 3863 SCHED_POWER_SCALE);
3860 if (!sgs->group_capacity) 3864 if (!sgs->group_capacity)
3861 sgs->group_capacity = fix_small_capacity(sd, group); 3865 sgs->group_capacity = fix_small_capacity(env->sd, group);
3862 sgs->group_weight = group->group_weight; 3866 sgs->group_weight = group->group_weight;
3863 3867
3864 if (sgs->group_capacity > sgs->sum_nr_running) 3868 if (sgs->group_capacity > sgs->sum_nr_running)
@@ -3876,11 +3880,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3876 * Determine if @sg is a busier group than the previously selected 3880 * Determine if @sg is a busier group than the previously selected
3877 * busiest group. 3881 * busiest group.
3878 */ 3882 */
3879static bool update_sd_pick_busiest(struct sched_domain *sd, 3883static bool update_sd_pick_busiest(struct lb_env *env,
3880 struct sd_lb_stats *sds, 3884 struct sd_lb_stats *sds,
3881 struct sched_group *sg, 3885 struct sched_group *sg,
3882 struct sg_lb_stats *sgs, 3886 struct sg_lb_stats *sgs)
3883 int this_cpu)
3884{ 3887{
3885 if (sgs->avg_load <= sds->max_load) 3888 if (sgs->avg_load <= sds->max_load)
3886 return false; 3889 return false;
@@ -3896,8 +3899,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3896 * numbered CPUs in the group, therefore mark all groups 3899 * numbered CPUs in the group, therefore mark all groups
3897 * higher than ourself as busy. 3900 * higher than ourself as busy.
3898 */ 3901 */
3899 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 3902 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3900 this_cpu < group_first_cpu(sg)) { 3903 env->dst_cpu < group_first_cpu(sg)) {
3901 if (!sds->busiest) 3904 if (!sds->busiest)
3902 return true; 3905 return true;
3903 3906
@@ -3917,28 +3920,28 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3917 * @balance: Should we balance. 3920 * @balance: Should we balance.
3918 * @sds: variable to hold the statistics for this sched_domain. 3921 * @sds: variable to hold the statistics for this sched_domain.
3919 */ 3922 */
3920static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 3923static inline void update_sd_lb_stats(struct lb_env *env,
3921 enum cpu_idle_type idle, const struct cpumask *cpus, 3924 const struct cpumask *cpus,
3922 int *balance, struct sd_lb_stats *sds) 3925 int *balance, struct sd_lb_stats *sds)
3923{ 3926{
3924 struct sched_domain *child = sd->child; 3927 struct sched_domain *child = env->sd->child;
3925 struct sched_group *sg = sd->groups; 3928 struct sched_group *sg = env->sd->groups;
3926 struct sg_lb_stats sgs; 3929 struct sg_lb_stats sgs;
3927 int load_idx, prefer_sibling = 0; 3930 int load_idx, prefer_sibling = 0;
3928 3931
3929 if (child && child->flags & SD_PREFER_SIBLING) 3932 if (child && child->flags & SD_PREFER_SIBLING)
3930 prefer_sibling = 1; 3933 prefer_sibling = 1;
3931 3934
3932 init_sd_power_savings_stats(sd, sds, idle); 3935 init_sd_power_savings_stats(env->sd, sds, env->idle);
3933 load_idx = get_sd_load_idx(sd, idle); 3936 load_idx = get_sd_load_idx(env->sd, env->idle);
3934 3937
3935 do { 3938 do {
3936 int local_group; 3939 int local_group;
3937 3940
3938 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 3941 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3939 memset(&sgs, 0, sizeof(sgs)); 3942 memset(&sgs, 0, sizeof(sgs));
3940 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, 3943 update_sg_lb_stats(env, sg, load_idx, local_group,
3941 local_group, cpus, balance, &sgs); 3944 cpus, balance, &sgs);
3942 3945
3943 if (local_group && !(*balance)) 3946 if (local_group && !(*balance))
3944 return; 3947 return;
@@ -3966,7 +3969,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3966 sds->this_load_per_task = sgs.sum_weighted_load; 3969 sds->this_load_per_task = sgs.sum_weighted_load;
3967 sds->this_has_capacity = sgs.group_has_capacity; 3970 sds->this_has_capacity = sgs.group_has_capacity;
3968 sds->this_idle_cpus = sgs.idle_cpus; 3971 sds->this_idle_cpus = sgs.idle_cpus;
3969 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 3972 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3970 sds->max_load = sgs.avg_load; 3973 sds->max_load = sgs.avg_load;
3971 sds->busiest = sg; 3974 sds->busiest = sg;
3972 sds->busiest_nr_running = sgs.sum_nr_running; 3975 sds->busiest_nr_running = sgs.sum_nr_running;
@@ -3980,7 +3983,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3980 3983
3981 update_sd_power_savings_stats(sg, sds, local_group, &sgs); 3984 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
3982 sg = sg->next; 3985 sg = sg->next;
3983 } while (sg != sd->groups); 3986 } while (sg != env->sd->groups);
3984} 3987}
3985 3988
3986/** 3989/**
@@ -4008,24 +4011,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
4008 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 4011 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4009 * @imbalance: returns amount of imbalanced due to packing. 4012 * @imbalance: returns amount of imbalanced due to packing.
4010 */ 4013 */
4011static int check_asym_packing(struct sched_domain *sd, 4014static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4012 struct sd_lb_stats *sds,
4013 int this_cpu, unsigned long *imbalance)
4014{ 4015{
4015 int busiest_cpu; 4016 int busiest_cpu;
4016 4017
4017 if (!(sd->flags & SD_ASYM_PACKING)) 4018 if (!(env->sd->flags & SD_ASYM_PACKING))
4018 return 0; 4019 return 0;
4019 4020
4020 if (!sds->busiest) 4021 if (!sds->busiest)
4021 return 0; 4022 return 0;
4022 4023
4023 busiest_cpu = group_first_cpu(sds->busiest); 4024 busiest_cpu = group_first_cpu(sds->busiest);
4024 if (this_cpu > busiest_cpu) 4025 if (env->dst_cpu > busiest_cpu)
4025 return 0; 4026 return 0;
4026 4027
4027 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, 4028 env->imbalance = DIV_ROUND_CLOSEST(
4028 SCHED_POWER_SCALE); 4029 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
4030
4029 return 1; 4031 return 1;
4030} 4032}
4031 4033
@@ -4037,8 +4039,8 @@ static int check_asym_packing(struct sched_domain *sd,
4037 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 4039 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4038 * @imbalance: Variable to store the imbalance. 4040 * @imbalance: Variable to store the imbalance.
4039 */ 4041 */
4040static inline void fix_small_imbalance(struct sd_lb_stats *sds, 4042static inline
4041 int this_cpu, unsigned long *imbalance) 4043void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4042{ 4044{
4043 unsigned long tmp, pwr_now = 0, pwr_move = 0; 4045 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4044 unsigned int imbn = 2; 4046 unsigned int imbn = 2;
@@ -4049,9 +4051,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4049 if (sds->busiest_load_per_task > 4051 if (sds->busiest_load_per_task >
4050 sds->this_load_per_task) 4052 sds->this_load_per_task)
4051 imbn = 1; 4053 imbn = 1;
4052 } else 4054 } else {
4053 sds->this_load_per_task = 4055 sds->this_load_per_task =
4054 cpu_avg_load_per_task(this_cpu); 4056 cpu_avg_load_per_task(env->dst_cpu);
4057 }
4055 4058
4056 scaled_busy_load_per_task = sds->busiest_load_per_task 4059 scaled_busy_load_per_task = sds->busiest_load_per_task
4057 * SCHED_POWER_SCALE; 4060 * SCHED_POWER_SCALE;
@@ -4059,7 +4062,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4059 4062
4060 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 4063 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4061 (scaled_busy_load_per_task * imbn)) { 4064 (scaled_busy_load_per_task * imbn)) {
4062 *imbalance = sds->busiest_load_per_task; 4065 env->imbalance = sds->busiest_load_per_task;
4063 return; 4066 return;
4064 } 4067 }
4065 4068
@@ -4096,18 +4099,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4096 4099
4097 /* Move if we gain throughput */ 4100 /* Move if we gain throughput */
4098 if (pwr_move > pwr_now) 4101 if (pwr_move > pwr_now)
4099 *imbalance = sds->busiest_load_per_task; 4102 env->imbalance = sds->busiest_load_per_task;
4100} 4103}
4101 4104
4102/** 4105/**
4103 * calculate_imbalance - Calculate the amount of imbalance present within the 4106 * calculate_imbalance - Calculate the amount of imbalance present within the
4104 * groups of a given sched_domain during load balance. 4107 * groups of a given sched_domain during load balance.
4108 * @env: load balance environment
4105 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 4109 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4106 * @this_cpu: Cpu for which currently load balance is being performed.
4107 * @imbalance: The variable to store the imbalance.
4108 */ 4110 */
4109static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, 4111static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4110 unsigned long *imbalance)
4111{ 4112{
4112 unsigned long max_pull, load_above_capacity = ~0UL; 4113 unsigned long max_pull, load_above_capacity = ~0UL;
4113 4114
@@ -4123,8 +4124,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4123 * its cpu_power, while calculating max_load..) 4124 * its cpu_power, while calculating max_load..)
4124 */ 4125 */
4125 if (sds->max_load < sds->avg_load) { 4126 if (sds->max_load < sds->avg_load) {
4126 *imbalance = 0; 4127 env->imbalance = 0;
4127 return fix_small_imbalance(sds, this_cpu, imbalance); 4128 return fix_small_imbalance(env, sds);
4128 } 4129 }
4129 4130
4130 if (!sds->group_imb) { 4131 if (!sds->group_imb) {
@@ -4152,7 +4153,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4152 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 4153 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4153 4154
4154 /* How much load to actually move to equalise the imbalance */ 4155 /* How much load to actually move to equalise the imbalance */
4155 *imbalance = min(max_pull * sds->busiest->sgp->power, 4156 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4156 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 4157 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4157 / SCHED_POWER_SCALE; 4158 / SCHED_POWER_SCALE;
4158 4159
@@ -4162,8 +4163,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4162 * a think about bumping its value to force at least one task to be 4163 * a think about bumping its value to force at least one task to be
4163 * moved 4164 * moved
4164 */ 4165 */
4165 if (*imbalance < sds->busiest_load_per_task) 4166 if (env->imbalance < sds->busiest_load_per_task)
4166 return fix_small_imbalance(sds, this_cpu, imbalance); 4167 return fix_small_imbalance(env, sds);
4167 4168
4168} 4169}
4169 4170
@@ -4194,9 +4195,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4194 * put to idle by rebalancing its tasks onto our group. 4195 * put to idle by rebalancing its tasks onto our group.
4195 */ 4196 */
4196static struct sched_group * 4197static struct sched_group *
4197find_busiest_group(struct sched_domain *sd, int this_cpu, 4198find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4198 unsigned long *imbalance, enum cpu_idle_type idle,
4199 const struct cpumask *cpus, int *balance)
4200{ 4199{
4201 struct sd_lb_stats sds; 4200 struct sd_lb_stats sds;
4202 4201
@@ -4206,7 +4205,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4206 * Compute the various statistics relevant for load balancing at 4205 * Compute the various statistics relevant for load balancing at
4207 * this level. 4206 * this level.
4208 */ 4207 */
4209 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); 4208 update_sd_lb_stats(env, cpus, balance, &sds);
4210 4209
4211 /* 4210 /*
4212 * this_cpu is not the appropriate cpu to perform load balancing at 4211 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4215,8 +4214,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4215 if (!(*balance)) 4214 if (!(*balance))
4216 goto ret; 4215 goto ret;
4217 4216
4218 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && 4217 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4219 check_asym_packing(sd, &sds, this_cpu, imbalance)) 4218 check_asym_packing(env, &sds))
4220 return sds.busiest; 4219 return sds.busiest;
4221 4220
4222 /* There is no busy sibling group to pull tasks from */ 4221 /* There is no busy sibling group to pull tasks from */
@@ -4234,7 +4233,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4234 goto force_balance; 4233 goto force_balance;
4235 4234
4236 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 4235 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4237 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 4236 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4238 !sds.busiest_has_capacity) 4237 !sds.busiest_has_capacity)
4239 goto force_balance; 4238 goto force_balance;
4240 4239
@@ -4252,7 +4251,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4252 if (sds.this_load >= sds.avg_load) 4251 if (sds.this_load >= sds.avg_load)
4253 goto out_balanced; 4252 goto out_balanced;
4254 4253
4255 if (idle == CPU_IDLE) { 4254 if (env->idle == CPU_IDLE) {
4256 /* 4255 /*
4257 * This cpu is idle. If the busiest group load doesn't 4256 * This cpu is idle. If the busiest group load doesn't
4258 * have more tasks than the number of available cpus and 4257 * have more tasks than the number of available cpus and
@@ -4267,13 +4266,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4267 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 4266 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4268 * imbalance_pct to be conservative. 4267 * imbalance_pct to be conservative.
4269 */ 4268 */
4270 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 4269 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4271 goto out_balanced; 4270 goto out_balanced;
4272 } 4271 }
4273 4272
4274force_balance: 4273force_balance:
4275 /* Looks like there is an imbalance. Compute it */ 4274 /* Looks like there is an imbalance. Compute it */
4276 calculate_imbalance(&sds, this_cpu, imbalance); 4275 calculate_imbalance(env, &sds);
4277 return sds.busiest; 4276 return sds.busiest;
4278 4277
4279out_balanced: 4278out_balanced:
@@ -4281,20 +4280,19 @@ out_balanced:
4281 * There is no obvious imbalance. But check if we can do some balancing 4280 * There is no obvious imbalance. But check if we can do some balancing
4282 * to save power. 4281 * to save power.
4283 */ 4282 */
4284 if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) 4283 if (check_power_save_busiest_group(env, &sds))
4285 return sds.busiest; 4284 return sds.busiest;
4286ret: 4285ret:
4287 *imbalance = 0; 4286 env->imbalance = 0;
4288 return NULL; 4287 return NULL;
4289} 4288}
4290 4289
4291/* 4290/*
4292 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4291 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4293 */ 4292 */
4294static struct rq * 4293static struct rq *find_busiest_queue(struct lb_env *env,
4295find_busiest_queue(struct sched_domain *sd, struct sched_group *group, 4294 struct sched_group *group,
4296 enum cpu_idle_type idle, unsigned long imbalance, 4295 const struct cpumask *cpus)
4297 const struct cpumask *cpus)
4298{ 4296{
4299 struct rq *busiest = NULL, *rq; 4297 struct rq *busiest = NULL, *rq;
4300 unsigned long max_load = 0; 4298 unsigned long max_load = 0;
@@ -4307,7 +4305,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4307 unsigned long wl; 4305 unsigned long wl;
4308 4306
4309 if (!capacity) 4307 if (!capacity)
4310 capacity = fix_small_capacity(sd, group); 4308 capacity = fix_small_capacity(env->sd, group);
4311 4309
4312 if (!cpumask_test_cpu(i, cpus)) 4310 if (!cpumask_test_cpu(i, cpus))
4313 continue; 4311 continue;
@@ -4319,7 +4317,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4319 * When comparing with imbalance, use weighted_cpuload() 4317 * When comparing with imbalance, use weighted_cpuload()
4320 * which is not scaled with the cpu power. 4318 * which is not scaled with the cpu power.
4321 */ 4319 */
4322 if (capacity && rq->nr_running == 1 && wl > imbalance) 4320 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4323 continue; 4321 continue;
4324 4322
4325 /* 4323 /*
@@ -4348,17 +4346,18 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4348/* Working cpumask for load_balance and load_balance_newidle. */ 4346/* Working cpumask for load_balance and load_balance_newidle. */
4349DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4347DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4350 4348
4351static int need_active_balance(struct sched_domain *sd, int idle, 4349static int need_active_balance(struct lb_env *env)
4352 int busiest_cpu, int this_cpu)
4353{ 4350{
4354 if (idle == CPU_NEWLY_IDLE) { 4351 struct sched_domain *sd = env->sd;
4352
4353 if (env->idle == CPU_NEWLY_IDLE) {
4355 4354
4356 /* 4355 /*
4357 * ASYM_PACKING needs to force migrate tasks from busy but 4356 * ASYM_PACKING needs to force migrate tasks from busy but
4358 * higher numbered CPUs in order to pack all tasks in the 4357 * higher numbered CPUs in order to pack all tasks in the
4359 * lowest numbered CPUs. 4358 * lowest numbered CPUs.
4360 */ 4359 */
4361 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) 4360 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4362 return 1; 4361 return 1;
4363 4362
4364 /* 4363 /*
@@ -4399,7 +4398,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4399{ 4398{
4400 int ld_moved, active_balance = 0; 4399 int ld_moved, active_balance = 0;
4401 struct sched_group *group; 4400 struct sched_group *group;
4402 unsigned long imbalance;
4403 struct rq *busiest; 4401 struct rq *busiest;
4404 unsigned long flags; 4402 unsigned long flags;
4405 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4403 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
@@ -4417,8 +4415,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4417 schedstat_inc(sd, lb_count[idle]); 4415 schedstat_inc(sd, lb_count[idle]);
4418 4416
4419redo: 4417redo:
4420 group = find_busiest_group(sd, this_cpu, &imbalance, idle, 4418 group = find_busiest_group(&env, cpus, balance);
4421 cpus, balance);
4422 4419
4423 if (*balance == 0) 4420 if (*balance == 0)
4424 goto out_balanced; 4421 goto out_balanced;
@@ -4428,7 +4425,7 @@ redo:
4428 goto out_balanced; 4425 goto out_balanced;
4429 } 4426 }
4430 4427
4431 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); 4428 busiest = find_busiest_queue(&env, group, cpus);
4432 if (!busiest) { 4429 if (!busiest) {
4433 schedstat_inc(sd, lb_nobusyq[idle]); 4430 schedstat_inc(sd, lb_nobusyq[idle]);
4434 goto out_balanced; 4431 goto out_balanced;
@@ -4436,7 +4433,7 @@ redo:
4436 4433
4437 BUG_ON(busiest == this_rq); 4434 BUG_ON(busiest == this_rq);
4438 4435
4439 schedstat_add(sd, lb_imbalance[idle], imbalance); 4436 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4440 4437
4441 ld_moved = 0; 4438 ld_moved = 0;
4442 if (busiest->nr_running > 1) { 4439 if (busiest->nr_running > 1) {
@@ -4447,10 +4444,9 @@ redo:
4447 * correctly treated as an imbalance. 4444 * correctly treated as an imbalance.
4448 */ 4445 */
4449 env.flags |= LBF_ALL_PINNED; 4446 env.flags |= LBF_ALL_PINNED;
4450 env.load_move = imbalance; 4447 env.src_cpu = busiest->cpu;
4451 env.src_cpu = busiest->cpu; 4448 env.src_rq = busiest;
4452 env.src_rq = busiest; 4449 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4453 env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
4454 4450
4455more_balance: 4451more_balance:
4456 local_irq_save(flags); 4452 local_irq_save(flags);
@@ -4492,7 +4488,7 @@ more_balance:
4492 if (idle != CPU_NEWLY_IDLE) 4488 if (idle != CPU_NEWLY_IDLE)
4493 sd->nr_balance_failed++; 4489 sd->nr_balance_failed++;
4494 4490
4495 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { 4491 if (need_active_balance(&env)) {
4496 raw_spin_lock_irqsave(&busiest->lock, flags); 4492 raw_spin_lock_irqsave(&busiest->lock, flags);
4497 4493
4498 /* don't kick the active_load_balance_cpu_stop, 4494 /* don't kick the active_load_balance_cpu_stop,
@@ -4519,10 +4515,11 @@ more_balance:
4519 } 4515 }
4520 raw_spin_unlock_irqrestore(&busiest->lock, flags); 4516 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4521 4517
4522 if (active_balance) 4518 if (active_balance) {
4523 stop_one_cpu_nowait(cpu_of(busiest), 4519 stop_one_cpu_nowait(cpu_of(busiest),
4524 active_load_balance_cpu_stop, busiest, 4520 active_load_balance_cpu_stop, busiest,
4525 &busiest->active_balance_work); 4521 &busiest->active_balance_work);
4522 }
4526 4523
4527 /* 4524 /*
4528 * We've kicked active balancing, reset the failure 4525 * We've kicked active balancing, reset the failure
@@ -5023,7 +5020,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5023 5020
5024 raw_spin_lock_irq(&this_rq->lock); 5021 raw_spin_lock_irq(&this_rq->lock);
5025 update_rq_clock(this_rq); 5022 update_rq_clock(this_rq);
5026 update_cpu_load(this_rq); 5023 update_idle_cpu_load(this_rq);
5027 raw_spin_unlock_irq(&this_rq->lock); 5024 raw_spin_unlock_irq(&this_rq->lock);
5028 5025
5029 rebalance_domains(balance_cpu, CPU_IDLE); 5026 rebalance_domains(balance_cpu, CPU_IDLE);
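
The fair.c hunks above collapse the long argument lists of fix_small_imbalance(), calculate_imbalance(), find_busiest_group(), find_busiest_queue() and need_active_balance() into a single struct lb_env pointer, so load_balance() no longer threads sd, this_cpu, idle and a separate *imbalance out-parameter through every call. Below is a minimal, compilable sketch of that parameter-consolidation pattern; the field names mirror the env-> accesses visible in the diff (sd, idle, dst_cpu, src_cpu, src_rq, imbalance, flags, loop_max), but the types and the helper calculate_imbalance_sketch() are userspace stand-ins for illustration, not the kernel's actual lb_env definition in kernel/sched/fair.c.

    /* Toy illustration of the lb_env pattern; stand-in types, not kernel code. */
    #include <stdio.h>

    struct sched_domain { int imbalance_pct; };        /* placeholder */
    struct rq { unsigned int nr_running; };            /* placeholder */
    enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

    struct lb_env {
            struct sched_domain     *sd;            /* domain being balanced */
            int                     dst_cpu;        /* cpu pulling load (was this_cpu) */
            int                     src_cpu;        /* cpu of the busiest rq */
            struct rq               *src_rq;        /* busiest runqueue */
            enum cpu_idle_type      idle;           /* idle state of dst_cpu */
            unsigned long           imbalance;      /* was the *imbalance out-parameter */
            unsigned int            flags;          /* e.g. LBF_ALL_PINNED */
            unsigned int            loop_max;       /* migration scan limit */
    };

    /* One env pointer replaces the old (sds, this_cpu, *imbalance) argument list. */
    static void calculate_imbalance_sketch(struct lb_env *env,
                                           unsigned long max_load,
                                           unsigned long avg_load)
    {
            env->imbalance = (max_load < avg_load) ? 0 : max_load - avg_load;
    }

    int main(void)
    {
            struct sched_domain sd = { .imbalance_pct = 125 };
            struct lb_env env = { .sd = &sd, .dst_cpu = 0, .idle = CPU_NEWLY_IDLE };

            calculate_imbalance_sketch(&env, 300, 200);
            printf("imbalance = %lu\n", env.imbalance);
            return 0;
    }

Bundling the state this way is what lets find_busiest_group() drop from five parameters to three in the hunk at -4194 above, and lets need_active_balance() read src_cpu/dst_cpu from the environment instead of extra arguments.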
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c957f289..b44d604b35d1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -4,7 +4,7 @@
4 * idle-task scheduling class. 4 * idle-task scheduling class.
5 * 5 *
6 * (NOTE: these are not related to SCHED_IDLE tasks which are 6 * (NOTE: these are not related to SCHED_IDLE tasks which are
7 * handled in sched_fair.c) 7 * handled in sched/fair.c)
8 */ 8 */
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 44af55e6d5d0..c5565c3c515f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1803,44 +1803,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1803static void set_cpus_allowed_rt(struct task_struct *p, 1803static void set_cpus_allowed_rt(struct task_struct *p,
1804 const struct cpumask *new_mask) 1804 const struct cpumask *new_mask)
1805{ 1805{
1806 int weight = cpumask_weight(new_mask); 1806 struct rq *rq;
1807 int weight;
1807 1808
1808 BUG_ON(!rt_task(p)); 1809 BUG_ON(!rt_task(p));
1809 1810
1810 /* 1811 if (!p->on_rq)
1811 * Update the migration status of the RQ if we have an RT task 1812 return;
1812 * which is running AND changing its weight value.
1813 */
1814 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1815 struct rq *rq = task_rq(p);
1816
1817 if (!task_current(rq, p)) {
1818 /*
1819 * Make sure we dequeue this task from the pushable list
1820 * before going further. It will either remain off of
1821 * the list because we are no longer pushable, or it
1822 * will be requeued.
1823 */
1824 if (p->rt.nr_cpus_allowed > 1)
1825 dequeue_pushable_task(rq, p);
1826 1813
1827 /* 1814 weight = cpumask_weight(new_mask);
1828 * Requeue if our weight is changing and still > 1
1829 */
1830 if (weight > 1)
1831 enqueue_pushable_task(rq, p);
1832 1815
1833 } 1816 /*
1817 * Only update if the process changes its state from whether it
1818 * can migrate or not.
1819 */
1820 if ((p->rt.nr_cpus_allowed > 1) == (weight > 1))
1821 return;
1834 1822
1835 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1823 rq = task_rq(p);
1836 rq->rt.rt_nr_migratory++;
1837 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1838 BUG_ON(!rq->rt.rt_nr_migratory);
1839 rq->rt.rt_nr_migratory--;
1840 }
1841 1824
1842 update_rt_migration(&rq->rt); 1825 /*
1826 * The process used to be able to migrate OR it can now migrate
1827 */
1828 if (weight <= 1) {
1829 if (!task_current(rq, p))
1830 dequeue_pushable_task(rq, p);
1831 BUG_ON(!rq->rt.rt_nr_migratory);
1832 rq->rt.rt_nr_migratory--;
1833 } else {
1834 if (!task_current(rq, p))
1835 enqueue_pushable_task(rq, p);
1836 rq->rt.rt_nr_migratory++;
1843 } 1837 }
1838
1839 update_rt_migration(&rq->rt);
1844} 1840}
1845 1841
1846/* Assumes rq->lock is held */ 1842/* Assumes rq->lock is held */
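
Rather than accounting every weight change, the rewritten set_cpus_allowed_rt() above bails out early unless the task actually crosses the migratable boundary, i.e. unless (p->rt.nr_cpus_allowed > 1) and (weight > 1) disagree, and only then adjusts rt_nr_migratory and the pushable list. A standalone sketch of that transition test is below; struct toy_rt_rq and change_cpus_allowed() are simplified stand-ins for the rq->rt state, not the kernel's types.

    /* Toy model of the migratability transition; not the kernel's rt_rq. */
    #include <assert.h>
    #include <stdio.h>

    struct toy_rt_rq { unsigned int rt_nr_migratory; };

    static void change_cpus_allowed(struct toy_rt_rq *rt,
                                    unsigned int *nr_cpus_allowed,
                                    unsigned int weight)
    {
            /* Nothing to account if the task stays (non-)migratable. */
            if ((*nr_cpus_allowed > 1) == (weight > 1)) {
                    *nr_cpus_allowed = weight;
                    return;
            }

            if (weight <= 1) {
                    assert(rt->rt_nr_migratory > 0);  /* mirrors the BUG_ON() */
                    rt->rt_nr_migratory--;            /* ... and dequeue_pushable_task() */
            } else {
                    rt->rt_nr_migratory++;            /* ... and enqueue_pushable_task() */
            }
            *nr_cpus_allowed = weight;
    }

    int main(void)
    {
            struct toy_rt_rq rt = { .rt_nr_migratory = 0 };
            unsigned int allowed = 1;

            change_cpus_allowed(&rt, &allowed, 4);  /* 1 -> 4: becomes migratable */
            change_cpus_allowed(&rt, &allowed, 2);  /* 4 -> 2: boundary unchanged */
            change_cpus_allowed(&rt, &allowed, 1);  /* 2 -> 1: no longer migratable */
            printf("rt_nr_migratory = %u\n", rt.rt_nr_migratory);  /* prints 0 */
            return 0;
    }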
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba4d52e..ba9dccfd24ce 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -201,7 +201,7 @@ struct cfs_bandwidth { };
201/* CFS-related fields in a runqueue */ 201/* CFS-related fields in a runqueue */
202struct cfs_rq { 202struct cfs_rq {
203 struct load_weight load; 203 struct load_weight load;
204 unsigned long nr_running, h_nr_running; 204 unsigned int nr_running, h_nr_running;
205 205
206 u64 exec_clock; 206 u64 exec_clock;
207 u64 min_vruntime; 207 u64 min_vruntime;
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void)
279/* Real-Time classes' related field in a runqueue: */ 279/* Real-Time classes' related field in a runqueue: */
280struct rt_rq { 280struct rt_rq {
281 struct rt_prio_array active; 281 struct rt_prio_array active;
282 unsigned long rt_nr_running; 282 unsigned int rt_nr_running;
283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
284 struct { 284 struct {
285 int curr; /* highest queued rt task prio */ 285 int curr; /* highest queued rt task prio */
@@ -353,7 +353,7 @@ struct rq {
353 * nr_running and cpu_load should be in the same cacheline because 353 * nr_running and cpu_load should be in the same cacheline because
354 * remote CPUs use both these fields when doing load calculation. 354 * remote CPUs use both these fields when doing load calculation.
355 */ 355 */
356 unsigned long nr_running; 356 unsigned int nr_running;
357 #define CPU_LOAD_IDX_MAX 5 357 #define CPU_LOAD_IDX_MAX 5
358 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 358 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
359 unsigned long last_load_update_tick; 359 unsigned long last_load_update_tick;
@@ -876,7 +876,7 @@ extern void resched_cpu(int cpu);
876extern struct rt_bandwidth def_rt_bandwidth; 876extern struct rt_bandwidth def_rt_bandwidth;
877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
878 878
879extern void update_cpu_load(struct rq *this_rq); 879extern void update_idle_cpu_load(struct rq *this_rq);
880 880
881#ifdef CONFIG_CGROUP_CPUACCT 881#ifdef CONFIG_CGROUP_CPUACCT
882#include <linux/cgroup.h> 882#include <linux/cgroup.h>
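
The sched.h hunks shrink the run-queue counters (nr_running, h_nr_running, rt_nr_running) from unsigned long to unsigned int and rename the update_cpu_load() declaration to update_idle_cpu_load() to match the fair.c change above. A compilable layout sketch of the counter change follows; the structs are toy models of the cfs_rq field pair, not the kernel's definitions, and assume a typical LP64 target.

    /* Toy layout comparison for the unsigned long -> unsigned int counters. */
    #include <stddef.h>
    #include <stdio.h>

    struct cfs_rq_long { unsigned long nr_running, h_nr_running; unsigned long load[5]; };
    struct cfs_rq_int  { unsigned int  nr_running, h_nr_running; unsigned long load[5]; };

    int main(void)
    {
            printf("unsigned long counters: %zu bytes, load[] at offset %zu\n",
                   sizeof(struct cfs_rq_long), offsetof(struct cfs_rq_long, load));
            printf("unsigned int  counters: %zu bytes, load[] at offset %zu\n",
                   sizeof(struct cfs_rq_int), offsetof(struct cfs_rq_int, load));
            return 0;
    }

On LP64 this prints 56 versus 48 bytes, the kind of tighter packing the "same cacheline" comment in the struct rq hunk is after; 32-bit counts are ample, since the number of runnable tasks on a queue stays far below 2^32.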