author      Linus Torvalds <torvalds@linux-foundation.org>    2012-05-22 21:27:32 -0400
committer   Linus Torvalds <torvalds@linux-foundation.org>    2012-05-22 21:27:32 -0400
commit      d79ee93de909dfb252279b9a95978bbda9a814a9 (patch)
tree        bfccca60fd36259ff4bcc5e78a2c272fbd680065
parent      2ff2b289a695807e291e1ed9f639d8a3ba5f4254 (diff)
parent      1c2927f18576d65631d8e0ddd19e1d023183222e (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The biggest change is the cleanup/simplification of the load-balancer:
  instead of the current practice of architectures twiddling scheduler
  internal data structures and providing the scheduler domains in
  colorfully inconsistent ways, we now have generic scheduler code in
  kernel/sched/core.c:sched_init_numa() that looks at the architecture's
  node_distance() parameters and (while not fully trusting it) deducts a
  NUMA topology from it.

  This inevitably changes balancing behavior - hopefully for the better.

  There are various smaller optimizations, cleanups and fixlets as well"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Taint kernel with TAINT_WARN after sleep-in-atomic bug
  sched: Remove stale power aware scheduling remnants and dysfunctional knobs
  sched/debug: Fix printing large integers on 32-bit platforms
  sched/fair: Improve the ->group_imb logic
  sched/nohz: Fix rq->cpu_load[] calculations
  sched/numa: Don't scale the imbalance
  sched/fair: Revert sched-domain iteration breakage
  sched/x86: Rewrite set_cpu_sibling_map()
  sched/numa: Fix the new NUMA topology bits
  sched/numa: Rewrite the CONFIG_NUMA sched domain support
  sched/fair: Propagate 'struct lb_env' usage into find_busiest_group
  sched/fair: Add some serialization to the sched_domain load-balance walk
  sched/fair: Let minimally loaded cpu balance the group
  sched: Change rq->nr_running to unsigned int
  x86/numa: Check for nonsensical topologies on real hw as well
  x86/numa: Hard partition cpu topology masks on node boundaries
  x86/numa: Allow specifying node_distance() for numa=fake
  x86/sched: Make mwait_usable() heed to "idle=" kernel parameters properly
  sched: Update documentation and comments
  sched_rt: Avoid unnecessary dequeue and enqueue of pushable tasks in set_cpus_allowed_rt()
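As context for the sched_init_numa() change in kernel/sched/core.c further down, the following is a minimal userspace sketch (not kernel code; the 4-node distance table is a made-up example) of the level-deduction idea: collect the distinct values node_distance() can return, excluding the local distance, and treat each value as one NUMA topology level.

#include <stdio.h>

/* Hypothetical 4-node distance table, standing in for node_distance(). */
static const int dist[4][4] = {
	{ 10, 20, 30, 30 },
	{ 20, 10, 30, 30 },
	{ 30, 30, 10, 20 },
	{ 30, 30, 20, 10 },
};
#define NR_NODES 4

static int node_distance(int a, int b)
{
	return dist[a][b];
}

int main(void)
{
	int levels[NR_NODES];
	int nr_levels = 0;
	int curr = node_distance(0, 0);	/* local distance, e.g. 10 */

	/*
	 * Repeatedly pick the smallest distance greater than the current
	 * one; each new value found is one more topology level.  This
	 * mirrors the deduplicating selection in sched_init_numa(),
	 * assuming the row node_distance(0, j) already contains every
	 * distance that occurs in the table.
	 */
	for (;;) {
		int next = curr;
		int j;

		for (j = 0; j < NR_NODES; j++) {
			int d = node_distance(0, j);

			if (d > curr && (d < next || next == curr))
				next = d;
		}
		if (next == curr)
			break;
		levels[nr_levels++] = next;
		curr = next;
	}

	for (int i = 0; i < nr_levels; i++)
		printf("NUMA level %d: distance <= %d\n", i, levels[i]);

	return 0;
}

With the example table this prints two levels (distances 20 and 30); the real code then builds one cpumask per node and per level, spanning all nodes within that distance, and appends one scheduler-domain level per entry.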
-rw-r--r--Documentation/ABI/testing/sysfs-devices-system-cpu25
-rw-r--r--Documentation/scheduler/sched-design-CFS.txt6
-rw-r--r--Documentation/scheduler/sched-domains.txt4
-rw-r--r--arch/ia64/include/asm/topology.h25
-rw-r--r--arch/mips/include/asm/mach-ip27/topology.h17
-rw-r--r--arch/powerpc/include/asm/topology.h36
-rw-r--r--arch/sh/include/asm/topology.h25
-rw-r--r--arch/sparc/include/asm/topology_64.h19
-rw-r--r--arch/tile/include/asm/topology.h26
-rw-r--r--arch/x86/include/asm/topology.h38
-rw-r--r--arch/x86/kernel/process.c8
-rw-r--r--arch/x86/kernel/smpboot.c108
-rw-r--r--arch/x86/mm/numa_emulation.c8
-rw-r--r--drivers/base/cpu.c4
-rw-r--r--include/linux/cpu.h2
-rw-r--r--include/linux/sched.h49
-rw-r--r--include/linux/topology.h42
-rw-r--r--kernel/sched/core.c420
-rw-r--r--kernel/sched/debug.c12
-rw-r--r--kernel/sched/fair.c462
-rw-r--r--kernel/sched/idle_task.c2
-rw-r--r--kernel/sched/rt.c56
-rw-r--r--kernel/sched/sched.h8
-rw-r--r--tools/power/cpupower/man/cpupower-set.19
-rw-r--r--tools/power/cpupower/utils/helpers/sysfs.c35
25 files changed, 441 insertions, 1005 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index e7be75b96e4b..5dab36448b44 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -9,31 +9,6 @@ Description:
9 9
10 /sys/devices/system/cpu/cpu#/ 10 /sys/devices/system/cpu/cpu#/
11 11
12What: /sys/devices/system/cpu/sched_mc_power_savings
13 /sys/devices/system/cpu/sched_smt_power_savings
14Date: June 2006
15Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
16Description: Discover and adjust the kernel's multi-core scheduler support.
17
18 Possible values are:
19
20 0 - No power saving load balance (default value)
21 1 - Fill one thread/core/package first for long running threads
22 2 - Also bias task wakeups to semi-idle cpu package for power
23 savings
24
25 sched_mc_power_savings is dependent upon SCHED_MC, which is
26 itself architecture dependent.
27
28 sched_smt_power_savings is dependent upon SCHED_SMT, which
29 is itself architecture dependent.
30
31 The two files are independent of each other. It is possible
32 that one file may be present without the other.
33
34 Introduced by git commit 5c45bf27.
35
36
37What: /sys/devices/system/cpu/kernel_max 12What: /sys/devices/system/cpu/kernel_max
38 /sys/devices/system/cpu/offline 13 /sys/devices/system/cpu/offline
39 /sys/devices/system/cpu/online 14 /sys/devices/system/cpu/online
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 91ecff07cede..d529e02d928d 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -130,7 +130,7 @@ CFS implements three scheduling policies:
130 idle timer scheduler in order to avoid to get into priority 130 idle timer scheduler in order to avoid to get into priority
131 inversion problems which would deadlock the machine. 131 inversion problems which would deadlock the machine.
132 132
133SCHED_FIFO/_RR are implemented in sched_rt.c and are as specified by 133SCHED_FIFO/_RR are implemented in sched/rt.c and are as specified by
134POSIX. 134POSIX.
135 135
136The command chrt from util-linux-ng 2.13.1.1 can set all of these except 136The command chrt from util-linux-ng 2.13.1.1 can set all of these except
@@ -145,9 +145,9 @@ Classes," an extensible hierarchy of scheduler modules. These modules
145encapsulate scheduling policy details and are handled by the scheduler core 145encapsulate scheduling policy details and are handled by the scheduler core
146without the core code assuming too much about them. 146without the core code assuming too much about them.
147 147
148sched_fair.c implements the CFS scheduler described above. 148sched/fair.c implements the CFS scheduler described above.
149 149
150sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than 150sched/rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler way than
151the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT 151the previous vanilla scheduler did. It uses 100 runqueues (for all 100 RT
152priority levels, instead of 140 in the previous scheduler) and it needs no 152priority levels, instead of 140 in the previous scheduler) and it needs no
153expired array. 153expired array.
diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt
index b7ee379b651b..443f0c76bab4 100644
--- a/Documentation/scheduler/sched-domains.txt
+++ b/Documentation/scheduler/sched-domains.txt
@@ -61,10 +61,6 @@ The implementor should read comments in include/linux/sched.h:
61struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of 61struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
62the specifics and what to tune. 62the specifics and what to tune.
63 63
64For SMT, the architecture must define CONFIG_SCHED_SMT and provide a
65cpumask_t cpu_sibling_map[NR_CPUS], where cpu_sibling_map[i] is the mask of
66all "i"'s siblings as well as "i" itself.
67
68Architectures may retain the regular override the default SD_*_INIT flags 64Architectures may retain the regular override the default SD_*_INIT flags
69while using the generic domain builder in kernel/sched.c if they wish to 65while using the generic domain builder in kernel/sched.c if they wish to
70retain the traditional SMT->SMP->NUMA topology (or some subset of that). This 66retain the traditional SMT->SMP->NUMA topology (or some subset of that). This
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 09f646753d1a..a2496e449b75 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -70,31 +70,6 @@ void build_cpu_to_node_map(void);
70 .nr_balance_failed = 0, \ 70 .nr_balance_failed = 0, \
71} 71}
72 72
73/* sched_domains SD_NODE_INIT for IA64 NUMA machines */
74#define SD_NODE_INIT (struct sched_domain) { \
75 .parent = NULL, \
76 .child = NULL, \
77 .groups = NULL, \
78 .min_interval = 8, \
79 .max_interval = 8*(min(num_online_cpus(), 32U)), \
80 .busy_factor = 64, \
81 .imbalance_pct = 125, \
82 .cache_nice_tries = 2, \
83 .busy_idx = 3, \
84 .idle_idx = 2, \
85 .newidle_idx = 0, \
86 .wake_idx = 0, \
87 .forkexec_idx = 0, \
88 .flags = SD_LOAD_BALANCE \
89 | SD_BALANCE_NEWIDLE \
90 | SD_BALANCE_EXEC \
91 | SD_BALANCE_FORK \
92 | SD_SERIALIZE, \
93 .last_balance = jiffies, \
94 .balance_interval = 64, \
95 .nr_balance_failed = 0, \
96}
97
98#endif /* CONFIG_NUMA */ 73#endif /* CONFIG_NUMA */
99 74
100#ifdef CONFIG_SMP 75#ifdef CONFIG_SMP
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 1b1a7d1632b9..b2cf641f206f 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -36,23 +36,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
36 36
37#define node_distance(from, to) (__node_distances[(from)][(to)]) 37#define node_distance(from, to) (__node_distances[(from)][(to)])
38 38
39/* sched_domains SD_NODE_INIT for SGI IP27 machines */
40#define SD_NODE_INIT (struct sched_domain) { \
41 .parent = NULL, \
42 .child = NULL, \
43 .groups = NULL, \
44 .min_interval = 8, \
45 .max_interval = 32, \
46 .busy_factor = 32, \
47 .imbalance_pct = 125, \
48 .cache_nice_tries = 1, \
49 .flags = SD_LOAD_BALANCE | \
50 SD_BALANCE_EXEC, \
51 .last_balance = jiffies, \
52 .balance_interval = 1, \
53 .nr_balance_failed = 0, \
54}
55
56#include <asm-generic/topology.h> 39#include <asm-generic/topology.h>
57 40
58#endif /* _ASM_MACH_TOPOLOGY_H */ 41#endif /* _ASM_MACH_TOPOLOGY_H */
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index c97185885c6d..852ed1b384f6 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -18,12 +18,6 @@ struct device_node;
18 */ 18 */
19#define RECLAIM_DISTANCE 10 19#define RECLAIM_DISTANCE 10
20 20
21/*
22 * Avoid creating an extra level of balancing (SD_ALLNODES) on the largest
23 * POWER7 boxes which have a maximum of 32 nodes.
24 */
25#define SD_NODES_PER_DOMAIN 32
26
27#include <asm/mmzone.h> 21#include <asm/mmzone.h>
28 22
29static inline int cpu_to_node(int cpu) 23static inline int cpu_to_node(int cpu)
@@ -51,36 +45,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
51 cpu_all_mask : \ 45 cpu_all_mask : \
52 cpumask_of_node(pcibus_to_node(bus))) 46 cpumask_of_node(pcibus_to_node(bus)))
53 47
54/* sched_domains SD_NODE_INIT for PPC64 machines */
55#define SD_NODE_INIT (struct sched_domain) { \
56 .min_interval = 8, \
57 .max_interval = 32, \
58 .busy_factor = 32, \
59 .imbalance_pct = 125, \
60 .cache_nice_tries = 1, \
61 .busy_idx = 3, \
62 .idle_idx = 1, \
63 .newidle_idx = 0, \
64 .wake_idx = 0, \
65 .forkexec_idx = 0, \
66 \
67 .flags = 1*SD_LOAD_BALANCE \
68 | 0*SD_BALANCE_NEWIDLE \
69 | 1*SD_BALANCE_EXEC \
70 | 1*SD_BALANCE_FORK \
71 | 0*SD_BALANCE_WAKE \
72 | 1*SD_WAKE_AFFINE \
73 | 0*SD_PREFER_LOCAL \
74 | 0*SD_SHARE_CPUPOWER \
75 | 0*SD_POWERSAVINGS_BALANCE \
76 | 0*SD_SHARE_PKG_RESOURCES \
77 | 1*SD_SERIALIZE \
78 | 0*SD_PREFER_SIBLING \
79 , \
80 .last_balance = jiffies, \
81 .balance_interval = 1, \
82}
83
84extern int __node_distance(int, int); 48extern int __node_distance(int, int);
85#define node_distance(a, b) __node_distance(a, b) 49#define node_distance(a, b) __node_distance(a, b)
86 50
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index 88e734069fa6..b0a282d65f6a 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -3,31 +3,6 @@
3 3
4#ifdef CONFIG_NUMA 4#ifdef CONFIG_NUMA
5 5
6/* sched_domains SD_NODE_INIT for sh machines */
7#define SD_NODE_INIT (struct sched_domain) { \
8 .parent = NULL, \
9 .child = NULL, \
10 .groups = NULL, \
11 .min_interval = 8, \
12 .max_interval = 32, \
13 .busy_factor = 32, \
14 .imbalance_pct = 125, \
15 .cache_nice_tries = 2, \
16 .busy_idx = 3, \
17 .idle_idx = 2, \
18 .newidle_idx = 0, \
19 .wake_idx = 0, \
20 .forkexec_idx = 0, \
21 .flags = SD_LOAD_BALANCE \
22 | SD_BALANCE_FORK \
23 | SD_BALANCE_EXEC \
24 | SD_BALANCE_NEWIDLE \
25 | SD_SERIALIZE, \
26 .last_balance = jiffies, \
27 .balance_interval = 1, \
28 .nr_balance_failed = 0, \
29}
30
31#define cpu_to_node(cpu) ((void)(cpu),0) 6#define cpu_to_node(cpu) ((void)(cpu),0)
32#define parent_node(node) ((void)(node),0) 7#define parent_node(node) ((void)(node),0)
33 8
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 8b9c556d630b..1754390a426f 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -31,25 +31,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
31 cpu_all_mask : \ 31 cpu_all_mask : \
32 cpumask_of_node(pcibus_to_node(bus))) 32 cpumask_of_node(pcibus_to_node(bus)))
33 33
34#define SD_NODE_INIT (struct sched_domain) { \
35 .min_interval = 8, \
36 .max_interval = 32, \
37 .busy_factor = 32, \
38 .imbalance_pct = 125, \
39 .cache_nice_tries = 2, \
40 .busy_idx = 3, \
41 .idle_idx = 2, \
42 .newidle_idx = 0, \
43 .wake_idx = 0, \
44 .forkexec_idx = 0, \
45 .flags = SD_LOAD_BALANCE \
46 | SD_BALANCE_FORK \
47 | SD_BALANCE_EXEC \
48 | SD_SERIALIZE, \
49 .last_balance = jiffies, \
50 .balance_interval = 1, \
51}
52
53#else /* CONFIG_NUMA */ 34#else /* CONFIG_NUMA */
54 35
55#include <asm-generic/topology.h> 36#include <asm-generic/topology.h>
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 6fdd0c860193..7a7ce390534f 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -78,32 +78,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
78 .balance_interval = 32, \ 78 .balance_interval = 32, \
79} 79}
80 80
81/* sched_domains SD_NODE_INIT for TILE architecture */
82#define SD_NODE_INIT (struct sched_domain) { \
83 .min_interval = 16, \
84 .max_interval = 512, \
85 .busy_factor = 32, \
86 .imbalance_pct = 125, \
87 .cache_nice_tries = 1, \
88 .busy_idx = 3, \
89 .idle_idx = 1, \
90 .newidle_idx = 2, \
91 .wake_idx = 1, \
92 .flags = 1*SD_LOAD_BALANCE \
93 | 1*SD_BALANCE_NEWIDLE \
94 | 1*SD_BALANCE_EXEC \
95 | 1*SD_BALANCE_FORK \
96 | 0*SD_BALANCE_WAKE \
97 | 0*SD_WAKE_AFFINE \
98 | 0*SD_PREFER_LOCAL \
99 | 0*SD_SHARE_CPUPOWER \
100 | 0*SD_SHARE_PKG_RESOURCES \
101 | 1*SD_SERIALIZE \
102 , \
103 .last_balance = jiffies, \
104 .balance_interval = 128, \
105}
106
107/* By definition, we create nodes based on online memory. */ 81/* By definition, we create nodes based on online memory. */
108#define node_has_online_mem(nid) 1 82#define node_has_online_mem(nid) 1
109 83
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index b9676ae37ada..095b21507b6a 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -92,44 +92,6 @@ extern void setup_node_to_cpumask_map(void);
92 92
93#define pcibus_to_node(bus) __pcibus_to_node(bus) 93#define pcibus_to_node(bus) __pcibus_to_node(bus)
94 94
95#ifdef CONFIG_X86_32
96# define SD_CACHE_NICE_TRIES 1
97# define SD_IDLE_IDX 1
98#else
99# define SD_CACHE_NICE_TRIES 2
100# define SD_IDLE_IDX 2
101#endif
102
103/* sched_domains SD_NODE_INIT for NUMA machines */
104#define SD_NODE_INIT (struct sched_domain) { \
105 .min_interval = 8, \
106 .max_interval = 32, \
107 .busy_factor = 32, \
108 .imbalance_pct = 125, \
109 .cache_nice_tries = SD_CACHE_NICE_TRIES, \
110 .busy_idx = 3, \
111 .idle_idx = SD_IDLE_IDX, \
112 .newidle_idx = 0, \
113 .wake_idx = 0, \
114 .forkexec_idx = 0, \
115 \
116 .flags = 1*SD_LOAD_BALANCE \
117 | 1*SD_BALANCE_NEWIDLE \
118 | 1*SD_BALANCE_EXEC \
119 | 1*SD_BALANCE_FORK \
120 | 0*SD_BALANCE_WAKE \
121 | 1*SD_WAKE_AFFINE \
122 | 0*SD_PREFER_LOCAL \
123 | 0*SD_SHARE_CPUPOWER \
124 | 0*SD_POWERSAVINGS_BALANCE \
125 | 0*SD_SHARE_PKG_RESOURCES \
126 | 1*SD_SERIALIZE \
127 | 0*SD_PREFER_SIBLING \
128 , \
129 .last_balance = jiffies, \
130 .balance_interval = 1, \
131}
132
133extern int __node_distance(int, int); 95extern int __node_distance(int, int);
134#define node_distance(a, b) __node_distance(a, b) 96#define node_distance(a, b) __node_distance(a, b)
135 97
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index dc8ca8ea78c4..8040b752ee4f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -582,9 +582,17 @@ int mwait_usable(const struct cpuinfo_x86 *c)
582{ 582{
583 u32 eax, ebx, ecx, edx; 583 u32 eax, ebx, ecx, edx;
584 584
585 /* Use mwait if idle=mwait boot option is given */
585 if (boot_option_idle_override == IDLE_FORCE_MWAIT) 586 if (boot_option_idle_override == IDLE_FORCE_MWAIT)
586 return 1; 587 return 1;
587 588
589 /*
590 * Any idle= boot option other than idle=mwait means that we must not
591 * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
592 */
593 if (boot_option_idle_override != IDLE_NO_OVERRIDE)
594 return 0;
595
588 if (c->cpuid_level < MWAIT_INFO) 596 if (c->cpuid_level < MWAIT_INFO)
589 return 0; 597 return 0;
590 598
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3acaf51dfddb..433529e29be4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -299,59 +299,90 @@ void __cpuinit smp_store_cpu_info(int id)
299 identify_secondary_cpu(c); 299 identify_secondary_cpu(c);
300} 300}
301 301
302static void __cpuinit link_thread_siblings(int cpu1, int cpu2) 302static bool __cpuinit
303topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
303{ 304{
304 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); 305 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
305 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); 306
306 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); 307 return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
307 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); 308 "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
308 cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); 309 "[node: %d != %d]. Ignoring dependency.\n",
309 cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); 310 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
310} 311}
311 312
313#define link_mask(_m, c1, c2) \
314do { \
315 cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \
316 cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \
317} while (0)
318
319static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
320{
321 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
322 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
323
324 if (c->phys_proc_id == o->phys_proc_id &&
325 per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
326 c->compute_unit_id == o->compute_unit_id)
327 return topology_sane(c, o, "smt");
328
329 } else if (c->phys_proc_id == o->phys_proc_id &&
330 c->cpu_core_id == o->cpu_core_id) {
331 return topology_sane(c, o, "smt");
332 }
333
334 return false;
335}
336
337static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
338{
339 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
340
341 if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
342 per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
343 return topology_sane(c, o, "llc");
344
345 return false;
346}
347
348static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
349{
350 if (c->phys_proc_id == o->phys_proc_id)
351 return topology_sane(c, o, "mc");
352
353 return false;
354}
312 355
313void __cpuinit set_cpu_sibling_map(int cpu) 356void __cpuinit set_cpu_sibling_map(int cpu)
314{ 357{
315 int i; 358 bool has_mc = boot_cpu_data.x86_max_cores > 1;
359 bool has_smt = smp_num_siblings > 1;
316 struct cpuinfo_x86 *c = &cpu_data(cpu); 360 struct cpuinfo_x86 *c = &cpu_data(cpu);
361 struct cpuinfo_x86 *o;
362 int i;
317 363
318 cpumask_set_cpu(cpu, cpu_sibling_setup_mask); 364 cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
319 365
320 if (smp_num_siblings > 1) { 366 if (!has_smt && !has_mc) {
321 for_each_cpu(i, cpu_sibling_setup_mask) {
322 struct cpuinfo_x86 *o = &cpu_data(i);
323
324 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
325 if (c->phys_proc_id == o->phys_proc_id &&
326 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
327 c->compute_unit_id == o->compute_unit_id)
328 link_thread_siblings(cpu, i);
329 } else if (c->phys_proc_id == o->phys_proc_id &&
330 c->cpu_core_id == o->cpu_core_id) {
331 link_thread_siblings(cpu, i);
332 }
333 }
334 } else {
335 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 367 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
336 } 368 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
337 369 cpumask_set_cpu(cpu, cpu_core_mask(cpu));
338 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
339
340 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
341 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
342 c->booted_cores = 1; 370 c->booted_cores = 1;
343 return; 371 return;
344 } 372 }
345 373
346 for_each_cpu(i, cpu_sibling_setup_mask) { 374 for_each_cpu(i, cpu_sibling_setup_mask) {
347 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 375 o = &cpu_data(i);
348 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 376
349 cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); 377 if ((i == cpu) || (has_smt && match_smt(c, o)))
350 cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); 378 link_mask(sibling, cpu, i);
351 } 379
352 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 380 if ((i == cpu) || (has_mc && match_llc(c, o)))
353 cpumask_set_cpu(i, cpu_core_mask(cpu)); 381 link_mask(llc_shared, cpu, i);
354 cpumask_set_cpu(cpu, cpu_core_mask(i)); 382
383 if ((i == cpu) || (has_mc && match_mc(c, o))) {
384 link_mask(core, cpu, i);
385
355 /* 386 /*
356 * Does this new cpu bringup a new core? 387 * Does this new cpu bringup a new core?
357 */ 388 */
@@ -382,8 +413,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
382 * For perf, we return last level cache shared map. 413 * For perf, we return last level cache shared map.
383 * And for power savings, we return cpu_core_map 414 * And for power savings, we return cpu_core_map
384 */ 415 */
385 if ((sched_mc_power_savings || sched_smt_power_savings) && 416 if (!(cpu_has(c, X86_FEATURE_AMD_DCM)))
386 !(cpu_has(c, X86_FEATURE_AMD_DCM)))
387 return cpu_core_mask(cpu); 417 return cpu_core_mask(cpu);
388 else 418 else
389 return cpu_llc_shared_mask(cpu); 419 return cpu_llc_shared_mask(cpu);
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 53489ff6bf82..871dd8868170 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -339,9 +339,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
339 } else { 339 } else {
340 unsigned long n; 340 unsigned long n;
341 341
342 n = simple_strtoul(emu_cmdline, NULL, 0); 342 n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
343 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); 343 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
344 } 344 }
345 if (*emu_cmdline == ':')
346 emu_cmdline++;
345 347
346 if (ret < 0) 348 if (ret < 0)
347 goto no_emu; 349 goto no_emu;
@@ -418,7 +420,9 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
418 int physj = emu_nid_to_phys[j]; 420 int physj = emu_nid_to_phys[j];
419 int dist; 421 int dist;
420 422
421 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) 423 if (get_option(&emu_cmdline, &dist) == 2)
424 ;
425 else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
422 dist = physi == physj ? 426 dist = physi == physj ?
423 LOCAL_DISTANCE : REMOTE_DISTANCE; 427 LOCAL_DISTANCE : REMOTE_DISTANCE;
424 else 428 else
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index adf937bf4091..63452943abd1 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -330,8 +330,4 @@ void __init cpu_dev_init(void)
330 panic("Failed to register CPU subsystem"); 330 panic("Failed to register CPU subsystem");
331 331
332 cpu_dev_register_generic(); 332 cpu_dev_register_generic();
333
334#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
335 sched_create_sysfs_power_savings_entries(cpu_subsys.dev_root);
336#endif
337} 333}
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index ee28844ae68e..7230bb59a06f 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -36,8 +36,6 @@ extern void cpu_remove_dev_attr(struct device_attribute *attr);
36extern int cpu_add_dev_attr_group(struct attribute_group *attrs); 36extern int cpu_add_dev_attr_group(struct attribute_group *attrs);
37extern void cpu_remove_dev_attr_group(struct attribute_group *attrs); 37extern void cpu_remove_dev_attr_group(struct attribute_group *attrs);
38 38
39extern int sched_create_sysfs_power_savings_entries(struct device *dev);
40
41#ifdef CONFIG_HOTPLUG_CPU 39#ifdef CONFIG_HOTPLUG_CPU
42extern void unregister_cpu(struct cpu *cpu); 40extern void unregister_cpu(struct cpu *cpu);
43extern ssize_t arch_cpu_probe(const char *, size_t); 41extern ssize_t arch_cpu_probe(const char *, size_t);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f774d88cd0aa..28fa9d02fd59 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -855,61 +855,14 @@ enum cpu_idle_type {
855#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ 855#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
856#define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */ 856#define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */
857#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ 857#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
858#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
859#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ 858#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
860#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ 859#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
861#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ 860#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
862#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ 861#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
863#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ 862#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
864 863
865enum powersavings_balance_level {
866 POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */
867 POWERSAVINGS_BALANCE_BASIC, /* Fill one thread/core/package
868 * first for long running threads
869 */
870 POWERSAVINGS_BALANCE_WAKEUP, /* Also bias task wakeups to semi-idle
871 * cpu package for power savings
872 */
873 MAX_POWERSAVINGS_BALANCE_LEVELS
874};
875
876extern int sched_mc_power_savings, sched_smt_power_savings;
877
878static inline int sd_balance_for_mc_power(void)
879{
880 if (sched_smt_power_savings)
881 return SD_POWERSAVINGS_BALANCE;
882
883 if (!sched_mc_power_savings)
884 return SD_PREFER_SIBLING;
885
886 return 0;
887}
888
889static inline int sd_balance_for_package_power(void)
890{
891 if (sched_mc_power_savings | sched_smt_power_savings)
892 return SD_POWERSAVINGS_BALANCE;
893
894 return SD_PREFER_SIBLING;
895}
896
897extern int __weak arch_sd_sibiling_asym_packing(void); 864extern int __weak arch_sd_sibiling_asym_packing(void);
898 865
899/*
900 * Optimise SD flags for power savings:
901 * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
902 * Keep default SD flags if sched_{smt,mc}_power_saving=0
903 */
904
905static inline int sd_power_saving_flags(void)
906{
907 if (sched_mc_power_savings | sched_smt_power_savings)
908 return SD_BALANCE_NEWIDLE;
909
910 return 0;
911}
912
913struct sched_group_power { 866struct sched_group_power {
914 atomic_t ref; 867 atomic_t ref;
915 /* 868 /*
@@ -1962,7 +1915,7 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1962 */ 1915 */
1963extern unsigned long long notrace sched_clock(void); 1916extern unsigned long long notrace sched_clock(void);
1964/* 1917/*
1965 * See the comment in kernel/sched_clock.c 1918 * See the comment in kernel/sched/clock.c
1966 */ 1919 */
1967extern u64 cpu_clock(int cpu); 1920extern u64 cpu_clock(int cpu);
1968extern u64 local_clock(void); 1921extern u64 local_clock(void);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 9dc427cdb6ff..e91cd43394df 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -70,7 +70,6 @@ int arch_update_cpu_topology(void);
70 * Below are the 3 major initializers used in building sched_domains: 70 * Below are the 3 major initializers used in building sched_domains:
71 * SD_SIBLING_INIT, for SMT domains 71 * SD_SIBLING_INIT, for SMT domains
72 * SD_CPU_INIT, for SMP domains 72 * SD_CPU_INIT, for SMP domains
73 * SD_NODE_INIT, for NUMA domains
74 * 73 *
75 * Any architecture that cares to do any tuning to these values should do so 74 * Any architecture that cares to do any tuning to these values should do so
76 * by defining their own arch-specific initializer in include/asm/topology.h. 75 * by defining their own arch-specific initializer in include/asm/topology.h.
@@ -99,7 +98,6 @@ int arch_update_cpu_topology(void);
99 | 0*SD_BALANCE_WAKE \ 98 | 0*SD_BALANCE_WAKE \
100 | 1*SD_WAKE_AFFINE \ 99 | 1*SD_WAKE_AFFINE \
101 | 1*SD_SHARE_CPUPOWER \ 100 | 1*SD_SHARE_CPUPOWER \
102 | 0*SD_POWERSAVINGS_BALANCE \
103 | 1*SD_SHARE_PKG_RESOURCES \ 101 | 1*SD_SHARE_PKG_RESOURCES \
104 | 0*SD_SERIALIZE \ 102 | 0*SD_SERIALIZE \
105 | 0*SD_PREFER_SIBLING \ 103 | 0*SD_PREFER_SIBLING \
@@ -135,8 +133,6 @@ int arch_update_cpu_topology(void);
135 | 0*SD_SHARE_CPUPOWER \ 133 | 0*SD_SHARE_CPUPOWER \
136 | 1*SD_SHARE_PKG_RESOURCES \ 134 | 1*SD_SHARE_PKG_RESOURCES \
137 | 0*SD_SERIALIZE \ 135 | 0*SD_SERIALIZE \
138 | sd_balance_for_mc_power() \
139 | sd_power_saving_flags() \
140 , \ 136 , \
141 .last_balance = jiffies, \ 137 .last_balance = jiffies, \
142 .balance_interval = 1, \ 138 .balance_interval = 1, \
@@ -168,56 +164,18 @@ int arch_update_cpu_topology(void);
168 | 0*SD_SHARE_CPUPOWER \ 164 | 0*SD_SHARE_CPUPOWER \
169 | 0*SD_SHARE_PKG_RESOURCES \ 165 | 0*SD_SHARE_PKG_RESOURCES \
170 | 0*SD_SERIALIZE \ 166 | 0*SD_SERIALIZE \
171 | sd_balance_for_package_power() \
172 | sd_power_saving_flags() \
173 , \ 167 , \
174 .last_balance = jiffies, \ 168 .last_balance = jiffies, \
175 .balance_interval = 1, \ 169 .balance_interval = 1, \
176} 170}
177#endif 171#endif
178 172
179/* sched_domains SD_ALLNODES_INIT for NUMA machines */
180#define SD_ALLNODES_INIT (struct sched_domain) { \
181 .min_interval = 64, \
182 .max_interval = 64*num_online_cpus(), \
183 .busy_factor = 128, \
184 .imbalance_pct = 133, \
185 .cache_nice_tries = 1, \
186 .busy_idx = 3, \
187 .idle_idx = 3, \
188 .flags = 1*SD_LOAD_BALANCE \
189 | 1*SD_BALANCE_NEWIDLE \
190 | 0*SD_BALANCE_EXEC \
191 | 0*SD_BALANCE_FORK \
192 | 0*SD_BALANCE_WAKE \
193 | 0*SD_WAKE_AFFINE \
194 | 0*SD_SHARE_CPUPOWER \
195 | 0*SD_POWERSAVINGS_BALANCE \
196 | 0*SD_SHARE_PKG_RESOURCES \
197 | 1*SD_SERIALIZE \
198 | 0*SD_PREFER_SIBLING \
199 , \
200 .last_balance = jiffies, \
201 .balance_interval = 64, \
202}
203
204#ifndef SD_NODES_PER_DOMAIN
205#define SD_NODES_PER_DOMAIN 16
206#endif
207
208#ifdef CONFIG_SCHED_BOOK 173#ifdef CONFIG_SCHED_BOOK
209#ifndef SD_BOOK_INIT 174#ifndef SD_BOOK_INIT
210#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!! 175#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
211#endif 176#endif
212#endif /* CONFIG_SCHED_BOOK */ 177#endif /* CONFIG_SCHED_BOOK */
213 178
214#ifdef CONFIG_NUMA
215#ifndef SD_NODE_INIT
216#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
217#endif
218
219#endif /* CONFIG_NUMA */
220
221#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 179#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
222DECLARE_PER_CPU(int, numa_node); 180DECLARE_PER_CPU(int, numa_node);
223 181
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2e2e173d8f7..d833cc94eedc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -693,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data)
693} 693}
694#endif 694#endif
695 695
696void update_cpu_load(struct rq *this_rq);
697
698static void set_load_weight(struct task_struct *p) 696static void set_load_weight(struct task_struct *p)
699{ 697{
700 int prio = p->static_prio - MAX_RT_PRIO; 698 int prio = p->static_prio - MAX_RT_PRIO;
@@ -2481,22 +2479,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2481 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2479 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2482 * every tick. We fix it up based on jiffies. 2480 * every tick. We fix it up based on jiffies.
2483 */ 2481 */
2484void update_cpu_load(struct rq *this_rq) 2482static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2483 unsigned long pending_updates)
2485{ 2484{
2486 unsigned long this_load = this_rq->load.weight;
2487 unsigned long curr_jiffies = jiffies;
2488 unsigned long pending_updates;
2489 int i, scale; 2485 int i, scale;
2490 2486
2491 this_rq->nr_load_updates++; 2487 this_rq->nr_load_updates++;
2492 2488
2493 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2494 if (curr_jiffies == this_rq->last_load_update_tick)
2495 return;
2496
2497 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2498 this_rq->last_load_update_tick = curr_jiffies;
2499
2500 /* Update our load: */ 2489 /* Update our load: */
2501 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2490 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2502 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2491 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2521,9 +2510,45 @@ void update_cpu_load(struct rq *this_rq)
2521 sched_avg_update(this_rq); 2510 sched_avg_update(this_rq);
2522} 2511}
2523 2512
2513/*
2514 * Called from nohz_idle_balance() to update the load ratings before doing the
2515 * idle balance.
2516 */
2517void update_idle_cpu_load(struct rq *this_rq)
2518{
2519 unsigned long curr_jiffies = jiffies;
2520 unsigned long load = this_rq->load.weight;
2521 unsigned long pending_updates;
2522
2523 /*
2524 * Bloody broken means of dealing with nohz, but better than nothing..
2525 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
2526 * update and see 0 difference the one time and 2 the next, even though
2527 * we ticked at roughtly the same rate.
2528 *
2529 * Hence we only use this from nohz_idle_balance() and skip this
2530 * nonsense when called from the scheduler_tick() since that's
2531 * guaranteed a stable rate.
2532 */
2533 if (load || curr_jiffies == this_rq->last_load_update_tick)
2534 return;
2535
2536 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2537 this_rq->last_load_update_tick = curr_jiffies;
2538
2539 __update_cpu_load(this_rq, load, pending_updates);
2540}
2541
2542/*
2543 * Called from scheduler_tick()
2544 */
2524static void update_cpu_load_active(struct rq *this_rq) 2545static void update_cpu_load_active(struct rq *this_rq)
2525{ 2546{
2526 update_cpu_load(this_rq); 2547 /*
2548 * See the mess in update_idle_cpu_load().
2549 */
2550 this_rq->last_load_update_tick = jiffies;
2551 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2527 2552
2528 calc_load_account_active(this_rq); 2553 calc_load_account_active(this_rq);
2529} 2554}
@@ -3108,6 +3133,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3108 if (irqs_disabled()) 3133 if (irqs_disabled())
3109 print_irqtrace_events(prev); 3134 print_irqtrace_events(prev);
3110 dump_stack(); 3135 dump_stack();
3136 add_taint(TAINT_WARN);
3111} 3137}
3112 3138
3113/* 3139/*
@@ -5555,7 +5581,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5555 break; 5581 break;
5556 } 5582 }
5557 5583
5558 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5584 if (!(sd->flags & SD_OVERLAP) &&
5585 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5559 printk(KERN_CONT "\n"); 5586 printk(KERN_CONT "\n");
5560 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5587 printk(KERN_ERR "ERROR: repeated CPUs\n");
5561 break; 5588 break;
@@ -5893,99 +5920,11 @@ static int __init isolated_cpu_setup(char *str)
5893 5920
5894__setup("isolcpus=", isolated_cpu_setup); 5921__setup("isolcpus=", isolated_cpu_setup);
5895 5922
5896#ifdef CONFIG_NUMA
5897
5898/**
5899 * find_next_best_node - find the next node to include in a sched_domain
5900 * @node: node whose sched_domain we're building
5901 * @used_nodes: nodes already in the sched_domain
5902 *
5903 * Find the next node to include in a given scheduling domain. Simply
5904 * finds the closest node not already in the @used_nodes map.
5905 *
5906 * Should use nodemask_t.
5907 */
5908static int find_next_best_node(int node, nodemask_t *used_nodes)
5909{
5910 int i, n, val, min_val, best_node = -1;
5911
5912 min_val = INT_MAX;
5913
5914 for (i = 0; i < nr_node_ids; i++) {
5915 /* Start at @node */
5916 n = (node + i) % nr_node_ids;
5917
5918 if (!nr_cpus_node(n))
5919 continue;
5920
5921 /* Skip already used nodes */
5922 if (node_isset(n, *used_nodes))
5923 continue;
5924
5925 /* Simple min distance search */
5926 val = node_distance(node, n);
5927
5928 if (val < min_val) {
5929 min_val = val;
5930 best_node = n;
5931 }
5932 }
5933
5934 if (best_node != -1)
5935 node_set(best_node, *used_nodes);
5936 return best_node;
5937}
5938
5939/**
5940 * sched_domain_node_span - get a cpumask for a node's sched_domain
5941 * @node: node whose cpumask we're constructing
5942 * @span: resulting cpumask
5943 *
5944 * Given a node, construct a good cpumask for its sched_domain to span. It
5945 * should be one that prevents unnecessary balancing, but also spreads tasks
5946 * out optimally.
5947 */
5948static void sched_domain_node_span(int node, struct cpumask *span)
5949{
5950 nodemask_t used_nodes;
5951 int i;
5952
5953 cpumask_clear(span);
5954 nodes_clear(used_nodes);
5955
5956 cpumask_or(span, span, cpumask_of_node(node));
5957 node_set(node, used_nodes);
5958
5959 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5960 int next_node = find_next_best_node(node, &used_nodes);
5961 if (next_node < 0)
5962 break;
5963 cpumask_or(span, span, cpumask_of_node(next_node));
5964 }
5965}
5966
5967static const struct cpumask *cpu_node_mask(int cpu)
5968{
5969 lockdep_assert_held(&sched_domains_mutex);
5970
5971 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5972
5973 return sched_domains_tmpmask;
5974}
5975
5976static const struct cpumask *cpu_allnodes_mask(int cpu)
5977{
5978 return cpu_possible_mask;
5979}
5980#endif /* CONFIG_NUMA */
5981
5982static const struct cpumask *cpu_cpu_mask(int cpu) 5923static const struct cpumask *cpu_cpu_mask(int cpu)
5983{ 5924{
5984 return cpumask_of_node(cpu_to_node(cpu)); 5925 return cpumask_of_node(cpu_to_node(cpu));
5985} 5926}
5986 5927
5987int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5988
5989struct sd_data { 5928struct sd_data {
5990 struct sched_domain **__percpu sd; 5929 struct sched_domain **__percpu sd;
5991 struct sched_group **__percpu sg; 5930 struct sched_group **__percpu sg;
@@ -6015,6 +5954,7 @@ struct sched_domain_topology_level {
6015 sched_domain_init_f init; 5954 sched_domain_init_f init;
6016 sched_domain_mask_f mask; 5955 sched_domain_mask_f mask;
6017 int flags; 5956 int flags;
5957 int numa_level;
6018 struct sd_data data; 5958 struct sd_data data;
6019}; 5959};
6020 5960
@@ -6206,10 +6146,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6206} 6146}
6207 6147
6208SD_INIT_FUNC(CPU) 6148SD_INIT_FUNC(CPU)
6209#ifdef CONFIG_NUMA
6210 SD_INIT_FUNC(ALLNODES)
6211 SD_INIT_FUNC(NODE)
6212#endif
6213#ifdef CONFIG_SCHED_SMT 6149#ifdef CONFIG_SCHED_SMT
6214 SD_INIT_FUNC(SIBLING) 6150 SD_INIT_FUNC(SIBLING)
6215#endif 6151#endif
@@ -6331,15 +6267,184 @@ static struct sched_domain_topology_level default_topology[] = {
6331 { sd_init_BOOK, cpu_book_mask, }, 6267 { sd_init_BOOK, cpu_book_mask, },
6332#endif 6268#endif
6333 { sd_init_CPU, cpu_cpu_mask, }, 6269 { sd_init_CPU, cpu_cpu_mask, },
6334#ifdef CONFIG_NUMA
6335 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6336 { sd_init_ALLNODES, cpu_allnodes_mask, },
6337#endif
6338 { NULL, }, 6270 { NULL, },
6339}; 6271};
6340 6272
6341static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6273static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6342 6274
6275#ifdef CONFIG_NUMA
6276
6277static int sched_domains_numa_levels;
6278static int sched_domains_numa_scale;
6279static int *sched_domains_numa_distance;
6280static struct cpumask ***sched_domains_numa_masks;
6281static int sched_domains_curr_level;
6282
6283static inline int sd_local_flags(int level)
6284{
6285 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
6286 return 0;
6287
6288 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6289}
6290
6291static struct sched_domain *
6292sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6293{
6294 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6295 int level = tl->numa_level;
6296 int sd_weight = cpumask_weight(
6297 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6298
6299 *sd = (struct sched_domain){
6300 .min_interval = sd_weight,
6301 .max_interval = 2*sd_weight,
6302 .busy_factor = 32,
6303 .imbalance_pct = 125,
6304 .cache_nice_tries = 2,
6305 .busy_idx = 3,
6306 .idle_idx = 2,
6307 .newidle_idx = 0,
6308 .wake_idx = 0,
6309 .forkexec_idx = 0,
6310
6311 .flags = 1*SD_LOAD_BALANCE
6312 | 1*SD_BALANCE_NEWIDLE
6313 | 0*SD_BALANCE_EXEC
6314 | 0*SD_BALANCE_FORK
6315 | 0*SD_BALANCE_WAKE
6316 | 0*SD_WAKE_AFFINE
6317 | 0*SD_PREFER_LOCAL
6318 | 0*SD_SHARE_CPUPOWER
6319 | 0*SD_SHARE_PKG_RESOURCES
6320 | 1*SD_SERIALIZE
6321 | 0*SD_PREFER_SIBLING
6322 | sd_local_flags(level)
6323 ,
6324 .last_balance = jiffies,
6325 .balance_interval = sd_weight,
6326 };
6327 SD_INIT_NAME(sd, NUMA);
6328 sd->private = &tl->data;
6329
6330 /*
6331 * Ugly hack to pass state to sd_numa_mask()...
6332 */
6333 sched_domains_curr_level = tl->numa_level;
6334
6335 return sd;
6336}
6337
6338static const struct cpumask *sd_numa_mask(int cpu)
6339{
6340 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6341}
6342
6343static void sched_init_numa(void)
6344{
6345 int next_distance, curr_distance = node_distance(0, 0);
6346 struct sched_domain_topology_level *tl;
6347 int level = 0;
6348 int i, j, k;
6349
6350 sched_domains_numa_scale = curr_distance;
6351 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6352 if (!sched_domains_numa_distance)
6353 return;
6354
6355 /*
6356 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6357 * unique distances in the node_distance() table.
6358 *
6359 * Assumes node_distance(0,j) includes all distances in
6360 * node_distance(i,j) in order to avoid cubic time.
6361 *
6362 * XXX: could be optimized to O(n log n) by using sort()
6363 */
6364 next_distance = curr_distance;
6365 for (i = 0; i < nr_node_ids; i++) {
6366 for (j = 0; j < nr_node_ids; j++) {
6367 int distance = node_distance(0, j);
6368 if (distance > curr_distance &&
6369 (distance < next_distance ||
6370 next_distance == curr_distance))
6371 next_distance = distance;
6372 }
6373 if (next_distance != curr_distance) {
6374 sched_domains_numa_distance[level++] = next_distance;
6375 sched_domains_numa_levels = level;
6376 curr_distance = next_distance;
6377 } else break;
6378 }
6379 /*
6380 * 'level' contains the number of unique distances, excluding the
6381 * identity distance node_distance(i,i).
6382 *
6383 * The sched_domains_nume_distance[] array includes the actual distance
6384 * numbers.
6385 */
6386
6387 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6388 if (!sched_domains_numa_masks)
6389 return;
6390
6391 /*
6392 * Now for each level, construct a mask per node which contains all
6393 * cpus of nodes that are that many hops away from us.
6394 */
6395 for (i = 0; i < level; i++) {
6396 sched_domains_numa_masks[i] =
6397 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6398 if (!sched_domains_numa_masks[i])
6399 return;
6400
6401 for (j = 0; j < nr_node_ids; j++) {
6402 struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
6403 if (!mask)
6404 return;
6405
6406 sched_domains_numa_masks[i][j] = mask;
6407
6408 for (k = 0; k < nr_node_ids; k++) {
6409 if (node_distance(j, k) > sched_domains_numa_distance[i])
6410 continue;
6411
6412 cpumask_or(mask, mask, cpumask_of_node(k));
6413 }
6414 }
6415 }
6416
6417 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6418 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6419 if (!tl)
6420 return;
6421
6422 /*
6423 * Copy the default topology bits..
6424 */
6425 for (i = 0; default_topology[i].init; i++)
6426 tl[i] = default_topology[i];
6427
6428 /*
6429 * .. and append 'j' levels of NUMA goodness.
6430 */
6431 for (j = 0; j < level; i++, j++) {
6432 tl[i] = (struct sched_domain_topology_level){
6433 .init = sd_numa_init,
6434 .mask = sd_numa_mask,
6435 .flags = SDTL_OVERLAP,
6436 .numa_level = j,
6437 };
6438 }
6439
6440 sched_domain_topology = tl;
6441}
6442#else
6443static inline void sched_init_numa(void)
6444{
6445}
6446#endif /* CONFIG_NUMA */
6447
6343static int __sdt_alloc(const struct cpumask *cpu_map) 6448static int __sdt_alloc(const struct cpumask *cpu_map)
6344{ 6449{
6345 struct sched_domain_topology_level *tl; 6450 struct sched_domain_topology_level *tl;
@@ -6707,97 +6812,6 @@ match2:
6707 mutex_unlock(&sched_domains_mutex); 6812 mutex_unlock(&sched_domains_mutex);
6708} 6813}
6709 6814
6710#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6711static void reinit_sched_domains(void)
6712{
6713 get_online_cpus();
6714
6715 /* Destroy domains first to force the rebuild */
6716 partition_sched_domains(0, NULL, NULL);
6717
6718 rebuild_sched_domains();
6719 put_online_cpus();
6720}
6721
6722static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6723{
6724 unsigned int level = 0;
6725
6726 if (sscanf(buf, "%u", &level) != 1)
6727 return -EINVAL;
6728
6729 /*
6730 * level is always be positive so don't check for
6731 * level < POWERSAVINGS_BALANCE_NONE which is 0
6732 * What happens on 0 or 1 byte write,
6733 * need to check for count as well?
6734 */
6735
6736 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
6737 return -EINVAL;
6738
6739 if (smt)
6740 sched_smt_power_savings = level;
6741 else
6742 sched_mc_power_savings = level;
6743
6744 reinit_sched_domains();
6745
6746 return count;
6747}
6748
6749#ifdef CONFIG_SCHED_MC
6750static ssize_t sched_mc_power_savings_show(struct device *dev,
6751 struct device_attribute *attr,
6752 char *buf)
6753{
6754 return sprintf(buf, "%u\n", sched_mc_power_savings);
6755}
6756static ssize_t sched_mc_power_savings_store(struct device *dev,
6757 struct device_attribute *attr,
6758 const char *buf, size_t count)
6759{
6760 return sched_power_savings_store(buf, count, 0);
6761}
6762static DEVICE_ATTR(sched_mc_power_savings, 0644,
6763 sched_mc_power_savings_show,
6764 sched_mc_power_savings_store);
6765#endif
6766
6767#ifdef CONFIG_SCHED_SMT
6768static ssize_t sched_smt_power_savings_show(struct device *dev,
6769 struct device_attribute *attr,
6770 char *buf)
6771{
6772 return sprintf(buf, "%u\n", sched_smt_power_savings);
6773}
6774static ssize_t sched_smt_power_savings_store(struct device *dev,
6775 struct device_attribute *attr,
6776 const char *buf, size_t count)
6777{
6778 return sched_power_savings_store(buf, count, 1);
6779}
6780static DEVICE_ATTR(sched_smt_power_savings, 0644,
6781 sched_smt_power_savings_show,
6782 sched_smt_power_savings_store);
6783#endif
6784
6785int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6786{
6787 int err = 0;
6788
6789#ifdef CONFIG_SCHED_SMT
6790 if (smt_capable())
6791 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6792#endif
6793#ifdef CONFIG_SCHED_MC
6794 if (!err && mc_capable())
6795 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6796#endif
6797 return err;
6798}
6799#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
6800
6801/* 6815/*
6802 * Update cpusets according to cpu_active mask. If cpusets are 6816 * Update cpusets according to cpu_active mask. If cpusets are
6803 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6817 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6835,6 +6849,8 @@ void __init sched_init_smp(void)
6835 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6849 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6836 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6850 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6837 6851
6852 sched_init_numa();
6853
6838 get_online_cpus(); 6854 get_online_cpus();
6839 mutex_lock(&sched_domains_mutex); 6855 mutex_lock(&sched_domains_mutex);
6840 init_sched_domains(cpu_active_mask); 6856 init_sched_domains(cpu_active_mask);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 09acaa15161d..6f79596e0ea9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 SPLIT_NS(spread0)); 202 SPLIT_NS(spread0));
203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
204 cfs_rq->nr_spread_over); 204 cfs_rq->nr_spread_over);
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 205 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 207#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu)
260 SEQ_printf(m, "\ncpu#%d\n", cpu); 260 SEQ_printf(m, "\ncpu#%d\n", cpu);
261#endif 261#endif
262 262
263#define P(x) \ 263#define P(x) \
264 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 264do { \
265 if (sizeof(rq->x) == 4) \
266 SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
267 else \
268 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
269} while (0)
270
265#define PN(x) \ 271#define PN(x) \
266 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 272 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
267 273
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9553640c1c3..940e6d17cf96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2721 * If power savings logic is enabled for a domain, see if we 2721 * If power savings logic is enabled for a domain, see if we
2722 * are not overloaded, if so, don't balance wider. 2722 * are not overloaded, if so, don't balance wider.
2723 */ 2723 */
2724 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { 2724 if (tmp->flags & (SD_PREFER_LOCAL)) {
2725 unsigned long power = 0; 2725 unsigned long power = 0;
2726 unsigned long nr_running = 0; 2726 unsigned long nr_running = 0;
2727 unsigned long capacity; 2727 unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2734 2734
2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); 2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736 2736
2737 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2738 nr_running /= 2;
2739
2740 if (nr_running < capacity) 2737 if (nr_running < capacity)
2741 want_sd = 0; 2738 want_sd = 0;
2742 } 2739 }
@@ -3082,7 +3079,7 @@ struct lb_env {
3082 struct rq *dst_rq; 3079 struct rq *dst_rq;
3083 3080
3084 enum cpu_idle_type idle; 3081 enum cpu_idle_type idle;
3085 long load_move; 3082 long imbalance;
3086 unsigned int flags; 3083 unsigned int flags;
3087 3084
3088 unsigned int loop; 3085 unsigned int loop;
@@ -3218,7 +3215,7 @@ static unsigned long task_h_load(struct task_struct *p);
3218static const unsigned int sched_nr_migrate_break = 32; 3215static const unsigned int sched_nr_migrate_break = 32;
3219 3216
3220/* 3217/*
3221 * move_tasks tries to move up to load_move weighted load from busiest to 3218 * move_tasks tries to move up to imbalance weighted load from busiest to
3222 * this_rq, as part of a balancing operation within domain "sd". 3219 * this_rq, as part of a balancing operation within domain "sd".
3223 * Returns 1 if successful and 0 otherwise. 3220 * Returns 1 if successful and 0 otherwise.
3224 * 3221 *
@@ -3231,7 +3228,7 @@ static int move_tasks(struct lb_env *env)
3231 unsigned long load; 3228 unsigned long load;
3232 int pulled = 0; 3229 int pulled = 0;
3233 3230
3234 if (env->load_move <= 0) 3231 if (env->imbalance <= 0)
3235 return 0; 3232 return 0;
3236 3233
3237 while (!list_empty(tasks)) { 3234 while (!list_empty(tasks)) {
@@ -3257,7 +3254,7 @@ static int move_tasks(struct lb_env *env)
3257 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) 3254 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3258 goto next; 3255 goto next;
3259 3256
3260 if ((load / 2) > env->load_move) 3257 if ((load / 2) > env->imbalance)
3261 goto next; 3258 goto next;
3262 3259
3263 if (!can_migrate_task(p, env)) 3260 if (!can_migrate_task(p, env))
@@ -3265,7 +3262,7 @@ static int move_tasks(struct lb_env *env)
3265 3262
3266 move_task(p, env); 3263 move_task(p, env);
3267 pulled++; 3264 pulled++;
3268 env->load_move -= load; 3265 env->imbalance -= load;
3269 3266
3270#ifdef CONFIG_PREEMPT 3267#ifdef CONFIG_PREEMPT
3271 /* 3268 /*
@@ -3281,7 +3278,7 @@ static int move_tasks(struct lb_env *env)
3281 * We only want to steal up to the prescribed amount of 3278 * We only want to steal up to the prescribed amount of
3282 * weighted load. 3279 * weighted load.
3283 */ 3280 */
3284 if (env->load_move <= 0) 3281 if (env->imbalance <= 0)
3285 break; 3282 break;
3286 3283
3287 continue; 3284 continue;
@@ -3435,14 +3432,6 @@ struct sd_lb_stats {
3435 unsigned int busiest_group_weight; 3432 unsigned int busiest_group_weight;
3436 3433
3437 int group_imb; /* Is there imbalance in this sd */ 3434 int group_imb; /* Is there imbalance in this sd */
3438#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3439 int power_savings_balance; /* Is powersave balance needed for this sd */
3440 struct sched_group *group_min; /* Least loaded group in sd */
3441 struct sched_group *group_leader; /* Group which relieves group_min */
3442 unsigned long min_load_per_task; /* load_per_task in group_min */
3443 unsigned long leader_nr_running; /* Nr running of group_leader */
3444 unsigned long min_nr_running; /* Nr running of group_min */
3445#endif
3446}; 3435};
3447 3436
3448/* 3437/*
@@ -3486,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
3486 return load_idx; 3475 return load_idx;
3487} 3476}
3488 3477
3489
3490#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3491/**
3492 * init_sd_power_savings_stats - Initialize power savings statistics for
3493 * the given sched_domain, during load balancing.
3494 *
3495 * @sd: Sched domain whose power-savings statistics are to be initialized.
3496 * @sds: Variable containing the statistics for sd.
3497 * @idle: Idle status of the CPU at which we're performing load-balancing.
3498 */
3499static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3500 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3501{
3502 /*
3503 * Busy processors will not participate in power savings
3504 * balance.
3505 */
3506 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3507 sds->power_savings_balance = 0;
3508 else {
3509 sds->power_savings_balance = 1;
3510 sds->min_nr_running = ULONG_MAX;
3511 sds->leader_nr_running = 0;
3512 }
3513}
3514
3515/**
3516 * update_sd_power_savings_stats - Update the power saving stats for a
3517 * sched_domain while performing load balancing.
3518 *
3519 * @group: sched_group belonging to the sched_domain under consideration.
3520 * @sds: Variable containing the statistics of the sched_domain
3521 * @local_group: Does group contain the CPU for which we're performing
3522 * load balancing ?
3523 * @sgs: Variable containing the statistics of the group.
3524 */
3525static inline void update_sd_power_savings_stats(struct sched_group *group,
3526 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3527{
3528
3529 if (!sds->power_savings_balance)
3530 return;
3531
3532 /*
3533 * If the local group is idle or completely loaded
3534 * no need to do power savings balance at this domain
3535 */
3536 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3537 !sds->this_nr_running))
3538 sds->power_savings_balance = 0;
3539
3540 /*
3541 * If a group is already running at full capacity or idle,
3542 * don't include that group in power savings calculations
3543 */
3544 if (!sds->power_savings_balance ||
3545 sgs->sum_nr_running >= sgs->group_capacity ||
3546 !sgs->sum_nr_running)
3547 return;
3548
3549 /*
3550 * Calculate the group which has the least non-idle load.
3551 * This is the group from where we need to pick up the load
3552 * for saving power
3553 */
3554 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3555 (sgs->sum_nr_running == sds->min_nr_running &&
3556 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3557 sds->group_min = group;
3558 sds->min_nr_running = sgs->sum_nr_running;
3559 sds->min_load_per_task = sgs->sum_weighted_load /
3560 sgs->sum_nr_running;
3561 }
3562
3563 /*
3564 * Calculate the group which is almost near its
3565 * capacity but still has some space to pick up some load
3566 * from other group and save more power
3567 */
3568 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3569 return;
3570
3571 if (sgs->sum_nr_running > sds->leader_nr_running ||
3572 (sgs->sum_nr_running == sds->leader_nr_running &&
3573 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3574 sds->group_leader = group;
3575 sds->leader_nr_running = sgs->sum_nr_running;
3576 }
3577}
3578
3579/**
3580 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3581 * @sds: Variable containing the statistics of the sched_domain
3582 * under consideration.
3583 * @this_cpu: Cpu at which we're currently performing load-balancing.
3584 * @imbalance: Variable to store the imbalance.
3585 *
3586 * Description:
3587 * Check if we have potential to perform some power-savings balance.
3588 * If yes, set the busiest group to be the least loaded group in the
3589 * sched_domain, so that it's CPUs can be put to idle.
3590 *
3591 * Returns 1 if there is potential to perform power-savings balance.
3592 * Else returns 0.
3593 */
3594static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3595 int this_cpu, unsigned long *imbalance)
3596{
3597 if (!sds->power_savings_balance)
3598 return 0;
3599
3600 if (sds->this != sds->group_leader ||
3601 sds->group_leader == sds->group_min)
3602 return 0;
3603
3604 *imbalance = sds->min_load_per_task;
3605 sds->busiest = sds->group_min;
3606
3607 return 1;
3608
3609}
3610#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3611static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3612 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3613{
3614 return;
3615}
3616
3617static inline void update_sd_power_savings_stats(struct sched_group *group,
3618 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3619{
3620 return;
3621}
3622
3623static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3624 int this_cpu, unsigned long *imbalance)
3625{
3626 return 0;
3627}
3628#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3629
3630
3631unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 3478unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3632{ 3479{
3633 return SCHED_POWER_SCALE; 3480 return SCHED_POWER_SCALE;
@@ -3765,24 +3612,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3765 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3612 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3766 * @sd: The sched_domain whose statistics are to be updated. 3613 * @sd: The sched_domain whose statistics are to be updated.
3767 * @group: sched_group whose statistics are to be updated. 3614 * @group: sched_group whose statistics are to be updated.
3768 * @this_cpu: Cpu for which load balance is currently performed.
3769 * @idle: Idle status of this_cpu
3770 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3615 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3771 * @local_group: Does group contain this_cpu. 3616 * @local_group: Does group contain this_cpu.
3772 * @cpus: Set of cpus considered for load balancing. 3617 * @cpus: Set of cpus considered for load balancing.
3773 * @balance: Should we balance. 3618 * @balance: Should we balance.
3774 * @sgs: variable to hold the statistics for this group. 3619 * @sgs: variable to hold the statistics for this group.
3775 */ 3620 */
3776static inline void update_sg_lb_stats(struct sched_domain *sd, 3621static inline void update_sg_lb_stats(struct lb_env *env,
3777 struct sched_group *group, int this_cpu, 3622 struct sched_group *group, int load_idx,
3778 enum cpu_idle_type idle, int load_idx,
3779 int local_group, const struct cpumask *cpus, 3623 int local_group, const struct cpumask *cpus,
3780 int *balance, struct sg_lb_stats *sgs) 3624 int *balance, struct sg_lb_stats *sgs)
3781{ 3625{
3782 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; 3626 unsigned long nr_running, max_nr_running, min_nr_running;
3783 int i; 3627 unsigned long load, max_cpu_load, min_cpu_load;
3784 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3628 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3785 unsigned long avg_load_per_task = 0; 3629 unsigned long avg_load_per_task = 0;
3630 int i;
3786 3631
3787 if (local_group) 3632 if (local_group)
3788 balance_cpu = group_first_cpu(group); 3633 balance_cpu = group_first_cpu(group);
@@ -3791,10 +3636,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3791 max_cpu_load = 0; 3636 max_cpu_load = 0;
3792 min_cpu_load = ~0UL; 3637 min_cpu_load = ~0UL;
3793 max_nr_running = 0; 3638 max_nr_running = 0;
3639 min_nr_running = ~0UL;
3794 3640
3795 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3641 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3796 struct rq *rq = cpu_rq(i); 3642 struct rq *rq = cpu_rq(i);
3797 3643
3644 nr_running = rq->nr_running;
3645
3798 /* Bias balancing toward cpus of our domain */ 3646 /* Bias balancing toward cpus of our domain */
3799 if (local_group) { 3647 if (local_group) {
3800 if (idle_cpu(i) && !first_idle_cpu) { 3648 if (idle_cpu(i) && !first_idle_cpu) {
@@ -3805,16 +3653,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3805 load = target_load(i, load_idx); 3653 load = target_load(i, load_idx);
3806 } else { 3654 } else {
3807 load = source_load(i, load_idx); 3655 load = source_load(i, load_idx);
3808 if (load > max_cpu_load) { 3656 if (load > max_cpu_load)
3809 max_cpu_load = load; 3657 max_cpu_load = load;
3810 max_nr_running = rq->nr_running;
3811 }
3812 if (min_cpu_load > load) 3658 if (min_cpu_load > load)
3813 min_cpu_load = load; 3659 min_cpu_load = load;
3660
3661 if (nr_running > max_nr_running)
3662 max_nr_running = nr_running;
3663 if (min_nr_running > nr_running)
3664 min_nr_running = nr_running;
3814 } 3665 }
3815 3666
3816 sgs->group_load += load; 3667 sgs->group_load += load;
3817 sgs->sum_nr_running += rq->nr_running; 3668 sgs->sum_nr_running += nr_running;
3818 sgs->sum_weighted_load += weighted_cpuload(i); 3669 sgs->sum_weighted_load += weighted_cpuload(i);
3819 if (idle_cpu(i)) 3670 if (idle_cpu(i))
3820 sgs->idle_cpus++; 3671 sgs->idle_cpus++;
@@ -3827,14 +3678,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3827 * to do the newly idle load balance. 3678 * to do the newly idle load balance.
3828 */ 3679 */
3829 if (local_group) { 3680 if (local_group) {
3830 if (idle != CPU_NEWLY_IDLE) { 3681 if (env->idle != CPU_NEWLY_IDLE) {
3831 if (balance_cpu != this_cpu) { 3682 if (balance_cpu != env->dst_cpu) {
3832 *balance = 0; 3683 *balance = 0;
3833 return; 3684 return;
3834 } 3685 }
3835 update_group_power(sd, this_cpu); 3686 update_group_power(env->sd, env->dst_cpu);
3836 } else if (time_after_eq(jiffies, group->sgp->next_update)) 3687 } else if (time_after_eq(jiffies, group->sgp->next_update))
3837 update_group_power(sd, this_cpu); 3688 update_group_power(env->sd, env->dst_cpu);
3838 } 3689 }
3839 3690
3840 /* Adjust by relative CPU power of the group */ 3691 /* Adjust by relative CPU power of the group */
@@ -3852,13 +3703,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3852 if (sgs->sum_nr_running) 3703 if (sgs->sum_nr_running)
3853 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 3704 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3854 3705
3855 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 3706 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3707 (max_nr_running - min_nr_running) > 1)
3856 sgs->group_imb = 1; 3708 sgs->group_imb = 1;
3857 3709
3858 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, 3710 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3859 SCHED_POWER_SCALE); 3711 SCHED_POWER_SCALE);
3860 if (!sgs->group_capacity) 3712 if (!sgs->group_capacity)
3861 sgs->group_capacity = fix_small_capacity(sd, group); 3713 sgs->group_capacity = fix_small_capacity(env->sd, group);
3862 sgs->group_weight = group->group_weight; 3714 sgs->group_weight = group->group_weight;
3863 3715
3864 if (sgs->group_capacity > sgs->sum_nr_running) 3716 if (sgs->group_capacity > sgs->sum_nr_running)
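
With the per-group statistics now tracking both the minimum and maximum nr_running, a group is flagged as internally imbalanced only when the load spread is at least one average task and the busiest CPU runs more than one task extra. A self-contained illustration of that test, with invented per-cpu numbers:

#include <stdio.h>

int main(void)
{
	/* Invented per-cpu statistics for one sched_group. */
	unsigned long max_cpu_load = 3072, min_cpu_load = 1024;
	unsigned long max_nr_running = 3, min_nr_running = 1;
	unsigned long avg_load_per_task = 1024;

	/* Same condition as the new code in update_sg_lb_stats(). */
	int group_imb = (max_cpu_load - min_cpu_load) >= avg_load_per_task &&
			(max_nr_running - min_nr_running) > 1;

	printf("group_imb = %d\n", group_imb);	/* prints 1 for these numbers */
	return 0;
}
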
@@ -3876,11 +3728,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3876 * Determine if @sg is a busier group than the previously selected 3728 * Determine if @sg is a busier group than the previously selected
3877 * busiest group. 3729 * busiest group.
3878 */ 3730 */
3879static bool update_sd_pick_busiest(struct sched_domain *sd, 3731static bool update_sd_pick_busiest(struct lb_env *env,
3880 struct sd_lb_stats *sds, 3732 struct sd_lb_stats *sds,
3881 struct sched_group *sg, 3733 struct sched_group *sg,
3882 struct sg_lb_stats *sgs, 3734 struct sg_lb_stats *sgs)
3883 int this_cpu)
3884{ 3735{
3885 if (sgs->avg_load <= sds->max_load) 3736 if (sgs->avg_load <= sds->max_load)
3886 return false; 3737 return false;
@@ -3896,8 +3747,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3896 * numbered CPUs in the group, therefore mark all groups 3747 * numbered CPUs in the group, therefore mark all groups
3897 * higher than ourself as busy. 3748 * higher than ourself as busy.
3898 */ 3749 */
3899 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 3750 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3900 this_cpu < group_first_cpu(sg)) { 3751 env->dst_cpu < group_first_cpu(sg)) {
3901 if (!sds->busiest) 3752 if (!sds->busiest)
3902 return true; 3753 return true;
3903 3754
@@ -3917,28 +3768,27 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3917 * @balance: Should we balance. 3768 * @balance: Should we balance.
3918 * @sds: variable to hold the statistics for this sched_domain. 3769 * @sds: variable to hold the statistics for this sched_domain.
3919 */ 3770 */
3920static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 3771static inline void update_sd_lb_stats(struct lb_env *env,
3921 enum cpu_idle_type idle, const struct cpumask *cpus, 3772 const struct cpumask *cpus,
3922 int *balance, struct sd_lb_stats *sds) 3773 int *balance, struct sd_lb_stats *sds)
3923{ 3774{
3924 struct sched_domain *child = sd->child; 3775 struct sched_domain *child = env->sd->child;
3925 struct sched_group *sg = sd->groups; 3776 struct sched_group *sg = env->sd->groups;
3926 struct sg_lb_stats sgs; 3777 struct sg_lb_stats sgs;
3927 int load_idx, prefer_sibling = 0; 3778 int load_idx, prefer_sibling = 0;
3928 3779
3929 if (child && child->flags & SD_PREFER_SIBLING) 3780 if (child && child->flags & SD_PREFER_SIBLING)
3930 prefer_sibling = 1; 3781 prefer_sibling = 1;
3931 3782
3932 init_sd_power_savings_stats(sd, sds, idle); 3783 load_idx = get_sd_load_idx(env->sd, env->idle);
3933 load_idx = get_sd_load_idx(sd, idle);
3934 3784
3935 do { 3785 do {
3936 int local_group; 3786 int local_group;
3937 3787
3938 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 3788 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3939 memset(&sgs, 0, sizeof(sgs)); 3789 memset(&sgs, 0, sizeof(sgs));
3940 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, 3790 update_sg_lb_stats(env, sg, load_idx, local_group,
3941 local_group, cpus, balance, &sgs); 3791 cpus, balance, &sgs);
3942 3792
3943 if (local_group && !(*balance)) 3793 if (local_group && !(*balance))
3944 return; 3794 return;
@@ -3966,7 +3816,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3966 sds->this_load_per_task = sgs.sum_weighted_load; 3816 sds->this_load_per_task = sgs.sum_weighted_load;
3967 sds->this_has_capacity = sgs.group_has_capacity; 3817 sds->this_has_capacity = sgs.group_has_capacity;
3968 sds->this_idle_cpus = sgs.idle_cpus; 3818 sds->this_idle_cpus = sgs.idle_cpus;
3969 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 3819 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3970 sds->max_load = sgs.avg_load; 3820 sds->max_load = sgs.avg_load;
3971 sds->busiest = sg; 3821 sds->busiest = sg;
3972 sds->busiest_nr_running = sgs.sum_nr_running; 3822 sds->busiest_nr_running = sgs.sum_nr_running;
@@ -3978,9 +3828,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3978 sds->group_imb = sgs.group_imb; 3828 sds->group_imb = sgs.group_imb;
3979 } 3829 }
3980 3830
3981 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
3982 sg = sg->next; 3831 sg = sg->next;
3983 } while (sg != sd->groups); 3832 } while (sg != env->sd->groups);
3984} 3833}
3985 3834
3986/** 3835/**
@@ -4008,24 +3857,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
4008 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3857 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4009 * @imbalance: returns amount of imbalanced due to packing. 3858 * @imbalance: returns amount of imbalanced due to packing.
4010 */ 3859 */
4011static int check_asym_packing(struct sched_domain *sd, 3860static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4012 struct sd_lb_stats *sds,
4013 int this_cpu, unsigned long *imbalance)
4014{ 3861{
4015 int busiest_cpu; 3862 int busiest_cpu;
4016 3863
4017 if (!(sd->flags & SD_ASYM_PACKING)) 3864 if (!(env->sd->flags & SD_ASYM_PACKING))
4018 return 0; 3865 return 0;
4019 3866
4020 if (!sds->busiest) 3867 if (!sds->busiest)
4021 return 0; 3868 return 0;
4022 3869
4023 busiest_cpu = group_first_cpu(sds->busiest); 3870 busiest_cpu = group_first_cpu(sds->busiest);
4024 if (this_cpu > busiest_cpu) 3871 if (env->dst_cpu > busiest_cpu)
4025 return 0; 3872 return 0;
4026 3873
4027 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, 3874 env->imbalance = DIV_ROUND_CLOSEST(
4028 SCHED_POWER_SCALE); 3875 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
3876
4029 return 1; 3877 return 1;
4030} 3878}
4031 3879
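
For SD_ASYM_PACKING the imbalance is now written straight into the environment as the busiest group's load rescaled by its cpu power. Worked numbers below; SCHED_POWER_SCALE is normally 1024, the other values are invented, and the DIV_ROUND_CLOSEST macro here is a simplified unsigned-only stand-in for the kernel's.

#include <stdio.h>

#define SCHED_POWER_SCALE	1024UL
#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))	/* unsigned-only sketch */

int main(void)
{
	unsigned long max_load = 1536;		/* invented group average load */
	unsigned long busiest_power = 2048;	/* e.g. a two-cpu group */

	unsigned long imbalance =
		DIV_ROUND_CLOSEST(max_load * busiest_power, SCHED_POWER_SCALE);

	printf("imbalance = %lu\n", imbalance);	/* 3072 */
	return 0;
}
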
@@ -4037,8 +3885,8 @@ static int check_asym_packing(struct sched_domain *sd,
4037 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3885 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4038 * @imbalance: Variable to store the imbalance. 3886 * @imbalance: Variable to store the imbalance.
4039 */ 3887 */
4040static inline void fix_small_imbalance(struct sd_lb_stats *sds, 3888static inline
4041 int this_cpu, unsigned long *imbalance) 3889void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4042{ 3890{
4043 unsigned long tmp, pwr_now = 0, pwr_move = 0; 3891 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4044 unsigned int imbn = 2; 3892 unsigned int imbn = 2;
@@ -4049,9 +3897,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4049 if (sds->busiest_load_per_task > 3897 if (sds->busiest_load_per_task >
4050 sds->this_load_per_task) 3898 sds->this_load_per_task)
4051 imbn = 1; 3899 imbn = 1;
4052 } else 3900 } else {
4053 sds->this_load_per_task = 3901 sds->this_load_per_task =
4054 cpu_avg_load_per_task(this_cpu); 3902 cpu_avg_load_per_task(env->dst_cpu);
3903 }
4055 3904
4056 scaled_busy_load_per_task = sds->busiest_load_per_task 3905 scaled_busy_load_per_task = sds->busiest_load_per_task
4057 * SCHED_POWER_SCALE; 3906 * SCHED_POWER_SCALE;
@@ -4059,7 +3908,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4059 3908
4060 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3909 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4061 (scaled_busy_load_per_task * imbn)) { 3910 (scaled_busy_load_per_task * imbn)) {
4062 *imbalance = sds->busiest_load_per_task; 3911 env->imbalance = sds->busiest_load_per_task;
4063 return; 3912 return;
4064 } 3913 }
4065 3914
@@ -4096,18 +3945,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4096 3945
4097 /* Move if we gain throughput */ 3946 /* Move if we gain throughput */
4098 if (pwr_move > pwr_now) 3947 if (pwr_move > pwr_now)
4099 *imbalance = sds->busiest_load_per_task; 3948 env->imbalance = sds->busiest_load_per_task;
4100} 3949}
4101 3950
4102/** 3951/**
4103 * calculate_imbalance - Calculate the amount of imbalance present within the 3952 * calculate_imbalance - Calculate the amount of imbalance present within the
4104 * groups of a given sched_domain during load balance. 3953 * groups of a given sched_domain during load balance.
3954 * @env: load balance environment
4105 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 3955 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4106 * @this_cpu: Cpu for which currently load balance is being performed.
4107 * @imbalance: The variable to store the imbalance.
4108 */ 3956 */
4109static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, 3957static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4110 unsigned long *imbalance)
4111{ 3958{
4112 unsigned long max_pull, load_above_capacity = ~0UL; 3959 unsigned long max_pull, load_above_capacity = ~0UL;
4113 3960
@@ -4123,8 +3970,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4123 * its cpu_power, while calculating max_load..) 3970 * its cpu_power, while calculating max_load..)
4124 */ 3971 */
4125 if (sds->max_load < sds->avg_load) { 3972 if (sds->max_load < sds->avg_load) {
4126 *imbalance = 0; 3973 env->imbalance = 0;
4127 return fix_small_imbalance(sds, this_cpu, imbalance); 3974 return fix_small_imbalance(env, sds);
4128 } 3975 }
4129 3976
4130 if (!sds->group_imb) { 3977 if (!sds->group_imb) {
@@ -4152,7 +3999,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4152 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 3999 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4153 4000
4154 /* How much load to actually move to equalise the imbalance */ 4001 /* How much load to actually move to equalise the imbalance */
4155 *imbalance = min(max_pull * sds->busiest->sgp->power, 4002 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4156 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 4003 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4157 / SCHED_POWER_SCALE; 4004 / SCHED_POWER_SCALE;
4158 4005
@@ -4162,8 +4009,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4162 * a think about bumping its value to force at least one task to be 4009 * a think about bumping its value to force at least one task to be
4163 * moved 4010 * moved
4164 */ 4011 */
4165 if (*imbalance < sds->busiest_load_per_task) 4012 if (env->imbalance < sds->busiest_load_per_task)
4166 return fix_small_imbalance(sds, this_cpu, imbalance); 4013 return fix_small_imbalance(env, sds);
4167 4014
4168} 4015}
4169 4016
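
calculate_imbalance() likewise stores its result in env->imbalance, taking the smaller of what the busiest group can give and what the local group can absorb, scaled back by SCHED_POWER_SCALE. A worked example with invented loads follows; the load_above_capacity clamp from the hunk above is left out to keep it short.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Invented domain statistics. */
	unsigned long max_load = 2048, avg_load = 1280, this_load = 768;
	unsigned long busiest_power = 1024, this_power = 1024;
	unsigned long max_pull = max_load - avg_load;		/* 768 */

	unsigned long imbalance = min_ul(max_pull * busiest_power,
					 (avg_load - this_load) * this_power)
					/ SCHED_POWER_SCALE;

	printf("imbalance = %lu\n", imbalance);	/* min(768, 512) = 512 */
	return 0;
}
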
@@ -4194,9 +4041,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4194 * put to idle by rebalancing its tasks onto our group. 4041 * put to idle by rebalancing its tasks onto our group.
4195 */ 4042 */
4196static struct sched_group * 4043static struct sched_group *
4197find_busiest_group(struct sched_domain *sd, int this_cpu, 4044find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4198 unsigned long *imbalance, enum cpu_idle_type idle,
4199 const struct cpumask *cpus, int *balance)
4200{ 4045{
4201 struct sd_lb_stats sds; 4046 struct sd_lb_stats sds;
4202 4047
@@ -4206,7 +4051,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4206 * Compute the various statistics relavent for load balancing at 4051 * Compute the various statistics relavent for load balancing at
4207 * this level. 4052 * this level.
4208 */ 4053 */
4209 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); 4054 update_sd_lb_stats(env, cpus, balance, &sds);
4210 4055
4211 /* 4056 /*
4212 * this_cpu is not the appropriate cpu to perform load balancing at 4057 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4215,8 +4060,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4215 if (!(*balance)) 4060 if (!(*balance))
4216 goto ret; 4061 goto ret;
4217 4062
4218 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && 4063 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4219 check_asym_packing(sd, &sds, this_cpu, imbalance)) 4064 check_asym_packing(env, &sds))
4220 return sds.busiest; 4065 return sds.busiest;
4221 4066
4222 /* There is no busy sibling group to pull tasks from */ 4067 /* There is no busy sibling group to pull tasks from */
@@ -4234,7 +4079,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4234 goto force_balance; 4079 goto force_balance;
4235 4080
4236 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 4081 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4237 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 4082 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4238 !sds.busiest_has_capacity) 4083 !sds.busiest_has_capacity)
4239 goto force_balance; 4084 goto force_balance;
4240 4085
@@ -4252,7 +4097,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4252 if (sds.this_load >= sds.avg_load) 4097 if (sds.this_load >= sds.avg_load)
4253 goto out_balanced; 4098 goto out_balanced;
4254 4099
4255 if (idle == CPU_IDLE) { 4100 if (env->idle == CPU_IDLE) {
4256 /* 4101 /*
4257 * This cpu is idle. If the busiest group load doesn't 4102 * This cpu is idle. If the busiest group load doesn't
4258 * have more tasks than the number of available cpu's and 4103 * have more tasks than the number of available cpu's and
@@ -4267,34 +4112,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4267 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 4112 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4268 * imbalance_pct to be conservative. 4113 * imbalance_pct to be conservative.
4269 */ 4114 */
4270 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 4115 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4271 goto out_balanced; 4116 goto out_balanced;
4272 } 4117 }
4273 4118
4274force_balance: 4119force_balance:
4275 /* Looks like there is an imbalance. Compute it */ 4120 /* Looks like there is an imbalance. Compute it */
4276 calculate_imbalance(&sds, this_cpu, imbalance); 4121 calculate_imbalance(env, &sds);
4277 return sds.busiest; 4122 return sds.busiest;
4278 4123
4279out_balanced: 4124out_balanced:
4280 /*
4281 * There is no obvious imbalance. But check if we can do some balancing
4282 * to save power.
4283 */
4284 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4285 return sds.busiest;
4286ret: 4125ret:
4287 *imbalance = 0; 4126 env->imbalance = 0;
4288 return NULL; 4127 return NULL;
4289} 4128}
4290 4129
4291/* 4130/*
4292 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4131 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4293 */ 4132 */
4294static struct rq * 4133static struct rq *find_busiest_queue(struct lb_env *env,
4295find_busiest_queue(struct sched_domain *sd, struct sched_group *group, 4134 struct sched_group *group,
4296 enum cpu_idle_type idle, unsigned long imbalance, 4135 const struct cpumask *cpus)
4297 const struct cpumask *cpus)
4298{ 4136{
4299 struct rq *busiest = NULL, *rq; 4137 struct rq *busiest = NULL, *rq;
4300 unsigned long max_load = 0; 4138 unsigned long max_load = 0;
@@ -4307,7 +4145,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4307 unsigned long wl; 4145 unsigned long wl;
4308 4146
4309 if (!capacity) 4147 if (!capacity)
4310 capacity = fix_small_capacity(sd, group); 4148 capacity = fix_small_capacity(env->sd, group);
4311 4149
4312 if (!cpumask_test_cpu(i, cpus)) 4150 if (!cpumask_test_cpu(i, cpus))
4313 continue; 4151 continue;
@@ -4319,7 +4157,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4319 * When comparing with imbalance, use weighted_cpuload() 4157 * When comparing with imbalance, use weighted_cpuload()
4320 * which is not scaled with the cpu power. 4158 * which is not scaled with the cpu power.
4321 */ 4159 */
4322 if (capacity && rq->nr_running == 1 && wl > imbalance) 4160 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4323 continue; 4161 continue;
4324 4162
4325 /* 4163 /*
@@ -4348,40 +4186,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4348/* Working cpumask for load_balance and load_balance_newidle. */ 4186/* Working cpumask for load_balance and load_balance_newidle. */
4349DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4187DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4350 4188
4351static int need_active_balance(struct sched_domain *sd, int idle, 4189static int need_active_balance(struct lb_env *env)
4352 int busiest_cpu, int this_cpu)
4353{ 4190{
4354 if (idle == CPU_NEWLY_IDLE) { 4191 struct sched_domain *sd = env->sd;
4192
4193 if (env->idle == CPU_NEWLY_IDLE) {
4355 4194
4356 /* 4195 /*
4357 * ASYM_PACKING needs to force migrate tasks from busy but 4196 * ASYM_PACKING needs to force migrate tasks from busy but
4358 * higher numbered CPUs in order to pack all tasks in the 4197 * higher numbered CPUs in order to pack all tasks in the
4359 * lowest numbered CPUs. 4198 * lowest numbered CPUs.
4360 */ 4199 */
4361 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) 4200 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4362 return 1; 4201 return 1;
4363
4364 /*
4365 * The only task running in a non-idle cpu can be moved to this
4366 * cpu in an attempt to completely freeup the other CPU
4367 * package.
4368 *
4369 * The package power saving logic comes from
4370 * find_busiest_group(). If there are no imbalance, then
4371 * f_b_g() will return NULL. However when sched_mc={1,2} then
4372 * f_b_g() will select a group from which a running task may be
4373 * pulled to this cpu in order to make the other package idle.
4374 * If there is no opportunity to make a package idle and if
4375 * there are no imbalance, then f_b_g() will return NULL and no
4376 * action will be taken in load_balance_newidle().
4377 *
4378 * Under normal task pull operation due to imbalance, there
4379 * will be more than one task in the source run queue and
4380 * move_tasks() will succeed. ld_moved will be true and this
4381 * active balance code will not be triggered.
4382 */
4383 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4384 return 0;
4385 } 4202 }
4386 4203
4387 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 4204 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4399,7 +4216,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4399{ 4216{
4400 int ld_moved, active_balance = 0; 4217 int ld_moved, active_balance = 0;
4401 struct sched_group *group; 4218 struct sched_group *group;
4402 unsigned long imbalance;
4403 struct rq *busiest; 4219 struct rq *busiest;
4404 unsigned long flags; 4220 unsigned long flags;
4405 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4221 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
@@ -4417,8 +4233,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4417 schedstat_inc(sd, lb_count[idle]); 4233 schedstat_inc(sd, lb_count[idle]);
4418 4234
4419redo: 4235redo:
4420 group = find_busiest_group(sd, this_cpu, &imbalance, idle, 4236 group = find_busiest_group(&env, cpus, balance);
4421 cpus, balance);
4422 4237
4423 if (*balance == 0) 4238 if (*balance == 0)
4424 goto out_balanced; 4239 goto out_balanced;
@@ -4428,7 +4243,7 @@ redo:
4428 goto out_balanced; 4243 goto out_balanced;
4429 } 4244 }
4430 4245
4431 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); 4246 busiest = find_busiest_queue(&env, group, cpus);
4432 if (!busiest) { 4247 if (!busiest) {
4433 schedstat_inc(sd, lb_nobusyq[idle]); 4248 schedstat_inc(sd, lb_nobusyq[idle]);
4434 goto out_balanced; 4249 goto out_balanced;
@@ -4436,7 +4251,7 @@ redo:
4436 4251
4437 BUG_ON(busiest == this_rq); 4252 BUG_ON(busiest == this_rq);
4438 4253
4439 schedstat_add(sd, lb_imbalance[idle], imbalance); 4254 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4440 4255
4441 ld_moved = 0; 4256 ld_moved = 0;
4442 if (busiest->nr_running > 1) { 4257 if (busiest->nr_running > 1) {
@@ -4447,10 +4262,9 @@ redo:
4447 * correctly treated as an imbalance. 4262 * correctly treated as an imbalance.
4448 */ 4263 */
4449 env.flags |= LBF_ALL_PINNED; 4264 env.flags |= LBF_ALL_PINNED;
4450 env.load_move = imbalance; 4265 env.src_cpu = busiest->cpu;
4451 env.src_cpu = busiest->cpu; 4266 env.src_rq = busiest;
4452 env.src_rq = busiest; 4267 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4453 env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
4454 4268
4455more_balance: 4269more_balance:
4456 local_irq_save(flags); 4270 local_irq_save(flags);
@@ -4492,7 +4306,7 @@ more_balance:
4492 if (idle != CPU_NEWLY_IDLE) 4306 if (idle != CPU_NEWLY_IDLE)
4493 sd->nr_balance_failed++; 4307 sd->nr_balance_failed++;
4494 4308
4495 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { 4309 if (need_active_balance(&env)) {
4496 raw_spin_lock_irqsave(&busiest->lock, flags); 4310 raw_spin_lock_irqsave(&busiest->lock, flags);
4497 4311
4498 /* don't kick the active_load_balance_cpu_stop, 4312 /* don't kick the active_load_balance_cpu_stop,
@@ -4519,10 +4333,11 @@ more_balance:
4519 } 4333 }
4520 raw_spin_unlock_irqrestore(&busiest->lock, flags); 4334 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4521 4335
4522 if (active_balance) 4336 if (active_balance) {
4523 stop_one_cpu_nowait(cpu_of(busiest), 4337 stop_one_cpu_nowait(cpu_of(busiest),
4524 active_load_balance_cpu_stop, busiest, 4338 active_load_balance_cpu_stop, busiest,
4525 &busiest->active_balance_work); 4339 &busiest->active_balance_work);
4340 }
4526 4341
4527 /* 4342 /*
4528 * We've kicked active balancing, reset the failure 4343 * We've kicked active balancing, reset the failure
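
Taken together, load_balance() now fills one struct lb_env up front and passes it to find_busiest_group(), find_busiest_queue(), move_tasks() and need_active_balance() instead of threading this_cpu, idle and a separate imbalance through each call. The mock below only illustrates that plumbing; every helper is faked and the struct keeps just the fields visible in these hunks.

#include <stdio.h>

/* Mock environment; not the kernel's struct lb_env. */
struct lb_env_mock {
	int	dst_cpu;
	int	src_cpu;
	long	imbalance;
};

static int mock_find_busiest_group(struct lb_env_mock *env)
{
	env->imbalance = 2048;	/* pretend calculate_imbalance() ran */
	return 3;		/* pretend cpu 3 hosts the busiest queue */
}

static void mock_move_tasks(struct lb_env_mock *env)
{
	env->imbalance -= 2048;	/* pretend enough load was pulled */
}

int main(void)
{
	struct lb_env_mock env = { .dst_cpu = 0 };

	env.src_cpu = mock_find_busiest_group(&env);
	mock_move_tasks(&env);
	printf("pulled from cpu %d to cpu %d, residual imbalance %ld\n",
	       env.src_cpu, env.dst_cpu, env.imbalance);
	return 0;
}
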
@@ -4703,104 +4518,15 @@ static struct {
4703 unsigned long next_balance; /* in jiffy units */ 4518 unsigned long next_balance; /* in jiffy units */
4704} nohz ____cacheline_aligned; 4519} nohz ____cacheline_aligned;
4705 4520
4706#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4521static inline int find_new_ilb(int call_cpu)
4707/**
4708 * lowest_flag_domain - Return lowest sched_domain containing flag.
4709 * @cpu: The cpu whose lowest level of sched domain is to
4710 * be returned.
4711 * @flag: The flag to check for the lowest sched_domain
4712 * for the given cpu.
4713 *
4714 * Returns the lowest sched_domain of a cpu which contains the given flag.
4715 */
4716static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4717{
4718 struct sched_domain *sd;
4719
4720 for_each_domain(cpu, sd)
4721 if (sd->flags & flag)
4722 break;
4723
4724 return sd;
4725}
4726
4727/**
4728 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4729 * @cpu: The cpu whose domains we're iterating over.
4730 * @sd: variable holding the value of the power_savings_sd
4731 * for cpu.
4732 * @flag: The flag to filter the sched_domains to be iterated.
4733 *
4734 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4735 * set, starting from the lowest sched_domain to the highest.
4736 */
4737#define for_each_flag_domain(cpu, sd, flag) \
4738 for (sd = lowest_flag_domain(cpu, flag); \
4739 (sd && (sd->flags & flag)); sd = sd->parent)
4740
4741/**
4742 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4743 * @cpu: The cpu which is nominating a new idle_load_balancer.
4744 *
4745 * Returns: Returns the id of the idle load balancer if it exists,
4746 * Else, returns >= nr_cpu_ids.
4747 *
4748 * This algorithm picks the idle load balancer such that it belongs to a
4749 * semi-idle powersavings sched_domain. The idea is to try and avoid
4750 * completely idle packages/cores just for the purpose of idle load balancing
4751 * when there are other idle cpu's which are better suited for that job.
4752 */
4753static int find_new_ilb(int cpu)
4754{ 4522{
4755 int ilb = cpumask_first(nohz.idle_cpus_mask); 4523 int ilb = cpumask_first(nohz.idle_cpus_mask);
4756 struct sched_group *ilbg;
4757 struct sched_domain *sd;
4758
4759 /*
4760 * Have idle load balancer selection from semi-idle packages only
4761 * when power-aware load balancing is enabled
4762 */
4763 if (!(sched_smt_power_savings || sched_mc_power_savings))
4764 goto out_done;
4765
4766 /*
4767 * Optimize for the case when we have no idle CPUs or only one
4768 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4769 */
4770 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
4771 goto out_done;
4772
4773 rcu_read_lock();
4774 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4775 ilbg = sd->groups;
4776
4777 do {
4778 if (ilbg->group_weight !=
4779 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4780 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4781 sched_group_cpus(ilbg));
4782 goto unlock;
4783 }
4784
4785 ilbg = ilbg->next;
4786
4787 } while (ilbg != sd->groups);
4788 }
4789unlock:
4790 rcu_read_unlock();
4791 4524
4792out_done:
4793 if (ilb < nr_cpu_ids && idle_cpu(ilb)) 4525 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4794 return ilb; 4526 return ilb;
4795 4527
4796 return nr_cpu_ids; 4528 return nr_cpu_ids;
4797} 4529}
4798#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4799static inline int find_new_ilb(int call_cpu)
4800{
4801 return nr_cpu_ids;
4802}
4803#endif
4804 4530
4805/* 4531/*
4806 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the 4532 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
@@ -5023,7 +4749,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5023 4749
5024 raw_spin_lock_irq(&this_rq->lock); 4750 raw_spin_lock_irq(&this_rq->lock);
5025 update_rq_clock(this_rq); 4751 update_rq_clock(this_rq);
5026 update_cpu_load(this_rq); 4752 update_idle_cpu_load(this_rq);
5027 raw_spin_unlock_irq(&this_rq->lock); 4753 raw_spin_unlock_irq(&this_rq->lock);
5028 4754
5029 rebalance_domains(balance_cpu, CPU_IDLE); 4755 rebalance_domains(balance_cpu, CPU_IDLE);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c957f289..b44d604b35d1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -4,7 +4,7 @@
4 * idle-task scheduling class. 4 * idle-task scheduling class.
5 * 5 *
6 * (NOTE: these are not related to SCHED_IDLE tasks which are 6 * (NOTE: these are not related to SCHED_IDLE tasks which are
7 * handled in sched_fair.c) 7 * handled in sched/fair.c)
8 */ 8 */
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 44af55e6d5d0..c5565c3c515f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1803,44 +1803,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1803static void set_cpus_allowed_rt(struct task_struct *p, 1803static void set_cpus_allowed_rt(struct task_struct *p,
1804 const struct cpumask *new_mask) 1804 const struct cpumask *new_mask)
1805{ 1805{
1806 int weight = cpumask_weight(new_mask); 1806 struct rq *rq;
1807 int weight;
1807 1808
1808 BUG_ON(!rt_task(p)); 1809 BUG_ON(!rt_task(p));
1809 1810
1810 /* 1811 if (!p->on_rq)
1811 * Update the migration status of the RQ if we have an RT task 1812 return;
1812 * which is running AND changing its weight value.
1813 */
1814 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1815 struct rq *rq = task_rq(p);
1816
1817 if (!task_current(rq, p)) {
1818 /*
1819 * Make sure we dequeue this task from the pushable list
1820 * before going further. It will either remain off of
1821 * the list because we are no longer pushable, or it
1822 * will be requeued.
1823 */
1824 if (p->rt.nr_cpus_allowed > 1)
1825 dequeue_pushable_task(rq, p);
1826 1813
1827 /* 1814 weight = cpumask_weight(new_mask);
1828 * Requeue if our weight is changing and still > 1
1829 */
1830 if (weight > 1)
1831 enqueue_pushable_task(rq, p);
1832 1815
1833 } 1816 /*
1817 * Only update if the process changes its state from whether it
1818 * can migrate or not.
1819 */
1820 if ((p->rt.nr_cpus_allowed > 1) == (weight > 1))
1821 return;
1834 1822
1835 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1823 rq = task_rq(p);
1836 rq->rt.rt_nr_migratory++;
1837 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1838 BUG_ON(!rq->rt.rt_nr_migratory);
1839 rq->rt.rt_nr_migratory--;
1840 }
1841 1824
1842 update_rt_migration(&rq->rt); 1825 /*
1826 * The process used to be able to migrate OR it can now migrate
1827 */
1828 if (weight <= 1) {
1829 if (!task_current(rq, p))
1830 dequeue_pushable_task(rq, p);
1831 BUG_ON(!rq->rt.rt_nr_migratory);
1832 rq->rt.rt_nr_migratory--;
1833 } else {
1834 if (!task_current(rq, p))
1835 enqueue_pushable_task(rq, p);
1836 rq->rt.rt_nr_migratory++;
1843 } 1837 }
1838
1839 update_rt_migration(&rq->rt);
1844} 1840}
1845 1841
1846/* Assumes rq->lock is held */ 1842/* Assumes rq->lock is held */
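
The rewritten set_cpus_allowed_rt() bails out early unless the task actually crosses the migratable boundary, that is unless the truth value of nr_cpus_allowed > 1 changes; only then are rt_nr_migratory and the pushable list touched. A small standalone check of that predicate, with invented cpumask weights:

#include <stdio.h>

/* Returns 1 when the old and new weights land on the same side of the
 * "can migrate" boundary, which is the early-return case above. */
static int migratability_unchanged(int old_weight, int new_weight)
{
	return (old_weight > 1) == (new_weight > 1);
}

int main(void)
{
	printf("%d\n", migratability_unchanged(1, 1));	/* 1: nothing to do */
	printf("%d\n", migratability_unchanged(4, 2));	/* 1: still migratable */
	printf("%d\n", migratability_unchanged(4, 1));	/* 0: update the counts */
	return 0;
}
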
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba4d52e..ba9dccfd24ce 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -201,7 +201,7 @@ struct cfs_bandwidth { };
201/* CFS-related fields in a runqueue */ 201/* CFS-related fields in a runqueue */
202struct cfs_rq { 202struct cfs_rq {
203 struct load_weight load; 203 struct load_weight load;
204 unsigned long nr_running, h_nr_running; 204 unsigned int nr_running, h_nr_running;
205 205
206 u64 exec_clock; 206 u64 exec_clock;
207 u64 min_vruntime; 207 u64 min_vruntime;
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void)
279/* Real-Time classes' related field in a runqueue: */ 279/* Real-Time classes' related field in a runqueue: */
280struct rt_rq { 280struct rt_rq {
281 struct rt_prio_array active; 281 struct rt_prio_array active;
282 unsigned long rt_nr_running; 282 unsigned int rt_nr_running;
283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
284 struct { 284 struct {
285 int curr; /* highest queued rt task prio */ 285 int curr; /* highest queued rt task prio */
@@ -353,7 +353,7 @@ struct rq {
353 * nr_running and cpu_load should be in the same cacheline because 353 * nr_running and cpu_load should be in the same cacheline because
354 * remote CPUs use both these fields when doing load calculation. 354 * remote CPUs use both these fields when doing load calculation.
355 */ 355 */
356 unsigned long nr_running; 356 unsigned int nr_running;
357 #define CPU_LOAD_IDX_MAX 5 357 #define CPU_LOAD_IDX_MAX 5
358 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 358 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
359 unsigned long last_load_update_tick; 359 unsigned long last_load_update_tick;
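
Since rq->nr_running and its cfs/rt counterparts shrink from unsigned long to unsigned int, any debug or trace output of these fields needs a matching %u conversion on LP64 targets where the two types differ in width. A quick userspace illustration with a stand-in struct, not the kernel's:

#include <stdio.h>

struct rq_sketch {			/* stand-in for the fields changed above */
	unsigned int nr_running;
};

int main(void)
{
	struct rq_sketch rq = { .nr_running = 3 };

	/* %u matches the new unsigned int type; %lu would now be a
	 * format mismatch on 64-bit targets. */
	printf("nr_running=%u\n", rq.nr_running);
	return 0;
}
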
@@ -876,7 +876,7 @@ extern void resched_cpu(int cpu);
876extern struct rt_bandwidth def_rt_bandwidth; 876extern struct rt_bandwidth def_rt_bandwidth;
877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
878 878
879extern void update_cpu_load(struct rq *this_rq); 879extern void update_idle_cpu_load(struct rq *this_rq);
880 880
881#ifdef CONFIG_CGROUP_CPUACCT 881#ifdef CONFIG_CGROUP_CPUACCT
882#include <linux/cgroup.h> 882#include <linux/cgroup.h>
diff --git a/tools/power/cpupower/man/cpupower-set.1 b/tools/power/cpupower/man/cpupower-set.1
index c4954a9fe4e7..9dbd536518ab 100644
--- a/tools/power/cpupower/man/cpupower-set.1
+++ b/tools/power/cpupower/man/cpupower-set.1
@@ -85,15 +85,6 @@ Possible values are:
85savings 85savings
86.RE 86.RE
87 87
88sched_mc_power_savings is dependent upon SCHED_MC, which is
89itself architecture dependent.
90
91sched_smt_power_savings is dependent upon SCHED_SMT, which
92is itself architecture dependent.
93
94The two files are independent of each other. It is possible
95that one file may be present without the other.
96
97.SH "SEE ALSO" 88.SH "SEE ALSO"
98cpupower-info(1), cpupower-monitor(1), powertop(1) 89cpupower-info(1), cpupower-monitor(1), powertop(1)
99.PP 90.PP
diff --git a/tools/power/cpupower/utils/helpers/sysfs.c b/tools/power/cpupower/utils/helpers/sysfs.c
index c6343024a611..96e28c124b5c 100644
--- a/tools/power/cpupower/utils/helpers/sysfs.c
+++ b/tools/power/cpupower/utils/helpers/sysfs.c
@@ -362,22 +362,7 @@ char *sysfs_get_cpuidle_driver(void)
362 */ 362 */
363int sysfs_get_sched(const char *smt_mc) 363int sysfs_get_sched(const char *smt_mc)
364{ 364{
365 unsigned long value; 365 return -ENODEV;
366 char linebuf[MAX_LINE_LEN];
367 char *endp;
368 char path[SYSFS_PATH_MAX];
369
370 if (strcmp("mc", smt_mc) && strcmp("smt", smt_mc))
371 return -EINVAL;
372
373 snprintf(path, sizeof(path),
374 PATH_TO_CPU "sched_%s_power_savings", smt_mc);
375 if (sysfs_read_file(path, linebuf, MAX_LINE_LEN) == 0)
376 return -1;
377 value = strtoul(linebuf, &endp, 0);
378 if (endp == linebuf || errno == ERANGE)
379 return -1;
380 return value;
381} 366}
382 367
383/* 368/*
@@ -388,21 +373,5 @@ int sysfs_get_sched(const char *smt_mc)
388 */ 373 */
389int sysfs_set_sched(const char *smt_mc, int val) 374int sysfs_set_sched(const char *smt_mc, int val)
390{ 375{
391 char linebuf[MAX_LINE_LEN]; 376 return -ENODEV;
392 char path[SYSFS_PATH_MAX];
393 struct stat statbuf;
394
395 if (strcmp("mc", smt_mc) && strcmp("smt", smt_mc))
396 return -EINVAL;
397
398 snprintf(path, sizeof(path),
399 PATH_TO_CPU "sched_%s_power_savings", smt_mc);
400 sprintf(linebuf, "%d", val);
401
402 if (stat(path, &statbuf) != 0)
403 return -ENODEV;
404
405 if (sysfs_write_file(path, linebuf, MAX_LINE_LEN) == 0)
406 return -1;
407 return 0;
408} 377}
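
With the sched_mc/smt_power_savings sysfs files removed, both cpupower helpers now report -ENODEV unconditionally, and callers should treat the knob as absent. A hypothetical caller-side check follows; the helper here is a stub reproducing the post-patch behaviour, not the real code in tools/power/cpupower/utils/helpers/sysfs.c.

#include <errno.h>
#include <stdio.h>

/* Stub with the post-patch behaviour of the real helper. */
static int sysfs_get_sched(const char *smt_mc)
{
	(void)smt_mc;
	return -ENODEV;
}

int main(void)
{
	int ret = sysfs_get_sched("mc");

	if (ret == -ENODEV)
		printf("sched_mc_power_savings: not supported by this kernel\n");
	return 0;
}
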