author     Siddha, Suresh B <suresh.b.siddha@intel.com>    2006-06-27 05:54:42 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>           2006-06-27 20:32:45 -0400
commit     5c45bf279d378d436ce45825c0f136696c7b6109
tree       80e2fcf4866b84fccb787562e1a83b16f4bc8850
parent     369381694ddcf03f1de403501c8b97099b5109ec
[PATCH] sched: mc/smt power savings sched policy
The sysfs entries 'sched_mc_power_savings' and 'sched_smt_power_savings' in
/sys/devices/system/cpu/ control the MC/SMT power savings policy for the
scheduler.
Based on the values of these controls (1 = enable, 0 = disable), the sched
group cpu power will be determined for the different domains. When the power
savings policy is enabled and the system is lightly loaded, the scheduler
will minimize the number of physical packages/cpu cores carrying the load,
and thus conserve power (with a performance impact that depends on the
workload characteristics; see the OLS 2005 CMP kernel scheduler paper for
more details).
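As an illustration only (not part of this patch), user space can toggle the
new controls by writing '0' or '1' to the sysfs files named above; the files
are created only on kernels built with CONFIG_SCHED_MC/CONFIG_SCHED_SMT and
on systems where mc_capable()/smt_capable() holds. A minimal sketch in C:

  /* Illustrative only: enable the MC power savings policy from user space. */
  #include <stdio.h>

  int main(void)
  {
          const char *path = "/sys/devices/system/cpu/sched_mc_power_savings";
          FILE *f = fopen(path, "w");

          if (!f) {
                  perror(path);
                  return 1;
          }
          fputs("1", f);  /* 1 = enable, 0 = disable */
          return fclose(f) ? 1 : 0;
  }

Writing to either file ends up in sched_power_savings_store(), which calls
arch_reinit_sched_domains(), so the new policy takes effect by rebuilding the
sched domains.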
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Con Kolivas <kernel@kolivas.org>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--  arch/i386/kernel/smpboot.c      |   8
-rw-r--r--  arch/x86_64/kernel/smpboot.c    |   8
-rw-r--r--  drivers/base/cpu.c              |  10
-rw-r--r--  include/asm-i386/topology.h     |   5
-rw-r--r--  include/asm-ia64/topology.h     |   1
-rw-r--r--  include/asm-powerpc/topology.h  |   5
-rw-r--r--  include/asm-sparc64/topology.h  |   3
-rw-r--r--  include/asm-x86_64/topology.h   |   2
-rw-r--r--  include/linux/sched.h           |  10
-rw-r--r--  include/linux/topology.h        |   3
-rw-r--r--  kernel/sched.c                  | 240

11 files changed, 262 insertions, 33 deletions
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index ab5275beddf7..89e7315e539c 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -448,10 +448,12 @@ cpumask_t cpu_coregroup_map(int cpu)
         struct cpuinfo_x86 *c = cpu_data + cpu;
         /*
          * For perf, we return last level cache shared map.
-         * TBD: when power saving sched policy is added, we will return
-         *      cpu_core_map when power saving policy is enabled
+         * And for power savings, we return cpu_core_map
          */
-        return c->llc_shared_map;
+        if (sched_mc_power_savings || sched_smt_power_savings)
+                return cpu_core_map[cpu];
+        else
+                return c->llc_shared_map;
 }
 
 /* representing cpus for which sibling maps can be computed */
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 4e9755179ecf..540c0ccbcccc 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -455,10 +455,12 @@ cpumask_t cpu_coregroup_map(int cpu)
         struct cpuinfo_x86 *c = cpu_data + cpu;
         /*
          * For perf, we return last level cache shared map.
-         * TBD: when power saving sched policy is added, we will return
-         *      cpu_core_map when power saving policy is enabled
+         * And for power savings, we return cpu_core_map
          */
-        return c->llc_shared_map;
+        if (sched_mc_power_savings || sched_smt_power_savings)
+                return cpu_core_map[cpu];
+        else
+                return c->llc_shared_map;
 }
 
 /* representing cpus for which sibling maps can be computed */
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 3972d8ac9786..4bef76a2f3f2 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -143,5 +143,13 @@ EXPORT_SYMBOL_GPL(get_cpu_sysdev);
 
 int __init cpu_dev_init(void)
 {
-        return sysdev_class_register(&cpu_sysdev_class);
+        int err;
+
+        err = sysdev_class_register(&cpu_sysdev_class);
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+        if (!err)
+                err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class);
+#endif
+
+        return err;
 }
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index aa4185ee81fb..6adbd9b1ae88 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -112,4 +112,9 @@ extern unsigned long node_remap_size[];
 
 extern cpumask_t cpu_coregroup_map(int cpu);
 
+#ifdef CONFIG_SMP
+#define mc_capable() (boot_cpu_data.x86_max_cores > 1)
+#define smt_capable() (smp_num_siblings > 1)
+#endif
+
 #endif /* _ASM_I386_TOPOLOGY_H */
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 616b5ed2aa72..937c21257523 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -112,6 +112,7 @@ void build_cpu_to_node_map(void);
 #define topology_core_id(cpu) (cpu_data(cpu)->core_id)
 #define topology_core_siblings(cpu) (cpu_core_map[cpu])
 #define topology_thread_siblings(cpu) (cpu_sibling_map[cpu])
+#define smt_capable() (smp_num_siblings > 1)
 #endif
 
 #include <asm-generic/topology.h>
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h
index 92f3e5507d22..bbc3844b086f 100644
--- a/include/asm-powerpc/topology.h
+++ b/include/asm-powerpc/topology.h
@@ -93,5 +93,10 @@ static inline void sysfs_remove_device_from_node(struct sys_device *dev,
 
 #endif /* CONFIG_NUMA */
 
+#ifdef CONFIG_SMP
+#include <asm/cputable.h>
+#define smt_capable() (cpu_has_feature(CPU_FTR_SMT))
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/include/asm-sparc64/topology.h b/include/asm-sparc64/topology.h
index 0e234e201bd6..98a6c613589d 100644
--- a/include/asm-sparc64/topology.h
+++ b/include/asm-sparc64/topology.h
@@ -1,6 +1,9 @@
 #ifndef _ASM_SPARC64_TOPOLOGY_H
 #define _ASM_SPARC64_TOPOLOGY_H
 
+#include <asm/spitfire.h>
+#define smt_capable() (tlb_type == hypervisor)
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_SPARC64_TOPOLOGY_H */
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index c4e46e7fa7ba..6e7a2e976b04 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -59,6 +59,8 @@ extern int __node_distance(int, int);
 #define topology_core_id(cpu) (cpu_data[cpu].cpu_core_id)
 #define topology_core_siblings(cpu) (cpu_core_map[cpu])
 #define topology_thread_siblings(cpu) (cpu_sibling_map[cpu])
+#define mc_capable() (boot_cpu_data.x86_max_cores > 1)
+#define smt_capable() (smp_num_siblings > 1)
 #endif
 
 #include <asm-generic/topology.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ab8ffc54423a..0bc81a151e50 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -570,6 +570,11 @@ enum idle_type
 #define SD_WAKE_AFFINE      32  /* Wake task to waking CPU */
 #define SD_WAKE_BALANCE     64  /* Perform balancing at task wakeup */
 #define SD_SHARE_CPUPOWER   128 /* Domain members share cpu power */
+#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
+
+#define BALANCE_FOR_POWER ((sched_mc_power_savings || sched_smt_power_savings) \
+                           ? SD_POWERSAVINGS_BALANCE : 0)
+
 
 struct sched_group {
         struct sched_group *next;   /* Must be a circular list */
@@ -1412,6 +1417,11 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 
+#include <linux/sysdev.h>
+extern int sched_mc_power_savings, sched_smt_power_savings;
+extern struct sysdev_attribute attr_sched_mc_power_savings, attr_sched_smt_power_savings;
+extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls);
+
 extern void normalize_rt_tasks(void);
 
 #ifdef CONFIG_PM
diff --git a/include/linux/topology.h b/include/linux/topology.h
index a305ae2e44b6..ec1eca85290a 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -134,7 +134,8 @@
         .flags = SD_LOAD_BALANCE        \
                 | SD_BALANCE_NEWIDLE    \
                 | SD_BALANCE_EXEC       \
-                | SD_WAKE_AFFINE,       \
+                | SD_WAKE_AFFINE        \
+                | BALANCE_FOR_POWER,    \
         .last_balance = jiffies,        \
         .balance_interval = 1,          \
         .nr_balance_failed = 0,         \
diff --git a/kernel/sched.c b/kernel/sched.c
index 122b75584a13..54fa282657cc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1162,6 +1162,11 @@ static int sched_balance_self(int cpu, int flag)
         struct sched_domain *tmp, *sd = NULL;
 
         for_each_domain(cpu, tmp) {
+                /*
+                 * If power savings logic is enabled for a domain, stop there.
+                 */
+                if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+                        break;
                 if (tmp->flags & flag)
                         sd = tmp;
         }
@@ -2082,6 +2087,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         unsigned long busiest_load_per_task, busiest_nr_running;
         unsigned long this_load_per_task, this_nr_running;
         int load_idx;
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+        int power_savings_balance = 1;
+        unsigned long leader_nr_running = 0, min_load_per_task = 0;
+        unsigned long min_nr_running = ULONG_MAX;
+        struct sched_group *group_min = NULL, *group_leader = NULL;
+#endif
 
         max_load = this_load = total_load = total_pwr = 0;
         busiest_load_per_task = busiest_nr_running = 0;
@@ -2094,7 +2105,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         load_idx = sd->idle_idx;
 
         do {
-                unsigned long load;
+                unsigned long load, group_capacity;
                 int local_group;
                 int i;
                 unsigned long sum_nr_running, sum_weighted_load;
@@ -2127,18 +2138,76 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 /* Adjust by relative CPU power of the group */
                 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
+                group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+
                 if (local_group) {
                         this_load = avg_load;
                         this = group;
                         this_nr_running = sum_nr_running;
                         this_load_per_task = sum_weighted_load;
                 } else if (avg_load > max_load &&
-                           sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) {
+                           sum_nr_running > group_capacity) {
                         max_load = avg_load;
                         busiest = group;
                         busiest_nr_running = sum_nr_running;
                         busiest_load_per_task = sum_weighted_load;
                 }
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+                /*
+                 * Busy processors will not participate in power savings
+                 * balance.
+                 */
+                if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+                        goto group_next;
+
+                /*
+                 * If the local group is idle or completely loaded
+                 * no need to do power savings balance at this domain
+                 */
+                if (local_group && (this_nr_running >= group_capacity ||
+                                    !this_nr_running))
+                        power_savings_balance = 0;
+
+                /*
+                 * If a group is already running at full capacity or idle,
+                 * don't include that group in power savings calculations
+                 */
+                if (!power_savings_balance || sum_nr_running >= group_capacity
+                    || !sum_nr_running)
+                        goto group_next;
+
+                /*
+                 * Calculate the group which has the least non-idle load.
+                 * This is the group from where we need to pick up the load
+                 * for saving power
+                 */
+                if ((sum_nr_running < min_nr_running) ||
+                    (sum_nr_running == min_nr_running &&
+                     first_cpu(group->cpumask) <
+                     first_cpu(group_min->cpumask))) {
+                        group_min = group;
+                        min_nr_running = sum_nr_running;
+                        min_load_per_task = sum_weighted_load /
+                                            sum_nr_running;
+                }
+
+                /*
+                 * Calculate the group which is almost near its
+                 * capacity but still has some space to pick up some load
+                 * from other group and save more power
+                 */
+                if (sum_nr_running <= group_capacity - 1)
+                        if (sum_nr_running > leader_nr_running ||
+                            (sum_nr_running == leader_nr_running &&
+                             first_cpu(group->cpumask) >
+                             first_cpu(group_leader->cpumask))) {
+                                group_leader = group;
+                                leader_nr_running = sum_nr_running;
+                        }
+
+group_next:
+#endif
                 group = group->next;
         } while (group != sd->groups);
 
@@ -2247,7 +2316,16 @@ small_imbalance:
         return busiest;
 
 out_balanced:
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+        if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+                goto ret;
 
+        if (this == group_leader && group_leader != group_min) {
+                *imbalance = min_load_per_task;
+                return group_min;
+        }
+ret:
+#endif
         *imbalance = 0;
         return NULL;
 }
@@ -2300,7 +2378,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
         int active_balance = 0;
         int sd_idle = 0;
 
-        if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+        if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+            !sched_smt_power_savings)
                 sd_idle = 1;
 
         schedstat_inc(sd, lb_cnt[idle]);
@@ -2389,7 +2468,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                         sd->balance_interval *= 2;
         }
 
-        if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+        if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+            !sched_smt_power_savings)
                 return -1;
         return nr_moved;
 
@@ -2404,7 +2484,7 @@ out_one_pinned:
             (sd->balance_interval < sd->max_interval))
                 sd->balance_interval *= 2;
 
-        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
                 return -1;
         return 0;
 }
@@ -2425,7 +2505,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
         int nr_moved = 0;
         int sd_idle = 0;
 
-        if (sd->flags & SD_SHARE_CPUPOWER)
+        if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
                 sd_idle = 1;
 
         schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2466,7 +2546,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 
 out_balanced:
         schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
-        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
                 return -1;
         sd->nr_balance_failed = 0;
         return 0;
@@ -5732,6 +5812,7 @@ static cpumask_t sched_domain_node_span(int node)
 }
 #endif
 
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 /*
  * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
  * can switch it on easily if needed.
@@ -6113,37 +6194,72 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #endif
 
         /* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
         for_each_cpu_mask(i, *cpu_map) {
-                int power;
                 struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
                 sd = &per_cpu(cpu_domains, i);
-                power = SCHED_LOAD_SCALE;
-                sd->groups->cpu_power = power;
+                sd->groups->cpu_power = SCHED_LOAD_SCALE;
+        }
 #endif
 #ifdef CONFIG_SCHED_MC
+        for_each_cpu_mask(i, *cpu_map) {
+                int power;
+                struct sched_domain *sd;
                 sd = &per_cpu(core_domains, i);
-                power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+                if (sched_smt_power_savings)
+                        power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+                else
+                        power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
                                             * SCHED_LOAD_SCALE / 10;
                 sd->groups->cpu_power = power;
+        }
+#endif
 
+        for_each_cpu_mask(i, *cpu_map) {
+                struct sched_domain *sd;
+#ifdef CONFIG_SCHED_MC
                 sd = &per_cpu(phys_domains, i);
+                if (i != first_cpu(sd->groups->cpumask))
+                        continue;
 
-                /*
-                 * This has to be < 2 * SCHED_LOAD_SCALE
-                 * Lets keep it SCHED_LOAD_SCALE, so that
-                 * while calculating NUMA group's cpu_power
-                 * we can simply do
-                 *  numa_group->cpu_power += phys_group->cpu_power;
-                 *
-                 * See "only add power once for each physical pkg"
-                 * comment below
-                 */
-                sd->groups->cpu_power = SCHED_LOAD_SCALE;
+                sd->groups->cpu_power = 0;
+                if (sched_mc_power_savings || sched_smt_power_savings) {
+                        int j;
+
+                        for_each_cpu_mask(j, sd->groups->cpumask) {
+                                struct sched_domain *sd1;
+                                sd1 = &per_cpu(core_domains, j);
+                                /*
+                                 * for each core we will add once
+                                 * to the group in physical domain
+                                 */
+                                if (j != first_cpu(sd1->groups->cpumask))
+                                        continue;
+
+                                if (sched_smt_power_savings)
+                                        sd->groups->cpu_power += sd1->groups->cpu_power;
+                                else
+                                        sd->groups->cpu_power += SCHED_LOAD_SCALE;
+                        }
+                } else
+                        /*
+                         * This has to be < 2 * SCHED_LOAD_SCALE
+                         * Lets keep it SCHED_LOAD_SCALE, so that
+                         * while calculating NUMA group's cpu_power
+                         * we can simply do
+                         *  numa_group->cpu_power += phys_group->cpu_power;
+                         *
+                         * See "only add power once for each physical pkg"
+                         * comment below
+                         */
+                        sd->groups->cpu_power = SCHED_LOAD_SCALE;
 #else
+                int power;
                 sd = &per_cpu(phys_domains, i);
-                power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-                        (cpus_weight(sd->groups->cpumask)-1) / 10;
+                if (sched_smt_power_savings)
+                        power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+                else
+                        power = SCHED_LOAD_SCALE;
                 sd->groups->cpu_power = power;
 #endif
         }
@@ -6244,6 +6360,80 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
         return err;
 }
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+int arch_reinit_sched_domains(void)
+{
+        int err;
+
+        lock_cpu_hotplug();
+        detach_destroy_domains(&cpu_online_map);
+        err = arch_init_sched_domains(&cpu_online_map);
+        unlock_cpu_hotplug();
+
+        return err;
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+        int ret;
+
+        if (buf[0] != '0' && buf[0] != '1')
+                return -EINVAL;
+
+        if (smt)
+                sched_smt_power_savings = (buf[0] == '1');
+        else
+                sched_mc_power_savings = (buf[0] == '1');
+
+        ret = arch_reinit_sched_domains();
+
+        return ret ? ret : count;
+}
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+        int err = 0;
+#ifdef CONFIG_SCHED_SMT
+        if (smt_capable())
+                err = sysfs_create_file(&cls->kset.kobj,
+                                        &attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+        if (!err && mc_capable())
+                err = sysfs_create_file(&cls->kset.kobj,
+                                        &attr_sched_mc_power_savings.attr);
+#endif
+        return err;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+{
+        return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+        return sched_power_savings_store(buf, count, 0);
+}
+SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+            sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+{
+        return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+        return sched_power_savings_store(buf, count, 1);
+}
+SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+            sched_smt_power_savings_store);
+#endif
+
+
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy. The domains