author	Siddha, Suresh B <suresh.b.siddha@intel.com>	2006-06-27 05:54:42 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-06-27 20:32:45 -0400
commit	5c45bf279d378d436ce45825c0f136696c7b6109 (patch)
tree	80e2fcf4866b84fccb787562e1a83b16f4bc8850
parent	369381694ddcf03f1de403501c8b97099b5109ec (diff)
[PATCH] sched: mc/smt power savings sched policy
The sysfs entries 'sched_mc_power_savings' and 'sched_smt_power_savings' in
/sys/devices/system/cpu/ control the MC/SMT power savings policy for the
scheduler.

Based on the values (1 = enable, 0 = disable) of these controls, the sched
groups' cpu power is determined for the different domains.  When the power
savings policy is enabled and the system is under light load, the scheduler
minimizes the number of physical packages/cpu cores carrying the load and
thus conserves power (with a perf impact that depends on the workload
characteristics; see the OLS 2005 CMP kernel scheduler paper for more
details).

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Con Kolivas <kernel@kolivas.org>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
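For illustration only (not part of this patch): a minimal user-space sketch of
toggling one of these controls, assuming the kernel was built with
CONFIG_SCHED_MC/CONFIG_SCHED_SMT so that the sysfs entry exists.

    /*
     * Illustrative sketch: enable the MC power-savings policy from user space
     * by writing "1" to the sysfs control described in the changelog above.
     * The path and the accepted values ("0"/"1") come from this changelog;
     * the error handling is an assumption, not part of the patch.
     */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/devices/system/cpu/sched_mc_power_savings", "w");

            if (!f) {
                    perror("sched_mc_power_savings");
                    return 1;
            }
            fputs("1", f);          /* "1" enables, "0" disables the policy */
            return fclose(f) ? 1 : 0;
    }

The same applies to sched_smt_power_savings.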
-rw-r--r--	arch/i386/kernel/smpboot.c	8
-rw-r--r--	arch/x86_64/kernel/smpboot.c	8
-rw-r--r--	drivers/base/cpu.c	10
-rw-r--r--	include/asm-i386/topology.h	5
-rw-r--r--	include/asm-ia64/topology.h	1
-rw-r--r--	include/asm-powerpc/topology.h	5
-rw-r--r--	include/asm-sparc64/topology.h	3
-rw-r--r--	include/asm-x86_64/topology.h	2
-rw-r--r--	include/linux/sched.h	10
-rw-r--r--	include/linux/topology.h	3
-rw-r--r--	kernel/sched.c	240
11 files changed, 262 insertions, 33 deletions
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index ab5275beddf7..89e7315e539c 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -448,10 +448,12 @@ cpumask_t cpu_coregroup_map(int cpu)
 	struct cpuinfo_x86 *c = cpu_data + cpu;
 	/*
 	 * For perf, we return last level cache shared map.
-	 * TBD: when power saving sched policy is added, we will return
-	 *      cpu_core_map when power saving policy is enabled
+	 * And for power savings, we return cpu_core_map
 	 */
-	return c->llc_shared_map;
+	if (sched_mc_power_savings || sched_smt_power_savings)
+		return cpu_core_map[cpu];
+	else
+		return c->llc_shared_map;
 }
 
 /* representing cpus for which sibling maps can be computed */
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 4e9755179ecf..540c0ccbcccc 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -455,10 +455,12 @@ cpumask_t cpu_coregroup_map(int cpu)
 	struct cpuinfo_x86 *c = cpu_data + cpu;
 	/*
 	 * For perf, we return last level cache shared map.
-	 * TBD: when power saving sched policy is added, we will return
-	 *      cpu_core_map when power saving policy is enabled
+	 * And for power savings, we return cpu_core_map
 	 */
-	return c->llc_shared_map;
+	if (sched_mc_power_savings || sched_smt_power_savings)
+		return cpu_core_map[cpu];
+	else
+		return c->llc_shared_map;
 }
 
 /* representing cpus for which sibling maps can be computed */
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 3972d8ac9786..4bef76a2f3f2 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -143,5 +143,13 @@ EXPORT_SYMBOL_GPL(get_cpu_sysdev);
 
 int __init cpu_dev_init(void)
 {
-	return sysdev_class_register(&cpu_sysdev_class);
+	int err;
+
+	err = sysdev_class_register(&cpu_sysdev_class);
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	if (!err)
+		err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class);
+#endif
+
+	return err;
 }
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index aa4185ee81fb..6adbd9b1ae88 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -112,4 +112,9 @@ extern unsigned long node_remap_size[];
 
 extern cpumask_t cpu_coregroup_map(int cpu);
 
+#ifdef CONFIG_SMP
+#define mc_capable()	(boot_cpu_data.x86_max_cores > 1)
+#define smt_capable()	(smp_num_siblings > 1)
+#endif
+
 #endif /* _ASM_I386_TOPOLOGY_H */
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 616b5ed2aa72..937c21257523 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -112,6 +112,7 @@ void build_cpu_to_node_map(void);
 #define topology_core_id(cpu)		(cpu_data(cpu)->core_id)
 #define topology_core_siblings(cpu)	(cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)	(cpu_sibling_map[cpu])
+#define smt_capable()			(smp_num_siblings > 1)
 #endif
 
 #include <asm-generic/topology.h>
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h
index 92f3e5507d22..bbc3844b086f 100644
--- a/include/asm-powerpc/topology.h
+++ b/include/asm-powerpc/topology.h
@@ -93,5 +93,10 @@ static inline void sysfs_remove_device_from_node(struct sys_device *dev,
 
 #endif /* CONFIG_NUMA */
 
+#ifdef CONFIG_SMP
+#include <asm/cputable.h>
+#define smt_capable()		(cpu_has_feature(CPU_FTR_SMT))
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/include/asm-sparc64/topology.h b/include/asm-sparc64/topology.h
index 0e234e201bd6..98a6c613589d 100644
--- a/include/asm-sparc64/topology.h
+++ b/include/asm-sparc64/topology.h
@@ -1,6 +1,9 @@
 #ifndef _ASM_SPARC64_TOPOLOGY_H
 #define _ASM_SPARC64_TOPOLOGY_H
 
+#include <asm/spitfire.h>
+#define smt_capable()	(tlb_type == hypervisor)
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_SPARC64_TOPOLOGY_H */
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index c4e46e7fa7ba..6e7a2e976b04 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -59,6 +59,8 @@ extern int __node_distance(int, int);
 #define topology_core_id(cpu)		(cpu_data[cpu].cpu_core_id)
 #define topology_core_siblings(cpu)	(cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)	(cpu_sibling_map[cpu])
+#define mc_capable()			(boot_cpu_data.x86_max_cores > 1)
+#define smt_capable()			(smp_num_siblings > 1)
 #endif
 
 #include <asm-generic/topology.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ab8ffc54423a..0bc81a151e50 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -570,6 +570,11 @@ enum idle_type
 #define SD_WAKE_AFFINE		32	/* Wake task to waking CPU */
 #define SD_WAKE_BALANCE		64	/* Perform balancing at task wakeup */
 #define SD_SHARE_CPUPOWER	128	/* Domain members share cpu power */
+#define SD_POWERSAVINGS_BALANCE	256	/* Balance for power savings */
+
+#define BALANCE_FOR_POWER	((sched_mc_power_savings || sched_smt_power_savings) \
+				 ? SD_POWERSAVINGS_BALANCE : 0)
+
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
@@ -1412,6 +1417,11 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 
+#include <linux/sysdev.h>
+extern int sched_mc_power_savings, sched_smt_power_savings;
+extern struct sysdev_attribute attr_sched_mc_power_savings, attr_sched_smt_power_savings;
+extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls);
+
 extern void normalize_rt_tasks(void);
 
 #ifdef CONFIG_PM
diff --git a/include/linux/topology.h b/include/linux/topology.h
index a305ae2e44b6..ec1eca85290a 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -134,7 +134,8 @@
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
-				| SD_WAKE_AFFINE,	\
+				| SD_WAKE_AFFINE	\
+				| BALANCE_FOR_POWER,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
diff --git a/kernel/sched.c b/kernel/sched.c
index 122b75584a13..54fa282657cc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1162,6 +1162,11 @@ static int sched_balance_self(int cpu, int flag)
 	struct sched_domain *tmp, *sd = NULL;
 
 	for_each_domain(cpu, tmp) {
+		/*
+		 * If power savings logic is enabled for a domain, stop there.
+		 */
+		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+			break;
 		if (tmp->flags & flag)
 			sd = tmp;
 	}
@@ -2082,6 +2087,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	unsigned long busiest_load_per_task, busiest_nr_running;
 	unsigned long this_load_per_task, this_nr_running;
 	int load_idx;
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	int power_savings_balance = 1;
+	unsigned long leader_nr_running = 0, min_load_per_task = 0;
+	unsigned long min_nr_running = ULONG_MAX;
+	struct sched_group *group_min = NULL, *group_leader = NULL;
+#endif
 
 	max_load = this_load = total_load = total_pwr = 0;
 	busiest_load_per_task = busiest_nr_running = 0;
@@ -2094,7 +2105,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		load_idx = sd->idle_idx;
 
 	do {
-		unsigned long load;
+		unsigned long load, group_capacity;
 		int local_group;
 		int i;
 		unsigned long sum_nr_running, sum_weighted_load;
@@ -2127,18 +2138,76 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		/* Adjust by relative CPU power of the group */
 		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
+		group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+
 		if (local_group) {
 			this_load = avg_load;
 			this = group;
 			this_nr_running = sum_nr_running;
 			this_load_per_task = sum_weighted_load;
 		} else if (avg_load > max_load &&
-			   sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) {
+			   sum_nr_running > group_capacity) {
 			max_load = avg_load;
 			busiest = group;
 			busiest_nr_running = sum_nr_running;
 			busiest_load_per_task = sum_weighted_load;
 		}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+		/*
+		 * Busy processors will not participate in power savings
+		 * balance.
+		 */
+		if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+			goto group_next;
+
+		/*
+		 * If the local group is idle or completely loaded
+		 * no need to do power savings balance at this domain
+		 */
+		if (local_group && (this_nr_running >= group_capacity ||
+				    !this_nr_running))
+			power_savings_balance = 0;
+
+		/*
+		 * If a group is already running at full capacity or idle,
+		 * don't include that group in power savings calculations
+		 */
+		if (!power_savings_balance || sum_nr_running >= group_capacity
+		    || !sum_nr_running)
+			goto group_next;
+
+		/*
+		 * Calculate the group which has the least non-idle load.
+		 * This is the group from where we need to pick up the load
+		 * for saving power
+		 */
+		if ((sum_nr_running < min_nr_running) ||
+		    (sum_nr_running == min_nr_running &&
+		     first_cpu(group->cpumask) <
+		     first_cpu(group_min->cpumask))) {
+			group_min = group;
+			min_nr_running = sum_nr_running;
+			min_load_per_task = sum_weighted_load /
+					    sum_nr_running;
+		}
+
+		/*
+		 * Calculate the group which is almost near its
+		 * capacity but still has some space to pick up some load
+		 * from other group and save more power
+		 */
+		if (sum_nr_running <= group_capacity - 1)
+			if (sum_nr_running > leader_nr_running ||
+			    (sum_nr_running == leader_nr_running &&
+			     first_cpu(group->cpumask) >
+			     first_cpu(group_leader->cpumask))) {
+				group_leader = group;
+				leader_nr_running = sum_nr_running;
+			}
+
+group_next:
+#endif
 		group = group->next;
 	} while (group != sd->groups);
 
@@ -2247,7 +2316,16 @@ small_imbalance:
 	return busiest;
 
 out_balanced:
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+		goto ret;
 
+	if (this == group_leader && group_leader != group_min) {
+		*imbalance = min_load_per_task;
+		return group_min;
+	}
+ret:
+#endif
 	*imbalance = 0;
 	return NULL;
 }
@@ -2300,7 +2378,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 	int active_balance = 0;
 	int sd_idle = 0;
 
-	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+	    !sched_smt_power_savings)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -2389,7 +2468,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 			sd->balance_interval *= 2;
 	}
 
-	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+	    !sched_smt_power_savings)
 		return -1;
 	return nr_moved;
 
@@ -2404,7 +2484,7 @@ out_one_pinned:
 	    (sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		return -1;
 	return 0;
 }
@@ -2425,7 +2505,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 	int nr_moved = 0;
 	int sd_idle = 0;
 
-	if (sd->flags & SD_SHARE_CPUPOWER)
+	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2466,7 +2546,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		return -1;
 	sd->nr_balance_failed = 0;
 	return 0;
@@ -5732,6 +5812,7 @@ static cpumask_t sched_domain_node_span(int node)
 }
 #endif
 
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 /*
  * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
  * can switch it on easily if needed.
@@ -6113,37 +6194,72 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
 	for_each_cpu_mask(i, *cpu_map) {
-		int power;
 		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
-		power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+	}
 #endif
 #ifdef CONFIG_SCHED_MC
+	for_each_cpu_mask(i, *cpu_map) {
+		int power;
+		struct sched_domain *sd;
 		sd = &per_cpu(core_domains, i);
-		power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+		if (sched_smt_power_savings)
+			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+		else
+			power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
 					    * SCHED_LOAD_SCALE / 10;
 		sd->groups->cpu_power = power;
+	}
+#endif
 
+	for_each_cpu_mask(i, *cpu_map) {
+		struct sched_domain *sd;
+#ifdef CONFIG_SCHED_MC
 		sd = &per_cpu(phys_domains, i);
+		if (i != first_cpu(sd->groups->cpumask))
+			continue;
 
-		/*
-		 * This has to be < 2 * SCHED_LOAD_SCALE
-		 * Lets keep it SCHED_LOAD_SCALE, so that
-		 * while calculating NUMA group's cpu_power
-		 * we can simply do
-		 *  numa_group->cpu_power += phys_group->cpu_power;
-		 *
-		 * See "only add power once for each physical pkg"
-		 * comment below
-		 */
-		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		sd->groups->cpu_power = 0;
+		if (sched_mc_power_savings || sched_smt_power_savings) {
+			int j;
+
+			for_each_cpu_mask(j, sd->groups->cpumask) {
+				struct sched_domain *sd1;
+				sd1 = &per_cpu(core_domains, j);
+				/*
+				 * for each core we will add once
+				 * to the group in physical domain
+				 */
+				if (j != first_cpu(sd1->groups->cpumask))
+					continue;
+
+				if (sched_smt_power_savings)
+					sd->groups->cpu_power += sd1->groups->cpu_power;
+				else
+					sd->groups->cpu_power += SCHED_LOAD_SCALE;
+			}
+		} else
+			/*
+			 * This has to be < 2 * SCHED_LOAD_SCALE
+			 * Lets keep it SCHED_LOAD_SCALE, so that
+			 * while calculating NUMA group's cpu_power
+			 * we can simply do
+			 *  numa_group->cpu_power += phys_group->cpu_power;
+			 *
+			 * See "only add power once for each physical pkg"
+			 * comment below
+			 */
+			sd->groups->cpu_power = SCHED_LOAD_SCALE;
 #else
+		int power;
 		sd = &per_cpu(phys_domains, i);
-		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-			(cpus_weight(sd->groups->cpumask)-1) / 10;
+		if (sched_smt_power_savings)
+			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+		else
+			power = SCHED_LOAD_SCALE;
 		sd->groups->cpu_power = power;
 #endif
 	}
@@ -6244,6 +6360,80 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
 	return err;
 }
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+int arch_reinit_sched_domains(void)
+{
+	int err;
+
+	lock_cpu_hotplug();
+	detach_destroy_domains(&cpu_online_map);
+	err = arch_init_sched_domains(&cpu_online_map);
+	unlock_cpu_hotplug();
+
+	return err;
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+	int ret;
+
+	if (buf[0] != '0' && buf[0] != '1')
+		return -EINVAL;
+
+	if (smt)
+		sched_smt_power_savings = (buf[0] == '1');
+	else
+		sched_mc_power_savings = (buf[0] == '1');
+
+	ret = arch_reinit_sched_domains();
+
+	return ret ? ret : count;
+}
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+	int err = 0;
+#ifdef CONFIG_SCHED_SMT
+	if (smt_capable())
+		err = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+	if (!err && mc_capable())
+		err = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_mc_power_savings.attr);
+#endif
+	return err;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+{
+	return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 0);
+}
+SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+	    sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+{
+	return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 1);
+}
+SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+	    sched_smt_power_savings_store);
+#endif
+
+
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy. The domains