author		Siddha, Suresh B <suresh.b.siddha@intel.com>	2006-06-27 05:54:42 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-06-27 20:32:45 -0400
commit		5c45bf279d378d436ce45825c0f136696c7b6109 (patch)
tree		80e2fcf4866b84fccb787562e1a83b16f4bc8850 /kernel/sched.c
parent		369381694ddcf03f1de403501c8b97099b5109ec (diff)
[PATCH] sched: mc/smt power savings sched policy
sysfs entries 'sched_mc_power_savings' and 'sched_smt_power_savings' in
/sys/devices/system/cpu/ control the MC/SMT power savings policy for the
scheduler.
Based on the values of these controls (1 = enable, 0 = disable), sched group
cpu power is determined for the different domains.  When a power savings
policy is enabled and the system is lightly loaded, the scheduler minimizes
the number of physical packages/cpu cores carrying the load, and thus
conserves power (with a performance impact that depends on the workload
characteristics; see the OLS 2005 CMP kernel scheduler paper for more
details).
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Con Kolivas <kernel@kolivas.org>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	240
1 file changed, 215 insertions(+), 25 deletions(-)
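Usage note (not part of the patch): toggling these policies from userspace is just a write of '0' or '1' to the files named above; a minimal illustrative C sketch, assuming the sysfs path described in the changelog:

/* Illustrative only: enable the MC power savings policy from userspace.
 * The path and the accepted values ('0'/'1') are the ones described above. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/devices/system/cpu/sched_mc_power_savings", "w");

	if (!f)
		return 1;
	fputc('1', f);		/* '1' enables, '0' disables the policy */
	return fclose(f) ? 1 : 0;
}

The same applies to sched_smt_power_savings; writing either file rebuilds the sched domains via arch_reinit_sched_domains() (added below).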
diff --git a/kernel/sched.c b/kernel/sched.c
index 122b75584a13..54fa282657cc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1162,6 +1162,11 @@ static int sched_balance_self(int cpu, int flag)
 	struct sched_domain *tmp, *sd = NULL;
 
 	for_each_domain(cpu, tmp) {
+		/*
+		 * If power savings logic is enabled for a domain, stop there.
+		 */
+		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+			break;
 		if (tmp->flags & flag)
 			sd = tmp;
 	}
@@ -2082,6 +2087,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	unsigned long busiest_load_per_task, busiest_nr_running;
 	unsigned long this_load_per_task, this_nr_running;
 	int load_idx;
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	int power_savings_balance = 1;
+	unsigned long leader_nr_running = 0, min_load_per_task = 0;
+	unsigned long min_nr_running = ULONG_MAX;
+	struct sched_group *group_min = NULL, *group_leader = NULL;
+#endif
 
 	max_load = this_load = total_load = total_pwr = 0;
 	busiest_load_per_task = busiest_nr_running = 0;
@@ -2094,7 +2105,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	load_idx = sd->idle_idx;
 
 	do {
-		unsigned long load;
+		unsigned long load, group_capacity;
 		int local_group;
 		int i;
 		unsigned long sum_nr_running, sum_weighted_load;
@@ -2127,18 +2138,76 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		/* Adjust by relative CPU power of the group */
 		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
+		group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+
 		if (local_group) {
 			this_load = avg_load;
 			this = group;
 			this_nr_running = sum_nr_running;
 			this_load_per_task = sum_weighted_load;
 		} else if (avg_load > max_load &&
-			   sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) {
+			   sum_nr_running > group_capacity) {
 			max_load = avg_load;
 			busiest = group;
 			busiest_nr_running = sum_nr_running;
 			busiest_load_per_task = sum_weighted_load;
 		}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+		/*
+		 * Busy processors will not participate in power savings
+		 * balance.
+		 */
+		if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+			goto group_next;
+
+		/*
+		 * If the local group is idle or completely loaded
+		 * no need to do power savings balance at this domain
+		 */
+		if (local_group && (this_nr_running >= group_capacity ||
+				    !this_nr_running))
+			power_savings_balance = 0;
+
+		/*
+		 * If a group is already running at full capacity or idle,
+		 * don't include that group in power savings calculations
+		 */
+		if (!power_savings_balance || sum_nr_running >= group_capacity
+		    || !sum_nr_running)
+			goto group_next;
+
+		/*
+		 * Calculate the group which has the least non-idle load.
+		 * This is the group from where we need to pick up the load
+		 * for saving power
+		 */
+		if ((sum_nr_running < min_nr_running) ||
+		    (sum_nr_running == min_nr_running &&
+		     first_cpu(group->cpumask) <
+		     first_cpu(group_min->cpumask))) {
+			group_min = group;
+			min_nr_running = sum_nr_running;
+			min_load_per_task = sum_weighted_load /
+						sum_nr_running;
+		}
+
+		/*
+		 * Calculate the group which is almost near its
+		 * capacity but still has some space to pick up some load
+		 * from other group and save more power
+		 */
+		if (sum_nr_running <= group_capacity - 1)
+			if (sum_nr_running > leader_nr_running ||
+			    (sum_nr_running == leader_nr_running &&
+			     first_cpu(group->cpumask) >
+			     first_cpu(group_leader->cpumask))) {
+				group_leader = group;
+				leader_nr_running = sum_nr_running;
+			}
+
+group_next:
+#endif
 		group = group->next;
 	} while (group != sd->groups);
 
@@ -2247,7 +2316,16 @@ small_imbalance:
 	return busiest;
 
 out_balanced:
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+		goto ret;
 
+	if (this == group_leader && group_leader != group_min) {
+		*imbalance = min_load_per_task;
+		return group_min;
+	}
+ret:
+#endif
 	*imbalance = 0;
 	return NULL;
 }
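A standalone sketch (toy data, simplified types; not the author's code) of the selection the two hunks above add to find_busiest_group(): group_min is the least-loaded non-idle group to pull tasks from, group_leader is the nearly-full group that still has room, and out_balanced returns group_min when the current CPU belongs to the leader:

/* Toy model of the power-savings selection: two dual-core packages,
 * one runnable task on each. capacity = cpu_power / SCHED_LOAD_SCALE. */
#include <stdio.h>

struct toy_group { int first_cpu; unsigned long capacity, nr_running; };

int main(void)
{
	struct toy_group g[] = {
		{ .first_cpu = 0, .capacity = 2, .nr_running = 1 },
		{ .first_cpu = 2, .capacity = 2, .nr_running = 1 },
	};
	int i, min = -1, leader = -1;
	unsigned long min_nr = 0, leader_nr = 0;

	for (i = 0; i < 2; i++) {
		/* idle or fully loaded groups are not candidates */
		if (!g[i].nr_running || g[i].nr_running >= g[i].capacity)
			continue;
		/* least non-idle load; ties broken by lowest first CPU */
		if (min < 0 || g[i].nr_running < min_nr ||
		    (g[i].nr_running == min_nr && g[i].first_cpu < g[min].first_cpu)) {
			min = i;
			min_nr = g[i].nr_running;
		}
		/* nearly full but with a free slot; ties broken by highest first CPU */
		if (g[i].nr_running + 1 <= g[i].capacity &&
		    (leader < 0 || g[i].nr_running > leader_nr ||
		     (g[i].nr_running == leader_nr && g[i].first_cpu > g[leader].first_cpu))) {
			leader = i;
			leader_nr = g[i].nr_running;
		}
	}
	/* prints: group_min=0 group_leader=1 */
	printf("group_min=%d group_leader=%d\n", min, leader);
	return 0;
}

With one task on each of two dual-core packages, the package starting at CPU 2 becomes group_leader and the other group_min, so the leader pulls the remaining task over and the first package can go idle.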
@@ -2300,7 +2378,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 	int active_balance = 0;
 	int sd_idle = 0;
 
-	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+	    !sched_smt_power_savings)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -2389,7 +2468,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		sd->balance_interval *= 2;
 	}
 
-	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+	    !sched_smt_power_savings)
 		return -1;
 	return nr_moved;
 
@@ -2404,7 +2484,7 @@ out_one_pinned:
 			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		return -1;
 	return 0;
 }
@@ -2425,7 +2505,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 	int nr_moved = 0;
 	int sd_idle = 0;
 
-	if (sd->flags & SD_SHARE_CPUPOWER)
+	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2466,7 +2546,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		return -1;
 	sd->nr_balance_failed = 0;
 	return 0;
@@ -5732,6 +5812,7 @@ static cpumask_t sched_domain_node_span(int node)
 }
 #endif
 
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 /*
  * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
  * can switch it on easily if needed.
@@ -6113,37 +6194,72 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
 	for_each_cpu_mask(i, *cpu_map) {
-		int power;
 		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
-		power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+	}
 #endif
 #ifdef CONFIG_SCHED_MC
+	for_each_cpu_mask(i, *cpu_map) {
+		int power;
+		struct sched_domain *sd;
 		sd = &per_cpu(core_domains, i);
-		power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+		if (sched_smt_power_savings)
+			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+		else
+			power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
 					    * SCHED_LOAD_SCALE / 10;
 		sd->groups->cpu_power = power;
+	}
+#endif
 
+	for_each_cpu_mask(i, *cpu_map) {
+		struct sched_domain *sd;
+#ifdef CONFIG_SCHED_MC
 		sd = &per_cpu(phys_domains, i);
+		if (i != first_cpu(sd->groups->cpumask))
+			continue;
 
-		/*
-		 * This has to be < 2 * SCHED_LOAD_SCALE
-		 * Lets keep it SCHED_LOAD_SCALE, so that
-		 * while calculating NUMA group's cpu_power
-		 * we can simply do
-		 *  numa_group->cpu_power += phys_group->cpu_power;
-		 *
-		 * See "only add power once for each physical pkg"
-		 * comment below
-		 */
-		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		sd->groups->cpu_power = 0;
+		if (sched_mc_power_savings || sched_smt_power_savings) {
+			int j;
+
+			for_each_cpu_mask(j, sd->groups->cpumask) {
+				struct sched_domain *sd1;
+				sd1 = &per_cpu(core_domains, j);
+				/*
+				 * for each core we will add once
+				 * to the group in physical domain
+				 */
+				if (j != first_cpu(sd1->groups->cpumask))
+					continue;
+
+				if (sched_smt_power_savings)
+					sd->groups->cpu_power += sd1->groups->cpu_power;
+				else
+					sd->groups->cpu_power += SCHED_LOAD_SCALE;
+			}
+		} else
+			/*
+			 * This has to be < 2 * SCHED_LOAD_SCALE
+			 * Lets keep it SCHED_LOAD_SCALE, so that
+			 * while calculating NUMA group's cpu_power
+			 * we can simply do
+			 *  numa_group->cpu_power += phys_group->cpu_power;
+			 *
			 * See "only add power once for each physical pkg"
+			 * comment below
+			 */
+			sd->groups->cpu_power = SCHED_LOAD_SCALE;
 #else
+		int power;
 		sd = &per_cpu(phys_domains, i);
-		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-			(cpus_weight(sd->groups->cpumask)-1) / 10;
+		if (sched_smt_power_savings)
+			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+		else
+			power = SCHED_LOAD_SCALE;
 		sd->groups->cpu_power = power;
 #endif
 	}
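A quick worked example (not from the patch) of the two cpu_power formulas that appear in the hunk above, for a sched group spanning two SMT siblings, assuming SCHED_LOAD_SCALE is 128 (its value in kernels of this era):

/* Worked example: cpu_power of a 2-sibling group with the default
 * "+10% per extra CPU" formula vs. the power-savings "full scale per CPU"
 * formula. Assumes SCHED_LOAD_SCALE == 128. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

int main(void)
{
	unsigned long siblings = 2;
	unsigned long def  = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (siblings - 1) / 10;
	unsigned long save = SCHED_LOAD_SCALE * siblings;

	/* default: 140 -> group_capacity 1; power savings: 256 -> group_capacity 2 */
	printf("default:       cpu_power=%lu capacity=%lu\n", def, def / SCHED_LOAD_SCALE);
	printf("power savings: cpu_power=%lu capacity=%lu\n", save, save / SCHED_LOAD_SCALE);
	return 0;
}

With the default power the group looks nearly full after one task (group_capacity of 1 in find_busiest_group()), so load is spread across packages; with the power-savings value each sibling counts fully (capacity 2) and the scheduler is willing to keep two tasks on one package while the other idles.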
@@ -6244,6 +6360,80 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
 	return err;
 }
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+int arch_reinit_sched_domains(void)
+{
+	int err;
+
+	lock_cpu_hotplug();
+	detach_destroy_domains(&cpu_online_map);
+	err = arch_init_sched_domains(&cpu_online_map);
+	unlock_cpu_hotplug();
+
+	return err;
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+	int ret;
+
+	if (buf[0] != '0' && buf[0] != '1')
+		return -EINVAL;
+
+	if (smt)
+		sched_smt_power_savings = (buf[0] == '1');
+	else
+		sched_mc_power_savings = (buf[0] == '1');
+
+	ret = arch_reinit_sched_domains();
+
+	return ret ? ret : count;
+}
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+	int err = 0;
+#ifdef CONFIG_SCHED_SMT
+	if (smt_capable())
+		err = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+	if (!err && mc_capable())
+		err = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_mc_power_savings.attr);
+#endif
+	return err;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+{
+	return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 0);
+}
+SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+	    sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+{
+	return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 1);
+}
+SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+	    sched_smt_power_savings_store);
+#endif
+
+
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy. The domains