author     Siddha, Suresh B <suresh.b.siddha@intel.com>    2006-10-03 04:14:09 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>           2006-10-03 11:04:06 -0400
commit     89c4710ee9bbbefe6a4d469d9f36266a92c275c5 (patch)
tree       f84fe28e48bbda210f01f22ae0065f7ed1fcc5e1 /kernel
parent     1a84887080dc15f048db7c3a643e98f1435790d6 (diff)
[PATCH] sched: cleanup sched_group cpu_power setup
Up to now, each sched group's cpu_power for each sched domain has been
initialized independently. This made the setup code ugly as new sched domains
were added.

Make the sched group cpu_power setup code generic by using the domain child
field and a new domain flag in sched_domain. For most sched domains (except
NUMA), a sched group's cpu_power is now computed generically from the
properties of the domain itself and of its child domain.

Sched groups in NUMA domains are set up a little differently and hence do not
use this generic mechanism.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
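The hunks below replace the open-coded sched_smt_power_savings checks with
test_sd_parent(sd, SD_POWERSAVINGS_BALANCE). That helper is defined in
include/linux/sched.h and therefore does not appear in this kernel/-only
diffstat; as a minimal sketch, assuming the macro form carried by kernels of
this era, it amounts to:

    /*
     * Sketch only: the real test_sd_parent() lives in include/linux/sched.h
     * and is not part of this diff, so treat the exact definition as an
     * assumption. The intent is simply "does the parent domain have this
     * flag set?".
     */
    #define test_sd_parent(sd, flag)	(((sd)->parent &&		\
    					  ((sd)->parent->flags & (flag))) ? 1 : 0)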
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched.c  145
1 file changed, 82 insertions, 63 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 0feeacb91497..0a5e814cc618 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2541,8 +2541,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	struct rq *busiest;
 	cpumask_t cpus = CPU_MASK_ALL;
 
+	/*
+	 * When power savings policy is enabled for the parent domain, idle
+	 * sibling can pick up load irrespective of busy siblings. In this case,
+	 * let the state of idle sibling percolate up as IDLE, instead of
+	 * portraying it as NOT_IDLE.
+	 */
 	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -2638,7 +2644,7 @@ redo:
 	}
 
 	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	return nr_moved;
 
@@ -2654,7 +2660,7 @@ out_one_pinned:
 	sd->balance_interval *= 2;
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	return 0;
 }
@@ -2676,7 +2682,14 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	int sd_idle = 0;
 	cpumask_t cpus = CPU_MASK_ALL;
 
-	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+	/*
+	 * When power savings policy is enabled for the parent domain, idle
+	 * sibling can pick up load irrespective of busy siblings. In this case,
+	 * let the state of idle sibling percolate up as IDLE, instead of
+	 * portraying it as NOT_IDLE.
+	 */
+	if (sd->flags & SD_SHARE_CPUPOWER &&
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2717,7 +2730,8 @@ redo:
 
 	if (!nr_moved) {
 		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
-		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 			return -1;
 	} else
 		sd->nr_balance_failed = 0;
@@ -2727,7 +2741,7 @@ redo:
 out_balanced:
 	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	sd->nr_balance_failed = 0;
 
@@ -5400,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd)
 	if (sd->flags & (SD_LOAD_BALANCE |
 			 SD_BALANCE_NEWIDLE |
 			 SD_BALANCE_FORK |
-			 SD_BALANCE_EXEC)) {
+			 SD_BALANCE_EXEC |
+			 SD_SHARE_CPUPOWER |
+			 SD_SHARE_PKG_RESOURCES)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -5434,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 		pflags &= ~(SD_LOAD_BALANCE |
 			    SD_BALANCE_NEWIDLE |
 			    SD_BALANCE_FORK |
-			    SD_BALANCE_EXEC);
+			    SD_BALANCE_EXEC |
+			    SD_SHARE_CPUPOWER |
+			    SD_SHARE_PKG_RESOURCES);
 	}
 	if (~cflags & pflags)
 		return 0;
@@ -6241,12 +6259,65 @@ static void free_sched_groups(const cpumask_t *cpu_map)
 #endif
 
 /*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be same unless
+ * there are asymmetries in the topology. If there are asymmetries, group
+ * having more cpu_power will pickup more load compared to the group having
+ * less cpu_power.
+ *
+ * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
+ * the maximum number of tasks a group can handle in the presence of other idle
+ * or lightly loaded groups in the same sched domain.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+	struct sched_domain *child;
+	struct sched_group *group;
+
+	WARN_ON(!sd || !sd->groups);
+
+	if (cpu != first_cpu(sd->groups->cpumask))
+		return;
+
+	child = sd->child;
+
+	/*
+	 * For perf policy, if the groups in child domain share resources
+	 * (for example cores sharing some portions of the cache hierarchy
+	 * or SMT), then set this domain groups cpu_power such that each group
+	 * can handle only one task, when there are other idle groups in the
+	 * same sched domain.
+	 */
+	if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
+		       (child->flags &
+			(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		return;
+	}
+
+	sd->groups->cpu_power = 0;
+
+	/*
+	 * add cpu_power of each child group to this groups cpu_power
+	 */
+	group = child->groups;
+	do {
+		sd->groups->cpu_power += group->cpu_power;
+		group = group->next;
+	} while (group != child->groups);
+}
+
+/*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
 static int build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+	struct sched_domain *sd;
 #ifdef CONFIG_NUMA
 	struct sched_group **sched_group_nodes = NULL;
 	struct sched_group *sched_group_allnodes = NULL;
@@ -6456,72 +6527,20 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
 		sd = &per_cpu(cpu_domains, i);
-		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
 	for_each_cpu_mask(i, *cpu_map) {
-		int power;
-		struct sched_domain *sd;
 		sd = &per_cpu(core_domains, i);
-		if (sched_smt_power_savings)
-			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-		else
-			power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
-						    * SCHED_LOAD_SCALE / 10;
-		sd->groups->cpu_power = power;
+		init_sched_groups_power(i, sd);
 	}
 #endif
 
 	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_MC
-		sd = &per_cpu(phys_domains, i);
-		if (i != first_cpu(sd->groups->cpumask))
-			continue;
-
-		sd->groups->cpu_power = 0;
-		if (sched_mc_power_savings || sched_smt_power_savings) {
-			int j;
-
-			for_each_cpu_mask(j, sd->groups->cpumask) {
-				struct sched_domain *sd1;
-				sd1 = &per_cpu(core_domains, j);
-				/*
-				 * for each core we will add once
-				 * to the group in physical domain
-				 */
-				if (j != first_cpu(sd1->groups->cpumask))
-					continue;
-
-				if (sched_smt_power_savings)
-					sd->groups->cpu_power += sd1->groups->cpu_power;
-				else
-					sd->groups->cpu_power += SCHED_LOAD_SCALE;
-			}
-		} else
-			/*
-			 * This has to be < 2 * SCHED_LOAD_SCALE
-			 * Lets keep it SCHED_LOAD_SCALE, so that
-			 * while calculating NUMA group's cpu_power
-			 * we can simply do
-			 *  numa_group->cpu_power += phys_group->cpu_power;
-			 *
-			 * See "only add power once for each physical pkg"
-			 * comment below
-			 */
-			sd->groups->cpu_power = SCHED_LOAD_SCALE;
-#else
-		int power;
 		sd = &per_cpu(phys_domains, i);
-		if (sched_smt_power_savings)
-			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-		else
-			power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
-#endif
+		init_sched_groups_power(i, sd);
 	}
 
 #ifdef CONFIG_NUMA
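To see what the new init_sched_groups_power() aggregation amounts to, here is a
small user-space sketch of the same walk over a circular list of child groups.
The struct layout, the aggregate_power() helper, and the SCHED_LOAD_SCALE value
are simplified stand-ins for illustration, not the kernel's own definitions.

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 128UL	/* stand-in for the kernel's scale factor */

    /* simplified stand-in for struct sched_group: a circular singly linked list */
    struct group {
    	unsigned long cpu_power;
    	struct group *next;
    };

    /* sum child-group capacities into the parent group, as the patch does */
    static void aggregate_power(struct group *parent, struct group *child_head)
    {
    	struct group *g = child_head;

    	parent->cpu_power = 0;
    	do {
    		parent->cpu_power += g->cpu_power;
    		g = g->next;
    	} while (g != child_head);
    }

    int main(void)
    {
    	/* two SMT sibling groups, each already rated at SCHED_LOAD_SCALE */
    	struct group smt0 = { SCHED_LOAD_SCALE, NULL };
    	struct group smt1 = { SCHED_LOAD_SCALE, &smt0 };
    	struct group core = { 0, NULL };

    	smt0.next = &smt1;	/* close the circular list */
    	aggregate_power(&core, &smt0);

    	/* prints 256: the core-level group carries two tasks' worth of load */
    	printf("core cpu_power = %lu\n", core.cpu_power);
    	return 0;
    }

With two sibling groups each rated at SCHED_LOAD_SCALE, the core-level group
ends up with 2 * SCHED_LOAD_SCALE, i.e. it is expected to pick up two tasks'
worth of load while other groups in the same domain sit idle, which is the
behavior the generic setup path computes from the child domain's groups.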