Diffstat (limited to 'kernel')

 kernel/sched.c | 145
 1 file changed, 82 insertions, 63 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 0feeacb91497..0a5e814cc618 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2541,8 +2541,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
         struct rq *busiest;
         cpumask_t cpus = CPU_MASK_ALL;
 
+        /*
+         * When power savings policy is enabled for the parent domain, idle
+         * sibling can pick up load irrespective of busy siblings. In this case,
+         * let the state of idle sibling percolate up as IDLE, instead of
+         * portraying it as NOT_IDLE.
+         */
         if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-            !sched_smt_power_savings)
+            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 sd_idle = 1;
 
         schedstat_inc(sd, lb_cnt[idle]);
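test_sd_parent() is not defined in this file; it presumably comes from the
companion include/linux/sched.h change that introduces SD_POWERSAVINGS_BALANCE.
A minimal sketch of what it is assumed to do, namely report whether the parent
domain carries the given flag, would be:

        /* Sketch only, not taken from this diff: does the parent domain have `flag` set? */
        #define test_sd_parent(sd, flag)        (((sd)->parent && \
                                                  ((sd)->parent->flags & (flag))) ? 1 : 0)

So, exactly as the new comment says, an idle SMT sibling is allowed to report
itself as genuinely idle and pull load even while its sibling thread is busy
whenever the parent core/package domain runs a power-savings balance policy;
the same substitution repeats in the hunks below.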
@@ -2638,7 +2644,7 @@ redo:
         }
 
         if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-            !sched_smt_power_savings)
+            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 return -1;
         return nr_moved;
 
@@ -2654,7 +2660,7 @@ out_one_pinned:
         sd->balance_interval *= 2;
 
         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-            !sched_smt_power_savings)
+            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 return -1;
         return 0;
 }
@@ -2676,7 +2682,14 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
         int sd_idle = 0;
         cpumask_t cpus = CPU_MASK_ALL;
 
-        if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+        /*
+         * When power savings policy is enabled for the parent domain, idle
+         * sibling can pick up load irrespective of busy siblings. In this case,
+         * let the state of idle sibling percolate up as IDLE, instead of
+         * portraying it as NOT_IDLE.
+         */
+        if (sd->flags & SD_SHARE_CPUPOWER &&
+            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 sd_idle = 1;
 
         schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2717,7 +2730,8 @@ redo:
 
         if (!nr_moved) {
                 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
-                if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+                if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+                    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                         return -1;
         } else
                 sd->nr_balance_failed = 0;
@@ -2727,7 +2741,7 @@ redo:
 out_balanced:
         schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-            !sched_smt_power_savings)
+            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                 return -1;
         sd->nr_balance_failed = 0;
 
@@ -5400,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd)
         if (sd->flags & (SD_LOAD_BALANCE |
                          SD_BALANCE_NEWIDLE |
                          SD_BALANCE_FORK |
-                         SD_BALANCE_EXEC)) {
+                         SD_BALANCE_EXEC |
+                         SD_SHARE_CPUPOWER |
+                         SD_SHARE_PKG_RESOURCES)) {
                 if (sd->groups != sd->groups->next)
                         return 0;
         }
@@ -5434,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                 pflags &= ~(SD_LOAD_BALANCE |
                                 SD_BALANCE_NEWIDLE |
                                 SD_BALANCE_FORK |
-                                SD_BALANCE_EXEC);
+                                SD_BALANCE_EXEC |
+                                SD_SHARE_CPUPOWER |
+                                SD_SHARE_PKG_RESOURCES);
         }
         if (~cflags & pflags)
                 return 0;
@@ -6241,12 +6259,65 @@ static void free_sched_groups(const cpumask_t *cpu_map)
 #endif
 
 /*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be same unless
+ * there are asymmetries in the topology. If there are asymmetries, group
+ * having more cpu_power will pickup more load compared to the group having
+ * less cpu_power.
+ *
+ * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
+ * the maximum number of tasks a group can handle in the presence of other idle
+ * or lightly loaded groups in the same sched domain.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+        struct sched_domain *child;
+        struct sched_group *group;
+
+        WARN_ON(!sd || !sd->groups);
+
+        if (cpu != first_cpu(sd->groups->cpumask))
+                return;
+
+        child = sd->child;
+
+        /*
+         * For perf policy, if the groups in child domain share resources
+         * (for example cores sharing some portions of the cache hierarchy
+         * or SMT), then set this domain groups cpu_power such that each group
+         * can handle only one task, when there are other idle groups in the
+         * same sched domain.
+         */
+        if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
+                       (child->flags &
+                        (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
+                sd->groups->cpu_power = SCHED_LOAD_SCALE;
+                return;
+        }
+
+        sd->groups->cpu_power = 0;
+
+        /*
+         * add cpu_power of each child group to this groups cpu_power
+         */
+        group = child->groups;
+        do {
+                sd->groups->cpu_power += group->cpu_power;
+                group = group->next;
+        } while (group != child->groups);
+}
+
+/*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
 static int build_sched_domains(const cpumask_t *cpu_map)
 {
         int i;
+        struct sched_domain *sd;
 #ifdef CONFIG_NUMA
         struct sched_group **sched_group_nodes = NULL;
         struct sched_group *sched_group_allnodes = NULL;
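To make the "multiple of SCHED_LOAD_SCALE" idea in the new comment concrete,
here is a small user-space sketch (not kernel code; the toy_group struct and
the SCHED_LOAD_SCALE value are stand-ins) of the circular-list walk that
init_sched_groups_power() performs whenever it does not take the one-task
shortcut, i.e. when the domain balances for power savings or its child groups
do not share resources:

        #include <stdio.h>

        #define SCHED_LOAD_SCALE 1024UL         /* assumed value, illustration only */

        struct toy_group {
                unsigned long cpu_power;
                struct toy_group *next;         /* sched_group lists are circular */
        };

        int main(void)
        {
                /* two core-level groups inside one package-level group */
                struct toy_group core1 = { SCHED_LOAD_SCALE, NULL };
                struct toy_group core0 = { SCHED_LOAD_SCALE, &core1 };
                core1.next = &core0;

                unsigned long pkg_power = 0;
                struct toy_group *group = &core0;
                do {                            /* same do/while walk as in the patch */
                        pkg_power += group->cpu_power;
                        group = group->next;
                } while (group != &core0);

                /* prints 2048, i.e. 2 * SCHED_LOAD_SCALE: the package-level group
                 * can absorb two tasks' worth of load */
                printf("package cpu_power = %lu\n", pkg_power);
                return 0;
        }

Under the default perf policy the function instead pins such a group to a
single SCHED_LOAD_SCALE, so only one task lands on a shared-resource group
while other groups in the same domain sit idle.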
@@ -6456,72 +6527,20 @@ static int build_sched_domains(const cpumask_t *cpu_map)
         /* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
         for_each_cpu_mask(i, *cpu_map) {
-                struct sched_domain *sd;
                 sd = &per_cpu(cpu_domains, i);
-                sd->groups->cpu_power = SCHED_LOAD_SCALE;
+                init_sched_groups_power(i, sd);
         }
 #endif
 #ifdef CONFIG_SCHED_MC
         for_each_cpu_mask(i, *cpu_map) {
-                int power;
-                struct sched_domain *sd;
                 sd = &per_cpu(core_domains, i);
-                if (sched_smt_power_savings)
-                        power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-                else
-                        power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
-                                                * SCHED_LOAD_SCALE / 10;
-                sd->groups->cpu_power = power;
+                init_sched_groups_power(i, sd);
         }
 #endif
 
         for_each_cpu_mask(i, *cpu_map) {
-                struct sched_domain *sd;
-#ifdef CONFIG_SCHED_MC
-                sd = &per_cpu(phys_domains, i);
-                if (i != first_cpu(sd->groups->cpumask))
-                        continue;
-
-                sd->groups->cpu_power = 0;
-                if (sched_mc_power_savings || sched_smt_power_savings) {
-                        int j;
-
-                        for_each_cpu_mask(j, sd->groups->cpumask) {
-                                struct sched_domain *sd1;
-                                sd1 = &per_cpu(core_domains, j);
-                                /*
-                                 * for each core we will add once
-                                 * to the group in physical domain
-                                 */
-                                if (j != first_cpu(sd1->groups->cpumask))
-                                        continue;
-
-                                if (sched_smt_power_savings)
-                                        sd->groups->cpu_power += sd1->groups->cpu_power;
-                                else
-                                        sd->groups->cpu_power += SCHED_LOAD_SCALE;
-                        }
-                } else
-                        /*
-                         * This has to be < 2 * SCHED_LOAD_SCALE
-                         * Lets keep it SCHED_LOAD_SCALE, so that
-                         * while calculating NUMA group's cpu_power
-                         * we can simply do
-                         *  numa_group->cpu_power += phys_group->cpu_power;
-                         *
-                         * See "only add power once for each physical pkg"
-                         * comment below
-                         */
-                        sd->groups->cpu_power = SCHED_LOAD_SCALE;
-#else
-                int power;
                 sd = &per_cpu(phys_domains, i);
-                if (sched_smt_power_savings)
-                        power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-                else
-                        power = SCHED_LOAD_SCALE;
-                sd->groups->cpu_power = power;
-#endif
+                init_sched_groups_power(i, sd);
         }
 
 #ifdef CONFIG_NUMA
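For a sense of scale, the removed core-level formula in the hunk above and the
new perf-policy shortcut in init_sched_groups_power() differ only slightly for
a 2-thread SMT core. The snippet below is a plain user-space illustration, not
kernel code (SCHED_LOAD_SCALE value assumed; it also assumes the SMT child
domain carries SD_SHARE_CPUPOWER and the core domain is not balancing for
power savings), and just evaluates the two expressions:

        #include <stdio.h>

        #define SCHED_LOAD_SCALE 1024UL /* assumed value, illustration only */

        int main(void)
        {
                unsigned long threads = 2;      /* cpus_weight() of a 2-thread core group */

                /* old default (non power-savings) core-level formula, removed above */
                unsigned long old_power = SCHED_LOAD_SCALE +
                                          (threads - 1) * SCHED_LOAD_SCALE / 10;

                /* new behaviour under perf policy: exactly one task's capacity */
                unsigned long new_power = SCHED_LOAD_SCALE;

                printf("old core group cpu_power: %lu\n", old_power);   /* 1126 */
                printf("new core group cpu_power: %lu\n", new_power);   /* 1024 */
                return 0;
        }

The visible change is therefore less the values themselves than where they are
computed: all three per-cpu loops now funnel through init_sched_groups_power(),
which also covers the power-savings case by summing the child groups' cpu_power.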