 include/linux/sched.h    |  12
 include/linux/topology.h |  43
 kernel/sched.c           | 145
 3 files changed, 125 insertions(+), 75 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8e26c9069f15..331f4502e92b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -625,9 +625,17 @@ enum idle_type
 #define SD_WAKE_BALANCE		64	/* Perform balancing at task wakeup */
 #define SD_SHARE_CPUPOWER	128	/* Domain members share cpu power */
 #define SD_POWERSAVINGS_BALANCE	256	/* Balance for power savings */
+#define SD_SHARE_PKG_RESOURCES	512	/* Domain members share cpu pkg resources */
 
-#define BALANCE_FOR_POWER	((sched_mc_power_savings || sched_smt_power_savings) \
-				 ? SD_POWERSAVINGS_BALANCE : 0)
+#define BALANCE_FOR_MC_POWER	\
+	(sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
+
+#define BALANCE_FOR_PKG_POWER	\
+	((sched_mc_power_savings || sched_smt_power_savings) ? \
+	 SD_POWERSAVINGS_BALANCE : 0)
+
+#define test_sd_parent(sd, flag)	((sd->parent &&		\
+					 (sd->parent->flags & flag)) ? 1 : 0)
 
 
 struct sched_group {
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 486bec23f986..da508d1998e4 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -115,6 +115,38 @@
 #endif
 #endif /* CONFIG_SCHED_SMT */
 
+#ifdef CONFIG_SCHED_MC
+/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
+#ifndef SD_MC_INIT
+#define SD_MC_INIT (struct sched_domain) {		\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.child			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 1,			\
+	.max_interval		= 4,			\
+	.busy_factor		= 64,			\
+	.imbalance_pct		= 125,			\
+	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
+	.busy_idx		= 2,			\
+	.idle_idx		= 1,			\
+	.newidle_idx		= 2,			\
+	.wake_idx		= 1,			\
+	.forkexec_idx		= 1,			\
+	.flags			= SD_LOAD_BALANCE	\
+				| SD_BALANCE_NEWIDLE	\
+				| SD_BALANCE_EXEC	\
+				| SD_WAKE_AFFINE	\
+				| SD_SHARE_PKG_RESOURCES\
+				| BALANCE_FOR_MC_POWER,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 1,			\
+	.nr_balance_failed	= 0,			\
+}
+#endif
+#endif /* CONFIG_SCHED_MC */
+
 /* Common values for CPUs */
 #ifndef SD_CPU_INIT
 #define SD_CPU_INIT (struct sched_domain) {		\
@@ -137,7 +169,7 @@
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
-				| BALANCE_FOR_POWER,	\
+				| BALANCE_FOR_PKG_POWER,\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
@@ -168,15 +200,6 @@
 	.nr_balance_failed	= 0,			\
 }
 
-#ifdef CONFIG_SCHED_MC
-#ifndef SD_MC_INIT
-/* for now its same as SD_CPU_INIT.
- * TBD: Tune Domain parameters!
- */
-#define SD_MC_INIT	SD_CPU_INIT
-#endif
-#endif
-
 #ifdef CONFIG_NUMA
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index 0feeacb91497..0a5e814cc618 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2541,8 +2541,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	struct rq *busiest;
 	cpumask_t cpus = CPU_MASK_ALL;
 
+	/*
+	 * When power savings policy is enabled for the parent domain, idle
+	 * sibling can pick up load irrespective of busy siblings. In this case,
+	 * let the state of idle sibling percolate up as IDLE, instead of
+	 * portraying it as NOT_IDLE.
+	 */
 	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -2638,7 +2644,7 @@ redo:
 	}
 
 	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	return nr_moved;
 
@@ -2654,7 +2660,7 @@ out_one_pinned:
 		sd->balance_interval *= 2;
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	return 0;
 }
@@ -2676,7 +2682,14 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	int sd_idle = 0;
 	cpumask_t cpus = CPU_MASK_ALL;
 
-	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+	/*
+	 * When power savings policy is enabled for the parent domain, idle
+	 * sibling can pick up load irrespective of busy siblings. In this case,
+	 * let the state of idle sibling percolate up as IDLE, instead of
+	 * portraying it as NOT_IDLE.
+	 */
+	if (sd->flags & SD_SHARE_CPUPOWER &&
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2717,7 +2730,8 @@ redo:
 
 	if (!nr_moved) {
 		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
-		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 			return -1;
 	} else
 		sd->nr_balance_failed = 0;
@@ -2727,7 +2741,7 @@ redo:
 out_balanced:
 	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	sd->nr_balance_failed = 0;
 
@@ -5400,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd)
 	if (sd->flags & (SD_LOAD_BALANCE |
 			 SD_BALANCE_NEWIDLE |
 			 SD_BALANCE_FORK |
-			 SD_BALANCE_EXEC)) {
+			 SD_BALANCE_EXEC |
+			 SD_SHARE_CPUPOWER |
+			 SD_SHARE_PKG_RESOURCES)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -5434,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 		pflags &= ~(SD_LOAD_BALANCE |
 			    SD_BALANCE_NEWIDLE |
 			    SD_BALANCE_FORK |
-			    SD_BALANCE_EXEC);
+			    SD_BALANCE_EXEC |
+			    SD_SHARE_CPUPOWER |
+			    SD_SHARE_PKG_RESOURCES);
 	}
 	if (~cflags & pflags)
 		return 0;
@@ -6241,12 +6259,65 @@ static void free_sched_groups(const cpumask_t *cpu_map)
 #endif
 
 /*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be same unless
+ * there are asymmetries in the topology. If there are asymmetries, group
+ * having more cpu_power will pickup more load compared to the group having
+ * less cpu_power.
+ *
+ * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
+ * the maximum number of tasks a group can handle in the presence of other idle
+ * or lightly loaded groups in the same sched domain.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+	struct sched_domain *child;
+	struct sched_group *group;
+
+	WARN_ON(!sd || !sd->groups);
+
+	if (cpu != first_cpu(sd->groups->cpumask))
+		return;
+
+	child = sd->child;
+
+	/*
+	 * For perf policy, if the groups in child domain share resources
+	 * (for example cores sharing some portions of the cache hierarchy
+	 * or SMT), then set this domain groups cpu_power such that each group
+	 * can handle only one task, when there are other idle groups in the
+	 * same sched domain.
+	 */
+	if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
+		       (child->flags &
+			(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		return;
+	}
+
+	sd->groups->cpu_power = 0;
+
+	/*
+	 * add cpu_power of each child group to this groups cpu_power
+	 */
+	group = child->groups;
+	do {
+		sd->groups->cpu_power += group->cpu_power;
+		group = group->next;
+	} while (group != child->groups);
+}
+
+/*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
 static int build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+	struct sched_domain *sd;
 #ifdef CONFIG_NUMA
 	struct sched_group **sched_group_nodes = NULL;
 	struct sched_group *sched_group_allnodes = NULL;
@@ -6456,72 +6527,20 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
 		sd = &per_cpu(cpu_domains, i);
-		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
 	for_each_cpu_mask(i, *cpu_map) {
-		int power;
-		struct sched_domain *sd;
 		sd = &per_cpu(core_domains, i);
-		if (sched_smt_power_savings)
-			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-		else
-			power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
-					    * SCHED_LOAD_SCALE / 10;
-		sd->groups->cpu_power = power;
+		init_sched_groups_power(i, sd);
 	}
 #endif
 
 	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_MC
-		sd = &per_cpu(phys_domains, i);
-		if (i != first_cpu(sd->groups->cpumask))
-			continue;
-
-		sd->groups->cpu_power = 0;
-		if (sched_mc_power_savings || sched_smt_power_savings) {
-			int j;
-
-			for_each_cpu_mask(j, sd->groups->cpumask) {
-				struct sched_domain *sd1;
-				sd1 = &per_cpu(core_domains, j);
-				/*
-				 * for each core we will add once
-				 * to the group in physical domain
-				 */
-				if (j != first_cpu(sd1->groups->cpumask))
-					continue;
-
-				if (sched_smt_power_savings)
-					sd->groups->cpu_power += sd1->groups->cpu_power;
-				else
-					sd->groups->cpu_power += SCHED_LOAD_SCALE;
-			}
-		} else
-			/*
-			 * This has to be < 2 * SCHED_LOAD_SCALE
-			 * Lets keep it SCHED_LOAD_SCALE, so that
-			 * while calculating NUMA group's cpu_power
-			 * we can simply do
-			 *  numa_group->cpu_power += phys_group->cpu_power;
-			 *
-			 * See "only add power once for each physical pkg"
-			 * comment below
-			 */
-			sd->groups->cpu_power = SCHED_LOAD_SCALE;
-#else
-		int power;
 		sd = &per_cpu(phys_domains, i);
-		if (sched_smt_power_savings)
-			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-		else
-			power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
-#endif
+		init_sched_groups_power(i, sd);
 	}
 
 #ifdef CONFIG_NUMA
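
The hunk above replaces the per-config cpu_power arithmetic with calls to the new init_sched_groups_power() helper. As a rough illustration of the rule that helper implements (one task's worth of capacity, SCHED_LOAD_SCALE, for a package whose child groups share resources under the performance policy; otherwise the sum of the child groups' capacities), the following standalone userspace C sketch can be compiled on its own. It is not kernel code: pkg_cpu_power(), struct group, main() and the SCHED_LOAD_SCALE value are assumptions made for the example; only the flag values come from the patch.

#include <stdio.h>

#define SCHED_LOAD_SCALE	128UL	/* assumed scale factor for illustration */
#define SD_SHARE_CPUPOWER	128	/* flag values taken from the patch */
#define SD_POWERSAVINGS_BALANCE	256
#define SD_SHARE_PKG_RESOURCES	512

struct group { unsigned long cpu_power; };

/* mirrors the two branches of init_sched_groups_power() for one parent group */
static unsigned long pkg_cpu_power(int sd_flags, int child_flags,
				   const struct group *child_groups, int n)
{
	unsigned long power = 0;
	int i;

	/* perf policy + resource-sharing children: capacity for one task */
	if (!(sd_flags & SD_POWERSAVINGS_BALANCE) &&
	    (child_flags & (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))
		return SCHED_LOAD_SCALE;

	/* otherwise: package capacity is the sum of its child groups */
	for (i = 0; i < n; i++)
		power += child_groups[i].cpu_power;
	return power;
}

int main(void)
{
	struct group cores[2] = { { SCHED_LOAD_SCALE }, { SCHED_LOAD_SCALE } };

	/* prints 128 (one task per package) vs 256 (sum of both cores) */
	printf("perf policy:          %lu\n",
	       pkg_cpu_power(0, SD_SHARE_PKG_RESOURCES, cores, 2));
	printf("power-savings policy: %lu\n",
	       pkg_cpu_power(SD_POWERSAVINGS_BALANCE, SD_SHARE_PKG_RESOURCES,
			     cores, 2));
	return 0;
}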