-rw-r--r--  include/linux/sched.h    1
-rw-r--r--  kernel/sched.c          59
2 files changed, 48 insertions, 12 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ea92e5c89089..72d6927d29ed 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -707,6 +707,7 @@ struct sched_domain {
         unsigned long lb_hot_gained[MAX_IDLE_TYPES];
         unsigned long lb_nobusyg[MAX_IDLE_TYPES];
         unsigned long lb_nobusyq[MAX_IDLE_TYPES];
+        unsigned long lb_stopbalance[MAX_IDLE_TYPES];
 
         /* Active load balancing */
         unsigned long alb_cnt;
diff --git a/kernel/sched.c b/kernel/sched.c
index 15ce772a471a..4e453431c61a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -428,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 12
+#define SCHEDSTAT_VERSION 13
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -466,7 +466,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
                 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
                 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
                                 itype++) {
-                        seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
+                        seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu",
                                 sd->lb_cnt[itype],
                                 sd->lb_balanced[itype],
                                 sd->lb_failed[itype],
@@ -474,7 +474,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
                                 sd->lb_gained[itype],
                                 sd->lb_hot_gained[itype],
                                 sd->lb_nobusyq[itype],
-                                sd->lb_nobusyg[itype]);
+                                sd->lb_nobusyg[itype],
+                                sd->lb_stopbalance[itype]);
                 }
                 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
                         sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
@@ -2249,7 +2250,7 @@ out:
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
                    unsigned long *imbalance, enum idle_type idle, int *sd_idle,
-                   cpumask_t *cpus)
+                   cpumask_t *cpus, int *balance)
 {
         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2278,10 +2279,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 unsigned long load, group_capacity;
                 int local_group;
                 int i;
+                unsigned int balance_cpu = -1, first_idle_cpu = 0;
                 unsigned long sum_nr_running, sum_weighted_load;
 
                 local_group = cpu_isset(this_cpu, group->cpumask);
 
+                if (local_group)
+                        balance_cpu = first_cpu(group->cpumask);
+
                 /* Tally up the load of all CPUs in the group */
                 sum_weighted_load = sum_nr_running = avg_load = 0;
 
@@ -2297,9 +2302,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                                 *sd_idle = 0;
 
                         /* Bias balancing toward cpus of our domain */
-                        if (local_group)
+                        if (local_group) {
+                                if (idle_cpu(i) && !first_idle_cpu) {
+                                        first_idle_cpu = 1;
+                                        balance_cpu = i;
+                                }
+
                                 load = target_load(i, load_idx);
-                        else
+                        } else
                                 load = source_load(i, load_idx);
 
                         avg_load += load;
@@ -2307,6 +2317,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                         sum_weighted_load += rq->raw_weighted_load;
                 }
 
+                /*
+                 * First idle cpu or the first cpu(busiest) in this sched group
+                 * is eligible for doing load balancing at this and above
+                 * domains.
+                 */
+                if (local_group && balance_cpu != this_cpu && balance) {
+                        *balance = 0;
+                        goto ret;
+                }
+
                 total_load += avg_load;
                 total_pwr += group->cpu_power;
 
@@ -2498,8 +2518,8 @@ out_balanced:
                 *imbalance = min_load_per_task;
                 return group_min;
         }
-ret:
 #endif
+ret:
         *imbalance = 0;
         return NULL;
 }
@@ -2550,7 +2570,8 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
  * tasks if there is an imbalance.
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
-                        struct sched_domain *sd, enum idle_type idle)
+                        struct sched_domain *sd, enum idle_type idle,
+                        int *balance)
 {
         int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
         struct sched_group *group;
@@ -2573,7 +2594,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 redo:
         group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
-                                   &cpus);
+                                   &cpus, balance);
+
+        if (*balance == 0) {
+                schedstat_inc(sd, lb_stopbalance[idle]);
+                goto out_balanced;
+        }
+
         if (!group) {
                 schedstat_inc(sd, lb_nobusyg[idle]);
                 goto out_balanced;
@@ -2715,7 +2742,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
         schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
 redo:
         group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
-                                   &sd_idle, &cpus);
+                                   &sd_idle, &cpus, NULL);
         if (!group) {
                 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
                 goto out_balanced;
@@ -2885,7 +2912,7 @@ static DEFINE_SPINLOCK(balancing);
 
 static void run_rebalance_domains(struct softirq_action *h)
 {
-        int this_cpu = smp_processor_id();
+        int this_cpu = smp_processor_id(), balance = 1;
         struct rq *this_rq = cpu_rq(this_cpu);
         unsigned long interval;
         struct sched_domain *sd;
@@ -2917,7 +2944,7 @@ static void run_rebalance_domains(struct softirq_action *h)
                 }
 
                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                        if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
+                        if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
                                 /*
                                  * We've pulled tasks over so either we're no
                                  * longer idle, or one of our SMT siblings is
@@ -2932,6 +2959,14 @@ static void run_rebalance_domains(struct softirq_action *h)
 out:
                 if (time_after(next_balance, sd->last_balance + interval))
                         next_balance = sd->last_balance + interval;
+
+                /*
+                 * Stop the load balance at this level. There is another
+                 * CPU in our sched group which is doing load balancing more
+                 * actively.
+                 */
+                if (!balance)
+                        break;
         }
         this_rq->next_balance = next_balance;
 }
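
For context, a minimal standalone sketch of the policy this patch introduces: within a sched group, only one CPU, the first idle CPU or failing that the first CPU of the group, performs load balancing at this domain level and above; every other CPU gets *balance == 0 from find_busiest_group(), bumps lb_stopbalance, and stops climbing domains. This is not kernel code: struct group, cpu_is_idle() and NCPUS below are hypothetical stand-ins for the kernel's cpumask and sched_domain machinery.

#include <stdio.h>

#define NCPUS 4

struct group {
        int cpus[NCPUS];        /* CPU ids in this sched group */
        int nr_cpus;
};

static int cpu_is_idle(int cpu)
{
        /* Hypothetical idleness check: pretend only CPU 2 is idle. */
        return cpu == 2;
}

/*
 * Mirrors the balance_cpu selection added to find_busiest_group():
 * default to the first CPU of the group, then prefer the first idle
 * CPU seen while walking the group.
 */
static int pick_balance_cpu(const struct group *g)
{
        int balance_cpu = g->cpus[0];
        int first_idle_cpu = 0;
        int i;

        for (i = 0; i < g->nr_cpus; i++) {
                if (cpu_is_idle(g->cpus[i]) && !first_idle_cpu) {
                        first_idle_cpu = 1;
                        balance_cpu = g->cpus[i];
                }
        }
        return balance_cpu;
}

int main(void)
{
        struct group g = { .cpus = {0, 1, 2, 3}, .nr_cpus = NCPUS };
        int this_cpu;

        for (this_cpu = 0; this_cpu < NCPUS; this_cpu++) {
                /*
                 * In the patch, a mismatch here makes load_balance()
                 * bail out early and run_rebalance_domains() break out
                 * of the domain loop on this CPU.
                 */
                int balance = (pick_balance_cpu(&g) == this_cpu);

                printf("cpu %d: %s\n", this_cpu,
                       balance ? "does the balancing" : "skips balancing here");
        }
        return 0;
}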