author	Siddha, Suresh B <suresh.b.siddha@intel.com>	2006-12-10 05:20:33 -0500
committer	Linus Torvalds <torvalds@woody.osdl.org>	2006-12-10 12:55:43 -0500
commit	783609c6cb4eaa23f2ac5c968a44483584ec133f (patch)
tree	678704bab2c69f5115ad84452e931adf4c11f3f4 /kernel
parent	b18ec80396834497933d77b81ec0918519f4e2a7 (diff)
[PATCH] sched: decrease number of load balances
Currently, at a particular domain, each cpu in the sched group does a load balance at the frequency of balance_interval. The more cores and threads there are, the more cpus end up in each sched group at the SMP and NUMA domains, and we end up spending quite a bit of time doing load balancing in those domains.

Fix this by making only one cpu (the first idle cpu, or the first cpu in the group if all cpus are busy) in the sched group do the load balance at that particular sched domain; that load will slowly percolate down to the other cpus within the group (when they do load balancing at lower domains).

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
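For readers skimming the patch, the election rule it adds to find_busiest_group() can be illustrated outside the kernel sources. The following is a minimal standalone C sketch under assumed, hypothetical types (struct group, pick_balance_cpu, should_balance are illustrative names, not the kernel's data structures or APIs): the first idle cpu in the local group, or the group's first cpu if none are idle, is the only one allowed to continue balancing at this domain.

#include <stdio.h>

/* Hypothetical stand-in for a sched group: a list of cpu ids plus
 * per-cpu idle state.  The real kernel walks group->cpumask instead. */
struct group {
	int cpus[8];	/* cpu ids in the group */
	int idle[8];	/* 1 if that cpu is currently idle */
	int nr;		/* number of cpus in the group */
};

/*
 * Mirrors the election logic the patch adds: start with the first cpu
 * in the group, then let the first idle cpu encountered take over as
 * balance_cpu.
 */
static int pick_balance_cpu(const struct group *g)
{
	int balance_cpu = g->cpus[0];
	int first_idle_cpu = 0;
	int i;

	for (i = 0; i < g->nr; i++) {
		if (g->idle[i] && !first_idle_cpu) {
			first_idle_cpu = 1;
			balance_cpu = g->cpus[i];
		}
	}
	return balance_cpu;
}

/* Returns 1 if this_cpu should do the load balance at this domain. */
static int should_balance(const struct group *g, int this_cpu)
{
	return pick_balance_cpu(g) == this_cpu;
}

int main(void)
{
	/* cpus 4..7 form one group; cpu 6 is the only idle one */
	struct group g = {
		.cpus = { 4, 5, 6, 7 },
		.idle = { 0, 0, 1, 0 },
		.nr   = 4,
	};
	int cpu;

	for (cpu = 4; cpu <= 7; cpu++)
		printf("cpu %d balances here: %s\n", cpu,
		       should_balance(&g, cpu) ? "yes" : "no");
	return 0;
}

In this example only cpu 6 reports "yes"; the others correspond to the *balance = 0 early-exit path in the patch and rely on the elected cpu's balancing to percolate down through the lower domains.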
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c	59
1 file changed, 47 insertions(+), 12 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 15ce772a471a..4e453431c61a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -428,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 12
+#define SCHEDSTAT_VERSION 13
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -466,7 +466,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		seq_printf(seq, "domain%d %s", dcnt++, mask_str);
 		for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
 				itype++) {
-			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
+			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu",
 				sd->lb_cnt[itype],
 				sd->lb_balanced[itype],
 				sd->lb_failed[itype],
@@ -474,7 +474,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
 				sd->lb_gained[itype],
 				sd->lb_hot_gained[itype],
 				sd->lb_nobusyq[itype],
-				sd->lb_nobusyg[itype]);
+				sd->lb_nobusyg[itype],
+				sd->lb_stopbalance[itype]);
 		}
 		seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
 			sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
@@ -2249,7 +2250,7 @@ out:
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
 		   unsigned long *imbalance, enum idle_type idle, int *sd_idle,
-		   cpumask_t *cpus)
+		   cpumask_t *cpus, int *balance)
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2278,10 +2279,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		unsigned long load, group_capacity;
 		int local_group;
 		int i;
+		unsigned int balance_cpu = -1, first_idle_cpu = 0;
 		unsigned long sum_nr_running, sum_weighted_load;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
+		if (local_group)
+			balance_cpu = first_cpu(group->cpumask);
+
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
 
@@ -2297,9 +2302,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 				*sd_idle = 0;
 
 			/* Bias balancing toward cpus of our domain */
-			if (local_group)
+			if (local_group) {
+				if (idle_cpu(i) && !first_idle_cpu) {
+					first_idle_cpu = 1;
+					balance_cpu = i;
+				}
+
 				load = target_load(i, load_idx);
-			else
+			} else
 				load = source_load(i, load_idx);
 
 			avg_load += load;
@@ -2307,6 +2317,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 			sum_weighted_load += rq->raw_weighted_load;
 		}
 
+		/*
+		 * First idle cpu or the first cpu(busiest) in this sched group
+		 * is eligible for doing load balancing at this and above
+		 * domains.
+		 */
+		if (local_group && balance_cpu != this_cpu && balance) {
+			*balance = 0;
+			goto ret;
+		}
+
 		total_load += avg_load;
 		total_pwr += group->cpu_power;
 
@@ -2498,8 +2518,8 @@ out_balanced:
 		*imbalance = min_load_per_task;
 		return group_min;
 	}
-ret:
 #endif
+ret:
 	*imbalance = 0;
 	return NULL;
 }
@@ -2550,7 +2570,8 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
  * tasks if there is an imbalance.
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
-			struct sched_domain *sd, enum idle_type idle)
+			struct sched_domain *sd, enum idle_type idle,
+			int *balance)
 {
 	int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
 	struct sched_group *group;
@@ -2573,7 +2594,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 redo:
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
-				   &cpus);
+				   &cpus, balance);
+
+	if (*balance == 0) {
+		schedstat_inc(sd, lb_stopbalance[idle]);
+		goto out_balanced;
+	}
+
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[idle]);
 		goto out_balanced;
@@ -2715,7 +2742,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
 redo:
 	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
-				   &sd_idle, &cpus);
+				   &sd_idle, &cpus, NULL);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
 		goto out_balanced;
@@ -2885,7 +2912,7 @@ static DEFINE_SPINLOCK(balancing);
 
 static void run_rebalance_domains(struct softirq_action *h)
 {
-	int this_cpu = smp_processor_id();
+	int this_cpu = smp_processor_id(), balance = 1;
 	struct rq *this_rq = cpu_rq(this_cpu);
 	unsigned long interval;
 	struct sched_domain *sd;
@@ -2917,7 +2944,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 		}
 
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(this_cpu, this_rq, sd, idle)) {
+			if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
 				/*
 				 * We've pulled tasks over so either we're no
 				 * longer idle, or one of our SMT siblings is
@@ -2932,6 +2959,14 @@ static void run_rebalance_domains(struct softirq_action *h)
out:
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
+
+		/*
+		 * Stop the load balance at this level. There is another
+		 * CPU in our sched group which is doing load balancing more
+		 * actively.
+		 */
+		if (!balance)
+			break;
 	}
 	this_rq->next_balance = next_balance;
 }