author	Ken Chen <kenchen@google.com>	2007-10-17 10:55:11 -0400
committer	Ingo Molnar <mingo@elte.hu>	2007-10-17 10:55:11 -0400
commit	908a7c1b9b80d06708177432020c80d147754691 (patch)
tree	c3015123d79cb7484e1f64030512ab5a920747e5
parent	cd79007634854f9e936e2369890f2512f94b8759 (diff)
sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load balancer
that caused a 50% performance regression. When tasks are assigned, via cpu
affinity, to a subset of CPUs that spans multiple sched_domains (either ccNUMA
nodes or the new multi-core domain), the kernel fails to balance load properly
across those domains, because several pieces of logic in find_busiest_group()
misidentify the busiest sched group within a given domain. This leads to
inadequate load balancing and causes the 50% performance hit.

To give a concrete example, on a dual-core, 2 socket numa system there are
4 logical cpus, organized as:

CPU0 attaching sched-domain:
 domain 0: span 0003
  groups: 0001 0002
 domain 1: span 000f
  groups: 0003 000c
CPU1 attaching sched-domain:
 domain 0: span 0003
  groups: 0002 0001
 domain 1: span 000f
  groups: 0003 000c
CPU2 attaching sched-domain:
 domain 0: span 000c
  groups: 0004 0008
 domain 1: span 000f
  groups: 000c 0003
CPU3 attaching sched-domain:
 domain 0: span 000c
  groups: 0008 0004
 domain 1: span 000f
  groups: 000c 0003

If I run 2 tasks with CPU affinity set to 0x5, there are situations where cpu0
has a run queue length of 2 while cpu2 is idle. The kernel load balancer is
unable to spread these two tasks over cpu0 and cpu2 because at least three
pieces of logic in find_busiest_group() heavily bias load balancing towards
power saving mode. For example, when determining the "busiest" group, the
kernel only sets it when "sum_nr_running > group_capacity". This test is
flawed because "sum_nr_running" is not necessarily the same as the number of
tasks allowed to run within the sched group. The end result is that the
kernel "thinks" everything is balanced, while in reality there is an
imbalance, leaving one CPU over-subscribed and another idle. Two other pieces
of logic in the same function cause a similar effect.

The nastiness of this bug is that the kernel cannot get unstuck from this
broken state. From what we've seen in our environment, the kernel stays in
the imbalanced state for extended periods of time, and it is also very easy
to get into that state (it is pretty much 100% reproducible for us).

The proposed fix adds logic to find_busiest_group() to detect intrinsic
imbalance within the busiest group. When such a condition is detected, load
balancing goes into spread mode instead of the default grouping mode.

Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
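To make the failure mode concrete, the new check can be illustrated with a
small standalone C sketch. This is not kernel code: the group layout, the
per-cpu loads and the SCHED_LOAD_SCALE value of 1024 are assumptions modelling
the 0x5-affinity example above (both tasks stacked on cpu0 inside the
{cpu0, cpu1} group, cpu2 idle in the other group).

/*
 * Minimal sketch of the group-imbalance heuristic described above.
 * All numbers are assumptions for illustration only: each nice-0 task
 * is taken to contribute SCHED_LOAD_SCALE of weighted load, and the
 * group holds two full-power CPUs.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL		/* assumed load unit of one nice-0 task */

struct cpu_sample {
	unsigned long load;		/* weighted load on this cpu */
	unsigned long nr_running;	/* runnable tasks on this cpu */
};

int main(void)
{
	/* sched group {cpu0, cpu1}: both affine tasks ended up on cpu0 */
	struct cpu_sample group[2] = {
		{ 2 * SCHED_LOAD_SCALE, 2 },	/* cpu0: over-subscribed */
		{ 0, 0 },			/* cpu1: idle, tasks not allowed here */
	};
	unsigned long max_cpu_load = 0, min_cpu_load = ~0UL;
	unsigned long sum_nr_running = 0;
	unsigned long group_capacity = 2;	/* two full-power cpus */
	int i, group_imb;

	for (i = 0; i < 2; i++) {
		if (group[i].load > max_cpu_load)
			max_cpu_load = group[i].load;
		if (min_cpu_load > group[i].load)
			min_cpu_load = group[i].load;
		sum_nr_running += group[i].nr_running;
	}

	/* Old test: only an over-capacity group is considered busiest. */
	printf("over capacity:   %s\n",
	       sum_nr_running > group_capacity ? "yes" : "no");	/* prints "no" */

	/* New test: a per-cpu load spread above SCHED_LOAD_SCALE marks the
	 * group busiest even though it is nominally within capacity. */
	group_imb = (max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE;
	printf("group imbalance: %s\n", group_imb ? "yes" : "no");	/* prints "yes" */

	return 0;
}

With these numbers the over-capacity test never fires (2 tasks on a 2-cpu
group), so the old code leaves cpu0 with both tasks; the spread check is what
lets find_busiest_group() fall back to spread mode as described above.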
-rw-r--r--	kernel/sched.c	23
1 file changed, 19 insertions, 4 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 5e220bf7d4c4..975436435b42 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2336,7 +2336,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         unsigned long max_pull;
         unsigned long busiest_load_per_task, busiest_nr_running;
         unsigned long this_load_per_task, this_nr_running;
-        int load_idx;
+        int load_idx, group_imb = 0;
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
         int power_savings_balance = 1;
         unsigned long leader_nr_running = 0, min_load_per_task = 0;
@@ -2355,9 +2355,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 load_idx = sd->idle_idx;
 
         do {
-                unsigned long load, group_capacity;
+                unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
                 int local_group;
                 int i;
+                int __group_imb = 0;
                 unsigned int balance_cpu = -1, first_idle_cpu = 0;
                 unsigned long sum_nr_running, sum_weighted_load;
 
@@ -2368,6 +2369,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
                 /* Tally up the load of all CPUs in the group */
                 sum_weighted_load = sum_nr_running = avg_load = 0;
+                max_cpu_load = 0;
+                min_cpu_load = ~0UL;
 
                 for_each_cpu_mask(i, group->cpumask) {
                         struct rq *rq;
@@ -2388,8 +2391,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                                 }
 
                                 load = target_load(i, load_idx);
-                        } else
+                        } else {
                                 load = source_load(i, load_idx);
+                                if (load > max_cpu_load)
+                                        max_cpu_load = load;
+                                if (min_cpu_load > load)
+                                        min_cpu_load = load;
+                        }
 
                         avg_load += load;
                         sum_nr_running += rq->nr_running;
@@ -2415,6 +2423,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 avg_load = sg_div_cpu_power(group,
                                 avg_load * SCHED_LOAD_SCALE);
 
+                if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+                        __group_imb = 1;
+
                 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 
                 if (local_group) {
@@ -2423,11 +2434,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                         this_nr_running = sum_nr_running;
                         this_load_per_task = sum_weighted_load;
                 } else if (avg_load > max_load &&
-                           sum_nr_running > group_capacity) {
+                           (sum_nr_running > group_capacity || __group_imb)) {
                         max_load = avg_load;
                         busiest = group;
                         busiest_nr_running = sum_nr_running;
                         busiest_load_per_task = sum_weighted_load;
+                        group_imb = __group_imb;
                 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2499,6 +2511,9 @@ group_next:
                 goto out_balanced;
 
         busiest_load_per_task /= busiest_nr_running;
+        if (group_imb)
+                busiest_load_per_task = min(busiest_load_per_task, avg_load);
+
         /*
          * We're trying to get all the cpus to the average_load, so we don't
          * want to push ourselves above the average load, nor do we wish to