aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Lameter <christoph@sgi.com>2006-09-26 02:30:51 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-09-26 11:48:43 -0400
commit0a2966b48fb784e437520e400ddc94874ddbd4e8 (patch)
tree1bb2a4d2c33aa9d60eb427b0b4bf189e65073caf
parent656ddf798dbe588217c97e58b9cfdfce649ebdc3 (diff)
[PATCH] Fix longstanding load balancing bug in the scheduler
The scheduler will stop load balancing if the most busy processor contains processes pinned via processor affinity. The scheduler currently only does one search for busiest cpu. If it cannot pull any tasks away from the busiest cpu because they were pinned then the scheduler goes into a corner and sulks leaving the idle processors idle. F.e. If you have processor 0 busy running four tasks pinned via taskset, there are none on processor 1 and one just started two processes on processor 2 then the scheduler will not move one of the two processes away from processor 2. This patch fixes that issue by forcing the scheduler to come out of its corner and retrying the load balancing by considering other processors for load balancing. This patch was originally developed by John Hawkes and discussed at http://marc.theaimsgroup.com/?l=linux-kernel&m=113901368523205&w=2. I have removed extraneous material and gone back to equipping struct rq with the cpu the queue is associated with since this makes the patch much easier and it is likely that others in the future will have the same difficulty of figuring out which processor owns which runqueue. The overhead added through these patches is a single word on the stack if the kernel is configured to support 32 cpus or less (32 bit). For 32 bit environments the maximum number of cpus that can be configued is 255 which would result in the use of 32 bytes additional on the stack. On IA64 up to 1k cpus can be configured which will result in the use of 128 additional bytes on the stack. The maximum additional cache footprint is one cacheline. Typically memory use will be much less than a cacheline and the additional cpumask will be placed on the stack in a cacheline that already contains other local variable. Signed-off-by: Christoph Lameter <clameter@sgi.com> Cc: John Hawkes <hawkes@sgi.com> Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Peter Williams <pwil3058@bigpond.net.au> Cc: <stable@kernel.org> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--kernel/sched.c54
1 files changed, 46 insertions, 8 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index a234fbee1238..5c848fd4e461 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -238,6 +238,7 @@ struct rq {
238 /* For active balancing */ 238 /* For active balancing */
239 int active_balance; 239 int active_balance;
240 int push_cpu; 240 int push_cpu;
241 int cpu; /* cpu of this runqueue */
241 242
242 struct task_struct *migration_thread; 243 struct task_struct *migration_thread;
243 struct list_head migration_queue; 244 struct list_head migration_queue;
@@ -267,6 +268,15 @@ struct rq {
267 268
268static DEFINE_PER_CPU(struct rq, runqueues); 269static DEFINE_PER_CPU(struct rq, runqueues);
269 270
271static inline int cpu_of(struct rq *rq)
272{
273#ifdef CONFIG_SMP
274 return rq->cpu;
275#else
276 return 0;
277#endif
278}
279
270/* 280/*
271 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 281 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
272 * See detach_destroy_domains: synchronize_sched for details. 282 * See detach_destroy_domains: synchronize_sched for details.
@@ -2211,7 +2221,8 @@ out:
2211 */ 2221 */
2212static struct sched_group * 2222static struct sched_group *
2213find_busiest_group(struct sched_domain *sd, int this_cpu, 2223find_busiest_group(struct sched_domain *sd, int this_cpu,
2214 unsigned long *imbalance, enum idle_type idle, int *sd_idle) 2224 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2225 cpumask_t *cpus)
2215{ 2226{
2216 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2227 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2217 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2228 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2248,7 +2259,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2248 sum_weighted_load = sum_nr_running = avg_load = 0; 2259 sum_weighted_load = sum_nr_running = avg_load = 0;
2249 2260
2250 for_each_cpu_mask(i, group->cpumask) { 2261 for_each_cpu_mask(i, group->cpumask) {
2251 struct rq *rq = cpu_rq(i); 2262 struct rq *rq;
2263
2264 if (!cpu_isset(i, *cpus))
2265 continue;
2266
2267 rq = cpu_rq(i);
2252 2268
2253 if (*sd_idle && !idle_cpu(i)) 2269 if (*sd_idle && !idle_cpu(i))
2254 *sd_idle = 0; 2270 *sd_idle = 0;
@@ -2466,13 +2482,17 @@ ret:
2466 */ 2482 */
2467static struct rq * 2483static struct rq *
2468find_busiest_queue(struct sched_group *group, enum idle_type idle, 2484find_busiest_queue(struct sched_group *group, enum idle_type idle,
2469 unsigned long imbalance) 2485 unsigned long imbalance, cpumask_t *cpus)
2470{ 2486{
2471 struct rq *busiest = NULL, *rq; 2487 struct rq *busiest = NULL, *rq;
2472 unsigned long max_load = 0; 2488 unsigned long max_load = 0;
2473 int i; 2489 int i;
2474 2490
2475 for_each_cpu_mask(i, group->cpumask) { 2491 for_each_cpu_mask(i, group->cpumask) {
2492
2493 if (!cpu_isset(i, *cpus))
2494 continue;
2495
2476 rq = cpu_rq(i); 2496 rq = cpu_rq(i);
2477 2497
2478 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2498 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
@@ -2511,6 +2531,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2511 struct sched_group *group; 2531 struct sched_group *group;
2512 unsigned long imbalance; 2532 unsigned long imbalance;
2513 struct rq *busiest; 2533 struct rq *busiest;
2534 cpumask_t cpus = CPU_MASK_ALL;
2514 2535
2515 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2536 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2516 !sched_smt_power_savings) 2537 !sched_smt_power_savings)
@@ -2518,13 +2539,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2518 2539
2519 schedstat_inc(sd, lb_cnt[idle]); 2540 schedstat_inc(sd, lb_cnt[idle]);
2520 2541
2521 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); 2542redo:
2543 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2544 &cpus);
2522 if (!group) { 2545 if (!group) {
2523 schedstat_inc(sd, lb_nobusyg[idle]); 2546 schedstat_inc(sd, lb_nobusyg[idle]);
2524 goto out_balanced; 2547 goto out_balanced;
2525 } 2548 }
2526 2549
2527 busiest = find_busiest_queue(group, idle, imbalance); 2550 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2528 if (!busiest) { 2551 if (!busiest) {
2529 schedstat_inc(sd, lb_nobusyq[idle]); 2552 schedstat_inc(sd, lb_nobusyq[idle]);
2530 goto out_balanced; 2553 goto out_balanced;
@@ -2549,8 +2572,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2549 double_rq_unlock(this_rq, busiest); 2572 double_rq_unlock(this_rq, busiest);
2550 2573
2551 /* All tasks on this runqueue were pinned by CPU affinity */ 2574 /* All tasks on this runqueue were pinned by CPU affinity */
2552 if (unlikely(all_pinned)) 2575 if (unlikely(all_pinned)) {
2576 cpu_clear(cpu_of(busiest), cpus);
2577 if (!cpus_empty(cpus))
2578 goto redo;
2553 goto out_balanced; 2579 goto out_balanced;
2580 }
2554 } 2581 }
2555 2582
2556 if (!nr_moved) { 2583 if (!nr_moved) {
@@ -2639,18 +2666,22 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2639 unsigned long imbalance; 2666 unsigned long imbalance;
2640 int nr_moved = 0; 2667 int nr_moved = 0;
2641 int sd_idle = 0; 2668 int sd_idle = 0;
2669 cpumask_t cpus = CPU_MASK_ALL;
2642 2670
2643 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) 2671 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2644 sd_idle = 1; 2672 sd_idle = 1;
2645 2673
2646 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2674 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2647 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); 2675redo:
2676 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2677 &sd_idle, &cpus);
2648 if (!group) { 2678 if (!group) {
2649 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2679 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2650 goto out_balanced; 2680 goto out_balanced;
2651 } 2681 }
2652 2682
2653 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); 2683 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
2684 &cpus);
2654 if (!busiest) { 2685 if (!busiest) {
2655 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2686 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2656 goto out_balanced; 2687 goto out_balanced;
@@ -2668,6 +2699,12 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2668 minus_1_or_zero(busiest->nr_running), 2699 minus_1_or_zero(busiest->nr_running),
2669 imbalance, sd, NEWLY_IDLE, NULL); 2700 imbalance, sd, NEWLY_IDLE, NULL);
2670 spin_unlock(&busiest->lock); 2701 spin_unlock(&busiest->lock);
2702
2703 if (!nr_moved) {
2704 cpu_clear(cpu_of(busiest), cpus);
2705 if (!cpus_empty(cpus))
2706 goto redo;
2707 }
2671 } 2708 }
2672 2709
2673 if (!nr_moved) { 2710 if (!nr_moved) {
@@ -6747,6 +6784,7 @@ void __init sched_init(void)
6747 rq->cpu_load[j] = 0; 6784 rq->cpu_load[j] = 0;
6748 rq->active_balance = 0; 6785 rq->active_balance = 0;
6749 rq->push_cpu = 0; 6786 rq->push_cpu = 0;
6787 rq->cpu = i;
6750 rq->migration_thread = NULL; 6788 rq->migration_thread = NULL;
6751 INIT_LIST_HEAD(&rq->migration_queue); 6789 INIT_LIST_HEAD(&rq->migration_queue);
6752#endif 6790#endif