about | summary | refs | log | tree | commit | diff | stats
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2012-03-09 18:07:36 -0500
committer: Ingo Molnar <mingo@elte.hu> 2012-03-12 15:43:15 -0400
commit: 5d6523ebd2f67de9d23285aad7f3910e7b0aee83 (patch)
tree: 745ee9e9378dd255fe7097e42c34a86ba9b8ee48 /kernel/sched/fair.c
parent: 2e5b5b3a1b7768c89fbfeca18e75f8ee377e924c (diff)
sched: Fix load-balance wreckage
Commit 367456c ("sched: Ditch per cgroup task lists for load-balancing") completely wrecked load-balancing due to a few silly mistakes. Correct those and remove more pointless code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-zk04ihygwxn7qqrlpaf73b0r@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- kernel/sched/fair.c 110
1 files changed, 39 insertions, 71 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a0424fc4cc54..def17aa302d5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
785#ifdef CONFIG_SMP 785#ifdef CONFIG_SMP
786 if (entity_is_task(se)) 786 if (entity_is_task(se))
787 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 787 list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
788#endif 788#endif
789 cfs_rq->nr_running++; 789 cfs_rq->nr_running++;
790} 790}
@@ -3071,7 +3071,6 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3071 3071
3072#define LBF_ALL_PINNED 0x01 3072#define LBF_ALL_PINNED 0x01
3073#define LBF_NEED_BREAK 0x02 3073#define LBF_NEED_BREAK 0x02
3074#define LBF_ABORT 0x04
3075 3074
3076struct lb_env { 3075struct lb_env {
3077 struct sched_domain *sd; 3076 struct sched_domain *sd;
@@ -3083,7 +3082,7 @@ struct lb_env {
3083 struct rq *dst_rq; 3082 struct rq *dst_rq;
3084 3083
3085 enum cpu_idle_type idle; 3084 enum cpu_idle_type idle;
3086 unsigned long max_load_move; 3085 long load_move;
3087 unsigned int flags; 3086 unsigned int flags;
3088 3087
3089 unsigned int loop; 3088 unsigned int loop;
@@ -3216,39 +3215,47 @@ static int move_one_task(struct lb_env *env)
3216 3215
3217static unsigned long task_h_load(struct task_struct *p); 3216static unsigned long task_h_load(struct task_struct *p);
3218 3217
3219static unsigned long balance_tasks(struct lb_env *env) 3218/*
3219 * move_tasks tries to move up to load_move weighted load from busiest to
3220 * this_rq, as part of a balancing operation within domain "sd".
3221 * Returns 1 if successful and 0 otherwise.
3222 *
3223 * Called with both runqueues locked.
3224 */
3225static int move_tasks(struct lb_env *env)
3220{ 3226{
3221 long rem_load_move = env->max_load_move; 3227 struct list_head *tasks = &env->src_rq->cfs_tasks;
3222 struct task_struct *p, *n; 3228 struct task_struct *p;
3223 unsigned long load; 3229 unsigned long load;
3224 int pulled = 0; 3230 int pulled = 0;
3225 3231
3226 if (env->max_load_move == 0) 3232 if (env->load_move <= 0)
3227 goto out; 3233 return 0;
3234
3235 while (!list_empty(tasks)) {
3236 p = list_first_entry(tasks, struct task_struct, se.group_node);
3228 3237
3229 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3230 env->loop++; 3238 env->loop++;
3231 /* We've more or less seen every task there is, call it quits */ 3239 /* We've more or less seen every task there is, call it quits */
3232 if (env->loop > env->loop_max) { 3240 if (env->loop > env->loop_max)
3233 env->flags |= LBF_ABORT;
3234 break; 3241 break;
3235 } 3242
3236 /* take a beather every nr_migrate tasks */ 3243 /* take a breather every nr_migrate tasks */
3237 if (env->loop > env->loop_break) { 3244 if (env->loop > env->loop_break) {
3238 env->loop_break += sysctl_sched_nr_migrate; 3245 env->loop_break += sysctl_sched_nr_migrate;
3239 env->flags |= LBF_NEED_BREAK; 3246 env->flags |= LBF_NEED_BREAK;
3240 break; 3247 break;
3241 } 3248 }
3242 3249
3243 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, 3250 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3244 env->dst_cpu))
3245 goto next; 3251 goto next;
3246 3252
3247 load = task_h_load(p); 3253 load = task_h_load(p);
3254
3248 if (load < 16 && !env->sd->nr_balance_failed) 3255 if (load < 16 && !env->sd->nr_balance_failed)
3249 goto next; 3256 goto next;
3250 3257
3251 if ((load * 2) > rem_load_move) 3258 if ((load / 2) > env->load_move)
3252 goto next; 3259 goto next;
3253 3260
3254 if (!can_migrate_task(p, env)) 3261 if (!can_migrate_task(p, env))
@@ -3256,7 +3263,7 @@ static unsigned long balance_tasks(struct lb_env *env)
3256 3263
3257 move_task(p, env); 3264 move_task(p, env);
3258 pulled++; 3265 pulled++;
3259 rem_load_move -= load; 3266 env->load_move -= load;
3260 3267
3261#ifdef CONFIG_PREEMPT 3268#ifdef CONFIG_PREEMPT
3262 /* 3269 /*
@@ -3264,24 +3271,22 @@ static unsigned long balance_tasks(struct lb_env *env)
3264 * kernels will stop after the first task is pulled to minimize 3271 * kernels will stop after the first task is pulled to minimize
3265 * the critical section. 3272 * the critical section.
3266 */ 3273 */
3267 if (env->idle == CPU_NEWLY_IDLE) { 3274 if (env->idle == CPU_NEWLY_IDLE)
3268 env->flags |= LBF_ABORT;
3269 break; 3275 break;
3270 }
3271#endif 3276#endif
3272 3277
3273 /* 3278 /*
3274 * We only want to steal up to the prescribed amount of 3279 * We only want to steal up to the prescribed amount of
3275 * weighted load. 3280 * weighted load.
3276 */ 3281 */
3277 if (rem_load_move <= 0) 3282 if (env->load_move <= 0)
3278 break; 3283 break;
3279 3284
3280 continue; 3285 continue;
3281next: 3286next:
3282 list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks); 3287 list_move_tail(&p->se.group_node, tasks);
3283 } 3288 }
3284out: 3289
3285 /* 3290 /*
3286 * Right now, this is one of only two places move_task() is called, 3291 * Right now, this is one of only two places move_task() is called,
3287 * so we can safely collect move_task() stats here rather than 3292 * so we can safely collect move_task() stats here rather than
@@ -3289,7 +3294,7 @@ out:
3289 */ 3294 */
3290 schedstat_add(env->sd, lb_gained[env->idle], pulled); 3295 schedstat_add(env->sd, lb_gained[env->idle], pulled);
3291 3296
3292 return env->max_load_move - rem_load_move; 3297 return pulled;
3293} 3298}
3294 3299
3295#ifdef CONFIG_FAIR_GROUP_SCHED 3300#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3399,43 +3404,6 @@ static unsigned long task_h_load(struct task_struct *p)
3399} 3404}
3400#endif 3405#endif
3401 3406
3402/*
3403 * move_tasks tries to move up to max_load_move weighted load from busiest to
3404 * this_rq, as part of a balancing operation within domain "sd".
3405 * Returns 1 if successful and 0 otherwise.
3406 *
3407 * Called with both runqueues locked.
3408 */
3409static int move_tasks(struct lb_env *env)
3410{
3411 unsigned long max_load_move = env->max_load_move;
3412 unsigned long total_load_moved = 0, load_moved;
3413
3414 update_h_load(cpu_of(env->src_rq));
3415 do {
3416 env->max_load_move = max_load_move - total_load_moved;
3417 load_moved = balance_tasks(env);
3418 total_load_moved += load_moved;
3419
3420 if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
3421 break;
3422
3423#ifdef CONFIG_PREEMPT
3424 /*
3425 * NEWIDLE balancing is a source of latency, so preemptible
3426 * kernels will stop after the first task is pulled to minimize
3427 * the critical section.
3428 */
3429 if (env->idle == CPU_NEWLY_IDLE && env->dst_rq->nr_running) {
3430 env->flags |= LBF_ABORT;
3431 break;
3432 }
3433#endif
3434 } while (load_moved && max_load_move > total_load_moved);
3435
3436 return total_load_moved > 0;
3437}
3438
3439/********** Helpers for find_busiest_group ************************/ 3407/********** Helpers for find_busiest_group ************************/
3440/* 3408/*
3441 * sd_lb_stats - Structure to store the statistics of a sched_domain 3409 * sd_lb_stats - Structure to store the statistics of a sched_domain
@@ -4477,31 +4445,31 @@ redo:
4477 * correctly treated as an imbalance. 4445 * correctly treated as an imbalance.
4478 */ 4446 */
4479 env.flags |= LBF_ALL_PINNED; 4447 env.flags |= LBF_ALL_PINNED;
4480 env.max_load_move = imbalance; 4448 env.load_move = imbalance;
4481 env.src_cpu = busiest->cpu; 4449 env.src_cpu = busiest->cpu;
4482 env.src_rq = busiest; 4450 env.src_rq = busiest;
4483 env.loop_max = busiest->nr_running; 4451 env.loop_max = busiest->nr_running;
4484 4452
4453more_balance:
4485 local_irq_save(flags); 4454 local_irq_save(flags);
4486 double_rq_lock(this_rq, busiest); 4455 double_rq_lock(this_rq, busiest);
4487 ld_moved = move_tasks(&env); 4456 if (!env.loop)
4457 update_h_load(env.src_cpu);
4458 ld_moved += move_tasks(&env);
4488 double_rq_unlock(this_rq, busiest); 4459 double_rq_unlock(this_rq, busiest);
4489 local_irq_restore(flags); 4460 local_irq_restore(flags);
4490 4461
4462 if (env.flags & LBF_NEED_BREAK) {
4463 env.flags &= ~LBF_NEED_BREAK;
4464 goto more_balance;
4465 }
4466
4491 /* 4467 /*
4492 * some other cpu did the load balance for us. 4468 * some other cpu did the load balance for us.
4493 */ 4469 */
4494 if (ld_moved && this_cpu != smp_processor_id()) 4470 if (ld_moved && this_cpu != smp_processor_id())
4495 resched_cpu(this_cpu); 4471 resched_cpu(this_cpu);
4496 4472
4497 if (env.flags & LBF_ABORT)
4498 goto out_balanced;
4499
4500 if (env.flags & LBF_NEED_BREAK) {
4501 env.flags &= ~LBF_NEED_BREAK;
4502 goto redo;
4503 }
4504
4505 /* All tasks on this runqueue were pinned by CPU affinity */ 4473 /* All tasks on this runqueue were pinned by CPU affinity */
4506 if (unlikely(env.flags & LBF_ALL_PINNED)) { 4474 if (unlikely(env.flags & LBF_ALL_PINNED)) {
4507 cpumask_clear_cpu(cpu_of(busiest), cpus); 4475 cpumask_clear_cpu(cpu_of(busiest), cpus);