author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2012-03-09 18:07:36 -0500
committer  Ingo Molnar <mingo@elte.hu>                2012-03-12 15:43:15 -0400
commit     5d6523ebd2f67de9d23285aad7f3910e7b0aee83
tree       745ee9e9378dd255fe7097e42c34a86ba9b8ee48 /kernel/sched
parent     2e5b5b3a1b7768c89fbfeca18e75f8ee377e924c
sched: Fix load-balance wreckage
Commit 367456c ("sched: Ditch per cgroup task lists for
load-balancing") completely wrecked load-balancing due to
a few silly mistakes.
Correct those and remove more pointless code.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-zk04ihygwxn7qqrlpaf73b0r@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/fair.c | 110
1 file changed, 39 insertions, 71 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a0424fc4cc54..def17aa302d5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
 	if (entity_is_task(se))
-		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+		list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
 #endif
 	cfs_rq->nr_running++;
 }
@@ -3071,7 +3071,6 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
-#define LBF_ABORT	0x04
 
 struct lb_env {
 	struct sched_domain	*sd;
@@ -3083,7 +3082,7 @@ struct lb_env {
 	struct rq		*dst_rq;
 
 	enum cpu_idle_type	idle;
-	unsigned long		max_load_move;
+	long			load_move;
 	unsigned int		flags;
 
 	unsigned int		loop;
@@ -3216,39 +3215,47 @@ static int move_one_task(struct lb_env *env)
 
 static unsigned long task_h_load(struct task_struct *p);
 
-static unsigned long balance_tasks(struct lb_env *env)
+/*
+ * move_tasks tries to move up to load_move weighted load from busiest to
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct lb_env *env)
 {
-	long rem_load_move = env->max_load_move;
-	struct task_struct *p, *n;
+	struct list_head *tasks = &env->src_rq->cfs_tasks;
+	struct task_struct *p;
 	unsigned long load;
 	int pulled = 0;
 
-	if (env->max_load_move == 0)
-		goto out;
+	if (env->load_move <= 0)
+		return 0;
+
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
 
-	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
 		env->loop++;
 		/* We've more or less seen every task there is, call it quits */
-		if (env->loop > env->loop_max) {
-			env->flags |= LBF_ABORT;
+		if (env->loop > env->loop_max)
 			break;
-		}
-		/* take a beather every nr_migrate tasks */
+
+		/* take a breather every nr_migrate tasks */
 		if (env->loop > env->loop_break) {
 			env->loop_break += sysctl_sched_nr_migrate;
 			env->flags |= LBF_NEED_BREAK;
 			break;
 		}
 
-		if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
-				env->dst_cpu))
+		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 			goto next;
 
 		load = task_h_load(p);
+
 		if (load < 16 && !env->sd->nr_balance_failed)
 			goto next;
 
-		if ((load * 2) > rem_load_move)
+		if ((load / 2) > env->load_move)
 			goto next;
 
 		if (!can_migrate_task(p, env))
@@ -3256,7 +3263,7 @@ static unsigned long balance_tasks(struct lb_env *env)
 
 		move_task(p, env);
 		pulled++;
-		rem_load_move -= load;
+		env->load_move -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
@@ -3264,24 +3271,22 @@ static unsigned long balance_tasks(struct lb_env *env)
 		 * kernels will stop after the first task is pulled to minimize
 		 * the critical section.
 		 */
-		if (env->idle == CPU_NEWLY_IDLE) {
-			env->flags |= LBF_ABORT;
+		if (env->idle == CPU_NEWLY_IDLE)
 			break;
-		}
 #endif
 
 		/*
 		 * We only want to steal up to the prescribed amount of
 		 * weighted load.
 		 */
-		if (rem_load_move <= 0)
+		if (env->load_move <= 0)
 			break;
 
 		continue;
 next:
-		list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks);
+		list_move_tail(&p->se.group_node, tasks);
 	}
-out:
+
 	/*
 	 * Right now, this is one of only two places move_task() is called,
 	 * so we can safely collect move_task() stats here rather than
@@ -3289,7 +3294,7 @@ out:
 	 */
 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
 
-	return env->max_load_move - rem_load_move;
+	return pulled;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3399,43 +3404,6 @@ static unsigned long task_h_load(struct task_struct *p)
 }
 #endif
 
-/*
- * move_tasks tries to move up to max_load_move weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct lb_env *env)
-{
-	unsigned long max_load_move = env->max_load_move;
-	unsigned long total_load_moved = 0, load_moved;
-
-	update_h_load(cpu_of(env->src_rq));
-	do {
-		env->max_load_move = max_load_move - total_load_moved;
-		load_moved = balance_tasks(env);
-		total_load_moved += load_moved;
-
-		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
-			break;
-
-#ifdef CONFIG_PREEMPT
-		/*
-		 * NEWIDLE balancing is a source of latency, so preemptible
-		 * kernels will stop after the first task is pulled to minimize
-		 * the critical section.
-		 */
-		if (env->idle == CPU_NEWLY_IDLE && env->dst_rq->nr_running) {
-			env->flags |= LBF_ABORT;
-			break;
-		}
-#endif
-	} while (load_moved && max_load_move > total_load_moved);
-
-	return total_load_moved > 0;
-}
-
 /********** Helpers for find_busiest_group ************************/
 /*
  * sd_lb_stats - Structure to store the statistics of a sched_domain
@@ -4477,31 +4445,31 @@ redo:
		 * correctly treated as an imbalance.
		 */
		env.flags |= LBF_ALL_PINNED;
-		env.max_load_move = imbalance;
+		env.load_move = imbalance;
		env.src_cpu = busiest->cpu;
		env.src_rq = busiest;
		env.loop_max = busiest->nr_running;
 
+more_balance:
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);
-		ld_moved = move_tasks(&env);
+		if (!env.loop)
+			update_h_load(env.src_cpu);
+		ld_moved += move_tasks(&env);
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);
 
+		if (env.flags & LBF_NEED_BREAK) {
+			env.flags &= ~LBF_NEED_BREAK;
+			goto more_balance;
+		}
+
		/*
		 * some other cpu did the load balance for us.
		 */
		if (ld_moved && this_cpu != smp_processor_id())
			resched_cpu(this_cpu);
 
-		if (env.flags & LBF_ABORT)
-			goto out_balanced;
-
-		if (env.flags & LBF_NEED_BREAK) {
-			env.flags &= ~LBF_NEED_BREAK;
-			goto redo;
-		}
-
		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(env.flags & LBF_ALL_PINNED)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
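
The rewritten move_tasks() above pulls tasks from the head of src_rq->cfs_tasks and uses list_move_tail() to rotate the ones it skips to the back of the list, so the more_balance retry taken after LBF_NEED_BREAK resumes with tasks it has not yet inspected. Below is a minimal userspace sketch of that rotate-and-retry pattern; it is not kernel code, and the list helpers, task names, load values and loop_max bound are invented for illustration.

/* Standalone illustration only -- not part of the patch. */
#include <stdio.h>

struct task {
	const char *name;
	long load;
	struct task *prev, *next;
};

/* Circular doubly-linked list with a dummy head, in the style of list_head. */
static void list_init(struct task *head)
{
	head->prev = head->next = head;
}

static void list_del_entry(struct task *t)
{
	t->prev->next = t->next;
	t->next->prev = t->prev;
}

static void list_add_tail_entry(struct task *t, struct task *head)
{
	t->prev = head->prev;
	t->next = head;
	head->prev->next = t;
	head->prev = t;
}

static int list_empty_head(const struct task *head)
{
	return head->next == head;
}

/*
 * Pull tasks off the head of the list until the load budget is spent.
 * A task that is too heavy for the remaining budget is rotated to the
 * tail (the "goto next" path in the patch), so a later retry pass sees
 * the not-yet-inspected tasks first.
 */
static int move_tasks_sketch(struct task *head, long load_budget, int loop_max)
{
	int pulled = 0, loop = 0;

	while (!list_empty_head(head) && load_budget > 0) {
		struct task *p = head->next;	/* list_first_entry() */

		if (++loop > loop_max)		/* seen roughly every task: quit */
			break;

		if (p->load / 2 > load_budget) {
			list_del_entry(p);	/* skip: rotate to the tail */
			list_add_tail_entry(p, head);
			continue;
		}

		list_del_entry(p);		/* "migrate" the task off this list */
		load_budget -= p->load;
		pulled++;
		printf("pulled %s (load %ld)\n", p->name, p->load);
	}

	return pulled;
}

int main(void)
{
	struct task head, a = { "a", 40 }, b = { "b", 200 }, c = { "c", 30 };

	list_init(&head);
	list_add_tail_entry(&a, &head);
	list_add_tail_entry(&b, &head);
	list_add_tail_entry(&c, &head);

	/* Pulls a and c; the too-heavy b is rotated to the tail and left behind. */
	printf("pulled %d tasks\n", move_tasks_sketch(&head, 100, 4));
	return 0;
}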