author     Peter Zijlstra <a.p.zijlstra@chello.nl>   2012-02-20 15:49:09 -0500
committer  Ingo Molnar <mingo@elte.hu>               2012-03-01 07:08:37 -0500
commit     367456c756a6b84f493ca9cc5b17b1f5d38ef466 (patch)
tree       0e95a2fa5cb25ea14e2841d84d4d2410ff383e33 /kernel/sched
parent     ddcdf6e7d9919d139031fa2a6addd9544a9a833e (diff)
sched: Ditch per cgroup task lists for load-balancing
Per cgroup load-balance has numerous problems, chief amongst them that
there is no sane ordering within the per-cgroup lists. So stop pretending
it makes sense and enqueue all tasks on a single per-rq list.

This also lets us more easily fix the forward-progress issue uncovered by
the lock-break work: rotate the list on failure to migrate, and limit the
total number of iterations to nr_running (which, with the lock being
released, isn't strictly accurate but is close enough).

Also add a filter that skips very light tasks on the first pass over the
list; this tries to avoid shooting whole cgroups around without affecting
overall balance.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: pjt@google.com
Link: http://lkml.kernel.org/n/tip-tx8yqydc7eimgq7i4rkc3a4g@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
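As a quick orientation before the diff: the patch replaces the per-cgroup task lists with one rq-wide cfs_tasks list, scans it at most loop_max (roughly nr_running) times, skips tasks whose hierarchical load is tiny on the first pass, and rotates tasks it cannot move to the tail of the list so that forward progress survives lock breaks. The userspace sketch below mirrors only the shape of that scan; toy_task, toy_h_load and all numeric values are invented for illustration and are not part of the patch.

#include <stdio.h>

struct toy_task {
        const char *name;
        unsigned long weight;   /* analogue of p->se.load.weight */
        int pinned;             /* stand-in for can_migrate_task() failing */
};

/* Analogue of task_h_load(): scale task weight by the group's h_load share. */
static unsigned long toy_h_load(unsigned long weight, unsigned long grp_h_load,
                                unsigned long grp_weight)
{
        return weight * grp_h_load / (grp_weight + 1);
}

int main(void)
{
        struct toy_task tasks[] = {
                { "light",     15, 0 },  /* falls under the 'load < 16' filter */
                { "pinned",  1024, 1 },  /* fails the migration check */
                { "movable", 1024, 0 },
        };
        unsigned long rem_load = 2048;          /* analogue of max_load_move */
        unsigned int loop, loop_max = 3;        /* loop_max ~ nr_running */

        for (loop = 0; loop < loop_max; loop++) {
                struct toy_task *p = &tasks[loop];
                unsigned long load = toy_h_load(p->weight, 1024, 1024);

                /* first-pass filter: very light tasks are not worth moving */
                if (load < 16)
                        continue;       /* the patch list_move_tail()s these */
                /* moving this task would overshoot the remaining imbalance */
                if (load * 2 > rem_load)
                        continue;
                if (p->pinned)
                        continue;       /* also rotated to the tail */

                rem_load -= load;
                printf("moved %-8s h_load=%lu, remaining=%lu\n",
                       p->name, load, rem_load);
        }
        return 0;
}

Since a nice-0 task weighs 1024, the patch's 'load < 16' cut-off only catches tasks whose scaled contribution is a small fraction of one nice-0 task, and even those are reconsidered once nr_balance_failed indicates the cheaper first attempts did not restore balance.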
Diffstat (limited to 'kernel/sched')
-rw-r--r--   kernel/sched/core.c  |   3
-rw-r--r--   kernel/sched/fair.c  | 176
-rw-r--r--   kernel/sched/sched.h |  10
3 files changed, 80 insertions, 109 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 25e06a5e048b..95545126be1c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6959,6 +6959,9 @@ void __init sched_init(void)
                 rq->online = 0;
                 rq->idle_stamp = 0;
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
+
+                INIT_LIST_HEAD(&rq->cfs_tasks);
+
                 rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
                 rq->nohz_flags = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 233d05171bf5..a0424fc4cc54 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-        cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         update_load_add(&cfs_rq->load, se->load.weight);
         if (!parent_entity(se))
                 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
-        if (entity_is_task(se)) {
-                add_cfs_task_weight(cfs_rq, se->load.weight);
-                list_add(&se->group_node, &cfs_rq->tasks);
-        }
+#ifdef CONFIG_SMP
+        if (entity_is_task(se))
+                list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+#endif
         cfs_rq->nr_running++;
 }
 
@@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
         update_load_sub(&cfs_rq->load, se->load.weight);
         if (!parent_entity(se))
                 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-        if (entity_is_task(se)) {
-                add_cfs_task_weight(cfs_rq, -se->load.weight);
+        if (entity_is_task(se))
                 list_del_init(&se->group_node);
-        }
         cfs_rq->nr_running--;
 }
 
@@ -3085,17 +3070,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED  0x01
-#define LBF_NEED_BREAK  0x02            /* clears into HAD_BREAK */
-#define LBF_HAD_BREAK   0x04
-#define LBF_HAD_BREAKS  0x0C            /* count HAD_BREAKs overflows into ABORT */
-#define LBF_ABORT       0x10
+#define LBF_NEED_BREAK  0x02
+#define LBF_ABORT       0x04
 
 struct lb_env {
         struct sched_domain     *sd;
 
         int                     src_cpu;
         struct rq               *src_rq;
-        struct cfs_rq           *src_cfs_rq;
 
         int                     dst_cpu;
         struct rq               *dst_rq;
@@ -3103,6 +3085,10 @@ struct lb_env {
         enum cpu_idle_type      idle;
         unsigned long           max_load_move;
         unsigned int            flags;
+
+        unsigned int            loop;
+        unsigned int            loop_break;
+        unsigned int            loop_max;
 };
 
 /*
@@ -3208,53 +3194,69 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 static int move_one_task(struct lb_env *env)
 {
         struct task_struct *p, *n;
-        struct cfs_rq *cfs_rq;
 
-        for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
-                list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
-                        if (throttled_lb_pair(task_group(p),
-                                              env->src_cpu, env->dst_cpu))
-                                break;
+        list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+                if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
+                        continue;
 
-                        if (!can_migrate_task(p, env))
-                                continue;
+                if (!can_migrate_task(p, env))
+                        continue;
 
-                        move_task(p, env);
-                        /*
-                         * Right now, this is only the second place move_task()
-                         * is called, so we can safely collect move_task()
-                         * stats here rather than inside move_task().
-                         */
-                        schedstat_inc(env->sd, lb_gained[env->idle]);
-                        return 1;
-                }
+                move_task(p, env);
+                /*
+                 * Right now, this is only the second place move_task()
+                 * is called, so we can safely collect move_task()
+                 * stats here rather than inside move_task().
+                 */
+                schedstat_inc(env->sd, lb_gained[env->idle]);
+                return 1;
         }
-
         return 0;
 }
 
+static unsigned long task_h_load(struct task_struct *p);
+
 static unsigned long balance_tasks(struct lb_env *env)
 {
-        int loops = 0, pulled = 0;
         long rem_load_move = env->max_load_move;
         struct task_struct *p, *n;
+        unsigned long load;
+        int pulled = 0;
 
         if (env->max_load_move == 0)
                 goto out;
 
-        list_for_each_entry_safe(p, n, &env->src_cfs_rq->tasks, se.group_node) {
-                if (loops++ > sysctl_sched_nr_migrate) {
+        list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+                env->loop++;
+                /* We've more or less seen every task there is, call it quits */
+                if (env->loop > env->loop_max) {
+                        env->flags |= LBF_ABORT;
+                        break;
+                }
+                /* take a breather every nr_migrate tasks */
+                if (env->loop > env->loop_break) {
+                        env->loop_break += sysctl_sched_nr_migrate;
                         env->flags |= LBF_NEED_BREAK;
                         break;
                 }
 
-                if ((p->se.load.weight >> 1) > rem_load_move ||
-                    !can_migrate_task(p, env))
-                        continue;
+                if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+                                      env->dst_cpu))
+                        goto next;
+
+                load = task_h_load(p);
+                if (load < 16 && !env->sd->nr_balance_failed)
+                        goto next;
+
+                if ((load * 2) > rem_load_move)
+                        goto next;
+
+                if (!can_migrate_task(p, env))
+                        goto next;
 
                 move_task(p, env);
                 pulled++;
-                rem_load_move -= p->se.load.weight;
+                rem_load_move -= load;
 
 #ifdef CONFIG_PREEMPT
                 /*
@@ -3274,6 +3276,10 @@ static unsigned long balance_tasks(struct lb_env *env)
                  */
                 if (rem_load_move <= 0)
                         break;
+
+                continue;
+next:
+                list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks);
         }
 out:
         /*
@@ -3363,65 +3369,33 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_h_load(long cpu)
 {
+        rcu_read_lock();
         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+        rcu_read_unlock();
 }
 
-static unsigned long load_balance_fair(struct lb_env *env)
+static unsigned long task_h_load(struct task_struct *p)
 {
-        unsigned long max_load_move = env->max_load_move;
-        long rem_load_move = env->max_load_move;
-
-        rcu_read_lock();
-        update_h_load(cpu_of(env->src_rq));
-
-        for_each_leaf_cfs_rq(env->src_rq, env->src_cfs_rq) {
-                unsigned long busiest_h_load = env->src_cfs_rq->h_load;
-                unsigned long busiest_weight = env->src_cfs_rq->load.weight;
-                u64 rem_load, moved_load;
-
-                if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
-                        break;
-
-                /*
-                 * empty group or part of a throttled hierarchy
-                 */
-                if (!env->src_cfs_rq->task_weight)
-                        continue;
-
-                if (throttled_lb_pair(env->src_cfs_rq->tg,
-                                      cpu_of(env->src_rq),
-                                      env->dst_cpu))
-                        continue;
-
-                rem_load = (u64)rem_load_move * busiest_weight;
-                rem_load = div_u64(rem_load, busiest_h_load + 1);
-
-                env->max_load_move = rem_load;
-
-                moved_load = balance_tasks(env);
-                if (!moved_load)
-                        continue;
-
-                moved_load *= busiest_h_load;
-                moved_load = div_u64(moved_load, busiest_weight + 1);
+        struct cfs_rq *cfs_rq = task_cfs_rq(p);
+        unsigned long load;
 
-                rem_load_move -= moved_load;
-                if (rem_load_move < 0)
-                        break;
-        }
-        rcu_read_unlock();
+        load = p->se.load.weight;
+        load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
 
-        return max_load_move - rem_load_move;
+        return load;
 }
 #else
 static inline void update_shares(int cpu)
 {
 }
 
-static unsigned long load_balance_fair(struct lb_env *env)
+static inline void update_h_load(long cpu)
 {
-        env->src_cfs_rq = &env->src_rq->cfs;
-        return balance_tasks(env);
+}
+
+static unsigned long task_h_load(struct task_struct *p)
+{
+        return p->se.load.weight;
 }
 #endif
 
@@ -3437,9 +3411,10 @@ static int move_tasks(struct lb_env *env)
         unsigned long max_load_move = env->max_load_move;
         unsigned long total_load_moved = 0, load_moved;
 
+        update_h_load(cpu_of(env->src_rq));
         do {
                 env->max_load_move = max_load_move - total_load_moved;
-                load_moved = load_balance_fair(env);
+                load_moved = balance_tasks(env);
                 total_load_moved += load_moved;
 
                 if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
@@ -4464,6 +4439,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                 .dst_cpu        = this_cpu,
                 .dst_rq         = this_rq,
                 .idle           = idle,
+                .loop_break     = sysctl_sched_nr_migrate,
         };
 
         cpumask_copy(cpus, cpu_active_mask);
@@ -4504,6 +4480,7 @@ redo:
                 env.max_load_move = imbalance;
                 env.src_cpu = busiest->cpu;
                 env.src_rq = busiest;
+                env.loop_max = busiest->nr_running;
 
                 local_irq_save(flags);
                 double_rq_lock(this_rq, busiest);
@@ -4521,9 +4498,7 @@ redo:
                         goto out_balanced;
 
                 if (env.flags & LBF_NEED_BREAK) {
-                        env.flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
-                        if (env.flags & LBF_ABORT)
-                                goto out_balanced;
+                        env.flags &= ~LBF_NEED_BREAK;
                         goto redo;
                 }
 
@@ -5357,7 +5332,6 @@ static void set_curr_task_fair(struct rq *rq)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
         cfs_rq->tasks_timeline = RB_ROOT;
-        INIT_LIST_HEAD(&cfs_rq->tasks);
         cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c0660a1a0cd1..753bdd567416 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -212,9 +212,6 @@ struct cfs_rq {
         struct rb_root tasks_timeline;
         struct rb_node *rb_leftmost;
 
-        struct list_head tasks;
-        struct list_head *balance_iterator;
-
         /*
          * 'curr' points to currently running entity on this cfs_rq.
          * It is set to NULL otherwise (i.e when none are currently running).
@@ -242,11 +239,6 @@ struct cfs_rq {
 
 #ifdef CONFIG_SMP
         /*
-         * the part of load.weight contributed by tasks
-         */
-        unsigned long task_weight;
-
-        /*
          * h_load = weight * f(tg)
          *
          * Where f(tg) is the recursive weight fraction assigned to
@@ -420,6 +412,8 @@ struct rq {
         int cpu;
         int online;
 
+        struct list_head cfs_tasks;
+
         u64 rt_avg;
         u64 age_stamp;
         u64 idle_stamp;