author     Kirill Tkhai <ktkhai@parallels.com>    2014-08-20 05:48:29 -0400
committer  Ingo Molnar <mingo@kernel.org>         2014-08-20 08:53:05 -0400
commit     163122b7fcfa28c0e4a838fcc8043c616746802e
tree       6e23b7cafd125e42192cdfd52e385ad0ddbb1861 /kernel/sched
parent     e5673f280501298dbb56efa46e333cf64ee5080a
sched/fair: Remove double_lock_balance() from load_balance()
Avoid double_rq_lock() in load_balance() and rely on TASK_ON_RQ_MIGRATING
instead. The advantage is (obviously) not holding two rq->locks at the
same time and thereby increasing parallelism. Further note that if there
was no task to migrate, we will not have acquired the second rq->lock at
all.

The important point is that because we acquire dst->lock immediately
after releasing src->lock, the potential wait time of task_rq_lock()
callers on TASK_ON_RQ_MIGRATING is no longer than it would have been in
the double rq lock scenario.
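
For orientation, the resulting flow in load_balance() looks roughly like
this (condensed from the fair.c hunk below; a reading aid, not a
compilable excerpt):

more_balance:
	/* Only the source runqueue lock is held here. */
	raw_spin_lock_irqsave(&busiest->lock, flags);

	/*
	 * detach_tasks() dequeues tasks from busiest, marks each one
	 * TASK_ON_RQ_MIGRATING and collects them on env.tasks.
	 */
	cur_ld_moved = detach_tasks(&env);

	/* Migrating tasks cannot be touched by task_rq_lock() callers. */
	raw_spin_unlock(&busiest->lock);

	if (cur_ld_moved) {
		attach_tasks(&env);	/* takes dst_rq->lock itself */
		ld_moved += cur_ld_moved;
	}

	local_irq_restore(flags);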
Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Turner <pjt@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1408528109.23412.94.camel@tkhai
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/fair.c | 151
1 file changed, 99 insertions(+), 52 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7e5cf051c144..d3427a8f254b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4709,7 +4709,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/*
-	 * This is possible from callers such as move_task(), in which we
+	 * This is possible from callers such as attach_tasks(), in which we
 	 * unconditionally check_prempt_curr() after an enqueue (which may have
 	 * lead to a throttle).  This both saves work and prevents false
 	 * next-buddy nomination below.
@@ -5117,21 +5117,10 @@ struct lb_env {
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
+	struct list_head	tasks;
 };
 
 /*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
- */
-static void move_task(struct task_struct *p, struct lb_env *env)
-{
-	deactivate_task(env->src_rq, p, 0);
-	set_task_cpu(p, env->dst_cpu);
-	activate_task(env->dst_rq, p, 0);
-	check_preempt_curr(env->dst_rq, p, 0);
-}
-
-/*
  * Is this task likely cache-hot:
  */
 static int task_hot(struct task_struct *p, struct lb_env *env)
@@ -5346,6 +5335,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 }
 
 /*
+ * detach_task() -- detach the task for the migration specified in env
+ */
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+	lockdep_assert_held(&env->src_rq->lock);
+
+	deactivate_task(env->src_rq, p, 0);
+	p->on_rq = TASK_ON_RQ_MIGRATING;
+	set_task_cpu(p, env->dst_cpu);
+}
+
+/*
  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
  * part of active balancing operations within "domain".
  *
@@ -5361,15 +5362,13 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 		if (!can_migrate_task(p, env))
 			continue;
 
-		deactivate_task(env->src_rq, p, 0);
-		p->on_rq = TASK_ON_RQ_MIGRATING;
-		set_task_cpu(p, env->dst_cpu);
+		detach_task(p, env);
 
 		/*
 		 * Right now, this is only the second place where
-		 * lb_gained[env->idle] is updated (other is move_tasks)
+		 * lb_gained[env->idle] is updated (other is detach_tasks)
 		 * so we can safely collect stats here rather than
-		 * inside move_tasks().
+		 * inside detach_tasks().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
 		return p;
@@ -5377,35 +5376,22 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 	return NULL;
 }
 
-/*
- * attach_one_task() -- attaches the task returned from detach_one_task() to
- * its new rq.
- */
-static void attach_one_task(struct rq *rq, struct task_struct *p)
-{
-	raw_spin_lock(&rq->lock);
-	BUG_ON(task_rq(p) != rq);
-	p->on_rq = TASK_ON_RQ_QUEUED;
-	activate_task(rq, p, 0);
-	check_preempt_curr(rq, p, 0);
-	raw_spin_unlock(&rq->lock);
-}
-
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
  *
- * Called with both runqueues locked.
+ * Returns number of detached tasks if successful and 0 otherwise.
  */
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
 	unsigned long load;
-	int pulled = 0;
+	int detached = 0;
+
+	lockdep_assert_held(&env->src_rq->lock);
 
 	if (env->imbalance <= 0)
 		return 0;
@@ -5436,14 +5422,16 @@ static int move_tasks(struct lb_env *env)
 		if ((load / 2) > env->imbalance)
 			goto next;
 
-		move_task(p, env);
-		pulled++;
+		detach_task(p, env);
+		list_add(&p->se.group_node, &env->tasks);
+
+		detached++;
 		env->imbalance -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
 		 * NEWIDLE balancing is a source of latency, so preemptible
-		 * kernels will stop after the first task is pulled to minimize
+		 * kernels will stop after the first task is detached to minimize
 		 * the critical section.
 		 */
 		if (env->idle == CPU_NEWLY_IDLE)
@@ -5463,13 +5451,58 @@ next:
 	}
 
 	/*
-	 * Right now, this is one of only two places move_task() is called,
-	 * so we can safely collect move_task() stats here rather than
-	 * inside move_task().
+	 * Right now, this is one of only two places we collect this stat
+	 * so we can safely collect detach_one_task() stats here rather
+	 * than inside detach_one_task().
 	 */
-	schedstat_add(env->sd, lb_gained[env->idle], pulled);
+	schedstat_add(env->sd, lb_gained[env->idle], detached);
 
-	return pulled;
+	return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+	lockdep_assert_held(&rq->lock);
+
+	BUG_ON(task_rq(p) != rq);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	activate_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+	raw_spin_lock(&rq->lock);
+	attach_task(rq, p);
+	raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+	struct list_head *tasks = &env->tasks;
+	struct task_struct *p;
+
+	raw_spin_lock(&env->dst_rq->lock);
+
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
+		list_del_init(&p->se.group_node);
+
+		attach_task(env->dst_rq, p);
+	}
+
+	raw_spin_unlock(&env->dst_rq->lock);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6603,6 +6636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
 		.fbq_type	= all,
+		.tasks		= LIST_HEAD_INIT(env.tasks),
 	};
 
 	/*
@@ -6652,16 +6686,29 @@ redo:
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
-		local_irq_save(flags);
-		double_rq_lock(env.dst_rq, busiest);
+		raw_spin_lock_irqsave(&busiest->lock, flags);
 
 		/*
 		 * cur_ld_moved - load moved in current iteration
 		 * ld_moved     - cumulative load moved across iterations
 		 */
-		cur_ld_moved = move_tasks(&env);
-		ld_moved += cur_ld_moved;
-		double_rq_unlock(env.dst_rq, busiest);
+		cur_ld_moved = detach_tasks(&env);
+
+		/*
+		 * We've detached some tasks from busiest_rq. Every
+		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
+		 * unlock busiest->lock, and we are able to be sure
+		 * that nobody can manipulate the tasks in parallel.
+		 * See task_rq_lock() family for the details.
+		 */
+
+		raw_spin_unlock(&busiest->lock);
+
+		if (cur_ld_moved) {
+			attach_tasks(&env);
+			ld_moved += cur_ld_moved;
+		}
+
 		local_irq_restore(flags);
 
 		/*
@@ -6797,7 +6844,7 @@ more_balance:
 		 * If we've begun active balancing, start to back off. This
 		 * case may not be covered by the all_pinned logic if there
 		 * is only 1 task on the busy runqueue (because we don't call
-		 * move_tasks).
+		 * detach_tasks).
 		 */
 		if (sd->balance_interval < sd->max_interval)
 			sd->balance_interval *= 2;
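
For reference, the task_rq_lock() family that the new comment points at
spins while a task is marked TASK_ON_RQ_MIGRATING. A simplified
paraphrase of the kernel/sched/core.c helper as it looks after this
series (not part of this patch; details may differ):

static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, *flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		/* Retry if the task moved or is in the middle of a migration. */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);

		/* Busy-wait until the migration completes, then retry. */
		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

Because load_balance() now takes dst->lock immediately after dropping
src->lock, such a spinner waits at most for the attach step, which is
the point the changelog makes above.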