path: root/kernel/sched
author     Kirill Tkhai <ktkhai@parallels.com>    2014-08-20 05:48:29 -0400
committer  Ingo Molnar <mingo@kernel.org>         2014-08-20 08:53:05 -0400
commit     163122b7fcfa28c0e4a838fcc8043c616746802e (patch)
tree       6e23b7cafd125e42192cdfd52e385ad0ddbb1861 /kernel/sched
parent     e5673f280501298dbb56efa46e333cf64ee5080a (diff)
sched/fair: Remove double_lock_balance() from load_balance()
Avoid double_rq_lock() and use TASK_ON_RQ_MIGRATING for load_balance().

The advantage is (obviously) not holding two rq->lock's at the same time
and thereby increasing parallelism.

Further note that if there was no task to migrate we will not have acquired
the second rq->lock at all.

The important point to note is that because we acquire dst->lock immediately
after releasing src->lock the potential wait time of task_rq_lock() callers
on TASK_ON_RQ_MIGRATING is not longer than it would have been in the double
rq lock scenario.

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Turner <pjt@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1408528109.23412.94.camel@tkhai
Signed-off-by: Ingo Molnar <mingo@kernel.org>
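[Editor's note] A minimal userspace sketch of the locking pattern this patch introduces: tasks are detached under the source runqueue's lock and flagged as migrating, the source lock is dropped, and only then is the destination lock taken to attach them, so the two locks are never held together. Every name below (struct rq, struct task, detach_tasks(), attach_tasks(), the migrating flag) is an illustrative stand-in, not the kernel's actual definition.

/* Build with: cc -pthread sketch.c */
#include <pthread.h>
#include <stdio.h>

struct task {
        int id;
        int migrating;          /* stands in for TASK_ON_RQ_MIGRATING */
        struct task *next;
};

struct rq {
        pthread_mutex_t lock;
        struct task *head;      /* runnable tasks */
};

/* Pop up to @nr tasks from @src onto @list; caller holds src->lock. */
static int detach_tasks(struct rq *src, struct task **list, int nr)
{
        int detached = 0;

        while (src->head && detached < nr) {
                struct task *p = src->head;

                src->head = p->next;
                p->migrating = 1;       /* visible to anyone who looks at p */
                p->next = *list;
                *list = p;
                detached++;
        }
        return detached;
}

/* Push every task on @list onto @dst; caller holds dst->lock. */
static void attach_tasks(struct rq *dst, struct task *list)
{
        while (list) {
                struct task *p = list;

                list = p->next;
                p->migrating = 0;
                p->next = dst->head;
                dst->head = p;
        }
}

static void load_balance(struct rq *src, struct rq *dst)
{
        struct task *detached = NULL;
        int moved;

        pthread_mutex_lock(&src->lock);
        moved = detach_tasks(src, &detached, 2);
        pthread_mutex_unlock(&src->lock);       /* drop src lock first */

        if (moved) {
                pthread_mutex_lock(&dst->lock); /* only now take dst lock */
                attach_tasks(dst, detached);
                pthread_mutex_unlock(&dst->lock);
        }
}

int main(void)
{
        struct rq src = { .lock = PTHREAD_MUTEX_INITIALIZER };
        struct rq dst = { .lock = PTHREAD_MUTEX_INITIALIZER };
        struct task t[3];

        for (int i = 0; i < 3; i++) {
                t[i] = (struct task){ .id = i, .next = src.head };
                src.head = &t[i];
        }

        load_balance(&src, &dst);

        for (struct task *p = dst.head; p; p = p->next)
                printf("task %d now on dst\n", p->id);
        return 0;
}

As in the patch, if detach_tasks() finds nothing to move, the destination lock is never taken at all, and the destination lock is acquired immediately after the source lock is released, so waiters see the migrating flag for only a short window.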
Diffstat (limited to 'kernel/sched')
-rw-r--r--    kernel/sched/fair.c    151
1 file changed, 99 insertions, 52 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7e5cf051c144..d3427a8f254b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4709,7 +4709,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/*
-	 * This is possible from callers such as move_task(), in which we
+	 * This is possible from callers such as attach_tasks(), in which we
 	 * unconditionally check_prempt_curr() after an enqueue (which may have
 	 * lead to a throttle). This both saves work and prevents false
 	 * next-buddy nomination below.
@@ -5117,21 +5117,10 @@ struct lb_env {
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
+	struct list_head	tasks;
 };
 
 /*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
- */
-static void move_task(struct task_struct *p, struct lb_env *env)
-{
-	deactivate_task(env->src_rq, p, 0);
-	set_task_cpu(p, env->dst_cpu);
-	activate_task(env->dst_rq, p, 0);
-	check_preempt_curr(env->dst_rq, p, 0);
-}
-
-/*
  * Is this task likely cache-hot:
  */
 static int task_hot(struct task_struct *p, struct lb_env *env)
@@ -5346,6 +5335,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 }
 
 /*
+ * detach_task() -- detach the task for the migration specified in env
+ */
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+	lockdep_assert_held(&env->src_rq->lock);
+
+	deactivate_task(env->src_rq, p, 0);
+	p->on_rq = TASK_ON_RQ_MIGRATING;
+	set_task_cpu(p, env->dst_cpu);
+}
+
+/*
  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
  * part of active balancing operations within "domain".
  *
@@ -5361,15 +5362,13 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 		if (!can_migrate_task(p, env))
 			continue;
 
-		deactivate_task(env->src_rq, p, 0);
-		p->on_rq = TASK_ON_RQ_MIGRATING;
-		set_task_cpu(p, env->dst_cpu);
+		detach_task(p, env);
 
 		/*
 		 * Right now, this is only the second place where
-		 * lb_gained[env->idle] is updated (other is move_tasks)
+		 * lb_gained[env->idle] is updated (other is detach_tasks)
 		 * so we can safely collect stats here rather than
-		 * inside move_tasks().
+		 * inside detach_tasks().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
 		return p;
@@ -5377,35 +5376,22 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 	return NULL;
 }
 
-/*
- * attach_one_task() -- attaches the task returned from detach_one_task() to
- * its new rq.
- */
-static void attach_one_task(struct rq *rq, struct task_struct *p)
-{
-	raw_spin_lock(&rq->lock);
-	BUG_ON(task_rq(p) != rq);
-	p->on_rq = TASK_ON_RQ_QUEUED;
-	activate_task(rq, p, 0);
-	check_preempt_curr(rq, p, 0);
-	raw_spin_unlock(&rq->lock);
-}
-
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
  *
- * Called with both runqueues locked.
+ * Returns number of detached tasks if successful and 0 otherwise.
  */
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
 	unsigned long load;
-	int pulled = 0;
+	int detached = 0;
+
+	lockdep_assert_held(&env->src_rq->lock);
 
 	if (env->imbalance <= 0)
 		return 0;
@@ -5436,14 +5422,16 @@ static int move_tasks(struct lb_env *env)
 		if ((load / 2) > env->imbalance)
 			goto next;
 
-		move_task(p, env);
-		pulled++;
+		detach_task(p, env);
+		list_add(&p->se.group_node, &env->tasks);
+
+		detached++;
 		env->imbalance -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
 		 * NEWIDLE balancing is a source of latency, so preemptible
-		 * kernels will stop after the first task is pulled to minimize
+		 * kernels will stop after the first task is detached to minimize
 		 * the critical section.
 		 */
 		if (env->idle == CPU_NEWLY_IDLE)
@@ -5463,13 +5451,58 @@ next:
 	}
 
 	/*
-	 * Right now, this is one of only two places move_task() is called,
-	 * so we can safely collect move_task() stats here rather than
-	 * inside move_task().
+	 * Right now, this is one of only two places we collect this stat
+	 * so we can safely collect detach_one_task() stats here rather
+	 * than inside detach_one_task().
 	 */
-	schedstat_add(env->sd, lb_gained[env->idle], pulled);
+	schedstat_add(env->sd, lb_gained[env->idle], detached);
 
-	return pulled;
+	return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+	lockdep_assert_held(&rq->lock);
+
+	BUG_ON(task_rq(p) != rq);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	activate_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+	raw_spin_lock(&rq->lock);
+	attach_task(rq, p);
+	raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+	struct list_head *tasks = &env->tasks;
+	struct task_struct *p;
+
+	raw_spin_lock(&env->dst_rq->lock);
+
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
+		list_del_init(&p->se.group_node);
+
+		attach_task(env->dst_rq, p);
+	}
+
+	raw_spin_unlock(&env->dst_rq->lock);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6603,6 +6636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
 		.fbq_type	= all,
+		.tasks		= LIST_HEAD_INIT(env.tasks),
 	};
 
 	/*
@@ -6652,16 +6686,29 @@ redo:
 		env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
-		local_irq_save(flags);
-		double_rq_lock(env.dst_rq, busiest);
+		raw_spin_lock_irqsave(&busiest->lock, flags);
 
 		/*
 		 * cur_ld_moved - load moved in current iteration
 		 * ld_moved     - cumulative load moved across iterations
 		 */
-		cur_ld_moved = move_tasks(&env);
-		ld_moved += cur_ld_moved;
-		double_rq_unlock(env.dst_rq, busiest);
+		cur_ld_moved = detach_tasks(&env);
+
+		/*
+		 * We've detached some tasks from busiest_rq. Every
+		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
+		 * unlock busiest->lock, and we are able to be sure
+		 * that nobody can manipulate the tasks in parallel.
+		 * See task_rq_lock() family for the details.
+		 */
+
+		raw_spin_unlock(&busiest->lock);
+
+		if (cur_ld_moved) {
+			attach_tasks(&env);
+			ld_moved += cur_ld_moved;
+		}
+
 		local_irq_restore(flags);
 
 		/*
@@ -6797,7 +6844,7 @@ more_balance:
 		 * If we've begun active balancing, start to back off. This
 		 * case may not be covered by the all_pinned logic if there
 		 * is only 1 task on the busy runqueue (because we don't call
-		 * move_tasks).
+		 * detach_tasks).
 		 */
 		if (sd->balance_interval < sd->max_interval)
 			sd->balance_interval *= 2;