author     Kirill Tkhai <ktkhai@parallels.com>    2014-08-20 05:48:29 -0400
committer  Ingo Molnar <mingo@kernel.org>         2014-08-20 08:53:05 -0400
commit     163122b7fcfa28c0e4a838fcc8043c616746802e
tree       6e23b7cafd125e42192cdfd52e385ad0ddbb1861 /kernel/sched
parent     e5673f280501298dbb56efa46e333cf64ee5080a
sched/fair: Remove double_lock_balance() from load_balance()
Avoid double_rq_lock() in load_balance() and rely on TASK_ON_RQ_MIGRATING
instead. The advantage is (obviously) not holding two rq->locks at the
same time and thereby increasing parallelism. Further note that if there
was no task to migrate, we will not have acquired the second rq->lock at
all.

The important point is that because we acquire dst->lock immediately
after releasing src->lock, the potential wait time of task_rq_lock()
callers on TASK_ON_RQ_MIGRATING is no longer than it would have been in
the double rq lock scenario.
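
For orientation, the resulting flow in load_balance() looks roughly like
this (condensed from the fair.c hunk below; a reading aid, not a
compilable excerpt):

more_balance:
	/* Only the source runqueue lock is held here. */
	raw_spin_lock_irqsave(&busiest->lock, flags);

	/*
	 * detach_tasks() dequeues tasks from busiest, marks each one
	 * TASK_ON_RQ_MIGRATING and collects them on env.tasks.
	 */
	cur_ld_moved = detach_tasks(&env);

	/* Migrating tasks cannot be touched by task_rq_lock() callers. */
	raw_spin_unlock(&busiest->lock);

	if (cur_ld_moved) {
		attach_tasks(&env);	/* takes dst_rq->lock itself */
		ld_moved += cur_ld_moved;
	}

	local_irq_restore(flags);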
Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Turner <pjt@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1408528109.23412.94.camel@tkhai
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/fair.c | 151
1 file changed, 99 insertions(+), 52 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7e5cf051c144..d3427a8f254b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4709,7 +4709,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/*
-	 * This is possible from callers such as move_task(), in which we
+	 * This is possible from callers such as attach_tasks(), in which we
 	 * unconditionally check_prempt_curr() after an enqueue (which may have
 	 * lead to a throttle).  This both saves work and prevents false
 	 * next-buddy nomination below.
@@ -5117,21 +5117,10 @@ struct lb_env {
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
+	struct list_head	tasks;
 };
 
 /*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
- */
-static void move_task(struct task_struct *p, struct lb_env *env)
-{
-	deactivate_task(env->src_rq, p, 0);
-	set_task_cpu(p, env->dst_cpu);
-	activate_task(env->dst_rq, p, 0);
-	check_preempt_curr(env->dst_rq, p, 0);
-}
-
-/*
  * Is this task likely cache-hot:
  */
 static int task_hot(struct task_struct *p, struct lb_env *env)
@@ -5346,6 +5335,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 }
 
 /*
+ * detach_task() -- detach the task for the migration specified in env
+ */
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+	lockdep_assert_held(&env->src_rq->lock);
+
+	deactivate_task(env->src_rq, p, 0);
+	p->on_rq = TASK_ON_RQ_MIGRATING;
+	set_task_cpu(p, env->dst_cpu);
+}
+
+/*
  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
  * part of active balancing operations within "domain".
  *
@@ -5361,15 +5362,13 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 		if (!can_migrate_task(p, env))
 			continue;
 
-		deactivate_task(env->src_rq, p, 0);
-		p->on_rq = TASK_ON_RQ_MIGRATING;
-		set_task_cpu(p, env->dst_cpu);
+		detach_task(p, env);
 
 		/*
 		 * Right now, this is only the second place where
-		 * lb_gained[env->idle] is updated (other is move_tasks)
+		 * lb_gained[env->idle] is updated (other is detach_tasks)
 		 * so we can safely collect stats here rather than
-		 * inside move_tasks().
+		 * inside detach_tasks().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
 		return p;
@@ -5377,35 +5376,22 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 	return NULL;
 }
 
-/*
- * attach_one_task() -- attaches the task returned from detach_one_task() to
- * its new rq.
- */
-static void attach_one_task(struct rq *rq, struct task_struct *p)
-{
-	raw_spin_lock(&rq->lock);
-	BUG_ON(task_rq(p) != rq);
-	p->on_rq = TASK_ON_RQ_QUEUED;
-	activate_task(rq, p, 0);
-	check_preempt_curr(rq, p, 0);
-	raw_spin_unlock(&rq->lock);
-}
-
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
  *
- * Called with both runqueues locked.
+ * Returns number of detached tasks if successful and 0 otherwise.
  */
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
 	unsigned long load;
-	int pulled = 0;
+	int detached = 0;
+
+	lockdep_assert_held(&env->src_rq->lock);
 
 	if (env->imbalance <= 0)
 		return 0;
@@ -5436,14 +5422,16 @@ static int move_tasks(struct lb_env *env)
 		if ((load / 2) > env->imbalance)
 			goto next;
 
-		move_task(p, env);
-		pulled++;
+		detach_task(p, env);
+		list_add(&p->se.group_node, &env->tasks);
+
+		detached++;
 		env->imbalance -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
 		 * NEWIDLE balancing is a source of latency, so preemptible
-		 * kernels will stop after the first task is pulled to minimize
+		 * kernels will stop after the first task is detached to minimize
 		 * the critical section.
 		 */
 		if (env->idle == CPU_NEWLY_IDLE)
@@ -5463,13 +5451,58 @@ next:
 	}
 
 	/*
-	 * Right now, this is one of only two places move_task() is called,
-	 * so we can safely collect move_task() stats here rather than
-	 * inside move_task().
+	 * Right now, this is one of only two places we collect this stat
+	 * so we can safely collect detach_one_task() stats here rather
+	 * than inside detach_one_task().
 	 */
-	schedstat_add(env->sd, lb_gained[env->idle], pulled);
+	schedstat_add(env->sd, lb_gained[env->idle], detached);
 
-	return pulled;
+	return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+	lockdep_assert_held(&rq->lock);
+
+	BUG_ON(task_rq(p) != rq);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	activate_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+	raw_spin_lock(&rq->lock);
+	attach_task(rq, p);
+	raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+	struct list_head *tasks = &env->tasks;
+	struct task_struct *p;
+
+	raw_spin_lock(&env->dst_rq->lock);
+
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
+		list_del_init(&p->se.group_node);
+
+		attach_task(env->dst_rq, p);
+	}
+
+	raw_spin_unlock(&env->dst_rq->lock);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6603,6 +6636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
 		.fbq_type	= all,
+		.tasks		= LIST_HEAD_INIT(env.tasks),
 	};
 
 	/*
@@ -6652,16 +6686,29 @@ redo:
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
-		local_irq_save(flags);
-		double_rq_lock(env.dst_rq, busiest);
+		raw_spin_lock_irqsave(&busiest->lock, flags);
 
 		/*
 		 * cur_ld_moved - load moved in current iteration
 		 * ld_moved     - cumulative load moved across iterations
 		 */
-		cur_ld_moved = move_tasks(&env);
-		ld_moved += cur_ld_moved;
-		double_rq_unlock(env.dst_rq, busiest);
+		cur_ld_moved = detach_tasks(&env);
+
+		/*
+		 * We've detached some tasks from busiest_rq. Every
+		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
+		 * unlock busiest->lock, and we are able to be sure
+		 * that nobody can manipulate the tasks in parallel.
+		 * See task_rq_lock() family for the details.
+		 */
+
+		raw_spin_unlock(&busiest->lock);
+
+		if (cur_ld_moved) {
+			attach_tasks(&env);
+			ld_moved += cur_ld_moved;
+		}
+
 		local_irq_restore(flags);
 
 		/*
@@ -6797,7 +6844,7 @@ more_balance:
 		 * If we've begun active balancing, start to back off. This
 		 * case may not be covered by the all_pinned logic if there
 		 * is only 1 task on the busy runqueue (because we don't call
-		 * move_tasks).
+		 * detach_tasks).
 		 */
 		if (sd->balance_interval < sd->max_interval)
 			sd->balance_interval *= 2;
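
For reference, the task_rq_lock() family that the new comment points at
spins while a task is marked TASK_ON_RQ_MIGRATING. A simplified
paraphrase of the kernel/sched/core.c helper as it looks after this
series (not part of this patch; details may differ):

static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, *flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		/* Retry if the task moved or is in the middle of a migration. */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);

		/* Busy-wait until the migration completes, then retry. */
		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

Because load_balance() now takes dst->lock immediately after dropping
src->lock, such a spinner waits at most for the attach step, which is
the point the changelog makes above.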