Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--   kernel/sched/fair.c | 78
1 file changed, 74 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f9f9aa0edf3c..22321db64952 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3054,6 +3054,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED  0x01
 #define LBF_NEED_BREAK  0x02
+#define LBF_SOME_PINNED 0x04
 
 struct lb_env {
         struct sched_domain     *sd;
@@ -3064,6 +3065,8 @@ struct lb_env {
         int                     dst_cpu;
         struct rq               *dst_rq;
 
+        struct cpumask          *dst_grpmask;
+        int                     new_dst_cpu;
         enum cpu_idle_type      idle;
         long                    imbalance;
         unsigned int            flags;
@@ -3131,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
          * 3) are cache-hot on their current CPU.
          */
         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+                int new_dst_cpu;
+
                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+                /*
+                 * Remember if this task can be migrated to any other cpu in
+                 * our sched_group. We may want to revisit it if we couldn't
+                 * meet load balance goals by pulling other tasks on src_cpu.
+                 *
+                 * Also avoid computing new_dst_cpu if we have already
+                 * computed one in the current iteration.
+                 */
+                if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+                        return 0;
+
+                new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+                                                tsk_cpus_allowed(p));
+                if (new_dst_cpu < nr_cpu_ids) {
+                        env->flags |= LBF_SOME_PINNED;
+                        env->new_dst_cpu = new_dst_cpu;
+                }
                 return 0;
         }
+
+        /* Record that we found at least one task that could run on dst_cpu */
         env->flags &= ~LBF_ALL_PINNED;
 
         if (task_running(env->src_rq, p)) {
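The hunk above only records a fallback: when a task's affinity excludes dst_cpu but does cover another CPU in dst_cpu's sched_group, can_migrate_task() now remembers that CPU in env->new_dst_cpu and raises LBF_SOME_PINNED instead of simply failing the migration. As a rough illustration of that bookkeeping (not kernel code: plain unsigned-long bitmasks stand in for struct cpumask, and lb_env_sketch, try_migrate and first_and are invented names), a minimal user-space sketch might look like this:

/*
 * Minimal user-space sketch of the LBF_SOME_PINNED bookkeeping above.
 * NOT the kernel implementation: plain unsigned-long bitmasks stand in
 * for struct cpumask, and lb_env_sketch/try_migrate/first_and are
 * made-up names used only for illustration.
 */
#include <stdio.h>

#define LBF_ALL_PINNED  0x01
#define LBF_SOME_PINNED 0x04
#define NR_CPUS         8       /* assumed toy machine size */

struct lb_env_sketch {
        int dst_cpu;               /* CPU we are trying to pull toward */
        unsigned long dst_grpmask; /* CPUs in dst_cpu's sched_group */
        int new_dst_cpu;           /* remembered fallback destination */
        unsigned int flags;
};

/* first set bit of (a & b), or NR_CPUS if the intersection is empty */
static int first_and(unsigned long a, unsigned long b)
{
        unsigned long both = a & b;
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                if (both & (1UL << cpu))
                        return cpu;
        return NR_CPUS;
}

/* returns 1 if the task may be pulled to env->dst_cpu, 0 otherwise */
static int try_migrate(struct lb_env_sketch *env, unsigned long cpus_allowed)
{
        if (!(cpus_allowed & (1UL << env->dst_cpu))) {
                int new_dst_cpu;

                /* compute a fallback destination at most once per pass */
                if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
                        return 0;

                new_dst_cpu = first_and(env->dst_grpmask, cpus_allowed);
                if (new_dst_cpu < NR_CPUS) {
                        env->flags |= LBF_SOME_PINNED;
                        env->new_dst_cpu = new_dst_cpu;
                }
                return 0;
        }
        env->flags &= ~LBF_ALL_PINNED;
        return 1;
}

int main(void)
{
        /* dst_cpu 0, group = CPUs 0-3, task allowed only on CPUs 2-3 */
        struct lb_env_sketch env = {
                .dst_cpu = 0, .dst_grpmask = 0x0f, .flags = LBF_ALL_PINNED,
        };

        if (!try_migrate(&env, 0x0c) && (env.flags & LBF_SOME_PINNED))
                printf("pinned away from CPU %d, retry with CPU %d\n",
                       env.dst_cpu, env.new_dst_cpu);
        return 0;
}

Compiled and run, the sketch reports CPU 2 as the remembered fallback for a task pinned to CPUs 2-3, which is exactly the state load_balance() later consumes when it retries the pull.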
@@ -4213,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum cpu_idle_type idle,
                         int *balance)
 {
-        int ld_moved, active_balance = 0;
+        int ld_moved, cur_ld_moved, active_balance = 0;
+        int lb_iterations, max_lb_iterations;
         struct sched_group *group;
         struct rq *busiest;
         unsigned long flags;
@@ -4223,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                 .sd             = sd,
                 .dst_cpu        = this_cpu,
                 .dst_rq         = this_rq,
+                .dst_grpmask    = sched_group_cpus(sd->groups),
                 .idle           = idle,
                 .loop_break     = sched_nr_migrate_break,
         };
 
         cpumask_copy(cpus, cpu_active_mask);
+        max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
         schedstat_inc(sd, lb_count[idle]);
 
@@ -4253,6 +4281,7 @@ redo:
         schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
         ld_moved = 0;
+        lb_iterations = 1;
         if (busiest->nr_running > 1) {
                 /*
                  * Attempt to move tasks. If find_busiest_group has found
@@ -4270,7 +4299,13 @@ more_balance:
                 double_rq_lock(this_rq, busiest);
                 if (!env.loop)
                         update_h_load(env.src_cpu);
-                ld_moved += move_tasks(&env);
+
+                /*
+                 * cur_ld_moved - load moved in current iteration
+                 * ld_moved     - cumulative load moved across iterations
+                 */
+                cur_ld_moved = move_tasks(&env);
+                ld_moved += cur_ld_moved;
                 double_rq_unlock(this_rq, busiest);
                 local_irq_restore(flags);
 
@@ -4282,8 +4317,43 @@ more_balance:
                 /*
                  * some other cpu did the load balance for us.
                  */
-                if (ld_moved && this_cpu != smp_processor_id())
-                        resched_cpu(this_cpu);
+                if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+                        resched_cpu(env.dst_cpu);
+
+                /*
+                 * Revisit (affine) tasks on src_cpu that couldn't be moved to
+                 * us and move them to an alternate dst_cpu in our sched_group
+                 * where they can run. The upper limit on how many times we
+                 * iterate on the same src_cpu depends on the number of CPUs
+                 * in our sched_group.
+                 *
+                 * This changes load balance semantics a bit on who can move
+                 * load to a given_cpu. In addition to the given_cpu itself
+                 * (or an ilb_cpu acting on its behalf where given_cpu is
+                 * nohz-idle), we now have balance_cpu in a position to move
+                 * load to given_cpu. In rare situations, this may cause
+                 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+                 * _independently_ and at the _same_ time to move some load
+                 * to given_cpu), causing excess load to be moved to
+                 * given_cpu. This, however, should rarely happen in practice,
+                 * and moreover subsequent load balance cycles should correct
+                 * the excess load moved.
+                 */
+                if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+                                lb_iterations++ < max_lb_iterations) {
+
+                        this_rq          = cpu_rq(env.new_dst_cpu);
+                        env.dst_rq       = this_rq;
+                        env.dst_cpu      = env.new_dst_cpu;
+                        env.flags       &= ~LBF_SOME_PINNED;
+                        env.loop         = 0;
+                        env.loop_break   = sched_nr_migrate_break;
+                        /*
+                         * Go back to "more_balance" rather than "redo" since
+                         * we need to continue with the same src_cpu.
+                         */
+                        goto more_balance;
+                }
 
                 /* All tasks on this runqueue were pinned by CPU affinity */
                 if (unlikely(env.flags & LBF_ALL_PINNED)) {
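Taken together, the load_balance() changes turn the single pull attempt into a bounded retry loop: if a pass leaves imbalance behind and LBF_SOME_PINNED was set, the environment is retargeted at env.new_dst_cpu and the same src_cpu is tried again, at most cpumask_weight(env.dst_grpmask) times. The following self-contained sketch models only that control flow; pull_tasks(), env_sketch and the load numbers are illustrative stand-ins, not the kernel's move_tasks() or real load values:

/*
 * Rough model of the bounded retry loop load_balance() gains in this
 * patch; control flow only. pull_tasks(), env_sketch and the numbers
 * below are illustrative stand-ins, not the kernel's move_tasks() or
 * real load values.
 */
#include <stdio.h>

#define LBF_SOME_PINNED 0x04

struct env_sketch {
        int dst_cpu;
        int new_dst_cpu;
        long imbalance;
        unsigned int flags;
};

/*
 * Stand-in for move_tasks(): pretend the only movable task is pinned to
 * CPU 2, so pulling toward any other CPU moves nothing but records CPU 2
 * as a usable fallback inside the destination's sched_group.
 */
static long pull_tasks(struct env_sketch *env)
{
        if (env->dst_cpu == 2) {
                long moved = env->imbalance;    /* CPU 2 can take it all */

                env->imbalance = 0;
                return moved;
        }
        env->flags |= LBF_SOME_PINNED;
        env->new_dst_cpu = 2;
        return 0;
}

int main(void)
{
        struct env_sketch env = { .dst_cpu = 0, .imbalance = 30 };
        int max_lb_iterations = 4;  /* models cpumask_weight(dst_grpmask) */
        int lb_iterations = 1;
        long ld_moved = 0;

more_balance:
        ld_moved += pull_tasks(&env);

        if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
                        lb_iterations++ < max_lb_iterations) {
                /* retarget the same src_cpu at the remembered fallback CPU */
                env.dst_cpu = env.new_dst_cpu;
                env.flags &= ~LBF_SOME_PINNED;
                goto more_balance;
        }

        printf("moved %ld in %d iteration(s), imbalance left %ld\n",
               ld_moved, lb_iterations, env.imbalance);
        return 0;
}

With an imbalance of 30 and the only movable task pinned to CPU 2, the first pass moves nothing but remembers CPU 2, and the retry clears the imbalance on the second iteration, which is the scenario the new comment in the patch describes.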