aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorSrivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>2012-06-19 08:13:15 -0400
committerIngo Molnar <mingo@kernel.org>2012-07-24 07:58:06 -0400
commit88b8dac0a14c511ff41486b83a8c3d688936eec0 (patch)
treee98b2c8b1d47b136725f1a2862802e7db0bba927 /kernel
parentbbf18b19495942cc730e8ff11fc3ffadf20cbfe1 (diff)
sched: Improve balance_cpu() to consider other cpus in its group as target of (pinned) task
Current load balance scheme requires only one cpu in a sched_group (balance_cpu) to look at other peer sched_groups for imbalance and pull tasks towards itself from a busy cpu. Tasks thus pulled by balance_cpu could later get picked up by cpus that are in the same sched_group as that of balance_cpu.

This scheme however fails to pull tasks that are not allowed to run on balance_cpu (but are allowed to run on other cpus in its sched_group). That can affect fairness and in some worst case scenarios cause starvation.

Consider a two core (2 threads/core) system running tasks as below:

          Core0            Core1
         /     \          /     \
        C0     C1        C2     C3
        |      |         |      |
        v      v         v      v
        F0     T1        F1     [idle]
                         T2

 F0 = SCHED_FIFO task (pinned to C0)
 F1 = SCHED_FIFO task (pinned to C2)
 T1 = SCHED_OTHER task (pinned to C1)
 T2 = SCHED_OTHER task (pinned to C1 and C2)

F1 could become a cpu hog, which will starve T2 unless C1 pulls it. Between C0 and C1 however, C0 is required to look for imbalance between cores, which will fail to pull T2 towards Core0. T2 will starve eternally in this case. The same scenario can arise in presence of non-rt tasks as well (say we replace F1 with high irq load).

We tackle this problem by having balance_cpu move pinned tasks to one of its sibling cpus (where they can run). We first check if load balance goal can be met by ignoring pinned tasks, failing which we retry move_tasks() with a new env->dst_cpu.

This patch modifies load balance semantics on who can move load towards a given cpu in a given sched_domain. Before this patch, a given_cpu or an ilb_cpu acting on behalf of an idle given_cpu is responsible for moving load to given_cpu. With this patch applied, balance_cpu can in addition decide on moving some load to a given_cpu.

There is a remote possibility that excess load could get moved as a result of this (balance_cpu and given_cpu/ilb_cpu deciding *independently* and at *same* time to move some load to a given_cpu). However we should see less of such conflicting decisions in practice and moreover subsequent load balance cycles should correct the excess load moved to given_cpu.

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Prashanth Nageshappa <prashanth@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/4FE06CDB.2060605@linux.vnet.ibm.com
[ minor edits ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/fair.c78
1 files changed, 74 insertions, 4 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f9f9aa0edf3c..22321db64952 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3054,6 +3054,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3054 3054
3055#define LBF_ALL_PINNED 0x01 3055#define LBF_ALL_PINNED 0x01
3056#define LBF_NEED_BREAK 0x02 3056#define LBF_NEED_BREAK 0x02
3057#define LBF_SOME_PINNED 0x04
3057 3058
3058struct lb_env { 3059struct lb_env {
3059 struct sched_domain *sd; 3060 struct sched_domain *sd;
@@ -3064,6 +3065,8 @@ struct lb_env {
3064 int dst_cpu; 3065 int dst_cpu;
3065 struct rq *dst_rq; 3066 struct rq *dst_rq;
3066 3067
3068 struct cpumask *dst_grpmask;
3069 int new_dst_cpu;
3067 enum cpu_idle_type idle; 3070 enum cpu_idle_type idle;
3068 long imbalance; 3071 long imbalance;
3069 unsigned int flags; 3072 unsigned int flags;
@@ -3131,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3131 * 3) are cache-hot on their current CPU. 3134 * 3) are cache-hot on their current CPU.
3132 */ 3135 */
3133 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3136 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3137 int new_dst_cpu;
3138
3134 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3139 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3140
3141 /*
3142 * Remember if this task can be migrated to any other cpu in
3143 * our sched_group. We may want to revisit it if we couldn't
3144 * meet load balance goals by pulling other tasks on src_cpu.
3145 *
3146 * Also avoid computing new_dst_cpu if we have already computed
3147 * one in current iteration.
3148 */
3149 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3150 return 0;
3151
3152 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3153 tsk_cpus_allowed(p));
3154 if (new_dst_cpu < nr_cpu_ids) {
3155 env->flags |= LBF_SOME_PINNED;
3156 env->new_dst_cpu = new_dst_cpu;
3157 }
3135 return 0; 3158 return 0;
3136 } 3159 }
3160
3161 /* Record that we found at least one task that could run on dst_cpu */
3137 env->flags &= ~LBF_ALL_PINNED; 3162 env->flags &= ~LBF_ALL_PINNED;
3138 3163
3139 if (task_running(env->src_rq, p)) { 3164 if (task_running(env->src_rq, p)) {
@@ -4213,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4213 struct sched_domain *sd, enum cpu_idle_type idle, 4238 struct sched_domain *sd, enum cpu_idle_type idle,
4214 int *balance) 4239 int *balance)
4215{ 4240{
4216 int ld_moved, active_balance = 0; 4241 int ld_moved, cur_ld_moved, active_balance = 0;
4242 int lb_iterations, max_lb_iterations;
4217 struct sched_group *group; 4243 struct sched_group *group;
4218 struct rq *busiest; 4244 struct rq *busiest;
4219 unsigned long flags; 4245 unsigned long flags;
@@ -4223,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4223 .sd = sd, 4249 .sd = sd,
4224 .dst_cpu = this_cpu, 4250 .dst_cpu = this_cpu,
4225 .dst_rq = this_rq, 4251 .dst_rq = this_rq,
4252 .dst_grpmask = sched_group_cpus(sd->groups),
4226 .idle = idle, 4253 .idle = idle,
4227 .loop_break = sched_nr_migrate_break, 4254 .loop_break = sched_nr_migrate_break,
4228 }; 4255 };
4229 4256
4230 cpumask_copy(cpus, cpu_active_mask); 4257 cpumask_copy(cpus, cpu_active_mask);
4258 max_lb_iterations = cpumask_weight(env.dst_grpmask);
4231 4259
4232 schedstat_inc(sd, lb_count[idle]); 4260 schedstat_inc(sd, lb_count[idle]);
4233 4261
@@ -4253,6 +4281,7 @@ redo:
4253 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 4281 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4254 4282
4255 ld_moved = 0; 4283 ld_moved = 0;
4284 lb_iterations = 1;
4256 if (busiest->nr_running > 1) { 4285 if (busiest->nr_running > 1) {
4257 /* 4286 /*
4258 * Attempt to move tasks. If find_busiest_group has found 4287 * Attempt to move tasks. If find_busiest_group has found
@@ -4270,7 +4299,13 @@ more_balance:
4270 double_rq_lock(this_rq, busiest); 4299 double_rq_lock(this_rq, busiest);
4271 if (!env.loop) 4300 if (!env.loop)
4272 update_h_load(env.src_cpu); 4301 update_h_load(env.src_cpu);
4273 ld_moved += move_tasks(&env); 4302
4303 /*
4304 * cur_ld_moved - load moved in current iteration
4305 * ld_moved - cumulative load moved across iterations
4306 */
4307 cur_ld_moved = move_tasks(&env);
4308 ld_moved += cur_ld_moved;
4274 double_rq_unlock(this_rq, busiest); 4309 double_rq_unlock(this_rq, busiest);
4275 local_irq_restore(flags); 4310 local_irq_restore(flags);
4276 4311
@@ -4282,8 +4317,43 @@ more_balance:
4282 /* 4317 /*
4283 * some other cpu did the load balance for us. 4318 * some other cpu did the load balance for us.
4284 */ 4319 */
4285 if (ld_moved && this_cpu != smp_processor_id()) 4320 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
4286 resched_cpu(this_cpu); 4321 resched_cpu(env.dst_cpu);
4322
4323 /*
4324 * Revisit (affine) tasks on src_cpu that couldn't be moved to
4325 * us and move them to an alternate dst_cpu in our sched_group
4326 * where they can run. The upper limit on how many times we
4327 * iterate on same src_cpu is dependent on number of cpus in our
4328 * sched_group.
4329 *
4330 * This changes load balance semantics a bit on who can move
4331 * load to a given_cpu. In addition to the given_cpu itself
4332 * (or a ilb_cpu acting on its behalf where given_cpu is
4333 * nohz-idle), we now have balance_cpu in a position to move
4334 * load to given_cpu. In rare situations, this may cause
4335 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
4336 * _independently_ and at _same_ time to move some load to
4337 * given_cpu) causing excess load to be moved to given_cpu.
4338 * This however should not happen so much in practice and
4339 * moreover subsequent load balance cycles should correct the
4340 * excess load moved.
4341 */
4342 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4343 lb_iterations++ < max_lb_iterations) {
4344
4345 this_rq = cpu_rq(env.new_dst_cpu);
4346 env.dst_rq = this_rq;
4347 env.dst_cpu = env.new_dst_cpu;
4348 env.flags &= ~LBF_SOME_PINNED;
4349 env.loop = 0;
4350 env.loop_break = sched_nr_migrate_break;
4351 /*
4352 * Go back to "more_balance" rather than "redo" since we
4353 * need to continue with same src_cpu.
4354 */
4355 goto more_balance;
4356 }
4287 4357
4288 /* All tasks on this runqueue were pinned by CPU affinity */ 4358 /* All tasks on this runqueue were pinned by CPU affinity */
4289 if (unlikely(env.flags & LBF_ALL_PINNED)) { 4359 if (unlikely(env.flags & LBF_ALL_PINNED)) {