author	Ken Chen <kenchen@google.com>	2011-04-08 15:20:16 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-04-11 05:08:54 -0400
commit	b30aef17f71cf9e24b10c11cbb5e5f0ebe8a85ab (patch)
tree	0e5e0e0d1d93edf934f58b116ee0c4e81db01e72 /kernel
parent	b0432d8f162c7d5d9537b4cb749d44076b76a783 (diff)
sched: Fix erroneous all_pinned logic
The scheduler load balancer has specific code to deal with cases where the system is unbalanced due to lots of unmovable tasks (for example because of hard CPU affinity). In those situations, it excludes the busiest CPU with pinned tasks from load-balance consideration so that it can perform a second load-balance pass over the rest of the system.

This works as designed when there is only one cgroup in the system. However, with multiple cgroups this logic produces false positives and triggers multiple load-balance passes even though there are actually no pinned tasks at all.

The reason for the false positives is that the all-pinned logic sits deep in the lowest-level function, can_migrate_task(), and is too low level: load_balance_fair() iterates over each task group and calls balance_tasks() to migrate the target load. Along the way, balance_tasks() also sets an all_pinned variable. Since the task groups are iterated in turn, this variable ends up reflecting only the status of the last group scanned. A task group can fail to migrate any load for a number of reasons, none of them due to CPU affinity. Nevertheless, this status bit is propagated back up to the higher-level load_balance(), which incorrectly concludes that no tasks could be moved. It then kicks off the all-pinned logic and starts multiple passes attempting to move load onto the puller CPU.

To fix this, move the all_pinned aggregation up to the iterator level. This ensures that the status is aggregated over all task groups, not just the last one in the list.

Signed-off-by: Ken Chen <kenchen@google.com>
Cc: stable@kernel.org
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/BANLkTi=ernzNawaR5tJZEsV_QVnfxqXmsQ@mail.gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
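To see the false positive in miniature, consider a caller-level flag that each per-group pass either resets or merely clears. The following standalone sketch (hypothetical names; not the kernel code itself) shows how resetting the flag inside every group pass makes the caller see only the last group's verdict, while initializing it once at the caller and only ever clearing it aggregates correctly across all groups:

#include <stdio.h>

/* Hypothetical stand-in for one task group's balance pass.
 * has_movable models "this group had at least one non-pinned task".
 */
static void balance_group_buggy(int has_movable, int *all_pinned)
{
	int pinned = 1;			/* reset per group: the bug */

	if (has_movable)
		pinned = 0;		/* something could be migrated */

	*all_pinned = pinned;		/* overwrites earlier groups' status */
}

static void balance_group_fixed(int has_movable, int *all_pinned)
{
	if (has_movable)
		*all_pinned = 0;	/* only ever cleared, never re-set */
}

int main(void)
{
	/* Group 0 has movable tasks; group 1, scanned last, has none. */
	int movable[2] = { 1, 0 };
	int pinned_buggy = 1;
	int pinned_fixed = 1;		/* caller-level init, as in the fix */
	int i;

	for (i = 0; i < 2; i++) {
		balance_group_buggy(movable[i], &pinned_buggy);
		balance_group_fixed(movable[i], &pinned_fixed);
	}

	/* Buggy path reports all-pinned because the last group wins;
	 * fixed path correctly remembers that group 0 had movable tasks. */
	printf("buggy: all_pinned=%d, fixed: all_pinned=%d\n",
	       pinned_buggy, pinned_fixed);
	return 0;
}

Run against a first group with movable tasks and a last group with none, the buggy variant reports all_pinned=1 while the fixed variant reports all_pinned=0 — the same difference as between the old per-group write in balance_tasks() and the caller-level "all_pinned = 1" added to load_balance() below.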
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched_fair.c	11
1 file changed, 4 insertions, 7 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 60f9d407c5ec..6fa833ab2cb8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2104,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  enum cpu_idle_type idle, int *all_pinned,
 		  int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
 {
-	int loops = 0, pulled = 0, pinned = 0;
+	int loops = 0, pulled = 0;
 	long rem_load_move = max_load_move;
 	struct task_struct *p, *n;
 
 	if (max_load_move == 0)
 		goto out;
 
-	pinned = 1;
-
 	list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
 		if (loops++ > sysctl_sched_nr_migrate)
 			break;
 
 		if ((p->se.load.weight >> 1) > rem_load_move ||
-		    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+		    !can_migrate_task(p, busiest, this_cpu, sd, idle,
+				      all_pinned))
 			continue;
 
 		pull_task(busiest, p, this_rq, this_cpu);
@@ -2153,9 +2152,6 @@ out:
 	 */
 	schedstat_add(sd, lb_gained[idle], pulled);
 
-	if (all_pinned)
-		*all_pinned = pinned;
-
 	return max_load_move - rem_load_move;
 }
 
@@ -3341,6 +3337,7 @@ redo:
 	 * still unbalanced. ld_moved simply stays zero, so it is
 	 * correctly treated as an imbalance.
 	 */
+	all_pinned = 1;
 	local_irq_save(flags);
 	double_rq_lock(this_rq, busiest);
 	ld_moved = move_tasks(this_rq, this_cpu, busiest,