author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2011-02-21 12:56:47 -0500
committer	Ingo Molnar <mingo@elte.hu>	2011-02-23 05:33:57 -0500
commit		866ab43efd325fae8889ea77a744d03f2b957e38 (patch)
tree		450263aa8a30abb4a0ab2812643aa7a83711df05 /kernel/sched_fair.c
parent		cc57aa8f4b3bece8c26c7929728edcc5fa6b5aed (diff)
sched: Fix the group_imb logic
On a 2*6*2 machine something like:
taskset -c 3-11 bash -c 'for ((i=0;i<9;i++)) do while :; do :; done & done'
_should_ result in 9 busy CPUs, each running 1 task.
However it didn't quite work reliably; most of the time one cpu of the
second socket (6-11) would be idle while one cpu of the first socket
(0-5) would have two tasks on it.
The group_imb logic is supposed to deal with this and detect when a
particular group is imbalanced (as in our case: cpus 0-2 are idle while
cpus 3-5 have 4 tasks between them).
The detection phase needed a bit of a tweak as it was too weak: it
required a difference of more than two average task weights between the
idle and busy cpus in the group, which doesn't trigger for our
test-case. So cure that to require a difference of one or more average
task weights between cpus.
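
To make the arithmetic concrete, here is a minimal sketch of the old and
new tests, assuming nice-0 tasks of weight 1024 and the failing layout
above (cpus 0-2 idle, 4 tasks spread over cpus 3-5); the numbers are
illustrative only:

/*
 * Illustrative only: nice-0 task weight assumed to be 1024, the group
 * has 4 such tasks with cpus 0-2 idle, so one cpu carries two tasks.
 */
#include <stdio.h>

int main(void)
{
	unsigned long max_cpu_load = 2 * 1024;		/* cpu running two tasks */
	unsigned long min_cpu_load = 0;			/* an idle cpu in the group */
	unsigned long avg_load_per_task = (4 * 1024) / 4;

	/* old test: needs a spread of more than two avg task weights -> 0 */
	printf("old: %d\n", (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task);
	/* new test: a spread of one avg task weight is enough -> 1 */
	printf("new: %d\n", (max_cpu_load - min_cpu_load) >= avg_load_per_task);
	return 0;
}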
Once the detection phase worked, it was then defeated by the f_b_g()
tests trying to avoid ping-pongs. In particular, this_load >= max_load
triggered because the pulling cpu (the (first) idle cpu on the
second socket, say 6) would find this_load to be 5 and max_load to be
4 (there'd be 5 tasks running on our socket and only 4 on the other
socket).
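
For the same layout, a minimal sketch of how the added group_imb escape
gets past that bail-out; loads are counted in whole tasks here purely
for illustration (f_b_g() really compares scaled group loads):

#include <stdio.h>

int main(void)
{
	unsigned long this_load = 5;	/* pulling cpu 6's socket runs 5 tasks */
	unsigned long max_load  = 4;	/* busiest (imbalanced) socket runs 4 */
	int group_imb = 1;		/* set by the detection in update_sg_lb_stats() */

	if (group_imb)
		printf("force balance despite this_load >= max_load\n");
	else if (this_load >= max_load)
		printf("out_balanced: give up and leave cpu 6 idle\n");
	return 0;
}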
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nikhil Rao <ncrao@google.com>
Cc: Venkatesh Pallipadi <venki@google.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--	kernel/sched_fair.c	12
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 03496ebc4553..3a88dee165c0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2743,7 +2743,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
-	 * than the average weight of two tasks.
+	 * than the average weight of a task.
 	 *
 	 * APZ: with cgroup the avg task weight can vary wildly and
 	 * might not be a suitable number - should we keep a
@@ -2753,7 +2753,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if (sgs->sum_nr_running)
 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
+	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
@@ -3128,6 +3128,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
+	/*
+	 * If the busiest group is imbalanced the below checks don't
+	 * work because they assumes all things are equal, which typically
+	 * isn't true due to cpus_allowed constraints and the like.
+	 */
+	if (sds.group_imb)
+		goto force_balance;
+
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
 	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
 	    !sds.busiest_has_capacity)