author    Rik van Riel <riel@redhat.com>   2015-02-03 16:56:48 -0500
committer Ingo Molnar <mingo@kernel.org>   2015-02-18 10:18:00 -0500
commit    095bebf61a460ad7f6a45bb17ddbf3a9df2b4397 (patch)
tree      d2d38a8c439295c04639861cae298f379e4e50b6 /kernel/sched/fair.c
parent    2636ed5f8d15ff9395731593537b4b3fdf2af24d (diff)
sched/numa: Do not move past the balance point if unbalanced
There is a subtle interaction between the logic introduced in commit
e63da03639cc ("sched/numa: Allow task switch if load imbalance improves"),
the way the load balancer counts the load on each NUMA node, and the way
NUMA hinting faults are done.

Specifically, the load balancer only counts currently running tasks in the
load, while NUMA hinting faults may cause tasks to stop, if the page is
locked by another task.

This could cause all of the threads of a large single instance workload,
like SPECjbb2005, to migrate to the same NUMA node. This was possible
because occasionally they all fault on the same few pages, and only one of
the threads remains runnable. That thread can move to the process's
preferred NUMA node without making the imbalance worse, because nothing
else is running at that time.

The fix is to check the direction of the net movement of load, and to
refuse a NUMA move if it would cause the system to move past the point of
balance. In an unbalanced state, only moves that bring us closer to the
balance point are allowed.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: mgorman@suse.de
Link: http://lkml.kernel.org/r/20150203165648.0e9ac692@annuminas.surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
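For illustration, a minimal, self-contained user-space sketch of the capacity-scaled balance-point check described above follows. The struct and function names (numa_env_sketch, move_too_imbalanced) are invented for this example only; in the kernel the corresponding logic is load_too_imbalanced() in kernel/sched/fair.c, which reads its inputs from struct task_numa_env.

    #include <stdbool.h>
    #include <stdio.h>

    struct numa_env_sketch {
            long src_load, dst_load;         /* node loads after the proposed move */
            long orig_src_load;              /* source node load before the move */
            long src_capacity, dst_capacity; /* per-node compute capacity */
            long imbalance_pct;              /* e.g. 125 => 25% allowed imbalance */
    };

    /* Hypothetical stand-in for the kernel check; returns true if the move is refused. */
    static bool move_too_imbalanced(const struct numa_env_sketch *e)
    {
            long load_a = e->dst_load, load_b = e->src_load;
            long imb, moved_load;

            /* We care about the slope of the imbalance, not the direction. */
            if (load_a < load_b) {
                    long tmp = load_a;
                    load_a = load_b;
                    load_b = tmp;
            }

            /* Within the imbalance_pct threshold the move is always allowed. */
            imb = load_a * e->src_capacity * 100 -
                  load_b * e->dst_capacity * e->imbalance_pct;
            if (imb <= 0)
                    return false;

            /* Net load leaving the source node; positive means a src -> dst move. */
            moved_load = e->orig_src_load - e->src_load;

            if (moved_load > 0)
                    /* Moving src -> dst: refuse if dst ends up heavier (capacity-scaled). */
                    return e->src_load * e->dst_capacity < e->dst_load * e->src_capacity;
            else
                    /* Moving dst -> src: refuse if src ends up heavier (capacity-scaled). */
                    return e->dst_load * e->src_capacity < e->src_load * e->dst_capacity;
    }

    int main(void)
    {
            /* Unbalanced start: 3000 load on src, 1000 on dst, equal capacity. */
            struct numa_env_sketch e = {
                    .orig_src_load = 3000,
                    .src_capacity = 1024, .dst_capacity = 1024,
                    .imbalance_pct = 125,
            };

            /* Moving 500 of load src -> dst leaves 2500 vs 1500: closer to balance. */
            e.src_load = 2500; e.dst_load = 1500;
            printf("move 500:  %s\n", move_too_imbalanced(&e) ? "refused" : "allowed");

            /* Moving 1500 leaves 1500 vs 2500: past the balance point. */
            e.src_load = 1500; e.dst_load = 2500;
            printf("move 1500: %s\n", move_too_imbalanced(&e) ? "refused" : "allowed");

            return 0;
    }

Compiled with any C compiler, this prints "move 500:  allowed" and "move 1500: refused". The second move leaves the system less imbalanced in absolute terms than before (1500/2500 instead of 3000/1000), so a comparison of imbalance magnitudes alone would have accepted it even though it crosses the balance point; the directional check above is what rejects it.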
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c  41
1 file changed, 26 insertions, 15 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ce18f3c097a..28cbacae4e51 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1196,9 +1196,11 @@ static void task_numa_assign(struct task_numa_env *env,
 static bool load_too_imbalanced(long src_load, long dst_load,
                                 struct task_numa_env *env)
 {
-        long imb, old_imb;
-        long orig_src_load, orig_dst_load;
         long src_capacity, dst_capacity;
+        long orig_src_load;
+        long load_a, load_b;
+        long moved_load;
+        long imb;
 
         /*
          * The load is corrected for the CPU capacity available on each node.
@@ -1211,30 +1213,39 @@ static bool load_too_imbalanced(long src_load, long dst_load,
         dst_capacity = env->dst_stats.compute_capacity;
 
         /* We care about the slope of the imbalance, not the direction. */
-        if (dst_load < src_load)
-                swap(dst_load, src_load);
+        load_a = dst_load;
+        load_b = src_load;
+        if (load_a < load_b)
+                swap(load_a, load_b);
 
         /* Is the difference below the threshold? */
-        imb = dst_load * src_capacity * 100 -
-              src_load * dst_capacity * env->imbalance_pct;
+        imb = load_a * src_capacity * 100 -
+              load_b * dst_capacity * env->imbalance_pct;
         if (imb <= 0)
                 return false;
 
         /*
          * The imbalance is above the allowed threshold.
-         * Compare it with the old imbalance.
+         * Allow a move that brings us closer to a balanced situation,
+         * without moving things past the point of balance.
          */
         orig_src_load = env->src_stats.load;
-        orig_dst_load = env->dst_stats.load;
 
-        if (orig_dst_load < orig_src_load)
-                swap(orig_dst_load, orig_src_load);
-
-        old_imb = orig_dst_load * src_capacity * 100 -
-                  orig_src_load * dst_capacity * env->imbalance_pct;
+        /*
+         * In a task swap, there will be one load moving from src to dst,
+         * and another moving back. This is the net sum of both moves.
+         * A simple task move will always have a positive value.
+         * Allow the move if it brings the system closer to a balanced
+         * situation, without crossing over the balance point.
+         */
+        moved_load = orig_src_load - src_load;
 
-        /* Would this change make things worse? */
-        return (imb > old_imb);
+        if (moved_load > 0)
+                /* Moving src -> dst. Did we overshoot balance? */
+                return src_load * dst_capacity < dst_load * src_capacity;
+        else
+                /* Moving dst -> src. Did we overshoot balance? */
+                return dst_load * src_capacity < src_load * dst_capacity;
 }
 
 /*