author	Rik van Riel <riel@redhat.com>	2014-06-23 11:46:16 -0400
committer	Ingo Molnar <mingo@kernel.org>	2014-07-05 05:17:38 -0400
commit	0132c3e1777ceabc24c7d209b7cbe78c28c03c09 (patch)
tree	34d55dcb41981477a8eb4a136fbb9315572d4d6d
parent	1c5d3eb3759013bc7ee4197aa0a9f245bdb6eb90 (diff)
sched/numa: Examine a task move when examining a task swap
Running "perf bench numa mem -0 -m -P 1000 -p 8 -t 20" on a 4 node system results in 160 runnable threads on a system with 80 CPU threads. Once a process has nearly converged, with 39 threads on one node and 1 thread on another node, the remaining thread will be unable to migrate to its preferred node through a task swap. However, a simple task move would make the workload converge, witout causing an imbalance. Test for this unlikely occurrence, and attempt a task move to the preferred nid when it happens. # Running main, "perf bench numa mem -p 8 -t 20 -0 -m -P 1000" ### # 160 tasks will execute (on 4 nodes, 80 CPUs): # -1x 0MB global shared mem operations # -1x 1000MB process shared mem operations # -1x 0MB thread local mem operations ### ### # # 0.0% [0.2 mins] 0/0 1/1 36/2 0/0 [36/3 ] l: 0-0 ( 0) {0-2} # 0.0% [0.3 mins] 43/3 37/2 39/2 41/3 [ 6/10] l: 0-1 ( 1) {1-2} # 0.0% [0.4 mins] 42/3 38/2 40/2 40/2 [ 4/9 ] l: 1-2 ( 1) [50.0%] {1-2} # 0.0% [0.6 mins] 41/3 39/2 40/2 40/2 [ 2/9 ] l: 2-4 ( 2) [50.0%] {1-2} # 0.0% [0.7 mins] 40/2 40/2 40/2 40/2 [ 0/8 ] l: 3-5 ( 2) [40.0%] ( 41.8s converged) Without this patch, this same perf bench numa mem run had to rely on the scheduler load balancer to first balance out the load (moving a random task), before a task swap could complete the NUMA convergence. The load balancer does not normally take action unless the load difference exceeds 25%. Convergence times of over half an hour have been observed without this patch. With this patch, the NUMA balancing code will simply migrate the task, if that does not cause an imbalance. Also skip examining a CPU in detail if the improvement on that CPU is no more than the best we already have. Signed-off-by: Rik van Riel <riel@redhat.com> Cc: chegu_vinod@hp.com Cc: mgorman@suse.de Cc: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/n/tip-ggthh0rnh0yua6o5o3p6cr1o@git.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--	kernel/sched/fair.c	23
1 file changed, 21 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cebb312e874b..9d1734a724a8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1155,6 +1155,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	long src_load, dst_load;
 	long load;
 	long imp = env->p->numa_group ? groupimp : taskimp;
+	long moveimp = imp;
 
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
@@ -1201,7 +1202,7 @@ static void task_numa_compare(struct task_numa_env *env,
 		}
 	}
 
-	if (imp < env->best_imp)
+	if (imp <= env->best_imp && moveimp <= env->best_imp)
 		goto unlock;
 
 	if (!cur) {
@@ -1214,7 +1215,8 @@ static void task_numa_compare(struct task_numa_env *env,
 	}
 
 	/* Balance doesn't matter much if we're running a task per cpu */
-	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+	if (imp > env->best_imp && src_rq->nr_running == 1 &&
+			dst_rq->nr_running == 1)
 		goto assign;
 
 	/*
@@ -1230,6 +1232,23 @@ balance:
 	src_load += effective_load(tg, env->src_cpu, -load, -load);
 	dst_load += effective_load(tg, env->dst_cpu, load, load);
 
+	if (moveimp > imp && moveimp > env->best_imp) {
+		/*
+		 * If the improvement from just moving env->p direction is
+		 * better than swapping tasks around, check if a move is
+		 * possible. Store a slightly smaller score than moveimp,
+		 * so an actually idle CPU will win.
+		 */
+		if (!load_too_imbalanced(src_load, dst_load, env)) {
+			imp = moveimp - 1;
+			cur = NULL;
+			goto assign;
+		}
+	}
+
+	if (imp <= env->best_imp)
+		goto unlock;
+
 	if (cur) {
 		/* Cur moves in the opposite direction. */
 		load = cur->se.load.weight;