author		Rik van Riel <riel@redhat.com>		2015-05-28 09:52:49 -0400
committer	Ingo Molnar <mingo@kernel.org>		2015-06-07 09:57:45 -0400
commit		6f9aad0bc37286c0441b57f0ba8cffee50715426
tree		309973a5dc146449ec211d68ecade7be4ba068cd
parent		e4991b240c622f0441c21f4869e13209abc08c5e
sched/numa: Only consider less busy nodes as numa balancing destinations
Changeset a43455a1d572 ("sched/numa: Ensure task_numa_migrate() checks
the preferred node") fixes an issue where workloads would never converge
on a fully loaded (or overloaded) system.

However, it introduces a regression on less than fully loaded systems,
where workloads converge on a few NUMA nodes, instead of properly staying
spread out across the whole system. This leads to a reduction in available
memory bandwidth, and usable CPU cache, with predictable performance
problems.

The root cause appears to be an interaction between the load balancer and
NUMA balancing, where the short term load represented by the load balancer
differs from the long term load the NUMA balancing code would like to base
its decisions on.

Simply reverting a43455a1d572 would re-introduce the non-convergence of
workloads on fully loaded systems, so that is not a good option. As an
aside, the check done before a43455a1d572 only applied to a task's
preferred node, not to other candidate nodes in the system, so the
converge-on-too-few-nodes problem still happens, just to a lesser degree.

Instead, try to compensate for the impedance mismatch between the load
balancer and NUMA balancing by only ever considering a lesser loaded node
as a destination for NUMA balancing, regardless of whether the task is
trying to move to the preferred node, or to another node.

This patch also addresses the issue that a system with a single runnable
thread would never migrate that thread to near its memory, introduced by
095bebf61a46 ("sched/numa: Do not move past the balance point if
unbalanced").

A test where the main thread creates a large memory area, and spawns a
worker thread to iterate over the memory (placed on another node by
select_task_rq_fair), after which the main thread goes to sleep and waits
for the worker thread to loop over all the memory, now sees the worker
thread migrated to where the memory is, instead of having all the memory
migrated over like before.

Jirka has run a number of performance tests on several systems: single
instance SpecJBB 2005 performance is 7-15% higher on a 4 node system,
with higher gains on systems with more cores per socket. Multi-instance
SpecJBB 2005 (one per node), linpack, and stream see little or no changes
with the revert of 095bebf61a46 and this patch.

Reported-by: Artem Bityutski <dedekind1@gmail.com>
Reported-by: Jirka Hladky <jhladky@redhat.com>
Tested-by: Jirka Hladky <jhladky@redhat.com>
Tested-by: Artem Bityutskiy <dedekind1@gmail.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150528095249.3083ade0@annuminas.surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--	kernel/sched/fair.c	30
1 file changed, 28 insertions(+), 2 deletions(-)
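For reference, the gate this patch adds boils down to a capacity-normalized
load comparison: a node is only considered as a NUMA balancing destination
if the source node carries more load per unit of compute capacity. The
user-space sketch below mirrors the cross-multiplied check used by
numa_has_capacity() further down; the helper name and the sample load and
capacity figures are purely illustrative, not part of the kernel code.

/* Illustrative sketch only; numbers below are made up. */
#include <stdbool.h>
#include <stdio.h>

/*
 * "Is the source busier than the destination, per unit of capacity?"
 *
 *     src_load          dst_load
 *   ------------   >  ------------
 *   src_capacity      dst_capacity
 *
 * evaluated by cross-multiplying, which keeps the comparison in
 * integer arithmetic and avoids division.
 */
static bool src_busier_than_dst(unsigned long src_load, unsigned long src_capacity,
				unsigned long dst_load, unsigned long dst_capacity)
{
	return src_load * dst_capacity > dst_load * src_capacity;
}

int main(void)
{
	/* Equal load, but the destination has twice the CPU capacity: move allowed. */
	printf("%d\n", src_busier_than_dst(2048, 1024, 2048, 2048));	/* prints 1 */

	/* Destination already carries more load per unit of capacity: stay put. */
	printf("%d\n", src_busier_than_dst(1024, 1024, 2048, 1024));	/* prints 0 */

	return 0;
}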
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 723d69e241be..4b6e5f63d9af 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1398,6 +1398,30 @@ static void task_numa_find_cpu(struct task_numa_env *env,
 	}
 }
 
+/* Only move tasks to a NUMA node less busy than the current node. */
+static bool numa_has_capacity(struct task_numa_env *env)
+{
+	struct numa_stats *src = &env->src_stats;
+	struct numa_stats *dst = &env->dst_stats;
+
+	if (src->has_free_capacity && !dst->has_free_capacity)
+		return false;
+
+	/*
+	 * Only consider a task move if the source has a higher load
+	 * than the destination, corrected for CPU capacity on each node.
+	 *
+	 *      src->load                dst->load
+	 * --------------------- vs ---------------------
+	 * src->compute_capacity      dst->compute_capacity
+	 */
+	if (src->load * dst->compute_capacity >
+	    dst->load * src->compute_capacity)
+		return true;
+
+	return false;
+}
+
 static int task_numa_migrate(struct task_struct *p)
 {
 	struct task_numa_env env = {
@@ -1452,7 +1476,8 @@ static int task_numa_migrate(struct task_struct *p)
 	update_numa_stats(&env.dst_stats, env.dst_nid);
 
 	/* Try to find a spot on the preferred nid. */
-	task_numa_find_cpu(&env, taskimp, groupimp);
+	if (numa_has_capacity(&env))
+		task_numa_find_cpu(&env, taskimp, groupimp);
 
 	/*
 	 * Look at other nodes in these cases:
@@ -1483,7 +1508,8 @@ static int task_numa_migrate(struct task_struct *p)
 		env.dist = dist;
 		env.dst_nid = nid;
 		update_numa_stats(&env.dst_stats, env.dst_nid);
-		task_numa_find_cpu(&env, taskimp, groupimp);
+		if (numa_has_capacity(&env))
+			task_numa_find_cpu(&env, taskimp, groupimp);
 	}
 }
 
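As a rough illustration of the single-runnable-thread scenario described in
the commit message, the sketch below has the main thread allocate and touch
a large buffer, then block while one worker thread repeatedly walks it. This
is not Rik's actual test program; the buffer size, loop count, page-step
stride, and the absence of any explicit NUMA placement are all assumptions
made for the example.

/* Approximate reproduction sketch only; build with: gcc -O2 -pthread test.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUF_SIZE	(1UL << 30)	/* 1 GiB working set (arbitrary) */
#define LOOPS		100		/* arbitrary iteration count */

static char *buf;

static void *worker(void *arg)
{
	unsigned long sum = 0;

	(void)arg;

	/* Touch one byte per page, repeatedly, so the access pattern is visible. */
	for (int i = 0; i < LOOPS; i++)
		for (size_t off = 0; off < BUF_SIZE; off += 4096)
			sum += buf[off];

	return (void *)sum;
}

int main(void)
{
	pthread_t tid;
	void *ret;

	/* First touch happens on the main thread, so memory lands on its node. */
	buf = malloc(BUF_SIZE);
	if (!buf)
		return 1;
	memset(buf, 1, BUF_SIZE);

	/* The worker is typically placed on another node by select_task_rq_fair(). */
	if (pthread_create(&tid, NULL, worker, NULL))
		return 1;

	/* Main thread blocks; the worker is the only runnable thread. */
	pthread_join(tid, &ret);

	printf("sum %lu\n", (unsigned long)ret);
	return 0;
}

With this patch (and the revert of 095bebf61a46), the expectation stated in
the commit message is that the lone worker thread migrates to the node
holding the memory, rather than all of the memory being migrated to the
worker's node.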