author		Rik van Riel <riel@redhat.com>		2015-05-28 09:52:49 -0400
committer	Ingo Molnar <mingo@kernel.org>		2015-06-07 09:57:45 -0400
commit		6f9aad0bc37286c0441b57f0ba8cffee50715426
tree		309973a5dc146449ec211d68ecade7be4ba068cd
parent		e4991b240c622f0441c21f4869e13209abc08c5e
sched/numa: Only consider less busy nodes as numa balancing destinations
Changeset a43455a1d572 ("sched/numa: Ensure task_numa_migrate() checks
the preferred node") fixes an issue where workloads would never converge
on a fully loaded (or overloaded) system.

However, it introduces a regression on less than fully loaded systems,
where workloads converge on a few NUMA nodes, instead of properly staying
spread out across the whole system. This leads to a reduction in available
memory bandwidth, and usable CPU cache, with predictable performance
problems.

The root cause appears to be an interaction between the load balancer and
NUMA balancing, where the short term load represented by the load balancer
differs from the long term load the NUMA balancing code would like to base
its decisions on.

Simply reverting a43455a1d572 would re-introduce the non-convergence of
workloads on fully loaded systems, so that is not a good option. As an
aside, the check done before a43455a1d572 only applied to a task's
preferred node, not to other candidate nodes in the system, so the
converge-on-too-few-nodes problem still happens, just to a lesser degree.

Instead, try to compensate for the impedance mismatch between the load
balancer and NUMA balancing by only ever considering a lesser loaded node
as a destination for NUMA balancing, regardless of whether the task is
trying to move to the preferred node, or to another node.

This patch also addresses the issue that a system with a single runnable
thread would never migrate that thread to near its memory, introduced by
095bebf61a46 ("sched/numa: Do not move past the balance point if
unbalanced").

A test where the main thread creates a large memory area, and spawns a
worker thread to iterate over the memory (placed on another node by
select_task_rq_fair), after which the main thread goes to sleep and waits
for the worker thread to loop over all the memory, now sees the worker
thread migrated to where the memory is, instead of having all the memory
migrated over like before.

Jirka has run a number of performance tests on several systems: single
instance SpecJBB 2005 performance is 7-15% higher on a 4 node system,
with higher gains on systems with more cores per socket. Multi-instance
SpecJBB 2005 (one per node), linpack, and stream see little or no changes
with the revert of 095bebf61a46 and this patch.

Reported-by: Artem Bityutski <dedekind1@gmail.com>
Reported-by: Jirka Hladky <jhladky@redhat.com>
Tested-by: Jirka Hladky <jhladky@redhat.com>
Tested-by: Artem Bityutskiy <dedekind1@gmail.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150528095249.3083ade0@annuminas.surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--	kernel/sched/fair.c	30
1 file changed, 28 insertions(+), 2 deletions(-)
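For reference, the gate this patch adds boils down to a capacity-normalized
load comparison: a node is only considered as a NUMA balancing destination
if the source node carries more load per unit of compute capacity. The
user-space sketch below mirrors the cross-multiplied check used by
numa_has_capacity() further down; the helper name and the sample load and
capacity figures are purely illustrative, not part of the kernel code.

/* Illustrative sketch only; numbers below are made up. */
#include <stdbool.h>
#include <stdio.h>

/*
 * "Is the source busier than the destination, per unit of capacity?"
 *
 *     src_load          dst_load
 *   ------------   >  ------------
 *   src_capacity      dst_capacity
 *
 * evaluated by cross-multiplying, which keeps the comparison in
 * integer arithmetic and avoids division.
 */
static bool src_busier_than_dst(unsigned long src_load, unsigned long src_capacity,
				unsigned long dst_load, unsigned long dst_capacity)
{
	return src_load * dst_capacity > dst_load * src_capacity;
}

int main(void)
{
	/* Equal load, but the destination has twice the CPU capacity: move allowed. */
	printf("%d\n", src_busier_than_dst(2048, 1024, 2048, 2048));	/* prints 1 */

	/* Destination already carries more load per unit of capacity: stay put. */
	printf("%d\n", src_busier_than_dst(1024, 1024, 2048, 1024));	/* prints 0 */

	return 0;
}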
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 723d69e241be..4b6e5f63d9af 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1398,6 +1398,30 @@ static void task_numa_find_cpu(struct task_numa_env *env,
 	}
 }
 
+/* Only move tasks to a NUMA node less busy than the current node. */
+static bool numa_has_capacity(struct task_numa_env *env)
+{
+	struct numa_stats *src = &env->src_stats;
+	struct numa_stats *dst = &env->dst_stats;
+
+	if (src->has_free_capacity && !dst->has_free_capacity)
+		return false;
+
+	/*
+	 * Only consider a task move if the source has a higher load
+	 * than the destination, corrected for CPU capacity on each node.
+	 *
+	 *      src->load                dst->load
+	 * --------------------- vs ---------------------
+	 * src->compute_capacity      dst->compute_capacity
+	 */
+	if (src->load * dst->compute_capacity >
+	    dst->load * src->compute_capacity)
+		return true;
+
+	return false;
+}
+
 static int task_numa_migrate(struct task_struct *p)
 {
 	struct task_numa_env env = {
@@ -1452,7 +1476,8 @@ static int task_numa_migrate(struct task_struct *p)
 	update_numa_stats(&env.dst_stats, env.dst_nid);
 
 	/* Try to find a spot on the preferred nid. */
-	task_numa_find_cpu(&env, taskimp, groupimp);
+	if (numa_has_capacity(&env))
+		task_numa_find_cpu(&env, taskimp, groupimp);
 
 	/*
 	 * Look at other nodes in these cases:
@@ -1483,7 +1508,8 @@ static int task_numa_migrate(struct task_struct *p)
 		env.dist = dist;
 		env.dst_nid = nid;
 		update_numa_stats(&env.dst_stats, env.dst_nid);
-		task_numa_find_cpu(&env, taskimp, groupimp);
+		if (numa_has_capacity(&env))
+			task_numa_find_cpu(&env, taskimp, groupimp);
 	}
 }
 
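As a rough illustration of the single-runnable-thread scenario described in
the commit message, the sketch below has the main thread allocate and touch
a large buffer, then block while one worker thread repeatedly walks it. This
is not Rik's actual test program; the buffer size, loop count, page-step
stride, and the absence of any explicit NUMA placement are all assumptions
made for the example.

/* Approximate reproduction sketch only; build with: gcc -O2 -pthread test.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUF_SIZE	(1UL << 30)	/* 1 GiB working set (arbitrary) */
#define LOOPS		100		/* arbitrary iteration count */

static char *buf;

static void *worker(void *arg)
{
	unsigned long sum = 0;

	(void)arg;

	/* Touch one byte per page, repeatedly, so the access pattern is visible. */
	for (int i = 0; i < LOOPS; i++)
		for (size_t off = 0; off < BUF_SIZE; off += 4096)
			sum += buf[off];

	return (void *)sum;
}

int main(void)
{
	pthread_t tid;
	void *ret;

	/* First touch happens on the main thread, so memory lands on its node. */
	buf = malloc(BUF_SIZE);
	if (!buf)
		return 1;
	memset(buf, 1, BUF_SIZE);

	/* The worker is typically placed on another node by select_task_rq_fair(). */
	if (pthread_create(&tid, NULL, worker, NULL))
		return 1;

	/* Main thread blocks; the worker is the only runnable thread. */
	pthread_join(tid, &ret);

	printf("sum %lu\n", (unsigned long)ret);
	return 0;
}

With this patch (and the revert of 095bebf61a46), the expectation stated in
the commit message is that the lone worker thread migrates to the node
holding the memory, rather than all of the memory being migrated to the
worker's node.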