summaryrefslogtreecommitdiffstats
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
authorRik van Riel <riel@redhat.com>2014-10-17 03:29:52 -0400
committerIngo Molnar <mingo@kernel.org>2014-10-28 05:47:50 -0400
commit6c6b1193e71fed1a58dc3fab9d967d245177f87b (patch)
tree9b0c2b2ad48f7acf8e8c52e306bb9e67f9ba82fd /kernel/sched/fair.c
parent7bd953206b0b5e0a3aded871982367410b42e1b1 (diff)
sched/numa: Calculate node scores in complex NUMA topologies
In order to do task placement on systems with complex NUMA topologies, it is necessary to count the faults on nodes nearby the node that is being examined for a potential move. In case of a system with a backplane interconnect, we are dealing with groups of NUMA nodes; each of the nodes within a group is the same number of hops away from nodes in other groups in the system. Optimal placement on this topology is achieved by counting all nearby nodes equally. When comparing nodes A and B at distance N, nearby nodes are those at distances smaller than N from nodes A or B. Placement strategy on a system with a glueless mesh NUMA topology needs to be different, because there are no natural groups of nodes determined by the hardware. Instead, when dealing with two nodes A and B at distance N, N >= 2, there will be intermediate nodes at distance < N from both nodes A and B. Good placement can be achieved by right shifting the faults on nearby nodes by the number of hops from the node being scored. In this context, a nearby node is any node less than the maximum distance in the system away from the node. Those nodes are skipped for efficiency reasons, there is no real policy reason to do so. Placement policy on directly connected NUMA systems is not affected. Signed-off-by: Rik van Riel <riel@redhat.com> Tested-by: Chegu Vinod <chegu_vinod@hp.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Link: http://lkml.kernel.org/r/1413530994-9732-5-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c74
1 files changed, 74 insertions, 0 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0af3bed3521d..7e5712a0e61b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -925,6 +925,71 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
925 group->faults_cpu[task_faults_idx(nid, 1)]; 925 group->faults_cpu[task_faults_idx(nid, 1)];
926} 926}
927 927
928/* Handle placement on systems where not all nodes are directly connected. */
929static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
930 int maxdist, bool task)
931{
932 unsigned long score = 0;
933 int node;
934
935 /*
936 * All nodes are directly connected, and the same distance
937 * from each other. No need for fancy placement algorithms.
938 */
939 if (sched_numa_topology_type == NUMA_DIRECT)
940 return 0;
941
942 /*
943 * This code is called for each node, introducing N^2 complexity,
944 * which should be ok given the number of nodes rarely exceeds 8.
945 */
946 for_each_online_node(node) {
947 unsigned long faults;
948 int dist = node_distance(nid, node);
949
950 /*
951 * The furthest away nodes in the system are not interesting
952 * for placement; nid was already counted.
953 */
954 if (dist == sched_max_numa_distance || node == nid)
955 continue;
956
957 /*
958 * On systems with a backplane NUMA topology, compare groups
959 * of nodes, and move tasks towards the group with the most
960 * memory accesses. When comparing two nodes at distance
961 * "hoplimit", only nodes closer by than "hoplimit" are part
962 * of each group. Skip other nodes.
963 */
964 if (sched_numa_topology_type == NUMA_BACKPLANE &&
965 dist > maxdist)
966 continue;
967
968 /* Add up the faults from nearby nodes. */
969 if (task)
970 faults = task_faults(p, node);
971 else
972 faults = group_faults(p, node);
973
974 /*
975 * On systems with a glueless mesh NUMA topology, there are
976 * no fixed "groups of nodes". Instead, nodes that are not
977 * directly connected bounce traffic through intermediate
978 * nodes; a numa_group can occupy any set of nodes.
979 * The further away a node is, the less the faults count.
980 * This seems to result in good task placement.
981 */
982 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
983 faults *= (sched_max_numa_distance - dist);
984 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
985 }
986
987 score += faults;
988 }
989
990 return score;
991}
992
928/* 993/*
929 * These return the fraction of accesses done by a particular task, or 994 * These return the fraction of accesses done by a particular task, or
930 * task group, on a particular numa node. The group weight is given a 995 * task group, on a particular numa node. The group weight is given a
@@ -945,6 +1010,8 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
945 return 0; 1010 return 0;
946 1011
947 faults = task_faults(p, nid); 1012 faults = task_faults(p, nid);
1013 faults += score_nearby_nodes(p, nid, dist, true);
1014
948 return 1000 * faults / total_faults; 1015 return 1000 * faults / total_faults;
949} 1016}
950 1017
@@ -962,6 +1029,8 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
962 return 0; 1029 return 0;
963 1030
964 faults = group_faults(p, nid); 1031 faults = group_faults(p, nid);
1032 faults += score_nearby_nodes(p, nid, dist, false);
1033
965 return 1000 * faults / total_faults; 1034 return 1000 * faults / total_faults;
966} 1035}
967 1036
@@ -1374,6 +1443,11 @@ static int task_numa_migrate(struct task_struct *p)
1374 continue; 1443 continue;
1375 1444
1376 dist = node_distance(env.src_nid, env.dst_nid); 1445 dist = node_distance(env.src_nid, env.dst_nid);
1446 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1447 dist != env.dist) {
1448 taskweight = task_weight(p, env.src_nid, dist);
1449 groupweight = group_weight(p, env.src_nid, dist);
1450 }
1377 1451
1378 /* Only consider nodes where both task and groups benefit */ 1452 /* Only consider nodes where both task and groups benefit */
1379 taskimp = task_weight(p, nid, dist) - taskweight; 1453 taskimp = task_weight(p, nid, dist) - taskweight;