author     Mel Gorman <mgorman@suse.de>        2013-10-07 06:29:27 -0400
committer  Ingo Molnar <mingo@kernel.org>      2013-10-09 08:47:58 -0400
commit     83e1d2cd9eabec5164afea295ff06b941ae8e4a9 (patch)
tree       f1f23d5483b00be3ce851c941de72ea52d6f7a4b /kernel
parent     5e1576ed0e54d419286a8096133029062b6ad456 (diff)
sched/numa: Use group fault statistics in numa placement
This patch uses the fraction of faults on a particular node, for both the
task and the group, to figure out the best node on which to place a task.
If the task and group statistics disagree on what the preferred node should
be, then a full rescan will select the node with the best combined weight.
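For illustration, the following standalone sketch (not part of the patch) shows
the arithmetic that the new task_weight()/group_weight() helpers perform and how
the combined weight picks a node: per-node faults are turned into fractions
scaled by 1000 for the task and 1200 for the group, and the node with the
largest sum wins even when the task's own faults would point elsewhere. The
fault counts and NR_NODES below are made-up inputs; only the scale factors and
the selection loop mirror the kernel code in the diff.

/*
 * Minimal userspace sketch of the node-selection math this patch adds.
 * The fault counts are hypothetical; only the 1000/1200 scale factors and
 * the "pick the node with the best combined weight" step mirror
 * task_weight()/group_weight() in kernel/sched/fair.c below.
 */
#include <stdio.h>

#define NR_NODES 4

static unsigned long task_weight(unsigned long faults, unsigned long total)
{
        return total ? 1000 * faults / total : 0;
}

static unsigned long group_weight(unsigned long faults, unsigned long total)
{
        return total ? 1200 * faults / total : 0;
}

int main(void)
{
        /* Hypothetical per-node fault counts for one task and its group. */
        unsigned long task_faults[NR_NODES]  = { 120,  40, 300,  40 };
        unsigned long group_faults[NR_NODES] = { 900, 100, 200, 300 };
        unsigned long task_total = 0, group_total = 0;
        unsigned long weight, max_weight = 0;
        int nid, max_nid = -1;

        for (nid = 0; nid < NR_NODES; nid++) {
                task_total += task_faults[nid];
                group_total += group_faults[nid];
        }

        /* Combined weight decides placement when task and group disagree. */
        for (nid = 0; nid < NR_NODES; nid++) {
                weight = task_weight(task_faults[nid], task_total) +
                         group_weight(group_faults[nid], group_total);
                if (weight > max_weight) {
                        max_weight = weight;
                        max_nid = nid;
                }
        }

        printf("preferred node: %d (combined weight %lu)\n", max_nid, max_weight);
        return 0;
}

With these inputs the task alone would prefer node 2, but the group's
concentration of faults on node 0 dominates the combined weight, which is
exactly the disagreement case the full rescan in task_numa_placement() resolves.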
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-50-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/fair.c | 124
1 file changed, 107 insertions, 17 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 35661b8afb4e..4c40e13310e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -897,6 +897,7 @@ struct numa_group {
         struct list_head task_list;
 
         struct rcu_head rcu;
+        atomic_long_t total_faults;
         atomic_long_t faults[0];
 };
 
@@ -919,6 +920,51 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
                 p->numa_faults[task_faults_idx(nid, 1)];
 }
 
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+        if (!p->numa_group)
+                return 0;
+
+        return atomic_long_read(&p->numa_group->faults[2*nid]) +
+                atomic_long_read(&p->numa_group->faults[2*nid+1]);
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node. The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid)
+{
+        unsigned long total_faults;
+
+        if (!p->numa_faults)
+                return 0;
+
+        total_faults = p->total_numa_faults;
+
+        if (!total_faults)
+                return 0;
+
+        return 1000 * task_faults(p, nid) / total_faults;
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid)
+{
+        unsigned long total_faults;
+
+        if (!p->numa_group)
+                return 0;
+
+        total_faults = atomic_long_read(&p->numa_group->total_faults);
+
+        if (!total_faults)
+                return 0;
+
+        return 1200 * group_faults(p, nid) / total_faults;
+}
+
 static unsigned long weighted_cpuload(const int cpu);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
@@ -1018,8 +1064,10 @@ static void task_numa_compare(struct task_numa_env *env, long imp)
                 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
                         goto unlock;
 
-                imp += task_faults(cur, env->src_nid) -
-                       task_faults(cur, env->dst_nid);
+                imp += task_weight(cur, env->src_nid) +
+                       group_weight(cur, env->src_nid) -
+                       task_weight(cur, env->dst_nid) -
+                       group_weight(cur, env->dst_nid);
         }
 
         if (imp < env->best_imp)
@@ -1098,7 +1146,7 @@ static int task_numa_migrate(struct task_struct *p)
                 .best_cpu = -1
         };
         struct sched_domain *sd;
-        unsigned long faults;
+        unsigned long weight;
         int nid, ret;
         long imp;
 
@@ -1115,10 +1163,10 @@ static int task_numa_migrate(struct task_struct *p)
         env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
         rcu_read_unlock();
 
-        faults = task_faults(p, env.src_nid);
+        weight = task_weight(p, env.src_nid) + group_weight(p, env.src_nid);
         update_numa_stats(&env.src_stats, env.src_nid);
         env.dst_nid = p->numa_preferred_nid;
-        imp = task_faults(env.p, env.dst_nid) - faults;
+        imp = task_weight(p, env.dst_nid) + group_weight(p, env.dst_nid) - weight;
         update_numa_stats(&env.dst_stats, env.dst_nid);
 
         /* If the preferred nid has capacity, try to use it. */
@@ -1131,8 +1179,8 @@ static int task_numa_migrate(struct task_struct *p)
                 if (nid == env.src_nid || nid == p->numa_preferred_nid)
                         continue;
 
-                /* Only consider nodes that recorded more faults */
-                imp = task_faults(env.p, nid) - faults;
+                /* Only consider nodes where both task and groups benefit */
+                imp = task_weight(p, nid) + group_weight(p, nid) - weight;
                 if (imp < 0)
                         continue;
 
@@ -1183,8 +1231,8 @@ static void numa_migrate_preferred(struct task_struct *p)
 
 static void task_numa_placement(struct task_struct *p)
 {
-        int seq, nid, max_nid = -1;
-        unsigned long max_faults = 0;
+        int seq, nid, max_nid = -1, max_group_nid = -1;
+        unsigned long max_faults = 0, max_group_faults = 0;
 
         seq = ACCESS_ONCE(p->mm->numa_scan_seq);
         if (p->numa_scan_seq == seq)
@@ -1195,7 +1243,7 @@ static void task_numa_placement(struct task_struct *p)
 
         /* Find the node with the highest number of faults */
         for_each_online_node(nid) {
-                unsigned long faults = 0;
+                unsigned long faults = 0, group_faults = 0;
                 int priv, i;
 
                 for (priv = 0; priv < 2; priv++) {
@@ -1211,9 +1259,12 @@ static void task_numa_placement(struct task_struct *p)
 
                         faults += p->numa_faults[i];
                         diff += p->numa_faults[i];
+                        p->total_numa_faults += diff;
                         if (p->numa_group) {
                                 /* safe because we can only change our own group */
                                 atomic_long_add(diff, &p->numa_group->faults[i]);
+                                atomic_long_add(diff, &p->numa_group->total_faults);
+                                group_faults += atomic_long_read(&p->numa_group->faults[i]);
                         }
                 }
 
@@ -1221,6 +1272,27 @@ static void task_numa_placement(struct task_struct *p)
                         max_faults = faults;
                         max_nid = nid;
                 }
+
+                if (group_faults > max_group_faults) {
+                        max_group_faults = group_faults;
+                        max_group_nid = nid;
+                }
+        }
+
+        /*
+         * If the preferred task and group nids are different,
+         * iterate over the nodes again to find the best place.
+         */
+        if (p->numa_group && max_nid != max_group_nid) {
+                unsigned long weight, max_weight = 0;
+
+                for_each_online_node(nid) {
+                        weight = task_weight(p, nid) + group_weight(p, nid);
+                        if (weight > max_weight) {
+                                max_weight = weight;
+                                max_nid = nid;
+                        }
+                }
         }
 
         /* Preferred node as the node with the most faults */
@@ -1276,6 +1348,8 @@ static void task_numa_group(struct task_struct *p, int cpupid)
                 for (i = 0; i < 2*nr_node_ids; i++)
                         atomic_long_set(&grp->faults[i], p->numa_faults[i]);
 
+                atomic_long_set(&grp->total_faults, p->total_numa_faults);
+
                 list_add(&p->numa_entry, &grp->task_list);
                 grp->nr_tasks++;
                 rcu_assign_pointer(p->numa_group, grp);
@@ -1323,6 +1397,8 @@ unlock:
                 atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]);
                 atomic_long_add(p->numa_faults[i], &grp->faults[i]);
         }
+        atomic_long_sub(p->total_numa_faults, &my_grp->total_faults);
+        atomic_long_add(p->total_numa_faults, &grp->total_faults);
 
         double_lock(&my_grp->lock, &grp->lock);
 
@@ -1347,6 +1423,8 @@ void task_numa_free(struct task_struct *p)
                 for (i = 0; i < 2*nr_node_ids; i++)
                         atomic_long_sub(p->numa_faults[i], &grp->faults[i]);
 
+                atomic_long_sub(p->total_numa_faults, &grp->total_faults);
+
                 spin_lock(&grp->lock);
                 list_del(&p->numa_entry);
                 grp->nr_tasks--;
@@ -1385,6 +1463,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 
                 BUG_ON(p->numa_faults_buffer);
                 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+                p->total_numa_faults = 0;
         }
 
         /*
@@ -4572,12 +4651,17 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
         src_nid = cpu_to_node(env->src_cpu);
         dst_nid = cpu_to_node(env->dst_cpu);
 
-        if (src_nid == dst_nid ||
-            p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+        if (src_nid == dst_nid)
                 return false;
 
-        if (dst_nid == p->numa_preferred_nid ||
-            task_faults(p, dst_nid) > task_faults(p, src_nid))
+        /* Always encourage migration to the preferred node. */
+        if (dst_nid == p->numa_preferred_nid)
+                return true;
+
+        /* After the task has settled, check if the new node is better. */
+        if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count &&
+            task_weight(p, dst_nid) + group_weight(p, dst_nid) >
+            task_weight(p, src_nid) + group_weight(p, src_nid))
                 return true;
 
         return false;
@@ -4597,11 +4681,17 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
         src_nid = cpu_to_node(env->src_cpu);
         dst_nid = cpu_to_node(env->dst_cpu);
 
-        if (src_nid == dst_nid ||
-            p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+        if (src_nid == dst_nid)
                 return false;
 
-        if (task_faults(p, dst_nid) < task_faults(p, src_nid))
+        /* Migrating away from the preferred node is always bad. */
+        if (src_nid == p->numa_preferred_nid)
+                return true;
+
+        /* After the task has settled, check if the new node is worse. */
+        if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count &&
+            task_weight(p, dst_nid) + group_weight(p, dst_nid) <
+            task_weight(p, src_nid) + group_weight(p, src_nid))
                 return true;
 
         return false;