path: root/kernel
author    Mel Gorman <mgorman@suse.de>      2013-10-07 06:29:27 -0400
committer Ingo Molnar <mingo@kernel.org>    2013-10-09 08:47:58 -0400
commit    83e1d2cd9eabec5164afea295ff06b941ae8e4a9 (patch)
tree      f1f23d5483b00be3ce851c941de72ea52d6f7a4b /kernel
parent    5e1576ed0e54d419286a8096133029062b6ad456 (diff)
sched/numa: Use group fault statistics in numa placement
This patch uses the fraction of faults on a particular node for both task
and group, to figure out the best node to place a task. If the task and
group statistics disagree on what the preferred node should be then a full
rescan will select the node with the best combined weight.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-50-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
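[Editor's note] Before the diff itself, a minimal standalone sketch of the scoring this commit introduces. This is not kernel code: plain arrays and hypothetical fault counts stand in for p->numa_faults and the group's atomic counters, and the two-node setup is made up. It only illustrates how the 1000x task fraction and the larger 1200x group fraction (mirroring task_weight()/group_weight() in the diff below) combine, so a node favoured by the group can win even when the task's own faults point elsewhere.

/* placement_score.c -- illustrative sketch only, not the kernel's code. */
#include <stdio.h>

#define NR_NODES 2

/* Hypothetical per-node fault counts for one task and its numa group. */
static const unsigned long task_faults_by_node[NR_NODES]  = { 600, 400 };
static const unsigned long group_faults_by_node[NR_NODES] = { 200, 800 };

/* Combined score: task fraction scaled by 1000, group fraction by 1200. */
static unsigned long node_score(int nid)
{
	unsigned long task_total = 0, group_total = 0;
	int n;

	for (n = 0; n < NR_NODES; n++) {
		task_total  += task_faults_by_node[n];
		group_total += group_faults_by_node[n];
	}
	if (!task_total || !group_total)
		return 0;

	return 1000 * task_faults_by_node[nid]  / task_total +
	       1200 * group_faults_by_node[nid] / group_total;
}

int main(void)
{
	int nid, best = -1;
	unsigned long best_score = 0;

	for (nid = 0; nid < NR_NODES; nid++) {
		unsigned long score = node_score(nid);

		printf("node %d: score %lu\n", nid, score);
		if (score > best_score) {
			best_score = score;
			best = nid;
		}
	}
	/* node 0: 600 + 240 = 840; node 1: 400 + 960 = 1360.
	 * The task alone prefers node 0, but the group's faults pull the
	 * preferred node to node 1 -- the case the patch's rescan handles. */
	printf("preferred node: %d\n", best);
	return 0;
}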
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/fair.c  | 124
1 file changed, 107 insertions(+), 17 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 35661b8afb4e..4c40e13310e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -897,6 +897,7 @@ struct numa_group {
 	struct list_head task_list;
 
 	struct rcu_head rcu;
+	atomic_long_t total_faults;
 	atomic_long_t faults[0];
 };
 
@@ -919,6 +920,51 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
 		p->numa_faults[task_faults_idx(nid, 1)];
 }
 
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+	if (!p->numa_group)
+		return 0;
+
+	return atomic_long_read(&p->numa_group->faults[2*nid]) +
+	       atomic_long_read(&p->numa_group->faults[2*nid+1]);
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node. The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid)
+{
+	unsigned long total_faults;
+
+	if (!p->numa_faults)
+		return 0;
+
+	total_faults = p->total_numa_faults;
+
+	if (!total_faults)
+		return 0;
+
+	return 1000 * task_faults(p, nid) / total_faults;
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid)
+{
+	unsigned long total_faults;
+
+	if (!p->numa_group)
+		return 0;
+
+	total_faults = atomic_long_read(&p->numa_group->total_faults);
+
+	if (!total_faults)
+		return 0;
+
+	return 1200 * group_faults(p, nid) / total_faults;
+}
+
 static unsigned long weighted_cpuload(const int cpu);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
@@ -1018,8 +1064,10 @@ static void task_numa_compare(struct task_numa_env *env, long imp)
 		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
 			goto unlock;
 
-		imp += task_faults(cur, env->src_nid) -
-		       task_faults(cur, env->dst_nid);
+		imp += task_weight(cur, env->src_nid) +
+		       group_weight(cur, env->src_nid) -
+		       task_weight(cur, env->dst_nid) -
+		       group_weight(cur, env->dst_nid);
 	}
 
 	if (imp < env->best_imp)
@@ -1098,7 +1146,7 @@ static int task_numa_migrate(struct task_struct *p)
 		.best_cpu = -1
 	};
 	struct sched_domain *sd;
-	unsigned long faults;
+	unsigned long weight;
 	int nid, ret;
 	long imp;
 
@@ -1115,10 +1163,10 @@ static int task_numa_migrate(struct task_struct *p)
 	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
 	rcu_read_unlock();
 
-	faults = task_faults(p, env.src_nid);
+	weight = task_weight(p, env.src_nid) + group_weight(p, env.src_nid);
 	update_numa_stats(&env.src_stats, env.src_nid);
 	env.dst_nid = p->numa_preferred_nid;
-	imp = task_faults(env.p, env.dst_nid) - faults;
+	imp = task_weight(p, env.dst_nid) + group_weight(p, env.dst_nid) - weight;
 	update_numa_stats(&env.dst_stats, env.dst_nid);
 
 	/* If the preferred nid has capacity, try to use it. */
@@ -1131,8 +1179,8 @@ static int task_numa_migrate(struct task_struct *p)
 		if (nid == env.src_nid || nid == p->numa_preferred_nid)
 			continue;
 
-		/* Only consider nodes that recorded more faults */
-		imp = task_faults(env.p, nid) - faults;
+		/* Only consider nodes where both task and groups benefit */
+		imp = task_weight(p, nid) + group_weight(p, nid) - weight;
 		if (imp < 0)
 			continue;
 
@@ -1183,8 +1231,8 @@ static void numa_migrate_preferred(struct task_struct *p)
 
 static void task_numa_placement(struct task_struct *p)
 {
-	int seq, nid, max_nid = -1;
-	unsigned long max_faults = 0;
+	int seq, nid, max_nid = -1, max_group_nid = -1;
+	unsigned long max_faults = 0, max_group_faults = 0;
 
 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
 	if (p->numa_scan_seq == seq)
@@ -1195,7 +1243,7 @@ static void task_numa_placement(struct task_struct *p)
 
 	/* Find the node with the highest number of faults */
 	for_each_online_node(nid) {
-		unsigned long faults = 0;
+		unsigned long faults = 0, group_faults = 0;
 		int priv, i;
 
 		for (priv = 0; priv < 2; priv++) {
@@ -1211,9 +1259,12 @@ static void task_numa_placement(struct task_struct *p)
 
 			faults += p->numa_faults[i];
 			diff += p->numa_faults[i];
+			p->total_numa_faults += diff;
 			if (p->numa_group) {
 				/* safe because we can only change our own group */
 				atomic_long_add(diff, &p->numa_group->faults[i]);
+				atomic_long_add(diff, &p->numa_group->total_faults);
+				group_faults += atomic_long_read(&p->numa_group->faults[i]);
 			}
 		}
 
@@ -1221,6 +1272,27 @@ static void task_numa_placement(struct task_struct *p)
 			max_faults = faults;
 			max_nid = nid;
 		}
+
+		if (group_faults > max_group_faults) {
+			max_group_faults = group_faults;
+			max_group_nid = nid;
+		}
+	}
+
+	/*
+	 * If the preferred task and group nids are different,
+	 * iterate over the nodes again to find the best place.
+	 */
+	if (p->numa_group && max_nid != max_group_nid) {
+		unsigned long weight, max_weight = 0;
+
+		for_each_online_node(nid) {
+			weight = task_weight(p, nid) + group_weight(p, nid);
+			if (weight > max_weight) {
+				max_weight = weight;
+				max_nid = nid;
+			}
+		}
 	}
 
 	/* Preferred node as the node with the most faults */
@@ -1276,6 +1348,8 @@ static void task_numa_group(struct task_struct *p, int cpupid)
 	for (i = 0; i < 2*nr_node_ids; i++)
 		atomic_long_set(&grp->faults[i], p->numa_faults[i]);
 
+	atomic_long_set(&grp->total_faults, p->total_numa_faults);
+
 	list_add(&p->numa_entry, &grp->task_list);
 	grp->nr_tasks++;
 	rcu_assign_pointer(p->numa_group, grp);
@@ -1323,6 +1397,8 @@ unlock:
 		atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]);
 		atomic_long_add(p->numa_faults[i], &grp->faults[i]);
 	}
+	atomic_long_sub(p->total_numa_faults, &my_grp->total_faults);
+	atomic_long_add(p->total_numa_faults, &grp->total_faults);
 
 	double_lock(&my_grp->lock, &grp->lock);
 
@@ -1347,6 +1423,8 @@ void task_numa_free(struct task_struct *p)
 	for (i = 0; i < 2*nr_node_ids; i++)
 		atomic_long_sub(p->numa_faults[i], &grp->faults[i]);
 
+	atomic_long_sub(p->total_numa_faults, &grp->total_faults);
+
 	spin_lock(&grp->lock);
 	list_del(&p->numa_entry);
 	grp->nr_tasks--;
@@ -1385,6 +1463,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 
 		BUG_ON(p->numa_faults_buffer);
 		p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+		p->total_numa_faults = 0;
 	}
 
 	/*
@@ -4572,12 +4651,17 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 	src_nid = cpu_to_node(env->src_cpu);
 	dst_nid = cpu_to_node(env->dst_cpu);
 
-	if (src_nid == dst_nid ||
-	    p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+	if (src_nid == dst_nid)
 		return false;
 
-	if (dst_nid == p->numa_preferred_nid ||
-	    task_faults(p, dst_nid) > task_faults(p, src_nid))
+	/* Always encourage migration to the preferred node. */
+	if (dst_nid == p->numa_preferred_nid)
+		return true;
+
+	/* After the task has settled, check if the new node is better. */
+	if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count &&
+	    task_weight(p, dst_nid) + group_weight(p, dst_nid) >
+	    task_weight(p, src_nid) + group_weight(p, src_nid))
 		return true;
 
 	return false;
@@ -4597,11 +4681,17 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	src_nid = cpu_to_node(env->src_cpu);
 	dst_nid = cpu_to_node(env->dst_cpu);
 
-	if (src_nid == dst_nid ||
-	    p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+	if (src_nid == dst_nid)
 		return false;
 
-	if (task_faults(p, dst_nid) < task_faults(p, src_nid))
+	/* Migrating away from the preferred node is always bad. */
+	if (src_nid == p->numa_preferred_nid)
+		return true;
+
+	/* After the task has settled, check if the new node is worse. */
+	if (p->numa_migrate_seq >= sysctl_numa_balancing_settle_count &&
+	    task_weight(p, dst_nid) + group_weight(p, dst_nid) <
+	    task_weight(p, src_nid) + group_weight(p, src_nid))
 		return true;
 
 	return false;