author		Peter Zijlstra <peterz@infradead.org>	2013-10-07 06:29:21 -0400
committer	Ingo Molnar <mingo@kernel.org>		2013-10-09 08:47:47 -0400
commit		8c8a743c5087bac9caac8155b8f3b367e75cdd0b (patch)
tree		d5cc5c5f4368cf7d3deb627388c869dbea2e83f2
parent		90572890d202527c366aa9489b32404e88a7c020 (diff)
sched/numa: Use {cpu, pid} to create task groups for shared faults
While parallel applications tend to align their data on the cache
boundary, they tend not to align on the page or THP boundary.
Consequently tasks that partition their data can still "false-share"
pages, presenting a problem for optimal NUMA placement.

This patch uses NUMA hinting faults to chain tasks together into
numa_groups. As well as storing the NID a task was running on when
accessing a page, a truncated representation of the faulting PID is
stored. If subsequent faults are from different PIDs it is reasonable
to assume that those two tasks share a page and are candidates for
being grouped together.

Note that this patch makes no scheduling decisions based on the
grouping information.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1381141781-10992-44-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
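[Editor's illustration] To make the private-vs-shared classification above concrete, the following is a minimal stand-alone sketch of the idea in user-space C. It is not the kernel's actual cpupid/page-flags encoding: the 8-bit truncated PID mirrors LAST__PID_SHIFT, while all other names, widths and helpers here are assumptions made purely for illustration.

/* illustration only -- not kernel code */
#include <stdbool.h>
#include <stdio.h>

#define PID_BITS 8                        /* mirrors LAST__PID_SHIFT (assumed) */
#define PID_MASK ((1 << PID_BITS) - 1)    /* mirrors LAST__PID_MASK (assumed)  */

/* Pack the cpu and a truncated pid of the last task to fault on a page. */
static int make_cpupid(int cpu, int pid)
{
	return (cpu << PID_BITS) | (pid & PID_MASK);
}

static int cpupid_to_pid(int cpupid)
{
	return cpupid & PID_MASK;
}

/* A fault is "private" if the same (truncated) pid touched the page last. */
static bool fault_is_private(int task_pid, int last_cpupid)
{
	return (task_pid & PID_MASK) == cpupid_to_pid(last_cpupid);
}

int main(void)
{
	int last_cpupid = make_cpupid(2, 1234);  /* pid 1234 faulted last, on cpu 2 */

	/* The same task faults again: treated as a private access. */
	printf("pid 1234 -> %s\n",
	       fault_is_private(1234, last_cpupid) ? "private" : "shared");

	/* A different task faults on the same page: a shared access, making the
	 * two tasks candidates for the same numa_group. */
	printf("pid 5678 -> %s\n",
	       fault_is_private(5678, last_cpupid) ? "private" : "shared");

	return 0;
}

Because only the low bits of the PID are kept, unrelated PIDs can occasionally alias and be misclassified as private; the grouping heuristic tolerates such occasional misses.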
-rw-r--r--   include/linux/mm.h      11
-rw-r--r--   include/linux/sched.h    3
-rw-r--r--   kernel/sched/core.c      3
-rw-r--r--   kernel/sched/fair.c    165
-rw-r--r--   kernel/sched/sched.h     5
-rw-r--r--   mm/memory.c              8

6 files changed, 182 insertions, 13 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ce464cd4777e..81443d557a2e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -691,6 +691,12 @@ static inline bool cpupid_cpu_unset(int cpupid)
 	return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK);
 }
 
+static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
+{
+	return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
+}
+
+#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
@@ -760,6 +766,11 @@ static inline bool cpupid_pid_unset(int cpupid)
 static inline void page_cpupid_reset_last(struct page *page)
 {
 }
+
+static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
+{
+	return false;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static inline struct zone *page_zone(const struct page *page)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b6619792bb13..f587ded5c148 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1347,6 +1347,9 @@ struct task_struct {
 	u64 node_stamp;			/* migration stamp */
 	struct callback_head numa_work;
 
+	struct list_head numa_entry;
+	struct numa_group *numa_group;
+
 	/*
 	 * Exponential decaying average of faults on a per-node basis.
 	 * Scheduling placement decisions are made based on the these counts.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1fe59da280e3..51092d5cc64c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1733,6 +1733,9 @@ static void __sched_fork(struct task_struct *p)
 	p->numa_work.next = &p->numa_work;
 	p->numa_faults = NULL;
 	p->numa_faults_buffer = NULL;
+
+	INIT_LIST_HEAD(&p->numa_entry);
+	p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dbe0f628efa3..85565053a6ed 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -888,6 +888,17 @@ static unsigned int task_scan_max(struct task_struct *p)
  */
 unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
 
+struct numa_group {
+	atomic_t refcount;
+
+	spinlock_t lock; /* nr_tasks, tasks */
+	int nr_tasks;
+	struct list_head task_list;
+
+	struct rcu_head rcu;
+	atomic_long_t faults[0];
+};
+
 static inline int task_faults_idx(int nid, int priv)
 {
 	return 2 * nid + priv;
@@ -1182,7 +1193,10 @@ static void task_numa_placement(struct task_struct *p)
 		int priv, i;
 
 		for (priv = 0; priv < 2; priv++) {
+			long diff;
+
 			i = task_faults_idx(nid, priv);
+			diff = -p->numa_faults[i];
 
 			/* Decay existing window, copy faults since last scan */
 			p->numa_faults[i] >>= 1;
@@ -1190,6 +1204,11 @@ static void task_numa_placement(struct task_struct *p)
 			p->numa_faults_buffer[i] = 0;
 
 			faults += p->numa_faults[i];
+			diff += p->numa_faults[i];
+			if (p->numa_group) {
+				/* safe because we can only change our own group */
+				atomic_long_add(diff, &p->numa_group->faults[i]);
+			}
 		}
 
 		if (faults > max_faults) {
@@ -1207,6 +1226,131 @@ static void task_numa_placement(struct task_struct *p)
 	}
 }
 
+static inline int get_numa_group(struct numa_group *grp)
+{
+	return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+	if (atomic_dec_and_test(&grp->refcount))
+		kfree_rcu(grp, rcu);
+}
+
+static void double_lock(spinlock_t *l1, spinlock_t *l2)
+{
+	if (l1 > l2)
+		swap(l1, l2);
+
+	spin_lock(l1);
+	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static void task_numa_group(struct task_struct *p, int cpupid)
+{
+	struct numa_group *grp, *my_grp;
+	struct task_struct *tsk;
+	bool join = false;
+	int cpu = cpupid_to_cpu(cpupid);
+	int i;
+
+	if (unlikely(!p->numa_group)) {
+		unsigned int size = sizeof(struct numa_group) +
+				    2*nr_node_ids*sizeof(atomic_long_t);
+
+		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+		if (!grp)
+			return;
+
+		atomic_set(&grp->refcount, 1);
+		spin_lock_init(&grp->lock);
+		INIT_LIST_HEAD(&grp->task_list);
+
+		for (i = 0; i < 2*nr_node_ids; i++)
+			atomic_long_set(&grp->faults[i], p->numa_faults[i]);
+
+		list_add(&p->numa_entry, &grp->task_list);
+		grp->nr_tasks++;
+		rcu_assign_pointer(p->numa_group, grp);
+	}
+
+	rcu_read_lock();
+	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+
+	if (!cpupid_match_pid(tsk, cpupid))
+		goto unlock;
+
+	grp = rcu_dereference(tsk->numa_group);
+	if (!grp)
+		goto unlock;
+
+	my_grp = p->numa_group;
+	if (grp == my_grp)
+		goto unlock;
+
+	/*
+	 * Only join the other group if its bigger; if we're the bigger group,
+	 * the other task will join us.
+	 */
+	if (my_grp->nr_tasks > grp->nr_tasks)
+		goto unlock;
+
+	/*
+	 * Tie-break on the grp address.
+	 */
+	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+		goto unlock;
+
+	if (!get_numa_group(grp))
+		goto unlock;
+
+	join = true;
+
+unlock:
+	rcu_read_unlock();
+
+	if (!join)
+		return;
+
+	for (i = 0; i < 2*nr_node_ids; i++) {
+		atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]);
+		atomic_long_add(p->numa_faults[i], &grp->faults[i]);
+	}
+
+	double_lock(&my_grp->lock, &grp->lock);
+
+	list_move(&p->numa_entry, &grp->task_list);
+	my_grp->nr_tasks--;
+	grp->nr_tasks++;
+
+	spin_unlock(&my_grp->lock);
+	spin_unlock(&grp->lock);
+
+	rcu_assign_pointer(p->numa_group, grp);
+
+	put_numa_group(my_grp);
+}
+
+void task_numa_free(struct task_struct *p)
+{
+	struct numa_group *grp = p->numa_group;
+	int i;
+
+	if (grp) {
+		for (i = 0; i < 2*nr_node_ids; i++)
+			atomic_long_sub(p->numa_faults[i], &grp->faults[i]);
+
+		spin_lock(&grp->lock);
+		list_del(&p->numa_entry);
+		grp->nr_tasks--;
+		spin_unlock(&grp->lock);
+		rcu_assign_pointer(p->numa_group, NULL);
+		put_numa_group(grp);
+	}
+
+	kfree(p->numa_faults);
+}
+
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
@@ -1222,15 +1366,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated)
 	if (!p->mm)
 		return;
 
-	/*
-	 * First accesses are treated as private, otherwise consider accesses
-	 * to be private if the accessing pid has not changed
-	 */
-	if (!cpupid_pid_unset(last_cpupid))
-		priv = ((p->pid & LAST__PID_MASK) == cpupid_to_pid(last_cpupid));
-	else
-		priv = 1;
-
 	/* Allocate buffer to track faults on a per-node basis */
 	if (unlikely(!p->numa_faults)) {
 		int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
@@ -1245,6 +1380,18 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated)
 	}
 
 	/*
+	 * First accesses are treated as private, otherwise consider accesses
+	 * to be private if the accessing pid has not changed
+	 */
+	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+		priv = 1;
+	} else {
+		priv = cpupid_match_pid(p, last_cpupid);
+		if (!priv)
+			task_numa_group(p, last_cpupid);
+	}
+
+	/*
 	 * If pages are properly placed (did not migrate) then scan slower.
 	 * This is reset periodically in case of phase changes
 	 */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 691e96964dcc..8037b10a256f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -559,10 +559,7 @@ static inline u64 rq_clock_task(struct rq *rq)
 #ifdef CONFIG_NUMA_BALANCING
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);
-static inline void task_numa_free(struct task_struct *p)
-{
-	kfree(p->numa_faults);
-}
+extern void task_numa_free(struct task_struct *p);
 #else /* CONFIG_NUMA_BALANCING */
 static inline void task_numa_free(struct task_struct *p)
 {
diff --git a/mm/memory.c b/mm/memory.c
index 5162e6d0d652..c57efa25cdbb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2719,6 +2719,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		get_page(dirty_page);
 
 reuse:
+		/*
+		 * Clear the pages cpupid information as the existing
+		 * information potentially belongs to a now completely
+		 * unrelated process.
+		 */
+		if (old_page)
+			page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
+
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = pte_mkyoung(orig_pte);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);