author     Rik van Riel <riel@redhat.com>      2013-10-07 06:29:36 -0400
committer  Ingo Molnar <mingo@kernel.org>      2013-10-09 08:48:16 -0400
commit     04bb2f9475054298f0c67a89ca92cade42d3fe5e (patch)
tree       ab48887e23b7f820380a3f415cbe0a6f64f7fecc
parent     3e6a9418cf05638b103e34f5d13be0321872e623 (diff)
sched/numa: Adjust scan rate in task_numa_placement
Adjust numa_scan_period in task_numa_placement, depending on how much
useful work the numa code can do. The more local faults there are in a
given scan window, the longer the period (and hence the slower the scan
rate) during the next window. If there are excessive shared faults then
the scan period will decrease, with the amount of scaling depending on
the ratio of shared to private faults. If the preferred node changes
then the scan rate is reset to recheck whether the task is properly
placed.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-59-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
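[Editorial illustration] The core of the change is the new update_task_scan_period() helper added to kernel/sched/fair.c below. As a rough, userspace-only sketch of its arithmetic (not kernel code), the following program re-expresses the period adjustment: the helper names, the 1000ms/60000ms bounds standing in for task_scan_min()/task_scan_max(), and the sample fault counts are all hypothetical, and the sketch omits the mm->numa_next_scan update and the numa_faults_locality reset that the real helper also performs.

/* scan_period_sketch.c - illustrative sketch only, not from the patch. */
#include <stdio.h>

#define NUMA_PERIOD_SLOTS	10
#define NUMA_PERIOD_THRESHOLD	3

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Hypothetical stand-ins for task_scan_min()/task_scan_max(), in msec. */
static const long scan_period_min = 1000;
static const long scan_period_max = 60000;

static long clamp_period(long v, long lo, long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/*
 * Mirror the arithmetic of update_task_scan_period(): mostly-local fault
 * windows lengthen the scan period, mostly-remote windows shorten it, and
 * the shortening is damped when most faults are shared rather than private.
 */
static long next_scan_period(long period, long remote, long local,
			     long shared, long private)
{
	long period_slot, ratio, diff;

	/* No hinting faults recorded at all: back off (double the period). */
	if (local + shared == 0)
		return clamp_period(period << 1, scan_period_min, scan_period_max);

	period_slot = DIV_ROUND_UP(period, NUMA_PERIOD_SLOTS);
	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	if (ratio >= NUMA_PERIOD_THRESHOLD) {
		/* Mostly local: slow scanning down. */
		long slot = ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else {
		/* Mostly remote: speed up, scaled by the private fault share. */
		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS,
				     private + shared);
		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
	}

	return clamp_period(period + diff, scan_period_min, scan_period_max);
}

int main(void)
{
	/* 90% local, mostly private: 2000ms grows to 3200ms. */
	printf("%ld\n", next_scan_period(2000, 100, 900, 100, 900));
	/* 10% local, mostly private: 2000ms shrinks to 1640ms. */
	printf("%ld\n", next_scan_period(2000, 900, 100, 100, 900));
	/* No faults recorded in the window: 2000ms doubles to 4000ms. */
	printf("%ld\n", next_scan_period(2000, 0, 0, 0, 0));
	return 0;
}

With NUMA_PERIOD_THRESHOLD at 3 of NUMA_PERIOD_SLOTS (10), a window with roughly 30% or more local faults lengthens the period, anything below that shortens it, and the shortening is scaled down when the faults are mostly shared.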
-rw-r--r--  include/linux/sched.h |   9
-rw-r--r--  kernel/sched/fair.c   | 112
-rw-r--r--  mm/huge_memory.c      |   4
-rw-r--r--  mm/memory.c           |   9
4 files changed, 105 insertions(+), 29 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 59f953b2e413..2292f6c1596f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1365,6 +1365,14 @@ struct task_struct {
 	 */
 	unsigned long *numa_faults_buffer;
 
+	/*
+	 * numa_faults_locality tracks if faults recorded during the last
+	 * scan window were remote/local. The task scan period is adapted
+	 * based on the locality of the faults with different weights
+	 * depending on whether they were shared or private faults
+	 */
+	unsigned long numa_faults_locality[2];
+
 	int numa_preferred_nid;
 	unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
@@ -1455,6 +1463,7 @@ struct task_struct {
 #define TNF_MIGRATED	0x01
 #define TNF_NO_GROUP	0x02
 #define TNF_SHARED	0x04
+#define TNF_FAULT_LOCAL	0x08
 
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d26a16e45437..66237ff8b01e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1241,6 +1241,12 @@ static int task_numa_migrate(struct task_struct *p)
 
 	sched_setnuma(p, env.dst_nid);
 
+	/*
+	 * Reset the scan period if the task is being rescheduled on an
+	 * alternative node to recheck if the task is now properly placed.
+	 */
+	p->numa_scan_period = task_scan_min(p);
+
 	if (env.best_task == NULL) {
 		int ret = migrate_task_to(p, env.best_cpu);
 		return ret;
@@ -1276,10 +1282,86 @@ static void numa_migrate_preferred(struct task_struct *p)
 	p->numa_migrate_retry = jiffies + HZ*5;
 }
 
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/remote ratio is below
+ * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
+ * scan period will decrease
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 3
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+			unsigned long shared, unsigned long private)
+{
+	unsigned int period_slot;
+	int ratio;
+	int diff;
+
+	unsigned long remote = p->numa_faults_locality[0];
+	unsigned long local = p->numa_faults_locality[1];
+
+	/*
+	 * If there were no record hinting faults then either the task is
+	 * completely idle or all activity is areas that are not of interest
+	 * to automatic numa balancing. Scan slower
+	 */
+	if (local + shared == 0) {
+		p->numa_scan_period = min(p->numa_scan_period_max,
+			p->numa_scan_period << 1);
+
+		p->mm->numa_next_scan = jiffies +
+			msecs_to_jiffies(p->numa_scan_period);
+
+		return;
+	}
+
+	/*
+	 * Prepare to scale scan period relative to the current period.
+	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
+	 *	 <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+	 */
+	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+	if (ratio >= NUMA_PERIOD_THRESHOLD) {
+		int slot = ratio - NUMA_PERIOD_THRESHOLD;
+		if (!slot)
+			slot = 1;
+		diff = slot * period_slot;
+	} else {
+		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+		/*
+		 * Scale scan rate increases based on sharing. There is an
+		 * inverse relationship between the degree of sharing and
+		 * the adjustment made to the scanning period. Broadly
+		 * speaking the intent is that there is little point
+		 * scanning faster if shared accesses dominate as it may
+		 * simply bounce migrations uselessly
+		 */
+		period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
+		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+	}
+
+	p->numa_scan_period = clamp(p->numa_scan_period + diff,
+			task_scan_min(p), task_scan_max(p));
+	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
 static void task_numa_placement(struct task_struct *p)
 {
 	int seq, nid, max_nid = -1, max_group_nid = -1;
 	unsigned long max_faults = 0, max_group_faults = 0;
+	unsigned long fault_types[2] = { 0, 0 };
 	spinlock_t *group_lock = NULL;
 
 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1309,6 +1391,7 @@ static void task_numa_placement(struct task_struct *p)
 			/* Decay existing window, copy faults since last scan */
 			p->numa_faults[i] >>= 1;
 			p->numa_faults[i] += p->numa_faults_buffer[i];
+			fault_types[priv] += p->numa_faults_buffer[i];
 			p->numa_faults_buffer[i] = 0;
 
 			faults += p->numa_faults[i];
@@ -1333,6 +1416,8 @@ static void task_numa_placement(struct task_struct *p)
 		}
 	}
 
+	update_task_scan_period(p, fault_types[0], fault_types[1]);
+
 	if (p->numa_group) {
 		/*
 		 * If the preferred task and group nids are different,
@@ -1538,6 +1623,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 		BUG_ON(p->numa_faults_buffer);
 		p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
 		p->total_numa_faults = 0;
+		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 	}
 
 	/*
@@ -1552,19 +1638,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 		task_numa_group(p, last_cpupid, flags, &priv);
 	}
 
-	/*
-	 * If pages are properly placed (did not migrate) then scan slower.
-	 * This is reset periodically in case of phase changes
-	 */
-	if (!migrated) {
-		/* Initialise if necessary */
-		if (!p->numa_scan_period_max)
-			p->numa_scan_period_max = task_scan_max(p);
-
-		p->numa_scan_period = min(p->numa_scan_period_max,
-			p->numa_scan_period + 10);
-	}
-
 	task_numa_placement(p);
 
 	/* Retry task to preferred node migration if it previously failed */
@@ -1575,6 +1648,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 	p->numa_pages_migrated += pages;
 
 	p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
 }
 
 static void reset_ptenuma_scan(struct task_struct *p)
@@ -1702,18 +1776,6 @@ void task_numa_work(struct callback_head *work)
 
 out:
 	/*
-	 * If the whole process was scanned without updates then no NUMA
-	 * hinting faults are being recorded and scan rate should be lower.
-	 */
-	if (mm->numa_scan_offset == 0 && !nr_pte_updates) {
-		p->numa_scan_period = min(p->numa_scan_period_max,
-			p->numa_scan_period << 1);
-
-		next_scan = now + msecs_to_jiffies(p->numa_scan_period);
-		mm->numa_next_scan = next_scan;
-	}
-
-	/*
 	 * It is possible to reach the end of the VMA list but the last few
 	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
 	 * would find the !migratable VMA on the next scan but not reset the
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7ab4e32afe12..1be2a1f95b61 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1296,8 +1296,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page_nid = page_to_nid(page);
 	last_cpupid = page_cpupid_last(page);
 	count_vm_numa_event(NUMA_HINT_FAULTS);
-	if (page_nid == this_nid)
+	if (page_nid == this_nid) {
 		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+		flags |= TNF_FAULT_LOCAL;
+	}
 
 	/*
 	 * Avoid grouping on DSO/COW pages in specific and RO pages
diff --git a/mm/memory.c b/mm/memory.c
index 823720c43ea9..1c7501f7fb1a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3527,13 +3527,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
-				unsigned long addr, int page_nid)
+				unsigned long addr, int page_nid,
+				int *flags)
 {
 	get_page(page);
 
 	count_vm_numa_event(NUMA_HINT_FAULTS);
-	if (page_nid == numa_node_id())
+	if (page_nid == numa_node_id()) {
 		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+		*flags |= TNF_FAULT_LOCAL;
+	}
 
 	return mpol_misplaced(page, vma, addr);
 }
@@ -3593,7 +3596,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	last_cpupid = page_cpupid_last(page);
 	page_nid = page_to_nid(page);
-	target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+	target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
 	pte_unmap_unlock(ptep, ptl);
 	if (target_nid == -1) {
 		put_page(page);