author     Linus Torvalds <torvalds@linux-foundation.org>   2013-11-11 20:20:12 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-11-11 20:20:12 -0500
commit     39cf275a1a18ba3c7eb9b986c5c9b35b57332798 (patch)
tree       40b119ca9d2fbaf8128d3fa25f4c64669002b0c0 /mm
parent     ad5d69899e52792671c1aa6c7360464c7edfe09c (diff)
parent     e5137b50a0640009fd63a3e65c14bc6e1be8796a (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The main changes in this cycle are:

   - (much) improved CONFIG_NUMA_BALANCING support from Mel Gorman, Rik
     van Riel, Peter Zijlstra et al.  Yay!

   - optimize preemption counter handling: merge the NEED_RESCHED flag
     into the preempt_count variable, by Peter Zijlstra.

   - wait.h fixes and code reorganization from Peter Zijlstra

   - cfs_bandwidth fixes from Ben Segall

   - SMP load-balancer cleanups from Peter Zijlstra

   - idle balancer improvements from Jason Low

   - other fixes and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (129 commits)
  ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
  stop_machine: Fix race between stop_two_cpus() and stop_cpus()
  sched: Remove unnecessary iteration over sched domains to update nr_busy_cpus
  sched: Fix asymmetric scheduling for POWER7
  sched: Move completion code from core.c to completion.c
  sched: Move wait code from core.c to wait.c
  sched: Move wait.c into kernel/sched/
  sched/wait: Fix __wait_event_interruptible_lock_irq_timeout()
  sched: Avoid throttle_cfs_rq() racing with period_timer stopping
  sched: Guarantee new group-entities always have weight
  sched: Fix hrtimer_cancel()/rq->lock deadlock
  sched: Fix cfs_bandwidth misuse of hrtimer_expires_remaining
  sched: Fix race on toggling cfs_bandwidth_used
  sched: Remove extra put_online_cpus() inside sched_setaffinity()
  sched/rt: Fix task_tick_rt() comment
  sched/wait: Fix build breakage
  sched/wait: Introduce prepare_to_wait_event()
  sched/wait: Add ___wait_cond_timeout() to wait_event*_timeout() too
  sched: Remove get_online_cpus() usage
  sched: Fix race in migrate_swap_stop()
  ...
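The preempt_count item in the list above lands outside mm/, but the idea behind it is simple enough to model in user space: keep the "need to reschedule" hint in the same word as the preemption depth, so the hot-path question "may we preempt right now?" becomes a single comparison. The sketch below is a toy model of that idea only; the flag value, helper names and driver are invented here, and the real kernel encoding is more involved (x86 additionally keeps the bit inverted so the test folds into the counter decrement).

    #include <stdbool.h>
    #include <stdio.h>

    #define NEED_RESCHED_BIT 0x80000000u   /* hypothetical flag bit */

    static unsigned int pc;                /* depth in the low bits, flag in the top bit */

    static void preempt_disable(void)   { pc++; }
    static void preempt_enable(void)    { pc--; }
    static void set_need_resched(void)  { pc |= NEED_RESCHED_BIT; }

    /* With the flag folded into the counter, "depth is zero AND a resched
     * was requested" is a single equality test. */
    static bool should_preempt(void)    { return pc == NEED_RESCHED_BIT; }

    int main(void)
    {
        set_need_resched();
        preempt_disable();
        printf("inside preempt-off section: %d\n", should_preempt()); /* 0 */
        preempt_enable();
        printf("after preempt_enable:       %d\n", should_preempt()); /* 1 */
        return 0;
    }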
Diffstat (limited to 'mm')
-rw-r--r--  mm/huge_memory.c   55
-rw-r--r--  mm/memory.c       139
-rw-r--r--  mm/mempolicy.c     82
-rw-r--r--  mm/migrate.c       30
-rw-r--r--  mm/mm_init.c       18
-rw-r--r--  mm/mmzone.c        14
-rw-r--r--  mm/mprotect.c      65
-rw-r--r--  mm/page_alloc.c     4
8 files changed, 218 insertions, 189 deletions
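Most of the churn in the diffs below comes from replacing the per-page "last NUMA node" tracking with a packed "last cpupid" value that records the CPU and the low pid bits of the last task to take a hinting fault on the page. As a rough, stand-alone illustration of what such a packed value carries, here is a sketch; the bit widths, helper names and driver are invented for the example (the kernel sizes the fields from NR_CPUS and keeps them inside page->flags when they fit).

    #include <stdio.h>

    #define CPU_BITS 12                         /* illustrative width */
    #define PID_BITS 8                          /* only the low pid bits fit */
    #define CPU_MASK ((1 << CPU_BITS) - 1)
    #define PID_MASK ((1 << PID_BITS) - 1)

    static int make_cpupid(int cpu, int pid)
    {
        return ((pid & PID_MASK) << CPU_BITS) | (cpu & CPU_MASK);
    }

    static int cpupid_to_cpu(int cpupid) { return cpupid & CPU_MASK; }
    static int cpupid_to_pid(int cpupid) { return (cpupid >> CPU_BITS) & PID_MASK; }

    int main(void)
    {
        int cpupid = make_cpupid(17, 4242);

        /* The node is recovered from the stored CPU (cpu_to_node() in the
         * kernel); the pid bits let a later fault decide whether the same
         * task is touching the page again. */
        printf("cpu = %d, pid (low bits) = %d\n",
               cpupid_to_cpu(cpupid), cpupid_to_pid(cpupid));
        return 0;
    }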
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cca80d96e509..2612f60f53ee 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1282,19 +1282,32 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1282 struct page *page; 1282 struct page *page;
1283 unsigned long haddr = addr & HPAGE_PMD_MASK; 1283 unsigned long haddr = addr & HPAGE_PMD_MASK;
1284 int page_nid = -1, this_nid = numa_node_id(); 1284 int page_nid = -1, this_nid = numa_node_id();
1285 int target_nid; 1285 int target_nid, last_cpupid = -1;
1286 bool page_locked; 1286 bool page_locked;
1287 bool migrated = false; 1287 bool migrated = false;
1288 int flags = 0;
1288 1289
1289 spin_lock(&mm->page_table_lock); 1290 spin_lock(&mm->page_table_lock);
1290 if (unlikely(!pmd_same(pmd, *pmdp))) 1291 if (unlikely(!pmd_same(pmd, *pmdp)))
1291 goto out_unlock; 1292 goto out_unlock;
1292 1293
1293 page = pmd_page(pmd); 1294 page = pmd_page(pmd);
1295 BUG_ON(is_huge_zero_page(page));
1294 page_nid = page_to_nid(page); 1296 page_nid = page_to_nid(page);
1297 last_cpupid = page_cpupid_last(page);
1295 count_vm_numa_event(NUMA_HINT_FAULTS); 1298 count_vm_numa_event(NUMA_HINT_FAULTS);
1296 if (page_nid == this_nid) 1299 if (page_nid == this_nid) {
1297 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 1300 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1301 flags |= TNF_FAULT_LOCAL;
1302 }
1303
1304 /*
1305 * Avoid grouping on DSO/COW pages in specific and RO pages
1306 * in general, RO pages shouldn't hurt as much anyway since
1307 * they can be in shared cache state.
1308 */
1309 if (!pmd_write(pmd))
1310 flags |= TNF_NO_GROUP;
1298 1311
1299 /* 1312 /*
1300 * Acquire the page lock to serialise THP migrations but avoid dropping 1313 * Acquire the page lock to serialise THP migrations but avoid dropping
@@ -1325,7 +1338,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1325 lock_page(page); 1338 lock_page(page);
1326 anon_vma = page_lock_anon_vma_read(page); 1339 anon_vma = page_lock_anon_vma_read(page);
1327 1340
1328 /* Confirm the PTE did not while locked */ 1341 /* Confirm the PMD did not change while page_table_lock was released */
1329 spin_lock(&mm->page_table_lock); 1342 spin_lock(&mm->page_table_lock);
1330 if (unlikely(!pmd_same(pmd, *pmdp))) { 1343 if (unlikely(!pmd_same(pmd, *pmdp))) {
1331 unlock_page(page); 1344 unlock_page(page);
@@ -1341,8 +1354,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1341 spin_unlock(&mm->page_table_lock); 1354 spin_unlock(&mm->page_table_lock);
1342 migrated = migrate_misplaced_transhuge_page(mm, vma, 1355 migrated = migrate_misplaced_transhuge_page(mm, vma,
1343 pmdp, pmd, addr, page, target_nid); 1356 pmdp, pmd, addr, page, target_nid);
1344 if (migrated) 1357 if (migrated) {
1358 flags |= TNF_MIGRATED;
1345 page_nid = target_nid; 1359 page_nid = target_nid;
1360 }
1346 1361
1347 goto out; 1362 goto out;
1348clear_pmdnuma: 1363clear_pmdnuma:
@@ -1360,7 +1375,7 @@ out:
1360 page_unlock_anon_vma_read(anon_vma); 1375 page_unlock_anon_vma_read(anon_vma);
1361 1376
1362 if (page_nid != -1) 1377 if (page_nid != -1)
1363 task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); 1378 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
1364 1379
1365 return 0; 1380 return 0;
1366} 1381}
@@ -1458,6 +1473,12 @@ out:
1458 return ret; 1473 return ret;
1459} 1474}
1460 1475
1476/*
1477 * Returns
1478 * - 0 if PMD could not be locked
1479 * - 1 if PMD was locked but protections unchange and TLB flush unnecessary
1480 * - HPAGE_PMD_NR is protections changed and TLB flush necessary
1481 */
1461int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1482int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1462 unsigned long addr, pgprot_t newprot, int prot_numa) 1483 unsigned long addr, pgprot_t newprot, int prot_numa)
1463{ 1484{
@@ -1466,22 +1487,34 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1466 1487
1467 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1488 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1468 pmd_t entry; 1489 pmd_t entry;
1469 entry = pmdp_get_and_clear(mm, addr, pmd); 1490 ret = 1;
1470 if (!prot_numa) { 1491 if (!prot_numa) {
1492 entry = pmdp_get_and_clear(mm, addr, pmd);
1471 entry = pmd_modify(entry, newprot); 1493 entry = pmd_modify(entry, newprot);
1494 ret = HPAGE_PMD_NR;
1472 BUG_ON(pmd_write(entry)); 1495 BUG_ON(pmd_write(entry));
1473 } else { 1496 } else {
1474 struct page *page = pmd_page(*pmd); 1497 struct page *page = pmd_page(*pmd);
1475 1498
1476 /* only check non-shared pages */ 1499 /*
1477 if (page_mapcount(page) == 1 && 1500 * Do not trap faults against the zero page. The
1501 * read-only data is likely to be read-cached on the
1502 * local CPU cache and it is less useful to know about
1503 * local vs remote hits on the zero page.
1504 */
1505 if (!is_huge_zero_page(page) &&
1478 !pmd_numa(*pmd)) { 1506 !pmd_numa(*pmd)) {
1507 entry = pmdp_get_and_clear(mm, addr, pmd);
1479 entry = pmd_mknuma(entry); 1508 entry = pmd_mknuma(entry);
1509 ret = HPAGE_PMD_NR;
1480 } 1510 }
1481 } 1511 }
1482 set_pmd_at(mm, addr, pmd, entry); 1512
1513 /* Set PMD if cleared earlier */
1514 if (ret == HPAGE_PMD_NR)
1515 set_pmd_at(mm, addr, pmd, entry);
1516
1483 spin_unlock(&vma->vm_mm->page_table_lock); 1517 spin_unlock(&vma->vm_mm->page_table_lock);
1484 ret = 1;
1485 } 1518 }
1486 1519
1487 return ret; 1520 return ret;
@@ -1662,7 +1695,7 @@ static void __split_huge_page_refcount(struct page *page,
1662 page_tail->mapping = page->mapping; 1695 page_tail->mapping = page->mapping;
1663 1696
1664 page_tail->index = page->index + i; 1697 page_tail->index = page->index + i;
1665 page_nid_xchg_last(page_tail, page_nid_last(page)); 1698 page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
1666 1699
1667 BUG_ON(!PageAnon(page_tail)); 1700 BUG_ON(!PageAnon(page_tail));
1668 BUG_ON(!PageUptodate(page_tail)); 1701 BUG_ON(!PageUptodate(page_tail));
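One visible pattern change in do_huge_pmd_numa_page() above: instead of handing a bare "migrated" boolean to task_numa_fault(), the handler now accumulates a small flags word describing the fault (local, read-only, migrated, and so on). A minimal stand-alone sketch of that pattern follows; the flag names mirror the TNF_* constants used in the diff, but the values, the accounting stub and the driver are illustrative.

    #include <stdio.h>

    #define TNF_MIGRATED     0x01   /* page was moved to the faulting node */
    #define TNF_NO_GROUP     0x02   /* don't use this fault for task grouping */
    #define TNF_SHARED       0x04   /* page is mapped by several processes */
    #define TNF_FAULT_LOCAL  0x08   /* page was already on the local node */

    /* Stand-in for task_numa_fault(): consumes everything in one call. */
    static void account_numa_fault(int last_cpupid, int node, int pages, int flags)
    {
        printf("node=%d pages=%d last_cpupid=%d%s%s%s%s\n",
               node, pages, last_cpupid,
               (flags & TNF_MIGRATED)    ? " migrated" : "",
               (flags & TNF_NO_GROUP)    ? " no-group" : "",
               (flags & TNF_SHARED)      ? " shared"   : "",
               (flags & TNF_FAULT_LOCAL) ? " local"    : "");
    }

    int main(void)
    {
        int flags = 0;
        int writable = 0, migrated = 1;

        if (!writable)          /* read-only mapping: don't group on it */
            flags |= TNF_NO_GROUP;
        if (migrated)
            flags |= TNF_MIGRATED;

        account_numa_fault(/*last_cpupid=*/-1, /*node=*/1, /*pages=*/512, flags);
        return 0;
    }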
diff --git a/mm/memory.c b/mm/memory.c
index d176154c243f..1f2287eaa88e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
69 69
70#include "internal.h" 70#include "internal.h"
71 71
72#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 72#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. 73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
74#endif 74#endif
75 75
76#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -2721,6 +2721,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2721 get_page(dirty_page); 2721 get_page(dirty_page);
2722 2722
2723reuse: 2723reuse:
2724 /*
2725 * Clear the pages cpupid information as the existing
2726 * information potentially belongs to a now completely
2727 * unrelated process.
2728 */
2729 if (old_page)
2730 page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
2731
2724 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2732 flush_cache_page(vma, address, pte_pfn(orig_pte));
2725 entry = pte_mkyoung(orig_pte); 2733 entry = pte_mkyoung(orig_pte);
2726 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2734 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3521,13 +3529,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3521} 3529}
3522 3530
3523int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3531int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3524 unsigned long addr, int page_nid) 3532 unsigned long addr, int page_nid,
3533 int *flags)
3525{ 3534{
3526 get_page(page); 3535 get_page(page);
3527 3536
3528 count_vm_numa_event(NUMA_HINT_FAULTS); 3537 count_vm_numa_event(NUMA_HINT_FAULTS);
3529 if (page_nid == numa_node_id()) 3538 if (page_nid == numa_node_id()) {
3530 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 3539 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3540 *flags |= TNF_FAULT_LOCAL;
3541 }
3531 3542
3532 return mpol_misplaced(page, vma, addr); 3543 return mpol_misplaced(page, vma, addr);
3533} 3544}
@@ -3538,8 +3549,10 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3538 struct page *page = NULL; 3549 struct page *page = NULL;
3539 spinlock_t *ptl; 3550 spinlock_t *ptl;
3540 int page_nid = -1; 3551 int page_nid = -1;
3552 int last_cpupid;
3541 int target_nid; 3553 int target_nid;
3542 bool migrated = false; 3554 bool migrated = false;
3555 int flags = 0;
3543 3556
3544 /* 3557 /*
3545 * The "pte" at this point cannot be used safely without 3558 * The "pte" at this point cannot be used safely without
@@ -3566,9 +3579,26 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3566 pte_unmap_unlock(ptep, ptl); 3579 pte_unmap_unlock(ptep, ptl);
3567 return 0; 3580 return 0;
3568 } 3581 }
3582 BUG_ON(is_zero_pfn(page_to_pfn(page)));
3569 3583
3584 /*
3585 * Avoid grouping on DSO/COW pages in specific and RO pages
3586 * in general, RO pages shouldn't hurt as much anyway since
3587 * they can be in shared cache state.
3588 */
3589 if (!pte_write(pte))
3590 flags |= TNF_NO_GROUP;
3591
3592 /*
3593 * Flag if the page is shared between multiple address spaces. This
3594 * is later used when determining whether to group tasks together
3595 */
3596 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3597 flags |= TNF_SHARED;
3598
3599 last_cpupid = page_cpupid_last(page);
3570 page_nid = page_to_nid(page); 3600 page_nid = page_to_nid(page);
3571 target_nid = numa_migrate_prep(page, vma, addr, page_nid); 3601 target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
3572 pte_unmap_unlock(ptep, ptl); 3602 pte_unmap_unlock(ptep, ptl);
3573 if (target_nid == -1) { 3603 if (target_nid == -1) {
3574 put_page(page); 3604 put_page(page);
@@ -3576,102 +3606,17 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3576 } 3606 }
3577 3607
3578 /* Migrate to the requested node */ 3608 /* Migrate to the requested node */
3579 migrated = migrate_misplaced_page(page, target_nid); 3609 migrated = migrate_misplaced_page(page, vma, target_nid);
3580 if (migrated) 3610 if (migrated) {
3581 page_nid = target_nid; 3611 page_nid = target_nid;
3612 flags |= TNF_MIGRATED;
3613 }
3582 3614
3583out: 3615out:
3584 if (page_nid != -1) 3616 if (page_nid != -1)
3585 task_numa_fault(page_nid, 1, migrated); 3617 task_numa_fault(last_cpupid, page_nid, 1, flags);
3586 return 0;
3587}
3588
3589/* NUMA hinting page fault entry point for regular pmds */
3590#ifdef CONFIG_NUMA_BALANCING
3591static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3592 unsigned long addr, pmd_t *pmdp)
3593{
3594 pmd_t pmd;
3595 pte_t *pte, *orig_pte;
3596 unsigned long _addr = addr & PMD_MASK;
3597 unsigned long offset;
3598 spinlock_t *ptl;
3599 bool numa = false;
3600
3601 spin_lock(&mm->page_table_lock);
3602 pmd = *pmdp;
3603 if (pmd_numa(pmd)) {
3604 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3605 numa = true;
3606 }
3607 spin_unlock(&mm->page_table_lock);
3608
3609 if (!numa)
3610 return 0;
3611
3612 /* we're in a page fault so some vma must be in the range */
3613 BUG_ON(!vma);
3614 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3615 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3616 VM_BUG_ON(offset >= PMD_SIZE);
3617 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3618 pte += offset >> PAGE_SHIFT;
3619 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3620 pte_t pteval = *pte;
3621 struct page *page;
3622 int page_nid = -1;
3623 int target_nid;
3624 bool migrated = false;
3625
3626 if (!pte_present(pteval))
3627 continue;
3628 if (!pte_numa(pteval))
3629 continue;
3630 if (addr >= vma->vm_end) {
3631 vma = find_vma(mm, addr);
3632 /* there's a pte present so there must be a vma */
3633 BUG_ON(!vma);
3634 BUG_ON(addr < vma->vm_start);
3635 }
3636 if (pte_numa(pteval)) {
3637 pteval = pte_mknonnuma(pteval);
3638 set_pte_at(mm, addr, pte, pteval);
3639 }
3640 page = vm_normal_page(vma, addr, pteval);
3641 if (unlikely(!page))
3642 continue;
3643 /* only check non-shared pages */
3644 if (unlikely(page_mapcount(page) != 1))
3645 continue;
3646
3647 page_nid = page_to_nid(page);
3648 target_nid = numa_migrate_prep(page, vma, addr, page_nid);
3649 pte_unmap_unlock(pte, ptl);
3650 if (target_nid != -1) {
3651 migrated = migrate_misplaced_page(page, target_nid);
3652 if (migrated)
3653 page_nid = target_nid;
3654 } else {
3655 put_page(page);
3656 }
3657
3658 if (page_nid != -1)
3659 task_numa_fault(page_nid, 1, migrated);
3660
3661 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3662 }
3663 pte_unmap_unlock(orig_pte, ptl);
3664
3665 return 0;
3666}
3667#else
3668static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3669 unsigned long addr, pmd_t *pmdp)
3670{
3671 BUG();
3672 return 0; 3618 return 0;
3673} 3619}
3674#endif /* CONFIG_NUMA_BALANCING */
3675 3620
3676/* 3621/*
3677 * These routines also need to handle stuff like marking pages dirty 3622 * These routines also need to handle stuff like marking pages dirty
@@ -3811,8 +3756,8 @@ retry:
3811 } 3756 }
3812 } 3757 }
3813 3758
3814 if (pmd_numa(*pmd)) 3759 /* THP should already have been handled */
3815 return do_pmd_numa_page(mm, vma, address, pmd); 3760 BUG_ON(pmd_numa(*pmd));
3816 3761
3817 /* 3762 /*
3818 * Use __pte_alloc instead of pte_alloc_map, because we can't 3763 * Use __pte_alloc instead of pte_alloc_map, because we can't
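The last_cpupid read in do_numa_page() above is what later lets NUMA balancing tell a private fault (the same task touched the page last time) from a shared one. Here is a hedged sketch of that classification, with invented bit widths and a helper modelled loosely on cpupid_match_pid(); in the kernel only the low pid bits that fit in the cpupid field are compared.

    #include <stdbool.h>
    #include <stdio.h>

    #define CPU_BITS  12
    #define PID_MASK  0xff

    static int make_cpupid(int cpu, int pid)
    {
        return ((pid & PID_MASK) << CPU_BITS) | cpu;
    }

    static bool cpupid_match_pid(int current_pid, int cpupid)
    {
        return (current_pid & PID_MASK) == ((cpupid >> CPU_BITS) & PID_MASK);
    }

    int main(void)
    {
        int last_cpupid = make_cpupid(3, 1234);   /* recorded at the previous fault */

        /* Same task faulting again: treat the page as task-private and let
         * the scheduler bias this task towards the page's node. */
        printf("pid 1234: %s\n", cpupid_match_pid(1234, last_cpupid) ? "private" : "shared");

        /* A different task: the page looks shared, which feeds the grouping
         * logic instead of per-task placement. */
        printf("pid 5678: %s\n", cpupid_match_pid(5678, last_cpupid) ? "private" : "shared");
        return 0;
    }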
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 04729647f359..71cb253368cb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1679,6 +1679,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1679 return pol; 1679 return pol;
1680} 1680}
1681 1681
1682bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1683{
1684 struct mempolicy *pol = get_task_policy(task);
1685 if (vma) {
1686 if (vma->vm_ops && vma->vm_ops->get_policy) {
1687 bool ret = false;
1688
1689 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1690 if (pol && (pol->flags & MPOL_F_MOF))
1691 ret = true;
1692 mpol_cond_put(pol);
1693
1694 return ret;
1695 } else if (vma->vm_policy) {
1696 pol = vma->vm_policy;
1697 }
1698 }
1699
1700 if (!pol)
1701 return default_policy.flags & MPOL_F_MOF;
1702
1703 return pol->flags & MPOL_F_MOF;
1704}
1705
1682static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) 1706static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1683{ 1707{
1684 enum zone_type dynamic_policy_zone = policy_zone; 1708 enum zone_type dynamic_policy_zone = policy_zone;
@@ -2277,6 +2301,35 @@ static void sp_free(struct sp_node *n)
2277 kmem_cache_free(sn_cache, n); 2301 kmem_cache_free(sn_cache, n);
2278} 2302}
2279 2303
2304#ifdef CONFIG_NUMA_BALANCING
2305static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2306{
2307 /* Never defer a private fault */
2308 if (cpupid_match_pid(p, last_cpupid))
2309 return false;
2310
2311 if (p->numa_migrate_deferred) {
2312 p->numa_migrate_deferred--;
2313 return true;
2314 }
2315 return false;
2316}
2317
2318static inline void defer_numa_migrate(struct task_struct *p)
2319{
2320 p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
2321}
2322#else
2323static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2324{
2325 return false;
2326}
2327
2328static inline void defer_numa_migrate(struct task_struct *p)
2329{
2330}
2331#endif /* CONFIG_NUMA_BALANCING */
2332
2280/** 2333/**
2281 * mpol_misplaced - check whether current page node is valid in policy 2334 * mpol_misplaced - check whether current page node is valid in policy
2282 * 2335 *
@@ -2300,6 +2353,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2300 struct zone *zone; 2353 struct zone *zone;
2301 int curnid = page_to_nid(page); 2354 int curnid = page_to_nid(page);
2302 unsigned long pgoff; 2355 unsigned long pgoff;
2356 int thiscpu = raw_smp_processor_id();
2357 int thisnid = cpu_to_node(thiscpu);
2303 int polnid = -1; 2358 int polnid = -1;
2304 int ret = -1; 2359 int ret = -1;
2305 2360
@@ -2348,9 +2403,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2348 2403
2349 /* Migrate the page towards the node whose CPU is referencing it */ 2404 /* Migrate the page towards the node whose CPU is referencing it */
2350 if (pol->flags & MPOL_F_MORON) { 2405 if (pol->flags & MPOL_F_MORON) {
2351 int last_nid; 2406 int last_cpupid;
2407 int this_cpupid;
2352 2408
2353 polnid = numa_node_id(); 2409 polnid = thisnid;
2410 this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
2354 2411
2355 /* 2412 /*
2356 * Multi-stage node selection is used in conjunction 2413 * Multi-stage node selection is used in conjunction
@@ -2373,8 +2430,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2373 * it less likely we act on an unlikely task<->page 2430 * it less likely we act on an unlikely task<->page
2374 * relation. 2431 * relation.
2375 */ 2432 */
2376 last_nid = page_nid_xchg_last(page, polnid); 2433 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
2377 if (last_nid != polnid) 2434 if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
2435
2436 /* See sysctl_numa_balancing_migrate_deferred comment */
2437 if (!cpupid_match_pid(current, last_cpupid))
2438 defer_numa_migrate(current);
2439
2440 goto out;
2441 }
2442
2443 /*
2444 * The quadratic filter above reduces extraneous migration
2445 * of shared pages somewhat. This code reduces it even more,
2446 * reducing the overhead of page migrations of shared pages.
2447 * This makes workloads with shared pages rely more on
2448 * "move task near its memory", and less on "move memory
2449 * towards its task", which is exactly what we want.
2450 */
2451 if (numa_migrate_deferred(current, last_cpupid))
2378 goto out; 2452 goto out;
2379 } 2453 }
2380 2454
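The mpol_misplaced() hunk above stacks two throttles: the pre-existing two-stage node filter (only migrate once two consecutive hinting faults come from the same node) and the new per-task deferral of shared-page migrations. The stand-alone model below compresses both into one predicate; the structure, threshold constant and field names are simplifications for illustration, not the kernel code.

    #include <stdbool.h>
    #include <stdio.h>

    struct task {
        int pid;
        int migrate_deferred;      /* remaining shared-page migrations to skip */
    };

    #define MIGRATE_DEFER_COUNT 16 /* stand-in for the sysctl */

    static bool should_migrate(struct task *t, int this_node, int this_pid,
                               int *page_last_node, int *page_last_pid)
    {
        int prev_node = *page_last_node, prev_pid = *page_last_pid;

        /* Record who touched the page, for the next fault to look at. */
        *page_last_node = this_node;
        *page_last_pid  = this_pid;

        /* Stage one: if an earlier fault was recorded from a different node,
         * do not migrate yet; wait for a second fault from this node.  If
         * that earlier fault came from another task, arm the deferral too. */
        if (prev_pid != -1 && prev_node != this_node) {
            if (prev_pid != this_pid)
                t->migrate_deferred = MIGRATE_DEFER_COUNT;
            return false;
        }

        /* Stage two: for shared pages, skip a bounded number of migrations
         * outright and rely on moving the task near its memory instead. */
        if (prev_pid != -1 && prev_pid != this_pid && t->migrate_deferred > 0) {
            t->migrate_deferred--;
            return false;
        }

        return true;
    }

    int main(void)
    {
        struct task t = { .pid = 100, .migrate_deferred = 0 };
        int last_node = 1, last_pid = 200;   /* page last touched remotely */

        /* First fault from task 100 on node 0: filtered (prints 0). */
        printf("%d\n", should_migrate(&t, 0, 100, &last_node, &last_pid));
        /* Second fault from the same task and node: allowed (prints 1). */
        printf("%d\n", should_migrate(&t, 0, 100, &last_node, &last_pid));
        return 0;
    }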
diff --git a/mm/migrate.c b/mm/migrate.c
index c04692774e88..dfc8300ecbb2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -445,6 +445,8 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
445 */ 445 */
446void migrate_page_copy(struct page *newpage, struct page *page) 446void migrate_page_copy(struct page *newpage, struct page *page)
447{ 447{
448 int cpupid;
449
448 if (PageHuge(page) || PageTransHuge(page)) 450 if (PageHuge(page) || PageTransHuge(page))
449 copy_huge_page(newpage, page); 451 copy_huge_page(newpage, page);
450 else 452 else
@@ -481,6 +483,13 @@ void migrate_page_copy(struct page *newpage, struct page *page)
481 __set_page_dirty_nobuffers(newpage); 483 __set_page_dirty_nobuffers(newpage);
482 } 484 }
483 485
486 /*
487 * Copy NUMA information to the new page, to prevent over-eager
488 * future migrations of this same page.
489 */
490 cpupid = page_cpupid_xchg_last(page, -1);
491 page_cpupid_xchg_last(newpage, cpupid);
492
484 mlock_migrate_page(newpage, page); 493 mlock_migrate_page(newpage, page);
485 ksm_migrate_page(newpage, page); 494 ksm_migrate_page(newpage, page);
486 /* 495 /*
@@ -1500,7 +1509,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1500 __GFP_NOWARN) & 1509 __GFP_NOWARN) &
1501 ~GFP_IOFS, 0); 1510 ~GFP_IOFS, 0);
1502 if (newpage) 1511 if (newpage)
1503 page_nid_xchg_last(newpage, page_nid_last(page)); 1512 page_cpupid_xchg_last(newpage, page_cpupid_last(page));
1504 1513
1505 return newpage; 1514 return newpage;
1506} 1515}
@@ -1601,7 +1610,8 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1601 * node. Caller is expected to have an elevated reference count on 1610 * node. Caller is expected to have an elevated reference count on
1602 * the page that will be dropped by this function before returning. 1611 * the page that will be dropped by this function before returning.
1603 */ 1612 */
1604int migrate_misplaced_page(struct page *page, int node) 1613int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1614 int node)
1605{ 1615{
1606 pg_data_t *pgdat = NODE_DATA(node); 1616 pg_data_t *pgdat = NODE_DATA(node);
1607 int isolated; 1617 int isolated;
@@ -1609,10 +1619,11 @@ int migrate_misplaced_page(struct page *page, int node)
1609 LIST_HEAD(migratepages); 1619 LIST_HEAD(migratepages);
1610 1620
1611 /* 1621 /*
1612 * Don't migrate pages that are mapped in multiple processes. 1622 * Don't migrate file pages that are mapped in multiple processes
1613 * TODO: Handle false sharing detection instead of this hammer 1623 * with execute permissions as they are probably shared libraries.
1614 */ 1624 */
1615 if (page_mapcount(page) != 1) 1625 if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
1626 (vma->vm_flags & VM_EXEC))
1616 goto out; 1627 goto out;
1617 1628
1618 /* 1629 /*
@@ -1663,13 +1674,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1663 int page_lru = page_is_file_cache(page); 1674 int page_lru = page_is_file_cache(page);
1664 1675
1665 /* 1676 /*
1666 * Don't migrate pages that are mapped in multiple processes.
1667 * TODO: Handle false sharing detection instead of this hammer
1668 */
1669 if (page_mapcount(page) != 1)
1670 goto out_dropref;
1671
1672 /*
1673 * Rate-limit the amount of data that is being migrated to a node. 1677 * Rate-limit the amount of data that is being migrated to a node.
1674 * Optimal placement is no good if the memory bus is saturated and 1678 * Optimal placement is no good if the memory bus is saturated and
1675 * all the time is being spent migrating! 1679 * all the time is being spent migrating!
@@ -1682,7 +1686,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1682 if (!new_page) 1686 if (!new_page)
1683 goto out_fail; 1687 goto out_fail;
1684 1688
1685 page_nid_xchg_last(new_page, page_nid_last(page)); 1689 page_cpupid_xchg_last(new_page, page_cpupid_last(page));
1686 1690
1687 isolated = numamigrate_isolate_page(pgdat, page); 1691 isolated = numamigrate_isolate_page(pgdat, page);
1688 if (!isolated) { 1692 if (!isolated) {
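migrate_misplaced_page() above relaxes the old "mapcount must be 1" rule: only file-backed pages mapped by several processes with execute permission (in practice, shared library text) are skipped now. A small sketch of that predicate, with a made-up context struct standing in for the page and VMA state the real code consults:

    #include <stdbool.h>
    #include <stdio.h>

    struct fault_ctx {
        int  mapcount;        /* number of processes mapping the page */
        bool file_backed;     /* page cache page? */
        bool vma_exec;        /* VM_EXEC set on the mapping? */
    };

    static bool may_migrate(const struct fault_ctx *c)
    {
        /* Probably shared library text: leave it where it is. */
        if (c->mapcount != 1 && c->file_backed && c->vma_exec)
            return false;
        return true;
    }

    int main(void)
    {
        struct fault_ctx libc = { .mapcount = 30, .file_backed = true,  .vma_exec = true  };
        struct fault_ctx anon = { .mapcount = 2,  .file_backed = false, .vma_exec = false };

        printf("shared library text: %s\n", may_migrate(&libc) ? "migrate" : "skip");
        printf("shared anon memory : %s\n", may_migrate(&anon) ? "migrate" : "skip");
        return 0;
    }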
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 633c08863fd8..68562e92d50c 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void)
71 unsigned long or_mask, add_mask; 71 unsigned long or_mask, add_mask;
72 72
73 shift = 8 * sizeof(unsigned long); 73 shift = 8 * sizeof(unsigned long);
74 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; 74 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT;
75 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", 75 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
76 "Section %d Node %d Zone %d Lastnid %d Flags %d\n", 76 "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
77 SECTIONS_WIDTH, 77 SECTIONS_WIDTH,
78 NODES_WIDTH, 78 NODES_WIDTH,
79 ZONES_WIDTH, 79 ZONES_WIDTH,
80 LAST_NID_WIDTH, 80 LAST_CPUPID_WIDTH,
81 NR_PAGEFLAGS); 81 NR_PAGEFLAGS);
82 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", 82 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
83 "Section %d Node %d Zone %d Lastnid %d\n", 83 "Section %d Node %d Zone %d Lastcpupid %d\n",
84 SECTIONS_SHIFT, 84 SECTIONS_SHIFT,
85 NODES_SHIFT, 85 NODES_SHIFT,
86 ZONES_SHIFT, 86 ZONES_SHIFT,
87 LAST_NID_SHIFT); 87 LAST_CPUPID_SHIFT);
88 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", 88 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
89 "Section %lu Node %lu Zone %lu Lastnid %lu\n", 89 "Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
90 (unsigned long)SECTIONS_PGSHIFT, 90 (unsigned long)SECTIONS_PGSHIFT,
91 (unsigned long)NODES_PGSHIFT, 91 (unsigned long)NODES_PGSHIFT,
92 (unsigned long)ZONES_PGSHIFT, 92 (unsigned long)ZONES_PGSHIFT,
93 (unsigned long)LAST_NID_PGSHIFT); 93 (unsigned long)LAST_CPUPID_PGSHIFT);
94 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", 94 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
95 "Node/Zone ID: %lu -> %lu\n", 95 "Node/Zone ID: %lu -> %lu\n",
96 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), 96 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
@@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void)
102 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", 102 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
103 "Node not in page flags"); 103 "Node not in page flags");
104#endif 104#endif
105#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 105#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
106 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", 106 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
107 "Last nid not in page flags"); 107 "Last cpupid not in page flags");
108#endif 108#endif
109 109
110 if (SECTIONS_WIDTH) { 110 if (SECTIONS_WIDTH) {
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 2ac0afbd68f3..bf34fb8556db 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98} 98}
99 99
100#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) 100#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
101int page_nid_xchg_last(struct page *page, int nid) 101int page_cpupid_xchg_last(struct page *page, int cpupid)
102{ 102{
103 unsigned long old_flags, flags; 103 unsigned long old_flags, flags;
104 int last_nid; 104 int last_cpupid;
105 105
106 do { 106 do {
107 old_flags = flags = page->flags; 107 old_flags = flags = page->flags;
108 last_nid = page_nid_last(page); 108 last_cpupid = page_cpupid_last(page);
109 109
110 flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); 110 flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
111 flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; 111 flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
112 } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); 112 } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
113 113
114 return last_nid; 114 return last_cpupid;
115} 115}
116#endif 116#endif
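page_cpupid_xchg_last() above updates a field packed into page->flags with a lock-free cmpxchg retry loop. The user-space analogue below shows the same pattern with C11 atomics; the shift, mask and driver values are invented for the example.

    #include <stdatomic.h>
    #include <stdio.h>

    #define FIELD_SHIFT 8
    #define FIELD_MASK  0xffffUL

    static _Atomic unsigned long word = (0x1234UL << FIELD_SHIFT) | 0x5aUL;

    static unsigned long field_xchg_last(unsigned long newval)
    {
        unsigned long old, repl;

        do {
            old  = atomic_load(&word);
            repl = old & ~(FIELD_MASK << FIELD_SHIFT);       /* clear the field */
            repl |= (newval & FIELD_MASK) << FIELD_SHIFT;    /* insert new value */
            /* retry if another thread changed the word in between */
        } while (!atomic_compare_exchange_weak(&word, &old, repl));

        return (old >> FIELD_SHIFT) & FIELD_MASK;            /* previous field */
    }

    int main(void)
    {
        printf("previous field: 0x%lx\n", field_xchg_last(0xbeef));    /* 0x1234 */
        printf("low bits kept:  0x%lx\n", atomic_load(&word) & 0xff);  /* 0x5a */
        return 0;
    }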
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 412ba2b7326a..a597f2ffcd6f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -37,14 +37,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
37 37
38static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 38static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
39 unsigned long addr, unsigned long end, pgprot_t newprot, 39 unsigned long addr, unsigned long end, pgprot_t newprot,
40 int dirty_accountable, int prot_numa, bool *ret_all_same_node) 40 int dirty_accountable, int prot_numa)
41{ 41{
42 struct mm_struct *mm = vma->vm_mm; 42 struct mm_struct *mm = vma->vm_mm;
43 pte_t *pte, oldpte; 43 pte_t *pte, oldpte;
44 spinlock_t *ptl; 44 spinlock_t *ptl;
45 unsigned long pages = 0; 45 unsigned long pages = 0;
46 bool all_same_node = true;
47 int last_nid = -1;
48 46
49 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 47 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
50 arch_enter_lazy_mmu_mode(); 48 arch_enter_lazy_mmu_mode();
@@ -63,15 +61,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
63 61
64 page = vm_normal_page(vma, addr, oldpte); 62 page = vm_normal_page(vma, addr, oldpte);
65 if (page) { 63 if (page) {
66 int this_nid = page_to_nid(page); 64 if (!pte_numa(oldpte)) {
67 if (last_nid == -1)
68 last_nid = this_nid;
69 if (last_nid != this_nid)
70 all_same_node = false;
71
72 /* only check non-shared pages */
73 if (!pte_numa(oldpte) &&
74 page_mapcount(page) == 1) {
75 ptent = pte_mknuma(ptent); 65 ptent = pte_mknuma(ptent);
76 updated = true; 66 updated = true;
77 } 67 }
@@ -104,33 +94,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
104 if (pte_swp_soft_dirty(oldpte)) 94 if (pte_swp_soft_dirty(oldpte))
105 newpte = pte_swp_mksoft_dirty(newpte); 95 newpte = pte_swp_mksoft_dirty(newpte);
106 set_pte_at(mm, addr, pte, newpte); 96 set_pte_at(mm, addr, pte, newpte);
97
98 pages++;
107 } 99 }
108 pages++;
109 } 100 }
110 } while (pte++, addr += PAGE_SIZE, addr != end); 101 } while (pte++, addr += PAGE_SIZE, addr != end);
111 arch_leave_lazy_mmu_mode(); 102 arch_leave_lazy_mmu_mode();
112 pte_unmap_unlock(pte - 1, ptl); 103 pte_unmap_unlock(pte - 1, ptl);
113 104
114 *ret_all_same_node = all_same_node;
115 return pages; 105 return pages;
116} 106}
117 107
118#ifdef CONFIG_NUMA_BALANCING
119static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
120 pmd_t *pmd)
121{
122 spin_lock(&mm->page_table_lock);
123 set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
124 spin_unlock(&mm->page_table_lock);
125}
126#else
127static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
128 pmd_t *pmd)
129{
130 BUG();
131}
132#endif /* CONFIG_NUMA_BALANCING */
133
134static inline unsigned long change_pmd_range(struct vm_area_struct *vma, 108static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
135 pud_t *pud, unsigned long addr, unsigned long end, 109 pud_t *pud, unsigned long addr, unsigned long end,
136 pgprot_t newprot, int dirty_accountable, int prot_numa) 110 pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -138,34 +112,33 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
138 pmd_t *pmd; 112 pmd_t *pmd;
139 unsigned long next; 113 unsigned long next;
140 unsigned long pages = 0; 114 unsigned long pages = 0;
141 bool all_same_node;
142 115
143 pmd = pmd_offset(pud, addr); 116 pmd = pmd_offset(pud, addr);
144 do { 117 do {
118 unsigned long this_pages;
119
145 next = pmd_addr_end(addr, end); 120 next = pmd_addr_end(addr, end);
146 if (pmd_trans_huge(*pmd)) { 121 if (pmd_trans_huge(*pmd)) {
147 if (next - addr != HPAGE_PMD_SIZE) 122 if (next - addr != HPAGE_PMD_SIZE)
148 split_huge_page_pmd(vma, addr, pmd); 123 split_huge_page_pmd(vma, addr, pmd);
149 else if (change_huge_pmd(vma, pmd, addr, newprot, 124 else {
150 prot_numa)) { 125 int nr_ptes = change_huge_pmd(vma, pmd, addr,
151 pages++; 126 newprot, prot_numa);
152 continue; 127
128 if (nr_ptes) {
129 if (nr_ptes == HPAGE_PMD_NR)
130 pages++;
131
132 continue;
133 }
153 } 134 }
154 /* fall through */ 135 /* fall through */
155 } 136 }
156 if (pmd_none_or_clear_bad(pmd)) 137 if (pmd_none_or_clear_bad(pmd))
157 continue; 138 continue;
158 pages += change_pte_range(vma, pmd, addr, next, newprot, 139 this_pages = change_pte_range(vma, pmd, addr, next, newprot,
159 dirty_accountable, prot_numa, &all_same_node); 140 dirty_accountable, prot_numa);
160 141 pages += this_pages;
161 /*
162 * If we are changing protections for NUMA hinting faults then
163 * set pmd_numa if the examined pages were all on the same
164 * node. This allows a regular PMD to be handled as one fault
165 * and effectively batches the taking of the PTL
166 */
167 if (prot_numa && all_same_node)
168 change_pmd_protnuma(vma->vm_mm, addr, pmd);
169 } while (pmd++, addr = next, addr != end); 142 } while (pmd++, addr = next, addr != end);
170 143
171 return pages; 144 return pages;
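change_pmd_range() above now interprets the tri-state return of change_huge_pmd() documented earlier in this diff: 0 means the huge PMD could not be locked (fall through to the PTE walk), 1 means it was handled but nothing changed, and HPAGE_PMD_NR means the protections changed and a TLB flush is due. A condensed stand-alone model of that caller-side logic, with a stub standing in for the real callee:

    #include <stdio.h>

    #define HPAGE_PMD_NR 512

    static int change_huge_pmd_stub(int scenario)
    {
        return scenario;   /* 0, 1 or HPAGE_PMD_NR, as described above */
    }

    static unsigned long handle_one_pmd(int scenario)
    {
        unsigned long pages = 0;
        int nr_ptes = change_huge_pmd_stub(scenario);

        if (nr_ptes) {                 /* huge PMD path handled it */
            if (nr_ptes == HPAGE_PMD_NR)
                pages++;               /* protections changed, flush due */
            return pages;              /* "continue" in the real loop */
        }

        /* nr_ptes == 0: not a huge PMD after all, walk the PTEs instead
         * (omitted in this sketch). */
        return pages;
    }

    int main(void)
    {
        printf("locked+changed  : %lu counted\n", handle_one_pmd(HPAGE_PMD_NR));
        printf("locked+unchanged: %lu counted\n", handle_one_pmd(1));
        printf("not locked      : %lu counted\n", handle_one_pmd(0));
        return 0;
    }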
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd886fac451a..73d812f16dde 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page)
626 bad_page(page); 626 bad_page(page);
627 return 1; 627 return 1;
628 } 628 }
629 page_nid_reset_last(page); 629 page_cpupid_reset_last(page);
630 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 630 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
631 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 631 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
632 return 0; 632 return 0;
@@ -4015,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4015 mminit_verify_page_links(page, zone, nid, pfn); 4015 mminit_verify_page_links(page, zone, nid, pfn);
4016 init_page_count(page); 4016 init_page_count(page);
4017 page_mapcount_reset(page); 4017 page_mapcount_reset(page);
4018 page_nid_reset_last(page); 4018 page_cpupid_reset_last(page);
4019 SetPageReserved(page); 4019 SetPageReserved(page);
4020 /* 4020 /*
4021 * Mark the block movable so that blocks are reserved for 4021 * Mark the block movable so that blocks are reserved for