Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c       |  15
-rw-r--r--  mm/huge_memory.c      | 108
-rw-r--r--  mm/hugetlb.c          |  10
-rw-r--r--  mm/internal.h         |   7
-rw-r--r--  mm/ksm.c              |   6
-rw-r--r--  mm/memcontrol.c       |   7
-rw-r--r--  mm/memory-failure.c   |   7
-rw-r--r--  mm/memory.c           | 198
-rw-r--r--  mm/memory_hotplug.c   |   3
-rw-r--r--  mm/mempolicy.c        | 283
-rw-r--r--  mm/migrate.c          | 337
-rw-r--r--  mm/mmap.c             |  10
-rw-r--r--  mm/mprotect.c         | 135
-rw-r--r--  mm/mremap.c           |   2
-rw-r--r--  mm/page_alloc.c       |  10
-rw-r--r--  mm/pgtable-generic.c  |   9
-rw-r--r--  mm/rmap.c             |  66
-rw-r--r--  mm/vmstat.c           |  16
18 files changed, 1098 insertions, 131 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 129791218226..5ad7f4f4d6f7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -303,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
303 | if (blockpfn == end_pfn) | 303 | if (blockpfn == end_pfn) |
304 | update_pageblock_skip(cc, valid_page, total_isolated, false); | 304 | update_pageblock_skip(cc, valid_page, total_isolated, false); |
305 | 305 | ||
306 | count_vm_events(COMPACTFREE_SCANNED, nr_scanned); | ||
307 | if (total_isolated) | ||
308 | count_vm_events(COMPACTISOLATED, total_isolated); | ||
309 | |||
306 | return total_isolated; | 310 | return total_isolated; |
307 | } | 311 | } |
308 | 312 | ||
@@ -609,6 +613,10 @@ next_pageblock: | |||
609 | 613 | ||
610 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 614 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
611 | 615 | ||
616 | count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); | ||
617 | if (nr_isolated) | ||
618 | count_vm_events(COMPACTISOLATED, nr_isolated); | ||
619 | |||
612 | return low_pfn; | 620 | return low_pfn; |
613 | } | 621 | } |
614 | 622 | ||
@@ -1015,14 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1015 | nr_migrate = cc->nr_migratepages; | 1023 | nr_migrate = cc->nr_migratepages; |
1016 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 1024 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
1017 | (unsigned long)cc, false, | 1025 | (unsigned long)cc, false, |
1018 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); | 1026 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, |
1027 | MR_COMPACTION); | ||
1019 | update_nr_listpages(cc); | 1028 | update_nr_listpages(cc); |
1020 | nr_remaining = cc->nr_migratepages; | 1029 | nr_remaining = cc->nr_migratepages; |
1021 | 1030 | ||
1022 | count_vm_event(COMPACTBLOCKS); | ||
1023 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); | ||
1024 | if (nr_remaining) | ||
1025 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); | ||
1026 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | 1031 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, |
1027 | nr_remaining); | 1032 | nr_remaining); |
1028 | 1033 | ||
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 827d9c813051..d7ee1691fd21 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/freezer.h> | 19 | #include <linux/freezer.h> |
20 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/migrate.h> | ||
22 | 23 | ||
23 | #include <asm/tlb.h> | 24 | #include <asm/tlb.h> |
24 | #include <asm/pgalloc.h> | 25 | #include <asm/pgalloc.h> |
@@ -690,7 +691,7 @@ out: | |||
690 | } | 691 | } |
691 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 692 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
692 | 693 | ||
693 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | 694 | pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
694 | { | 695 | { |
695 | if (likely(vma->vm_flags & VM_WRITE)) | 696 | if (likely(vma->vm_flags & VM_WRITE)) |
696 | pmd = pmd_mkwrite(pmd); | 697 | pmd = pmd_mkwrite(pmd); |
@@ -848,7 +849,8 @@ out: | |||
848 | * run pte_offset_map on the pmd, if an huge pmd could | 849 | * run pte_offset_map on the pmd, if an huge pmd could |
849 | * materialize from under us from a different thread. | 850 | * materialize from under us from a different thread. |
850 | */ | 851 | */ |
851 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | 852 | if (unlikely(pmd_none(*pmd)) && |
853 | unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
852 | return VM_FAULT_OOM; | 854 | return VM_FAULT_OOM; |
853 | /* if an huge pmd materialized from under us just retry later */ | 855 | /* if an huge pmd materialized from under us just retry later */ |
854 | if (unlikely(pmd_trans_huge(*pmd))) | 856 | if (unlikely(pmd_trans_huge(*pmd))) |
@@ -1287,6 +1289,81 @@ out: | |||
1287 | return page; | 1289 | return page; |
1288 | } | 1290 | } |
1289 | 1291 | ||
1292 | /* NUMA hinting page fault entry point for trans huge pmds */ | ||
1293 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
1294 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | ||
1295 | { | ||
1296 | struct page *page; | ||
1297 | unsigned long haddr = addr & HPAGE_PMD_MASK; | ||
1298 | int target_nid; | ||
1299 | int current_nid = -1; | ||
1300 | bool migrated; | ||
1301 | bool page_locked = false; | ||
1302 | |||
1303 | spin_lock(&mm->page_table_lock); | ||
1304 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
1305 | goto out_unlock; | ||
1306 | |||
1307 | page = pmd_page(pmd); | ||
1308 | get_page(page); | ||
1309 | current_nid = page_to_nid(page); | ||
1310 | count_vm_numa_event(NUMA_HINT_FAULTS); | ||
1311 | if (current_nid == numa_node_id()) | ||
1312 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | ||
1313 | |||
1314 | target_nid = mpol_misplaced(page, vma, haddr); | ||
1315 | if (target_nid == -1) { | ||
1316 | put_page(page); | ||
1317 | goto clear_pmdnuma; | ||
1318 | } | ||
1319 | |||
1320 | /* Acquire the page lock to serialise THP migrations */ | ||
1321 | spin_unlock(&mm->page_table_lock); | ||
1322 | lock_page(page); | ||
1323 | page_locked = true; | ||
1324 | |||
1325 | /* Confirm the PTE did not change while the page was locked */ | ||
1326 | spin_lock(&mm->page_table_lock); | ||
1327 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1328 | unlock_page(page); | ||
1329 | put_page(page); | ||
1330 | goto out_unlock; | ||
1331 | } | ||
1332 | spin_unlock(&mm->page_table_lock); | ||
1333 | |||
1334 | /* Migrate the THP to the requested node */ | ||
1335 | migrated = migrate_misplaced_transhuge_page(mm, vma, | ||
1336 | pmdp, pmd, addr, | ||
1337 | page, target_nid); | ||
1338 | if (migrated) | ||
1339 | current_nid = target_nid; | ||
1340 | else { | ||
1341 | spin_lock(&mm->page_table_lock); | ||
1342 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1343 | unlock_page(page); | ||
1344 | goto out_unlock; | ||
1345 | } | ||
1346 | goto clear_pmdnuma; | ||
1347 | } | ||
1348 | |||
1349 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | ||
1350 | return 0; | ||
1351 | |||
1352 | clear_pmdnuma: | ||
1353 | pmd = pmd_mknonnuma(pmd); | ||
1354 | set_pmd_at(mm, haddr, pmdp, pmd); | ||
1355 | VM_BUG_ON(pmd_numa(*pmdp)); | ||
1356 | update_mmu_cache_pmd(vma, addr, pmdp); | ||
1357 | if (page_locked) | ||
1358 | unlock_page(page); | ||
1359 | |||
1360 | out_unlock: | ||
1361 | spin_unlock(&mm->page_table_lock); | ||
1362 | if (current_nid != -1) | ||
1363 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | ||
1364 | return 0; | ||
1365 | } | ||
1366 | |||
1290 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1367 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1291 | pmd_t *pmd, unsigned long addr) | 1368 | pmd_t *pmd, unsigned long addr) |
1292 | { | 1369 | { |
@@ -1375,7 +1452,7 @@ out: | |||
1375 | } | 1452 | } |
1376 | 1453 | ||
1377 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1454 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1378 | unsigned long addr, pgprot_t newprot) | 1455 | unsigned long addr, pgprot_t newprot, int prot_numa) |
1379 | { | 1456 | { |
1380 | struct mm_struct *mm = vma->vm_mm; | 1457 | struct mm_struct *mm = vma->vm_mm; |
1381 | int ret = 0; | 1458 | int ret = 0; |
@@ -1383,7 +1460,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1383 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1460 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1384 | pmd_t entry; | 1461 | pmd_t entry; |
1385 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1462 | entry = pmdp_get_and_clear(mm, addr, pmd); |
1386 | entry = pmd_modify(entry, newprot); | 1463 | if (!prot_numa) |
1464 | entry = pmd_modify(entry, newprot); | ||
1465 | else { | ||
1466 | struct page *page = pmd_page(*pmd); | ||
1467 | |||
1468 | /* only check non-shared pages */ | ||
1469 | if (page_mapcount(page) == 1 && | ||
1470 | !pmd_numa(*pmd)) { | ||
1471 | entry = pmd_mknuma(entry); | ||
1472 | } | ||
1473 | } | ||
1387 | BUG_ON(pmd_write(entry)); | 1474 | BUG_ON(pmd_write(entry)); |
1388 | set_pmd_at(mm, addr, pmd, entry); | 1475 | set_pmd_at(mm, addr, pmd, entry); |
1389 | spin_unlock(&vma->vm_mm->page_table_lock); | 1476 | spin_unlock(&vma->vm_mm->page_table_lock); |
@@ -1474,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page, | |||
1474 | * We can't temporarily set the pmd to null in order | 1561 | * We can't temporarily set the pmd to null in order |
1475 | * to split it, the pmd must remain marked huge at all | 1562 | * to split it, the pmd must remain marked huge at all |
1476 | * times or the VM won't take the pmd_trans_huge paths | 1563 | * times or the VM won't take the pmd_trans_huge paths |
1477 | * and it won't wait on the anon_vma->root->mutex to | 1564 | * and it won't wait on the anon_vma->root->rwsem to |
1478 | * serialize against split_huge_page*. | 1565 | * serialize against split_huge_page*. |
1479 | */ | 1566 | */ |
1480 | pmdp_splitting_flush(vma, address, pmd); | 1567 | pmdp_splitting_flush(vma, address, pmd); |
@@ -1565,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1565 | page_tail->mapping = page->mapping; | 1652 | page_tail->mapping = page->mapping; |
1566 | 1653 | ||
1567 | page_tail->index = page->index + i; | 1654 | page_tail->index = page->index + i; |
1655 | page_xchg_last_nid(page_tail, page_last_nid(page)); | ||
1568 | 1656 | ||
1569 | BUG_ON(!PageAnon(page_tail)); | 1657 | BUG_ON(!PageAnon(page_tail)); |
1570 | BUG_ON(!PageUptodate(page_tail)); | 1658 | BUG_ON(!PageUptodate(page_tail)); |
@@ -1632,6 +1720,8 @@ static int __split_huge_page_map(struct page *page, | |||
1632 | BUG_ON(page_mapcount(page) != 1); | 1720 | BUG_ON(page_mapcount(page) != 1); |
1633 | if (!pmd_young(*pmd)) | 1721 | if (!pmd_young(*pmd)) |
1634 | entry = pte_mkold(entry); | 1722 | entry = pte_mkold(entry); |
1723 | if (pmd_numa(*pmd)) | ||
1724 | entry = pte_mknuma(entry); | ||
1635 | pte = pte_offset_map(&_pmd, haddr); | 1725 | pte = pte_offset_map(&_pmd, haddr); |
1636 | BUG_ON(!pte_none(*pte)); | 1726 | BUG_ON(!pte_none(*pte)); |
1637 | set_pte_at(mm, haddr, pte, entry); | 1727 | set_pte_at(mm, haddr, pte, entry); |
@@ -1674,7 +1764,7 @@ static int __split_huge_page_map(struct page *page, | |||
1674 | return ret; | 1764 | return ret; |
1675 | } | 1765 | } |
1676 | 1766 | ||
1677 | /* must be called with anon_vma->root->mutex hold */ | 1767 | /* must be called with anon_vma->root->rwsem held */ |
1678 | static void __split_huge_page(struct page *page, | 1768 | static void __split_huge_page(struct page *page, |
1679 | struct anon_vma *anon_vma) | 1769 | struct anon_vma *anon_vma) |
1680 | { | 1770 | { |
@@ -1729,7 +1819,7 @@ int split_huge_page(struct page *page) | |||
1729 | 1819 | ||
1730 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); | 1820 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); |
1731 | BUG_ON(!PageAnon(page)); | 1821 | BUG_ON(!PageAnon(page)); |
1732 | anon_vma = page_lock_anon_vma(page); | 1822 | anon_vma = page_lock_anon_vma_read(page); |
1733 | if (!anon_vma) | 1823 | if (!anon_vma) |
1734 | goto out; | 1824 | goto out; |
1735 | ret = 0; | 1825 | ret = 0; |
@@ -1742,7 +1832,7 @@ int split_huge_page(struct page *page) | |||
1742 | 1832 | ||
1743 | BUG_ON(PageCompound(page)); | 1833 | BUG_ON(PageCompound(page)); |
1744 | out_unlock: | 1834 | out_unlock: |
1745 | page_unlock_anon_vma(anon_vma); | 1835 | page_unlock_anon_vma_read(anon_vma); |
1746 | out: | 1836 | out: |
1747 | return ret; | 1837 | return ret; |
1748 | } | 1838 | } |
@@ -2234,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2234 | if (pmd_trans_huge(*pmd)) | 2324 | if (pmd_trans_huge(*pmd)) |
2235 | goto out; | 2325 | goto out; |
2236 | 2326 | ||
2237 | anon_vma_lock(vma->anon_vma); | 2327 | anon_vma_lock_write(vma->anon_vma); |
2238 | 2328 | ||
2239 | pte = pte_offset_map(pmd, address); | 2329 | pte = pte_offset_map(pmd, address); |
2240 | ptl = pte_lockptr(mm, pmd); | 2330 | ptl = pte_lockptr(mm, pmd); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 88e7293b96bd..e5318c7793ae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3016,7 +3016,7 @@ same_page: | |||
3016 | return i ? i : -EFAULT; | 3016 | return i ? i : -EFAULT; |
3017 | } | 3017 | } |
3018 | 3018 | ||
3019 | void hugetlb_change_protection(struct vm_area_struct *vma, | 3019 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
3020 | unsigned long address, unsigned long end, pgprot_t newprot) | 3020 | unsigned long address, unsigned long end, pgprot_t newprot) |
3021 | { | 3021 | { |
3022 | struct mm_struct *mm = vma->vm_mm; | 3022 | struct mm_struct *mm = vma->vm_mm; |
@@ -3024,6 +3024,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3024 | pte_t *ptep; | 3024 | pte_t *ptep; |
3025 | pte_t pte; | 3025 | pte_t pte; |
3026 | struct hstate *h = hstate_vma(vma); | 3026 | struct hstate *h = hstate_vma(vma); |
3027 | unsigned long pages = 0; | ||
3027 | 3028 | ||
3028 | BUG_ON(address >= end); | 3029 | BUG_ON(address >= end); |
3029 | flush_cache_range(vma, address, end); | 3030 | flush_cache_range(vma, address, end); |
@@ -3034,12 +3035,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3034 | ptep = huge_pte_offset(mm, address); | 3035 | ptep = huge_pte_offset(mm, address); |
3035 | if (!ptep) | 3036 | if (!ptep) |
3036 | continue; | 3037 | continue; |
3037 | if (huge_pmd_unshare(mm, &address, ptep)) | 3038 | if (huge_pmd_unshare(mm, &address, ptep)) { |
3039 | pages++; | ||
3038 | continue; | 3040 | continue; |
3041 | } | ||
3039 | if (!huge_pte_none(huge_ptep_get(ptep))) { | 3042 | if (!huge_pte_none(huge_ptep_get(ptep))) { |
3040 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 3043 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
3041 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 3044 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
3042 | set_huge_pte_at(mm, address, ptep, pte); | 3045 | set_huge_pte_at(mm, address, ptep, pte); |
3046 | pages++; | ||
3043 | } | 3047 | } |
3044 | } | 3048 | } |
3045 | spin_unlock(&mm->page_table_lock); | 3049 | spin_unlock(&mm->page_table_lock); |
@@ -3051,6 +3055,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3051 | */ | 3055 | */ |
3052 | flush_tlb_range(vma, start, end); | 3056 | flush_tlb_range(vma, start, end); |
3053 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3057 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); |
3058 | |||
3059 | return pages << h->order; | ||
3054 | } | 3060 | } |
3055 | 3061 | ||
3056 | int hugetlb_reserve_pages(struct inode *inode, | 3062 | int hugetlb_reserve_pages(struct inode *inode, |
diff --git a/mm/internal.h b/mm/internal.h
index 52d1fa957194..d597f94cc205 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -217,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
217 | { | 217 | { |
218 | if (TestClearPageMlocked(page)) { | 218 | if (TestClearPageMlocked(page)) { |
219 | unsigned long flags; | 219 | unsigned long flags; |
220 | int nr_pages = hpage_nr_pages(page); | ||
220 | 221 | ||
221 | local_irq_save(flags); | 222 | local_irq_save(flags); |
222 | __dec_zone_page_state(page, NR_MLOCK); | 223 | __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); |
223 | SetPageMlocked(newpage); | 224 | SetPageMlocked(newpage); |
224 | __inc_zone_page_state(newpage, NR_MLOCK); | 225 | __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); |
225 | local_irq_restore(flags); | 226 | local_irq_restore(flags); |
226 | } | 227 | } |
227 | } | 228 | } |
228 | 229 | ||
230 | extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); | ||
231 | |||
229 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 232 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
230 | extern unsigned long vma_address(struct page *page, | 233 | extern unsigned long vma_address(struct page *page, |
231 | struct vm_area_struct *vma); | 234 | struct vm_area_struct *vma); |
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1624,7 +1624,7 @@ again: | |||
1624 | struct anon_vma_chain *vmac; | 1624 | struct anon_vma_chain *vmac; |
1625 | struct vm_area_struct *vma; | 1625 | struct vm_area_struct *vma; |
1626 | 1626 | ||
1627 | anon_vma_lock(anon_vma); | 1627 | anon_vma_lock_write(anon_vma); |
1628 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1628 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1629 | 0, ULONG_MAX) { | 1629 | 0, ULONG_MAX) { |
1630 | vma = vmac->vma; | 1630 | vma = vmac->vma; |
@@ -1678,7 +1678,7 @@ again: | |||
1678 | struct anon_vma_chain *vmac; | 1678 | struct anon_vma_chain *vmac; |
1679 | struct vm_area_struct *vma; | 1679 | struct vm_area_struct *vma; |
1680 | 1680 | ||
1681 | anon_vma_lock(anon_vma); | 1681 | anon_vma_lock_write(anon_vma); |
1682 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1682 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1683 | 0, ULONG_MAX) { | 1683 | 0, ULONG_MAX) { |
1684 | vma = vmac->vma; | 1684 | vma = vmac->vma; |
@@ -1731,7 +1731,7 @@ again: | |||
1731 | struct anon_vma_chain *vmac; | 1731 | struct anon_vma_chain *vmac; |
1732 | struct vm_area_struct *vma; | 1732 | struct vm_area_struct *vma; |
1733 | 1733 | ||
1734 | anon_vma_lock(anon_vma); | 1734 | anon_vma_lock_write(anon_vma); |
1735 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1735 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1736 | 0, ULONG_MAX) { | 1736 | 0, ULONG_MAX) { |
1737 | vma = vmac->vma; | 1737 | vma = vmac->vma; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6c055929c8cc..bbfac5063ca8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3289,15 +3289,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
3289 | struct mem_cgroup **memcgp) | 3289 | struct mem_cgroup **memcgp) |
3290 | { | 3290 | { |
3291 | struct mem_cgroup *memcg = NULL; | 3291 | struct mem_cgroup *memcg = NULL; |
3292 | unsigned int nr_pages = 1; | ||
3292 | struct page_cgroup *pc; | 3293 | struct page_cgroup *pc; |
3293 | enum charge_type ctype; | 3294 | enum charge_type ctype; |
3294 | 3295 | ||
3295 | *memcgp = NULL; | 3296 | *memcgp = NULL; |
3296 | 3297 | ||
3297 | VM_BUG_ON(PageTransHuge(page)); | ||
3298 | if (mem_cgroup_disabled()) | 3298 | if (mem_cgroup_disabled()) |
3299 | return; | 3299 | return; |
3300 | 3300 | ||
3301 | if (PageTransHuge(page)) | ||
3302 | nr_pages <<= compound_order(page); | ||
3303 | |||
3301 | pc = lookup_page_cgroup(page); | 3304 | pc = lookup_page_cgroup(page); |
3302 | lock_page_cgroup(pc); | 3305 | lock_page_cgroup(pc); |
3303 | if (PageCgroupUsed(pc)) { | 3306 | if (PageCgroupUsed(pc)) { |
@@ -3359,7 +3362,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
3359 | * charged to the res_counter since we plan on replacing the | 3362 | * charged to the res_counter since we plan on replacing the |
3360 | * old one and only one page is going to be left afterwards. | 3363 | * old one and only one page is going to be left afterwards. |
3361 | */ | 3364 | */ |
3362 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); | 3365 | __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); |
3363 | } | 3366 | } |
3364 | 3367 | ||
3365 | /* remove redundant charge if migration failed*/ | 3368 | /* remove redundant charge if migration failed*/ |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 108c52fa60f6..c6e4dd3e1c08 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
402 | struct anon_vma *av; | 402 | struct anon_vma *av; |
403 | pgoff_t pgoff; | 403 | pgoff_t pgoff; |
404 | 404 | ||
405 | av = page_lock_anon_vma(page); | 405 | av = page_lock_anon_vma_read(page); |
406 | if (av == NULL) /* Not actually mapped anymore */ | 406 | if (av == NULL) /* Not actually mapped anymore */ |
407 | return; | 407 | return; |
408 | 408 | ||
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
423 | } | 423 | } |
424 | } | 424 | } |
425 | read_unlock(&tasklist_lock); | 425 | read_unlock(&tasklist_lock); |
426 | page_unlock_anon_vma(av); | 426 | page_unlock_anon_vma_read(av); |
427 | } | 427 | } |
428 | 428 | ||
429 | /* | 429 | /* |
@@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags) | |||
1566 | page_is_file_cache(page)); | 1566 | page_is_file_cache(page)); |
1567 | list_add(&page->lru, &pagelist); | 1567 | list_add(&page->lru, &pagelist); |
1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1569 | false, MIGRATE_SYNC); | 1569 | false, MIGRATE_SYNC, |
1570 | MR_MEMORY_FAILURE); | ||
1570 | if (ret) { | 1571 | if (ret) { |
1571 | putback_lru_pages(&pagelist); | 1572 | putback_lru_pages(&pagelist); |
1572 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1573 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
diff --git a/mm/memory.c b/mm/memory.c
index db2e9e797a05..e6a3b933517e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/swapops.h> | 57 | #include <linux/swapops.h> |
58 | #include <linux/elf.h> | 58 | #include <linux/elf.h> |
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/migrate.h> | ||
60 | 61 | ||
61 | #include <asm/io.h> | 62 | #include <asm/io.h> |
62 | #include <asm/pgalloc.h> | 63 | #include <asm/pgalloc.h> |
@@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1503 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1504 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
1504 | goto out; | 1505 | goto out; |
1505 | } | 1506 | } |
1507 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | ||
1508 | goto no_page_table; | ||
1506 | if (pmd_trans_huge(*pmd)) { | 1509 | if (pmd_trans_huge(*pmd)) { |
1507 | if (flags & FOLL_SPLIT) { | 1510 | if (flags & FOLL_SPLIT) { |
1508 | split_huge_page_pmd(vma, address, pmd); | 1511 | split_huge_page_pmd(vma, address, pmd); |
@@ -1532,6 +1535,8 @@ split_fallthrough: | |||
1532 | pte = *ptep; | 1535 | pte = *ptep; |
1533 | if (!pte_present(pte)) | 1536 | if (!pte_present(pte)) |
1534 | goto no_page; | 1537 | goto no_page; |
1538 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | ||
1539 | goto no_page; | ||
1535 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 1540 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
1536 | goto unlock; | 1541 | goto unlock; |
1537 | 1542 | ||
@@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1683 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | 1688 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); |
1684 | vm_flags &= (gup_flags & FOLL_FORCE) ? | 1689 | vm_flags &= (gup_flags & FOLL_FORCE) ? |
1685 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 1690 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
1691 | |||
1692 | /* | ||
1693 | * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault | ||
1694 | * would be called on PROT_NONE ranges. We must never invoke | ||
1695 | * handle_mm_fault on PROT_NONE ranges or the NUMA hinting | ||
1696 | * page faults would unprotect the PROT_NONE ranges if | ||
1697 | * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd | ||
1698 | * bitflag. So to avoid that, don't set FOLL_NUMA if | ||
1699 | * FOLL_FORCE is set. | ||
1700 | */ | ||
1701 | if (!(gup_flags & FOLL_FORCE)) | ||
1702 | gup_flags |= FOLL_NUMA; | ||
1703 | |||
1686 | i = 0; | 1704 | i = 0; |
1687 | 1705 | ||
1688 | do { | 1706 | do { |
@@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3412 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 3430 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
3413 | } | 3431 | } |
3414 | 3432 | ||
3433 | int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | ||
3434 | unsigned long addr, int current_nid) | ||
3435 | { | ||
3436 | get_page(page); | ||
3437 | |||
3438 | count_vm_numa_event(NUMA_HINT_FAULTS); | ||
3439 | if (current_nid == numa_node_id()) | ||
3440 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | ||
3441 | |||
3442 | return mpol_misplaced(page, vma, addr); | ||
3443 | } | ||
3444 | |||
3445 | int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3446 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) | ||
3447 | { | ||
3448 | struct page *page = NULL; | ||
3449 | spinlock_t *ptl; | ||
3450 | int current_nid = -1; | ||
3451 | int target_nid; | ||
3452 | bool migrated = false; | ||
3453 | |||
3454 | /* | ||
3455 | * The "pte" at this point cannot be used safely without | ||
3456 | * validation through pte_unmap_same(). It's of NUMA type but | ||
3457 | * the pfn may be screwed if the read is non atomic. | ||
3458 | * | ||
3459 | * ptep_modify_prot_start is not called as this is clearing | ||
3460 | * the _PAGE_NUMA bit and it is not really expected that there | ||
3461 | * would be concurrent hardware modifications to the PTE. | ||
3462 | */ | ||
3463 | ptl = pte_lockptr(mm, pmd); | ||
3464 | spin_lock(ptl); | ||
3465 | if (unlikely(!pte_same(*ptep, pte))) { | ||
3466 | pte_unmap_unlock(ptep, ptl); | ||
3467 | goto out; | ||
3468 | } | ||
3469 | |||
3470 | pte = pte_mknonnuma(pte); | ||
3471 | set_pte_at(mm, addr, ptep, pte); | ||
3472 | update_mmu_cache(vma, addr, ptep); | ||
3473 | |||
3474 | page = vm_normal_page(vma, addr, pte); | ||
3475 | if (!page) { | ||
3476 | pte_unmap_unlock(ptep, ptl); | ||
3477 | return 0; | ||
3478 | } | ||
3479 | |||
3480 | current_nid = page_to_nid(page); | ||
3481 | target_nid = numa_migrate_prep(page, vma, addr, current_nid); | ||
3482 | pte_unmap_unlock(ptep, ptl); | ||
3483 | if (target_nid == -1) { | ||
3484 | /* | ||
3485 | * Account for the fault against the current node if it is not | ||
3486 | * being replaced, regardless of where the page is located. | ||
3487 | */ | ||
3488 | current_nid = numa_node_id(); | ||
3489 | put_page(page); | ||
3490 | goto out; | ||
3491 | } | ||
3492 | |||
3493 | /* Migrate to the requested node */ | ||
3494 | migrated = migrate_misplaced_page(page, target_nid); | ||
3495 | if (migrated) | ||
3496 | current_nid = target_nid; | ||
3497 | |||
3498 | out: | ||
3499 | if (current_nid != -1) | ||
3500 | task_numa_fault(current_nid, 1, migrated); | ||
3501 | return 0; | ||
3502 | } | ||
3503 | |||
3504 | /* NUMA hinting page fault entry point for regular pmds */ | ||
3505 | #ifdef CONFIG_NUMA_BALANCING | ||
3506 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3507 | unsigned long addr, pmd_t *pmdp) | ||
3508 | { | ||
3509 | pmd_t pmd; | ||
3510 | pte_t *pte, *orig_pte; | ||
3511 | unsigned long _addr = addr & PMD_MASK; | ||
3512 | unsigned long offset; | ||
3513 | spinlock_t *ptl; | ||
3514 | bool numa = false; | ||
3515 | int local_nid = numa_node_id(); | ||
3516 | |||
3517 | spin_lock(&mm->page_table_lock); | ||
3518 | pmd = *pmdp; | ||
3519 | if (pmd_numa(pmd)) { | ||
3520 | set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); | ||
3521 | numa = true; | ||
3522 | } | ||
3523 | spin_unlock(&mm->page_table_lock); | ||
3524 | |||
3525 | if (!numa) | ||
3526 | return 0; | ||
3527 | |||
3528 | /* we're in a page fault so some vma must be in the range */ | ||
3529 | BUG_ON(!vma); | ||
3530 | BUG_ON(vma->vm_start >= _addr + PMD_SIZE); | ||
3531 | offset = max(_addr, vma->vm_start) & ~PMD_MASK; | ||
3532 | VM_BUG_ON(offset >= PMD_SIZE); | ||
3533 | orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); | ||
3534 | pte += offset >> PAGE_SHIFT; | ||
3535 | for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { | ||
3536 | pte_t pteval = *pte; | ||
3537 | struct page *page; | ||
3538 | int curr_nid = local_nid; | ||
3539 | int target_nid; | ||
3540 | bool migrated; | ||
3541 | if (!pte_present(pteval)) | ||
3542 | continue; | ||
3543 | if (!pte_numa(pteval)) | ||
3544 | continue; | ||
3545 | if (addr >= vma->vm_end) { | ||
3546 | vma = find_vma(mm, addr); | ||
3547 | /* there's a pte present so there must be a vma */ | ||
3548 | BUG_ON(!vma); | ||
3549 | BUG_ON(addr < vma->vm_start); | ||
3550 | } | ||
3551 | if (pte_numa(pteval)) { | ||
3552 | pteval = pte_mknonnuma(pteval); | ||
3553 | set_pte_at(mm, addr, pte, pteval); | ||
3554 | } | ||
3555 | page = vm_normal_page(vma, addr, pteval); | ||
3556 | if (unlikely(!page)) | ||
3557 | continue; | ||
3558 | /* only check non-shared pages */ | ||
3559 | if (unlikely(page_mapcount(page) != 1)) | ||
3560 | continue; | ||
3561 | |||
3562 | /* | ||
3563 | * Note that the NUMA fault is later accounted to either | ||
3564 | * the node that is currently running or where the page is | ||
3565 | * migrated to. | ||
3566 | */ | ||
3567 | curr_nid = local_nid; | ||
3568 | target_nid = numa_migrate_prep(page, vma, addr, | ||
3569 | page_to_nid(page)); | ||
3570 | if (target_nid == -1) { | ||
3571 | put_page(page); | ||
3572 | continue; | ||
3573 | } | ||
3574 | |||
3575 | /* Migrate to the requested node */ | ||
3576 | pte_unmap_unlock(pte, ptl); | ||
3577 | migrated = migrate_misplaced_page(page, target_nid); | ||
3578 | if (migrated) | ||
3579 | curr_nid = target_nid; | ||
3580 | task_numa_fault(curr_nid, 1, migrated); | ||
3581 | |||
3582 | pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); | ||
3583 | } | ||
3584 | pte_unmap_unlock(orig_pte, ptl); | ||
3585 | |||
3586 | return 0; | ||
3587 | } | ||
3588 | #else | ||
3589 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3590 | unsigned long addr, pmd_t *pmdp) | ||
3591 | { | ||
3592 | BUG(); | ||
3593 | } | ||
3594 | #endif /* CONFIG_NUMA_BALANCING */ | ||
3595 | |||
3415 | /* | 3596 | /* |
3416 | * These routines also need to handle stuff like marking pages dirty | 3597 | * These routines also need to handle stuff like marking pages dirty |
3417 | * and/or accessed for architectures that don't do it in hardware (most | 3598 | * and/or accessed for architectures that don't do it in hardware (most |
@@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm, | |||
3450 | pte, pmd, flags, entry); | 3631 | pte, pmd, flags, entry); |
3451 | } | 3632 | } |
3452 | 3633 | ||
3634 | if (pte_numa(entry)) | ||
3635 | return do_numa_page(mm, vma, address, entry, pte, pmd); | ||
3636 | |||
3453 | ptl = pte_lockptr(mm, pmd); | 3637 | ptl = pte_lockptr(mm, pmd); |
3454 | spin_lock(ptl); | 3638 | spin_lock(ptl); |
3455 | if (unlikely(!pte_same(*pte, entry))) | 3639 | if (unlikely(!pte_same(*pte, entry))) |
@@ -3520,8 +3704,11 @@ retry: | |||
3520 | if (pmd_trans_huge(orig_pmd)) { | 3704 | if (pmd_trans_huge(orig_pmd)) { |
3521 | unsigned int dirty = flags & FAULT_FLAG_WRITE; | 3705 | unsigned int dirty = flags & FAULT_FLAG_WRITE; |
3522 | 3706 | ||
3523 | if (dirty && !pmd_write(orig_pmd) && | 3707 | if (pmd_numa(orig_pmd)) |
3524 | !pmd_trans_splitting(orig_pmd)) { | 3708 | return do_huge_pmd_numa_page(mm, vma, address, |
3709 | orig_pmd, pmd); | ||
3710 | |||
3711 | if (dirty && !pmd_write(orig_pmd)) { | ||
3525 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, | 3712 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3526 | orig_pmd); | 3713 | orig_pmd); |
3527 | /* | 3714 | /* |
@@ -3536,16 +3723,21 @@ retry: | |||
3536 | huge_pmd_set_accessed(mm, vma, address, pmd, | 3723 | huge_pmd_set_accessed(mm, vma, address, pmd, |
3537 | orig_pmd, dirty); | 3724 | orig_pmd, dirty); |
3538 | } | 3725 | } |
3726 | |||
3539 | return 0; | 3727 | return 0; |
3540 | } | 3728 | } |
3541 | } | 3729 | } |
3542 | 3730 | ||
3731 | if (pmd_numa(*pmd)) | ||
3732 | return do_pmd_numa_page(mm, vma, address, pmd); | ||
3733 | |||
3543 | /* | 3734 | /* |
3544 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3735 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
3545 | * run pte_offset_map on the pmd, if an huge pmd could | 3736 | * run pte_offset_map on the pmd, if an huge pmd could |
3546 | * materialize from under us from a different thread. | 3737 | * materialize from under us from a different thread. |
3547 | */ | 3738 | */ |
3548 | if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) | 3739 | if (unlikely(pmd_none(*pmd)) && |
3740 | unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
3549 | return VM_FAULT_OOM; | 3741 | return VM_FAULT_OOM; |
3550 | /* if an huge pmd materialized from under us just retry later */ | 3742 | /* if an huge pmd materialized from under us just retry later */ |
3551 | if (unlikely(pmd_trans_huge(*pmd))) | 3743 | if (unlikely(pmd_trans_huge(*pmd))) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 518baa896e83..962e353aa86f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1055,7 +1055,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1055 | * migrate_pages returns # of failed pages. | 1055 | * migrate_pages returns # of failed pages. |
1056 | */ | 1056 | */ |
1057 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1057 | ret = migrate_pages(&source, alloc_migrate_target, 0, |
1058 | true, MIGRATE_SYNC); | 1058 | true, MIGRATE_SYNC, |
1059 | MR_MEMORY_HOTPLUG); | ||
1059 | if (ret) | 1060 | if (ret) |
1060 | putback_lru_pages(&source); | 1061 | putback_lru_pages(&source); |
1061 | } | 1062 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index aaf54566cb6b..d1b315e98627 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@ | |||
90 | #include <linux/syscalls.h> | 90 | #include <linux/syscalls.h> |
91 | #include <linux/ctype.h> | 91 | #include <linux/ctype.h> |
92 | #include <linux/mm_inline.h> | 92 | #include <linux/mm_inline.h> |
93 | #include <linux/mmu_notifier.h> | ||
93 | 94 | ||
94 | #include <asm/tlbflush.h> | 95 | #include <asm/tlbflush.h> |
95 | #include <asm/uaccess.h> | 96 | #include <asm/uaccess.h> |
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = { | |||
117 | .flags = MPOL_F_LOCAL, | 118 | .flags = MPOL_F_LOCAL, |
118 | }; | 119 | }; |
119 | 120 | ||
121 | static struct mempolicy preferred_node_policy[MAX_NUMNODES]; | ||
122 | |||
123 | static struct mempolicy *get_task_policy(struct task_struct *p) | ||
124 | { | ||
125 | struct mempolicy *pol = p->mempolicy; | ||
126 | int node; | ||
127 | |||
128 | if (!pol) { | ||
129 | node = numa_node_id(); | ||
130 | if (node != -1) | ||
131 | pol = &preferred_node_policy[node]; | ||
132 | |||
133 | /* preferred_node_policy is not initialised early in boot */ | ||
134 | if (!pol->mode) | ||
135 | pol = NULL; | ||
136 | } | ||
137 | |||
138 | return pol; | ||
139 | } | ||
140 | |||
120 | static const struct mempolicy_operations { | 141 | static const struct mempolicy_operations { |
121 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); | 142 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); |
122 | /* | 143 | /* |
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
254 | if (mode == MPOL_DEFAULT) { | 275 | if (mode == MPOL_DEFAULT) { |
255 | if (nodes && !nodes_empty(*nodes)) | 276 | if (nodes && !nodes_empty(*nodes)) |
256 | return ERR_PTR(-EINVAL); | 277 | return ERR_PTR(-EINVAL); |
257 | return NULL; /* simply delete any existing policy */ | 278 | return NULL; |
258 | } | 279 | } |
259 | VM_BUG_ON(!nodes); | 280 | VM_BUG_ON(!nodes); |
260 | 281 | ||
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
269 | (flags & MPOL_F_RELATIVE_NODES))) | 290 | (flags & MPOL_F_RELATIVE_NODES))) |
270 | return ERR_PTR(-EINVAL); | 291 | return ERR_PTR(-EINVAL); |
271 | } | 292 | } |
293 | } else if (mode == MPOL_LOCAL) { | ||
294 | if (!nodes_empty(*nodes)) | ||
295 | return ERR_PTR(-EINVAL); | ||
296 | mode = MPOL_PREFERRED; | ||
272 | } else if (nodes_empty(*nodes)) | 297 | } else if (nodes_empty(*nodes)) |
273 | return ERR_PTR(-EINVAL); | 298 | return ERR_PTR(-EINVAL); |
274 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 299 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma, | |||
561 | return 0; | 586 | return 0; |
562 | } | 587 | } |
563 | 588 | ||
589 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
590 | /* | ||
591 | * This is used to mark a range of virtual addresses to be inaccessible. | ||
592 | * These are later cleared by a NUMA hinting fault. Depending on these | ||
593 | * faults, pages may be migrated for better NUMA placement. | ||
594 | * | ||
595 | * This is assuming that NUMA faults are handled using PROT_NONE. If | ||
596 | * an architecture makes a different choice, it will need further | ||
597 | * changes to the core. | ||
598 | */ | ||
599 | unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
600 | unsigned long addr, unsigned long end) | ||
601 | { | ||
602 | int nr_updated; | ||
603 | BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); | ||
604 | |||
605 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); | ||
606 | if (nr_updated) | ||
607 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); | ||
608 | |||
609 | return nr_updated; | ||
610 | } | ||
611 | #else | ||
612 | static unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
613 | unsigned long addr, unsigned long end) | ||
614 | { | ||
615 | return 0; | ||
616 | } | ||
617 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | ||
618 | |||
564 | /* | 619 | /* |
565 | * Check if all pages in a range are on a set of nodes. | 620 | * Check if all pages in a range are on a set of nodes. |
566 | * If pagelist != NULL then isolate pages from the LRU and | 621 | * If pagelist != NULL then isolate pages from the LRU and |
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
579 | return ERR_PTR(-EFAULT); | 634 | return ERR_PTR(-EFAULT); |
580 | prev = NULL; | 635 | prev = NULL; |
581 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 636 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
637 | unsigned long endvma = vma->vm_end; | ||
638 | |||
639 | if (endvma > end) | ||
640 | endvma = end; | ||
641 | if (vma->vm_start > start) | ||
642 | start = vma->vm_start; | ||
643 | |||
582 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 644 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
583 | if (!vma->vm_next && vma->vm_end < end) | 645 | if (!vma->vm_next && vma->vm_end < end) |
584 | return ERR_PTR(-EFAULT); | 646 | return ERR_PTR(-EFAULT); |
585 | if (prev && prev->vm_end < vma->vm_start) | 647 | if (prev && prev->vm_end < vma->vm_start) |
586 | return ERR_PTR(-EFAULT); | 648 | return ERR_PTR(-EFAULT); |
587 | } | 649 | } |
588 | if (!is_vm_hugetlb_page(vma) && | 650 | |
589 | ((flags & MPOL_MF_STRICT) || | 651 | if (is_vm_hugetlb_page(vma)) |
652 | goto next; | ||
653 | |||
654 | if (flags & MPOL_MF_LAZY) { | ||
655 | change_prot_numa(vma, start, endvma); | ||
656 | goto next; | ||
657 | } | ||
658 | |||
659 | if ((flags & MPOL_MF_STRICT) || | ||
590 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | 660 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && |
591 | vma_migratable(vma)))) { | 661 | vma_migratable(vma))) { |
592 | unsigned long endvma = vma->vm_end; | ||
593 | 662 | ||
594 | if (endvma > end) | ||
595 | endvma = end; | ||
596 | if (vma->vm_start > start) | ||
597 | start = vma->vm_start; | ||
598 | err = check_pgd_range(vma, start, endvma, nodes, | 663 | err = check_pgd_range(vma, start, endvma, nodes, |
599 | flags, private); | 664 | flags, private); |
600 | if (err) { | 665 | if (err) { |
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
602 | break; | 667 | break; |
603 | } | 668 | } |
604 | } | 669 | } |
670 | next: | ||
605 | prev = vma; | 671 | prev = vma; |
606 | } | 672 | } |
607 | return first; | 673 | return first; |
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
961 | 1027 | ||
962 | if (!list_empty(&pagelist)) { | 1028 | if (!list_empty(&pagelist)) { |
963 | err = migrate_pages(&pagelist, new_node_page, dest, | 1029 | err = migrate_pages(&pagelist, new_node_page, dest, |
964 | false, MIGRATE_SYNC); | 1030 | false, MIGRATE_SYNC, |
1031 | MR_SYSCALL); | ||
965 | if (err) | 1032 | if (err) |
966 | putback_lru_pages(&pagelist); | 1033 | putback_lru_pages(&pagelist); |
967 | } | 1034 | } |
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1133 | int err; | 1200 | int err; |
1134 | LIST_HEAD(pagelist); | 1201 | LIST_HEAD(pagelist); |
1135 | 1202 | ||
1136 | if (flags & ~(unsigned long)(MPOL_MF_STRICT | | 1203 | if (flags & ~(unsigned long)MPOL_MF_VALID) |
1137 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
1138 | return -EINVAL; | 1204 | return -EINVAL; |
1139 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) | 1205 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
1140 | return -EPERM; | 1206 | return -EPERM; |
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1157 | if (IS_ERR(new)) | 1223 | if (IS_ERR(new)) |
1158 | return PTR_ERR(new); | 1224 | return PTR_ERR(new); |
1159 | 1225 | ||
1226 | if (flags & MPOL_MF_LAZY) | ||
1227 | new->flags |= MPOL_F_MOF; | ||
1228 | |||
1160 | /* | 1229 | /* |
1161 | * If we are using the default policy then operation | 1230 | * If we are using the default policy then operation |
1162 | * on discontinuous address spaces is okay after all | 1231 | * on discontinuous address spaces is okay after all |
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1193 | vma = check_range(mm, start, end, nmask, | 1262 | vma = check_range(mm, start, end, nmask, |
1194 | flags | MPOL_MF_INVERT, &pagelist); | 1263 | flags | MPOL_MF_INVERT, &pagelist); |
1195 | 1264 | ||
1196 | err = PTR_ERR(vma); | 1265 | err = PTR_ERR(vma); /* maybe ... */ |
1197 | if (!IS_ERR(vma)) { | 1266 | if (!IS_ERR(vma)) |
1198 | int nr_failed = 0; | ||
1199 | |||
1200 | err = mbind_range(mm, start, end, new); | 1267 | err = mbind_range(mm, start, end, new); |
1201 | 1268 | ||
1269 | if (!err) { | ||
1270 | int nr_failed = 0; | ||
1271 | |||
1202 | if (!list_empty(&pagelist)) { | 1272 | if (!list_empty(&pagelist)) { |
1273 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | ||
1203 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1274 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1204 | (unsigned long)vma, | 1275 | (unsigned long)vma, |
1205 | false, MIGRATE_SYNC); | 1276 | false, MIGRATE_SYNC, |
1277 | MR_MEMPOLICY_MBIND); | ||
1206 | if (nr_failed) | 1278 | if (nr_failed) |
1207 | putback_lru_pages(&pagelist); | 1279 | putback_lru_pages(&pagelist); |
1208 | } | 1280 | } |
1209 | 1281 | ||
1210 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 1282 | if (nr_failed && (flags & MPOL_MF_STRICT)) |
1211 | err = -EIO; | 1283 | err = -EIO; |
1212 | } else | 1284 | } else |
1213 | putback_lru_pages(&pagelist); | 1285 | putback_lru_pages(&pagelist); |
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
1546 | struct mempolicy *get_vma_policy(struct task_struct *task, | 1618 | struct mempolicy *get_vma_policy(struct task_struct *task, |
1547 | struct vm_area_struct *vma, unsigned long addr) | 1619 | struct vm_area_struct *vma, unsigned long addr) |
1548 | { | 1620 | { |
1549 | struct mempolicy *pol = task->mempolicy; | 1621 | struct mempolicy *pol = get_task_policy(task); |
1550 | 1622 | ||
1551 | if (vma) { | 1623 | if (vma) { |
1552 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1624 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
@@ -1956,7 +2028,7 @@ retry_cpuset: | |||
1956 | */ | 2028 | */ |
1957 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) | 2029 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) |
1958 | { | 2030 | { |
1959 | struct mempolicy *pol = current->mempolicy; | 2031 | struct mempolicy *pol = get_task_policy(current); |
1960 | struct page *page; | 2032 | struct page *page; |
1961 | unsigned int cpuset_mems_cookie; | 2033 | unsigned int cpuset_mems_cookie; |
1962 | 2034 | ||
@@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n) | |||
2140 | kmem_cache_free(sn_cache, n); | 2212 | kmem_cache_free(sn_cache, n); |
2141 | } | 2213 | } |
2142 | 2214 | ||
2215 | /** | ||
2216 | * mpol_misplaced - check whether current page node is valid in policy | ||
2217 | * | ||
2218 | * @page - page to be checked | ||
2219 | * @vma - vm area where page mapped | ||
2220 | * @addr - virtual address where page mapped | ||
2221 | * | ||
2222 | * Lookup current policy node id for vma,addr and "compare to" page's | ||
2223 | * node id. | ||
2224 | * | ||
2225 | * Returns: | ||
2226 | * -1 - not misplaced, page is in the right node | ||
2227 | * node - node id where the page should be | ||
2228 | * | ||
2229 | * Policy determination "mimics" alloc_page_vma(). | ||
2230 | * Called from fault path where we know the vma and faulting address. | ||
2231 | */ | ||
2232 | int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) | ||
2233 | { | ||
2234 | struct mempolicy *pol; | ||
2235 | struct zone *zone; | ||
2236 | int curnid = page_to_nid(page); | ||
2237 | unsigned long pgoff; | ||
2238 | int polnid = -1; | ||
2239 | int ret = -1; | ||
2240 | |||
2241 | BUG_ON(!vma); | ||
2242 | |||
2243 | pol = get_vma_policy(current, vma, addr); | ||
2244 | if (!(pol->flags & MPOL_F_MOF)) | ||
2245 | goto out; | ||
2246 | |||
2247 | switch (pol->mode) { | ||
2248 | case MPOL_INTERLEAVE: | ||
2249 | BUG_ON(addr >= vma->vm_end); | ||
2250 | BUG_ON(addr < vma->vm_start); | ||
2251 | |||
2252 | pgoff = vma->vm_pgoff; | ||
2253 | pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; | ||
2254 | polnid = offset_il_node(pol, vma, pgoff); | ||
2255 | break; | ||
2256 | |||
2257 | case MPOL_PREFERRED: | ||
2258 | if (pol->flags & MPOL_F_LOCAL) | ||
2259 | polnid = numa_node_id(); | ||
2260 | else | ||
2261 | polnid = pol->v.preferred_node; | ||
2262 | break; | ||
2263 | |||
2264 | case MPOL_BIND: | ||
2265 | /* | ||
2266 | * allows binding to multiple nodes. | ||
2267 | * use current page if in policy nodemask, | ||
2268 | * else select nearest allowed node, if any. | ||
2269 | * If no allowed nodes, use current [!misplaced]. | ||
2270 | */ | ||
2271 | if (node_isset(curnid, pol->v.nodes)) | ||
2272 | goto out; | ||
2273 | (void)first_zones_zonelist( | ||
2274 | node_zonelist(numa_node_id(), GFP_HIGHUSER), | ||
2275 | gfp_zone(GFP_HIGHUSER), | ||
2276 | &pol->v.nodes, &zone); | ||
2277 | polnid = zone->node; | ||
2278 | break; | ||
2279 | |||
2280 | default: | ||
2281 | BUG(); | ||
2282 | } | ||
2283 | |||
2284 | /* Migrate the page towards the node whose CPU is referencing it */ | ||
2285 | if (pol->flags & MPOL_F_MORON) { | ||
2286 | int last_nid; | ||
2287 | |||
2288 | polnid = numa_node_id(); | ||
2289 | |||
2290 | /* | ||
2291 | * Multi-stage node selection is used in conjunction | ||
2292 | * with a periodic migration fault to build a temporal | ||
2293 | * task<->page relation. By using a two-stage filter we | ||
2294 | * remove short/unlikely relations. | ||
2295 | * | ||
2296 | * Using P(p) ~ n_p / n_t as per frequentist | ||
2297 | * probability, we can equate a task's usage of a | ||
2298 | * particular page (n_p) per total usage of this | ||
2299 | * page (n_t) (in a given time-span) to a probability. | ||
2300 | * | ||
2301 | * Our periodic faults will sample this probability and | ||
2302 | * getting the same result twice in a row, given these | ||
2303 | * samples are fully independent, is then given by | ||
2304 | * P(n)^2, provided our sample period is sufficiently | ||
2305 | * short compared to the usage pattern. | ||
2306 | * | ||
2307 | * This quadratic squishes small probabilities, making | ||
2308 | * it less likely we act on an unlikely task<->page | ||
2309 | * relation. | ||
2310 | */ | ||
2311 | last_nid = page_xchg_last_nid(page, polnid); | ||
2312 | if (last_nid != polnid) | ||
2313 | goto out; | ||
2314 | } | ||
2315 | |||
2316 | if (curnid != polnid) | ||
2317 | ret = polnid; | ||
2318 | out: | ||
2319 | mpol_cond_put(pol); | ||
2320 | |||
2321 | return ret; | ||
2322 | } | ||
2323 | |||
2143 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | 2324 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) |
2144 | { | 2325 | { |
2145 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); | 2326 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); |
@@ -2305,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
2305 | mutex_unlock(&p->mutex); | 2486 | mutex_unlock(&p->mutex); |
2306 | } | 2487 | } |
2307 | 2488 | ||
2489 | #ifdef CONFIG_NUMA_BALANCING | ||
2490 | static bool __initdata numabalancing_override; | ||
2491 | |||
2492 | static void __init check_numabalancing_enable(void) | ||
2493 | { | ||
2494 | bool numabalancing_default = false; | ||
2495 | |||
2496 | if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) | ||
2497 | numabalancing_default = true; | ||
2498 | |||
2499 | if (nr_node_ids > 1 && !numabalancing_override) { | ||
2500 | printk(KERN_INFO "Enabling automatic NUMA balancing. " | ||
2501 | "Configure with numa_balancing= or sysctl"); | ||
2502 | set_numabalancing_state(numabalancing_default); | ||
2503 | } | ||
2504 | } | ||
2505 | |||
2506 | static int __init setup_numabalancing(char *str) | ||
2507 | { | ||
2508 | int ret = 0; | ||
2509 | if (!str) | ||
2510 | goto out; | ||
2511 | numabalancing_override = true; | ||
2512 | |||
2513 | if (!strcmp(str, "enable")) { | ||
2514 | set_numabalancing_state(true); | ||
2515 | ret = 1; | ||
2516 | } else if (!strcmp(str, "disable")) { | ||
2517 | set_numabalancing_state(false); | ||
2518 | ret = 1; | ||
2519 | } | ||
2520 | out: | ||
2521 | if (!ret) | ||
2522 | printk(KERN_WARNING "Unable to parse numa_balancing=\n"); | ||
2523 | |||
2524 | return ret; | ||
2525 | } | ||
2526 | __setup("numa_balancing=", setup_numabalancing); | ||
2527 | #else | ||
2528 | static inline void __init check_numabalancing_enable(void) | ||
2529 | { | ||
2530 | } | ||
2531 | #endif /* CONFIG_NUMA_BALANCING */ | ||
2532 | |||
2308 | /* assumes fs == KERNEL_DS */ | 2533 | /* assumes fs == KERNEL_DS */ |
2309 | void __init numa_policy_init(void) | 2534 | void __init numa_policy_init(void) |
2310 | { | 2535 | { |
@@ -2320,6 +2545,15 @@ void __init numa_policy_init(void) | |||
2320 | sizeof(struct sp_node), | 2545 | sizeof(struct sp_node), |
2321 | 0, SLAB_PANIC, NULL); | 2546 | 0, SLAB_PANIC, NULL); |
2322 | 2547 | ||
2548 | for_each_node(nid) { | ||
2549 | preferred_node_policy[nid] = (struct mempolicy) { | ||
2550 | .refcnt = ATOMIC_INIT(1), | ||
2551 | .mode = MPOL_PREFERRED, | ||
2552 | .flags = MPOL_F_MOF | MPOL_F_MORON, | ||
2553 | .v = { .preferred_node = nid, }, | ||
2554 | }; | ||
2555 | } | ||
2556 | |||
2323 | /* | 2557 | /* |
2324 | * Set interleaving policy for system init. Interleaving is only | 2558 | * Set interleaving policy for system init. Interleaving is only |
2325 | * enabled across suitably sized nodes (default is >= 16MB), or | 2559 | * enabled across suitably sized nodes (default is >= 16MB), or |
@@ -2346,6 +2580,8 @@ void __init numa_policy_init(void) | |||
2346 | 2580 | ||
2347 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) | 2581 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) |
2348 | printk("numa_policy_init: interleaving failed\n"); | 2582 | printk("numa_policy_init: interleaving failed\n"); |
2583 | |||
2584 | check_numabalancing_enable(); | ||
2349 | } | 2585 | } |
2350 | 2586 | ||
2351 | /* Reset policy of current process to default */ | 2587 | /* Reset policy of current process to default */ |
@@ -2362,14 +2598,13 @@ void numa_default_policy(void) | |||
2362 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag | 2598 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag |
2363 | * Used only for mpol_parse_str() and mpol_to_str() | 2599 | * Used only for mpol_parse_str() and mpol_to_str() |
2364 | */ | 2600 | */ |
2365 | #define MPOL_LOCAL MPOL_MAX | ||
2366 | static const char * const policy_modes[] = | 2601 | static const char * const policy_modes[] = |
2367 | { | 2602 | { |
2368 | [MPOL_DEFAULT] = "default", | 2603 | [MPOL_DEFAULT] = "default", |
2369 | [MPOL_PREFERRED] = "prefer", | 2604 | [MPOL_PREFERRED] = "prefer", |
2370 | [MPOL_BIND] = "bind", | 2605 | [MPOL_BIND] = "bind", |
2371 | [MPOL_INTERLEAVE] = "interleave", | 2606 | [MPOL_INTERLEAVE] = "interleave", |
2372 | [MPOL_LOCAL] = "local" | 2607 | [MPOL_LOCAL] = "local", |
2373 | }; | 2608 | }; |
2374 | 2609 | ||
2375 | 2610 | ||
@@ -2415,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2415 | if (flags) | 2650 | if (flags) |
2416 | *flags++ = '\0'; /* terminate mode string */ | 2651 | *flags++ = '\0'; /* terminate mode string */ |
2417 | 2652 | ||
2418 | for (mode = 0; mode <= MPOL_LOCAL; mode++) { | 2653 | for (mode = 0; mode < MPOL_MAX; mode++) { |
2419 | if (!strcmp(str, policy_modes[mode])) { | 2654 | if (!strcmp(str, policy_modes[mode])) { |
2420 | break; | 2655 | break; |
2421 | } | 2656 | } |
2422 | } | 2657 | } |
2423 | if (mode > MPOL_LOCAL) | 2658 | if (mode >= MPOL_MAX) |
2424 | goto out; | 2659 | goto out; |
2425 | 2660 | ||
2426 | switch (mode) { | 2661 | switch (mode) { |
diff --git a/mm/migrate.c b/mm/migrate.c
index cae02711181d..32efd8028bc9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -39,6 +39,9 @@ | |||
39 | 39 | ||
40 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> |
41 | 41 | ||
42 | #define CREATE_TRACE_POINTS | ||
43 | #include <trace/events/migrate.h> | ||
44 | |||
42 | #include "internal.h" | 45 | #include "internal.h" |
43 | 46 | ||
44 | /* | 47 | /* |
@@ -293,7 +296,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
293 | struct page *newpage, struct page *page, | 296 | struct page *newpage, struct page *page, |
294 | struct buffer_head *head, enum migrate_mode mode) | 297 | struct buffer_head *head, enum migrate_mode mode) |
295 | { | 298 | { |
296 | int expected_count; | 299 | int expected_count = 0; |
297 | void **pslot; | 300 | void **pslot; |
298 | 301 | ||
299 | if (!mapping) { | 302 | if (!mapping) { |
@@ -421,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
421 | */ | 424 | */ |
422 | void migrate_page_copy(struct page *newpage, struct page *page) | 425 | void migrate_page_copy(struct page *newpage, struct page *page) |
423 | { | 426 | { |
424 | if (PageHuge(page)) | 427 | if (PageHuge(page) || PageTransHuge(page)) |
425 | copy_huge_page(newpage, page); | 428 | copy_huge_page(newpage, page); |
426 | else | 429 | else |
427 | copy_highpage(newpage, page); | 430 | copy_highpage(newpage, page); |
@@ -765,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
765 | */ | 768 | */ |
766 | if (PageAnon(page)) { | 769 | if (PageAnon(page)) { |
767 | /* | 770 | /* |
768 | * Only page_lock_anon_vma() understands the subtleties of | 771 | * Only page_lock_anon_vma_read() understands the subtleties of |
769 | * getting a hold on an anon_vma from outside one of its mms. | 772 | * getting a hold on an anon_vma from outside one of its mms. |
770 | */ | 773 | */ |
771 | anon_vma = page_get_anon_vma(page); | 774 | anon_vma = page_get_anon_vma(page); |
@@ -998,10 +1001,11 @@ out: | |||
998 | */ | 1001 | */ |
999 | int migrate_pages(struct list_head *from, | 1002 | int migrate_pages(struct list_head *from, |
1000 | new_page_t get_new_page, unsigned long private, bool offlining, | 1003 | new_page_t get_new_page, unsigned long private, bool offlining, |
1001 | enum migrate_mode mode) | 1004 | enum migrate_mode mode, int reason) |
1002 | { | 1005 | { |
1003 | int retry = 1; | 1006 | int retry = 1; |
1004 | int nr_failed = 0; | 1007 | int nr_failed = 0; |
1008 | int nr_succeeded = 0; | ||
1005 | int pass = 0; | 1009 | int pass = 0; |
1006 | struct page *page; | 1010 | struct page *page; |
1007 | struct page *page2; | 1011 | struct page *page2; |
@@ -1028,6 +1032,7 @@ int migrate_pages(struct list_head *from, | |||
1028 | retry++; | 1032 | retry++; |
1029 | break; | 1033 | break; |
1030 | case MIGRATEPAGE_SUCCESS: | 1034 | case MIGRATEPAGE_SUCCESS: |
1035 | nr_succeeded++; | ||
1031 | break; | 1036 | break; |
1032 | default: | 1037 | default: |
1033 | /* Permanent failure */ | 1038 | /* Permanent failure */ |
@@ -1038,6 +1043,12 @@ int migrate_pages(struct list_head *from, | |||
1038 | } | 1043 | } |
1039 | rc = nr_failed + retry; | 1044 | rc = nr_failed + retry; |
1040 | out: | 1045 | out: |
1046 | if (nr_succeeded) | ||
1047 | count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); | ||
1048 | if (nr_failed) | ||
1049 | count_vm_events(PGMIGRATE_FAIL, nr_failed); | ||
1050 | trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); | ||
1051 | |||
1041 | if (!swapwrite) | 1052 | if (!swapwrite) |
1042 | current->flags &= ~PF_SWAPWRITE; | 1053 | current->flags &= ~PF_SWAPWRITE; |
1043 | 1054 | ||
@@ -1176,7 +1187,8 @@ set_status: | |||
1176 | err = 0; | 1187 | err = 0; |
1177 | if (!list_empty(&pagelist)) { | 1188 | if (!list_empty(&pagelist)) { |
1178 | err = migrate_pages(&pagelist, new_page_node, | 1189 | err = migrate_pages(&pagelist, new_page_node, |
1179 | (unsigned long)pm, 0, MIGRATE_SYNC); | 1190 | (unsigned long)pm, 0, MIGRATE_SYNC, |
1191 | MR_SYSCALL); | ||
1180 | if (err) | 1192 | if (err) |
1181 | putback_lru_pages(&pagelist); | 1193 | putback_lru_pages(&pagelist); |
1182 | } | 1194 | } |
@@ -1440,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, | |||
1440 | } | 1452 | } |
1441 | return err; | 1453 | return err; |
1442 | } | 1454 | } |
1443 | #endif | 1455 | |
1456 | #ifdef CONFIG_NUMA_BALANCING | ||
1457 | /* | ||
1458 | * Returns true if this is a safe migration target node for misplaced NUMA | ||
1459 | * pages. Currently it only checks the watermarks, which is crude. | ||
1460 | */ | ||
1461 | static bool migrate_balanced_pgdat(struct pglist_data *pgdat, | ||
1462 | int nr_migrate_pages) | ||
1463 | { | ||
1464 | int z; | ||
1465 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { | ||
1466 | struct zone *zone = pgdat->node_zones + z; | ||
1467 | |||
1468 | if (!populated_zone(zone)) | ||
1469 | continue; | ||
1470 | |||
1471 | if (zone->all_unreclaimable) | ||
1472 | continue; | ||
1473 | |||
1474 | /* Avoid waking kswapd by allocating nr_migrate_pages pages. */ | ||
1475 | if (!zone_watermark_ok(zone, 0, | ||
1476 | high_wmark_pages(zone) + | ||
1477 | nr_migrate_pages, | ||
1478 | 0, 0)) | ||
1479 | continue; | ||
1480 | return true; | ||
1481 | } | ||
1482 | return false; | ||
1483 | } | ||
1484 | |||
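
The check above amounts to: a node is an acceptable migration target only if at least one populated, reclaimable zone can absorb nr_migrate_pages while staying above its high watermark, so the migration itself never wakes kswapd. A minimal userspace sketch of that idea follows; the toy_* names and the simplified free-pages comparison are illustrative stand-ins for zone_watermark_ok(), not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for one zone on the candidate node. */
struct toy_zone {
	unsigned long free_pages;
	unsigned long high_wmark;	/* stand-in for high_wmark_pages(zone) */
	bool populated;
	bool all_unreclaimable;
};

/*
 * Toy version of the watermark test: the node is an acceptable target
 * if at least one usable zone can absorb nr_migrate_pages while staying
 * above its high watermark (so kswapd is not woken by the migration).
 */
static bool toy_balanced_node(const struct toy_zone *zones, int nr_zones,
			      unsigned long nr_migrate_pages)
{
	for (int z = nr_zones - 1; z >= 0; z--) {
		const struct toy_zone *zone = &zones[z];

		if (!zone->populated || zone->all_unreclaimable)
			continue;
		if (zone->free_pages >= zone->high_wmark + nr_migrate_pages)
			return true;
	}
	return false;
}

int main(void)
{
	struct toy_zone zones[] = {
		{ .free_pages = 1000, .high_wmark = 2000, .populated = true },
		{ .free_pages = 5000, .high_wmark = 2000, .populated = true },
	};

	printf("balanced for 512 pages: %d\n",
	       toy_balanced_node(zones, 2, 512));	/* 1: second zone fits */
	printf("balanced for 4000 pages: %d\n",
	       toy_balanced_node(zones, 2, 4000));	/* 0: neither zone fits */
	return 0;
}
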
1485 | static struct page *alloc_misplaced_dst_page(struct page *page, | ||
1486 | unsigned long data, | ||
1487 | int **result) | ||
1488 | { | ||
1489 | int nid = (int) data; | ||
1490 | struct page *newpage; | ||
1491 | |||
1492 | newpage = alloc_pages_exact_node(nid, | ||
1493 | (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | | ||
1494 | __GFP_NOMEMALLOC | __GFP_NORETRY | | ||
1495 | __GFP_NOWARN) & | ||
1496 | ~GFP_IOFS, 0); | ||
1497 | if (newpage) | ||
1498 | page_xchg_last_nid(newpage, page_last_nid(page)); | ||
1499 | |||
1500 | return newpage; | ||
1501 | } | ||
1502 | |||
1503 | /* | ||
1504 | * page migration rate limiting control. | ||
1505 | * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs | ||
1506 | * window of time. Default here says do not migrate more than 1280M per second. | ||
1507 | * If a node is rate-limited then PTE NUMA updates are also rate-limited. However | ||
1508 | * as it is faults that reset the window, pte updates will happen unconditionally | ||
1509 | * if there has not been a fault since @pteupdate_interval_millisecs after the | ||
1510 | * throttle window closed. | ||
1511 | */ | ||
1512 | static unsigned int migrate_interval_millisecs __read_mostly = 100; | ||
1513 | static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; | ||
1514 | static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); | ||
1515 | |||
1516 | /* Returns true if NUMA migration is currently rate limited */ | ||
1517 | bool migrate_ratelimited(int node) | ||
1518 | { | ||
1519 | pg_data_t *pgdat = NODE_DATA(node); | ||
1520 | |||
1521 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + | ||
1522 | msecs_to_jiffies(pteupdate_interval_millisecs))) | ||
1523 | return false; | ||
1524 | |||
1525 | if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) | ||
1526 | return false; | ||
1527 | |||
1528 | return true; | ||
1529 | } | ||
1530 | |||
1531 | /* Returns true if the node is migrate rate-limited after the update */ | ||
1532 | bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | ||
1533 | { | ||
1534 | bool rate_limited = false; | ||
1535 | |||
1536 | /* | ||
1537 | * Rate-limit the amount of data that is being migrated to a node. | ||
1538 | * Optimal placement is no good if the memory bus is saturated and | ||
1539 | * all the time is being spent migrating! | ||
1540 | */ | ||
1541 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1542 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { | ||
1543 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
1544 | pgdat->numabalancing_migrate_next_window = jiffies + | ||
1545 | msecs_to_jiffies(migrate_interval_millisecs); | ||
1546 | } | ||
1547 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) | ||
1548 | rate_limited = true; | ||
1549 | else | ||
1550 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | ||
1551 | spin_unlock(&pgdat->numabalancing_migrate_lock); | ||
1552 | |||
1553 | return rate_limited; | ||
1554 | } | ||
1555 | |||
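
numamigrate_update_ratelimit() above is a fixed-window limiter: the per-node counter resets once the window expires, and for the rest of a window migration is refused once the counter passes ratelimit_pages. The self-contained sketch below models that window/counter logic; the toy_* names are invented, and the pgdat spinlock is deliberately left out, so this is illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* Per-node rate-limit state, modelled on the pgdat fields added above. */
struct toy_ratelimit {
	unsigned long window_end;	/* "time" at which the window closes */
	unsigned long nr_pages;		/* pages accounted in this window */
};

#define TOY_WINDOW_LEN		100	/* ~migrate_interval_millisecs */
#define TOY_RATELIMIT_PAGES	32768	/* ~ratelimit_pages (128MB of 4K pages) */

/*
 * Fixed-window rate limiter: open a fresh window when the old one has
 * expired, then refuse further migration once the per-window budget is
 * spent.  Returns true if the caller should skip migrating nr_pages now.
 */
static bool toy_update_ratelimit(struct toy_ratelimit *rl, unsigned long now,
				 unsigned long nr_pages)
{
	if (now > rl->window_end) {
		rl->nr_pages = 0;
		rl->window_end = now + TOY_WINDOW_LEN;
	}
	if (rl->nr_pages > TOY_RATELIMIT_PAGES)
		return true;		/* over budget: rate limited */

	rl->nr_pages += nr_pages;
	return false;
}

int main(void)
{
	struct toy_ratelimit rl = { 0, 0 };
	unsigned long now = 1000;
	int refused = 0;

	/* Try to account 100 bursts of 512 pages inside one window. */
	for (int i = 0; i < 100; i++)
		refused += toy_update_ratelimit(&rl, now, 512);

	printf("refused %d of 100 bursts in one window\n", refused);

	/* A later burst, after the window expired, is accepted again. */
	now += 2 * TOY_WINDOW_LEN;
	printf("rate limited after window reset: %d\n",
	       (int)toy_update_ratelimit(&rl, now, 512));
	return 0;
}
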
1556 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | ||
1557 | { | ||
1558 | int ret = 0; | ||
1559 | |||
1560 | /* Avoid migrating to a node that is nearly full */ | ||
1561 | if (migrate_balanced_pgdat(pgdat, 1)) { | ||
1562 | int page_lru; | ||
1563 | |||
1564 | if (isolate_lru_page(page)) { | ||
1565 | put_page(page); | ||
1566 | return 0; | ||
1567 | } | ||
1568 | |||
1569 | /* Page is isolated */ | ||
1570 | ret = 1; | ||
1571 | page_lru = page_is_file_cache(page); | ||
1572 | if (!PageTransHuge(page)) | ||
1573 | inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); | ||
1574 | else | ||
1575 | mod_zone_page_state(page_zone(page), | ||
1576 | NR_ISOLATED_ANON + page_lru, | ||
1577 | HPAGE_PMD_NR); | ||
1578 | } | ||
1579 | |||
1580 | /* | ||
1581 | * Page is either isolated or there is not enough space on the target | ||
1582 | * node. If isolated, then it has taken a reference count and the | ||
1583 | * caller's reference can be safely dropped without the page | ||
1584 | * disappearing underneath us during migration. Otherwise the page is | ||
1585 | * not to be migrated but the caller's reference should still be | ||
1586 | * dropped so it does not leak. | ||
1587 | */ | ||
1588 | put_page(page); | ||
1589 | |||
1590 | return ret; | ||
1591 | } | ||
1592 | |||
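
The comment above defines a reference-count contract: the caller hands in its own reference and numamigrate_isolate_page() always consumes it, while a successfully isolated page keeps the reference taken by LRU isolation. Below is a small runnable model of that contract, with invented toy_* names in place of real page and LRU primitives.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Minimal stand-in for a page with a reference count. */
struct toy_page {
	int refcount;
	bool on_lru;	/* still on the LRU, i.e. isolation can succeed */
};

static void toy_get(struct toy_page *p) { p->refcount++; }
static void toy_put(struct toy_page *p) { p->refcount--; }

/*
 * Models the contract documented above: the caller's reference is always
 * dropped before returning.  On success the page keeps one reference that
 * belongs to the LRU isolation, so it cannot vanish during migration.
 */
static int toy_isolate_for_migration(struct toy_page *p)
{
	int isolated = 0;

	if (p->on_lru) {		/* isolate_lru_page() succeeding */
		p->on_lru = false;
		toy_get(p);		/* isolation takes its own reference */
		isolated = 1;
	}
	toy_put(p);			/* consume the caller's reference */
	return isolated;
}

int main(void)
{
	struct toy_page page = { .refcount = 1, .on_lru = true };

	toy_get(&page);			/* caller's reference, as in the fault path */
	if (toy_isolate_for_migration(&page))
		printf("isolated, refcount now %d (isolation ref held)\n",
		       page.refcount);
	else
		printf("not isolated, refcount now %d\n", page.refcount);

	assert(page.refcount == 2);	/* base ref + isolation ref */
	return 0;
}
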
1593 | /* | ||
1594 | * Attempt to migrate a misplaced page to the specified destination | ||
1595 | * node. Caller is expected to have an elevated reference count on | ||
1596 | * the page that will be dropped by this function before returning. | ||
1597 | */ | ||
1598 | int migrate_misplaced_page(struct page *page, int node) | ||
1599 | { | ||
1600 | pg_data_t *pgdat = NODE_DATA(node); | ||
1601 | int isolated = 0; | ||
1602 | int nr_remaining; | ||
1603 | LIST_HEAD(migratepages); | ||
1604 | |||
1605 | /* | ||
1606 | * Don't migrate pages that are mapped in multiple processes. | ||
1607 | * TODO: Handle false sharing detection instead of this hammer | ||
1608 | */ | ||
1609 | if (page_mapcount(page) != 1) { | ||
1610 | put_page(page); | ||
1611 | goto out; | ||
1612 | } | ||
1613 | |||
1614 | /* | ||
1615 | * Rate-limit the amount of data that is being migrated to a node. | ||
1616 | * Optimal placement is no good if the memory bus is saturated and | ||
1617 | * all the time is being spent migrating! | ||
1618 | */ | ||
1619 | if (numamigrate_update_ratelimit(pgdat, 1)) { | ||
1620 | put_page(page); | ||
1621 | goto out; | ||
1622 | } | ||
1623 | |||
1624 | isolated = numamigrate_isolate_page(pgdat, page); | ||
1625 | if (!isolated) | ||
1626 | goto out; | ||
1627 | |||
1628 | list_add(&page->lru, &migratepages); | ||
1629 | nr_remaining = migrate_pages(&migratepages, | ||
1630 | alloc_misplaced_dst_page, | ||
1631 | node, false, MIGRATE_ASYNC, | ||
1632 | MR_NUMA_MISPLACED); | ||
1633 | if (nr_remaining) { | ||
1634 | putback_lru_pages(&migratepages); | ||
1635 | isolated = 0; | ||
1636 | } else | ||
1637 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | ||
1638 | BUG_ON(!list_empty(&migratepages)); | ||
1639 | out: | ||
1640 | return isolated; | ||
1641 | } | ||
1642 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1643 | |||
1644 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
1645 | int migrate_misplaced_transhuge_page(struct mm_struct *mm, | ||
1646 | struct vm_area_struct *vma, | ||
1647 | pmd_t *pmd, pmd_t entry, | ||
1648 | unsigned long address, | ||
1649 | struct page *page, int node) | ||
1650 | { | ||
1651 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
1652 | pg_data_t *pgdat = NODE_DATA(node); | ||
1653 | int isolated = 0; | ||
1654 | struct page *new_page = NULL; | ||
1655 | struct mem_cgroup *memcg = NULL; | ||
1656 | int page_lru = page_is_file_cache(page); | ||
1657 | |||
1658 | /* | ||
1659 | * Don't migrate pages that are mapped in multiple processes. | ||
1660 | * TODO: Handle false sharing detection instead of this hammer | ||
1661 | */ | ||
1662 | if (page_mapcount(page) != 1) | ||
1663 | goto out_dropref; | ||
1664 | |||
1665 | /* | ||
1666 | * Rate-limit the amount of data that is being migrated to a node. | ||
1667 | * Optimal placement is no good if the memory bus is saturated and | ||
1668 | * all the time is being spent migrating! | ||
1669 | */ | ||
1670 | if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) | ||
1671 | goto out_dropref; | ||
1672 | |||
1673 | new_page = alloc_pages_node(node, | ||
1674 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); | ||
1675 | if (!new_page) { | ||
1676 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1677 | goto out_dropref; | ||
1678 | } | ||
1679 | page_xchg_last_nid(new_page, page_last_nid(page)); | ||
1680 | |||
1681 | isolated = numamigrate_isolate_page(pgdat, page); | ||
1682 | if (!isolated) { | ||
1683 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1684 | put_page(new_page); | ||
1685 | goto out_keep_locked; | ||
1686 | } | ||
1687 | |||
1688 | /* Prepare a page as a migration target */ | ||
1689 | __set_page_locked(new_page); | ||
1690 | SetPageSwapBacked(new_page); | ||
1691 | |||
1692 | /* anon mapping, we can simply copy page->mapping to the new page: */ | ||
1693 | new_page->mapping = page->mapping; | ||
1694 | new_page->index = page->index; | ||
1695 | migrate_page_copy(new_page, page); | ||
1696 | WARN_ON(PageLRU(new_page)); | ||
1697 | |||
1698 | /* Recheck the target PMD */ | ||
1699 | spin_lock(&mm->page_table_lock); | ||
1700 | if (unlikely(!pmd_same(*pmd, entry))) { | ||
1701 | spin_unlock(&mm->page_table_lock); | ||
1702 | |||
1703 | /* Reverse changes made by migrate_page_copy() */ | ||
1704 | if (TestClearPageActive(new_page)) | ||
1705 | SetPageActive(page); | ||
1706 | if (TestClearPageUnevictable(new_page)) | ||
1707 | SetPageUnevictable(page); | ||
1708 | mlock_migrate_page(page, new_page); | ||
1709 | |||
1710 | unlock_page(new_page); | ||
1711 | put_page(new_page); /* Free it */ | ||
1712 | |||
1713 | unlock_page(page); | ||
1714 | putback_lru_page(page); | ||
1715 | |||
1716 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1717 | goto out; | ||
1718 | } | ||
1719 | |||
1720 | /* | ||
1721 | * Traditional migration needs to prepare the memcg charge | ||
1722 | * transaction early to prevent the old page from being | ||
1723 | * uncharged when installing migration entries. Here we can | ||
1724 | * save the potential rollback and start the charge transfer | ||
1725 | * only when migration is already known to end successfully. | ||
1726 | */ | ||
1727 | mem_cgroup_prepare_migration(page, new_page, &memcg); | ||
1728 | |||
1729 | entry = mk_pmd(new_page, vma->vm_page_prot); | ||
1730 | entry = pmd_mknonnuma(entry); | ||
1731 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
1732 | entry = pmd_mkhuge(entry); | ||
1733 | |||
1734 | page_add_new_anon_rmap(new_page, vma, haddr); | ||
1735 | |||
1736 | set_pmd_at(mm, haddr, pmd, entry); | ||
1737 | update_mmu_cache_pmd(vma, address, entry); | ||
1738 | page_remove_rmap(page); | ||
1739 | /* | ||
1740 | * Finish the charge transaction under the page table lock to | ||
1741 | * prevent split_huge_page() from dividing up the charge | ||
1742 | * before it's fully transferred to the new page. | ||
1743 | */ | ||
1744 | mem_cgroup_end_migration(memcg, page, new_page, true); | ||
1745 | spin_unlock(&mm->page_table_lock); | ||
1746 | |||
1747 | unlock_page(new_page); | ||
1748 | unlock_page(page); | ||
1749 | put_page(page); /* Drop the rmap reference */ | ||
1750 | put_page(page); /* Drop the LRU isolation reference */ | ||
1751 | |||
1752 | count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); | ||
1753 | count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); | ||
1754 | |||
1755 | out: | ||
1756 | mod_zone_page_state(page_zone(page), | ||
1757 | NR_ISOLATED_ANON + page_lru, | ||
1758 | -HPAGE_PMD_NR); | ||
1759 | return isolated; | ||
1760 | |||
1761 | out_dropref: | ||
1762 | put_page(page); | ||
1763 | out_keep_locked: | ||
1764 | return 0; | ||
1765 | } | ||
1766 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1767 | |||
1768 | #endif /* CONFIG_NUMA */ | ||
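
migrate_misplaced_transhuge_page() above follows an optimistic pattern: do the expensive copy first, then retake the page table lock, re-check pmd_same(), and roll everything back if the PMD changed underneath. The standalone sketch below models that copy/re-check/rollback shape with a plain version counter standing in for the PMD comparison; the toy_* names are invented for illustration and are not kernel code.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * Stand-in for the entry the huge-page path re-checks: a payload plus a
 * version that changes whenever someone else modifies the entry (the
 * equivalent of the pmd no longer being pmd_same()).
 */
struct toy_entry {
	char data[16];
	unsigned long version;
};

/*
 * Optimistic replace: prepare the copy while "unlocked", then re-check
 * the version under the lock and only install the copy if nothing
 * changed in between; otherwise undo and report failure.
 */
static bool toy_optimistic_replace(struct toy_entry *entry, const char *newdata,
				   unsigned long seen_version)
{
	char copy[16] = { 0 };

	/* expensive work done before re-checking (cf. migrate_page_copy()) */
	snprintf(copy, sizeof(copy), "%s", newdata);

	/* "spin_lock(&mm->page_table_lock)" would go here */
	if (entry->version != seen_version)
		return false;		/* entry changed under us: roll back */

	memcpy(entry->data, copy, sizeof(entry->data));
	entry->version++;
	/* "spin_unlock(...)" would go here */
	return true;
}

int main(void)
{
	struct toy_entry e = { "old", 0 };
	unsigned long seen = e.version;

	/* Simulate a concurrent update between the copy and the re-check. */
	e.version++;
	printf("racy attempt installed: %d (data=%s)\n",
	       toy_optimistic_replace(&e, "new", seen), e.data);

	seen = e.version;
	printf("clean attempt installed: %d (data=%s)\n",
	       toy_optimistic_replace(&e, "new", seen), e.data);
	return 0;
}
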
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -736,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
736 | if (anon_vma) { | 736 | if (anon_vma) { |
737 | VM_BUG_ON(adjust_next && next->anon_vma && | 737 | VM_BUG_ON(adjust_next && next->anon_vma && |
738 | anon_vma != next->anon_vma); | 738 | anon_vma != next->anon_vma); |
739 | anon_vma_lock(anon_vma); | 739 | anon_vma_lock_write(anon_vma); |
740 | anon_vma_interval_tree_pre_update_vma(vma); | 740 | anon_vma_interval_tree_pre_update_vma(vma); |
741 | if (adjust_next) | 741 | if (adjust_next) |
742 | anon_vma_interval_tree_pre_update_vma(next); | 742 | anon_vma_interval_tree_pre_update_vma(next); |
@@ -2886,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | |||
2886 | * The LSB of head.next can't change from under us | 2886 | * The LSB of head.next can't change from under us |
2887 | * because we hold the mm_all_locks_mutex. | 2887 | * because we hold the mm_all_locks_mutex. |
2888 | */ | 2888 | */ |
2889 | mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); | 2889 | down_write(&anon_vma->root->rwsem); |
2890 | /* | 2890 | /* |
2891 | * We can safely modify head.next after taking the | 2891 | * We can safely modify head.next after taking the |
2892 | * anon_vma->root->mutex. If some other vma in this mm shares | 2892 | * anon_vma->root->rwsem. If some other vma in this mm shares |
2893 | * the same anon_vma we won't take it again. | 2893 | * the same anon_vma we won't take it again. |
2894 | * | 2894 | * |
2895 | * No need of atomic instructions here, head.next | 2895 | * No need of atomic instructions here, head.next |
2896 | * can't change from under us thanks to the | 2896 | * can't change from under us thanks to the |
2897 | * anon_vma->root->mutex. | 2897 | * anon_vma->root->rwsem. |
2898 | */ | 2898 | */ |
2899 | if (__test_and_set_bit(0, (unsigned long *) | 2899 | if (__test_and_set_bit(0, (unsigned long *) |
2900 | &anon_vma->root->rb_root.rb_node)) | 2900 | &anon_vma->root->rb_root.rb_node)) |
@@ -2996,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
2996 | * | 2996 | * |
2997 | * No need of atomic instructions here, head.next | 2997 | * No need of atomic instructions here, head.next |
2998 | * can't change from under us until we release the | 2998 | * can't change from under us until we release the |
2999 | * anon_vma->root->mutex. | 2999 | * anon_vma->root->rwsem. |
3000 | */ | 3000 | */ |
3001 | if (!__test_and_clear_bit(0, (unsigned long *) | 3001 | if (!__test_and_clear_bit(0, (unsigned long *) |
3002 | &anon_vma->root->rb_root.rb_node)) | 3002 | &anon_vma->root->rb_root.rb_node)) |
diff --git a/mm/mprotect.c b/mm/mprotect.c index e8c3938db6fa..3dca970367db 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
35 | } | 35 | } |
36 | #endif | 36 | #endif |
37 | 37 | ||
38 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | 38 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
39 | unsigned long addr, unsigned long end, pgprot_t newprot, | 39 | unsigned long addr, unsigned long end, pgprot_t newprot, |
40 | int dirty_accountable) | 40 | int dirty_accountable, int prot_numa, bool *ret_all_same_node) |
41 | { | 41 | { |
42 | struct mm_struct *mm = vma->vm_mm; | ||
42 | pte_t *pte, oldpte; | 43 | pte_t *pte, oldpte; |
43 | spinlock_t *ptl; | 44 | spinlock_t *ptl; |
45 | unsigned long pages = 0; | ||
46 | bool all_same_node = true; | ||
47 | int last_nid = -1; | ||
44 | 48 | ||
45 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 49 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
46 | arch_enter_lazy_mmu_mode(); | 50 | arch_enter_lazy_mmu_mode(); |
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
48 | oldpte = *pte; | 52 | oldpte = *pte; |
49 | if (pte_present(oldpte)) { | 53 | if (pte_present(oldpte)) { |
50 | pte_t ptent; | 54 | pte_t ptent; |
55 | bool updated = false; | ||
51 | 56 | ||
52 | ptent = ptep_modify_prot_start(mm, addr, pte); | 57 | ptent = ptep_modify_prot_start(mm, addr, pte); |
53 | ptent = pte_modify(ptent, newprot); | 58 | if (!prot_numa) { |
59 | ptent = pte_modify(ptent, newprot); | ||
60 | updated = true; | ||
61 | } else { | ||
62 | struct page *page; | ||
63 | |||
64 | page = vm_normal_page(vma, addr, oldpte); | ||
65 | if (page) { | ||
66 | int this_nid = page_to_nid(page); | ||
67 | if (last_nid == -1) | ||
68 | last_nid = this_nid; | ||
69 | if (last_nid != this_nid) | ||
70 | all_same_node = false; | ||
71 | |||
72 | /* only check non-shared pages */ | ||
73 | if (!pte_numa(oldpte) && | ||
74 | page_mapcount(page) == 1) { | ||
75 | ptent = pte_mknuma(ptent); | ||
76 | updated = true; | ||
77 | } | ||
78 | } | ||
79 | } | ||
54 | 80 | ||
55 | /* | 81 | /* |
56 | * Avoid taking write faults for pages we know to be | 82 | * Avoid taking write faults for pages we know to be |
57 | * dirty. | 83 | * dirty. |
58 | */ | 84 | */ |
59 | if (dirty_accountable && pte_dirty(ptent)) | 85 | if (dirty_accountable && pte_dirty(ptent)) { |
60 | ptent = pte_mkwrite(ptent); | 86 | ptent = pte_mkwrite(ptent); |
87 | updated = true; | ||
88 | } | ||
61 | 89 | ||
90 | if (updated) | ||
91 | pages++; | ||
62 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 92 | ptep_modify_prot_commit(mm, addr, pte, ptent); |
63 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { | 93 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { |
64 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 94 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
@@ -72,18 +102,40 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
72 | set_pte_at(mm, addr, pte, | 102 | set_pte_at(mm, addr, pte, |
73 | swp_entry_to_pte(entry)); | 103 | swp_entry_to_pte(entry)); |
74 | } | 104 | } |
105 | pages++; | ||
75 | } | 106 | } |
76 | } while (pte++, addr += PAGE_SIZE, addr != end); | 107 | } while (pte++, addr += PAGE_SIZE, addr != end); |
77 | arch_leave_lazy_mmu_mode(); | 108 | arch_leave_lazy_mmu_mode(); |
78 | pte_unmap_unlock(pte - 1, ptl); | 109 | pte_unmap_unlock(pte - 1, ptl); |
110 | |||
111 | *ret_all_same_node = all_same_node; | ||
112 | return pages; | ||
79 | } | 113 | } |
80 | 114 | ||
81 | static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 115 | #ifdef CONFIG_NUMA_BALANCING |
116 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
117 | pmd_t *pmd) | ||
118 | { | ||
119 | spin_lock(&mm->page_table_lock); | ||
120 | set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); | ||
121 | spin_unlock(&mm->page_table_lock); | ||
122 | } | ||
123 | #else | ||
124 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
125 | pmd_t *pmd) | ||
126 | { | ||
127 | BUG(); | ||
128 | } | ||
129 | #endif /* CONFIG_NUMA_BALANCING */ | ||
130 | |||
131 | static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | ||
82 | unsigned long addr, unsigned long end, pgprot_t newprot, | 132 | unsigned long addr, unsigned long end, pgprot_t newprot, |
83 | int dirty_accountable) | 133 | int dirty_accountable, int prot_numa) |
84 | { | 134 | { |
85 | pmd_t *pmd; | 135 | pmd_t *pmd; |
86 | unsigned long next; | 136 | unsigned long next; |
137 | unsigned long pages = 0; | ||
138 | bool all_same_node; | ||
87 | 139 | ||
88 | pmd = pmd_offset(pud, addr); | 140 | pmd = pmd_offset(pud, addr); |
89 | do { | 141 | do { |
@@ -91,42 +143,59 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
91 | if (pmd_trans_huge(*pmd)) { | 143 | if (pmd_trans_huge(*pmd)) { |
92 | if (next - addr != HPAGE_PMD_SIZE) | 144 | if (next - addr != HPAGE_PMD_SIZE) |
93 | split_huge_page_pmd(vma, addr, pmd); | 145 | split_huge_page_pmd(vma, addr, pmd); |
94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) | 146 | else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { |
147 | pages += HPAGE_PMD_NR; | ||
95 | continue; | 148 | continue; |
149 | } | ||
96 | /* fall through */ | 150 | /* fall through */ |
97 | } | 151 | } |
98 | if (pmd_none_or_clear_bad(pmd)) | 152 | if (pmd_none_or_clear_bad(pmd)) |
99 | continue; | 153 | continue; |
100 | change_pte_range(vma->vm_mm, pmd, addr, next, newprot, | 154 | pages += change_pte_range(vma, pmd, addr, next, newprot, |
101 | dirty_accountable); | 155 | dirty_accountable, prot_numa, &all_same_node); |
156 | |||
157 | /* | ||
158 | * If we are changing protections for NUMA hinting faults then | ||
159 | * set pmd_numa if the examined pages were all on the same | ||
160 | * node. This allows a regular PMD to be handled as one fault | ||
161 | * and effectively batches the taking of the PTL | ||
162 | */ | ||
163 | if (prot_numa && all_same_node) | ||
164 | change_pmd_protnuma(vma->vm_mm, addr, pmd); | ||
102 | } while (pmd++, addr = next, addr != end); | 165 | } while (pmd++, addr = next, addr != end); |
166 | |||
167 | return pages; | ||
103 | } | 168 | } |
104 | 169 | ||
105 | static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 170 | static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
106 | unsigned long addr, unsigned long end, pgprot_t newprot, | 171 | unsigned long addr, unsigned long end, pgprot_t newprot, |
107 | int dirty_accountable) | 172 | int dirty_accountable, int prot_numa) |
108 | { | 173 | { |
109 | pud_t *pud; | 174 | pud_t *pud; |
110 | unsigned long next; | 175 | unsigned long next; |
176 | unsigned long pages = 0; | ||
111 | 177 | ||
112 | pud = pud_offset(pgd, addr); | 178 | pud = pud_offset(pgd, addr); |
113 | do { | 179 | do { |
114 | next = pud_addr_end(addr, end); | 180 | next = pud_addr_end(addr, end); |
115 | if (pud_none_or_clear_bad(pud)) | 181 | if (pud_none_or_clear_bad(pud)) |
116 | continue; | 182 | continue; |
117 | change_pmd_range(vma, pud, addr, next, newprot, | 183 | pages += change_pmd_range(vma, pud, addr, next, newprot, |
118 | dirty_accountable); | 184 | dirty_accountable, prot_numa); |
119 | } while (pud++, addr = next, addr != end); | 185 | } while (pud++, addr = next, addr != end); |
186 | |||
187 | return pages; | ||
120 | } | 188 | } |
121 | 189 | ||
122 | static void change_protection(struct vm_area_struct *vma, | 190 | static unsigned long change_protection_range(struct vm_area_struct *vma, |
123 | unsigned long addr, unsigned long end, pgprot_t newprot, | 191 | unsigned long addr, unsigned long end, pgprot_t newprot, |
124 | int dirty_accountable) | 192 | int dirty_accountable, int prot_numa) |
125 | { | 193 | { |
126 | struct mm_struct *mm = vma->vm_mm; | 194 | struct mm_struct *mm = vma->vm_mm; |
127 | pgd_t *pgd; | 195 | pgd_t *pgd; |
128 | unsigned long next; | 196 | unsigned long next; |
129 | unsigned long start = addr; | 197 | unsigned long start = addr; |
198 | unsigned long pages = 0; | ||
130 | 199 | ||
131 | BUG_ON(addr >= end); | 200 | BUG_ON(addr >= end); |
132 | pgd = pgd_offset(mm, addr); | 201 | pgd = pgd_offset(mm, addr); |
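
For the prot_numa pass above, two decisions matter: only private (mapcount == 1), not-yet-marked ptes are given the NUMA-hinting protection, and the whole PMD is marked only when every examined page sat on one node, so a later hinting fault can be taken once per PMD instead of once per page. A self-contained model of that scan follows, with invented toy_* types standing in for real ptes and pages.

#include <stdbool.h>
#include <stdio.h>

/* One present pte in the range, reduced to the fields the scan looks at. */
struct toy_pte {
	int nid;		/* node the backing page currently lives on */
	int mapcount;		/* 1 => private to this process */
	bool numa;		/* already marked for a hinting fault */
};

/*
 * Toy version of the prot_numa pass: mark private, not-yet-marked ptes,
 * and report whether every examined page was on the same node so the
 * caller can mark the whole PMD and take only one fault per PMD.
 */
static unsigned long toy_prot_numa_scan(struct toy_pte *ptes, int nr,
					bool *all_same_node)
{
	unsigned long updated = 0;
	int last_nid = -1;

	*all_same_node = true;
	for (int i = 0; i < nr; i++) {
		if (last_nid == -1)
			last_nid = ptes[i].nid;
		if (ptes[i].nid != last_nid)
			*all_same_node = false;

		/* only mark non-shared pages, and only once */
		if (!ptes[i].numa && ptes[i].mapcount == 1) {
			ptes[i].numa = true;
			updated++;
		}
	}
	return updated;
}

int main(void)
{
	struct toy_pte same_node[] = {
		{ .nid = 1, .mapcount = 1 }, { .nid = 1, .mapcount = 1 },
		{ .nid = 1, .mapcount = 2 },	/* shared: skipped, but same node */
	};
	struct toy_pte mixed[] = {
		{ .nid = 0, .mapcount = 1 }, { .nid = 1, .mapcount = 1 },
	};
	bool all_same;
	unsigned long n;

	n = toy_prot_numa_scan(same_node, 3, &all_same);
	printf("same-node range: %lu marked, batch at PMD level: %d\n", n, all_same);

	n = toy_prot_numa_scan(mixed, 2, &all_same);
	printf("mixed range:     %lu marked, batch at PMD level: %d\n", n, all_same);
	return 0;
}
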
@@ -135,10 +204,32 @@ static void change_protection(struct vm_area_struct *vma, | |||
135 | next = pgd_addr_end(addr, end); | 204 | next = pgd_addr_end(addr, end); |
136 | if (pgd_none_or_clear_bad(pgd)) | 205 | if (pgd_none_or_clear_bad(pgd)) |
137 | continue; | 206 | continue; |
138 | change_pud_range(vma, pgd, addr, next, newprot, | 207 | pages += change_pud_range(vma, pgd, addr, next, newprot, |
139 | dirty_accountable); | 208 | dirty_accountable, prot_numa); |
140 | } while (pgd++, addr = next, addr != end); | 209 | } while (pgd++, addr = next, addr != end); |
141 | flush_tlb_range(vma, start, end); | 210 | |
211 | /* Only flush the TLB if we actually modified any entries: */ | ||
212 | if (pages) | ||
213 | flush_tlb_range(vma, start, end); | ||
214 | |||
215 | return pages; | ||
216 | } | ||
217 | |||
218 | unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, | ||
219 | unsigned long end, pgprot_t newprot, | ||
220 | int dirty_accountable, int prot_numa) | ||
221 | { | ||
222 | struct mm_struct *mm = vma->vm_mm; | ||
223 | unsigned long pages; | ||
224 | |||
225 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
226 | if (is_vm_hugetlb_page(vma)) | ||
227 | pages = hugetlb_change_protection(vma, start, end, newprot); | ||
228 | else | ||
229 | pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); | ||
230 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
231 | |||
232 | return pages; | ||
142 | } | 233 | } |
143 | 234 | ||
144 | int | 235 | int |
@@ -213,12 +304,8 @@ success: | |||
213 | dirty_accountable = 1; | 304 | dirty_accountable = 1; |
214 | } | 305 | } |
215 | 306 | ||
216 | mmu_notifier_invalidate_range_start(mm, start, end); | 307 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); |
217 | if (is_vm_hugetlb_page(vma)) | 308 | |
218 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); | ||
219 | else | ||
220 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); | ||
221 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
222 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 309 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
223 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 310 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
224 | perf_event_mmap(vma); | 311 | perf_event_mmap(vma); |
diff --git a/mm/mremap.c b/mm/mremap.c index eabb24da6c9e..e1031e1f6a61 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
104 | } | 104 | } |
105 | if (vma->anon_vma) { | 105 | if (vma->anon_vma) { |
106 | anon_vma = vma->anon_vma; | 106 | anon_vma = vma->anon_vma; |
107 | anon_vma_lock(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | } | 108 | } |
109 | } | 109 | } |
110 | 110 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83637dfba110..d037c8bc1512 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -611,6 +611,7 @@ static inline int free_pages_check(struct page *page) | |||
611 | bad_page(page); | 611 | bad_page(page); |
612 | return 1; | 612 | return 1; |
613 | } | 613 | } |
614 | reset_page_last_nid(page); | ||
614 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 615 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
615 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 616 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
616 | return 0; | 617 | return 0; |
@@ -3883,6 +3884,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
3883 | mminit_verify_page_links(page, zone, nid, pfn); | 3884 | mminit_verify_page_links(page, zone, nid, pfn); |
3884 | init_page_count(page); | 3885 | init_page_count(page); |
3885 | reset_page_mapcount(page); | 3886 | reset_page_mapcount(page); |
3887 | reset_page_last_nid(page); | ||
3886 | SetPageReserved(page); | 3888 | SetPageReserved(page); |
3887 | /* | 3889 | /* |
3888 | * Mark the block movable so that blocks are reserved for | 3890 | * Mark the block movable so that blocks are reserved for |
@@ -4526,6 +4528,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4526 | int ret; | 4528 | int ret; |
4527 | 4529 | ||
4528 | pgdat_resize_init(pgdat); | 4530 | pgdat_resize_init(pgdat); |
4531 | #ifdef CONFIG_NUMA_BALANCING | ||
4532 | spin_lock_init(&pgdat->numabalancing_migrate_lock); | ||
4533 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
4534 | pgdat->numabalancing_migrate_next_window = jiffies; | ||
4535 | #endif | ||
4529 | init_waitqueue_head(&pgdat->kswapd_wait); | 4536 | init_waitqueue_head(&pgdat->kswapd_wait); |
4530 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 4537 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4531 | pgdat_page_cgroup_init(pgdat); | 4538 | pgdat_page_cgroup_init(pgdat); |
@@ -5800,7 +5807,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5800 | 5807 | ||
5801 | ret = migrate_pages(&cc->migratepages, | 5808 | ret = migrate_pages(&cc->migratepages, |
5802 | alloc_migrate_target, | 5809 | alloc_migrate_target, |
5803 | 0, false, MIGRATE_SYNC); | 5810 | 0, false, MIGRATE_SYNC, |
5811 | MR_CMA); | ||
5804 | } | 5812 | } |
5805 | 5813 | ||
5806 | putback_movable_pages(&cc->migratepages); | 5814 | putback_movable_pages(&cc->migratepages); |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e642627da6b7..0c8323fe6c8f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -12,8 +12,8 @@ | |||
12 | 12 | ||
13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | 13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS |
14 | /* | 14 | /* |
15 | * Only sets the access flags (dirty, accessed, and | 15 | * Only sets the access flags (dirty, accessed), as well as write |
16 | * writable). Furthermore, we know it always gets set to a "more | 16 | * permission. Furthermore, we know it always gets set to a "more |
17 | * permissive" setting, which allows most architectures to optimize | 17 | * permissive" setting, which allows most architectures to optimize |
18 | * this. We return whether the PTE actually changed, which in turn | 18 | * this. We return whether the PTE actually changed, which in turn |
19 | * instructs the caller to do things like update__mmu_cache. This | 19 | * instructs the caller to do things like update__mmu_cache. This |
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, | |||
27 | int changed = !pte_same(*ptep, entry); | 27 | int changed = !pte_same(*ptep, entry); |
28 | if (changed) { | 28 | if (changed) { |
29 | set_pte_at(vma->vm_mm, address, ptep, entry); | 29 | set_pte_at(vma->vm_mm, address, ptep, entry); |
30 | flush_tlb_page(vma, address); | 30 | flush_tlb_fix_spurious_fault(vma, address); |
31 | } | 31 | } |
32 | return changed; | 32 | return changed; |
33 | } | 33 | } |
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, | |||
88 | { | 88 | { |
89 | pte_t pte; | 89 | pte_t pte; |
90 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); | 90 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); |
91 | flush_tlb_page(vma, address); | 91 | if (pte_accessible(pte)) |
92 | flush_tlb_page(vma, address); | ||
92 | return pte; | 93 | return pte; |
93 | } | 94 | } |
94 | #endif | 95 | #endif |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -24,7 +24,7 @@ | |||
24 | * mm->mmap_sem | 24 | * mm->mmap_sem |
25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
26 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_mutex |
27 | * anon_vma->mutex | 27 | * anon_vma->rwsem |
28 | * mm->page_table_lock or pte_lock | 28 | * mm->page_table_lock or pte_lock |
29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
30 | * swap_lock (in swap_duplicate, swap_info_get) | 30 | * swap_lock (in swap_duplicate, swap_info_get) |
@@ -37,7 +37,7 @@ | |||
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within bdi.wb->list_lock in __sync_single_inode) | 38 | * within bdi.wb->list_lock in __sync_single_inode) |
39 | * | 39 | * |
40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) | 40 | * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) |
41 | * ->tasklist_lock | 41 | * ->tasklist_lock |
42 | * pte map lock | 42 | * pte map lock |
43 | */ | 43 | */ |
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); | 87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); |
88 | 88 | ||
89 | /* | 89 | /* |
90 | * Synchronize against page_lock_anon_vma() such that | 90 | * Synchronize against page_lock_anon_vma_read() such that |
91 | * we can safely hold the lock without the anon_vma getting | 91 | * we can safely hold the lock without the anon_vma getting |
92 | * freed. | 92 | * freed. |
93 | * | 93 | * |
94 | * Relies on the full mb implied by the atomic_dec_and_test() from | 94 | * Relies on the full mb implied by the atomic_dec_and_test() from |
95 | * put_anon_vma() against the acquire barrier implied by | 95 | * put_anon_vma() against the acquire barrier implied by |
96 | * mutex_trylock() from page_lock_anon_vma(). This orders: | 96 | * down_read_trylock() from page_lock_anon_vma_read(). This orders: |
97 | * | 97 | * |
98 | * page_lock_anon_vma() VS put_anon_vma() | 98 | * page_lock_anon_vma_read() VS put_anon_vma() |
99 | * mutex_trylock() atomic_dec_and_test() | 99 | * down_read_trylock() atomic_dec_and_test() |
100 | * LOCK MB | 100 | * LOCK MB |
101 | * atomic_read() mutex_is_locked() | 101 | * atomic_read() rwsem_is_locked() |
102 | * | 102 | * |
103 | * LOCK should suffice since the actual taking of the lock must | 103 | * LOCK should suffice since the actual taking of the lock must |
104 | * happen _before_ what follows. | 104 | * happen _before_ what follows. |
105 | */ | 105 | */ |
106 | if (mutex_is_locked(&anon_vma->root->mutex)) { | 106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { |
107 | anon_vma_lock(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | anon_vma_unlock(anon_vma); | 108 | anon_vma_unlock(anon_vma); |
109 | } | 109 | } |
110 | 110 | ||
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
146 | * allocate a new one. | 146 | * allocate a new one. |
147 | * | 147 | * |
148 | * Anon-vma allocations are very subtle, because we may have | 148 | * Anon-vma allocations are very subtle, because we may have |
149 | * optimistically looked up an anon_vma in page_lock_anon_vma() | 149 | * optimistically looked up an anon_vma in page_lock_anon_vma_read() |
150 | * and that may actually touch the spinlock even in the newly | 150 | * and that may actually touch the spinlock even in the newly |
151 | * allocated vma (it depends on RCU to make sure that the | 151 | * allocated vma (it depends on RCU to make sure that the |
152 | * anon_vma isn't actually destroyed). | 152 | * anon_vma isn't actually destroyed). |
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
181 | allocated = anon_vma; | 181 | allocated = anon_vma; |
182 | } | 182 | } |
183 | 183 | ||
184 | anon_vma_lock(anon_vma); | 184 | anon_vma_lock_write(anon_vma); |
185 | /* page_table_lock to protect against threads */ | 185 | /* page_table_lock to protect against threads */ |
186 | spin_lock(&mm->page_table_lock); | 186 | spin_lock(&mm->page_table_lock); |
187 | if (likely(!vma->anon_vma)) { | 187 | if (likely(!vma->anon_vma)) { |
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct | |||
219 | struct anon_vma *new_root = anon_vma->root; | 219 | struct anon_vma *new_root = anon_vma->root; |
220 | if (new_root != root) { | 220 | if (new_root != root) { |
221 | if (WARN_ON_ONCE(root)) | 221 | if (WARN_ON_ONCE(root)) |
222 | mutex_unlock(&root->mutex); | 222 | up_write(&root->rwsem); |
223 | root = new_root; | 223 | root = new_root; |
224 | mutex_lock(&root->mutex); | 224 | down_write(&root->rwsem); |
225 | } | 225 | } |
226 | return root; | 226 | return root; |
227 | } | 227 | } |
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct | |||
229 | static inline void unlock_anon_vma_root(struct anon_vma *root) | 229 | static inline void unlock_anon_vma_root(struct anon_vma *root) |
230 | { | 230 | { |
231 | if (root) | 231 | if (root) |
232 | mutex_unlock(&root->mutex); | 232 | up_write(&root->rwsem); |
233 | } | 233 | } |
234 | 234 | ||
235 | /* | 235 | /* |
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
306 | get_anon_vma(anon_vma->root); | 306 | get_anon_vma(anon_vma->root); |
307 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | 307 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ |
308 | vma->anon_vma = anon_vma; | 308 | vma->anon_vma = anon_vma; |
309 | anon_vma_lock(anon_vma); | 309 | anon_vma_lock_write(anon_vma); |
310 | anon_vma_chain_link(vma, avc, anon_vma); | 310 | anon_vma_chain_link(vma, avc, anon_vma); |
311 | anon_vma_unlock(anon_vma); | 311 | anon_vma_unlock(anon_vma); |
312 | 312 | ||
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
349 | /* | 349 | /* |
350 | * Iterate the list once more, it now only contains empty and unlinked | 350 | * Iterate the list once more, it now only contains empty and unlinked |
351 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() | 351 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() |
352 | * needing to acquire the anon_vma->root->mutex. | 352 | * needing to write-acquire the anon_vma->root->rwsem. |
353 | */ | 353 | */ |
354 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | 354 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
355 | struct anon_vma *anon_vma = avc->anon_vma; | 355 | struct anon_vma *anon_vma = avc->anon_vma; |
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data) | |||
365 | { | 365 | { |
366 | struct anon_vma *anon_vma = data; | 366 | struct anon_vma *anon_vma = data; |
367 | 367 | ||
368 | mutex_init(&anon_vma->mutex); | 368 | init_rwsem(&anon_vma->rwsem); |
369 | atomic_set(&anon_vma->refcount, 0); | 369 | atomic_set(&anon_vma->refcount, 0); |
370 | anon_vma->rb_root = RB_ROOT; | 370 | anon_vma->rb_root = RB_ROOT; |
371 | } | 371 | } |
@@ -442,7 +442,7 @@ out: | |||
442 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a | 442 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a |
443 | * reference like with page_get_anon_vma() and then block on the mutex. | 443 | * reference like with page_get_anon_vma() and then block on the mutex. |
444 | */ | 444 | */ |
445 | struct anon_vma *page_lock_anon_vma(struct page *page) | 445 | struct anon_vma *page_lock_anon_vma_read(struct page *page) |
446 | { | 446 | { |
447 | struct anon_vma *anon_vma = NULL; | 447 | struct anon_vma *anon_vma = NULL; |
448 | struct anon_vma *root_anon_vma; | 448 | struct anon_vma *root_anon_vma; |
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
457 | 457 | ||
458 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 458 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
459 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | 459 | root_anon_vma = ACCESS_ONCE(anon_vma->root); |
460 | if (mutex_trylock(&root_anon_vma->mutex)) { | 460 | if (down_read_trylock(&root_anon_vma->rwsem)) { |
461 | /* | 461 | /* |
462 | * If the page is still mapped, then this anon_vma is still | 462 | * If the page is still mapped, then this anon_vma is still |
463 | * its anon_vma, and holding the mutex ensures that it will | 463 | * its anon_vma, and holding the mutex ensures that it will |
464 | * not go away, see anon_vma_free(). | 464 | * not go away, see anon_vma_free(). |
465 | */ | 465 | */ |
466 | if (!page_mapped(page)) { | 466 | if (!page_mapped(page)) { |
467 | mutex_unlock(&root_anon_vma->mutex); | 467 | up_read(&root_anon_vma->rwsem); |
468 | anon_vma = NULL; | 468 | anon_vma = NULL; |
469 | } | 469 | } |
470 | goto out; | 470 | goto out; |
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
484 | 484 | ||
485 | /* we pinned the anon_vma, its safe to sleep */ | 485 | /* we pinned the anon_vma, its safe to sleep */ |
486 | rcu_read_unlock(); | 486 | rcu_read_unlock(); |
487 | anon_vma_lock(anon_vma); | 487 | anon_vma_lock_read(anon_vma); |
488 | 488 | ||
489 | if (atomic_dec_and_test(&anon_vma->refcount)) { | 489 | if (atomic_dec_and_test(&anon_vma->refcount)) { |
490 | /* | 490 | /* |
491 | * Oops, we held the last refcount, release the lock | 491 | * Oops, we held the last refcount, release the lock |
492 | * and bail -- can't simply use put_anon_vma() because | 492 | * and bail -- can't simply use put_anon_vma() because |
493 | * we'll deadlock on the anon_vma_lock() recursion. | 493 | * we'll deadlock on the anon_vma_lock_write() recursion. |
494 | */ | 494 | */ |
495 | anon_vma_unlock(anon_vma); | 495 | anon_vma_unlock_read(anon_vma); |
496 | __put_anon_vma(anon_vma); | 496 | __put_anon_vma(anon_vma); |
497 | anon_vma = NULL; | 497 | anon_vma = NULL; |
498 | } | 498 | } |
@@ -504,9 +504,9 @@ out: | |||
504 | return anon_vma; | 504 | return anon_vma; |
505 | } | 505 | } |
506 | 506 | ||
507 | void page_unlock_anon_vma(struct anon_vma *anon_vma) | 507 | void page_unlock_anon_vma_read(struct anon_vma *anon_vma) |
508 | { | 508 | { |
509 | anon_vma_unlock(anon_vma); | 509 | anon_vma_unlock_read(anon_vma); |
510 | } | 510 | } |
511 | 511 | ||
512 | /* | 512 | /* |
@@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page, | |||
744 | struct anon_vma_chain *avc; | 744 | struct anon_vma_chain *avc; |
745 | int referenced = 0; | 745 | int referenced = 0; |
746 | 746 | ||
747 | anon_vma = page_lock_anon_vma(page); | 747 | anon_vma = page_lock_anon_vma_read(page); |
748 | if (!anon_vma) | 748 | if (!anon_vma) |
749 | return referenced; | 749 | return referenced; |
750 | 750 | ||
@@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page, | |||
766 | break; | 766 | break; |
767 | } | 767 | } |
768 | 768 | ||
769 | page_unlock_anon_vma(anon_vma); | 769 | page_unlock_anon_vma_read(anon_vma); |
770 | return referenced; | 770 | return referenced; |
771 | } | 771 | } |
772 | 772 | ||
@@ -1315,7 +1315,7 @@ out_mlock: | |||
1315 | /* | 1315 | /* |
1316 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes | 1316 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes |
1317 | * unstable result and race. Plus, We can't wait here because | 1317 | * unstable result and race. Plus, We can't wait here because |
1318 | * we now hold anon_vma->mutex or mapping->i_mmap_mutex. | 1318 | * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. |
1319 | * if trylock failed, the page remain in evictable lru and later | 1319 | * if trylock failed, the page remain in evictable lru and later |
1320 | * vmscan could retry to move the page to unevictable lru if the | 1320 | * vmscan could retry to move the page to unevictable lru if the |
1321 | * page is actually mlocked. | 1321 | * page is actually mlocked. |
@@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1480 | struct anon_vma_chain *avc; | 1480 | struct anon_vma_chain *avc; |
1481 | int ret = SWAP_AGAIN; | 1481 | int ret = SWAP_AGAIN; |
1482 | 1482 | ||
1483 | anon_vma = page_lock_anon_vma(page); | 1483 | anon_vma = page_lock_anon_vma_read(page); |
1484 | if (!anon_vma) | 1484 | if (!anon_vma) |
1485 | return ret; | 1485 | return ret; |
1486 | 1486 | ||
@@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1507 | break; | 1507 | break; |
1508 | } | 1508 | } |
1509 | 1509 | ||
1510 | page_unlock_anon_vma(anon_vma); | 1510 | page_unlock_anon_vma_read(anon_vma); |
1511 | return ret; | 1511 | return ret; |
1512 | } | 1512 | } |
1513 | 1513 | ||
@@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1702 | int ret = SWAP_AGAIN; | 1702 | int ret = SWAP_AGAIN; |
1703 | 1703 | ||
1704 | /* | 1704 | /* |
1705 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | 1705 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() |
1706 | * because that depends on page_mapped(); but not all its usages | 1706 | * because that depends on page_mapped(); but not all its usages |
1707 | * are holding mmap_sem. Users without mmap_sem are required to | 1707 | * are holding mmap_sem. Users without mmap_sem are required to |
1708 | * take a reference count to prevent the anon_vma disappearing | 1708 | * take a reference count to prevent the anon_vma disappearing |
@@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1710 | anon_vma = page_anon_vma(page); | 1710 | anon_vma = page_anon_vma(page); |
1711 | if (!anon_vma) | 1711 | if (!anon_vma) |
1712 | return ret; | 1712 | return ret; |
1713 | anon_vma_lock(anon_vma); | 1713 | anon_vma_lock_read(anon_vma); |
1714 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1714 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1715 | struct vm_area_struct *vma = avc->vma; | 1715 | struct vm_area_struct *vma = avc->vma; |
1716 | unsigned long address = vma_address(page, vma); | 1716 | unsigned long address = vma_address(page, vma); |
@@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1718 | if (ret != SWAP_AGAIN) | 1718 | if (ret != SWAP_AGAIN) |
1719 | break; | 1719 | break; |
1720 | } | 1720 | } |
1721 | anon_vma_unlock(anon_vma); | 1721 | anon_vma_unlock_read(anon_vma); |
1722 | return ret; | 1722 | return ret; |
1723 | } | 1723 | } |
1724 | 1724 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index df14808f0a36..9800306c8195 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = { | |||
774 | 774 | ||
775 | "pgrotated", | 775 | "pgrotated", |
776 | 776 | ||
777 | #ifdef CONFIG_NUMA_BALANCING | ||
778 | "numa_pte_updates", | ||
779 | "numa_hint_faults", | ||
780 | "numa_hint_faults_local", | ||
781 | "numa_pages_migrated", | ||
782 | #endif | ||
783 | #ifdef CONFIG_MIGRATION | ||
784 | "pgmigrate_success", | ||
785 | "pgmigrate_fail", | ||
786 | #endif | ||
777 | #ifdef CONFIG_COMPACTION | 787 | #ifdef CONFIG_COMPACTION |
778 | "compact_blocks_moved", | 788 | "compact_migrate_scanned", |
779 | "compact_pages_moved", | 789 | "compact_free_scanned", |
780 | "compact_pagemigrate_failed", | 790 | "compact_isolated", |
781 | "compact_stall", | 791 | "compact_stall", |
782 | "compact_fail", | 792 | "compact_fail", |
783 | "compact_success", | 793 | "compact_success", |