45 files changed, 1938 insertions, 172 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 20e248cc03a9..ea8e5b485576 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2032,6 +2032,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
2032 | 2032 | ||
2033 | nr_uarts= [SERIAL] maximum number of UARTs to be registered. | 2033 | nr_uarts= [SERIAL] maximum number of UARTs to be registered. |
2034 | 2034 | ||
2035 | numa_balancing= [KNL,X86] Enable or disable automatic NUMA balancing. | ||
2036 | Allowed values are enable and disable | ||
2037 | |||
2035 | numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. | 2038 | numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. |
2036 | one of ['zone', 'node', 'default'] can be specified | 2039 | one of ['zone', 'node', 'default'] can be specified |
2037 | This can be set from sysctl after boot. | 2040 | This can be set from sysctl after boot. |
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index cb8f9920f4dd..0f7c852f355c 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -111,6 +111,7 @@ config VSYSCALL | |||
111 | config NUMA | 111 | config NUMA |
112 | bool "Non Uniform Memory Access (NUMA) Support" | 112 | bool "Non Uniform Memory Access (NUMA) Support" |
113 | depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL | 113 | depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL |
114 | select ARCH_WANT_NUMA_VARIABLE_LOCALITY | ||
114 | default n | 115 | default n |
115 | help | 116 | help |
116 | Some SH systems have many various memories scattered around | 117 | Some SH systems have many various memories scattered around |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 65a872bf72f9..97f8c5ad8c2d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,8 @@ config X86 | |||
22 | def_bool y | 22 | def_bool y |
23 | select HAVE_AOUT if X86_32 | 23 | select HAVE_AOUT if X86_32 |
24 | select HAVE_UNSTABLE_SCHED_CLOCK | 24 | select HAVE_UNSTABLE_SCHED_CLOCK |
25 | select ARCH_SUPPORTS_NUMA_BALANCING | ||
26 | select ARCH_WANTS_PROT_NUMA_PROT_NONE | ||
25 | select HAVE_IDE | 27 | select HAVE_IDE |
26 | select HAVE_OPROFILE | 28 | select HAVE_OPROFILE |
27 | select HAVE_PCSPKR_PLATFORM | 29 | select HAVE_PCSPKR_PLATFORM |
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a1f780d45f76..5199db2923d3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -404,7 +404,14 @@ static inline int pte_same(pte_t a, pte_t b) | |||
404 | 404 | ||
405 | static inline int pte_present(pte_t a) | 405 | static inline int pte_present(pte_t a) |
406 | { | 406 | { |
407 | return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); | 407 | return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | |
408 | _PAGE_NUMA); | ||
409 | } | ||
410 | |||
411 | #define pte_accessible pte_accessible | ||
412 | static inline int pte_accessible(pte_t a) | ||
413 | { | ||
414 | return pte_flags(a) & _PAGE_PRESENT; | ||
408 | } | 415 | } |
409 | 416 | ||
410 | static inline int pte_hidden(pte_t pte) | 417 | static inline int pte_hidden(pte_t pte) |
@@ -420,7 +427,8 @@ static inline int pmd_present(pmd_t pmd) | |||
420 | * the _PAGE_PSE flag will remain set at all times while the | 427 | * the _PAGE_PSE flag will remain set at all times while the |
421 | * _PAGE_PRESENT bit is clear). | 428 | * _PAGE_PRESENT bit is clear). |
422 | */ | 429 | */ |
423 | return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); | 430 | return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | |
431 | _PAGE_NUMA); | ||
424 | } | 432 | } |
425 | 433 | ||
426 | static inline int pmd_none(pmd_t pmd) | 434 | static inline int pmd_none(pmd_t pmd) |
@@ -479,6 +487,11 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) | |||
479 | 487 | ||
480 | static inline int pmd_bad(pmd_t pmd) | 488 | static inline int pmd_bad(pmd_t pmd) |
481 | { | 489 | { |
490 | #ifdef CONFIG_NUMA_BALANCING | ||
491 | /* pmd_numa check */ | ||
492 | if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) | ||
493 | return 0; | ||
494 | #endif | ||
482 | return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; | 495 | return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; |
483 | } | 496 | } |
484 | 497 | ||
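The pte_present()/pte_numa() pairing above is easier to follow with the flag bits written out. Below is a minimal userspace sketch (plain C, not kernel code) using bit values that mirror the x86 definitions referenced in this series (_PAGE_PRESENT in bit 0, _PAGE_PROTNONE/_PAGE_NUMA sharing bit 8); the helpers work on a bare flags word rather than a real pte_t.

/* Minimal sketch: one flags word encodes "present", "NUMA hinting" and
 * "none/swap". Bit positions mirror the x86 values quoted in the series. */
#include <stdint.h>
#include <stdio.h>

#define _PAGE_PRESENT  (1ULL << 0)
#define _PAGE_PROTNONE (1ULL << 8)
#define _PAGE_NUMA     _PAGE_PROTNONE   /* same bit, different meaning */

static int pte_present(uint64_t flags)
{
        /* NUMA-hinting entries still count as present for generic mm code */
        return !!(flags & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_NUMA));
}

static int pte_numa(uint64_t flags)
{
        /* the shared bit only means "NUMA hinting" while _PAGE_PRESENT is clear */
        return (flags & (_PAGE_NUMA | _PAGE_PRESENT)) == _PAGE_NUMA;
}

int main(void)
{
        uint64_t normal = _PAGE_PRESENT;
        uint64_t numa   = _PAGE_NUMA;           /* hardware present bit cleared */
        uint64_t none   = 0;

        printf("normal: present=%d numa=%d\n", pte_present(normal), pte_numa(normal));
        printf("numa:   present=%d numa=%d\n", pte_present(numa),   pte_numa(numa));
        printf("none:   present=%d numa=%d\n", pte_present(none),   pte_numa(none));
        return 0;
}

A NUMA-hinting entry is therefore still reported as present, so the rest of mm/ keeps treating the page as mapped, while the hardware faults on the next access because the real present bit is clear.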
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index ec8a1fc9505d..3c32db8c539d 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -64,6 +64,26 @@ | |||
64 | #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) | 64 | #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) |
65 | #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) | 65 | #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
66 | 66 | ||
67 | /* | ||
68 | * _PAGE_NUMA indicates that this page will trigger a numa hinting | ||
69 | * minor page fault to gather numa placement statistics (see | ||
70 | * pte_numa()). The bit picked (8) is within the range between | ||
71 | * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't | ||
72 | * require changes to the swp entry format because that bit is always | ||
73 | * zero when the pte is not present. | ||
74 | * | ||
75 | * The bit picked must be always zero when the pmd is present and not | ||
76 | * present, so that we don't lose information when we set it while | ||
77 | * atomically clearing the present bit. | ||
78 | * | ||
79 | * Because we shared the same bit (8) with _PAGE_PROTNONE this can be | ||
80 | * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE | ||
81 | * couldn't reach, like handle_mm_fault() (see access_error in | ||
82 | * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for | ||
83 | * handle_mm_fault() to be invoked). | ||
84 | */ | ||
85 | #define _PAGE_NUMA _PAGE_PROTNONE | ||
86 | |||
67 | #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ | 87 | #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ |
68 | _PAGE_ACCESSED | _PAGE_DIRTY) | 88 | _PAGE_ACCESSED | _PAGE_DIRTY) |
69 | #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ | 89 | #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ |
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 217eb705fac0..e27fbf887f3b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -301,6 +301,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) | |||
301 | free_page((unsigned long)pgd); | 301 | free_page((unsigned long)pgd); |
302 | } | 302 | } |
303 | 303 | ||
304 | /* | ||
305 | * Used to set accessed or dirty bits in the page table entries | ||
306 | * on other architectures. On x86, the accessed and dirty bits | ||
307 | * are tracked by hardware. However, do_wp_page calls this function | ||
308 | * to also make the pte writeable at the same time the dirty bit is | ||
309 | * set. In that case we do actually need to write the PTE. | ||
310 | */ | ||
304 | int ptep_set_access_flags(struct vm_area_struct *vma, | 311 | int ptep_set_access_flags(struct vm_area_struct *vma, |
305 | unsigned long address, pte_t *ptep, | 312 | unsigned long address, pte_t *ptep, |
306 | pte_t entry, int dirty) | 313 | pte_t entry, int dirty) |
@@ -310,7 +317,6 @@ int ptep_set_access_flags(struct vm_area_struct *vma, | |||
310 | if (changed && dirty) { | 317 | if (changed && dirty) { |
311 | *ptep = entry; | 318 | *ptep = entry; |
312 | pte_update_defer(vma->vm_mm, address, ptep); | 319 | pte_update_defer(vma->vm_mm, address, ptep); |
313 | flush_tlb_page(vma, address); | ||
314 | } | 320 | } |
315 | 321 | ||
316 | return changed; | 322 | return changed; |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 284e80831d2c..701beab27aab 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -219,6 +219,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) | |||
219 | #define move_pte(pte, prot, old_addr, new_addr) (pte) | 219 | #define move_pte(pte, prot, old_addr, new_addr) (pte) |
220 | #endif | 220 | #endif |
221 | 221 | ||
222 | #ifndef pte_accessible | ||
223 | # define pte_accessible(pte) ((void)(pte),1) | ||
224 | #endif | ||
225 | |||
222 | #ifndef flush_tlb_fix_spurious_fault | 226 | #ifndef flush_tlb_fix_spurious_fault |
223 | #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) | 227 | #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) |
224 | #endif | 228 | #endif |
@@ -580,6 +584,112 @@ static inline int pmd_trans_unstable(pmd_t *pmd) | |||
580 | #endif | 584 | #endif |
581 | } | 585 | } |
582 | 586 | ||
587 | #ifdef CONFIG_NUMA_BALANCING | ||
588 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
589 | /* | ||
590 | * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the | ||
591 | * same bit too). It's set only when _PAGE_PRESENT is not set and it's | ||
592 | * never set if _PAGE_PRESENT is set. | ||
593 | * | ||
594 | * pte/pmd_present() returns true if pte/pmd_numa returns true. Page | ||
595 | * fault triggers on those regions if pte/pmd_numa returns true | ||
596 | * (because _PAGE_PRESENT is not set). | ||
597 | */ | ||
598 | #ifndef pte_numa | ||
599 | static inline int pte_numa(pte_t pte) | ||
600 | { | ||
601 | return (pte_flags(pte) & | ||
602 | (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; | ||
603 | } | ||
604 | #endif | ||
605 | |||
606 | #ifndef pmd_numa | ||
607 | static inline int pmd_numa(pmd_t pmd) | ||
608 | { | ||
609 | return (pmd_flags(pmd) & | ||
610 | (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; | ||
611 | } | ||
612 | #endif | ||
613 | |||
614 | /* | ||
615 | * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically | ||
616 | * because they're called by the NUMA hinting minor page fault. If we | ||
617 | * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler | ||
618 | * would be forced to set it later while filling the TLB after we | ||
619 | * return to userland. That would trigger a second write to memory | ||
620 | * that we optimize away by setting _PAGE_ACCESSED here. | ||
621 | */ | ||
622 | #ifndef pte_mknonnuma | ||
623 | static inline pte_t pte_mknonnuma(pte_t pte) | ||
624 | { | ||
625 | pte = pte_clear_flags(pte, _PAGE_NUMA); | ||
626 | return pte_set_flags(pte, _PAGE_PRESENT|_PAGE_ACCESSED); | ||
627 | } | ||
628 | #endif | ||
629 | |||
630 | #ifndef pmd_mknonnuma | ||
631 | static inline pmd_t pmd_mknonnuma(pmd_t pmd) | ||
632 | { | ||
633 | pmd = pmd_clear_flags(pmd, _PAGE_NUMA); | ||
634 | return pmd_set_flags(pmd, _PAGE_PRESENT|_PAGE_ACCESSED); | ||
635 | } | ||
636 | #endif | ||
637 | |||
638 | #ifndef pte_mknuma | ||
639 | static inline pte_t pte_mknuma(pte_t pte) | ||
640 | { | ||
641 | pte = pte_set_flags(pte, _PAGE_NUMA); | ||
642 | return pte_clear_flags(pte, _PAGE_PRESENT); | ||
643 | } | ||
644 | #endif | ||
645 | |||
646 | #ifndef pmd_mknuma | ||
647 | static inline pmd_t pmd_mknuma(pmd_t pmd) | ||
648 | { | ||
649 | pmd = pmd_set_flags(pmd, _PAGE_NUMA); | ||
650 | return pmd_clear_flags(pmd, _PAGE_PRESENT); | ||
651 | } | ||
652 | #endif | ||
653 | #else | ||
654 | extern int pte_numa(pte_t pte); | ||
655 | extern int pmd_numa(pmd_t pmd); | ||
656 | extern pte_t pte_mknonnuma(pte_t pte); | ||
657 | extern pmd_t pmd_mknonnuma(pmd_t pmd); | ||
658 | extern pte_t pte_mknuma(pte_t pte); | ||
659 | extern pmd_t pmd_mknuma(pmd_t pmd); | ||
660 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | ||
661 | #else | ||
662 | static inline int pmd_numa(pmd_t pmd) | ||
663 | { | ||
664 | return 0; | ||
665 | } | ||
666 | |||
667 | static inline int pte_numa(pte_t pte) | ||
668 | { | ||
669 | return 0; | ||
670 | } | ||
671 | |||
672 | static inline pte_t pte_mknonnuma(pte_t pte) | ||
673 | { | ||
674 | return pte; | ||
675 | } | ||
676 | |||
677 | static inline pmd_t pmd_mknonnuma(pmd_t pmd) | ||
678 | { | ||
679 | return pmd; | ||
680 | } | ||
681 | |||
682 | static inline pte_t pte_mknuma(pte_t pte) | ||
683 | { | ||
684 | return pte; | ||
685 | } | ||
686 | |||
687 | static inline pmd_t pmd_mknuma(pmd_t pmd) | ||
688 | { | ||
689 | return pmd; | ||
690 | } | ||
691 | #endif /* CONFIG_NUMA_BALANCING */ | ||
692 | |||
583 | #endif /* CONFIG_MMU */ | 693 | #endif /* CONFIG_MMU */ |
584 | 694 | ||
585 | #endif /* !__ASSEMBLY__ */ | 695 | #endif /* !__ASSEMBLY__ */ |
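Building on the flag sketch earlier, the generic pte_mknuma()/pte_mknonnuma() helpers are pure flag transforms: marking an entry clears the present bit so the next access faults, and clearing the mark restores present plus accessed in one step. The round trip can be modelled on a plain flags word as below; the bit positions are assumptions borrowed from x86 and the names are illustrative only.

/* Illustrative round trip through pte_mknuma()/pte_mknonnuma() on a bare
 * flags word. Assumed bits: present=0, accessed=5, NUMA aliases bit 8. */
#include <assert.h>
#include <stdint.h>

#define _PAGE_PRESENT  (1ULL << 0)
#define _PAGE_ACCESSED (1ULL << 5)
#define _PAGE_NUMA     (1ULL << 8)

static uint64_t pte_mknuma(uint64_t pte)
{
        pte |= _PAGE_NUMA;                /* mark for a NUMA hinting fault */
        return pte & ~_PAGE_PRESENT;      /* clear present so the access faults */
}

static uint64_t pte_mknonnuma(uint64_t pte)
{
        pte &= ~_PAGE_NUMA;
        /* restore present and pre-set accessed to avoid a second memory write */
        return pte | _PAGE_PRESENT | _PAGE_ACCESSED;
}

int main(void)
{
        uint64_t pte = _PAGE_PRESENT | _PAGE_ACCESSED;

        pte = pte_mknuma(pte);
        assert(!(pte & _PAGE_PRESENT));                           /* faults now */
        assert((pte & (_PAGE_NUMA | _PAGE_PRESENT)) == _PAGE_NUMA);

        pte = pte_mknonnuma(pte);       /* what the hinting fault handler undoes */
        assert(pte & _PAGE_PRESENT);
        assert(!(pte & _PAGE_NUMA));
        return 0;
}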
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 092dc5305a32..1d76f8ca90f0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -31,7 +31,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma, | |||
31 | unsigned long new_addr, unsigned long old_end, | 31 | unsigned long new_addr, unsigned long old_end, |
32 | pmd_t *old_pmd, pmd_t *new_pmd); | 32 | pmd_t *old_pmd, pmd_t *new_pmd); |
33 | extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 33 | extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
34 | unsigned long addr, pgprot_t newprot); | 34 | unsigned long addr, pgprot_t newprot, |
35 | int prot_numa); | ||
35 | 36 | ||
36 | enum transparent_hugepage_flag { | 37 | enum transparent_hugepage_flag { |
37 | TRANSPARENT_HUGEPAGE_FLAG, | 38 | TRANSPARENT_HUGEPAGE_FLAG, |
@@ -111,7 +112,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma, | |||
111 | #define wait_split_huge_page(__anon_vma, __pmd) \ | 112 | #define wait_split_huge_page(__anon_vma, __pmd) \ |
112 | do { \ | 113 | do { \ |
113 | pmd_t *____pmd = (__pmd); \ | 114 | pmd_t *____pmd = (__pmd); \ |
114 | anon_vma_lock(__anon_vma); \ | 115 | anon_vma_lock_write(__anon_vma); \ |
115 | anon_vma_unlock(__anon_vma); \ | 116 | anon_vma_unlock(__anon_vma); \ |
116 | BUG_ON(pmd_trans_splitting(*____pmd) || \ | 117 | BUG_ON(pmd_trans_splitting(*____pmd) || \ |
117 | pmd_trans_huge(*____pmd)); \ | 118 | pmd_trans_huge(*____pmd)); \ |
@@ -171,6 +172,10 @@ static inline struct page *compound_trans_head(struct page *page) | |||
171 | } | 172 | } |
172 | return page; | 173 | return page; |
173 | } | 174 | } |
175 | |||
176 | extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
177 | unsigned long addr, pmd_t pmd, pmd_t *pmdp); | ||
178 | |||
174 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | 179 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ |
175 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) | 180 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) |
176 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) | 181 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) |
@@ -209,6 +214,13 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, | |||
209 | { | 214 | { |
210 | return 0; | 215 | return 0; |
211 | } | 216 | } |
217 | |||
218 | static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
219 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | ||
220 | { | ||
221 | return 0; | ||
222 | } | ||
223 | |||
212 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 224 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
213 | 225 | ||
214 | #endif /* _LINUX_HUGE_MM_H */ | 226 | #endif /* _LINUX_HUGE_MM_H */ |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3e7fa1acf09c..0c80d3f57a5b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -87,7 +87,7 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, | |||
87 | pud_t *pud, int write); | 87 | pud_t *pud, int write); |
88 | int pmd_huge(pmd_t pmd); | 88 | int pmd_huge(pmd_t pmd); |
89 | int pud_huge(pud_t pmd); | 89 | int pud_huge(pud_t pmd); |
90 | void hugetlb_change_protection(struct vm_area_struct *vma, | 90 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
91 | unsigned long address, unsigned long end, pgprot_t newprot); | 91 | unsigned long address, unsigned long end, pgprot_t newprot); |
92 | 92 | ||
93 | #else /* !CONFIG_HUGETLB_PAGE */ | 93 | #else /* !CONFIG_HUGETLB_PAGE */ |
@@ -132,7 +132,11 @@ static inline void copy_huge_page(struct page *dst, struct page *src) | |||
132 | { | 132 | { |
133 | } | 133 | } |
134 | 134 | ||
135 | #define hugetlb_change_protection(vma, address, end, newprot) | 135 | static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
136 | unsigned long address, unsigned long end, pgprot_t newprot) | ||
137 | { | ||
138 | return 0; | ||
139 | } | ||
136 | 140 | ||
137 | static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, | 141 | static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, |
138 | struct vm_area_struct *vma, unsigned long start, | 142 | struct vm_area_struct *vma, unsigned long start, |
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index dbd212723b74..9adc270de7ef 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -188,6 +188,8 @@ static inline int vma_migratable(struct vm_area_struct *vma) | |||
188 | return 1; | 188 | return 1; |
189 | } | 189 | } |
190 | 190 | ||
191 | extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); | ||
192 | |||
191 | #else | 193 | #else |
192 | 194 | ||
193 | struct mempolicy {}; | 195 | struct mempolicy {}; |
@@ -307,5 +309,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, | |||
307 | return 0; | 309 | return 0; |
308 | } | 310 | } |
309 | 311 | ||
312 | static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, | ||
313 | unsigned long address) | ||
314 | { | ||
315 | return -1; /* no node preference */ | ||
316 | } | ||
317 | |||
310 | #endif /* CONFIG_NUMA */ | 318 | #endif /* CONFIG_NUMA */ |
311 | #endif | 319 | #endif |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 0b5865c61efd..1e9f627967a3 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -23,6 +23,15 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); | |||
23 | #define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page | 23 | #define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page |
24 | * successful migration case. | 24 | * successful migration case. |
25 | */ | 25 | */ |
26 | enum migrate_reason { | ||
27 | MR_COMPACTION, | ||
28 | MR_MEMORY_FAILURE, | ||
29 | MR_MEMORY_HOTPLUG, | ||
30 | MR_SYSCALL, /* also applies to cpusets */ | ||
31 | MR_MEMPOLICY_MBIND, | ||
32 | MR_NUMA_MISPLACED, | ||
33 | MR_CMA | ||
34 | }; | ||
26 | 35 | ||
27 | #ifdef CONFIG_MIGRATION | 36 | #ifdef CONFIG_MIGRATION |
28 | 37 | ||
@@ -32,7 +41,7 @@ extern int migrate_page(struct address_space *, | |||
32 | struct page *, struct page *, enum migrate_mode); | 41 | struct page *, struct page *, enum migrate_mode); |
33 | extern int migrate_pages(struct list_head *l, new_page_t x, | 42 | extern int migrate_pages(struct list_head *l, new_page_t x, |
34 | unsigned long private, bool offlining, | 43 | unsigned long private, bool offlining, |
35 | enum migrate_mode mode); | 44 | enum migrate_mode mode, int reason); |
36 | extern int migrate_huge_page(struct page *, new_page_t x, | 45 | extern int migrate_huge_page(struct page *, new_page_t x, |
37 | unsigned long private, bool offlining, | 46 | unsigned long private, bool offlining, |
38 | enum migrate_mode mode); | 47 | enum migrate_mode mode); |
@@ -54,7 +63,7 @@ static inline void putback_lru_pages(struct list_head *l) {} | |||
54 | static inline void putback_movable_pages(struct list_head *l) {} | 63 | static inline void putback_movable_pages(struct list_head *l) {} |
55 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 64 | static inline int migrate_pages(struct list_head *l, new_page_t x, |
56 | unsigned long private, bool offlining, | 65 | unsigned long private, bool offlining, |
57 | enum migrate_mode mode) { return -ENOSYS; } | 66 | enum migrate_mode mode, int reason) { return -ENOSYS; } |
58 | static inline int migrate_huge_page(struct page *page, new_page_t x, | 67 | static inline int migrate_huge_page(struct page *page, new_page_t x, |
59 | unsigned long private, bool offlining, | 68 | unsigned long private, bool offlining, |
60 | enum migrate_mode mode) { return -ENOSYS; } | 69 | enum migrate_mode mode) { return -ENOSYS; } |
@@ -83,4 +92,37 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
83 | #define fail_migrate_page NULL | 92 | #define fail_migrate_page NULL |
84 | 93 | ||
85 | #endif /* CONFIG_MIGRATION */ | 94 | #endif /* CONFIG_MIGRATION */ |
95 | |||
96 | #ifdef CONFIG_NUMA_BALANCING | ||
97 | extern int migrate_misplaced_page(struct page *page, int node); | ||
98 | extern int migrate_misplaced_page(struct page *page, int node); | ||
99 | extern bool migrate_ratelimited(int node); | ||
100 | #else | ||
101 | static inline int migrate_misplaced_page(struct page *page, int node) | ||
102 | { | ||
103 | return -EAGAIN; /* can't migrate now */ | ||
104 | } | ||
105 | static inline bool migrate_ratelimited(int node) | ||
106 | { | ||
107 | return false; | ||
108 | } | ||
109 | #endif /* CONFIG_NUMA_BALANCING */ | ||
110 | |||
111 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
112 | extern int migrate_misplaced_transhuge_page(struct mm_struct *mm, | ||
113 | struct vm_area_struct *vma, | ||
114 | pmd_t *pmd, pmd_t entry, | ||
115 | unsigned long address, | ||
116 | struct page *page, int node); | ||
117 | #else | ||
118 | static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, | ||
119 | struct vm_area_struct *vma, | ||
120 | pmd_t *pmd, pmd_t entry, | ||
121 | unsigned long address, | ||
122 | struct page *page, int node) | ||
123 | { | ||
124 | return -EAGAIN; | ||
125 | } | ||
126 | #endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/ | ||
127 | |||
86 | #endif /* _LINUX_MIGRATE_H */ | 128 | #endif /* _LINUX_MIGRATE_H */ |
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4af4f0b1be4c..7f4f906190bd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -693,6 +693,36 @@ static inline int page_to_nid(const struct page *page) | |||
693 | } | 693 | } |
694 | #endif | 694 | #endif |
695 | 695 | ||
696 | #ifdef CONFIG_NUMA_BALANCING | ||
697 | static inline int page_xchg_last_nid(struct page *page, int nid) | ||
698 | { | ||
699 | return xchg(&page->_last_nid, nid); | ||
700 | } | ||
701 | |||
702 | static inline int page_last_nid(struct page *page) | ||
703 | { | ||
704 | return page->_last_nid; | ||
705 | } | ||
706 | static inline void reset_page_last_nid(struct page *page) | ||
707 | { | ||
708 | page->_last_nid = -1; | ||
709 | } | ||
710 | #else | ||
711 | static inline int page_xchg_last_nid(struct page *page, int nid) | ||
712 | { | ||
713 | return page_to_nid(page); | ||
714 | } | ||
715 | |||
716 | static inline int page_last_nid(struct page *page) | ||
717 | { | ||
718 | return page_to_nid(page); | ||
719 | } | ||
720 | |||
721 | static inline void reset_page_last_nid(struct page *page) | ||
722 | { | ||
723 | } | ||
724 | #endif | ||
725 | |||
696 | static inline struct zone *page_zone(const struct page *page) | 726 | static inline struct zone *page_zone(const struct page *page) |
697 | { | 727 | { |
698 | return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; | 728 | return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; |
@@ -1078,6 +1108,9 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, | |||
1078 | extern unsigned long do_mremap(unsigned long addr, | 1108 | extern unsigned long do_mremap(unsigned long addr, |
1079 | unsigned long old_len, unsigned long new_len, | 1109 | unsigned long old_len, unsigned long new_len, |
1080 | unsigned long flags, unsigned long new_addr); | 1110 | unsigned long flags, unsigned long new_addr); |
1111 | extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, | ||
1112 | unsigned long end, pgprot_t newprot, | ||
1113 | int dirty_accountable, int prot_numa); | ||
1081 | extern int mprotect_fixup(struct vm_area_struct *vma, | 1114 | extern int mprotect_fixup(struct vm_area_struct *vma, |
1082 | struct vm_area_struct **pprev, unsigned long start, | 1115 | struct vm_area_struct **pprev, unsigned long start, |
1083 | unsigned long end, unsigned long newflags); | 1116 | unsigned long end, unsigned long newflags); |
@@ -1579,6 +1612,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) | |||
1579 | } | 1612 | } |
1580 | #endif | 1613 | #endif |
1581 | 1614 | ||
1615 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
1616 | unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
1617 | unsigned long start, unsigned long end); | ||
1618 | #endif | ||
1619 | |||
1582 | struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); | 1620 | struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); |
1583 | int remap_pfn_range(struct vm_area_struct *, unsigned long addr, | 1621 | int remap_pfn_range(struct vm_area_struct *, unsigned long addr, |
1584 | unsigned long pfn, unsigned long size, pgprot_t); | 1622 | unsigned long pfn, unsigned long size, pgprot_t); |
@@ -1600,6 +1638,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, | |||
1600 | #define FOLL_MLOCK 0x40 /* mark page as mlocked */ | 1638 | #define FOLL_MLOCK 0x40 /* mark page as mlocked */ |
1601 | #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ | 1639 | #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ |
1602 | #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ | 1640 | #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ |
1641 | #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ | ||
1603 | 1642 | ||
1604 | typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, | 1643 | typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, |
1605 | void *data); | 1644 | void *data); |
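The new _last_nid field is only ever updated with an exchange, so a single atomic step both records the node that touched the page and hands back the node that touched it previously, which is what lets the fault path notice pages bouncing between nodes. A userspace sketch of that pattern with C11 atomics (assumed names, not the kernel's struct page):

/* Sketch of the page_xchg_last_nid() pattern with C11 atomics. */
#include <stdatomic.h>
#include <stdio.h>

struct page { _Atomic int last_nid; };

static int page_xchg_last_nid(struct page *page, int nid)
{
        /* store the new node, return the node that faulted on it before */
        return atomic_exchange(&page->last_nid, nid);
}

int main(void)
{
        struct page page = { .last_nid = -1 };          /* -1: never touched */

        int prev = page_xchg_last_nid(&page, 0);        /* fault from node 0 */
        printf("prev=%d now=%d\n", prev, atomic_load(&page.last_nid));

        prev = page_xchg_last_nid(&page, 1);            /* later fault from node 1 */
        if (prev != 1)
                printf("page moved interest: node %d -> node 1\n", prev);
        return 0;
}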
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7ade2731b5d6..7d9ebb7cc982 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -175,6 +175,10 @@ struct page { | |||
175 | */ | 175 | */ |
176 | void *shadow; | 176 | void *shadow; |
177 | #endif | 177 | #endif |
178 | |||
179 | #ifdef CONFIG_NUMA_BALANCING | ||
180 | int _last_nid; | ||
181 | #endif | ||
178 | } | 182 | } |
179 | /* | 183 | /* |
180 | * The struct page can be forced to be double word aligned so that atomic ops | 184 | * The struct page can be forced to be double word aligned so that atomic ops |
@@ -411,9 +415,36 @@ struct mm_struct { | |||
411 | #ifdef CONFIG_CPUMASK_OFFSTACK | 415 | #ifdef CONFIG_CPUMASK_OFFSTACK |
412 | struct cpumask cpumask_allocation; | 416 | struct cpumask cpumask_allocation; |
413 | #endif | 417 | #endif |
418 | #ifdef CONFIG_NUMA_BALANCING | ||
419 | /* | ||
420 | * numa_next_scan is the next time when the PTEs will be marked | ||
421 | * pte_numa to gather statistics and migrate pages to new nodes | ||
422 | * if necessary | ||
423 | */ | ||
424 | unsigned long numa_next_scan; | ||
425 | |||
426 | /* numa_next_reset is when the PTE scanner period will be reset */ | ||
427 | unsigned long numa_next_reset; | ||
428 | |||
429 | /* Restart point for scanning and setting pte_numa */ | ||
430 | unsigned long numa_scan_offset; | ||
431 | |||
432 | /* numa_scan_seq prevents two threads setting pte_numa */ | ||
433 | int numa_scan_seq; | ||
434 | |||
435 | /* | ||
436 | * The first node a task was scheduled on. If a task runs on | ||
437 | * a different node than Make PTE Scan Go Now. | ||
438 | */ | ||
439 | int first_nid; | ||
440 | #endif | ||
414 | struct uprobes_state uprobes_state; | 441 | struct uprobes_state uprobes_state; |
415 | }; | 442 | }; |
416 | 443 | ||
444 | /* first nid will either be a valid NID or one of these values */ | ||
445 | #define NUMA_PTE_SCAN_INIT -1 | ||
446 | #define NUMA_PTE_SCAN_ACTIVE -2 | ||
447 | |||
417 | static inline void mm_init_cpumask(struct mm_struct *mm) | 448 | static inline void mm_init_cpumask(struct mm_struct *mm) |
418 | { | 449 | { |
419 | #ifdef CONFIG_CPUMASK_OFFSTACK | 450 | #ifdef CONFIG_CPUMASK_OFFSTACK |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cd55dad56aac..4bec5be82cab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -735,6 +735,19 @@ typedef struct pglist_data { | |||
735 | struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ | 735 | struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ |
736 | int kswapd_max_order; | 736 | int kswapd_max_order; |
737 | enum zone_type classzone_idx; | 737 | enum zone_type classzone_idx; |
738 | #ifdef CONFIG_NUMA_BALANCING | ||
739 | /* | ||
740 | * Lock serializing the per destination node AutoNUMA memory | ||
741 | * migration rate limiting data. | ||
742 | */ | ||
743 | spinlock_t numabalancing_migrate_lock; | ||
744 | |||
745 | /* Rate limiting time interval */ | ||
746 | unsigned long numabalancing_migrate_next_window; | ||
747 | |||
748 | /* Number of pages migrated during the rate limiting time interval */ | ||
749 | unsigned long numabalancing_migrate_nr_pages; | ||
750 | #endif | ||
738 | } pg_data_t; | 751 | } pg_data_t; |
739 | 752 | ||
740 | #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) | 753 | #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) |
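The three pg_data_t fields above implement a simple rate-limiting window: when the window has expired the counters reset, otherwise migration is refused once the per-window page budget is spent (the spinlock only serializes the update on SMP and is left out here). A standalone sketch of that check follows; the window length and page cap are made-up values, since the real limits live elsewhere in the series, not in this hunk.

/* Sketch of per-node NUMA migration rate limiting, in wall-clock seconds. */
#include <stdbool.h>
#include <stdio.h>

struct node_ratelimit {
        unsigned long next_window;      /* when the current window expires */
        unsigned long nr_pages;         /* pages migrated within the window */
};

#define WINDOW_SECS     1UL
#define WINDOW_PAGES    1024UL          /* assumed cap, for illustration only */

static bool numa_migrate_allowed(struct node_ratelimit *rl, unsigned long now,
                                 unsigned long nr_pages)
{
        if (now > rl->next_window) {            /* window expired: start a new one */
                rl->next_window = now + WINDOW_SECS;
                rl->nr_pages = 0;
        }
        if (rl->nr_pages + nr_pages > WINDOW_PAGES)
                return false;                   /* rate limited: skip this migration */
        rl->nr_pages += nr_pages;
        return true;
}

int main(void)
{
        struct node_ratelimit rl = { 0, 0 };

        printf("first batch:  %d\n", numa_migrate_allowed(&rl, 10, 800));   /* 1 */
        printf("second batch: %d\n", numa_migrate_allowed(&rl, 10, 800));   /* 0 */
        printf("next window:  %d\n", numa_migrate_allowed(&rl, 12, 800));   /* 1 */
        return 0;
}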
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bfe1f4780644..c20635c527a9 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -7,7 +7,7 @@ | |||
7 | #include <linux/list.h> | 7 | #include <linux/list.h> |
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/mutex.h> | 10 | #include <linux/rwsem.h> |
11 | #include <linux/memcontrol.h> | 11 | #include <linux/memcontrol.h> |
12 | 12 | ||
13 | /* | 13 | /* |
@@ -25,8 +25,8 @@ | |||
25 | * pointing to this anon_vma once its vma list is empty. | 25 | * pointing to this anon_vma once its vma list is empty. |
26 | */ | 26 | */ |
27 | struct anon_vma { | 27 | struct anon_vma { |
28 | struct anon_vma *root; /* Root of this anon_vma tree */ | 28 | struct anon_vma *root; /* Root of this anon_vma tree */ |
29 | struct mutex mutex; /* Serialize access to vma list */ | 29 | struct rw_semaphore rwsem; /* W: modification, R: walking the list */ |
30 | /* | 30 | /* |
31 | * The refcount is taken on an anon_vma when there is no | 31 | * The refcount is taken on an anon_vma when there is no |
32 | * guarantee that the vma of page tables will exist for | 32 | * guarantee that the vma of page tables will exist for |
@@ -64,7 +64,7 @@ struct anon_vma_chain { | |||
64 | struct vm_area_struct *vma; | 64 | struct vm_area_struct *vma; |
65 | struct anon_vma *anon_vma; | 65 | struct anon_vma *anon_vma; |
66 | struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ | 66 | struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ |
67 | struct rb_node rb; /* locked by anon_vma->mutex */ | 67 | struct rb_node rb; /* locked by anon_vma->rwsem */ |
68 | unsigned long rb_subtree_last; | 68 | unsigned long rb_subtree_last; |
69 | #ifdef CONFIG_DEBUG_VM_RB | 69 | #ifdef CONFIG_DEBUG_VM_RB |
70 | unsigned long cached_vma_start, cached_vma_last; | 70 | unsigned long cached_vma_start, cached_vma_last; |
@@ -108,26 +108,37 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma) | |||
108 | { | 108 | { |
109 | struct anon_vma *anon_vma = vma->anon_vma; | 109 | struct anon_vma *anon_vma = vma->anon_vma; |
110 | if (anon_vma) | 110 | if (anon_vma) |
111 | mutex_lock(&anon_vma->root->mutex); | 111 | down_write(&anon_vma->root->rwsem); |
112 | } | 112 | } |
113 | 113 | ||
114 | static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) | 114 | static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) |
115 | { | 115 | { |
116 | struct anon_vma *anon_vma = vma->anon_vma; | 116 | struct anon_vma *anon_vma = vma->anon_vma; |
117 | if (anon_vma) | 117 | if (anon_vma) |
118 | mutex_unlock(&anon_vma->root->mutex); | 118 | up_write(&anon_vma->root->rwsem); |
119 | } | 119 | } |
120 | 120 | ||
121 | static inline void anon_vma_lock(struct anon_vma *anon_vma) | 121 | static inline void anon_vma_lock_write(struct anon_vma *anon_vma) |
122 | { | 122 | { |
123 | mutex_lock(&anon_vma->root->mutex); | 123 | down_write(&anon_vma->root->rwsem); |
124 | } | 124 | } |
125 | 125 | ||
126 | static inline void anon_vma_unlock(struct anon_vma *anon_vma) | 126 | static inline void anon_vma_unlock(struct anon_vma *anon_vma) |
127 | { | 127 | { |
128 | mutex_unlock(&anon_vma->root->mutex); | 128 | up_write(&anon_vma->root->rwsem); |
129 | } | 129 | } |
130 | 130 | ||
131 | static inline void anon_vma_lock_read(struct anon_vma *anon_vma) | ||
132 | { | ||
133 | down_read(&anon_vma->root->rwsem); | ||
134 | } | ||
135 | |||
136 | static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) | ||
137 | { | ||
138 | up_read(&anon_vma->root->rwsem); | ||
139 | } | ||
140 | |||
141 | |||
131 | /* | 142 | /* |
132 | * anon_vma helper functions. | 143 | * anon_vma helper functions. |
133 | */ | 144 | */ |
@@ -220,8 +231,8 @@ int try_to_munlock(struct page *); | |||
220 | /* | 231 | /* |
221 | * Called by memory-failure.c to kill processes. | 232 | * Called by memory-failure.c to kill processes. |
222 | */ | 233 | */ |
223 | struct anon_vma *page_lock_anon_vma(struct page *page); | 234 | struct anon_vma *page_lock_anon_vma_read(struct page *page); |
224 | void page_unlock_anon_vma(struct anon_vma *anon_vma); | 235 | void page_unlock_anon_vma_read(struct anon_vma *anon_vma); |
225 | int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); | 236 | int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); |
226 | 237 | ||
227 | /* | 238 | /* |
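Turning the anon_vma mutex into a rw_semaphore lets rmap walkers that only read the interval tree (the new page_lock_anon_vma_read() path) run in parallel, while inserting or unlinking VMAs still takes the lock exclusively. As a rough userspace analogy only, the same read/write split expressed with POSIX rwlocks (build with -pthread):

/* Analogy, not kernel code: readers share, writers exclude. */
#include <pthread.h>

struct anon_vma {
        pthread_rwlock_t rwsem;         /* W: modification, R: walking the list */
};

static void anon_vma_lock_write(struct anon_vma *av)
{
        pthread_rwlock_wrlock(&av->rwsem);      /* e.g. linking/unlinking VMAs */
}

static void anon_vma_lock_read(struct anon_vma *av)
{
        pthread_rwlock_rdlock(&av->rwsem);      /* e.g. an rmap walk */
}

static void anon_vma_unlock(struct anon_vma *av)
{
        pthread_rwlock_unlock(&av->rwsem);      /* same call for both modes */
}

int main(void)
{
        struct anon_vma av;

        pthread_rwlock_init(&av.rwsem, NULL);
        anon_vma_lock_read(&av);        /* many readers may hold this at once */
        anon_vma_unlock(&av);
        anon_vma_lock_write(&av);       /* a writer excludes everyone else */
        anon_vma_unlock(&av);
        pthread_rwlock_destroy(&av.rwsem);
        return 0;
}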
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c2f3072beef..b089c92c609b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1527,6 +1527,14 @@ struct task_struct { | |||
1527 | short il_next; | 1527 | short il_next; |
1528 | short pref_node_fork; | 1528 | short pref_node_fork; |
1529 | #endif | 1529 | #endif |
1530 | #ifdef CONFIG_NUMA_BALANCING | ||
1531 | int numa_scan_seq; | ||
1532 | int numa_migrate_seq; | ||
1533 | unsigned int numa_scan_period; | ||
1534 | u64 node_stamp; /* migration stamp */ | ||
1535 | struct callback_head numa_work; | ||
1536 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1537 | |||
1530 | struct rcu_head rcu; | 1538 | struct rcu_head rcu; |
1531 | 1539 | ||
1532 | /* | 1540 | /* |
@@ -1601,6 +1609,18 @@ struct task_struct { | |||
1601 | /* Future-safe accessor for struct task_struct's cpus_allowed. */ | 1609 | /* Future-safe accessor for struct task_struct's cpus_allowed. */ |
1602 | #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) | 1610 | #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) |
1603 | 1611 | ||
1612 | #ifdef CONFIG_NUMA_BALANCING | ||
1613 | extern void task_numa_fault(int node, int pages, bool migrated); | ||
1614 | extern void set_numabalancing_state(bool enabled); | ||
1615 | #else | ||
1616 | static inline void task_numa_fault(int node, int pages, bool migrated) | ||
1617 | { | ||
1618 | } | ||
1619 | static inline void set_numabalancing_state(bool enabled) | ||
1620 | { | ||
1621 | } | ||
1622 | #endif | ||
1623 | |||
1604 | /* | 1624 | /* |
1605 | * Priority of a process goes from 0..MAX_PRIO-1, valid RT | 1625 | * Priority of a process goes from 0..MAX_PRIO-1, valid RT |
1606 | * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH | 1626 | * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH |
@@ -2030,6 +2050,13 @@ enum sched_tunable_scaling { | |||
2030 | }; | 2050 | }; |
2031 | extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; | 2051 | extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; |
2032 | 2052 | ||
2053 | extern unsigned int sysctl_numa_balancing_scan_delay; | ||
2054 | extern unsigned int sysctl_numa_balancing_scan_period_min; | ||
2055 | extern unsigned int sysctl_numa_balancing_scan_period_max; | ||
2056 | extern unsigned int sysctl_numa_balancing_scan_period_reset; | ||
2057 | extern unsigned int sysctl_numa_balancing_scan_size; | ||
2058 | extern unsigned int sysctl_numa_balancing_settle_count; | ||
2059 | |||
2033 | #ifdef CONFIG_SCHED_DEBUG | 2060 | #ifdef CONFIG_SCHED_DEBUG |
2034 | extern unsigned int sysctl_sched_migration_cost; | 2061 | extern unsigned int sysctl_sched_migration_cost; |
2035 | extern unsigned int sysctl_sched_nr_migrate; | 2062 | extern unsigned int sysctl_sched_nr_migrate; |
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index fe786f07d2bd..fce0a2799d43 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -38,8 +38,18 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
38 | KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, | 38 | KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, |
39 | KSWAPD_SKIP_CONGESTION_WAIT, | 39 | KSWAPD_SKIP_CONGESTION_WAIT, |
40 | PAGEOUTRUN, ALLOCSTALL, PGROTATED, | 40 | PAGEOUTRUN, ALLOCSTALL, PGROTATED, |
41 | #ifdef CONFIG_NUMA_BALANCING | ||
42 | NUMA_PTE_UPDATES, | ||
43 | NUMA_HINT_FAULTS, | ||
44 | NUMA_HINT_FAULTS_LOCAL, | ||
45 | NUMA_PAGE_MIGRATE, | ||
46 | #endif | ||
47 | #ifdef CONFIG_MIGRATION | ||
48 | PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, | ||
49 | #endif | ||
41 | #ifdef CONFIG_COMPACTION | 50 | #ifdef CONFIG_COMPACTION |
42 | COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED, | 51 | COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, |
52 | COMPACTISOLATED, | ||
43 | COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, | 53 | COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, |
44 | #endif | 54 | #endif |
45 | #ifdef CONFIG_HUGETLB_PAGE | 55 | #ifdef CONFIG_HUGETLB_PAGE |
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 92a86b2cce33..a13291f7da88 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -80,6 +80,14 @@ static inline void vm_events_fold_cpu(int cpu) | |||
80 | 80 | ||
81 | #endif /* CONFIG_VM_EVENT_COUNTERS */ | 81 | #endif /* CONFIG_VM_EVENT_COUNTERS */ |
82 | 82 | ||
83 | #ifdef CONFIG_NUMA_BALANCING | ||
84 | #define count_vm_numa_event(x) count_vm_event(x) | ||
85 | #define count_vm_numa_events(x, y) count_vm_events(x, y) | ||
86 | #else | ||
87 | #define count_vm_numa_event(x) do {} while (0) | ||
88 | #define count_vm_numa_events(x, y) do {} while (0) | ||
89 | #endif /* CONFIG_NUMA_BALANCING */ | ||
90 | |||
83 | #define __count_zone_vm_events(item, zone, delta) \ | 91 | #define __count_zone_vm_events(item, zone, delta) \ |
84 | __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ | 92 | __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ |
85 | zone_idx(zone), delta) | 93 | zone_idx(zone), delta) |
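count_vm_numa_event()/count_vm_numa_events() compile away to empty statements when NUMA_BALANCING is off, so call sites in mm/ need no #ifdefs of their own. A self-contained illustration of that compile-out pattern; the array-backed counters are stand-ins, not the kernel's per-cpu vm_event_states.

/* Demonstration of the conditional no-op counter macros. */
#include <stdio.h>

#define CONFIG_NUMA_BALANCING 1

enum { NUMA_HINT_FAULTS, NUMA_PAGE_MIGRATE, NR_EVENTS };
static unsigned long events[NR_EVENTS];

#if CONFIG_NUMA_BALANCING
# define count_vm_numa_event(x)         (events[x] += 1)
# define count_vm_numa_events(x, y)     (events[x] += (y))
#else
# define count_vm_numa_event(x)         do { } while (0)
# define count_vm_numa_events(x, y)     do { } while (0)
#endif

int main(void)
{
        count_vm_numa_event(NUMA_HINT_FAULTS);
        count_vm_numa_events(NUMA_PAGE_MIGRATE, 8);
        printf("faults=%lu migrated=%lu\n",
               events[NUMA_HINT_FAULTS], events[NUMA_PAGE_MIGRATE]);
        return 0;
}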
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
new file mode 100644
index 000000000000..ec2a6ccfd7e5
--- /dev/null
+++ b/include/trace/events/migrate.h
@@ -0,0 +1,51 @@ | |||
1 | #undef TRACE_SYSTEM | ||
2 | #define TRACE_SYSTEM migrate | ||
3 | |||
4 | #if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ) | ||
5 | #define _TRACE_MIGRATE_H | ||
6 | |||
7 | #define MIGRATE_MODE \ | ||
8 | {MIGRATE_ASYNC, "MIGRATE_ASYNC"}, \ | ||
9 | {MIGRATE_SYNC_LIGHT, "MIGRATE_SYNC_LIGHT"}, \ | ||
10 | {MIGRATE_SYNC, "MIGRATE_SYNC"} | ||
11 | |||
12 | #define MIGRATE_REASON \ | ||
13 | {MR_COMPACTION, "compaction"}, \ | ||
14 | {MR_MEMORY_FAILURE, "memory_failure"}, \ | ||
15 | {MR_MEMORY_HOTPLUG, "memory_hotplug"}, \ | ||
16 | {MR_SYSCALL, "syscall_or_cpuset"}, \ | ||
17 | {MR_MEMPOLICY_MBIND, "mempolicy_mbind"}, \ | ||
18 | {MR_CMA, "cma"} | ||
19 | |||
20 | TRACE_EVENT(mm_migrate_pages, | ||
21 | |||
22 | TP_PROTO(unsigned long succeeded, unsigned long failed, | ||
23 | enum migrate_mode mode, int reason), | ||
24 | |||
25 | TP_ARGS(succeeded, failed, mode, reason), | ||
26 | |||
27 | TP_STRUCT__entry( | ||
28 | __field( unsigned long, succeeded) | ||
29 | __field( unsigned long, failed) | ||
30 | __field( enum migrate_mode, mode) | ||
31 | __field( int, reason) | ||
32 | ), | ||
33 | |||
34 | TP_fast_assign( | ||
35 | __entry->succeeded = succeeded; | ||
36 | __entry->failed = failed; | ||
37 | __entry->mode = mode; | ||
38 | __entry->reason = reason; | ||
39 | ), | ||
40 | |||
41 | TP_printk("nr_succeeded=%lu nr_failed=%lu mode=%s reason=%s", | ||
42 | __entry->succeeded, | ||
43 | __entry->failed, | ||
44 | __print_symbolic(__entry->mode, MIGRATE_MODE), | ||
45 | __print_symbolic(__entry->reason, MIGRATE_REASON)) | ||
46 | ); | ||
47 | |||
48 | #endif /* _TRACE_MIGRATE_H */ | ||
49 | |||
50 | /* This part must be outside protection */ | ||
51 | #include <trace/define_trace.h> | ||
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 23e62e0537e2..0d11c3dcd3a1 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -20,6 +20,7 @@ enum { | |||
20 | MPOL_PREFERRED, | 20 | MPOL_PREFERRED, |
21 | MPOL_BIND, | 21 | MPOL_BIND, |
22 | MPOL_INTERLEAVE, | 22 | MPOL_INTERLEAVE, |
23 | MPOL_LOCAL, | ||
23 | MPOL_MAX, /* always last member of enum */ | 24 | MPOL_MAX, /* always last member of enum */ |
24 | }; | 25 | }; |
25 | 26 | ||
@@ -47,9 +48,15 @@ enum mpol_rebind_step { | |||
47 | 48 | ||
48 | /* Flags for mbind */ | 49 | /* Flags for mbind */ |
49 | #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ | 50 | #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ |
50 | #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ | 51 | #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform |
51 | #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ | 52 | to policy */ |
52 | #define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */ | 53 | #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */ |
54 | #define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */ | ||
55 | #define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */ | ||
56 | |||
57 | #define MPOL_MF_VALID (MPOL_MF_STRICT | \ | ||
58 | MPOL_MF_MOVE | \ | ||
59 | MPOL_MF_MOVE_ALL) | ||
53 | 60 | ||
54 | /* | 61 | /* |
55 | * Internal flags that share the struct mempolicy flags word with | 62 | * Internal flags that share the struct mempolicy flags word with |
@@ -59,6 +66,8 @@ enum mpol_rebind_step { | |||
59 | #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ | 66 | #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ |
60 | #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ | 67 | #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ |
61 | #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ | 68 | #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ |
69 | #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ | ||
70 | #define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */ | ||
62 | 71 | ||
63 | 72 | ||
64 | #endif /* _UAPI_LINUX_MEMPOLICY_H */ | 73 | #endif /* _UAPI_LINUX_MEMPOLICY_H */ |
diff --git a/init/Kconfig b/init/Kconfig
index 2054e048bb98..1a207efca591 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -717,6 +717,50 @@ config LOG_BUF_SHIFT | |||
717 | config HAVE_UNSTABLE_SCHED_CLOCK | 717 | config HAVE_UNSTABLE_SCHED_CLOCK |
718 | bool | 718 | bool |
719 | 719 | ||
720 | # | ||
721 | # For architectures that want to enable the support for NUMA-affine scheduler | ||
722 | # balancing logic: | ||
723 | # | ||
724 | config ARCH_SUPPORTS_NUMA_BALANCING | ||
725 | bool | ||
726 | |||
727 | # For architectures that (ab)use NUMA to represent different memory regions | ||
728 | # all cpu-local but of different latencies, such as SuperH. | ||
729 | # | ||
730 | config ARCH_WANT_NUMA_VARIABLE_LOCALITY | ||
731 | bool | ||
732 | |||
733 | # | ||
734 | # For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE | ||
735 | config ARCH_WANTS_PROT_NUMA_PROT_NONE | ||
736 | bool | ||
737 | |||
738 | config ARCH_USES_NUMA_PROT_NONE | ||
739 | bool | ||
740 | default y | ||
741 | depends on ARCH_WANTS_PROT_NUMA_PROT_NONE | ||
742 | depends on NUMA_BALANCING | ||
743 | |||
744 | config NUMA_BALANCING_DEFAULT_ENABLED | ||
745 | bool "Automatically enable NUMA aware memory/task placement" | ||
746 | default y | ||
747 | depends on NUMA_BALANCING | ||
748 | help | ||
749 | If set, automatic NUMA balancing will be enabled if running on a NUMA | ||
750 | machine. | ||
751 | |||
752 | config NUMA_BALANCING | ||
753 | bool "Memory placement aware NUMA scheduler" | ||
754 | depends on ARCH_SUPPORTS_NUMA_BALANCING | ||
755 | depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY | ||
756 | depends on SMP && NUMA && MIGRATION | ||
757 | help | ||
758 | This option adds support for automatic NUMA aware memory/task placement. | ||
759 | The mechanism is quite primitive and is based on migrating memory, | ||
760 | when it is referenced, to the node the task is running on. | ||
761 | |||
762 | This system will be inactive on UMA systems. | ||
763 | |||
720 | menuconfig CGROUPS | 764 | menuconfig CGROUPS |
721 | boolean "Control Group support" | 765 | boolean "Control Group support" |
722 | depends on EVENTFD | 766 | depends on EVENTFD |
diff --git a/kernel/fork.c b/kernel/fork.c
index 3c31e874afad..115d6c2e4cca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -823,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
823 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 823 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
824 | mm->pmd_huge_pte = NULL; | 824 | mm->pmd_huge_pte = NULL; |
825 | #endif | 825 | #endif |
826 | #ifdef CONFIG_NUMA_BALANCING | ||
827 | mm->first_nid = NUMA_PTE_SCAN_INIT; | ||
828 | #endif | ||
826 | if (!mm_init(mm, tsk)) | 829 | if (!mm_init(mm, tsk)) |
827 | goto fail_nomem; | 830 | goto fail_nomem; |
828 | 831 | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0533496b6228..c1fb82104bfb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -193,23 +193,10 @@ static void sched_feat_disable(int i) { }; | |||
193 | static void sched_feat_enable(int i) { }; | 193 | static void sched_feat_enable(int i) { }; |
194 | #endif /* HAVE_JUMP_LABEL */ | 194 | #endif /* HAVE_JUMP_LABEL */ |
195 | 195 | ||
196 | static ssize_t | 196 | static int sched_feat_set(char *cmp) |
197 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
198 | size_t cnt, loff_t *ppos) | ||
199 | { | 197 | { |
200 | char buf[64]; | ||
201 | char *cmp; | ||
202 | int neg = 0; | ||
203 | int i; | 198 | int i; |
204 | 199 | int neg = 0; | |
205 | if (cnt > 63) | ||
206 | cnt = 63; | ||
207 | |||
208 | if (copy_from_user(&buf, ubuf, cnt)) | ||
209 | return -EFAULT; | ||
210 | |||
211 | buf[cnt] = 0; | ||
212 | cmp = strstrip(buf); | ||
213 | 200 | ||
214 | if (strncmp(cmp, "NO_", 3) == 0) { | 201 | if (strncmp(cmp, "NO_", 3) == 0) { |
215 | neg = 1; | 202 | neg = 1; |
@@ -229,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
229 | } | 216 | } |
230 | } | 217 | } |
231 | 218 | ||
219 | return i; | ||
220 | } | ||
221 | |||
222 | static ssize_t | ||
223 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
224 | size_t cnt, loff_t *ppos) | ||
225 | { | ||
226 | char buf[64]; | ||
227 | char *cmp; | ||
228 | int i; | ||
229 | |||
230 | if (cnt > 63) | ||
231 | cnt = 63; | ||
232 | |||
233 | if (copy_from_user(&buf, ubuf, cnt)) | ||
234 | return -EFAULT; | ||
235 | |||
236 | buf[cnt] = 0; | ||
237 | cmp = strstrip(buf); | ||
238 | |||
239 | i = sched_feat_set(cmp); | ||
232 | if (i == __SCHED_FEAT_NR) | 240 | if (i == __SCHED_FEAT_NR) |
233 | return -EINVAL; | 241 | return -EINVAL; |
234 | 242 | ||
@@ -1560,7 +1568,40 @@ static void __sched_fork(struct task_struct *p) | |||
1560 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1568 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
1561 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 1569 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
1562 | #endif | 1570 | #endif |
1571 | |||
1572 | #ifdef CONFIG_NUMA_BALANCING | ||
1573 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { | ||
1574 | p->mm->numa_next_scan = jiffies; | ||
1575 | p->mm->numa_next_reset = jiffies; | ||
1576 | p->mm->numa_scan_seq = 0; | ||
1577 | } | ||
1578 | |||
1579 | p->node_stamp = 0ULL; | ||
1580 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | ||
1581 | p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; | ||
1582 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | ||
1583 | p->numa_work.next = &p->numa_work; | ||
1584 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1585 | } | ||
1586 | |||
1587 | #ifdef CONFIG_NUMA_BALANCING | ||
1588 | #ifdef CONFIG_SCHED_DEBUG | ||
1589 | void set_numabalancing_state(bool enabled) | ||
1590 | { | ||
1591 | if (enabled) | ||
1592 | sched_feat_set("NUMA"); | ||
1593 | else | ||
1594 | sched_feat_set("NO_NUMA"); | ||
1595 | } | ||
1596 | #else | ||
1597 | __read_mostly bool numabalancing_enabled; | ||
1598 | |||
1599 | void set_numabalancing_state(bool enabled) | ||
1600 | { | ||
1601 | numabalancing_enabled = enabled; | ||
1563 | } | 1602 | } |
1603 | #endif /* CONFIG_SCHED_DEBUG */ | ||
1604 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1564 | 1605 | ||
1565 | /* | 1606 | /* |
1566 | * fork()/clone()-time setup: | 1607 | * fork()/clone()-time setup: |
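Factoring the body of sched_feat_write() into sched_feat_set() is what allows set_numabalancing_state() to flip the NUMA feature from inside the kernel using the same "NO_" prefix convention as the debugfs file. A simplified, standalone sketch of that convention; the two-entry feature table and the return value handling are illustrative only.

/* Sketch of the "NO_<FEATURE>" toggle convention. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NR_FEATS 2
static const char * const feat_names[NR_FEATS] = { "NUMA", "NUMA_FORCE" };
static bool feat_enabled[NR_FEATS];

static int sched_feat_set(const char *cmp)
{
        int neg = 0, i;

        if (strncmp(cmp, "NO_", 3) == 0) {      /* "NO_FOO" disables FOO */
                neg = 1;
                cmp += 3;
        }
        for (i = 0; i < NR_FEATS; i++) {
                if (strcmp(cmp, feat_names[i]) == 0) {
                        feat_enabled[i] = !neg;
                        break;
                }
        }
        return i;                               /* == NR_FEATS: no such feature */
}

int main(void)
{
        sched_feat_set("NUMA");         /* roughly set_numabalancing_state(true) */
        sched_feat_set("NO_NUMA");      /* ...and what false maps to */
        printf("NUMA=%d bogus=%s\n", feat_enabled[0],
               sched_feat_set("BOGUS") == NR_FEATS ? "rejected" : "accepted");
        return 0;
}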
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 756f9f9e8542..9af5af979a13 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@ | |||
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/profile.h> | 27 | #include <linux/profile.h> |
28 | #include <linux/interrupt.h> | 28 | #include <linux/interrupt.h> |
29 | #include <linux/mempolicy.h> | ||
30 | #include <linux/migrate.h> | ||
31 | #include <linux/task_work.h> | ||
29 | 32 | ||
30 | #include <trace/events/sched.h> | 33 | #include <trace/events/sched.h> |
31 | 34 | ||
@@ -774,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
774 | * Scheduling class queueing methods: | 777 | * Scheduling class queueing methods: |
775 | */ | 778 | */ |
776 | 779 | ||
780 | #ifdef CONFIG_NUMA_BALANCING | ||
781 | /* | ||
782 | * numa task sample period in ms | ||
783 | */ | ||
784 | unsigned int sysctl_numa_balancing_scan_period_min = 100; | ||
785 | unsigned int sysctl_numa_balancing_scan_period_max = 100*50; | ||
786 | unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; | ||
787 | |||
788 | /* Portion of address space to scan in MB */ | ||
789 | unsigned int sysctl_numa_balancing_scan_size = 256; | ||
790 | |||
791 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ | ||
792 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | ||
793 | |||
794 | static void task_numa_placement(struct task_struct *p) | ||
795 | { | ||
796 | int seq = ACCESS_ONCE(p->mm->numa_scan_seq); | ||
797 | |||
798 | if (p->numa_scan_seq == seq) | ||
799 | return; | ||
800 | p->numa_scan_seq = seq; | ||
801 | |||
802 | /* FIXME: Scheduling placement policy hints go here */ | ||
803 | } | ||
804 | |||
805 | /* | ||
806 | * Got a PROT_NONE fault for a page on @node. | ||
807 | */ | ||
808 | void task_numa_fault(int node, int pages, bool migrated) | ||
809 | { | ||
810 | struct task_struct *p = current; | ||
811 | |||
812 | if (!sched_feat_numa(NUMA)) | ||
813 | return; | ||
814 | |||
815 | /* FIXME: Allocate task-specific structure for placement policy here */ | ||
816 | |||
817 | /* | ||
818 | * If pages are properly placed (did not migrate) then scan slower. | ||
819 | * This is reset periodically in case of phase changes | ||
820 | */ | ||
821 | if (!migrated) | ||
822 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, | ||
823 | p->numa_scan_period + jiffies_to_msecs(10)); | ||
824 | |||
825 | task_numa_placement(p); | ||
826 | } | ||
827 | |||
828 | static void reset_ptenuma_scan(struct task_struct *p) | ||
829 | { | ||
830 | ACCESS_ONCE(p->mm->numa_scan_seq)++; | ||
831 | p->mm->numa_scan_offset = 0; | ||
832 | } | ||
833 | |||
834 | /* | ||
835 | * The expensive part of numa migration is done from task_work context. | ||
836 | * Triggered from task_tick_numa(). | ||
837 | */ | ||
838 | void task_numa_work(struct callback_head *work) | ||
839 | { | ||
840 | unsigned long migrate, next_scan, now = jiffies; | ||
841 | struct task_struct *p = current; | ||
842 | struct mm_struct *mm = p->mm; | ||
843 | struct vm_area_struct *vma; | ||
844 | unsigned long start, end; | ||
845 | long pages; | ||
846 | |||
847 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | ||
848 | |||
849 | work->next = work; /* protect against double add */ | ||
850 | /* | ||
851 | * Who cares about NUMA placement when they're dying. | ||
852 | * | ||
853 | * NOTE: make sure not to dereference p->mm before this check, | ||
854 | * exit_task_work() happens _after_ exit_mm() so we could be called | ||
855 | * without p->mm even though we still had it when we enqueued this | ||
856 | * work. | ||
857 | */ | ||
858 | if (p->flags & PF_EXITING) | ||
859 | return; | ||
860 | |||
861 | /* | ||
862 | * We do not care about task placement until a task runs on a node | ||
863 | * other than the first one used by the address space. This is | ||
864 | * largely because migrations are driven by what CPU the task | ||
865 | * is running on. If it's never scheduled on another node, it'll | ||
866 | * not migrate so why bother trapping the fault. | ||
867 | */ | ||
868 | if (mm->first_nid == NUMA_PTE_SCAN_INIT) | ||
869 | mm->first_nid = numa_node_id(); | ||
870 | if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { | ||
871 | /* Are we running on a new node yet? */ | ||
872 | if (numa_node_id() == mm->first_nid && | ||
873 | !sched_feat_numa(NUMA_FORCE)) | ||
874 | return; | ||
875 | |||
876 | mm->first_nid = NUMA_PTE_SCAN_ACTIVE; | ||
877 | } | ||
878 | |||
879 | /* | ||
880 | * Reset the scan period if enough time has gone by. Objective is that | ||
881 | * scanning will be reduced if pages are properly placed. As tasks | ||
882 | * can enter different phases this needs to be re-examined. Lacking | ||
883 | * proper tracking of reference behaviour, this blunt hammer is used. | ||
884 | */ | ||
885 | migrate = mm->numa_next_reset; | ||
886 | if (time_after(now, migrate)) { | ||
887 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
888 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); | ||
889 | xchg(&mm->numa_next_reset, next_scan); | ||
890 | } | ||
891 | |||
892 | /* | ||
893 | * Enforce maximal scan/migration frequency.. | ||
894 | */ | ||
895 | migrate = mm->numa_next_scan; | ||
896 | if (time_before(now, migrate)) | ||
897 | return; | ||
898 | |||
899 | if (p->numa_scan_period == 0) | ||
900 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
901 | |||
902 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); | ||
903 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) | ||
904 | return; | ||
905 | |||
906 | /* | ||
907 | * Do not set pte_numa if the current running node is rate-limited. | ||
908 | * This loses statistics on the fault but if we are unwilling to | ||
909 | * migrate to this node, it is less likely we can do useful work | ||
910 | */ | ||
911 | if (migrate_ratelimited(numa_node_id())) | ||
912 | return; | ||
913 | |||
914 | start = mm->numa_scan_offset; | ||
915 | pages = sysctl_numa_balancing_scan_size; | ||
916 | pages <<= 20 - PAGE_SHIFT; /* MB in pages */ | ||
917 | if (!pages) | ||
918 | return; | ||
919 | |||
920 | down_read(&mm->mmap_sem); | ||
921 | vma = find_vma(mm, start); | ||
922 | if (!vma) { | ||
923 | reset_ptenuma_scan(p); | ||
924 | start = 0; | ||
925 | vma = mm->mmap; | ||
926 | } | ||
927 | for (; vma; vma = vma->vm_next) { | ||
928 | if (!vma_migratable(vma)) | ||
929 | continue; | ||
930 | |||
931 | /* Skip small VMAs. They are not likely to be of relevance */ | ||
932 | if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR) | ||
933 | continue; | ||
934 | |||
935 | do { | ||
936 | start = max(start, vma->vm_start); | ||
937 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | ||
938 | end = min(end, vma->vm_end); | ||
939 | pages -= change_prot_numa(vma, start, end); | ||
940 | |||
941 | start = end; | ||
942 | if (pages <= 0) | ||
943 | goto out; | ||
944 | } while (end != vma->vm_end); | ||
945 | } | ||
946 | |||
947 | out: | ||
948 | /* | ||
949 | * It is possible to reach the end of the VMA list but the last few VMAs are | ||
950 | * not guaranteed to be vma_migratable. If they are not, we would find the | ||
951 | * !migratable VMA on the next scan but not reset the scanner to the start | ||
952 | * so check it now. | ||
953 | */ | ||
954 | if (vma) | ||
955 | mm->numa_scan_offset = start; | ||
956 | else | ||
957 | reset_ptenuma_scan(p); | ||
958 | up_read(&mm->mmap_sem); | ||
959 | } | ||
960 | |||
961 | /* | ||
962 | * Drive the periodic memory faults. | ||
963 | */ | ||
964 | void task_tick_numa(struct rq *rq, struct task_struct *curr) | ||
965 | { | ||
966 | struct callback_head *work = &curr->numa_work; | ||
967 | u64 period, now; | ||
968 | |||
969 | /* | ||
970 | * We don't care about NUMA placement if we don't have memory. | ||
971 | */ | ||
972 | if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) | ||
973 | return; | ||
974 | |||
975 | /* | ||
976 | * Using runtime rather than walltime has the dual advantage that | ||
977 | * we (mostly) drive the selection from busy threads and that the | ||
978 | * task needs to have done some actual work before we bother with | ||
979 | * NUMA placement. | ||
980 | */ | ||
981 | now = curr->se.sum_exec_runtime; | ||
982 | period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; | ||
983 | |||
984 | if (now - curr->node_stamp > period) { | ||
985 | if (!curr->node_stamp) | ||
986 | curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
987 | curr->node_stamp = now; | ||
988 | |||
989 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { | ||
990 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ | ||
991 | task_work_add(curr, work, true); | ||
992 | } | ||
993 | } | ||
994 | } | ||
995 | #else | ||
996 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | ||
997 | { | ||
998 | } | ||
999 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1000 | |||
777 | static void | 1001 | static void |
778 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1002 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
779 | { | 1003 | { |
@@ -5501,6 +5725,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
5501 | entity_tick(cfs_rq, se, queued); | 5725 | entity_tick(cfs_rq, se, queued); |
5502 | } | 5726 | } |
5503 | 5727 | ||
5728 | if (sched_feat_numa(NUMA)) | ||
5729 | task_tick_numa(rq, curr); | ||
5730 | |||
5504 | update_rq_runnable_avg(rq, 1); | 5731 | update_rq_runnable_avg(rq, 1); |
5505 | } | 5732 | } |
5506 | 5733 | ||
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e68e69ab917d..1ad1d2b5395f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -66,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true) | |||
66 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 66 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
67 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 67 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
68 | SCHED_FEAT(LB_MIN, false) | 68 | SCHED_FEAT(LB_MIN, false) |
69 | |||
70 | /* | ||
71 | * Apply the automatic NUMA scheduling policy. Enabled automatically | ||
72 | * at runtime if running on a NUMA machine. Can be controlled via | ||
73 | * numa_balancing=. Allow PTE scanning to be forced on UMA machines | ||
74 | * for debugging the core machinery. | ||
75 | */ | ||
76 | #ifdef CONFIG_NUMA_BALANCING | ||
77 | SCHED_FEAT(NUMA, false) | ||
78 | SCHED_FEAT(NUMA_FORCE, false) | ||
79 | #endif | ||
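With CONFIG_SCHED_DEBUG these are ordinary scheduler feature bits, so (assuming the usual debugfs sched_features interface) the core machinery can be exercised on a single-node test box with something like 'echo NUMA_FORCE > /sys/kernel/debug/sched_features'; the NUMA bit itself defaults to false here and, per the comment, is expected to be switched on at runtime when the kernel finds itself on a real NUMA machine.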
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5eca173b563f..fc886441436a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -663,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; | |||
663 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | 663 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) |
664 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | 664 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ |
665 | 665 | ||
666 | #ifdef CONFIG_NUMA_BALANCING | ||
667 | #define sched_feat_numa(x) sched_feat(x) | ||
668 | #ifdef CONFIG_SCHED_DEBUG | ||
669 | #define numabalancing_enabled sched_feat_numa(NUMA) | ||
670 | #else | ||
671 | extern bool numabalancing_enabled; | ||
672 | #endif /* CONFIG_SCHED_DEBUG */ | ||
673 | #else | ||
674 | #define sched_feat_numa(x) (0) | ||
675 | #define numabalancing_enabled (0) | ||
676 | #endif /* CONFIG_NUMA_BALANCING */ | ||
677 | |||
666 | static inline u64 global_rt_period(void) | 678 | static inline u64 global_rt_period(void) |
667 | { | 679 | { |
668 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | 680 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 33f71f37267e..c88878db491e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ | |||
256 | static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 256 | static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
257 | static int min_wakeup_granularity_ns; /* 0 usecs */ | 257 | static int min_wakeup_granularity_ns; /* 0 usecs */ |
258 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 258 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
259 | #ifdef CONFIG_SMP | ||
259 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; | 260 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; |
260 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; | 261 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; |
261 | #endif | 262 | #endif /* CONFIG_SMP */ |
263 | #endif /* CONFIG_SCHED_DEBUG */ | ||
262 | 264 | ||
263 | #ifdef CONFIG_COMPACTION | 265 | #ifdef CONFIG_COMPACTION |
264 | static int min_extfrag_threshold; | 266 | static int min_extfrag_threshold; |
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = { | |||
301 | .extra1 = &min_wakeup_granularity_ns, | 303 | .extra1 = &min_wakeup_granularity_ns, |
302 | .extra2 = &max_wakeup_granularity_ns, | 304 | .extra2 = &max_wakeup_granularity_ns, |
303 | }, | 305 | }, |
306 | #ifdef CONFIG_SMP | ||
304 | { | 307 | { |
305 | .procname = "sched_tunable_scaling", | 308 | .procname = "sched_tunable_scaling", |
306 | .data = &sysctl_sched_tunable_scaling, | 309 | .data = &sysctl_sched_tunable_scaling, |
@@ -347,7 +350,45 @@ static struct ctl_table kern_table[] = { | |||
347 | .extra1 = &zero, | 350 | .extra1 = &zero, |
348 | .extra2 = &one, | 351 | .extra2 = &one, |
349 | }, | 352 | }, |
350 | #endif | 353 | #endif /* CONFIG_SMP */ |
354 | #ifdef CONFIG_NUMA_BALANCING | ||
355 | { | ||
356 | .procname = "numa_balancing_scan_delay_ms", | ||
357 | .data = &sysctl_numa_balancing_scan_delay, | ||
358 | .maxlen = sizeof(unsigned int), | ||
359 | .mode = 0644, | ||
360 | .proc_handler = proc_dointvec, | ||
361 | }, | ||
362 | { | ||
363 | .procname = "numa_balancing_scan_period_min_ms", | ||
364 | .data = &sysctl_numa_balancing_scan_period_min, | ||
365 | .maxlen = sizeof(unsigned int), | ||
366 | .mode = 0644, | ||
367 | .proc_handler = proc_dointvec, | ||
368 | }, | ||
369 | { | ||
370 | .procname = "numa_balancing_scan_period_reset", | ||
371 | .data = &sysctl_numa_balancing_scan_period_reset, | ||
372 | .maxlen = sizeof(unsigned int), | ||
373 | .mode = 0644, | ||
374 | .proc_handler = proc_dointvec, | ||
375 | }, | ||
376 | { | ||
377 | .procname = "numa_balancing_scan_period_max_ms", | ||
378 | .data = &sysctl_numa_balancing_scan_period_max, | ||
379 | .maxlen = sizeof(unsigned int), | ||
380 | .mode = 0644, | ||
381 | .proc_handler = proc_dointvec, | ||
382 | }, | ||
383 | { | ||
384 | .procname = "numa_balancing_scan_size_mb", | ||
385 | .data = &sysctl_numa_balancing_scan_size, | ||
386 | .maxlen = sizeof(unsigned int), | ||
387 | .mode = 0644, | ||
388 | .proc_handler = proc_dointvec, | ||
389 | }, | ||
390 | #endif /* CONFIG_NUMA_BALANCING */ | ||
391 | #endif /* CONFIG_SCHED_DEBUG */ | ||
351 | { | 392 | { |
352 | .procname = "sched_rt_period_us", | 393 | .procname = "sched_rt_period_us", |
353 | .data = &sysctl_sched_rt_period, | 394 | .data = &sysctl_sched_rt_period, |
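Since the new entries live in kern_table (and inside the CONFIG_SCHED_DEBUG block), the knobs appear under /proc/sys/kernel/ on suitably configured kernels. For example, 'sysctl kernel.numa_balancing_scan_size_mb=256' and 'sysctl kernel.numa_balancing_scan_period_min_ms=1000' (illustrative values, not recommendations) control how much address space one scan pass may mark pte_numa and how frequently a task is allowed to start a new pass.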
diff --git a/mm/compaction.c b/mm/compaction.c index 129791218226..5ad7f4f4d6f7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -303,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
303 | if (blockpfn == end_pfn) | 303 | if (blockpfn == end_pfn) |
304 | update_pageblock_skip(cc, valid_page, total_isolated, false); | 304 | update_pageblock_skip(cc, valid_page, total_isolated, false); |
305 | 305 | ||
306 | count_vm_events(COMPACTFREE_SCANNED, nr_scanned); | ||
307 | if (total_isolated) | ||
308 | count_vm_events(COMPACTISOLATED, total_isolated); | ||
309 | |||
306 | return total_isolated; | 310 | return total_isolated; |
307 | } | 311 | } |
308 | 312 | ||
@@ -609,6 +613,10 @@ next_pageblock: | |||
609 | 613 | ||
610 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 614 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
611 | 615 | ||
616 | count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); | ||
617 | if (nr_isolated) | ||
618 | count_vm_events(COMPACTISOLATED, nr_isolated); | ||
619 | |||
612 | return low_pfn; | 620 | return low_pfn; |
613 | } | 621 | } |
614 | 622 | ||
@@ -1015,14 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1015 | nr_migrate = cc->nr_migratepages; | 1023 | nr_migrate = cc->nr_migratepages; |
1016 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 1024 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
1017 | (unsigned long)cc, false, | 1025 | (unsigned long)cc, false, |
1018 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); | 1026 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, |
1027 | MR_COMPACTION); | ||
1019 | update_nr_listpages(cc); | 1028 | update_nr_listpages(cc); |
1020 | nr_remaining = cc->nr_migratepages; | 1029 | nr_remaining = cc->nr_migratepages; |
1021 | 1030 | ||
1022 | count_vm_event(COMPACTBLOCKS); | ||
1023 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); | ||
1024 | if (nr_remaining) | ||
1025 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); | ||
1026 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | 1031 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, |
1027 | nr_remaining); | 1032 | nr_remaining); |
1028 | 1033 | ||
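The compaction hunks are one instance of a change that recurs through the rest of this diff: migrate_pages() gains a trailing reason argument (MR_COMPACTION here; MR_MEMORY_FAILURE, MR_MEMORY_HOTPLUG, MR_SYSCALL and MR_MEMPOLICY_MBIND at the other call sites below), and the old per-call COMPACTBLOCKS/COMPACTPAGES counters give way to scanned/isolated counters, so migration activity can be attributed to its cause rather than lumped together.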
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 827d9c813051..d7ee1691fd21 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/freezer.h> | 19 | #include <linux/freezer.h> |
20 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/migrate.h> | ||
22 | 23 | ||
23 | #include <asm/tlb.h> | 24 | #include <asm/tlb.h> |
24 | #include <asm/pgalloc.h> | 25 | #include <asm/pgalloc.h> |
@@ -690,7 +691,7 @@ out: | |||
690 | } | 691 | } |
691 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 692 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
692 | 693 | ||
693 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | 694 | pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
694 | { | 695 | { |
695 | if (likely(vma->vm_flags & VM_WRITE)) | 696 | if (likely(vma->vm_flags & VM_WRITE)) |
696 | pmd = pmd_mkwrite(pmd); | 697 | pmd = pmd_mkwrite(pmd); |
@@ -848,7 +849,8 @@ out: | |||
848 | * run pte_offset_map on the pmd, if an huge pmd could | 849 | * run pte_offset_map on the pmd, if an huge pmd could |
849 | * materialize from under us from a different thread. | 850 | * materialize from under us from a different thread. |
850 | */ | 851 | */ |
851 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | 852 | if (unlikely(pmd_none(*pmd)) && |
853 | unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
852 | return VM_FAULT_OOM; | 854 | return VM_FAULT_OOM; |
853 | /* if an huge pmd materialized from under us just retry later */ | 855 | /* if an huge pmd materialized from under us just retry later */ |
854 | if (unlikely(pmd_trans_huge(*pmd))) | 856 | if (unlikely(pmd_trans_huge(*pmd))) |
@@ -1287,6 +1289,81 @@ out: | |||
1287 | return page; | 1289 | return page; |
1288 | } | 1290 | } |
1289 | 1291 | ||
1292 | /* NUMA hinting page fault entry point for trans huge pmds */ | ||
1293 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
1294 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | ||
1295 | { | ||
1296 | struct page *page; | ||
1297 | unsigned long haddr = addr & HPAGE_PMD_MASK; | ||
1298 | int target_nid; | ||
1299 | int current_nid = -1; | ||
1300 | bool migrated; | ||
1301 | bool page_locked = false; | ||
1302 | |||
1303 | spin_lock(&mm->page_table_lock); | ||
1304 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
1305 | goto out_unlock; | ||
1306 | |||
1307 | page = pmd_page(pmd); | ||
1308 | get_page(page); | ||
1309 | current_nid = page_to_nid(page); | ||
1310 | count_vm_numa_event(NUMA_HINT_FAULTS); | ||
1311 | if (current_nid == numa_node_id()) | ||
1312 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | ||
1313 | |||
1314 | target_nid = mpol_misplaced(page, vma, haddr); | ||
1315 | if (target_nid == -1) { | ||
1316 | put_page(page); | ||
1317 | goto clear_pmdnuma; | ||
1318 | } | ||
1319 | |||
1320 | /* Acquire the page lock to serialise THP migrations */ | ||
1321 | spin_unlock(&mm->page_table_lock); | ||
1322 | lock_page(page); | ||
1323 | page_locked = true; | ||
1324 | |||
1325 | /* Confirm the PMD did not change while the page_table_lock was dropped */ | ||
1326 | spin_lock(&mm->page_table_lock); | ||
1327 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1328 | unlock_page(page); | ||
1329 | put_page(page); | ||
1330 | goto out_unlock; | ||
1331 | } | ||
1332 | spin_unlock(&mm->page_table_lock); | ||
1333 | |||
1334 | /* Migrate the THP to the requested node */ | ||
1335 | migrated = migrate_misplaced_transhuge_page(mm, vma, | ||
1336 | pmdp, pmd, addr, | ||
1337 | page, target_nid); | ||
1338 | if (migrated) | ||
1339 | current_nid = target_nid; | ||
1340 | else { | ||
1341 | spin_lock(&mm->page_table_lock); | ||
1342 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1343 | unlock_page(page); | ||
1344 | goto out_unlock; | ||
1345 | } | ||
1346 | goto clear_pmdnuma; | ||
1347 | } | ||
1348 | |||
1349 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | ||
1350 | return 0; | ||
1351 | |||
1352 | clear_pmdnuma: | ||
1353 | pmd = pmd_mknonnuma(pmd); | ||
1354 | set_pmd_at(mm, haddr, pmdp, pmd); | ||
1355 | VM_BUG_ON(pmd_numa(*pmdp)); | ||
1356 | update_mmu_cache_pmd(vma, addr, pmdp); | ||
1357 | if (page_locked) | ||
1358 | unlock_page(page); | ||
1359 | |||
1360 | out_unlock: | ||
1361 | spin_unlock(&mm->page_table_lock); | ||
1362 | if (current_nid != -1) | ||
1363 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | ||
1364 | return 0; | ||
1365 | } | ||
1366 | |||
1290 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1367 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1291 | pmd_t *pmd, unsigned long addr) | 1368 | pmd_t *pmd, unsigned long addr) |
1292 | { | 1369 | { |
@@ -1375,7 +1452,7 @@ out: | |||
1375 | } | 1452 | } |
1376 | 1453 | ||
1377 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1454 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1378 | unsigned long addr, pgprot_t newprot) | 1455 | unsigned long addr, pgprot_t newprot, int prot_numa) |
1379 | { | 1456 | { |
1380 | struct mm_struct *mm = vma->vm_mm; | 1457 | struct mm_struct *mm = vma->vm_mm; |
1381 | int ret = 0; | 1458 | int ret = 0; |
@@ -1383,7 +1460,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1383 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1460 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1384 | pmd_t entry; | 1461 | pmd_t entry; |
1385 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1462 | entry = pmdp_get_and_clear(mm, addr, pmd); |
1386 | entry = pmd_modify(entry, newprot); | 1463 | if (!prot_numa) |
1464 | entry = pmd_modify(entry, newprot); | ||
1465 | else { | ||
1466 | struct page *page = pmd_page(*pmd); | ||
1467 | |||
1468 | /* only check non-shared pages */ | ||
1469 | if (page_mapcount(page) == 1 && | ||
1470 | !pmd_numa(*pmd)) { | ||
1471 | entry = pmd_mknuma(entry); | ||
1472 | } | ||
1473 | } | ||
1387 | BUG_ON(pmd_write(entry)); | 1474 | BUG_ON(pmd_write(entry)); |
1388 | set_pmd_at(mm, addr, pmd, entry); | 1475 | set_pmd_at(mm, addr, pmd, entry); |
1389 | spin_unlock(&vma->vm_mm->page_table_lock); | 1476 | spin_unlock(&vma->vm_mm->page_table_lock); |
@@ -1474,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page, | |||
1474 | * We can't temporarily set the pmd to null in order | 1561 | * We can't temporarily set the pmd to null in order |
1475 | * to split it, the pmd must remain marked huge at all | 1562 | * to split it, the pmd must remain marked huge at all |
1476 | * times or the VM won't take the pmd_trans_huge paths | 1563 | * times or the VM won't take the pmd_trans_huge paths |
1477 | * and it won't wait on the anon_vma->root->mutex to | 1564 | * and it won't wait on the anon_vma->root->rwsem to |
1478 | * serialize against split_huge_page*. | 1565 | * serialize against split_huge_page*. |
1479 | */ | 1566 | */ |
1480 | pmdp_splitting_flush(vma, address, pmd); | 1567 | pmdp_splitting_flush(vma, address, pmd); |
@@ -1565,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1565 | page_tail->mapping = page->mapping; | 1652 | page_tail->mapping = page->mapping; |
1566 | 1653 | ||
1567 | page_tail->index = page->index + i; | 1654 | page_tail->index = page->index + i; |
1655 | page_xchg_last_nid(page_tail, page_last_nid(page)); | ||
1568 | 1656 | ||
1569 | BUG_ON(!PageAnon(page_tail)); | 1657 | BUG_ON(!PageAnon(page_tail)); |
1570 | BUG_ON(!PageUptodate(page_tail)); | 1658 | BUG_ON(!PageUptodate(page_tail)); |
@@ -1632,6 +1720,8 @@ static int __split_huge_page_map(struct page *page, | |||
1632 | BUG_ON(page_mapcount(page) != 1); | 1720 | BUG_ON(page_mapcount(page) != 1); |
1633 | if (!pmd_young(*pmd)) | 1721 | if (!pmd_young(*pmd)) |
1634 | entry = pte_mkold(entry); | 1722 | entry = pte_mkold(entry); |
1723 | if (pmd_numa(*pmd)) | ||
1724 | entry = pte_mknuma(entry); | ||
1635 | pte = pte_offset_map(&_pmd, haddr); | 1725 | pte = pte_offset_map(&_pmd, haddr); |
1636 | BUG_ON(!pte_none(*pte)); | 1726 | BUG_ON(!pte_none(*pte)); |
1637 | set_pte_at(mm, haddr, pte, entry); | 1727 | set_pte_at(mm, haddr, pte, entry); |
@@ -1674,7 +1764,7 @@ static int __split_huge_page_map(struct page *page, | |||
1674 | return ret; | 1764 | return ret; |
1675 | } | 1765 | } |
1676 | 1766 | ||
1677 | /* must be called with anon_vma->root->mutex hold */ | 1767 | /* must be called with anon_vma->root->rwsem held */ |
1678 | static void __split_huge_page(struct page *page, | 1768 | static void __split_huge_page(struct page *page, |
1679 | struct anon_vma *anon_vma) | 1769 | struct anon_vma *anon_vma) |
1680 | { | 1770 | { |
@@ -1729,7 +1819,7 @@ int split_huge_page(struct page *page) | |||
1729 | 1819 | ||
1730 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); | 1820 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); |
1731 | BUG_ON(!PageAnon(page)); | 1821 | BUG_ON(!PageAnon(page)); |
1732 | anon_vma = page_lock_anon_vma(page); | 1822 | anon_vma = page_lock_anon_vma_read(page); |
1733 | if (!anon_vma) | 1823 | if (!anon_vma) |
1734 | goto out; | 1824 | goto out; |
1735 | ret = 0; | 1825 | ret = 0; |
@@ -1742,7 +1832,7 @@ int split_huge_page(struct page *page) | |||
1742 | 1832 | ||
1743 | BUG_ON(PageCompound(page)); | 1833 | BUG_ON(PageCompound(page)); |
1744 | out_unlock: | 1834 | out_unlock: |
1745 | page_unlock_anon_vma(anon_vma); | 1835 | page_unlock_anon_vma_read(anon_vma); |
1746 | out: | 1836 | out: |
1747 | return ret; | 1837 | return ret; |
1748 | } | 1838 | } |
@@ -2234,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2234 | if (pmd_trans_huge(*pmd)) | 2324 | if (pmd_trans_huge(*pmd)) |
2235 | goto out; | 2325 | goto out; |
2236 | 2326 | ||
2237 | anon_vma_lock(vma->anon_vma); | 2327 | anon_vma_lock_write(vma->anon_vma); |
2238 | 2328 | ||
2239 | pte = pte_offset_map(pmd, address); | 2329 | pte = pte_offset_map(pmd, address); |
2240 | ptl = pte_lockptr(mm, pmd); | 2330 | ptl = pte_lockptr(mm, pmd); |
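The locking renames repeated through this file and the ones below (anon_vma_lock() becoming anon_vma_lock_write(), page_lock_anon_vma() becoming page_lock_anon_vma_read(), and the mutex wording in comments becoming rwsem) appear to track a companion conversion of the anon_vma root lock to an rwsem, letting read-mostly walkers such as the migration rmap walk take it shared while the split and collapse paths still take it exclusively.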
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 88e7293b96bd..e5318c7793ae 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -3016,7 +3016,7 @@ same_page: | |||
3016 | return i ? i : -EFAULT; | 3016 | return i ? i : -EFAULT; |
3017 | } | 3017 | } |
3018 | 3018 | ||
3019 | void hugetlb_change_protection(struct vm_area_struct *vma, | 3019 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
3020 | unsigned long address, unsigned long end, pgprot_t newprot) | 3020 | unsigned long address, unsigned long end, pgprot_t newprot) |
3021 | { | 3021 | { |
3022 | struct mm_struct *mm = vma->vm_mm; | 3022 | struct mm_struct *mm = vma->vm_mm; |
@@ -3024,6 +3024,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3024 | pte_t *ptep; | 3024 | pte_t *ptep; |
3025 | pte_t pte; | 3025 | pte_t pte; |
3026 | struct hstate *h = hstate_vma(vma); | 3026 | struct hstate *h = hstate_vma(vma); |
3027 | unsigned long pages = 0; | ||
3027 | 3028 | ||
3028 | BUG_ON(address >= end); | 3029 | BUG_ON(address >= end); |
3029 | flush_cache_range(vma, address, end); | 3030 | flush_cache_range(vma, address, end); |
@@ -3034,12 +3035,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3034 | ptep = huge_pte_offset(mm, address); | 3035 | ptep = huge_pte_offset(mm, address); |
3035 | if (!ptep) | 3036 | if (!ptep) |
3036 | continue; | 3037 | continue; |
3037 | if (huge_pmd_unshare(mm, &address, ptep)) | 3038 | if (huge_pmd_unshare(mm, &address, ptep)) { |
3039 | pages++; | ||
3038 | continue; | 3040 | continue; |
3041 | } | ||
3039 | if (!huge_pte_none(huge_ptep_get(ptep))) { | 3042 | if (!huge_pte_none(huge_ptep_get(ptep))) { |
3040 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 3043 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
3041 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 3044 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
3042 | set_huge_pte_at(mm, address, ptep, pte); | 3045 | set_huge_pte_at(mm, address, ptep, pte); |
3046 | pages++; | ||
3043 | } | 3047 | } |
3044 | } | 3048 | } |
3045 | spin_unlock(&mm->page_table_lock); | 3049 | spin_unlock(&mm->page_table_lock); |
@@ -3051,6 +3055,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3051 | */ | 3055 | */ |
3052 | flush_tlb_range(vma, start, end); | 3056 | flush_tlb_range(vma, start, end); |
3053 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3057 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); |
3058 | |||
3059 | return pages << h->order; | ||
3054 | } | 3060 | } |
3055 | 3061 | ||
3056 | int hugetlb_reserve_pages(struct inode *inode, | 3062 | int hugetlb_reserve_pages(struct inode *inode, |
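hugetlb_change_protection() now reports its work in base pages, scaling huge-page updates by the hstate order: with 2 MiB huge pages on x86 (order 9), three updated huge PTEs are returned as 3 << 9 = 1536 pages, presumably so callers that accumulate protection-change counts see the same units as the normal-page path.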
diff --git a/mm/internal.h b/mm/internal.h index 52d1fa957194..d597f94cc205 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -217,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
217 | { | 217 | { |
218 | if (TestClearPageMlocked(page)) { | 218 | if (TestClearPageMlocked(page)) { |
219 | unsigned long flags; | 219 | unsigned long flags; |
220 | int nr_pages = hpage_nr_pages(page); | ||
220 | 221 | ||
221 | local_irq_save(flags); | 222 | local_irq_save(flags); |
222 | __dec_zone_page_state(page, NR_MLOCK); | 223 | __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); |
223 | SetPageMlocked(newpage); | 224 | SetPageMlocked(newpage); |
224 | __inc_zone_page_state(newpage, NR_MLOCK); | 225 | __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); |
225 | local_irq_restore(flags); | 226 | local_irq_restore(flags); |
226 | } | 227 | } |
227 | } | 228 | } |
228 | 229 | ||
230 | extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); | ||
231 | |||
229 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 232 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
230 | extern unsigned long vma_address(struct page *page, | 233 | extern unsigned long vma_address(struct page *page, |
231 | struct vm_area_struct *vma); | 234 | struct vm_area_struct *vma); |
@@ -1624,7 +1624,7 @@ again: | |||
1624 | struct anon_vma_chain *vmac; | 1624 | struct anon_vma_chain *vmac; |
1625 | struct vm_area_struct *vma; | 1625 | struct vm_area_struct *vma; |
1626 | 1626 | ||
1627 | anon_vma_lock(anon_vma); | 1627 | anon_vma_lock_write(anon_vma); |
1628 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1628 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1629 | 0, ULONG_MAX) { | 1629 | 0, ULONG_MAX) { |
1630 | vma = vmac->vma; | 1630 | vma = vmac->vma; |
@@ -1678,7 +1678,7 @@ again: | |||
1678 | struct anon_vma_chain *vmac; | 1678 | struct anon_vma_chain *vmac; |
1679 | struct vm_area_struct *vma; | 1679 | struct vm_area_struct *vma; |
1680 | 1680 | ||
1681 | anon_vma_lock(anon_vma); | 1681 | anon_vma_lock_write(anon_vma); |
1682 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1682 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1683 | 0, ULONG_MAX) { | 1683 | 0, ULONG_MAX) { |
1684 | vma = vmac->vma; | 1684 | vma = vmac->vma; |
@@ -1731,7 +1731,7 @@ again: | |||
1731 | struct anon_vma_chain *vmac; | 1731 | struct anon_vma_chain *vmac; |
1732 | struct vm_area_struct *vma; | 1732 | struct vm_area_struct *vma; |
1733 | 1733 | ||
1734 | anon_vma_lock(anon_vma); | 1734 | anon_vma_lock_write(anon_vma); |
1735 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1735 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1736 | 0, ULONG_MAX) { | 1736 | 0, ULONG_MAX) { |
1737 | vma = vmac->vma; | 1737 | vma = vmac->vma; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c055929c8cc..bbfac5063ca8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -3289,15 +3289,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
3289 | struct mem_cgroup **memcgp) | 3289 | struct mem_cgroup **memcgp) |
3290 | { | 3290 | { |
3291 | struct mem_cgroup *memcg = NULL; | 3291 | struct mem_cgroup *memcg = NULL; |
3292 | unsigned int nr_pages = 1; | ||
3292 | struct page_cgroup *pc; | 3293 | struct page_cgroup *pc; |
3293 | enum charge_type ctype; | 3294 | enum charge_type ctype; |
3294 | 3295 | ||
3295 | *memcgp = NULL; | 3296 | *memcgp = NULL; |
3296 | 3297 | ||
3297 | VM_BUG_ON(PageTransHuge(page)); | ||
3298 | if (mem_cgroup_disabled()) | 3298 | if (mem_cgroup_disabled()) |
3299 | return; | 3299 | return; |
3300 | 3300 | ||
3301 | if (PageTransHuge(page)) | ||
3302 | nr_pages <<= compound_order(page); | ||
3303 | |||
3301 | pc = lookup_page_cgroup(page); | 3304 | pc = lookup_page_cgroup(page); |
3302 | lock_page_cgroup(pc); | 3305 | lock_page_cgroup(pc); |
3303 | if (PageCgroupUsed(pc)) { | 3306 | if (PageCgroupUsed(pc)) { |
@@ -3359,7 +3362,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
3359 | * charged to the res_counter since we plan on replacing the | 3362 | * charged to the res_counter since we plan on replacing the |
3360 | * old one and only one page is going to be left afterwards. | 3363 | * old one and only one page is going to be left afterwards. |
3361 | */ | 3364 | */ |
3362 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); | 3365 | __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); |
3363 | } | 3366 | } |
3364 | 3367 | ||
3365 | /* remove redundant charge if migration failed*/ | 3368 | /* remove redundant charge if migration failed*/ |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 108c52fa60f6..c6e4dd3e1c08 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
402 | struct anon_vma *av; | 402 | struct anon_vma *av; |
403 | pgoff_t pgoff; | 403 | pgoff_t pgoff; |
404 | 404 | ||
405 | av = page_lock_anon_vma(page); | 405 | av = page_lock_anon_vma_read(page); |
406 | if (av == NULL) /* Not actually mapped anymore */ | 406 | if (av == NULL) /* Not actually mapped anymore */ |
407 | return; | 407 | return; |
408 | 408 | ||
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
423 | } | 423 | } |
424 | } | 424 | } |
425 | read_unlock(&tasklist_lock); | 425 | read_unlock(&tasklist_lock); |
426 | page_unlock_anon_vma(av); | 426 | page_unlock_anon_vma_read(av); |
427 | } | 427 | } |
428 | 428 | ||
429 | /* | 429 | /* |
@@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags) | |||
1566 | page_is_file_cache(page)); | 1566 | page_is_file_cache(page)); |
1567 | list_add(&page->lru, &pagelist); | 1567 | list_add(&page->lru, &pagelist); |
1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1569 | false, MIGRATE_SYNC); | 1569 | false, MIGRATE_SYNC, |
1570 | MR_MEMORY_FAILURE); | ||
1570 | if (ret) { | 1571 | if (ret) { |
1571 | putback_lru_pages(&pagelist); | 1572 | putback_lru_pages(&pagelist); |
1572 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1573 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
diff --git a/mm/memory.c b/mm/memory.c index db2e9e797a05..e6a3b933517e 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/swapops.h> | 57 | #include <linux/swapops.h> |
58 | #include <linux/elf.h> | 58 | #include <linux/elf.h> |
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/migrate.h> | ||
60 | 61 | ||
61 | #include <asm/io.h> | 62 | #include <asm/io.h> |
62 | #include <asm/pgalloc.h> | 63 | #include <asm/pgalloc.h> |
@@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1503 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1504 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
1504 | goto out; | 1505 | goto out; |
1505 | } | 1506 | } |
1507 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | ||
1508 | goto no_page_table; | ||
1506 | if (pmd_trans_huge(*pmd)) { | 1509 | if (pmd_trans_huge(*pmd)) { |
1507 | if (flags & FOLL_SPLIT) { | 1510 | if (flags & FOLL_SPLIT) { |
1508 | split_huge_page_pmd(vma, address, pmd); | 1511 | split_huge_page_pmd(vma, address, pmd); |
@@ -1532,6 +1535,8 @@ split_fallthrough: | |||
1532 | pte = *ptep; | 1535 | pte = *ptep; |
1533 | if (!pte_present(pte)) | 1536 | if (!pte_present(pte)) |
1534 | goto no_page; | 1537 | goto no_page; |
1538 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | ||
1539 | goto no_page; | ||
1535 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 1540 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
1536 | goto unlock; | 1541 | goto unlock; |
1537 | 1542 | ||
@@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1683 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | 1688 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); |
1684 | vm_flags &= (gup_flags & FOLL_FORCE) ? | 1689 | vm_flags &= (gup_flags & FOLL_FORCE) ? |
1685 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 1690 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
1691 | |||
1692 | /* | ||
1693 | * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault | ||
1694 | * would be called on PROT_NONE ranges. We must never invoke | ||
1695 | * handle_mm_fault on PROT_NONE ranges or the NUMA hinting | ||
1696 | * page faults would unprotect the PROT_NONE ranges if | ||
1697 | * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd | ||
1698 | * bitflag. So to avoid that, don't set FOLL_NUMA if | ||
1699 | * FOLL_FORCE is set. | ||
1700 | */ | ||
1701 | if (!(gup_flags & FOLL_FORCE)) | ||
1702 | gup_flags |= FOLL_NUMA; | ||
1703 | |||
1686 | i = 0; | 1704 | i = 0; |
1687 | 1705 | ||
1688 | do { | 1706 | do { |
@@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3412 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 3430 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
3413 | } | 3431 | } |
3414 | 3432 | ||
3433 | int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | ||
3434 | unsigned long addr, int current_nid) | ||
3435 | { | ||
3436 | get_page(page); | ||
3437 | |||
3438 | count_vm_numa_event(NUMA_HINT_FAULTS); | ||
3439 | if (current_nid == numa_node_id()) | ||
3440 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | ||
3441 | |||
3442 | return mpol_misplaced(page, vma, addr); | ||
3443 | } | ||
3444 | |||
3445 | int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3446 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) | ||
3447 | { | ||
3448 | struct page *page = NULL; | ||
3449 | spinlock_t *ptl; | ||
3450 | int current_nid = -1; | ||
3451 | int target_nid; | ||
3452 | bool migrated = false; | ||
3453 | |||
3454 | /* | ||
3455 | * The "pte" at this point cannot be used safely without | ||
3456 | * validation through pte_unmap_same(). It's of NUMA type but | ||
3457 | * the pfn may be screwed if the read is non atomic. | ||
3458 | * | ||
3459 | * ptep_modify_prot_start is not called as this is clearing | ||
3460 | * the _PAGE_NUMA bit and it is not really expected that there | ||
3461 | * would be concurrent hardware modifications to the PTE. | ||
3462 | */ | ||
3463 | ptl = pte_lockptr(mm, pmd); | ||
3464 | spin_lock(ptl); | ||
3465 | if (unlikely(!pte_same(*ptep, pte))) { | ||
3466 | pte_unmap_unlock(ptep, ptl); | ||
3467 | goto out; | ||
3468 | } | ||
3469 | |||
3470 | pte = pte_mknonnuma(pte); | ||
3471 | set_pte_at(mm, addr, ptep, pte); | ||
3472 | update_mmu_cache(vma, addr, ptep); | ||
3473 | |||
3474 | page = vm_normal_page(vma, addr, pte); | ||
3475 | if (!page) { | ||
3476 | pte_unmap_unlock(ptep, ptl); | ||
3477 | return 0; | ||
3478 | } | ||
3479 | |||
3480 | current_nid = page_to_nid(page); | ||
3481 | target_nid = numa_migrate_prep(page, vma, addr, current_nid); | ||
3482 | pte_unmap_unlock(ptep, ptl); | ||
3483 | if (target_nid == -1) { | ||
3484 | /* | ||
3485 | * Account for the fault against the current node if it is not | ||
3486 | * being replaced regardless of where the page is located. | ||
3487 | */ | ||
3488 | current_nid = numa_node_id(); | ||
3489 | put_page(page); | ||
3490 | goto out; | ||
3491 | } | ||
3492 | |||
3493 | /* Migrate to the requested node */ | ||
3494 | migrated = migrate_misplaced_page(page, target_nid); | ||
3495 | if (migrated) | ||
3496 | current_nid = target_nid; | ||
3497 | |||
3498 | out: | ||
3499 | if (current_nid != -1) | ||
3500 | task_numa_fault(current_nid, 1, migrated); | ||
3501 | return 0; | ||
3502 | } | ||
3503 | |||
3504 | /* NUMA hinting page fault entry point for regular pmds */ | ||
3505 | #ifdef CONFIG_NUMA_BALANCING | ||
3506 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3507 | unsigned long addr, pmd_t *pmdp) | ||
3508 | { | ||
3509 | pmd_t pmd; | ||
3510 | pte_t *pte, *orig_pte; | ||
3511 | unsigned long _addr = addr & PMD_MASK; | ||
3512 | unsigned long offset; | ||
3513 | spinlock_t *ptl; | ||
3514 | bool numa = false; | ||
3515 | int local_nid = numa_node_id(); | ||
3516 | |||
3517 | spin_lock(&mm->page_table_lock); | ||
3518 | pmd = *pmdp; | ||
3519 | if (pmd_numa(pmd)) { | ||
3520 | set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); | ||
3521 | numa = true; | ||
3522 | } | ||
3523 | spin_unlock(&mm->page_table_lock); | ||
3524 | |||
3525 | if (!numa) | ||
3526 | return 0; | ||
3527 | |||
3528 | /* we're in a page fault so some vma must be in the range */ | ||
3529 | BUG_ON(!vma); | ||
3530 | BUG_ON(vma->vm_start >= _addr + PMD_SIZE); | ||
3531 | offset = max(_addr, vma->vm_start) & ~PMD_MASK; | ||
3532 | VM_BUG_ON(offset >= PMD_SIZE); | ||
3533 | orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); | ||
3534 | pte += offset >> PAGE_SHIFT; | ||
3535 | for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { | ||
3536 | pte_t pteval = *pte; | ||
3537 | struct page *page; | ||
3538 | int curr_nid = local_nid; | ||
3539 | int target_nid; | ||
3540 | bool migrated; | ||
3541 | if (!pte_present(pteval)) | ||
3542 | continue; | ||
3543 | if (!pte_numa(pteval)) | ||
3544 | continue; | ||
3545 | if (addr >= vma->vm_end) { | ||
3546 | vma = find_vma(mm, addr); | ||
3547 | /* there's a pte present so there must be a vma */ | ||
3548 | BUG_ON(!vma); | ||
3549 | BUG_ON(addr < vma->vm_start); | ||
3550 | } | ||
3551 | if (pte_numa(pteval)) { | ||
3552 | pteval = pte_mknonnuma(pteval); | ||
3553 | set_pte_at(mm, addr, pte, pteval); | ||
3554 | } | ||
3555 | page = vm_normal_page(vma, addr, pteval); | ||
3556 | if (unlikely(!page)) | ||
3557 | continue; | ||
3558 | /* only check non-shared pages */ | ||
3559 | if (unlikely(page_mapcount(page) != 1)) | ||
3560 | continue; | ||
3561 | |||
3562 | /* | ||
3563 | * Note that the NUMA fault is later accounted to either | ||
3564 | * the node that is currently running or where the page is | ||
3565 | * migrated to. | ||
3566 | */ | ||
3567 | curr_nid = local_nid; | ||
3568 | target_nid = numa_migrate_prep(page, vma, addr, | ||
3569 | page_to_nid(page)); | ||
3570 | if (target_nid == -1) { | ||
3571 | put_page(page); | ||
3572 | continue; | ||
3573 | } | ||
3574 | |||
3575 | /* Migrate to the requested node */ | ||
3576 | pte_unmap_unlock(pte, ptl); | ||
3577 | migrated = migrate_misplaced_page(page, target_nid); | ||
3578 | if (migrated) | ||
3579 | curr_nid = target_nid; | ||
3580 | task_numa_fault(curr_nid, 1, migrated); | ||
3581 | |||
3582 | pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); | ||
3583 | } | ||
3584 | pte_unmap_unlock(orig_pte, ptl); | ||
3585 | |||
3586 | return 0; | ||
3587 | } | ||
3588 | #else | ||
3589 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3590 | unsigned long addr, pmd_t *pmdp) | ||
3591 | { | ||
3592 | BUG(); | ||
3593 | } | ||
3594 | #endif /* CONFIG_NUMA_BALANCING */ | ||
3595 | |||
3415 | /* | 3596 | /* |
3416 | * These routines also need to handle stuff like marking pages dirty | 3597 | * These routines also need to handle stuff like marking pages dirty |
3417 | * and/or accessed for architectures that don't do it in hardware (most | 3598 | * and/or accessed for architectures that don't do it in hardware (most |
@@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm, | |||
3450 | pte, pmd, flags, entry); | 3631 | pte, pmd, flags, entry); |
3451 | } | 3632 | } |
3452 | 3633 | ||
3634 | if (pte_numa(entry)) | ||
3635 | return do_numa_page(mm, vma, address, entry, pte, pmd); | ||
3636 | |||
3453 | ptl = pte_lockptr(mm, pmd); | 3637 | ptl = pte_lockptr(mm, pmd); |
3454 | spin_lock(ptl); | 3638 | spin_lock(ptl); |
3455 | if (unlikely(!pte_same(*pte, entry))) | 3639 | if (unlikely(!pte_same(*pte, entry))) |
@@ -3520,8 +3704,11 @@ retry: | |||
3520 | if (pmd_trans_huge(orig_pmd)) { | 3704 | if (pmd_trans_huge(orig_pmd)) { |
3521 | unsigned int dirty = flags & FAULT_FLAG_WRITE; | 3705 | unsigned int dirty = flags & FAULT_FLAG_WRITE; |
3522 | 3706 | ||
3523 | if (dirty && !pmd_write(orig_pmd) && | 3707 | if (pmd_numa(orig_pmd)) |
3524 | !pmd_trans_splitting(orig_pmd)) { | 3708 | return do_huge_pmd_numa_page(mm, vma, address, |
3709 | orig_pmd, pmd); | ||
3710 | |||
3711 | if (dirty && !pmd_write(orig_pmd)) { | ||
3525 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, | 3712 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3526 | orig_pmd); | 3713 | orig_pmd); |
3527 | /* | 3714 | /* |
@@ -3536,16 +3723,21 @@ retry: | |||
3536 | huge_pmd_set_accessed(mm, vma, address, pmd, | 3723 | huge_pmd_set_accessed(mm, vma, address, pmd, |
3537 | orig_pmd, dirty); | 3724 | orig_pmd, dirty); |
3538 | } | 3725 | } |
3726 | |||
3539 | return 0; | 3727 | return 0; |
3540 | } | 3728 | } |
3541 | } | 3729 | } |
3542 | 3730 | ||
3731 | if (pmd_numa(*pmd)) | ||
3732 | return do_pmd_numa_page(mm, vma, address, pmd); | ||
3733 | |||
3543 | /* | 3734 | /* |
3544 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3735 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
3545 | * run pte_offset_map on the pmd, if an huge pmd could | 3736 | * run pte_offset_map on the pmd, if an huge pmd could |
3546 | * materialize from under us from a different thread. | 3737 | * materialize from under us from a different thread. |
3547 | */ | 3738 | */ |
3548 | if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) | 3739 | if (unlikely(pmd_none(*pmd)) && |
3740 | unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
3549 | return VM_FAULT_OOM; | 3741 | return VM_FAULT_OOM; |
3550 | /* if an huge pmd materialized from under us just retry later */ | 3742 | /* if an huge pmd materialized from under us just retry later */ |
3551 | if (unlikely(pmd_trans_huge(*pmd))) | 3743 | if (unlikely(pmd_trans_huge(*pmd))) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 518baa896e83..962e353aa86f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -1055,7 +1055,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1055 | * migrate_pages returns # of failed pages. | 1055 | * migrate_pages returns # of failed pages. |
1056 | */ | 1056 | */ |
1057 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1057 | ret = migrate_pages(&source, alloc_migrate_target, 0, |
1058 | true, MIGRATE_SYNC); | 1058 | true, MIGRATE_SYNC, |
1059 | MR_MEMORY_HOTPLUG); | ||
1059 | if (ret) | 1060 | if (ret) |
1060 | putback_lru_pages(&source); | 1061 | putback_lru_pages(&source); |
1061 | } | 1062 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index aaf54566cb6b..d1b315e98627 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -90,6 +90,7 @@ | |||
90 | #include <linux/syscalls.h> | 90 | #include <linux/syscalls.h> |
91 | #include <linux/ctype.h> | 91 | #include <linux/ctype.h> |
92 | #include <linux/mm_inline.h> | 92 | #include <linux/mm_inline.h> |
93 | #include <linux/mmu_notifier.h> | ||
93 | 94 | ||
94 | #include <asm/tlbflush.h> | 95 | #include <asm/tlbflush.h> |
95 | #include <asm/uaccess.h> | 96 | #include <asm/uaccess.h> |
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = { | |||
117 | .flags = MPOL_F_LOCAL, | 118 | .flags = MPOL_F_LOCAL, |
118 | }; | 119 | }; |
119 | 120 | ||
121 | static struct mempolicy preferred_node_policy[MAX_NUMNODES]; | ||
122 | |||
123 | static struct mempolicy *get_task_policy(struct task_struct *p) | ||
124 | { | ||
125 | struct mempolicy *pol = p->mempolicy; | ||
126 | int node; | ||
127 | |||
128 | if (!pol) { | ||
129 | node = numa_node_id(); | ||
130 | if (node != -1) | ||
131 | pol = &preferred_node_policy[node]; | ||
132 | |||
133 | /* preferred_node_policy is not initialised early in boot */ | ||
134 | if (!pol->mode) | ||
135 | pol = NULL; | ||
136 | } | ||
137 | |||
138 | return pol; | ||
139 | } | ||
140 | |||
120 | static const struct mempolicy_operations { | 141 | static const struct mempolicy_operations { |
121 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); | 142 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); |
122 | /* | 143 | /* |
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
254 | if (mode == MPOL_DEFAULT) { | 275 | if (mode == MPOL_DEFAULT) { |
255 | if (nodes && !nodes_empty(*nodes)) | 276 | if (nodes && !nodes_empty(*nodes)) |
256 | return ERR_PTR(-EINVAL); | 277 | return ERR_PTR(-EINVAL); |
257 | return NULL; /* simply delete any existing policy */ | 278 | return NULL; |
258 | } | 279 | } |
259 | VM_BUG_ON(!nodes); | 280 | VM_BUG_ON(!nodes); |
260 | 281 | ||
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
269 | (flags & MPOL_F_RELATIVE_NODES))) | 290 | (flags & MPOL_F_RELATIVE_NODES))) |
270 | return ERR_PTR(-EINVAL); | 291 | return ERR_PTR(-EINVAL); |
271 | } | 292 | } |
293 | } else if (mode == MPOL_LOCAL) { | ||
294 | if (!nodes_empty(*nodes)) | ||
295 | return ERR_PTR(-EINVAL); | ||
296 | mode = MPOL_PREFERRED; | ||
272 | } else if (nodes_empty(*nodes)) | 297 | } else if (nodes_empty(*nodes)) |
273 | return ERR_PTR(-EINVAL); | 298 | return ERR_PTR(-EINVAL); |
274 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 299 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma, | |||
561 | return 0; | 586 | return 0; |
562 | } | 587 | } |
563 | 588 | ||
589 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
590 | /* | ||
591 | * This is used to mark a range of virtual addresses as inaccessible. | ||
592 | * These are later cleared by a NUMA hinting fault. Depending on these | ||
593 | * faults, pages may be migrated for better NUMA placement. | ||
594 | * | ||
595 | * This is assuming that NUMA faults are handled using PROT_NONE. If | ||
596 | * an architecture makes a different choice, it will need further | ||
597 | * changes to the core. | ||
598 | */ | ||
599 | unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
600 | unsigned long addr, unsigned long end) | ||
601 | { | ||
602 | int nr_updated; | ||
603 | BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); | ||
604 | |||
605 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); | ||
606 | if (nr_updated) | ||
607 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); | ||
608 | |||
609 | return nr_updated; | ||
610 | } | ||
611 | #else | ||
612 | static unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
613 | unsigned long addr, unsigned long end) | ||
614 | { | ||
615 | return 0; | ||
616 | } | ||
617 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | ||
618 | |||
564 | /* | 619 | /* |
565 | * Check if all pages in a range are on a set of nodes. | 620 | * Check if all pages in a range are on a set of nodes. |
566 | * If pagelist != NULL then isolate pages from the LRU and | 621 | * If pagelist != NULL then isolate pages from the LRU and |
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
579 | return ERR_PTR(-EFAULT); | 634 | return ERR_PTR(-EFAULT); |
580 | prev = NULL; | 635 | prev = NULL; |
581 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 636 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
637 | unsigned long endvma = vma->vm_end; | ||
638 | |||
639 | if (endvma > end) | ||
640 | endvma = end; | ||
641 | if (vma->vm_start > start) | ||
642 | start = vma->vm_start; | ||
643 | |||
582 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 644 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
583 | if (!vma->vm_next && vma->vm_end < end) | 645 | if (!vma->vm_next && vma->vm_end < end) |
584 | return ERR_PTR(-EFAULT); | 646 | return ERR_PTR(-EFAULT); |
585 | if (prev && prev->vm_end < vma->vm_start) | 647 | if (prev && prev->vm_end < vma->vm_start) |
586 | return ERR_PTR(-EFAULT); | 648 | return ERR_PTR(-EFAULT); |
587 | } | 649 | } |
588 | if (!is_vm_hugetlb_page(vma) && | 650 | |
589 | ((flags & MPOL_MF_STRICT) || | 651 | if (is_vm_hugetlb_page(vma)) |
652 | goto next; | ||
653 | |||
654 | if (flags & MPOL_MF_LAZY) { | ||
655 | change_prot_numa(vma, start, endvma); | ||
656 | goto next; | ||
657 | } | ||
658 | |||
659 | if ((flags & MPOL_MF_STRICT) || | ||
590 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | 660 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && |
591 | vma_migratable(vma)))) { | 661 | vma_migratable(vma))) { |
592 | unsigned long endvma = vma->vm_end; | ||
593 | 662 | ||
594 | if (endvma > end) | ||
595 | endvma = end; | ||
596 | if (vma->vm_start > start) | ||
597 | start = vma->vm_start; | ||
598 | err = check_pgd_range(vma, start, endvma, nodes, | 663 | err = check_pgd_range(vma, start, endvma, nodes, |
599 | flags, private); | 664 | flags, private); |
600 | if (err) { | 665 | if (err) { |
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
602 | break; | 667 | break; |
603 | } | 668 | } |
604 | } | 669 | } |
670 | next: | ||
605 | prev = vma; | 671 | prev = vma; |
606 | } | 672 | } |
607 | return first; | 673 | return first; |
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
961 | 1027 | ||
962 | if (!list_empty(&pagelist)) { | 1028 | if (!list_empty(&pagelist)) { |
963 | err = migrate_pages(&pagelist, new_node_page, dest, | 1029 | err = migrate_pages(&pagelist, new_node_page, dest, |
964 | false, MIGRATE_SYNC); | 1030 | false, MIGRATE_SYNC, |
1031 | MR_SYSCALL); | ||
965 | if (err) | 1032 | if (err) |
966 | putback_lru_pages(&pagelist); | 1033 | putback_lru_pages(&pagelist); |
967 | } | 1034 | } |
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1133 | int err; | 1200 | int err; |
1134 | LIST_HEAD(pagelist); | 1201 | LIST_HEAD(pagelist); |
1135 | 1202 | ||
1136 | if (flags & ~(unsigned long)(MPOL_MF_STRICT | | 1203 | if (flags & ~(unsigned long)MPOL_MF_VALID) |
1137 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
1138 | return -EINVAL; | 1204 | return -EINVAL; |
1139 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) | 1205 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
1140 | return -EPERM; | 1206 | return -EPERM; |
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1157 | if (IS_ERR(new)) | 1223 | if (IS_ERR(new)) |
1158 | return PTR_ERR(new); | 1224 | return PTR_ERR(new); |
1159 | 1225 | ||
1226 | if (flags & MPOL_MF_LAZY) | ||
1227 | new->flags |= MPOL_F_MOF; | ||
1228 | |||
1160 | /* | 1229 | /* |
1161 | * If we are using the default policy then operation | 1230 | * If we are using the default policy then operation |
1162 | * on discontinuous address spaces is okay after all | 1231 | * on discontinuous address spaces is okay after all |
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1193 | vma = check_range(mm, start, end, nmask, | 1262 | vma = check_range(mm, start, end, nmask, |
1194 | flags | MPOL_MF_INVERT, &pagelist); | 1263 | flags | MPOL_MF_INVERT, &pagelist); |
1195 | 1264 | ||
1196 | err = PTR_ERR(vma); | 1265 | err = PTR_ERR(vma); /* maybe ... */ |
1197 | if (!IS_ERR(vma)) { | 1266 | if (!IS_ERR(vma)) |
1198 | int nr_failed = 0; | ||
1199 | |||
1200 | err = mbind_range(mm, start, end, new); | 1267 | err = mbind_range(mm, start, end, new); |
1201 | 1268 | ||
1269 | if (!err) { | ||
1270 | int nr_failed = 0; | ||
1271 | |||
1202 | if (!list_empty(&pagelist)) { | 1272 | if (!list_empty(&pagelist)) { |
1273 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | ||
1203 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1274 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1204 | (unsigned long)vma, | 1275 | (unsigned long)vma, |
1205 | false, MIGRATE_SYNC); | 1276 | false, MIGRATE_SYNC, |
1277 | MR_MEMPOLICY_MBIND); | ||
1206 | if (nr_failed) | 1278 | if (nr_failed) |
1207 | putback_lru_pages(&pagelist); | 1279 | putback_lru_pages(&pagelist); |
1208 | } | 1280 | } |
1209 | 1281 | ||
1210 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 1282 | if (nr_failed && (flags & MPOL_MF_STRICT)) |
1211 | err = -EIO; | 1283 | err = -EIO; |
1212 | } else | 1284 | } else |
1213 | putback_lru_pages(&pagelist); | 1285 | putback_lru_pages(&pagelist); |
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
1546 | struct mempolicy *get_vma_policy(struct task_struct *task, | 1618 | struct mempolicy *get_vma_policy(struct task_struct *task, |
1547 | struct vm_area_struct *vma, unsigned long addr) | 1619 | struct vm_area_struct *vma, unsigned long addr) |
1548 | { | 1620 | { |
1549 | struct mempolicy *pol = task->mempolicy; | 1621 | struct mempolicy *pol = get_task_policy(task); |
1550 | 1622 | ||
1551 | if (vma) { | 1623 | if (vma) { |
1552 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1624 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
@@ -1956,7 +2028,7 @@ retry_cpuset: | |||
1956 | */ | 2028 | */ |
1957 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) | 2029 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) |
1958 | { | 2030 | { |
1959 | struct mempolicy *pol = current->mempolicy; | 2031 | struct mempolicy *pol = get_task_policy(current); |
1960 | struct page *page; | 2032 | struct page *page; |
1961 | unsigned int cpuset_mems_cookie; | 2033 | unsigned int cpuset_mems_cookie; |
1962 | 2034 | ||
@@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n) | |||
2140 | kmem_cache_free(sn_cache, n); | 2212 | kmem_cache_free(sn_cache, n); |
2141 | } | 2213 | } |
2142 | 2214 | ||
2215 | /** | ||
2216 | * mpol_misplaced - check whether current page node is valid in policy | ||
2217 | * | ||
2218 | * @page - page to be checked | ||
2219 | * @vma - vm area where page mapped | ||
2220 | * @addr - virtual address where page mapped | ||
2221 | * | ||
2222 | * Lookup current policy node id for vma,addr and "compare to" page's | ||
2223 | * node id. | ||
2224 | * | ||
2225 | * Returns: | ||
2226 | * -1 - not misplaced, page is in the right node | ||
2227 | * node - node id where the page should be | ||
2228 | * | ||
2229 | * Policy determination "mimics" alloc_page_vma(). | ||
2230 | * Called from fault path where we know the vma and faulting address. | ||
2231 | */ | ||
2232 | int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) | ||
2233 | { | ||
2234 | struct mempolicy *pol; | ||
2235 | struct zone *zone; | ||
2236 | int curnid = page_to_nid(page); | ||
2237 | unsigned long pgoff; | ||
2238 | int polnid = -1; | ||
2239 | int ret = -1; | ||
2240 | |||
2241 | BUG_ON(!vma); | ||
2242 | |||
2243 | pol = get_vma_policy(current, vma, addr); | ||
2244 | if (!(pol->flags & MPOL_F_MOF)) | ||
2245 | goto out; | ||
2246 | |||
2247 | switch (pol->mode) { | ||
2248 | case MPOL_INTERLEAVE: | ||
2249 | BUG_ON(addr >= vma->vm_end); | ||
2250 | BUG_ON(addr < vma->vm_start); | ||
2251 | |||
2252 | pgoff = vma->vm_pgoff; | ||
2253 | pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; | ||
2254 | polnid = offset_il_node(pol, vma, pgoff); | ||
2255 | break; | ||
2256 | |||
2257 | case MPOL_PREFERRED: | ||
2258 | if (pol->flags & MPOL_F_LOCAL) | ||
2259 | polnid = numa_node_id(); | ||
2260 | else | ||
2261 | polnid = pol->v.preferred_node; | ||
2262 | break; | ||
2263 | |||
2264 | case MPOL_BIND: | ||
2265 | /* | ||
2266 | * allows binding to multiple nodes. | ||
2267 | * use current page if in policy nodemask, | ||
2268 | * else select nearest allowed node, if any. | ||
2269 | * If no allowed nodes, use current [!misplaced]. | ||
2270 | */ | ||
2271 | if (node_isset(curnid, pol->v.nodes)) | ||
2272 | goto out; | ||
2273 | (void)first_zones_zonelist( | ||
2274 | node_zonelist(numa_node_id(), GFP_HIGHUSER), | ||
2275 | gfp_zone(GFP_HIGHUSER), | ||
2276 | &pol->v.nodes, &zone); | ||
2277 | polnid = zone->node; | ||
2278 | break; | ||
2279 | |||
2280 | default: | ||
2281 | BUG(); | ||
2282 | } | ||
2283 | |||
2284 | /* Migrate the page towards the node whose CPU is referencing it */ | ||
2285 | if (pol->flags & MPOL_F_MORON) { | ||
2286 | int last_nid; | ||
2287 | |||
2288 | polnid = numa_node_id(); | ||
2289 | |||
2290 | /* | ||
2291 | * Multi-stage node selection is used in conjunction | ||
2292 | * with a periodic migration fault to build a temporal | ||
2293 | * task<->page relation. By using a two-stage filter we | ||
2294 | * remove short/unlikely relations. | ||
2295 | * | ||
2296 | * Using P(p) ~ n_p / n_t as per frequentist | ||
2297 | * probability, we can equate a task's usage of a | ||
2298 | * particular page (n_p) per total usage of this | ||
2299 | * page (n_t) (in a given time-span) to a probability. | ||
2300 | * | ||
2301 | * Our periodic faults will sample this probability and | ||
2302 | * getting the same result twice in a row, given these | ||
2303 | * samples are fully independent, is then given by | ||
2304 | * P(n)^2, provided our sample period is sufficiently | ||
2305 | * short compared to the usage pattern. | ||
2306 | * | ||
2307 | * This quadratic squishes small probabilities, making | ||
2308 | * it less likely we act on an unlikely task<->page | ||
2309 | * relation. | ||
2310 | */ | ||
2311 | last_nid = page_xchg_last_nid(page, polnid); | ||
2312 | if (last_nid != polnid) | ||
2313 | goto out; | ||
2314 | } | ||
2315 | |||
2316 | if (curnid != polnid) | ||
2317 | ret = polnid; | ||
2318 | out: | ||
2319 | mpol_cond_put(pol); | ||
2320 | |||
2321 | return ret; | ||
2322 | } | ||
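
The two-stage filter is easier to see outside the kernel. Below is a minimal user-space sketch of the same idea (the access pattern, node count and helper names are invented for illustration; this is not kernel code): a node only becomes the migration target when it faults on the page twice in a row, so a task<->page relation of probability p is acted on with probability roughly p^2.

/* Toy model of the two-stage NUMA placement filter (illustrative only). */
#include <stdio.h>
#include <stdlib.h>

static int page_last_nid = -1;	/* stands in for page_xchg_last_nid() state */

/* Return a migration target, or -1 to skip this fault (first filter stage). */
static int two_stage_filter(int faulting_nid)
{
	int last_nid = page_last_nid;

	page_last_nid = faulting_nid;			/* the "xchg" */
	return (last_nid == faulting_nid) ? faulting_nid : -1;
}

int main(void)
{
	long selected[2] = { 0, 0 };
	long faults = 200000;

	srand(1);
	for (long i = 0; i < faults; i++) {
		/* node 0 touches the page 90% of the time, node 1 only 10% */
		int nid = (rand() % 10 < 9) ? 0 : 1;
		int target = two_stage_filter(nid);

		if (target >= 0)
			selected[target]++;
	}

	/* node 1 is picked on roughly 1% of faults, node 0 on roughly 81% */
	for (int n = 0; n < 2; n++)
		printf("node %d chosen on %.1f%% of faults\n",
		       n, 100.0 * selected[n] / faults);
	return 0;
}
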
2323 | |||
2143 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | 2324 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) |
2144 | { | 2325 | { |
2145 | pr_debug("deleting %lx-%lx\n", n->start, n->end); | 2326 | pr_debug("deleting %lx-%lx\n", n->start, n->end); |
@@ -2305,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
2305 | mutex_unlock(&p->mutex); | 2486 | mutex_unlock(&p->mutex); |
2306 | } | 2487 | } |
2307 | 2488 | ||
2489 | #ifdef CONFIG_NUMA_BALANCING | ||
2490 | static bool __initdata numabalancing_override; | ||
2491 | |||
2492 | static void __init check_numabalancing_enable(void) | ||
2493 | { | ||
2494 | bool numabalancing_default = false; | ||
2495 | |||
2496 | if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) | ||
2497 | numabalancing_default = true; | ||
2498 | |||
2499 | if (nr_node_ids > 1 && !numabalancing_override) { | ||
2500 | printk(KERN_INFO "Enabling automatic NUMA balancing. " | ||
2501 | "Configure with numa_balancing= or sysctl"); | ||
2502 | set_numabalancing_state(numabalancing_default); | ||
2503 | } | ||
2504 | } | ||
2505 | |||
2506 | static int __init setup_numabalancing(char *str) | ||
2507 | { | ||
2508 | int ret = 0; | ||
2509 | if (!str) | ||
2510 | goto out; | ||
2511 | numabalancing_override = true; | ||
2512 | |||
2513 | if (!strcmp(str, "enable")) { | ||
2514 | set_numabalancing_state(true); | ||
2515 | ret = 1; | ||
2516 | } else if (!strcmp(str, "disable")) { | ||
2517 | set_numabalancing_state(false); | ||
2518 | ret = 1; | ||
2519 | } | ||
2520 | out: | ||
2521 | if (!ret) | ||
2522 | printk(KERN_WARNING "Unable to parse numa_balancing=\n"); | ||
2523 | |||
2524 | return ret; | ||
2525 | } | ||
2526 | __setup("numa_balancing=", setup_numabalancing); | ||
2527 | #else | ||
2528 | static inline void __init check_numabalancing_enable(void) | ||
2529 | { | ||
2530 | } | ||
2531 | #endif /* CONFIG_NUMA_BALANCING */ | ||
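
For what it is worth, the return convention above follows the usual __setup() rule that a non-zero return marks the boot argument as handled, which is why an unrecognised value falls through to the warning. A stand-alone sketch of the same string handling (a user-space rewrite with invented names, not kernel code):

/* Stand-alone sketch of the numa_balancing= value parsing (not kernel code). */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool numabalancing_enabled;

/* Mirrors setup_numabalancing(): returns 1 only if the value was recognised. */
static int parse_numabalancing(const char *str)
{
	if (!str)
		return 0;
	if (!strcmp(str, "enable")) {
		numabalancing_enabled = true;
		return 1;
	}
	if (!strcmp(str, "disable")) {
		numabalancing_enabled = false;
		return 1;
	}
	return 0;
}

int main(void)
{
	const char *args[] = { "enable", "disable", "bogus" };

	for (int i = 0; i < 3; i++) {
		if (parse_numabalancing(args[i]))
			printf("numa_balancing=%s -> %s\n", args[i],
			       numabalancing_enabled ? "on" : "off");
		else
			printf("Unable to parse numa_balancing=%s\n", args[i]);
	}
	return 0;
}
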
2532 | |||
2308 | /* assumes fs == KERNEL_DS */ | 2533 | /* assumes fs == KERNEL_DS */ |
2309 | void __init numa_policy_init(void) | 2534 | void __init numa_policy_init(void) |
2310 | { | 2535 | { |
@@ -2320,6 +2545,15 @@ void __init numa_policy_init(void) | |||
2320 | sizeof(struct sp_node), | 2545 | sizeof(struct sp_node), |
2321 | 0, SLAB_PANIC, NULL); | 2546 | 0, SLAB_PANIC, NULL); |
2322 | 2547 | ||
2548 | for_each_node(nid) { | ||
2549 | preferred_node_policy[nid] = (struct mempolicy) { | ||
2550 | .refcnt = ATOMIC_INIT(1), | ||
2551 | .mode = MPOL_PREFERRED, | ||
2552 | .flags = MPOL_F_MOF | MPOL_F_MORON, | ||
2553 | .v = { .preferred_node = nid, }, | ||
2554 | }; | ||
2555 | } | ||
2556 | |||
2323 | /* | 2557 | /* |
2324 | * Set interleaving policy for system init. Interleaving is only | 2558 | * Set interleaving policy for system init. Interleaving is only |
2325 | * enabled across suitably sized nodes (default is >= 16MB), or | 2559 | * enabled across suitably sized nodes (default is >= 16MB), or |
@@ -2346,6 +2580,8 @@ void __init numa_policy_init(void) | |||
2346 | 2580 | ||
2347 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) | 2581 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) |
2348 | printk("numa_policy_init: interleaving failed\n"); | 2582 | printk("numa_policy_init: interleaving failed\n"); |
2583 | |||
2584 | check_numabalancing_enable(); | ||
2349 | } | 2585 | } |
2350 | 2586 | ||
2351 | /* Reset policy of current process to default */ | 2587 | /* Reset policy of current process to default */ |
@@ -2362,14 +2598,13 @@ void numa_default_policy(void) | |||
2362 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag | 2598 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag |
2363 | * Used only for mpol_parse_str() and mpol_to_str() | 2599 | * Used only for mpol_parse_str() and mpol_to_str() |
2364 | */ | 2600 | */ |
2365 | #define MPOL_LOCAL MPOL_MAX | ||
2366 | static const char * const policy_modes[] = | 2601 | static const char * const policy_modes[] = |
2367 | { | 2602 | { |
2368 | [MPOL_DEFAULT] = "default", | 2603 | [MPOL_DEFAULT] = "default", |
2369 | [MPOL_PREFERRED] = "prefer", | 2604 | [MPOL_PREFERRED] = "prefer", |
2370 | [MPOL_BIND] = "bind", | 2605 | [MPOL_BIND] = "bind", |
2371 | [MPOL_INTERLEAVE] = "interleave", | 2606 | [MPOL_INTERLEAVE] = "interleave", |
2372 | [MPOL_LOCAL] = "local" | 2607 | [MPOL_LOCAL] = "local", |
2373 | }; | 2608 | }; |
2374 | 2609 | ||
2375 | 2610 | ||
@@ -2415,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2415 | if (flags) | 2650 | if (flags) |
2416 | *flags++ = '\0'; /* terminate mode string */ | 2651 | *flags++ = '\0'; /* terminate mode string */ |
2417 | 2652 | ||
2418 | for (mode = 0; mode <= MPOL_LOCAL; mode++) { | 2653 | for (mode = 0; mode < MPOL_MAX; mode++) { |
2419 | if (!strcmp(str, policy_modes[mode])) { | 2654 | if (!strcmp(str, policy_modes[mode])) { |
2420 | break; | 2655 | break; |
2421 | } | 2656 | } |
2422 | } | 2657 | } |
2423 | if (mode > MPOL_LOCAL) | 2658 | if (mode >= MPOL_MAX) |
2424 | goto out; | 2659 | goto out; |
2425 | 2660 | ||
2426 | switch (mode) { | 2661 | switch (mode) { |
diff --git a/mm/migrate.c b/mm/migrate.c index cae02711181d..32efd8028bc9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -39,6 +39,9 @@ | |||
39 | 39 | ||
40 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> |
41 | 41 | ||
42 | #define CREATE_TRACE_POINTS | ||
43 | #include <trace/events/migrate.h> | ||
44 | |||
42 | #include "internal.h" | 45 | #include "internal.h" |
43 | 46 | ||
44 | /* | 47 | /* |
@@ -293,7 +296,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
293 | struct page *newpage, struct page *page, | 296 | struct page *newpage, struct page *page, |
294 | struct buffer_head *head, enum migrate_mode mode) | 297 | struct buffer_head *head, enum migrate_mode mode) |
295 | { | 298 | { |
296 | int expected_count; | 299 | int expected_count = 0; |
297 | void **pslot; | 300 | void **pslot; |
298 | 301 | ||
299 | if (!mapping) { | 302 | if (!mapping) { |
@@ -421,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
421 | */ | 424 | */ |
422 | void migrate_page_copy(struct page *newpage, struct page *page) | 425 | void migrate_page_copy(struct page *newpage, struct page *page) |
423 | { | 426 | { |
424 | if (PageHuge(page)) | 427 | if (PageHuge(page) || PageTransHuge(page)) |
425 | copy_huge_page(newpage, page); | 428 | copy_huge_page(newpage, page); |
426 | else | 429 | else |
427 | copy_highpage(newpage, page); | 430 | copy_highpage(newpage, page); |
@@ -765,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
765 | */ | 768 | */ |
766 | if (PageAnon(page)) { | 769 | if (PageAnon(page)) { |
767 | /* | 770 | /* |
768 | * Only page_lock_anon_vma() understands the subtleties of | 771 | * Only page_lock_anon_vma_read() understands the subtleties of |
769 | * getting a hold on an anon_vma from outside one of its mms. | 772 | * getting a hold on an anon_vma from outside one of its mms. |
770 | */ | 773 | */ |
771 | anon_vma = page_get_anon_vma(page); | 774 | anon_vma = page_get_anon_vma(page); |
@@ -998,10 +1001,11 @@ out: | |||
998 | */ | 1001 | */ |
999 | int migrate_pages(struct list_head *from, | 1002 | int migrate_pages(struct list_head *from, |
1000 | new_page_t get_new_page, unsigned long private, bool offlining, | 1003 | new_page_t get_new_page, unsigned long private, bool offlining, |
1001 | enum migrate_mode mode) | 1004 | enum migrate_mode mode, int reason) |
1002 | { | 1005 | { |
1003 | int retry = 1; | 1006 | int retry = 1; |
1004 | int nr_failed = 0; | 1007 | int nr_failed = 0; |
1008 | int nr_succeeded = 0; | ||
1005 | int pass = 0; | 1009 | int pass = 0; |
1006 | struct page *page; | 1010 | struct page *page; |
1007 | struct page *page2; | 1011 | struct page *page2; |
@@ -1028,6 +1032,7 @@ int migrate_pages(struct list_head *from, | |||
1028 | retry++; | 1032 | retry++; |
1029 | break; | 1033 | break; |
1030 | case MIGRATEPAGE_SUCCESS: | 1034 | case MIGRATEPAGE_SUCCESS: |
1035 | nr_succeeded++; | ||
1031 | break; | 1036 | break; |
1032 | default: | 1037 | default: |
1033 | /* Permanent failure */ | 1038 | /* Permanent failure */ |
@@ -1038,6 +1043,12 @@ int migrate_pages(struct list_head *from, | |||
1038 | } | 1043 | } |
1039 | rc = nr_failed + retry; | 1044 | rc = nr_failed + retry; |
1040 | out: | 1045 | out: |
1046 | if (nr_succeeded) | ||
1047 | count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); | ||
1048 | if (nr_failed) | ||
1049 | count_vm_events(PGMIGRATE_FAIL, nr_failed); | ||
1050 | trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); | ||
1051 | |||
1041 | if (!swapwrite) | 1052 | if (!swapwrite) |
1042 | current->flags &= ~PF_SWAPWRITE; | 1053 | current->flags &= ~PF_SWAPWRITE; |
1043 | 1054 | ||
@@ -1176,7 +1187,8 @@ set_status: | |||
1176 | err = 0; | 1187 | err = 0; |
1177 | if (!list_empty(&pagelist)) { | 1188 | if (!list_empty(&pagelist)) { |
1178 | err = migrate_pages(&pagelist, new_page_node, | 1189 | err = migrate_pages(&pagelist, new_page_node, |
1179 | (unsigned long)pm, 0, MIGRATE_SYNC); | 1190 | (unsigned long)pm, 0, MIGRATE_SYNC, |
1191 | MR_SYSCALL); | ||
1180 | if (err) | 1192 | if (err) |
1181 | putback_lru_pages(&pagelist); | 1193 | putback_lru_pages(&pagelist); |
1182 | } | 1194 | } |
@@ -1440,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, | |||
1440 | } | 1452 | } |
1441 | return err; | 1453 | return err; |
1442 | } | 1454 | } |
1443 | #endif | 1455 | |
1456 | #ifdef CONFIG_NUMA_BALANCING | ||
1457 | /* | ||
1458 | * Returns true if this is a safe migration target node for misplaced NUMA | ||
1459 | * pages. Currently it only checks the watermarks which crude | ||
1460 | */ | ||
1461 | static bool migrate_balanced_pgdat(struct pglist_data *pgdat, | ||
1462 | int nr_migrate_pages) | ||
1463 | { | ||
1464 | int z; | ||
1465 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { | ||
1466 | struct zone *zone = pgdat->node_zones + z; | ||
1467 | |||
1468 | if (!populated_zone(zone)) | ||
1469 | continue; | ||
1470 | |||
1471 | if (zone->all_unreclaimable) | ||
1472 | continue; | ||
1473 | |||
1474 | /* Avoid waking kswapd by allocating nr_migrate_pages pages. */ | ||
1475 | if (!zone_watermark_ok(zone, 0, | ||
1476 | high_wmark_pages(zone) + | ||
1477 | nr_migrate_pages, | ||
1478 | 0, 0)) | ||
1479 | continue; | ||
1480 | return true; | ||
1481 | } | ||
1482 | return false; | ||
1483 | } | ||
1484 | |||
1485 | static struct page *alloc_misplaced_dst_page(struct page *page, | ||
1486 | unsigned long data, | ||
1487 | int **result) | ||
1488 | { | ||
1489 | int nid = (int) data; | ||
1490 | struct page *newpage; | ||
1491 | |||
1492 | newpage = alloc_pages_exact_node(nid, | ||
1493 | (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | | ||
1494 | __GFP_NOMEMALLOC | __GFP_NORETRY | | ||
1495 | __GFP_NOWARN) & | ||
1496 | ~GFP_IOFS, 0); | ||
1497 | if (newpage) | ||
1498 | page_xchg_last_nid(newpage, page_last_nid(page)); | ||
1499 | |||
1500 | return newpage; | ||
1501 | } | ||
1502 | |||
1503 | /* | ||
1504 | * page migration rate limiting control. | ||
1505 | * Do not migrate more than @ratelimit_pages in a @migrate_interval_millisecs | ||
1506 | * window of time. The defaults here allow at most 1280MB to be migrated per second. | ||
1507 | * If a node is rate-limited then PTE NUMA updates are also rate-limited. However, | ||
1508 | * as it is faults that reset the window, PTE updates happen unconditionally once | ||
1509 | * @pteupdate_interval_millisecs have passed since the throttle window closed | ||
1510 | * without a fault resetting it. | ||
1511 | */ | ||
1512 | static unsigned int migrate_interval_millisecs __read_mostly = 100; | ||
1513 | static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; | ||
1514 | static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); | ||
1515 | |||
1516 | /* Returns true if NUMA migration is currently rate limited */ | ||
1517 | bool migrate_ratelimited(int node) | ||
1518 | { | ||
1519 | pg_data_t *pgdat = NODE_DATA(node); | ||
1520 | |||
1521 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + | ||
1522 | msecs_to_jiffies(pteupdate_interval_millisecs))) | ||
1523 | return false; | ||
1524 | |||
1525 | if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) | ||
1526 | return false; | ||
1527 | |||
1528 | return true; | ||
1529 | } | ||
1530 | |||
1531 | /* Returns true if the node is migrate rate-limited after the update */ | ||
1532 | bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | ||
1533 | { | ||
1534 | bool rate_limited = false; | ||
1535 | |||
1536 | /* | ||
1537 | * Rate-limit the amount of data that is being migrated to a node. | ||
1538 | * Optimal placement is no good if the memory bus is saturated and | ||
1539 | * all the time is being spent migrating! | ||
1540 | */ | ||
1541 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1542 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { | ||
1543 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
1544 | pgdat->numabalancing_migrate_next_window = jiffies + | ||
1545 | msecs_to_jiffies(migrate_interval_millisecs); | ||
1546 | } | ||
1547 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) | ||
1548 | rate_limited = true; | ||
1549 | else | ||
1550 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | ||
1551 | spin_unlock(&pgdat->numabalancing_migrate_lock); | ||
1552 | |||
1553 | return rate_limited; | ||
1554 | } | ||
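
The limiter above is a fixed-window counter: the first update after the window has expired resets the budget, and once more than ratelimit_pages have been charged inside the current window, further migrations are refused until the next window opens. A stand-alone sketch of that shape (seconds instead of jiffies, invented constants, no locking, not kernel code):

/* Fixed-window rate limiter in the style of numamigrate_update_ratelimit(). */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define WINDOW_SECS	1
#define BUDGET_PAGES	32768	/* ~128MB with 4K pages, like ratelimit_pages */

static time_t window_end;
static unsigned long window_pages;

/* Returns true if the caller should skip migrating @nr_pages right now. */
static bool update_ratelimit(unsigned long nr_pages)
{
	time_t now = time(NULL);

	if (now >= window_end) {		/* window expired: reset budget */
		window_end = now + WINDOW_SECS;
		window_pages = 0;
	}
	if (window_pages > BUDGET_PAGES)	/* budget already blown */
		return true;
	window_pages += nr_pages;		/* charge this request */
	return false;
}

int main(void)
{
	unsigned long refused = 0;

	for (int i = 0; i < 100000; i++)
		if (update_ratelimit(1))
			refused++;
	printf("refused %lu of 100000 single-page migrations\n", refused);
	return 0;
}
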
1555 | |||
1556 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | ||
1557 | { | ||
1558 | int ret = 0; | ||
1559 | |||
1560 | /* Avoid migrating to a node that is nearly full */ | ||
1561 | if (migrate_balanced_pgdat(pgdat, 1)) { | ||
1562 | int page_lru; | ||
1563 | |||
1564 | if (isolate_lru_page(page)) { | ||
1565 | put_page(page); | ||
1566 | return 0; | ||
1567 | } | ||
1568 | |||
1569 | /* Page is isolated */ | ||
1570 | ret = 1; | ||
1571 | page_lru = page_is_file_cache(page); | ||
1572 | if (!PageTransHuge(page)) | ||
1573 | inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); | ||
1574 | else | ||
1575 | mod_zone_page_state(page_zone(page), | ||
1576 | NR_ISOLATED_ANON + page_lru, | ||
1577 | HPAGE_PMD_NR); | ||
1578 | } | ||
1579 | |||
1580 | /* | ||
1581 | * Page is either isolated or there is not enough space on the target | ||
1582 | * node. If isolated, then it has taken a reference count and the | ||
1583 | * caller's reference can be safely dropped without the page | ||
1584 | * disappearing underneath us during migration. Otherwise the page is | ||
1585 | * not to be migrated but the caller's reference should still be | ||
1586 | * dropped so it does not leak. | ||
1587 | */ | ||
1588 | put_page(page); | ||
1589 | |||
1590 | return ret; | ||
1591 | } | ||
1592 | |||
1593 | /* | ||
1594 | * Attempt to migrate a misplaced page to the specified destination | ||
1595 | * node. Caller is expected to have an elevated reference count on | ||
1596 | * the page that will be dropped by this function before returning. | ||
1597 | */ | ||
1598 | int migrate_misplaced_page(struct page *page, int node) | ||
1599 | { | ||
1600 | pg_data_t *pgdat = NODE_DATA(node); | ||
1601 | int isolated = 0; | ||
1602 | int nr_remaining; | ||
1603 | LIST_HEAD(migratepages); | ||
1604 | |||
1605 | /* | ||
1606 | * Don't migrate pages that are mapped in multiple processes. | ||
1607 | * TODO: Handle false sharing detection instead of this hammer | ||
1608 | */ | ||
1609 | if (page_mapcount(page) != 1) { | ||
1610 | put_page(page); | ||
1611 | goto out; | ||
1612 | } | ||
1613 | |||
1614 | /* | ||
1615 | * Rate-limit the amount of data that is being migrated to a node. | ||
1616 | * Optimal placement is no good if the memory bus is saturated and | ||
1617 | * all the time is being spent migrating! | ||
1618 | */ | ||
1619 | if (numamigrate_update_ratelimit(pgdat, 1)) { | ||
1620 | put_page(page); | ||
1621 | goto out; | ||
1622 | } | ||
1623 | |||
1624 | isolated = numamigrate_isolate_page(pgdat, page); | ||
1625 | if (!isolated) | ||
1626 | goto out; | ||
1627 | |||
1628 | list_add(&page->lru, &migratepages); | ||
1629 | nr_remaining = migrate_pages(&migratepages, | ||
1630 | alloc_misplaced_dst_page, | ||
1631 | node, false, MIGRATE_ASYNC, | ||
1632 | MR_NUMA_MISPLACED); | ||
1633 | if (nr_remaining) { | ||
1634 | putback_lru_pages(&migratepages); | ||
1635 | isolated = 0; | ||
1636 | } else | ||
1637 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | ||
1638 | BUG_ON(!list_empty(&migratepages)); | ||
1639 | out: | ||
1640 | return isolated; | ||
1641 | } | ||
1642 | #endif /* CONFIG_NUMA_BALANCING */ | ||
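
One point from the comment above is worth restating: migrate_misplaced_page() consumes the caller's page reference on every exit path, whether or not the migration goes ahead, so the caller must not drop it again. A stand-alone sketch of that ownership rule (the fake_page type and helpers are invented for illustration and only mimic the contract, not the real isolation and migration steps):

/* Illustration of the "callee always consumes the caller's reference" rule. */
#include <stdio.h>

struct fake_page {
	int refcount;
};

static void get_page(struct fake_page *p) { p->refcount++; }
static void put_page(struct fake_page *p) { p->refcount--; }

/*
 * Mirrors the contract of migrate_misplaced_page(): every exit path drops
 * the reference the caller handed in, so the caller never put_page()s
 * afterwards regardless of the return value.
 */
static int try_migrate(struct fake_page *p, int mapcount, int node_has_room)
{
	if (mapcount != 1) {		/* shared page: refuse, but still drop */
		put_page(p);
		return 0;
	}
	if (!node_has_room) {		/* target full: refuse, but still drop */
		put_page(p);
		return 0;
	}
	/* "migrate" ... */
	put_page(p);
	return 1;
}

int main(void)
{
	struct fake_page page = { .refcount = 1 };
	int migrated;

	get_page(&page);		/* reference handed to try_migrate() */
	migrated = try_migrate(&page, 1, 1);
	printf("migrated=%d refcount=%d\n", migrated, page.refcount);

	get_page(&page);		/* shared page: refused, ref still consumed */
	migrated = try_migrate(&page, 2, 1);
	printf("migrated=%d refcount=%d\n", migrated, page.refcount);
	return 0;
}
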
1643 | |||
1644 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
1645 | int migrate_misplaced_transhuge_page(struct mm_struct *mm, | ||
1646 | struct vm_area_struct *vma, | ||
1647 | pmd_t *pmd, pmd_t entry, | ||
1648 | unsigned long address, | ||
1649 | struct page *page, int node) | ||
1650 | { | ||
1651 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
1652 | pg_data_t *pgdat = NODE_DATA(node); | ||
1653 | int isolated = 0; | ||
1654 | struct page *new_page = NULL; | ||
1655 | struct mem_cgroup *memcg = NULL; | ||
1656 | int page_lru = page_is_file_cache(page); | ||
1657 | |||
1658 | /* | ||
1659 | * Don't migrate pages that are mapped in multiple processes. | ||
1660 | * TODO: Handle false sharing detection instead of this hammer | ||
1661 | */ | ||
1662 | if (page_mapcount(page) != 1) | ||
1663 | goto out_dropref; | ||
1664 | |||
1665 | /* | ||
1666 | * Rate-limit the amount of data that is being migrated to a node. | ||
1667 | * Optimal placement is no good if the memory bus is saturated and | ||
1668 | * all the time is being spent migrating! | ||
1669 | */ | ||
1670 | if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) | ||
1671 | goto out_dropref; | ||
1672 | |||
1673 | new_page = alloc_pages_node(node, | ||
1674 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); | ||
1675 | if (!new_page) { | ||
1676 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1677 | goto out_dropref; | ||
1678 | } | ||
1679 | page_xchg_last_nid(new_page, page_last_nid(page)); | ||
1680 | |||
1681 | isolated = numamigrate_isolate_page(pgdat, page); | ||
1682 | if (!isolated) { | ||
1683 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1684 | put_page(new_page); | ||
1685 | goto out_keep_locked; | ||
1686 | } | ||
1687 | |||
1688 | /* Prepare a page as a migration target */ | ||
1689 | __set_page_locked(new_page); | ||
1690 | SetPageSwapBacked(new_page); | ||
1691 | |||
1692 | /* anon mapping, we can simply copy page->mapping to the new page: */ | ||
1693 | new_page->mapping = page->mapping; | ||
1694 | new_page->index = page->index; | ||
1695 | migrate_page_copy(new_page, page); | ||
1696 | WARN_ON(PageLRU(new_page)); | ||
1697 | |||
1698 | /* Recheck the target PMD */ | ||
1699 | spin_lock(&mm->page_table_lock); | ||
1700 | if (unlikely(!pmd_same(*pmd, entry))) { | ||
1701 | spin_unlock(&mm->page_table_lock); | ||
1702 | |||
1703 | /* Reverse changes made by migrate_page_copy() */ | ||
1704 | if (TestClearPageActive(new_page)) | ||
1705 | SetPageActive(page); | ||
1706 | if (TestClearPageUnevictable(new_page)) | ||
1707 | SetPageUnevictable(page); | ||
1708 | mlock_migrate_page(page, new_page); | ||
1709 | |||
1710 | unlock_page(new_page); | ||
1711 | put_page(new_page); /* Free it */ | ||
1712 | |||
1713 | unlock_page(page); | ||
1714 | putback_lru_page(page); | ||
1715 | |||
1716 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1717 | goto out; | ||
1718 | } | ||
1719 | |||
1720 | /* | ||
1721 | * Traditional migration needs to prepare the memcg charge | ||
1722 | * transaction early to prevent the old page from being | ||
1723 | * uncharged when installing migration entries. Here we can | ||
1724 | * save the potential rollback and start the charge transfer | ||
1725 | * only when migration is already known to end successfully. | ||
1726 | */ | ||
1727 | mem_cgroup_prepare_migration(page, new_page, &memcg); | ||
1728 | |||
1729 | entry = mk_pmd(new_page, vma->vm_page_prot); | ||
1730 | entry = pmd_mknonnuma(entry); | ||
1731 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
1732 | entry = pmd_mkhuge(entry); | ||
1733 | |||
1734 | page_add_new_anon_rmap(new_page, vma, haddr); | ||
1735 | |||
1736 | set_pmd_at(mm, haddr, pmd, entry); | ||
1737 | update_mmu_cache_pmd(vma, address, entry); | ||
1738 | page_remove_rmap(page); | ||
1739 | /* | ||
1740 | * Finish the charge transaction under the page table lock to | ||
1741 | * prevent split_huge_page() from dividing up the charge | ||
1742 | * before it's fully transferred to the new page. | ||
1743 | */ | ||
1744 | mem_cgroup_end_migration(memcg, page, new_page, true); | ||
1745 | spin_unlock(&mm->page_table_lock); | ||
1746 | |||
1747 | unlock_page(new_page); | ||
1748 | unlock_page(page); | ||
1749 | put_page(page); /* Drop the rmap reference */ | ||
1750 | put_page(page); /* Drop the LRU isolation reference */ | ||
1751 | |||
1752 | count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); | ||
1753 | count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); | ||
1754 | |||
1755 | out: | ||
1756 | mod_zone_page_state(page_zone(page), | ||
1757 | NR_ISOLATED_ANON + page_lru, | ||
1758 | -HPAGE_PMD_NR); | ||
1759 | return isolated; | ||
1760 | |||
1761 | out_dropref: | ||
1762 | put_page(page); | ||
1763 | out_keep_locked: | ||
1764 | return 0; | ||
1765 | } | ||
1766 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1767 | |||
1768 | #endif /* CONFIG_NUMA */ | ||
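
The huge-page path above is an optimistic prepare/recheck/commit sequence: the copy is made without the page table lock held, the lock is then retaken and pmd_same() verifies nothing changed underneath, and on a mismatch all the preparation is rolled back. The same pattern in a stand-alone sketch (a mutex and a generation counter standing in for the page table lock and the PMD check; all names invented, not kernel code):

/* Optimistic prepare-copy-recheck-commit, as in the THP migration path. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static char current_data[64] = "old contents";
static unsigned long generation;	/* bumped whenever current_data changes */

static bool migrate_copy(void)
{
	char snapshot[64];
	char new_copy[64];
	unsigned long snap_gen;

	/* Expensive work (the copy) is done without the lock held. */
	pthread_mutex_lock(&lock);
	memcpy(snapshot, current_data, sizeof(snapshot));
	snap_gen = generation;
	pthread_mutex_unlock(&lock);

	snprintf(new_copy, sizeof(new_copy), "migrated: %s", snapshot);

	/* Recheck under the lock, like the pmd_same() test. */
	pthread_mutex_lock(&lock);
	if (generation != snap_gen) {
		pthread_mutex_unlock(&lock);
		return false;		/* somebody changed it: roll back */
	}
	memcpy(current_data, new_copy, sizeof(current_data));
	generation++;
	pthread_mutex_unlock(&lock);
	return true;
}

int main(void)
{
	bool committed = migrate_copy();

	printf("commit=%d data=\"%s\"\n", committed, current_data);
	return 0;
}
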
@@ -736,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
736 | if (anon_vma) { | 736 | if (anon_vma) { |
737 | VM_BUG_ON(adjust_next && next->anon_vma && | 737 | VM_BUG_ON(adjust_next && next->anon_vma && |
738 | anon_vma != next->anon_vma); | 738 | anon_vma != next->anon_vma); |
739 | anon_vma_lock(anon_vma); | 739 | anon_vma_lock_write(anon_vma); |
740 | anon_vma_interval_tree_pre_update_vma(vma); | 740 | anon_vma_interval_tree_pre_update_vma(vma); |
741 | if (adjust_next) | 741 | if (adjust_next) |
742 | anon_vma_interval_tree_pre_update_vma(next); | 742 | anon_vma_interval_tree_pre_update_vma(next); |
@@ -2886,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | |||
2886 | * The LSB of head.next can't change from under us | 2886 | * The LSB of head.next can't change from under us |
2887 | * because we hold the mm_all_locks_mutex. | 2887 | * because we hold the mm_all_locks_mutex. |
2888 | */ | 2888 | */ |
2889 | mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); | 2889 | down_write(&anon_vma->root->rwsem); |
2890 | /* | 2890 | /* |
2891 | * We can safely modify head.next after taking the | 2891 | * We can safely modify head.next after taking the |
2892 | * anon_vma->root->mutex. If some other vma in this mm shares | 2892 | * anon_vma->root->rwsem. If some other vma in this mm shares |
2893 | * the same anon_vma we won't take it again. | 2893 | * the same anon_vma we won't take it again. |
2894 | * | 2894 | * |
2895 | * No need of atomic instructions here, head.next | 2895 | * No need of atomic instructions here, head.next |
2896 | * can't change from under us thanks to the | 2896 | * can't change from under us thanks to the |
2897 | * anon_vma->root->mutex. | 2897 | * anon_vma->root->rwsem. |
2898 | */ | 2898 | */ |
2899 | if (__test_and_set_bit(0, (unsigned long *) | 2899 | if (__test_and_set_bit(0, (unsigned long *) |
2900 | &anon_vma->root->rb_root.rb_node)) | 2900 | &anon_vma->root->rb_root.rb_node)) |
@@ -2996,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
2996 | * | 2996 | * |
2997 | * No need of atomic instructions here, head.next | 2997 | * No need of atomic instructions here, head.next |
2998 | * can't change from under us until we release the | 2998 | * can't change from under us until we release the |
2999 | * anon_vma->root->mutex. | 2999 | * anon_vma->root->rwsem. |
3000 | */ | 3000 | */ |
3001 | if (!__test_and_clear_bit(0, (unsigned long *) | 3001 | if (!__test_and_clear_bit(0, (unsigned long *) |
3002 | &anon_vma->root->rb_root.rb_node)) | 3002 | &anon_vma->root->rb_root.rb_node)) |
diff --git a/mm/mprotect.c b/mm/mprotect.c index e8c3938db6fa..3dca970367db 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
35 | } | 35 | } |
36 | #endif | 36 | #endif |
37 | 37 | ||
38 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | 38 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
39 | unsigned long addr, unsigned long end, pgprot_t newprot, | 39 | unsigned long addr, unsigned long end, pgprot_t newprot, |
40 | int dirty_accountable) | 40 | int dirty_accountable, int prot_numa, bool *ret_all_same_node) |
41 | { | 41 | { |
42 | struct mm_struct *mm = vma->vm_mm; | ||
42 | pte_t *pte, oldpte; | 43 | pte_t *pte, oldpte; |
43 | spinlock_t *ptl; | 44 | spinlock_t *ptl; |
45 | unsigned long pages = 0; | ||
46 | bool all_same_node = true; | ||
47 | int last_nid = -1; | ||
44 | 48 | ||
45 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 49 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
46 | arch_enter_lazy_mmu_mode(); | 50 | arch_enter_lazy_mmu_mode(); |
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
48 | oldpte = *pte; | 52 | oldpte = *pte; |
49 | if (pte_present(oldpte)) { | 53 | if (pte_present(oldpte)) { |
50 | pte_t ptent; | 54 | pte_t ptent; |
55 | bool updated = false; | ||
51 | 56 | ||
52 | ptent = ptep_modify_prot_start(mm, addr, pte); | 57 | ptent = ptep_modify_prot_start(mm, addr, pte); |
53 | ptent = pte_modify(ptent, newprot); | 58 | if (!prot_numa) { |
59 | ptent = pte_modify(ptent, newprot); | ||
60 | updated = true; | ||
61 | } else { | ||
62 | struct page *page; | ||
63 | |||
64 | page = vm_normal_page(vma, addr, oldpte); | ||
65 | if (page) { | ||
66 | int this_nid = page_to_nid(page); | ||
67 | if (last_nid == -1) | ||
68 | last_nid = this_nid; | ||
69 | if (last_nid != this_nid) | ||
70 | all_same_node = false; | ||
71 | |||
72 | /* only check non-shared pages */ | ||
73 | if (!pte_numa(oldpte) && | ||
74 | page_mapcount(page) == 1) { | ||
75 | ptent = pte_mknuma(ptent); | ||
76 | updated = true; | ||
77 | } | ||
78 | } | ||
79 | } | ||
54 | 80 | ||
55 | /* | 81 | /* |
56 | * Avoid taking write faults for pages we know to be | 82 | * Avoid taking write faults for pages we know to be |
57 | * dirty. | 83 | * dirty. |
58 | */ | 84 | */ |
59 | if (dirty_accountable && pte_dirty(ptent)) | 85 | if (dirty_accountable && pte_dirty(ptent)) { |
60 | ptent = pte_mkwrite(ptent); | 86 | ptent = pte_mkwrite(ptent); |
87 | updated = true; | ||
88 | } | ||
61 | 89 | ||
90 | if (updated) | ||
91 | pages++; | ||
62 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 92 | ptep_modify_prot_commit(mm, addr, pte, ptent); |
63 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { | 93 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { |
64 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 94 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
@@ -72,18 +102,40 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
72 | set_pte_at(mm, addr, pte, | 102 | set_pte_at(mm, addr, pte, |
73 | swp_entry_to_pte(entry)); | 103 | swp_entry_to_pte(entry)); |
74 | } | 104 | } |
105 | pages++; | ||
75 | } | 106 | } |
76 | } while (pte++, addr += PAGE_SIZE, addr != end); | 107 | } while (pte++, addr += PAGE_SIZE, addr != end); |
77 | arch_leave_lazy_mmu_mode(); | 108 | arch_leave_lazy_mmu_mode(); |
78 | pte_unmap_unlock(pte - 1, ptl); | 109 | pte_unmap_unlock(pte - 1, ptl); |
110 | |||
111 | *ret_all_same_node = all_same_node; | ||
112 | return pages; | ||
79 | } | 113 | } |
80 | 114 | ||
81 | static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 115 | #ifdef CONFIG_NUMA_BALANCING |
116 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
117 | pmd_t *pmd) | ||
118 | { | ||
119 | spin_lock(&mm->page_table_lock); | ||
120 | set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); | ||
121 | spin_unlock(&mm->page_table_lock); | ||
122 | } | ||
123 | #else | ||
124 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
125 | pmd_t *pmd) | ||
126 | { | ||
127 | BUG(); | ||
128 | } | ||
129 | #endif /* CONFIG_NUMA_BALANCING */ | ||
130 | |||
131 | static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | ||
82 | unsigned long addr, unsigned long end, pgprot_t newprot, | 132 | unsigned long addr, unsigned long end, pgprot_t newprot, |
83 | int dirty_accountable) | 133 | int dirty_accountable, int prot_numa) |
84 | { | 134 | { |
85 | pmd_t *pmd; | 135 | pmd_t *pmd; |
86 | unsigned long next; | 136 | unsigned long next; |
137 | unsigned long pages = 0; | ||
138 | bool all_same_node; | ||
87 | 139 | ||
88 | pmd = pmd_offset(pud, addr); | 140 | pmd = pmd_offset(pud, addr); |
89 | do { | 141 | do { |
@@ -91,42 +143,59 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
91 | if (pmd_trans_huge(*pmd)) { | 143 | if (pmd_trans_huge(*pmd)) { |
92 | if (next - addr != HPAGE_PMD_SIZE) | 144 | if (next - addr != HPAGE_PMD_SIZE) |
93 | split_huge_page_pmd(vma, addr, pmd); | 145 | split_huge_page_pmd(vma, addr, pmd); |
94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) | 146 | else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { |
147 | pages += HPAGE_PMD_NR; | ||
95 | continue; | 148 | continue; |
149 | } | ||
96 | /* fall through */ | 150 | /* fall through */ |
97 | } | 151 | } |
98 | if (pmd_none_or_clear_bad(pmd)) | 152 | if (pmd_none_or_clear_bad(pmd)) |
99 | continue; | 153 | continue; |
100 | change_pte_range(vma->vm_mm, pmd, addr, next, newprot, | 154 | pages += change_pte_range(vma, pmd, addr, next, newprot, |
101 | dirty_accountable); | 155 | dirty_accountable, prot_numa, &all_same_node); |
156 | |||
157 | /* | ||
158 | * If we are changing protections for NUMA hinting faults then | ||
159 | * set pmd_numa if the examined pages were all on the same | ||
160 | * node. This allows a regular PMD to be handled as one fault | ||
161 | * and effectively batches the taking of the PTL | ||
162 | */ | ||
163 | if (prot_numa && all_same_node) | ||
164 | change_pmd_protnuma(vma->vm_mm, addr, pmd); | ||
102 | } while (pmd++, addr = next, addr != end); | 165 | } while (pmd++, addr = next, addr != end); |
166 | |||
167 | return pages; | ||
103 | } | 168 | } |
104 | 169 | ||
105 | static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 170 | static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
106 | unsigned long addr, unsigned long end, pgprot_t newprot, | 171 | unsigned long addr, unsigned long end, pgprot_t newprot, |
107 | int dirty_accountable) | 172 | int dirty_accountable, int prot_numa) |
108 | { | 173 | { |
109 | pud_t *pud; | 174 | pud_t *pud; |
110 | unsigned long next; | 175 | unsigned long next; |
176 | unsigned long pages = 0; | ||
111 | 177 | ||
112 | pud = pud_offset(pgd, addr); | 178 | pud = pud_offset(pgd, addr); |
113 | do { | 179 | do { |
114 | next = pud_addr_end(addr, end); | 180 | next = pud_addr_end(addr, end); |
115 | if (pud_none_or_clear_bad(pud)) | 181 | if (pud_none_or_clear_bad(pud)) |
116 | continue; | 182 | continue; |
117 | change_pmd_range(vma, pud, addr, next, newprot, | 183 | pages += change_pmd_range(vma, pud, addr, next, newprot, |
118 | dirty_accountable); | 184 | dirty_accountable, prot_numa); |
119 | } while (pud++, addr = next, addr != end); | 185 | } while (pud++, addr = next, addr != end); |
186 | |||
187 | return pages; | ||
120 | } | 188 | } |
121 | 189 | ||
122 | static void change_protection(struct vm_area_struct *vma, | 190 | static unsigned long change_protection_range(struct vm_area_struct *vma, |
123 | unsigned long addr, unsigned long end, pgprot_t newprot, | 191 | unsigned long addr, unsigned long end, pgprot_t newprot, |
124 | int dirty_accountable) | 192 | int dirty_accountable, int prot_numa) |
125 | { | 193 | { |
126 | struct mm_struct *mm = vma->vm_mm; | 194 | struct mm_struct *mm = vma->vm_mm; |
127 | pgd_t *pgd; | 195 | pgd_t *pgd; |
128 | unsigned long next; | 196 | unsigned long next; |
129 | unsigned long start = addr; | 197 | unsigned long start = addr; |
198 | unsigned long pages = 0; | ||
130 | 199 | ||
131 | BUG_ON(addr >= end); | 200 | BUG_ON(addr >= end); |
132 | pgd = pgd_offset(mm, addr); | 201 | pgd = pgd_offset(mm, addr); |
@@ -135,10 +204,32 @@ static void change_protection(struct vm_area_struct *vma, | |||
135 | next = pgd_addr_end(addr, end); | 204 | next = pgd_addr_end(addr, end); |
136 | if (pgd_none_or_clear_bad(pgd)) | 205 | if (pgd_none_or_clear_bad(pgd)) |
137 | continue; | 206 | continue; |
138 | change_pud_range(vma, pgd, addr, next, newprot, | 207 | pages += change_pud_range(vma, pgd, addr, next, newprot, |
139 | dirty_accountable); | 208 | dirty_accountable, prot_numa); |
140 | } while (pgd++, addr = next, addr != end); | 209 | } while (pgd++, addr = next, addr != end); |
141 | flush_tlb_range(vma, start, end); | 210 | |
211 | /* Only flush the TLB if we actually modified any entries: */ | ||
212 | if (pages) | ||
213 | flush_tlb_range(vma, start, end); | ||
214 | |||
215 | return pages; | ||
216 | } | ||
217 | |||
218 | unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, | ||
219 | unsigned long end, pgprot_t newprot, | ||
220 | int dirty_accountable, int prot_numa) | ||
221 | { | ||
222 | struct mm_struct *mm = vma->vm_mm; | ||
223 | unsigned long pages; | ||
224 | |||
225 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
226 | if (is_vm_hugetlb_page(vma)) | ||
227 | pages = hugetlb_change_protection(vma, start, end, newprot); | ||
228 | else | ||
229 | pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); | ||
230 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
231 | |||
232 | return pages; | ||
142 | } | 233 | } |
143 | 234 | ||
144 | int | 235 | int |
@@ -213,12 +304,8 @@ success: | |||
213 | dirty_accountable = 1; | 304 | dirty_accountable = 1; |
214 | } | 305 | } |
215 | 306 | ||
216 | mmu_notifier_invalidate_range_start(mm, start, end); | 307 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); |
217 | if (is_vm_hugetlb_page(vma)) | 308 | |
218 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); | ||
219 | else | ||
220 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); | ||
221 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
222 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 309 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
223 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 310 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
224 | perf_event_mmap(vma); | 311 | perf_event_mmap(vma); |
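
The reworked walkers above share one shape: count how many entries were actually modified so the final TLB flush can be skipped when nothing changed, and, in the prot_numa case, remember whether every examined page sat on the same node so the covering PMD can be marked in one step. A stand-alone sketch of that shape (flat arrays standing in for page tables, all names invented, not kernel code):

/* Sketch of "count what changed, batch when uniform" from change_pte_range(). */
#include <stdbool.h>
#include <stdio.h>

#define NR_ENTRIES 8

struct fake_pte {
	int nid;		/* node the backing page lives on */
	bool numa_hinting;	/* stands in for the _PAGE_NUMA bit */
};

/* Returns how many entries were updated; reports whether all shared a node. */
static unsigned long mark_range(struct fake_pte *pte, int n, bool *all_same_node)
{
	unsigned long pages = 0;
	int last_nid = -1;

	*all_same_node = true;
	for (int i = 0; i < n; i++) {
		if (last_nid == -1)
			last_nid = pte[i].nid;
		if (pte[i].nid != last_nid)
			*all_same_node = false;
		if (!pte[i].numa_hinting) {	/* only touch unmarked entries */
			pte[i].numa_hinting = true;
			pages++;
		}
	}
	return pages;
}

int main(void)
{
	struct fake_pte ptes[NR_ENTRIES] = { 0 };	/* all on node 0, unmarked */
	bool all_same_node;
	unsigned long pages;

	pages = mark_range(ptes, NR_ENTRIES, &all_same_node);
	if (pages)			/* only "flush" if something changed */
		printf("flush: %lu entries updated\n", pages);
	if (all_same_node)		/* batch the covering "PMD" in one go */
		printf("all on one node: mark the covering entry too\n");
	return 0;
}
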
diff --git a/mm/mremap.c b/mm/mremap.c index eabb24da6c9e..e1031e1f6a61 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
104 | } | 104 | } |
105 | if (vma->anon_vma) { | 105 | if (vma->anon_vma) { |
106 | anon_vma = vma->anon_vma; | 106 | anon_vma = vma->anon_vma; |
107 | anon_vma_lock(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | } | 108 | } |
109 | } | 109 | } |
110 | 110 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83637dfba110..d037c8bc1512 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -611,6 +611,7 @@ static inline int free_pages_check(struct page *page) | |||
611 | bad_page(page); | 611 | bad_page(page); |
612 | return 1; | 612 | return 1; |
613 | } | 613 | } |
614 | reset_page_last_nid(page); | ||
614 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 615 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
615 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 616 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
616 | return 0; | 617 | return 0; |
@@ -3883,6 +3884,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
3883 | mminit_verify_page_links(page, zone, nid, pfn); | 3884 | mminit_verify_page_links(page, zone, nid, pfn); |
3884 | init_page_count(page); | 3885 | init_page_count(page); |
3885 | reset_page_mapcount(page); | 3886 | reset_page_mapcount(page); |
3887 | reset_page_last_nid(page); | ||
3886 | SetPageReserved(page); | 3888 | SetPageReserved(page); |
3887 | /* | 3889 | /* |
3888 | * Mark the block movable so that blocks are reserved for | 3890 | * Mark the block movable so that blocks are reserved for |
@@ -4526,6 +4528,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4526 | int ret; | 4528 | int ret; |
4527 | 4529 | ||
4528 | pgdat_resize_init(pgdat); | 4530 | pgdat_resize_init(pgdat); |
4531 | #ifdef CONFIG_NUMA_BALANCING | ||
4532 | spin_lock_init(&pgdat->numabalancing_migrate_lock); | ||
4533 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
4534 | pgdat->numabalancing_migrate_next_window = jiffies; | ||
4535 | #endif | ||
4529 | init_waitqueue_head(&pgdat->kswapd_wait); | 4536 | init_waitqueue_head(&pgdat->kswapd_wait); |
4530 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 4537 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4531 | pgdat_page_cgroup_init(pgdat); | 4538 | pgdat_page_cgroup_init(pgdat); |
@@ -5800,7 +5807,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5800 | 5807 | ||
5801 | ret = migrate_pages(&cc->migratepages, | 5808 | ret = migrate_pages(&cc->migratepages, |
5802 | alloc_migrate_target, | 5809 | alloc_migrate_target, |
5803 | 0, false, MIGRATE_SYNC); | 5810 | 0, false, MIGRATE_SYNC, |
5811 | MR_CMA); | ||
5804 | } | 5812 | } |
5805 | 5813 | ||
5806 | putback_movable_pages(&cc->migratepages); | 5814 | putback_movable_pages(&cc->migratepages); |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e642627da6b7..0c8323fe6c8f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -12,8 +12,8 @@ | |||
12 | 12 | ||
13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | 13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS |
14 | /* | 14 | /* |
15 | * Only sets the access flags (dirty, accessed, and | 15 | * Only sets the access flags (dirty, accessed), as well as write |
16 | * writable). Furthermore, we know it always gets set to a "more | 16 | * permission. Furthermore, we know it always gets set to a "more |
17 | * permissive" setting, which allows most architectures to optimize | 17 | * permissive" setting, which allows most architectures to optimize |
18 | * this. We return whether the PTE actually changed, which in turn | 18 | * this. We return whether the PTE actually changed, which in turn |
19 | * instructs the caller to do things like update__mmu_cache. This | 19 | * instructs the caller to do things like update__mmu_cache. This |
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, | |||
27 | int changed = !pte_same(*ptep, entry); | 27 | int changed = !pte_same(*ptep, entry); |
28 | if (changed) { | 28 | if (changed) { |
29 | set_pte_at(vma->vm_mm, address, ptep, entry); | 29 | set_pte_at(vma->vm_mm, address, ptep, entry); |
30 | flush_tlb_page(vma, address); | 30 | flush_tlb_fix_spurious_fault(vma, address); |
31 | } | 31 | } |
32 | return changed; | 32 | return changed; |
33 | } | 33 | } |
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, | |||
88 | { | 88 | { |
89 | pte_t pte; | 89 | pte_t pte; |
90 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); | 90 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); |
91 | flush_tlb_page(vma, address); | 91 | if (pte_accessible(pte)) |
92 | flush_tlb_page(vma, address); | ||
92 | return pte; | 93 | return pte; |
93 | } | 94 | } |
94 | #endif | 95 | #endif |
@@ -24,7 +24,7 @@ | |||
24 | * mm->mmap_sem | 24 | * mm->mmap_sem |
25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
26 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_mutex |
27 | * anon_vma->mutex | 27 | * anon_vma->rwsem |
28 | * mm->page_table_lock or pte_lock | 28 | * mm->page_table_lock or pte_lock |
29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
30 | * swap_lock (in swap_duplicate, swap_info_get) | 30 | * swap_lock (in swap_duplicate, swap_info_get) |
@@ -37,7 +37,7 @@ | |||
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within bdi.wb->list_lock in __sync_single_inode) | 38 | * within bdi.wb->list_lock in __sync_single_inode) |
39 | * | 39 | * |
40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) | 40 | * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) |
41 | * ->tasklist_lock | 41 | * ->tasklist_lock |
42 | * pte map lock | 42 | * pte map lock |
43 | */ | 43 | */ |
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); | 87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); |
88 | 88 | ||
89 | /* | 89 | /* |
90 | * Synchronize against page_lock_anon_vma() such that | 90 | * Synchronize against page_lock_anon_vma_read() such that |
91 | * we can safely hold the lock without the anon_vma getting | 91 | * we can safely hold the lock without the anon_vma getting |
92 | * freed. | 92 | * freed. |
93 | * | 93 | * |
94 | * Relies on the full mb implied by the atomic_dec_and_test() from | 94 | * Relies on the full mb implied by the atomic_dec_and_test() from |
95 | * put_anon_vma() against the acquire barrier implied by | 95 | * put_anon_vma() against the acquire barrier implied by |
96 | * mutex_trylock() from page_lock_anon_vma(). This orders: | 96 | * down_read_trylock() from page_lock_anon_vma_read(). This orders: |
97 | * | 97 | * |
98 | * page_lock_anon_vma() VS put_anon_vma() | 98 | * page_lock_anon_vma_read() VS put_anon_vma() |
99 | * mutex_trylock() atomic_dec_and_test() | 99 | * down_read_trylock() atomic_dec_and_test() |
100 | * LOCK MB | 100 | * LOCK MB |
101 | * atomic_read() mutex_is_locked() | 101 | * atomic_read() rwsem_is_locked() |
102 | * | 102 | * |
103 | * LOCK should suffice since the actual taking of the lock must | 103 | * LOCK should suffice since the actual taking of the lock must |
104 | * happen _before_ what follows. | 104 | * happen _before_ what follows. |
105 | */ | 105 | */ |
106 | if (mutex_is_locked(&anon_vma->root->mutex)) { | 106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { |
107 | anon_vma_lock(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | anon_vma_unlock(anon_vma); | 108 | anon_vma_unlock(anon_vma); |
109 | } | 109 | } |
110 | 110 | ||
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
146 | * allocate a new one. | 146 | * allocate a new one. |
147 | * | 147 | * |
148 | * Anon-vma allocations are very subtle, because we may have | 148 | * Anon-vma allocations are very subtle, because we may have |
149 | * optimistically looked up an anon_vma in page_lock_anon_vma() | 149 | * optimistically looked up an anon_vma in page_lock_anon_vma_read() |
150 | * and that may actually touch the spinlock even in the newly | 150 | * and that may actually touch the spinlock even in the newly |
151 | * allocated vma (it depends on RCU to make sure that the | 151 | * allocated vma (it depends on RCU to make sure that the |
152 | * anon_vma isn't actually destroyed). | 152 | * anon_vma isn't actually destroyed). |
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
181 | allocated = anon_vma; | 181 | allocated = anon_vma; |
182 | } | 182 | } |
183 | 183 | ||
184 | anon_vma_lock(anon_vma); | 184 | anon_vma_lock_write(anon_vma); |
185 | /* page_table_lock to protect against threads */ | 185 | /* page_table_lock to protect against threads */ |
186 | spin_lock(&mm->page_table_lock); | 186 | spin_lock(&mm->page_table_lock); |
187 | if (likely(!vma->anon_vma)) { | 187 | if (likely(!vma->anon_vma)) { |
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct | |||
219 | struct anon_vma *new_root = anon_vma->root; | 219 | struct anon_vma *new_root = anon_vma->root; |
220 | if (new_root != root) { | 220 | if (new_root != root) { |
221 | if (WARN_ON_ONCE(root)) | 221 | if (WARN_ON_ONCE(root)) |
222 | mutex_unlock(&root->mutex); | 222 | up_write(&root->rwsem); |
223 | root = new_root; | 223 | root = new_root; |
224 | mutex_lock(&root->mutex); | 224 | down_write(&root->rwsem); |
225 | } | 225 | } |
226 | return root; | 226 | return root; |
227 | } | 227 | } |
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct | |||
229 | static inline void unlock_anon_vma_root(struct anon_vma *root) | 229 | static inline void unlock_anon_vma_root(struct anon_vma *root) |
230 | { | 230 | { |
231 | if (root) | 231 | if (root) |
232 | mutex_unlock(&root->mutex); | 232 | up_write(&root->rwsem); |
233 | } | 233 | } |
234 | 234 | ||
235 | /* | 235 | /* |
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
306 | get_anon_vma(anon_vma->root); | 306 | get_anon_vma(anon_vma->root); |
307 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | 307 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ |
308 | vma->anon_vma = anon_vma; | 308 | vma->anon_vma = anon_vma; |
309 | anon_vma_lock(anon_vma); | 309 | anon_vma_lock_write(anon_vma); |
310 | anon_vma_chain_link(vma, avc, anon_vma); | 310 | anon_vma_chain_link(vma, avc, anon_vma); |
311 | anon_vma_unlock(anon_vma); | 311 | anon_vma_unlock(anon_vma); |
312 | 312 | ||
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
349 | /* | 349 | /* |
350 | * Iterate the list once more, it now only contains empty and unlinked | 350 | * Iterate the list once more, it now only contains empty and unlinked |
351 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() | 351 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() |
352 | * needing to acquire the anon_vma->root->mutex. | 352 | * needing to write-acquire the anon_vma->root->rwsem. |
353 | */ | 353 | */ |
354 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | 354 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
355 | struct anon_vma *anon_vma = avc->anon_vma; | 355 | struct anon_vma *anon_vma = avc->anon_vma; |
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data) | |||
365 | { | 365 | { |
366 | struct anon_vma *anon_vma = data; | 366 | struct anon_vma *anon_vma = data; |
367 | 367 | ||
368 | mutex_init(&anon_vma->mutex); | 368 | init_rwsem(&anon_vma->rwsem); |
369 | atomic_set(&anon_vma->refcount, 0); | 369 | atomic_set(&anon_vma->refcount, 0); |
370 | anon_vma->rb_root = RB_ROOT; | 370 | anon_vma->rb_root = RB_ROOT; |
371 | } | 371 | } |
@@ -442,7 +442,7 @@ out: | |||
442 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a | 442 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a |
443 | * reference like with page_get_anon_vma() and then block on the mutex. | 443 | * reference like with page_get_anon_vma() and then block on the mutex. |
444 | */ | 444 | */ |
445 | struct anon_vma *page_lock_anon_vma(struct page *page) | 445 | struct anon_vma *page_lock_anon_vma_read(struct page *page) |
446 | { | 446 | { |
447 | struct anon_vma *anon_vma = NULL; | 447 | struct anon_vma *anon_vma = NULL; |
448 | struct anon_vma *root_anon_vma; | 448 | struct anon_vma *root_anon_vma; |
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
457 | 457 | ||
458 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 458 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
459 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | 459 | root_anon_vma = ACCESS_ONCE(anon_vma->root); |
460 | if (mutex_trylock(&root_anon_vma->mutex)) { | 460 | if (down_read_trylock(&root_anon_vma->rwsem)) { |
461 | /* | 461 | /* |
462 | * If the page is still mapped, then this anon_vma is still | 462 | * If the page is still mapped, then this anon_vma is still |
463 | * its anon_vma, and holding the mutex ensures that it will | 463 | * its anon_vma, and holding the mutex ensures that it will |
464 | * not go away, see anon_vma_free(). | 464 | * not go away, see anon_vma_free(). |
465 | */ | 465 | */ |
466 | if (!page_mapped(page)) { | 466 | if (!page_mapped(page)) { |
467 | mutex_unlock(&root_anon_vma->mutex); | 467 | up_read(&root_anon_vma->rwsem); |
468 | anon_vma = NULL; | 468 | anon_vma = NULL; |
469 | } | 469 | } |
470 | goto out; | 470 | goto out; |
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
484 | 484 | ||
485 | /* we pinned the anon_vma, it's safe to sleep */ | 485 | /* we pinned the anon_vma, it's safe to sleep */ |
486 | rcu_read_unlock(); | 486 | rcu_read_unlock(); |
487 | anon_vma_lock(anon_vma); | 487 | anon_vma_lock_read(anon_vma); |
488 | 488 | ||
489 | if (atomic_dec_and_test(&anon_vma->refcount)) { | 489 | if (atomic_dec_and_test(&anon_vma->refcount)) { |
490 | /* | 490 | /* |
491 | * Oops, we held the last refcount, release the lock | 491 | * Oops, we held the last refcount, release the lock |
492 | * and bail -- can't simply use put_anon_vma() because | 492 | * and bail -- can't simply use put_anon_vma() because |
493 | * we'll deadlock on the anon_vma_lock() recursion. | 493 | * we'll deadlock on the anon_vma_lock_write() recursion. |
494 | */ | 494 | */ |
495 | anon_vma_unlock(anon_vma); | 495 | anon_vma_unlock_read(anon_vma); |
496 | __put_anon_vma(anon_vma); | 496 | __put_anon_vma(anon_vma); |
497 | anon_vma = NULL; | 497 | anon_vma = NULL; |
498 | } | 498 | } |
@@ -504,9 +504,9 @@ out: | |||
504 | return anon_vma; | 504 | return anon_vma; |
505 | } | 505 | } |
506 | 506 | ||
507 | void page_unlock_anon_vma(struct anon_vma *anon_vma) | 507 | void page_unlock_anon_vma_read(struct anon_vma *anon_vma) |
508 | { | 508 | { |
509 | anon_vma_unlock(anon_vma); | 509 | anon_vma_unlock_read(anon_vma); |
510 | } | 510 | } |
511 | 511 | ||
512 | /* | 512 | /* |
@@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page, | |||
744 | struct anon_vma_chain *avc; | 744 | struct anon_vma_chain *avc; |
745 | int referenced = 0; | 745 | int referenced = 0; |
746 | 746 | ||
747 | anon_vma = page_lock_anon_vma(page); | 747 | anon_vma = page_lock_anon_vma_read(page); |
748 | if (!anon_vma) | 748 | if (!anon_vma) |
749 | return referenced; | 749 | return referenced; |
750 | 750 | ||
@@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page, | |||
766 | break; | 766 | break; |
767 | } | 767 | } |
768 | 768 | ||
769 | page_unlock_anon_vma(anon_vma); | 769 | page_unlock_anon_vma_read(anon_vma); |
770 | return referenced; | 770 | return referenced; |
771 | } | 771 | } |
772 | 772 | ||
@@ -1315,7 +1315,7 @@ out_mlock: | |||
1315 | /* | 1315 | /* |
1316 | * We need mmap_sem locking. Otherwise the VM_LOCKED check gives an | 1316 | * We need mmap_sem locking. Otherwise the VM_LOCKED check gives an |
1317 | * unstable, racy result. Plus, we can't wait here because | 1317 | * unstable, racy result. Plus, we can't wait here because |
1318 | * we now hold anon_vma->mutex or mapping->i_mmap_mutex. | 1318 | * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. |
1319 | * if the trylock failed, the page remains in the evictable lru and later | 1319 | * if the trylock failed, the page remains in the evictable lru and later |
1320 | * vmscan could retry to move the page to unevictable lru if the | 1320 | * vmscan could retry to move the page to unevictable lru if the |
1321 | * page is actually mlocked. | 1321 | * page is actually mlocked. |
@@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1480 | struct anon_vma_chain *avc; | 1480 | struct anon_vma_chain *avc; |
1481 | int ret = SWAP_AGAIN; | 1481 | int ret = SWAP_AGAIN; |
1482 | 1482 | ||
1483 | anon_vma = page_lock_anon_vma(page); | 1483 | anon_vma = page_lock_anon_vma_read(page); |
1484 | if (!anon_vma) | 1484 | if (!anon_vma) |
1485 | return ret; | 1485 | return ret; |
1486 | 1486 | ||
@@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1507 | break; | 1507 | break; |
1508 | } | 1508 | } |
1509 | 1509 | ||
1510 | page_unlock_anon_vma(anon_vma); | 1510 | page_unlock_anon_vma_read(anon_vma); |
1511 | return ret; | 1511 | return ret; |
1512 | } | 1512 | } |
1513 | 1513 | ||
@@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1702 | int ret = SWAP_AGAIN; | 1702 | int ret = SWAP_AGAIN; |
1703 | 1703 | ||
1704 | /* | 1704 | /* |
1705 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | 1705 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() |
1706 | * because that depends on page_mapped(); but not all its usages | 1706 | * because that depends on page_mapped(); but not all its usages |
1707 | * are holding mmap_sem. Users without mmap_sem are required to | 1707 | * are holding mmap_sem. Users without mmap_sem are required to |
1708 | * take a reference count to prevent the anon_vma disappearing | 1708 | * take a reference count to prevent the anon_vma disappearing |
@@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1710 | anon_vma = page_anon_vma(page); | 1710 | anon_vma = page_anon_vma(page); |
1711 | if (!anon_vma) | 1711 | if (!anon_vma) |
1712 | return ret; | 1712 | return ret; |
1713 | anon_vma_lock(anon_vma); | 1713 | anon_vma_lock_read(anon_vma); |
1714 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1714 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1715 | struct vm_area_struct *vma = avc->vma; | 1715 | struct vm_area_struct *vma = avc->vma; |
1716 | unsigned long address = vma_address(page, vma); | 1716 | unsigned long address = vma_address(page, vma); |
@@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1718 | if (ret != SWAP_AGAIN) | 1718 | if (ret != SWAP_AGAIN) |
1719 | break; | 1719 | break; |
1720 | } | 1720 | } |
1721 | anon_vma_unlock(anon_vma); | 1721 | anon_vma_unlock_read(anon_vma); |
1722 | return ret; | 1722 | return ret; |
1723 | } | 1723 | } |
1724 | 1724 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index df14808f0a36..9800306c8195 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = { | |||
774 | 774 | ||
775 | "pgrotated", | 775 | "pgrotated", |
776 | 776 | ||
777 | #ifdef CONFIG_NUMA_BALANCING | ||
778 | "numa_pte_updates", | ||
779 | "numa_hint_faults", | ||
780 | "numa_hint_faults_local", | ||
781 | "numa_pages_migrated", | ||
782 | #endif | ||
783 | #ifdef CONFIG_MIGRATION | ||
784 | "pgmigrate_success", | ||
785 | "pgmigrate_fail", | ||
786 | #endif | ||
777 | #ifdef CONFIG_COMPACTION | 787 | #ifdef CONFIG_COMPACTION |
778 | "compact_blocks_moved", | 788 | "compact_migrate_scanned", |
779 | "compact_pages_moved", | 789 | "compact_free_scanned", |
780 | "compact_pagemigrate_failed", | 790 | "compact_isolated", |
781 | "compact_stall", | 791 | "compact_stall", |
782 | "compact_fail", | 792 | "compact_fail", |
783 | "compact_success", | 793 | "compact_success", |