Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            9
-rw-r--r--  mm/backing-dev.c      4
-rw-r--r--  mm/cleancache.c       4
-rw-r--r--  mm/filemap.c          1
-rw-r--r--  mm/gup.c              4
-rw-r--r--  mm/huge_memory.c     89
-rw-r--r--  mm/hugetlb.c          7
-rw-r--r--  mm/internal.h        31
-rw-r--r--  mm/memblock.c         2
-rw-r--r--  mm/memory.c          14
-rw-r--r--  mm/mempolicy.c       14
-rw-r--r--  mm/mmap.c            85
-rw-r--r--  mm/mprotect.c         6
-rw-r--r--  mm/mremap.c           2
-rw-r--r--  mm/page_alloc.c       7
-rw-r--r--  mm/pgtable-generic.c  8
-rw-r--r--  mm/util.c            27
-rw-r--r--  mm/vmpressure.c       3
-rw-r--r--  mm/vmscan.c           2
-rw-r--r--  mm/vmstat.c          70
20 files changed, 222 insertions, 167 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 97a4e06b15c0..03cbfa072f42 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -624,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
         bool
 
 config DEFERRED_STRUCT_PAGE_INIT
-        bool "Defer initialisation of struct pages to kswapd"
+        bool "Defer initialisation of struct pages to kthreads"
         default n
         depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
         depends on MEMORY_HOTPLUG
@@ -633,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT
           single thread. On very large machines this can take a considerable
           amount of time. If this option is set, large machines will bring up
           a subset of memmap at boot and then initialise the rest in parallel
-          when kswapd starts. This has a potential performance impact on
-          processes running early in the lifetime of the systemm until kswapd
-          finishes the initialisation.
+          by starting one-off "pgdatinitX" kernel thread for each node X. This
+          has a potential performance impact on processes running early in the
+          lifetime of the system until these kthreads finish the
+          initialisation.
 
 config IDLE_PAGE_TRACKING
         bool "Enable idle page tracking"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index cc5d29d2da9b..c554d173a65f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -328,7 +328,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
         return 0;
 
 out_destroy_stat:
-        while (--i)
+        while (i--)
                 percpu_counter_destroy(&wb->stat[i]);
         fprop_local_destroy_percpu(&wb->completions);
 out_put_cong:
@@ -989,7 +989,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
                  * here rather than calling cond_resched().
                  */
                 if (current->flags & PF_WQ_WORKER)
-                        schedule_timeout(1);
+                        schedule_timeout_uninterruptible(1);
                 else
                         cond_resched();
 
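
The wb_init() error-path change in the first hunk above fixes a classic cleanup-loop off-by-one: while (--i) never reaches index 0 (and underflows if the very first allocation failed with i == 0), while while (i--) tears down exactly the entries that were initialised. A minimal userspace sketch of the two loop shapes, with a hypothetical destroy_slot() standing in for percpu_counter_destroy():

    #include <stdio.h>

    /* Hypothetical per-slot teardown, standing in for percpu_counter_destroy(). */
    static void destroy_slot(int i) { printf("destroy slot %d\n", i); }

    int main(void)
    {
            int nr = 4;
            int i;

            /* Pretend slots 0..3 were initialised and slot 4 failed, so i == 4. */
            i = nr;

            /* Buggy form: while (--i) visits 3, 2, 1 only and skips slot 0. */
            /* Fixed form: i-- visits 3, 2, 1, 0 and stops before -1. */
            while (i--)
                    destroy_slot(i);

            return 0;
    }
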
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 8fc50811119b..ba5d8f3e6d68 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -22,7 +22,7 @@
  * cleancache_ops is set by cleancache_register_ops to contain the pointers
  * to the cleancache "backend" implementation functions.
  */
-static struct cleancache_ops *cleancache_ops __read_mostly;
+static const struct cleancache_ops *cleancache_ops __read_mostly;
 
 /*
  * Counters available via /sys/kernel/debug/cleancache (if debugfs is
@@ -49,7 +49,7 @@ static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
 /*
  * Register operations for cleancache. Returns 0 on success.
  */
-int cleancache_register_ops(struct cleancache_ops *ops)
+int cleancache_register_ops(const struct cleancache_ops *ops)
 {
         if (cmpxchg(&cleancache_ops, NULL, ops))
                 return -EBUSY;
diff --git a/mm/filemap.c b/mm/filemap.c
index bc943867d68c..23edccecadb0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1890,6 +1890,7 @@ EXPORT_SYMBOL(generic_file_read_iter);
  * page_cache_read - adds requested page to the page cache if not already there
  * @file: file to read
  * @offset: page index
+ * @gfp_mask: memory allocation flags
  *
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
diff --git a/mm/gup.c b/mm/gup.c
index b64a36175884..7bf19ffa2199 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -430,10 +430,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
                          * Anon pages in shared mappings are surprising: now
                          * just reject it.
                          */
-                        if (!is_cow_mapping(vm_flags)) {
-                                WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
+                        if (!is_cow_mapping(vm_flags))
                                 return -EFAULT;
-                        }
                 }
         } else if (!(vm_flags & VM_READ)) {
                 if (!(gup_flags & FOLL_FORCE))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fd3a07b3e6f4..08fc0ba2207e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -138,9 +138,6 @@ static struct khugepaged_scan khugepaged_scan = {
         .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
-static DEFINE_SPINLOCK(split_queue_lock);
-static LIST_HEAD(split_queue);
-static unsigned long split_queue_len;
 static struct shrinker deferred_split_shrinker;
 
 static void set_recommended_min_free_kbytes(void)
@@ -861,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
                 return false;
         entry = mk_pmd(zero_page, vma->vm_page_prot);
         entry = pmd_mkhuge(entry);
-        pgtable_trans_huge_deposit(mm, pmd, pgtable);
+        if (pgtable)
+                pgtable_trans_huge_deposit(mm, pmd, pgtable);
         set_pmd_at(mm, haddr, pmd, entry);
         atomic_long_inc(&mm->nr_ptes);
         return true;
@@ -1039,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         spinlock_t *dst_ptl, *src_ptl;
         struct page *src_page;
         pmd_t pmd;
-        pgtable_t pgtable;
+        pgtable_t pgtable = NULL;
         int ret;
 
-        ret = -ENOMEM;
-        pgtable = pte_alloc_one(dst_mm, addr);
-        if (unlikely(!pgtable))
-                goto out;
+        if (!vma_is_dax(vma)) {
+                ret = -ENOMEM;
+                pgtable = pte_alloc_one(dst_mm, addr);
+                if (unlikely(!pgtable))
+                        goto out;
+        }
 
         dst_ptl = pmd_lock(dst_mm, dst_pmd);
         src_ptl = pmd_lockptr(src_mm, src_pmd);
@@ -1076,7 +1076,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 goto out_unlock;
         }
 
-        if (pmd_trans_huge(pmd)) {
+        if (!vma_is_dax(vma)) {
                 /* thp accounting separate from pmd_devmap accounting */
                 src_page = pmd_page(pmd);
                 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
@@ -3358,6 +3358,7 @@ int total_mapcount(struct page *page)
 int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
         struct page *head = compound_head(page);
+        struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
         struct anon_vma *anon_vma;
         int count, mapcount, ret;
         bool mlocked;
@@ -3401,19 +3402,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         lru_add_drain();
 
         /* Prevent deferred_split_scan() touching ->_count */
-        spin_lock_irqsave(&split_queue_lock, flags);
+        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
         count = page_count(head);
         mapcount = total_mapcount(head);
         if (!mapcount && count == 1) {
                 if (!list_empty(page_deferred_list(head))) {
-                        split_queue_len--;
+                        pgdata->split_queue_len--;
                         list_del(page_deferred_list(head));
                 }
-                spin_unlock_irqrestore(&split_queue_lock, flags);
+                spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                 __split_huge_page(page, list);
                 ret = 0;
         } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
-                spin_unlock_irqrestore(&split_queue_lock, flags);
+                spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                 pr_alert("total_mapcount: %u, page_count(): %u\n",
                                 mapcount, count);
                 if (PageTail(page))
@@ -3421,7 +3422,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                 dump_page(page, "total_mapcount(head) > 0");
                 BUG();
         } else {
-                spin_unlock_irqrestore(&split_queue_lock, flags);
+                spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
                 unfreeze_page(anon_vma, head);
                 ret = -EBUSY;
         }
@@ -3436,64 +3437,65 @@ out:
 
 void free_transhuge_page(struct page *page)
 {
+        struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
         unsigned long flags;
 
-        spin_lock_irqsave(&split_queue_lock, flags);
+        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
         if (!list_empty(page_deferred_list(page))) {
-                split_queue_len--;
+                pgdata->split_queue_len--;
                 list_del(page_deferred_list(page));
         }
-        spin_unlock_irqrestore(&split_queue_lock, flags);
+        spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
         free_compound_page(page);
 }
 
 void deferred_split_huge_page(struct page *page)
 {
+        struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
         unsigned long flags;
 
         VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 
-        spin_lock_irqsave(&split_queue_lock, flags);
+        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
         if (list_empty(page_deferred_list(page))) {
-                list_add_tail(page_deferred_list(page), &split_queue);
-                split_queue_len++;
+                list_add_tail(page_deferred_list(page), &pgdata->split_queue);
+                pgdata->split_queue_len++;
         }
-        spin_unlock_irqrestore(&split_queue_lock, flags);
+        spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 }
 
 static unsigned long deferred_split_count(struct shrinker *shrink,
                 struct shrink_control *sc)
 {
-        /*
-         * Split a page from split_queue will free up at least one page,
-         * at most HPAGE_PMD_NR - 1. We don't track exact number.
-         * Let's use HPAGE_PMD_NR / 2 as ballpark.
-         */
-        return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+        struct pglist_data *pgdata = NODE_DATA(sc->nid);
+        return ACCESS_ONCE(pgdata->split_queue_len);
 }
 
 static unsigned long deferred_split_scan(struct shrinker *shrink,
                 struct shrink_control *sc)
 {
+        struct pglist_data *pgdata = NODE_DATA(sc->nid);
         unsigned long flags;
         LIST_HEAD(list), *pos, *next;
         struct page *page;
         int split = 0;
 
-        spin_lock_irqsave(&split_queue_lock, flags);
-        list_splice_init(&split_queue, &list);
-
+        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
         /* Take pin on all head pages to avoid freeing them under us */
-        list_for_each_safe(pos, next, &list) {
+        list_for_each_safe(pos, next, &pgdata->split_queue) {
                 page = list_entry((void *)pos, struct page, mapping);
                 page = compound_head(page);
-                /* race with put_compound_page() */
-                if (!get_page_unless_zero(page)) {
+                if (get_page_unless_zero(page)) {
+                        list_move(page_deferred_list(page), &list);
+                } else {
+                        /* We lost race with put_compound_page() */
                         list_del_init(page_deferred_list(page));
-                        split_queue_len--;
+                        pgdata->split_queue_len--;
                 }
+                if (!--sc->nr_to_scan)
+                        break;
         }
-        spin_unlock_irqrestore(&split_queue_lock, flags);
+        spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 
         list_for_each_safe(pos, next, &list) {
                 page = list_entry((void *)pos, struct page, mapping);
@@ -3505,17 +3507,24 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
                 put_page(page);
         }
 
-        spin_lock_irqsave(&split_queue_lock, flags);
-        list_splice_tail(&list, &split_queue);
-        spin_unlock_irqrestore(&split_queue_lock, flags);
+        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+        list_splice_tail(&list, &pgdata->split_queue);
+        spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
 
-        return split * HPAGE_PMD_NR / 2;
+        /*
+         * Stop shrinker if we didn't split any page, but the queue is empty.
+         * This can happen if pages were freed under us.
+         */
+        if (!split && list_empty(&pgdata->split_queue))
+                return SHRINK_STOP;
+        return split;
 }
 
 static struct shrinker deferred_split_shrinker = {
         .count_objects = deferred_split_count,
         .scan_objects = deferred_split_scan,
         .seeks = DEFAULT_SEEKS,
+        .flags = SHRINKER_NUMA_AWARE,
 };
 
 #ifdef CONFIG_DEBUG_FS
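
The huge_memory.c changes above make the deferred-split queue per-node (split_queue_lock, split_queue and split_queue_len move into struct pglist_data) and register the shrinker as SHRINKER_NUMA_AWARE, so count_objects reports only the queue length for sc->nid and scan_objects returns SHRINK_STOP once nothing was split and that node's queue is empty. Below is a rough, single-threaded model of that count/scan contract in plain C; node_queue, the items array and the "even pages split successfully" rule are all invented for illustration and are not the kernel implementation:

    #include <stdio.h>

    #define SHRINK_STOP (~0UL)

    /* Toy stand-in for the per-node fields the patch adds to pglist_data. */
    struct node_queue {
            unsigned long len;      /* split_queue_len */
            int items[16];          /* pretend deferred huge pages */
    };

    static unsigned long deferred_count(struct node_queue *nq)
    {
            /* count_objects: just report the per-node queue length. */
            return nq->len;
    }

    static unsigned long deferred_scan(struct node_queue *nq, unsigned long nr_to_scan)
    {
            unsigned long split = 0;

            while (nq->len && nr_to_scan--) {
                    int page = nq->items[--nq->len];

                    /* Pretend splitting succeeds for even "pages" only. */
                    if (page % 2 == 0)
                            split++;
            }

            /* Nothing split and the queue is empty: tell the caller to stop. */
            if (!split && nq->len == 0)
                    return SHRINK_STOP;
            return split;
    }

    int main(void)
    {
            struct node_queue nq = { .len = 4, .items = { 1, 2, 3, 4 } };

            printf("count=%lu\n", deferred_count(&nq));
            printf("scan -> %lu\n", deferred_scan(&nq, 32));
            printf("scan again -> %lu (SHRINK_STOP=%lu)\n",
                   deferred_scan(&nq, 32), SHRINK_STOP);
            return 0;
    }
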
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 12908dcf5831..06ae13e869d0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1001,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
                 ((node = hstate_next_node_to_free(hs, mask)) || 1);     \
                 nr_nodes--)
 
-#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA))
 static void destroy_compound_gigantic_page(struct page *page,
                                         unsigned int order)
 {
@@ -1214,8 +1214,8 @@ void free_huge_page(struct page *page)
 
         set_page_private(page, 0);
         page->mapping = NULL;
-        BUG_ON(page_count(page));
-        BUG_ON(page_mapcount(page));
+        VM_BUG_ON_PAGE(page_count(page), page);
+        VM_BUG_ON_PAGE(page_mapcount(page), page);
         restore_reserve = PagePrivate(page);
         ClearPagePrivate(page);
 
@@ -1286,6 +1286,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
                 set_page_count(p, 0);
                 set_compound_head(p, page);
         }
+        atomic_set(compound_mapcount_ptr(page), -1);
 }
 
 /*
diff --git a/mm/internal.h b/mm/internal.h
index ed8b5ffcf9b1..a38a21ebddb4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -216,6 +216,37 @@ static inline bool is_cow_mapping(vm_flags_t flags)
         return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
 
+/*
+ * These three helpers classifies VMAs for virtual memory accounting.
+ */
+
+/*
+ * Executable code area - executable, not writable, not stack
+ */
+static inline bool is_exec_mapping(vm_flags_t flags)
+{
+        return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
+}
+
+/*
+ * Stack area - atomatically grows in one direction
+ *
+ * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
+ * do_mmap() forbids all other combinations.
+ */
+static inline bool is_stack_mapping(vm_flags_t flags)
+{
+        return (flags & VM_STACK) == VM_STACK;
+}
+
+/*
+ * Data area - private, writable, not stack
+ */
+static inline bool is_data_mapping(vm_flags_t flags)
+{
+        return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
+}
+
 /* mm/util.c */
 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
                 struct vm_area_struct *prev, struct rb_node *rb_parent);
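
The new is_exec_mapping()/is_stack_mapping()/is_data_mapping() helpers classify a VMA for accounting purely by masking flag bits and comparing against an expected pattern. A small self-contained model of the same mask-and-compare idiom; the flag values below are made up for illustration and are not the kernel's real VM_* constants:

    #include <stdio.h>
    #include <stdbool.h>

    /* Illustrative flag bits only; the real VM_* values live in <linux/mm.h>. */
    #define VM_WRITE   0x1u
    #define VM_EXEC    0x2u
    #define VM_SHARED  0x4u
    #define VM_STACK   0x8u  /* stands in for VM_GROWSUP/VM_GROWSDOWN */

    typedef unsigned int vm_flags_t;

    /* Executable code area - executable, not writable, not stack. */
    static bool is_exec_mapping(vm_flags_t flags)
    {
            return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
    }

    /* Stack area - the VM_STACK bit set. */
    static bool is_stack_mapping(vm_flags_t flags)
    {
            return (flags & VM_STACK) == VM_STACK;
    }

    /* Data area - private (not shared), writable, not stack. */
    static bool is_data_mapping(vm_flags_t flags)
    {
            return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
    }

    int main(void)
    {
            vm_flags_t text = VM_EXEC;
            vm_flags_t heap = VM_WRITE;
            vm_flags_t stack = VM_WRITE | VM_STACK;

            printf("text:  exec=%d data=%d stack=%d\n",
                   is_exec_mapping(text), is_data_mapping(text), is_stack_mapping(text));
            printf("heap:  exec=%d data=%d stack=%d\n",
                   is_exec_mapping(heap), is_data_mapping(heap), is_stack_mapping(heap));
            printf("stack: exec=%d data=%d stack=%d\n",
                   is_exec_mapping(stack), is_data_mapping(stack), is_stack_mapping(stack));
            return 0;
    }

For ordinary mappings the three categories come out mutually exclusive, which is what lets vm_stat_account() in the mmap.c hunks further down use a plain if/else-if chain.
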
diff --git a/mm/memblock.c b/mm/memblock.c
index d2ed81e59a94..dd7989929f13 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1448,7 +1448,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
  * Remaining API functions
  */
 
-phys_addr_t __init memblock_phys_mem_size(void)
+phys_addr_t __init_memblock memblock_phys_mem_size(void)
 {
         return memblock.memory.total_size;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 5aa4f55eb786..38090ca37a08 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1612,10 +1612,15 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
          * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
          * without pte special, it would there be refcounted as a normal page.
          */
-        if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) {
+        if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
                 struct page *page;
 
-                page = pfn_t_to_page(pfn);
+                /*
+                 * At this point we are committed to insert_page()
+                 * regardless of whether the caller specified flags that
+                 * result in pfn_t_has_page() == false.
+                 */
+                page = pfn_to_page(pfn_t_to_pfn(pfn));
                 return insert_page(vma, addr, page, vma->vm_page_prot);
         }
         return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
@@ -2253,11 +2258,6 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
 
         page_cache_get(old_page);
 
-        /*
-         * Only catch write-faults on shared writable pages,
-         * read-only shared pages can get COWed by
-         * get_user_pages(.write=1, .force=1).
-         */
         if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                 int tmp;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 27d135408a22..4c4187c0e1de 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -548,8 +548,7 @@ retry:
                         goto retry;
                 }
 
-                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-                        migrate_page_add(page, qp->pagelist, flags);
+                migrate_page_add(page, qp->pagelist, flags);
         }
         pte_unmap_unlock(pte - 1, ptl);
         cond_resched();
@@ -625,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
         unsigned long endvma = vma->vm_end;
         unsigned long flags = qp->flags;
 
-        if (vma->vm_flags & VM_PFNMAP)
+        if (!vma_migratable(vma))
                 return 1;
 
         if (endvma > end)
@@ -644,16 +643,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 
         if (flags & MPOL_MF_LAZY) {
                 /* Similar to task_numa_work, skip inaccessible VMAs */
-                if (vma_migratable(vma) &&
-                        vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+                if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
                         change_prot_numa(vma, start, endvma);
                 return 1;
         }
 
-        if ((flags & MPOL_MF_STRICT) ||
-            ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-             vma_migratable(vma)))
-                /* queue pages from current vma */
+        /* queue pages from current vma */
+        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                 return 0;
         return 1;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 407ab434d5ee..e2e9f48b06c2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -42,6 +42,7 @@
 #include <linux/memory.h>
 #include <linux/printk.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/moduleparam.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -69,6 +70,8 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
 int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
 #endif
 
+static bool ignore_rlimit_data = true;
+core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
 
 static void unmap_region(struct mm_struct *mm,
                 struct vm_area_struct *vma, struct vm_area_struct *prev,
@@ -387,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
 }
 
 #ifdef CONFIG_DEBUG_VM_RB
-static int browse_rb(struct rb_root *root)
+static int browse_rb(struct mm_struct *mm)
 {
+        struct rb_root *root = &mm->mm_rb;
         int i = 0, j, bug = 0;
         struct rb_node *nd, *pn = NULL;
         unsigned long prev = 0, pend = 0;
@@ -411,12 +415,14 @@ static int browse_rb(struct rb_root *root)
                                   vma->vm_start, vma->vm_end);
                         bug = 1;
                 }
+                spin_lock(&mm->page_table_lock);
                 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
                         pr_emerg("free gap %lx, correct %lx\n",
                                vma->rb_subtree_gap,
                                vma_compute_subtree_gap(vma));
                         bug = 1;
                 }
+                spin_unlock(&mm->page_table_lock);
                 i++;
                 pn = nd;
                 prev = vma->vm_start;
@@ -453,12 +459,16 @@ static void validate_mm(struct mm_struct *mm)
         struct vm_area_struct *vma = mm->mmap;
 
         while (vma) {
+                struct anon_vma *anon_vma = vma->anon_vma;
                 struct anon_vma_chain *avc;
 
-                vma_lock_anon_vma(vma);
-                list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
-                        anon_vma_interval_tree_verify(avc);
-                vma_unlock_anon_vma(vma);
+                if (anon_vma) {
+                        anon_vma_lock_read(anon_vma);
+                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+                                anon_vma_interval_tree_verify(avc);
+                        anon_vma_unlock_read(anon_vma);
+                }
+
                 highest_address = vma->vm_end;
                 vma = vma->vm_next;
                 i++;
@@ -472,7 +482,7 @@ static void validate_mm(struct mm_struct *mm)
                           mm->highest_vm_end, highest_address);
                 bug = 1;
         }
-        i = browse_rb(&mm->mm_rb);
+        i = browse_rb(mm);
         if (i != mm->map_count) {
                 if (i != -1)
                         pr_emerg("map_count %d rb %d\n", mm->map_count, i);
@@ -2139,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
         struct mm_struct *mm = vma->vm_mm;
-        int error;
+        int error = 0;
 
         if (!(vma->vm_flags & VM_GROWSUP))
                 return -EFAULT;
 
-        /*
-         * We must make sure the anon_vma is allocated
-         * so that the anon_vma locking is not a noop.
-         */
+        /* Guard against wrapping around to address 0. */
+        if (address < PAGE_ALIGN(address+4))
+                address = PAGE_ALIGN(address+4);
+        else
+                return -ENOMEM;
+
+        /* We must make sure the anon_vma is allocated. */
         if (unlikely(anon_vma_prepare(vma)))
                 return -ENOMEM;
-        vma_lock_anon_vma(vma);
 
         /*
          * vma->vm_start/vm_end cannot change under us because the caller
          * is required to hold the mmap_sem in read mode. We need the
          * anon_vma lock to serialize against concurrent expand_stacks.
-         * Also guard against wrapping around to address 0.
          */
-        if (address < PAGE_ALIGN(address+4))
-                address = PAGE_ALIGN(address+4);
-        else {
-                vma_unlock_anon_vma(vma);
-                return -ENOMEM;
-        }
-        error = 0;
+        anon_vma_lock_write(vma->anon_vma);
 
         /* Somebody else might have raced and expanded it already */
         if (address > vma->vm_end) {
@@ -2182,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                          * updates, but we only hold a shared mmap_sem
                          * lock here, so we need to protect against
                          * concurrent vma expansions.
-                         * vma_lock_anon_vma() doesn't help here, as
+                         * anon_vma_lock_write() doesn't help here, as
                          * we don't guarantee that all growable vmas
                          * in a mm share the same root anon vma.
                          * So, we reuse mm->page_table_lock to guard
@@ -2205,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                         }
                 }
         }
-        vma_unlock_anon_vma(vma);
+        anon_vma_unlock_write(vma->anon_vma);
         khugepaged_enter_vma_merge(vma, vma->vm_flags);
         validate_mm(mm);
         return error;
@@ -2221,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma,
         struct mm_struct *mm = vma->vm_mm;
         int error;
 
-        /*
-         * We must make sure the anon_vma is allocated
-         * so that the anon_vma locking is not a noop.
-         */
-        if (unlikely(anon_vma_prepare(vma)))
-                return -ENOMEM;
-
         address &= PAGE_MASK;
         error = security_mmap_addr(address);
         if (error)
                 return error;
 
-        vma_lock_anon_vma(vma);
+        /* We must make sure the anon_vma is allocated. */
+        if (unlikely(anon_vma_prepare(vma)))
+                return -ENOMEM;
 
         /*
          * vma->vm_start/vm_end cannot change under us because the caller
          * is required to hold the mmap_sem in read mode. We need the
          * anon_vma lock to serialize against concurrent expand_stacks.
          */
+        anon_vma_lock_write(vma->anon_vma);
 
         /* Somebody else might have raced and expanded it already */
         if (address < vma->vm_start) {
@@ -2257,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma,
                          * updates, but we only hold a shared mmap_sem
                          * lock here, so we need to protect against
                          * concurrent vma expansions.
-                         * vma_lock_anon_vma() doesn't help here, as
+                         * anon_vma_lock_write() doesn't help here, as
                          * we don't guarantee that all growable vmas
                          * in a mm share the same root anon vma.
                          * So, we reuse mm->page_table_lock to guard
@@ -2278,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma,
                         }
                 }
         }
-        vma_unlock_anon_vma(vma);
+        anon_vma_unlock_write(vma->anon_vma);
         khugepaged_enter_vma_merge(vma, vma->vm_flags);
         validate_mm(mm);
         return error;
@@ -2982,9 +2983,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
         if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
                 return false;
 
-        if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
-                                (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
-                return mm->data_vm + npages <= rlimit(RLIMIT_DATA);
+        if (is_data_mapping(flags) &&
+            mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
+                if (ignore_rlimit_data)
+                        pr_warn_once("%s (%d): VmData %lu exceed data ulimit "
+                                     "%lu. Will be forbidden soon.\n",
+                                     current->comm, current->pid,
+                                     (mm->data_vm + npages) << PAGE_SHIFT,
+                                     rlimit(RLIMIT_DATA));
+                else
+                        return false;
+        }
 
         return true;
 }
@@ -2993,11 +3002,11 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
 {
         mm->total_vm += npages;
 
-        if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
+        if (is_exec_mapping(flags))
                 mm->exec_vm += npages;
-        else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
+        else if (is_stack_mapping(flags))
                 mm->stack_vm += npages;
-        else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+        else if (is_data_mapping(flags))
                 mm->data_vm += npages;
 }
 
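
The may_expand_vm() hunk above compares the prospective data footprint in pages against RLIMIT_DATA (also shifted to pages), and with ignore_rlimit_data defaulting to true it only warns instead of failing the mapping. A small userspace sketch of the same pages-versus-limit comparison using the standard getrlimit(2) interface; the page counts below are made-up placeholders, not values from any real mm:

    #include <stdio.h>
    #include <sys/resource.h>
    #include <unistd.h>

    int main(void)
    {
            struct rlimit rl;
            long page_size = sysconf(_SC_PAGESIZE);
            unsigned long data_pages = 12345;  /* pretend current mm->data_vm */
            unsigned long npages = 256;        /* pretend new mapping size in pages */

            if (getrlimit(RLIMIT_DATA, &rl) != 0) {
                    perror("getrlimit");
                    return 1;
            }

            if (rl.rlim_cur == RLIM_INFINITY) {
                    printf("RLIMIT_DATA is unlimited\n");
                    return 0;
            }

            /* Same shape as the kernel check: compare page counts, not bytes. */
            if (data_pages + npages > rl.rlim_cur / page_size)
                    printf("VmData %lu would exceed data ulimit %llu\n",
                           (data_pages + npages) * page_size,
                           (unsigned long long)rl.rlim_cur);
            else
                    printf("within RLIMIT_DATA (%llu bytes)\n",
                           (unsigned long long)rl.rlim_cur);
            return 0;
    }
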
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8eb7bb40dc40..f7cb3d4d9c2e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -160,9 +160,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                 }
 
                 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
-                        if (next - addr != HPAGE_PMD_SIZE)
+                        if (next - addr != HPAGE_PMD_SIZE) {
                                 split_huge_pmd(vma, pmd, addr);
-                        else {
+                                if (pmd_none(*pmd))
+                                        continue;
+                        } else {
                                 int nr_ptes = change_huge_pmd(vma, pmd, addr,
                                                 newprot, prot_numa);
 
diff --git a/mm/mremap.c b/mm/mremap.c
index d77946a997f7..8eeba02fc991 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -210,6 +210,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                                 }
                         }
                         split_huge_pmd(vma, old_pmd, old_addr);
+                        if (pmd_none(*old_pmd))
+                                continue;
                         VM_BUG_ON(pmd_trans_huge(*old_pmd));
                 }
                 if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63358d9f9aa9..838ca8bb64f7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5210,6 +5210,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
         pgdat->numabalancing_migrate_nr_pages = 0;
         pgdat->numabalancing_migrate_next_window = jiffies;
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        spin_lock_init(&pgdat->split_queue_lock);
+        INIT_LIST_HEAD(&pgdat->split_queue);
+        pgdat->split_queue_len = 0;
+#endif
         init_waitqueue_head(&pgdat->kswapd_wait);
         init_waitqueue_head(&pgdat->pfmemalloc_wait);
         pgdat_page_ext_init(pgdat);
@@ -6615,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page)
         return !has_unmovable_pages(zone, page, 0, true);
 }
 
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 
 static unsigned long pfn_max_align_down(unsigned long pfn)
 {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 9d4767698a1c..06a005b979a7 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -90,9 +90,9 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
  * ARCHes with special requirements for evicting THP backing TLB entries can
  * implement this. Otherwise also, it can help optimize normal TLB flush in
  * THP regime. stock flush_tlb_range() typically has optimization to nuke the
- * entire TLB TLB if flush span is greater than a threshhold, which will
+ * entire TLB if flush span is greater than a threshold, which will
  * likely be true for a single huge page. Thus a single thp flush will
- * invalidate the entire TLB which is not desitable.
+ * invalidate the entire TLB which is not desirable.
  * e.g. see arch/arc: flush_pmd_tlb_range
  */
 #define flush_pmd_tlb_range(vma, addr, end)     flush_tlb_range(vma, addr, end)
@@ -195,7 +195,9 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
         VM_BUG_ON(pmd_trans_huge(*pmdp));
         pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
-        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+        /* collapse entails shooting down ptes not pmd */
+        flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
         return pmd;
 }
 #endif
diff --git a/mm/util.c b/mm/util.c
index c108a6542d05..4fb14ca5a419 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -230,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 /* Check if the vma is being used as a stack by this task */
-static int vm_is_stack_for_task(struct task_struct *t,
-                                struct vm_area_struct *vma)
+int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t)
 {
         return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
 }
 
-/*
- * Check if the vma is being used as a stack.
- * If is_group is non-zero, check in the entire thread group or else
- * just check in the current task. Returns the task_struct of the task
- * that the vma is stack for. Must be called under rcu_read_lock().
- */
-struct task_struct *task_of_stack(struct task_struct *task,
-                                struct vm_area_struct *vma, bool in_group)
-{
-        if (vm_is_stack_for_task(task, vma))
-                return task;
-
-        if (in_group) {
-                struct task_struct *t;
-
-                for_each_thread(task, t) {
-                        if (vm_is_stack_for_task(t, vma))
-                                return t;
-                }
-        }
-
-        return NULL;
-}
-
 #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 9a6c0704211c..149fdf6c5c56 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -248,9 +248,8 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 
         if (tree) {
                 spin_lock(&vmpr->sr_lock);
-                vmpr->tree_scanned += scanned;
+                scanned = vmpr->tree_scanned += scanned;
                 vmpr->tree_reclaimed += reclaimed;
-                scanned = vmpr->scanned;
                 spin_unlock(&vmpr->sr_lock);
 
                 if (scanned < vmpressure_win)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb3dd37ccd7c..71b1c29948db 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1443,7 +1443,7 @@ int isolate_lru_page(struct page *page)
         int ret = -EBUSY;
 
         VM_BUG_ON_PAGE(!page_count(page), page);
-        VM_BUG_ON_PAGE(PageTail(page), page);
+        WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
 
         if (PageLRU(page)) {
                 struct zone *zone = page_zone(page);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 40b2c74ddf16..084c6725b373 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1396,10 +1396,15 @@ static void vmstat_update(struct work_struct *w)
                  * Counters were updated so we expect more updates
                  * to occur in the future. Keep on running the
                  * update worker thread.
+                 * If we were marked on cpu_stat_off clear the flag
+                 * so that vmstat_shepherd doesn't schedule us again.
                  */
-                queue_delayed_work_on(smp_processor_id(), vmstat_wq,
-                        this_cpu_ptr(&vmstat_work),
-                        round_jiffies_relative(sysctl_stat_interval));
+                if (!cpumask_test_and_clear_cpu(smp_processor_id(),
+                                                cpu_stat_off)) {
+                        queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+                                this_cpu_ptr(&vmstat_work),
+                                round_jiffies_relative(sysctl_stat_interval));
+                }
         } else {
                 /*
                  * We did not update any counters so the app may be in
@@ -1417,18 +1422,6 @@ static void vmstat_update(struct work_struct *w)
  * until the diffs stay at zero. The function is used by NOHZ and can only be
  * invoked when tick processing is not active.
  */
-void quiet_vmstat(void)
-{
-        if (system_state != SYSTEM_RUNNING)
-                return;
-
-        do {
-                if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
-                        cancel_delayed_work(this_cpu_ptr(&vmstat_work));
-
-        } while (refresh_cpu_vm_stats(false));
-}
-
 /*
  * Check if the diffs for a certain cpu indicate that
  * an update is needed.
@@ -1452,6 +1445,30 @@ static bool need_update(int cpu)
         return false;
 }
 
+void quiet_vmstat(void)
+{
+        if (system_state != SYSTEM_RUNNING)
+                return;
+
+        /*
+         * If we are already in hands of the shepherd then there
+         * is nothing for us to do here.
+         */
+        if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+                return;
+
+        if (!need_update(smp_processor_id()))
+                return;
+
+        /*
+         * Just refresh counters and do not care about the pending delayed
+         * vmstat_update. It doesn't fire that often to matter and canceling
+         * it would be too expensive from this path.
+         * vmstat_shepherd will take care about that for us.
+         */
+        refresh_cpu_vm_stats(false);
+}
+
 
 /*
  * Shepherd worker thread that checks the
@@ -1469,18 +1486,25 @@ static void vmstat_shepherd(struct work_struct *w)
 
         get_online_cpus();
         /* Check processors whose vmstat worker threads have been disabled */
-        for_each_cpu(cpu, cpu_stat_off)
-                if (need_update(cpu) &&
-                        cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
-
-                        queue_delayed_work_on(cpu, vmstat_wq,
-                                &per_cpu(vmstat_work, cpu), 0);
+        for_each_cpu(cpu, cpu_stat_off) {
+                struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
+                if (need_update(cpu)) {
+                        if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+                                queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+                } else {
+                        /*
+                         * Cancel the work if quiet_vmstat has put this
+                         * cpu on cpu_stat_off because the work item might
+                         * be still scheduled
+                         */
+                        cancel_delayed_work(dw);
+                }
+        }
         put_online_cpus();
 
         schedule_delayed_work(&shepherd,
                 round_jiffies_relative(sysctl_stat_interval));
-
 }
 
 static void __init start_shepherd_timer(void)
@@ -1488,7 +1512,7 @@ static void __init start_shepherd_timer(void)
         int cpu;
 
         for_each_possible_cpu(cpu)
-                INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
+                INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
                         vmstat_update);
 
         if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
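
The vmstat rework above redefines who owns a CPU's deferred work item around the cpu_stat_off mask: quiet_vmstat() sets the bit and folds the counters once, vmstat_update() clears it when it keeps itself scheduled, and vmstat_shepherd() either requeues or cancels the (now deferrable) work. A much-simplified, single-threaded model of that set/clear handshake; the bool arrays and the pending_diffs counter are invented stand-ins for the cpumask, the delayed work and need_update(), not the kernel code:

    #include <stdio.h>
    #include <stdbool.h>

    #define NR_CPUS 4

    static bool cpu_stat_off[NR_CPUS];   /* toy stand-in for the cpumask */
    static bool work_queued[NR_CPUS];    /* toy stand-in for the delayed work */
    static int  pending_diffs[NR_CPUS];  /* nonzero => need_update() is true */

    /* test_and_set/test_and_clear semantics: return the old value, then write. */
    static bool test_and_set(bool *bit)   { bool old = *bit; *bit = true;  return old; }
    static bool test_and_clear(bool *bit) { bool old = *bit; *bit = false; return old; }

    /* quiet_vmstat(): hand the cpu over to the shepherd and fold counters once. */
    static void quiet_vmstat(int cpu)
    {
            if (test_and_set(&cpu_stat_off[cpu]))
                    return;                 /* shepherd already owns it */
            if (pending_diffs[cpu])
                    pending_diffs[cpu] = 0; /* models refresh_cpu_vm_stats() */
    }

    /* vmstat_shepherd(): requeue cpus that became active, cancel the rest. */
    static void vmstat_shepherd(void)
    {
            for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                    if (!cpu_stat_off[cpu])
                            continue;
                    if (pending_diffs[cpu]) {
                            if (test_and_clear(&cpu_stat_off[cpu]))
                                    work_queued[cpu] = true;
                    } else {
                            work_queued[cpu] = false; /* models cancel_delayed_work() */
                    }
            }
    }

    int main(void)
    {
            work_queued[1] = true;
            quiet_vmstat(1);          /* cpu1 goes idle with clean counters */
            pending_diffs[2] = 3;
            quiet_vmstat(2);          /* cpu2 goes idle, folds its diffs */
            pending_diffs[2] = 1;     /* ...but dirties them again */

            vmstat_shepherd();
            printf("cpu1: off=%d queued=%d\n", cpu_stat_off[1], work_queued[1]);
            printf("cpu2: off=%d queued=%d\n", cpu_stat_off[2], work_queued[2]);
            return 0;
    }
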