Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c       1
-rw-r--r--  mm/debug.c             4
-rw-r--r--  mm/gup.c               3
-rw-r--r--  mm/hugetlb.c          84
-rw-r--r--  mm/kasan/Makefile      3
-rw-r--r--  mm/kasan/common.c     82
-rw-r--r--  mm/kasan/tags.c        2
-rw-r--r--  mm/kmemleak.c         10
-rw-r--r--  mm/memblock.c         11
-rw-r--r--  mm/memory-failure.c   19
-rw-r--r--  mm/memory.c           26
-rw-r--r--  mm/memory_hotplug.c   85
-rw-r--r--  mm/mempolicy.c         6
-rw-r--r--  mm/migrate.c          25
-rw-r--r--  mm/mincore.c          94
-rw-r--r--  mm/oom_kill.c         12
-rw-r--r--  mm/page_alloc.c       40
-rw-r--r--  mm/page_ext.c          4
-rw-r--r--  mm/rmap.c              8
-rw-r--r--  mm/shmem.c            10
-rw-r--r--  mm/slab.c             21
-rw-r--r--  mm/slab.h              7
-rw-r--r--  mm/slab_common.c       3
-rw-r--r--  mm/slub.c             61
-rw-r--r--  mm/swap.c             17
-rw-r--r--  mm/usercopy.c          9
-rw-r--r--  mm/userfaultfd.c      11
-rw-r--r--  mm/util.c              4
-rw-r--r--  mm/vmscan.c           10
29 files changed, 387 insertions, 285 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8a8bb8796c6c..72e6d0c55cfa 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -689,6 +689,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
 	bdi->cgwb_congested_tree = RB_ROOT;
 	mutex_init(&bdi->cgwb_release_mutex);
+	init_rwsem(&bdi->wb_switch_rwsem);
 
 	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
 	if (!ret) {
diff --git a/mm/debug.c b/mm/debug.c
index 0abb987dad9b..1611cf00a137 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -44,7 +44,7 @@ const struct trace_print_flags vmaflag_names[] = {
 
 void __dump_page(struct page *page, const char *reason)
 {
-	struct address_space *mapping = page_mapping(page);
+	struct address_space *mapping;
 	bool page_poisoned = PagePoisoned(page);
 	int mapcount;
 
@@ -58,6 +58,8 @@ void __dump_page(struct page *page, const char *reason)
 		goto hex_only;
 	}
 
+	mapping = page_mapping(page);
+
 	/*
 	 * Avoid VM_BUG_ON() in page_mapcount().
 	 * page->_mapcount space in struct page is used by sl[aou]b pages to
diff --git a/mm/gup.c b/mm/gup.c
index 05acd7e2eb22..75029649baca 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1674,7 +1674,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		if (!pmd_present(pmd))
 			return 0;
 
-		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
+		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
+			     pmd_devmap(pmd))) {
 			/*
 			 * NUMA hinting faults need to be handled in the GUP
 			 * slowpath for accounting purposes and so that they
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 745088810965..afef61656c1e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3238,7 +3238,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	struct page *ptepage;
 	unsigned long addr;
 	int cow;
-	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 	struct mmu_notifier_range range;
@@ -3250,23 +3249,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		mmu_notifier_range_init(&range, src, vma->vm_start,
 					vma->vm_end);
 		mmu_notifier_invalidate_range_start(&range);
-	} else {
-		/*
-		 * For shared mappings i_mmap_rwsem must be held to call
-		 * huge_pte_alloc, otherwise the returned ptep could go
-		 * away if part of a shared pmd and another thread calls
-		 * huge_pmd_unshare.
-		 */
-		i_mmap_lock_read(mapping);
 	}
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 		spinlock_t *src_ptl, *dst_ptl;
-
 		src_pte = huge_pte_offset(src, addr, sz);
 		if (!src_pte)
 			continue;
-
 		dst_pte = huge_pte_alloc(dst, addr, sz);
 		if (!dst_pte) {
 			ret = -ENOMEM;
@@ -3337,8 +3326,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 
 	if (cow)
 		mmu_notifier_invalidate_range_end(&range);
-	else
-		i_mmap_unlock_read(mapping);
 
 	return ret;
 }
@@ -3755,16 +3742,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	}
 
 	/*
-	 * We can not race with truncation due to holding i_mmap_rwsem.
-	 * Check once here for faults beyond end of file.
+	 * Use page lock to guard against racing truncation
+	 * before we get page_table_lock.
 	 */
-	size = i_size_read(mapping->host) >> huge_page_shift(h);
-	if (idx >= size)
-		goto out;
-
 retry:
 	page = find_lock_page(mapping, idx);
 	if (!page) {
+		size = i_size_read(mapping->host) >> huge_page_shift(h);
+		if (idx >= size)
+			goto out;
+
 		/*
 		 * Check for page in userfault range
 		 */
@@ -3784,18 +3771,14 @@ retry:
 			};
 
 			/*
-			 * hugetlb_fault_mutex and i_mmap_rwsem must be
-			 * dropped before handling userfault. Reacquire
-			 * after handling fault to make calling code simpler.
+			 * hugetlb_fault_mutex must be dropped before
+			 * handling userfault. Reacquire after handling
+			 * fault to make calling code simpler.
 			 */
 			hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
 							idx, haddr);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-			i_mmap_unlock_read(mapping);
-
 			ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-
-			i_mmap_lock_read(mapping);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 			goto out;
 		}
@@ -3854,6 +3837,9 @@ retry:
 	}
 
 	ptl = huge_pte_lock(h, mm, ptep);
+	size = i_size_read(mapping->host) >> huge_page_shift(h);
+	if (idx >= size)
+		goto backout;
 
 	ret = 0;
 	if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3940,11 +3926,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 	if (ptep) {
-		/*
-		 * Since we hold no locks, ptep could be stale. That is
-		 * OK as we are only making decisions based on content and
-		 * not actually modifying content here.
-		 */
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
 			migration_entry_wait_huge(vma, mm, ptep);
@@ -3952,33 +3933,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
 				VM_FAULT_SET_HINDEX(hstate_index(h));
+	} else {
+		ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+		if (!ptep)
+			return VM_FAULT_OOM;
 	}
 
-	/*
-	 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-	 * until finished with ptep. This serves two purposes:
-	 * 1) It prevents huge_pmd_unshare from being called elsewhere
-	 *    and making the ptep no longer valid.
-	 * 2) It synchronizes us with file truncation.
-	 *
-	 * ptep could have already be assigned via huge_pte_offset. That
-	 * is OK, as huge_pte_alloc will return the same value unless
-	 * something changed.
-	 */
 	mapping = vma->vm_file->f_mapping;
-	i_mmap_lock_read(mapping);
-	ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
-	if (!ptep) {
-		i_mmap_unlock_read(mapping);
-		return VM_FAULT_OOM;
-	}
+	idx = vma_hugecache_offset(h, vma, haddr);
 
 	/*
 	 * Serialize hugepage allocation and instantiation, so that we don't
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
-	idx = vma_hugecache_offset(h, vma, haddr);
 	hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
@@ -4066,7 +4034,6 @@ out_ptl:
 	}
 out_mutex:
 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-	i_mmap_unlock_read(mapping);
 	/*
 	 * Generally it's safe to hold refcount during waiting page lock. But
 	 * here we just wait to defer the next page fault to avoid busy loop and
@@ -4301,7 +4268,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				break;
 			}
 			if (ret & VM_FAULT_RETRY) {
-				if (nonblocking)
+				if (nonblocking &&
+				    !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
 					*nonblocking = 0;
 				*nr_pages = 0;
 				/*
@@ -4671,12 +4639,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
  * and returns the corresponding pte. While this is not necessary for the
  * !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner.
- *
- * This routine must be called with i_mmap_rwsem held in at least read mode.
- * For hugetlbfs, this prevents removal of any page table entries associated
- * with the address space. This is important as we are setting up sharing
- * based on existing page table entries (mappings).
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
  */
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 {
@@ -4693,6 +4659,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
 
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -4722,6 +4689,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
+	i_mmap_unlock_write(mapping);
 	return pte;
 }
 
@@ -4732,7 +4700,7 @@ out:
  * indicated by page_count > 1, unmap is achieved by clearing pud and
  * decrementing the ref count. If count == 1, the pte page is not shared.
  *
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * called with page table lock held.
  *
  * returns: 1 successfully unmapped a shared pte page
  *	    0 the underlying pte page is not shared, or it is the last user
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 0a14fcff70ed..5d1065efbd47 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -5,7 +5,10 @@ UBSAN_SANITIZE_generic.o := n
 UBSAN_SANITIZE_tags.o := n
 KCOV_INSTRUMENT := n
 
+CFLAGS_REMOVE_common.o = -pg
 CFLAGS_REMOVE_generic.o = -pg
+CFLAGS_REMOVE_tags.o = -pg
+
 # Function splitter causes unnecessary splits in __asan_load1/__asan_store1
 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
 
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 03d5d1374ca7..09b534fbba17 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -298,8 +298,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
 		return;
 	}
 
-	cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE);
-
 	*flags |= SLAB_KASAN;
 }
 
@@ -349,28 +347,48 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
 }
 
 /*
- * Since it's desirable to only call object contructors once during slab
- * allocation, we preassign tags to all such objects. Also preassign tags for
- * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports.
- * For SLAB allocator we can't preassign tags randomly since the freelist is
- * stored as an array of indexes instead of a linked list. Assign tags based
- * on objects indexes, so that objects that are next to each other get
- * different tags.
- * After a tag is assigned, the object always gets allocated with the same tag.
- * The reason is that we can't change tags for objects with constructors on
- * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor
- * code can save the pointer to the object somewhere (e.g. in the object
- * itself). Then if we retag it, the old saved pointer will become invalid.
+ * This function assigns a tag to an object considering the following:
+ * 1. A cache might have a constructor, which might save a pointer to a slab
+ *    object somewhere (e.g. in the object itself). We preassign a tag for
+ *    each object in caches with constructors during slab creation and reuse
+ *    the same tag each time a particular object is allocated.
+ * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be
+ *    accessed after being freed. We preassign tags for objects in these
+ *    caches as well.
+ * 3. For SLAB allocator we can't preassign tags randomly since the freelist
+ *    is stored as an array of indexes instead of a linked list. Assign tags
+ *    based on objects indexes, so that objects that are next to each other
+ *    get different tags.
  */
-static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new)
+static u8 assign_tag(struct kmem_cache *cache, const void *object,
+			bool init, bool keep_tag)
 {
+	/*
+	 * 1. When an object is kmalloc()'ed, two hooks are called:
+	 *    kasan_slab_alloc() and kasan_kmalloc(). We assign the
+	 *    tag only in the first one.
+	 * 2. We reuse the same tag for krealloc'ed objects.
+	 */
+	if (keep_tag)
+		return get_tag(object);
+
+	/*
+	 * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
+	 * set, assign a tag when the object is being allocated (init == false).
+	 */
 	if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
-		return new ? KASAN_TAG_KERNEL : random_tag();
+		return init ? KASAN_TAG_KERNEL : random_tag();
 
+	/* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
 #ifdef CONFIG_SLAB
+	/* For SLAB assign tags based on the object index in the freelist. */
 	return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
 #else
-	return new ? random_tag() : get_tag(object);
+	/*
+	 * For SLUB assign a random tag during slab creation, otherwise reuse
+	 * the already assigned tag.
+	 */
+	return init ? random_tag() : get_tag(object);
 #endif
 }
 
@@ -386,17 +404,12 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
 	__memset(alloc_info, 0, sizeof(*alloc_info));
 
 	if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
-		object = set_tag(object, assign_tag(cache, object, true));
+		object = set_tag(object,
+				assign_tag(cache, object, true, false));
 
 	return (void *)object;
 }
 
-void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
-					gfp_t flags)
-{
-	return kasan_kmalloc(cache, object, cache->object_size, flags);
-}
-
 static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
 {
 	if (IS_ENABLED(CONFIG_KASAN_GENERIC))
@@ -452,8 +465,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
 	return __kasan_slab_free(cache, object, ip, true);
 }
 
-void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
-				size_t size, gfp_t flags)
+static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
+				size_t size, gfp_t flags, bool keep_tag)
 {
 	unsigned long redzone_start;
 	unsigned long redzone_end;
@@ -471,7 +484,7 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
 				KASAN_SHADOW_SCALE_SIZE);
 
 	if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
-		tag = assign_tag(cache, object, false);
+		tag = assign_tag(cache, object, false, keep_tag);
 
 	/* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
 	kasan_unpoison_shadow(set_tag(object, tag), size);
@@ -483,6 +496,18 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
 
 	return set_tag(object, tag);
 }
+
+void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
+					gfp_t flags)
+{
+	return __kasan_kmalloc(cache, object, cache->object_size, flags, false);
+}
+
+void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
+				size_t size, gfp_t flags)
+{
+	return __kasan_kmalloc(cache, object, size, flags, true);
+}
 EXPORT_SYMBOL(kasan_kmalloc);
 
 void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
@@ -522,7 +547,8 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
 	if (unlikely(!PageSlab(page)))
 		return kasan_kmalloc_large(object, size, flags);
 	else
-		return kasan_kmalloc(page->slab_cache, object, size, flags);
+		return __kasan_kmalloc(page->slab_cache, object, size,
+					flags, true);
 }
 
 void kasan_poison_kfree(void *ptr, unsigned long ip)
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 0777649e07c4..63fca3172659 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -46,7 +46,7 @@ void kasan_init_tags(void)
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		per_cpu(prng_state, cpu) = get_random_u32();
+		per_cpu(prng_state, cpu) = (u32)get_cycles();
 }
 
 /*
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f9d9dc250428..707fa5579f66 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -574,6 +574,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 	unsigned long flags;
 	struct kmemleak_object *object, *parent;
 	struct rb_node **link, *rb_parent;
+	unsigned long untagged_ptr;
 
 	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
 	if (!object) {
@@ -619,8 +620,9 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 
 	write_lock_irqsave(&kmemleak_lock, flags);
 
-	min_addr = min(min_addr, ptr);
-	max_addr = max(max_addr, ptr + size);
+	untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+	min_addr = min(min_addr, untagged_ptr);
+	max_addr = max(max_addr, untagged_ptr + size);
 	link = &object_tree_root.rb_node;
 	rb_parent = NULL;
 	while (*link) {
@@ -1333,6 +1335,7 @@ static void scan_block(void *_start, void *_end,
 	unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
 	unsigned long *end = _end - (BYTES_PER_POINTER - 1);
 	unsigned long flags;
+	unsigned long untagged_ptr;
 
 	read_lock_irqsave(&kmemleak_lock, flags);
 	for (ptr = start; ptr < end; ptr++) {
@@ -1347,7 +1350,8 @@ static void scan_block(void *_start, void *_end,
 		pointer = *ptr;
 		kasan_enable_current();
 
-		if (pointer < min_addr || pointer >= max_addr)
+		untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer);
+		if (untagged_ptr < min_addr || untagged_ptr >= max_addr)
 			continue;
 
 		/*
diff --git a/mm/memblock.c b/mm/memblock.c
index 022d4cbb3618..ea31045ba704 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -26,6 +26,13 @@
 
 #include "internal.h"
 
+#define INIT_MEMBLOCK_REGIONS			128
+#define INIT_PHYSMEM_REGIONS			4
+
+#ifndef INIT_MEMBLOCK_RESERVED_REGIONS
+# define INIT_MEMBLOCK_RESERVED_REGIONS		INIT_MEMBLOCK_REGIONS
+#endif
+
 /**
  * DOC: memblock overview
  *
@@ -92,7 +99,7 @@ unsigned long max_pfn;
 unsigned long long max_possible_pfn;
 
 static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
-static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
 static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
 #endif
@@ -105,7 +112,7 @@ struct memblock memblock __initdata_memblock = {
 
 	.reserved.regions	= memblock_reserved_init_regions,
 	.reserved.cnt		= 1,	/* empty dummy entry */
-	.reserved.max		= INIT_MEMBLOCK_REGIONS,
+	.reserved.max		= INIT_MEMBLOCK_RESERVED_REGIONS,
 	.reserved.name		= "reserved",
 
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6379fff1a5ff..831be5ff5f4d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -372,7 +372,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
 		if (fail || tk->addr_valid == 0) {
 			pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
 			       pfn, tk->tsk->comm, tk->tsk->pid);
-			force_sig(SIGKILL, tk->tsk);
+			do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
+					 tk->tsk, PIDTYPE_PID);
 		}
 
 		/*
@@ -966,7 +967,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
 	struct address_space *mapping;
 	LIST_HEAD(tokill);
-	bool unmap_success = true;
+	bool unmap_success;
 	int kill = 1, forcekill;
 	struct page *hpage = *hpagep;
 	bool mlocked = PageMlocked(hpage);
@@ -1028,19 +1029,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	if (kill)
 		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
 
-	if (!PageHuge(hpage)) {
-		unmap_success = try_to_unmap(hpage, ttu);
-	} else if (mapping) {
-		/*
-		 * For hugetlb pages, try_to_unmap could potentially call
-		 * huge_pmd_unshare. Because of this, take semaphore in
-		 * write mode here and set TTU_RMAP_LOCKED to indicate we
-		 * have taken the lock at this higer level.
-		 */
-		i_mmap_lock_write(mapping);
-		unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
-		i_mmap_unlock_write(mapping);
-	}
+	unmap_success = try_to_unmap(hpage, ttu);
 	if (!unmap_success)
 		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
 		       pfn, page_mapcount(hpage));
diff --git a/mm/memory.c b/mm/memory.c
index a52663c0612d..e11ca9dd823f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2994,6 +2994,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	vm_fault_t ret;
 
+	/*
+	 * Preallocate pte before we take page_lock because this might lead to
+	 * deadlocks for memcg reclaim which waits for pages under writeback:
+	 *				lock_page(A)
+	 *				SetPageWriteback(A)
+	 *				unlock_page(A)
+	 * lock_page(B)
+	 *				lock_page(B)
+	 * pte_alloc_pne
+	 *   shrink_page_list
+	 *     wait_on_page_writeback(A)
+	 *				SetPageWriteback(B)
+	 *				unlock_page(B)
+	 *				# flush A, B to clear the writeback
+	 */
+	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+		if (!vmf->prealloc_pte)
+			return VM_FAULT_OOM;
+		smp_wmb(); /* See comment in __pte_alloc() */
+	}
+
 	ret = vma->vm_ops->fault(vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
 			    VM_FAULT_DONE_COW)))
@@ -4077,8 +4099,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
 		goto out;
 
 	if (range) {
-		range->start = address & PAGE_MASK;
-		range->end = range->start + PAGE_SIZE;
+		mmu_notifier_range_init(range, mm, address & PAGE_MASK,
+					(address & PAGE_MASK) + PAGE_SIZE);
 		mmu_notifier_invalidate_range_start(range);
 	}
 	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b9a667d36c55..1ad28323fb9f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1188,11 +1188,13 @@ static inline int pageblock_free(struct page *page)
 	return PageBuddy(page) && page_order(page) >= pageblock_order;
 }
 
-/* Return the start of the next active pageblock after a given page */
-static struct page *next_active_pageblock(struct page *page)
+/* Return the pfn of the start of the next active pageblock after a given pfn */
+static unsigned long next_active_pageblock(unsigned long pfn)
 {
+	struct page *page = pfn_to_page(pfn);
+
 	/* Ensure the starting page is pageblock-aligned */
-	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
+	BUG_ON(pfn & (pageblock_nr_pages - 1));
 
 	/* If the entire pageblock is free, move to the end of free page */
 	if (pageblock_free(page)) {
@@ -1200,16 +1202,16 @@ static struct page *next_active_pageblock(struct page *page)
 		/* be careful. we don't have locks, page_order can be changed.*/
 		order = page_order(page);
 		if ((order < MAX_ORDER) && (order >= pageblock_order))
-			return page + (1 << order);
+			return pfn + (1 << order);
 	}
 
-	return page + pageblock_nr_pages;
+	return pfn + pageblock_nr_pages;
 }
 
-static bool is_pageblock_removable_nolock(struct page *page)
+static bool is_pageblock_removable_nolock(unsigned long pfn)
 {
+	struct page *page = pfn_to_page(pfn);
 	struct zone *zone;
-	unsigned long pfn;
 
 	/*
 	 * We have to be careful here because we are iterating over memory
@@ -1232,12 +1234,14 @@ static bool is_pageblock_removable_nolock(struct page *page)
 /* Checks if this range of memory is likely to be hot-removable. */
 bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
 {
-	struct page *page = pfn_to_page(start_pfn);
-	struct page *end_page = page + nr_pages;
+	unsigned long end_pfn, pfn;
+
+	end_pfn = min(start_pfn + nr_pages,
+		      zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
 
 	/* Check the starting page of each pageblock within the range */
-	for (; page < end_page; page = next_active_pageblock(page)) {
-		if (!is_pageblock_removable_nolock(page))
+	for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
+		if (!is_pageblock_removable_nolock(pfn))
 			return false;
 		cond_resched();
 	}
@@ -1273,6 +1277,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
 		i++;
 		if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
 			continue;
+		/* Check if we got outside of the zone */
+		if (zone && !zone_spans_pfn(zone, pfn + i))
+			return 0;
 		page = pfn_to_page(pfn + i);
 		if (zone && page_zone(page) != zone)
 			return 0;
@@ -1301,23 +1308,27 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
 static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
 {
 	unsigned long pfn;
-	struct page *page;
+
 	for (pfn = start; pfn < end; pfn++) {
-		if (pfn_valid(pfn)) {
-			page = pfn_to_page(pfn);
-			if (PageLRU(page))
-				return pfn;
-			if (__PageMovable(page))
-				return pfn;
-			if (PageHuge(page)) {
-				if (hugepage_migration_supported(page_hstate(page)) &&
-				    page_huge_active(page))
-					return pfn;
-				else
-					pfn = round_up(pfn + 1,
-						1 << compound_order(page)) - 1;
-			}
-		}
+		struct page *page, *head;
+		unsigned long skip;
+
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		if (PageLRU(page))
+			return pfn;
+		if (__PageMovable(page))
+			return pfn;
+
+		if (!PageHuge(page))
+			continue;
+		head = compound_head(page);
+		if (hugepage_migration_supported(page_hstate(head)) &&
+		    page_huge_active(head))
+			return pfn;
+		skip = (1 << compound_order(head)) - (page - head);
+		pfn += skip - 1;
 	}
 	return 0;
 }
@@ -1344,7 +1355,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long pfn;
 	struct page *page;
-	int not_managed = 0;
 	int ret = 0;
 	LIST_HEAD(source);
 
@@ -1392,7 +1402,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		else
 			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
 		if (!ret) { /* Success */
-			put_page(page);
 			list_add_tail(&page->lru, &source);
 			if (!__PageMovable(page))
 				inc_node_page_state(page, NR_ISOLATED_ANON +
@@ -1401,22 +1410,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		} else {
 			pr_warn("failed to isolate pfn %lx\n", pfn);
 			dump_page(page, "isolation failed");
-			put_page(page);
-			/* Because we don't have big zone->lock. we should
-			   check this again here. */
-			if (page_count(page)) {
-				not_managed++;
-				ret = -EBUSY;
-				break;
-			}
 		}
+		put_page(page);
 	}
 	if (!list_empty(&source)) {
-		if (not_managed) {
-			putback_movable_pages(&source);
-			goto out;
-		}
-
 		/* Allocate a new page from the nearest neighbor node */
 		ret = migrate_pages(&source, new_node_page, NULL, 0,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
@@ -1429,7 +1426,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 			putback_movable_pages(&source);
 		}
 	}
-out:
+
 	return ret;
 }
 
@@ -1576,7 +1573,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	   we assume this for now. .*/
 	if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
 				  &valid_end)) {
-		mem_hotplug_done();
 		ret = -EINVAL;
 		reason = "multizone range";
 		goto failed_removal;
@@ -1591,7 +1587,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
 				       MIGRATE_MOVABLE,
 				       SKIP_HWPOISON | REPORT_FAILURE);
 	if (ret) {
-		mem_hotplug_done();
 		reason = "failure to isolate range";
 		goto failed_removal;
 	}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d4496d9d34f5..ee2bce59d2bf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1314,7 +1314,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 			      nodemask_t *nodes)
 {
 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
 
 	if (copy > nbytes) {
 		if (copy > PAGE_SIZE)
@@ -1491,7 +1491,7 @@ static int kernel_get_mempolicy(int __user *policy,
 	int uninitialized_var(pval);
 	nodemask_t nodes;
 
-	if (nmask != NULL && maxnode < MAX_NUMNODES)
+	if (nmask != NULL && maxnode < nr_node_ids)
 		return -EINVAL;
 
 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
@@ -1527,7 +1527,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
 	unsigned long nr_bits, alloc_size;
 	DECLARE_BITMAP(bm, MAX_NUMNODES);
 
-	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 
 	if (nmask)
diff --git a/mm/migrate.c b/mm/migrate.c
index ccf8966caf6f..d4fd680be3b0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -709,7 +709,6 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 	/* Simple case, sync compaction */
 	if (mode != MIGRATE_ASYNC) {
 		do {
-			get_bh(bh);
 			lock_buffer(bh);
 			bh = bh->b_this_page;
 
@@ -720,18 +719,15 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 
 	/* async case, we cannot block on lock_buffer so use trylock_buffer */
 	do {
-		get_bh(bh);
 		if (!trylock_buffer(bh)) {
 			/*
 			 * We failed to lock the buffer and cannot stall in
 			 * async migration. Release the taken locks
 			 */
 			struct buffer_head *failed_bh = bh;
-			put_bh(failed_bh);
 			bh = head;
 			while (bh != failed_bh) {
 				unlock_buffer(bh);
-				put_bh(bh);
 				bh = bh->b_this_page;
 			}
 			return false;
@@ -818,7 +814,6 @@ unlock_buffers:
 	bh = head;
 	do {
 		unlock_buffer(bh);
-		put_bh(bh);
 		bh = bh->b_this_page;
 
 	} while (bh != head);
@@ -1135,10 +1130,13 @@ out:
 	 * If migration is successful, decrease refcount of the newpage
 	 * which will not free the page because new page owner increased
 	 * refcounter. As well, if it is LRU page, add the page to LRU
-	 * list in here.
+	 * list in here. Use the old state of the isolated source page to
+	 * determine if we migrated a LRU page. newpage was already unlocked
+	 * and possibly modified by its owner - don't rely on the page
+	 * state.
 	 */
 	if (rc == MIGRATEPAGE_SUCCESS) {
-		if (unlikely(__PageMovable(newpage)))
+		if (unlikely(!is_lru))
 			put_page(newpage);
 		else
 			putback_lru_page(newpage);
@@ -1324,19 +1322,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 		goto put_anon;
 
 	if (page_mapped(hpage)) {
-		struct address_space *mapping = page_mapping(hpage);
-
-		/*
-		 * try_to_unmap could potentially call huge_pmd_unshare.
-		 * Because of this, take semaphore in write mode here and
-		 * set TTU_RMAP_LOCKED to let lower levels know we have
-		 * taken the lock.
-		 */
-		i_mmap_lock_write(mapping);
 		try_to_unmap(hpage,
-			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
-			TTU_RMAP_LOCKED);
-		i_mmap_unlock_write(mapping);
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 		page_was_mapped = 1;
 	}
 
diff --git a/mm/mincore.c b/mm/mincore.c
index f0f91461a9f4..218099b5ed31 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -42,14 +42,72 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
 	return 0;
 }
 
-static int mincore_unmapped_range(unsigned long addr, unsigned long end,
-				   struct mm_walk *walk)
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+{
+	unsigned char present = 0;
+	struct page *page;
+
+	/*
+	 * When tmpfs swaps out a page from a file, any process mapping that
+	 * file will not get a swp_entry_t in its pte, but rather it is like
+	 * any other file mapping (ie. marked !present and faulted in with
+	 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
+	 */
+#ifdef CONFIG_SWAP
+	if (shmem_mapping(mapping)) {
+		page = find_get_entry(mapping, pgoff);
+		/*
+		 * shmem/tmpfs may return swap: account for swapcache
+		 * page too.
+		 */
+		if (xa_is_value(page)) {
+			swp_entry_t swp = radix_to_swp_entry(page);
+			page = find_get_page(swap_address_space(swp),
+					     swp_offset(swp));
+		}
+	} else
+		page = find_get_page(mapping, pgoff);
+#else
+	page = find_get_page(mapping, pgoff);
+#endif
+	if (page) {
+		present = PageUptodate(page);
+		put_page(page);
+	}
+
+	return present;
+}
+
+static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
+				   struct vm_area_struct *vma, unsigned char *vec)
 {
-	unsigned char *vec = walk->private;
 	unsigned long nr = (end - addr) >> PAGE_SHIFT;
+	int i;
 
-	memset(vec, 0, nr);
-	walk->private += nr;
+	if (vma->vm_file) {
+		pgoff_t pgoff;
+
+		pgoff = linear_page_index(vma, addr);
+		for (i = 0; i < nr; i++, pgoff++)
+			vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+	} else {
+		for (i = 0; i < nr; i++)
+			vec[i] = 0;
+	}
+	return nr;
+}
+
+static int mincore_unmapped_range(unsigned long addr, unsigned long end,
+				   struct mm_walk *walk)
+{
+	walk->private += __mincore_unmapped_range(addr, end,
+						  walk->vma, walk->private);
 	return 0;
 }
 
@@ -69,9 +127,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		goto out;
 	}
 
-	/* We'll consider a THP page under construction to be there */
 	if (pmd_trans_unstable(pmd)) {
-		memset(vec, 1, nr);
+		__mincore_unmapped_range(addr, end, vma, vec);
 		goto out;
 	}
 
@@ -80,17 +137,28 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		pte_t pte = *ptep;
 
 		if (pte_none(pte))
-			*vec = 0;
+			__mincore_unmapped_range(addr, addr + PAGE_SIZE,
+						 vma, vec);
 		else if (pte_present(pte))
 			*vec = 1;
 		else { /* pte is a swap entry */
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
-			/*
-			 * migration or hwpoison entries are always
-			 * uptodate
-			 */
-			*vec = !!non_swap_entry(entry);
+			if (non_swap_entry(entry)) {
+				/*
+				 * migration or hwpoison entries are always
+				 * uptodate
+				 */
+				*vec = 1;
+			} else {
+#ifdef CONFIG_SWAP
+				*vec = mincore_page(swap_address_space(entry),
+						    swp_offset(entry));
+#else
+				WARN_ON(1);
+				*vec = 1;
+#endif
+			}
 		}
 		vec++;
 	}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f0e8cd9edb1a..26ea8636758f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -647,8 +647,8 @@ static int oom_reaper(void *unused)
 
 static void wake_oom_reaper(struct task_struct *tsk)
 {
-	/* tsk is already queued? */
-	if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+	/* mm is already queued? */
+	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
 		return;
 
 	get_task_struct(tsk);
@@ -975,6 +975,13 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	 * still freeing memory.
 	 */
 	read_lock(&tasklist_lock);
+
+	/*
+	 * The task 'p' might have already exited before reaching here. The
+	 * put_task_struct() will free task_struct 'p' while the loop still try
+	 * to access the field of 'p', so, get an extra reference.
+	 */
+	get_task_struct(p);
 	for_each_thread(p, t) {
 		list_for_each_entry(child, &t->children, sibling) {
 			unsigned int child_points;
@@ -994,6 +1001,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 			}
 		}
 	}
+	put_task_struct(p);
 	read_unlock(&tasklist_lock);
 
 	/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cde5dac6229a..0b9f577b1a2a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2170,6 +2170,18 @@ static inline void boost_watermark(struct zone *zone)
 
 	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
 			watermark_boost_factor, 10000);
+
+	/*
+	 * high watermark may be uninitialised if fragmentation occurs
+	 * very early in boot so do not boost. We do not fall
+	 * through and boost by pageblock_nr_pages as failing
+	 * allocations that early means that reclaim is not going
+	 * to help and it may even be impossible to reclaim the
+	 * boosted watermark resulting in a hang.
+	 */
+	if (!max_boost)
+		return;
+
 	max_boost = max(pageblock_nr_pages, max_boost);
 
 	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
@@ -2214,7 +2226,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
 	 */
 	boost_watermark(zone);
 	if (alloc_flags & ALLOC_KSWAPD)
-		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+		set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
 
 	/* We are not allowed to try stealing from the whole block */
 	if (!whole_block)
@@ -3102,6 +3114,12 @@ struct page *rmqueue(struct zone *preferred_zone,
 	local_irq_restore(flags);
 
 out:
+	/* Separate test+clear to avoid unnecessary atomics */
+	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+	}
+
 	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
 	return page;
 
@@ -4669,11 +4687,11 @@ refill:
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
-		page_ref_add(page, size - 1);
+		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pfmemalloc = page_is_pfmemalloc(page);
-		nc->pagecnt_bias = size;
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
 		nc->offset = size;
 	}
 
@@ -4689,10 +4707,10 @@ refill:
 		size = nc->size;
 #endif
 		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, size);
+		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
 
 		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = size;
+		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
 		offset = size - fragsz;
 	}
 
@@ -5695,18 +5713,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 			cond_resched();
 		}
 	}
-#ifdef CONFIG_SPARSEMEM
-	/*
-	 * If the zone does not span the rest of the section then
-	 * we should at least initialize those pages. Otherwise we
-	 * could blow up on a poisoned page in some paths which depend
-	 * on full sections being initialized (e.g. memory hotplug).
-	 */
-	while (end_pfn % PAGES_PER_SECTION) {
-		__init_single_page(pfn_to_page(end_pfn), end_pfn, zone, nid);
-		end_pfn++;
-	}
-#endif
 }
 
 #ifdef CONFIG_ZONE_DEVICE
diff --git a/mm/page_ext.c b/mm/page_ext.c
index ae44f7adbe07..8c78b8d45117 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -398,10 +398,8 @@ void __init page_ext_init(void)
 		 * We know some arch can have a nodes layout such as
 		 * -------------pfn-------------->
 		 * N0 | N1 | N2 | N0 | N1 | N2|....
-		 *
-		 * Take into account DEFERRED_STRUCT_PAGE_INIT.
 		 */
-		if (early_pfn_to_nid(pfn) != nid)
+		if (pfn_to_nid(pfn) != nid)
 			continue;
 		if (init_section_page_ext(pfn, nid))
 			goto oom;
diff --git a/mm/rmap.c b/mm/rmap.c
index 21a26cf51114..0454ecc29537 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,7 +25,6 @@
  * page->flags PG_locked (lock_page)
  * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
  * mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
  * anon_vma->rwsem
  * mm->page_table_lock or pte_lock
  * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -1372,16 +1371,13 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	 * Note that the page can not be free in this function as call of
 	 * try_to_unmap() must hold a reference on the page.
 	 */
-	mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start,
-				min(vma->vm_end, vma->vm_start +
+	mmu_notifier_range_init(&range, vma->vm_mm, address,
+				min(vma->vm_end, address +
 				    (PAGE_SIZE << compound_order(page))));
 	if (PageHuge(page)) {
 		/*
 		 * If sharing is possible, start and end will be adjusted
 		 * accordingly.
-		 *
-		 * If called for a huge page, caller must hold i_mmap_rwsem
-		 * in write mode as it is possible to call huge_pmd_unshare.
 		 */
 		adjust_range_if_pmd_sharing_possible(vma, &range.start,
 						      &range.end);
diff --git a/mm/shmem.c b/mm/shmem.c
index 6ece1e2fe76e..0905215fb016 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2854,10 +2854,14 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
2854 * No ordinary (disk based) filesystem counts links as inodes; 2854 * No ordinary (disk based) filesystem counts links as inodes;
2855 * but each new link needs a new dentry, pinning lowmem, and 2855 * but each new link needs a new dentry, pinning lowmem, and
2856 * tmpfs dentries cannot be pruned until they are unlinked. 2856 * tmpfs dentries cannot be pruned until they are unlinked.
2857 * But if an O_TMPFILE file is linked into the tmpfs, the
2858 * first link must skip that, to get the accounting right.
2857 */ 2859 */
2858 ret = shmem_reserve_inode(inode->i_sb); 2860 if (inode->i_nlink) {
2859 if (ret) 2861 ret = shmem_reserve_inode(inode->i_sb);
2860 goto out; 2862 if (ret)
2863 goto out;
2864 }
2861 2865
2862 dir->i_size += BOGO_DIRENT_SIZE; 2866 dir->i_size += BOGO_DIRENT_SIZE;
2863 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); 2867 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
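The new i_nlink check skips the inode reservation for the first link of an O_TMPFILE, which already consumed its reservation at creation time. A hedged userspace sketch of the same decision, with toy types in place of the shmem structures:

	#include <stdio.h>

	struct toy_inode {
		unsigned int i_nlink;
	};

	/* Stand-in for shmem_reserve_inode(): pretend the reservation succeeds. */
	static int reserve_inode(void)
	{
		puts("reserved one inode");
		return 0;
	}

	static int toy_shmem_link(struct toy_inode *inode)
	{
		int ret;

		/*
		 * An O_TMPFILE arrives here with i_nlink == 0 and was already
		 * accounted when it was created, so only charge a reservation
		 * for ordinary links.
		 */
		if (inode->i_nlink) {
			ret = reserve_inode();
			if (ret)
				return ret;
		}
		inode->i_nlink++;
		return 0;
	}

	int main(void)
	{
		struct toy_inode tmpfile = { .i_nlink = 0 };
		struct toy_inode regular = { .i_nlink = 1 };

		toy_shmem_link(&tmpfile);	/* no extra reservation */
		toy_shmem_link(&regular);	/* reserves one inode */
		return 0;
	}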
diff --git a/mm/slab.c b/mm/slab.c
index 73fe23e649c9..91c1863df93d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -666,8 +666,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
666 struct alien_cache *alc = NULL; 666 struct alien_cache *alc = NULL;
667 667
668 alc = kmalloc_node(memsize, gfp, node); 668 alc = kmalloc_node(memsize, gfp, node);
669 init_arraycache(&alc->ac, entries, batch); 669 if (alc) {
670 spin_lock_init(&alc->lock); 670 init_arraycache(&alc->ac, entries, batch);
671 spin_lock_init(&alc->lock);
672 }
671 return alc; 673 return alc;
672} 674}
673 675
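The hunk above guards the array-cache initialization behind the allocation result. A trivial standalone sketch of the same pattern, with toy_alien_cache as a made-up type:

	#include <stdio.h>
	#include <stdlib.h>

	struct toy_alien_cache {
		int entries;
		int lock;	/* placeholder for the real spinlock */
	};

	static struct toy_alien_cache *alloc_alien_cache(int entries)
	{
		struct toy_alien_cache *alc = malloc(sizeof(*alc));

		/* Initialize only when the allocation actually succeeded. */
		if (alc) {
			alc->entries = entries;
			alc->lock = 0;
		}
		return alc;	/* the caller must still handle NULL */
	}

	int main(void)
	{
		struct toy_alien_cache *alc = alloc_alien_cache(8);

		if (!alc)
			return 1;
		printf("entries = %d\n", alc->entries);
		free(alc);
		return 0;
	}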
@@ -2357,7 +2359,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
2357 void *freelist; 2359 void *freelist;
2358 void *addr = page_address(page); 2360 void *addr = page_address(page);
2359 2361
2360 page->s_mem = kasan_reset_tag(addr) + colour_off; 2362 page->s_mem = addr + colour_off;
2361 page->active = 0; 2363 page->active = 0;
2362 2364
2363 if (OBJFREELIST_SLAB(cachep)) 2365 if (OBJFREELIST_SLAB(cachep))
@@ -2366,6 +2368,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
2366 /* Slab management obj is off-slab. */ 2368 /* Slab management obj is off-slab. */
2367 freelist = kmem_cache_alloc_node(cachep->freelist_cache, 2369 freelist = kmem_cache_alloc_node(cachep->freelist_cache,
2368 local_flags, nodeid); 2370 local_flags, nodeid);
2371 freelist = kasan_reset_tag(freelist);
2369 if (!freelist) 2372 if (!freelist)
2370 return NULL; 2373 return NULL;
2371 } else { 2374 } else {
@@ -2679,6 +2682,13 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
2679 2682
2680 offset *= cachep->colour_off; 2683 offset *= cachep->colour_off;
2681 2684
2685 /*
2686 * Call kasan_poison_slab() before calling alloc_slabmgmt(), so
2687 * page_address() in the latter returns a non-tagged pointer,
2688 * as it should be for slab pages.
2689 */
2690 kasan_poison_slab(page);
2691
2682 /* Get slab management. */ 2692 /* Get slab management. */
2683 freelist = alloc_slabmgmt(cachep, page, offset, 2693 freelist = alloc_slabmgmt(cachep, page, offset,
2684 local_flags & ~GFP_CONSTRAINT_MASK, page_node); 2694 local_flags & ~GFP_CONSTRAINT_MASK, page_node);
@@ -2687,7 +2697,6 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
2687 2697
2688 slab_map_pages(cachep, page, freelist); 2698 slab_map_pages(cachep, page, freelist);
2689 2699
2690 kasan_poison_slab(page);
2691 cache_init_objs(cachep, page); 2700 cache_init_objs(cachep, page);
2692 2701
2693 if (gfpflags_allow_blocking(local_flags)) 2702 if (gfpflags_allow_blocking(local_flags))
@@ -3538,7 +3547,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3538{ 3547{
3539 void *ret = slab_alloc(cachep, flags, _RET_IP_); 3548 void *ret = slab_alloc(cachep, flags, _RET_IP_);
3540 3549
3541 ret = kasan_slab_alloc(cachep, ret, flags);
3542 trace_kmem_cache_alloc(_RET_IP_, ret, 3550 trace_kmem_cache_alloc(_RET_IP_, ret,
3543 cachep->object_size, cachep->size, flags); 3551 cachep->object_size, cachep->size, flags);
3544 3552
@@ -3628,7 +3636,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3628{ 3636{
3629 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3637 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3630 3638
3631 ret = kasan_slab_alloc(cachep, ret, flags);
3632 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3639 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3633 cachep->object_size, cachep->size, 3640 cachep->object_size, cachep->size,
3634 flags, nodeid); 3641 flags, nodeid);
@@ -4406,6 +4413,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
4406 unsigned int objnr; 4413 unsigned int objnr;
4407 unsigned long offset; 4414 unsigned long offset;
4408 4415
4416 ptr = kasan_reset_tag(ptr);
4417
4409 /* Find and validate object. */ 4418 /* Find and validate object. */
4410 cachep = page->slab_cache; 4419 cachep = page->slab_cache;
4411 objnr = obj_to_index(cachep, page, (void *)ptr); 4420 objnr = obj_to_index(cachep, page, (void *)ptr);
diff --git a/mm/slab.h b/mm/slab.h
index 4190c24ef0e9..384105318779 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -437,11 +437,10 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
437 437
438 flags &= gfp_allowed_mask; 438 flags &= gfp_allowed_mask;
439 for (i = 0; i < size; i++) { 439 for (i = 0; i < size; i++) {
440 void *object = p[i]; 440 p[i] = kasan_slab_alloc(s, p[i], flags);
441 441 /* As p[i] might get tagged, call kmemleak hook after KASAN. */
442 kmemleak_alloc_recursive(object, s->object_size, 1, 442 kmemleak_alloc_recursive(p[i], s->object_size, 1,
443 s->flags, flags); 443 s->flags, flags);
444 p[i] = kasan_slab_alloc(s, object, flags);
445 } 444 }
446 445
447 if (memcg_kmem_enabled()) 446 if (memcg_kmem_enabled())
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 81732d05e74a..f9d89c1b5977 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1228,8 +1228,9 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
1228 flags |= __GFP_COMP; 1228 flags |= __GFP_COMP;
1229 page = alloc_pages(flags, order); 1229 page = alloc_pages(flags, order);
1230 ret = page ? page_address(page) : NULL; 1230 ret = page ? page_address(page) : NULL;
1231 kmemleak_alloc(ret, size, 1, flags);
1232 ret = kasan_kmalloc_large(ret, size, flags); 1231 ret = kasan_kmalloc_large(ret, size, flags);
1232 /* As ret might get tagged, call kmemleak hook after KASAN. */
1233 kmemleak_alloc(ret, size, 1, flags);
1233 return ret; 1234 return ret;
1234} 1235}
1235EXPORT_SYMBOL(kmalloc_order); 1236EXPORT_SYMBOL(kmalloc_order);
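Both this hunk and the slab.h change above move the kmemleak call after the KASAN hook, because the hook may hand back a re-tagged pointer and the tracker should record the value callers will actually use. A standalone sketch of why the ordering matters, assuming a 64-bit platform where a toy tag can live in the unused top byte; tag_pointer() and track_allocation() are stand-ins, not kernel APIs:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Toy "KASAN": return a re-tagged representation of the pointer. */
	static void *tag_pointer(void *p)
	{
		return (void *)((uint64_t)(uintptr_t)p | (0xb8ULL << 56));
	}

	/* Toy "kmemleak": remember the pointer it was handed. */
	static void *tracked;
	static void track_allocation(void *p)
	{
		tracked = p;
	}

	int main(void)
	{
		void *raw = malloc(16);		/* keep the untagged pointer for free() */
		void *obj;

		if (!raw)
			return 1;

		/* Tag first, then track, so the tracker records the tagged pointer. */
		obj = tag_pointer(raw);
		track_allocation(obj);

		printf("tracked %p, live pointer %p -> %s\n", tracked, obj,
		       tracked == obj ? "match" : "mismatch");
		free(raw);
		return 0;
	}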
diff --git a/mm/slub.c b/mm/slub.c
index 36c0befeebd8..dc777761b6b7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -249,7 +249,18 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
249 unsigned long ptr_addr) 249 unsigned long ptr_addr)
250{ 250{
251#ifdef CONFIG_SLAB_FREELIST_HARDENED 251#ifdef CONFIG_SLAB_FREELIST_HARDENED
252 return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr); 252 /*
253 * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
254 * Normally, this doesn't cause any issues, as both set_freepointer()
255 * and get_freepointer() are called with a pointer with the same tag.
256 * However, there are some issues with CONFIG_SLUB_DEBUG code. For
257 * example, when __free_slub() iterates over objects in a cache, it
258 * passes untagged pointers to check_object(). check_object() in turn
259 * calls get_freepointer() with an untagged pointer, which causes the
260 * freepointer to be restored incorrectly.
261 */
262 return (void *)((unsigned long)ptr ^ s->random ^
263 (unsigned long)kasan_reset_tag((void *)ptr_addr));
253#else 264#else
254 return ptr; 265 return ptr;
255#endif 266#endif
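The new comment in freelist_ptr() explains that the storage address must have its tag reset before being folded into the XOR mask, so that tagged and untagged callers decode the same value. A standalone sketch of that obfuscation, assuming 64-bit values with a toy tag in the top byte; reset_tag() stands in for kasan_reset_tag():

	#include <stdint.h>
	#include <stdio.h>

	#define TAG_MASK (0xffULL << 56)

	/* Toy version of kasan_reset_tag(): drop the tag byte from the address. */
	static uint64_t reset_tag(uint64_t addr)
	{
		return addr & ~TAG_MASK;
	}

	/* XOR obfuscation of a freelist pointer, keyed by a secret and the slot address. */
	static uint64_t toy_freelist_ptr(uint64_t ptr, uint64_t secret, uint64_t slot_addr)
	{
		return ptr ^ secret ^ reset_tag(slot_addr);
	}

	int main(void)
	{
		uint64_t secret        = 0x5eedf00ddeadbeefULL;
		uint64_t next_obj      = 0x0000700000001040ULL;	/* value being stored */
		uint64_t slot_tagged   = 0xb800700000001000ULL;	/* tagged slot address */
		uint64_t slot_untagged = reset_tag(slot_tagged);	/* same slot, no tag */

		/* Encode via the tagged address, decode via the untagged one. */
		uint64_t stored  = toy_freelist_ptr(next_obj, secret, slot_tagged);
		uint64_t decoded = toy_freelist_ptr(stored, secret, slot_untagged);

		printf("decoded %#llx, expected %#llx -> %s\n",
		       (unsigned long long)decoded, (unsigned long long)next_obj,
		       decoded == next_obj ? "round-trip ok" : "corrupted");
		return 0;
	}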
@@ -303,15 +314,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
303 __p < (__addr) + (__objects) * (__s)->size; \ 314 __p < (__addr) + (__objects) * (__s)->size; \
304 __p += (__s)->size) 315 __p += (__s)->size)
305 316
306#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
307 for (__p = fixup_red_left(__s, __addr), __idx = 1; \
308 __idx <= __objects; \
309 __p += (__s)->size, __idx++)
310
311/* Determine object index from a given position */ 317/* Determine object index from a given position */
312static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) 318static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
313{ 319{
314 return (p - addr) / s->size; 320 return (kasan_reset_tag(p) - addr) / s->size;
315} 321}
316 322
317static inline unsigned int order_objects(unsigned int order, unsigned int size) 323static inline unsigned int order_objects(unsigned int order, unsigned int size)
@@ -507,6 +513,7 @@ static inline int check_valid_pointer(struct kmem_cache *s,
507 return 1; 513 return 1;
508 514
509 base = page_address(page); 515 base = page_address(page);
516 object = kasan_reset_tag(object);
510 object = restore_red_left(s, object); 517 object = restore_red_left(s, object);
511 if (object < base || object >= base + page->objects * s->size || 518 if (object < base || object >= base + page->objects * s->size ||
512 (object - base) % s->size) { 519 (object - base) % s->size) {
@@ -1075,6 +1082,16 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
1075 init_tracking(s, object); 1082 init_tracking(s, object);
1076} 1083}
1077 1084
1085static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
1086{
1087 if (!(s->flags & SLAB_POISON))
1088 return;
1089
1090 metadata_access_enable();
1091 memset(addr, POISON_INUSE, PAGE_SIZE << order);
1092 metadata_access_disable();
1093}
1094
1078static inline int alloc_consistency_checks(struct kmem_cache *s, 1095static inline int alloc_consistency_checks(struct kmem_cache *s,
1079 struct page *page, 1096 struct page *page,
1080 void *object, unsigned long addr) 1097 void *object, unsigned long addr)
@@ -1330,6 +1347,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
1330#else /* !CONFIG_SLUB_DEBUG */ 1347#else /* !CONFIG_SLUB_DEBUG */
1331static inline void setup_object_debug(struct kmem_cache *s, 1348static inline void setup_object_debug(struct kmem_cache *s,
1332 struct page *page, void *object) {} 1349 struct page *page, void *object) {}
1350static inline void setup_page_debug(struct kmem_cache *s,
1351 void *addr, int order) {}
1333 1352
1334static inline int alloc_debug_processing(struct kmem_cache *s, 1353static inline int alloc_debug_processing(struct kmem_cache *s,
1335 struct page *page, void *object, unsigned long addr) { return 0; } 1354 struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1374,8 +1393,10 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
1374 */ 1393 */
1375static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) 1394static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1376{ 1395{
1396 ptr = kasan_kmalloc_large(ptr, size, flags);
1397 /* As ptr might get tagged, call kmemleak hook after KASAN. */
1377 kmemleak_alloc(ptr, size, 1, flags); 1398 kmemleak_alloc(ptr, size, 1, flags);
1378 return kasan_kmalloc_large(ptr, size, flags); 1399 return ptr;
1379} 1400}
1380 1401
1381static __always_inline void kfree_hook(void *x) 1402static __always_inline void kfree_hook(void *x)
@@ -1641,27 +1662,25 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1641 if (page_is_pfmemalloc(page)) 1662 if (page_is_pfmemalloc(page))
1642 SetPageSlabPfmemalloc(page); 1663 SetPageSlabPfmemalloc(page);
1643 1664
1644 start = page_address(page); 1665 kasan_poison_slab(page);
1645 1666
1646 if (unlikely(s->flags & SLAB_POISON)) 1667 start = page_address(page);
1647 memset(start, POISON_INUSE, PAGE_SIZE << order);
1648 1668
1649 kasan_poison_slab(page); 1669 setup_page_debug(s, start, order);
1650 1670
1651 shuffle = shuffle_freelist(s, page); 1671 shuffle = shuffle_freelist(s, page);
1652 1672
1653 if (!shuffle) { 1673 if (!shuffle) {
1654 for_each_object_idx(p, idx, s, start, page->objects) {
1655 if (likely(idx < page->objects)) {
1656 next = p + s->size;
1657 next = setup_object(s, page, next);
1658 set_freepointer(s, p, next);
1659 } else
1660 set_freepointer(s, p, NULL);
1661 }
1662 start = fixup_red_left(s, start); 1674 start = fixup_red_left(s, start);
1663 start = setup_object(s, page, start); 1675 start = setup_object(s, page, start);
1664 page->freelist = start; 1676 page->freelist = start;
1677 for (idx = 0, p = start; idx < page->objects - 1; idx++) {
1678 next = p + s->size;
1679 next = setup_object(s, page, next);
1680 set_freepointer(s, p, next);
1681 p = next;
1682 }
1683 set_freepointer(s, p, NULL);
1665 } 1684 }
1666 1685
1667 page->inuse = page->objects; 1686 page->inuse = page->objects;
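The rewritten non-shuffled path above links each object to the next and terminates the last one with NULL. A standalone sketch of the same chaining over a flat buffer, where object size and count are arbitrary and memcpy() stands in for set_freepointer()/get_freepointer():

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define OBJ_SIZE 32
	#define NR_OBJS  4

	int main(void)
	{
		char *slab = calloc(NR_OBJS, OBJ_SIZE);
		char *p, *next, *start;
		int idx;

		if (!slab)
			return 1;

		/* Link object idx to object idx + 1; the last object ends the list. */
		start = slab;
		for (idx = 0, p = start; idx < NR_OBJS - 1; idx++) {
			next = p + OBJ_SIZE;
			memcpy(p, &next, sizeof(next));	/* set_freepointer() stand-in */
			p = next;
		}
		next = NULL;
		memcpy(p, &next, sizeof(next));

		/* Walk the chain to show every object is reachable exactly once. */
		for (p = start; p; memcpy(&p, p, sizeof(p)))
			printf("object at offset %ld\n", (long)(p - slab));

		free(slab);
		return 0;
	}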
@@ -3846,6 +3865,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
3846 unsigned int offset; 3865 unsigned int offset;
3847 size_t object_size; 3866 size_t object_size;
3848 3867
3868 ptr = kasan_reset_tag(ptr);
3869
3849 /* Find object and usable object size. */ 3870 /* Find object and usable object size. */
3850 s = page->slab_cache; 3871 s = page->slab_cache;
3851 3872
diff --git a/mm/swap.c b/mm/swap.c
index 4929bc1be60e..4d7d37eb3c40 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -320,11 +320,6 @@ static inline void activate_page_drain(int cpu)
320{ 320{
321} 321}
322 322
323static bool need_activate_page_drain(int cpu)
324{
325 return false;
326}
327
328void activate_page(struct page *page) 323void activate_page(struct page *page)
329{ 324{
330 struct zone *zone = page_zone(page); 325 struct zone *zone = page_zone(page);
@@ -653,13 +648,15 @@ void lru_add_drain(void)
653 put_cpu(); 648 put_cpu();
654} 649}
655 650
651#ifdef CONFIG_SMP
652
653static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
654
656static void lru_add_drain_per_cpu(struct work_struct *dummy) 655static void lru_add_drain_per_cpu(struct work_struct *dummy)
657{ 656{
658 lru_add_drain(); 657 lru_add_drain();
659} 658}
660 659
661static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
662
663/* 660/*
664 * Doesn't need any cpu hotplug locking because we do rely on per-cpu 661 * Doesn't need any cpu hotplug locking because we do rely on per-cpu
665 * kworkers being shut down before our page_alloc_cpu_dead callback is 662 * kworkers being shut down before our page_alloc_cpu_dead callback is
@@ -702,6 +699,12 @@ void lru_add_drain_all(void)
702 699
703 mutex_unlock(&lock); 700 mutex_unlock(&lock);
704} 701}
702#else
703void lru_add_drain_all(void)
704{
705 lru_add_drain();
706}
707#endif
705 708
706/** 709/**
707 * release_pages - batched put_page() 710 * release_pages - batched put_page()
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 852eb4e53f06..14faadcedd06 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
247/* 247/*
248 * Validates that the given object is: 248 * Validates that the given object is:
249 * - not bogus address 249 * - not bogus address
250 * - known-safe heap or stack object 250 * - fully contained by stack (or stack frame, when available)
251 * - fully within SLAB object (or object whitelist area, when available)
251 * - not in kernel text 252 * - not in kernel text
252 */ 253 */
253void __check_object_size(const void *ptr, unsigned long n, bool to_user) 254void __check_object_size(const void *ptr, unsigned long n, bool to_user)
@@ -262,9 +263,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
262 /* Check for invalid addresses. */ 263 /* Check for invalid addresses. */
263 check_bogus_address((const unsigned long)ptr, n, to_user); 264 check_bogus_address((const unsigned long)ptr, n, to_user);
264 265
265 /* Check for bad heap object. */
266 check_heap_object(ptr, n, to_user);
267
268 /* Check for bad stack object. */ 266 /* Check for bad stack object. */
269 switch (check_stack_object(ptr, n)) { 267 switch (check_stack_object(ptr, n)) {
270 case NOT_STACK: 268 case NOT_STACK:
@@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
282 usercopy_abort("process stack", NULL, to_user, 0, n); 280 usercopy_abort("process stack", NULL, to_user, 0, n);
283 } 281 }
284 282
283 /* Check for bad heap object. */
284 check_heap_object(ptr, n, to_user);
285
285 /* Check for object in kernel to avoid text exposure. */ 286 /* Check for object in kernel to avoid text exposure. */
286 check_kernel_text_object((const unsigned long)ptr, n, to_user); 287 check_kernel_text_object((const unsigned long)ptr, n, to_user);
287} 288}
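The reordered checks above run the stack validation before the heap one, so a pointer proven to sit inside a stack frame never reaches the slab check. A toy sketch of that pipeline: all four checks are reduced to stubs, only the ordering is taken from the hunk, and the early return on a validated stack object is an assumption based on the surrounding switch:

	#include <stdbool.h>
	#include <stdio.h>

	/* Stubs standing in for the real checks; they only report that they ran. */
	static void check_bogus(void)		{ puts("bogus-address check"); }
	static bool object_is_on_stack(void)	{ puts("stack check"); return false; }
	static void check_heap(void)		{ puts("heap (slab) check"); }
	static void check_text(void)		{ puts("kernel-text check"); }

	static void toy_check_object_size(void)
	{
		check_bogus();

		/* Stack objects are validated first and return early ... */
		if (object_is_on_stack())
			return;

		/* ... so only non-stack pointers fall through to the slab check. */
		check_heap();
		check_text();
	}

	int main(void)
	{
		toy_check_object_size();
		return 0;
	}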
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 065c1ce191c4..d59b5a73dfb3 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -267,14 +267,10 @@ retry:
267 VM_BUG_ON(dst_addr & ~huge_page_mask(h)); 267 VM_BUG_ON(dst_addr & ~huge_page_mask(h));
268 268
269 /* 269 /*
270 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. 270 * Serialize via hugetlb_fault_mutex
271 * i_mmap_rwsem ensures the dst_pte remains valid even
272 * in the case of shared pmds. fault mutex prevents
273 * races with other faulting threads.
274 */ 271 */
275 mapping = dst_vma->vm_file->f_mapping;
276 i_mmap_lock_read(mapping);
277 idx = linear_page_index(dst_vma, dst_addr); 272 idx = linear_page_index(dst_vma, dst_addr);
273 mapping = dst_vma->vm_file->f_mapping;
278 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, 274 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
279 idx, dst_addr); 275 idx, dst_addr);
280 mutex_lock(&hugetlb_fault_mutex_table[hash]); 276 mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -283,7 +279,6 @@ retry:
283 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); 279 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
284 if (!dst_pte) { 280 if (!dst_pte) {
285 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 281 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
286 i_mmap_unlock_read(mapping);
287 goto out_unlock; 282 goto out_unlock;
288 } 283 }
289 284
@@ -291,7 +286,6 @@ retry:
291 dst_pteval = huge_ptep_get(dst_pte); 286 dst_pteval = huge_ptep_get(dst_pte);
292 if (!huge_pte_none(dst_pteval)) { 287 if (!huge_pte_none(dst_pteval)) {
293 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 288 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
294 i_mmap_unlock_read(mapping);
295 goto out_unlock; 289 goto out_unlock;
296 } 290 }
297 291
@@ -299,7 +293,6 @@ retry:
299 dst_addr, src_addr, &page); 293 dst_addr, src_addr, &page);
300 294
301 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 295 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
302 i_mmap_unlock_read(mapping);
303 vm_alloc_shared = vm_shared; 296 vm_alloc_shared = vm_shared;
304 297
305 cond_resched(); 298 cond_resched();
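The shortened comment still relies on hugetlb_fault_mutex for serialization: hash the (mapping, index) pair, take the corresponding mutex, do the work, drop it. A minimal pthread sketch of that hashed-mutex pattern; the hash and table size are made up and unrelated to hugetlb_fault_mutex_hash():

	#include <pthread.h>
	#include <stdio.h>

	#define FAULT_MUTEX_SLOTS 8

	static pthread_mutex_t fault_mutex_table[FAULT_MUTEX_SLOTS];

	/* Toy hash over the (mapping, index) pair; the real kernel hash differs. */
	static unsigned int fault_mutex_hash(unsigned long mapping, unsigned long idx)
	{
		return (unsigned int)((mapping ^ (idx * 2654435761UL)) % FAULT_MUTEX_SLOTS);
	}

	static void handle_fault(unsigned long mapping, unsigned long idx)
	{
		unsigned int hash = fault_mutex_hash(mapping, idx);

		pthread_mutex_lock(&fault_mutex_table[hash]);
		/* ... look up or allocate the PTE and copy the page here ... */
		printf("page %lu of mapping %#lx handled under mutex %u\n",
		       idx, mapping, hash);
		pthread_mutex_unlock(&fault_mutex_table[hash]);
	}

	int main(void)
	{
		unsigned int i;

		for (i = 0; i < FAULT_MUTEX_SLOTS; i++)
			pthread_mutex_init(&fault_mutex_table[i], NULL);

		handle_fault(0xabcd000UL, 0);
		handle_fault(0xabcd000UL, 1);
		return 0;
	}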
diff --git a/mm/util.c b/mm/util.c
index 4df23d64aac7..379319b1bcfd 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -150,7 +150,7 @@ void *memdup_user(const void __user *src, size_t len)
150{ 150{
151 void *p; 151 void *p;
152 152
153 p = kmalloc_track_caller(len, GFP_USER); 153 p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
154 if (!p) 154 if (!p)
155 return ERR_PTR(-ENOMEM); 155 return ERR_PTR(-ENOMEM);
156 156
@@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
478 return true; 478 return true;
479 if (PageHuge(page)) 479 if (PageHuge(page))
480 return false; 480 return false;
481 for (i = 0; i < hpage_nr_pages(page); i++) { 481 for (i = 0; i < (1 << compound_order(page)); i++) {
482 if (atomic_read(&page[i]._mapcount) >= 0) 482 if (atomic_read(&page[i]._mapcount) >= 0)
483 return true; 483 return true;
484 } 484 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a714c4f800e9..e979705bbf32 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -491,16 +491,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
491 delta = freeable / 2; 491 delta = freeable / 2;
492 } 492 }
493 493
494 /*
495 * Make sure we apply some minimal pressure on default priority
496 * even on small cgroups. Stale objects are not only consuming memory
497 * by themselves, but can also hold a reference to a dying cgroup,
498 * preventing it from being reclaimed. A dying cgroup with all
499 * corresponding structures like per-cpu stats and kmem caches
500 * can be really big, so it may lead to a significant waste of memory.
501 */
502 delta = max_t(unsigned long long, delta, min(freeable, batch_size));
503
504 total_scan += delta; 494 total_scan += delta;
505 if (total_scan < 0) { 495 if (total_scan < 0) {
506 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", 496 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",