author		Linus Torvalds <torvalds@linux-foundation.org>	2018-12-28 19:55:46 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-12-28 19:55:46 -0500
commit		f346b0becb1bc62e45495f9cdbae3eef35d0b635 (patch)
tree		ae79f3dfb8e031da51d38f0f095f89d7d23f3643 /mm
parent		00d59fde8532b2d42e80909d2e58678755e04da9 (diff)
parent		0f4991e8fd48987ae476a92cdee6bfec4aff31b8 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:

 - large KASAN update to use arm's "software tag-based mode"
 - a few misc things
 - sh updates
 - ocfs2 updates
 - just about all of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (167 commits)
  kernel/fork.c: mark 'stack_vm_area' with __maybe_unused
  memcg, oom: notify on oom killer invocation from the charge path
  mm, swap: fix swapoff with KSM pages
  include/linux/gfp.h: fix typo
  mm/hmm: fix memremap.h, move dev_page_fault_t callback to hmm
  hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race
  hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization
  memory_hotplug: add missing newlines to debugging output
  mm: remove __hugepage_set_anon_rmap()
  include/linux/vmstat.h: remove unused page state adjustment macro
  mm/page_alloc.c: allow error injection
  mm: migrate: drop unused argument of migrate_page_move_mapping()
  blkdev: avoid migration stalls for blkdev pages
  mm: migrate: provide buffer_migrate_page_norefs()
  mm: migrate: move migrate_page_lock_buffers()
  mm: migrate: lock buffers before migrate_page_move_mapping()
  mm: migration: factor out code to compute expected number of page references
  mm, page_alloc: enable pcpu_drain with zone capability
  kmemleak: add config to select auto scan
  mm/page_alloc.c: don't call kasan_free_pages() at deferred mem init
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 1
-rw-r--r--  mm/cma.c | 11
-rw-r--r--  mm/compaction.c | 2
-rw-r--r--  mm/debug.c | 27
-rw-r--r--  mm/filemap.c | 96
-rw-r--r--  mm/highmem.c | 5
-rw-r--r--  mm/hmm.c | 331
-rw-r--r--  mm/huge_memory.c | 74
-rw-r--r--  mm/hugetlb.c | 133
-rw-r--r--  mm/internal.h | 24
-rw-r--r--  mm/kasan/Makefile | 15
-rw-r--r--  mm/kasan/common.c (renamed from mm/kasan/kasan.c) | 656
-rw-r--r--  mm/kasan/generic.c | 344
-rw-r--r--  mm/kasan/generic_report.c | 153
-rw-r--r--  mm/kasan/init.c (renamed from mm/kasan/kasan_init.c) | 71
-rw-r--r--  mm/kasan/kasan.h | 59
-rw-r--r--  mm/kasan/quarantine.c | 3
-rw-r--r--  mm/kasan/report.c | 272
-rw-r--r--  mm/kasan/tags.c | 161
-rw-r--r--  mm/kasan/tags_report.c | 58
-rw-r--r--  mm/khugepaged.c | 10
-rw-r--r--  mm/kmemleak.c | 19
-rw-r--r--  mm/ksm.c | 35
-rw-r--r--  mm/madvise.c | 21
-rw-r--r--  mm/memblock.c | 52
-rw-r--r--  mm/memcontrol.c | 53
-rw-r--r--  mm/memory-failure.c | 16
-rw-r--r--  mm/memory.c | 103
-rw-r--r--  mm/memory_hotplug.c | 172
-rw-r--r--  mm/migrate.c | 264
-rw-r--r--  mm/mm_init.c | 2
-rw-r--r--  mm/mmap.c | 16
-rw-r--r--  mm/mmu_notifier.c | 31
-rw-r--r--  mm/mprotect.c | 15
-rw-r--r--  mm/mremap.c | 10
-rw-r--r--  mm/oom_kill.c | 51
-rw-r--r--  mm/page-writeback.c | 35
-rw-r--r--  mm/page_alloc.c | 404
-rw-r--r--  mm/page_isolation.c | 10
-rw-r--r--  mm/page_owner.c | 1
-rw-r--r--  mm/readahead.c | 12
-rw-r--r--  mm/rmap.c | 59
-rw-r--r--  mm/shmem.c | 8
-rw-r--r--  mm/slab.c | 31
-rw-r--r--  mm/slab.h | 2
-rw-r--r--  mm/slab_common.c | 10
-rw-r--r--  mm/slub.c | 82
-rw-r--r--  mm/sparse.c | 26
-rw-r--r--  mm/swap.c | 2
-rw-r--r--  mm/swapfile.c | 6
-rw-r--r--  mm/userfaultfd.c | 11
-rw-r--r--  mm/util.c | 2
-rw-r--r--  mm/vmalloc.c | 4
-rw-r--r--  mm/vmscan.c | 143
-rw-r--r--  mm/vmstat.c | 4
-rw-r--r--  mm/workingset.c | 2
-rw-r--r--  mm/zswap.c | 4
57 files changed, 2454 insertions(+), 1770 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index d85e39da47ae..25c71eb8a7db 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -291,6 +291,7 @@ config MMU_NOTIFIER
291config KSM 291config KSM
292 bool "Enable KSM for page merging" 292 bool "Enable KSM for page merging"
293 depends on MMU 293 depends on MMU
294 select XXHASH
294 help 295 help
295 Enable Kernel Samepage Merging: KSM periodically scans those areas 296 Enable Kernel Samepage Merging: KSM periodically scans those areas
296 of an application's address space that an app has advised may be 297 of an application's address space that an app has advised may be
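
The new "select XXHASH" line above makes the xxhash library available whenever CONFIG_KSM is set; the accompanying mm/ksm.c change (35 lines in the diffstat, not shown here) switches KSM's page checksumming over to it. A hedged, hypothetical illustration of that kind of helper (simplified, kernel context assumed, not the actual ksm.c code):

	#include <linux/mm.h>		/* PAGE_SIZE */
	#include <linux/xxhash.h>

	/* hypothetical, simplified page-fingerprint helper */
	static u32 page_checksum(const void *addr)
	{
		/* xxhash(input, length, seed) from lib/xxhash */
		return (u32)xxhash(addr, PAGE_SIZE, 0);
	}
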
diff --git a/mm/cma.c b/mm/cma.c
index 4cb76121a3ab..c7b39dd3b4f6 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -407,6 +407,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
407 unsigned long pfn = -1; 407 unsigned long pfn = -1;
408 unsigned long start = 0; 408 unsigned long start = 0;
409 unsigned long bitmap_maxno, bitmap_no, bitmap_count; 409 unsigned long bitmap_maxno, bitmap_no, bitmap_count;
410 size_t i;
410 struct page *page = NULL; 411 struct page *page = NULL;
411 int ret = -ENOMEM; 412 int ret = -ENOMEM;
412 413
@@ -466,6 +467,16 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
466 467
467 trace_cma_alloc(pfn, page, count, align); 468 trace_cma_alloc(pfn, page, count, align);
468 469
470 /*
471 * CMA can allocate multiple page blocks, which results in different
472 * blocks being marked with different tags. Reset the tags to ignore
473 * those page blocks.
474 */
475 if (page) {
476 for (i = 0; i < count; i++)
477 page_kasan_tag_reset(page + i);
478 }
479
469 if (ret && !no_warn) { 480 if (ret && !no_warn) {
470 pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", 481 pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
471 __func__, count, ret); 482 __func__, count, ret);
diff --git a/mm/compaction.c b/mm/compaction.c
index 7c607479de4a..ef29490b0f46 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1431,7 +1431,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
1431 if (is_via_compact_memory(order)) 1431 if (is_via_compact_memory(order))
1432 return COMPACT_CONTINUE; 1432 return COMPACT_CONTINUE;
1433 1433
1434 watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1434 watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
1435 /* 1435 /*
1436 * If watermarks for high-order allocation are already met, there 1436 * If watermarks for high-order allocation are already met, there
1437 * should be no need for compaction at all. 1437 * should be no need for compaction at all.
diff --git a/mm/debug.c b/mm/debug.c
index cdacba12e09a..0abb987dad9b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -17,7 +17,7 @@
17 17
18#include "internal.h" 18#include "internal.h"
19 19
20char *migrate_reason_names[MR_TYPES] = { 20const char *migrate_reason_names[MR_TYPES] = {
21 "compaction", 21 "compaction",
22 "memory_failure", 22 "memory_failure",
23 "memory_hotplug", 23 "memory_hotplug",
@@ -44,6 +44,7 @@ const struct trace_print_flags vmaflag_names[] = {
44 44
45void __dump_page(struct page *page, const char *reason) 45void __dump_page(struct page *page, const char *reason)
46{ 46{
47 struct address_space *mapping = page_mapping(page);
47 bool page_poisoned = PagePoisoned(page); 48 bool page_poisoned = PagePoisoned(page);
48 int mapcount; 49 int mapcount;
49 50
@@ -53,7 +54,7 @@ void __dump_page(struct page *page, const char *reason)
53 * dump_page() when detected. 54 * dump_page() when detected.
54 */ 55 */
55 if (page_poisoned) { 56 if (page_poisoned) {
56 pr_emerg("page:%px is uninitialized and poisoned", page); 57 pr_warn("page:%px is uninitialized and poisoned", page);
57 goto hex_only; 58 goto hex_only;
58 } 59 }
59 60
@@ -64,27 +65,39 @@ void __dump_page(struct page *page, const char *reason)
64 */ 65 */
65 mapcount = PageSlab(page) ? 0 : page_mapcount(page); 66 mapcount = PageSlab(page) ? 0 : page_mapcount(page);
66 67
67 pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx", 68 pr_warn("page:%px count:%d mapcount:%d mapping:%px index:%#lx",
68 page, page_ref_count(page), mapcount, 69 page, page_ref_count(page), mapcount,
69 page->mapping, page_to_pgoff(page)); 70 page->mapping, page_to_pgoff(page));
70 if (PageCompound(page)) 71 if (PageCompound(page))
71 pr_cont(" compound_mapcount: %d", compound_mapcount(page)); 72 pr_cont(" compound_mapcount: %d", compound_mapcount(page));
72 pr_cont("\n"); 73 pr_cont("\n");
74 if (PageAnon(page))
75 pr_warn("anon ");
76 else if (PageKsm(page))
77 pr_warn("ksm ");
78 else if (mapping) {
79 pr_warn("%ps ", mapping->a_ops);
80 if (mapping->host->i_dentry.first) {
81 struct dentry *dentry;
82 dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias);
83 pr_warn("name:\"%pd\" ", dentry);
84 }
85 }
73 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); 86 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
74 87
75 pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); 88 pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
76 89
77hex_only: 90hex_only:
78 print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32, 91 print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
79 sizeof(unsigned long), page, 92 sizeof(unsigned long), page,
80 sizeof(struct page), false); 93 sizeof(struct page), false);
81 94
82 if (reason) 95 if (reason)
83 pr_alert("page dumped because: %s\n", reason); 96 pr_warn("page dumped because: %s\n", reason);
84 97
85#ifdef CONFIG_MEMCG 98#ifdef CONFIG_MEMCG
86 if (!page_poisoned && page->mem_cgroup) 99 if (!page_poisoned && page->mem_cgroup)
87 pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup); 100 pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup);
88#endif 101#endif
89} 102}
90 103
diff --git a/mm/filemap.c b/mm/filemap.c
index 81adec8ee02c..29655fb47a2c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -981,7 +981,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
981 if (wait_page->bit_nr != key->bit_nr) 981 if (wait_page->bit_nr != key->bit_nr)
982 return 0; 982 return 0;
983 983
984 /* Stop walking if it's locked */ 984 /*
985 * Stop walking if it's locked.
986 * Is this safe if put_and_wait_on_page_locked() is in use?
987 * Yes: the waker must hold a reference to this page, and if PG_locked
988 * has now already been set by another task, that task must also hold
989 * a reference to the *same usage* of this page; so there is no need
990 * to walk on to wake even the put_and_wait_on_page_locked() callers.
991 */
985 if (test_bit(key->bit_nr, &key->page->flags)) 992 if (test_bit(key->bit_nr, &key->page->flags))
986 return -1; 993 return -1;
987 994
@@ -1049,25 +1056,44 @@ static void wake_up_page(struct page *page, int bit)
1049 wake_up_page_bit(page, bit); 1056 wake_up_page_bit(page, bit);
1050} 1057}
1051 1058
1059/*
1060 * A choice of three behaviors for wait_on_page_bit_common():
1061 */
1062enum behavior {
1063 EXCLUSIVE, /* Hold ref to page and take the bit when woken, like
1064 * __lock_page() waiting on then setting PG_locked.
1065 */
1066 SHARED, /* Hold ref to page and check the bit when woken, like
1067 * wait_on_page_writeback() waiting on PG_writeback.
1068 */
1069 DROP, /* Drop ref to page before wait, no check when woken,
1070 * like put_and_wait_on_page_locked() on PG_locked.
1071 */
1072};
1073
1052static inline int wait_on_page_bit_common(wait_queue_head_t *q, 1074static inline int wait_on_page_bit_common(wait_queue_head_t *q,
1053 struct page *page, int bit_nr, int state, bool lock) 1075 struct page *page, int bit_nr, int state, enum behavior behavior)
1054{ 1076{
1055 struct wait_page_queue wait_page; 1077 struct wait_page_queue wait_page;
1056 wait_queue_entry_t *wait = &wait_page.wait; 1078 wait_queue_entry_t *wait = &wait_page.wait;
1079 bool bit_is_set;
1057 bool thrashing = false; 1080 bool thrashing = false;
1081 bool delayacct = false;
1058 unsigned long pflags; 1082 unsigned long pflags;
1059 int ret = 0; 1083 int ret = 0;
1060 1084
1061 if (bit_nr == PG_locked && 1085 if (bit_nr == PG_locked &&
1062 !PageUptodate(page) && PageWorkingset(page)) { 1086 !PageUptodate(page) && PageWorkingset(page)) {
1063 if (!PageSwapBacked(page)) 1087 if (!PageSwapBacked(page)) {
1064 delayacct_thrashing_start(); 1088 delayacct_thrashing_start();
1089 delayacct = true;
1090 }
1065 psi_memstall_enter(&pflags); 1091 psi_memstall_enter(&pflags);
1066 thrashing = true; 1092 thrashing = true;
1067 } 1093 }
1068 1094
1069 init_wait(wait); 1095 init_wait(wait);
1070 wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; 1096 wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
1071 wait->func = wake_page_function; 1097 wait->func = wake_page_function;
1072 wait_page.page = page; 1098 wait_page.page = page;
1073 wait_page.bit_nr = bit_nr; 1099 wait_page.bit_nr = bit_nr;
@@ -1084,14 +1110,17 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
1084 1110
1085 spin_unlock_irq(&q->lock); 1111 spin_unlock_irq(&q->lock);
1086 1112
1087 if (likely(test_bit(bit_nr, &page->flags))) { 1113 bit_is_set = test_bit(bit_nr, &page->flags);
1114 if (behavior == DROP)
1115 put_page(page);
1116
1117 if (likely(bit_is_set))
1088 io_schedule(); 1118 io_schedule();
1089 }
1090 1119
1091 if (lock) { 1120 if (behavior == EXCLUSIVE) {
1092 if (!test_and_set_bit_lock(bit_nr, &page->flags)) 1121 if (!test_and_set_bit_lock(bit_nr, &page->flags))
1093 break; 1122 break;
1094 } else { 1123 } else if (behavior == SHARED) {
1095 if (!test_bit(bit_nr, &page->flags)) 1124 if (!test_bit(bit_nr, &page->flags))
1096 break; 1125 break;
1097 } 1126 }
@@ -1100,12 +1129,23 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
1100 ret = -EINTR; 1129 ret = -EINTR;
1101 break; 1130 break;
1102 } 1131 }
1132
1133 if (behavior == DROP) {
1134 /*
1135 * We can no longer safely access page->flags:
1136 * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
1137 * there is a risk of waiting forever on a page reused
1138 * for something that keeps it locked indefinitely.
1139 * But best check for -EINTR above before breaking.
1140 */
1141 break;
1142 }
1103 } 1143 }
1104 1144
1105 finish_wait(q, wait); 1145 finish_wait(q, wait);
1106 1146
1107 if (thrashing) { 1147 if (thrashing) {
1108 if (!PageSwapBacked(page)) 1148 if (delayacct)
1109 delayacct_thrashing_end(); 1149 delayacct_thrashing_end();
1110 psi_memstall_leave(&pflags); 1150 psi_memstall_leave(&pflags);
1111 } 1151 }
@@ -1124,18 +1164,37 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
1124void wait_on_page_bit(struct page *page, int bit_nr) 1164void wait_on_page_bit(struct page *page, int bit_nr)
1125{ 1165{
1126 wait_queue_head_t *q = page_waitqueue(page); 1166 wait_queue_head_t *q = page_waitqueue(page);
1127 wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false); 1167 wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
1128} 1168}
1129EXPORT_SYMBOL(wait_on_page_bit); 1169EXPORT_SYMBOL(wait_on_page_bit);
1130 1170
1131int wait_on_page_bit_killable(struct page *page, int bit_nr) 1171int wait_on_page_bit_killable(struct page *page, int bit_nr)
1132{ 1172{
1133 wait_queue_head_t *q = page_waitqueue(page); 1173 wait_queue_head_t *q = page_waitqueue(page);
1134 return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false); 1174 return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
1135} 1175}
1136EXPORT_SYMBOL(wait_on_page_bit_killable); 1176EXPORT_SYMBOL(wait_on_page_bit_killable);
1137 1177
1138/** 1178/**
1179 * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
1180 * @page: The page to wait for.
1181 *
1182 * The caller should hold a reference on @page. They expect the page to
1183 * become unlocked relatively soon, but do not wish to hold up migration
1184 * (for example) by holding the reference while waiting for the page to
1185 * come unlocked. After this function returns, the caller should not
1186 * dereference @page.
1187 */
1188void put_and_wait_on_page_locked(struct page *page)
1189{
1190 wait_queue_head_t *q;
1191
1192 page = compound_head(page);
1193 q = page_waitqueue(page);
1194 wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
1195}
1196
1197/**
1139 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 1198 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
1140 * @page: Page defining the wait queue of interest 1199 * @page: Page defining the wait queue of interest
1141 * @waiter: Waiter to add to the queue 1200 * @waiter: Waiter to add to the queue
@@ -1264,7 +1323,8 @@ void __lock_page(struct page *__page)
1264{ 1323{
1265 struct page *page = compound_head(__page); 1324 struct page *page = compound_head(__page);
1266 wait_queue_head_t *q = page_waitqueue(page); 1325 wait_queue_head_t *q = page_waitqueue(page);
1267 wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true); 1326 wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
1327 EXCLUSIVE);
1268} 1328}
1269EXPORT_SYMBOL(__lock_page); 1329EXPORT_SYMBOL(__lock_page);
1270 1330
@@ -1272,7 +1332,8 @@ int __lock_page_killable(struct page *__page)
1272{ 1332{
1273 struct page *page = compound_head(__page); 1333 struct page *page = compound_head(__page);
1274 wait_queue_head_t *q = page_waitqueue(page); 1334 wait_queue_head_t *q = page_waitqueue(page);
1275 return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true); 1335 return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
1336 EXCLUSIVE);
1276} 1337}
1277EXPORT_SYMBOL_GPL(__lock_page_killable); 1338EXPORT_SYMBOL_GPL(__lock_page_killable);
1278 1339
@@ -1540,7 +1601,7 @@ repeat:
1540 VM_BUG_ON_PAGE(page->index != offset, page); 1601 VM_BUG_ON_PAGE(page->index != offset, page);
1541 } 1602 }
1542 1603
1543 if (page && (fgp_flags & FGP_ACCESSED)) 1604 if (fgp_flags & FGP_ACCESSED)
1544 mark_page_accessed(page); 1605 mark_page_accessed(page);
1545 1606
1546no_page: 1607no_page:
@@ -2553,6 +2614,13 @@ void filemap_map_pages(struct vm_fault *vmf,
2553 goto next; 2614 goto next;
2554 2615
2555 head = compound_head(page); 2616 head = compound_head(page);
2617
2618 /*
2619 * Check for a locked page first, as a speculative
2620 * reference may adversely influence page migration.
2621 */
2622 if (PageLocked(head))
2623 goto next;
2556 if (!page_cache_get_speculative(head)) 2624 if (!page_cache_get_speculative(head))
2557 goto next; 2625 goto next;
2558 2626
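
The put_and_wait_on_page_locked() helper introduced above is for callers that only want to wait for PG_locked to clear and have no further use for the page: it drops the caller's reference before sleeping (the DROP behavior), so the page is not pinned against migration while the caller waits. The huge_memory.c hunks below show the conversion; reduced to a sketch of the calling pattern (kernel context assumed, surrounding code elided):

	/* old pattern: reference held across the whole wait */
	if (!get_page_unless_zero(page))
		goto out_unlock;
	spin_unlock(vmf->ptl);
	wait_on_page_locked(page);
	put_page(page);

	/* new pattern: the helper puts the page, then waits */
	if (!get_page_unless_zero(page))
		goto out_unlock;
	spin_unlock(vmf->ptl);
	put_and_wait_on_page_locked(page);
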
diff --git a/mm/highmem.c b/mm/highmem.c
index 59db3223a5d6..107b10f9878e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -105,9 +105,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
105} 105}
106#endif 106#endif
107 107
108unsigned long totalhigh_pages __read_mostly; 108atomic_long_t _totalhigh_pages __read_mostly;
109EXPORT_SYMBOL(totalhigh_pages); 109EXPORT_SYMBOL(_totalhigh_pages);
110
111 110
112EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); 111EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
113 112
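
In the highmem.c hunk above, the plain "unsigned long totalhigh_pages" counter becomes an atomic_long_t named _totalhigh_pages, mirroring the totalram_pages conversion visible elsewhere in this series (see the totalram_pages() call in the huge_memory.c hunk below). Readers are expected to go through a small accessor rather than the variable itself; a plausible shape for it, stated only as an assumption since the include/linux/highmem.h side is not part of this diff:

	/* assumed accessor, normally supplied by include/linux/highmem.h */
	static inline unsigned long totalhigh_pages(void)
	{
		return (unsigned long)atomic_long_read(&_totalhigh_pages);
	}
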
diff --git a/mm/hmm.c b/mm/hmm.c
index 90c34f3d1243..a04e4b810610 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -189,35 +189,30 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
189} 189}
190 190
191static int hmm_invalidate_range_start(struct mmu_notifier *mn, 191static int hmm_invalidate_range_start(struct mmu_notifier *mn,
192 struct mm_struct *mm, 192 const struct mmu_notifier_range *range)
193 unsigned long start,
194 unsigned long end,
195 bool blockable)
196{ 193{
197 struct hmm_update update; 194 struct hmm_update update;
198 struct hmm *hmm = mm->hmm; 195 struct hmm *hmm = range->mm->hmm;
199 196
200 VM_BUG_ON(!hmm); 197 VM_BUG_ON(!hmm);
201 198
202 update.start = start; 199 update.start = range->start;
203 update.end = end; 200 update.end = range->end;
204 update.event = HMM_UPDATE_INVALIDATE; 201 update.event = HMM_UPDATE_INVALIDATE;
205 update.blockable = blockable; 202 update.blockable = range->blockable;
206 return hmm_invalidate_range(hmm, true, &update); 203 return hmm_invalidate_range(hmm, true, &update);
207} 204}
208 205
209static void hmm_invalidate_range_end(struct mmu_notifier *mn, 206static void hmm_invalidate_range_end(struct mmu_notifier *mn,
210 struct mm_struct *mm, 207 const struct mmu_notifier_range *range)
211 unsigned long start,
212 unsigned long end)
213{ 208{
214 struct hmm_update update; 209 struct hmm_update update;
215 struct hmm *hmm = mm->hmm; 210 struct hmm *hmm = range->mm->hmm;
216 211
217 VM_BUG_ON(!hmm); 212 VM_BUG_ON(!hmm);
218 213
219 update.start = start; 214 update.start = range->start;
220 update.end = end; 215 update.end = range->end;
221 update.event = HMM_UPDATE_INVALIDATE; 216 update.event = HMM_UPDATE_INVALIDATE;
222 update.blockable = true; 217 update.blockable = true;
223 hmm_invalidate_range(hmm, false, &update); 218 hmm_invalidate_range(hmm, false, &update);
@@ -986,19 +981,13 @@ static void hmm_devmem_ref_exit(void *data)
986 struct hmm_devmem *devmem; 981 struct hmm_devmem *devmem;
987 982
988 devmem = container_of(ref, struct hmm_devmem, ref); 983 devmem = container_of(ref, struct hmm_devmem, ref);
984 wait_for_completion(&devmem->completion);
989 percpu_ref_exit(ref); 985 percpu_ref_exit(ref);
990 devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
991} 986}
992 987
993static void hmm_devmem_ref_kill(void *data) 988static void hmm_devmem_ref_kill(struct percpu_ref *ref)
994{ 989{
995 struct percpu_ref *ref = data;
996 struct hmm_devmem *devmem;
997
998 devmem = container_of(ref, struct hmm_devmem, ref);
999 percpu_ref_kill(ref); 990 percpu_ref_kill(ref);
1000 wait_for_completion(&devmem->completion);
1001 devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
1002} 991}
1003 992
1004static int hmm_devmem_fault(struct vm_area_struct *vma, 993static int hmm_devmem_fault(struct vm_area_struct *vma,
@@ -1021,172 +1010,6 @@ static void hmm_devmem_free(struct page *page, void *data)
1021 devmem->ops->free(devmem, page); 1010 devmem->ops->free(devmem, page);
1022} 1011}
1023 1012
1024static DEFINE_MUTEX(hmm_devmem_lock);
1025static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
1026
1027static void hmm_devmem_radix_release(struct resource *resource)
1028{
1029 resource_size_t key;
1030
1031 mutex_lock(&hmm_devmem_lock);
1032 for (key = resource->start;
1033 key <= resource->end;
1034 key += PA_SECTION_SIZE)
1035 radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
1036 mutex_unlock(&hmm_devmem_lock);
1037}
1038
1039static void hmm_devmem_release(struct device *dev, void *data)
1040{
1041 struct hmm_devmem *devmem = data;
1042 struct resource *resource = devmem->resource;
1043 unsigned long start_pfn, npages;
1044 struct zone *zone;
1045 struct page *page;
1046
1047 if (percpu_ref_tryget_live(&devmem->ref)) {
1048 dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
1049 percpu_ref_put(&devmem->ref);
1050 }
1051
1052 /* pages are dead and unused, undo the arch mapping */
1053 start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
1054 npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;
1055
1056 page = pfn_to_page(start_pfn);
1057 zone = page_zone(page);
1058
1059 mem_hotplug_begin();
1060 if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
1061 __remove_pages(zone, start_pfn, npages, NULL);
1062 else
1063 arch_remove_memory(start_pfn << PAGE_SHIFT,
1064 npages << PAGE_SHIFT, NULL);
1065 mem_hotplug_done();
1066
1067 hmm_devmem_radix_release(resource);
1068}
1069
1070static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
1071{
1072 resource_size_t key, align_start, align_size, align_end;
1073 struct device *device = devmem->device;
1074 int ret, nid, is_ram;
1075
1076 align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
1077 align_size = ALIGN(devmem->resource->start +
1078 resource_size(devmem->resource),
1079 PA_SECTION_SIZE) - align_start;
1080
1081 is_ram = region_intersects(align_start, align_size,
1082 IORESOURCE_SYSTEM_RAM,
1083 IORES_DESC_NONE);
1084 if (is_ram == REGION_MIXED) {
1085 WARN_ONCE(1, "%s attempted on mixed region %pr\n",
1086 __func__, devmem->resource);
1087 return -ENXIO;
1088 }
1089 if (is_ram == REGION_INTERSECTS)
1090 return -ENXIO;
1091
1092 if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
1093 devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
1094 else
1095 devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
1096
1097 devmem->pagemap.res = *devmem->resource;
1098 devmem->pagemap.page_fault = hmm_devmem_fault;
1099 devmem->pagemap.page_free = hmm_devmem_free;
1100 devmem->pagemap.dev = devmem->device;
1101 devmem->pagemap.ref = &devmem->ref;
1102 devmem->pagemap.data = devmem;
1103
1104 mutex_lock(&hmm_devmem_lock);
1105 align_end = align_start + align_size - 1;
1106 for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
1107 struct hmm_devmem *dup;
1108
1109 dup = radix_tree_lookup(&hmm_devmem_radix,
1110 key >> PA_SECTION_SHIFT);
1111 if (dup) {
1112 dev_err(device, "%s: collides with mapping for %s\n",
1113 __func__, dev_name(dup->device));
1114 mutex_unlock(&hmm_devmem_lock);
1115 ret = -EBUSY;
1116 goto error;
1117 }
1118 ret = radix_tree_insert(&hmm_devmem_radix,
1119 key >> PA_SECTION_SHIFT,
1120 devmem);
1121 if (ret) {
1122 dev_err(device, "%s: failed: %d\n", __func__, ret);
1123 mutex_unlock(&hmm_devmem_lock);
1124 goto error_radix;
1125 }
1126 }
1127 mutex_unlock(&hmm_devmem_lock);
1128
1129 nid = dev_to_node(device);
1130 if (nid < 0)
1131 nid = numa_mem_id();
1132
1133 mem_hotplug_begin();
1134 /*
1135 * For device private memory we call add_pages() as we only need to
1136 * allocate and initialize struct page for the device memory. More-
1137 * over the device memory is un-accessible thus we do not want to
1138 * create a linear mapping for the memory like arch_add_memory()
1139 * would do.
1140 *
1141 * For device public memory, which is accesible by the CPU, we do
1142 * want the linear mapping and thus use arch_add_memory().
1143 */
1144 if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
1145 ret = arch_add_memory(nid, align_start, align_size, NULL,
1146 false);
1147 else
1148 ret = add_pages(nid, align_start >> PAGE_SHIFT,
1149 align_size >> PAGE_SHIFT, NULL, false);
1150 if (ret) {
1151 mem_hotplug_done();
1152 goto error_add_memory;
1153 }
1154 move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
1155 align_start >> PAGE_SHIFT,
1156 align_size >> PAGE_SHIFT, NULL);
1157 mem_hotplug_done();
1158
1159 /*
1160 * Initialization of the pages has been deferred until now in order
1161 * to allow us to do the work while not holding the hotplug lock.
1162 */
1163 memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
1164 align_start >> PAGE_SHIFT,
1165 align_size >> PAGE_SHIFT, &devmem->pagemap);
1166
1167 return 0;
1168
1169error_add_memory:
1170 untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
1171error_radix:
1172 hmm_devmem_radix_release(devmem->resource);
1173error:
1174 return ret;
1175}
1176
1177static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
1178{
1179 struct hmm_devmem *devmem = data;
1180
1181 return devmem->resource == match_data;
1182}
1183
1184static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
1185{
1186 devres_release(devmem->device, &hmm_devmem_release,
1187 &hmm_devmem_match, devmem->resource);
1188}
1189
1190/* 1013/*
1191 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory 1014 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
1192 * 1015 *
@@ -1210,12 +1033,12 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
1210{ 1033{
1211 struct hmm_devmem *devmem; 1034 struct hmm_devmem *devmem;
1212 resource_size_t addr; 1035 resource_size_t addr;
1036 void *result;
1213 int ret; 1037 int ret;
1214 1038
1215 dev_pagemap_get_ops(); 1039 dev_pagemap_get_ops();
1216 1040
1217 devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), 1041 devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
1218 GFP_KERNEL, dev_to_node(device));
1219 if (!devmem) 1042 if (!devmem)
1220 return ERR_PTR(-ENOMEM); 1043 return ERR_PTR(-ENOMEM);
1221 1044
@@ -1229,11 +1052,11 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
1229 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, 1052 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
1230 0, GFP_KERNEL); 1053 0, GFP_KERNEL);
1231 if (ret) 1054 if (ret)
1232 goto error_percpu_ref; 1055 return ERR_PTR(ret);
1233 1056
1234 ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); 1057 ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref);
1235 if (ret) 1058 if (ret)
1236 goto error_devm_add_action; 1059 return ERR_PTR(ret);
1237 1060
1238 size = ALIGN(size, PA_SECTION_SIZE); 1061 size = ALIGN(size, PA_SECTION_SIZE);
1239 addr = min((unsigned long)iomem_resource.end, 1062 addr = min((unsigned long)iomem_resource.end,
@@ -1253,54 +1076,40 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
1253 1076
1254 devmem->resource = devm_request_mem_region(device, addr, size, 1077 devmem->resource = devm_request_mem_region(device, addr, size,
1255 dev_name(device)); 1078 dev_name(device));
1256 if (!devmem->resource) { 1079 if (!devmem->resource)
1257 ret = -ENOMEM; 1080 return ERR_PTR(-ENOMEM);
1258 goto error_no_resource;
1259 }
1260 break; 1081 break;
1261 } 1082 }
1262 if (!devmem->resource) { 1083 if (!devmem->resource)
1263 ret = -ERANGE; 1084 return ERR_PTR(-ERANGE);
1264 goto error_no_resource;
1265 }
1266 1085
1267 devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; 1086 devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
1268 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; 1087 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
1269 devmem->pfn_last = devmem->pfn_first + 1088 devmem->pfn_last = devmem->pfn_first +
1270 (resource_size(devmem->resource) >> PAGE_SHIFT); 1089 (resource_size(devmem->resource) >> PAGE_SHIFT);
1090 devmem->page_fault = hmm_devmem_fault;
1271 1091
1272 ret = hmm_devmem_pages_create(devmem); 1092 devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
1273 if (ret) 1093 devmem->pagemap.res = *devmem->resource;
1274 goto error_pages; 1094 devmem->pagemap.page_free = hmm_devmem_free;
1275 1095 devmem->pagemap.altmap_valid = false;
1276 devres_add(device, devmem); 1096 devmem->pagemap.ref = &devmem->ref;
1277 1097 devmem->pagemap.data = devmem;
1278 ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); 1098 devmem->pagemap.kill = hmm_devmem_ref_kill;
1279 if (ret) {
1280 hmm_devmem_remove(devmem);
1281 return ERR_PTR(ret);
1282 }
1283 1099
1100 result = devm_memremap_pages(devmem->device, &devmem->pagemap);
1101 if (IS_ERR(result))
1102 return result;
1284 return devmem; 1103 return devmem;
1285
1286error_pages:
1287 devm_release_mem_region(device, devmem->resource->start,
1288 resource_size(devmem->resource));
1289error_no_resource:
1290error_devm_add_action:
1291 hmm_devmem_ref_kill(&devmem->ref);
1292 hmm_devmem_ref_exit(&devmem->ref);
1293error_percpu_ref:
1294 devres_free(devmem);
1295 return ERR_PTR(ret);
1296} 1104}
1297EXPORT_SYMBOL(hmm_devmem_add); 1105EXPORT_SYMBOL_GPL(hmm_devmem_add);
1298 1106
1299struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, 1107struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
1300 struct device *device, 1108 struct device *device,
1301 struct resource *res) 1109 struct resource *res)
1302{ 1110{
1303 struct hmm_devmem *devmem; 1111 struct hmm_devmem *devmem;
1112 void *result;
1304 int ret; 1113 int ret;
1305 1114
1306 if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) 1115 if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
@@ -1308,8 +1117,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
1308 1117
1309 dev_pagemap_get_ops(); 1118 dev_pagemap_get_ops();
1310 1119
1311 devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), 1120 devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
1312 GFP_KERNEL, dev_to_node(device));
1313 if (!devmem) 1121 if (!devmem)
1314 return ERR_PTR(-ENOMEM); 1122 return ERR_PTR(-ENOMEM);
1315 1123
@@ -1323,71 +1131,32 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
1323 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, 1131 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
1324 0, GFP_KERNEL); 1132 0, GFP_KERNEL);
1325 if (ret) 1133 if (ret)
1326 goto error_percpu_ref; 1134 return ERR_PTR(ret);
1327 1135
1328 ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); 1136 ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit,
1137 &devmem->ref);
1329 if (ret) 1138 if (ret)
1330 goto error_devm_add_action; 1139 return ERR_PTR(ret);
1331
1332 1140
1333 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; 1141 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
1334 devmem->pfn_last = devmem->pfn_first + 1142 devmem->pfn_last = devmem->pfn_first +
1335 (resource_size(devmem->resource) >> PAGE_SHIFT); 1143 (resource_size(devmem->resource) >> PAGE_SHIFT);
1144 devmem->page_fault = hmm_devmem_fault;
1336 1145
1337 ret = hmm_devmem_pages_create(devmem); 1146 devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
1338 if (ret) 1147 devmem->pagemap.res = *devmem->resource;
1339 goto error_devm_add_action; 1148 devmem->pagemap.page_free = hmm_devmem_free;
1340 1149 devmem->pagemap.altmap_valid = false;
1341 devres_add(device, devmem); 1150 devmem->pagemap.ref = &devmem->ref;
1342 1151 devmem->pagemap.data = devmem;
1343 ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); 1152 devmem->pagemap.kill = hmm_devmem_ref_kill;
1344 if (ret) {
1345 hmm_devmem_remove(devmem);
1346 return ERR_PTR(ret);
1347 }
1348 1153
1154 result = devm_memremap_pages(devmem->device, &devmem->pagemap);
1155 if (IS_ERR(result))
1156 return result;
1349 return devmem; 1157 return devmem;
1350
1351error_devm_add_action:
1352 hmm_devmem_ref_kill(&devmem->ref);
1353 hmm_devmem_ref_exit(&devmem->ref);
1354error_percpu_ref:
1355 devres_free(devmem);
1356 return ERR_PTR(ret);
1357}
1358EXPORT_SYMBOL(hmm_devmem_add_resource);
1359
1360/*
1361 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
1362 *
1363 * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory
1364 *
1365 * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf
1366 * of the device driver. It will free struct page and remove the resource that
1367 * reserved the physical address range for this device memory.
1368 */
1369void hmm_devmem_remove(struct hmm_devmem *devmem)
1370{
1371 resource_size_t start, size;
1372 struct device *device;
1373 bool cdm = false;
1374
1375 if (!devmem)
1376 return;
1377
1378 device = devmem->device;
1379 start = devmem->resource->start;
1380 size = resource_size(devmem->resource);
1381
1382 cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
1383 hmm_devmem_ref_kill(&devmem->ref);
1384 hmm_devmem_ref_exit(&devmem->ref);
1385 hmm_devmem_pages_remove(devmem);
1386
1387 if (!cdm)
1388 devm_release_mem_region(device, start, size);
1389} 1158}
1390EXPORT_SYMBOL(hmm_devmem_remove); 1159EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);
1391 1160
1392/* 1161/*
1393 * A device driver that wants to handle multiple devices memory through a 1162 * A device driver that wants to handle multiple devices memory through a
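
The overall effect of the hmm.c rework above: hmm_devmem_add() and hmm_devmem_add_resource() no longer open-code section hotplug through hmm_devmem_pages_create(); they fill in devmem->pagemap (including the new ->kill callback) and hand it to devm_memremap_pages(), and the separate hmm_devmem_remove() entry point is deleted because teardown is now device-managed. For a driver, the calling pattern shrinks roughly as follows (sketch only, error handling elided):

	/* before: explicit pairing was required */
	devmem = hmm_devmem_add(ops, device, size);
	/* ... use the ZONE_DEVICE pages ... */
	hmm_devmem_remove(devmem);

	/* after: no remove call; devres tears everything down when the
	 * device is unbound, via devmem->pagemap.kill and the
	 * devm_memremap_pages() machinery */
	devmem = hmm_devmem_add(ops, device, size);
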
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e84a10b0d310..cbd977b1d60d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -62,6 +62,16 @@ static struct shrinker deferred_split_shrinker;
62static atomic_t huge_zero_refcount; 62static atomic_t huge_zero_refcount;
63struct page *huge_zero_page __read_mostly; 63struct page *huge_zero_page __read_mostly;
64 64
65bool transparent_hugepage_enabled(struct vm_area_struct *vma)
66{
67 if (vma_is_anonymous(vma))
68 return __transparent_hugepage_enabled(vma);
69 if (vma_is_shmem(vma) && shmem_huge_enabled(vma))
70 return __transparent_hugepage_enabled(vma);
71
72 return false;
73}
74
65static struct page *get_huge_zero_page(void) 75static struct page *get_huge_zero_page(void)
66{ 76{
67 struct page *zero_page; 77 struct page *zero_page;
@@ -420,7 +430,7 @@ static int __init hugepage_init(void)
420 * where the extra memory used could hurt more than TLB overhead 430 * where the extra memory used could hurt more than TLB overhead
421 * is likely to save. The admin can still enable it through /sys. 431 * is likely to save. The admin can still enable it through /sys.
422 */ 432 */
423 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { 433 if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
424 transparent_hugepage_flags = 0; 434 transparent_hugepage_flags = 0;
425 return 0; 435 return 0;
426 } 436 }
@@ -1134,8 +1144,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
1134 int i; 1144 int i;
1135 vm_fault_t ret = 0; 1145 vm_fault_t ret = 0;
1136 struct page **pages; 1146 struct page **pages;
1137 unsigned long mmun_start; /* For mmu_notifiers */ 1147 struct mmu_notifier_range range;
1138 unsigned long mmun_end; /* For mmu_notifiers */
1139 1148
1140 pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), 1149 pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
1141 GFP_KERNEL); 1150 GFP_KERNEL);
@@ -1173,9 +1182,9 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
1173 cond_resched(); 1182 cond_resched();
1174 } 1183 }
1175 1184
1176 mmun_start = haddr; 1185 mmu_notifier_range_init(&range, vma->vm_mm, haddr,
1177 mmun_end = haddr + HPAGE_PMD_SIZE; 1186 haddr + HPAGE_PMD_SIZE);
1178 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); 1187 mmu_notifier_invalidate_range_start(&range);
1179 1188
1180 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); 1189 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1181 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) 1190 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
@@ -1220,8 +1229,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
1220 * No need to double call mmu_notifier->invalidate_range() callback as 1229 * No need to double call mmu_notifier->invalidate_range() callback as
1221 * the above pmdp_huge_clear_flush_notify() did already call it. 1230 * the above pmdp_huge_clear_flush_notify() did already call it.
1222 */ 1231 */
1223 mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, 1232 mmu_notifier_invalidate_range_only_end(&range);
1224 mmun_end);
1225 1233
1226 ret |= VM_FAULT_WRITE; 1234 ret |= VM_FAULT_WRITE;
1227 put_page(page); 1235 put_page(page);
@@ -1231,7 +1239,7 @@ out:
1231 1239
1232out_free_pages: 1240out_free_pages:
1233 spin_unlock(vmf->ptl); 1241 spin_unlock(vmf->ptl);
1234 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 1242 mmu_notifier_invalidate_range_end(&range);
1235 for (i = 0; i < HPAGE_PMD_NR; i++) { 1243 for (i = 0; i < HPAGE_PMD_NR; i++) {
1236 memcg = (void *)page_private(pages[i]); 1244 memcg = (void *)page_private(pages[i]);
1237 set_page_private(pages[i], 0); 1245 set_page_private(pages[i], 0);
@@ -1248,8 +1256,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
1248 struct page *page = NULL, *new_page; 1256 struct page *page = NULL, *new_page;
1249 struct mem_cgroup *memcg; 1257 struct mem_cgroup *memcg;
1250 unsigned long haddr = vmf->address & HPAGE_PMD_MASK; 1258 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1251 unsigned long mmun_start; /* For mmu_notifiers */ 1259 struct mmu_notifier_range range;
1252 unsigned long mmun_end; /* For mmu_notifiers */
1253 gfp_t huge_gfp; /* for allocation and charge */ 1260 gfp_t huge_gfp; /* for allocation and charge */
1254 vm_fault_t ret = 0; 1261 vm_fault_t ret = 0;
1255 1262
@@ -1293,7 +1300,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
1293 get_page(page); 1300 get_page(page);
1294 spin_unlock(vmf->ptl); 1301 spin_unlock(vmf->ptl);
1295alloc: 1302alloc:
1296 if (transparent_hugepage_enabled(vma) && 1303 if (__transparent_hugepage_enabled(vma) &&
1297 !transparent_hugepage_debug_cow()) { 1304 !transparent_hugepage_debug_cow()) {
1298 huge_gfp = alloc_hugepage_direct_gfpmask(vma); 1305 huge_gfp = alloc_hugepage_direct_gfpmask(vma);
1299 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); 1306 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
@@ -1338,9 +1345,9 @@ alloc:
1338 vma, HPAGE_PMD_NR); 1345 vma, HPAGE_PMD_NR);
1339 __SetPageUptodate(new_page); 1346 __SetPageUptodate(new_page);
1340 1347
1341 mmun_start = haddr; 1348 mmu_notifier_range_init(&range, vma->vm_mm, haddr,
1342 mmun_end = haddr + HPAGE_PMD_SIZE; 1349 haddr + HPAGE_PMD_SIZE);
1343 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); 1350 mmu_notifier_invalidate_range_start(&range);
1344 1351
1345 spin_lock(vmf->ptl); 1352 spin_lock(vmf->ptl);
1346 if (page) 1353 if (page)
@@ -1375,8 +1382,7 @@ out_mn:
1375 * No need to double call mmu_notifier->invalidate_range() callback as 1382 * No need to double call mmu_notifier->invalidate_range() callback as
1376 * the above pmdp_huge_clear_flush_notify() did already call it. 1383 * the above pmdp_huge_clear_flush_notify() did already call it.
1377 */ 1384 */
1378 mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, 1385 mmu_notifier_invalidate_range_only_end(&range);
1379 mmun_end);
1380out: 1386out:
1381 return ret; 1387 return ret;
1382out_unlock: 1388out_unlock:
@@ -1490,8 +1496,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1490 if (!get_page_unless_zero(page)) 1496 if (!get_page_unless_zero(page))
1491 goto out_unlock; 1497 goto out_unlock;
1492 spin_unlock(vmf->ptl); 1498 spin_unlock(vmf->ptl);
1493 wait_on_page_locked(page); 1499 put_and_wait_on_page_locked(page);
1494 put_page(page);
1495 goto out; 1500 goto out;
1496 } 1501 }
1497 1502
@@ -1527,8 +1532,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1527 if (!get_page_unless_zero(page)) 1532 if (!get_page_unless_zero(page))
1528 goto out_unlock; 1533 goto out_unlock;
1529 spin_unlock(vmf->ptl); 1534 spin_unlock(vmf->ptl);
1530 wait_on_page_locked(page); 1535 put_and_wait_on_page_locked(page);
1531 put_page(page);
1532 goto out; 1536 goto out;
1533 } 1537 }
1534 1538
@@ -2017,14 +2021,15 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2017 unsigned long address) 2021 unsigned long address)
2018{ 2022{
2019 spinlock_t *ptl; 2023 spinlock_t *ptl;
2020 struct mm_struct *mm = vma->vm_mm; 2024 struct mmu_notifier_range range;
2021 unsigned long haddr = address & HPAGE_PUD_MASK;
2022 2025
2023 mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE); 2026 mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK,
2024 ptl = pud_lock(mm, pud); 2027 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2028 mmu_notifier_invalidate_range_start(&range);
2029 ptl = pud_lock(vma->vm_mm, pud);
2025 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) 2030 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2026 goto out; 2031 goto out;
2027 __split_huge_pud_locked(vma, pud, haddr); 2032 __split_huge_pud_locked(vma, pud, range.start);
2028 2033
2029out: 2034out:
2030 spin_unlock(ptl); 2035 spin_unlock(ptl);
@@ -2032,8 +2037,7 @@ out:
2032 * No need to double call mmu_notifier->invalidate_range() callback as 2037 * No need to double call mmu_notifier->invalidate_range() callback as
2033 * the above pudp_huge_clear_flush_notify() did already call it. 2038 * the above pudp_huge_clear_flush_notify() did already call it.
2034 */ 2039 */
2035 mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + 2040 mmu_notifier_invalidate_range_only_end(&range);
2036 HPAGE_PUD_SIZE);
2037} 2041}
2038#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2042#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2039 2043
@@ -2235,11 +2239,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2235 unsigned long address, bool freeze, struct page *page) 2239 unsigned long address, bool freeze, struct page *page)
2236{ 2240{
2237 spinlock_t *ptl; 2241 spinlock_t *ptl;
2238 struct mm_struct *mm = vma->vm_mm; 2242 struct mmu_notifier_range range;
2239 unsigned long haddr = address & HPAGE_PMD_MASK;
2240 2243
2241 mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); 2244 mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK,
2242 ptl = pmd_lock(mm, pmd); 2245 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2246 mmu_notifier_invalidate_range_start(&range);
2247 ptl = pmd_lock(vma->vm_mm, pmd);
2243 2248
2244 /* 2249 /*
2245 * If caller asks to setup a migration entries, we need a page to check 2250 * If caller asks to setup a migration entries, we need a page to check
@@ -2255,7 +2260,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2255 clear_page_mlock(page); 2260 clear_page_mlock(page);
2256 } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) 2261 } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
2257 goto out; 2262 goto out;
2258 __split_huge_pmd_locked(vma, pmd, haddr, freeze); 2263 __split_huge_pmd_locked(vma, pmd, range.start, freeze);
2259out: 2264out:
2260 spin_unlock(ptl); 2265 spin_unlock(ptl);
2261 /* 2266 /*
@@ -2271,8 +2276,7 @@ out:
2271 * any further changes to individual pte will notify. So no need 2276 * any further changes to individual pte will notify. So no need
2272 * to call mmu_notifier->invalidate_range() 2277 * to call mmu_notifier->invalidate_range()
2273 */ 2278 */
2274 mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + 2279 mmu_notifier_invalidate_range_only_end(&range);
2275 HPAGE_PMD_SIZE);
2276} 2280}
2277 2281
2278void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 2282void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
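
A pattern that repeats through the huge_memory.c hunks above and the hugetlb.c hunks below: the pair of local mmun_start/mmun_end variables passed to every mmu notifier call is replaced by a struct mmu_notifier_range initialized once. Reduced to a before/after sketch (kernel context assumed):

	/* old */
	unsigned long mmun_start = haddr;
	unsigned long mmun_end = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
	/* ... */
	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);

	/* new */
	struct mmu_notifier_range range;
	mmu_notifier_range_init(&range, vma->vm_mm, haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	/* ... */
	mmu_notifier_invalidate_range_end(&range);
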
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a80832487981..e37efd5d8318 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3238,24 +3238,35 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3238 struct page *ptepage; 3238 struct page *ptepage;
3239 unsigned long addr; 3239 unsigned long addr;
3240 int cow; 3240 int cow;
3241 struct address_space *mapping = vma->vm_file->f_mapping;
3241 struct hstate *h = hstate_vma(vma); 3242 struct hstate *h = hstate_vma(vma);
3242 unsigned long sz = huge_page_size(h); 3243 unsigned long sz = huge_page_size(h);
3243 unsigned long mmun_start; /* For mmu_notifiers */ 3244 struct mmu_notifier_range range;
3244 unsigned long mmun_end; /* For mmu_notifiers */
3245 int ret = 0; 3245 int ret = 0;
3246 3246
3247 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 3247 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
3248 3248
3249 mmun_start = vma->vm_start; 3249 if (cow) {
3250 mmun_end = vma->vm_end; 3250 mmu_notifier_range_init(&range, src, vma->vm_start,
3251 if (cow) 3251 vma->vm_end);
3252 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); 3252 mmu_notifier_invalidate_range_start(&range);
3253 } else {
3254 /*
3255 * For shared mappings i_mmap_rwsem must be held to call
3256 * huge_pte_alloc, otherwise the returned ptep could go
3257 * away if part of a shared pmd and another thread calls
3258 * huge_pmd_unshare.
3259 */
3260 i_mmap_lock_read(mapping);
3261 }
3253 3262
3254 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 3263 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
3255 spinlock_t *src_ptl, *dst_ptl; 3264 spinlock_t *src_ptl, *dst_ptl;
3265
3256 src_pte = huge_pte_offset(src, addr, sz); 3266 src_pte = huge_pte_offset(src, addr, sz);
3257 if (!src_pte) 3267 if (!src_pte)
3258 continue; 3268 continue;
3269
3259 dst_pte = huge_pte_alloc(dst, addr, sz); 3270 dst_pte = huge_pte_alloc(dst, addr, sz);
3260 if (!dst_pte) { 3271 if (!dst_pte) {
3261 ret = -ENOMEM; 3272 ret = -ENOMEM;
@@ -3325,7 +3336,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3325 } 3336 }
3326 3337
3327 if (cow) 3338 if (cow)
3328 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); 3339 mmu_notifier_invalidate_range_end(&range);
3340 else
3341 i_mmap_unlock_read(mapping);
3329 3342
3330 return ret; 3343 return ret;
3331} 3344}
@@ -3342,8 +3355,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
3342 struct page *page; 3355 struct page *page;
3343 struct hstate *h = hstate_vma(vma); 3356 struct hstate *h = hstate_vma(vma);
3344 unsigned long sz = huge_page_size(h); 3357 unsigned long sz = huge_page_size(h);
3345 unsigned long mmun_start = start; /* For mmu_notifiers */ 3358 struct mmu_notifier_range range;
3346 unsigned long mmun_end = end; /* For mmu_notifiers */
3347 3359
3348 WARN_ON(!is_vm_hugetlb_page(vma)); 3360 WARN_ON(!is_vm_hugetlb_page(vma));
3349 BUG_ON(start & ~huge_page_mask(h)); 3361 BUG_ON(start & ~huge_page_mask(h));
@@ -3359,8 +3371,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
3359 /* 3371 /*
3360 * If sharing possible, alert mmu notifiers of worst case. 3372 * If sharing possible, alert mmu notifiers of worst case.
3361 */ 3373 */
3362 adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end); 3374 mmu_notifier_range_init(&range, mm, start, end);
3363 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 3375 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
3376 mmu_notifier_invalidate_range_start(&range);
3364 address = start; 3377 address = start;
3365 for (; address < end; address += sz) { 3378 for (; address < end; address += sz) {
3366 ptep = huge_pte_offset(mm, address, sz); 3379 ptep = huge_pte_offset(mm, address, sz);
@@ -3428,7 +3441,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
3428 if (ref_page) 3441 if (ref_page)
3429 break; 3442 break;
3430 } 3443 }
3431 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 3444 mmu_notifier_invalidate_range_end(&range);
3432 tlb_end_vma(tlb, vma); 3445 tlb_end_vma(tlb, vma);
3433} 3446}
3434 3447
@@ -3546,9 +3559,8 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
3546 struct page *old_page, *new_page; 3559 struct page *old_page, *new_page;
3547 int outside_reserve = 0; 3560 int outside_reserve = 0;
3548 vm_fault_t ret = 0; 3561 vm_fault_t ret = 0;
3549 unsigned long mmun_start; /* For mmu_notifiers */
3550 unsigned long mmun_end; /* For mmu_notifiers */
3551 unsigned long haddr = address & huge_page_mask(h); 3562 unsigned long haddr = address & huge_page_mask(h);
3563 struct mmu_notifier_range range;
3552 3564
3553 pte = huge_ptep_get(ptep); 3565 pte = huge_ptep_get(ptep);
3554 old_page = pte_page(pte); 3566 old_page = pte_page(pte);
@@ -3627,9 +3639,8 @@ retry_avoidcopy:
3627 __SetPageUptodate(new_page); 3639 __SetPageUptodate(new_page);
3628 set_page_huge_active(new_page); 3640 set_page_huge_active(new_page);
3629 3641
3630 mmun_start = haddr; 3642 mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
3631 mmun_end = mmun_start + huge_page_size(h); 3643 mmu_notifier_invalidate_range_start(&range);
3632 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3633 3644
3634 /* 3645 /*
3635 * Retake the page table lock to check for racing updates 3646 * Retake the page table lock to check for racing updates
@@ -3642,7 +3653,7 @@ retry_avoidcopy:
3642 3653
3643 /* Break COW */ 3654 /* Break COW */
3644 huge_ptep_clear_flush(vma, haddr, ptep); 3655 huge_ptep_clear_flush(vma, haddr, ptep);
3645 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); 3656 mmu_notifier_invalidate_range(mm, range.start, range.end);
3646 set_huge_pte_at(mm, haddr, ptep, 3657 set_huge_pte_at(mm, haddr, ptep,
3647 make_huge_pte(vma, new_page, 1)); 3658 make_huge_pte(vma, new_page, 1));
3648 page_remove_rmap(old_page, true); 3659 page_remove_rmap(old_page, true);
@@ -3651,7 +3662,7 @@ retry_avoidcopy:
3651 new_page = old_page; 3662 new_page = old_page;
3652 } 3663 }
3653 spin_unlock(ptl); 3664 spin_unlock(ptl);
3654 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 3665 mmu_notifier_invalidate_range_end(&range);
3655out_release_all: 3666out_release_all:
3656 restore_reserve_on_error(h, vma, haddr, new_page); 3667 restore_reserve_on_error(h, vma, haddr, new_page);
3657 put_page(new_page); 3668 put_page(new_page);
@@ -3744,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
3744 } 3755 }
3745 3756
3746 /* 3757 /*
3747 * Use page lock to guard against racing truncation 3758 * We can not race with truncation due to holding i_mmap_rwsem.
3748 * before we get page_table_lock. 3759 * Check once here for faults beyond end of file.
3749 */ 3760 */
3761 size = i_size_read(mapping->host) >> huge_page_shift(h);
3762 if (idx >= size)
3763 goto out;
3764
3750retry: 3765retry:
3751 page = find_lock_page(mapping, idx); 3766 page = find_lock_page(mapping, idx);
3752 if (!page) { 3767 if (!page) {
3753 size = i_size_read(mapping->host) >> huge_page_shift(h);
3754 if (idx >= size)
3755 goto out;
3756
3757 /* 3768 /*
3758 * Check for page in userfault range 3769 * Check for page in userfault range
3759 */ 3770 */
@@ -3773,14 +3784,18 @@ retry:
3773 }; 3784 };
3774 3785
3775 /* 3786 /*
3776 * hugetlb_fault_mutex must be dropped before 3787 * hugetlb_fault_mutex and i_mmap_rwsem must be
3777 * handling userfault. Reacquire after handling 3788 * dropped before handling userfault. Reacquire
3778 * fault to make calling code simpler. 3789 * after handling fault to make calling code simpler.
3779 */ 3790 */
3780 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, 3791 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
3781 idx, haddr); 3792 idx, haddr);
3782 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 3793 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
3794 i_mmap_unlock_read(mapping);
3795
3783 ret = handle_userfault(&vmf, VM_UFFD_MISSING); 3796 ret = handle_userfault(&vmf, VM_UFFD_MISSING);
3797
3798 i_mmap_lock_read(mapping);
3784 mutex_lock(&hugetlb_fault_mutex_table[hash]); 3799 mutex_lock(&hugetlb_fault_mutex_table[hash]);
3785 goto out; 3800 goto out;
3786 } 3801 }
@@ -3839,9 +3854,6 @@ retry:
3839 } 3854 }
3840 3855
3841 ptl = huge_pte_lock(h, mm, ptep); 3856 ptl = huge_pte_lock(h, mm, ptep);
3842 size = i_size_read(mapping->host) >> huge_page_shift(h);
3843 if (idx >= size)
3844 goto backout;
3845 3857
3846 ret = 0; 3858 ret = 0;
3847 if (!huge_pte_none(huge_ptep_get(ptep))) 3859 if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3928,6 +3940,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3928 3940
3929 ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); 3941 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
3930 if (ptep) { 3942 if (ptep) {
3943 /*
3944 * Since we hold no locks, ptep could be stale. That is
3945 * OK as we are only making decisions based on content and
3946 * not actually modifying content here.
3947 */
3931 entry = huge_ptep_get(ptep); 3948 entry = huge_ptep_get(ptep);
3932 if (unlikely(is_hugetlb_entry_migration(entry))) { 3949 if (unlikely(is_hugetlb_entry_migration(entry))) {
3933 migration_entry_wait_huge(vma, mm, ptep); 3950 migration_entry_wait_huge(vma, mm, ptep);
@@ -3935,20 +3952,33 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3935 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 3952 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
3936 return VM_FAULT_HWPOISON_LARGE | 3953 return VM_FAULT_HWPOISON_LARGE |
3937 VM_FAULT_SET_HINDEX(hstate_index(h)); 3954 VM_FAULT_SET_HINDEX(hstate_index(h));
3938 } else {
3939 ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
3940 if (!ptep)
3941 return VM_FAULT_OOM;
3942 } 3955 }
3943 3956
3957 /*
3958 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
3959 * until finished with ptep. This serves two purposes:
3960 * 1) It prevents huge_pmd_unshare from being called elsewhere
3961 * and making the ptep no longer valid.
3962 * 2) It synchronizes us with file truncation.
3963 *
3964 * ptep could have already be assigned via huge_pte_offset. That
3965 * is OK, as huge_pte_alloc will return the same value unless
3966 * something changed.
3967 */
3944 mapping = vma->vm_file->f_mapping; 3968 mapping = vma->vm_file->f_mapping;
3945 idx = vma_hugecache_offset(h, vma, haddr); 3969 i_mmap_lock_read(mapping);
3970 ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
3971 if (!ptep) {
3972 i_mmap_unlock_read(mapping);
3973 return VM_FAULT_OOM;
3974 }
3946 3975
3947 /* 3976 /*
3948 * Serialize hugepage allocation and instantiation, so that we don't 3977 * Serialize hugepage allocation and instantiation, so that we don't
3949 * get spurious allocation failures if two CPUs race to instantiate 3978 * get spurious allocation failures if two CPUs race to instantiate
3950 * the same page in the page cache. 3979 * the same page in the page cache.
3951 */ 3980 */
3981 idx = vma_hugecache_offset(h, vma, haddr);
3952 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); 3982 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
3953 mutex_lock(&hugetlb_fault_mutex_table[hash]); 3983 mutex_lock(&hugetlb_fault_mutex_table[hash]);
3954 3984
@@ -4036,6 +4066,7 @@ out_ptl:
4036 } 4066 }
4037out_mutex: 4067out_mutex:
4038 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 4068 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4069 i_mmap_unlock_read(mapping);
4039 /* 4070 /*
4040 * Generally it's safe to hold refcount during waiting page lock. But 4071 * Generally it's safe to hold refcount during waiting page lock. But
4041 * here we just wait to defer the next page fault to avoid busy loop and 4072 * here we just wait to defer the next page fault to avoid busy loop and
@@ -4340,21 +4371,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
4340 pte_t pte; 4371 pte_t pte;
4341 struct hstate *h = hstate_vma(vma); 4372 struct hstate *h = hstate_vma(vma);
4342 unsigned long pages = 0; 4373 unsigned long pages = 0;
4343 unsigned long f_start = start;
4344 unsigned long f_end = end;
4345 bool shared_pmd = false; 4374 bool shared_pmd = false;
4375 struct mmu_notifier_range range;
4346 4376
4347 /* 4377 /*
4348 * In the case of shared PMDs, the area to flush could be beyond 4378 * In the case of shared PMDs, the area to flush could be beyond
4349 * start/end. Set f_start/f_end to cover the maximum possible 4379 * start/end. Set range.start/range.end to cover the maximum possible
4350 * range if PMD sharing is possible. 4380 * range if PMD sharing is possible.
4351 */ 4381 */
4352 adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end); 4382 mmu_notifier_range_init(&range, mm, start, end);
4383 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
4353 4384
4354 BUG_ON(address >= end); 4385 BUG_ON(address >= end);
4355 flush_cache_range(vma, f_start, f_end); 4386 flush_cache_range(vma, range.start, range.end);
4356 4387
4357 mmu_notifier_invalidate_range_start(mm, f_start, f_end); 4388 mmu_notifier_invalidate_range_start(&range);
4358 i_mmap_lock_write(vma->vm_file->f_mapping); 4389 i_mmap_lock_write(vma->vm_file->f_mapping);
4359 for (; address < end; address += huge_page_size(h)) { 4390 for (; address < end; address += huge_page_size(h)) {
4360 spinlock_t *ptl; 4391 spinlock_t *ptl;
@@ -4405,7 +4436,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
4405 * did unshare a page of pmds, flush the range corresponding to the pud. 4436 * did unshare a page of pmds, flush the range corresponding to the pud.
4406 */ 4437 */
4407 if (shared_pmd) 4438 if (shared_pmd)
4408 flush_hugetlb_tlb_range(vma, f_start, f_end); 4439 flush_hugetlb_tlb_range(vma, range.start, range.end);
4409 else 4440 else
4410 flush_hugetlb_tlb_range(vma, start, end); 4441 flush_hugetlb_tlb_range(vma, start, end);
4411 /* 4442 /*
@@ -4415,7 +4446,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
4415 * See Documentation/vm/mmu_notifier.rst 4446 * See Documentation/vm/mmu_notifier.rst
4416 */ 4447 */
4417 i_mmap_unlock_write(vma->vm_file->f_mapping); 4448 i_mmap_unlock_write(vma->vm_file->f_mapping);
4418 mmu_notifier_invalidate_range_end(mm, f_start, f_end); 4449 mmu_notifier_invalidate_range_end(&range);
4419 4450
4420 return pages << h->order; 4451 return pages << h->order;
4421} 4452}
@@ -4640,10 +4671,12 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
4640 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() 4671 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
4641 * and returns the corresponding pte. While this is not necessary for the 4672 * and returns the corresponding pte. While this is not necessary for the
4642 * !shared pmd case because we can allocate the pmd later as well, it makes the 4673 * !shared pmd case because we can allocate the pmd later as well, it makes the
4643 * code much cleaner. pmd allocation is essential for the shared case because 4674 * code much cleaner.
4644 * pud has to be populated inside the same i_mmap_rwsem section - otherwise 4675 *
4645 * racing tasks could either miss the sharing (see huge_pte_offset) or select a 4676 * This routine must be called with i_mmap_rwsem held in at least read mode.
4646 * bad pmd for sharing. 4677 * For hugetlbfs, this prevents removal of any page table entries associated
4678 * with the address space. This is important as we are setting up sharing
4679 * based on existing page table entries (mappings).
4647 */ 4680 */
4648pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 4681pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
4649{ 4682{
@@ -4660,7 +4693,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
4660 if (!vma_shareable(vma, addr)) 4693 if (!vma_shareable(vma, addr))
4661 return (pte_t *)pmd_alloc(mm, pud, addr); 4694 return (pte_t *)pmd_alloc(mm, pud, addr);
4662 4695
4663 i_mmap_lock_write(mapping);
4664 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 4696 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
4665 if (svma == vma) 4697 if (svma == vma)
4666 continue; 4698 continue;
@@ -4690,7 +4722,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
4690 spin_unlock(ptl); 4722 spin_unlock(ptl);
4691out: 4723out:
4692 pte = (pte_t *)pmd_alloc(mm, pud, addr); 4724 pte = (pte_t *)pmd_alloc(mm, pud, addr);
4693 i_mmap_unlock_write(mapping);
4694 return pte; 4725 return pte;
4695} 4726}
4696 4727
@@ -4701,7 +4732,7 @@ out:
4701 * indicated by page_count > 1, unmap is achieved by clearing pud and 4732 * indicated by page_count > 1, unmap is achieved by clearing pud and
4702 * decrementing the ref count. If count == 1, the pte page is not shared. 4733 * decrementing the ref count. If count == 1, the pte page is not shared.
4703 * 4734 *
4704 * called with page table lock held. 4735 * Called with page table lock held and i_mmap_rwsem held in write mode.
4705 * 4736 *
4706 * returns: 1 successfully unmapped a shared pte page 4737 * returns: 1 successfully unmapped a shared pte page
4707 * 0 the underlying pte page is not shared, or it is the last user 4738 * 0 the underlying pte page is not shared, or it is the last user
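The hugetlb.c hunks above establish a lock ordering for the fault path: i_mmap_rwsem is taken in read mode before the per-index fault mutex, and both are dropped, in reverse order, around handle_userfault() and then retaken. Below is a minimal userspace sketch of that ordering only; the pthread locks, hugetlb_fault_model() and need_userfault are illustrative stand-ins, not the kernel primitives.

/* Model of the i_mmap_rwsem / fault-mutex ordering added above. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t i_mmap_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t fault_mutex = PTHREAD_MUTEX_INITIALIZER;

static void handle_userfault(void)
{
	/* Runs with neither fault-path lock held, as in the diff above. */
	puts("userfault handled with both locks dropped");
}

static void hugetlb_fault_model(int need_userfault)
{
	pthread_rwlock_rdlock(&i_mmap_rwsem);	/* 1: keeps page tables stable */
	pthread_mutex_lock(&fault_mutex);	/* 2: serializes instantiation */

	if (need_userfault) {
		/* Drop in reverse order before calling out, then retake. */
		pthread_mutex_unlock(&fault_mutex);
		pthread_rwlock_unlock(&i_mmap_rwsem);
		handle_userfault();
		pthread_rwlock_rdlock(&i_mmap_rwsem);
		pthread_mutex_lock(&fault_mutex);
	}

	pthread_mutex_unlock(&fault_mutex);
	pthread_rwlock_unlock(&i_mmap_rwsem);
}

int main(void)
{
	hugetlb_fault_model(1);
	return 0;
}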
diff --git a/mm/internal.h b/mm/internal.h
index 291eb2b6d1d8..f4a7bb02decf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -444,6 +444,16 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
444#define NODE_RECLAIM_SOME 0 444#define NODE_RECLAIM_SOME 0
445#define NODE_RECLAIM_SUCCESS 1 445#define NODE_RECLAIM_SUCCESS 1
446 446
447#ifdef CONFIG_NUMA
448extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
449#else
450static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
451 unsigned int order)
452{
453 return NODE_RECLAIM_NOSCAN;
454}
455#endif
456
447extern int hwpoison_filter(struct page *p); 457extern int hwpoison_filter(struct page *p);
448 458
449extern u32 hwpoison_filter_dev_major; 459extern u32 hwpoison_filter_dev_major;
@@ -480,10 +490,16 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
480#define ALLOC_OOM ALLOC_NO_WATERMARKS 490#define ALLOC_OOM ALLOC_NO_WATERMARKS
481#endif 491#endif
482 492
483#define ALLOC_HARDER 0x10 /* try to alloc harder */ 493#define ALLOC_HARDER 0x10 /* try to alloc harder */
484#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 494#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
485#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 495#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
486#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ 496#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
497#ifdef CONFIG_ZONE_DMA32
498#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */
499#else
500#define ALLOC_NOFRAGMENT 0x0
501#endif
502#define ALLOC_KSWAPD 0x200 /* allow waking of kswapd */
487 503
488enum ttu_flags; 504enum ttu_flags;
489struct tlbflush_unmap_batch; 505struct tlbflush_unmap_batch;
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 3289db38bc87..0a14fcff70ed 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,11 +1,18 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2KASAN_SANITIZE := n 2KASAN_SANITIZE := n
3UBSAN_SANITIZE_kasan.o := n 3UBSAN_SANITIZE_common.o := n
4UBSAN_SANITIZE_generic.o := n
5UBSAN_SANITIZE_tags.o := n
4KCOV_INSTRUMENT := n 6KCOV_INSTRUMENT := n
5 7
6CFLAGS_REMOVE_kasan.o = -pg 8CFLAGS_REMOVE_generic.o = -pg
7# Function splitter causes unnecessary splits in __asan_load1/__asan_store1 9# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
8# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 10# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
9CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
10 11
11obj-y := kasan.o report.o kasan_init.o quarantine.o 12CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
13CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
14CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
15
16obj-$(CONFIG_KASAN) := common.o init.o report.o
17obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o
18obj-$(CONFIG_KASAN_SW_TAGS) += tags.o tags_report.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/common.c
index c3bd5209da38..03d5d1374ca7 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/common.c
@@ -1,5 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * This file contains shadow memory manipulation code. 3 * This file contains common generic and tag-based KASAN code.
3 * 4 *
4 * Copyright (c) 2014 Samsung Electronics Co., Ltd. 5 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
5 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> 6 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
@@ -13,9 +14,6 @@
13 * 14 *
14 */ 15 */
15 16
16#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
17#define DISABLE_BRANCH_PROFILING
18
19#include <linux/export.h> 17#include <linux/export.h>
20#include <linux/interrupt.h> 18#include <linux/interrupt.h>
21#include <linux/init.h> 19#include <linux/init.h>
@@ -40,6 +38,53 @@
40#include "kasan.h" 38#include "kasan.h"
41#include "../slab.h" 39#include "../slab.h"
42 40
41static inline int in_irqentry_text(unsigned long ptr)
42{
43 return (ptr >= (unsigned long)&__irqentry_text_start &&
44 ptr < (unsigned long)&__irqentry_text_end) ||
45 (ptr >= (unsigned long)&__softirqentry_text_start &&
46 ptr < (unsigned long)&__softirqentry_text_end);
47}
48
49static inline void filter_irq_stacks(struct stack_trace *trace)
50{
51 int i;
52
53 if (!trace->nr_entries)
54 return;
55 for (i = 0; i < trace->nr_entries; i++)
56 if (in_irqentry_text(trace->entries[i])) {
57 /* Include the irqentry function into the stack. */
58 trace->nr_entries = i + 1;
59 break;
60 }
61}
62
63static inline depot_stack_handle_t save_stack(gfp_t flags)
64{
65 unsigned long entries[KASAN_STACK_DEPTH];
66 struct stack_trace trace = {
67 .nr_entries = 0,
68 .entries = entries,
69 .max_entries = KASAN_STACK_DEPTH,
70 .skip = 0
71 };
72
73 save_stack_trace(&trace);
74 filter_irq_stacks(&trace);
75 if (trace.nr_entries != 0 &&
76 trace.entries[trace.nr_entries-1] == ULONG_MAX)
77 trace.nr_entries--;
78
79 return depot_save_stack(&trace, flags);
80}
81
82static inline void set_track(struct kasan_track *track, gfp_t flags)
83{
84 track->pid = current->pid;
85 track->stack = save_stack(flags);
86}
87
43void kasan_enable_current(void) 88void kasan_enable_current(void)
44{ 89{
45 current->kasan_depth++; 90 current->kasan_depth++;
@@ -50,27 +95,85 @@ void kasan_disable_current(void)
50 current->kasan_depth--; 95 current->kasan_depth--;
51} 96}
52 97
98void kasan_check_read(const volatile void *p, unsigned int size)
99{
100 check_memory_region((unsigned long)p, size, false, _RET_IP_);
101}
102EXPORT_SYMBOL(kasan_check_read);
103
104void kasan_check_write(const volatile void *p, unsigned int size)
105{
106 check_memory_region((unsigned long)p, size, true, _RET_IP_);
107}
108EXPORT_SYMBOL(kasan_check_write);
109
110#undef memset
111void *memset(void *addr, int c, size_t len)
112{
113 check_memory_region((unsigned long)addr, len, true, _RET_IP_);
114
115 return __memset(addr, c, len);
116}
117
118#undef memmove
119void *memmove(void *dest, const void *src, size_t len)
120{
121 check_memory_region((unsigned long)src, len, false, _RET_IP_);
122 check_memory_region((unsigned long)dest, len, true, _RET_IP_);
123
124 return __memmove(dest, src, len);
125}
126
127#undef memcpy
128void *memcpy(void *dest, const void *src, size_t len)
129{
130 check_memory_region((unsigned long)src, len, false, _RET_IP_);
131 check_memory_region((unsigned long)dest, len, true, _RET_IP_);
132
133 return __memcpy(dest, src, len);
134}
135
53/* 136/*
54 * Poisons the shadow memory for 'size' bytes starting from 'addr'. 137 * Poisons the shadow memory for 'size' bytes starting from 'addr'.
55 * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. 138 * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
56 */ 139 */
57static void kasan_poison_shadow(const void *address, size_t size, u8 value) 140void kasan_poison_shadow(const void *address, size_t size, u8 value)
58{ 141{
59 void *shadow_start, *shadow_end; 142 void *shadow_start, *shadow_end;
60 143
144 /*
145 * Perform shadow offset calculation based on untagged address, as
146 * some of the callers (e.g. kasan_poison_object_data) pass tagged
147 * addresses to this function.
148 */
149 address = reset_tag(address);
150
61 shadow_start = kasan_mem_to_shadow(address); 151 shadow_start = kasan_mem_to_shadow(address);
62 shadow_end = kasan_mem_to_shadow(address + size); 152 shadow_end = kasan_mem_to_shadow(address + size);
63 153
64 memset(shadow_start, value, shadow_end - shadow_start); 154 __memset(shadow_start, value, shadow_end - shadow_start);
65} 155}
66 156
67void kasan_unpoison_shadow(const void *address, size_t size) 157void kasan_unpoison_shadow(const void *address, size_t size)
68{ 158{
69 kasan_poison_shadow(address, size, 0); 159 u8 tag = get_tag(address);
160
161 /*
162 * Perform shadow offset calculation based on untagged address, as
163 * some of the callers (e.g. kasan_unpoison_object_data) pass tagged
164 * addresses to this function.
165 */
166 address = reset_tag(address);
167
168 kasan_poison_shadow(address, size, tag);
70 169
71 if (size & KASAN_SHADOW_MASK) { 170 if (size & KASAN_SHADOW_MASK) {
72 u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); 171 u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
73 *shadow = size & KASAN_SHADOW_MASK; 172
173 if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
174 *shadow = tag;
175 else
176 *shadow = size & KASAN_SHADOW_MASK;
74 } 177 }
75} 178}
76 179
@@ -116,199 +219,18 @@ void kasan_unpoison_stack_above_sp_to(const void *watermark)
116 kasan_unpoison_shadow(sp, size); 219 kasan_unpoison_shadow(sp, size);
117} 220}
118 221
119/* 222void kasan_alloc_pages(struct page *page, unsigned int order)
120 * All functions below always inlined so compiler could
121 * perform better optimizations in each of __asan_loadX/__assn_storeX
122 * depending on memory access size X.
123 */
124
125static __always_inline bool memory_is_poisoned_1(unsigned long addr)
126{
127 s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
128
129 if (unlikely(shadow_value)) {
130 s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
131 return unlikely(last_accessible_byte >= shadow_value);
132 }
133
134 return false;
135}
136
137static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
138 unsigned long size)
139{
140 u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
141
142 /*
143 * Access crosses 8(shadow size)-byte boundary. Such access maps
144 * into 2 shadow bytes, so we need to check them both.
145 */
146 if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
147 return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
148
149 return memory_is_poisoned_1(addr + size - 1);
150}
151
152static __always_inline bool memory_is_poisoned_16(unsigned long addr)
153{
154 u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
155
156 /* Unaligned 16-bytes access maps into 3 shadow bytes. */
157 if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
158 return *shadow_addr || memory_is_poisoned_1(addr + 15);
159
160 return *shadow_addr;
161}
162
163static __always_inline unsigned long bytes_is_nonzero(const u8 *start,
164 size_t size)
165{
166 while (size) {
167 if (unlikely(*start))
168 return (unsigned long)start;
169 start++;
170 size--;
171 }
172
173 return 0;
174}
175
176static __always_inline unsigned long memory_is_nonzero(const void *start,
177 const void *end)
178{
179 unsigned int words;
180 unsigned long ret;
181 unsigned int prefix = (unsigned long)start % 8;
182
183 if (end - start <= 16)
184 return bytes_is_nonzero(start, end - start);
185
186 if (prefix) {
187 prefix = 8 - prefix;
188 ret = bytes_is_nonzero(start, prefix);
189 if (unlikely(ret))
190 return ret;
191 start += prefix;
192 }
193
194 words = (end - start) / 8;
195 while (words) {
196 if (unlikely(*(u64 *)start))
197 return bytes_is_nonzero(start, 8);
198 start += 8;
199 words--;
200 }
201
202 return bytes_is_nonzero(start, (end - start) % 8);
203}
204
205static __always_inline bool memory_is_poisoned_n(unsigned long addr,
206 size_t size)
207{
208 unsigned long ret;
209
210 ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
211 kasan_mem_to_shadow((void *)addr + size - 1) + 1);
212
213 if (unlikely(ret)) {
214 unsigned long last_byte = addr + size - 1;
215 s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
216
217 if (unlikely(ret != (unsigned long)last_shadow ||
218 ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
219 return true;
220 }
221 return false;
222}
223
224static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
225{
226 if (__builtin_constant_p(size)) {
227 switch (size) {
228 case 1:
229 return memory_is_poisoned_1(addr);
230 case 2:
231 case 4:
232 case 8:
233 return memory_is_poisoned_2_4_8(addr, size);
234 case 16:
235 return memory_is_poisoned_16(addr);
236 default:
237 BUILD_BUG();
238 }
239 }
240
241 return memory_is_poisoned_n(addr, size);
242}
243
244static __always_inline void check_memory_region_inline(unsigned long addr,
245 size_t size, bool write,
246 unsigned long ret_ip)
247{ 223{
248 if (unlikely(size == 0)) 224 u8 tag;
249 return; 225 unsigned long i;
250 226
251 if (unlikely((void *)addr < 227 if (unlikely(PageHighMem(page)))
252 kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
253 kasan_report(addr, size, write, ret_ip);
254 return; 228 return;
255 }
256 229
257 if (likely(!memory_is_poisoned(addr, size))) 230 tag = random_tag();
258 return; 231 for (i = 0; i < (1 << order); i++)
259 232 page_kasan_tag_set(page + i, tag);
260 kasan_report(addr, size, write, ret_ip); 233 kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
261}
262
263static void check_memory_region(unsigned long addr,
264 size_t size, bool write,
265 unsigned long ret_ip)
266{
267 check_memory_region_inline(addr, size, write, ret_ip);
268}
269
270void kasan_check_read(const volatile void *p, unsigned int size)
271{
272 check_memory_region((unsigned long)p, size, false, _RET_IP_);
273}
274EXPORT_SYMBOL(kasan_check_read);
275
276void kasan_check_write(const volatile void *p, unsigned int size)
277{
278 check_memory_region((unsigned long)p, size, true, _RET_IP_);
279}
280EXPORT_SYMBOL(kasan_check_write);
281
282#undef memset
283void *memset(void *addr, int c, size_t len)
284{
285 check_memory_region((unsigned long)addr, len, true, _RET_IP_);
286
287 return __memset(addr, c, len);
288}
289
290#undef memmove
291void *memmove(void *dest, const void *src, size_t len)
292{
293 check_memory_region((unsigned long)src, len, false, _RET_IP_);
294 check_memory_region((unsigned long)dest, len, true, _RET_IP_);
295
296 return __memmove(dest, src, len);
297}
298
299#undef memcpy
300void *memcpy(void *dest, const void *src, size_t len)
301{
302 check_memory_region((unsigned long)src, len, false, _RET_IP_);
303 check_memory_region((unsigned long)dest, len, true, _RET_IP_);
304
305 return __memcpy(dest, src, len);
306}
307
308void kasan_alloc_pages(struct page *page, unsigned int order)
309{
310 if (likely(!PageHighMem(page)))
311 kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
312} 234}
313 235
314void kasan_free_pages(struct page *page, unsigned int order) 236void kasan_free_pages(struct page *page, unsigned int order)
@@ -323,8 +245,11 @@ void kasan_free_pages(struct page *page, unsigned int order)
323 * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. 245 * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
324 * For larger allocations larger redzones are used. 246 * For larger allocations larger redzones are used.
325 */ 247 */
326static unsigned int optimal_redzone(unsigned int object_size) 248static inline unsigned int optimal_redzone(unsigned int object_size)
327{ 249{
250 if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
251 return 0;
252
328 return 253 return
329 object_size <= 64 - 16 ? 16 : 254 object_size <= 64 - 16 ? 16 :
330 object_size <= 128 - 32 ? 32 : 255 object_size <= 128 - 32 ? 32 :
@@ -339,6 +264,7 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
339 slab_flags_t *flags) 264 slab_flags_t *flags)
340{ 265{
341 unsigned int orig_size = *size; 266 unsigned int orig_size = *size;
267 unsigned int redzone_size;
342 int redzone_adjust; 268 int redzone_adjust;
343 269
344 /* Add alloc meta. */ 270 /* Add alloc meta. */
@@ -346,20 +272,20 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
346 *size += sizeof(struct kasan_alloc_meta); 272 *size += sizeof(struct kasan_alloc_meta);
347 273
348 /* Add free meta. */ 274 /* Add free meta. */
349 if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || 275 if (IS_ENABLED(CONFIG_KASAN_GENERIC) &&
350 cache->object_size < sizeof(struct kasan_free_meta)) { 276 (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
277 cache->object_size < sizeof(struct kasan_free_meta))) {
351 cache->kasan_info.free_meta_offset = *size; 278 cache->kasan_info.free_meta_offset = *size;
352 *size += sizeof(struct kasan_free_meta); 279 *size += sizeof(struct kasan_free_meta);
353 } 280 }
354 redzone_adjust = optimal_redzone(cache->object_size) -
355 (*size - cache->object_size);
356 281
282 redzone_size = optimal_redzone(cache->object_size);
283 redzone_adjust = redzone_size - (*size - cache->object_size);
357 if (redzone_adjust > 0) 284 if (redzone_adjust > 0)
358 *size += redzone_adjust; 285 *size += redzone_adjust;
359 286
360 *size = min_t(unsigned int, KMALLOC_MAX_SIZE, 287 *size = min_t(unsigned int, KMALLOC_MAX_SIZE,
361 max(*size, cache->object_size + 288 max(*size, cache->object_size + redzone_size));
362 optimal_redzone(cache->object_size)));
363 289
364 /* 290 /*
365 * If the metadata doesn't fit, don't enable KASAN at all. 291 * If the metadata doesn't fit, don't enable KASAN at all.
@@ -372,30 +298,39 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
372 return; 298 return;
373 } 299 }
374 300
301 cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE);
302
375 *flags |= SLAB_KASAN; 303 *flags |= SLAB_KASAN;
376} 304}
377 305
378void kasan_cache_shrink(struct kmem_cache *cache) 306size_t kasan_metadata_size(struct kmem_cache *cache)
379{ 307{
380 quarantine_remove_cache(cache); 308 return (cache->kasan_info.alloc_meta_offset ?
309 sizeof(struct kasan_alloc_meta) : 0) +
310 (cache->kasan_info.free_meta_offset ?
311 sizeof(struct kasan_free_meta) : 0);
381} 312}
382 313
383void kasan_cache_shutdown(struct kmem_cache *cache) 314struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
315 const void *object)
384{ 316{
385 if (!__kmem_cache_empty(cache)) 317 BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
386 quarantine_remove_cache(cache); 318 return (void *)object + cache->kasan_info.alloc_meta_offset;
387} 319}
388 320
389size_t kasan_metadata_size(struct kmem_cache *cache) 321struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
322 const void *object)
390{ 323{
391 return (cache->kasan_info.alloc_meta_offset ? 324 BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
392 sizeof(struct kasan_alloc_meta) : 0) + 325 return (void *)object + cache->kasan_info.free_meta_offset;
393 (cache->kasan_info.free_meta_offset ?
394 sizeof(struct kasan_free_meta) : 0);
395} 326}
396 327
397void kasan_poison_slab(struct page *page) 328void kasan_poison_slab(struct page *page)
398{ 329{
330 unsigned long i;
331
332 for (i = 0; i < (1 << compound_order(page)); i++)
333 page_kasan_tag_reset(page + i);
399 kasan_poison_shadow(page_address(page), 334 kasan_poison_shadow(page_address(page),
400 PAGE_SIZE << compound_order(page), 335 PAGE_SIZE << compound_order(page),
401 KASAN_KMALLOC_REDZONE); 336 KASAN_KMALLOC_REDZONE);
@@ -413,92 +348,79 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
413 KASAN_KMALLOC_REDZONE); 348 KASAN_KMALLOC_REDZONE);
414} 349}
415 350
416static inline int in_irqentry_text(unsigned long ptr) 351/*
 417{ 352 * Since it's desirable to only call object constructors once during slab
418 return (ptr >= (unsigned long)&__irqentry_text_start && 353 * allocation, we preassign tags to all such objects. Also preassign tags for
419 ptr < (unsigned long)&__irqentry_text_end) || 354 * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports.
420 (ptr >= (unsigned long)&__softirqentry_text_start && 355 * For SLAB allocator we can't preassign tags randomly since the freelist is
421 ptr < (unsigned long)&__softirqentry_text_end); 356 * stored as an array of indexes instead of a linked list. Assign tags based
422} 357 * on objects indexes, so that objects that are next to each other get
423 358 * different tags.
424static inline void filter_irq_stacks(struct stack_trace *trace) 359 * After a tag is assigned, the object always gets allocated with the same tag.
360 * The reason is that we can't change tags for objects with constructors on
361 * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor
362 * code can save the pointer to the object somewhere (e.g. in the object
363 * itself). Then if we retag it, the old saved pointer will become invalid.
364 */
365static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new)
425{ 366{
426 int i; 367 if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
368 return new ? KASAN_TAG_KERNEL : random_tag();
427 369
428 if (!trace->nr_entries) 370#ifdef CONFIG_SLAB
429 return; 371 return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
430 for (i = 0; i < trace->nr_entries; i++) 372#else
431 if (in_irqentry_text(trace->entries[i])) { 373 return new ? random_tag() : get_tag(object);
432 /* Include the irqentry function into the stack. */ 374#endif
433 trace->nr_entries = i + 1;
434 break;
435 }
436} 375}
437 376
438static inline depot_stack_handle_t save_stack(gfp_t flags) 377void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
378 const void *object)
439{ 379{
440 unsigned long entries[KASAN_STACK_DEPTH]; 380 struct kasan_alloc_meta *alloc_info;
441 struct stack_trace trace = {
442 .nr_entries = 0,
443 .entries = entries,
444 .max_entries = KASAN_STACK_DEPTH,
445 .skip = 0
446 };
447
448 save_stack_trace(&trace);
449 filter_irq_stacks(&trace);
450 if (trace.nr_entries != 0 &&
451 trace.entries[trace.nr_entries-1] == ULONG_MAX)
452 trace.nr_entries--;
453 381
454 return depot_save_stack(&trace, flags); 382 if (!(cache->flags & SLAB_KASAN))
455} 383 return (void *)object;
456 384
457static inline void set_track(struct kasan_track *track, gfp_t flags) 385 alloc_info = get_alloc_info(cache, object);
458{ 386 __memset(alloc_info, 0, sizeof(*alloc_info));
459 track->pid = current->pid;
460 track->stack = save_stack(flags);
461}
462 387
463struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, 388 if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
464 const void *object) 389 object = set_tag(object, assign_tag(cache, object, true));
465{
466 BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
467 return (void *)object + cache->kasan_info.alloc_meta_offset;
468}
469 390
470struct kasan_free_meta *get_free_info(struct kmem_cache *cache, 391 return (void *)object;
471 const void *object)
472{
473 BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
474 return (void *)object + cache->kasan_info.free_meta_offset;
475} 392}
476 393
477void kasan_init_slab_obj(struct kmem_cache *cache, const void *object) 394void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
395 gfp_t flags)
478{ 396{
479 struct kasan_alloc_meta *alloc_info; 397 return kasan_kmalloc(cache, object, cache->object_size, flags);
480
481 if (!(cache->flags & SLAB_KASAN))
482 return;
483
484 alloc_info = get_alloc_info(cache, object);
485 __memset(alloc_info, 0, sizeof(*alloc_info));
486} 398}
487 399
488void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) 400static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
489{ 401{
490 kasan_kmalloc(cache, object, cache->object_size, flags); 402 if (IS_ENABLED(CONFIG_KASAN_GENERIC))
403 return shadow_byte < 0 ||
404 shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
405 else
406 return tag != (u8)shadow_byte;
491} 407}
492 408
493static bool __kasan_slab_free(struct kmem_cache *cache, void *object, 409static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
494 unsigned long ip, bool quarantine) 410 unsigned long ip, bool quarantine)
495{ 411{
496 s8 shadow_byte; 412 s8 shadow_byte;
413 u8 tag;
414 void *tagged_object;
497 unsigned long rounded_up_size; 415 unsigned long rounded_up_size;
498 416
417 tag = get_tag(object);
418 tagged_object = object;
419 object = reset_tag(object);
420
499 if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != 421 if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
500 object)) { 422 object)) {
501 kasan_report_invalid_free(object, ip); 423 kasan_report_invalid_free(tagged_object, ip);
502 return true; 424 return true;
503 } 425 }
504 426
@@ -507,20 +429,22 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
507 return false; 429 return false;
508 430
509 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); 431 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
510 if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { 432 if (shadow_invalid(tag, shadow_byte)) {
511 kasan_report_invalid_free(object, ip); 433 kasan_report_invalid_free(tagged_object, ip);
512 return true; 434 return true;
513 } 435 }
514 436
515 rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE); 437 rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE);
516 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); 438 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
517 439
518 if (!quarantine || unlikely(!(cache->flags & SLAB_KASAN))) 440 if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) ||
441 unlikely(!(cache->flags & SLAB_KASAN)))
519 return false; 442 return false;
520 443
521 set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); 444 set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT);
522 quarantine_put(get_free_info(cache, object), cache); 445 quarantine_put(get_free_info(cache, object), cache);
523 return true; 446
447 return IS_ENABLED(CONFIG_KASAN_GENERIC);
524} 448}
525 449
526bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) 450bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
@@ -528,33 +452,41 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
528 return __kasan_slab_free(cache, object, ip, true); 452 return __kasan_slab_free(cache, object, ip, true);
529} 453}
530 454
531void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, 455void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
532 gfp_t flags) 456 size_t size, gfp_t flags)
533{ 457{
534 unsigned long redzone_start; 458 unsigned long redzone_start;
535 unsigned long redzone_end; 459 unsigned long redzone_end;
460 u8 tag;
536 461
537 if (gfpflags_allow_blocking(flags)) 462 if (gfpflags_allow_blocking(flags))
538 quarantine_reduce(); 463 quarantine_reduce();
539 464
540 if (unlikely(object == NULL)) 465 if (unlikely(object == NULL))
541 return; 466 return NULL;
542 467
543 redzone_start = round_up((unsigned long)(object + size), 468 redzone_start = round_up((unsigned long)(object + size),
544 KASAN_SHADOW_SCALE_SIZE); 469 KASAN_SHADOW_SCALE_SIZE);
545 redzone_end = round_up((unsigned long)object + cache->object_size, 470 redzone_end = round_up((unsigned long)object + cache->object_size,
546 KASAN_SHADOW_SCALE_SIZE); 471 KASAN_SHADOW_SCALE_SIZE);
547 472
548 kasan_unpoison_shadow(object, size); 473 if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
474 tag = assign_tag(cache, object, false);
475
476 /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
477 kasan_unpoison_shadow(set_tag(object, tag), size);
549 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, 478 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
550 KASAN_KMALLOC_REDZONE); 479 KASAN_KMALLOC_REDZONE);
551 480
552 if (cache->flags & SLAB_KASAN) 481 if (cache->flags & SLAB_KASAN)
553 set_track(&get_alloc_info(cache, object)->alloc_track, flags); 482 set_track(&get_alloc_info(cache, object)->alloc_track, flags);
483
484 return set_tag(object, tag);
554} 485}
555EXPORT_SYMBOL(kasan_kmalloc); 486EXPORT_SYMBOL(kasan_kmalloc);
556 487
557void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) 488void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
489 gfp_t flags)
558{ 490{
559 struct page *page; 491 struct page *page;
560 unsigned long redzone_start; 492 unsigned long redzone_start;
@@ -564,7 +496,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
564 quarantine_reduce(); 496 quarantine_reduce();
565 497
566 if (unlikely(ptr == NULL)) 498 if (unlikely(ptr == NULL))
567 return; 499 return NULL;
568 500
569 page = virt_to_page(ptr); 501 page = virt_to_page(ptr);
570 redzone_start = round_up((unsigned long)(ptr + size), 502 redzone_start = round_up((unsigned long)(ptr + size),
@@ -574,21 +506,23 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
574 kasan_unpoison_shadow(ptr, size); 506 kasan_unpoison_shadow(ptr, size);
575 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, 507 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
576 KASAN_PAGE_REDZONE); 508 KASAN_PAGE_REDZONE);
509
510 return (void *)ptr;
577} 511}
578 512
579void kasan_krealloc(const void *object, size_t size, gfp_t flags) 513void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
580{ 514{
581 struct page *page; 515 struct page *page;
582 516
583 if (unlikely(object == ZERO_SIZE_PTR)) 517 if (unlikely(object == ZERO_SIZE_PTR))
584 return; 518 return (void *)object;
585 519
586 page = virt_to_head_page(object); 520 page = virt_to_head_page(object);
587 521
588 if (unlikely(!PageSlab(page))) 522 if (unlikely(!PageSlab(page)))
589 kasan_kmalloc_large(object, size, flags); 523 return kasan_kmalloc_large(object, size, flags);
590 else 524 else
591 kasan_kmalloc(page->slab_cache, object, size, flags); 525 return kasan_kmalloc(page->slab_cache, object, size, flags);
592} 526}
593 527
594void kasan_poison_kfree(void *ptr, unsigned long ip) 528void kasan_poison_kfree(void *ptr, unsigned long ip)
@@ -632,11 +566,12 @@ int kasan_module_alloc(void *addr, size_t size)
632 566
633 ret = __vmalloc_node_range(shadow_size, 1, shadow_start, 567 ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
634 shadow_start + shadow_size, 568 shadow_start + shadow_size,
635 GFP_KERNEL | __GFP_ZERO, 569 GFP_KERNEL,
636 PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, 570 PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
637 __builtin_return_address(0)); 571 __builtin_return_address(0));
638 572
639 if (ret) { 573 if (ret) {
574 __memset(ret, KASAN_SHADOW_INIT, shadow_size);
640 find_vm_area(addr)->flags |= VM_KASAN; 575 find_vm_area(addr)->flags |= VM_KASAN;
641 kmemleak_ignore(ret); 576 kmemleak_ignore(ret);
642 return 0; 577 return 0;
@@ -651,147 +586,6 @@ void kasan_free_shadow(const struct vm_struct *vm)
651 vfree(kasan_mem_to_shadow(vm->addr)); 586 vfree(kasan_mem_to_shadow(vm->addr));
652} 587}
653 588
654static void register_global(struct kasan_global *global)
655{
656 size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
657
658 kasan_unpoison_shadow(global->beg, global->size);
659
660 kasan_poison_shadow(global->beg + aligned_size,
661 global->size_with_redzone - aligned_size,
662 KASAN_GLOBAL_REDZONE);
663}
664
665void __asan_register_globals(struct kasan_global *globals, size_t size)
666{
667 int i;
668
669 for (i = 0; i < size; i++)
670 register_global(&globals[i]);
671}
672EXPORT_SYMBOL(__asan_register_globals);
673
674void __asan_unregister_globals(struct kasan_global *globals, size_t size)
675{
676}
677EXPORT_SYMBOL(__asan_unregister_globals);
678
679#define DEFINE_ASAN_LOAD_STORE(size) \
680 void __asan_load##size(unsigned long addr) \
681 { \
682 check_memory_region_inline(addr, size, false, _RET_IP_);\
683 } \
684 EXPORT_SYMBOL(__asan_load##size); \
685 __alias(__asan_load##size) \
686 void __asan_load##size##_noabort(unsigned long); \
687 EXPORT_SYMBOL(__asan_load##size##_noabort); \
688 void __asan_store##size(unsigned long addr) \
689 { \
690 check_memory_region_inline(addr, size, true, _RET_IP_); \
691 } \
692 EXPORT_SYMBOL(__asan_store##size); \
693 __alias(__asan_store##size) \
694 void __asan_store##size##_noabort(unsigned long); \
695 EXPORT_SYMBOL(__asan_store##size##_noabort)
696
697DEFINE_ASAN_LOAD_STORE(1);
698DEFINE_ASAN_LOAD_STORE(2);
699DEFINE_ASAN_LOAD_STORE(4);
700DEFINE_ASAN_LOAD_STORE(8);
701DEFINE_ASAN_LOAD_STORE(16);
702
703void __asan_loadN(unsigned long addr, size_t size)
704{
705 check_memory_region(addr, size, false, _RET_IP_);
706}
707EXPORT_SYMBOL(__asan_loadN);
708
709__alias(__asan_loadN)
710void __asan_loadN_noabort(unsigned long, size_t);
711EXPORT_SYMBOL(__asan_loadN_noabort);
712
713void __asan_storeN(unsigned long addr, size_t size)
714{
715 check_memory_region(addr, size, true, _RET_IP_);
716}
717EXPORT_SYMBOL(__asan_storeN);
718
719__alias(__asan_storeN)
720void __asan_storeN_noabort(unsigned long, size_t);
721EXPORT_SYMBOL(__asan_storeN_noabort);
722
723/* to shut up compiler complaints */
724void __asan_handle_no_return(void) {}
725EXPORT_SYMBOL(__asan_handle_no_return);
726
727/* Emitted by compiler to poison large objects when they go out of scope. */
728void __asan_poison_stack_memory(const void *addr, size_t size)
729{
730 /*
731 * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded
732 * by redzones, so we simply round up size to simplify logic.
733 */
734 kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE),
735 KASAN_USE_AFTER_SCOPE);
736}
737EXPORT_SYMBOL(__asan_poison_stack_memory);
738
739/* Emitted by compiler to unpoison large objects when they go into scope. */
740void __asan_unpoison_stack_memory(const void *addr, size_t size)
741{
742 kasan_unpoison_shadow(addr, size);
743}
744EXPORT_SYMBOL(__asan_unpoison_stack_memory);
745
746/* Emitted by compiler to poison alloca()ed objects. */
747void __asan_alloca_poison(unsigned long addr, size_t size)
748{
749 size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
750 size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
751 rounded_up_size;
752 size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
753
754 const void *left_redzone = (const void *)(addr -
755 KASAN_ALLOCA_REDZONE_SIZE);
756 const void *right_redzone = (const void *)(addr + rounded_up_size);
757
758 WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
759
760 kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
761 size - rounded_down_size);
762 kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
763 KASAN_ALLOCA_LEFT);
764 kasan_poison_shadow(right_redzone,
765 padding_size + KASAN_ALLOCA_REDZONE_SIZE,
766 KASAN_ALLOCA_RIGHT);
767}
768EXPORT_SYMBOL(__asan_alloca_poison);
769
770/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
771void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
772{
773 if (unlikely(!stack_top || stack_top > stack_bottom))
774 return;
775
776 kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
777}
778EXPORT_SYMBOL(__asan_allocas_unpoison);
779
780/* Emitted by the compiler to [un]poison local variables. */
781#define DEFINE_ASAN_SET_SHADOW(byte) \
782 void __asan_set_shadow_##byte(const void *addr, size_t size) \
783 { \
784 __memset((void *)addr, 0x##byte, size); \
785 } \
786 EXPORT_SYMBOL(__asan_set_shadow_##byte)
787
788DEFINE_ASAN_SET_SHADOW(00);
789DEFINE_ASAN_SET_SHADOW(f1);
790DEFINE_ASAN_SET_SHADOW(f2);
791DEFINE_ASAN_SET_SHADOW(f3);
792DEFINE_ASAN_SET_SHADOW(f5);
793DEFINE_ASAN_SET_SHADOW(f8);
794
795#ifdef CONFIG_MEMORY_HOTPLUG 589#ifdef CONFIG_MEMORY_HOTPLUG
796static bool shadow_mapped(unsigned long addr) 590static bool shadow_mapped(unsigned long addr)
797{ 591{
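The common.c changes above make the free-time shadow check mode-aware: generic KASAN treats negative shadow bytes and bytes >= KASAN_SHADOW_SCALE_SIZE as an invalid free, while software tag-based KASAN compares the shadow byte against the pointer tag. A standalone sketch of that shadow_invalid() logic follows; KASAN_SHADOW_SCALE_SIZE is assumed to be 8 and the sw_tags argument stands in for IS_ENABLED(CONFIG_KASAN_SW_TAGS).

/* Userspace model of the shadow_invalid() check introduced above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define KASAN_SHADOW_SCALE_SIZE 8

static bool shadow_invalid(bool sw_tags, uint8_t tag, int8_t shadow_byte)
{
	if (!sw_tags)
		/* Generic mode: negative bytes are poison markers and bytes
		 * >= 8 are not valid partial sizes, so both reject the free. */
		return shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
	/* Tag-based mode: the shadow byte must match the pointer's tag. */
	return tag != (uint8_t)shadow_byte;
}

int main(void)
{
	printf("%d\n", shadow_invalid(false, 0, -1));		  /* 1: poisoned, generic */
	printf("%d\n", shadow_invalid(true, 0xab, (int8_t)0xab)); /* 0: tag matches */
	printf("%d\n", shadow_invalid(true, 0xab, (int8_t)0xcd)); /* 1: tag mismatch */
	return 0;
}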
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
new file mode 100644
index 000000000000..ccb6207276e3
--- /dev/null
+++ b/mm/kasan/generic.c
@@ -0,0 +1,344 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * This file contains core generic KASAN code.
4 *
5 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
6 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
7 *
8 * Some code borrowed from https://github.com/xairy/kasan-prototype by
9 * Andrey Konovalov <andreyknvl@gmail.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 *
15 */
16
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18#define DISABLE_BRANCH_PROFILING
19
20#include <linux/export.h>
21#include <linux/interrupt.h>
22#include <linux/init.h>
23#include <linux/kasan.h>
24#include <linux/kernel.h>
25#include <linux/kmemleak.h>
26#include <linux/linkage.h>
27#include <linux/memblock.h>
28#include <linux/memory.h>
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/printk.h>
32#include <linux/sched.h>
33#include <linux/sched/task_stack.h>
34#include <linux/slab.h>
35#include <linux/stacktrace.h>
36#include <linux/string.h>
37#include <linux/types.h>
38#include <linux/vmalloc.h>
39#include <linux/bug.h>
40
41#include "kasan.h"
42#include "../slab.h"
43
44/*
45 * All functions below always inlined so compiler could
 46 * perform better optimizations in each of __asan_loadX/__asan_storeX
47 * depending on memory access size X.
48 */
49
50static __always_inline bool memory_is_poisoned_1(unsigned long addr)
51{
52 s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
53
54 if (unlikely(shadow_value)) {
55 s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
56 return unlikely(last_accessible_byte >= shadow_value);
57 }
58
59 return false;
60}
61
62static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
63 unsigned long size)
64{
65 u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
66
67 /*
68 * Access crosses 8(shadow size)-byte boundary. Such access maps
69 * into 2 shadow bytes, so we need to check them both.
70 */
71 if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
72 return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
73
74 return memory_is_poisoned_1(addr + size - 1);
75}
76
77static __always_inline bool memory_is_poisoned_16(unsigned long addr)
78{
79 u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
80
81 /* Unaligned 16-bytes access maps into 3 shadow bytes. */
82 if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
83 return *shadow_addr || memory_is_poisoned_1(addr + 15);
84
85 return *shadow_addr;
86}
87
88static __always_inline unsigned long bytes_is_nonzero(const u8 *start,
89 size_t size)
90{
91 while (size) {
92 if (unlikely(*start))
93 return (unsigned long)start;
94 start++;
95 size--;
96 }
97
98 return 0;
99}
100
101static __always_inline unsigned long memory_is_nonzero(const void *start,
102 const void *end)
103{
104 unsigned int words;
105 unsigned long ret;
106 unsigned int prefix = (unsigned long)start % 8;
107
108 if (end - start <= 16)
109 return bytes_is_nonzero(start, end - start);
110
111 if (prefix) {
112 prefix = 8 - prefix;
113 ret = bytes_is_nonzero(start, prefix);
114 if (unlikely(ret))
115 return ret;
116 start += prefix;
117 }
118
119 words = (end - start) / 8;
120 while (words) {
121 if (unlikely(*(u64 *)start))
122 return bytes_is_nonzero(start, 8);
123 start += 8;
124 words--;
125 }
126
127 return bytes_is_nonzero(start, (end - start) % 8);
128}
129
130static __always_inline bool memory_is_poisoned_n(unsigned long addr,
131 size_t size)
132{
133 unsigned long ret;
134
135 ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
136 kasan_mem_to_shadow((void *)addr + size - 1) + 1);
137
138 if (unlikely(ret)) {
139 unsigned long last_byte = addr + size - 1;
140 s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
141
142 if (unlikely(ret != (unsigned long)last_shadow ||
143 ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
144 return true;
145 }
146 return false;
147}
148
149static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
150{
151 if (__builtin_constant_p(size)) {
152 switch (size) {
153 case 1:
154 return memory_is_poisoned_1(addr);
155 case 2:
156 case 4:
157 case 8:
158 return memory_is_poisoned_2_4_8(addr, size);
159 case 16:
160 return memory_is_poisoned_16(addr);
161 default:
162 BUILD_BUG();
163 }
164 }
165
166 return memory_is_poisoned_n(addr, size);
167}
168
169static __always_inline void check_memory_region_inline(unsigned long addr,
170 size_t size, bool write,
171 unsigned long ret_ip)
172{
173 if (unlikely(size == 0))
174 return;
175
176 if (unlikely((void *)addr <
177 kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
178 kasan_report(addr, size, write, ret_ip);
179 return;
180 }
181
182 if (likely(!memory_is_poisoned(addr, size)))
183 return;
184
185 kasan_report(addr, size, write, ret_ip);
186}
187
188void check_memory_region(unsigned long addr, size_t size, bool write,
189 unsigned long ret_ip)
190{
191 check_memory_region_inline(addr, size, write, ret_ip);
192}
193
194void kasan_cache_shrink(struct kmem_cache *cache)
195{
196 quarantine_remove_cache(cache);
197}
198
199void kasan_cache_shutdown(struct kmem_cache *cache)
200{
201 if (!__kmem_cache_empty(cache))
202 quarantine_remove_cache(cache);
203}
204
205static void register_global(struct kasan_global *global)
206{
207 size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
208
209 kasan_unpoison_shadow(global->beg, global->size);
210
211 kasan_poison_shadow(global->beg + aligned_size,
212 global->size_with_redzone - aligned_size,
213 KASAN_GLOBAL_REDZONE);
214}
215
216void __asan_register_globals(struct kasan_global *globals, size_t size)
217{
218 int i;
219
220 for (i = 0; i < size; i++)
221 register_global(&globals[i]);
222}
223EXPORT_SYMBOL(__asan_register_globals);
224
225void __asan_unregister_globals(struct kasan_global *globals, size_t size)
226{
227}
228EXPORT_SYMBOL(__asan_unregister_globals);
229
230#define DEFINE_ASAN_LOAD_STORE(size) \
231 void __asan_load##size(unsigned long addr) \
232 { \
233 check_memory_region_inline(addr, size, false, _RET_IP_);\
234 } \
235 EXPORT_SYMBOL(__asan_load##size); \
236 __alias(__asan_load##size) \
237 void __asan_load##size##_noabort(unsigned long); \
238 EXPORT_SYMBOL(__asan_load##size##_noabort); \
239 void __asan_store##size(unsigned long addr) \
240 { \
241 check_memory_region_inline(addr, size, true, _RET_IP_); \
242 } \
243 EXPORT_SYMBOL(__asan_store##size); \
244 __alias(__asan_store##size) \
245 void __asan_store##size##_noabort(unsigned long); \
246 EXPORT_SYMBOL(__asan_store##size##_noabort)
247
248DEFINE_ASAN_LOAD_STORE(1);
249DEFINE_ASAN_LOAD_STORE(2);
250DEFINE_ASAN_LOAD_STORE(4);
251DEFINE_ASAN_LOAD_STORE(8);
252DEFINE_ASAN_LOAD_STORE(16);
253
254void __asan_loadN(unsigned long addr, size_t size)
255{
256 check_memory_region(addr, size, false, _RET_IP_);
257}
258EXPORT_SYMBOL(__asan_loadN);
259
260__alias(__asan_loadN)
261void __asan_loadN_noabort(unsigned long, size_t);
262EXPORT_SYMBOL(__asan_loadN_noabort);
263
264void __asan_storeN(unsigned long addr, size_t size)
265{
266 check_memory_region(addr, size, true, _RET_IP_);
267}
268EXPORT_SYMBOL(__asan_storeN);
269
270__alias(__asan_storeN)
271void __asan_storeN_noabort(unsigned long, size_t);
272EXPORT_SYMBOL(__asan_storeN_noabort);
273
274/* to shut up compiler complaints */
275void __asan_handle_no_return(void) {}
276EXPORT_SYMBOL(__asan_handle_no_return);
277
278/* Emitted by compiler to poison large objects when they go out of scope. */
279void __asan_poison_stack_memory(const void *addr, size_t size)
280{
281 /*
282 * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded
283 * by redzones, so we simply round up size to simplify logic.
284 */
285 kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE),
286 KASAN_USE_AFTER_SCOPE);
287}
288EXPORT_SYMBOL(__asan_poison_stack_memory);
289
290/* Emitted by compiler to unpoison large objects when they go into scope. */
291void __asan_unpoison_stack_memory(const void *addr, size_t size)
292{
293 kasan_unpoison_shadow(addr, size);
294}
295EXPORT_SYMBOL(__asan_unpoison_stack_memory);
296
297/* Emitted by compiler to poison alloca()ed objects. */
298void __asan_alloca_poison(unsigned long addr, size_t size)
299{
300 size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
301 size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
302 rounded_up_size;
303 size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
304
305 const void *left_redzone = (const void *)(addr -
306 KASAN_ALLOCA_REDZONE_SIZE);
307 const void *right_redzone = (const void *)(addr + rounded_up_size);
308
309 WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
310
311 kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
312 size - rounded_down_size);
313 kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
314 KASAN_ALLOCA_LEFT);
315 kasan_poison_shadow(right_redzone,
316 padding_size + KASAN_ALLOCA_REDZONE_SIZE,
317 KASAN_ALLOCA_RIGHT);
318}
319EXPORT_SYMBOL(__asan_alloca_poison);
320
321/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
322void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
323{
324 if (unlikely(!stack_top || stack_top > stack_bottom))
325 return;
326
327 kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
328}
329EXPORT_SYMBOL(__asan_allocas_unpoison);
330
331/* Emitted by the compiler to [un]poison local variables. */
332#define DEFINE_ASAN_SET_SHADOW(byte) \
333 void __asan_set_shadow_##byte(const void *addr, size_t size) \
334 { \
335 __memset((void *)addr, 0x##byte, size); \
336 } \
337 EXPORT_SYMBOL(__asan_set_shadow_##byte)
338
339DEFINE_ASAN_SET_SHADOW(00);
340DEFINE_ASAN_SET_SHADOW(f1);
341DEFINE_ASAN_SET_SHADOW(f2);
342DEFINE_ASAN_SET_SHADOW(f3);
343DEFINE_ASAN_SET_SHADOW(f5);
344DEFINE_ASAN_SET_SHADOW(f8);
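In the generic mode kept in generic.c above, each shadow byte covers an 8-byte granule: 0 means fully addressable, 1..7 means only the first N bytes are valid, and negative values are poison. A compilable userspace model of the 1-byte check (memory_is_poisoned_1) follows; the shadow byte is passed in directly instead of being looked up, and KASAN_SHADOW_MASK is assumed to be 0x7.

/* Model of the generic-mode 1-byte shadow check from the file above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define KASAN_SHADOW_MASK 0x7UL

static bool memory_is_poisoned_1(unsigned long addr, int8_t shadow_value)
{
	if (shadow_value) {
		/* Offset of the accessed byte within its 8-byte granule. */
		int8_t last_accessible_byte = addr & KASAN_SHADOW_MASK;

		/* Positive shadow N: only bytes 0..N-1 are addressable.
		 * Negative shadow: the comparison is always true (poisoned). */
		return last_accessible_byte >= shadow_value;
	}
	return false;	/* 0: the whole granule is addressable */
}

int main(void)
{
	printf("%d\n", memory_is_poisoned_1(0x1000 + 3, 4));	/* 0: inside a 4-byte object */
	printf("%d\n", memory_is_poisoned_1(0x1000 + 5, 4));	/* 1: past the object's end */
	printf("%d\n", memory_is_poisoned_1(0x1000 + 0, -1));	/* 1: fully poisoned granule */
	return 0;
}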
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
new file mode 100644
index 000000000000..5e12035888f2
--- /dev/null
+++ b/mm/kasan/generic_report.c
@@ -0,0 +1,153 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * This file contains generic KASAN specific error reporting code.
4 *
5 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
6 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
7 *
8 * Some code borrowed from https://github.com/xairy/kasan-prototype by
9 * Andrey Konovalov <andreyknvl@gmail.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 *
15 */
16
17#include <linux/bitops.h>
18#include <linux/ftrace.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/printk.h>
23#include <linux/sched.h>
24#include <linux/slab.h>
25#include <linux/stackdepot.h>
26#include <linux/stacktrace.h>
27#include <linux/string.h>
28#include <linux/types.h>
29#include <linux/kasan.h>
30#include <linux/module.h>
31
32#include <asm/sections.h>
33
34#include "kasan.h"
35#include "../slab.h"
36
37void *find_first_bad_addr(void *addr, size_t size)
38{
39 void *p = addr;
40
41 while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
42 p += KASAN_SHADOW_SCALE_SIZE;
43 return p;
44}
45
46static const char *get_shadow_bug_type(struct kasan_access_info *info)
47{
48 const char *bug_type = "unknown-crash";
49 u8 *shadow_addr;
50
51 shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
52
53 /*
54 * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
55 * at the next shadow byte to determine the type of the bad access.
56 */
57 if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
58 shadow_addr++;
59
60 switch (*shadow_addr) {
61 case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
62 /*
63 * In theory it's still possible to see these shadow values
64 * due to a data race in the kernel code.
65 */
66 bug_type = "out-of-bounds";
67 break;
68 case KASAN_PAGE_REDZONE:
69 case KASAN_KMALLOC_REDZONE:
70 bug_type = "slab-out-of-bounds";
71 break;
72 case KASAN_GLOBAL_REDZONE:
73 bug_type = "global-out-of-bounds";
74 break;
75 case KASAN_STACK_LEFT:
76 case KASAN_STACK_MID:
77 case KASAN_STACK_RIGHT:
78 case KASAN_STACK_PARTIAL:
79 bug_type = "stack-out-of-bounds";
80 break;
81 case KASAN_FREE_PAGE:
82 case KASAN_KMALLOC_FREE:
83 bug_type = "use-after-free";
84 break;
85 case KASAN_USE_AFTER_SCOPE:
86 bug_type = "use-after-scope";
87 break;
88 case KASAN_ALLOCA_LEFT:
89 case KASAN_ALLOCA_RIGHT:
90 bug_type = "alloca-out-of-bounds";
91 break;
92 }
93
94 return bug_type;
95}
96
97static const char *get_wild_bug_type(struct kasan_access_info *info)
98{
99 const char *bug_type = "unknown-crash";
100
101 if ((unsigned long)info->access_addr < PAGE_SIZE)
102 bug_type = "null-ptr-deref";
103 else if ((unsigned long)info->access_addr < TASK_SIZE)
104 bug_type = "user-memory-access";
105 else
106 bug_type = "wild-memory-access";
107
108 return bug_type;
109}
110
111const char *get_bug_type(struct kasan_access_info *info)
112{
113 if (addr_has_shadow(info->access_addr))
114 return get_shadow_bug_type(info);
115 return get_wild_bug_type(info);
116}
117
118#define DEFINE_ASAN_REPORT_LOAD(size) \
119void __asan_report_load##size##_noabort(unsigned long addr) \
120{ \
121 kasan_report(addr, size, false, _RET_IP_); \
122} \
123EXPORT_SYMBOL(__asan_report_load##size##_noabort)
124
125#define DEFINE_ASAN_REPORT_STORE(size) \
126void __asan_report_store##size##_noabort(unsigned long addr) \
127{ \
128 kasan_report(addr, size, true, _RET_IP_); \
129} \
130EXPORT_SYMBOL(__asan_report_store##size##_noabort)
131
132DEFINE_ASAN_REPORT_LOAD(1);
133DEFINE_ASAN_REPORT_LOAD(2);
134DEFINE_ASAN_REPORT_LOAD(4);
135DEFINE_ASAN_REPORT_LOAD(8);
136DEFINE_ASAN_REPORT_LOAD(16);
137DEFINE_ASAN_REPORT_STORE(1);
138DEFINE_ASAN_REPORT_STORE(2);
139DEFINE_ASAN_REPORT_STORE(4);
140DEFINE_ASAN_REPORT_STORE(8);
141DEFINE_ASAN_REPORT_STORE(16);
142
143void __asan_report_load_n_noabort(unsigned long addr, size_t size)
144{
145 kasan_report(addr, size, false, _RET_IP_);
146}
147EXPORT_SYMBOL(__asan_report_load_n_noabort);
148
149void __asan_report_store_n_noabort(unsigned long addr, size_t size)
150{
151 kasan_report(addr, size, true, _RET_IP_);
152}
153EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/init.c
index c7550eb65922..34afad56497b 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/init.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * This file contains some kasan initialization code. 3 * This file contains some kasan initialization code.
3 * 4 *
@@ -30,13 +31,13 @@
 30 * - Later it is reused as zero shadow to cover large ranges of memory 31 * - Later it is reused as zero shadow to cover large ranges of memory
 31 * that are allowed to be accessed, but not handled by kasan (vmalloc/vmemmap ...). 32 * that are allowed to be accessed, but not handled by kasan (vmalloc/vmemmap ...).
32 */ 33 */
33unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; 34unsigned char kasan_early_shadow_page[PAGE_SIZE] __page_aligned_bss;
34 35
35#if CONFIG_PGTABLE_LEVELS > 4 36#if CONFIG_PGTABLE_LEVELS > 4
36p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; 37p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
37static inline bool kasan_p4d_table(pgd_t pgd) 38static inline bool kasan_p4d_table(pgd_t pgd)
38{ 39{
39 return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d)); 40 return pgd_page(pgd) == virt_to_page(lm_alias(kasan_early_shadow_p4d));
40} 41}
41#else 42#else
42static inline bool kasan_p4d_table(pgd_t pgd) 43static inline bool kasan_p4d_table(pgd_t pgd)
@@ -45,10 +46,10 @@ static inline bool kasan_p4d_table(pgd_t pgd)
45} 46}
46#endif 47#endif
47#if CONFIG_PGTABLE_LEVELS > 3 48#if CONFIG_PGTABLE_LEVELS > 3
48pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; 49pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
49static inline bool kasan_pud_table(p4d_t p4d) 50static inline bool kasan_pud_table(p4d_t p4d)
50{ 51{
51 return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud)); 52 return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
52} 53}
53#else 54#else
54static inline bool kasan_pud_table(p4d_t p4d) 55static inline bool kasan_pud_table(p4d_t p4d)
@@ -57,10 +58,10 @@ static inline bool kasan_pud_table(p4d_t p4d)
57} 58}
58#endif 59#endif
59#if CONFIG_PGTABLE_LEVELS > 2 60#if CONFIG_PGTABLE_LEVELS > 2
60pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; 61pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
61static inline bool kasan_pmd_table(pud_t pud) 62static inline bool kasan_pmd_table(pud_t pud)
62{ 63{
63 return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd)); 64 return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
64} 65}
65#else 66#else
66static inline bool kasan_pmd_table(pud_t pud) 67static inline bool kasan_pmd_table(pud_t pud)
@@ -68,16 +69,16 @@ static inline bool kasan_pmd_table(pud_t pud)
68 return 0; 69 return 0;
69} 70}
70#endif 71#endif
71pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; 72pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss;
72 73
73static inline bool kasan_pte_table(pmd_t pmd) 74static inline bool kasan_pte_table(pmd_t pmd)
74{ 75{
75 return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte)); 76 return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte));
76} 77}
77 78
78static inline bool kasan_zero_page_entry(pte_t pte) 79static inline bool kasan_early_shadow_page_entry(pte_t pte)
79{ 80{
80 return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page)); 81 return pte_page(pte) == virt_to_page(lm_alias(kasan_early_shadow_page));
81} 82}
82 83
83static __init void *early_alloc(size_t size, int node) 84static __init void *early_alloc(size_t size, int node)
@@ -92,7 +93,8 @@ static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr,
92 pte_t *pte = pte_offset_kernel(pmd, addr); 93 pte_t *pte = pte_offset_kernel(pmd, addr);
93 pte_t zero_pte; 94 pte_t zero_pte;
94 95
95 zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_zero_page)), PAGE_KERNEL); 96 zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_early_shadow_page)),
97 PAGE_KERNEL);
96 zero_pte = pte_wrprotect(zero_pte); 98 zero_pte = pte_wrprotect(zero_pte);
97 99
98 while (addr + PAGE_SIZE <= end) { 100 while (addr + PAGE_SIZE <= end) {
@@ -112,7 +114,8 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
112 next = pmd_addr_end(addr, end); 114 next = pmd_addr_end(addr, end);
113 115
114 if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { 116 if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) {
115 pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); 117 pmd_populate_kernel(&init_mm, pmd,
118 lm_alias(kasan_early_shadow_pte));
116 continue; 119 continue;
117 } 120 }
118 121
@@ -145,9 +148,11 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
145 if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { 148 if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) {
146 pmd_t *pmd; 149 pmd_t *pmd;
147 150
148 pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); 151 pud_populate(&init_mm, pud,
152 lm_alias(kasan_early_shadow_pmd));
149 pmd = pmd_offset(pud, addr); 153 pmd = pmd_offset(pud, addr);
150 pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); 154 pmd_populate_kernel(&init_mm, pmd,
155 lm_alias(kasan_early_shadow_pte));
151 continue; 156 continue;
152 } 157 }
153 158
@@ -181,12 +186,14 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
181 pud_t *pud; 186 pud_t *pud;
182 pmd_t *pmd; 187 pmd_t *pmd;
183 188
184 p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); 189 p4d_populate(&init_mm, p4d,
190 lm_alias(kasan_early_shadow_pud));
185 pud = pud_offset(p4d, addr); 191 pud = pud_offset(p4d, addr);
186 pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); 192 pud_populate(&init_mm, pud,
193 lm_alias(kasan_early_shadow_pmd));
187 pmd = pmd_offset(pud, addr); 194 pmd = pmd_offset(pud, addr);
188 pmd_populate_kernel(&init_mm, pmd, 195 pmd_populate_kernel(&init_mm, pmd,
189 lm_alias(kasan_zero_pte)); 196 lm_alias(kasan_early_shadow_pte));
190 continue; 197 continue;
191 } 198 }
192 199
@@ -209,13 +216,13 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
209} 216}
210 217
211/** 218/**
212 * kasan_populate_zero_shadow - populate shadow memory region with 219 * kasan_populate_early_shadow - populate shadow memory region with
213 * kasan_zero_page 220 * kasan_early_shadow_page
214 * @shadow_start - start of the memory range to populate 221 * @shadow_start - start of the memory range to populate
215 * @shadow_end - end of the memory range to populate 222 * @shadow_end - end of the memory range to populate
216 */ 223 */
217int __ref kasan_populate_zero_shadow(const void *shadow_start, 224int __ref kasan_populate_early_shadow(const void *shadow_start,
218 const void *shadow_end) 225 const void *shadow_end)
219{ 226{
220 unsigned long addr = (unsigned long)shadow_start; 227 unsigned long addr = (unsigned long)shadow_start;
221 unsigned long end = (unsigned long)shadow_end; 228 unsigned long end = (unsigned long)shadow_end;
@@ -231,7 +238,7 @@ int __ref kasan_populate_zero_shadow(const void *shadow_start,
231 pmd_t *pmd; 238 pmd_t *pmd;
232 239
233 /* 240 /*
234 * kasan_zero_pud should be populated with pmds 241 * kasan_early_shadow_pud should be populated with pmds
235 * at this moment. 242 * at this moment.
236 * [pud,pmd]_populate*() below needed only for 243 * [pud,pmd]_populate*() below needed only for
237 * 3,2 - level page tables where we don't have 244 * 3,2 - level page tables where we don't have
@@ -241,21 +248,25 @@ int __ref kasan_populate_zero_shadow(const void *shadow_start,
241 * The ifndef is required to avoid build breakage. 248 * The ifndef is required to avoid build breakage.
242 * 249 *
243 * With 5level-fixup.h, pgd_populate() is not nop and 250 * With 5level-fixup.h, pgd_populate() is not nop and
244 * we reference kasan_zero_p4d. It's not defined 251 * we reference kasan_early_shadow_p4d. It's not defined
245 * unless 5-level paging enabled. 252 * unless 5-level paging enabled.
246 * 253 *
247 * The ifndef can be dropped once all KASAN-enabled 254 * The ifndef can be dropped once all KASAN-enabled
248 * architectures will switch to pgtable-nop4d.h. 255 * architectures will switch to pgtable-nop4d.h.
249 */ 256 */
250#ifndef __ARCH_HAS_5LEVEL_HACK 257#ifndef __ARCH_HAS_5LEVEL_HACK
251 pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_p4d)); 258 pgd_populate(&init_mm, pgd,
259 lm_alias(kasan_early_shadow_p4d));
252#endif 260#endif
253 p4d = p4d_offset(pgd, addr); 261 p4d = p4d_offset(pgd, addr);
254 p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); 262 p4d_populate(&init_mm, p4d,
263 lm_alias(kasan_early_shadow_pud));
255 pud = pud_offset(p4d, addr); 264 pud = pud_offset(p4d, addr);
256 pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); 265 pud_populate(&init_mm, pud,
266 lm_alias(kasan_early_shadow_pmd));
257 pmd = pmd_offset(pud, addr); 267 pmd = pmd_offset(pud, addr);
258 pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); 268 pmd_populate_kernel(&init_mm, pmd,
269 lm_alias(kasan_early_shadow_pte));
259 continue; 270 continue;
260 } 271 }
261 272
@@ -350,7 +361,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
350 if (!pte_present(*pte)) 361 if (!pte_present(*pte))
351 continue; 362 continue;
352 363
353 if (WARN_ON(!kasan_zero_page_entry(*pte))) 364 if (WARN_ON(!kasan_early_shadow_page_entry(*pte)))
354 continue; 365 continue;
355 pte_clear(&init_mm, addr, pte); 366 pte_clear(&init_mm, addr, pte);
356 } 367 }
@@ -480,7 +491,7 @@ int kasan_add_zero_shadow(void *start, unsigned long size)
480 WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) 491 WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
481 return -EINVAL; 492 return -EINVAL;
482 493
483 ret = kasan_populate_zero_shadow(shadow_start, shadow_end); 494 ret = kasan_populate_early_shadow(shadow_start, shadow_end);
484 if (ret) 495 if (ret)
485 kasan_remove_zero_shadow(shadow_start, 496 kasan_remove_zero_shadow(shadow_start,
486 size >> KASAN_SHADOW_SCALE_SHIFT); 497 size >> KASAN_SHADOW_SCALE_SHIFT);
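
Note on the hunk above: the kasan_zero_* to kasan_early_shadow_* rename does not change the mechanism; one zeroed, write-protected page is aliased across the whole shadow range so untracked memory reads as accessible without allocating backing shadow. A toy userspace sketch of that aliasing idea, with invented names and a flat array standing in for the page tables:

/* Toy model of the early (zero) shadow: every "page table entry" in a
 * flat array aliases one shared zeroed page, so any address in the
 * covered range reads back shadow value 0 (fully accessible). */
#include <stdio.h>

#define PAGE_SIZE	4096
#define NR_ENTRIES	8

static unsigned char early_shadow_page[PAGE_SIZE];	/* stays all zero */
static unsigned char *pte[NR_ENTRIES];			/* toy page table */

static void populate_early_shadow(void)
{
	for (int i = 0; i < NR_ENTRIES; i++)
		pte[i] = early_shadow_page;	/* alias the shared page */
}

int main(void)
{
	populate_early_shadow();
	printf("entries 0 and 7 share one page: %d\n", pte[0] == pte[7]);
	printf("shadow byte read through entry 5: %d\n", pte[5][123]);
	return 0;
}

Both prints show the point of the scheme: many shadow entries share a single page whose bytes are all zero.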
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index c12dcfde2ebd..ea51b2d898ec 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -8,10 +8,22 @@
8#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) 8#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
9#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) 9#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
10 10
11#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */
12#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */
13#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */
14
15#ifdef CONFIG_KASAN_GENERIC
11#define KASAN_FREE_PAGE 0xFF /* page was freed */ 16#define KASAN_FREE_PAGE 0xFF /* page was freed */
12#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ 17#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
13#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ 18#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
14#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ 19#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
20#else
21#define KASAN_FREE_PAGE KASAN_TAG_INVALID
22#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
23#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
24#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
25#endif
26
15#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ 27#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
16 28
17/* 29/*
@@ -105,11 +117,25 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
105 << KASAN_SHADOW_SCALE_SHIFT); 117 << KASAN_SHADOW_SCALE_SHIFT);
106} 118}
107 119
120static inline bool addr_has_shadow(const void *addr)
121{
122 return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
123}
124
125void kasan_poison_shadow(const void *address, size_t size, u8 value);
126
127void check_memory_region(unsigned long addr, size_t size, bool write,
128 unsigned long ret_ip);
129
130void *find_first_bad_addr(void *addr, size_t size);
131const char *get_bug_type(struct kasan_access_info *info);
132
108void kasan_report(unsigned long addr, size_t size, 133void kasan_report(unsigned long addr, size_t size,
109 bool is_write, unsigned long ip); 134 bool is_write, unsigned long ip);
110void kasan_report_invalid_free(void *object, unsigned long ip); 135void kasan_report_invalid_free(void *object, unsigned long ip);
111 136
112#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) 137#if defined(CONFIG_KASAN_GENERIC) && \
138 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
113void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); 139void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
114void quarantine_reduce(void); 140void quarantine_reduce(void);
115void quarantine_remove_cache(struct kmem_cache *cache); 141void quarantine_remove_cache(struct kmem_cache *cache);
@@ -120,6 +146,37 @@ static inline void quarantine_reduce(void) { }
120static inline void quarantine_remove_cache(struct kmem_cache *cache) { } 146static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
121#endif 147#endif
122 148
149#ifdef CONFIG_KASAN_SW_TAGS
150
151void print_tags(u8 addr_tag, const void *addr);
152
153u8 random_tag(void);
154
155#else
156
157static inline void print_tags(u8 addr_tag, const void *addr) { }
158
159static inline u8 random_tag(void)
160{
161 return 0;
162}
163
164#endif
165
166#ifndef arch_kasan_set_tag
167#define arch_kasan_set_tag(addr, tag) ((void *)(addr))
168#endif
169#ifndef arch_kasan_reset_tag
170#define arch_kasan_reset_tag(addr) ((void *)(addr))
171#endif
172#ifndef arch_kasan_get_tag
173#define arch_kasan_get_tag(addr) 0
174#endif
175
176#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag)))
177#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr))
178#define get_tag(addr) arch_kasan_get_tag(addr)
179
123/* 180/*
124 * Exported functions for interfaces called from assembly or from generated 181 * Exported functions for interfaces called from assembly or from generated
125 * code. Declarations here to avoid warning about missing declarations. 182 * code. Declarations here to avoid warning about missing declarations.
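
Note on the hunk above: the new arch_kasan_set_tag()/reset_tag()/get_tag() hooks assume the tag lives in the top byte of a 64-bit pointer (arm64 Top Byte Ignore). A minimal userspace sketch of what a default 64-bit implementation could look like; the shift, mask and example address are assumptions for illustration, not taken from an arch header:

/* Sketch of top-byte pointer tagging behind the set_tag/reset_tag/get_tag
 * macros above. Assumes a 64-bit address with the tag in bits 63..56. */
#include <stdio.h>
#include <stdint.h>

#define TAG_SHIFT	56
#define TAG_KERNEL	0xFFu	/* native kernel pointers, never checked */

static void *set_tag(const void *addr, uint8_t tag)
{
	uintptr_t a = (uintptr_t)addr;

	a = (a & ~((uintptr_t)0xFF << TAG_SHIFT)) | ((uintptr_t)tag << TAG_SHIFT);
	return (void *)a;
}

static uint8_t get_tag(const void *addr)
{
	return (uint8_t)((uintptr_t)addr >> TAG_SHIFT);
}

static void *reset_tag(const void *addr)
{
	return set_tag(addr, TAG_KERNEL);
}

int main(void)
{
	void *p = (void *)(uintptr_t)0xffff000012345678ull;
	void *q = set_tag(p, 0x2a);

	printf("tag=%02x untagged=%p\n", get_tag(q), reset_tag(q));
	return 0;
}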
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index b209dbaefde8..978bc4a3eb51 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * KASAN quarantine. 3 * KASAN quarantine.
3 * 4 *
@@ -236,7 +237,7 @@ void quarantine_reduce(void)
236 * Update quarantine size in case of hotplug. Allocate a fraction of 237 * Update quarantine size in case of hotplug. Allocate a fraction of
237 * the installed memory to quarantine minus per-cpu queue limits. 238 * the installed memory to quarantine minus per-cpu queue limits.
238 */ 239 */
239 total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / 240 total_size = (totalram_pages() << PAGE_SHIFT) /
240 QUARANTINE_FRACTION; 241 QUARANTINE_FRACTION;
241 percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); 242 percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
242 new_quarantine_size = (total_size < percpu_quarantines) ? 243 new_quarantine_size = (total_size < percpu_quarantines) ?
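
Note on the hunk above: the one-line change follows the totalram_pages conversion elsewhere in this series; the global page counter is now read through an accessor instead of READ_ONCE on the raw variable. A hedged userspace equivalent of that accessor pattern, with C11 atomics standing in for the kernel's atomic_long_t helpers:

/* Userspace stand-in for the totalram_pages() accessor used above. */
#include <stdio.h>
#include <stdatomic.h>

static atomic_long _totalram_pages;

static long totalram_pages(void)
{
	return atomic_load(&_totalram_pages);
}

static void totalram_pages_add(long count)
{
	atomic_fetch_add(&_totalram_pages, count);
}

int main(void)
{
	totalram_pages_add(1L << 18);	/* pretend 1 GiB of 4 KiB pages */
	printf("quarantine budget: %ld pages\n",
	       totalram_pages() / 32);	/* illustrative fraction */
	return 0;
}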
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 5c169aa688fd..ca9418fe9232 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -1,5 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * This file contains error reporting code. 3 * This file contains common generic and tag-based KASAN error reporting code.
3 * 4 *
4 * Copyright (c) 2014 Samsung Electronics Co., Ltd. 5 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
5 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> 6 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
@@ -39,129 +40,43 @@
39#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) 40#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
40#define SHADOW_ROWS_AROUND_ADDR 2 41#define SHADOW_ROWS_AROUND_ADDR 2
41 42
42static const void *find_first_bad_addr(const void *addr, size_t size) 43static unsigned long kasan_flags;
43{
44 u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr);
45 const void *first_bad_addr = addr;
46
47 while (!shadow_val && first_bad_addr < addr + size) {
48 first_bad_addr += KASAN_SHADOW_SCALE_SIZE;
49 shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr);
50 }
51 return first_bad_addr;
52}
53 44
54static bool addr_has_shadow(struct kasan_access_info *info) 45#define KASAN_BIT_REPORTED 0
55{ 46#define KASAN_BIT_MULTI_SHOT 1
56 return (info->access_addr >=
57 kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
58}
59 47
60static const char *get_shadow_bug_type(struct kasan_access_info *info) 48bool kasan_save_enable_multi_shot(void)
61{ 49{
62 const char *bug_type = "unknown-crash"; 50 return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
63 u8 *shadow_addr;
64
65 info->first_bad_addr = find_first_bad_addr(info->access_addr,
66 info->access_size);
67
68 shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
69
70 /*
71 * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
72 * at the next shadow byte to determine the type of the bad access.
73 */
74 if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
75 shadow_addr++;
76
77 switch (*shadow_addr) {
78 case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
79 /*
80 * In theory it's still possible to see these shadow values
81 * due to a data race in the kernel code.
82 */
83 bug_type = "out-of-bounds";
84 break;
85 case KASAN_PAGE_REDZONE:
86 case KASAN_KMALLOC_REDZONE:
87 bug_type = "slab-out-of-bounds";
88 break;
89 case KASAN_GLOBAL_REDZONE:
90 bug_type = "global-out-of-bounds";
91 break;
92 case KASAN_STACK_LEFT:
93 case KASAN_STACK_MID:
94 case KASAN_STACK_RIGHT:
95 case KASAN_STACK_PARTIAL:
96 bug_type = "stack-out-of-bounds";
97 break;
98 case KASAN_FREE_PAGE:
99 case KASAN_KMALLOC_FREE:
100 bug_type = "use-after-free";
101 break;
102 case KASAN_USE_AFTER_SCOPE:
103 bug_type = "use-after-scope";
104 break;
105 case KASAN_ALLOCA_LEFT:
106 case KASAN_ALLOCA_RIGHT:
107 bug_type = "alloca-out-of-bounds";
108 break;
109 }
110
111 return bug_type;
112} 51}
52EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
113 53
114static const char *get_wild_bug_type(struct kasan_access_info *info) 54void kasan_restore_multi_shot(bool enabled)
115{ 55{
116 const char *bug_type = "unknown-crash"; 56 if (!enabled)
117 57 clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
118 if ((unsigned long)info->access_addr < PAGE_SIZE)
119 bug_type = "null-ptr-deref";
120 else if ((unsigned long)info->access_addr < TASK_SIZE)
121 bug_type = "user-memory-access";
122 else
123 bug_type = "wild-memory-access";
124
125 return bug_type;
126} 58}
59EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
127 60
128static const char *get_bug_type(struct kasan_access_info *info) 61static int __init kasan_set_multi_shot(char *str)
129{ 62{
130 if (addr_has_shadow(info)) 63 set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
131 return get_shadow_bug_type(info); 64 return 1;
132 return get_wild_bug_type(info);
133} 65}
66__setup("kasan_multi_shot", kasan_set_multi_shot);
134 67
135static void print_error_description(struct kasan_access_info *info) 68static void print_error_description(struct kasan_access_info *info)
136{ 69{
137 const char *bug_type = get_bug_type(info);
138
139 pr_err("BUG: KASAN: %s in %pS\n", 70 pr_err("BUG: KASAN: %s in %pS\n",
140 bug_type, (void *)info->ip); 71 get_bug_type(info), (void *)info->ip);
141 pr_err("%s of size %zu at addr %px by task %s/%d\n", 72 pr_err("%s of size %zu at addr %px by task %s/%d\n",
142 info->is_write ? "Write" : "Read", info->access_size, 73 info->is_write ? "Write" : "Read", info->access_size,
143 info->access_addr, current->comm, task_pid_nr(current)); 74 info->access_addr, current->comm, task_pid_nr(current));
144} 75}
145 76
146static inline bool kernel_or_module_addr(const void *addr)
147{
148 if (addr >= (void *)_stext && addr < (void *)_end)
149 return true;
150 if (is_module_address((unsigned long)addr))
151 return true;
152 return false;
153}
154
155static inline bool init_task_stack_addr(const void *addr)
156{
157 return addr >= (void *)&init_thread_union.stack &&
158 (addr <= (void *)&init_thread_union.stack +
159 sizeof(init_thread_union.stack));
160}
161
162static DEFINE_SPINLOCK(report_lock); 77static DEFINE_SPINLOCK(report_lock);
163 78
164static void kasan_start_report(unsigned long *flags) 79static void start_report(unsigned long *flags)
165{ 80{
166 /* 81 /*
167 * Make sure we don't end up in loop. 82 * Make sure we don't end up in loop.
@@ -171,7 +86,7 @@ static void kasan_start_report(unsigned long *flags)
171 pr_err("==================================================================\n"); 86 pr_err("==================================================================\n");
172} 87}
173 88
174static void kasan_end_report(unsigned long *flags) 89static void end_report(unsigned long *flags)
175{ 90{
176 pr_err("==================================================================\n"); 91 pr_err("==================================================================\n");
177 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 92 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -249,6 +164,22 @@ static void describe_object(struct kmem_cache *cache, void *object,
249 describe_object_addr(cache, object, addr); 164 describe_object_addr(cache, object, addr);
250} 165}
251 166
167static inline bool kernel_or_module_addr(const void *addr)
168{
169 if (addr >= (void *)_stext && addr < (void *)_end)
170 return true;
171 if (is_module_address((unsigned long)addr))
172 return true;
173 return false;
174}
175
176static inline bool init_task_stack_addr(const void *addr)
177{
178 return addr >= (void *)&init_thread_union.stack &&
179 (addr <= (void *)&init_thread_union.stack +
180 sizeof(init_thread_union.stack));
181}
182
252static void print_address_description(void *addr) 183static void print_address_description(void *addr)
253{ 184{
254 struct page *page = addr_to_page(addr); 185 struct page *page = addr_to_page(addr);
@@ -326,126 +257,69 @@ static void print_shadow_for_address(const void *addr)
326 } 257 }
327} 258}
328 259
260static bool report_enabled(void)
261{
262 if (current->kasan_depth)
263 return false;
264 if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
265 return true;
266 return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
267}
268
329void kasan_report_invalid_free(void *object, unsigned long ip) 269void kasan_report_invalid_free(void *object, unsigned long ip)
330{ 270{
331 unsigned long flags; 271 unsigned long flags;
332 272
333 kasan_start_report(&flags); 273 start_report(&flags);
334 pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); 274 pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
275 print_tags(get_tag(object), reset_tag(object));
276 object = reset_tag(object);
335 pr_err("\n"); 277 pr_err("\n");
336 print_address_description(object); 278 print_address_description(object);
337 pr_err("\n"); 279 pr_err("\n");
338 print_shadow_for_address(object); 280 print_shadow_for_address(object);
339 kasan_end_report(&flags); 281 end_report(&flags);
340}
341
342static void kasan_report_error(struct kasan_access_info *info)
343{
344 unsigned long flags;
345
346 kasan_start_report(&flags);
347
348 print_error_description(info);
349 pr_err("\n");
350
351 if (!addr_has_shadow(info)) {
352 dump_stack();
353 } else {
354 print_address_description((void *)info->access_addr);
355 pr_err("\n");
356 print_shadow_for_address(info->first_bad_addr);
357 }
358
359 kasan_end_report(&flags);
360}
361
362static unsigned long kasan_flags;
363
364#define KASAN_BIT_REPORTED 0
365#define KASAN_BIT_MULTI_SHOT 1
366
367bool kasan_save_enable_multi_shot(void)
368{
369 return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
370}
371EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
372
373void kasan_restore_multi_shot(bool enabled)
374{
375 if (!enabled)
376 clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
377}
378EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
379
380static int __init kasan_set_multi_shot(char *str)
381{
382 set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
383 return 1;
384}
385__setup("kasan_multi_shot", kasan_set_multi_shot);
386
387static inline bool kasan_report_enabled(void)
388{
389 if (current->kasan_depth)
390 return false;
391 if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
392 return true;
393 return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
394} 282}
395 283
396void kasan_report(unsigned long addr, size_t size, 284void kasan_report(unsigned long addr, size_t size,
397 bool is_write, unsigned long ip) 285 bool is_write, unsigned long ip)
398{ 286{
399 struct kasan_access_info info; 287 struct kasan_access_info info;
288 void *tagged_addr;
289 void *untagged_addr;
290 unsigned long flags;
400 291
401 if (likely(!kasan_report_enabled())) 292 if (likely(!report_enabled()))
402 return; 293 return;
403 294
404 disable_trace_on_warning(); 295 disable_trace_on_warning();
405 296
406 info.access_addr = (void *)addr; 297 tagged_addr = (void *)addr;
407 info.first_bad_addr = (void *)addr; 298 untagged_addr = reset_tag(tagged_addr);
299
300 info.access_addr = tagged_addr;
301 if (addr_has_shadow(untagged_addr))
302 info.first_bad_addr = find_first_bad_addr(tagged_addr, size);
303 else
304 info.first_bad_addr = untagged_addr;
408 info.access_size = size; 305 info.access_size = size;
409 info.is_write = is_write; 306 info.is_write = is_write;
410 info.ip = ip; 307 info.ip = ip;
411 308
412 kasan_report_error(&info); 309 start_report(&flags);
413}
414 310
311 print_error_description(&info);
312 if (addr_has_shadow(untagged_addr))
313 print_tags(get_tag(tagged_addr), info.first_bad_addr);
314 pr_err("\n");
415 315
416#define DEFINE_ASAN_REPORT_LOAD(size) \ 316 if (addr_has_shadow(untagged_addr)) {
417void __asan_report_load##size##_noabort(unsigned long addr) \ 317 print_address_description(untagged_addr);
418{ \ 318 pr_err("\n");
419 kasan_report(addr, size, false, _RET_IP_); \ 319 print_shadow_for_address(info.first_bad_addr);
420} \ 320 } else {
421EXPORT_SYMBOL(__asan_report_load##size##_noabort) 321 dump_stack();
422 322 }
423#define DEFINE_ASAN_REPORT_STORE(size) \
424void __asan_report_store##size##_noabort(unsigned long addr) \
425{ \
426 kasan_report(addr, size, true, _RET_IP_); \
427} \
428EXPORT_SYMBOL(__asan_report_store##size##_noabort)
429
430DEFINE_ASAN_REPORT_LOAD(1);
431DEFINE_ASAN_REPORT_LOAD(2);
432DEFINE_ASAN_REPORT_LOAD(4);
433DEFINE_ASAN_REPORT_LOAD(8);
434DEFINE_ASAN_REPORT_LOAD(16);
435DEFINE_ASAN_REPORT_STORE(1);
436DEFINE_ASAN_REPORT_STORE(2);
437DEFINE_ASAN_REPORT_STORE(4);
438DEFINE_ASAN_REPORT_STORE(8);
439DEFINE_ASAN_REPORT_STORE(16);
440
441void __asan_report_load_n_noabort(unsigned long addr, size_t size)
442{
443 kasan_report(addr, size, false, _RET_IP_);
444}
445EXPORT_SYMBOL(__asan_report_load_n_noabort);
446 323
447void __asan_report_store_n_noabort(unsigned long addr, size_t size) 324 end_report(&flags);
448{
449 kasan_report(addr, size, true, _RET_IP_);
450} 325}
451EXPORT_SYMBOL(__asan_report_store_n_noabort);
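
Note on the hunk above: report.c now keeps only the code shared by both KASAN modes, including the once-only/multi-shot gating that previously sat further down the file. A small userspace model of report_enabled(), with plain C11 atomics standing in for the kernel bitops:

/* Userspace model of report_enabled(): one bit remembers that a report
 * was already printed, another turns on multi-shot mode. */
#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

#define BIT_REPORTED	(1u << 0)
#define BIT_MULTI_SHOT	(1u << 1)

static atomic_uint kasan_flags;

static bool report_enabled(void)
{
	if (atomic_load(&kasan_flags) & BIT_MULTI_SHOT)
		return true;
	/* Only the caller that flips the bit from 0 to 1 may report. */
	return !(atomic_fetch_or(&kasan_flags, BIT_REPORTED) & BIT_REPORTED);
}

int main(void)
{
	printf("first report:  %d\n", report_enabled());	/* 1 */
	printf("second report: %d\n", report_enabled());	/* 0 */
	atomic_fetch_or(&kasan_flags, BIT_MULTI_SHOT);
	printf("multi-shot:    %d\n", report_enabled());	/* 1 */
	return 0;
}

The fetch-or mirrors test_and_set_bit(): only the caller that sets KASAN_BIT_REPORTED first gets to print, unless multi-shot is enabled.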
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
new file mode 100644
index 000000000000..0777649e07c4
--- /dev/null
+++ b/mm/kasan/tags.c
@@ -0,0 +1,161 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * This file contains core tag-based KASAN code.
4 *
5 * Copyright (c) 2018 Google, Inc.
6 * Author: Andrey Konovalov <andreyknvl@google.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 */
13
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15#define DISABLE_BRANCH_PROFILING
16
17#include <linux/export.h>
18#include <linux/interrupt.h>
19#include <linux/init.h>
20#include <linux/kasan.h>
21#include <linux/kernel.h>
22#include <linux/kmemleak.h>
23#include <linux/linkage.h>
24#include <linux/memblock.h>
25#include <linux/memory.h>
26#include <linux/mm.h>
27#include <linux/module.h>
28#include <linux/printk.h>
29#include <linux/random.h>
30#include <linux/sched.h>
31#include <linux/sched/task_stack.h>
32#include <linux/slab.h>
33#include <linux/stacktrace.h>
34#include <linux/string.h>
35#include <linux/types.h>
36#include <linux/vmalloc.h>
37#include <linux/bug.h>
38
39#include "kasan.h"
40#include "../slab.h"
41
42static DEFINE_PER_CPU(u32, prng_state);
43
44void kasan_init_tags(void)
45{
46 int cpu;
47
48 for_each_possible_cpu(cpu)
49 per_cpu(prng_state, cpu) = get_random_u32();
50}
51
52/*
53 * If a preemption happens between this_cpu_read and this_cpu_write, the only
 54 * side effect is that we'll give a few objects allocated in different contexts
 55 * the same tag. Since tag-based KASAN is meant to be used as a probabilistic
56 * bug-detection debug feature, this doesn't have significant negative impact.
57 *
58 * Ideally the tags use strong randomness to prevent any attempts to predict
59 * them during explicit exploit attempts. But strong randomness is expensive,
60 * and we did an intentional trade-off to use a PRNG. This non-atomic RMW
 61 * sequence has in fact a positive effect, since interrupts that randomly skew
 62 * the PRNG at unpredictable points do only good.
63 */
64u8 random_tag(void)
65{
66 u32 state = this_cpu_read(prng_state);
67
68 state = 1664525 * state + 1013904223;
69 this_cpu_write(prng_state, state);
70
71 return (u8)(state % (KASAN_TAG_MAX + 1));
72}
73
74void *kasan_reset_tag(const void *addr)
75{
76 return reset_tag(addr);
77}
78
79void check_memory_region(unsigned long addr, size_t size, bool write,
80 unsigned long ret_ip)
81{
82 u8 tag;
83 u8 *shadow_first, *shadow_last, *shadow;
84 void *untagged_addr;
85
86 if (unlikely(size == 0))
87 return;
88
89 tag = get_tag((const void *)addr);
90
91 /*
92 * Ignore accesses for pointers tagged with 0xff (native kernel
93 * pointer tag) to suppress false positives caused by kmap.
94 *
95 * Some kernel code was written to account for archs that don't keep
96 * high memory mapped all the time, but rather map and unmap particular
97 * pages when needed. Instead of storing a pointer to the kernel memory,
98 * this code saves the address of the page structure and offset within
99 * that page for later use. Those pages are then mapped and unmapped
100 * with kmap/kunmap when necessary and virt_to_page is used to get the
101 * virtual address of the page. For arm64 (that keeps the high memory
102 * mapped all the time), kmap is turned into a page_address call.
103
104 * The issue is that with use of the page_address + virt_to_page
105 * sequence the top byte value of the original pointer gets lost (gets
106 * set to KASAN_TAG_KERNEL (0xFF)).
107 */
108 if (tag == KASAN_TAG_KERNEL)
109 return;
110
111 untagged_addr = reset_tag((const void *)addr);
112 if (unlikely(untagged_addr <
113 kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
114 kasan_report(addr, size, write, ret_ip);
115 return;
116 }
117 shadow_first = kasan_mem_to_shadow(untagged_addr);
118 shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1);
119 for (shadow = shadow_first; shadow <= shadow_last; shadow++) {
120 if (*shadow != tag) {
121 kasan_report(addr, size, write, ret_ip);
122 return;
123 }
124 }
125}
126
127#define DEFINE_HWASAN_LOAD_STORE(size) \
128 void __hwasan_load##size##_noabort(unsigned long addr) \
129 { \
130 check_memory_region(addr, size, false, _RET_IP_); \
131 } \
132 EXPORT_SYMBOL(__hwasan_load##size##_noabort); \
133 void __hwasan_store##size##_noabort(unsigned long addr) \
134 { \
135 check_memory_region(addr, size, true, _RET_IP_); \
136 } \
137 EXPORT_SYMBOL(__hwasan_store##size##_noabort)
138
139DEFINE_HWASAN_LOAD_STORE(1);
140DEFINE_HWASAN_LOAD_STORE(2);
141DEFINE_HWASAN_LOAD_STORE(4);
142DEFINE_HWASAN_LOAD_STORE(8);
143DEFINE_HWASAN_LOAD_STORE(16);
144
145void __hwasan_loadN_noabort(unsigned long addr, unsigned long size)
146{
147 check_memory_region(addr, size, false, _RET_IP_);
148}
149EXPORT_SYMBOL(__hwasan_loadN_noabort);
150
151void __hwasan_storeN_noabort(unsigned long addr, unsigned long size)
152{
153 check_memory_region(addr, size, true, _RET_IP_);
154}
155EXPORT_SYMBOL(__hwasan_storeN_noabort);
156
157void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
158{
159 kasan_poison_shadow((void *)addr, size, tag);
160}
161EXPORT_SYMBOL(__hwasan_tag_memory);
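
Note on the hunk above: random_tag() is a per-CPU linear congruential generator reduced to the usable tag range, with 0xFE and 0xFF reserved for KASAN_TAG_INVALID and KASAN_TAG_KERNEL. A standalone sketch of the same generator, with a single state variable replacing the per-CPU one:

/* Standalone version of the tag PRNG above: the same linear congruential
 * step, reduced to 0x00..0xFD so the reserved values never come out. */
#include <stdio.h>
#include <stdint.h>

#define TAG_MAX	0xFD

static uint32_t prng_state = 42;	/* any seed will do here */

static uint8_t random_tag(void)
{
	prng_state = 1664525 * prng_state + 1013904223;
	return (uint8_t)(prng_state % (TAG_MAX + 1));
}

int main(void)
{
	for (int i = 0; i < 8; i++)
		printf("%02x ", random_tag());
	printf("\n");
	return 0;
}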
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c
new file mode 100644
index 000000000000..8eaf5f722271
--- /dev/null
+++ b/mm/kasan/tags_report.c
@@ -0,0 +1,58 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * This file contains tag-based KASAN specific error reporting code.
4 *
5 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
6 * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
7 *
8 * Some code borrowed from https://github.com/xairy/kasan-prototype by
9 * Andrey Konovalov <andreyknvl@gmail.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 *
15 */
16
17#include <linux/bitops.h>
18#include <linux/ftrace.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/printk.h>
23#include <linux/sched.h>
24#include <linux/slab.h>
25#include <linux/stackdepot.h>
26#include <linux/stacktrace.h>
27#include <linux/string.h>
28#include <linux/types.h>
29#include <linux/kasan.h>
30#include <linux/module.h>
31
32#include <asm/sections.h>
33
34#include "kasan.h"
35#include "../slab.h"
36
37const char *get_bug_type(struct kasan_access_info *info)
38{
39 return "invalid-access";
40}
41
42void *find_first_bad_addr(void *addr, size_t size)
43{
44 u8 tag = get_tag(addr);
45 void *p = reset_tag(addr);
46 void *end = p + size;
47
48 while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p))
49 p += KASAN_SHADOW_SCALE_SIZE;
50 return p;
51}
52
53void print_tags(u8 addr_tag, const void *addr)
54{
55 u8 *shadow = (u8 *)kasan_mem_to_shadow(addr);
56
57 pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow);
58}
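
Note on the hunk above: the tag-based find_first_bad_addr() advances one shadow granule at a time until the memory tag stops matching the pointer tag. A userspace sketch of that walk, with the shadow represented as a plain byte array and a 16-byte granule assumed purely for illustration:

/* Sketch of the tag-based first-bad-address walk above. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define GRANULE	16

static size_t first_bad_offset(uint8_t ptr_tag, const uint8_t *shadow,
			       size_t size)
{
	size_t off = 0;

	while (off < size && shadow[off / GRANULE] == ptr_tag)
		off += GRANULE;
	return off;	/* offset of the first mismatching granule */
}

int main(void)
{
	uint8_t shadow[] = { 0x2a, 0x2a, 0x5c };	/* third granule re-tagged */

	printf("bad access starts at +%zu\n",
	       first_bad_offset(0x2a, shadow, 48));	/* +32 */
	return 0;
}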
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 43ce2f4d2551..4f017339ddb2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -944,8 +944,7 @@ static void collapse_huge_page(struct mm_struct *mm,
944 int isolated = 0, result = 0; 944 int isolated = 0, result = 0;
945 struct mem_cgroup *memcg; 945 struct mem_cgroup *memcg;
946 struct vm_area_struct *vma; 946 struct vm_area_struct *vma;
947 unsigned long mmun_start; /* For mmu_notifiers */ 947 struct mmu_notifier_range range;
948 unsigned long mmun_end; /* For mmu_notifiers */
949 gfp_t gfp; 948 gfp_t gfp;
950 949
951 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 950 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -1017,9 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm,
1017 pte = pte_offset_map(pmd, address); 1016 pte = pte_offset_map(pmd, address);
1018 pte_ptl = pte_lockptr(mm, pmd); 1017 pte_ptl = pte_lockptr(mm, pmd);
1019 1018
1020 mmun_start = address; 1019 mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE);
1021 mmun_end = address + HPAGE_PMD_SIZE; 1020 mmu_notifier_invalidate_range_start(&range);
1022 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1023 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ 1021 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
1024 /* 1022 /*
1025 * After this gup_fast can't run anymore. This also removes 1023 * After this gup_fast can't run anymore. This also removes
@@ -1029,7 +1027,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1029 */ 1027 */
1030 _pmd = pmdp_collapse_flush(vma, address, pmd); 1028 _pmd = pmdp_collapse_flush(vma, address, pmd);
1031 spin_unlock(pmd_ptl); 1029 spin_unlock(pmd_ptl);
1032 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1030 mmu_notifier_invalidate_range_end(&range);
1033 1031
1034 spin_lock(pte_ptl); 1032 spin_lock(pte_ptl);
1035 isolated = __collapse_huge_page_isolate(vma, address, pte); 1033 isolated = __collapse_huge_page_isolate(vma, address, pte);
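
Note on the hunk above: this is the first of several hunks in the section that switch mmu notifier callers from passing (mm, start, end) to initialising a struct mmu_notifier_range once and handing that around. A stripped-down sketch of the pattern, with an opaque mm type and only the fields this series relies on (the real struct carries more):

/* Stripped-down sketch of the mmu_notifier_range conversion. */
#include <stdio.h>

struct mm_struct;	/* opaque stand-in */

struct mmu_notifier_range {
	struct mm_struct *mm;
	unsigned long start;
	unsigned long end;
};

static void mmu_notifier_range_init(struct mmu_notifier_range *range,
				    struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	range->mm = mm;
	range->start = start;
	range->end = end;
}

int main(void)
{
	struct mmu_notifier_range range;

	/* khugepaged-style usage: one PMD-sized invalidation window. */
	mmu_notifier_range_init(&range, NULL, 0x200000UL, 0x200000UL + (2UL << 20));
	printf("invalidate [%#lx, %#lx)\n", range.start, range.end);
	return 0;
}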
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 877de4fa0720..f9d9dc250428 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1547,11 +1547,14 @@ static void kmemleak_scan(void)
1547 unsigned long pfn; 1547 unsigned long pfn;
1548 1548
1549 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1549 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1550 struct page *page; 1550 struct page *page = pfn_to_online_page(pfn);
1551 1551
1552 if (!pfn_valid(pfn)) 1552 if (!page)
1553 continue;
1554
1555 /* only scan pages belonging to this node */
1556 if (page_to_nid(page) != i)
1553 continue; 1557 continue;
1554 page = pfn_to_page(pfn);
1555 /* only scan if page is in use */ 1558 /* only scan if page is in use */
1556 if (page_count(page) == 0) 1559 if (page_count(page) == 0)
1557 continue; 1560 continue;
@@ -1647,7 +1650,7 @@ static void kmemleak_scan(void)
1647 */ 1650 */
1648static int kmemleak_scan_thread(void *arg) 1651static int kmemleak_scan_thread(void *arg)
1649{ 1652{
1650 static int first_run = 1; 1653 static int first_run = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN);
1651 1654
1652 pr_info("Automatic memory scanning thread started\n"); 1655 pr_info("Automatic memory scanning thread started\n");
1653 set_user_nice(current, 10); 1656 set_user_nice(current, 10);
@@ -2141,9 +2144,11 @@ static int __init kmemleak_late_init(void)
2141 return -ENOMEM; 2144 return -ENOMEM;
2142 } 2145 }
2143 2146
2144 mutex_lock(&scan_mutex); 2147 if (IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN)) {
2145 start_scan_thread(); 2148 mutex_lock(&scan_mutex);
2146 mutex_unlock(&scan_mutex); 2149 start_scan_thread();
2150 mutex_unlock(&scan_mutex);
2151 }
2147 2152
2148 pr_info("Kernel memory leak detector initialized\n"); 2153 pr_info("Kernel memory leak detector initialized\n");
2149 2154
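
Note on the hunk above: kmemleak gates the automatic scan thread behind CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN and uses pfn_to_online_page() plus a node check when walking memory. A tiny sketch of the config gating, with the option modelled as a plain preprocessor constant and everything else a stand-in:

/* Sketch of the auto-scan gating added above. */
#include <stdio.h>

#ifndef CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN
#define CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN 0	/* build-time choice */
#endif

static void start_scan_thread(void)
{
	printf("automatic memory scanning thread started\n");
}

int main(void)
{
	if (CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN)
		start_scan_thread();
	else
		printf("auto scan off; scans must be triggered manually\n");
	return 0;
}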
diff --git a/mm/ksm.c b/mm/ksm.c
index 5b0894b45ee5..6c48ad13b4c9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -25,7 +25,7 @@
25#include <linux/pagemap.h> 25#include <linux/pagemap.h>
26#include <linux/rmap.h> 26#include <linux/rmap.h>
27#include <linux/spinlock.h> 27#include <linux/spinlock.h>
28#include <linux/jhash.h> 28#include <linux/xxhash.h>
29#include <linux/delay.h> 29#include <linux/delay.h>
30#include <linux/kthread.h> 30#include <linux/kthread.h>
31#include <linux/wait.h> 31#include <linux/wait.h>
@@ -296,6 +296,7 @@ static unsigned long ksm_run = KSM_RUN_STOP;
296static void wait_while_offlining(void); 296static void wait_while_offlining(void);
297 297
298static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); 298static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
299static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
299static DEFINE_MUTEX(ksm_thread_mutex); 300static DEFINE_MUTEX(ksm_thread_mutex);
300static DEFINE_SPINLOCK(ksm_mmlist_lock); 301static DEFINE_SPINLOCK(ksm_mmlist_lock);
301 302
@@ -1009,7 +1010,7 @@ static u32 calc_checksum(struct page *page)
1009{ 1010{
1010 u32 checksum; 1011 u32 checksum;
1011 void *addr = kmap_atomic(page); 1012 void *addr = kmap_atomic(page);
1012 checksum = jhash2(addr, PAGE_SIZE / 4, 17); 1013 checksum = xxhash(addr, PAGE_SIZE, 0);
1013 kunmap_atomic(addr); 1014 kunmap_atomic(addr);
1014 return checksum; 1015 return checksum;
1015} 1016}
@@ -1042,8 +1043,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1042 }; 1043 };
1043 int swapped; 1044 int swapped;
1044 int err = -EFAULT; 1045 int err = -EFAULT;
1045 unsigned long mmun_start; /* For mmu_notifiers */ 1046 struct mmu_notifier_range range;
1046 unsigned long mmun_end; /* For mmu_notifiers */
1047 1047
1048 pvmw.address = page_address_in_vma(page, vma); 1048 pvmw.address = page_address_in_vma(page, vma);
1049 if (pvmw.address == -EFAULT) 1049 if (pvmw.address == -EFAULT)
@@ -1051,9 +1051,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1051 1051
1052 BUG_ON(PageTransCompound(page)); 1052 BUG_ON(PageTransCompound(page));
1053 1053
1054 mmun_start = pvmw.address; 1054 mmu_notifier_range_init(&range, mm, pvmw.address,
1055 mmun_end = pvmw.address + PAGE_SIZE; 1055 pvmw.address + PAGE_SIZE);
1056 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1056 mmu_notifier_invalidate_range_start(&range);
1057 1057
1058 if (!page_vma_mapped_walk(&pvmw)) 1058 if (!page_vma_mapped_walk(&pvmw))
1059 goto out_mn; 1059 goto out_mn;
@@ -1105,7 +1105,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1105out_unlock: 1105out_unlock:
1106 page_vma_mapped_walk_done(&pvmw); 1106 page_vma_mapped_walk_done(&pvmw);
1107out_mn: 1107out_mn:
1108 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1108 mmu_notifier_invalidate_range_end(&range);
1109out: 1109out:
1110 return err; 1110 return err;
1111} 1111}
@@ -1129,8 +1129,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
1129 spinlock_t *ptl; 1129 spinlock_t *ptl;
1130 unsigned long addr; 1130 unsigned long addr;
1131 int err = -EFAULT; 1131 int err = -EFAULT;
1132 unsigned long mmun_start; /* For mmu_notifiers */ 1132 struct mmu_notifier_range range;
1133 unsigned long mmun_end; /* For mmu_notifiers */
1134 1133
1135 addr = page_address_in_vma(page, vma); 1134 addr = page_address_in_vma(page, vma);
1136 if (addr == -EFAULT) 1135 if (addr == -EFAULT)
@@ -1140,9 +1139,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
1140 if (!pmd) 1139 if (!pmd)
1141 goto out; 1140 goto out;
1142 1141
1143 mmun_start = addr; 1142 mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
1144 mmun_end = addr + PAGE_SIZE; 1143 mmu_notifier_invalidate_range_start(&range);
1145 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1146 1144
1147 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 1145 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
1148 if (!pte_same(*ptep, orig_pte)) { 1146 if (!pte_same(*ptep, orig_pte)) {
@@ -1188,7 +1186,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
1188 pte_unmap_unlock(ptep, ptl); 1186 pte_unmap_unlock(ptep, ptl);
1189 err = 0; 1187 err = 0;
1190out_mn: 1188out_mn:
1191 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1189 mmu_notifier_invalidate_range_end(&range);
1192out: 1190out:
1193 return err; 1191 return err;
1194} 1192}
@@ -2391,6 +2389,8 @@ static int ksmd_should_run(void)
2391 2389
2392static int ksm_scan_thread(void *nothing) 2390static int ksm_scan_thread(void *nothing)
2393{ 2391{
2392 unsigned int sleep_ms;
2393
2394 set_freezable(); 2394 set_freezable();
2395 set_user_nice(current, 5); 2395 set_user_nice(current, 5);
2396 2396
@@ -2404,8 +2404,10 @@ static int ksm_scan_thread(void *nothing)
2404 try_to_freeze(); 2404 try_to_freeze();
2405 2405
2406 if (ksmd_should_run()) { 2406 if (ksmd_should_run()) {
2407 schedule_timeout_interruptible( 2407 sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
2408 msecs_to_jiffies(ksm_thread_sleep_millisecs)); 2408 wait_event_interruptible_timeout(ksm_iter_wait,
2409 sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
2410 msecs_to_jiffies(sleep_ms));
2409 } else { 2411 } else {
2410 wait_event_freezable(ksm_thread_wait, 2412 wait_event_freezable(ksm_thread_wait,
2411 ksmd_should_run() || kthread_should_stop()); 2413 ksmd_should_run() || kthread_should_stop());
@@ -2824,6 +2826,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj,
2824 return -EINVAL; 2826 return -EINVAL;
2825 2827
2826 ksm_thread_sleep_millisecs = msecs; 2828 ksm_thread_sleep_millisecs = msecs;
2829 wake_up_interruptible(&ksm_iter_wait);
2827 2830
2828 return count; 2831 return count;
2829} 2832}
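
Note on the hunks above: besides swapping jhash for xxhash in calc_checksum(), the ksm.c changes make ksmd sleep on a wait queue so a new sleep_millisecs value written through sysfs takes effect immediately instead of after the old interval expires. A userspace analogue of that wake-on-store pattern using a condition variable (compile with -lpthread); the function names mirror the kernel ones but the types are stand-ins:

/* Userspace analogue of the ksmd sleep change: wait with a timeout, but
 * wake early when the interval is updated. */
#include <stdio.h>
#include <pthread.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t iter_wait = PTHREAD_COND_INITIALIZER;
static unsigned int sleep_ms = 200;

static void ksmd_sleep(void)
{
	struct timespec deadline;
	unsigned int seen;

	pthread_mutex_lock(&lock);
	seen = sleep_ms;
	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += seen / 1000;
	deadline.tv_nsec += (long)(seen % 1000) * 1000000L;
	if (deadline.tv_nsec >= 1000000000L) {
		deadline.tv_sec++;
		deadline.tv_nsec -= 1000000000L;
	}
	/* Stop waiting on timeout or as soon as sleep_ms changes. */
	while (sleep_ms == seen &&
	       pthread_cond_timedwait(&iter_wait, &lock, &deadline) == 0)
		;
	pthread_mutex_unlock(&lock);
}

static void sleep_millisecs_store(unsigned int msecs)
{
	pthread_mutex_lock(&lock);
	sleep_ms = msecs;
	pthread_cond_broadcast(&iter_wait);	/* wake_up_interruptible() */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	sleep_millisecs_store(20);
	ksmd_sleep();	/* waits ~20 ms, the freshly stored interval */
	printf("iteration done, sleep_ms=%u\n", sleep_ms);
	return 0;
}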
diff --git a/mm/madvise.c b/mm/madvise.c
index 6cb1ca93e290..21a7881a2db4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -458,29 +458,30 @@ static void madvise_free_page_range(struct mmu_gather *tlb,
458static int madvise_free_single_vma(struct vm_area_struct *vma, 458static int madvise_free_single_vma(struct vm_area_struct *vma,
459 unsigned long start_addr, unsigned long end_addr) 459 unsigned long start_addr, unsigned long end_addr)
460{ 460{
461 unsigned long start, end;
462 struct mm_struct *mm = vma->vm_mm; 461 struct mm_struct *mm = vma->vm_mm;
462 struct mmu_notifier_range range;
463 struct mmu_gather tlb; 463 struct mmu_gather tlb;
464 464
465 /* MADV_FREE works for only anon vma at the moment */ 465 /* MADV_FREE works for only anon vma at the moment */
466 if (!vma_is_anonymous(vma)) 466 if (!vma_is_anonymous(vma))
467 return -EINVAL; 467 return -EINVAL;
468 468
469 start = max(vma->vm_start, start_addr); 469 range.start = max(vma->vm_start, start_addr);
470 if (start >= vma->vm_end) 470 if (range.start >= vma->vm_end)
471 return -EINVAL; 471 return -EINVAL;
472 end = min(vma->vm_end, end_addr); 472 range.end = min(vma->vm_end, end_addr);
473 if (end <= vma->vm_start) 473 if (range.end <= vma->vm_start)
474 return -EINVAL; 474 return -EINVAL;
475 mmu_notifier_range_init(&range, mm, range.start, range.end);
475 476
476 lru_add_drain(); 477 lru_add_drain();
477 tlb_gather_mmu(&tlb, mm, start, end); 478 tlb_gather_mmu(&tlb, mm, range.start, range.end);
478 update_hiwater_rss(mm); 479 update_hiwater_rss(mm);
479 480
480 mmu_notifier_invalidate_range_start(mm, start, end); 481 mmu_notifier_invalidate_range_start(&range);
481 madvise_free_page_range(&tlb, vma, start, end); 482 madvise_free_page_range(&tlb, vma, range.start, range.end);
482 mmu_notifier_invalidate_range_end(mm, start, end); 483 mmu_notifier_invalidate_range_end(&range);
483 tlb_finish_mmu(&tlb, start, end); 484 tlb_finish_mmu(&tlb, range.start, range.end);
484 485
485 return 0; 486 return 0;
486} 487}
diff --git a/mm/memblock.c b/mm/memblock.c
index 81ae63ca78d0..022d4cbb3618 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -262,7 +262,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
262 phys_addr_t kernel_end, ret; 262 phys_addr_t kernel_end, ret;
263 263
264 /* pump up @end */ 264 /* pump up @end */
265 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 265 if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
266 end == MEMBLOCK_ALLOC_KASAN)
266 end = memblock.current_limit; 267 end = memblock.current_limit;
267 268
268 /* avoid allocating the first page */ 269 /* avoid allocating the first page */
@@ -800,7 +801,14 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
800 return memblock_remove_range(&memblock.memory, base, size); 801 return memblock_remove_range(&memblock.memory, base, size);
801} 802}
802 803
803 804/**
805 * memblock_free - free boot memory block
806 * @base: phys starting address of the boot memory block
807 * @size: size of the boot memory block in bytes
808 *
809 * Free boot memory block previously allocated by memblock_alloc_xx() API.
 810 * The freed memory will not be released to the buddy allocator.
811 */
804int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) 812int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
805{ 813{
806 phys_addr_t end = base + size - 1; 814 phys_addr_t end = base + size - 1;
@@ -1412,13 +1420,15 @@ again:
1412done: 1420done:
1413 ptr = phys_to_virt(alloc); 1421 ptr = phys_to_virt(alloc);
1414 1422
1415 /* 1423 /* Skip kmemleak for kasan_init() due to high volume. */
1416 * The min_count is set to 0 so that bootmem allocated blocks 1424 if (max_addr != MEMBLOCK_ALLOC_KASAN)
1417 * are never reported as leaks. This is because many of these blocks 1425 /*
1418 * are only referred via the physical address which is not 1426 * The min_count is set to 0 so that bootmem allocated
1419 * looked up by kmemleak. 1427 * blocks are never reported as leaks. This is because many
1420 */ 1428 * of these blocks are only referred via the physical
1421 kmemleak_alloc(ptr, size, 0, 0); 1429 * address which is not looked up by kmemleak.
1430 */
1431 kmemleak_alloc(ptr, size, 0, 0);
1422 1432
1423 return ptr; 1433 return ptr;
1424} 1434}
@@ -1537,24 +1547,6 @@ void * __init memblock_alloc_try_nid(
1537} 1547}
1538 1548
1539/** 1549/**
1540 * __memblock_free_early - free boot memory block
1541 * @base: phys starting address of the boot memory block
1542 * @size: size of the boot memory block in bytes
1543 *
1544 * Free boot memory block previously allocated by memblock_alloc_xx() API.
1545 * The freeing memory will not be released to the buddy allocator.
1546 */
1547void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
1548{
1549 phys_addr_t end = base + size - 1;
1550
1551 memblock_dbg("%s: [%pa-%pa] %pF\n",
1552 __func__, &base, &end, (void *)_RET_IP_);
1553 kmemleak_free_part_phys(base, size);
1554 memblock_remove_range(&memblock.reserved, base, size);
1555}
1556
1557/**
1558 * __memblock_free_late - free bootmem block pages directly to buddy allocator 1550 * __memblock_free_late - free bootmem block pages directly to buddy allocator
1559 * @base: phys starting address of the boot memory block 1551 * @base: phys starting address of the boot memory block
1560 * @size: size of the boot memory block in bytes 1552 * @size: size of the boot memory block in bytes
@@ -1576,7 +1568,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
1576 1568
1577 for (; cursor < end; cursor++) { 1569 for (; cursor < end; cursor++) {
1578 memblock_free_pages(pfn_to_page(cursor), cursor, 0); 1570 memblock_free_pages(pfn_to_page(cursor), cursor, 0);
1579 totalram_pages++; 1571 totalram_pages_inc();
1580 } 1572 }
1581} 1573}
1582 1574
@@ -1950,7 +1942,7 @@ void reset_node_managed_pages(pg_data_t *pgdat)
1950 struct zone *z; 1942 struct zone *z;
1951 1943
1952 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 1944 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
1953 z->managed_pages = 0; 1945 atomic_long_set(&z->managed_pages, 0);
1954} 1946}
1955 1947
1956void __init reset_all_zones_managed_pages(void) 1948void __init reset_all_zones_managed_pages(void)
@@ -1978,7 +1970,7 @@ unsigned long __init memblock_free_all(void)
1978 reset_all_zones_managed_pages(); 1970 reset_all_zones_managed_pages();
1979 1971
1980 pages = free_low_memory_core_early(); 1972 pages = free_low_memory_core_early();
1981 totalram_pages += pages; 1973 totalram_pages_add(pages);
1982 1974
1983 return pages; 1975 return pages;
1984} 1976}
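
Note on the hunks above: memblock gains a MEMBLOCK_ALLOC_KASAN sentinel so the huge early shadow allocations resolve against the current limit while skipping kmemleak tracking. A toy sketch of the sentinel handling; the numeric values are placeholders, not the kernel's definitions:

/* Toy sketch of the MEMBLOCK_ALLOC_KASAN sentinel handling. */
#include <stdio.h>

#define ALLOC_ACCESSIBLE	0ULL
#define ALLOC_KASAN		1ULL

static unsigned long long current_limit = 1ULL << 32;

static unsigned long long clamp_end(unsigned long long end)
{
	if (end == ALLOC_ACCESSIBLE || end == ALLOC_KASAN)
		return current_limit;	/* "pump up @end" */
	return end;
}

int main(void)
{
	unsigned long long end = ALLOC_KASAN;
	int track_with_kmemleak = (end != ALLOC_KASAN);

	printf("end=%#llx kmemleak=%d\n", clamp_end(end), track_with_kmemleak);
	return 0;
}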
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6e1469b80cb7..af7f18b32389 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1293,32 +1293,39 @@ static const char *const memcg1_stat_names[] = {
1293 1293
1294#define K(x) ((x) << (PAGE_SHIFT-10)) 1294#define K(x) ((x) << (PAGE_SHIFT-10))
1295/** 1295/**
1296 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 1296 * mem_cgroup_print_oom_context: Print OOM information relevant to
1297 * memory controller.
1297 * @memcg: The memory cgroup that went over limit 1298 * @memcg: The memory cgroup that went over limit
1298 * @p: Task that is going to be killed 1299 * @p: Task that is going to be killed
1299 * 1300 *
1300 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1301 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1301 * enabled 1302 * enabled
1302 */ 1303 */
1303void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1304void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1304{ 1305{
1305 struct mem_cgroup *iter;
1306 unsigned int i;
1307
1308 rcu_read_lock(); 1306 rcu_read_lock();
1309 1307
1308 if (memcg) {
1309 pr_cont(",oom_memcg=");
1310 pr_cont_cgroup_path(memcg->css.cgroup);
1311 } else
1312 pr_cont(",global_oom");
1310 if (p) { 1313 if (p) {
1311 pr_info("Task in "); 1314 pr_cont(",task_memcg=");
1312 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1315 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1313 pr_cont(" killed as a result of limit of ");
1314 } else {
1315 pr_info("Memory limit reached of cgroup ");
1316 } 1316 }
1317
1318 pr_cont_cgroup_path(memcg->css.cgroup);
1319 pr_cont("\n");
1320
1321 rcu_read_unlock(); 1317 rcu_read_unlock();
1318}
1319
1320/**
1321 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1322 * memory controller.
1323 * @memcg: The memory cgroup that went over limit
1324 */
1325void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1326{
1327 struct mem_cgroup *iter;
1328 unsigned int i;
1322 1329
1323 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1330 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1324 K((u64)page_counter_read(&memcg->memory)), 1331 K((u64)page_counter_read(&memcg->memory)),
@@ -1666,6 +1673,9 @@ enum oom_status {
1666 1673
1667static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1674static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1668{ 1675{
1676 enum oom_status ret;
1677 bool locked;
1678
1669 if (order > PAGE_ALLOC_COSTLY_ORDER) 1679 if (order > PAGE_ALLOC_COSTLY_ORDER)
1670 return OOM_SKIPPED; 1680 return OOM_SKIPPED;
1671 1681
@@ -1700,10 +1710,23 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
1700 return OOM_ASYNC; 1710 return OOM_ASYNC;
1701 } 1711 }
1702 1712
1713 mem_cgroup_mark_under_oom(memcg);
1714
1715 locked = mem_cgroup_oom_trylock(memcg);
1716
1717 if (locked)
1718 mem_cgroup_oom_notify(memcg);
1719
1720 mem_cgroup_unmark_under_oom(memcg);
1703 if (mem_cgroup_out_of_memory(memcg, mask, order)) 1721 if (mem_cgroup_out_of_memory(memcg, mask, order))
1704 return OOM_SUCCESS; 1722 ret = OOM_SUCCESS;
1723 else
1724 ret = OOM_FAILED;
1705 1725
1706 return OOM_FAILED; 1726 if (locked)
1727 mem_cgroup_oom_unlock(memcg);
1728
1729 return ret;
1707} 1730}
1708 1731
1709/** 1732/**
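
Note on the hunks above: the second memcontrol change reorders the charge-path OOM handling so eventfd listeners are notified while the oom lock is held, before the kill, and the lock is dropped on every exit path. A compressed sketch of that control flow, with printable no-op stand-ins for the locking and notification helpers:

/* Compressed sketch of the reordered charge-path OOM flow above. */
#include <stdio.h>
#include <stdbool.h>

enum oom_status { OOM_SUCCESS, OOM_FAILED };

static void mark_under_oom(void)	{ }
static void unmark_under_oom(void)	{ }
static bool oom_trylock(void)		{ return true; }
static void oom_unlock(void)		{ }
static void oom_notify(void)		{ printf("notify oom eventfd listeners\n"); }
static bool out_of_memory(void)		{ return true; }	/* pretend a kill worked */

static enum oom_status memcg_oom(void)
{
	enum oom_status ret;
	bool locked;

	mark_under_oom();
	locked = oom_trylock();
	if (locked)
		oom_notify();		/* listeners hear about it before the kill */
	unmark_under_oom();

	ret = out_of_memory() ? OOM_SUCCESS : OOM_FAILED;

	if (locked)
		oom_unlock();
	return ret;
}

int main(void)
{
	printf("oom status: %d\n", memcg_oom());
	return 0;
}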
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7c72f2a95785..6379fff1a5ff 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
966 enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 966 enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
967 struct address_space *mapping; 967 struct address_space *mapping;
968 LIST_HEAD(tokill); 968 LIST_HEAD(tokill);
969 bool unmap_success; 969 bool unmap_success = true;
970 int kill = 1, forcekill; 970 int kill = 1, forcekill;
971 struct page *hpage = *hpagep; 971 struct page *hpage = *hpagep;
972 bool mlocked = PageMlocked(hpage); 972 bool mlocked = PageMlocked(hpage);
@@ -1028,7 +1028,19 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
1028 if (kill) 1028 if (kill)
1029 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); 1029 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
1030 1030
1031 unmap_success = try_to_unmap(hpage, ttu); 1031 if (!PageHuge(hpage)) {
1032 unmap_success = try_to_unmap(hpage, ttu);
1033 } else if (mapping) {
1034 /*
1035 * For hugetlb pages, try_to_unmap could potentially call
1036 * huge_pmd_unshare. Because of this, take semaphore in
1037 * write mode here and set TTU_RMAP_LOCKED to indicate we
 1038 * have taken the lock at this higher level.
1039 */
1040 i_mmap_lock_write(mapping);
1041 unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
1042 i_mmap_unlock_write(mapping);
1043 }
1032 if (!unmap_success) 1044 if (!unmap_success)
1033 pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", 1045 pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
1034 pfn, page_mapcount(hpage)); 1046 pfn, page_mapcount(hpage));
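
Note on the hunk above: for hugetlb pages the caller now takes i_mmap_rwsem in write mode itself and passes TTU_RMAP_LOCKED so the rmap walk does not try to take it again. A schematic sketch of that caller-locks-and-flags pattern, with invented flag values and printf stand-ins for the locking primitives:

/* Schematic of the hugetlb unmap change above. */
#include <stdio.h>
#include <stdbool.h>

#define TTU_BASE_FLAGS	0x1u
#define TTU_RMAP_LOCKED	0x2u

static void i_mmap_lock_write(void)	{ printf("i_mmap_rwsem: write lock\n"); }
static void i_mmap_unlock_write(void)	{ printf("i_mmap_rwsem: unlock\n"); }

static bool try_to_unmap(unsigned int flags)
{
	printf("unmap, rmap lock held by caller: %d\n",
	       !!(flags & TTU_RMAP_LOCKED));
	return true;
}

int main(void)
{
	bool is_huge = true;
	bool unmap_success;

	if (!is_huge) {
		unmap_success = try_to_unmap(TTU_BASE_FLAGS);
	} else {
		i_mmap_lock_write();
		unmap_success = try_to_unmap(TTU_BASE_FLAGS | TTU_RMAP_LOCKED);
		i_mmap_unlock_write();
	}
	printf("unmap_success=%d\n", unmap_success);
	return 0;
}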
diff --git a/mm/memory.c b/mm/memory.c
index 4ad2d293ddc2..2dd2f9ab57f4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -973,8 +973,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
973 unsigned long next; 973 unsigned long next;
974 unsigned long addr = vma->vm_start; 974 unsigned long addr = vma->vm_start;
975 unsigned long end = vma->vm_end; 975 unsigned long end = vma->vm_end;
976 unsigned long mmun_start; /* For mmu_notifiers */ 976 struct mmu_notifier_range range;
977 unsigned long mmun_end; /* For mmu_notifiers */
978 bool is_cow; 977 bool is_cow;
979 int ret; 978 int ret;
980 979
@@ -1008,11 +1007,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1008 * is_cow_mapping() returns true. 1007 * is_cow_mapping() returns true.
1009 */ 1008 */
1010 is_cow = is_cow_mapping(vma->vm_flags); 1009 is_cow = is_cow_mapping(vma->vm_flags);
1011 mmun_start = addr; 1010
1012 mmun_end = end; 1011 if (is_cow) {
1013 if (is_cow) 1012 mmu_notifier_range_init(&range, src_mm, addr, end);
1014 mmu_notifier_invalidate_range_start(src_mm, mmun_start, 1013 mmu_notifier_invalidate_range_start(&range);
1015 mmun_end); 1014 }
1016 1015
1017 ret = 0; 1016 ret = 0;
1018 dst_pgd = pgd_offset(dst_mm, addr); 1017 dst_pgd = pgd_offset(dst_mm, addr);
@@ -1029,7 +1028,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1029 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 1028 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1030 1029
1031 if (is_cow) 1030 if (is_cow)
1032 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); 1031 mmu_notifier_invalidate_range_end(&range);
1033 return ret; 1032 return ret;
1034} 1033}
1035 1034
@@ -1332,12 +1331,13 @@ void unmap_vmas(struct mmu_gather *tlb,
1332 struct vm_area_struct *vma, unsigned long start_addr, 1331 struct vm_area_struct *vma, unsigned long start_addr,
1333 unsigned long end_addr) 1332 unsigned long end_addr)
1334{ 1333{
1335 struct mm_struct *mm = vma->vm_mm; 1334 struct mmu_notifier_range range;
1336 1335
1337 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1336 mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr);
1337 mmu_notifier_invalidate_range_start(&range);
1338 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) 1338 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1339 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); 1339 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1340 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1340 mmu_notifier_invalidate_range_end(&range);
1341} 1341}
1342 1342
1343/** 1343/**
@@ -1351,18 +1351,18 @@ void unmap_vmas(struct mmu_gather *tlb,
1351void zap_page_range(struct vm_area_struct *vma, unsigned long start, 1351void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1352 unsigned long size) 1352 unsigned long size)
1353{ 1353{
1354 struct mm_struct *mm = vma->vm_mm; 1354 struct mmu_notifier_range range;
1355 struct mmu_gather tlb; 1355 struct mmu_gather tlb;
1356 unsigned long end = start + size;
1357 1356
1358 lru_add_drain(); 1357 lru_add_drain();
1359 tlb_gather_mmu(&tlb, mm, start, end); 1358 mmu_notifier_range_init(&range, vma->vm_mm, start, start + size);
1360 update_hiwater_rss(mm); 1359 tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
1361 mmu_notifier_invalidate_range_start(mm, start, end); 1360 update_hiwater_rss(vma->vm_mm);
1362 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) 1361 mmu_notifier_invalidate_range_start(&range);
1363 unmap_single_vma(&tlb, vma, start, end, NULL); 1362 for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
1364 mmu_notifier_invalidate_range_end(mm, start, end); 1363 unmap_single_vma(&tlb, vma, start, range.end, NULL);
1365 tlb_finish_mmu(&tlb, start, end); 1364 mmu_notifier_invalidate_range_end(&range);
1365 tlb_finish_mmu(&tlb, start, range.end);
1366} 1366}
1367 1367
1368/** 1368/**
@@ -1377,17 +1377,17 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1377static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, 1377static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1378 unsigned long size, struct zap_details *details) 1378 unsigned long size, struct zap_details *details)
1379{ 1379{
1380 struct mm_struct *mm = vma->vm_mm; 1380 struct mmu_notifier_range range;
1381 struct mmu_gather tlb; 1381 struct mmu_gather tlb;
1382 unsigned long end = address + size;
1383 1382
1384 lru_add_drain(); 1383 lru_add_drain();
1385 tlb_gather_mmu(&tlb, mm, address, end); 1384 mmu_notifier_range_init(&range, vma->vm_mm, address, address + size);
1386 update_hiwater_rss(mm); 1385 tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
1387 mmu_notifier_invalidate_range_start(mm, address, end); 1386 update_hiwater_rss(vma->vm_mm);
1388 unmap_single_vma(&tlb, vma, address, end, details); 1387 mmu_notifier_invalidate_range_start(&range);
1389 mmu_notifier_invalidate_range_end(mm, address, end); 1388 unmap_single_vma(&tlb, vma, address, range.end, details);
1390 tlb_finish_mmu(&tlb, address, end); 1389 mmu_notifier_invalidate_range_end(&range);
1390 tlb_finish_mmu(&tlb, address, range.end);
1391} 1391}
1392 1392
1393/** 1393/**
@@ -2247,9 +2247,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2247 struct page *new_page = NULL; 2247 struct page *new_page = NULL;
2248 pte_t entry; 2248 pte_t entry;
2249 int page_copied = 0; 2249 int page_copied = 0;
2250 const unsigned long mmun_start = vmf->address & PAGE_MASK;
2251 const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2252 struct mem_cgroup *memcg; 2250 struct mem_cgroup *memcg;
2251 struct mmu_notifier_range range;
2253 2252
2254 if (unlikely(anon_vma_prepare(vma))) 2253 if (unlikely(anon_vma_prepare(vma)))
2255 goto oom; 2254 goto oom;
@@ -2272,7 +2271,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2272 2271
2273 __SetPageUptodate(new_page); 2272 __SetPageUptodate(new_page);
2274 2273
2275 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2274 mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK,
2275 (vmf->address & PAGE_MASK) + PAGE_SIZE);
2276 mmu_notifier_invalidate_range_start(&range);
2276 2277
2277 /* 2278 /*
2278 * Re-check the pte - we dropped the lock 2279 * Re-check the pte - we dropped the lock
@@ -2349,7 +2350,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
2349 * No need to double call mmu_notifier->invalidate_range() callback as 2350 * No need to double call mmu_notifier->invalidate_range() callback as
2350 * the above ptep_clear_flush_notify() did already call it. 2351 * the above ptep_clear_flush_notify() did already call it.
2351 */ 2352 */
2352 mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); 2353 mmu_notifier_invalidate_range_only_end(&range);
2353 if (old_page) { 2354 if (old_page) {
2354 /* 2355 /*
2355 * Don't let another task, with possibly unlocked vma, 2356 * Don't let another task, with possibly unlocked vma,
@@ -3830,7 +3831,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
3830 vmf.pud = pud_alloc(mm, p4d, address); 3831 vmf.pud = pud_alloc(mm, p4d, address);
3831 if (!vmf.pud) 3832 if (!vmf.pud)
3832 return VM_FAULT_OOM; 3833 return VM_FAULT_OOM;
3833 if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { 3834 if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
3834 ret = create_huge_pud(&vmf); 3835 ret = create_huge_pud(&vmf);
3835 if (!(ret & VM_FAULT_FALLBACK)) 3836 if (!(ret & VM_FAULT_FALLBACK))
3836 return ret; 3837 return ret;
@@ -3856,7 +3857,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
3856 vmf.pmd = pmd_alloc(mm, vmf.pud, address); 3857 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
3857 if (!vmf.pmd) 3858 if (!vmf.pmd)
3858 return VM_FAULT_OOM; 3859 return VM_FAULT_OOM;
3859 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { 3860 if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
3860 ret = create_huge_pmd(&vmf); 3861 ret = create_huge_pmd(&vmf);
3861 if (!(ret & VM_FAULT_FALLBACK)) 3862 if (!(ret & VM_FAULT_FALLBACK))
3862 return ret; 3863 return ret;
@@ -4030,7 +4031,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
4030#endif /* __PAGETABLE_PMD_FOLDED */ 4031#endif /* __PAGETABLE_PMD_FOLDED */
4031 4032
4032static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, 4033static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4033 unsigned long *start, unsigned long *end, 4034 struct mmu_notifier_range *range,
4034 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) 4035 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4035{ 4036{
4036 pgd_t *pgd; 4037 pgd_t *pgd;
@@ -4058,10 +4059,10 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4058 if (!pmdpp) 4059 if (!pmdpp)
4059 goto out; 4060 goto out;
4060 4061
4061 if (start && end) { 4062 if (range) {
4062 *start = address & PMD_MASK; 4063 mmu_notifier_range_init(range, mm, address & PMD_MASK,
4063 *end = *start + PMD_SIZE; 4064 (address & PMD_MASK) + PMD_SIZE);
4064 mmu_notifier_invalidate_range_start(mm, *start, *end); 4065 mmu_notifier_invalidate_range_start(range);
4065 } 4066 }
4066 *ptlp = pmd_lock(mm, pmd); 4067 *ptlp = pmd_lock(mm, pmd);
4067 if (pmd_huge(*pmd)) { 4068 if (pmd_huge(*pmd)) {
@@ -4069,17 +4070,17 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4069 return 0; 4070 return 0;
4070 } 4071 }
4071 spin_unlock(*ptlp); 4072 spin_unlock(*ptlp);
4072 if (start && end) 4073 if (range)
4073 mmu_notifier_invalidate_range_end(mm, *start, *end); 4074 mmu_notifier_invalidate_range_end(range);
4074 } 4075 }
4075 4076
4076 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 4077 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
4077 goto out; 4078 goto out;
4078 4079
4079 if (start && end) { 4080 if (range) {
4080 *start = address & PAGE_MASK; 4081 range->start = address & PAGE_MASK;
4081 *end = *start + PAGE_SIZE; 4082 range->end = range->start + PAGE_SIZE;
4082 mmu_notifier_invalidate_range_start(mm, *start, *end); 4083 mmu_notifier_invalidate_range_start(range);
4083 } 4084 }
4084 ptep = pte_offset_map_lock(mm, pmd, address, ptlp); 4085 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4085 if (!pte_present(*ptep)) 4086 if (!pte_present(*ptep))
@@ -4088,8 +4089,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4088 return 0; 4089 return 0;
4089unlock: 4090unlock:
4090 pte_unmap_unlock(ptep, *ptlp); 4091 pte_unmap_unlock(ptep, *ptlp);
4091 if (start && end) 4092 if (range)
4092 mmu_notifier_invalidate_range_end(mm, *start, *end); 4093 mmu_notifier_invalidate_range_end(range);
4093out: 4094out:
4094 return -EINVAL; 4095 return -EINVAL;
4095} 4096}
@@ -4101,20 +4102,20 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4101 4102
4102 /* (void) is needed to make gcc happy */ 4103 /* (void) is needed to make gcc happy */
4103 (void) __cond_lock(*ptlp, 4104 (void) __cond_lock(*ptlp,
4104 !(res = __follow_pte_pmd(mm, address, NULL, NULL, 4105 !(res = __follow_pte_pmd(mm, address, NULL,
4105 ptepp, NULL, ptlp))); 4106 ptepp, NULL, ptlp)));
4106 return res; 4107 return res;
4107} 4108}
4108 4109
4109int follow_pte_pmd(struct mm_struct *mm, unsigned long address, 4110int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4110 unsigned long *start, unsigned long *end, 4111 struct mmu_notifier_range *range,
4111 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) 4112 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4112{ 4113{
4113 int res; 4114 int res;
4114 4115
4115 /* (void) is needed to make gcc happy */ 4116 /* (void) is needed to make gcc happy */
4116 (void) __cond_lock(*ptlp, 4117 (void) __cond_lock(*ptlp,
4117 !(res = __follow_pte_pmd(mm, address, start, end, 4118 !(res = __follow_pte_pmd(mm, address, range,
4118 ptepp, pmdpp, ptlp))); 4119 ptepp, pmdpp, ptlp)));
4119 return res; 4120 return res;
4120} 4121}
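
All of the mm/memory.c hunks above follow the same conversion: the (mm, start, end) triple that used to be passed to every mmu notifier call is bundled once into a struct mmu_notifier_range, and the helpers take only that object. Below is a minimal userspace sketch of that calling convention; the struct and helpers are simplified stand-ins with printf bodies, not the real kernel API.

#include <stdio.h>

struct mm_struct { int dummy; };

/* Simplified stand-in for the kernel's struct mmu_notifier_range. */
struct mmu_notifier_range {
        struct mm_struct *mm;
        unsigned long start;
        unsigned long end;
};

static void mmu_notifier_range_init(struct mmu_notifier_range *range,
                                    struct mm_struct *mm,
                                    unsigned long start, unsigned long end)
{
        range->mm = mm;
        range->start = start;
        range->end = end;
}

/* The real helpers walk the registered notifiers; these just log. */
static void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r)
{
        printf("invalidate_range_start [%#lx, %#lx)\n", r->start, r->end);
}

static void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r)
{
        printf("invalidate_range_end   [%#lx, %#lx)\n", r->start, r->end);
}

int main(void)
{
        struct mm_struct mm = { 0 };
        struct mmu_notifier_range range;

        /* One init replaces the old mmun_start/mmun_end local pair. */
        mmu_notifier_range_init(&range, &mm, 0x1000, 0x2000);
        mmu_notifier_invalidate_range_start(&range);
        /* ... copy or unmap page tables for [range.start, range.end) ... */
        mmu_notifier_invalidate_range_end(&range);
        return 0;
}
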
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2b2b3ccbbfb5..b9a667d36c55 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,7 @@
34#include <linux/hugetlb.h> 34#include <linux/hugetlb.h>
35#include <linux/memblock.h> 35#include <linux/memblock.h>
36#include <linux/compaction.h> 36#include <linux/compaction.h>
37#include <linux/rmap.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
39 40
@@ -253,7 +254,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
253 if (pfn_valid(phys_start_pfn)) 254 if (pfn_valid(phys_start_pfn))
254 return -EEXIST; 255 return -EEXIST;
255 256
256 ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn, altmap); 257 ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
257 if (ret < 0) 258 if (ret < 0)
258 return ret; 259 return ret;
259 260
@@ -743,14 +744,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
743 int nid = pgdat->node_id; 744 int nid = pgdat->node_id;
744 unsigned long flags; 745 unsigned long flags;
745 746
746 if (zone_is_empty(zone))
747 init_currently_empty_zone(zone, start_pfn, nr_pages);
748
749 clear_zone_contiguous(zone); 747 clear_zone_contiguous(zone);
750 748
751 /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ 749 /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
752 pgdat_resize_lock(pgdat, &flags); 750 pgdat_resize_lock(pgdat, &flags);
753 zone_span_writelock(zone); 751 zone_span_writelock(zone);
752 if (zone_is_empty(zone))
753 init_currently_empty_zone(zone, start_pfn, nr_pages);
754 resize_zone_range(zone, start_pfn, nr_pages); 754 resize_zone_range(zone, start_pfn, nr_pages);
755 zone_span_writeunlock(zone); 755 zone_span_writeunlock(zone);
756 resize_pgdat_range(pgdat, start_pfn, nr_pages); 756 resize_pgdat_range(pgdat, start_pfn, nr_pages);
@@ -1078,7 +1078,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
1078 * 1078 *
1079 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG 1079 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
1080 */ 1080 */
1081int __ref add_memory_resource(int nid, struct resource *res, bool online) 1081int __ref add_memory_resource(int nid, struct resource *res)
1082{ 1082{
1083 u64 start, size; 1083 u64 start, size;
1084 bool new_node = false; 1084 bool new_node = false;
@@ -1133,7 +1133,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
1133 mem_hotplug_done(); 1133 mem_hotplug_done();
1134 1134
1135 /* online pages if requested */ 1135 /* online pages if requested */
1136 if (online) 1136 if (memhp_auto_online)
1137 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), 1137 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
1138 NULL, online_memory_block); 1138 NULL, online_memory_block);
1139 1139
@@ -1157,7 +1157,7 @@ int __ref __add_memory(int nid, u64 start, u64 size)
1157 if (IS_ERR(res)) 1157 if (IS_ERR(res))
1158 return PTR_ERR(res); 1158 return PTR_ERR(res);
1159 1159
1160 ret = add_memory_resource(nid, res, memhp_auto_online); 1160 ret = add_memory_resource(nid, res);
1161 if (ret < 0) 1161 if (ret < 0)
1162 release_memory_resource(res); 1162 release_memory_resource(res);
1163 return ret; 1163 return ret;
@@ -1226,7 +1226,7 @@ static bool is_pageblock_removable_nolock(struct page *page)
1226 if (!zone_spans_pfn(zone, pfn)) 1226 if (!zone_spans_pfn(zone, pfn))
1227 return false; 1227 return false;
1228 1228
1229 return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); 1229 return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON);
1230} 1230}
1231 1231
1232/* Checks if this range of memory is likely to be hot-removable. */ 1232/* Checks if this range of memory is likely to be hot-removable. */
@@ -1339,18 +1339,16 @@ static struct page *new_node_page(struct page *page, unsigned long private)
1339 return new_page_nodemask(page, nid, &nmask); 1339 return new_page_nodemask(page, nid, &nmask);
1340} 1340}
1341 1341
1342#define NR_OFFLINE_AT_ONCE_PAGES (256)
1343static int 1342static int
1344do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1343do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1345{ 1344{
1346 unsigned long pfn; 1345 unsigned long pfn;
1347 struct page *page; 1346 struct page *page;
1348 int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
1349 int not_managed = 0; 1347 int not_managed = 0;
1350 int ret = 0; 1348 int ret = 0;
1351 LIST_HEAD(source); 1349 LIST_HEAD(source);
1352 1350
1353 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { 1351 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1354 if (!pfn_valid(pfn)) 1352 if (!pfn_valid(pfn))
1355 continue; 1353 continue;
1356 page = pfn_to_page(pfn); 1354 page = pfn_to_page(pfn);
@@ -1362,13 +1360,27 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1362 ret = -EBUSY; 1360 ret = -EBUSY;
1363 break; 1361 break;
1364 } 1362 }
1365 if (isolate_huge_page(page, &source)) 1363 isolate_huge_page(page, &source);
1366 move_pages -= 1 << compound_order(head);
1367 continue; 1364 continue;
1368 } else if (PageTransHuge(page)) 1365 } else if (PageTransHuge(page))
1369 pfn = page_to_pfn(compound_head(page)) 1366 pfn = page_to_pfn(compound_head(page))
1370 + hpage_nr_pages(page) - 1; 1367 + hpage_nr_pages(page) - 1;
1371 1368
1369 /*
1370 * HWPoison pages have elevated reference counts so the migration would
1371 * fail on them. It also doesn't make any sense to migrate them in the
1372 * first place. Still try to unmap such a page in case it is still mapped
 1373 * (e.g. the current hwpoison implementation doesn't unmap KSM pages, so keep
 1374 * the unmap as a catch-all safety net).
1375 */
1376 if (PageHWPoison(page)) {
1377 if (WARN_ON(PageLRU(page)))
1378 isolate_lru_page(page);
1379 if (page_mapped(page))
1380 try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
1381 continue;
1382 }
1383
1372 if (!get_page_unless_zero(page)) 1384 if (!get_page_unless_zero(page))
1373 continue; 1385 continue;
1374 /* 1386 /*
@@ -1382,16 +1394,13 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1382 if (!ret) { /* Success */ 1394 if (!ret) { /* Success */
1383 put_page(page); 1395 put_page(page);
1384 list_add_tail(&page->lru, &source); 1396 list_add_tail(&page->lru, &source);
1385 move_pages--;
1386 if (!__PageMovable(page)) 1397 if (!__PageMovable(page))
1387 inc_node_page_state(page, NR_ISOLATED_ANON + 1398 inc_node_page_state(page, NR_ISOLATED_ANON +
1388 page_is_file_cache(page)); 1399 page_is_file_cache(page));
1389 1400
1390 } else { 1401 } else {
1391#ifdef CONFIG_DEBUG_VM 1402 pr_warn("failed to isolate pfn %lx\n", pfn);
1392 pr_alert("failed to isolate pfn %lx\n", pfn);
1393 dump_page(page, "isolation failed"); 1403 dump_page(page, "isolation failed");
1394#endif
1395 put_page(page); 1404 put_page(page);
1396 /* Because we don't have big zone->lock. we should 1405 /* Because we don't have big zone->lock. we should
1397 check this again here. */ 1406 check this again here. */
@@ -1411,8 +1420,14 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1411 /* Allocate a new page from the nearest neighbor node */ 1420 /* Allocate a new page from the nearest neighbor node */
1412 ret = migrate_pages(&source, new_node_page, NULL, 0, 1421 ret = migrate_pages(&source, new_node_page, NULL, 0,
1413 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1422 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1414 if (ret) 1423 if (ret) {
1424 list_for_each_entry(page, &source, lru) {
1425 pr_warn("migrating pfn %lx failed ret:%d ",
1426 page_to_pfn(page), ret);
1427 dump_page(page, "migration failure");
1428 }
1415 putback_movable_pages(&source); 1429 putback_movable_pages(&source);
1430 }
1416 } 1431 }
1417out: 1432out:
1418 return ret; 1433 return ret;
@@ -1553,12 +1568,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
1553 unsigned long valid_start, valid_end; 1568 unsigned long valid_start, valid_end;
1554 struct zone *zone; 1569 struct zone *zone;
1555 struct memory_notify arg; 1570 struct memory_notify arg;
1556 1571 char *reason;
1557 /* at least, alignment against pageblock is necessary */
1558 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
1559 return -EINVAL;
1560 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
1561 return -EINVAL;
1562 1572
1563 mem_hotplug_begin(); 1573 mem_hotplug_begin();
1564 1574
@@ -1567,7 +1577,9 @@ static int __ref __offline_pages(unsigned long start_pfn,
1567 if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, 1577 if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
1568 &valid_end)) { 1578 &valid_end)) {
1569 mem_hotplug_done(); 1579 mem_hotplug_done();
1570 return -EINVAL; 1580 ret = -EINVAL;
1581 reason = "multizone range";
1582 goto failed_removal;
1571 } 1583 }
1572 1584
1573 zone = page_zone(pfn_to_page(valid_start)); 1585 zone = page_zone(pfn_to_page(valid_start));
@@ -1576,10 +1588,12 @@ static int __ref __offline_pages(unsigned long start_pfn,
1576 1588
1577 /* set above range as isolated */ 1589 /* set above range as isolated */
1578 ret = start_isolate_page_range(start_pfn, end_pfn, 1590 ret = start_isolate_page_range(start_pfn, end_pfn,
1579 MIGRATE_MOVABLE, true); 1591 MIGRATE_MOVABLE,
1592 SKIP_HWPOISON | REPORT_FAILURE);
1580 if (ret) { 1593 if (ret) {
1581 mem_hotplug_done(); 1594 mem_hotplug_done();
1582 return ret; 1595 reason = "failure to isolate range";
1596 goto failed_removal;
1583 } 1597 }
1584 1598
1585 arg.start_pfn = start_pfn; 1599 arg.start_pfn = start_pfn;
@@ -1588,37 +1602,47 @@ static int __ref __offline_pages(unsigned long start_pfn,
1588 1602
1589 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1603 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
1590 ret = notifier_to_errno(ret); 1604 ret = notifier_to_errno(ret);
1591 if (ret) 1605 if (ret) {
1592 goto failed_removal; 1606 reason = "notifier failure";
1607 goto failed_removal_isolated;
1608 }
1593 1609
1594 pfn = start_pfn; 1610 do {
1595repeat: 1611 for (pfn = start_pfn; pfn;) {
1596 /* start memory hot removal */ 1612 if (signal_pending(current)) {
1597 ret = -EINTR; 1613 ret = -EINTR;
1598 if (signal_pending(current)) 1614 reason = "signal backoff";
1599 goto failed_removal; 1615 goto failed_removal_isolated;
1616 }
1600 1617
1601 cond_resched(); 1618 cond_resched();
1602 lru_add_drain_all(); 1619 lru_add_drain_all();
1603 drain_all_pages(zone); 1620 drain_all_pages(zone);
1621
1622 pfn = scan_movable_pages(pfn, end_pfn);
1623 if (pfn) {
1624 /*
1625 * TODO: fatal migration failures should bail
1626 * out
1627 */
1628 do_migrate_range(pfn, end_pfn);
1629 }
1630 }
1604 1631
1605 pfn = scan_movable_pages(start_pfn, end_pfn); 1632 /*
1606 if (pfn) { /* We have movable pages */ 1633 * Dissolve free hugepages in the memory block before doing
1607 ret = do_migrate_range(pfn, end_pfn); 1634 * offlining actually in order to make hugetlbfs's object
1608 goto repeat; 1635 * counting consistent.
1609 } 1636 */
1637 ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1638 if (ret) {
1639 reason = "failure to dissolve huge pages";
1640 goto failed_removal_isolated;
1641 }
1642 /* check again */
1643 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1644 } while (offlined_pages < 0);
1610 1645
1611 /*
1612 * dissolve free hugepages in the memory block before doing offlining
1613 * actually in order to make hugetlbfs's object counting consistent.
1614 */
1615 ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1616 if (ret)
1617 goto failed_removal;
1618 /* check again */
1619 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1620 if (offlined_pages < 0)
1621 goto repeat;
1622 pr_info("Offlined Pages %ld\n", offlined_pages); 1646 pr_info("Offlined Pages %ld\n", offlined_pages);
1623 /* Ok, all of our target is isolated. 1647 /* Ok, all of our target is isolated.
1624 We cannot do rollback at this point. */ 1648 We cannot do rollback at this point. */
@@ -1654,13 +1678,15 @@ repeat:
1654 mem_hotplug_done(); 1678 mem_hotplug_done();
1655 return 0; 1679 return 0;
1656 1680
1681failed_removal_isolated:
1682 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1657failed_removal: 1683failed_removal:
1658 pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", 1684 pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
1659 (unsigned long long) start_pfn << PAGE_SHIFT, 1685 (unsigned long long) start_pfn << PAGE_SHIFT,
1660 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); 1686 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
1687 reason);
1661 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1688 memory_notify(MEM_CANCEL_OFFLINE, &arg);
1662 /* pushback to free area */ 1689 /* pushback to free area */
1663 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1664 mem_hotplug_done(); 1690 mem_hotplug_done();
1665 return ret; 1691 return ret;
1666} 1692}
@@ -1753,34 +1779,6 @@ static int check_cpu_on_node(pg_data_t *pgdat)
1753 return 0; 1779 return 0;
1754} 1780}
1755 1781
1756static void unmap_cpu_on_node(pg_data_t *pgdat)
1757{
1758#ifdef CONFIG_ACPI_NUMA
1759 int cpu;
1760
1761 for_each_possible_cpu(cpu)
1762 if (cpu_to_node(cpu) == pgdat->node_id)
1763 numa_clear_node(cpu);
1764#endif
1765}
1766
1767static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1768{
1769 int ret;
1770
1771 ret = check_cpu_on_node(pgdat);
1772 if (ret)
1773 return ret;
1774
1775 /*
1776 * the node will be offlined when we come here, so we can clear
1777 * the cpu_to_node() now.
1778 */
1779
1780 unmap_cpu_on_node(pgdat);
1781 return 0;
1782}
1783
1784/** 1782/**
1785 * try_offline_node 1783 * try_offline_node
1786 * @nid: the node ID 1784 * @nid: the node ID
@@ -1813,7 +1811,7 @@ void try_offline_node(int nid)
1813 return; 1811 return;
1814 } 1812 }
1815 1813
1816 if (check_and_unmap_cpu_on_node(pgdat)) 1814 if (check_cpu_on_node(pgdat))
1817 return; 1815 return;
1818 1816
1819 /* 1817 /*
@@ -1858,7 +1856,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
1858 memblock_free(start, size); 1856 memblock_free(start, size);
1859 memblock_remove(start, size); 1857 memblock_remove(start, size);
1860 1858
1861 arch_remove_memory(start, size, NULL); 1859 arch_remove_memory(nid, start, size, NULL);
1862 1860
1863 try_offline_node(nid); 1861 try_offline_node(nid);
1864 1862
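
The __offline_pages() rework above replaces the old repeat: label with an inner scan/migrate loop nested in a do/while that re-checks isolation. Below is a small userspace model of just that control flow, with the mm helpers mocked so the retry behaviour is visible; the pfn range, pass counts, and return values are made up.

#include <stdio.h>

/* Pretend movable pages are found on the first two scans only. */
static unsigned long scan_movable_pages_mock(unsigned long pfn,
                                             unsigned long end_pfn)
{
        static int passes;

        return (pfn < end_pfn && passes++ < 2) ? pfn : 0;
}

static void do_migrate_range_mock(unsigned long pfn, unsigned long end_pfn)
{
        printf("migrating [%#lx, %#lx)\n", pfn, end_pfn);
}

/* First check reports a failure (<0), the second one succeeds. */
static long check_pages_isolated_mock(unsigned long start, unsigned long end)
{
        static int checks;

        return checks++ ? (long)(end - start) : -1;
}

int main(void)
{
        unsigned long start_pfn = 0x1000, end_pfn = 0x1200, pfn;
        long offlined_pages;

        do {
                for (pfn = start_pfn; pfn;) {
                        /* the kernel also checks signal_pending() and
                         * drains the LRU/pcp lists on every pass */
                        pfn = scan_movable_pages_mock(pfn, end_pfn);
                        if (pfn)
                                do_migrate_range_mock(pfn, end_pfn);
                }
                /* dissolve_free_huge_pages() and its error path omitted */
                offlined_pages = check_pages_isolated_mock(start_pfn, end_pfn);
        } while (offlined_pages < 0);

        printf("Offlined Pages %ld\n", offlined_pages);
        return 0;
}
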
diff --git a/mm/migrate.c b/mm/migrate.c
index f7e4bfdc13b7..5d1839a9148d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -327,16 +327,13 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
327 327
328 /* 328 /*
329 * Once page cache replacement of page migration started, page_count 329 * Once page cache replacement of page migration started, page_count
330 * *must* be zero. And, we don't want to call wait_on_page_locked() 330 * is zero; but we must not call put_and_wait_on_page_locked() without
331 * against a page without get_page(). 331 * a ref. Use get_page_unless_zero(), and just fault again if it fails.
332 * So, we use get_page_unless_zero(), here. Even failed, page fault
333 * will occur again.
334 */ 332 */
335 if (!get_page_unless_zero(page)) 333 if (!get_page_unless_zero(page))
336 goto out; 334 goto out;
337 pte_unmap_unlock(ptep, ptl); 335 pte_unmap_unlock(ptep, ptl);
338 wait_on_page_locked(page); 336 put_and_wait_on_page_locked(page);
339 put_page(page);
340 return; 337 return;
341out: 338out:
342 pte_unmap_unlock(ptep, ptl); 339 pte_unmap_unlock(ptep, ptl);
@@ -370,63 +367,28 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
370 if (!get_page_unless_zero(page)) 367 if (!get_page_unless_zero(page))
371 goto unlock; 368 goto unlock;
372 spin_unlock(ptl); 369 spin_unlock(ptl);
373 wait_on_page_locked(page); 370 put_and_wait_on_page_locked(page);
374 put_page(page);
375 return; 371 return;
376unlock: 372unlock:
377 spin_unlock(ptl); 373 spin_unlock(ptl);
378} 374}
379#endif 375#endif
380 376
381#ifdef CONFIG_BLOCK 377static int expected_page_refs(struct page *page)
382/* Returns true if all buffers are successfully locked */
383static bool buffer_migrate_lock_buffers(struct buffer_head *head,
384 enum migrate_mode mode)
385{ 378{
386 struct buffer_head *bh = head; 379 int expected_count = 1;
387
388 /* Simple case, sync compaction */
389 if (mode != MIGRATE_ASYNC) {
390 do {
391 get_bh(bh);
392 lock_buffer(bh);
393 bh = bh->b_this_page;
394
395 } while (bh != head);
396 380
397 return true; 381 /*
398 } 382 * Device public or private pages have an extra refcount as they are
399 383 * ZONE_DEVICE pages.
400 /* async case, we cannot block on lock_buffer so use trylock_buffer */ 384 */
401 do { 385 expected_count += is_device_private_page(page);
402 get_bh(bh); 386 expected_count += is_device_public_page(page);
403 if (!trylock_buffer(bh)) { 387 if (page_mapping(page))
404 /* 388 expected_count += hpage_nr_pages(page) + page_has_private(page);
405 * We failed to lock the buffer and cannot stall in
406 * async migration. Release the taken locks
407 */
408 struct buffer_head *failed_bh = bh;
409 put_bh(failed_bh);
410 bh = head;
411 while (bh != failed_bh) {
412 unlock_buffer(bh);
413 put_bh(bh);
414 bh = bh->b_this_page;
415 }
416 return false;
417 }
418 389
419 bh = bh->b_this_page; 390 return expected_count;
420 } while (bh != head);
421 return true;
422}
423#else
424static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
425 enum migrate_mode mode)
426{
427 return true;
428} 391}
429#endif /* CONFIG_BLOCK */
430 392
431/* 393/*
432 * Replace the page in the mapping. 394 * Replace the page in the mapping.
@@ -437,21 +399,13 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
437 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 399 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
438 */ 400 */
439int migrate_page_move_mapping(struct address_space *mapping, 401int migrate_page_move_mapping(struct address_space *mapping,
440 struct page *newpage, struct page *page, 402 struct page *newpage, struct page *page, enum migrate_mode mode,
441 struct buffer_head *head, enum migrate_mode mode,
442 int extra_count) 403 int extra_count)
443{ 404{
444 XA_STATE(xas, &mapping->i_pages, page_index(page)); 405 XA_STATE(xas, &mapping->i_pages, page_index(page));
445 struct zone *oldzone, *newzone; 406 struct zone *oldzone, *newzone;
446 int dirty; 407 int dirty;
447 int expected_count = 1 + extra_count; 408 int expected_count = expected_page_refs(page) + extra_count;
448
449 /*
450 * Device public or private pages have an extra refcount as they are
451 * ZONE_DEVICE pages.
452 */
453 expected_count += is_device_private_page(page);
454 expected_count += is_device_public_page(page);
455 409
456 if (!mapping) { 410 if (!mapping) {
457 /* Anonymous page without mapping */ 411 /* Anonymous page without mapping */
@@ -471,8 +425,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
471 newzone = page_zone(newpage); 425 newzone = page_zone(newpage);
472 426
473 xas_lock_irq(&xas); 427 xas_lock_irq(&xas);
474
475 expected_count += hpage_nr_pages(page) + page_has_private(page);
476 if (page_count(page) != expected_count || xas_load(&xas) != page) { 428 if (page_count(page) != expected_count || xas_load(&xas) != page) {
477 xas_unlock_irq(&xas); 429 xas_unlock_irq(&xas);
478 return -EAGAIN; 430 return -EAGAIN;
@@ -484,20 +436,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
484 } 436 }
485 437
486 /* 438 /*
487 * In the async migration case of moving a page with buffers, lock the
488 * buffers using trylock before the mapping is moved. If the mapping
489 * was moved, we later failed to lock the buffers and could not move
490 * the mapping back due to an elevated page count, we would have to
491 * block waiting on other references to be dropped.
492 */
493 if (mode == MIGRATE_ASYNC && head &&
494 !buffer_migrate_lock_buffers(head, mode)) {
495 page_ref_unfreeze(page, expected_count);
496 xas_unlock_irq(&xas);
497 return -EAGAIN;
498 }
499
500 /*
501 * Now we know that no one else is looking at the page: 439 * Now we know that no one else is looking at the page:
502 * no turning back from here. 440 * no turning back from here.
503 */ 441 */
@@ -748,7 +686,7 @@ int migrate_page(struct address_space *mapping,
748 686
749 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 687 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
750 688
751 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); 689 rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
752 690
753 if (rc != MIGRATEPAGE_SUCCESS) 691 if (rc != MIGRATEPAGE_SUCCESS)
754 return rc; 692 return rc;
@@ -762,34 +700,98 @@ int migrate_page(struct address_space *mapping,
762EXPORT_SYMBOL(migrate_page); 700EXPORT_SYMBOL(migrate_page);
763 701
764#ifdef CONFIG_BLOCK 702#ifdef CONFIG_BLOCK
765/* 703/* Returns true if all buffers are successfully locked */
766 * Migration function for pages with buffers. This function can only be used 704static bool buffer_migrate_lock_buffers(struct buffer_head *head,
767 * if the underlying filesystem guarantees that no other references to "page" 705 enum migrate_mode mode)
768 * exist. 706{
769 */ 707 struct buffer_head *bh = head;
770int buffer_migrate_page(struct address_space *mapping, 708
771 struct page *newpage, struct page *page, enum migrate_mode mode) 709 /* Simple case, sync compaction */
710 if (mode != MIGRATE_ASYNC) {
711 do {
712 get_bh(bh);
713 lock_buffer(bh);
714 bh = bh->b_this_page;
715
716 } while (bh != head);
717
718 return true;
719 }
720
721 /* async case, we cannot block on lock_buffer so use trylock_buffer */
722 do {
723 get_bh(bh);
724 if (!trylock_buffer(bh)) {
725 /*
726 * We failed to lock the buffer and cannot stall in
727 * async migration. Release the taken locks
728 */
729 struct buffer_head *failed_bh = bh;
730 put_bh(failed_bh);
731 bh = head;
732 while (bh != failed_bh) {
733 unlock_buffer(bh);
734 put_bh(bh);
735 bh = bh->b_this_page;
736 }
737 return false;
738 }
739
740 bh = bh->b_this_page;
741 } while (bh != head);
742 return true;
743}
744
745static int __buffer_migrate_page(struct address_space *mapping,
746 struct page *newpage, struct page *page, enum migrate_mode mode,
747 bool check_refs)
772{ 748{
773 struct buffer_head *bh, *head; 749 struct buffer_head *bh, *head;
774 int rc; 750 int rc;
751 int expected_count;
775 752
776 if (!page_has_buffers(page)) 753 if (!page_has_buffers(page))
777 return migrate_page(mapping, newpage, page, mode); 754 return migrate_page(mapping, newpage, page, mode);
778 755
756 /* Check whether page does not have extra refs before we do more work */
757 expected_count = expected_page_refs(page);
758 if (page_count(page) != expected_count)
759 return -EAGAIN;
760
779 head = page_buffers(page); 761 head = page_buffers(page);
762 if (!buffer_migrate_lock_buffers(head, mode))
763 return -EAGAIN;
780 764
781 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); 765 if (check_refs) {
766 bool busy;
767 bool invalidated = false;
782 768
783 if (rc != MIGRATEPAGE_SUCCESS) 769recheck_buffers:
784 return rc; 770 busy = false;
771 spin_lock(&mapping->private_lock);
772 bh = head;
773 do {
774 if (atomic_read(&bh->b_count)) {
775 busy = true;
776 break;
777 }
778 bh = bh->b_this_page;
779 } while (bh != head);
780 spin_unlock(&mapping->private_lock);
781 if (busy) {
782 if (invalidated) {
783 rc = -EAGAIN;
784 goto unlock_buffers;
785 }
786 invalidate_bh_lrus();
787 invalidated = true;
788 goto recheck_buffers;
789 }
790 }
785 791
786 /* 792 rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
787 * In the async case, migrate_page_move_mapping locked the buffers 793 if (rc != MIGRATEPAGE_SUCCESS)
788 * with an IRQ-safe spinlock held. In the sync case, the buffers 794 goto unlock_buffers;
789 * need to be locked now
790 */
791 if (mode != MIGRATE_ASYNC)
792 BUG_ON(!buffer_migrate_lock_buffers(head, mode));
793 795
794 ClearPagePrivate(page); 796 ClearPagePrivate(page);
795 set_page_private(newpage, page_private(page)); 797 set_page_private(newpage, page_private(page));
@@ -811,6 +813,8 @@ int buffer_migrate_page(struct address_space *mapping,
811 else 813 else
812 migrate_page_states(newpage, page); 814 migrate_page_states(newpage, page);
813 815
816 rc = MIGRATEPAGE_SUCCESS;
817unlock_buffers:
814 bh = head; 818 bh = head;
815 do { 819 do {
816 unlock_buffer(bh); 820 unlock_buffer(bh);
@@ -819,9 +823,32 @@ int buffer_migrate_page(struct address_space *mapping,
819 823
820 } while (bh != head); 824 } while (bh != head);
821 825
822 return MIGRATEPAGE_SUCCESS; 826 return rc;
827}
828
829/*
830 * Migration function for pages with buffers. This function can only be used
831 * if the underlying filesystem guarantees that no other references to "page"
832 * exist. For example attached buffer heads are accessed only under page lock.
833 */
834int buffer_migrate_page(struct address_space *mapping,
835 struct page *newpage, struct page *page, enum migrate_mode mode)
836{
837 return __buffer_migrate_page(mapping, newpage, page, mode, false);
823} 838}
824EXPORT_SYMBOL(buffer_migrate_page); 839EXPORT_SYMBOL(buffer_migrate_page);
840
841/*
842 * Same as above except that this variant is more careful and checks that there
843 * are also no buffer head references. This function is the right one for
844 * mappings where buffer heads are directly looked up and referenced (such as
845 * block device mappings).
846 */
847int buffer_migrate_page_norefs(struct address_space *mapping,
848 struct page *newpage, struct page *page, enum migrate_mode mode)
849{
850 return __buffer_migrate_page(mapping, newpage, page, mode, true);
851}
825#endif 852#endif
826 853
827/* 854/*
@@ -1297,8 +1324,19 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1297 goto put_anon; 1324 goto put_anon;
1298 1325
1299 if (page_mapped(hpage)) { 1326 if (page_mapped(hpage)) {
1327 struct address_space *mapping = page_mapping(hpage);
1328
1329 /*
1330 * try_to_unmap could potentially call huge_pmd_unshare.
1331 * Because of this, take semaphore in write mode here and
1332 * set TTU_RMAP_LOCKED to let lower levels know we have
1333 * taken the lock.
1334 */
1335 i_mmap_lock_write(mapping);
1300 try_to_unmap(hpage, 1336 try_to_unmap(hpage,
1301 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 1337 TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
1338 TTU_RMAP_LOCKED);
1339 i_mmap_unlock_write(mapping);
1302 page_was_mapped = 1; 1340 page_was_mapped = 1;
1303 } 1341 }
1304 1342
@@ -2303,6 +2341,7 @@ next:
2303 */ 2341 */
2304static void migrate_vma_collect(struct migrate_vma *migrate) 2342static void migrate_vma_collect(struct migrate_vma *migrate)
2305{ 2343{
2344 struct mmu_notifier_range range;
2306 struct mm_walk mm_walk; 2345 struct mm_walk mm_walk;
2307 2346
2308 mm_walk.pmd_entry = migrate_vma_collect_pmd; 2347 mm_walk.pmd_entry = migrate_vma_collect_pmd;
@@ -2314,13 +2353,11 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
2314 mm_walk.mm = migrate->vma->vm_mm; 2353 mm_walk.mm = migrate->vma->vm_mm;
2315 mm_walk.private = migrate; 2354 mm_walk.private = migrate;
2316 2355
2317 mmu_notifier_invalidate_range_start(mm_walk.mm, 2356 mmu_notifier_range_init(&range, mm_walk.mm, migrate->start,
2318 migrate->start, 2357 migrate->end);
2319 migrate->end); 2358 mmu_notifier_invalidate_range_start(&range);
2320 walk_page_range(migrate->start, migrate->end, &mm_walk); 2359 walk_page_range(migrate->start, migrate->end, &mm_walk);
2321 mmu_notifier_invalidate_range_end(mm_walk.mm, 2360 mmu_notifier_invalidate_range_end(&range);
2322 migrate->start,
2323 migrate->end);
2324 2361
2325 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); 2362 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
2326} 2363}
@@ -2701,9 +2738,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
2701{ 2738{
2702 const unsigned long npages = migrate->npages; 2739 const unsigned long npages = migrate->npages;
2703 const unsigned long start = migrate->start; 2740 const unsigned long start = migrate->start;
2704 struct vm_area_struct *vma = migrate->vma; 2741 struct mmu_notifier_range range;
2705 struct mm_struct *mm = vma->vm_mm; 2742 unsigned long addr, i;
2706 unsigned long addr, i, mmu_start;
2707 bool notified = false; 2743 bool notified = false;
2708 2744
2709 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { 2745 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
@@ -2722,11 +2758,12 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
2722 continue; 2758 continue;
2723 } 2759 }
2724 if (!notified) { 2760 if (!notified) {
2725 mmu_start = addr;
2726 notified = true; 2761 notified = true;
2727 mmu_notifier_invalidate_range_start(mm, 2762
2728 mmu_start, 2763 mmu_notifier_range_init(&range,
2729 migrate->end); 2764 migrate->vma->vm_mm,
2765 addr, migrate->end);
2766 mmu_notifier_invalidate_range_start(&range);
2730 } 2767 }
2731 migrate_vma_insert_page(migrate, addr, newpage, 2768 migrate_vma_insert_page(migrate, addr, newpage,
2732 &migrate->src[i], 2769 &migrate->src[i],
@@ -2767,8 +2804,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
2767 * did already call it. 2804 * did already call it.
2768 */ 2805 */
2769 if (notified) 2806 if (notified)
2770 mmu_notifier_invalidate_range_only_end(mm, mmu_start, 2807 mmu_notifier_invalidate_range_only_end(&range);
2771 migrate->end);
2772} 2808}
2773 2809
2774/* 2810/*
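
The refactoring above centralizes the reference-count expectation in expected_page_refs() and lets __buffer_migrate_page() bail out with -EAGAIN before locking anything. Below is a userspace model of that early check, with struct page replaced by a mock carrying only the fields the calculation reads; the field names and values are illustrative.

#include <stdbool.h>
#include <stdio.h>

/* Mock page carrying only the fields the calculation reads. */
struct page_model {
        int refcount;           /* page_count(page)           */
        bool has_mapping;       /* page_mapping(page) != NULL */
        bool has_private;       /* page_has_private(page)     */
        int nr_subpages;        /* hpage_nr_pages(page)       */
        bool is_zone_device;    /* device private/public page */
};

static int expected_page_refs(const struct page_model *p)
{
        int expected = 1;               /* the migration caller's reference */

        expected += p->is_zone_device;  /* ZONE_DEVICE pages hold one more  */
        if (p->has_mapping)
                expected += p->nr_subpages + p->has_private;
        return expected;
}

int main(void)
{
        struct page_model page = {
                .refcount = 3, .has_mapping = true,
                .has_private = true, .nr_subpages = 1,
        };

        /* Mirrors the early bail-out in __buffer_migrate_page(). */
        if (page.refcount != expected_page_refs(&page))
                puts("extra references: would return -EAGAIN");
        else
                puts("refcount matches: safe to keep going");
        return 0;
}
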
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 6838a530789b..33917105a3a2 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -146,7 +146,7 @@ static void __meminit mm_compute_batch(void)
146 s32 batch = max_t(s32, nr*2, 32); 146 s32 batch = max_t(s32, nr*2, 32);
147 147
148 /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ 148 /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
149 memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); 149 memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff);
150 150
151 vm_committed_as_batch = max_t(s32, memsized_batch, batch); 151 vm_committed_as_batch = max_t(s32, memsized_batch, batch);
152} 152}
diff --git a/mm/mmap.c b/mm/mmap.c
index 7bb64381e77c..f901065c4c64 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2973,16 +2973,6 @@ out:
2973 return ret; 2973 return ret;
2974} 2974}
2975 2975
2976static inline void verify_mm_writelocked(struct mm_struct *mm)
2977{
2978#ifdef CONFIG_DEBUG_VM
2979 if (unlikely(down_read_trylock(&mm->mmap_sem))) {
2980 WARN_ON(1);
2981 up_read(&mm->mmap_sem);
2982 }
2983#endif
2984}
2985
2986/* 2976/*
2987 * this is really a simplified "do_mmap". it only handles 2977 * this is really a simplified "do_mmap". it only handles
2988 * anonymous maps. eventually we may be able to do some 2978 * anonymous maps. eventually we may be able to do some
@@ -3010,12 +3000,6 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
3010 return error; 3000 return error;
3011 3001
3012 /* 3002 /*
3013 * mm->mmap_sem is required to protect against another thread
3014 * changing the mappings in case we sleep.
3015 */
3016 verify_mm_writelocked(mm);
3017
3018 /*
3019 * Clear old maps. this also does some error checking for us 3003 * Clear old maps. this also does some error checking for us
3020 */ 3004 */
3021 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, 3005 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5119ff846769..9c884abc7850 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -35,13 +35,6 @@ void mmu_notifier_call_srcu(struct rcu_head *rcu,
35} 35}
36EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); 36EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu);
37 37
38void mmu_notifier_synchronize(void)
39{
40 /* Wait for any running method to finish. */
41 srcu_barrier(&srcu);
42}
43EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
44
45/* 38/*
46 * This function can't run concurrently against mmu_notifier_register 39 * This function can't run concurrently against mmu_notifier_register
47 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap 40 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -174,22 +167,20 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
174 srcu_read_unlock(&srcu, id); 167 srcu_read_unlock(&srcu, id);
175} 168}
176 169
177int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, 170int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
178 unsigned long start, unsigned long end,
179 bool blockable)
180{ 171{
181 struct mmu_notifier *mn; 172 struct mmu_notifier *mn;
182 int ret = 0; 173 int ret = 0;
183 int id; 174 int id;
184 175
185 id = srcu_read_lock(&srcu); 176 id = srcu_read_lock(&srcu);
186 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { 177 hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
187 if (mn->ops->invalidate_range_start) { 178 if (mn->ops->invalidate_range_start) {
188 int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable); 179 int _ret = mn->ops->invalidate_range_start(mn, range);
189 if (_ret) { 180 if (_ret) {
190 pr_info("%pS callback failed with %d in %sblockable context.\n", 181 pr_info("%pS callback failed with %d in %sblockable context.\n",
191 mn->ops->invalidate_range_start, _ret, 182 mn->ops->invalidate_range_start, _ret,
192 !blockable ? "non-" : ""); 183 !range->blockable ? "non-" : "");
193 ret = _ret; 184 ret = _ret;
194 } 185 }
195 } 186 }
@@ -200,16 +191,14 @@ int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
200} 191}
201EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); 192EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
202 193
203void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 194void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
204 unsigned long start,
205 unsigned long end,
206 bool only_end) 195 bool only_end)
207{ 196{
208 struct mmu_notifier *mn; 197 struct mmu_notifier *mn;
209 int id; 198 int id;
210 199
211 id = srcu_read_lock(&srcu); 200 id = srcu_read_lock(&srcu);
212 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { 201 hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
213 /* 202 /*
214 * Call invalidate_range here too to avoid the need for the 203 * Call invalidate_range here too to avoid the need for the
215 * subsystem of having to register an invalidate_range_end 204 * subsystem of having to register an invalidate_range_end
@@ -224,9 +213,11 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
224 * already happen under page table lock. 213 * already happen under page table lock.
225 */ 214 */
226 if (!only_end && mn->ops->invalidate_range) 215 if (!only_end && mn->ops->invalidate_range)
227 mn->ops->invalidate_range(mn, mm, start, end); 216 mn->ops->invalidate_range(mn, range->mm,
217 range->start,
218 range->end);
228 if (mn->ops->invalidate_range_end) 219 if (mn->ops->invalidate_range_end)
229 mn->ops->invalidate_range_end(mn, mm, start, end); 220 mn->ops->invalidate_range_end(mn, range);
230 } 221 }
231 srcu_read_unlock(&srcu, id); 222 srcu_read_unlock(&srcu, id);
232} 223}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6d331620b9e5..36cb358db170 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -167,11 +167,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
167 pgprot_t newprot, int dirty_accountable, int prot_numa) 167 pgprot_t newprot, int dirty_accountable, int prot_numa)
168{ 168{
169 pmd_t *pmd; 169 pmd_t *pmd;
170 struct mm_struct *mm = vma->vm_mm;
171 unsigned long next; 170 unsigned long next;
172 unsigned long pages = 0; 171 unsigned long pages = 0;
173 unsigned long nr_huge_updates = 0; 172 unsigned long nr_huge_updates = 0;
174 unsigned long mni_start = 0; 173 struct mmu_notifier_range range;
174
175 range.start = 0;
175 176
176 pmd = pmd_offset(pud, addr); 177 pmd = pmd_offset(pud, addr);
177 do { 178 do {
@@ -183,9 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
183 goto next; 184 goto next;
184 185
185 /* invoke the mmu notifier if the pmd is populated */ 186 /* invoke the mmu notifier if the pmd is populated */
186 if (!mni_start) { 187 if (!range.start) {
187 mni_start = addr; 188 mmu_notifier_range_init(&range, vma->vm_mm, addr, end);
188 mmu_notifier_invalidate_range_start(mm, mni_start, end); 189 mmu_notifier_invalidate_range_start(&range);
189 } 190 }
190 191
191 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { 192 if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
@@ -214,8 +215,8 @@ next:
214 cond_resched(); 215 cond_resched();
215 } while (pmd++, addr = next, addr != end); 216 } while (pmd++, addr = next, addr != end);
216 217
217 if (mni_start) 218 if (range.start)
218 mmu_notifier_invalidate_range_end(mm, mni_start, end); 219 mmu_notifier_invalidate_range_end(&range);
219 220
220 if (nr_huge_updates) 221 if (nr_huge_updates)
221 count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); 222 count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
diff --git a/mm/mremap.c b/mm/mremap.c
index 7f9f9180e401..def01d86e36f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -197,16 +197,14 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
197 bool need_rmap_locks) 197 bool need_rmap_locks)
198{ 198{
199 unsigned long extent, next, old_end; 199 unsigned long extent, next, old_end;
200 struct mmu_notifier_range range;
200 pmd_t *old_pmd, *new_pmd; 201 pmd_t *old_pmd, *new_pmd;
201 unsigned long mmun_start; /* For mmu_notifiers */
202 unsigned long mmun_end; /* For mmu_notifiers */
203 202
204 old_end = old_addr + len; 203 old_end = old_addr + len;
205 flush_cache_range(vma, old_addr, old_end); 204 flush_cache_range(vma, old_addr, old_end);
206 205
207 mmun_start = old_addr; 206 mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end);
208 mmun_end = old_end; 207 mmu_notifier_invalidate_range_start(&range);
209 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
210 208
211 for (; old_addr < old_end; old_addr += extent, new_addr += extent) { 209 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
212 cond_resched(); 210 cond_resched();
@@ -247,7 +245,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
247 new_pmd, new_addr, need_rmap_locks); 245 new_pmd, new_addr, need_rmap_locks);
248 } 246 }
249 247
250 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 248 mmu_notifier_invalidate_range_end(&range);
251 249
252 return len + old_addr - old_end; /* how much done */ 250 return len + old_addr - old_end; /* how much done */
253} 251}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6589f60d5018..f0e8cd9edb1a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -245,11 +245,11 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
245 return points > 0 ? points : 1; 245 return points > 0 ? points : 1;
246} 246}
247 247
248enum oom_constraint { 248static const char * const oom_constraint_text[] = {
249 CONSTRAINT_NONE, 249 [CONSTRAINT_NONE] = "CONSTRAINT_NONE",
250 CONSTRAINT_CPUSET, 250 [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
251 CONSTRAINT_MEMORY_POLICY, 251 [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
252 CONSTRAINT_MEMCG, 252 [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
253}; 253};
254 254
255/* 255/*
@@ -269,7 +269,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
269 } 269 }
270 270
271 /* Default to all available memory */ 271 /* Default to all available memory */
272 oc->totalpages = totalram_pages + total_swap_pages; 272 oc->totalpages = totalram_pages() + total_swap_pages;
273 273
274 if (!IS_ENABLED(CONFIG_NUMA)) 274 if (!IS_ENABLED(CONFIG_NUMA))
275 return CONSTRAINT_NONE; 275 return CONSTRAINT_NONE;
@@ -428,19 +428,29 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
428 rcu_read_unlock(); 428 rcu_read_unlock();
429} 429}
430 430
431static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
432{
433 /* one line summary of the oom killer context. */
434 pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
435 oom_constraint_text[oc->constraint],
436 nodemask_pr_args(oc->nodemask));
437 cpuset_print_current_mems_allowed();
438 mem_cgroup_print_oom_context(oc->memcg, victim);
439 pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
440 from_kuid(&init_user_ns, task_uid(victim)));
441}
442
431static void dump_header(struct oom_control *oc, struct task_struct *p) 443static void dump_header(struct oom_control *oc, struct task_struct *p)
432{ 444{
433 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", 445 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
434 current->comm, oc->gfp_mask, &oc->gfp_mask, 446 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
435 nodemask_pr_args(oc->nodemask), oc->order,
436 current->signal->oom_score_adj); 447 current->signal->oom_score_adj);
437 if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) 448 if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
438 pr_warn("COMPACTION is disabled!!!\n"); 449 pr_warn("COMPACTION is disabled!!!\n");
439 450
440 cpuset_print_current_mems_allowed();
441 dump_stack(); 451 dump_stack();
442 if (is_memcg_oom(oc)) 452 if (is_memcg_oom(oc))
443 mem_cgroup_print_oom_info(oc->memcg, p); 453 mem_cgroup_print_oom_meminfo(oc->memcg);
444 else { 454 else {
445 show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); 455 show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
446 if (is_dump_unreclaim_slabs()) 456 if (is_dump_unreclaim_slabs())
@@ -448,6 +458,8 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
448 } 458 }
449 if (sysctl_oom_dump_tasks) 459 if (sysctl_oom_dump_tasks)
450 dump_tasks(oc->memcg, oc->nodemask); 460 dump_tasks(oc->memcg, oc->nodemask);
461 if (p)
462 dump_oom_summary(oc, p);
451} 463}
452 464
453/* 465/*
@@ -516,19 +528,20 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
516 * count elevated without a good reason. 528 * count elevated without a good reason.
517 */ 529 */
518 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { 530 if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
519 const unsigned long start = vma->vm_start; 531 struct mmu_notifier_range range;
520 const unsigned long end = vma->vm_end;
521 struct mmu_gather tlb; 532 struct mmu_gather tlb;
522 533
523 tlb_gather_mmu(&tlb, mm, start, end); 534 mmu_notifier_range_init(&range, mm, vma->vm_start,
524 if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) { 535 vma->vm_end);
525 tlb_finish_mmu(&tlb, start, end); 536 tlb_gather_mmu(&tlb, mm, range.start, range.end);
537 if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
538 tlb_finish_mmu(&tlb, range.start, range.end);
526 ret = false; 539 ret = false;
527 continue; 540 continue;
528 } 541 }
529 unmap_page_range(&tlb, vma, start, end, NULL); 542 unmap_page_range(&tlb, vma, range.start, range.end, NULL);
530 mmu_notifier_invalidate_range_end(mm, start, end); 543 mmu_notifier_invalidate_range_end(&range);
531 tlb_finish_mmu(&tlb, start, end); 544 tlb_finish_mmu(&tlb, range.start, range.end);
532 } 545 }
533 } 546 }
534 547
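
oom_kill.c now keeps only a constraint-name lookup table (the enum itself is presumably moved to a shared header, which is outside this hunk) and prints a one-line summary per kill. Below is a small standalone example of the designated-initializer table and the rough shape of the summary line, with a local copy of the enum and made-up task values.

#include <stdio.h>

/* Local copy of the enum, for the sketch only. */
enum oom_constraint {
        CONSTRAINT_NONE,
        CONSTRAINT_CPUSET,
        CONSTRAINT_MEMORY_POLICY,
        CONSTRAINT_MEMCG,
};

/* String table indexed by the enum via designated initializers. */
static const char * const oom_constraint_text[] = {
        [CONSTRAINT_NONE]          = "CONSTRAINT_NONE",
        [CONSTRAINT_CPUSET]        = "CONSTRAINT_CPUSET",
        [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
        [CONSTRAINT_MEMCG]         = "CONSTRAINT_MEMCG",
};

int main(void)
{
        enum oom_constraint c = CONSTRAINT_MEMCG;

        /* Roughly the shape of the one-line summary; values are made up. */
        printf("oom-kill:constraint=%s,task=%s,pid=%d,uid=%d\n",
               oom_constraint_text[c], "example", 1234, 1000);
        return 0;
}
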
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3f690bae6b78..7d1010453fb9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2154,6 +2154,7 @@ int write_cache_pages(struct address_space *mapping,
2154{ 2154{
2155 int ret = 0; 2155 int ret = 0;
2156 int done = 0; 2156 int done = 0;
2157 int error;
2157 struct pagevec pvec; 2158 struct pagevec pvec;
2158 int nr_pages; 2159 int nr_pages;
2159 pgoff_t uninitialized_var(writeback_index); 2160 pgoff_t uninitialized_var(writeback_index);
@@ -2227,25 +2228,31 @@ continue_unlock:
2227 goto continue_unlock; 2228 goto continue_unlock;
2228 2229
2229 trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); 2230 trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
2230 ret = (*writepage)(page, wbc, data); 2231 error = (*writepage)(page, wbc, data);
2231 if (unlikely(ret)) { 2232 if (unlikely(error)) {
2232 if (ret == AOP_WRITEPAGE_ACTIVATE) { 2233 /*
2234 * Handle errors according to the type of
2235 * writeback. There's no need to continue for
2236 * background writeback. Just push done_index
2237 * past this page so media errors won't choke
2238 * writeout for the entire file. For integrity
2239 * writeback, we must process the entire dirty
2240 * set regardless of errors because the fs may
2241 * still have state to clear for each page. In
2242 * that case we continue processing and return
2243 * the first error.
2244 */
2245 if (error == AOP_WRITEPAGE_ACTIVATE) {
2233 unlock_page(page); 2246 unlock_page(page);
2234 ret = 0; 2247 error = 0;
2235 } else { 2248 } else if (wbc->sync_mode != WB_SYNC_ALL) {
2236 /* 2249 ret = error;
2237 * done_index is set past this page,
2238 * so media errors will not choke
2239 * background writeout for the entire
2240 * file. This has consequences for
2241 * range_cyclic semantics (ie. it may
2242 * not be suitable for data integrity
2243 * writeout).
2244 */
2245 done_index = page->index + 1; 2250 done_index = page->index + 1;
2246 done = 1; 2251 done = 1;
2247 break; 2252 break;
2248 } 2253 }
2254 if (!ret)
2255 ret = error;
2249 } 2256 }
2250 2257
2251 /* 2258 /*
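
The write_cache_pages() change above separates the per-page error from the function's return value: background writeback stops at the first error, while integrity writeback keeps processing the dirty set and only reports the first error at the end. Below is a userspace model of that policy, with ->writepage mocked to fail on one page; the AOP_WRITEPAGE_ACTIVATE special case is left out.

#include <stdio.h>

#define WB_SYNC_NONE 0  /* background writeback */
#define WB_SYNC_ALL  1  /* integrity writeback  */

/* Mock ->writepage that fails on page index 1. */
static int writepage_mock(int index)
{
        return index == 1 ? -5 : 0;
}

static int write_pages_model(int sync_mode, int nr_pages, int *processed)
{
        int ret = 0;

        *processed = 0;
        for (int i = 0; i < nr_pages; i++) {
                int error = writepage_mock(i);

                if (error) {
                        if (sync_mode != WB_SYNC_ALL) {
                                /* background: remember the error and stop */
                                ret = error;
                                break;
                        }
                        /* integrity: keep going, report the first error */
                        if (!ret)
                                ret = error;
                }
                (*processed)++;
        }
        return ret;
}

int main(void)
{
        int n, ret;

        ret = write_pages_model(WB_SYNC_NONE, 4, &n);
        printf("background: ret=%d, pages processed=%d\n", ret, n);

        ret = write_pages_model(WB_SYNC_ALL, 4, &n);
        printf("integrity:  ret=%d, pages processed=%d\n", ret, n);
        return 0;
}
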
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e95b5b7c9c3d..cde5dac6229a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -16,6 +16,7 @@
16 16
17#include <linux/stddef.h> 17#include <linux/stddef.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/highmem.h>
19#include <linux/swap.h> 20#include <linux/swap.h>
20#include <linux/interrupt.h> 21#include <linux/interrupt.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
@@ -96,8 +97,12 @@ int _node_numa_mem_[MAX_NUMNODES];
96#endif 97#endif
97 98
98/* work_structs for global per-cpu drains */ 99/* work_structs for global per-cpu drains */
100struct pcpu_drain {
101 struct zone *zone;
102 struct work_struct work;
103};
99DEFINE_MUTEX(pcpu_drain_mutex); 104DEFINE_MUTEX(pcpu_drain_mutex);
100DEFINE_PER_CPU(struct work_struct, pcpu_drain); 105DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
101 106
102#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY 107#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
103volatile unsigned long latent_entropy __latent_entropy; 108volatile unsigned long latent_entropy __latent_entropy;
@@ -121,10 +126,8 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
121}; 126};
122EXPORT_SYMBOL(node_states); 127EXPORT_SYMBOL(node_states);
123 128
124/* Protect totalram_pages and zone->managed_pages */ 129atomic_long_t _totalram_pages __read_mostly;
125static DEFINE_SPINLOCK(managed_page_count_lock); 130EXPORT_SYMBOL(_totalram_pages);
126
127unsigned long totalram_pages __read_mostly;
128unsigned long totalreserve_pages __read_mostly; 131unsigned long totalreserve_pages __read_mostly;
129unsigned long totalcma_pages __read_mostly; 132unsigned long totalcma_pages __read_mostly;
130 133
@@ -237,7 +240,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
237#endif 240#endif
238}; 241};
239 242
240char * const migratetype_names[MIGRATE_TYPES] = { 243const char * const migratetype_names[MIGRATE_TYPES] = {
241 "Unmovable", 244 "Unmovable",
242 "Movable", 245 "Movable",
243 "Reclaimable", 246 "Reclaimable",
@@ -263,20 +266,21 @@ compound_page_dtor * const compound_page_dtors[] = {
263 266
264int min_free_kbytes = 1024; 267int min_free_kbytes = 1024;
265int user_min_free_kbytes = -1; 268int user_min_free_kbytes = -1;
269int watermark_boost_factor __read_mostly = 15000;
266int watermark_scale_factor = 10; 270int watermark_scale_factor = 10;
267 271
268static unsigned long nr_kernel_pages __meminitdata; 272static unsigned long nr_kernel_pages __initdata;
269static unsigned long nr_all_pages __meminitdata; 273static unsigned long nr_all_pages __initdata;
270static unsigned long dma_reserve __meminitdata; 274static unsigned long dma_reserve __initdata;
271 275
272#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 276#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
273static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata; 277static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
274static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata; 278static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
275static unsigned long required_kernelcore __initdata; 279static unsigned long required_kernelcore __initdata;
276static unsigned long required_kernelcore_percent __initdata; 280static unsigned long required_kernelcore_percent __initdata;
277static unsigned long required_movablecore __initdata; 281static unsigned long required_movablecore __initdata;
278static unsigned long required_movablecore_percent __initdata; 282static unsigned long required_movablecore_percent __initdata;
279static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata; 283static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
280static bool mirrored_kernelcore __meminitdata; 284static bool mirrored_kernelcore __meminitdata;
281 285
282/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 286/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -294,6 +298,32 @@ EXPORT_SYMBOL(nr_online_nodes);
294int page_group_by_mobility_disabled __read_mostly; 298int page_group_by_mobility_disabled __read_mostly;
295 299
296#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 300#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
301/*
302 * During boot we initialize deferred pages on-demand, as needed, but once
303 * page_alloc_init_late() has finished, the deferred pages are all initialized,
304 * and we can permanently disable that path.
305 */
306static DEFINE_STATIC_KEY_TRUE(deferred_pages);
307
308/*
309 * Call kasan_free_pages() only after deferred memory initialization
310 * has completed. Poisoning pages during deferred memory init will greatly
311 * lengthen the process and cause problems on large memory systems, as
312 * deferred page initialization is done with interrupts disabled.
313 *
314 * Assuming that there will be no reference to those newly initialized
315 * pages before they are ever allocated, this should have no effect on
316 * KASAN memory tracking as the poison will be properly inserted at page
317 * allocation time. The only corner case is when pages are allocated by
318 * on-demand allocation and then freed again before the deferred pages
319 * initialization is done, but this is not likely to happen.
320 */
321static inline void kasan_free_nondeferred_pages(struct page *page, int order)
322{
323 if (!static_branch_unlikely(&deferred_pages))
324 kasan_free_pages(page, order);
325}
326
297/* Returns true if the struct page for the pfn is uninitialised */ 327/* Returns true if the struct page for the pfn is uninitialised */
298static inline bool __meminit early_page_uninitialised(unsigned long pfn) 328static inline bool __meminit early_page_uninitialised(unsigned long pfn)
299{ 329{
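
The hunk above hoists the deferred_pages static key ahead of free_pages_prepare() and adds kasan_free_nondeferred_pages(), so KASAN poisoning of freed pages is skipped until deferred struct page initialization has finished. A rough userspace analogue of that gating, using an ordinary boolean where the kernel uses a self-patching static key; struct page, kasan_poison() and the pfn field are invented stand-ins:

#include <stdbool.h>
#include <stdio.h>

struct page { unsigned long pfn; };

/* In the kernel this is a static key that defaults to true and is
 * permanently switched off once page_alloc_init_late() finishes. */
static bool deferred_pages = true;

static void kasan_poison(struct page *page, int order)
{
        printf("poisoning pfn %lu, order %d\n", page->pfn, order);
}

/* Skip the expensive poisoning while deferred init is still running. */
static void kasan_free_nondeferred_pages(struct page *page, int order)
{
        if (!deferred_pages)
                kasan_poison(page, order);
}

int main(void)
{
        struct page p = { .pfn = 42 };

        kasan_free_nondeferred_pages(&p, 0);    /* early boot: skipped      */
        deferred_pages = false;                 /* deferred init completed  */
        kasan_free_nondeferred_pages(&p, 0);    /* now poisons as usual     */
        return 0;
}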
@@ -326,8 +356,13 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
326 /* Always populate low zones for address-constrained allocations */ 356 /* Always populate low zones for address-constrained allocations */
327 if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) 357 if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
328 return false; 358 return false;
359
360 /*
361 * We start with only one section of pages; more pages are added as
362 * needed until the rest of the deferred pages are initialized.
363 */
329 nr_initialised++; 364 nr_initialised++;
330 if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) && 365 if ((nr_initialised > PAGES_PER_SECTION) &&
331 (pfn & (PAGES_PER_SECTION - 1)) == 0) { 366 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
332 NODE_DATA(nid)->first_deferred_pfn = pfn; 367 NODE_DATA(nid)->first_deferred_pfn = pfn;
333 return true; 368 return true;
@@ -335,6 +370,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
335 return false; 370 return false;
336} 371}
337#else 372#else
373#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
374
338static inline bool early_page_uninitialised(unsigned long pfn) 375static inline bool early_page_uninitialised(unsigned long pfn)
339{ 376{
340 return false; 377 return false;
@@ -426,6 +463,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
426 unsigned long old_word, word; 463 unsigned long old_word, word;
427 464
428 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 465 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
466 BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
429 467
430 bitmap = get_pageblock_bitmap(page, pfn); 468 bitmap = get_pageblock_bitmap(page, pfn);
431 bitidx = pfn_to_bitidx(page, pfn); 469 bitidx = pfn_to_bitidx(page, pfn);
@@ -1037,7 +1075,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
1037 arch_free_page(page, order); 1075 arch_free_page(page, order);
1038 kernel_poison_pages(page, 1 << order, 0); 1076 kernel_poison_pages(page, 1 << order, 0);
1039 kernel_map_pages(page, 1 << order, 0); 1077 kernel_map_pages(page, 1 << order, 0);
1040 kasan_free_pages(page, order); 1078 kasan_free_nondeferred_pages(page, order);
1041 1079
1042 return true; 1080 return true;
1043} 1081}
@@ -1183,6 +1221,7 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1183 init_page_count(page); 1221 init_page_count(page);
1184 page_mapcount_reset(page); 1222 page_mapcount_reset(page);
1185 page_cpupid_reset_last(page); 1223 page_cpupid_reset_last(page);
1224 page_kasan_tag_reset(page);
1186 1225
1187 INIT_LIST_HEAD(&page->lru); 1226 INIT_LIST_HEAD(&page->lru);
1188#ifdef WANT_PAGE_VIRTUAL 1227#ifdef WANT_PAGE_VIRTUAL
@@ -1279,7 +1318,7 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1279 __ClearPageReserved(p); 1318 __ClearPageReserved(p);
1280 set_page_count(p, 0); 1319 set_page_count(p, 0);
1281 1320
1282 page_zone(page)->managed_pages += nr_pages; 1321 atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
1283 set_page_refcounted(page); 1322 set_page_refcounted(page);
1284 __free_pages(page, order); 1323 __free_pages(page, order);
1285} 1324}
@@ -1606,13 +1645,6 @@ static int __init deferred_init_memmap(void *data)
1606} 1645}
1607 1646
1608/* 1647/*
1609 * During boot we initialize deferred pages on-demand, as needed, but once
1610 * page_alloc_init_late() has finished, the deferred pages are all initialized,
1611 * and we can permanently disable that path.
1612 */
1613static DEFINE_STATIC_KEY_TRUE(deferred_pages);
1614
1615/*
1616 * If this zone has deferred pages, try to grow it by initializing enough 1648 * If this zone has deferred pages, try to grow it by initializing enough
1617 * deferred pages to satisfy the allocation specified by order, rounded up to 1649 * deferred pages to satisfy the allocation specified by order, rounded up to
1618 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments 1650 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
@@ -1981,8 +2013,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1981 */ 2013 */
1982static int fallbacks[MIGRATE_TYPES][4] = { 2014static int fallbacks[MIGRATE_TYPES][4] = {
1983 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 2015 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
1984 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
1985 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, 2016 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
2017 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
1986#ifdef CONFIG_CMA 2018#ifdef CONFIG_CMA
1987 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ 2019 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
1988#endif 2020#endif
@@ -2129,6 +2161,21 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
2129 return false; 2161 return false;
2130} 2162}
2131 2163
2164static inline void boost_watermark(struct zone *zone)
2165{
2166 unsigned long max_boost;
2167
2168 if (!watermark_boost_factor)
2169 return;
2170
2171 max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
2172 watermark_boost_factor, 10000);
2173 max_boost = max(pageblock_nr_pages, max_boost);
2174
2175 zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
2176 max_boost);
2177}
2178
2132/* 2179/*
2133 * This function implements actual steal behaviour. If order is large enough, 2180 * This function implements actual steal behaviour. If order is large enough,
2134 * we can steal whole pageblock. If not, we first move freepages in this 2181 * we can steal whole pageblock. If not, we first move freepages in this
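
boost_watermark() in the hunk above bumps zone->watermark_boost by one pageblock per fragmenting fallback, capped at watermark_boost_factor (default 15000, i.e. 150%) of the high watermark and never less than one pageblock. The arithmetic in isolation, as a standalone sketch with made-up zone numbers and a re-derived mult_frac():

#include <stdio.h>

/* Multiply-then-scale without intermediate overflow, in the spirit of
 * the kernel's mult_frac(). */
#define mult_frac(x, num, den) (((x) / (den)) * (num) + ((x) % (den)) * (num) / (den))
#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
        unsigned long wmark_high = 65536;               /* pages, illustrative     */
        unsigned long pageblock_nr_pages = 512;         /* 2MB blocks on x86-64    */
        unsigned long watermark_boost_factor = 15000;   /* default from the hunk   */
        unsigned long boost = 0;

        unsigned long max_boost = mult_frac(wmark_high, watermark_boost_factor, 10000);
        max_boost = max(pageblock_nr_pages, max_boost);

        /* Each fragmenting fallback bumps the boost by one pageblock. */
        for (int event = 0; event < 250; event++)
                boost = min(boost + pageblock_nr_pages, max_boost);

        printf("max_boost = %lu pages, boost after 250 events = %lu\n",
               max_boost, boost);      /* caps at 1.5 * high watermark = 98304 */
        return 0;
}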
@@ -2138,7 +2185,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
2138 * itself, so pages freed in the future will be put on the correct free list. 2185 * itself, so pages freed in the future will be put on the correct free list.
2139 */ 2186 */
2140static void steal_suitable_fallback(struct zone *zone, struct page *page, 2187static void steal_suitable_fallback(struct zone *zone, struct page *page,
2141 int start_type, bool whole_block) 2188 unsigned int alloc_flags, int start_type, bool whole_block)
2142{ 2189{
2143 unsigned int current_order = page_order(page); 2190 unsigned int current_order = page_order(page);
2144 struct free_area *area; 2191 struct free_area *area;
@@ -2160,6 +2207,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
2160 goto single_page; 2207 goto single_page;
2161 } 2208 }
2162 2209
2210 /*
2211 * Boost watermarks to increase reclaim pressure to reduce the
2212 * likelihood of future fallbacks. Wake kswapd now as the node
2213 * may be balanced overall and kswapd will not wake naturally.
2214 */
2215 boost_watermark(zone);
2216 if (alloc_flags & ALLOC_KSWAPD)
2217 wakeup_kswapd(zone, 0, 0, zone_idx(zone));
2218
2163 /* We are not allowed to try stealing from the whole block */ 2219 /* We are not allowed to try stealing from the whole block */
2164 if (!whole_block) 2220 if (!whole_block)
2165 goto single_page; 2221 goto single_page;
@@ -2258,7 +2314,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2258 * Limit the number reserved to 1 pageblock or roughly 1% of a zone. 2314 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2259 * Check is race-prone but harmless. 2315 * Check is race-prone but harmless.
2260 */ 2316 */
2261 max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; 2317 max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
2262 if (zone->nr_reserved_highatomic >= max_managed) 2318 if (zone->nr_reserved_highatomic >= max_managed)
2263 return; 2319 return;
2264 2320
@@ -2375,20 +2431,30 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2375 * condition simpler. 2431 * condition simpler.
2376 */ 2432 */
2377static __always_inline bool 2433static __always_inline bool
2378__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 2434__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
2435 unsigned int alloc_flags)
2379{ 2436{
2380 struct free_area *area; 2437 struct free_area *area;
2381 int current_order; 2438 int current_order;
2439 int min_order = order;
2382 struct page *page; 2440 struct page *page;
2383 int fallback_mt; 2441 int fallback_mt;
2384 bool can_steal; 2442 bool can_steal;
2385 2443
2386 /* 2444 /*
2445 * Do not steal pages from freelists belonging to other pageblocks
2446 * i.e. orders < pageblock_order. If there are no local zones free,
2447 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
2448 */
2449 if (alloc_flags & ALLOC_NOFRAGMENT)
2450 min_order = pageblock_order;
2451
2452 /*
2387 * Find the largest available free page in the other list. This roughly 2453 * Find the largest available free page in the other list. This roughly
2388 * approximates finding the pageblock with the most free pages, which 2454 * approximates finding the pageblock with the most free pages, which
2389 * would be too costly to do exactly. 2455 * would be too costly to do exactly.
2390 */ 2456 */
2391 for (current_order = MAX_ORDER - 1; current_order >= order; 2457 for (current_order = MAX_ORDER - 1; current_order >= min_order;
2392 --current_order) { 2458 --current_order) {
2393 area = &(zone->free_area[current_order]); 2459 area = &(zone->free_area[current_order]);
2394 fallback_mt = find_suitable_fallback(area, current_order, 2460 fallback_mt = find_suitable_fallback(area, current_order,
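
With ALLOC_NOFRAGMENT set, the fallback scan above clamps min_order to pageblock_order, so nothing smaller than a whole pageblock is stolen from another migratetype; the caller retries without the flag if that fails. A toy version of the clamped search; the orders reported by area_has_fallback() and the flag value are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER        11
#define PAGEBLOCK_ORDER  9
#define ALLOC_NOFRAGMENT 0x1    /* illustrative flag value */

/* Pretend free_area[order] has a stealable block when this returns true. */
static bool area_has_fallback(int order)
{
        return order == 3 || order == 5;   /* only small blocks available */
}

static int find_fallback_order(int order, unsigned int alloc_flags)
{
        int min_order = order;

        /* Don't steal from freelists below pageblock_order when avoiding
         * fragmentation; the caller retries without the flag on failure. */
        if (alloc_flags & ALLOC_NOFRAGMENT)
                min_order = PAGEBLOCK_ORDER;

        for (int current_order = MAX_ORDER - 1; current_order >= min_order; current_order--)
                if (area_has_fallback(current_order))
                        return current_order;
        return -1;
}

int main(void)
{
        printf("order-2, nofragment: %d\n", find_fallback_order(2, ALLOC_NOFRAGMENT)); /* -1 */
        printf("order-2, fallback:   %d\n", find_fallback_order(2, 0));                /*  5 */
        return 0;
}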
@@ -2433,7 +2499,8 @@ do_steal:
2433 page = list_first_entry(&area->free_list[fallback_mt], 2499 page = list_first_entry(&area->free_list[fallback_mt],
2434 struct page, lru); 2500 struct page, lru);
2435 2501
2436 steal_suitable_fallback(zone, page, start_migratetype, can_steal); 2502 steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
2503 can_steal);
2437 2504
2438 trace_mm_page_alloc_extfrag(page, order, current_order, 2505 trace_mm_page_alloc_extfrag(page, order, current_order,
2439 start_migratetype, fallback_mt); 2506 start_migratetype, fallback_mt);
@@ -2447,7 +2514,8 @@ do_steal:
2447 * Call me with the zone->lock already held. 2514 * Call me with the zone->lock already held.
2448 */ 2515 */
2449static __always_inline struct page * 2516static __always_inline struct page *
2450__rmqueue(struct zone *zone, unsigned int order, int migratetype) 2517__rmqueue(struct zone *zone, unsigned int order, int migratetype,
2518 unsigned int alloc_flags)
2451{ 2519{
2452 struct page *page; 2520 struct page *page;
2453 2521
@@ -2457,7 +2525,8 @@ retry:
2457 if (migratetype == MIGRATE_MOVABLE) 2525 if (migratetype == MIGRATE_MOVABLE)
2458 page = __rmqueue_cma_fallback(zone, order); 2526 page = __rmqueue_cma_fallback(zone, order);
2459 2527
2460 if (!page && __rmqueue_fallback(zone, order, migratetype)) 2528 if (!page && __rmqueue_fallback(zone, order, migratetype,
2529 alloc_flags))
2461 goto retry; 2530 goto retry;
2462 } 2531 }
2463 2532
@@ -2472,13 +2541,14 @@ retry:
2472 */ 2541 */
2473static int rmqueue_bulk(struct zone *zone, unsigned int order, 2542static int rmqueue_bulk(struct zone *zone, unsigned int order,
2474 unsigned long count, struct list_head *list, 2543 unsigned long count, struct list_head *list,
2475 int migratetype) 2544 int migratetype, unsigned int alloc_flags)
2476{ 2545{
2477 int i, alloced = 0; 2546 int i, alloced = 0;
2478 2547
2479 spin_lock(&zone->lock); 2548 spin_lock(&zone->lock);
2480 for (i = 0; i < count; ++i) { 2549 for (i = 0; i < count; ++i) {
2481 struct page *page = __rmqueue(zone, order, migratetype); 2550 struct page *page = __rmqueue(zone, order, migratetype,
2551 alloc_flags);
2482 if (unlikely(page == NULL)) 2552 if (unlikely(page == NULL))
2483 break; 2553 break;
2484 2554
@@ -2592,6 +2662,10 @@ void drain_local_pages(struct zone *zone)
2592 2662
2593static void drain_local_pages_wq(struct work_struct *work) 2663static void drain_local_pages_wq(struct work_struct *work)
2594{ 2664{
2665 struct pcpu_drain *drain;
2666
2667 drain = container_of(work, struct pcpu_drain, work);
2668
2595 /* 2669 /*
2596 * drain_all_pages doesn't use proper cpu hotplug protection so 2670 * drain_all_pages doesn't use proper cpu hotplug protection so
2597 * we can race with cpu offline when the WQ can move this from 2671 * we can race with cpu offline when the WQ can move this from
@@ -2600,7 +2674,7 @@ static void drain_local_pages_wq(struct work_struct *work)
2600 * a different one. 2674 * a different one.
2601 */ 2675 */
2602 preempt_disable(); 2676 preempt_disable();
2603 drain_local_pages(NULL); 2677 drain_local_pages(drain->zone);
2604 preempt_enable(); 2678 preempt_enable();
2605} 2679}
2606 2680
@@ -2671,12 +2745,14 @@ void drain_all_pages(struct zone *zone)
2671 } 2745 }
2672 2746
2673 for_each_cpu(cpu, &cpus_with_pcps) { 2747 for_each_cpu(cpu, &cpus_with_pcps) {
2674 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); 2748 struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
2675 INIT_WORK(work, drain_local_pages_wq); 2749
2676 queue_work_on(cpu, mm_percpu_wq, work); 2750 drain->zone = zone;
2751 INIT_WORK(&drain->work, drain_local_pages_wq);
2752 queue_work_on(cpu, mm_percpu_wq, &drain->work);
2677 } 2753 }
2678 for_each_cpu(cpu, &cpus_with_pcps) 2754 for_each_cpu(cpu, &cpus_with_pcps)
2679 flush_work(per_cpu_ptr(&pcpu_drain, cpu)); 2755 flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
2680 2756
2681 mutex_unlock(&pcpu_drain_mutex); 2757 mutex_unlock(&pcpu_drain_mutex);
2682} 2758}
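
The drain rework above embeds the work_struct in a per-CPU struct pcpu_drain so that drain_local_pages_wq() can recover the requested zone via container_of() instead of always draining every zone. The same embed-and-recover idiom in plain C; the zone string, the hand-rolled work_struct and the container_of macro are simplified stand-ins for the kernel versions:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct {
        void (*func)(struct work_struct *work);
};

struct pcpu_drain {
        const char *zone;               /* stands in for struct zone *  */
        struct work_struct work;        /* embedded, as in the hunk     */
};

static void drain_local_pages_wq(struct work_struct *work)
{
        /* Recover the enclosing pcpu_drain from the embedded member. */
        struct pcpu_drain *drain = container_of(work, struct pcpu_drain, work);

        printf("draining zone %s\n", drain->zone);
}

int main(void)
{
        struct pcpu_drain drain = { .zone = "Normal" };

        drain.work.func = drain_local_pages_wq;
        drain.work.func(&drain.work);   /* what the queued work would end up calling */
        return 0;
}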
@@ -2934,6 +3010,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2934 3010
2935/* Remove page from the per-cpu list, caller must protect the list */ 3011/* Remove page from the per-cpu list, caller must protect the list */
2936static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, 3012static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
3013 unsigned int alloc_flags,
2937 struct per_cpu_pages *pcp, 3014 struct per_cpu_pages *pcp,
2938 struct list_head *list) 3015 struct list_head *list)
2939{ 3016{
@@ -2943,7 +3020,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2943 if (list_empty(list)) { 3020 if (list_empty(list)) {
2944 pcp->count += rmqueue_bulk(zone, 0, 3021 pcp->count += rmqueue_bulk(zone, 0,
2945 pcp->batch, list, 3022 pcp->batch, list,
2946 migratetype); 3023 migratetype, alloc_flags);
2947 if (unlikely(list_empty(list))) 3024 if (unlikely(list_empty(list)))
2948 return NULL; 3025 return NULL;
2949 } 3026 }
@@ -2959,7 +3036,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2959/* Lock and remove page from the per-cpu list */ 3036/* Lock and remove page from the per-cpu list */
2960static struct page *rmqueue_pcplist(struct zone *preferred_zone, 3037static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2961 struct zone *zone, unsigned int order, 3038 struct zone *zone, unsigned int order,
2962 gfp_t gfp_flags, int migratetype) 3039 gfp_t gfp_flags, int migratetype,
3040 unsigned int alloc_flags)
2963{ 3041{
2964 struct per_cpu_pages *pcp; 3042 struct per_cpu_pages *pcp;
2965 struct list_head *list; 3043 struct list_head *list;
@@ -2969,7 +3047,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2969 local_irq_save(flags); 3047 local_irq_save(flags);
2970 pcp = &this_cpu_ptr(zone->pageset)->pcp; 3048 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2971 list = &pcp->lists[migratetype]; 3049 list = &pcp->lists[migratetype];
2972 page = __rmqueue_pcplist(zone, migratetype, pcp, list); 3050 page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
2973 if (page) { 3051 if (page) {
2974 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 3052 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2975 zone_statistics(preferred_zone, zone); 3053 zone_statistics(preferred_zone, zone);
@@ -2992,7 +3070,7 @@ struct page *rmqueue(struct zone *preferred_zone,
2992 3070
2993 if (likely(order == 0)) { 3071 if (likely(order == 0)) {
2994 page = rmqueue_pcplist(preferred_zone, zone, order, 3072 page = rmqueue_pcplist(preferred_zone, zone, order,
2995 gfp_flags, migratetype); 3073 gfp_flags, migratetype, alloc_flags);
2996 goto out; 3074 goto out;
2997 } 3075 }
2998 3076
@@ -3011,7 +3089,7 @@ struct page *rmqueue(struct zone *preferred_zone,
3011 trace_mm_page_alloc_zone_locked(page, order, migratetype); 3089 trace_mm_page_alloc_zone_locked(page, order, migratetype);
3012 } 3090 }
3013 if (!page) 3091 if (!page)
3014 page = __rmqueue(zone, order, migratetype); 3092 page = __rmqueue(zone, order, migratetype, alloc_flags);
3015 } while (page && check_new_pages(page, order)); 3093 } while (page && check_new_pages(page, order));
3016 spin_unlock(&zone->lock); 3094 spin_unlock(&zone->lock);
3017 if (!page) 3095 if (!page)
@@ -3053,7 +3131,7 @@ static int __init setup_fail_page_alloc(char *str)
3053} 3131}
3054__setup("fail_page_alloc=", setup_fail_page_alloc); 3132__setup("fail_page_alloc=", setup_fail_page_alloc);
3055 3133
3056static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3134static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3057{ 3135{
3058 if (order < fail_page_alloc.min_order) 3136 if (order < fail_page_alloc.min_order)
3059 return false; 3137 return false;
@@ -3103,13 +3181,19 @@ late_initcall(fail_page_alloc_debugfs);
3103 3181
3104#else /* CONFIG_FAIL_PAGE_ALLOC */ 3182#else /* CONFIG_FAIL_PAGE_ALLOC */
3105 3183
3106static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 3184static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3107{ 3185{
3108 return false; 3186 return false;
3109} 3187}
3110 3188
3111#endif /* CONFIG_FAIL_PAGE_ALLOC */ 3189#endif /* CONFIG_FAIL_PAGE_ALLOC */
3112 3190
3191static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
3192{
3193 return __should_fail_alloc_page(gfp_mask, order);
3194}
3195ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
3196
3113/* 3197/*
3114 * Return true if free base pages are above 'mark'. For high-order checks it 3198 * Return true if free base pages are above 'mark'. For high-order checks it
3115 * will return true if the order-0 watermark is reached and there is at least 3199
@@ -3254,6 +3338,40 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3254#endif /* CONFIG_NUMA */ 3338#endif /* CONFIG_NUMA */
3255 3339
3256/* 3340/*
3341 * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
3342 * fragmentation is subtle. If the preferred zone was HIGHMEM then
3343 * premature use of a lower zone may cause lowmem pressure problems that
3344 * are worse than fragmentation. If the next zone is ZONE_DMA then it is
3345 * probably too small. It only makes sense to spread allocations to avoid
3346 * fragmentation between the Normal and DMA32 zones.
3347 */
3348static inline unsigned int
3349alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
3350{
3351 unsigned int alloc_flags = 0;
3352
3353 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3354 alloc_flags |= ALLOC_KSWAPD;
3355
3356#ifdef CONFIG_ZONE_DMA32
3357 if (zone_idx(zone) != ZONE_NORMAL)
3358 goto out;
3359
3360 /*
3361 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
3362 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
3363 * on UMA that if Normal is populated then so is DMA32.
3364 */
3365 BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
3366 if (nr_online_nodes > 1 && !populated_zone(--zone))
3367 goto out;
3368
3369out:
3370#endif /* CONFIG_ZONE_DMA32 */
3371 return alloc_flags;
3372}
3373
3374/*
3257 * get_page_from_freelist goes through the zonelist trying to allocate 3375 * get_page_from_freelist goes through the zonelist trying to allocate
3258 * a page. 3376 * a page.
3259 */ 3377 */
@@ -3261,14 +3379,18 @@ static struct page *
3261get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, 3379get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3262 const struct alloc_context *ac) 3380 const struct alloc_context *ac)
3263{ 3381{
3264 struct zoneref *z = ac->preferred_zoneref; 3382 struct zoneref *z;
3265 struct zone *zone; 3383 struct zone *zone;
3266 struct pglist_data *last_pgdat_dirty_limit = NULL; 3384 struct pglist_data *last_pgdat_dirty_limit = NULL;
3385 bool no_fallback;
3267 3386
3387retry:
3268 /* 3388 /*
3269 * Scan zonelist, looking for a zone with enough free. 3389 * Scan zonelist, looking for a zone with enough free.
3270 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 3390 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
3271 */ 3391 */
3392 no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
3393 z = ac->preferred_zoneref;
3272 for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3394 for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3273 ac->nodemask) { 3395 ac->nodemask) {
3274 struct page *page; 3396 struct page *page;
@@ -3307,7 +3429,23 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3307 } 3429 }
3308 } 3430 }
3309 3431
3310 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 3432 if (no_fallback && nr_online_nodes > 1 &&
3433 zone != ac->preferred_zoneref->zone) {
3434 int local_nid;
3435
3436 /*
3437 * If moving to a remote node, retry but allow
3438 * fragmenting fallbacks. Locality is more important
3439 * than fragmentation avoidance.
3440 */
3441 local_nid = zone_to_nid(ac->preferred_zoneref->zone);
3442 if (zone_to_nid(zone) != local_nid) {
3443 alloc_flags &= ~ALLOC_NOFRAGMENT;
3444 goto retry;
3445 }
3446 }
3447
3448 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
3311 if (!zone_watermark_fast(zone, order, mark, 3449 if (!zone_watermark_fast(zone, order, mark,
3312 ac_classzone_idx(ac), alloc_flags)) { 3450 ac_classzone_idx(ac), alloc_flags)) {
3313 int ret; 3451 int ret;
@@ -3374,6 +3512,15 @@ try_this_zone:
3374 } 3512 }
3375 } 3513 }
3376 3514
3515 /*
3516 * It's possible on a UMA machine to get through all zones and find them
3517 * all fragmented. If avoiding fragmentation, reset and try again.
3518 */
3519 if (no_fallback) {
3520 alloc_flags &= ~ALLOC_NOFRAGMENT;
3521 goto retry;
3522 }
3523
3377 return NULL; 3524 return NULL;
3378} 3525}
3379 3526
@@ -3413,13 +3560,13 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3413 va_start(args, fmt); 3560 va_start(args, fmt);
3414 vaf.fmt = fmt; 3561 vaf.fmt = fmt;
3415 vaf.va = &args; 3562 vaf.va = &args;
3416 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", 3563 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
3417 current->comm, &vaf, gfp_mask, &gfp_mask, 3564 current->comm, &vaf, gfp_mask, &gfp_mask,
3418 nodemask_pr_args(nodemask)); 3565 nodemask_pr_args(nodemask));
3419 va_end(args); 3566 va_end(args);
3420 3567
3421 cpuset_print_current_mems_allowed(); 3568 cpuset_print_current_mems_allowed();
3422 3569 pr_cont("\n");
3423 dump_stack(); 3570 dump_stack();
3424 warn_alloc_show_mem(gfp_mask, nodemask); 3571 warn_alloc_show_mem(gfp_mask, nodemask);
3425} 3572}
@@ -3861,6 +4008,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
3861 } else if (unlikely(rt_task(current)) && !in_interrupt()) 4008 } else if (unlikely(rt_task(current)) && !in_interrupt())
3862 alloc_flags |= ALLOC_HARDER; 4009 alloc_flags |= ALLOC_HARDER;
3863 4010
4011 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
4012 alloc_flags |= ALLOC_KSWAPD;
4013
3864#ifdef CONFIG_CMA 4014#ifdef CONFIG_CMA
3865 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 4015 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
3866 alloc_flags |= ALLOC_CMA; 4016 alloc_flags |= ALLOC_CMA;
@@ -4092,7 +4242,7 @@ retry_cpuset:
4092 if (!ac->preferred_zoneref->zone) 4242 if (!ac->preferred_zoneref->zone)
4093 goto nopage; 4243 goto nopage;
4094 4244
4095 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 4245 if (alloc_flags & ALLOC_KSWAPD)
4096 wake_all_kswapds(order, gfp_mask, ac); 4246 wake_all_kswapds(order, gfp_mask, ac);
4097 4247
4098 /* 4248 /*
@@ -4150,7 +4300,7 @@ retry_cpuset:
4150 4300
4151retry: 4301retry:
4152 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ 4302 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
4153 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 4303 if (alloc_flags & ALLOC_KSWAPD)
4154 wake_all_kswapds(order, gfp_mask, ac); 4304 wake_all_kswapds(order, gfp_mask, ac);
4155 4305
4156 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); 4306 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
@@ -4369,6 +4519,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
4369 4519
4370 finalise_ac(gfp_mask, &ac); 4520 finalise_ac(gfp_mask, &ac);
4371 4521
4522 /*
4523 * Forbid the first pass from falling back to types that fragment
4524 * memory until all local zones are considered.
4525 */
4526 alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
4527
4372 /* First allocation attempt */ 4528 /* First allocation attempt */
4373 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); 4529 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
4374 if (likely(page)) 4530 if (likely(page))
@@ -4427,16 +4583,19 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
4427} 4583}
4428EXPORT_SYMBOL(get_zeroed_page); 4584EXPORT_SYMBOL(get_zeroed_page);
4429 4585
4430void __free_pages(struct page *page, unsigned int order) 4586static inline void free_the_page(struct page *page, unsigned int order)
4431{ 4587{
4432 if (put_page_testzero(page)) { 4588 if (order == 0) /* Via pcp? */
4433 if (order == 0) 4589 free_unref_page(page);
4434 free_unref_page(page); 4590 else
4435 else 4591 __free_pages_ok(page, order);
4436 __free_pages_ok(page, order);
4437 }
4438} 4592}
4439 4593
4594void __free_pages(struct page *page, unsigned int order)
4595{
4596 if (put_page_testzero(page))
4597 free_the_page(page, order);
4598}
4440EXPORT_SYMBOL(__free_pages); 4599EXPORT_SYMBOL(__free_pages);
4441 4600
4442void free_pages(unsigned long addr, unsigned int order) 4601void free_pages(unsigned long addr, unsigned int order)
@@ -4485,14 +4644,8 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
4485{ 4644{
4486 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); 4645 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
4487 4646
4488 if (page_ref_sub_and_test(page, count)) { 4647 if (page_ref_sub_and_test(page, count))
4489 unsigned int order = compound_order(page); 4648 free_the_page(page, compound_order(page));
4490
4491 if (order == 0)
4492 free_unref_page(page);
4493 else
4494 __free_pages_ok(page, order);
4495 }
4496} 4649}
4497EXPORT_SYMBOL(__page_frag_cache_drain); 4650EXPORT_SYMBOL(__page_frag_cache_drain);
4498 4651
@@ -4558,7 +4711,7 @@ void page_frag_free(void *addr)
4558 struct page *page = virt_to_head_page(addr); 4711 struct page *page = virt_to_head_page(addr);
4559 4712
4560 if (unlikely(put_page_testzero(page))) 4713 if (unlikely(put_page_testzero(page)))
4561 __free_pages_ok(page, compound_order(page)); 4714 free_the_page(page, compound_order(page));
4562} 4715}
4563EXPORT_SYMBOL(page_frag_free); 4716EXPORT_SYMBOL(page_frag_free);
4564 4717
@@ -4660,7 +4813,7 @@ static unsigned long nr_free_zone_pages(int offset)
4660 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 4813 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
4661 4814
4662 for_each_zone_zonelist(zone, z, zonelist, offset) { 4815 for_each_zone_zonelist(zone, z, zonelist, offset) {
4663 unsigned long size = zone->managed_pages; 4816 unsigned long size = zone_managed_pages(zone);
4664 unsigned long high = high_wmark_pages(zone); 4817 unsigned long high = high_wmark_pages(zone);
4665 if (size > high) 4818 if (size > high)
4666 sum += size - high; 4819 sum += size - high;
@@ -4712,7 +4865,7 @@ long si_mem_available(void)
4712 pages[lru] = global_node_page_state(NR_LRU_BASE + lru); 4865 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
4713 4866
4714 for_each_zone(zone) 4867 for_each_zone(zone)
4715 wmark_low += zone->watermark[WMARK_LOW]; 4868 wmark_low += low_wmark_pages(zone);
4716 4869
4717 /* 4870 /*
4718 * Estimate the amount of memory available for userspace allocations, 4871 * Estimate the amount of memory available for userspace allocations,
@@ -4746,11 +4899,11 @@ EXPORT_SYMBOL_GPL(si_mem_available);
4746 4899
4747void si_meminfo(struct sysinfo *val) 4900void si_meminfo(struct sysinfo *val)
4748{ 4901{
4749 val->totalram = totalram_pages; 4902 val->totalram = totalram_pages();
4750 val->sharedram = global_node_page_state(NR_SHMEM); 4903 val->sharedram = global_node_page_state(NR_SHMEM);
4751 val->freeram = global_zone_page_state(NR_FREE_PAGES); 4904 val->freeram = global_zone_page_state(NR_FREE_PAGES);
4752 val->bufferram = nr_blockdev_pages(); 4905 val->bufferram = nr_blockdev_pages();
4753 val->totalhigh = totalhigh_pages; 4906 val->totalhigh = totalhigh_pages();
4754 val->freehigh = nr_free_highpages(); 4907 val->freehigh = nr_free_highpages();
4755 val->mem_unit = PAGE_SIZE; 4908 val->mem_unit = PAGE_SIZE;
4756} 4909}
@@ -4767,7 +4920,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
4767 pg_data_t *pgdat = NODE_DATA(nid); 4920 pg_data_t *pgdat = NODE_DATA(nid);
4768 4921
4769 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 4922 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
4770 managed_pages += pgdat->node_zones[zone_type].managed_pages; 4923 managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
4771 val->totalram = managed_pages; 4924 val->totalram = managed_pages;
4772 val->sharedram = node_page_state(pgdat, NR_SHMEM); 4925 val->sharedram = node_page_state(pgdat, NR_SHMEM);
4773 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); 4926 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
@@ -4776,7 +4929,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
4776 struct zone *zone = &pgdat->node_zones[zone_type]; 4929 struct zone *zone = &pgdat->node_zones[zone_type];
4777 4930
4778 if (is_highmem(zone)) { 4931 if (is_highmem(zone)) {
4779 managed_highpages += zone->managed_pages; 4932 managed_highpages += zone_managed_pages(zone);
4780 free_highpages += zone_page_state(zone, NR_FREE_PAGES); 4933 free_highpages += zone_page_state(zone, NR_FREE_PAGES);
4781 } 4934 }
4782 } 4935 }
@@ -4983,7 +5136,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
4983 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), 5136 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
4984 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), 5137 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
4985 K(zone->present_pages), 5138 K(zone->present_pages),
4986 K(zone->managed_pages), 5139 K(zone_managed_pages(zone)),
4987 K(zone_page_state(zone, NR_MLOCK)), 5140 K(zone_page_state(zone, NR_MLOCK)),
4988 zone_page_state(zone, NR_KERNEL_STACK_KB), 5141 zone_page_state(zone, NR_KERNEL_STACK_KB),
4989 K(zone_page_state(zone, NR_PAGETABLE)), 5142 K(zone_page_state(zone, NR_PAGETABLE)),
@@ -5655,7 +5808,7 @@ static int zone_batchsize(struct zone *zone)
5655 * The per-cpu-pages pools are set to around 1000th of the 5808 * The per-cpu-pages pools are set to around 1000th of the
5656 * size of the zone. 5809 * size of the zone.
5657 */ 5810 */
5658 batch = zone->managed_pages / 1024; 5811 batch = zone_managed_pages(zone) / 1024;
5659 /* But no more than a meg. */ 5812 /* But no more than a meg. */
5660 if (batch * PAGE_SIZE > 1024 * 1024) 5813 if (batch * PAGE_SIZE > 1024 * 1024)
5661 batch = (1024 * 1024) / PAGE_SIZE; 5814 batch = (1024 * 1024) / PAGE_SIZE;
@@ -5736,7 +5889,6 @@ static void pageset_init(struct per_cpu_pageset *p)
5736 memset(p, 0, sizeof(*p)); 5889 memset(p, 0, sizeof(*p));
5737 5890
5738 pcp = &p->pcp; 5891 pcp = &p->pcp;
5739 pcp->count = 0;
5740 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 5892 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
5741 INIT_LIST_HEAD(&pcp->lists[migratetype]); 5893 INIT_LIST_HEAD(&pcp->lists[migratetype]);
5742} 5894}
@@ -5766,7 +5918,7 @@ static void pageset_set_high_and_batch(struct zone *zone,
5766{ 5918{
5767 if (percpu_pagelist_fraction) 5919 if (percpu_pagelist_fraction)
5768 pageset_set_high(pcp, 5920 pageset_set_high(pcp,
5769 (zone->managed_pages / 5921 (zone_managed_pages(zone) /
5770 percpu_pagelist_fraction)); 5922 percpu_pagelist_fraction));
5771 else 5923 else
5772 pageset_set_batch(pcp, zone_batchsize(zone)); 5924 pageset_set_batch(pcp, zone_batchsize(zone));
@@ -5920,7 +6072,7 @@ void __init sparse_memory_present_with_active_regions(int nid)
5920 * with no available memory, a warning is printed and the start and end 6072 * with no available memory, a warning is printed and the start and end
5921 * PFNs will be 0. 6073 * PFNs will be 0.
5922 */ 6074 */
5923void __meminit get_pfn_range_for_nid(unsigned int nid, 6075void __init get_pfn_range_for_nid(unsigned int nid,
5924 unsigned long *start_pfn, unsigned long *end_pfn) 6076 unsigned long *start_pfn, unsigned long *end_pfn)
5925{ 6077{
5926 unsigned long this_start_pfn, this_end_pfn; 6078 unsigned long this_start_pfn, this_end_pfn;
@@ -5969,7 +6121,7 @@ static void __init find_usable_zone_for_movable(void)
5969 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 6121 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
5970 * zones within a node are in order of monotonically increasing memory addresses 6122
5971 */ 6123 */
5972static void __meminit adjust_zone_range_for_zone_movable(int nid, 6124static void __init adjust_zone_range_for_zone_movable(int nid,
5973 unsigned long zone_type, 6125 unsigned long zone_type,
5974 unsigned long node_start_pfn, 6126 unsigned long node_start_pfn,
5975 unsigned long node_end_pfn, 6127 unsigned long node_end_pfn,
@@ -6000,7 +6152,7 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
6000 * Return the number of pages a zone spans in a node, including holes 6152 * Return the number of pages a zone spans in a node, including holes
6001 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 6153 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
6002 */ 6154 */
6003static unsigned long __meminit zone_spanned_pages_in_node(int nid, 6155static unsigned long __init zone_spanned_pages_in_node(int nid,
6004 unsigned long zone_type, 6156 unsigned long zone_type,
6005 unsigned long node_start_pfn, 6157 unsigned long node_start_pfn,
6006 unsigned long node_end_pfn, 6158 unsigned long node_end_pfn,
@@ -6035,7 +6187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
6035 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 6187 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
6036 * then all holes in the requested range will be accounted for. 6188 * then all holes in the requested range will be accounted for.
6037 */ 6189 */
6038unsigned long __meminit __absent_pages_in_range(int nid, 6190unsigned long __init __absent_pages_in_range(int nid,
6039 unsigned long range_start_pfn, 6191 unsigned long range_start_pfn,
6040 unsigned long range_end_pfn) 6192 unsigned long range_end_pfn)
6041{ 6193{
@@ -6065,7 +6217,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
6065} 6217}
6066 6218
6067/* Return the number of page frames in holes in a zone on a node */ 6219/* Return the number of page frames in holes in a zone on a node */
6068static unsigned long __meminit zone_absent_pages_in_node(int nid, 6220static unsigned long __init zone_absent_pages_in_node(int nid,
6069 unsigned long zone_type, 6221 unsigned long zone_type,
6070 unsigned long node_start_pfn, 6222 unsigned long node_start_pfn,
6071 unsigned long node_end_pfn, 6223 unsigned long node_end_pfn,
@@ -6117,7 +6269,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
6117} 6269}
6118 6270
6119#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6271#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6120static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 6272static inline unsigned long __init zone_spanned_pages_in_node(int nid,
6121 unsigned long zone_type, 6273 unsigned long zone_type,
6122 unsigned long node_start_pfn, 6274 unsigned long node_start_pfn,
6123 unsigned long node_end_pfn, 6275 unsigned long node_end_pfn,
@@ -6136,7 +6288,7 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
6136 return zones_size[zone_type]; 6288 return zones_size[zone_type];
6137} 6289}
6138 6290
6139static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 6291static inline unsigned long __init zone_absent_pages_in_node(int nid,
6140 unsigned long zone_type, 6292 unsigned long zone_type,
6141 unsigned long node_start_pfn, 6293 unsigned long node_start_pfn,
6142 unsigned long node_end_pfn, 6294 unsigned long node_end_pfn,
@@ -6150,7 +6302,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
6150 6302
6151#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6303#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6152 6304
6153static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 6305static void __init calculate_node_totalpages(struct pglist_data *pgdat,
6154 unsigned long node_start_pfn, 6306 unsigned long node_start_pfn,
6155 unsigned long node_end_pfn, 6307 unsigned long node_end_pfn,
6156 unsigned long *zones_size, 6308 unsigned long *zones_size,
@@ -6323,7 +6475,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
6323static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, 6475static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
6324 unsigned long remaining_pages) 6476 unsigned long remaining_pages)
6325{ 6477{
6326 zone->managed_pages = remaining_pages; 6478 atomic_long_set(&zone->managed_pages, remaining_pages);
6327 zone_set_nid(zone, nid); 6479 zone_set_nid(zone, nid);
6328 zone->name = zone_names[idx]; 6480 zone->name = zone_names[idx];
6329 zone->zone_pgdat = NODE_DATA(nid); 6481 zone->zone_pgdat = NODE_DATA(nid);
@@ -6476,12 +6628,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
6476#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 6628#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
6477static inline void pgdat_set_deferred_range(pg_data_t *pgdat) 6629static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
6478{ 6630{
6479 /*
6480 * We start only with one section of pages, more pages are added as
6481 * needed until the rest of deferred pages are initialized.
6482 */
6483 pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
6484 pgdat->node_spanned_pages);
6485 pgdat->first_deferred_pfn = ULONG_MAX; 6631 pgdat->first_deferred_pfn = ULONG_MAX;
6486} 6632}
6487#else 6633#else
@@ -7075,18 +7221,16 @@ early_param("movablecore", cmdline_parse_movablecore);
7075 7221
7076void adjust_managed_page_count(struct page *page, long count) 7222void adjust_managed_page_count(struct page *page, long count)
7077{ 7223{
7078 spin_lock(&managed_page_count_lock); 7224 atomic_long_add(count, &page_zone(page)->managed_pages);
7079 page_zone(page)->managed_pages += count; 7225 totalram_pages_add(count);
7080 totalram_pages += count;
7081#ifdef CONFIG_HIGHMEM 7226#ifdef CONFIG_HIGHMEM
7082 if (PageHighMem(page)) 7227 if (PageHighMem(page))
7083 totalhigh_pages += count; 7228 totalhigh_pages_add(count);
7084#endif 7229#endif
7085 spin_unlock(&managed_page_count_lock);
7086} 7230}
7087EXPORT_SYMBOL(adjust_managed_page_count); 7231EXPORT_SYMBOL(adjust_managed_page_count);
7088 7232
7089unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 7233unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
7090{ 7234{
7091 void *pos; 7235 void *pos;
7092 unsigned long pages = 0; 7236 unsigned long pages = 0;
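
Several hunks above convert zone->managed_pages, totalram_pages and totalhigh_pages from spinlock-protected counters into atomic longs read through small accessors such as zone_managed_pages(). A loose userspace analogue of that conversion using C11 atomics; this is only an analogy, not the kernel's atomic_long_t machinery:

#include <stdatomic.h>
#include <stdio.h>

struct zone {
        /* was: unsigned long managed_pages protected by managed_page_count_lock */
        atomic_long managed_pages;
};

static long zone_managed_pages(struct zone *zone)
{
        return atomic_load_explicit(&zone->managed_pages, memory_order_relaxed);
}

static void adjust_managed_page_count(struct zone *zone, long count)
{
        /* No lock needed: a single atomic add keeps the counter consistent. */
        atomic_fetch_add_explicit(&zone->managed_pages, count, memory_order_relaxed);
}

int main(void)
{
        struct zone z;

        atomic_init(&z.managed_pages, 1000);
        adjust_managed_page_count(&z, 512);
        adjust_managed_page_count(&z, -8);
        printf("managed pages: %ld\n", zone_managed_pages(&z));  /* 1504 */
        return 0;
}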
@@ -7123,9 +7267,9 @@ EXPORT_SYMBOL(free_reserved_area);
7123void free_highmem_page(struct page *page) 7267void free_highmem_page(struct page *page)
7124{ 7268{
7125 __free_reserved_page(page); 7269 __free_reserved_page(page);
7126 totalram_pages++; 7270 totalram_pages_inc();
7127 page_zone(page)->managed_pages++; 7271 atomic_long_inc(&page_zone(page)->managed_pages);
7128 totalhigh_pages++; 7272 totalhigh_pages_inc();
7129} 7273}
7130#endif 7274#endif
7131 7275
@@ -7174,10 +7318,10 @@ void __init mem_init_print_info(const char *str)
7174 physpages << (PAGE_SHIFT - 10), 7318 physpages << (PAGE_SHIFT - 10),
7175 codesize >> 10, datasize >> 10, rosize >> 10, 7319 codesize >> 10, datasize >> 10, rosize >> 10,
7176 (init_data_size + init_code_size) >> 10, bss_size >> 10, 7320 (init_data_size + init_code_size) >> 10, bss_size >> 10,
7177 (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), 7321 (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
7178 totalcma_pages << (PAGE_SHIFT - 10), 7322 totalcma_pages << (PAGE_SHIFT - 10),
7179#ifdef CONFIG_HIGHMEM 7323#ifdef CONFIG_HIGHMEM
7180 totalhigh_pages << (PAGE_SHIFT - 10), 7324 totalhigh_pages() << (PAGE_SHIFT - 10),
7181#endif 7325#endif
7182 str ? ", " : "", str ? str : ""); 7326 str ? ", " : "", str ? str : "");
7183} 7327}
@@ -7257,6 +7401,7 @@ static void calculate_totalreserve_pages(void)
7257 for (i = 0; i < MAX_NR_ZONES; i++) { 7401 for (i = 0; i < MAX_NR_ZONES; i++) {
7258 struct zone *zone = pgdat->node_zones + i; 7402 struct zone *zone = pgdat->node_zones + i;
7259 long max = 0; 7403 long max = 0;
7404 unsigned long managed_pages = zone_managed_pages(zone);
7260 7405
7261 /* Find valid and maximum lowmem_reserve in the zone */ 7406 /* Find valid and maximum lowmem_reserve in the zone */
7262 for (j = i; j < MAX_NR_ZONES; j++) { 7407 for (j = i; j < MAX_NR_ZONES; j++) {
@@ -7267,8 +7412,8 @@ static void calculate_totalreserve_pages(void)
7267 /* we treat the high watermark as reserved pages. */ 7412 /* we treat the high watermark as reserved pages. */
7268 max += high_wmark_pages(zone); 7413 max += high_wmark_pages(zone);
7269 7414
7270 if (max > zone->managed_pages) 7415 if (max > managed_pages)
7271 max = zone->managed_pages; 7416 max = managed_pages;
7272 7417
7273 pgdat->totalreserve_pages += max; 7418 pgdat->totalreserve_pages += max;
7274 7419
@@ -7292,7 +7437,7 @@ static void setup_per_zone_lowmem_reserve(void)
7292 for_each_online_pgdat(pgdat) { 7437 for_each_online_pgdat(pgdat) {
7293 for (j = 0; j < MAX_NR_ZONES; j++) { 7438 for (j = 0; j < MAX_NR_ZONES; j++) {
7294 struct zone *zone = pgdat->node_zones + j; 7439 struct zone *zone = pgdat->node_zones + j;
7295 unsigned long managed_pages = zone->managed_pages; 7440 unsigned long managed_pages = zone_managed_pages(zone);
7296 7441
7297 zone->lowmem_reserve[j] = 0; 7442 zone->lowmem_reserve[j] = 0;
7298 7443
@@ -7310,7 +7455,7 @@ static void setup_per_zone_lowmem_reserve(void)
7310 lower_zone->lowmem_reserve[j] = 7455 lower_zone->lowmem_reserve[j] =
7311 managed_pages / sysctl_lowmem_reserve_ratio[idx]; 7456 managed_pages / sysctl_lowmem_reserve_ratio[idx];
7312 } 7457 }
7313 managed_pages += lower_zone->managed_pages; 7458 managed_pages += zone_managed_pages(lower_zone);
7314 } 7459 }
7315 } 7460 }
7316 } 7461 }
@@ -7329,14 +7474,14 @@ static void __setup_per_zone_wmarks(void)
7329 /* Calculate total number of !ZONE_HIGHMEM pages */ 7474 /* Calculate total number of !ZONE_HIGHMEM pages */
7330 for_each_zone(zone) { 7475 for_each_zone(zone) {
7331 if (!is_highmem(zone)) 7476 if (!is_highmem(zone))
7332 lowmem_pages += zone->managed_pages; 7477 lowmem_pages += zone_managed_pages(zone);
7333 } 7478 }
7334 7479
7335 for_each_zone(zone) { 7480 for_each_zone(zone) {
7336 u64 tmp; 7481 u64 tmp;
7337 7482
7338 spin_lock_irqsave(&zone->lock, flags); 7483 spin_lock_irqsave(&zone->lock, flags);
7339 tmp = (u64)pages_min * zone->managed_pages; 7484 tmp = (u64)pages_min * zone_managed_pages(zone);
7340 do_div(tmp, lowmem_pages); 7485 do_div(tmp, lowmem_pages);
7341 if (is_highmem(zone)) { 7486 if (is_highmem(zone)) {
7342 /* 7487 /*
@@ -7350,15 +7495,15 @@ static void __setup_per_zone_wmarks(void)
7350 */ 7495 */
7351 unsigned long min_pages; 7496 unsigned long min_pages;
7352 7497
7353 min_pages = zone->managed_pages / 1024; 7498 min_pages = zone_managed_pages(zone) / 1024;
7354 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 7499 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
7355 zone->watermark[WMARK_MIN] = min_pages; 7500 zone->_watermark[WMARK_MIN] = min_pages;
7356 } else { 7501 } else {
7357 /* 7502 /*
7358 * If it's a lowmem zone, reserve a number of pages 7503 * If it's a lowmem zone, reserve a number of pages
7359 * proportionate to the zone's size. 7504 * proportionate to the zone's size.
7360 */ 7505 */
7361 zone->watermark[WMARK_MIN] = tmp; 7506 zone->_watermark[WMARK_MIN] = tmp;
7362 } 7507 }
7363 7508
7364 /* 7509 /*
@@ -7367,11 +7512,12 @@ static void __setup_per_zone_wmarks(void)
7367 * ensure a minimum size on small systems. 7512 * ensure a minimum size on small systems.
7368 */ 7513 */
7369 tmp = max_t(u64, tmp >> 2, 7514 tmp = max_t(u64, tmp >> 2,
7370 mult_frac(zone->managed_pages, 7515 mult_frac(zone_managed_pages(zone),
7371 watermark_scale_factor, 10000)); 7516 watermark_scale_factor, 10000));
7372 7517
7373 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; 7518 zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
7374 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; 7519 zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
7520 zone->watermark_boost = 0;
7375 7521
7376 spin_unlock_irqrestore(&zone->lock, flags); 7522 spin_unlock_irqrestore(&zone->lock, flags);
7377 } 7523 }
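
__setup_per_zone_wmarks() above gives each zone a min watermark proportional to its share of managed pages, then spaces the low and high marks by the larger of min/4 and watermark_scale_factor/10000 of managed pages, and clears watermark_boost. The same arithmetic on invented zone sizes; every number below is illustrative only:

#include <stdio.h>

#define mult_frac(x, num, den) (((x) / (den)) * (num) + ((x) % (den)) * (num) / (den))
#define max(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
        unsigned long pages_min = 1024 * 1024 / 4096;   /* min_free_kbytes=1024 -> 256 pages   */
        unsigned long lowmem_pages = 2000000;           /* all !HIGHMEM managed pages (made up) */
        unsigned long managed = 500000;                 /* this zone's managed pages (made up)  */
        unsigned long watermark_scale_factor = 10;

        unsigned long wmark_min = pages_min * managed / lowmem_pages;
        unsigned long tmp = max(wmark_min / 4,
                                mult_frac(managed, watermark_scale_factor, 10000));

        /* min=64 low=564 high=1064 for these made-up numbers */
        printf("min=%lu low=%lu high=%lu boost=0\n",
               wmark_min, wmark_min + tmp, wmark_min + 2 * tmp);
        return 0;
}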
@@ -7472,6 +7618,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
7472 return 0; 7618 return 0;
7473} 7619}
7474 7620
7621int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
7622 void __user *buffer, size_t *length, loff_t *ppos)
7623{
7624 int rc;
7625
7626 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7627 if (rc)
7628 return rc;
7629
7630 return 0;
7631}
7632
7475int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, 7633int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
7476 void __user *buffer, size_t *length, loff_t *ppos) 7634 void __user *buffer, size_t *length, loff_t *ppos)
7477{ 7635{
@@ -7497,8 +7655,8 @@ static void setup_min_unmapped_ratio(void)
7497 pgdat->min_unmapped_pages = 0; 7655 pgdat->min_unmapped_pages = 0;
7498 7656
7499 for_each_zone(zone) 7657 for_each_zone(zone)
7500 zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * 7658 zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
7501 sysctl_min_unmapped_ratio) / 100; 7659 sysctl_min_unmapped_ratio) / 100;
7502} 7660}
7503 7661
7504 7662
@@ -7525,8 +7683,8 @@ static void setup_min_slab_ratio(void)
7525 pgdat->min_slab_pages = 0; 7683 pgdat->min_slab_pages = 0;
7526 7684
7527 for_each_zone(zone) 7685 for_each_zone(zone)
7528 zone->zone_pgdat->min_slab_pages += (zone->managed_pages * 7686 zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
7529 sysctl_min_slab_ratio) / 100; 7687 sysctl_min_slab_ratio) / 100;
7530} 7688}
7531 7689
7532int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, 7690int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
@@ -7766,8 +7924,7 @@ void *__init alloc_large_system_hash(const char *tablename,
7766 * race condition. So you can't expect this function to be exact. 7924
7767 */ 7925 */
7768bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 7926bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7769 int migratetype, 7927 int migratetype, int flags)
7770 bool skip_hwpoisoned_pages)
7771{ 7928{
7772 unsigned long pfn, iter, found; 7929 unsigned long pfn, iter, found;
7773 7930
@@ -7841,7 +7998,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7841 * The HWPoisoned page may be not in buddy system, and 7998 * The HWPoisoned page may be not in buddy system, and
7842 * page_count() is not 0. 7999 * page_count() is not 0.
7843 */ 8000 */
7844 if (skip_hwpoisoned_pages && PageHWPoison(page)) 8001 if ((flags & SKIP_HWPOISON) && PageHWPoison(page))
7845 continue; 8002 continue;
7846 8003
7847 if (__PageMovable(page)) 8004 if (__PageMovable(page))
@@ -7868,6 +8025,8 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7868 return false; 8025 return false;
7869unmovable: 8026unmovable:
7870 WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); 8027 WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
8028 if (flags & REPORT_FAILURE)
8029 dump_page(pfn_to_page(pfn+iter), "unmovable page");
7871 return true; 8030 return true;
7872} 8031}
7873 8032
@@ -7994,8 +8153,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
7994 */ 8153 */
7995 8154
7996 ret = start_isolate_page_range(pfn_max_align_down(start), 8155 ret = start_isolate_page_range(pfn_max_align_down(start),
7997 pfn_max_align_up(end), migratetype, 8156 pfn_max_align_up(end), migratetype, 0);
7998 false);
7999 if (ret) 8157 if (ret)
8000 return ret; 8158 return ret;
8001 8159
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 43e085608846..ce323e56b34d 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -15,8 +15,7 @@
15#define CREATE_TRACE_POINTS 15#define CREATE_TRACE_POINTS
16#include <trace/events/page_isolation.h> 16#include <trace/events/page_isolation.h>
17 17
18static int set_migratetype_isolate(struct page *page, int migratetype, 18static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
19 bool skip_hwpoisoned_pages)
20{ 19{
21 struct zone *zone; 20 struct zone *zone;
22 unsigned long flags, pfn; 21 unsigned long flags, pfn;
@@ -60,8 +59,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype,
60 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 59 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
61 * We just check MOVABLE pages. 60 * We just check MOVABLE pages.
62 */ 61 */
63 if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, 62 if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, flags))
64 skip_hwpoisoned_pages))
65 ret = 0; 63 ret = 0;
66 64
67 /* 65 /*
@@ -185,7 +183,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
185 * prevents two threads from simultaneously working on overlapping ranges. 183 * prevents two threads from simultaneously working on overlapping ranges.
186 */ 184 */
187int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, 185int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
188 unsigned migratetype, bool skip_hwpoisoned_pages) 186 unsigned migratetype, int flags)
189{ 187{
190 unsigned long pfn; 188 unsigned long pfn;
191 unsigned long undo_pfn; 189 unsigned long undo_pfn;
@@ -199,7 +197,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
199 pfn += pageblock_nr_pages) { 197 pfn += pageblock_nr_pages) {
200 page = __first_valid_page(pfn, pageblock_nr_pages); 198 page = __first_valid_page(pfn, pageblock_nr_pages);
201 if (page && 199 if (page &&
202 set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) { 200 set_migratetype_isolate(page, migratetype, flags)) {
203 undo_pfn = pfn; 201 undo_pfn = pfn;
204 goto undo; 202 goto undo;
205 } 203 }
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 87bc0dfdb52b..28b06524939f 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -351,6 +351,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
351 .skip = 0 351 .skip = 0
352 }; 352 };
353 353
354 count = min_t(size_t, count, PAGE_SIZE);
354 kbuf = kmalloc(count, GFP_KERNEL); 355 kbuf = kmalloc(count, GFP_KERNEL);
355 if (!kbuf) 356 if (!kbuf)
356 return -ENOMEM; 357 return -ENOMEM;
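
Note: the one-line page_owner.c change above bounds the user-supplied read length before it sizes the kmalloc() in print_page_owner(), so a read() with a huge count can no longer request an arbitrarily large kernel buffer. A small userspace sketch of the same clamp-before-allocate pattern; the helper name is invented for illustration:

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096UL

    /* Hypothetical helper mirroring the fix: clamp the caller-controlled
     * length to one page before it becomes an allocation size. */
    static char *alloc_report_buffer(size_t count)
    {
            if (count > PAGE_SIZE)
                    count = PAGE_SIZE;  /* min_t(size_t, count, PAGE_SIZE) */
            return malloc(count);
    }

    int main(void)
    {
            char *buf = alloc_report_buffer(1UL << 30); /* caller asked for 1 GiB */

            if (buf)
                    printf("allocated at most %lu bytes\n", PAGE_SIZE);
            free(buf);
            return 0;
    }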
diff --git a/mm/readahead.c b/mm/readahead.c
index f3d6f9656a3c..1ae16522412a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -270,17 +270,15 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
270 * return it as the new window size. 270 * return it as the new window size.
271 */ 271 */
272static unsigned long get_next_ra_size(struct file_ra_state *ra, 272static unsigned long get_next_ra_size(struct file_ra_state *ra,
273 unsigned long max) 273 unsigned long max)
274{ 274{
275 unsigned long cur = ra->size; 275 unsigned long cur = ra->size;
276 unsigned long newsize;
277 276
278 if (cur < max / 16) 277 if (cur < max / 16)
279 newsize = 4 * cur; 278 return 4 * cur;
280 else 279 if (cur <= max / 2)
281 newsize = 2 * cur; 280 return 2 * cur;
282 281 return max;
283 return min(newsize, max);
284} 282}
285 283
286/* 284/*
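
Note: the rewritten get_next_ra_size() above drops the temporary and the trailing min(): grow the readahead window 4x while it is small, 2x up to half of the maximum, then saturate. A stand-alone copy of that rule, printing one growth sequence; the starting size and maximum are made up:

    #include <stdio.h>

    /* Same growth rule as the new get_next_ra_size(), lifted out of the
     * kernel for illustration. Sizes are in pages. */
    static unsigned long next_ra_size(unsigned long cur, unsigned long max)
    {
            if (cur < max / 16)
                    return 4 * cur;
            if (cur <= max / 2)
                    return 2 * cur;
            return max;
    }

    int main(void)
    {
            unsigned long ra = 4, max = 512;

            while (ra < max) {
                    printf("%lu -> ", ra);
                    ra = next_ra_size(ra, max);
            }
            printf("%lu\n", ra);        /* 4 -> 16 -> 64 -> 128 -> 256 -> 512 */
            return 0;
    }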
diff --git a/mm/rmap.c b/mm/rmap.c
index 85b7f9423352..21a26cf51114 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,6 +25,7 @@
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) 26 * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
27 * mapping->i_mmap_rwsem 27 * mapping->i_mmap_rwsem
28 * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
28 * anon_vma->rwsem 29 * anon_vma->rwsem
29 * mm->page_table_lock or pte_lock 30 * mm->page_table_lock or pte_lock
30 * zone_lru_lock (in mark_page_accessed, isolate_lru_page) 31 * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -889,15 +890,17 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
889 .address = address, 890 .address = address,
890 .flags = PVMW_SYNC, 891 .flags = PVMW_SYNC,
891 }; 892 };
892 unsigned long start = address, end; 893 struct mmu_notifier_range range;
893 int *cleaned = arg; 894 int *cleaned = arg;
894 895
895 /* 896 /*
896 * We have to assume the worse case ie pmd for invalidation. Note that 897 * We have to assume the worse case ie pmd for invalidation. Note that
897 * the page can not be free from this function. 898 * the page can not be free from this function.
898 */ 899 */
899 end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); 900 mmu_notifier_range_init(&range, vma->vm_mm, address,
900 mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); 901 min(vma->vm_end, address +
902 (PAGE_SIZE << compound_order(page))));
903 mmu_notifier_invalidate_range_start(&range);
901 904
902 while (page_vma_mapped_walk(&pvmw)) { 905 while (page_vma_mapped_walk(&pvmw)) {
903 unsigned long cstart; 906 unsigned long cstart;
@@ -949,7 +952,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
949 (*cleaned)++; 952 (*cleaned)++;
950 } 953 }
951 954
952 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); 955 mmu_notifier_invalidate_range_end(&range);
953 956
954 return true; 957 return true;
955} 958}
@@ -1017,7 +1020,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
1017 1020
1018/** 1021/**
1019 * __page_set_anon_rmap - set up new anonymous rmap 1022 * __page_set_anon_rmap - set up new anonymous rmap
1020 * @page: Page to add to rmap 1023 * @page: Page or Hugepage to add to rmap
1021 * @vma: VM area to add page to. 1024 * @vma: VM area to add page to.
1022 * @address: User virtual address of the mapping 1025 * @address: User virtual address of the mapping
1023 * @exclusive: the page is exclusively owned by the current process 1026 * @exclusive: the page is exclusively owned by the current process
@@ -1345,7 +1348,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1345 pte_t pteval; 1348 pte_t pteval;
1346 struct page *subpage; 1349 struct page *subpage;
1347 bool ret = true; 1350 bool ret = true;
1348 unsigned long start = address, end; 1351 struct mmu_notifier_range range;
1349 enum ttu_flags flags = (enum ttu_flags)arg; 1352 enum ttu_flags flags = (enum ttu_flags)arg;
1350 1353
1351 /* munlock has nothing to gain from examining un-locked vmas */ 1354 /* munlock has nothing to gain from examining un-locked vmas */
@@ -1369,15 +1372,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1369 * Note that the page can not be free in this function as call of 1372 * Note that the page can not be free in this function as call of
1370 * try_to_unmap() must hold a reference on the page. 1373 * try_to_unmap() must hold a reference on the page.
1371 */ 1374 */
1372 end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); 1375 mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start,
1376 min(vma->vm_end, vma->vm_start +
1377 (PAGE_SIZE << compound_order(page))));
1373 if (PageHuge(page)) { 1378 if (PageHuge(page)) {
1374 /* 1379 /*
1375 * If sharing is possible, start and end will be adjusted 1380 * If sharing is possible, start and end will be adjusted
1376 * accordingly. 1381 * accordingly.
1382 *
1383 * If called for a huge page, caller must hold i_mmap_rwsem
1384 * in write mode as it is possible to call huge_pmd_unshare.
1377 */ 1385 */
1378 adjust_range_if_pmd_sharing_possible(vma, &start, &end); 1386 adjust_range_if_pmd_sharing_possible(vma, &range.start,
1387 &range.end);
1379 } 1388 }
1380 mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); 1389 mmu_notifier_invalidate_range_start(&range);
1381 1390
1382 while (page_vma_mapped_walk(&pvmw)) { 1391 while (page_vma_mapped_walk(&pvmw)) {
1383#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1392#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -1428,9 +1437,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1428 * we must flush them all. start/end were 1437 * we must flush them all. start/end were
1429 * already adjusted above to cover this range. 1438 * already adjusted above to cover this range.
1430 */ 1439 */
1431 flush_cache_range(vma, start, end); 1440 flush_cache_range(vma, range.start, range.end);
1432 flush_tlb_range(vma, start, end); 1441 flush_tlb_range(vma, range.start, range.end);
1433 mmu_notifier_invalidate_range(mm, start, end); 1442 mmu_notifier_invalidate_range(mm, range.start,
1443 range.end);
1434 1444
1435 /* 1445 /*
1436 * The ref count of the PMD page was dropped 1446 * The ref count of the PMD page was dropped
@@ -1650,7 +1660,7 @@ discard:
1650 put_page(page); 1660 put_page(page);
1651 } 1661 }
1652 1662
1653 mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); 1663 mmu_notifier_invalidate_range_end(&range);
1654 1664
1655 return ret; 1665 return ret;
1656} 1666}
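
Note: both rmap.c hunks above stop passing (mm, start, end) triples and instead bundle them in a struct mmu_notifier_range that is initialised once and handed to the start/end calls. A kernel-context sketch of the call pattern exactly as it appears in this diff (not a stand-alone program; start and end stand for whatever bounds the caller computed):

    struct mmu_notifier_range range;

    mmu_notifier_range_init(&range, vma->vm_mm, start, end);
    mmu_notifier_invalidate_range_start(&range);

    /* ... walk the rmap and clear or unmap PTEs; a mid-walk flush can
     * still use mmu_notifier_invalidate_range(mm, range.start, range.end)
     * as the try_to_unmap_one() hunk shows ... */

    mmu_notifier_invalidate_range_end(&range);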
@@ -1910,27 +1920,10 @@ void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
1910 1920
1911#ifdef CONFIG_HUGETLB_PAGE 1921#ifdef CONFIG_HUGETLB_PAGE
1912/* 1922/*
1913 * The following three functions are for anonymous (private mapped) hugepages. 1923 * The following two functions are for anonymous (private mapped) hugepages.
1914 * Unlike common anonymous pages, anonymous hugepages have no accounting code 1924 * Unlike common anonymous pages, anonymous hugepages have no accounting code
1915 * and no lru code, because we handle hugepages differently from common pages. 1925 * and no lru code, because we handle hugepages differently from common pages.
1916 */ 1926 */
1917static void __hugepage_set_anon_rmap(struct page *page,
1918 struct vm_area_struct *vma, unsigned long address, int exclusive)
1919{
1920 struct anon_vma *anon_vma = vma->anon_vma;
1921
1922 BUG_ON(!anon_vma);
1923
1924 if (PageAnon(page))
1925 return;
1926 if (!exclusive)
1927 anon_vma = anon_vma->root;
1928
1929 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1930 page->mapping = (struct address_space *) anon_vma;
1931 page->index = linear_page_index(vma, address);
1932}
1933
1934void hugepage_add_anon_rmap(struct page *page, 1927void hugepage_add_anon_rmap(struct page *page,
1935 struct vm_area_struct *vma, unsigned long address) 1928 struct vm_area_struct *vma, unsigned long address)
1936{ 1929{
@@ -1942,7 +1935,7 @@ void hugepage_add_anon_rmap(struct page *page,
1942 /* address might be in next vma when migration races vma_adjust */ 1935 /* address might be in next vma when migration races vma_adjust */
1943 first = atomic_inc_and_test(compound_mapcount_ptr(page)); 1936 first = atomic_inc_and_test(compound_mapcount_ptr(page));
1944 if (first) 1937 if (first)
1945 __hugepage_set_anon_rmap(page, vma, address, 0); 1938 __page_set_anon_rmap(page, vma, address, 0);
1946} 1939}
1947 1940
1948void hugepage_add_new_anon_rmap(struct page *page, 1941void hugepage_add_new_anon_rmap(struct page *page,
@@ -1950,6 +1943,6 @@ void hugepage_add_new_anon_rmap(struct page *page,
1950{ 1943{
1951 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1944 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1952 atomic_set(compound_mapcount_ptr(page), 0); 1945 atomic_set(compound_mapcount_ptr(page), 0);
1953 __hugepage_set_anon_rmap(page, vma, address, 1); 1946 __page_set_anon_rmap(page, vma, address, 1);
1954} 1947}
1955#endif /* CONFIG_HUGETLB_PAGE */ 1948#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index 375f3ac19bb8..6ece1e2fe76e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -109,12 +109,14 @@ struct shmem_falloc {
109#ifdef CONFIG_TMPFS 109#ifdef CONFIG_TMPFS
110static unsigned long shmem_default_max_blocks(void) 110static unsigned long shmem_default_max_blocks(void)
111{ 111{
112 return totalram_pages / 2; 112 return totalram_pages() / 2;
113} 113}
114 114
115static unsigned long shmem_default_max_inodes(void) 115static unsigned long shmem_default_max_inodes(void)
116{ 116{
117 return min(totalram_pages - totalhigh_pages, totalram_pages / 2); 117 unsigned long nr_pages = totalram_pages();
118
119 return min(nr_pages - totalhigh_pages(), nr_pages / 2);
118} 120}
119#endif 121#endif
120 122
@@ -3301,7 +3303,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
3301 size = memparse(value,&rest); 3303 size = memparse(value,&rest);
3302 if (*rest == '%') { 3304 if (*rest == '%') {
3303 size <<= PAGE_SHIFT; 3305 size <<= PAGE_SHIFT;
3304 size *= totalram_pages; 3306 size *= totalram_pages();
3305 do_div(size, 100); 3307 do_div(size, 100);
3306 rest++; 3308 rest++;
3307 } 3309 }
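
Note: the shmem.c hunk is one of many in this series (see also slab.c, swap.c, util.c, vmalloc.c, workingset.c and zswap.c below) that replace direct reads of the totalram_pages and totalhigh_pages variables with totalram_pages()/totalhigh_pages() accessor calls. A toy userspace model of the conversion; the real counter behind the kernel helper is an atomic type, and that plumbing is deliberately omitted here:

    #include <stdio.h>

    /* Simplified stand-in: callers stop touching the counter directly and
     * go through an accessor, which lets the backing storage change (for
     * example to an atomic) without touching every call site. */
    static unsigned long totalram_counter = 2097152;    /* 8 GiB in 4 KiB pages, made up */

    static unsigned long totalram_pages(void)
    {
            return totalram_counter;
    }

    int main(void)
    {
            printf("default tmpfs size: %lu pages\n", totalram_pages() / 2);
            return 0;
    }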
diff --git a/mm/slab.c b/mm/slab.c
index 3abb9feb3818..73fe23e649c9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,19 +406,6 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
406 return page->s_mem + cache->size * idx; 406 return page->s_mem + cache->size * idx;
407} 407}
408 408
409/*
410 * We want to avoid an expensive divide : (offset / cache->size)
411 * Using the fact that size is a constant for a particular cache,
412 * we can replace (offset / cache->size) by
413 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
414 */
415static inline unsigned int obj_to_index(const struct kmem_cache *cache,
416 const struct page *page, void *obj)
417{
418 u32 offset = (obj - page->s_mem);
419 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
420}
421
422#define BOOT_CPUCACHE_ENTRIES 1 409#define BOOT_CPUCACHE_ENTRIES 1
423/* internal cache of cache description objs */ 410/* internal cache of cache description objs */
424static struct kmem_cache kmem_cache_boot = { 411static struct kmem_cache kmem_cache_boot = {
@@ -1248,7 +1235,7 @@ void __init kmem_cache_init(void)
1248 * page orders on machines with more than 32MB of memory if 1235 * page orders on machines with more than 32MB of memory if
1249 * not overridden on the command line. 1236 * not overridden on the command line.
1250 */ 1237 */
1251 if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) 1238 if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT)
1252 slab_max_order = SLAB_MAX_ORDER_HI; 1239 slab_max_order = SLAB_MAX_ORDER_HI;
1253 1240
1254 /* Bootstrap is tricky, because several objects are allocated 1241 /* Bootstrap is tricky, because several objects are allocated
@@ -2370,7 +2357,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
2370 void *freelist; 2357 void *freelist;
2371 void *addr = page_address(page); 2358 void *addr = page_address(page);
2372 2359
2373 page->s_mem = addr + colour_off; 2360 page->s_mem = kasan_reset_tag(addr) + colour_off;
2374 page->active = 0; 2361 page->active = 0;
2375 2362
2376 if (OBJFREELIST_SLAB(cachep)) 2363 if (OBJFREELIST_SLAB(cachep))
@@ -2574,7 +2561,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2574 2561
2575 for (i = 0; i < cachep->num; i++) { 2562 for (i = 0; i < cachep->num; i++) {
2576 objp = index_to_obj(cachep, page, i); 2563 objp = index_to_obj(cachep, page, i);
2577 kasan_init_slab_obj(cachep, objp); 2564 objp = kasan_init_slab_obj(cachep, objp);
2578 2565
2579 /* constructor could break poison info */ 2566 /* constructor could break poison info */
2580 if (DEBUG == 0 && cachep->ctor) { 2567 if (DEBUG == 0 && cachep->ctor) {
@@ -3551,7 +3538,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3551{ 3538{
3552 void *ret = slab_alloc(cachep, flags, _RET_IP_); 3539 void *ret = slab_alloc(cachep, flags, _RET_IP_);
3553 3540
3554 kasan_slab_alloc(cachep, ret, flags); 3541 ret = kasan_slab_alloc(cachep, ret, flags);
3555 trace_kmem_cache_alloc(_RET_IP_, ret, 3542 trace_kmem_cache_alloc(_RET_IP_, ret,
3556 cachep->object_size, cachep->size, flags); 3543 cachep->object_size, cachep->size, flags);
3557 3544
@@ -3617,7 +3604,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
3617 3604
3618 ret = slab_alloc(cachep, flags, _RET_IP_); 3605 ret = slab_alloc(cachep, flags, _RET_IP_);
3619 3606
3620 kasan_kmalloc(cachep, ret, size, flags); 3607 ret = kasan_kmalloc(cachep, ret, size, flags);
3621 trace_kmalloc(_RET_IP_, ret, 3608 trace_kmalloc(_RET_IP_, ret,
3622 size, cachep->size, flags); 3609 size, cachep->size, flags);
3623 return ret; 3610 return ret;
@@ -3641,7 +3628,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3641{ 3628{
3642 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3629 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3643 3630
3644 kasan_slab_alloc(cachep, ret, flags); 3631 ret = kasan_slab_alloc(cachep, ret, flags);
3645 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3632 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3646 cachep->object_size, cachep->size, 3633 cachep->object_size, cachep->size,
3647 flags, nodeid); 3634 flags, nodeid);
@@ -3660,7 +3647,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
3660 3647
3661 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3648 ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
3662 3649
3663 kasan_kmalloc(cachep, ret, size, flags); 3650 ret = kasan_kmalloc(cachep, ret, size, flags);
3664 trace_kmalloc_node(_RET_IP_, ret, 3651 trace_kmalloc_node(_RET_IP_, ret,
3665 size, cachep->size, 3652 size, cachep->size,
3666 flags, nodeid); 3653 flags, nodeid);
@@ -3681,7 +3668,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3681 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3668 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3682 return cachep; 3669 return cachep;
3683 ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); 3670 ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
3684 kasan_kmalloc(cachep, ret, size, flags); 3671 ret = kasan_kmalloc(cachep, ret, size, flags);
3685 3672
3686 return ret; 3673 return ret;
3687} 3674}
@@ -3719,7 +3706,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3719 return cachep; 3706 return cachep;
3720 ret = slab_alloc(cachep, flags, caller); 3707 ret = slab_alloc(cachep, flags, caller);
3721 3708
3722 kasan_kmalloc(cachep, ret, size, flags); 3709 ret = kasan_kmalloc(cachep, ret, size, flags);
3723 trace_kmalloc(caller, ret, 3710 trace_kmalloc(caller, ret,
3724 size, cachep->size, flags); 3711 size, cachep->size, flags);
3725 3712
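
Note: every kasan_slab_alloc()/kasan_kmalloc()/kasan_init_slab_obj() call in slab.c (and in slab.h, slab_common.c and slub.c below) now assigns the hook's return value back to the object pointer. With the software tag-based mode added in this series, the hook may hand back a pointer whose top byte carries a tag, so callers must continue with the returned value; the same reasoning explains why the krealloc() hunk below compares tag-stripped pointers. A userspace model of that top-byte tagging; the tag values and helpers are illustrative only and assume 64-bit pointers:

    #include <stdint.h>
    #include <stdio.h>

    #define TAG_SHIFT 56

    /* Place an arbitrary tag in the top byte of a pointer (ignored by the
     * hardware on arm64 with Top Byte Ignore) and strip it again. */
    static void *set_tag(void *ptr, uint8_t tag)
    {
            uintptr_t p = (uintptr_t)ptr & ~((uintptr_t)0xff << TAG_SHIFT);

            return (void *)(p | ((uintptr_t)tag << TAG_SHIFT));
    }

    static void *reset_tag(void *ptr)
    {
            return set_tag(ptr, 0x00);
    }

    int main(void)
    {
            int object;
            void *a = set_tag(&object, 0x2a);   /* what a tagging allocator might return */
            void *b = set_tag(&object, 0x07);   /* same memory, different tag */

            printf("a=%p b=%p\n", a, b);
            /* Same allocation once tags are stripped: the check krealloc()
             * now performs with kasan_reset_tag(). */
            printf("same object: %d\n", reset_tag(a) == reset_tag(b));
            return 0;
    }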
diff --git a/mm/slab.h b/mm/slab.h
index 58c6c1c2a78e..4190c24ef0e9 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -441,7 +441,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
441 441
442 kmemleak_alloc_recursive(object, s->object_size, 1, 442 kmemleak_alloc_recursive(object, s->object_size, 1,
443 s->flags, flags); 443 s->flags, flags);
444 kasan_slab_alloc(s, object, flags); 444 p[i] = kasan_slab_alloc(s, object, flags);
445 } 445 }
446 446
447 if (memcg_kmem_enabled()) 447 if (memcg_kmem_enabled())
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 9c11e8a937d2..70b0cc85db67 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1029,10 +1029,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
1029 1029
1030 index = size_index[size_index_elem(size)]; 1030 index = size_index[size_index_elem(size)];
1031 } else { 1031 } else {
1032 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { 1032 if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE))
1033 WARN_ON(1);
1034 return NULL; 1033 return NULL;
1035 }
1036 index = fls(size - 1); 1034 index = fls(size - 1);
1037 } 1035 }
1038 1036
@@ -1204,7 +1202,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
1204 page = alloc_pages(flags, order); 1202 page = alloc_pages(flags, order);
1205 ret = page ? page_address(page) : NULL; 1203 ret = page ? page_address(page) : NULL;
1206 kmemleak_alloc(ret, size, 1, flags); 1204 kmemleak_alloc(ret, size, 1, flags);
1207 kasan_kmalloc_large(ret, size, flags); 1205 ret = kasan_kmalloc_large(ret, size, flags);
1208 return ret; 1206 return ret;
1209} 1207}
1210EXPORT_SYMBOL(kmalloc_order); 1208EXPORT_SYMBOL(kmalloc_order);
@@ -1482,7 +1480,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
1482 ks = ksize(p); 1480 ks = ksize(p);
1483 1481
1484 if (ks >= new_size) { 1482 if (ks >= new_size) {
1485 kasan_krealloc((void *)p, new_size, flags); 1483 p = kasan_krealloc((void *)p, new_size, flags);
1486 return (void *)p; 1484 return (void *)p;
1487 } 1485 }
1488 1486
@@ -1534,7 +1532,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
1534 } 1532 }
1535 1533
1536 ret = __do_krealloc(p, new_size, flags); 1534 ret = __do_krealloc(p, new_size, flags);
1537 if (ret && p != ret) 1535 if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
1538 kfree(p); 1536 kfree(p);
1539 1537
1540 return ret; 1538 return ret;
diff --git a/mm/slub.c b/mm/slub.c
index e3629cd7aff1..36c0befeebd8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1372,10 +1372,10 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
1372 * Hooks for other subsystems that check memory allocations. In a typical 1372 * Hooks for other subsystems that check memory allocations. In a typical
1373 * production configuration these hooks all should produce no code at all. 1373 * production configuration these hooks all should produce no code at all.
1374 */ 1374 */
1375static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) 1375static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1376{ 1376{
1377 kmemleak_alloc(ptr, size, 1, flags); 1377 kmemleak_alloc(ptr, size, 1, flags);
1378 kasan_kmalloc_large(ptr, size, flags); 1378 return kasan_kmalloc_large(ptr, size, flags);
1379} 1379}
1380 1380
1381static __always_inline void kfree_hook(void *x) 1381static __always_inline void kfree_hook(void *x)
@@ -1451,16 +1451,17 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
1451#endif 1451#endif
1452} 1452}
1453 1453
1454static void setup_object(struct kmem_cache *s, struct page *page, 1454static void *setup_object(struct kmem_cache *s, struct page *page,
1455 void *object) 1455 void *object)
1456{ 1456{
1457 setup_object_debug(s, page, object); 1457 setup_object_debug(s, page, object);
1458 kasan_init_slab_obj(s, object); 1458 object = kasan_init_slab_obj(s, object);
1459 if (unlikely(s->ctor)) { 1459 if (unlikely(s->ctor)) {
1460 kasan_unpoison_object_data(s, object); 1460 kasan_unpoison_object_data(s, object);
1461 s->ctor(object); 1461 s->ctor(object);
1462 kasan_poison_object_data(s, object); 1462 kasan_poison_object_data(s, object);
1463 } 1463 }
1464 return object;
1464} 1465}
1465 1466
1466/* 1467/*
@@ -1568,16 +1569,16 @@ static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
1568 /* First entry is used as the base of the freelist */ 1569 /* First entry is used as the base of the freelist */
1569 cur = next_freelist_entry(s, page, &pos, start, page_limit, 1570 cur = next_freelist_entry(s, page, &pos, start, page_limit,
1570 freelist_count); 1571 freelist_count);
1572 cur = setup_object(s, page, cur);
1571 page->freelist = cur; 1573 page->freelist = cur;
1572 1574
1573 for (idx = 1; idx < page->objects; idx++) { 1575 for (idx = 1; idx < page->objects; idx++) {
1574 setup_object(s, page, cur);
1575 next = next_freelist_entry(s, page, &pos, start, page_limit, 1576 next = next_freelist_entry(s, page, &pos, start, page_limit,
1576 freelist_count); 1577 freelist_count);
1578 next = setup_object(s, page, next);
1577 set_freepointer(s, cur, next); 1579 set_freepointer(s, cur, next);
1578 cur = next; 1580 cur = next;
1579 } 1581 }
1580 setup_object(s, page, cur);
1581 set_freepointer(s, cur, NULL); 1582 set_freepointer(s, cur, NULL);
1582 1583
1583 return true; 1584 return true;
@@ -1599,7 +1600,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1599 struct page *page; 1600 struct page *page;
1600 struct kmem_cache_order_objects oo = s->oo; 1601 struct kmem_cache_order_objects oo = s->oo;
1601 gfp_t alloc_gfp; 1602 gfp_t alloc_gfp;
1602 void *start, *p; 1603 void *start, *p, *next;
1603 int idx, order; 1604 int idx, order;
1604 bool shuffle; 1605 bool shuffle;
1605 1606
@@ -1651,13 +1652,16 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1651 1652
1652 if (!shuffle) { 1653 if (!shuffle) {
1653 for_each_object_idx(p, idx, s, start, page->objects) { 1654 for_each_object_idx(p, idx, s, start, page->objects) {
1654 setup_object(s, page, p); 1655 if (likely(idx < page->objects)) {
1655 if (likely(idx < page->objects)) 1656 next = p + s->size;
1656 set_freepointer(s, p, p + s->size); 1657 next = setup_object(s, page, next);
1657 else 1658 set_freepointer(s, p, next);
1659 } else
1658 set_freepointer(s, p, NULL); 1660 set_freepointer(s, p, NULL);
1659 } 1661 }
1660 page->freelist = fixup_red_left(s, start); 1662 start = fixup_red_left(s, start);
1663 start = setup_object(s, page, start);
1664 page->freelist = start;
1661 } 1665 }
1662 1666
1663 page->inuse = page->objects; 1667 page->inuse = page->objects;
@@ -2127,26 +2131,15 @@ redo:
2127 } 2131 }
2128 2132
2129 if (l != m) { 2133 if (l != m) {
2130
2131 if (l == M_PARTIAL) 2134 if (l == M_PARTIAL)
2132
2133 remove_partial(n, page); 2135 remove_partial(n, page);
2134
2135 else if (l == M_FULL) 2136 else if (l == M_FULL)
2136
2137 remove_full(s, n, page); 2137 remove_full(s, n, page);
2138 2138
2139 if (m == M_PARTIAL) { 2139 if (m == M_PARTIAL)
2140
2141 add_partial(n, page, tail); 2140 add_partial(n, page, tail);
2142 stat(s, tail); 2141 else if (m == M_FULL)
2143
2144 } else if (m == M_FULL) {
2145
2146 stat(s, DEACTIVATE_FULL);
2147 add_full(s, n, page); 2142 add_full(s, n, page);
2148
2149 }
2150 } 2143 }
2151 2144
2152 l = m; 2145 l = m;
@@ -2159,7 +2152,11 @@ redo:
2159 if (lock) 2152 if (lock)
2160 spin_unlock(&n->list_lock); 2153 spin_unlock(&n->list_lock);
2161 2154
2162 if (m == M_FREE) { 2155 if (m == M_PARTIAL)
2156 stat(s, tail);
2157 else if (m == M_FULL)
2158 stat(s, DEACTIVATE_FULL);
2159 else if (m == M_FREE) {
2163 stat(s, DEACTIVATE_EMPTY); 2160 stat(s, DEACTIVATE_EMPTY);
2164 discard_slab(s, page); 2161 discard_slab(s, page);
2165 stat(s, FREE_SLAB); 2162 stat(s, FREE_SLAB);
@@ -2313,12 +2310,10 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2313{ 2310{
2314 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2311 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2315 2312
2316 if (likely(c)) { 2313 if (c->page)
2317 if (c->page) 2314 flush_slab(s, c);
2318 flush_slab(s, c);
2319 2315
2320 unfreeze_partials(s, c); 2316 unfreeze_partials(s, c);
2321 }
2322} 2317}
2323 2318
2324static void flush_cpu_slab(void *d) 2319static void flush_cpu_slab(void *d)
@@ -2367,7 +2362,7 @@ static int slub_cpu_dead(unsigned int cpu)
2367static inline int node_match(struct page *page, int node) 2362static inline int node_match(struct page *page, int node)
2368{ 2363{
2369#ifdef CONFIG_NUMA 2364#ifdef CONFIG_NUMA
2370 if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) 2365 if (node != NUMA_NO_NODE && page_to_nid(page) != node)
2371 return 0; 2366 return 0;
2372#endif 2367#endif
2373 return 1; 2368 return 1;
@@ -2768,7 +2763,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2768{ 2763{
2769 void *ret = slab_alloc(s, gfpflags, _RET_IP_); 2764 void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2770 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); 2765 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2771 kasan_kmalloc(s, ret, size, gfpflags); 2766 ret = kasan_kmalloc(s, ret, size, gfpflags);
2772 return ret; 2767 return ret;
2773} 2768}
2774EXPORT_SYMBOL(kmem_cache_alloc_trace); 2769EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2796,7 +2791,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2796 trace_kmalloc_node(_RET_IP_, ret, 2791 trace_kmalloc_node(_RET_IP_, ret,
2797 size, s->size, gfpflags, node); 2792 size, s->size, gfpflags, node);
2798 2793
2799 kasan_kmalloc(s, ret, size, gfpflags); 2794 ret = kasan_kmalloc(s, ret, size, gfpflags);
2800 return ret; 2795 return ret;
2801} 2796}
2802EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 2797EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -2992,7 +2987,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
2992 do_slab_free(s, page, head, tail, cnt, addr); 2987 do_slab_free(s, page, head, tail, cnt, addr);
2993} 2988}
2994 2989
2995#ifdef CONFIG_KASAN 2990#ifdef CONFIG_KASAN_GENERIC
2996void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) 2991void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
2997{ 2992{
2998 do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr); 2993 do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
@@ -3364,16 +3359,16 @@ static void early_kmem_cache_node_alloc(int node)
3364 3359
3365 n = page->freelist; 3360 n = page->freelist;
3366 BUG_ON(!n); 3361 BUG_ON(!n);
3367 page->freelist = get_freepointer(kmem_cache_node, n);
3368 page->inuse = 1;
3369 page->frozen = 0;
3370 kmem_cache_node->node[node] = n;
3371#ifdef CONFIG_SLUB_DEBUG 3362#ifdef CONFIG_SLUB_DEBUG
3372 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 3363 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
3373 init_tracking(kmem_cache_node, n); 3364 init_tracking(kmem_cache_node, n);
3374#endif 3365#endif
3375 kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), 3366 n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
3376 GFP_KERNEL); 3367 GFP_KERNEL);
3368 page->freelist = get_freepointer(kmem_cache_node, n);
3369 page->inuse = 1;
3370 page->frozen = 0;
3371 kmem_cache_node->node[node] = n;
3377 init_kmem_cache_node(n); 3372 init_kmem_cache_node(n);
3378 inc_slabs_node(kmem_cache_node, node, page->objects); 3373 inc_slabs_node(kmem_cache_node, node, page->objects);
3379 3374
@@ -3784,7 +3779,7 @@ void *__kmalloc(size_t size, gfp_t flags)
3784 3779
3785 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3780 trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
3786 3781
3787 kasan_kmalloc(s, ret, size, flags); 3782 ret = kasan_kmalloc(s, ret, size, flags);
3788 3783
3789 return ret; 3784 return ret;
3790} 3785}
@@ -3801,8 +3796,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3801 if (page) 3796 if (page)
3802 ptr = page_address(page); 3797 ptr = page_address(page);
3803 3798
3804 kmalloc_large_node_hook(ptr, size, flags); 3799 return kmalloc_large_node_hook(ptr, size, flags);
3805 return ptr;
3806} 3800}
3807 3801
3808void *__kmalloc_node(size_t size, gfp_t flags, int node) 3802void *__kmalloc_node(size_t size, gfp_t flags, int node)
@@ -3829,7 +3823,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
3829 3823
3830 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3824 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
3831 3825
3832 kasan_kmalloc(s, ret, size, flags); 3826 ret = kasan_kmalloc(s, ret, size, flags);
3833 3827
3834 return ret; 3828 return ret;
3835} 3829}
diff --git a/mm/sparse.c b/mm/sparse.c
index 3abc8cc50201..7ea5dc6c6b19 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -678,25 +678,24 @@ static void free_map_bootmem(struct page *memmap)
678 * set. If this is <=0, then that means that the passed-in 678 * set. If this is <=0, then that means that the passed-in
679 * map was not consumed and must be freed. 679 * map was not consumed and must be freed.
680 */ 680 */
681int __meminit sparse_add_one_section(struct pglist_data *pgdat, 681int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
682 unsigned long start_pfn, struct vmem_altmap *altmap) 682 struct vmem_altmap *altmap)
683{ 683{
684 unsigned long section_nr = pfn_to_section_nr(start_pfn); 684 unsigned long section_nr = pfn_to_section_nr(start_pfn);
685 struct mem_section *ms; 685 struct mem_section *ms;
686 struct page *memmap; 686 struct page *memmap;
687 unsigned long *usemap; 687 unsigned long *usemap;
688 unsigned long flags;
689 int ret; 688 int ret;
690 689
691 /* 690 /*
692 * no locking for this, because it does its own 691 * no locking for this, because it does its own
693 * plus, it does a kmalloc 692 * plus, it does a kmalloc
694 */ 693 */
695 ret = sparse_index_init(section_nr, pgdat->node_id); 694 ret = sparse_index_init(section_nr, nid);
696 if (ret < 0 && ret != -EEXIST) 695 if (ret < 0 && ret != -EEXIST)
697 return ret; 696 return ret;
698 ret = 0; 697 ret = 0;
699 memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap); 698 memmap = kmalloc_section_memmap(section_nr, nid, altmap);
700 if (!memmap) 699 if (!memmap)
701 return -ENOMEM; 700 return -ENOMEM;
702 usemap = __kmalloc_section_usemap(); 701 usemap = __kmalloc_section_usemap();
@@ -705,8 +704,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
705 return -ENOMEM; 704 return -ENOMEM;
706 } 705 }
707 706
708 pgdat_resize_lock(pgdat, &flags);
709
710 ms = __pfn_to_section(start_pfn); 707 ms = __pfn_to_section(start_pfn);
711 if (ms->section_mem_map & SECTION_MARKED_PRESENT) { 708 if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
712 ret = -EEXIST; 709 ret = -EEXIST;
@@ -723,7 +720,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
723 sparse_init_one_section(ms, section_nr, memmap, usemap); 720 sparse_init_one_section(ms, section_nr, memmap, usemap);
724 721
725out: 722out:
726 pgdat_resize_unlock(pgdat, &flags);
727 if (ret < 0) { 723 if (ret < 0) {
728 kfree(usemap); 724 kfree(usemap);
729 __kfree_section_memmap(memmap, altmap); 725 __kfree_section_memmap(memmap, altmap);
@@ -740,6 +736,15 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
740 if (!memmap) 736 if (!memmap)
741 return; 737 return;
742 738
739 /*
740 * A further optimization is to have per section refcounted
741 * num_poisoned_pages. But that would need more space per memmap, so
742 * for now just do a quick global check to speed up this routine in the
743 * absence of bad pages.
744 */
745 if (atomic_long_read(&num_poisoned_pages) == 0)
746 return;
747
743 for (i = 0; i < nr_pages; i++) { 748 for (i = 0; i < nr_pages; i++) {
744 if (PageHWPoison(&memmap[i])) { 749 if (PageHWPoison(&memmap[i])) {
745 atomic_long_sub(1, &num_poisoned_pages); 750 atomic_long_sub(1, &num_poisoned_pages);
@@ -785,10 +790,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
785 unsigned long map_offset, struct vmem_altmap *altmap) 790 unsigned long map_offset, struct vmem_altmap *altmap)
786{ 791{
787 struct page *memmap = NULL; 792 struct page *memmap = NULL;
788 unsigned long *usemap = NULL, flags; 793 unsigned long *usemap = NULL;
789 struct pglist_data *pgdat = zone->zone_pgdat;
790 794
791 pgdat_resize_lock(pgdat, &flags);
792 if (ms->section_mem_map) { 795 if (ms->section_mem_map) {
793 usemap = ms->pageblock_flags; 796 usemap = ms->pageblock_flags;
794 memmap = sparse_decode_mem_map(ms->section_mem_map, 797 memmap = sparse_decode_mem_map(ms->section_mem_map,
@@ -796,7 +799,6 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
796 ms->section_mem_map = 0; 799 ms->section_mem_map = 0;
797 ms->pageblock_flags = NULL; 800 ms->pageblock_flags = NULL;
798 } 801 }
799 pgdat_resize_unlock(pgdat, &flags);
800 802
801 clear_hwpoisoned_pages(memmap + map_offset, 803 clear_hwpoisoned_pages(memmap + map_offset,
802 PAGES_PER_SECTION - map_offset); 804 PAGES_PER_SECTION - map_offset);
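
Note: the sparse.c hunks above narrow sparse_add_one_section() to take a node id instead of a pgdat and drop the pgdat_resize_lock around section setup and teardown. A sketch of the call-site change implied by the new prototype; the caller shown is hypothetical and only illustrates where the nid would come from:

    /* before: the hot-add path handed in the whole pgdat */
    ret = sparse_add_one_section(NODE_DATA(nid), start_pfn, altmap);

    /* after: only the node id is needed */
    ret = sparse_add_one_section(nid, start_pfn, altmap);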
diff --git a/mm/swap.c b/mm/swap.c
index 5d786019eab9..4d8a1f1afaab 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1022,7 +1022,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
1022 */ 1022 */
1023void __init swap_setup(void) 1023void __init swap_setup(void)
1024{ 1024{
1025 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); 1025 unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
1026 1026
1027 /* Use a smaller cluster for small-memory machines */ 1027 /* Use a smaller cluster for small-memory machines */
1028 if (megs < 16) 1028 if (megs < 16)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8688ae65ef58..dbac1d49469d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2197,7 +2197,8 @@ int try_to_unuse(unsigned int type, bool frontswap,
2197 */ 2197 */
2198 if (PageSwapCache(page) && 2198 if (PageSwapCache(page) &&
2199 likely(page_private(page) == entry.val) && 2199 likely(page_private(page) == entry.val) &&
2200 !page_swapped(page)) 2200 (!PageTransCompound(page) ||
2201 !swap_page_trans_huge_swapped(si, entry)))
2201 delete_from_swap_cache(compound_head(page)); 2202 delete_from_swap_cache(compound_head(page));
2202 2203
2203 /* 2204 /*
@@ -2812,8 +2813,9 @@ static struct swap_info_struct *alloc_swap_info(void)
2812 struct swap_info_struct *p; 2813 struct swap_info_struct *p;
2813 unsigned int type; 2814 unsigned int type;
2814 int i; 2815 int i;
2816 int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
2815 2817
2816 p = kvzalloc(sizeof(*p), GFP_KERNEL); 2818 p = kvzalloc(size, GFP_KERNEL);
2817 if (!p) 2819 if (!p)
2818 return ERR_PTR(-ENOMEM); 2820 return ERR_PTR(-ENOMEM);
2819 2821
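
Note: the alloc_swap_info() hunk sizes the allocation as sizeof(*p) plus one plist_node per NUMA node, because swap_info_struct now ends in a per-node array carved out of the same allocation. A stand-alone model of sizing a struct with a trailing flexible array at run time; the type and function names are invented for the example:

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_plist_node {
            struct toy_plist_node *prev, *next;
    };

    struct toy_swap_info {
            int type;
            unsigned int flags;
            struct toy_plist_node avail_lists[];        /* one entry per NUMA node */
    };

    static struct toy_swap_info *alloc_toy_swap_info(int nr_node_ids)
    {
            size_t size = sizeof(struct toy_swap_info) +
                          nr_node_ids * sizeof(struct toy_plist_node);

            return calloc(1, size);                     /* kvzalloc() stand-in */
    }

    int main(void)
    {
            struct toy_swap_info *p = alloc_toy_swap_info(4);

            printf("header plus 4 per-node tails: %zu bytes\n",
                   sizeof(struct toy_swap_info) + 4 * sizeof(struct toy_plist_node));
            free(p);
            return 0;
    }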
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 458acda96f20..48368589f519 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -267,10 +267,14 @@ retry:
267 VM_BUG_ON(dst_addr & ~huge_page_mask(h)); 267 VM_BUG_ON(dst_addr & ~huge_page_mask(h));
268 268
269 /* 269 /*
270 * Serialize via hugetlb_fault_mutex 270 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
271 * i_mmap_rwsem ensures the dst_pte remains valid even
272 * in the case of shared pmds. fault mutex prevents
273 * races with other faulting threads.
271 */ 274 */
272 idx = linear_page_index(dst_vma, dst_addr);
273 mapping = dst_vma->vm_file->f_mapping; 275 mapping = dst_vma->vm_file->f_mapping;
276 i_mmap_lock_read(mapping);
277 idx = linear_page_index(dst_vma, dst_addr);
274 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, 278 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
275 idx, dst_addr); 279 idx, dst_addr);
276 mutex_lock(&hugetlb_fault_mutex_table[hash]); 280 mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -279,6 +283,7 @@ retry:
279 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); 283 dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
280 if (!dst_pte) { 284 if (!dst_pte) {
281 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 285 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
286 i_mmap_unlock_read(mapping);
282 goto out_unlock; 287 goto out_unlock;
283 } 288 }
284 289
@@ -286,6 +291,7 @@ retry:
286 dst_pteval = huge_ptep_get(dst_pte); 291 dst_pteval = huge_ptep_get(dst_pte);
287 if (!huge_pte_none(dst_pteval)) { 292 if (!huge_pte_none(dst_pteval)) {
288 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 293 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
294 i_mmap_unlock_read(mapping);
289 goto out_unlock; 295 goto out_unlock;
290 } 296 }
291 297
@@ -293,6 +299,7 @@ retry:
293 dst_addr, src_addr, &page); 299 dst_addr, src_addr, &page);
294 300
295 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 301 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
302 i_mmap_unlock_read(mapping);
296 vm_alloc_shared = vm_shared; 303 vm_alloc_shared = vm_shared;
297 304
298 cond_resched(); 305 cond_resched();
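
Note: the userfaultfd.c hunks above take i_mmap_rwsem in read mode around the existing hugetlb fault mutex so that a shared PMD cannot be unshared, and dst_pte freed, while the copy is in flight; the rmap.c locking comment added earlier in this diff documents the same i_mmap_rwsem before hugetlb_fault_mutex order. A kernel-context sketch of the resulting critical section, condensed from the hunks above (not stand-alone code):

    i_mmap_lock_read(mapping);
    idx = linear_page_index(dst_vma, dst_addr);
    hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, idx, dst_addr);
    mutex_lock(&hugetlb_fault_mutex_table[hash]);

    /* ... allocate or look up dst_pte and copy the huge page ... */

    mutex_unlock(&hugetlb_fault_mutex_table[hash]);
    i_mmap_unlock_read(mapping);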
diff --git a/mm/util.c b/mm/util.c
index 8bf08b5b5760..4df23d64aac7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -593,7 +593,7 @@ unsigned long vm_commit_limit(void)
593 if (sysctl_overcommit_kbytes) 593 if (sysctl_overcommit_kbytes)
594 allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); 594 allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
595 else 595 else
596 allowed = ((totalram_pages - hugetlb_total_pages()) 596 allowed = ((totalram_pages() - hugetlb_total_pages())
597 * sysctl_overcommit_ratio / 100); 597 * sysctl_overcommit_ratio / 100);
598 allowed += total_swap_pages; 598 allowed += total_swap_pages;
599 599
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 97d4b25d0373..871e41c55e23 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1634,7 +1634,7 @@ void *vmap(struct page **pages, unsigned int count,
1634 1634
1635 might_sleep(); 1635 might_sleep();
1636 1636
1637 if (count > totalram_pages) 1637 if (count > totalram_pages())
1638 return NULL; 1638 return NULL;
1639 1639
1640 size = (unsigned long)count << PAGE_SHIFT; 1640 size = (unsigned long)count << PAGE_SHIFT;
@@ -1739,7 +1739,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1739 unsigned long real_size = size; 1739 unsigned long real_size = size;
1740 1740
1741 size = PAGE_ALIGN(size); 1741 size = PAGE_ALIGN(size);
1742 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1742 if (!size || (size >> PAGE_SHIFT) > totalram_pages())
1743 goto fail; 1743 goto fail;
1744 1744
1745 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | 1745 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 24ab1f7394ab..a714c4f800e9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -88,6 +88,9 @@ struct scan_control {
88 /* Can pages be swapped as part of reclaim? */ 88 /* Can pages be swapped as part of reclaim? */
89 unsigned int may_swap:1; 89 unsigned int may_swap:1;
90 90
91 /* e.g. boosted watermark reclaim leaves slabs alone */
92 unsigned int may_shrinkslab:1;
93
91 /* 94 /*
92 * Cgroups are not reclaimed below their configured memory.low, 95 * Cgroups are not reclaimed below their configured memory.low,
93 * unless we threaten to OOM. If any cgroups are skipped due to 96 * unless we threaten to OOM. If any cgroups are skipped due to
@@ -1457,14 +1460,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1457 count_memcg_page_event(page, PGLAZYFREED); 1460 count_memcg_page_event(page, PGLAZYFREED);
1458 } else if (!mapping || !__remove_mapping(mapping, page, true)) 1461 } else if (!mapping || !__remove_mapping(mapping, page, true))
1459 goto keep_locked; 1462 goto keep_locked;
1460 /* 1463
1461 * At this point, we have no other references and there is 1464 unlock_page(page);
1462 * no way to pick any more up (removed from LRU, removed
1463 * from pagecache). Can use non-atomic bitops now (and
1464 * we obviously don't have to worry about waking up a process
1465 * waiting on the page lock, because there are no references.
1466 */
1467 __ClearPageLocked(page);
1468free_it: 1465free_it:
1469 nr_reclaimed++; 1466 nr_reclaimed++;
1470 1467
@@ -2756,8 +2753,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2756 shrink_node_memcg(pgdat, memcg, sc, &lru_pages); 2753 shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
2757 node_lru_pages += lru_pages; 2754 node_lru_pages += lru_pages;
2758 2755
2759 shrink_slab(sc->gfp_mask, pgdat->node_id, 2756 if (sc->may_shrinkslab) {
2757 shrink_slab(sc->gfp_mask, pgdat->node_id,
2760 memcg, sc->priority); 2758 memcg, sc->priority);
2759 }
2761 2760
2762 /* Record the group's reclaim efficiency */ 2761 /* Record the group's reclaim efficiency */
2763 vmpressure(sc->gfp_mask, memcg, false, 2762 vmpressure(sc->gfp_mask, memcg, false,
@@ -3239,6 +3238,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3239 .may_writepage = !laptop_mode, 3238 .may_writepage = !laptop_mode,
3240 .may_unmap = 1, 3239 .may_unmap = 1,
3241 .may_swap = 1, 3240 .may_swap = 1,
3241 .may_shrinkslab = 1,
3242 }; 3242 };
3243 3243
3244 /* 3244 /*
@@ -3283,6 +3283,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3283 .may_unmap = 1, 3283 .may_unmap = 1,
3284 .reclaim_idx = MAX_NR_ZONES - 1, 3284 .reclaim_idx = MAX_NR_ZONES - 1,
3285 .may_swap = !noswap, 3285 .may_swap = !noswap,
3286 .may_shrinkslab = 1,
3286 }; 3287 };
3287 unsigned long lru_pages; 3288 unsigned long lru_pages;
3288 3289
@@ -3329,6 +3330,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3329 .may_writepage = !laptop_mode, 3330 .may_writepage = !laptop_mode,
3330 .may_unmap = 1, 3331 .may_unmap = 1,
3331 .may_swap = may_swap, 3332 .may_swap = may_swap,
3333 .may_shrinkslab = 1,
3332 }; 3334 };
3333 3335
3334 /* 3336 /*
@@ -3379,6 +3381,30 @@ static void age_active_anon(struct pglist_data *pgdat,
3379 } while (memcg); 3381 } while (memcg);
3380} 3382}
3381 3383
3384static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
3385{
3386 int i;
3387 struct zone *zone;
3388
3389 /*
3390 * Check for watermark boosts top-down as the higher zones
3391 * are more likely to be boosted. Both watermarks and boosts
3392 * should not be checked at the same time as reclaim would
3393 * start prematurely when there is no boosting and a lower
3394 * zone is balanced.
3395 */
3396 for (i = classzone_idx; i >= 0; i--) {
3397 zone = pgdat->node_zones + i;
3398 if (!managed_zone(zone))
3399 continue;
3400
3401 if (zone->watermark_boost)
3402 return true;
3403 }
3404
3405 return false;
3406}
3407
3382/* 3408/*
3383 * Returns true if there is an eligible zone balanced for the request order 3409 * Returns true if there is an eligible zone balanced for the request order
3384 * and classzone_idx 3410 * and classzone_idx
@@ -3389,6 +3415,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
3389 unsigned long mark = -1; 3415 unsigned long mark = -1;
3390 struct zone *zone; 3416 struct zone *zone;
3391 3417
3418 /*
3419 * Check watermarks bottom-up as lower zones are more likely to
3420 * meet watermarks.
3421 */
3392 for (i = 0; i <= classzone_idx; i++) { 3422 for (i = 0; i <= classzone_idx; i++) {
3393 zone = pgdat->node_zones + i; 3423 zone = pgdat->node_zones + i;
3394 3424
@@ -3517,14 +3547,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3517 unsigned long nr_soft_reclaimed; 3547 unsigned long nr_soft_reclaimed;
3518 unsigned long nr_soft_scanned; 3548 unsigned long nr_soft_scanned;
3519 unsigned long pflags; 3549 unsigned long pflags;
3550 unsigned long nr_boost_reclaim;
3551 unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
3552 bool boosted;
3520 struct zone *zone; 3553 struct zone *zone;
3521 struct scan_control sc = { 3554 struct scan_control sc = {
3522 .gfp_mask = GFP_KERNEL, 3555 .gfp_mask = GFP_KERNEL,
3523 .order = order, 3556 .order = order,
3524 .priority = DEF_PRIORITY,
3525 .may_writepage = !laptop_mode,
3526 .may_unmap = 1, 3557 .may_unmap = 1,
3527 .may_swap = 1,
3528 }; 3558 };
3529 3559
3530 psi_memstall_enter(&pflags); 3560 psi_memstall_enter(&pflags);
@@ -3532,9 +3562,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3532 3562
3533 count_vm_event(PAGEOUTRUN); 3563 count_vm_event(PAGEOUTRUN);
3534 3564
3565 /*
3566 * Account for the reclaim boost. Note that the zone boost is left in
3567 * place so that parallel allocations that are near the watermark will
3568 * stall or direct reclaim until kswapd is finished.
3569 */
3570 nr_boost_reclaim = 0;
3571 for (i = 0; i <= classzone_idx; i++) {
3572 zone = pgdat->node_zones + i;
3573 if (!managed_zone(zone))
3574 continue;
3575
3576 nr_boost_reclaim += zone->watermark_boost;
3577 zone_boosts[i] = zone->watermark_boost;
3578 }
3579 boosted = nr_boost_reclaim;
3580
3581restart:
3582 sc.priority = DEF_PRIORITY;
3535 do { 3583 do {
3536 unsigned long nr_reclaimed = sc.nr_reclaimed; 3584 unsigned long nr_reclaimed = sc.nr_reclaimed;
3537 bool raise_priority = true; 3585 bool raise_priority = true;
3586 bool balanced;
3538 bool ret; 3587 bool ret;
3539 3588
3540 sc.reclaim_idx = classzone_idx; 3589 sc.reclaim_idx = classzone_idx;
@@ -3561,13 +3610,40 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3561 } 3610 }
3562 3611
3563 /* 3612 /*
3564 * Only reclaim if there are no eligible zones. Note that 3613 * If the pgdat is imbalanced then ignore boosting and preserve
3565 * sc.reclaim_idx is not used as buffer_heads_over_limit may 3614 * the watermarks for a later time and restart. Note that the
3566 * have adjusted it. 3615 * zone watermarks will be still reset at the end of balancing
3616 * on the grounds that the normal reclaim should be enough to
3617 * re-evaluate if boosting is required when kswapd next wakes.
3567 */ 3618 */
3568 if (pgdat_balanced(pgdat, sc.order, classzone_idx)) 3619 balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
3620 if (!balanced && nr_boost_reclaim) {
3621 nr_boost_reclaim = 0;
3622 goto restart;
3623 }
3624
3625 /*
3626 * If boosting is not active then only reclaim if there are no
3627 * eligible zones. Note that sc.reclaim_idx is not used as
3628 * buffer_heads_over_limit may have adjusted it.
3629 */
3630 if (!nr_boost_reclaim && balanced)
3569 goto out; 3631 goto out;
3570 3632
3633 /* Limit the priority of boosting to avoid reclaim writeback */
3634 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
3635 raise_priority = false;
3636
3637 /*
3638 * Do not writeback or swap pages for boosted reclaim. The
3639 * intent is to relieve pressure not issue sub-optimal IO
3640 * from reclaim context. If no pages are reclaimed, the
3641 * reclaim will be aborted.
3642 */
3643 sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
3644 sc.may_swap = !nr_boost_reclaim;
3645 sc.may_shrinkslab = !nr_boost_reclaim;
3646
3571 /* 3647 /*
3572 * Do some background aging of the anon list, to give 3648 * Do some background aging of the anon list, to give
3573 * pages a chance to be referenced before reclaiming. All 3649 * pages a chance to be referenced before reclaiming. All
@@ -3619,6 +3695,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3619 * progress in reclaiming pages 3695 * progress in reclaiming pages
3620 */ 3696 */
3621 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; 3697 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3698 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
3699
3700 /*
3701 * If reclaim made no progress for a boost, stop reclaim as
3702 * IO cannot be queued and it could be an infinite loop in
3703 * extreme circumstances.
3704 */
3705 if (nr_boost_reclaim && !nr_reclaimed)
3706 break;
3707
3622 if (raise_priority || !nr_reclaimed) 3708 if (raise_priority || !nr_reclaimed)
3623 sc.priority--; 3709 sc.priority--;
3624 } while (sc.priority >= 1); 3710 } while (sc.priority >= 1);
@@ -3627,6 +3713,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3627 pgdat->kswapd_failures++; 3713 pgdat->kswapd_failures++;
3628 3714
3629out: 3715out:
3716 /* If reclaim was boosted, account for the reclaim done in this pass */
3717 if (boosted) {
3718 unsigned long flags;
3719
3720 for (i = 0; i <= classzone_idx; i++) {
3721 if (!zone_boosts[i])
3722 continue;
3723
3724 /* Increments are under the zone lock */
3725 zone = pgdat->node_zones + i;
3726 spin_lock_irqsave(&zone->lock, flags);
3727 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
3728 spin_unlock_irqrestore(&zone->lock, flags);
3729 }
3730
3731 /*
3732 * As there is now likely space, wake up kcompactd to defragment
3733 * pageblocks.
3734 */
3735 wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
3736 }
3737
3630 snapshot_refaults(NULL, pgdat); 3738 snapshot_refaults(NULL, pgdat);
3631 __fs_reclaim_release(); 3739 __fs_reclaim_release();
3632 psi_memstall_leave(&pflags); 3740 psi_memstall_leave(&pflags);
@@ -3855,7 +3963,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
3855 3963
3856 /* Hopeless node, leave it to direct reclaim if possible */ 3964 /* Hopeless node, leave it to direct reclaim if possible */
3857 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || 3965 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
3858 pgdat_balanced(pgdat, order, classzone_idx)) { 3966 (pgdat_balanced(pgdat, order, classzone_idx) &&
3967 !pgdat_watermark_boosted(pgdat, classzone_idx))) {
3859 /* 3968 /*
3860 * There may be plenty of free memory available, but it's too 3969 * There may be plenty of free memory available, but it's too
3861 * fragmented for high-order allocations. Wake up kcompactd 3970 * fragmented for high-order allocations. Wake up kcompactd
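
Note: the balance_pgdat() changes above sum each zone's watermark_boost into a reclaim target, shrink that target by whatever each priority pass reclaims, and abandon boosted reclaim as soon as a pass makes no progress. A stand-alone model of just the accounting, showing why the min() keeps the unsigned counter from wrapping; the numbers are made up:

    #include <stdio.h>

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
            return a < b ? a : b;
    }

    int main(void)
    {
            /* Made-up per-zone boosts (in pages) and per-pass reclaim counts. */
            unsigned long zone_boosts[] = { 0, 128, 512 };
            unsigned long reclaimed_per_pass[] = { 200, 600 };
            unsigned long nr_boost_reclaim = 0;

            for (int i = 0; i < 3; i++)
                    nr_boost_reclaim += zone_boosts[i];

            for (int i = 0; i < 2 && nr_boost_reclaim; i++) {
                    /* min() prevents underflow when a pass reclaims more
                     * than the remaining boost target. */
                    nr_boost_reclaim -= min_ul(nr_boost_reclaim,
                                               reclaimed_per_pass[i]);
                    printf("after pass %d: %lu boost pages left\n",
                           i, nr_boost_reclaim);
            }
            return 0;
    }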
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9c624595e904..83b30edc2f7f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -227,7 +227,7 @@ int calculate_normal_threshold(struct zone *zone)
227 * 125 1024 10 16-32 GB 9 227 * 125 1024 10 16-32 GB 9
228 */ 228 */
229 229
230 mem = zone->managed_pages >> (27 - PAGE_SHIFT); 230 mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
231 231
232 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); 232 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
233 233
@@ -1569,7 +1569,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1569 high_wmark_pages(zone), 1569 high_wmark_pages(zone),
1570 zone->spanned_pages, 1570 zone->spanned_pages,
1571 zone->present_pages, 1571 zone->present_pages,
1572 zone->managed_pages); 1572 zone_managed_pages(zone));
1573 1573
1574 seq_printf(m, 1574 seq_printf(m,
1575 "\n protection: (%ld", 1575 "\n protection: (%ld",
diff --git a/mm/workingset.c b/mm/workingset.c
index d46f8c92aa2f..dcb994f2acc2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -549,7 +549,7 @@ static int __init workingset_init(void)
549 * double the initial memory by using totalram_pages as-is. 549 * double the initial memory by using totalram_pages as-is.
550 */ 550 */
551 timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; 551 timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
552 max_order = fls_long(totalram_pages - 1); 552 max_order = fls_long(totalram_pages() - 1);
553 if (max_order > timestamp_bits) 553 if (max_order > timestamp_bits)
554 bucket_order = max_order - timestamp_bits; 554 bucket_order = max_order - timestamp_bits;
555 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", 555 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
diff --git a/mm/zswap.c b/mm/zswap.c
index cd91fd9d96b8..a4e4d36ec085 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -219,8 +219,8 @@ static const struct zpool_ops zswap_zpool_ops = {
219 219
220static bool zswap_is_full(void) 220static bool zswap_is_full(void)
221{ 221{
222 return totalram_pages * zswap_max_pool_percent / 100 < 222 return totalram_pages() * zswap_max_pool_percent / 100 <
223 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); 223 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
224} 224}
225 225
226static void zswap_update_total_size(void) 226static void zswap_update_total_size(void)