Diffstat (limited to 'mm')

-rw-r--r-- mm/Kconfig | 32
-rw-r--r-- mm/Makefile | 7
-rw-r--r-- mm/backing-dev.c | 117
-rw-r--r-- mm/cleancache.c | 2
-rw-r--r-- mm/cma.c | 2
-rw-r--r-- mm/compaction.c | 181
-rw-r--r-- mm/debug.c | 4
-rw-r--r-- mm/fadvise.c | 10
-rw-r--r-- mm/filemap.c | 30
-rw-r--r-- mm/filemap_xip.c | 478
-rw-r--r-- mm/fremap.c | 283
-rw-r--r-- mm/gup.c | 242
-rw-r--r-- mm/huge_memory.c | 156
-rw-r--r-- mm/hugetlb.c | 160
-rw-r--r-- mm/hugetlb_cgroup.c | 2
-rw-r--r-- mm/internal.h | 28
-rw-r--r-- mm/interval_tree.c | 34
-rw-r--r-- mm/iov_iter.c | 17
-rw-r--r-- mm/kasan/Makefile | 8
-rw-r--r-- mm/kasan/kasan.c | 516
-rw-r--r-- mm/kasan/kasan.h | 75
-rw-r--r-- mm/kasan/report.c | 269
-rw-r--r-- mm/kmemleak.c | 6
-rw-r--r-- mm/ksm.c | 2
-rw-r--r-- mm/list_lru.c | 467
-rw-r--r-- mm/madvise.c | 32
-rw-r--r-- mm/memcontrol.c | 1073
-rw-r--r-- mm/memory-failure.c | 13
-rw-r--r-- mm/memory.c | 355
-rw-r--r-- mm/mempolicy.c | 286
-rw-r--r-- mm/migrate.c | 45
-rw-r--r-- mm/mincore.c | 175
-rw-r--r-- mm/mm_init.c | 4
-rw-r--r-- mm/mmap.c | 100
-rw-r--r-- mm/mmzone.c | 4
-rw-r--r-- mm/mprotect.c | 50
-rw-r--r-- mm/mremap.c | 2
-rw-r--r-- mm/msync.c | 5
-rw-r--r-- mm/nommu.c | 118
-rw-r--r-- mm/oom_kill.c | 169
-rw-r--r-- mm/page-writeback.c | 46
-rw-r--r-- mm/page_alloc.c | 471
-rw-r--r-- mm/page_counter.c | 7
-rw-r--r-- mm/page_io.c | 9
-rw-r--r-- mm/page_owner.c | 26
-rw-r--r-- mm/pagewalk.c | 238
-rw-r--r-- mm/percpu.c | 6
-rw-r--r-- mm/pgtable-generic.c | 2
-rw-r--r-- mm/process_vm_access.c | 7
-rw-r--r-- mm/readahead.c | 4
-rw-r--r-- mm/rmap.c | 237
-rw-r--r-- mm/shmem.c | 34
-rw-r--r-- mm/slab.c | 17
-rw-r--r-- mm/slab.h | 67
-rw-r--r-- mm/slab_common.c | 323
-rw-r--r-- mm/slob.c | 2
-rw-r--r-- mm/slub.c | 232
-rw-r--r-- mm/swap.c | 6
-rw-r--r-- mm/swap_state.c | 6
-rw-r--r-- mm/truncate.c | 2
-rw-r--r-- mm/util.c | 48
-rw-r--r-- mm/vmalloc.c | 16
-rw-r--r-- mm/vmscan.c | 121
-rw-r--r-- mm/vmstat.c | 130
-rw-r--r-- mm/workingset.c | 9
-rw-r--r-- mm/zbud.c | 3
-rw-r--r-- mm/zpool.c | 6
-rw-r--r-- mm/zsmalloc.c | 239
-rw-r--r-- mm/zswap.c | 5

69 files changed, 4547 insertions(+), 3331 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 4395b12869c8..a03131b6ba8e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -129,28 +129,28 @@ config SPARSEMEM_VMEMMAP | |||
| 129 | efficient option when sufficient kernel resources are available. | 129 | efficient option when sufficient kernel resources are available. |
| 130 | 130 | ||
| 131 | config HAVE_MEMBLOCK | 131 | config HAVE_MEMBLOCK |
| 132 | boolean | 132 | bool |
| 133 | 133 | ||
| 134 | config HAVE_MEMBLOCK_NODE_MAP | 134 | config HAVE_MEMBLOCK_NODE_MAP |
| 135 | boolean | 135 | bool |
| 136 | 136 | ||
| 137 | config HAVE_MEMBLOCK_PHYS_MAP | 137 | config HAVE_MEMBLOCK_PHYS_MAP |
| 138 | boolean | 138 | bool |
| 139 | 139 | ||
| 140 | config HAVE_GENERIC_RCU_GUP | 140 | config HAVE_GENERIC_RCU_GUP |
| 141 | boolean | 141 | bool |
| 142 | 142 | ||
| 143 | config ARCH_DISCARD_MEMBLOCK | 143 | config ARCH_DISCARD_MEMBLOCK |
| 144 | boolean | 144 | bool |
| 145 | 145 | ||
| 146 | config NO_BOOTMEM | 146 | config NO_BOOTMEM |
| 147 | boolean | 147 | bool |
| 148 | 148 | ||
| 149 | config MEMORY_ISOLATION | 149 | config MEMORY_ISOLATION |
| 150 | boolean | 150 | bool |
| 151 | 151 | ||
| 152 | config MOVABLE_NODE | 152 | config MOVABLE_NODE |
| 153 | boolean "Enable to assign a node which has only movable memory" | 153 | bool "Enable to assign a node which has only movable memory" |
| 154 | depends on HAVE_MEMBLOCK | 154 | depends on HAVE_MEMBLOCK |
| 155 | depends on NO_BOOTMEM | 155 | depends on NO_BOOTMEM |
| 156 | depends on X86_64 | 156 | depends on X86_64 |
| @@ -228,12 +228,12 @@ config SPLIT_PTLOCK_CPUS | |||
| 228 | default "4" | 228 | default "4" |
| 229 | 229 | ||
| 230 | config ARCH_ENABLE_SPLIT_PMD_PTLOCK | 230 | config ARCH_ENABLE_SPLIT_PMD_PTLOCK |
| 231 | boolean | 231 | bool |
| 232 | 232 | ||
| 233 | # | 233 | # |
| 234 | # support for memory balloon | 234 | # support for memory balloon |
| 235 | config MEMORY_BALLOON | 235 | config MEMORY_BALLOON |
| 236 | boolean | 236 | bool |
| 237 | 237 | ||
| 238 | # | 238 | # |
| 239 | # support for memory balloon compaction | 239 | # support for memory balloon compaction |
| @@ -276,7 +276,7 @@ config MIGRATION | |||
| 276 | allocation instead of reclaiming. | 276 | allocation instead of reclaiming. |
| 277 | 277 | ||
| 278 | config ARCH_ENABLE_HUGEPAGE_MIGRATION | 278 | config ARCH_ENABLE_HUGEPAGE_MIGRATION |
| 279 | boolean | 279 | bool |
| 280 | 280 | ||
| 281 | config PHYS_ADDR_T_64BIT | 281 | config PHYS_ADDR_T_64BIT |
| 282 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT | 282 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT |
| @@ -602,6 +602,16 @@ config PGTABLE_MAPPING | |||
| 602 | You can check speed with zsmalloc benchmark: | 602 | You can check speed with zsmalloc benchmark: |
| 603 | https://github.com/spartacus06/zsmapbench | 603 | https://github.com/spartacus06/zsmapbench |
| 604 | 604 | ||
| 605 | config ZSMALLOC_STAT | ||
| 606 | bool "Export zsmalloc statistics" | ||
| 607 | depends on ZSMALLOC | ||
| 608 | select DEBUG_FS | ||
| 609 | help | ||
| 610 | This option enables code in the zsmalloc to collect various | ||
| 611 | statistics about whats happening in zsmalloc and exports that | ||
| 612 | information to userspace via debugfs. | ||
| 613 | If unsure, say N. | ||
| 614 | |||
| 605 | config GENERIC_EARLY_IOREMAP | 615 | config GENERIC_EARLY_IOREMAP |
| 606 | bool | 616 | bool |
| 607 | 617 | ||
diff --git a/mm/Makefile b/mm/Makefile
index 4bf586e66378..3c1caa2693bd 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -2,8 +2,11 @@ | |||
| 2 | # Makefile for the linux memory manager. | 2 | # Makefile for the linux memory manager. |
| 3 | # | 3 | # |
| 4 | 4 | ||
| 5 | KASAN_SANITIZE_slab_common.o := n | ||
| 6 | KASAN_SANITIZE_slub.o := n | ||
| 7 | |||
| 5 | mmu-y := nommu.o | 8 | mmu-y := nommu.o |
| 6 | mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ | 9 | mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ |
| 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 10 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
| 8 | vmalloc.o pagewalk.o pgtable-generic.o | 11 | vmalloc.o pagewalk.o pgtable-generic.o |
| 9 | 12 | ||
| @@ -49,9 +52,9 @@ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | |||
| 49 | obj-$(CONFIG_SLAB) += slab.o | 52 | obj-$(CONFIG_SLAB) += slab.o |
| 50 | obj-$(CONFIG_SLUB) += slub.o | 53 | obj-$(CONFIG_SLUB) += slub.o |
| 51 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o | 54 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o |
| 55 | obj-$(CONFIG_KASAN) += kasan/ | ||
| 52 | obj-$(CONFIG_FAILSLAB) += failslab.o | 56 | obj-$(CONFIG_FAILSLAB) += failslab.o |
| 53 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 57 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
| 54 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | ||
| 55 | obj-$(CONFIG_MIGRATION) += migrate.o | 58 | obj-$(CONFIG_MIGRATION) += migrate.o |
| 56 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 59 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
| 57 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | 60 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0ae0df55000b..6dc4580df2af 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
| @@ -14,19 +14,10 @@ | |||
| 14 | 14 | ||
| 15 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); | 15 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); |
| 16 | 16 | ||
| 17 | struct backing_dev_info default_backing_dev_info = { | ||
| 18 | .name = "default", | ||
| 19 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, | ||
| 20 | .state = 0, | ||
| 21 | .capabilities = BDI_CAP_MAP_COPY, | ||
| 22 | }; | ||
| 23 | EXPORT_SYMBOL_GPL(default_backing_dev_info); | ||
| 24 | |||
| 25 | struct backing_dev_info noop_backing_dev_info = { | 17 | struct backing_dev_info noop_backing_dev_info = { |
| 26 | .name = "noop", | 18 | .name = "noop", |
| 27 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 19 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
| 28 | }; | 20 | }; |
| 29 | EXPORT_SYMBOL_GPL(noop_backing_dev_info); | ||
| 30 | 21 | ||
| 31 | static struct class *bdi_class; | 22 | static struct class *bdi_class; |
| 32 | 23 | ||
| @@ -40,17 +31,6 @@ LIST_HEAD(bdi_list); | |||
| 40 | /* bdi_wq serves all asynchronous writeback tasks */ | 31 | /* bdi_wq serves all asynchronous writeback tasks */ |
| 41 | struct workqueue_struct *bdi_wq; | 32 | struct workqueue_struct *bdi_wq; |
| 42 | 33 | ||
| 43 | static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | ||
| 44 | { | ||
| 45 | if (wb1 < wb2) { | ||
| 46 | spin_lock(&wb1->list_lock); | ||
| 47 | spin_lock_nested(&wb2->list_lock, 1); | ||
| 48 | } else { | ||
| 49 | spin_lock(&wb2->list_lock); | ||
| 50 | spin_lock_nested(&wb1->list_lock, 1); | ||
| 51 | } | ||
| 52 | } | ||
| 53 | |||
| 54 | #ifdef CONFIG_DEBUG_FS | 34 | #ifdef CONFIG_DEBUG_FS |
| 55 | #include <linux/debugfs.h> | 35 | #include <linux/debugfs.h> |
| 56 | #include <linux/seq_file.h> | 36 | #include <linux/seq_file.h> |
| @@ -69,10 +49,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
| 69 | unsigned long background_thresh; | 49 | unsigned long background_thresh; |
| 70 | unsigned long dirty_thresh; | 50 | unsigned long dirty_thresh; |
| 71 | unsigned long bdi_thresh; | 51 | unsigned long bdi_thresh; |
| 72 | unsigned long nr_dirty, nr_io, nr_more_io; | 52 | unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; |
| 73 | struct inode *inode; | 53 | struct inode *inode; |
| 74 | 54 | ||
| 75 | nr_dirty = nr_io = nr_more_io = 0; | 55 | nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; |
| 76 | spin_lock(&wb->list_lock); | 56 | spin_lock(&wb->list_lock); |
| 77 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) | 57 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
| 78 | nr_dirty++; | 58 | nr_dirty++; |
| @@ -80,6 +60,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
| 80 | nr_io++; | 60 | nr_io++; |
| 81 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) | 61 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) |
| 82 | nr_more_io++; | 62 | nr_more_io++; |
| 63 | list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) | ||
| 64 | if (inode->i_state & I_DIRTY_TIME) | ||
| 65 | nr_dirty_time++; | ||
| 83 | spin_unlock(&wb->list_lock); | 66 | spin_unlock(&wb->list_lock); |
| 84 | 67 | ||
| 85 | global_dirty_limits(&background_thresh, &dirty_thresh); | 68 | global_dirty_limits(&background_thresh, &dirty_thresh); |
| @@ -98,6 +81,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
| 98 | "b_dirty: %10lu\n" | 81 | "b_dirty: %10lu\n" |
| 99 | "b_io: %10lu\n" | 82 | "b_io: %10lu\n" |
| 100 | "b_more_io: %10lu\n" | 83 | "b_more_io: %10lu\n" |
| 84 | "b_dirty_time: %10lu\n" | ||
| 101 | "bdi_list: %10u\n" | 85 | "bdi_list: %10u\n" |
| 102 | "state: %10lx\n", | 86 | "state: %10lx\n", |
| 103 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), | 87 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), |
| @@ -111,6 +95,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
| 111 | nr_dirty, | 95 | nr_dirty, |
| 112 | nr_io, | 96 | nr_io, |
| 113 | nr_more_io, | 97 | nr_more_io, |
| 98 | nr_dirty_time, | ||
| 114 | !list_empty(&bdi->bdi_list), bdi->state); | 99 | !list_empty(&bdi->bdi_list), bdi->state); |
| 115 | #undef K | 100 | #undef K |
| 116 | 101 | ||
| @@ -264,9 +249,6 @@ static int __init default_bdi_init(void) | |||
| 264 | if (!bdi_wq) | 249 | if (!bdi_wq) |
| 265 | return -ENOMEM; | 250 | return -ENOMEM; |
| 266 | 251 | ||
| 267 | err = bdi_init(&default_backing_dev_info); | ||
| 268 | if (!err) | ||
| 269 | bdi_register(&default_backing_dev_info, NULL, "default"); | ||
| 270 | err = bdi_init(&noop_backing_dev_info); | 252 | err = bdi_init(&noop_backing_dev_info); |
| 271 | 253 | ||
| 272 | return err; | 254 | return err; |
| @@ -355,19 +337,19 @@ EXPORT_SYMBOL(bdi_register_dev); | |||
| 355 | */ | 337 | */ |
| 356 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) | 338 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) |
| 357 | { | 339 | { |
| 358 | if (!bdi_cap_writeback_dirty(bdi)) | 340 | /* Make sure nobody queues further work */ |
| 341 | spin_lock_bh(&bdi->wb_lock); | ||
| 342 | if (!test_and_clear_bit(BDI_registered, &bdi->state)) { | ||
| 343 | spin_unlock_bh(&bdi->wb_lock); | ||
| 359 | return; | 344 | return; |
| 345 | } | ||
| 346 | spin_unlock_bh(&bdi->wb_lock); | ||
| 360 | 347 | ||
| 361 | /* | 348 | /* |
| 362 | * Make sure nobody finds us on the bdi_list anymore | 349 | * Make sure nobody finds us on the bdi_list anymore |
| 363 | */ | 350 | */ |
| 364 | bdi_remove_from_list(bdi); | 351 | bdi_remove_from_list(bdi); |
| 365 | 352 | ||
| 366 | /* Make sure nobody queues further work */ | ||
| 367 | spin_lock_bh(&bdi->wb_lock); | ||
| 368 | clear_bit(BDI_registered, &bdi->state); | ||
| 369 | spin_unlock_bh(&bdi->wb_lock); | ||
| 370 | |||
| 371 | /* | 353 | /* |
| 372 | * Drain work list and shutdown the delayed_work. At this point, | 354 | * Drain work list and shutdown the delayed_work. At this point, |
| 373 | * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi | 355 | * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi |
| @@ -375,37 +357,22 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) | |||
| 375 | */ | 357 | */ |
| 376 | mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); | 358 | mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); |
| 377 | flush_delayed_work(&bdi->wb.dwork); | 359 | flush_delayed_work(&bdi->wb.dwork); |
| 378 | WARN_ON(!list_empty(&bdi->work_list)); | ||
| 379 | WARN_ON(delayed_work_pending(&bdi->wb.dwork)); | ||
| 380 | } | 360 | } |
| 381 | 361 | ||
| 382 | /* | 362 | /* |
| 383 | * This bdi is going away now, make sure that no super_blocks point to it | 363 | * Called when the device behind @bdi has been removed or ejected. |
| 364 | * | ||
| 365 | * We can't really do much here except for reducing the dirty ratio at | ||
| 366 | * the moment. In the future we should be able to set a flag so that | ||
| 367 | * the filesystem can handle errors at mark_inode_dirty time instead | ||
| 368 | * of only at writeback time. | ||
| 384 | */ | 369 | */ |
| 385 | static void bdi_prune_sb(struct backing_dev_info *bdi) | ||
| 386 | { | ||
| 387 | struct super_block *sb; | ||
| 388 | |||
| 389 | spin_lock(&sb_lock); | ||
| 390 | list_for_each_entry(sb, &super_blocks, s_list) { | ||
| 391 | if (sb->s_bdi == bdi) | ||
| 392 | sb->s_bdi = &default_backing_dev_info; | ||
| 393 | } | ||
| 394 | spin_unlock(&sb_lock); | ||
| 395 | } | ||
| 396 | |||
| 397 | void bdi_unregister(struct backing_dev_info *bdi) | 370 | void bdi_unregister(struct backing_dev_info *bdi) |
| 398 | { | 371 | { |
| 399 | if (bdi->dev) { | 372 | if (WARN_ON_ONCE(!bdi->dev)) |
| 400 | bdi_set_min_ratio(bdi, 0); | 373 | return; |
| 401 | trace_writeback_bdi_unregister(bdi); | ||
| 402 | bdi_prune_sb(bdi); | ||
| 403 | 374 | ||
| 404 | bdi_wb_shutdown(bdi); | 375 | bdi_set_min_ratio(bdi, 0); |
| 405 | bdi_debug_unregister(bdi); | ||
| 406 | device_unregister(bdi->dev); | ||
| 407 | bdi->dev = NULL; | ||
| 408 | } | ||
| 409 | } | 376 | } |
| 410 | EXPORT_SYMBOL(bdi_unregister); | 377 | EXPORT_SYMBOL(bdi_unregister); |
| 411 | 378 | ||
| @@ -418,6 +385,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) | |||
| 418 | INIT_LIST_HEAD(&wb->b_dirty); | 385 | INIT_LIST_HEAD(&wb->b_dirty); |
| 419 | INIT_LIST_HEAD(&wb->b_io); | 386 | INIT_LIST_HEAD(&wb->b_io); |
| 420 | INIT_LIST_HEAD(&wb->b_more_io); | 387 | INIT_LIST_HEAD(&wb->b_more_io); |
| 388 | INIT_LIST_HEAD(&wb->b_dirty_time); | ||
| 421 | spin_lock_init(&wb->list_lock); | 389 | spin_lock_init(&wb->list_lock); |
| 422 | INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); | 390 | INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); |
| 423 | } | 391 | } |
| @@ -474,37 +442,19 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
| 474 | { | 442 | { |
| 475 | int i; | 443 | int i; |
| 476 | 444 | ||
| 477 | /* | 445 | bdi_wb_shutdown(bdi); |
| 478 | * Splice our entries to the default_backing_dev_info. This | ||
| 479 | * condition shouldn't happen. @wb must be empty at this point and | ||
| 480 | * dirty inodes on it might cause other issues. This workaround is | ||
| 481 | * added by ce5f8e779519 ("writeback: splice dirty inode entries to | ||
| 482 | * default bdi on bdi_destroy()") without root-causing the issue. | ||
| 483 | * | ||
| 484 | * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com | ||
| 485 | * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350 | ||
| 486 | * | ||
| 487 | * We should probably add WARN_ON() to find out whether it still | ||
| 488 | * happens and track it down if so. | ||
| 489 | */ | ||
| 490 | if (bdi_has_dirty_io(bdi)) { | ||
| 491 | struct bdi_writeback *dst = &default_backing_dev_info.wb; | ||
| 492 | |||
| 493 | bdi_lock_two(&bdi->wb, dst); | ||
| 494 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); | ||
| 495 | list_splice(&bdi->wb.b_io, &dst->b_io); | ||
| 496 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); | ||
| 497 | spin_unlock(&bdi->wb.list_lock); | ||
| 498 | spin_unlock(&dst->list_lock); | ||
| 499 | } | ||
| 500 | |||
| 501 | bdi_unregister(bdi); | ||
| 502 | 446 | ||
| 447 | WARN_ON(!list_empty(&bdi->work_list)); | ||
| 503 | WARN_ON(delayed_work_pending(&bdi->wb.dwork)); | 448 | WARN_ON(delayed_work_pending(&bdi->wb.dwork)); |
| 504 | 449 | ||
| 450 | if (bdi->dev) { | ||
| 451 | bdi_debug_unregister(bdi); | ||
| 452 | device_unregister(bdi->dev); | ||
| 453 | bdi->dev = NULL; | ||
| 454 | } | ||
| 455 | |||
| 505 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 456 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |
| 506 | percpu_counter_destroy(&bdi->bdi_stat[i]); | 457 | percpu_counter_destroy(&bdi->bdi_stat[i]); |
| 507 | |||
| 508 | fprop_local_destroy_percpu(&bdi->completions); | 458 | fprop_local_destroy_percpu(&bdi->completions); |
| 509 | } | 459 | } |
| 510 | EXPORT_SYMBOL(bdi_destroy); | 460 | EXPORT_SYMBOL(bdi_destroy); |
| @@ -513,13 +463,12 @@ EXPORT_SYMBOL(bdi_destroy); | |||
| 513 | * For use from filesystems to quickly init and register a bdi associated | 463 | * For use from filesystems to quickly init and register a bdi associated |
| 514 | * with dirty writeback | 464 | * with dirty writeback |
| 515 | */ | 465 | */ |
| 516 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, | 466 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) |
| 517 | unsigned int cap) | ||
| 518 | { | 467 | { |
| 519 | int err; | 468 | int err; |
| 520 | 469 | ||
| 521 | bdi->name = name; | 470 | bdi->name = name; |
| 522 | bdi->capabilities = cap; | 471 | bdi->capabilities = 0; |
| 523 | err = bdi_init(bdi); | 472 | err = bdi_init(bdi); |
| 524 | if (err) | 473 | if (err) |
| 525 | return err; | 474 | return err; |
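
The backing-dev.c hunks above add a b_dirty_time line to the per-BDI debugfs stats output, counting inodes that carry I_DIRTY_TIME on the new b_dirty_time list. A minimal user-space sketch for looking at that file follows; the debugfs mount point and the "8:0" BDI directory name are assumptions about the running system, not something this diff defines.

/* Sketch: dump a BDI's debugfs stats file, which after this patch also
 * carries a "b_dirty_time" line.  The path assumes debugfs is mounted at
 * /sys/kernel/debug and that the BDI directory is named "8:0" (first
 * SCSI/SATA disk); pass a different name as argv[1] if needed. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	const char *bdi = (argc > 1) ? argv[1] : "8:0";	/* assumed default */
	char path[256];
	char line[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/debug/bdi/%s/stats", bdi);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* includes the new b_dirty_time line */
	fclose(f);
	return EXIT_SUCCESS;
}
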
diff --git a/mm/cleancache.c b/mm/cleancache.c
index d0eac4350403..053bcd8f12fb 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
| @@ -25,7 +25,7 @@ | |||
| 25 | static struct cleancache_ops *cleancache_ops __read_mostly; | 25 | static struct cleancache_ops *cleancache_ops __read_mostly; |
| 26 | 26 | ||
| 27 | /* | 27 | /* |
| 28 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is | 28 | * Counters available via /sys/kernel/debug/cleancache (if debugfs is |
| 29 | * properly configured. These are for information only so are not protected | 29 | * properly configured. These are for information only so are not protected |
| 30 | * against increment races. | 30 | * against increment races. |
| 31 | */ | 31 | */ |
diff --git a/mm/cma.c b/mm/cma.c
--- a/mm/cma.c
+++ b/mm/cma.c
| @@ -199,6 +199,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, | |||
| 199 | cma->order_per_bit = order_per_bit; | 199 | cma->order_per_bit = order_per_bit; |
| 200 | *res_cma = cma; | 200 | *res_cma = cma; |
| 201 | cma_area_count++; | 201 | cma_area_count++; |
| 202 | totalcma_pages += (size / PAGE_SIZE); | ||
| 202 | 203 | ||
| 203 | return 0; | 204 | return 0; |
| 204 | } | 205 | } |
| @@ -337,7 +338,6 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
| 337 | if (ret) | 338 | if (ret) |
| 338 | goto err; | 339 | goto err; |
| 339 | 340 | ||
| 340 | totalcma_pages += (size / PAGE_SIZE); | ||
| 341 | pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, | 341 | pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, |
| 342 | &base); | 342 | &base); |
| 343 | return 0; | 343 | return 0; |
diff --git a/mm/compaction.c b/mm/compaction.c
index 546e571e9d60..8c0d9459b54a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
| 17 | #include <linux/balloon_compaction.h> | 17 | #include <linux/balloon_compaction.h> |
| 18 | #include <linux/page-isolation.h> | 18 | #include <linux/page-isolation.h> |
| 19 | #include <linux/kasan.h> | ||
| 19 | #include "internal.h" | 20 | #include "internal.h" |
| 20 | 21 | ||
| 21 | #ifdef CONFIG_COMPACTION | 22 | #ifdef CONFIG_COMPACTION |
| @@ -34,6 +35,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta) | |||
| 34 | #endif | 35 | #endif |
| 35 | 36 | ||
| 36 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 37 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
| 38 | #ifdef CONFIG_TRACEPOINTS | ||
| 39 | static const char *const compaction_status_string[] = { | ||
| 40 | "deferred", | ||
| 41 | "skipped", | ||
| 42 | "continue", | ||
| 43 | "partial", | ||
| 44 | "complete", | ||
| 45 | "no_suitable_page", | ||
| 46 | "not_suitable_zone", | ||
| 47 | }; | ||
| 48 | #endif | ||
| 37 | 49 | ||
| 38 | #define CREATE_TRACE_POINTS | 50 | #define CREATE_TRACE_POINTS |
| 39 | #include <trace/events/compaction.h> | 51 | #include <trace/events/compaction.h> |
| @@ -61,6 +73,7 @@ static void map_pages(struct list_head *list) | |||
| 61 | list_for_each_entry(page, list, lru) { | 73 | list_for_each_entry(page, list, lru) { |
| 62 | arch_alloc_page(page, 0); | 74 | arch_alloc_page(page, 0); |
| 63 | kernel_map_pages(page, 1, 1); | 75 | kernel_map_pages(page, 1, 1); |
| 76 | kasan_alloc_pages(page, 0); | ||
| 64 | } | 77 | } |
| 65 | } | 78 | } |
| 66 | 79 | ||
| @@ -113,6 +126,77 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn, | |||
| 113 | } | 126 | } |
| 114 | 127 | ||
| 115 | #ifdef CONFIG_COMPACTION | 128 | #ifdef CONFIG_COMPACTION |
| 129 | |||
| 130 | /* Do not skip compaction more than 64 times */ | ||
| 131 | #define COMPACT_MAX_DEFER_SHIFT 6 | ||
| 132 | |||
| 133 | /* | ||
| 134 | * Compaction is deferred when compaction fails to result in a page | ||
| 135 | * allocation success. 1 << compact_defer_limit compactions are skipped up | ||
| 136 | * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT | ||
| 137 | */ | ||
| 138 | void defer_compaction(struct zone *zone, int order) | ||
| 139 | { | ||
| 140 | zone->compact_considered = 0; | ||
| 141 | zone->compact_defer_shift++; | ||
| 142 | |||
| 143 | if (order < zone->compact_order_failed) | ||
| 144 | zone->compact_order_failed = order; | ||
| 145 | |||
| 146 | if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) | ||
| 147 | zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; | ||
| 148 | |||
| 149 | trace_mm_compaction_defer_compaction(zone, order); | ||
| 150 | } | ||
| 151 | |||
| 152 | /* Returns true if compaction should be skipped this time */ | ||
| 153 | bool compaction_deferred(struct zone *zone, int order) | ||
| 154 | { | ||
| 155 | unsigned long defer_limit = 1UL << zone->compact_defer_shift; | ||
| 156 | |||
| 157 | if (order < zone->compact_order_failed) | ||
| 158 | return false; | ||
| 159 | |||
| 160 | /* Avoid possible overflow */ | ||
| 161 | if (++zone->compact_considered > defer_limit) | ||
| 162 | zone->compact_considered = defer_limit; | ||
| 163 | |||
| 164 | if (zone->compact_considered >= defer_limit) | ||
| 165 | return false; | ||
| 166 | |||
| 167 | trace_mm_compaction_deferred(zone, order); | ||
| 168 | |||
| 169 | return true; | ||
| 170 | } | ||
| 171 | |||
| 172 | /* | ||
| 173 | * Update defer tracking counters after successful compaction of given order, | ||
| 174 | * which means an allocation either succeeded (alloc_success == true) or is | ||
| 175 | * expected to succeed. | ||
| 176 | */ | ||
| 177 | void compaction_defer_reset(struct zone *zone, int order, | ||
| 178 | bool alloc_success) | ||
| 179 | { | ||
| 180 | if (alloc_success) { | ||
| 181 | zone->compact_considered = 0; | ||
| 182 | zone->compact_defer_shift = 0; | ||
| 183 | } | ||
| 184 | if (order >= zone->compact_order_failed) | ||
| 185 | zone->compact_order_failed = order + 1; | ||
| 186 | |||
| 187 | trace_mm_compaction_defer_reset(zone, order); | ||
| 188 | } | ||
| 189 | |||
| 190 | /* Returns true if restarting compaction after many failures */ | ||
| 191 | bool compaction_restarting(struct zone *zone, int order) | ||
| 192 | { | ||
| 193 | if (order < zone->compact_order_failed) | ||
| 194 | return false; | ||
| 195 | |||
| 196 | return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && | ||
| 197 | zone->compact_considered >= 1UL << zone->compact_defer_shift; | ||
| 198 | } | ||
| 199 | |||
| 116 | /* Returns true if the pageblock should be scanned for pages to isolate. */ | 200 | /* Returns true if the pageblock should be scanned for pages to isolate. */ |
| 117 | static inline bool isolation_suitable(struct compact_control *cc, | 201 | static inline bool isolation_suitable(struct compact_control *cc, |
| 118 | struct page *page) | 202 | struct page *page) |
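
The deferral helpers moved into mm/compaction.c above implement an exponential backoff: every failed compaction bumps compact_defer_shift (capped at COMPACT_MAX_DEFER_SHIFT = 6) and the next 1 << compact_defer_shift attempts are skipped. The stand-alone sketch below mirrors only that counter arithmetic (the per-order compact_order_failed tracking and the tracepoints are left out) to show how quickly the skip window grows across repeated failures.

/* Stand-alone illustration of the deferral bookkeeping moved into
 * mm/compaction.c above: each failure widens the skip window to
 * 1 << compact_defer_shift attempts, capped at 1 << 6 = 64. */
#include <stdbool.h>
#include <stdio.h>

#define COMPACT_MAX_DEFER_SHIFT 6

struct fake_zone {		/* only the fields the defer logic touches */
	unsigned int compact_considered;
	unsigned int compact_defer_shift;
};

static void defer_compaction(struct fake_zone *z)
{
	z->compact_considered = 0;
	if (++z->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		z->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

static bool compaction_deferred(struct fake_zone *z)
{
	unsigned long limit = 1UL << z->compact_defer_shift;

	if (++z->compact_considered > limit)
		z->compact_considered = limit;
	return z->compact_considered < limit;	/* true: skip this attempt */
}

int main(void)
{
	struct fake_zone z = { 0, 0 };
	int failure;

	for (failure = 1; failure <= 8; failure++) {
		int skipped = 0;

		defer_compaction(&z);		/* pretend compaction just failed */
		while (compaction_deferred(&z))
			skipped++;		/* attempts skipped before retrying */
		printf("after failure %d: defer_shift=%u, skipped=%d\n",
		       failure, z.compact_defer_shift, skipped);
	}
	return 0;
}
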
| @@ -408,6 +492,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
| 408 | 492 | ||
| 409 | /* If a page was split, advance to the end of it */ | 493 | /* If a page was split, advance to the end of it */ |
| 410 | if (isolated) { | 494 | if (isolated) { |
| 495 | cc->nr_freepages += isolated; | ||
| 496 | if (!strict && | ||
| 497 | cc->nr_migratepages <= cc->nr_freepages) { | ||
| 498 | blockpfn += isolated; | ||
| 499 | break; | ||
| 500 | } | ||
| 501 | |||
| 411 | blockpfn += isolated - 1; | 502 | blockpfn += isolated - 1; |
| 412 | cursor += isolated - 1; | 503 | cursor += isolated - 1; |
| 413 | continue; | 504 | continue; |
| @@ -421,11 +512,12 @@ isolate_fail: | |||
| 421 | 512 | ||
| 422 | } | 513 | } |
| 423 | 514 | ||
| 515 | trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, | ||
| 516 | nr_scanned, total_isolated); | ||
| 517 | |||
| 424 | /* Record how far we have got within the block */ | 518 | /* Record how far we have got within the block */ |
| 425 | *start_pfn = blockpfn; | 519 | *start_pfn = blockpfn; |
| 426 | 520 | ||
| 427 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | ||
| 428 | |||
| 429 | /* | 521 | /* |
| 430 | * If strict isolation is requested by CMA then check that all the | 522 | * If strict isolation is requested by CMA then check that all the |
| 431 | * pages requested were isolated. If there were any failures, 0 is | 523 | * pages requested were isolated. If there were any failures, 0 is |
| @@ -581,6 +673,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
| 581 | unsigned long flags = 0; | 673 | unsigned long flags = 0; |
| 582 | bool locked = false; | 674 | bool locked = false; |
| 583 | struct page *page = NULL, *valid_page = NULL; | 675 | struct page *page = NULL, *valid_page = NULL; |
| 676 | unsigned long start_pfn = low_pfn; | ||
| 584 | 677 | ||
| 585 | /* | 678 | /* |
| 586 | * Ensure that there are not too many pages isolated from the LRU | 679 | * Ensure that there are not too many pages isolated from the LRU |
| @@ -741,7 +834,8 @@ isolate_success: | |||
| 741 | if (low_pfn == end_pfn) | 834 | if (low_pfn == end_pfn) |
| 742 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | 835 | update_pageblock_skip(cc, valid_page, nr_isolated, true); |
| 743 | 836 | ||
| 744 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 837 | trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, |
| 838 | nr_scanned, nr_isolated); | ||
| 745 | 839 | ||
| 746 | count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); | 840 | count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); |
| 747 | if (nr_isolated) | 841 | if (nr_isolated) |
| @@ -814,7 +908,6 @@ static void isolate_freepages(struct compact_control *cc) | |||
| 814 | unsigned long isolate_start_pfn; /* exact pfn we start at */ | 908 | unsigned long isolate_start_pfn; /* exact pfn we start at */ |
| 815 | unsigned long block_end_pfn; /* end of current pageblock */ | 909 | unsigned long block_end_pfn; /* end of current pageblock */ |
| 816 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ | 910 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ |
| 817 | int nr_freepages = cc->nr_freepages; | ||
| 818 | struct list_head *freelist = &cc->freepages; | 911 | struct list_head *freelist = &cc->freepages; |
| 819 | 912 | ||
| 820 | /* | 913 | /* |
| @@ -839,11 +932,11 @@ static void isolate_freepages(struct compact_control *cc) | |||
| 839 | * pages on cc->migratepages. We stop searching if the migrate | 932 | * pages on cc->migratepages. We stop searching if the migrate |
| 840 | * and free page scanners meet or enough free pages are isolated. | 933 | * and free page scanners meet or enough free pages are isolated. |
| 841 | */ | 934 | */ |
| 842 | for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; | 935 | for (; block_start_pfn >= low_pfn && |
| 936 | cc->nr_migratepages > cc->nr_freepages; | ||
| 843 | block_end_pfn = block_start_pfn, | 937 | block_end_pfn = block_start_pfn, |
| 844 | block_start_pfn -= pageblock_nr_pages, | 938 | block_start_pfn -= pageblock_nr_pages, |
| 845 | isolate_start_pfn = block_start_pfn) { | 939 | isolate_start_pfn = block_start_pfn) { |
| 846 | unsigned long isolated; | ||
| 847 | 940 | ||
| 848 | /* | 941 | /* |
| 849 | * This can iterate a massively long zone without finding any | 942 | * This can iterate a massively long zone without finding any |
| @@ -868,9 +961,8 @@ static void isolate_freepages(struct compact_control *cc) | |||
| 868 | continue; | 961 | continue; |
| 869 | 962 | ||
| 870 | /* Found a block suitable for isolating free pages from. */ | 963 | /* Found a block suitable for isolating free pages from. */ |
| 871 | isolated = isolate_freepages_block(cc, &isolate_start_pfn, | 964 | isolate_freepages_block(cc, &isolate_start_pfn, |
| 872 | block_end_pfn, freelist, false); | 965 | block_end_pfn, freelist, false); |
| 873 | nr_freepages += isolated; | ||
| 874 | 966 | ||
| 875 | /* | 967 | /* |
| 876 | * Remember where the free scanner should restart next time, | 968 | * Remember where the free scanner should restart next time, |
| @@ -902,8 +994,6 @@ static void isolate_freepages(struct compact_control *cc) | |||
| 902 | */ | 994 | */ |
| 903 | if (block_start_pfn < low_pfn) | 995 | if (block_start_pfn < low_pfn) |
| 904 | cc->free_pfn = cc->migrate_pfn; | 996 | cc->free_pfn = cc->migrate_pfn; |
| 905 | |||
| 906 | cc->nr_freepages = nr_freepages; | ||
| 907 | } | 997 | } |
| 908 | 998 | ||
| 909 | /* | 999 | /* |
| @@ -1015,8 +1105,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
| 1015 | low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, | 1105 | low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, |
| 1016 | isolate_mode); | 1106 | isolate_mode); |
| 1017 | 1107 | ||
| 1018 | if (!low_pfn || cc->contended) | 1108 | if (!low_pfn || cc->contended) { |
| 1109 | acct_isolated(zone, cc); | ||
| 1019 | return ISOLATE_ABORT; | 1110 | return ISOLATE_ABORT; |
| 1111 | } | ||
| 1020 | 1112 | ||
| 1021 | /* | 1113 | /* |
| 1022 | * Either we isolated something and proceed with migration. Or | 1114 | * Either we isolated something and proceed with migration. Or |
| @@ -1037,7 +1129,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
| 1037 | return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; | 1129 | return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; |
| 1038 | } | 1130 | } |
| 1039 | 1131 | ||
| 1040 | static int compact_finished(struct zone *zone, struct compact_control *cc, | 1132 | static int __compact_finished(struct zone *zone, struct compact_control *cc, |
| 1041 | const int migratetype) | 1133 | const int migratetype) |
| 1042 | { | 1134 | { |
| 1043 | unsigned int order; | 1135 | unsigned int order; |
| @@ -1088,11 +1180,24 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, | |||
| 1088 | return COMPACT_PARTIAL; | 1180 | return COMPACT_PARTIAL; |
| 1089 | 1181 | ||
| 1090 | /* Job done if allocation would set block type */ | 1182 | /* Job done if allocation would set block type */ |
| 1091 | if (cc->order >= pageblock_order && area->nr_free) | 1183 | if (order >= pageblock_order && area->nr_free) |
| 1092 | return COMPACT_PARTIAL; | 1184 | return COMPACT_PARTIAL; |
| 1093 | } | 1185 | } |
| 1094 | 1186 | ||
| 1095 | return COMPACT_CONTINUE; | 1187 | return COMPACT_NO_SUITABLE_PAGE; |
| 1188 | } | ||
| 1189 | |||
| 1190 | static int compact_finished(struct zone *zone, struct compact_control *cc, | ||
| 1191 | const int migratetype) | ||
| 1192 | { | ||
| 1193 | int ret; | ||
| 1194 | |||
| 1195 | ret = __compact_finished(zone, cc, migratetype); | ||
| 1196 | trace_mm_compaction_finished(zone, cc->order, ret); | ||
| 1197 | if (ret == COMPACT_NO_SUITABLE_PAGE) | ||
| 1198 | ret = COMPACT_CONTINUE; | ||
| 1199 | |||
| 1200 | return ret; | ||
| 1096 | } | 1201 | } |
| 1097 | 1202 | ||
| 1098 | /* | 1203 | /* |
| @@ -1102,7 +1207,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, | |||
| 1102 | * COMPACT_PARTIAL - If the allocation would succeed without compaction | 1207 | * COMPACT_PARTIAL - If the allocation would succeed without compaction |
| 1103 | * COMPACT_CONTINUE - If compaction should run now | 1208 | * COMPACT_CONTINUE - If compaction should run now |
| 1104 | */ | 1209 | */ |
| 1105 | unsigned long compaction_suitable(struct zone *zone, int order, | 1210 | static unsigned long __compaction_suitable(struct zone *zone, int order, |
| 1106 | int alloc_flags, int classzone_idx) | 1211 | int alloc_flags, int classzone_idx) |
| 1107 | { | 1212 | { |
| 1108 | int fragindex; | 1213 | int fragindex; |
| @@ -1146,11 +1251,24 @@ unsigned long compaction_suitable(struct zone *zone, int order, | |||
| 1146 | */ | 1251 | */ |
| 1147 | fragindex = fragmentation_index(zone, order); | 1252 | fragindex = fragmentation_index(zone, order); |
| 1148 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | 1253 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) |
| 1149 | return COMPACT_SKIPPED; | 1254 | return COMPACT_NOT_SUITABLE_ZONE; |
| 1150 | 1255 | ||
| 1151 | return COMPACT_CONTINUE; | 1256 | return COMPACT_CONTINUE; |
| 1152 | } | 1257 | } |
| 1153 | 1258 | ||
| 1259 | unsigned long compaction_suitable(struct zone *zone, int order, | ||
| 1260 | int alloc_flags, int classzone_idx) | ||
| 1261 | { | ||
| 1262 | unsigned long ret; | ||
| 1263 | |||
| 1264 | ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); | ||
| 1265 | trace_mm_compaction_suitable(zone, order, ret); | ||
| 1266 | if (ret == COMPACT_NOT_SUITABLE_ZONE) | ||
| 1267 | ret = COMPACT_SKIPPED; | ||
| 1268 | |||
| 1269 | return ret; | ||
| 1270 | } | ||
| 1271 | |||
| 1154 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 1272 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
| 1155 | { | 1273 | { |
| 1156 | int ret; | 1274 | int ret; |
| @@ -1197,7 +1315,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
| 1197 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; | 1315 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; |
| 1198 | } | 1316 | } |
| 1199 | 1317 | ||
| 1200 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); | 1318 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, |
| 1319 | cc->free_pfn, end_pfn, sync); | ||
| 1201 | 1320 | ||
| 1202 | migrate_prep_local(); | 1321 | migrate_prep_local(); |
| 1203 | 1322 | ||
| @@ -1299,7 +1418,8 @@ out: | |||
| 1299 | zone->compact_cached_free_pfn = free_pfn; | 1418 | zone->compact_cached_free_pfn = free_pfn; |
| 1300 | } | 1419 | } |
| 1301 | 1420 | ||
| 1302 | trace_mm_compaction_end(ret); | 1421 | trace_mm_compaction_end(start_pfn, cc->migrate_pfn, |
| 1422 | cc->free_pfn, end_pfn, sync, ret); | ||
| 1303 | 1423 | ||
| 1304 | return ret; | 1424 | return ret; |
| 1305 | } | 1425 | } |
| @@ -1335,22 +1455,20 @@ int sysctl_extfrag_threshold = 500; | |||
| 1335 | 1455 | ||
| 1336 | /** | 1456 | /** |
| 1337 | * try_to_compact_pages - Direct compact to satisfy a high-order allocation | 1457 | * try_to_compact_pages - Direct compact to satisfy a high-order allocation |
| 1338 | * @zonelist: The zonelist used for the current allocation | ||
| 1339 | * @order: The order of the current allocation | ||
| 1340 | * @gfp_mask: The GFP mask of the current allocation | 1458 | * @gfp_mask: The GFP mask of the current allocation |
| 1341 | * @nodemask: The allowed nodes to allocate from | 1459 | * @order: The order of the current allocation |
| 1460 | * @alloc_flags: The allocation flags of the current allocation | ||
| 1461 | * @ac: The context of current allocation | ||
| 1342 | * @mode: The migration mode for async, sync light, or sync migration | 1462 | * @mode: The migration mode for async, sync light, or sync migration |
| 1343 | * @contended: Return value that determines if compaction was aborted due to | 1463 | * @contended: Return value that determines if compaction was aborted due to |
| 1344 | * need_resched() or lock contention | 1464 | * need_resched() or lock contention |
| 1345 | * | 1465 | * |
| 1346 | * This is the main entry point for direct page compaction. | 1466 | * This is the main entry point for direct page compaction. |
| 1347 | */ | 1467 | */ |
| 1348 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1468 | unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, |
| 1349 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1469 | int alloc_flags, const struct alloc_context *ac, |
| 1350 | enum migrate_mode mode, int *contended, | 1470 | enum migrate_mode mode, int *contended) |
| 1351 | int alloc_flags, int classzone_idx) | ||
| 1352 | { | 1471 | { |
| 1353 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
| 1354 | int may_enter_fs = gfp_mask & __GFP_FS; | 1472 | int may_enter_fs = gfp_mask & __GFP_FS; |
| 1355 | int may_perform_io = gfp_mask & __GFP_IO; | 1473 | int may_perform_io = gfp_mask & __GFP_IO; |
| 1356 | struct zoneref *z; | 1474 | struct zoneref *z; |
| @@ -1364,9 +1482,11 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
| 1364 | if (!order || !may_enter_fs || !may_perform_io) | 1482 | if (!order || !may_enter_fs || !may_perform_io) |
| 1365 | return COMPACT_SKIPPED; | 1483 | return COMPACT_SKIPPED; |
| 1366 | 1484 | ||
| 1485 | trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); | ||
| 1486 | |||
| 1367 | /* Compact each zone in the list */ | 1487 | /* Compact each zone in the list */ |
| 1368 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1488 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
| 1369 | nodemask) { | 1489 | ac->nodemask) { |
| 1370 | int status; | 1490 | int status; |
| 1371 | int zone_contended; | 1491 | int zone_contended; |
| 1372 | 1492 | ||
| @@ -1374,7 +1494,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
| 1374 | continue; | 1494 | continue; |
| 1375 | 1495 | ||
| 1376 | status = compact_zone_order(zone, order, gfp_mask, mode, | 1496 | status = compact_zone_order(zone, order, gfp_mask, mode, |
| 1377 | &zone_contended, alloc_flags, classzone_idx); | 1497 | &zone_contended, alloc_flags, |
| 1498 | ac->classzone_idx); | ||
| 1378 | rc = max(status, rc); | 1499 | rc = max(status, rc); |
| 1379 | /* | 1500 | /* |
| 1380 | * It takes at least one zone that wasn't lock contended | 1501 | * It takes at least one zone that wasn't lock contended |
| @@ -1384,7 +1505,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
| 1384 | 1505 | ||
| 1385 | /* If a normal allocation would succeed, stop compacting */ | 1506 | /* If a normal allocation would succeed, stop compacting */ |
| 1386 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), | 1507 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), |
| 1387 | classzone_idx, alloc_flags)) { | 1508 | ac->classzone_idx, alloc_flags)) { |
| 1388 | /* | 1509 | /* |
| 1389 | * We think the allocation will succeed in this zone, | 1510 | * We think the allocation will succeed in this zone, |
| 1390 | * but it is not certain, hence the false. The caller | 1511 | * but it is not certain, hence the false. The caller |
diff --git a/mm/debug.c b/mm/debug.c
index 0e58f3211f89..3eb3ac2fcee7 100644
--- a/mm/debug.c
+++ b/mm/debug.c
| @@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = { | |||
| 130 | {VM_ACCOUNT, "account" }, | 130 | {VM_ACCOUNT, "account" }, |
| 131 | {VM_NORESERVE, "noreserve" }, | 131 | {VM_NORESERVE, "noreserve" }, |
| 132 | {VM_HUGETLB, "hugetlb" }, | 132 | {VM_HUGETLB, "hugetlb" }, |
| 133 | {VM_NONLINEAR, "nonlinear" }, | ||
| 134 | #if defined(CONFIG_X86) | 133 | #if defined(CONFIG_X86) |
| 135 | {VM_PAT, "pat" }, | 134 | {VM_PAT, "pat" }, |
| 136 | #elif defined(CONFIG_PPC) | 135 | #elif defined(CONFIG_PPC) |
| @@ -174,7 +173,7 @@ void dump_mm(const struct mm_struct *mm) | |||
| 174 | "get_unmapped_area %p\n" | 173 | "get_unmapped_area %p\n" |
| 175 | #endif | 174 | #endif |
| 176 | "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" | 175 | "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" |
| 177 | "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" | 176 | "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" |
| 178 | "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" | 177 | "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" |
| 179 | "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" | 178 | "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" |
| 180 | "start_code %lx end_code %lx start_data %lx end_data %lx\n" | 179 | "start_code %lx end_code %lx start_data %lx end_data %lx\n" |
| @@ -207,6 +206,7 @@ void dump_mm(const struct mm_struct *mm) | |||
| 207 | mm->pgd, atomic_read(&mm->mm_users), | 206 | mm->pgd, atomic_read(&mm->mm_users), |
| 208 | atomic_read(&mm->mm_count), | 207 | atomic_read(&mm->mm_count), |
| 209 | atomic_long_read((atomic_long_t *)&mm->nr_ptes), | 208 | atomic_long_read((atomic_long_t *)&mm->nr_ptes), |
| 209 | mm_nr_pmds((struct mm_struct *)mm), | ||
| 210 | mm->map_count, | 210 | mm->map_count, |
| 211 | mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, | 211 | mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, |
| 212 | mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, | 212 | mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 2ad7adf4f0a4..4a3907cf79f8 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
| @@ -28,6 +28,7 @@ | |||
| 28 | SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | 28 | SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) |
| 29 | { | 29 | { |
| 30 | struct fd f = fdget(fd); | 30 | struct fd f = fdget(fd); |
| 31 | struct inode *inode; | ||
| 31 | struct address_space *mapping; | 32 | struct address_space *mapping; |
| 32 | struct backing_dev_info *bdi; | 33 | struct backing_dev_info *bdi; |
| 33 | loff_t endbyte; /* inclusive */ | 34 | loff_t endbyte; /* inclusive */ |
| @@ -39,7 +40,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | |||
| 39 | if (!f.file) | 40 | if (!f.file) |
| 40 | return -EBADF; | 41 | return -EBADF; |
| 41 | 42 | ||
| 42 | if (S_ISFIFO(file_inode(f.file)->i_mode)) { | 43 | inode = file_inode(f.file); |
| 44 | if (S_ISFIFO(inode->i_mode)) { | ||
| 43 | ret = -ESPIPE; | 45 | ret = -ESPIPE; |
| 44 | goto out; | 46 | goto out; |
| 45 | } | 47 | } |
| @@ -50,7 +52,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | |||
| 50 | goto out; | 52 | goto out; |
| 51 | } | 53 | } |
| 52 | 54 | ||
| 53 | if (mapping->a_ops->get_xip_mem) { | 55 | if (IS_DAX(inode)) { |
| 54 | switch (advice) { | 56 | switch (advice) { |
| 55 | case POSIX_FADV_NORMAL: | 57 | case POSIX_FADV_NORMAL: |
| 56 | case POSIX_FADV_RANDOM: | 58 | case POSIX_FADV_RANDOM: |
| @@ -73,7 +75,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | |||
| 73 | else | 75 | else |
| 74 | endbyte--; /* inclusive */ | 76 | endbyte--; /* inclusive */ |
| 75 | 77 | ||
| 76 | bdi = mapping->backing_dev_info; | 78 | bdi = inode_to_bdi(mapping->host); |
| 77 | 79 | ||
| 78 | switch (advice) { | 80 | switch (advice) { |
| 79 | case POSIX_FADV_NORMAL: | 81 | case POSIX_FADV_NORMAL: |
| @@ -113,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | |||
| 113 | case POSIX_FADV_NOREUSE: | 115 | case POSIX_FADV_NOREUSE: |
| 114 | break; | 116 | break; |
| 115 | case POSIX_FADV_DONTNEED: | 117 | case POSIX_FADV_DONTNEED: |
| 116 | if (!bdi_write_congested(mapping->backing_dev_info)) | 118 | if (!bdi_write_congested(bdi)) |
| 117 | __filemap_fdatawrite_range(mapping, offset, endbyte, | 119 | __filemap_fdatawrite_range(mapping, offset, endbyte, |
| 118 | WB_SYNC_NONE); | 120 | WB_SYNC_NONE); |
| 119 | 121 | ||
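
With the change above, fadvise on a DAX-backed inode returns success for most advice values without doing anything, so callers cannot tell from the return code whether the hint was acted on. A small user-space sketch of issuing such a hint; the file name is a placeholder, and note that posix_fadvise() reports failure by returning an error number rather than -1/errno.

/* Issue POSIX_FADV_DONTNEED against a file.  posix_fadvise() returns the
 * error number directly (0 on success), and per the hunk above the kernel
 * may treat the hint as a no-op (e.g. on DAX-backed files) while still
 * reporting success. */
#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = (argc > 1) ? argv[1] : "testfile";	/* placeholder */
	int fd = open(path, O_RDONLY);
	int err;

	if (fd < 0) {
		perror(path);
		return 1;
	}
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);	/* len 0: whole file */
	if (err)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
	close(fd);
	return err ? 1 : 0;
}
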
diff --git a/mm/filemap.c b/mm/filemap.c
index 673e4581a2e5..ad7242043bdb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) | |||
| 211 | */ | 211 | */ |
| 212 | if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { | 212 | if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { |
| 213 | dec_zone_page_state(page, NR_FILE_DIRTY); | 213 | dec_zone_page_state(page, NR_FILE_DIRTY); |
| 214 | dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); | 214 | dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); |
| 215 | } | 215 | } |
| 216 | } | 216 | } |
| 217 | 217 | ||
| @@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) | |||
| 1695 | loff_t *ppos = &iocb->ki_pos; | 1695 | loff_t *ppos = &iocb->ki_pos; |
| 1696 | loff_t pos = *ppos; | 1696 | loff_t pos = *ppos; |
| 1697 | 1697 | ||
| 1698 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 1698 | if (io_is_direct(file)) { |
| 1699 | if (file->f_flags & O_DIRECT) { | ||
| 1700 | struct address_space *mapping = file->f_mapping; | 1699 | struct address_space *mapping = file->f_mapping; |
| 1701 | struct inode *inode = mapping->host; | 1700 | struct inode *inode = mapping->host; |
| 1702 | size_t count = iov_iter_count(iter); | 1701 | size_t count = iov_iter_count(iter); |
| @@ -1723,9 +1722,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) | |||
| 1723 | * we've already read everything we wanted to, or if | 1722 | * we've already read everything we wanted to, or if |
| 1724 | * there was a short read because we hit EOF, go ahead | 1723 | * there was a short read because we hit EOF, go ahead |
| 1725 | * and return. Otherwise fallthrough to buffered io for | 1724 | * and return. Otherwise fallthrough to buffered io for |
| 1726 | * the rest of the read. | 1725 | * the rest of the read. Buffered reads will not work for |
| 1726 | * DAX files, so don't bother trying. | ||
| 1727 | */ | 1727 | */ |
| 1728 | if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { | 1728 | if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || |
| 1729 | IS_DAX(inode)) { | ||
| 1729 | file_accessed(file); | 1730 | file_accessed(file); |
| 1730 | goto out; | 1731 | goto out; |
| 1731 | } | 1732 | } |
| @@ -2087,7 +2088,6 @@ const struct vm_operations_struct generic_file_vm_ops = { | |||
| 2087 | .fault = filemap_fault, | 2088 | .fault = filemap_fault, |
| 2088 | .map_pages = filemap_map_pages, | 2089 | .map_pages = filemap_map_pages, |
| 2089 | .page_mkwrite = filemap_page_mkwrite, | 2090 | .page_mkwrite = filemap_page_mkwrite, |
| 2090 | .remap_pages = generic_file_remap_pages, | ||
| 2091 | }; | 2091 | }; |
| 2092 | 2092 | ||
| 2093 | /* This is used for a general mmap of a disk file */ | 2093 | /* This is used for a general mmap of a disk file */ |
| @@ -2565,7 +2565,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 2565 | size_t count = iov_iter_count(from); | 2565 | size_t count = iov_iter_count(from); |
| 2566 | 2566 | ||
| 2567 | /* We can write back this queue in page reclaim */ | 2567 | /* We can write back this queue in page reclaim */ |
| 2568 | current->backing_dev_info = mapping->backing_dev_info; | 2568 | current->backing_dev_info = inode_to_bdi(inode); |
| 2569 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); | 2569 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); |
| 2570 | if (err) | 2570 | if (err) |
| 2571 | goto out; | 2571 | goto out; |
| @@ -2583,18 +2583,20 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 2583 | if (err) | 2583 | if (err) |
| 2584 | goto out; | 2584 | goto out; |
| 2585 | 2585 | ||
| 2586 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 2586 | if (io_is_direct(file)) { |
| 2587 | if (unlikely(file->f_flags & O_DIRECT)) { | ||
| 2588 | loff_t endbyte; | 2587 | loff_t endbyte; |
| 2589 | 2588 | ||
| 2590 | written = generic_file_direct_write(iocb, from, pos); | 2589 | written = generic_file_direct_write(iocb, from, pos); |
| 2591 | if (written < 0 || written == count) | ||
| 2592 | goto out; | ||
| 2593 | |||
| 2594 | /* | 2590 | /* |
| 2595 | * direct-io write to a hole: fall through to buffered I/O | 2591 | * If the write stopped short of completing, fall back to |
| 2596 | * for completing the rest of the request. | 2592 | * buffered writes. Some filesystems do this for writes to |
| 2593 | * holes, for example. For DAX files, a buffered write will | ||
| 2594 | * not succeed (even if it did, DAX does not handle dirty | ||
| 2595 | * page-cache pages correctly). | ||
| 2597 | */ | 2596 | */ |
| 2597 | if (written < 0 || written == count || IS_DAX(inode)) | ||
| 2598 | goto out; | ||
| 2599 | |||
| 2598 | pos += written; | 2600 | pos += written; |
| 2599 | count -= written; | 2601 | count -= written; |
| 2600 | 2602 | ||
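
io_is_direct() makes the filemap read and write paths treat DAX inodes like O_DIRECT ones, and the rewritten comment documents the fall-back to buffered I/O when a direct write stops short (except on DAX). For reference, a user-space sketch of the kind of O_DIRECT write that takes this path; the 4096-byte alignment and the output file name are assumptions, since O_DIRECT alignment requirements depend on the underlying device and filesystem.

/* Minimal O_DIRECT write: the buffer, file offset and length must all meet
 * the device's alignment rules (4096 is used here as a commonly safe value).
 * A short return from such a write is what the filemap.c hunk above falls
 * back to buffered I/O for, except on DAX files. */
#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define ALIGN_SIZE 4096		/* assumption: matches the device's requirement */

int main(int argc, char **argv)
{
	const char *path = (argc > 1) ? argv[1] : "direct.dat";	/* placeholder */
	void *buf;
	ssize_t written;
	int fd;

	if (posix_memalign(&buf, ALIGN_SIZE, ALIGN_SIZE)) {
		fprintf(stderr, "posix_memalign failed\n");
		return 1;
	}
	memset(buf, 'x', ALIGN_SIZE);

	fd = open(path, O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0) {
		perror(path);
		return 1;
	}
	written = write(fd, buf, ALIGN_SIZE);	/* aligned length and offset */
	if (written < 0)
		perror("write");
	else
		printf("wrote %zd bytes with O_DIRECT\n", written);

	close(fd);
	free(buf);
	return written == ALIGN_SIZE ? 0 : 1;
}
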
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
deleted file mode 100644
index 0d105aeff82f..000000000000
--- a/mm/filemap_xip.c
+++ /dev/null
| @@ -1,478 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/filemap_xip.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2005 IBM Corporation | ||
| 5 | * Author: Carsten Otte <cotte@de.ibm.com> | ||
| 6 | * | ||
| 7 | * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds | ||
| 8 | * | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/fs.h> | ||
| 12 | #include <linux/pagemap.h> | ||
| 13 | #include <linux/export.h> | ||
| 14 | #include <linux/uio.h> | ||
| 15 | #include <linux/rmap.h> | ||
| 16 | #include <linux/mmu_notifier.h> | ||
| 17 | #include <linux/sched.h> | ||
| 18 | #include <linux/seqlock.h> | ||
| 19 | #include <linux/mutex.h> | ||
| 20 | #include <linux/gfp.h> | ||
| 21 | #include <asm/tlbflush.h> | ||
| 22 | #include <asm/io.h> | ||
| 23 | |||
| 24 | /* | ||
| 25 | * We do use our own empty page to avoid interference with other users | ||
| 26 | * of ZERO_PAGE(), such as /dev/zero | ||
| 27 | */ | ||
| 28 | static DEFINE_MUTEX(xip_sparse_mutex); | ||
| 29 | static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq); | ||
| 30 | static struct page *__xip_sparse_page; | ||
| 31 | |||
| 32 | /* called under xip_sparse_mutex */ | ||
| 33 | static struct page *xip_sparse_page(void) | ||
| 34 | { | ||
| 35 | if (!__xip_sparse_page) { | ||
| 36 | struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); | ||
| 37 | |||
| 38 | if (page) | ||
| 39 | __xip_sparse_page = page; | ||
| 40 | } | ||
| 41 | return __xip_sparse_page; | ||
| 42 | } | ||
| 43 | |||
| 44 | /* | ||
| 45 | * This is a file read routine for execute in place files, and uses | ||
| 46 | * the mapping->a_ops->get_xip_mem() function for the actual low-level | ||
| 47 | * stuff. | ||
| 48 | * | ||
| 49 | * Note the struct file* is not used at all. It may be NULL. | ||
| 50 | */ | ||
| 51 | static ssize_t | ||
| 52 | do_xip_mapping_read(struct address_space *mapping, | ||
| 53 | struct file_ra_state *_ra, | ||
| 54 | struct file *filp, | ||
| 55 | char __user *buf, | ||
| 56 | size_t len, | ||
| 57 | loff_t *ppos) | ||
| 58 | { | ||
| 59 | struct inode *inode = mapping->host; | ||
| 60 | pgoff_t index, end_index; | ||
| 61 | unsigned long offset; | ||
| 62 | loff_t isize, pos; | ||
| 63 | size_t copied = 0, error = 0; | ||
| 64 | |||
| 65 | BUG_ON(!mapping->a_ops->get_xip_mem); | ||
| 66 | |||
| 67 | pos = *ppos; | ||
| 68 | index = pos >> PAGE_CACHE_SHIFT; | ||
| 69 | offset = pos & ~PAGE_CACHE_MASK; | ||
| 70 | |||
| 71 | isize = i_size_read(inode); | ||
| 72 | if (!isize) | ||
| 73 | goto out; | ||
| 74 | |||
| 75 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; | ||
| 76 | do { | ||
| 77 | unsigned long nr, left; | ||
| 78 | void *xip_mem; | ||
| 79 | unsigned long xip_pfn; | ||
| 80 | int zero = 0; | ||
| 81 | |||
| 82 | /* nr is the maximum number of bytes to copy from this page */ | ||
| 83 | nr = PAGE_CACHE_SIZE; | ||
| 84 | if (index >= end_index) { | ||
| 85 | if (index > end_index) | ||
| 86 | goto out; | ||
| 87 | nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; | ||
| 88 | if (nr <= offset) { | ||
| 89 | goto out; | ||
| 90 | } | ||
| 91 | } | ||
| 92 | nr = nr - offset; | ||
| 93 | if (nr > len - copied) | ||
| 94 | nr = len - copied; | ||
| 95 | |||
| 96 | error = mapping->a_ops->get_xip_mem(mapping, index, 0, | ||
| 97 | &xip_mem, &xip_pfn); | ||
| 98 | if (unlikely(error)) { | ||
| 99 | if (error == -ENODATA) { | ||
| 100 | /* sparse */ | ||
| 101 | zero = 1; | ||
| 102 | } else | ||
| 103 | goto out; | ||
| 104 | } | ||
| 105 | |||
| 106 | /* If users can be writing to this page using arbitrary | ||
| 107 | * virtual addresses, take care about potential aliasing | ||
| 108 | * before reading the page on the kernel side. | ||
| 109 | */ | ||
| 110 | if (mapping_writably_mapped(mapping)) | ||
| 111 | /* address based flush */ ; | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Ok, we have the mem, so now we can copy it to user space... | ||
| 115 | * | ||
| 116 | * The actor routine returns how many bytes were actually used.. | ||
| 117 | * NOTE! This may not be the same as how much of a user buffer | ||
| 118 | * we filled up (we may be padding etc), so we can only update | ||
| 119 | * "pos" here (the actor routine has to update the user buffer | ||
| 120 | * pointers and the remaining count). | ||
| 121 | */ | ||
| 122 | if (!zero) | ||
| 123 | left = __copy_to_user(buf+copied, xip_mem+offset, nr); | ||
| 124 | else | ||
| 125 | left = __clear_user(buf + copied, nr); | ||
| 126 | |||
| 127 | if (left) { | ||
| 128 | error = -EFAULT; | ||
| 129 | goto out; | ||
| 130 | } | ||
| 131 | |||
| 132 | copied += (nr - left); | ||
| 133 | offset += (nr - left); | ||
| 134 | index += offset >> PAGE_CACHE_SHIFT; | ||
| 135 | offset &= ~PAGE_CACHE_MASK; | ||
| 136 | } while (copied < len); | ||
| 137 | |||
| 138 | out: | ||
| 139 | *ppos = pos + copied; | ||
| 140 | if (filp) | ||
| 141 | file_accessed(filp); | ||
| 142 | |||
| 143 | return (copied ? copied : error); | ||
| 144 | } | ||
| 145 | |||
| 146 | ssize_t | ||
| 147 | xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) | ||
| 148 | { | ||
| 149 | if (!access_ok(VERIFY_WRITE, buf, len)) | ||
| 150 | return -EFAULT; | ||
| 151 | |||
| 152 | return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, | ||
| 153 | buf, len, ppos); | ||
| 154 | } | ||
| 155 | EXPORT_SYMBOL_GPL(xip_file_read); | ||
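
do_xip_mapping_read() splits the request into page-sized pieces: index selects the page, offset the byte within it, and nr is clamped both to the end of the page and to the end of file, so a read that crosses page boundaries or EOF simply degrades into a short read. A small user-space sketch of the same arithmetic; the page size and the example values below are assumptions, not taken from the kernel code:

    #include <stdio.h>

    #define MY_PAGE_SIZE   4096UL
    #define MY_PAGE_SHIFT  12
    #define MY_PAGE_MASK   (~(MY_PAGE_SIZE - 1))

    /* Mimic the chunking done by do_xip_mapping_read() for a read of
     * 'len' bytes at offset 'pos' from a file of size 'isize'. */
    static void show_chunks(unsigned long pos, unsigned long len, unsigned long isize)
    {
        unsigned long index  = pos >> MY_PAGE_SHIFT;      /* page number  */
        unsigned long offset = pos & ~MY_PAGE_MASK;       /* byte in page */
        unsigned long end_index = (isize - 1) >> MY_PAGE_SHIFT;
        unsigned long copied = 0;

        while (copied < len) {
            unsigned long nr = MY_PAGE_SIZE;

            if (index >= end_index) {
                if (index > end_index)
                    break;                                /* past EOF */
                nr = ((isize - 1) & ~MY_PAGE_MASK) + 1;   /* bytes in last page */
                if (nr <= offset)
                    break;
            }
            nr -= offset;
            if (nr > len - copied)
                nr = len - copied;

            printf("copy %5lu bytes from page %lu at offset %lu\n",
                   nr, index, offset);

            copied += nr;
            offset += nr;
            index  += offset >> MY_PAGE_SHIFT;
            offset &= ~MY_PAGE_MASK;
        }
        printf("total copied: %lu of %lu requested\n", copied, len);
    }

    int main(void)
    {
        /* a 10000-byte read starting at byte 4000 of a 12288-byte file */
        show_chunks(4000, 10000, 12288);
        return 0;
    }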
| 156 | |||
| 157 | /* | ||
| 158 | * __xip_unmap is invoked from xip_unmap and xip_write | ||
| 159 | * | ||
| 160 | * This function walks all vmas of the address_space and unmaps the | ||
| 161 | * __xip_sparse_page when found at pgoff. | ||
| 162 | */ | ||
| 163 | static void __xip_unmap(struct address_space * mapping, unsigned long pgoff) | ||
| 164 | { | ||
| 165 | struct vm_area_struct *vma; | ||
| 166 | struct page *page; | ||
| 167 | unsigned count; | ||
| 168 | int locked = 0; | ||
| 169 | |||
| 170 | count = read_seqcount_begin(&xip_sparse_seq); | ||
| 171 | |||
| 172 | page = __xip_sparse_page; | ||
| 173 | if (!page) | ||
| 174 | return; | ||
| 175 | |||
| 176 | retry: | ||
| 177 | i_mmap_lock_read(mapping); | ||
| 178 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
| 179 | pte_t *pte, pteval; | ||
| 180 | spinlock_t *ptl; | ||
| 181 | struct mm_struct *mm = vma->vm_mm; | ||
| 182 | unsigned long address = vma->vm_start + | ||
| 183 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
| 184 | |||
| 185 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
| 186 | pte = page_check_address(page, mm, address, &ptl, 1); | ||
| 187 | if (pte) { | ||
| 188 | /* Nuke the page table entry. */ | ||
| 189 | flush_cache_page(vma, address, pte_pfn(*pte)); | ||
| 190 | pteval = ptep_clear_flush(vma, address, pte); | ||
| 191 | page_remove_rmap(page); | ||
| 192 | dec_mm_counter(mm, MM_FILEPAGES); | ||
| 193 | BUG_ON(pte_dirty(pteval)); | ||
| 194 | pte_unmap_unlock(pte, ptl); | ||
| 195 | /* must invalidate_page _before_ freeing the page */ | ||
| 196 | mmu_notifier_invalidate_page(mm, address); | ||
| 197 | page_cache_release(page); | ||
| 198 | } | ||
| 199 | } | ||
| 200 | i_mmap_unlock_read(mapping); | ||
| 201 | |||
| 202 | if (locked) { | ||
| 203 | mutex_unlock(&xip_sparse_mutex); | ||
| 204 | } else if (read_seqcount_retry(&xip_sparse_seq, count)) { | ||
| 205 | mutex_lock(&xip_sparse_mutex); | ||
| 206 | locked = 1; | ||
| 207 | goto retry; | ||
| 208 | } | ||
| 209 | } | ||
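
__xip_unmap() walks the mapping optimistically: it samples xip_sparse_seq without taking xip_sparse_mutex, and only when a writer raced with the walk (the seqcount changed) does it repeat the walk once more under the mutex, where the result can no longer be invalidated. A bare-bones kernel-context sketch of that retry pattern; my_seq, my_lock and do_walk are illustrative names, not from this file:

    #include <linux/mutex.h>
    #include <linux/seqlock.h>

    static DEFINE_MUTEX(my_lock);
    static seqcount_t my_seq = SEQCNT_ZERO(my_seq);

    /* Optimistic walk: run once without the mutex; only if a writer
     * raced with us (the seqcount moved) take the mutex and walk again,
     * so the second pass cannot be invalidated.  Writers would wrap
     * their updates in mutex_lock(&my_lock) plus
     * write_seqcount_begin()/write_seqcount_end() on my_seq. */
    static void walk_stable(void (*do_walk)(void))
    {
        unsigned seq;
        int locked = 0;

        seq = read_seqcount_begin(&my_seq);
    retry:
        do_walk();

        if (locked) {
            mutex_unlock(&my_lock);
        } else if (read_seqcount_retry(&my_seq, seq)) {
            mutex_lock(&my_lock);
            locked = 1;
            goto retry;
        }
    }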
| 210 | |||
| 211 | /* | ||
| 212 | * xip_file_fault() is invoked via the vma operations vector for a | ||
| 213 | * mapped memory region to read in file data during a page fault. | ||
| 214 | * | ||
| 215 | * This function is derived from filemap_fault, but used for execute in place | ||
| 216 | */ | ||
| 217 | static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
| 218 | { | ||
| 219 | struct file *file = vma->vm_file; | ||
| 220 | struct address_space *mapping = file->f_mapping; | ||
| 221 | struct inode *inode = mapping->host; | ||
| 222 | pgoff_t size; | ||
| 223 | void *xip_mem; | ||
| 224 | unsigned long xip_pfn; | ||
| 225 | struct page *page; | ||
| 226 | int error; | ||
| 227 | |||
| 228 | /* XXX: are VM_FAULT_ codes OK? */ | ||
| 229 | again: | ||
| 230 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 231 | if (vmf->pgoff >= size) | ||
| 232 | return VM_FAULT_SIGBUS; | ||
| 233 | |||
| 234 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, | ||
| 235 | &xip_mem, &xip_pfn); | ||
| 236 | if (likely(!error)) | ||
| 237 | goto found; | ||
| 238 | if (error != -ENODATA) | ||
| 239 | return VM_FAULT_OOM; | ||
| 240 | |||
| 241 | /* sparse block */ | ||
| 242 | if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) && | ||
| 243 | (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) && | ||
| 244 | (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { | ||
| 245 | int err; | ||
| 246 | |||
| 247 | /* maybe shared writable, allocate new block */ | ||
| 248 | mutex_lock(&xip_sparse_mutex); | ||
| 249 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, | ||
| 250 | &xip_mem, &xip_pfn); | ||
| 251 | mutex_unlock(&xip_sparse_mutex); | ||
| 252 | if (error) | ||
| 253 | return VM_FAULT_SIGBUS; | ||
| 254 | /* unmap sparse mappings at pgoff from all other vmas */ | ||
| 255 | __xip_unmap(mapping, vmf->pgoff); | ||
| 256 | |||
| 257 | found: | ||
| 258 | err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, | ||
| 259 | xip_pfn); | ||
| 260 | if (err == -ENOMEM) | ||
| 261 | return VM_FAULT_OOM; | ||
| 262 | /* | ||
| 263 | * err == -EBUSY is fine, we've raced against another thread | ||
| 264 | * that faulted-in the same page | ||
| 265 | */ | ||
| 266 | if (err != -EBUSY) | ||
| 267 | BUG_ON(err); | ||
| 268 | return VM_FAULT_NOPAGE; | ||
| 269 | } else { | ||
| 270 | int err, ret = VM_FAULT_OOM; | ||
| 271 | |||
| 272 | mutex_lock(&xip_sparse_mutex); | ||
| 273 | write_seqcount_begin(&xip_sparse_seq); | ||
| 274 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, | ||
| 275 | &xip_mem, &xip_pfn); | ||
| 276 | if (unlikely(!error)) { | ||
| 277 | write_seqcount_end(&xip_sparse_seq); | ||
| 278 | mutex_unlock(&xip_sparse_mutex); | ||
| 279 | goto again; | ||
| 280 | } | ||
| 281 | if (error != -ENODATA) | ||
| 282 | goto out; | ||
| 283 | /* not shared and writable, use xip_sparse_page() */ | ||
| 284 | page = xip_sparse_page(); | ||
| 285 | if (!page) | ||
| 286 | goto out; | ||
| 287 | err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, | ||
| 288 | page); | ||
| 289 | if (err == -ENOMEM) | ||
| 290 | goto out; | ||
| 291 | |||
| 292 | ret = VM_FAULT_NOPAGE; | ||
| 293 | out: | ||
| 294 | write_seqcount_end(&xip_sparse_seq); | ||
| 295 | mutex_unlock(&xip_sparse_mutex); | ||
| 296 | |||
| 297 | return ret; | ||
| 298 | } | ||
| 299 | } | ||
| 300 | |||
| 301 | static const struct vm_operations_struct xip_file_vm_ops = { | ||
| 302 | .fault = xip_file_fault, | ||
| 303 | .page_mkwrite = filemap_page_mkwrite, | ||
| 304 | .remap_pages = generic_file_remap_pages, | ||
| 305 | }; | ||
| 306 | |||
| 307 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) | ||
| 308 | { | ||
| 309 | BUG_ON(!file->f_mapping->a_ops->get_xip_mem); | ||
| 310 | |||
| 311 | file_accessed(file); | ||
| 312 | vma->vm_ops = &xip_file_vm_ops; | ||
| 313 | vma->vm_flags |= VM_MIXEDMAP; | ||
| 314 | return 0; | ||
| 315 | } | ||
| 316 | EXPORT_SYMBOL_GPL(xip_file_mmap); | ||
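
Before this removal, a filesystem opted into these helpers by providing get_xip_mem() in its address_space_operations and pointing its file operations at the exported xip_* routines (ext2 mounted with -o xip worked this way). A hypothetical sketch of that wiring, assuming the get_xip_mem prototype implied by the calls in this file; the myfs_* names are invented:

    #include <linux/fs.h>
    #include <linux/mm.h>

    /* Hypothetical backend: translate (mapping, pgoff) into a kernel
     * virtual address and pfn of directly addressable media; 'create'
     * asks for a hole to be filled with a newly allocated block. */
    static int myfs_get_xip_mem(struct address_space *mapping, pgoff_t pgoff,
                                int create, void **kmem, unsigned long *pfn)
    {
        /* filesystem-specific block lookup / allocation would go here */
        return -ENODATA;        /* report a hole for this sketch */
    }

    static const struct address_space_operations myfs_xip_aops = {
        .get_xip_mem    = myfs_get_xip_mem,
    };

    static const struct file_operations myfs_xip_file_ops = {
        .read   = xip_file_read,        /* copies straight from media  */
        .write  = xip_file_write,
        .mmap   = xip_file_mmap,        /* installs xip_file_vm_ops    */
    };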
| 317 | |||
| 318 | static ssize_t | ||
| 319 | __xip_file_write(struct file *filp, const char __user *buf, | ||
| 320 | size_t count, loff_t pos, loff_t *ppos) | ||
| 321 | { | ||
| 322 | struct address_space * mapping = filp->f_mapping; | ||
| 323 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
| 324 | struct inode *inode = mapping->host; | ||
| 325 | long status = 0; | ||
| 326 | size_t bytes; | ||
| 327 | ssize_t written = 0; | ||
| 328 | |||
| 329 | BUG_ON(!mapping->a_ops->get_xip_mem); | ||
| 330 | |||
| 331 | do { | ||
| 332 | unsigned long index; | ||
| 333 | unsigned long offset; | ||
| 334 | size_t copied; | ||
| 335 | void *xip_mem; | ||
| 336 | unsigned long xip_pfn; | ||
| 337 | |||
| 338 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | ||
| 339 | index = pos >> PAGE_CACHE_SHIFT; | ||
| 340 | bytes = PAGE_CACHE_SIZE - offset; | ||
| 341 | if (bytes > count) | ||
| 342 | bytes = count; | ||
| 343 | |||
| 344 | status = a_ops->get_xip_mem(mapping, index, 0, | ||
| 345 | &xip_mem, &xip_pfn); | ||
| 346 | if (status == -ENODATA) { | ||
| 347 | /* sparse hole: allocate a new block, then unmap the sparse page */ | ||
| 348 | mutex_lock(&xip_sparse_mutex); | ||
| 349 | status = a_ops->get_xip_mem(mapping, index, 1, | ||
| 350 | &xip_mem, &xip_pfn); | ||
| 351 | mutex_unlock(&xip_sparse_mutex); | ||
| 352 | if (!status) | ||
| 353 | /* unmap page at pgoff from all other vmas */ | ||
| 354 | __xip_unmap(mapping, index); | ||
| 355 | } | ||
| 356 | |||
| 357 | if (status) | ||
| 358 | break; | ||
| 359 | |||
| 360 | copied = bytes - | ||
| 361 | __copy_from_user_nocache(xip_mem + offset, buf, bytes); | ||
| 362 | |||
| 363 | if (likely(copied > 0)) { | ||
| 364 | status = copied; | ||
| 365 | |||
| 366 | if (status >= 0) { | ||
| 367 | written += status; | ||
| 368 | count -= status; | ||
| 369 | pos += status; | ||
| 370 | buf += status; | ||
| 371 | } | ||
| 372 | } | ||
| 373 | if (unlikely(copied != bytes)) | ||
| 374 | if (status >= 0) | ||
| 375 | status = -EFAULT; | ||
| 376 | if (status < 0) | ||
| 377 | break; | ||
| 378 | } while (count); | ||
| 379 | *ppos = pos; | ||
| 380 | /* | ||
| 381 | * No need to use i_size_read() here, the i_size | ||
| 382 | * cannot change under us because we hold i_mutex. | ||
| 383 | */ | ||
| 384 | if (pos > inode->i_size) { | ||
| 385 | i_size_write(inode, pos); | ||
| 386 | mark_inode_dirty(inode); | ||
| 387 | } | ||
| 388 | |||
| 389 | return written ? written : status; | ||
| 390 | } | ||
| 391 | |||
| 392 | ssize_t | ||
| 393 | xip_file_write(struct file *filp, const char __user *buf, size_t len, | ||
| 394 | loff_t *ppos) | ||
| 395 | { | ||
| 396 | struct address_space *mapping = filp->f_mapping; | ||
| 397 | struct inode *inode = mapping->host; | ||
| 398 | size_t count; | ||
| 399 | loff_t pos; | ||
| 400 | ssize_t ret; | ||
| 401 | |||
| 402 | mutex_lock(&inode->i_mutex); | ||
| 403 | |||
| 404 | if (!access_ok(VERIFY_READ, buf, len)) { | ||
| 405 | ret=-EFAULT; | ||
| 406 | goto out_up; | ||
| 407 | } | ||
| 408 | |||
| 409 | pos = *ppos; | ||
| 410 | count = len; | ||
| 411 | |||
| 412 | /* We can write back this queue in page reclaim */ | ||
| 413 | current->backing_dev_info = mapping->backing_dev_info; | ||
| 414 | |||
| 415 | ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); | ||
| 416 | if (ret) | ||
| 417 | goto out_backing; | ||
| 418 | if (count == 0) | ||
| 419 | goto out_backing; | ||
| 420 | |||
| 421 | ret = file_remove_suid(filp); | ||
| 422 | if (ret) | ||
| 423 | goto out_backing; | ||
| 424 | |||
| 425 | ret = file_update_time(filp); | ||
| 426 | if (ret) | ||
| 427 | goto out_backing; | ||
| 428 | |||
| 429 | ret = __xip_file_write (filp, buf, count, pos, ppos); | ||
| 430 | |||
| 431 | out_backing: | ||
| 432 | current->backing_dev_info = NULL; | ||
| 433 | out_up: | ||
| 434 | mutex_unlock(&inode->i_mutex); | ||
| 435 | return ret; | ||
| 436 | } | ||
| 437 | EXPORT_SYMBOL_GPL(xip_file_write); | ||
| 438 | |||
| 439 | /* | ||
| 440 | * truncate a page used for execute in place; | ||
| 441 | * functionality is analogous to block_truncate_page but uses get_xip_mem | ||
| 442 | * to get the page instead of the page cache | ||
| 443 | */ | ||
| 444 | int | ||
| 445 | xip_truncate_page(struct address_space *mapping, loff_t from) | ||
| 446 | { | ||
| 447 | pgoff_t index = from >> PAGE_CACHE_SHIFT; | ||
| 448 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | ||
| 449 | unsigned blocksize; | ||
| 450 | unsigned length; | ||
| 451 | void *xip_mem; | ||
| 452 | unsigned long xip_pfn; | ||
| 453 | int err; | ||
| 454 | |||
| 455 | BUG_ON(!mapping->a_ops->get_xip_mem); | ||
| 456 | |||
| 457 | blocksize = 1 << mapping->host->i_blkbits; | ||
| 458 | length = offset & (blocksize - 1); | ||
| 459 | |||
| 460 | /* Block boundary? Nothing to do */ | ||
| 461 | if (!length) | ||
| 462 | return 0; | ||
| 463 | |||
| 464 | length = blocksize - length; | ||
| 465 | |||
| 466 | err = mapping->a_ops->get_xip_mem(mapping, index, 0, | ||
| 467 | &xip_mem, &xip_pfn); | ||
| 468 | if (unlikely(err)) { | ||
| 469 | if (err == -ENODATA) | ||
| 470 | /* Hole? No need to truncate */ | ||
| 471 | return 0; | ||
| 472 | else | ||
| 473 | return err; | ||
| 474 | } | ||
| 475 | memset(xip_mem + offset, 0, length); | ||
| 476 | return 0; | ||
| 477 | } | ||
| 478 | EXPORT_SYMBOL_GPL(xip_truncate_page); | ||
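
xip_truncate_page() zeroes only the tail of the filesystem block that contains the new end of file, and only when the cut is not block-aligned. The same arithmetic replayed in user space, with assumed example numbers:

    #include <stdio.h>

    /* Replay the xip_truncate_page() arithmetic for a truncation point
     * 'from' and a filesystem block size 'blocksize' (example values). */
    int main(void)
    {
        unsigned long page_size = 4096;
        unsigned long from = 10000;          /* new file size  */
        unsigned int blocksize = 1024;       /* 1 << i_blkbits */

        unsigned long index = from / page_size;            /* page holding EOF  */
        unsigned int offset = from & (page_size - 1);      /* byte inside page  */
        unsigned int length = offset & (blocksize - 1);    /* bytes used in blk */

        if (!length) {
            printf("block-aligned truncate: nothing to zero\n");
            return 0;
        }
        length = blocksize - length;                       /* tail to clear */

        printf("zero %u bytes of page %lu starting at offset %u\n",
               length, index, offset);
        return 0;
    }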
diff --git a/mm/fremap.c b/mm/fremap.c deleted file mode 100644 index 2805d71cf476..000000000000 --- a/mm/fremap.c +++ /dev/null | |||
| @@ -1,283 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/fremap.c | ||
| 3 | * | ||
| 4 | * Explicit pagetable population and nonlinear (random) mappings support. | ||
| 5 | * | ||
| 6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 | ||
| 7 | */ | ||
| 8 | #include <linux/export.h> | ||
| 9 | #include <linux/backing-dev.h> | ||
| 10 | #include <linux/mm.h> | ||
| 11 | #include <linux/swap.h> | ||
| 12 | #include <linux/file.h> | ||
| 13 | #include <linux/mman.h> | ||
| 14 | #include <linux/pagemap.h> | ||
| 15 | #include <linux/swapops.h> | ||
| 16 | #include <linux/rmap.h> | ||
| 17 | #include <linux/syscalls.h> | ||
| 18 | #include <linux/mmu_notifier.h> | ||
| 19 | |||
| 20 | #include <asm/mmu_context.h> | ||
| 21 | #include <asm/cacheflush.h> | ||
| 22 | #include <asm/tlbflush.h> | ||
| 23 | |||
| 24 | #include "internal.h" | ||
| 25 | |||
| 26 | static int mm_counter(struct page *page) | ||
| 27 | { | ||
| 28 | return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; | ||
| 29 | } | ||
| 30 | |||
| 31 | static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 32 | unsigned long addr, pte_t *ptep) | ||
| 33 | { | ||
| 34 | pte_t pte = *ptep; | ||
| 35 | struct page *page; | ||
| 36 | swp_entry_t entry; | ||
| 37 | |||
| 38 | if (pte_present(pte)) { | ||
| 39 | flush_cache_page(vma, addr, pte_pfn(pte)); | ||
| 40 | pte = ptep_clear_flush_notify(vma, addr, ptep); | ||
| 41 | page = vm_normal_page(vma, addr, pte); | ||
| 42 | if (page) { | ||
| 43 | if (pte_dirty(pte)) | ||
| 44 | set_page_dirty(page); | ||
| 45 | update_hiwater_rss(mm); | ||
| 46 | dec_mm_counter(mm, mm_counter(page)); | ||
| 47 | page_remove_rmap(page); | ||
| 48 | page_cache_release(page); | ||
| 49 | } | ||
| 50 | } else { /* zap_pte() is not called when pte_none() */ | ||
| 51 | if (!pte_file(pte)) { | ||
| 52 | update_hiwater_rss(mm); | ||
| 53 | entry = pte_to_swp_entry(pte); | ||
| 54 | if (non_swap_entry(entry)) { | ||
| 55 | if (is_migration_entry(entry)) { | ||
| 56 | page = migration_entry_to_page(entry); | ||
| 57 | dec_mm_counter(mm, mm_counter(page)); | ||
| 58 | } | ||
| 59 | } else { | ||
| 60 | free_swap_and_cache(entry); | ||
| 61 | dec_mm_counter(mm, MM_SWAPENTS); | ||
| 62 | } | ||
| 63 | } | ||
| 64 | pte_clear_not_present_full(mm, addr, ptep, 0); | ||
| 65 | } | ||
| 66 | } | ||
| 67 | |||
| 68 | /* | ||
| 69 | * Install a file pte to a given virtual memory address, release any | ||
| 70 | * previously existing mapping. | ||
| 71 | */ | ||
| 72 | static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 73 | unsigned long addr, unsigned long pgoff, pgprot_t prot) | ||
| 74 | { | ||
| 75 | int err = -ENOMEM; | ||
| 76 | pte_t *pte, ptfile; | ||
| 77 | spinlock_t *ptl; | ||
| 78 | |||
| 79 | pte = get_locked_pte(mm, addr, &ptl); | ||
| 80 | if (!pte) | ||
| 81 | goto out; | ||
| 82 | |||
| 83 | ptfile = pgoff_to_pte(pgoff); | ||
| 84 | |||
| 85 | if (!pte_none(*pte)) | ||
| 86 | zap_pte(mm, vma, addr, pte); | ||
| 87 | |||
| 88 | set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); | ||
| 89 | /* | ||
| 90 | * We don't need to run update_mmu_cache() here because the "file pte" | ||
| 91 | * being installed by install_file_pte() is not a real pte - it's a | ||
| 92 | * non-present entry (like a swap entry), noting what file offset should | ||
| 93 | * be mapped there when there's a fault (in a non-linear vma where | ||
| 94 | * that's not obvious). | ||
| 95 | */ | ||
| 96 | pte_unmap_unlock(pte, ptl); | ||
| 97 | err = 0; | ||
| 98 | out: | ||
| 99 | return err; | ||
| 100 | } | ||
| 101 | |||
| 102 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | ||
| 103 | unsigned long size, pgoff_t pgoff) | ||
| 104 | { | ||
| 105 | struct mm_struct *mm = vma->vm_mm; | ||
| 106 | int err; | ||
| 107 | |||
| 108 | do { | ||
| 109 | err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); | ||
| 110 | if (err) | ||
| 111 | return err; | ||
| 112 | |||
| 113 | size -= PAGE_SIZE; | ||
| 114 | addr += PAGE_SIZE; | ||
| 115 | pgoff++; | ||
| 116 | } while (size); | ||
| 117 | |||
| 118 | return 0; | ||
| 119 | } | ||
| 120 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
| 121 | |||
| 122 | /** | ||
| 123 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma | ||
| 124 | * @start: start of the remapped virtual memory range | ||
| 125 | * @size: size of the remapped virtual memory range | ||
| 126 | * @prot: new protection bits of the range (see NOTE) | ||
| 127 | * @pgoff: to-be-mapped page of the backing store file | ||
| 128 | * @flags: 0 or MAP_NONBLOCK - the latter causes no I/O. | ||
| 129 | * | ||
| 130 | * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma | ||
| 131 | * (shared backing store file). | ||
| 132 | * | ||
| 133 | * This syscall works purely via pagetables, so it's the most efficient | ||
| 134 | * way to map the same (large) file into a given virtual window. Unlike | ||
| 135 | * mmap()/mremap() it does not create any new vmas. The new mappings are | ||
| 136 | * also safe across swapout. | ||
| 137 | * | ||
| 138 | * NOTE: the @prot parameter right now is ignored (but must be zero), | ||
| 139 | * and the vma's default protection is used. Arbitrary protections | ||
| 140 | * might be implemented in the future. | ||
| 141 | */ | ||
| 142 | SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | ||
| 143 | unsigned long, prot, unsigned long, pgoff, unsigned long, flags) | ||
| 144 | { | ||
| 145 | struct mm_struct *mm = current->mm; | ||
| 146 | struct address_space *mapping; | ||
| 147 | struct vm_area_struct *vma; | ||
| 148 | int err = -EINVAL; | ||
| 149 | int has_write_lock = 0; | ||
| 150 | vm_flags_t vm_flags = 0; | ||
| 151 | |||
| 152 | pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " | ||
| 153 | "See Documentation/vm/remap_file_pages.txt.\n", | ||
| 154 | current->comm, current->pid); | ||
| 155 | |||
| 156 | if (prot) | ||
| 157 | return err; | ||
| 158 | /* | ||
| 159 | * Sanitize the syscall parameters: | ||
| 160 | */ | ||
| 161 | start = start & PAGE_MASK; | ||
| 162 | size = size & PAGE_MASK; | ||
| 163 | |||
| 164 | /* Does the address range wrap, or is the span zero-sized? */ | ||
| 165 | if (start + size <= start) | ||
| 166 | return err; | ||
| 167 | |||
| 168 | /* Does pgoff wrap? */ | ||
| 169 | if (pgoff + (size >> PAGE_SHIFT) < pgoff) | ||
| 170 | return err; | ||
| 171 | |||
| 172 | /* Can we represent this offset inside this architecture's pte's? */ | ||
| 173 | #if PTE_FILE_MAX_BITS < BITS_PER_LONG | ||
| 174 | if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) | ||
| 175 | return err; | ||
| 176 | #endif | ||
| 177 | |||
| 178 | /* We need down_write() to change vma->vm_flags. */ | ||
| 179 | down_read(&mm->mmap_sem); | ||
| 180 | retry: | ||
| 181 | vma = find_vma(mm, start); | ||
| 182 | |||
| 183 | /* | ||
| 184 | * Make sure the vma is shared, that it supports prefaulting, | ||
| 185 | * and that the remapped range is valid and fully within | ||
| 186 | * the single existing vma. | ||
| 187 | */ | ||
| 188 | if (!vma || !(vma->vm_flags & VM_SHARED)) | ||
| 189 | goto out; | ||
| 190 | |||
| 191 | if (!vma->vm_ops || !vma->vm_ops->remap_pages) | ||
| 192 | goto out; | ||
| 193 | |||
| 194 | if (start < vma->vm_start || start + size > vma->vm_end) | ||
| 195 | goto out; | ||
| 196 | |||
| 197 | /* Must set VM_NONLINEAR before any pages are populated. */ | ||
| 198 | if (!(vma->vm_flags & VM_NONLINEAR)) { | ||
| 199 | /* | ||
| 200 | * vm_private_data is used as a swapout cursor | ||
| 201 | * in a VM_NONLINEAR vma. | ||
| 202 | */ | ||
| 203 | if (vma->vm_private_data) | ||
| 204 | goto out; | ||
| 205 | |||
| 206 | /* Don't need a nonlinear mapping, exit success */ | ||
| 207 | if (pgoff == linear_page_index(vma, start)) { | ||
| 208 | err = 0; | ||
| 209 | goto out; | ||
| 210 | } | ||
| 211 | |||
| 212 | if (!has_write_lock) { | ||
| 213 | get_write_lock: | ||
| 214 | up_read(&mm->mmap_sem); | ||
| 215 | down_write(&mm->mmap_sem); | ||
| 216 | has_write_lock = 1; | ||
| 217 | goto retry; | ||
| 218 | } | ||
| 219 | mapping = vma->vm_file->f_mapping; | ||
| 220 | /* | ||
| 221 | * page_mkclean doesn't work on nonlinear vmas, so if | ||
| 222 | * dirty pages need to be accounted, emulate with linear | ||
| 223 | * vmas. | ||
| 224 | */ | ||
| 225 | if (mapping_cap_account_dirty(mapping)) { | ||
| 226 | unsigned long addr; | ||
| 227 | struct file *file = get_file(vma->vm_file); | ||
| 228 | /* mmap_region may free vma; grab the info now */ | ||
| 229 | vm_flags = vma->vm_flags; | ||
| 230 | |||
| 231 | addr = mmap_region(file, start, size, vm_flags, pgoff); | ||
| 232 | fput(file); | ||
| 233 | if (IS_ERR_VALUE(addr)) { | ||
| 234 | err = addr; | ||
| 235 | } else { | ||
| 236 | BUG_ON(addr != start); | ||
| 237 | err = 0; | ||
| 238 | } | ||
| 239 | goto out_freed; | ||
| 240 | } | ||
| 241 | i_mmap_lock_write(mapping); | ||
| 242 | flush_dcache_mmap_lock(mapping); | ||
| 243 | vma->vm_flags |= VM_NONLINEAR; | ||
| 244 | vma_interval_tree_remove(vma, &mapping->i_mmap); | ||
| 245 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | ||
| 246 | flush_dcache_mmap_unlock(mapping); | ||
| 247 | i_mmap_unlock_write(mapping); | ||
| 248 | } | ||
| 249 | |||
| 250 | if (vma->vm_flags & VM_LOCKED) { | ||
| 251 | /* | ||
| 252 | * drop PG_Mlocked flag for over-mapped range | ||
| 253 | */ | ||
| 254 | if (!has_write_lock) | ||
| 255 | goto get_write_lock; | ||
| 256 | vm_flags = vma->vm_flags; | ||
| 257 | munlock_vma_pages_range(vma, start, start + size); | ||
| 258 | vma->vm_flags = vm_flags; | ||
| 259 | } | ||
| 260 | |||
| 261 | mmu_notifier_invalidate_range_start(mm, start, start + size); | ||
| 262 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); | ||
| 263 | mmu_notifier_invalidate_range_end(mm, start, start + size); | ||
| 264 | |||
| 265 | /* | ||
| 266 | * We can't clear VM_NONLINEAR because we'd have to do | ||
| 267 | * it after ->populate completes, and that would prevent | ||
| 268 | * downgrading the lock. (Locks can't be upgraded). | ||
| 269 | */ | ||
| 270 | |||
| 271 | out: | ||
| 272 | if (vma) | ||
| 273 | vm_flags = vma->vm_flags; | ||
| 274 | out_freed: | ||
| 275 | if (likely(!has_write_lock)) | ||
| 276 | up_read(&mm->mmap_sem); | ||
| 277 | else | ||
| 278 | up_write(&mm->mmap_sem); | ||
| 279 | if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) | ||
| 280 | mm_populate(start, size); | ||
| 281 | |||
| 282 | return err; | ||
| 283 | } | ||
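
The nonlinear mapping this file implemented can be exercised from user space through the glibc wrapper; a minimal demonstration that rewires page 0 of a shared mapping to file page 2 might look like the sketch below (the file path and sizes are examples, error handling is trimmed). On kernels carrying this patch the pr_warn_once() above fires, and kernels that drop the implementation entirely are expected to emulate the call with ordinary mappings, so the observable effect stays the same:

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        int fd = open("/tmp/datafile", O_RDWR);  /* >= 4 pages; path is an example */
        char *map;

        if (fd < 0)
            return 1;

        map = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
                   MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
            return 1;

        /* Rewire the first page of the window to file page 2:
         * prot must be 0, pgoff is in pages, flags may be 0. */
        if (remap_file_pages(map, page, 0, 2, 0) != 0) {
            perror("remap_file_pages");
            return 1;
        }

        printf("first byte now reads from file offset %ld: %c\n",
               2 * page, map[0]);
        munmap(map, 4 * page);
        close(fd);
        return 0;
    }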
| @@ -55,7 +55,7 @@ retry: | |||
| 55 | */ | 55 | */ |
| 56 | if (likely(!(flags & FOLL_MIGRATION))) | 56 | if (likely(!(flags & FOLL_MIGRATION))) |
| 57 | goto no_page; | 57 | goto no_page; |
| 58 | if (pte_none(pte) || pte_file(pte)) | 58 | if (pte_none(pte)) |
| 59 | goto no_page; | 59 | goto no_page; |
| 60 | entry = pte_to_swp_entry(pte); | 60 | entry = pte_to_swp_entry(pte); |
| 61 | if (!is_migration_entry(entry)) | 61 | if (!is_migration_entry(entry)) |
| @@ -64,7 +64,7 @@ retry: | |||
| 64 | migration_entry_wait(mm, pmd, address); | 64 | migration_entry_wait(mm, pmd, address); |
| 65 | goto retry; | 65 | goto retry; |
| 66 | } | 66 | } |
| 67 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | 67 | if ((flags & FOLL_NUMA) && pte_protnone(pte)) |
| 68 | goto no_page; | 68 | goto no_page; |
| 69 | if ((flags & FOLL_WRITE) && !pte_write(pte)) { | 69 | if ((flags & FOLL_WRITE) && !pte_write(pte)) { |
| 70 | pte_unmap_unlock(ptep, ptl); | 70 | pte_unmap_unlock(ptep, ptl); |
| @@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
| 167 | if (pud_none(*pud)) | 167 | if (pud_none(*pud)) |
| 168 | return no_page_table(vma, flags); | 168 | return no_page_table(vma, flags); |
| 169 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { | 169 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { |
| 170 | if (flags & FOLL_GET) | 170 | page = follow_huge_pud(mm, address, pud, flags); |
| 171 | return NULL; | 171 | if (page) |
| 172 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | 172 | return page; |
| 173 | return page; | 173 | return no_page_table(vma, flags); |
| 174 | } | 174 | } |
| 175 | if (unlikely(pud_bad(*pud))) | 175 | if (unlikely(pud_bad(*pud))) |
| 176 | return no_page_table(vma, flags); | 176 | return no_page_table(vma, flags); |
| @@ -179,21 +179,12 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
| 179 | if (pmd_none(*pmd)) | 179 | if (pmd_none(*pmd)) |
| 180 | return no_page_table(vma, flags); | 180 | return no_page_table(vma, flags); |
| 181 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { | 181 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { |
| 182 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 182 | page = follow_huge_pmd(mm, address, pmd, flags); |
| 183 | if (flags & FOLL_GET) { | 183 | if (page) |
| 184 | /* | 184 | return page; |
| 185 | * Refcount on tail pages are not well-defined and | 185 | return no_page_table(vma, flags); |
| 186 | * shouldn't be taken. The caller should handle a NULL | ||
| 187 | * return when trying to follow tail pages. | ||
| 188 | */ | ||
| 189 | if (PageHead(page)) | ||
| 190 | get_page(page); | ||
| 191 | else | ||
| 192 | page = NULL; | ||
| 193 | } | ||
| 194 | return page; | ||
| 195 | } | 186 | } |
| 196 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | 187 | if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) |
| 197 | return no_page_table(vma, flags); | 188 | return no_page_table(vma, flags); |
| 198 | if (pmd_trans_huge(*pmd)) { | 189 | if (pmd_trans_huge(*pmd)) { |
| 199 | if (flags & FOLL_SPLIT) { | 190 | if (flags & FOLL_SPLIT) { |
| @@ -584,6 +575,185 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
| 584 | return 0; | 575 | return 0; |
| 585 | } | 576 | } |
| 586 | 577 | ||
| 578 | static __always_inline long __get_user_pages_locked(struct task_struct *tsk, | ||
| 579 | struct mm_struct *mm, | ||
| 580 | unsigned long start, | ||
| 581 | unsigned long nr_pages, | ||
| 582 | int write, int force, | ||
| 583 | struct page **pages, | ||
| 584 | struct vm_area_struct **vmas, | ||
| 585 | int *locked, bool notify_drop, | ||
| 586 | unsigned int flags) | ||
| 587 | { | ||
| 588 | long ret, pages_done; | ||
| 589 | bool lock_dropped; | ||
| 590 | |||
| 591 | if (locked) { | ||
| 592 | /* if VM_FAULT_RETRY can be returned, vmas become invalid */ | ||
| 593 | BUG_ON(vmas); | ||
| 594 | /* check caller initialized locked */ | ||
| 595 | BUG_ON(*locked != 1); | ||
| 596 | } | ||
| 597 | |||
| 598 | if (pages) | ||
| 599 | flags |= FOLL_GET; | ||
| 600 | if (write) | ||
| 601 | flags |= FOLL_WRITE; | ||
| 602 | if (force) | ||
| 603 | flags |= FOLL_FORCE; | ||
| 604 | |||
| 605 | pages_done = 0; | ||
| 606 | lock_dropped = false; | ||
| 607 | for (;;) { | ||
| 608 | ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, | ||
| 609 | vmas, locked); | ||
| 610 | if (!locked) | ||
| 611 | /* VM_FAULT_RETRY couldn't trigger, bypass */ | ||
| 612 | return ret; | ||
| 613 | |||
| 614 | /* VM_FAULT_RETRY cannot return errors */ | ||
| 615 | if (!*locked) { | ||
| 616 | BUG_ON(ret < 0); | ||
| 617 | BUG_ON(ret >= nr_pages); | ||
| 618 | } | ||
| 619 | |||
| 620 | if (!pages) | ||
| 621 | /* If it's a prefault don't insist harder */ | ||
| 622 | return ret; | ||
| 623 | |||
| 624 | if (ret > 0) { | ||
| 625 | nr_pages -= ret; | ||
| 626 | pages_done += ret; | ||
| 627 | if (!nr_pages) | ||
| 628 | break; | ||
| 629 | } | ||
| 630 | if (*locked) { | ||
| 631 | /* VM_FAULT_RETRY didn't trigger */ | ||
| 632 | if (!pages_done) | ||
| 633 | pages_done = ret; | ||
| 634 | break; | ||
| 635 | } | ||
| 636 | /* VM_FAULT_RETRY triggered, so seek to the faulting offset */ | ||
| 637 | pages += ret; | ||
| 638 | start += ret << PAGE_SHIFT; | ||
| 639 | |||
| 640 | /* | ||
| 641 | * Repeat on the address that fired VM_FAULT_RETRY | ||
| 642 | * without FAULT_FLAG_ALLOW_RETRY but with | ||
| 643 | * FAULT_FLAG_TRIED. | ||
| 644 | */ | ||
| 645 | *locked = 1; | ||
| 646 | lock_dropped = true; | ||
| 647 | down_read(&mm->mmap_sem); | ||
| 648 | ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, | ||
| 649 | pages, NULL, NULL); | ||
| 650 | if (ret != 1) { | ||
| 651 | BUG_ON(ret > 1); | ||
| 652 | if (!pages_done) | ||
| 653 | pages_done = ret; | ||
| 654 | break; | ||
| 655 | } | ||
| 656 | nr_pages--; | ||
| 657 | pages_done++; | ||
| 658 | if (!nr_pages) | ||
| 659 | break; | ||
| 660 | pages++; | ||
| 661 | start += PAGE_SIZE; | ||
| 662 | } | ||
| 663 | if (notify_drop && lock_dropped && *locked) { | ||
| 664 | /* | ||
| 665 | * We must let the caller know we temporarily dropped the lock | ||
| 666 | * and so the critical section protected by it was lost. | ||
| 667 | */ | ||
| 668 | up_read(&mm->mmap_sem); | ||
| 669 | *locked = 0; | ||
| 670 | } | ||
| 671 | return pages_done; | ||
| 672 | } | ||
| 673 | |||
| 674 | /* | ||
| 675 | * We can leverage the VM_FAULT_RETRY functionality in the page fault | ||
| 676 | * paths better by using either get_user_pages_locked() or | ||
| 677 | * get_user_pages_unlocked(). | ||
| 678 | * | ||
| 679 | * get_user_pages_locked() is suitable to replace the form: | ||
| 680 | * | ||
| 681 | * down_read(&mm->mmap_sem); | ||
| 682 | * do_something() | ||
| 683 | * get_user_pages(tsk, mm, ..., pages, NULL); | ||
| 684 | * up_read(&mm->mmap_sem); | ||
| 685 | * | ||
| 686 | * to: | ||
| 687 | * | ||
| 688 | * int locked = 1; | ||
| 689 | * down_read(&mm->mmap_sem); | ||
| 690 | * do_something() | ||
| 691 | * get_user_pages_locked(tsk, mm, ..., pages, &locked); | ||
| 692 | * if (locked) | ||
| 693 | * up_read(&mm->mmap_sem); | ||
| 694 | */ | ||
| 695 | long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 696 | unsigned long start, unsigned long nr_pages, | ||
| 697 | int write, int force, struct page **pages, | ||
| 698 | int *locked) | ||
| 699 | { | ||
| 700 | return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, | ||
| 701 | pages, NULL, locked, true, FOLL_TOUCH); | ||
| 702 | } | ||
| 703 | EXPORT_SYMBOL(get_user_pages_locked); | ||
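
The block comment above gives the conversion in shorthand; a slightly more concrete kernel-context sketch follows. pin_bufs() and its parameters are invented for illustration, only the get_user_pages_locked() call itself comes from this patch, and the sketch assumes the mmap_sem naming used throughout this file:

    #include <linux/mm.h>
    #include <linux/sched.h>

    /* Pin 'nr' user pages starting at 'start' for reading, letting the
     * fault handler drop mmap_sem (VM_FAULT_RETRY) while it sleeps. */
    static long pin_bufs(unsigned long start, unsigned long nr,
                         struct page **pages)
    {
        int locked = 1;
        long ret;

        down_read(&current->mm->mmap_sem);
        ret = get_user_pages_locked(current, current->mm, start, nr,
                                    0 /* write */, 0 /* force */,
                                    pages, &locked);
        if (locked)
            up_read(&current->mm->mmap_sem);  /* otherwise already dropped */
        return ret;
    }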
| 704 | |||
| 705 | /* | ||
| 706 | * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows passing | ||
| 707 | * additional gup_flags as the last parameter (like FOLL_HWPOISON). | ||
| 708 | * | ||
| 709 | * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the | ||
| 710 | * caller if required (just like with __get_user_pages). "FOLL_GET", | ||
| 711 | * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed | ||
| 712 | * according to the parameters "pages", "write", "force" | ||
| 713 | * respectively. | ||
| 714 | */ | ||
| 715 | __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 716 | unsigned long start, unsigned long nr_pages, | ||
| 717 | int write, int force, struct page **pages, | ||
| 718 | unsigned int gup_flags) | ||
| 719 | { | ||
| 720 | long ret; | ||
| 721 | int locked = 1; | ||
| 722 | down_read(&mm->mmap_sem); | ||
| 723 | ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, | ||
| 724 | pages, NULL, &locked, false, gup_flags); | ||
| 725 | if (locked) | ||
| 726 | up_read(&mm->mmap_sem); | ||
| 727 | return ret; | ||
| 728 | } | ||
| 729 | EXPORT_SYMBOL(__get_user_pages_unlocked); | ||
| 730 | |||
| 731 | /* | ||
| 732 | * get_user_pages_unlocked() is suitable to replace the form: | ||
| 733 | * | ||
| 734 | * down_read(&mm->mmap_sem); | ||
| 735 | * get_user_pages(tsk, mm, ..., pages, NULL); | ||
| 736 | * up_read(&mm->mmap_sem); | ||
| 737 | * | ||
| 738 | * with: | ||
| 739 | * | ||
| 740 | * get_user_pages_unlocked(tsk, mm, ..., pages); | ||
| 741 | * | ||
| 742 | * It is functionally equivalent to get_user_pages_fast, so | ||
| 743 | * get_user_pages_fast should be used instead whenever the two | ||
| 744 | * parameters "tsk" and "mm" are current and current->mm respectively | ||
| 745 | * and "force" would be 0 (get_user_pages_fast has no "force" | ||
| 746 | * parameter). | ||
| 747 | */ | ||
| 748 | long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 749 | unsigned long start, unsigned long nr_pages, | ||
| 750 | int write, int force, struct page **pages) | ||
| 751 | { | ||
| 752 | return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, | ||
| 753 | force, pages, FOLL_TOUCH); | ||
| 754 | } | ||
| 755 | EXPORT_SYMBOL(get_user_pages_unlocked); | ||
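
When nothing else has to run under mmap_sem, the unlocked variant removes the explicit locking completely. Below is a hedged sketch of a caller pinning a read-only user buffer and dropping the references afterwards; with_pinned_pages() and its arguments are invented, while get_user_pages_unlocked() and page_cache_release() are the calls visible in this diff:

    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/sched.h>

    /* Pin a user buffer for a read-only transfer and drop the references
     * again once the pages have been consumed. */
    static long with_pinned_pages(unsigned long start, unsigned long nr,
                                  struct page **pages)
    {
        long got, i;

        got = get_user_pages_unlocked(current, current->mm, start, nr,
                                      0 /* write */, 0 /* force */, pages);
        if (got <= 0)
            return got;

        /* ... access the page contents, e.g. via kmap() ... */

        for (i = 0; i < got; i++)
            page_cache_release(pages[i]);  /* undo the FOLL_GET reference */
        return got;
    }

As the comment above notes, when tsk is current, mm is current->mm and force is 0, get_user_pages_fast(start, nr, write, pages) is the shorter route to the same result.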
| 756 | |||
| 587 | /* | 757 | /* |
| 588 | * get_user_pages() - pin user pages in memory | 758 | * get_user_pages() - pin user pages in memory |
| 589 | * @tsk: the task_struct to use for page fault accounting, or | 759 | * @tsk: the task_struct to use for page fault accounting, or |
| @@ -633,22 +803,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
| 633 | * use the correct cache flushing APIs. | 803 | * use the correct cache flushing APIs. |
| 634 | * | 804 | * |
| 635 | * See also get_user_pages_fast, for performance critical applications. | 805 | * See also get_user_pages_fast, for performance critical applications. |
| 806 | * | ||
| 807 | * get_user_pages should be phased out in favor of | ||
| 808 | * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing | ||
| 809 | * should use get_user_pages because it cannot pass | ||
| 810 | * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. | ||
| 636 | */ | 811 | */ |
| 637 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 812 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 638 | unsigned long start, unsigned long nr_pages, int write, | 813 | unsigned long start, unsigned long nr_pages, int write, |
| 639 | int force, struct page **pages, struct vm_area_struct **vmas) | 814 | int force, struct page **pages, struct vm_area_struct **vmas) |
| 640 | { | 815 | { |
| 641 | int flags = FOLL_TOUCH; | 816 | return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, |
| 642 | 817 | pages, vmas, NULL, false, FOLL_TOUCH); | |
| 643 | if (pages) | ||
| 644 | flags |= FOLL_GET; | ||
| 645 | if (write) | ||
| 646 | flags |= FOLL_WRITE; | ||
| 647 | if (force) | ||
| 648 | flags |= FOLL_FORCE; | ||
| 649 | |||
| 650 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, | ||
| 651 | NULL); | ||
| 652 | } | 818 | } |
| 653 | EXPORT_SYMBOL(get_user_pages); | 819 | EXPORT_SYMBOL(get_user_pages); |
| 654 | 820 | ||
| @@ -740,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
| 740 | 906 | ||
| 741 | /* | 907 | /* |
| 742 | * Similar to the PMD case below, NUMA hinting must take slow | 908 | * Similar to the PMD case below, NUMA hinting must take slow |
| 743 | * path | 909 | * path using the pte_protnone check. |
| 744 | */ | 910 | */ |
| 745 | if (!pte_present(pte) || pte_special(pte) || | 911 | if (!pte_present(pte) || pte_special(pte) || |
| 746 | pte_numa(pte) || (write && !pte_write(pte))) | 912 | pte_protnone(pte) || (write && !pte_write(pte))) |
| 747 | goto pte_unmap; | 913 | goto pte_unmap; |
| 748 | 914 | ||
| 749 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 915 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
| @@ -926,7 +1092,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
| 926 | 1092 | ||
| 927 | pmdp = pmd_offset(&pud, addr); | 1093 | pmdp = pmd_offset(&pud, addr); |
| 928 | do { | 1094 | do { |
| 929 | pmd_t pmd = ACCESS_ONCE(*pmdp); | 1095 | pmd_t pmd = READ_ONCE(*pmdp); |
| 930 | 1096 | ||
| 931 | next = pmd_addr_end(addr, end); | 1097 | next = pmd_addr_end(addr, end); |
| 932 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | 1098 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) |
| @@ -938,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
| 938 | * slowpath for accounting purposes and so that they | 1104 | * slowpath for accounting purposes and so that they |
| 939 | * can be serialised against THP migration. | 1105 | * can be serialised against THP migration. |
| 940 | */ | 1106 | */ |
| 941 | if (pmd_numa(pmd)) | 1107 | if (pmd_protnone(pmd)) |
| 942 | return 0; | 1108 | return 0; |
| 943 | 1109 | ||
| 944 | if (!gup_huge_pmd(pmd, pmdp, addr, next, write, | 1110 | if (!gup_huge_pmd(pmd, pmdp, addr, next, write, |
| @@ -1077,10 +1243,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
| 1077 | start += nr << PAGE_SHIFT; | 1243 | start += nr << PAGE_SHIFT; |
| 1078 | pages += nr; | 1244 | pages += nr; |
| 1079 | 1245 | ||
| 1080 | down_read(&mm->mmap_sem); | 1246 | ret = get_user_pages_unlocked(current, mm, start, |
| 1081 | ret = get_user_pages(current, mm, start, | 1247 | nr_pages - nr, write, 0, pages); |
| 1082 | nr_pages - nr, write, 0, pages, NULL); | ||
| 1083 | up_read(&mm->mmap_sem); | ||
| 1084 | 1248 | ||
| 1085 | /* Have to be a bit careful with return values */ | 1249 | /* Have to be a bit careful with return values */ |
| 1086 | if (nr > 0) { | 1250 | if (nr > 0) { |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 817a875f2b8c..fc00c8cb5a82 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
| @@ -171,12 +171,7 @@ static int start_khugepaged(void) | |||
| 171 | } | 171 | } |
| 172 | 172 | ||
| 173 | static atomic_t huge_zero_refcount; | 173 | static atomic_t huge_zero_refcount; |
| 174 | static struct page *huge_zero_page __read_mostly; | 174 | struct page *huge_zero_page __read_mostly; |
| 175 | |||
| 176 | static inline bool is_huge_zero_page(struct page *page) | ||
| 177 | { | ||
| 178 | return ACCESS_ONCE(huge_zero_page) == page; | ||
| 179 | } | ||
| 180 | 175 | ||
| 181 | static inline bool is_huge_zero_pmd(pmd_t pmd) | 176 | static inline bool is_huge_zero_pmd(pmd_t pmd) |
| 182 | { | 177 | { |
| @@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) | |||
| 766 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; | 761 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; |
| 767 | } | 762 | } |
| 768 | 763 | ||
| 769 | static inline struct page *alloc_hugepage_vma(int defrag, | ||
| 770 | struct vm_area_struct *vma, | ||
| 771 | unsigned long haddr, int nd, | ||
| 772 | gfp_t extra_gfp) | ||
| 773 | { | ||
| 774 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), | ||
| 775 | HPAGE_PMD_ORDER, vma, haddr, nd); | ||
| 776 | } | ||
| 777 | |||
| 778 | /* Caller must hold page table lock. */ | 764 | /* Caller must hold page table lock. */ |
| 779 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | 765 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, |
| 780 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | 766 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, |
| @@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 795 | unsigned long address, pmd_t *pmd, | 781 | unsigned long address, pmd_t *pmd, |
| 796 | unsigned int flags) | 782 | unsigned int flags) |
| 797 | { | 783 | { |
| 784 | gfp_t gfp; | ||
| 798 | struct page *page; | 785 | struct page *page; |
| 799 | unsigned long haddr = address & HPAGE_PMD_MASK; | 786 | unsigned long haddr = address & HPAGE_PMD_MASK; |
| 800 | 787 | ||
| @@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 829 | } | 816 | } |
| 830 | return 0; | 817 | return 0; |
| 831 | } | 818 | } |
| 832 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 819 | gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); |
| 833 | vma, haddr, numa_node_id(), 0); | 820 | page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); |
| 834 | if (unlikely(!page)) { | 821 | if (unlikely(!page)) { |
| 835 | count_vm_event(THP_FAULT_FALLBACK); | 822 | count_vm_event(THP_FAULT_FALLBACK); |
| 836 | return VM_FAULT_FALLBACK; | 823 | return VM_FAULT_FALLBACK; |
| @@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1118 | spin_unlock(ptl); | 1105 | spin_unlock(ptl); |
| 1119 | alloc: | 1106 | alloc: |
| 1120 | if (transparent_hugepage_enabled(vma) && | 1107 | if (transparent_hugepage_enabled(vma) && |
| 1121 | !transparent_hugepage_debug_cow()) | 1108 | !transparent_hugepage_debug_cow()) { |
| 1122 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 1109 | gfp_t gfp; |
| 1123 | vma, haddr, numa_node_id(), 0); | 1110 | |
| 1124 | else | 1111 | gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); |
| 1112 | new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); | ||
| 1113 | } else | ||
| 1125 | new_page = NULL; | 1114 | new_page = NULL; |
| 1126 | 1115 | ||
| 1127 | if (unlikely(!new_page)) { | 1116 | if (unlikely(!new_page)) { |
| @@ -1222,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
| 1222 | return ERR_PTR(-EFAULT); | 1211 | return ERR_PTR(-EFAULT); |
| 1223 | 1212 | ||
| 1224 | /* Full NUMA hinting faults to serialise migration in fault paths */ | 1213 | /* Full NUMA hinting faults to serialise migration in fault paths */ |
| 1225 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | 1214 | if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) |
| 1226 | goto out; | 1215 | goto out; |
| 1227 | 1216 | ||
| 1228 | page = pmd_page(*pmd); | 1217 | page = pmd_page(*pmd); |
| @@ -1273,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1273 | bool migrated = false; | 1262 | bool migrated = false; |
| 1274 | int flags = 0; | 1263 | int flags = 0; |
| 1275 | 1264 | ||
| 1265 | /* A PROT_NONE fault should not end up here */ | ||
| 1266 | BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); | ||
| 1267 | |||
| 1276 | ptl = pmd_lock(mm, pmdp); | 1268 | ptl = pmd_lock(mm, pmdp); |
| 1277 | if (unlikely(!pmd_same(pmd, *pmdp))) | 1269 | if (unlikely(!pmd_same(pmd, *pmdp))) |
| 1278 | goto out_unlock; | 1270 | goto out_unlock; |
| @@ -1283,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1283 | * check_same as the page may no longer be mapped. | 1275 | * check_same as the page may no longer be mapped. |
| 1284 | */ | 1276 | */ |
| 1285 | if (unlikely(pmd_trans_migrating(*pmdp))) { | 1277 | if (unlikely(pmd_trans_migrating(*pmdp))) { |
| 1278 | page = pmd_page(*pmdp); | ||
| 1286 | spin_unlock(ptl); | 1279 | spin_unlock(ptl); |
| 1287 | wait_migrate_huge_page(vma->anon_vma, pmdp); | 1280 | wait_on_page_locked(page); |
| 1288 | goto out; | 1281 | goto out; |
| 1289 | } | 1282 | } |
| 1290 | 1283 | ||
| @@ -1352,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1352 | 1345 | ||
| 1353 | /* | 1346 | /* |
| 1354 | * Migrate the THP to the requested node, returns with page unlocked | 1347 | * Migrate the THP to the requested node, returns with page unlocked |
| 1355 | * and pmd_numa cleared. | 1348 | * and access rights restored. |
| 1356 | */ | 1349 | */ |
| 1357 | spin_unlock(ptl); | 1350 | spin_unlock(ptl); |
| 1358 | migrated = migrate_misplaced_transhuge_page(mm, vma, | 1351 | migrated = migrate_misplaced_transhuge_page(mm, vma, |
| @@ -1365,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1365 | goto out; | 1358 | goto out; |
| 1366 | clear_pmdnuma: | 1359 | clear_pmdnuma: |
| 1367 | BUG_ON(!PageLocked(page)); | 1360 | BUG_ON(!PageLocked(page)); |
| 1368 | pmd = pmd_mknonnuma(pmd); | 1361 | pmd = pmd_modify(pmd, vma->vm_page_prot); |
| 1369 | set_pmd_at(mm, haddr, pmdp, pmd); | 1362 | set_pmd_at(mm, haddr, pmdp, pmd); |
| 1370 | VM_BUG_ON(pmd_numa(*pmdp)); | ||
| 1371 | update_mmu_cache_pmd(vma, addr, pmdp); | 1363 | update_mmu_cache_pmd(vma, addr, pmdp); |
| 1372 | unlock_page(page); | 1364 | unlock_page(page); |
| 1373 | out_unlock: | 1365 | out_unlock: |
| @@ -1423,26 +1415,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 1423 | return ret; | 1415 | return ret; |
| 1424 | } | 1416 | } |
| 1425 | 1417 | ||
| 1426 | int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | ||
| 1427 | unsigned long addr, unsigned long end, | ||
| 1428 | unsigned char *vec) | ||
| 1429 | { | ||
| 1430 | spinlock_t *ptl; | ||
| 1431 | int ret = 0; | ||
| 1432 | |||
| 1433 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | ||
| 1434 | /* | ||
| 1435 | * All logical pages in the range are present | ||
| 1436 | * if backed by a huge page. | ||
| 1437 | */ | ||
| 1438 | spin_unlock(ptl); | ||
| 1439 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
| 1440 | ret = 1; | ||
| 1441 | } | ||
| 1442 | |||
| 1443 | return ret; | ||
| 1444 | } | ||
| 1445 | |||
| 1446 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | 1418 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, |
| 1447 | unsigned long old_addr, | 1419 | unsigned long old_addr, |
| 1448 | unsigned long new_addr, unsigned long old_end, | 1420 | unsigned long new_addr, unsigned long old_end, |
| @@ -1510,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 1510 | 1482 | ||
| 1511 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 1483 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
| 1512 | pmd_t entry; | 1484 | pmd_t entry; |
| 1513 | ret = 1; | 1485 | |
| 1514 | if (!prot_numa) { | 1486 | /* |
| 1487 | * Avoid trapping faults against the zero page. The read-only | ||
| 1488 | * data is likely to be read-cached on the local CPU and | ||
| 1489 | * local/remote hits to the zero page are not interesting. | ||
| 1490 | */ | ||
| 1491 | if (prot_numa && is_huge_zero_pmd(*pmd)) { | ||
| 1492 | spin_unlock(ptl); | ||
| 1493 | return 0; | ||
| 1494 | } | ||
| 1495 | |||
| 1496 | if (!prot_numa || !pmd_protnone(*pmd)) { | ||
| 1497 | ret = 1; | ||
| 1515 | entry = pmdp_get_and_clear_notify(mm, addr, pmd); | 1498 | entry = pmdp_get_and_clear_notify(mm, addr, pmd); |
| 1516 | if (pmd_numa(entry)) | ||
| 1517 | entry = pmd_mknonnuma(entry); | ||
| 1518 | entry = pmd_modify(entry, newprot); | 1499 | entry = pmd_modify(entry, newprot); |
| 1519 | ret = HPAGE_PMD_NR; | 1500 | ret = HPAGE_PMD_NR; |
| 1520 | set_pmd_at(mm, addr, pmd, entry); | 1501 | set_pmd_at(mm, addr, pmd, entry); |
| 1521 | BUG_ON(pmd_write(entry)); | 1502 | BUG_ON(pmd_write(entry)); |
| 1522 | } else { | ||
| 1523 | struct page *page = pmd_page(*pmd); | ||
| 1524 | |||
| 1525 | /* | ||
| 1526 | * Do not trap faults against the zero page. The | ||
| 1527 | * read-only data is likely to be read-cached on the | ||
| 1528 | * local CPU cache and it is less useful to know about | ||
| 1529 | * local vs remote hits on the zero page. | ||
| 1530 | */ | ||
| 1531 | if (!is_huge_zero_page(page) && | ||
| 1532 | !pmd_numa(*pmd)) { | ||
| 1533 | pmdp_set_numa(mm, addr, pmd); | ||
| 1534 | ret = HPAGE_PMD_NR; | ||
| 1535 | } | ||
| 1536 | } | 1503 | } |
| 1537 | spin_unlock(ptl); | 1504 | spin_unlock(ptl); |
| 1538 | } | 1505 | } |
| @@ -1797,9 +1764,9 @@ static int __split_huge_page_map(struct page *page, | |||
| 1797 | pte_t *pte, entry; | 1764 | pte_t *pte, entry; |
| 1798 | BUG_ON(PageCompound(page+i)); | 1765 | BUG_ON(PageCompound(page+i)); |
| 1799 | /* | 1766 | /* |
| 1800 | * Note that pmd_numa is not transferred deliberately | 1767 | * Note that NUMA hinting access restrictions are not |
| 1801 | * to avoid any possibility that pte_numa leaks to | 1768 | * transferred to avoid any possibility of altering |
| 1802 | * a PROT_NONE VMA by accident. | 1769 | * permissions across VMAs. |
| 1803 | */ | 1770 | */ |
| 1804 | entry = mk_pte(page + i, vma->vm_page_prot); | 1771 | entry = mk_pte(page + i, vma->vm_page_prot); |
| 1805 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1772 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| @@ -2148,7 +2115,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
| 2148 | { | 2115 | { |
| 2149 | struct page *page; | 2116 | struct page *page; |
| 2150 | pte_t *_pte; | 2117 | pte_t *_pte; |
| 2151 | int referenced = 0, none = 0; | 2118 | int none = 0; |
| 2119 | bool referenced = false, writable = false; | ||
| 2152 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | 2120 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; |
| 2153 | _pte++, address += PAGE_SIZE) { | 2121 | _pte++, address += PAGE_SIZE) { |
| 2154 | pte_t pteval = *_pte; | 2122 | pte_t pteval = *_pte; |
| @@ -2158,7 +2126,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
| 2158 | else | 2126 | else |
| 2159 | goto out; | 2127 | goto out; |
| 2160 | } | 2128 | } |
| 2161 | if (!pte_present(pteval) || !pte_write(pteval)) | 2129 | if (!pte_present(pteval)) |
| 2162 | goto out; | 2130 | goto out; |
| 2163 | page = vm_normal_page(vma, address, pteval); | 2131 | page = vm_normal_page(vma, address, pteval); |
| 2164 | if (unlikely(!page)) | 2132 | if (unlikely(!page)) |
| @@ -2168,9 +2136,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
| 2168 | VM_BUG_ON_PAGE(!PageAnon(page), page); | 2136 | VM_BUG_ON_PAGE(!PageAnon(page), page); |
| 2169 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | 2137 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
| 2170 | 2138 | ||
| 2171 | /* cannot use mapcount: can't collapse if there's a gup pin */ | ||
| 2172 | if (page_count(page) != 1) | ||
| 2173 | goto out; | ||
| 2174 | /* | 2139 | /* |
| 2175 | * We can do it before isolate_lru_page because the | 2140 | * We can do it before isolate_lru_page because the |
| 2176 | * page can't be freed from under us. NOTE: PG_lock | 2141 | * page can't be freed from under us. NOTE: PG_lock |
| @@ -2179,6 +2144,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
| 2179 | */ | 2144 | */ |
| 2180 | if (!trylock_page(page)) | 2145 | if (!trylock_page(page)) |
| 2181 | goto out; | 2146 | goto out; |
| 2147 | |||
| 2148 | /* | ||
| 2149 | * cannot use mapcount: can't collapse if there's a gup pin. | ||
| 2150 | * The page must only be referenced by the scanned process | ||
| 2151 | * and page swap cache. | ||
| 2152 | */ | ||
| 2153 | if (page_count(page) != 1 + !!PageSwapCache(page)) { | ||
| 2154 | unlock_page(page); | ||
| 2155 | goto out; | ||
| 2156 | } | ||
| 2157 | if (pte_write(pteval)) { | ||
| 2158 | writable = true; | ||
| 2159 | } else { | ||
| 2160 | if (PageSwapCache(page) && !reuse_swap_page(page)) { | ||
| 2161 | unlock_page(page); | ||
| 2162 | goto out; | ||
| 2163 | } | ||
| 2164 | /* | ||
| 2165 | * Page is not in the swap cache. It can be collapsed | ||
| 2166 | * into a THP. | ||
| 2167 | */ | ||
| 2168 | } | ||
| 2169 | |||
| 2182 | /* | 2170 | /* |
| 2183 | * Isolate the page to avoid collapsing an hugepage | 2171 | * Isolate the page to avoid collapsing an hugepage |
| 2184 | * currently in use by the VM. | 2172 | * currently in use by the VM. |
| @@ -2195,9 +2183,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
| 2195 | /* If there is no mapped pte young don't collapse the page */ | 2183 | /* If there is no mapped pte young don't collapse the page */ |
| 2196 | if (pte_young(pteval) || PageReferenced(page) || | 2184 | if (pte_young(pteval) || PageReferenced(page) || |
| 2197 | mmu_notifier_test_young(vma->vm_mm, address)) | 2185 | mmu_notifier_test_young(vma->vm_mm, address)) |
| 2198 | referenced = 1; | 2186 | referenced = true; |
| 2199 | } | 2187 | } |
| 2200 | if (likely(referenced)) | 2188 | if (likely(referenced && writable)) |
| 2201 | return 1; | 2189 | return 1; |
| 2202 | out: | 2190 | out: |
| 2203 | release_pte_pages(pte, _pte); | 2191 | release_pte_pages(pte, _pte); |
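
The reworked pin check encodes a simple expectation: a collapsible page should be referenced once by the scanned mapping, plus once more if it also sits in the swap cache; any extra reference is assumed to be a gup pin and aborts the collapse. Spelled out as an illustrative helper that is not part of the patch:

    #include <linux/mm.h>

    /* Illustrative only: the reference count khugepaged tolerates before
     * refusing to collapse a page, namely one for the scanned mapping
     * plus one if the page is also in the swap cache.  Anything higher
     * is treated as a gup pin. */
    static inline int collapse_expected_refcount(struct page *page)
    {
        return 1 + !!PageSwapCache(page);
    }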
| @@ -2550,11 +2538,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2550 | { | 2538 | { |
| 2551 | pmd_t *pmd; | 2539 | pmd_t *pmd; |
| 2552 | pte_t *pte, *_pte; | 2540 | pte_t *pte, *_pte; |
| 2553 | int ret = 0, referenced = 0, none = 0; | 2541 | int ret = 0, none = 0; |
| 2554 | struct page *page; | 2542 | struct page *page; |
| 2555 | unsigned long _address; | 2543 | unsigned long _address; |
| 2556 | spinlock_t *ptl; | 2544 | spinlock_t *ptl; |
| 2557 | int node = NUMA_NO_NODE; | 2545 | int node = NUMA_NO_NODE; |
| 2546 | bool writable = false, referenced = false; | ||
| 2558 | 2547 | ||
| 2559 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2548 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
| 2560 | 2549 | ||
| @@ -2573,8 +2562,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2573 | else | 2562 | else |
| 2574 | goto out_unmap; | 2563 | goto out_unmap; |
| 2575 | } | 2564 | } |
| 2576 | if (!pte_present(pteval) || !pte_write(pteval)) | 2565 | if (!pte_present(pteval)) |
| 2577 | goto out_unmap; | 2566 | goto out_unmap; |
| 2567 | if (pte_write(pteval)) | ||
| 2568 | writable = true; | ||
| 2569 | |||
| 2578 | page = vm_normal_page(vma, _address, pteval); | 2570 | page = vm_normal_page(vma, _address, pteval); |
| 2579 | if (unlikely(!page)) | 2571 | if (unlikely(!page)) |
| 2580 | goto out_unmap; | 2572 | goto out_unmap; |
| @@ -2591,14 +2583,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2591 | VM_BUG_ON_PAGE(PageCompound(page), page); | 2583 | VM_BUG_ON_PAGE(PageCompound(page), page); |
| 2592 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2584 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
| 2593 | goto out_unmap; | 2585 | goto out_unmap; |
| 2594 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 2586 | /* |
| 2595 | if (page_count(page) != 1) | 2587 | * cannot use mapcount: can't collapse if there's a gup pin. |
| 2588 | * The page must only be referenced by the scanned process | ||
| 2589 | * and page swap cache. | ||
| 2590 | */ | ||
| 2591 | if (page_count(page) != 1 + !!PageSwapCache(page)) | ||
| 2596 | goto out_unmap; | 2592 | goto out_unmap; |
| 2597 | if (pte_young(pteval) || PageReferenced(page) || | 2593 | if (pte_young(pteval) || PageReferenced(page) || |
| 2598 | mmu_notifier_test_young(vma->vm_mm, address)) | 2594 | mmu_notifier_test_young(vma->vm_mm, address)) |
| 2599 | referenced = 1; | 2595 | referenced = true; |
| 2600 | } | 2596 | } |
| 2601 | if (referenced) | 2597 | if (referenced && writable) |
| 2602 | ret = 1; | 2598 | ret = 1; |
| 2603 | out_unmap: | 2599 | out_unmap: |
| 2604 | pte_unmap_unlock(pte, ptl); | 2600 | pte_unmap_unlock(pte, ptl); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85032de5e20f..0a9ac6c26832 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -35,7 +35,7 @@ | |||
| 35 | #include <linux/node.h> | 35 | #include <linux/node.h> |
| 36 | #include "internal.h" | 36 | #include "internal.h" |
| 37 | 37 | ||
| 38 | unsigned long hugepages_treat_as_movable; | 38 | int hugepages_treat_as_movable; |
| 39 | 39 | ||
| 40 | int hugetlb_max_hstate __read_mostly; | 40 | int hugetlb_max_hstate __read_mostly; |
| 41 | unsigned int default_hstate_idx; | 41 | unsigned int default_hstate_idx; |
| @@ -2657,9 +2657,10 @@ again: | |||
| 2657 | goto unlock; | 2657 | goto unlock; |
| 2658 | 2658 | ||
| 2659 | /* | 2659 | /* |
| 2660 | * HWPoisoned hugepage is already unmapped and dropped reference | 2660 | * Migrating hugepage or HWPoisoned hugepage is already |
| 2661 | * unmapped and its refcount is dropped, so just clear pte here. | ||
| 2661 | */ | 2662 | */ |
| 2662 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { | 2663 | if (unlikely(!pte_present(pte))) { |
| 2663 | huge_pte_clear(mm, address, ptep); | 2664 | huge_pte_clear(mm, address, ptep); |
| 2664 | goto unlock; | 2665 | goto unlock; |
| 2665 | } | 2666 | } |
| @@ -3134,6 +3135,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3134 | struct page *pagecache_page = NULL; | 3135 | struct page *pagecache_page = NULL; |
| 3135 | struct hstate *h = hstate_vma(vma); | 3136 | struct hstate *h = hstate_vma(vma); |
| 3136 | struct address_space *mapping; | 3137 | struct address_space *mapping; |
| 3138 | int need_wait_lock = 0; | ||
| 3137 | 3139 | ||
| 3138 | address &= huge_page_mask(h); | 3140 | address &= huge_page_mask(h); |
| 3139 | 3141 | ||
| @@ -3172,6 +3174,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3172 | ret = 0; | 3174 | ret = 0; |
| 3173 | 3175 | ||
| 3174 | /* | 3176 | /* |
| 3177 | * entry could be a migration/hwpoison entry at this point, so this | ||
| 3178 | * check prevents the kernel from proceeding below on the assumption | ||
| 3179 | * that we have an active hugepage in the pagecache. This goto expects | ||
| 3180 | * the 2nd page fault, where the is_hugetlb_entry_(migration|hwpoisoned) | ||
| 3181 | * checks will handle it properly. | ||
| 3182 | */ | ||
| 3183 | if (!pte_present(entry)) | ||
| 3184 | goto out_mutex; | ||
| 3185 | |||
| 3186 | /* | ||
| 3175 | * If we are going to COW the mapping later, we examine the pending | 3187 | * If we are going to COW the mapping later, we examine the pending |
| 3176 | * reservations for this page now. This will ensure that any | 3188 | * reservations for this page now. This will ensure that any |
| 3177 | * allocations necessary to record that reservation occur outside the | 3189 | * allocations necessary to record that reservation occur outside the |
| @@ -3190,30 +3202,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3190 | vma, address); | 3202 | vma, address); |
| 3191 | } | 3203 | } |
| 3192 | 3204 | ||
| 3205 | ptl = huge_pte_lock(h, mm, ptep); | ||
| 3206 | |||
| 3207 | /* Check for a racing update before calling hugetlb_cow */ | ||
| 3208 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) | ||
| 3209 | goto out_ptl; | ||
| 3210 | |||
| 3193 | /* | 3211 | /* |
| 3194 | * hugetlb_cow() requires page locks of pte_page(entry) and | 3212 | * hugetlb_cow() requires page locks of pte_page(entry) and |
| 3195 | * pagecache_page, so here we need to take the former one | 3213 | * pagecache_page, so here we need to take the former one |
| 3196 | * when page != pagecache_page or !pagecache_page. | 3214 | * when page != pagecache_page or !pagecache_page. |
| 3197 | * Note that locking order is always pagecache_page -> page, | ||
| 3198 | * so no worry about deadlock. | ||
| 3199 | */ | 3215 | */ |
| 3200 | page = pte_page(entry); | 3216 | page = pte_page(entry); |
| 3201 | get_page(page); | ||
| 3202 | if (page != pagecache_page) | 3217 | if (page != pagecache_page) |
| 3203 | lock_page(page); | 3218 | if (!trylock_page(page)) { |
| 3204 | 3219 | need_wait_lock = 1; | |
| 3205 | ptl = huge_pte_lockptr(h, mm, ptep); | 3220 | goto out_ptl; |
| 3206 | spin_lock(ptl); | 3221 | } |
| 3207 | /* Check for a racing update before calling hugetlb_cow */ | ||
| 3208 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) | ||
| 3209 | goto out_ptl; | ||
| 3210 | 3222 | ||
| 3223 | get_page(page); | ||
| 3211 | 3224 | ||
| 3212 | if (flags & FAULT_FLAG_WRITE) { | 3225 | if (flags & FAULT_FLAG_WRITE) { |
| 3213 | if (!huge_pte_write(entry)) { | 3226 | if (!huge_pte_write(entry)) { |
| 3214 | ret = hugetlb_cow(mm, vma, address, ptep, entry, | 3227 | ret = hugetlb_cow(mm, vma, address, ptep, entry, |
| 3215 | pagecache_page, ptl); | 3228 | pagecache_page, ptl); |
| 3216 | goto out_ptl; | 3229 | goto out_put_page; |
| 3217 | } | 3230 | } |
| 3218 | entry = huge_pte_mkdirty(entry); | 3231 | entry = huge_pte_mkdirty(entry); |
| 3219 | } | 3232 | } |
| @@ -3221,7 +3234,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3221 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, | 3234 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, |
| 3222 | flags & FAULT_FLAG_WRITE)) | 3235 | flags & FAULT_FLAG_WRITE)) |
| 3223 | update_mmu_cache(vma, address, ptep); | 3236 | update_mmu_cache(vma, address, ptep); |
| 3224 | 3237 | out_put_page: | |
| 3238 | if (page != pagecache_page) | ||
| 3239 | unlock_page(page); | ||
| 3240 | put_page(page); | ||
| 3225 | out_ptl: | 3241 | out_ptl: |
| 3226 | spin_unlock(ptl); | 3242 | spin_unlock(ptl); |
| 3227 | 3243 | ||
| @@ -3229,12 +3245,17 @@ out_ptl: | |||
| 3229 | unlock_page(pagecache_page); | 3245 | unlock_page(pagecache_page); |
| 3230 | put_page(pagecache_page); | 3246 | put_page(pagecache_page); |
| 3231 | } | 3247 | } |
| 3232 | if (page != pagecache_page) | ||
| 3233 | unlock_page(page); | ||
| 3234 | put_page(page); | ||
| 3235 | |||
| 3236 | out_mutex: | 3248 | out_mutex: |
| 3237 | mutex_unlock(&htlb_fault_mutex_table[hash]); | 3249 | mutex_unlock(&htlb_fault_mutex_table[hash]); |
| 3250 | /* | ||
| 3251 | * Generally it's safe to hold a refcount while waiting for the page lock. But | ||
| 3252 | * here we only wait to defer the next page fault and avoid a busy loop; the | ||
| 3253 | * page is not used after it is unlocked and before the current page fault | ||
| 3254 | * returns. So we are safe from accessing a freed page, even if we wait | ||
| 3255 | * here without taking a refcount. | ||
| 3256 | */ | ||
| 3257 | if (need_wait_lock) | ||
| 3258 | wait_on_page_locked(page); | ||
| 3238 | return ret; | 3259 | return ret; |
| 3239 | } | 3260 | } |
| 3240 | 3261 | ||
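hugetlb_fault() now takes the page table lock first and only trylocks the faulted page; on contention it sets need_wait_lock, backs out of all locks, and waits for the page lock only after the fault returns, so the next fault retries instead of sleeping under the PTL. A rough userspace sketch of that trylock-or-defer shape, with pthread mutexes standing in for the page lock and the PTL (all names here are illustrative, not kernel API):

    #include <pthread.h>
    #include <stdbool.h>

    /* Stand-ins for the page table lock and the page lock. */
    static pthread_mutex_t ptl       = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Returns true when the fault made progress, false when the caller
     * should wait for the page lock and then retry the whole fault. */
    static bool fault_once(void)
    {
        bool need_wait_lock = false;

        pthread_mutex_lock(&ptl);
        if (pthread_mutex_trylock(&page_lock) != 0) {
            /* Contended: do not sleep under the PTL, defer instead. */
            need_wait_lock = true;
            goto out_ptl;
        }
        /* ... the COW / dirty handling would go here ... */
        pthread_mutex_unlock(&page_lock);
    out_ptl:
        pthread_mutex_unlock(&ptl);

        if (need_wait_lock) {
            /* Models wait_on_page_locked(): block until the holder is done. */
            pthread_mutex_lock(&page_lock);
            pthread_mutex_unlock(&page_lock);
            return false;   /* caller retries, as the next page fault would */
        }
        return true;
    }

    int main(void) { return fault_once() ? 0 : 1; }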
| @@ -3364,7 +3385,26 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
| 3364 | spin_unlock(ptl); | 3385 | spin_unlock(ptl); |
| 3365 | continue; | 3386 | continue; |
| 3366 | } | 3387 | } |
| 3367 | if (!huge_pte_none(huge_ptep_get(ptep))) { | 3388 | pte = huge_ptep_get(ptep); |
| 3389 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { | ||
| 3390 | spin_unlock(ptl); | ||
| 3391 | continue; | ||
| 3392 | } | ||
| 3393 | if (unlikely(is_hugetlb_entry_migration(pte))) { | ||
| 3394 | swp_entry_t entry = pte_to_swp_entry(pte); | ||
| 3395 | |||
| 3396 | if (is_write_migration_entry(entry)) { | ||
| 3397 | pte_t newpte; | ||
| 3398 | |||
| 3399 | make_migration_entry_read(&entry); | ||
| 3400 | newpte = swp_entry_to_pte(entry); | ||
| 3401 | set_huge_pte_at(mm, address, ptep, newpte); | ||
| 3402 | pages++; | ||
| 3403 | } | ||
| 3404 | spin_unlock(ptl); | ||
| 3405 | continue; | ||
| 3406 | } | ||
| 3407 | if (!huge_pte_none(pte)) { | ||
| 3368 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 3408 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
| 3369 | pte = pte_mkhuge(huge_pte_modify(pte, newprot)); | 3409 | pte = pte_mkhuge(huge_pte_modify(pte, newprot)); |
| 3370 | pte = arch_make_huge_pte(pte, vma, NULL, 0); | 3410 | pte = arch_make_huge_pte(pte, vma, NULL, 0); |
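In hugetlb_change_protection() above, a huge PTE that is really a migration entry is no longer fed through huge_pte_modify(); instead a write migration entry is downgraded to a read one, so the new protection takes effect once migration completes. A small sketch of that bit manipulation using a toy swap-entry encoding (the real swp_entry_t layout is architecture specific):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy migration-entry encoding: low bit = "write" flag, rest = pfn.
     * Purely illustrative; real swp_entry_t layouts differ. */
    typedef uint64_t swp_entry_t;
    #define MIGRATION_WRITE 0x1ull

    static bool is_write_migration_entry(swp_entry_t e) { return e & MIGRATION_WRITE; }
    static void make_migration_entry_read(swp_entry_t *e) { *e &= ~MIGRATION_WRITE; }

    int main(void)
    {
        swp_entry_t entry = (42ull << 1) | MIGRATION_WRITE; /* writable entry for pfn 42 */

        if (is_write_migration_entry(entry))
            make_migration_entry_read(&entry);   /* write-protect while it migrates */

        printf("write flag now: %d\n", is_write_migration_entry(entry)); /* 0 */
        return 0;
    }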
| @@ -3558,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
| 3558 | if (saddr) { | 3598 | if (saddr) { |
| 3559 | spte = huge_pte_offset(svma->vm_mm, saddr); | 3599 | spte = huge_pte_offset(svma->vm_mm, saddr); |
| 3560 | if (spte) { | 3600 | if (spte) { |
| 3601 | mm_inc_nr_pmds(mm); | ||
| 3561 | get_page(virt_to_page(spte)); | 3602 | get_page(virt_to_page(spte)); |
| 3562 | break; | 3603 | break; |
| 3563 | } | 3604 | } |
| @@ -3569,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
| 3569 | 3610 | ||
| 3570 | ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); | 3611 | ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); |
| 3571 | spin_lock(ptl); | 3612 | spin_lock(ptl); |
| 3572 | if (pud_none(*pud)) | 3613 | if (pud_none(*pud)) { |
| 3573 | pud_populate(mm, pud, | 3614 | pud_populate(mm, pud, |
| 3574 | (pmd_t *)((unsigned long)spte & PAGE_MASK)); | 3615 | (pmd_t *)((unsigned long)spte & PAGE_MASK)); |
| 3575 | else | 3616 | } else { |
| 3576 | put_page(virt_to_page(spte)); | 3617 | put_page(virt_to_page(spte)); |
| 3618 | mm_inc_nr_pmds(mm); | ||
| 3619 | } | ||
| 3577 | spin_unlock(ptl); | 3620 | spin_unlock(ptl); |
| 3578 | out: | 3621 | out: |
| 3579 | pte = (pte_t *)pmd_alloc(mm, pud, addr); | 3622 | pte = (pte_t *)pmd_alloc(mm, pud, addr); |
| @@ -3604,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | |||
| 3604 | 3647 | ||
| 3605 | pud_clear(pud); | 3648 | pud_clear(pud); |
| 3606 | put_page(virt_to_page(ptep)); | 3649 | put_page(virt_to_page(ptep)); |
| 3650 | mm_dec_nr_pmds(mm); | ||
| 3607 | *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; | 3651 | *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; |
| 3608 | return 1; | 3652 | return 1; |
| 3609 | } | 3653 | } |
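The two hugetlb hunks above keep mm->nr_pmds consistent for shared page tables: huge_pmd_share() charges a PMD to the mm when it attaches to a shared PMD page, and huge_pmd_unshare() drops the charge again, so the counter matches what teardown will free. A tiny model of that intended accounting invariant (fake_mm and the helpers are stand-ins, not the kernel structures):

    #include <assert.h>

    /* Minimal stand-in for the accounting side of pmd sharing. */
    struct fake_mm { long nr_pmds; };

    static void share_pmd(struct fake_mm *mm)   { mm->nr_pmds++; } /* models mm_inc_nr_pmds */
    static void unshare_pmd(struct fake_mm *mm) { mm->nr_pmds--; } /* models mm_dec_nr_pmds */

    int main(void)
    {
        struct fake_mm a = { 0 }, b = { 0 };

        share_pmd(&a);      /* a attaches to the shared PMD page      */
        share_pmd(&b);      /* b attaches to the same PMD page        */
        unshare_pmd(&b);    /* b detaches                             */
        unshare_pmd(&a);

        assert(a.nr_pmds == 0 && b.nr_pmds == 0); /* nothing left at exit */
        return 0;
    }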
| @@ -3660,42 +3704,64 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
| 3660 | return (pte_t *) pmd; | 3704 | return (pte_t *) pmd; |
| 3661 | } | 3705 | } |
| 3662 | 3706 | ||
| 3663 | struct page * | 3707 | #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ |
| 3664 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
| 3665 | pmd_t *pmd, int write) | ||
| 3666 | { | ||
| 3667 | struct page *page; | ||
| 3668 | 3708 | ||
| 3669 | page = pte_page(*(pte_t *)pmd); | 3709 | /* |
| 3670 | if (page) | 3710 | * These functions are overridable if your architecture needs its own |
| 3671 | page += ((address & ~PMD_MASK) >> PAGE_SHIFT); | 3711 | * behavior. |
| 3672 | return page; | 3712 | */ |
| 3713 | struct page * __weak | ||
| 3714 | follow_huge_addr(struct mm_struct *mm, unsigned long address, | ||
| 3715 | int write) | ||
| 3716 | { | ||
| 3717 | return ERR_PTR(-EINVAL); | ||
| 3673 | } | 3718 | } |
| 3674 | 3719 | ||
| 3675 | struct page * | 3720 | struct page * __weak |
| 3676 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | 3721 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
| 3677 | pud_t *pud, int write) | 3722 | pmd_t *pmd, int flags) |
| 3678 | { | 3723 | { |
| 3679 | struct page *page; | 3724 | struct page *page = NULL; |
| 3680 | 3725 | spinlock_t *ptl; | |
| 3681 | page = pte_page(*(pte_t *)pud); | 3726 | retry: |
| 3682 | if (page) | 3727 | ptl = pmd_lockptr(mm, pmd); |
| 3683 | page += ((address & ~PUD_MASK) >> PAGE_SHIFT); | 3728 | spin_lock(ptl); |
| 3729 | /* | ||
| 3730 | * make sure that the address range covered by this pmd is not | ||
| 3731 | * unmapped from other threads. | ||
| 3732 | */ | ||
| 3733 | if (!pmd_huge(*pmd)) | ||
| 3734 | goto out; | ||
| 3735 | if (pmd_present(*pmd)) { | ||
| 3736 | page = pte_page(*(pte_t *)pmd) + | ||
| 3737 | ((address & ~PMD_MASK) >> PAGE_SHIFT); | ||
| 3738 | if (flags & FOLL_GET) | ||
| 3739 | get_page(page); | ||
| 3740 | } else { | ||
| 3741 | if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { | ||
| 3742 | spin_unlock(ptl); | ||
| 3743 | __migration_entry_wait(mm, (pte_t *)pmd, ptl); | ||
| 3744 | goto retry; | ||
| 3745 | } | ||
| 3746 | /* | ||
| 3747 | * hwpoisoned entry is treated as no_page_table in | ||
| 3748 | * follow_page_mask(). | ||
| 3749 | */ | ||
| 3750 | } | ||
| 3751 | out: | ||
| 3752 | spin_unlock(ptl); | ||
| 3684 | return page; | 3753 | return page; |
| 3685 | } | 3754 | } |
| 3686 | 3755 | ||
| 3687 | #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ | ||
| 3688 | |||
| 3689 | /* Can be overriden by architectures */ | ||
| 3690 | struct page * __weak | 3756 | struct page * __weak |
| 3691 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | 3757 | follow_huge_pud(struct mm_struct *mm, unsigned long address, |
| 3692 | pud_t *pud, int write) | 3758 | pud_t *pud, int flags) |
| 3693 | { | 3759 | { |
| 3694 | BUG(); | 3760 | if (flags & FOLL_GET) |
| 3695 | return NULL; | 3761 | return NULL; |
| 3696 | } | ||
| 3697 | 3762 | ||
| 3698 | #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ | 3763 | return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); |
| 3764 | } | ||
| 3699 | 3765 | ||
| 3700 | #ifdef CONFIG_MEMORY_FAILURE | 3766 | #ifdef CONFIG_MEMORY_FAILURE |
| 3701 | 3767 | ||
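The reworked follow_huge_pmd() above takes the PMD lock, and when the entry turns out to be a migration entry it drops the lock, waits for the migration to finish, and retries rather than handing back a stale page. A condensed userspace model of that lock/check/retry loop, with pthread primitives standing in for the spinlock and the migration wait (toy_* names are illustrative only):

    #include <pthread.h>
    #include <stddef.h>

    /* Toy PMD states, for illustration only. */
    enum pmd_state { PMD_NONE, PMD_PRESENT_HUGE, PMD_MIGRATING };

    struct toy_pmd {
        enum pmd_state  state;
        void           *page;           /* what a successful lookup returns    */
        pthread_mutex_t lock;           /* stands in for pmd_lockptr()         */
        pthread_cond_t  migration_done; /* stands in for migration_entry_wait  */
    };

    static void *follow_toy_pmd(struct toy_pmd *pmd)
    {
        void *page = NULL;

        pthread_mutex_lock(&pmd->lock);
    retry:
        if (pmd->state == PMD_PRESENT_HUGE) {
            page = pmd->page;                    /* found a mapped huge page */
        } else if (pmd->state == PMD_MIGRATING) {
            /* Drop the lock while waiting, then re-check from scratch. */
            pthread_cond_wait(&pmd->migration_done, &pmd->lock);
            goto retry;
        }
        /* PMD_NONE (or a hwpoisoned entry in the kernel) falls through as NULL. */
        pthread_mutex_unlock(&pmd->lock);
        return page;
    }

    int main(void)
    {
        int dummy_page;
        struct toy_pmd pmd = {
            .state = PMD_PRESENT_HUGE, .page = &dummy_page,
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .migration_done = PTHREAD_COND_INITIALIZER,
        };
        return follow_toy_pmd(&pmd) == &dummy_page ? 0 : 1;
    }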
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 037e1c00a5b7..6e0057439a46 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c | |||
| @@ -279,7 +279,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, | |||
| 279 | return -EINVAL; | 279 | return -EINVAL; |
| 280 | 280 | ||
| 281 | buf = strstrip(buf); | 281 | buf = strstrip(buf); |
| 282 | ret = page_counter_memparse(buf, &nr_pages); | 282 | ret = page_counter_memparse(buf, "-1", &nr_pages); |
| 283 | if (ret) | 283 | if (ret) |
| 284 | return ret; | 284 | return ret; |
| 285 | 285 | ||
diff --git a/mm/internal.h b/mm/internal.h index efad241f7014..a96da5b0029d 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -110,6 +110,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); | |||
| 110 | */ | 110 | */ |
| 111 | 111 | ||
| 112 | /* | 112 | /* |
| 113 | * Structure for holding the mostly immutable allocation parameters passed | ||
| 114 | * between functions involved in allocations, including the alloc_pages* | ||
| 115 | * family of functions. | ||
| 116 | * | ||
| 117 | * nodemask, migratetype and high_zoneidx are initialized only once in | ||
| 118 | * __alloc_pages_nodemask() and then never change. | ||
| 119 | * | ||
| 120 | * zonelist, preferred_zone and classzone_idx are set first in | ||
| 121 | * __alloc_pages_nodemask() for the fast path, and might be later changed | ||
| 122 | * in __alloc_pages_slowpath(). All other functions pass the whole structure | ||
| 123 | * by a const pointer. | ||
| 124 | */ | ||
| 125 | struct alloc_context { | ||
| 126 | struct zonelist *zonelist; | ||
| 127 | nodemask_t *nodemask; | ||
| 128 | struct zone *preferred_zone; | ||
| 129 | int classzone_idx; | ||
| 130 | int migratetype; | ||
| 131 | enum zone_type high_zoneidx; | ||
| 132 | }; | ||
| 133 | |||
| 134 | /* | ||
| 113 | * Locate the struct page for both the matching buddy in our | 135 | * Locate the struct page for both the matching buddy in our |
| 114 | * pair (buddy1) and the combined O(n+1) page they form (page). | 136 | * pair (buddy1) and the combined O(n+1) page they form (page). |
| 115 | * | 137 | * |
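The struct alloc_context comment above describes the refactoring pattern: parameters that stay fixed for one allocation attempt are collected once and handed to every helper by const pointer instead of as a long argument list. A hedged, self-contained sketch of that calling convention (toy_* names are illustrative, not the real page allocator):

    #include <stdio.h>

    /* Immutable-per-allocation parameters, passed by const pointer. */
    struct toy_alloc_ctx {
        int preferred_node;
        int migratetype;
        int high_zoneidx;
    };

    /* Helpers take the whole context instead of a long argument list. */
    static void *toy_fast_path(const struct toy_alloc_ctx *ac, unsigned order)
    {
        printf("fast path: node=%d order=%u\n", ac->preferred_node, order);
        return NULL; /* pretend the free lists were empty */
    }

    static void *toy_slow_path(const struct toy_alloc_ctx *ac, unsigned order)
    {
        static int reclaimed_page;   /* stands in for a struct page */
        printf("slow path: migratetype=%d order=%u\n", ac->migratetype, order);
        return &reclaimed_page;
    }

    static void *toy_alloc_pages(unsigned order, int node)
    {
        struct toy_alloc_ctx ac = {
            .preferred_node = node, .migratetype = 0, .high_zoneidx = 2,
        };
        void *page = toy_fast_path(&ac, order);
        return page ? page : toy_slow_path(&ac, order);
    }

    int main(void) { return toy_alloc_pages(0, 0) ? 0 : 1; }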
| @@ -329,8 +351,10 @@ extern int mminit_loglevel; | |||
| 329 | #define mminit_dprintk(level, prefix, fmt, arg...) \ | 351 | #define mminit_dprintk(level, prefix, fmt, arg...) \ |
| 330 | do { \ | 352 | do { \ |
| 331 | if (level < mminit_loglevel) { \ | 353 | if (level < mminit_loglevel) { \ |
| 332 | printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ | 354 | if (level <= MMINIT_WARNING) \ |
| 333 | printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ | 355 | printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \ |
| 356 | else \ | ||
| 357 | printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ | ||
| 334 | } \ | 358 | } \ |
| 335 | } while (0) | 359 | } while (0) |
| 336 | 360 | ||
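The mminit_dprintk() change above stops splitting one log line across two printk() calls (a bare level prefix followed by a KERN_CONT continuation), which could be interleaved or mis-levelled; the level is now chosen up front and the whole message goes out in a single call. A userspace analogue of the fixed macro shape (toy constants, stderr instead of the kernel log):

    #include <stdio.h>

    #define LOGLEVEL   1
    #define LVL_WARN   0
    #define LVL_DEBUG  1

    /* Single-call form, mirroring the fixed mminit_dprintk(): the level is
     * selected first and the message is emitted in one piece, so it cannot
     * be split by concurrent output. */
    #define toy_dprintk(level, prefix, fmt, ...)                                      \
        do {                                                                          \
            if ((level) < LOGLEVEL) {                                                 \
                if ((level) <= LVL_WARN)                                              \
                    fprintf(stderr, "warn:  mminit::" prefix " " fmt, ##__VA_ARGS__); \
                else                                                                  \
                    fprintf(stderr, "debug: mminit::" prefix " " fmt, ##__VA_ARGS__); \
            }                                                                         \
        } while (0)

    int main(void)
    {
        toy_dprintk(LVL_WARN, "pageflags_layout", "section %d node %d\n", 1, 0);
        return 0;
    }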
diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 8da581fa9060..f2c2492681bf 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c | |||
| @@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) | |||
| 21 | return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; | 21 | return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; |
| 22 | } | 22 | } |
| 23 | 23 | ||
| 24 | INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, | 24 | INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, |
| 25 | unsigned long, shared.linear.rb_subtree_last, | 25 | unsigned long, shared.rb_subtree_last, |
| 26 | vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) | 26 | vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) |
| 27 | 27 | ||
| 28 | /* Insert node immediately after prev in the interval tree */ | 28 | /* Insert node immediately after prev in the interval tree */ |
| @@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, | |||
| 36 | 36 | ||
| 37 | VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); | 37 | VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); |
| 38 | 38 | ||
| 39 | if (!prev->shared.linear.rb.rb_right) { | 39 | if (!prev->shared.rb.rb_right) { |
| 40 | parent = prev; | 40 | parent = prev; |
| 41 | link = &prev->shared.linear.rb.rb_right; | 41 | link = &prev->shared.rb.rb_right; |
| 42 | } else { | 42 | } else { |
| 43 | parent = rb_entry(prev->shared.linear.rb.rb_right, | 43 | parent = rb_entry(prev->shared.rb.rb_right, |
| 44 | struct vm_area_struct, shared.linear.rb); | 44 | struct vm_area_struct, shared.rb); |
| 45 | if (parent->shared.linear.rb_subtree_last < last) | 45 | if (parent->shared.rb_subtree_last < last) |
| 46 | parent->shared.linear.rb_subtree_last = last; | 46 | parent->shared.rb_subtree_last = last; |
| 47 | while (parent->shared.linear.rb.rb_left) { | 47 | while (parent->shared.rb.rb_left) { |
| 48 | parent = rb_entry(parent->shared.linear.rb.rb_left, | 48 | parent = rb_entry(parent->shared.rb.rb_left, |
| 49 | struct vm_area_struct, shared.linear.rb); | 49 | struct vm_area_struct, shared.rb); |
| 50 | if (parent->shared.linear.rb_subtree_last < last) | 50 | if (parent->shared.rb_subtree_last < last) |
| 51 | parent->shared.linear.rb_subtree_last = last; | 51 | parent->shared.rb_subtree_last = last; |
| 52 | } | 52 | } |
| 53 | link = &parent->shared.linear.rb.rb_left; | 53 | link = &parent->shared.rb.rb_left; |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | node->shared.linear.rb_subtree_last = last; | 56 | node->shared.rb_subtree_last = last; |
| 57 | rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); | 57 | rb_link_node(&node->shared.rb, &parent->shared.rb, link); |
| 58 | rb_insert_augmented(&node->shared.linear.rb, root, | 58 | rb_insert_augmented(&node->shared.rb, root, |
| 59 | &vma_interval_tree_augment); | 59 | &vma_interval_tree_augment); |
| 60 | } | 60 | } |
| 61 | 61 | ||
diff --git a/mm/iov_iter.c b/mm/iov_iter.c index a1599ca4ab0e..827732047da1 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c | |||
| @@ -501,18 +501,31 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) | |||
| 501 | EXPORT_SYMBOL(iov_iter_single_seg_count); | 501 | EXPORT_SYMBOL(iov_iter_single_seg_count); |
| 502 | 502 | ||
| 503 | void iov_iter_kvec(struct iov_iter *i, int direction, | 503 | void iov_iter_kvec(struct iov_iter *i, int direction, |
| 504 | const struct kvec *iov, unsigned long nr_segs, | 504 | const struct kvec *kvec, unsigned long nr_segs, |
| 505 | size_t count) | 505 | size_t count) |
| 506 | { | 506 | { |
| 507 | BUG_ON(!(direction & ITER_KVEC)); | 507 | BUG_ON(!(direction & ITER_KVEC)); |
| 508 | i->type = direction; | 508 | i->type = direction; |
| 509 | i->kvec = (struct kvec *)iov; | 509 | i->kvec = kvec; |
| 510 | i->nr_segs = nr_segs; | 510 | i->nr_segs = nr_segs; |
| 511 | i->iov_offset = 0; | 511 | i->iov_offset = 0; |
| 512 | i->count = count; | 512 | i->count = count; |
| 513 | } | 513 | } |
| 514 | EXPORT_SYMBOL(iov_iter_kvec); | 514 | EXPORT_SYMBOL(iov_iter_kvec); |
| 515 | 515 | ||
| 516 | void iov_iter_bvec(struct iov_iter *i, int direction, | ||
| 517 | const struct bio_vec *bvec, unsigned long nr_segs, | ||
| 518 | size_t count) | ||
| 519 | { | ||
| 520 | BUG_ON(!(direction & ITER_BVEC)); | ||
| 521 | i->type = direction; | ||
| 522 | i->bvec = bvec; | ||
| 523 | i->nr_segs = nr_segs; | ||
| 524 | i->iov_offset = 0; | ||
| 525 | i->count = count; | ||
| 526 | } | ||
| 527 | EXPORT_SYMBOL(iov_iter_bvec); | ||
| 528 | |||
| 516 | unsigned long iov_iter_alignment(const struct iov_iter *i) | 529 | unsigned long iov_iter_alignment(const struct iov_iter *i) |
| 517 | { | 530 | { |
| 518 | unsigned long res = 0; | 531 | unsigned long res = 0; |
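iov_iter_bvec() added above mirrors iov_iter_kvec(): record the direction, the segment array, the number of segments, a zero starting offset and the total byte count. A sketch of that initializer with toy structures (fields trimmed to the ones the function touches; these are not the kernel definitions):

    #include <assert.h>
    #include <stddef.h>

    /* Toy versions of the kernel structures. */
    struct toy_bvec { void *page; unsigned int len, offset; };

    struct toy_iter {
        int                    type;       /* direction | ITER_BVEC         */
        const struct toy_bvec *bvec;       /* segment array                 */
        unsigned long          nr_segs;
        size_t                 iov_offset; /* offset into the first segment */
        size_t                 count;      /* total bytes left              */
    };

    static void toy_iter_bvec(struct toy_iter *i, int direction,
                              const struct toy_bvec *bvec,
                              unsigned long nr_segs, size_t count)
    {
        i->type = direction;
        i->bvec = bvec;
        i->nr_segs = nr_segs;
        i->iov_offset = 0;
        i->count = count;
    }

    int main(void)
    {
        char buf[64];
        struct toy_bvec vec[1] = { { buf, sizeof(buf), 0 } };
        struct toy_iter it;

        toy_iter_bvec(&it, 0x4 /* pretend ITER_BVEC */, vec, 1, sizeof(buf));
        assert(it.count == 64 && it.nr_segs == 1);
        return 0;
    }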
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile new file mode 100644 index 000000000000..bd837b8c2f41 --- /dev/null +++ b/mm/kasan/Makefile | |||
| @@ -0,0 +1,8 @@ | |||
| 1 | KASAN_SANITIZE := n | ||
| 2 | |||
| 3 | CFLAGS_REMOVE_kasan.o = -pg | ||
| 4 | # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 | ||
| 5 | # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 | ||
| 6 | CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) | ||
| 7 | |||
| 8 | obj-y := kasan.o report.o | ||
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c new file mode 100644 index 000000000000..78fee632a7ee --- /dev/null +++ b/mm/kasan/kasan.c | |||
| @@ -0,0 +1,516 @@ | |||
| 1 | /* | ||
| 2 | * This file contains shadow memory manipulation code. | ||
| 3 | * | ||
| 4 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | ||
| 5 | * Author: Andrey Ryabinin <a.ryabinin@samsung.com> | ||
| 6 | * | ||
| 7 | * Some of code borrowed from https://github.com/xairy/linux by | ||
| 8 | * Andrey Konovalov <adech.fo@gmail.com> | ||
| 9 | * | ||
| 10 | * This program is free software; you can redistribute it and/or modify | ||
| 11 | * it under the terms of the GNU General Public License version 2 as | ||
| 12 | * published by the Free Software Foundation. | ||
| 13 | * | ||
| 14 | */ | ||
| 15 | |||
| 16 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 17 | #define DISABLE_BRANCH_PROFILING | ||
| 18 | |||
| 19 | #include <linux/export.h> | ||
| 20 | #include <linux/init.h> | ||
| 21 | #include <linux/kernel.h> | ||
| 22 | #include <linux/memblock.h> | ||
| 23 | #include <linux/memory.h> | ||
| 24 | #include <linux/mm.h> | ||
| 25 | #include <linux/module.h> | ||
| 26 | #include <linux/printk.h> | ||
| 27 | #include <linux/sched.h> | ||
| 28 | #include <linux/slab.h> | ||
| 29 | #include <linux/stacktrace.h> | ||
| 30 | #include <linux/string.h> | ||
| 31 | #include <linux/types.h> | ||
| 32 | #include <linux/kasan.h> | ||
| 33 | |||
| 34 | #include "kasan.h" | ||
| 35 | #include "../slab.h" | ||
| 36 | |||
| 37 | /* | ||
| 38 | * Poisons the shadow memory for 'size' bytes starting from 'addr'. | ||
| 39 | * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. | ||
| 40 | */ | ||
| 41 | static void kasan_poison_shadow(const void *address, size_t size, u8 value) | ||
| 42 | { | ||
| 43 | void *shadow_start, *shadow_end; | ||
| 44 | |||
| 45 | shadow_start = kasan_mem_to_shadow(address); | ||
| 46 | shadow_end = kasan_mem_to_shadow(address + size); | ||
| 47 | |||
| 48 | memset(shadow_start, value, shadow_end - shadow_start); | ||
| 49 | } | ||
| 50 | |||
| 51 | void kasan_unpoison_shadow(const void *address, size_t size) | ||
| 52 | { | ||
| 53 | kasan_poison_shadow(address, size, 0); | ||
| 54 | |||
| 55 | if (size & KASAN_SHADOW_MASK) { | ||
| 56 | u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); | ||
| 57 | *shadow = size & KASAN_SHADOW_MASK; | ||
| 58 | } | ||
| 59 | } | ||
| 60 | |||
| 61 | |||
| 62 | /* | ||
| 63 | * All functions below are always inlined so the compiler can | ||
| 64 | * perform better optimizations in each of __asan_loadX/__asan_storeX | ||
| 65 | * depending on memory access size X. | ||
| 66 | */ | ||
| 67 | |||
| 68 | static __always_inline bool memory_is_poisoned_1(unsigned long addr) | ||
| 69 | { | ||
| 70 | s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); | ||
| 71 | |||
| 72 | if (unlikely(shadow_value)) { | ||
| 73 | s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; | ||
| 74 | return unlikely(last_accessible_byte >= shadow_value); | ||
| 75 | } | ||
| 76 | |||
| 77 | return false; | ||
| 78 | } | ||
| 79 | |||
| 80 | static __always_inline bool memory_is_poisoned_2(unsigned long addr) | ||
| 81 | { | ||
| 82 | u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); | ||
| 83 | |||
| 84 | if (unlikely(*shadow_addr)) { | ||
| 85 | if (memory_is_poisoned_1(addr + 1)) | ||
| 86 | return true; | ||
| 87 | |||
| 88 | if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) | ||
| 89 | return false; | ||
| 90 | |||
| 91 | return unlikely(*(u8 *)shadow_addr); | ||
| 92 | } | ||
| 93 | |||
| 94 | return false; | ||
| 95 | } | ||
| 96 | |||
| 97 | static __always_inline bool memory_is_poisoned_4(unsigned long addr) | ||
| 98 | { | ||
| 99 | u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); | ||
| 100 | |||
| 101 | if (unlikely(*shadow_addr)) { | ||
| 102 | if (memory_is_poisoned_1(addr + 3)) | ||
| 103 | return true; | ||
| 104 | |||
| 105 | if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) | ||
| 106 | return false; | ||
| 107 | |||
| 108 | return unlikely(*(u8 *)shadow_addr); | ||
| 109 | } | ||
| 110 | |||
| 111 | return false; | ||
| 112 | } | ||
| 113 | |||
| 114 | static __always_inline bool memory_is_poisoned_8(unsigned long addr) | ||
| 115 | { | ||
| 116 | u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); | ||
| 117 | |||
| 118 | if (unlikely(*shadow_addr)) { | ||
| 119 | if (memory_is_poisoned_1(addr + 7)) | ||
| 120 | return true; | ||
| 121 | |||
| 122 | if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7)) | ||
| 123 | return false; | ||
| 124 | |||
| 125 | return unlikely(*(u8 *)shadow_addr); | ||
| 126 | } | ||
| 127 | |||
| 128 | return false; | ||
| 129 | } | ||
| 130 | |||
| 131 | static __always_inline bool memory_is_poisoned_16(unsigned long addr) | ||
| 132 | { | ||
| 133 | u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr); | ||
| 134 | |||
| 135 | if (unlikely(*shadow_addr)) { | ||
| 136 | u16 shadow_first_bytes = *(u16 *)shadow_addr; | ||
| 137 | s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK; | ||
| 138 | |||
| 139 | if (unlikely(shadow_first_bytes)) | ||
| 140 | return true; | ||
| 141 | |||
| 142 | if (likely(!last_byte)) | ||
| 143 | return false; | ||
| 144 | |||
| 145 | return memory_is_poisoned_1(addr + 15); | ||
| 146 | } | ||
| 147 | |||
| 148 | return false; | ||
| 149 | } | ||
| 150 | |||
| 151 | static __always_inline unsigned long bytes_is_zero(const u8 *start, | ||
| 152 | size_t size) | ||
| 153 | { | ||
| 154 | while (size) { | ||
| 155 | if (unlikely(*start)) | ||
| 156 | return (unsigned long)start; | ||
| 157 | start++; | ||
| 158 | size--; | ||
| 159 | } | ||
| 160 | |||
| 161 | return 0; | ||
| 162 | } | ||
| 163 | |||
| 164 | static __always_inline unsigned long memory_is_zero(const void *start, | ||
| 165 | const void *end) | ||
| 166 | { | ||
| 167 | unsigned int words; | ||
| 168 | unsigned long ret; | ||
| 169 | unsigned int prefix = (unsigned long)start % 8; | ||
| 170 | |||
| 171 | if (end - start <= 16) | ||
| 172 | return bytes_is_zero(start, end - start); | ||
| 173 | |||
| 174 | if (prefix) { | ||
| 175 | prefix = 8 - prefix; | ||
| 176 | ret = bytes_is_zero(start, prefix); | ||
| 177 | if (unlikely(ret)) | ||
| 178 | return ret; | ||
| 179 | start += prefix; | ||
| 180 | } | ||
| 181 | |||
| 182 | words = (end - start) / 8; | ||
| 183 | while (words) { | ||
| 184 | if (unlikely(*(u64 *)start)) | ||
| 185 | return bytes_is_zero(start, 8); | ||
| 186 | start += 8; | ||
| 187 | words--; | ||
| 188 | } | ||
| 189 | |||
| 190 | return bytes_is_zero(start, (end - start) % 8); | ||
| 191 | } | ||
| 192 | |||
| 193 | static __always_inline bool memory_is_poisoned_n(unsigned long addr, | ||
| 194 | size_t size) | ||
| 195 | { | ||
| 196 | unsigned long ret; | ||
| 197 | |||
| 198 | ret = memory_is_zero(kasan_mem_to_shadow((void *)addr), | ||
| 199 | kasan_mem_to_shadow((void *)addr + size - 1) + 1); | ||
| 200 | |||
| 201 | if (unlikely(ret)) { | ||
| 202 | unsigned long last_byte = addr + size - 1; | ||
| 203 | s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); | ||
| 204 | |||
| 205 | if (unlikely(ret != (unsigned long)last_shadow || | ||
| 206 | ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) | ||
| 207 | return true; | ||
| 208 | } | ||
| 209 | return false; | ||
| 210 | } | ||
| 211 | |||
| 212 | static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) | ||
| 213 | { | ||
| 214 | if (__builtin_constant_p(size)) { | ||
| 215 | switch (size) { | ||
| 216 | case 1: | ||
| 217 | return memory_is_poisoned_1(addr); | ||
| 218 | case 2: | ||
| 219 | return memory_is_poisoned_2(addr); | ||
| 220 | case 4: | ||
| 221 | return memory_is_poisoned_4(addr); | ||
| 222 | case 8: | ||
| 223 | return memory_is_poisoned_8(addr); | ||
| 224 | case 16: | ||
| 225 | return memory_is_poisoned_16(addr); | ||
| 226 | default: | ||
| 227 | BUILD_BUG(); | ||
| 228 | } | ||
| 229 | } | ||
| 230 | |||
| 231 | return memory_is_poisoned_n(addr, size); | ||
| 232 | } | ||
| 233 | |||
| 234 | |||
| 235 | static __always_inline void check_memory_region(unsigned long addr, | ||
| 236 | size_t size, bool write) | ||
| 237 | { | ||
| 238 | struct kasan_access_info info; | ||
| 239 | |||
| 240 | if (unlikely(size == 0)) | ||
| 241 | return; | ||
| 242 | |||
| 243 | if (unlikely((void *)addr < | ||
| 244 | kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { | ||
| 245 | info.access_addr = (void *)addr; | ||
| 246 | info.access_size = size; | ||
| 247 | info.is_write = write; | ||
| 248 | info.ip = _RET_IP_; | ||
| 249 | kasan_report_user_access(&info); | ||
| 250 | return; | ||
| 251 | } | ||
| 252 | |||
| 253 | if (likely(!memory_is_poisoned(addr, size))) | ||
| 254 | return; | ||
| 255 | |||
| 256 | kasan_report(addr, size, write, _RET_IP_); | ||
| 257 | } | ||
| 258 | |||
| 259 | void __asan_loadN(unsigned long addr, size_t size); | ||
| 260 | void __asan_storeN(unsigned long addr, size_t size); | ||
| 261 | |||
| 262 | #undef memset | ||
| 263 | void *memset(void *addr, int c, size_t len) | ||
| 264 | { | ||
| 265 | __asan_storeN((unsigned long)addr, len); | ||
| 266 | |||
| 267 | return __memset(addr, c, len); | ||
| 268 | } | ||
| 269 | |||
| 270 | #undef memmove | ||
| 271 | void *memmove(void *dest, const void *src, size_t len) | ||
| 272 | { | ||
| 273 | __asan_loadN((unsigned long)src, len); | ||
| 274 | __asan_storeN((unsigned long)dest, len); | ||
| 275 | |||
| 276 | return __memmove(dest, src, len); | ||
| 277 | } | ||
| 278 | |||
| 279 | #undef memcpy | ||
| 280 | void *memcpy(void *dest, const void *src, size_t len) | ||
| 281 | { | ||
| 282 | __asan_loadN((unsigned long)src, len); | ||
| 283 | __asan_storeN((unsigned long)dest, len); | ||
| 284 | |||
| 285 | return __memcpy(dest, src, len); | ||
| 286 | } | ||
| 287 | |||
| 288 | void kasan_alloc_pages(struct page *page, unsigned int order) | ||
| 289 | { | ||
| 290 | if (likely(!PageHighMem(page))) | ||
| 291 | kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); | ||
| 292 | } | ||
| 293 | |||
| 294 | void kasan_free_pages(struct page *page, unsigned int order) | ||
| 295 | { | ||
| 296 | if (likely(!PageHighMem(page))) | ||
| 297 | kasan_poison_shadow(page_address(page), | ||
| 298 | PAGE_SIZE << order, | ||
| 299 | KASAN_FREE_PAGE); | ||
| 300 | } | ||
| 301 | |||
| 302 | void kasan_poison_slab(struct page *page) | ||
| 303 | { | ||
| 304 | kasan_poison_shadow(page_address(page), | ||
| 305 | PAGE_SIZE << compound_order(page), | ||
| 306 | KASAN_KMALLOC_REDZONE); | ||
| 307 | } | ||
| 308 | |||
| 309 | void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) | ||
| 310 | { | ||
| 311 | kasan_unpoison_shadow(object, cache->object_size); | ||
| 312 | } | ||
| 313 | |||
| 314 | void kasan_poison_object_data(struct kmem_cache *cache, void *object) | ||
| 315 | { | ||
| 316 | kasan_poison_shadow(object, | ||
| 317 | round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), | ||
| 318 | KASAN_KMALLOC_REDZONE); | ||
| 319 | } | ||
| 320 | |||
| 321 | void kasan_slab_alloc(struct kmem_cache *cache, void *object) | ||
| 322 | { | ||
| 323 | kasan_kmalloc(cache, object, cache->object_size); | ||
| 324 | } | ||
| 325 | |||
| 326 | void kasan_slab_free(struct kmem_cache *cache, void *object) | ||
| 327 | { | ||
| 328 | unsigned long size = cache->object_size; | ||
| 329 | unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); | ||
| 330 | |||
| 331 | /* RCU slabs could be legally used after free within the RCU period */ | ||
| 332 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) | ||
| 333 | return; | ||
| 334 | |||
| 335 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); | ||
| 336 | } | ||
| 337 | |||
| 338 | void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) | ||
| 339 | { | ||
| 340 | unsigned long redzone_start; | ||
| 341 | unsigned long redzone_end; | ||
| 342 | |||
| 343 | if (unlikely(object == NULL)) | ||
| 344 | return; | ||
| 345 | |||
| 346 | redzone_start = round_up((unsigned long)(object + size), | ||
| 347 | KASAN_SHADOW_SCALE_SIZE); | ||
| 348 | redzone_end = round_up((unsigned long)object + cache->object_size, | ||
| 349 | KASAN_SHADOW_SCALE_SIZE); | ||
| 350 | |||
| 351 | kasan_unpoison_shadow(object, size); | ||
| 352 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, | ||
| 353 | KASAN_KMALLOC_REDZONE); | ||
| 354 | } | ||
| 355 | EXPORT_SYMBOL(kasan_kmalloc); | ||
| 356 | |||
| 357 | void kasan_kmalloc_large(const void *ptr, size_t size) | ||
| 358 | { | ||
| 359 | struct page *page; | ||
| 360 | unsigned long redzone_start; | ||
| 361 | unsigned long redzone_end; | ||
| 362 | |||
| 363 | if (unlikely(ptr == NULL)) | ||
| 364 | return; | ||
| 365 | |||
| 366 | page = virt_to_page(ptr); | ||
| 367 | redzone_start = round_up((unsigned long)(ptr + size), | ||
| 368 | KASAN_SHADOW_SCALE_SIZE); | ||
| 369 | redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page)); | ||
| 370 | |||
| 371 | kasan_unpoison_shadow(ptr, size); | ||
| 372 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, | ||
| 373 | KASAN_PAGE_REDZONE); | ||
| 374 | } | ||
| 375 | |||
| 376 | void kasan_krealloc(const void *object, size_t size) | ||
| 377 | { | ||
| 378 | struct page *page; | ||
| 379 | |||
| 380 | if (unlikely(object == ZERO_SIZE_PTR)) | ||
| 381 | return; | ||
| 382 | |||
| 383 | page = virt_to_head_page(object); | ||
| 384 | |||
| 385 | if (unlikely(!PageSlab(page))) | ||
| 386 | kasan_kmalloc_large(object, size); | ||
| 387 | else | ||
| 388 | kasan_kmalloc(page->slab_cache, object, size); | ||
| 389 | } | ||
| 390 | |||
| 391 | void kasan_kfree_large(const void *ptr) | ||
| 392 | { | ||
| 393 | struct page *page = virt_to_page(ptr); | ||
| 394 | |||
| 395 | kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), | ||
| 396 | KASAN_FREE_PAGE); | ||
| 397 | } | ||
| 398 | |||
| 399 | int kasan_module_alloc(void *addr, size_t size) | ||
| 400 | { | ||
| 401 | void *ret; | ||
| 402 | size_t shadow_size; | ||
| 403 | unsigned long shadow_start; | ||
| 404 | |||
| 405 | shadow_start = (unsigned long)kasan_mem_to_shadow(addr); | ||
| 406 | shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT, | ||
| 407 | PAGE_SIZE); | ||
| 408 | |||
| 409 | if (WARN_ON(!PAGE_ALIGNED(shadow_start))) | ||
| 410 | return -EINVAL; | ||
| 411 | |||
| 412 | ret = __vmalloc_node_range(shadow_size, 1, shadow_start, | ||
| 413 | shadow_start + shadow_size, | ||
| 414 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | ||
| 415 | PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, | ||
| 416 | __builtin_return_address(0)); | ||
| 417 | return ret ? 0 : -ENOMEM; | ||
| 418 | } | ||
| 419 | |||
| 420 | void kasan_module_free(void *addr) | ||
| 421 | { | ||
| 422 | vfree(kasan_mem_to_shadow(addr)); | ||
| 423 | } | ||
| 424 | |||
| 425 | static void register_global(struct kasan_global *global) | ||
| 426 | { | ||
| 427 | size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); | ||
| 428 | |||
| 429 | kasan_unpoison_shadow(global->beg, global->size); | ||
| 430 | |||
| 431 | kasan_poison_shadow(global->beg + aligned_size, | ||
| 432 | global->size_with_redzone - aligned_size, | ||
| 433 | KASAN_GLOBAL_REDZONE); | ||
| 434 | } | ||
| 435 | |||
| 436 | void __asan_register_globals(struct kasan_global *globals, size_t size) | ||
| 437 | { | ||
| 438 | int i; | ||
| 439 | |||
| 440 | for (i = 0; i < size; i++) | ||
| 441 | register_global(&globals[i]); | ||
| 442 | } | ||
| 443 | EXPORT_SYMBOL(__asan_register_globals); | ||
| 444 | |||
| 445 | void __asan_unregister_globals(struct kasan_global *globals, size_t size) | ||
| 446 | { | ||
| 447 | } | ||
| 448 | EXPORT_SYMBOL(__asan_unregister_globals); | ||
| 449 | |||
| 450 | #define DEFINE_ASAN_LOAD_STORE(size) \ | ||
| 451 | void __asan_load##size(unsigned long addr) \ | ||
| 452 | { \ | ||
| 453 | check_memory_region(addr, size, false); \ | ||
| 454 | } \ | ||
| 455 | EXPORT_SYMBOL(__asan_load##size); \ | ||
| 456 | __alias(__asan_load##size) \ | ||
| 457 | void __asan_load##size##_noabort(unsigned long); \ | ||
| 458 | EXPORT_SYMBOL(__asan_load##size##_noabort); \ | ||
| 459 | void __asan_store##size(unsigned long addr) \ | ||
| 460 | { \ | ||
| 461 | check_memory_region(addr, size, true); \ | ||
| 462 | } \ | ||
| 463 | EXPORT_SYMBOL(__asan_store##size); \ | ||
| 464 | __alias(__asan_store##size) \ | ||
| 465 | void __asan_store##size##_noabort(unsigned long); \ | ||
| 466 | EXPORT_SYMBOL(__asan_store##size##_noabort) | ||
| 467 | |||
| 468 | DEFINE_ASAN_LOAD_STORE(1); | ||
| 469 | DEFINE_ASAN_LOAD_STORE(2); | ||
| 470 | DEFINE_ASAN_LOAD_STORE(4); | ||
| 471 | DEFINE_ASAN_LOAD_STORE(8); | ||
| 472 | DEFINE_ASAN_LOAD_STORE(16); | ||
| 473 | |||
| 474 | void __asan_loadN(unsigned long addr, size_t size) | ||
| 475 | { | ||
| 476 | check_memory_region(addr, size, false); | ||
| 477 | } | ||
| 478 | EXPORT_SYMBOL(__asan_loadN); | ||
| 479 | |||
| 480 | __alias(__asan_loadN) | ||
| 481 | void __asan_loadN_noabort(unsigned long, size_t); | ||
| 482 | EXPORT_SYMBOL(__asan_loadN_noabort); | ||
| 483 | |||
| 484 | void __asan_storeN(unsigned long addr, size_t size) | ||
| 485 | { | ||
| 486 | check_memory_region(addr, size, true); | ||
| 487 | } | ||
| 488 | EXPORT_SYMBOL(__asan_storeN); | ||
| 489 | |||
| 490 | __alias(__asan_storeN) | ||
| 491 | void __asan_storeN_noabort(unsigned long, size_t); | ||
| 492 | EXPORT_SYMBOL(__asan_storeN_noabort); | ||
| 493 | |||
| 494 | /* to shut up compiler complaints */ | ||
| 495 | void __asan_handle_no_return(void) {} | ||
| 496 | EXPORT_SYMBOL(__asan_handle_no_return); | ||
| 497 | |||
| 498 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
| 499 | static int kasan_mem_notifier(struct notifier_block *nb, | ||
| 500 | unsigned long action, void *data) | ||
| 501 | { | ||
| 502 | return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK; | ||
| 503 | } | ||
| 504 | |||
| 505 | static int __init kasan_memhotplug_init(void) | ||
| 506 | { | ||
| 507 | pr_err("WARNING: KASan doesn't support memory hot-add\n"); | ||
| 508 | pr_err("Memory hot-add will be disabled\n"); | ||
| 509 | |||
| 510 | hotplug_memory_notifier(kasan_mem_notifier, 0); | ||
| 511 | |||
| 512 | return 0; | ||
| 513 | } | ||
| 514 | |||
| 515 | module_init(kasan_memhotplug_init); | ||
| 516 | #endif | ||
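kasan.c above is built around one mapping: every KASAN_SHADOW_SCALE_SIZE (8) bytes of kernel memory are described by one shadow byte, and kasan_unpoison_shadow() stores the number of valid bytes into the last, partially covered shadow cell. A userspace model of the address arithmetic and the partial-granule encoding (the toy_* region and its shadow are local arrays, not the kernel's real shadow offset):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define SHADOW_SCALE_SHIFT 3                        /* 8 bytes per shadow byte */
    #define SHADOW_GRANULE     (1u << SHADOW_SCALE_SHIFT)

    /* Toy shadow for a 256-byte "kernel" region starting at toy_mem[0]. */
    static uint8_t toy_mem[256];
    static int8_t  toy_shadow[256 / SHADOW_GRANULE];

    static int8_t *mem_to_shadow(const void *addr)
    {
        size_t off = (const uint8_t *)addr - toy_mem;
        return &toy_shadow[off >> SHADOW_SCALE_SHIFT];
    }

    /* Mark [addr, addr+size) accessible; a trailing partial granule stores
     * how many of its 8 bytes are valid, like kasan_unpoison_shadow(). */
    static void unpoison(const void *addr, size_t size)
    {
        memset(mem_to_shadow(addr), 0, size >> SHADOW_SCALE_SHIFT);
        if (size & (SHADOW_GRANULE - 1))
            *mem_to_shadow((const uint8_t *)addr + size) = size & (SHADOW_GRANULE - 1);
    }

    int main(void)
    {
        memset(toy_shadow, 0xFF, sizeof(toy_shadow));        /* everything poisoned */
        unpoison(toy_mem, 13);                               /* 8 valid + 5 valid   */
        printf("shadow[0]=%d shadow[1]=%d shadow[2]=%d\n",
               toy_shadow[0], toy_shadow[1], toy_shadow[2]); /* 0 5 -1 */
        return 0;
    }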
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h new file mode 100644 index 000000000000..4986b0acab21 --- /dev/null +++ b/mm/kasan/kasan.h | |||
| @@ -0,0 +1,75 @@ | |||
| 1 | #ifndef __MM_KASAN_KASAN_H | ||
| 2 | #define __MM_KASAN_KASAN_H | ||
| 3 | |||
| 4 | #include <linux/kasan.h> | ||
| 5 | |||
| 6 | #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) | ||
| 7 | #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) | ||
| 8 | |||
| 9 | #define KASAN_FREE_PAGE 0xFF /* page was freed */ | ||
| 11 | #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ | ||
| 12 | #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ | ||
| 13 | #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ | ||
| 14 | #define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ | ||
| 15 | |||
| 16 | /* | ||
| 17 | * Stack redzone shadow values | ||
| 18 | * (Those are compiler's ABI, don't change them) | ||
| 19 | */ | ||
| 20 | #define KASAN_STACK_LEFT 0xF1 | ||
| 21 | #define KASAN_STACK_MID 0xF2 | ||
| 22 | #define KASAN_STACK_RIGHT 0xF3 | ||
| 23 | #define KASAN_STACK_PARTIAL 0xF4 | ||
| 24 | |||
| 25 | /* Don't break randconfig/all*config builds */ | ||
| 26 | #ifndef KASAN_ABI_VERSION | ||
| 27 | #define KASAN_ABI_VERSION 1 | ||
| 28 | #endif | ||
| 29 | |||
| 30 | struct kasan_access_info { | ||
| 31 | const void *access_addr; | ||
| 32 | const void *first_bad_addr; | ||
| 33 | size_t access_size; | ||
| 34 | bool is_write; | ||
| 35 | unsigned long ip; | ||
| 36 | }; | ||
| 37 | |||
| 38 | /* The layout of this struct is dictated by the compiler */ | ||
| 39 | struct kasan_source_location { | ||
| 40 | const char *filename; | ||
| 41 | int line_no; | ||
| 42 | int column_no; | ||
| 43 | }; | ||
| 44 | |||
| 45 | /* The layout of this struct is dictated by the compiler */ | ||
| 46 | struct kasan_global { | ||
| 47 | const void *beg; /* Address of the beginning of the global variable. */ | ||
| 48 | size_t size; /* Size of the global variable. */ | ||
| 49 | size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */ | ||
| 50 | const void *name; | ||
| 51 | const void *module_name; /* Name of the module where the global variable is declared. */ | ||
| 52 | unsigned long has_dynamic_init; /* This is needed for C++ */ | ||
| 53 | #if KASAN_ABI_VERSION >= 4 | ||
| 54 | struct kasan_source_location *location; | ||
| 55 | #endif | ||
| 56 | }; | ||
| 57 | |||
| 58 | void kasan_report_error(struct kasan_access_info *info); | ||
| 59 | void kasan_report_user_access(struct kasan_access_info *info); | ||
| 60 | |||
| 61 | static inline const void *kasan_shadow_to_mem(const void *shadow_addr) | ||
| 62 | { | ||
| 63 | return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) | ||
| 64 | << KASAN_SHADOW_SCALE_SHIFT); | ||
| 65 | } | ||
| 66 | |||
| 67 | static inline bool kasan_enabled(void) | ||
| 68 | { | ||
| 69 | return !current->kasan_depth; | ||
| 70 | } | ||
| 71 | |||
| 72 | void kasan_report(unsigned long addr, size_t size, | ||
| 73 | bool is_write, unsigned long ip); | ||
| 74 | |||
| 75 | #endif | ||
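The shadow values defined above are read back by checks such as memory_is_poisoned_1(): 0 means the whole 8-byte granule is addressable, 1..7 means only the first N bytes are, and the negative magic values (KASAN_FREE_PAGE, the redzones, ...) poison the whole granule. A compact model of that single-byte test:

    #include <stdbool.h>
    #include <stdio.h>

    #define SHADOW_MASK 7   /* byte offset within an 8-byte granule */

    /* shadow == 0    : whole granule addressable
     * shadow in 1..7 : only the first 'shadow' bytes are addressable
     * shadow < 0     : the whole granule is poisoned (redzone, freed, ...) */
    static bool byte_is_poisoned(unsigned long addr, signed char shadow)
    {
        if (shadow == 0)
            return false;
        return (signed char)(addr & SHADOW_MASK) >= shadow;
    }

    int main(void)
    {
        printf("%d\n", byte_is_poisoned(0x1003, 5));                 /* 0: byte 3 within 5 valid bytes */
        printf("%d\n", byte_is_poisoned(0x1006, 5));                 /* 1: byte 6 past the 5 valid     */
        printf("%d\n", byte_is_poisoned(0x1000, (signed char)0xFB)); /* 1: freed object                */
        return 0;
    }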
diff --git a/mm/kasan/report.c b/mm/kasan/report.c new file mode 100644 index 000000000000..680ceedf810a --- /dev/null +++ b/mm/kasan/report.c | |||
| @@ -0,0 +1,269 @@ | |||
| 1 | /* | ||
| 2 | * This file contains error reporting code. | ||
| 3 | * | ||
| 4 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | ||
| 5 | * Author: Andrey Ryabinin <a.ryabinin@samsung.com> | ||
| 6 | * | ||
| 7 | * Some of code borrowed from https://github.com/xairy/linux by | ||
| 8 | * Andrey Konovalov <adech.fo@gmail.com> | ||
| 9 | * | ||
| 10 | * This program is free software; you can redistribute it and/or modify | ||
| 11 | * it under the terms of the GNU General Public License version 2 as | ||
| 12 | * published by the Free Software Foundation. | ||
| 13 | * | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/kernel.h> | ||
| 17 | #include <linux/mm.h> | ||
| 18 | #include <linux/printk.h> | ||
| 19 | #include <linux/sched.h> | ||
| 20 | #include <linux/slab.h> | ||
| 21 | #include <linux/stacktrace.h> | ||
| 22 | #include <linux/string.h> | ||
| 23 | #include <linux/types.h> | ||
| 24 | #include <linux/kasan.h> | ||
| 25 | |||
| 26 | #include <asm/sections.h> | ||
| 27 | |||
| 28 | #include "kasan.h" | ||
| 29 | #include "../slab.h" | ||
| 30 | |||
| 31 | /* Shadow layout customization. */ | ||
| 32 | #define SHADOW_BYTES_PER_BLOCK 1 | ||
| 33 | #define SHADOW_BLOCKS_PER_ROW 16 | ||
| 34 | #define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) | ||
| 35 | #define SHADOW_ROWS_AROUND_ADDR 2 | ||
| 36 | |||
| 37 | static const void *find_first_bad_addr(const void *addr, size_t size) | ||
| 38 | { | ||
| 39 | u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr); | ||
| 40 | const void *first_bad_addr = addr; | ||
| 41 | |||
| 42 | while (!shadow_val && first_bad_addr < addr + size) { | ||
| 43 | first_bad_addr += KASAN_SHADOW_SCALE_SIZE; | ||
| 44 | shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr); | ||
| 45 | } | ||
| 46 | return first_bad_addr; | ||
| 47 | } | ||
| 48 | |||
| 49 | static void print_error_description(struct kasan_access_info *info) | ||
| 50 | { | ||
| 51 | const char *bug_type = "unknown crash"; | ||
| 52 | u8 shadow_val; | ||
| 53 | |||
| 54 | info->first_bad_addr = find_first_bad_addr(info->access_addr, | ||
| 55 | info->access_size); | ||
| 56 | |||
| 57 | shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); | ||
| 58 | |||
| 59 | switch (shadow_val) { | ||
| 60 | case KASAN_FREE_PAGE: | ||
| 61 | case KASAN_KMALLOC_FREE: | ||
| 62 | bug_type = "use after free"; | ||
| 63 | break; | ||
| 64 | case KASAN_PAGE_REDZONE: | ||
| 65 | case KASAN_KMALLOC_REDZONE: | ||
| 66 | case KASAN_GLOBAL_REDZONE: | ||
| 67 | case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: | ||
| 68 | bug_type = "out of bounds access"; | ||
| 69 | break; | ||
| 70 | case KASAN_STACK_LEFT: | ||
| 71 | case KASAN_STACK_MID: | ||
| 72 | case KASAN_STACK_RIGHT: | ||
| 73 | case KASAN_STACK_PARTIAL: | ||
| 74 | bug_type = "out of bounds on stack"; | ||
| 75 | break; | ||
| 76 | } | ||
| 77 | |||
| 78 | pr_err("BUG: KASan: %s in %pS at addr %p\n", | ||
| 79 | bug_type, (void *)info->ip, | ||
| 80 | info->access_addr); | ||
| 81 | pr_err("%s of size %zu by task %s/%d\n", | ||
| 82 | info->is_write ? "Write" : "Read", | ||
| 83 | info->access_size, current->comm, task_pid_nr(current)); | ||
| 84 | } | ||
| 85 | |||
| 86 | static inline bool kernel_or_module_addr(const void *addr) | ||
| 87 | { | ||
| 88 | return (addr >= (void *)_stext && addr < (void *)_end) | ||
| 89 | || (addr >= (void *)MODULES_VADDR | ||
| 90 | && addr < (void *)MODULES_END); | ||
| 91 | } | ||
| 92 | |||
| 93 | static inline bool init_task_stack_addr(const void *addr) | ||
| 94 | { | ||
| 95 | return addr >= (void *)&init_thread_union.stack && | ||
| 96 | (addr <= (void *)&init_thread_union.stack + | ||
| 97 | sizeof(init_thread_union.stack)); | ||
| 98 | } | ||
| 99 | |||
| 100 | static void print_address_description(struct kasan_access_info *info) | ||
| 101 | { | ||
| 102 | const void *addr = info->access_addr; | ||
| 103 | |||
| 104 | if ((addr >= (void *)PAGE_OFFSET) && | ||
| 105 | (addr < high_memory)) { | ||
| 106 | struct page *page = virt_to_head_page(addr); | ||
| 107 | |||
| 108 | if (PageSlab(page)) { | ||
| 109 | void *object; | ||
| 110 | struct kmem_cache *cache = page->slab_cache; | ||
| 111 | void *last_object; | ||
| 112 | |||
| 113 | object = virt_to_obj(cache, page_address(page), addr); | ||
| 114 | last_object = page_address(page) + | ||
| 115 | page->objects * cache->size; | ||
| 116 | |||
| 117 | if (unlikely(object > last_object)) | ||
| 118 | object = last_object; /* we hit into padding */ | ||
| 119 | |||
| 120 | object_err(cache, page, object, | ||
| 121 | "kasan: bad access detected"); | ||
| 122 | return; | ||
| 123 | } | ||
| 124 | dump_page(page, "kasan: bad access detected"); | ||
| 125 | } | ||
| 126 | |||
| 127 | if (kernel_or_module_addr(addr)) { | ||
| 128 | if (!init_task_stack_addr(addr)) | ||
| 129 | pr_err("Address belongs to variable %pS\n", addr); | ||
| 130 | } | ||
| 131 | |||
| 132 | dump_stack(); | ||
| 133 | } | ||
| 134 | |||
| 135 | static bool row_is_guilty(const void *row, const void *guilty) | ||
| 136 | { | ||
| 137 | return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW); | ||
| 138 | } | ||
| 139 | |||
| 140 | static int shadow_pointer_offset(const void *row, const void *shadow) | ||
| 141 | { | ||
| 142 | /* The length of ">ff00ff00ff00ff00: " is | ||
| 143 | * 3 + (BITS_PER_LONG/8)*2 chars. | ||
| 144 | */ | ||
| 145 | return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 + | ||
| 146 | (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1; | ||
| 147 | } | ||
| 148 | |||
| 149 | static void print_shadow_for_address(const void *addr) | ||
| 150 | { | ||
| 151 | int i; | ||
| 152 | const void *shadow = kasan_mem_to_shadow(addr); | ||
| 153 | const void *shadow_row; | ||
| 154 | |||
| 155 | shadow_row = (void *)round_down((unsigned long)shadow, | ||
| 156 | SHADOW_BYTES_PER_ROW) | ||
| 157 | - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW; | ||
| 158 | |||
| 159 | pr_err("Memory state around the buggy address:\n"); | ||
| 160 | |||
| 161 | for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { | ||
| 162 | const void *kaddr = kasan_shadow_to_mem(shadow_row); | ||
| 163 | char buffer[4 + (BITS_PER_LONG/8)*2]; | ||
| 164 | |||
| 165 | snprintf(buffer, sizeof(buffer), | ||
| 166 | (i == 0) ? ">%p: " : " %p: ", kaddr); | ||
| 167 | |||
| 168 | kasan_disable_current(); | ||
| 169 | print_hex_dump(KERN_ERR, buffer, | ||
| 170 | DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, | ||
| 171 | shadow_row, SHADOW_BYTES_PER_ROW, 0); | ||
| 172 | kasan_enable_current(); | ||
| 173 | |||
| 174 | if (row_is_guilty(shadow_row, shadow)) | ||
| 175 | pr_err("%*c\n", | ||
| 176 | shadow_pointer_offset(shadow_row, shadow), | ||
| 177 | '^'); | ||
| 178 | |||
| 179 | shadow_row += SHADOW_BYTES_PER_ROW; | ||
| 180 | } | ||
| 181 | } | ||
| 182 | |||
| 183 | static DEFINE_SPINLOCK(report_lock); | ||
| 184 | |||
| 185 | void kasan_report_error(struct kasan_access_info *info) | ||
| 186 | { | ||
| 187 | unsigned long flags; | ||
| 188 | |||
| 189 | spin_lock_irqsave(&report_lock, flags); | ||
| 190 | pr_err("=================================" | ||
| 191 | "=================================\n"); | ||
| 192 | print_error_description(info); | ||
| 193 | print_address_description(info); | ||
| 194 | print_shadow_for_address(info->first_bad_addr); | ||
| 195 | pr_err("=================================" | ||
| 196 | "=================================\n"); | ||
| 197 | spin_unlock_irqrestore(&report_lock, flags); | ||
| 198 | } | ||
| 199 | |||
| 200 | void kasan_report_user_access(struct kasan_access_info *info) | ||
| 201 | { | ||
| 202 | unsigned long flags; | ||
| 203 | |||
| 204 | spin_lock_irqsave(&report_lock, flags); | ||
| 205 | pr_err("=================================" | ||
| 206 | "=================================\n"); | ||
| 207 | pr_err("BUG: KASan: user-memory-access on address %p\n", | ||
| 208 | info->access_addr); | ||
| 209 | pr_err("%s of size %zu by task %s/%d\n", | ||
| 210 | info->is_write ? "Write" : "Read", | ||
| 211 | info->access_size, current->comm, task_pid_nr(current)); | ||
| 212 | dump_stack(); | ||
| 213 | pr_err("=================================" | ||
| 214 | "=================================\n"); | ||
| 215 | spin_unlock_irqrestore(&report_lock, flags); | ||
| 216 | } | ||
| 217 | |||
| 218 | void kasan_report(unsigned long addr, size_t size, | ||
| 219 | bool is_write, unsigned long ip) | ||
| 220 | { | ||
| 221 | struct kasan_access_info info; | ||
| 222 | |||
| 223 | if (likely(!kasan_enabled())) | ||
| 224 | return; | ||
| 225 | |||
| 226 | info.access_addr = (void *)addr; | ||
| 227 | info.access_size = size; | ||
| 228 | info.is_write = is_write; | ||
| 229 | info.ip = ip; | ||
| 230 | kasan_report_error(&info); | ||
| 231 | } | ||
| 232 | |||
| 233 | |||
| 234 | #define DEFINE_ASAN_REPORT_LOAD(size) \ | ||
| 235 | void __asan_report_load##size##_noabort(unsigned long addr) \ | ||
| 236 | { \ | ||
| 237 | kasan_report(addr, size, false, _RET_IP_); \ | ||
| 238 | } \ | ||
| 239 | EXPORT_SYMBOL(__asan_report_load##size##_noabort) | ||
| 240 | |||
| 241 | #define DEFINE_ASAN_REPORT_STORE(size) \ | ||
| 242 | void __asan_report_store##size##_noabort(unsigned long addr) \ | ||
| 243 | { \ | ||
| 244 | kasan_report(addr, size, true, _RET_IP_); \ | ||
| 245 | } \ | ||
| 246 | EXPORT_SYMBOL(__asan_report_store##size##_noabort) | ||
| 247 | |||
| 248 | DEFINE_ASAN_REPORT_LOAD(1); | ||
| 249 | DEFINE_ASAN_REPORT_LOAD(2); | ||
| 250 | DEFINE_ASAN_REPORT_LOAD(4); | ||
| 251 | DEFINE_ASAN_REPORT_LOAD(8); | ||
| 252 | DEFINE_ASAN_REPORT_LOAD(16); | ||
| 253 | DEFINE_ASAN_REPORT_STORE(1); | ||
| 254 | DEFINE_ASAN_REPORT_STORE(2); | ||
| 255 | DEFINE_ASAN_REPORT_STORE(4); | ||
| 256 | DEFINE_ASAN_REPORT_STORE(8); | ||
| 257 | DEFINE_ASAN_REPORT_STORE(16); | ||
| 258 | |||
| 259 | void __asan_report_load_n_noabort(unsigned long addr, size_t size) | ||
| 260 | { | ||
| 261 | kasan_report(addr, size, false, _RET_IP_); | ||
| 262 | } | ||
| 263 | EXPORT_SYMBOL(__asan_report_load_n_noabort); | ||
| 264 | |||
| 265 | void __asan_report_store_n_noabort(unsigned long addr, size_t size) | ||
| 266 | { | ||
| 267 | kasan_report(addr, size, true, _RET_IP_); | ||
| 268 | } | ||
| 269 | EXPORT_SYMBOL(__asan_report_store_n_noabort); | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 3cda50c1e394..5405aff5a590 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
| @@ -98,6 +98,7 @@ | |||
| 98 | #include <asm/processor.h> | 98 | #include <asm/processor.h> |
| 99 | #include <linux/atomic.h> | 99 | #include <linux/atomic.h> |
| 100 | 100 | ||
| 101 | #include <linux/kasan.h> | ||
| 101 | #include <linux/kmemcheck.h> | 102 | #include <linux/kmemcheck.h> |
| 102 | #include <linux/kmemleak.h> | 103 | #include <linux/kmemleak.h> |
| 103 | #include <linux/memory_hotplug.h> | 104 | #include <linux/memory_hotplug.h> |
| @@ -1113,7 +1114,10 @@ static bool update_checksum(struct kmemleak_object *object) | |||
| 1113 | if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) | 1114 | if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) |
| 1114 | return false; | 1115 | return false; |
| 1115 | 1116 | ||
| 1117 | kasan_disable_current(); | ||
| 1116 | object->checksum = crc32(0, (void *)object->pointer, object->size); | 1118 | object->checksum = crc32(0, (void *)object->pointer, object->size); |
| 1119 | kasan_enable_current(); | ||
| 1120 | |||
| 1117 | return object->checksum != old_csum; | 1121 | return object->checksum != old_csum; |
| 1118 | } | 1122 | } |
| 1119 | 1123 | ||
| @@ -1164,7 +1168,9 @@ static void scan_block(void *_start, void *_end, | |||
| 1164 | BYTES_PER_POINTER)) | 1168 | BYTES_PER_POINTER)) |
| 1165 | continue; | 1169 | continue; |
| 1166 | 1170 | ||
| 1171 | kasan_disable_current(); | ||
| 1167 | pointer = *ptr; | 1172 | pointer = *ptr; |
| 1173 | kasan_enable_current(); | ||
| 1168 | 1174 | ||
| 1169 | object = find_and_get_object(pointer, 1); | 1175 | object = find_and_get_object(pointer, 1); |
| 1170 | if (!object) | 1176 | if (!object) |
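kmemleak deliberately reads memory that KASan may consider poisoned (checksumming objects, scanning blocks), so those reads are now wrapped in kasan_disable_current()/kasan_enable_current(), which only adjust the per-task depth counter consulted by kasan_enabled(). A sketch of that depth-counter pattern (toy_ names are stand-ins, not the kernel helpers):

    #include <stdbool.h>
    #include <stdio.h>

    /* Per-task nesting counter; checks are skipped while it is non-zero,
     * mirroring current->kasan_depth and kasan_enabled(). */
    static __thread int kasan_depth;

    static void toy_kasan_disable(void) { kasan_depth++; }
    static void toy_kasan_enable(void)  { kasan_depth--; }
    static bool toy_kasan_enabled(void) { return kasan_depth == 0; }

    static void checked_read(const char *p)
    {
        if (toy_kasan_enabled())
            printf("would check shadow for %p\n", (void *)p);
        (void)*p;   /* the actual access */
    }

    int main(void)
    {
        char obj[16] = "kmemleak";

        toy_kasan_disable();        /* e.g. around kmemleak's checksum scan */
        checked_read(&obj[0]);      /* no report even if obj were poisoned  */
        toy_kasan_enable();

        checked_read(&obj[0]);      /* back to normal checking */
        return 0;
    }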
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
| @@ -1748,7 +1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
| 1748 | */ | 1748 | */ |
| 1749 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1749 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
| 1750 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1750 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
| 1751 | VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) | 1751 | VM_HUGETLB | VM_MIXEDMAP)) |
| 1752 | return 0; /* just ignore the advice */ | 1752 | return 0; /* just ignore the advice */ |
| 1753 | 1753 | ||
| 1754 | #ifdef VM_SAO | 1754 | #ifdef VM_SAO |
diff --git a/mm/list_lru.c b/mm/list_lru.c index f1a0db194173..909eca2c820e 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c | |||
| @@ -9,18 +9,100 @@ | |||
| 9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
| 10 | #include <linux/list_lru.h> | 10 | #include <linux/list_lru.h> |
| 11 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
| 12 | #include <linux/mutex.h> | ||
| 13 | #include <linux/memcontrol.h> | ||
| 14 | |||
| 15 | #ifdef CONFIG_MEMCG_KMEM | ||
| 16 | static LIST_HEAD(list_lrus); | ||
| 17 | static DEFINE_MUTEX(list_lrus_mutex); | ||
| 18 | |||
| 19 | static void list_lru_register(struct list_lru *lru) | ||
| 20 | { | ||
| 21 | mutex_lock(&list_lrus_mutex); | ||
| 22 | list_add(&lru->list, &list_lrus); | ||
| 23 | mutex_unlock(&list_lrus_mutex); | ||
| 24 | } | ||
| 25 | |||
| 26 | static void list_lru_unregister(struct list_lru *lru) | ||
| 27 | { | ||
| 28 | mutex_lock(&list_lrus_mutex); | ||
| 29 | list_del(&lru->list); | ||
| 30 | mutex_unlock(&list_lrus_mutex); | ||
| 31 | } | ||
| 32 | #else | ||
| 33 | static void list_lru_register(struct list_lru *lru) | ||
| 34 | { | ||
| 35 | } | ||
| 36 | |||
| 37 | static void list_lru_unregister(struct list_lru *lru) | ||
| 38 | { | ||
| 39 | } | ||
| 40 | #endif /* CONFIG_MEMCG_KMEM */ | ||
| 41 | |||
| 42 | #ifdef CONFIG_MEMCG_KMEM | ||
| 43 | static inline bool list_lru_memcg_aware(struct list_lru *lru) | ||
| 44 | { | ||
| 45 | return !!lru->node[0].memcg_lrus; | ||
| 46 | } | ||
| 47 | |||
| 48 | static inline struct list_lru_one * | ||
| 49 | list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) | ||
| 50 | { | ||
| 51 | /* | ||
| 52 | * The lock protects the array of per cgroup lists from relocation | ||
| 53 | * (see memcg_update_list_lru_node). | ||
| 54 | */ | ||
| 55 | lockdep_assert_held(&nlru->lock); | ||
| 56 | if (nlru->memcg_lrus && idx >= 0) | ||
| 57 | return nlru->memcg_lrus->lru[idx]; | ||
| 58 | |||
| 59 | return &nlru->lru; | ||
| 60 | } | ||
| 61 | |||
| 62 | static inline struct list_lru_one * | ||
| 63 | list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) | ||
| 64 | { | ||
| 65 | struct mem_cgroup *memcg; | ||
| 66 | |||
| 67 | if (!nlru->memcg_lrus) | ||
| 68 | return &nlru->lru; | ||
| 69 | |||
| 70 | memcg = mem_cgroup_from_kmem(ptr); | ||
| 71 | if (!memcg) | ||
| 72 | return &nlru->lru; | ||
| 73 | |||
| 74 | return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); | ||
| 75 | } | ||
| 76 | #else | ||
| 77 | static inline bool list_lru_memcg_aware(struct list_lru *lru) | ||
| 78 | { | ||
| 79 | return false; | ||
| 80 | } | ||
| 81 | |||
| 82 | static inline struct list_lru_one * | ||
| 83 | list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) | ||
| 84 | { | ||
| 85 | return &nlru->lru; | ||
| 86 | } | ||
| 87 | |||
| 88 | static inline struct list_lru_one * | ||
| 89 | list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) | ||
| 90 | { | ||
| 91 | return &nlru->lru; | ||
| 92 | } | ||
| 93 | #endif /* CONFIG_MEMCG_KMEM */ | ||
| 12 | 94 | ||
| 13 | bool list_lru_add(struct list_lru *lru, struct list_head *item) | 95 | bool list_lru_add(struct list_lru *lru, struct list_head *item) |
| 14 | { | 96 | { |
| 15 | int nid = page_to_nid(virt_to_page(item)); | 97 | int nid = page_to_nid(virt_to_page(item)); |
| 16 | struct list_lru_node *nlru = &lru->node[nid]; | 98 | struct list_lru_node *nlru = &lru->node[nid]; |
| 99 | struct list_lru_one *l; | ||
| 17 | 100 | ||
| 18 | spin_lock(&nlru->lock); | 101 | spin_lock(&nlru->lock); |
| 19 | WARN_ON_ONCE(nlru->nr_items < 0); | 102 | l = list_lru_from_kmem(nlru, item); |
| 20 | if (list_empty(item)) { | 103 | if (list_empty(item)) { |
| 21 | list_add_tail(item, &nlru->list); | 104 | list_add_tail(item, &l->list); |
| 22 | if (nlru->nr_items++ == 0) | 105 | l->nr_items++; |
| 23 | node_set(nid, lru->active_nodes); | ||
| 24 | spin_unlock(&nlru->lock); | 106 | spin_unlock(&nlru->lock); |
| 25 | return true; | 107 | return true; |
| 26 | } | 108 | } |
| @@ -33,13 +115,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) | |||
| 33 | { | 115 | { |
| 34 | int nid = page_to_nid(virt_to_page(item)); | 116 | int nid = page_to_nid(virt_to_page(item)); |
| 35 | struct list_lru_node *nlru = &lru->node[nid]; | 117 | struct list_lru_node *nlru = &lru->node[nid]; |
| 118 | struct list_lru_one *l; | ||
| 36 | 119 | ||
| 37 | spin_lock(&nlru->lock); | 120 | spin_lock(&nlru->lock); |
| 121 | l = list_lru_from_kmem(nlru, item); | ||
| 38 | if (!list_empty(item)) { | 122 | if (!list_empty(item)) { |
| 39 | list_del_init(item); | 123 | list_del_init(item); |
| 40 | if (--nlru->nr_items == 0) | 124 | l->nr_items--; |
| 41 | node_clear(nid, lru->active_nodes); | ||
| 42 | WARN_ON_ONCE(nlru->nr_items < 0); | ||
| 43 | spin_unlock(&nlru->lock); | 125 | spin_unlock(&nlru->lock); |
| 44 | return true; | 126 | return true; |
| 45 | } | 127 | } |
| @@ -48,33 +130,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) | |||
| 48 | } | 130 | } |
| 49 | EXPORT_SYMBOL_GPL(list_lru_del); | 131 | EXPORT_SYMBOL_GPL(list_lru_del); |
| 50 | 132 | ||
| 51 | unsigned long | 133 | void list_lru_isolate(struct list_lru_one *list, struct list_head *item) |
| 52 | list_lru_count_node(struct list_lru *lru, int nid) | 134 | { |
| 135 | list_del_init(item); | ||
| 136 | list->nr_items--; | ||
| 137 | } | ||
| 138 | EXPORT_SYMBOL_GPL(list_lru_isolate); | ||
| 139 | |||
| 140 | void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, | ||
| 141 | struct list_head *head) | ||
| 142 | { | ||
| 143 | list_move(item, head); | ||
| 144 | list->nr_items--; | ||
| 145 | } | ||
| 146 | EXPORT_SYMBOL_GPL(list_lru_isolate_move); | ||
| 147 | |||
| 148 | static unsigned long __list_lru_count_one(struct list_lru *lru, | ||
| 149 | int nid, int memcg_idx) | ||
| 53 | { | 150 | { |
| 54 | unsigned long count = 0; | ||
| 55 | struct list_lru_node *nlru = &lru->node[nid]; | 151 | struct list_lru_node *nlru = &lru->node[nid]; |
| 152 | struct list_lru_one *l; | ||
| 153 | unsigned long count; | ||
| 56 | 154 | ||
| 57 | spin_lock(&nlru->lock); | 155 | spin_lock(&nlru->lock); |
| 58 | WARN_ON_ONCE(nlru->nr_items < 0); | 156 | l = list_lru_from_memcg_idx(nlru, memcg_idx); |
| 59 | count += nlru->nr_items; | 157 | count = l->nr_items; |
| 60 | spin_unlock(&nlru->lock); | 158 | spin_unlock(&nlru->lock); |
| 61 | 159 | ||
| 62 | return count; | 160 | return count; |
| 63 | } | 161 | } |
| 162 | |||
| 163 | unsigned long list_lru_count_one(struct list_lru *lru, | ||
| 164 | int nid, struct mem_cgroup *memcg) | ||
| 165 | { | ||
| 166 | return __list_lru_count_one(lru, nid, memcg_cache_id(memcg)); | ||
| 167 | } | ||
| 168 | EXPORT_SYMBOL_GPL(list_lru_count_one); | ||
| 169 | |||
| 170 | unsigned long list_lru_count_node(struct list_lru *lru, int nid) | ||
| 171 | { | ||
| 172 | long count = 0; | ||
| 173 | int memcg_idx; | ||
| 174 | |||
| 175 | count += __list_lru_count_one(lru, nid, -1); | ||
| 176 | if (list_lru_memcg_aware(lru)) { | ||
| 177 | for_each_memcg_cache_index(memcg_idx) | ||
| 178 | count += __list_lru_count_one(lru, nid, memcg_idx); | ||
| 179 | } | ||
| 180 | return count; | ||
| 181 | } | ||
| 64 | EXPORT_SYMBOL_GPL(list_lru_count_node); | 182 | EXPORT_SYMBOL_GPL(list_lru_count_node); |
| 65 | 183 | ||
| 66 | unsigned long | 184 | static unsigned long |
| 67 | list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, | 185 | __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, |
| 68 | void *cb_arg, unsigned long *nr_to_walk) | 186 | list_lru_walk_cb isolate, void *cb_arg, |
| 187 | unsigned long *nr_to_walk) | ||
| 69 | { | 188 | { |
| 70 | 189 | ||
| 71 | struct list_lru_node *nlru = &lru->node[nid]; | 190 | struct list_lru_node *nlru = &lru->node[nid]; |
| 191 | struct list_lru_one *l; | ||
| 72 | struct list_head *item, *n; | 192 | struct list_head *item, *n; |
| 73 | unsigned long isolated = 0; | 193 | unsigned long isolated = 0; |
| 74 | 194 | ||
| 75 | spin_lock(&nlru->lock); | 195 | spin_lock(&nlru->lock); |
| 196 | l = list_lru_from_memcg_idx(nlru, memcg_idx); | ||
| 76 | restart: | 197 | restart: |
| 77 | list_for_each_safe(item, n, &nlru->list) { | 198 | list_for_each_safe(item, n, &l->list) { |
| 78 | enum lru_status ret; | 199 | enum lru_status ret; |
| 79 | 200 | ||
| 80 | /* | 201 | /* |
| @@ -85,14 +206,11 @@ restart: | |||
| 85 | break; | 206 | break; |
| 86 | --*nr_to_walk; | 207 | --*nr_to_walk; |
| 87 | 208 | ||
| 88 | ret = isolate(item, &nlru->lock, cb_arg); | 209 | ret = isolate(item, l, &nlru->lock, cb_arg); |
| 89 | switch (ret) { | 210 | switch (ret) { |
| 90 | case LRU_REMOVED_RETRY: | 211 | case LRU_REMOVED_RETRY: |
| 91 | assert_spin_locked(&nlru->lock); | 212 | assert_spin_locked(&nlru->lock); |
| 92 | case LRU_REMOVED: | 213 | case LRU_REMOVED: |
| 93 | if (--nlru->nr_items == 0) | ||
| 94 | node_clear(nid, lru->active_nodes); | ||
| 95 | WARN_ON_ONCE(nlru->nr_items < 0); | ||
| 96 | isolated++; | 214 | isolated++; |
| 97 | /* | 215 | /* |
| 98 | * If the lru lock has been dropped, our list | 216 | * If the lru lock has been dropped, our list |
| @@ -103,7 +221,7 @@ restart: | |||
| 103 | goto restart; | 221 | goto restart; |
| 104 | break; | 222 | break; |
| 105 | case LRU_ROTATE: | 223 | case LRU_ROTATE: |
| 106 | list_move_tail(item, &nlru->list); | 224 | list_move_tail(item, &l->list); |
| 107 | break; | 225 | break; |
| 108 | case LRU_SKIP: | 226 | case LRU_SKIP: |
| 109 | break; | 227 | break; |
| @@ -122,31 +240,322 @@ restart: | |||
| 122 | spin_unlock(&nlru->lock); | 240 | spin_unlock(&nlru->lock); |
| 123 | return isolated; | 241 | return isolated; |
| 124 | } | 242 | } |
| 243 | |||
| 244 | unsigned long | ||
| 245 | list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, | ||
| 246 | list_lru_walk_cb isolate, void *cb_arg, | ||
| 247 | unsigned long *nr_to_walk) | ||
| 248 | { | ||
| 249 | return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), | ||
| 250 | isolate, cb_arg, nr_to_walk); | ||
| 251 | } | ||
| 252 | EXPORT_SYMBOL_GPL(list_lru_walk_one); | ||
| 253 | |||
| 254 | unsigned long list_lru_walk_node(struct list_lru *lru, int nid, | ||
| 255 | list_lru_walk_cb isolate, void *cb_arg, | ||
| 256 | unsigned long *nr_to_walk) | ||
| 257 | { | ||
| 258 | long isolated = 0; | ||
| 259 | int memcg_idx; | ||
| 260 | |||
| 261 | isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg, | ||
| 262 | nr_to_walk); | ||
| 263 | if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { | ||
| 264 | for_each_memcg_cache_index(memcg_idx) { | ||
| 265 | isolated += __list_lru_walk_one(lru, nid, memcg_idx, | ||
| 266 | isolate, cb_arg, nr_to_walk); | ||
| 267 | if (*nr_to_walk <= 0) | ||
| 268 | break; | ||
| 269 | } | ||
| 270 | } | ||
| 271 | return isolated; | ||
| 272 | } | ||
| 125 | EXPORT_SYMBOL_GPL(list_lru_walk_node); | 273 | EXPORT_SYMBOL_GPL(list_lru_walk_node); |
| 126 | 274 | ||
| 127 | int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) | 275 | static void init_one_lru(struct list_lru_one *l) |
| 276 | { | ||
| 277 | INIT_LIST_HEAD(&l->list); | ||
| 278 | l->nr_items = 0; | ||
| 279 | } | ||
| 280 | |||
| 281 | #ifdef CONFIG_MEMCG_KMEM | ||
| 282 | static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, | ||
| 283 | int begin, int end) | ||
| 284 | { | ||
| 285 | int i; | ||
| 286 | |||
| 287 | for (i = begin; i < end; i++) | ||
| 288 | kfree(memcg_lrus->lru[i]); | ||
| 289 | } | ||
| 290 | |||
| 291 | static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus, | ||
| 292 | int begin, int end) | ||
| 293 | { | ||
| 294 | int i; | ||
| 295 | |||
| 296 | for (i = begin; i < end; i++) { | ||
| 297 | struct list_lru_one *l; | ||
| 298 | |||
| 299 | l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL); | ||
| 300 | if (!l) | ||
| 301 | goto fail; | ||
| 302 | |||
| 303 | init_one_lru(l); | ||
| 304 | memcg_lrus->lru[i] = l; | ||
| 305 | } | ||
| 306 | return 0; | ||
| 307 | fail: | ||
| 308 | __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1); | ||
| 309 | return -ENOMEM; | ||
| 310 | } | ||
| 311 | |||
| 312 | static int memcg_init_list_lru_node(struct list_lru_node *nlru) | ||
| 313 | { | ||
| 314 | int size = memcg_nr_cache_ids; | ||
| 315 | |||
| 316 | nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL); | ||
| 317 | if (!nlru->memcg_lrus) | ||
| 318 | return -ENOMEM; | ||
| 319 | |||
| 320 | if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { | ||
| 321 | kfree(nlru->memcg_lrus); | ||
| 322 | return -ENOMEM; | ||
| 323 | } | ||
| 324 | |||
| 325 | return 0; | ||
| 326 | } | ||
| 327 | |||
| 328 | static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) | ||
| 329 | { | ||
| 330 | __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); | ||
| 331 | kfree(nlru->memcg_lrus); | ||
| 332 | } | ||
| 333 | |||
| 334 | static int memcg_update_list_lru_node(struct list_lru_node *nlru, | ||
| 335 | int old_size, int new_size) | ||
| 336 | { | ||
| 337 | struct list_lru_memcg *old, *new; | ||
| 338 | |||
| 339 | BUG_ON(old_size > new_size); | ||
| 340 | |||
| 341 | old = nlru->memcg_lrus; | ||
| 342 | new = kmalloc(new_size * sizeof(void *), GFP_KERNEL); | ||
| 343 | if (!new) | ||
| 344 | return -ENOMEM; | ||
| 345 | |||
| 346 | if (__memcg_init_list_lru_node(new, old_size, new_size)) { | ||
| 347 | kfree(new); | ||
| 348 | return -ENOMEM; | ||
| 349 | } | ||
| 350 | |||
| 351 | memcpy(new, old, old_size * sizeof(void *)); | ||
| 352 | |||
| 353 | /* | ||
| 354 | * The lock guarantees that we won't race with a reader | ||
| 355 | * (see list_lru_from_memcg_idx). | ||
| 356 | * | ||
| 357 | * Since list_lru_{add,del} may be called under an IRQ-safe lock, | ||
| 358 | * we have to use IRQ-safe primitives here to avoid deadlock. | ||
| 359 | */ | ||
| 360 | spin_lock_irq(&nlru->lock); | ||
| 361 | nlru->memcg_lrus = new; | ||
| 362 | spin_unlock_irq(&nlru->lock); | ||
| 363 | |||
| 364 | kfree(old); | ||
| 365 | return 0; | ||
| 366 | } | ||
| 367 | |||
| 368 | static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, | ||
| 369 | int old_size, int new_size) | ||
| 370 | { | ||
| 371 | /* do not bother shrinking the array back to the old size, because we | ||
| 372 | * cannot handle allocation failures here */ | ||
| 373 | __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); | ||
| 374 | } | ||
| 375 | |||
| 376 | static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) | ||
| 377 | { | ||
| 378 | int i; | ||
| 379 | |||
| 380 | for (i = 0; i < nr_node_ids; i++) { | ||
| 381 | if (!memcg_aware) | ||
| 382 | lru->node[i].memcg_lrus = NULL; | ||
| 383 | else if (memcg_init_list_lru_node(&lru->node[i])) | ||
| 384 | goto fail; | ||
| 385 | } | ||
| 386 | return 0; | ||
| 387 | fail: | ||
| 388 | for (i = i - 1; i >= 0; i--) | ||
| 389 | memcg_destroy_list_lru_node(&lru->node[i]); | ||
| 390 | return -ENOMEM; | ||
| 391 | } | ||
| 392 | |||
| 393 | static void memcg_destroy_list_lru(struct list_lru *lru) | ||
| 394 | { | ||
| 395 | int i; | ||
| 396 | |||
| 397 | if (!list_lru_memcg_aware(lru)) | ||
| 398 | return; | ||
| 399 | |||
| 400 | for (i = 0; i < nr_node_ids; i++) | ||
| 401 | memcg_destroy_list_lru_node(&lru->node[i]); | ||
| 402 | } | ||
| 403 | |||
| 404 | static int memcg_update_list_lru(struct list_lru *lru, | ||
| 405 | int old_size, int new_size) | ||
| 406 | { | ||
| 407 | int i; | ||
| 408 | |||
| 409 | if (!list_lru_memcg_aware(lru)) | ||
| 410 | return 0; | ||
| 411 | |||
| 412 | for (i = 0; i < nr_node_ids; i++) { | ||
| 413 | if (memcg_update_list_lru_node(&lru->node[i], | ||
| 414 | old_size, new_size)) | ||
| 415 | goto fail; | ||
| 416 | } | ||
| 417 | return 0; | ||
| 418 | fail: | ||
| 419 | for (i = i - 1; i >= 0; i--) | ||
| 420 | memcg_cancel_update_list_lru_node(&lru->node[i], | ||
| 421 | old_size, new_size); | ||
| 422 | return -ENOMEM; | ||
| 423 | } | ||
| 424 | |||
| 425 | static void memcg_cancel_update_list_lru(struct list_lru *lru, | ||
| 426 | int old_size, int new_size) | ||
| 427 | { | ||
| 428 | int i; | ||
| 429 | |||
| 430 | if (!list_lru_memcg_aware(lru)) | ||
| 431 | return; | ||
| 432 | |||
| 433 | for (i = 0; i < nr_node_ids; i++) | ||
| 434 | memcg_cancel_update_list_lru_node(&lru->node[i], | ||
| 435 | old_size, new_size); | ||
| 436 | } | ||
| 437 | |||
| 438 | int memcg_update_all_list_lrus(int new_size) | ||
| 439 | { | ||
| 440 | int ret = 0; | ||
| 441 | struct list_lru *lru; | ||
| 442 | int old_size = memcg_nr_cache_ids; | ||
| 443 | |||
| 444 | mutex_lock(&list_lrus_mutex); | ||
| 445 | list_for_each_entry(lru, &list_lrus, list) { | ||
| 446 | ret = memcg_update_list_lru(lru, old_size, new_size); | ||
| 447 | if (ret) | ||
| 448 | goto fail; | ||
| 449 | } | ||
| 450 | out: | ||
| 451 | mutex_unlock(&list_lrus_mutex); | ||
| 452 | return ret; | ||
| 453 | fail: | ||
| 454 | list_for_each_entry_continue_reverse(lru, &list_lrus, list) | ||
| 455 | memcg_cancel_update_list_lru(lru, old_size, new_size); | ||
| 456 | goto out; | ||
| 457 | } | ||
| 458 | |||
| 459 | static void memcg_drain_list_lru_node(struct list_lru_node *nlru, | ||
| 460 | int src_idx, int dst_idx) | ||
| 461 | { | ||
| 462 | struct list_lru_one *src, *dst; | ||
| 463 | |||
| 464 | /* | ||
| 465 | * Since list_lru_{add,del} may be called under an IRQ-safe lock, | ||
| 466 | * we have to use IRQ-safe primitives here to avoid deadlock. | ||
| 467 | */ | ||
| 468 | spin_lock_irq(&nlru->lock); | ||
| 469 | |||
| 470 | src = list_lru_from_memcg_idx(nlru, src_idx); | ||
| 471 | dst = list_lru_from_memcg_idx(nlru, dst_idx); | ||
| 472 | |||
| 473 | list_splice_init(&src->list, &dst->list); | ||
| 474 | dst->nr_items += src->nr_items; | ||
| 475 | src->nr_items = 0; | ||
| 476 | |||
| 477 | spin_unlock_irq(&nlru->lock); | ||
| 478 | } | ||
| 479 | |||
| 480 | static void memcg_drain_list_lru(struct list_lru *lru, | ||
| 481 | int src_idx, int dst_idx) | ||
| 482 | { | ||
| 483 | int i; | ||
| 484 | |||
| 485 | if (!list_lru_memcg_aware(lru)) | ||
| 486 | return; | ||
| 487 | |||
| 488 | for (i = 0; i < nr_node_ids; i++) | ||
| 489 | memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); | ||
| 490 | } | ||
| 491 | |||
| 492 | void memcg_drain_all_list_lrus(int src_idx, int dst_idx) | ||
| 493 | { | ||
| 494 | struct list_lru *lru; | ||
| 495 | |||
| 496 | mutex_lock(&list_lrus_mutex); | ||
| 497 | list_for_each_entry(lru, &list_lrus, list) | ||
| 498 | memcg_drain_list_lru(lru, src_idx, dst_idx); | ||
| 499 | mutex_unlock(&list_lrus_mutex); | ||
| 500 | } | ||
| 501 | #else | ||
| 502 | static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) | ||
| 503 | { | ||
| 504 | return 0; | ||
| 505 | } | ||
| 506 | |||
| 507 | static void memcg_destroy_list_lru(struct list_lru *lru) | ||
| 508 | { | ||
| 509 | } | ||
| 510 | #endif /* CONFIG_MEMCG_KMEM */ | ||
| 511 | |||
| 512 | int __list_lru_init(struct list_lru *lru, bool memcg_aware, | ||
| 513 | struct lock_class_key *key) | ||
| 128 | { | 514 | { |
| 129 | int i; | 515 | int i; |
| 130 | size_t size = sizeof(*lru->node) * nr_node_ids; | 516 | size_t size = sizeof(*lru->node) * nr_node_ids; |
| 517 | int err = -ENOMEM; | ||
| 518 | |||
| 519 | memcg_get_cache_ids(); | ||
| 131 | 520 | ||
| 132 | lru->node = kzalloc(size, GFP_KERNEL); | 521 | lru->node = kzalloc(size, GFP_KERNEL); |
| 133 | if (!lru->node) | 522 | if (!lru->node) |
| 134 | return -ENOMEM; | 523 | goto out; |
| 135 | 524 | ||
| 136 | nodes_clear(lru->active_nodes); | ||
| 137 | for (i = 0; i < nr_node_ids; i++) { | 525 | for (i = 0; i < nr_node_ids; i++) { |
| 138 | spin_lock_init(&lru->node[i].lock); | 526 | spin_lock_init(&lru->node[i].lock); |
| 139 | if (key) | 527 | if (key) |
| 140 | lockdep_set_class(&lru->node[i].lock, key); | 528 | lockdep_set_class(&lru->node[i].lock, key); |
| 141 | INIT_LIST_HEAD(&lru->node[i].list); | 529 | init_one_lru(&lru->node[i].lru); |
| 142 | lru->node[i].nr_items = 0; | ||
| 143 | } | 530 | } |
| 144 | return 0; | 531 | |
| 532 | err = memcg_init_list_lru(lru, memcg_aware); | ||
| 533 | if (err) { | ||
| 534 | kfree(lru->node); | ||
| 535 | goto out; | ||
| 536 | } | ||
| 537 | |||
| 538 | list_lru_register(lru); | ||
| 539 | out: | ||
| 540 | memcg_put_cache_ids(); | ||
| 541 | return err; | ||
| 145 | } | 542 | } |
| 146 | EXPORT_SYMBOL_GPL(list_lru_init_key); | 543 | EXPORT_SYMBOL_GPL(__list_lru_init); |
| 147 | 544 | ||
| 148 | void list_lru_destroy(struct list_lru *lru) | 545 | void list_lru_destroy(struct list_lru *lru) |
| 149 | { | 546 | { |
| 547 | /* Already destroyed or not yet initialized? */ | ||
| 548 | if (!lru->node) | ||
| 549 | return; | ||
| 550 | |||
| 551 | memcg_get_cache_ids(); | ||
| 552 | |||
| 553 | list_lru_unregister(lru); | ||
| 554 | |||
| 555 | memcg_destroy_list_lru(lru); | ||
| 150 | kfree(lru->node); | 556 | kfree(lru->node); |
| 557 | lru->node = NULL; | ||
| 558 | |||
| 559 | memcg_put_cache_ids(); | ||
| 151 | } | 560 | } |
| 152 | EXPORT_SYMBOL_GPL(list_lru_destroy); | 561 | EXPORT_SYMBOL_GPL(list_lru_destroy); |
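
The list_lru.c hunks above make each list_lru optionally memcg aware: a node's single list becomes one struct list_lru_one per kmem-active memcg plus a global one, selected through list_lru_from_kmem()/list_lru_from_memcg_idx(), and walk callbacks now receive the list_lru_one so they can drop items with list_lru_isolate()/list_lru_isolate_move(). Below is a minimal sketch, not part of the patch, of a caller under the reworked API; the exact list_lru_walk_cb typedef and any list_lru_init_memcg()-style wrapper are assumed to come from include/linux/list_lru.h, which this diff does not show.

#include <linux/list_lru.h>
#include <linux/memcontrol.h>
#include <linux/spinlock.h>

static struct list_lru my_lru;

/* Walk callback: the per-memcg/per-node list is now passed in explicitly. */
static enum lru_status my_isolate(struct list_head *item,
				  struct list_lru_one *list,
				  spinlock_t *lru_lock, void *cb_arg)
{
	/* Unlink the item and decrement that list's nr_items. */
	list_lru_isolate(list, item);
	return LRU_REMOVED;
}

static int my_setup(void)
{
	/* true = memcg aware: allocates the per-memcg list_lru_one arrays. */
	return __list_lru_init(&my_lru, true, NULL);
}

static void my_shrink(int nid, struct mem_cgroup *memcg)
{
	unsigned long nr_to_walk = 128;

	/* Scans only the given memcg's list on this node. */
	list_lru_walk_one(&my_lru, nid, memcg, my_isolate, NULL, &nr_to_walk);
}

static void my_teardown(void)
{
	list_lru_destroy(&my_lru);
}
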
diff --git a/mm/madvise.c b/mm/madvise.c index a271adc93289..d551475517bf 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
| @@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, | |||
| 155 | pte = *(orig_pte + ((index - start) / PAGE_SIZE)); | 155 | pte = *(orig_pte + ((index - start) / PAGE_SIZE)); |
| 156 | pte_unmap_unlock(orig_pte, ptl); | 156 | pte_unmap_unlock(orig_pte, ptl); |
| 157 | 157 | ||
| 158 | if (pte_present(pte) || pte_none(pte) || pte_file(pte)) | 158 | if (pte_present(pte) || pte_none(pte)) |
| 159 | continue; | 159 | continue; |
| 160 | entry = pte_to_swp_entry(pte); | 160 | entry = pte_to_swp_entry(pte); |
| 161 | if (unlikely(non_swap_entry(entry))) | 161 | if (unlikely(non_swap_entry(entry))) |
| @@ -222,21 +222,24 @@ static long madvise_willneed(struct vm_area_struct *vma, | |||
| 222 | struct file *file = vma->vm_file; | 222 | struct file *file = vma->vm_file; |
| 223 | 223 | ||
| 224 | #ifdef CONFIG_SWAP | 224 | #ifdef CONFIG_SWAP |
| 225 | if (!file || mapping_cap_swap_backed(file->f_mapping)) { | 225 | if (!file) { |
| 226 | *prev = vma; | 226 | *prev = vma; |
| 227 | if (!file) | 227 | force_swapin_readahead(vma, start, end); |
| 228 | force_swapin_readahead(vma, start, end); | ||
| 229 | else | ||
| 230 | force_shm_swapin_readahead(vma, start, end, | ||
| 231 | file->f_mapping); | ||
| 232 | return 0; | 228 | return 0; |
| 233 | } | 229 | } |
| 234 | #endif | ||
| 235 | 230 | ||
| 231 | if (shmem_mapping(file->f_mapping)) { | ||
| 232 | *prev = vma; | ||
| 233 | force_shm_swapin_readahead(vma, start, end, | ||
| 234 | file->f_mapping); | ||
| 235 | return 0; | ||
| 236 | } | ||
| 237 | #else | ||
| 236 | if (!file) | 238 | if (!file) |
| 237 | return -EBADF; | 239 | return -EBADF; |
| 240 | #endif | ||
| 238 | 241 | ||
| 239 | if (file->f_mapping->a_ops->get_xip_mem) { | 242 | if (IS_DAX(file_inode(file))) { |
| 240 | /* no bad return value, but ignore advice */ | 243 | /* no bad return value, but ignore advice */ |
| 241 | return 0; | 244 | return 0; |
| 242 | } | 245 | } |
| @@ -278,14 +281,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, | |||
| 278 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) | 281 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) |
| 279 | return -EINVAL; | 282 | return -EINVAL; |
| 280 | 283 | ||
| 281 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { | 284 | zap_page_range(vma, start, end - start, NULL); |
| 282 | struct zap_details details = { | ||
| 283 | .nonlinear_vma = vma, | ||
| 284 | .last_index = ULONG_MAX, | ||
| 285 | }; | ||
| 286 | zap_page_range(vma, start, end - start, &details); | ||
| 287 | } else | ||
| 288 | zap_page_range(vma, start, end - start, NULL); | ||
| 289 | return 0; | 285 | return 0; |
| 290 | } | 286 | } |
| 291 | 287 | ||
| @@ -303,7 +299,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
| 303 | 299 | ||
| 304 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ | 300 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ |
| 305 | 301 | ||
| 306 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) | 302 | if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) |
| 307 | return -EINVAL; | 303 | return -EINVAL; |
| 308 | 304 | ||
| 309 | f = vma->vm_file; | 305 | f = vma->vm_file; |
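
In madvise.c the VM_NONLINEAR special cases are gone: MADV_DONTNEED now always takes the plain zap_page_range() path, MADV_WILLNEED splits swap readahead between anonymous VMAs and shmem mappings (the shmem_mapping() check), and the old get_xip_mem test becomes IS_DAX(). For reference, a small userspace sketch, not part of the patch, of the two advice calls these paths service:

#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	/* Hint: bring the range in soon (force_swapin_readahead() path). */
	madvise(buf, len, MADV_WILLNEED);

	/* Drop the range; in the kernel this is now a plain zap_page_range(). */
	madvise(buf, len, MADV_DONTNEED);

	munmap(buf, len);
	return 0;
}
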
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2f6893c2f01b..9fe07692eaad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys); | |||
| 72 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 72 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
| 73 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 73 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
| 74 | 74 | ||
| 75 | /* Whether the swap controller is active */ | ||
| 75 | #ifdef CONFIG_MEMCG_SWAP | 76 | #ifdef CONFIG_MEMCG_SWAP |
| 76 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | ||
| 77 | int do_swap_account __read_mostly; | 77 | int do_swap_account __read_mostly; |
| 78 | |||
| 79 | /* for remember boot option*/ | ||
| 80 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | ||
| 81 | static int really_do_swap_account __initdata = 1; | ||
| 82 | #else | ||
| 83 | static int really_do_swap_account __initdata; | ||
| 84 | #endif | ||
| 85 | |||
| 86 | #else | 78 | #else |
| 87 | #define do_swap_account 0 | 79 | #define do_swap_account 0 |
| 88 | #endif | 80 | #endif |
| 89 | 81 | ||
| 90 | |||
| 91 | static const char * const mem_cgroup_stat_names[] = { | 82 | static const char * const mem_cgroup_stat_names[] = { |
| 92 | "cache", | 83 | "cache", |
| 93 | "rss", | 84 | "rss", |
| @@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = { | |||
| 97 | "swap", | 88 | "swap", |
| 98 | }; | 89 | }; |
| 99 | 90 | ||
| 100 | enum mem_cgroup_events_index { | ||
| 101 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ | ||
| 102 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ | ||
| 103 | MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ | ||
| 104 | MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ | ||
| 105 | MEM_CGROUP_EVENTS_NSTATS, | ||
| 106 | }; | ||
| 107 | |||
| 108 | static const char * const mem_cgroup_events_names[] = { | 91 | static const char * const mem_cgroup_events_names[] = { |
| 109 | "pgpgin", | 92 | "pgpgin", |
| 110 | "pgpgout", | 93 | "pgpgout", |
| @@ -138,7 +121,7 @@ enum mem_cgroup_events_target { | |||
| 138 | 121 | ||
| 139 | struct mem_cgroup_stat_cpu { | 122 | struct mem_cgroup_stat_cpu { |
| 140 | long count[MEM_CGROUP_STAT_NSTATS]; | 123 | long count[MEM_CGROUP_STAT_NSTATS]; |
| 141 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; | 124 | unsigned long events[MEMCG_NR_EVENTS]; |
| 142 | unsigned long nr_page_events; | 125 | unsigned long nr_page_events; |
| 143 | unsigned long targets[MEM_CGROUP_NTARGETS]; | 126 | unsigned long targets[MEM_CGROUP_NTARGETS]; |
| 144 | }; | 127 | }; |
| @@ -284,6 +267,10 @@ struct mem_cgroup { | |||
| 284 | struct page_counter memsw; | 267 | struct page_counter memsw; |
| 285 | struct page_counter kmem; | 268 | struct page_counter kmem; |
| 286 | 269 | ||
| 270 | /* Normal memory consumption range */ | ||
| 271 | unsigned long low; | ||
| 272 | unsigned long high; | ||
| 273 | |||
| 287 | unsigned long soft_limit; | 274 | unsigned long soft_limit; |
| 288 | 275 | ||
| 289 | /* vmpressure notifications */ | 276 | /* vmpressure notifications */ |
| @@ -325,9 +312,11 @@ struct mem_cgroup { | |||
| 325 | /* | 312 | /* |
| 326 | * set > 0 if pages under this cgroup are moving to other cgroup. | 313 | * set > 0 if pages under this cgroup are moving to other cgroup. |
| 327 | */ | 314 | */ |
| 328 | atomic_t moving_account; | 315 | atomic_t moving_account; |
| 329 | /* taken only while moving_account > 0 */ | 316 | /* taken only while moving_account > 0 */ |
| 330 | spinlock_t move_lock; | 317 | spinlock_t move_lock; |
| 318 | struct task_struct *move_lock_task; | ||
| 319 | unsigned long move_lock_flags; | ||
| 331 | /* | 320 | /* |
| 332 | * percpu counter. | 321 | * percpu counter. |
| 333 | */ | 322 | */ |
| @@ -343,11 +332,10 @@ struct mem_cgroup { | |||
| 343 | struct cg_proto tcp_mem; | 332 | struct cg_proto tcp_mem; |
| 344 | #endif | 333 | #endif |
| 345 | #if defined(CONFIG_MEMCG_KMEM) | 334 | #if defined(CONFIG_MEMCG_KMEM) |
| 346 | /* analogous to slab_common's slab_caches list, but per-memcg; | 335 | /* Index in the kmem_cache->memcg_params.memcg_caches array */ |
| 347 | * protected by memcg_slab_mutex */ | ||
| 348 | struct list_head memcg_slab_caches; | ||
| 349 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | ||
| 350 | int kmemcg_id; | 336 | int kmemcg_id; |
| 337 | bool kmem_acct_activated; | ||
| 338 | bool kmem_acct_active; | ||
| 351 | #endif | 339 | #endif |
| 352 | 340 | ||
| 353 | int last_scanned_node; | 341 | int last_scanned_node; |
| @@ -366,29 +354,26 @@ struct mem_cgroup { | |||
| 366 | }; | 354 | }; |
| 367 | 355 | ||
| 368 | #ifdef CONFIG_MEMCG_KMEM | 356 | #ifdef CONFIG_MEMCG_KMEM |
| 369 | static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | 357 | bool memcg_kmem_is_active(struct mem_cgroup *memcg) |
| 370 | { | 358 | { |
| 371 | return memcg->kmemcg_id >= 0; | 359 | return memcg->kmem_acct_active; |
| 372 | } | 360 | } |
| 373 | #endif | 361 | #endif |
| 374 | 362 | ||
| 375 | /* Stuffs for move charges at task migration. */ | 363 | /* Stuffs for move charges at task migration. */ |
| 376 | /* | 364 | /* |
| 377 | * Types of charges to be moved. "move_charge_at_immitgrate" and | 365 | * Types of charges to be moved. |
| 378 | * "immigrate_flags" are treated as a left-shifted bitmap of these types. | ||
| 379 | */ | 366 | */ |
| 380 | enum move_type { | 367 | #define MOVE_ANON 0x1U |
| 381 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 368 | #define MOVE_FILE 0x2U |
| 382 | MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ | 369 | #define MOVE_MASK (MOVE_ANON | MOVE_FILE) |
| 383 | NR_MOVE_TYPE, | ||
| 384 | }; | ||
| 385 | 370 | ||
| 386 | /* "mc" and its members are protected by cgroup_mutex */ | 371 | /* "mc" and its members are protected by cgroup_mutex */ |
| 387 | static struct move_charge_struct { | 372 | static struct move_charge_struct { |
| 388 | spinlock_t lock; /* for from, to */ | 373 | spinlock_t lock; /* for from, to */ |
| 389 | struct mem_cgroup *from; | 374 | struct mem_cgroup *from; |
| 390 | struct mem_cgroup *to; | 375 | struct mem_cgroup *to; |
| 391 | unsigned long immigrate_flags; | 376 | unsigned long flags; |
| 392 | unsigned long precharge; | 377 | unsigned long precharge; |
| 393 | unsigned long moved_charge; | 378 | unsigned long moved_charge; |
| 394 | unsigned long moved_swap; | 379 | unsigned long moved_swap; |
| @@ -399,16 +384,6 @@ static struct move_charge_struct { | |||
| 399 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | 384 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
| 400 | }; | 385 | }; |
| 401 | 386 | ||
| 402 | static bool move_anon(void) | ||
| 403 | { | ||
| 404 | return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); | ||
| 405 | } | ||
| 406 | |||
| 407 | static bool move_file(void) | ||
| 408 | { | ||
| 409 | return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); | ||
| 410 | } | ||
| 411 | |||
| 412 | /* | 387 | /* |
| 413 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 388 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
| 414 | * limit reclaim to prevent infinite loops, if they ever occur. | 389 | * limit reclaim to prevent infinite loops, if they ever occur. |
| @@ -544,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
| 544 | } | 519 | } |
| 545 | EXPORT_SYMBOL(tcp_proto_cgroup); | 520 | EXPORT_SYMBOL(tcp_proto_cgroup); |
| 546 | 521 | ||
| 547 | static void disarm_sock_keys(struct mem_cgroup *memcg) | ||
| 548 | { | ||
| 549 | if (!memcg_proto_activated(&memcg->tcp_mem)) | ||
| 550 | return; | ||
| 551 | static_key_slow_dec(&memcg_socket_limit_enabled); | ||
| 552 | } | ||
| 553 | #else | ||
| 554 | static void disarm_sock_keys(struct mem_cgroup *memcg) | ||
| 555 | { | ||
| 556 | } | ||
| 557 | #endif | 522 | #endif |
| 558 | 523 | ||
| 559 | #ifdef CONFIG_MEMCG_KMEM | 524 | #ifdef CONFIG_MEMCG_KMEM |
| 560 | /* | 525 | /* |
| 561 | * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. | 526 | * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. |
| 562 | * The main reason for not using cgroup id for this: | 527 | * The main reason for not using cgroup id for this: |
| 563 | * this works better in sparse environments, where we have a lot of memcgs, | 528 | * this works better in sparse environments, where we have a lot of memcgs, |
| 564 | * but only a few kmem-limited. Or also, if we have, for instance, 200 | 529 | * but only a few kmem-limited. Or also, if we have, for instance, 200 |
| 565 | * memcgs, and none but the 200th is kmem-limited, we'd have to have a | 530 | * memcgs, and none but the 200th is kmem-limited, we'd have to have a |
| 566 | * 200 entry array for that. | 531 | * 200 entry array for that. |
| 567 | * | 532 | * |
| 568 | * The current size of the caches array is stored in | 533 | * The current size of the caches array is stored in memcg_nr_cache_ids. It |
| 569 | * memcg_limited_groups_array_size. It will double each time we have to | 534 | * will double each time we have to increase it. |
| 570 | * increase it. | ||
| 571 | */ | 535 | */ |
| 572 | static DEFINE_IDA(kmem_limited_groups); | 536 | static DEFINE_IDA(memcg_cache_ida); |
| 573 | int memcg_limited_groups_array_size; | 537 | int memcg_nr_cache_ids; |
| 538 | |||
| 539 | /* Protects memcg_nr_cache_ids */ | ||
| 540 | static DECLARE_RWSEM(memcg_cache_ids_sem); | ||
| 541 | |||
| 542 | void memcg_get_cache_ids(void) | ||
| 543 | { | ||
| 544 | down_read(&memcg_cache_ids_sem); | ||
| 545 | } | ||
| 546 | |||
| 547 | void memcg_put_cache_ids(void) | ||
| 548 | { | ||
| 549 | up_read(&memcg_cache_ids_sem); | ||
| 550 | } | ||
| 574 | 551 | ||
| 575 | /* | 552 | /* |
| 576 | * MIN_SIZE is different than 1, because we would like to avoid going through | 553 | * MIN_SIZE is different than 1, because we would like to avoid going through |
| @@ -596,32 +573,8 @@ int memcg_limited_groups_array_size; | |||
| 596 | struct static_key memcg_kmem_enabled_key; | 573 | struct static_key memcg_kmem_enabled_key; |
| 597 | EXPORT_SYMBOL(memcg_kmem_enabled_key); | 574 | EXPORT_SYMBOL(memcg_kmem_enabled_key); |
| 598 | 575 | ||
| 599 | static void memcg_free_cache_id(int id); | ||
| 600 | |||
| 601 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | ||
| 602 | { | ||
| 603 | if (memcg_kmem_is_active(memcg)) { | ||
| 604 | static_key_slow_dec(&memcg_kmem_enabled_key); | ||
| 605 | memcg_free_cache_id(memcg->kmemcg_id); | ||
| 606 | } | ||
| 607 | /* | ||
| 608 | * This check can't live in kmem destruction function, | ||
| 609 | * since the charges will outlive the cgroup | ||
| 610 | */ | ||
| 611 | WARN_ON(page_counter_read(&memcg->kmem)); | ||
| 612 | } | ||
| 613 | #else | ||
| 614 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | ||
| 615 | { | ||
| 616 | } | ||
| 617 | #endif /* CONFIG_MEMCG_KMEM */ | 576 | #endif /* CONFIG_MEMCG_KMEM */ |
| 618 | 577 | ||
| 619 | static void disarm_static_keys(struct mem_cgroup *memcg) | ||
| 620 | { | ||
| 621 | disarm_sock_keys(memcg); | ||
| 622 | disarm_kmem_keys(memcg); | ||
| 623 | } | ||
| 624 | |||
| 625 | static struct mem_cgroup_per_zone * | 578 | static struct mem_cgroup_per_zone * |
| 626 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) | 579 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) |
| 627 | { | 580 | { |
| @@ -1368,6 +1321,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
| 1368 | return inactive * inactive_ratio < active; | 1321 | return inactive * inactive_ratio < active; |
| 1369 | } | 1322 | } |
| 1370 | 1323 | ||
| 1324 | bool mem_cgroup_lruvec_online(struct lruvec *lruvec) | ||
| 1325 | { | ||
| 1326 | struct mem_cgroup_per_zone *mz; | ||
| 1327 | struct mem_cgroup *memcg; | ||
| 1328 | |||
| 1329 | if (mem_cgroup_disabled()) | ||
| 1330 | return true; | ||
| 1331 | |||
| 1332 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); | ||
| 1333 | memcg = mz->memcg; | ||
| 1334 | |||
| 1335 | return !!(memcg->css.flags & CSS_ONLINE); | ||
| 1336 | } | ||
| 1337 | |||
| 1371 | #define mem_cgroup_from_counter(counter, member) \ | 1338 | #define mem_cgroup_from_counter(counter, member) \ |
| 1372 | container_of(counter, struct mem_cgroup, member) | 1339 | container_of(counter, struct mem_cgroup, member) |
| 1373 | 1340 | ||
| @@ -1560,7 +1527,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
| 1560 | * quickly exit and free its memory. | 1527 | * quickly exit and free its memory. |
| 1561 | */ | 1528 | */ |
| 1562 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { | 1529 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { |
| 1563 | set_thread_flag(TIF_MEMDIE); | 1530 | mark_tsk_oom_victim(current); |
| 1564 | return; | 1531 | return; |
| 1565 | } | 1532 | } |
| 1566 | 1533 | ||
| @@ -1934,7 +1901,7 @@ bool mem_cgroup_oom_synchronize(bool handle) | |||
| 1934 | if (!memcg) | 1901 | if (!memcg) |
| 1935 | return false; | 1902 | return false; |
| 1936 | 1903 | ||
| 1937 | if (!handle) | 1904 | if (!handle || oom_killer_disabled) |
| 1938 | goto cleanup; | 1905 | goto cleanup; |
| 1939 | 1906 | ||
| 1940 | owait.memcg = memcg; | 1907 | owait.memcg = memcg; |
| @@ -1980,34 +1947,33 @@ cleanup: | |||
| 1980 | /** | 1947 | /** |
| 1981 | * mem_cgroup_begin_page_stat - begin a page state statistics transaction | 1948 | * mem_cgroup_begin_page_stat - begin a page state statistics transaction |
| 1982 | * @page: page that is going to change accounted state | 1949 | * @page: page that is going to change accounted state |
| 1983 | * @locked: &memcg->move_lock slowpath was taken | ||
| 1984 | * @flags: IRQ-state flags for &memcg->move_lock | ||
| 1985 | * | 1950 | * |
| 1986 | * This function must mark the beginning of an accounted page state | 1951 | * This function must mark the beginning of an accounted page state |
| 1987 | * change to prevent double accounting when the page is concurrently | 1952 | * change to prevent double accounting when the page is concurrently |
| 1988 | * being moved to another memcg: | 1953 | * being moved to another memcg: |
| 1989 | * | 1954 | * |
| 1990 | * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 1955 | * memcg = mem_cgroup_begin_page_stat(page); |
| 1991 | * if (TestClearPageState(page)) | 1956 | * if (TestClearPageState(page)) |
| 1992 | * mem_cgroup_update_page_stat(memcg, state, -1); | 1957 | * mem_cgroup_update_page_stat(memcg, state, -1); |
| 1993 | * mem_cgroup_end_page_stat(memcg, locked, flags); | 1958 | * mem_cgroup_end_page_stat(memcg); |
| 1994 | * | ||
| 1995 | * The RCU lock is held throughout the transaction. The fast path can | ||
| 1996 | * get away without acquiring the memcg->move_lock (@locked is false) | ||
| 1997 | * because page moving starts with an RCU grace period. | ||
| 1998 | * | ||
| 1999 | * The RCU lock also protects the memcg from being freed when the page | ||
| 2000 | * state that is going to change is the only thing preventing the page | ||
| 2001 | * from being uncharged. E.g. end-writeback clearing PageWriteback(), | ||
| 2002 | * which allows migration to go ahead and uncharge the page before the | ||
| 2003 | * account transaction might be complete. | ||
| 2004 | */ | 1959 | */ |
| 2005 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, | 1960 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) |
| 2006 | bool *locked, | ||
| 2007 | unsigned long *flags) | ||
| 2008 | { | 1961 | { |
| 2009 | struct mem_cgroup *memcg; | 1962 | struct mem_cgroup *memcg; |
| 1963 | unsigned long flags; | ||
| 2010 | 1964 | ||
| 1965 | /* | ||
| 1966 | * The RCU lock is held throughout the transaction. The fast | ||
| 1967 | * path can get away without acquiring the memcg->move_lock | ||
| 1968 | * because page moving starts with an RCU grace period. | ||
| 1969 | * | ||
| 1970 | * The RCU lock also protects the memcg from being freed when | ||
| 1971 | * the page state that is going to change is the only thing | ||
| 1972 | * preventing the page from being uncharged. | ||
| 1973 | * E.g. end-writeback clearing PageWriteback(), which allows | ||
| 1974 | * migration to go ahead and uncharge the page before the | ||
| 1975 | * account transaction might be complete. | ||
| 1976 | */ | ||
| 2011 | rcu_read_lock(); | 1977 | rcu_read_lock(); |
| 2012 | 1978 | ||
| 2013 | if (mem_cgroup_disabled()) | 1979 | if (mem_cgroup_disabled()) |
| @@ -2017,16 +1983,22 @@ again: | |||
| 2017 | if (unlikely(!memcg)) | 1983 | if (unlikely(!memcg)) |
| 2018 | return NULL; | 1984 | return NULL; |
| 2019 | 1985 | ||
| 2020 | *locked = false; | ||
| 2021 | if (atomic_read(&memcg->moving_account) <= 0) | 1986 | if (atomic_read(&memcg->moving_account) <= 0) |
| 2022 | return memcg; | 1987 | return memcg; |
| 2023 | 1988 | ||
| 2024 | spin_lock_irqsave(&memcg->move_lock, *flags); | 1989 | spin_lock_irqsave(&memcg->move_lock, flags); |
| 2025 | if (memcg != page->mem_cgroup) { | 1990 | if (memcg != page->mem_cgroup) { |
| 2026 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | 1991 | spin_unlock_irqrestore(&memcg->move_lock, flags); |
| 2027 | goto again; | 1992 | goto again; |
| 2028 | } | 1993 | } |
| 2029 | *locked = true; | 1994 | |
| 1995 | /* | ||
| 1996 | * When charge migration first begins, we can have locked and | ||
| 1997 | * unlocked page stat updates happening concurrently. Track | ||
| 1998 | * the task who has the lock for mem_cgroup_end_page_stat(). | ||
| 1999 | */ | ||
| 2000 | memcg->move_lock_task = current; | ||
| 2001 | memcg->move_lock_flags = flags; | ||
| 2030 | 2002 | ||
| 2031 | return memcg; | 2003 | return memcg; |
| 2032 | } | 2004 | } |
| @@ -2034,14 +2006,17 @@ again: | |||
| 2034 | /** | 2006 | /** |
| 2035 | * mem_cgroup_end_page_stat - finish a page state statistics transaction | 2007 | * mem_cgroup_end_page_stat - finish a page state statistics transaction |
| 2036 | * @memcg: the memcg that was accounted against | 2008 | * @memcg: the memcg that was accounted against |
| 2037 | * @locked: value received from mem_cgroup_begin_page_stat() | ||
| 2038 | * @flags: value received from mem_cgroup_begin_page_stat() | ||
| 2039 | */ | 2009 | */ |
| 2040 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, | 2010 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) |
| 2041 | unsigned long *flags) | ||
| 2042 | { | 2011 | { |
| 2043 | if (memcg && *locked) | 2012 | if (memcg && memcg->move_lock_task == current) { |
| 2044 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | 2013 | unsigned long flags = memcg->move_lock_flags; |
| 2014 | |||
| 2015 | memcg->move_lock_task = NULL; | ||
| 2016 | memcg->move_lock_flags = 0; | ||
| 2017 | |||
| 2018 | spin_unlock_irqrestore(&memcg->move_lock, flags); | ||
| 2019 | } | ||
| 2045 | 2020 | ||
| 2046 | rcu_read_unlock(); | 2021 | rcu_read_unlock(); |
| 2047 | } | 2022 | } |
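
The page-stat transaction no longer hands a locked/flags pair back to the caller: mem_cgroup_begin_page_stat() records the lock owner in memcg->move_lock_task/move_lock_flags, and mem_cgroup_end_page_stat() drops move_lock only if current is that owner. A caller sketch modelled on the usage shown in the comment above; mem_cgroup_dec_page_stat() and MEM_CGROUP_STAT_FILE_MAPPED are assumed from include/linux/memcontrol.h and are not part of this hunk.

#include <linux/memcontrol.h>
#include <linux/mm.h>

static void example_account_unmap(struct page *page)
{
	struct mem_cgroup *memcg;

	/* RCU-protected; takes memcg->move_lock only while charges move. */
	memcg = mem_cgroup_begin_page_stat(page);
	if (atomic_add_negative(-1, &page->_mapcount))
		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
	/* Unlocks only if current is the recorded move_lock owner. */
	mem_cgroup_end_page_stat(memcg);
}
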
| @@ -2134,17 +2109,6 @@ static void drain_local_stock(struct work_struct *dummy) | |||
| 2134 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | 2109 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); |
| 2135 | } | 2110 | } |
| 2136 | 2111 | ||
| 2137 | static void __init memcg_stock_init(void) | ||
| 2138 | { | ||
| 2139 | int cpu; | ||
| 2140 | |||
| 2141 | for_each_possible_cpu(cpu) { | ||
| 2142 | struct memcg_stock_pcp *stock = | ||
| 2143 | &per_cpu(memcg_stock, cpu); | ||
| 2144 | INIT_WORK(&stock->work, drain_local_stock); | ||
| 2145 | } | ||
| 2146 | } | ||
| 2147 | |||
| 2148 | /* | 2112 | /* |
| 2149 | * Cache charges(val) to local per_cpu area. | 2113 | * Cache charges(val) to local per_cpu area. |
| 2150 | * This will be consumed by consume_stock() function, later. | 2114 | * This will be consumed by consume_stock() function, later. |
| @@ -2294,6 +2258,8 @@ retry: | |||
| 2294 | if (!(gfp_mask & __GFP_WAIT)) | 2258 | if (!(gfp_mask & __GFP_WAIT)) |
| 2295 | goto nomem; | 2259 | goto nomem; |
| 2296 | 2260 | ||
| 2261 | mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); | ||
| 2262 | |||
| 2297 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, | 2263 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, |
| 2298 | gfp_mask, may_swap); | 2264 | gfp_mask, may_swap); |
| 2299 | 2265 | ||
| @@ -2335,6 +2301,8 @@ retry: | |||
| 2335 | if (fatal_signal_pending(current)) | 2301 | if (fatal_signal_pending(current)) |
| 2336 | goto bypass; | 2302 | goto bypass; |
| 2337 | 2303 | ||
| 2304 | mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); | ||
| 2305 | |||
| 2338 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); | 2306 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); |
| 2339 | nomem: | 2307 | nomem: |
| 2340 | if (!(gfp_mask & __GFP_NOFAIL)) | 2308 | if (!(gfp_mask & __GFP_NOFAIL)) |
| @@ -2346,6 +2314,16 @@ done_restock: | |||
| 2346 | css_get_many(&memcg->css, batch); | 2314 | css_get_many(&memcg->css, batch); |
| 2347 | if (batch > nr_pages) | 2315 | if (batch > nr_pages) |
| 2348 | refill_stock(memcg, batch - nr_pages); | 2316 | refill_stock(memcg, batch - nr_pages); |
| 2317 | /* | ||
| 2318 | * If the hierarchy is above the normal consumption range, | ||
| 2319 | * make the charging task trim their excess contribution. | ||
| 2320 | */ | ||
| 2321 | do { | ||
| 2322 | if (page_counter_read(&memcg->memory) <= memcg->high) | ||
| 2323 | continue; | ||
| 2324 | mem_cgroup_events(memcg, MEMCG_HIGH, 1); | ||
| 2325 | try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); | ||
| 2326 | } while ((memcg = parent_mem_cgroup(memcg))); | ||
| 2349 | done: | 2327 | done: |
| 2350 | return ret; | 2328 | return ret; |
| 2351 | } | 2329 | } |
| @@ -2476,27 +2454,8 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
| 2476 | } | 2454 | } |
| 2477 | 2455 | ||
| 2478 | #ifdef CONFIG_MEMCG_KMEM | 2456 | #ifdef CONFIG_MEMCG_KMEM |
| 2479 | /* | 2457 | int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, |
| 2480 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or | 2458 | unsigned long nr_pages) |
| 2481 | * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. | ||
| 2482 | */ | ||
| 2483 | static DEFINE_MUTEX(memcg_slab_mutex); | ||
| 2484 | |||
| 2485 | /* | ||
| 2486 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer | ||
| 2487 | * in the memcg_cache_params struct. | ||
| 2488 | */ | ||
| 2489 | static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | ||
| 2490 | { | ||
| 2491 | struct kmem_cache *cachep; | ||
| 2492 | |||
| 2493 | VM_BUG_ON(p->is_root_cache); | ||
| 2494 | cachep = p->root_cache; | ||
| 2495 | return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); | ||
| 2496 | } | ||
| 2497 | |||
| 2498 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, | ||
| 2499 | unsigned long nr_pages) | ||
| 2500 | { | 2459 | { |
| 2501 | struct page_counter *counter; | 2460 | struct page_counter *counter; |
| 2502 | int ret = 0; | 2461 | int ret = 0; |
| @@ -2533,8 +2492,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, | |||
| 2533 | return ret; | 2492 | return ret; |
| 2534 | } | 2493 | } |
| 2535 | 2494 | ||
| 2536 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, | 2495 | void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) |
| 2537 | unsigned long nr_pages) | ||
| 2538 | { | 2496 | { |
| 2539 | page_counter_uncharge(&memcg->memory, nr_pages); | 2497 | page_counter_uncharge(&memcg->memory, nr_pages); |
| 2540 | if (do_swap_account) | 2498 | if (do_swap_account) |
| @@ -2560,18 +2518,19 @@ static int memcg_alloc_cache_id(void) | |||
| 2560 | int id, size; | 2518 | int id, size; |
| 2561 | int err; | 2519 | int err; |
| 2562 | 2520 | ||
| 2563 | id = ida_simple_get(&kmem_limited_groups, | 2521 | id = ida_simple_get(&memcg_cache_ida, |
| 2564 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | 2522 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); |
| 2565 | if (id < 0) | 2523 | if (id < 0) |
| 2566 | return id; | 2524 | return id; |
| 2567 | 2525 | ||
| 2568 | if (id < memcg_limited_groups_array_size) | 2526 | if (id < memcg_nr_cache_ids) |
| 2569 | return id; | 2527 | return id; |
| 2570 | 2528 | ||
| 2571 | /* | 2529 | /* |
| 2572 | * There's no space for the new id in memcg_caches arrays, | 2530 | * There's no space for the new id in memcg_caches arrays, |
| 2573 | * so we have to grow them. | 2531 | * so we have to grow them. |
| 2574 | */ | 2532 | */ |
| 2533 | down_write(&memcg_cache_ids_sem); | ||
| 2575 | 2534 | ||
| 2576 | size = 2 * (id + 1); | 2535 | size = 2 * (id + 1); |
| 2577 | if (size < MEMCG_CACHES_MIN_SIZE) | 2536 | if (size < MEMCG_CACHES_MIN_SIZE) |
| @@ -2579,12 +2538,16 @@ static int memcg_alloc_cache_id(void) | |||
| 2579 | else if (size > MEMCG_CACHES_MAX_SIZE) | 2538 | else if (size > MEMCG_CACHES_MAX_SIZE) |
| 2580 | size = MEMCG_CACHES_MAX_SIZE; | 2539 | size = MEMCG_CACHES_MAX_SIZE; |
| 2581 | 2540 | ||
| 2582 | mutex_lock(&memcg_slab_mutex); | ||
| 2583 | err = memcg_update_all_caches(size); | 2541 | err = memcg_update_all_caches(size); |
| 2584 | mutex_unlock(&memcg_slab_mutex); | 2542 | if (!err) |
| 2543 | err = memcg_update_all_list_lrus(size); | ||
| 2544 | if (!err) | ||
| 2545 | memcg_nr_cache_ids = size; | ||
| 2546 | |||
| 2547 | up_write(&memcg_cache_ids_sem); | ||
| 2585 | 2548 | ||
| 2586 | if (err) { | 2549 | if (err) { |
| 2587 | ida_simple_remove(&kmem_limited_groups, id); | 2550 | ida_simple_remove(&memcg_cache_ida, id); |
| 2588 | return err; | 2551 | return err; |
| 2589 | } | 2552 | } |
| 2590 | return id; | 2553 | return id; |
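
Growing the kmem cache-id space is now serialized by memcg_cache_ids_sem: memcg_alloc_cache_id() takes it for write, resizes every root cache and every memcg-aware list_lru, and only then publishes the larger memcg_nr_cache_ids. Readers that size per-memcg arrays bracket the allocation with memcg_get_cache_ids()/memcg_put_cache_ids(), as __list_lru_init() does in the list_lru.c diff; that caller also registers the lru before dropping the rwsem so a later resize can find and grow its arrays. A sketch of the read-side pattern, not part of the patch:

#include <linux/memcontrol.h>
#include <linux/slab.h>

/* Size a per-memcg pointer array against a stable memcg_nr_cache_ids. */
static void **example_alloc_per_memcg_array(void)
{
	void **arr;

	memcg_get_cache_ids();		/* down_read(&memcg_cache_ids_sem) */
	arr = kcalloc(memcg_nr_cache_ids, sizeof(void *), GFP_KERNEL);
	memcg_put_cache_ids();		/* up_read(&memcg_cache_ids_sem) */

	return arr;			/* NULL on allocation failure */
}
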
| @@ -2592,136 +2555,23 @@ static int memcg_alloc_cache_id(void) | |||
| 2592 | 2555 | ||
| 2593 | static void memcg_free_cache_id(int id) | 2556 | static void memcg_free_cache_id(int id) |
| 2594 | { | 2557 | { |
| 2595 | ida_simple_remove(&kmem_limited_groups, id); | 2558 | ida_simple_remove(&memcg_cache_ida, id); |
| 2596 | } | 2559 | } |
| 2597 | 2560 | ||
| 2598 | /* | 2561 | struct memcg_kmem_cache_create_work { |
| 2599 | * We should update the current array size iff all caches updates succeed. This | ||
| 2600 | * can only be done from the slab side. The slab mutex needs to be held when | ||
| 2601 | * calling this. | ||
| 2602 | */ | ||
| 2603 | void memcg_update_array_size(int num) | ||
| 2604 | { | ||
| 2605 | memcg_limited_groups_array_size = num; | ||
| 2606 | } | ||
| 2607 | |||
| 2608 | static void memcg_register_cache(struct mem_cgroup *memcg, | ||
| 2609 | struct kmem_cache *root_cache) | ||
| 2610 | { | ||
| 2611 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by | ||
| 2612 | memcg_slab_mutex */ | ||
| 2613 | struct kmem_cache *cachep; | ||
| 2614 | int id; | ||
| 2615 | |||
| 2616 | lockdep_assert_held(&memcg_slab_mutex); | ||
| 2617 | |||
| 2618 | id = memcg_cache_id(memcg); | ||
| 2619 | |||
| 2620 | /* | ||
| 2621 | * Since per-memcg caches are created asynchronously on first | ||
| 2622 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
| 2623 | * create the same cache, but only one of them may succeed. | ||
| 2624 | */ | ||
| 2625 | if (cache_from_memcg_idx(root_cache, id)) | ||
| 2626 | return; | ||
| 2627 | |||
| 2628 | cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); | ||
| 2629 | cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); | ||
| 2630 | /* | ||
| 2631 | * If we could not create a memcg cache, do not complain, because | ||
| 2632 | * that's not critical at all as we can always proceed with the root | ||
| 2633 | * cache. | ||
| 2634 | */ | ||
| 2635 | if (!cachep) | ||
| 2636 | return; | ||
| 2637 | |||
| 2638 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | ||
| 2639 | |||
| 2640 | /* | ||
| 2641 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | ||
| 2642 | * barrier here to ensure nobody will see the kmem_cache partially | ||
| 2643 | * initialized. | ||
| 2644 | */ | ||
| 2645 | smp_wmb(); | ||
| 2646 | |||
| 2647 | BUG_ON(root_cache->memcg_params->memcg_caches[id]); | ||
| 2648 | root_cache->memcg_params->memcg_caches[id] = cachep; | ||
| 2649 | } | ||
| 2650 | |||
| 2651 | static void memcg_unregister_cache(struct kmem_cache *cachep) | ||
| 2652 | { | ||
| 2653 | struct kmem_cache *root_cache; | ||
| 2654 | struct mem_cgroup *memcg; | ||
| 2655 | int id; | ||
| 2656 | |||
| 2657 | lockdep_assert_held(&memcg_slab_mutex); | ||
| 2658 | |||
| 2659 | BUG_ON(is_root_cache(cachep)); | ||
| 2660 | |||
| 2661 | root_cache = cachep->memcg_params->root_cache; | ||
| 2662 | memcg = cachep->memcg_params->memcg; | ||
| 2663 | id = memcg_cache_id(memcg); | ||
| 2664 | |||
| 2665 | BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); | ||
| 2666 | root_cache->memcg_params->memcg_caches[id] = NULL; | ||
| 2667 | |||
| 2668 | list_del(&cachep->memcg_params->list); | ||
| 2669 | |||
| 2670 | kmem_cache_destroy(cachep); | ||
| 2671 | } | ||
| 2672 | |||
| 2673 | int __memcg_cleanup_cache_params(struct kmem_cache *s) | ||
| 2674 | { | ||
| 2675 | struct kmem_cache *c; | ||
| 2676 | int i, failed = 0; | ||
| 2677 | |||
| 2678 | mutex_lock(&memcg_slab_mutex); | ||
| 2679 | for_each_memcg_cache_index(i) { | ||
| 2680 | c = cache_from_memcg_idx(s, i); | ||
| 2681 | if (!c) | ||
| 2682 | continue; | ||
| 2683 | |||
| 2684 | memcg_unregister_cache(c); | ||
| 2685 | |||
| 2686 | if (cache_from_memcg_idx(s, i)) | ||
| 2687 | failed++; | ||
| 2688 | } | ||
| 2689 | mutex_unlock(&memcg_slab_mutex); | ||
| 2690 | return failed; | ||
| 2691 | } | ||
| 2692 | |||
| 2693 | static void memcg_unregister_all_caches(struct mem_cgroup *memcg) | ||
| 2694 | { | ||
| 2695 | struct kmem_cache *cachep; | ||
| 2696 | struct memcg_cache_params *params, *tmp; | ||
| 2697 | |||
| 2698 | if (!memcg_kmem_is_active(memcg)) | ||
| 2699 | return; | ||
| 2700 | |||
| 2701 | mutex_lock(&memcg_slab_mutex); | ||
| 2702 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { | ||
| 2703 | cachep = memcg_params_to_cache(params); | ||
| 2704 | memcg_unregister_cache(cachep); | ||
| 2705 | } | ||
| 2706 | mutex_unlock(&memcg_slab_mutex); | ||
| 2707 | } | ||
| 2708 | |||
| 2709 | struct memcg_register_cache_work { | ||
| 2710 | struct mem_cgroup *memcg; | 2562 | struct mem_cgroup *memcg; |
| 2711 | struct kmem_cache *cachep; | 2563 | struct kmem_cache *cachep; |
| 2712 | struct work_struct work; | 2564 | struct work_struct work; |
| 2713 | }; | 2565 | }; |
| 2714 | 2566 | ||
| 2715 | static void memcg_register_cache_func(struct work_struct *w) | 2567 | static void memcg_kmem_cache_create_func(struct work_struct *w) |
| 2716 | { | 2568 | { |
| 2717 | struct memcg_register_cache_work *cw = | 2569 | struct memcg_kmem_cache_create_work *cw = |
| 2718 | container_of(w, struct memcg_register_cache_work, work); | 2570 | container_of(w, struct memcg_kmem_cache_create_work, work); |
| 2719 | struct mem_cgroup *memcg = cw->memcg; | 2571 | struct mem_cgroup *memcg = cw->memcg; |
| 2720 | struct kmem_cache *cachep = cw->cachep; | 2572 | struct kmem_cache *cachep = cw->cachep; |
| 2721 | 2573 | ||
| 2722 | mutex_lock(&memcg_slab_mutex); | 2574 | memcg_create_kmem_cache(memcg, cachep); |
| 2723 | memcg_register_cache(memcg, cachep); | ||
| 2724 | mutex_unlock(&memcg_slab_mutex); | ||
| 2725 | 2575 | ||
| 2726 | css_put(&memcg->css); | 2576 | css_put(&memcg->css); |
| 2727 | kfree(cw); | 2577 | kfree(cw); |
| @@ -2730,10 +2580,10 @@ static void memcg_register_cache_func(struct work_struct *w) | |||
| 2730 | /* | 2580 | /* |
| 2731 | * Enqueue the creation of a per-memcg kmem_cache. | 2581 | * Enqueue the creation of a per-memcg kmem_cache. |
| 2732 | */ | 2582 | */ |
| 2733 | static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, | 2583 | static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, |
| 2734 | struct kmem_cache *cachep) | 2584 | struct kmem_cache *cachep) |
| 2735 | { | 2585 | { |
| 2736 | struct memcg_register_cache_work *cw; | 2586 | struct memcg_kmem_cache_create_work *cw; |
| 2737 | 2587 | ||
| 2738 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); | 2588 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); |
| 2739 | if (!cw) | 2589 | if (!cw) |
| @@ -2743,18 +2593,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
| 2743 | 2593 | ||
| 2744 | cw->memcg = memcg; | 2594 | cw->memcg = memcg; |
| 2745 | cw->cachep = cachep; | 2595 | cw->cachep = cachep; |
| 2596 | INIT_WORK(&cw->work, memcg_kmem_cache_create_func); | ||
| 2746 | 2597 | ||
| 2747 | INIT_WORK(&cw->work, memcg_register_cache_func); | ||
| 2748 | schedule_work(&cw->work); | 2598 | schedule_work(&cw->work); |
| 2749 | } | 2599 | } |
| 2750 | 2600 | ||
| 2751 | static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | 2601 | static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, |
| 2752 | struct kmem_cache *cachep) | 2602 | struct kmem_cache *cachep) |
| 2753 | { | 2603 | { |
| 2754 | /* | 2604 | /* |
| 2755 | * We need to stop accounting when we kmalloc, because if the | 2605 | * We need to stop accounting when we kmalloc, because if the |
| 2756 | * corresponding kmalloc cache is not yet created, the first allocation | 2606 | * corresponding kmalloc cache is not yet created, the first allocation |
| 2757 | * in __memcg_schedule_register_cache will recurse. | 2607 | * in __memcg_schedule_kmem_cache_create will recurse. |
| 2758 | * | 2608 | * |
| 2759 | * However, it is better to enclose the whole function. Depending on | 2609 | * However, it is better to enclose the whole function. Depending on |
| 2760 | * the debugging options enabled, INIT_WORK(), for instance, can | 2610 | * the debugging options enabled, INIT_WORK(), for instance, can |
| @@ -2763,24 +2613,10 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
| 2763 | * the safest choice is to do it like this, wrapping the whole function. | 2613 | * the safest choice is to do it like this, wrapping the whole function. |
| 2764 | */ | 2614 | */ |
| 2765 | current->memcg_kmem_skip_account = 1; | 2615 | current->memcg_kmem_skip_account = 1; |
| 2766 | __memcg_schedule_register_cache(memcg, cachep); | 2616 | __memcg_schedule_kmem_cache_create(memcg, cachep); |
| 2767 | current->memcg_kmem_skip_account = 0; | 2617 | current->memcg_kmem_skip_account = 0; |
| 2768 | } | 2618 | } |
| 2769 | 2619 | ||
| 2770 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) | ||
| 2771 | { | ||
| 2772 | unsigned int nr_pages = 1 << order; | ||
| 2773 | |||
| 2774 | return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); | ||
| 2775 | } | ||
| 2776 | |||
| 2777 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | ||
| 2778 | { | ||
| 2779 | unsigned int nr_pages = 1 << order; | ||
| 2780 | |||
| 2781 | memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); | ||
| 2782 | } | ||
| 2783 | |||
| 2784 | /* | 2620 | /* |
| 2785 | * Return the kmem_cache we're supposed to use for a slab allocation. | 2621 | * Return the kmem_cache we're supposed to use for a slab allocation. |
| 2786 | * We try to use the current memcg's version of the cache. | 2622 | * We try to use the current memcg's version of the cache. |
| @@ -2798,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) | |||
| 2798 | { | 2634 | { |
| 2799 | struct mem_cgroup *memcg; | 2635 | struct mem_cgroup *memcg; |
| 2800 | struct kmem_cache *memcg_cachep; | 2636 | struct kmem_cache *memcg_cachep; |
| 2637 | int kmemcg_id; | ||
| 2801 | 2638 | ||
| 2802 | VM_BUG_ON(!cachep->memcg_params); | 2639 | VM_BUG_ON(!is_root_cache(cachep)); |
| 2803 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); | ||
| 2804 | 2640 | ||
| 2805 | if (current->memcg_kmem_skip_account) | 2641 | if (current->memcg_kmem_skip_account) |
| 2806 | return cachep; | 2642 | return cachep; |
| 2807 | 2643 | ||
| 2808 | memcg = get_mem_cgroup_from_mm(current->mm); | 2644 | memcg = get_mem_cgroup_from_mm(current->mm); |
| 2809 | if (!memcg_kmem_is_active(memcg)) | 2645 | kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); |
| 2646 | if (kmemcg_id < 0) | ||
| 2810 | goto out; | 2647 | goto out; |
| 2811 | 2648 | ||
| 2812 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); | 2649 | memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); |
| 2813 | if (likely(memcg_cachep)) | 2650 | if (likely(memcg_cachep)) |
| 2814 | return memcg_cachep; | 2651 | return memcg_cachep; |
| 2815 | 2652 | ||
| @@ -2825,7 +2662,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) | |||
| 2825 | * could happen with the slab_mutex held. So it's better to | 2662 | * could happen with the slab_mutex held. So it's better to |
| 2826 | * defer everything. | 2663 | * defer everything. |
| 2827 | */ | 2664 | */ |
| 2828 | memcg_schedule_register_cache(memcg, cachep); | 2665 | memcg_schedule_kmem_cache_create(memcg, cachep); |
| 2829 | out: | 2666 | out: |
| 2830 | css_put(&memcg->css); | 2667 | css_put(&memcg->css); |
| 2831 | return cachep; | 2668 | return cachep; |
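
The lookup above reduces to: take the current task's memcg, read its kmemcg_id, and index the root cache's per-memcg array with it. A negative id means kmem accounting was never activated, and an empty slot means the per-memcg clone has not been created yet, so the allocation falls back to the root cache while creation is scheduled asynchronously. A toy userspace model of that control flow (invented names, not kernel code):

    #include <stdio.h>
    #include <stddef.h>

    #define NR_MEMCG_IDS 4

    struct toy_cache {
        const char *name;
        struct toy_cache *per_memcg[NR_MEMCG_IDS]; /* clones, indexed by kmemcg_id */
    };

    struct toy_memcg {
        int kmemcg_id;                  /* < 0: kmem accounting inactive */
    };

    /* Mirrors the shape of the lookup above: fall back to the root cache
     * when accounting is off or the clone does not exist yet. */
    static struct toy_cache *pick_cache(struct toy_cache *root, struct toy_memcg *memcg)
    {
        int id = memcg->kmemcg_id;

        if (id < 0)
            return root;                /* kmem accounting not activated */
        if (root->per_memcg[id])
            return root->per_memcg[id];
        /* the real code would schedule asynchronous clone creation here */
        return root;
    }

    int main(void)
    {
        struct toy_cache clone = { .name = "dentry(memcg:2)" };
        struct toy_cache root = { .name = "dentry" };
        struct toy_memcg off = { .kmemcg_id = -1 }, on = { .kmemcg_id = 2 };

        root.per_memcg[2] = &clone;
        printf("%s\n", pick_cache(&root, &off)->name);  /* dentry */
        printf("%s\n", pick_cache(&root, &on)->name);   /* dentry(memcg:2) */
        return 0;
    }
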
| @@ -2834,7 +2671,7 @@ out: | |||
| 2834 | void __memcg_kmem_put_cache(struct kmem_cache *cachep) | 2671 | void __memcg_kmem_put_cache(struct kmem_cache *cachep) |
| 2835 | { | 2672 | { |
| 2836 | if (!is_root_cache(cachep)) | 2673 | if (!is_root_cache(cachep)) |
| 2837 | css_put(&cachep->memcg_params->memcg->css); | 2674 | css_put(&cachep->memcg_params.memcg->css); |
| 2838 | } | 2675 | } |
| 2839 | 2676 | ||
| 2840 | /* | 2677 | /* |
| @@ -2899,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) | |||
| 2899 | memcg_uncharge_kmem(memcg, 1 << order); | 2736 | memcg_uncharge_kmem(memcg, 1 << order); |
| 2900 | page->mem_cgroup = NULL; | 2737 | page->mem_cgroup = NULL; |
| 2901 | } | 2738 | } |
| 2739 | |||
| 2740 | struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) | ||
| 2741 | { | ||
| 2742 | struct mem_cgroup *memcg = NULL; | ||
| 2743 | struct kmem_cache *cachep; | ||
| 2744 | struct page *page; | ||
| 2745 | |||
| 2746 | page = virt_to_head_page(ptr); | ||
| 2747 | if (PageSlab(page)) { | ||
| 2748 | cachep = page->slab_cache; | ||
| 2749 | if (!is_root_cache(cachep)) | ||
| 2750 | memcg = cachep->memcg_params.memcg; | ||
| 2751 | } else | ||
| 2752 | /* page allocated by alloc_kmem_pages */ | ||
| 2753 | memcg = page->mem_cgroup; | ||
| 2754 | |||
| 2755 | return memcg; | ||
| 2756 | } | ||
| 2902 | #endif /* CONFIG_MEMCG_KMEM */ | 2757 | #endif /* CONFIG_MEMCG_KMEM */ |
| 2903 | 2758 | ||
| 2904 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2759 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| @@ -3433,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, | |||
| 3433 | int err = 0; | 3288 | int err = 0; |
| 3434 | int memcg_id; | 3289 | int memcg_id; |
| 3435 | 3290 | ||
| 3436 | if (memcg_kmem_is_active(memcg)) | 3291 | BUG_ON(memcg->kmemcg_id >= 0); |
| 3437 | return 0; | 3292 | BUG_ON(memcg->kmem_acct_activated); |
| 3293 | BUG_ON(memcg->kmem_acct_active); | ||
| 3438 | 3294 | ||
| 3439 | /* | 3295 | /* |
| 3440 | * For simplicity, we won't allow this to be disabled. It also can't | 3296 | * For simplicity, we won't allow this to be disabled. It also can't |
| @@ -3477,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, | |||
| 3477 | * patched. | 3333 | * patched. |
| 3478 | */ | 3334 | */ |
| 3479 | memcg->kmemcg_id = memcg_id; | 3335 | memcg->kmemcg_id = memcg_id; |
| 3336 | memcg->kmem_acct_activated = true; | ||
| 3337 | memcg->kmem_acct_active = true; | ||
| 3480 | out: | 3338 | out: |
| 3481 | return err; | 3339 | return err; |
| 3482 | } | 3340 | } |
| @@ -3533,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, | |||
| 3533 | int ret; | 3391 | int ret; |
| 3534 | 3392 | ||
| 3535 | buf = strstrip(buf); | 3393 | buf = strstrip(buf); |
| 3536 | ret = page_counter_memparse(buf, &nr_pages); | 3394 | ret = page_counter_memparse(buf, "-1", &nr_pages); |
| 3537 | if (ret) | 3395 | if (ret) |
| 3538 | return ret; | 3396 | return ret; |
| 3539 | 3397 | ||
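
page_counter_memparse() now takes the keyword that stands for "unlimited" as its second argument: the legacy v1 files shown here keep accepting "-1", while the unified-hierarchy files added further down accept "max"; anything else is parsed as a byte value with the usual K/M/G suffixes. A simplified, runnable sketch of that convention (illustration only, not the kernel helper itself):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <limits.h>

    #define PAGE_SIZE   4096UL
    #define COUNTER_MAX ULONG_MAX      /* stands in for PAGE_COUNTER_MAX */

    /* Parse "<number>[KMG]" or the given "unlimited" keyword into pages. */
    static int parse_limit(const char *buf, const char *max_word, unsigned long *pages)
    {
        char *end;
        unsigned long long bytes;

        if (strcmp(buf, max_word) == 0) {
            *pages = COUNTER_MAX;
            return 0;
        }
        bytes = strtoull(buf, &end, 10);
        switch (*end) {
        case 'G': bytes <<= 10;  /* fall through */
        case 'M': bytes <<= 10;  /* fall through */
        case 'K': bytes <<= 10;  end++; break;
        }
        if (*end != '\0')
            return -1;
        *pages = bytes / PAGE_SIZE;
        return 0;
    }

    int main(void)
    {
        unsigned long pages;

        if (parse_limit("512M", "max", &pages) == 0)
            printf("512M -> %lu pages\n", pages);
        if (parse_limit("max", "max", &pages) == 0)
            printf("max  -> unlimited (%lu)\n", pages);
        if (parse_limit("-1", "-1", &pages) == 0)
            printf("-1   -> unlimited (%lu)\n", pages);
        return 0;
    }
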
| @@ -3609,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, | |||
| 3609 | { | 3467 | { |
| 3610 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 3468 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| 3611 | 3469 | ||
| 3612 | if (val >= (1 << NR_MOVE_TYPE)) | 3470 | if (val & ~MOVE_MASK) |
| 3613 | return -EINVAL; | 3471 | return -EINVAL; |
| 3614 | 3472 | ||
| 3615 | /* | 3473 | /* |
| @@ -3687,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v) | |||
| 3687 | struct mem_cgroup *mi; | 3545 | struct mem_cgroup *mi; |
| 3688 | unsigned int i; | 3546 | unsigned int i; |
| 3689 | 3547 | ||
| 3548 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != | ||
| 3549 | MEM_CGROUP_STAT_NSTATS); | ||
| 3550 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != | ||
| 3551 | MEM_CGROUP_EVENTS_NSTATS); | ||
| 3690 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 3552 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
| 3691 | 3553 | ||
| 3692 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 3554 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
| @@ -3901,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |||
| 3901 | unsigned long usage; | 3763 | unsigned long usage; |
| 3902 | int i, size, ret; | 3764 | int i, size, ret; |
| 3903 | 3765 | ||
| 3904 | ret = page_counter_memparse(args, &threshold); | 3766 | ret = page_counter_memparse(args, "-1", &threshold); |
| 3905 | if (ret) | 3767 | if (ret) |
| 3906 | return ret; | 3768 | return ret; |
| 3907 | 3769 | ||
| @@ -4152,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
| 4152 | return mem_cgroup_sockets_init(memcg, ss); | 4014 | return mem_cgroup_sockets_init(memcg, ss); |
| 4153 | } | 4015 | } |
| 4154 | 4016 | ||
| 4017 | static void memcg_deactivate_kmem(struct mem_cgroup *memcg) | ||
| 4018 | { | ||
| 4019 | struct cgroup_subsys_state *css; | ||
| 4020 | struct mem_cgroup *parent, *child; | ||
| 4021 | int kmemcg_id; | ||
| 4022 | |||
| 4023 | if (!memcg->kmem_acct_active) | ||
| 4024 | return; | ||
| 4025 | |||
| 4026 | /* | ||
| 4027 | * Clear the 'active' flag before clearing memcg_caches arrays entries. | ||
| 4028 | * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it | ||
| 4029 | * guarantees no cache will be created for this cgroup after we are | ||
| 4030 | * done (see memcg_create_kmem_cache()). | ||
| 4031 | */ | ||
| 4032 | memcg->kmem_acct_active = false; | ||
| 4033 | |||
| 4034 | memcg_deactivate_kmem_caches(memcg); | ||
| 4035 | |||
| 4036 | kmemcg_id = memcg->kmemcg_id; | ||
| 4037 | BUG_ON(kmemcg_id < 0); | ||
| 4038 | |||
| 4039 | parent = parent_mem_cgroup(memcg); | ||
| 4040 | if (!parent) | ||
| 4041 | parent = root_mem_cgroup; | ||
| 4042 | |||
| 4043 | /* | ||
| 4044 | * Change kmemcg_id of this cgroup and all its descendants to the | ||
| 4045 | * parent's id, and then move all entries from this cgroup's list_lrus | ||
| 4046 | * to ones of the parent. After we have finished, all list_lrus | ||
| 4047 | * corresponding to this cgroup are guaranteed to remain empty. The | ||
| 4048 | * ordering is imposed by list_lru_node->lock taken by | ||
| 4049 | * memcg_drain_all_list_lrus(). | ||
| 4050 | */ | ||
| 4051 | css_for_each_descendant_pre(css, &memcg->css) { | ||
| 4052 | child = mem_cgroup_from_css(css); | ||
| 4053 | BUG_ON(child->kmemcg_id != kmemcg_id); | ||
| 4054 | child->kmemcg_id = parent->kmemcg_id; | ||
| 4055 | if (!memcg->use_hierarchy) | ||
| 4056 | break; | ||
| 4057 | } | ||
| 4058 | memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); | ||
| 4059 | |||
| 4060 | memcg_free_cache_id(kmemcg_id); | ||
| 4061 | } | ||
| 4062 | |||
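
memcg_deactivate_kmem() above re-points the kmemcg_id of the group being taken offline, and of all its descendants, at the parent's id before the old id is released, so that objects sitting on per-memcg list_lru lists can be drained toward the parent. A toy userspace model of just that reparenting step (toy structs, not kernel types):

    #include <stdio.h>

    struct toy_memcg {
        const char *name;
        int kmemcg_id;
        struct toy_memcg *children[4];      /* NULL-terminated */
    };

    /* Mirror of the css_for_each_descendant_pre() loop above: give the
     * group and every descendant the parent's id. */
    static void reparent_ids(struct toy_memcg *memcg, int parent_id)
    {
        memcg->kmemcg_id = parent_id;
        for (int i = 0; memcg->children[i]; i++)
            reparent_ids(memcg->children[i], parent_id);
    }

    int main(void)
    {
        struct toy_memcg b    = { "a/b",  1, { NULL } };       /* shares a's id */
        struct toy_memcg a    = { "a",    1, { &b, NULL } };
        struct toy_memcg root = { "root", 0, { &a, NULL } };

        reparent_ids(&a, root.kmemcg_id);   /* "a" goes offline */
        printf("a -> %d, a/b -> %d\n", a.kmemcg_id, b.kmemcg_id);   /* 0, 0 */
        /* id 1 is now free for reuse; the list_lrus drain into the parent's */
        return 0;
    }
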
| 4155 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4063 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
| 4156 | { | 4064 | { |
| 4157 | memcg_unregister_all_caches(memcg); | 4065 | if (memcg->kmem_acct_activated) { |
| 4066 | memcg_destroy_kmem_caches(memcg); | ||
| 4067 | static_key_slow_dec(&memcg_kmem_enabled_key); | ||
| 4068 | WARN_ON(page_counter_read(&memcg->kmem)); | ||
| 4069 | } | ||
| 4158 | mem_cgroup_sockets_destroy(memcg); | 4070 | mem_cgroup_sockets_destroy(memcg); |
| 4159 | } | 4071 | } |
| 4160 | #else | 4072 | #else |
| @@ -4163,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
| 4163 | return 0; | 4075 | return 0; |
| 4164 | } | 4076 | } |
| 4165 | 4077 | ||
| 4078 | static void memcg_deactivate_kmem(struct mem_cgroup *memcg) | ||
| 4079 | { | ||
| 4080 | } | ||
| 4081 | |||
| 4166 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4082 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
| 4167 | { | 4083 | { |
| 4168 | } | 4084 | } |
| @@ -4391,7 +4307,7 @@ out_kfree: | |||
| 4391 | return ret; | 4307 | return ret; |
| 4392 | } | 4308 | } |
| 4393 | 4309 | ||
| 4394 | static struct cftype mem_cgroup_files[] = { | 4310 | static struct cftype mem_cgroup_legacy_files[] = { |
| 4395 | { | 4311 | { |
| 4396 | .name = "usage_in_bytes", | 4312 | .name = "usage_in_bytes", |
| 4397 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 4313 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
| @@ -4502,34 +4418,6 @@ static struct cftype mem_cgroup_files[] = { | |||
| 4502 | { }, /* terminate */ | 4418 | { }, /* terminate */ |
| 4503 | }; | 4419 | }; |
| 4504 | 4420 | ||
| 4505 | #ifdef CONFIG_MEMCG_SWAP | ||
| 4506 | static struct cftype memsw_cgroup_files[] = { | ||
| 4507 | { | ||
| 4508 | .name = "memsw.usage_in_bytes", | ||
| 4509 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
| 4510 | .read_u64 = mem_cgroup_read_u64, | ||
| 4511 | }, | ||
| 4512 | { | ||
| 4513 | .name = "memsw.max_usage_in_bytes", | ||
| 4514 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
| 4515 | .write = mem_cgroup_reset, | ||
| 4516 | .read_u64 = mem_cgroup_read_u64, | ||
| 4517 | }, | ||
| 4518 | { | ||
| 4519 | .name = "memsw.limit_in_bytes", | ||
| 4520 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
| 4521 | .write = mem_cgroup_write, | ||
| 4522 | .read_u64 = mem_cgroup_read_u64, | ||
| 4523 | }, | ||
| 4524 | { | ||
| 4525 | .name = "memsw.failcnt", | ||
| 4526 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
| 4527 | .write = mem_cgroup_reset, | ||
| 4528 | .read_u64 = mem_cgroup_read_u64, | ||
| 4529 | }, | ||
| 4530 | { }, /* terminate */ | ||
| 4531 | }; | ||
| 4532 | #endif | ||
| 4533 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 4421 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
| 4534 | { | 4422 | { |
| 4535 | struct mem_cgroup_per_node *pn; | 4423 | struct mem_cgroup_per_node *pn; |
| @@ -4609,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
| 4609 | free_mem_cgroup_per_zone_info(memcg, node); | 4497 | free_mem_cgroup_per_zone_info(memcg, node); |
| 4610 | 4498 | ||
| 4611 | free_percpu(memcg->stat); | 4499 | free_percpu(memcg->stat); |
| 4612 | |||
| 4613 | disarm_static_keys(memcg); | ||
| 4614 | kfree(memcg); | 4500 | kfree(memcg); |
| 4615 | } | 4501 | } |
| 4616 | 4502 | ||
| @@ -4625,29 +4511,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
| 4625 | } | 4511 | } |
| 4626 | EXPORT_SYMBOL(parent_mem_cgroup); | 4512 | EXPORT_SYMBOL(parent_mem_cgroup); |
| 4627 | 4513 | ||
| 4628 | static void __init mem_cgroup_soft_limit_tree_init(void) | ||
| 4629 | { | ||
| 4630 | struct mem_cgroup_tree_per_node *rtpn; | ||
| 4631 | struct mem_cgroup_tree_per_zone *rtpz; | ||
| 4632 | int tmp, node, zone; | ||
| 4633 | |||
| 4634 | for_each_node(node) { | ||
| 4635 | tmp = node; | ||
| 4636 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
| 4637 | tmp = -1; | ||
| 4638 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
| 4639 | BUG_ON(!rtpn); | ||
| 4640 | |||
| 4641 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
| 4642 | |||
| 4643 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
| 4644 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
| 4645 | rtpz->rb_root = RB_ROOT; | ||
| 4646 | spin_lock_init(&rtpz->lock); | ||
| 4647 | } | ||
| 4648 | } | ||
| 4649 | } | ||
| 4650 | |||
| 4651 | static struct cgroup_subsys_state * __ref | 4514 | static struct cgroup_subsys_state * __ref |
| 4652 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 4515 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
| 4653 | { | 4516 | { |
| @@ -4667,6 +4530,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 4667 | if (parent_css == NULL) { | 4530 | if (parent_css == NULL) { |
| 4668 | root_mem_cgroup = memcg; | 4531 | root_mem_cgroup = memcg; |
| 4669 | page_counter_init(&memcg->memory, NULL); | 4532 | page_counter_init(&memcg->memory, NULL); |
| 4533 | memcg->high = PAGE_COUNTER_MAX; | ||
| 4670 | memcg->soft_limit = PAGE_COUNTER_MAX; | 4534 | memcg->soft_limit = PAGE_COUNTER_MAX; |
| 4671 | page_counter_init(&memcg->memsw, NULL); | 4535 | page_counter_init(&memcg->memsw, NULL); |
| 4672 | page_counter_init(&memcg->kmem, NULL); | 4536 | page_counter_init(&memcg->kmem, NULL); |
| @@ -4682,7 +4546,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 4682 | spin_lock_init(&memcg->event_list_lock); | 4546 | spin_lock_init(&memcg->event_list_lock); |
| 4683 | #ifdef CONFIG_MEMCG_KMEM | 4547 | #ifdef CONFIG_MEMCG_KMEM |
| 4684 | memcg->kmemcg_id = -1; | 4548 | memcg->kmemcg_id = -1; |
| 4685 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
| 4686 | #endif | 4549 | #endif |
| 4687 | 4550 | ||
| 4688 | return &memcg->css; | 4551 | return &memcg->css; |
| @@ -4713,6 +4576,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
| 4713 | 4576 | ||
| 4714 | if (parent->use_hierarchy) { | 4577 | if (parent->use_hierarchy) { |
| 4715 | page_counter_init(&memcg->memory, &parent->memory); | 4578 | page_counter_init(&memcg->memory, &parent->memory); |
| 4579 | memcg->high = PAGE_COUNTER_MAX; | ||
| 4716 | memcg->soft_limit = PAGE_COUNTER_MAX; | 4580 | memcg->soft_limit = PAGE_COUNTER_MAX; |
| 4717 | page_counter_init(&memcg->memsw, &parent->memsw); | 4581 | page_counter_init(&memcg->memsw, &parent->memsw); |
| 4718 | page_counter_init(&memcg->kmem, &parent->kmem); | 4582 | page_counter_init(&memcg->kmem, &parent->kmem); |
| @@ -4723,6 +4587,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
| 4723 | */ | 4587 | */ |
| 4724 | } else { | 4588 | } else { |
| 4725 | page_counter_init(&memcg->memory, NULL); | 4589 | page_counter_init(&memcg->memory, NULL); |
| 4590 | memcg->high = PAGE_COUNTER_MAX; | ||
| 4726 | memcg->soft_limit = PAGE_COUNTER_MAX; | 4591 | memcg->soft_limit = PAGE_COUNTER_MAX; |
| 4727 | page_counter_init(&memcg->memsw, NULL); | 4592 | page_counter_init(&memcg->memsw, NULL); |
| 4728 | page_counter_init(&memcg->kmem, NULL); | 4593 | page_counter_init(&memcg->kmem, NULL); |
| @@ -4768,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
| 4768 | spin_unlock(&memcg->event_list_lock); | 4633 | spin_unlock(&memcg->event_list_lock); |
| 4769 | 4634 | ||
| 4770 | vmpressure_cleanup(&memcg->vmpressure); | 4635 | vmpressure_cleanup(&memcg->vmpressure); |
| 4636 | |||
| 4637 | memcg_deactivate_kmem(memcg); | ||
| 4771 | } | 4638 | } |
| 4772 | 4639 | ||
| 4773 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) | 4640 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) |
| @@ -4798,6 +4665,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) | |||
| 4798 | mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); | 4665 | mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); |
| 4799 | mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); | 4666 | mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); |
| 4800 | memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); | 4667 | memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); |
| 4668 | memcg->low = 0; | ||
| 4669 | memcg->high = PAGE_COUNTER_MAX; | ||
| 4801 | memcg->soft_limit = PAGE_COUNTER_MAX; | 4670 | memcg->soft_limit = PAGE_COUNTER_MAX; |
| 4802 | } | 4671 | } |
| 4803 | 4672 | ||
| @@ -4874,12 +4743,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | |||
| 4874 | if (!page || !page_mapped(page)) | 4743 | if (!page || !page_mapped(page)) |
| 4875 | return NULL; | 4744 | return NULL; |
| 4876 | if (PageAnon(page)) { | 4745 | if (PageAnon(page)) { |
| 4877 | /* we don't move shared anon */ | 4746 | if (!(mc.flags & MOVE_ANON)) |
| 4878 | if (!move_anon()) | ||
| 4879 | return NULL; | 4747 | return NULL; |
| 4880 | } else if (!move_file()) | 4748 | } else { |
| 4881 | /* we ignore mapcount for file pages */ | 4749 | if (!(mc.flags & MOVE_FILE)) |
| 4882 | return NULL; | 4750 | return NULL; |
| 4751 | } | ||
| 4883 | if (!get_page_unless_zero(page)) | 4752 | if (!get_page_unless_zero(page)) |
| 4884 | return NULL; | 4753 | return NULL; |
| 4885 | 4754 | ||
| @@ -4893,7 +4762,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | |||
| 4893 | struct page *page = NULL; | 4762 | struct page *page = NULL; |
| 4894 | swp_entry_t ent = pte_to_swp_entry(ptent); | 4763 | swp_entry_t ent = pte_to_swp_entry(ptent); |
| 4895 | 4764 | ||
| 4896 | if (!move_anon() || non_swap_entry(ent)) | 4765 | if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) |
| 4897 | return NULL; | 4766 | return NULL; |
| 4898 | /* | 4767 | /* |
| 4899 | * Because lookup_swap_cache() updates some statistics counter, | 4768 | * Because lookup_swap_cache() updates some statistics counter, |
| @@ -4922,14 +4791,11 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
| 4922 | 4791 | ||
| 4923 | if (!vma->vm_file) /* anonymous vma */ | 4792 | if (!vma->vm_file) /* anonymous vma */ |
| 4924 | return NULL; | 4793 | return NULL; |
| 4925 | if (!move_file()) | 4794 | if (!(mc.flags & MOVE_FILE)) |
| 4926 | return NULL; | 4795 | return NULL; |
| 4927 | 4796 | ||
| 4928 | mapping = vma->vm_file->f_mapping; | 4797 | mapping = vma->vm_file->f_mapping; |
| 4929 | if (pte_none(ptent)) | 4798 | pgoff = linear_page_index(vma, addr); |
| 4930 | pgoff = linear_page_index(vma, addr); | ||
| 4931 | else /* pte_file(ptent) is true */ | ||
| 4932 | pgoff = pte_to_pgoff(ptent); | ||
| 4933 | 4799 | ||
| 4934 | /* page is moved even if it's not RSS of this task(page-faulted). */ | 4800 | /* page is moved even if it's not RSS of this task(page-faulted). */ |
| 4935 | #ifdef CONFIG_SWAP | 4801 | #ifdef CONFIG_SWAP |
| @@ -4961,7 +4827,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |||
| 4961 | page = mc_handle_present_pte(vma, addr, ptent); | 4827 | page = mc_handle_present_pte(vma, addr, ptent); |
| 4962 | else if (is_swap_pte(ptent)) | 4828 | else if (is_swap_pte(ptent)) |
| 4963 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); | 4829 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); |
| 4964 | else if (pte_none(ptent) || pte_file(ptent)) | 4830 | else if (pte_none(ptent)) |
| 4965 | page = mc_handle_file_pte(vma, addr, ptent, &ent); | 4831 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
| 4966 | 4832 | ||
| 4967 | if (!page && !ent.val) | 4833 | if (!page && !ent.val) |
| @@ -5004,7 +4870,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | |||
| 5004 | 4870 | ||
| 5005 | page = pmd_page(pmd); | 4871 | page = pmd_page(pmd); |
| 5006 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); | 4872 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); |
| 5007 | if (!move_anon()) | 4873 | if (!(mc.flags & MOVE_ANON)) |
| 5008 | return ret; | 4874 | return ret; |
| 5009 | if (page->mem_cgroup == mc.from) { | 4875 | if (page->mem_cgroup == mc.from) { |
| 5010 | ret = MC_TARGET_PAGE; | 4876 | ret = MC_TARGET_PAGE; |
| @@ -5027,7 +4893,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
| 5027 | unsigned long addr, unsigned long end, | 4893 | unsigned long addr, unsigned long end, |
| 5028 | struct mm_walk *walk) | 4894 | struct mm_walk *walk) |
| 5029 | { | 4895 | { |
| 5030 | struct vm_area_struct *vma = walk->private; | 4896 | struct vm_area_struct *vma = walk->vma; |
| 5031 | pte_t *pte; | 4897 | pte_t *pte; |
| 5032 | spinlock_t *ptl; | 4898 | spinlock_t *ptl; |
| 5033 | 4899 | ||
| @@ -5053,20 +4919,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
| 5053 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | 4919 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) |
| 5054 | { | 4920 | { |
| 5055 | unsigned long precharge; | 4921 | unsigned long precharge; |
| 5056 | struct vm_area_struct *vma; | ||
| 5057 | 4922 | ||
| 4923 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
| 4924 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
| 4925 | .mm = mm, | ||
| 4926 | }; | ||
| 5058 | down_read(&mm->mmap_sem); | 4927 | down_read(&mm->mmap_sem); |
| 5059 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4928 | walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); |
| 5060 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
| 5061 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
| 5062 | .mm = mm, | ||
| 5063 | .private = vma, | ||
| 5064 | }; | ||
| 5065 | if (is_vm_hugetlb_page(vma)) | ||
| 5066 | continue; | ||
| 5067 | walk_page_range(vma->vm_start, vma->vm_end, | ||
| 5068 | &mem_cgroup_count_precharge_walk); | ||
| 5069 | } | ||
| 5070 | up_read(&mm->mmap_sem); | 4929 | up_read(&mm->mmap_sem); |
| 5071 | 4930 | ||
| 5072 | precharge = mc.precharge; | 4931 | precharge = mc.precharge; |
| @@ -5146,15 +5005,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
| 5146 | struct task_struct *p = cgroup_taskset_first(tset); | 5005 | struct task_struct *p = cgroup_taskset_first(tset); |
| 5147 | int ret = 0; | 5006 | int ret = 0; |
| 5148 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5007 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| 5149 | unsigned long move_charge_at_immigrate; | 5008 | unsigned long move_flags; |
| 5150 | 5009 | ||
| 5151 | /* | 5010 | /* |
| 5152 | * We are now committed to this value whatever it is. Changes in this | 5011 | * We are now committed to this value whatever it is. Changes in this |
| 5153 | * tunable will only affect upcoming migrations, not the current one. | 5012 | * tunable will only affect upcoming migrations, not the current one. |
| 5154 | * So we need to save it, and keep it going. | 5013 | * So we need to save it, and keep it going. |
| 5155 | */ | 5014 | */ |
| 5156 | move_charge_at_immigrate = memcg->move_charge_at_immigrate; | 5015 | move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); |
| 5157 | if (move_charge_at_immigrate) { | 5016 | if (move_flags) { |
| 5158 | struct mm_struct *mm; | 5017 | struct mm_struct *mm; |
| 5159 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 5018 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
| 5160 | 5019 | ||
| @@ -5174,7 +5033,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
| 5174 | spin_lock(&mc.lock); | 5033 | spin_lock(&mc.lock); |
| 5175 | mc.from = from; | 5034 | mc.from = from; |
| 5176 | mc.to = memcg; | 5035 | mc.to = memcg; |
| 5177 | mc.immigrate_flags = move_charge_at_immigrate; | 5036 | mc.flags = move_flags; |
| 5178 | spin_unlock(&mc.lock); | 5037 | spin_unlock(&mc.lock); |
| 5179 | /* We set mc.moving_task later */ | 5038 | /* We set mc.moving_task later */ |
| 5180 | 5039 | ||
| @@ -5199,7 +5058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
| 5199 | struct mm_walk *walk) | 5058 | struct mm_walk *walk) |
| 5200 | { | 5059 | { |
| 5201 | int ret = 0; | 5060 | int ret = 0; |
| 5202 | struct vm_area_struct *vma = walk->private; | 5061 | struct vm_area_struct *vma = walk->vma; |
| 5203 | pte_t *pte; | 5062 | pte_t *pte; |
| 5204 | spinlock_t *ptl; | 5063 | spinlock_t *ptl; |
| 5205 | enum mc_target_type target_type; | 5064 | enum mc_target_type target_type; |
| @@ -5295,7 +5154,10 @@ put: /* get_mctgt_type() gets the page */ | |||
| 5295 | 5154 | ||
| 5296 | static void mem_cgroup_move_charge(struct mm_struct *mm) | 5155 | static void mem_cgroup_move_charge(struct mm_struct *mm) |
| 5297 | { | 5156 | { |
| 5298 | struct vm_area_struct *vma; | 5157 | struct mm_walk mem_cgroup_move_charge_walk = { |
| 5158 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
| 5159 | .mm = mm, | ||
| 5160 | }; | ||
| 5299 | 5161 | ||
| 5300 | lru_add_drain_all(); | 5162 | lru_add_drain_all(); |
| 5301 | /* | 5163 | /* |
| @@ -5318,24 +5180,11 @@ retry: | |||
| 5318 | cond_resched(); | 5180 | cond_resched(); |
| 5319 | goto retry; | 5181 | goto retry; |
| 5320 | } | 5182 | } |
| 5321 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 5183 | /* |
| 5322 | int ret; | 5184 | * When we have consumed all precharges and failed in doing |
| 5323 | struct mm_walk mem_cgroup_move_charge_walk = { | 5185 | * additional charge, the page walk just aborts. |
| 5324 | .pmd_entry = mem_cgroup_move_charge_pte_range, | 5186 | */ |
| 5325 | .mm = mm, | 5187 | walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); |
| 5326 | .private = vma, | ||
| 5327 | }; | ||
| 5328 | if (is_vm_hugetlb_page(vma)) | ||
| 5329 | continue; | ||
| 5330 | ret = walk_page_range(vma->vm_start, vma->vm_end, | ||
| 5331 | &mem_cgroup_move_charge_walk); | ||
| 5332 | if (ret) | ||
| 5333 | /* | ||
| 5334 | * means we have consumed all precharges and failed in | ||
| 5335 | * doing additional charge. Just abandon here. | ||
| 5336 | */ | ||
| 5337 | break; | ||
| 5338 | } | ||
| 5339 | up_read(&mm->mmap_sem); | 5188 | up_read(&mm->mmap_sem); |
| 5340 | atomic_dec(&mc.from->moving_account); | 5189 | atomic_dec(&mc.from->moving_account); |
| 5341 | } | 5190 | } |
| @@ -5386,118 +5235,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) | |||
| 5386 | mem_cgroup_from_css(root_css)->use_hierarchy = true; | 5235 | mem_cgroup_from_css(root_css)->use_hierarchy = true; |
| 5387 | } | 5236 | } |
| 5388 | 5237 | ||
| 5389 | struct cgroup_subsys memory_cgrp_subsys = { | 5238 | static u64 memory_current_read(struct cgroup_subsys_state *css, |
| 5390 | .css_alloc = mem_cgroup_css_alloc, | 5239 | struct cftype *cft) |
| 5391 | .css_online = mem_cgroup_css_online, | 5240 | { |
| 5392 | .css_offline = mem_cgroup_css_offline, | 5241 | return mem_cgroup_usage(mem_cgroup_from_css(css), false); |
| 5393 | .css_free = mem_cgroup_css_free, | 5242 | } |
| 5394 | .css_reset = mem_cgroup_css_reset, | ||
| 5395 | .can_attach = mem_cgroup_can_attach, | ||
| 5396 | .cancel_attach = mem_cgroup_cancel_attach, | ||
| 5397 | .attach = mem_cgroup_move_task, | ||
| 5398 | .bind = mem_cgroup_bind, | ||
| 5399 | .legacy_cftypes = mem_cgroup_files, | ||
| 5400 | .early_init = 0, | ||
| 5401 | }; | ||
| 5402 | 5243 | ||
| 5403 | #ifdef CONFIG_MEMCG_SWAP | 5244 | static int memory_low_show(struct seq_file *m, void *v) |
| 5404 | static int __init enable_swap_account(char *s) | ||
| 5405 | { | 5245 | { |
| 5406 | if (!strcmp(s, "1")) | 5246 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5407 | really_do_swap_account = 1; | 5247 | unsigned long low = ACCESS_ONCE(memcg->low); |
| 5408 | else if (!strcmp(s, "0")) | 5248 | |
| 5409 | really_do_swap_account = 0; | 5249 | if (low == PAGE_COUNTER_MAX) |
| 5410 | return 1; | 5250 | seq_puts(m, "max\n"); |
| 5251 | else | ||
| 5252 | seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); | ||
| 5253 | |||
| 5254 | return 0; | ||
| 5411 | } | 5255 | } |
| 5412 | __setup("swapaccount=", enable_swap_account); | ||
| 5413 | 5256 | ||
| 5414 | static void __init memsw_file_init(void) | 5257 | static ssize_t memory_low_write(struct kernfs_open_file *of, |
| 5258 | char *buf, size_t nbytes, loff_t off) | ||
| 5415 | { | 5259 | { |
| 5416 | WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, | 5260 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); |
| 5417 | memsw_cgroup_files)); | 5261 | unsigned long low; |
| 5262 | int err; | ||
| 5263 | |||
| 5264 | buf = strstrip(buf); | ||
| 5265 | err = page_counter_memparse(buf, "max", &low); | ||
| 5266 | if (err) | ||
| 5267 | return err; | ||
| 5268 | |||
| 5269 | memcg->low = low; | ||
| 5270 | |||
| 5271 | return nbytes; | ||
| 5418 | } | 5272 | } |
| 5419 | 5273 | ||
| 5420 | static void __init enable_swap_cgroup(void) | 5274 | static int memory_high_show(struct seq_file *m, void *v) |
| 5421 | { | 5275 | { |
| 5422 | if (!mem_cgroup_disabled() && really_do_swap_account) { | 5276 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5423 | do_swap_account = 1; | 5277 | unsigned long high = ACCESS_ONCE(memcg->high); |
| 5424 | memsw_file_init(); | 5278 | |
| 5425 | } | 5279 | if (high == PAGE_COUNTER_MAX) |
| 5280 | seq_puts(m, "max\n"); | ||
| 5281 | else | ||
| 5282 | seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); | ||
| 5283 | |||
| 5284 | return 0; | ||
| 5426 | } | 5285 | } |
| 5427 | 5286 | ||
| 5428 | #else | 5287 | static ssize_t memory_high_write(struct kernfs_open_file *of, |
| 5429 | static void __init enable_swap_cgroup(void) | 5288 | char *buf, size_t nbytes, loff_t off) |
| 5430 | { | 5289 | { |
| 5290 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | ||
| 5291 | unsigned long high; | ||
| 5292 | int err; | ||
| 5293 | |||
| 5294 | buf = strstrip(buf); | ||
| 5295 | err = page_counter_memparse(buf, "max", &high); | ||
| 5296 | if (err) | ||
| 5297 | return err; | ||
| 5298 | |||
| 5299 | memcg->high = high; | ||
| 5300 | |||
| 5301 | return nbytes; | ||
| 5431 | } | 5302 | } |
| 5432 | #endif | ||
| 5433 | 5303 | ||
| 5434 | #ifdef CONFIG_MEMCG_SWAP | 5304 | static int memory_max_show(struct seq_file *m, void *v) |
| 5435 | /** | ||
| 5436 | * mem_cgroup_swapout - transfer a memsw charge to swap | ||
| 5437 | * @page: page whose memsw charge to transfer | ||
| 5438 | * @entry: swap entry to move the charge to | ||
| 5439 | * | ||
| 5440 | * Transfer the memsw charge of @page to @entry. | ||
| 5441 | */ | ||
| 5442 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | ||
| 5443 | { | 5305 | { |
| 5444 | struct mem_cgroup *memcg; | 5306 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5445 | unsigned short oldid; | 5307 | unsigned long max = ACCESS_ONCE(memcg->memory.limit); |
| 5446 | 5308 | ||
| 5447 | VM_BUG_ON_PAGE(PageLRU(page), page); | 5309 | if (max == PAGE_COUNTER_MAX) |
| 5448 | VM_BUG_ON_PAGE(page_count(page), page); | 5310 | seq_puts(m, "max\n"); |
| 5311 | else | ||
| 5312 | seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); | ||
| 5449 | 5313 | ||
| 5450 | if (!do_swap_account) | 5314 | return 0; |
| 5451 | return; | 5315 | } |
| 5452 | 5316 | ||
| 5453 | memcg = page->mem_cgroup; | 5317 | static ssize_t memory_max_write(struct kernfs_open_file *of, |
| 5318 | char *buf, size_t nbytes, loff_t off) | ||
| 5319 | { | ||
| 5320 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | ||
| 5321 | unsigned long max; | ||
| 5322 | int err; | ||
| 5454 | 5323 | ||
| 5455 | /* Readahead page, never charged */ | 5324 | buf = strstrip(buf); |
| 5456 | if (!memcg) | 5325 | err = page_counter_memparse(buf, "max", &max); |
| 5457 | return; | 5326 | if (err) |
| 5327 | return err; | ||
| 5458 | 5328 | ||
| 5459 | oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); | 5329 | err = mem_cgroup_resize_limit(memcg, max); |
| 5460 | VM_BUG_ON_PAGE(oldid, page); | 5330 | if (err) |
| 5461 | mem_cgroup_swap_statistics(memcg, true); | 5331 | return err; |
| 5462 | 5332 | ||
| 5463 | page->mem_cgroup = NULL; | 5333 | return nbytes; |
| 5334 | } | ||
| 5464 | 5335 | ||
| 5465 | if (!mem_cgroup_is_root(memcg)) | 5336 | static int memory_events_show(struct seq_file *m, void *v) |
| 5466 | page_counter_uncharge(&memcg->memory, 1); | 5337 | { |
| 5338 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | ||
| 5467 | 5339 | ||
| 5468 | /* XXX: caller holds IRQ-safe mapping->tree_lock */ | 5340 | seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); |
| 5469 | VM_BUG_ON(!irqs_disabled()); | 5341 | seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); |
| 5342 | seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); | ||
| 5343 | seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); | ||
| 5470 | 5344 | ||
| 5471 | mem_cgroup_charge_statistics(memcg, page, -1); | 5345 | return 0; |
| 5472 | memcg_check_events(memcg, page); | ||
| 5473 | } | 5346 | } |
| 5474 | 5347 | ||
| 5348 | static struct cftype memory_files[] = { | ||
| 5349 | { | ||
| 5350 | .name = "current", | ||
| 5351 | .read_u64 = memory_current_read, | ||
| 5352 | }, | ||
| 5353 | { | ||
| 5354 | .name = "low", | ||
| 5355 | .flags = CFTYPE_NOT_ON_ROOT, | ||
| 5356 | .seq_show = memory_low_show, | ||
| 5357 | .write = memory_low_write, | ||
| 5358 | }, | ||
| 5359 | { | ||
| 5360 | .name = "high", | ||
| 5361 | .flags = CFTYPE_NOT_ON_ROOT, | ||
| 5362 | .seq_show = memory_high_show, | ||
| 5363 | .write = memory_high_write, | ||
| 5364 | }, | ||
| 5365 | { | ||
| 5366 | .name = "max", | ||
| 5367 | .flags = CFTYPE_NOT_ON_ROOT, | ||
| 5368 | .seq_show = memory_max_show, | ||
| 5369 | .write = memory_max_write, | ||
| 5370 | }, | ||
| 5371 | { | ||
| 5372 | .name = "events", | ||
| 5373 | .flags = CFTYPE_NOT_ON_ROOT, | ||
| 5374 | .seq_show = memory_events_show, | ||
| 5375 | }, | ||
| 5376 | { } /* terminate */ | ||
| 5377 | }; | ||
| 5378 | |||
| 5379 | struct cgroup_subsys memory_cgrp_subsys = { | ||
| 5380 | .css_alloc = mem_cgroup_css_alloc, | ||
| 5381 | .css_online = mem_cgroup_css_online, | ||
| 5382 | .css_offline = mem_cgroup_css_offline, | ||
| 5383 | .css_free = mem_cgroup_css_free, | ||
| 5384 | .css_reset = mem_cgroup_css_reset, | ||
| 5385 | .can_attach = mem_cgroup_can_attach, | ||
| 5386 | .cancel_attach = mem_cgroup_cancel_attach, | ||
| 5387 | .attach = mem_cgroup_move_task, | ||
| 5388 | .bind = mem_cgroup_bind, | ||
| 5389 | .dfl_cftypes = memory_files, | ||
| 5390 | .legacy_cftypes = mem_cgroup_legacy_files, | ||
| 5391 | .early_init = 0, | ||
| 5392 | }; | ||
| 5393 | |||
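
The memory_files[] array and the new .dfl_cftypes hook make up the unified-hierarchy (cgroup v2) side of the controller: memory.current, memory.low, memory.high, memory.max and memory.events, which take byte values (with K/M/G suffixes) or the literal "max". A small userspace sketch of driving these knobs; the mount point and group name are examples, and a mounted v2 hierarchy plus sufficient privileges are assumed:

    #include <stdio.h>

    /* Example path only; adjust to your cgroup2 mount point and group. */
    static const char *grp = "/sys/fs/cgroup/test";

    static int write_knob(const char *file, const char *val)
    {
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "%s/%s", grp, file);
        f = fopen(path, "w");
        if (!f) {
            perror(path);
            return -1;
        }
        fputs(val, f);
        return fclose(f);
    }

    int main(void)
    {
        char path[256], line[128];
        FILE *f;

        write_knob("memory.high", "64M");   /* start reclaim pressure above 64 MiB */
        write_knob("memory.max", "max");    /* no hard limit */

        snprintf(path, sizeof(path), "%s/memory.events", grp);
        f = fopen(path, "r");
        if (!f) {
            perror(path);
            return 1;
        }
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);            /* "low", "high", "max", "oom" counters */
        fclose(f);
        return 0;
    }
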
| 5475 | /** | 5394 | /** |
| 5476 | * mem_cgroup_uncharge_swap - uncharge a swap entry | 5395 | * mem_cgroup_events - count memory events against a cgroup |
| 5477 | * @entry: swap entry to uncharge | 5396 | * @memcg: the memory cgroup |
| 5397 | * @idx: the event index | ||
| 5398 | * @nr: the number of events to account for | ||
| 5399 | */ | ||
| 5400 | void mem_cgroup_events(struct mem_cgroup *memcg, | ||
| 5401 | enum mem_cgroup_events_index idx, | ||
| 5402 | unsigned int nr) | ||
| 5403 | { | ||
| 5404 | this_cpu_add(memcg->stat->events[idx], nr); | ||
| 5405 | } | ||
| 5406 | |||
| 5407 | /** | ||
| 5408 | * mem_cgroup_low - check if memory consumption is below the normal range | ||
| 5409 | * @root: the highest ancestor to consider | ||
| 5410 | * @memcg: the memory cgroup to check | ||
| 5478 | * | 5411 | * |
| 5479 | * Drop the memsw charge associated with @entry. | 5412 | * Returns %true if memory consumption of @memcg, and that of all |
| 5413 | * configurable ancestors up to @root, is below the normal range. | ||
| 5480 | */ | 5414 | */ |
| 5481 | void mem_cgroup_uncharge_swap(swp_entry_t entry) | 5415 | bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) |
| 5482 | { | 5416 | { |
| 5483 | struct mem_cgroup *memcg; | 5417 | if (mem_cgroup_disabled()) |
| 5484 | unsigned short id; | 5418 | return false; |
| 5485 | 5419 | ||
| 5486 | if (!do_swap_account) | 5420 | /* |
| 5487 | return; | 5421 | * The toplevel group doesn't have a configurable range, so |
| 5422 | * it's never low when looked at directly, and it is not | ||
| 5423 | * considered an ancestor when assessing the hierarchy. | ||
| 5424 | */ | ||
| 5488 | 5425 | ||
| 5489 | id = swap_cgroup_record(entry, 0); | 5426 | if (memcg == root_mem_cgroup) |
| 5490 | rcu_read_lock(); | 5427 | return false; |
| 5491 | memcg = mem_cgroup_lookup(id); | 5428 | |
| 5492 | if (memcg) { | 5429 | if (page_counter_read(&memcg->memory) >= memcg->low) |
| 5493 | if (!mem_cgroup_is_root(memcg)) | 5430 | return false; |
| 5494 | page_counter_uncharge(&memcg->memsw, 1); | 5431 | |
| 5495 | mem_cgroup_swap_statistics(memcg, false); | 5432 | while (memcg != root) { |
| 5496 | css_put(&memcg->css); | 5433 | memcg = parent_mem_cgroup(memcg); |
| 5434 | |||
| 5435 | if (memcg == root_mem_cgroup) | ||
| 5436 | break; | ||
| 5437 | |||
| 5438 | if (page_counter_read(&memcg->memory) >= memcg->low) | ||
| 5439 | return false; | ||
| 5497 | } | 5440 | } |
| 5498 | rcu_read_unlock(); | 5441 | return true; |
| 5499 | } | 5442 | } |
| 5500 | #endif | ||
| 5501 | 5443 | ||
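
mem_cgroup_low() only reports a group as protected when the group itself and every configurable ancestor between it and the reclaim root are under their memory.low; the top-level group has no configurable range and is never considered low. A compact userspace model of that walk (toy structs, illustration only):

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_memcg {
        const char *name;
        unsigned long usage;        /* pages currently charged */
        unsigned long low;          /* configured memory.low, in pages */
        struct toy_memcg *parent;   /* NULL for the top-level group */
    };

    /* True only if @memcg and all ancestors below @root are under their low. */
    static bool toy_mem_low(struct toy_memcg *root, struct toy_memcg *memcg)
    {
        if (!memcg->parent)         /* the top-level group is never "low" */
            return false;
        for (; memcg != root && memcg->parent; memcg = memcg->parent)
            if (memcg->usage >= memcg->low)
                return false;
        return true;
    }

    int main(void)
    {
        struct toy_memcg root = { "root", 1000, 0,   NULL };
        struct toy_memcg a    = { "a",     300, 500, &root };
        struct toy_memcg b    = { "a/b",   100, 200, &a };

        printf("a/b protected: %s\n", toy_mem_low(&root, &b) ? "yes" : "no"); /* yes */
        a.usage = 600;  /* parent exceeds its own low -> child loses protection */
        printf("a/b protected: %s\n", toy_mem_low(&root, &b) ? "yes" : "no"); /* no */
        return 0;
    }
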
| 5502 | /** | 5444 | /** |
| 5503 | * mem_cgroup_try_charge - try charging a page | 5445 | * mem_cgroup_try_charge - try charging a page |
| @@ -5831,10 +5773,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | |||
| 5831 | */ | 5773 | */ |
| 5832 | static int __init mem_cgroup_init(void) | 5774 | static int __init mem_cgroup_init(void) |
| 5833 | { | 5775 | { |
| 5776 | int cpu, node; | ||
| 5777 | |||
| 5834 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 5778 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
| 5835 | enable_swap_cgroup(); | 5779 | |
| 5836 | mem_cgroup_soft_limit_tree_init(); | 5780 | for_each_possible_cpu(cpu) |
| 5837 | memcg_stock_init(); | 5781 | INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, |
| 5782 | drain_local_stock); | ||
| 5783 | |||
| 5784 | for_each_node(node) { | ||
| 5785 | struct mem_cgroup_tree_per_node *rtpn; | ||
| 5786 | int zone; | ||
| 5787 | |||
| 5788 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, | ||
| 5789 | node_online(node) ? node : NUMA_NO_NODE); | ||
| 5790 | |||
| 5791 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
| 5792 | struct mem_cgroup_tree_per_zone *rtpz; | ||
| 5793 | |||
| 5794 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
| 5795 | rtpz->rb_root = RB_ROOT; | ||
| 5796 | spin_lock_init(&rtpz->lock); | ||
| 5797 | } | ||
| 5798 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
| 5799 | } | ||
| 5800 | |||
| 5838 | return 0; | 5801 | return 0; |
| 5839 | } | 5802 | } |
| 5840 | subsys_initcall(mem_cgroup_init); | 5803 | subsys_initcall(mem_cgroup_init); |
| 5804 | |||
| 5805 | #ifdef CONFIG_MEMCG_SWAP | ||
| 5806 | /** | ||
| 5807 | * mem_cgroup_swapout - transfer a memsw charge to swap | ||
| 5808 | * @page: page whose memsw charge to transfer | ||
| 5809 | * @entry: swap entry to move the charge to | ||
| 5810 | * | ||
| 5811 | * Transfer the memsw charge of @page to @entry. | ||
| 5812 | */ | ||
| 5813 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | ||
| 5814 | { | ||
| 5815 | struct mem_cgroup *memcg; | ||
| 5816 | unsigned short oldid; | ||
| 5817 | |||
| 5818 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
| 5819 | VM_BUG_ON_PAGE(page_count(page), page); | ||
| 5820 | |||
| 5821 | if (!do_swap_account) | ||
| 5822 | return; | ||
| 5823 | |||
| 5824 | memcg = page->mem_cgroup; | ||
| 5825 | |||
| 5826 | /* Readahead page, never charged */ | ||
| 5827 | if (!memcg) | ||
| 5828 | return; | ||
| 5829 | |||
| 5830 | oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); | ||
| 5831 | VM_BUG_ON_PAGE(oldid, page); | ||
| 5832 | mem_cgroup_swap_statistics(memcg, true); | ||
| 5833 | |||
| 5834 | page->mem_cgroup = NULL; | ||
| 5835 | |||
| 5836 | if (!mem_cgroup_is_root(memcg)) | ||
| 5837 | page_counter_uncharge(&memcg->memory, 1); | ||
| 5838 | |||
| 5839 | /* XXX: caller holds IRQ-safe mapping->tree_lock */ | ||
| 5840 | VM_BUG_ON(!irqs_disabled()); | ||
| 5841 | |||
| 5842 | mem_cgroup_charge_statistics(memcg, page, -1); | ||
| 5843 | memcg_check_events(memcg, page); | ||
| 5844 | } | ||
| 5845 | |||
| 5846 | /** | ||
| 5847 | * mem_cgroup_uncharge_swap - uncharge a swap entry | ||
| 5848 | * @entry: swap entry to uncharge | ||
| 5849 | * | ||
| 5850 | * Drop the memsw charge associated with @entry. | ||
| 5851 | */ | ||
| 5852 | void mem_cgroup_uncharge_swap(swp_entry_t entry) | ||
| 5853 | { | ||
| 5854 | struct mem_cgroup *memcg; | ||
| 5855 | unsigned short id; | ||
| 5856 | |||
| 5857 | if (!do_swap_account) | ||
| 5858 | return; | ||
| 5859 | |||
| 5860 | id = swap_cgroup_record(entry, 0); | ||
| 5861 | rcu_read_lock(); | ||
| 5862 | memcg = mem_cgroup_lookup(id); | ||
| 5863 | if (memcg) { | ||
| 5864 | if (!mem_cgroup_is_root(memcg)) | ||
| 5865 | page_counter_uncharge(&memcg->memsw, 1); | ||
| 5866 | mem_cgroup_swap_statistics(memcg, false); | ||
| 5867 | css_put(&memcg->css); | ||
| 5868 | } | ||
| 5869 | rcu_read_unlock(); | ||
| 5870 | } | ||
| 5871 | |||
| 5872 | /* for remembering the boot option */ | ||
| 5873 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | ||
| 5874 | static int really_do_swap_account __initdata = 1; | ||
| 5875 | #else | ||
| 5876 | static int really_do_swap_account __initdata; | ||
| 5877 | #endif | ||
| 5878 | |||
| 5879 | static int __init enable_swap_account(char *s) | ||
| 5880 | { | ||
| 5881 | if (!strcmp(s, "1")) | ||
| 5882 | really_do_swap_account = 1; | ||
| 5883 | else if (!strcmp(s, "0")) | ||
| 5884 | really_do_swap_account = 0; | ||
| 5885 | return 1; | ||
| 5886 | } | ||
| 5887 | __setup("swapaccount=", enable_swap_account); | ||
| 5888 | |||
| 5889 | static struct cftype memsw_cgroup_files[] = { | ||
| 5890 | { | ||
| 5891 | .name = "memsw.usage_in_bytes", | ||
| 5892 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
| 5893 | .read_u64 = mem_cgroup_read_u64, | ||
| 5894 | }, | ||
| 5895 | { | ||
| 5896 | .name = "memsw.max_usage_in_bytes", | ||
| 5897 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
| 5898 | .write = mem_cgroup_reset, | ||
| 5899 | .read_u64 = mem_cgroup_read_u64, | ||
| 5900 | }, | ||
| 5901 | { | ||
| 5902 | .name = "memsw.limit_in_bytes", | ||
| 5903 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
| 5904 | .write = mem_cgroup_write, | ||
| 5905 | .read_u64 = mem_cgroup_read_u64, | ||
| 5906 | }, | ||
| 5907 | { | ||
| 5908 | .name = "memsw.failcnt", | ||
| 5909 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
| 5910 | .write = mem_cgroup_reset, | ||
| 5911 | .read_u64 = mem_cgroup_read_u64, | ||
| 5912 | }, | ||
| 5913 | { }, /* terminate */ | ||
| 5914 | }; | ||
| 5915 | |||
| 5916 | static int __init mem_cgroup_swap_init(void) | ||
| 5917 | { | ||
| 5918 | if (!mem_cgroup_disabled() && really_do_swap_account) { | ||
| 5919 | do_swap_account = 1; | ||
| 5920 | WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, | ||
| 5921 | memsw_cgroup_files)); | ||
| 5922 | } | ||
| 5923 | return 0; | ||
| 5924 | } | ||
| 5925 | subsys_initcall(mem_cgroup_swap_init); | ||
| 5926 | |||
| 5927 | #endif /* CONFIG_MEMCG_SWAP */ | ||
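
The legacy memsw.* files are now registered from mem_cgroup_swap_init() and only appear when swap accounting is active, i.e. CONFIG_MEMCG_SWAP_ENABLED or swapaccount=1 on the kernel command line (swapaccount=0 turns it off). A quick userspace probe for that state; the v1 mount point below is the usual one but is an assumption:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* Typical cgroup v1 memory controller mount point; adjust if different. */
        const char *probe = "/sys/fs/cgroup/memory/memory.memsw.usage_in_bytes";

        if (access(probe, R_OK) == 0)
            puts("memsw accounting is enabled (built-in default or swapaccount=1)");
        else
            puts("memsw accounting is disabled (boot with swapaccount=1 to enable)");
        return 0;
    }
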
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index feb803bf3443..d487f8dc6d39 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -242,15 +242,8 @@ void shake_page(struct page *p, int access) | |||
| 242 | * Only call shrink_node_slabs here (which would also shrink | 242 | * Only call shrink_node_slabs here (which would also shrink |
| 243 | * other caches) if access is not potentially fatal. | 243 | * other caches) if access is not potentially fatal. |
| 244 | */ | 244 | */ |
| 245 | if (access) { | 245 | if (access) |
| 246 | int nr; | 246 | drop_slab_node(page_to_nid(p)); |
| 247 | int nid = page_to_nid(p); | ||
| 248 | do { | ||
| 249 | nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); | ||
| 250 | if (page_count(p) == 1) | ||
| 251 | break; | ||
| 252 | } while (nr > 10); | ||
| 253 | } | ||
| 254 | } | 247 | } |
| 255 | EXPORT_SYMBOL_GPL(shake_page); | 248 | EXPORT_SYMBOL_GPL(shake_page); |
| 256 | 249 | ||
| @@ -1654,8 +1647,6 @@ static int __soft_offline_page(struct page *page, int flags) | |||
| 1654 | * setting PG_hwpoison. | 1647 | * setting PG_hwpoison. |
| 1655 | */ | 1648 | */ |
| 1656 | if (!is_free_buddy_page(page)) | 1649 | if (!is_free_buddy_page(page)) |
| 1657 | lru_add_drain_all(); | ||
| 1658 | if (!is_free_buddy_page(page)) | ||
| 1659 | drain_all_pages(page_zone(page)); | 1650 | drain_all_pages(page_zone(page)); |
| 1660 | SetPageHWPoison(page); | 1651 | SetPageHWPoison(page); |
| 1661 | if (!is_free_buddy_page(page)) | 1652 | if (!is_free_buddy_page(page)) |
diff --git a/mm/memory.c b/mm/memory.c index 2c3536cc6c63..8068893697bb 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
| 428 | pmd = pmd_offset(pud, start); | 428 | pmd = pmd_offset(pud, start); |
| 429 | pud_clear(pud); | 429 | pud_clear(pud); |
| 430 | pmd_free_tlb(tlb, pmd, start); | 430 | pmd_free_tlb(tlb, pmd, start); |
| 431 | mm_dec_nr_pmds(tlb->mm); | ||
| 431 | } | 432 | } |
| 432 | 433 | ||
| 433 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | 434 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, |
| @@ -754,6 +755,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 754 | if (HAVE_PTE_SPECIAL) { | 755 | if (HAVE_PTE_SPECIAL) { |
| 755 | if (likely(!pte_special(pte))) | 756 | if (likely(!pte_special(pte))) |
| 756 | goto check_pfn; | 757 | goto check_pfn; |
| 758 | if (vma->vm_ops && vma->vm_ops->find_special_page) | ||
| 759 | return vma->vm_ops->find_special_page(vma, addr); | ||
| 757 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) | 760 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) |
| 758 | return NULL; | 761 | return NULL; |
| 759 | if (!is_zero_pfn(pfn)) | 762 | if (!is_zero_pfn(pfn)) |
| @@ -811,42 +814,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 811 | 814 | ||
| 812 | /* pte contains position in swap or file, so copy. */ | 815 | /* pte contains position in swap or file, so copy. */ |
| 813 | if (unlikely(!pte_present(pte))) { | 816 | if (unlikely(!pte_present(pte))) { |
| 814 | if (!pte_file(pte)) { | 817 | swp_entry_t entry = pte_to_swp_entry(pte); |
| 815 | swp_entry_t entry = pte_to_swp_entry(pte); | 818 | |
| 816 | 819 | if (likely(!non_swap_entry(entry))) { | |
| 817 | if (likely(!non_swap_entry(entry))) { | 820 | if (swap_duplicate(entry) < 0) |
| 818 | if (swap_duplicate(entry) < 0) | 821 | return entry.val; |
| 819 | return entry.val; | 822 | |
| 820 | 823 | /* make sure dst_mm is on swapoff's mmlist. */ | |
| 821 | /* make sure dst_mm is on swapoff's mmlist. */ | 824 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
| 822 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 825 | spin_lock(&mmlist_lock); |
| 823 | spin_lock(&mmlist_lock); | 826 | if (list_empty(&dst_mm->mmlist)) |
| 824 | if (list_empty(&dst_mm->mmlist)) | 827 | list_add(&dst_mm->mmlist, |
| 825 | list_add(&dst_mm->mmlist, | 828 | &src_mm->mmlist); |
| 826 | &src_mm->mmlist); | 829 | spin_unlock(&mmlist_lock); |
| 827 | spin_unlock(&mmlist_lock); | 830 | } |
| 828 | } | 831 | rss[MM_SWAPENTS]++; |
| 829 | rss[MM_SWAPENTS]++; | 832 | } else if (is_migration_entry(entry)) { |
| 830 | } else if (is_migration_entry(entry)) { | 833 | page = migration_entry_to_page(entry); |
| 831 | page = migration_entry_to_page(entry); | 834 | |
| 832 | 835 | if (PageAnon(page)) | |
| 833 | if (PageAnon(page)) | 836 | rss[MM_ANONPAGES]++; |
| 834 | rss[MM_ANONPAGES]++; | 837 | else |
| 835 | else | 838 | rss[MM_FILEPAGES]++; |
| 836 | rss[MM_FILEPAGES]++; | 839 | |
| 837 | 840 | if (is_write_migration_entry(entry) && | |
| 838 | if (is_write_migration_entry(entry) && | 841 | is_cow_mapping(vm_flags)) { |
| 839 | is_cow_mapping(vm_flags)) { | 842 | /* |
| 840 | /* | 843 | * COW mappings require pages in both |
| 841 | * COW mappings require pages in both | 844 | * parent and child to be set to read. |
| 842 | * parent and child to be set to read. | 845 | */ |
| 843 | */ | 846 | make_migration_entry_read(&entry); |
| 844 | make_migration_entry_read(&entry); | 847 | pte = swp_entry_to_pte(entry); |
| 845 | pte = swp_entry_to_pte(entry); | 848 | if (pte_swp_soft_dirty(*src_pte)) |
| 846 | if (pte_swp_soft_dirty(*src_pte)) | 849 | pte = pte_swp_mksoft_dirty(pte); |
| 847 | pte = pte_swp_mksoft_dirty(pte); | 850 | set_pte_at(src_mm, addr, src_pte, pte); |
| 848 | set_pte_at(src_mm, addr, src_pte, pte); | ||
| 849 | } | ||
| 850 | } | 851 | } |
| 851 | } | 852 | } |
| 852 | goto out_set_pte; | 853 | goto out_set_pte; |
| @@ -1020,11 +1021,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 1020 | * readonly mappings. The tradeoff is that copy_page_range is more | 1021 | * readonly mappings. The tradeoff is that copy_page_range is more |
| 1021 | * efficient than faulting. | 1022 | * efficient than faulting. |
| 1022 | */ | 1023 | */ |
| 1023 | if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | | 1024 | if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && |
| 1024 | VM_PFNMAP | VM_MIXEDMAP))) { | 1025 | !vma->anon_vma) |
| 1025 | if (!vma->anon_vma) | 1026 | return 0; |
| 1026 | return 0; | ||
| 1027 | } | ||
| 1028 | 1027 | ||
| 1029 | if (is_vm_hugetlb_page(vma)) | 1028 | if (is_vm_hugetlb_page(vma)) |
| 1030 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 1029 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
| @@ -1082,6 +1081,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
| 1082 | spinlock_t *ptl; | 1081 | spinlock_t *ptl; |
| 1083 | pte_t *start_pte; | 1082 | pte_t *start_pte; |
| 1084 | pte_t *pte; | 1083 | pte_t *pte; |
| 1084 | swp_entry_t entry; | ||
| 1085 | 1085 | ||
| 1086 | again: | 1086 | again: |
| 1087 | init_rss_vec(rss); | 1087 | init_rss_vec(rss); |
| @@ -1107,28 +1107,12 @@ again: | |||
| 1107 | if (details->check_mapping && | 1107 | if (details->check_mapping && |
| 1108 | details->check_mapping != page->mapping) | 1108 | details->check_mapping != page->mapping) |
| 1109 | continue; | 1109 | continue; |
| 1110 | /* | ||
| 1111 | * Each page->index must be checked when | ||
| 1112 | * invalidating or truncating nonlinear. | ||
| 1113 | */ | ||
| 1114 | if (details->nonlinear_vma && | ||
| 1115 | (page->index < details->first_index || | ||
| 1116 | page->index > details->last_index)) | ||
| 1117 | continue; | ||
| 1118 | } | 1110 | } |
| 1119 | ptent = ptep_get_and_clear_full(mm, addr, pte, | 1111 | ptent = ptep_get_and_clear_full(mm, addr, pte, |
| 1120 | tlb->fullmm); | 1112 | tlb->fullmm); |
| 1121 | tlb_remove_tlb_entry(tlb, pte, addr); | 1113 | tlb_remove_tlb_entry(tlb, pte, addr); |
| 1122 | if (unlikely(!page)) | 1114 | if (unlikely(!page)) |
| 1123 | continue; | 1115 | continue; |
| 1124 | if (unlikely(details) && details->nonlinear_vma | ||
| 1125 | && linear_page_index(details->nonlinear_vma, | ||
| 1126 | addr) != page->index) { | ||
| 1127 | pte_t ptfile = pgoff_to_pte(page->index); | ||
| 1128 | if (pte_soft_dirty(ptent)) | ||
| 1129 | ptfile = pte_file_mksoft_dirty(ptfile); | ||
| 1130 | set_pte_at(mm, addr, pte, ptfile); | ||
| 1131 | } | ||
| 1132 | if (PageAnon(page)) | 1116 | if (PageAnon(page)) |
| 1133 | rss[MM_ANONPAGES]--; | 1117 | rss[MM_ANONPAGES]--; |
| 1134 | else { | 1118 | else { |
| @@ -1151,33 +1135,25 @@ again: | |||
| 1151 | } | 1135 | } |
| 1152 | continue; | 1136 | continue; |
| 1153 | } | 1137 | } |
| 1154 | /* | 1138 | /* If details->check_mapping, we leave swap entries. */ |
| 1155 | * If details->check_mapping, we leave swap entries; | ||
| 1156 | * if details->nonlinear_vma, we leave file entries. | ||
| 1157 | */ | ||
| 1158 | if (unlikely(details)) | 1139 | if (unlikely(details)) |
| 1159 | continue; | 1140 | continue; |
| 1160 | if (pte_file(ptent)) { | ||
| 1161 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) | ||
| 1162 | print_bad_pte(vma, addr, ptent, NULL); | ||
| 1163 | } else { | ||
| 1164 | swp_entry_t entry = pte_to_swp_entry(ptent); | ||
| 1165 | 1141 | ||
| 1166 | if (!non_swap_entry(entry)) | 1142 | entry = pte_to_swp_entry(ptent); |
| 1167 | rss[MM_SWAPENTS]--; | 1143 | if (!non_swap_entry(entry)) |
| 1168 | else if (is_migration_entry(entry)) { | 1144 | rss[MM_SWAPENTS]--; |
| 1169 | struct page *page; | 1145 | else if (is_migration_entry(entry)) { |
| 1146 | struct page *page; | ||
| 1170 | 1147 | ||
| 1171 | page = migration_entry_to_page(entry); | 1148 | page = migration_entry_to_page(entry); |
| 1172 | 1149 | ||
| 1173 | if (PageAnon(page)) | 1150 | if (PageAnon(page)) |
| 1174 | rss[MM_ANONPAGES]--; | 1151 | rss[MM_ANONPAGES]--; |
| 1175 | else | 1152 | else |
| 1176 | rss[MM_FILEPAGES]--; | 1153 | rss[MM_FILEPAGES]--; |
| 1177 | } | ||
| 1178 | if (unlikely(!free_swap_and_cache(entry))) | ||
| 1179 | print_bad_pte(vma, addr, ptent, NULL); | ||
| 1180 | } | 1154 | } |
| 1155 | if (unlikely(!free_swap_and_cache(entry))) | ||
| 1156 | print_bad_pte(vma, addr, ptent, NULL); | ||
| 1181 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 1157 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
| 1182 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1158 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 1183 | 1159 | ||
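
The zap_pte_range() and copy_one_pte() hunks above are part of retiring the nonlinear-mapping machinery (pte_file(), VM_NONLINEAR and the pgoff-in-pte encoding) that used to back remap_file_pages(); file pages at arbitrary offsets are now expressed with ordinary mappings. A runnable userspace sketch of that replacement pattern, placing two file pages out of order inside one reserved range:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long psz = sysconf(_SC_PAGESIZE);
        char tmpl[] = "/tmp/nonlinear-demo-XXXXXX";
        int fd = mkstemp(tmpl);
        char *base;

        if (fd < 0) { perror("mkstemp"); return 1; }
        unlink(tmpl);                       /* keep the fd, drop the name */
        if (ftruncate(fd, 2 * psz) ||
            pwrite(fd, "first", 6, 0) < 0 ||
            pwrite(fd, "second", 7, psz) < 0) { perror("setup"); return 1; }

        /* Reserve an address range, then place the file pages out of order. */
        base = mmap(NULL, 2 * psz, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (base == MAP_FAILED) { perror("mmap"); return 1; }
        if (mmap(base, psz, PROT_READ, MAP_SHARED | MAP_FIXED, fd, psz) == MAP_FAILED ||
            mmap(base + psz, psz, PROT_READ, MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED) {
            perror("mmap fixed"); return 1;
        }

        printf("%s %s\n", base, base + psz);    /* prints "second first" */
        return 0;
    }
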
| @@ -1277,7 +1253,7 @@ static void unmap_page_range(struct mmu_gather *tlb, | |||
| 1277 | pgd_t *pgd; | 1253 | pgd_t *pgd; |
| 1278 | unsigned long next; | 1254 | unsigned long next; |
| 1279 | 1255 | ||
| 1280 | if (details && !details->check_mapping && !details->nonlinear_vma) | 1256 | if (details && !details->check_mapping) |
| 1281 | details = NULL; | 1257 | details = NULL; |
| 1282 | 1258 | ||
| 1283 | BUG_ON(addr >= end); | 1259 | BUG_ON(addr >= end); |
| @@ -1371,7 +1347,7 @@ void unmap_vmas(struct mmu_gather *tlb, | |||
| 1371 | * @vma: vm_area_struct holding the applicable pages | 1347 | * @vma: vm_area_struct holding the applicable pages |
| 1372 | * @start: starting address of pages to zap | 1348 | * @start: starting address of pages to zap |
| 1373 | * @size: number of bytes to zap | 1349 | * @size: number of bytes to zap |
| 1374 | * @details: details of nonlinear truncation or shared cache invalidation | 1350 | * @details: details of shared cache invalidation |
| 1375 | * | 1351 | * |
| 1376 | * Caller must protect the VMA list | 1352 | * Caller must protect the VMA list |
| 1377 | */ | 1353 | */ |
| @@ -1397,7 +1373,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, | |||
| 1397 | * @vma: vm_area_struct holding the applicable pages | 1373 | * @vma: vm_area_struct holding the applicable pages |
| 1398 | * @address: starting address of pages to zap | 1374 | * @address: starting address of pages to zap |
| 1399 | * @size: number of bytes to zap | 1375 | * @size: number of bytes to zap |
| 1400 | * @details: details of nonlinear truncation or shared cache invalidation | 1376 | * @details: details of shared cache invalidation |
| 1401 | * | 1377 | * |
| 1402 | * The range must fit into one VMA. | 1378 | * The range must fit into one VMA. |
| 1403 | */ | 1379 | */ |
| @@ -1922,12 +1898,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
| 1922 | EXPORT_SYMBOL_GPL(apply_to_page_range); | 1898 | EXPORT_SYMBOL_GPL(apply_to_page_range); |
| 1923 | 1899 | ||
| 1924 | /* | 1900 | /* |
| 1925 | * handle_pte_fault chooses page fault handler according to an entry | 1901 | * handle_pte_fault chooses page fault handler according to an entry which was |
| 1926 | * which was read non-atomically. Before making any commitment, on | 1902 | * read non-atomically. Before making any commitment, on those architectures |
| 1927 | * those architectures or configurations (e.g. i386 with PAE) which | 1903 | * or configurations (e.g. i386 with PAE) which might give a mix of unmatched |
| 1928 | * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault | 1904 | * parts, do_swap_page must check under lock before unmapping the pte and |
| 1929 | * must check under lock before unmapping the pte and proceeding | 1905 | * proceeding (but do_wp_page is only called after already making such a check; |
| 1930 | * (but do_wp_page is only called after already making such a check; | ||
| 1931 | * and do_anonymous_page can safely check later on). | 1906 | * and do_anonymous_page can safely check later on). |
| 1932 | */ | 1907 | */ |
| 1933 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | 1908 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, |
| @@ -1990,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
| 1990 | vmf.pgoff = page->index; | 1965 | vmf.pgoff = page->index; |
| 1991 | vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; | 1966 | vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; |
| 1992 | vmf.page = page; | 1967 | vmf.page = page; |
| 1968 | vmf.cow_page = NULL; | ||
| 1993 | 1969 | ||
| 1994 | ret = vma->vm_ops->page_mkwrite(vma, &vmf); | 1970 | ret = vma->vm_ops->page_mkwrite(vma, &vmf); |
| 1995 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) | 1971 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) |
| @@ -2033,7 +2009,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2033 | pte_t entry; | 2009 | pte_t entry; |
| 2034 | int ret = 0; | 2010 | int ret = 0; |
| 2035 | int page_mkwrite = 0; | 2011 | int page_mkwrite = 0; |
| 2036 | struct page *dirty_page = NULL; | 2012 | bool dirty_shared = false; |
| 2037 | unsigned long mmun_start = 0; /* For mmu_notifiers */ | 2013 | unsigned long mmun_start = 0; /* For mmu_notifiers */ |
| 2038 | unsigned long mmun_end = 0; /* For mmu_notifiers */ | 2014 | unsigned long mmun_end = 0; /* For mmu_notifiers */ |
| 2039 | struct mem_cgroup *memcg; | 2015 | struct mem_cgroup *memcg; |
| @@ -2084,6 +2060,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2084 | unlock_page(old_page); | 2060 | unlock_page(old_page); |
| 2085 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2061 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
| 2086 | (VM_WRITE|VM_SHARED))) { | 2062 | (VM_WRITE|VM_SHARED))) { |
| 2063 | page_cache_get(old_page); | ||
| 2087 | /* | 2064 | /* |
| 2088 | * Only catch write-faults on shared writable pages, | 2065 | * Only catch write-faults on shared writable pages, |
| 2089 | * read-only shared pages can get COWed by | 2066 | * read-only shared pages can get COWed by |
| @@ -2091,7 +2068,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2091 | */ | 2068 | */ |
| 2092 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 2069 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
| 2093 | int tmp; | 2070 | int tmp; |
| 2094 | page_cache_get(old_page); | 2071 | |
| 2095 | pte_unmap_unlock(page_table, ptl); | 2072 | pte_unmap_unlock(page_table, ptl); |
| 2096 | tmp = do_page_mkwrite(vma, old_page, address); | 2073 | tmp = do_page_mkwrite(vma, old_page, address); |
| 2097 | if (unlikely(!tmp || (tmp & | 2074 | if (unlikely(!tmp || (tmp & |
| @@ -2111,11 +2088,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2111 | unlock_page(old_page); | 2088 | unlock_page(old_page); |
| 2112 | goto unlock; | 2089 | goto unlock; |
| 2113 | } | 2090 | } |
| 2114 | |||
| 2115 | page_mkwrite = 1; | 2091 | page_mkwrite = 1; |
| 2116 | } | 2092 | } |
| 2117 | dirty_page = old_page; | 2093 | |
| 2118 | get_page(dirty_page); | 2094 | dirty_shared = true; |
| 2119 | 2095 | ||
| 2120 | reuse: | 2096 | reuse: |
| 2121 | /* | 2097 | /* |
| @@ -2134,20 +2110,20 @@ reuse: | |||
| 2134 | pte_unmap_unlock(page_table, ptl); | 2110 | pte_unmap_unlock(page_table, ptl); |
| 2135 | ret |= VM_FAULT_WRITE; | 2111 | ret |= VM_FAULT_WRITE; |
| 2136 | 2112 | ||
| 2137 | if (!dirty_page) | 2113 | if (dirty_shared) { |
| 2138 | return ret; | ||
| 2139 | |||
| 2140 | if (!page_mkwrite) { | ||
| 2141 | struct address_space *mapping; | 2114 | struct address_space *mapping; |
| 2142 | int dirtied; | 2115 | int dirtied; |
| 2143 | 2116 | ||
| 2144 | lock_page(dirty_page); | 2117 | if (!page_mkwrite) |
| 2145 | dirtied = set_page_dirty(dirty_page); | 2118 | lock_page(old_page); |
| 2146 | VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page); | 2119 | |
| 2147 | mapping = dirty_page->mapping; | 2120 | dirtied = set_page_dirty(old_page); |
| 2148 | unlock_page(dirty_page); | 2121 | VM_BUG_ON_PAGE(PageAnon(old_page), old_page); |
| 2122 | mapping = old_page->mapping; | ||
| 2123 | unlock_page(old_page); | ||
| 2124 | page_cache_release(old_page); | ||
| 2149 | 2125 | ||
| 2150 | if (dirtied && mapping) { | 2126 | if ((dirtied || page_mkwrite) && mapping) { |
| 2151 | /* | 2127 | /* |
| 2152 | * Some device drivers do not set page.mapping | 2128 | * Some device drivers do not set page.mapping |
| 2153 | * but still dirty their pages | 2129 | * but still dirty their pages |
| @@ -2155,25 +2131,9 @@ reuse: | |||
| 2155 | balance_dirty_pages_ratelimited(mapping); | 2131 | balance_dirty_pages_ratelimited(mapping); |
| 2156 | } | 2132 | } |
| 2157 | 2133 | ||
| 2158 | /* file_update_time outside page_lock */ | 2134 | if (!page_mkwrite) |
| 2159 | if (vma->vm_file) | ||
| 2160 | file_update_time(vma->vm_file); | 2135 | file_update_time(vma->vm_file); |
| 2161 | } | 2136 | } |
| 2162 | put_page(dirty_page); | ||
| 2163 | if (page_mkwrite) { | ||
| 2164 | struct address_space *mapping = dirty_page->mapping; | ||
| 2165 | |||
| 2166 | set_page_dirty(dirty_page); | ||
| 2167 | unlock_page(dirty_page); | ||
| 2168 | page_cache_release(dirty_page); | ||
| 2169 | if (mapping) { | ||
| 2170 | /* | ||
| 2171 | * Some device drivers do not set page.mapping | ||
| 2172 | * but still dirty their pages | ||
| 2173 | */ | ||
| 2174 | balance_dirty_pages_ratelimited(mapping); | ||
| 2175 | } | ||
| 2176 | } | ||
| 2177 | 2137 | ||
| 2178 | return ret; | 2138 | return ret; |
| 2179 | } | 2139 | } |
| @@ -2331,25 +2291,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, | |||
| 2331 | } | 2291 | } |
| 2332 | } | 2292 | } |
| 2333 | 2293 | ||
| 2334 | static inline void unmap_mapping_range_list(struct list_head *head, | ||
| 2335 | struct zap_details *details) | ||
| 2336 | { | ||
| 2337 | struct vm_area_struct *vma; | ||
| 2338 | |||
| 2339 | /* | ||
| 2340 | * In nonlinear VMAs there is no correspondence between virtual address | ||
| 2341 | * offset and file offset. So we must perform an exhaustive search | ||
| 2342 | * across *all* the pages in each nonlinear VMA, not just the pages | ||
| 2343 | * whose virtual address lies outside the file truncation point. | ||
| 2344 | */ | ||
| 2345 | list_for_each_entry(vma, head, shared.nonlinear) { | ||
| 2346 | details->nonlinear_vma = vma; | ||
| 2347 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); | ||
| 2348 | } | ||
| 2349 | } | ||
| 2350 | |||
| 2351 | /** | 2294 | /** |
| 2352 | * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. | 2295 | * unmap_mapping_range - unmap the portion of all mmaps in the specified |
| 2296 | * address_space corresponding to the specified page range in the underlying | ||
| 2297 | * file. | ||
| 2298 | * | ||
| 2353 | * @mapping: the address space containing mmaps to be unmapped. | 2299 | * @mapping: the address space containing mmaps to be unmapped. |
| 2354 | * @holebegin: byte in first page to unmap, relative to the start of | 2300 | * @holebegin: byte in first page to unmap, relative to the start of |
| 2355 | * the underlying file. This will be rounded down to a PAGE_SIZE | 2301 | * the underlying file. This will be rounded down to a PAGE_SIZE |
| @@ -2378,18 +2324,16 @@ void unmap_mapping_range(struct address_space *mapping, | |||
| 2378 | } | 2324 | } |
| 2379 | 2325 | ||
| 2380 | details.check_mapping = even_cows? NULL: mapping; | 2326 | details.check_mapping = even_cows? NULL: mapping; |
| 2381 | details.nonlinear_vma = NULL; | ||
| 2382 | details.first_index = hba; | 2327 | details.first_index = hba; |
| 2383 | details.last_index = hba + hlen - 1; | 2328 | details.last_index = hba + hlen - 1; |
| 2384 | if (details.last_index < details.first_index) | 2329 | if (details.last_index < details.first_index) |
| 2385 | details.last_index = ULONG_MAX; | 2330 | details.last_index = ULONG_MAX; |
| 2386 | 2331 | ||
| 2387 | 2332 | ||
| 2333 | /* DAX uses i_mmap_lock to serialise file truncate vs page fault */ | ||
| 2388 | i_mmap_lock_write(mapping); | 2334 | i_mmap_lock_write(mapping); |
| 2389 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) | 2335 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) |
| 2390 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2336 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
| 2391 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | ||
| 2392 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | ||
| 2393 | i_mmap_unlock_write(mapping); | 2337 | i_mmap_unlock_write(mapping); |
| 2394 | } | 2338 | } |
| 2395 | EXPORT_SYMBOL(unmap_mapping_range); | 2339 | EXPORT_SYMBOL(unmap_mapping_range); |
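With the i_mmap_nonlinear list gone, unmap_mapping_range() only walks the i_mmap interval tree; its calling convention is unchanged. A minimal sketch of the usual truncate pattern, modelled on truncate_pagecache() (example_truncate() itself is an invented wrapper, shown only to illustrate the holelen and even_cows arguments):

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>

static void example_truncate(struct inode *inode, loff_t newsize)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t holebegin = round_up(newsize, PAGE_SIZE);

	/*
	 * holelen == 0 means "unmap to EOF"; even_cows == 1 also zaps
	 * private COW copies of the truncated range.
	 */
	unmap_mapping_range(mapping, holebegin, 0, 1);
	truncate_inode_pages(mapping, newsize);
	/* repeat once the page cache is gone, to catch a racing fault */
	unmap_mapping_range(mapping, holebegin, 0, 1);
}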
| @@ -2696,7 +2640,8 @@ oom: | |||
| 2696 | * See filemap_fault() and __lock_page_retry(). | 2640 | * See filemap_fault() and __lock_page_retry(). |
| 2697 | */ | 2641 | */ |
| 2698 | static int __do_fault(struct vm_area_struct *vma, unsigned long address, | 2642 | static int __do_fault(struct vm_area_struct *vma, unsigned long address, |
| 2699 | pgoff_t pgoff, unsigned int flags, struct page **page) | 2643 | pgoff_t pgoff, unsigned int flags, |
| 2644 | struct page *cow_page, struct page **page) | ||
| 2700 | { | 2645 | { |
| 2701 | struct vm_fault vmf; | 2646 | struct vm_fault vmf; |
| 2702 | int ret; | 2647 | int ret; |
| @@ -2705,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, | |||
| 2705 | vmf.pgoff = pgoff; | 2650 | vmf.pgoff = pgoff; |
| 2706 | vmf.flags = flags; | 2651 | vmf.flags = flags; |
| 2707 | vmf.page = NULL; | 2652 | vmf.page = NULL; |
| 2653 | vmf.cow_page = cow_page; | ||
| 2708 | 2654 | ||
| 2709 | ret = vma->vm_ops->fault(vma, &vmf); | 2655 | ret = vma->vm_ops->fault(vma, &vmf); |
| 2710 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 2656 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 2711 | return ret; | 2657 | return ret; |
| 2658 | if (!vmf.page) | ||
| 2659 | goto out; | ||
| 2712 | 2660 | ||
| 2713 | if (unlikely(PageHWPoison(vmf.page))) { | 2661 | if (unlikely(PageHWPoison(vmf.page))) { |
| 2714 | if (ret & VM_FAULT_LOCKED) | 2662 | if (ret & VM_FAULT_LOCKED) |
| @@ -2722,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, | |||
| 2722 | else | 2670 | else |
| 2723 | VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); | 2671 | VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); |
| 2724 | 2672 | ||
| 2673 | out: | ||
| 2725 | *page = vmf.page; | 2674 | *page = vmf.page; |
| 2726 | return ret; | 2675 | return ret; |
| 2727 | } | 2676 | } |
| @@ -2750,8 +2699,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, | |||
| 2750 | entry = mk_pte(page, vma->vm_page_prot); | 2699 | entry = mk_pte(page, vma->vm_page_prot); |
| 2751 | if (write) | 2700 | if (write) |
| 2752 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2701 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2753 | else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) | ||
| 2754 | entry = pte_mksoft_dirty(entry); | ||
| 2755 | if (anon) { | 2702 | if (anon) { |
| 2756 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2703 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
| 2757 | page_add_new_anon_rmap(page, vma, address); | 2704 | page_add_new_anon_rmap(page, vma, address); |
| @@ -2886,8 +2833,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2886 | * if page by the offset is not ready to be mapped (cold cache or | 2833 | * if page by the offset is not ready to be mapped (cold cache or |
| 2887 | * something). | 2834 | * something). |
| 2888 | */ | 2835 | */ |
| 2889 | if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && | 2836 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { |
| 2890 | fault_around_bytes >> PAGE_SHIFT > 1) { | ||
| 2891 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2837 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 2892 | do_fault_around(vma, address, pte, pgoff, flags); | 2838 | do_fault_around(vma, address, pte, pgoff, flags); |
| 2893 | if (!pte_same(*pte, orig_pte)) | 2839 | if (!pte_same(*pte, orig_pte)) |
| @@ -2895,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2895 | pte_unmap_unlock(pte, ptl); | 2841 | pte_unmap_unlock(pte, ptl); |
| 2896 | } | 2842 | } |
| 2897 | 2843 | ||
| 2898 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); | 2844 | ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); |
| 2899 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 2845 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 2900 | return ret; | 2846 | return ret; |
| 2901 | 2847 | ||
| @@ -2935,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2935 | return VM_FAULT_OOM; | 2881 | return VM_FAULT_OOM; |
| 2936 | } | 2882 | } |
| 2937 | 2883 | ||
| 2938 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); | 2884 | ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); |
| 2939 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 2885 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 2940 | goto uncharge_out; | 2886 | goto uncharge_out; |
| 2941 | 2887 | ||
| 2942 | copy_user_highpage(new_page, fault_page, address, vma); | 2888 | if (fault_page) |
| 2889 | copy_user_highpage(new_page, fault_page, address, vma); | ||
| 2943 | __SetPageUptodate(new_page); | 2890 | __SetPageUptodate(new_page); |
| 2944 | 2891 | ||
| 2945 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2892 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 2946 | if (unlikely(!pte_same(*pte, orig_pte))) { | 2893 | if (unlikely(!pte_same(*pte, orig_pte))) { |
| 2947 | pte_unmap_unlock(pte, ptl); | 2894 | pte_unmap_unlock(pte, ptl); |
| 2948 | unlock_page(fault_page); | 2895 | if (fault_page) { |
| 2949 | page_cache_release(fault_page); | 2896 | unlock_page(fault_page); |
| 2897 | page_cache_release(fault_page); | ||
| 2898 | } else { | ||
| 2899 | /* | ||
| 2900 | * The fault handler has no page to lock, so it holds | ||
| 2901 | * i_mmap_lock for read to protect against truncate. | ||
| 2902 | */ | ||
| 2903 | i_mmap_unlock_read(vma->vm_file->f_mapping); | ||
| 2904 | } | ||
| 2950 | goto uncharge_out; | 2905 | goto uncharge_out; |
| 2951 | } | 2906 | } |
| 2952 | do_set_pte(vma, address, new_page, pte, true, true); | 2907 | do_set_pte(vma, address, new_page, pte, true, true); |
| 2953 | mem_cgroup_commit_charge(new_page, memcg, false); | 2908 | mem_cgroup_commit_charge(new_page, memcg, false); |
| 2954 | lru_cache_add_active_or_unevictable(new_page, vma); | 2909 | lru_cache_add_active_or_unevictable(new_page, vma); |
| 2955 | pte_unmap_unlock(pte, ptl); | 2910 | pte_unmap_unlock(pte, ptl); |
| 2956 | unlock_page(fault_page); | 2911 | if (fault_page) { |
| 2957 | page_cache_release(fault_page); | 2912 | unlock_page(fault_page); |
| 2913 | page_cache_release(fault_page); | ||
| 2914 | } else { | ||
| 2915 | /* | ||
| 2916 | * The fault handler has no page to lock, so it holds | ||
| 2917 | * i_mmap_lock for read to protect against truncate. | ||
| 2918 | */ | ||
| 2919 | i_mmap_unlock_read(vma->vm_file->f_mapping); | ||
| 2920 | } | ||
| 2958 | return ret; | 2921 | return ret; |
| 2959 | uncharge_out: | 2922 | uncharge_out: |
| 2960 | mem_cgroup_cancel_charge(new_page, memcg); | 2923 | mem_cgroup_cancel_charge(new_page, memcg); |
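The new cow_page argument lets a ->fault implementation with no page cache page (the DAX case the i_mmap_lock comment above refers to) fill the preallocated COW page itself and return with vmf->page left NULL; it then holds i_mmap_lock for read so the fault cannot race with truncate, and do_cow_fault() drops that lock once the pte is installed. A rough, illustrative handler following that protocol (example_fault() is invented; only the vm_fault fields and the locking calls are the real interface):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/highmem.h>

static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct address_space *mapping = vma->vm_file->f_mapping;

	if (vmf->cow_page) {
		/*
		 * Private write fault: fill the preallocated COW page
		 * directly (here we just treat the block as a hole and
		 * zero it), leave vmf->page NULL, and take i_mmap_lock
		 * for read; do_cow_fault() unlocks it after installing
		 * the pte.
		 */
		clear_user_highpage(vmf->cow_page,
				    (unsigned long)vmf->virtual_address);
		vmf->page = NULL;
		i_mmap_lock_read(mapping);
		return 0;
	}
	/* read and shared faults would map the backing store here */
	return VM_FAULT_SIGBUS;
}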
| @@ -2973,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2973 | int dirtied = 0; | 2936 | int dirtied = 0; |
| 2974 | int ret, tmp; | 2937 | int ret, tmp; |
| 2975 | 2938 | ||
| 2976 | ret = __do_fault(vma, address, pgoff, flags, &fault_page); | 2939 | ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); |
| 2977 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 2940 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 2978 | return ret; | 2941 | return ret; |
| 2979 | 2942 | ||
| @@ -3019,8 +2982,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3019 | balance_dirty_pages_ratelimited(mapping); | 2982 | balance_dirty_pages_ratelimited(mapping); |
| 3020 | } | 2983 | } |
| 3021 | 2984 | ||
| 3022 | /* file_update_time outside page_lock */ | 2985 | if (!vma->vm_ops->page_mkwrite) |
| 3023 | if (vma->vm_file && !vma->vm_ops->page_mkwrite) | ||
| 3024 | file_update_time(vma->vm_file); | 2986 | file_update_time(vma->vm_file); |
| 3025 | 2987 | ||
| 3026 | return ret; | 2988 | return ret; |
| @@ -3032,7 +2994,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3032 | * The mmap_sem may have been released depending on flags and our | 2994 | * The mmap_sem may have been released depending on flags and our |
| 3033 | * return value. See filemap_fault() and __lock_page_or_retry(). | 2995 | * return value. See filemap_fault() and __lock_page_or_retry(). |
| 3034 | */ | 2996 | */ |
| 3035 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2997 | static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
| 3036 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2998 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
| 3037 | unsigned int flags, pte_t orig_pte) | 2999 | unsigned int flags, pte_t orig_pte) |
| 3038 | { | 3000 | { |
| @@ -3049,46 +3011,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3049 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 3011 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
| 3050 | } | 3012 | } |
| 3051 | 3013 | ||
| 3052 | /* | ||
| 3053 | * Fault of a previously existing named mapping. Repopulate the pte | ||
| 3054 | * from the encoded file_pte if possible. This enables swappable | ||
| 3055 | * nonlinear vmas. | ||
| 3056 | * | ||
| 3057 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
| 3058 | * but allow concurrent faults), and pte mapped but not yet locked. | ||
| 3059 | * We return with pte unmapped and unlocked. | ||
| 3060 | * The mmap_sem may have been released depending on flags and our | ||
| 3061 | * return value. See filemap_fault() and __lock_page_or_retry(). | ||
| 3062 | */ | ||
| 3063 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 3064 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
| 3065 | unsigned int flags, pte_t orig_pte) | ||
| 3066 | { | ||
| 3067 | pgoff_t pgoff; | ||
| 3068 | |||
| 3069 | flags |= FAULT_FLAG_NONLINEAR; | ||
| 3070 | |||
| 3071 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | ||
| 3072 | return 0; | ||
| 3073 | |||
| 3074 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { | ||
| 3075 | /* | ||
| 3076 | * Page table corrupted: show pte and kill process. | ||
| 3077 | */ | ||
| 3078 | print_bad_pte(vma, address, orig_pte, NULL); | ||
| 3079 | return VM_FAULT_SIGBUS; | ||
| 3080 | } | ||
| 3081 | |||
| 3082 | pgoff = pte_to_pgoff(orig_pte); | ||
| 3083 | if (!(flags & FAULT_FLAG_WRITE)) | ||
| 3084 | return do_read_fault(mm, vma, address, pmd, pgoff, flags, | ||
| 3085 | orig_pte); | ||
| 3086 | if (!(vma->vm_flags & VM_SHARED)) | ||
| 3087 | return do_cow_fault(mm, vma, address, pmd, pgoff, flags, | ||
| 3088 | orig_pte); | ||
| 3089 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | ||
| 3090 | } | ||
| 3091 | |||
| 3092 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 3014 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
| 3093 | unsigned long addr, int page_nid, | 3015 | unsigned long addr, int page_nid, |
| 3094 | int *flags) | 3016 | int *flags) |
| @@ -3115,14 +3037,17 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3115 | bool migrated = false; | 3037 | bool migrated = false; |
| 3116 | int flags = 0; | 3038 | int flags = 0; |
| 3117 | 3039 | ||
| 3040 | /* A PROT_NONE fault should not end up here */ | ||
| 3041 | BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); | ||
| 3042 | |||
| 3118 | /* | 3043 | /* |
| 3119 | * The "pte" at this point cannot be used safely without | 3044 | * The "pte" at this point cannot be used safely without |
| 3120 | * validation through pte_unmap_same(). It's of NUMA type but | 3045 | * validation through pte_unmap_same(). It's of NUMA type but |
| 3121 | * the pfn may be screwed if the read is non atomic. | 3046 | * the pfn may be screwed if the read is non atomic. |
| 3122 | * | 3047 | * |
| 3123 | * ptep_modify_prot_start is not called as this is clearing | 3048 | * We can safely just do a "set_pte_at()", because the old |
| 3124 | * the _PAGE_NUMA bit and it is not really expected that there | 3049 | * page table entry is not accessible, so there would be no |
| 3125 | * would be concurrent hardware modifications to the PTE. | 3050 | * concurrent hardware modifications to the PTE. |
| 3126 | */ | 3051 | */ |
| 3127 | ptl = pte_lockptr(mm, pmd); | 3052 | ptl = pte_lockptr(mm, pmd); |
| 3128 | spin_lock(ptl); | 3053 | spin_lock(ptl); |
| @@ -3131,7 +3056,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3131 | goto out; | 3056 | goto out; |
| 3132 | } | 3057 | } |
| 3133 | 3058 | ||
| 3134 | pte = pte_mknonnuma(pte); | 3059 | /* Make it present again */ |
| 3060 | pte = pte_modify(pte, vma->vm_page_prot); | ||
| 3061 | pte = pte_mkyoung(pte); | ||
| 3135 | set_pte_at(mm, addr, ptep, pte); | 3062 | set_pte_at(mm, addr, ptep, pte); |
| 3136 | update_mmu_cache(vma, addr, ptep); | 3063 | update_mmu_cache(vma, addr, ptep); |
| 3137 | 3064 | ||
| @@ -3140,7 +3067,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3140 | pte_unmap_unlock(ptep, ptl); | 3067 | pte_unmap_unlock(ptep, ptl); |
| 3141 | return 0; | 3068 | return 0; |
| 3142 | } | 3069 | } |
| 3143 | BUG_ON(is_zero_pfn(page_to_pfn(page))); | ||
| 3144 | 3070 | ||
| 3145 | /* | 3071 | /* |
| 3146 | * Avoid grouping on DSO/COW pages in specific and RO pages | 3072 | * Avoid grouping on DSO/COW pages in specific and RO pages |
| @@ -3216,20 +3142,17 @@ static int handle_pte_fault(struct mm_struct *mm, | |||
| 3216 | if (pte_none(entry)) { | 3142 | if (pte_none(entry)) { |
| 3217 | if (vma->vm_ops) { | 3143 | if (vma->vm_ops) { |
| 3218 | if (likely(vma->vm_ops->fault)) | 3144 | if (likely(vma->vm_ops->fault)) |
| 3219 | return do_linear_fault(mm, vma, address, | 3145 | return do_fault(mm, vma, address, pte, |
| 3220 | pte, pmd, flags, entry); | 3146 | pmd, flags, entry); |
| 3221 | } | 3147 | } |
| 3222 | return do_anonymous_page(mm, vma, address, | 3148 | return do_anonymous_page(mm, vma, address, |
| 3223 | pte, pmd, flags); | 3149 | pte, pmd, flags); |
| 3224 | } | 3150 | } |
| 3225 | if (pte_file(entry)) | ||
| 3226 | return do_nonlinear_fault(mm, vma, address, | ||
| 3227 | pte, pmd, flags, entry); | ||
| 3228 | return do_swap_page(mm, vma, address, | 3151 | return do_swap_page(mm, vma, address, |
| 3229 | pte, pmd, flags, entry); | 3152 | pte, pmd, flags, entry); |
| 3230 | } | 3153 | } |
| 3231 | 3154 | ||
| 3232 | if (pte_numa(entry)) | 3155 | if (pte_protnone(entry)) |
| 3233 | return do_numa_page(mm, vma, address, entry, pte, pmd); | 3156 | return do_numa_page(mm, vma, address, entry, pte, pmd); |
| 3234 | 3157 | ||
| 3235 | ptl = pte_lockptr(mm, pmd); | 3158 | ptl = pte_lockptr(mm, pmd); |
| @@ -3307,7 +3230,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3307 | if (pmd_trans_splitting(orig_pmd)) | 3230 | if (pmd_trans_splitting(orig_pmd)) |
| 3308 | return 0; | 3231 | return 0; |
| 3309 | 3232 | ||
| 3310 | if (pmd_numa(orig_pmd)) | 3233 | if (pmd_protnone(orig_pmd)) |
| 3311 | return do_huge_pmd_numa_page(mm, vma, address, | 3234 | return do_huge_pmd_numa_page(mm, vma, address, |
| 3312 | orig_pmd, pmd); | 3235 | orig_pmd, pmd); |
| 3313 | 3236 | ||
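The pte_numa()/pmd_numa() tests are replaced by pte_protnone()/pmd_protnone(): a NUMA hinting fault is now simply an entry whose user protections have been stripped (PROT_NONE) while the page itself stays where it is. Roughly what the x86 helper added elsewhere in this series looks like, shown here only for context; architectures without NUMA balancing just return 0:

static inline int pte_protnone(pte_t pte)
{
	/* present as far as the kernel is concerned, but no user access */
	return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT))
		== _PAGE_PROTNONE;
}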
| @@ -3428,15 +3351,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | |||
| 3428 | 3351 | ||
| 3429 | spin_lock(&mm->page_table_lock); | 3352 | spin_lock(&mm->page_table_lock); |
| 3430 | #ifndef __ARCH_HAS_4LEVEL_HACK | 3353 | #ifndef __ARCH_HAS_4LEVEL_HACK |
| 3431 | if (pud_present(*pud)) /* Another has populated it */ | 3354 | if (!pud_present(*pud)) { |
| 3432 | pmd_free(mm, new); | 3355 | mm_inc_nr_pmds(mm); |
| 3433 | else | ||
| 3434 | pud_populate(mm, pud, new); | 3356 | pud_populate(mm, pud, new); |
| 3435 | #else | 3357 | } else /* Another has populated it */ |
| 3436 | if (pgd_present(*pud)) /* Another has populated it */ | ||
| 3437 | pmd_free(mm, new); | 3358 | pmd_free(mm, new); |
| 3438 | else | 3359 | #else |
| 3360 | if (!pgd_present(*pud)) { | ||
| 3361 | mm_inc_nr_pmds(mm); | ||
| 3439 | pgd_populate(mm, pud, new); | 3362 | pgd_populate(mm, pud, new); |
| 3363 | } else /* Another has populated it */ | ||
| 3364 | pmd_free(mm, new); | ||
| 3440 | #endif /* __ARCH_HAS_4LEVEL_HACK */ | 3365 | #endif /* __ARCH_HAS_4LEVEL_HACK */ |
| 3441 | spin_unlock(&mm->page_table_lock); | 3366 | spin_unlock(&mm->page_table_lock); |
| 3442 | return 0; | 3367 | return 0; |
| @@ -3561,7 +3486,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | |||
| 3561 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) | 3486 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) |
| 3562 | return -EINVAL; | 3487 | return -EINVAL; |
| 3563 | 3488 | ||
| 3564 | maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); | 3489 | maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); |
| 3565 | if (write) | 3490 | if (write) |
| 3566 | memcpy_toio(maddr + offset, buf, len); | 3491 | memcpy_toio(maddr + offset, buf, len); |
| 3567 | else | 3492 | else |
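The generic_access_phys() change also fixes the ioremap length: the access starts offset bytes into the first page, so a single-page mapping is too small whenever offset + len crosses a page boundary. For example, with 4 KiB pages, offset = 0xff0 and len = 0x40, the copy touches 0x1030 bytes; the old ioremap_prot(phys_addr, PAGE_SIZE, prot) mapped only 0x1000 of them, while PAGE_ALIGN(len + offset) = 0x2000 covers the whole access.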
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e0961b8c39c..4721046a134a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -471,24 +471,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { | |||
| 471 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 471 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
| 472 | unsigned long flags); | 472 | unsigned long flags); |
| 473 | 473 | ||
| 474 | struct queue_pages { | ||
| 475 | struct list_head *pagelist; | ||
| 476 | unsigned long flags; | ||
| 477 | nodemask_t *nmask; | ||
| 478 | struct vm_area_struct *prev; | ||
| 479 | }; | ||
| 480 | |||
| 474 | /* | 481 | /* |
| 475 | * Scan through pages checking if pages follow certain conditions, | 482 | * Scan through pages checking if pages follow certain conditions, |
| 476 | * and move them to the pagelist if they do. | 483 | * and move them to the pagelist if they do. |
| 477 | */ | 484 | */ |
| 478 | static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 485 | static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, |
| 479 | unsigned long addr, unsigned long end, | 486 | unsigned long end, struct mm_walk *walk) |
| 480 | const nodemask_t *nodes, unsigned long flags, | ||
| 481 | void *private) | ||
| 482 | { | 487 | { |
| 483 | pte_t *orig_pte; | 488 | struct vm_area_struct *vma = walk->vma; |
| 489 | struct page *page; | ||
| 490 | struct queue_pages *qp = walk->private; | ||
| 491 | unsigned long flags = qp->flags; | ||
| 492 | int nid; | ||
| 484 | pte_t *pte; | 493 | pte_t *pte; |
| 485 | spinlock_t *ptl; | 494 | spinlock_t *ptl; |
| 486 | 495 | ||
| 487 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 496 | split_huge_page_pmd(vma, addr, pmd); |
| 488 | do { | 497 | if (pmd_trans_unstable(pmd)) |
| 489 | struct page *page; | 498 | return 0; |
| 490 | int nid; | ||
| 491 | 499 | ||
| 500 | pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | ||
| 501 | for (; addr != end; pte++, addr += PAGE_SIZE) { | ||
| 492 | if (!pte_present(*pte)) | 502 | if (!pte_present(*pte)) |
| 493 | continue; | 503 | continue; |
| 494 | page = vm_normal_page(vma, addr, *pte); | 504 | page = vm_normal_page(vma, addr, *pte); |
| @@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 501 | if (PageReserved(page)) | 511 | if (PageReserved(page)) |
| 502 | continue; | 512 | continue; |
| 503 | nid = page_to_nid(page); | 513 | nid = page_to_nid(page); |
| 504 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 514 | if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) |
| 505 | continue; | 515 | continue; |
| 506 | 516 | ||
| 507 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | 517 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) |
| 508 | migrate_page_add(page, private, flags); | 518 | migrate_page_add(page, qp->pagelist, flags); |
| 509 | else | 519 | } |
| 510 | break; | 520 | pte_unmap_unlock(pte - 1, ptl); |
| 511 | } while (pte++, addr += PAGE_SIZE, addr != end); | 521 | cond_resched(); |
| 512 | pte_unmap_unlock(orig_pte, ptl); | 522 | return 0; |
| 513 | return addr != end; | ||
| 514 | } | 523 | } |
| 515 | 524 | ||
| 516 | static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, | 525 | static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, |
| 517 | pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, | 526 | unsigned long addr, unsigned long end, |
| 518 | void *private) | 527 | struct mm_walk *walk) |
| 519 | { | 528 | { |
| 520 | #ifdef CONFIG_HUGETLB_PAGE | 529 | #ifdef CONFIG_HUGETLB_PAGE |
| 530 | struct queue_pages *qp = walk->private; | ||
| 531 | unsigned long flags = qp->flags; | ||
| 521 | int nid; | 532 | int nid; |
| 522 | struct page *page; | 533 | struct page *page; |
| 523 | spinlock_t *ptl; | 534 | spinlock_t *ptl; |
| 524 | pte_t entry; | 535 | pte_t entry; |
| 525 | 536 | ||
| 526 | ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); | 537 | ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); |
| 527 | entry = huge_ptep_get((pte_t *)pmd); | 538 | entry = huge_ptep_get(pte); |
| 528 | if (!pte_present(entry)) | 539 | if (!pte_present(entry)) |
| 529 | goto unlock; | 540 | goto unlock; |
| 530 | page = pte_page(entry); | 541 | page = pte_page(entry); |
| 531 | nid = page_to_nid(page); | 542 | nid = page_to_nid(page); |
| 532 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 543 | if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) |
| 533 | goto unlock; | 544 | goto unlock; |
| 534 | /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ | 545 | /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ |
| 535 | if (flags & (MPOL_MF_MOVE_ALL) || | 546 | if (flags & (MPOL_MF_MOVE_ALL) || |
| 536 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) | 547 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) |
| 537 | isolate_huge_page(page, private); | 548 | isolate_huge_page(page, qp->pagelist); |
| 538 | unlock: | 549 | unlock: |
| 539 | spin_unlock(ptl); | 550 | spin_unlock(ptl); |
| 540 | #else | 551 | #else |
| 541 | BUG(); | 552 | BUG(); |
| 542 | #endif | 553 | #endif |
| 543 | } | ||
| 544 | |||
| 545 | static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, | ||
| 546 | unsigned long addr, unsigned long end, | ||
| 547 | const nodemask_t *nodes, unsigned long flags, | ||
| 548 | void *private) | ||
| 549 | { | ||
| 550 | pmd_t *pmd; | ||
| 551 | unsigned long next; | ||
| 552 | |||
| 553 | pmd = pmd_offset(pud, addr); | ||
| 554 | do { | ||
| 555 | next = pmd_addr_end(addr, end); | ||
| 556 | if (!pmd_present(*pmd)) | ||
| 557 | continue; | ||
| 558 | if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { | ||
| 559 | queue_pages_hugetlb_pmd_range(vma, pmd, nodes, | ||
| 560 | flags, private); | ||
| 561 | continue; | ||
| 562 | } | ||
| 563 | split_huge_page_pmd(vma, addr, pmd); | ||
| 564 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
| 565 | continue; | ||
| 566 | if (queue_pages_pte_range(vma, pmd, addr, next, nodes, | ||
| 567 | flags, private)) | ||
| 568 | return -EIO; | ||
| 569 | } while (pmd++, addr = next, addr != end); | ||
| 570 | return 0; | ||
| 571 | } | ||
| 572 | |||
| 573 | static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | ||
| 574 | unsigned long addr, unsigned long end, | ||
| 575 | const nodemask_t *nodes, unsigned long flags, | ||
| 576 | void *private) | ||
| 577 | { | ||
| 578 | pud_t *pud; | ||
| 579 | unsigned long next; | ||
| 580 | |||
| 581 | pud = pud_offset(pgd, addr); | ||
| 582 | do { | ||
| 583 | next = pud_addr_end(addr, end); | ||
| 584 | if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) | ||
| 585 | continue; | ||
| 586 | if (pud_none_or_clear_bad(pud)) | ||
| 587 | continue; | ||
| 588 | if (queue_pages_pmd_range(vma, pud, addr, next, nodes, | ||
| 589 | flags, private)) | ||
| 590 | return -EIO; | ||
| 591 | } while (pud++, addr = next, addr != end); | ||
| 592 | return 0; | ||
| 593 | } | ||
| 594 | |||
| 595 | static inline int queue_pages_pgd_range(struct vm_area_struct *vma, | ||
| 596 | unsigned long addr, unsigned long end, | ||
| 597 | const nodemask_t *nodes, unsigned long flags, | ||
| 598 | void *private) | ||
| 599 | { | ||
| 600 | pgd_t *pgd; | ||
| 601 | unsigned long next; | ||
| 602 | |||
| 603 | pgd = pgd_offset(vma->vm_mm, addr); | ||
| 604 | do { | ||
| 605 | next = pgd_addr_end(addr, end); | ||
| 606 | if (pgd_none_or_clear_bad(pgd)) | ||
| 607 | continue; | ||
| 608 | if (queue_pages_pud_range(vma, pgd, addr, next, nodes, | ||
| 609 | flags, private)) | ||
| 610 | return -EIO; | ||
| 611 | } while (pgd++, addr = next, addr != end); | ||
| 612 | return 0; | 554 | return 0; |
| 613 | } | 555 | } |
| 614 | 556 | ||
| @@ -627,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
| 627 | { | 569 | { |
| 628 | int nr_updated; | 570 | int nr_updated; |
| 629 | 571 | ||
| 630 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); | 572 | nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); |
| 631 | if (nr_updated) | 573 | if (nr_updated) |
| 632 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); | 574 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); |
| 633 | 575 | ||
| @@ -641,6 +583,49 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
| 641 | } | 583 | } |
| 642 | #endif /* CONFIG_NUMA_BALANCING */ | 584 | #endif /* CONFIG_NUMA_BALANCING */ |
| 643 | 585 | ||
| 586 | static int queue_pages_test_walk(unsigned long start, unsigned long end, | ||
| 587 | struct mm_walk *walk) | ||
| 588 | { | ||
| 589 | struct vm_area_struct *vma = walk->vma; | ||
| 590 | struct queue_pages *qp = walk->private; | ||
| 591 | unsigned long endvma = vma->vm_end; | ||
| 592 | unsigned long flags = qp->flags; | ||
| 593 | |||
| 594 | if (vma->vm_flags & VM_PFNMAP) | ||
| 595 | return 1; | ||
| 596 | |||
| 597 | if (endvma > end) | ||
| 598 | endvma = end; | ||
| 599 | if (vma->vm_start > start) | ||
| 600 | start = vma->vm_start; | ||
| 601 | |||
| 602 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | ||
| 603 | if (!vma->vm_next && vma->vm_end < end) | ||
| 604 | return -EFAULT; | ||
| 605 | if (qp->prev && qp->prev->vm_end < vma->vm_start) | ||
| 606 | return -EFAULT; | ||
| 607 | } | ||
| 608 | |||
| 609 | qp->prev = vma; | ||
| 610 | |||
| 611 | if (vma->vm_flags & VM_PFNMAP) | ||
| 612 | return 1; | ||
| 613 | |||
| 614 | if (flags & MPOL_MF_LAZY) { | ||
| 615 | /* Similar to task_numa_work, skip inaccessible VMAs */ | ||
| 616 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | ||
| 617 | change_prot_numa(vma, start, endvma); | ||
| 618 | return 1; | ||
| 619 | } | ||
| 620 | |||
| 621 | if ((flags & MPOL_MF_STRICT) || | ||
| 622 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | ||
| 623 | vma_migratable(vma))) | ||
| 624 | /* queue pages from current vma */ | ||
| 625 | return 0; | ||
| 626 | return 1; | ||
| 627 | } | ||
| 628 | |||
| 644 | /* | 629 | /* |
| 645 | * Walk through page tables and collect pages to be migrated. | 630 | * Walk through page tables and collect pages to be migrated. |
| 646 | * | 631 | * |
| @@ -650,50 +635,24 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
| 650 | */ | 635 | */ |
| 651 | static int | 636 | static int |
| 652 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 637 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
| 653 | const nodemask_t *nodes, unsigned long flags, void *private) | 638 | nodemask_t *nodes, unsigned long flags, |
| 654 | { | 639 | struct list_head *pagelist) |
| 655 | int err = 0; | 640 | { |
| 656 | struct vm_area_struct *vma, *prev; | 641 | struct queue_pages qp = { |
| 657 | 642 | .pagelist = pagelist, | |
| 658 | vma = find_vma(mm, start); | 643 | .flags = flags, |
| 659 | if (!vma) | 644 | .nmask = nodes, |
| 660 | return -EFAULT; | 645 | .prev = NULL, |
| 661 | prev = NULL; | 646 | }; |
| 662 | for (; vma && vma->vm_start < end; vma = vma->vm_next) { | 647 | struct mm_walk queue_pages_walk = { |
| 663 | unsigned long endvma = vma->vm_end; | 648 | .hugetlb_entry = queue_pages_hugetlb, |
| 664 | 649 | .pmd_entry = queue_pages_pte_range, | |
| 665 | if (endvma > end) | 650 | .test_walk = queue_pages_test_walk, |
| 666 | endvma = end; | 651 | .mm = mm, |
| 667 | if (vma->vm_start > start) | 652 | .private = &qp, |
| 668 | start = vma->vm_start; | 653 | }; |
| 669 | 654 | ||
| 670 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 655 | return walk_page_range(start, end, &queue_pages_walk); |
| 671 | if (!vma->vm_next && vma->vm_end < end) | ||
| 672 | return -EFAULT; | ||
| 673 | if (prev && prev->vm_end < vma->vm_start) | ||
| 674 | return -EFAULT; | ||
| 675 | } | ||
| 676 | |||
| 677 | if (flags & MPOL_MF_LAZY) { | ||
| 678 | /* Similar to task_numa_work, skip inaccessible VMAs */ | ||
| 679 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | ||
| 680 | change_prot_numa(vma, start, endvma); | ||
| 681 | goto next; | ||
| 682 | } | ||
| 683 | |||
| 684 | if ((flags & MPOL_MF_STRICT) || | ||
| 685 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | ||
| 686 | vma_migratable(vma))) { | ||
| 687 | |||
| 688 | err = queue_pages_pgd_range(vma, start, endvma, nodes, | ||
| 689 | flags, private); | ||
| 690 | if (err) | ||
| 691 | break; | ||
| 692 | } | ||
| 693 | next: | ||
| 694 | prev = vma; | ||
| 695 | } | ||
| 696 | return err; | ||
| 697 | } | 656 | } |
| 698 | 657 | ||
| 699 | /* | 658 | /* |
| @@ -1988,43 +1947,63 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
| 1988 | * @order:Order of the GFP allocation. | 1947 | * @order:Order of the GFP allocation. |
| 1989 | * @vma: Pointer to VMA or NULL if not available. | 1948 | * @vma: Pointer to VMA or NULL if not available. |
| 1990 | * @addr: Virtual Address of the allocation. Must be inside the VMA. | 1949 | * @addr: Virtual Address of the allocation. Must be inside the VMA. |
| 1950 | * @node: Which node to prefer for allocation (modulo policy). | ||
| 1951 | * @hugepage: for hugepages try only the preferred node if possible | ||
| 1991 | * | 1952 | * |
| 1992 | * This function allocates a page from the kernel page pool and applies | 1953 | * This function allocates a page from the kernel page pool and applies |
| 1993 | * a NUMA policy associated with the VMA or the current process. | 1954 | * a NUMA policy associated with the VMA or the current process. |
| 1994 | * When VMA is not NULL caller must hold down_read on the mmap_sem of the | 1955 | * When VMA is not NULL caller must hold down_read on the mmap_sem of the |
| 1995 | * mm_struct of the VMA to prevent it from going away. Should be used for | 1956 | * mm_struct of the VMA to prevent it from going away. Should be used for |
| 1996 | * all allocations for pages that will be mapped into | 1957 | * all allocations for pages that will be mapped into user space. Returns |
| 1997 | * user space. Returns NULL when no page can be allocated. | 1958 | * NULL when no page can be allocated. |
| 1998 | * | ||
| 1999 | * Should be called with the mm_sem of the vma hold. | ||
| 2000 | */ | 1959 | */ |
| 2001 | struct page * | 1960 | struct page * |
| 2002 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | 1961 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
| 2003 | unsigned long addr, int node) | 1962 | unsigned long addr, int node, bool hugepage) |
| 2004 | { | 1963 | { |
| 2005 | struct mempolicy *pol; | 1964 | struct mempolicy *pol; |
| 2006 | struct page *page; | 1965 | struct page *page; |
| 2007 | unsigned int cpuset_mems_cookie; | 1966 | unsigned int cpuset_mems_cookie; |
| 1967 | struct zonelist *zl; | ||
| 1968 | nodemask_t *nmask; | ||
| 2008 | 1969 | ||
| 2009 | retry_cpuset: | 1970 | retry_cpuset: |
| 2010 | pol = get_vma_policy(vma, addr); | 1971 | pol = get_vma_policy(vma, addr); |
| 2011 | cpuset_mems_cookie = read_mems_allowed_begin(); | 1972 | cpuset_mems_cookie = read_mems_allowed_begin(); |
| 2012 | 1973 | ||
| 2013 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1974 | if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && |
| 1975 | pol->mode != MPOL_INTERLEAVE)) { | ||
| 1976 | /* | ||
| 1977 | * For hugepage allocation and non-interleave policy which | ||
| 1978 | * allows the current node, we only try to allocate from the | ||
| 1979 | * current node and don't fall back to other nodes, as the | ||
| 1980 | * cost of remote accesses would likely offset THP benefits. | ||
| 1981 | * | ||
| 1982 | * If the policy is interleave, or does not allow the current | ||
| 1983 | * node in its nodemask, we allocate the standard way. | ||
| 1984 | */ | ||
| 1985 | nmask = policy_nodemask(gfp, pol); | ||
| 1986 | if (!nmask || node_isset(node, *nmask)) { | ||
| 1987 | mpol_cond_put(pol); | ||
| 1988 | page = alloc_pages_exact_node(node, gfp, order); | ||
| 1989 | goto out; | ||
| 1990 | } | ||
| 1991 | } | ||
| 1992 | |||
| 1993 | if (pol->mode == MPOL_INTERLEAVE) { | ||
| 2014 | unsigned nid; | 1994 | unsigned nid; |
| 2015 | 1995 | ||
| 2016 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | 1996 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); |
| 2017 | mpol_cond_put(pol); | 1997 | mpol_cond_put(pol); |
| 2018 | page = alloc_page_interleave(gfp, order, nid); | 1998 | page = alloc_page_interleave(gfp, order, nid); |
| 2019 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 1999 | goto out; |
| 2020 | goto retry_cpuset; | ||
| 2021 | |||
| 2022 | return page; | ||
| 2023 | } | 2000 | } |
| 2024 | page = __alloc_pages_nodemask(gfp, order, | 2001 | |
| 2025 | policy_zonelist(gfp, pol, node), | 2002 | nmask = policy_nodemask(gfp, pol); |
| 2026 | policy_nodemask(gfp, pol)); | 2003 | zl = policy_zonelist(gfp, pol, node); |
| 2027 | mpol_cond_put(pol); | 2004 | mpol_cond_put(pol); |
| 2005 | page = __alloc_pages_nodemask(gfp, order, zl, nmask); | ||
| 2006 | out: | ||
| 2028 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 2007 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
| 2029 | goto retry_cpuset; | 2008 | goto retry_cpuset; |
| 2030 | return page; | 2009 | return page; |
| @@ -2838,8 +2817,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | |||
| 2838 | p += snprintf(p, buffer + maxlen - p, "relative"); | 2817 | p += snprintf(p, buffer + maxlen - p, "relative"); |
| 2839 | } | 2818 | } |
| 2840 | 2819 | ||
| 2841 | if (!nodes_empty(nodes)) { | 2820 | if (!nodes_empty(nodes)) |
| 2842 | p += snprintf(p, buffer + maxlen - p, ":"); | 2821 | p += scnprintf(p, buffer + maxlen - p, ":%*pbl", |
| 2843 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); | 2822 | nodemask_pr_args(&nodes)); |
| 2844 | } | ||
| 2845 | } | 2823 | } |
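The mempolicy-private pgd/pud/pmd walkers are gone; queue_pages_range() now drives the generic walk_page_range() machinery, keeping its state in walk->private. The same pattern applies to any new walker. A minimal, self-contained sketch (the example_* names are invented; struct mm_walk and walk_page_range() are the real interface, the caller is assumed to hold mmap_sem for read, and transparent huge pages are simply skipped here):

#include <linux/mm.h>
#include <linux/sched.h>

struct example_state {
	unsigned long nr_present;
};

static int example_pte_range(pmd_t *pmd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk)
{
	struct example_state *st = walk->private;
	spinlock_t *ptl;
	pte_t *pte;

	/* skip huge, unstable or empty pmds; see the diff above for THP */
	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (pte_present(*pte))
			st->nr_present++;
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static unsigned long example_count_present(struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	struct example_state st = { 0 };
	struct mm_walk walk = {
		.pmd_entry	= example_pte_range,
		.mm		= mm,
		.private	= &st,
	};

	walk_page_range(start, end, &walk);
	return st.nr_present;
}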
diff --git a/mm/migrate.c b/mm/migrate.c index 344cdf692fc8..85e042686031 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -179,37 +179,6 @@ out: | |||
| 179 | } | 179 | } |
| 180 | 180 | ||
| 181 | /* | 181 | /* |
| 182 | * Congratulations to trinity for discovering this bug. | ||
| 183 | * mm/fremap.c's remap_file_pages() accepts any range within a single vma to | ||
| 184 | * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then | ||
| 185 | * replace the specified range by file ptes throughout (maybe populated after). | ||
| 186 | * If page migration finds a page within that range, while it's still located | ||
| 187 | * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: | ||
| 188 | * zap_pte() clears the temporary migration entry before mmap_sem is dropped. | ||
| 189 | * But if the migrating page is in a part of the vma outside the range to be | ||
| 190 | * remapped, then it will not be cleared, and remove_migration_ptes() needs to | ||
| 191 | * deal with it. Fortunately, this part of the vma is of course still linear, | ||
| 192 | * so we just need to use linear location on the nonlinear list. | ||
| 193 | */ | ||
| 194 | static int remove_linear_migration_ptes_from_nonlinear(struct page *page, | ||
| 195 | struct address_space *mapping, void *arg) | ||
| 196 | { | ||
| 197 | struct vm_area_struct *vma; | ||
| 198 | /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ | ||
| 199 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 200 | unsigned long addr; | ||
| 201 | |||
| 202 | list_for_each_entry(vma, | ||
| 203 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
| 204 | |||
| 205 | addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
| 206 | if (addr >= vma->vm_start && addr < vma->vm_end) | ||
| 207 | remove_migration_pte(page, vma, addr, arg); | ||
| 208 | } | ||
| 209 | return SWAP_AGAIN; | ||
| 210 | } | ||
| 211 | |||
| 212 | /* | ||
| 213 | * Get rid of all migration entries and replace them by | 182 | * Get rid of all migration entries and replace them by |
| 214 | * references to the indicated page. | 183 | * references to the indicated page. |
| 215 | */ | 184 | */ |
| @@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) | |||
| 218 | struct rmap_walk_control rwc = { | 187 | struct rmap_walk_control rwc = { |
| 219 | .rmap_one = remove_migration_pte, | 188 | .rmap_one = remove_migration_pte, |
| 220 | .arg = old, | 189 | .arg = old, |
| 221 | .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, | ||
| 222 | }; | 190 | }; |
| 223 | 191 | ||
| 224 | rmap_walk(new, &rwc); | 192 | rmap_walk(new, &rwc); |
| @@ -229,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) | |||
| 229 | * get to the page and wait until migration is finished. | 197 | * get to the page and wait until migration is finished. |
| 230 | * When we return from this function the fault will be retried. | 198 | * When we return from this function the fault will be retried. |
| 231 | */ | 199 | */ |
| 232 | static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, | 200 | void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, |
| 233 | spinlock_t *ptl) | 201 | spinlock_t *ptl) |
| 234 | { | 202 | { |
| 235 | pte_t pte; | 203 | pte_t pte; |
| @@ -1268,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
| 1268 | goto put_and_set; | 1236 | goto put_and_set; |
| 1269 | 1237 | ||
| 1270 | if (PageHuge(page)) { | 1238 | if (PageHuge(page)) { |
| 1271 | isolate_huge_page(page, &pagelist); | 1239 | if (PageHead(page)) |
| 1240 | isolate_huge_page(page, &pagelist); | ||
| 1272 | goto put_and_set; | 1241 | goto put_and_set; |
| 1273 | } | 1242 | } |
| 1274 | 1243 | ||
| @@ -1685,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd) | |||
| 1685 | return PageLocked(page); | 1654 | return PageLocked(page); |
| 1686 | } | 1655 | } |
| 1687 | 1656 | ||
| 1688 | void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) | ||
| 1689 | { | ||
| 1690 | struct page *page = pmd_page(*pmd); | ||
| 1691 | wait_on_page_locked(page); | ||
| 1692 | } | ||
| 1693 | |||
| 1694 | /* | 1657 | /* |
| 1695 | * Attempt to migrate a misplaced page to the specified destination | 1658 | * Attempt to migrate a misplaced page to the specified destination |
| 1696 | * node. Caller is expected to have an elevated reference count on | 1659 | * node. Caller is expected to have an elevated reference count on |
| @@ -1884,7 +1847,7 @@ out_fail: | |||
| 1884 | out_dropref: | 1847 | out_dropref: |
| 1885 | ptl = pmd_lock(mm, pmd); | 1848 | ptl = pmd_lock(mm, pmd); |
| 1886 | if (pmd_same(*pmd, entry)) { | 1849 | if (pmd_same(*pmd, entry)) { |
| 1887 | entry = pmd_mknonnuma(entry); | 1850 | entry = pmd_modify(entry, vma->vm_page_prot); |
| 1888 | set_pmd_at(mm, mmun_start, pmd, entry); | 1851 | set_pmd_at(mm, mmun_start, pmd, entry); |
| 1889 | update_mmu_cache_pmd(vma, address, &entry); | 1852 | update_mmu_cache_pmd(vma, address, &entry); |
| 1890 | } | 1853 | } |
diff --git a/mm/mincore.c b/mm/mincore.c index c8c528b36641..be25efde64a4 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
| @@ -19,38 +19,25 @@ | |||
| 19 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
| 20 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
| 21 | 21 | ||
| 22 | static void mincore_hugetlb_page_range(struct vm_area_struct *vma, | 22 | static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, |
| 23 | unsigned long addr, unsigned long end, | 23 | unsigned long end, struct mm_walk *walk) |
| 24 | unsigned char *vec) | ||
| 25 | { | 24 | { |
| 26 | #ifdef CONFIG_HUGETLB_PAGE | 25 | #ifdef CONFIG_HUGETLB_PAGE |
| 27 | struct hstate *h; | 26 | unsigned char present; |
| 27 | unsigned char *vec = walk->private; | ||
| 28 | 28 | ||
| 29 | h = hstate_vma(vma); | 29 | /* |
| 30 | while (1) { | 30 | * Hugepages under user process are always in RAM and never |
| 31 | unsigned char present; | 31 | * swapped out, but theoretically it needs to be checked. |
| 32 | pte_t *ptep; | 32 | */ |
| 33 | /* | 33 | present = pte && !huge_pte_none(huge_ptep_get(pte)); |
| 34 | * Huge pages are always in RAM for now, but | 34 | for (; addr != end; vec++, addr += PAGE_SIZE) |
| 35 | * theoretically it needs to be checked. | 35 | *vec = present; |
| 36 | */ | 36 | walk->private = vec; |
| 37 | ptep = huge_pte_offset(current->mm, | ||
| 38 | addr & huge_page_mask(h)); | ||
| 39 | present = ptep && !huge_pte_none(huge_ptep_get(ptep)); | ||
| 40 | while (1) { | ||
| 41 | *vec = present; | ||
| 42 | vec++; | ||
| 43 | addr += PAGE_SIZE; | ||
| 44 | if (addr == end) | ||
| 45 | return; | ||
| 46 | /* check hugepage border */ | ||
| 47 | if (!(addr & ~huge_page_mask(h))) | ||
| 48 | break; | ||
| 49 | } | ||
| 50 | } | ||
| 51 | #else | 37 | #else |
| 52 | BUG(); | 38 | BUG(); |
| 53 | #endif | 39 | #endif |
| 40 | return 0; | ||
| 54 | } | 41 | } |
| 55 | 42 | ||
| 56 | /* | 43 | /* |
| @@ -94,9 +81,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
| 94 | return present; | 81 | return present; |
| 95 | } | 82 | } |
| 96 | 83 | ||
| 97 | static void mincore_unmapped_range(struct vm_area_struct *vma, | 84 | static int __mincore_unmapped_range(unsigned long addr, unsigned long end, |
| 98 | unsigned long addr, unsigned long end, | 85 | struct vm_area_struct *vma, unsigned char *vec) |
| 99 | unsigned char *vec) | ||
| 100 | { | 86 | { |
| 101 | unsigned long nr = (end - addr) >> PAGE_SHIFT; | 87 | unsigned long nr = (end - addr) >> PAGE_SHIFT; |
| 102 | int i; | 88 | int i; |
| @@ -111,30 +97,47 @@ static void mincore_unmapped_range(struct vm_area_struct *vma, | |||
| 111 | for (i = 0; i < nr; i++) | 97 | for (i = 0; i < nr; i++) |
| 112 | vec[i] = 0; | 98 | vec[i] = 0; |
| 113 | } | 99 | } |
| 100 | return nr; | ||
| 101 | } | ||
| 102 | |||
| 103 | static int mincore_unmapped_range(unsigned long addr, unsigned long end, | ||
| 104 | struct mm_walk *walk) | ||
| 105 | { | ||
| 106 | walk->private += __mincore_unmapped_range(addr, end, | ||
| 107 | walk->vma, walk->private); | ||
| 108 | return 0; | ||
| 114 | } | 109 | } |
| 115 | 110 | ||
| 116 | static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 111 | static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
| 117 | unsigned long addr, unsigned long end, | 112 | struct mm_walk *walk) |
| 118 | unsigned char *vec) | ||
| 119 | { | 113 | { |
| 120 | unsigned long next; | ||
| 121 | spinlock_t *ptl; | 114 | spinlock_t *ptl; |
| 115 | struct vm_area_struct *vma = walk->vma; | ||
| 122 | pte_t *ptep; | 116 | pte_t *ptep; |
| 117 | unsigned char *vec = walk->private; | ||
| 118 | int nr = (end - addr) >> PAGE_SHIFT; | ||
| 119 | |||
| 120 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | ||
| 121 | memset(vec, 1, nr); | ||
| 122 | spin_unlock(ptl); | ||
| 123 | goto out; | ||
| 124 | } | ||
| 125 | |||
| 126 | if (pmd_trans_unstable(pmd)) { | ||
| 127 | __mincore_unmapped_range(addr, end, vma, vec); | ||
| 128 | goto out; | ||
| 129 | } | ||
| 123 | 130 | ||
| 124 | ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 131 | ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
| 125 | do { | 132 | for (; addr != end; ptep++, addr += PAGE_SIZE) { |
| 126 | pte_t pte = *ptep; | 133 | pte_t pte = *ptep; |
| 127 | pgoff_t pgoff; | ||
| 128 | 134 | ||
| 129 | next = addr + PAGE_SIZE; | ||
| 130 | if (pte_none(pte)) | 135 | if (pte_none(pte)) |
| 131 | mincore_unmapped_range(vma, addr, next, vec); | 136 | __mincore_unmapped_range(addr, addr + PAGE_SIZE, |
| 137 | vma, vec); | ||
| 132 | else if (pte_present(pte)) | 138 | else if (pte_present(pte)) |
| 133 | *vec = 1; | 139 | *vec = 1; |
| 134 | else if (pte_file(pte)) { | 140 | else { /* pte is a swap entry */ |
| 135 | pgoff = pte_to_pgoff(pte); | ||
| 136 | *vec = mincore_page(vma->vm_file->f_mapping, pgoff); | ||
| 137 | } else { /* pte is a swap entry */ | ||
| 138 | swp_entry_t entry = pte_to_swp_entry(pte); | 141 | swp_entry_t entry = pte_to_swp_entry(pte); |
| 139 | 142 | ||
| 140 | if (non_swap_entry(entry)) { | 143 | if (non_swap_entry(entry)) { |
| @@ -145,9 +148,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 145 | *vec = 1; | 148 | *vec = 1; |
| 146 | } else { | 149 | } else { |
| 147 | #ifdef CONFIG_SWAP | 150 | #ifdef CONFIG_SWAP |
| 148 | pgoff = entry.val; | ||
| 149 | *vec = mincore_page(swap_address_space(entry), | 151 | *vec = mincore_page(swap_address_space(entry), |
| 150 | pgoff); | 152 | entry.val); |
| 151 | #else | 153 | #else |
| 152 | WARN_ON(1); | 154 | WARN_ON(1); |
| 153 | *vec = 1; | 155 | *vec = 1; |
| @@ -155,69 +157,12 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 155 | } | 157 | } |
| 156 | } | 158 | } |
| 157 | vec++; | 159 | vec++; |
| 158 | } while (ptep++, addr = next, addr != end); | 160 | } |
| 159 | pte_unmap_unlock(ptep - 1, ptl); | 161 | pte_unmap_unlock(ptep - 1, ptl); |
| 160 | } | 162 | out: |
| 161 | 163 | walk->private += nr; | |
| 162 | static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 164 | cond_resched(); |
| 163 | unsigned long addr, unsigned long end, | 165 | return 0; |
| 164 | unsigned char *vec) | ||
| 165 | { | ||
| 166 | unsigned long next; | ||
| 167 | pmd_t *pmd; | ||
| 168 | |||
| 169 | pmd = pmd_offset(pud, addr); | ||
| 170 | do { | ||
| 171 | next = pmd_addr_end(addr, end); | ||
| 172 | if (pmd_trans_huge(*pmd)) { | ||
| 173 | if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { | ||
| 174 | vec += (next - addr) >> PAGE_SHIFT; | ||
| 175 | continue; | ||
| 176 | } | ||
| 177 | /* fall through */ | ||
| 178 | } | ||
| 179 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
| 180 | mincore_unmapped_range(vma, addr, next, vec); | ||
| 181 | else | ||
| 182 | mincore_pte_range(vma, pmd, addr, next, vec); | ||
| 183 | vec += (next - addr) >> PAGE_SHIFT; | ||
| 184 | } while (pmd++, addr = next, addr != end); | ||
| 185 | } | ||
| 186 | |||
| 187 | static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | ||
| 188 | unsigned long addr, unsigned long end, | ||
| 189 | unsigned char *vec) | ||
| 190 | { | ||
| 191 | unsigned long next; | ||
| 192 | pud_t *pud; | ||
| 193 | |||
| 194 | pud = pud_offset(pgd, addr); | ||
| 195 | do { | ||
| 196 | next = pud_addr_end(addr, end); | ||
| 197 | if (pud_none_or_clear_bad(pud)) | ||
| 198 | mincore_unmapped_range(vma, addr, next, vec); | ||
| 199 | else | ||
| 200 | mincore_pmd_range(vma, pud, addr, next, vec); | ||
| 201 | vec += (next - addr) >> PAGE_SHIFT; | ||
| 202 | } while (pud++, addr = next, addr != end); | ||
| 203 | } | ||
| 204 | |||
| 205 | static void mincore_page_range(struct vm_area_struct *vma, | ||
| 206 | unsigned long addr, unsigned long end, | ||
| 207 | unsigned char *vec) | ||
| 208 | { | ||
| 209 | unsigned long next; | ||
| 210 | pgd_t *pgd; | ||
| 211 | |||
| 212 | pgd = pgd_offset(vma->vm_mm, addr); | ||
| 213 | do { | ||
| 214 | next = pgd_addr_end(addr, end); | ||
| 215 | if (pgd_none_or_clear_bad(pgd)) | ||
| 216 | mincore_unmapped_range(vma, addr, next, vec); | ||
| 217 | else | ||
| 218 | mincore_pud_range(vma, pgd, addr, next, vec); | ||
| 219 | vec += (next - addr) >> PAGE_SHIFT; | ||
| 220 | } while (pgd++, addr = next, addr != end); | ||
| 221 | } | 166 | } |
| 222 | 167 | ||
| 223 | /* | 168 | /* |
| @@ -229,18 +174,22 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v | |||
| 229 | { | 174 | { |
| 230 | struct vm_area_struct *vma; | 175 | struct vm_area_struct *vma; |
| 231 | unsigned long end; | 176 | unsigned long end; |
| 177 | int err; | ||
| 178 | struct mm_walk mincore_walk = { | ||
| 179 | .pmd_entry = mincore_pte_range, | ||
| 180 | .pte_hole = mincore_unmapped_range, | ||
| 181 | .hugetlb_entry = mincore_hugetlb, | ||
| 182 | .private = vec, | ||
| 183 | }; | ||
| 232 | 184 | ||
| 233 | vma = find_vma(current->mm, addr); | 185 | vma = find_vma(current->mm, addr); |
| 234 | if (!vma || addr < vma->vm_start) | 186 | if (!vma || addr < vma->vm_start) |
| 235 | return -ENOMEM; | 187 | return -ENOMEM; |
| 236 | 188 | mincore_walk.mm = vma->vm_mm; | |
| 237 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); | 189 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); |
| 238 | 190 | err = walk_page_range(addr, end, &mincore_walk); | |
| 239 | if (is_vm_hugetlb_page(vma)) | 191 | if (err < 0) |
| 240 | mincore_hugetlb_page_range(vma, addr, end, vec); | 192 | return err; |
| 241 | else | ||
| 242 | mincore_page_range(vma, addr, end, vec); | ||
| 243 | |||
| 244 | return (end - addr) >> PAGE_SHIFT; | 193 | return (end - addr) >> PAGE_SHIFT; |
| 245 | } | 194 | } |
| 246 | 195 | ||
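For orientation, the syscall whose page-table walk is converted to walk_page_range() above can be exercised from userspace as follows. This is a hedged, minimal sketch (mapping size and the touched range are arbitrary), not part of the patch:

```c
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = 8 * (size_t)page;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned char *vec = malloc(len / page);	/* one status byte per page */

	if (buf == MAP_FAILED || !vec)
		return 1;

	memset(buf, 0, 2 * page);	/* fault in only the first two pages */

	if (mincore(buf, len, vec) == 0) {
		for (size_t i = 0; i < len / page; i++)
			printf("page %zu: %s\n", i,
			       (vec[i] & 1) ? "resident" : "not resident");
	}

	free(vec);
	munmap(buf, len);
	return 0;
}
```

Only the pages touched by memset() should be reported as resident; the rest of the anonymous range stays unmapped, which is exactly the case the new pte_hole/__mincore_unmapped_range path handles.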
diff --git a/mm/mm_init.c b/mm/mm_init.c index 4074caf9936b..5f420f7fafa1 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
| @@ -14,14 +14,14 @@ | |||
| 14 | #include "internal.h" | 14 | #include "internal.h" |
| 15 | 15 | ||
| 16 | #ifdef CONFIG_DEBUG_MEMORY_INIT | 16 | #ifdef CONFIG_DEBUG_MEMORY_INIT |
| 17 | int mminit_loglevel; | 17 | int __meminitdata mminit_loglevel; |
| 18 | 18 | ||
| 19 | #ifndef SECTIONS_SHIFT | 19 | #ifndef SECTIONS_SHIFT |
| 20 | #define SECTIONS_SHIFT 0 | 20 | #define SECTIONS_SHIFT 0 |
| 21 | #endif | 21 | #endif |
| 22 | 22 | ||
| 23 | /* The zonelists are simply reported, validation is manual. */ | 23 | /* The zonelists are simply reported, validation is manual. */ |
| 24 | void mminit_verify_zonelist(void) | 24 | void __init mminit_verify_zonelist(void) |
| 25 | { | 25 | { |
| 26 | int nid; | 26 | int nid; |
| 27 | 27 | ||
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -152,7 +152,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); | |||
| 152 | */ | 152 | */ |
| 153 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 153 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
| 154 | { | 154 | { |
| 155 | unsigned long free, allowed, reserve; | 155 | long free, allowed, reserve; |
| 156 | 156 | ||
| 157 | VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < | 157 | VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < |
| 158 | -(s64)vm_committed_as_batch * num_online_cpus(), | 158 | -(s64)vm_committed_as_batch * num_online_cpus(), |
| @@ -220,7 +220,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
| 220 | */ | 220 | */ |
| 221 | if (mm) { | 221 | if (mm) { |
| 222 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); | 222 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); |
| 223 | allowed -= min(mm->total_vm / 32, reserve); | 223 | allowed -= min_t(long, mm->total_vm / 32, reserve); |
| 224 | } | 224 | } |
| 225 | 225 | ||
| 226 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) | 226 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
| @@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
| 243 | mapping_unmap_writable(mapping); | 243 | mapping_unmap_writable(mapping); |
| 244 | 244 | ||
| 245 | flush_dcache_mmap_lock(mapping); | 245 | flush_dcache_mmap_lock(mapping); |
| 246 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 246 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
| 247 | list_del_init(&vma->shared.nonlinear); | ||
| 248 | else | ||
| 249 | vma_interval_tree_remove(vma, &mapping->i_mmap); | ||
| 250 | flush_dcache_mmap_unlock(mapping); | 247 | flush_dcache_mmap_unlock(mapping); |
| 251 | } | 248 | } |
| 252 | 249 | ||
| @@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma) | |||
| 649 | atomic_inc(&mapping->i_mmap_writable); | 646 | atomic_inc(&mapping->i_mmap_writable); |
| 650 | 647 | ||
| 651 | flush_dcache_mmap_lock(mapping); | 648 | flush_dcache_mmap_lock(mapping); |
| 652 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 649 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
| 653 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | ||
| 654 | else | ||
| 655 | vma_interval_tree_insert(vma, &mapping->i_mmap); | ||
| 656 | flush_dcache_mmap_unlock(mapping); | 650 | flush_dcache_mmap_unlock(mapping); |
| 657 | } | 651 | } |
| 658 | } | 652 | } |
| @@ -789,14 +783,11 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 789 | 783 | ||
| 790 | if (file) { | 784 | if (file) { |
| 791 | mapping = file->f_mapping; | 785 | mapping = file->f_mapping; |
| 792 | if (!(vma->vm_flags & VM_NONLINEAR)) { | 786 | root = &mapping->i_mmap; |
| 793 | root = &mapping->i_mmap; | 787 | uprobe_munmap(vma, vma->vm_start, vma->vm_end); |
| 794 | uprobe_munmap(vma, vma->vm_start, vma->vm_end); | ||
| 795 | 788 | ||
| 796 | if (adjust_next) | 789 | if (adjust_next) |
| 797 | uprobe_munmap(next, next->vm_start, | 790 | uprobe_munmap(next, next->vm_start, next->vm_end); |
| 798 | next->vm_end); | ||
| 799 | } | ||
| 800 | 791 | ||
| 801 | i_mmap_lock_write(mapping); | 792 | i_mmap_lock_write(mapping); |
| 802 | if (insert) { | 793 | if (insert) { |
| @@ -2634,6 +2625,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) | |||
| 2634 | return vm_munmap(addr, len); | 2625 | return vm_munmap(addr, len); |
| 2635 | } | 2626 | } |
| 2636 | 2627 | ||
| 2628 | |||
| 2629 | /* | ||
| 2630 | * Emulation of deprecated remap_file_pages() syscall. | ||
| 2631 | */ | ||
| 2632 | SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | ||
| 2633 | unsigned long, prot, unsigned long, pgoff, unsigned long, flags) | ||
| 2634 | { | ||
| 2635 | |||
| 2636 | struct mm_struct *mm = current->mm; | ||
| 2637 | struct vm_area_struct *vma; | ||
| 2638 | unsigned long populate = 0; | ||
| 2639 | unsigned long ret = -EINVAL; | ||
| 2640 | struct file *file; | ||
| 2641 | |||
| 2642 | pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " | ||
| 2643 | "See Documentation/vm/remap_file_pages.txt.\n", | ||
| 2644 | current->comm, current->pid); | ||
| 2645 | |||
| 2646 | if (prot) | ||
| 2647 | return ret; | ||
| 2648 | start = start & PAGE_MASK; | ||
| 2649 | size = size & PAGE_MASK; | ||
| 2650 | |||
| 2651 | if (start + size <= start) | ||
| 2652 | return ret; | ||
| 2653 | |||
| 2654 | /* Does pgoff wrap? */ | ||
| 2655 | if (pgoff + (size >> PAGE_SHIFT) < pgoff) | ||
| 2656 | return ret; | ||
| 2657 | |||
| 2658 | down_write(&mm->mmap_sem); | ||
| 2659 | vma = find_vma(mm, start); | ||
| 2660 | |||
| 2661 | if (!vma || !(vma->vm_flags & VM_SHARED)) | ||
| 2662 | goto out; | ||
| 2663 | |||
| 2664 | if (start < vma->vm_start || start + size > vma->vm_end) | ||
| 2665 | goto out; | ||
| 2666 | |||
| 2667 | if (pgoff == linear_page_index(vma, start)) { | ||
| 2668 | ret = 0; | ||
| 2669 | goto out; | ||
| 2670 | } | ||
| 2671 | |||
| 2672 | prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; | ||
| 2673 | prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; | ||
| 2674 | prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; | ||
| 2675 | |||
| 2676 | flags &= MAP_NONBLOCK; | ||
| 2677 | flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; | ||
| 2678 | if (vma->vm_flags & VM_LOCKED) { | ||
| 2679 | flags |= MAP_LOCKED; | ||
| 2680 | /* drop PG_Mlocked flag for over-mapped range */ | ||
| 2681 | munlock_vma_pages_range(vma, start, start + size); | ||
| 2682 | } | ||
| 2683 | |||
| 2684 | file = get_file(vma->vm_file); | ||
| 2685 | ret = do_mmap_pgoff(vma->vm_file, start, size, | ||
| 2686 | prot, flags, pgoff, &populate); | ||
| 2687 | fput(file); | ||
| 2688 | out: | ||
| 2689 | up_write(&mm->mmap_sem); | ||
| 2690 | if (populate) | ||
| 2691 | mm_populate(ret, populate); | ||
| 2692 | if (!IS_ERR_VALUE(ret)) | ||
| 2693 | ret = 0; | ||
| 2694 | return ret; | ||
| 2695 | } | ||
| 2696 | |||
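To illustrate what the emulation above preserves, here is a hedged userspace sketch of the deprecated call and the plain mmap() that now happens under the hood; the file name, size and page offsets are invented for the example:

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("data.bin", O_RDWR);	/* hypothetical file of at least 4 pages */
	char *base;

	if (fd < 0)
		return 1;
	base = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return 1;

	/* Deprecated interface: rebind the first page of the window to file page 3. */
	remap_file_pages(base, page, 0, 3, 0);

	/* Equivalent effect of the emulation: a MAP_FIXED remap of the same
	 * file at the requested page offset over that sub-range. */
	mmap(base, page, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED,
	     fd, 3 * page);

	munmap(base, 4 * page);
	close(fd);
	return 0;
}
```

Both calls leave the first page of the window backed by file page 3; the emulation simply routes the old request through do_mmap_pgoff() as a fresh MAP_FIXED mapping instead of rewriting PTEs in place.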
| 2637 | static inline void verify_mm_writelocked(struct mm_struct *mm) | 2697 | static inline void verify_mm_writelocked(struct mm_struct *mm) |
| 2638 | { | 2698 | { |
| 2639 | #ifdef CONFIG_DEBUG_VM | 2699 | #ifdef CONFIG_DEBUG_VM |
| @@ -2791,9 +2851,6 @@ void exit_mmap(struct mm_struct *mm) | |||
| 2791 | vma = remove_vma(vma); | 2851 | vma = remove_vma(vma); |
| 2792 | } | 2852 | } |
| 2793 | vm_unacct_memory(nr_accounted); | 2853 | vm_unacct_memory(nr_accounted); |
| 2794 | |||
| 2795 | WARN_ON(atomic_long_read(&mm->nr_ptes) > | ||
| 2796 | (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | ||
| 2797 | } | 2854 | } |
| 2798 | 2855 | ||
| 2799 | /* Insert vm structure into process list sorted by address | 2856 | /* Insert vm structure into process list sorted by address |
| @@ -3108,8 +3165,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
| 3108 | * | 3165 | * |
| 3109 | * mmap_sem in write mode is required in order to block all operations | 3166 | * mmap_sem in write mode is required in order to block all operations |
| 3110 | * that could modify pagetables and free pages without need of | 3167 | * that could modify pagetables and free pages without need of |
| 3111 | * altering the vma layout (for example populate_range() with | 3168 | * altering the vma layout. It's also needed in write mode to avoid new |
| 3112 | * nonlinear vmas). It's also needed in write mode to avoid new | ||
| 3113 | * anon_vmas to be associated with existing vmas. | 3169 | * anon_vmas to be associated with existing vmas. |
| 3114 | * | 3170 | * |
| 3115 | * A single task can't take more than one mm_take_all_locks() in a row | 3171 | * A single task can't take more than one mm_take_all_locks() in a row |
diff --git a/mm/mmzone.c b/mm/mmzone.c index bf34fb8556db..7d87ebb0d632 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
| @@ -54,8 +54,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) | |||
| 54 | /* Returns the next zone at or below highest_zoneidx in a zonelist */ | 54 | /* Returns the next zone at or below highest_zoneidx in a zonelist */ |
| 55 | struct zoneref *next_zones_zonelist(struct zoneref *z, | 55 | struct zoneref *next_zones_zonelist(struct zoneref *z, |
| 56 | enum zone_type highest_zoneidx, | 56 | enum zone_type highest_zoneidx, |
| 57 | nodemask_t *nodes, | 57 | nodemask_t *nodes) |
| 58 | struct zone **zone) | ||
| 59 | { | 58 | { |
| 60 | /* | 59 | /* |
| 61 | * Find the next suitable zone to use for the allocation. | 60 | * Find the next suitable zone to use for the allocation. |
| @@ -69,7 +68,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, | |||
| 69 | (z->zone && !zref_in_nodemask(z, nodes))) | 68 | (z->zone && !zref_in_nodemask(z, nodes))) |
| 70 | z++; | 69 | z++; |
| 71 | 70 | ||
| 72 | *zone = zonelist_zone(z); | ||
| 73 | return z; | 71 | return z; |
| 74 | } | 72 | } |
| 75 | 73 | ||
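With the struct zone ** out-parameter dropped, callers obtain the zone from the returned zoneref instead. A hedged caller-side sketch (the helper name is hypothetical; it mirrors what first_zones_zonelist() becomes and is only meaningful inside kernel context):

```c
#include <linux/mmzone.h>
#include <linux/nodemask.h>

/* Hypothetical caller after the API change: the zone is read from the
 * returned zoneref via zonelist_zone() instead of being written through a
 * struct zone ** argument. */
static struct zone *first_usable_zone(struct zonelist *zonelist,
				      enum zone_type highest_zoneidx,
				      nodemask_t *nodes)
{
	struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs,
						highest_zoneidx, nodes);

	return zonelist_zone(z);	/* NULL when no suitable zone exists */
}
```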
diff --git a/mm/mprotect.c b/mm/mprotect.c index ace93454ce8e..44727811bf4c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -75,37 +75,35 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 75 | oldpte = *pte; | 75 | oldpte = *pte; |
| 76 | if (pte_present(oldpte)) { | 76 | if (pte_present(oldpte)) { |
| 77 | pte_t ptent; | 77 | pte_t ptent; |
| 78 | bool updated = false; | ||
| 79 | 78 | ||
| 80 | if (!prot_numa) { | 79 | /* |
| 81 | ptent = ptep_modify_prot_start(mm, addr, pte); | 80 | * Avoid trapping faults against the zero or KSM |
| 82 | if (pte_numa(ptent)) | 81 | * pages. See similar comment in change_huge_pmd. |
| 83 | ptent = pte_mknonnuma(ptent); | 82 | */ |
| 84 | ptent = pte_modify(ptent, newprot); | 83 | if (prot_numa) { |
| 85 | /* | ||
| 86 | * Avoid taking write faults for pages we | ||
| 87 | * know to be dirty. | ||
| 88 | */ | ||
| 89 | if (dirty_accountable && pte_dirty(ptent) && | ||
| 90 | (pte_soft_dirty(ptent) || | ||
| 91 | !(vma->vm_flags & VM_SOFTDIRTY))) | ||
| 92 | ptent = pte_mkwrite(ptent); | ||
| 93 | ptep_modify_prot_commit(mm, addr, pte, ptent); | ||
| 94 | updated = true; | ||
| 95 | } else { | ||
| 96 | struct page *page; | 84 | struct page *page; |
| 97 | 85 | ||
| 98 | page = vm_normal_page(vma, addr, oldpte); | 86 | page = vm_normal_page(vma, addr, oldpte); |
| 99 | if (page && !PageKsm(page)) { | 87 | if (!page || PageKsm(page)) |
| 100 | if (!pte_numa(oldpte)) { | 88 | continue; |
| 101 | ptep_set_numa(mm, addr, pte); | 89 | |
| 102 | updated = true; | 90 | /* Avoid TLB flush if possible */ |
| 103 | } | 91 | if (pte_protnone(oldpte)) |
| 104 | } | 92 | continue; |
| 105 | } | 93 | } |
| 106 | if (updated) | 94 | |
| 107 | pages++; | 95 | ptent = ptep_modify_prot_start(mm, addr, pte); |
| 108 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { | 96 | ptent = pte_modify(ptent, newprot); |
| 97 | |||
| 98 | /* Avoid taking write faults for known dirty pages */ | ||
| 99 | if (dirty_accountable && pte_dirty(ptent) && | ||
| 100 | (pte_soft_dirty(ptent) || | ||
| 101 | !(vma->vm_flags & VM_SOFTDIRTY))) { | ||
| 102 | ptent = pte_mkwrite(ptent); | ||
| 103 | } | ||
| 104 | ptep_modify_prot_commit(mm, addr, pte, ptent); | ||
| 105 | pages++; | ||
| 106 | } else if (IS_ENABLED(CONFIG_MIGRATION)) { | ||
| 109 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 107 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
| 110 | 108 | ||
| 111 | if (is_write_migration_entry(entry)) { | 109 | if (is_write_migration_entry(entry)) { |
diff --git a/mm/mremap.c b/mm/mremap.c index 17fa018f5f39..57dadc025c64 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte) | |||
| 81 | pte = pte_mksoft_dirty(pte); | 81 | pte = pte_mksoft_dirty(pte); |
| 82 | else if (is_swap_pte(pte)) | 82 | else if (is_swap_pte(pte)) |
| 83 | pte = pte_swp_mksoft_dirty(pte); | 83 | pte = pte_swp_mksoft_dirty(pte); |
| 84 | else if (pte_file(pte)) | ||
| 85 | pte = pte_file_mksoft_dirty(pte); | ||
| 86 | #endif | 84 | #endif |
| 87 | return pte; | 85 | return pte; |
| 88 | } | 86 | } |
diff --git a/mm/msync.c b/mm/msync.c index 992a1673d488..bb04d53ae852 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
| @@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
| 86 | (vma->vm_flags & VM_SHARED)) { | 86 | (vma->vm_flags & VM_SHARED)) { |
| 87 | get_file(file); | 87 | get_file(file); |
| 88 | up_read(&mm->mmap_sem); | 88 | up_read(&mm->mmap_sem); |
| 89 | if (vma->vm_flags & VM_NONLINEAR) | 89 | error = vfs_fsync_range(file, fstart, fend, 1); |
| 90 | error = vfs_fsync(file, 1); | ||
| 91 | else | ||
| 92 | error = vfs_fsync_range(file, fstart, fend, 1); | ||
| 93 | fput(file); | 90 | fput(file); |
| 94 | if (error || start >= end) | 91 | if (error || start >= end) |
| 95 | goto out; | 92 | goto out; |
diff --git a/mm/nommu.c b/mm/nommu.c index 28bd8c4dff6f..3e67e7538ecf 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -214,6 +214,39 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 214 | } | 214 | } |
| 215 | EXPORT_SYMBOL(get_user_pages); | 215 | EXPORT_SYMBOL(get_user_pages); |
| 216 | 216 | ||
| 217 | long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 218 | unsigned long start, unsigned long nr_pages, | ||
| 219 | int write, int force, struct page **pages, | ||
| 220 | int *locked) | ||
| 221 | { | ||
| 222 | return get_user_pages(tsk, mm, start, nr_pages, write, force, | ||
| 223 | pages, NULL); | ||
| 224 | } | ||
| 225 | EXPORT_SYMBOL(get_user_pages_locked); | ||
| 226 | |||
| 227 | long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 228 | unsigned long start, unsigned long nr_pages, | ||
| 229 | int write, int force, struct page **pages, | ||
| 230 | unsigned int gup_flags) | ||
| 231 | { | ||
| 232 | long ret; | ||
| 233 | down_read(&mm->mmap_sem); | ||
| 234 | ret = get_user_pages(tsk, mm, start, nr_pages, write, force, | ||
| 235 | pages, NULL); | ||
| 236 | up_read(&mm->mmap_sem); | ||
| 237 | return ret; | ||
| 238 | } | ||
| 239 | EXPORT_SYMBOL(__get_user_pages_unlocked); | ||
| 240 | |||
| 241 | long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 242 | unsigned long start, unsigned long nr_pages, | ||
| 243 | int write, int force, struct page **pages) | ||
| 244 | { | ||
| 245 | return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, | ||
| 246 | force, pages, 0); | ||
| 247 | } | ||
| 248 | EXPORT_SYMBOL(get_user_pages_unlocked); | ||
| 249 | |||
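A hedged sketch of how an in-kernel caller might use the new unlocked variant added above; the helper name and buffer size are assumptions, while the signature is taken from the stubs themselves:

```c
#include <linux/mm.h>
#include <linux/sched.h>

/* Hypothetical caller of the new helper: get_user_pages_unlocked() takes and
 * drops mmap_sem itself, so the caller must not already hold it.  The buffer
 * size and the write/force arguments are arbitrary for the example. */
static long pin_user_buffer(unsigned long user_addr, struct page **pages)
{
	/* On success the caller later releases each pinned page with put_page(). */
	return get_user_pages_unlocked(current, current->mm, user_addr,
				       8, 1 /* write */, 0 /* force */, pages);
}
```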
| 217 | /** | 250 | /** |
| 218 | * follow_pfn - look up PFN at a user virtual address | 251 | * follow_pfn - look up PFN at a user virtual address |
| 219 | * @vma: memory mapping | 252 | * @vma: memory mapping |
| @@ -947,9 +980,6 @@ static int validate_mmap_request(struct file *file, | |||
| 947 | return -EOVERFLOW; | 980 | return -EOVERFLOW; |
| 948 | 981 | ||
| 949 | if (file) { | 982 | if (file) { |
| 950 | /* validate file mapping requests */ | ||
| 951 | struct address_space *mapping; | ||
| 952 | |||
| 953 | /* files must support mmap */ | 983 | /* files must support mmap */ |
| 954 | if (!file->f_op->mmap) | 984 | if (!file->f_op->mmap) |
| 955 | return -ENODEV; | 985 | return -ENODEV; |
| @@ -958,28 +988,22 @@ static int validate_mmap_request(struct file *file, | |||
| 958 | * - we support chardevs that provide their own "memory" | 988 | * - we support chardevs that provide their own "memory" |
| 959 | * - we support files/blockdevs that are memory backed | 989 | * - we support files/blockdevs that are memory backed |
| 960 | */ | 990 | */ |
| 961 | mapping = file->f_mapping; | 991 | if (file->f_op->mmap_capabilities) { |
| 962 | if (!mapping) | 992 | capabilities = file->f_op->mmap_capabilities(file); |
| 963 | mapping = file_inode(file)->i_mapping; | 993 | } else { |
| 964 | |||
| 965 | capabilities = 0; | ||
| 966 | if (mapping && mapping->backing_dev_info) | ||
| 967 | capabilities = mapping->backing_dev_info->capabilities; | ||
| 968 | |||
| 969 | if (!capabilities) { | ||
| 970 | /* no explicit capabilities set, so assume some | 994 | /* no explicit capabilities set, so assume some |
| 971 | * defaults */ | 995 | * defaults */ |
| 972 | switch (file_inode(file)->i_mode & S_IFMT) { | 996 | switch (file_inode(file)->i_mode & S_IFMT) { |
| 973 | case S_IFREG: | 997 | case S_IFREG: |
| 974 | case S_IFBLK: | 998 | case S_IFBLK: |
| 975 | capabilities = BDI_CAP_MAP_COPY; | 999 | capabilities = NOMMU_MAP_COPY; |
| 976 | break; | 1000 | break; |
| 977 | 1001 | ||
| 978 | case S_IFCHR: | 1002 | case S_IFCHR: |
| 979 | capabilities = | 1003 | capabilities = |
| 980 | BDI_CAP_MAP_DIRECT | | 1004 | NOMMU_MAP_DIRECT | |
| 981 | BDI_CAP_READ_MAP | | 1005 | NOMMU_MAP_READ | |
| 982 | BDI_CAP_WRITE_MAP; | 1006 | NOMMU_MAP_WRITE; |
| 983 | break; | 1007 | break; |
| 984 | 1008 | ||
| 985 | default: | 1009 | default: |
| @@ -990,9 +1014,9 @@ static int validate_mmap_request(struct file *file, | |||
| 990 | /* eliminate any capabilities that we can't support on this | 1014 | /* eliminate any capabilities that we can't support on this |
| 991 | * device */ | 1015 | * device */ |
| 992 | if (!file->f_op->get_unmapped_area) | 1016 | if (!file->f_op->get_unmapped_area) |
| 993 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1017 | capabilities &= ~NOMMU_MAP_DIRECT; |
| 994 | if (!file->f_op->read) | 1018 | if (!file->f_op->read) |
| 995 | capabilities &= ~BDI_CAP_MAP_COPY; | 1019 | capabilities &= ~NOMMU_MAP_COPY; |
| 996 | 1020 | ||
| 997 | /* The file shall have been opened with read permission. */ | 1021 | /* The file shall have been opened with read permission. */ |
| 998 | if (!(file->f_mode & FMODE_READ)) | 1022 | if (!(file->f_mode & FMODE_READ)) |
| @@ -1011,29 +1035,29 @@ static int validate_mmap_request(struct file *file, | |||
| 1011 | if (locks_verify_locked(file)) | 1035 | if (locks_verify_locked(file)) |
| 1012 | return -EAGAIN; | 1036 | return -EAGAIN; |
| 1013 | 1037 | ||
| 1014 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 1038 | if (!(capabilities & NOMMU_MAP_DIRECT)) |
| 1015 | return -ENODEV; | 1039 | return -ENODEV; |
| 1016 | 1040 | ||
| 1017 | /* we mustn't privatise shared mappings */ | 1041 | /* we mustn't privatise shared mappings */ |
| 1018 | capabilities &= ~BDI_CAP_MAP_COPY; | 1042 | capabilities &= ~NOMMU_MAP_COPY; |
| 1019 | } else { | 1043 | } else { |
| 1020 | /* we're going to read the file into private memory we | 1044 | /* we're going to read the file into private memory we |
| 1021 | * allocate */ | 1045 | * allocate */ |
| 1022 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 1046 | if (!(capabilities & NOMMU_MAP_COPY)) |
| 1023 | return -ENODEV; | 1047 | return -ENODEV; |
| 1024 | 1048 | ||
| 1025 | /* we don't permit a private writable mapping to be | 1049 | /* we don't permit a private writable mapping to be |
| 1026 | * shared with the backing device */ | 1050 | * shared with the backing device */ |
| 1027 | if (prot & PROT_WRITE) | 1051 | if (prot & PROT_WRITE) |
| 1028 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1052 | capabilities &= ~NOMMU_MAP_DIRECT; |
| 1029 | } | 1053 | } |
| 1030 | 1054 | ||
| 1031 | if (capabilities & BDI_CAP_MAP_DIRECT) { | 1055 | if (capabilities & NOMMU_MAP_DIRECT) { |
| 1032 | if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || | 1056 | if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) || |
| 1033 | ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || | 1057 | ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) || |
| 1034 | ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) | 1058 | ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC)) |
| 1035 | ) { | 1059 | ) { |
| 1036 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1060 | capabilities &= ~NOMMU_MAP_DIRECT; |
| 1037 | if (flags & MAP_SHARED) { | 1061 | if (flags & MAP_SHARED) { |
| 1038 | printk(KERN_WARNING | 1062 | printk(KERN_WARNING |
| 1039 | "MAP_SHARED not completely supported on !MMU\n"); | 1063 | "MAP_SHARED not completely supported on !MMU\n"); |
| @@ -1050,21 +1074,21 @@ static int validate_mmap_request(struct file *file, | |||
| 1050 | } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { | 1074 | } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { |
| 1051 | /* handle implication of PROT_EXEC by PROT_READ */ | 1075 | /* handle implication of PROT_EXEC by PROT_READ */ |
| 1052 | if (current->personality & READ_IMPLIES_EXEC) { | 1076 | if (current->personality & READ_IMPLIES_EXEC) { |
| 1053 | if (capabilities & BDI_CAP_EXEC_MAP) | 1077 | if (capabilities & NOMMU_MAP_EXEC) |
| 1054 | prot |= PROT_EXEC; | 1078 | prot |= PROT_EXEC; |
| 1055 | } | 1079 | } |
| 1056 | } else if ((prot & PROT_READ) && | 1080 | } else if ((prot & PROT_READ) && |
| 1057 | (prot & PROT_EXEC) && | 1081 | (prot & PROT_EXEC) && |
| 1058 | !(capabilities & BDI_CAP_EXEC_MAP) | 1082 | !(capabilities & NOMMU_MAP_EXEC) |
| 1059 | ) { | 1083 | ) { |
| 1060 | /* backing file is not executable, try to copy */ | 1084 | /* backing file is not executable, try to copy */ |
| 1061 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1085 | capabilities &= ~NOMMU_MAP_DIRECT; |
| 1062 | } | 1086 | } |
| 1063 | } else { | 1087 | } else { |
| 1064 | /* anonymous mappings are always memory backed and can be | 1088 | /* anonymous mappings are always memory backed and can be |
| 1065 | * privately mapped | 1089 | * privately mapped |
| 1066 | */ | 1090 | */ |
| 1067 | capabilities = BDI_CAP_MAP_COPY; | 1091 | capabilities = NOMMU_MAP_COPY; |
| 1068 | 1092 | ||
| 1069 | /* handle PROT_EXEC implication by PROT_READ */ | 1093 | /* handle PROT_EXEC implication by PROT_READ */ |
| 1070 | if ((prot & PROT_READ) && | 1094 | if ((prot & PROT_READ) && |
| @@ -1096,7 +1120,7 @@ static unsigned long determine_vm_flags(struct file *file, | |||
| 1096 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); | 1120 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); |
| 1097 | /* vm_flags |= mm->def_flags; */ | 1121 | /* vm_flags |= mm->def_flags; */ |
| 1098 | 1122 | ||
| 1099 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) { | 1123 | if (!(capabilities & NOMMU_MAP_DIRECT)) { |
| 1100 | /* attempt to share read-only copies of mapped file chunks */ | 1124 | /* attempt to share read-only copies of mapped file chunks */ |
| 1101 | vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | 1125 | vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; |
| 1102 | if (file && !(prot & PROT_WRITE)) | 1126 | if (file && !(prot & PROT_WRITE)) |
| @@ -1105,7 +1129,7 @@ static unsigned long determine_vm_flags(struct file *file, | |||
| 1105 | /* overlay a shareable mapping on the backing device or inode | 1129 | /* overlay a shareable mapping on the backing device or inode |
| 1106 | * if possible - used for chardevs, ramfs/tmpfs/shmfs and | 1130 | * if possible - used for chardevs, ramfs/tmpfs/shmfs and |
| 1107 | * romfs/cramfs */ | 1131 | * romfs/cramfs */ |
| 1108 | vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); | 1132 | vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); |
| 1109 | if (flags & MAP_SHARED) | 1133 | if (flags & MAP_SHARED) |
| 1110 | vm_flags |= VM_SHARED; | 1134 | vm_flags |= VM_SHARED; |
| 1111 | } | 1135 | } |
| @@ -1158,7 +1182,7 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
| 1158 | * shared mappings on devices or memory | 1182 | * shared mappings on devices or memory |
| 1159 | * - VM_MAYSHARE will be set if it may attempt to share | 1183 | * - VM_MAYSHARE will be set if it may attempt to share |
| 1160 | */ | 1184 | */ |
| 1161 | if (capabilities & BDI_CAP_MAP_DIRECT) { | 1185 | if (capabilities & NOMMU_MAP_DIRECT) { |
| 1162 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1186 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
| 1163 | if (ret == 0) { | 1187 | if (ret == 0) { |
| 1164 | /* shouldn't return success if we're not sharing */ | 1188 | /* shouldn't return success if we're not sharing */ |
| @@ -1189,11 +1213,9 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
| 1189 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { | 1213 | if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { |
| 1190 | total = point; | 1214 | total = point; |
| 1191 | kdebug("try to alloc exact %lu pages", total); | 1215 | kdebug("try to alloc exact %lu pages", total); |
| 1192 | base = alloc_pages_exact(len, GFP_KERNEL); | ||
| 1193 | } else { | ||
| 1194 | base = (void *)__get_free_pages(GFP_KERNEL, order); | ||
| 1195 | } | 1216 | } |
| 1196 | 1217 | ||
| 1218 | base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); | ||
| 1197 | if (!base) | 1219 | if (!base) |
| 1198 | goto enomem; | 1220 | goto enomem; |
| 1199 | 1221 | ||
| @@ -1347,7 +1369,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1347 | if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && | 1369 | if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && |
| 1348 | !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { | 1370 | !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { |
| 1349 | /* new mapping is not a subset of the region */ | 1371 | /* new mapping is not a subset of the region */ |
| 1350 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 1372 | if (!(capabilities & NOMMU_MAP_DIRECT)) |
| 1351 | goto sharing_violation; | 1373 | goto sharing_violation; |
| 1352 | continue; | 1374 | continue; |
| 1353 | } | 1375 | } |
| @@ -1386,7 +1408,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1386 | * - this is the hook for quasi-memory character devices to | 1408 | * - this is the hook for quasi-memory character devices to |
| 1387 | * tell us the location of a shared mapping | 1409 | * tell us the location of a shared mapping |
| 1388 | */ | 1410 | */ |
| 1389 | if (capabilities & BDI_CAP_MAP_DIRECT) { | 1411 | if (capabilities & NOMMU_MAP_DIRECT) { |
| 1390 | addr = file->f_op->get_unmapped_area(file, addr, len, | 1412 | addr = file->f_op->get_unmapped_area(file, addr, len, |
| 1391 | pgoff, flags); | 1413 | pgoff, flags); |
| 1392 | if (IS_ERR_VALUE(addr)) { | 1414 | if (IS_ERR_VALUE(addr)) { |
| @@ -1398,10 +1420,10 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1398 | * the mapping so we'll have to attempt to copy | 1420 | * the mapping so we'll have to attempt to copy |
| 1399 | * it */ | 1421 | * it */ |
| 1400 | ret = -ENODEV; | 1422 | ret = -ENODEV; |
| 1401 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 1423 | if (!(capabilities & NOMMU_MAP_COPY)) |
| 1402 | goto error_just_free; | 1424 | goto error_just_free; |
| 1403 | 1425 | ||
| 1404 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1426 | capabilities &= ~NOMMU_MAP_DIRECT; |
| 1405 | } else { | 1427 | } else { |
| 1406 | vma->vm_start = region->vm_start = addr; | 1428 | vma->vm_start = region->vm_start = addr; |
| 1407 | vma->vm_end = region->vm_end = addr + len; | 1429 | vma->vm_end = region->vm_end = addr + len; |
| @@ -1412,7 +1434,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1412 | vma->vm_region = region; | 1434 | vma->vm_region = region; |
| 1413 | 1435 | ||
| 1414 | /* set up the mapping | 1436 | /* set up the mapping |
| 1415 | * - the region is filled in if BDI_CAP_MAP_DIRECT is still set | 1437 | * - the region is filled in if NOMMU_MAP_DIRECT is still set |
| 1416 | */ | 1438 | */ |
| 1417 | if (file && vma->vm_flags & VM_SHARED) | 1439 | if (file && vma->vm_flags & VM_SHARED) |
| 1418 | ret = do_mmap_shared_file(vma); | 1440 | ret = do_mmap_shared_file(vma); |
| @@ -1895,7 +1917,7 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
| 1895 | */ | 1917 | */ |
| 1896 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 1918 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
| 1897 | { | 1919 | { |
| 1898 | unsigned long free, allowed, reserve; | 1920 | long free, allowed, reserve; |
| 1899 | 1921 | ||
| 1900 | vm_acct_memory(pages); | 1922 | vm_acct_memory(pages); |
| 1901 | 1923 | ||
| @@ -1959,7 +1981,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
| 1959 | */ | 1981 | */ |
| 1960 | if (mm) { | 1982 | if (mm) { |
| 1961 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); | 1983 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); |
| 1962 | allowed -= min(mm->total_vm / 32, reserve); | 1984 | allowed -= min_t(long, mm->total_vm / 32, reserve); |
| 1963 | } | 1985 | } |
| 1964 | 1986 | ||
| 1965 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) | 1987 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
| @@ -1984,14 +2006,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1984 | } | 2006 | } |
| 1985 | EXPORT_SYMBOL(filemap_map_pages); | 2007 | EXPORT_SYMBOL(filemap_map_pages); |
| 1986 | 2008 | ||
| 1987 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | ||
| 1988 | unsigned long size, pgoff_t pgoff) | ||
| 1989 | { | ||
| 1990 | BUG(); | ||
| 1991 | return 0; | ||
| 1992 | } | ||
| 1993 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
| 1994 | |||
| 1995 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | 2009 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
| 1996 | unsigned long addr, void *buf, int len, int write) | 2010 | unsigned long addr, void *buf, int len, int write) |
| 1997 | { | 2011 | { |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d503e9ce1c7b..642f38cb175a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
| 169 | * The baseline for the badness score is the proportion of RAM that each | 169 | * The baseline for the badness score is the proportion of RAM that each |
| 170 | * task's rss, pagetable and swap space use. | 170 | * task's rss, pagetable and swap space use. |
| 171 | */ | 171 | */ |
| 172 | points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + | 172 | points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + |
| 173 | get_mm_counter(p->mm, MM_SWAPENTS); | 173 | atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); |
| 174 | task_unlock(p); | 174 | task_unlock(p); |
| 175 | 175 | ||
| 176 | /* | 176 | /* |
| @@ -266,8 +266,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
| 266 | * Don't allow any other task to have access to the reserves. | 266 | * Don't allow any other task to have access to the reserves. |
| 267 | */ | 267 | */ |
| 268 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { | 268 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { |
| 269 | if (unlikely(frozen(task))) | ||
| 270 | __thaw_task(task); | ||
| 271 | if (!force_kill) | 269 | if (!force_kill) |
| 272 | return OOM_SCAN_ABORT; | 270 | return OOM_SCAN_ABORT; |
| 273 | } | 271 | } |
| @@ -353,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
| 353 | struct task_struct *p; | 351 | struct task_struct *p; |
| 354 | struct task_struct *task; | 352 | struct task_struct *task; |
| 355 | 353 | ||
| 356 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); | 354 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); |
| 357 | rcu_read_lock(); | 355 | rcu_read_lock(); |
| 358 | for_each_process(p) { | 356 | for_each_process(p) { |
| 359 | if (oom_unkillable_task(p, memcg, nodemask)) | 357 | if (oom_unkillable_task(p, memcg, nodemask)) |
| @@ -369,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
| 369 | continue; | 367 | continue; |
| 370 | } | 368 | } |
| 371 | 369 | ||
| 372 | pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", | 370 | pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", |
| 373 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 371 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
| 374 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 372 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
| 375 | atomic_long_read(&task->mm->nr_ptes), | 373 | atomic_long_read(&task->mm->nr_ptes), |
| 374 | mm_nr_pmds(task->mm), | ||
| 376 | get_mm_counter(task->mm, MM_SWAPENTS), | 375 | get_mm_counter(task->mm, MM_SWAPENTS), |
| 377 | task->signal->oom_score_adj, task->comm); | 376 | task->signal->oom_score_adj, task->comm); |
| 378 | task_unlock(task); | 377 | task_unlock(task); |
| @@ -400,20 +399,98 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 400 | } | 399 | } |
| 401 | 400 | ||
| 402 | /* | 401 | /* |
| 403 | * Number of OOM killer invocations (including memcg OOM killer). | 402 | * Number of OOM victims in flight |
| 404 | * Primarily used by PM freezer to check for potential races with | ||
| 405 | * OOM killed frozen task. | ||
| 406 | */ | 403 | */ |
| 407 | static atomic_t oom_kills = ATOMIC_INIT(0); | 404 | static atomic_t oom_victims = ATOMIC_INIT(0); |
| 405 | static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); | ||
| 408 | 406 | ||
| 409 | int oom_kills_count(void) | 407 | bool oom_killer_disabled __read_mostly; |
| 408 | static DECLARE_RWSEM(oom_sem); | ||
| 409 | |||
| 410 | /** | ||
| 411 | * mark_tsk_oom_victim - marks the given task as OOM victim. | ||
| 412 | * @tsk: task to mark | ||
| 413 | * | ||
| 414 | * Has to be called with oom_sem taken for read, and never after the | ||
| 415 | * OOM killer has already been disabled. | ||
| 416 | */ | ||
| 417 | void mark_tsk_oom_victim(struct task_struct *tsk) | ||
| 410 | { | 418 | { |
| 411 | return atomic_read(&oom_kills); | 419 | WARN_ON(oom_killer_disabled); |
| 420 | /* OOM killer might race with memcg OOM */ | ||
| 421 | if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) | ||
| 422 | return; | ||
| 423 | /* | ||
| 424 | * Make sure that the task is woken up from uninterruptible sleep | ||
| 425 | * if it is frozen, because otherwise the OOM killer would not be able to | ||
| 426 | * free any memory and would livelock. freezing_slow_path will tell the freezer | ||
| 427 | * that TIF_MEMDIE tasks should be ignored. | ||
| 428 | */ | ||
| 429 | __thaw_task(tsk); | ||
| 430 | atomic_inc(&oom_victims); | ||
| 431 | } | ||
| 432 | |||
| 433 | /** | ||
| 434 | * unmark_oom_victim - unmarks the current task as OOM victim. | ||
| 435 | * | ||
| 436 | * Wakes up all waiters in oom_killer_disable() | ||
| 437 | */ | ||
| 438 | void unmark_oom_victim(void) | ||
| 439 | { | ||
| 440 | if (!test_and_clear_thread_flag(TIF_MEMDIE)) | ||
| 441 | return; | ||
| 442 | |||
| 443 | down_read(&oom_sem); | ||
| 444 | /* | ||
| 445 | * There is no need to signal the last oom_victim if there | ||
| 446 | * is nobody who cares. | ||
| 447 | */ | ||
| 448 | if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) | ||
| 449 | wake_up_all(&oom_victims_wait); | ||
| 450 | up_read(&oom_sem); | ||
| 451 | } | ||
| 452 | |||
| 453 | /** | ||
| 454 | * oom_killer_disable - disable OOM killer | ||
| 455 | * | ||
| 456 | * Forces all page allocations to fail rather than trigger OOM killer. | ||
| 457 | * Will block and wait until all OOM victims are killed. | ||
| 458 | * | ||
| 459 | * The function cannot be called when there are runnable user tasks because | ||
| 460 | * userspace would see unexpected allocation failures as a result. Any | ||
| 461 | * new use of this function should be discussed with the MM maintainers. | ||
| 462 | * | ||
| 463 | * Returns true if successful and false if the OOM killer cannot be | ||
| 464 | * disabled. | ||
| 465 | */ | ||
| 466 | bool oom_killer_disable(void) | ||
| 467 | { | ||
| 468 | /* | ||
| 469 | * Make sure to not race with an ongoing OOM killer | ||
| 470 | * and that the current is not the victim. | ||
| 471 | */ | ||
| 472 | down_write(&oom_sem); | ||
| 473 | if (test_thread_flag(TIF_MEMDIE)) { | ||
| 474 | up_write(&oom_sem); | ||
| 475 | return false; | ||
| 476 | } | ||
| 477 | |||
| 478 | oom_killer_disabled = true; | ||
| 479 | up_write(&oom_sem); | ||
| 480 | |||
| 481 | wait_event(oom_victims_wait, !atomic_read(&oom_victims)); | ||
| 482 | |||
| 483 | return true; | ||
| 412 | } | 484 | } |
| 413 | 485 | ||
| 414 | void note_oom_kill(void) | 486 | /** |
| 487 | * oom_killer_enable - enable OOM killer | ||
| 488 | */ | ||
| 489 | void oom_killer_enable(void) | ||
| 415 | { | 490 | { |
| 416 | atomic_inc(&oom_kills); | 491 | down_write(&oom_sem); |
| 492 | oom_killer_disabled = false; | ||
| 493 | up_write(&oom_sem); | ||
| 417 | } | 494 | } |
| 418 | 495 | ||
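The disable/enable pair above is meant for paths such as suspend/hibernate, where no user task may run while the killer is off. A hedged sketch of the intended calling pattern (the function name and error handling are hypothetical):

```c
#include <linux/errno.h>
#include <linux/freezer.h>
#include <linux/oom.h>

/* Hypothetical suspend-side helper: freeze userspace first (so no runnable
 * user tasks remain), then turn the OOM killer off for the duration. */
static int freeze_and_disable_oom(void)
{
	int error = freeze_processes();

	if (error)
		return error;

	if (!oom_killer_disable()) {
		/* current is itself a racing OOM victim; back out */
		thaw_processes();
		return -EBUSY;
	}

	/* ... suspend work runs here; allocations fail instead of OOM killing ... */

	oom_killer_enable();
	thaw_processes();
	return 0;
}
```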
| 419 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 496 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
| @@ -438,11 +515,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 438 | * If the task is already exiting, don't alarm the sysadmin or kill | 515 | * If the task is already exiting, don't alarm the sysadmin or kill |
| 439 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 516 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
| 440 | */ | 517 | */ |
| 441 | if (task_will_free_mem(p)) { | 518 | task_lock(p); |
| 442 | set_tsk_thread_flag(p, TIF_MEMDIE); | 519 | if (p->mm && task_will_free_mem(p)) { |
| 520 | mark_tsk_oom_victim(p); | ||
| 521 | task_unlock(p); | ||
| 443 | put_task_struct(p); | 522 | put_task_struct(p); |
| 444 | return; | 523 | return; |
| 445 | } | 524 | } |
| 525 | task_unlock(p); | ||
| 446 | 526 | ||
| 447 | if (__ratelimit(&oom_rs)) | 527 | if (__ratelimit(&oom_rs)) |
| 448 | dump_header(p, gfp_mask, order, memcg, nodemask); | 528 | dump_header(p, gfp_mask, order, memcg, nodemask); |
| @@ -492,6 +572,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 492 | 572 | ||
| 493 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | 573 | /* mm cannot safely be dereferenced after task_unlock(victim) */ |
| 494 | mm = victim->mm; | 574 | mm = victim->mm; |
| 575 | mark_tsk_oom_victim(victim); | ||
| 495 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | 576 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
| 496 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), | 577 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), |
| 497 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), | 578 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), |
| @@ -522,7 +603,6 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 522 | } | 603 | } |
| 523 | rcu_read_unlock(); | 604 | rcu_read_unlock(); |
| 524 | 605 | ||
| 525 | set_tsk_thread_flag(victim, TIF_MEMDIE); | ||
| 526 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | 606 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); |
| 527 | put_task_struct(victim); | 607 | put_task_struct(victim); |
| 528 | } | 608 | } |
| @@ -611,7 +691,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
| 611 | } | 691 | } |
| 612 | 692 | ||
| 613 | /** | 693 | /** |
| 614 | * out_of_memory - kill the "best" process when we run out of memory | 694 | * __out_of_memory - kill the "best" process when we run out of memory |
| 615 | * @zonelist: zonelist pointer | 695 | * @zonelist: zonelist pointer |
| 616 | * @gfp_mask: memory allocation flags | 696 | * @gfp_mask: memory allocation flags |
| 617 | * @order: amount of memory being requested as a power of 2 | 697 | * @order: amount of memory being requested as a power of 2 |
| @@ -623,7 +703,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
| 623 | * OR try to be smart about which process to kill. Note that we | 703 | * OR try to be smart about which process to kill. Note that we |
| 624 | * don't have to be perfect here, we just have to be good. | 704 | * don't have to be perfect here, we just have to be good. |
| 625 | */ | 705 | */ |
| 626 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 706 | static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
| 627 | int order, nodemask_t *nodemask, bool force_kill) | 707 | int order, nodemask_t *nodemask, bool force_kill) |
| 628 | { | 708 | { |
| 629 | const nodemask_t *mpol_mask; | 709 | const nodemask_t *mpol_mask; |
| @@ -643,9 +723,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
| 643 | * If current has a pending SIGKILL or is exiting, then automatically | 723 | * If current has a pending SIGKILL or is exiting, then automatically |
| 644 | * select it. The goal is to allow it to allocate so that it may | 724 | * select it. The goal is to allow it to allocate so that it may |
| 645 | * quickly exit and free its memory. | 725 | * quickly exit and free its memory. |
| 726 | * | ||
| 727 | * But don't select if current has already released its mm and cleared | ||
| 728 | * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur. | ||
| 646 | */ | 729 | */ |
| 647 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { | 730 | if (current->mm && |
| 648 | set_thread_flag(TIF_MEMDIE); | 731 | (fatal_signal_pending(current) || task_will_free_mem(current))) { |
| 732 | mark_tsk_oom_victim(current); | ||
| 649 | return; | 733 | return; |
| 650 | } | 734 | } |
| 651 | 735 | ||
| @@ -688,6 +772,32 @@ out: | |||
| 688 | schedule_timeout_killable(1); | 772 | schedule_timeout_killable(1); |
| 689 | } | 773 | } |
| 690 | 774 | ||
| 775 | /** | ||
| 776 | * out_of_memory - tries to invoke OOM killer. | ||
| 777 | * @zonelist: zonelist pointer | ||
| 778 | * @gfp_mask: memory allocation flags | ||
| 779 | * @order: amount of memory being requested as a power of 2 | ||
| 780 | * @nodemask: nodemask passed to page allocator | ||
| 781 | * @force_kill: true if a task must be killed, even if others are exiting | ||
| 782 | * | ||
| 783 | * Invokes __out_of_memory() and returns true unless the OOM killer has been | ||
| 784 | * disabled by oom_killer_disable(), in which case it returns false. | ||
| 785 | */ | ||
| 786 | bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | ||
| 787 | int order, nodemask_t *nodemask, bool force_kill) | ||
| 788 | { | ||
| 789 | bool ret = false; | ||
| 790 | |||
| 791 | down_read(&oom_sem); | ||
| 792 | if (!oom_killer_disabled) { | ||
| 793 | __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); | ||
| 794 | ret = true; | ||
| 795 | } | ||
| 796 | up_read(&oom_sem); | ||
| 797 | |||
| 798 | return ret; | ||
| 799 | } | ||
| 800 | |||
| 691 | /* | 801 | /* |
| 692 | * The pagefault handler calls here because it is out of memory, so kill a | 802 | * The pagefault handler calls here because it is out of memory, so kill a |
| 693 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a | 803 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a |
| @@ -697,12 +807,25 @@ void pagefault_out_of_memory(void) | |||
| 697 | { | 807 | { |
| 698 | struct zonelist *zonelist; | 808 | struct zonelist *zonelist; |
| 699 | 809 | ||
| 810 | down_read(&oom_sem); | ||
| 700 | if (mem_cgroup_oom_synchronize(true)) | 811 | if (mem_cgroup_oom_synchronize(true)) |
| 701 | return; | 812 | goto unlock; |
| 702 | 813 | ||
| 703 | zonelist = node_zonelist(first_memory_node, GFP_KERNEL); | 814 | zonelist = node_zonelist(first_memory_node, GFP_KERNEL); |
| 704 | if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { | 815 | if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { |
| 705 | out_of_memory(NULL, 0, 0, NULL, false); | 816 | if (!oom_killer_disabled) |
| 817 | __out_of_memory(NULL, 0, 0, NULL, false); | ||
| 818 | else | ||
| 819 | /* | ||
| 820 | * There shouldn't be any user tasks runable while the | ||
| 821 | * OOM killer is disabled so the current task has to | ||
| 822 | * be a racing OOM victim for which oom_killer_disable() | ||
| 823 | * is waiting for. | ||
| 824 | */ | ||
| 825 | WARN_ON(test_thread_flag(TIF_MEMDIE)); | ||
| 826 | |||
| 706 | oom_zonelist_unlock(zonelist, GFP_KERNEL); | 827 | oom_zonelist_unlock(zonelist, GFP_KERNEL); |
| 707 | } | 828 | } |
| 829 | unlock: | ||
| 830 | up_read(&oom_sem); | ||
| 708 | } | 831 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6f4335238e33..45e187b2d971 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
| 1351 | unsigned long task_ratelimit; | 1351 | unsigned long task_ratelimit; |
| 1352 | unsigned long dirty_ratelimit; | 1352 | unsigned long dirty_ratelimit; |
| 1353 | unsigned long pos_ratio; | 1353 | unsigned long pos_ratio; |
| 1354 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1354 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
| 1355 | bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; | 1355 | bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; |
| 1356 | unsigned long start_time = jiffies; | 1356 | unsigned long start_time = jiffies; |
| 1357 | 1357 | ||
| @@ -1574,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; | |||
| 1574 | */ | 1574 | */ |
| 1575 | void balance_dirty_pages_ratelimited(struct address_space *mapping) | 1575 | void balance_dirty_pages_ratelimited(struct address_space *mapping) |
| 1576 | { | 1576 | { |
| 1577 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1577 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
| 1578 | int ratelimit; | 1578 | int ratelimit; |
| 1579 | int *p; | 1579 | int *p; |
| 1580 | 1580 | ||
| @@ -1929,7 +1929,7 @@ continue_unlock: | |||
| 1929 | if (!clear_page_dirty_for_io(page)) | 1929 | if (!clear_page_dirty_for_io(page)) |
| 1930 | goto continue_unlock; | 1930 | goto continue_unlock; |
| 1931 | 1931 | ||
| 1932 | trace_wbc_writepage(wbc, mapping->backing_dev_info); | 1932 | trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); |
| 1933 | ret = (*writepage)(page, wbc, data); | 1933 | ret = (*writepage)(page, wbc, data); |
| 1934 | if (unlikely(ret)) { | 1934 | if (unlikely(ret)) { |
| 1935 | if (ret == AOP_WRITEPAGE_ACTIVATE) { | 1935 | if (ret == AOP_WRITEPAGE_ACTIVATE) { |
| @@ -2094,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) | |||
| 2094 | trace_writeback_dirty_page(page, mapping); | 2094 | trace_writeback_dirty_page(page, mapping); |
| 2095 | 2095 | ||
| 2096 | if (mapping_cap_account_dirty(mapping)) { | 2096 | if (mapping_cap_account_dirty(mapping)) { |
| 2097 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); | ||
| 2098 | |||
| 2097 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 2099 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
| 2098 | __inc_zone_page_state(page, NR_DIRTIED); | 2100 | __inc_zone_page_state(page, NR_DIRTIED); |
| 2099 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); | 2101 | __inc_bdi_stat(bdi, BDI_RECLAIMABLE); |
| 2100 | __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); | 2102 | __inc_bdi_stat(bdi, BDI_DIRTIED); |
| 2101 | task_io_account_write(PAGE_CACHE_SIZE); | 2103 | task_io_account_write(PAGE_CACHE_SIZE); |
| 2102 | current->nr_dirtied++; | 2104 | current->nr_dirtied++; |
| 2103 | this_cpu_inc(bdp_ratelimits); | 2105 | this_cpu_inc(bdp_ratelimits); |
| @@ -2156,7 +2158,7 @@ void account_page_redirty(struct page *page) | |||
| 2156 | if (mapping && mapping_cap_account_dirty(mapping)) { | 2158 | if (mapping && mapping_cap_account_dirty(mapping)) { |
| 2157 | current->nr_dirtied--; | 2159 | current->nr_dirtied--; |
| 2158 | dec_zone_page_state(page, NR_DIRTIED); | 2160 | dec_zone_page_state(page, NR_DIRTIED); |
| 2159 | dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); | 2161 | dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); |
| 2160 | } | 2162 | } |
| 2161 | } | 2163 | } |
| 2162 | EXPORT_SYMBOL(account_page_redirty); | 2164 | EXPORT_SYMBOL(account_page_redirty); |
| @@ -2168,9 +2170,12 @@ EXPORT_SYMBOL(account_page_redirty); | |||
| 2168 | */ | 2170 | */ |
| 2169 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) | 2171 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) |
| 2170 | { | 2172 | { |
| 2173 | int ret; | ||
| 2174 | |||
| 2171 | wbc->pages_skipped++; | 2175 | wbc->pages_skipped++; |
| 2176 | ret = __set_page_dirty_nobuffers(page); | ||
| 2172 | account_page_redirty(page); | 2177 | account_page_redirty(page); |
| 2173 | return __set_page_dirty_nobuffers(page); | 2178 | return ret; |
| 2174 | } | 2179 | } |
| 2175 | EXPORT_SYMBOL(redirty_page_for_writepage); | 2180 | EXPORT_SYMBOL(redirty_page_for_writepage); |
| 2176 | 2181 | ||
| @@ -2295,7 +2300,7 @@ int clear_page_dirty_for_io(struct page *page) | |||
| 2295 | */ | 2300 | */ |
| 2296 | if (TestClearPageDirty(page)) { | 2301 | if (TestClearPageDirty(page)) { |
| 2297 | dec_zone_page_state(page, NR_FILE_DIRTY); | 2302 | dec_zone_page_state(page, NR_FILE_DIRTY); |
| 2298 | dec_bdi_stat(mapping->backing_dev_info, | 2303 | dec_bdi_stat(inode_to_bdi(mapping->host), |
| 2299 | BDI_RECLAIMABLE); | 2304 | BDI_RECLAIMABLE); |
| 2300 | return 1; | 2305 | return 1; |
| 2301 | } | 2306 | } |
| @@ -2308,14 +2313,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); | |||
| 2308 | int test_clear_page_writeback(struct page *page) | 2313 | int test_clear_page_writeback(struct page *page) |
| 2309 | { | 2314 | { |
| 2310 | struct address_space *mapping = page_mapping(page); | 2315 | struct address_space *mapping = page_mapping(page); |
| 2311 | unsigned long memcg_flags; | ||
| 2312 | struct mem_cgroup *memcg; | 2316 | struct mem_cgroup *memcg; |
| 2313 | bool locked; | ||
| 2314 | int ret; | 2317 | int ret; |
| 2315 | 2318 | ||
| 2316 | memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); | 2319 | memcg = mem_cgroup_begin_page_stat(page); |
| 2317 | if (mapping) { | 2320 | if (mapping) { |
| 2318 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2321 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
| 2319 | unsigned long flags; | 2322 | unsigned long flags; |
| 2320 | 2323 | ||
| 2321 | spin_lock_irqsave(&mapping->tree_lock, flags); | 2324 | spin_lock_irqsave(&mapping->tree_lock, flags); |
| @@ -2338,21 +2341,19 @@ int test_clear_page_writeback(struct page *page) | |||
| 2338 | dec_zone_page_state(page, NR_WRITEBACK); | 2341 | dec_zone_page_state(page, NR_WRITEBACK); |
| 2339 | inc_zone_page_state(page, NR_WRITTEN); | 2342 | inc_zone_page_state(page, NR_WRITTEN); |
| 2340 | } | 2343 | } |
| 2341 | mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); | 2344 | mem_cgroup_end_page_stat(memcg); |
| 2342 | return ret; | 2345 | return ret; |
| 2343 | } | 2346 | } |
| 2344 | 2347 | ||
| 2345 | int __test_set_page_writeback(struct page *page, bool keep_write) | 2348 | int __test_set_page_writeback(struct page *page, bool keep_write) |
| 2346 | { | 2349 | { |
| 2347 | struct address_space *mapping = page_mapping(page); | 2350 | struct address_space *mapping = page_mapping(page); |
| 2348 | unsigned long memcg_flags; | ||
| 2349 | struct mem_cgroup *memcg; | 2351 | struct mem_cgroup *memcg; |
| 2350 | bool locked; | ||
| 2351 | int ret; | 2352 | int ret; |
| 2352 | 2353 | ||
| 2353 | memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); | 2354 | memcg = mem_cgroup_begin_page_stat(page); |
| 2354 | if (mapping) { | 2355 | if (mapping) { |
| 2355 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2356 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
| 2356 | unsigned long flags; | 2357 | unsigned long flags; |
| 2357 | 2358 | ||
| 2358 | spin_lock_irqsave(&mapping->tree_lock, flags); | 2359 | spin_lock_irqsave(&mapping->tree_lock, flags); |
| @@ -2380,7 +2381,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) | |||
| 2380 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); | 2381 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); |
| 2381 | inc_zone_page_state(page, NR_WRITEBACK); | 2382 | inc_zone_page_state(page, NR_WRITEBACK); |
| 2382 | } | 2383 | } |
| 2383 | mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); | 2384 | mem_cgroup_end_page_stat(memcg); |
| 2384 | return ret; | 2385 | return ret; |
| 2385 | 2386 | ||
| 2386 | } | 2387 | } |
| @@ -2406,12 +2407,7 @@ EXPORT_SYMBOL(mapping_tagged); | |||
| 2406 | */ | 2407 | */ |
| 2407 | void wait_for_stable_page(struct page *page) | 2408 | void wait_for_stable_page(struct page *page) |
| 2408 | { | 2409 | { |
| 2409 | struct address_space *mapping = page_mapping(page); | 2410 | if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) |
| 2410 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2411 | wait_on_page_writeback(page); |
| 2411 | |||
| 2412 | if (!bdi_cap_stable_pages_required(bdi)) | ||
| 2413 | return; | ||
| 2414 | |||
| 2415 | wait_on_page_writeback(page); | ||
| 2416 | } | 2412 | } |
| 2417 | EXPORT_SYMBOL_GPL(wait_for_stable_page); | 2413 | EXPORT_SYMBOL_GPL(wait_for_stable_page); |
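The conversions in this file replace mapping->backing_dev_info with inode_to_bdi(mapping->host). As rough orientation only, such a helper resolves the bdi from the inode's superblock; the sketch below is an assumption-laden approximation, not the kernel's exact implementation:

```c
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

/* Approximation of the helper used above; the real inode_to_bdi() also
 * handles corner cases (e.g. a NULL inode) not shown here. */
static inline struct backing_dev_info *inode_to_bdi_sketch(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (sb_is_blkdev_sb(sb))	/* block devices carry their own bdi */
		return blk_get_backing_dev_info(I_BDEV(inode));
	return sb->s_bdi;
}
```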
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e20f9c2fa5a..7abfa70cdc1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/compiler.h> | 25 | #include <linux/compiler.h> |
| 26 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
| 27 | #include <linux/kmemcheck.h> | 27 | #include <linux/kmemcheck.h> |
| 28 | #include <linux/kasan.h> | ||
| 28 | #include <linux/module.h> | 29 | #include <linux/module.h> |
| 29 | #include <linux/suspend.h> | 30 | #include <linux/suspend.h> |
| 30 | #include <linux/pagevec.h> | 31 | #include <linux/pagevec.h> |
| @@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order); | |||
| 172 | * 1G machine -> (16M dma, 784M normal, 224M high) | 173 | * 1G machine -> (16M dma, 784M normal, 224M high) |
| 173 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA | 174 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
| 174 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL | 175 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
| 175 | * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA | 176 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA |
| 176 | * | 177 | * |
| 177 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | 178 | * TBD: should special case ZONE_DMA32 machines here - in those we normally |
| 178 | * don't need any ZONE_NORMAL reservation | 179 | * don't need any ZONE_NORMAL reservation |
| @@ -244,8 +245,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) | |||
| 244 | PB_migrate, PB_migrate_end); | 245 | PB_migrate, PB_migrate_end); |
| 245 | } | 246 | } |
| 246 | 247 | ||
| 247 | bool oom_killer_disabled __read_mostly; | ||
| 248 | |||
| 249 | #ifdef CONFIG_DEBUG_VM | 248 | #ifdef CONFIG_DEBUG_VM |
| 250 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 249 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
| 251 | { | 250 | { |
| @@ -381,36 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
| 381 | } | 380 | } |
| 382 | } | 381 | } |
| 383 | 382 | ||
| 384 | /* update __split_huge_page_refcount if you change this function */ | ||
| 385 | static int destroy_compound_page(struct page *page, unsigned long order) | ||
| 386 | { | ||
| 387 | int i; | ||
| 388 | int nr_pages = 1 << order; | ||
| 389 | int bad = 0; | ||
| 390 | |||
| 391 | if (unlikely(compound_order(page) != order)) { | ||
| 392 | bad_page(page, "wrong compound order", 0); | ||
| 393 | bad++; | ||
| 394 | } | ||
| 395 | |||
| 396 | __ClearPageHead(page); | ||
| 397 | |||
| 398 | for (i = 1; i < nr_pages; i++) { | ||
| 399 | struct page *p = page + i; | ||
| 400 | |||
| 401 | if (unlikely(!PageTail(p))) { | ||
| 402 | bad_page(page, "PageTail not set", 0); | ||
| 403 | bad++; | ||
| 404 | } else if (unlikely(p->first_page != page)) { | ||
| 405 | bad_page(page, "first_page not consistent", 0); | ||
| 406 | bad++; | ||
| 407 | } | ||
| 408 | __ClearPageTail(p); | ||
| 409 | } | ||
| 410 | |||
| 411 | return bad; | ||
| 412 | } | ||
| 413 | |||
| 414 | static inline void prep_zero_page(struct page *page, unsigned int order, | 383 | static inline void prep_zero_page(struct page *page, unsigned int order, |
| 415 | gfp_t gfp_flags) | 384 | gfp_t gfp_flags) |
| 416 | { | 385 | { |
| @@ -552,17 +521,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
| 552 | return 0; | 521 | return 0; |
| 553 | 522 | ||
| 554 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 523 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
| 555 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
| 556 | |||
| 557 | if (page_zone_id(page) != page_zone_id(buddy)) | 524 | if (page_zone_id(page) != page_zone_id(buddy)) |
| 558 | return 0; | 525 | return 0; |
| 559 | 526 | ||
| 527 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
| 528 | |||
| 560 | return 1; | 529 | return 1; |
| 561 | } | 530 | } |
| 562 | 531 | ||
| 563 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 532 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
| 564 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
| 565 | |||
| 566 | /* | 533 | /* |
| 567 | * zone check is done late to avoid uselessly | 534 | * zone check is done late to avoid uselessly |
| 568 | * calculating zone/node ids for pages that could | 535 | * calculating zone/node ids for pages that could |
| @@ -571,6 +538,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
| 571 | if (page_zone_id(page) != page_zone_id(buddy)) | 538 | if (page_zone_id(page) != page_zone_id(buddy)) |
| 572 | return 0; | 539 | return 0; |
| 573 | 540 | ||
| 541 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
| 542 | |||
| 574 | return 1; | 543 | return 1; |
| 575 | } | 544 | } |
| 576 | return 0; | 545 | return 0; |
| @@ -613,10 +582,7 @@ static inline void __free_one_page(struct page *page, | |||
| 613 | int max_order = MAX_ORDER; | 582 | int max_order = MAX_ORDER; |
| 614 | 583 | ||
| 615 | VM_BUG_ON(!zone_is_initialized(zone)); | 584 | VM_BUG_ON(!zone_is_initialized(zone)); |
| 616 | 585 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); | |
| 617 | if (unlikely(PageCompound(page))) | ||
| 618 | if (unlikely(destroy_compound_page(page, order))) | ||
| 619 | return; | ||
| 620 | 586 | ||
| 621 | VM_BUG_ON(migratetype == -1); | 587 | VM_BUG_ON(migratetype == -1); |
| 622 | if (is_migrate_isolate(migratetype)) { | 588 | if (is_migrate_isolate(migratetype)) { |
| @@ -797,21 +763,41 @@ static void free_one_page(struct zone *zone, | |||
| 797 | spin_unlock(&zone->lock); | 763 | spin_unlock(&zone->lock); |
| 798 | } | 764 | } |
| 799 | 765 | ||
| 766 | static int free_tail_pages_check(struct page *head_page, struct page *page) | ||
| 767 | { | ||
| 768 | if (!IS_ENABLED(CONFIG_DEBUG_VM)) | ||
| 769 | return 0; | ||
| 770 | if (unlikely(!PageTail(page))) { | ||
| 771 | bad_page(page, "PageTail not set", 0); | ||
| 772 | return 1; | ||
| 773 | } | ||
| 774 | if (unlikely(page->first_page != head_page)) { | ||
| 775 | bad_page(page, "first_page not consistent", 0); | ||
| 776 | return 1; | ||
| 777 | } | ||
| 778 | return 0; | ||
| 779 | } | ||
| 780 | |||
| 800 | static bool free_pages_prepare(struct page *page, unsigned int order) | 781 | static bool free_pages_prepare(struct page *page, unsigned int order) |
| 801 | { | 782 | { |
| 802 | int i; | 783 | bool compound = PageCompound(page); |
| 803 | int bad = 0; | 784 | int i, bad = 0; |
| 804 | 785 | ||
| 805 | VM_BUG_ON_PAGE(PageTail(page), page); | 786 | VM_BUG_ON_PAGE(PageTail(page), page); |
| 806 | VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); | 787 | VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); |
| 807 | 788 | ||
| 808 | trace_mm_page_free(page, order); | 789 | trace_mm_page_free(page, order); |
| 809 | kmemcheck_free_shadow(page, order); | 790 | kmemcheck_free_shadow(page, order); |
| 791 | kasan_free_pages(page, order); | ||
| 810 | 792 | ||
| 811 | if (PageAnon(page)) | 793 | if (PageAnon(page)) |
| 812 | page->mapping = NULL; | 794 | page->mapping = NULL; |
| 813 | for (i = 0; i < (1 << order); i++) | 795 | bad += free_pages_check(page); |
| 796 | for (i = 1; i < (1 << order); i++) { | ||
| 797 | if (compound) | ||
| 798 | bad += free_tail_pages_check(page, page + i); | ||
| 814 | bad += free_pages_check(page + i); | 799 | bad += free_pages_check(page + i); |
| 800 | } | ||
| 815 | if (bad) | 801 | if (bad) |
| 816 | return false; | 802 | return false; |
| 817 | 803 | ||
| @@ -970,7 +956,8 @@ static inline int check_new_page(struct page *page) | |||
| 970 | return 0; | 956 | return 0; |
| 971 | } | 957 | } |
| 972 | 958 | ||
| 973 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | 959 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
| 960 | int alloc_flags) | ||
| 974 | { | 961 | { |
| 975 | int i; | 962 | int i; |
| 976 | 963 | ||
| @@ -985,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
| 985 | 972 | ||
| 986 | arch_alloc_page(page, order); | 973 | arch_alloc_page(page, order); |
| 987 | kernel_map_pages(page, 1 << order, 1); | 974 | kernel_map_pages(page, 1 << order, 1); |
| 975 | kasan_alloc_pages(page, order); | ||
| 988 | 976 | ||
| 989 | if (gfp_flags & __GFP_ZERO) | 977 | if (gfp_flags & __GFP_ZERO) |
| 990 | prep_zero_page(page, order, gfp_flags); | 978 | prep_zero_page(page, order, gfp_flags); |
| @@ -994,6 +982,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
| 994 | 982 | ||
| 995 | set_page_owner(page, order, gfp_flags); | 983 | set_page_owner(page, order, gfp_flags); |
| 996 | 984 | ||
| 985 | /* | ||
| 986 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to | ||
| 987 | * allocate the page. The expectation is that the caller is taking | ||
| 988 | * steps that will free more memory. The caller should avoid the page | ||
| 989 | * being used for !PFMEMALLOC purposes. | ||
| 990 | */ | ||
| 991 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
| 992 | |||
| 997 | return 0; | 993 | return 0; |
| 998 | } | 994 | } |
| 999 | 995 | ||
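The prep_new_page() hunks above also move the pfmemalloc bookkeeping out of the zonelist walk: the flag is now set while the page is being prepared, based on the alloc_flags that were threaded in. A reassembled fragment, with the surrounding page checks and compound/zero preparation elided:

    /* inside prep_new_page(page, order, gfp_flags, alloc_flags) */
    arch_alloc_page(page, order);
    kernel_map_pages(page, 1 << order, 1);
    kasan_alloc_pages(page, order);

    /*
     * Record that ALLOC_NO_WATERMARKS was needed; the caller is expected to
     * be freeing memory and must not use the page for !PFMEMALLOC purposes.
     */
    page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);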
| @@ -1130,39 +1126,34 @@ static void change_pageblock_range(struct page *pageblock_page, | |||
| 1130 | } | 1126 | } |
| 1131 | 1127 | ||
| 1132 | /* | 1128 | /* |
| 1133 | * If breaking a large block of pages, move all free pages to the preferred | 1129 | * When we are falling back to another migratetype during allocation, try to |
| 1134 | * allocation list. If falling back for a reclaimable kernel allocation, be | 1130 | * steal extra free pages from the same pageblocks to satisfy further |
| 1135 | * more aggressive about taking ownership of free pages. | 1131 | * allocations, instead of polluting multiple pageblocks. |
| 1136 | * | 1132 | * |
| 1137 | * On the other hand, never change migration type of MIGRATE_CMA pageblocks | 1133 | * If we are stealing a relatively large buddy page, it is likely there will |
| 1138 | * nor move CMA pages to different free lists. We don't want unmovable pages | 1134 | * be more free pages in the pageblock, so try to steal them all. For |
| 1139 | * to be allocated from MIGRATE_CMA areas. | 1135 | * reclaimable and unmovable allocations, we steal regardless of page size, |
| 1136 | * as fragmentation caused by those allocations polluting movable pageblocks | ||
| 1137 | * is worse than movable allocations stealing from unmovable and reclaimable | ||
| 1138 | * pageblocks. | ||
| 1140 | * | 1139 | * |
| 1141 | * Returns the new migratetype of the pageblock (or the same old migratetype | 1140 | * If we claim more than half of the pageblock, change pageblock's migratetype |
| 1142 | * if it was unchanged). | 1141 | * as well. |
| 1143 | */ | 1142 | */ |
| 1144 | static int try_to_steal_freepages(struct zone *zone, struct page *page, | 1143 | static void try_to_steal_freepages(struct zone *zone, struct page *page, |
| 1145 | int start_type, int fallback_type) | 1144 | int start_type, int fallback_type) |
| 1146 | { | 1145 | { |
| 1147 | int current_order = page_order(page); | 1146 | int current_order = page_order(page); |
| 1148 | 1147 | ||
| 1149 | /* | ||
| 1150 | * When borrowing from MIGRATE_CMA, we need to release the excess | ||
| 1151 | * buddy pages to CMA itself. We also ensure the freepage_migratetype | ||
| 1152 | * is set to CMA so it is returned to the correct freelist in case | ||
| 1153 | * the page ends up being not actually allocated from the pcp lists. | ||
| 1154 | */ | ||
| 1155 | if (is_migrate_cma(fallback_type)) | ||
| 1156 | return fallback_type; | ||
| 1157 | |||
| 1158 | /* Take ownership for orders >= pageblock_order */ | 1148 | /* Take ownership for orders >= pageblock_order */ |
| 1159 | if (current_order >= pageblock_order) { | 1149 | if (current_order >= pageblock_order) { |
| 1160 | change_pageblock_range(page, current_order, start_type); | 1150 | change_pageblock_range(page, current_order, start_type); |
| 1161 | return start_type; | 1151 | return; |
| 1162 | } | 1152 | } |
| 1163 | 1153 | ||
| 1164 | if (current_order >= pageblock_order / 2 || | 1154 | if (current_order >= pageblock_order / 2 || |
| 1165 | start_type == MIGRATE_RECLAIMABLE || | 1155 | start_type == MIGRATE_RECLAIMABLE || |
| 1156 | start_type == MIGRATE_UNMOVABLE || | ||
| 1166 | page_group_by_mobility_disabled) { | 1157 | page_group_by_mobility_disabled) { |
| 1167 | int pages; | 1158 | int pages; |
| 1168 | 1159 | ||
| @@ -1170,15 +1161,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
| 1170 | 1161 | ||
| 1171 | /* Claim the whole block if over half of it is free */ | 1162 | /* Claim the whole block if over half of it is free */ |
| 1172 | if (pages >= (1 << (pageblock_order-1)) || | 1163 | if (pages >= (1 << (pageblock_order-1)) || |
| 1173 | page_group_by_mobility_disabled) { | 1164 | page_group_by_mobility_disabled) |
| 1174 | |||
| 1175 | set_pageblock_migratetype(page, start_type); | 1165 | set_pageblock_migratetype(page, start_type); |
| 1176 | return start_type; | ||
| 1177 | } | ||
| 1178 | |||
| 1179 | } | 1166 | } |
| 1180 | |||
| 1181 | return fallback_type; | ||
| 1182 | } | 1167 | } |
| 1183 | 1168 | ||
| 1184 | /* Remove an element from the buddy allocator from the fallback list */ | 1169 | /* Remove an element from the buddy allocator from the fallback list */ |
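For readability, the rewritten try_to_steal_freepages() reassembled from the hunks above. The move_freepages_block() call sits in unchanged context that the diff does not show, and the CMA special case moves to the caller in the next hunk:

    static void try_to_steal_freepages(struct zone *zone, struct page *page,
                                       int start_type, int fallback_type)
    {
            int current_order = page_order(page);

            /* Take ownership for orders >= pageblock_order */
            if (current_order >= pageblock_order) {
                    change_pageblock_range(page, current_order, start_type);
                    return;
            }

            if (current_order >= pageblock_order / 2 ||
                start_type == MIGRATE_RECLAIMABLE ||
                start_type == MIGRATE_UNMOVABLE ||
                page_group_by_mobility_disabled) {
                    int pages;

                    /* unchanged context, not shown in the hunk */
                    pages = move_freepages_block(zone, page, start_type);

                    /* Claim the whole block if over half of it is free */
                    if (pages >= (1 << (pageblock_order - 1)) ||
                        page_group_by_mobility_disabled)
                            set_pageblock_migratetype(page, start_type);
            }
    }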
| @@ -1188,14 +1173,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
| 1188 | struct free_area *area; | 1173 | struct free_area *area; |
| 1189 | unsigned int current_order; | 1174 | unsigned int current_order; |
| 1190 | struct page *page; | 1175 | struct page *page; |
| 1191 | int migratetype, new_type, i; | ||
| 1192 | 1176 | ||
| 1193 | /* Find the largest possible block of pages in the other list */ | 1177 | /* Find the largest possible block of pages in the other list */ |
| 1194 | for (current_order = MAX_ORDER-1; | 1178 | for (current_order = MAX_ORDER-1; |
| 1195 | current_order >= order && current_order <= MAX_ORDER-1; | 1179 | current_order >= order && current_order <= MAX_ORDER-1; |
| 1196 | --current_order) { | 1180 | --current_order) { |
| 1181 | int i; | ||
| 1197 | for (i = 0;; i++) { | 1182 | for (i = 0;; i++) { |
| 1198 | migratetype = fallbacks[start_migratetype][i]; | 1183 | int migratetype = fallbacks[start_migratetype][i]; |
| 1184 | int buddy_type = start_migratetype; | ||
| 1199 | 1185 | ||
| 1200 | /* MIGRATE_RESERVE handled later if necessary */ | 1186 | /* MIGRATE_RESERVE handled later if necessary */ |
| 1201 | if (migratetype == MIGRATE_RESERVE) | 1187 | if (migratetype == MIGRATE_RESERVE) |
| @@ -1209,25 +1195,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
| 1209 | struct page, lru); | 1195 | struct page, lru); |
| 1210 | area->nr_free--; | 1196 | area->nr_free--; |
| 1211 | 1197 | ||
| 1212 | new_type = try_to_steal_freepages(zone, page, | 1198 | if (!is_migrate_cma(migratetype)) { |
| 1213 | start_migratetype, | 1199 | try_to_steal_freepages(zone, page, |
| 1214 | migratetype); | 1200 | start_migratetype, |
| 1201 | migratetype); | ||
| 1202 | } else { | ||
| 1203 | /* | ||
| 1204 | * When borrowing from MIGRATE_CMA, we need to | ||
| 1205 | * release the excess buddy pages to CMA | ||
| 1206 | * itself, and we do not try to steal extra | ||
| 1207 | * free pages. | ||
| 1208 | */ | ||
| 1209 | buddy_type = migratetype; | ||
| 1210 | } | ||
| 1215 | 1211 | ||
| 1216 | /* Remove the page from the freelists */ | 1212 | /* Remove the page from the freelists */ |
| 1217 | list_del(&page->lru); | 1213 | list_del(&page->lru); |
| 1218 | rmv_page_order(page); | 1214 | rmv_page_order(page); |
| 1219 | 1215 | ||
| 1220 | expand(zone, page, order, current_order, area, | 1216 | expand(zone, page, order, current_order, area, |
| 1221 | new_type); | 1217 | buddy_type); |
| 1222 | /* The freepage_migratetype may differ from pageblock's | 1218 | |
| 1219 | /* | ||
| 1220 | * The freepage_migratetype may differ from pageblock's | ||
| 1223 | * migratetype depending on the decisions in | 1221 | * migratetype depending on the decisions in |
| 1224 | * try_to_steal_freepages. This is OK as long as it does | 1222 | * try_to_steal_freepages(). This is OK as long as it |
| 1225 | * not differ for MIGRATE_CMA type. | 1223 | * does not differ for MIGRATE_CMA pageblocks. For CMA |
| 1224 | * we need to make sure unallocated pages flushed from | ||
| 1225 | * pcp lists are returned to the correct freelist. | ||
| 1226 | */ | 1226 | */ |
| 1227 | set_freepage_migratetype(page, new_type); | 1227 | set_freepage_migratetype(page, buddy_type); |
| 1228 | 1228 | ||
| 1229 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1229 | trace_mm_page_alloc_extfrag(page, order, current_order, |
| 1230 | start_migratetype, migratetype, new_type); | 1230 | start_migratetype, migratetype); |
| 1231 | 1231 | ||
| 1232 | return page; | 1232 | return page; |
| 1233 | } | 1233 | } |
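Net effect of the __rmqueue_fallback() hunk above: the freepage type is decided up front instead of being returned by try_to_steal_freepages(). A condensed sketch of that decision as it now reads:

    int buddy_type = start_migratetype;

    if (!is_migrate_cma(migratetype)) {
            /* Ordinary fallback: possibly steal the rest of the pageblock. */
            try_to_steal_freepages(zone, page, start_migratetype, migratetype);
    } else {
            /*
             * Borrowing from MIGRATE_CMA: never steal extra pages, and keep
             * the freepage type CMA so pages flushed from the pcp lists go
             * back to the CMA freelist.
             */
            buddy_type = migratetype;
    }

    /* ... list_del() and rmv_page_order() elided ... */
    expand(zone, page, order, current_order, area, buddy_type);
    set_freepage_migratetype(page, buddy_type);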
| @@ -1642,9 +1642,7 @@ int split_free_page(struct page *page) | |||
| 1642 | } | 1642 | } |
| 1643 | 1643 | ||
| 1644 | /* | 1644 | /* |
| 1645 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 1645 | * Allocate a page from the given zone. Use pcplists for order-0 allocations. |
| 1646 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | ||
| 1647 | * or two. | ||
| 1648 | */ | 1646 | */ |
| 1649 | static inline | 1647 | static inline |
| 1650 | struct page *buffered_rmqueue(struct zone *preferred_zone, | 1648 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
| @@ -1655,7 +1653,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
| 1655 | struct page *page; | 1653 | struct page *page; |
| 1656 | bool cold = ((gfp_flags & __GFP_COLD) != 0); | 1654 | bool cold = ((gfp_flags & __GFP_COLD) != 0); |
| 1657 | 1655 | ||
| 1658 | again: | ||
| 1659 | if (likely(order == 0)) { | 1656 | if (likely(order == 0)) { |
| 1660 | struct per_cpu_pages *pcp; | 1657 | struct per_cpu_pages *pcp; |
| 1661 | struct list_head *list; | 1658 | struct list_head *list; |
| @@ -1711,8 +1708,6 @@ again: | |||
| 1711 | local_irq_restore(flags); | 1708 | local_irq_restore(flags); |
| 1712 | 1709 | ||
| 1713 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 1710 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
| 1714 | if (prep_new_page(page, order, gfp_flags)) | ||
| 1715 | goto again; | ||
| 1716 | return page; | 1711 | return page; |
| 1717 | 1712 | ||
| 1718 | failed: | 1713 | failed: |
| @@ -2033,10 +2028,10 @@ static void reset_alloc_batches(struct zone *preferred_zone) | |||
| 2033 | * a page. | 2028 | * a page. |
| 2034 | */ | 2029 | */ |
| 2035 | static struct page * | 2030 | static struct page * |
| 2036 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 2031 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, |
| 2037 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, | 2032 | const struct alloc_context *ac) |
| 2038 | struct zone *preferred_zone, int classzone_idx, int migratetype) | ||
| 2039 | { | 2033 | { |
| 2034 | struct zonelist *zonelist = ac->zonelist; | ||
| 2040 | struct zoneref *z; | 2035 | struct zoneref *z; |
| 2041 | struct page *page = NULL; | 2036 | struct page *page = NULL; |
| 2042 | struct zone *zone; | 2037 | struct zone *zone; |
| @@ -2055,8 +2050,8 @@ zonelist_scan: | |||
| 2055 | * Scan zonelist, looking for a zone with enough free. | 2050 | * Scan zonelist, looking for a zone with enough free. |
| 2056 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. | 2051 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. |
| 2057 | */ | 2052 | */ |
| 2058 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2053 | for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, |
| 2059 | high_zoneidx, nodemask) { | 2054 | ac->nodemask) { |
| 2060 | unsigned long mark; | 2055 | unsigned long mark; |
| 2061 | 2056 | ||
| 2062 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && | 2057 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
| @@ -2073,7 +2068,7 @@ zonelist_scan: | |||
| 2073 | * time the page has in memory before being reclaimed. | 2068 | * time the page has in memory before being reclaimed. |
| 2074 | */ | 2069 | */ |
| 2075 | if (alloc_flags & ALLOC_FAIR) { | 2070 | if (alloc_flags & ALLOC_FAIR) { |
| 2076 | if (!zone_local(preferred_zone, zone)) | 2071 | if (!zone_local(ac->preferred_zone, zone)) |
| 2077 | break; | 2072 | break; |
| 2078 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { | 2073 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { |
| 2079 | nr_fair_skipped++; | 2074 | nr_fair_skipped++; |
| @@ -2111,7 +2106,7 @@ zonelist_scan: | |||
| 2111 | 2106 | ||
| 2112 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 2107 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
| 2113 | if (!zone_watermark_ok(zone, order, mark, | 2108 | if (!zone_watermark_ok(zone, order, mark, |
| 2114 | classzone_idx, alloc_flags)) { | 2109 | ac->classzone_idx, alloc_flags)) { |
| 2115 | int ret; | 2110 | int ret; |
| 2116 | 2111 | ||
| 2117 | /* Checked here to keep the fast path fast */ | 2112 | /* Checked here to keep the fast path fast */ |
| @@ -2132,7 +2127,7 @@ zonelist_scan: | |||
| 2132 | } | 2127 | } |
| 2133 | 2128 | ||
| 2134 | if (zone_reclaim_mode == 0 || | 2129 | if (zone_reclaim_mode == 0 || |
| 2135 | !zone_allows_reclaim(preferred_zone, zone)) | 2130 | !zone_allows_reclaim(ac->preferred_zone, zone)) |
| 2136 | goto this_zone_full; | 2131 | goto this_zone_full; |
| 2137 | 2132 | ||
| 2138 | /* | 2133 | /* |
| @@ -2154,7 +2149,7 @@ zonelist_scan: | |||
| 2154 | default: | 2149 | default: |
| 2155 | /* did we reclaim enough */ | 2150 | /* did we reclaim enough */ |
| 2156 | if (zone_watermark_ok(zone, order, mark, | 2151 | if (zone_watermark_ok(zone, order, mark, |
| 2157 | classzone_idx, alloc_flags)) | 2152 | ac->classzone_idx, alloc_flags)) |
| 2158 | goto try_this_zone; | 2153 | goto try_this_zone; |
| 2159 | 2154 | ||
| 2160 | /* | 2155 | /* |
| @@ -2175,27 +2170,18 @@ zonelist_scan: | |||
| 2175 | } | 2170 | } |
| 2176 | 2171 | ||
| 2177 | try_this_zone: | 2172 | try_this_zone: |
| 2178 | page = buffered_rmqueue(preferred_zone, zone, order, | 2173 | page = buffered_rmqueue(ac->preferred_zone, zone, order, |
| 2179 | gfp_mask, migratetype); | 2174 | gfp_mask, ac->migratetype); |
| 2180 | if (page) | 2175 | if (page) { |
| 2181 | break; | 2176 | if (prep_new_page(page, order, gfp_mask, alloc_flags)) |
| 2177 | goto try_this_zone; | ||
| 2178 | return page; | ||
| 2179 | } | ||
| 2182 | this_zone_full: | 2180 | this_zone_full: |
| 2183 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) | 2181 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) |
| 2184 | zlc_mark_zone_full(zonelist, z); | 2182 | zlc_mark_zone_full(zonelist, z); |
| 2185 | } | 2183 | } |
| 2186 | 2184 | ||
| 2187 | if (page) { | ||
| 2188 | /* | ||
| 2189 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
| 2190 | * necessary to allocate the page. The expectation is | ||
| 2191 | * that the caller is taking steps that will free more | ||
| 2192 | * memory. The caller should avoid the page being used | ||
| 2193 | * for !PFMEMALLOC purposes. | ||
| 2194 | */ | ||
| 2195 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
| 2196 | return page; | ||
| 2197 | } | ||
| 2198 | |||
| 2199 | /* | 2185 | /* |
| 2200 | * The first pass makes sure allocations are spread fairly within the | 2186 | * The first pass makes sure allocations are spread fairly within the |
| 2201 | * local node. However, the local node might have free pages left | 2187 | * local node. However, the local node might have free pages left |
| @@ -2208,7 +2194,7 @@ this_zone_full: | |||
| 2208 | alloc_flags &= ~ALLOC_FAIR; | 2194 | alloc_flags &= ~ALLOC_FAIR; |
| 2209 | if (nr_fair_skipped) { | 2195 | if (nr_fair_skipped) { |
| 2210 | zonelist_rescan = true; | 2196 | zonelist_rescan = true; |
| 2211 | reset_alloc_batches(preferred_zone); | 2197 | reset_alloc_batches(ac->preferred_zone); |
| 2212 | } | 2198 | } |
| 2213 | if (nr_online_nodes > 1) | 2199 | if (nr_online_nodes > 1) |
| 2214 | zonelist_rescan = true; | 2200 | zonelist_rescan = true; |
| @@ -2330,44 +2316,29 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, | |||
| 2330 | 2316 | ||
| 2331 | static inline struct page * | 2317 | static inline struct page * |
| 2332 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2318 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
| 2333 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2319 | const struct alloc_context *ac, unsigned long *did_some_progress) |
| 2334 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
| 2335 | int classzone_idx, int migratetype, unsigned long *did_some_progress) | ||
| 2336 | { | 2320 | { |
| 2337 | struct page *page; | 2321 | struct page *page; |
| 2338 | 2322 | ||
| 2339 | *did_some_progress = 0; | 2323 | *did_some_progress = 0; |
| 2340 | 2324 | ||
| 2341 | if (oom_killer_disabled) | ||
| 2342 | return NULL; | ||
| 2343 | |||
| 2344 | /* | 2325 | /* |
| 2345 | * Acquire the per-zone oom lock for each zone. If that | 2326 | * Acquire the per-zone oom lock for each zone. If that |
| 2346 | * fails, somebody else is making progress for us. | 2327 | * fails, somebody else is making progress for us. |
| 2347 | */ | 2328 | */ |
| 2348 | if (!oom_zonelist_trylock(zonelist, gfp_mask)) { | 2329 | if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { |
| 2349 | *did_some_progress = 1; | 2330 | *did_some_progress = 1; |
| 2350 | schedule_timeout_uninterruptible(1); | 2331 | schedule_timeout_uninterruptible(1); |
| 2351 | return NULL; | 2332 | return NULL; |
| 2352 | } | 2333 | } |
| 2353 | 2334 | ||
| 2354 | /* | 2335 | /* |
| 2355 | * PM-freezer should be notified that there might be an OOM killer on | ||
| 2356 | * its way to kill and wake somebody up. This is too early and we might | ||
| 2357 | * end up not killing anything but false positives are acceptable. | ||
| 2358 | * See freeze_processes. | ||
| 2359 | */ | ||
| 2360 | note_oom_kill(); | ||
| 2361 | |||
| 2362 | /* | ||
| 2363 | * Go through the zonelist yet one more time, keep very high watermark | 2336 | * Go through the zonelist yet one more time, keep very high watermark |
| 2364 | * here, this is only to catch a parallel oom killing, we must fail if | 2337 | * here, this is only to catch a parallel oom killing, we must fail if |
| 2365 | * we're still under heavy pressure. | 2338 | * we're still under heavy pressure. |
| 2366 | */ | 2339 | */ |
| 2367 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 2340 | page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, |
| 2368 | order, zonelist, high_zoneidx, | 2341 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); |
| 2369 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | ||
| 2370 | preferred_zone, classzone_idx, migratetype); | ||
| 2371 | if (page) | 2342 | if (page) |
| 2372 | goto out; | 2343 | goto out; |
| 2373 | 2344 | ||
| @@ -2379,11 +2350,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
| 2379 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 2350 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
| 2380 | goto out; | 2351 | goto out; |
| 2381 | /* The OOM killer does not needlessly kill tasks for lowmem */ | 2352 | /* The OOM killer does not needlessly kill tasks for lowmem */ |
| 2382 | if (high_zoneidx < ZONE_NORMAL) | 2353 | if (ac->high_zoneidx < ZONE_NORMAL) |
| 2383 | goto out; | 2354 | goto out; |
| 2384 | /* The OOM killer does not compensate for light reclaim */ | 2355 | /* The OOM killer does not compensate for light reclaim */ |
| 2385 | if (!(gfp_mask & __GFP_FS)) | 2356 | if (!(gfp_mask & __GFP_FS)) { |
| 2357 | /* | ||
| 2358 | * XXX: Page reclaim didn't yield anything, | ||
| 2359 | * and the OOM killer can't be invoked, but | ||
| 2360 | * keep looping as per should_alloc_retry(). | ||
| 2361 | */ | ||
| 2362 | *did_some_progress = 1; | ||
| 2386 | goto out; | 2363 | goto out; |
| 2364 | } | ||
| 2387 | /* | 2365 | /* |
| 2388 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | 2366 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. |
| 2389 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. | 2367 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. |
| @@ -2395,10 +2373,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
| 2395 | goto out; | 2373 | goto out; |
| 2396 | } | 2374 | } |
| 2397 | /* Exhausted what can be done so it's blamo time */ | 2375 | /* Exhausted what can be done so it's blamo time */ |
| 2398 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); | 2376 | if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) |
| 2399 | *did_some_progress = 1; | 2377 | *did_some_progress = 1; |
| 2400 | out: | 2378 | out: |
| 2401 | oom_zonelist_unlock(zonelist, gfp_mask); | 2379 | oom_zonelist_unlock(ac->zonelist, gfp_mask); |
| 2402 | return page; | 2380 | return page; |
| 2403 | } | 2381 | } |
| 2404 | 2382 | ||
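Behavioural note on the __alloc_pages_may_oom() hunks: the oom_killer_disabled bail-out and the PM-freezer notification (note_oom_kill()) leave the allocator, and progress reporting changes in two places; the !__GFP_FS case now reports progress so the caller keeps retrying, while out_of_memory() only counts as progress when it actually acts. The relevant fragments, condensed:

    if (!(gfp_mask & __GFP_FS)) {
            /*
             * Reclaim gave us nothing and the OOM killer cannot be invoked,
             * but report progress so should_alloc_retry() keeps looping.
             */
            *did_some_progress = 1;
            goto out;
    }

    /* Only a successful out_of_memory() call counts as progress now. */
    if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
            *did_some_progress = 1;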
| @@ -2406,10 +2384,9 @@ out: | |||
| 2406 | /* Try memory compaction for high-order allocations before reclaim */ | 2384 | /* Try memory compaction for high-order allocations before reclaim */ |
| 2407 | static struct page * | 2385 | static struct page * |
| 2408 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2386 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
| 2409 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2387 | int alloc_flags, const struct alloc_context *ac, |
| 2410 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2388 | enum migrate_mode mode, int *contended_compaction, |
| 2411 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2389 | bool *deferred_compaction) |
| 2412 | int *contended_compaction, bool *deferred_compaction) | ||
| 2413 | { | 2390 | { |
| 2414 | unsigned long compact_result; | 2391 | unsigned long compact_result; |
| 2415 | struct page *page; | 2392 | struct page *page; |
| @@ -2418,10 +2395,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
| 2418 | return NULL; | 2395 | return NULL; |
| 2419 | 2396 | ||
| 2420 | current->flags |= PF_MEMALLOC; | 2397 | current->flags |= PF_MEMALLOC; |
| 2421 | compact_result = try_to_compact_pages(zonelist, order, gfp_mask, | 2398 | compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
| 2422 | nodemask, mode, | 2399 | mode, contended_compaction); |
| 2423 | contended_compaction, | ||
| 2424 | alloc_flags, classzone_idx); | ||
| 2425 | current->flags &= ~PF_MEMALLOC; | 2400 | current->flags &= ~PF_MEMALLOC; |
| 2426 | 2401 | ||
| 2427 | switch (compact_result) { | 2402 | switch (compact_result) { |
| @@ -2440,10 +2415,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
| 2440 | */ | 2415 | */ |
| 2441 | count_vm_event(COMPACTSTALL); | 2416 | count_vm_event(COMPACTSTALL); |
| 2442 | 2417 | ||
| 2443 | page = get_page_from_freelist(gfp_mask, nodemask, | 2418 | page = get_page_from_freelist(gfp_mask, order, |
| 2444 | order, zonelist, high_zoneidx, | 2419 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
| 2445 | alloc_flags & ~ALLOC_NO_WATERMARKS, | ||
| 2446 | preferred_zone, classzone_idx, migratetype); | ||
| 2447 | 2420 | ||
| 2448 | if (page) { | 2421 | if (page) { |
| 2449 | struct zone *zone = page_zone(page); | 2422 | struct zone *zone = page_zone(page); |
| @@ -2467,10 +2440,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
| 2467 | #else | 2440 | #else |
| 2468 | static inline struct page * | 2441 | static inline struct page * |
| 2469 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2442 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
| 2470 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2443 | int alloc_flags, const struct alloc_context *ac, |
| 2471 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2444 | enum migrate_mode mode, int *contended_compaction, |
| 2472 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2445 | bool *deferred_compaction) |
| 2473 | int *contended_compaction, bool *deferred_compaction) | ||
| 2474 | { | 2446 | { |
| 2475 | return NULL; | 2447 | return NULL; |
| 2476 | } | 2448 | } |
| @@ -2478,8 +2450,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
| 2478 | 2450 | ||
| 2479 | /* Perform direct synchronous page reclaim */ | 2451 | /* Perform direct synchronous page reclaim */ |
| 2480 | static int | 2452 | static int |
| 2481 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | 2453 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, |
| 2482 | nodemask_t *nodemask) | 2454 | const struct alloc_context *ac) |
| 2483 | { | 2455 | { |
| 2484 | struct reclaim_state reclaim_state; | 2456 | struct reclaim_state reclaim_state; |
| 2485 | int progress; | 2457 | int progress; |
| @@ -2493,7 +2465,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | |||
| 2493 | reclaim_state.reclaimed_slab = 0; | 2465 | reclaim_state.reclaimed_slab = 0; |
| 2494 | current->reclaim_state = &reclaim_state; | 2466 | current->reclaim_state = &reclaim_state; |
| 2495 | 2467 | ||
| 2496 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 2468 | progress = try_to_free_pages(ac->zonelist, order, gfp_mask, |
| 2469 | ac->nodemask); | ||
| 2497 | 2470 | ||
| 2498 | current->reclaim_state = NULL; | 2471 | current->reclaim_state = NULL; |
| 2499 | lockdep_clear_current_reclaim_state(); | 2472 | lockdep_clear_current_reclaim_state(); |
| @@ -2507,28 +2480,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | |||
| 2507 | /* The really slow allocator path where we enter direct reclaim */ | 2480 | /* The really slow allocator path where we enter direct reclaim */ |
| 2508 | static inline struct page * | 2481 | static inline struct page * |
| 2509 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2482 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
| 2510 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2483 | int alloc_flags, const struct alloc_context *ac, |
| 2511 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2484 | unsigned long *did_some_progress) |
| 2512 | int classzone_idx, int migratetype, unsigned long *did_some_progress) | ||
| 2513 | { | 2485 | { |
| 2514 | struct page *page = NULL; | 2486 | struct page *page = NULL; |
| 2515 | bool drained = false; | 2487 | bool drained = false; |
| 2516 | 2488 | ||
| 2517 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | 2489 | *did_some_progress = __perform_reclaim(gfp_mask, order, ac); |
| 2518 | nodemask); | ||
| 2519 | if (unlikely(!(*did_some_progress))) | 2490 | if (unlikely(!(*did_some_progress))) |
| 2520 | return NULL; | 2491 | return NULL; |
| 2521 | 2492 | ||
| 2522 | /* After successful reclaim, reconsider all zones for allocation */ | 2493 | /* After successful reclaim, reconsider all zones for allocation */ |
| 2523 | if (IS_ENABLED(CONFIG_NUMA)) | 2494 | if (IS_ENABLED(CONFIG_NUMA)) |
| 2524 | zlc_clear_zones_full(zonelist); | 2495 | zlc_clear_zones_full(ac->zonelist); |
| 2525 | 2496 | ||
| 2526 | retry: | 2497 | retry: |
| 2527 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2498 | page = get_page_from_freelist(gfp_mask, order, |
| 2528 | zonelist, high_zoneidx, | 2499 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
| 2529 | alloc_flags & ~ALLOC_NO_WATERMARKS, | ||
| 2530 | preferred_zone, classzone_idx, | ||
| 2531 | migratetype); | ||
| 2532 | 2500 | ||
| 2533 | /* | 2501 | /* |
| 2534 | * If an allocation failed after direct reclaim, it could be because | 2502 | * If an allocation failed after direct reclaim, it could be because |
| @@ -2549,36 +2517,30 @@ retry: | |||
| 2549 | */ | 2517 | */ |
| 2550 | static inline struct page * | 2518 | static inline struct page * |
| 2551 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | 2519 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
| 2552 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2520 | const struct alloc_context *ac) |
| 2553 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
| 2554 | int classzone_idx, int migratetype) | ||
| 2555 | { | 2521 | { |
| 2556 | struct page *page; | 2522 | struct page *page; |
| 2557 | 2523 | ||
| 2558 | do { | 2524 | do { |
| 2559 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2525 | page = get_page_from_freelist(gfp_mask, order, |
| 2560 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | 2526 | ALLOC_NO_WATERMARKS, ac); |
| 2561 | preferred_zone, classzone_idx, migratetype); | ||
| 2562 | 2527 | ||
| 2563 | if (!page && gfp_mask & __GFP_NOFAIL) | 2528 | if (!page && gfp_mask & __GFP_NOFAIL) |
| 2564 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2529 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, |
| 2530 | HZ/50); | ||
| 2565 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | 2531 | } while (!page && (gfp_mask & __GFP_NOFAIL)); |
| 2566 | 2532 | ||
| 2567 | return page; | 2533 | return page; |
| 2568 | } | 2534 | } |
| 2569 | 2535 | ||
| 2570 | static void wake_all_kswapds(unsigned int order, | 2536 | static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) |
| 2571 | struct zonelist *zonelist, | ||
| 2572 | enum zone_type high_zoneidx, | ||
| 2573 | struct zone *preferred_zone, | ||
| 2574 | nodemask_t *nodemask) | ||
| 2575 | { | 2537 | { |
| 2576 | struct zoneref *z; | 2538 | struct zoneref *z; |
| 2577 | struct zone *zone; | 2539 | struct zone *zone; |
| 2578 | 2540 | ||
| 2579 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2541 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
| 2580 | high_zoneidx, nodemask) | 2542 | ac->high_zoneidx, ac->nodemask) |
| 2581 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | 2543 | wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); |
| 2582 | } | 2544 | } |
| 2583 | 2545 | ||
| 2584 | static inline int | 2546 | static inline int |
| @@ -2637,9 +2599,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | |||
| 2637 | 2599 | ||
| 2638 | static inline struct page * | 2600 | static inline struct page * |
| 2639 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2601 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
| 2640 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2602 | struct alloc_context *ac) |
| 2641 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
| 2642 | int classzone_idx, int migratetype) | ||
| 2643 | { | 2603 | { |
| 2644 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2604 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
| 2645 | struct page *page = NULL; | 2605 | struct page *page = NULL; |
| @@ -2675,8 +2635,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
| 2675 | 2635 | ||
| 2676 | retry: | 2636 | retry: |
| 2677 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2637 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
| 2678 | wake_all_kswapds(order, zonelist, high_zoneidx, | 2638 | wake_all_kswapds(order, ac); |
| 2679 | preferred_zone, nodemask); | ||
| 2680 | 2639 | ||
| 2681 | /* | 2640 | /* |
| 2682 | * OK, we're below the kswapd watermark and have kicked background | 2641 | * OK, we're below the kswapd watermark and have kicked background |
| @@ -2689,17 +2648,16 @@ retry: | |||
| 2689 | * Find the true preferred zone if the allocation is unconstrained by | 2648 | * Find the true preferred zone if the allocation is unconstrained by |
| 2690 | * cpusets. | 2649 | * cpusets. |
| 2691 | */ | 2650 | */ |
| 2692 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { | 2651 | if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { |
| 2693 | struct zoneref *preferred_zoneref; | 2652 | struct zoneref *preferred_zoneref; |
| 2694 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, | 2653 | preferred_zoneref = first_zones_zonelist(ac->zonelist, |
| 2695 | NULL, &preferred_zone); | 2654 | ac->high_zoneidx, NULL, &ac->preferred_zone); |
| 2696 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2655 | ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); |
| 2697 | } | 2656 | } |
| 2698 | 2657 | ||
| 2699 | /* This is the last chance, in general, before the goto nopage. */ | 2658 | /* This is the last chance, in general, before the goto nopage. */ |
| 2700 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2659 | page = get_page_from_freelist(gfp_mask, order, |
| 2701 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2660 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
| 2702 | preferred_zone, classzone_idx, migratetype); | ||
| 2703 | if (page) | 2661 | if (page) |
| 2704 | goto got_pg; | 2662 | goto got_pg; |
| 2705 | 2663 | ||
| @@ -2710,11 +2668,10 @@ retry: | |||
| 2710 | * the allocation is high priority and these type of | 2668 | * the allocation is high priority and these type of |
| 2711 | * allocations are system rather than user orientated | 2669 | * allocations are system rather than user orientated |
| 2712 | */ | 2670 | */ |
| 2713 | zonelist = node_zonelist(numa_node_id(), gfp_mask); | 2671 | ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); |
| 2672 | |||
| 2673 | page = __alloc_pages_high_priority(gfp_mask, order, ac); | ||
| 2714 | 2674 | ||
| 2715 | page = __alloc_pages_high_priority(gfp_mask, order, | ||
| 2716 | zonelist, high_zoneidx, nodemask, | ||
| 2717 | preferred_zone, classzone_idx, migratetype); | ||
| 2718 | if (page) { | 2675 | if (page) { |
| 2719 | goto got_pg; | 2676 | goto got_pg; |
| 2720 | } | 2677 | } |
| @@ -2743,11 +2700,9 @@ retry: | |||
| 2743 | * Try direct compaction. The first pass is asynchronous. Subsequent | 2700 | * Try direct compaction. The first pass is asynchronous. Subsequent |
| 2744 | * attempts after direct reclaim are synchronous | 2701 | * attempts after direct reclaim are synchronous |
| 2745 | */ | 2702 | */ |
| 2746 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, | 2703 | page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, |
| 2747 | high_zoneidx, nodemask, alloc_flags, | 2704 | migration_mode, |
| 2748 | preferred_zone, | 2705 | &contended_compaction, |
| 2749 | classzone_idx, migratetype, | ||
| 2750 | migration_mode, &contended_compaction, | ||
| 2751 | &deferred_compaction); | 2706 | &deferred_compaction); |
| 2752 | if (page) | 2707 | if (page) |
| 2753 | goto got_pg; | 2708 | goto got_pg; |
| @@ -2793,12 +2748,8 @@ retry: | |||
| 2793 | migration_mode = MIGRATE_SYNC_LIGHT; | 2748 | migration_mode = MIGRATE_SYNC_LIGHT; |
| 2794 | 2749 | ||
| 2795 | /* Try direct reclaim and then allocating */ | 2750 | /* Try direct reclaim and then allocating */ |
| 2796 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2751 | page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, |
| 2797 | zonelist, high_zoneidx, | 2752 | &did_some_progress); |
| 2798 | nodemask, | ||
| 2799 | alloc_flags, preferred_zone, | ||
| 2800 | classzone_idx, migratetype, | ||
| 2801 | &did_some_progress); | ||
| 2802 | if (page) | 2753 | if (page) |
| 2803 | goto got_pg; | 2754 | goto got_pg; |
| 2804 | 2755 | ||
| @@ -2812,17 +2763,15 @@ retry: | |||
| 2812 | * start OOM killing tasks. | 2763 | * start OOM killing tasks. |
| 2813 | */ | 2764 | */ |
| 2814 | if (!did_some_progress) { | 2765 | if (!did_some_progress) { |
| 2815 | page = __alloc_pages_may_oom(gfp_mask, order, zonelist, | 2766 | page = __alloc_pages_may_oom(gfp_mask, order, ac, |
| 2816 | high_zoneidx, nodemask, | 2767 | &did_some_progress); |
| 2817 | preferred_zone, classzone_idx, | ||
| 2818 | migratetype,&did_some_progress); | ||
| 2819 | if (page) | 2768 | if (page) |
| 2820 | goto got_pg; | 2769 | goto got_pg; |
| 2821 | if (!did_some_progress) | 2770 | if (!did_some_progress) |
| 2822 | goto nopage; | 2771 | goto nopage; |
| 2823 | } | 2772 | } |
| 2824 | /* Wait for some write requests to complete then retry */ | 2773 | /* Wait for some write requests to complete then retry */ |
| 2825 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2774 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); |
| 2826 | goto retry; | 2775 | goto retry; |
| 2827 | } else { | 2776 | } else { |
| 2828 | /* | 2777 | /* |
| @@ -2830,11 +2779,9 @@ retry: | |||
| 2830 | * direct reclaim and reclaim/compaction depends on compaction | 2779 | * direct reclaim and reclaim/compaction depends on compaction |
| 2831 | * being called after reclaim so call directly if necessary | 2780 | * being called after reclaim so call directly if necessary |
| 2832 | */ | 2781 | */ |
| 2833 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, | 2782 | page = __alloc_pages_direct_compact(gfp_mask, order, |
| 2834 | high_zoneidx, nodemask, alloc_flags, | 2783 | alloc_flags, ac, migration_mode, |
| 2835 | preferred_zone, | 2784 | &contended_compaction, |
| 2836 | classzone_idx, migratetype, | ||
| 2837 | migration_mode, &contended_compaction, | ||
| 2838 | &deferred_compaction); | 2785 | &deferred_compaction); |
| 2839 | if (page) | 2786 | if (page) |
| 2840 | goto got_pg; | 2787 | goto got_pg; |
| @@ -2842,11 +2789,7 @@ retry: | |||
| 2842 | 2789 | ||
| 2843 | nopage: | 2790 | nopage: |
| 2844 | warn_alloc_failed(gfp_mask, order, NULL); | 2791 | warn_alloc_failed(gfp_mask, order, NULL); |
| 2845 | return page; | ||
| 2846 | got_pg: | 2792 | got_pg: |
| 2847 | if (kmemcheck_enabled) | ||
| 2848 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
| 2849 | |||
| 2850 | return page; | 2793 | return page; |
| 2851 | } | 2794 | } |
| 2852 | 2795 | ||
| @@ -2857,14 +2800,16 @@ struct page * | |||
| 2857 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | 2800 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, |
| 2858 | struct zonelist *zonelist, nodemask_t *nodemask) | 2801 | struct zonelist *zonelist, nodemask_t *nodemask) |
| 2859 | { | 2802 | { |
| 2860 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
| 2861 | struct zone *preferred_zone; | ||
| 2862 | struct zoneref *preferred_zoneref; | 2803 | struct zoneref *preferred_zoneref; |
| 2863 | struct page *page = NULL; | 2804 | struct page *page = NULL; |
| 2864 | int migratetype = gfpflags_to_migratetype(gfp_mask); | ||
| 2865 | unsigned int cpuset_mems_cookie; | 2805 | unsigned int cpuset_mems_cookie; |
| 2866 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; | 2806 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
| 2867 | int classzone_idx; | 2807 | gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ |
| 2808 | struct alloc_context ac = { | ||
| 2809 | .high_zoneidx = gfp_zone(gfp_mask), | ||
| 2810 | .nodemask = nodemask, | ||
| 2811 | .migratetype = gfpflags_to_migratetype(gfp_mask), | ||
| 2812 | }; | ||
| 2868 | 2813 | ||
| 2869 | gfp_mask &= gfp_allowed_mask; | 2814 | gfp_mask &= gfp_allowed_mask; |
| 2870 | 2815 | ||
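The page_alloc.c refactoring threads a single struct alloc_context through the slow path in place of the six zonelist/nodemask/zone/idx/migratetype parameters. Its definition lives in mm/internal.h and is not part of this diff; inferred from the fields used in the hunks above, it is roughly:

    /* Sketch inferred from usage; see mm/internal.h for the real definition. */
    struct alloc_context {
            struct zonelist *zonelist;      /* may be rewritten by the slow path */
            nodemask_t *nodemask;
            struct zone *preferred_zone;    /* used for stats and fairness */
            int classzone_idx;
            int migratetype;
            enum zone_type high_zoneidx;
    };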
| @@ -2883,37 +2828,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
| 2883 | if (unlikely(!zonelist->_zonerefs->zone)) | 2828 | if (unlikely(!zonelist->_zonerefs->zone)) |
| 2884 | return NULL; | 2829 | return NULL; |
| 2885 | 2830 | ||
| 2886 | if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) | 2831 | if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) |
| 2887 | alloc_flags |= ALLOC_CMA; | 2832 | alloc_flags |= ALLOC_CMA; |
| 2888 | 2833 | ||
| 2889 | retry_cpuset: | 2834 | retry_cpuset: |
| 2890 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2835 | cpuset_mems_cookie = read_mems_allowed_begin(); |
| 2891 | 2836 | ||
| 2837 | /* We set it here, as __alloc_pages_slowpath might have changed it */ | ||
| 2838 | ac.zonelist = zonelist; | ||
| 2892 | /* The preferred zone is used for statistics later */ | 2839 | /* The preferred zone is used for statistics later */ |
| 2893 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, | 2840 | preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, |
| 2894 | nodemask ? : &cpuset_current_mems_allowed, | 2841 | ac.nodemask ? : &cpuset_current_mems_allowed, |
| 2895 | &preferred_zone); | 2842 | &ac.preferred_zone); |
| 2896 | if (!preferred_zone) | 2843 | if (!ac.preferred_zone) |
| 2897 | goto out; | 2844 | goto out; |
| 2898 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2845 | ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); |
| 2899 | 2846 | ||
| 2900 | /* First allocation attempt */ | 2847 | /* First allocation attempt */ |
| 2901 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2848 | alloc_mask = gfp_mask|__GFP_HARDWALL; |
| 2902 | zonelist, high_zoneidx, alloc_flags, | 2849 | page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); |
| 2903 | preferred_zone, classzone_idx, migratetype); | ||
| 2904 | if (unlikely(!page)) { | 2850 | if (unlikely(!page)) { |
| 2905 | /* | 2851 | /* |
| 2906 | * Runtime PM, block IO and its error handling path | 2852 | * Runtime PM, block IO and its error handling path |
| 2907 | * can deadlock because I/O on the device might not | 2853 | * can deadlock because I/O on the device might not |
| 2908 | * complete. | 2854 | * complete. |
| 2909 | */ | 2855 | */ |
| 2910 | gfp_mask = memalloc_noio_flags(gfp_mask); | 2856 | alloc_mask = memalloc_noio_flags(gfp_mask); |
| 2911 | page = __alloc_pages_slowpath(gfp_mask, order, | 2857 | |
| 2912 | zonelist, high_zoneidx, nodemask, | 2858 | page = __alloc_pages_slowpath(alloc_mask, order, &ac); |
| 2913 | preferred_zone, classzone_idx, migratetype); | ||
| 2914 | } | 2859 | } |
| 2915 | 2860 | ||
| 2916 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2861 | if (kmemcheck_enabled && page) |
| 2862 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
| 2863 | |||
| 2864 | trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); | ||
| 2917 | 2865 | ||
| 2918 | out: | 2866 | out: |
| 2919 | /* | 2867 | /* |
| @@ -3933,18 +3881,29 @@ static int __build_all_zonelists(void *data) | |||
| 3933 | return 0; | 3881 | return 0; |
| 3934 | } | 3882 | } |
| 3935 | 3883 | ||
| 3884 | static noinline void __init | ||
| 3885 | build_all_zonelists_init(void) | ||
| 3886 | { | ||
| 3887 | __build_all_zonelists(NULL); | ||
| 3888 | mminit_verify_zonelist(); | ||
| 3889 | cpuset_init_current_mems_allowed(); | ||
| 3890 | } | ||
| 3891 | |||
| 3936 | /* | 3892 | /* |
| 3937 | * Called with zonelists_mutex held always | 3893 | * Called with zonelists_mutex held always |
| 3938 | * unless system_state == SYSTEM_BOOTING. | 3894 | * unless system_state == SYSTEM_BOOTING. |
| 3895 | * | ||
| 3896 | * __ref due to (1) call of __meminit annotated setup_zone_pageset | ||
| 3897 | * [we're only called with non-NULL zone through __meminit paths] and | ||
| 3898 | * (2) call of __init annotated helper build_all_zonelists_init | ||
| 3899 | * [protected by SYSTEM_BOOTING]. | ||
| 3939 | */ | 3900 | */ |
| 3940 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | 3901 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) |
| 3941 | { | 3902 | { |
| 3942 | set_zonelist_order(); | 3903 | set_zonelist_order(); |
| 3943 | 3904 | ||
| 3944 | if (system_state == SYSTEM_BOOTING) { | 3905 | if (system_state == SYSTEM_BOOTING) { |
| 3945 | __build_all_zonelists(NULL); | 3906 | build_all_zonelists_init(); |
| 3946 | mminit_verify_zonelist(); | ||
| 3947 | cpuset_init_current_mems_allowed(); | ||
| 3948 | } else { | 3907 | } else { |
| 3949 | #ifdef CONFIG_MEMORY_HOTPLUG | 3908 | #ifdef CONFIG_MEMORY_HOTPLUG |
| 3950 | if (zone) | 3909 | if (zone) |
| @@ -5047,8 +5006,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
| 5047 | pgdat->node_start_pfn = node_start_pfn; | 5006 | pgdat->node_start_pfn = node_start_pfn; |
| 5048 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 5007 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
| 5049 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 5008 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
| 5050 | printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, | 5009 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
| 5051 | (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); | 5010 | (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); |
| 5052 | #endif | 5011 | #endif |
| 5053 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, | 5012 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, |
| 5054 | zones_size, zholes_size); | 5013 | zones_size, zholes_size); |
| @@ -5420,9 +5379,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 5420 | arch_zone_highest_possible_pfn[i]) | 5379 | arch_zone_highest_possible_pfn[i]) |
| 5421 | pr_cont("empty\n"); | 5380 | pr_cont("empty\n"); |
| 5422 | else | 5381 | else |
| 5423 | pr_cont("[mem %0#10lx-%0#10lx]\n", | 5382 | pr_cont("[mem %#018Lx-%#018Lx]\n", |
| 5424 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, | 5383 | (u64)arch_zone_lowest_possible_pfn[i] |
| 5425 | (arch_zone_highest_possible_pfn[i] | 5384 | << PAGE_SHIFT, |
| 5385 | ((u64)arch_zone_highest_possible_pfn[i] | ||
| 5426 | << PAGE_SHIFT) - 1); | 5386 | << PAGE_SHIFT) - 1); |
| 5427 | } | 5387 | } |
| 5428 | 5388 | ||
| @@ -5430,15 +5390,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 5430 | pr_info("Movable zone start for each node\n"); | 5390 | pr_info("Movable zone start for each node\n"); |
| 5431 | for (i = 0; i < MAX_NUMNODES; i++) { | 5391 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 5432 | if (zone_movable_pfn[i]) | 5392 | if (zone_movable_pfn[i]) |
| 5433 | pr_info(" Node %d: %#010lx\n", i, | 5393 | pr_info(" Node %d: %#018Lx\n", i, |
| 5434 | zone_movable_pfn[i] << PAGE_SHIFT); | 5394 | (u64)zone_movable_pfn[i] << PAGE_SHIFT); |
| 5435 | } | 5395 | } |
| 5436 | 5396 | ||
| 5437 | /* Print out the early node map */ | 5397 | /* Print out the early node map */ |
| 5438 | pr_info("Early memory node ranges\n"); | 5398 | pr_info("Early memory node ranges\n"); |
| 5439 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 5399 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
| 5440 | pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 5400 | pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, |
| 5441 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | 5401 | (u64)start_pfn << PAGE_SHIFT, |
| 5402 | ((u64)end_pfn << PAGE_SHIFT) - 1); | ||
| 5442 | 5403 | ||
| 5443 | /* Initialise every node */ | 5404 | /* Initialise every node */ |
| 5444 | mminit_verify_pageflags_layout(); | 5405 | mminit_verify_pageflags_layout(); |
diff --git a/mm/page_counter.c b/mm/page_counter.c index a009574fbba9..11b4beda14ba 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c | |||
| @@ -166,18 +166,19 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) | |||
| 166 | /** | 166 | /** |
| 167 | * page_counter_memparse - memparse() for page counter limits | 167 | * page_counter_memparse - memparse() for page counter limits |
| 168 | * @buf: string to parse | 168 | * @buf: string to parse |
| 169 | * @max: string meaning maximum possible value | ||
| 169 | * @nr_pages: returns the result in number of pages | 170 | * @nr_pages: returns the result in number of pages |
| 170 | * | 171 | * |
| 171 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be | 172 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be |
| 172 | * limited to %PAGE_COUNTER_MAX. | 173 | * limited to %PAGE_COUNTER_MAX. |
| 173 | */ | 174 | */ |
| 174 | int page_counter_memparse(const char *buf, unsigned long *nr_pages) | 175 | int page_counter_memparse(const char *buf, const char *max, |
| 176 | unsigned long *nr_pages) | ||
| 175 | { | 177 | { |
| 176 | char unlimited[] = "-1"; | ||
| 177 | char *end; | 178 | char *end; |
| 178 | u64 bytes; | 179 | u64 bytes; |
| 179 | 180 | ||
| 180 | if (!strncmp(buf, unlimited, sizeof(unlimited))) { | 181 | if (!strcmp(buf, max)) { |
| 181 | *nr_pages = PAGE_COUNTER_MAX; | 182 | *nr_pages = PAGE_COUNTER_MAX; |
| 182 | return 0; | 183 | return 0; |
| 183 | } | 184 | } |
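With the new max argument, each caller of page_counter_memparse() chooses its own sentinel string for "no limit" (for example "-1" to preserve the old spelling, or "max"). A hedged usage sketch for a hypothetical limit-write handler (example_set_limit is not a real kernel function; only the two page_counter calls are taken from this API):

    static int example_set_limit(struct page_counter *counter, const char *buf)
    {
            unsigned long nr_pages;
            int ret;

            /* "-1" means unlimited and yields PAGE_COUNTER_MAX pages. */
            ret = page_counter_memparse(buf, "-1", &nr_pages);
            if (ret)
                    return ret;             /* -EINVAL on malformed input */

            return page_counter_limit(counter, nr_pages);
    }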
diff --git a/mm/page_io.c b/mm/page_io.c index 955db8b0d497..e6045804c8d8 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -269,14 +269,9 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, | |||
| 269 | .bv_len = PAGE_SIZE, | 269 | .bv_len = PAGE_SIZE, |
| 270 | .bv_offset = 0 | 270 | .bv_offset = 0 |
| 271 | }; | 271 | }; |
| 272 | struct iov_iter from = { | 272 | struct iov_iter from; |
| 273 | .type = ITER_BVEC | WRITE, | ||
| 274 | .count = PAGE_SIZE, | ||
| 275 | .iov_offset = 0, | ||
| 276 | .nr_segs = 1, | ||
| 277 | }; | ||
| 278 | from.bvec = &bv; /* older gcc versions are broken */ | ||
| 279 | 273 | ||
| 274 | iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); | ||
| 280 | init_sync_kiocb(&kiocb, swap_file); | 275 | init_sync_kiocb(&kiocb, swap_file); |
| 281 | kiocb.ki_pos = page_file_offset(page); | 276 | kiocb.ki_pos = page_file_offset(page); |
| 282 | kiocb.ki_nbytes = PAGE_SIZE; | 277 | kiocb.ki_nbytes = PAGE_SIZE; |
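A sketch of the iterator setup the hunk switches to, assuming the surrounding swap-out context; the helper name and page parameter are illustrative. iov_iter_bvec() takes the direction flags, the bio_vec array, the segment count and the total byte count, so the open-coded field assignments (and the old-gcc workaround) are no longer needed.

static void swap_iter_for_page(struct iov_iter *from, struct bio_vec *bv,
                               struct page *page)
{
        bv->bv_page   = page;
        bv->bv_len    = PAGE_SIZE;
        bv->bv_offset = 0;

        /* direction, segment array, nr_segs, byte count -- all in one call */
        iov_iter_bvec(from, ITER_BVEC | WRITE, bv, 1, PAGE_SIZE);
}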
diff --git a/mm/page_owner.c b/mm/page_owner.c index 9ab4a9b5bc09..0993f5f36b01 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c | |||
| @@ -59,20 +59,19 @@ void __reset_page_owner(struct page *page, unsigned int order) | |||
| 59 | 59 | ||
| 60 | void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) | 60 | void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) |
| 61 | { | 61 | { |
| 62 | struct page_ext *page_ext; | 62 | struct page_ext *page_ext = lookup_page_ext(page); |
| 63 | struct stack_trace *trace; | 63 | struct stack_trace trace = { |
| 64 | 64 | .nr_entries = 0, | |
| 65 | page_ext = lookup_page_ext(page); | 65 | .max_entries = ARRAY_SIZE(page_ext->trace_entries), |
| 66 | .entries = &page_ext->trace_entries[0], | ||
| 67 | .skip = 3, | ||
| 68 | }; | ||
| 66 | 69 | ||
| 67 | trace = &page_ext->trace; | 70 | save_stack_trace(&trace); |
| 68 | trace->nr_entries = 0; | ||
| 69 | trace->max_entries = ARRAY_SIZE(page_ext->trace_entries); | ||
| 70 | trace->entries = &page_ext->trace_entries[0]; | ||
| 71 | trace->skip = 3; | ||
| 72 | save_stack_trace(&page_ext->trace); | ||
| 73 | 71 | ||
| 74 | page_ext->order = order; | 72 | page_ext->order = order; |
| 75 | page_ext->gfp_mask = gfp_mask; | 73 | page_ext->gfp_mask = gfp_mask; |
| 74 | page_ext->nr_entries = trace.nr_entries; | ||
| 76 | 75 | ||
| 77 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | 76 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); |
| 78 | } | 77 | } |
| @@ -84,6 +83,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
| 84 | int ret; | 83 | int ret; |
| 85 | int pageblock_mt, page_mt; | 84 | int pageblock_mt, page_mt; |
| 86 | char *kbuf; | 85 | char *kbuf; |
| 86 | struct stack_trace trace = { | ||
| 87 | .nr_entries = page_ext->nr_entries, | ||
| 88 | .entries = &page_ext->trace_entries[0], | ||
| 89 | }; | ||
| 87 | 90 | ||
| 88 | kbuf = kmalloc(count, GFP_KERNEL); | 91 | kbuf = kmalloc(count, GFP_KERNEL); |
| 89 | if (!kbuf) | 92 | if (!kbuf) |
| @@ -121,8 +124,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
| 121 | if (ret >= count) | 124 | if (ret >= count) |
| 122 | goto err; | 125 | goto err; |
| 123 | 126 | ||
| 124 | ret += snprint_stack_trace(kbuf + ret, count - ret, | 127 | ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); |
| 125 | &page_ext->trace, 0); | ||
| 126 | if (ret >= count) | 128 | if (ret >= count) |
| 127 | goto err; | 129 | goto err; |
| 128 | 130 | ||
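The pattern above keeps only the raw entries and their count in struct page_ext and rebuilds a struct stack_trace on the stack whenever a trace is captured or printed. A minimal capture-and-dump sketch of that pattern, assuming the usual stacktrace includes (the function name and array size are illustrative):

static void capture_and_dump_trace(void)
{
        unsigned long entries[8];
        struct stack_trace trace = {
                .max_entries = ARRAY_SIZE(entries),
                .entries     = entries,
                .skip        = 2,       /* skip the capture helpers themselves */
        };

        save_stack_trace(&trace);       /* fills entries[], sets nr_entries */
        print_stack_trace(&trace, 0);   /* dump the captured frames to the log */
}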
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b264bda46e1b..75c1f2878519 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
| @@ -35,7 +35,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, | |||
| 35 | do { | 35 | do { |
| 36 | again: | 36 | again: |
| 37 | next = pmd_addr_end(addr, end); | 37 | next = pmd_addr_end(addr, end); |
| 38 | if (pmd_none(*pmd)) { | 38 | if (pmd_none(*pmd) || !walk->vma) { |
| 39 | if (walk->pte_hole) | 39 | if (walk->pte_hole) |
| 40 | err = walk->pte_hole(addr, next, walk); | 40 | err = walk->pte_hole(addr, next, walk); |
| 41 | if (err) | 41 | if (err) |
| @@ -59,7 +59,7 @@ again: | |||
| 59 | continue; | 59 | continue; |
| 60 | 60 | ||
| 61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); | 61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); |
| 62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 62 | if (pmd_trans_unstable(pmd)) |
| 63 | goto again; | 63 | goto again; |
| 64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
| 65 | if (err) | 65 | if (err) |
| @@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
| 86 | break; | 86 | break; |
| 87 | continue; | 87 | continue; |
| 88 | } | 88 | } |
| 89 | if (walk->pud_entry) | 89 | if (walk->pmd_entry || walk->pte_entry) |
| 90 | err = walk->pud_entry(pud, addr, next, walk); | ||
| 91 | if (!err && (walk->pmd_entry || walk->pte_entry)) | ||
| 92 | err = walk_pmd_range(pud, addr, next, walk); | 90 | err = walk_pmd_range(pud, addr, next, walk); |
| 93 | if (err) | 91 | if (err) |
| 94 | break; | 92 | break; |
| @@ -97,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
| 97 | return err; | 95 | return err; |
| 98 | } | 96 | } |
| 99 | 97 | ||
| 98 | static int walk_pgd_range(unsigned long addr, unsigned long end, | ||
| 99 | struct mm_walk *walk) | ||
| 100 | { | ||
| 101 | pgd_t *pgd; | ||
| 102 | unsigned long next; | ||
| 103 | int err = 0; | ||
| 104 | |||
| 105 | pgd = pgd_offset(walk->mm, addr); | ||
| 106 | do { | ||
| 107 | next = pgd_addr_end(addr, end); | ||
| 108 | if (pgd_none_or_clear_bad(pgd)) { | ||
| 109 | if (walk->pte_hole) | ||
| 110 | err = walk->pte_hole(addr, next, walk); | ||
| 111 | if (err) | ||
| 112 | break; | ||
| 113 | continue; | ||
| 114 | } | ||
| 115 | if (walk->pmd_entry || walk->pte_entry) | ||
| 116 | err = walk_pud_range(pgd, addr, next, walk); | ||
| 117 | if (err) | ||
| 118 | break; | ||
| 119 | } while (pgd++, addr = next, addr != end); | ||
| 120 | |||
| 121 | return err; | ||
| 122 | } | ||
| 123 | |||
| 100 | #ifdef CONFIG_HUGETLB_PAGE | 124 | #ifdef CONFIG_HUGETLB_PAGE |
| 101 | static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, | 125 | static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, |
| 102 | unsigned long end) | 126 | unsigned long end) |
| @@ -105,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, | |||
| 105 | return boundary < end ? boundary : end; | 129 | return boundary < end ? boundary : end; |
| 106 | } | 130 | } |
| 107 | 131 | ||
| 108 | static int walk_hugetlb_range(struct vm_area_struct *vma, | 132 | static int walk_hugetlb_range(unsigned long addr, unsigned long end, |
| 109 | unsigned long addr, unsigned long end, | ||
| 110 | struct mm_walk *walk) | 133 | struct mm_walk *walk) |
| 111 | { | 134 | { |
| 135 | struct vm_area_struct *vma = walk->vma; | ||
| 112 | struct hstate *h = hstate_vma(vma); | 136 | struct hstate *h = hstate_vma(vma); |
| 113 | unsigned long next; | 137 | unsigned long next; |
| 114 | unsigned long hmask = huge_page_mask(h); | 138 | unsigned long hmask = huge_page_mask(h); |
| @@ -121,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
| 121 | if (pte && walk->hugetlb_entry) | 145 | if (pte && walk->hugetlb_entry) |
| 122 | err = walk->hugetlb_entry(pte, hmask, addr, next, walk); | 146 | err = walk->hugetlb_entry(pte, hmask, addr, next, walk); |
| 123 | if (err) | 147 | if (err) |
| 124 | return err; | 148 | break; |
| 125 | } while (addr = next, addr != end); | 149 | } while (addr = next, addr != end); |
| 126 | 150 | ||
| 127 | return 0; | 151 | return err; |
| 128 | } | 152 | } |
| 129 | 153 | ||
| 130 | #else /* CONFIG_HUGETLB_PAGE */ | 154 | #else /* CONFIG_HUGETLB_PAGE */ |
| 131 | static int walk_hugetlb_range(struct vm_area_struct *vma, | 155 | static int walk_hugetlb_range(unsigned long addr, unsigned long end, |
| 132 | unsigned long addr, unsigned long end, | ||
| 133 | struct mm_walk *walk) | 156 | struct mm_walk *walk) |
| 134 | { | 157 | { |
| 135 | return 0; | 158 | return 0; |
| @@ -137,115 +160,138 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
| 137 | 160 | ||
| 138 | #endif /* CONFIG_HUGETLB_PAGE */ | 161 | #endif /* CONFIG_HUGETLB_PAGE */ |
| 139 | 162 | ||
| 163 | /* | ||
| 164 | * Decide whether we really walk over the current vma on [@start, @end) | ||
| 165 | * or skip it via the returned value. Return 0 if we do walk over the | ||
| 166 | * current vma, and return 1 if we skip the vma. A negative value means | ||
| 167 | * an error, in which case we abort the current walk. | ||
| 168 | */ | ||
| 169 | static int walk_page_test(unsigned long start, unsigned long end, | ||
| 170 | struct mm_walk *walk) | ||
| 171 | { | ||
| 172 | struct vm_area_struct *vma = walk->vma; | ||
| 173 | |||
| 174 | if (walk->test_walk) | ||
| 175 | return walk->test_walk(start, end, walk); | ||
| 176 | |||
| 177 | /* | ||
| 178 | * A vma with VM_PFNMAP has no valid struct pages behind its range, so | ||
| 179 | * we don't walk over it as we do for normal vmas. However, some callers | ||
| 180 | * are interested in handling hole ranges and don't want any single | ||
| 181 | * address range to be silently ignored. Such callers define their | ||
| 182 | * ->pte_hole() callbacks, so let's delegate vma(VM_PFNMAP) handling | ||
| 183 | * to them. | ||
| 184 | */ | ||
| 185 | if (vma->vm_flags & VM_PFNMAP) { | ||
| 186 | int err = 1; | ||
| 187 | if (walk->pte_hole) | ||
| 188 | err = walk->pte_hole(start, end, walk); | ||
| 189 | return err ? err : 1; | ||
| 190 | } | ||
| 191 | return 0; | ||
| 192 | } | ||
| 193 | |||
| 194 | static int __walk_page_range(unsigned long start, unsigned long end, | ||
| 195 | struct mm_walk *walk) | ||
| 196 | { | ||
| 197 | int err = 0; | ||
| 198 | struct vm_area_struct *vma = walk->vma; | ||
| 199 | |||
| 200 | if (vma && is_vm_hugetlb_page(vma)) { | ||
| 201 | if (walk->hugetlb_entry) | ||
| 202 | err = walk_hugetlb_range(start, end, walk); | ||
| 203 | } else | ||
| 204 | err = walk_pgd_range(start, end, walk); | ||
| 140 | 205 | ||
| 206 | return err; | ||
| 207 | } | ||
| 141 | 208 | ||
| 142 | /** | 209 | /** |
| 143 | * walk_page_range - walk a memory map's page tables with a callback | 210 | * walk_page_range - walk the page table with caller-specific callbacks |
| 144 | * @addr: starting address | ||
| 145 | * @end: ending address | ||
| 146 | * @walk: set of callbacks to invoke for each level of the tree | ||
| 147 | * | 211 | * |
| 148 | * Recursively walk the page table for the memory area in a VMA, | 212 | * Recursively walk the page table tree of the process represented by @walk->mm |
| 149 | * calling supplied callbacks. Callbacks are called in-order (first | 213 | * within the virtual address range [@start, @end). During walking, we can do |
| 150 | * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, | 214 | * some caller-specific work for each entry, by setting up pmd_entry(), |
| 151 | * etc.). If lower-level callbacks are omitted, walking depth is reduced. | 215 | * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these |
| 216 | * callbacks, the associated entries/pages are just ignored. | ||
| 217 | * The return values of these callbacks are commonly defined like below: | ||
| 218 | * - 0 : the current entry was handled successfully; if the end address | ||
| 219 | * has not been reached yet, continue the walk. | ||
| 220 | * - >0 : the current entry was handled successfully; return to the caller | ||
| 221 | * with this caller-specific value. | ||
| 222 | * - <0 : handling the current entry failed; return to the caller with | ||
| 223 | * this error code. | ||
| 152 | * | 224 | * |
| 153 | * Each callback receives an entry pointer and the start and end of the | 225 | * Before starting to walk the page table, some callers want to check whether |
| 154 | * associated range, and a copy of the original mm_walk for access to | 226 | * they really want to walk over the current vma, typically by checking |
| 155 | * the ->private or ->mm fields. | 227 | * its vm_flags. walk_page_test() and @walk->test_walk() are used for this |
| 228 | * purpose. | ||
| 156 | * | 229 | * |
| 157 | * Usually no locks are taken, but splitting transparent huge page may | 230 | * struct mm_walk keeps current values of some common data like vma and pmd, |
| 158 | * take page table lock. And the bottom level iterator will map PTE | 231 | * which are useful for access from callbacks. If you want to pass some |
| 159 | * directories from highmem if necessary. | 232 | * caller-specific data to callbacks, @walk->private should be helpful. |
| 160 | * | 233 | * |
| 161 | * If any callback returns a non-zero value, the walk is aborted and | 234 | * Locking: |
| 162 | * the return value is propagated back to the caller. Otherwise 0 is returned. | 235 | * Callers of walk_page_range() and walk_page_vma() should hold |
| 163 | * | 236 | * @walk->mm->mmap_sem, because these functions traverse the vma list and/or |
| 164 | * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry | 237 | * access vma data. |
| 165 | * is !NULL. | ||
| 166 | */ | 238 | */ |
| 167 | int walk_page_range(unsigned long addr, unsigned long end, | 239 | int walk_page_range(unsigned long start, unsigned long end, |
| 168 | struct mm_walk *walk) | 240 | struct mm_walk *walk) |
| 169 | { | 241 | { |
| 170 | pgd_t *pgd; | ||
| 171 | unsigned long next; | ||
| 172 | int err = 0; | 242 | int err = 0; |
| 243 | unsigned long next; | ||
| 244 | struct vm_area_struct *vma; | ||
| 173 | 245 | ||
| 174 | if (addr >= end) | 246 | if (start >= end) |
| 175 | return err; | 247 | return -EINVAL; |
| 176 | 248 | ||
| 177 | if (!walk->mm) | 249 | if (!walk->mm) |
| 178 | return -EINVAL; | 250 | return -EINVAL; |
| 179 | 251 | ||
| 180 | VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); | 252 | VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); |
| 181 | 253 | ||
| 182 | pgd = pgd_offset(walk->mm, addr); | 254 | vma = find_vma(walk->mm, start); |
| 183 | do { | 255 | do { |
| 184 | struct vm_area_struct *vma = NULL; | 256 | if (!vma) { /* after the last vma */ |
| 185 | 257 | walk->vma = NULL; | |
| 186 | next = pgd_addr_end(addr, end); | 258 | next = end; |
| 259 | } else if (start < vma->vm_start) { /* outside vma */ | ||
| 260 | walk->vma = NULL; | ||
| 261 | next = min(end, vma->vm_start); | ||
| 262 | } else { /* inside vma */ | ||
| 263 | walk->vma = vma; | ||
| 264 | next = min(end, vma->vm_end); | ||
| 265 | vma = vma->vm_next; | ||
| 187 | 266 | ||
| 188 | /* | 267 | err = walk_page_test(start, next, walk); |
| 189 | * This function was not intended to be vma based. | 268 | if (err > 0) |
| 190 | * But there are vma special cases to be handled: | ||
| 191 | * - hugetlb vma's | ||
| 192 | * - VM_PFNMAP vma's | ||
| 193 | */ | ||
| 194 | vma = find_vma(walk->mm, addr); | ||
| 195 | if (vma) { | ||
| 196 | /* | ||
| 197 | * There are no page structures backing a VM_PFNMAP | ||
| 198 | * range, so do not allow split_huge_page_pmd(). | ||
| 199 | */ | ||
| 200 | if ((vma->vm_start <= addr) && | ||
| 201 | (vma->vm_flags & VM_PFNMAP)) { | ||
| 202 | if (walk->pte_hole) | ||
| 203 | err = walk->pte_hole(addr, next, walk); | ||
| 204 | if (err) | ||
| 205 | break; | ||
| 206 | pgd = pgd_offset(walk->mm, next); | ||
| 207 | continue; | ||
| 208 | } | ||
| 209 | /* | ||
| 210 | * Handle hugetlb vma individually because pagetable | ||
| 211 | * walk for the hugetlb page is dependent on the | ||
| 212 | * architecture and we can't handled it in the same | ||
| 213 | * manner as non-huge pages. | ||
| 214 | */ | ||
| 215 | if (walk->hugetlb_entry && (vma->vm_start <= addr) && | ||
| 216 | is_vm_hugetlb_page(vma)) { | ||
| 217 | if (vma->vm_end < next) | ||
| 218 | next = vma->vm_end; | ||
| 219 | /* | ||
| 220 | * Hugepage is very tightly coupled with vma, | ||
| 221 | * so walk through hugetlb entries within a | ||
| 222 | * given vma. | ||
| 223 | */ | ||
| 224 | err = walk_hugetlb_range(vma, addr, next, walk); | ||
| 225 | if (err) | ||
| 226 | break; | ||
| 227 | pgd = pgd_offset(walk->mm, next); | ||
| 228 | continue; | 269 | continue; |
| 229 | } | 270 | if (err < 0) |
| 230 | } | ||
| 231 | |||
| 232 | if (pgd_none_or_clear_bad(pgd)) { | ||
| 233 | if (walk->pte_hole) | ||
| 234 | err = walk->pte_hole(addr, next, walk); | ||
| 235 | if (err) | ||
| 236 | break; | 271 | break; |
| 237 | pgd++; | ||
| 238 | continue; | ||
| 239 | } | 272 | } |
| 240 | if (walk->pgd_entry) | 273 | if (walk->vma || walk->pte_hole) |
| 241 | err = walk->pgd_entry(pgd, addr, next, walk); | 274 | err = __walk_page_range(start, next, walk); |
| 242 | if (!err && | ||
| 243 | (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) | ||
| 244 | err = walk_pud_range(pgd, addr, next, walk); | ||
| 245 | if (err) | 275 | if (err) |
| 246 | break; | 276 | break; |
| 247 | pgd++; | 277 | } while (start = next, start < end); |
| 248 | } while (addr = next, addr < end); | ||
| 249 | |||
| 250 | return err; | 278 | return err; |
| 251 | } | 279 | } |
| 280 | |||
| 281 | int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) | ||
| 282 | { | ||
| 283 | int err; | ||
| 284 | |||
| 285 | if (!walk->mm) | ||
| 286 | return -EINVAL; | ||
| 287 | |||
| 288 | VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); | ||
| 289 | VM_BUG_ON(!vma); | ||
| 290 | walk->vma = vma; | ||
| 291 | err = walk_page_test(vma->vm_start, vma->vm_end, walk); | ||
| 292 | if (err > 0) | ||
| 293 | return 0; | ||
| 294 | if (err < 0) | ||
| 295 | return err; | ||
| 296 | return __walk_page_range(vma->vm_start, vma->vm_end, walk); | ||
| 297 | } | ||
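A caller-side sketch of the reworked interface described in the comment above: only the callbacks that are set are invoked, walk->private carries caller data, and walk_page_vma() covers a single vma. The callback and counter names are illustrative; the caller must hold mmap_sem for read.

static int count_present_pte(pte_t *pte, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        unsigned long *count = walk->private;

        if (pte_present(*pte))
                (*count)++;
        return 0;                       /* 0: keep walking */
}

static unsigned long count_present_pages(struct vm_area_struct *vma)
{
        unsigned long count = 0;
        struct mm_walk walk = {
                .pte_entry = count_present_pte,
                .mm        = vma->vm_mm,
                .private   = &count,
        };

        walk_page_vma(vma, &walk);      /* mmap_sem held for read by caller */
        return count;
}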
diff --git a/mm/percpu.c b/mm/percpu.c index d39e2f4e335c..73c97a5f4495 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
| @@ -1528,7 +1528,6 @@ static void pcpu_dump_alloc_info(const char *lvl, | |||
| 1528 | int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | 1528 | int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, |
| 1529 | void *base_addr) | 1529 | void *base_addr) |
| 1530 | { | 1530 | { |
| 1531 | static char cpus_buf[4096] __initdata; | ||
| 1532 | static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; | 1531 | static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; |
| 1533 | static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; | 1532 | static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; |
| 1534 | size_t dyn_size = ai->dyn_size; | 1533 | size_t dyn_size = ai->dyn_size; |
| @@ -1541,12 +1540,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
| 1541 | int *unit_map; | 1540 | int *unit_map; |
| 1542 | int group, unit, i; | 1541 | int group, unit, i; |
| 1543 | 1542 | ||
| 1544 | cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); | ||
| 1545 | |||
| 1546 | #define PCPU_SETUP_BUG_ON(cond) do { \ | 1543 | #define PCPU_SETUP_BUG_ON(cond) do { \ |
| 1547 | if (unlikely(cond)) { \ | 1544 | if (unlikely(cond)) { \ |
| 1548 | pr_emerg("PERCPU: failed to initialize, %s", #cond); \ | 1545 | pr_emerg("PERCPU: failed to initialize, %s", #cond); \ |
| 1549 | pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ | 1546 | pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \ |
| 1547 | cpumask_pr_args(cpu_possible_mask)); \ | ||
| 1550 | pcpu_dump_alloc_info(KERN_EMERG, ai); \ | 1548 | pcpu_dump_alloc_info(KERN_EMERG, ai); \ |
| 1551 | BUG(); \ | 1549 | BUG(); \ |
| 1552 | } \ | 1550 | } \ |
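The %*pb specifier used above consumes a (number-of-bits, bitmap-pointer) pair, which is exactly what cpumask_pr_args() expands to, so no static scratch buffer is needed. A one-function sketch (the function name is illustrative):

static void report_possible_cpus(void)
{
        pr_info("possible cpus: %*pb\n", cpumask_pr_args(cpu_possible_mask));
        /* %*pbl prints the same mask as a range list, e.g. "0-3,8" */
        pr_info("as a range list: %*pbl\n", cpumask_pr_args(cpu_possible_mask));
}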
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index dfb79e028ecb..c25f94b33811 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
| @@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, | |||
| 193 | pmd_t *pmdp) | 193 | pmd_t *pmdp) |
| 194 | { | 194 | { |
| 195 | pmd_t entry = *pmdp; | 195 | pmd_t entry = *pmdp; |
| 196 | if (pmd_numa(entry)) | ||
| 197 | entry = pmd_mknonnuma(entry); | ||
| 198 | set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); | 196 | set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); |
| 199 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | 197 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); |
| 200 | } | 198 | } |
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 5077afcd9e11..b1597690530c 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c | |||
| @@ -99,11 +99,8 @@ static int process_vm_rw_single_vec(unsigned long addr, | |||
| 99 | size_t bytes; | 99 | size_t bytes; |
| 100 | 100 | ||
| 101 | /* Get the pages we're interested in */ | 101 | /* Get the pages we're interested in */ |
| 102 | down_read(&mm->mmap_sem); | 102 | pages = get_user_pages_unlocked(task, mm, pa, pages, |
| 103 | pages = get_user_pages(task, mm, pa, pages, | 103 | vm_write, 0, process_pages); |
| 104 | vm_write, 0, process_pages, NULL); | ||
| 105 | up_read(&mm->mmap_sem); | ||
| 106 | |||
| 107 | if (pages <= 0) | 104 | if (pages <= 0) |
| 108 | return -EFAULT; | 105 | return -EFAULT; |
| 109 | 106 | ||
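A sketch of the calling convention the hunk moves to: get_user_pages_unlocked() takes and drops mmap_sem internally, so the caller no longer brackets the call with down_read()/up_read(). The wrapper below is illustrative and pins for read only.

static long pin_remote_pages(struct task_struct *tsk, struct mm_struct *mm,
                             unsigned long addr, struct page **pages,
                             unsigned long nr_pages)
{
        /* no down_read(&mm->mmap_sem) here: the helper handles the locking */
        return get_user_pages_unlocked(tsk, mm, addr, nr_pages,
                                       0 /* !write */, 0 /* !force */, pages);
}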
diff --git a/mm/readahead.c b/mm/readahead.c index 17b9172ec37f..935675844b2e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -27,7 +27,7 @@ | |||
| 27 | void | 27 | void |
| 28 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) | 28 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) |
| 29 | { | 29 | { |
| 30 | ra->ra_pages = mapping->backing_dev_info->ra_pages; | 30 | ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; |
| 31 | ra->prev_pos = -1; | 31 | ra->prev_pos = -1; |
| 32 | } | 32 | } |
| 33 | EXPORT_SYMBOL_GPL(file_ra_state_init); | 33 | EXPORT_SYMBOL_GPL(file_ra_state_init); |
| @@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping, | |||
| 541 | /* | 541 | /* |
| 542 | * Defer asynchronous read-ahead on IO congestion. | 542 | * Defer asynchronous read-ahead on IO congestion. |
| 543 | */ | 543 | */ |
| 544 | if (bdi_read_congested(mapping->backing_dev_info)) | 544 | if (bdi_read_congested(inode_to_bdi(mapping->host))) |
| 545 | return; | 545 | return; |
| 546 | 546 | ||
| 547 | /* do read-ahead */ | 547 | /* do read-ahead */ |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -590,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
| 590 | if (!vma->anon_vma || !page__anon_vma || | 590 | if (!vma->anon_vma || !page__anon_vma || |
| 591 | vma->anon_vma->root != page__anon_vma->root) | 591 | vma->anon_vma->root != page__anon_vma->root) |
| 592 | return -EFAULT; | 592 | return -EFAULT; |
| 593 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | 593 | } else if (page->mapping) { |
| 594 | if (!vma->vm_file || | 594 | if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) |
| 595 | vma->vm_file->f_mapping != page->mapping) | ||
| 596 | return -EFAULT; | 595 | return -EFAULT; |
| 597 | } else | 596 | } else |
| 598 | return -EFAULT; | 597 | return -EFAULT; |
| @@ -1086,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page, | |||
| 1086 | void page_add_file_rmap(struct page *page) | 1085 | void page_add_file_rmap(struct page *page) |
| 1087 | { | 1086 | { |
| 1088 | struct mem_cgroup *memcg; | 1087 | struct mem_cgroup *memcg; |
| 1089 | unsigned long flags; | ||
| 1090 | bool locked; | ||
| 1091 | 1088 | ||
| 1092 | memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 1089 | memcg = mem_cgroup_begin_page_stat(page); |
| 1093 | if (atomic_inc_and_test(&page->_mapcount)) { | 1090 | if (atomic_inc_and_test(&page->_mapcount)) { |
| 1094 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1091 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
| 1095 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); | 1092 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); |
| 1096 | } | 1093 | } |
| 1097 | mem_cgroup_end_page_stat(memcg, &locked, &flags); | 1094 | mem_cgroup_end_page_stat(memcg); |
| 1098 | } | 1095 | } |
| 1099 | 1096 | ||
| 1100 | static void page_remove_file_rmap(struct page *page) | 1097 | static void page_remove_file_rmap(struct page *page) |
| 1101 | { | 1098 | { |
| 1102 | struct mem_cgroup *memcg; | 1099 | struct mem_cgroup *memcg; |
| 1103 | unsigned long flags; | ||
| 1104 | bool locked; | ||
| 1105 | 1100 | ||
| 1106 | memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 1101 | memcg = mem_cgroup_begin_page_stat(page); |
| 1107 | 1102 | ||
| 1108 | /* page still mapped by someone else? */ | 1103 | /* page still mapped by someone else? */ |
| 1109 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1104 | if (!atomic_add_negative(-1, &page->_mapcount)) |
| @@ -1124,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page) | |||
| 1124 | if (unlikely(PageMlocked(page))) | 1119 | if (unlikely(PageMlocked(page))) |
| 1125 | clear_page_mlock(page); | 1120 | clear_page_mlock(page); |
| 1126 | out: | 1121 | out: |
| 1127 | mem_cgroup_end_page_stat(memcg, &locked, &flags); | 1122 | mem_cgroup_end_page_stat(memcg); |
| 1128 | } | 1123 | } |
| 1129 | 1124 | ||
| 1130 | /** | 1125 | /** |
| @@ -1274,7 +1269,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 1274 | if (pte_soft_dirty(pteval)) | 1269 | if (pte_soft_dirty(pteval)) |
| 1275 | swp_pte = pte_swp_mksoft_dirty(swp_pte); | 1270 | swp_pte = pte_swp_mksoft_dirty(swp_pte); |
| 1276 | set_pte_at(mm, address, pte, swp_pte); | 1271 | set_pte_at(mm, address, pte, swp_pte); |
| 1277 | BUG_ON(pte_file(*pte)); | ||
| 1278 | } else if (IS_ENABLED(CONFIG_MIGRATION) && | 1272 | } else if (IS_ENABLED(CONFIG_MIGRATION) && |
| 1279 | (flags & TTU_MIGRATION)) { | 1273 | (flags & TTU_MIGRATION)) { |
| 1280 | /* Establish migration entry for a file page */ | 1274 | /* Establish migration entry for a file page */ |
| @@ -1316,211 +1310,6 @@ out_mlock: | |||
| 1316 | return ret; | 1310 | return ret; |
| 1317 | } | 1311 | } |
| 1318 | 1312 | ||
| 1319 | /* | ||
| 1320 | * objrmap doesn't work for nonlinear VMAs because the assumption that | ||
| 1321 | * offset-into-file correlates with offset-into-virtual-addresses does not hold. | ||
| 1322 | * Consequently, given a particular page and its ->index, we cannot locate the | ||
| 1323 | * ptes which are mapping that page without an exhaustive linear search. | ||
| 1324 | * | ||
| 1325 | * So what this code does is a mini "virtual scan" of each nonlinear VMA which | ||
| 1326 | * maps the file to which the target page belongs. The ->vm_private_data field | ||
| 1327 | * holds the current cursor into that scan. Successive searches will circulate | ||
| 1328 | * around the vma's virtual address space. | ||
| 1329 | * | ||
| 1330 | * So as more replacement pressure is applied to the pages in a nonlinear VMA, | ||
| 1331 | * more scanning pressure is placed against them as well. Eventually pages | ||
| 1332 | * will become fully unmapped and are eligible for eviction. | ||
| 1333 | * | ||
| 1334 | * For very sparsely populated VMAs this is a little inefficient - chances are | ||
| 1335 | * there there won't be many ptes located within the scan cluster. In this case | ||
| 1336 | * maybe we could scan further - to the end of the pte page, perhaps. | ||
| 1337 | * | ||
| 1338 | * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can | ||
| 1339 | * acquire it without blocking. If vma locked, mlock the pages in the cluster, | ||
| 1340 | * rather than unmapping them. If we encounter the "check_page" that vmscan is | ||
| 1341 | * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. | ||
| 1342 | */ | ||
| 1343 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) | ||
| 1344 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) | ||
| 1345 | |||
| 1346 | static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | ||
| 1347 | struct vm_area_struct *vma, struct page *check_page) | ||
| 1348 | { | ||
| 1349 | struct mm_struct *mm = vma->vm_mm; | ||
| 1350 | pmd_t *pmd; | ||
| 1351 | pte_t *pte; | ||
| 1352 | pte_t pteval; | ||
| 1353 | spinlock_t *ptl; | ||
| 1354 | struct page *page; | ||
| 1355 | unsigned long address; | ||
| 1356 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
| 1357 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
| 1358 | unsigned long end; | ||
| 1359 | int ret = SWAP_AGAIN; | ||
| 1360 | int locked_vma = 0; | ||
| 1361 | |||
| 1362 | address = (vma->vm_start + cursor) & CLUSTER_MASK; | ||
| 1363 | end = address + CLUSTER_SIZE; | ||
| 1364 | if (address < vma->vm_start) | ||
| 1365 | address = vma->vm_start; | ||
| 1366 | if (end > vma->vm_end) | ||
| 1367 | end = vma->vm_end; | ||
| 1368 | |||
| 1369 | pmd = mm_find_pmd(mm, address); | ||
| 1370 | if (!pmd) | ||
| 1371 | return ret; | ||
| 1372 | |||
| 1373 | mmun_start = address; | ||
| 1374 | mmun_end = end; | ||
| 1375 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
| 1376 | |||
| 1377 | /* | ||
| 1378 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, | ||
| 1379 | * keep the sem while scanning the cluster for mlocking pages. | ||
| 1380 | */ | ||
| 1381 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
| 1382 | locked_vma = (vma->vm_flags & VM_LOCKED); | ||
| 1383 | if (!locked_vma) | ||
| 1384 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ | ||
| 1385 | } | ||
| 1386 | |||
| 1387 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
| 1388 | |||
| 1389 | /* Update high watermark before we lower rss */ | ||
| 1390 | update_hiwater_rss(mm); | ||
| 1391 | |||
| 1392 | for (; address < end; pte++, address += PAGE_SIZE) { | ||
| 1393 | if (!pte_present(*pte)) | ||
| 1394 | continue; | ||
| 1395 | page = vm_normal_page(vma, address, *pte); | ||
| 1396 | BUG_ON(!page || PageAnon(page)); | ||
| 1397 | |||
| 1398 | if (locked_vma) { | ||
| 1399 | if (page == check_page) { | ||
| 1400 | /* we know we have check_page locked */ | ||
| 1401 | mlock_vma_page(page); | ||
| 1402 | ret = SWAP_MLOCK; | ||
| 1403 | } else if (trylock_page(page)) { | ||
| 1404 | /* | ||
| 1405 | * If we can lock the page, perform mlock. | ||
| 1406 | * Otherwise leave the page alone, it will be | ||
| 1407 | * eventually encountered again later. | ||
| 1408 | */ | ||
| 1409 | mlock_vma_page(page); | ||
| 1410 | unlock_page(page); | ||
| 1411 | } | ||
| 1412 | continue; /* don't unmap */ | ||
| 1413 | } | ||
| 1414 | |||
| 1415 | /* | ||
| 1416 | * No need for _notify because we're within an | ||
| 1417 | * mmu_notifier_invalidate_range_ {start|end} scope. | ||
| 1418 | */ | ||
| 1419 | if (ptep_clear_flush_young(vma, address, pte)) | ||
| 1420 | continue; | ||
| 1421 | |||
| 1422 | /* Nuke the page table entry. */ | ||
| 1423 | flush_cache_page(vma, address, pte_pfn(*pte)); | ||
| 1424 | pteval = ptep_clear_flush_notify(vma, address, pte); | ||
| 1425 | |||
| 1426 | /* If nonlinear, store the file page offset in the pte. */ | ||
| 1427 | if (page->index != linear_page_index(vma, address)) { | ||
| 1428 | pte_t ptfile = pgoff_to_pte(page->index); | ||
| 1429 | if (pte_soft_dirty(pteval)) | ||
| 1430 | ptfile = pte_file_mksoft_dirty(ptfile); | ||
| 1431 | set_pte_at(mm, address, pte, ptfile); | ||
| 1432 | } | ||
| 1433 | |||
| 1434 | /* Move the dirty bit to the physical page now the pte is gone. */ | ||
| 1435 | if (pte_dirty(pteval)) | ||
| 1436 | set_page_dirty(page); | ||
| 1437 | |||
| 1438 | page_remove_rmap(page); | ||
| 1439 | page_cache_release(page); | ||
| 1440 | dec_mm_counter(mm, MM_FILEPAGES); | ||
| 1441 | (*mapcount)--; | ||
| 1442 | } | ||
| 1443 | pte_unmap_unlock(pte - 1, ptl); | ||
| 1444 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
| 1445 | if (locked_vma) | ||
| 1446 | up_read(&vma->vm_mm->mmap_sem); | ||
| 1447 | return ret; | ||
| 1448 | } | ||
| 1449 | |||
| 1450 | static int try_to_unmap_nonlinear(struct page *page, | ||
| 1451 | struct address_space *mapping, void *arg) | ||
| 1452 | { | ||
| 1453 | struct vm_area_struct *vma; | ||
| 1454 | int ret = SWAP_AGAIN; | ||
| 1455 | unsigned long cursor; | ||
| 1456 | unsigned long max_nl_cursor = 0; | ||
| 1457 | unsigned long max_nl_size = 0; | ||
| 1458 | unsigned int mapcount; | ||
| 1459 | |||
| 1460 | list_for_each_entry(vma, | ||
| 1461 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
| 1462 | |||
| 1463 | cursor = (unsigned long) vma->vm_private_data; | ||
| 1464 | if (cursor > max_nl_cursor) | ||
| 1465 | max_nl_cursor = cursor; | ||
| 1466 | cursor = vma->vm_end - vma->vm_start; | ||
| 1467 | if (cursor > max_nl_size) | ||
| 1468 | max_nl_size = cursor; | ||
| 1469 | } | ||
| 1470 | |||
| 1471 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ | ||
| 1472 | return SWAP_FAIL; | ||
| 1473 | } | ||
| 1474 | |||
| 1475 | /* | ||
| 1476 | * We don't try to search for this page in the nonlinear vmas, | ||
| 1477 | * and page_referenced wouldn't have found it anyway. Instead | ||
| 1478 | * just walk the nonlinear vmas trying to age and unmap some. | ||
| 1479 | * The mapcount of the page we came in with is irrelevant, | ||
| 1480 | * but even so use it as a guide to how hard we should try? | ||
| 1481 | */ | ||
| 1482 | mapcount = page_mapcount(page); | ||
| 1483 | if (!mapcount) | ||
| 1484 | return ret; | ||
| 1485 | |||
| 1486 | cond_resched(); | ||
| 1487 | |||
| 1488 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | ||
| 1489 | if (max_nl_cursor == 0) | ||
| 1490 | max_nl_cursor = CLUSTER_SIZE; | ||
| 1491 | |||
| 1492 | do { | ||
| 1493 | list_for_each_entry(vma, | ||
| 1494 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
| 1495 | |||
| 1496 | cursor = (unsigned long) vma->vm_private_data; | ||
| 1497 | while (cursor < max_nl_cursor && | ||
| 1498 | cursor < vma->vm_end - vma->vm_start) { | ||
| 1499 | if (try_to_unmap_cluster(cursor, &mapcount, | ||
| 1500 | vma, page) == SWAP_MLOCK) | ||
| 1501 | ret = SWAP_MLOCK; | ||
| 1502 | cursor += CLUSTER_SIZE; | ||
| 1503 | vma->vm_private_data = (void *) cursor; | ||
| 1504 | if ((int)mapcount <= 0) | ||
| 1505 | return ret; | ||
| 1506 | } | ||
| 1507 | vma->vm_private_data = (void *) max_nl_cursor; | ||
| 1508 | } | ||
| 1509 | cond_resched(); | ||
| 1510 | max_nl_cursor += CLUSTER_SIZE; | ||
| 1511 | } while (max_nl_cursor <= max_nl_size); | ||
| 1512 | |||
| 1513 | /* | ||
| 1514 | * Don't loop forever (perhaps all the remaining pages are | ||
| 1515 | * in locked vmas). Reset cursor on all unreserved nonlinear | ||
| 1516 | * vmas, now forgetting on which ones it had fallen behind. | ||
| 1517 | */ | ||
| 1518 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) | ||
| 1519 | vma->vm_private_data = NULL; | ||
| 1520 | |||
| 1521 | return ret; | ||
| 1522 | } | ||
| 1523 | |||
| 1524 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1313 | bool is_vma_temporary_stack(struct vm_area_struct *vma) |
| 1525 | { | 1314 | { |
| 1526 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | 1315 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); |
| @@ -1566,7 +1355,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
| 1566 | .rmap_one = try_to_unmap_one, | 1355 | .rmap_one = try_to_unmap_one, |
| 1567 | .arg = (void *)flags, | 1356 | .arg = (void *)flags, |
| 1568 | .done = page_not_mapped, | 1357 | .done = page_not_mapped, |
| 1569 | .file_nonlinear = try_to_unmap_nonlinear, | ||
| 1570 | .anon_lock = page_lock_anon_vma_read, | 1358 | .anon_lock = page_lock_anon_vma_read, |
| 1571 | }; | 1359 | }; |
| 1572 | 1360 | ||
| @@ -1612,12 +1400,6 @@ int try_to_munlock(struct page *page) | |||
| 1612 | .rmap_one = try_to_unmap_one, | 1400 | .rmap_one = try_to_unmap_one, |
| 1613 | .arg = (void *)TTU_MUNLOCK, | 1401 | .arg = (void *)TTU_MUNLOCK, |
| 1614 | .done = page_not_mapped, | 1402 | .done = page_not_mapped, |
| 1615 | /* | ||
| 1616 | * We don't bother to try to find the munlocked page in | ||
| 1617 | * nonlinears. It's costly. Instead, later, page reclaim logic | ||
| 1618 | * may call try_to_unmap() and recover PG_mlocked lazily. | ||
| 1619 | */ | ||
| 1620 | .file_nonlinear = NULL, | ||
| 1621 | .anon_lock = page_lock_anon_vma_read, | 1403 | .anon_lock = page_lock_anon_vma_read, |
| 1622 | 1404 | ||
| 1623 | }; | 1405 | }; |
| @@ -1748,13 +1530,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
| 1748 | goto done; | 1530 | goto done; |
| 1749 | } | 1531 | } |
| 1750 | 1532 | ||
| 1751 | if (!rwc->file_nonlinear) | ||
| 1752 | goto done; | ||
| 1753 | |||
| 1754 | if (list_empty(&mapping->i_mmap_nonlinear)) | ||
| 1755 | goto done; | ||
| 1756 | |||
| 1757 | ret = rwc->file_nonlinear(page, mapping, rwc->arg); | ||
| 1758 | done: | 1533 | done: |
| 1759 | i_mmap_unlock_read(mapping); | 1534 | i_mmap_unlock_read(mapping); |
| 1760 | return ret; | 1535 | return ret; |
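With the nonlinear path gone, an rmap walk is configured with just the hooks that remain; there is no .file_nonlinear member left to set. A hedged sketch of a walk that merely counts mappings (count_one() and its counter are illustrative, the lock callback is the one this file already provides, and the usual rule that @page is locked still applies):

static int count_one(struct page *page, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
{
        int *mappings = arg;

        (*mappings)++;
        return SWAP_AGAIN;              /* keep walking all mappings */
}

static int count_mappings(struct page *page)
{
        int mappings = 0;
        struct rmap_walk_control rwc = {
                .rmap_one  = count_one,
                .arg       = &mappings,
                .anon_lock = page_lock_anon_vma_read,
        };

        rmap_walk(page, &rwc);
        return mappings;
}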
diff --git a/mm/shmem.c b/mm/shmem.c index 993e6ba689cc..cf2d0ca010bc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -191,11 +191,6 @@ static const struct inode_operations shmem_dir_inode_operations; | |||
| 191 | static const struct inode_operations shmem_special_inode_operations; | 191 | static const struct inode_operations shmem_special_inode_operations; |
| 192 | static const struct vm_operations_struct shmem_vm_ops; | 192 | static const struct vm_operations_struct shmem_vm_ops; |
| 193 | 193 | ||
| 194 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | ||
| 195 | .ra_pages = 0, /* No readahead */ | ||
| 196 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | ||
| 197 | }; | ||
| 198 | |||
| 199 | static LIST_HEAD(shmem_swaplist); | 194 | static LIST_HEAD(shmem_swaplist); |
| 200 | static DEFINE_MUTEX(shmem_swaplist_mutex); | 195 | static DEFINE_MUTEX(shmem_swaplist_mutex); |
| 201 | 196 | ||
| @@ -765,11 +760,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
| 765 | goto redirty; | 760 | goto redirty; |
| 766 | 761 | ||
| 767 | /* | 762 | /* |
| 768 | * shmem_backing_dev_info's capabilities prevent regular writeback or | 763 | * Our capabilities prevent regular writeback or sync from ever calling |
| 769 | * sync from ever calling shmem_writepage; but a stacking filesystem | 764 | * shmem_writepage; but a stacking filesystem might use ->writepage of |
| 770 | * might use ->writepage of its underlying filesystem, in which case | 765 | * its underlying filesystem, in which case tmpfs should write out to |
| 771 | * tmpfs should write out to swap only in response to memory pressure, | 766 | * swap only in response to memory pressure, and not for the writeback |
| 772 | * and not for the writeback threads or sync. | 767 | * threads or sync. |
| 773 | */ | 768 | */ |
| 774 | if (!wbc->for_reclaim) { | 769 | if (!wbc->for_reclaim) { |
| 775 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ | 770 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ |
| @@ -1131,7 +1126,7 @@ repeat: | |||
| 1131 | * truncated or holepunched since swap was confirmed. | 1126 | * truncated or holepunched since swap was confirmed. |
| 1132 | * shmem_undo_range() will have done some of the | 1127 | * shmem_undo_range() will have done some of the |
| 1133 | * unaccounting, now delete_from_swap_cache() will do | 1128 | * unaccounting, now delete_from_swap_cache() will do |
| 1134 | * the rest (including mem_cgroup_uncharge_swapcache). | 1129 | * the rest. |
| 1135 | * Reset swap.val? No, leave it so "failed" goes back to | 1130 | * Reset swap.val? No, leave it so "failed" goes back to |
| 1136 | * "repeat": reading a hole and writing should succeed. | 1131 | * "repeat": reading a hole and writing should succeed. |
| 1137 | */ | 1132 | */ |
| @@ -1415,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
| 1415 | inode->i_ino = get_next_ino(); | 1410 | inode->i_ino = get_next_ino(); |
| 1416 | inode_init_owner(inode, dir, mode); | 1411 | inode_init_owner(inode, dir, mode); |
| 1417 | inode->i_blocks = 0; | 1412 | inode->i_blocks = 0; |
| 1418 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | ||
| 1419 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 1413 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
| 1420 | inode->i_generation = get_seconds(); | 1414 | inode->i_generation = get_seconds(); |
| 1421 | info = SHMEM_I(inode); | 1415 | info = SHMEM_I(inode); |
| @@ -1461,7 +1455,10 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
| 1461 | 1455 | ||
| 1462 | bool shmem_mapping(struct address_space *mapping) | 1456 | bool shmem_mapping(struct address_space *mapping) |
| 1463 | { | 1457 | { |
| 1464 | return mapping->backing_dev_info == &shmem_backing_dev_info; | 1458 | if (!mapping->host) |
| 1459 | return false; | ||
| 1460 | |||
| 1461 | return mapping->host->i_sb->s_op == &shmem_ops; | ||
| 1465 | } | 1462 | } |
| 1466 | 1463 | ||
| 1467 | #ifdef CONFIG_TMPFS | 1464 | #ifdef CONFIG_TMPFS |
| @@ -2325,8 +2322,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 2325 | 2322 | ||
| 2326 | static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) | 2323 | static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) |
| 2327 | { | 2324 | { |
| 2328 | bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode); | 2325 | bool old_is_dir = d_is_dir(old_dentry); |
| 2329 | bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode); | 2326 | bool new_is_dir = d_is_dir(new_dentry); |
| 2330 | 2327 | ||
| 2331 | if (old_dir != new_dir && old_is_dir != new_is_dir) { | 2328 | if (old_dir != new_dir && old_is_dir != new_is_dir) { |
| 2332 | if (old_is_dir) { | 2329 | if (old_is_dir) { |
| @@ -3201,7 +3198,6 @@ static const struct vm_operations_struct shmem_vm_ops = { | |||
| 3201 | .set_policy = shmem_set_policy, | 3198 | .set_policy = shmem_set_policy, |
| 3202 | .get_policy = shmem_get_policy, | 3199 | .get_policy = shmem_get_policy, |
| 3203 | #endif | 3200 | #endif |
| 3204 | .remap_pages = generic_file_remap_pages, | ||
| 3205 | }; | 3201 | }; |
| 3206 | 3202 | ||
| 3207 | static struct dentry *shmem_mount(struct file_system_type *fs_type, | 3203 | static struct dentry *shmem_mount(struct file_system_type *fs_type, |
| @@ -3226,10 +3222,6 @@ int __init shmem_init(void) | |||
| 3226 | if (shmem_inode_cachep) | 3222 | if (shmem_inode_cachep) |
| 3227 | return 0; | 3223 | return 0; |
| 3228 | 3224 | ||
| 3229 | error = bdi_init(&shmem_backing_dev_info); | ||
| 3230 | if (error) | ||
| 3231 | goto out4; | ||
| 3232 | |||
| 3233 | error = shmem_init_inodecache(); | 3225 | error = shmem_init_inodecache(); |
| 3234 | if (error) | 3226 | if (error) |
| 3235 | goto out3; | 3227 | goto out3; |
| @@ -3253,8 +3245,6 @@ out1: | |||
| 3253 | out2: | 3245 | out2: |
| 3254 | shmem_destroy_inodecache(); | 3246 | shmem_destroy_inodecache(); |
| 3255 | out3: | 3247 | out3: |
| 3256 | bdi_destroy(&shmem_backing_dev_info); | ||
| 3257 | out4: | ||
| 3258 | shm_mnt = ERR_PTR(error); | 3248 | shm_mnt = ERR_PTR(error); |
| 3259 | return error; | 3249 | return error; |
| 3260 | } | 3250 | } |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -2382,7 +2382,7 @@ out: | |||
| 2382 | return nr_freed; | 2382 | return nr_freed; |
| 2383 | } | 2383 | } |
| 2384 | 2384 | ||
| 2385 | int __kmem_cache_shrink(struct kmem_cache *cachep) | 2385 | int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) |
| 2386 | { | 2386 | { |
| 2387 | int ret = 0; | 2387 | int ret = 0; |
| 2388 | int node; | 2388 | int node; |
| @@ -2404,7 +2404,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) | |||
| 2404 | { | 2404 | { |
| 2405 | int i; | 2405 | int i; |
| 2406 | struct kmem_cache_node *n; | 2406 | struct kmem_cache_node *n; |
| 2407 | int rc = __kmem_cache_shrink(cachep); | 2407 | int rc = __kmem_cache_shrink(cachep, false); |
| 2408 | 2408 | ||
| 2409 | if (rc) | 2409 | if (rc) |
| 2410 | return rc; | 2410 | return rc; |
| @@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
| 3708 | int batchcount, int shared, gfp_t gfp) | 3708 | int batchcount, int shared, gfp_t gfp) |
| 3709 | { | 3709 | { |
| 3710 | int ret; | 3710 | int ret; |
| 3711 | struct kmem_cache *c = NULL; | 3711 | struct kmem_cache *c; |
| 3712 | int i = 0; | ||
| 3713 | 3712 | ||
| 3714 | ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); | 3713 | ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); |
| 3715 | 3714 | ||
| @@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
| 3719 | if ((ret < 0) || !is_root_cache(cachep)) | 3718 | if ((ret < 0) || !is_root_cache(cachep)) |
| 3720 | return ret; | 3719 | return ret; |
| 3721 | 3720 | ||
| 3722 | VM_BUG_ON(!mutex_is_locked(&slab_mutex)); | 3721 | lockdep_assert_held(&slab_mutex); |
| 3723 | for_each_memcg_cache_index(i) { | 3722 | for_each_memcg_cache(c, cachep) { |
| 3724 | c = cache_from_memcg_idx(cachep, i); | 3723 | /* return value determined by the root cache only */ |
| 3725 | if (c) | 3724 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); |
| 3726 | /* return value determined by the parent cache only */ | ||
| 3727 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); | ||
| 3728 | } | 3725 | } |
| 3729 | 3726 | ||
| 3730 | return ret; | 3727 | return ret; |
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
| @@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, | |||
| 86 | extern void create_boot_cache(struct kmem_cache *, const char *name, | 86 | extern void create_boot_cache(struct kmem_cache *, const char *name, |
| 87 | size_t size, unsigned long flags); | 87 | size_t size, unsigned long flags); |
| 88 | 88 | ||
| 89 | struct mem_cgroup; | ||
| 90 | |||
| 91 | int slab_unmergeable(struct kmem_cache *s); | 89 | int slab_unmergeable(struct kmem_cache *s); |
| 92 | struct kmem_cache *find_mergeable(size_t size, size_t align, | 90 | struct kmem_cache *find_mergeable(size_t size, size_t align, |
| 93 | unsigned long flags, const char *name, void (*ctor)(void *)); | 91 | unsigned long flags, const char *name, void (*ctor)(void *)); |
| @@ -140,7 +138,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, | |||
| 140 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) | 138 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) |
| 141 | 139 | ||
| 142 | int __kmem_cache_shutdown(struct kmem_cache *); | 140 | int __kmem_cache_shutdown(struct kmem_cache *); |
| 143 | int __kmem_cache_shrink(struct kmem_cache *); | 141 | int __kmem_cache_shrink(struct kmem_cache *, bool); |
| 144 | void slab_kmem_cache_release(struct kmem_cache *); | 142 | void slab_kmem_cache_release(struct kmem_cache *); |
| 145 | 143 | ||
| 146 | struct seq_file; | 144 | struct seq_file; |
| @@ -165,16 +163,27 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
| 165 | size_t count, loff_t *ppos); | 163 | size_t count, loff_t *ppos); |
| 166 | 164 | ||
| 167 | #ifdef CONFIG_MEMCG_KMEM | 165 | #ifdef CONFIG_MEMCG_KMEM |
| 166 | /* | ||
| 167 | * Iterate over all memcg caches of the given root cache. The caller must hold | ||
| 168 | * slab_mutex. | ||
| 169 | */ | ||
| 170 | #define for_each_memcg_cache(iter, root) \ | ||
| 171 | list_for_each_entry(iter, &(root)->memcg_params.list, \ | ||
| 172 | memcg_params.list) | ||
| 173 | |||
| 174 | #define for_each_memcg_cache_safe(iter, tmp, root) \ | ||
| 175 | list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ | ||
| 176 | memcg_params.list) | ||
| 177 | |||
| 168 | static inline bool is_root_cache(struct kmem_cache *s) | 178 | static inline bool is_root_cache(struct kmem_cache *s) |
| 169 | { | 179 | { |
| 170 | return !s->memcg_params || s->memcg_params->is_root_cache; | 180 | return s->memcg_params.is_root_cache; |
| 171 | } | 181 | } |
| 172 | 182 | ||
| 173 | static inline bool slab_equal_or_root(struct kmem_cache *s, | 183 | static inline bool slab_equal_or_root(struct kmem_cache *s, |
| 174 | struct kmem_cache *p) | 184 | struct kmem_cache *p) |
| 175 | { | 185 | { |
| 176 | return (p == s) || | 186 | return p == s || p == s->memcg_params.root_cache; |
| 177 | (s->memcg_params && (p == s->memcg_params->root_cache)); | ||
| 178 | } | 187 | } |
| 179 | 188 | ||
| 180 | /* | 189 | /* |
| @@ -185,37 +194,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s, | |||
| 185 | static inline const char *cache_name(struct kmem_cache *s) | 194 | static inline const char *cache_name(struct kmem_cache *s) |
| 186 | { | 195 | { |
| 187 | if (!is_root_cache(s)) | 196 | if (!is_root_cache(s)) |
| 188 | return s->memcg_params->root_cache->name; | 197 | s = s->memcg_params.root_cache; |
| 189 | return s->name; | 198 | return s->name; |
| 190 | } | 199 | } |
| 191 | 200 | ||
| 192 | /* | 201 | /* |
| 193 | * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. | 202 | * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. |
| 194 | * That said the caller must assure the memcg's cache won't go away. Since once | 203 | * That said the caller must assure the memcg's cache won't go away by either |
| 195 | * created a memcg's cache is destroyed only along with the root cache, it is | 204 | * taking a css reference to the owner cgroup, or holding the slab_mutex. |
| 196 | * true if we are going to allocate from the cache or hold a reference to the | ||
| 197 | * root cache by other means. Otherwise, we should hold either the slab_mutex | ||
| 198 | * or the memcg's slab_caches_mutex while calling this function and accessing | ||
| 199 | * the returned value. | ||
| 200 | */ | 205 | */ |
| 201 | static inline struct kmem_cache * | 206 | static inline struct kmem_cache * |
| 202 | cache_from_memcg_idx(struct kmem_cache *s, int idx) | 207 | cache_from_memcg_idx(struct kmem_cache *s, int idx) |
| 203 | { | 208 | { |
| 204 | struct kmem_cache *cachep; | 209 | struct kmem_cache *cachep; |
| 205 | struct memcg_cache_params *params; | 210 | struct memcg_cache_array *arr; |
| 206 | |||
| 207 | if (!s->memcg_params) | ||
| 208 | return NULL; | ||
| 209 | 211 | ||
| 210 | rcu_read_lock(); | 212 | rcu_read_lock(); |
| 211 | params = rcu_dereference(s->memcg_params); | 213 | arr = rcu_dereference(s->memcg_params.memcg_caches); |
| 212 | 214 | ||
| 213 | /* | 215 | /* |
| 214 | * Make sure we will access the up-to-date value. The code updating | 216 | * Make sure we will access the up-to-date value. The code updating |
| 215 | * memcg_caches issues a write barrier to match this (see | 217 | * memcg_caches issues a write barrier to match this (see |
| 216 | * memcg_register_cache()). | 218 | * memcg_create_kmem_cache()). |
| 217 | */ | 219 | */ |
| 218 | cachep = lockless_dereference(params->memcg_caches[idx]); | 220 | cachep = lockless_dereference(arr->entries[idx]); |
| 219 | rcu_read_unlock(); | 221 | rcu_read_unlock(); |
| 220 | 222 | ||
| 221 | return cachep; | 223 | return cachep; |
| @@ -225,7 +227,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | |||
| 225 | { | 227 | { |
| 226 | if (is_root_cache(s)) | 228 | if (is_root_cache(s)) |
| 227 | return s; | 229 | return s; |
| 228 | return s->memcg_params->root_cache; | 230 | return s->memcg_params.root_cache; |
| 229 | } | 231 | } |
| 230 | 232 | ||
| 231 | static __always_inline int memcg_charge_slab(struct kmem_cache *s, | 233 | static __always_inline int memcg_charge_slab(struct kmem_cache *s, |
| @@ -235,7 +237,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, | |||
| 235 | return 0; | 237 | return 0; |
| 236 | if (is_root_cache(s)) | 238 | if (is_root_cache(s)) |
| 237 | return 0; | 239 | return 0; |
| 238 | return __memcg_charge_slab(s, gfp, order); | 240 | return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); |
| 239 | } | 241 | } |
| 240 | 242 | ||
| 241 | static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | 243 | static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) |
| @@ -244,9 +246,18 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | |||
| 244 | return; | 246 | return; |
| 245 | if (is_root_cache(s)) | 247 | if (is_root_cache(s)) |
| 246 | return; | 248 | return; |
| 247 | __memcg_uncharge_slab(s, order); | 249 | memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); |
| 248 | } | 250 | } |
| 249 | #else | 251 | |
| 252 | extern void slab_init_memcg_params(struct kmem_cache *); | ||
| 253 | |||
| 254 | #else /* !CONFIG_MEMCG_KMEM */ | ||
| 255 | |||
| 256 | #define for_each_memcg_cache(iter, root) \ | ||
| 257 | for ((void)(iter), (void)(root); 0; ) | ||
| 258 | #define for_each_memcg_cache_safe(iter, tmp, root) \ | ||
| 259 | for ((void)(iter), (void)(tmp), (void)(root); 0; ) | ||
| 260 | |||
| 250 | static inline bool is_root_cache(struct kmem_cache *s) | 261 | static inline bool is_root_cache(struct kmem_cache *s) |
| 251 | { | 262 | { |
| 252 | return true; | 263 | return true; |
| @@ -282,7 +293,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) | |||
| 282 | static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | 293 | static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) |
| 283 | { | 294 | { |
| 284 | } | 295 | } |
| 285 | #endif | 296 | |
| 297 | static inline void slab_init_memcg_params(struct kmem_cache *s) | ||
| 298 | { | ||
| 299 | } | ||
| 300 | #endif /* CONFIG_MEMCG_KMEM */ | ||
| 286 | 301 | ||
| 287 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | 302 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) |
| 288 | { | 303 | { |
diff --git a/mm/slab_common.c b/mm/slab_common.c index e03dd6f2a272..999bb3424d44 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
| @@ -106,62 +106,67 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) | |||
| 106 | #endif | 106 | #endif |
| 107 | 107 | ||
| 108 | #ifdef CONFIG_MEMCG_KMEM | 108 | #ifdef CONFIG_MEMCG_KMEM |
| 109 | static int memcg_alloc_cache_params(struct mem_cgroup *memcg, | 109 | void slab_init_memcg_params(struct kmem_cache *s) |
| 110 | struct kmem_cache *s, struct kmem_cache *root_cache) | ||
| 111 | { | 110 | { |
| 112 | size_t size; | 111 | s->memcg_params.is_root_cache = true; |
| 112 | INIT_LIST_HEAD(&s->memcg_params.list); | ||
| 113 | RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); | ||
| 114 | } | ||
| 115 | |||
| 116 | static int init_memcg_params(struct kmem_cache *s, | ||
| 117 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) | ||
| 118 | { | ||
| 119 | struct memcg_cache_array *arr; | ||
| 113 | 120 | ||
| 114 | if (!memcg_kmem_enabled()) | 121 | if (memcg) { |
| 122 | s->memcg_params.is_root_cache = false; | ||
| 123 | s->memcg_params.memcg = memcg; | ||
| 124 | s->memcg_params.root_cache = root_cache; | ||
| 115 | return 0; | 125 | return 0; |
| 126 | } | ||
| 116 | 127 | ||
| 117 | if (!memcg) { | 128 | slab_init_memcg_params(s); |
| 118 | size = offsetof(struct memcg_cache_params, memcg_caches); | ||
| 119 | size += memcg_limited_groups_array_size * sizeof(void *); | ||
| 120 | } else | ||
| 121 | size = sizeof(struct memcg_cache_params); | ||
| 122 | 129 | ||
| 123 | s->memcg_params = kzalloc(size, GFP_KERNEL); | 130 | if (!memcg_nr_cache_ids) |
| 124 | if (!s->memcg_params) | 131 | return 0; |
| 125 | return -ENOMEM; | ||
| 126 | 132 | ||
| 127 | if (memcg) { | 133 | arr = kzalloc(sizeof(struct memcg_cache_array) + |
| 128 | s->memcg_params->memcg = memcg; | 134 | memcg_nr_cache_ids * sizeof(void *), |
| 129 | s->memcg_params->root_cache = root_cache; | 135 | GFP_KERNEL); |
| 130 | } else | 136 | if (!arr) |
| 131 | s->memcg_params->is_root_cache = true; | 137 | return -ENOMEM; |
| 132 | 138 | ||
| 139 | RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr); | ||
| 133 | return 0; | 140 | return 0; |
| 134 | } | 141 | } |
| 135 | 142 | ||
| 136 | static void memcg_free_cache_params(struct kmem_cache *s) | 143 | static void destroy_memcg_params(struct kmem_cache *s) |
| 137 | { | 144 | { |
| 138 | kfree(s->memcg_params); | 145 | if (is_root_cache(s)) |
| 146 | kfree(rcu_access_pointer(s->memcg_params.memcg_caches)); | ||
| 139 | } | 147 | } |
| 140 | 148 | ||
| 141 | static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) | 149 | static int update_memcg_params(struct kmem_cache *s, int new_array_size) |
| 142 | { | 150 | { |
| 143 | int size; | 151 | struct memcg_cache_array *old, *new; |
| 144 | struct memcg_cache_params *new_params, *cur_params; | ||
| 145 | |||
| 146 | BUG_ON(!is_root_cache(s)); | ||
| 147 | 152 | ||
| 148 | size = offsetof(struct memcg_cache_params, memcg_caches); | 153 | if (!is_root_cache(s)) |
| 149 | size += num_memcgs * sizeof(void *); | 154 | return 0; |
| 150 | 155 | ||
| 151 | new_params = kzalloc(size, GFP_KERNEL); | 156 | new = kzalloc(sizeof(struct memcg_cache_array) + |
| 152 | if (!new_params) | 157 | new_array_size * sizeof(void *), GFP_KERNEL); |
| 158 | if (!new) | ||
| 153 | return -ENOMEM; | 159 | return -ENOMEM; |
| 154 | 160 | ||
| 155 | cur_params = s->memcg_params; | 161 | old = rcu_dereference_protected(s->memcg_params.memcg_caches, |
| 156 | memcpy(new_params->memcg_caches, cur_params->memcg_caches, | 162 | lockdep_is_held(&slab_mutex)); |
| 157 | memcg_limited_groups_array_size * sizeof(void *)); | 163 | if (old) |
| 158 | 164 | memcpy(new->entries, old->entries, | |
| 159 | new_params->is_root_cache = true; | 165 | memcg_nr_cache_ids * sizeof(void *)); |
| 160 | |||
| 161 | rcu_assign_pointer(s->memcg_params, new_params); | ||
| 162 | if (cur_params) | ||
| 163 | kfree_rcu(cur_params, rcu_head); | ||
| 164 | 166 | ||
| 167 | rcu_assign_pointer(s->memcg_params.memcg_caches, new); | ||
| 168 | if (old) | ||
| 169 | kfree_rcu(old, rcu); | ||
| 165 | return 0; | 170 | return 0; |
| 166 | } | 171 | } |
| 167 | 172 | ||
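For reference, the data layout this hunk converts to looks roughly like the sketch below: memcg_params becomes a structure embedded in struct kmem_cache, and a root cache's per-memcg pointers live in a separately allocated, RCU-replaceable array. Field names are taken from the hunk; the exact definitions live in include/linux/slab.h and are not part of this diff, so treat the union layout as an approximation.

    /* Array that update_memcg_params() swaps in under slab_mutex. */
    struct memcg_cache_array {
            struct rcu_head rcu;
            struct kmem_cache *entries[0];          /* indexed by memcg_cache_id() */
    };

    /* Embedded in struct kmem_cache instead of being kmalloc'ed on its own. */
    struct memcg_cache_params {
            bool is_root_cache;
            union {
                    struct memcg_cache_array __rcu *memcg_caches;   /* root cache */
                    struct {                                        /* per-memcg cache */
                            struct mem_cgroup *memcg;
                            struct kmem_cache *root_cache;
                    };
            };
            struct list_head list;
    };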
| @@ -169,34 +174,28 @@ int memcg_update_all_caches(int num_memcgs) | |||
| 169 | { | 174 | { |
| 170 | struct kmem_cache *s; | 175 | struct kmem_cache *s; |
| 171 | int ret = 0; | 176 | int ret = 0; |
| 172 | mutex_lock(&slab_mutex); | ||
| 173 | 177 | ||
| 178 | mutex_lock(&slab_mutex); | ||
| 174 | list_for_each_entry(s, &slab_caches, list) { | 179 | list_for_each_entry(s, &slab_caches, list) { |
| 175 | if (!is_root_cache(s)) | 180 | ret = update_memcg_params(s, num_memcgs); |
| 176 | continue; | ||
| 177 | |||
| 178 | ret = memcg_update_cache_params(s, num_memcgs); | ||
| 179 | /* | 181 | /* |
| 180 | * Instead of freeing the memory, we'll just leave the caches | 182 | * Instead of freeing the memory, we'll just leave the caches |
| 181 | * up to this point in an updated state. | 183 | * up to this point in an updated state. |
| 182 | */ | 184 | */ |
| 183 | if (ret) | 185 | if (ret) |
| 184 | goto out; | 186 | break; |
| 185 | } | 187 | } |
| 186 | |||
| 187 | memcg_update_array_size(num_memcgs); | ||
| 188 | out: | ||
| 189 | mutex_unlock(&slab_mutex); | 188 | mutex_unlock(&slab_mutex); |
| 190 | return ret; | 189 | return ret; |
| 191 | } | 190 | } |
| 192 | #else | 191 | #else |
| 193 | static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, | 192 | static inline int init_memcg_params(struct kmem_cache *s, |
| 194 | struct kmem_cache *s, struct kmem_cache *root_cache) | 193 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) |
| 195 | { | 194 | { |
| 196 | return 0; | 195 | return 0; |
| 197 | } | 196 | } |
| 198 | 197 | ||
| 199 | static inline void memcg_free_cache_params(struct kmem_cache *s) | 198 | static inline void destroy_memcg_params(struct kmem_cache *s) |
| 200 | { | 199 | { |
| 201 | } | 200 | } |
| 202 | #endif /* CONFIG_MEMCG_KMEM */ | 201 | #endif /* CONFIG_MEMCG_KMEM */ |
| @@ -296,8 +295,8 @@ unsigned long calculate_alignment(unsigned long flags, | |||
| 296 | } | 295 | } |
| 297 | 296 | ||
| 298 | static struct kmem_cache * | 297 | static struct kmem_cache * |
| 299 | do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, | 298 | do_kmem_cache_create(const char *name, size_t object_size, size_t size, |
| 300 | unsigned long flags, void (*ctor)(void *), | 299 | size_t align, unsigned long flags, void (*ctor)(void *), |
| 301 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) | 300 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) |
| 302 | { | 301 | { |
| 303 | struct kmem_cache *s; | 302 | struct kmem_cache *s; |
| @@ -314,7 +313,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, | |||
| 314 | s->align = align; | 313 | s->align = align; |
| 315 | s->ctor = ctor; | 314 | s->ctor = ctor; |
| 316 | 315 | ||
| 317 | err = memcg_alloc_cache_params(memcg, s, root_cache); | 316 | err = init_memcg_params(s, memcg, root_cache); |
| 318 | if (err) | 317 | if (err) |
| 319 | goto out_free_cache; | 318 | goto out_free_cache; |
| 320 | 319 | ||
| @@ -330,8 +329,8 @@ out: | |||
| 330 | return s; | 329 | return s; |
| 331 | 330 | ||
| 332 | out_free_cache: | 331 | out_free_cache: |
| 333 | memcg_free_cache_params(s); | 332 | destroy_memcg_params(s); |
| 334 | kfree(s); | 333 | kmem_cache_free(kmem_cache, s); |
| 335 | goto out; | 334 | goto out; |
| 336 | } | 335 | } |
| 337 | 336 | ||
| @@ -364,11 +363,12 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
| 364 | unsigned long flags, void (*ctor)(void *)) | 363 | unsigned long flags, void (*ctor)(void *)) |
| 365 | { | 364 | { |
| 366 | struct kmem_cache *s; | 365 | struct kmem_cache *s; |
| 367 | char *cache_name; | 366 | const char *cache_name; |
| 368 | int err; | 367 | int err; |
| 369 | 368 | ||
| 370 | get_online_cpus(); | 369 | get_online_cpus(); |
| 371 | get_online_mems(); | 370 | get_online_mems(); |
| 371 | memcg_get_cache_ids(); | ||
| 372 | 372 | ||
| 373 | mutex_lock(&slab_mutex); | 373 | mutex_lock(&slab_mutex); |
| 374 | 374 | ||
| @@ -390,7 +390,7 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
| 390 | if (s) | 390 | if (s) |
| 391 | goto out_unlock; | 391 | goto out_unlock; |
| 392 | 392 | ||
| 393 | cache_name = kstrdup(name, GFP_KERNEL); | 393 | cache_name = kstrdup_const(name, GFP_KERNEL); |
| 394 | if (!cache_name) { | 394 | if (!cache_name) { |
| 395 | err = -ENOMEM; | 395 | err = -ENOMEM; |
| 396 | goto out_unlock; | 396 | goto out_unlock; |
| @@ -401,12 +401,13 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
| 401 | flags, ctor, NULL, NULL); | 401 | flags, ctor, NULL, NULL); |
| 402 | if (IS_ERR(s)) { | 402 | if (IS_ERR(s)) { |
| 403 | err = PTR_ERR(s); | 403 | err = PTR_ERR(s); |
| 404 | kfree(cache_name); | 404 | kfree_const(cache_name); |
| 405 | } | 405 | } |
| 406 | 406 | ||
| 407 | out_unlock: | 407 | out_unlock: |
| 408 | mutex_unlock(&slab_mutex); | 408 | mutex_unlock(&slab_mutex); |
| 409 | 409 | ||
| 410 | memcg_put_cache_ids(); | ||
| 410 | put_online_mems(); | 411 | put_online_mems(); |
| 411 | put_online_cpus(); | 412 | put_online_cpus(); |
| 412 | 413 | ||
| @@ -425,31 +426,91 @@ out_unlock: | |||
| 425 | } | 426 | } |
| 426 | EXPORT_SYMBOL(kmem_cache_create); | 427 | EXPORT_SYMBOL(kmem_cache_create); |
| 427 | 428 | ||
| 429 | static int do_kmem_cache_shutdown(struct kmem_cache *s, | ||
| 430 | struct list_head *release, bool *need_rcu_barrier) | ||
| 431 | { | ||
| 432 | if (__kmem_cache_shutdown(s) != 0) { | ||
| 433 | printk(KERN_ERR "kmem_cache_destroy %s: " | ||
| 434 | "Slab cache still has objects\n", s->name); | ||
| 435 | dump_stack(); | ||
| 436 | return -EBUSY; | ||
| 437 | } | ||
| 438 | |||
| 439 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
| 440 | *need_rcu_barrier = true; | ||
| 441 | |||
| 442 | #ifdef CONFIG_MEMCG_KMEM | ||
| 443 | if (!is_root_cache(s)) | ||
| 444 | list_del(&s->memcg_params.list); | ||
| 445 | #endif | ||
| 446 | list_move(&s->list, release); | ||
| 447 | return 0; | ||
| 448 | } | ||
| 449 | |||
| 450 | static void do_kmem_cache_release(struct list_head *release, | ||
| 451 | bool need_rcu_barrier) | ||
| 452 | { | ||
| 453 | struct kmem_cache *s, *s2; | ||
| 454 | |||
| 455 | if (need_rcu_barrier) | ||
| 456 | rcu_barrier(); | ||
| 457 | |||
| 458 | list_for_each_entry_safe(s, s2, release, list) { | ||
| 459 | #ifdef SLAB_SUPPORTS_SYSFS | ||
| 460 | sysfs_slab_remove(s); | ||
| 461 | #else | ||
| 462 | slab_kmem_cache_release(s); | ||
| 463 | #endif | ||
| 464 | } | ||
| 465 | } | ||
| 466 | |||
| 428 | #ifdef CONFIG_MEMCG_KMEM | 467 | #ifdef CONFIG_MEMCG_KMEM |
| 429 | /* | 468 | /* |
| 430 | * memcg_create_kmem_cache - Create a cache for a memory cgroup. | 469 | * memcg_create_kmem_cache - Create a cache for a memory cgroup. |
| 431 | * @memcg: The memory cgroup the new cache is for. | 470 | * @memcg: The memory cgroup the new cache is for. |
| 432 | * @root_cache: The parent of the new cache. | 471 | * @root_cache: The parent of the new cache. |
| 433 | * @memcg_name: The name of the memory cgroup (used for naming the new cache). | ||
| 434 | * | 472 | * |
| 435 | * This function attempts to create a kmem cache that will serve allocation | 473 | * This function attempts to create a kmem cache that will serve allocation |
| 436 | * requests going from @memcg to @root_cache. The new cache inherits properties | 474 | * requests going from @memcg to @root_cache. The new cache inherits properties |
| 437 | * from its parent. | 475 | * from its parent. |
| 438 | */ | 476 | */ |
| 439 | struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | 477 | void memcg_create_kmem_cache(struct mem_cgroup *memcg, |
| 440 | struct kmem_cache *root_cache, | 478 | struct kmem_cache *root_cache) |
| 441 | const char *memcg_name) | ||
| 442 | { | 479 | { |
| 480 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ | ||
| 481 | struct cgroup_subsys_state *css = mem_cgroup_css(memcg); | ||
| 482 | struct memcg_cache_array *arr; | ||
| 443 | struct kmem_cache *s = NULL; | 483 | struct kmem_cache *s = NULL; |
| 444 | char *cache_name; | 484 | char *cache_name; |
| 485 | int idx; | ||
| 445 | 486 | ||
| 446 | get_online_cpus(); | 487 | get_online_cpus(); |
| 447 | get_online_mems(); | 488 | get_online_mems(); |
| 448 | 489 | ||
| 449 | mutex_lock(&slab_mutex); | 490 | mutex_lock(&slab_mutex); |
| 450 | 491 | ||
| 492 | /* | ||
| 493 | * The memory cgroup could have been deactivated while the cache | ||
| 494 | * creation work was pending. | ||
| 495 | */ | ||
| 496 | if (!memcg_kmem_is_active(memcg)) | ||
| 497 | goto out_unlock; | ||
| 498 | |||
| 499 | idx = memcg_cache_id(memcg); | ||
| 500 | arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches, | ||
| 501 | lockdep_is_held(&slab_mutex)); | ||
| 502 | |||
| 503 | /* | ||
| 504 | * Since per-memcg caches are created asynchronously on first | ||
| 505 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
| 506 | * create the same cache, but only one of them may succeed. | ||
| 507 | */ | ||
| 508 | if (arr->entries[idx]) | ||
| 509 | goto out_unlock; | ||
| 510 | |||
| 511 | cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); | ||
| 451 | cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, | 512 | cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, |
| 452 | memcg_cache_id(memcg), memcg_name); | 513 | css->id, memcg_name_buf); |
| 453 | if (!cache_name) | 514 | if (!cache_name) |
| 454 | goto out_unlock; | 515 | goto out_unlock; |
| 455 | 516 | ||
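The two helpers added above split cache teardown into a phase done under slab_mutex (unlink the cache onto a private list) and a phase done after the lock is dropped (rcu_barrier() plus sysfs or final release). A minimal caller sketch of that pattern; kmem_cache_destroy() further down is the real user, and should_destroy() here is a hypothetical predicate:

    static void destroy_selected_caches(void)
    {
            LIST_HEAD(release);
            bool need_rcu_barrier = false;
            struct kmem_cache *s, *s2;

            mutex_lock(&slab_mutex);
            list_for_each_entry_safe(s, s2, &slab_caches, list) {
                    if (!should_destroy(s))         /* hypothetical filter */
                            continue;
                    do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);
            }
            mutex_unlock(&slab_mutex);

            /* may sleep in rcu_barrier() and sysfs teardown, hence unlocked */
            do_kmem_cache_release(&release, need_rcu_barrier);
    }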
| @@ -457,49 +518,108 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | |||
| 457 | root_cache->size, root_cache->align, | 518 | root_cache->size, root_cache->align, |
| 458 | root_cache->flags, root_cache->ctor, | 519 | root_cache->flags, root_cache->ctor, |
| 459 | memcg, root_cache); | 520 | memcg, root_cache); |
| 521 | /* | ||
| 522 | * If we could not create a memcg cache, do not complain, because | ||
| 523 | * that's not critical at all as we can always proceed with the root | ||
| 524 | * cache. | ||
| 525 | */ | ||
| 460 | if (IS_ERR(s)) { | 526 | if (IS_ERR(s)) { |
| 461 | kfree(cache_name); | 527 | kfree(cache_name); |
| 462 | s = NULL; | 528 | goto out_unlock; |
| 463 | } | 529 | } |
| 464 | 530 | ||
| 531 | list_add(&s->memcg_params.list, &root_cache->memcg_params.list); | ||
| 532 | |||
| 533 | /* | ||
| 534 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | ||
| 535 | * barrier here to ensure nobody will see the kmem_cache partially | ||
| 536 | * initialized. | ||
| 537 | */ | ||
| 538 | smp_wmb(); | ||
| 539 | arr->entries[idx] = s; | ||
| 540 | |||
| 465 | out_unlock: | 541 | out_unlock: |
| 466 | mutex_unlock(&slab_mutex); | 542 | mutex_unlock(&slab_mutex); |
| 467 | 543 | ||
| 468 | put_online_mems(); | 544 | put_online_mems(); |
| 469 | put_online_cpus(); | 545 | put_online_cpus(); |
| 470 | |||
| 471 | return s; | ||
| 472 | } | 546 | } |
| 473 | 547 | ||
| 474 | static int memcg_cleanup_cache_params(struct kmem_cache *s) | 548 | void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) |
| 475 | { | 549 | { |
| 476 | int rc; | 550 | int idx; |
| 551 | struct memcg_cache_array *arr; | ||
| 552 | struct kmem_cache *s, *c; | ||
| 477 | 553 | ||
| 478 | if (!s->memcg_params || | 554 | idx = memcg_cache_id(memcg); |
| 479 | !s->memcg_params->is_root_cache) | 555 | |
| 480 | return 0; | 556 | get_online_cpus(); |
| 557 | get_online_mems(); | ||
| 481 | 558 | ||
| 482 | mutex_unlock(&slab_mutex); | ||
| 483 | rc = __memcg_cleanup_cache_params(s); | ||
| 484 | mutex_lock(&slab_mutex); | 559 | mutex_lock(&slab_mutex); |
| 560 | list_for_each_entry(s, &slab_caches, list) { | ||
| 561 | if (!is_root_cache(s)) | ||
| 562 | continue; | ||
| 563 | |||
| 564 | arr = rcu_dereference_protected(s->memcg_params.memcg_caches, | ||
| 565 | lockdep_is_held(&slab_mutex)); | ||
| 566 | c = arr->entries[idx]; | ||
| 567 | if (!c) | ||
| 568 | continue; | ||
| 569 | |||
| 570 | __kmem_cache_shrink(c, true); | ||
| 571 | arr->entries[idx] = NULL; | ||
| 572 | } | ||
| 573 | mutex_unlock(&slab_mutex); | ||
| 485 | 574 | ||
| 486 | return rc; | 575 | put_online_mems(); |
| 576 | put_online_cpus(); | ||
| 487 | } | 577 | } |
| 488 | #else | 578 | |
| 489 | static int memcg_cleanup_cache_params(struct kmem_cache *s) | 579 | void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) |
| 490 | { | 580 | { |
| 491 | return 0; | 581 | LIST_HEAD(release); |
| 582 | bool need_rcu_barrier = false; | ||
| 583 | struct kmem_cache *s, *s2; | ||
| 584 | |||
| 585 | get_online_cpus(); | ||
| 586 | get_online_mems(); | ||
| 587 | |||
| 588 | mutex_lock(&slab_mutex); | ||
| 589 | list_for_each_entry_safe(s, s2, &slab_caches, list) { | ||
| 590 | if (is_root_cache(s) || s->memcg_params.memcg != memcg) | ||
| 591 | continue; | ||
| 592 | /* | ||
| 593 | * The cgroup is about to be freed and therefore has no charges | ||
| 594 | * left. Hence, all its caches must be empty by now. | ||
| 595 | */ | ||
| 596 | BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); | ||
| 597 | } | ||
| 598 | mutex_unlock(&slab_mutex); | ||
| 599 | |||
| 600 | put_online_mems(); | ||
| 601 | put_online_cpus(); | ||
| 602 | |||
| 603 | do_kmem_cache_release(&release, need_rcu_barrier); | ||
| 492 | } | 604 | } |
| 493 | #endif /* CONFIG_MEMCG_KMEM */ | 605 | #endif /* CONFIG_MEMCG_KMEM */ |
| 494 | 606 | ||
| 495 | void slab_kmem_cache_release(struct kmem_cache *s) | 607 | void slab_kmem_cache_release(struct kmem_cache *s) |
| 496 | { | 608 | { |
| 497 | kfree(s->name); | 609 | destroy_memcg_params(s); |
| 610 | kfree_const(s->name); | ||
| 498 | kmem_cache_free(kmem_cache, s); | 611 | kmem_cache_free(kmem_cache, s); |
| 499 | } | 612 | } |
| 500 | 613 | ||
| 501 | void kmem_cache_destroy(struct kmem_cache *s) | 614 | void kmem_cache_destroy(struct kmem_cache *s) |
| 502 | { | 615 | { |
| 616 | struct kmem_cache *c, *c2; | ||
| 617 | LIST_HEAD(release); | ||
| 618 | bool need_rcu_barrier = false; | ||
| 619 | bool busy = false; | ||
| 620 | |||
| 621 | BUG_ON(!is_root_cache(s)); | ||
| 622 | |||
| 503 | get_online_cpus(); | 623 | get_online_cpus(); |
| 504 | get_online_mems(); | 624 | get_online_mems(); |
| 505 | 625 | ||
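The smp_wmb() added to memcg_create_kmem_cache() above pairs with a lockless lookup on the allocation path. Roughly, the reader dereferences the RCU-protected array and then relies on dependency ordering when it loads the published entry; the real helper is cache_from_memcg_idx() in mm/slab.h, and the body below is only a sketch of it:

    static struct kmem_cache *lookup_memcg_cache(struct kmem_cache *root, int idx)
    {
            struct memcg_cache_array *arr;
            struct kmem_cache *cachep;

            rcu_read_lock();
            arr = rcu_dereference(root->memcg_params.memcg_caches);
            /*
             * Pairs with the smp_wmb() before "arr->entries[idx] = s":
             * seeing the pointer implies seeing the fully set up cache.
             */
            cachep = READ_ONCE(arr->entries[idx]);
            smp_read_barrier_depends();
            rcu_read_unlock();

            return cachep;
    }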
| @@ -509,35 +629,21 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
| 509 | if (s->refcount) | 629 | if (s->refcount) |
| 510 | goto out_unlock; | 630 | goto out_unlock; |
| 511 | 631 | ||
| 512 | if (memcg_cleanup_cache_params(s) != 0) | 632 | for_each_memcg_cache_safe(c, c2, s) { |
| 513 | goto out_unlock; | 633 | if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) |
| 514 | 634 | busy = true; | |
| 515 | if (__kmem_cache_shutdown(s) != 0) { | ||
| 516 | printk(KERN_ERR "kmem_cache_destroy %s: " | ||
| 517 | "Slab cache still has objects\n", s->name); | ||
| 518 | dump_stack(); | ||
| 519 | goto out_unlock; | ||
| 520 | } | 635 | } |
| 521 | 636 | ||
| 522 | list_del(&s->list); | 637 | if (!busy) |
| 523 | 638 | do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); | |
| 524 | mutex_unlock(&slab_mutex); | ||
| 525 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
| 526 | rcu_barrier(); | ||
| 527 | |||
| 528 | memcg_free_cache_params(s); | ||
| 529 | #ifdef SLAB_SUPPORTS_SYSFS | ||
| 530 | sysfs_slab_remove(s); | ||
| 531 | #else | ||
| 532 | slab_kmem_cache_release(s); | ||
| 533 | #endif | ||
| 534 | goto out; | ||
| 535 | 639 | ||
| 536 | out_unlock: | 640 | out_unlock: |
| 537 | mutex_unlock(&slab_mutex); | 641 | mutex_unlock(&slab_mutex); |
| 538 | out: | 642 | |
| 539 | put_online_mems(); | 643 | put_online_mems(); |
| 540 | put_online_cpus(); | 644 | put_online_cpus(); |
| 645 | |||
| 646 | do_kmem_cache_release(&release, need_rcu_barrier); | ||
| 541 | } | 647 | } |
| 542 | EXPORT_SYMBOL(kmem_cache_destroy); | 648 | EXPORT_SYMBOL(kmem_cache_destroy); |
| 543 | 649 | ||
| @@ -554,7 +660,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) | |||
| 554 | 660 | ||
| 555 | get_online_cpus(); | 661 | get_online_cpus(); |
| 556 | get_online_mems(); | 662 | get_online_mems(); |
| 557 | ret = __kmem_cache_shrink(cachep); | 663 | ret = __kmem_cache_shrink(cachep, false); |
| 558 | put_online_mems(); | 664 | put_online_mems(); |
| 559 | put_online_cpus(); | 665 | put_online_cpus(); |
| 560 | return ret; | 666 | return ret; |
| @@ -576,6 +682,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz | |||
| 576 | s->name = name; | 682 | s->name = name; |
| 577 | s->size = s->object_size = size; | 683 | s->size = s->object_size = size; |
| 578 | s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); | 684 | s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); |
| 685 | |||
| 686 | slab_init_memcg_params(s); | ||
| 687 | |||
| 579 | err = __kmem_cache_create(s, flags); | 688 | err = __kmem_cache_create(s, flags); |
| 580 | 689 | ||
| 581 | if (err) | 690 | if (err) |
| @@ -789,6 +898,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) | |||
| 789 | page = alloc_kmem_pages(flags, order); | 898 | page = alloc_kmem_pages(flags, order); |
| 790 | ret = page ? page_address(page) : NULL; | 899 | ret = page ? page_address(page) : NULL; |
| 791 | kmemleak_alloc(ret, size, 1, flags); | 900 | kmemleak_alloc(ret, size, 1, flags); |
| 901 | kasan_kmalloc_large(ret, size); | ||
| 792 | return ret; | 902 | return ret; |
| 793 | } | 903 | } |
| 794 | EXPORT_SYMBOL(kmalloc_order); | 904 | EXPORT_SYMBOL(kmalloc_order); |
| @@ -855,16 +965,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) | |||
| 855 | { | 965 | { |
| 856 | struct kmem_cache *c; | 966 | struct kmem_cache *c; |
| 857 | struct slabinfo sinfo; | 967 | struct slabinfo sinfo; |
| 858 | int i; | ||
| 859 | 968 | ||
| 860 | if (!is_root_cache(s)) | 969 | if (!is_root_cache(s)) |
| 861 | return; | 970 | return; |
| 862 | 971 | ||
| 863 | for_each_memcg_cache_index(i) { | 972 | for_each_memcg_cache(c, s) { |
| 864 | c = cache_from_memcg_idx(s, i); | ||
| 865 | if (!c) | ||
| 866 | continue; | ||
| 867 | |||
| 868 | memset(&sinfo, 0, sizeof(sinfo)); | 973 | memset(&sinfo, 0, sizeof(sinfo)); |
| 869 | get_slabinfo(c, &sinfo); | 974 | get_slabinfo(c, &sinfo); |
| 870 | 975 | ||
| @@ -916,7 +1021,7 @@ int memcg_slab_show(struct seq_file *m, void *p) | |||
| 916 | 1021 | ||
| 917 | if (p == slab_caches.next) | 1022 | if (p == slab_caches.next) |
| 918 | print_slabinfo_header(m); | 1023 | print_slabinfo_header(m); |
| 919 | if (!is_root_cache(s) && s->memcg_params->memcg == memcg) | 1024 | if (!is_root_cache(s) && s->memcg_params.memcg == memcg) |
| 920 | cache_show(s, m); | 1025 | cache_show(s, m); |
| 921 | return 0; | 1026 | return 0; |
| 922 | } | 1027 | } |
| @@ -973,8 +1078,10 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, | |||
| 973 | if (p) | 1078 | if (p) |
| 974 | ks = ksize(p); | 1079 | ks = ksize(p); |
| 975 | 1080 | ||
| 976 | if (ks >= new_size) | 1081 | if (ks >= new_size) { |
| 1082 | kasan_krealloc((void *)p, new_size); | ||
| 977 | return (void *)p; | 1083 | return (void *)p; |
| 1084 | } | ||
| 978 | 1085 | ||
| 979 | ret = kmalloc_track_caller(new_size, flags); | 1086 | ret = kmalloc_track_caller(new_size, flags); |
| 980 | if (ret && p) | 1087 | if (ret && p) |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
| @@ -618,7 +618,7 @@ int __kmem_cache_shutdown(struct kmem_cache *c) | |||
| 618 | return 0; | 618 | return 0; |
| 619 | } | 619 | } |
| 620 | 620 | ||
| 621 | int __kmem_cache_shrink(struct kmem_cache *d) | 621 | int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) |
| 622 | { | 622 | { |
| 623 | return 0; | 623 | return 0; |
| 624 | } | 624 | } |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
| 21 | #include <linux/notifier.h> | 21 | #include <linux/notifier.h> |
| 22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
| 23 | #include <linux/kasan.h> | ||
| 23 | #include <linux/kmemcheck.h> | 24 | #include <linux/kmemcheck.h> |
| 24 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
| 25 | #include <linux/cpuset.h> | 26 | #include <linux/cpuset.h> |
| @@ -468,12 +469,30 @@ static char *slub_debug_slabs; | |||
| 468 | static int disable_higher_order_debug; | 469 | static int disable_higher_order_debug; |
| 469 | 470 | ||
| 470 | /* | 471 | /* |
| 472 | * slub is about to manipulate internal object metadata. This memory lies | ||
| 473 | * outside the range of the allocated object, so accessing it would normally | ||
| 474 | * be reported by kasan as a bounds error. metadata_access_enable() is used | ||
| 475 | * to tell kasan that these accesses are OK. | ||
| 476 | */ | ||
| 477 | static inline void metadata_access_enable(void) | ||
| 478 | { | ||
| 479 | kasan_disable_current(); | ||
| 480 | } | ||
| 481 | |||
| 482 | static inline void metadata_access_disable(void) | ||
| 483 | { | ||
| 484 | kasan_enable_current(); | ||
| 485 | } | ||
| 486 | |||
| 487 | /* | ||
| 471 | * Object debugging | 488 | * Object debugging |
| 472 | */ | 489 | */ |
| 473 | static void print_section(char *text, u8 *addr, unsigned int length) | 490 | static void print_section(char *text, u8 *addr, unsigned int length) |
| 474 | { | 491 | { |
| 492 | metadata_access_enable(); | ||
| 475 | print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, | 493 | print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, |
| 476 | length, 1); | 494 | length, 1); |
| 495 | metadata_access_disable(); | ||
| 477 | } | 496 | } |
| 478 | 497 | ||
| 479 | static struct track *get_track(struct kmem_cache *s, void *object, | 498 | static struct track *get_track(struct kmem_cache *s, void *object, |
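Every debug-side peek at redzones or poison bytes now has to be bracketed by the pair above, otherwise KASAN would flag the intentional out-of-bounds access. A hypothetical helper showing the pattern (memchr_inv() and SLUB_RED_ACTIVE are existing kernel symbols; the helper itself is not in the tree):

    static bool demo_redzone_clean(struct kmem_cache *s, u8 *object, size_t len)
    {
            u8 *fault;

            metadata_access_enable();       /* tell KASAN the OOB read is deliberate */
            fault = memchr_inv(object + s->object_size, SLUB_RED_ACTIVE, len);
            metadata_access_disable();

            return fault == NULL;
    }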
| @@ -503,7 +522,9 @@ static void set_track(struct kmem_cache *s, void *object, | |||
| 503 | trace.max_entries = TRACK_ADDRS_COUNT; | 522 | trace.max_entries = TRACK_ADDRS_COUNT; |
| 504 | trace.entries = p->addrs; | 523 | trace.entries = p->addrs; |
| 505 | trace.skip = 3; | 524 | trace.skip = 3; |
| 525 | metadata_access_enable(); | ||
| 506 | save_stack_trace(&trace); | 526 | save_stack_trace(&trace); |
| 527 | metadata_access_disable(); | ||
| 507 | 528 | ||
| 508 | /* See rant in lockdep.c */ | 529 | /* See rant in lockdep.c */ |
| 509 | if (trace.nr_entries != 0 && | 530 | if (trace.nr_entries != 0 && |
| @@ -629,7 +650,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
| 629 | dump_stack(); | 650 | dump_stack(); |
| 630 | } | 651 | } |
| 631 | 652 | ||
| 632 | static void object_err(struct kmem_cache *s, struct page *page, | 653 | void object_err(struct kmem_cache *s, struct page *page, |
| 633 | u8 *object, char *reason) | 654 | u8 *object, char *reason) |
| 634 | { | 655 | { |
| 635 | slab_bug(s, "%s", reason); | 656 | slab_bug(s, "%s", reason); |
| @@ -677,7 +698,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
| 677 | u8 *fault; | 698 | u8 *fault; |
| 678 | u8 *end; | 699 | u8 *end; |
| 679 | 700 | ||
| 701 | metadata_access_enable(); | ||
| 680 | fault = memchr_inv(start, value, bytes); | 702 | fault = memchr_inv(start, value, bytes); |
| 703 | metadata_access_disable(); | ||
| 681 | if (!fault) | 704 | if (!fault) |
| 682 | return 1; | 705 | return 1; |
| 683 | 706 | ||
| @@ -770,7 +793,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
| 770 | if (!remainder) | 793 | if (!remainder) |
| 771 | return 1; | 794 | return 1; |
| 772 | 795 | ||
| 796 | metadata_access_enable(); | ||
| 773 | fault = memchr_inv(end - remainder, POISON_INUSE, remainder); | 797 | fault = memchr_inv(end - remainder, POISON_INUSE, remainder); |
| 798 | metadata_access_disable(); | ||
| 774 | if (!fault) | 799 | if (!fault) |
| 775 | return 1; | 800 | return 1; |
| 776 | while (end > fault && end[-1] == POISON_INUSE) | 801 | while (end > fault && end[-1] == POISON_INUSE) |
| @@ -1226,11 +1251,13 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, | |||
| 1226 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) | 1251 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) |
| 1227 | { | 1252 | { |
| 1228 | kmemleak_alloc(ptr, size, 1, flags); | 1253 | kmemleak_alloc(ptr, size, 1, flags); |
| 1254 | kasan_kmalloc_large(ptr, size); | ||
| 1229 | } | 1255 | } |
| 1230 | 1256 | ||
| 1231 | static inline void kfree_hook(const void *x) | 1257 | static inline void kfree_hook(const void *x) |
| 1232 | { | 1258 | { |
| 1233 | kmemleak_free(x); | 1259 | kmemleak_free(x); |
| 1260 | kasan_kfree_large(x); | ||
| 1234 | } | 1261 | } |
| 1235 | 1262 | ||
| 1236 | static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, | 1263 | static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, |
| @@ -1253,6 +1280,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, | |||
| 1253 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | 1280 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
| 1254 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); | 1281 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); |
| 1255 | memcg_kmem_put_cache(s); | 1282 | memcg_kmem_put_cache(s); |
| 1283 | kasan_slab_alloc(s, object); | ||
| 1256 | } | 1284 | } |
| 1257 | 1285 | ||
| 1258 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 1286 | static inline void slab_free_hook(struct kmem_cache *s, void *x) |
| @@ -1276,6 +1304,8 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
| 1276 | #endif | 1304 | #endif |
| 1277 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1305 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
| 1278 | debug_check_no_obj_freed(x, s->object_size); | 1306 | debug_check_no_obj_freed(x, s->object_size); |
| 1307 | |||
| 1308 | kasan_slab_free(s, x); | ||
| 1279 | } | 1309 | } |
| 1280 | 1310 | ||
| 1281 | /* | 1311 | /* |
| @@ -1370,8 +1400,11 @@ static void setup_object(struct kmem_cache *s, struct page *page, | |||
| 1370 | void *object) | 1400 | void *object) |
| 1371 | { | 1401 | { |
| 1372 | setup_object_debug(s, page, object); | 1402 | setup_object_debug(s, page, object); |
| 1373 | if (unlikely(s->ctor)) | 1403 | if (unlikely(s->ctor)) { |
| 1404 | kasan_unpoison_object_data(s, object); | ||
| 1374 | s->ctor(object); | 1405 | s->ctor(object); |
| 1406 | kasan_poison_object_data(s, object); | ||
| 1407 | } | ||
| 1375 | } | 1408 | } |
| 1376 | 1409 | ||
| 1377 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | 1410 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) |
| @@ -1404,6 +1437,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1404 | if (unlikely(s->flags & SLAB_POISON)) | 1437 | if (unlikely(s->flags & SLAB_POISON)) |
| 1405 | memset(start, POISON_INUSE, PAGE_SIZE << order); | 1438 | memset(start, POISON_INUSE, PAGE_SIZE << order); |
| 1406 | 1439 | ||
| 1440 | kasan_poison_slab(page); | ||
| 1441 | |||
| 1407 | for_each_object_idx(p, idx, s, start, page->objects) { | 1442 | for_each_object_idx(p, idx, s, start, page->objects) { |
| 1408 | setup_object(s, page, p); | 1443 | setup_object(s, page, p); |
| 1409 | if (likely(idx < page->objects)) | 1444 | if (likely(idx < page->objects)) |
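Taken together, the hooks wired up here give every slab object a poison life cycle: the whole page is poisoned in new_slab(), an object is unpoisoned around its constructor and again on allocation, and re-poisoned on free. A deliberately buggy, hypothetical snippet of the class of bug this lets KASAN report:

    struct demo { int a; };

    static void kasan_uaf_demo(void)
    {
            struct demo *p = kmalloc(sizeof(*p), GFP_KERNEL);  /* kasan_kmalloc() unpoisons */

            if (!p)
                    return;
            kfree(p);       /* kasan_slab_free() poisons the object again */
            p->a = 1;       /* use-after-free: now caught and reported by KASAN */
    }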
| @@ -2007,6 +2042,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
| 2007 | int pages; | 2042 | int pages; |
| 2008 | int pobjects; | 2043 | int pobjects; |
| 2009 | 2044 | ||
| 2045 | preempt_disable(); | ||
| 2010 | do { | 2046 | do { |
| 2011 | pages = 0; | 2047 | pages = 0; |
| 2012 | pobjects = 0; | 2048 | pobjects = 0; |
| @@ -2040,6 +2076,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
| 2040 | 2076 | ||
| 2041 | } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) | 2077 | } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) |
| 2042 | != oldpage); | 2078 | != oldpage); |
| 2079 | if (unlikely(!s->cpu_partial)) { | ||
| 2080 | unsigned long flags; | ||
| 2081 | |||
| 2082 | local_irq_save(flags); | ||
| 2083 | unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); | ||
| 2084 | local_irq_restore(flags); | ||
| 2085 | } | ||
| 2086 | preempt_enable(); | ||
| 2043 | #endif | 2087 | #endif |
| 2044 | } | 2088 | } |
| 2045 | 2089 | ||
| @@ -2398,13 +2442,24 @@ redo: | |||
| 2398 | * reading from one cpu area. That does not matter as long | 2442 | * reading from one cpu area. That does not matter as long |
| 2399 | * as we end up on the original cpu again when doing the cmpxchg. | 2443 | * as we end up on the original cpu again when doing the cmpxchg. |
| 2400 | * | 2444 | * |
| 2401 | * Preemption is disabled for the retrieval of the tid because that | 2445 | * We should guarantee that tid and kmem_cache are retrieved on |
| 2402 | * must occur from the current processor. We cannot allow rescheduling | 2446 | * the same cpu. It could be different if CONFIG_PREEMPT so we need |
| 2403 | * on a different processor between the determination of the pointer | 2447 | * to check if it is matched or not. |
| 2404 | * and the retrieval of the tid. | ||
| 2405 | */ | 2448 | */ |
| 2406 | preempt_disable(); | 2449 | do { |
| 2407 | c = this_cpu_ptr(s->cpu_slab); | 2450 | tid = this_cpu_read(s->cpu_slab->tid); |
| 2451 | c = raw_cpu_ptr(s->cpu_slab); | ||
| 2452 | } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); | ||
| 2453 | |||
| 2454 | /* | ||
| 2455 | * Irqless object alloc/free algorithm used here depends on sequence | ||
| 2456 | * of fetching cpu_slab's data. tid should be fetched before anything | ||
| 2457 | * on c to guarantee that object and page associated with previous tid | ||
| 2458 | * won't be used with current tid. If we fetch tid first, object and | ||
| 2459 | * page could be one associated with next tid and our alloc/free | ||
| 2460 | * request will be failed. In this case, we will retry. So, no problem. | ||
| 2461 | */ | ||
| 2462 | barrier(); | ||
| 2408 | 2463 | ||
| 2409 | /* | 2464 | /* |
| 2410 | * The transaction ids are globally unique per cpu and per operation on | 2465 | * The transaction ids are globally unique per cpu and per operation on |
| @@ -2412,8 +2467,6 @@ redo: | |||
| 2412 | * occurs on the right processor and that there was no operation on the | 2467 | * occurs on the right processor and that there was no operation on the |
| 2413 | * linked list in between. | 2468 | * linked list in between. |
| 2414 | */ | 2469 | */ |
| 2415 | tid = c->tid; | ||
| 2416 | preempt_enable(); | ||
| 2417 | 2470 | ||
| 2418 | object = c->freelist; | 2471 | object = c->freelist; |
| 2419 | page = c->page; | 2472 | page = c->page; |
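The snapshot loop above can still be followed by a migration; what makes it safe is that the transaction id is re-checked by the cmpxchg that commits the allocation. Simplified from the existing slab_alloc_node() fast path (not an exact copy of the tree):

    if (unlikely(!this_cpu_cmpxchg_double(
                    s->cpu_slab->freelist, s->cpu_slab->tid,
                    object, tid,
                    get_freepointer_safe(s, object), next_tid(tid)))) {
            note_cmpxchg_failure("slab_alloc", s, tid);
            goto redo;      /* raced or migrated: take a fresh snapshot */
    }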
| @@ -2479,6 +2532,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) | |||
| 2479 | { | 2532 | { |
| 2480 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); | 2533 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); |
| 2481 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); | 2534 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); |
| 2535 | kasan_kmalloc(s, ret, size); | ||
| 2482 | return ret; | 2536 | return ret; |
| 2483 | } | 2537 | } |
| 2484 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | 2538 | EXPORT_SYMBOL(kmem_cache_alloc_trace); |
| @@ -2505,6 +2559,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, | |||
| 2505 | 2559 | ||
| 2506 | trace_kmalloc_node(_RET_IP_, ret, | 2560 | trace_kmalloc_node(_RET_IP_, ret, |
| 2507 | size, s->size, gfpflags, node); | 2561 | size, s->size, gfpflags, node); |
| 2562 | |||
| 2563 | kasan_kmalloc(s, ret, size); | ||
| 2508 | return ret; | 2564 | return ret; |
| 2509 | } | 2565 | } |
| 2510 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | 2566 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
| @@ -2512,7 +2568,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | |||
| 2512 | #endif | 2568 | #endif |
| 2513 | 2569 | ||
| 2514 | /* | 2570 | /* |
| 2515 | * Slow patch handling. This may still be called frequently since objects | 2571 | * Slow path handling. This may still be called frequently since objects |
| 2516 | * have a longer lifetime than the cpu slabs in most processing loads. | 2572 | * have a longer lifetime than the cpu slabs in most processing loads. |
| 2517 | * | 2573 | * |
| 2518 | * So we still attempt to reduce cache line usage. Just take the slab | 2574 | * So we still attempt to reduce cache line usage. Just take the slab |
| @@ -2659,11 +2715,13 @@ redo: | |||
| 2659 | * data is retrieved via this pointer. If we are on the same cpu | 2715 | * data is retrieved via this pointer. If we are on the same cpu |
| 2660 | * during the cmpxchg then the free will succedd. | 2716 | * during the cmpxchg then the free will succedd. |
| 2661 | */ | 2717 | */ |
| 2662 | preempt_disable(); | 2718 | do { |
| 2663 | c = this_cpu_ptr(s->cpu_slab); | 2719 | tid = this_cpu_read(s->cpu_slab->tid); |
| 2720 | c = raw_cpu_ptr(s->cpu_slab); | ||
| 2721 | } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); | ||
| 2664 | 2722 | ||
| 2665 | tid = c->tid; | 2723 | /* Same with comment on barrier() in slab_alloc_node() */ |
| 2666 | preempt_enable(); | 2724 | barrier(); |
| 2667 | 2725 | ||
| 2668 | if (likely(page == c->page)) { | 2726 | if (likely(page == c->page)) { |
| 2669 | set_freepointer(s, object, c->freelist); | 2727 | set_freepointer(s, object, c->freelist); |
| @@ -2888,6 +2946,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
| 2888 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 2946 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
| 2889 | init_tracking(kmem_cache_node, n); | 2947 | init_tracking(kmem_cache_node, n); |
| 2890 | #endif | 2948 | #endif |
| 2949 | kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); | ||
| 2891 | init_kmem_cache_node(n); | 2950 | init_kmem_cache_node(n); |
| 2892 | inc_slabs_node(kmem_cache_node, node, page->objects); | 2951 | inc_slabs_node(kmem_cache_node, node, page->objects); |
| 2893 | 2952 | ||
| @@ -3260,6 +3319,8 @@ void *__kmalloc(size_t size, gfp_t flags) | |||
| 3260 | 3319 | ||
| 3261 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); | 3320 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); |
| 3262 | 3321 | ||
| 3322 | kasan_kmalloc(s, ret, size); | ||
| 3323 | |||
| 3263 | return ret; | 3324 | return ret; |
| 3264 | } | 3325 | } |
| 3265 | EXPORT_SYMBOL(__kmalloc); | 3326 | EXPORT_SYMBOL(__kmalloc); |
| @@ -3303,12 +3364,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
| 3303 | 3364 | ||
| 3304 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); | 3365 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); |
| 3305 | 3366 | ||
| 3367 | kasan_kmalloc(s, ret, size); | ||
| 3368 | |||
| 3306 | return ret; | 3369 | return ret; |
| 3307 | } | 3370 | } |
| 3308 | EXPORT_SYMBOL(__kmalloc_node); | 3371 | EXPORT_SYMBOL(__kmalloc_node); |
| 3309 | #endif | 3372 | #endif |
| 3310 | 3373 | ||
| 3311 | size_t ksize(const void *object) | 3374 | static size_t __ksize(const void *object) |
| 3312 | { | 3375 | { |
| 3313 | struct page *page; | 3376 | struct page *page; |
| 3314 | 3377 | ||
| @@ -3324,6 +3387,15 @@ size_t ksize(const void *object) | |||
| 3324 | 3387 | ||
| 3325 | return slab_ksize(page->slab_cache); | 3388 | return slab_ksize(page->slab_cache); |
| 3326 | } | 3389 | } |
| 3390 | |||
| 3391 | size_t ksize(const void *object) | ||
| 3392 | { | ||
| 3393 | size_t size = __ksize(object); | ||
| 3394 | /* We assume that ksize callers could use whole allocated area, | ||
| 3395 | so we need unpoison this area. */ | ||
| 3396 | kasan_krealloc(object, size); | ||
| 3397 | return size; | ||
| 3398 | } | ||
| 3327 | EXPORT_SYMBOL(ksize); | 3399 | EXPORT_SYMBOL(ksize); |
| 3328 | 3400 | ||
| 3329 | void kfree(const void *x) | 3401 | void kfree(const void *x) |
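The split above exists because kasan_kmalloc() only unpoisons the bytes that were requested, while ksize() promises the caller the whole underlying object. A hypothetical caller showing why the extra unpoison is needed:

    static void ksize_demo(void)
    {
            char *p = kmalloc(5, GFP_KERNEL);       /* only 5 bytes unpoisoned */
            size_t n;

            if (!p)
                    return;
            n = ksize(p);           /* e.g. 8: unpoisons the full allocation */
            memset(p, 0, n);        /* legal only because ksize() unpoisoned it */
            kfree(p);
    }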
| @@ -3347,69 +3419,92 @@ void kfree(const void *x) | |||
| 3347 | } | 3419 | } |
| 3348 | EXPORT_SYMBOL(kfree); | 3420 | EXPORT_SYMBOL(kfree); |
| 3349 | 3421 | ||
| 3422 | #define SHRINK_PROMOTE_MAX 32 | ||
| 3423 | |||
| 3350 | /* | 3424 | /* |
| 3351 | * kmem_cache_shrink removes empty slabs from the partial lists and sorts | 3425 | * kmem_cache_shrink discards empty slabs and promotes the slabs filled |
| 3352 | * the remaining slabs by the number of items in use. The slabs with the | 3426 | * up most to the head of the partial lists. New allocations will then |
| 3353 | * most items in use come first. New allocations will then fill those up | 3427 | * fill those up and thus they can be removed from the partial lists. |
| 3354 | * and thus they can be removed from the partial lists. | ||
| 3355 | * | 3428 | * |
| 3356 | * The slabs with the least items are placed last. This results in them | 3429 | * The slabs with the least items are placed last. This results in them |
| 3357 | * being allocated from last increasing the chance that the last objects | 3430 | * being allocated from last increasing the chance that the last objects |
| 3358 | * are freed in them. | 3431 | * are freed in them. |
| 3359 | */ | 3432 | */ |
| 3360 | int __kmem_cache_shrink(struct kmem_cache *s) | 3433 | int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) |
| 3361 | { | 3434 | { |
| 3362 | int node; | 3435 | int node; |
| 3363 | int i; | 3436 | int i; |
| 3364 | struct kmem_cache_node *n; | 3437 | struct kmem_cache_node *n; |
| 3365 | struct page *page; | 3438 | struct page *page; |
| 3366 | struct page *t; | 3439 | struct page *t; |
| 3367 | int objects = oo_objects(s->max); | 3440 | struct list_head discard; |
| 3368 | struct list_head *slabs_by_inuse = | 3441 | struct list_head promote[SHRINK_PROMOTE_MAX]; |
| 3369 | kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); | ||
| 3370 | unsigned long flags; | 3442 | unsigned long flags; |
| 3443 | int ret = 0; | ||
| 3371 | 3444 | ||
| 3372 | if (!slabs_by_inuse) | 3445 | if (deactivate) { |
| 3373 | return -ENOMEM; | 3446 | /* |
| 3447 | * Disable empty slabs caching. Used to avoid pinning offline | ||
| 3448 | * memory cgroups by kmem pages that can be freed. | ||
| 3449 | */ | ||
| 3450 | s->cpu_partial = 0; | ||
| 3451 | s->min_partial = 0; | ||
| 3452 | |||
| 3453 | /* | ||
| 3454 | * s->cpu_partial is checked locklessly (see put_cpu_partial), | ||
| 3455 | * so we have to make sure the change is visible. | ||
| 3456 | */ | ||
| 3457 | kick_all_cpus_sync(); | ||
| 3458 | } | ||
| 3374 | 3459 | ||
| 3375 | flush_all(s); | 3460 | flush_all(s); |
| 3376 | for_each_kmem_cache_node(s, node, n) { | 3461 | for_each_kmem_cache_node(s, node, n) { |
| 3377 | if (!n->nr_partial) | 3462 | INIT_LIST_HEAD(&discard); |
| 3378 | continue; | 3463 | for (i = 0; i < SHRINK_PROMOTE_MAX; i++) |
| 3379 | 3464 | INIT_LIST_HEAD(promote + i); | |
| 3380 | for (i = 0; i < objects; i++) | ||
| 3381 | INIT_LIST_HEAD(slabs_by_inuse + i); | ||
| 3382 | 3465 | ||
| 3383 | spin_lock_irqsave(&n->list_lock, flags); | 3466 | spin_lock_irqsave(&n->list_lock, flags); |
| 3384 | 3467 | ||
| 3385 | /* | 3468 | /* |
| 3386 | * Build lists indexed by the items in use in each slab. | 3469 | * Build lists of slabs to discard or promote. |
| 3387 | * | 3470 | * |
| 3388 | * Note that concurrent frees may occur while we hold the | 3471 | * Note that concurrent frees may occur while we hold the |
| 3389 | * list_lock. page->inuse here is the upper limit. | 3472 | * list_lock. page->inuse here is the upper limit. |
| 3390 | */ | 3473 | */ |
| 3391 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 3474 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
| 3392 | list_move(&page->lru, slabs_by_inuse + page->inuse); | 3475 | int free = page->objects - page->inuse; |
| 3393 | if (!page->inuse) | 3476 | |
| 3477 | /* Do not reread page->inuse */ | ||
| 3478 | barrier(); | ||
| 3479 | |||
| 3480 | /* We do not keep full slabs on the list */ | ||
| 3481 | BUG_ON(free <= 0); | ||
| 3482 | |||
| 3483 | if (free == page->objects) { | ||
| 3484 | list_move(&page->lru, &discard); | ||
| 3394 | n->nr_partial--; | 3485 | n->nr_partial--; |
| 3486 | } else if (free <= SHRINK_PROMOTE_MAX) | ||
| 3487 | list_move(&page->lru, promote + free - 1); | ||
| 3395 | } | 3488 | } |
| 3396 | 3489 | ||
| 3397 | /* | 3490 | /* |
| 3398 | * Rebuild the partial list with the slabs filled up most | 3491 | * Promote the slabs filled up most to the head of the |
| 3399 | * first and the least used slabs at the end. | 3492 | * partial list. |
| 3400 | */ | 3493 | */ |
| 3401 | for (i = objects - 1; i > 0; i--) | 3494 | for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) |
| 3402 | list_splice(slabs_by_inuse + i, n->partial.prev); | 3495 | list_splice(promote + i, &n->partial); |
| 3403 | 3496 | ||
| 3404 | spin_unlock_irqrestore(&n->list_lock, flags); | 3497 | spin_unlock_irqrestore(&n->list_lock, flags); |
| 3405 | 3498 | ||
| 3406 | /* Release empty slabs */ | 3499 | /* Release empty slabs */ |
| 3407 | list_for_each_entry_safe(page, t, slabs_by_inuse, lru) | 3500 | list_for_each_entry_safe(page, t, &discard, lru) |
| 3408 | discard_slab(s, page); | 3501 | discard_slab(s, page); |
| 3502 | |||
| 3503 | if (slabs_node(s, node)) | ||
| 3504 | ret = 1; | ||
| 3409 | } | 3505 | } |
| 3410 | 3506 | ||
| 3411 | kfree(slabs_by_inuse); | 3507 | return ret; |
| 3412 | return 0; | ||
| 3413 | } | 3508 | } |
| 3414 | 3509 | ||
| 3415 | static int slab_mem_going_offline_callback(void *arg) | 3510 | static int slab_mem_going_offline_callback(void *arg) |
| @@ -3418,7 +3513,7 @@ static int slab_mem_going_offline_callback(void *arg) | |||
| 3418 | 3513 | ||
| 3419 | mutex_lock(&slab_mutex); | 3514 | mutex_lock(&slab_mutex); |
| 3420 | list_for_each_entry(s, &slab_caches, list) | 3515 | list_for_each_entry(s, &slab_caches, list) |
| 3421 | __kmem_cache_shrink(s); | 3516 | __kmem_cache_shrink(s, false); |
| 3422 | mutex_unlock(&slab_mutex); | 3517 | mutex_unlock(&slab_mutex); |
| 3423 | 3518 | ||
| 3424 | return 0; | 3519 | return 0; |
| @@ -3566,6 +3661,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) | |||
| 3566 | p->slab_cache = s; | 3661 | p->slab_cache = s; |
| 3567 | #endif | 3662 | #endif |
| 3568 | } | 3663 | } |
| 3664 | slab_init_memcg_params(s); | ||
| 3569 | list_add(&s->list, &slab_caches); | 3665 | list_add(&s->list, &slab_caches); |
| 3570 | return s; | 3666 | return s; |
| 3571 | } | 3667 | } |
| @@ -3624,13 +3720,10 @@ struct kmem_cache * | |||
| 3624 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 3720 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
| 3625 | unsigned long flags, void (*ctor)(void *)) | 3721 | unsigned long flags, void (*ctor)(void *)) |
| 3626 | { | 3722 | { |
| 3627 | struct kmem_cache *s; | 3723 | struct kmem_cache *s, *c; |
| 3628 | 3724 | ||
| 3629 | s = find_mergeable(size, align, flags, name, ctor); | 3725 | s = find_mergeable(size, align, flags, name, ctor); |
| 3630 | if (s) { | 3726 | if (s) { |
| 3631 | int i; | ||
| 3632 | struct kmem_cache *c; | ||
| 3633 | |||
| 3634 | s->refcount++; | 3727 | s->refcount++; |
| 3635 | 3728 | ||
| 3636 | /* | 3729 | /* |
| @@ -3640,10 +3733,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, | |||
| 3640 | s->object_size = max(s->object_size, (int)size); | 3733 | s->object_size = max(s->object_size, (int)size); |
| 3641 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3734 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
| 3642 | 3735 | ||
| 3643 | for_each_memcg_cache_index(i) { | 3736 | for_each_memcg_cache(c, s) { |
| 3644 | c = cache_from_memcg_idx(s, i); | ||
| 3645 | if (!c) | ||
| 3646 | continue; | ||
| 3647 | c->object_size = s->object_size; | 3737 | c->object_size = s->object_size; |
| 3648 | c->inuse = max_t(int, c->inuse, | 3738 | c->inuse = max_t(int, c->inuse, |
| 3649 | ALIGN(size, sizeof(void *))); | 3739 | ALIGN(size, sizeof(void *))); |
| @@ -4070,20 +4160,16 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
| 4070 | 4160 | ||
| 4071 | if (num_online_cpus() > 1 && | 4161 | if (num_online_cpus() > 1 && |
| 4072 | !cpumask_empty(to_cpumask(l->cpus)) && | 4162 | !cpumask_empty(to_cpumask(l->cpus)) && |
| 4073 | len < PAGE_SIZE - 60) { | 4163 | len < PAGE_SIZE - 60) |
| 4074 | len += sprintf(buf + len, " cpus="); | 4164 | len += scnprintf(buf + len, PAGE_SIZE - len - 50, |
| 4075 | len += cpulist_scnprintf(buf + len, | 4165 | " cpus=%*pbl", |
| 4076 | PAGE_SIZE - len - 50, | 4166 | cpumask_pr_args(to_cpumask(l->cpus))); |
| 4077 | to_cpumask(l->cpus)); | ||
| 4078 | } | ||
| 4079 | 4167 | ||
| 4080 | if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && | 4168 | if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && |
| 4081 | len < PAGE_SIZE - 60) { | 4169 | len < PAGE_SIZE - 60) |
| 4082 | len += sprintf(buf + len, " nodes="); | 4170 | len += scnprintf(buf + len, PAGE_SIZE - len - 50, |
| 4083 | len += nodelist_scnprintf(buf + len, | 4171 | " nodes=%*pbl", |
| 4084 | PAGE_SIZE - len - 50, | 4172 | nodemask_pr_args(&l->nodes)); |
| 4085 | l->nodes); | ||
| 4086 | } | ||
| 4087 | 4173 | ||
| 4088 | len += sprintf(buf + len, "\n"); | 4174 | len += sprintf(buf + len, "\n"); |
| 4089 | } | 4175 | } |
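The conversion above relies on the bitmap printf extension: "%*pbl" prints a cpumask or nodemask as a compact range list, and cpumask_pr_args()/nodemask_pr_args() expand to the width/pointer pair the specifier expects. A one-line, hypothetical example:

    pr_info("demo: online cpus=%*pbl\n", cpumask_pr_args(cpu_online_mask));
    /* prints something like "demo: online cpus=0-3,6" */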
| @@ -4680,12 +4766,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf) | |||
| 4680 | static ssize_t shrink_store(struct kmem_cache *s, | 4766 | static ssize_t shrink_store(struct kmem_cache *s, |
| 4681 | const char *buf, size_t length) | 4767 | const char *buf, size_t length) |
| 4682 | { | 4768 | { |
| 4683 | if (buf[0] == '1') { | 4769 | if (buf[0] == '1') |
| 4684 | int rc = kmem_cache_shrink(s); | 4770 | kmem_cache_shrink(s); |
| 4685 | 4771 | else | |
| 4686 | if (rc) | ||
| 4687 | return rc; | ||
| 4688 | } else | ||
| 4689 | return -EINVAL; | 4772 | return -EINVAL; |
| 4690 | return length; | 4773 | return length; |
| 4691 | } | 4774 | } |
| @@ -4909,7 +4992,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
| 4909 | err = attribute->store(s, buf, len); | 4992 | err = attribute->store(s, buf, len); |
| 4910 | #ifdef CONFIG_MEMCG_KMEM | 4993 | #ifdef CONFIG_MEMCG_KMEM |
| 4911 | if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { | 4994 | if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { |
| 4912 | int i; | 4995 | struct kmem_cache *c; |
| 4913 | 4996 | ||
| 4914 | mutex_lock(&slab_mutex); | 4997 | mutex_lock(&slab_mutex); |
| 4915 | if (s->max_attr_size < len) | 4998 | if (s->max_attr_size < len) |
| @@ -4932,11 +5015,8 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
| 4932 | * directly either failed or succeeded, in which case we loop | 5015 | * directly either failed or succeeded, in which case we loop |
| 4933 | * through the descendants with best-effort propagation. | 5016 | * through the descendants with best-effort propagation. |
| 4934 | */ | 5017 | */ |
| 4935 | for_each_memcg_cache_index(i) { | 5018 | for_each_memcg_cache(c, s) |
| 4936 | struct kmem_cache *c = cache_from_memcg_idx(s, i); | 5019 | attribute->store(c, buf, len); |
| 4937 | if (c) | ||
| 4938 | attribute->store(c, buf, len); | ||
| 4939 | } | ||
| 4940 | mutex_unlock(&slab_mutex); | 5020 | mutex_unlock(&slab_mutex); |
| 4941 | } | 5021 | } |
| 4942 | #endif | 5022 | #endif |
| @@ -4953,7 +5033,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) | |||
| 4953 | if (is_root_cache(s)) | 5033 | if (is_root_cache(s)) |
| 4954 | return; | 5034 | return; |
| 4955 | 5035 | ||
| 4956 | root_cache = s->memcg_params->root_cache; | 5036 | root_cache = s->memcg_params.root_cache; |
| 4957 | 5037 | ||
| 4958 | /* | 5038 | /* |
| 4959 | * This mean this cache had no attribute written. Therefore, no point | 5039 | * This mean this cache had no attribute written. Therefore, no point |
| @@ -5033,7 +5113,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) | |||
| 5033 | { | 5113 | { |
| 5034 | #ifdef CONFIG_MEMCG_KMEM | 5114 | #ifdef CONFIG_MEMCG_KMEM |
| 5035 | if (!is_root_cache(s)) | 5115 | if (!is_root_cache(s)) |
| 5036 | return s->memcg_params->root_cache->memcg_kset; | 5116 | return s->memcg_params.root_cache->memcg_kset; |
| 5037 | #endif | 5117 | #endif |
| 5038 | return slab_kset; | 5118 | return slab_kset; |
| 5039 | } | 5119 | } |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -1138,12 +1138,8 @@ void __init swap_setup(void) | |||
| 1138 | #ifdef CONFIG_SWAP | 1138 | #ifdef CONFIG_SWAP |
| 1139 | int i; | 1139 | int i; |
| 1140 | 1140 | ||
| 1141 | if (bdi_init(swapper_spaces[0].backing_dev_info)) | 1141 | for (i = 0; i < MAX_SWAPFILES; i++) |
| 1142 | panic("Failed to init swap bdi"); | ||
| 1143 | for (i = 0; i < MAX_SWAPFILES; i++) { | ||
| 1144 | spin_lock_init(&swapper_spaces[i].tree_lock); | 1142 | spin_lock_init(&swapper_spaces[i].tree_lock); |
| 1145 | INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); | ||
| 1146 | } | ||
| 1147 | #endif | 1143 | #endif |
| 1148 | 1144 | ||
| 1149 | /* Use a smaller cluster for small-memory machines */ | 1145 | /* Use a smaller cluster for small-memory machines */ |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 9711342987a0..405923f77334 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -32,17 +32,11 @@ static const struct address_space_operations swap_aops = { | |||
| 32 | #endif | 32 | #endif |
| 33 | }; | 33 | }; |
| 34 | 34 | ||
| 35 | static struct backing_dev_info swap_backing_dev_info = { | ||
| 36 | .name = "swap", | ||
| 37 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | ||
| 38 | }; | ||
| 39 | |||
| 40 | struct address_space swapper_spaces[MAX_SWAPFILES] = { | 35 | struct address_space swapper_spaces[MAX_SWAPFILES] = { |
| 41 | [0 ... MAX_SWAPFILES - 1] = { | 36 | [0 ... MAX_SWAPFILES - 1] = { |
| 42 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | 37 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), |
| 43 | .i_mmap_writable = ATOMIC_INIT(0), | 38 | .i_mmap_writable = ATOMIC_INIT(0), |
| 44 | .a_ops = &swap_aops, | 39 | .a_ops = &swap_aops, |
| 45 | .backing_dev_info = &swap_backing_dev_info, | ||
| 46 | } | 40 | } |
| 47 | }; | 41 | }; |
| 48 | 42 | ||
diff --git a/mm/truncate.c b/mm/truncate.c index f1e4d6052369..ddec5a5966d7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) | |||
| 112 | struct address_space *mapping = page->mapping; | 112 | struct address_space *mapping = page->mapping; |
| 113 | if (mapping && mapping_cap_account_dirty(mapping)) { | 113 | if (mapping && mapping_cap_account_dirty(mapping)) { |
| 114 | dec_zone_page_state(page, NR_FILE_DIRTY); | 114 | dec_zone_page_state(page, NR_FILE_DIRTY); |
| 115 | dec_bdi_stat(mapping->backing_dev_info, | 115 | dec_bdi_stat(inode_to_bdi(mapping->host), |
| 116 | BDI_RECLAIMABLE); | 116 | BDI_RECLAIMABLE); |
| 117 | if (account_size) | 117 | if (account_size) |
| 118 | task_io_account_cancelled_write(account_size); | 118 | task_io_account_cancelled_write(account_size); |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
| @@ -12,10 +12,30 @@ | |||
| 12 | #include <linux/hugetlb.h> | 12 | #include <linux/hugetlb.h> |
| 13 | #include <linux/vmalloc.h> | 13 | #include <linux/vmalloc.h> |
| 14 | 14 | ||
| 15 | #include <asm/sections.h> | ||
| 15 | #include <asm/uaccess.h> | 16 | #include <asm/uaccess.h> |
| 16 | 17 | ||
| 17 | #include "internal.h" | 18 | #include "internal.h" |
| 18 | 19 | ||
| 20 | static inline int is_kernel_rodata(unsigned long addr) | ||
| 21 | { | ||
| 22 | return addr >= (unsigned long)__start_rodata && | ||
| 23 | addr < (unsigned long)__end_rodata; | ||
| 24 | } | ||
| 25 | |||
| 26 | /** | ||
| 27 | * kfree_const - conditionally free memory | ||
| 28 | * @x: pointer to the memory | ||
| 29 | * | ||
| 30 | * Function calls kfree only if @x is not in .rodata section. | ||
| 31 | */ | ||
| 32 | void kfree_const(const void *x) | ||
| 33 | { | ||
| 34 | if (!is_kernel_rodata((unsigned long)x)) | ||
| 35 | kfree(x); | ||
| 36 | } | ||
| 37 | EXPORT_SYMBOL(kfree_const); | ||
| 38 | |||
| 19 | /** | 39 | /** |
| 20 | * kstrdup - allocate space for and copy an existing string | 40 | * kstrdup - allocate space for and copy an existing string |
| 21 | * @s: the string to duplicate | 41 | * @s: the string to duplicate |
| @@ -38,6 +58,24 @@ char *kstrdup(const char *s, gfp_t gfp) | |||
| 38 | EXPORT_SYMBOL(kstrdup); | 58 | EXPORT_SYMBOL(kstrdup); |
| 39 | 59 | ||
| 40 | /** | 60 | /** |
| 61 | * kstrdup_const - conditionally duplicate an existing const string | ||
| 62 | * @s: the string to duplicate | ||
| 63 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | ||
| 64 | * | ||
| 65 | * Function returns source string if it is in .rodata section otherwise it | ||
| 66 | * fallbacks to kstrdup. | ||
| 67 | * Strings allocated by kstrdup_const should be freed by kfree_const. | ||
| 68 | */ | ||
| 69 | const char *kstrdup_const(const char *s, gfp_t gfp) | ||
| 70 | { | ||
| 71 | if (is_kernel_rodata((unsigned long)s)) | ||
| 72 | return s; | ||
| 73 | |||
| 74 | return kstrdup(s, gfp); | ||
| 75 | } | ||
| 76 | EXPORT_SYMBOL(kstrdup_const); | ||
| 77 | |||
| 78 | /** | ||
| 41 | * kstrndup - allocate space for and copy an existing string | 79 | * kstrndup - allocate space for and copy an existing string |
| 42 | * @s: the string to duplicate | 80 | * @s: the string to duplicate |
| 43 | * @max: read at most @max chars from @s | 81 | * @max: read at most @max chars from @s |
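A hypothetical user of the pair above: when the string passed in is a literal (and therefore sits in .rodata), kstrdup_const() avoids the allocation entirely and kfree_const() later becomes a no-op; kmem_cache_create() in this series is exactly such a caller.

    static const char *demo_name;

    static int demo_init(const char *override)
    {
            /* a literal lives in .rodata, so no duplicate is allocated */
            demo_name = kstrdup_const(override ? override : "mm_demo", GFP_KERNEL);
            return demo_name ? 0 : -ENOMEM;
    }

    static void demo_exit(void)
    {
            kfree_const(demo_name); /* frees only if it was really duplicated */
    }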
| @@ -240,14 +278,8 @@ int __weak get_user_pages_fast(unsigned long start, | |||
| 240 | int nr_pages, int write, struct page **pages) | 278 | int nr_pages, int write, struct page **pages) |
| 241 | { | 279 | { |
| 242 | struct mm_struct *mm = current->mm; | 280 | struct mm_struct *mm = current->mm; |
| 243 | int ret; | 281 | return get_user_pages_unlocked(current, mm, start, nr_pages, |
| 244 | 282 | write, 0, pages); | |
| 245 | down_read(&mm->mmap_sem); | ||
| 246 | ret = get_user_pages(current, mm, start, nr_pages, | ||
| 247 | write, 0, pages, NULL); | ||
| 248 | up_read(&mm->mmap_sem); | ||
| 249 | |||
| 250 | return ret; | ||
| 251 | } | 283 | } |
| 252 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | 284 | EXPORT_SYMBOL_GPL(get_user_pages_fast); |
| 253 | 285 | ||
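The weak fallback now leans on get_user_pages_unlocked(), which takes and drops mmap_sem itself. A sketch of a caller that benefits from the shorter form (pin_demo_pages() is illustrative; the argument order matches the call above):

    static long pin_demo_pages(unsigned long start, int nr_pages,
                               struct page **pages)
    {
            /* no down_read(&mm->mmap_sem)/up_read() boilerplate needed */
            return get_user_pages_unlocked(current, current->mm, start,
                                           nr_pages, 1 /* write */,
                                           0 /* force */, pages);
    }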
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 39c338896416..35b25e1340ca 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -1324,10 +1324,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
| 1324 | if (unlikely(!area)) | 1324 | if (unlikely(!area)) |
| 1325 | return NULL; | 1325 | return NULL; |
| 1326 | 1326 | ||
| 1327 | /* | 1327 | if (!(flags & VM_NO_GUARD)) |
| 1328 | * We always allocate a guard page. | 1328 | size += PAGE_SIZE; |
| 1329 | */ | ||
| 1330 | size += PAGE_SIZE; | ||
| 1331 | 1329 | ||
| 1332 | va = alloc_vmap_area(size, align, start, end, node, gfp_mask); | 1330 | va = alloc_vmap_area(size, align, start, end, node, gfp_mask); |
| 1333 | if (IS_ERR(va)) { | 1331 | if (IS_ERR(va)) { |
| @@ -1621,6 +1619,7 @@ fail: | |||
| 1621 | * @end: vm area range end | 1619 | * @end: vm area range end |
| 1622 | * @gfp_mask: flags for the page level allocator | 1620 | * @gfp_mask: flags for the page level allocator |
| 1623 | * @prot: protection mask for the allocated pages | 1621 | * @prot: protection mask for the allocated pages |
| 1622 | * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) | ||
| 1624 | * @node: node to use for allocation or NUMA_NO_NODE | 1623 | * @node: node to use for allocation or NUMA_NO_NODE |
| 1625 | * @caller: caller's return address | 1624 | * @caller: caller's return address |
| 1626 | * | 1625 | * |
| @@ -1630,7 +1629,8 @@ fail: | |||
| 1630 | */ | 1629 | */ |
| 1631 | void *__vmalloc_node_range(unsigned long size, unsigned long align, | 1630 | void *__vmalloc_node_range(unsigned long size, unsigned long align, |
| 1632 | unsigned long start, unsigned long end, gfp_t gfp_mask, | 1631 | unsigned long start, unsigned long end, gfp_t gfp_mask, |
| 1633 | pgprot_t prot, int node, const void *caller) | 1632 | pgprot_t prot, unsigned long vm_flags, int node, |
| 1633 | const void *caller) | ||
| 1634 | { | 1634 | { |
| 1635 | struct vm_struct *area; | 1635 | struct vm_struct *area; |
| 1636 | void *addr; | 1636 | void *addr; |
| @@ -1640,8 +1640,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
| 1640 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1640 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
| 1641 | goto fail; | 1641 | goto fail; |
| 1642 | 1642 | ||
| 1643 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, | 1643 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | |
| 1644 | start, end, node, gfp_mask, caller); | 1644 | vm_flags, start, end, node, gfp_mask, caller); |
| 1645 | if (!area) | 1645 | if (!area) |
| 1646 | goto fail; | 1646 | goto fail; |
| 1647 | 1647 | ||
| @@ -1690,7 +1690,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
| 1690 | int node, const void *caller) | 1690 | int node, const void *caller) |
| 1691 | { | 1691 | { |
| 1692 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, | 1692 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, |
| 1693 | gfp_mask, prot, node, caller); | 1693 | gfp_mask, prot, 0, node, caller); |
| 1694 | } | 1694 | } |
| 1695 | 1695 | ||
| 1696 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1696 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
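A minimal caller sketch for the extended interface, assuming the usual VMALLOC_START/VMALLOC_END range and PAGE_KERNEL protections; alloc_unguarded() is an illustrative name, not part of the patch:

	/* Sketch: allocate a vmalloc area with no guard page appended. */
	static void *alloc_unguarded(unsigned long size)
	{
		return __vmalloc_node_range(size, PAGE_SIZE,
					    VMALLOC_START, VMALLOC_END,
					    GFP_KERNEL, PAGE_KERNEL,
					    VM_NO_GUARD, NUMA_NO_NODE,
					    __builtin_return_address(0));
	}

Passing 0 for @vm_flags, as __vmalloc_node() now does, keeps the historical behaviour of always adding a guard page.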
diff --git a/mm/vmscan.c b/mm/vmscan.c index dcd90c891d8e..5e8eadd71bac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -91,6 +91,9 @@ struct scan_control { | |||
| 91 | /* Can pages be swapped as part of reclaim? */ | 91 | /* Can pages be swapped as part of reclaim? */ |
| 92 | unsigned int may_swap:1; | 92 | unsigned int may_swap:1; |
| 93 | 93 | ||
| 94 | /* Can cgroups be reclaimed below their normal consumption range? */ | ||
| 95 | unsigned int may_thrash:1; | ||
| 96 | |||
| 94 | unsigned int hibernation_mode:1; | 97 | unsigned int hibernation_mode:1; |
| 95 | 98 | ||
| 96 | /* One of the zones is ready for compaction */ | 99 | /* One of the zones is ready for compaction */ |
| @@ -229,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker); | |||
| 229 | 232 | ||
| 230 | #define SHRINK_BATCH 128 | 233 | #define SHRINK_BATCH 128 |
| 231 | 234 | ||
| 232 | static unsigned long shrink_slabs(struct shrink_control *shrinkctl, | 235 | static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, |
| 233 | struct shrinker *shrinker, | 236 | struct shrinker *shrinker, |
| 234 | unsigned long nr_scanned, | 237 | unsigned long nr_scanned, |
| 235 | unsigned long nr_eligible) | 238 | unsigned long nr_eligible) |
| 236 | { | 239 | { |
| 237 | unsigned long freed = 0; | 240 | unsigned long freed = 0; |
| 238 | unsigned long long delta; | 241 | unsigned long long delta; |
| @@ -341,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, | |||
| 341 | } | 344 | } |
| 342 | 345 | ||
| 343 | /** | 346 | /** |
| 344 | * shrink_node_slabs - shrink slab caches of a given node | 347 | * shrink_slab - shrink slab caches |
| 345 | * @gfp_mask: allocation context | 348 | * @gfp_mask: allocation context |
| 346 | * @nid: node whose slab caches to target | 349 | * @nid: node whose slab caches to target |
| 350 | * @memcg: memory cgroup whose slab caches to target | ||
| 347 | * @nr_scanned: pressure numerator | 351 | * @nr_scanned: pressure numerator |
| 348 | * @nr_eligible: pressure denominator | 352 | * @nr_eligible: pressure denominator |
| 349 | * | 353 | * |
| @@ -352,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, | |||
| 352 | * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, | 356 | * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, |
| 353 | * unaware shrinkers will receive a node id of 0 instead. | 357 | * unaware shrinkers will receive a node id of 0 instead. |
| 354 | * | 358 | * |
| 359 | * @memcg specifies the memory cgroup to target. If it is not NULL, | ||
| 360 | * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan | ||
| 361 | * objects from the memory cgroup specified. Otherwise all shrinkers | ||
| 362 | * are called, and memcg-aware shrinkers are expected to scan the | ||
| 363 | * global list instead. | ||
| 364 | * | ||
| 355 | * @nr_scanned and @nr_eligible form a ratio that indicates how much of | 365 | * @nr_scanned and @nr_eligible form a ratio that indicates how much of |
| 356 | * the available objects should be scanned. Page reclaim for example | 366 | * the available objects should be scanned. Page reclaim for example |
| 357 | * passes the number of pages scanned and the number of pages on the | 367 | * passes the number of pages scanned and the number of pages on the |
| @@ -362,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, | |||
| 362 | * | 372 | * |
| 363 | * Returns the number of reclaimed slab objects. | 373 | * Returns the number of reclaimed slab objects. |
| 364 | */ | 374 | */ |
| 365 | unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, | 375 | static unsigned long shrink_slab(gfp_t gfp_mask, int nid, |
| 366 | unsigned long nr_scanned, | 376 | struct mem_cgroup *memcg, |
| 367 | unsigned long nr_eligible) | 377 | unsigned long nr_scanned, |
| 378 | unsigned long nr_eligible) | ||
| 368 | { | 379 | { |
| 369 | struct shrinker *shrinker; | 380 | struct shrinker *shrinker; |
| 370 | unsigned long freed = 0; | 381 | unsigned long freed = 0; |
| 371 | 382 | ||
| 383 | if (memcg && !memcg_kmem_is_active(memcg)) | ||
| 384 | return 0; | ||
| 385 | |||
| 372 | if (nr_scanned == 0) | 386 | if (nr_scanned == 0) |
| 373 | nr_scanned = SWAP_CLUSTER_MAX; | 387 | nr_scanned = SWAP_CLUSTER_MAX; |
| 374 | 388 | ||
| @@ -387,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, | |||
| 387 | struct shrink_control sc = { | 401 | struct shrink_control sc = { |
| 388 | .gfp_mask = gfp_mask, | 402 | .gfp_mask = gfp_mask, |
| 389 | .nid = nid, | 403 | .nid = nid, |
| 404 | .memcg = memcg, | ||
| 390 | }; | 405 | }; |
| 391 | 406 | ||
| 407 | if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) | ||
| 408 | continue; | ||
| 409 | |||
| 392 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) | 410 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) |
| 393 | sc.nid = 0; | 411 | sc.nid = 0; |
| 394 | 412 | ||
| 395 | freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); | 413 | freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); |
| 396 | } | 414 | } |
| 397 | 415 | ||
| 398 | up_read(&shrinker_rwsem); | 416 | up_read(&shrinker_rwsem); |
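For the new @memcg argument to have any effect, a cache must register its shrinker as memcg aware; a schematic registration under assumed names (foo_count/foo_scan are hypothetical callbacks) could look like:

	/* Sketch: a shrinker that participates in per-memcg slab reclaim. */
	static struct shrinker foo_shrinker = {
		.count_objects	= foo_count,	/* hypothetical count callback */
		.scan_objects	= foo_scan,	/* hypothetical scan callback */
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
	};

	static int __init foo_cache_init(void)
	{
		return register_shrinker(&foo_shrinker);
	}

Shrinkers without SHRINKER_MEMCG_AWARE are simply skipped whenever shrink_slab() is invoked for a non-NULL memcg.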
| @@ -401,6 +419,29 @@ out: | |||
| 401 | return freed; | 419 | return freed; |
| 402 | } | 420 | } |
| 403 | 421 | ||
| 422 | void drop_slab_node(int nid) | ||
| 423 | { | ||
| 424 | unsigned long freed; | ||
| 425 | |||
| 426 | do { | ||
| 427 | struct mem_cgroup *memcg = NULL; | ||
| 428 | |||
| 429 | freed = 0; | ||
| 430 | do { | ||
| 431 | freed += shrink_slab(GFP_KERNEL, nid, memcg, | ||
| 432 | 1000, 1000); | ||
| 433 | } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); | ||
| 434 | } while (freed > 10); | ||
| 435 | } | ||
| 436 | |||
| 437 | void drop_slab(void) | ||
| 438 | { | ||
| 439 | int nid; | ||
| 440 | |||
| 441 | for_each_online_node(nid) | ||
| 442 | drop_slab_node(nid); | ||
| 443 | } | ||
| 444 | |||
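A hedged sketch of how the new entry points are meant to be driven; the caller shown here is illustrative, not part of this patch:

	/* Sketch: shrink the slab caches of the local node, or of every node. */
	drop_slab_node(numa_node_id());	/* one node, looping over all memcgs */
	drop_slab();			/* every online node */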
| 404 | static inline int is_page_cache_freeable(struct page *page) | 445 | static inline int is_page_cache_freeable(struct page *page) |
| 405 | { | 446 | { |
| 406 | /* | 447 | /* |
| @@ -497,7 +538,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
| 497 | } | 538 | } |
| 498 | if (mapping->a_ops->writepage == NULL) | 539 | if (mapping->a_ops->writepage == NULL) |
| 499 | return PAGE_ACTIVATE; | 540 | return PAGE_ACTIVATE; |
| 500 | if (!may_write_to_queue(mapping->backing_dev_info, sc)) | 541 | if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) |
| 501 | return PAGE_KEEP; | 542 | return PAGE_KEEP; |
| 502 | 543 | ||
| 503 | if (clear_page_dirty_for_io(page)) { | 544 | if (clear_page_dirty_for_io(page)) { |
| @@ -876,7 +917,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 876 | */ | 917 | */ |
| 877 | mapping = page_mapping(page); | 918 | mapping = page_mapping(page); |
| 878 | if (((dirty || writeback) && mapping && | 919 | if (((dirty || writeback) && mapping && |
| 879 | bdi_write_congested(mapping->backing_dev_info)) || | 920 | bdi_write_congested(inode_to_bdi(mapping->host))) || |
| 880 | (writeback && PageReclaim(page))) | 921 | (writeback && PageReclaim(page))) |
| 881 | nr_congested++; | 922 | nr_congested++; |
| 882 | 923 | ||
| @@ -1903,8 +1944,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, | |||
| 1903 | * latencies, so it's better to scan a minimum amount there as | 1944 | * latencies, so it's better to scan a minimum amount there as |
| 1904 | * well. | 1945 | * well. |
| 1905 | */ | 1946 | */ |
| 1906 | if (current_is_kswapd() && !zone_reclaimable(zone)) | 1947 | if (current_is_kswapd()) { |
| 1907 | force_scan = true; | 1948 | if (!zone_reclaimable(zone)) |
| 1949 | force_scan = true; | ||
| 1950 | if (!mem_cgroup_lruvec_online(lruvec)) | ||
| 1951 | force_scan = true; | ||
| 1952 | } | ||
| 1908 | if (!global_reclaim(sc)) | 1953 | if (!global_reclaim(sc)) |
| 1909 | force_scan = true; | 1954 | force_scan = true; |
| 1910 | 1955 | ||
| @@ -2269,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
| 2269 | static bool shrink_zone(struct zone *zone, struct scan_control *sc, | 2314 | static bool shrink_zone(struct zone *zone, struct scan_control *sc, |
| 2270 | bool is_classzone) | 2315 | bool is_classzone) |
| 2271 | { | 2316 | { |
| 2317 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
| 2272 | unsigned long nr_reclaimed, nr_scanned; | 2318 | unsigned long nr_reclaimed, nr_scanned; |
| 2273 | bool reclaimable = false; | 2319 | bool reclaimable = false; |
| 2274 | 2320 | ||
| @@ -2287,15 +2333,28 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
| 2287 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 2333 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
| 2288 | do { | 2334 | do { |
| 2289 | unsigned long lru_pages; | 2335 | unsigned long lru_pages; |
| 2336 | unsigned long scanned; | ||
| 2290 | struct lruvec *lruvec; | 2337 | struct lruvec *lruvec; |
| 2291 | int swappiness; | 2338 | int swappiness; |
| 2292 | 2339 | ||
| 2340 | if (mem_cgroup_low(root, memcg)) { | ||
| 2341 | if (!sc->may_thrash) | ||
| 2342 | continue; | ||
| 2343 | mem_cgroup_events(memcg, MEMCG_LOW, 1); | ||
| 2344 | } | ||
| 2345 | |||
| 2293 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2346 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
| 2294 | swappiness = mem_cgroup_swappiness(memcg); | 2347 | swappiness = mem_cgroup_swappiness(memcg); |
| 2348 | scanned = sc->nr_scanned; | ||
| 2295 | 2349 | ||
| 2296 | shrink_lruvec(lruvec, swappiness, sc, &lru_pages); | 2350 | shrink_lruvec(lruvec, swappiness, sc, &lru_pages); |
| 2297 | zone_lru_pages += lru_pages; | 2351 | zone_lru_pages += lru_pages; |
| 2298 | 2352 | ||
| 2353 | if (memcg && is_classzone) | ||
| 2354 | shrink_slab(sc->gfp_mask, zone_to_nid(zone), | ||
| 2355 | memcg, sc->nr_scanned - scanned, | ||
| 2356 | lru_pages); | ||
| 2357 | |||
| 2299 | /* | 2358 | /* |
| 2300 | * Direct reclaim and kswapd have to scan all memory | 2359 | * Direct reclaim and kswapd have to scan all memory |
| 2301 | * cgroups to fulfill the overall scan target for the | 2360 | * cgroups to fulfill the overall scan target for the |
| @@ -2311,26 +2370,20 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
| 2311 | mem_cgroup_iter_break(root, memcg); | 2370 | mem_cgroup_iter_break(root, memcg); |
| 2312 | break; | 2371 | break; |
| 2313 | } | 2372 | } |
| 2314 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 2373 | } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); |
| 2315 | } while (memcg); | ||
| 2316 | 2374 | ||
| 2317 | /* | 2375 | /* |
| 2318 | * Shrink the slab caches in the same proportion that | 2376 | * Shrink the slab caches in the same proportion that |
| 2319 | * the eligible LRU pages were scanned. | 2377 | * the eligible LRU pages were scanned. |
| 2320 | */ | 2378 | */ |
| 2321 | if (global_reclaim(sc) && is_classzone) { | 2379 | if (global_reclaim(sc) && is_classzone) |
| 2322 | struct reclaim_state *reclaim_state; | 2380 | shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, |
| 2323 | 2381 | sc->nr_scanned - nr_scanned, | |
| 2324 | shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), | 2382 | zone_lru_pages); |
| 2325 | sc->nr_scanned - nr_scanned, | 2383 | |
| 2326 | zone_lru_pages); | 2384 | if (reclaim_state) { |
| 2327 | 2385 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | |
| 2328 | reclaim_state = current->reclaim_state; | 2386 | reclaim_state->reclaimed_slab = 0; |
| 2329 | if (reclaim_state) { | ||
| 2330 | sc->nr_reclaimed += | ||
| 2331 | reclaim_state->reclaimed_slab; | ||
| 2332 | reclaim_state->reclaimed_slab = 0; | ||
| 2333 | } | ||
| 2334 | } | 2387 | } |
| 2335 | 2388 | ||
| 2336 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2389 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, |
| @@ -2515,10 +2568,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
| 2515 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | 2568 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, |
| 2516 | struct scan_control *sc) | 2569 | struct scan_control *sc) |
| 2517 | { | 2570 | { |
| 2571 | int initial_priority = sc->priority; | ||
| 2518 | unsigned long total_scanned = 0; | 2572 | unsigned long total_scanned = 0; |
| 2519 | unsigned long writeback_threshold; | 2573 | unsigned long writeback_threshold; |
| 2520 | bool zones_reclaimable; | 2574 | bool zones_reclaimable; |
| 2521 | 2575 | retry: | |
| 2522 | delayacct_freepages_start(); | 2576 | delayacct_freepages_start(); |
| 2523 | 2577 | ||
| 2524 | if (global_reclaim(sc)) | 2578 | if (global_reclaim(sc)) |
| @@ -2568,6 +2622,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 2568 | if (sc->compaction_ready) | 2622 | if (sc->compaction_ready) |
| 2569 | return 1; | 2623 | return 1; |
| 2570 | 2624 | ||
| 2625 | /* Untapped cgroup reserves? Don't OOM, retry. */ | ||
| 2626 | if (!sc->may_thrash) { | ||
| 2627 | sc->priority = initial_priority; | ||
| 2628 | sc->may_thrash = 1; | ||
| 2629 | goto retry; | ||
| 2630 | } | ||
| 2631 | |||
| 2571 | /* Any of the zones still reclaimable? Don't OOM. */ | 2632 | /* Any of the zones still reclaimable? Don't OOM. */ |
| 2572 | if (zones_reclaimable) | 2633 | if (zones_reclaimable) |
| 2573 | return 1; | 2634 | return 1; |
| @@ -3175,7 +3236,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
| 3175 | */ | 3236 | */ |
| 3176 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && | 3237 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && |
| 3177 | pfmemalloc_watermark_ok(pgdat)) | 3238 | pfmemalloc_watermark_ok(pgdat)) |
| 3178 | wake_up(&pgdat->pfmemalloc_wait); | 3239 | wake_up_all(&pgdat->pfmemalloc_wait); |
| 3179 | 3240 | ||
| 3180 | /* | 3241 | /* |
| 3181 | * Fragmentation may mean that the system cannot be rebalanced | 3242 | * Fragmentation may mean that the system cannot be rebalanced |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 1284f89fca08..4f5cd974e11a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -17,6 +17,9 @@ | |||
| 17 | #include <linux/cpu.h> | 17 | #include <linux/cpu.h> |
| 18 | #include <linux/cpumask.h> | 18 | #include <linux/cpumask.h> |
| 19 | #include <linux/vmstat.h> | 19 | #include <linux/vmstat.h> |
| 20 | #include <linux/proc_fs.h> | ||
| 21 | #include <linux/seq_file.h> | ||
| 22 | #include <linux/debugfs.h> | ||
| 20 | #include <linux/sched.h> | 23 | #include <linux/sched.h> |
| 21 | #include <linux/math64.h> | 24 | #include <linux/math64.h> |
| 22 | #include <linux/writeback.h> | 25 | #include <linux/writeback.h> |
| @@ -670,66 +673,6 @@ int fragmentation_index(struct zone *zone, unsigned int order) | |||
| 670 | } | 673 | } |
| 671 | #endif | 674 | #endif |
| 672 | 675 | ||
| 673 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) | ||
| 674 | #include <linux/proc_fs.h> | ||
| 675 | #include <linux/seq_file.h> | ||
| 676 | |||
| 677 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
| 678 | "Unmovable", | ||
| 679 | "Reclaimable", | ||
| 680 | "Movable", | ||
| 681 | "Reserve", | ||
| 682 | #ifdef CONFIG_CMA | ||
| 683 | "CMA", | ||
| 684 | #endif | ||
| 685 | #ifdef CONFIG_MEMORY_ISOLATION | ||
| 686 | "Isolate", | ||
| 687 | #endif | ||
| 688 | }; | ||
| 689 | |||
| 690 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
| 691 | { | ||
| 692 | pg_data_t *pgdat; | ||
| 693 | loff_t node = *pos; | ||
| 694 | for (pgdat = first_online_pgdat(); | ||
| 695 | pgdat && node; | ||
| 696 | pgdat = next_online_pgdat(pgdat)) | ||
| 697 | --node; | ||
| 698 | |||
| 699 | return pgdat; | ||
| 700 | } | ||
| 701 | |||
| 702 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
| 703 | { | ||
| 704 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 705 | |||
| 706 | (*pos)++; | ||
| 707 | return next_online_pgdat(pgdat); | ||
| 708 | } | ||
| 709 | |||
| 710 | static void frag_stop(struct seq_file *m, void *arg) | ||
| 711 | { | ||
| 712 | } | ||
| 713 | |||
| 714 | /* Walk all the zones in a node and print using a callback */ | ||
| 715 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | ||
| 716 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) | ||
| 717 | { | ||
| 718 | struct zone *zone; | ||
| 719 | struct zone *node_zones = pgdat->node_zones; | ||
| 720 | unsigned long flags; | ||
| 721 | |||
| 722 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
| 723 | if (!populated_zone(zone)) | ||
| 724 | continue; | ||
| 725 | |||
| 726 | spin_lock_irqsave(&zone->lock, flags); | ||
| 727 | print(m, pgdat, zone); | ||
| 728 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 729 | } | ||
| 730 | } | ||
| 731 | #endif | ||
| 732 | |||
| 733 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) | 676 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) |
| 734 | #ifdef CONFIG_ZONE_DMA | 677 | #ifdef CONFIG_ZONE_DMA |
| 735 | #define TEXT_FOR_DMA(xx) xx "_dma", | 678 | #define TEXT_FOR_DMA(xx) xx "_dma", |
| @@ -907,7 +850,66 @@ const char * const vmstat_text[] = { | |||
| 907 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ | 850 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
| 908 | 851 | ||
| 909 | 852 | ||
| 853 | #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ | ||
| 854 | defined(CONFIG_PROC_FS) | ||
| 855 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
| 856 | { | ||
| 857 | pg_data_t *pgdat; | ||
| 858 | loff_t node = *pos; | ||
| 859 | |||
| 860 | for (pgdat = first_online_pgdat(); | ||
| 861 | pgdat && node; | ||
| 862 | pgdat = next_online_pgdat(pgdat)) | ||
| 863 | --node; | ||
| 864 | |||
| 865 | return pgdat; | ||
| 866 | } | ||
| 867 | |||
| 868 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
| 869 | { | ||
| 870 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 871 | |||
| 872 | (*pos)++; | ||
| 873 | return next_online_pgdat(pgdat); | ||
| 874 | } | ||
| 875 | |||
| 876 | static void frag_stop(struct seq_file *m, void *arg) | ||
| 877 | { | ||
| 878 | } | ||
| 879 | |||
| 880 | /* Walk all the zones in a node and print using a callback */ | ||
| 881 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | ||
| 882 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) | ||
| 883 | { | ||
| 884 | struct zone *zone; | ||
| 885 | struct zone *node_zones = pgdat->node_zones; | ||
| 886 | unsigned long flags; | ||
| 887 | |||
| 888 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
| 889 | if (!populated_zone(zone)) | ||
| 890 | continue; | ||
| 891 | |||
| 892 | spin_lock_irqsave(&zone->lock, flags); | ||
| 893 | print(m, pgdat, zone); | ||
| 894 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 895 | } | ||
| 896 | } | ||
| 897 | #endif | ||
| 898 | |||
| 910 | #ifdef CONFIG_PROC_FS | 899 | #ifdef CONFIG_PROC_FS |
| 900 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
| 901 | "Unmovable", | ||
| 902 | "Reclaimable", | ||
| 903 | "Movable", | ||
| 904 | "Reserve", | ||
| 905 | #ifdef CONFIG_CMA | ||
| 906 | "CMA", | ||
| 907 | #endif | ||
| 908 | #ifdef CONFIG_MEMORY_ISOLATION | ||
| 909 | "Isolate", | ||
| 910 | #endif | ||
| 911 | }; | ||
| 912 | |||
| 911 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, | 913 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, |
| 912 | struct zone *zone) | 914 | struct zone *zone) |
| 913 | { | 915 | { |
| @@ -1435,8 +1437,8 @@ static void vmstat_shepherd(struct work_struct *w) | |||
| 1435 | if (need_update(cpu) && | 1437 | if (need_update(cpu) && |
| 1436 | cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) | 1438 | cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) |
| 1437 | 1439 | ||
| 1438 | schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), | 1440 | schedule_delayed_work_on(cpu, |
| 1439 | __round_jiffies_relative(sysctl_stat_interval, cpu)); | 1441 | &per_cpu(vmstat_work, cpu), 0); |
| 1440 | 1442 | ||
| 1441 | put_online_cpus(); | 1443 | put_online_cpus(); |
| 1442 | 1444 | ||
| @@ -1450,7 +1452,7 @@ static void __init start_shepherd_timer(void) | |||
| 1450 | int cpu; | 1452 | int cpu; |
| 1451 | 1453 | ||
| 1452 | for_each_possible_cpu(cpu) | 1454 | for_each_possible_cpu(cpu) |
| 1453 | INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), | 1455 | INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu), |
| 1454 | vmstat_update); | 1456 | vmstat_update); |
| 1455 | 1457 | ||
| 1456 | if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) | 1458 | if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) |
| @@ -1536,8 +1538,6 @@ static int __init setup_vmstat(void) | |||
| 1536 | module_init(setup_vmstat) | 1538 | module_init(setup_vmstat) |
| 1537 | 1539 | ||
| 1538 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) | 1540 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) |
| 1539 | #include <linux/debugfs.h> | ||
| 1540 | |||
| 1541 | 1541 | ||
| 1542 | /* | 1542 | /* |
| 1543 | * Return an index indicating how much of the available free memory is | 1543 | * Return an index indicating how much of the available free memory is |
diff --git a/mm/workingset.c b/mm/workingset.c index f7216fa7da27..aa017133744b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c | |||
| @@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, | |||
| 275 | 275 | ||
| 276 | /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ | 276 | /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ |
| 277 | local_irq_disable(); | 277 | local_irq_disable(); |
| 278 | shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); | 278 | shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); |
| 279 | local_irq_enable(); | 279 | local_irq_enable(); |
| 280 | 280 | ||
| 281 | pages = node_present_pages(sc->nid); | 281 | pages = node_present_pages(sc->nid); |
| @@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, | |||
| 302 | } | 302 | } |
| 303 | 303 | ||
| 304 | static enum lru_status shadow_lru_isolate(struct list_head *item, | 304 | static enum lru_status shadow_lru_isolate(struct list_head *item, |
| 305 | struct list_lru_one *lru, | ||
| 305 | spinlock_t *lru_lock, | 306 | spinlock_t *lru_lock, |
| 306 | void *arg) | 307 | void *arg) |
| 307 | { | 308 | { |
| @@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, | |||
| 332 | goto out; | 333 | goto out; |
| 333 | } | 334 | } |
| 334 | 335 | ||
| 335 | list_del_init(item); | 336 | list_lru_isolate(lru, item); |
| 336 | spin_unlock(lru_lock); | 337 | spin_unlock(lru_lock); |
| 337 | 338 | ||
| 338 | /* | 339 | /* |
| @@ -376,8 +377,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, | |||
| 376 | 377 | ||
| 377 | /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ | 378 | /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ |
| 378 | local_irq_disable(); | 379 | local_irq_disable(); |
| 379 | ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, | 380 | ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, |
| 380 | shadow_lru_isolate, NULL, &sc->nr_to_scan); | 381 | shadow_lru_isolate, NULL); |
| 381 | local_irq_enable(); | 382 | local_irq_enable(); |
| 382 | return ret; | 383 | return ret; |
| 383 | } | 384 | } |
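The workingset conversion above illustrates the general pattern for list_lru-backed shrinkers; a condensed sketch with hypothetical names (foo_lru, foo_isolate) follows:

	/* Sketch: count/scan callbacks of a list_lru-backed shrinker using the
	 * new shrink_control-aware helpers. foo_isolate() takes the extra
	 * struct list_lru_one * argument and calls list_lru_isolate(). */
	static struct list_lru foo_lru;

	static unsigned long foo_count(struct shrinker *shrink,
				       struct shrink_control *sc)
	{
		return list_lru_shrink_count(&foo_lru, sc);
	}

	static unsigned long foo_scan(struct shrinker *shrink,
				      struct shrink_control *sc)
	{
		return list_lru_shrink_walk(&foo_lru, sc, foo_isolate, NULL);
	}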
diff --git a/mm/zbud.c b/mm/zbud.c --- a/mm/zbud.c +++ b/mm/zbud.c | |||
| @@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = { | |||
| 130 | .evict = zbud_zpool_evict | 130 | .evict = zbud_zpool_evict |
| 131 | }; | 131 | }; |
| 132 | 132 | ||
| 133 | static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) | 133 | static void *zbud_zpool_create(char *name, gfp_t gfp, |
| 134 | struct zpool_ops *zpool_ops) | ||
| 134 | { | 135 | { |
| 135 | return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); | 136 | return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); |
| 136 | } | 137 | } |
diff --git a/mm/zpool.c b/mm/zpool.c index 739cdf0d183a..bacdab6e47de 100644 --- a/mm/zpool.c +++ b/mm/zpool.c | |||
| @@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver) | |||
| 129 | /** | 129 | /** |
| 130 | * zpool_create_pool() - Create a new zpool | 130 | * zpool_create_pool() - Create a new zpool |
| 131 | * @type The type of the zpool to create (e.g. zbud, zsmalloc) | 131 | * @type The type of the zpool to create (e.g. zbud, zsmalloc) |
| 132 | * @name The name of the zpool (e.g. zram0, zswap) | ||
| 132 | * @gfp The GFP flags to use when allocating the pool. | 133 | * @gfp The GFP flags to use when allocating the pool. |
| 133 | * @ops The optional ops callback. | 134 | * @ops The optional ops callback. |
| 134 | * | 135 | * |
| @@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver) | |||
| 140 | * | 141 | * |
| 141 | * Returns: New zpool on success, NULL on failure. | 142 | * Returns: New zpool on success, NULL on failure. |
| 142 | */ | 143 | */ |
| 143 | struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) | 144 | struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, |
| 145 | struct zpool_ops *ops) | ||
| 144 | { | 146 | { |
| 145 | struct zpool_driver *driver; | 147 | struct zpool_driver *driver; |
| 146 | struct zpool *zpool; | 148 | struct zpool *zpool; |
| @@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) | |||
| 168 | 170 | ||
| 169 | zpool->type = driver->type; | 171 | zpool->type = driver->type; |
| 170 | zpool->driver = driver; | 172 | zpool->driver = driver; |
| 171 | zpool->pool = driver->create(gfp, ops); | 173 | zpool->pool = driver->create(name, gfp, ops); |
| 172 | zpool->ops = ops; | 174 | zpool->ops = ops; |
| 173 | 175 | ||
| 174 | if (!zpool->pool) { | 176 | if (!zpool->pool) { |
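A hedged usage sketch of the extended creation call, with the pool name chosen per the kerneldoc examples above ("zram0", "zswap"); error handling trimmed:

	/* Sketch: create a named zbud-backed zpool, use it, tear it down. */
	struct zpool *pool = zpool_create_pool("zbud", "zram0", GFP_KERNEL, NULL);

	if (pool) {
		/* ... zpool_malloc()/zpool_free() as before ... */
		zpool_destroy_pool(pool);
	}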
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b72403927aa4..0dec1fa5f656 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
| @@ -91,6 +91,7 @@ | |||
| 91 | #include <linux/hardirq.h> | 91 | #include <linux/hardirq.h> |
| 92 | #include <linux/spinlock.h> | 92 | #include <linux/spinlock.h> |
| 93 | #include <linux/types.h> | 93 | #include <linux/types.h> |
| 94 | #include <linux/debugfs.h> | ||
| 94 | #include <linux/zsmalloc.h> | 95 | #include <linux/zsmalloc.h> |
| 95 | #include <linux/zpool.h> | 96 | #include <linux/zpool.h> |
| 96 | 97 | ||
| @@ -168,6 +169,22 @@ enum fullness_group { | |||
| 168 | ZS_FULL | 169 | ZS_FULL |
| 169 | }; | 170 | }; |
| 170 | 171 | ||
| 172 | enum zs_stat_type { | ||
| 173 | OBJ_ALLOCATED, | ||
| 174 | OBJ_USED, | ||
| 175 | NR_ZS_STAT_TYPE, | ||
| 176 | }; | ||
| 177 | |||
| 178 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 179 | |||
| 180 | static struct dentry *zs_stat_root; | ||
| 181 | |||
| 182 | struct zs_size_stat { | ||
| 183 | unsigned long objs[NR_ZS_STAT_TYPE]; | ||
| 184 | }; | ||
| 185 | |||
| 186 | #endif | ||
| 187 | |||
| 171 | /* | 188 | /* |
| 172 | * number of size_classes | 189 | * number of size_classes |
| 173 | */ | 190 | */ |
| @@ -200,6 +217,10 @@ struct size_class { | |||
| 200 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ | 217 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ |
| 201 | int pages_per_zspage; | 218 | int pages_per_zspage; |
| 202 | 219 | ||
| 220 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 221 | struct zs_size_stat stats; | ||
| 222 | #endif | ||
| 223 | |||
| 203 | spinlock_t lock; | 224 | spinlock_t lock; |
| 204 | 225 | ||
| 205 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; | 226 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; |
| @@ -217,10 +238,16 @@ struct link_free { | |||
| 217 | }; | 238 | }; |
| 218 | 239 | ||
| 219 | struct zs_pool { | 240 | struct zs_pool { |
| 241 | char *name; | ||
| 242 | |||
| 220 | struct size_class **size_class; | 243 | struct size_class **size_class; |
| 221 | 244 | ||
| 222 | gfp_t flags; /* allocation flags used when growing pool */ | 245 | gfp_t flags; /* allocation flags used when growing pool */ |
| 223 | atomic_long_t pages_allocated; | 246 | atomic_long_t pages_allocated; |
| 247 | |||
| 248 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 249 | struct dentry *stat_dentry; | ||
| 250 | #endif | ||
| 224 | }; | 251 | }; |
| 225 | 252 | ||
| 226 | /* | 253 | /* |
| @@ -246,9 +273,9 @@ struct mapping_area { | |||
| 246 | 273 | ||
| 247 | #ifdef CONFIG_ZPOOL | 274 | #ifdef CONFIG_ZPOOL |
| 248 | 275 | ||
| 249 | static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) | 276 | static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops) |
| 250 | { | 277 | { |
| 251 | return zs_create_pool(gfp); | 278 | return zs_create_pool(name, gfp); |
| 252 | } | 279 | } |
| 253 | 280 | ||
| 254 | static void zs_zpool_destroy(void *pool) | 281 | static void zs_zpool_destroy(void *pool) |
| @@ -942,6 +969,166 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | |||
| 942 | return true; | 969 | return true; |
| 943 | } | 970 | } |
| 944 | 971 | ||
| 972 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 973 | |||
| 974 | static inline void zs_stat_inc(struct size_class *class, | ||
| 975 | enum zs_stat_type type, unsigned long cnt) | ||
| 976 | { | ||
| 977 | class->stats.objs[type] += cnt; | ||
| 978 | } | ||
| 979 | |||
| 980 | static inline void zs_stat_dec(struct size_class *class, | ||
| 981 | enum zs_stat_type type, unsigned long cnt) | ||
| 982 | { | ||
| 983 | class->stats.objs[type] -= cnt; | ||
| 984 | } | ||
| 985 | |||
| 986 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 987 | enum zs_stat_type type) | ||
| 988 | { | ||
| 989 | return class->stats.objs[type]; | ||
| 990 | } | ||
| 991 | |||
| 992 | static int __init zs_stat_init(void) | ||
| 993 | { | ||
| 994 | if (!debugfs_initialized()) | ||
| 995 | return -ENODEV; | ||
| 996 | |||
| 997 | zs_stat_root = debugfs_create_dir("zsmalloc", NULL); | ||
| 998 | if (!zs_stat_root) | ||
| 999 | return -ENOMEM; | ||
| 1000 | |||
| 1001 | return 0; | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static void __exit zs_stat_exit(void) | ||
| 1005 | { | ||
| 1006 | debugfs_remove_recursive(zs_stat_root); | ||
| 1007 | } | ||
| 1008 | |||
| 1009 | static int zs_stats_size_show(struct seq_file *s, void *v) | ||
| 1010 | { | ||
| 1011 | int i; | ||
| 1012 | struct zs_pool *pool = s->private; | ||
| 1013 | struct size_class *class; | ||
| 1014 | int objs_per_zspage; | ||
| 1015 | unsigned long obj_allocated, obj_used, pages_used; | ||
| 1016 | unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; | ||
| 1017 | |||
| 1018 | seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size", | ||
| 1019 | "obj_allocated", "obj_used", "pages_used"); | ||
| 1020 | |||
| 1021 | for (i = 0; i < zs_size_classes; i++) { | ||
| 1022 | class = pool->size_class[i]; | ||
| 1023 | |||
| 1024 | if (class->index != i) | ||
| 1025 | continue; | ||
| 1026 | |||
| 1027 | spin_lock(&class->lock); | ||
| 1028 | obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); | ||
| 1029 | obj_used = zs_stat_get(class, OBJ_USED); | ||
| 1030 | spin_unlock(&class->lock); | ||
| 1031 | |||
| 1032 | objs_per_zspage = get_maxobj_per_zspage(class->size, | ||
| 1033 | class->pages_per_zspage); | ||
| 1034 | pages_used = obj_allocated / objs_per_zspage * | ||
| 1035 | class->pages_per_zspage; | ||
| 1036 | |||
| 1037 | seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i, | ||
| 1038 | class->size, obj_allocated, obj_used, pages_used); | ||
| 1039 | |||
| 1040 | total_objs += obj_allocated; | ||
| 1041 | total_used_objs += obj_used; | ||
| 1042 | total_pages += pages_used; | ||
| 1043 | } | ||
| 1044 | |||
| 1045 | seq_puts(s, "\n"); | ||
| 1046 | seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "", | ||
| 1047 | total_objs, total_used_objs, total_pages); | ||
| 1048 | |||
| 1049 | return 0; | ||
| 1050 | } | ||
| 1051 | |||
| 1052 | static int zs_stats_size_open(struct inode *inode, struct file *file) | ||
| 1053 | { | ||
| 1054 | return single_open(file, zs_stats_size_show, inode->i_private); | ||
| 1055 | } | ||
| 1056 | |||
| 1057 | static const struct file_operations zs_stat_size_ops = { | ||
| 1058 | .open = zs_stats_size_open, | ||
| 1059 | .read = seq_read, | ||
| 1060 | .llseek = seq_lseek, | ||
| 1061 | .release = single_release, | ||
| 1062 | }; | ||
| 1063 | |||
| 1064 | static int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
| 1065 | { | ||
| 1066 | struct dentry *entry; | ||
| 1067 | |||
| 1068 | if (!zs_stat_root) | ||
| 1069 | return -ENODEV; | ||
| 1070 | |||
| 1071 | entry = debugfs_create_dir(name, zs_stat_root); | ||
| 1072 | if (!entry) { | ||
| 1073 | pr_warn("debugfs dir <%s> creation failed\n", name); | ||
| 1074 | return -ENOMEM; | ||
| 1075 | } | ||
| 1076 | pool->stat_dentry = entry; | ||
| 1077 | |||
| 1078 | entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO, | ||
| 1079 | pool->stat_dentry, pool, &zs_stat_size_ops); | ||
| 1080 | if (!entry) { | ||
| 1081 | pr_warn("%s: debugfs file entry <%s> creation failed\n", | ||
| 1082 | name, "obj_in_classes"); | ||
| 1083 | return -ENOMEM; | ||
| 1084 | } | ||
| 1085 | |||
| 1086 | return 0; | ||
| 1087 | } | ||
| 1088 | |||
| 1089 | static void zs_pool_stat_destroy(struct zs_pool *pool) | ||
| 1090 | { | ||
| 1091 | debugfs_remove_recursive(pool->stat_dentry); | ||
| 1092 | } | ||
| 1093 | |||
| 1094 | #else /* CONFIG_ZSMALLOC_STAT */ | ||
| 1095 | |||
| 1096 | static inline void zs_stat_inc(struct size_class *class, | ||
| 1097 | enum zs_stat_type type, unsigned long cnt) | ||
| 1098 | { | ||
| 1099 | } | ||
| 1100 | |||
| 1101 | static inline void zs_stat_dec(struct size_class *class, | ||
| 1102 | enum zs_stat_type type, unsigned long cnt) | ||
| 1103 | { | ||
| 1104 | } | ||
| 1105 | |||
| 1106 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 1107 | enum zs_stat_type type) | ||
| 1108 | { | ||
| 1109 | return 0; | ||
| 1110 | } | ||
| 1111 | |||
| 1112 | static int __init zs_stat_init(void) | ||
| 1113 | { | ||
| 1114 | return 0; | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | static void __exit zs_stat_exit(void) | ||
| 1118 | { | ||
| 1119 | } | ||
| 1120 | |||
| 1121 | static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
| 1122 | { | ||
| 1123 | return 0; | ||
| 1124 | } | ||
| 1125 | |||
| 1126 | static inline void zs_pool_stat_destroy(struct zs_pool *pool) | ||
| 1127 | { | ||
| 1128 | } | ||
| 1129 | |||
| 1130 | #endif | ||
| 1131 | |||
| 945 | unsigned long zs_get_total_pages(struct zs_pool *pool) | 1132 | unsigned long zs_get_total_pages(struct zs_pool *pool) |
| 946 | { | 1133 | { |
| 947 | return atomic_long_read(&pool->pages_allocated); | 1134 | return atomic_long_read(&pool->pages_allocated); |
| @@ -1074,7 +1261,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
| 1074 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); | 1261 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); |
| 1075 | atomic_long_add(class->pages_per_zspage, | 1262 | atomic_long_add(class->pages_per_zspage, |
| 1076 | &pool->pages_allocated); | 1263 | &pool->pages_allocated); |
| 1264 | |||
| 1077 | spin_lock(&class->lock); | 1265 | spin_lock(&class->lock); |
| 1266 | zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | ||
| 1267 | class->size, class->pages_per_zspage)); | ||
| 1078 | } | 1268 | } |
| 1079 | 1269 | ||
| 1080 | obj = (unsigned long)first_page->freelist; | 1270 | obj = (unsigned long)first_page->freelist; |
| @@ -1088,6 +1278,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
| 1088 | kunmap_atomic(vaddr); | 1278 | kunmap_atomic(vaddr); |
| 1089 | 1279 | ||
| 1090 | first_page->inuse++; | 1280 | first_page->inuse++; |
| 1281 | zs_stat_inc(class, OBJ_USED, 1); | ||
| 1091 | /* Now move the zspage to another fullness group, if required */ | 1282 | /* Now move the zspage to another fullness group, if required */ |
| 1092 | fix_fullness_group(pool, first_page); | 1283 | fix_fullness_group(pool, first_page); |
| 1093 | spin_unlock(&class->lock); | 1284 | spin_unlock(&class->lock); |
| @@ -1128,6 +1319,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj) | |||
| 1128 | 1319 | ||
| 1129 | first_page->inuse--; | 1320 | first_page->inuse--; |
| 1130 | fullness = fix_fullness_group(pool, first_page); | 1321 | fullness = fix_fullness_group(pool, first_page); |
| 1322 | |||
| 1323 | zs_stat_dec(class, OBJ_USED, 1); | ||
| 1324 | if (fullness == ZS_EMPTY) | ||
| 1325 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | ||
| 1326 | class->size, class->pages_per_zspage)); | ||
| 1327 | |||
| 1131 | spin_unlock(&class->lock); | 1328 | spin_unlock(&class->lock); |
| 1132 | 1329 | ||
| 1133 | if (fullness == ZS_EMPTY) { | 1330 | if (fullness == ZS_EMPTY) { |
| @@ -1148,7 +1345,7 @@ EXPORT_SYMBOL_GPL(zs_free); | |||
| 1148 | * On success, a pointer to the newly created pool is returned, | 1345 | * On success, a pointer to the newly created pool is returned, |
| 1149 | * otherwise NULL. | 1346 | * otherwise NULL. |
| 1150 | */ | 1347 | */ |
| 1151 | struct zs_pool *zs_create_pool(gfp_t flags) | 1348 | struct zs_pool *zs_create_pool(char *name, gfp_t flags) |
| 1152 | { | 1349 | { |
| 1153 | int i; | 1350 | int i; |
| 1154 | struct zs_pool *pool; | 1351 | struct zs_pool *pool; |
| @@ -1158,9 +1355,16 @@ struct zs_pool *zs_create_pool(gfp_t flags) | |||
| 1158 | if (!pool) | 1355 | if (!pool) |
| 1159 | return NULL; | 1356 | return NULL; |
| 1160 | 1357 | ||
| 1358 | pool->name = kstrdup(name, GFP_KERNEL); | ||
| 1359 | if (!pool->name) { | ||
| 1360 | kfree(pool); | ||
| 1361 | return NULL; | ||
| 1362 | } | ||
| 1363 | |||
| 1161 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), | 1364 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), |
| 1162 | GFP_KERNEL); | 1365 | GFP_KERNEL); |
| 1163 | if (!pool->size_class) { | 1366 | if (!pool->size_class) { |
| 1367 | kfree(pool->name); | ||
| 1164 | kfree(pool); | 1368 | kfree(pool); |
| 1165 | return NULL; | 1369 | return NULL; |
| 1166 | } | 1370 | } |
| @@ -1210,6 +1414,9 @@ struct zs_pool *zs_create_pool(gfp_t flags) | |||
| 1210 | 1414 | ||
| 1211 | pool->flags = flags; | 1415 | pool->flags = flags; |
| 1212 | 1416 | ||
| 1417 | if (zs_pool_stat_create(name, pool)) | ||
| 1418 | goto err; | ||
| 1419 | |||
| 1213 | return pool; | 1420 | return pool; |
| 1214 | 1421 | ||
| 1215 | err: | 1422 | err: |
| @@ -1222,6 +1429,8 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
| 1222 | { | 1429 | { |
| 1223 | int i; | 1430 | int i; |
| 1224 | 1431 | ||
| 1432 | zs_pool_stat_destroy(pool); | ||
| 1433 | |||
| 1225 | for (i = 0; i < zs_size_classes; i++) { | 1434 | for (i = 0; i < zs_size_classes; i++) { |
| 1226 | int fg; | 1435 | int fg; |
| 1227 | struct size_class *class = pool->size_class[i]; | 1436 | struct size_class *class = pool->size_class[i]; |
| @@ -1242,6 +1451,7 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
| 1242 | } | 1451 | } |
| 1243 | 1452 | ||
| 1244 | kfree(pool->size_class); | 1453 | kfree(pool->size_class); |
| 1454 | kfree(pool->name); | ||
| 1245 | kfree(pool); | 1455 | kfree(pool); |
| 1246 | } | 1456 | } |
| 1247 | EXPORT_SYMBOL_GPL(zs_destroy_pool); | 1457 | EXPORT_SYMBOL_GPL(zs_destroy_pool); |
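Putting the zsmalloc changes together, the pool name now feeds the debugfs hierarchy; a caller sketch with assumed gfp flags (as a zram-style user might pass) follows:

	/* Sketch: with CONFIG_ZSMALLOC_STAT=y the per-class counters appear
	 * under <debugfs>/zsmalloc/<name>/obj_in_classes. */
	struct zs_pool *pool = zs_create_pool("zram0", GFP_NOIO | __GFP_HIGHMEM);

	if (pool) {
		unsigned long handle = zs_malloc(pool, 128);	/* bumps OBJ_USED */

		if (handle)
			zs_free(pool, handle);
		zs_destroy_pool(pool);	/* also removes the debugfs entries */
	}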
| @@ -1250,17 +1460,30 @@ static int __init zs_init(void) | |||
| 1250 | { | 1460 | { |
| 1251 | int ret = zs_register_cpu_notifier(); | 1461 | int ret = zs_register_cpu_notifier(); |
| 1252 | 1462 | ||
| 1253 | if (ret) { | 1463 | if (ret) |
| 1254 | zs_unregister_cpu_notifier(); | 1464 | goto notifier_fail; |
| 1255 | return ret; | ||
| 1256 | } | ||
| 1257 | 1465 | ||
| 1258 | init_zs_size_classes(); | 1466 | init_zs_size_classes(); |
| 1259 | 1467 | ||
| 1260 | #ifdef CONFIG_ZPOOL | 1468 | #ifdef CONFIG_ZPOOL |
| 1261 | zpool_register_driver(&zs_zpool_driver); | 1469 | zpool_register_driver(&zs_zpool_driver); |
| 1262 | #endif | 1470 | #endif |
| 1471 | |||
| 1472 | ret = zs_stat_init(); | ||
| 1473 | if (ret) { | ||
| 1474 | pr_err("zs stat initialization failed\n"); | ||
| 1475 | goto stat_fail; | ||
| 1476 | } | ||
| 1263 | return 0; | 1477 | return 0; |
| 1478 | |||
| 1479 | stat_fail: | ||
| 1480 | #ifdef CONFIG_ZPOOL | ||
| 1481 | zpool_unregister_driver(&zs_zpool_driver); | ||
| 1482 | #endif | ||
| 1483 | notifier_fail: | ||
| 1484 | zs_unregister_cpu_notifier(); | ||
| 1485 | |||
| 1486 | return ret; | ||
| 1264 | } | 1487 | } |
| 1265 | 1488 | ||
| 1266 | static void __exit zs_exit(void) | 1489 | static void __exit zs_exit(void) |
| @@ -1269,6 +1492,8 @@ static void __exit zs_exit(void) | |||
| 1269 | zpool_unregister_driver(&zs_zpool_driver); | 1492 | zpool_unregister_driver(&zs_zpool_driver); |
| 1270 | #endif | 1493 | #endif |
| 1271 | zs_unregister_cpu_notifier(); | 1494 | zs_unregister_cpu_notifier(); |
| 1495 | |||
| 1496 | zs_stat_exit(); | ||
| 1272 | } | 1497 | } |
| 1273 | 1498 | ||
| 1274 | module_init(zs_init); | 1499 | module_init(zs_init); |
diff --git a/mm/zswap.c b/mm/zswap.c index 0cfce9bc51e4..4249e82ff934 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
| @@ -906,11 +906,12 @@ static int __init init_zswap(void) | |||
| 906 | 906 | ||
| 907 | pr_info("loading zswap\n"); | 907 | pr_info("loading zswap\n"); |
| 908 | 908 | ||
| 909 | zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); | 909 | zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, |
| 910 | &zswap_zpool_ops); | ||
| 910 | if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { | 911 | if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { |
| 911 | pr_info("%s zpool not available\n", zswap_zpool_type); | 912 | pr_info("%s zpool not available\n", zswap_zpool_type); |
| 912 | zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; | 913 | zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; |
| 913 | zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, | 914 | zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, |
| 914 | &zswap_zpool_ops); | 915 | &zswap_zpool_ops); |
| 915 | } | 916 | } |
| 916 | if (!zswap_pool) { | 917 | if (!zswap_pool) { |
