Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig10
-rw-r--r--mm/Makefile5
-rw-r--r--mm/backing-dev.c6
-rw-r--r--mm/balloon_compaction.c123
-rw-r--r--mm/bootmem.c4
-rw-r--r--mm/cma.c21
-rw-r--r--mm/compaction.c674
-rw-r--r--mm/debug.c237
-rw-r--r--mm/dmapool.c58
-rw-r--r--mm/filemap.c27
-rw-r--r--mm/gup.c358
-rw-r--r--mm/huge_memory.c35
-rw-r--r--mm/hugetlb.c14
-rw-r--r--mm/hugetlb_cgroup.c2
-rw-r--r--mm/internal.h26
-rw-r--r--mm/interval_tree.c2
-rw-r--r--mm/iov_iter.c14
-rw-r--r--mm/kmemcheck.c1
-rw-r--r--mm/ksm.c4
-rw-r--r--mm/memblock.c7
-rw-r--r--mm/memcontrol.c421
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c11
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempolicy.c134
-rw-r--r--mm/migrate.c21
-rw-r--r--mm/mlock.c6
-rw-r--r--mm/mmap.c84
-rw-r--r--mm/mmu_notifier.c5
-rw-r--r--mm/mremap.c5
-rw-r--r--mm/nobootmem.c2
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c6
-rw-r--r--mm/page-writeback.c10
-rw-r--r--mm/page_alloc.c355
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/percpu-km.c16
-rw-r--r--mm/percpu-vm.c184
-rw-r--r--mm/percpu.c524
-rw-r--r--mm/pgtable-generic.c2
-rw-r--r--mm/rmap.c14
-rw-r--r--mm/shmem.c8
-rw-r--r--mm/slab.c364
-rw-r--r--mm/slab.h57
-rw-r--r--mm/slab_common.c178
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c126
-rw-r--r--mm/swap.c30
-rw-r--r--mm/swap_state.c16
-rw-r--r--mm/util.c23
-rw-r--r--mm/vmalloc.c20
-rw-r--r--mm/vmscan.c112
-rw-r--r--mm/vmstat.c153
-rw-r--r--mm/zbud.c14
-rw-r--r--mm/zpool.c2
-rw-r--r--mm/zsmalloc.c47
56 files changed, 2793 insertions(+), 1795 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 886db2158538..1d1ae6b078fd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,6 +137,9 @@ config HAVE_MEMBLOCK_NODE_MAP
 config HAVE_MEMBLOCK_PHYS_MAP
 	boolean
 
+config HAVE_GENERIC_RCU_GUP
+	boolean
+
 config ARCH_DISCARD_MEMBLOCK
 	boolean
 
@@ -228,11 +231,16 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
 	boolean
 
 #
+# support for memory balloon
+config MEMORY_BALLOON
+	boolean
+
+#
 # support for memory balloon compaction
 config BALLOON_COMPACTION
 	bool "Allow for balloon memory compaction/migration"
 	def_bool y
-	depends on COMPACTION && VIRTIO_BALLOON
+	depends on COMPACTION && MEMORY_BALLOON
 	help
 	  Memory fragmentation introduced by ballooning might reduce
 	  significantly the number of 2MB contiguous memory blocks that can be
diff --git a/mm/Makefile b/mm/Makefile
index 2ad574d1d12d..8405eb0023a9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,9 +16,9 @@ obj-y := filemap.o mempool.o oom_kill.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
-			   compaction.o balloon_compaction.o vmacache.o \
+			   compaction.o vmacache.o \
 			   interval_tree.o list_lru.o workingset.o \
-			   iov_iter.o $(mmu-y)
+			   iov_iter.o debug.o $(mmu-y)
 
 obj-y += init-mm.o
 
@@ -68,3 +68,4 @@ obj-$(CONFIG_ZBUD) += zbud.o
 obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
+obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1706cbbdf5f0..12a992b62576 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -455,7 +455,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi_wb_init(&bdi->wb, bdi);
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
+		err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
 		if (err)
 			goto err;
 	}
@@ -470,7 +470,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->write_bandwidth = INIT_BW;
 	bdi->avg_write_bandwidth = INIT_BW;
 
-	err = fprop_local_init_percpu(&bdi->completions);
+	err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
 
 	if (err) {
 err:
@@ -631,7 +631,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
 	 * of sleeping on the congestion queue
 	 */
 	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
-	    !zone_is_reclaim_congested(zone)) {
+	    !test_bit(ZONE_CONGESTED, &zone->flags)) {
 		cond_resched();
 
 		/* In case we scheduled, work out time remaining */
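
The two mm/backing-dev.c hunks above only thread a gfp_t through the existing per-cpu initializers. Below is a minimal sketch of a caller under the new prototypes, assuming the 3.18-era signatures percpu_counter_init(fbc, amount, gfp) and fprop_local_init_percpu(pl, gfp); the helper name example_writeback_stats_init is made up for illustration.

#include <linux/percpu_counter.h>
#include <linux/flex_proportions.h>

/* Hypothetical helper, not part of this patch: callers in atomic context
 * would pass GFP_NOWAIT or GFP_ATOMIC instead of GFP_KERNEL. */
static int example_writeback_stats_init(struct percpu_counter *pc,
					struct fprop_local_percpu *fp)
{
	int err;

	err = percpu_counter_init(pc, 0, GFP_KERNEL);
	if (err)
		return err;

	err = fprop_local_init_percpu(fp, GFP_KERNEL);
	if (err)
		percpu_counter_destroy(pc);

	return err;
}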
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 6e45a5074bf0..b3cbe19f71b5 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -11,32 +11,6 @@
 #include <linux/balloon_compaction.h>
 
 /*
- * balloon_devinfo_alloc - allocates a balloon device information descriptor.
- * @balloon_dev_descriptor: pointer to reference the balloon device which
- *                          this struct balloon_dev_info will be servicing.
- *
- * Driver must call it to properly allocate and initialize an instance of
- * struct balloon_dev_info which will be used to reference a balloon device
- * as well as to keep track of the balloon device page list.
- */
-struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
-{
-	struct balloon_dev_info *b_dev_info;
-	b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
-	if (!b_dev_info)
-		return ERR_PTR(-ENOMEM);
-
-	b_dev_info->balloon_device = balloon_dev_descriptor;
-	b_dev_info->mapping = NULL;
-	b_dev_info->isolated_pages = 0;
-	spin_lock_init(&b_dev_info->pages_lock);
-	INIT_LIST_HEAD(&b_dev_info->pages);
-
-	return b_dev_info;
-}
-EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
-
-/*
  * balloon_page_enqueue - allocates a new page and inserts it into the balloon
  *			  page list.
  * @b_dev_info: balloon device decriptor where we will insert a new page to
@@ -61,7 +35,8 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
 	 */
 	BUG_ON(!trylock_page(page));
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
-	balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
+	balloon_page_insert(b_dev_info, page);
+	__count_vm_event(BALLOON_INFLATE);
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 	unlock_page(page);
 	return page;
@@ -93,18 +68,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
 		 * to be released by the balloon driver.
 		 */
 		if (trylock_page(page)) {
+			if (!PagePrivate(page)) {
+				/* raced with isolation */
+				unlock_page(page);
+				continue;
+			}
 			spin_lock_irqsave(&b_dev_info->pages_lock, flags);
-			/*
-			 * Raise the page refcount here to prevent any wrong
-			 * attempt to isolate this page, in case of coliding
-			 * with balloon_page_isolate() just after we release
-			 * the page lock.
-			 *
-			 * balloon_page_free() will take care of dropping
-			 * this extra refcount later.
-			 */
-			get_page(page);
 			balloon_page_delete(page);
+			__count_vm_event(BALLOON_DEFLATE);
 			spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 			unlock_page(page);
 			dequeued_page = true;
@@ -132,62 +103,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
 EXPORT_SYMBOL_GPL(balloon_page_dequeue);
 
 #ifdef CONFIG_BALLOON_COMPACTION
-/*
- * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
- * @b_dev_info: holds the balloon device information descriptor.
- * @a_ops: balloon_mapping address_space_operations descriptor.
- *
- * Driver must call it to properly allocate and initialize an instance of
- * struct address_space which will be used as the special page->mapping for
- * balloon device enlisted page instances.
- */
-struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
-		const struct address_space_operations *a_ops)
-{
-	struct address_space *mapping;
-
-	mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
-	if (!mapping)
-		return ERR_PTR(-ENOMEM);
-
-	/*
-	 * Give a clean 'zeroed' status to all elements of this special
-	 * balloon page->mapping struct address_space instance.
-	 */
-	address_space_init_once(mapping);
-
-	/*
-	 * Set mapping->flags appropriately, to allow balloon pages
-	 * ->mapping identification.
-	 */
-	mapping_set_balloon(mapping);
-	mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
-
-	/* balloon's page->mapping->a_ops callback descriptor */
-	mapping->a_ops = a_ops;
-
-	/*
-	 * Establish a pointer reference back to the balloon device descriptor
-	 * this particular page->mapping will be servicing.
-	 * This is used by compaction / migration procedures to identify and
-	 * access the balloon device pageset while isolating / migrating pages.
-	 *
-	 * As some balloon drivers can register multiple balloon devices
-	 * for a single guest, this also helps compaction / migration to
-	 * properly deal with multiple balloon pagesets, when required.
-	 */
-	mapping->private_data = b_dev_info;
-	b_dev_info->mapping = mapping;
-
-	return mapping;
-}
-EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
 
 static inline void __isolate_balloon_page(struct page *page)
 {
-	struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+	struct balloon_dev_info *b_dev_info = balloon_page_device(page);
 	unsigned long flags;
+
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+	ClearPagePrivate(page);
 	list_del(&page->lru);
 	b_dev_info->isolated_pages++;
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
@@ -195,20 +118,16 @@ static inline void __isolate_balloon_page(struct page *page)
 
 static inline void __putback_balloon_page(struct page *page)
 {
-	struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+	struct balloon_dev_info *b_dev_info = balloon_page_device(page);
 	unsigned long flags;
+
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+	SetPagePrivate(page);
 	list_add(&page->lru, &b_dev_info->pages);
 	b_dev_info->isolated_pages--;
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 }
 
-static inline int __migrate_balloon_page(struct address_space *mapping,
-		struct page *newpage, struct page *page, enum migrate_mode mode)
-{
-	return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
-}
-
 /* __isolate_lru_page() counterpart for a ballooned page */
 bool balloon_page_isolate(struct page *page)
 {
@@ -235,12 +154,11 @@ bool balloon_page_isolate(struct page *page)
 		 */
 		if (likely(trylock_page(page))) {
 			/*
-			 * A ballooned page, by default, has just one refcount.
+			 * A ballooned page, by default, has PagePrivate set.
 			 * Prevent concurrent compaction threads from isolating
-			 * an already isolated balloon page by refcount check.
+			 * an already isolated balloon page by clearing it.
 			 */
-			if (__is_movable_balloon_page(page) &&
-					page_count(page) == 2) {
+			if (balloon_page_movable(page)) {
 				__isolate_balloon_page(page);
 				unlock_page(page);
 				return true;
@@ -276,7 +194,7 @@ void balloon_page_putback(struct page *page)
 int balloon_page_migrate(struct page *newpage,
 			struct page *page, enum migrate_mode mode)
 {
-	struct address_space *mapping;
+	struct balloon_dev_info *balloon = balloon_page_device(page);
 	int rc = -EAGAIN;
 
 	/*
@@ -292,9 +210,8 @@ int balloon_page_migrate(struct page *newpage,
 		return rc;
 	}
 
-	mapping = page->mapping;
-	if (mapping)
-		rc = __migrate_balloon_page(mapping, newpage, page, mode);
+	if (balloon && balloon->migratepage)
+		rc = balloon->migratepage(balloon, newpage, page, mode);
 
 	unlock_page(newpage);
 	return rc;
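
With the special page->mapping machinery removed, a balloon driver is expected to embed a struct balloon_dev_info and hook up a migratepage callback directly. The following is a hedged sketch of the driver side under the reworked API: balloon_devinfo_init() comes from the include/linux/balloon_compaction.h half of this series (not shown in this diff), and the my_balloon/my_migratepage/my_probe names are hypothetical.

#include <linux/migrate.h>
#include <linux/balloon_compaction.h>

static struct balloon_dev_info my_balloon;	/* hypothetical driver state */

static int my_migratepage(struct balloon_dev_info *b_dev_info,
			  struct page *newpage, struct page *page,
			  enum migrate_mode mode)
{
	/* A real driver would ask the host to exchange the two pages here,
	 * insert newpage via balloon_page_insert() under pages_lock and
	 * release the old page; elided in this sketch. */
	return MIGRATEPAGE_SUCCESS;
}

static int my_probe(void)
{
	balloon_devinfo_init(&my_balloon);	/* pages list, lock, counters */
#ifdef CONFIG_BALLOON_COMPACTION
	my_balloon.migratepage = my_migratepage;
#endif
	/* inflate/deflate paths then use balloon_page_enqueue()/dequeue() */
	return 0;
}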
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 90bd3507b413..8a000cebb0d7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -16,9 +16,9 @@
 #include <linux/kmemleak.h>
 #include <linux/range.h>
 #include <linux/memblock.h>
+#include <linux/bug.h>
+#include <linux/io.h>
 
-#include <asm/bug.h>
-#include <asm/io.h>
 #include <asm/processor.h>
 
 #include "internal.h"
diff --git a/mm/cma.c b/mm/cma.c
index c17751c0dcaf..474c644a0dc6 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -32,6 +32,7 @@
 #include <linux/slab.h>
 #include <linux/log2.h>
 #include <linux/cma.h>
+#include <linux/highmem.h>
 
 struct cma {
 	unsigned long	base_pfn;
@@ -163,6 +164,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
 			bool fixed, struct cma **res_cma)
 {
 	struct cma *cma;
+	phys_addr_t memblock_end = memblock_end_of_DRAM();
+	phys_addr_t highmem_start = __pa(high_memory);
 	int ret = 0;
 
 	pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n",
@@ -196,6 +199,24 @@ int __init cma_declare_contiguous(phys_addr_t base,
 	if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
 		return -EINVAL;
 
+	/*
+	 * adjust limit to avoid crossing low/high memory boundary for
+	 * automatically allocated regions
+	 */
+	if (((limit == 0 || limit > memblock_end) &&
+	     (memblock_end - size < highmem_start &&
+	      memblock_end > highmem_start)) ||
+	    (!fixed && limit > highmem_start && limit - size < highmem_start)) {
+		limit = highmem_start;
+	}
+
+	if (fixed && base < highmem_start && base+size > highmem_start) {
+		ret = -EINVAL;
+		pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n",
+			(unsigned long)base, (unsigned long)highmem_start);
+		goto err;
+	}
+
 	/* Reserve memory */
 	if (base && fixed) {
 		if (memblock_is_region_reserved(base, size) ||
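
To make the new boundary handling concrete, here is a small standalone illustration with made-up numbers (800 MiB of RAM with lowmem ending at 768 MiB): an automatically placed region whose default limit would let it straddle highmem_start gets its limit pulled down to highmem_start, exactly as in the condition added above.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustration only; mirrors the cma_declare_contiguous() clamp above. */
int main(void)
{
	uint64_t memblock_end  = 800ULL << 20;	/* end of DRAM */
	uint64_t highmem_start = 768ULL << 20;	/* __pa(high_memory) */
	uint64_t size          = 64ULL << 20;	/* requested CMA size */
	uint64_t limit         = 0;		/* 0 = no limit given */
	bool fixed             = false;

	if (((limit == 0 || limit > memblock_end) &&
	     (memblock_end - size < highmem_start &&
	      memblock_end > highmem_start)) ||
	    (!fixed && limit > highmem_start && limit - size < highmem_start))
		limit = highmem_start;	/* region must stay in lowmem */

	printf("effective limit: %#llx\n", (unsigned long long)limit);
	return 0;
}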
diff --git a/mm/compaction.c b/mm/compaction.c
index 21bf292b642a..edba18aed173 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -67,6 +67,49 @@ static inline bool migrate_async_suitable(int migratetype)
 	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 }
 
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration of free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+				unsigned long end_pfn, struct zone *zone)
+{
+	struct page *start_page;
+	struct page *end_page;
+
+	/* end_pfn is one past the range we are checking */
+	end_pfn--;
+
+	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+		return NULL;
+
+	start_page = pfn_to_page(start_pfn);
+
+	if (page_zone(start_page) != zone)
+		return NULL;
+
+	end_page = pfn_to_page(end_pfn);
+
+	/* This gives a shorter code than deriving page_zone(end_page) */
+	if (page_zone_id(start_page) != page_zone_id(end_page))
+		return NULL;
+
+	return start_page;
+}
+
 #ifdef CONFIG_COMPACTION
 /* Returns true if the pageblock should be scanned for pages to isolate. */
 static inline bool isolation_suitable(struct compact_control *cc,
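
Both scanners later in this patch call the helper above once per pageblock; a condensed sketch of that calling pattern (modelled on the isolate_freepages_range() and isolate_migratepages_range() loops further down, with the actual isolation work elided):

/* Condensed illustration of the block-by-block scan shape; not a
 * verbatim copy of either scanner. */
static void scan_zone_blocks(struct compact_control *cc,
			     unsigned long pfn, unsigned long end_pfn)
{
	unsigned long block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);

	for (; pfn < end_pfn; pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {
		block_end_pfn = min(block_end_pfn, end_pfn);

		/* skip blocks with holes or pages from another zone/node */
		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
			continue;

		/* ... isolate pages within [pfn, block_end_pfn) ... */
	}
}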
@@ -132,7 +175,7 @@ void reset_isolation_suitable(pg_data_t *pgdat)
  */
 static void update_pageblock_skip(struct compact_control *cc,
 			struct page *page, unsigned long nr_isolated,
-			bool set_unsuitable, bool migrate_scanner)
+			bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
 	unsigned long pfn;
@@ -146,12 +189,7 @@ static void update_pageblock_skip(struct compact_control *cc,
 	if (nr_isolated)
 		return;
 
-	/*
-	 * Only skip pageblocks when all forms of compaction will be known to
-	 * fail in the near future.
-	 */
-	if (set_unsuitable)
-		set_pageblock_skip(page);
+	set_pageblock_skip(page);
 
 	pfn = page_to_pfn(page);
 
@@ -180,52 +218,77 @@ static inline bool isolation_suitable(struct compact_control *cc,
180 218
181static void update_pageblock_skip(struct compact_control *cc, 219static void update_pageblock_skip(struct compact_control *cc,
182 struct page *page, unsigned long nr_isolated, 220 struct page *page, unsigned long nr_isolated,
183 bool set_unsuitable, bool migrate_scanner) 221 bool migrate_scanner)
184{ 222{
185} 223}
186#endif /* CONFIG_COMPACTION */ 224#endif /* CONFIG_COMPACTION */
187 225
188static inline bool should_release_lock(spinlock_t *lock) 226/*
227 * Compaction requires the taking of some coarse locks that are potentially
228 * very heavily contended. For async compaction, back out if the lock cannot
229 * be taken immediately. For sync compaction, spin on the lock if needed.
230 *
231 * Returns true if the lock is held
232 * Returns false if the lock is not held and compaction should abort
233 */
234static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
235 struct compact_control *cc)
189{ 236{
190 return need_resched() || spin_is_contended(lock); 237 if (cc->mode == MIGRATE_ASYNC) {
238 if (!spin_trylock_irqsave(lock, *flags)) {
239 cc->contended = COMPACT_CONTENDED_LOCK;
240 return false;
241 }
242 } else {
243 spin_lock_irqsave(lock, *flags);
244 }
245
246 return true;
191} 247}
192 248
193/* 249/*
194 * Compaction requires the taking of some coarse locks that are potentially 250 * Compaction requires the taking of some coarse locks that are potentially
195 * very heavily contended. Check if the process needs to be scheduled or 251 * very heavily contended. The lock should be periodically unlocked to avoid
196 * if the lock is contended. For async compaction, back out in the event 252 * having disabled IRQs for a long time, even when there is nobody waiting on
197 * if contention is severe. For sync compaction, schedule. 253 * the lock. It might also be that allowing the IRQs will result in
254 * need_resched() becoming true. If scheduling is needed, async compaction
255 * aborts. Sync compaction schedules.
256 * Either compaction type will also abort if a fatal signal is pending.
257 * In either case if the lock was locked, it is dropped and not regained.
198 * 258 *
199 * Returns true if the lock is held. 259 * Returns true if compaction should abort due to fatal signal pending, or
200 * Returns false if the lock is released and compaction should abort 260 * async compaction due to need_resched()
261 * Returns false when compaction can continue (sync compaction might have
262 * scheduled)
201 */ 263 */
202static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, 264static bool compact_unlock_should_abort(spinlock_t *lock,
203 bool locked, struct compact_control *cc) 265 unsigned long flags, bool *locked, struct compact_control *cc)
204{ 266{
205 if (should_release_lock(lock)) { 267 if (*locked) {
206 if (locked) { 268 spin_unlock_irqrestore(lock, flags);
207 spin_unlock_irqrestore(lock, *flags); 269 *locked = false;
208 locked = false; 270 }
209 } 271
272 if (fatal_signal_pending(current)) {
273 cc->contended = COMPACT_CONTENDED_SCHED;
274 return true;
275 }
210 276
211 /* async aborts if taking too long or contended */ 277 if (need_resched()) {
212 if (cc->mode == MIGRATE_ASYNC) { 278 if (cc->mode == MIGRATE_ASYNC) {
213 cc->contended = true; 279 cc->contended = COMPACT_CONTENDED_SCHED;
214 return false; 280 return true;
215 } 281 }
216
217 cond_resched(); 282 cond_resched();
218 } 283 }
219 284
220 if (!locked) 285 return false;
221 spin_lock_irqsave(lock, *flags);
222 return true;
223} 286}
224 287
225/* 288/*
226 * Aside from avoiding lock contention, compaction also periodically checks 289 * Aside from avoiding lock contention, compaction also periodically checks
227 * need_resched() and either schedules in sync compaction or aborts async 290 * need_resched() and either schedules in sync compaction or aborts async
228 * compaction. This is similar to what compact_checklock_irqsave() does, but 291 * compaction. This is similar to what compact_unlock_should_abort() does, but
229 * is used where no lock is concerned. 292 * is used where no lock is concerned.
230 * 293 *
231 * Returns false when no scheduling was needed, or sync compaction scheduled. 294 * Returns false when no scheduling was needed, or sync compaction scheduled.
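
The hunk above replaces the old should_release_lock()/compact_checklock_irqsave() pair with two primitives: compact_trylock_irqsave(), which takes a lock lazily (async compaction backs off instead of spinning), and compact_unlock_should_abort(), which periodically drops a held lock and reports whether the scan should stop. A hedged sketch of how a scanner loop combines them (modelled on the isolate_freepages_block() and isolate_migratepages_block() changes later in this diff; the per-page checks and isolation work are elided):

/* Illustration of the locking pattern; not a verbatim copy of either
 * scanner. */
static unsigned long scan_with_periodic_unlock(struct compact_control *cc,
				unsigned long pfn, unsigned long end_pfn,
				spinlock_t *lock)
{
	unsigned long flags = 0;
	bool locked = false;

	for (; pfn < end_pfn; pfn++) {
		/* every SWAP_CLUSTER_MAX pfns: drop the lock if held and
		 * abort on fatal signal or (async) need_resched() */
		if (!(pfn % SWAP_CLUSTER_MAX) &&
		    compact_unlock_should_abort(lock, flags, &locked, cc))
			break;

		/* cheap lockless checks would go here ... */

		/* take the lock only once a candidate page is found */
		if (!locked) {
			locked = compact_trylock_irqsave(lock, &flags, cc);
			if (!locked)
				break;
			/* ... recheck the candidate under the lock ... */
		}

		/* ... isolate the page ... */
	}

	if (locked)
		spin_unlock_irqrestore(lock, flags);

	return pfn;
}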
@@ -236,7 +299,7 @@ static inline bool compact_should_abort(struct compact_control *cc)
236 /* async compaction aborts if contended */ 299 /* async compaction aborts if contended */
237 if (need_resched()) { 300 if (need_resched()) {
238 if (cc->mode == MIGRATE_ASYNC) { 301 if (cc->mode == MIGRATE_ASYNC) {
239 cc->contended = true; 302 cc->contended = COMPACT_CONTENDED_SCHED;
240 return true; 303 return true;
241 } 304 }
242 305
@@ -250,8 +313,15 @@ static inline bool compact_should_abort(struct compact_control *cc)
250static bool suitable_migration_target(struct page *page) 313static bool suitable_migration_target(struct page *page)
251{ 314{
252 /* If the page is a large free page, then disallow migration */ 315 /* If the page is a large free page, then disallow migration */
253 if (PageBuddy(page) && page_order(page) >= pageblock_order) 316 if (PageBuddy(page)) {
254 return false; 317 /*
318 * We are checking page_order without zone->lock taken. But
319 * the only small danger is that we skip a potentially suitable
320 * pageblock, so it's not worth to check order for valid range.
321 */
322 if (page_order_unsafe(page) >= pageblock_order)
323 return false;
324 }
255 325
256 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ 326 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
257 if (migrate_async_suitable(get_pageblock_migratetype(page))) 327 if (migrate_async_suitable(get_pageblock_migratetype(page)))
@@ -267,16 +337,16 @@ static bool suitable_migration_target(struct page *page)
267 * (even though it may still end up isolating some pages). 337 * (even though it may still end up isolating some pages).
268 */ 338 */
269static unsigned long isolate_freepages_block(struct compact_control *cc, 339static unsigned long isolate_freepages_block(struct compact_control *cc,
270 unsigned long blockpfn, 340 unsigned long *start_pfn,
271 unsigned long end_pfn, 341 unsigned long end_pfn,
272 struct list_head *freelist, 342 struct list_head *freelist,
273 bool strict) 343 bool strict)
274{ 344{
275 int nr_scanned = 0, total_isolated = 0; 345 int nr_scanned = 0, total_isolated = 0;
276 struct page *cursor, *valid_page = NULL; 346 struct page *cursor, *valid_page = NULL;
277 unsigned long flags; 347 unsigned long flags = 0;
278 bool locked = false; 348 bool locked = false;
279 bool checked_pageblock = false; 349 unsigned long blockpfn = *start_pfn;
280 350
281 cursor = pfn_to_page(blockpfn); 351 cursor = pfn_to_page(blockpfn);
282 352
@@ -285,6 +355,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
285 int isolated, i; 355 int isolated, i;
286 struct page *page = cursor; 356 struct page *page = cursor;
287 357
358 /*
359 * Periodically drop the lock (if held) regardless of its
360 * contention, to give chance to IRQs. Abort if fatal signal
361 * pending or async compaction detects need_resched()
362 */
363 if (!(blockpfn % SWAP_CLUSTER_MAX)
364 && compact_unlock_should_abort(&cc->zone->lock, flags,
365 &locked, cc))
366 break;
367
288 nr_scanned++; 368 nr_scanned++;
289 if (!pfn_valid_within(blockpfn)) 369 if (!pfn_valid_within(blockpfn))
290 goto isolate_fail; 370 goto isolate_fail;
@@ -295,33 +375,30 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
295 goto isolate_fail; 375 goto isolate_fail;
296 376
297 /* 377 /*
298 * The zone lock must be held to isolate freepages. 378 * If we already hold the lock, we can skip some rechecking.
299 * Unfortunately this is a very coarse lock and can be 379 * Note that if we hold the lock now, checked_pageblock was
300 * heavily contended if there are parallel allocations 380 * already set in some previous iteration (or strict is true),
301 * or parallel compactions. For async compaction do not 381 * so it is correct to skip the suitable migration target
302 * spin on the lock and we acquire the lock as late as 382 * recheck as well.
303 * possible.
304 */ 383 */
305 locked = compact_checklock_irqsave(&cc->zone->lock, &flags, 384 if (!locked) {
306 locked, cc);
307 if (!locked)
308 break;
309
310 /* Recheck this is a suitable migration target under lock */
311 if (!strict && !checked_pageblock) {
312 /* 385 /*
313 * We need to check suitability of pageblock only once 386 * The zone lock must be held to isolate freepages.
314 * and this isolate_freepages_block() is called with 387 * Unfortunately this is a very coarse lock and can be
315 * pageblock range, so just check once is sufficient. 388 * heavily contended if there are parallel allocations
389 * or parallel compactions. For async compaction do not
390 * spin on the lock and we acquire the lock as late as
391 * possible.
316 */ 392 */
317 checked_pageblock = true; 393 locked = compact_trylock_irqsave(&cc->zone->lock,
318 if (!suitable_migration_target(page)) 394 &flags, cc);
395 if (!locked)
319 break; 396 break;
320 }
321 397
322 /* Recheck this is a buddy page under lock */ 398 /* Recheck this is a buddy page under lock */
323 if (!PageBuddy(page)) 399 if (!PageBuddy(page))
324 goto isolate_fail; 400 goto isolate_fail;
401 }
325 402
326 /* Found a free page, break it into order-0 pages */ 403 /* Found a free page, break it into order-0 pages */
327 isolated = split_free_page(page); 404 isolated = split_free_page(page);
@@ -346,6 +423,9 @@ isolate_fail:
346 423
347 } 424 }
348 425
426 /* Record how far we have got within the block */
427 *start_pfn = blockpfn;
428
349 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); 429 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
350 430
351 /* 431 /*
@@ -361,8 +441,7 @@ isolate_fail:
361 441
362 /* Update the pageblock-skip if the whole pageblock was scanned */ 442 /* Update the pageblock-skip if the whole pageblock was scanned */
363 if (blockpfn == end_pfn) 443 if (blockpfn == end_pfn)
364 update_pageblock_skip(cc, valid_page, total_isolated, true, 444 update_pageblock_skip(cc, valid_page, total_isolated, false);
365 false);
366 445
367 count_compact_events(COMPACTFREE_SCANNED, nr_scanned); 446 count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
368 if (total_isolated) 447 if (total_isolated)
@@ -390,19 +469,21 @@ isolate_freepages_range(struct compact_control *cc,
390 unsigned long isolated, pfn, block_end_pfn; 469 unsigned long isolated, pfn, block_end_pfn;
391 LIST_HEAD(freelist); 470 LIST_HEAD(freelist);
392 471
393 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { 472 pfn = start_pfn;
394 if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) 473 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
395 break; 474
475 for (; pfn < end_pfn; pfn += isolated,
476 block_end_pfn += pageblock_nr_pages) {
477 /* Protect pfn from changing by isolate_freepages_block */
478 unsigned long isolate_start_pfn = pfn;
396 479
397 /*
398 * On subsequent iterations ALIGN() is actually not needed,
399 * but we keep it that we not to complicate the code.
400 */
401 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
402 block_end_pfn = min(block_end_pfn, end_pfn); 480 block_end_pfn = min(block_end_pfn, end_pfn);
403 481
404 isolated = isolate_freepages_block(cc, pfn, block_end_pfn, 482 if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
405 &freelist, true); 483 break;
484
485 isolated = isolate_freepages_block(cc, &isolate_start_pfn,
486 block_end_pfn, &freelist, true);
406 487
407 /* 488 /*
408 * In strict mode, isolate_freepages_block() returns 0 if 489 * In strict mode, isolate_freepages_block() returns 0 if
@@ -433,22 +514,19 @@ isolate_freepages_range(struct compact_control *cc,
433} 514}
434 515
435/* Update the number of anon and file isolated pages in the zone */ 516/* Update the number of anon and file isolated pages in the zone */
436static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) 517static void acct_isolated(struct zone *zone, struct compact_control *cc)
437{ 518{
438 struct page *page; 519 struct page *page;
439 unsigned int count[2] = { 0, }; 520 unsigned int count[2] = { 0, };
440 521
522 if (list_empty(&cc->migratepages))
523 return;
524
441 list_for_each_entry(page, &cc->migratepages, lru) 525 list_for_each_entry(page, &cc->migratepages, lru)
442 count[!!page_is_file_cache(page)]++; 526 count[!!page_is_file_cache(page)]++;
443 527
444 /* If locked we can use the interrupt unsafe versions */ 528 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
445 if (locked) { 529 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
446 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
447 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
448 } else {
449 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
450 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
451 }
452} 530}
453 531
454/* Similar to reclaim, but different enough that they don't share logic */ 532/* Similar to reclaim, but different enough that they don't share logic */
@@ -467,40 +545,34 @@ static bool too_many_isolated(struct zone *zone)
467} 545}
468 546
469/** 547/**
470 * isolate_migratepages_range() - isolate all migrate-able pages in range. 548 * isolate_migratepages_block() - isolate all migrate-able pages within
471 * @zone: Zone pages are in. 549 * a single pageblock
472 * @cc: Compaction control structure. 550 * @cc: Compaction control structure.
473 * @low_pfn: The first PFN of the range. 551 * @low_pfn: The first PFN to isolate
474 * @end_pfn: The one-past-the-last PFN of the range. 552 * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
475 * @unevictable: true if it allows to isolate unevictable pages 553 * @isolate_mode: Isolation mode to be used.
476 * 554 *
477 * Isolate all pages that can be migrated from the range specified by 555 * Isolate all pages that can be migrated from the range specified by
478 * [low_pfn, end_pfn). Returns zero if there is a fatal signal 556 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
479 * pending), otherwise PFN of the first page that was not scanned 557 * Returns zero if there is a fatal signal pending, otherwise PFN of the
480 * (which may be both less, equal to or more then end_pfn). 558 * first page that was not scanned (which may be both less, equal to or more
559 * than end_pfn).
481 * 560 *
482 * Assumes that cc->migratepages is empty and cc->nr_migratepages is 561 * The pages are isolated on cc->migratepages list (not required to be empty),
483 * zero. 562 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
484 * 563 * is neither read nor updated.
485 * Apart from cc->migratepages and cc->nr_migratetypes this function
486 * does not modify any cc's fields, in particular it does not modify
487 * (or read for that matter) cc->migrate_pfn.
488 */ 564 */
489unsigned long 565static unsigned long
490isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 566isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
491 unsigned long low_pfn, unsigned long end_pfn, bool unevictable) 567 unsigned long end_pfn, isolate_mode_t isolate_mode)
492{ 568{
493 unsigned long last_pageblock_nr = 0, pageblock_nr; 569 struct zone *zone = cc->zone;
494 unsigned long nr_scanned = 0, nr_isolated = 0; 570 unsigned long nr_scanned = 0, nr_isolated = 0;
495 struct list_head *migratelist = &cc->migratepages; 571 struct list_head *migratelist = &cc->migratepages;
496 struct lruvec *lruvec; 572 struct lruvec *lruvec;
497 unsigned long flags; 573 unsigned long flags = 0;
498 bool locked = false; 574 bool locked = false;
499 struct page *page = NULL, *valid_page = NULL; 575 struct page *page = NULL, *valid_page = NULL;
500 bool set_unsuitable = true;
501 const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
502 ISOLATE_ASYNC_MIGRATE : 0) |
503 (unevictable ? ISOLATE_UNEVICTABLE : 0);
504 576
505 /* 577 /*
506 * Ensure that there are not too many pages isolated from the LRU 578 * Ensure that there are not too many pages isolated from the LRU
@@ -523,72 +595,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
523 595
524 /* Time to isolate some pages for migration */ 596 /* Time to isolate some pages for migration */
525 for (; low_pfn < end_pfn; low_pfn++) { 597 for (; low_pfn < end_pfn; low_pfn++) {
526 /* give a chance to irqs before checking need_resched() */
527 if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
528 if (should_release_lock(&zone->lru_lock)) {
529 spin_unlock_irqrestore(&zone->lru_lock, flags);
530 locked = false;
531 }
532 }
533
534 /* 598 /*
535 * migrate_pfn does not necessarily start aligned to a 599 * Periodically drop the lock (if held) regardless of its
536 * pageblock. Ensure that pfn_valid is called when moving 600 * contention, to give chance to IRQs. Abort async compaction
537 * into a new MAX_ORDER_NR_PAGES range in case of large 601 * if contended.
538 * memory holes within the zone
539 */ 602 */
540 if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { 603 if (!(low_pfn % SWAP_CLUSTER_MAX)
541 if (!pfn_valid(low_pfn)) { 604 && compact_unlock_should_abort(&zone->lru_lock, flags,
542 low_pfn += MAX_ORDER_NR_PAGES - 1; 605 &locked, cc))
543 continue; 606 break;
544 }
545 }
546 607
547 if (!pfn_valid_within(low_pfn)) 608 if (!pfn_valid_within(low_pfn))
548 continue; 609 continue;
549 nr_scanned++; 610 nr_scanned++;
550 611
551 /*
552 * Get the page and ensure the page is within the same zone.
553 * See the comment in isolate_freepages about overlapping
554 * nodes. It is deliberate that the new zone lock is not taken
555 * as memory compaction should not move pages between nodes.
556 */
557 page = pfn_to_page(low_pfn); 612 page = pfn_to_page(low_pfn);
558 if (page_zone(page) != zone)
559 continue;
560 613
561 if (!valid_page) 614 if (!valid_page)
562 valid_page = page; 615 valid_page = page;
563 616
564 /* If isolation recently failed, do not retry */ 617 /*
565 pageblock_nr = low_pfn >> pageblock_order; 618 * Skip if free. We read page order here without zone lock
566 if (last_pageblock_nr != pageblock_nr) { 619 * which is generally unsafe, but the race window is small and
567 int mt; 620 * the worst thing that can happen is that we skip some
568 621 * potential isolation targets.
569 last_pageblock_nr = pageblock_nr; 622 */
570 if (!isolation_suitable(cc, page)) 623 if (PageBuddy(page)) {
571 goto next_pageblock; 624 unsigned long freepage_order = page_order_unsafe(page);
572 625
573 /* 626 /*
574 * For async migration, also only scan in MOVABLE 627 * Without lock, we cannot be sure that what we got is
575 * blocks. Async migration is optimistic to see if 628 * a valid page order. Consider only values in the
576 * the minimum amount of work satisfies the allocation 629 * valid order range to prevent low_pfn overflow.
577 */ 630 */
578 mt = get_pageblock_migratetype(page); 631 if (freepage_order > 0 && freepage_order < MAX_ORDER)
579 if (cc->mode == MIGRATE_ASYNC && 632 low_pfn += (1UL << freepage_order) - 1;
580 !migrate_async_suitable(mt)) {
581 set_unsuitable = false;
582 goto next_pageblock;
583 }
584 }
585
586 /*
587 * Skip if free. page_order cannot be used without zone->lock
588 * as nothing prevents parallel allocations or buddy merging.
589 */
590 if (PageBuddy(page))
591 continue; 633 continue;
634 }
592 635
593 /* 636 /*
594 * Check may be lockless but that's ok as we recheck later. 637 * Check may be lockless but that's ok as we recheck later.
@@ -597,7 +640,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
597 */ 640 */
598 if (!PageLRU(page)) { 641 if (!PageLRU(page)) {
599 if (unlikely(balloon_page_movable(page))) { 642 if (unlikely(balloon_page_movable(page))) {
600 if (locked && balloon_page_isolate(page)) { 643 if (balloon_page_isolate(page)) {
601 /* Successfully isolated */ 644 /* Successfully isolated */
602 goto isolate_success; 645 goto isolate_success;
603 } 646 }
@@ -617,8 +660,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
617 */ 660 */
618 if (PageTransHuge(page)) { 661 if (PageTransHuge(page)) {
619 if (!locked) 662 if (!locked)
620 goto next_pageblock; 663 low_pfn = ALIGN(low_pfn + 1,
621 low_pfn += (1 << compound_order(page)) - 1; 664 pageblock_nr_pages) - 1;
665 else
666 low_pfn += (1 << compound_order(page)) - 1;
667
622 continue; 668 continue;
623 } 669 }
624 670
@@ -631,24 +677,26 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
631 page_count(page) > page_mapcount(page)) 677 page_count(page) > page_mapcount(page))
632 continue; 678 continue;
633 679
634 /* Check if it is ok to still hold the lock */ 680 /* If we already hold the lock, we can skip some rechecking */
635 locked = compact_checklock_irqsave(&zone->lru_lock, &flags, 681 if (!locked) {
636 locked, cc); 682 locked = compact_trylock_irqsave(&zone->lru_lock,
637 if (!locked || fatal_signal_pending(current)) 683 &flags, cc);
638 break; 684 if (!locked)
685 break;
639 686
640 /* Recheck PageLRU and PageTransHuge under lock */ 687 /* Recheck PageLRU and PageTransHuge under lock */
641 if (!PageLRU(page)) 688 if (!PageLRU(page))
642 continue; 689 continue;
643 if (PageTransHuge(page)) { 690 if (PageTransHuge(page)) {
644 low_pfn += (1 << compound_order(page)) - 1; 691 low_pfn += (1 << compound_order(page)) - 1;
645 continue; 692 continue;
693 }
646 } 694 }
647 695
648 lruvec = mem_cgroup_page_lruvec(page, zone); 696 lruvec = mem_cgroup_page_lruvec(page, zone);
649 697
650 /* Try isolate the page */ 698 /* Try isolate the page */
651 if (__isolate_lru_page(page, mode) != 0) 699 if (__isolate_lru_page(page, isolate_mode) != 0)
652 continue; 700 continue;
653 701
654 VM_BUG_ON_PAGE(PageTransCompound(page), page); 702 VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -667,14 +715,14 @@ isolate_success:
667 ++low_pfn; 715 ++low_pfn;
668 break; 716 break;
669 } 717 }
670
671 continue;
672
673next_pageblock:
674 low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
675 } 718 }
676 719
677 acct_isolated(zone, locked, cc); 720 /*
721 * The PageBuddy() check could have potentially brought us outside
722 * the range to be scanned.
723 */
724 if (unlikely(low_pfn > end_pfn))
725 low_pfn = end_pfn;
678 726
679 if (locked) 727 if (locked)
680 spin_unlock_irqrestore(&zone->lru_lock, flags); 728 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -684,8 +732,7 @@ next_pageblock:
684 * if the whole pageblock was scanned without isolating any page. 732 * if the whole pageblock was scanned without isolating any page.
685 */ 733 */
686 if (low_pfn == end_pfn) 734 if (low_pfn == end_pfn)
687 update_pageblock_skip(cc, valid_page, nr_isolated, 735 update_pageblock_skip(cc, valid_page, nr_isolated, true);
688 set_unsuitable, true);
689 736
690 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 737 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
691 738
@@ -696,17 +743,65 @@ next_pageblock:
696 return low_pfn; 743 return low_pfn;
697} 744}
698 745
746/**
747 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
748 * @cc: Compaction control structure.
749 * @start_pfn: The first PFN to start isolating.
750 * @end_pfn: The one-past-last PFN.
751 *
752 * Returns zero if isolation fails fatally due to e.g. pending signal.
753 * Otherwise, function returns one-past-the-last PFN of isolated page
754 * (which may be greater than end_pfn if end fell in a middle of a THP page).
755 */
756unsigned long
757isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
758 unsigned long end_pfn)
759{
760 unsigned long pfn, block_end_pfn;
761
762 /* Scan block by block. First and last block may be incomplete */
763 pfn = start_pfn;
764 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
765
766 for (; pfn < end_pfn; pfn = block_end_pfn,
767 block_end_pfn += pageblock_nr_pages) {
768
769 block_end_pfn = min(block_end_pfn, end_pfn);
770
771 if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
772 continue;
773
774 pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
775 ISOLATE_UNEVICTABLE);
776
777 /*
778 * In case of fatal failure, release everything that might
779 * have been isolated in the previous iteration, and signal
780 * the failure back to caller.
781 */
782 if (!pfn) {
783 putback_movable_pages(&cc->migratepages);
784 cc->nr_migratepages = 0;
785 break;
786 }
787 }
788 acct_isolated(cc->zone, cc);
789
790 return pfn;
791}
792
699#endif /* CONFIG_COMPACTION || CONFIG_CMA */ 793#endif /* CONFIG_COMPACTION || CONFIG_CMA */
700#ifdef CONFIG_COMPACTION 794#ifdef CONFIG_COMPACTION
701/* 795/*
702 * Based on information in the current compact_control, find blocks 796 * Based on information in the current compact_control, find blocks
703 * suitable for isolating free pages from and then isolate them. 797 * suitable for isolating free pages from and then isolate them.
704 */ 798 */
705static void isolate_freepages(struct zone *zone, 799static void isolate_freepages(struct compact_control *cc)
706 struct compact_control *cc)
707{ 800{
801 struct zone *zone = cc->zone;
708 struct page *page; 802 struct page *page;
709 unsigned long block_start_pfn; /* start of current pageblock */ 803 unsigned long block_start_pfn; /* start of current pageblock */
804 unsigned long isolate_start_pfn; /* exact pfn we start at */
710 unsigned long block_end_pfn; /* end of current pageblock */ 805 unsigned long block_end_pfn; /* end of current pageblock */
711 unsigned long low_pfn; /* lowest pfn scanner is able to scan */ 806 unsigned long low_pfn; /* lowest pfn scanner is able to scan */
712 int nr_freepages = cc->nr_freepages; 807 int nr_freepages = cc->nr_freepages;
@@ -715,14 +810,15 @@ static void isolate_freepages(struct zone *zone,
715 /* 810 /*
716 * Initialise the free scanner. The starting point is where we last 811 * Initialise the free scanner. The starting point is where we last
717 * successfully isolated from, zone-cached value, or the end of the 812 * successfully isolated from, zone-cached value, or the end of the
718 * zone when isolating for the first time. We need this aligned to 813 * zone when isolating for the first time. For looping we also need
719 * the pageblock boundary, because we do 814 * this pfn aligned down to the pageblock boundary, because we do
720 * block_start_pfn -= pageblock_nr_pages in the for loop. 815 * block_start_pfn -= pageblock_nr_pages in the for loop.
721 * For ending point, take care when isolating in last pageblock of a 816 * For ending point, take care when isolating in last pageblock of a
722 * a zone which ends in the middle of a pageblock. 817 * a zone which ends in the middle of a pageblock.
723 * The low boundary is the end of the pageblock the migration scanner 818 * The low boundary is the end of the pageblock the migration scanner
724 * is using. 819 * is using.
725 */ 820 */
821 isolate_start_pfn = cc->free_pfn;
726 block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); 822 block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
727 block_end_pfn = min(block_start_pfn + pageblock_nr_pages, 823 block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
728 zone_end_pfn(zone)); 824 zone_end_pfn(zone));
@@ -735,7 +831,8 @@ static void isolate_freepages(struct zone *zone,
735 */ 831 */
736 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; 832 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
737 block_end_pfn = block_start_pfn, 833 block_end_pfn = block_start_pfn,
738 block_start_pfn -= pageblock_nr_pages) { 834 block_start_pfn -= pageblock_nr_pages,
835 isolate_start_pfn = block_start_pfn) {
739 unsigned long isolated; 836 unsigned long isolated;
740 837
741 /* 838 /*
@@ -747,18 +844,9 @@ static void isolate_freepages(struct zone *zone,
747 && compact_should_abort(cc)) 844 && compact_should_abort(cc))
748 break; 845 break;
749 846
750 if (!pfn_valid(block_start_pfn)) 847 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
751 continue; 848 zone);
752 849 if (!page)
753 /*
754 * Check for overlapping nodes/zones. It's possible on some
755 * configurations to have a setup like
756 * node0 node1 node0
757 * i.e. it's possible that all pages within a zones range of
758 * pages do not belong to a single zone.
759 */
760 page = pfn_to_page(block_start_pfn);
761 if (page_zone(page) != zone)
762 continue; 850 continue;
763 851
764 /* Check the block is suitable for migration */ 852 /* Check the block is suitable for migration */
@@ -769,13 +857,25 @@ static void isolate_freepages(struct zone *zone,
769 if (!isolation_suitable(cc, page)) 857 if (!isolation_suitable(cc, page))
770 continue; 858 continue;
771 859
772 /* Found a block suitable for isolating free pages from */ 860 /* Found a block suitable for isolating free pages from. */
773 cc->free_pfn = block_start_pfn; 861 isolated = isolate_freepages_block(cc, &isolate_start_pfn,
774 isolated = isolate_freepages_block(cc, block_start_pfn,
775 block_end_pfn, freelist, false); 862 block_end_pfn, freelist, false);
776 nr_freepages += isolated; 863 nr_freepages += isolated;
777 864
778 /* 865 /*
866 * Remember where the free scanner should restart next time,
867 * which is where isolate_freepages_block() left off.
868 * But if it scanned the whole pageblock, isolate_start_pfn
869 * now points at block_end_pfn, which is the start of the next
870 * pageblock.
871 * In that case we will however want to restart at the start
872 * of the previous pageblock.
873 */
874 cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
875 isolate_start_pfn :
876 block_start_pfn - pageblock_nr_pages;
877
878 /*
779 * Set a flag that we successfully isolated in this pageblock. 879 * Set a flag that we successfully isolated in this pageblock.
780 * In the next loop iteration, zone->compact_cached_free_pfn 880 * In the next loop iteration, zone->compact_cached_free_pfn
781 * will not be updated and thus it will effectively contain the 881 * will not be updated and thus it will effectively contain the
@@ -822,7 +922,7 @@ static struct page *compaction_alloc(struct page *migratepage,
822 */ 922 */
823 if (list_empty(&cc->freepages)) { 923 if (list_empty(&cc->freepages)) {
824 if (!cc->contended) 924 if (!cc->contended)
825 isolate_freepages(cc->zone, cc); 925 isolate_freepages(cc);
826 926
827 if (list_empty(&cc->freepages)) 927 if (list_empty(&cc->freepages))
828 return NULL; 928 return NULL;
@@ -856,38 +956,84 @@ typedef enum {
856} isolate_migrate_t; 956} isolate_migrate_t;
857 957
858/* 958/*
859 * Isolate all pages that can be migrated from the block pointed to by 959 * Isolate all pages that can be migrated from the first suitable block,
860 * the migrate scanner within compact_control. 960 * starting at the block pointed to by the migrate scanner pfn within
961 * compact_control.
861 */ 962 */
862static isolate_migrate_t isolate_migratepages(struct zone *zone, 963static isolate_migrate_t isolate_migratepages(struct zone *zone,
863 struct compact_control *cc) 964 struct compact_control *cc)
864{ 965{
865 unsigned long low_pfn, end_pfn; 966 unsigned long low_pfn, end_pfn;
967 struct page *page;
968 const isolate_mode_t isolate_mode =
969 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
866 970
867 /* Do not scan outside zone boundaries */ 971 /*
868 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 972 * Start at where we last stopped, or beginning of the zone as
973 * initialized by compact_zone()
974 */
975 low_pfn = cc->migrate_pfn;
869 976
870 /* Only scan within a pageblock boundary */ 977 /* Only scan within a pageblock boundary */
871 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); 978 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
872 979
873 /* Do not cross the free scanner or scan within a memory hole */ 980 /*
874 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 981 * Iterate over whole pageblocks until we find the first suitable.
875 cc->migrate_pfn = end_pfn; 982 * Do not cross the free scanner.
876 return ISOLATE_NONE; 983 */
877 } 984 for (; end_pfn <= cc->free_pfn;
985 low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
878 986
879 /* Perform the isolation */ 987 /*
880 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); 988 * This can potentially iterate a massively long zone with
881 if (!low_pfn || cc->contended) 989 * many pageblocks unsuitable, so periodically check if we
882 return ISOLATE_ABORT; 990 * need to schedule, or even abort async compaction.
991 */
992 if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
993 && compact_should_abort(cc))
994 break;
995
996 page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
997 if (!page)
998 continue;
999
1000 /* If isolation recently failed, do not retry */
1001 if (!isolation_suitable(cc, page))
1002 continue;
1003
1004 /*
1005 * For async compaction, also only scan in MOVABLE blocks.
1006 * Async compaction is optimistic to see if the minimum amount
1007 * of work satisfies the allocation.
1008 */
1009 if (cc->mode == MIGRATE_ASYNC &&
1010 !migrate_async_suitable(get_pageblock_migratetype(page)))
1011 continue;
1012
1013 /* Perform the isolation */
1014 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
1015 isolate_mode);
883 1016
1017 if (!low_pfn || cc->contended)
1018 return ISOLATE_ABORT;
1019
1020 /*
1021 * Either we isolated something and proceed with migration. Or
1022 * we failed and compact_zone should decide if we should
1023 * continue or not.
1024 */
1025 break;
1026 }
1027
1028 acct_isolated(zone, cc);
1029 /* Record where migration scanner will be restarted */
884 cc->migrate_pfn = low_pfn; 1030 cc->migrate_pfn = low_pfn;
885 1031
886 return ISOLATE_SUCCESS; 1032 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
887} 1033}
888 1034
889static int compact_finished(struct zone *zone, 1035static int compact_finished(struct zone *zone, struct compact_control *cc,
890 struct compact_control *cc) 1036 const int migratetype)
891{ 1037{
892 unsigned int order; 1038 unsigned int order;
893 unsigned long watermark; 1039 unsigned long watermark;
@@ -933,7 +1079,7 @@ static int compact_finished(struct zone *zone,
933 struct free_area *area = &zone->free_area[order]; 1079 struct free_area *area = &zone->free_area[order];
934 1080
935 /* Job done if page is free of the right migratetype */ 1081 /* Job done if page is free of the right migratetype */
936 if (!list_empty(&area->free_list[cc->migratetype])) 1082 if (!list_empty(&area->free_list[migratetype]))
937 return COMPACT_PARTIAL; 1083 return COMPACT_PARTIAL;
938 1084
939 /* Job done if allocation would set block type */ 1085 /* Job done if allocation would set block type */
@@ -999,6 +1145,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
999 int ret; 1145 int ret;
1000 unsigned long start_pfn = zone->zone_start_pfn; 1146 unsigned long start_pfn = zone->zone_start_pfn;
1001 unsigned long end_pfn = zone_end_pfn(zone); 1147 unsigned long end_pfn = zone_end_pfn(zone);
1148 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1002 const bool sync = cc->mode != MIGRATE_ASYNC; 1149 const bool sync = cc->mode != MIGRATE_ASYNC;
1003 1150
1004 ret = compaction_suitable(zone, cc->order); 1151 ret = compaction_suitable(zone, cc->order);
@@ -1041,7 +1188,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1041 1188
1042 migrate_prep_local(); 1189 migrate_prep_local();
1043 1190
1044 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 1191 while ((ret = compact_finished(zone, cc, migratetype)) ==
1192 COMPACT_CONTINUE) {
1045 int err; 1193 int err;
1046 1194
1047 switch (isolate_migratepages(zone, cc)) { 1195 switch (isolate_migratepages(zone, cc)) {
@@ -1056,9 +1204,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1056 ; 1204 ;
1057 } 1205 }
1058 1206
1059 if (!cc->nr_migratepages)
1060 continue;
1061
1062 err = migrate_pages(&cc->migratepages, compaction_alloc, 1207 err = migrate_pages(&cc->migratepages, compaction_alloc,
1063 compaction_free, (unsigned long)cc, cc->mode, 1208 compaction_free, (unsigned long)cc, cc->mode,
1064 MR_COMPACTION); 1209 MR_COMPACTION);
@@ -1092,14 +1237,14 @@ out:
1092} 1237}
1093 1238
1094static unsigned long compact_zone_order(struct zone *zone, int order, 1239static unsigned long compact_zone_order(struct zone *zone, int order,
1095 gfp_t gfp_mask, enum migrate_mode mode, bool *contended) 1240 gfp_t gfp_mask, enum migrate_mode mode, int *contended)
1096{ 1241{
1097 unsigned long ret; 1242 unsigned long ret;
1098 struct compact_control cc = { 1243 struct compact_control cc = {
1099 .nr_freepages = 0, 1244 .nr_freepages = 0,
1100 .nr_migratepages = 0, 1245 .nr_migratepages = 0,
1101 .order = order, 1246 .order = order,
1102 .migratetype = allocflags_to_migratetype(gfp_mask), 1247 .gfp_mask = gfp_mask,
1103 .zone = zone, 1248 .zone = zone,
1104 .mode = mode, 1249 .mode = mode,
1105 }; 1250 };
@@ -1124,48 +1269,117 @@ int sysctl_extfrag_threshold = 500;
1124 * @gfp_mask: The GFP mask of the current allocation 1269 * @gfp_mask: The GFP mask of the current allocation
1125 * @nodemask: The allowed nodes to allocate from 1270 * @nodemask: The allowed nodes to allocate from
1126 * @mode: The migration mode for async, sync light, or sync migration 1271 * @mode: The migration mode for async, sync light, or sync migration
1127 * @contended: Return value that is true if compaction was aborted due to lock contention 1272 * @contended: Return value that determines if compaction was aborted due to
1128 * @page: Optionally capture a free page of the requested order during compaction 1273 * need_resched() or lock contention
1274 * @candidate_zone: Return the zone where we think allocation should succeed
1129 * 1275 *
1130 * This is the main entry point for direct page compaction. 1276 * This is the main entry point for direct page compaction.
1131 */ 1277 */
1132unsigned long try_to_compact_pages(struct zonelist *zonelist, 1278unsigned long try_to_compact_pages(struct zonelist *zonelist,
1133 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1279 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1134 enum migrate_mode mode, bool *contended) 1280 enum migrate_mode mode, int *contended,
1281 struct zone **candidate_zone)
1135{ 1282{
1136 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1283 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1137 int may_enter_fs = gfp_mask & __GFP_FS; 1284 int may_enter_fs = gfp_mask & __GFP_FS;
1138 int may_perform_io = gfp_mask & __GFP_IO; 1285 int may_perform_io = gfp_mask & __GFP_IO;
1139 struct zoneref *z; 1286 struct zoneref *z;
1140 struct zone *zone; 1287 struct zone *zone;
1141 int rc = COMPACT_SKIPPED; 1288 int rc = COMPACT_DEFERRED;
1142 int alloc_flags = 0; 1289 int alloc_flags = 0;
1290 int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
1291
1292 *contended = COMPACT_CONTENDED_NONE;
1143 1293
1144 /* Check if the GFP flags allow compaction */ 1294 /* Check if the GFP flags allow compaction */
1145 if (!order || !may_enter_fs || !may_perform_io) 1295 if (!order || !may_enter_fs || !may_perform_io)
1146 return rc; 1296 return COMPACT_SKIPPED;
1147
1148 count_compact_event(COMPACTSTALL);
1149 1297
1150#ifdef CONFIG_CMA 1298#ifdef CONFIG_CMA
1151 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 1299 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
1152 alloc_flags |= ALLOC_CMA; 1300 alloc_flags |= ALLOC_CMA;
1153#endif 1301#endif
1154 /* Compact each zone in the list */ 1302 /* Compact each zone in the list */
1155 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1303 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
1156 nodemask) { 1304 nodemask) {
1157 int status; 1305 int status;
1306 int zone_contended;
1307
1308 if (compaction_deferred(zone, order))
1309 continue;
1158 1310
1159 status = compact_zone_order(zone, order, gfp_mask, mode, 1311 status = compact_zone_order(zone, order, gfp_mask, mode,
1160 contended); 1312 &zone_contended);
1161 rc = max(status, rc); 1313 rc = max(status, rc);
1314 /*
1315 * It takes at least one zone that wasn't lock contended
1316 * to clear all_zones_contended.
1317 */
1318 all_zones_contended &= zone_contended;
1162 1319
1163 /* If a normal allocation would succeed, stop compacting */ 1320 /* If a normal allocation would succeed, stop compacting */
1164 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 1321 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
1165 alloc_flags)) 1322 alloc_flags)) {
1166 break; 1323 *candidate_zone = zone;
1324 /*
1325 * We think the allocation will succeed in this zone,
1326 * but it is not certain, hence the false. The caller
1327 * will repeat this with true if allocation indeed
1328 * succeeds in this zone.
1329 */
1330 compaction_defer_reset(zone, order, false);
1331 /*
1332 * It is possible that async compaction aborted due to
1333 * need_resched() and the watermarks were ok thanks to
1334 * somebody else freeing memory. The allocation can
1335 * however still fail so we better signal the
1336 * need_resched() contention anyway (this will not
1337 * prevent the allocation attempt).
1338 */
1339 if (zone_contended == COMPACT_CONTENDED_SCHED)
1340 *contended = COMPACT_CONTENDED_SCHED;
1341
1342 goto break_loop;
1343 }
1344
1345 if (mode != MIGRATE_ASYNC) {
1346 /*
1347 * We think that allocation won't succeed in this zone
1348 * so we defer compaction there. If it ends up
1349 * succeeding after all, it will be reset.
1350 */
1351 defer_compaction(zone, order);
1352 }
1353
1354 /*
1355 * We might have stopped compacting due to need_resched() in
1356 * async compaction, or due to a fatal signal detected. In that
1357 * case do not try further zones and signal need_resched()
1358 * contention.
1359 */
1360 if ((zone_contended == COMPACT_CONTENDED_SCHED)
1361 || fatal_signal_pending(current)) {
1362 *contended = COMPACT_CONTENDED_SCHED;
1363 goto break_loop;
1364 }
1365
1366 continue;
1367break_loop:
1368 /*
1369 * We might not have tried all the zones, so be conservative
1370 * and assume they are not all lock contended.
1371 */
1372 all_zones_contended = 0;
1373 break;
1167 } 1374 }
1168 1375
1376 /*
1377 * If at least one zone wasn't deferred or skipped, we report if all
1378 * zones that were tried were lock contended.
1379 */
1380 if (rc > COMPACT_SKIPPED && all_zones_contended)
1381 *contended = COMPACT_CONTENDED_LOCK;
1382
1169 return rc; 1383 return rc;
1170} 1384}
1171 1385
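The per-zone contention bookkeeping above only works because the COMPACT_CONTENDED_* values behave like flag bits under a bitwise AND: the accumulator starts as COMPACT_CONTENDED_LOCK and keeps that value only if every zone that was tried reported lock contention. A minimal standalone sketch of the accumulation, assuming the values NONE=0, SCHED=1 and LOCK=2 (the real enum is introduced elsewhere in this series; the values here are illustrative only):

#include <stdio.h>

/* Assumed, illustrative values; the real enum compact_contended is not shown
 * in this hunk. The scheme only needs LOCK to have a bit that SCHED and NONE
 * do not share. */
enum {
        COMPACT_CONTENDED_NONE  = 0,
        COMPACT_CONTENDED_SCHED = 1,
        COMPACT_CONTENDED_LOCK  = 2,
};

/* Returns 1 only if every tried zone reported lock contention. */
static int all_zones_lock_contended(const int *zone_contended, int nr_zones)
{
        int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
        int i;

        for (i = 0; i < nr_zones; i++)
                all_zones_contended &= zone_contended[i];

        return all_zones_contended == COMPACT_CONTENDED_LOCK;
}

int main(void)
{
        int all_lock[] = { COMPACT_CONTENDED_LOCK, COMPACT_CONTENDED_LOCK };
        int mixed[]    = { COMPACT_CONTENDED_LOCK, COMPACT_CONTENDED_SCHED };

        /* prints "1 0": any zone that was not lock contended clears the accumulator */
        printf("%d %d\n", all_zones_lock_contended(all_lock, 2),
                          all_zones_lock_contended(mixed, 2));
        return 0;
}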
diff --git a/mm/debug.c b/mm/debug.c
new file mode 100644
index 000000000000..5ce45c9a29b5
--- /dev/null
+++ b/mm/debug.c
@@ -0,0 +1,237 @@
1/*
2 * mm/debug.c
3 *
4 * mm/ specific debug routines.
5 *
6 */
7
8#include <linux/kernel.h>
9#include <linux/mm.h>
10#include <linux/ftrace_event.h>
11#include <linux/memcontrol.h>
12
13static const struct trace_print_flags pageflag_names[] = {
14 {1UL << PG_locked, "locked" },
15 {1UL << PG_error, "error" },
16 {1UL << PG_referenced, "referenced" },
17 {1UL << PG_uptodate, "uptodate" },
18 {1UL << PG_dirty, "dirty" },
19 {1UL << PG_lru, "lru" },
20 {1UL << PG_active, "active" },
21 {1UL << PG_slab, "slab" },
22 {1UL << PG_owner_priv_1, "owner_priv_1" },
23 {1UL << PG_arch_1, "arch_1" },
24 {1UL << PG_reserved, "reserved" },
25 {1UL << PG_private, "private" },
26 {1UL << PG_private_2, "private_2" },
27 {1UL << PG_writeback, "writeback" },
28#ifdef CONFIG_PAGEFLAGS_EXTENDED
29 {1UL << PG_head, "head" },
30 {1UL << PG_tail, "tail" },
31#else
32 {1UL << PG_compound, "compound" },
33#endif
34 {1UL << PG_swapcache, "swapcache" },
35 {1UL << PG_mappedtodisk, "mappedtodisk" },
36 {1UL << PG_reclaim, "reclaim" },
37 {1UL << PG_swapbacked, "swapbacked" },
38 {1UL << PG_unevictable, "unevictable" },
39#ifdef CONFIG_MMU
40 {1UL << PG_mlocked, "mlocked" },
41#endif
42#ifdef CONFIG_ARCH_USES_PG_UNCACHED
43 {1UL << PG_uncached, "uncached" },
44#endif
45#ifdef CONFIG_MEMORY_FAILURE
46 {1UL << PG_hwpoison, "hwpoison" },
47#endif
48#ifdef CONFIG_TRANSPARENT_HUGEPAGE
49 {1UL << PG_compound_lock, "compound_lock" },
50#endif
51};
52
53static void dump_flags(unsigned long flags,
54 const struct trace_print_flags *names, int count)
55{
56 const char *delim = "";
57 unsigned long mask;
58 int i;
59
60 pr_emerg("flags: %#lx(", flags);
61
62 /* remove zone id */
63 flags &= (1UL << NR_PAGEFLAGS) - 1;
64
65 for (i = 0; i < count && flags; i++) {
66
67 mask = names[i].mask;
68 if ((flags & mask) != mask)
69 continue;
70
71 flags &= ~mask;
72 pr_cont("%s%s", delim, names[i].name);
73 delim = "|";
74 }
75
76 /* check for left over flags */
77 if (flags)
78 pr_cont("%s%#lx", delim, flags);
79
80 pr_cont(")\n");
81}
82
83void dump_page_badflags(struct page *page, const char *reason,
84 unsigned long badflags)
85{
86 pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
87 page, atomic_read(&page->_count), page_mapcount(page),
88 page->mapping, page->index);
89 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
90 dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
91 if (reason)
92 pr_alert("page dumped because: %s\n", reason);
93 if (page->flags & badflags) {
94 pr_alert("bad because of flags:\n");
95 dump_flags(page->flags & badflags,
96 pageflag_names, ARRAY_SIZE(pageflag_names));
97 }
98 mem_cgroup_print_bad_page(page);
99}
100
101void dump_page(struct page *page, const char *reason)
102{
103 dump_page_badflags(page, reason, 0);
104}
105EXPORT_SYMBOL(dump_page);
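dump_page() and dump_page_badflags() are plain diagnostic helpers: pass the page, a reason string, and optionally a mask of flags considered bad. A hedged usage sketch, not taken from this patch (the wrapper name and the check are illustrative only):

#include <linux/mm.h>
#include <linux/page-flags.h>

/* Illustrative: report a page that is unexpectedly locked, highlighting
 * PG_locked as the offending flag in the dump. */
static void my_report_unexpectedly_locked(struct page *page)
{
        if (unlikely(PageLocked(page)))
                dump_page_badflags(page, "expected an unlocked page",
                                   1UL << PG_locked);
}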
106
107#ifdef CONFIG_DEBUG_VM
108
109static const struct trace_print_flags vmaflags_names[] = {
110 {VM_READ, "read" },
111 {VM_WRITE, "write" },
112 {VM_EXEC, "exec" },
113 {VM_SHARED, "shared" },
114 {VM_MAYREAD, "mayread" },
115 {VM_MAYWRITE, "maywrite" },
116 {VM_MAYEXEC, "mayexec" },
117 {VM_MAYSHARE, "mayshare" },
118 {VM_GROWSDOWN, "growsdown" },
119 {VM_PFNMAP, "pfnmap" },
120 {VM_DENYWRITE, "denywrite" },
121 {VM_LOCKED, "locked" },
122 {VM_IO, "io" },
123 {VM_SEQ_READ, "seqread" },
124 {VM_RAND_READ, "randread" },
125 {VM_DONTCOPY, "dontcopy" },
126 {VM_DONTEXPAND, "dontexpand" },
127 {VM_ACCOUNT, "account" },
128 {VM_NORESERVE, "noreserve" },
129 {VM_HUGETLB, "hugetlb" },
130 {VM_NONLINEAR, "nonlinear" },
131#if defined(CONFIG_X86)
132 {VM_PAT, "pat" },
133#elif defined(CONFIG_PPC)
134 {VM_SAO, "sao" },
135#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
136 {VM_GROWSUP, "growsup" },
137#elif !defined(CONFIG_MMU)
138 {VM_MAPPED_COPY, "mappedcopy" },
139#else
140 {VM_ARCH_1, "arch_1" },
141#endif
142 {VM_DONTDUMP, "dontdump" },
143#ifdef CONFIG_MEM_SOFT_DIRTY
144 {VM_SOFTDIRTY, "softdirty" },
145#endif
146 {VM_MIXEDMAP, "mixedmap" },
147 {VM_HUGEPAGE, "hugepage" },
148 {VM_NOHUGEPAGE, "nohugepage" },
149 {VM_MERGEABLE, "mergeable" },
150};
151
152void dump_vma(const struct vm_area_struct *vma)
153{
154 pr_emerg("vma %p start %p end %p\n"
155 "next %p prev %p mm %p\n"
156 "prot %lx anon_vma %p vm_ops %p\n"
157 "pgoff %lx file %p private_data %p\n",
158 vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
159 vma->vm_prev, vma->vm_mm,
160 (unsigned long)pgprot_val(vma->vm_page_prot),
161 vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
162 vma->vm_file, vma->vm_private_data);
163 dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
164}
165EXPORT_SYMBOL(dump_vma);
166
167void dump_mm(const struct mm_struct *mm)
168{
169 pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n"
170#ifdef CONFIG_MMU
171 "get_unmapped_area %p\n"
172#endif
173 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
174 "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n"
175 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
176 "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
177 "start_code %lx end_code %lx start_data %lx end_data %lx\n"
178 "start_brk %lx brk %lx start_stack %lx\n"
179 "arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
180 "binfmt %p flags %lx core_state %p\n"
181#ifdef CONFIG_AIO
182 "ioctx_table %p\n"
183#endif
184#ifdef CONFIG_MEMCG
185 "owner %p "
186#endif
187 "exe_file %p\n"
188#ifdef CONFIG_MMU_NOTIFIER
189 "mmu_notifier_mm %p\n"
190#endif
191#ifdef CONFIG_NUMA_BALANCING
192 "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
193#endif
194#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
195 "tlb_flush_pending %d\n"
196#endif
197 "%s", /* This is here to hold the comma */
198
199 mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
200#ifdef CONFIG_MMU
201 mm->get_unmapped_area,
202#endif
203 mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
204 mm->pgd, atomic_read(&mm->mm_users),
205 atomic_read(&mm->mm_count),
206 atomic_long_read((atomic_long_t *)&mm->nr_ptes),
207 mm->map_count,
208 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
209 mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
210 mm->start_code, mm->end_code, mm->start_data, mm->end_data,
211 mm->start_brk, mm->brk, mm->start_stack,
212 mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
213 mm->binfmt, mm->flags, mm->core_state,
214#ifdef CONFIG_AIO
215 mm->ioctx_table,
216#endif
217#ifdef CONFIG_MEMCG
218 mm->owner,
219#endif
220 mm->exe_file,
221#ifdef CONFIG_MMU_NOTIFIER
222 mm->mmu_notifier_mm,
223#endif
224#ifdef CONFIG_NUMA_BALANCING
225 mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
226#endif
227#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
228 mm->tlb_flush_pending,
229#endif
230 "" /* This is here to not have a comma! */
231 );
232
233 dump_flags(mm->def_flags, vmaflags_names,
234 ARRAY_SIZE(vmaflags_names));
235}
236
237#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 306baa594f95..fd5fe4342e93 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -62,6 +62,7 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
62}; 62};
63 63
64static DEFINE_MUTEX(pools_lock); 64static DEFINE_MUTEX(pools_lock);
65static DEFINE_MUTEX(pools_reg_lock);
65 66
66static ssize_t 67static ssize_t
67show_pools(struct device *dev, struct device_attribute *attr, char *buf) 68show_pools(struct device *dev, struct device_attribute *attr, char *buf)
@@ -132,29 +133,27 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
132{ 133{
133 struct dma_pool *retval; 134 struct dma_pool *retval;
134 size_t allocation; 135 size_t allocation;
136 bool empty = false;
135 137
136 if (align == 0) { 138 if (align == 0)
137 align = 1; 139 align = 1;
138 } else if (align & (align - 1)) { 140 else if (align & (align - 1))
139 return NULL; 141 return NULL;
140 }
141 142
142 if (size == 0) { 143 if (size == 0)
143 return NULL; 144 return NULL;
144 } else if (size < 4) { 145 else if (size < 4)
145 size = 4; 146 size = 4;
146 }
147 147
148 if ((size % align) != 0) 148 if ((size % align) != 0)
149 size = ALIGN(size, align); 149 size = ALIGN(size, align);
150 150
151 allocation = max_t(size_t, size, PAGE_SIZE); 151 allocation = max_t(size_t, size, PAGE_SIZE);
152 152
153 if (!boundary) { 153 if (!boundary)
154 boundary = allocation; 154 boundary = allocation;
155 } else if ((boundary < size) || (boundary & (boundary - 1))) { 155 else if ((boundary < size) || (boundary & (boundary - 1)))
156 return NULL; 156 return NULL;
157 }
158 157
159 retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); 158 retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
160 if (!retval) 159 if (!retval)
@@ -172,15 +171,34 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
172 171
173 INIT_LIST_HEAD(&retval->pools); 172 INIT_LIST_HEAD(&retval->pools);
174 173
174 /*
175 * pools_lock ensures that the ->dma_pools list does not get corrupted.
176 * pools_reg_lock ensures that there is not a race between
177 * dma_pool_create() and dma_pool_destroy() or within dma_pool_create()
178 * when the first invocation of dma_pool_create() failed on
179 * device_create_file() and the second assumes that it has been done (I
180 * know it is a short window).
181 */
182 mutex_lock(&pools_reg_lock);
175 mutex_lock(&pools_lock); 183 mutex_lock(&pools_lock);
176 if (list_empty(&dev->dma_pools) && 184 if (list_empty(&dev->dma_pools))
177 device_create_file(dev, &dev_attr_pools)) { 185 empty = true;
178 kfree(retval); 186 list_add(&retval->pools, &dev->dma_pools);
179 return NULL;
180 } else
181 list_add(&retval->pools, &dev->dma_pools);
182 mutex_unlock(&pools_lock); 187 mutex_unlock(&pools_lock);
183 188 if (empty) {
189 int err;
190
191 err = device_create_file(dev, &dev_attr_pools);
192 if (err) {
193 mutex_lock(&pools_lock);
194 list_del(&retval->pools);
195 mutex_unlock(&pools_lock);
196 mutex_unlock(&pools_reg_lock);
197 kfree(retval);
198 return NULL;
199 }
200 }
201 mutex_unlock(&pools_reg_lock);
184 return retval; 202 return retval;
185} 203}
186EXPORT_SYMBOL(dma_pool_create); 204EXPORT_SYMBOL(dma_pool_create);
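For reference, the create/destroy pair that the new locking protects is used by drivers roughly as follows; a hedged sketch with illustrative names and sizes, not code from this patch:

#include <linux/device.h>
#include <linux/dmapool.h>

/* Illustrative: a pool of 64-byte, 64-byte-aligned DMA descriptors with no
 * boundary-crossing restriction (boundary == 0). */
static struct dma_pool *my_desc_pool_create(struct device *dev)
{
        return dma_pool_create("my_descs", dev, 64, 64, 0);
}

static void my_desc_pool_destroy(struct dma_pool *pool)
{
        if (pool)
                dma_pool_destroy(pool);
}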
@@ -251,11 +269,17 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
251 */ 269 */
252void dma_pool_destroy(struct dma_pool *pool) 270void dma_pool_destroy(struct dma_pool *pool)
253{ 271{
272 bool empty = false;
273
274 mutex_lock(&pools_reg_lock);
254 mutex_lock(&pools_lock); 275 mutex_lock(&pools_lock);
255 list_del(&pool->pools); 276 list_del(&pool->pools);
256 if (pool->dev && list_empty(&pool->dev->dma_pools)) 277 if (pool->dev && list_empty(&pool->dev->dma_pools))
257 device_remove_file(pool->dev, &dev_attr_pools); 278 empty = true;
258 mutex_unlock(&pools_lock); 279 mutex_unlock(&pools_lock);
280 if (empty)
281 device_remove_file(pool->dev, &dev_attr_pools);
282 mutex_unlock(&pools_reg_lock);
259 283
260 while (!list_empty(&pool->page_list)) { 284 while (!list_empty(&pool->page_list)) {
261 struct dma_page *page; 285 struct dma_page *page;
diff --git a/mm/filemap.c b/mm/filemap.c
index 90effcdf948d..14b4642279f1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -670,17 +670,13 @@ EXPORT_SYMBOL(__page_cache_alloc);
670 * at a cost of "thundering herd" phenomena during rare hash 670 * at a cost of "thundering herd" phenomena during rare hash
671 * collisions. 671 * collisions.
672 */ 672 */
673static wait_queue_head_t *page_waitqueue(struct page *page) 673wait_queue_head_t *page_waitqueue(struct page *page)
674{ 674{
675 const struct zone *zone = page_zone(page); 675 const struct zone *zone = page_zone(page);
676 676
677 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; 677 return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
678} 678}
679 679EXPORT_SYMBOL(page_waitqueue);
680static inline void wake_up_page(struct page *page, int bit)
681{
682 __wake_up_bit(page_waitqueue(page), &page->flags, bit);
683}
684 680
685void wait_on_page_bit(struct page *page, int bit_nr) 681void wait_on_page_bit(struct page *page, int bit_nr)
686{ 682{
@@ -703,6 +699,19 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
703 bit_wait_io, TASK_KILLABLE); 699 bit_wait_io, TASK_KILLABLE);
704} 700}
705 701
702int wait_on_page_bit_killable_timeout(struct page *page,
703 int bit_nr, unsigned long timeout)
704{
705 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
706
707 wait.key.timeout = jiffies + timeout;
708 if (!test_bit(bit_nr, &page->flags))
709 return 0;
710 return __wait_on_bit(page_waitqueue(page), &wait,
711 bit_wait_io_timeout, TASK_KILLABLE);
712}
713EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout);
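Callers use the new timeout variant like wait_on_page_bit_killable(), with an upper bound in jiffies; a nonzero return means the wait ended early, whether because of a fatal signal or the timeout. A hedged usage sketch (illustrative caller, not from this patch):

#include <linux/pagemap.h>
#include <linux/jiffies.h>

/* Illustrative: give writeback at most five seconds to clear before the
 * caller falls back to some other strategy. */
static int my_wait_for_writeback(struct page *page)
{
        if (!PageWriteback(page))
                return 0;
        return wait_on_page_bit_killable_timeout(page, PG_writeback, 5 * HZ);
}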
714
706/** 715/**
707 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 716 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
708 * @page: Page defining the wait queue of interest 717 * @page: Page defining the wait queue of interest
@@ -727,7 +736,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
727 * 736 *
728 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). 737 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
729 * Also wakes sleepers in wait_on_page_writeback() because the wakeup 738 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
730 * mechananism between PageLocked pages and PageWriteback pages is shared. 739 * mechanism between PageLocked pages and PageWriteback pages is shared.
731 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 740 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
732 * 741 *
733 * The mb is necessary to enforce ordering between the clear_bit and the read 742 * The mb is necessary to enforce ordering between the clear_bit and the read
@@ -1744,7 +1753,7 @@ EXPORT_SYMBOL(generic_file_read_iter);
1744static int page_cache_read(struct file *file, pgoff_t offset) 1753static int page_cache_read(struct file *file, pgoff_t offset)
1745{ 1754{
1746 struct address_space *mapping = file->f_mapping; 1755 struct address_space *mapping = file->f_mapping;
1747 struct page *page; 1756 struct page *page;
1748 int ret; 1757 int ret;
1749 1758
1750 do { 1759 do {
@@ -1761,7 +1770,7 @@ static int page_cache_read(struct file *file, pgoff_t offset)
1761 page_cache_release(page); 1770 page_cache_release(page);
1762 1771
1763 } while (ret == AOP_TRUNCATED_PAGE); 1772 } while (ret == AOP_TRUNCATED_PAGE);
1764 1773
1765 return ret; 1774 return ret;
1766} 1775}
1767 1776
diff --git a/mm/gup.c b/mm/gup.c
index 91d044b1600d..cd62c8c90d4a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,6 +10,10 @@
10#include <linux/swap.h> 10#include <linux/swap.h>
11#include <linux/swapops.h> 11#include <linux/swapops.h>
12 12
13#include <linux/sched.h>
14#include <linux/rwsem.h>
15#include <asm/pgtable.h>
16
13#include "internal.h" 17#include "internal.h"
14 18
15static struct page *no_page_table(struct vm_area_struct *vma, 19static struct page *no_page_table(struct vm_area_struct *vma,
@@ -281,6 +285,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
281 fault_flags |= FAULT_FLAG_ALLOW_RETRY; 285 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
282 if (*flags & FOLL_NOWAIT) 286 if (*flags & FOLL_NOWAIT)
283 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; 287 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
288 if (*flags & FOLL_TRIED) {
289 VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
290 fault_flags |= FAULT_FLAG_TRIED;
291 }
284 292
285 ret = handle_mm_fault(mm, vma, address, fault_flags); 293 ret = handle_mm_fault(mm, vma, address, fault_flags);
286 if (ret & VM_FAULT_ERROR) { 294 if (ret & VM_FAULT_ERROR) {
@@ -672,3 +680,353 @@ struct page *get_dump_page(unsigned long addr)
672 return page; 680 return page;
673} 681}
674#endif /* CONFIG_ELF_CORE */ 682#endif /* CONFIG_ELF_CORE */
683
684/*
685 * Generic RCU Fast GUP
686 *
687 * get_user_pages_fast attempts to pin user pages by walking the page
688 * tables directly and avoids taking locks. Thus the walker needs to be
689 * protected from page table pages being freed from under it, and should
690 * block any THP splits.
691 *
692 * One way to achieve this is to have the walker disable interrupts, and
693 * rely on IPIs from the TLB flushing code blocking before the page table
694 * pages are freed. This is unsuitable for architectures that do not need
695 * to broadcast an IPI when invalidating TLBs.
696 *
697 * Another way to achieve this is to batch up page table containing pages
698 * belonging to more than one mm_user, then rcu_sched a callback to free those
699 * pages. Disabling interrupts will allow the fast_gup walker to both block
700 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
701 * (which is a relatively rare event). The code below adopts this strategy.
702 *
703 * Before activating this code, please be aware that the following assumptions
704 * are currently made:
705 *
706 * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
707 * pages containing page tables.
708 *
709 * *) THP splits will broadcast an IPI, this can be achieved by overriding
710 * pmdp_splitting_flush.
711 *
712 * *) ptes can be read atomically by the architecture.
713 *
714 * *) access_ok is sufficient to validate userspace address ranges.
715 *
716 * The last two assumptions can be relaxed by the addition of helper functions.
717 *
718 * This code is based heavily on the PowerPC implementation by Nick Piggin.
719 */
720#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
721
722#ifdef __HAVE_ARCH_PTE_SPECIAL
723static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
724 int write, struct page **pages, int *nr)
725{
726 pte_t *ptep, *ptem;
727 int ret = 0;
728
729 ptem = ptep = pte_offset_map(&pmd, addr);
730 do {
731 /*
732 * In the line below we are assuming that the pte can be read
733 * atomically. If this is not the case for your architecture,
734 * please wrap this in a helper function!
735 *
736 * for an example see gup_get_pte in arch/x86/mm/gup.c
737 */
738 pte_t pte = ACCESS_ONCE(*ptep);
739 struct page *page;
740
741 /*
742 * Similar to the PMD case below, NUMA hinting must take slow
743 * path
744 */
745 if (!pte_present(pte) || pte_special(pte) ||
746 pte_numa(pte) || (write && !pte_write(pte)))
747 goto pte_unmap;
748
749 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
750 page = pte_page(pte);
751
752 if (!page_cache_get_speculative(page))
753 goto pte_unmap;
754
755 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
756 put_page(page);
757 goto pte_unmap;
758 }
759
760 pages[*nr] = page;
761 (*nr)++;
762
763 } while (ptep++, addr += PAGE_SIZE, addr != end);
764
765 ret = 1;
766
767pte_unmap:
768 pte_unmap(ptem);
769 return ret;
770}
771#else
772
773/*
774 * If we can't determine whether or not a pte is special, then fail immediately
775 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
776 * to be special.
777 *
778 * For a futex to be placed on a THP tail page, get_futex_key requires a
779 * __get_user_pages_fast implementation that can pin pages. Thus it's still
780 * useful to have gup_huge_pmd even if we can't operate on ptes.
781 */
782static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
783 int write, struct page **pages, int *nr)
784{
785 return 0;
786}
787#endif /* __HAVE_ARCH_PTE_SPECIAL */
788
789static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
790 unsigned long end, int write, struct page **pages, int *nr)
791{
792 struct page *head, *page, *tail;
793 int refs;
794
795 if (write && !pmd_write(orig))
796 return 0;
797
798 refs = 0;
799 head = pmd_page(orig);
800 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
801 tail = page;
802 do {
803 VM_BUG_ON_PAGE(compound_head(page) != head, page);
804 pages[*nr] = page;
805 (*nr)++;
806 page++;
807 refs++;
808 } while (addr += PAGE_SIZE, addr != end);
809
810 if (!page_cache_add_speculative(head, refs)) {
811 *nr -= refs;
812 return 0;
813 }
814
815 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
816 *nr -= refs;
817 while (refs--)
818 put_page(head);
819 return 0;
820 }
821
822 /*
823 * Any tail pages need their mapcount reference taken before we
824 * return. (This allows the THP code to bump their ref count when
825 * they are split into base pages).
826 */
827 while (refs--) {
828 if (PageTail(tail))
829 get_huge_page_tail(tail);
830 tail++;
831 }
832
833 return 1;
834}
835
836static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
837 unsigned long end, int write, struct page **pages, int *nr)
838{
839 struct page *head, *page, *tail;
840 int refs;
841
842 if (write && !pud_write(orig))
843 return 0;
844
845 refs = 0;
846 head = pud_page(orig);
847 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
848 tail = page;
849 do {
850 VM_BUG_ON_PAGE(compound_head(page) != head, page);
851 pages[*nr] = page;
852 (*nr)++;
853 page++;
854 refs++;
855 } while (addr += PAGE_SIZE, addr != end);
856
857 if (!page_cache_add_speculative(head, refs)) {
858 *nr -= refs;
859 return 0;
860 }
861
862 if (unlikely(pud_val(orig) != pud_val(*pudp))) {
863 *nr -= refs;
864 while (refs--)
865 put_page(head);
866 return 0;
867 }
868
869 while (refs--) {
870 if (PageTail(tail))
871 get_huge_page_tail(tail);
872 tail++;
873 }
874
875 return 1;
876}
877
878static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
879 int write, struct page **pages, int *nr)
880{
881 unsigned long next;
882 pmd_t *pmdp;
883
884 pmdp = pmd_offset(&pud, addr);
885 do {
886 pmd_t pmd = ACCESS_ONCE(*pmdp);
887
888 next = pmd_addr_end(addr, end);
889 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
890 return 0;
891
892 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
893 /*
894 * NUMA hinting faults need to be handled in the GUP
895 * slowpath for accounting purposes and so that they
896 * can be serialised against THP migration.
897 */
898 if (pmd_numa(pmd))
899 return 0;
900
901 if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
902 pages, nr))
903 return 0;
904
905 } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
906 return 0;
907 } while (pmdp++, addr = next, addr != end);
908
909 return 1;
910}
911
912static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end,
913 int write, struct page **pages, int *nr)
914{
915 unsigned long next;
916 pud_t *pudp;
917
918 pudp = pud_offset(pgdp, addr);
919 do {
920 pud_t pud = ACCESS_ONCE(*pudp);
921
922 next = pud_addr_end(addr, end);
923 if (pud_none(pud))
924 return 0;
925 if (pud_huge(pud)) {
926 if (!gup_huge_pud(pud, pudp, addr, next, write,
927 pages, nr))
928 return 0;
929 } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
930 return 0;
931 } while (pudp++, addr = next, addr != end);
932
933 return 1;
934}
935
936/*
937 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
938 * the regular GUP. It will only return non-negative values.
939 */
940int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
941 struct page **pages)
942{
943 struct mm_struct *mm = current->mm;
944 unsigned long addr, len, end;
945 unsigned long next, flags;
946 pgd_t *pgdp;
947 int nr = 0;
948
949 start &= PAGE_MASK;
950 addr = start;
951 len = (unsigned long) nr_pages << PAGE_SHIFT;
952 end = start + len;
953
954 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
955 start, len)))
956 return 0;
957
958 /*
959 * Disable interrupts. We use the nested form as we can already have
960 * interrupts disabled by get_futex_key.
961 *
962 * With interrupts disabled, we block page table pages from being
963 * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
964 * for more details.
965 *
966 * We do not adopt an rcu_read_lock(.) here as we also want to
967 * block IPIs that come from THPs splitting.
968 */
969
970 local_irq_save(flags);
971 pgdp = pgd_offset(mm, addr);
972 do {
973 next = pgd_addr_end(addr, end);
974 if (pgd_none(*pgdp))
975 break;
976 else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr))
977 break;
978 } while (pgdp++, addr = next, addr != end);
979 local_irq_restore(flags);
980
981 return nr;
982}
983
984/**
985 * get_user_pages_fast() - pin user pages in memory
986 * @start: starting user address
987 * @nr_pages: number of pages from start to pin
988 * @write: whether pages will be written to
989 * @pages: array that receives pointers to the pages pinned.
990 * Should be at least nr_pages long.
991 *
992 * Attempt to pin user pages in memory without taking mm->mmap_sem.
993 * If not successful, it will fall back to taking the lock and
994 * calling get_user_pages().
995 *
996 * Returns number of pages pinned. This may be fewer than the number
997 * requested. If nr_pages is 0 or negative, returns 0. If no pages
998 * were pinned, returns -errno.
999 */
1000int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1001 struct page **pages)
1002{
1003 struct mm_struct *mm = current->mm;
1004 int nr, ret;
1005
1006 start &= PAGE_MASK;
1007 nr = __get_user_pages_fast(start, nr_pages, write, pages);
1008 ret = nr;
1009
1010 if (nr < nr_pages) {
1011 /* Try to get the remaining pages with get_user_pages */
1012 start += nr << PAGE_SHIFT;
1013 pages += nr;
1014
1015 down_read(&mm->mmap_sem);
1016 ret = get_user_pages(current, mm, start,
1017 nr_pages - nr, write, 0, pages, NULL);
1018 up_read(&mm->mmap_sem);
1019
1020 /* Have to be a bit careful with return values */
1021 if (nr > 0) {
1022 if (ret < 0)
1023 ret = nr;
1024 else
1025 ret += nr;
1026 }
1027 }
1028
1029 return ret;
1030}
1031
1032#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
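The generic implementation keeps the established calling convention: get_user_pages_fast() may pin fewer pages than requested and returns -errno only when nothing was pinned, falling back to the slow path internally. A hedged usage sketch (illustrative driver-style helper, not part of this patch):

#include <linux/mm.h>
#include <linux/errno.h>

/* Illustrative: pin nr_pages of a user buffer for writing; on partial
 * success, release what was pinned and report failure to keep the caller
 * simple. */
static int my_pin_user_buffer(unsigned long uaddr, int nr_pages,
                              struct page **pages)
{
        int pinned = get_user_pages_fast(uaddr, nr_pages, 1, pages);
        int i;

        if (pinned == nr_pages)
                return 0;

        for (i = 0; i < pinned; i++)
                put_page(pages[i]);

        return pinned < 0 ? pinned : -EFAULT;
}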
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d9a21d06b862..74c78aa8bc2f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1096,7 +1096,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1096 unsigned long mmun_end; /* For mmu_notifiers */ 1096 unsigned long mmun_end; /* For mmu_notifiers */
1097 1097
1098 ptl = pmd_lockptr(mm, pmd); 1098 ptl = pmd_lockptr(mm, pmd);
1099 VM_BUG_ON(!vma->anon_vma); 1099 VM_BUG_ON_VMA(!vma->anon_vma, vma);
1100 haddr = address & HPAGE_PMD_MASK; 1100 haddr = address & HPAGE_PMD_MASK;
1101 if (is_huge_zero_pmd(orig_pmd)) 1101 if (is_huge_zero_pmd(orig_pmd))
1102 goto alloc; 1102 goto alloc;
@@ -1795,14 +1795,17 @@ static int __split_huge_page_map(struct page *page,
1795 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1795 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1796 pte_t *pte, entry; 1796 pte_t *pte, entry;
1797 BUG_ON(PageCompound(page+i)); 1797 BUG_ON(PageCompound(page+i));
1798 /*
1799 * Note that pmd_numa is not transferred deliberately
1800 * to avoid any possibility that pte_numa leaks to
1801 * a PROT_NONE VMA by accident.
1802 */
1798 entry = mk_pte(page + i, vma->vm_page_prot); 1803 entry = mk_pte(page + i, vma->vm_page_prot);
1799 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1804 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1800 if (!pmd_write(*pmd)) 1805 if (!pmd_write(*pmd))
1801 entry = pte_wrprotect(entry); 1806 entry = pte_wrprotect(entry);
1802 if (!pmd_young(*pmd)) 1807 if (!pmd_young(*pmd))
1803 entry = pte_mkold(entry); 1808 entry = pte_mkold(entry);
1804 if (pmd_numa(*pmd))
1805 entry = pte_mknuma(entry);
1806 pte = pte_offset_map(&_pmd, haddr); 1809 pte = pte_offset_map(&_pmd, haddr);
1807 BUG_ON(!pte_none(*pte)); 1810 BUG_ON(!pte_none(*pte));
1808 set_pte_at(mm, haddr, pte, entry); 1811 set_pte_at(mm, haddr, pte, entry);
@@ -2045,7 +2048,7 @@ int __khugepaged_enter(struct mm_struct *mm)
2045 return -ENOMEM; 2048 return -ENOMEM;
2046 2049
2047 /* __khugepaged_exit() must not run from under us */ 2050 /* __khugepaged_exit() must not run from under us */
2048 VM_BUG_ON(khugepaged_test_exit(mm)); 2051 VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
2049 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { 2052 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
2050 free_mm_slot(mm_slot); 2053 free_mm_slot(mm_slot);
2051 return 0; 2054 return 0;
@@ -2080,7 +2083,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
2080 if (vma->vm_ops) 2083 if (vma->vm_ops)
2081 /* khugepaged not yet working on file or special mappings */ 2084 /* khugepaged not yet working on file or special mappings */
2082 return 0; 2085 return 0;
2083 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2086 VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
2084 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2087 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2085 hend = vma->vm_end & HPAGE_PMD_MASK; 2088 hend = vma->vm_end & HPAGE_PMD_MASK;
2086 if (hstart < hend) 2089 if (hstart < hend)
@@ -2319,23 +2322,17 @@ static struct page
2319 int node) 2322 int node)
2320{ 2323{
2321 VM_BUG_ON_PAGE(*hpage, *hpage); 2324 VM_BUG_ON_PAGE(*hpage, *hpage);
2325
2322 /* 2326 /*
2323 * Allocate the page while the vma is still valid and under 2327 * Before allocating the hugepage, release the mmap_sem read lock.
2324 * the mmap_sem read mode so there is no memory allocation 2328 * The allocation can take potentially a long time if it involves
2325 * later when we take the mmap_sem in write mode. This is more 2329 * sync compaction, and we do not need to hold the mmap_sem during
2326 * friendly behavior (OTOH it may actually hide bugs) to 2330 * that. We will recheck the vma after taking it again in write mode.
2327 * filesystems in userland with daemons allocating memory in
2328 * the userland I/O paths. Allocating memory with the
2329 * mmap_sem in read mode is good idea also to allow greater
2330 * scalability.
2331 */ 2331 */
2332 up_read(&mm->mmap_sem);
2333
2332 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( 2334 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
2333 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); 2335 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
2334 /*
2335 * After allocating the hugepage, release the mmap_sem read lock in
2336 * preparation for taking it in write mode.
2337 */
2338 up_read(&mm->mmap_sem);
2339 if (unlikely(!*hpage)) { 2336 if (unlikely(!*hpage)) {
2340 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2337 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2341 *hpage = ERR_PTR(-ENOMEM); 2338 *hpage = ERR_PTR(-ENOMEM);
@@ -2409,7 +2406,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
2409 return false; 2406 return false;
2410 if (is_vma_temporary_stack(vma)) 2407 if (is_vma_temporary_stack(vma))
2411 return false; 2408 return false;
2412 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2409 VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
2413 return true; 2410 return true;
2414} 2411}
2415 2412
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eeceeeb09019..9fd722769927 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -434,7 +434,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode)
434 434
435static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 435static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
436{ 436{
437 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 437 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
438 if (vma->vm_flags & VM_MAYSHARE) { 438 if (vma->vm_flags & VM_MAYSHARE) {
439 struct address_space *mapping = vma->vm_file->f_mapping; 439 struct address_space *mapping = vma->vm_file->f_mapping;
440 struct inode *inode = mapping->host; 440 struct inode *inode = mapping->host;
@@ -449,8 +449,8 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
449 449
450static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 450static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
451{ 451{
452 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 452 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
453 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); 453 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
454 454
455 set_vma_private_data(vma, (get_vma_private_data(vma) & 455 set_vma_private_data(vma, (get_vma_private_data(vma) &
456 HPAGE_RESV_MASK) | (unsigned long)map); 456 HPAGE_RESV_MASK) | (unsigned long)map);
@@ -458,15 +458,15 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
458 458
459static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 459static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
460{ 460{
461 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 461 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
462 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); 462 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
463 463
464 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 464 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
465} 465}
466 466
467static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 467static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
468{ 468{
469 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 469 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
470 470
471 return (get_vma_private_data(vma) & flag) != 0; 471 return (get_vma_private_data(vma) & flag) != 0;
472} 472}
@@ -474,7 +474,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
474/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 474/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
475void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 475void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
476{ 476{
477 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 477 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
478 if (!(vma->vm_flags & VM_MAYSHARE)) 478 if (!(vma->vm_flags & VM_MAYSHARE))
479 vma->vm_private_data = (void *)0; 479 vma->vm_private_data = (void *)0;
480} 480}
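The VM_BUG_ON_VMA()/VM_BUG_ON_MM() conversions above are what the new mm/debug.c helpers are for: when such an assertion fires on a CONFIG_DEBUG_VM kernel, the whole vma or mm is dumped rather than just the failed condition. A hedged usage sketch (illustrative check, not from this patch):

#include <linux/mmdebug.h>
#include <linux/hugetlb.h>

/* Illustrative: assert that a helper really got a hugetlb vma; on failure
 * a debug kernel now prints the full vma via dump_vma() before BUG(). */
static void my_check_hugetlb_vma(struct vm_area_struct *vma)
{
        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
}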
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 9eebfadeeee1..a67c26e0f360 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -217,7 +217,7 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
217 217
218 if (hugetlb_cgroup_disabled()) 218 if (hugetlb_cgroup_disabled())
219 return; 219 return;
220 VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); 220 lockdep_assert_held(&hugetlb_lock);
221 h_cg = hugetlb_cgroup_from_page(page); 221 h_cg = hugetlb_cgroup_from_page(page);
222 if (unlikely(!h_cg)) 222 if (unlikely(!h_cg))
223 return; 223 return;
diff --git a/mm/internal.h b/mm/internal.h
index a1b651b11c5f..829304090b90 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -142,10 +142,10 @@ struct compact_control {
142 bool finished_update_migrate; 142 bool finished_update_migrate;
143 143
144 int order; /* order a direct compactor needs */ 144 int order; /* order a direct compactor needs */
145 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 145 const gfp_t gfp_mask; /* gfp mask of a direct compactor */
146 struct zone *zone; 146 struct zone *zone;
 147 bool contended; /* True if a lock was contended, or 147 int contended; /* Signal need_resched() or lock
 147 bool contended; /* True if a lock was contended, or 147 int contended; /* Signal need_resched() or lock
148 * need_resched() true during async 148 * contention detected during
149 * compaction 149 * compaction
150 */ 150 */
151}; 151};
@@ -154,8 +154,8 @@ unsigned long
154isolate_freepages_range(struct compact_control *cc, 154isolate_freepages_range(struct compact_control *cc,
155 unsigned long start_pfn, unsigned long end_pfn); 155 unsigned long start_pfn, unsigned long end_pfn);
156unsigned long 156unsigned long
157isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 157isolate_migratepages_range(struct compact_control *cc,
158 unsigned long low_pfn, unsigned long end_pfn, bool unevictable); 158 unsigned long low_pfn, unsigned long end_pfn);
159 159
160#endif 160#endif
161 161
@@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
164 * general, page_zone(page)->lock must be held by the caller to prevent the 164 * general, page_zone(page)->lock must be held by the caller to prevent the
165 * page from being allocated in parallel and returning garbage as the order. 165 * page from being allocated in parallel and returning garbage as the order.
166 * If a caller does not hold page_zone(page)->lock, it must guarantee that the 166 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
167 * page cannot be allocated or merged in parallel. 167 * page cannot be allocated or merged in parallel. Alternatively, it must
168 * handle invalid values gracefully, and use page_order_unsafe() below.
168 */ 169 */
169static inline unsigned long page_order(struct page *page) 170static inline unsigned long page_order(struct page *page)
170{ 171{
@@ -172,6 +173,19 @@ static inline unsigned long page_order(struct page *page)
172 return page_private(page); 173 return page_private(page);
173} 174}
174 175
176/*
177 * Like page_order(), but for callers who cannot afford to hold the zone lock.
178 * PageBuddy() should be checked first by the caller to minimize race window,
179 * and invalid values must be handled gracefully.
180 *
181 * ACCESS_ONCE is used so that if the caller assigns the result into a local
182 * variable and e.g. tests it for valid range before using, the compiler cannot
183 * decide to remove the variable and inline the page_private(page) multiple
184 * times, potentially observing different values in the tests and the actual
185 * use of the result.
186 */
187#define page_order_unsafe(page) ACCESS_ONCE(page_private(page))
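The intended calling pattern is: check PageBuddy() first, read the order once into a local variable, range-check it, and only then act on it; a racy value is tolerated rather than trusted. A hedged sketch of such a caller (the helper name is illustrative and assumes it lives in an mm/ file that includes internal.h):

#include <linux/mm.h>
#include "internal.h"

/* Illustrative: while scanning pfns without the zone lock, return how many
 * extra pfns can be skipped because this page is a free buddy page. A stale
 * or implausible order is simply ignored. */
static unsigned long my_free_buddy_skip(struct page *page)
{
        if (PageBuddy(page)) {
                unsigned long freepage_order = page_order_unsafe(page);

                if (freepage_order > 0 && freepage_order < MAX_ORDER)
                        return (1UL << freepage_order) - 1;
        }
        return 0;
}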
188
175static inline bool is_cow_mapping(vm_flags_t flags) 189static inline bool is_cow_mapping(vm_flags_t flags)
176{ 190{
177 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 191 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 4a5822a586e6..8da581fa9060 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -34,7 +34,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
34 struct vm_area_struct *parent; 34 struct vm_area_struct *parent;
35 unsigned long last = vma_last_pgoff(node); 35 unsigned long last = vma_last_pgoff(node);
36 36
37 VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); 37 VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);
38 38
39 if (!prev->shared.linear.rb.rb_right) { 39 if (!prev->shared.linear.rb.rb_right) {
40 parent = prev; 40 parent = prev;
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index ab88dc0ea1d3..9a09f2034fcc 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -310,7 +310,7 @@ void iov_iter_init(struct iov_iter *i, int direction,
310EXPORT_SYMBOL(iov_iter_init); 310EXPORT_SYMBOL(iov_iter_init);
311 311
312static ssize_t get_pages_iovec(struct iov_iter *i, 312static ssize_t get_pages_iovec(struct iov_iter *i,
313 struct page **pages, unsigned maxpages, 313 struct page **pages, size_t maxsize, unsigned maxpages,
314 size_t *start) 314 size_t *start)
315{ 315{
316 size_t offset = i->iov_offset; 316 size_t offset = i->iov_offset;
@@ -323,6 +323,8 @@ static ssize_t get_pages_iovec(struct iov_iter *i,
323 len = iov->iov_len - offset; 323 len = iov->iov_len - offset;
324 if (len > i->count) 324 if (len > i->count)
325 len = i->count; 325 len = i->count;
326 if (len > maxsize)
327 len = maxsize;
326 addr = (unsigned long)iov->iov_base + offset; 328 addr = (unsigned long)iov->iov_base + offset;
327 len += *start = addr & (PAGE_SIZE - 1); 329 len += *start = addr & (PAGE_SIZE - 1);
328 if (len > maxpages * PAGE_SIZE) 330 if (len > maxpages * PAGE_SIZE)
@@ -588,13 +590,15 @@ static unsigned long alignment_bvec(const struct iov_iter *i)
588} 590}
589 591
590static ssize_t get_pages_bvec(struct iov_iter *i, 592static ssize_t get_pages_bvec(struct iov_iter *i,
591 struct page **pages, unsigned maxpages, 593 struct page **pages, size_t maxsize, unsigned maxpages,
592 size_t *start) 594 size_t *start)
593{ 595{
594 const struct bio_vec *bvec = i->bvec; 596 const struct bio_vec *bvec = i->bvec;
595 size_t len = bvec->bv_len - i->iov_offset; 597 size_t len = bvec->bv_len - i->iov_offset;
596 if (len > i->count) 598 if (len > i->count)
597 len = i->count; 599 len = i->count;
600 if (len > maxsize)
601 len = maxsize;
598 /* can't be more than PAGE_SIZE */ 602 /* can't be more than PAGE_SIZE */
599 *start = bvec->bv_offset + i->iov_offset; 603 *start = bvec->bv_offset + i->iov_offset;
600 604
@@ -711,13 +715,13 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
711EXPORT_SYMBOL(iov_iter_alignment); 715EXPORT_SYMBOL(iov_iter_alignment);
712 716
713ssize_t iov_iter_get_pages(struct iov_iter *i, 717ssize_t iov_iter_get_pages(struct iov_iter *i,
714 struct page **pages, unsigned maxpages, 718 struct page **pages, size_t maxsize, unsigned maxpages,
715 size_t *start) 719 size_t *start)
716{ 720{
717 if (i->type & ITER_BVEC) 721 if (i->type & ITER_BVEC)
718 return get_pages_bvec(i, pages, maxpages, start); 722 return get_pages_bvec(i, pages, maxsize, maxpages, start);
719 else 723 else
720 return get_pages_iovec(i, pages, maxpages, start); 724 return get_pages_iovec(i, pages, maxsize, maxpages, start);
721} 725}
722EXPORT_SYMBOL(iov_iter_get_pages); 726EXPORT_SYMBOL(iov_iter_get_pages);
723 727
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index fd814fd61319..cab58bb592d8 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -2,6 +2,7 @@
2#include <linux/mm_types.h> 2#include <linux/mm_types.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include "slab.h"
5#include <linux/kmemcheck.h> 6#include <linux/kmemcheck.h>
6 7
7void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) 8void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
diff --git a/mm/ksm.c b/mm/ksm.c
index fb7590222706..6b2e337bc03c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2310,7 +2310,7 @@ static int __init ksm_init(void)
2310 2310
2311 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); 2311 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
2312 if (IS_ERR(ksm_thread)) { 2312 if (IS_ERR(ksm_thread)) {
2313 printk(KERN_ERR "ksm: creating kthread failed\n"); 2313 pr_err("ksm: creating kthread failed\n");
2314 err = PTR_ERR(ksm_thread); 2314 err = PTR_ERR(ksm_thread);
2315 goto out_free; 2315 goto out_free;
2316 } 2316 }
@@ -2318,7 +2318,7 @@ static int __init ksm_init(void)
2318#ifdef CONFIG_SYSFS 2318#ifdef CONFIG_SYSFS
2319 err = sysfs_create_group(mm_kobj, &ksm_attr_group); 2319 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
2320 if (err) { 2320 if (err) {
2321 printk(KERN_ERR "ksm: register sysfs failed\n"); 2321 pr_err("ksm: register sysfs failed\n");
2322 kthread_stop(ksm_thread); 2322 kthread_stop(ksm_thread);
2323 goto out_free; 2323 goto out_free;
2324 } 2324 }
diff --git a/mm/memblock.c b/mm/memblock.c
index 6d2f219a48b0..6ecb0d937fb5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -192,8 +192,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
192 phys_addr_t align, phys_addr_t start, 192 phys_addr_t align, phys_addr_t start,
193 phys_addr_t end, int nid) 193 phys_addr_t end, int nid)
194{ 194{
195 int ret; 195 phys_addr_t kernel_end, ret;
196 phys_addr_t kernel_end;
197 196
198 /* pump up @end */ 197 /* pump up @end */
199 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 198 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
@@ -817,6 +816,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
817 if (nid != NUMA_NO_NODE && nid != m_nid) 816 if (nid != NUMA_NO_NODE && nid != m_nid)
818 continue; 817 continue;
819 818
819 /* skip hotpluggable memory regions if needed */
820 if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
821 continue;
822
820 if (!type_b) { 823 if (!type_b) {
821 if (out_start) 824 if (out_start)
822 *out_start = m_start; 825 *out_start = m_start;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ec4dcf1b9562..23976fd885fd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,6 +292,9 @@ struct mem_cgroup {
292 /* vmpressure notifications */ 292 /* vmpressure notifications */
293 struct vmpressure vmpressure; 293 struct vmpressure vmpressure;
294 294
295 /* css_online() has been completed */
296 int initialized;
297
295 /* 298 /*
296 * the counter to account for mem+swap usage. 299 * the counter to account for mem+swap usage.
297 */ 300 */
@@ -315,9 +318,6 @@ struct mem_cgroup {
315 /* OOM-Killer disable */ 318 /* OOM-Killer disable */
316 int oom_kill_disable; 319 int oom_kill_disable;
317 320
318 /* set when res.limit == memsw.limit */
319 bool memsw_is_minimum;
320
321 /* protect arrays of thresholds */ 321 /* protect arrays of thresholds */
322 struct mutex thresholds_lock; 322 struct mutex thresholds_lock;
323 323
@@ -481,14 +481,6 @@ enum res_type {
481#define OOM_CONTROL (0) 481#define OOM_CONTROL (0)
482 482
483/* 483/*
484 * Reclaim flags for mem_cgroup_hierarchical_reclaim
485 */
486#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
487#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
488#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
489#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
490
491/*
492 * The memcg_create_mutex will be held whenever a new cgroup is created. 484 * The memcg_create_mutex will be held whenever a new cgroup is created.
493 * As a consequence, any change that needs to protect against new child cgroups 485 * As a consequence, any change that needs to protect against new child cgroups
494 * appearing has to hold it as well. 486 * appearing has to hold it as well.
@@ -646,11 +638,13 @@ int memcg_limited_groups_array_size;
646struct static_key memcg_kmem_enabled_key; 638struct static_key memcg_kmem_enabled_key;
647EXPORT_SYMBOL(memcg_kmem_enabled_key); 639EXPORT_SYMBOL(memcg_kmem_enabled_key);
648 640
641static void memcg_free_cache_id(int id);
642
649static void disarm_kmem_keys(struct mem_cgroup *memcg) 643static void disarm_kmem_keys(struct mem_cgroup *memcg)
650{ 644{
651 if (memcg_kmem_is_active(memcg)) { 645 if (memcg_kmem_is_active(memcg)) {
652 static_key_slow_dec(&memcg_kmem_enabled_key); 646 static_key_slow_dec(&memcg_kmem_enabled_key);
653 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); 647 memcg_free_cache_id(memcg->kmemcg_id);
654 } 648 }
655 /* 649 /*
656 * This check can't live in kmem destruction function, 650 * This check can't live in kmem destruction function,
@@ -1099,10 +1093,21 @@ skip_node:
1099 * skipping css reference should be safe. 1093 * skipping css reference should be safe.
1100 */ 1094 */
1101 if (next_css) { 1095 if (next_css) {
1102 if ((next_css == &root->css) || 1096 struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
1103 ((next_css->flags & CSS_ONLINE) && 1097
1104 css_tryget_online(next_css))) 1098 if (next_css == &root->css)
1105 return mem_cgroup_from_css(next_css); 1099 return memcg;
1100
1101 if (css_tryget_online(next_css)) {
1102 /*
1103 * Make sure the memcg is initialized:
 1104 * mem_cgroup_css_online() orders the
1105 * initialization against setting the flag.
1106 */
1107 if (smp_load_acquire(&memcg->initialized))
1108 return memcg;
1109 css_put(next_css);
1110 }
1106 1111
1107 prev_css = next_css; 1112 prev_css = next_css;
1108 goto skip_node; 1113 goto skip_node;
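The smp_load_acquire() above only pays off because the online path publishes memcg->initialized with release semantics after all of its initialization stores; the iterator must not touch the memcg until it has observed the flag. A hedged sketch of that pairing as it would look inside mm/memcontrol.c (the helper names are illustrative, and the actual release-side store lives in mem_cgroup_css_online(), which is not shown in this hunk):

/* Illustrative writer (css_online) side: make every initialization store
 * visible before the flag itself becomes visible. */
static void my_mark_memcg_initialized(struct mem_cgroup *memcg)
{
        /* ... fully initialize the memcg here ... */
        smp_store_release(&memcg->initialized, 1);
}

/* Illustrative reader (iterator) side: use the memcg only after the acquire
 * load has seen the flag, so the initialization stores are visible too. */
static bool my_memcg_is_usable(struct mem_cgroup *memcg)
{
        return smp_load_acquire(&memcg->initialized) != 0;
}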
@@ -1792,42 +1797,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1792 NULL, "Memory cgroup out of memory"); 1797 NULL, "Memory cgroup out of memory");
1793} 1798}
1794 1799
1795static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1796 gfp_t gfp_mask,
1797 unsigned long flags)
1798{
1799 unsigned long total = 0;
1800 bool noswap = false;
1801 int loop;
1802
1803 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1804 noswap = true;
1805 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1806 noswap = true;
1807
1808 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1809 if (loop)
1810 drain_all_stock_async(memcg);
1811 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1812 /*
1813 * Allow limit shrinkers, which are triggered directly
1814 * by userspace, to catch signals and stop reclaim
1815 * after minimal progress, regardless of the margin.
1816 */
1817 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1818 break;
1819 if (mem_cgroup_margin(memcg))
1820 break;
1821 /*
1822 * If nothing was reclaimed after two attempts, there
1823 * may be no reclaimable pages in this hierarchy.
1824 */
1825 if (loop && !total)
1826 break;
1827 }
1828 return total;
1829}
1830
1831/** 1800/**
1832 * test_mem_cgroup_node_reclaimable 1801 * test_mem_cgroup_node_reclaimable
1833 * @memcg: the target memcg 1802 * @memcg: the target memcg
@@ -2530,25 +2499,29 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2530 struct mem_cgroup *mem_over_limit; 2499 struct mem_cgroup *mem_over_limit;
2531 struct res_counter *fail_res; 2500 struct res_counter *fail_res;
2532 unsigned long nr_reclaimed; 2501 unsigned long nr_reclaimed;
2533 unsigned long flags = 0;
2534 unsigned long long size; 2502 unsigned long long size;
2503 bool may_swap = true;
2504 bool drained = false;
2535 int ret = 0; 2505 int ret = 0;
2536 2506
2507 if (mem_cgroup_is_root(memcg))
2508 goto done;
2537retry: 2509retry:
2538 if (consume_stock(memcg, nr_pages)) 2510 if (consume_stock(memcg, nr_pages))
2539 goto done; 2511 goto done;
2540 2512
2541 size = batch * PAGE_SIZE; 2513 size = batch * PAGE_SIZE;
2542 if (!res_counter_charge(&memcg->res, size, &fail_res)) { 2514 if (!do_swap_account ||
2543 if (!do_swap_account) 2515 !res_counter_charge(&memcg->memsw, size, &fail_res)) {
2544 goto done_restock; 2516 if (!res_counter_charge(&memcg->res, size, &fail_res))
2545 if (!res_counter_charge(&memcg->memsw, size, &fail_res))
2546 goto done_restock; 2517 goto done_restock;
2547 res_counter_uncharge(&memcg->res, size); 2518 if (do_swap_account)
2548 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2519 res_counter_uncharge(&memcg->memsw, size);
2549 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2550 } else
2551 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2520 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2521 } else {
2522 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2523 may_swap = false;
2524 }
2552 2525
2553 if (batch > nr_pages) { 2526 if (batch > nr_pages) {
2554 batch = nr_pages; 2527 batch = nr_pages;
@@ -2572,11 +2545,18 @@ retry:
2572 if (!(gfp_mask & __GFP_WAIT)) 2545 if (!(gfp_mask & __GFP_WAIT))
2573 goto nomem; 2546 goto nomem;
2574 2547
2575 nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2548 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2549 gfp_mask, may_swap);
2576 2550
2577 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2551 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2578 goto retry; 2552 goto retry;
2579 2553
2554 if (!drained) {
2555 drain_all_stock_async(mem_over_limit);
2556 drained = true;
2557 goto retry;
2558 }
2559
2580 if (gfp_mask & __GFP_NORETRY) 2560 if (gfp_mask & __GFP_NORETRY)
2581 goto nomem; 2561 goto nomem;
2582 /* 2562 /*
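
The reworked try_charge() above replaces the MEM_CGROUP_RECLAIM_* flag machinery with a simpler loop: charge, call try_to_free_mem_cgroup_pages() on failure, and drain the per-cpu charge stock at most once before falling through to the no-retry/OOM paths. Below is a minimal userspace sketch of that control flow only; charge_counter(), reclaim_pages() and drain_stock() are made-up stand-ins for the res_counter, vmscan and stock-draining calls, not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

/*
 * Made-up stand-ins for res_counter_charge(), try_to_free_mem_cgroup_pages()
 * and drain_all_stock_async(); they only model the retry structure.
 */
static bool charge_counter(long *usage, long limit, long nr_pages)
{
	if (*usage + nr_pages > limit)
		return false;
	*usage += nr_pages;
	return true;
}

static void reclaim_pages(long *usage, long nr_pages)
{
	*usage -= nr_pages / 2;		/* pretend reclaim freed something */
}

static void drain_stock(long *usage)
{
	*usage -= 4;			/* return cached per-cpu charges */
}

static int try_charge_sketch(long *usage, long limit, long nr_pages, int retries)
{
	bool drained = false;

retry:
	if (charge_counter(usage, limit, nr_pages))
		return 0;			/* charged */

	reclaim_pages(usage, nr_pages);
	if (*usage + nr_pages <= limit)		/* margin is back, try again */
		goto retry;

	if (!drained) {				/* drain the stock once, then retry */
		drain_stock(usage);
		drained = true;
		goto retry;
	}

	if (retries-- > 0)
		goto retry;
	return -1;				/* -ENOMEM in the real code */
}

int main(void)
{
	long usage = 90, limit = 100;

	printf("charge: %d, usage now %ld\n",
	       try_charge_sketch(&usage, limit, 16, 3), usage);
	return 0;
}
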
@@ -2611,9 +2591,7 @@ nomem:
2611 if (!(gfp_mask & __GFP_NOFAIL)) 2591 if (!(gfp_mask & __GFP_NOFAIL))
2612 return -ENOMEM; 2592 return -ENOMEM;
2613bypass: 2593bypass:
2614 memcg = root_mem_cgroup; 2594 return -EINTR;
2615 ret = -EINTR;
2616 goto retry;
2617 2595
2618done_restock: 2596done_restock:
2619 if (batch > nr_pages) 2597 if (batch > nr_pages)
@@ -2626,6 +2604,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2626{ 2604{
2627 unsigned long bytes = nr_pages * PAGE_SIZE; 2605 unsigned long bytes = nr_pages * PAGE_SIZE;
2628 2606
2607 if (mem_cgroup_is_root(memcg))
2608 return;
2609
2629 res_counter_uncharge(&memcg->res, bytes); 2610 res_counter_uncharge(&memcg->res, bytes);
2630 if (do_swap_account) 2611 if (do_swap_account)
2631 res_counter_uncharge(&memcg->memsw, bytes); 2612 res_counter_uncharge(&memcg->memsw, bytes);
@@ -2640,6 +2621,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2640{ 2621{
2641 unsigned long bytes = nr_pages * PAGE_SIZE; 2622 unsigned long bytes = nr_pages * PAGE_SIZE;
2642 2623
2624 if (mem_cgroup_is_root(memcg))
2625 return;
2626
2643 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2627 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2644 if (do_swap_account) 2628 if (do_swap_account)
2645 res_counter_uncharge_until(&memcg->memsw, 2629 res_counter_uncharge_until(&memcg->memsw,
@@ -2778,12 +2762,6 @@ static DEFINE_MUTEX(memcg_slab_mutex);
2778 2762
2779static DEFINE_MUTEX(activate_kmem_mutex); 2763static DEFINE_MUTEX(activate_kmem_mutex);
2780 2764
2781static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2782{
2783 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2784 memcg_kmem_is_active(memcg);
2785}
2786
2787/* 2765/*
2788 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2766 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2789 * in the memcg_cache_params struct. 2767 * in the memcg_cache_params struct.
@@ -2803,7 +2781,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2803 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 2781 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2804 struct memcg_cache_params *params; 2782 struct memcg_cache_params *params;
2805 2783
2806 if (!memcg_can_account_kmem(memcg)) 2784 if (!memcg_kmem_is_active(memcg))
2807 return -EIO; 2785 return -EIO;
2808 2786
2809 print_slabinfo_header(m); 2787 print_slabinfo_header(m);
@@ -2886,19 +2864,44 @@ int memcg_cache_id(struct mem_cgroup *memcg)
2886 return memcg ? memcg->kmemcg_id : -1; 2864 return memcg ? memcg->kmemcg_id : -1;
2887} 2865}
2888 2866
2889static size_t memcg_caches_array_size(int num_groups) 2867static int memcg_alloc_cache_id(void)
2890{ 2868{
2891 ssize_t size; 2869 int id, size;
2892 if (num_groups <= 0) 2870 int err;
2893 return 0; 2871
2872 id = ida_simple_get(&kmem_limited_groups,
2873 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2874 if (id < 0)
2875 return id;
2894 2876
2895 size = 2 * num_groups; 2877 if (id < memcg_limited_groups_array_size)
2878 return id;
2879
2880 /*
2881 * There's no space for the new id in memcg_caches arrays,
2882 * so we have to grow them.
2883 */
2884
2885 size = 2 * (id + 1);
2896 if (size < MEMCG_CACHES_MIN_SIZE) 2886 if (size < MEMCG_CACHES_MIN_SIZE)
2897 size = MEMCG_CACHES_MIN_SIZE; 2887 size = MEMCG_CACHES_MIN_SIZE;
2898 else if (size > MEMCG_CACHES_MAX_SIZE) 2888 else if (size > MEMCG_CACHES_MAX_SIZE)
2899 size = MEMCG_CACHES_MAX_SIZE; 2889 size = MEMCG_CACHES_MAX_SIZE;
2900 2890
2901 return size; 2891 mutex_lock(&memcg_slab_mutex);
2892 err = memcg_update_all_caches(size);
2893 mutex_unlock(&memcg_slab_mutex);
2894
2895 if (err) {
2896 ida_simple_remove(&kmem_limited_groups, id);
2897 return err;
2898 }
2899 return id;
2900}
2901
2902static void memcg_free_cache_id(int id)
2903{
2904 ida_simple_remove(&kmem_limited_groups, id);
2902} 2905}
2903 2906
2904/* 2907/*
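
memcg_alloc_cache_id() folds the id allocation and the memcg_caches array growth into one helper: take an id from the IDA, and only when it falls outside the current array size, grow the arrays (clamped between the MIN/MAX constants) under memcg_slab_mutex, releasing the id again if that fails. Below is a rough userspace model of the same allocate-then-grow-on-demand pattern; the bitmap and realloc'd array are illustrative stand-ins for the kernel IDA and memcg_caches, not the real data structures.

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define MAX_IDS   64
#define MIN_SIZE  4

static unsigned long long id_bitmap;	/* stands in for the kernel IDA */
static void **cache_array;		/* stands in for memcg_caches[] */
static int array_size;

static int grow_array(int new_size)
{
	void **bigger = calloc(new_size, sizeof(*bigger));

	if (!bigger)
		return -1;
	if (cache_array) {
		memcpy(bigger, cache_array, array_size * sizeof(*bigger));
		free(cache_array);
	}
	cache_array = bigger;
	array_size = new_size;
	return 0;
}

static int alloc_cache_id(void)
{
	int id, size;

	for (id = 0; id < MAX_IDS; id++)	/* ida_simple_get() equivalent */
		if (!(id_bitmap & (1ULL << id)))
			break;
	if (id == MAX_IDS)
		return -1;
	id_bitmap |= 1ULL << id;

	if (id < array_size)
		return id;			/* array already big enough */

	size = 2 * (id + 1);			/* grow, clamped like the kernel code */
	if (size < MIN_SIZE)
		size = MIN_SIZE;
	if (size > MAX_IDS)
		size = MAX_IDS;

	if (grow_array(size)) {
		id_bitmap &= ~(1ULL << id);	/* undo the id on failure */
		return -1;
	}
	return id;
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		printf("id %d (array size %d)\n", alloc_cache_id(), array_size);
	return 0;
}
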
@@ -2908,97 +2911,7 @@ static size_t memcg_caches_array_size(int num_groups)
2908 */ 2911 */
2909void memcg_update_array_size(int num) 2912void memcg_update_array_size(int num)
2910{ 2913{
2911 if (num > memcg_limited_groups_array_size) 2914 memcg_limited_groups_array_size = num;
2912 memcg_limited_groups_array_size = memcg_caches_array_size(num);
2913}
2914
2915int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
2916{
2917 struct memcg_cache_params *cur_params = s->memcg_params;
2918
2919 VM_BUG_ON(!is_root_cache(s));
2920
2921 if (num_groups > memcg_limited_groups_array_size) {
2922 int i;
2923 struct memcg_cache_params *new_params;
2924 ssize_t size = memcg_caches_array_size(num_groups);
2925
2926 size *= sizeof(void *);
2927 size += offsetof(struct memcg_cache_params, memcg_caches);
2928
2929 new_params = kzalloc(size, GFP_KERNEL);
2930 if (!new_params)
2931 return -ENOMEM;
2932
2933 new_params->is_root_cache = true;
2934
2935 /*
2936 * There is the chance it will be bigger than
2937 * memcg_limited_groups_array_size, if we failed an allocation
2938 * in a cache, in which case all caches updated before it, will
2939 * have a bigger array.
2940 *
2941 * But if that is the case, the data after
2942 * memcg_limited_groups_array_size is certainly unused
2943 */
2944 for (i = 0; i < memcg_limited_groups_array_size; i++) {
2945 if (!cur_params->memcg_caches[i])
2946 continue;
2947 new_params->memcg_caches[i] =
2948 cur_params->memcg_caches[i];
2949 }
2950
2951 /*
2952 * Ideally, we would wait until all caches succeed, and only
2953 * then free the old one. But this is not worth the extra
2954 * pointer per-cache we'd have to have for this.
2955 *
2956 * It is not a big deal if some caches are left with a size
2957 * bigger than the others. And all updates will reset this
2958 * anyway.
2959 */
2960 rcu_assign_pointer(s->memcg_params, new_params);
2961 if (cur_params)
2962 kfree_rcu(cur_params, rcu_head);
2963 }
2964 return 0;
2965}
2966
2967int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
2968 struct kmem_cache *root_cache)
2969{
2970 size_t size;
2971
2972 if (!memcg_kmem_enabled())
2973 return 0;
2974
2975 if (!memcg) {
2976 size = offsetof(struct memcg_cache_params, memcg_caches);
2977 size += memcg_limited_groups_array_size * sizeof(void *);
2978 } else
2979 size = sizeof(struct memcg_cache_params);
2980
2981 s->memcg_params = kzalloc(size, GFP_KERNEL);
2982 if (!s->memcg_params)
2983 return -ENOMEM;
2984
2985 if (memcg) {
2986 s->memcg_params->memcg = memcg;
2987 s->memcg_params->root_cache = root_cache;
2988 css_get(&memcg->css);
2989 } else
2990 s->memcg_params->is_root_cache = true;
2991
2992 return 0;
2993}
2994
2995void memcg_free_cache_params(struct kmem_cache *s)
2996{
2997 if (!s->memcg_params)
2998 return;
2999 if (!s->memcg_params->is_root_cache)
3000 css_put(&s->memcg_params->memcg->css);
3001 kfree(s->memcg_params);
3002} 2915}
3003 2916
3004static void memcg_register_cache(struct mem_cgroup *memcg, 2917static void memcg_register_cache(struct mem_cgroup *memcg,
@@ -3031,6 +2944,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
3031 if (!cachep) 2944 if (!cachep)
3032 return; 2945 return;
3033 2946
2947 css_get(&memcg->css);
3034 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 2948 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3035 2949
3036 /* 2950 /*
@@ -3064,6 +2978,9 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
3064 list_del(&cachep->memcg_params->list); 2978 list_del(&cachep->memcg_params->list);
3065 2979
3066 kmem_cache_destroy(cachep); 2980 kmem_cache_destroy(cachep);
2981
2982 /* drop the reference taken in memcg_register_cache */
2983 css_put(&memcg->css);
3067} 2984}
3068 2985
3069/* 2986/*
@@ -3241,7 +3158,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3241 rcu_read_lock(); 3158 rcu_read_lock();
3242 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3159 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3243 3160
3244 if (!memcg_can_account_kmem(memcg)) 3161 if (!memcg_kmem_is_active(memcg))
3245 goto out; 3162 goto out;
3246 3163
3247 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 3164 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
@@ -3326,7 +3243,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3326 3243
3327 memcg = get_mem_cgroup_from_mm(current->mm); 3244 memcg = get_mem_cgroup_from_mm(current->mm);
3328 3245
3329 if (!memcg_can_account_kmem(memcg)) { 3246 if (!memcg_kmem_is_active(memcg)) {
3330 css_put(&memcg->css); 3247 css_put(&memcg->css);
3331 return true; 3248 return true;
3332 } 3249 }
@@ -3668,7 +3585,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3668 unsigned long long val) 3585 unsigned long long val)
3669{ 3586{
3670 int retry_count; 3587 int retry_count;
3671 u64 memswlimit, memlimit;
3672 int ret = 0; 3588 int ret = 0;
3673 int children = mem_cgroup_count_children(memcg); 3589 int children = mem_cgroup_count_children(memcg);
3674 u64 curusage, oldusage; 3590 u64 curusage, oldusage;
@@ -3695,31 +3611,23 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3695 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3611 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3696 */ 3612 */
3697 mutex_lock(&set_limit_mutex); 3613 mutex_lock(&set_limit_mutex);
3698 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3614 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
3699 if (memswlimit < val) {
3700 ret = -EINVAL; 3615 ret = -EINVAL;
3701 mutex_unlock(&set_limit_mutex); 3616 mutex_unlock(&set_limit_mutex);
3702 break; 3617 break;
3703 } 3618 }
3704 3619
3705 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3620 if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val)
3706 if (memlimit < val)
3707 enlarge = 1; 3621 enlarge = 1;
3708 3622
3709 ret = res_counter_set_limit(&memcg->res, val); 3623 ret = res_counter_set_limit(&memcg->res, val);
3710 if (!ret) {
3711 if (memswlimit == val)
3712 memcg->memsw_is_minimum = true;
3713 else
3714 memcg->memsw_is_minimum = false;
3715 }
3716 mutex_unlock(&set_limit_mutex); 3624 mutex_unlock(&set_limit_mutex);
3717 3625
3718 if (!ret) 3626 if (!ret)
3719 break; 3627 break;
3720 3628
3721 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3629 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3722 MEM_CGROUP_RECLAIM_SHRINK); 3630
3723 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3631 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3724 /* Usage is reduced ? */ 3632 /* Usage is reduced ? */
3725 if (curusage >= oldusage) 3633 if (curusage >= oldusage)
@@ -3737,7 +3645,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3737 unsigned long long val) 3645 unsigned long long val)
3738{ 3646{
3739 int retry_count; 3647 int retry_count;
3740 u64 memlimit, memswlimit, oldusage, curusage; 3648 u64 oldusage, curusage;
3741 int children = mem_cgroup_count_children(memcg); 3649 int children = mem_cgroup_count_children(memcg);
3742 int ret = -EBUSY; 3650 int ret = -EBUSY;
3743 int enlarge = 0; 3651 int enlarge = 0;
@@ -3756,30 +3664,21 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3756 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3664 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3757 */ 3665 */
3758 mutex_lock(&set_limit_mutex); 3666 mutex_lock(&set_limit_mutex);
3759 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3667 if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
3760 if (memlimit > val) {
3761 ret = -EINVAL; 3668 ret = -EINVAL;
3762 mutex_unlock(&set_limit_mutex); 3669 mutex_unlock(&set_limit_mutex);
3763 break; 3670 break;
3764 } 3671 }
3765 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3672 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val)
3766 if (memswlimit < val)
3767 enlarge = 1; 3673 enlarge = 1;
3768 ret = res_counter_set_limit(&memcg->memsw, val); 3674 ret = res_counter_set_limit(&memcg->memsw, val);
3769 if (!ret) {
3770 if (memlimit == val)
3771 memcg->memsw_is_minimum = true;
3772 else
3773 memcg->memsw_is_minimum = false;
3774 }
3775 mutex_unlock(&set_limit_mutex); 3675 mutex_unlock(&set_limit_mutex);
3776 3676
3777 if (!ret) 3677 if (!ret)
3778 break; 3678 break;
3779 3679
3780 mem_cgroup_reclaim(memcg, GFP_KERNEL, 3680 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3781 MEM_CGROUP_RECLAIM_NOSWAP | 3681
3782 MEM_CGROUP_RECLAIM_SHRINK);
3783 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3682 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3784 /* Usage is reduced ? */ 3683 /* Usage is reduced ? */
3785 if (curusage >= oldusage) 3684 if (curusage >= oldusage)
@@ -4028,8 +3927,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4028 if (signal_pending(current)) 3927 if (signal_pending(current))
4029 return -EINTR; 3928 return -EINTR;
4030 3929
4031 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 3930 progress = try_to_free_mem_cgroup_pages(memcg, 1,
4032 false); 3931 GFP_KERNEL, true);
4033 if (!progress) { 3932 if (!progress) {
4034 nr_retries--; 3933 nr_retries--;
4035 /* maybe some writeback is necessary */ 3934 /* maybe some writeback is necessary */
@@ -4093,6 +3992,46 @@ out:
4093 return retval; 3992 return retval;
4094} 3993}
4095 3994
3995static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
3996 enum mem_cgroup_stat_index idx)
3997{
3998 struct mem_cgroup *iter;
3999 long val = 0;
4000
4001 /* Per-cpu values can be negative, use a signed accumulator */
4002 for_each_mem_cgroup_tree(iter, memcg)
4003 val += mem_cgroup_read_stat(iter, idx);
4004
4005 if (val < 0) /* race ? */
4006 val = 0;
4007 return val;
4008}
4009
4010static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4011{
4012 u64 val;
4013
4014 if (!mem_cgroup_is_root(memcg)) {
4015 if (!swap)
4016 return res_counter_read_u64(&memcg->res, RES_USAGE);
4017 else
4018 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4019 }
4020
4021 /*
4022 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
4023 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
4024 */
4025 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4026 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4027
4028 if (swap)
4029 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4030
4031 return val << PAGE_SHIFT;
4032}
4033
4034
4096static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 4035static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4097 struct cftype *cft) 4036 struct cftype *cft)
4098{ 4037{
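
Since the root memcg is no longer charged to its res_counters, mem_cgroup_usage() has to rebuild the root's usage from statistics: sum CACHE + RSS (plus SWAP for memsw) across the hierarchy and clamp a transiently negative per-cpu sum to zero. A small sketch of that accumulate-and-clamp step follows; the table of per-child signed counters is hypothetical and stands in for the per-cpu statistics.

#include <stdio.h>

enum stat { STAT_CACHE, STAT_RSS, STAT_SWAP, NR_STATS };

/*
 * Hypothetical per-child signed counters; per-cpu deltas in the kernel can
 * leave an individual sum temporarily negative.
 */
static long stats[3][NR_STATS] = {
	{ 10, 40, 5 },
	{ -2, 12, 0 },		/* negative: uncharges seen before charges */
	{  7,  3, 1 },
};

static unsigned long recursive_stat(enum stat idx)
{
	long val = 0;

	for (int child = 0; child < 3; child++)
		val += stats[child][idx];
	if (val < 0)		/* clamp racy negative sums, as the kernel does */
		val = 0;
	return val;
}

static unsigned long long usage_in_bytes(int swap, unsigned page_size)
{
	unsigned long pages = recursive_stat(STAT_CACHE) + recursive_stat(STAT_RSS);

	if (swap)
		pages += recursive_stat(STAT_SWAP);
	return (unsigned long long)pages * page_size;
}

int main(void)
{
	printf("mem usage:   %llu\n", usage_in_bytes(0, 4096));
	printf("memsw usage: %llu\n", usage_in_bytes(1, 4096));
	return 0;
}
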
@@ -4102,8 +4041,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4102 4041
4103 switch (type) { 4042 switch (type) {
4104 case _MEM: 4043 case _MEM:
4044 if (name == RES_USAGE)
4045 return mem_cgroup_usage(memcg, false);
4105 return res_counter_read_u64(&memcg->res, name); 4046 return res_counter_read_u64(&memcg->res, name);
4106 case _MEMSWAP: 4047 case _MEMSWAP:
4048 if (name == RES_USAGE)
4049 return mem_cgroup_usage(memcg, true);
4107 return res_counter_read_u64(&memcg->memsw, name); 4050 return res_counter_read_u64(&memcg->memsw, name);
4108 case _KMEM: 4051 case _KMEM:
4109 return res_counter_read_u64(&memcg->kmem, name); 4052 return res_counter_read_u64(&memcg->kmem, name);
@@ -4150,23 +4093,12 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4150 if (err) 4093 if (err)
4151 goto out; 4094 goto out;
4152 4095
4153 memcg_id = ida_simple_get(&kmem_limited_groups, 4096 memcg_id = memcg_alloc_cache_id();
4154 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
4155 if (memcg_id < 0) { 4097 if (memcg_id < 0) {
4156 err = memcg_id; 4098 err = memcg_id;
4157 goto out; 4099 goto out;
4158 } 4100 }
4159 4101
4160 /*
4161 * Make sure we have enough space for this cgroup in each root cache's
4162 * memcg_params.
4163 */
4164 mutex_lock(&memcg_slab_mutex);
4165 err = memcg_update_all_caches(memcg_id + 1);
4166 mutex_unlock(&memcg_slab_mutex);
4167 if (err)
4168 goto out_rmid;
4169
4170 memcg->kmemcg_id = memcg_id; 4102 memcg->kmemcg_id = memcg_id;
4171 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 4103 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
4172 4104
@@ -4187,10 +4119,6 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4187out: 4119out:
4188 memcg_resume_kmem_account(); 4120 memcg_resume_kmem_account();
4189 return err; 4121 return err;
4190
4191out_rmid:
4192 ida_simple_remove(&kmem_limited_groups, memcg_id);
4193 goto out;
4194} 4122}
4195 4123
4196static int memcg_activate_kmem(struct mem_cgroup *memcg, 4124static int memcg_activate_kmem(struct mem_cgroup *memcg,
@@ -4572,10 +4500,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4572 if (!t) 4500 if (!t)
4573 goto unlock; 4501 goto unlock;
4574 4502
4575 if (!swap) 4503 usage = mem_cgroup_usage(memcg, swap);
4576 usage = res_counter_read_u64(&memcg->res, RES_USAGE);
4577 else
4578 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4579 4504
4580 /* 4505 /*
4581 * current_threshold points to threshold just below or equal to usage. 4506 * current_threshold points to threshold just below or equal to usage.
@@ -4673,10 +4598,10 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4673 4598
4674 if (type == _MEM) { 4599 if (type == _MEM) {
4675 thresholds = &memcg->thresholds; 4600 thresholds = &memcg->thresholds;
4676 usage = res_counter_read_u64(&memcg->res, RES_USAGE); 4601 usage = mem_cgroup_usage(memcg, false);
4677 } else if (type == _MEMSWAP) { 4602 } else if (type == _MEMSWAP) {
4678 thresholds = &memcg->memsw_thresholds; 4603 thresholds = &memcg->memsw_thresholds;
4679 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4604 usage = mem_cgroup_usage(memcg, true);
4680 } else 4605 } else
4681 BUG(); 4606 BUG();
4682 4607
@@ -4762,10 +4687,10 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4762 4687
4763 if (type == _MEM) { 4688 if (type == _MEM) {
4764 thresholds = &memcg->thresholds; 4689 thresholds = &memcg->thresholds;
4765 usage = res_counter_read_u64(&memcg->res, RES_USAGE); 4690 usage = mem_cgroup_usage(memcg, false);
4766 } else if (type == _MEMSWAP) { 4691 } else if (type == _MEMSWAP) {
4767 thresholds = &memcg->memsw_thresholds; 4692 thresholds = &memcg->memsw_thresholds;
4768 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4693 usage = mem_cgroup_usage(memcg, true);
4769 } else 4694 } else
4770 BUG(); 4695 BUG();
4771 4696
@@ -5502,6 +5427,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5502{ 5427{
5503 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5428 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5504 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); 5429 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
5430 int ret;
5505 5431
5506 if (css->id > MEM_CGROUP_ID_MAX) 5432 if (css->id > MEM_CGROUP_ID_MAX)
5507 return -ENOSPC; 5433 return -ENOSPC;
@@ -5525,9 +5451,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5525 * core guarantees its existence. 5451 * core guarantees its existence.
5526 */ 5452 */
5527 } else { 5453 } else {
5528 res_counter_init(&memcg->res, &root_mem_cgroup->res); 5454 res_counter_init(&memcg->res, NULL);
5529 res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); 5455 res_counter_init(&memcg->memsw, NULL);
5530 res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); 5456 res_counter_init(&memcg->kmem, NULL);
5531 /* 5457 /*
5532 * Deeper hierarchy with use_hierarchy == false doesn't make 5458
5533 * much sense so let cgroup subsystem know about this 5459 * much sense so let cgroup subsystem know about this
@@ -5538,7 +5464,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5538 } 5464 }
5539 mutex_unlock(&memcg_create_mutex); 5465 mutex_unlock(&memcg_create_mutex);
5540 5466
5541 return memcg_init_kmem(memcg, &memory_cgrp_subsys); 5467 ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
5468 if (ret)
5469 return ret;
5470
5471 /*
5472 * Make sure the memcg is initialized: mem_cgroup_iter()
5473 * orders reading memcg->initialized against its callers
5474 * reading the memcg members.
5475 */
5476 smp_store_release(&memcg->initialized, 1);
5477
5478 return 0;
5542} 5479}
5543 5480
5544/* 5481/*
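
The css_online change publishes memcg->initialized with smp_store_release() only after all fields are set up, so mem_cgroup_iter() can pair it with an acquire read and skip groups that are not fully initialized yet. The same publish/observe idiom is shown below in portable C11 atomics, with a hypothetical struct playing the role of the memcg.

#include <stdatomic.h>
#include <stddef.h>

struct group {
	long limit;			/* fields set up before publication */
	long usage;
	atomic_int initialized;		/* plays the role of memcg->initialized */
};

/*
 * Writer: fully initialize, then publish with release semantics so the
 * stores above cannot be reordered past the flag.
 */
static void group_online(struct group *g)
{
	g->limit = 100;
	g->usage = 0;
	atomic_store_explicit(&g->initialized, 1, memory_order_release);
}

/*
 * Reader: acquire-load the flag; zero means "pretend it isn't there yet",
 * mirroring how mem_cgroup_iter() skips uninitialized groups.
 */
static struct group *group_if_ready(struct group *g)
{
	if (!atomic_load_explicit(&g->initialized, memory_order_acquire))
		return NULL;
	return g;
}

int main(void)
{
	static struct group g;

	group_online(&g);
	return group_if_ready(&g) ? 0 : 1;
}
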
@@ -5969,8 +5906,9 @@ static void __mem_cgroup_clear_mc(void)
5969 /* we must fixup refcnts and charges */ 5906 /* we must fixup refcnts and charges */
5970 if (mc.moved_swap) { 5907 if (mc.moved_swap) {
5971 /* uncharge swap account from the old cgroup */ 5908 /* uncharge swap account from the old cgroup */
5972 res_counter_uncharge(&mc.from->memsw, 5909 if (!mem_cgroup_is_root(mc.from))
5973 PAGE_SIZE * mc.moved_swap); 5910 res_counter_uncharge(&mc.from->memsw,
5911 PAGE_SIZE * mc.moved_swap);
5974 5912
5975 for (i = 0; i < mc.moved_swap; i++) 5913 for (i = 0; i < mc.moved_swap; i++)
5976 css_put(&mc.from->css); 5914 css_put(&mc.from->css);
@@ -5979,8 +5917,9 @@ static void __mem_cgroup_clear_mc(void)
5979 * we charged both to->res and to->memsw, so we should 5917 * we charged both to->res and to->memsw, so we should
5980 * uncharge to->res. 5918 * uncharge to->res.
5981 */ 5919 */
5982 res_counter_uncharge(&mc.to->res, 5920 if (!mem_cgroup_is_root(mc.to))
5983 PAGE_SIZE * mc.moved_swap); 5921 res_counter_uncharge(&mc.to->res,
5922 PAGE_SIZE * mc.moved_swap);
5984 /* we've already done css_get(mc.to) */ 5923 /* we've already done css_get(mc.to) */
5985 mc.moved_swap = 0; 5924 mc.moved_swap = 0;
5986 } 5925 }
@@ -6345,7 +6284,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
6345 rcu_read_lock(); 6284 rcu_read_lock();
6346 memcg = mem_cgroup_lookup(id); 6285 memcg = mem_cgroup_lookup(id);
6347 if (memcg) { 6286 if (memcg) {
6348 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 6287 if (!mem_cgroup_is_root(memcg))
6288 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
6349 mem_cgroup_swap_statistics(memcg, false); 6289 mem_cgroup_swap_statistics(memcg, false);
6350 css_put(&memcg->css); 6290 css_put(&memcg->css);
6351 } 6291 }
@@ -6509,12 +6449,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
6509{ 6449{
6510 unsigned long flags; 6450 unsigned long flags;
6511 6451
6512 if (nr_mem) 6452 if (!mem_cgroup_is_root(memcg)) {
6513 res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); 6453 if (nr_mem)
6514 if (nr_memsw) 6454 res_counter_uncharge(&memcg->res,
6515 res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); 6455 nr_mem * PAGE_SIZE);
6516 6456 if (nr_memsw)
6517 memcg_oom_recover(memcg); 6457 res_counter_uncharge(&memcg->memsw,
6458 nr_memsw * PAGE_SIZE);
6459 memcg_oom_recover(memcg);
6460 }
6518 6461
6519 local_irq_save(flags); 6462 local_irq_save(flags);
6520 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 6463 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 44c6bd201d3a..8639f6b28746 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -148,7 +148,7 @@ static int hwpoison_filter_task(struct page *p)
148 ino = cgroup_ino(css->cgroup); 148 ino = cgroup_ino(css->cgroup);
149 css_put(css); 149 css_put(css);
150 150
151 if (!ino || ino != hwpoison_filter_memcg) 151 if (ino != hwpoison_filter_memcg)
152 return -EINVAL; 152 return -EINVAL;
153 153
154 return 0; 154 return 0;
diff --git a/mm/memory.c b/mm/memory.c
index ab3537bcfed2..e229970e4223 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -118,6 +118,8 @@ __setup("norandmaps", disable_randmaps);
118unsigned long zero_pfn __read_mostly; 118unsigned long zero_pfn __read_mostly;
119unsigned long highest_memmap_pfn __read_mostly; 119unsigned long highest_memmap_pfn __read_mostly;
120 120
121EXPORT_SYMBOL(zero_pfn);
122
121/* 123/*
122 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() 124 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
123 */ 125 */
@@ -751,7 +753,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
751 unsigned long pfn = pte_pfn(pte); 753 unsigned long pfn = pte_pfn(pte);
752 754
753 if (HAVE_PTE_SPECIAL) { 755 if (HAVE_PTE_SPECIAL) {
754 if (likely(!pte_special(pte) || pte_numa(pte))) 756 if (likely(!pte_special(pte)))
755 goto check_pfn; 757 goto check_pfn;
756 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) 758 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
757 return NULL; 759 return NULL;
@@ -777,15 +779,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
777 } 779 }
778 } 780 }
779 781
782 if (is_zero_pfn(pfn))
783 return NULL;
780check_pfn: 784check_pfn:
781 if (unlikely(pfn > highest_memmap_pfn)) { 785 if (unlikely(pfn > highest_memmap_pfn)) {
782 print_bad_pte(vma, addr, pte, NULL); 786 print_bad_pte(vma, addr, pte, NULL);
783 return NULL; 787 return NULL;
784 } 788 }
785 789
786 if (is_zero_pfn(pfn))
787 return NULL;
788
789 /* 790 /*
790 * NOTE! We still have PageReserved() pages in the page tables. 791 * NOTE! We still have PageReserved() pages in the page tables.
791 * eg. VDSO mappings can cause them to exist. 792 * eg. VDSO mappings can cause them to exist.
@@ -1126,7 +1127,7 @@ again:
1126 addr) != page->index) { 1127 addr) != page->index) {
1127 pte_t ptfile = pgoff_to_pte(page->index); 1128 pte_t ptfile = pgoff_to_pte(page->index);
1128 if (pte_soft_dirty(ptent)) 1129 if (pte_soft_dirty(ptent))
1129 pte_file_mksoft_dirty(ptfile); 1130 ptfile = pte_file_mksoft_dirty(ptfile);
1130 set_pte_at(mm, addr, pte, ptfile); 1131 set_pte_at(mm, addr, pte, ptfile);
1131 } 1132 }
1132 if (PageAnon(page)) 1133 if (PageAnon(page))
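
The soft-dirty fix in the hunk just above is a classic return-value bug: pte_file_mksoft_dirty() returns a new pte value rather than modifying its argument, so calling it without assigning the result silently dropped the soft-dirty bit. A tiny illustration of the pattern follows, with a made-up flag type in place of pte_t.

#include <assert.h>

typedef unsigned long pteval_t;
#define SOFT_DIRTY (1UL << 3)

/* Pure helper: returns a modified copy, does not touch the argument. */
static pteval_t mksoft_dirty(pteval_t pte)
{
	return pte | SOFT_DIRTY;
}

int main(void)
{
	pteval_t pte = 0;

	/* Buggy form: result discarded, flag lost. */
	mksoft_dirty(pte);
	assert(!(pte & SOFT_DIRTY));

	/* Fixed form, as in the patch: assign the returned value. */
	pte = mksoft_dirty(pte);
	assert(pte & SOFT_DIRTY);
	return 0;
}
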
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2ff8c2325e96..29d8693d0c61 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1307,7 +1307,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
1307/* 1307/*
1308 * Confirm all pages in a range [start, end) belong to the same zone. 1308
1309 */ 1309 */
1310static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 1310int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
1311{ 1311{
1312 unsigned long pfn; 1312 unsigned long pfn;
1313 struct zone *zone = NULL; 1313 struct zone *zone = NULL;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8f5330d74f47..e58725aff7e9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -123,25 +123,23 @@ static struct mempolicy default_policy = {
123 123
124static struct mempolicy preferred_node_policy[MAX_NUMNODES]; 124static struct mempolicy preferred_node_policy[MAX_NUMNODES];
125 125
126static struct mempolicy *get_task_policy(struct task_struct *p) 126struct mempolicy *get_task_policy(struct task_struct *p)
127{ 127{
128 struct mempolicy *pol = p->mempolicy; 128 struct mempolicy *pol = p->mempolicy;
129 int node;
129 130
130 if (!pol) { 131 if (pol)
131 int node = numa_node_id(); 132 return pol;
132 133
133 if (node != NUMA_NO_NODE) { 134 node = numa_node_id();
134 pol = &preferred_node_policy[node]; 135 if (node != NUMA_NO_NODE) {
135 /* 136 pol = &preferred_node_policy[node];
136 * preferred_node_policy is not initialised early in 137 /* preferred_node_policy is not initialised early in boot */
137 * boot 138 if (pol->mode)
138 */ 139 return pol;
139 if (!pol->mode)
140 pol = NULL;
141 }
142 } 140 }
143 141
144 return pol; 142 return &default_policy;
145} 143}
146 144
147static const struct mempolicy_operations { 145static const struct mempolicy_operations {
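
get_task_policy() now always hands back something usable: the task's own policy if one is set, otherwise the boot-time preferred-node policy for the local node once that table is initialised, otherwise &default_policy. A compact sketch of that fallback chain follows; the policy and task structs here are hypothetical simplifications, not the kernel definitions.

#include <stdio.h>
#include <stddef.h>

#define MAX_NODES  4
#define NO_NODE   -1

struct policy { int mode; };		/* mode == 0 means "not initialised" */

static struct policy default_policy = { .mode = 1 };
static struct policy per_node_policy[MAX_NODES];	/* filled in later in boot */

struct task { struct policy *mempolicy; };

static struct policy *get_task_policy(struct task *t, int node)
{
	struct policy *pol = t->mempolicy;

	if (pol)
		return pol;			/* explicit per-task policy wins */

	if (node != NO_NODE) {
		pol = &per_node_policy[node];
		if (pol->mode)			/* table may not be set up early in boot */
			return pol;
	}
	return &default_policy;			/* never return NULL */
}

int main(void)
{
	struct task t = { .mempolicy = NULL };

	printf("%s\n", get_task_policy(&t, 2) == &default_policy ?
	       "default policy" : "per-node policy");
	per_node_policy[2].mode = 1;
	printf("%s\n", get_task_policy(&t, 2) == &per_node_policy[2] ?
	       "per-node policy" : "default policy");
	return 0;
}
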
@@ -683,7 +681,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
683 } 681 }
684 682
685 if (flags & MPOL_MF_LAZY) { 683 if (flags & MPOL_MF_LAZY) {
686 change_prot_numa(vma, start, endvma); 684 /* Similar to task_numa_work, skip inaccessible VMAs */
685 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
686 change_prot_numa(vma, start, endvma);
687 goto next; 687 goto next;
688 } 688 }
689 689
@@ -804,7 +804,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
804 nodemask_t *nodes) 804 nodemask_t *nodes)
805{ 805{
806 struct mempolicy *new, *old; 806 struct mempolicy *new, *old;
807 struct mm_struct *mm = current->mm;
808 NODEMASK_SCRATCH(scratch); 807 NODEMASK_SCRATCH(scratch);
809 int ret; 808 int ret;
810 809
@@ -816,20 +815,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
816 ret = PTR_ERR(new); 815 ret = PTR_ERR(new);
817 goto out; 816 goto out;
818 } 817 }
819 /* 818
820 * prevent changing our mempolicy while show_numa_maps()
821 * is using it.
822 * Note: do_set_mempolicy() can be called at init time
823 * with no 'mm'.
824 */
825 if (mm)
826 down_write(&mm->mmap_sem);
827 task_lock(current); 819 task_lock(current);
828 ret = mpol_set_nodemask(new, nodes, scratch); 820 ret = mpol_set_nodemask(new, nodes, scratch);
829 if (ret) { 821 if (ret) {
830 task_unlock(current); 822 task_unlock(current);
831 if (mm)
832 up_write(&mm->mmap_sem);
833 mpol_put(new); 823 mpol_put(new);
834 goto out; 824 goto out;
835 } 825 }
@@ -839,9 +829,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
839 nodes_weight(new->v.nodes)) 829 nodes_weight(new->v.nodes))
840 current->il_next = first_node(new->v.nodes); 830 current->il_next = first_node(new->v.nodes);
841 task_unlock(current); 831 task_unlock(current);
842 if (mm)
843 up_write(&mm->mmap_sem);
844
845 mpol_put(old); 832 mpol_put(old);
846 ret = 0; 833 ret = 0;
847out: 834out:
@@ -1605,32 +1592,14 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1605 1592
1606#endif 1593#endif
1607 1594
1608/* 1595struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1609 * get_vma_policy(@task, @vma, @addr) 1596 unsigned long addr)
1610 * @task: task for fallback if vma policy == default
1611 * @vma: virtual memory area whose policy is sought
1612 * @addr: address in @vma for shared policy lookup
1613 *
1614 * Returns effective policy for a VMA at specified address.
1615 * Falls back to @task or system default policy, as necessary.
1616 * Current or other task's task mempolicy and non-shared vma policies must be
1617 * protected by task_lock(task) by the caller.
1618 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1619 * count--added by the get_policy() vm_op, as appropriate--to protect against
1620 * freeing by another task. It is the caller's responsibility to free the
1621 * extra reference for shared policies.
1622 */
1623struct mempolicy *get_vma_policy(struct task_struct *task,
1624 struct vm_area_struct *vma, unsigned long addr)
1625{ 1597{
1626 struct mempolicy *pol = get_task_policy(task); 1598 struct mempolicy *pol = NULL;
1627 1599
1628 if (vma) { 1600 if (vma) {
1629 if (vma->vm_ops && vma->vm_ops->get_policy) { 1601 if (vma->vm_ops && vma->vm_ops->get_policy) {
1630 struct mempolicy *vpol = vma->vm_ops->get_policy(vma, 1602 pol = vma->vm_ops->get_policy(vma, addr);
1631 addr);
1632 if (vpol)
1633 pol = vpol;
1634 } else if (vma->vm_policy) { 1603 } else if (vma->vm_policy) {
1635 pol = vma->vm_policy; 1604 pol = vma->vm_policy;
1636 1605
@@ -1644,31 +1613,51 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1644 mpol_get(pol); 1613 mpol_get(pol);
1645 } 1614 }
1646 } 1615 }
1616
1617 return pol;
1618}
1619
1620/*
1621 * get_vma_policy(@vma, @addr)
1622 * @vma: virtual memory area whose policy is sought
1623 * @addr: address in @vma for shared policy lookup
1624 *
1625 * Returns effective policy for a VMA at specified address.
1626 * Falls back to current->mempolicy or system default policy, as necessary.
1627 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1628 * count--added by the get_policy() vm_op, as appropriate--to protect against
1629 * freeing by another task. It is the caller's responsibility to free the
1630 * extra reference for shared policies.
1631 */
1632static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1633 unsigned long addr)
1634{
1635 struct mempolicy *pol = __get_vma_policy(vma, addr);
1636
1647 if (!pol) 1637 if (!pol)
1648 pol = &default_policy; 1638 pol = get_task_policy(current);
1639
1649 return pol; 1640 return pol;
1650} 1641}
1651 1642
1652bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) 1643bool vma_policy_mof(struct vm_area_struct *vma)
1653{ 1644{
1654 struct mempolicy *pol = get_task_policy(task); 1645 struct mempolicy *pol;
1655 if (vma) {
1656 if (vma->vm_ops && vma->vm_ops->get_policy) {
1657 bool ret = false;
1658 1646
1659 pol = vma->vm_ops->get_policy(vma, vma->vm_start); 1647 if (vma->vm_ops && vma->vm_ops->get_policy) {
1660 if (pol && (pol->flags & MPOL_F_MOF)) 1648 bool ret = false;
1661 ret = true;
1662 mpol_cond_put(pol);
1663 1649
1664 return ret; 1650 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1665 } else if (vma->vm_policy) { 1651 if (pol && (pol->flags & MPOL_F_MOF))
1666 pol = vma->vm_policy; 1652 ret = true;
1667 } 1653 mpol_cond_put(pol);
1654
1655 return ret;
1668 } 1656 }
1669 1657
1658 pol = vma->vm_policy;
1670 if (!pol) 1659 if (!pol)
1671 return default_policy.flags & MPOL_F_MOF; 1660 pol = get_task_policy(current);
1672 1661
1673 return pol->flags & MPOL_F_MOF; 1662 return pol->flags & MPOL_F_MOF;
1674} 1663}
@@ -1874,7 +1863,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1874{ 1863{
1875 struct zonelist *zl; 1864 struct zonelist *zl;
1876 1865
1877 *mpol = get_vma_policy(current, vma, addr); 1866 *mpol = get_vma_policy(vma, addr);
1878 *nodemask = NULL; /* assume !MPOL_BIND */ 1867 *nodemask = NULL; /* assume !MPOL_BIND */
1879 1868
1880 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1869 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
@@ -2029,7 +2018,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2029 unsigned int cpuset_mems_cookie; 2018 unsigned int cpuset_mems_cookie;
2030 2019
2031retry_cpuset: 2020retry_cpuset:
2032 pol = get_vma_policy(current, vma, addr); 2021 pol = get_vma_policy(vma, addr);
2033 cpuset_mems_cookie = read_mems_allowed_begin(); 2022 cpuset_mems_cookie = read_mems_allowed_begin();
2034 2023
2035 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 2024 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
@@ -2046,8 +2035,7 @@ retry_cpuset:
2046 page = __alloc_pages_nodemask(gfp, order, 2035 page = __alloc_pages_nodemask(gfp, order,
2047 policy_zonelist(gfp, pol, node), 2036 policy_zonelist(gfp, pol, node),
2048 policy_nodemask(gfp, pol)); 2037 policy_nodemask(gfp, pol));
2049 if (unlikely(mpol_needs_cond_ref(pol))) 2038 mpol_cond_put(pol);
2050 __mpol_put(pol);
2051 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2039 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2052 goto retry_cpuset; 2040 goto retry_cpuset;
2053 return page; 2041 return page;
@@ -2074,12 +2062,12 @@ retry_cpuset:
2074 */ 2062 */
2075struct page *alloc_pages_current(gfp_t gfp, unsigned order) 2063struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2076{ 2064{
2077 struct mempolicy *pol = get_task_policy(current); 2065 struct mempolicy *pol = &default_policy;
2078 struct page *page; 2066 struct page *page;
2079 unsigned int cpuset_mems_cookie; 2067 unsigned int cpuset_mems_cookie;
2080 2068
2081 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 2069 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2082 pol = &default_policy; 2070 pol = get_task_policy(current);
2083 2071
2084retry_cpuset: 2072retry_cpuset:
2085 cpuset_mems_cookie = read_mems_allowed_begin(); 2073 cpuset_mems_cookie = read_mems_allowed_begin();
@@ -2296,7 +2284,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2296 2284
2297 BUG_ON(!vma); 2285 BUG_ON(!vma);
2298 2286
2299 pol = get_vma_policy(current, vma, addr); 2287 pol = get_vma_policy(vma, addr);
2300 if (!(pol->flags & MPOL_F_MOF)) 2288 if (!(pol->flags & MPOL_F_MOF))
2301 goto out; 2289 goto out;
2302 2290
diff --git a/mm/migrate.c b/mm/migrate.c
index f78ec9bd454d..01439953abf5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -146,8 +146,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
146 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 146 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
147 if (pte_swp_soft_dirty(*ptep)) 147 if (pte_swp_soft_dirty(*ptep))
148 pte = pte_mksoft_dirty(pte); 148 pte = pte_mksoft_dirty(pte);
149
150 /* Recheck VMA as permissions can change since migration started */
149 if (is_write_migration_entry(entry)) 151 if (is_write_migration_entry(entry))
150 pte = pte_mkwrite(pte); 152 pte = maybe_mkwrite(pte, vma);
153
151#ifdef CONFIG_HUGETLB_PAGE 154#ifdef CONFIG_HUGETLB_PAGE
152 if (PageHuge(new)) { 155 if (PageHuge(new)) {
153 pte = pte_mkhuge(pte); 156 pte = pte_mkhuge(pte);
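
Switching remove_migration_pte() from pte_mkwrite() to maybe_mkwrite() re-checks the VMA before restoring write access: a write migration entry only becomes a writable pte again if the mapping still allows writes, since permissions may have changed (e.g. via mprotect) while the page was in flight. A hedged model of that guard, using plain integers instead of pte_t and vm_flags_t:

#include <assert.h>

#define VM_WRITE  (1u << 1)
#define PTE_WRITE (1u << 0)

/* Model of maybe_mkwrite(): only set the write bit if the VMA still permits it. */
static unsigned maybe_mkwrite(unsigned pte, unsigned vm_flags)
{
	if (vm_flags & VM_WRITE)
		pte |= PTE_WRITE;
	return pte;
}

int main(void)
{
	/* Entry was writable when migration started ... */
	unsigned pte = 0;

	/* ... but the VMA went read-only meanwhile: the pte stays read-only. */
	assert(!(maybe_mkwrite(pte, 0) & PTE_WRITE));
	assert(maybe_mkwrite(pte, VM_WRITE) & PTE_WRITE);
	return 0;
}
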
@@ -873,7 +876,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
873 } 876 }
874 } 877 }
875 878
876 if (unlikely(balloon_page_movable(page))) { 879 if (unlikely(isolated_balloon_page(page))) {
877 /* 880 /*
878 * A ballooned page does not need any special attention from 881 * A ballooned page does not need any special attention from
879 * physical to virtual reverse mapping procedures. 882 * physical to virtual reverse mapping procedures.
@@ -952,17 +955,6 @@ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
952 955
953 rc = __unmap_and_move(page, newpage, force, mode); 956 rc = __unmap_and_move(page, newpage, force, mode);
954 957
955 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
956 /*
957 * A ballooned page has been migrated already.
958 * Now, it's the time to wrap-up counters,
959 * handle the page back to Buddy and return.
960 */
961 dec_zone_page_state(page, NR_ISOLATED_ANON +
962 page_is_file_cache(page));
963 balloon_page_free(page);
964 return MIGRATEPAGE_SUCCESS;
965 }
966out: 958out:
967 if (rc != -EAGAIN) { 959 if (rc != -EAGAIN) {
968 /* 960 /*
@@ -985,6 +977,9 @@ out:
985 if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { 977 if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
986 ClearPageSwapBacked(newpage); 978 ClearPageSwapBacked(newpage);
987 put_new_page(newpage, private); 979 put_new_page(newpage, private);
980 } else if (unlikely(__is_movable_balloon_page(newpage))) {
981 /* drop our reference, page already in the balloon */
982 put_page(newpage);
988 } else 983 } else
989 putback_lru_page(newpage); 984 putback_lru_page(newpage);
990 985
diff --git a/mm/mlock.c b/mm/mlock.c
index ce84cb0b83ef..03aa8512723b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -233,9 +233,9 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma,
233 233
234 VM_BUG_ON(start & ~PAGE_MASK); 234 VM_BUG_ON(start & ~PAGE_MASK);
235 VM_BUG_ON(end & ~PAGE_MASK); 235 VM_BUG_ON(end & ~PAGE_MASK);
236 VM_BUG_ON(start < vma->vm_start); 236 VM_BUG_ON_VMA(start < vma->vm_start, vma);
237 VM_BUG_ON(end > vma->vm_end); 237 VM_BUG_ON_VMA(end > vma->vm_end, vma);
238 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 238 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
239 239
240 gup_flags = FOLL_TOUCH | FOLL_MLOCK; 240 gup_flags = FOLL_TOUCH | FOLL_MLOCK;
241 /* 241 /*
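
VM_BUG_ON_VMA()/VM_BUG_ON_MM() differ from plain VM_BUG_ON() in that they dump the offending vma or mm before triggering the BUG, which is what makes assertions like the ones above debuggable from a crash log. Below is a userspace approximation of such a dump-then-abort assertion; the struct layout and dump_vma() helper are invented for the example.

#include <stdio.h>
#include <stdlib.h>

struct vma {
	unsigned long start, end;
};

static void dump_vma(const struct vma *v)	/* stand-in for the kernel dump helpers */
{
	fprintf(stderr, "vma %p: start=%#lx end=%#lx\n",
		(const void *)v, v->start, v->end);
}

/* Like VM_BUG_ON_VMA(cond, vma): on failure, print the object, then abort. */
#define BUG_ON_VMA(cond, vma)						\
	do {								\
		if (cond) {						\
			fprintf(stderr, "BUG at %s:%d: %s\n",		\
				__FILE__, __LINE__, #cond);		\
			dump_vma(vma);					\
			abort();					\
		}							\
	} while (0)

int main(void)
{
	struct vma v = { .start = 0x1000, .end = 0x3000 };

	BUG_ON_VMA(v.start > v.end, &v);	/* passes */
	return 0;
}
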
diff --git a/mm/mmap.c b/mm/mmap.c
index c1f2ea4a0b99..93d28c7e5420 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -70,7 +70,7 @@ static void unmap_region(struct mm_struct *mm,
70 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes 70 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
71 * w: (no) no w: (no) no w: (yes) yes w: (no) no 71 * w: (no) no w: (no) no w: (yes) yes w: (no) no
72 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 72 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
73 * 73 *
74 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes 74 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
75 * w: (no) no w: (no) no w: (copy) copy w: (no) no 75 * w: (no) no w: (no) no w: (copy) copy w: (no) no
76 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 76 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
@@ -268,7 +268,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len);
268 268
269SYSCALL_DEFINE1(brk, unsigned long, brk) 269SYSCALL_DEFINE1(brk, unsigned long, brk)
270{ 270{
271 unsigned long rlim, retval; 271 unsigned long retval;
272 unsigned long newbrk, oldbrk; 272 unsigned long newbrk, oldbrk;
273 struct mm_struct *mm = current->mm; 273 struct mm_struct *mm = current->mm;
274 unsigned long min_brk; 274 unsigned long min_brk;
@@ -298,9 +298,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
298 * segment grow beyond its set limit in the case where the limit is 298
299 * not page aligned -Ram Gupta 299 * not page aligned -Ram Gupta
300 */ 300 */
301 rlim = rlimit(RLIMIT_DATA); 301 if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
302 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + 302 mm->end_data, mm->start_data))
303 (mm->end_data - mm->start_data) > rlim)
304 goto out; 303 goto out;
305 304
306 newbrk = PAGE_ALIGN(brk); 305 newbrk = PAGE_ALIGN(brk);
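
The open-coded RLIMIT_DATA test in brk() is folded into check_data_rlimit(): the would-be data segment (new brk minus start_brk, plus the existing end_data - start_data span) is compared against the limit unless the limit is infinite. A self-contained sketch of the same arithmetic; the RLIM_INFINITY sentinel and return convention here are illustrative, not the kernel's exact definitions.

#include <stdio.h>

#define RLIM_INFINITY (~0UL)

/*
 * Mirrors the shape of check_data_rlimit(): return nonzero if the new brk
 * would push the data segment past the limit.
 */
static int check_data_rlimit(unsigned long rlim,
			     unsigned long new_brk, unsigned long start_brk,
			     unsigned long end_data, unsigned long start_data)
{
	if (rlim < RLIM_INFINITY &&
	    (new_brk - start_brk) + (end_data - start_data) > rlim)
		return -1;		/* failure, -ENOSPC-style */
	return 0;
}

int main(void)
{
	unsigned long start_brk = 0x400000, start_data = 0x100000, end_data = 0x180000;

	printf("%d\n", check_data_rlimit(0x100000, 0x480000, start_brk,
					 end_data, start_data));	/* 0: fits  */
	printf("%d\n", check_data_rlimit(0x080000, 0x480000, start_brk,
					 end_data, start_data));	/* -1: over */
	return 0;
}
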
@@ -369,20 +368,22 @@ static int browse_rb(struct rb_root *root)
369 struct vm_area_struct *vma; 368 struct vm_area_struct *vma;
370 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 369 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
371 if (vma->vm_start < prev) { 370 if (vma->vm_start < prev) {
372 pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev); 371 pr_emerg("vm_start %lx < prev %lx\n",
372 vma->vm_start, prev);
373 bug = 1; 373 bug = 1;
374 } 374 }
375 if (vma->vm_start < pend) { 375 if (vma->vm_start < pend) {
376 pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend); 376 pr_emerg("vm_start %lx < pend %lx\n",
377 vma->vm_start, pend);
377 bug = 1; 378 bug = 1;
378 } 379 }
379 if (vma->vm_start > vma->vm_end) { 380 if (vma->vm_start > vma->vm_end) {
380 pr_info("vm_end %lx < vm_start %lx\n", 381 pr_emerg("vm_start %lx > vm_end %lx\n",
381 vma->vm_end, vma->vm_start); 382 vma->vm_start, vma->vm_end);
382 bug = 1; 383 bug = 1;
383 } 384 }
384 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { 385 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
385 pr_info("free gap %lx, correct %lx\n", 386 pr_emerg("free gap %lx, correct %lx\n",
386 vma->rb_subtree_gap, 387 vma->rb_subtree_gap,
387 vma_compute_subtree_gap(vma)); 388 vma_compute_subtree_gap(vma));
388 bug = 1; 389 bug = 1;
@@ -396,7 +397,7 @@ static int browse_rb(struct rb_root *root)
396 for (nd = pn; nd; nd = rb_prev(nd)) 397 for (nd = pn; nd; nd = rb_prev(nd))
397 j++; 398 j++;
398 if (i != j) { 399 if (i != j) {
399 pr_info("backwards %d, forwards %d\n", j, i); 400 pr_emerg("backwards %d, forwards %d\n", j, i);
400 bug = 1; 401 bug = 1;
401 } 402 }
402 return bug ? -1 : i; 403 return bug ? -1 : i;
@@ -409,8 +410,9 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
409 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 410 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
410 struct vm_area_struct *vma; 411 struct vm_area_struct *vma;
411 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 412 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
412 BUG_ON(vma != ignore && 413 VM_BUG_ON_VMA(vma != ignore &&
413 vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); 414 vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
415 vma);
414 } 416 }
415} 417}
416 418
@@ -420,8 +422,10 @@ static void validate_mm(struct mm_struct *mm)
420 int i = 0; 422 int i = 0;
421 unsigned long highest_address = 0; 423 unsigned long highest_address = 0;
422 struct vm_area_struct *vma = mm->mmap; 424 struct vm_area_struct *vma = mm->mmap;
425
423 while (vma) { 426 while (vma) {
424 struct anon_vma_chain *avc; 427 struct anon_vma_chain *avc;
428
425 vma_lock_anon_vma(vma); 429 vma_lock_anon_vma(vma);
426 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 430 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
427 anon_vma_interval_tree_verify(avc); 431 anon_vma_interval_tree_verify(avc);
@@ -431,20 +435,21 @@ static void validate_mm(struct mm_struct *mm)
431 i++; 435 i++;
432 } 436 }
433 if (i != mm->map_count) { 437 if (i != mm->map_count) {
434 pr_info("map_count %d vm_next %d\n", mm->map_count, i); 438 pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
435 bug = 1; 439 bug = 1;
436 } 440 }
437 if (highest_address != mm->highest_vm_end) { 441 if (highest_address != mm->highest_vm_end) {
438 pr_info("mm->highest_vm_end %lx, found %lx\n", 442 pr_emerg("mm->highest_vm_end %lx, found %lx\n",
439 mm->highest_vm_end, highest_address); 443 mm->highest_vm_end, highest_address);
440 bug = 1; 444 bug = 1;
441 } 445 }
442 i = browse_rb(&mm->mm_rb); 446 i = browse_rb(&mm->mm_rb);
443 if (i != mm->map_count) { 447 if (i != mm->map_count) {
444 pr_info("map_count %d rb %d\n", mm->map_count, i); 448 if (i != -1)
449 pr_emerg("map_count %d rb %d\n", mm->map_count, i);
445 bug = 1; 450 bug = 1;
446 } 451 }
447 BUG_ON(bug); 452 VM_BUG_ON_MM(bug, mm);
448} 453}
449#else 454#else
450#define validate_mm_rb(root, ignore) do { } while (0) 455#define validate_mm_rb(root, ignore) do { } while (0)
@@ -741,7 +746,7 @@ again: remove_next = 1 + (end > next->vm_end);
741 * split_vma inserting another: so it must be 746 * split_vma inserting another: so it must be
742 * mprotect case 4 shifting the boundary down. 747 * mprotect case 4 shifting the boundary down.
743 */ 748 */
744 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); 749 adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
745 exporter = vma; 750 exporter = vma;
746 importer = next; 751 importer = next;
747 } 752 }
@@ -787,8 +792,8 @@ again: remove_next = 1 + (end > next->vm_end);
787 if (!anon_vma && adjust_next) 792 if (!anon_vma && adjust_next)
788 anon_vma = next->anon_vma; 793 anon_vma = next->anon_vma;
789 if (anon_vma) { 794 if (anon_vma) {
790 VM_BUG_ON(adjust_next && next->anon_vma && 795 VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
791 anon_vma != next->anon_vma); 796 anon_vma != next->anon_vma, next);
792 anon_vma_lock_write(anon_vma); 797 anon_vma_lock_write(anon_vma);
793 anon_vma_interval_tree_pre_update_vma(vma); 798 anon_vma_interval_tree_pre_update_vma(vma);
794 if (adjust_next) 799 if (adjust_next)
@@ -1010,7 +1015,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
1010struct vm_area_struct *vma_merge(struct mm_struct *mm, 1015struct vm_area_struct *vma_merge(struct mm_struct *mm,
1011 struct vm_area_struct *prev, unsigned long addr, 1016 struct vm_area_struct *prev, unsigned long addr,
1012 unsigned long end, unsigned long vm_flags, 1017 unsigned long end, unsigned long vm_flags,
1013 struct anon_vma *anon_vma, struct file *file, 1018 struct anon_vma *anon_vma, struct file *file,
1014 pgoff_t pgoff, struct mempolicy *policy) 1019 pgoff_t pgoff, struct mempolicy *policy)
1015{ 1020{
1016 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 1021 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
@@ -1036,7 +1041,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1036 * Can it merge with the predecessor? 1041 * Can it merge with the predecessor?
1037 */ 1042 */
1038 if (prev && prev->vm_end == addr && 1043 if (prev && prev->vm_end == addr &&
1039 mpol_equal(vma_policy(prev), policy) && 1044 mpol_equal(vma_policy(prev), policy) &&
1040 can_vma_merge_after(prev, vm_flags, 1045 can_vma_merge_after(prev, vm_flags,
1041 anon_vma, file, pgoff)) { 1046 anon_vma, file, pgoff)) {
1042 /* 1047 /*
@@ -1064,7 +1069,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
1064 * Can this new request be merged in front of next? 1069 * Can this new request be merged in front of next?
1065 */ 1070 */
1066 if (next && end == next->vm_start && 1071 if (next && end == next->vm_start &&
1067 mpol_equal(policy, vma_policy(next)) && 1072 mpol_equal(policy, vma_policy(next)) &&
1068 can_vma_merge_before(next, vm_flags, 1073 can_vma_merge_before(next, vm_flags,
1069 anon_vma, file, pgoff+pglen)) { 1074 anon_vma, file, pgoff+pglen)) {
1070 if (prev && addr < prev->vm_end) /* case 4 */ 1075 if (prev && addr < prev->vm_end) /* case 4 */
@@ -1235,7 +1240,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1235 unsigned long flags, unsigned long pgoff, 1240 unsigned long flags, unsigned long pgoff,
1236 unsigned long *populate) 1241 unsigned long *populate)
1237{ 1242{
1238 struct mm_struct * mm = current->mm; 1243 struct mm_struct *mm = current->mm;
1239 vm_flags_t vm_flags; 1244 vm_flags_t vm_flags;
1240 1245
1241 *populate = 0; 1246 *populate = 0;
@@ -1263,7 +1268,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1263 1268
1264 /* offset overflow? */ 1269 /* offset overflow? */
1265 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 1270 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
1266 return -EOVERFLOW; 1271 return -EOVERFLOW;
1267 1272
1268 /* Too many mappings? */ 1273 /* Too many mappings? */
1269 if (mm->map_count > sysctl_max_map_count) 1274 if (mm->map_count > sysctl_max_map_count)
@@ -1921,7 +1926,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1921 info.align_mask = 0; 1926 info.align_mask = 0;
1922 return vm_unmapped_area(&info); 1927 return vm_unmapped_area(&info);
1923} 1928}
1924#endif 1929#endif
1925 1930
1926/* 1931/*
1927 * This mmap-allocator allocates new areas top-down from below the 1932 * This mmap-allocator allocates new areas top-down from below the
@@ -2321,13 +2326,13 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address)
2321} 2326}
2322 2327
2323struct vm_area_struct * 2328struct vm_area_struct *
2324find_extend_vma(struct mm_struct * mm, unsigned long addr) 2329find_extend_vma(struct mm_struct *mm, unsigned long addr)
2325{ 2330{
2326 struct vm_area_struct * vma; 2331 struct vm_area_struct *vma;
2327 unsigned long start; 2332 unsigned long start;
2328 2333
2329 addr &= PAGE_MASK; 2334 addr &= PAGE_MASK;
2330 vma = find_vma(mm,addr); 2335 vma = find_vma(mm, addr);
2331 if (!vma) 2336 if (!vma)
2332 return NULL; 2337 return NULL;
2333 if (vma->vm_start <= addr) 2338 if (vma->vm_start <= addr)
@@ -2376,7 +2381,7 @@ static void unmap_region(struct mm_struct *mm,
2376 struct vm_area_struct *vma, struct vm_area_struct *prev, 2381 struct vm_area_struct *vma, struct vm_area_struct *prev,
2377 unsigned long start, unsigned long end) 2382 unsigned long start, unsigned long end)
2378{ 2383{
2379 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 2384 struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
2380 struct mmu_gather tlb; 2385 struct mmu_gather tlb;
2381 2386
2382 lru_add_drain(); 2387 lru_add_drain();
@@ -2423,7 +2428,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2423 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the 2428 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
2424 * munmap path where it doesn't make sense to fail. 2429 * munmap path where it doesn't make sense to fail.
2425 */ 2430 */
2426static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 2431static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2427 unsigned long addr, int new_below) 2432 unsigned long addr, int new_below)
2428{ 2433{
2429 struct vm_area_struct *new; 2434 struct vm_area_struct *new;
@@ -2512,7 +2517,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2512 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) 2517 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
2513 return -EINVAL; 2518 return -EINVAL;
2514 2519
2515 if ((len = PAGE_ALIGN(len)) == 0) 2520 len = PAGE_ALIGN(len);
2521 if (len == 0)
2516 return -EINVAL; 2522 return -EINVAL;
2517 2523
2518 /* Find the first overlapping VMA */ 2524 /* Find the first overlapping VMA */
@@ -2558,7 +2564,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2558 if (error) 2564 if (error)
2559 return error; 2565 return error;
2560 } 2566 }
2561 vma = prev? prev->vm_next: mm->mmap; 2567 vma = prev ? prev->vm_next : mm->mmap;
2562 2568
2563 /* 2569 /*
2564 * unlock any mlock()ed ranges before detaching vmas 2570 * unlock any mlock()ed ranges before detaching vmas
@@ -2621,10 +2627,10 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
2621 */ 2627 */
2622static unsigned long do_brk(unsigned long addr, unsigned long len) 2628static unsigned long do_brk(unsigned long addr, unsigned long len)
2623{ 2629{
2624 struct mm_struct * mm = current->mm; 2630 struct mm_struct *mm = current->mm;
2625 struct vm_area_struct * vma, * prev; 2631 struct vm_area_struct *vma, *prev;
2626 unsigned long flags; 2632 unsigned long flags;
2627 struct rb_node ** rb_link, * rb_parent; 2633 struct rb_node **rb_link, *rb_parent;
2628 pgoff_t pgoff = addr >> PAGE_SHIFT; 2634 pgoff_t pgoff = addr >> PAGE_SHIFT;
2629 int error; 2635 int error;
2630 2636
@@ -2848,7 +2854,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2848 * safe. It is only safe to keep the vm_pgoff 2854 * safe. It is only safe to keep the vm_pgoff
2849 * linear if there are no pages mapped yet. 2855 * linear if there are no pages mapped yet.
2850 */ 2856 */
2851 VM_BUG_ON(faulted_in_anon_vma); 2857 VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
2852 *vmap = vma = new_vma; 2858 *vmap = vma = new_vma;
2853 } 2859 }
2854 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 2860 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
@@ -3196,7 +3202,7 @@ void __init mmap_init(void)
3196{ 3202{
3197 int ret; 3203 int ret;
3198 3204
3199 ret = percpu_counter_init(&vm_committed_as, 0); 3205 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
3200 VM_BUG_ON(ret); 3206 VM_BUG_ON(ret);
3201} 3207}
3202 3208
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 950813b1eb36..2c8da9825fe3 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -107,7 +107,8 @@ void __mmu_notifier_release(struct mm_struct *mm)
107 * existed or not. 107 * existed or not.
108 */ 108 */
109int __mmu_notifier_clear_flush_young(struct mm_struct *mm, 109int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
110 unsigned long address) 110 unsigned long start,
111 unsigned long end)
111{ 112{
112 struct mmu_notifier *mn; 113 struct mmu_notifier *mn;
113 int young = 0, id; 114 int young = 0, id;
@@ -115,7 +116,7 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
115 id = srcu_read_lock(&srcu); 116 id = srcu_read_lock(&srcu);
116 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { 117 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
117 if (mn->ops->clear_flush_young) 118 if (mn->ops->clear_flush_young)
118 young |= mn->ops->clear_flush_young(mn, mm, address); 119 young |= mn->ops->clear_flush_young(mn, mm, start, end);
119 } 120 }
120 srcu_read_unlock(&srcu, id); 121 srcu_read_unlock(&srcu, id);
121 122
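
clear_flush_young() now takes a [start, end) range instead of a single address, and the wrapper still ORs together the "young" result from every registered notifier. A reduced model of that fan-out follows, with a fixed callback table standing in for the SRCU-protected notifier list; the device callbacks are hypothetical.

#include <stdio.h>

struct notifier {
	/* Callback reports whether any page in [start, end) was referenced. */
	int (*clear_flush_young)(unsigned long start, unsigned long end);
};

static int dev_a_young(unsigned long s, unsigned long e)
{
	return s <= 0x2000 && 0x2000 < e;	/* pretends 0x2000 was referenced */
}

static int dev_b_young(unsigned long s, unsigned long e)
{
	(void)s; (void)e;
	return 0;
}

static struct notifier notifiers[] = {
	{ .clear_flush_young = dev_a_young },
	{ .clear_flush_young = dev_b_young },
};

/*
 * Like __mmu_notifier_clear_flush_young(): accumulate "young" across all
 * registered notifiers for the whole range.
 */
static int clear_flush_young_range(unsigned long start, unsigned long end)
{
	int young = 0;

	for (unsigned i = 0; i < sizeof(notifiers) / sizeof(notifiers[0]); i++)
		if (notifiers[i].clear_flush_young)
			young |= notifiers[i].clear_flush_young(start, end);
	return young;
}

int main(void)
{
	printf("young: %d\n", clear_flush_young_range(0x1000, 0x3000));	/* 1 */
	printf("young: %d\n", clear_flush_young_range(0x4000, 0x5000));	/* 0 */
	return 0;
}
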
diff --git a/mm/mremap.c b/mm/mremap.c
index 05f1180e9f21..b147f66f4c40 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -21,8 +21,8 @@
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/mmu_notifier.h> 22#include <linux/mmu_notifier.h>
23#include <linux/sched/sysctl.h> 23#include <linux/sched/sysctl.h>
24#include <linux/uaccess.h>
24 25
25#include <asm/uaccess.h>
26#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28 28
@@ -195,7 +195,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
195 if (pmd_trans_huge(*old_pmd)) { 195 if (pmd_trans_huge(*old_pmd)) {
196 int err = 0; 196 int err = 0;
197 if (extent == HPAGE_PMD_SIZE) { 197 if (extent == HPAGE_PMD_SIZE) {
198 VM_BUG_ON(vma->vm_file || !vma->anon_vma); 198 VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
199 vma);
199 /* See comment in move_ptes() */ 200 /* See comment in move_ptes() */
200 if (need_rmap_locks) 201 if (need_rmap_locks)
201 anon_vma_lock_write(vma->anon_vma); 202 anon_vma_lock_write(vma->anon_vma);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 7ed58602e71b..7c7ab32ee503 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -119,6 +119,8 @@ static unsigned long __init free_low_memory_core_early(void)
119 phys_addr_t start, end; 119 phys_addr_t start, end;
120 u64 i; 120 u64 i;
121 121
122 memblock_clear_hotplug(0, -1);
123
122 for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) 124 for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
123 count += __free_memory_core(start, end); 125 count += __free_memory_core(start, end);
124 126
diff --git a/mm/nommu.c b/mm/nommu.c
index a881d9673c6b..bd1808e194a7 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -539,7 +539,7 @@ void __init mmap_init(void)
539{ 539{
540 int ret; 540 int ret;
541 541
542 ret = percpu_counter_init(&vm_committed_as, 0); 542 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
543 VM_BUG_ON(ret); 543 VM_BUG_ON(ret);
544 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); 544 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
545} 545}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e11df8fa7ec..bbf405a3a18f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -565,7 +565,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
565 565
566 spin_lock(&zone_scan_lock); 566 spin_lock(&zone_scan_lock);
567 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) 567 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
568 if (zone_is_oom_locked(zone)) { 568 if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
569 ret = false; 569 ret = false;
570 goto out; 570 goto out;
571 } 571 }
@@ -575,7 +575,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
575 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. 575 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
576 */ 576 */
577 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) 577 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
578 zone_set_flag(zone, ZONE_OOM_LOCKED); 578 set_bit(ZONE_OOM_LOCKED, &zone->flags);
579 579
580out: 580out:
581 spin_unlock(&zone_scan_lock); 581 spin_unlock(&zone_scan_lock);
@@ -594,7 +594,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
594 594
595 spin_lock(&zone_scan_lock); 595 spin_lock(&zone_scan_lock);
596 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) 596 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
597 zone_clear_flag(zone, ZONE_OOM_LOCKED); 597 clear_bit(ZONE_OOM_LOCKED, &zone->flags);
598 spin_unlock(&zone_scan_lock); 598 spin_unlock(&zone_scan_lock);
599} 599}
600 600
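These OOM hunks are part of a wider cleanup in this series: zone->flags is a plain unsigned long bitmap, so the zone_is_oom_locked()/zone_set_flag()/zone_clear_flag() wrappers go away in favour of the generic atomic bitops. A sketch of the resulting pattern; the helper names below are hypothetical, the diff simply open-codes the bitops at each call site:

#include <linux/bitops.h>
#include <linux/mmzone.h>

static bool zone_oom_locked(struct zone *zone)
{
	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
}

static void zone_oom_lock(struct zone *zone)
{
	set_bit(ZONE_OOM_LOCKED, &zone->flags);		/* atomic RMW */
}

static void zone_oom_unlock(struct zone *zone)
{
	clear_bit(ZONE_OOM_LOCKED, &zone->flags);
}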
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 91d73ef1744d..ff24c9d83112 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1075,13 +1075,13 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
1075 } 1075 }
1076 1076
1077 if (dirty < setpoint) { 1077 if (dirty < setpoint) {
1078 x = min(bdi->balanced_dirty_ratelimit, 1078 x = min3(bdi->balanced_dirty_ratelimit,
1079 min(balanced_dirty_ratelimit, task_ratelimit)); 1079 balanced_dirty_ratelimit, task_ratelimit);
1080 if (dirty_ratelimit < x) 1080 if (dirty_ratelimit < x)
1081 step = x - dirty_ratelimit; 1081 step = x - dirty_ratelimit;
1082 } else { 1082 } else {
1083 x = max(bdi->balanced_dirty_ratelimit, 1083 x = max3(bdi->balanced_dirty_ratelimit,
1084 max(balanced_dirty_ratelimit, task_ratelimit)); 1084 balanced_dirty_ratelimit, task_ratelimit);
1085 if (dirty_ratelimit > x) 1085 if (dirty_ratelimit > x)
1086 step = dirty_ratelimit - x; 1086 step = dirty_ratelimit - x;
1087 } 1087 }
@@ -1777,7 +1777,7 @@ void __init page_writeback_init(void)
1777 writeback_set_ratelimit(); 1777 writeback_set_ratelimit();
1778 register_cpu_notifier(&ratelimit_nb); 1778 register_cpu_notifier(&ratelimit_nb);
1779 1779
1780 fprop_global_init(&writeout_completions); 1780 fprop_global_init(&writeout_completions, GFP_KERNEL);
1781} 1781}
1782 1782
1783/** 1783/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 18cee0d4c8a2..c9710c9bbee2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,8 +53,6 @@
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/compaction.h> 54#include <linux/compaction.h>
55#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
56#include <linux/ftrace_event.h>
57#include <linux/memcontrol.h>
58#include <linux/prefetch.h> 56#include <linux/prefetch.h>
59#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
60#include <linux/migrate.h> 58#include <linux/migrate.h>
@@ -85,6 +83,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node);
85 */ 83 */
86DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 84DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87EXPORT_PER_CPU_SYMBOL(_numa_mem_); 85EXPORT_PER_CPU_SYMBOL(_numa_mem_);
86int _node_numa_mem_[MAX_NUMNODES];
88#endif 87#endif
89 88
90/* 89/*
@@ -1014,7 +1013,7 @@ int move_freepages(struct zone *zone,
1014 * Remove at a later date when no bug reports exist related to 1013 * Remove at a later date when no bug reports exist related to
1015 * grouping pages by mobility 1014 * grouping pages by mobility
1016 */ 1015 */
1017 BUG_ON(page_zone(start_page) != page_zone(end_page)); 1016 VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
1018#endif 1017#endif
1019 1018
1020 for (page = start_page; page <= end_page;) { 1019 for (page = start_page; page <= end_page;) {
@@ -1612,9 +1611,9 @@ again:
1612 } 1611 }
1613 1612
1614 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1613 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1615 if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && 1614 if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
1616 !zone_is_fair_depleted(zone)) 1615 !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
1617 zone_set_flag(zone, ZONE_FAIR_DEPLETED); 1616 set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
1618 1617
1619 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1618 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1620 zone_statistics(preferred_zone, zone, gfp_flags); 1619 zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1934,7 +1933,7 @@ static void reset_alloc_batches(struct zone *preferred_zone)
1934 mod_zone_page_state(zone, NR_ALLOC_BATCH, 1933 mod_zone_page_state(zone, NR_ALLOC_BATCH,
1935 high_wmark_pages(zone) - low_wmark_pages(zone) - 1934 high_wmark_pages(zone) - low_wmark_pages(zone) -
1936 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 1935 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
1937 zone_clear_flag(zone, ZONE_FAIR_DEPLETED); 1936 clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
1938 } while (zone++ != preferred_zone); 1937 } while (zone++ != preferred_zone);
1939} 1938}
1940 1939
@@ -1985,7 +1984,7 @@ zonelist_scan:
1985 if (alloc_flags & ALLOC_FAIR) { 1984 if (alloc_flags & ALLOC_FAIR) {
1986 if (!zone_local(preferred_zone, zone)) 1985 if (!zone_local(preferred_zone, zone))
1987 break; 1986 break;
1988 if (zone_is_fair_depleted(zone)) { 1987 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
1989 nr_fair_skipped++; 1988 nr_fair_skipped++;
1990 continue; 1989 continue;
1991 } 1990 }
@@ -2296,58 +2295,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2296 struct zonelist *zonelist, enum zone_type high_zoneidx, 2295 struct zonelist *zonelist, enum zone_type high_zoneidx,
2297 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2296 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2298 int classzone_idx, int migratetype, enum migrate_mode mode, 2297 int classzone_idx, int migratetype, enum migrate_mode mode,
2299 bool *contended_compaction, bool *deferred_compaction, 2298 int *contended_compaction, bool *deferred_compaction)
2300 unsigned long *did_some_progress)
2301{ 2299{
2302 if (!order) 2300 struct zone *last_compact_zone = NULL;
2303 return NULL; 2301 unsigned long compact_result;
2302 struct page *page;
2304 2303
2305 if (compaction_deferred(preferred_zone, order)) { 2304 if (!order)
2306 *deferred_compaction = true;
2307 return NULL; 2305 return NULL;
2308 }
2309 2306
2310 current->flags |= PF_MEMALLOC; 2307 current->flags |= PF_MEMALLOC;
2311 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2308 compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
2312 nodemask, mode, 2309 nodemask, mode,
2313 contended_compaction); 2310 contended_compaction,
2311 &last_compact_zone);
2314 current->flags &= ~PF_MEMALLOC; 2312 current->flags &= ~PF_MEMALLOC;
2315 2313
2316 if (*did_some_progress != COMPACT_SKIPPED) { 2314 switch (compact_result) {
2317 struct page *page; 2315 case COMPACT_DEFERRED:
2316 *deferred_compaction = true;
2317 /* fall-through */
2318 case COMPACT_SKIPPED:
2319 return NULL;
2320 default:
2321 break;
2322 }
2318 2323
2319 /* Page migration frees to the PCP lists but we want merging */ 2324 /*
2320 drain_pages(get_cpu()); 2325 * At least in one zone compaction wasn't deferred or skipped, so let's
2321 put_cpu(); 2326 * count a compaction stall
2327 */
2328 count_vm_event(COMPACTSTALL);
2322 2329
2323 page = get_page_from_freelist(gfp_mask, nodemask, 2330 /* Page migration frees to the PCP lists but we want merging */
2324 order, zonelist, high_zoneidx, 2331 drain_pages(get_cpu());
2325 alloc_flags & ~ALLOC_NO_WATERMARKS, 2332 put_cpu();
2326 preferred_zone, classzone_idx, migratetype);
2327 if (page) {
2328 preferred_zone->compact_blockskip_flush = false;
2329 compaction_defer_reset(preferred_zone, order, true);
2330 count_vm_event(COMPACTSUCCESS);
2331 return page;
2332 }
2333 2333
2334 /* 2334 page = get_page_from_freelist(gfp_mask, nodemask,
2335 * It's bad if compaction run occurs and fails. 2335 order, zonelist, high_zoneidx,
2336 * The most likely reason is that pages exist, 2336 alloc_flags & ~ALLOC_NO_WATERMARKS,
2337 * but not enough to satisfy watermarks. 2337 preferred_zone, classzone_idx, migratetype);
2338 */
2339 count_vm_event(COMPACTFAIL);
2340 2338
2341 /* 2339 if (page) {
2342 * As async compaction considers a subset of pageblocks, only 2340 struct zone *zone = page_zone(page);
2343 * defer if the failure was a sync compaction failure.
2344 */
2345 if (mode != MIGRATE_ASYNC)
2346 defer_compaction(preferred_zone, order);
2347 2341
2348 cond_resched(); 2342 zone->compact_blockskip_flush = false;
2343 compaction_defer_reset(zone, order, true);
2344 count_vm_event(COMPACTSUCCESS);
2345 return page;
2349 } 2346 }
2350 2347
2348 /*
2349 * last_compact_zone is where try_to_compact_pages thought allocation
2350 * should succeed, so it did not defer compaction. But here we know
2351 * that it didn't succeed, so we do the defer.
2352 */
2353 if (last_compact_zone && mode != MIGRATE_ASYNC)
2354 defer_compaction(last_compact_zone, order);
2355
2356 /*
2357 * It's bad if compaction run occurs and fails. The most likely reason
2358 * is that pages exist, but not enough to satisfy watermarks.
2359 */
2360 count_vm_event(COMPACTFAIL);
2361
2362 cond_resched();
2363
2351 return NULL; 2364 return NULL;
2352} 2365}
2353#else 2366#else
@@ -2355,9 +2368,8 @@ static inline struct page *
2355__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2368__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2356 struct zonelist *zonelist, enum zone_type high_zoneidx, 2369 struct zonelist *zonelist, enum zone_type high_zoneidx,
2357 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2370 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2358 int classzone_idx, int migratetype, 2371 int classzone_idx, int migratetype, enum migrate_mode mode,
2359 enum migrate_mode mode, bool *contended_compaction, 2372 int *contended_compaction, bool *deferred_compaction)
2360 bool *deferred_compaction, unsigned long *did_some_progress)
2361{ 2373{
2362 return NULL; 2374 return NULL;
2363} 2375}
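Condensed outline of the reworked direct-compaction path above: try_to_compact_pages() now reports a compact_result plus the last zone it worked on, a COMPACTSTALL is counted as soon as any zone was actually attempted, and deferral is recorded against that last zone rather than preferred_zone. The PCP drain and the allocation plumbing are elided, and try_freelists() is a stand-in for the get_page_from_freelist() retry, so treat this as a sketch, not the upstream function:

static struct page *direct_compact_outline(gfp_t gfp_mask, unsigned int order,
					   struct zonelist *zonelist,
					   nodemask_t *nodemask,
					   enum migrate_mode mode,
					   int *contended,
					   bool *deferred_compaction)
{
	struct zone *last_compact_zone = NULL;
	unsigned long result;
	struct page *page;

	current->flags |= PF_MEMALLOC;
	result = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
				      mode, contended, &last_compact_zone);
	current->flags &= ~PF_MEMALLOC;

	if (result == COMPACT_DEFERRED)
		*deferred_compaction = true;
	if (result == COMPACT_DEFERRED || result == COMPACT_SKIPPED)
		return NULL;			/* nothing was even tried */

	count_vm_event(COMPACTSTALL);		/* at least one zone compacted */

	page = try_freelists(gfp_mask, order);	/* stand-in for the retry */
	if (page) {
		/* credit the zone that actually produced the page */
		compaction_defer_reset(page_zone(page), order, true);
		count_vm_event(COMPACTSUCCESS);
		return page;
	}

	/* compaction believed in last_compact_zone but it did not deliver */
	if (last_compact_zone && mode != MIGRATE_ASYNC)
		defer_compaction(last_compact_zone, order);

	count_vm_event(COMPACTFAIL);
	cond_resched();
	return NULL;
}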
@@ -2457,12 +2469,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2457static void wake_all_kswapds(unsigned int order, 2469static void wake_all_kswapds(unsigned int order,
2458 struct zonelist *zonelist, 2470 struct zonelist *zonelist,
2459 enum zone_type high_zoneidx, 2471 enum zone_type high_zoneidx,
2460 struct zone *preferred_zone) 2472 struct zone *preferred_zone,
2473 nodemask_t *nodemask)
2461{ 2474{
2462 struct zoneref *z; 2475 struct zoneref *z;
2463 struct zone *zone; 2476 struct zone *zone;
2464 2477
2465 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2478 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2479 high_zoneidx, nodemask)
2466 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2480 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2467} 2481}
2468 2482
@@ -2509,7 +2523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2509 alloc_flags |= ALLOC_NO_WATERMARKS; 2523 alloc_flags |= ALLOC_NO_WATERMARKS;
2510 } 2524 }
2511#ifdef CONFIG_CMA 2525#ifdef CONFIG_CMA
2512 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2526 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2513 alloc_flags |= ALLOC_CMA; 2527 alloc_flags |= ALLOC_CMA;
2514#endif 2528#endif
2515 return alloc_flags; 2529 return alloc_flags;
@@ -2533,7 +2547,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2533 unsigned long did_some_progress; 2547 unsigned long did_some_progress;
2534 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2548 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2535 bool deferred_compaction = false; 2549 bool deferred_compaction = false;
2536 bool contended_compaction = false; 2550 int contended_compaction = COMPACT_CONTENDED_NONE;
2537 2551
2538 /* 2552 /*
2539 * In the slowpath, we sanity check order to avoid ever trying to 2553 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2560,7 +2574,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2560 2574
2561restart: 2575restart:
2562 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2576 if (!(gfp_mask & __GFP_NO_KSWAPD))
2563 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2577 wake_all_kswapds(order, zonelist, high_zoneidx,
2578 preferred_zone, nodemask);
2564 2579
2565 /* 2580 /*
2566 * OK, we're below the kswapd watermark and have kicked background 2581 * OK, we're below the kswapd watermark and have kicked background
@@ -2633,20 +2648,40 @@ rebalance:
2633 preferred_zone, 2648 preferred_zone,
2634 classzone_idx, migratetype, 2649 classzone_idx, migratetype,
2635 migration_mode, &contended_compaction, 2650 migration_mode, &contended_compaction,
2636 &deferred_compaction, 2651 &deferred_compaction);
2637 &did_some_progress);
2638 if (page) 2652 if (page)
2639 goto got_pg; 2653 goto got_pg;
2640 2654
2641 /* 2655 /* Checks for THP-specific high-order allocations */
2642 * If compaction is deferred for high-order allocations, it is because 2656 if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
2643 * sync compaction recently failed. In this is the case and the caller 2657 /*
2644 * requested a movable allocation that does not heavily disrupt the 2658 * If compaction is deferred for high-order allocations, it is
2645 * system then fail the allocation instead of entering direct reclaim. 2659 * because sync compaction recently failed. If this is the case
2646 */ 2660 * and the caller requested a THP allocation, we do not want
2647 if ((deferred_compaction || contended_compaction) && 2661 * to heavily disrupt the system, so we fail the allocation
2648 (gfp_mask & __GFP_NO_KSWAPD)) 2662 * instead of entering direct reclaim.
2649 goto nopage; 2663 */
2664 if (deferred_compaction)
2665 goto nopage;
2666
2667 /*
2668 * In all zones where compaction was attempted (and not
2669 * deferred or skipped), lock contention has been detected.
2670 * For THP allocation we do not want to disrupt the others
2671 * so we fallback to base pages instead.
2672 */
2673 if (contended_compaction == COMPACT_CONTENDED_LOCK)
2674 goto nopage;
2675
2676 /*
2677 * If compaction was aborted due to need_resched(), we do not
2678 * want to further increase allocation latency, unless it is
2679 * khugepaged trying to collapse.
2680 */
2681 if (contended_compaction == COMPACT_CONTENDED_SCHED
2682 && !(current->flags & PF_KTHREAD))
2683 goto nopage;
2684 }
2650 2685
2651 /* 2686 /*
2652 * It can become very expensive to allocate transparent hugepages at 2687 * It can become very expensive to allocate transparent hugepages at
@@ -2726,8 +2761,7 @@ rebalance:
2726 preferred_zone, 2761 preferred_zone,
2727 classzone_idx, migratetype, 2762 classzone_idx, migratetype,
2728 migration_mode, &contended_compaction, 2763 migration_mode, &contended_compaction,
2729 &deferred_compaction, 2764 &deferred_compaction);
2730 &did_some_progress);
2731 if (page) 2765 if (page)
2732 goto got_pg; 2766 goto got_pg;
2733 } 2767 }
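For reference, the THP-specific bail-out logic added above can be read as a single predicate. should_fail_thp_compaction() is an invented helper; the conditions and the COMPACT_CONTENDED_* values are the ones introduced by this series:

static bool should_fail_thp_compaction(gfp_t gfp_mask,
				       bool deferred_compaction,
				       int contended_compaction)
{
	if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE)
		return false;			/* not a THP allocation */

	/* sync compaction failed recently: don't disrupt the system */
	if (deferred_compaction)
		return true;

	/* every zone that was tried hit lock contention: use base pages */
	if (contended_compaction == COMPACT_CONTENDED_LOCK)
		return true;

	/* aborted because of need_resched(): only khugepaged keeps going */
	if (contended_compaction == COMPACT_CONTENDED_SCHED &&
	    !(current->flags & PF_KTHREAD))
		return true;

	return false;
}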
@@ -2753,7 +2787,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2753 struct zone *preferred_zone; 2787 struct zone *preferred_zone;
2754 struct zoneref *preferred_zoneref; 2788 struct zoneref *preferred_zoneref;
2755 struct page *page = NULL; 2789 struct page *page = NULL;
2756 int migratetype = allocflags_to_migratetype(gfp_mask); 2790 int migratetype = gfpflags_to_migratetype(gfp_mask);
2757 unsigned int cpuset_mems_cookie; 2791 unsigned int cpuset_mems_cookie;
2758 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2792 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2759 int classzone_idx; 2793 int classzone_idx;
@@ -2775,6 +2809,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2775 if (unlikely(!zonelist->_zonerefs->zone)) 2809 if (unlikely(!zonelist->_zonerefs->zone))
2776 return NULL; 2810 return NULL;
2777 2811
2812 if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
2813 alloc_flags |= ALLOC_CMA;
2814
2778retry_cpuset: 2815retry_cpuset:
2779 cpuset_mems_cookie = read_mems_allowed_begin(); 2816 cpuset_mems_cookie = read_mems_allowed_begin();
2780 2817
@@ -2786,10 +2823,6 @@ retry_cpuset:
2786 goto out; 2823 goto out;
2787 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2824 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2788 2825
2789#ifdef CONFIG_CMA
2790 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2791 alloc_flags |= ALLOC_CMA;
2792#endif
2793 /* First allocation attempt */ 2826 /* First allocation attempt */
2794 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2827 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2795 zonelist, high_zoneidx, alloc_flags, 2828 zonelist, high_zoneidx, alloc_flags,
@@ -3579,68 +3612,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3579 zonelist->_zonerefs[pos].zone_idx = 0; 3612 zonelist->_zonerefs[pos].zone_idx = 0;
3580} 3613}
3581 3614
3615#if defined(CONFIG_64BIT)
3616/*
3617 * Devices that require DMA32/DMA are relatively rare and do not justify a
3618 * penalty to every machine for the sake of that specialised case. Default
3619 * to Node-ordering on 64-bit NUMA machines.
3620 */
3621static int default_zonelist_order(void)
3622{
3623 return ZONELIST_ORDER_NODE;
3624}
3625#else
3626/*
3627 * On 32-bit, the Normal zone needs to be preserved for allocations accessible
3628 * by the kernel. If processes running on node 0 deplete the low memory zone
3629 * then reclaim will occur more frequently, increasing stalls and potentially
3630 * making it easier to OOM if a large percentage of the zone is under writeback or
3631 * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
3632 * Hence, default to zone ordering on 32-bit.
3633 */
3582static int default_zonelist_order(void) 3634static int default_zonelist_order(void)
3583{ 3635{
3584 int nid, zone_type;
3585 unsigned long low_kmem_size, total_size;
3586 struct zone *z;
3587 int average_size;
3588 /*
3589 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3590 * If they are really small and used heavily, the system can fall
3591 * into OOM very easily.
3592 * This function detect ZONE_DMA/DMA32 size and configures zone order.
3593 */
3594 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3595 low_kmem_size = 0;
3596 total_size = 0;
3597 for_each_online_node(nid) {
3598 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3599 z = &NODE_DATA(nid)->node_zones[zone_type];
3600 if (populated_zone(z)) {
3601 if (zone_type < ZONE_NORMAL)
3602 low_kmem_size += z->managed_pages;
3603 total_size += z->managed_pages;
3604 } else if (zone_type == ZONE_NORMAL) {
3605 /*
3606 * If any node has only lowmem, then node order
3607 * is preferred to allow kernel allocations
3608 * locally; otherwise, they can easily infringe
3609 * on other nodes when there is an abundance of
3610 * lowmem available to allocate from.
3611 */
3612 return ZONELIST_ORDER_NODE;
3613 }
3614 }
3615 }
3616 if (!low_kmem_size || /* there are no DMA area. */
3617 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3618 return ZONELIST_ORDER_NODE;
3619 /*
3620 * look into each node's config.
3621 * If there is a node whose DMA/DMA32 memory is very big area on
3622 * local memory, NODE_ORDER may be suitable.
3623 */
3624 average_size = total_size /
3625 (nodes_weight(node_states[N_MEMORY]) + 1);
3626 for_each_online_node(nid) {
3627 low_kmem_size = 0;
3628 total_size = 0;
3629 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3630 z = &NODE_DATA(nid)->node_zones[zone_type];
3631 if (populated_zone(z)) {
3632 if (zone_type < ZONE_NORMAL)
3633 low_kmem_size += z->present_pages;
3634 total_size += z->present_pages;
3635 }
3636 }
3637 if (low_kmem_size &&
3638 total_size > average_size && /* ignore small node */
3639 low_kmem_size > total_size * 70/100)
3640 return ZONELIST_ORDER_NODE;
3641 }
3642 return ZONELIST_ORDER_ZONE; 3636 return ZONELIST_ORDER_ZONE;
3643} 3637}
3638#endif /* CONFIG_64BIT */
3644 3639
3645static void set_zonelist_order(void) 3640static void set_zonelist_order(void)
3646{ 3641{
@@ -5701,9 +5696,8 @@ static void __setup_per_zone_wmarks(void)
5701 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5696 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5702 5697
5703 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5698 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5704 high_wmark_pages(zone) - 5699 high_wmark_pages(zone) - low_wmark_pages(zone) -
5705 low_wmark_pages(zone) - 5700 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
5706 zone_page_state(zone, NR_ALLOC_BATCH));
5707 5701
5708 setup_zone_migrate_reserve(zone); 5702 setup_zone_migrate_reserve(zone);
5709 spin_unlock_irqrestore(&zone->lock, flags); 5703 spin_unlock_irqrestore(&zone->lock, flags);
@@ -6278,8 +6272,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
6278 6272
6279 if (list_empty(&cc->migratepages)) { 6273 if (list_empty(&cc->migratepages)) {
6280 cc->nr_migratepages = 0; 6274 cc->nr_migratepages = 0;
6281 pfn = isolate_migratepages_range(cc->zone, cc, 6275 pfn = isolate_migratepages_range(cc, pfn, end);
6282 pfn, end, true);
6283 if (!pfn) { 6276 if (!pfn) {
6284 ret = -EINTR; 6277 ret = -EINTR;
6285 break; 6278 break;
@@ -6555,97 +6548,3 @@ bool is_free_buddy_page(struct page *page)
6555 return order < MAX_ORDER; 6548 return order < MAX_ORDER;
6556} 6549}
6557#endif 6550#endif
6558
6559static const struct trace_print_flags pageflag_names[] = {
6560 {1UL << PG_locked, "locked" },
6561 {1UL << PG_error, "error" },
6562 {1UL << PG_referenced, "referenced" },
6563 {1UL << PG_uptodate, "uptodate" },
6564 {1UL << PG_dirty, "dirty" },
6565 {1UL << PG_lru, "lru" },
6566 {1UL << PG_active, "active" },
6567 {1UL << PG_slab, "slab" },
6568 {1UL << PG_owner_priv_1, "owner_priv_1" },
6569 {1UL << PG_arch_1, "arch_1" },
6570 {1UL << PG_reserved, "reserved" },
6571 {1UL << PG_private, "private" },
6572 {1UL << PG_private_2, "private_2" },
6573 {1UL << PG_writeback, "writeback" },
6574#ifdef CONFIG_PAGEFLAGS_EXTENDED
6575 {1UL << PG_head, "head" },
6576 {1UL << PG_tail, "tail" },
6577#else
6578 {1UL << PG_compound, "compound" },
6579#endif
6580 {1UL << PG_swapcache, "swapcache" },
6581 {1UL << PG_mappedtodisk, "mappedtodisk" },
6582 {1UL << PG_reclaim, "reclaim" },
6583 {1UL << PG_swapbacked, "swapbacked" },
6584 {1UL << PG_unevictable, "unevictable" },
6585#ifdef CONFIG_MMU
6586 {1UL << PG_mlocked, "mlocked" },
6587#endif
6588#ifdef CONFIG_ARCH_USES_PG_UNCACHED
6589 {1UL << PG_uncached, "uncached" },
6590#endif
6591#ifdef CONFIG_MEMORY_FAILURE
6592 {1UL << PG_hwpoison, "hwpoison" },
6593#endif
6594#ifdef CONFIG_TRANSPARENT_HUGEPAGE
6595 {1UL << PG_compound_lock, "compound_lock" },
6596#endif
6597};
6598
6599static void dump_page_flags(unsigned long flags)
6600{
6601 const char *delim = "";
6602 unsigned long mask;
6603 int i;
6604
6605 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6606
6607 printk(KERN_ALERT "page flags: %#lx(", flags);
6608
6609 /* remove zone id */
6610 flags &= (1UL << NR_PAGEFLAGS) - 1;
6611
6612 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6613
6614 mask = pageflag_names[i].mask;
6615 if ((flags & mask) != mask)
6616 continue;
6617
6618 flags &= ~mask;
6619 printk("%s%s", delim, pageflag_names[i].name);
6620 delim = "|";
6621 }
6622
6623 /* check for left over flags */
6624 if (flags)
6625 printk("%s%#lx", delim, flags);
6626
6627 printk(")\n");
6628}
6629
6630void dump_page_badflags(struct page *page, const char *reason,
6631 unsigned long badflags)
6632{
6633 printk(KERN_ALERT
6634 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6635 page, atomic_read(&page->_count), page_mapcount(page),
6636 page->mapping, page->index);
6637 dump_page_flags(page->flags);
6638 if (reason)
6639 pr_alert("page dumped because: %s\n", reason);
6640 if (page->flags & badflags) {
6641 pr_alert("bad because of flags:\n");
6642 dump_page_flags(page->flags & badflags);
6643 }
6644 mem_cgroup_print_bad_page(page);
6645}
6646
6647void dump_page(struct page *page, const char *reason)
6648{
6649 dump_page_badflags(page, reason, 0);
6650}
6651EXPORT_SYMBOL(dump_page);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2beeabf502c5..ad83195521f2 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -177,7 +177,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
177 if (!walk->mm) 177 if (!walk->mm)
178 return -EINVAL; 178 return -EINVAL;
179 179
180 VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); 180 VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
181 181
182 pgd = pgd_offset(walk->mm, addr); 182 pgd = pgd_offset(walk->mm, addr);
183 do { 183 do {
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 89633fefc6a2..10e3d0b8a86d 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -33,17 +33,14 @@
33 33
34#include <linux/log2.h> 34#include <linux/log2.h>
35 35
36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 36static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
37 int page_start, int page_end)
37{ 38{
38 unsigned int cpu;
39
40 for_each_possible_cpu(cpu)
41 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
42
43 return 0; 39 return 0;
44} 40}
45 41
46static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) 42static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
43 int page_start, int page_end)
47{ 44{
48 /* nada */ 45 /* nada */
49} 46}
@@ -70,6 +67,11 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
70 67
71 chunk->data = pages; 68 chunk->data = pages;
72 chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; 69 chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
70
71 spin_lock_irq(&pcpu_lock);
72 pcpu_chunk_populated(chunk, 0, nr_pages);
73 spin_unlock_irq(&pcpu_lock);
74
73 return chunk; 75 return chunk;
74} 76}
75 77
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 3707c71ae4cd..538998a137d2 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -20,46 +20,25 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
20} 20}
21 21
22/** 22/**
23 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap 23 * pcpu_get_pages - get temp pages array
24 * @chunk: chunk of interest 24 * @chunk: chunk of interest
25 * @bitmapp: output parameter for bitmap
26 * @may_alloc: may allocate the array
27 * 25 *
28 * Returns pointer to array of pointers to struct page and bitmap, 26 * Returns pointer to array of pointers to struct page which can be indexed
29 * both of which can be indexed with pcpu_page_idx(). The returned 27 * with pcpu_page_idx(). Note that there is only one array and accesses
30 * array is cleared to zero and *@bitmapp is copied from 28 * should be serialized by pcpu_alloc_mutex.
31 * @chunk->populated. Note that there is only one array and bitmap
32 * and access exclusion is the caller's responsibility.
33 *
34 * CONTEXT:
35 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
36 * Otherwise, don't care.
37 * 29 *
38 * RETURNS: 30 * RETURNS:
39 * Pointer to temp pages array on success, NULL on failure. 31 * Pointer to temp pages array on success.
40 */ 32 */
41static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, 33static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc)
42 unsigned long **bitmapp,
43 bool may_alloc)
44{ 34{
45 static struct page **pages; 35 static struct page **pages;
46 static unsigned long *bitmap;
47 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); 36 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
48 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
49 sizeof(unsigned long);
50
51 if (!pages || !bitmap) {
52 if (may_alloc && !pages)
53 pages = pcpu_mem_zalloc(pages_size);
54 if (may_alloc && !bitmap)
55 bitmap = pcpu_mem_zalloc(bitmap_size);
56 if (!pages || !bitmap)
57 return NULL;
58 }
59 37
60 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); 38 lockdep_assert_held(&pcpu_alloc_mutex);
61 39
62 *bitmapp = bitmap; 40 if (!pages)
41 pages = pcpu_mem_zalloc(pages_size);
63 return pages; 42 return pages;
64} 43}
65 44
@@ -67,7 +46,6 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
67 * pcpu_free_pages - free pages which were allocated for @chunk 46 * pcpu_free_pages - free pages which were allocated for @chunk
68 * @chunk: chunk pages were allocated for 47 * @chunk: chunk pages were allocated for
69 * @pages: array of pages to be freed, indexed by pcpu_page_idx() 48 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
70 * @populated: populated bitmap
71 * @page_start: page index of the first page to be freed 49 * @page_start: page index of the first page to be freed
72 * @page_end: page index of the last page to be freed + 1 50 * @page_end: page index of the last page to be freed + 1
73 * 51 *
@@ -75,8 +53,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
75 * The pages were allocated for @chunk. 53 * The pages were allocated for @chunk.
76 */ 54 */
77static void pcpu_free_pages(struct pcpu_chunk *chunk, 55static void pcpu_free_pages(struct pcpu_chunk *chunk,
78 struct page **pages, unsigned long *populated, 56 struct page **pages, int page_start, int page_end)
79 int page_start, int page_end)
80{ 57{
81 unsigned int cpu; 58 unsigned int cpu;
82 int i; 59 int i;
@@ -95,7 +72,6 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
95 * pcpu_alloc_pages - allocates pages for @chunk 72 * pcpu_alloc_pages - allocates pages for @chunk
96 * @chunk: target chunk 73 * @chunk: target chunk
97 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() 74 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
98 * @populated: populated bitmap
99 * @page_start: page index of the first page to be allocated 75 * @page_start: page index of the first page to be allocated
100 * @page_end: page index of the last page to be allocated + 1 76 * @page_end: page index of the last page to be allocated + 1
101 * 77 *
@@ -104,11 +80,10 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
104 * content of @pages and will pass it verbatim to pcpu_map_pages(). 80 * content of @pages and will pass it verbatim to pcpu_map_pages().
105 */ 81 */
106static int pcpu_alloc_pages(struct pcpu_chunk *chunk, 82static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
107 struct page **pages, unsigned long *populated, 83 struct page **pages, int page_start, int page_end)
108 int page_start, int page_end)
109{ 84{
110 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; 85 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
111 unsigned int cpu; 86 unsigned int cpu, tcpu;
112 int i; 87 int i;
113 88
114 for_each_possible_cpu(cpu) { 89 for_each_possible_cpu(cpu) {
@@ -116,14 +91,23 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
116 struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; 91 struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
117 92
118 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); 93 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
119 if (!*pagep) { 94 if (!*pagep)
120 pcpu_free_pages(chunk, pages, populated, 95 goto err;
121 page_start, page_end);
122 return -ENOMEM;
123 }
124 } 96 }
125 } 97 }
126 return 0; 98 return 0;
99
100err:
101 while (--i >= page_start)
102 __free_page(pages[pcpu_page_idx(cpu, i)]);
103
104 for_each_possible_cpu(tcpu) {
105 if (tcpu == cpu)
106 break;
107 for (i = page_start; i < page_end; i++)
108 __free_page(pages[pcpu_page_idx(tcpu, i)]);
109 }
110 return -ENOMEM;
127} 111}
128 112
129/** 113/**
@@ -155,7 +139,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
155 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk 139 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
156 * @chunk: chunk of interest 140 * @chunk: chunk of interest
157 * @pages: pages array which can be used to pass information to free 141 * @pages: pages array which can be used to pass information to free
158 * @populated: populated bitmap
159 * @page_start: page index of the first page to unmap 142 * @page_start: page index of the first page to unmap
160 * @page_end: page index of the last page to unmap + 1 143 * @page_end: page index of the last page to unmap + 1
161 * 144 *
@@ -166,8 +149,7 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
166 * proper pre/post flush functions. 149 * proper pre/post flush functions.
167 */ 150 */
168static void pcpu_unmap_pages(struct pcpu_chunk *chunk, 151static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
169 struct page **pages, unsigned long *populated, 152 struct page **pages, int page_start, int page_end)
170 int page_start, int page_end)
171{ 153{
172 unsigned int cpu; 154 unsigned int cpu;
173 int i; 155 int i;
@@ -183,8 +165,6 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
183 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), 165 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
184 page_end - page_start); 166 page_end - page_start);
185 } 167 }
186
187 bitmap_clear(populated, page_start, page_end - page_start);
188} 168}
189 169
190/** 170/**
@@ -219,7 +199,6 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
219 * pcpu_map_pages - map pages into a pcpu_chunk 199 * pcpu_map_pages - map pages into a pcpu_chunk
220 * @chunk: chunk of interest 200 * @chunk: chunk of interest
221 * @pages: pages array containing pages to be mapped 201 * @pages: pages array containing pages to be mapped
222 * @populated: populated bitmap
223 * @page_start: page index of the first page to map 202 * @page_start: page index of the first page to map
224 * @page_end: page index of the last page to map + 1 203 * @page_end: page index of the last page to map + 1
225 * 204 *
@@ -227,13 +206,11 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
227 * caller is responsible for calling pcpu_post_map_flush() after all 206 * caller is responsible for calling pcpu_post_map_flush() after all
228 * mappings are complete. 207 * mappings are complete.
229 * 208 *
230 * This function is responsible for setting corresponding bits in 209 * This function is responsible for setting up whatever is necessary for
231 * @chunk->populated bitmap and whatever is necessary for reverse 210 * reverse lookup (addr -> chunk).
232 * lookup (addr -> chunk).
233 */ 211 */
234static int pcpu_map_pages(struct pcpu_chunk *chunk, 212static int pcpu_map_pages(struct pcpu_chunk *chunk,
235 struct page **pages, unsigned long *populated, 213 struct page **pages, int page_start, int page_end)
236 int page_start, int page_end)
237{ 214{
238 unsigned int cpu, tcpu; 215 unsigned int cpu, tcpu;
239 int i, err; 216 int i, err;
@@ -244,18 +221,12 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk,
244 page_end - page_start); 221 page_end - page_start);
245 if (err < 0) 222 if (err < 0)
246 goto err; 223 goto err;
247 }
248 224
249 /* mapping successful, link chunk and mark populated */ 225 for (i = page_start; i < page_end; i++)
250 for (i = page_start; i < page_end; i++) {
251 for_each_possible_cpu(cpu)
252 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], 226 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
253 chunk); 227 chunk);
254 __set_bit(i, populated);
255 } 228 }
256
257 return 0; 229 return 0;
258
259err: 230err:
260 for_each_possible_cpu(tcpu) { 231 for_each_possible_cpu(tcpu) {
261 if (tcpu == cpu) 232 if (tcpu == cpu)
@@ -263,6 +234,7 @@ err:
263 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), 234 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
264 page_end - page_start); 235 page_end - page_start);
265 } 236 }
237 pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
266 return err; 238 return err;
267} 239}
268 240
@@ -289,123 +261,69 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
289/** 261/**
290 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk 262 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
291 * @chunk: chunk of interest 263 * @chunk: chunk of interest
292 * @off: offset to the area to populate 264 * @page_start: the start page
293 * @size: size of the area to populate in bytes 265 * @page_end: the end page
294 * 266 *
295 * For each cpu, populate and map pages [@page_start,@page_end) into 267 * For each cpu, populate and map pages [@page_start,@page_end) into
296 * @chunk. The area is cleared on return. 268 * @chunk.
297 * 269 *
298 * CONTEXT: 270 * CONTEXT:
299 * pcpu_alloc_mutex, does GFP_KERNEL allocation. 271 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
300 */ 272 */
301static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 273static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
274 int page_start, int page_end)
302{ 275{
303 int page_start = PFN_DOWN(off);
304 int page_end = PFN_UP(off + size);
305 int free_end = page_start, unmap_end = page_start;
306 struct page **pages; 276 struct page **pages;
307 unsigned long *populated;
308 unsigned int cpu;
309 int rs, re, rc;
310
311 /* quick path, check whether all pages are already there */
312 rs = page_start;
313 pcpu_next_pop(chunk, &rs, &re, page_end);
314 if (rs == page_start && re == page_end)
315 goto clear;
316 277
317 /* need to allocate and map pages, this chunk can't be immutable */ 278 pages = pcpu_get_pages(chunk);
318 WARN_ON(chunk->immutable);
319
320 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
321 if (!pages) 279 if (!pages)
322 return -ENOMEM; 280 return -ENOMEM;
323 281
324 /* alloc and map */ 282 if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
325 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 283 return -ENOMEM;
326 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
327 if (rc)
328 goto err_free;
329 free_end = re;
330 }
331 284
332 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 285 if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
333 rc = pcpu_map_pages(chunk, pages, populated, rs, re); 286 pcpu_free_pages(chunk, pages, page_start, page_end);
334 if (rc) 287 return -ENOMEM;
335 goto err_unmap;
336 unmap_end = re;
337 } 288 }
338 pcpu_post_map_flush(chunk, page_start, page_end); 289 pcpu_post_map_flush(chunk, page_start, page_end);
339 290
340 /* commit new bitmap */
341 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
342clear:
343 for_each_possible_cpu(cpu)
344 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
345 return 0; 291 return 0;
346
347err_unmap:
348 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
349 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
350 pcpu_unmap_pages(chunk, pages, populated, rs, re);
351 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
352err_free:
353 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
354 pcpu_free_pages(chunk, pages, populated, rs, re);
355 return rc;
356} 292}
357 293
358/** 294/**
359 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 295 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
360 * @chunk: chunk to depopulate 296 * @chunk: chunk to depopulate
361 * @off: offset to the area to depopulate 297 * @page_start: the start page
362 * @size: size of the area to depopulate in bytes 298 * @page_end: the end page
363 * 299 *
364 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 300 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
365 * from @chunk. If @flush is true, vcache is flushed before unmapping 301 * from @chunk.
366 * and tlb after.
367 * 302 *
368 * CONTEXT: 303 * CONTEXT:
369 * pcpu_alloc_mutex. 304 * pcpu_alloc_mutex.
370 */ 305 */
371static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) 306static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
307 int page_start, int page_end)
372{ 308{
373 int page_start = PFN_DOWN(off);
374 int page_end = PFN_UP(off + size);
375 struct page **pages; 309 struct page **pages;
376 unsigned long *populated;
377 int rs, re;
378
379 /* quick path, check whether it's empty already */
380 rs = page_start;
381 pcpu_next_unpop(chunk, &rs, &re, page_end);
382 if (rs == page_start && re == page_end)
383 return;
384
385 /* immutable chunks can't be depopulated */
386 WARN_ON(chunk->immutable);
387 310
388 /* 311 /*
389 * If control reaches here, there must have been at least one 312 * If control reaches here, there must have been at least one
390 * successful population attempt so the temp pages array must 313 * successful population attempt so the temp pages array must
391 * be available now. 314 * be available now.
392 */ 315 */
393 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); 316 pages = pcpu_get_pages(chunk);
394 BUG_ON(!pages); 317 BUG_ON(!pages);
395 318
396 /* unmap and free */ 319 /* unmap and free */
397 pcpu_pre_unmap_flush(chunk, page_start, page_end); 320 pcpu_pre_unmap_flush(chunk, page_start, page_end);
398 321
399 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) 322 pcpu_unmap_pages(chunk, pages, page_start, page_end);
400 pcpu_unmap_pages(chunk, pages, populated, rs, re);
401 323
402 /* no need to flush tlb, vmalloc will handle it lazily */ 324 /* no need to flush tlb, vmalloc will handle it lazily */
403 325
404 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) 326 pcpu_free_pages(chunk, pages, page_start, page_end);
405 pcpu_free_pages(chunk, pages, populated, rs, re);
406
407 /* commit new bitmap */
408 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
409} 327}
410 328
411static struct pcpu_chunk *pcpu_create_chunk(void) 329static struct pcpu_chunk *pcpu_create_chunk(void)
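With the bitmap and memset responsibilities moved out, pcpu_populate_chunk() above is a plain allocate, map and flush over a caller-chosen page range, and the caller records the result afterwards under pcpu_lock. A sketch of the expected calling pattern; populate_range() is an invented wrapper, the actual callers (pcpu_alloc() and the balance worker) are outside this section, and error handling is trimmed:

static int populate_range(struct pcpu_chunk *chunk, int rs, int re)
{
	int ret;

	lockdep_assert_held(&pcpu_alloc_mutex);

	ret = pcpu_populate_chunk(chunk, rs, re);	/* may sleep, GFP_KERNEL */

	spin_lock_irq(&pcpu_lock);
	if (!ret)
		pcpu_chunk_populated(chunk, rs, re);	/* bitmap + pop counters */
	spin_unlock_irq(&pcpu_lock);

	return ret;
}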
diff --git a/mm/percpu.c b/mm/percpu.c
index 2139e30a4b44..014bab65e0ff 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -76,6 +76,10 @@
76 76
77#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 77#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
78#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 78#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
79#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
80#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
81#define PCPU_EMPTY_POP_PAGES_LOW 2
82#define PCPU_EMPTY_POP_PAGES_HIGH 4
79 83
80#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
81/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 85/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
@@ -102,12 +106,16 @@ struct pcpu_chunk {
102 int free_size; /* free bytes in the chunk */ 106 int free_size; /* free bytes in the chunk */
103 int contig_hint; /* max contiguous size hint */ 107 int contig_hint; /* max contiguous size hint */
104 void *base_addr; /* base address of this chunk */ 108 void *base_addr; /* base address of this chunk */
109
105 int map_used; /* # of map entries used before the sentry */ 110 int map_used; /* # of map entries used before the sentry */
106 int map_alloc; /* # of map entries allocated */ 111 int map_alloc; /* # of map entries allocated */
107 int *map; /* allocation map */ 112 int *map; /* allocation map */
113 struct work_struct map_extend_work;/* async ->map[] extension */
114
108 void *data; /* chunk data */ 115 void *data; /* chunk data */
109 int first_free; /* no free below this */ 116 int first_free; /* no free below this */
110 bool immutable; /* no [de]population allowed */ 117 bool immutable; /* no [de]population allowed */
118 int nr_populated; /* # of populated pages */
111 unsigned long populated[]; /* populated bitmap */ 119 unsigned long populated[]; /* populated bitmap */
112}; 120};
113 121
@@ -151,38 +159,33 @@ static struct pcpu_chunk *pcpu_first_chunk;
151static struct pcpu_chunk *pcpu_reserved_chunk; 159static struct pcpu_chunk *pcpu_reserved_chunk;
152static int pcpu_reserved_chunk_limit; 160static int pcpu_reserved_chunk_limit;
153 161
162static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
163static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
164
165static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
166
154/* 167/*
155 * Synchronization rules. 168 * The number of empty populated pages, protected by pcpu_lock. The
156 * 169 * reserved chunk doesn't contribute to the count.
157 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
158 * protects allocation/reclaim paths, chunks, populated bitmap and
159 * vmalloc mapping. The latter is a spinlock and protects the index
160 * data structures - chunk slots, chunks and area maps in chunks.
161 *
162 * During allocation, pcpu_alloc_mutex is kept locked all the time and
163 * pcpu_lock is grabbed and released as necessary. All actual memory
164 * allocations are done using GFP_KERNEL with pcpu_lock released. In
165 * general, percpu memory can't be allocated with irq off but
166 * irqsave/restore are still used in alloc path so that it can be used
167 * from early init path - sched_init() specifically.
168 *
169 * Free path accesses and alters only the index data structures, so it
170 * can be safely called from atomic context. When memory needs to be
171 * returned to the system, free path schedules reclaim_work which
172 * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
173 * reclaimed, release both locks and frees the chunks. Note that it's
174 * necessary to grab both locks to remove a chunk from circulation as
175 * allocation path might be referencing the chunk with only
176 * pcpu_alloc_mutex locked.
177 */ 170 */
178static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ 171static int pcpu_nr_empty_pop_pages;
179static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
180 172
181static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ 173/*
174 * Balance work is used to populate or destroy chunks asynchronously. We
175 * try to keep the number of populated free pages between
176 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
177 * empty chunk.
178 */
179static void pcpu_balance_workfn(struct work_struct *work);
180static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
181static bool pcpu_async_enabled __read_mostly;
182static bool pcpu_atomic_alloc_failed;
182 183
183/* reclaim work to release fully free chunks, scheduled from free path */ 184static void pcpu_schedule_balance_work(void)
184static void pcpu_reclaim(struct work_struct *work); 185{
185static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); 186 if (pcpu_async_enabled)
187 schedule_work(&pcpu_balance_work);
188}
186 189
187static bool pcpu_addr_in_first_chunk(void *addr) 190static bool pcpu_addr_in_first_chunk(void *addr)
188{ 191{
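The single-purpose reclaim work becomes a balance worker with two jobs: keep at most one fully empty chunk around, and keep pcpu_nr_empty_pop_pages between the LOW and HIGH watermarks so atomic allocations usually find pre-populated pages. pcpu_balance_workfn() itself is not part of this section, so the following is only an outline with stand-in helpers (free_excess_empty_chunks(), populate_one_range()):

static void balance_outline(void)
{
	mutex_lock(&pcpu_alloc_mutex);

	free_excess_empty_chunks();		/* stand-in: keep one spare chunk */

	for (;;) {
		bool enough;

		spin_lock_irq(&pcpu_lock);
		enough = pcpu_nr_empty_pop_pages >= PCPU_EMPTY_POP_PAGES_HIGH;
		spin_unlock_irq(&pcpu_lock);
		if (enough)
			break;

		/* stand-in: may sleep; the real worker also bails out when
		 * no chunk can currently be populated */
		populate_one_range();
	}

	mutex_unlock(&pcpu_alloc_mutex);
}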
@@ -315,6 +318,38 @@ static void pcpu_mem_free(void *ptr, size_t size)
315} 318}
316 319
317/** 320/**
321 * pcpu_count_occupied_pages - count the number of pages an area occupies
322 * @chunk: chunk of interest
323 * @i: index of the area in question
324 *
325 * Count the number of pages chunk's @i'th area occupies. When the area's
326 * start and/or end address isn't aligned to page boundary, the straddled
327 * page is included in the count iff the rest of the page is free.
328 */
329static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
330{
331 int off = chunk->map[i] & ~1;
332 int end = chunk->map[i + 1] & ~1;
333
334 if (!PAGE_ALIGNED(off) && i > 0) {
335 int prev = chunk->map[i - 1];
336
337 if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
338 off = round_down(off, PAGE_SIZE);
339 }
340
341 if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
342 int next = chunk->map[i + 1];
343 int nend = chunk->map[i + 2] & ~1;
344
345 if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
346 end = round_up(end, PAGE_SIZE);
347 }
348
349 return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
350}
351
352/**
318 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot 353 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
319 * @chunk: chunk of interest 354 * @chunk: chunk of interest
320 * @oslot: the previous slot it was on 355 * @oslot: the previous slot it was on
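A worked example of the straddle rule described above, as standalone C with 4 KiB pages: the partially covered first and last pages of an area only count when the neighbouring free space completes them. The map contents are invented; the arithmetic mirrors pcpu_count_occupied_pages():

#include <stdio.h>

#define PAGE_SZ		4096UL
#define PFN_UP(x)	(((x) + PAGE_SZ - 1) / PAGE_SZ)
#define PFN_DOWN(x)	((x) / PAGE_SZ)

/*
 * map[] mimics chunk->map: each entry is an offset, bit 0 set = in use.
 * Area i spans [map[i] & ~1, map[i+1] & ~1).
 */
static int count_occupied(const int *map, int map_used, int i)
{
	unsigned long off = map[i] & ~1;
	unsigned long end = map[i + 1] & ~1;
	long n;

	/* first page: count it if the free area before reaches its start */
	if (off % PAGE_SZ && i > 0) {
		unsigned long prev = map[i - 1];

		if (!(prev & 1) && prev <= off - off % PAGE_SZ)
			off -= off % PAGE_SZ;
	}

	/* last page: count it if the free area after reaches its end */
	if (end % PAGE_SZ && i + 1 < map_used) {
		unsigned long next = map[i + 1];
		unsigned long nend = map[i + 2] & ~1;

		if (!(next & 1) && nend >= end + (PAGE_SZ - end % PAGE_SZ))
			end += PAGE_SZ - end % PAGE_SZ;
	}

	n = (long)PFN_DOWN(end) - (long)PFN_UP(off);
	return n > 0 ? (int)n : 0;
}

int main(void)
{
	/* free [0,1024), used [1024,9216), free [9216,12288), end sentinel */
	int map[] = { 0, 1024 | 1, 9216, 12288 | 1 };

	/* both straddled pages are completed by free space: prints 3 */
	printf("%d\n", count_occupied(map, 3, 1));
	return 0;
}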
@@ -342,9 +377,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
342/** 377/**
343 * pcpu_need_to_extend - determine whether chunk area map needs to be extended 378 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
344 * @chunk: chunk of interest 379 * @chunk: chunk of interest
380 * @is_atomic: the allocation context
345 * 381 *
346 * Determine whether area map of @chunk needs to be extended to 382 * Determine whether area map of @chunk needs to be extended. If
347 * accommodate a new allocation. 383 * @is_atomic, only the amount necessary for a new allocation is
384 * considered; however, async extension is scheduled if the amount left is
385 * low. If !@is_atomic, it aims for more empty space. Combined, this
386 * ensures that the map is likely to have enough available space to
387 * accommodate atomic allocations which can't extend maps directly.
348 * 388 *
349 * CONTEXT: 389 * CONTEXT:
350 * pcpu_lock. 390 * pcpu_lock.
@@ -353,15 +393,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
353 * New target map allocation length if extension is necessary, 0 393 * New target map allocation length if extension is necessary, 0
354 * otherwise. 394 * otherwise.
355 */ 395 */
356static int pcpu_need_to_extend(struct pcpu_chunk *chunk) 396static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
357{ 397{
358 int new_alloc; 398 int margin, new_alloc;
399
400 if (is_atomic) {
401 margin = 3;
402
403 if (chunk->map_alloc <
404 chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
405 pcpu_async_enabled)
406 schedule_work(&chunk->map_extend_work);
407 } else {
408 margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
409 }
359 410
360 if (chunk->map_alloc >= chunk->map_used + 3) 411 if (chunk->map_alloc >= chunk->map_used + margin)
361 return 0; 412 return 0;
362 413
363 new_alloc = PCPU_DFL_MAP_ALLOC; 414 new_alloc = PCPU_DFL_MAP_ALLOC;
364 while (new_alloc < chunk->map_used + 3) 415 while (new_alloc < chunk->map_used + margin)
365 new_alloc *= 2; 416 new_alloc *= 2;
366 417
367 return new_alloc; 418 return new_alloc;
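In short: atomic callers cannot extend the map themselves (that takes a GFP_KERNEL allocation), so they only insist on the minimal 3 spare slots and rely on the async worker once fewer than PCPU_ATOMIC_MAP_MARGIN_LOW entries remain, while sleepable callers grow the map eagerly up to a PCPU_ATOMIC_MAP_MARGIN_HIGH margin. A standalone restatement of the sizing rule with a couple of worked values (hypothetical helper, constants copied from the hunk):

#define PCPU_DFL_MAP_ALLOC		16
#define PCPU_ATOMIC_MAP_MARGIN_HIGH	64

/* Returns 0 if the map already fits, else the new power-of-two map_alloc. */
static int map_extend_target(int map_alloc, int map_used, int is_atomic)
{
	int margin = is_atomic ? 3 : PCPU_ATOMIC_MAP_MARGIN_HIGH;
	int new_alloc = PCPU_DFL_MAP_ALLOC;

	if (map_alloc >= map_used + margin)
		return 0;

	while (new_alloc < map_used + margin)
		new_alloc *= 2;
	return new_alloc;
}

/*
 * map_extend_target(128, 120, 1) == 0    -- atomic: 3 spare slots are enough
 * map_extend_target(128, 120, 0) == 256  -- sleepable: wants 120 + 64 slots
 */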
@@ -418,11 +469,76 @@ out_unlock:
418 return 0; 469 return 0;
419} 470}
420 471
472static void pcpu_map_extend_workfn(struct work_struct *work)
473{
474 struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
475 map_extend_work);
476 int new_alloc;
477
478 spin_lock_irq(&pcpu_lock);
479 new_alloc = pcpu_need_to_extend(chunk, false);
480 spin_unlock_irq(&pcpu_lock);
481
482 if (new_alloc)
483 pcpu_extend_area_map(chunk, new_alloc);
484}
485
486/**
487 * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
488 * @chunk: chunk the candidate area belongs to
489 * @off: the offset to the start of the candidate area
490 * @this_size: the size of the candidate area
491 * @size: the size of the target allocation
492 * @align: the alignment of the target allocation
493 * @pop_only: only allocate from already populated region
494 *
495 * We're trying to allocate @size bytes aligned at @align. @chunk's area
496 * at @off sized @this_size is a candidate. This function determines
497 * whether the target allocation fits in the candidate area and returns the
498 * number of bytes to pad after @off. If the target area doesn't fit, -1
499 * is returned.
500 *
501 * If @pop_only is %true, this function only considers the already
502 * populated part of the candidate area.
503 */
504static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
505 int size, int align, bool pop_only)
506{
507 int cand_off = off;
508
509 while (true) {
510 int head = ALIGN(cand_off, align) - off;
511 int page_start, page_end, rs, re;
512
513 if (this_size < head + size)
514 return -1;
515
516 if (!pop_only)
517 return head;
518
519 /*
520 * If the first unpopulated page is beyond the end of the
521 * allocation, the whole allocation is populated;
522 * otherwise, retry from the end of the unpopulated area.
523 */
524 page_start = PFN_DOWN(head + off);
525 page_end = PFN_UP(head + off + size);
526
527 rs = page_start;
528 pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
529 if (rs >= page_end)
530 return head;
531 cand_off = re * PAGE_SIZE;
532 }
533}
534
421/** 535/**
422 * pcpu_alloc_area - allocate area from a pcpu_chunk 536 * pcpu_alloc_area - allocate area from a pcpu_chunk
423 * @chunk: chunk of interest 537 * @chunk: chunk of interest
424 * @size: wanted size in bytes 538 * @size: wanted size in bytes
425 * @align: wanted align 539 * @align: wanted align
540 * @pop_only: allocate only from the populated area
541 * @occ_pages_p: out param for the number of pages the area occupies
426 * 542 *
427 * Try to allocate @size bytes area aligned at @align from @chunk. 543 * Try to allocate @size bytes area aligned at @align from @chunk.
428 * Note that this function only allocates the offset. It doesn't 544 * Note that this function only allocates the offset. It doesn't
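To see what @pop_only buys: when the candidate free area straddles unpopulated pages, the search does not give up, it skips to the end of the unpopulated run and retries the alignment there, so restricted allocations are only placed on pages that already have backing. A standalone toy model of that retry rule (offsets, sizes and helper names invented):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SZ 4096

/* pop[] says which pages of the candidate area are populated. */
static int fit_in_area(const bool *pop, int npages, int off, int this_size,
		       int size, int align)
{
	int cand_off = off;

	for (;;) {
		int head = (cand_off + align - 1) / align * align - off;
		int ps, pe, p;

		if (this_size < head + size)
			return -1;		/* can never fit */

		ps = (off + head) / PAGE_SZ;
		pe = (off + head + size + PAGE_SZ - 1) / PAGE_SZ;

		for (p = ps; p < pe && p < npages && pop[p]; p++)
			;
		if (p >= pe)
			return head;		/* lands on populated pages only */

		while (p < npages && !pop[p])	/* skip the unpopulated run */
			p++;
		cand_off = p * PAGE_SZ;
	}
}

int main(void)
{
	/* pages 0-1 populated, page 2 not, pages 3-5 populated */
	bool pop[6] = { true, true, false, true, true, true };

	/* a 12 KiB request in a 24 KiB free area at offset 0 is pushed past
	 * the hole: prints "head = 12288" (page 3). */
	printf("head = %d\n",
	       fit_in_area(pop, 6, 0, 6 * PAGE_SZ, 3 * PAGE_SZ, PAGE_SZ));
	return 0;
}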
@@ -437,7 +553,8 @@ out_unlock:
437 * Allocated offset in @chunk on success, -1 if no matching area is 553 * Allocated offset in @chunk on success, -1 if no matching area is
438 * found. 554 * found.
439 */ 555 */
440static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) 556static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
557 bool pop_only, int *occ_pages_p)
441{ 558{
442 int oslot = pcpu_chunk_slot(chunk); 559 int oslot = pcpu_chunk_slot(chunk);
443 int max_contig = 0; 560 int max_contig = 0;
@@ -453,11 +570,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
453 if (off & 1) 570 if (off & 1)
454 continue; 571 continue;
455 572
456 /* extra for alignment requirement */
457 head = ALIGN(off, align) - off;
458
459 this_size = (p[1] & ~1) - off; 573 this_size = (p[1] & ~1) - off;
460 if (this_size < head + size) { 574
575 head = pcpu_fit_in_area(chunk, off, this_size, size, align,
576 pop_only);
577 if (head < 0) {
461 if (!seen_free) { 578 if (!seen_free) {
462 chunk->first_free = i; 579 chunk->first_free = i;
463 seen_free = true; 580 seen_free = true;
@@ -526,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
526 chunk->free_size -= size; 643 chunk->free_size -= size;
527 *p |= 1; 644 *p |= 1;
528 645
646 *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
529 pcpu_chunk_relocate(chunk, oslot); 647 pcpu_chunk_relocate(chunk, oslot);
530 return off; 648 return off;
531 } 649 }
@@ -541,6 +659,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
541 * pcpu_free_area - free area to a pcpu_chunk 659 * pcpu_free_area - free area to a pcpu_chunk
542 * @chunk: chunk of interest 660 * @chunk: chunk of interest
543 * @freeme: offset of area to free 661 * @freeme: offset of area to free
662 * @occ_pages_p: out param for the number of pages the area occupies
544 * 663 *
545 * Free area starting from @freeme to @chunk. Note that this function 664 * Free area starting from @freeme to @chunk. Note that this function
546 * only modifies the allocation map. It doesn't depopulate or unmap 665 * only modifies the allocation map. It doesn't depopulate or unmap
@@ -549,7 +668,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
549 * CONTEXT: 668 * CONTEXT:
550 * pcpu_lock. 669 * pcpu_lock.
551 */ 670 */
552static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) 671static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
672 int *occ_pages_p)
553{ 673{
554 int oslot = pcpu_chunk_slot(chunk); 674 int oslot = pcpu_chunk_slot(chunk);
555 int off = 0; 675 int off = 0;
@@ -580,6 +700,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
580 *p = off &= ~1; 700 *p = off &= ~1;
581 chunk->free_size += (p[1] & ~1) - off; 701 chunk->free_size += (p[1] & ~1) - off;
582 702
703 *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
704
583 /* merge with next? */ 705 /* merge with next? */
584 if (!(p[1] & 1)) 706 if (!(p[1] & 1))
585 to_free++; 707 to_free++;
@@ -620,6 +742,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
620 chunk->map_used = 1; 742 chunk->map_used = 1;
621 743
622 INIT_LIST_HEAD(&chunk->list); 744 INIT_LIST_HEAD(&chunk->list);
745 INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
623 chunk->free_size = pcpu_unit_size; 746 chunk->free_size = pcpu_unit_size;
624 chunk->contig_hint = pcpu_unit_size; 747 chunk->contig_hint = pcpu_unit_size;
625 748
@@ -634,6 +757,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
634 pcpu_mem_free(chunk, pcpu_chunk_struct_size); 757 pcpu_mem_free(chunk, pcpu_chunk_struct_size);
635} 758}
636 759
760/**
761 * pcpu_chunk_populated - post-population bookkeeping
762 * @chunk: pcpu_chunk which got populated
763 * @page_start: the start page
764 * @page_end: the end page
765 *
766 * Pages in [@page_start,@page_end) have been populated to @chunk. Update
767 * the bookkeeping information accordingly. Must be called after each
768 * successful population.
769 */
770static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
771 int page_start, int page_end)
772{
773 int nr = page_end - page_start;
774
775 lockdep_assert_held(&pcpu_lock);
776
777 bitmap_set(chunk->populated, page_start, nr);
778 chunk->nr_populated += nr;
779 pcpu_nr_empty_pop_pages += nr;
780}
781
782/**
783 * pcpu_chunk_depopulated - post-depopulation bookkeeping
784 * @chunk: pcpu_chunk which got depopulated
785 * @page_start: the start page
786 * @page_end: the end page
787 *
788 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
789 * Update the bookkeeping information accordingly. Must be called after
790 * each successful depopulation.
791 */
792static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
793 int page_start, int page_end)
794{
795 int nr = page_end - page_start;
796
797 lockdep_assert_held(&pcpu_lock);
798
799 bitmap_clear(chunk->populated, page_start, nr);
800 chunk->nr_populated -= nr;
801 pcpu_nr_empty_pop_pages -= nr;
802}
803
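Together with the occ_pages accounting added to pcpu_alloc() and pcpu_free_area() below, these two helpers keep one global figure consistent: the number of pages that are populated but not backing a live allocation. A compact model of that bookkeeping, with illustrative names rather than the kernel's:

#include <stdio.h>

/* counters mirroring pcpu_nr_empty_pop_pages and chunk->nr_populated */
static int nr_empty_pop_pages;
static int nr_populated;

static void chunk_populated(int nr)   { nr_populated += nr; nr_empty_pop_pages += nr; }
static void chunk_depopulated(int nr) { nr_populated -= nr; nr_empty_pop_pages -= nr; }

/* an allocation occupying @occ pages consumes that many empty populated pages */
static void area_allocated(int occ)   { nr_empty_pop_pages -= occ; }
static void area_freed(int occ)       { nr_empty_pop_pages += occ; }

int main(void)
{
    chunk_populated(4);     /* pcpu_chunk_populated(chunk, 0, 4) */
    area_allocated(2);      /* pcpu_alloc(): occ_pages == 2 */
    area_freed(2);          /* free_percpu(): occ_pages == 2 */
    chunk_depopulated(4);   /* balance work reclaiming the free chunk */
    printf("empty populated pages: %d\n", nr_empty_pop_pages);   /* 0 */
    return 0;
}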
637/* 804/*
638 * Chunk management implementation. 805 * Chunk management implementation.
639 * 806 *
@@ -695,21 +862,23 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
695 * @size: size of area to allocate in bytes 862 * @size: size of area to allocate in bytes
696 * @align: alignment of area (max PAGE_SIZE) 863 * @align: alignment of area (max PAGE_SIZE)
697 * @reserved: allocate from the reserved chunk if available 864 * @reserved: allocate from the reserved chunk if available
865 * @gfp: allocation flags
698 * 866 *
699 * Allocate percpu area of @size bytes aligned at @align. 867 * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't
700 * 868 * contain %GFP_KERNEL, the allocation is atomic.
701 * CONTEXT:
702 * Does GFP_KERNEL allocation.
703 * 869 *
704 * RETURNS: 870 * RETURNS:
705 * Percpu pointer to the allocated area on success, NULL on failure. 871 * Percpu pointer to the allocated area on success, NULL on failure.
706 */ 872 */
707static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) 873static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
874 gfp_t gfp)
708{ 875{
709 static int warn_limit = 10; 876 static int warn_limit = 10;
710 struct pcpu_chunk *chunk; 877 struct pcpu_chunk *chunk;
711 const char *err; 878 const char *err;
712 int slot, off, new_alloc; 879 bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
880 int occ_pages = 0;
881 int slot, off, new_alloc, cpu, ret;
713 unsigned long flags; 882 unsigned long flags;
714 void __percpu *ptr; 883 void __percpu *ptr;
715 884
@@ -728,7 +897,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
728 return NULL; 897 return NULL;
729 } 898 }
730 899
731 mutex_lock(&pcpu_alloc_mutex);
732 spin_lock_irqsave(&pcpu_lock, flags); 900 spin_lock_irqsave(&pcpu_lock, flags);
733 901
734 /* serve reserved allocations from the reserved chunk if available */ 902 /* serve reserved allocations from the reserved chunk if available */
@@ -740,16 +908,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
740 goto fail_unlock; 908 goto fail_unlock;
741 } 909 }
742 910
743 while ((new_alloc = pcpu_need_to_extend(chunk))) { 911 while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
744 spin_unlock_irqrestore(&pcpu_lock, flags); 912 spin_unlock_irqrestore(&pcpu_lock, flags);
745 if (pcpu_extend_area_map(chunk, new_alloc) < 0) { 913 if (is_atomic ||
914 pcpu_extend_area_map(chunk, new_alloc) < 0) {
746 err = "failed to extend area map of reserved chunk"; 915 err = "failed to extend area map of reserved chunk";
747 goto fail_unlock_mutex; 916 goto fail;
748 } 917 }
749 spin_lock_irqsave(&pcpu_lock, flags); 918 spin_lock_irqsave(&pcpu_lock, flags);
750 } 919 }
751 920
752 off = pcpu_alloc_area(chunk, size, align); 921 off = pcpu_alloc_area(chunk, size, align, is_atomic,
922 &occ_pages);
753 if (off >= 0) 923 if (off >= 0)
754 goto area_found; 924 goto area_found;
755 925
@@ -764,13 +934,15 @@ restart:
764 if (size > chunk->contig_hint) 934 if (size > chunk->contig_hint)
765 continue; 935 continue;
766 936
767 new_alloc = pcpu_need_to_extend(chunk); 937 new_alloc = pcpu_need_to_extend(chunk, is_atomic);
768 if (new_alloc) { 938 if (new_alloc) {
939 if (is_atomic)
940 continue;
769 spin_unlock_irqrestore(&pcpu_lock, flags); 941 spin_unlock_irqrestore(&pcpu_lock, flags);
770 if (pcpu_extend_area_map(chunk, 942 if (pcpu_extend_area_map(chunk,
771 new_alloc) < 0) { 943 new_alloc) < 0) {
772 err = "failed to extend area map"; 944 err = "failed to extend area map";
773 goto fail_unlock_mutex; 945 goto fail;
774 } 946 }
775 spin_lock_irqsave(&pcpu_lock, flags); 947 spin_lock_irqsave(&pcpu_lock, flags);
776 /* 948 /*
@@ -780,74 +952,134 @@ restart:
780 goto restart; 952 goto restart;
781 } 953 }
782 954
783 off = pcpu_alloc_area(chunk, size, align); 955 off = pcpu_alloc_area(chunk, size, align, is_atomic,
956 &occ_pages);
784 if (off >= 0) 957 if (off >= 0)
785 goto area_found; 958 goto area_found;
786 } 959 }
787 } 960 }
788 961
789 /* hmmm... no space left, create a new chunk */
790 spin_unlock_irqrestore(&pcpu_lock, flags); 962 spin_unlock_irqrestore(&pcpu_lock, flags);
791 963
792 chunk = pcpu_create_chunk(); 964 /*
793 if (!chunk) { 965 * No space left. Create a new chunk. We don't want multiple
794 err = "failed to allocate new chunk"; 966 * tasks to create chunks simultaneously. Serialize and create iff
795 goto fail_unlock_mutex; 967 * there's still no empty chunk after grabbing the mutex.
968 */
969 if (is_atomic)
970 goto fail;
971
972 mutex_lock(&pcpu_alloc_mutex);
973
974 if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
975 chunk = pcpu_create_chunk();
976 if (!chunk) {
977 mutex_unlock(&pcpu_alloc_mutex);
978 err = "failed to allocate new chunk";
979 goto fail;
980 }
981
982 spin_lock_irqsave(&pcpu_lock, flags);
983 pcpu_chunk_relocate(chunk, -1);
984 } else {
985 spin_lock_irqsave(&pcpu_lock, flags);
796 } 986 }
797 987
798 spin_lock_irqsave(&pcpu_lock, flags); 988 mutex_unlock(&pcpu_alloc_mutex);
799 pcpu_chunk_relocate(chunk, -1);
800 goto restart; 989 goto restart;
801 990
802area_found: 991area_found:
803 spin_unlock_irqrestore(&pcpu_lock, flags); 992 spin_unlock_irqrestore(&pcpu_lock, flags);
804 993
805 /* populate, map and clear the area */ 994 /* populate if not all pages are already there */
806 if (pcpu_populate_chunk(chunk, off, size)) { 995 if (!is_atomic) {
807 spin_lock_irqsave(&pcpu_lock, flags); 996 int page_start, page_end, rs, re;
808 pcpu_free_area(chunk, off); 997
809 err = "failed to populate"; 998 mutex_lock(&pcpu_alloc_mutex);
810 goto fail_unlock; 999
1000 page_start = PFN_DOWN(off);
1001 page_end = PFN_UP(off + size);
1002
1003 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
1004 WARN_ON(chunk->immutable);
1005
1006 ret = pcpu_populate_chunk(chunk, rs, re);
1007
1008 spin_lock_irqsave(&pcpu_lock, flags);
1009 if (ret) {
1010 mutex_unlock(&pcpu_alloc_mutex);
1011 pcpu_free_area(chunk, off, &occ_pages);
1012 err = "failed to populate";
1013 goto fail_unlock;
1014 }
1015 pcpu_chunk_populated(chunk, rs, re);
1016 spin_unlock_irqrestore(&pcpu_lock, flags);
1017 }
1018
1019 mutex_unlock(&pcpu_alloc_mutex);
811 } 1020 }
812 1021
813 mutex_unlock(&pcpu_alloc_mutex); 1022 if (chunk != pcpu_reserved_chunk)
1023 pcpu_nr_empty_pop_pages -= occ_pages;
1024
1025 if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
1026 pcpu_schedule_balance_work();
1027
1028 /* clear the areas and return address relative to base address */
1029 for_each_possible_cpu(cpu)
1030 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
814 1031
815 /* return address relative to base address */
816 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); 1032 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
817 kmemleak_alloc_percpu(ptr, size); 1033 kmemleak_alloc_percpu(ptr, size);
818 return ptr; 1034 return ptr;
819 1035
820fail_unlock: 1036fail_unlock:
821 spin_unlock_irqrestore(&pcpu_lock, flags); 1037 spin_unlock_irqrestore(&pcpu_lock, flags);
822fail_unlock_mutex: 1038fail:
823 mutex_unlock(&pcpu_alloc_mutex); 1039 if (!is_atomic && warn_limit) {
824 if (warn_limit) { 1040 pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
825 pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " 1041 size, align, is_atomic, err);
826 "%s\n", size, align, err);
827 dump_stack(); 1042 dump_stack();
828 if (!--warn_limit) 1043 if (!--warn_limit)
829 pr_info("PERCPU: limit reached, disable warning\n"); 1044 pr_info("PERCPU: limit reached, disable warning\n");
830 } 1045 }
1046 if (is_atomic) {
 1047 /* see the flag handling in pcpu_balance_workfn() */
1048 pcpu_atomic_alloc_failed = true;
1049 pcpu_schedule_balance_work();
1050 }
831 return NULL; 1051 return NULL;
832} 1052}
833 1053
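The atomic path is chosen purely from the gfp mask: anything short of the full GFP_KERNEL bit set (GFP_NOWAIT, GFP_ATOMIC, even GFP_NOFS or GFP_NOIO) makes is_atomic true, which skips area-map extension, chunk creation and page population and only carves space out of already-populated pages. A standalone illustration of the test, with made-up flag values standing in for <linux/gfp.h>:

#include <stdio.h>

/* illustrative bit values only; the real masks live in <linux/gfp.h> */
#define __GFP_WAIT 0x10u
#define __GFP_IO   0x40u
#define __GFP_FS   0x80u
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_NOWAIT 0x0u
#define GFP_NOFS   (__GFP_WAIT | __GFP_IO)

static int is_atomic(unsigned int gfp)
{
    /* same test as pcpu_alloc(): only a full GFP_KERNEL may block */
    return (gfp & GFP_KERNEL) != GFP_KERNEL;
}

int main(void)
{
    printf("GFP_KERNEL -> %d\n", is_atomic(GFP_KERNEL)); /* 0: may sleep   */
    printf("GFP_NOWAIT -> %d\n", is_atomic(GFP_NOWAIT)); /* 1: atomic path */
    printf("GFP_NOFS   -> %d\n", is_atomic(GFP_NOFS));   /* 1: atomic path */
    return 0;
}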
834/** 1054/**
835 * __alloc_percpu - allocate dynamic percpu area 1055 * __alloc_percpu_gfp - allocate dynamic percpu area
836 * @size: size of area to allocate in bytes 1056 * @size: size of area to allocate in bytes
837 * @align: alignment of area (max PAGE_SIZE) 1057 * @align: alignment of area (max PAGE_SIZE)
1058 * @gfp: allocation flags
838 * 1059 *
839 * Allocate zero-filled percpu area of @size bytes aligned at @align. 1060 * Allocate zero-filled percpu area of @size bytes aligned at @align. If
840 * Might sleep. Might trigger writeouts. 1061 * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
841 * 1062 * be called from any context but is a lot more likely to fail.
842 * CONTEXT:
843 * Does GFP_KERNEL allocation.
844 * 1063 *
845 * RETURNS: 1064 * RETURNS:
846 * Percpu pointer to the allocated area on success, NULL on failure. 1065 * Percpu pointer to the allocated area on success, NULL on failure.
847 */ 1066 */
1067void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
1068{
1069 return pcpu_alloc(size, align, false, gfp);
1070}
1071EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
1072
1073/**
1074 * __alloc_percpu - allocate dynamic percpu area
1075 * @size: size of area to allocate in bytes
1076 * @align: alignment of area (max PAGE_SIZE)
1077 *
1078 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
1079 */
848void __percpu *__alloc_percpu(size_t size, size_t align) 1080void __percpu *__alloc_percpu(size_t size, size_t align)
849{ 1081{
850 return pcpu_alloc(size, align, false); 1082 return pcpu_alloc(size, align, false, GFP_KERNEL);
851} 1083}
852EXPORT_SYMBOL_GPL(__alloc_percpu); 1084EXPORT_SYMBOL_GPL(__alloc_percpu);
853 1085
@@ -869,44 +1101,121 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
869 */ 1101 */
870void __percpu *__alloc_reserved_percpu(size_t size, size_t align) 1102void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
871{ 1103{
872 return pcpu_alloc(size, align, true); 1104 return pcpu_alloc(size, align, true, GFP_KERNEL);
873} 1105}
874 1106
875/** 1107/**
876 * pcpu_reclaim - reclaim fully free chunks, workqueue function 1108 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
877 * @work: unused 1109 * @work: unused
878 * 1110 *
879 * Reclaim all fully free chunks except for the first one. 1111 * Reclaim all fully free chunks except for the first one.
880 *
881 * CONTEXT:
882 * workqueue context.
883 */ 1112 */
884static void pcpu_reclaim(struct work_struct *work) 1113static void pcpu_balance_workfn(struct work_struct *work)
885{ 1114{
886 LIST_HEAD(todo); 1115 LIST_HEAD(to_free);
887 struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; 1116 struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
888 struct pcpu_chunk *chunk, *next; 1117 struct pcpu_chunk *chunk, *next;
1118 int slot, nr_to_pop, ret;
889 1119
1120 /*
1121 * There's no reason to keep around multiple unused chunks and VM
1122 * areas can be scarce. Destroy all free chunks except for one.
1123 */
890 mutex_lock(&pcpu_alloc_mutex); 1124 mutex_lock(&pcpu_alloc_mutex);
891 spin_lock_irq(&pcpu_lock); 1125 spin_lock_irq(&pcpu_lock);
892 1126
893 list_for_each_entry_safe(chunk, next, head, list) { 1127 list_for_each_entry_safe(chunk, next, free_head, list) {
894 WARN_ON(chunk->immutable); 1128 WARN_ON(chunk->immutable);
895 1129
896 /* spare the first one */ 1130 /* spare the first one */
897 if (chunk == list_first_entry(head, struct pcpu_chunk, list)) 1131 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
898 continue; 1132 continue;
899 1133
900 list_move(&chunk->list, &todo); 1134 list_move(&chunk->list, &to_free);
901 } 1135 }
902 1136
903 spin_unlock_irq(&pcpu_lock); 1137 spin_unlock_irq(&pcpu_lock);
904 1138
905 list_for_each_entry_safe(chunk, next, &todo, list) { 1139 list_for_each_entry_safe(chunk, next, &to_free, list) {
906 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); 1140 int rs, re;
1141
1142 pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1143 pcpu_depopulate_chunk(chunk, rs, re);
1144 spin_lock_irq(&pcpu_lock);
1145 pcpu_chunk_depopulated(chunk, rs, re);
1146 spin_unlock_irq(&pcpu_lock);
1147 }
907 pcpu_destroy_chunk(chunk); 1148 pcpu_destroy_chunk(chunk);
908 } 1149 }
909 1150
1151 /*
1152 * Ensure there are certain number of free populated pages for
1153 * atomic allocs. Fill up from the most packed so that atomic
1154 * allocs don't increase fragmentation. If atomic allocation
1155 * failed previously, always populate the maximum amount. This
1156 * should prevent atomic allocs larger than PAGE_SIZE from keeping
1157 * failing indefinitely; however, large atomic allocs are not
1158 * something we support properly and can be highly unreliable and
1159 * inefficient.
1160 */
1161retry_pop:
1162 if (pcpu_atomic_alloc_failed) {
1163 nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
1164 /* best effort anyway, don't worry about synchronization */
1165 pcpu_atomic_alloc_failed = false;
1166 } else {
1167 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
1168 pcpu_nr_empty_pop_pages,
1169 0, PCPU_EMPTY_POP_PAGES_HIGH);
1170 }
1171
1172 for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
1173 int nr_unpop = 0, rs, re;
1174
1175 if (!nr_to_pop)
1176 break;
1177
1178 spin_lock_irq(&pcpu_lock);
1179 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
1180 nr_unpop = pcpu_unit_pages - chunk->nr_populated;
1181 if (nr_unpop)
1182 break;
1183 }
1184 spin_unlock_irq(&pcpu_lock);
1185
1186 if (!nr_unpop)
1187 continue;
1188
1189 /* @chunk can't go away while pcpu_alloc_mutex is held */
1190 pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1191 int nr = min(re - rs, nr_to_pop);
1192
1193 ret = pcpu_populate_chunk(chunk, rs, rs + nr);
1194 if (!ret) {
1195 nr_to_pop -= nr;
1196 spin_lock_irq(&pcpu_lock);
1197 pcpu_chunk_populated(chunk, rs, rs + nr);
1198 spin_unlock_irq(&pcpu_lock);
1199 } else {
1200 nr_to_pop = 0;
1201 }
1202
1203 if (!nr_to_pop)
1204 break;
1205 }
1206 }
1207
1208 if (nr_to_pop) {
1209 /* ran out of chunks to populate, create a new one and retry */
1210 chunk = pcpu_create_chunk();
1211 if (chunk) {
1212 spin_lock_irq(&pcpu_lock);
1213 pcpu_chunk_relocate(chunk, -1);
1214 spin_unlock_irq(&pcpu_lock);
1215 goto retry_pop;
1216 }
1217 }
1218
910 mutex_unlock(&pcpu_alloc_mutex); 1219 mutex_unlock(&pcpu_alloc_mutex);
911} 1220}
912 1221
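The population target is a plain clamp against the high watermark, and the LOW watermark is what made pcpu_alloc() schedule this worker in the first place. With assumed watermark values (the real constants are defined near the top of percpu.c), the arithmetic looks like this:

#include <stdio.h>

/* assumed watermarks; the real constants live near the top of percpu.c */
#define PCPU_EMPTY_POP_PAGES_LOW  2
#define PCPU_EMPTY_POP_PAGES_HIGH 4

static int clampi(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

/* the nr_to_pop computation from pcpu_balance_workfn() */
static int pages_to_populate(int empty_pop_pages, int atomic_alloc_failed)
{
    if (atomic_alloc_failed)
        return PCPU_EMPTY_POP_PAGES_HIGH;            /* go straight to the maximum */
    return clampi(PCPU_EMPTY_POP_PAGES_HIGH - empty_pop_pages,
                  0, PCPU_EMPTY_POP_PAGES_HIGH);
}

int main(void)
{
    printf("%d\n", pages_to_populate(1, 0));  /* 3: top back up to HIGH    */
    printf("%d\n", pages_to_populate(9, 0));  /* 0: already above HIGH     */
    printf("%d\n", pages_to_populate(3, 1));  /* 4: an atomic alloc failed */
    return 0;
}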
@@ -924,7 +1233,7 @@ void free_percpu(void __percpu *ptr)
924 void *addr; 1233 void *addr;
925 struct pcpu_chunk *chunk; 1234 struct pcpu_chunk *chunk;
926 unsigned long flags; 1235 unsigned long flags;
927 int off; 1236 int off, occ_pages;
928 1237
929 if (!ptr) 1238 if (!ptr)
930 return; 1239 return;
@@ -938,7 +1247,10 @@ void free_percpu(void __percpu *ptr)
938 chunk = pcpu_chunk_addr_search(addr); 1247 chunk = pcpu_chunk_addr_search(addr);
939 off = addr - chunk->base_addr; 1248 off = addr - chunk->base_addr;
940 1249
941 pcpu_free_area(chunk, off); 1250 pcpu_free_area(chunk, off, &occ_pages);
1251
1252 if (chunk != pcpu_reserved_chunk)
1253 pcpu_nr_empty_pop_pages += occ_pages;
942 1254
943 /* if there are more than one fully free chunks, wake up grim reaper */ 1255 /* if there are more than one fully free chunks, wake up grim reaper */
944 if (chunk->free_size == pcpu_unit_size) { 1256 if (chunk->free_size == pcpu_unit_size) {
@@ -946,7 +1258,7 @@ void free_percpu(void __percpu *ptr)
946 1258
947 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) 1259 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
948 if (pos != chunk) { 1260 if (pos != chunk) {
949 schedule_work(&pcpu_reclaim_work); 1261 pcpu_schedule_balance_work();
950 break; 1262 break;
951 } 1263 }
952 } 1264 }
@@ -1336,11 +1648,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1336 */ 1648 */
1337 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1649 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1338 INIT_LIST_HEAD(&schunk->list); 1650 INIT_LIST_HEAD(&schunk->list);
1651 INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
1339 schunk->base_addr = base_addr; 1652 schunk->base_addr = base_addr;
1340 schunk->map = smap; 1653 schunk->map = smap;
1341 schunk->map_alloc = ARRAY_SIZE(smap); 1654 schunk->map_alloc = ARRAY_SIZE(smap);
1342 schunk->immutable = true; 1655 schunk->immutable = true;
1343 bitmap_fill(schunk->populated, pcpu_unit_pages); 1656 bitmap_fill(schunk->populated, pcpu_unit_pages);
1657 schunk->nr_populated = pcpu_unit_pages;
1344 1658
1345 if (ai->reserved_size) { 1659 if (ai->reserved_size) {
1346 schunk->free_size = ai->reserved_size; 1660 schunk->free_size = ai->reserved_size;
@@ -1364,11 +1678,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1364 if (dyn_size) { 1678 if (dyn_size) {
1365 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1679 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1366 INIT_LIST_HEAD(&dchunk->list); 1680 INIT_LIST_HEAD(&dchunk->list);
1681 INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
1367 dchunk->base_addr = base_addr; 1682 dchunk->base_addr = base_addr;
1368 dchunk->map = dmap; 1683 dchunk->map = dmap;
1369 dchunk->map_alloc = ARRAY_SIZE(dmap); 1684 dchunk->map_alloc = ARRAY_SIZE(dmap);
1370 dchunk->immutable = true; 1685 dchunk->immutable = true;
1371 bitmap_fill(dchunk->populated, pcpu_unit_pages); 1686 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1687 dchunk->nr_populated = pcpu_unit_pages;
1372 1688
1373 dchunk->contig_hint = dchunk->free_size = dyn_size; 1689 dchunk->contig_hint = dchunk->free_size = dyn_size;
1374 dchunk->map[0] = 1; 1690 dchunk->map[0] = 1;
@@ -1379,6 +1695,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1379 1695
1380 /* link the first chunk in */ 1696 /* link the first chunk in */
1381 pcpu_first_chunk = dchunk ?: schunk; 1697 pcpu_first_chunk = dchunk ?: schunk;
1698 pcpu_nr_empty_pop_pages +=
1699 pcpu_count_occupied_pages(pcpu_first_chunk, 1);
1382 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1700 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1383 1701
1384 /* we're done */ 1702 /* we're done */
@@ -1965,3 +2283,15 @@ void __init percpu_init_late(void)
1965 spin_unlock_irqrestore(&pcpu_lock, flags); 2283 spin_unlock_irqrestore(&pcpu_lock, flags);
1966 } 2284 }
1967} 2285}
2286
2287/*
2288 * Percpu allocator is initialized early during boot when neither slab nor
2289 * workqueue is available. Plug async management until everything is up
2290 * and running.
2291 */
2292static int __init percpu_enable_async(void)
2293{
2294 pcpu_async_enabled = true;
2295 return 0;
2296}
2297subsys_initcall(percpu_enable_async);
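This is the usual boot-ordering trick: a flag that starts false keeps the scheduling helpers from touching workqueues during early boot, and a subsys_initcall flips it once the infrastructure exists. A generic kernel-context sketch of the pattern (not this file's actual helpers; the names below are invented):

/* sketch of the gating pattern */
static bool async_enabled;          /* stays false until subsys_initcall time */

static void maybe_schedule(struct work_struct *work)
{
    if (async_enabled)              /* silently a no-op during early boot */
        schedule_work(work);
}

static int __init enable_async(void)
{
    async_enabled = true;           /* workqueues are up from here on */
    return 0;
}
subsys_initcall(enable_async);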
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index a8b919925934..dfb79e028ecb 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -195,7 +195,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
195 pmd_t entry = *pmdp; 195 pmd_t entry = *pmdp;
196 if (pmd_numa(entry)) 196 if (pmd_numa(entry))
197 entry = pmd_mknonnuma(entry); 197 entry = pmd_mknonnuma(entry);
198 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); 198 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
199 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 199 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
200} 200}
201#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 201#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
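The one-word fix matters because the function first copies the pmd, conditionally clears the NUMA bit in that copy, and must then transform the corrected copy; re-reading *pmdp throws the pmd_mknonnuma() adjustment away. The same read-modify-write pitfall reduced to plain C with invented flag bits:

#include <stdio.h>

#define FLAG_NUMA    0x1u
#define FLAG_PRESENT 0x2u

static unsigned int clear_numa(unsigned int e)   { return e & ~FLAG_NUMA; }
static unsigned int mknotpresent(unsigned int e) { return e & ~FLAG_PRESENT; }

int main(void)
{
    unsigned int pmd = FLAG_NUMA | FLAG_PRESENT;   /* stand-in for *pmdp */
    unsigned int entry = pmd;

    if (entry & FLAG_NUMA)
        entry = clear_numa(entry);                 /* pmd_mknonnuma(entry) */

    /* buggy: transforms the stale value, losing the clear_numa() above */
    unsigned int buggy = mknotpresent(pmd);
    /* fixed: transforms the adjusted local copy */
    unsigned int fixed = mknotpresent(entry);

    printf("buggy=%#x fixed=%#x\n", buggy, fixed); /* 0x1 vs 0 */
    return 0;
}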
diff --git a/mm/rmap.c b/mm/rmap.c
index 3e8491c504f8..116a5053415b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -527,7 +527,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
527 unsigned long address = __vma_address(page, vma); 527 unsigned long address = __vma_address(page, vma);
528 528
529 /* page should be within @vma mapping range */ 529 /* page should be within @vma mapping range */
530 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 530 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
531 531
532 return address; 532 return address;
533} 533}
@@ -897,7 +897,7 @@ void page_move_anon_rmap(struct page *page,
897 struct anon_vma *anon_vma = vma->anon_vma; 897 struct anon_vma *anon_vma = vma->anon_vma;
898 898
899 VM_BUG_ON_PAGE(!PageLocked(page), page); 899 VM_BUG_ON_PAGE(!PageLocked(page), page);
900 VM_BUG_ON(!anon_vma); 900 VM_BUG_ON_VMA(!anon_vma, vma);
901 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); 901 VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
902 902
903 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 903 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -1024,7 +1024,7 @@ void do_page_add_anon_rmap(struct page *page,
1024void page_add_new_anon_rmap(struct page *page, 1024void page_add_new_anon_rmap(struct page *page,
1025 struct vm_area_struct *vma, unsigned long address) 1025 struct vm_area_struct *vma, unsigned long address)
1026{ 1026{
1027 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1027 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
1028 SetPageSwapBacked(page); 1028 SetPageSwapBacked(page);
1029 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 1029 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
1030 if (PageTransHuge(page)) 1030 if (PageTransHuge(page))
@@ -1355,7 +1355,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1355 continue; /* don't unmap */ 1355 continue; /* don't unmap */
1356 } 1356 }
1357 1357
1358 if (ptep_clear_flush_young_notify(vma, address, pte)) 1358 /*
1359 * No need for _notify because we're within an
1360 * mmu_notifier_invalidate_range_ {start|end} scope.
1361 */
1362 if (ptep_clear_flush_young(vma, address, pte))
1359 continue; 1363 continue;
1360 1364
1361 /* Nuke the page table entry. */ 1365 /* Nuke the page table entry. */
@@ -1666,7 +1670,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1666 * structure at mapping cannot be freed and reused yet, 1670 * structure at mapping cannot be freed and reused yet,
1667 * so we can safely take mapping->i_mmap_mutex. 1671 * so we can safely take mapping->i_mmap_mutex.
1668 */ 1672 */
1669 VM_BUG_ON(!PageLocked(page)); 1673 VM_BUG_ON_PAGE(!PageLocked(page), page);
1670 1674
1671 if (!mapping) 1675 if (!mapping)
1672 return ret; 1676 return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index 0e5fb225007c..cd6fc7590e54 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2367,8 +2367,10 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
2367 2367
2368 if (new_dentry->d_inode) { 2368 if (new_dentry->d_inode) {
2369 (void) shmem_unlink(new_dir, new_dentry); 2369 (void) shmem_unlink(new_dir, new_dentry);
2370 if (they_are_dirs) 2370 if (they_are_dirs) {
2371 drop_nlink(new_dentry->d_inode);
2371 drop_nlink(old_dir); 2372 drop_nlink(old_dir);
2373 }
2372 } else if (they_are_dirs) { 2374 } else if (they_are_dirs) {
2373 drop_nlink(old_dir); 2375 drop_nlink(old_dir);
2374 inc_nlink(new_dir); 2376 inc_nlink(new_dir);
@@ -2993,7 +2995,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2993#endif 2995#endif
2994 2996
2995 spin_lock_init(&sbinfo->stat_lock); 2997 spin_lock_init(&sbinfo->stat_lock);
2996 if (percpu_counter_init(&sbinfo->used_blocks, 0)) 2998 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
2997 goto failed; 2999 goto failed;
2998 sbinfo->free_inodes = sbinfo->max_inodes; 3000 sbinfo->free_inodes = sbinfo->max_inodes;
2999 3001
@@ -3075,7 +3077,9 @@ static const struct address_space_operations shmem_aops = {
3075 .write_begin = shmem_write_begin, 3077 .write_begin = shmem_write_begin,
3076 .write_end = shmem_write_end, 3078 .write_end = shmem_write_end,
3077#endif 3079#endif
3080#ifdef CONFIG_MIGRATION
3078 .migratepage = migrate_page, 3081 .migratepage = migrate_page,
3082#endif
3079 .error_remove_page = generic_error_remove_page, 3083 .error_remove_page = generic_error_remove_page,
3080}; 3084};
3081 3085
diff --git a/mm/slab.c b/mm/slab.c
index a467b308c682..154aac8411c5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -237,11 +237,10 @@ struct arraycache_init {
237/* 237/*
238 * Need this for bootstrapping a per node allocator. 238 * Need this for bootstrapping a per node allocator.
239 */ 239 */
240#define NUM_INIT_LISTS (3 * MAX_NUMNODES) 240#define NUM_INIT_LISTS (2 * MAX_NUMNODES)
241static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; 241static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
242#define CACHE_CACHE 0 242#define CACHE_CACHE 0
243#define SIZE_AC MAX_NUMNODES 243#define SIZE_NODE (MAX_NUMNODES)
244#define SIZE_NODE (2 * MAX_NUMNODES)
245 244
246static int drain_freelist(struct kmem_cache *cache, 245static int drain_freelist(struct kmem_cache *cache,
247 struct kmem_cache_node *n, int tofree); 246 struct kmem_cache_node *n, int tofree);
@@ -253,7 +252,6 @@ static void cache_reap(struct work_struct *unused);
253 252
254static int slab_early_init = 1; 253static int slab_early_init = 1;
255 254
256#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init))
257#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) 255#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
258 256
259static void kmem_cache_node_init(struct kmem_cache_node *parent) 257static void kmem_cache_node_init(struct kmem_cache_node *parent)
@@ -458,9 +456,6 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
458 return reciprocal_divide(offset, cache->reciprocal_buffer_size); 456 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
459} 457}
460 458
461static struct arraycache_init initarray_generic =
462 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
463
464/* internal cache of cache description objs */ 459/* internal cache of cache description objs */
465static struct kmem_cache kmem_cache_boot = { 460static struct kmem_cache kmem_cache_boot = {
466 .batchcount = 1, 461 .batchcount = 1,
@@ -476,7 +471,7 @@ static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
476 471
477static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 472static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
478{ 473{
479 return cachep->array[smp_processor_id()]; 474 return this_cpu_ptr(cachep->cpu_cache);
480} 475}
481 476
482static size_t calculate_freelist_size(int nr_objs, size_t align) 477static size_t calculate_freelist_size(int nr_objs, size_t align)
@@ -785,8 +780,8 @@ static inline void *ac_get_obj(struct kmem_cache *cachep,
785 return objp; 780 return objp;
786} 781}
787 782
788static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, 783static noinline void *__ac_put_obj(struct kmem_cache *cachep,
789 void *objp) 784 struct array_cache *ac, void *objp)
790{ 785{
791 if (unlikely(pfmemalloc_active)) { 786 if (unlikely(pfmemalloc_active)) {
792 /* Some pfmemalloc slabs exist, check if this is one */ 787 /* Some pfmemalloc slabs exist, check if this is one */
@@ -984,46 +979,50 @@ static void drain_alien_cache(struct kmem_cache *cachep,
984 } 979 }
985} 980}
986 981
987static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 982static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
983 int node, int page_node)
988{ 984{
989 int nodeid = page_to_nid(virt_to_page(objp));
990 struct kmem_cache_node *n; 985 struct kmem_cache_node *n;
991 struct alien_cache *alien = NULL; 986 struct alien_cache *alien = NULL;
992 struct array_cache *ac; 987 struct array_cache *ac;
993 int node;
994 LIST_HEAD(list); 988 LIST_HEAD(list);
995 989
996 node = numa_mem_id();
997
998 /*
999 * Make sure we are not freeing a object from another node to the array
1000 * cache on this cpu.
1001 */
1002 if (likely(nodeid == node))
1003 return 0;
1004
1005 n = get_node(cachep, node); 990 n = get_node(cachep, node);
1006 STATS_INC_NODEFREES(cachep); 991 STATS_INC_NODEFREES(cachep);
1007 if (n->alien && n->alien[nodeid]) { 992 if (n->alien && n->alien[page_node]) {
1008 alien = n->alien[nodeid]; 993 alien = n->alien[page_node];
1009 ac = &alien->ac; 994 ac = &alien->ac;
1010 spin_lock(&alien->lock); 995 spin_lock(&alien->lock);
1011 if (unlikely(ac->avail == ac->limit)) { 996 if (unlikely(ac->avail == ac->limit)) {
1012 STATS_INC_ACOVERFLOW(cachep); 997 STATS_INC_ACOVERFLOW(cachep);
1013 __drain_alien_cache(cachep, ac, nodeid, &list); 998 __drain_alien_cache(cachep, ac, page_node, &list);
1014 } 999 }
1015 ac_put_obj(cachep, ac, objp); 1000 ac_put_obj(cachep, ac, objp);
1016 spin_unlock(&alien->lock); 1001 spin_unlock(&alien->lock);
1017 slabs_destroy(cachep, &list); 1002 slabs_destroy(cachep, &list);
1018 } else { 1003 } else {
1019 n = get_node(cachep, nodeid); 1004 n = get_node(cachep, page_node);
1020 spin_lock(&n->list_lock); 1005 spin_lock(&n->list_lock);
1021 free_block(cachep, &objp, 1, nodeid, &list); 1006 free_block(cachep, &objp, 1, page_node, &list);
1022 spin_unlock(&n->list_lock); 1007 spin_unlock(&n->list_lock);
1023 slabs_destroy(cachep, &list); 1008 slabs_destroy(cachep, &list);
1024 } 1009 }
1025 return 1; 1010 return 1;
1026} 1011}
1012
1013static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1014{
1015 int page_node = page_to_nid(virt_to_page(objp));
1016 int node = numa_mem_id();
1017 /*
 1018 * Make sure we are not freeing an object from another node to the array
1019 * cache on this cpu.
1020 */
1021 if (likely(node == page_node))
1022 return 0;
1023
1024 return __cache_free_alien(cachep, objp, node, page_node);
1025}
1027#endif 1026#endif
1028 1027
1029/* 1028/*
@@ -1092,24 +1091,25 @@ static void cpuup_canceled(long cpu)
1092 struct alien_cache **alien; 1091 struct alien_cache **alien;
1093 LIST_HEAD(list); 1092 LIST_HEAD(list);
1094 1093
1095 /* cpu is dead; no one can alloc from it. */
1096 nc = cachep->array[cpu];
1097 cachep->array[cpu] = NULL;
1098 n = get_node(cachep, node); 1094 n = get_node(cachep, node);
1099
1100 if (!n) 1095 if (!n)
1101 goto free_array_cache; 1096 continue;
1102 1097
1103 spin_lock_irq(&n->list_lock); 1098 spin_lock_irq(&n->list_lock);
1104 1099
1105 /* Free limit for this kmem_cache_node */ 1100 /* Free limit for this kmem_cache_node */
1106 n->free_limit -= cachep->batchcount; 1101 n->free_limit -= cachep->batchcount;
1107 if (nc) 1102
1103 /* cpu is dead; no one can alloc from it. */
1104 nc = per_cpu_ptr(cachep->cpu_cache, cpu);
1105 if (nc) {
1108 free_block(cachep, nc->entry, nc->avail, node, &list); 1106 free_block(cachep, nc->entry, nc->avail, node, &list);
1107 nc->avail = 0;
1108 }
1109 1109
1110 if (!cpumask_empty(mask)) { 1110 if (!cpumask_empty(mask)) {
1111 spin_unlock_irq(&n->list_lock); 1111 spin_unlock_irq(&n->list_lock);
1112 goto free_array_cache; 1112 goto free_slab;
1113 } 1113 }
1114 1114
1115 shared = n->shared; 1115 shared = n->shared;
@@ -1129,9 +1129,9 @@ static void cpuup_canceled(long cpu)
1129 drain_alien_cache(cachep, alien); 1129 drain_alien_cache(cachep, alien);
1130 free_alien_cache(alien); 1130 free_alien_cache(alien);
1131 } 1131 }
1132free_array_cache: 1132
1133free_slab:
1133 slabs_destroy(cachep, &list); 1134 slabs_destroy(cachep, &list);
1134 kfree(nc);
1135 } 1135 }
1136 /* 1136 /*
1137 * In the previous loop, all the objects were freed to 1137 * In the previous loop, all the objects were freed to
@@ -1168,32 +1168,23 @@ static int cpuup_prepare(long cpu)
1168 * array caches 1168 * array caches
1169 */ 1169 */
1170 list_for_each_entry(cachep, &slab_caches, list) { 1170 list_for_each_entry(cachep, &slab_caches, list) {
1171 struct array_cache *nc;
1172 struct array_cache *shared = NULL; 1171 struct array_cache *shared = NULL;
1173 struct alien_cache **alien = NULL; 1172 struct alien_cache **alien = NULL;
1174 1173
1175 nc = alloc_arraycache(node, cachep->limit,
1176 cachep->batchcount, GFP_KERNEL);
1177 if (!nc)
1178 goto bad;
1179 if (cachep->shared) { 1174 if (cachep->shared) {
1180 shared = alloc_arraycache(node, 1175 shared = alloc_arraycache(node,
1181 cachep->shared * cachep->batchcount, 1176 cachep->shared * cachep->batchcount,
1182 0xbaadf00d, GFP_KERNEL); 1177 0xbaadf00d, GFP_KERNEL);
1183 if (!shared) { 1178 if (!shared)
1184 kfree(nc);
1185 goto bad; 1179 goto bad;
1186 }
1187 } 1180 }
1188 if (use_alien_caches) { 1181 if (use_alien_caches) {
1189 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); 1182 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1190 if (!alien) { 1183 if (!alien) {
1191 kfree(shared); 1184 kfree(shared);
1192 kfree(nc);
1193 goto bad; 1185 goto bad;
1194 } 1186 }
1195 } 1187 }
1196 cachep->array[cpu] = nc;
1197 n = get_node(cachep, node); 1188 n = get_node(cachep, node);
1198 BUG_ON(!n); 1189 BUG_ON(!n);
1199 1190
@@ -1385,15 +1376,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
1385} 1376}
1386 1377
1387/* 1378/*
1388 * The memory after the last cpu cache pointer is used for the
1389 * the node pointer.
1390 */
1391static void setup_node_pointer(struct kmem_cache *cachep)
1392{
1393 cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
1394}
1395
1396/*
1397 * Initialisation. Called after the page allocator have been initialised and 1379 * Initialisation. Called after the page allocator have been initialised and
1398 * before smp_init(). 1380 * before smp_init().
1399 */ 1381 */
@@ -1404,7 +1386,6 @@ void __init kmem_cache_init(void)
1404 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < 1386 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
1405 sizeof(struct rcu_head)); 1387 sizeof(struct rcu_head));
1406 kmem_cache = &kmem_cache_boot; 1388 kmem_cache = &kmem_cache_boot;
1407 setup_node_pointer(kmem_cache);
1408 1389
1409 if (num_possible_nodes() == 1) 1390 if (num_possible_nodes() == 1)
1410 use_alien_caches = 0; 1391 use_alien_caches = 0;
@@ -1412,8 +1393,6 @@ void __init kmem_cache_init(void)
1412 for (i = 0; i < NUM_INIT_LISTS; i++) 1393 for (i = 0; i < NUM_INIT_LISTS; i++)
1413 kmem_cache_node_init(&init_kmem_cache_node[i]); 1394 kmem_cache_node_init(&init_kmem_cache_node[i]);
1414 1395
1415 set_up_node(kmem_cache, CACHE_CACHE);
1416
1417 /* 1396 /*
1418 * Fragmentation resistance on low memory - only use bigger 1397 * Fragmentation resistance on low memory - only use bigger
1419 * page orders on machines with more than 32MB of memory if 1398 * page orders on machines with more than 32MB of memory if
@@ -1448,49 +1427,22 @@ void __init kmem_cache_init(void)
1448 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1427 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1449 */ 1428 */
1450 create_boot_cache(kmem_cache, "kmem_cache", 1429 create_boot_cache(kmem_cache, "kmem_cache",
1451 offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1430 offsetof(struct kmem_cache, node) +
1452 nr_node_ids * sizeof(struct kmem_cache_node *), 1431 nr_node_ids * sizeof(struct kmem_cache_node *),
1453 SLAB_HWCACHE_ALIGN); 1432 SLAB_HWCACHE_ALIGN);
1454 list_add(&kmem_cache->list, &slab_caches); 1433 list_add(&kmem_cache->list, &slab_caches);
1455 1434 slab_state = PARTIAL;
1456 /* 2+3) create the kmalloc caches */
1457 1435
1458 /* 1436 /*
1459 * Initialize the caches that provide memory for the array cache and the 1437 * Initialize the caches that provide memory for the kmem_cache_node
1460 * kmem_cache_node structures first. Without this, further allocations will 1438 * structures first. Without this, further allocations will bug.
1461 * bug.
1462 */ 1439 */
1463 1440 kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
1464 kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
1465 kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
1466
1467 if (INDEX_AC != INDEX_NODE)
1468 kmalloc_caches[INDEX_NODE] =
1469 create_kmalloc_cache("kmalloc-node",
1470 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); 1441 kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
1442 slab_state = PARTIAL_NODE;
1471 1443
1472 slab_early_init = 0; 1444 slab_early_init = 0;
1473 1445
1474 /* 4) Replace the bootstrap head arrays */
1475 {
1476 struct array_cache *ptr;
1477
1478 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1479
1480 memcpy(ptr, cpu_cache_get(kmem_cache),
1481 sizeof(struct arraycache_init));
1482
1483 kmem_cache->array[smp_processor_id()] = ptr;
1484
1485 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1486
1487 BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
1488 != &initarray_generic.cache);
1489 memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
1490 sizeof(struct arraycache_init));
1491
1492 kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
1493 }
1494 /* 5) Replace the bootstrap kmem_cache_node */ 1446 /* 5) Replace the bootstrap kmem_cache_node */
1495 { 1447 {
1496 int nid; 1448 int nid;
@@ -1498,13 +1450,8 @@ void __init kmem_cache_init(void)
1498 for_each_online_node(nid) { 1450 for_each_online_node(nid) {
1499 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); 1451 init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
1500 1452
1501 init_list(kmalloc_caches[INDEX_AC], 1453 init_list(kmalloc_caches[INDEX_NODE],
1502 &init_kmem_cache_node[SIZE_AC + nid], nid);
1503
1504 if (INDEX_AC != INDEX_NODE) {
1505 init_list(kmalloc_caches[INDEX_NODE],
1506 &init_kmem_cache_node[SIZE_NODE + nid], nid); 1454 &init_kmem_cache_node[SIZE_NODE + nid], nid);
1507 }
1508 } 1455 }
1509 } 1456 }
1510 1457
@@ -2037,56 +1984,53 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2037 return left_over; 1984 return left_over;
2038} 1985}
2039 1986
1987static struct array_cache __percpu *alloc_kmem_cache_cpus(
1988 struct kmem_cache *cachep, int entries, int batchcount)
1989{
1990 int cpu;
1991 size_t size;
1992 struct array_cache __percpu *cpu_cache;
1993
1994 size = sizeof(void *) * entries + sizeof(struct array_cache);
1995 cpu_cache = __alloc_percpu(size, 0);
1996
1997 if (!cpu_cache)
1998 return NULL;
1999
2000 for_each_possible_cpu(cpu) {
2001 init_arraycache(per_cpu_ptr(cpu_cache, cpu),
2002 entries, batchcount);
2003 }
2004
2005 return cpu_cache;
2006}
2007
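Each per-CPU copy is sized as the struct array_cache header plus `entries` trailing object pointers, the usual flexible-array layout. The sizing arithmetic on its own, with a cut-down stand-in for the real structure:

#include <stdio.h>
#include <stdlib.h>

/* cut-down stand-in for struct array_cache: header plus trailing pointers */
struct acache {
    unsigned int avail, limit, batchcount, touched;
    void *entry[];                         /* flexible array member */
};

static struct acache *alloc_acache(int entries)
{
    size_t size = sizeof(struct acache) + sizeof(void *) * entries;
    struct acache *ac = calloc(1, size);   /* one copy; the kernel does this per CPU */

    if (ac)
        ac->limit = entries;
    return ac;
}

int main(void)
{
    struct acache *ac = alloc_acache(120);

    printf("header %zu + 120 slots -> %zu bytes\n",
           sizeof(struct acache), sizeof(struct acache) + 120 * sizeof(void *));
    free(ac);
    return 0;
}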
2040static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2008static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2041{ 2009{
2042 if (slab_state >= FULL) 2010 if (slab_state >= FULL)
2043 return enable_cpucache(cachep, gfp); 2011 return enable_cpucache(cachep, gfp);
2044 2012
2013 cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
2014 if (!cachep->cpu_cache)
2015 return 1;
2016
2045 if (slab_state == DOWN) { 2017 if (slab_state == DOWN) {
2046 /* 2018 /* Creation of first cache (kmem_cache). */
2047 * Note: Creation of first cache (kmem_cache). 2019 set_up_node(kmem_cache, CACHE_CACHE);
2048 * The setup_node is taken care
2049 * of by the caller of __kmem_cache_create
2050 */
2051 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2052 slab_state = PARTIAL;
2053 } else if (slab_state == PARTIAL) { 2020 } else if (slab_state == PARTIAL) {
2054 /* 2021 /* For kmem_cache_node */
2055 * Note: the second kmem_cache_create must create the cache 2022 set_up_node(cachep, SIZE_NODE);
2056 * that's used by kmalloc(24), otherwise the creation of
2057 * further caches will BUG().
2058 */
2059 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2060
2061 /*
2062 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
2063 * the second cache, then we need to set up all its node/,
2064 * otherwise the creation of further caches will BUG().
2065 */
2066 set_up_node(cachep, SIZE_AC);
2067 if (INDEX_AC == INDEX_NODE)
2068 slab_state = PARTIAL_NODE;
2069 else
2070 slab_state = PARTIAL_ARRAYCACHE;
2071 } else { 2023 } else {
2072 /* Remaining boot caches */ 2024 int node;
2073 cachep->array[smp_processor_id()] =
2074 kmalloc(sizeof(struct arraycache_init), gfp);
2075 2025
2076 if (slab_state == PARTIAL_ARRAYCACHE) { 2026 for_each_online_node(node) {
2077 set_up_node(cachep, SIZE_NODE); 2027 cachep->node[node] = kmalloc_node(
2078 slab_state = PARTIAL_NODE; 2028 sizeof(struct kmem_cache_node), gfp, node);
2079 } else { 2029 BUG_ON(!cachep->node[node]);
2080 int node; 2030 kmem_cache_node_init(cachep->node[node]);
2081 for_each_online_node(node) {
2082 cachep->node[node] =
2083 kmalloc_node(sizeof(struct kmem_cache_node),
2084 gfp, node);
2085 BUG_ON(!cachep->node[node]);
2086 kmem_cache_node_init(cachep->node[node]);
2087 }
2088 } 2031 }
2089 } 2032 }
2033
2090 cachep->node[numa_mem_id()]->next_reap = 2034 cachep->node[numa_mem_id()]->next_reap =
2091 jiffies + REAPTIMEOUT_NODE + 2035 jiffies + REAPTIMEOUT_NODE +
2092 ((unsigned long)cachep) % REAPTIMEOUT_NODE; 2036 ((unsigned long)cachep) % REAPTIMEOUT_NODE;
@@ -2100,6 +2044,32 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2100 return 0; 2044 return 0;
2101} 2045}
2102 2046
2047unsigned long kmem_cache_flags(unsigned long object_size,
2048 unsigned long flags, const char *name,
2049 void (*ctor)(void *))
2050{
2051 return flags;
2052}
2053
2054struct kmem_cache *
2055__kmem_cache_alias(const char *name, size_t size, size_t align,
2056 unsigned long flags, void (*ctor)(void *))
2057{
2058 struct kmem_cache *cachep;
2059
2060 cachep = find_mergeable(size, align, flags, name, ctor);
2061 if (cachep) {
2062 cachep->refcount++;
2063
2064 /*
2065 * Adjust the object sizes so that we clear
2066 * the complete object on kzalloc.
2067 */
2068 cachep->object_size = max_t(int, cachep->object_size, size);
2069 }
2070 return cachep;
2071}
2072
2103/** 2073/**
2104 * __kmem_cache_create - Create a cache. 2074 * __kmem_cache_create - Create a cache.
2105 * @cachep: cache management descriptor 2075 * @cachep: cache management descriptor
@@ -2124,7 +2094,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2124int 2094int
2125__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) 2095__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2126{ 2096{
2127 size_t left_over, freelist_size, ralign; 2097 size_t left_over, freelist_size;
2098 size_t ralign = BYTES_PER_WORD;
2128 gfp_t gfp; 2099 gfp_t gfp;
2129 int err; 2100 int err;
2130 size_t size = cachep->size; 2101 size_t size = cachep->size;
@@ -2157,14 +2128,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2157 size &= ~(BYTES_PER_WORD - 1); 2128 size &= ~(BYTES_PER_WORD - 1);
2158 } 2129 }
2159 2130
2160 /*
2161 * Redzoning and user store require word alignment or possibly larger.
2162 * Note this will be overridden by architecture or caller mandated
2163 * alignment if either is greater than BYTES_PER_WORD.
2164 */
2165 if (flags & SLAB_STORE_USER)
2166 ralign = BYTES_PER_WORD;
2167
2168 if (flags & SLAB_RED_ZONE) { 2131 if (flags & SLAB_RED_ZONE) {
2169 ralign = REDZONE_ALIGN; 2132 ralign = REDZONE_ALIGN;
2170 /* If redzoning, ensure that the second redzone is suitably 2133 /* If redzoning, ensure that the second redzone is suitably
@@ -2190,7 +2153,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2190 else 2153 else
2191 gfp = GFP_NOWAIT; 2154 gfp = GFP_NOWAIT;
2192 2155
2193 setup_node_pointer(cachep);
2194#if DEBUG 2156#if DEBUG
2195 2157
2196 /* 2158 /*
@@ -2447,8 +2409,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
2447 if (rc) 2409 if (rc)
2448 return rc; 2410 return rc;
2449 2411
2450 for_each_online_cpu(i) 2412 free_percpu(cachep->cpu_cache);
2451 kfree(cachep->array[i]);
2452 2413
2453 /* NUMA: free the node structures */ 2414 /* NUMA: free the node structures */
2454 for_each_kmem_cache_node(cachep, i, n) { 2415 for_each_kmem_cache_node(cachep, i, n) {
@@ -2994,7 +2955,7 @@ out:
2994 2955
2995#ifdef CONFIG_NUMA 2956#ifdef CONFIG_NUMA
2996/* 2957/*
2997 * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set. 2958 * Try allocating on another node if PFA_SPREAD_SLAB or a mempolicy is set.
2998 * 2959 *
2999 * If we are in_interrupt, then process context, including cpusets and 2960 * If we are in_interrupt, then process context, including cpusets and
3000 * mempolicy, may not apply and should not be used for allocation policy. 2961 * mempolicy, may not apply and should not be used for allocation policy.
@@ -3226,7 +3187,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3226{ 3187{
3227 void *objp; 3188 void *objp;
3228 3189
3229 if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { 3190 if (current->mempolicy || cpuset_do_slab_mem_spread()) {
3230 objp = alternate_node_alloc(cache, flags); 3191 objp = alternate_node_alloc(cache, flags);
3231 if (objp) 3192 if (objp)
3232 goto out; 3193 goto out;
@@ -3406,7 +3367,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3406 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) 3367 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3407 return; 3368 return;
3408 3369
3409 if (likely(ac->avail < ac->limit)) { 3370 if (ac->avail < ac->limit) {
3410 STATS_INC_FREEHIT(cachep); 3371 STATS_INC_FREEHIT(cachep);
3411 } else { 3372 } else {
3412 STATS_INC_FREEMISS(cachep); 3373 STATS_INC_FREEMISS(cachep);
@@ -3503,7 +3464,6 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
3503 return kmem_cache_alloc_node_trace(cachep, flags, node, size); 3464 return kmem_cache_alloc_node_trace(cachep, flags, node, size);
3504} 3465}
3505 3466
3506#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3507void *__kmalloc_node(size_t size, gfp_t flags, int node) 3467void *__kmalloc_node(size_t size, gfp_t flags, int node)
3508{ 3468{
3509 return __do_kmalloc_node(size, flags, node, _RET_IP_); 3469 return __do_kmalloc_node(size, flags, node, _RET_IP_);
@@ -3516,13 +3476,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3516 return __do_kmalloc_node(size, flags, node, caller); 3476 return __do_kmalloc_node(size, flags, node, caller);
3517} 3477}
3518EXPORT_SYMBOL(__kmalloc_node_track_caller); 3478EXPORT_SYMBOL(__kmalloc_node_track_caller);
3519#else
3520void *__kmalloc_node(size_t size, gfp_t flags, int node)
3521{
3522 return __do_kmalloc_node(size, flags, node, 0);
3523}
3524EXPORT_SYMBOL(__kmalloc_node);
3525#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3526#endif /* CONFIG_NUMA */ 3479#endif /* CONFIG_NUMA */
3527 3480
3528/** 3481/**
@@ -3548,8 +3501,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3548 return ret; 3501 return ret;
3549} 3502}
3550 3503
3551
3552#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3553void *__kmalloc(size_t size, gfp_t flags) 3504void *__kmalloc(size_t size, gfp_t flags)
3554{ 3505{
3555 return __do_kmalloc(size, flags, _RET_IP_); 3506 return __do_kmalloc(size, flags, _RET_IP_);
@@ -3562,14 +3513,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3562} 3513}
3563EXPORT_SYMBOL(__kmalloc_track_caller); 3514EXPORT_SYMBOL(__kmalloc_track_caller);
3564 3515
3565#else
3566void *__kmalloc(size_t size, gfp_t flags)
3567{
3568 return __do_kmalloc(size, flags, 0);
3569}
3570EXPORT_SYMBOL(__kmalloc);
3571#endif
3572
3573/** 3516/**
3574 * kmem_cache_free - Deallocate an object 3517 * kmem_cache_free - Deallocate an object
3575 * @cachep: The cache the allocation was from. 3518 * @cachep: The cache the allocation was from.
@@ -3714,72 +3657,45 @@ fail:
3714 return -ENOMEM; 3657 return -ENOMEM;
3715} 3658}
3716 3659
3717struct ccupdate_struct {
3718 struct kmem_cache *cachep;
3719 struct array_cache *new[0];
3720};
3721
3722static void do_ccupdate_local(void *info)
3723{
3724 struct ccupdate_struct *new = info;
3725 struct array_cache *old;
3726
3727 check_irq_off();
3728 old = cpu_cache_get(new->cachep);
3729
3730 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3731 new->new[smp_processor_id()] = old;
3732}
3733
3734/* Always called with the slab_mutex held */ 3660/* Always called with the slab_mutex held */
3735static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, 3661static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
3736 int batchcount, int shared, gfp_t gfp) 3662 int batchcount, int shared, gfp_t gfp)
3737{ 3663{
3738 struct ccupdate_struct *new; 3664 struct array_cache __percpu *cpu_cache, *prev;
3739 int i; 3665 int cpu;
3740 3666
3741 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), 3667 cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);
3742 gfp); 3668 if (!cpu_cache)
3743 if (!new)
3744 return -ENOMEM; 3669 return -ENOMEM;
3745 3670
3746 for_each_online_cpu(i) { 3671 prev = cachep->cpu_cache;
3747 new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, 3672 cachep->cpu_cache = cpu_cache;
3748 batchcount, gfp); 3673 kick_all_cpus_sync();
3749 if (!new->new[i]) {
3750 for (i--; i >= 0; i--)
3751 kfree(new->new[i]);
3752 kfree(new);
3753 return -ENOMEM;
3754 }
3755 }
3756 new->cachep = cachep;
3757
3758 on_each_cpu(do_ccupdate_local, (void *)new, 1);
3759 3674
3760 check_irq_on(); 3675 check_irq_on();
3761 cachep->batchcount = batchcount; 3676 cachep->batchcount = batchcount;
3762 cachep->limit = limit; 3677 cachep->limit = limit;
3763 cachep->shared = shared; 3678 cachep->shared = shared;
3764 3679
3765 for_each_online_cpu(i) { 3680 if (!prev)
3681 goto alloc_node;
3682
3683 for_each_online_cpu(cpu) {
3766 LIST_HEAD(list); 3684 LIST_HEAD(list);
3767 struct array_cache *ccold = new->new[i];
3768 int node; 3685 int node;
3769 struct kmem_cache_node *n; 3686 struct kmem_cache_node *n;
3687 struct array_cache *ac = per_cpu_ptr(prev, cpu);
3770 3688
3771 if (!ccold) 3689 node = cpu_to_mem(cpu);
3772 continue;
3773
3774 node = cpu_to_mem(i);
3775 n = get_node(cachep, node); 3690 n = get_node(cachep, node);
3776 spin_lock_irq(&n->list_lock); 3691 spin_lock_irq(&n->list_lock);
3777 free_block(cachep, ccold->entry, ccold->avail, node, &list); 3692 free_block(cachep, ac->entry, ac->avail, node, &list);
3778 spin_unlock_irq(&n->list_lock); 3693 spin_unlock_irq(&n->list_lock);
3779 slabs_destroy(cachep, &list); 3694 slabs_destroy(cachep, &list);
3780 kfree(ccold);
3781 } 3695 }
3782 kfree(new); 3696 free_percpu(prev);
3697
3698alloc_node:
3783 return alloc_kmem_cache_node(cachep, gfp); 3699 return alloc_kmem_cache_node(cachep, gfp);
3784} 3700}
3785 3701
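Retuning no longer IPIs a do_ccupdate_local() swap onto every CPU: it publishes the new percpu arrays, uses kick_all_cpus_sync() to wait until no CPU can still be using a pointer fetched from the old ones, and only then drains and frees the old copies. The shape of that publish-then-drain pattern as a hedged sketch (locking, node-list flushing and error handling trimmed; not the file's code):

/* sketch of the publish-then-drain replacement used above */
static int replace_cpu_caches(struct kmem_cache *cachep,
                              struct array_cache __percpu *newc)
{
    struct array_cache __percpu *prev = cachep->cpu_cache;
    int cpu;

    cachep->cpu_cache = newc;      /* new allocations/frees see the new arrays */
    kick_all_cpus_sync();          /* wait out anyone still using @prev */

    for_each_online_cpu(cpu) {
        struct array_cache *ac = per_cpu_ptr(prev, cpu);
        /* flush ac->entry[0..avail) back to the node lists here */
        ac->avail = 0;
    }
    free_percpu(prev);
    return 0;
}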
@@ -4262,19 +4178,15 @@ static const struct seq_operations slabstats_op = {
4262 4178
4263static int slabstats_open(struct inode *inode, struct file *file) 4179static int slabstats_open(struct inode *inode, struct file *file)
4264{ 4180{
4265 unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); 4181 unsigned long *n;
4266 int ret = -ENOMEM; 4182
4267 if (n) { 4183 n = __seq_open_private(file, &slabstats_op, PAGE_SIZE);
4268 ret = seq_open(file, &slabstats_op); 4184 if (!n)
4269 if (!ret) { 4185 return -ENOMEM;
4270 struct seq_file *m = file->private_data; 4186
4271 *n = PAGE_SIZE / (2 * sizeof(unsigned long)); 4187 *n = PAGE_SIZE / (2 * sizeof(unsigned long));
4272 m->private = n; 4188
4273 n = NULL; 4189 return 0;
4274 }
4275 kfree(n);
4276 }
4277 return ret;
4278} 4190}
4279 4191
4280static const struct file_operations proc_slabstats_operations = { 4192static const struct file_operations proc_slabstats_operations = {
diff --git a/mm/slab.h b/mm/slab.h
index 0e0fdd365840..ab019e63e3c2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -4,6 +4,41 @@
4 * Internal slab definitions 4 * Internal slab definitions
5 */ 5 */
6 6
7#ifdef CONFIG_SLOB
8/*
9 * Common fields provided in kmem_cache by all slab allocators
10 * This struct is either used directly by the allocator (SLOB)
11 * or the allocator must include definitions for all fields
12 * provided in kmem_cache_common in their definition of kmem_cache.
13 *
14 * Once we can do anonymous structs (C11 standard) we could put an
15 * anonymous struct definition in these allocators so that the
16 * separate allocations in the kmem_cache structure of SLAB and
17 * SLUB are no longer needed.
18 */
19struct kmem_cache {
20 unsigned int object_size;/* The original size of the object */
21 unsigned int size; /* The aligned/padded/added on size */
22 unsigned int align; /* Alignment as calculated */
23 unsigned long flags; /* Active flags on the slab */
24 const char *name; /* Slab name for sysfs */
25 int refcount; /* Use counter */
26 void (*ctor)(void *); /* Called on object slot creation */
27 struct list_head list; /* List of all slab caches on the system */
28};
29
30#endif /* CONFIG_SLOB */
31
32#ifdef CONFIG_SLAB
33#include <linux/slab_def.h>
34#endif
35
36#ifdef CONFIG_SLUB
37#include <linux/slub_def.h>
38#endif
39
40#include <linux/memcontrol.h>
41
7/* 42/*
8 * State of the slab allocator. 43 * State of the slab allocator.
9 * 44 *
@@ -15,7 +50,6 @@
15enum slab_state { 50enum slab_state {
16 DOWN, /* No slab functionality yet */ 51 DOWN, /* No slab functionality yet */
17 PARTIAL, /* SLUB: kmem_cache_node available */ 52 PARTIAL, /* SLUB: kmem_cache_node available */
18 PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
19 PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ 53 PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */
20 UP, /* Slab caches usable but not all extras yet */ 54 UP, /* Slab caches usable but not all extras yet */
21 FULL /* Everything is working */ 55 FULL /* Everything is working */
@@ -53,15 +87,30 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
53 size_t size, unsigned long flags); 87 size_t size, unsigned long flags);
54 88
55struct mem_cgroup; 89struct mem_cgroup;
56#ifdef CONFIG_SLUB 90
91int slab_unmergeable(struct kmem_cache *s);
92struct kmem_cache *find_mergeable(size_t size, size_t align,
93 unsigned long flags, const char *name, void (*ctor)(void *));
94#ifndef CONFIG_SLOB
57struct kmem_cache * 95struct kmem_cache *
58__kmem_cache_alias(const char *name, size_t size, size_t align, 96__kmem_cache_alias(const char *name, size_t size, size_t align,
59 unsigned long flags, void (*ctor)(void *)); 97 unsigned long flags, void (*ctor)(void *));
98
99unsigned long kmem_cache_flags(unsigned long object_size,
100 unsigned long flags, const char *name,
101 void (*ctor)(void *));
60#else 102#else
61static inline struct kmem_cache * 103static inline struct kmem_cache *
62__kmem_cache_alias(const char *name, size_t size, size_t align, 104__kmem_cache_alias(const char *name, size_t size, size_t align,
63 unsigned long flags, void (*ctor)(void *)) 105 unsigned long flags, void (*ctor)(void *))
64{ return NULL; } 106{ return NULL; }
107
108static inline unsigned long kmem_cache_flags(unsigned long object_size,
109 unsigned long flags, const char *name,
110 void (*ctor)(void *))
111{
112 return flags;
113}
65#endif 114#endif
66 115
67 116
@@ -303,8 +352,8 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
303 * a kmem_cache_node structure allocated (which is true for all online nodes) 352 * a kmem_cache_node structure allocated (which is true for all online nodes)
304 */ 353 */
305#define for_each_kmem_cache_node(__s, __node, __n) \ 354#define for_each_kmem_cache_node(__s, __node, __n) \
306 for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \ 355 for (__node = 0; __node < nr_node_ids; __node++) \
307 if (__n) 356 if ((__n = get_node(__s, __node)))
308 357
309#endif 358#endif
310 359
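The for_each_kmem_cache_node() fix above moves get_node() out of the loop condition, so it is only evaluated for node ids below nr_node_ids and the body only runs for nodes that actually have a kmem_cache_node. A small illustrative walker, assuming SLUB's kmem_cache_node with its nr_partial field (the helper name is ours):

static unsigned long count_partial_slabs(struct kmem_cache *s)
{
        int node;
        struct kmem_cache_node *n;
        unsigned long total = 0;

        for_each_kmem_cache_node(s, node, n)
                total += n->nr_partial;        /* n is guaranteed non-NULL here */

        return total;
}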
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d319502b2403..3a6e0cfdf03a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -30,6 +30,43 @@ LIST_HEAD(slab_caches);
30DEFINE_MUTEX(slab_mutex); 30DEFINE_MUTEX(slab_mutex);
31struct kmem_cache *kmem_cache; 31struct kmem_cache *kmem_cache;
32 32
33/*
34 * Set of flags that will prevent slab merging
35 */
36#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
37 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
38 SLAB_FAILSLAB)
39
40#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
41 SLAB_CACHE_DMA | SLAB_NOTRACK)
42
43/*
44 * Merge control. If this is set then no merging of slab caches will occur.
45 * (Could be removed. This was introduced to pacify the merge skeptics.)
46 */
47static int slab_nomerge;
48
49static int __init setup_slab_nomerge(char *str)
50{
51 slab_nomerge = 1;
52 return 1;
53}
54
55#ifdef CONFIG_SLUB
56__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
57#endif
58
59__setup("slab_nomerge", setup_slab_nomerge);
60
61/*
62 * Determine the size of a slab object
63 */
64unsigned int kmem_cache_size(struct kmem_cache *s)
65{
66 return s->object_size;
67}
68EXPORT_SYMBOL(kmem_cache_size);
69
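With the merge-control code hoisted into slab_common.c, booting with slab_nomerge (or the preserved slub_nomerge alias when SLUB is the allocator) disables cache merging regardless of which allocator is built in, and kmem_cache_size() reports a cache's original object size. A hypothetical caller, with an invented cache name and size:

static struct kmem_cache *example_cachep;

static int __init example_cache_init(void)
{
        /* With slab_nomerge on the command line this cache is never merged
         * into a same-sized kmalloc cache.
         */
        example_cachep = kmem_cache_create("example_cache", 96, 0, 0, NULL);
        if (!example_cachep)
                return -ENOMEM;

        pr_info("example_cache object size: %u\n",
                kmem_cache_size(example_cachep));
        return 0;
}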
33#ifdef CONFIG_DEBUG_VM 70#ifdef CONFIG_DEBUG_VM
34static int kmem_cache_sanity_check(const char *name, size_t size) 71static int kmem_cache_sanity_check(const char *name, size_t size)
35{ 72{
@@ -79,6 +116,65 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
79#endif 116#endif
80 117
81#ifdef CONFIG_MEMCG_KMEM 118#ifdef CONFIG_MEMCG_KMEM
119static int memcg_alloc_cache_params(struct mem_cgroup *memcg,
120 struct kmem_cache *s, struct kmem_cache *root_cache)
121{
122 size_t size;
123
124 if (!memcg_kmem_enabled())
125 return 0;
126
127 if (!memcg) {
128 size = offsetof(struct memcg_cache_params, memcg_caches);
129 size += memcg_limited_groups_array_size * sizeof(void *);
130 } else
131 size = sizeof(struct memcg_cache_params);
132
133 s->memcg_params = kzalloc(size, GFP_KERNEL);
134 if (!s->memcg_params)
135 return -ENOMEM;
136
137 if (memcg) {
138 s->memcg_params->memcg = memcg;
139 s->memcg_params->root_cache = root_cache;
140 } else
141 s->memcg_params->is_root_cache = true;
142
143 return 0;
144}
145
146static void memcg_free_cache_params(struct kmem_cache *s)
147{
148 kfree(s->memcg_params);
149}
150
151static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs)
152{
153 int size;
154 struct memcg_cache_params *new_params, *cur_params;
155
156 BUG_ON(!is_root_cache(s));
157
158 size = offsetof(struct memcg_cache_params, memcg_caches);
159 size += num_memcgs * sizeof(void *);
160
161 new_params = kzalloc(size, GFP_KERNEL);
162 if (!new_params)
163 return -ENOMEM;
164
165 cur_params = s->memcg_params;
166 memcpy(new_params->memcg_caches, cur_params->memcg_caches,
167 memcg_limited_groups_array_size * sizeof(void *));
168
169 new_params->is_root_cache = true;
170
171 rcu_assign_pointer(s->memcg_params, new_params);
172 if (cur_params)
173 kfree_rcu(cur_params, rcu_head);
174
175 return 0;
176}
177
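memcg_update_cache_params() above resizes the root cache's memcg_caches array with the usual RCU publish-and-reclaim sequence: allocate the larger copy, fill it, publish it with rcu_assign_pointer(), and hand the old array to kfree_rcu() so readers that still hold it stay safe until a grace period passes. A generic sketch of that pattern, with hypothetical types and an assumed external lock serializing writers:

struct table {
        struct rcu_head rcu;
        int nr;
        void *slots[];                  /* flexible array, like memcg_caches[] */
};

static int grow_table(struct table __rcu **tablep, int new_nr)
{
        struct table *old, *new;

        new = kzalloc(sizeof(*new) + new_nr * sizeof(void *), GFP_KERNEL);
        if (!new)
                return -ENOMEM;

        old = rcu_dereference_protected(*tablep, 1);   /* writers are serialized */
        if (old)
                memcpy(new->slots, old->slots, old->nr * sizeof(void *));
        new->nr = new_nr;

        rcu_assign_pointer(*tablep, new);              /* publish the new copy */
        if (old)
                kfree_rcu(old, rcu);                   /* free after a grace period */
        return 0;
}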
82int memcg_update_all_caches(int num_memcgs) 178int memcg_update_all_caches(int num_memcgs)
83{ 179{
84 struct kmem_cache *s; 180 struct kmem_cache *s;
@@ -89,9 +185,8 @@ int memcg_update_all_caches(int num_memcgs)
89 if (!is_root_cache(s)) 185 if (!is_root_cache(s))
90 continue; 186 continue;
91 187
92 ret = memcg_update_cache_size(s, num_memcgs); 188 ret = memcg_update_cache_params(s, num_memcgs);
93 /* 189 /*
94 * See comment in memcontrol.c, memcg_update_cache_size:
95 * Instead of freeing the memory, we'll just leave the caches 190 * Instead of freeing the memory, we'll just leave the caches
96 * up to this point in an updated state. 191 * up to this point in an updated state.
97 */ 192 */
@@ -104,7 +199,80 @@ out:
104 mutex_unlock(&slab_mutex); 199 mutex_unlock(&slab_mutex);
105 return ret; 200 return ret;
106} 201}
107#endif 202#else
203static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
204 struct kmem_cache *s, struct kmem_cache *root_cache)
205{
206 return 0;
207}
208
209static inline void memcg_free_cache_params(struct kmem_cache *s)
210{
211}
212#endif /* CONFIG_MEMCG_KMEM */
213
214/*
215 * Find a mergeable slab cache
216 */
217int slab_unmergeable(struct kmem_cache *s)
218{
219 if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
220 return 1;
221
222 if (!is_root_cache(s))
223 return 1;
224
225 if (s->ctor)
226 return 1;
227
228 /*
229 * We may have set a slab to be unmergeable during bootstrap.
230 */
231 if (s->refcount < 0)
232 return 1;
233
234 return 0;
235}
236
237struct kmem_cache *find_mergeable(size_t size, size_t align,
238 unsigned long flags, const char *name, void (*ctor)(void *))
239{
240 struct kmem_cache *s;
241
242 if (slab_nomerge || (flags & SLAB_NEVER_MERGE))
243 return NULL;
244
245 if (ctor)
246 return NULL;
247
248 size = ALIGN(size, sizeof(void *));
249 align = calculate_alignment(flags, align, size);
250 size = ALIGN(size, align);
251 flags = kmem_cache_flags(size, flags, name, NULL);
252
253 list_for_each_entry(s, &slab_caches, list) {
254 if (slab_unmergeable(s))
255 continue;
256
257 if (size > s->size)
258 continue;
259
260 if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
261 continue;
262 /*
263 * Check if alignment is compatible.
264 * Courtesy of Adrian Drzewiecki
265 */
266 if ((s->size & ~(align - 1)) != s->size)
267 continue;
268
269 if (s->size - size >= sizeof(void *))
270 continue;
271
272 return s;
273 }
274 return NULL;
275}
108 276
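find_mergeable() is what lets a later kmem_cache_create() reuse an existing compatible cache: the requested size is rounded, alignment recomputed, debug flags folded in through kmem_cache_flags(), and a candidate only matches if it is at least as large, wastes less than one pointer of slack, agrees on the SLAB_MERGE_SAME bits, and is itself mergeable. A hedged sketch of how an allocator's __kmem_cache_alias() can sit on top of it (the real SLUB/SLAB versions also adjust object sizes and sysfs aliases):

struct kmem_cache *
example_cache_alias(const char *name, size_t size, size_t align,
                    unsigned long flags, void (*ctor)(void *))
{
        struct kmem_cache *s;

        s = find_mergeable(size, align, flags, name, ctor);
        if (s)
                s->refcount++;          /* keep the shared cache pinned */

        return s;
}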
109/* 277/*
110 * Figure out what the alignment of the objects will be given a set of 278 * Figure out what the alignment of the objects will be given a set of
@@ -211,8 +379,10 @@ kmem_cache_create(const char *name, size_t size, size_t align,
211 mutex_lock(&slab_mutex); 379 mutex_lock(&slab_mutex);
212 380
213 err = kmem_cache_sanity_check(name, size); 381 err = kmem_cache_sanity_check(name, size);
214 if (err) 382 if (err) {
383 s = NULL; /* suppress uninit var warning */
215 goto out_unlock; 384 goto out_unlock;
385 }
216 386
217 /* 387 /*
218 * Some allocators will constraint the set of valid flags to a subset 388 * Some allocators will constraint the set of valid flags to a subset
diff --git a/mm/slob.c b/mm/slob.c
index 21980e0f39a8..96a86206a26b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -468,7 +468,6 @@ void *__kmalloc(size_t size, gfp_t gfp)
468} 468}
469EXPORT_SYMBOL(__kmalloc); 469EXPORT_SYMBOL(__kmalloc);
470 470
471#ifdef CONFIG_TRACING
472void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) 471void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
473{ 472{
474 return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); 473 return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
@@ -481,7 +480,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
481 return __do_kmalloc_node(size, gfp, node, caller); 480 return __do_kmalloc_node(size, gfp, node, caller);
482} 481}
483#endif 482#endif
484#endif
485 483
486void kfree(const void *block) 484void kfree(const void *block)
487{ 485{
diff --git a/mm/slub.c b/mm/slub.c
index 3e8afcc07a76..ae7b9f1ad394 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -169,16 +169,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
169 */ 169 */
170#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 170#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
171 171
172/*
173 * Set of flags that will prevent slab merging
174 */
175#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
176 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
177 SLAB_FAILSLAB)
178
179#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
180 SLAB_CACHE_DMA | SLAB_NOTRACK)
181
182#define OO_SHIFT 16 172#define OO_SHIFT 16
183#define OO_MASK ((1 << OO_SHIFT) - 1) 173#define OO_MASK ((1 << OO_SHIFT) - 1)
184#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ 174#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
@@ -1176,7 +1166,7 @@ out:
1176 1166
1177__setup("slub_debug", setup_slub_debug); 1167__setup("slub_debug", setup_slub_debug);
1178 1168
1179static unsigned long kmem_cache_flags(unsigned long object_size, 1169unsigned long kmem_cache_flags(unsigned long object_size,
1180 unsigned long flags, const char *name, 1170 unsigned long flags, const char *name,
1181 void (*ctor)(void *)) 1171 void (*ctor)(void *))
1182{ 1172{
@@ -1208,7 +1198,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1208 struct page *page) {} 1198 struct page *page) {}
1209static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, 1199static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1210 struct page *page) {} 1200 struct page *page) {}
1211static inline unsigned long kmem_cache_flags(unsigned long object_size, 1201unsigned long kmem_cache_flags(unsigned long object_size,
1212 unsigned long flags, const char *name, 1202 unsigned long flags, const char *name,
1213 void (*ctor)(void *)) 1203 void (*ctor)(void *))
1214{ 1204{
@@ -1699,7 +1689,12 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
1699 struct kmem_cache_cpu *c) 1689 struct kmem_cache_cpu *c)
1700{ 1690{
1701 void *object; 1691 void *object;
1702 int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node; 1692 int searchnode = node;
1693
1694 if (node == NUMA_NO_NODE)
1695 searchnode = numa_mem_id();
1696 else if (!node_present_pages(node))
1697 searchnode = node_to_mem_node(node);
1703 1698
1704 object = get_partial_node(s, get_node(s, searchnode), c, flags); 1699 object = get_partial_node(s, get_node(s, searchnode), c, flags);
1705 if (object || node != NUMA_NO_NODE) 1700 if (object || node != NUMA_NO_NODE)
@@ -2280,11 +2275,18 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2280redo: 2275redo:
2281 2276
2282 if (unlikely(!node_match(page, node))) { 2277 if (unlikely(!node_match(page, node))) {
2283 stat(s, ALLOC_NODE_MISMATCH); 2278 int searchnode = node;
2284 deactivate_slab(s, page, c->freelist); 2279
2285 c->page = NULL; 2280 if (node != NUMA_NO_NODE && !node_present_pages(node))
2286 c->freelist = NULL; 2281 searchnode = node_to_mem_node(node);
2287 goto new_slab; 2282
2283 if (unlikely(!node_match(page, searchnode))) {
2284 stat(s, ALLOC_NODE_MISMATCH);
2285 deactivate_slab(s, page, c->freelist);
2286 c->page = NULL;
2287 c->freelist = NULL;
2288 goto new_slab;
2289 }
2288 } 2290 }
2289 2291
2290 /* 2292 /*
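Both slub.c hunks above handle memoryless NUMA nodes: when the requested node has no present pages, the search is redirected to its fallback node with memory via node_to_mem_node() instead of repeatedly deactivating the cpu slab. A minimal sketch of the shared fallback, assuming a NUMA build (the helper name is ours):

static int pick_search_node(int node)
{
        if (node == NUMA_NO_NODE)
                return numa_mem_id();            /* nearest node with memory to this cpu */
        if (!node_present_pages(node))
                return node_to_mem_node(node);   /* fallback for a memoryless node */
        return node;
}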
@@ -2707,12 +2709,6 @@ static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
2707static int slub_min_objects; 2709static int slub_min_objects;
2708 2710
2709/* 2711/*
2710 * Merge control. If this is set then no merging of slab caches will occur.
2711 * (Could be removed. This was introduced to pacify the merge skeptics.)
2712 */
2713static int slub_nomerge;
2714
2715/*
2716 * Calculate the order of allocation given an slab object size. 2712 * Calculate the order of allocation given an slab object size.
2717 * 2713 *
2718 * The order of allocation has significant impact on performance and other 2714 * The order of allocation has significant impact on performance and other
@@ -3240,14 +3236,6 @@ static int __init setup_slub_min_objects(char *str)
3240 3236
3241__setup("slub_min_objects=", setup_slub_min_objects); 3237__setup("slub_min_objects=", setup_slub_min_objects);
3242 3238
3243static int __init setup_slub_nomerge(char *str)
3244{
3245 slub_nomerge = 1;
3246 return 1;
3247}
3248
3249__setup("slub_nomerge", setup_slub_nomerge);
3250
3251void *__kmalloc(size_t size, gfp_t flags) 3239void *__kmalloc(size_t size, gfp_t flags)
3252{ 3240{
3253 struct kmem_cache *s; 3241 struct kmem_cache *s;
@@ -3625,69 +3613,6 @@ void __init kmem_cache_init_late(void)
3625{ 3613{
3626} 3614}
3627 3615
3628/*
3629 * Find a mergeable slab cache
3630 */
3631static int slab_unmergeable(struct kmem_cache *s)
3632{
3633 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
3634 return 1;
3635
3636 if (!is_root_cache(s))
3637 return 1;
3638
3639 if (s->ctor)
3640 return 1;
3641
3642 /*
3643 * We may have set a slab to be unmergeable during bootstrap.
3644 */
3645 if (s->refcount < 0)
3646 return 1;
3647
3648 return 0;
3649}
3650
3651static struct kmem_cache *find_mergeable(size_t size, size_t align,
3652 unsigned long flags, const char *name, void (*ctor)(void *))
3653{
3654 struct kmem_cache *s;
3655
3656 if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
3657 return NULL;
3658
3659 if (ctor)
3660 return NULL;
3661
3662 size = ALIGN(size, sizeof(void *));
3663 align = calculate_alignment(flags, align, size);
3664 size = ALIGN(size, align);
3665 flags = kmem_cache_flags(size, flags, name, NULL);
3666
3667 list_for_each_entry(s, &slab_caches, list) {
3668 if (slab_unmergeable(s))
3669 continue;
3670
3671 if (size > s->size)
3672 continue;
3673
3674 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
3675 continue;
3676 /*
3677 * Check if alignment is compatible.
3678 * Courtesy of Adrian Drzewiecki
3679 */
3680 if ((s->size & ~(align - 1)) != s->size)
3681 continue;
3682
3683 if (s->size - size >= sizeof(void *))
3684 continue;
3685
3686 return s;
3687 }
3688 return NULL;
3689}
3690
3691struct kmem_cache * 3616struct kmem_cache *
3692__kmem_cache_alias(const char *name, size_t size, size_t align, 3617__kmem_cache_alias(const char *name, size_t size, size_t align,
3693 unsigned long flags, void (*ctor)(void *)) 3618 unsigned long flags, void (*ctor)(void *))
@@ -4604,6 +4529,14 @@ static ssize_t trace_show(struct kmem_cache *s, char *buf)
4604static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4529static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4605 size_t length) 4530 size_t length)
4606{ 4531{
4532 /*
4533 * Tracing a merged cache is going to give confusing results
4534 * as well as cause other issues like converting a mergeable
 4535 * cache into an unmergeable one.
4536 */
4537 if (s->refcount > 1)
4538 return -EINVAL;
4539
4607 s->flags &= ~SLAB_TRACE; 4540 s->flags &= ~SLAB_TRACE;
4608 if (buf[0] == '1') { 4541 if (buf[0] == '1') {
4609 s->flags &= ~__CMPXCHG_DOUBLE; 4542 s->flags &= ~__CMPXCHG_DOUBLE;
@@ -4721,6 +4654,9 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4721static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4654static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4722 size_t length) 4655 size_t length)
4723{ 4656{
4657 if (s->refcount > 1)
4658 return -EINVAL;
4659
4724 s->flags &= ~SLAB_FAILSLAB; 4660 s->flags &= ~SLAB_FAILSLAB;
4725 if (buf[0] == '1') 4661 if (buf[0] == '1')
4726 s->flags |= SLAB_FAILSLAB; 4662 s->flags |= SLAB_FAILSLAB;
diff --git a/mm/swap.c b/mm/swap.c
index 6b2dc3897cd5..8a12b33936b4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -887,18 +887,14 @@ void lru_add_drain_all(void)
887 mutex_unlock(&lock); 887 mutex_unlock(&lock);
888} 888}
889 889
890/* 890/**
891 * Batched page_cache_release(). Decrement the reference count on all the 891 * release_pages - batched page_cache_release()
892 * passed pages. If it fell to zero then remove the page from the LRU and 892 * @pages: array of pages to release
893 * free it. 893 * @nr: number of pages
894 * 894 * @cold: whether the pages are cache cold
895 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
896 * for the remainder of the operation.
897 * 895 *
898 * The locking in this function is against shrink_inactive_list(): we recheck 896 * Decrement the reference count on all the pages in @pages. If it
899 * the page count inside the lock to see whether shrink_inactive_list() 897 * fell to zero, remove the page from the LRU and free it.
900 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
901 * will free it.
902 */ 898 */
903void release_pages(struct page **pages, int nr, bool cold) 899void release_pages(struct page **pages, int nr, bool cold)
904{ 900{
@@ -907,6 +903,7 @@ void release_pages(struct page **pages, int nr, bool cold)
907 struct zone *zone = NULL; 903 struct zone *zone = NULL;
908 struct lruvec *lruvec; 904 struct lruvec *lruvec;
909 unsigned long uninitialized_var(flags); 905 unsigned long uninitialized_var(flags);
906 unsigned int uninitialized_var(lock_batch);
910 907
911 for (i = 0; i < nr; i++) { 908 for (i = 0; i < nr; i++) {
912 struct page *page = pages[i]; 909 struct page *page = pages[i];
@@ -920,6 +917,16 @@ void release_pages(struct page **pages, int nr, bool cold)
920 continue; 917 continue;
921 } 918 }
922 919
920 /*
921 * Make sure the IRQ-safe lock-holding time does not get
922 * excessive with a continuous string of pages from the
923 * same zone. The lock is held only if zone != NULL.
924 */
925 if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
926 spin_unlock_irqrestore(&zone->lru_lock, flags);
927 zone = NULL;
928 }
929
923 if (!put_page_testzero(page)) 930 if (!put_page_testzero(page))
924 continue; 931 continue;
925 932
@@ -930,6 +937,7 @@ void release_pages(struct page **pages, int nr, bool cold)
930 if (zone) 937 if (zone)
931 spin_unlock_irqrestore(&zone->lru_lock, 938 spin_unlock_irqrestore(&zone->lru_lock,
932 flags); 939 flags);
940 lock_batch = 0;
933 zone = pagezone; 941 zone = pagezone;
934 spin_lock_irqsave(&zone->lru_lock, flags); 942 spin_lock_irqsave(&zone->lru_lock, flags);
935 } 943 }
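The release_pages() change above caps how long the IRQ-disabled zone->lru_lock is held: after SWAP_CLUSTER_MAX consecutive pages from the same zone the lock is dropped and retaken, and the batch counter restarts whenever the zone changes. A stripped-down sketch of the same batching shape, where struct item and item_lock() are hypothetical placeholders:

static void drain_items(struct item **items, int nr)
{
        spinlock_t *locked = NULL;
        unsigned long flags;
        unsigned int batch = 0;
        int i;

        for (i = 0; i < nr; i++) {
                spinlock_t *lock = item_lock(items[i]);

                /* Don't hold any one lock for too long a run of items. */
                if (locked && ++batch == SWAP_CLUSTER_MAX) {
                        spin_unlock_irqrestore(locked, flags);
                        locked = NULL;
                }
                if (lock != locked) {
                        if (locked)
                                spin_unlock_irqrestore(locked, flags);
                        batch = 0;
                        locked = lock;
                        spin_lock_irqsave(locked, flags);
                }
                /* ... operate on items[i] under 'locked' ... */
        }
        if (locked)
                spin_unlock_irqrestore(locked, flags);
}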
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3e0ec83d000c..154444918685 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -28,7 +28,9 @@
28static const struct address_space_operations swap_aops = { 28static const struct address_space_operations swap_aops = {
29 .writepage = swap_writepage, 29 .writepage = swap_writepage,
30 .set_page_dirty = swap_set_page_dirty, 30 .set_page_dirty = swap_set_page_dirty,
31#ifdef CONFIG_MIGRATION
31 .migratepage = migrate_page, 32 .migratepage = migrate_page,
33#endif
32}; 34};
33 35
34static struct backing_dev_info swap_backing_dev_info = { 36static struct backing_dev_info swap_backing_dev_info = {
@@ -263,18 +265,12 @@ void free_page_and_swap_cache(struct page *page)
263void free_pages_and_swap_cache(struct page **pages, int nr) 265void free_pages_and_swap_cache(struct page **pages, int nr)
264{ 266{
265 struct page **pagep = pages; 267 struct page **pagep = pages;
268 int i;
266 269
267 lru_add_drain(); 270 lru_add_drain();
268 while (nr) { 271 for (i = 0; i < nr; i++)
269 int todo = min(nr, PAGEVEC_SIZE); 272 free_swap_cache(pagep[i]);
270 int i; 273 release_pages(pagep, nr, false);
271
272 for (i = 0; i < todo; i++)
273 free_swap_cache(pagep[i]);
274 release_pages(pagep, todo, false);
275 pagep += todo;
276 nr -= todo;
277 }
278} 274}
279 275
280/* 276/*
diff --git a/mm/util.c b/mm/util.c
index 093c973f1697..fec39d4509a9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -170,32 +170,25 @@ static int vm_is_stack_for_task(struct task_struct *t,
170/* 170/*
171 * Check if the vma is being used as a stack. 171 * Check if the vma is being used as a stack.
172 * If is_group is non-zero, check in the entire thread group or else 172 * If is_group is non-zero, check in the entire thread group or else
173 * just check in the current task. Returns the pid of the task that 173 * just check in the current task. Returns the task_struct of the task
174 * the vma is stack for. 174 * that the vma is stack for. Must be called under rcu_read_lock().
175 */ 175 */
176pid_t vm_is_stack(struct task_struct *task, 176struct task_struct *task_of_stack(struct task_struct *task,
177 struct vm_area_struct *vma, int in_group) 177 struct vm_area_struct *vma, bool in_group)
178{ 178{
179 pid_t ret = 0;
180
181 if (vm_is_stack_for_task(task, vma)) 179 if (vm_is_stack_for_task(task, vma))
182 return task->pid; 180 return task;
183 181
184 if (in_group) { 182 if (in_group) {
185 struct task_struct *t; 183 struct task_struct *t;
186 184
187 rcu_read_lock();
188 for_each_thread(task, t) { 185 for_each_thread(task, t) {
189 if (vm_is_stack_for_task(t, vma)) { 186 if (vm_is_stack_for_task(t, vma))
190 ret = t->pid; 187 return t;
191 goto done;
192 }
193 } 188 }
194done:
195 rcu_read_unlock();
196 } 189 }
197 190
198 return ret; 191 return NULL;
199} 192}
200 193
201#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 194#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
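task_of_stack() now returns a task_struct pointer rather than a pid and no longer takes the RCU read lock itself, so callers must hold rcu_read_lock() across the call and across any use of the returned task. A hedged caller sketch (the helper name is ours):

static pid_t stack_owner_pid(struct task_struct *task,
                             struct vm_area_struct *vma)
{
        struct task_struct *t;
        pid_t pid = 0;

        rcu_read_lock();
        t = task_of_stack(task, vma, true);     /* search the whole thread group */
        if (t)
                pid = t->pid;
        rcu_read_unlock();

        return pid;
}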
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2b0aa5486092..90520af7f186 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2646,21 +2646,11 @@ static const struct seq_operations vmalloc_op = {
2646 2646
2647static int vmalloc_open(struct inode *inode, struct file *file) 2647static int vmalloc_open(struct inode *inode, struct file *file)
2648{ 2648{
2649 unsigned int *ptr = NULL; 2649 if (IS_ENABLED(CONFIG_NUMA))
2650 int ret; 2650 return seq_open_private(file, &vmalloc_op,
2651 2651 nr_node_ids * sizeof(unsigned int));
2652 if (IS_ENABLED(CONFIG_NUMA)) { 2652 else
2653 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2653 return seq_open(file, &vmalloc_op);
2654 if (ptr == NULL)
2655 return -ENOMEM;
2656 }
2657 ret = seq_open(file, &vmalloc_op);
2658 if (!ret) {
2659 struct seq_file *m = file->private_data;
2660 m->private = ptr;
2661 } else
2662 kfree(ptr);
2663 return ret;
2664} 2654}
2665 2655
2666static const struct file_operations proc_vmalloc_operations = { 2656static const struct file_operations proc_vmalloc_operations = {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2836b5373b2e..dcb47074ae03 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -920,7 +920,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
920 /* Case 1 above */ 920 /* Case 1 above */
921 if (current_is_kswapd() && 921 if (current_is_kswapd() &&
922 PageReclaim(page) && 922 PageReclaim(page) &&
923 zone_is_reclaim_writeback(zone)) { 923 test_bit(ZONE_WRITEBACK, &zone->flags)) {
924 nr_immediate++; 924 nr_immediate++;
925 goto keep_locked; 925 goto keep_locked;
926 926
@@ -1002,7 +1002,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1002 */ 1002 */
1003 if (page_is_file_cache(page) && 1003 if (page_is_file_cache(page) &&
1004 (!current_is_kswapd() || 1004 (!current_is_kswapd() ||
1005 !zone_is_reclaim_dirty(zone))) { 1005 !test_bit(ZONE_DIRTY, &zone->flags))) {
1006 /* 1006 /*
1007 * Immediately reclaim when written back. 1007 * Immediately reclaim when written back.
1008 * Similar in principal to deactivate_page() 1008 * Similar in principal to deactivate_page()
@@ -1563,7 +1563,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1563 * are encountered in the nr_immediate check below. 1563 * are encountered in the nr_immediate check below.
1564 */ 1564 */
1565 if (nr_writeback && nr_writeback == nr_taken) 1565 if (nr_writeback && nr_writeback == nr_taken)
1566 zone_set_flag(zone, ZONE_WRITEBACK); 1566 set_bit(ZONE_WRITEBACK, &zone->flags);
1567 1567
1568 /* 1568 /*
1569 * memcg will stall in page writeback so only consider forcibly 1569 * memcg will stall in page writeback so only consider forcibly
@@ -1575,16 +1575,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1575 * backed by a congested BDI and wait_iff_congested will stall. 1575 * backed by a congested BDI and wait_iff_congested will stall.
1576 */ 1576 */
1577 if (nr_dirty && nr_dirty == nr_congested) 1577 if (nr_dirty && nr_dirty == nr_congested)
1578 zone_set_flag(zone, ZONE_CONGESTED); 1578 set_bit(ZONE_CONGESTED, &zone->flags);
1579 1579
1580 /* 1580 /*
1581 * If dirty pages are scanned that are not queued for IO, it 1581 * If dirty pages are scanned that are not queued for IO, it
1582 * implies that flushers are not keeping up. In this case, flag 1582 * implies that flushers are not keeping up. In this case, flag
1583 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing 1583 * the zone ZONE_DIRTY and kswapd will start writing pages from
1584 * pages from reclaim context. 1584 * reclaim context.
1585 */ 1585 */
1586 if (nr_unqueued_dirty == nr_taken) 1586 if (nr_unqueued_dirty == nr_taken)
1587 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); 1587 set_bit(ZONE_DIRTY, &zone->flags);
1588 1588
1589 /* 1589 /*
1590 * If kswapd scans pages marked marked for immediate 1590 * If kswapd scans pages marked marked for immediate
@@ -2315,7 +2315,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2315 return reclaimable; 2315 return reclaimable;
2316} 2316}
2317 2317
2318/* Returns true if compaction should go ahead for a high-order request */ 2318/*
2319 * Returns true if compaction should go ahead for a high-order request, or
2320 * the high-order allocation would succeed without compaction.
2321 */
2319static inline bool compaction_ready(struct zone *zone, int order) 2322static inline bool compaction_ready(struct zone *zone, int order)
2320{ 2323{
2321 unsigned long balance_gap, watermark; 2324 unsigned long balance_gap, watermark;
@@ -2339,8 +2342,11 @@ static inline bool compaction_ready(struct zone *zone, int order)
2339 if (compaction_deferred(zone, order)) 2342 if (compaction_deferred(zone, order))
2340 return watermark_ok; 2343 return watermark_ok;
2341 2344
2342 /* If compaction is not ready to start, keep reclaiming */ 2345 /*
2343 if (!compaction_suitable(zone, order)) 2346 * If compaction is not ready to start and allocation is not likely
2347 * to succeed without it, then keep reclaiming.
2348 */
2349 if (compaction_suitable(zone, order) == COMPACT_SKIPPED)
2344 return false; 2350 return false;
2345 2351
2346 return watermark_ok; 2352 return watermark_ok;
@@ -2753,21 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2753} 2759}
2754 2760
2755unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, 2761unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2762 unsigned long nr_pages,
2756 gfp_t gfp_mask, 2763 gfp_t gfp_mask,
2757 bool noswap) 2764 bool may_swap)
2758{ 2765{
2759 struct zonelist *zonelist; 2766 struct zonelist *zonelist;
2760 unsigned long nr_reclaimed; 2767 unsigned long nr_reclaimed;
2761 int nid; 2768 int nid;
2762 struct scan_control sc = { 2769 struct scan_control sc = {
2763 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2770 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
2764 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2771 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2765 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2772 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2766 .target_mem_cgroup = memcg, 2773 .target_mem_cgroup = memcg,
2767 .priority = DEF_PRIORITY, 2774 .priority = DEF_PRIORITY,
2768 .may_writepage = !laptop_mode, 2775 .may_writepage = !laptop_mode,
2769 .may_unmap = 1, 2776 .may_unmap = 1,
2770 .may_swap = !noswap, 2777 .may_swap = may_swap,
2771 }; 2778 };
2772 2779
2773 /* 2780 /*
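try_to_free_mem_cgroup_pages() gains an explicit nr_pages target (clamped to at least SWAP_CLUSTER_MAX) and a may_swap flag that replaces the old inverted noswap argument. A hypothetical caller asking for up to 64 pages from a group while allowing swap:

static unsigned long shrink_group(struct mem_cgroup *memcg)
{
        /* Reclaim up to 64 pages; anon pages may be swapped out. */
        return try_to_free_mem_cgroup_pages(memcg, 64, GFP_KERNEL, true);
}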
@@ -2818,7 +2825,7 @@ static bool zone_balanced(struct zone *zone, int order,
2818 return false; 2825 return false;
2819 2826
2820 if (IS_ENABLED(CONFIG_COMPACTION) && order && 2827 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2821 !compaction_suitable(zone, order)) 2828 compaction_suitable(zone, order) == COMPACT_SKIPPED)
2822 return false; 2829 return false;
2823 2830
2824 return true; 2831 return true;
@@ -2978,7 +2985,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
2978 /* Account for the number of pages attempted to reclaim */ 2985 /* Account for the number of pages attempted to reclaim */
2979 *nr_attempted += sc->nr_to_reclaim; 2986 *nr_attempted += sc->nr_to_reclaim;
2980 2987
2981 zone_clear_flag(zone, ZONE_WRITEBACK); 2988 clear_bit(ZONE_WRITEBACK, &zone->flags);
2982 2989
2983 /* 2990 /*
2984 * If a zone reaches its high watermark, consider it to be no longer 2991 * If a zone reaches its high watermark, consider it to be no longer
@@ -2988,8 +2995,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
2988 */ 2995 */
2989 if (zone_reclaimable(zone) && 2996 if (zone_reclaimable(zone) &&
2990 zone_balanced(zone, testorder, 0, classzone_idx)) { 2997 zone_balanced(zone, testorder, 0, classzone_idx)) {
2991 zone_clear_flag(zone, ZONE_CONGESTED); 2998 clear_bit(ZONE_CONGESTED, &zone->flags);
2992 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 2999 clear_bit(ZONE_DIRTY, &zone->flags);
2993 } 3000 }
2994 3001
2995 return sc->nr_scanned >= sc->nr_to_reclaim; 3002 return sc->nr_scanned >= sc->nr_to_reclaim;
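The vmscan.c hunks in this file replace the zone_set_flag()/zone_clear_flag()/zone_test_and_set_flag() wrappers with plain set_bit(), clear_bit() and test_and_set_bit() on zone->flags, and rename ZONE_TAIL_LRU_DIRTY to ZONE_DIRTY. The dropped wrappers were thin shims over the same bitops, roughly as sketched below, so the conversion is mechanical:

/* Approximate shape of the removed helpers (illustrative, not verbatim): */
static inline void zone_set_flag(struct zone *zone, unsigned int flag)
{
        set_bit(flag, &zone->flags);
}

static inline void zone_clear_flag(struct zone *zone, unsigned int flag)
{
        clear_bit(flag, &zone->flags);
}

static inline int zone_test_and_set_flag(struct zone *zone, unsigned int flag)
{
        return test_and_set_bit(flag, &zone->flags);
}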
@@ -3080,8 +3087,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3080 * If balanced, clear the dirty and congested 3087 * If balanced, clear the dirty and congested
3081 * flags 3088 * flags
3082 */ 3089 */
3083 zone_clear_flag(zone, ZONE_CONGESTED); 3090 clear_bit(ZONE_CONGESTED, &zone->flags);
3084 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 3091 clear_bit(ZONE_DIRTY, &zone->flags);
3085 } 3092 }
3086 } 3093 }
3087 3094
@@ -3708,11 +3715,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3708 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 3715 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3709 return ZONE_RECLAIM_NOSCAN; 3716 return ZONE_RECLAIM_NOSCAN;
3710 3717
3711 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) 3718 if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
3712 return ZONE_RECLAIM_NOSCAN; 3719 return ZONE_RECLAIM_NOSCAN;
3713 3720
3714 ret = __zone_reclaim(zone, gfp_mask, order); 3721 ret = __zone_reclaim(zone, gfp_mask, order);
3715 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); 3722 clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
3716 3723
3717 if (!ret) 3724 if (!ret)
3718 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); 3725 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
@@ -3791,66 +3798,3 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3791 } 3798 }
3792} 3799}
3793#endif /* CONFIG_SHMEM */ 3800#endif /* CONFIG_SHMEM */
3794
3795static void warn_scan_unevictable_pages(void)
3796{
3797 printk_once(KERN_WARNING
3798 "%s: The scan_unevictable_pages sysctl/node-interface has been "
3799 "disabled for lack of a legitimate use case. If you have "
3800 "one, please send an email to linux-mm@kvack.org.\n",
3801 current->comm);
3802}
3803
3804/*
3805 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
3806 * all nodes' unevictable lists for evictable pages
3807 */
3808unsigned long scan_unevictable_pages;
3809
3810int scan_unevictable_handler(struct ctl_table *table, int write,
3811 void __user *buffer,
3812 size_t *length, loff_t *ppos)
3813{
3814 warn_scan_unevictable_pages();
3815 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3816 scan_unevictable_pages = 0;
3817 return 0;
3818}
3819
3820#ifdef CONFIG_NUMA
3821/*
3822 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
3823 * a specified node's per zone unevictable lists for evictable pages.
3824 */
3825
3826static ssize_t read_scan_unevictable_node(struct device *dev,
3827 struct device_attribute *attr,
3828 char *buf)
3829{
3830 warn_scan_unevictable_pages();
3831 return sprintf(buf, "0\n"); /* always zero; should fit... */
3832}
3833
3834static ssize_t write_scan_unevictable_node(struct device *dev,
3835 struct device_attribute *attr,
3836 const char *buf, size_t count)
3837{
3838 warn_scan_unevictable_pages();
3839 return 1;
3840}
3841
3842
3843static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3844 read_scan_unevictable_node,
3845 write_scan_unevictable_node);
3846
3847int scan_unevictable_register_node(struct node *node)
3848{
3849 return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
3850}
3851
3852void scan_unevictable_unregister_node(struct node *node)
3853{
3854 device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
3855}
3856#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e9ab104b956f..1b12d390dc68 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -7,6 +7,7 @@
7 * zoned VM statistics 7 * zoned VM statistics
8 * Copyright (C) 2006 Silicon Graphics, Inc., 8 * Copyright (C) 2006 Silicon Graphics, Inc.,
9 * Christoph Lameter <christoph@lameter.com> 9 * Christoph Lameter <christoph@lameter.com>
10 * Copyright (C) 2008-2014 Christoph Lameter
10 */ 11 */
11#include <linux/fs.h> 12#include <linux/fs.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
@@ -14,6 +15,7 @@
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/cpu.h> 17#include <linux/cpu.h>
18#include <linux/cpumask.h>
17#include <linux/vmstat.h> 19#include <linux/vmstat.h>
18#include <linux/sched.h> 20#include <linux/sched.h>
19#include <linux/math64.h> 21#include <linux/math64.h>
@@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
419EXPORT_SYMBOL(dec_zone_page_state); 421EXPORT_SYMBOL(dec_zone_page_state);
420#endif 422#endif
421 423
422static inline void fold_diff(int *diff) 424
425/*
426 * Fold a differential into the global counters.
427 * Returns the number of counters updated.
428 */
429static int fold_diff(int *diff)
423{ 430{
424 int i; 431 int i;
432 int changes = 0;
425 433
426 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 434 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
427 if (diff[i]) 435 if (diff[i]) {
428 atomic_long_add(diff[i], &vm_stat[i]); 436 atomic_long_add(diff[i], &vm_stat[i]);
437 changes++;
438 }
439 return changes;
429} 440}
430 441
431/* 442/*
@@ -441,12 +452,15 @@ static inline void fold_diff(int *diff)
441 * statistics in the remote zone struct as well as the global cachelines 452 * statistics in the remote zone struct as well as the global cachelines
442 * with the global counters. These could cause remote node cache line 453 * with the global counters. These could cause remote node cache line
443 * bouncing and will have to be only done when necessary. 454 * bouncing and will have to be only done when necessary.
455 *
456 * The function returns the number of global counters updated.
444 */ 457 */
445static void refresh_cpu_vm_stats(void) 458static int refresh_cpu_vm_stats(void)
446{ 459{
447 struct zone *zone; 460 struct zone *zone;
448 int i; 461 int i;
449 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; 462 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
463 int changes = 0;
450 464
451 for_each_populated_zone(zone) { 465 for_each_populated_zone(zone) {
452 struct per_cpu_pageset __percpu *p = zone->pageset; 466 struct per_cpu_pageset __percpu *p = zone->pageset;
@@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void)
486 continue; 500 continue;
487 } 501 }
488 502
489
490 if (__this_cpu_dec_return(p->expire)) 503 if (__this_cpu_dec_return(p->expire))
491 continue; 504 continue;
492 505
493 if (__this_cpu_read(p->pcp.count)) 506 if (__this_cpu_read(p->pcp.count)) {
494 drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); 507 drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
508 changes++;
509 }
495#endif 510#endif
496 } 511 }
497 fold_diff(global_diff); 512 changes += fold_diff(global_diff);
513 return changes;
498} 514}
499 515
500/* 516/*
@@ -735,7 +751,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
735 TEXT_FOR_HIGHMEM(xx) xx "_movable", 751 TEXT_FOR_HIGHMEM(xx) xx "_movable",
736 752
737const char * const vmstat_text[] = { 753const char * const vmstat_text[] = {
 738 /* Zoned VM counters */ 754 /* enum zone_stat_item counters */
739 "nr_free_pages", 755 "nr_free_pages",
740 "nr_alloc_batch", 756 "nr_alloc_batch",
741 "nr_inactive_anon", 757 "nr_inactive_anon",
@@ -778,10 +794,13 @@ const char * const vmstat_text[] = {
778 "workingset_nodereclaim", 794 "workingset_nodereclaim",
779 "nr_anon_transparent_hugepages", 795 "nr_anon_transparent_hugepages",
780 "nr_free_cma", 796 "nr_free_cma",
797
798 /* enum writeback_stat_item counters */
781 "nr_dirty_threshold", 799 "nr_dirty_threshold",
782 "nr_dirty_background_threshold", 800 "nr_dirty_background_threshold",
783 801
784#ifdef CONFIG_VM_EVENT_COUNTERS 802#ifdef CONFIG_VM_EVENT_COUNTERS
803 /* enum vm_event_item counters */
785 "pgpgin", 804 "pgpgin",
786 "pgpgout", 805 "pgpgout",
787 "pswpin", 806 "pswpin",
@@ -860,6 +879,13 @@ const char * const vmstat_text[] = {
860 "thp_zero_page_alloc", 879 "thp_zero_page_alloc",
861 "thp_zero_page_alloc_failed", 880 "thp_zero_page_alloc_failed",
862#endif 881#endif
882#ifdef CONFIG_MEMORY_BALLOON
883 "balloon_inflate",
884 "balloon_deflate",
885#ifdef CONFIG_BALLOON_COMPACTION
886 "balloon_migrate",
887#endif
888#endif /* CONFIG_MEMORY_BALLOON */
863#ifdef CONFIG_DEBUG_TLBFLUSH 889#ifdef CONFIG_DEBUG_TLBFLUSH
864#ifdef CONFIG_SMP 890#ifdef CONFIG_SMP
865 "nr_tlb_remote_flush", 891 "nr_tlb_remote_flush",
@@ -1229,20 +1255,108 @@ static const struct file_operations proc_vmstat_file_operations = {
1229#ifdef CONFIG_SMP 1255#ifdef CONFIG_SMP
1230static DEFINE_PER_CPU(struct delayed_work, vmstat_work); 1256static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1231int sysctl_stat_interval __read_mostly = HZ; 1257int sysctl_stat_interval __read_mostly = HZ;
1258static cpumask_var_t cpu_stat_off;
1232 1259
1233static void vmstat_update(struct work_struct *w) 1260static void vmstat_update(struct work_struct *w)
1234{ 1261{
1235 refresh_cpu_vm_stats(); 1262 if (refresh_cpu_vm_stats())
1236 schedule_delayed_work(this_cpu_ptr(&vmstat_work), 1263 /*
1264 * Counters were updated so we expect more updates
1265 * to occur in the future. Keep on running the
1266 * update worker thread.
1267 */
1268 schedule_delayed_work(this_cpu_ptr(&vmstat_work),
1269 round_jiffies_relative(sysctl_stat_interval));
1270 else {
1271 /*
1272 * We did not update any counters so the app may be in
1273 * a mode where it does not cause counter updates.
1274 * We may be uselessly running vmstat_update.
1275 * Defer the checking for differentials to the
1276 * shepherd thread on a different processor.
1277 */
1278 int r;
1279 /*
1280 * Shepherd work thread does not race since it never
 1281 * changes the bit if it is zero, but the cpu
 1282 * online / offline code may race if
1283 * worker threads are still allowed during
1284 * shutdown / startup.
1285 */
1286 r = cpumask_test_and_set_cpu(smp_processor_id(),
1287 cpu_stat_off);
1288 VM_BUG_ON(r);
1289 }
1290}
1291
1292/*
1293 * Check if the diffs for a certain cpu indicate that
1294 * an update is needed.
1295 */
1296static bool need_update(int cpu)
1297{
1298 struct zone *zone;
1299
1300 for_each_populated_zone(zone) {
1301 struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
1302
1303 BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
1304 /*
1305 * The fast way of checking if there are any vmstat diffs.
1306 * This works because the diffs are byte sized items.
1307 */
1308 if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
1309 return true;
1310
1311 }
1312 return false;
1313}
1314
1315
1316/*
1317 * Shepherd worker thread that checks the
1318 * differentials of processors that have their worker
1319 * threads for vm statistics updates disabled because of
1320 * inactivity.
1321 */
1322static void vmstat_shepherd(struct work_struct *w);
1323
1324static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
1325
1326static void vmstat_shepherd(struct work_struct *w)
1327{
1328 int cpu;
1329
1330 get_online_cpus();
1331 /* Check processors whose vmstat worker threads have been disabled */
1332 for_each_cpu(cpu, cpu_stat_off)
1333 if (need_update(cpu) &&
1334 cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
1335
1336 schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu),
1337 __round_jiffies_relative(sysctl_stat_interval, cpu));
1338
1339 put_online_cpus();
1340
1341 schedule_delayed_work(&shepherd,
1237 round_jiffies_relative(sysctl_stat_interval)); 1342 round_jiffies_relative(sysctl_stat_interval));
1343
1238} 1344}
1239 1345
1240static void start_cpu_timer(int cpu) 1346static void __init start_shepherd_timer(void)
1241{ 1347{
1242 struct delayed_work *work = &per_cpu(vmstat_work, cpu); 1348 int cpu;
1349
1350 for_each_possible_cpu(cpu)
1351 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
1352 vmstat_update);
1353
1354 if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
1355 BUG();
1356 cpumask_copy(cpu_stat_off, cpu_online_mask);
1243 1357
1244 INIT_DEFERRABLE_WORK(work, vmstat_update); 1358 schedule_delayed_work(&shepherd,
1245 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); 1359 round_jiffies_relative(sysctl_stat_interval));
1246} 1360}
1247 1361
1248static void vmstat_cpu_dead(int node) 1362static void vmstat_cpu_dead(int node)
@@ -1273,17 +1387,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
1273 case CPU_ONLINE: 1387 case CPU_ONLINE:
1274 case CPU_ONLINE_FROZEN: 1388 case CPU_ONLINE_FROZEN:
1275 refresh_zone_stat_thresholds(); 1389 refresh_zone_stat_thresholds();
1276 start_cpu_timer(cpu);
1277 node_set_state(cpu_to_node(cpu), N_CPU); 1390 node_set_state(cpu_to_node(cpu), N_CPU);
1391 cpumask_set_cpu(cpu, cpu_stat_off);
1278 break; 1392 break;
1279 case CPU_DOWN_PREPARE: 1393 case CPU_DOWN_PREPARE:
1280 case CPU_DOWN_PREPARE_FROZEN: 1394 case CPU_DOWN_PREPARE_FROZEN:
1281 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); 1395 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1282 per_cpu(vmstat_work, cpu).work.func = NULL; 1396 cpumask_clear_cpu(cpu, cpu_stat_off);
1283 break; 1397 break;
1284 case CPU_DOWN_FAILED: 1398 case CPU_DOWN_FAILED:
1285 case CPU_DOWN_FAILED_FROZEN: 1399 case CPU_DOWN_FAILED_FROZEN:
1286 start_cpu_timer(cpu); 1400 cpumask_set_cpu(cpu, cpu_stat_off);
1287 break; 1401 break;
1288 case CPU_DEAD: 1402 case CPU_DEAD:
1289 case CPU_DEAD_FROZEN: 1403 case CPU_DEAD_FROZEN:
@@ -1303,15 +1417,10 @@ static struct notifier_block vmstat_notifier =
1303static int __init setup_vmstat(void) 1417static int __init setup_vmstat(void)
1304{ 1418{
1305#ifdef CONFIG_SMP 1419#ifdef CONFIG_SMP
1306 int cpu;
1307
1308 cpu_notifier_register_begin(); 1420 cpu_notifier_register_begin();
1309 __register_cpu_notifier(&vmstat_notifier); 1421 __register_cpu_notifier(&vmstat_notifier);
1310 1422
1311 for_each_online_cpu(cpu) { 1423 start_shepherd_timer();
1312 start_cpu_timer(cpu);
1313 node_set_state(cpu_to_node(cpu), N_CPU);
1314 }
1315 cpu_notifier_register_done(); 1424 cpu_notifier_register_done();
1316#endif 1425#endif
1317#ifdef CONFIG_PROC_FS 1426#ifdef CONFIG_PROC_FS
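The vmstat rework above stops running an update worker on quiet CPUs: vmstat_update() parks its CPU in cpu_stat_off once refresh_cpu_vm_stats() finds nothing to fold, and the single shepherd work later re-arms only CPUs whose per-cpu, byte-sized diff arrays have gone non-zero, which need_update() detects with memchr_inv(). A tiny sketch of that detection idiom (names are ours):

static bool diffs_pending(const s8 *diff, size_t nr)
{
        /* memchr_inv() returns the first byte that differs from 0, or NULL. */
        return memchr_inv(diff, 0, nr) != NULL;
}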
diff --git a/mm/zbud.c b/mm/zbud.c
index a05790b1915e..ecf1dbef6983 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -60,15 +60,17 @@
60 * NCHUNKS_ORDER determines the internal allocation granularity, effectively 60 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
61 * adjusting internal fragmentation. It also determines the number of 61 * adjusting internal fragmentation. It also determines the number of
62 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the 62 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
63 * allocation granularity will be in chunks of size PAGE_SIZE/64, and there 63 * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
64 * will be 64 freelists per pool. 64 * in an allocated page is occupied by the zbud header, NCHUNKS works out to
65 * 63, the maximum number of free chunks in a zbud page; there will also be
66 * 63 freelists per pool.
65 */ 67 */
66#define NCHUNKS_ORDER 6 68#define NCHUNKS_ORDER 6
67 69
68#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) 70#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
69#define CHUNK_SIZE (1 << CHUNK_SHIFT) 71#define CHUNK_SIZE (1 << CHUNK_SHIFT)
70#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
71#define ZHDR_SIZE_ALIGNED CHUNK_SIZE 72#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
73#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
72 74
73/** 75/**
74 * struct zbud_pool - stores metadata for each zbud pool 76 * struct zbud_pool - stores metadata for each zbud pool
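The NCHUNKS change above accounts for the chunk consumed by the zbud header in the constant itself instead of subtracting 1 inside num_free_chunks(). Working the definitions through for 4 KiB pages:

        CHUNK_SHIFT       = PAGE_SHIFT - NCHUNKS_ORDER = 12 - 6 = 6
        CHUNK_SIZE        = 1 << 6 = 64 bytes
        ZHDR_SIZE_ALIGNED = CHUNK_SIZE = 64 bytes   (the header fills one chunk)
        NCHUNKS           = (4096 - 64) >> 6 = 63 free chunks per zbud page

so num_free_chunks() = NCHUNKS - first_chunks - last_chunks yields the same values as the old "NCHUNKS - ... - 1" form.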
@@ -195,6 +197,7 @@ static struct zpool_driver zbud_zpool_driver = {
195 .total_size = zbud_zpool_total_size, 197 .total_size = zbud_zpool_total_size,
196}; 198};
197 199
200MODULE_ALIAS("zpool-zbud");
198#endif /* CONFIG_ZPOOL */ 201#endif /* CONFIG_ZPOOL */
199 202
200/***************** 203/*****************
@@ -267,10 +270,9 @@ static int num_free_chunks(struct zbud_header *zhdr)
267{ 270{
268 /* 271 /*
269 * Rather than branch for different situations, just use the fact that 272 * Rather than branch for different situations, just use the fact that
270 * free buddies have a length of zero to simplify everything. -1 at the 273 * free buddies have a length of zero to simplify everything.
271 * end for the zbud header.
272 */ 274 */
273 return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1; 275 return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
274} 276}
275 277
276/***************** 278/*****************
diff --git a/mm/zpool.c b/mm/zpool.c
index e40612a1df00..739cdf0d183a 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -150,7 +150,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
150 driver = zpool_get_driver(type); 150 driver = zpool_get_driver(type);
151 151
152 if (!driver) { 152 if (!driver) {
153 request_module(type); 153 request_module("zpool-%s", type);
154 driver = zpool_get_driver(type); 154 driver = zpool_get_driver(type);
155 } 155 }
156 156
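The request_module("zpool-%s", type) call pairs with the MODULE_ALIAS("zpool-zbud") and MODULE_ALIAS("zpool-zsmalloc") lines added elsewhere in this series: asking zpool for type "zbud" now loads the module that declares the matching alias rather than a module literally named "zbud". A hypothetical extra backend would follow the same pattern (only the alias line matters here; the ops are elided on purpose):

static struct zpool_driver zfoo_zpool_driver = {
        .type = "zfoo",
        /* .create/.destroy/.malloc/.free/... as in the zbud and zsmalloc drivers */
};

MODULE_ALIAS("zpool-zfoo");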
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 4e2fc83cb394..839a48c3ca27 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -175,7 +175,7 @@ enum fullness_group {
175 * n <= N / f, where 175 * n <= N / f, where
176 * n = number of allocated objects 176 * n = number of allocated objects
177 * N = total number of objects zspage can store 177 * N = total number of objects zspage can store
178 * f = 1/fullness_threshold_frac 178 * f = fullness_threshold_frac
179 * 179 *
180 * Similarly, we assign zspage to: 180 * Similarly, we assign zspage to:
181 * ZS_ALMOST_FULL when n > N / f 181 * ZS_ALMOST_FULL when n > N / f
@@ -199,9 +199,6 @@ struct size_class {
199 199
200 spinlock_t lock; 200 spinlock_t lock;
201 201
202 /* stats */
203 u64 pages_allocated;
204
205 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; 202 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
206}; 203};
207 204
@@ -220,6 +217,7 @@ struct zs_pool {
220 struct size_class size_class[ZS_SIZE_CLASSES]; 217 struct size_class size_class[ZS_SIZE_CLASSES];
221 218
222 gfp_t flags; /* allocation flags used when growing pool */ 219 gfp_t flags; /* allocation flags used when growing pool */
220 atomic_long_t pages_allocated;
223}; 221};
224 222
225/* 223/*
@@ -299,7 +297,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
299 297
300static u64 zs_zpool_total_size(void *pool) 298static u64 zs_zpool_total_size(void *pool)
301{ 299{
302 return zs_get_total_size_bytes(pool); 300 return zs_get_total_pages(pool) << PAGE_SHIFT;
303} 301}
304 302
305static struct zpool_driver zs_zpool_driver = { 303static struct zpool_driver zs_zpool_driver = {
@@ -315,6 +313,7 @@ static struct zpool_driver zs_zpool_driver = {
315 .total_size = zs_zpool_total_size, 313 .total_size = zs_zpool_total_size,
316}; 314};
317 315
316MODULE_ALIAS("zpool-zsmalloc");
318#endif /* CONFIG_ZPOOL */ 317#endif /* CONFIG_ZPOOL */
319 318
320/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 319/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
@@ -629,7 +628,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
629 while (page) { 628 while (page) {
630 struct page *next_page; 629 struct page *next_page;
631 struct link_free *link; 630 struct link_free *link;
632 unsigned int i, objs_on_page; 631 unsigned int i = 1;
633 632
634 /* 633 /*
635 * page->index stores offset of first object starting 634 * page->index stores offset of first object starting
@@ -642,14 +641,10 @@ static void init_zspage(struct page *first_page, struct size_class *class)
642 641
643 link = (struct link_free *)kmap_atomic(page) + 642 link = (struct link_free *)kmap_atomic(page) +
644 off / sizeof(*link); 643 off / sizeof(*link);
645 objs_on_page = (PAGE_SIZE - off) / class->size;
646 644
647 for (i = 1; i <= objs_on_page; i++) { 645 while ((off += class->size) < PAGE_SIZE) {
648 off += class->size; 646 link->next = obj_location_to_handle(page, i++);
649 if (off < PAGE_SIZE) { 647 link += class->size / sizeof(*link);
650 link->next = obj_location_to_handle(page, i);
651 link += class->size / sizeof(*link);
652 }
653 } 648 }
654 649
655 /* 650 /*
@@ -661,7 +656,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
661 link->next = obj_location_to_handle(next_page, 0); 656 link->next = obj_location_to_handle(next_page, 0);
662 kunmap_atomic(link); 657 kunmap_atomic(link);
663 page = next_page; 658 page = next_page;
664 off = (off + class->size) % PAGE_SIZE; 659 off %= PAGE_SIZE;
665 } 660 }
666} 661}
667 662
@@ -1027,8 +1022,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1027 return 0; 1022 return 0;
1028 1023
1029 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1024 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
1025 atomic_long_add(class->pages_per_zspage,
1026 &pool->pages_allocated);
1030 spin_lock(&class->lock); 1027 spin_lock(&class->lock);
1031 class->pages_allocated += class->pages_per_zspage;
1032 } 1028 }
1033 1029
1034 obj = (unsigned long)first_page->freelist; 1030 obj = (unsigned long)first_page->freelist;
@@ -1081,14 +1077,13 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
1081 1077
1082 first_page->inuse--; 1078 first_page->inuse--;
1083 fullness = fix_fullness_group(pool, first_page); 1079 fullness = fix_fullness_group(pool, first_page);
1084
1085 if (fullness == ZS_EMPTY)
1086 class->pages_allocated -= class->pages_per_zspage;
1087
1088 spin_unlock(&class->lock); 1080 spin_unlock(&class->lock);
1089 1081
1090 if (fullness == ZS_EMPTY) 1082 if (fullness == ZS_EMPTY) {
1083 atomic_long_sub(class->pages_per_zspage,
1084 &pool->pages_allocated);
1091 free_zspage(first_page); 1085 free_zspage(first_page);
1086 }
1092} 1087}
1093EXPORT_SYMBOL_GPL(zs_free); 1088EXPORT_SYMBOL_GPL(zs_free);
1094 1089
@@ -1182,17 +1177,11 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1182} 1177}
1183EXPORT_SYMBOL_GPL(zs_unmap_object); 1178EXPORT_SYMBOL_GPL(zs_unmap_object);
1184 1179
1185u64 zs_get_total_size_bytes(struct zs_pool *pool) 1180unsigned long zs_get_total_pages(struct zs_pool *pool)
1186{ 1181{
1187 int i; 1182 return atomic_long_read(&pool->pages_allocated);
1188 u64 npages = 0;
1189
1190 for (i = 0; i < ZS_SIZE_CLASSES; i++)
1191 npages += pool->size_class[i].pages_allocated;
1192
1193 return npages << PAGE_SHIFT;
1194} 1183}
1195EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); 1184EXPORT_SYMBOL_GPL(zs_get_total_pages);
1196 1185
1197module_init(zs_init); 1186module_init(zs_init);
1198module_exit(zs_exit); 1187module_exit(zs_exit);
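zs_get_total_size_bytes() is replaced by zs_get_total_pages(), backed by one pool-wide atomic_long_t instead of per-size-class counters summed under their locks. A caller that still wants a byte count converts the same way zs_zpool_total_size() does above (sketch, assuming a valid pool):

static u64 zs_pool_size_bytes(struct zs_pool *pool)
{
        return (u64)zs_get_total_pages(pool) << PAGE_SHIFT;
}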