diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 33 | ||||
-rw-r--r-- | mm/Makefile | 3 | ||||
-rw-r--r-- | mm/backing-dev.c | 2 | ||||
-rw-r--r-- | mm/bounce.c | 287 | ||||
-rw-r--r-- | mm/compaction.c | 249 | ||||
-rw-r--r-- | mm/dmapool.c | 31 | ||||
-rw-r--r-- | mm/filemap.c | 248 | ||||
-rw-r--r-- | mm/fremap.c | 7 | ||||
-rw-r--r-- | mm/frontswap.c | 13 | ||||
-rw-r--r-- | mm/gup.c | 662 | ||||
-rw-r--r-- | mm/huge_memory.c | 34 | ||||
-rw-r--r-- | mm/hugetlb.c | 363 | ||||
-rw-r--r-- | mm/internal.h | 36 | ||||
-rw-r--r-- | mm/kmemleak.c | 4 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memblock.c | 231 | ||||
-rw-r--r-- | mm/memcontrol.c | 410 | ||||
-rw-r--r-- | mm/memory-failure.c | 113 | ||||
-rw-r--r-- | mm/memory.c | 746 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 148 | ||||
-rw-r--r-- | mm/mempolicy.c | 30 | ||||
-rw-r--r-- | mm/mempool.c | 2 | ||||
-rw-r--r-- | mm/migrate.c | 63 | ||||
-rw-r--r-- | mm/mmap.c | 9 | ||||
-rw-r--r-- | mm/msync.c | 8 | ||||
-rw-r--r-- | mm/page-writeback.c | 22 | ||||
-rw-r--r-- | mm/page_alloc.c | 394 | ||||
-rw-r--r-- | mm/page_io.c | 21 | ||||
-rw-r--r-- | mm/rmap.c | 55 | ||||
-rw-r--r-- | mm/shmem.c | 8 | ||||
-rw-r--r-- | mm/slab.c | 45 | ||||
-rw-r--r-- | mm/slab.h | 48 | ||||
-rw-r--r-- | mm/slab_common.c | 95 | ||||
-rw-r--r-- | mm/slob.c | 3 | ||||
-rw-r--r-- | mm/slub.c | 225 | ||||
-rw-r--r-- | mm/swap.c | 238 | ||||
-rw-r--r-- | mm/swap_state.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 253 | ||||
-rw-r--r-- | mm/vmacache.c | 22 | ||||
-rw-r--r-- | mm/vmalloc.c | 13 | ||||
-rw-r--r-- | mm/vmscan.c | 184 | ||||
-rw-r--r-- | mm/vmstat.c | 12 | ||||
-rw-r--r-- | mm/zbud.c | 4 | ||||
-rw-r--r-- | mm/zsmalloc.c | 4 | ||||
-rw-r--r-- | mm/zswap.c | 2 |
45 files changed, 2789 insertions, 2595 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index ebe5880c29d6..3e9977a9d657 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -134,6 +134,9 @@ config HAVE_MEMBLOCK | |||
134 | config HAVE_MEMBLOCK_NODE_MAP | 134 | config HAVE_MEMBLOCK_NODE_MAP |
135 | boolean | 135 | boolean |
136 | 136 | ||
137 | config HAVE_MEMBLOCK_PHYS_MAP | ||
138 | boolean | ||
139 | |||
137 | config ARCH_DISCARD_MEMBLOCK | 140 | config ARCH_DISCARD_MEMBLOCK |
138 | boolean | 141 | boolean |
139 | 142 | ||
@@ -264,6 +267,9 @@ config MIGRATION | |||
264 | pages as migration can relocate pages to satisfy a huge page | 267 | pages as migration can relocate pages to satisfy a huge page |
265 | allocation instead of reclaiming. | 268 | allocation instead of reclaiming. |
266 | 269 | ||
270 | config ARCH_ENABLE_HUGEPAGE_MIGRATION | ||
271 | boolean | ||
272 | |||
267 | config PHYS_ADDR_T_64BIT | 273 | config PHYS_ADDR_T_64BIT |
268 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT | 274 | def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT |
269 | 275 | ||
@@ -430,16 +436,6 @@ choice | |||
430 | benefit. | 436 | benefit. |
431 | endchoice | 437 | endchoice |
432 | 438 | ||
433 | config CROSS_MEMORY_ATTACH | ||
434 | bool "Cross Memory Support" | ||
435 | depends on MMU | ||
436 | default y | ||
437 | help | ||
438 | Enabling this option adds the system calls process_vm_readv and | ||
439 | process_vm_writev which allow a process with the correct privileges | ||
440 | to directly read from or write to to another process's address space. | ||
441 | See the man page for more details. | ||
442 | |||
443 | # | 439 | # |
444 | # UP and nommu archs use km based percpu allocator | 440 | # UP and nommu archs use km based percpu allocator |
445 | # | 441 | # |
@@ -555,7 +551,7 @@ config MEM_SOFT_DIRTY | |||
555 | See Documentation/vm/soft-dirty.txt for more details. | 551 | See Documentation/vm/soft-dirty.txt for more details. |
556 | 552 | ||
557 | config ZSMALLOC | 553 | config ZSMALLOC |
558 | bool "Memory allocator for compressed pages" | 554 | tristate "Memory allocator for compressed pages" |
559 | depends on MMU | 555 | depends on MMU |
560 | default n | 556 | default n |
561 | help | 557 | help |
@@ -581,3 +577,18 @@ config PGTABLE_MAPPING | |||
581 | 577 | ||
582 | config GENERIC_EARLY_IOREMAP | 578 | config GENERIC_EARLY_IOREMAP |
583 | bool | 579 | bool |
580 | |||
581 | config MAX_STACK_SIZE_MB | ||
582 | int "Maximum user stack size for 32-bit processes (MB)" | ||
583 | default 80 | ||
584 | range 8 256 if METAG | ||
585 | range 8 2048 | ||
586 | depends on STACK_GROWSUP && (!64BIT || COMPAT) | ||
587 | help | ||
588 | This is the maximum stack size in Megabytes in the VM layout of 32-bit | ||
589 | user processes when the stack grows upwards (currently only on parisc | ||
590 | and metag arch). The stack will be located at the highest memory | ||
591 | address minus the given value, unless the RLIMIT_STACK hard limit is | ||
592 | changed to a smaller value in which case that is used. | ||
593 | |||
594 | A sane initial value is 80 MB. | ||
diff --git a/mm/Makefile b/mm/Makefile index b484452dac57..4064f3ec145e 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -3,7 +3,7 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o pgtable-generic.o | 8 | vmalloc.o pagewalk.o pgtable-generic.o |
9 | 9 | ||
@@ -30,7 +30,6 @@ endif | |||
30 | 30 | ||
31 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | 31 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o |
32 | 32 | ||
33 | obj-$(CONFIG_BOUNCE) += bounce.o | ||
34 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o | 33 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o |
35 | obj-$(CONFIG_FRONTSWAP) += frontswap.o | 34 | obj-$(CONFIG_FRONTSWAP) += frontswap.o |
36 | obj-$(CONFIG_ZSWAP) += zswap.o | 35 | obj-$(CONFIG_ZSWAP) += zswap.o |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 09d9591b7708..1706cbbdf5f0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -557,7 +557,7 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) | |||
557 | bit = sync ? BDI_sync_congested : BDI_async_congested; | 557 | bit = sync ? BDI_sync_congested : BDI_async_congested; |
558 | if (test_and_clear_bit(bit, &bdi->state)) | 558 | if (test_and_clear_bit(bit, &bdi->state)) |
559 | atomic_dec(&nr_bdi_congested[sync]); | 559 | atomic_dec(&nr_bdi_congested[sync]); |
560 | smp_mb__after_clear_bit(); | 560 | smp_mb__after_atomic(); |
561 | if (waitqueue_active(wqh)) | 561 | if (waitqueue_active(wqh)) |
562 | wake_up(wqh); | 562 | wake_up(wqh); |
563 | } | 563 | } |
diff --git a/mm/bounce.c b/mm/bounce.c deleted file mode 100644 index 523918b8c6dc..000000000000 --- a/mm/bounce.c +++ /dev/null | |||
@@ -1,287 +0,0 @@ | |||
1 | /* bounce buffer handling for block devices | ||
2 | * | ||
3 | * - Split from highmem.c | ||
4 | */ | ||
5 | |||
6 | #include <linux/mm.h> | ||
7 | #include <linux/export.h> | ||
8 | #include <linux/swap.h> | ||
9 | #include <linux/gfp.h> | ||
10 | #include <linux/bio.h> | ||
11 | #include <linux/pagemap.h> | ||
12 | #include <linux/mempool.h> | ||
13 | #include <linux/blkdev.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/hash.h> | ||
16 | #include <linux/highmem.h> | ||
17 | #include <linux/bootmem.h> | ||
18 | #include <asm/tlbflush.h> | ||
19 | |||
20 | #include <trace/events/block.h> | ||
21 | |||
22 | #define POOL_SIZE 64 | ||
23 | #define ISA_POOL_SIZE 16 | ||
24 | |||
25 | static mempool_t *page_pool, *isa_page_pool; | ||
26 | |||
27 | #if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) | ||
28 | static __init int init_emergency_pool(void) | ||
29 | { | ||
30 | #if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) | ||
31 | if (max_pfn <= max_low_pfn) | ||
32 | return 0; | ||
33 | #endif | ||
34 | |||
35 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); | ||
36 | BUG_ON(!page_pool); | ||
37 | printk("bounce pool size: %d pages\n", POOL_SIZE); | ||
38 | |||
39 | return 0; | ||
40 | } | ||
41 | |||
42 | __initcall(init_emergency_pool); | ||
43 | #endif | ||
44 | |||
45 | #ifdef CONFIG_HIGHMEM | ||
46 | /* | ||
47 | * highmem version, map in to vec | ||
48 | */ | ||
49 | static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) | ||
50 | { | ||
51 | unsigned long flags; | ||
52 | unsigned char *vto; | ||
53 | |||
54 | local_irq_save(flags); | ||
55 | vto = kmap_atomic(to->bv_page); | ||
56 | memcpy(vto + to->bv_offset, vfrom, to->bv_len); | ||
57 | kunmap_atomic(vto); | ||
58 | local_irq_restore(flags); | ||
59 | } | ||
60 | |||
61 | #else /* CONFIG_HIGHMEM */ | ||
62 | |||
63 | #define bounce_copy_vec(to, vfrom) \ | ||
64 | memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) | ||
65 | |||
66 | #endif /* CONFIG_HIGHMEM */ | ||
67 | |||
68 | /* | ||
69 | * allocate pages in the DMA region for the ISA pool | ||
70 | */ | ||
71 | static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) | ||
72 | { | ||
73 | return mempool_alloc_pages(gfp_mask | GFP_DMA, data); | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA | ||
78 | * as the max address, so check if the pool has already been created. | ||
79 | */ | ||
80 | int init_emergency_isa_pool(void) | ||
81 | { | ||
82 | if (isa_page_pool) | ||
83 | return 0; | ||
84 | |||
85 | isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, | ||
86 | mempool_free_pages, (void *) 0); | ||
87 | BUG_ON(!isa_page_pool); | ||
88 | |||
89 | printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); | ||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * Simple bounce buffer support for highmem pages. Depending on the | ||
95 | * queue gfp mask set, *to may or may not be a highmem page. kmap it | ||
96 | * always, it will do the Right Thing | ||
97 | */ | ||
98 | static void copy_to_high_bio_irq(struct bio *to, struct bio *from) | ||
99 | { | ||
100 | unsigned char *vfrom; | ||
101 | struct bio_vec tovec, *fromvec = from->bi_io_vec; | ||
102 | struct bvec_iter iter; | ||
103 | |||
104 | bio_for_each_segment(tovec, to, iter) { | ||
105 | if (tovec.bv_page != fromvec->bv_page) { | ||
106 | /* | ||
107 | * fromvec->bv_offset and fromvec->bv_len might have | ||
108 | * been modified by the block layer, so use the original | ||
109 | * copy, bounce_copy_vec already uses tovec->bv_len | ||
110 | */ | ||
111 | vfrom = page_address(fromvec->bv_page) + | ||
112 | tovec.bv_offset; | ||
113 | |||
114 | bounce_copy_vec(&tovec, vfrom); | ||
115 | flush_dcache_page(tovec.bv_page); | ||
116 | } | ||
117 | |||
118 | fromvec++; | ||
119 | } | ||
120 | } | ||
121 | |||
122 | static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) | ||
123 | { | ||
124 | struct bio *bio_orig = bio->bi_private; | ||
125 | struct bio_vec *bvec, *org_vec; | ||
126 | int i; | ||
127 | |||
128 | if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) | ||
129 | set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); | ||
130 | |||
131 | /* | ||
132 | * free up bounce indirect pages used | ||
133 | */ | ||
134 | bio_for_each_segment_all(bvec, bio, i) { | ||
135 | org_vec = bio_orig->bi_io_vec + i; | ||
136 | if (bvec->bv_page == org_vec->bv_page) | ||
137 | continue; | ||
138 | |||
139 | dec_zone_page_state(bvec->bv_page, NR_BOUNCE); | ||
140 | mempool_free(bvec->bv_page, pool); | ||
141 | } | ||
142 | |||
143 | bio_endio(bio_orig, err); | ||
144 | bio_put(bio); | ||
145 | } | ||
146 | |||
147 | static void bounce_end_io_write(struct bio *bio, int err) | ||
148 | { | ||
149 | bounce_end_io(bio, page_pool, err); | ||
150 | } | ||
151 | |||
152 | static void bounce_end_io_write_isa(struct bio *bio, int err) | ||
153 | { | ||
154 | |||
155 | bounce_end_io(bio, isa_page_pool, err); | ||
156 | } | ||
157 | |||
158 | static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) | ||
159 | { | ||
160 | struct bio *bio_orig = bio->bi_private; | ||
161 | |||
162 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
163 | copy_to_high_bio_irq(bio_orig, bio); | ||
164 | |||
165 | bounce_end_io(bio, pool, err); | ||
166 | } | ||
167 | |||
168 | static void bounce_end_io_read(struct bio *bio, int err) | ||
169 | { | ||
170 | __bounce_end_io_read(bio, page_pool, err); | ||
171 | } | ||
172 | |||
173 | static void bounce_end_io_read_isa(struct bio *bio, int err) | ||
174 | { | ||
175 | __bounce_end_io_read(bio, isa_page_pool, err); | ||
176 | } | ||
177 | |||
178 | #ifdef CONFIG_NEED_BOUNCE_POOL | ||
179 | static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) | ||
180 | { | ||
181 | if (bio_data_dir(bio) != WRITE) | ||
182 | return 0; | ||
183 | |||
184 | if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) | ||
185 | return 0; | ||
186 | |||
187 | return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); | ||
188 | } | ||
189 | #else | ||
190 | static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) | ||
191 | { | ||
192 | return 0; | ||
193 | } | ||
194 | #endif /* CONFIG_NEED_BOUNCE_POOL */ | ||
195 | |||
196 | static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, | ||
197 | mempool_t *pool, int force) | ||
198 | { | ||
199 | struct bio *bio; | ||
200 | int rw = bio_data_dir(*bio_orig); | ||
201 | struct bio_vec *to, from; | ||
202 | struct bvec_iter iter; | ||
203 | unsigned i; | ||
204 | |||
205 | if (force) | ||
206 | goto bounce; | ||
207 | bio_for_each_segment(from, *bio_orig, iter) | ||
208 | if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) | ||
209 | goto bounce; | ||
210 | |||
211 | return; | ||
212 | bounce: | ||
213 | bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); | ||
214 | |||
215 | bio_for_each_segment_all(to, bio, i) { | ||
216 | struct page *page = to->bv_page; | ||
217 | |||
218 | if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) | ||
219 | continue; | ||
220 | |||
221 | inc_zone_page_state(to->bv_page, NR_BOUNCE); | ||
222 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); | ||
223 | |||
224 | if (rw == WRITE) { | ||
225 | char *vto, *vfrom; | ||
226 | |||
227 | flush_dcache_page(page); | ||
228 | |||
229 | vto = page_address(to->bv_page) + to->bv_offset; | ||
230 | vfrom = kmap_atomic(page) + to->bv_offset; | ||
231 | memcpy(vto, vfrom, to->bv_len); | ||
232 | kunmap_atomic(vfrom); | ||
233 | } | ||
234 | } | ||
235 | |||
236 | trace_block_bio_bounce(q, *bio_orig); | ||
237 | |||
238 | bio->bi_flags |= (1 << BIO_BOUNCED); | ||
239 | |||
240 | if (pool == page_pool) { | ||
241 | bio->bi_end_io = bounce_end_io_write; | ||
242 | if (rw == READ) | ||
243 | bio->bi_end_io = bounce_end_io_read; | ||
244 | } else { | ||
245 | bio->bi_end_io = bounce_end_io_write_isa; | ||
246 | if (rw == READ) | ||
247 | bio->bi_end_io = bounce_end_io_read_isa; | ||
248 | } | ||
249 | |||
250 | bio->bi_private = *bio_orig; | ||
251 | *bio_orig = bio; | ||
252 | } | ||
253 | |||
254 | void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) | ||
255 | { | ||
256 | int must_bounce; | ||
257 | mempool_t *pool; | ||
258 | |||
259 | /* | ||
260 | * Data-less bio, nothing to bounce | ||
261 | */ | ||
262 | if (!bio_has_data(*bio_orig)) | ||
263 | return; | ||
264 | |||
265 | must_bounce = must_snapshot_stable_pages(q, *bio_orig); | ||
266 | |||
267 | /* | ||
268 | * for non-isa bounce case, just check if the bounce pfn is equal | ||
269 | * to or bigger than the highest pfn in the system -- in that case, | ||
270 | * don't waste time iterating over bio segments | ||
271 | */ | ||
272 | if (!(q->bounce_gfp & GFP_DMA)) { | ||
273 | if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) | ||
274 | return; | ||
275 | pool = page_pool; | ||
276 | } else { | ||
277 | BUG_ON(!isa_page_pool); | ||
278 | pool = isa_page_pool; | ||
279 | } | ||
280 | |||
281 | /* | ||
282 | * slow path | ||
283 | */ | ||
284 | __blk_queue_bounce(q, bio_orig, pool, must_bounce); | ||
285 | } | ||
286 | |||
287 | EXPORT_SYMBOL(blk_queue_bounce); | ||
diff --git a/mm/compaction.c b/mm/compaction.c index 627dc2e4320f..21bf292b642a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone) | |||
89 | unsigned long end_pfn = zone_end_pfn(zone); | 89 | unsigned long end_pfn = zone_end_pfn(zone); |
90 | unsigned long pfn; | 90 | unsigned long pfn; |
91 | 91 | ||
92 | zone->compact_cached_migrate_pfn = start_pfn; | 92 | zone->compact_cached_migrate_pfn[0] = start_pfn; |
93 | zone->compact_cached_migrate_pfn[1] = start_pfn; | ||
93 | zone->compact_cached_free_pfn = end_pfn; | 94 | zone->compact_cached_free_pfn = end_pfn; |
94 | zone->compact_blockskip_flush = false; | 95 | zone->compact_blockskip_flush = false; |
95 | 96 | ||
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat) | |||
131 | */ | 132 | */ |
132 | static void update_pageblock_skip(struct compact_control *cc, | 133 | static void update_pageblock_skip(struct compact_control *cc, |
133 | struct page *page, unsigned long nr_isolated, | 134 | struct page *page, unsigned long nr_isolated, |
134 | bool migrate_scanner) | 135 | bool set_unsuitable, bool migrate_scanner) |
135 | { | 136 | { |
136 | struct zone *zone = cc->zone; | 137 | struct zone *zone = cc->zone; |
138 | unsigned long pfn; | ||
137 | 139 | ||
138 | if (cc->ignore_skip_hint) | 140 | if (cc->ignore_skip_hint) |
139 | return; | 141 | return; |
@@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc, | |||
141 | if (!page) | 143 | if (!page) |
142 | return; | 144 | return; |
143 | 145 | ||
144 | if (!nr_isolated) { | 146 | if (nr_isolated) |
145 | unsigned long pfn = page_to_pfn(page); | 147 | return; |
148 | |||
149 | /* | ||
150 | * Only skip pageblocks when all forms of compaction will be known to | ||
151 | * fail in the near future. | ||
152 | */ | ||
153 | if (set_unsuitable) | ||
146 | set_pageblock_skip(page); | 154 | set_pageblock_skip(page); |
147 | 155 | ||
148 | /* Update where compaction should restart */ | 156 | pfn = page_to_pfn(page); |
149 | if (migrate_scanner) { | 157 | |
150 | if (!cc->finished_update_migrate && | 158 | /* Update where async and sync compaction should restart */ |
151 | pfn > zone->compact_cached_migrate_pfn) | 159 | if (migrate_scanner) { |
152 | zone->compact_cached_migrate_pfn = pfn; | 160 | if (cc->finished_update_migrate) |
153 | } else { | 161 | return; |
154 | if (!cc->finished_update_free && | 162 | if (pfn > zone->compact_cached_migrate_pfn[0]) |
155 | pfn < zone->compact_cached_free_pfn) | 163 | zone->compact_cached_migrate_pfn[0] = pfn; |
156 | zone->compact_cached_free_pfn = pfn; | 164 | if (cc->mode != MIGRATE_ASYNC && |
157 | } | 165 | pfn > zone->compact_cached_migrate_pfn[1]) |
166 | zone->compact_cached_migrate_pfn[1] = pfn; | ||
167 | } else { | ||
168 | if (cc->finished_update_free) | ||
169 | return; | ||
170 | if (pfn < zone->compact_cached_free_pfn) | ||
171 | zone->compact_cached_free_pfn = pfn; | ||
158 | } | 172 | } |
159 | } | 173 | } |
160 | #else | 174 | #else |
@@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc, | |||
166 | 180 | ||
167 | static void update_pageblock_skip(struct compact_control *cc, | 181 | static void update_pageblock_skip(struct compact_control *cc, |
168 | struct page *page, unsigned long nr_isolated, | 182 | struct page *page, unsigned long nr_isolated, |
169 | bool migrate_scanner) | 183 | bool set_unsuitable, bool migrate_scanner) |
170 | { | 184 | { |
171 | } | 185 | } |
172 | #endif /* CONFIG_COMPACTION */ | 186 | #endif /* CONFIG_COMPACTION */ |
@@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | |||
195 | } | 209 | } |
196 | 210 | ||
197 | /* async aborts if taking too long or contended */ | 211 | /* async aborts if taking too long or contended */ |
198 | if (!cc->sync) { | 212 | if (cc->mode == MIGRATE_ASYNC) { |
199 | cc->contended = true; | 213 | cc->contended = true; |
200 | return false; | 214 | return false; |
201 | } | 215 | } |
@@ -208,10 +222,28 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | |||
208 | return true; | 222 | return true; |
209 | } | 223 | } |
210 | 224 | ||
211 | static inline bool compact_trylock_irqsave(spinlock_t *lock, | 225 | /* |
212 | unsigned long *flags, struct compact_control *cc) | 226 | * Aside from avoiding lock contention, compaction also periodically checks |
227 | * need_resched() and either schedules in sync compaction or aborts async | ||
228 | * compaction. This is similar to what compact_checklock_irqsave() does, but | ||
229 | * is used where no lock is concerned. | ||
230 | * | ||
231 | * Returns false when no scheduling was needed, or sync compaction scheduled. | ||
232 | * Returns true when async compaction should abort. | ||
233 | */ | ||
234 | static inline bool compact_should_abort(struct compact_control *cc) | ||
213 | { | 235 | { |
214 | return compact_checklock_irqsave(lock, flags, false, cc); | 236 | /* async compaction aborts if contended */ |
237 | if (need_resched()) { | ||
238 | if (cc->mode == MIGRATE_ASYNC) { | ||
239 | cc->contended = true; | ||
240 | return true; | ||
241 | } | ||
242 | |||
243 | cond_resched(); | ||
244 | } | ||
245 | |||
246 | return false; | ||
215 | } | 247 | } |
216 | 248 | ||
217 | /* Returns true if the page is within a block suitable for migration to */ | 249 | /* Returns true if the page is within a block suitable for migration to */ |
@@ -329,7 +361,8 @@ isolate_fail: | |||
329 | 361 | ||
330 | /* Update the pageblock-skip if the whole pageblock was scanned */ | 362 | /* Update the pageblock-skip if the whole pageblock was scanned */ |
331 | if (blockpfn == end_pfn) | 363 | if (blockpfn == end_pfn) |
332 | update_pageblock_skip(cc, valid_page, total_isolated, false); | 364 | update_pageblock_skip(cc, valid_page, total_isolated, true, |
365 | false); | ||
333 | 366 | ||
334 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); | 367 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); |
335 | if (total_isolated) | 368 | if (total_isolated) |
@@ -464,8 +497,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
464 | unsigned long flags; | 497 | unsigned long flags; |
465 | bool locked = false; | 498 | bool locked = false; |
466 | struct page *page = NULL, *valid_page = NULL; | 499 | struct page *page = NULL, *valid_page = NULL; |
467 | bool skipped_async_unsuitable = false; | 500 | bool set_unsuitable = true; |
468 | const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | | 501 | const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? |
502 | ISOLATE_ASYNC_MIGRATE : 0) | | ||
469 | (unevictable ? ISOLATE_UNEVICTABLE : 0); | 503 | (unevictable ? ISOLATE_UNEVICTABLE : 0); |
470 | 504 | ||
471 | /* | 505 | /* |
@@ -475,7 +509,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
475 | */ | 509 | */ |
476 | while (unlikely(too_many_isolated(zone))) { | 510 | while (unlikely(too_many_isolated(zone))) { |
477 | /* async migration should just abort */ | 511 | /* async migration should just abort */ |
478 | if (!cc->sync) | 512 | if (cc->mode == MIGRATE_ASYNC) |
479 | return 0; | 513 | return 0; |
480 | 514 | ||
481 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 515 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -484,8 +518,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
484 | return 0; | 518 | return 0; |
485 | } | 519 | } |
486 | 520 | ||
521 | if (compact_should_abort(cc)) | ||
522 | return 0; | ||
523 | |||
487 | /* Time to isolate some pages for migration */ | 524 | /* Time to isolate some pages for migration */ |
488 | cond_resched(); | ||
489 | for (; low_pfn < end_pfn; low_pfn++) { | 525 | for (; low_pfn < end_pfn; low_pfn++) { |
490 | /* give a chance to irqs before checking need_resched() */ | 526 | /* give a chance to irqs before checking need_resched() */ |
491 | if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { | 527 | if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { |
@@ -540,9 +576,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
540 | * the minimum amount of work satisfies the allocation | 576 | * the minimum amount of work satisfies the allocation |
541 | */ | 577 | */ |
542 | mt = get_pageblock_migratetype(page); | 578 | mt = get_pageblock_migratetype(page); |
543 | if (!cc->sync && !migrate_async_suitable(mt)) { | 579 | if (cc->mode == MIGRATE_ASYNC && |
544 | cc->finished_update_migrate = true; | 580 | !migrate_async_suitable(mt)) { |
545 | skipped_async_unsuitable = true; | 581 | set_unsuitable = false; |
546 | goto next_pageblock; | 582 | goto next_pageblock; |
547 | } | 583 | } |
548 | } | 584 | } |
@@ -646,11 +682,10 @@ next_pageblock: | |||
646 | /* | 682 | /* |
647 | * Update the pageblock-skip information and cached scanner pfn, | 683 | * Update the pageblock-skip information and cached scanner pfn, |
648 | * if the whole pageblock was scanned without isolating any page. | 684 | * if the whole pageblock was scanned without isolating any page. |
649 | * This is not done when pageblock was skipped due to being unsuitable | ||
650 | * for async compaction, so that eventual sync compaction can try. | ||
651 | */ | 685 | */ |
652 | if (low_pfn == end_pfn && !skipped_async_unsuitable) | 686 | if (low_pfn == end_pfn) |
653 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | 687 | update_pageblock_skip(cc, valid_page, nr_isolated, |
688 | set_unsuitable, true); | ||
654 | 689 | ||
655 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 690 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
656 | 691 | ||
@@ -671,7 +706,9 @@ static void isolate_freepages(struct zone *zone, | |||
671 | struct compact_control *cc) | 706 | struct compact_control *cc) |
672 | { | 707 | { |
673 | struct page *page; | 708 | struct page *page; |
674 | unsigned long high_pfn, low_pfn, pfn, z_end_pfn; | 709 | unsigned long block_start_pfn; /* start of current pageblock */ |
710 | unsigned long block_end_pfn; /* end of current pageblock */ | ||
711 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ | ||
675 | int nr_freepages = cc->nr_freepages; | 712 | int nr_freepages = cc->nr_freepages; |
676 | struct list_head *freelist = &cc->freepages; | 713 | struct list_head *freelist = &cc->freepages; |
677 | 714 | ||
@@ -679,41 +716,38 @@ static void isolate_freepages(struct zone *zone, | |||
679 | * Initialise the free scanner. The starting point is where we last | 716 | * Initialise the free scanner. The starting point is where we last |
680 | * successfully isolated from, zone-cached value, or the end of the | 717 | * successfully isolated from, zone-cached value, or the end of the |
681 | * zone when isolating for the first time. We need this aligned to | 718 | * zone when isolating for the first time. We need this aligned to |
682 | * the pageblock boundary, because we do pfn -= pageblock_nr_pages | 719 | * the pageblock boundary, because we do |
683 | * in the for loop. | 720 | * block_start_pfn -= pageblock_nr_pages in the for loop. |
721 | * For ending point, take care when isolating in last pageblock of a | ||
722 | * a zone which ends in the middle of a pageblock. | ||
684 | * The low boundary is the end of the pageblock the migration scanner | 723 | * The low boundary is the end of the pageblock the migration scanner |
685 | * is using. | 724 | * is using. |
686 | */ | 725 | */ |
687 | pfn = cc->free_pfn & ~(pageblock_nr_pages-1); | 726 | block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); |
727 | block_end_pfn = min(block_start_pfn + pageblock_nr_pages, | ||
728 | zone_end_pfn(zone)); | ||
688 | low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); | 729 | low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); |
689 | 730 | ||
690 | /* | 731 | /* |
691 | * Take care that if the migration scanner is at the end of the zone | ||
692 | * that the free scanner does not accidentally move to the next zone | ||
693 | * in the next isolation cycle. | ||
694 | */ | ||
695 | high_pfn = min(low_pfn, pfn); | ||
696 | |||
697 | z_end_pfn = zone_end_pfn(zone); | ||
698 | |||
699 | /* | ||
700 | * Isolate free pages until enough are available to migrate the | 732 | * Isolate free pages until enough are available to migrate the |
701 | * pages on cc->migratepages. We stop searching if the migrate | 733 | * pages on cc->migratepages. We stop searching if the migrate |
702 | * and free page scanners meet or enough free pages are isolated. | 734 | * and free page scanners meet or enough free pages are isolated. |
703 | */ | 735 | */ |
704 | for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; | 736 | for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; |
705 | pfn -= pageblock_nr_pages) { | 737 | block_end_pfn = block_start_pfn, |
738 | block_start_pfn -= pageblock_nr_pages) { | ||
706 | unsigned long isolated; | 739 | unsigned long isolated; |
707 | unsigned long end_pfn; | ||
708 | 740 | ||
709 | /* | 741 | /* |
710 | * This can iterate a massively long zone without finding any | 742 | * This can iterate a massively long zone without finding any |
711 | * suitable migration targets, so periodically check if we need | 743 | * suitable migration targets, so periodically check if we need |
712 | * to schedule. | 744 | * to schedule, or even abort async compaction. |
713 | */ | 745 | */ |
714 | cond_resched(); | 746 | if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) |
747 | && compact_should_abort(cc)) | ||
748 | break; | ||
715 | 749 | ||
716 | if (!pfn_valid(pfn)) | 750 | if (!pfn_valid(block_start_pfn)) |
717 | continue; | 751 | continue; |
718 | 752 | ||
719 | /* | 753 | /* |
@@ -723,7 +757,7 @@ static void isolate_freepages(struct zone *zone, | |||
723 | * i.e. it's possible that all pages within a zones range of | 757 | * i.e. it's possible that all pages within a zones range of |
724 | * pages do not belong to a single zone. | 758 | * pages do not belong to a single zone. |
725 | */ | 759 | */ |
726 | page = pfn_to_page(pfn); | 760 | page = pfn_to_page(block_start_pfn); |
727 | if (page_zone(page) != zone) | 761 | if (page_zone(page) != zone) |
728 | continue; | 762 | continue; |
729 | 763 | ||
@@ -736,26 +770,26 @@ static void isolate_freepages(struct zone *zone, | |||
736 | continue; | 770 | continue; |
737 | 771 | ||
738 | /* Found a block suitable for isolating free pages from */ | 772 | /* Found a block suitable for isolating free pages from */ |
739 | isolated = 0; | 773 | cc->free_pfn = block_start_pfn; |
774 | isolated = isolate_freepages_block(cc, block_start_pfn, | ||
775 | block_end_pfn, freelist, false); | ||
776 | nr_freepages += isolated; | ||
740 | 777 | ||
741 | /* | 778 | /* |
742 | * Take care when isolating in last pageblock of a zone which | 779 | * Set a flag that we successfully isolated in this pageblock. |
743 | * ends in the middle of a pageblock. | 780 | * In the next loop iteration, zone->compact_cached_free_pfn |
781 | * will not be updated and thus it will effectively contain the | ||
782 | * highest pageblock we isolated pages from. | ||
744 | */ | 783 | */ |
745 | end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); | 784 | if (isolated) |
746 | isolated = isolate_freepages_block(cc, pfn, end_pfn, | 785 | cc->finished_update_free = true; |
747 | freelist, false); | ||
748 | nr_freepages += isolated; | ||
749 | 786 | ||
750 | /* | 787 | /* |
751 | * Record the highest PFN we isolated pages from. When next | 788 | * isolate_freepages_block() might have aborted due to async |
752 | * looking for free pages, the search will restart here as | 789 | * compaction being contended |
753 | * page migration may have returned some pages to the allocator | ||
754 | */ | 790 | */ |
755 | if (isolated) { | 791 | if (cc->contended) |
756 | cc->finished_update_free = true; | 792 | break; |
757 | high_pfn = max(high_pfn, pfn); | ||
758 | } | ||
759 | } | 793 | } |
760 | 794 | ||
761 | /* split_free_page does not map the pages */ | 795 | /* split_free_page does not map the pages */ |
@@ -765,10 +799,9 @@ static void isolate_freepages(struct zone *zone, | |||
765 | * If we crossed the migrate scanner, we want to keep it that way | 799 | * If we crossed the migrate scanner, we want to keep it that way |
766 | * so that compact_finished() may detect this | 800 | * so that compact_finished() may detect this |
767 | */ | 801 | */ |
768 | if (pfn < low_pfn) | 802 | if (block_start_pfn < low_pfn) |
769 | cc->free_pfn = max(pfn, zone->zone_start_pfn); | 803 | cc->free_pfn = cc->migrate_pfn; |
770 | else | 804 | |
771 | cc->free_pfn = high_pfn; | ||
772 | cc->nr_freepages = nr_freepages; | 805 | cc->nr_freepages = nr_freepages; |
773 | } | 806 | } |
774 | 807 | ||
@@ -783,9 +816,13 @@ static struct page *compaction_alloc(struct page *migratepage, | |||
783 | struct compact_control *cc = (struct compact_control *)data; | 816 | struct compact_control *cc = (struct compact_control *)data; |
784 | struct page *freepage; | 817 | struct page *freepage; |
785 | 818 | ||
786 | /* Isolate free pages if necessary */ | 819 | /* |
820 | * Isolate free pages if necessary, and if we are not aborting due to | ||
821 | * contention. | ||
822 | */ | ||
787 | if (list_empty(&cc->freepages)) { | 823 | if (list_empty(&cc->freepages)) { |
788 | isolate_freepages(cc->zone, cc); | 824 | if (!cc->contended) |
825 | isolate_freepages(cc->zone, cc); | ||
789 | 826 | ||
790 | if (list_empty(&cc->freepages)) | 827 | if (list_empty(&cc->freepages)) |
791 | return NULL; | 828 | return NULL; |
@@ -799,23 +836,16 @@ static struct page *compaction_alloc(struct page *migratepage, | |||
799 | } | 836 | } |
800 | 837 | ||
801 | /* | 838 | /* |
802 | * We cannot control nr_migratepages and nr_freepages fully when migration is | 839 | * This is a migrate-callback that "frees" freepages back to the isolated |
803 | * running as migrate_pages() has no knowledge of compact_control. When | 840 | * freelist. All pages on the freelist are from the same zone, so there is no |
804 | * migration is complete, we count the number of pages on the lists by hand. | 841 | * special handling needed for NUMA. |
805 | */ | 842 | */ |
806 | static void update_nr_listpages(struct compact_control *cc) | 843 | static void compaction_free(struct page *page, unsigned long data) |
807 | { | 844 | { |
808 | int nr_migratepages = 0; | 845 | struct compact_control *cc = (struct compact_control *)data; |
809 | int nr_freepages = 0; | ||
810 | struct page *page; | ||
811 | |||
812 | list_for_each_entry(page, &cc->migratepages, lru) | ||
813 | nr_migratepages++; | ||
814 | list_for_each_entry(page, &cc->freepages, lru) | ||
815 | nr_freepages++; | ||
816 | 846 | ||
817 | cc->nr_migratepages = nr_migratepages; | 847 | list_add(&page->lru, &cc->freepages); |
818 | cc->nr_freepages = nr_freepages; | 848 | cc->nr_freepages++; |
819 | } | 849 | } |
820 | 850 | ||
821 | /* possible outcome of isolate_migratepages */ | 851 | /* possible outcome of isolate_migratepages */ |
@@ -862,13 +892,14 @@ static int compact_finished(struct zone *zone, | |||
862 | unsigned int order; | 892 | unsigned int order; |
863 | unsigned long watermark; | 893 | unsigned long watermark; |
864 | 894 | ||
865 | if (fatal_signal_pending(current)) | 895 | if (cc->contended || fatal_signal_pending(current)) |
866 | return COMPACT_PARTIAL; | 896 | return COMPACT_PARTIAL; |
867 | 897 | ||
868 | /* Compaction run completes if the migrate and free scanner meet */ | 898 | /* Compaction run completes if the migrate and free scanner meet */ |
869 | if (cc->free_pfn <= cc->migrate_pfn) { | 899 | if (cc->free_pfn <= cc->migrate_pfn) { |
870 | /* Let the next compaction start anew. */ | 900 | /* Let the next compaction start anew. */ |
871 | zone->compact_cached_migrate_pfn = zone->zone_start_pfn; | 901 | zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; |
902 | zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; | ||
872 | zone->compact_cached_free_pfn = zone_end_pfn(zone); | 903 | zone->compact_cached_free_pfn = zone_end_pfn(zone); |
873 | 904 | ||
874 | /* | 905 | /* |
@@ -968,6 +999,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
968 | int ret; | 999 | int ret; |
969 | unsigned long start_pfn = zone->zone_start_pfn; | 1000 | unsigned long start_pfn = zone->zone_start_pfn; |
970 | unsigned long end_pfn = zone_end_pfn(zone); | 1001 | unsigned long end_pfn = zone_end_pfn(zone); |
1002 | const bool sync = cc->mode != MIGRATE_ASYNC; | ||
971 | 1003 | ||
972 | ret = compaction_suitable(zone, cc->order); | 1004 | ret = compaction_suitable(zone, cc->order); |
973 | switch (ret) { | 1005 | switch (ret) { |
@@ -993,7 +1025,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
993 | * information on where the scanners should start but check that it | 1025 | * information on where the scanners should start but check that it |
994 | * is initialised by ensuring the values are within zone boundaries. | 1026 | * is initialised by ensuring the values are within zone boundaries. |
995 | */ | 1027 | */ |
996 | cc->migrate_pfn = zone->compact_cached_migrate_pfn; | 1028 | cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; |
997 | cc->free_pfn = zone->compact_cached_free_pfn; | 1029 | cc->free_pfn = zone->compact_cached_free_pfn; |
998 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { | 1030 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { |
999 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); | 1031 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); |
@@ -1001,7 +1033,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1001 | } | 1033 | } |
1002 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { | 1034 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { |
1003 | cc->migrate_pfn = start_pfn; | 1035 | cc->migrate_pfn = start_pfn; |
1004 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; | 1036 | zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; |
1037 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; | ||
1005 | } | 1038 | } |
1006 | 1039 | ||
1007 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); | 1040 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); |
@@ -1009,7 +1042,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1009 | migrate_prep_local(); | 1042 | migrate_prep_local(); |
1010 | 1043 | ||
1011 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 1044 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { |
1012 | unsigned long nr_migrate, nr_remaining; | ||
1013 | int err; | 1045 | int err; |
1014 | 1046 | ||
1015 | switch (isolate_migratepages(zone, cc)) { | 1047 | switch (isolate_migratepages(zone, cc)) { |
@@ -1024,21 +1056,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1024 | ; | 1056 | ; |
1025 | } | 1057 | } |
1026 | 1058 | ||
1027 | nr_migrate = cc->nr_migratepages; | 1059 | if (!cc->nr_migratepages) |
1060 | continue; | ||
1061 | |||
1028 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 1062 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
1029 | (unsigned long)cc, | 1063 | compaction_free, (unsigned long)cc, cc->mode, |
1030 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, | ||
1031 | MR_COMPACTION); | 1064 | MR_COMPACTION); |
1032 | update_nr_listpages(cc); | ||
1033 | nr_remaining = cc->nr_migratepages; | ||
1034 | 1065 | ||
1035 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | 1066 | trace_mm_compaction_migratepages(cc->nr_migratepages, err, |
1036 | nr_remaining); | 1067 | &cc->migratepages); |
1037 | 1068 | ||
1038 | /* Release isolated pages not migrated */ | 1069 | /* All pages were either migrated or will be released */ |
1070 | cc->nr_migratepages = 0; | ||
1039 | if (err) { | 1071 | if (err) { |
1040 | putback_movable_pages(&cc->migratepages); | 1072 | putback_movable_pages(&cc->migratepages); |
1041 | cc->nr_migratepages = 0; | ||
1042 | /* | 1073 | /* |
1043 | * migrate_pages() may return -ENOMEM when scanners meet | 1074 | * migrate_pages() may return -ENOMEM when scanners meet |
1044 | * and we want compact_finished() to detect it | 1075 | * and we want compact_finished() to detect it |
@@ -1060,9 +1091,8 @@ out: | |||
1060 | return ret; | 1091 | return ret; |
1061 | } | 1092 | } |
1062 | 1093 | ||
1063 | static unsigned long compact_zone_order(struct zone *zone, | 1094 | static unsigned long compact_zone_order(struct zone *zone, int order, |
1064 | int order, gfp_t gfp_mask, | 1095 | gfp_t gfp_mask, enum migrate_mode mode, bool *contended) |
1065 | bool sync, bool *contended) | ||
1066 | { | 1096 | { |
1067 | unsigned long ret; | 1097 | unsigned long ret; |
1068 | struct compact_control cc = { | 1098 | struct compact_control cc = { |
@@ -1071,7 +1101,7 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
1071 | .order = order, | 1101 | .order = order, |
1072 | .migratetype = allocflags_to_migratetype(gfp_mask), | 1102 | .migratetype = allocflags_to_migratetype(gfp_mask), |
1073 | .zone = zone, | 1103 | .zone = zone, |
1074 | .sync = sync, | 1104 | .mode = mode, |
1075 | }; | 1105 | }; |
1076 | INIT_LIST_HEAD(&cc.freepages); | 1106 | INIT_LIST_HEAD(&cc.freepages); |
1077 | INIT_LIST_HEAD(&cc.migratepages); | 1107 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -1093,7 +1123,7 @@ int sysctl_extfrag_threshold = 500; | |||
1093 | * @order: The order of the current allocation | 1123 | * @order: The order of the current allocation |
1094 | * @gfp_mask: The GFP mask of the current allocation | 1124 | * @gfp_mask: The GFP mask of the current allocation |
1095 | * @nodemask: The allowed nodes to allocate from | 1125 | * @nodemask: The allowed nodes to allocate from |
1096 | * @sync: Whether migration is synchronous or not | 1126 | * @mode: The migration mode for async, sync light, or sync migration |
1097 | * @contended: Return value that is true if compaction was aborted due to lock contention | 1127 | * @contended: Return value that is true if compaction was aborted due to lock contention |
1098 | * @page: Optionally capture a free page of the requested order during compaction | 1128 | * @page: Optionally capture a free page of the requested order during compaction |
1099 | * | 1129 | * |
@@ -1101,7 +1131,7 @@ int sysctl_extfrag_threshold = 500; | |||
1101 | */ | 1131 | */ |
1102 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1132 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
1103 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1133 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
1104 | bool sync, bool *contended) | 1134 | enum migrate_mode mode, bool *contended) |
1105 | { | 1135 | { |
1106 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1136 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
1107 | int may_enter_fs = gfp_mask & __GFP_FS; | 1137 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -1126,7 +1156,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1126 | nodemask) { | 1156 | nodemask) { |
1127 | int status; | 1157 | int status; |
1128 | 1158 | ||
1129 | status = compact_zone_order(zone, order, gfp_mask, sync, | 1159 | status = compact_zone_order(zone, order, gfp_mask, mode, |
1130 | contended); | 1160 | contended); |
1131 | rc = max(status, rc); | 1161 | rc = max(status, rc); |
1132 | 1162 | ||
@@ -1165,9 +1195,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
1165 | if (zone_watermark_ok(zone, cc->order, | 1195 | if (zone_watermark_ok(zone, cc->order, |
1166 | low_wmark_pages(zone), 0, 0)) | 1196 | low_wmark_pages(zone), 0, 0)) |
1167 | compaction_defer_reset(zone, cc->order, false); | 1197 | compaction_defer_reset(zone, cc->order, false); |
1168 | /* Currently async compaction is never deferred. */ | ||
1169 | else if (cc->sync) | ||
1170 | defer_compaction(zone, cc->order); | ||
1171 | } | 1198 | } |
1172 | 1199 | ||
1173 | VM_BUG_ON(!list_empty(&cc->freepages)); | 1200 | VM_BUG_ON(!list_empty(&cc->freepages)); |
@@ -1179,7 +1206,7 @@ void compact_pgdat(pg_data_t *pgdat, int order) | |||
1179 | { | 1206 | { |
1180 | struct compact_control cc = { | 1207 | struct compact_control cc = { |
1181 | .order = order, | 1208 | .order = order, |
1182 | .sync = false, | 1209 | .mode = MIGRATE_ASYNC, |
1183 | }; | 1210 | }; |
1184 | 1211 | ||
1185 | if (!order) | 1212 | if (!order) |
@@ -1192,7 +1219,7 @@ static void compact_node(int nid) | |||
1192 | { | 1219 | { |
1193 | struct compact_control cc = { | 1220 | struct compact_control cc = { |
1194 | .order = -1, | 1221 | .order = -1, |
1195 | .sync = true, | 1222 | .mode = MIGRATE_SYNC, |
1196 | .ignore_skip_hint = true, | 1223 | .ignore_skip_hint = true, |
1197 | }; | 1224 | }; |
1198 | 1225 | ||
diff --git a/mm/dmapool.c b/mm/dmapool.c index c69781e97cf9..306baa594f95 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c | |||
@@ -170,24 +170,16 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, | |||
170 | retval->boundary = boundary; | 170 | retval->boundary = boundary; |
171 | retval->allocation = allocation; | 171 | retval->allocation = allocation; |
172 | 172 | ||
173 | if (dev) { | 173 | INIT_LIST_HEAD(&retval->pools); |
174 | int ret; | ||
175 | 174 | ||
176 | mutex_lock(&pools_lock); | 175 | mutex_lock(&pools_lock); |
177 | if (list_empty(&dev->dma_pools)) | 176 | if (list_empty(&dev->dma_pools) && |
178 | ret = device_create_file(dev, &dev_attr_pools); | 177 | device_create_file(dev, &dev_attr_pools)) { |
179 | else | 178 | kfree(retval); |
180 | ret = 0; | 179 | return NULL; |
181 | /* note: not currently insisting "name" be unique */ | ||
182 | if (!ret) | ||
183 | list_add(&retval->pools, &dev->dma_pools); | ||
184 | else { | ||
185 | kfree(retval); | ||
186 | retval = NULL; | ||
187 | } | ||
188 | mutex_unlock(&pools_lock); | ||
189 | } else | 180 | } else |
190 | INIT_LIST_HEAD(&retval->pools); | 181 | list_add(&retval->pools, &dev->dma_pools); |
182 | mutex_unlock(&pools_lock); | ||
191 | 183 | ||
192 | return retval; | 184 | return retval; |
193 | } | 185 | } |
@@ -341,10 +333,10 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, | |||
341 | continue; | 333 | continue; |
342 | if (pool->dev) | 334 | if (pool->dev) |
343 | dev_err(pool->dev, | 335 | dev_err(pool->dev, |
344 | "dma_pool_alloc %s, %p (corruped)\n", | 336 | "dma_pool_alloc %s, %p (corrupted)\n", |
345 | pool->name, retval); | 337 | pool->name, retval); |
346 | else | 338 | else |
347 | pr_err("dma_pool_alloc %s, %p (corruped)\n", | 339 | pr_err("dma_pool_alloc %s, %p (corrupted)\n", |
348 | pool->name, retval); | 340 | pool->name, retval); |
349 | 341 | ||
350 | /* | 342 | /* |
@@ -508,7 +500,6 @@ void dmam_pool_destroy(struct dma_pool *pool) | |||
508 | { | 500 | { |
509 | struct device *dev = pool->dev; | 501 | struct device *dev = pool->dev; |
510 | 502 | ||
511 | WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); | 503 | WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool)); |
512 | dma_pool_destroy(pool); | ||
513 | } | 504 | } |
514 | EXPORT_SYMBOL(dmam_pool_destroy); | 505 | EXPORT_SYMBOL(dmam_pool_destroy); |
diff --git a/mm/filemap.c b/mm/filemap.c index 000a220e2a41..7fadf1c62838 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -257,9 +257,11 @@ static int filemap_check_errors(struct address_space *mapping) | |||
257 | { | 257 | { |
258 | int ret = 0; | 258 | int ret = 0; |
259 | /* Check for outstanding write errors */ | 259 | /* Check for outstanding write errors */ |
260 | if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) | 260 | if (test_bit(AS_ENOSPC, &mapping->flags) && |
261 | test_and_clear_bit(AS_ENOSPC, &mapping->flags)) | ||
261 | ret = -ENOSPC; | 262 | ret = -ENOSPC; |
262 | if (test_and_clear_bit(AS_EIO, &mapping->flags)) | 263 | if (test_bit(AS_EIO, &mapping->flags) && |
264 | test_and_clear_bit(AS_EIO, &mapping->flags)) | ||
263 | ret = -EIO; | 265 | ret = -EIO; |
264 | return ret; | 266 | return ret; |
265 | } | 267 | } |
@@ -740,7 +742,7 @@ void unlock_page(struct page *page) | |||
740 | { | 742 | { |
741 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 743 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
742 | clear_bit_unlock(PG_locked, &page->flags); | 744 | clear_bit_unlock(PG_locked, &page->flags); |
743 | smp_mb__after_clear_bit(); | 745 | smp_mb__after_atomic(); |
744 | wake_up_page(page, PG_locked); | 746 | wake_up_page(page, PG_locked); |
745 | } | 747 | } |
746 | EXPORT_SYMBOL(unlock_page); | 748 | EXPORT_SYMBOL(unlock_page); |
@@ -751,17 +753,51 @@ EXPORT_SYMBOL(unlock_page); | |||
751 | */ | 753 | */ |
752 | void end_page_writeback(struct page *page) | 754 | void end_page_writeback(struct page *page) |
753 | { | 755 | { |
754 | if (TestClearPageReclaim(page)) | 756 | /* |
757 | * TestClearPageReclaim could be used here but it is an atomic | ||
758 | * operation and overkill in this particular case. Failing to | ||
759 | * shuffle a page marked for immediate reclaim is too mild to | ||
760 | * justify taking an atomic operation penalty at the end of | ||
761 | * ever page writeback. | ||
762 | */ | ||
763 | if (PageReclaim(page)) { | ||
764 | ClearPageReclaim(page); | ||
755 | rotate_reclaimable_page(page); | 765 | rotate_reclaimable_page(page); |
766 | } | ||
756 | 767 | ||
757 | if (!test_clear_page_writeback(page)) | 768 | if (!test_clear_page_writeback(page)) |
758 | BUG(); | 769 | BUG(); |
759 | 770 | ||
760 | smp_mb__after_clear_bit(); | 771 | smp_mb__after_atomic(); |
761 | wake_up_page(page, PG_writeback); | 772 | wake_up_page(page, PG_writeback); |
762 | } | 773 | } |
763 | EXPORT_SYMBOL(end_page_writeback); | 774 | EXPORT_SYMBOL(end_page_writeback); |
764 | 775 | ||
776 | /* | ||
777 | * After completing I/O on a page, call this routine to update the page | ||
778 | * flags appropriately | ||
779 | */ | ||
780 | void page_endio(struct page *page, int rw, int err) | ||
781 | { | ||
782 | if (rw == READ) { | ||
783 | if (!err) { | ||
784 | SetPageUptodate(page); | ||
785 | } else { | ||
786 | ClearPageUptodate(page); | ||
787 | SetPageError(page); | ||
788 | } | ||
789 | unlock_page(page); | ||
790 | } else { /* rw == WRITE */ | ||
791 | if (err) { | ||
792 | SetPageError(page); | ||
793 | if (page->mapping) | ||
794 | mapping_set_error(page->mapping, err); | ||
795 | } | ||
796 | end_page_writeback(page); | ||
797 | } | ||
798 | } | ||
799 | EXPORT_SYMBOL_GPL(page_endio); | ||
800 | |||
765 | /** | 801 | /** |
766 | * __lock_page - get a lock on the page, assuming we need to sleep to get it | 802 | * __lock_page - get a lock on the page, assuming we need to sleep to get it |
767 | * @page: the page to lock | 803 | * @page: the page to lock |
@@ -955,26 +991,6 @@ out: | |||
955 | EXPORT_SYMBOL(find_get_entry); | 991 | EXPORT_SYMBOL(find_get_entry); |
956 | 992 | ||
957 | /** | 993 | /** |
958 | * find_get_page - find and get a page reference | ||
959 | * @mapping: the address_space to search | ||
960 | * @offset: the page index | ||
961 | * | ||
962 | * Looks up the page cache slot at @mapping & @offset. If there is a | ||
963 | * page cache page, it is returned with an increased refcount. | ||
964 | * | ||
965 | * Otherwise, %NULL is returned. | ||
966 | */ | ||
967 | struct page *find_get_page(struct address_space *mapping, pgoff_t offset) | ||
968 | { | ||
969 | struct page *page = find_get_entry(mapping, offset); | ||
970 | |||
971 | if (radix_tree_exceptional_entry(page)) | ||
972 | page = NULL; | ||
973 | return page; | ||
974 | } | ||
975 | EXPORT_SYMBOL(find_get_page); | ||
976 | |||
977 | /** | ||
978 | * find_lock_entry - locate, pin and lock a page cache entry | 994 | * find_lock_entry - locate, pin and lock a page cache entry |
979 | * @mapping: the address_space to search | 995 | * @mapping: the address_space to search |
980 | * @offset: the page cache index | 996 | * @offset: the page cache index |
@@ -1011,66 +1027,84 @@ repeat: | |||
1011 | EXPORT_SYMBOL(find_lock_entry); | 1027 | EXPORT_SYMBOL(find_lock_entry); |
1012 | 1028 | ||
1013 | /** | 1029 | /** |
1014 | * find_lock_page - locate, pin and lock a pagecache page | 1030 | * pagecache_get_page - find and get a page reference |
1015 | * @mapping: the address_space to search | 1031 | * @mapping: the address_space to search |
1016 | * @offset: the page index | 1032 | * @offset: the page index |
1033 | * @fgp_flags: PCG flags | ||
1034 | * @gfp_mask: gfp mask to use if a page is to be allocated | ||
1017 | * | 1035 | * |
1018 | * Looks up the page cache slot at @mapping & @offset. If there is a | 1036 | * Looks up the page cache slot at @mapping & @offset. |
1019 | * page cache page, it is returned locked and with an increased | ||
1020 | * refcount. | ||
1021 | * | ||
1022 | * Otherwise, %NULL is returned. | ||
1023 | * | ||
1024 | * find_lock_page() may sleep. | ||
1025 | */ | ||
1026 | struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) | ||
1027 | { | ||
1028 | struct page *page = find_lock_entry(mapping, offset); | ||
1029 | |||
1030 | if (radix_tree_exceptional_entry(page)) | ||
1031 | page = NULL; | ||
1032 | return page; | ||
1033 | } | ||
1034 | EXPORT_SYMBOL(find_lock_page); | ||
1035 | |||
1036 | /** | ||
1037 | * find_or_create_page - locate or add a pagecache page | ||
1038 | * @mapping: the page's address_space | ||
1039 | * @index: the page's index into the mapping | ||
1040 | * @gfp_mask: page allocation mode | ||
1041 | * | 1037 | * |
1042 | * Looks up the page cache slot at @mapping & @offset. If there is a | 1038 | * PCG flags modify how the page is returned |
1043 | * page cache page, it is returned locked and with an increased | ||
1044 | * refcount. | ||
1045 | * | 1039 | * |
1046 | * If the page is not present, a new page is allocated using @gfp_mask | 1040 | * FGP_ACCESSED: the page will be marked accessed |
1047 | * and added to the page cache and the VM's LRU list. The page is | 1041 | * FGP_LOCK: Page is return locked |
1048 | * returned locked and with an increased refcount. | 1042 | * FGP_CREAT: If page is not present then a new page is allocated using |
1043 | * @gfp_mask and added to the page cache and the VM's LRU | ||
1044 | * list. The page is returned locked and with an increased | ||
1045 | * refcount. Otherwise, %NULL is returned. | ||
1049 | * | 1046 | * |
1050 | * On memory exhaustion, %NULL is returned. | 1047 | * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even |
1048 | * if the GFP flags specified for FGP_CREAT are atomic. | ||
1051 | * | 1049 | * |
1052 | * find_or_create_page() may sleep, even if @gfp_flags specifies an | 1050 | * If there is a page cache page, it is returned with an increased refcount. |
1053 | * atomic allocation! | ||
1054 | */ | 1051 | */ |
1055 | struct page *find_or_create_page(struct address_space *mapping, | 1052 | struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, |
1056 | pgoff_t index, gfp_t gfp_mask) | 1053 | int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask) |
1057 | { | 1054 | { |
1058 | struct page *page; | 1055 | struct page *page; |
1059 | int err; | 1056 | |
1060 | repeat: | 1057 | repeat: |
1061 | page = find_lock_page(mapping, index); | 1058 | page = find_get_entry(mapping, offset); |
1062 | if (!page) { | 1059 | if (radix_tree_exceptional_entry(page)) |
1063 | page = __page_cache_alloc(gfp_mask); | 1060 | page = NULL; |
1061 | if (!page) | ||
1062 | goto no_page; | ||
1063 | |||
1064 | if (fgp_flags & FGP_LOCK) { | ||
1065 | if (fgp_flags & FGP_NOWAIT) { | ||
1066 | if (!trylock_page(page)) { | ||
1067 | page_cache_release(page); | ||
1068 | return NULL; | ||
1069 | } | ||
1070 | } else { | ||
1071 | lock_page(page); | ||
1072 | } | ||
1073 | |||
1074 | /* Has the page been truncated? */ | ||
1075 | if (unlikely(page->mapping != mapping)) { | ||
1076 | unlock_page(page); | ||
1077 | page_cache_release(page); | ||
1078 | goto repeat; | ||
1079 | } | ||
1080 | VM_BUG_ON_PAGE(page->index != offset, page); | ||
1081 | } | ||
1082 | |||
1083 | if (page && (fgp_flags & FGP_ACCESSED)) | ||
1084 | mark_page_accessed(page); | ||
1085 | |||
1086 | no_page: | ||
1087 | if (!page && (fgp_flags & FGP_CREAT)) { | ||
1088 | int err; | ||
1089 | if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) | ||
1090 | cache_gfp_mask |= __GFP_WRITE; | ||
1091 | if (fgp_flags & FGP_NOFS) { | ||
1092 | cache_gfp_mask &= ~__GFP_FS; | ||
1093 | radix_gfp_mask &= ~__GFP_FS; | ||
1094 | } | ||
1095 | |||
1096 | page = __page_cache_alloc(cache_gfp_mask); | ||
1064 | if (!page) | 1097 | if (!page) |
1065 | return NULL; | 1098 | return NULL; |
1066 | /* | 1099 | |
1067 | * We want a regular kernel memory (not highmem or DMA etc) | 1100 | if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) |
1068 | * allocation for the radix tree nodes, but we need to honour | 1101 | fgp_flags |= FGP_LOCK; |
1069 | * the context-specific requirements the caller has asked for. | 1102 | |
1070 | * GFP_RECLAIM_MASK collects those requirements. | 1103 | /* Init accessed so avoid atomic mark_page_accessed later */ |
1071 | */ | 1104 | if (fgp_flags & FGP_ACCESSED) |
1072 | err = add_to_page_cache_lru(page, mapping, index, | 1105 | init_page_accessed(page); |
1073 | (gfp_mask & GFP_RECLAIM_MASK)); | 1106 | |
1107 | err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); | ||
1074 | if (unlikely(err)) { | 1108 | if (unlikely(err)) { |
1075 | page_cache_release(page); | 1109 | page_cache_release(page); |
1076 | page = NULL; | 1110 | page = NULL; |
@@ -1078,9 +1112,10 @@ repeat: | |||
1078 | goto repeat; | 1112 | goto repeat; |
1079 | } | 1113 | } |
1080 | } | 1114 | } |
1115 | |||
1081 | return page; | 1116 | return page; |
1082 | } | 1117 | } |
1083 | EXPORT_SYMBOL(find_or_create_page); | 1118 | EXPORT_SYMBOL(pagecache_get_page); |
1084 | 1119 | ||
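Usage sketch (not part of this patch; the wrapper name is made up): the FGP_* flags above fold the older helpers into one entry point, so the old find_or_create_page() behaviour can be expressed as a thin wrapper. GFP_RECLAIM_MASK comes from mm/internal.h, so this sketch assumes it lives inside mm/, as filemap.c does.

	#include <linux/pagemap.h>
	#include "internal.h"		/* for GFP_RECLAIM_MASK, as in mm/filemap.c */

	/* Illustrative only: lock the page, mark it accessed, create it if absent. */
	static inline struct page *
	demo_find_or_create_page(struct address_space *mapping, pgoff_t index,
				 gfp_t gfp_mask)
	{
		return pagecache_get_page(mapping, index,
					  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					  gfp_mask,
					  gfp_mask & GFP_RECLAIM_MASK);
	}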
1085 | /** | 1120 | /** |
1086 | * find_get_entries - gang pagecache lookup | 1121 | * find_get_entries - gang pagecache lookup |
@@ -1377,39 +1412,6 @@ repeat: | |||
1377 | } | 1412 | } |
1378 | EXPORT_SYMBOL(find_get_pages_tag); | 1413 | EXPORT_SYMBOL(find_get_pages_tag); |
1379 | 1414 | ||
1380 | /** | ||
1381 | * grab_cache_page_nowait - returns locked page at given index in given cache | ||
1382 | * @mapping: target address_space | ||
1383 | * @index: the page index | ||
1384 | * | ||
1385 | * Same as grab_cache_page(), but do not wait if the page is unavailable. | ||
1386 | * This is intended for speculative data generators, where the data can | ||
1387 | * be regenerated if the page couldn't be grabbed. This routine should | ||
1388 | * be safe to call while holding the lock for another page. | ||
1389 | * | ||
1390 | * Clear __GFP_FS when allocating the page to avoid recursion into the fs | ||
1391 | * and deadlock against the caller's locked page. | ||
1392 | */ | ||
1393 | struct page * | ||
1394 | grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) | ||
1395 | { | ||
1396 | struct page *page = find_get_page(mapping, index); | ||
1397 | |||
1398 | if (page) { | ||
1399 | if (trylock_page(page)) | ||
1400 | return page; | ||
1401 | page_cache_release(page); | ||
1402 | return NULL; | ||
1403 | } | ||
1404 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); | ||
1405 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { | ||
1406 | page_cache_release(page); | ||
1407 | page = NULL; | ||
1408 | } | ||
1409 | return page; | ||
1410 | } | ||
1411 | EXPORT_SYMBOL(grab_cache_page_nowait); | ||
1412 | |||
1413 | /* | 1415 | /* |
1414 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail | 1416 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail |
1415 | * a _large_ part of the i/o request. Imagine the worst scenario: | 1417 | * a _large_ part of the i/o request. Imagine the worst scenario: |
@@ -2379,7 +2381,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, | |||
2379 | { | 2381 | { |
2380 | const struct address_space_operations *aops = mapping->a_ops; | 2382 | const struct address_space_operations *aops = mapping->a_ops; |
2381 | 2383 | ||
2382 | mark_page_accessed(page); | ||
2383 | return aops->write_end(file, mapping, pos, len, copied, page, fsdata); | 2384 | return aops->write_end(file, mapping, pos, len, copied, page, fsdata); |
2384 | } | 2385 | } |
2385 | EXPORT_SYMBOL(pagecache_write_end); | 2386 | EXPORT_SYMBOL(pagecache_write_end); |
@@ -2461,34 +2462,18 @@ EXPORT_SYMBOL(generic_file_direct_write); | |||
2461 | struct page *grab_cache_page_write_begin(struct address_space *mapping, | 2462 | struct page *grab_cache_page_write_begin(struct address_space *mapping, |
2462 | pgoff_t index, unsigned flags) | 2463 | pgoff_t index, unsigned flags) |
2463 | { | 2464 | { |
2464 | int status; | ||
2465 | gfp_t gfp_mask; | ||
2466 | struct page *page; | 2465 | struct page *page; |
2467 | gfp_t gfp_notmask = 0; | 2466 | int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; |
2468 | 2467 | ||
2469 | gfp_mask = mapping_gfp_mask(mapping); | ||
2470 | if (mapping_cap_account_dirty(mapping)) | ||
2471 | gfp_mask |= __GFP_WRITE; | ||
2472 | if (flags & AOP_FLAG_NOFS) | 2468 | if (flags & AOP_FLAG_NOFS) |
2473 | gfp_notmask = __GFP_FS; | 2469 | fgp_flags |= FGP_NOFS; |
2474 | repeat: | 2470 | |
2475 | page = find_lock_page(mapping, index); | 2471 | page = pagecache_get_page(mapping, index, fgp_flags, |
2472 | mapping_gfp_mask(mapping), | ||
2473 | GFP_KERNEL); | ||
2476 | if (page) | 2474 | if (page) |
2477 | goto found; | 2475 | wait_for_stable_page(page); |
2478 | 2476 | ||
2479 | page = __page_cache_alloc(gfp_mask & ~gfp_notmask); | ||
2480 | if (!page) | ||
2481 | return NULL; | ||
2482 | status = add_to_page_cache_lru(page, mapping, index, | ||
2483 | GFP_KERNEL & ~gfp_notmask); | ||
2484 | if (unlikely(status)) { | ||
2485 | page_cache_release(page); | ||
2486 | if (status == -EEXIST) | ||
2487 | goto repeat; | ||
2488 | return NULL; | ||
2489 | } | ||
2490 | found: | ||
2491 | wait_for_stable_page(page); | ||
2492 | return page; | 2477 | return page; |
2493 | } | 2478 | } |
2494 | EXPORT_SYMBOL(grab_cache_page_write_begin); | 2479 | EXPORT_SYMBOL(grab_cache_page_write_begin); |
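For comparison, a minimal ->write_begin() caller (hypothetical sketch, not from this patch) now only forwards its AOP flags; the NOFS masking and dirty-write accounting happen inside pagecache_get_page().

	static int demo_write_begin(struct file *file, struct address_space *mapping,
				    loff_t pos, unsigned len, unsigned flags,
				    struct page **pagep, void **fsdata)
	{
		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
		struct page *page;

		/* Locked, accessed, write-accounted page; NULL on allocation failure. */
		page = grab_cache_page_write_begin(mapping, index, flags);
		if (!page)
			return -ENOMEM;
		*pagep = page;
		return 0;
	}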
@@ -2537,7 +2522,7 @@ again: | |||
2537 | 2522 | ||
2538 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, | 2523 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, |
2539 | &page, &fsdata); | 2524 | &page, &fsdata); |
2540 | if (unlikely(status)) | 2525 | if (unlikely(status < 0)) |
2541 | break; | 2526 | break; |
2542 | 2527 | ||
2543 | if (mapping_writably_mapped(mapping)) | 2528 | if (mapping_writably_mapped(mapping)) |
@@ -2546,7 +2531,6 @@ again: | |||
2546 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | 2531 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); |
2547 | flush_dcache_page(page); | 2532 | flush_dcache_page(page); |
2548 | 2533 | ||
2549 | mark_page_accessed(page); | ||
2550 | status = a_ops->write_end(file, mapping, pos, bytes, copied, | 2534 | status = a_ops->write_end(file, mapping, pos, bytes, copied, |
2551 | page, fsdata); | 2535 | page, fsdata); |
2552 | if (unlikely(status < 0)) | 2536 | if (unlikely(status < 0)) |
diff --git a/mm/fremap.c b/mm/fremap.c index 34feba60a17e..2c5646f11f41 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -82,13 +82,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
82 | 82 | ||
83 | ptfile = pgoff_to_pte(pgoff); | 83 | ptfile = pgoff_to_pte(pgoff); |
84 | 84 | ||
85 | if (!pte_none(*pte)) { | 85 | if (!pte_none(*pte)) |
86 | if (pte_present(*pte) && pte_soft_dirty(*pte)) | ||
87 | pte_file_mksoft_dirty(ptfile); | ||
88 | zap_pte(mm, vma, addr, pte); | 86 | zap_pte(mm, vma, addr, pte); |
89 | } | ||
90 | 87 | ||
91 | set_pte_at(mm, addr, pte, ptfile); | 88 | set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); |
92 | /* | 89 | /* |
93 | * We don't need to run update_mmu_cache() here because the "file pte" | 90 | * We don't need to run update_mmu_cache() here because the "file pte" |
94 | * being installed by install_file_pte() is not a real pte - it's a | 91 | * being installed by install_file_pte() is not a real pte - it's a |
diff --git a/mm/frontswap.c b/mm/frontswap.c index 1b24bdcb3197..c30eec536f03 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c | |||
@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); | |||
327 | 327 | ||
328 | static unsigned long __frontswap_curr_pages(void) | 328 | static unsigned long __frontswap_curr_pages(void) |
329 | { | 329 | { |
330 | int type; | ||
331 | unsigned long totalpages = 0; | 330 | unsigned long totalpages = 0; |
332 | struct swap_info_struct *si = NULL; | 331 | struct swap_info_struct *si = NULL; |
333 | 332 | ||
334 | assert_spin_locked(&swap_lock); | 333 | assert_spin_locked(&swap_lock); |
335 | for (type = swap_list.head; type >= 0; type = si->next) { | 334 | plist_for_each_entry(si, &swap_active_head, list) |
336 | si = swap_info[type]; | ||
337 | totalpages += atomic_read(&si->frontswap_pages); | 335 | totalpages += atomic_read(&si->frontswap_pages); |
338 | } | ||
339 | return totalpages; | 336 | return totalpages; |
340 | } | 337 | } |
341 | 338 | ||
@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | |||
347 | int si_frontswap_pages; | 344 | int si_frontswap_pages; |
348 | unsigned long total_pages_to_unuse = total; | 345 | unsigned long total_pages_to_unuse = total; |
349 | unsigned long pages = 0, pages_to_unuse = 0; | 346 | unsigned long pages = 0, pages_to_unuse = 0; |
350 | int type; | ||
351 | 347 | ||
352 | assert_spin_locked(&swap_lock); | 348 | assert_spin_locked(&swap_lock); |
353 | for (type = swap_list.head; type >= 0; type = si->next) { | 349 | plist_for_each_entry(si, &swap_active_head, list) { |
354 | si = swap_info[type]; | ||
355 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | 350 | si_frontswap_pages = atomic_read(&si->frontswap_pages); |
356 | if (total_pages_to_unuse < si_frontswap_pages) { | 351 | if (total_pages_to_unuse < si_frontswap_pages) { |
357 | pages = pages_to_unuse = total_pages_to_unuse; | 352 | pages = pages_to_unuse = total_pages_to_unuse; |
@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | |||
366 | } | 361 | } |
367 | vm_unacct_memory(pages); | 362 | vm_unacct_memory(pages); |
368 | *unused = pages_to_unuse; | 363 | *unused = pages_to_unuse; |
369 | *swapid = type; | 364 | *swapid = si->type; |
370 | ret = 0; | 365 | ret = 0; |
371 | break; | 366 | break; |
372 | } | 367 | } |
@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) | |||
413 | /* | 408 | /* |
414 | * we don't want to hold swap_lock while doing a very | 409 | * we don't want to hold swap_lock while doing a very |
415 | * lengthy try_to_unuse, but swap_list may change | 410 | * lengthy try_to_unuse, but swap_list may change |
416 | * so restart scan from swap_list.head each time | 411 | * so restart scan from swap_active_head each time |
417 | */ | 412 | */ |
418 | spin_lock(&swap_lock); | 413 | spin_lock(&swap_lock); |
419 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); | 414 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); |
diff --git a/mm/gup.c b/mm/gup.c new file mode 100644 index 000000000000..cc5a9e7adea7 --- /dev/null +++ b/mm/gup.c | |||
@@ -0,0 +1,662 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/err.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | |||
6 | #include <linux/hugetlb.h> | ||
7 | #include <linux/mm.h> | ||
8 | #include <linux/pagemap.h> | ||
9 | #include <linux/rmap.h> | ||
10 | #include <linux/swap.h> | ||
11 | #include <linux/swapops.h> | ||
12 | |||
13 | #include "internal.h" | ||
14 | |||
15 | static struct page *no_page_table(struct vm_area_struct *vma, | ||
16 | unsigned int flags) | ||
17 | { | ||
18 | /* | ||
19 | * When core dumping an enormous anonymous area that nobody | ||
20 | * has touched so far, we don't want to allocate unnecessary pages or | ||
21 | * page tables. Return error instead of NULL to skip handle_mm_fault, | ||
22 | * then get_dump_page() will return NULL to leave a hole in the dump. | ||
23 | * But we can only make this optimization where a hole would surely | ||
24 | * be zero-filled if handle_mm_fault() actually did handle it. | ||
25 | */ | ||
26 | if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault)) | ||
27 | return ERR_PTR(-EFAULT); | ||
28 | return NULL; | ||
29 | } | ||
30 | |||
31 | static struct page *follow_page_pte(struct vm_area_struct *vma, | ||
32 | unsigned long address, pmd_t *pmd, unsigned int flags) | ||
33 | { | ||
34 | struct mm_struct *mm = vma->vm_mm; | ||
35 | struct page *page; | ||
36 | spinlock_t *ptl; | ||
37 | pte_t *ptep, pte; | ||
38 | |||
39 | retry: | ||
40 | if (unlikely(pmd_bad(*pmd))) | ||
41 | return no_page_table(vma, flags); | ||
42 | |||
43 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
44 | pte = *ptep; | ||
45 | if (!pte_present(pte)) { | ||
46 | swp_entry_t entry; | ||
47 | /* | ||
48 | * KSM's break_ksm() relies upon recognizing a ksm page | ||
49 | * even while it is being migrated, so for that case we | ||
50 | * need migration_entry_wait(). | ||
51 | */ | ||
52 | if (likely(!(flags & FOLL_MIGRATION))) | ||
53 | goto no_page; | ||
54 | if (pte_none(pte) || pte_file(pte)) | ||
55 | goto no_page; | ||
56 | entry = pte_to_swp_entry(pte); | ||
57 | if (!is_migration_entry(entry)) | ||
58 | goto no_page; | ||
59 | pte_unmap_unlock(ptep, ptl); | ||
60 | migration_entry_wait(mm, pmd, address); | ||
61 | goto retry; | ||
62 | } | ||
63 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | ||
64 | goto no_page; | ||
65 | if ((flags & FOLL_WRITE) && !pte_write(pte)) { | ||
66 | pte_unmap_unlock(ptep, ptl); | ||
67 | return NULL; | ||
68 | } | ||
69 | |||
70 | page = vm_normal_page(vma, address, pte); | ||
71 | if (unlikely(!page)) { | ||
72 | if ((flags & FOLL_DUMP) || | ||
73 | !is_zero_pfn(pte_pfn(pte))) | ||
74 | goto bad_page; | ||
75 | page = pte_page(pte); | ||
76 | } | ||
77 | |||
78 | if (flags & FOLL_GET) | ||
79 | get_page_foll(page); | ||
80 | if (flags & FOLL_TOUCH) { | ||
81 | if ((flags & FOLL_WRITE) && | ||
82 | !pte_dirty(pte) && !PageDirty(page)) | ||
83 | set_page_dirty(page); | ||
84 | /* | ||
85 | * pte_mkyoung() would be more correct here, but atomic care | ||
86 | * is needed to avoid losing the dirty bit: it is easier to use | ||
87 | * mark_page_accessed(). | ||
88 | */ | ||
89 | mark_page_accessed(page); | ||
90 | } | ||
91 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
92 | /* | ||
93 | * The preliminary mapping check is mainly to avoid the | ||
94 | * pointless overhead of lock_page on the ZERO_PAGE | ||
95 | * which might bounce very badly if there is contention. | ||
96 | * | ||
97 | * If the page is already locked, we don't need to | ||
98 | * handle it now - vmscan will handle it later if and | ||
99 | * when it attempts to reclaim the page. | ||
100 | */ | ||
101 | if (page->mapping && trylock_page(page)) { | ||
102 | lru_add_drain(); /* push cached pages to LRU */ | ||
103 | /* | ||
104 | * Because we lock page here, and migration is | ||
105 | * blocked by the pte's page reference, and we | ||
106 | * know the page is still mapped, we don't even | ||
107 | * need to check for file-cache page truncation. | ||
108 | */ | ||
109 | mlock_vma_page(page); | ||
110 | unlock_page(page); | ||
111 | } | ||
112 | } | ||
113 | pte_unmap_unlock(ptep, ptl); | ||
114 | return page; | ||
115 | bad_page: | ||
116 | pte_unmap_unlock(ptep, ptl); | ||
117 | return ERR_PTR(-EFAULT); | ||
118 | |||
119 | no_page: | ||
120 | pte_unmap_unlock(ptep, ptl); | ||
121 | if (!pte_none(pte)) | ||
122 | return NULL; | ||
123 | return no_page_table(vma, flags); | ||
124 | } | ||
125 | |||
126 | /** | ||
127 | * follow_page_mask - look up a page descriptor from a user-virtual address | ||
128 | * @vma: vm_area_struct mapping @address | ||
129 | * @address: virtual address to look up | ||
130 | * @flags: flags modifying lookup behaviour | ||
131 | * @page_mask: on output, *page_mask is set according to the size of the page | ||
132 | * | ||
133 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> | ||
134 | * | ||
135 | * Returns the mapped (struct page *), %NULL if no mapping exists, or | ||
136 | * an error pointer if there is a mapping to something not represented | ||
137 | * by a page descriptor (see also vm_normal_page()). | ||
138 | */ | ||
139 | struct page *follow_page_mask(struct vm_area_struct *vma, | ||
140 | unsigned long address, unsigned int flags, | ||
141 | unsigned int *page_mask) | ||
142 | { | ||
143 | pgd_t *pgd; | ||
144 | pud_t *pud; | ||
145 | pmd_t *pmd; | ||
146 | spinlock_t *ptl; | ||
147 | struct page *page; | ||
148 | struct mm_struct *mm = vma->vm_mm; | ||
149 | |||
150 | *page_mask = 0; | ||
151 | |||
152 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); | ||
153 | if (!IS_ERR(page)) { | ||
154 | BUG_ON(flags & FOLL_GET); | ||
155 | return page; | ||
156 | } | ||
157 | |||
158 | pgd = pgd_offset(mm, address); | ||
159 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | ||
160 | return no_page_table(vma, flags); | ||
161 | |||
162 | pud = pud_offset(pgd, address); | ||
163 | if (pud_none(*pud)) | ||
164 | return no_page_table(vma, flags); | ||
165 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { | ||
166 | if (flags & FOLL_GET) | ||
167 | return NULL; | ||
168 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | ||
169 | return page; | ||
170 | } | ||
171 | if (unlikely(pud_bad(*pud))) | ||
172 | return no_page_table(vma, flags); | ||
173 | |||
174 | pmd = pmd_offset(pud, address); | ||
175 | if (pmd_none(*pmd)) | ||
176 | return no_page_table(vma, flags); | ||
177 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { | ||
178 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | ||
179 | if (flags & FOLL_GET) { | ||
180 | /* | ||
181 | * Refcounts on tail pages are not well-defined and | ||
182 | * shouldn't be taken. The caller should handle a NULL | ||
183 | * return when trying to follow tail pages. | ||
184 | */ | ||
185 | if (PageHead(page)) | ||
186 | get_page(page); | ||
187 | else | ||
188 | page = NULL; | ||
189 | } | ||
190 | return page; | ||
191 | } | ||
192 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | ||
193 | return no_page_table(vma, flags); | ||
194 | if (pmd_trans_huge(*pmd)) { | ||
195 | if (flags & FOLL_SPLIT) { | ||
196 | split_huge_page_pmd(vma, address, pmd); | ||
197 | return follow_page_pte(vma, address, pmd, flags); | ||
198 | } | ||
199 | ptl = pmd_lock(mm, pmd); | ||
200 | if (likely(pmd_trans_huge(*pmd))) { | ||
201 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
202 | spin_unlock(ptl); | ||
203 | wait_split_huge_page(vma->anon_vma, pmd); | ||
204 | } else { | ||
205 | page = follow_trans_huge_pmd(vma, address, | ||
206 | pmd, flags); | ||
207 | spin_unlock(ptl); | ||
208 | *page_mask = HPAGE_PMD_NR - 1; | ||
209 | return page; | ||
210 | } | ||
211 | } else | ||
212 | spin_unlock(ptl); | ||
213 | } | ||
214 | return follow_page_pte(vma, address, pmd, flags); | ||
215 | } | ||
216 | |||
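A caller of follow_page_mask() (illustrative sketch only, with mmap_sem assumed held for read) has to distinguish the three documented outcomes and can use *page_mask to step over huge pages:

	static struct page *demo_follow(struct vm_area_struct *vma, unsigned long addr)
	{
		unsigned int page_mask;
		struct page *page;

		page = follow_page_mask(vma, addr, FOLL_GET, &page_mask);
		if (IS_ERR(page))	/* mapped, but not backed by a struct page */
			return NULL;
		if (!page)		/* nothing mapped: fall back to handle_mm_fault() */
			return NULL;
		/*
		 * For a THP, page_mask is HPAGE_PMD_NR - 1 and the caller may
		 * advance by (page_mask + 1) pages; drop the FOLL_GET reference
		 * with put_page() when done.
		 */
		return page;
	}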
217 | static int get_gate_page(struct mm_struct *mm, unsigned long address, | ||
218 | unsigned int gup_flags, struct vm_area_struct **vma, | ||
219 | struct page **page) | ||
220 | { | ||
221 | pgd_t *pgd; | ||
222 | pud_t *pud; | ||
223 | pmd_t *pmd; | ||
224 | pte_t *pte; | ||
225 | int ret = -EFAULT; | ||
226 | |||
227 | /* user gate pages are read-only */ | ||
228 | if (gup_flags & FOLL_WRITE) | ||
229 | return -EFAULT; | ||
230 | if (address > TASK_SIZE) | ||
231 | pgd = pgd_offset_k(address); | ||
232 | else | ||
233 | pgd = pgd_offset_gate(mm, address); | ||
234 | BUG_ON(pgd_none(*pgd)); | ||
235 | pud = pud_offset(pgd, address); | ||
236 | BUG_ON(pud_none(*pud)); | ||
237 | pmd = pmd_offset(pud, address); | ||
238 | if (pmd_none(*pmd)) | ||
239 | return -EFAULT; | ||
240 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
241 | pte = pte_offset_map(pmd, address); | ||
242 | if (pte_none(*pte)) | ||
243 | goto unmap; | ||
244 | *vma = get_gate_vma(mm); | ||
245 | if (!page) | ||
246 | goto out; | ||
247 | *page = vm_normal_page(*vma, address, *pte); | ||
248 | if (!*page) { | ||
249 | if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) | ||
250 | goto unmap; | ||
251 | *page = pte_page(*pte); | ||
252 | } | ||
253 | get_page(*page); | ||
254 | out: | ||
255 | ret = 0; | ||
256 | unmap: | ||
257 | pte_unmap(pte); | ||
258 | return ret; | ||
259 | } | ||
260 | |||
261 | static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, | ||
262 | unsigned long address, unsigned int *flags, int *nonblocking) | ||
263 | { | ||
264 | struct mm_struct *mm = vma->vm_mm; | ||
265 | unsigned int fault_flags = 0; | ||
266 | int ret; | ||
267 | |||
268 | /* For mlock, just skip the stack guard page. */ | ||
269 | if ((*flags & FOLL_MLOCK) && | ||
270 | (stack_guard_page_start(vma, address) || | ||
271 | stack_guard_page_end(vma, address + PAGE_SIZE))) | ||
272 | return -ENOENT; | ||
273 | if (*flags & FOLL_WRITE) | ||
274 | fault_flags |= FAULT_FLAG_WRITE; | ||
275 | if (nonblocking) | ||
276 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | ||
277 | if (*flags & FOLL_NOWAIT) | ||
278 | fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; | ||
279 | |||
280 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
281 | if (ret & VM_FAULT_ERROR) { | ||
282 | if (ret & VM_FAULT_OOM) | ||
283 | return -ENOMEM; | ||
284 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
285 | return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; | ||
286 | if (ret & VM_FAULT_SIGBUS) | ||
287 | return -EFAULT; | ||
288 | BUG(); | ||
289 | } | ||
290 | |||
291 | if (tsk) { | ||
292 | if (ret & VM_FAULT_MAJOR) | ||
293 | tsk->maj_flt++; | ||
294 | else | ||
295 | tsk->min_flt++; | ||
296 | } | ||
297 | |||
298 | if (ret & VM_FAULT_RETRY) { | ||
299 | if (nonblocking) | ||
300 | *nonblocking = 0; | ||
301 | return -EBUSY; | ||
302 | } | ||
303 | |||
304 | /* | ||
305 | * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when | ||
306 | * necessary, even if maybe_mkwrite decided not to set pte_write. We | ||
307 | * can thus safely do subsequent page lookups as if they were reads. | ||
308 | * But only do so when looping for pte_write is futile: in some cases | ||
309 | * userspace may also be wanting to write to the gotten user page, | ||
310 | * which a read fault here might prevent (a readonly page might get | ||
311 | * reCOWed by userspace write). | ||
312 | */ | ||
313 | if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) | ||
314 | *flags &= ~FOLL_WRITE; | ||
315 | return 0; | ||
316 | } | ||
317 | |||
318 | static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) | ||
319 | { | ||
320 | vm_flags_t vm_flags = vma->vm_flags; | ||
321 | |||
322 | if (vm_flags & (VM_IO | VM_PFNMAP)) | ||
323 | return -EFAULT; | ||
324 | |||
325 | if (gup_flags & FOLL_WRITE) { | ||
326 | if (!(vm_flags & VM_WRITE)) { | ||
327 | if (!(gup_flags & FOLL_FORCE)) | ||
328 | return -EFAULT; | ||
329 | /* | ||
330 | * We used to let the write,force case do COW in a | ||
331 | * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could | ||
332 | * set a breakpoint in a read-only mapping of an | ||
333 | * executable, without corrupting the file (yet only | ||
334 | * when that file had been opened for writing!). | ||
335 | * Anon pages in shared mappings are surprising: now | ||
336 | * just reject it. | ||
337 | */ | ||
338 | if (!is_cow_mapping(vm_flags)) { | ||
339 | WARN_ON_ONCE(vm_flags & VM_MAYWRITE); | ||
340 | return -EFAULT; | ||
341 | } | ||
342 | } | ||
343 | } else if (!(vm_flags & VM_READ)) { | ||
344 | if (!(gup_flags & FOLL_FORCE)) | ||
345 | return -EFAULT; | ||
346 | /* | ||
347 | * Is there actually any vma we can reach here which does not | ||
348 | * have VM_MAYREAD set? | ||
349 | */ | ||
350 | if (!(vm_flags & VM_MAYREAD)) | ||
351 | return -EFAULT; | ||
352 | } | ||
353 | return 0; | ||
354 | } | ||
355 | |||
356 | /** | ||
357 | * __get_user_pages() - pin user pages in memory | ||
358 | * @tsk: task_struct of target task | ||
359 | * @mm: mm_struct of target mm | ||
360 | * @start: starting user address | ||
361 | * @nr_pages: number of pages from start to pin | ||
362 | * @gup_flags: flags modifying pin behaviour | ||
363 | * @pages: array that receives pointers to the pages pinned. | ||
364 | * Should be at least nr_pages long. Or NULL, if caller | ||
365 | * only intends to ensure the pages are faulted in. | ||
366 | * @vmas: array of pointers to vmas corresponding to each page. | ||
367 | * Or NULL if the caller does not require them. | ||
368 | * @nonblocking: whether waiting for disk IO or mmap_sem contention | ||
369 | * | ||
370 | * Returns number of pages pinned. This may be fewer than the number | ||
371 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
372 | * were pinned, returns -errno. Each page returned must be released | ||
373 | * with a put_page() call when it is finished with. vmas will only | ||
374 | * remain valid while mmap_sem is held. | ||
375 | * | ||
376 | * Must be called with mmap_sem held for read or write. | ||
377 | * | ||
378 | * __get_user_pages walks a process's page tables and takes a reference to | ||
379 | * each struct page that each user address corresponds to at a given | ||
380 | * instant. That is, it takes the page that would be accessed if a user | ||
381 | * thread accesses the given user virtual address at that instant. | ||
382 | * | ||
383 | * This does not guarantee that the page exists in the user mappings when | ||
384 | * __get_user_pages returns, and there may even be a completely different | ||
385 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
386 | * and subsequently re faulted). However it does guarantee that the page | ||
387 | * won't be freed completely. And mostly callers simply care that the page | ||
388 | * contains data that was valid *at some point in time*. Typically, an IO | ||
389 | * or similar operation cannot guarantee anything stronger anyway because | ||
390 | * locks can't be held over the syscall boundary. | ||
391 | * | ||
392 | * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If | ||
393 | * the page is written to, set_page_dirty (or set_page_dirty_lock, as | ||
394 | * appropriate) must be called after the page is finished with, and | ||
395 | * before put_page is called. | ||
396 | * | ||
397 | * If @nonblocking != NULL, __get_user_pages will not wait for disk IO | ||
398 | * or mmap_sem contention, and if waiting is needed to pin all pages, | ||
399 | * *@nonblocking will be set to 0. | ||
400 | * | ||
401 | * In most cases, get_user_pages or get_user_pages_fast should be used | ||
402 | * instead of __get_user_pages. __get_user_pages should be used only if | ||
403 | * you need some special @gup_flags. | ||
404 | */ | ||
405 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
406 | unsigned long start, unsigned long nr_pages, | ||
407 | unsigned int gup_flags, struct page **pages, | ||
408 | struct vm_area_struct **vmas, int *nonblocking) | ||
409 | { | ||
410 | long i = 0; | ||
411 | unsigned int page_mask; | ||
412 | struct vm_area_struct *vma = NULL; | ||
413 | |||
414 | if (!nr_pages) | ||
415 | return 0; | ||
416 | |||
417 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); | ||
418 | |||
419 | /* | ||
420 | * If FOLL_FORCE is set then do not force a full fault as the hinting | ||
421 | * fault information is unrelated to the reference behaviour of a task | ||
422 | * using the address space | ||
423 | */ | ||
424 | if (!(gup_flags & FOLL_FORCE)) | ||
425 | gup_flags |= FOLL_NUMA; | ||
426 | |||
427 | do { | ||
428 | struct page *page; | ||
429 | unsigned int foll_flags = gup_flags; | ||
430 | unsigned int page_increm; | ||
431 | |||
432 | /* first iteration or cross vma bound */ | ||
433 | if (!vma || start >= vma->vm_end) { | ||
434 | vma = find_extend_vma(mm, start); | ||
435 | if (!vma && in_gate_area(mm, start)) { | ||
436 | int ret; | ||
437 | ret = get_gate_page(mm, start & PAGE_MASK, | ||
438 | gup_flags, &vma, | ||
439 | pages ? &pages[i] : NULL); | ||
440 | if (ret) | ||
441 | return i ? : ret; | ||
442 | page_mask = 0; | ||
443 | goto next_page; | ||
444 | } | ||
445 | |||
446 | if (!vma || check_vma_flags(vma, gup_flags)) | ||
447 | return i ? : -EFAULT; | ||
448 | if (is_vm_hugetlb_page(vma)) { | ||
449 | i = follow_hugetlb_page(mm, vma, pages, vmas, | ||
450 | &start, &nr_pages, i, | ||
451 | gup_flags); | ||
452 | continue; | ||
453 | } | ||
454 | } | ||
455 | retry: | ||
456 | /* | ||
457 | * If we have a pending SIGKILL, don't keep faulting pages and | ||
458 | * potentially allocating memory. | ||
459 | */ | ||
460 | if (unlikely(fatal_signal_pending(current))) | ||
461 | return i ? i : -ERESTARTSYS; | ||
462 | cond_resched(); | ||
463 | page = follow_page_mask(vma, start, foll_flags, &page_mask); | ||
464 | if (!page) { | ||
465 | int ret; | ||
466 | ret = faultin_page(tsk, vma, start, &foll_flags, | ||
467 | nonblocking); | ||
468 | switch (ret) { | ||
469 | case 0: | ||
470 | goto retry; | ||
471 | case -EFAULT: | ||
472 | case -ENOMEM: | ||
473 | case -EHWPOISON: | ||
474 | return i ? i : ret; | ||
475 | case -EBUSY: | ||
476 | return i; | ||
477 | case -ENOENT: | ||
478 | goto next_page; | ||
479 | } | ||
480 | BUG(); | ||
481 | } | ||
482 | if (IS_ERR(page)) | ||
483 | return i ? i : PTR_ERR(page); | ||
484 | if (pages) { | ||
485 | pages[i] = page; | ||
486 | flush_anon_page(vma, page, start); | ||
487 | flush_dcache_page(page); | ||
488 | page_mask = 0; | ||
489 | } | ||
490 | next_page: | ||
491 | if (vmas) { | ||
492 | vmas[i] = vma; | ||
493 | page_mask = 0; | ||
494 | } | ||
495 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); | ||
496 | if (page_increm > nr_pages) | ||
497 | page_increm = nr_pages; | ||
498 | i += page_increm; | ||
499 | start += page_increm * PAGE_SIZE; | ||
500 | nr_pages -= page_increm; | ||
501 | } while (nr_pages); | ||
502 | return i; | ||
503 | } | ||
504 | EXPORT_SYMBOL(__get_user_pages); | ||
505 | |||
506 | /* | ||
507 | * fixup_user_fault() - manually resolve a user page fault | ||
508 | * @tsk: the task_struct to use for page fault accounting, or | ||
509 | * NULL if faults are not to be recorded. | ||
510 | * @mm: mm_struct of target mm | ||
511 | * @address: user address | ||
512 | * @fault_flags:flags to pass down to handle_mm_fault() | ||
513 | * | ||
514 | * This is meant to be called in the specific scenario where for locking reasons | ||
515 | * we try to access user memory in atomic context (within a pagefault_disable() | ||
516 | * section), this returns -EFAULT, and we want to resolve the user fault before | ||
517 | * trying again. | ||
518 | * | ||
519 | * Typically this is meant to be used by the futex code. | ||
520 | * | ||
521 | * The main difference with get_user_pages() is that this function will | ||
522 | * unconditionally call handle_mm_fault() which will in turn perform all the | ||
523 | * necessary SW fixup of the dirty and young bits in the PTE, while | ||
524 | * get_user_pages() only guarantees to update these in the struct page. | ||
525 | * | ||
526 | * This is important for some architectures where those bits also gate the | ||
527 | * access permission to the page because they are maintained in software. On | ||
528 | * such architectures, gup() will not be enough to make a subsequent access | ||
529 | * succeed. | ||
530 | * | ||
531 | * This should be called with the mmap_sem held for read. | ||
532 | */ | ||
533 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | ||
534 | unsigned long address, unsigned int fault_flags) | ||
535 | { | ||
536 | struct vm_area_struct *vma; | ||
537 | vm_flags_t vm_flags; | ||
538 | int ret; | ||
539 | |||
540 | vma = find_extend_vma(mm, address); | ||
541 | if (!vma || address < vma->vm_start) | ||
542 | return -EFAULT; | ||
543 | |||
544 | vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; | ||
545 | if (!(vm_flags & vma->vm_flags)) | ||
546 | return -EFAULT; | ||
547 | |||
548 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
549 | if (ret & VM_FAULT_ERROR) { | ||
550 | if (ret & VM_FAULT_OOM) | ||
551 | return -ENOMEM; | ||
552 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
553 | return -EHWPOISON; | ||
554 | if (ret & VM_FAULT_SIGBUS) | ||
555 | return -EFAULT; | ||
556 | BUG(); | ||
557 | } | ||
558 | if (tsk) { | ||
559 | if (ret & VM_FAULT_MAJOR) | ||
560 | tsk->maj_flt++; | ||
561 | else | ||
562 | tsk->min_flt++; | ||
563 | } | ||
564 | return 0; | ||
565 | } | ||
566 | |||
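The intended pattern (a simplified, futex-style sketch; the function name is hypothetical) is to attempt the access with page faults disabled and fall back to fixup_user_fault() before retrying:

	#include <linux/uaccess.h>
	#include <linux/sched.h>
	#include <linux/mm.h>

	static int demo_get_user_u32(u32 __user *uaddr, u32 *val)
	{
		struct mm_struct *mm = current->mm;
		int err;

		for (;;) {
			pagefault_disable();
			err = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
			pagefault_enable();
			if (!err)
				return 0;

			/* Fault the page in (and fix up young/dirty in the pte),
			 * then retry the atomic access. */
			down_read(&mm->mmap_sem);
			err = fixup_user_fault(current, mm, (unsigned long)uaddr, 0);
			up_read(&mm->mmap_sem);
			if (err)
				return err;
		}
	}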
567 | /* | ||
568 | * get_user_pages() - pin user pages in memory | ||
569 | * @tsk: the task_struct to use for page fault accounting, or | ||
570 | * NULL if faults are not to be recorded. | ||
571 | * @mm: mm_struct of target mm | ||
572 | * @start: starting user address | ||
573 | * @nr_pages: number of pages from start to pin | ||
574 | * @write: whether pages will be written to by the caller | ||
575 | * @force: whether to force access even when user mapping is currently | ||
576 | * protected (but never forces write access to shared mapping). | ||
577 | * @pages: array that receives pointers to the pages pinned. | ||
578 | * Should be at least nr_pages long. Or NULL, if caller | ||
579 | * only intends to ensure the pages are faulted in. | ||
580 | * @vmas: array of pointers to vmas corresponding to each page. | ||
581 | * Or NULL if the caller does not require them. | ||
582 | * | ||
583 | * Returns number of pages pinned. This may be fewer than the number | ||
584 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
585 | * were pinned, returns -errno. Each page returned must be released | ||
586 | * with a put_page() call when it is finished with. vmas will only | ||
587 | * remain valid while mmap_sem is held. | ||
588 | * | ||
589 | * Must be called with mmap_sem held for read or write. | ||
590 | * | ||
591 | * get_user_pages walks a process's page tables and takes a reference to | ||
592 | * each struct page that each user address corresponds to at a given | ||
593 | * instant. That is, it takes the page that would be accessed if a user | ||
594 | * thread accesses the given user virtual address at that instant. | ||
595 | * | ||
596 | * This does not guarantee that the page exists in the user mappings when | ||
597 | * get_user_pages returns, and there may even be a completely different | ||
598 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
599 | * and subsequently re faulted). However it does guarantee that the page | ||
600 | * won't be freed completely. And mostly callers simply care that the page | ||
601 | * contains data that was valid *at some point in time*. Typically, an IO | ||
602 | * or similar operation cannot guarantee anything stronger anyway because | ||
603 | * locks can't be held over the syscall boundary. | ||
604 | * | ||
605 | * If write=0, the page must not be written to. If the page is written to, | ||
606 | * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called | ||
607 | * after the page is finished with, and before put_page is called. | ||
608 | * | ||
609 | * get_user_pages is typically used for fewer-copy IO operations, to get a | ||
610 | * handle on the memory by some means other than accesses via the user virtual | ||
611 | * addresses. The pages may be submitted for DMA to devices or accessed via | ||
612 | * their kernel linear mapping (via the kmap APIs). Care should be taken to | ||
613 | * use the correct cache flushing APIs. | ||
614 | * | ||
615 | * See also get_user_pages_fast, for performance critical applications. | ||
616 | */ | ||
617 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
618 | unsigned long start, unsigned long nr_pages, int write, | ||
619 | int force, struct page **pages, struct vm_area_struct **vmas) | ||
620 | { | ||
621 | int flags = FOLL_TOUCH; | ||
622 | |||
623 | if (pages) | ||
624 | flags |= FOLL_GET; | ||
625 | if (write) | ||
626 | flags |= FOLL_WRITE; | ||
627 | if (force) | ||
628 | flags |= FOLL_FORCE; | ||
629 | |||
630 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, | ||
631 | NULL); | ||
632 | } | ||
633 | EXPORT_SYMBOL(get_user_pages); | ||
634 | |||
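A typical driver-style caller (hedged sketch, not from this patch) pins with mmap_sem held for read, uses the pages, then dirties and releases them as the comment above requires:

	static int demo_pin_and_dirty(unsigned long start, unsigned long nr_pages,
				      struct page **pages)
	{
		long i, pinned;

		down_read(&current->mm->mmap_sem);
		pinned = get_user_pages(current, current->mm, start, nr_pages,
					1 /* write */, 0 /* force */, pages, NULL);
		up_read(&current->mm->mmap_sem);
		if (pinned <= 0)
			return pinned ? (int)pinned : -EFAULT;

		/* ... submit pages[0..pinned-1] for DMA or access them via kmap() ... */

		for (i = 0; i < pinned; i++) {
			set_page_dirty_lock(pages[i]);	/* we asked for write access */
			put_page(pages[i]);
		}
		return (unsigned long)pinned == nr_pages ? 0 : -EFAULT;
	}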
635 | /** | ||
636 | * get_dump_page() - pin user page in memory while writing it to core dump | ||
637 | * @addr: user address | ||
638 | * | ||
639 | * Returns struct page pointer of user page pinned for dump, | ||
640 | * to be freed afterwards by page_cache_release() or put_page(). | ||
641 | * | ||
642 | * Returns NULL on any kind of failure - a hole must then be inserted into | ||
643 | * the corefile, to preserve alignment with its headers; and also returns | ||
644 | * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - | ||
645 | * allowing a hole to be left in the corefile to save diskspace. | ||
646 | * | ||
647 | * Called without mmap_sem, but after all other threads have been killed. | ||
648 | */ | ||
649 | #ifdef CONFIG_ELF_CORE | ||
650 | struct page *get_dump_page(unsigned long addr) | ||
651 | { | ||
652 | struct vm_area_struct *vma; | ||
653 | struct page *page; | ||
654 | |||
655 | if (__get_user_pages(current, current->mm, addr, 1, | ||
656 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, | ||
657 | NULL) < 1) | ||
658 | return NULL; | ||
659 | flush_cache_page(vma, addr, page_to_pfn(page)); | ||
660 | return page; | ||
661 | } | ||
662 | #endif /* CONFIG_ELF_CORE */ | ||
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b4b1feba6472..e60837dc785c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -5,6 +5,8 @@ | |||
5 | * the COPYING file in the top-level directory. | 5 | * the COPYING file in the top-level directory. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
9 | |||
8 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
9 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
10 | #include <linux/highmem.h> | 12 | #include <linux/highmem.h> |
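The pr_fmt() definition above is what lets the later hunks drop their hand-written "hugepage: " prefixes: pr_err()/pr_warn() expand their format through pr_fmt(), so (approximately, assuming KBUILD_MODNAME is "huge_memory" for this file):

	pr_err("failed to register transparent hugepage group\n");
	/* effectively becomes */
	printk(KERN_ERR "huge_memory" ": " "failed to register transparent hugepage group\n");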
@@ -151,8 +153,7 @@ static int start_khugepaged(void) | |||
151 | khugepaged_thread = kthread_run(khugepaged, NULL, | 153 | khugepaged_thread = kthread_run(khugepaged, NULL, |
152 | "khugepaged"); | 154 | "khugepaged"); |
153 | if (unlikely(IS_ERR(khugepaged_thread))) { | 155 | if (unlikely(IS_ERR(khugepaged_thread))) { |
154 | printk(KERN_ERR | 156 | pr_err("khugepaged: kthread_run(khugepaged) failed\n"); |
155 | "khugepaged: kthread_run(khugepaged) failed\n"); | ||
156 | err = PTR_ERR(khugepaged_thread); | 157 | err = PTR_ERR(khugepaged_thread); |
157 | khugepaged_thread = NULL; | 158 | khugepaged_thread = NULL; |
158 | } | 159 | } |
@@ -584,19 +585,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) | |||
584 | 585 | ||
585 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); | 586 | *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); |
586 | if (unlikely(!*hugepage_kobj)) { | 587 | if (unlikely(!*hugepage_kobj)) { |
587 | printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); | 588 | pr_err("failed to create transparent hugepage kobject\n"); |
588 | return -ENOMEM; | 589 | return -ENOMEM; |
589 | } | 590 | } |
590 | 591 | ||
591 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); | 592 | err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); |
592 | if (err) { | 593 | if (err) { |
593 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); | 594 | pr_err("failed to register transparent hugepage group\n"); |
594 | goto delete_obj; | 595 | goto delete_obj; |
595 | } | 596 | } |
596 | 597 | ||
597 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); | 598 | err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); |
598 | if (err) { | 599 | if (err) { |
599 | printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); | 600 | pr_err("failed to register transparent hugepage group\n"); |
600 | goto remove_hp_group; | 601 | goto remove_hp_group; |
601 | } | 602 | } |
602 | 603 | ||
@@ -689,8 +690,7 @@ static int __init setup_transparent_hugepage(char *str) | |||
689 | } | 690 | } |
690 | out: | 691 | out: |
691 | if (!ret) | 692 | if (!ret) |
692 | printk(KERN_WARNING | 693 | pr_warn("transparent_hugepage= cannot parse, ignored\n"); |
693 | "transparent_hugepage= cannot parse, ignored\n"); | ||
694 | return ret; | 694 | return ret; |
695 | } | 695 | } |
696 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 696 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
@@ -1830,10 +1830,11 @@ static void __split_huge_page(struct page *page, | |||
1830 | * the newly established pmd of the child later during the | 1830 | * the newly established pmd of the child later during the |
1831 | * walk, to be able to set it as pmd_trans_splitting too. | 1831 | * walk, to be able to set it as pmd_trans_splitting too. |
1832 | */ | 1832 | */ |
1833 | if (mapcount != page_mapcount(page)) | 1833 | if (mapcount != page_mapcount(page)) { |
1834 | printk(KERN_ERR "mapcount %d page_mapcount %d\n", | 1834 | pr_err("mapcount %d page_mapcount %d\n", |
1835 | mapcount, page_mapcount(page)); | 1835 | mapcount, page_mapcount(page)); |
1836 | BUG_ON(mapcount != page_mapcount(page)); | 1836 | BUG(); |
1837 | } | ||
1837 | 1838 | ||
1838 | __split_huge_page_refcount(page, list); | 1839 | __split_huge_page_refcount(page, list); |
1839 | 1840 | ||
@@ -1844,10 +1845,11 @@ static void __split_huge_page(struct page *page, | |||
1844 | BUG_ON(is_vma_temporary_stack(vma)); | 1845 | BUG_ON(is_vma_temporary_stack(vma)); |
1845 | mapcount2 += __split_huge_page_map(page, vma, addr); | 1846 | mapcount2 += __split_huge_page_map(page, vma, addr); |
1846 | } | 1847 | } |
1847 | if (mapcount != mapcount2) | 1848 | if (mapcount != mapcount2) { |
1848 | printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", | 1849 | pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", |
1849 | mapcount, mapcount2, page_mapcount(page)); | 1850 | mapcount, mapcount2, page_mapcount(page)); |
1850 | BUG_ON(mapcount != mapcount2); | 1851 | BUG(); |
1852 | } | ||
1851 | } | 1853 | } |
1852 | 1854 | ||
1853 | /* | 1855 | /* |
@@ -2740,7 +2742,7 @@ static int khugepaged(void *none) | |||
2740 | struct mm_slot *mm_slot; | 2742 | struct mm_slot *mm_slot; |
2741 | 2743 | ||
2742 | set_freezable(); | 2744 | set_freezable(); |
2743 | set_user_nice(current, 19); | 2745 | set_user_nice(current, MAX_NICE); |
2744 | 2746 | ||
2745 | while (!kthread_should_stop()) { | 2747 | while (!kthread_should_stop()) { |
2746 | khugepaged_do_scan(); | 2748 | khugepaged_do_scan(); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c82290b9c1fc..226910cb7c9b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -544,7 +544,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | |||
544 | /* Movability of hugepages depends on migration support. */ | 544 | /* Movability of hugepages depends on migration support. */ |
545 | static inline gfp_t htlb_alloc_mask(struct hstate *h) | 545 | static inline gfp_t htlb_alloc_mask(struct hstate *h) |
546 | { | 546 | { |
547 | if (hugepages_treat_as_movable || hugepage_migration_support(h)) | 547 | if (hugepages_treat_as_movable || hugepage_migration_supported(h)) |
548 | return GFP_HIGHUSER_MOVABLE; | 548 | return GFP_HIGHUSER_MOVABLE; |
549 | else | 549 | else |
550 | return GFP_HIGHUSER; | 550 | return GFP_HIGHUSER; |
@@ -607,25 +607,242 @@ err: | |||
607 | return NULL; | 607 | return NULL; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* | ||
611 | * common helper functions for hstate_next_node_to_{alloc|free}. | ||
612 | * We may have allocated or freed a huge page based on a different | ||
613 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might | ||
614 | * be outside of *nodes_allowed. Ensure that we use an allowed | ||
615 | * node for alloc or free. | ||
616 | */ | ||
617 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
618 | { | ||
619 | nid = next_node(nid, *nodes_allowed); | ||
620 | if (nid == MAX_NUMNODES) | ||
621 | nid = first_node(*nodes_allowed); | ||
622 | VM_BUG_ON(nid >= MAX_NUMNODES); | ||
623 | |||
624 | return nid; | ||
625 | } | ||
626 | |||
627 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
628 | { | ||
629 | if (!node_isset(nid, *nodes_allowed)) | ||
630 | nid = next_node_allowed(nid, nodes_allowed); | ||
631 | return nid; | ||
632 | } | ||
633 | |||
634 | /* | ||
635 | * returns the previously saved node ["this node"] from which to | ||
636 | * allocate a persistent huge page for the pool and advance the | ||
637 | * next node from which to allocate, handling wrap at end of node | ||
638 | * mask. | ||
639 | */ | ||
640 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
641 | nodemask_t *nodes_allowed) | ||
642 | { | ||
643 | int nid; | ||
644 | |||
645 | VM_BUG_ON(!nodes_allowed); | ||
646 | |||
647 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
648 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
649 | |||
650 | return nid; | ||
651 | } | ||
652 | |||
653 | /* | ||
654 | * helper for free_pool_huge_page() - return the previously saved | ||
655 | * node ["this node"] from which to free a huge page. Advance the | ||
656 | * next node id whether or not we find a free huge page to free so | ||
657 | * that the next attempt to free addresses the next node. | ||
658 | */ | ||
659 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) | ||
660 | { | ||
661 | int nid; | ||
662 | |||
663 | VM_BUG_ON(!nodes_allowed); | ||
664 | |||
665 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); | ||
666 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); | ||
667 | |||
668 | return nid; | ||
669 | } | ||
670 | |||
671 | #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ | ||
672 | for (nr_nodes = nodes_weight(*mask); \ | ||
673 | nr_nodes > 0 && \ | ||
674 | ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ | ||
675 | nr_nodes--) | ||
676 | |||
677 | #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ | ||
678 | for (nr_nodes = nodes_weight(*mask); \ | ||
679 | nr_nodes > 0 && \ | ||
680 | ((node = hstate_next_node_to_free(hs, mask)) || 1); \ | ||
681 | nr_nodes--) | ||
682 | |||
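The "|| 1" in both macros above keeps the loop condition true even when the selected node id happens to be 0; only nr_nodes terminates the walk. A simplified caller (sketch only, modelled loosely on the free path inside mm/hugetlb.c) round-robins across the allowed nodes like this:

	static int demo_pick_node_with_free_page(struct hstate *h,
						 nodemask_t *nodes_allowed)
	{
		int nr_nodes, node;

		for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
			/* stop at the first allowed node that still has a free huge page */
			if (!list_empty(&h->hugepage_freelists[node]))
				return node;
		}
		return NUMA_NO_NODE;
	}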
683 | #if defined(CONFIG_CMA) && defined(CONFIG_X86_64) | ||
684 | static void destroy_compound_gigantic_page(struct page *page, | ||
685 | unsigned long order) | ||
686 | { | ||
687 | int i; | ||
688 | int nr_pages = 1 << order; | ||
689 | struct page *p = page + 1; | ||
690 | |||
691 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | ||
692 | __ClearPageTail(p); | ||
693 | set_page_refcounted(p); | ||
694 | p->first_page = NULL; | ||
695 | } | ||
696 | |||
697 | set_compound_order(page, 0); | ||
698 | __ClearPageHead(page); | ||
699 | } | ||
700 | |||
701 | static void free_gigantic_page(struct page *page, unsigned order) | ||
702 | { | ||
703 | free_contig_range(page_to_pfn(page), 1 << order); | ||
704 | } | ||
705 | |||
706 | static int __alloc_gigantic_page(unsigned long start_pfn, | ||
707 | unsigned long nr_pages) | ||
708 | { | ||
709 | unsigned long end_pfn = start_pfn + nr_pages; | ||
710 | return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | ||
711 | } | ||
712 | |||
713 | static bool pfn_range_valid_gigantic(unsigned long start_pfn, | ||
714 | unsigned long nr_pages) | ||
715 | { | ||
716 | unsigned long i, end_pfn = start_pfn + nr_pages; | ||
717 | struct page *page; | ||
718 | |||
719 | for (i = start_pfn; i < end_pfn; i++) { | ||
720 | if (!pfn_valid(i)) | ||
721 | return false; | ||
722 | |||
723 | page = pfn_to_page(i); | ||
724 | |||
725 | if (PageReserved(page)) | ||
726 | return false; | ||
727 | |||
728 | if (page_count(page) > 0) | ||
729 | return false; | ||
730 | |||
731 | if (PageHuge(page)) | ||
732 | return false; | ||
733 | } | ||
734 | |||
735 | return true; | ||
736 | } | ||
737 | |||
738 | static bool zone_spans_last_pfn(const struct zone *zone, | ||
739 | unsigned long start_pfn, unsigned long nr_pages) | ||
740 | { | ||
741 | unsigned long last_pfn = start_pfn + nr_pages - 1; | ||
742 | return zone_spans_pfn(zone, last_pfn); | ||
743 | } | ||
744 | |||
745 | static struct page *alloc_gigantic_page(int nid, unsigned order) | ||
746 | { | ||
747 | unsigned long nr_pages = 1 << order; | ||
748 | unsigned long ret, pfn, flags; | ||
749 | struct zone *z; | ||
750 | |||
751 | z = NODE_DATA(nid)->node_zones; | ||
752 | for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { | ||
753 | spin_lock_irqsave(&z->lock, flags); | ||
754 | |||
755 | pfn = ALIGN(z->zone_start_pfn, nr_pages); | ||
756 | while (zone_spans_last_pfn(z, pfn, nr_pages)) { | ||
757 | if (pfn_range_valid_gigantic(pfn, nr_pages)) { | ||
758 | /* | ||
759 | * We release the zone lock here because | ||
760 | * alloc_contig_range() will also lock the zone | ||
761 | * at some point. If there's an allocation | ||
762 | * spinning on this lock, it may win the race | ||
763 | * and cause alloc_contig_range() to fail... | ||
764 | */ | ||
765 | spin_unlock_irqrestore(&z->lock, flags); | ||
766 | ret = __alloc_gigantic_page(pfn, nr_pages); | ||
767 | if (!ret) | ||
768 | return pfn_to_page(pfn); | ||
769 | spin_lock_irqsave(&z->lock, flags); | ||
770 | } | ||
771 | pfn += nr_pages; | ||
772 | } | ||
773 | |||
774 | spin_unlock_irqrestore(&z->lock, flags); | ||
775 | } | ||
776 | |||
777 | return NULL; | ||
778 | } | ||
779 | |||
780 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); | ||
781 | static void prep_compound_gigantic_page(struct page *page, unsigned long order); | ||
782 | |||
783 | static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) | ||
784 | { | ||
785 | struct page *page; | ||
786 | |||
787 | page = alloc_gigantic_page(nid, huge_page_order(h)); | ||
788 | if (page) { | ||
789 | prep_compound_gigantic_page(page, huge_page_order(h)); | ||
790 | prep_new_huge_page(h, page, nid); | ||
791 | } | ||
792 | |||
793 | return page; | ||
794 | } | ||
795 | |||
796 | static int alloc_fresh_gigantic_page(struct hstate *h, | ||
797 | nodemask_t *nodes_allowed) | ||
798 | { | ||
799 | struct page *page = NULL; | ||
800 | int nr_nodes, node; | ||
801 | |||
802 | for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { | ||
803 | page = alloc_fresh_gigantic_page_node(h, node); | ||
804 | if (page) | ||
805 | return 1; | ||
806 | } | ||
807 | |||
808 | return 0; | ||
809 | } | ||
810 | |||
811 | static inline bool gigantic_page_supported(void) { return true; } | ||
812 | #else | ||
813 | static inline bool gigantic_page_supported(void) { return false; } | ||
814 | static inline void free_gigantic_page(struct page *page, unsigned order) { } | ||
815 | static inline void destroy_compound_gigantic_page(struct page *page, | ||
816 | unsigned long order) { } | ||
817 | static inline int alloc_fresh_gigantic_page(struct hstate *h, | ||
818 | nodemask_t *nodes_allowed) { return 0; } | ||
819 | #endif | ||
820 | |||
610 | static void update_and_free_page(struct hstate *h, struct page *page) | 821 | static void update_and_free_page(struct hstate *h, struct page *page) |
611 | { | 822 | { |
612 | int i; | 823 | int i; |
613 | 824 | ||
614 | VM_BUG_ON(h->order >= MAX_ORDER); | 825 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) |
826 | return; | ||
615 | 827 | ||
616 | h->nr_huge_pages--; | 828 | h->nr_huge_pages--; |
617 | h->nr_huge_pages_node[page_to_nid(page)]--; | 829 | h->nr_huge_pages_node[page_to_nid(page)]--; |
618 | for (i = 0; i < pages_per_huge_page(h); i++) { | 830 | for (i = 0; i < pages_per_huge_page(h); i++) { |
619 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | | 831 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | |
620 | 1 << PG_referenced | 1 << PG_dirty | | 832 | 1 << PG_referenced | 1 << PG_dirty | |
621 | 1 << PG_active | 1 << PG_reserved | | 833 | 1 << PG_active | 1 << PG_private | |
622 | 1 << PG_private | 1 << PG_writeback); | 834 | 1 << PG_writeback); |
623 | } | 835 | } |
624 | VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); | 836 | VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); |
625 | set_compound_page_dtor(page, NULL); | 837 | set_compound_page_dtor(page, NULL); |
626 | set_page_refcounted(page); | 838 | set_page_refcounted(page); |
627 | arch_release_hugepage(page); | 839 | if (hstate_is_gigantic(h)) { |
628 | __free_pages(page, huge_page_order(h)); | 840 | destroy_compound_gigantic_page(page, huge_page_order(h)); |
841 | free_gigantic_page(page, huge_page_order(h)); | ||
842 | } else { | ||
843 | arch_release_hugepage(page); | ||
844 | __free_pages(page, huge_page_order(h)); | ||
845 | } | ||
629 | } | 846 | } |
630 | 847 | ||
631 | struct hstate *size_to_hstate(unsigned long size) | 848 | struct hstate *size_to_hstate(unsigned long size) |
@@ -664,7 +881,7 @@ static void free_huge_page(struct page *page) | |||
664 | if (restore_reserve) | 881 | if (restore_reserve) |
665 | h->resv_huge_pages++; | 882 | h->resv_huge_pages++; |
666 | 883 | ||
667 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { | 884 | if (h->surplus_huge_pages_node[nid]) { |
668 | /* remove the page from active list */ | 885 | /* remove the page from active list */ |
669 | list_del(&page->lru); | 886 | list_del(&page->lru); |
670 | update_and_free_page(h, page); | 887 | update_and_free_page(h, page); |
@@ -690,8 +907,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | |||
690 | put_page(page); /* free it into the hugepage allocator */ | 907 | put_page(page); /* free it into the hugepage allocator */ |
691 | } | 908 | } |
692 | 909 | ||
693 | static void __init prep_compound_gigantic_page(struct page *page, | 910 | static void prep_compound_gigantic_page(struct page *page, unsigned long order) |
694 | unsigned long order) | ||
695 | { | 911 | { |
696 | int i; | 912 | int i; |
697 | int nr_pages = 1 << order; | 913 | int nr_pages = 1 << order; |
@@ -769,9 +985,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
769 | { | 985 | { |
770 | struct page *page; | 986 | struct page *page; |
771 | 987 | ||
772 | if (h->order >= MAX_ORDER) | ||
773 | return NULL; | ||
774 | |||
775 | page = alloc_pages_exact_node(nid, | 988 | page = alloc_pages_exact_node(nid, |
776 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| | 989 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| |
777 | __GFP_REPEAT|__GFP_NOWARN, | 990 | __GFP_REPEAT|__GFP_NOWARN, |
@@ -787,79 +1000,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
787 | return page; | 1000 | return page; |
788 | } | 1001 | } |
789 | 1002 | ||
790 | /* | ||
791 | * common helper functions for hstate_next_node_to_{alloc|free}. | ||
792 | * We may have allocated or freed a huge page based on a different | ||
793 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might | ||
794 | * be outside of *nodes_allowed. Ensure that we use an allowed | ||
795 | * node for alloc or free. | ||
796 | */ | ||
797 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
798 | { | ||
799 | nid = next_node(nid, *nodes_allowed); | ||
800 | if (nid == MAX_NUMNODES) | ||
801 | nid = first_node(*nodes_allowed); | ||
802 | VM_BUG_ON(nid >= MAX_NUMNODES); | ||
803 | |||
804 | return nid; | ||
805 | } | ||
806 | |||
807 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
808 | { | ||
809 | if (!node_isset(nid, *nodes_allowed)) | ||
810 | nid = next_node_allowed(nid, nodes_allowed); | ||
811 | return nid; | ||
812 | } | ||
813 | |||
814 | /* | ||
815 | * returns the previously saved node ["this node"] from which to | ||
816 | * allocate a persistent huge page for the pool and advance the | ||
817 | * next node from which to allocate, handling wrap at end of node | ||
818 | * mask. | ||
819 | */ | ||
820 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
821 | nodemask_t *nodes_allowed) | ||
822 | { | ||
823 | int nid; | ||
824 | |||
825 | VM_BUG_ON(!nodes_allowed); | ||
826 | |||
827 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
828 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
829 | |||
830 | return nid; | ||
831 | } | ||
832 | |||
833 | /* | ||
834 | * helper for free_pool_huge_page() - return the previously saved | ||
835 | * node ["this node"] from which to free a huge page. Advance the | ||
836 | * next node id whether or not we find a free huge page to free so | ||
837 | * that the next attempt to free addresses the next node. | ||
838 | */ | ||
839 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) | ||
840 | { | ||
841 | int nid; | ||
842 | |||
843 | VM_BUG_ON(!nodes_allowed); | ||
844 | |||
845 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); | ||
846 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); | ||
847 | |||
848 | return nid; | ||
849 | } | ||
850 | |||
851 | #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ | ||
852 | for (nr_nodes = nodes_weight(*mask); \ | ||
853 | nr_nodes > 0 && \ | ||
854 | ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ | ||
855 | nr_nodes--) | ||
856 | |||
857 | #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ | ||
858 | for (nr_nodes = nodes_weight(*mask); \ | ||
859 | nr_nodes > 0 && \ | ||
860 | ((node = hstate_next_node_to_free(hs, mask)) || 1); \ | ||
861 | nr_nodes--) | ||
862 | |||
863 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) | 1003 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) |
864 | { | 1004 | { |
865 | struct page *page; | 1005 | struct page *page; |
@@ -963,7 +1103,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
963 | struct page *page; | 1103 | struct page *page; |
964 | unsigned int r_nid; | 1104 | unsigned int r_nid; |
965 | 1105 | ||
966 | if (h->order >= MAX_ORDER) | 1106 | if (hstate_is_gigantic(h)) |
967 | return NULL; | 1107 | return NULL; |
968 | 1108 | ||
969 | /* | 1109 | /* |
@@ -1156,7 +1296,7 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
1156 | h->resv_huge_pages -= unused_resv_pages; | 1296 | h->resv_huge_pages -= unused_resv_pages; |
1157 | 1297 | ||
1158 | /* Cannot return gigantic pages currently */ | 1298 | /* Cannot return gigantic pages currently */ |
1159 | if (h->order >= MAX_ORDER) | 1299 | if (hstate_is_gigantic(h)) |
1160 | return; | 1300 | return; |
1161 | 1301 | ||
1162 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); | 1302 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); |
@@ -1246,24 +1386,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1246 | return ERR_PTR(-ENOSPC); | 1386 | return ERR_PTR(-ENOSPC); |
1247 | 1387 | ||
1248 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | 1388 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); |
1249 | if (ret) { | 1389 | if (ret) |
1250 | if (chg || avoid_reserve) | 1390 | goto out_subpool_put; |
1251 | hugepage_subpool_put_pages(spool, 1); | 1391 | |
1252 | return ERR_PTR(-ENOSPC); | ||
1253 | } | ||
1254 | spin_lock(&hugetlb_lock); | 1392 | spin_lock(&hugetlb_lock); |
1255 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); | 1393 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); |
1256 | if (!page) { | 1394 | if (!page) { |
1257 | spin_unlock(&hugetlb_lock); | 1395 | spin_unlock(&hugetlb_lock); |
1258 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1396 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1259 | if (!page) { | 1397 | if (!page) |
1260 | hugetlb_cgroup_uncharge_cgroup(idx, | 1398 | goto out_uncharge_cgroup; |
1261 | pages_per_huge_page(h), | 1399 | |
1262 | h_cg); | ||
1263 | if (chg || avoid_reserve) | ||
1264 | hugepage_subpool_put_pages(spool, 1); | ||
1265 | return ERR_PTR(-ENOSPC); | ||
1266 | } | ||
1267 | spin_lock(&hugetlb_lock); | 1400 | spin_lock(&hugetlb_lock); |
1268 | list_move(&page->lru, &h->hugepage_activelist); | 1401 | list_move(&page->lru, &h->hugepage_activelist); |
1269 | /* Fall through */ | 1402 | /* Fall through */ |
@@ -1275,6 +1408,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1275 | 1408 | ||
1276 | vma_commit_reservation(h, vma, addr); | 1409 | vma_commit_reservation(h, vma, addr); |
1277 | return page; | 1410 | return page; |
1411 | |||
1412 | out_uncharge_cgroup: | ||
1413 | hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); | ||
1414 | out_subpool_put: | ||
1415 | if (chg || avoid_reserve) | ||
1416 | hugepage_subpool_put_pages(spool, 1); | ||
1417 | return ERR_PTR(-ENOSPC); | ||
1278 | } | 1418 | } |
1279 | 1419 | ||
1280 | /* | 1420 | /* |
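The alloc_huge_page() hunk above collapses the duplicated cleanup into goto-based unwind labels (out_uncharge_cgroup, out_subpool_put). Below is a minimal userspace sketch of that unwind pattern; the helper names are invented stand-ins for the subpool reservation and cgroup charge, not kernel APIs:

    #include <stdio.h>
    #include <stdlib.h>

    /* stand-ins for the subpool reservation and the cgroup charge */
    static int take_reservation(void)  { return 0; }            /* 0 = success */
    static void drop_reservation(void) { puts("reservation dropped"); }
    static int charge_cgroup(void)     { return 0; }
    static void uncharge_cgroup(void)  { puts("cgroup uncharged"); }

    static void *alloc_with_unwind(void)
    {
        void *page;

        if (take_reservation())
            return NULL;
        if (charge_cgroup())
            goto out_put_reservation;   /* undo step 1 only */

        page = malloc(4096);            /* the "huge page" */
        if (!page)
            goto out_uncharge;          /* undo step 2, then step 1 */

        return page;                    /* success: no cleanup runs */

    out_uncharge:
        uncharge_cgroup();
    out_put_reservation:
        drop_reservation();
        return NULL;
    }

    int main(void)
    {
        free(alloc_with_unwind());
        return 0;
    }

Each failure point jumps to the label that undoes exactly the steps completed so far, which is why the labels appear in reverse setup order.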
@@ -1356,7 +1496,7 @@ static void __init gather_bootmem_prealloc(void) | |||
1356 | * fix confusing memory reports from free(1) and other | 1496 | * fix confusing memory reports from free(1) and other |
1357 | * side effects, like CommitLimit going negative. | 1497 | * side effects, like CommitLimit going negative. |
1358 | */ | 1498 | */ |
1359 | if (h->order > (MAX_ORDER - 1)) | 1499 | if (hstate_is_gigantic(h)) |
1360 | adjust_managed_page_count(page, 1 << h->order); | 1500 | adjust_managed_page_count(page, 1 << h->order); |
1361 | } | 1501 | } |
1362 | } | 1502 | } |
@@ -1366,7 +1506,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
1366 | unsigned long i; | 1506 | unsigned long i; |
1367 | 1507 | ||
1368 | for (i = 0; i < h->max_huge_pages; ++i) { | 1508 | for (i = 0; i < h->max_huge_pages; ++i) { |
1369 | if (h->order >= MAX_ORDER) { | 1509 | if (hstate_is_gigantic(h)) { |
1370 | if (!alloc_bootmem_huge_page(h)) | 1510 | if (!alloc_bootmem_huge_page(h)) |
1371 | break; | 1511 | break; |
1372 | } else if (!alloc_fresh_huge_page(h, | 1512 | } else if (!alloc_fresh_huge_page(h, |
@@ -1382,7 +1522,7 @@ static void __init hugetlb_init_hstates(void) | |||
1382 | 1522 | ||
1383 | for_each_hstate(h) { | 1523 | for_each_hstate(h) { |
1384 | /* oversize hugepages were init'ed in early boot */ | 1524 | /* oversize hugepages were init'ed in early boot */ |
1385 | if (h->order < MAX_ORDER) | 1525 | if (!hstate_is_gigantic(h)) |
1386 | hugetlb_hstate_alloc_pages(h); | 1526 | hugetlb_hstate_alloc_pages(h); |
1387 | } | 1527 | } |
1388 | } | 1528 | } |
@@ -1416,7 +1556,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, | |||
1416 | { | 1556 | { |
1417 | int i; | 1557 | int i; |
1418 | 1558 | ||
1419 | if (h->order >= MAX_ORDER) | 1559 | if (hstate_is_gigantic(h)) |
1420 | return; | 1560 | return; |
1421 | 1561 | ||
1422 | for_each_node_mask(i, *nodes_allowed) { | 1562 | for_each_node_mask(i, *nodes_allowed) { |
@@ -1479,7 +1619,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, | |||
1479 | { | 1619 | { |
1480 | unsigned long min_count, ret; | 1620 | unsigned long min_count, ret; |
1481 | 1621 | ||
1482 | if (h->order >= MAX_ORDER) | 1622 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) |
1483 | return h->max_huge_pages; | 1623 | return h->max_huge_pages; |
1484 | 1624 | ||
1485 | /* | 1625 | /* |
@@ -1506,7 +1646,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, | |||
1506 | * and reducing the surplus. | 1646 | * and reducing the surplus. |
1507 | */ | 1647 | */ |
1508 | spin_unlock(&hugetlb_lock); | 1648 | spin_unlock(&hugetlb_lock); |
1509 | ret = alloc_fresh_huge_page(h, nodes_allowed); | 1649 | if (hstate_is_gigantic(h)) |
1650 | ret = alloc_fresh_gigantic_page(h, nodes_allowed); | ||
1651 | else | ||
1652 | ret = alloc_fresh_huge_page(h, nodes_allowed); | ||
1510 | spin_lock(&hugetlb_lock); | 1653 | spin_lock(&hugetlb_lock); |
1511 | if (!ret) | 1654 | if (!ret) |
1512 | goto out; | 1655 | goto out; |
@@ -1606,7 +1749,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1606 | goto out; | 1749 | goto out; |
1607 | 1750 | ||
1608 | h = kobj_to_hstate(kobj, &nid); | 1751 | h = kobj_to_hstate(kobj, &nid); |
1609 | if (h->order >= MAX_ORDER) { | 1752 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) { |
1610 | err = -EINVAL; | 1753 | err = -EINVAL; |
1611 | goto out; | 1754 | goto out; |
1612 | } | 1755 | } |
@@ -1689,7 +1832,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
1689 | unsigned long input; | 1832 | unsigned long input; |
1690 | struct hstate *h = kobj_to_hstate(kobj, NULL); | 1833 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1691 | 1834 | ||
1692 | if (h->order >= MAX_ORDER) | 1835 | if (hstate_is_gigantic(h)) |
1693 | return -EINVAL; | 1836 | return -EINVAL; |
1694 | 1837 | ||
1695 | err = kstrtoul(buf, 10, &input); | 1838 | err = kstrtoul(buf, 10, &input); |
@@ -2113,7 +2256,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
2113 | 2256 | ||
2114 | tmp = h->max_huge_pages; | 2257 | tmp = h->max_huge_pages; |
2115 | 2258 | ||
2116 | if (write && h->order >= MAX_ORDER) | 2259 | if (write && hstate_is_gigantic(h) && !gigantic_page_supported()) |
2117 | return -EINVAL; | 2260 | return -EINVAL; |
2118 | 2261 | ||
2119 | table->data = &tmp; | 2262 | table->data = &tmp; |
@@ -2169,7 +2312,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
2169 | 2312 | ||
2170 | tmp = h->nr_overcommit_huge_pages; | 2313 | tmp = h->nr_overcommit_huge_pages; |
2171 | 2314 | ||
2172 | if (write && h->order >= MAX_ORDER) | 2315 | if (write && hstate_is_gigantic(h)) |
2173 | return -EINVAL; | 2316 | return -EINVAL; |
2174 | 2317 | ||
2175 | table->data = &tmp; | 2318 | table->data = &tmp; |
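Throughout the hugetlb.c hunks, the open-coded test h->order >= MAX_ORDER is replaced by the named predicate hstate_is_gigantic(). A self-contained sketch of the idea follows; the MAX_ORDER value and the order numbers are illustrative assumptions, not taken from this patch:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_ORDER 11                    /* assumed buddy-allocator limit */

    struct hstate { unsigned int order; };  /* reduced stand-in for the kernel struct */

    /* one named predicate instead of scattered "order >= MAX_ORDER" comparisons */
    static inline bool hstate_is_gigantic(const struct hstate *h)
    {
        return h->order >= MAX_ORDER;
    }

    int main(void)
    {
        struct hstate huge_2m = { .order = 9 };     /* e.g. 2 MB pages on x86-64 */
        struct hstate huge_1g = { .order = 18 };    /* e.g. 1 GB pages on x86-64 */

        printf("2M gigantic? %d\n", hstate_is_gigantic(&huge_2m));  /* prints 0 */
        printf("1G gigantic? %d\n", hstate_is_gigantic(&huge_1g));  /* prints 1 */
        return 0;
    }

Naming the predicate lets the gigantic-page call sites (sysctl handlers, pool resizing, surplus accounting) read as policy checks rather than order arithmetic.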
diff --git a/mm/internal.h b/mm/internal.h index 07b67361a40a..7f22a11fcc66 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -134,7 +134,7 @@ struct compact_control { | |||
134 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 134 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
135 | unsigned long free_pfn; /* isolate_freepages search base */ | 135 | unsigned long free_pfn; /* isolate_freepages search base */ |
136 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 136 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
137 | bool sync; /* Synchronous migration */ | 137 | enum migrate_mode mode; /* Async or sync migration mode */ |
138 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ | 138 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
139 | bool finished_update_free; /* True when the zone cached pfns are | 139 | bool finished_update_free; /* True when the zone cached pfns are |
140 | * no longer being updated | 140 | * no longer being updated |
@@ -144,7 +144,10 @@ struct compact_control { | |||
144 | int order; /* order a direct compactor needs */ | 144 | int order; /* order a direct compactor needs */ |
145 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 145 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
146 | struct zone *zone; | 146 | struct zone *zone; |
147 | bool contended; /* True if a lock was contended */ | 147 | bool contended; /* True if a lock was contended, or |
148 | * need_resched() true during async | ||
149 | * compaction | ||
150 | */ | ||
148 | }; | 151 | }; |
149 | 152 | ||
150 | unsigned long | 153 | unsigned long |
@@ -169,6 +172,11 @@ static inline unsigned long page_order(struct page *page) | |||
169 | return page_private(page); | 172 | return page_private(page); |
170 | } | 173 | } |
171 | 174 | ||
175 | static inline bool is_cow_mapping(vm_flags_t flags) | ||
176 | { | ||
177 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | ||
178 | } | ||
179 | |||
172 | /* mm/util.c */ | 180 | /* mm/util.c */ |
173 | void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | 181 | void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, |
174 | struct vm_area_struct *prev, struct rb_node *rb_parent); | 182 | struct vm_area_struct *prev, struct rb_node *rb_parent); |
@@ -184,26 +192,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
184 | } | 192 | } |
185 | 193 | ||
186 | /* | 194 | /* |
187 | * Called only in fault path, to determine if a new page is being | ||
188 | * mapped into a LOCKED vma. If it is, mark page as mlocked. | ||
189 | */ | ||
190 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | ||
191 | struct page *page) | ||
192 | { | ||
193 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
194 | |||
195 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) | ||
196 | return 0; | ||
197 | |||
198 | if (!TestSetPageMlocked(page)) { | ||
199 | mod_zone_page_state(page_zone(page), NR_MLOCK, | ||
200 | hpage_nr_pages(page)); | ||
201 | count_vm_event(UNEVICTABLE_PGMLOCKED); | ||
202 | } | ||
203 | return 1; | ||
204 | } | ||
205 | |||
206 | /* | ||
207 | * must be called with vma's mmap_sem held for read or write, and page locked. | 195 | * must be called with vma's mmap_sem held for read or write, and page locked. |
208 | */ | 196 | */ |
209 | extern void mlock_vma_page(struct page *page); | 197 | extern void mlock_vma_page(struct page *page); |
@@ -245,10 +233,6 @@ extern unsigned long vma_address(struct page *page, | |||
245 | struct vm_area_struct *vma); | 233 | struct vm_area_struct *vma); |
246 | #endif | 234 | #endif |
247 | #else /* !CONFIG_MMU */ | 235 | #else /* !CONFIG_MMU */ |
248 | static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) | ||
249 | { | ||
250 | return 0; | ||
251 | } | ||
252 | static inline void clear_page_mlock(struct page *page) { } | 236 | static inline void clear_page_mlock(struct page *page) { } |
253 | static inline void mlock_vma_page(struct page *page) { } | 237 | static inline void mlock_vma_page(struct page *page) { } |
254 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | 238 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } |
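The internal.h hunk also adds is_cow_mapping(): a mapping is copy-on-write when it may be written but is not shared. A userspace sketch of the flag test; the constants mirror common kernel values but should be treated as assumptions here:

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned long vm_flags_t;
    #define VM_SHARED   0x00000008UL    /* illustrative values */
    #define VM_MAYWRITE 0x00000020UL

    /* private (not shared) but writable => pages are copied on first write */
    static inline bool is_cow_mapping(vm_flags_t flags)
    {
        return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
    }

    int main(void)
    {
        printf("private writable: %d\n", is_cow_mapping(VM_MAYWRITE));              /* 1 */
        printf("shared writable : %d\n", is_cow_mapping(VM_SHARED | VM_MAYWRITE));  /* 0 */
        printf("private readonly: %d\n", is_cow_mapping(0));                        /* 0 */
        return 0;
    }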
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 8d2fcdfeff7f..736ade31d1dc 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -1300,7 +1300,7 @@ static void kmemleak_scan(void) | |||
1300 | /* | 1300 | /* |
1301 | * Struct page scanning for each node. | 1301 | * Struct page scanning for each node. |
1302 | */ | 1302 | */ |
1303 | lock_memory_hotplug(); | 1303 | get_online_mems(); |
1304 | for_each_online_node(i) { | 1304 | for_each_online_node(i) { |
1305 | unsigned long start_pfn = node_start_pfn(i); | 1305 | unsigned long start_pfn = node_start_pfn(i); |
1306 | unsigned long end_pfn = node_end_pfn(i); | 1306 | unsigned long end_pfn = node_end_pfn(i); |
@@ -1318,7 +1318,7 @@ static void kmemleak_scan(void) | |||
1318 | scan_block(page, page + 1, NULL, 1); | 1318 | scan_block(page, page + 1, NULL, 1); |
1319 | } | 1319 | } |
1320 | } | 1320 | } |
1321 | unlock_memory_hotplug(); | 1321 | put_online_mems(); |
1322 | 1322 | ||
1323 | /* | 1323 | /* |
1324 | * Scanning the task stacks (may introduce false negatives). | 1324 | * Scanning the task stacks (may introduce false negatives). |
diff --git a/mm/madvise.c b/mm/madvise.c index 539eeb96b323..a402f8fdc68e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, | |||
195 | for (; start < end; start += PAGE_SIZE) { | 195 | for (; start < end; start += PAGE_SIZE) { |
196 | index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 196 | index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
197 | 197 | ||
198 | page = find_get_page(mapping, index); | 198 | page = find_get_entry(mapping, index); |
199 | if (!radix_tree_exceptional_entry(page)) { | 199 | if (!radix_tree_exceptional_entry(page)) { |
200 | if (page) | 200 | if (page) |
201 | page_cache_release(page); | 201 | page_cache_release(page); |
diff --git a/mm/memblock.c b/mm/memblock.c index e9d6ca9a01a9..0aa0d2b07624 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -27,6 +27,9 @@ | |||
27 | 27 | ||
28 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 28 | static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
29 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; | 29 | static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; |
30 | #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP | ||
31 | static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock; | ||
32 | #endif | ||
30 | 33 | ||
31 | struct memblock memblock __initdata_memblock = { | 34 | struct memblock memblock __initdata_memblock = { |
32 | .memory.regions = memblock_memory_init_regions, | 35 | .memory.regions = memblock_memory_init_regions, |
@@ -37,6 +40,12 @@ struct memblock memblock __initdata_memblock = { | |||
37 | .reserved.cnt = 1, /* empty dummy entry */ | 40 | .reserved.cnt = 1, /* empty dummy entry */ |
38 | .reserved.max = INIT_MEMBLOCK_REGIONS, | 41 | .reserved.max = INIT_MEMBLOCK_REGIONS, |
39 | 42 | ||
43 | #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP | ||
44 | .physmem.regions = memblock_physmem_init_regions, | ||
45 | .physmem.cnt = 1, /* empty dummy entry */ | ||
46 | .physmem.max = INIT_PHYSMEM_REGIONS, | ||
47 | #endif | ||
48 | |||
40 | .bottom_up = false, | 49 | .bottom_up = false, |
41 | .current_limit = MEMBLOCK_ALLOC_ANYWHERE, | 50 | .current_limit = MEMBLOCK_ALLOC_ANYWHERE, |
42 | }; | 51 | }; |
@@ -472,7 +481,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
472 | } | 481 | } |
473 | 482 | ||
474 | /** | 483 | /** |
475 | * memblock_add_region - add new memblock region | 484 | * memblock_add_range - add new memblock region |
476 | * @type: memblock type to add new region into | 485 | * @type: memblock type to add new region into |
477 | * @base: base address of the new region | 486 | * @base: base address of the new region |
478 | * @size: size of the new region | 487 | * @size: size of the new region |
@@ -487,7 +496,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, | |||
487 | * RETURNS: | 496 | * RETURNS: |
488 | * 0 on success, -errno on failure. | 497 | * 0 on success, -errno on failure. |
489 | */ | 498 | */ |
490 | static int __init_memblock memblock_add_region(struct memblock_type *type, | 499 | int __init_memblock memblock_add_range(struct memblock_type *type, |
491 | phys_addr_t base, phys_addr_t size, | 500 | phys_addr_t base, phys_addr_t size, |
492 | int nid, unsigned long flags) | 501 | int nid, unsigned long flags) |
493 | { | 502 | { |
@@ -569,12 +578,12 @@ repeat: | |||
569 | int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, | 578 | int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, |
570 | int nid) | 579 | int nid) |
571 | { | 580 | { |
572 | return memblock_add_region(&memblock.memory, base, size, nid, 0); | 581 | return memblock_add_range(&memblock.memory, base, size, nid, 0); |
573 | } | 582 | } |
574 | 583 | ||
575 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) | 584 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) |
576 | { | 585 | { |
577 | return memblock_add_region(&memblock.memory, base, size, | 586 | return memblock_add_range(&memblock.memory, base, size, |
578 | MAX_NUMNODES, 0); | 587 | MAX_NUMNODES, 0); |
579 | } | 588 | } |
580 | 589 | ||
@@ -654,8 +663,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
654 | return 0; | 663 | return 0; |
655 | } | 664 | } |
656 | 665 | ||
657 | static int __init_memblock __memblock_remove(struct memblock_type *type, | 666 | int __init_memblock memblock_remove_range(struct memblock_type *type, |
658 | phys_addr_t base, phys_addr_t size) | 667 | phys_addr_t base, phys_addr_t size) |
659 | { | 668 | { |
660 | int start_rgn, end_rgn; | 669 | int start_rgn, end_rgn; |
661 | int i, ret; | 670 | int i, ret; |
@@ -671,9 +680,10 @@ static int __init_memblock __memblock_remove(struct memblock_type *type, | |||
671 | 680 | ||
672 | int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) | 681 | int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) |
673 | { | 682 | { |
674 | return __memblock_remove(&memblock.memory, base, size); | 683 | return memblock_remove_range(&memblock.memory, base, size); |
675 | } | 684 | } |
676 | 685 | ||
686 | |||
677 | int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) | 687 | int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) |
678 | { | 688 | { |
679 | memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", | 689 | memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", |
@@ -681,7 +691,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) | |||
681 | (unsigned long long)base + size - 1, | 691 | (unsigned long long)base + size - 1, |
682 | (void *)_RET_IP_); | 692 | (void *)_RET_IP_); |
683 | 693 | ||
684 | return __memblock_remove(&memblock.reserved, base, size); | 694 | return memblock_remove_range(&memblock.reserved, base, size); |
685 | } | 695 | } |
686 | 696 | ||
687 | static int __init_memblock memblock_reserve_region(phys_addr_t base, | 697 | static int __init_memblock memblock_reserve_region(phys_addr_t base, |
@@ -696,7 +706,7 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, | |||
696 | (unsigned long long)base + size - 1, | 706 | (unsigned long long)base + size - 1, |
697 | flags, (void *)_RET_IP_); | 707 | flags, (void *)_RET_IP_); |
698 | 708 | ||
699 | return memblock_add_region(_rgn, base, size, nid, flags); | 709 | return memblock_add_range(_rgn, base, size, nid, flags); |
700 | } | 710 | } |
701 | 711 | ||
702 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | 712 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) |
@@ -758,17 +768,19 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | |||
758 | } | 768 | } |
759 | 769 | ||
760 | /** | 770 | /** |
761 | * __next_free_mem_range - next function for for_each_free_mem_range() | 771 | * __next__mem_range - next function for for_each_free_mem_range() etc. |
762 | * @idx: pointer to u64 loop variable | 772 | * @idx: pointer to u64 loop variable |
763 | * @nid: node selector, %NUMA_NO_NODE for all nodes | 773 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
774 | * @type_a: pointer to memblock_type from where the range is taken | ||
775 | * @type_b: pointer to memblock_type which excludes memory from being taken | ||
764 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 776 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
765 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 777 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
766 | * @out_nid: ptr to int for nid of the range, can be %NULL | 778 | * @out_nid: ptr to int for nid of the range, can be %NULL |
767 | * | 779 | * |
768 | * Find the first free area from *@idx which matches @nid, fill the out | 780 | * Find the first area from *@idx which matches @nid, fill the out |
769 | * parameters, and update *@idx for the next iteration. The lower 32bit of | 781 | * parameters, and update *@idx for the next iteration. The lower 32bit of |
770 | * *@idx contains index into memory region and the upper 32bit indexes the | 782 | * *@idx contains index into type_a and the upper 32bit indexes the |
771 | * areas before each reserved region. For example, if reserved regions | 783 | * areas before each region in type_b. For example, if type_b regions |
772 | * look like the following, | 784 | * look like the following, |
773 | * | 785 | * |
774 | * 0:[0-16), 1:[32-48), 2:[128-130) | 786 | * 0:[0-16), 1:[32-48), 2:[128-130) |
@@ -780,53 +792,77 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) | |||
780 | * As both region arrays are sorted, the function advances the two indices | 792 | * As both region arrays are sorted, the function advances the two indices |
781 | * in lockstep and returns each intersection. | 793 | * in lockstep and returns each intersection. |
782 | */ | 794 | */ |
783 | void __init_memblock __next_free_mem_range(u64 *idx, int nid, | 795 | void __init_memblock __next_mem_range(u64 *idx, int nid, |
784 | phys_addr_t *out_start, | 796 | struct memblock_type *type_a, |
785 | phys_addr_t *out_end, int *out_nid) | 797 | struct memblock_type *type_b, |
798 | phys_addr_t *out_start, | ||
799 | phys_addr_t *out_end, int *out_nid) | ||
786 | { | 800 | { |
787 | struct memblock_type *mem = &memblock.memory; | 801 | int idx_a = *idx & 0xffffffff; |
788 | struct memblock_type *rsv = &memblock.reserved; | 802 | int idx_b = *idx >> 32; |
789 | int mi = *idx & 0xffffffff; | ||
790 | int ri = *idx >> 32; | ||
791 | 803 | ||
792 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | 804 | if (WARN_ONCE(nid == MAX_NUMNODES, |
805 | "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
793 | nid = NUMA_NO_NODE; | 806 | nid = NUMA_NO_NODE; |
794 | 807 | ||
795 | for ( ; mi < mem->cnt; mi++) { | 808 | for (; idx_a < type_a->cnt; idx_a++) { |
796 | struct memblock_region *m = &mem->regions[mi]; | 809 | struct memblock_region *m = &type_a->regions[idx_a]; |
810 | |||
797 | phys_addr_t m_start = m->base; | 811 | phys_addr_t m_start = m->base; |
798 | phys_addr_t m_end = m->base + m->size; | 812 | phys_addr_t m_end = m->base + m->size; |
813 | int m_nid = memblock_get_region_node(m); | ||
799 | 814 | ||
800 | /* only memory regions are associated with nodes, check it */ | 815 | /* only memory regions are associated with nodes, check it */ |
801 | if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) | 816 | if (nid != NUMA_NO_NODE && nid != m_nid) |
802 | continue; | 817 | continue; |
803 | 818 | ||
804 | /* scan areas before each reservation for intersection */ | 819 | if (!type_b) { |
805 | for ( ; ri < rsv->cnt + 1; ri++) { | 820 | if (out_start) |
806 | struct memblock_region *r = &rsv->regions[ri]; | 821 | *out_start = m_start; |
807 | phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; | 822 | if (out_end) |
808 | phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; | 823 | *out_end = m_end; |
824 | if (out_nid) | ||
825 | *out_nid = m_nid; | ||
826 | idx_a++; | ||
827 | *idx = (u32)idx_a | (u64)idx_b << 32; | ||
828 | return; | ||
829 | } | ||
830 | |||
831 | /* scan areas before each reservation */ | ||
832 | for (; idx_b < type_b->cnt + 1; idx_b++) { | ||
833 | struct memblock_region *r; | ||
834 | phys_addr_t r_start; | ||
835 | phys_addr_t r_end; | ||
809 | 836 | ||
810 | /* if ri advanced past mi, break out to advance mi */ | 837 | r = &type_b->regions[idx_b]; |
838 | r_start = idx_b ? r[-1].base + r[-1].size : 0; | ||
839 | r_end = idx_b < type_b->cnt ? | ||
840 | r->base : ULLONG_MAX; | ||
841 | |||
842 | /* | ||
843 | * if idx_b advanced past idx_a, | ||
844 | * break out to advance idx_a | ||
845 | */ | ||
811 | if (r_start >= m_end) | 846 | if (r_start >= m_end) |
812 | break; | 847 | break; |
813 | /* if the two regions intersect, we're done */ | 848 | /* if the two regions intersect, we're done */ |
814 | if (m_start < r_end) { | 849 | if (m_start < r_end) { |
815 | if (out_start) | 850 | if (out_start) |
816 | *out_start = max(m_start, r_start); | 851 | *out_start = |
852 | max(m_start, r_start); | ||
817 | if (out_end) | 853 | if (out_end) |
818 | *out_end = min(m_end, r_end); | 854 | *out_end = min(m_end, r_end); |
819 | if (out_nid) | 855 | if (out_nid) |
820 | *out_nid = memblock_get_region_node(m); | 856 | *out_nid = m_nid; |
821 | /* | 857 | /* |
822 | * The region which ends first is advanced | 858 | * The region which ends first is |
823 | * for the next iteration. | 859 | * advanced for the next iteration. |
824 | */ | 860 | */ |
825 | if (m_end <= r_end) | 861 | if (m_end <= r_end) |
826 | mi++; | 862 | idx_a++; |
827 | else | 863 | else |
828 | ri++; | 864 | idx_b++; |
829 | *idx = (u32)mi | (u64)ri << 32; | 865 | *idx = (u32)idx_a | (u64)idx_b << 32; |
830 | return; | 866 | return; |
831 | } | 867 | } |
832 | } | 868 | } |
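The rewritten iterator keeps walking two sorted region arrays with a single u64 cursor, as the comment above describes: the low 32 bits index type_a and the high 32 bits index the gaps before each type_b region. The userspace model below reproduces that lockstep walk over invented region data:

    #include <stdint.h>
    #include <stdio.h>

    struct region { uint64_t base, size; };

    /* toy "memory" (type_a) and "reserved" (type_b) arrays, sorted, non-overlapping */
    static const struct region memory[]   = { { 0, 130 } };
    static const struct region reserved[] = { { 16, 16 }, { 48, 80 } };
    #define NMEM (sizeof(memory) / sizeof(memory[0]))
    #define NRSV (sizeof(reserved) / sizeof(reserved[0]))

    /*
     * One u64 cursor packs both positions: low 32 bits walk memory[],
     * high 32 bits walk the gaps before/after each reserved[] entry.
     * Returns 1 and fills [*start, *end) with the next free range, 0 when done.
     */
    static int next_free_range(uint64_t *idx, uint64_t *start, uint64_t *end)
    {
        uint32_t ia = (uint32_t)*idx;
        uint32_t ib = (uint32_t)(*idx >> 32);

        for (; ia < NMEM; ia++) {
            uint64_t m_start = memory[ia].base;
            uint64_t m_end   = m_start + memory[ia].size;

            for (; ib < NRSV + 1; ib++) {
                uint64_t r_start = ib ? reserved[ib - 1].base + reserved[ib - 1].size : 0;
                uint64_t r_end   = ib < NRSV ? reserved[ib].base : UINT64_MAX;

                if (r_start >= m_end)
                    break;                      /* gap starts past this memory region */
                if (m_start < r_end) {          /* the two ranges intersect */
                    *start = m_start > r_start ? m_start : r_start;
                    *end   = m_end   < r_end   ? m_end   : r_end;
                    if (m_end <= r_end)
                        ia++;                   /* memory region exhausted first */
                    else
                        ib++;                   /* gap exhausted first */
                    *idx = (uint64_t)ia | ((uint64_t)ib << 32);
                    return 1;
                }
            }
        }
        *idx = UINT64_MAX;                      /* signal end of iteration */
        return 0;
    }

    int main(void)
    {
        uint64_t idx = 0, s, e;

        while (next_free_range(&idx, &s, &e))   /* prints [0-16) [32-48) [128-130) */
            printf("free: [%llu-%llu)\n", (unsigned long long)s, (unsigned long long)e);
        return 0;
    }

The generalisation in the patch is that type_a and type_b become parameters (with a NULL type_b meaning "no exclusions"), so the same walker serves both the free-range and the physmem iterators.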
@@ -837,57 +873,80 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
837 | } | 873 | } |
838 | 874 | ||
839 | /** | 875 | /** |
840 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() | 876 | * __next_mem_range_rev - generic next function for for_each_*_range_rev() |
877 | * | ||
878 | * Finds the next range from type_a which is not marked as unsuitable | ||
879 | * in type_b. | ||
880 | * | ||
841 | * @idx: pointer to u64 loop variable | 881 | * @idx: pointer to u64 loop variable |
842 | * @nid: node selector, %NUMA_NO_NODE for all nodes | 882 | * @nid: node selector, %NUMA_NO_NODE for all nodes |
883 | * @type_a: pointer to memblock_type from where the range is taken | ||
884 | * @type_b: pointer to memblock_type which excludes memory from being taken | ||
843 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 885 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
844 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 886 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
845 | * @out_nid: ptr to int for nid of the range, can be %NULL | 887 | * @out_nid: ptr to int for nid of the range, can be %NULL |
846 | * | 888 | * |
847 | * Reverse of __next_free_mem_range(). | 889 | * Reverse of __next_mem_range(). |
848 | * | ||
849 | * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't | ||
850 | * be able to hot-remove hotpluggable memory used by the kernel. So this | ||
851 | * function skip hotpluggable regions if needed when allocating memory for the | ||
852 | * kernel. | ||
853 | */ | 890 | */ |
854 | void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | 891 | void __init_memblock __next_mem_range_rev(u64 *idx, int nid, |
855 | phys_addr_t *out_start, | 892 | struct memblock_type *type_a, |
856 | phys_addr_t *out_end, int *out_nid) | 893 | struct memblock_type *type_b, |
894 | phys_addr_t *out_start, | ||
895 | phys_addr_t *out_end, int *out_nid) | ||
857 | { | 896 | { |
858 | struct memblock_type *mem = &memblock.memory; | 897 | int idx_a = *idx & 0xffffffff; |
859 | struct memblock_type *rsv = &memblock.reserved; | 898 | int idx_b = *idx >> 32; |
860 | int mi = *idx & 0xffffffff; | ||
861 | int ri = *idx >> 32; | ||
862 | 899 | ||
863 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | 900 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) |
864 | nid = NUMA_NO_NODE; | 901 | nid = NUMA_NO_NODE; |
865 | 902 | ||
866 | if (*idx == (u64)ULLONG_MAX) { | 903 | if (*idx == (u64)ULLONG_MAX) { |
867 | mi = mem->cnt - 1; | 904 | idx_a = type_a->cnt - 1; |
868 | ri = rsv->cnt; | 905 | idx_b = type_b->cnt; |
869 | } | 906 | } |
870 | 907 | ||
871 | for ( ; mi >= 0; mi--) { | 908 | for (; idx_a >= 0; idx_a--) { |
872 | struct memblock_region *m = &mem->regions[mi]; | 909 | struct memblock_region *m = &type_a->regions[idx_a]; |
910 | |||
873 | phys_addr_t m_start = m->base; | 911 | phys_addr_t m_start = m->base; |
874 | phys_addr_t m_end = m->base + m->size; | 912 | phys_addr_t m_end = m->base + m->size; |
913 | int m_nid = memblock_get_region_node(m); | ||
875 | 914 | ||
876 | /* only memory regions are associated with nodes, check it */ | 915 | /* only memory regions are associated with nodes, check it */ |
877 | if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) | 916 | if (nid != NUMA_NO_NODE && nid != m_nid) |
878 | continue; | 917 | continue; |
879 | 918 | ||
880 | /* skip hotpluggable memory regions if needed */ | 919 | /* skip hotpluggable memory regions if needed */ |
881 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) | 920 | if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) |
882 | continue; | 921 | continue; |
883 | 922 | ||
884 | /* scan areas before each reservation for intersection */ | 923 | if (!type_b) { |
885 | for ( ; ri >= 0; ri--) { | 924 | if (out_start) |
886 | struct memblock_region *r = &rsv->regions[ri]; | 925 | *out_start = m_start; |
887 | phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; | 926 | if (out_end) |
888 | phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; | 927 | *out_end = m_end; |
928 | if (out_nid) | ||
929 | *out_nid = m_nid; | ||
930 | idx_a++; | ||
931 | *idx = (u32)idx_a | (u64)idx_b << 32; | ||
932 | return; | ||
933 | } | ||
934 | |||
935 | /* scan areas before each reservation */ | ||
936 | for (; idx_b >= 0; idx_b--) { | ||
937 | struct memblock_region *r; | ||
938 | phys_addr_t r_start; | ||
939 | phys_addr_t r_end; | ||
940 | |||
941 | r = &type_b->regions[idx_b]; | ||
942 | r_start = idx_b ? r[-1].base + r[-1].size : 0; | ||
943 | r_end = idx_b < type_b->cnt ? | ||
944 | r->base : ULLONG_MAX; | ||
945 | /* | ||
946 | * if idx_b advanced past idx_a, | ||
947 | * break out to advance idx_a | ||
948 | */ | ||
889 | 949 | ||
890 | /* if ri advanced past mi, break out to advance mi */ | ||
891 | if (r_end <= m_start) | 950 | if (r_end <= m_start) |
892 | break; | 951 | break; |
893 | /* if the two regions intersect, we're done */ | 952 | /* if the two regions intersect, we're done */ |
@@ -897,18 +956,17 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, | |||
897 | if (out_end) | 956 | if (out_end) |
898 | *out_end = min(m_end, r_end); | 957 | *out_end = min(m_end, r_end); |
899 | if (out_nid) | 958 | if (out_nid) |
900 | *out_nid = memblock_get_region_node(m); | 959 | *out_nid = m_nid; |
901 | |||
902 | if (m_start >= r_start) | 960 | if (m_start >= r_start) |
903 | mi--; | 961 | idx_a--; |
904 | else | 962 | else |
905 | ri--; | 963 | idx_b--; |
906 | *idx = (u32)mi | (u64)ri << 32; | 964 | *idx = (u32)idx_a | (u64)idx_b << 32; |
907 | return; | 965 | return; |
908 | } | 966 | } |
909 | } | 967 | } |
910 | } | 968 | } |
911 | 969 | /* signal end of iteration */ | |
912 | *idx = ULLONG_MAX; | 970 | *idx = ULLONG_MAX; |
913 | } | 971 | } |
914 | 972 | ||
@@ -975,22 +1033,35 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, | |||
975 | } | 1033 | } |
976 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 1034 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
977 | 1035 | ||
978 | static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, | 1036 | static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, |
979 | phys_addr_t align, phys_addr_t max_addr, | 1037 | phys_addr_t align, phys_addr_t start, |
980 | int nid) | 1038 | phys_addr_t end, int nid) |
981 | { | 1039 | { |
982 | phys_addr_t found; | 1040 | phys_addr_t found; |
983 | 1041 | ||
984 | if (!align) | 1042 | if (!align) |
985 | align = SMP_CACHE_BYTES; | 1043 | align = SMP_CACHE_BYTES; |
986 | 1044 | ||
987 | found = memblock_find_in_range_node(size, align, 0, max_addr, nid); | 1045 | found = memblock_find_in_range_node(size, align, start, end, nid); |
988 | if (found && !memblock_reserve(found, size)) | 1046 | if (found && !memblock_reserve(found, size)) |
989 | return found; | 1047 | return found; |
990 | 1048 | ||
991 | return 0; | 1049 | return 0; |
992 | } | 1050 | } |
993 | 1051 | ||
1052 | phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, | ||
1053 | phys_addr_t start, phys_addr_t end) | ||
1054 | { | ||
1055 | return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); | ||
1056 | } | ||
1057 | |||
1058 | static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, | ||
1059 | phys_addr_t align, phys_addr_t max_addr, | ||
1060 | int nid) | ||
1061 | { | ||
1062 | return memblock_alloc_range_nid(size, align, 0, max_addr, nid); | ||
1063 | } | ||
1064 | |||
994 | phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) | 1065 | phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) |
995 | { | 1066 | { |
996 | return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); | 1067 | return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); |
@@ -1201,7 +1272,7 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) | |||
1201 | __func__, (u64)base, (u64)base + size - 1, | 1272 | __func__, (u64)base, (u64)base + size - 1, |
1202 | (void *)_RET_IP_); | 1273 | (void *)_RET_IP_); |
1203 | kmemleak_free_part(__va(base), size); | 1274 | kmemleak_free_part(__va(base), size); |
1204 | __memblock_remove(&memblock.reserved, base, size); | 1275 | memblock_remove_range(&memblock.reserved, base, size); |
1205 | } | 1276 | } |
1206 | 1277 | ||
1207 | /* | 1278 | /* |
@@ -1287,8 +1358,10 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) | |||
1287 | } | 1358 | } |
1288 | 1359 | ||
1289 | /* truncate both memory and reserved regions */ | 1360 | /* truncate both memory and reserved regions */ |
1290 | __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX); | 1361 | memblock_remove_range(&memblock.memory, max_addr, |
1291 | __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX); | 1362 | (phys_addr_t)ULLONG_MAX); |
1363 | memblock_remove_range(&memblock.reserved, max_addr, | ||
1364 | (phys_addr_t)ULLONG_MAX); | ||
1292 | } | 1365 | } |
1293 | 1366 | ||
1294 | static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) | 1367 | static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) |
@@ -1329,9 +1402,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, | |||
1329 | if (mid == -1) | 1402 | if (mid == -1) |
1330 | return -1; | 1403 | return -1; |
1331 | 1404 | ||
1332 | *start_pfn = type->regions[mid].base >> PAGE_SHIFT; | 1405 | *start_pfn = PFN_DOWN(type->regions[mid].base); |
1333 | *end_pfn = (type->regions[mid].base + type->regions[mid].size) | 1406 | *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); |
1334 | >> PAGE_SHIFT; | ||
1335 | 1407 | ||
1336 | return type->regions[mid].nid; | 1408 | return type->regions[mid].nid; |
1337 | } | 1409 | } |
@@ -1502,6 +1574,9 @@ static int __init memblock_init_debugfs(void) | |||
1502 | return -ENXIO; | 1574 | return -ENXIO; |
1503 | debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); | 1575 | debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); |
1504 | debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); | 1576 | debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); |
1577 | #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP | ||
1578 | debugfs_create_file("physmem", S_IRUGO, root, &memblock.physmem, &memblock_debug_fops); | ||
1579 | #endif | ||
1505 | 1580 | ||
1506 | return 0; | 1581 | return 0; |
1507 | } | 1582 | } |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c47dffdcb246..a500cb0594c4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -80,7 +80,7 @@ int do_swap_account __read_mostly; | |||
80 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | 80 | #ifdef CONFIG_MEMCG_SWAP_ENABLED |
81 | static int really_do_swap_account __initdata = 1; | 81 | static int really_do_swap_account __initdata = 1; |
82 | #else | 82 | #else |
83 | static int really_do_swap_account __initdata = 0; | 83 | static int really_do_swap_account __initdata; |
84 | #endif | 84 | #endif |
85 | 85 | ||
86 | #else | 86 | #else |
@@ -357,10 +357,9 @@ struct mem_cgroup { | |||
357 | struct cg_proto tcp_mem; | 357 | struct cg_proto tcp_mem; |
358 | #endif | 358 | #endif |
359 | #if defined(CONFIG_MEMCG_KMEM) | 359 | #if defined(CONFIG_MEMCG_KMEM) |
360 | /* analogous to slab_common's slab_caches list. per-memcg */ | 360 | /* analogous to slab_common's slab_caches list, but per-memcg; |
361 | * protected by memcg_slab_mutex */ | ||
361 | struct list_head memcg_slab_caches; | 362 | struct list_head memcg_slab_caches; |
362 | /* Not a spinlock, we can take a lot of time walking the list */ | ||
363 | struct mutex slab_caches_mutex; | ||
364 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | 363 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ |
365 | int kmemcg_id; | 364 | int kmemcg_id; |
366 | #endif | 365 | #endif |
@@ -1077,9 +1076,18 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
1077 | 1076 | ||
1078 | rcu_read_lock(); | 1077 | rcu_read_lock(); |
1079 | do { | 1078 | do { |
1080 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1079 | /* |
1081 | if (unlikely(!memcg)) | 1080 | * Page cache insertions can happen withou an |
1081 | * actual mm context, e.g. during disk probing | ||
1082 | * on boot, loopback IO, acct() writes etc. | ||
1083 | */ | ||
1084 | if (unlikely(!mm)) | ||
1082 | memcg = root_mem_cgroup; | 1085 | memcg = root_mem_cgroup; |
1086 | else { | ||
1087 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | ||
1088 | if (unlikely(!memcg)) | ||
1089 | memcg = root_mem_cgroup; | ||
1090 | } | ||
1083 | } while (!css_tryget(&memcg->css)); | 1091 | } while (!css_tryget(&memcg->css)); |
1084 | rcu_read_unlock(); | 1092 | rcu_read_unlock(); |
1085 | return memcg; | 1093 | return memcg; |
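The hunk above makes get_mem_cgroup_from_mm() tolerate a NULL mm by charging the root cgroup, while keeping the retry loop around css_tryget() for a target that is concurrently going away. A simplified userspace model of that selection logic; the struct names and the tryget stub are invented for illustration:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct group { const char *name; int refs; bool dying; };
    struct task  { struct group *memcg; };

    static struct group root_group = { "root", 1, false };

    /* models css_tryget(): fails once the group is being torn down */
    static bool group_tryget(struct group *g)
    {
        if (g->dying)
            return false;
        g->refs++;
        return true;
    }

    /*
     * Pick the charge target: with no task context (the NULL-mm case in the
     * hunk above: disk probing at boot, loopback IO, acct() writes, ...)
     * fall back to the root group; otherwise use the task's group, again
     * falling back to root if none is set. Retry until a reference sticks;
     * in the kernel the candidate can change between retries under RCU.
     */
    static struct group *get_group_from_task(const struct task *t)
    {
        struct group *g;

        do {
            if (!t || !t->memcg)
                g = &root_group;
            else
                g = t->memcg;
        } while (!group_tryget(g));

        return g;
    }

    int main(void)
    {
        struct group web = { "web", 1, false };
        struct task  t   = { &web };

        printf("%s\n", get_group_from_task(&t)->name);   /* web  */
        printf("%s\n", get_group_from_task(NULL)->name); /* root */
        return 0;
    }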
@@ -1586,23 +1594,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) | |||
1586 | } | 1594 | } |
1587 | 1595 | ||
1588 | /* | 1596 | /* |
1589 | * 2 routines for checking "mem" is under move_account() or not. | 1597 | * A routine for checking "mem" is under move_account() or not. |
1590 | * | ||
1591 | * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This | ||
1592 | * is used for avoiding races in accounting. If true, | ||
1593 | * pc->mem_cgroup may be overwritten. | ||
1594 | * | 1598 | * |
1595 | * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or | 1599 | * Checking a cgroup is mc.from or mc.to or under hierarchy of |
1596 | * under hierarchy of moving cgroups. This is for | 1600 | * moving cgroups. This is for waiting at high-memory pressure |
1597 | * waiting at hith-memory prressure caused by "move". | 1601 | * caused by "move". |
1598 | */ | 1602 | */ |
1599 | |||
1600 | static bool mem_cgroup_stolen(struct mem_cgroup *memcg) | ||
1601 | { | ||
1602 | VM_BUG_ON(!rcu_read_lock_held()); | ||
1603 | return atomic_read(&memcg->moving_account) > 0; | ||
1604 | } | ||
1605 | |||
1606 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) | 1603 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) |
1607 | { | 1604 | { |
1608 | struct mem_cgroup *from; | 1605 | struct mem_cgroup *from; |
@@ -1645,7 +1642,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) | |||
1645 | * Take this lock when | 1642 | * Take this lock when |
1646 | * - a code tries to modify page's memcg while it's USED. | 1643 | * - a code tries to modify page's memcg while it's USED. |
1647 | * - a code tries to modify page state accounting in a memcg. | 1644 | * - a code tries to modify page state accounting in a memcg. |
1648 | * see mem_cgroup_stolen(), too. | ||
1649 | */ | 1645 | */ |
1650 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, | 1646 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, |
1651 | unsigned long *flags) | 1647 | unsigned long *flags) |
@@ -2280,12 +2276,11 @@ cleanup: | |||
2280 | } | 2276 | } |
2281 | 2277 | ||
2282 | /* | 2278 | /* |
2283 | * Currently used to update mapped file statistics, but the routine can be | 2279 | * Used to update mapped file or writeback or other statistics. |
2284 | * generalized to update other statistics as well. | ||
2285 | * | 2280 | * |
2286 | * Notes: Race condition | 2281 | * Notes: Race condition |
2287 | * | 2282 | * |
2288 | * We usually use page_cgroup_lock() for accessing page_cgroup member but | 2283 | * We usually use lock_page_cgroup() for accessing page_cgroup member but |
2289 | * it tends to be costly. But considering some conditions, we don't need | 2284 | * it tends to be costly. But considering some conditions, we don't need |
2290 | * to do so _always_. | 2285 | * to do so _always_. |
2291 | * | 2286 | * |
@@ -2299,8 +2294,8 @@ cleanup: | |||
2299 | * by flags. | 2294 | * by flags. |
2300 | * | 2295 | * |
2301 | * Considering "move", this is an only case we see a race. To make the race | 2296 | * Considering "move", this is an only case we see a race. To make the race |
2302 | * small, we check mm->moving_account and detect there are possibility of race | 2297 | * small, we check memcg->moving_account and detect there are possibility |
2303 | * If there is, we take a lock. | 2298 | * of race or not. If there is, we take a lock. |
2304 | */ | 2299 | */ |
2305 | 2300 | ||
2306 | void __mem_cgroup_begin_update_page_stat(struct page *page, | 2301 | void __mem_cgroup_begin_update_page_stat(struct page *page, |
@@ -2318,9 +2313,10 @@ again: | |||
2318 | * If this memory cgroup is not under account moving, we don't | 2313 | * If this memory cgroup is not under account moving, we don't |
2319 | * need to take move_lock_mem_cgroup(). Because we already hold | 2314 | * need to take move_lock_mem_cgroup(). Because we already hold |
2320 | * rcu_read_lock(), any calls to move_account will be delayed until | 2315 | * rcu_read_lock(), any calls to move_account will be delayed until |
2321 | * rcu_read_unlock() if mem_cgroup_stolen() == true. | 2316 | * rcu_read_unlock(). |
2322 | */ | 2317 | */ |
2323 | if (!mem_cgroup_stolen(memcg)) | 2318 | VM_BUG_ON(!rcu_read_lock_held()); |
2319 | if (atomic_read(&memcg->moving_account) <= 0) | ||
2324 | return; | 2320 | return; |
2325 | 2321 | ||
2326 | move_lock_mem_cgroup(memcg, flags); | 2322 | move_lock_mem_cgroup(memcg, flags); |
@@ -2428,7 +2424,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) | |||
2428 | */ | 2424 | */ |
2429 | static void drain_local_stock(struct work_struct *dummy) | 2425 | static void drain_local_stock(struct work_struct *dummy) |
2430 | { | 2426 | { |
2431 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | 2427 | struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); |
2432 | drain_stock(stock); | 2428 | drain_stock(stock); |
2433 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | 2429 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); |
2434 | } | 2430 | } |
@@ -2675,7 +2671,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, | |||
2675 | * free their memory. | 2671 | * free their memory. |
2676 | */ | 2672 | */ |
2677 | if (unlikely(test_thread_flag(TIF_MEMDIE) || | 2673 | if (unlikely(test_thread_flag(TIF_MEMDIE) || |
2678 | fatal_signal_pending(current))) | 2674 | fatal_signal_pending(current) || |
2675 | current->flags & PF_EXITING)) | ||
2679 | goto bypass; | 2676 | goto bypass; |
2680 | 2677 | ||
2681 | if (unlikely(task_in_memcg_oom(current))) | 2678 | if (unlikely(task_in_memcg_oom(current))) |
@@ -2903,6 +2900,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2903 | static DEFINE_MUTEX(set_limit_mutex); | 2900 | static DEFINE_MUTEX(set_limit_mutex); |
2904 | 2901 | ||
2905 | #ifdef CONFIG_MEMCG_KMEM | 2902 | #ifdef CONFIG_MEMCG_KMEM |
2903 | /* | ||
2904 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or | ||
2905 | * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. | ||
2906 | */ | ||
2907 | static DEFINE_MUTEX(memcg_slab_mutex); | ||
2908 | |||
2906 | static DEFINE_MUTEX(activate_kmem_mutex); | 2909 | static DEFINE_MUTEX(activate_kmem_mutex); |
2907 | 2910 | ||
2908 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | 2911 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) |
@@ -2935,10 +2938,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) | |||
2935 | 2938 | ||
2936 | print_slabinfo_header(m); | 2939 | print_slabinfo_header(m); |
2937 | 2940 | ||
2938 | mutex_lock(&memcg->slab_caches_mutex); | 2941 | mutex_lock(&memcg_slab_mutex); |
2939 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) | 2942 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) |
2940 | cache_show(memcg_params_to_cache(params), m); | 2943 | cache_show(memcg_params_to_cache(params), m); |
2941 | mutex_unlock(&memcg->slab_caches_mutex); | 2944 | mutex_unlock(&memcg_slab_mutex); |
2942 | 2945 | ||
2943 | return 0; | 2946 | return 0; |
2944 | } | 2947 | } |
@@ -3040,8 +3043,6 @@ void memcg_update_array_size(int num) | |||
3040 | memcg_limited_groups_array_size = memcg_caches_array_size(num); | 3043 | memcg_limited_groups_array_size = memcg_caches_array_size(num); |
3041 | } | 3044 | } |
3042 | 3045 | ||
3043 | static void kmem_cache_destroy_work_func(struct work_struct *w); | ||
3044 | |||
3045 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | 3046 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups) |
3046 | { | 3047 | { |
3047 | struct memcg_cache_params *cur_params = s->memcg_params; | 3048 | struct memcg_cache_params *cur_params = s->memcg_params; |
@@ -3094,29 +3095,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
3094 | return 0; | 3095 | return 0; |
3095 | } | 3096 | } |
3096 | 3097 | ||
3097 | char *memcg_create_cache_name(struct mem_cgroup *memcg, | ||
3098 | struct kmem_cache *root_cache) | ||
3099 | { | ||
3100 | static char *buf = NULL; | ||
3101 | |||
3102 | /* | ||
3103 | * We need a mutex here to protect the shared buffer. Since this is | ||
3104 | * expected to be called only on cache creation, we can employ the | ||
3105 | * slab_mutex for that purpose. | ||
3106 | */ | ||
3107 | lockdep_assert_held(&slab_mutex); | ||
3108 | |||
3109 | if (!buf) { | ||
3110 | buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
3111 | if (!buf) | ||
3112 | return NULL; | ||
3113 | } | ||
3114 | |||
3115 | cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); | ||
3116 | return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, | ||
3117 | memcg_cache_id(memcg), buf); | ||
3118 | } | ||
3119 | |||
3120 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | 3098 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, |
3121 | struct kmem_cache *root_cache) | 3099 | struct kmem_cache *root_cache) |
3122 | { | 3100 | { |
@@ -3138,8 +3116,6 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | |||
3138 | if (memcg) { | 3116 | if (memcg) { |
3139 | s->memcg_params->memcg = memcg; | 3117 | s->memcg_params->memcg = memcg; |
3140 | s->memcg_params->root_cache = root_cache; | 3118 | s->memcg_params->root_cache = root_cache; |
3141 | INIT_WORK(&s->memcg_params->destroy, | ||
3142 | kmem_cache_destroy_work_func); | ||
3143 | css_get(&memcg->css); | 3119 | css_get(&memcg->css); |
3144 | } else | 3120 | } else |
3145 | s->memcg_params->is_root_cache = true; | 3121 | s->memcg_params->is_root_cache = true; |
@@ -3156,24 +3132,37 @@ void memcg_free_cache_params(struct kmem_cache *s) | |||
3156 | kfree(s->memcg_params); | 3132 | kfree(s->memcg_params); |
3157 | } | 3133 | } |
3158 | 3134 | ||
3159 | void memcg_register_cache(struct kmem_cache *s) | 3135 | static void memcg_register_cache(struct mem_cgroup *memcg, |
3136 | struct kmem_cache *root_cache) | ||
3160 | { | 3137 | { |
3161 | struct kmem_cache *root; | 3138 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by |
3162 | struct mem_cgroup *memcg; | 3139 | memcg_slab_mutex */ |
3140 | struct kmem_cache *cachep; | ||
3163 | int id; | 3141 | int id; |
3164 | 3142 | ||
3165 | if (is_root_cache(s)) | 3143 | lockdep_assert_held(&memcg_slab_mutex); |
3144 | |||
3145 | id = memcg_cache_id(memcg); | ||
3146 | |||
3147 | /* | ||
3148 | * Since per-memcg caches are created asynchronously on first | ||
3149 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
3150 | * create the same cache, but only one of them may succeed. | ||
3151 | */ | ||
3152 | if (cache_from_memcg_idx(root_cache, id)) | ||
3166 | return; | 3153 | return; |
3167 | 3154 | ||
3155 | cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); | ||
3156 | cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); | ||
3168 | /* | 3157 | /* |
3169 | * Holding the slab_mutex assures nobody will touch the memcg_caches | 3158 | * If we could not create a memcg cache, do not complain, because |
3170 | * array while we are modifying it. | 3159 | * that's not critical at all as we can always proceed with the root |
3160 | * cache. | ||
3171 | */ | 3161 | */ |
3172 | lockdep_assert_held(&slab_mutex); | 3162 | if (!cachep) |
3163 | return; | ||
3173 | 3164 | ||
3174 | root = s->memcg_params->root_cache; | 3165 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); |
3175 | memcg = s->memcg_params->memcg; | ||
3176 | id = memcg_cache_id(memcg); | ||
3177 | 3166 | ||
3178 | /* | 3167 | /* |
3179 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | 3168 | * Since readers won't lock (see cache_from_memcg_idx()), we need a |
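The new memcg_register_cache() runs entirely under memcg_slab_mutex, uses a static name buffer that the same mutex protects, and treats both a lost creation race and a failed creation as harmless no-ops. A compact pthread sketch of that registration pattern; the names are illustrative, with only the "%s(%d:%s)" name format taken from the removed memcg_create_cache_name():

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define MAX_GROUPS 4

    struct cache { char name[64]; };

    static struct cache *per_group_cache[MAX_GROUPS];
    static pthread_mutex_t slab_mutex = PTHREAD_MUTEX_INITIALIZER;
    static char name_buf[64];               /* shared scratch buffer, guarded by slab_mutex */

    /* register a per-group clone of "base"; racing or repeated callers are no-ops */
    static void register_group_cache(int id, const char *base, const char *group)
    {
        struct cache *c;

        pthread_mutex_lock(&slab_mutex);
        if (per_group_cache[id])            /* someone else already registered it */
            goto out;

        snprintf(name_buf, sizeof(name_buf), "%s(%d:%s)", base, id, group);
        c = malloc(sizeof(*c));
        if (!c)                             /* creation failure is not fatal: keep using base */
            goto out;
        strcpy(c->name, name_buf);
        per_group_cache[id] = c;
    out:
        pthread_mutex_unlock(&slab_mutex);
    }

    int main(void)                          /* build with: cc -pthread example.c */
    {
        register_group_cache(1, "dentry", "web");
        register_group_cache(1, "dentry", "web");   /* duplicate attempt: ignored */
        printf("%s\n", per_group_cache[1]->name);   /* dentry(1:web) */
        return 0;
    }

Serialising creation and teardown on one mutex is what lets the patch drop the per-memcg slab_caches_mutex and the shared-buffer dance that previously piggybacked on slab_mutex.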
@@ -3182,49 +3171,30 @@ void memcg_register_cache(struct kmem_cache *s) | |||
3182 | */ | 3171 | */ |
3183 | smp_wmb(); | 3172 | smp_wmb(); |
3184 | 3173 | ||
3185 | /* | 3174 | BUG_ON(root_cache->memcg_params->memcg_caches[id]); |
3186 | * Initialize the pointer to this cache in its parent's memcg_params | 3175 | root_cache->memcg_params->memcg_caches[id] = cachep; |
3187 | * before adding it to the memcg_slab_caches list, otherwise we can | ||
3188 | * fail to convert memcg_params_to_cache() while traversing the list. | ||
3189 | */ | ||
3190 | VM_BUG_ON(root->memcg_params->memcg_caches[id]); | ||
3191 | root->memcg_params->memcg_caches[id] = s; | ||
3192 | |||
3193 | mutex_lock(&memcg->slab_caches_mutex); | ||
3194 | list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); | ||
3195 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3196 | } | 3176 | } |
3197 | 3177 | ||
3198 | void memcg_unregister_cache(struct kmem_cache *s) | 3178 | static void memcg_unregister_cache(struct kmem_cache *cachep) |
3199 | { | 3179 | { |
3200 | struct kmem_cache *root; | 3180 | struct kmem_cache *root_cache; |
3201 | struct mem_cgroup *memcg; | 3181 | struct mem_cgroup *memcg; |
3202 | int id; | 3182 | int id; |
3203 | 3183 | ||
3204 | if (is_root_cache(s)) | 3184 | lockdep_assert_held(&memcg_slab_mutex); |
3205 | return; | ||
3206 | 3185 | ||
3207 | /* | 3186 | BUG_ON(is_root_cache(cachep)); |
3208 | * Holding the slab_mutex assures nobody will touch the memcg_caches | ||
3209 | * array while we are modifying it. | ||
3210 | */ | ||
3211 | lockdep_assert_held(&slab_mutex); | ||
3212 | 3187 | ||
3213 | root = s->memcg_params->root_cache; | 3188 | root_cache = cachep->memcg_params->root_cache; |
3214 | memcg = s->memcg_params->memcg; | 3189 | memcg = cachep->memcg_params->memcg; |
3215 | id = memcg_cache_id(memcg); | 3190 | id = memcg_cache_id(memcg); |
3216 | 3191 | ||
3217 | mutex_lock(&memcg->slab_caches_mutex); | 3192 | BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); |
3218 | list_del(&s->memcg_params->list); | 3193 | root_cache->memcg_params->memcg_caches[id] = NULL; |
3219 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3220 | 3194 | ||
3221 | /* | 3195 | list_del(&cachep->memcg_params->list); |
3222 | * Clear the pointer to this cache in its parent's memcg_params only | 3196 | |
3223 | * after removing it from the memcg_slab_caches list, otherwise we can | 3197 | kmem_cache_destroy(cachep); |
3224 | * fail to convert memcg_params_to_cache() while traversing the list. | ||
3225 | */ | ||
3226 | VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); | ||
3227 | root->memcg_params->memcg_caches[id] = NULL; | ||
3228 | } | 3198 | } |
3229 | 3199 | ||
3230 | /* | 3200 | /* |
@@ -3258,144 +3228,61 @@ static inline void memcg_resume_kmem_account(void) | |||
3258 | current->memcg_kmem_skip_account--; | 3228 | current->memcg_kmem_skip_account--; |
3259 | } | 3229 | } |
3260 | 3230 | ||
3261 | static void kmem_cache_destroy_work_func(struct work_struct *w) | 3231 | int __memcg_cleanup_cache_params(struct kmem_cache *s) |
3262 | { | ||
3263 | struct kmem_cache *cachep; | ||
3264 | struct memcg_cache_params *p; | ||
3265 | |||
3266 | p = container_of(w, struct memcg_cache_params, destroy); | ||
3267 | |||
3268 | cachep = memcg_params_to_cache(p); | ||
3269 | |||
3270 | /* | ||
3271 | * If we get down to 0 after shrink, we could delete right away. | ||
3272 | * However, memcg_release_pages() already puts us back in the workqueue | ||
3273 | * in that case. If we proceed deleting, we'll get a dangling | ||
3274 | * reference, and removing the object from the workqueue in that case | ||
3275 | * is unnecessary complication. We are not a fast path. | ||
3276 | * | ||
3277 | * Note that this case is fundamentally different from racing with | ||
3278 | * shrink_slab(): if memcg_cgroup_destroy_cache() is called in | ||
3279 | * kmem_cache_shrink, not only we would be reinserting a dead cache | ||
3280 | * into the queue, but doing so from inside the worker racing to | ||
3281 | * destroy it. | ||
3282 | * | ||
3283 | * So if we aren't down to zero, we'll just schedule a worker and try | ||
3284 | * again | ||
3285 | */ | ||
3286 | if (atomic_read(&cachep->memcg_params->nr_pages) != 0) | ||
3287 | kmem_cache_shrink(cachep); | ||
3288 | else | ||
3289 | kmem_cache_destroy(cachep); | ||
3290 | } | ||
3291 | |||
3292 | void mem_cgroup_destroy_cache(struct kmem_cache *cachep) | ||
3293 | { | ||
3294 | if (!cachep->memcg_params->dead) | ||
3295 | return; | ||
3296 | |||
3297 | /* | ||
3298 | * There are many ways in which we can get here. | ||
3299 | * | ||
3300 | * We can get to a memory-pressure situation while the delayed work is | ||
3301 | * still pending to run. The vmscan shrinkers can then release all | ||
3302 | * cache memory and get us to destruction. If this is the case, we'll | ||
3303 | * be executed twice, which is a bug (the second time will execute over | ||
3304 | * bogus data). In this case, cancelling the work should be fine. | ||
3305 | * | ||
3306 | * But we can also get here from the worker itself, if | ||
3307 | * kmem_cache_shrink is enough to shake all the remaining objects and | ||
3308 | * get the page count to 0. In this case, we'll deadlock if we try to | ||
3309 | * cancel the work (the worker runs with an internal lock held, which | ||
3310 | * is the same lock we would hold for cancel_work_sync().) | ||
3311 | * | ||
3312 | * Since we can't possibly know who got us here, just refrain from | ||
3313 | * running if there is already work pending | ||
3314 | */ | ||
3315 | if (work_pending(&cachep->memcg_params->destroy)) | ||
3316 | return; | ||
3317 | /* | ||
3318 | * We have to defer the actual destroying to a workqueue, because | ||
3319 | * we might currently be in a context that cannot sleep. | ||
3320 | */ | ||
3321 | schedule_work(&cachep->memcg_params->destroy); | ||
3322 | } | ||
3323 | |||
3324 | int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) | ||
3325 | { | 3232 | { |
3326 | struct kmem_cache *c; | 3233 | struct kmem_cache *c; |
3327 | int i, failed = 0; | 3234 | int i, failed = 0; |
3328 | 3235 | ||
3329 | /* | 3236 | mutex_lock(&memcg_slab_mutex); |
3330 | * If the cache is being destroyed, we trust that there is no one else | ||
3331 | * requesting objects from it. Even if there are, the sanity checks in | ||
3332 | * kmem_cache_destroy should have caught this ill case. | ||
3333 | * | ||
3334 | * Still, we don't want anyone else freeing memcg_caches under our | ||
3335 | * noses, which can happen if a new memcg comes to life. As usual, | ||
3336 | * we'll take the activate_kmem_mutex to protect ourselves against | ||
3337 | * this. | ||
3338 | */ | ||
3339 | mutex_lock(&activate_kmem_mutex); | ||
3340 | for_each_memcg_cache_index(i) { | 3237 | for_each_memcg_cache_index(i) { |
3341 | c = cache_from_memcg_idx(s, i); | 3238 | c = cache_from_memcg_idx(s, i); |
3342 | if (!c) | 3239 | if (!c) |
3343 | continue; | 3240 | continue; |
3344 | 3241 | ||
3345 | /* | 3242 | memcg_unregister_cache(c); |
3346 | * We will now manually delete the caches, so to avoid races | ||
3347 | * we need to cancel all pending destruction workers and | ||
3348 | * proceed with destruction ourselves. | ||
3349 | * | ||
3350 | * kmem_cache_destroy() will call kmem_cache_shrink internally, | ||
3351 | * and that could spawn the workers again: it is likely that | ||
3352 | * the cache still has active pages until this very moment. | ||
3353 | * This would lead us back to mem_cgroup_destroy_cache. | ||
3354 | * | ||
3355 | * But that will not execute at all if the "dead" flag is not | ||
3356 | * set, so flip it down to guarantee we are in control. | ||
3357 | */ | ||
3358 | c->memcg_params->dead = false; | ||
3359 | cancel_work_sync(&c->memcg_params->destroy); | ||
3360 | kmem_cache_destroy(c); | ||
3361 | 3243 | ||
3362 | if (cache_from_memcg_idx(s, i)) | 3244 | if (cache_from_memcg_idx(s, i)) |
3363 | failed++; | 3245 | failed++; |
3364 | } | 3246 | } |
3365 | mutex_unlock(&activate_kmem_mutex); | 3247 | mutex_unlock(&memcg_slab_mutex); |
3366 | return failed; | 3248 | return failed; |
3367 | } | 3249 | } |
3368 | 3250 | ||
3369 | static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | 3251 | static void memcg_unregister_all_caches(struct mem_cgroup *memcg) |
3370 | { | 3252 | { |
3371 | struct kmem_cache *cachep; | 3253 | struct kmem_cache *cachep; |
3372 | struct memcg_cache_params *params; | 3254 | struct memcg_cache_params *params, *tmp; |
3373 | 3255 | ||
3374 | if (!memcg_kmem_is_active(memcg)) | 3256 | if (!memcg_kmem_is_active(memcg)) |
3375 | return; | 3257 | return; |
3376 | 3258 | ||
3377 | mutex_lock(&memcg->slab_caches_mutex); | 3259 | mutex_lock(&memcg_slab_mutex); |
3378 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) { | 3260 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { |
3379 | cachep = memcg_params_to_cache(params); | 3261 | cachep = memcg_params_to_cache(params); |
3380 | cachep->memcg_params->dead = true; | 3262 | kmem_cache_shrink(cachep); |
3381 | schedule_work(&cachep->memcg_params->destroy); | 3263 | if (atomic_read(&cachep->memcg_params->nr_pages) == 0) |
3264 | memcg_unregister_cache(cachep); | ||
3382 | } | 3265 | } |
3383 | mutex_unlock(&memcg->slab_caches_mutex); | 3266 | mutex_unlock(&memcg_slab_mutex); |
3384 | } | 3267 | } |
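
memcg_unregister_all_caches() switches to list_for_each_entry_safe() because memcg_unregister_cache() may delete the very entry that is currently being visited. A small self-contained C sketch of why the "safe" form, which saves the next node before the current one is touched, is needed when freeing during a walk; struct node and destroy_all() are illustrative, not the kernel's list helpers.

#include <stdlib.h>

struct node {
        int val;
        struct node *next;
};

/*
 * Free every node while walking the list.  The next pointer must be
 * saved *before* the current node is freed -- this extra cursor is
 * exactly what list_for_each_entry_safe() provides in the kernel.
 */
static void destroy_all(struct node **head)
{
        struct node *n = *head, *tmp;

        while (n) {
                tmp = n->next;        /* safe cursor, taken before free() */
                free(n);
                n = tmp;
        }
        *head = NULL;
}

int main(void)
{
        struct node *head = NULL;

        for (int i = 0; i < 4; i++) {
                struct node *n = malloc(sizeof(*n));
                n->val = i;
                n->next = head;
                head = n;
        }
        destroy_all(&head);
        return 0;
}
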
3385 | 3268 | ||
3386 | struct create_work { | 3269 | struct memcg_register_cache_work { |
3387 | struct mem_cgroup *memcg; | 3270 | struct mem_cgroup *memcg; |
3388 | struct kmem_cache *cachep; | 3271 | struct kmem_cache *cachep; |
3389 | struct work_struct work; | 3272 | struct work_struct work; |
3390 | }; | 3273 | }; |
3391 | 3274 | ||
3392 | static void memcg_create_cache_work_func(struct work_struct *w) | 3275 | static void memcg_register_cache_func(struct work_struct *w) |
3393 | { | 3276 | { |
3394 | struct create_work *cw = container_of(w, struct create_work, work); | 3277 | struct memcg_register_cache_work *cw = |
3278 | container_of(w, struct memcg_register_cache_work, work); | ||
3395 | struct mem_cgroup *memcg = cw->memcg; | 3279 | struct mem_cgroup *memcg = cw->memcg; |
3396 | struct kmem_cache *cachep = cw->cachep; | 3280 | struct kmem_cache *cachep = cw->cachep; |
3397 | 3281 | ||
3398 | kmem_cache_create_memcg(memcg, cachep); | 3282 | mutex_lock(&memcg_slab_mutex); |
3283 | memcg_register_cache(memcg, cachep); | ||
3284 | mutex_unlock(&memcg_slab_mutex); | ||
3285 | |||
3399 | css_put(&memcg->css); | 3286 | css_put(&memcg->css); |
3400 | kfree(cw); | 3287 | kfree(cw); |
3401 | } | 3288 | } |
@@ -3403,12 +3290,12 @@ static void memcg_create_cache_work_func(struct work_struct *w) | |||
3403 | /* | 3290 | /* |
3404 | * Enqueue the creation of a per-memcg kmem_cache. | 3291 | * Enqueue the creation of a per-memcg kmem_cache. |
3405 | */ | 3292 | */ |
3406 | static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, | 3293 | static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, |
3407 | struct kmem_cache *cachep) | 3294 | struct kmem_cache *cachep) |
3408 | { | 3295 | { |
3409 | struct create_work *cw; | 3296 | struct memcg_register_cache_work *cw; |
3410 | 3297 | ||
3411 | cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); | 3298 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); |
3412 | if (cw == NULL) { | 3299 | if (cw == NULL) { |
3413 | css_put(&memcg->css); | 3300 | css_put(&memcg->css); |
3414 | return; | 3301 | return; |
@@ -3417,17 +3304,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, | |||
3417 | cw->memcg = memcg; | 3304 | cw->memcg = memcg; |
3418 | cw->cachep = cachep; | 3305 | cw->cachep = cachep; |
3419 | 3306 | ||
3420 | INIT_WORK(&cw->work, memcg_create_cache_work_func); | 3307 | INIT_WORK(&cw->work, memcg_register_cache_func); |
3421 | schedule_work(&cw->work); | 3308 | schedule_work(&cw->work); |
3422 | } | 3309 | } |
3423 | 3310 | ||
3424 | static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, | 3311 | static void memcg_schedule_register_cache(struct mem_cgroup *memcg, |
3425 | struct kmem_cache *cachep) | 3312 | struct kmem_cache *cachep) |
3426 | { | 3313 | { |
3427 | /* | 3314 | /* |
3428 | * We need to stop accounting when we kmalloc, because if the | 3315 | * We need to stop accounting when we kmalloc, because if the |
3429 | * corresponding kmalloc cache is not yet created, the first allocation | 3316 | * corresponding kmalloc cache is not yet created, the first allocation |
3430 | * in __memcg_create_cache_enqueue will recurse. | 3317 | * in __memcg_schedule_register_cache will recurse. |
3431 | * | 3318 | * |
3432 | * However, it is better to enclose the whole function. Depending on | 3319 | * However, it is better to enclose the whole function. Depending on |
3433 | * the debugging options enabled, INIT_WORK(), for instance, can | 3320 | * the debugging options enabled, INIT_WORK(), for instance, can |
@@ -3436,9 +3323,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, | |||
3436 | * the safest choice is to do it like this, wrapping the whole function. | 3323 | * the safest choice is to do it like this, wrapping the whole function. |
3437 | */ | 3324 | */ |
3438 | memcg_stop_kmem_account(); | 3325 | memcg_stop_kmem_account(); |
3439 | __memcg_create_cache_enqueue(memcg, cachep); | 3326 | __memcg_schedule_register_cache(memcg, cachep); |
3440 | memcg_resume_kmem_account(); | 3327 | memcg_resume_kmem_account(); |
3441 | } | 3328 | } |
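
memcg_schedule_register_cache() cannot do the registration inline (it is reached from the allocation path with kmem accounting suspended), so the arguments are packed into a small GFP_NOWAIT-allocated work item and handed to a workqueue, as memcg_register_cache_work does above. A hedged kernel-style sketch of that defer-to-workqueue shape follows; deferred_ctx and do_heavy_setup() are hypothetical, while struct work_struct, INIT_WORK(), schedule_work() and container_of() are the real interfaces being illustrated (a real module would need <linux/workqueue.h> and <linux/slab.h>).

struct deferred_ctx {                         /* hypothetical payload */
        int arg;
        struct work_struct work;
};

static void deferred_func(struct work_struct *w)
{
        struct deferred_ctx *ctx = container_of(w, struct deferred_ctx, work);

        do_heavy_setup(ctx->arg);             /* hypothetical helper; may sleep here */
        kfree(ctx);
}

static void schedule_deferred(int arg)
{
        struct deferred_ctx *ctx = kmalloc(sizeof(*ctx), GFP_NOWAIT);

        if (!ctx)
                return;                       /* best effort, as in the memcg path */
        ctx->arg = arg;
        INIT_WORK(&ctx->work, deferred_func);
        schedule_work(&ctx->work);            /* runs later from a kworker thread */
}
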
3329 | |||
3330 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) | ||
3331 | { | ||
3332 | int res; | ||
3333 | |||
3334 | res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, | ||
3335 | PAGE_SIZE << order); | ||
3336 | if (!res) | ||
3337 | atomic_add(1 << order, &cachep->memcg_params->nr_pages); | ||
3338 | return res; | ||
3339 | } | ||
3340 | |||
3341 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | ||
3342 | { | ||
3343 | memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); | ||
3344 | atomic_sub(1 << order, &cachep->memcg_params->nr_pages); | ||
3345 | } | ||
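
The new __memcg_charge_slab()/__memcg_uncharge_slab() pair above keeps the per-cache nr_pages counter in lock-step with the charge: 1 << order pages are added only after a successful charge and subtracted again on uncharge, so a counter of zero means the memcg copy owns no slab pages. A rough userspace sketch of that bookkeeping with C11 atomics; try_charge() is a stand-in for memcg_charge_kmem() and simply always succeeds here.

#include <stdatomic.h>
#include <stdbool.h>

#define PAGE_SIZE 4096UL

static atomic_long nr_pages;                  /* pages currently charged to this cache */

static bool try_charge(unsigned long bytes)   /* stand-in for memcg_charge_kmem() */
{
        (void)bytes;
        return true;                          /* pretend the charge always fits */
}

static int charge_slab(int order)
{
        if (!try_charge(PAGE_SIZE << order))
                return -1;                    /* nothing accounted, nothing to undo */
        atomic_fetch_add(&nr_pages, 1L << order);
        return 0;
}

static void uncharge_slab(int order)
{
        atomic_fetch_sub(&nr_pages, 1L << order);
}

int main(void)
{
        if (charge_slab(1) == 0)              /* charge and release 2 pages */
                uncharge_slab(1);
        return 0;
}
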
3346 | |||
3442 | /* | 3347 | /* |
3443 | * Return the kmem_cache we're supposed to use for a slab allocation. | 3348 | * Return the kmem_cache we're supposed to use for a slab allocation. |
3444 | * We try to use the current memcg's version of the cache. | 3349 | * We try to use the current memcg's version of the cache. |
@@ -3489,22 +3394,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
3489 | * | 3394 | * |
3490 | * However, there are some clashes that can arrive from locking. | 3395 | * However, there are some clashes that can arrive from locking. |
3491 | * For instance, because we acquire the slab_mutex while doing | 3396 | * For instance, because we acquire the slab_mutex while doing |
3492 | * kmem_cache_dup, this means no further allocation could happen | 3397 | * memcg_create_kmem_cache, this means no further allocation |
3493 | * with the slab_mutex held. | 3398 | * could happen with the slab_mutex held. So it's better to |
3494 | * | 3399 | * defer everything. |
3495 | * Also, because cache creation issue get_online_cpus(), this | ||
3496 | * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, | ||
3497 | * that ends up reversed during cpu hotplug. (cpuset allocates | ||
3498 | * a bunch of GFP_KERNEL memory during cpuup). Due to all that, | ||
3499 | * better to defer everything. | ||
3500 | */ | 3400 | */ |
3501 | memcg_create_cache_enqueue(memcg, cachep); | 3401 | memcg_schedule_register_cache(memcg, cachep); |
3502 | return cachep; | 3402 | return cachep; |
3503 | out: | 3403 | out: |
3504 | rcu_read_unlock(); | 3404 | rcu_read_unlock(); |
3505 | return cachep; | 3405 | return cachep; |
3506 | } | 3406 | } |
3507 | EXPORT_SYMBOL(__memcg_kmem_get_cache); | ||
3508 | 3407 | ||
3509 | /* | 3408 | /* |
3510 | * We need to verify if the allocation against current->mm->owner's memcg is | 3409 | * We need to verify if the allocation against current->mm->owner's memcg is |
@@ -3531,11 +3430,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3531 | /* | 3430 | /* |
3532 | * Disabling accounting is only relevant for some specific memcg | 3431 | * Disabling accounting is only relevant for some specific memcg |
3533 | * internal allocations. Therefore we would initially not have such | 3432 | * internal allocations. Therefore we would initially not have such |
3534 | * check here, since direct calls to the page allocator that are marked | 3433 | * check here, since direct calls to the page allocator that are |
3535 | * with GFP_KMEMCG only happen outside memcg core. We are mostly | 3434 | * accounted to kmemcg (alloc_kmem_pages and friends) only happen |
3536 | * concerned with cache allocations, and by having this test at | 3435 | * outside memcg core. We are mostly concerned with cache allocations, |
3537 | * memcg_kmem_get_cache, we are already able to relay the allocation to | 3436 | * and by having this test at memcg_kmem_get_cache, we are already able |
3538 | * the root cache and bypass the memcg cache altogether. | 3437 | * to relay the allocation to the root cache and bypass the memcg cache |
3438 | * altogether. | ||
3539 | * | 3439 | * |
3540 | * There is one exception, though: the SLUB allocator does not create | 3440 | * There is one exception, though: the SLUB allocator does not create |
3541 | * large order caches, but rather service large kmallocs directly from | 3441 | * large order caches, but rather service large kmallocs directly from |
@@ -3622,7 +3522,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) | |||
3622 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | 3522 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); |
3623 | } | 3523 | } |
3624 | #else | 3524 | #else |
3625 | static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | 3525 | static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) |
3626 | { | 3526 | { |
3627 | } | 3527 | } |
3628 | #endif /* CONFIG_MEMCG_KMEM */ | 3528 | #endif /* CONFIG_MEMCG_KMEM */ |
@@ -3958,17 +3858,9 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, | |||
3958 | return 0; | 3858 | return 0; |
3959 | } | 3859 | } |
3960 | 3860 | ||
3961 | /* | 3861 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); |
3962 | * Page cache insertions can happen without an actual mm | 3862 | if (!memcg) |
3963 | * context, e.g. during disk probing on boot. | 3863 | return -ENOMEM; |
3964 | */ | ||
3965 | if (unlikely(!mm)) | ||
3966 | memcg = root_mem_cgroup; | ||
3967 | else { | ||
3968 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); | ||
3969 | if (!memcg) | ||
3970 | return -ENOMEM; | ||
3971 | } | ||
3972 | __mem_cgroup_commit_charge(memcg, page, 1, type, false); | 3864 | __mem_cgroup_commit_charge(memcg, page, 1, type, false); |
3973 | return 0; | 3865 | return 0; |
3974 | } | 3866 | } |
@@ -4783,9 +4675,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
4783 | if (mem_cgroup_move_parent(page, pc, memcg)) { | 4675 | if (mem_cgroup_move_parent(page, pc, memcg)) { |
4784 | /* found lock contention or "pc" is obsolete. */ | 4676 | /* found lock contention or "pc" is obsolete. */ |
4785 | busy = page; | 4677 | busy = page; |
4786 | cond_resched(); | ||
4787 | } else | 4678 | } else |
4788 | busy = NULL; | 4679 | busy = NULL; |
4680 | cond_resched(); | ||
4789 | } while (!list_empty(list)); | 4681 | } while (!list_empty(list)); |
4790 | } | 4682 | } |
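
The hunk above moves cond_resched() out of the "moved successfully" branch so the loop yields on every pass, including when it keeps retrying a busy page. A minimal sketch of the intended shape, with process_one() and requeue_tail() as hypothetical helpers:

        do {
                if (!process_one(list))       /* hypothetical: may fail on a busy item */
                        requeue_tail(list);   /* hypothetical: retry it later */
                cond_resched();               /* yield every iteration, even with no progress */
        } while (!list_empty(list));
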
4791 | 4683 | ||
@@ -5061,13 +4953,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, | |||
5061 | * Make sure we have enough space for this cgroup in each root cache's | 4953 | * Make sure we have enough space for this cgroup in each root cache's |
5062 | * memcg_params. | 4954 | * memcg_params. |
5063 | */ | 4955 | */ |
4956 | mutex_lock(&memcg_slab_mutex); | ||
5064 | err = memcg_update_all_caches(memcg_id + 1); | 4957 | err = memcg_update_all_caches(memcg_id + 1); |
4958 | mutex_unlock(&memcg_slab_mutex); | ||
5065 | if (err) | 4959 | if (err) |
5066 | goto out_rmid; | 4960 | goto out_rmid; |
5067 | 4961 | ||
5068 | memcg->kmemcg_id = memcg_id; | 4962 | memcg->kmemcg_id = memcg_id; |
5069 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | 4963 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); |
5070 | mutex_init(&memcg->slab_caches_mutex); | ||
5071 | 4964 | ||
5072 | /* | 4965 | /* |
5073 | * We couldn't have accounted to this cgroup, because it hasn't got the | 4966 | * We couldn't have accounted to this cgroup, because it hasn't got the |
@@ -5442,22 +5335,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, | |||
5442 | struct cftype *cft, u64 val) | 5335 | struct cftype *cft, u64 val) |
5443 | { | 5336 | { |
5444 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5337 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5445 | struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); | ||
5446 | 5338 | ||
5447 | if (val > 100 || !parent) | 5339 | if (val > 100) |
5448 | return -EINVAL; | 5340 | return -EINVAL; |
5449 | 5341 | ||
5450 | mutex_lock(&memcg_create_mutex); | 5342 | if (css_parent(css)) |
5451 | 5343 | memcg->swappiness = val; | |
5452 | /* If under hierarchy, only empty-root can set this value */ | 5344 | else |
5453 | if ((parent->use_hierarchy) || memcg_has_children(memcg)) { | 5345 | vm_swappiness = val; |
5454 | mutex_unlock(&memcg_create_mutex); | ||
5455 | return -EINVAL; | ||
5456 | } | ||
5457 | |||
5458 | memcg->swappiness = val; | ||
5459 | |||
5460 | mutex_unlock(&memcg_create_mutex); | ||
5461 | 5346 | ||
5462 | return 0; | 5347 | return 0; |
5463 | } | 5348 | } |
@@ -5789,22 +5674,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, | |||
5789 | struct cftype *cft, u64 val) | 5674 | struct cftype *cft, u64 val) |
5790 | { | 5675 | { |
5791 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5676 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5792 | struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); | ||
5793 | 5677 | ||
5794 | /* cannot set to root cgroup and only 0 and 1 are allowed */ | 5678 | /* cannot set to root cgroup and only 0 and 1 are allowed */ |
5795 | if (!parent || !((val == 0) || (val == 1))) | 5679 | if (!css_parent(css) || !((val == 0) || (val == 1))) |
5796 | return -EINVAL; | 5680 | return -EINVAL; |
5797 | 5681 | ||
5798 | mutex_lock(&memcg_create_mutex); | ||
5799 | /* oom-kill-disable is a flag for subhierarchy. */ | ||
5800 | if ((parent->use_hierarchy) || memcg_has_children(memcg)) { | ||
5801 | mutex_unlock(&memcg_create_mutex); | ||
5802 | return -EINVAL; | ||
5803 | } | ||
5804 | memcg->oom_kill_disable = val; | 5682 | memcg->oom_kill_disable = val; |
5805 | if (!val) | 5683 | if (!val) |
5806 | memcg_oom_recover(memcg); | 5684 | memcg_oom_recover(memcg); |
5807 | mutex_unlock(&memcg_create_mutex); | 5685 | |
5808 | return 0; | 5686 | return 0; |
5809 | } | 5687 | } |
5810 | 5688 | ||
@@ -6490,7 +6368,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
6490 | css_for_each_descendant_post(iter, css) | 6368 | css_for_each_descendant_post(iter, css) |
6491 | mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); | 6369 | mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); |
6492 | 6370 | ||
6493 | mem_cgroup_destroy_all_caches(memcg); | 6371 | memcg_unregister_all_caches(memcg); |
6494 | vmpressure_cleanup(&memcg->vmpressure); | 6372 | vmpressure_cleanup(&memcg->vmpressure); |
6495 | } | 6373 | } |
6496 | 6374 | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 35ef28acf137..cd8989c1027e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -204,9 +204,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, | |||
204 | #endif | 204 | #endif |
205 | si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; | 205 | si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; |
206 | 206 | ||
207 | if ((flags & MF_ACTION_REQUIRED) && t == current) { | 207 | if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { |
208 | si.si_code = BUS_MCEERR_AR; | 208 | si.si_code = BUS_MCEERR_AR; |
209 | ret = force_sig_info(SIGBUS, &si, t); | 209 | ret = force_sig_info(SIGBUS, &si, current); |
210 | } else { | 210 | } else { |
211 | /* | 211 | /* |
212 | * Don't use force here, it's convenient if the signal | 212 | * Don't use force here, it's convenient if the signal |
@@ -380,20 +380,51 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, | |||
380 | } | 380 | } |
381 | } | 381 | } |
382 | 382 | ||
383 | static int task_early_kill(struct task_struct *tsk) | 383 | /* |
384 | * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) | ||
385 | * on behalf of the thread group. Return task_struct of the (first found) | ||
386 | * dedicated thread if found, and return NULL otherwise. | ||
387 | * | ||
388 | * We already hold read_lock(&tasklist_lock) in the caller, so we don't | ||
389 | * have to call rcu_read_lock/unlock() in this function. | ||
390 | */ | ||
391 | static struct task_struct *find_early_kill_thread(struct task_struct *tsk) | ||
384 | { | 392 | { |
393 | struct task_struct *t; | ||
394 | |||
395 | for_each_thread(tsk, t) | ||
396 | if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) | ||
397 | return t; | ||
398 | return NULL; | ||
399 | } | ||
400 | |||
401 | /* | ||
402 | * Determine whether a given process is "early kill" process which expects | ||
403 | * to be signaled when some page under the process is hwpoisoned. | ||
404 | * Return task_struct of the dedicated thread (main thread unless explicitly | ||
405 | * specified) if the process is "early kill," and otherwise returns NULL. | ||
406 | */ | ||
407 | static struct task_struct *task_early_kill(struct task_struct *tsk, | ||
408 | int force_early) | ||
409 | { | ||
410 | struct task_struct *t; | ||
385 | if (!tsk->mm) | 411 | if (!tsk->mm) |
386 | return 0; | 412 | return NULL; |
387 | if (tsk->flags & PF_MCE_PROCESS) | 413 | if (force_early) |
388 | return !!(tsk->flags & PF_MCE_EARLY); | 414 | return tsk; |
389 | return sysctl_memory_failure_early_kill; | 415 | t = find_early_kill_thread(tsk); |
416 | if (t) | ||
417 | return t; | ||
418 | if (sysctl_memory_failure_early_kill) | ||
419 | return tsk; | ||
420 | return NULL; | ||
390 | } | 421 | } |
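
task_early_kill() now returns a task_struct rather than a boolean: the dedicated sibling thread flagged with both PF_MCE_PROCESS and PF_MCE_EARLY if one exists, the process itself when the action is required or the early-kill sysctl is set, and NULL otherwise. A simplified userspace model of that selection order follows; the struct, the flag values and pick_victim() are illustrative only.

#include <stddef.h>

#define PF_MCE_PROCESS 0x1                    /* illustrative values, not the kernel's */
#define PF_MCE_EARLY   0x2

struct task {                                 /* toy stand-in for task_struct */
        unsigned int flags;
        struct task *next_thread;             /* NULL-terminated thread list here */
};

static int sysctl_early_kill;

static struct task *pick_victim(struct task *group_leader, int force_early)
{
        struct task *t;

        if (force_early)                      /* MF_ACTION_REQUIRED: signal the process */
                return group_leader;
        for (t = group_leader; t; t = t->next_thread)
                if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
                        return t;             /* dedicated handler thread found */
        if (sysctl_early_kill)
                return group_leader;          /* global early kill: fall back to main thread */
        return NULL;                          /* not an early-kill process */
}
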
391 | 422 | ||
392 | /* | 423 | /* |
393 | * Collect processes when the error hit an anonymous page. | 424 | * Collect processes when the error hit an anonymous page. |
394 | */ | 425 | */ |
395 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, | 426 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, |
396 | struct to_kill **tkc) | 427 | struct to_kill **tkc, int force_early) |
397 | { | 428 | { |
398 | struct vm_area_struct *vma; | 429 | struct vm_area_struct *vma; |
399 | struct task_struct *tsk; | 430 | struct task_struct *tsk; |
@@ -408,16 +439,17 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
408 | read_lock(&tasklist_lock); | 439 | read_lock(&tasklist_lock); |
409 | for_each_process (tsk) { | 440 | for_each_process (tsk) { |
410 | struct anon_vma_chain *vmac; | 441 | struct anon_vma_chain *vmac; |
442 | struct task_struct *t = task_early_kill(tsk, force_early); | ||
411 | 443 | ||
412 | if (!task_early_kill(tsk)) | 444 | if (!t) |
413 | continue; | 445 | continue; |
414 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, | 446 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, |
415 | pgoff, pgoff) { | 447 | pgoff, pgoff) { |
416 | vma = vmac->vma; | 448 | vma = vmac->vma; |
417 | if (!page_mapped_in_vma(page, vma)) | 449 | if (!page_mapped_in_vma(page, vma)) |
418 | continue; | 450 | continue; |
419 | if (vma->vm_mm == tsk->mm) | 451 | if (vma->vm_mm == t->mm) |
420 | add_to_kill(tsk, page, vma, to_kill, tkc); | 452 | add_to_kill(t, page, vma, to_kill, tkc); |
421 | } | 453 | } |
422 | } | 454 | } |
423 | read_unlock(&tasklist_lock); | 455 | read_unlock(&tasklist_lock); |
@@ -428,7 +460,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
428 | * Collect processes when the error hit a file mapped page. | 460 | * Collect processes when the error hit a file mapped page. |
429 | */ | 461 | */ |
430 | static void collect_procs_file(struct page *page, struct list_head *to_kill, | 462 | static void collect_procs_file(struct page *page, struct list_head *to_kill, |
431 | struct to_kill **tkc) | 463 | struct to_kill **tkc, int force_early) |
432 | { | 464 | { |
433 | struct vm_area_struct *vma; | 465 | struct vm_area_struct *vma; |
434 | struct task_struct *tsk; | 466 | struct task_struct *tsk; |
@@ -438,10 +470,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
438 | read_lock(&tasklist_lock); | 470 | read_lock(&tasklist_lock); |
439 | for_each_process(tsk) { | 471 | for_each_process(tsk) { |
440 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 472 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
473 | struct task_struct *t = task_early_kill(tsk, force_early); | ||
441 | 474 | ||
442 | if (!task_early_kill(tsk)) | 475 | if (!t) |
443 | continue; | 476 | continue; |
444 | |||
445 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, | 477 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, |
446 | pgoff) { | 478 | pgoff) { |
447 | /* | 479 | /* |
@@ -451,8 +483,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
451 | * Assume applications who requested early kill want | 483 | * Assume applications who requested early kill want |
452 | * to be informed of all such data corruptions. | 484 | * to be informed of all such data corruptions. |
453 | */ | 485 | */ |
454 | if (vma->vm_mm == tsk->mm) | 486 | if (vma->vm_mm == t->mm) |
455 | add_to_kill(tsk, page, vma, to_kill, tkc); | 487 | add_to_kill(t, page, vma, to_kill, tkc); |
456 | } | 488 | } |
457 | } | 489 | } |
458 | read_unlock(&tasklist_lock); | 490 | read_unlock(&tasklist_lock); |
@@ -465,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
465 | * First preallocate one tokill structure outside the spin locks, | 497 | * First preallocate one tokill structure outside the spin locks, |
466 | * so that we can kill at least one process reasonably reliably. | 498 | * so that we can kill at least one process reasonably reliably. |
467 | */ | 499 | */ |
468 | static void collect_procs(struct page *page, struct list_head *tokill) | 500 | static void collect_procs(struct page *page, struct list_head *tokill, |
501 | int force_early) | ||
469 | { | 502 | { |
470 | struct to_kill *tk; | 503 | struct to_kill *tk; |
471 | 504 | ||
@@ -476,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill) | |||
476 | if (!tk) | 509 | if (!tk) |
477 | return; | 510 | return; |
478 | if (PageAnon(page)) | 511 | if (PageAnon(page)) |
479 | collect_procs_anon(page, tokill, &tk); | 512 | collect_procs_anon(page, tokill, &tk, force_early); |
480 | else | 513 | else |
481 | collect_procs_file(page, tokill, &tk); | 514 | collect_procs_file(page, tokill, &tk, force_early); |
482 | kfree(tk); | 515 | kfree(tk); |
483 | } | 516 | } |
484 | 517 | ||
@@ -963,7 +996,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
963 | * there's nothing that can be done. | 996 | * there's nothing that can be done. |
964 | */ | 997 | */ |
965 | if (kill) | 998 | if (kill) |
966 | collect_procs(ppage, &tokill); | 999 | collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); |
967 | 1000 | ||
968 | ret = try_to_unmap(ppage, ttu); | 1001 | ret = try_to_unmap(ppage, ttu); |
969 | if (ret != SWAP_SUCCESS) | 1002 | if (ret != SWAP_SUCCESS) |
@@ -1081,15 +1114,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1081 | return 0; | 1114 | return 0; |
1082 | } else if (PageHuge(hpage)) { | 1115 | } else if (PageHuge(hpage)) { |
1083 | /* | 1116 | /* |
1084 | * Check "just unpoisoned", "filter hit", and | 1117 | * Check "filter hit" and "race with other subpage." |
1085 | * "race with other subpage." | ||
1086 | */ | 1118 | */ |
1087 | lock_page(hpage); | 1119 | lock_page(hpage); |
1088 | if (!PageHWPoison(hpage) | 1120 | if (PageHWPoison(hpage)) { |
1089 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) | 1121 | if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) |
1090 | || (p != hpage && TestSetPageHWPoison(hpage))) { | 1122 | || (p != hpage && TestSetPageHWPoison(hpage))) { |
1091 | atomic_long_sub(nr_pages, &num_poisoned_pages); | 1123 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1092 | return 0; | 1124 | unlock_page(hpage); |
1125 | return 0; | ||
1126 | } | ||
1093 | } | 1127 | } |
1094 | set_page_hwpoison_huge_page(hpage); | 1128 | set_page_hwpoison_huge_page(hpage); |
1095 | res = dequeue_hwpoisoned_huge_page(hpage); | 1129 | res = dequeue_hwpoisoned_huge_page(hpage); |
@@ -1131,11 +1165,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1131 | } | 1165 | } |
1132 | } | 1166 | } |
1133 | 1167 | ||
1134 | /* | ||
1135 | * Lock the page and wait for writeback to finish. | ||
1136 | * It's very difficult to mess with pages currently under IO | ||
1137 | * and in many cases impossible, so we just avoid it here. | ||
1138 | */ | ||
1139 | lock_page(hpage); | 1168 | lock_page(hpage); |
1140 | 1169 | ||
1141 | /* | 1170 | /* |
@@ -1152,6 +1181,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1152 | */ | 1181 | */ |
1153 | if (!PageHWPoison(p)) { | 1182 | if (!PageHWPoison(p)) { |
1154 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); | 1183 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); |
1184 | atomic_long_sub(nr_pages, &num_poisoned_pages); | ||
1185 | put_page(hpage); | ||
1155 | res = 0; | 1186 | res = 0; |
1156 | goto out; | 1187 | goto out; |
1157 | } | 1188 | } |
@@ -1183,6 +1214,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1183 | if (PageHuge(p)) | 1214 | if (PageHuge(p)) |
1184 | set_page_hwpoison_huge_page(hpage); | 1215 | set_page_hwpoison_huge_page(hpage); |
1185 | 1216 | ||
1217 | /* | ||
1218 | * It's very difficult to mess with pages currently under IO | ||
1219 | * and in many cases impossible, so we just avoid it here. | ||
1220 | */ | ||
1186 | wait_on_page_writeback(p); | 1221 | wait_on_page_writeback(p); |
1187 | 1222 | ||
1188 | /* | 1223 | /* |
@@ -1295,7 +1330,7 @@ static void memory_failure_work_func(struct work_struct *work) | |||
1295 | unsigned long proc_flags; | 1330 | unsigned long proc_flags; |
1296 | int gotten; | 1331 | int gotten; |
1297 | 1332 | ||
1298 | mf_cpu = &__get_cpu_var(memory_failure_cpu); | 1333 | mf_cpu = this_cpu_ptr(&memory_failure_cpu); |
1299 | for (;;) { | 1334 | for (;;) { |
1300 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | 1335 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); |
1301 | gotten = kfifo_get(&mf_cpu->fifo, &entry); | 1336 | gotten = kfifo_get(&mf_cpu->fifo, &entry); |
@@ -1500,7 +1535,7 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1500 | 1535 | ||
1501 | /* Keep page count to indicate a given hugepage is isolated. */ | 1536 | /* Keep page count to indicate a given hugepage is isolated. */ |
1502 | list_move(&hpage->lru, &pagelist); | 1537 | list_move(&hpage->lru, &pagelist); |
1503 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1538 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1504 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1539 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1505 | if (ret) { | 1540 | if (ret) { |
1506 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1541 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
@@ -1581,7 +1616,7 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1581 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 1616 | inc_zone_page_state(page, NR_ISOLATED_ANON + |
1582 | page_is_file_cache(page)); | 1617 | page_is_file_cache(page)); |
1583 | list_add(&page->lru, &pagelist); | 1618 | list_add(&page->lru, &pagelist); |
1584 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1619 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1585 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1620 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1586 | if (ret) { | 1621 | if (ret) { |
1587 | if (!list_empty(&pagelist)) { | 1622 | if (!list_empty(&pagelist)) { |
@@ -1661,11 +1696,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1661 | } | 1696 | } |
1662 | } | 1697 | } |
1663 | 1698 | ||
1664 | /* | 1699 | get_online_mems(); |
1665 | * The lock_memory_hotplug prevents a race with memory hotplug. | ||
1666 | * This is a big hammer, a better way would be nicer. | ||
1667 | */ | ||
1668 | lock_memory_hotplug(); | ||
1669 | 1700 | ||
1670 | /* | 1701 | /* |
1671 | * Isolate the page, so that it doesn't get reallocated if it | 1702 | * Isolate the page, so that it doesn't get reallocated if it |
@@ -1676,7 +1707,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1676 | set_migratetype_isolate(page, true); | 1707 | set_migratetype_isolate(page, true); |
1677 | 1708 | ||
1678 | ret = get_any_page(page, pfn, flags); | 1709 | ret = get_any_page(page, pfn, flags); |
1679 | unlock_memory_hotplug(); | 1710 | put_online_mems(); |
1680 | if (ret > 0) { /* for in-use pages */ | 1711 | if (ret > 0) { /* for in-use pages */ |
1681 | if (PageHuge(page)) | 1712 | if (PageHuge(page)) |
1682 | ret = soft_offline_huge_page(page, flags); | 1713 | ret = soft_offline_huge_page(page, flags); |
diff --git a/mm/memory.c b/mm/memory.c index 037b812a9531..d67fd9fcf1f2 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -698,11 +698,6 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
698 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 698 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
699 | } | 699 | } |
700 | 700 | ||
701 | static inline bool is_cow_mapping(vm_flags_t flags) | ||
702 | { | ||
703 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | ||
704 | } | ||
705 | |||
706 | /* | 701 | /* |
707 | * vm_normal_page -- This function gets the "struct page" associated with a pte. | 702 | * vm_normal_page -- This function gets the "struct page" associated with a pte. |
708 | * | 703 | * |
@@ -756,7 +751,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
756 | unsigned long pfn = pte_pfn(pte); | 751 | unsigned long pfn = pte_pfn(pte); |
757 | 752 | ||
758 | if (HAVE_PTE_SPECIAL) { | 753 | if (HAVE_PTE_SPECIAL) { |
759 | if (likely(!pte_special(pte))) | 754 | if (likely(!pte_special(pte) || pte_numa(pte))) |
760 | goto check_pfn; | 755 | goto check_pfn; |
761 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) | 756 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) |
762 | return NULL; | 757 | return NULL; |
@@ -782,14 +777,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
782 | } | 777 | } |
783 | } | 778 | } |
784 | 779 | ||
785 | if (is_zero_pfn(pfn)) | ||
786 | return NULL; | ||
787 | check_pfn: | 780 | check_pfn: |
788 | if (unlikely(pfn > highest_memmap_pfn)) { | 781 | if (unlikely(pfn > highest_memmap_pfn)) { |
789 | print_bad_pte(vma, addr, pte, NULL); | 782 | print_bad_pte(vma, addr, pte, NULL); |
790 | return NULL; | 783 | return NULL; |
791 | } | 784 | } |
792 | 785 | ||
786 | if (is_zero_pfn(pfn)) | ||
787 | return NULL; | ||
788 | |||
793 | /* | 789 | /* |
794 | * NOTE! We still have PageReserved() pages in the page tables. | 790 | * NOTE! We still have PageReserved() pages in the page tables. |
795 | * eg. VDSO mappings can cause them to exist. | 791 | * eg. VDSO mappings can cause them to exist. |
@@ -1457,646 +1453,6 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | |||
1457 | } | 1453 | } |
1458 | EXPORT_SYMBOL_GPL(zap_vma_ptes); | 1454 | EXPORT_SYMBOL_GPL(zap_vma_ptes); |
1459 | 1455 | ||
1460 | /** | ||
1461 | * follow_page_mask - look up a page descriptor from a user-virtual address | ||
1462 | * @vma: vm_area_struct mapping @address | ||
1463 | * @address: virtual address to look up | ||
1464 | * @flags: flags modifying lookup behaviour | ||
1465 | * @page_mask: on output, *page_mask is set according to the size of the page | ||
1466 | * | ||
1467 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> | ||
1468 | * | ||
1469 | * Returns the mapped (struct page *), %NULL if no mapping exists, or | ||
1470 | * an error pointer if there is a mapping to something not represented | ||
1471 | * by a page descriptor (see also vm_normal_page()). | ||
1472 | */ | ||
1473 | struct page *follow_page_mask(struct vm_area_struct *vma, | ||
1474 | unsigned long address, unsigned int flags, | ||
1475 | unsigned int *page_mask) | ||
1476 | { | ||
1477 | pgd_t *pgd; | ||
1478 | pud_t *pud; | ||
1479 | pmd_t *pmd; | ||
1480 | pte_t *ptep, pte; | ||
1481 | spinlock_t *ptl; | ||
1482 | struct page *page; | ||
1483 | struct mm_struct *mm = vma->vm_mm; | ||
1484 | |||
1485 | *page_mask = 0; | ||
1486 | |||
1487 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); | ||
1488 | if (!IS_ERR(page)) { | ||
1489 | BUG_ON(flags & FOLL_GET); | ||
1490 | goto out; | ||
1491 | } | ||
1492 | |||
1493 | page = NULL; | ||
1494 | pgd = pgd_offset(mm, address); | ||
1495 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | ||
1496 | goto no_page_table; | ||
1497 | |||
1498 | pud = pud_offset(pgd, address); | ||
1499 | if (pud_none(*pud)) | ||
1500 | goto no_page_table; | ||
1501 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { | ||
1502 | if (flags & FOLL_GET) | ||
1503 | goto out; | ||
1504 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | ||
1505 | goto out; | ||
1506 | } | ||
1507 | if (unlikely(pud_bad(*pud))) | ||
1508 | goto no_page_table; | ||
1509 | |||
1510 | pmd = pmd_offset(pud, address); | ||
1511 | if (pmd_none(*pmd)) | ||
1512 | goto no_page_table; | ||
1513 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { | ||
1514 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | ||
1515 | if (flags & FOLL_GET) { | ||
1516 | /* | ||
1517 | * Refcount on tail pages are not well-defined and | ||
1518 | * shouldn't be taken. The caller should handle a NULL | ||
1519 | * return when trying to follow tail pages. | ||
1520 | */ | ||
1521 | if (PageHead(page)) | ||
1522 | get_page(page); | ||
1523 | else { | ||
1524 | page = NULL; | ||
1525 | goto out; | ||
1526 | } | ||
1527 | } | ||
1528 | goto out; | ||
1529 | } | ||
1530 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | ||
1531 | goto no_page_table; | ||
1532 | if (pmd_trans_huge(*pmd)) { | ||
1533 | if (flags & FOLL_SPLIT) { | ||
1534 | split_huge_page_pmd(vma, address, pmd); | ||
1535 | goto split_fallthrough; | ||
1536 | } | ||
1537 | ptl = pmd_lock(mm, pmd); | ||
1538 | if (likely(pmd_trans_huge(*pmd))) { | ||
1539 | if (unlikely(pmd_trans_splitting(*pmd))) { | ||
1540 | spin_unlock(ptl); | ||
1541 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1542 | } else { | ||
1543 | page = follow_trans_huge_pmd(vma, address, | ||
1544 | pmd, flags); | ||
1545 | spin_unlock(ptl); | ||
1546 | *page_mask = HPAGE_PMD_NR - 1; | ||
1547 | goto out; | ||
1548 | } | ||
1549 | } else | ||
1550 | spin_unlock(ptl); | ||
1551 | /* fall through */ | ||
1552 | } | ||
1553 | split_fallthrough: | ||
1554 | if (unlikely(pmd_bad(*pmd))) | ||
1555 | goto no_page_table; | ||
1556 | |||
1557 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
1558 | |||
1559 | pte = *ptep; | ||
1560 | if (!pte_present(pte)) { | ||
1561 | swp_entry_t entry; | ||
1562 | /* | ||
1563 | * KSM's break_ksm() relies upon recognizing a ksm page | ||
1564 | * even while it is being migrated, so for that case we | ||
1565 | * need migration_entry_wait(). | ||
1566 | */ | ||
1567 | if (likely(!(flags & FOLL_MIGRATION))) | ||
1568 | goto no_page; | ||
1569 | if (pte_none(pte) || pte_file(pte)) | ||
1570 | goto no_page; | ||
1571 | entry = pte_to_swp_entry(pte); | ||
1572 | if (!is_migration_entry(entry)) | ||
1573 | goto no_page; | ||
1574 | pte_unmap_unlock(ptep, ptl); | ||
1575 | migration_entry_wait(mm, pmd, address); | ||
1576 | goto split_fallthrough; | ||
1577 | } | ||
1578 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | ||
1579 | goto no_page; | ||
1580 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | ||
1581 | goto unlock; | ||
1582 | |||
1583 | page = vm_normal_page(vma, address, pte); | ||
1584 | if (unlikely(!page)) { | ||
1585 | if ((flags & FOLL_DUMP) || | ||
1586 | !is_zero_pfn(pte_pfn(pte))) | ||
1587 | goto bad_page; | ||
1588 | page = pte_page(pte); | ||
1589 | } | ||
1590 | |||
1591 | if (flags & FOLL_GET) | ||
1592 | get_page_foll(page); | ||
1593 | if (flags & FOLL_TOUCH) { | ||
1594 | if ((flags & FOLL_WRITE) && | ||
1595 | !pte_dirty(pte) && !PageDirty(page)) | ||
1596 | set_page_dirty(page); | ||
1597 | /* | ||
1598 | * pte_mkyoung() would be more correct here, but atomic care | ||
1599 | * is needed to avoid losing the dirty bit: it is easier to use | ||
1600 | * mark_page_accessed(). | ||
1601 | */ | ||
1602 | mark_page_accessed(page); | ||
1603 | } | ||
1604 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
1605 | /* | ||
1606 | * The preliminary mapping check is mainly to avoid the | ||
1607 | * pointless overhead of lock_page on the ZERO_PAGE | ||
1608 | * which might bounce very badly if there is contention. | ||
1609 | * | ||
1610 | * If the page is already locked, we don't need to | ||
1611 | * handle it now - vmscan will handle it later if and | ||
1612 | * when it attempts to reclaim the page. | ||
1613 | */ | ||
1614 | if (page->mapping && trylock_page(page)) { | ||
1615 | lru_add_drain(); /* push cached pages to LRU */ | ||
1616 | /* | ||
1617 | * Because we lock page here, and migration is | ||
1618 | * blocked by the pte's page reference, and we | ||
1619 | * know the page is still mapped, we don't even | ||
1620 | * need to check for file-cache page truncation. | ||
1621 | */ | ||
1622 | mlock_vma_page(page); | ||
1623 | unlock_page(page); | ||
1624 | } | ||
1625 | } | ||
1626 | unlock: | ||
1627 | pte_unmap_unlock(ptep, ptl); | ||
1628 | out: | ||
1629 | return page; | ||
1630 | |||
1631 | bad_page: | ||
1632 | pte_unmap_unlock(ptep, ptl); | ||
1633 | return ERR_PTR(-EFAULT); | ||
1634 | |||
1635 | no_page: | ||
1636 | pte_unmap_unlock(ptep, ptl); | ||
1637 | if (!pte_none(pte)) | ||
1638 | return page; | ||
1639 | |||
1640 | no_page_table: | ||
1641 | /* | ||
1642 | * When core dumping an enormous anonymous area that nobody | ||
1643 | * has touched so far, we don't want to allocate unnecessary pages or | ||
1644 | * page tables. Return error instead of NULL to skip handle_mm_fault, | ||
1645 | * then get_dump_page() will return NULL to leave a hole in the dump. | ||
1646 | * But we can only make this optimization where a hole would surely | ||
1647 | * be zero-filled if handle_mm_fault() actually did handle it. | ||
1648 | */ | ||
1649 | if ((flags & FOLL_DUMP) && | ||
1650 | (!vma->vm_ops || !vma->vm_ops->fault)) | ||
1651 | return ERR_PTR(-EFAULT); | ||
1652 | return page; | ||
1653 | } | ||
1654 | |||
1655 | static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) | ||
1656 | { | ||
1657 | return stack_guard_page_start(vma, addr) || | ||
1658 | stack_guard_page_end(vma, addr+PAGE_SIZE); | ||
1659 | } | ||
1660 | |||
1661 | /** | ||
1662 | * __get_user_pages() - pin user pages in memory | ||
1663 | * @tsk: task_struct of target task | ||
1664 | * @mm: mm_struct of target mm | ||
1665 | * @start: starting user address | ||
1666 | * @nr_pages: number of pages from start to pin | ||
1667 | * @gup_flags: flags modifying pin behaviour | ||
1668 | * @pages: array that receives pointers to the pages pinned. | ||
1669 | * Should be at least nr_pages long. Or NULL, if caller | ||
1670 | * only intends to ensure the pages are faulted in. | ||
1671 | * @vmas: array of pointers to vmas corresponding to each page. | ||
1672 | * Or NULL if the caller does not require them. | ||
1673 | * @nonblocking: whether waiting for disk IO or mmap_sem contention | ||
1674 | * | ||
1675 | * Returns number of pages pinned. This may be fewer than the number | ||
1676 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
1677 | * were pinned, returns -errno. Each page returned must be released | ||
1678 | * with a put_page() call when it is finished with. vmas will only | ||
1679 | * remain valid while mmap_sem is held. | ||
1680 | * | ||
1681 | * Must be called with mmap_sem held for read or write. | ||
1682 | * | ||
1683 | * __get_user_pages walks a process's page tables and takes a reference to | ||
1684 | * each struct page that each user address corresponds to at a given | ||
1685 | * instant. That is, it takes the page that would be accessed if a user | ||
1686 | * thread accesses the given user virtual address at that instant. | ||
1687 | * | ||
1688 | * This does not guarantee that the page exists in the user mappings when | ||
1689 | * __get_user_pages returns, and there may even be a completely different | ||
1690 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
1691 | * and subsequently re faulted). However it does guarantee that the page | ||
1692 | * won't be freed completely. And mostly callers simply care that the page | ||
1693 | * contains data that was valid *at some point in time*. Typically, an IO | ||
1694 | * or similar operation cannot guarantee anything stronger anyway because | ||
1695 | * locks can't be held over the syscall boundary. | ||
1696 | * | ||
1697 | * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If | ||
1698 | * the page is written to, set_page_dirty (or set_page_dirty_lock, as | ||
1699 | * appropriate) must be called after the page is finished with, and | ||
1700 | * before put_page is called. | ||
1701 | * | ||
1702 | * If @nonblocking != NULL, __get_user_pages will not wait for disk IO | ||
1703 | * or mmap_sem contention, and if waiting is needed to pin all pages, | ||
1704 | * *@nonblocking will be set to 0. | ||
1705 | * | ||
1706 | * In most cases, get_user_pages or get_user_pages_fast should be used | ||
1707 | * instead of __get_user_pages. __get_user_pages should be used only if | ||
1708 | * you need some special @gup_flags. | ||
1709 | */ | ||
1710 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
1711 | unsigned long start, unsigned long nr_pages, | ||
1712 | unsigned int gup_flags, struct page **pages, | ||
1713 | struct vm_area_struct **vmas, int *nonblocking) | ||
1714 | { | ||
1715 | long i; | ||
1716 | unsigned long vm_flags; | ||
1717 | unsigned int page_mask; | ||
1718 | |||
1719 | if (!nr_pages) | ||
1720 | return 0; | ||
1721 | |||
1722 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); | ||
1723 | |||
1724 | /* | ||
1725 | * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault | ||
1726 | * would be called on PROT_NONE ranges. We must never invoke | ||
1727 | * handle_mm_fault on PROT_NONE ranges or the NUMA hinting | ||
1728 | * page faults would unprotect the PROT_NONE ranges if | ||
1729 | * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd | ||
1730 | * bitflag. So to avoid that, don't set FOLL_NUMA if | ||
1731 | * FOLL_FORCE is set. | ||
1732 | */ | ||
1733 | if (!(gup_flags & FOLL_FORCE)) | ||
1734 | gup_flags |= FOLL_NUMA; | ||
1735 | |||
1736 | i = 0; | ||
1737 | |||
1738 | do { | ||
1739 | struct vm_area_struct *vma; | ||
1740 | |||
1741 | vma = find_extend_vma(mm, start); | ||
1742 | if (!vma && in_gate_area(mm, start)) { | ||
1743 | unsigned long pg = start & PAGE_MASK; | ||
1744 | pgd_t *pgd; | ||
1745 | pud_t *pud; | ||
1746 | pmd_t *pmd; | ||
1747 | pte_t *pte; | ||
1748 | |||
1749 | /* user gate pages are read-only */ | ||
1750 | if (gup_flags & FOLL_WRITE) | ||
1751 | goto efault; | ||
1752 | if (pg > TASK_SIZE) | ||
1753 | pgd = pgd_offset_k(pg); | ||
1754 | else | ||
1755 | pgd = pgd_offset_gate(mm, pg); | ||
1756 | BUG_ON(pgd_none(*pgd)); | ||
1757 | pud = pud_offset(pgd, pg); | ||
1758 | BUG_ON(pud_none(*pud)); | ||
1759 | pmd = pmd_offset(pud, pg); | ||
1760 | if (pmd_none(*pmd)) | ||
1761 | goto efault; | ||
1762 | VM_BUG_ON(pmd_trans_huge(*pmd)); | ||
1763 | pte = pte_offset_map(pmd, pg); | ||
1764 | if (pte_none(*pte)) { | ||
1765 | pte_unmap(pte); | ||
1766 | goto efault; | ||
1767 | } | ||
1768 | vma = get_gate_vma(mm); | ||
1769 | if (pages) { | ||
1770 | struct page *page; | ||
1771 | |||
1772 | page = vm_normal_page(vma, start, *pte); | ||
1773 | if (!page) { | ||
1774 | if (!(gup_flags & FOLL_DUMP) && | ||
1775 | is_zero_pfn(pte_pfn(*pte))) | ||
1776 | page = pte_page(*pte); | ||
1777 | else { | ||
1778 | pte_unmap(pte); | ||
1779 | goto efault; | ||
1780 | } | ||
1781 | } | ||
1782 | pages[i] = page; | ||
1783 | get_page(page); | ||
1784 | } | ||
1785 | pte_unmap(pte); | ||
1786 | page_mask = 0; | ||
1787 | goto next_page; | ||
1788 | } | ||
1789 | |||
1790 | if (!vma) | ||
1791 | goto efault; | ||
1792 | vm_flags = vma->vm_flags; | ||
1793 | if (vm_flags & (VM_IO | VM_PFNMAP)) | ||
1794 | goto efault; | ||
1795 | |||
1796 | if (gup_flags & FOLL_WRITE) { | ||
1797 | if (!(vm_flags & VM_WRITE)) { | ||
1798 | if (!(gup_flags & FOLL_FORCE)) | ||
1799 | goto efault; | ||
1800 | /* | ||
1801 | * We used to let the write,force case do COW | ||
1802 | * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so | ||
1803 | * ptrace could set a breakpoint in a read-only | ||
1804 | * mapping of an executable, without corrupting | ||
1805 | * the file (yet only when that file had been | ||
1806 | * opened for writing!). Anon pages in shared | ||
1807 | * mappings are surprising: now just reject it. | ||
1808 | */ | ||
1809 | if (!is_cow_mapping(vm_flags)) { | ||
1810 | WARN_ON_ONCE(vm_flags & VM_MAYWRITE); | ||
1811 | goto efault; | ||
1812 | } | ||
1813 | } | ||
1814 | } else { | ||
1815 | if (!(vm_flags & VM_READ)) { | ||
1816 | if (!(gup_flags & FOLL_FORCE)) | ||
1817 | goto efault; | ||
1818 | /* | ||
1819 | * Is there actually any vma we can reach here | ||
1820 | * which does not have VM_MAYREAD set? | ||
1821 | */ | ||
1822 | if (!(vm_flags & VM_MAYREAD)) | ||
1823 | goto efault; | ||
1824 | } | ||
1825 | } | ||
1826 | |||
1827 | if (is_vm_hugetlb_page(vma)) { | ||
1828 | i = follow_hugetlb_page(mm, vma, pages, vmas, | ||
1829 | &start, &nr_pages, i, gup_flags); | ||
1830 | continue; | ||
1831 | } | ||
1832 | |||
1833 | do { | ||
1834 | struct page *page; | ||
1835 | unsigned int foll_flags = gup_flags; | ||
1836 | unsigned int page_increm; | ||
1837 | |||
1838 | /* | ||
1839 | * If we have a pending SIGKILL, don't keep faulting | ||
1840 | * pages and potentially allocating memory. | ||
1841 | */ | ||
1842 | if (unlikely(fatal_signal_pending(current))) | ||
1843 | return i ? i : -ERESTARTSYS; | ||
1844 | |||
1845 | cond_resched(); | ||
1846 | while (!(page = follow_page_mask(vma, start, | ||
1847 | foll_flags, &page_mask))) { | ||
1848 | int ret; | ||
1849 | unsigned int fault_flags = 0; | ||
1850 | |||
1851 | /* For mlock, just skip the stack guard page. */ | ||
1852 | if (foll_flags & FOLL_MLOCK) { | ||
1853 | if (stack_guard_page(vma, start)) | ||
1854 | goto next_page; | ||
1855 | } | ||
1856 | if (foll_flags & FOLL_WRITE) | ||
1857 | fault_flags |= FAULT_FLAG_WRITE; | ||
1858 | if (nonblocking) | ||
1859 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | ||
1860 | if (foll_flags & FOLL_NOWAIT) | ||
1861 | fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); | ||
1862 | |||
1863 | ret = handle_mm_fault(mm, vma, start, | ||
1864 | fault_flags); | ||
1865 | |||
1866 | if (ret & VM_FAULT_ERROR) { | ||
1867 | if (ret & VM_FAULT_OOM) | ||
1868 | return i ? i : -ENOMEM; | ||
1869 | if (ret & (VM_FAULT_HWPOISON | | ||
1870 | VM_FAULT_HWPOISON_LARGE)) { | ||
1871 | if (i) | ||
1872 | return i; | ||
1873 | else if (gup_flags & FOLL_HWPOISON) | ||
1874 | return -EHWPOISON; | ||
1875 | else | ||
1876 | return -EFAULT; | ||
1877 | } | ||
1878 | if (ret & VM_FAULT_SIGBUS) | ||
1879 | goto efault; | ||
1880 | BUG(); | ||
1881 | } | ||
1882 | |||
1883 | if (tsk) { | ||
1884 | if (ret & VM_FAULT_MAJOR) | ||
1885 | tsk->maj_flt++; | ||
1886 | else | ||
1887 | tsk->min_flt++; | ||
1888 | } | ||
1889 | |||
1890 | if (ret & VM_FAULT_RETRY) { | ||
1891 | if (nonblocking) | ||
1892 | *nonblocking = 0; | ||
1893 | return i; | ||
1894 | } | ||
1895 | |||
1896 | /* | ||
1897 | * The VM_FAULT_WRITE bit tells us that | ||
1898 | * do_wp_page has broken COW when necessary, | ||
1899 | * even if maybe_mkwrite decided not to set | ||
1900 | * pte_write. We can thus safely do subsequent | ||
1901 | * page lookups as if they were reads. But only | ||
1902 | * do so when looping for pte_write is futile: | ||
1903 | * in some cases userspace may also be wanting | ||
1904 | * to write to the gotten user page, which a | ||
1905 | * read fault here might prevent (a readonly | ||
1906 | * page might get reCOWed by userspace write). | ||
1907 | */ | ||
1908 | if ((ret & VM_FAULT_WRITE) && | ||
1909 | !(vma->vm_flags & VM_WRITE)) | ||
1910 | foll_flags &= ~FOLL_WRITE; | ||
1911 | |||
1912 | cond_resched(); | ||
1913 | } | ||
1914 | if (IS_ERR(page)) | ||
1915 | return i ? i : PTR_ERR(page); | ||
1916 | if (pages) { | ||
1917 | pages[i] = page; | ||
1918 | |||
1919 | flush_anon_page(vma, page, start); | ||
1920 | flush_dcache_page(page); | ||
1921 | page_mask = 0; | ||
1922 | } | ||
1923 | next_page: | ||
1924 | if (vmas) { | ||
1925 | vmas[i] = vma; | ||
1926 | page_mask = 0; | ||
1927 | } | ||
1928 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); | ||
1929 | if (page_increm > nr_pages) | ||
1930 | page_increm = nr_pages; | ||
1931 | i += page_increm; | ||
1932 | start += page_increm * PAGE_SIZE; | ||
1933 | nr_pages -= page_increm; | ||
1934 | } while (nr_pages && start < vma->vm_end); | ||
1935 | } while (nr_pages); | ||
1936 | return i; | ||
1937 | efault: | ||
1938 | return i ? : -EFAULT; | ||
1939 | } | ||
1940 | EXPORT_SYMBOL(__get_user_pages); | ||
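
For context, this is roughly how a kernel caller uses the interface that is being moved out of this file (to mm/gup.c): take mmap_sem for read, pin with get_user_pages(), then drop every page reference once the data has been used. A hedged fragment only; uaddr and the fixed count of 16 are illustrative, and error handling is abbreviated.

        struct page *pages[16];
        long i, got;

        down_read(&current->mm->mmap_sem);
        got = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
                             16, 1 /* write */, 0 /* force */, pages, NULL);
        up_read(&current->mm->mmap_sem);

        for (i = 0; i < got; i++) {
                set_page_dirty_lock(pages[i]);        /* the pages were written to */
                put_page(pages[i]);                   /* release the pin taken above */
        }
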
1941 | |||
1942 | /* | ||
1943 | * fixup_user_fault() - manually resolve a user page fault | ||
1944 | * @tsk: the task_struct to use for page fault accounting, or | ||
1945 | * NULL if faults are not to be recorded. | ||
1946 | * @mm: mm_struct of target mm | ||
1947 | * @address: user address | ||
1948 | * @fault_flags:flags to pass down to handle_mm_fault() | ||
1949 | * | ||
1950 | * This is meant to be called in the specific scenario where for locking reasons | ||
1951 | * we try to access user memory in atomic context (within a pagefault_disable() | ||
1952 | * section), this returns -EFAULT, and we want to resolve the user fault before | ||
1953 | * trying again. | ||
1954 | * | ||
1955 | * Typically this is meant to be used by the futex code. | ||
1956 | * | ||
1957 | * The main difference with get_user_pages() is that this function will | ||
1958 | * unconditionally call handle_mm_fault() which will in turn perform all the | ||
1959 | * necessary SW fixup of the dirty and young bits in the PTE, while | ||
1960 | * handle_mm_fault() only guarantees to update these in the struct page. | ||
1961 | * | ||
1962 | * This is important for some architectures where those bits also gate the | ||
1963 | * access permission to the page because they are maintained in software. On | ||
1964 | * such architectures, gup() will not be enough to make a subsequent access | ||
1965 | * succeed. | ||
1966 | * | ||
1967 | * This should be called with the mm_sem held for read. | ||
1968 | */ | ||
1969 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | ||
1970 | unsigned long address, unsigned int fault_flags) | ||
1971 | { | ||
1972 | struct vm_area_struct *vma; | ||
1973 | vm_flags_t vm_flags; | ||
1974 | int ret; | ||
1975 | |||
1976 | vma = find_extend_vma(mm, address); | ||
1977 | if (!vma || address < vma->vm_start) | ||
1978 | return -EFAULT; | ||
1979 | |||
1980 | vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; | ||
1981 | if (!(vm_flags & vma->vm_flags)) | ||
1982 | return -EFAULT; | ||
1983 | |||
1984 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
1985 | if (ret & VM_FAULT_ERROR) { | ||
1986 | if (ret & VM_FAULT_OOM) | ||
1987 | return -ENOMEM; | ||
1988 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
1989 | return -EHWPOISON; | ||
1990 | if (ret & VM_FAULT_SIGBUS) | ||
1991 | return -EFAULT; | ||
1992 | BUG(); | ||
1993 | } | ||
1994 | if (tsk) { | ||
1995 | if (ret & VM_FAULT_MAJOR) | ||
1996 | tsk->maj_flt++; | ||
1997 | else | ||
1998 | tsk->min_flt++; | ||
1999 | } | ||
2000 | return 0; | ||
2001 | } | ||
2002 | |||
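A minimal sketch of the calling pattern described above — fail inside a pagefault_disable() section, then resolve the fault under mmap_sem and retry. This is an illustration only; read_user_word() is a hypothetical caller and the real futex loop differs in detail.

	/* Hypothetical caller: retry a faulting user access outside the atomic section. */
	static int read_user_word(u32 __user *uaddr, u32 *dest)
	{
		int ret;

		do {
			pagefault_disable();
			ret = __copy_from_user_inatomic(dest, uaddr, sizeof(*dest));
			pagefault_enable();
			if (!ret)
				return 0;

			/* Fault the page in with mmap_sem held for read, then retry. */
			down_read(&current->mm->mmap_sem);
			ret = fixup_user_fault(current, current->mm,
					       (unsigned long)uaddr, 0);
			up_read(&current->mm->mmap_sem);
		} while (!ret);

		return ret;
	}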
2003 | /* | ||
2004 | * get_user_pages() - pin user pages in memory | ||
2005 | * @tsk: the task_struct to use for page fault accounting, or | ||
2006 | * NULL if faults are not to be recorded. | ||
2007 | * @mm: mm_struct of target mm | ||
2008 | * @start: starting user address | ||
2009 | * @nr_pages: number of pages from start to pin | ||
2010 | * @write: whether pages will be written to by the caller | ||
2011 | * @force: whether to force access even when user mapping is currently | ||
2012 | * protected (but never forces write access to shared mapping). | ||
2013 | * @pages: array that receives pointers to the pages pinned. | ||
2014 | * Should be at least nr_pages long. Or NULL, if caller | ||
2015 | * only intends to ensure the pages are faulted in. | ||
2016 | * @vmas: array of pointers to vmas corresponding to each page. | ||
2017 | * Or NULL if the caller does not require them. | ||
2018 | * | ||
2019 | * Returns number of pages pinned. This may be fewer than the number | ||
2020 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
2021 | * were pinned, returns -errno. Each page returned must be released | ||
2022 | * with a put_page() call when it is finished with. vmas will only | ||
2023 | * remain valid while mmap_sem is held. | ||
2024 | * | ||
2025 | * Must be called with mmap_sem held for read or write. | ||
2026 | * | ||
2027 | * get_user_pages walks a process's page tables and takes a reference to | ||
2028 | * each struct page that each user address corresponds to at a given | ||
2029 | * instant. That is, it takes the page that would be accessed if a user | ||
2030 | * thread accesses the given user virtual address at that instant. | ||
2031 | * | ||
2032 | * This does not guarantee that the page exists in the user mappings when | ||
2033 | * get_user_pages returns, and there may even be a completely different | ||
2034 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
2035 | * and subsequently re-faulted). However, it does guarantee that the page | ||
2036 | * won't be freed completely. And mostly callers simply care that the page | ||
2037 | * contains data that was valid *at some point in time*. Typically, an IO | ||
2038 | * or similar operation cannot guarantee anything stronger anyway because | ||
2039 | * locks can't be held over the syscall boundary. | ||
2040 | * | ||
2041 | * If write=0, the page must not be written to. If the page is written to, | ||
2042 | * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called | ||
2043 | * after the page is finished with, and before put_page is called. | ||
2044 | * | ||
2045 | * get_user_pages is typically used for fewer-copy IO operations, to get a | ||
2046 | * handle on the memory by some means other than accesses via the user virtual | ||
2047 | * addresses. The pages may be submitted for DMA to devices or accessed via | ||
2048 | * their kernel linear mapping (via the kmap APIs). Care should be taken to | ||
2049 | * use the correct cache flushing APIs. | ||
2050 | * | ||
2051 | * See also get_user_pages_fast, for performance critical applications. | ||
2052 | */ | ||
2053 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | ||
2054 | unsigned long start, unsigned long nr_pages, int write, | ||
2055 | int force, struct page **pages, struct vm_area_struct **vmas) | ||
2056 | { | ||
2057 | int flags = FOLL_TOUCH; | ||
2058 | |||
2059 | if (pages) | ||
2060 | flags |= FOLL_GET; | ||
2061 | if (write) | ||
2062 | flags |= FOLL_WRITE; | ||
2063 | if (force) | ||
2064 | flags |= FOLL_FORCE; | ||
2065 | |||
2066 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, | ||
2067 | NULL); | ||
2068 | } | ||
2069 | EXPORT_SYMBOL(get_user_pages); | ||
2070 | |||
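A hedged sketch of the pin/access/dirty/release discipline spelled out in the comment above; pin_and_zero_user_page() and the single-page count are illustrative, not code from this tree.

	/* Illustrative only: pin one user page for writing, zero it, release it. */
	static int pin_and_zero_user_page(unsigned long uaddr)
	{
		struct page *page;
		long ret;

		down_read(&current->mm->mmap_sem);
		ret = get_user_pages(current, current->mm, uaddr, 1,
				     1 /* write */, 0 /* force */, &page, NULL);
		up_read(&current->mm->mmap_sem);
		if (ret < 1)
			return ret < 0 ? ret : -EFAULT;

		memset(kmap(page), 0, PAGE_SIZE);	/* access via the kernel mapping */
		kunmap(page);
		flush_dcache_page(page);

		set_page_dirty_lock(page);		/* we wrote to the page */
		put_page(page);				/* drop the gup reference */
		return 0;
	}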
2071 | /** | ||
2072 | * get_dump_page() - pin user page in memory while writing it to core dump | ||
2073 | * @addr: user address | ||
2074 | * | ||
2075 | * Returns struct page pointer of user page pinned for dump, | ||
2076 | * to be freed afterwards by page_cache_release() or put_page(). | ||
2077 | * | ||
2078 | * Returns NULL on any kind of failure - a hole must then be inserted into | ||
2079 | * the corefile, to preserve alignment with its headers; and also returns | ||
2080 | * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - | ||
2081 | * allowing a hole to be left in the corefile to save diskspace. | ||
2082 | * | ||
2083 | * Called without mmap_sem, but after all other threads have been killed. | ||
2084 | */ | ||
2085 | #ifdef CONFIG_ELF_CORE | ||
2086 | struct page *get_dump_page(unsigned long addr) | ||
2087 | { | ||
2088 | struct vm_area_struct *vma; | ||
2089 | struct page *page; | ||
2090 | |||
2091 | if (__get_user_pages(current, current->mm, addr, 1, | ||
2092 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, | ||
2093 | NULL) < 1) | ||
2094 | return NULL; | ||
2095 | flush_cache_page(vma, addr, page_to_pfn(page)); | ||
2096 | return page; | ||
2097 | } | ||
2098 | #endif /* CONFIG_ELF_CORE */ | ||
2099 | |||
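A rough sketch of the coredump loop this helper serves; the emit/skip callbacks stand in for the real core-file writer and are not from this tree.

	/* Sketch: dump each page of a vma, leaving holes where get_dump_page() says so. */
	static int dump_vma_pages(struct vm_area_struct *vma,
				  int (*emit)(void *buf, size_t len),
				  int (*skip)(size_t len))
	{
		unsigned long addr;

		for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
			struct page *page = get_dump_page(addr);
			int ok;

			if (page) {
				void *kaddr = kmap(page);

				ok = emit(kaddr, PAGE_SIZE);
				kunmap(page);
				page_cache_release(page);	/* drop the pin */
			} else {
				ok = skip(PAGE_SIZE);		/* hole in the corefile */
			}
			if (!ok)
				return -EIO;
		}
		return 0;
	}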
2100 | pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, | 1456 | pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, |
2101 | spinlock_t **ptl) | 1457 | spinlock_t **ptl) |
2102 | { | 1458 | { |
@@ -3402,65 +2758,76 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, | |||
3402 | update_mmu_cache(vma, address, pte); | 2758 | update_mmu_cache(vma, address, pte); |
3403 | } | 2759 | } |
3404 | 2760 | ||
3405 | #define FAULT_AROUND_ORDER 4 | 2761 | static unsigned long fault_around_bytes = 65536; |
3406 | 2762 | ||
3407 | #ifdef CONFIG_DEBUG_FS | 2763 | /* |
3408 | static unsigned int fault_around_order = FAULT_AROUND_ORDER; | 2764 | * fault_around_pages() and fault_around_mask() round down fault_around_bytes |
2765 | * to nearest page order. It's what do_fault_around() expects to see. | ||
2766 | */ | ||
2767 | static inline unsigned long fault_around_pages(void) | ||
2768 | { | ||
2769 | return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE; | ||
2770 | } | ||
2771 | |||
2772 | static inline unsigned long fault_around_mask(void) | ||
2773 | { | ||
2774 | return ~(rounddown_pow_of_two(fault_around_bytes) - 1) & PAGE_MASK; | ||
2775 | } | ||
3409 | 2776 | ||
3410 | static int fault_around_order_get(void *data, u64 *val) | 2777 | |
2778 | #ifdef CONFIG_DEBUG_FS | ||
2779 | static int fault_around_bytes_get(void *data, u64 *val) | ||
3411 | { | 2780 | { |
3412 | *val = fault_around_order; | 2781 | *val = fault_around_bytes; |
3413 | return 0; | 2782 | return 0; |
3414 | } | 2783 | } |
3415 | 2784 | ||
3416 | static int fault_around_order_set(void *data, u64 val) | 2785 | static int fault_around_bytes_set(void *data, u64 val) |
3417 | { | 2786 | { |
3418 | BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); | 2787 | if (val / PAGE_SIZE > PTRS_PER_PTE) |
3419 | if (1UL << val > PTRS_PER_PTE) | ||
3420 | return -EINVAL; | 2788 | return -EINVAL; |
3421 | fault_around_order = val; | 2789 | fault_around_bytes = val; |
3422 | return 0; | 2790 | return 0; |
3423 | } | 2791 | } |
3424 | DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, | 2792 | DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, |
3425 | fault_around_order_get, fault_around_order_set, "%llu\n"); | 2793 | fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); |
3426 | 2794 | ||
3427 | static int __init fault_around_debugfs(void) | 2795 | static int __init fault_around_debugfs(void) |
3428 | { | 2796 | { |
3429 | void *ret; | 2797 | void *ret; |
3430 | 2798 | ||
3431 | ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, | 2799 | ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, |
3432 | &fault_around_order_fops); | 2800 | &fault_around_bytes_fops); |
3433 | if (!ret) | 2801 | if (!ret) |
3434 | pr_warn("Failed to create fault_around_order in debugfs"); | 2802 | pr_warn("Failed to create fault_around_bytes in debugfs"); |
3435 | return 0; | 2803 | return 0; |
3436 | } | 2804 | } |
3437 | late_initcall(fault_around_debugfs); | 2805 | late_initcall(fault_around_debugfs); |
3438 | |||
3439 | static inline unsigned long fault_around_pages(void) | ||
3440 | { | ||
3441 | return 1UL << fault_around_order; | ||
3442 | } | ||
3443 | |||
3444 | static inline unsigned long fault_around_mask(void) | ||
3445 | { | ||
3446 | return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1); | ||
3447 | } | ||
3448 | #else | ||
3449 | static inline unsigned long fault_around_pages(void) | ||
3450 | { | ||
3451 | unsigned long nr_pages; | ||
3452 | |||
3453 | nr_pages = 1UL << FAULT_AROUND_ORDER; | ||
3454 | BUILD_BUG_ON(nr_pages > PTRS_PER_PTE); | ||
3455 | return nr_pages; | ||
3456 | } | ||
3457 | |||
3458 | static inline unsigned long fault_around_mask(void) | ||
3459 | { | ||
3460 | return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1); | ||
3461 | } | ||
3462 | #endif | 2806 | #endif |
3463 | 2807 | ||
2808 | /* | ||
2809 | * do_fault_around() tries to map a few pages around the fault address. The hope | ||
2810 | * is that the pages will be needed soon and this will lower the number of | ||
2811 | * faults to handle. | ||
2812 | * | ||
2813 | * It uses vm_ops->map_pages() to map the pages, which skips the page if it's | ||
2814 | * not ready to be mapped: not up-to-date, locked, etc. | ||
2815 | * | ||
2816 | * This function is called with the page table lock taken. In the split ptlock | ||
2817 | * case the page table lock protects only those entries which belong to | ||
2818 | * the page table corresponding to the fault address. | ||
2819 | * | ||
2820 | * This function doesn't cross the VMA boundaries, in order to call map_pages() | ||
2821 | * only once. | ||
2822 | * | ||
2823 | * fault_around_pages() defines how many pages we'll try to map. | ||
2824 | * do_fault_around() expects it to return a power of two less than or equal to | ||
2825 | * PTRS_PER_PTE. | ||
2826 | * | ||
2827 | * The virtual address of the area that we map is naturally aligned to the | ||
2828 | * fault_around_pages() value (and therefore to page order). This way it's | ||
2829 | * easier to guarantee that we don't cross page table boundaries. | ||
2830 | */ | ||
3464 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | 2831 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, |
3465 | pte_t *pte, pgoff_t pgoff, unsigned int flags) | 2832 | pte_t *pte, pgoff_t pgoff, unsigned int flags) |
3466 | { | 2833 | { |
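For concreteness, the rounding works out as follows with the default value and 4 KiB pages; the knob can be tuned at runtime through the fault_around_bytes file created above in the debugfs root. The figures below are a worked example, not code from the patch.

	/* Worked example, assuming PAGE_SIZE == 4096 (PAGE_SHIFT == 12):          */
	/*   fault_around_bytes   = 65536 (default)                                */
	/*   fault_around_pages() = rounddown_pow_of_two(65536) / 4096 = 16        */
	/*   fault_around_mask()  = ~(65536 - 1) & PAGE_MASK = ~0xffffUL           */
	/* so do_fault_around() starts at max(address & ~0xffffUL, vma->vm_start)  */
	/* and asks ->map_pages() for at most 16 ptes within one page table.       */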
@@ -3476,7 +2843,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | |||
3476 | 2843 | ||
3477 | /* | 2844 | /* |
3478 | * max_pgoff is either end of page table or end of vma | 2845 | * max_pgoff is either end of page table or end of vma |
3479 | * or fault_around_pages() from pgoff, depending what is neast. | 2846 | * or fault_around_pages() from pgoff, depending what is nearest. |
3480 | */ | 2847 | */ |
3481 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | 2848 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + |
3482 | PTRS_PER_PTE - 1; | 2849 | PTRS_PER_PTE - 1; |
@@ -3515,7 +2882,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3515 | * if page by the offset is not ready to be mapped (cold cache or | 2882 | * if page by the offset is not ready to be mapped (cold cache or |
3516 | * something). | 2883 | * something). |
3517 | */ | 2884 | */ |
3518 | if (vma->vm_ops->map_pages) { | 2885 | if (vma->vm_ops->map_pages && fault_around_pages() > 1) { |
3519 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2886 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
3520 | do_fault_around(vma, address, pte, pgoff, flags); | 2887 | do_fault_around(vma, address, pte, pgoff, flags); |
3521 | if (!pte_same(*pte, orig_pte)) | 2888 | if (!pte_same(*pte, orig_pte)) |
@@ -3920,9 +3287,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3920 | } | 3287 | } |
3921 | } | 3288 | } |
3922 | 3289 | ||
3923 | /* THP should already have been handled */ | ||
3924 | BUG_ON(pmd_numa(*pmd)); | ||
3925 | |||
3926 | /* | 3290 | /* |
3927 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3291 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
3928 | * run pte_offset_map on the pmd, if an huge pmd could | 3292 | * run pte_offset_map on the pmd, if an huge pmd could |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a650db29606f..469bbf505f85 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -46,19 +46,84 @@ | |||
46 | static void generic_online_page(struct page *page); | 46 | static void generic_online_page(struct page *page); |
47 | 47 | ||
48 | static online_page_callback_t online_page_callback = generic_online_page; | 48 | static online_page_callback_t online_page_callback = generic_online_page; |
49 | static DEFINE_MUTEX(online_page_callback_lock); | ||
49 | 50 | ||
50 | DEFINE_MUTEX(mem_hotplug_mutex); | 51 | /* The same as the cpu_hotplug lock, but for memory hotplug. */ |
52 | static struct { | ||
53 | struct task_struct *active_writer; | ||
54 | struct mutex lock; /* Synchronizes accesses to refcount, */ | ||
55 | /* | ||
56 | * Also blocks the new readers during | ||
57 | * an ongoing mem hotplug operation. | ||
58 | */ | ||
59 | int refcount; | ||
60 | |||
61 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
62 | struct lockdep_map dep_map; | ||
63 | #endif | ||
64 | } mem_hotplug = { | ||
65 | .active_writer = NULL, | ||
66 | .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), | ||
67 | .refcount = 0, | ||
68 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
69 | .dep_map = {.name = "mem_hotplug.lock" }, | ||
70 | #endif | ||
71 | }; | ||
72 | |||
73 | /* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ | ||
74 | #define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) | ||
75 | #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) | ||
76 | #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) | ||
77 | |||
78 | void get_online_mems(void) | ||
79 | { | ||
80 | might_sleep(); | ||
81 | if (mem_hotplug.active_writer == current) | ||
82 | return; | ||
83 | memhp_lock_acquire_read(); | ||
84 | mutex_lock(&mem_hotplug.lock); | ||
85 | mem_hotplug.refcount++; | ||
86 | mutex_unlock(&mem_hotplug.lock); | ||
87 | |||
88 | } | ||
51 | 89 | ||
52 | void lock_memory_hotplug(void) | 90 | void put_online_mems(void) |
53 | { | 91 | { |
54 | mutex_lock(&mem_hotplug_mutex); | 92 | if (mem_hotplug.active_writer == current) |
93 | return; | ||
94 | mutex_lock(&mem_hotplug.lock); | ||
95 | |||
96 | if (WARN_ON(!mem_hotplug.refcount)) | ||
97 | mem_hotplug.refcount++; /* try to fix things up */ | ||
98 | |||
99 | if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) | ||
100 | wake_up_process(mem_hotplug.active_writer); | ||
101 | mutex_unlock(&mem_hotplug.lock); | ||
102 | memhp_lock_release(); | ||
103 | |||
55 | } | 104 | } |
56 | 105 | ||
57 | void unlock_memory_hotplug(void) | 106 | static void mem_hotplug_begin(void) |
58 | { | 107 | { |
59 | mutex_unlock(&mem_hotplug_mutex); | 108 | mem_hotplug.active_writer = current; |
109 | |||
110 | memhp_lock_acquire(); | ||
111 | for (;;) { | ||
112 | mutex_lock(&mem_hotplug.lock); | ||
113 | if (likely(!mem_hotplug.refcount)) | ||
114 | break; | ||
115 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
116 | mutex_unlock(&mem_hotplug.lock); | ||
117 | schedule(); | ||
118 | } | ||
60 | } | 119 | } |
61 | 120 | ||
121 | static void mem_hotplug_done(void) | ||
122 | { | ||
123 | mem_hotplug.active_writer = NULL; | ||
124 | mutex_unlock(&mem_hotplug.lock); | ||
125 | memhp_lock_release(); | ||
126 | } | ||
62 | 127 | ||
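A short sketch of how the two sides pair up, in the same spirit as get/put_online_cpus(); the reader function below is hypothetical, while the writer pattern is the one used by online_pages() and add_memory() later in this file.

	/* Reader side: hold off hot-add/remove while we look at the memory layout. */
	static void inspect_memory_layout(void)
	{
		get_online_mems();
		/* ... walk zones/sections; no hotplug can run concurrently ... */
		put_online_mems();
	}

	/* Writer side (the hotplug paths below):
	 *	mem_hotplug_begin();
	 *	...online/offline pages, grow or shrink the node...
	 *	mem_hotplug_done();
	 */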
63 | /* add this memory to iomem resource */ | 128 | /* add this memory to iomem resource */ |
64 | static struct resource *register_memory_resource(u64 start, u64 size) | 129 | static struct resource *register_memory_resource(u64 start, u64 size) |
@@ -727,14 +792,16 @@ int set_online_page_callback(online_page_callback_t callback) | |||
727 | { | 792 | { |
728 | int rc = -EINVAL; | 793 | int rc = -EINVAL; |
729 | 794 | ||
730 | lock_memory_hotplug(); | 795 | get_online_mems(); |
796 | mutex_lock(&online_page_callback_lock); | ||
731 | 797 | ||
732 | if (online_page_callback == generic_online_page) { | 798 | if (online_page_callback == generic_online_page) { |
733 | online_page_callback = callback; | 799 | online_page_callback = callback; |
734 | rc = 0; | 800 | rc = 0; |
735 | } | 801 | } |
736 | 802 | ||
737 | unlock_memory_hotplug(); | 803 | mutex_unlock(&online_page_callback_lock); |
804 | put_online_mems(); | ||
738 | 805 | ||
739 | return rc; | 806 | return rc; |
740 | } | 807 | } |
@@ -744,14 +811,16 @@ int restore_online_page_callback(online_page_callback_t callback) | |||
744 | { | 811 | { |
745 | int rc = -EINVAL; | 812 | int rc = -EINVAL; |
746 | 813 | ||
747 | lock_memory_hotplug(); | 814 | get_online_mems(); |
815 | mutex_lock(&online_page_callback_lock); | ||
748 | 816 | ||
749 | if (online_page_callback == callback) { | 817 | if (online_page_callback == callback) { |
750 | online_page_callback = generic_online_page; | 818 | online_page_callback = generic_online_page; |
751 | rc = 0; | 819 | rc = 0; |
752 | } | 820 | } |
753 | 821 | ||
754 | unlock_memory_hotplug(); | 822 | mutex_unlock(&online_page_callback_lock); |
823 | put_online_mems(); | ||
755 | 824 | ||
756 | return rc; | 825 | return rc; |
757 | } | 826 | } |
@@ -899,7 +968,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
899 | int ret; | 968 | int ret; |
900 | struct memory_notify arg; | 969 | struct memory_notify arg; |
901 | 970 | ||
902 | lock_memory_hotplug(); | 971 | mem_hotplug_begin(); |
903 | /* | 972 | /* |
904 | * This doesn't need a lock to do pfn_to_page(). | 973 | * This doesn't need a lock to do pfn_to_page(). |
905 | * The section can't be removed here because of the | 974 | * The section can't be removed here because of the |
@@ -907,23 +976,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
907 | */ | 976 | */ |
908 | zone = page_zone(pfn_to_page(pfn)); | 977 | zone = page_zone(pfn_to_page(pfn)); |
909 | 978 | ||
979 | ret = -EINVAL; | ||
910 | if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && | 980 | if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && |
911 | !can_online_high_movable(zone)) { | 981 | !can_online_high_movable(zone)) |
912 | unlock_memory_hotplug(); | 982 | goto out; |
913 | return -EINVAL; | ||
914 | } | ||
915 | 983 | ||
916 | if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { | 984 | if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { |
917 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { | 985 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) |
918 | unlock_memory_hotplug(); | 986 | goto out; |
919 | return -EINVAL; | ||
920 | } | ||
921 | } | 987 | } |
922 | if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { | 988 | if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { |
923 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { | 989 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) |
924 | unlock_memory_hotplug(); | 990 | goto out; |
925 | return -EINVAL; | ||
926 | } | ||
927 | } | 991 | } |
928 | 992 | ||
929 | /* Previous code may have changed the zone of the pfn range */ | 993 | /* Previous code may have changed the zone of the pfn range */
@@ -939,8 +1003,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
939 | ret = notifier_to_errno(ret); | 1003 | ret = notifier_to_errno(ret); |
940 | if (ret) { | 1004 | if (ret) { |
941 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 1005 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
942 | unlock_memory_hotplug(); | 1006 | goto out; |
943 | return ret; | ||
944 | } | 1007 | } |
945 | /* | 1008 | /* |
946 | * If this zone is not populated, then it is not in zonelist. | 1009 | * If this zone is not populated, then it is not in zonelist. |
@@ -964,8 +1027,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
964 | (((unsigned long long) pfn + nr_pages) | 1027 | (((unsigned long long) pfn + nr_pages) |
965 | << PAGE_SHIFT) - 1); | 1028 | << PAGE_SHIFT) - 1); |
966 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 1029 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
967 | unlock_memory_hotplug(); | 1030 | goto out; |
968 | return ret; | ||
969 | } | 1031 | } |
970 | 1032 | ||
971 | zone->present_pages += onlined_pages; | 1033 | zone->present_pages += onlined_pages; |
@@ -995,9 +1057,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
995 | 1057 | ||
996 | if (onlined_pages) | 1058 | if (onlined_pages) |
997 | memory_notify(MEM_ONLINE, &arg); | 1059 | memory_notify(MEM_ONLINE, &arg); |
998 | unlock_memory_hotplug(); | 1060 | out: |
999 | 1061 | mem_hotplug_done(); | |
1000 | return 0; | 1062 | return ret; |
1001 | } | 1063 | } |
1002 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ | 1064 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ |
1003 | 1065 | ||
@@ -1007,7 +1069,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
1007 | struct pglist_data *pgdat; | 1069 | struct pglist_data *pgdat; |
1008 | unsigned long zones_size[MAX_NR_ZONES] = {0}; | 1070 | unsigned long zones_size[MAX_NR_ZONES] = {0}; |
1009 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; | 1071 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; |
1010 | unsigned long start_pfn = start >> PAGE_SHIFT; | 1072 | unsigned long start_pfn = PFN_DOWN(start); |
1011 | 1073 | ||
1012 | pgdat = NODE_DATA(nid); | 1074 | pgdat = NODE_DATA(nid); |
1013 | if (!pgdat) { | 1075 | if (!pgdat) { |
@@ -1055,7 +1117,7 @@ int try_online_node(int nid) | |||
1055 | if (node_online(nid)) | 1117 | if (node_online(nid)) |
1056 | return 0; | 1118 | return 0; |
1057 | 1119 | ||
1058 | lock_memory_hotplug(); | 1120 | mem_hotplug_begin(); |
1059 | pgdat = hotadd_new_pgdat(nid, 0); | 1121 | pgdat = hotadd_new_pgdat(nid, 0); |
1060 | if (!pgdat) { | 1122 | if (!pgdat) { |
1061 | pr_err("Cannot online node %d due to NULL pgdat\n", nid); | 1123 | pr_err("Cannot online node %d due to NULL pgdat\n", nid); |
@@ -1073,13 +1135,13 @@ int try_online_node(int nid) | |||
1073 | } | 1135 | } |
1074 | 1136 | ||
1075 | out: | 1137 | out: |
1076 | unlock_memory_hotplug(); | 1138 | mem_hotplug_done(); |
1077 | return ret; | 1139 | return ret; |
1078 | } | 1140 | } |
1079 | 1141 | ||
1080 | static int check_hotplug_memory_range(u64 start, u64 size) | 1142 | static int check_hotplug_memory_range(u64 start, u64 size) |
1081 | { | 1143 | { |
1082 | u64 start_pfn = start >> PAGE_SHIFT; | 1144 | u64 start_pfn = PFN_DOWN(start); |
1083 | u64 nr_pages = size >> PAGE_SHIFT; | 1145 | u64 nr_pages = size >> PAGE_SHIFT; |
1084 | 1146 | ||
1085 | /* Memory range must be aligned with section */ | 1147 | /* Memory range must be aligned with section */ |
@@ -1117,7 +1179,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
1117 | new_pgdat = !p; | 1179 | new_pgdat = !p; |
1118 | } | 1180 | } |
1119 | 1181 | ||
1120 | lock_memory_hotplug(); | 1182 | mem_hotplug_begin(); |
1121 | 1183 | ||
1122 | new_node = !node_online(nid); | 1184 | new_node = !node_online(nid); |
1123 | if (new_node) { | 1185 | if (new_node) { |
@@ -1158,7 +1220,7 @@ error: | |||
1158 | release_memory_resource(res); | 1220 | release_memory_resource(res); |
1159 | 1221 | ||
1160 | out: | 1222 | out: |
1161 | unlock_memory_hotplug(); | 1223 | mem_hotplug_done(); |
1162 | return ret; | 1224 | return ret; |
1163 | } | 1225 | } |
1164 | EXPORT_SYMBOL_GPL(add_memory); | 1226 | EXPORT_SYMBOL_GPL(add_memory); |
@@ -1332,7 +1394,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1332 | * alloc_migrate_target should be improooooved!! | 1394 | * alloc_migrate_target should be improooooved!! |
1333 | * migrate_pages returns # of failed pages. | 1395 | * migrate_pages returns # of failed pages. |
1334 | */ | 1396 | */ |
1335 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1397 | ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, |
1336 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); | 1398 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); |
1337 | if (ret) | 1399 | if (ret) |
1338 | putback_movable_pages(&source); | 1400 | putback_movable_pages(&source); |
@@ -1565,7 +1627,7 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
1565 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | 1627 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) |
1566 | return -EINVAL; | 1628 | return -EINVAL; |
1567 | 1629 | ||
1568 | lock_memory_hotplug(); | 1630 | mem_hotplug_begin(); |
1569 | 1631 | ||
1570 | zone = page_zone(pfn_to_page(start_pfn)); | 1632 | zone = page_zone(pfn_to_page(start_pfn)); |
1571 | node = zone_to_nid(zone); | 1633 | node = zone_to_nid(zone); |
@@ -1672,7 +1734,7 @@ repeat: | |||
1672 | writeback_set_ratelimit(); | 1734 | writeback_set_ratelimit(); |
1673 | 1735 | ||
1674 | memory_notify(MEM_OFFLINE, &arg); | 1736 | memory_notify(MEM_OFFLINE, &arg); |
1675 | unlock_memory_hotplug(); | 1737 | mem_hotplug_done(); |
1676 | return 0; | 1738 | return 0; |
1677 | 1739 | ||
1678 | failed_removal: | 1740 | failed_removal: |
@@ -1684,7 +1746,7 @@ failed_removal: | |||
1684 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1746 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
1685 | 1747 | ||
1686 | out: | 1748 | out: |
1687 | unlock_memory_hotplug(); | 1749 | mem_hotplug_done(); |
1688 | return ret; | 1750 | return ret; |
1689 | } | 1751 | } |
1690 | 1752 | ||
@@ -1888,7 +1950,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) | |||
1888 | 1950 | ||
1889 | BUG_ON(check_hotplug_memory_range(start, size)); | 1951 | BUG_ON(check_hotplug_memory_range(start, size)); |
1890 | 1952 | ||
1891 | lock_memory_hotplug(); | 1953 | mem_hotplug_begin(); |
1892 | 1954 | ||
1893 | /* | 1955 | /* |
1894 | * All memory blocks must be offlined before removing memory. Check | 1956 | * All memory blocks must be offlined before removing memory. Check |
@@ -1897,10 +1959,8 @@ void __ref remove_memory(int nid, u64 start, u64 size) | |||
1897 | */ | 1959 | */ |
1898 | ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, | 1960 | ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, |
1899 | check_memblock_offlined_cb); | 1961 | check_memblock_offlined_cb); |
1900 | if (ret) { | 1962 | if (ret) |
1901 | unlock_memory_hotplug(); | ||
1902 | BUG(); | 1963 | BUG(); |
1903 | } | ||
1904 | 1964 | ||
1905 | /* remove memmap entry */ | 1965 | /* remove memmap entry */ |
1906 | firmware_map_remove(start, start + size, "System RAM"); | 1966 | firmware_map_remove(start, start + size, "System RAM"); |
@@ -1909,7 +1969,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) | |||
1909 | 1969 | ||
1910 | try_offline_node(nid); | 1970 | try_offline_node(nid); |
1911 | 1971 | ||
1912 | unlock_memory_hotplug(); | 1972 | mem_hotplug_done(); |
1913 | } | 1973 | } |
1914 | EXPORT_SYMBOL_GPL(remove_memory); | 1974 | EXPORT_SYMBOL_GPL(remove_memory); |
1915 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 1975 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 78e1472933ea..16bc9fa42998 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
1028 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 1028 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
1029 | 1029 | ||
1030 | if (!list_empty(&pagelist)) { | 1030 | if (!list_empty(&pagelist)) { |
1031 | err = migrate_pages(&pagelist, new_node_page, dest, | 1031 | err = migrate_pages(&pagelist, new_node_page, NULL, dest, |
1032 | MIGRATE_SYNC, MR_SYSCALL); | 1032 | MIGRATE_SYNC, MR_SYSCALL); |
1033 | if (err) | 1033 | if (err) |
1034 | putback_movable_pages(&pagelist); | 1034 | putback_movable_pages(&pagelist); |
@@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1277 | if (!list_empty(&pagelist)) { | 1277 | if (!list_empty(&pagelist)) { |
1278 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | 1278 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); |
1279 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1279 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1280 | (unsigned long)vma, | 1280 | NULL, (unsigned long)vma, |
1281 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); | 1281 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); |
1282 | if (nr_failed) | 1282 | if (nr_failed) |
1283 | putback_movable_pages(&pagelist); | 1283 | putback_movable_pages(&pagelist); |
@@ -1362,7 +1362,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | |||
1362 | } | 1362 | } |
1363 | 1363 | ||
1364 | SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, | 1364 | SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, |
1365 | unsigned long, mode, unsigned long __user *, nmask, | 1365 | unsigned long, mode, const unsigned long __user *, nmask, |
1366 | unsigned long, maxnode, unsigned, flags) | 1366 | unsigned long, maxnode, unsigned, flags) |
1367 | { | 1367 | { |
1368 | nodemask_t nodes; | 1368 | nodemask_t nodes; |
@@ -1383,7 +1383,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, | |||
1383 | } | 1383 | } |
1384 | 1384 | ||
1385 | /* Set the process memory policy */ | 1385 | /* Set the process memory policy */ |
1386 | SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, | 1386 | SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, |
1387 | unsigned long, maxnode) | 1387 | unsigned long, maxnode) |
1388 | { | 1388 | { |
1389 | int err; | 1389 | int err; |
@@ -1606,9 +1606,9 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, | |||
1606 | 1606 | ||
1607 | /* | 1607 | /* |
1608 | * get_vma_policy(@task, @vma, @addr) | 1608 | * get_vma_policy(@task, @vma, @addr) |
1609 | * @task - task for fallback if vma policy == default | 1609 | * @task: task for fallback if vma policy == default |
1610 | * @vma - virtual memory area whose policy is sought | 1610 | * @vma: virtual memory area whose policy is sought |
1611 | * @addr - address in @vma for shared policy lookup | 1611 | * @addr: address in @vma for shared policy lookup |
1612 | * | 1612 | * |
1613 | * Returns effective policy for a VMA at specified address. | 1613 | * Returns effective policy for a VMA at specified address. |
1614 | * Falls back to @task or system default policy, as necessary. | 1614 | * Falls back to @task or system default policy, as necessary. |
@@ -1854,11 +1854,11 @@ int node_random(const nodemask_t *maskp) | |||
1854 | #ifdef CONFIG_HUGETLBFS | 1854 | #ifdef CONFIG_HUGETLBFS |
1855 | /* | 1855 | /* |
1856 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) | 1856 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) |
1857 | * @vma = virtual memory area whose policy is sought | 1857 | * @vma: virtual memory area whose policy is sought |
1858 | * @addr = address in @vma for shared policy lookup and interleave policy | 1858 | * @addr: address in @vma for shared policy lookup and interleave policy |
1859 | * @gfp_flags = for requested zone | 1859 | * @gfp_flags: for requested zone |
1860 | * @mpol = pointer to mempolicy pointer for reference counted mempolicy | 1860 | * @mpol: pointer to mempolicy pointer for reference counted mempolicy |
1861 | * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask | 1861 | * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask |
1862 | * | 1862 | * |
1863 | * Returns a zonelist suitable for a huge page allocation and a pointer | 1863 | * Returns a zonelist suitable for a huge page allocation and a pointer |
1864 | * to the struct mempolicy for conditional unref after allocation. | 1864 | * to the struct mempolicy for conditional unref after allocation. |
@@ -2270,9 +2270,9 @@ static void sp_free(struct sp_node *n) | |||
2270 | /** | 2270 | /** |
2271 | * mpol_misplaced - check whether current page node is valid in policy | 2271 | * mpol_misplaced - check whether current page node is valid in policy |
2272 | * | 2272 | * |
2273 | * @page - page to be checked | 2273 | * @page: page to be checked |
2274 | * @vma - vm area where page mapped | 2274 | * @vma: vm area where page mapped |
2275 | * @addr - virtual address where page mapped | 2275 | * @addr: virtual address where page mapped |
2276 | * | 2276 | * |
2277 | * Lookup current policy node id for vma,addr and "compare to" page's | 2277 | * Lookup current policy node id for vma,addr and "compare to" page's |
2278 | * node id. | 2278 | * node id. |
diff --git a/mm/mempool.c b/mm/mempool.c index 905434f18c97..455d468c3a5d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -192,6 +192,7 @@ EXPORT_SYMBOL(mempool_resize); | |||
192 | * returns NULL. Note that due to preallocation, this function | 192 | * returns NULL. Note that due to preallocation, this function |
193 | * *never* fails when called from process contexts. (it might | 193 | * *never* fails when called from process contexts. (it might |
194 | * fail if called from an IRQ context.) | 194 | * fail if called from an IRQ context.) |
195 | * Note: using __GFP_ZERO is not supported. | ||
195 | */ | 196 | */ |
196 | void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) | 197 | void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) |
197 | { | 198 | { |
@@ -200,6 +201,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) | |||
200 | wait_queue_t wait; | 201 | wait_queue_t wait; |
201 | gfp_t gfp_temp; | 202 | gfp_t gfp_temp; |
202 | 203 | ||
204 | VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); | ||
203 | might_sleep_if(gfp_mask & __GFP_WAIT); | 205 | might_sleep_if(gfp_mask & __GFP_WAIT); |
204 | 206 | ||
205 | gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ | 207 | gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ |
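A tiny sketch of the rule the new warning enforces: callers that need zeroed elements should clear them after mempool_alloc() rather than passing __GFP_ZERO. The pool and element type here are hypothetical.

	/* Hypothetical element type and pool; __GFP_ZERO would trip the VM_WARN_ON_ONCE() above. */
	struct foo_elem *elem = mempool_alloc(foo_pool, GFP_NOIO);
	if (elem)
		memset(elem, 0, sizeof(*elem));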
diff --git a/mm/migrate.c b/mm/migrate.c index bed48809e5d0..63f0cd559999 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -938,8 +938,9 @@ out: | |||
938 | * Obtain the lock on page, remove all ptes and migrate the page | 938 | * Obtain the lock on page, remove all ptes and migrate the page |
939 | * to the newly allocated page in newpage. | 939 | * to the newly allocated page in newpage. |
940 | */ | 940 | */ |
941 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 941 | static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, |
942 | struct page *page, int force, enum migrate_mode mode) | 942 | unsigned long private, struct page *page, int force, |
943 | enum migrate_mode mode) | ||
943 | { | 944 | { |
944 | int rc = 0; | 945 | int rc = 0; |
945 | int *result = NULL; | 946 | int *result = NULL; |
@@ -983,11 +984,17 @@ out: | |||
983 | page_is_file_cache(page)); | 984 | page_is_file_cache(page)); |
984 | putback_lru_page(page); | 985 | putback_lru_page(page); |
985 | } | 986 | } |
987 | |||
986 | /* | 988 | /* |
987 | * Move the new page to the LRU. If migration was not successful | 989 | * If migration was not successful and there's a freeing callback, use |
988 | * then this will free the page. | 990 | * it. Otherwise, putback_lru_page() will drop the reference grabbed |
991 | * during isolation. | ||
989 | */ | 992 | */ |
990 | putback_lru_page(newpage); | 993 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) |
994 | put_new_page(newpage, private); | ||
995 | else | ||
996 | putback_lru_page(newpage); | ||
997 | |||
991 | if (result) { | 998 | if (result) { |
992 | if (rc) | 999 | if (rc) |
993 | *result = rc; | 1000 | *result = rc; |
@@ -1016,8 +1023,9 @@ out: | |||
1016 | * will wait in the page fault for migration to complete. | 1023 | * will wait in the page fault for migration to complete. |
1017 | */ | 1024 | */ |
1018 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 1025 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
1019 | unsigned long private, struct page *hpage, | 1026 | free_page_t put_new_page, unsigned long private, |
1020 | int force, enum migrate_mode mode) | 1027 | struct page *hpage, int force, |
1028 | enum migrate_mode mode) | ||
1021 | { | 1029 | { |
1022 | int rc = 0; | 1030 | int rc = 0; |
1023 | int *result = NULL; | 1031 | int *result = NULL; |
@@ -1031,7 +1039,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1031 | * tables or check whether the hugepage is pmd-based or not before | 1039 | * tables or check whether the hugepage is pmd-based or not before |
1032 | * kicking migration. | 1040 | * kicking migration. |
1033 | */ | 1041 | */ |
1034 | if (!hugepage_migration_support(page_hstate(hpage))) { | 1042 | if (!hugepage_migration_supported(page_hstate(hpage))) { |
1035 | putback_active_hugepage(hpage); | 1043 | putback_active_hugepage(hpage); |
1036 | return -ENOSYS; | 1044 | return -ENOSYS; |
1037 | } | 1045 | } |
@@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1056 | if (!page_mapped(hpage)) | 1064 | if (!page_mapped(hpage)) |
1057 | rc = move_to_new_page(new_hpage, hpage, 1, mode); | 1065 | rc = move_to_new_page(new_hpage, hpage, 1, mode); |
1058 | 1066 | ||
1059 | if (rc) | 1067 | if (rc != MIGRATEPAGE_SUCCESS) |
1060 | remove_migration_ptes(hpage, hpage); | 1068 | remove_migration_ptes(hpage, hpage); |
1061 | 1069 | ||
1062 | if (anon_vma) | 1070 | if (anon_vma) |
1063 | put_anon_vma(anon_vma); | 1071 | put_anon_vma(anon_vma); |
1064 | 1072 | ||
1065 | if (!rc) | 1073 | if (rc == MIGRATEPAGE_SUCCESS) |
1066 | hugetlb_cgroup_migrate(hpage, new_hpage); | 1074 | hugetlb_cgroup_migrate(hpage, new_hpage); |
1067 | 1075 | ||
1068 | unlock_page(hpage); | 1076 | unlock_page(hpage); |
1069 | out: | 1077 | out: |
1070 | if (rc != -EAGAIN) | 1078 | if (rc != -EAGAIN) |
1071 | putback_active_hugepage(hpage); | 1079 | putback_active_hugepage(hpage); |
1072 | put_page(new_hpage); | 1080 | |
1081 | /* | ||
1082 | * If migration was not successful and there's a freeing callback, use | ||
1083 | * it. Otherwise, put_page() will drop the reference grabbed during | ||
1084 | * isolation. | ||
1085 | */ | ||
1086 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) | ||
1087 | put_new_page(new_hpage, private); | ||
1088 | else | ||
1089 | put_page(new_hpage); | ||
1090 | |||
1073 | if (result) { | 1091 | if (result) { |
1074 | if (rc) | 1092 | if (rc) |
1075 | *result = rc; | 1093 | *result = rc; |
@@ -1086,6 +1104,8 @@ out: | |||
1086 | * @from: The list of pages to be migrated. | 1104 | * @from: The list of pages to be migrated. |
1087 | * @get_new_page: The function used to allocate free pages to be used | 1105 | * @get_new_page: The function used to allocate free pages to be used |
1088 | * as the target of the page migration. | 1106 | * as the target of the page migration. |
1107 | * @put_new_page: The function used to free target pages if migration | ||
1108 | * fails, or NULL if no special handling is necessary. | ||
1089 | * @private: Private data to be passed on to get_new_page() | 1109 | * @private: Private data to be passed on to get_new_page() |
1090 | * @mode: The migration mode that specifies the constraints for | 1110 | * @mode: The migration mode that specifies the constraints for |
1091 | * page migration, if any. | 1111 | * page migration, if any. |
@@ -1099,7 +1119,8 @@ out: | |||
1099 | * Returns the number of pages that were not migrated, or an error code. | 1119 | * Returns the number of pages that were not migrated, or an error code. |
1100 | */ | 1120 | */ |
1101 | int migrate_pages(struct list_head *from, new_page_t get_new_page, | 1121 | int migrate_pages(struct list_head *from, new_page_t get_new_page, |
1102 | unsigned long private, enum migrate_mode mode, int reason) | 1122 | free_page_t put_new_page, unsigned long private, |
1123 | enum migrate_mode mode, int reason) | ||
1103 | { | 1124 | { |
1104 | int retry = 1; | 1125 | int retry = 1; |
1105 | int nr_failed = 0; | 1126 | int nr_failed = 0; |
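To illustrate the new callback pair: a caller now supplies both an allocator and, optionally, a matching free routine for target pages that end up unused. The stand-ins below are hypothetical, not taken from this series.

	/* Stand-in new_page_t / free_page_t pair for a migrate_pages() caller. */
	static struct page *demo_alloc_target(struct page *page,
					      unsigned long private, int **result)
	{
		return alloc_page(GFP_HIGHUSER_MOVABLE);
	}

	static void demo_free_target(struct page *page, unsigned long private)
	{
		__free_page(page);	/* target was never used; give it back */
	}

	/* err = migrate_pages(&pagelist, demo_alloc_target, demo_free_target,
	 *		       0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
	 */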
@@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
1121 | 1142 | ||
1122 | if (PageHuge(page)) | 1143 | if (PageHuge(page)) |
1123 | rc = unmap_and_move_huge_page(get_new_page, | 1144 | rc = unmap_and_move_huge_page(get_new_page, |
1124 | private, page, pass > 2, mode); | 1145 | put_new_page, private, page, |
1146 | pass > 2, mode); | ||
1125 | else | 1147 | else |
1126 | rc = unmap_and_move(get_new_page, private, | 1148 | rc = unmap_and_move(get_new_page, put_new_page, |
1127 | page, pass > 2, mode); | 1149 | private, page, pass > 2, mode); |
1128 | 1150 | ||
1129 | switch(rc) { | 1151 | switch(rc) { |
1130 | case -ENOMEM: | 1152 | case -ENOMEM: |
@@ -1273,7 +1295,7 @@ set_status: | |||
1273 | 1295 | ||
1274 | err = 0; | 1296 | err = 0; |
1275 | if (!list_empty(&pagelist)) { | 1297 | if (!list_empty(&pagelist)) { |
1276 | err = migrate_pages(&pagelist, new_page_node, | 1298 | err = migrate_pages(&pagelist, new_page_node, NULL, |
1277 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); | 1299 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); |
1278 | if (err) | 1300 | if (err) |
1279 | putback_movable_pages(&pagelist); | 1301 | putback_movable_pages(&pagelist); |
@@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, | |||
1729 | 1751 | ||
1730 | list_add(&page->lru, &migratepages); | 1752 | list_add(&page->lru, &migratepages); |
1731 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, | 1753 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, |
1732 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); | 1754 | NULL, node, MIGRATE_ASYNC, |
1755 | MR_NUMA_MISPLACED); | ||
1733 | if (nr_remaining) { | 1756 | if (nr_remaining) { |
1734 | if (!list_empty(&migratepages)) { | 1757 | if (!list_empty(&migratepages)) { |
1735 | list_del(&page->lru); | 1758 | list_del(&page->lru); |
@@ -1852,7 +1875,7 @@ fail_putback: | |||
1852 | * guarantee the copy is visible before the pagetable update. | 1875 | * guarantee the copy is visible before the pagetable update. |
1853 | */ | 1876 | */ |
1854 | flush_cache_range(vma, mmun_start, mmun_end); | 1877 | flush_cache_range(vma, mmun_start, mmun_end); |
1855 | page_add_new_anon_rmap(new_page, vma, mmun_start); | 1878 | page_add_anon_rmap(new_page, vma, mmun_start); |
1856 | pmdp_clear_flush(vma, mmun_start, pmd); | 1879 | pmdp_clear_flush(vma, mmun_start, pmd); |
1857 | set_pmd_at(mm, mmun_start, pmd, entry); | 1880 | set_pmd_at(mm, mmun_start, pmd, entry); |
1858 | flush_tlb_range(vma, mmun_start, mmun_end); | 1881 | flush_tlb_range(vma, mmun_start, mmun_end); |
@@ -1877,6 +1900,10 @@ fail_putback: | |||
1877 | spin_unlock(ptl); | 1900 | spin_unlock(ptl); |
1878 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1901 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
1879 | 1902 | ||
1903 | /* Take an "isolate" reference and put new page on the LRU. */ | ||
1904 | get_page(new_page); | ||
1905 | putback_lru_page(new_page); | ||
1906 | |||
1880 | unlock_page(new_page); | 1907 | unlock_page(new_page); |
1881 | unlock_page(page); | 1908 | unlock_page(page); |
1882 | put_page(page); /* Drop the rmap reference */ | 1909 | put_page(page); /* Drop the rmap reference */ |
@@ -640,11 +640,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
640 | { | 640 | { |
641 | struct address_space *mapping = NULL; | 641 | struct address_space *mapping = NULL; |
642 | 642 | ||
643 | if (vma->vm_file) | 643 | if (vma->vm_file) { |
644 | mapping = vma->vm_file->f_mapping; | 644 | mapping = vma->vm_file->f_mapping; |
645 | |||
646 | if (mapping) | ||
647 | mutex_lock(&mapping->i_mmap_mutex); | 645 | mutex_lock(&mapping->i_mmap_mutex); |
646 | } | ||
648 | 647 | ||
649 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 648 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
650 | __vma_link_file(vma); | 649 | __vma_link_file(vma); |
@@ -2965,9 +2964,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
2965 | struct vm_area_struct *vma = _install_special_mapping(mm, | 2964 | struct vm_area_struct *vma = _install_special_mapping(mm, |
2966 | addr, len, vm_flags, pages); | 2965 | addr, len, vm_flags, pages); |
2967 | 2966 | ||
2968 | if (IS_ERR(vma)) | 2967 | return PTR_ERR_OR_ZERO(vma); |
2969 | return PTR_ERR(vma); | ||
2970 | return 0; | ||
2971 | } | 2968 | } |
2972 | 2969 | ||
2973 | static DEFINE_MUTEX(mm_all_locks_mutex); | 2970 | static DEFINE_MUTEX(mm_all_locks_mutex); |
diff --git a/mm/msync.c b/mm/msync.c index 632df4527c01..a5c673669ca6 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -58,6 +58,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
58 | vma = find_vma(mm, start); | 58 | vma = find_vma(mm, start); |
59 | for (;;) { | 59 | for (;;) { |
60 | struct file *file; | 60 | struct file *file; |
61 | loff_t fstart, fend; | ||
61 | 62 | ||
62 | /* Still start < end. */ | 63 | /* Still start < end. */ |
63 | error = -ENOMEM; | 64 | error = -ENOMEM; |
@@ -77,12 +78,17 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
77 | goto out_unlock; | 78 | goto out_unlock; |
78 | } | 79 | } |
79 | file = vma->vm_file; | 80 | file = vma->vm_file; |
81 | fstart = start + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | ||
82 | fend = fstart + (min(end, vma->vm_end) - start) - 1; | ||
80 | start = vma->vm_end; | 83 | start = vma->vm_end; |
81 | if ((flags & MS_SYNC) && file && | 84 | if ((flags & MS_SYNC) && file && |
82 | (vma->vm_flags & VM_SHARED)) { | 85 | (vma->vm_flags & VM_SHARED)) { |
83 | get_file(file); | 86 | get_file(file); |
84 | up_read(&mm->mmap_sem); | 87 | up_read(&mm->mmap_sem); |
85 | error = vfs_fsync(file, 0); | 88 | if (vma->vm_flags & VM_NONLINEAR) |
89 | error = vfs_fsync(file, 1); | ||
90 | else | ||
91 | error = vfs_fsync_range(file, fstart, fend, 1); | ||
86 | fput(file); | 92 | fput(file); |
87 | if (error || start >= end) | 93 | if (error || start >= end) |
88 | goto out; | 94 | goto out; |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a4317da60532..533fa60c9ac1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -156,24 +156,6 @@ static unsigned long writeout_period_time = 0; | |||
156 | #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) | 156 | #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) |
157 | 157 | ||
158 | /* | 158 | /* |
159 | * Work out the current dirty-memory clamping and background writeout | ||
160 | * thresholds. | ||
161 | * | ||
162 | * The main aim here is to lower them aggressively if there is a lot of mapped | ||
163 | * memory around. To avoid stressing page reclaim with lots of unreclaimable | ||
164 | * pages. It is better to clamp down on writers than to start swapping, and | ||
165 | * performing lots of scanning. | ||
166 | * | ||
167 | * We only allow 1/2 of the currently-unmapped memory to be dirtied. | ||
168 | * | ||
169 | * We don't permit the clamping level to fall below 5% - that is getting rather | ||
170 | * excessive. | ||
171 | * | ||
172 | * We make sure that the background writeout level is below the adjusted | ||
173 | * clamping level. | ||
174 | */ | ||
175 | |||
176 | /* | ||
177 | * In a memory zone, there is a certain amount of pages we consider | 159 | * In a memory zone, there is a certain amount of pages we consider |
178 | * available for the page cache, which is essentially the number of | 160 | * available for the page cache, which is essentially the number of |
179 | * free and reclaimable pages, minus some zone reserves to protect | 161 | * free and reclaimable pages, minus some zone reserves to protect |
@@ -1623,7 +1605,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) | |||
1623 | * 1000+ tasks, all of them start dirtying pages at exactly the same | 1605 | * 1000+ tasks, all of them start dirtying pages at exactly the same |
1624 | * time, hence all honoured too large initial task->nr_dirtied_pause. | 1606 | * time, hence all honoured too large initial task->nr_dirtied_pause. |
1625 | */ | 1607 | */ |
1626 | p = &__get_cpu_var(bdp_ratelimits); | 1608 | p = this_cpu_ptr(&bdp_ratelimits); |
1627 | if (unlikely(current->nr_dirtied >= ratelimit)) | 1609 | if (unlikely(current->nr_dirtied >= ratelimit)) |
1628 | *p = 0; | 1610 | *p = 0; |
1629 | else if (unlikely(*p >= ratelimit_pages)) { | 1611 | else if (unlikely(*p >= ratelimit_pages)) { |
@@ -1635,7 +1617,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) | |||
1635 | * short-lived tasks (eg. gcc invocations in a kernel build) escaping | 1617 | * short-lived tasks (eg. gcc invocations in a kernel build) escaping |
1636 | * the dirty throttling and livelock other long-run dirtiers. | 1618 | * the dirty throttling and livelock other long-run dirtiers. |
1637 | */ | 1619 | */ |
1638 | p = &__get_cpu_var(dirty_throttle_leaks); | 1620 | p = this_cpu_ptr(&dirty_throttle_leaks); |
1639 | if (*p > 0 && current->nr_dirtied < ratelimit) { | 1621 | if (*p > 0 && current->nr_dirtied < ratelimit) { |
1640 | unsigned long nr_pages_dirtied; | 1622 | unsigned long nr_pages_dirtied; |
1641 | nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); | 1623 | nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dba2933c9c0..a59bdb653958 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -261,8 +261,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | |||
261 | } while (zone_span_seqretry(zone, seq)); | 261 | } while (zone_span_seqretry(zone, seq)); |
262 | 262 | ||
263 | if (ret) | 263 | if (ret) |
264 | pr_err("page %lu outside zone [ %lu - %lu ]\n", | 264 | pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", |
265 | pfn, start_pfn, start_pfn + sp); | 265 | pfn, zone_to_nid(zone), zone->name, |
266 | start_pfn, start_pfn + sp); | ||
266 | 267 | ||
267 | return ret; | 268 | return ret; |
268 | } | 269 | } |
@@ -408,7 +409,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
408 | return bad; | 409 | return bad; |
409 | } | 410 | } |
410 | 411 | ||
411 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 412 | static inline void prep_zero_page(struct page *page, unsigned int order, |
413 | gfp_t gfp_flags) | ||
412 | { | 414 | { |
413 | int i; | 415 | int i; |
414 | 416 | ||
@@ -452,7 +454,7 @@ static inline void set_page_guard_flag(struct page *page) { } | |||
452 | static inline void clear_page_guard_flag(struct page *page) { } | 454 | static inline void clear_page_guard_flag(struct page *page) { } |
453 | #endif | 455 | #endif |
454 | 456 | ||
455 | static inline void set_page_order(struct page *page, int order) | 457 | static inline void set_page_order(struct page *page, unsigned int order) |
456 | { | 458 | { |
457 | set_page_private(page, order); | 459 | set_page_private(page, order); |
458 | __SetPageBuddy(page); | 460 | __SetPageBuddy(page); |
@@ -503,21 +505,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) | |||
503 | * For recording page's order, we use page_private(page). | 505 | * For recording page's order, we use page_private(page). |
504 | */ | 506 | */ |
505 | static inline int page_is_buddy(struct page *page, struct page *buddy, | 507 | static inline int page_is_buddy(struct page *page, struct page *buddy, |
506 | int order) | 508 | unsigned int order) |
507 | { | 509 | { |
508 | if (!pfn_valid_within(page_to_pfn(buddy))) | 510 | if (!pfn_valid_within(page_to_pfn(buddy))) |
509 | return 0; | 511 | return 0; |
510 | 512 | ||
511 | if (page_zone_id(page) != page_zone_id(buddy)) | ||
512 | return 0; | ||
513 | |||
514 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 513 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
515 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | 514 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
515 | |||
516 | if (page_zone_id(page) != page_zone_id(buddy)) | ||
517 | return 0; | ||
518 | |||
516 | return 1; | 519 | return 1; |
517 | } | 520 | } |
518 | 521 | ||
519 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 522 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
520 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | 523 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
524 | |||
525 | /* | ||
526 | * zone check is done late to avoid uselessly | ||
527 | * calculating zone/node ids for pages that could | ||
528 | * never merge. | ||
529 | */ | ||
530 | if (page_zone_id(page) != page_zone_id(buddy)) | ||
531 | return 0; | ||
532 | |||
521 | return 1; | 533 | return 1; |
522 | } | 534 | } |
523 | return 0; | 535 | return 0; |
@@ -549,6 +561,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
549 | */ | 561 | */ |
550 | 562 | ||
551 | static inline void __free_one_page(struct page *page, | 563 | static inline void __free_one_page(struct page *page, |
564 | unsigned long pfn, | ||
552 | struct zone *zone, unsigned int order, | 565 | struct zone *zone, unsigned int order, |
553 | int migratetype) | 566 | int migratetype) |
554 | { | 567 | { |
@@ -565,7 +578,7 @@ static inline void __free_one_page(struct page *page, | |||
565 | 578 | ||
566 | VM_BUG_ON(migratetype == -1); | 579 | VM_BUG_ON(migratetype == -1); |
567 | 580 | ||
568 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 581 | page_idx = pfn & ((1 << MAX_ORDER) - 1); |
569 | 582 | ||
570 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); | 583 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); |
571 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 584 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
@@ -700,7 +713,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
700 | list_del(&page->lru); | 713 | list_del(&page->lru); |
701 | mt = get_freepage_migratetype(page); | 714 | mt = get_freepage_migratetype(page); |
702 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 715 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
703 | __free_one_page(page, zone, 0, mt); | 716 | __free_one_page(page, page_to_pfn(page), zone, 0, mt); |
704 | trace_mm_page_pcpu_drain(page, 0, mt); | 717 | trace_mm_page_pcpu_drain(page, 0, mt); |
705 | if (likely(!is_migrate_isolate_page(page))) { | 718 | if (likely(!is_migrate_isolate_page(page))) { |
706 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); | 719 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); |
@@ -712,13 +725,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
712 | spin_unlock(&zone->lock); | 725 | spin_unlock(&zone->lock); |
713 | } | 726 | } |
714 | 727 | ||
715 | static void free_one_page(struct zone *zone, struct page *page, int order, | 728 | static void free_one_page(struct zone *zone, |
729 | struct page *page, unsigned long pfn, | ||
730 | unsigned int order, | ||
716 | int migratetype) | 731 | int migratetype) |
717 | { | 732 | { |
718 | spin_lock(&zone->lock); | 733 | spin_lock(&zone->lock); |
719 | zone->pages_scanned = 0; | 734 | zone->pages_scanned = 0; |
720 | 735 | ||
721 | __free_one_page(page, zone, order, migratetype); | 736 | __free_one_page(page, pfn, zone, order, migratetype); |
722 | if (unlikely(!is_migrate_isolate(migratetype))) | 737 | if (unlikely(!is_migrate_isolate(migratetype))) |
723 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | 738 | __mod_zone_freepage_state(zone, 1 << order, migratetype); |
724 | spin_unlock(&zone->lock); | 739 | spin_unlock(&zone->lock); |
@@ -755,15 +770,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
755 | { | 770 | { |
756 | unsigned long flags; | 771 | unsigned long flags; |
757 | int migratetype; | 772 | int migratetype; |
773 | unsigned long pfn = page_to_pfn(page); | ||
758 | 774 | ||
759 | if (!free_pages_prepare(page, order)) | 775 | if (!free_pages_prepare(page, order)) |
760 | return; | 776 | return; |
761 | 777 | ||
778 | migratetype = get_pfnblock_migratetype(page, pfn); | ||
762 | local_irq_save(flags); | 779 | local_irq_save(flags); |
763 | __count_vm_events(PGFREE, 1 << order); | 780 | __count_vm_events(PGFREE, 1 << order); |
764 | migratetype = get_pageblock_migratetype(page); | ||
765 | set_freepage_migratetype(page, migratetype); | 781 | set_freepage_migratetype(page, migratetype); |
766 | free_one_page(page_zone(page), page, order, migratetype); | 782 | free_one_page(page_zone(page), page, pfn, order, migratetype); |
767 | local_irq_restore(flags); | 783 | local_irq_restore(flags); |
768 | } | 784 | } |
769 | 785 | ||
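The hunks above thread a pre-computed page frame number through the free path: __free_pages_ok() resolves the pfn once, derives the migratetype with the new pfn-based helper, and hands the pfn down through free_one_page() to __free_one_page(), so page_to_pfn() is no longer re-evaluated at every level. A minimal sketch of the resulting calling convention, written as if it lived inside mm/page_alloc.c (IRQ handling, vm event counting and free_pages_prepare() omitted; example_free_path() is a made-up name, not part of the patch):

static void example_free_path(struct page *page, unsigned int order)
{
	unsigned long pfn = page_to_pfn(page);	/* resolved exactly once */
	int migratetype = get_pfnblock_migratetype(page, pfn);

	set_freepage_migratetype(page, migratetype);
	/* every callee below reuses the cached pfn instead of re-deriving it */
	free_one_page(page_zone(page), page, pfn, order, migratetype);
}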
@@ -882,7 +898,7 @@ static inline int check_new_page(struct page *page) | |||
882 | return 0; | 898 | return 0; |
883 | } | 899 | } |
884 | 900 | ||
885 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | 901 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) |
886 | { | 902 | { |
887 | int i; | 903 | int i; |
888 | 904 | ||
@@ -931,6 +947,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
931 | rmv_page_order(page); | 947 | rmv_page_order(page); |
932 | area->nr_free--; | 948 | area->nr_free--; |
933 | expand(zone, page, order, current_order, area, migratetype); | 949 | expand(zone, page, order, current_order, area, migratetype); |
950 | set_freepage_migratetype(page, migratetype); | ||
934 | return page; | 951 | return page; |
935 | } | 952 | } |
936 | 953 | ||
@@ -1057,7 +1074,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1057 | 1074 | ||
1058 | /* | 1075 | /* |
1059 | * When borrowing from MIGRATE_CMA, we need to release the excess | 1076 | * When borrowing from MIGRATE_CMA, we need to release the excess |
1060 | * buddy pages to CMA itself. | 1077 | * buddy pages to CMA itself. We also ensure the freepage_migratetype |
1078 | * is set to CMA so it is returned to the correct freelist in case | ||
1079 | * the page ends up being not actually allocated from the pcp lists. | ||
1061 | */ | 1080 | */ |
1062 | if (is_migrate_cma(fallback_type)) | 1081 | if (is_migrate_cma(fallback_type)) |
1063 | return fallback_type; | 1082 | return fallback_type; |
@@ -1090,16 +1109,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1090 | 1109 | ||
1091 | /* Remove an element from the buddy allocator from the fallback list */ | 1110 | /* Remove an element from the buddy allocator from the fallback list */ |
1092 | static inline struct page * | 1111 | static inline struct page * |
1093 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | 1112 | __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) |
1094 | { | 1113 | { |
1095 | struct free_area *area; | 1114 | struct free_area *area; |
1096 | int current_order; | 1115 | unsigned int current_order; |
1097 | struct page *page; | 1116 | struct page *page; |
1098 | int migratetype, new_type, i; | 1117 | int migratetype, new_type, i; |
1099 | 1118 | ||
1100 | /* Find the largest possible block of pages in the other list */ | 1119 | /* Find the largest possible block of pages in the other list */ |
1101 | for (current_order = MAX_ORDER-1; current_order >= order; | 1120 | for (current_order = MAX_ORDER-1; |
1102 | --current_order) { | 1121 | current_order >= order && current_order <= MAX_ORDER-1; |
1122 | --current_order) { | ||
1103 | for (i = 0;; i++) { | 1123 | for (i = 0;; i++) { |
1104 | migratetype = fallbacks[start_migratetype][i]; | 1124 | migratetype = fallbacks[start_migratetype][i]; |
1105 | 1125 | ||
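The reworked loop bound above exists because current_order is now unsigned: decrementing an unsigned int past zero wraps around to UINT_MAX instead of going negative, so the old "current_order >= order" test alone could never terminate once order is 0. A standalone illustration of the wraparound (not kernel code; MAX_ORDER is hard-coded to the common default of 11 just for the example):

#include <limits.h>
#include <stdio.h>

#define MAX_ORDER 11	/* illustrative value, matching the usual kernel default */

int main(void)
{
	unsigned int order = 0;
	unsigned int current_order;

	/* Without the extra "current_order <= MAX_ORDER-1" bound, order == 0
	 * would loop forever: when current_order reaches 0, --current_order
	 * wraps to UINT_MAX, which still satisfies the >= order test. */
	for (current_order = MAX_ORDER - 1;
	     current_order >= order && current_order <= MAX_ORDER - 1;
	     --current_order)
		printf("trying order %u\n", current_order);

	printf("0u - 1 wraps to %u (UINT_MAX = %u)\n", 0u - 1, UINT_MAX);
	return 0;
}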
@@ -1125,6 +1145,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
1125 | 1145 | ||
1126 | expand(zone, page, order, current_order, area, | 1146 | expand(zone, page, order, current_order, area, |
1127 | new_type); | 1147 | new_type); |
1148 | /* The freepage_migratetype may differ from pageblock's | ||
1149 | * migratetype depending on the decisions in | ||
1150 | * try_to_steal_freepages. This is OK as long as it does | ||
1151 | * not differ for MIGRATE_CMA type. | ||
1152 | */ | ||
1153 | set_freepage_migratetype(page, new_type); | ||
1128 | 1154 | ||
1129 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1155 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1130 | start_migratetype, migratetype, new_type); | 1156 | start_migratetype, migratetype, new_type); |
@@ -1173,9 +1199,9 @@ retry_reserve: | |||
1173 | */ | 1199 | */ |
1174 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 1200 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
1175 | unsigned long count, struct list_head *list, | 1201 | unsigned long count, struct list_head *list, |
1176 | int migratetype, int cold) | 1202 | int migratetype, bool cold) |
1177 | { | 1203 | { |
1178 | int mt = migratetype, i; | 1204 | int i; |
1179 | 1205 | ||
1180 | spin_lock(&zone->lock); | 1206 | spin_lock(&zone->lock); |
1181 | for (i = 0; i < count; ++i) { | 1207 | for (i = 0; i < count; ++i) { |
@@ -1192,18 +1218,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1192 | * merge IO requests if the physical pages are ordered | 1218 | * merge IO requests if the physical pages are ordered |
1193 | * properly. | 1219 | * properly. |
1194 | */ | 1220 | */ |
1195 | if (likely(cold == 0)) | 1221 | if (likely(!cold)) |
1196 | list_add(&page->lru, list); | 1222 | list_add(&page->lru, list); |
1197 | else | 1223 | else |
1198 | list_add_tail(&page->lru, list); | 1224 | list_add_tail(&page->lru, list); |
1199 | if (IS_ENABLED(CONFIG_CMA)) { | ||
1200 | mt = get_pageblock_migratetype(page); | ||
1201 | if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) | ||
1202 | mt = migratetype; | ||
1203 | } | ||
1204 | set_freepage_migratetype(page, mt); | ||
1205 | list = &page->lru; | 1225 | list = &page->lru; |
1206 | if (is_migrate_cma(mt)) | 1226 | if (is_migrate_cma(get_freepage_migratetype(page))) |
1207 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, | 1227 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, |
1208 | -(1 << order)); | 1228 | -(1 << order)); |
1209 | } | 1229 | } |
@@ -1327,7 +1347,7 @@ void mark_free_pages(struct zone *zone) | |||
1327 | { | 1347 | { |
1328 | unsigned long pfn, max_zone_pfn; | 1348 | unsigned long pfn, max_zone_pfn; |
1329 | unsigned long flags; | 1349 | unsigned long flags; |
1330 | int order, t; | 1350 | unsigned int order, t; |
1331 | struct list_head *curr; | 1351 | struct list_head *curr; |
1332 | 1352 | ||
1333 | if (zone_is_empty(zone)) | 1353 | if (zone_is_empty(zone)) |
@@ -1359,19 +1379,20 @@ void mark_free_pages(struct zone *zone) | |||
1359 | 1379 | ||
1360 | /* | 1380 | /* |
1361 | * Free a 0-order page | 1381 | * Free a 0-order page |
1362 | * cold == 1 ? free a cold page : free a hot page | 1382 | * cold == true ? free a cold page : free a hot page |
1363 | */ | 1383 | */ |
1364 | void free_hot_cold_page(struct page *page, int cold) | 1384 | void free_hot_cold_page(struct page *page, bool cold) |
1365 | { | 1385 | { |
1366 | struct zone *zone = page_zone(page); | 1386 | struct zone *zone = page_zone(page); |
1367 | struct per_cpu_pages *pcp; | 1387 | struct per_cpu_pages *pcp; |
1368 | unsigned long flags; | 1388 | unsigned long flags; |
1389 | unsigned long pfn = page_to_pfn(page); | ||
1369 | int migratetype; | 1390 | int migratetype; |
1370 | 1391 | ||
1371 | if (!free_pages_prepare(page, 0)) | 1392 | if (!free_pages_prepare(page, 0)) |
1372 | return; | 1393 | return; |
1373 | 1394 | ||
1374 | migratetype = get_pageblock_migratetype(page); | 1395 | migratetype = get_pfnblock_migratetype(page, pfn); |
1375 | set_freepage_migratetype(page, migratetype); | 1396 | set_freepage_migratetype(page, migratetype); |
1376 | local_irq_save(flags); | 1397 | local_irq_save(flags); |
1377 | __count_vm_event(PGFREE); | 1398 | __count_vm_event(PGFREE); |
@@ -1385,17 +1406,17 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1385 | */ | 1406 | */ |
1386 | if (migratetype >= MIGRATE_PCPTYPES) { | 1407 | if (migratetype >= MIGRATE_PCPTYPES) { |
1387 | if (unlikely(is_migrate_isolate(migratetype))) { | 1408 | if (unlikely(is_migrate_isolate(migratetype))) { |
1388 | free_one_page(zone, page, 0, migratetype); | 1409 | free_one_page(zone, page, pfn, 0, migratetype); |
1389 | goto out; | 1410 | goto out; |
1390 | } | 1411 | } |
1391 | migratetype = MIGRATE_MOVABLE; | 1412 | migratetype = MIGRATE_MOVABLE; |
1392 | } | 1413 | } |
1393 | 1414 | ||
1394 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 1415 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
1395 | if (cold) | 1416 | if (!cold) |
1396 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | ||
1397 | else | ||
1398 | list_add(&page->lru, &pcp->lists[migratetype]); | 1417 | list_add(&page->lru, &pcp->lists[migratetype]); |
1418 | else | ||
1419 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | ||
1399 | pcp->count++; | 1420 | pcp->count++; |
1400 | if (pcp->count >= pcp->high) { | 1421 | if (pcp->count >= pcp->high) { |
1401 | unsigned long batch = ACCESS_ONCE(pcp->batch); | 1422 | unsigned long batch = ACCESS_ONCE(pcp->batch); |
@@ -1410,7 +1431,7 @@ out: | |||
1410 | /* | 1431 | /* |
1411 | * Free a list of 0-order pages | 1432 | * Free a list of 0-order pages |
1412 | */ | 1433 | */ |
1413 | void free_hot_cold_page_list(struct list_head *list, int cold) | 1434 | void free_hot_cold_page_list(struct list_head *list, bool cold) |
1414 | { | 1435 | { |
1415 | struct page *page, *next; | 1436 | struct page *page, *next; |
1416 | 1437 | ||
@@ -1522,12 +1543,12 @@ int split_free_page(struct page *page) | |||
1522 | */ | 1543 | */ |
1523 | static inline | 1544 | static inline |
1524 | struct page *buffered_rmqueue(struct zone *preferred_zone, | 1545 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
1525 | struct zone *zone, int order, gfp_t gfp_flags, | 1546 | struct zone *zone, unsigned int order, |
1526 | int migratetype) | 1547 | gfp_t gfp_flags, int migratetype) |
1527 | { | 1548 | { |
1528 | unsigned long flags; | 1549 | unsigned long flags; |
1529 | struct page *page; | 1550 | struct page *page; |
1530 | int cold = !!(gfp_flags & __GFP_COLD); | 1551 | bool cold = ((gfp_flags & __GFP_COLD) != 0); |
1531 | 1552 | ||
1532 | again: | 1553 | again: |
1533 | if (likely(order == 0)) { | 1554 | if (likely(order == 0)) { |
@@ -1572,7 +1593,7 @@ again: | |||
1572 | if (!page) | 1593 | if (!page) |
1573 | goto failed; | 1594 | goto failed; |
1574 | __mod_zone_freepage_state(zone, -(1 << order), | 1595 | __mod_zone_freepage_state(zone, -(1 << order), |
1575 | get_pageblock_migratetype(page)); | 1596 | get_freepage_migratetype(page)); |
1576 | } | 1597 | } |
1577 | 1598 | ||
1578 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | 1599 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); |
@@ -1672,8 +1693,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | |||
1672 | * Return true if free pages are above 'mark'. This takes into account the order | 1693 | * Return true if free pages are above 'mark'. This takes into account the order |
1673 | * of the allocation. | 1694 | * of the allocation. |
1674 | */ | 1695 | */ |
1675 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1696 | static bool __zone_watermark_ok(struct zone *z, unsigned int order, |
1676 | int classzone_idx, int alloc_flags, long free_pages) | 1697 | unsigned long mark, int classzone_idx, int alloc_flags, |
1698 | long free_pages) | ||
1677 | { | 1699 | { |
1678 | /* free_pages my go negative - that's OK */ | 1700 | /* free_pages my go negative - that's OK */ |
1679 | long min = mark; | 1701 | long min = mark; |
@@ -1707,15 +1729,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1707 | return true; | 1729 | return true; |
1708 | } | 1730 | } |
1709 | 1731 | ||
1710 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1732 | bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
1711 | int classzone_idx, int alloc_flags) | 1733 | int classzone_idx, int alloc_flags) |
1712 | { | 1734 | { |
1713 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1735 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1714 | zone_page_state(z, NR_FREE_PAGES)); | 1736 | zone_page_state(z, NR_FREE_PAGES)); |
1715 | } | 1737 | } |
1716 | 1738 | ||
1717 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | 1739 | bool zone_watermark_ok_safe(struct zone *z, unsigned int order, |
1718 | int classzone_idx, int alloc_flags) | 1740 | unsigned long mark, int classzone_idx, int alloc_flags) |
1719 | { | 1741 | { |
1720 | long free_pages = zone_page_state(z, NR_FREE_PAGES); | 1742 | long free_pages = zone_page_state(z, NR_FREE_PAGES); |
1721 | 1743 | ||
@@ -1850,18 +1872,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone) | |||
1850 | 1872 | ||
1851 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | 1873 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
1852 | { | 1874 | { |
1853 | return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); | 1875 | return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < |
1854 | } | 1876 | RECLAIM_DISTANCE; |
1855 | |||
1856 | static void __paginginit init_zone_allows_reclaim(int nid) | ||
1857 | { | ||
1858 | int i; | ||
1859 | |||
1860 | for_each_node_state(i, N_MEMORY) | ||
1861 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) | ||
1862 | node_set(i, NODE_DATA(nid)->reclaim_nodes); | ||
1863 | else | ||
1864 | zone_reclaim_mode = 1; | ||
1865 | } | 1877 | } |
1866 | 1878 | ||
1867 | #else /* CONFIG_NUMA */ | 1879 | #else /* CONFIG_NUMA */ |
@@ -1895,9 +1907,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | |||
1895 | return true; | 1907 | return true; |
1896 | } | 1908 | } |
1897 | 1909 | ||
1898 | static inline void init_zone_allows_reclaim(int nid) | ||
1899 | { | ||
1900 | } | ||
1901 | #endif /* CONFIG_NUMA */ | 1910 | #endif /* CONFIG_NUMA */ |
1902 | 1911 | ||
1903 | /* | 1912 | /* |
@@ -1907,17 +1916,17 @@ static inline void init_zone_allows_reclaim(int nid) | |||
1907 | static struct page * | 1916 | static struct page * |
1908 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 1917 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
1909 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, | 1918 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, |
1910 | struct zone *preferred_zone, int migratetype) | 1919 | struct zone *preferred_zone, int classzone_idx, int migratetype) |
1911 | { | 1920 | { |
1912 | struct zoneref *z; | 1921 | struct zoneref *z; |
1913 | struct page *page = NULL; | 1922 | struct page *page = NULL; |
1914 | int classzone_idx; | ||
1915 | struct zone *zone; | 1923 | struct zone *zone; |
1916 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1924 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1917 | int zlc_active = 0; /* set if using zonelist_cache */ | 1925 | int zlc_active = 0; /* set if using zonelist_cache */ |
1918 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1926 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1927 | bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && | ||
1928 | (gfp_mask & __GFP_WRITE); | ||
1919 | 1929 | ||
1920 | classzone_idx = zone_idx(preferred_zone); | ||
1921 | zonelist_scan: | 1930 | zonelist_scan: |
1922 | /* | 1931 | /* |
1923 | * Scan zonelist, looking for a zone with enough free. | 1932 | * Scan zonelist, looking for a zone with enough free. |
@@ -1930,12 +1939,10 @@ zonelist_scan: | |||
1930 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && | 1939 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
1931 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1940 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1932 | continue; | 1941 | continue; |
1933 | if ((alloc_flags & ALLOC_CPUSET) && | 1942 | if (cpusets_enabled() && |
1943 | (alloc_flags & ALLOC_CPUSET) && | ||
1934 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1944 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1935 | continue; | 1945 | continue; |
1936 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
1937 | if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) | ||
1938 | goto try_this_zone; | ||
1939 | /* | 1946 | /* |
1940 | * Distribute pages in proportion to the individual | 1947 | * Distribute pages in proportion to the individual |
1941 | * zone size to ensure fair page aging. The zone a | 1948 | * zone size to ensure fair page aging. The zone a |
@@ -1974,15 +1981,19 @@ zonelist_scan: | |||
1974 | * will require awareness of zones in the | 1981 | * will require awareness of zones in the |
1975 | * dirty-throttling and the flusher threads. | 1982 | * dirty-throttling and the flusher threads. |
1976 | */ | 1983 | */ |
1977 | if ((alloc_flags & ALLOC_WMARK_LOW) && | 1984 | if (consider_zone_dirty && !zone_dirty_ok(zone)) |
1978 | (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) | 1985 | continue; |
1979 | goto this_zone_full; | ||
1980 | 1986 | ||
1981 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 1987 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
1982 | if (!zone_watermark_ok(zone, order, mark, | 1988 | if (!zone_watermark_ok(zone, order, mark, |
1983 | classzone_idx, alloc_flags)) { | 1989 | classzone_idx, alloc_flags)) { |
1984 | int ret; | 1990 | int ret; |
1985 | 1991 | ||
1992 | /* Checked here to keep the fast path fast */ | ||
1993 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
1994 | if (alloc_flags & ALLOC_NO_WATERMARKS) | ||
1995 | goto try_this_zone; | ||
1996 | |||
1986 | if (IS_ENABLED(CONFIG_NUMA) && | 1997 | if (IS_ENABLED(CONFIG_NUMA) && |
1987 | !did_zlc_setup && nr_online_nodes > 1) { | 1998 | !did_zlc_setup && nr_online_nodes > 1) { |
1988 | /* | 1999 | /* |
@@ -2044,7 +2055,7 @@ try_this_zone: | |||
2044 | if (page) | 2055 | if (page) |
2045 | break; | 2056 | break; |
2046 | this_zone_full: | 2057 | this_zone_full: |
2047 | if (IS_ENABLED(CONFIG_NUMA)) | 2058 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) |
2048 | zlc_mark_zone_full(zonelist, z); | 2059 | zlc_mark_zone_full(zonelist, z); |
2049 | } | 2060 | } |
2050 | 2061 | ||
@@ -2173,7 +2184,7 @@ static inline struct page * | |||
2173 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2184 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
2174 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2185 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2175 | nodemask_t *nodemask, struct zone *preferred_zone, | 2186 | nodemask_t *nodemask, struct zone *preferred_zone, |
2176 | int migratetype) | 2187 | int classzone_idx, int migratetype) |
2177 | { | 2188 | { |
2178 | struct page *page; | 2189 | struct page *page; |
2179 | 2190 | ||
@@ -2191,7 +2202,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2191 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 2202 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, |
2192 | order, zonelist, high_zoneidx, | 2203 | order, zonelist, high_zoneidx, |
2193 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | 2204 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, |
2194 | preferred_zone, migratetype); | 2205 | preferred_zone, classzone_idx, migratetype); |
2195 | if (page) | 2206 | if (page) |
2196 | goto out; | 2207 | goto out; |
2197 | 2208 | ||
@@ -2226,7 +2237,7 @@ static struct page * | |||
2226 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2237 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2227 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2238 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2228 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2239 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2229 | int migratetype, bool sync_migration, | 2240 | int classzone_idx, int migratetype, enum migrate_mode mode, |
2230 | bool *contended_compaction, bool *deferred_compaction, | 2241 | bool *contended_compaction, bool *deferred_compaction, |
2231 | unsigned long *did_some_progress) | 2242 | unsigned long *did_some_progress) |
2232 | { | 2243 | { |
@@ -2240,7 +2251,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2240 | 2251 | ||
2241 | current->flags |= PF_MEMALLOC; | 2252 | current->flags |= PF_MEMALLOC; |
2242 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2253 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2243 | nodemask, sync_migration, | 2254 | nodemask, mode, |
2244 | contended_compaction); | 2255 | contended_compaction); |
2245 | current->flags &= ~PF_MEMALLOC; | 2256 | current->flags &= ~PF_MEMALLOC; |
2246 | 2257 | ||
@@ -2254,7 +2265,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2254 | page = get_page_from_freelist(gfp_mask, nodemask, | 2265 | page = get_page_from_freelist(gfp_mask, nodemask, |
2255 | order, zonelist, high_zoneidx, | 2266 | order, zonelist, high_zoneidx, |
2256 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2267 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2257 | preferred_zone, migratetype); | 2268 | preferred_zone, classzone_idx, migratetype); |
2258 | if (page) { | 2269 | if (page) { |
2259 | preferred_zone->compact_blockskip_flush = false; | 2270 | preferred_zone->compact_blockskip_flush = false; |
2260 | compaction_defer_reset(preferred_zone, order, true); | 2271 | compaction_defer_reset(preferred_zone, order, true); |
@@ -2273,7 +2284,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2273 | * As async compaction considers a subset of pageblocks, only | 2284 | * As async compaction considers a subset of pageblocks, only |
2274 | * defer if the failure was a sync compaction failure. | 2285 | * defer if the failure was a sync compaction failure. |
2275 | */ | 2286 | */ |
2276 | if (sync_migration) | 2287 | if (mode != MIGRATE_ASYNC) |
2277 | defer_compaction(preferred_zone, order); | 2288 | defer_compaction(preferred_zone, order); |
2278 | 2289 | ||
2279 | cond_resched(); | 2290 | cond_resched(); |
@@ -2286,9 +2297,9 @@ static inline struct page * | |||
2286 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2297 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2287 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2298 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2288 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2299 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2289 | int migratetype, bool sync_migration, | 2300 | int classzone_idx, int migratetype, |
2290 | bool *contended_compaction, bool *deferred_compaction, | 2301 | enum migrate_mode mode, bool *contended_compaction, |
2291 | unsigned long *did_some_progress) | 2302 | bool *deferred_compaction, unsigned long *did_some_progress) |
2292 | { | 2303 | { |
2293 | return NULL; | 2304 | return NULL; |
2294 | } | 2305 | } |
@@ -2327,7 +2338,7 @@ static inline struct page * | |||
2327 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2338 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
2328 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2339 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2329 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2340 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2330 | int migratetype, unsigned long *did_some_progress) | 2341 | int classzone_idx, int migratetype, unsigned long *did_some_progress) |
2331 | { | 2342 | { |
2332 | struct page *page = NULL; | 2343 | struct page *page = NULL; |
2333 | bool drained = false; | 2344 | bool drained = false; |
@@ -2345,7 +2356,8 @@ retry: | |||
2345 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2356 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2346 | zonelist, high_zoneidx, | 2357 | zonelist, high_zoneidx, |
2347 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2358 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2348 | preferred_zone, migratetype); | 2359 | preferred_zone, classzone_idx, |
2360 | migratetype); | ||
2349 | 2361 | ||
2350 | /* | 2362 | /* |
2351 | * If an allocation failed after direct reclaim, it could be because | 2363 | * If an allocation failed after direct reclaim, it could be because |
@@ -2368,14 +2380,14 @@ static inline struct page * | |||
2368 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | 2380 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
2369 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2381 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2370 | nodemask_t *nodemask, struct zone *preferred_zone, | 2382 | nodemask_t *nodemask, struct zone *preferred_zone, |
2371 | int migratetype) | 2383 | int classzone_idx, int migratetype) |
2372 | { | 2384 | { |
2373 | struct page *page; | 2385 | struct page *page; |
2374 | 2386 | ||
2375 | do { | 2387 | do { |
2376 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2388 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2377 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | 2389 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, |
2378 | preferred_zone, migratetype); | 2390 | preferred_zone, classzone_idx, migratetype); |
2379 | 2391 | ||
2380 | if (!page && gfp_mask & __GFP_NOFAIL) | 2392 | if (!page && gfp_mask & __GFP_NOFAIL) |
2381 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2393 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
@@ -2476,14 +2488,14 @@ static inline struct page * | |||
2476 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2488 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2477 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2489 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2478 | nodemask_t *nodemask, struct zone *preferred_zone, | 2490 | nodemask_t *nodemask, struct zone *preferred_zone, |
2479 | int migratetype) | 2491 | int classzone_idx, int migratetype) |
2480 | { | 2492 | { |
2481 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2493 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
2482 | struct page *page = NULL; | 2494 | struct page *page = NULL; |
2483 | int alloc_flags; | 2495 | int alloc_flags; |
2484 | unsigned long pages_reclaimed = 0; | 2496 | unsigned long pages_reclaimed = 0; |
2485 | unsigned long did_some_progress; | 2497 | unsigned long did_some_progress; |
2486 | bool sync_migration = false; | 2498 | enum migrate_mode migration_mode = MIGRATE_ASYNC; |
2487 | bool deferred_compaction = false; | 2499 | bool deferred_compaction = false; |
2488 | bool contended_compaction = false; | 2500 | bool contended_compaction = false; |
2489 | 2501 | ||
@@ -2525,15 +2537,18 @@ restart: | |||
2525 | * Find the true preferred zone if the allocation is unconstrained by | 2537 | * Find the true preferred zone if the allocation is unconstrained by |
2526 | * cpusets. | 2538 | * cpusets. |
2527 | */ | 2539 | */ |
2528 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) | 2540 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { |
2529 | first_zones_zonelist(zonelist, high_zoneidx, NULL, | 2541 | struct zoneref *preferred_zoneref; |
2530 | &preferred_zone); | 2542 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, |
2543 | NULL, &preferred_zone); | ||
2544 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | ||
2545 | } | ||
2531 | 2546 | ||
2532 | rebalance: | 2547 | rebalance: |
2533 | /* This is the last chance, in general, before the goto nopage. */ | 2548 | /* This is the last chance, in general, before the goto nopage. */ |
2534 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2549 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
2535 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2550 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
2536 | preferred_zone, migratetype); | 2551 | preferred_zone, classzone_idx, migratetype); |
2537 | if (page) | 2552 | if (page) |
2538 | goto got_pg; | 2553 | goto got_pg; |
2539 | 2554 | ||
@@ -2548,7 +2563,7 @@ rebalance: | |||
2548 | 2563 | ||
2549 | page = __alloc_pages_high_priority(gfp_mask, order, | 2564 | page = __alloc_pages_high_priority(gfp_mask, order, |
2550 | zonelist, high_zoneidx, nodemask, | 2565 | zonelist, high_zoneidx, nodemask, |
2551 | preferred_zone, migratetype); | 2566 | preferred_zone, classzone_idx, migratetype); |
2552 | if (page) { | 2567 | if (page) { |
2553 | goto got_pg; | 2568 | goto got_pg; |
2554 | } | 2569 | } |
@@ -2577,17 +2592,23 @@ rebalance: | |||
2577 | * Try direct compaction. The first pass is asynchronous. Subsequent | 2592 | * Try direct compaction. The first pass is asynchronous. Subsequent |
2578 | * attempts after direct reclaim are synchronous | 2593 | * attempts after direct reclaim are synchronous |
2579 | */ | 2594 | */ |
2580 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2595 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, |
2581 | zonelist, high_zoneidx, | 2596 | high_zoneidx, nodemask, alloc_flags, |
2582 | nodemask, | 2597 | preferred_zone, |
2583 | alloc_flags, preferred_zone, | 2598 | classzone_idx, migratetype, |
2584 | migratetype, sync_migration, | 2599 | migration_mode, &contended_compaction, |
2585 | &contended_compaction, | ||
2586 | &deferred_compaction, | 2600 | &deferred_compaction, |
2587 | &did_some_progress); | 2601 | &did_some_progress); |
2588 | if (page) | 2602 | if (page) |
2589 | goto got_pg; | 2603 | goto got_pg; |
2590 | sync_migration = true; | 2604 | |
2605 | /* | ||
2606 | * It can become very expensive to allocate transparent hugepages at | ||
2607 | * fault, so use asynchronous memory compaction for THP unless it is | ||
2608 | * khugepaged trying to collapse. | ||
2609 | */ | ||
2610 | if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) | ||
2611 | migration_mode = MIGRATE_SYNC_LIGHT; | ||
2591 | 2612 | ||
2592 | /* | 2613 | /* |
2593 | * If compaction is deferred for high-order allocations, it is because | 2614 | * If compaction is deferred for high-order allocations, it is because |
@@ -2604,7 +2625,8 @@ rebalance: | |||
2604 | zonelist, high_zoneidx, | 2625 | zonelist, high_zoneidx, |
2605 | nodemask, | 2626 | nodemask, |
2606 | alloc_flags, preferred_zone, | 2627 | alloc_flags, preferred_zone, |
2607 | migratetype, &did_some_progress); | 2628 | classzone_idx, migratetype, |
2629 | &did_some_progress); | ||
2608 | if (page) | 2630 | if (page) |
2609 | goto got_pg; | 2631 | goto got_pg; |
2610 | 2632 | ||
@@ -2623,7 +2645,7 @@ rebalance: | |||
2623 | page = __alloc_pages_may_oom(gfp_mask, order, | 2645 | page = __alloc_pages_may_oom(gfp_mask, order, |
2624 | zonelist, high_zoneidx, | 2646 | zonelist, high_zoneidx, |
2625 | nodemask, preferred_zone, | 2647 | nodemask, preferred_zone, |
2626 | migratetype); | 2648 | classzone_idx, migratetype); |
2627 | if (page) | 2649 | if (page) |
2628 | goto got_pg; | 2650 | goto got_pg; |
2629 | 2651 | ||
@@ -2662,12 +2684,11 @@ rebalance: | |||
2662 | * direct reclaim and reclaim/compaction depends on compaction | 2684 | * direct reclaim and reclaim/compaction depends on compaction |
2663 | * being called after reclaim so call directly if necessary | 2685 | * being called after reclaim so call directly if necessary |
2664 | */ | 2686 | */ |
2665 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2687 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, |
2666 | zonelist, high_zoneidx, | 2688 | high_zoneidx, nodemask, alloc_flags, |
2667 | nodemask, | 2689 | preferred_zone, |
2668 | alloc_flags, preferred_zone, | 2690 | classzone_idx, migratetype, |
2669 | migratetype, sync_migration, | 2691 | migration_mode, &contended_compaction, |
2670 | &contended_compaction, | ||
2671 | &deferred_compaction, | 2692 | &deferred_compaction, |
2672 | &did_some_progress); | 2693 | &did_some_progress); |
2673 | if (page) | 2694 | if (page) |
@@ -2693,11 +2714,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2693 | { | 2714 | { |
2694 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 2715 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
2695 | struct zone *preferred_zone; | 2716 | struct zone *preferred_zone; |
2717 | struct zoneref *preferred_zoneref; | ||
2696 | struct page *page = NULL; | 2718 | struct page *page = NULL; |
2697 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2719 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2698 | unsigned int cpuset_mems_cookie; | 2720 | unsigned int cpuset_mems_cookie; |
2699 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; | 2721 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
2700 | struct mem_cgroup *memcg = NULL; | 2722 | int classzone_idx; |
2701 | 2723 | ||
2702 | gfp_mask &= gfp_allowed_mask; | 2724 | gfp_mask &= gfp_allowed_mask; |
2703 | 2725 | ||
@@ -2716,22 +2738,16 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2716 | if (unlikely(!zonelist->_zonerefs->zone)) | 2738 | if (unlikely(!zonelist->_zonerefs->zone)) |
2717 | return NULL; | 2739 | return NULL; |
2718 | 2740 | ||
2719 | /* | ||
2720 | * Will only have any effect when __GFP_KMEMCG is set. This is | ||
2721 | * verified in the (always inline) callee | ||
2722 | */ | ||
2723 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2724 | return NULL; | ||
2725 | |||
2726 | retry_cpuset: | 2741 | retry_cpuset: |
2727 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2742 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2728 | 2743 | ||
2729 | /* The preferred zone is used for statistics later */ | 2744 | /* The preferred zone is used for statistics later */ |
2730 | first_zones_zonelist(zonelist, high_zoneidx, | 2745 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, |
2731 | nodemask ? : &cpuset_current_mems_allowed, | 2746 | nodemask ? : &cpuset_current_mems_allowed, |
2732 | &preferred_zone); | 2747 | &preferred_zone); |
2733 | if (!preferred_zone) | 2748 | if (!preferred_zone) |
2734 | goto out; | 2749 | goto out; |
2750 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | ||
2735 | 2751 | ||
2736 | #ifdef CONFIG_CMA | 2752 | #ifdef CONFIG_CMA |
2737 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 2753 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
@@ -2741,7 +2757,7 @@ retry: | |||
2741 | /* First allocation attempt */ | 2757 | /* First allocation attempt */ |
2742 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2758 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2743 | zonelist, high_zoneidx, alloc_flags, | 2759 | zonelist, high_zoneidx, alloc_flags, |
2744 | preferred_zone, migratetype); | 2760 | preferred_zone, classzone_idx, migratetype); |
2745 | if (unlikely(!page)) { | 2761 | if (unlikely(!page)) { |
2746 | /* | 2762 | /* |
2747 | * The first pass makes sure allocations are spread | 2763 | * The first pass makes sure allocations are spread |
@@ -2767,7 +2783,7 @@ retry: | |||
2767 | gfp_mask = memalloc_noio_flags(gfp_mask); | 2783 | gfp_mask = memalloc_noio_flags(gfp_mask); |
2768 | page = __alloc_pages_slowpath(gfp_mask, order, | 2784 | page = __alloc_pages_slowpath(gfp_mask, order, |
2769 | zonelist, high_zoneidx, nodemask, | 2785 | zonelist, high_zoneidx, nodemask, |
2770 | preferred_zone, migratetype); | 2786 | preferred_zone, classzone_idx, migratetype); |
2771 | } | 2787 | } |
2772 | 2788 | ||
2773 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2789 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
@@ -2782,8 +2798,6 @@ out: | |||
2782 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 2798 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2783 | goto retry_cpuset; | 2799 | goto retry_cpuset; |
2784 | 2800 | ||
2785 | memcg_kmem_commit_charge(page, memcg, order); | ||
2786 | |||
2787 | return page; | 2801 | return page; |
2788 | } | 2802 | } |
2789 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2803 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
@@ -2818,7 +2832,7 @@ void __free_pages(struct page *page, unsigned int order) | |||
2818 | { | 2832 | { |
2819 | if (put_page_testzero(page)) { | 2833 | if (put_page_testzero(page)) { |
2820 | if (order == 0) | 2834 | if (order == 0) |
2821 | free_hot_cold_page(page, 0); | 2835 | free_hot_cold_page(page, false); |
2822 | else | 2836 | else |
2823 | __free_pages_ok(page, order); | 2837 | __free_pages_ok(page, order); |
2824 | } | 2838 | } |
@@ -2837,27 +2851,51 @@ void free_pages(unsigned long addr, unsigned int order) | |||
2837 | EXPORT_SYMBOL(free_pages); | 2851 | EXPORT_SYMBOL(free_pages); |
2838 | 2852 | ||
2839 | /* | 2853 | /* |
2840 | * __free_memcg_kmem_pages and free_memcg_kmem_pages will free | 2854 | * alloc_kmem_pages charges newly allocated pages to the kmem resource counter |
2841 | * pages allocated with __GFP_KMEMCG. | 2855 | * of the current memory cgroup. |
2842 | * | ||
2843 | * Those pages are accounted to a particular memcg, embedded in the | ||
2844 | * corresponding page_cgroup. To avoid adding a hit in the allocator to search | ||
2845 | * for that information only to find out that it is NULL for users who have no | ||
2846 | * interest in that whatsoever, we provide these functions. | ||
2847 | * | 2856 | * |
2848 | * The caller knows better which flags it relies on. | 2857 | * It should be used when the caller would like to use kmalloc, but since the |
2858 | * allocation is large, it has to fall back to the page allocator. | ||
2849 | */ | 2859 | */ |
2850 | void __free_memcg_kmem_pages(struct page *page, unsigned int order) | 2860 | struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) |
2861 | { | ||
2862 | struct page *page; | ||
2863 | struct mem_cgroup *memcg = NULL; | ||
2864 | |||
2865 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2866 | return NULL; | ||
2867 | page = alloc_pages(gfp_mask, order); | ||
2868 | memcg_kmem_commit_charge(page, memcg, order); | ||
2869 | return page; | ||
2870 | } | ||
2871 | |||
2872 | struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) | ||
2873 | { | ||
2874 | struct page *page; | ||
2875 | struct mem_cgroup *memcg = NULL; | ||
2876 | |||
2877 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2878 | return NULL; | ||
2879 | page = alloc_pages_node(nid, gfp_mask, order); | ||
2880 | memcg_kmem_commit_charge(page, memcg, order); | ||
2881 | return page; | ||
2882 | } | ||
2883 | |||
2884 | /* | ||
2885 | * __free_kmem_pages and free_kmem_pages will free pages allocated with | ||
2886 | * alloc_kmem_pages. | ||
2887 | */ | ||
2888 | void __free_kmem_pages(struct page *page, unsigned int order) | ||
2851 | { | 2889 | { |
2852 | memcg_kmem_uncharge_pages(page, order); | 2890 | memcg_kmem_uncharge_pages(page, order); |
2853 | __free_pages(page, order); | 2891 | __free_pages(page, order); |
2854 | } | 2892 | } |
2855 | 2893 | ||
2856 | void free_memcg_kmem_pages(unsigned long addr, unsigned int order) | 2894 | void free_kmem_pages(unsigned long addr, unsigned int order) |
2857 | { | 2895 | { |
2858 | if (addr != 0) { | 2896 | if (addr != 0) { |
2859 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | 2897 | VM_BUG_ON(!virt_addr_valid((void *)addr)); |
2860 | __free_memcg_kmem_pages(virt_to_page((void *)addr), order); | 2898 | __free_kmem_pages(virt_to_page((void *)addr), order); |
2861 | } | 2899 | } |
2862 | } | 2900 | } |
2863 | 2901 | ||
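A hedged usage sketch for the two helpers introduced above: a caller that wants a large, kmemcg-charged buffer charges and allocates in one step and releases it the same way. The wrapper names below are made up, and the use of get_order()/page_address() is just one plausible way to glue them together:

#include <linux/gfp.h>
#include <linux/mm.h>

static void *large_charged_alloc(size_t size, gfp_t flags)
{
	struct page *page;

	/* charges the current memcg's kmem counter before allocating */
	page = alloc_kmem_pages(flags, get_order(size));
	return page ? page_address(page) : NULL;
}

static void large_charged_free(void *ptr, size_t size)
{
	if (ptr)	/* uncharges the memcg and frees the pages in one call */
		free_kmem_pages((unsigned long)ptr, get_order(size));
}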
@@ -4095,7 +4133,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
4095 | 4133 | ||
4096 | static void __meminit zone_init_free_lists(struct zone *zone) | 4134 | static void __meminit zone_init_free_lists(struct zone *zone) |
4097 | { | 4135 | { |
4098 | int order, t; | 4136 | unsigned int order, t; |
4099 | for_each_migratetype_order(order, t) { | 4137 | for_each_migratetype_order(order, t) { |
4100 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); | 4138 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); |
4101 | zone->free_area[order].nr_free = 0; | 4139 | zone->free_area[order].nr_free = 0; |
@@ -4349,9 +4387,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, | |||
4349 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | 4387 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
4350 | /* | 4388 | /* |
4351 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. | 4389 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. |
4352 | * Architectures may implement their own version but if add_active_range() | ||
4353 | * was used and there are no special requirements, this is a convenient | ||
4354 | * alternative | ||
4355 | */ | 4390 | */ |
4356 | int __meminit __early_pfn_to_nid(unsigned long pfn) | 4391 | int __meminit __early_pfn_to_nid(unsigned long pfn) |
4357 | { | 4392 | { |
@@ -4406,10 +4441,9 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) | |||
4406 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | 4441 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
4407 | * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid | 4442 | * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid |
4408 | * | 4443 | * |
4409 | * If an architecture guarantees that all ranges registered with | 4444 | * If an architecture guarantees that all ranges registered contain no holes |
4410 | * add_active_ranges() contain no holes and may be freed, this | 4445 | * and may be freed, this function may be used instead of calling |
4411 | * this function may be used instead of calling memblock_free_early_nid() | 4446 | * memblock_free_early_nid() manually. |
4412 | * manually. | ||
4413 | */ | 4447 | */ |
4414 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | 4448 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) |
4415 | { | 4449 | { |
@@ -4431,9 +4465,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | |||
4431 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | 4465 | * sparse_memory_present_with_active_regions - Call memory_present for each active range |
4432 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. | 4466 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. |
4433 | * | 4467 | * |
4434 | * If an architecture guarantees that all ranges registered with | 4468 | * If an architecture guarantees that all ranges registered contain no holes and may |
4435 | * add_active_ranges() contain no holes and may be freed, this | 4469 | * be freed, this function may be used instead of calling memory_present() manually. |
4436 | * function may be used instead of calling memory_present() manually. | ||
4437 | */ | 4470 | */ |
4438 | void __init sparse_memory_present_with_active_regions(int nid) | 4471 | void __init sparse_memory_present_with_active_regions(int nid) |
4439 | { | 4472 | { |
@@ -4451,7 +4484,7 @@ void __init sparse_memory_present_with_active_regions(int nid) | |||
4451 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. | 4484 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. |
4452 | * | 4485 | * |
4453 | * It returns the start and end page frame of a node based on information | 4486 | * It returns the start and end page frame of a node based on information |
4454 | * provided by an arch calling add_active_range(). If called for a node | 4487 | * provided by memblock_set_node(). If called for a node |
4455 | * with no available memory, a warning is printed and the start and end | 4488 | * with no available memory, a warning is printed and the start and end |
4456 | * PFNs will be 0. | 4489 | * PFNs will be 0. |
4457 | */ | 4490 | */ |
@@ -4921,8 +4954,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4921 | 4954 | ||
4922 | pgdat->node_id = nid; | 4955 | pgdat->node_id = nid; |
4923 | pgdat->node_start_pfn = node_start_pfn; | 4956 | pgdat->node_start_pfn = node_start_pfn; |
4924 | if (node_state(nid, N_MEMORY)) | ||
4925 | init_zone_allows_reclaim(nid); | ||
4926 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 4957 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
4927 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 4958 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
4928 | #endif | 4959 | #endif |
@@ -5030,7 +5061,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) | |||
5030 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | 5061 | * find_min_pfn_with_active_regions - Find the minimum PFN registered |
5031 | * | 5062 | * |
5032 | * It returns the minimum PFN based on information provided via | 5063 | * It returns the minimum PFN based on information provided via |
5033 | * add_active_range(). | 5064 | * memblock_set_node(). |
5034 | */ | 5065 | */ |
5035 | unsigned long __init find_min_pfn_with_active_regions(void) | 5066 | unsigned long __init find_min_pfn_with_active_regions(void) |
5036 | { | 5067 | { |
@@ -5251,7 +5282,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) | |||
5251 | * @max_zone_pfn: an array of max PFNs for each zone | 5282 | * @max_zone_pfn: an array of max PFNs for each zone |
5252 | * | 5283 | * |
5253 | * This will call free_area_init_node() for each active node in the system. | 5284 | * This will call free_area_init_node() for each active node in the system. |
5254 | * Using the page ranges provided by add_active_range(), the size of each | 5285 | * Using the page ranges provided by memblock_set_node(), the size of each |
5255 | * zone in each node and their holes is calculated. If the maximum PFN | 5286 | * zone in each node and their holes is calculated. If the maximum PFN |
5256 | * between two adjacent zones match, it is assumed that the zone is empty. | 5287 | * between two adjacent zones match, it is assumed that the zone is empty. |
5257 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed | 5288 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed |
@@ -6009,53 +6040,64 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | |||
6009 | * @end_bitidx: The last bit of interest | 6040 | * @end_bitidx: The last bit of interest |
6010 | * returns pageblock_bits flags | 6041 | * returns pageblock_bits flags |
6011 | */ | 6042 | */ |
6012 | unsigned long get_pageblock_flags_group(struct page *page, | 6043 | unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, |
6013 | int start_bitidx, int end_bitidx) | 6044 | unsigned long end_bitidx, |
6045 | unsigned long mask) | ||
6014 | { | 6046 | { |
6015 | struct zone *zone; | 6047 | struct zone *zone; |
6016 | unsigned long *bitmap; | 6048 | unsigned long *bitmap; |
6017 | unsigned long pfn, bitidx; | 6049 | unsigned long bitidx, word_bitidx; |
6018 | unsigned long flags = 0; | 6050 | unsigned long word; |
6019 | unsigned long value = 1; | ||
6020 | 6051 | ||
6021 | zone = page_zone(page); | 6052 | zone = page_zone(page); |
6022 | pfn = page_to_pfn(page); | ||
6023 | bitmap = get_pageblock_bitmap(zone, pfn); | 6053 | bitmap = get_pageblock_bitmap(zone, pfn); |
6024 | bitidx = pfn_to_bitidx(zone, pfn); | 6054 | bitidx = pfn_to_bitidx(zone, pfn); |
6055 | word_bitidx = bitidx / BITS_PER_LONG; | ||
6056 | bitidx &= (BITS_PER_LONG-1); | ||
6025 | 6057 | ||
6026 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 6058 | word = bitmap[word_bitidx]; |
6027 | if (test_bit(bitidx + start_bitidx, bitmap)) | 6059 | bitidx += end_bitidx; |
6028 | flags |= value; | 6060 | return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; |
6029 | |||
6030 | return flags; | ||
6031 | } | 6061 | } |
6032 | 6062 | ||
6033 | /** | 6063 | /** |
6034 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages | 6064 | * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages |
6035 | * @page: The page within the block of interest | 6065 | * @page: The page within the block of interest |
6036 | * @start_bitidx: The first bit of interest | 6066 | * @start_bitidx: The first bit of interest |
6037 | * @end_bitidx: The last bit of interest | 6067 | * @end_bitidx: The last bit of interest |
6038 | * @flags: The flags to set | 6068 | * @flags: The flags to set |
6039 | */ | 6069 | */ |
6040 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | 6070 | void set_pfnblock_flags_mask(struct page *page, unsigned long flags, |
6041 | int start_bitidx, int end_bitidx) | 6071 | unsigned long pfn, |
6072 | unsigned long end_bitidx, | ||
6073 | unsigned long mask) | ||
6042 | { | 6074 | { |
6043 | struct zone *zone; | 6075 | struct zone *zone; |
6044 | unsigned long *bitmap; | 6076 | unsigned long *bitmap; |
6045 | unsigned long pfn, bitidx; | 6077 | unsigned long bitidx, word_bitidx; |
6046 | unsigned long value = 1; | 6078 | unsigned long old_word, word; |
6079 | |||
6080 | BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); | ||
6047 | 6081 | ||
6048 | zone = page_zone(page); | 6082 | zone = page_zone(page); |
6049 | pfn = page_to_pfn(page); | ||
6050 | bitmap = get_pageblock_bitmap(zone, pfn); | 6083 | bitmap = get_pageblock_bitmap(zone, pfn); |
6051 | bitidx = pfn_to_bitidx(zone, pfn); | 6084 | bitidx = pfn_to_bitidx(zone, pfn); |
6085 | word_bitidx = bitidx / BITS_PER_LONG; | ||
6086 | bitidx &= (BITS_PER_LONG-1); | ||
6087 | |||
6052 | VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); | 6088 | VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); |
6053 | 6089 | ||
6054 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 6090 | bitidx += end_bitidx; |
6055 | if (flags & value) | 6091 | mask <<= (BITS_PER_LONG - bitidx - 1); |
6056 | __set_bit(bitidx + start_bitidx, bitmap); | 6092 | flags <<= (BITS_PER_LONG - bitidx - 1); |
6057 | else | 6093 | |
6058 | __clear_bit(bitidx + start_bitidx, bitmap); | 6094 | word = ACCESS_ONCE(bitmap[word_bitidx]); |
6095 | for (;;) { | ||
6096 | old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); | ||
6097 | if (word == old_word) | ||
6098 | break; | ||
6099 | word = old_word; | ||
6100 | } | ||
6059 | } | 6101 | } |
6060 | 6102 | ||
6061 | /* | 6103 | /* |
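The rewrite above replaces the per-bit test_bit()/__set_bit() loops with a single word-sized load plus shift-and-mask on the read side and a cmpxchg() retry loop on the write side, so two CPUs updating adjacent pageblock fields can no longer clobber each other's bits. A standalone sketch of the same read/modify pattern (the 3-bit width and bit position are arbitrary and this does not reproduce the kernel's exact pageblock bit layout; the GCC/Clang __atomic builtins stand in for cmpxchg()):

#include <stdio.h>

#define FIELD_BITS	3UL
#define FIELD_MASK	((1UL << FIELD_BITS) - 1)

/* Read a packed 3-bit field with one load, one shift and one mask. */
static unsigned long get_field(const unsigned long *word, unsigned int shift)
{
	return (*word >> shift) & FIELD_MASK;
}

/* Atomically replace the field, retrying if another writer raced with us. */
static void set_field(unsigned long *word, unsigned int shift, unsigned long val)
{
	unsigned long mask = FIELD_MASK << shift;
	unsigned long flags = (val & FIELD_MASK) << shift;
	unsigned long old = __atomic_load_n(word, __ATOMIC_RELAXED);

	/* on failure the builtin refreshes 'old' with the current word value */
	while (!__atomic_compare_exchange_n(word, &old, (old & ~mask) | flags,
					    0, __ATOMIC_RELAXED, __ATOMIC_RELAXED))
		;
}

int main(void)
{
	unsigned long bitmap = 0;

	set_field(&bitmap, 4, 5);	/* e.g. store a migratetype of 5 at bit 4 */
	printf("field = %lu\n", get_field(&bitmap, 4));
	return 0;
}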
@@ -6215,7 +6257,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
6215 | cc->nr_migratepages -= nr_reclaimed; | 6257 | cc->nr_migratepages -= nr_reclaimed; |
6216 | 6258 | ||
6217 | ret = migrate_pages(&cc->migratepages, alloc_migrate_target, | 6259 | ret = migrate_pages(&cc->migratepages, alloc_migrate_target, |
6218 | 0, MIGRATE_SYNC, MR_CMA); | 6260 | NULL, 0, cc->mode, MR_CMA); |
6219 | } | 6261 | } |
6220 | if (ret < 0) { | 6262 | if (ret < 0) { |
6221 | putback_movable_pages(&cc->migratepages); | 6263 | putback_movable_pages(&cc->migratepages); |
@@ -6254,7 +6296,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
6254 | .nr_migratepages = 0, | 6296 | .nr_migratepages = 0, |
6255 | .order = -1, | 6297 | .order = -1, |
6256 | .zone = page_zone(pfn_to_page(start)), | 6298 | .zone = page_zone(pfn_to_page(start)), |
6257 | .sync = true, | 6299 | .mode = MIGRATE_SYNC, |
6258 | .ignore_skip_hint = true, | 6300 | .ignore_skip_hint = true, |
6259 | }; | 6301 | }; |
6260 | INIT_LIST_HEAD(&cc.migratepages); | 6302 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -6409,7 +6451,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
6409 | { | 6451 | { |
6410 | struct page *page; | 6452 | struct page *page; |
6411 | struct zone *zone; | 6453 | struct zone *zone; |
6412 | int order, i; | 6454 | unsigned int order, i; |
6413 | unsigned long pfn; | 6455 | unsigned long pfn; |
6414 | unsigned long flags; | 6456 | unsigned long flags; |
6415 | /* find the first valid pfn */ | 6457 | /* find the first valid pfn */ |
@@ -6461,7 +6503,7 @@ bool is_free_buddy_page(struct page *page) | |||
6461 | struct zone *zone = page_zone(page); | 6503 | struct zone *zone = page_zone(page); |
6462 | unsigned long pfn = page_to_pfn(page); | 6504 | unsigned long pfn = page_to_pfn(page); |
6463 | unsigned long flags; | 6505 | unsigned long flags; |
6464 | int order; | 6506 | unsigned int order; |
6465 | 6507 | ||
6466 | spin_lock_irqsave(&zone->lock, flags); | 6508 | spin_lock_irqsave(&zone->lock, flags); |
6467 | for (order = 0; order < MAX_ORDER; order++) { | 6509 | for (order = 0; order < MAX_ORDER; order++) { |
diff --git a/mm/page_io.c b/mm/page_io.c index 7c59ef681381..58b50d2901fe 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -248,11 +248,16 @@ out: | |||
248 | return ret; | 248 | return ret; |
249 | } | 249 | } |
250 | 250 | ||
251 | static sector_t swap_page_sector(struct page *page) | ||
252 | { | ||
253 | return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9); | ||
254 | } | ||
255 | |||
251 | int __swap_writepage(struct page *page, struct writeback_control *wbc, | 256 | int __swap_writepage(struct page *page, struct writeback_control *wbc, |
252 | void (*end_write_func)(struct bio *, int)) | 257 | void (*end_write_func)(struct bio *, int)) |
253 | { | 258 | { |
254 | struct bio *bio; | 259 | struct bio *bio; |
255 | int ret = 0, rw = WRITE; | 260 | int ret, rw = WRITE; |
256 | struct swap_info_struct *sis = page_swap_info(page); | 261 | struct swap_info_struct *sis = page_swap_info(page); |
257 | 262 | ||
258 | if (sis->flags & SWP_FILE) { | 263 | if (sis->flags & SWP_FILE) { |
@@ -297,6 +302,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, | |||
297 | return ret; | 302 | return ret; |
298 | } | 303 | } |
299 | 304 | ||
305 | ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); | ||
306 | if (!ret) { | ||
307 | count_vm_event(PSWPOUT); | ||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | ret = 0; | ||
300 | bio = get_swap_bio(GFP_NOIO, page, end_write_func); | 312 | bio = get_swap_bio(GFP_NOIO, page, end_write_func); |
301 | if (bio == NULL) { | 313 | if (bio == NULL) { |
302 | set_page_dirty(page); | 314 | set_page_dirty(page); |
@@ -338,6 +350,13 @@ int swap_readpage(struct page *page) | |||
338 | return ret; | 350 | return ret; |
339 | } | 351 | } |
340 | 352 | ||
353 | ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); | ||
354 | if (!ret) { | ||
355 | count_vm_event(PSWPIN); | ||
356 | return 0; | ||
357 | } | ||
358 | |||
359 | ret = 0; | ||
341 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); | 360 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
342 | if (bio == NULL) { | 361 | if (bio == NULL) { |
343 | unlock_page(page); | 362 | unlock_page(page); |
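Both hunks above add a fast path: when the backing block driver supports page-based I/O, bdev_write_page()/bdev_read_page() hand the page to it directly, the PSWPOUT/PSWPIN event is counted and the bio path is skipped; a non-zero return simply falls through to the existing bio code with ret reset to 0. swap_page_sector() converts the swap slot's page index into a 512-byte sector number by shifting left by PAGE_CACHE_SHIFT - 9. A standalone arithmetic check of that conversion (assumes 4 KB pages, i.e. PAGE_CACHE_SHIFT == 12):

#include <stdio.h>
#include <stdint.h>

#define PAGE_CACHE_SHIFT 12	/* 4096-byte pages assumed for this example */

static uint64_t page_index_to_sector(uint64_t page_index)
{
	return page_index << (PAGE_CACHE_SHIFT - 9);	/* one page == 8 sectors */
}

int main(void)
{
	/* swap slot 3 starts at byte 3 * 4096 = 12288, i.e. sector 24 */
	printf("slot 3 -> sector %llu\n",
	       (unsigned long long)page_index_to_sector(3));
	return 0;
}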
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
103 | * LOCK should suffice since the actual taking of the lock must | 103 | * LOCK should suffice since the actual taking of the lock must |
104 | * happen _before_ what follows. | 104 | * happen _before_ what follows. |
105 | */ | 105 | */ |
106 | might_sleep(); | ||
106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { | 107 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { |
107 | anon_vma_lock_write(anon_vma); | 108 | anon_vma_lock_write(anon_vma); |
108 | anon_vma_unlock_write(anon_vma); | 109 | anon_vma_unlock_write(anon_vma); |
@@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page) | |||
426 | * above cannot corrupt). | 427 | * above cannot corrupt). |
427 | */ | 428 | */ |
428 | if (!page_mapped(page)) { | 429 | if (!page_mapped(page)) { |
430 | rcu_read_unlock(); | ||
429 | put_anon_vma(anon_vma); | 431 | put_anon_vma(anon_vma); |
430 | anon_vma = NULL; | 432 | return NULL; |
431 | } | 433 | } |
432 | out: | 434 | out: |
433 | rcu_read_unlock(); | 435 | rcu_read_unlock(); |
@@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) | |||
477 | } | 479 | } |
478 | 480 | ||
479 | if (!page_mapped(page)) { | 481 | if (!page_mapped(page)) { |
482 | rcu_read_unlock(); | ||
480 | put_anon_vma(anon_vma); | 483 | put_anon_vma(anon_vma); |
481 | anon_vma = NULL; | 484 | return NULL; |
482 | goto out; | ||
483 | } | 485 | } |
484 | 486 | ||
485 | /* we pinned the anon_vma, its safe to sleep */ | 487 | /* we pinned the anon_vma, its safe to sleep */ |
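With might_sleep() now asserted on the anon_vma_free() path, dropping the last anon_vma reference may sleep on the root rwsem, which is why both hunks above move the final put_anon_vma() outside the RCU read-side critical section. A hedged sketch of the resulting ordering (example_try_get_anon_vma() is a hypothetical helper condensing page_get_anon_vma()'s flow, not code from this patch):

#include <linux/mm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>

static struct anon_vma *example_try_get_anon_vma(struct page *page,
						 struct anon_vma *anon_vma)
{
	rcu_read_lock();
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		rcu_read_unlock();
		return NULL;
	}
	if (!page_mapped(page)) {
		/* leave the RCU read section first: this put may drop the
		 * last reference and sleep inside anon_vma_free() */
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}
	rcu_read_unlock();
	return anon_vma;
}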
@@ -669,7 +671,7 @@ struct page_referenced_arg { | |||
669 | /* | 671 | /* |
670 | * arg: page_referenced_arg will be passed | 672 | * arg: page_referenced_arg will be passed |
671 | */ | 673 | */ |
672 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, | 674 | static int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
673 | unsigned long address, void *arg) | 675 | unsigned long address, void *arg) |
674 | { | 676 | { |
675 | struct mm_struct *mm = vma->vm_mm; | 677 | struct mm_struct *mm = vma->vm_mm; |
@@ -986,6 +988,12 @@ void do_page_add_anon_rmap(struct page *page, | |||
986 | { | 988 | { |
987 | int first = atomic_inc_and_test(&page->_mapcount); | 989 | int first = atomic_inc_and_test(&page->_mapcount); |
988 | if (first) { | 990 | if (first) { |
991 | /* | ||
992 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because | ||
993 | * these counters are not modified in interrupt context, and | ||
994 | * pte lock (a spinlock) is held, which implies preemption | ||
995 | * disabled. | ||
996 | */ | ||
989 | if (PageTransHuge(page)) | 997 | if (PageTransHuge(page)) |
990 | __inc_zone_page_state(page, | 998 | __inc_zone_page_state(page, |
991 | NR_ANON_TRANSPARENT_HUGEPAGES); | 999 | NR_ANON_TRANSPARENT_HUGEPAGES); |
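The comment added above records why the cheap, irq-unsafe statistics helpers are safe in rmap: as a simplified model, inc_zone_page_state() differs from __inc_zone_page_state() mainly in disabling interrupts around the per-cpu update, so the __ variant is fine whenever the counter is never touched from interrupt context and preemption is already off, which the pte spinlock guarantees here. Illustrative fragment, not the mm/rmap.c code:

	/*
	 * The __ helper skips the irq disabling because the caller
	 * guarantees no interrupt updates this counter and preemption
	 * is disabled (here via the pte spinlock).
	 */
	spin_lock(ptl);					/* pte lock held */
	__inc_zone_page_state(page, NR_ANON_PAGES);	/* irq-unsafe, but safe here */
	spin_unlock(ptl);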
@@ -1024,11 +1032,25 @@ void page_add_new_anon_rmap(struct page *page, | |||
1024 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | 1032 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, |
1025 | hpage_nr_pages(page)); | 1033 | hpage_nr_pages(page)); |
1026 | __page_set_anon_rmap(page, vma, address, 1); | 1034 | __page_set_anon_rmap(page, vma, address, 1); |
1027 | if (!mlocked_vma_newpage(vma, page)) { | 1035 | |
1036 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
1037 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { | ||
1028 | SetPageActive(page); | 1038 | SetPageActive(page); |
1029 | lru_cache_add(page); | 1039 | lru_cache_add(page); |
1030 | } else | 1040 | return; |
1031 | add_page_to_unevictable_list(page); | 1041 | } |
1042 | |||
1043 | if (!TestSetPageMlocked(page)) { | ||
1044 | /* | ||
1045 | * We use the irq-unsafe __mod_zone_page_stat because this | ||
1046 | * counter is not modified from interrupt context, and the pte | ||
1047 | * lock is held (a spinlock), which implies preemption disabled. | ||
1048 | */ | ||
1049 | __mod_zone_page_state(page_zone(page), NR_MLOCK, | ||
1050 | hpage_nr_pages(page)); | ||
1051 | count_vm_event(UNEVICTABLE_PGMLOCKED); | ||
1052 | } | ||
1053 | add_page_to_unevictable_list(page); | ||
1032 | } | 1054 | } |
1033 | 1055 | ||
1034 | /** | 1056 | /** |
@@ -1077,6 +1099,11 @@ void page_remove_rmap(struct page *page) | |||
1077 | /* | 1099 | /* |
1078 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED | 1100 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED |
1079 | * and not charged by memcg for now. | 1101 | * and not charged by memcg for now. |
1102 | * | ||
1103 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because | ||
1104 | * these counters are not modified in interrupt context, and | ||
1106 | * pte lock (a spinlock) is held, which implies preemption disabled. | ||
1080 | */ | 1107 | */ |
1081 | if (unlikely(PageHuge(page))) | 1108 | if (unlikely(PageHuge(page))) |
1082 | goto out; | 1109 | goto out; |
@@ -1112,7 +1139,7 @@ out: | |||
1112 | /* | 1139 | /* |
1113 | * @arg: enum ttu_flags will be passed to this argument | 1140 | * @arg: enum ttu_flags will be passed to this argument |
1114 | */ | 1141 | */ |
1115 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 1142 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
1116 | unsigned long address, void *arg) | 1143 | unsigned long address, void *arg) |
1117 | { | 1144 | { |
1118 | struct mm_struct *mm = vma->vm_mm; | 1145 | struct mm_struct *mm = vma->vm_mm; |
@@ -1135,7 +1162,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1135 | if (vma->vm_flags & VM_LOCKED) | 1162 | if (vma->vm_flags & VM_LOCKED) |
1136 | goto out_mlock; | 1163 | goto out_mlock; |
1137 | 1164 | ||
1138 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | 1165 | if (flags & TTU_MUNLOCK) |
1139 | goto out_unmap; | 1166 | goto out_unmap; |
1140 | } | 1167 | } |
1141 | if (!(flags & TTU_IGNORE_ACCESS)) { | 1168 | if (!(flags & TTU_IGNORE_ACCESS)) { |
@@ -1203,7 +1230,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1203 | * pte. do_swap_page() will wait until the migration | 1230 | * pte. do_swap_page() will wait until the migration |
1204 | * pte is removed and then restart fault handling. | 1231 | * pte is removed and then restart fault handling. |
1205 | */ | 1232 | */ |
1206 | BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); | 1233 | BUG_ON(!(flags & TTU_MIGRATION)); |
1207 | entry = make_migration_entry(page, pte_write(pteval)); | 1234 | entry = make_migration_entry(page, pte_write(pteval)); |
1208 | } | 1235 | } |
1209 | swp_pte = swp_entry_to_pte(entry); | 1236 | swp_pte = swp_entry_to_pte(entry); |
@@ -1212,7 +1239,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1212 | set_pte_at(mm, address, pte, swp_pte); | 1239 | set_pte_at(mm, address, pte, swp_pte); |
1213 | BUG_ON(pte_file(*pte)); | 1240 | BUG_ON(pte_file(*pte)); |
1214 | } else if (IS_ENABLED(CONFIG_MIGRATION) && | 1241 | } else if (IS_ENABLED(CONFIG_MIGRATION) && |
1215 | (TTU_ACTION(flags) == TTU_MIGRATION)) { | 1242 | (flags & TTU_MIGRATION)) { |
1216 | /* Establish migration entry for a file page */ | 1243 | /* Establish migration entry for a file page */ |
1217 | swp_entry_t entry; | 1244 | swp_entry_t entry; |
1218 | entry = make_migration_entry(page, pte_write(pteval)); | 1245 | entry = make_migration_entry(page, pte_write(pteval)); |
@@ -1225,7 +1252,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1225 | 1252 | ||
1226 | out_unmap: | 1253 | out_unmap: |
1227 | pte_unmap_unlock(pte, ptl); | 1254 | pte_unmap_unlock(pte, ptl); |
1228 | if (ret != SWAP_FAIL) | 1255 | if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) |
1229 | mmu_notifier_invalidate_page(mm, address); | 1256 | mmu_notifier_invalidate_page(mm, address); |
1230 | out: | 1257 | out: |
1231 | return ret; | 1258 | return ret; |
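The TTU_ACTION() conversions above treat ttu_flags as a plain bitmask: an action is now tested with a bitwise AND rather than an equality check, so it composes with modifier bits, and the mmu notifier call at out_unmap can be skipped for the munlock-only pass. A small sketch of that test style, as an illustration rather than the try_to_unmap_one() logic:

	static bool sketch_munlock_only(enum ttu_flags flags,
					struct vm_area_struct *vma)
	{
		/* bitmask test: composes with any other TTU_* modifier */
		return (vma->vm_flags & VM_LOCKED) && (flags & TTU_MUNLOCK);
	}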
@@ -1359,7 +1386,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1359 | if (page->index != linear_page_index(vma, address)) { | 1386 | if (page->index != linear_page_index(vma, address)) { |
1360 | pte_t ptfile = pgoff_to_pte(page->index); | 1387 | pte_t ptfile = pgoff_to_pte(page->index); |
1361 | if (pte_soft_dirty(pteval)) | 1388 | if (pte_soft_dirty(pteval)) |
1362 | pte_file_mksoft_dirty(ptfile); | 1389 | ptfile = pte_file_mksoft_dirty(ptfile); |
1363 | set_pte_at(mm, address, pte, ptfile); | 1390 | set_pte_at(mm, address, pte, ptfile); |
1364 | } | 1391 | } |
1365 | 1392 | ||
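The one-line fix above addresses a dropped return value: pte helpers such as pte_file_mksoft_dirty() take a pte by value and hand back the modified value, so ignoring the result silently loses the soft-dirty bit. The corrected usage, roughly:

	pte_t ptfile = pgoff_to_pte(page->index);

	if (pte_soft_dirty(pteval))
		ptfile = pte_file_mksoft_dirty(ptfile);	/* keep the returned pte */
	set_pte_at(mm, address, pte, ptfile);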
@@ -1512,7 +1539,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1512 | * locking requirements of exec(), migration skips | 1539 | * locking requirements of exec(), migration skips |
1513 | * temporary VMAs until after exec() completes. | 1540 | * temporary VMAs until after exec() completes. |
1514 | */ | 1541 | */ |
1515 | if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) | 1542 | if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) |
1516 | rwc.invalid_vma = invalid_migration_vma; | 1543 | rwc.invalid_vma = invalid_migration_vma; |
1517 | 1544 | ||
1518 | ret = rmap_walk(page, &rwc); | 1545 | ret = rmap_walk(page, &rwc); |
diff --git a/mm/shmem.c b/mm/shmem.c index 9f70e02111c6..5402481c28d1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1132,7 +1132,7 @@ repeat: | |||
1132 | goto decused; | 1132 | goto decused; |
1133 | } | 1133 | } |
1134 | 1134 | ||
1135 | SetPageSwapBacked(page); | 1135 | __SetPageSwapBacked(page); |
1136 | __set_page_locked(page); | 1136 | __set_page_locked(page); |
1137 | error = mem_cgroup_charge_file(page, current->mm, | 1137 | error = mem_cgroup_charge_file(page, current->mm, |
1138 | gfp & GFP_RECLAIM_MASK); | 1138 | gfp & GFP_RECLAIM_MASK); |
@@ -1372,9 +1372,13 @@ shmem_write_begin(struct file *file, struct address_space *mapping, | |||
1372 | loff_t pos, unsigned len, unsigned flags, | 1372 | loff_t pos, unsigned len, unsigned flags, |
1373 | struct page **pagep, void **fsdata) | 1373 | struct page **pagep, void **fsdata) |
1374 | { | 1374 | { |
1375 | int ret; | ||
1375 | struct inode *inode = mapping->host; | 1376 | struct inode *inode = mapping->host; |
1376 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1377 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1377 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | 1378 | ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); |
1379 | if (ret == 0 && *pagep) | ||
1380 | init_page_accessed(*pagep); | ||
1381 | return ret; | ||
1378 | } | 1382 | } |
1379 | 1383 | ||
1380 | static int | 1384 | static int |
@@ -1621,10 +1621,16 @@ __initcall(cpucache_init); | |||
1621 | static noinline void | 1621 | static noinline void |
1622 | slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | 1622 | slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) |
1623 | { | 1623 | { |
1624 | #if DEBUG | ||
1624 | struct kmem_cache_node *n; | 1625 | struct kmem_cache_node *n; |
1625 | struct page *page; | 1626 | struct page *page; |
1626 | unsigned long flags; | 1627 | unsigned long flags; |
1627 | int node; | 1628 | int node; |
1629 | static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, | ||
1630 | DEFAULT_RATELIMIT_BURST); | ||
1631 | |||
1632 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) | ||
1633 | return; | ||
1628 | 1634 | ||
1629 | printk(KERN_WARNING | 1635 | printk(KERN_WARNING |
1630 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 1636 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", |
@@ -1662,6 +1668,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | |||
1662 | node, active_slabs, num_slabs, active_objs, num_objs, | 1668 | node, active_slabs, num_slabs, active_objs, num_objs, |
1663 | free_objects); | 1669 | free_objects); |
1664 | } | 1670 | } |
1671 | #endif | ||
1665 | } | 1672 | } |
1666 | 1673 | ||
1667 | /* | 1674 | /* |
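The hunk above makes the SLAB out-of-memory report self-limiting: the body is compiled only with DEBUG, printing honours __GFP_NOWARN, and a static ratelimit state throttles repeats, so callers such as kmem_getpages() no longer need their own printk_ratelimit() guard. A minimal sketch of the gating pattern (illustrative helper, not the mm/slab.c function):

	static void sketch_oom_warning(gfp_t gfpflags, int nodeid)
	{
		static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);

		if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&oom_rs))
			return;		/* caller wants silence, or too chatty */

		pr_warn("sketch: allocation failed on node %d (gfp=0x%x)\n",
			nodeid, gfpflags);
	}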
@@ -1681,10 +1688,13 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | |||
1681 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1688 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1682 | flags |= __GFP_RECLAIMABLE; | 1689 | flags |= __GFP_RECLAIMABLE; |
1683 | 1690 | ||
1691 | if (memcg_charge_slab(cachep, flags, cachep->gfporder)) | ||
1692 | return NULL; | ||
1693 | |||
1684 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); | 1694 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); |
1685 | if (!page) { | 1695 | if (!page) { |
1686 | if (!(flags & __GFP_NOWARN) && printk_ratelimit()) | 1696 | memcg_uncharge_slab(cachep, cachep->gfporder); |
1687 | slab_out_of_memory(cachep, flags, nodeid); | 1697 | slab_out_of_memory(cachep, flags, nodeid); |
1688 | return NULL; | 1698 | return NULL; |
1689 | } | 1699 | } |
1690 | 1700 | ||
@@ -1702,7 +1712,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, | |||
1702 | __SetPageSlab(page); | 1712 | __SetPageSlab(page); |
1703 | if (page->pfmemalloc) | 1713 | if (page->pfmemalloc) |
1704 | SetPageSlabPfmemalloc(page); | 1714 | SetPageSlabPfmemalloc(page); |
1705 | memcg_bind_pages(cachep, cachep->gfporder); | ||
1706 | 1715 | ||
1707 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1716 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
1708 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | 1717 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); |
@@ -1738,10 +1747,10 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) | |||
1738 | page_mapcount_reset(page); | 1747 | page_mapcount_reset(page); |
1739 | page->mapping = NULL; | 1748 | page->mapping = NULL; |
1740 | 1749 | ||
1741 | memcg_release_pages(cachep, cachep->gfporder); | ||
1742 | if (current->reclaim_state) | 1750 | if (current->reclaim_state) |
1743 | current->reclaim_state->reclaimed_slab += nr_freed; | 1751 | current->reclaim_state->reclaimed_slab += nr_freed; |
1744 | __free_memcg_kmem_pages(page, cachep->gfporder); | 1752 | __free_pages(page, cachep->gfporder); |
1753 | memcg_uncharge_slab(cachep, cachep->gfporder); | ||
1745 | } | 1754 | } |
1746 | 1755 | ||
1747 | static void kmem_rcu_free(struct rcu_head *head) | 1756 | static void kmem_rcu_free(struct rcu_head *head) |
@@ -2469,8 +2478,7 @@ out: | |||
2469 | return nr_freed; | 2478 | return nr_freed; |
2470 | } | 2479 | } |
2471 | 2480 | ||
2472 | /* Called with slab_mutex held to protect against cpu hotplug */ | 2481 | int __kmem_cache_shrink(struct kmem_cache *cachep) |
2473 | static int __cache_shrink(struct kmem_cache *cachep) | ||
2474 | { | 2482 | { |
2475 | int ret = 0, i = 0; | 2483 | int ret = 0, i = 0; |
2476 | struct kmem_cache_node *n; | 2484 | struct kmem_cache_node *n; |
@@ -2491,32 +2499,11 @@ static int __cache_shrink(struct kmem_cache *cachep) | |||
2491 | return (ret ? 1 : 0); | 2499 | return (ret ? 1 : 0); |
2492 | } | 2500 | } |
2493 | 2501 | ||
2494 | /** | ||
2495 | * kmem_cache_shrink - Shrink a cache. | ||
2496 | * @cachep: The cache to shrink. | ||
2497 | * | ||
2498 | * Releases as many slabs as possible for a cache. | ||
2499 | * To help debugging, a zero exit status indicates all slabs were released. | ||
2500 | */ | ||
2501 | int kmem_cache_shrink(struct kmem_cache *cachep) | ||
2502 | { | ||
2503 | int ret; | ||
2504 | BUG_ON(!cachep || in_interrupt()); | ||
2505 | |||
2506 | get_online_cpus(); | ||
2507 | mutex_lock(&slab_mutex); | ||
2508 | ret = __cache_shrink(cachep); | ||
2509 | mutex_unlock(&slab_mutex); | ||
2510 | put_online_cpus(); | ||
2511 | return ret; | ||
2512 | } | ||
2513 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
2514 | |||
2515 | int __kmem_cache_shutdown(struct kmem_cache *cachep) | 2502 | int __kmem_cache_shutdown(struct kmem_cache *cachep) |
2516 | { | 2503 | { |
2517 | int i; | 2504 | int i; |
2518 | struct kmem_cache_node *n; | 2505 | struct kmem_cache_node *n; |
2519 | int rc = __cache_shrink(cachep); | 2506 | int rc = __kmem_cache_shrink(cachep); |
2520 | 2507 | ||
2521 | if (rc) | 2508 | if (rc) |
2522 | return rc; | 2509 | return rc; |
@@ -91,6 +91,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, | |||
91 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) | 91 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) |
92 | 92 | ||
93 | int __kmem_cache_shutdown(struct kmem_cache *); | 93 | int __kmem_cache_shutdown(struct kmem_cache *); |
94 | int __kmem_cache_shrink(struct kmem_cache *); | ||
94 | void slab_kmem_cache_release(struct kmem_cache *); | 95 | void slab_kmem_cache_release(struct kmem_cache *); |
95 | 96 | ||
96 | struct seq_file; | 97 | struct seq_file; |
@@ -120,21 +121,6 @@ static inline bool is_root_cache(struct kmem_cache *s) | |||
120 | return !s->memcg_params || s->memcg_params->is_root_cache; | 121 | return !s->memcg_params || s->memcg_params->is_root_cache; |
121 | } | 122 | } |
122 | 123 | ||
123 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | ||
124 | { | ||
125 | if (!is_root_cache(s)) | ||
126 | atomic_add(1 << order, &s->memcg_params->nr_pages); | ||
127 | } | ||
128 | |||
129 | static inline void memcg_release_pages(struct kmem_cache *s, int order) | ||
130 | { | ||
131 | if (is_root_cache(s)) | ||
132 | return; | ||
133 | |||
134 | if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) | ||
135 | mem_cgroup_destroy_cache(s); | ||
136 | } | ||
137 | |||
138 | static inline bool slab_equal_or_root(struct kmem_cache *s, | 124 | static inline bool slab_equal_or_root(struct kmem_cache *s, |
139 | struct kmem_cache *p) | 125 | struct kmem_cache *p) |
140 | { | 126 | { |
@@ -192,18 +178,29 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | |||
192 | return s; | 178 | return s; |
193 | return s->memcg_params->root_cache; | 179 | return s->memcg_params->root_cache; |
194 | } | 180 | } |
195 | #else | 181 | |
196 | static inline bool is_root_cache(struct kmem_cache *s) | 182 | static __always_inline int memcg_charge_slab(struct kmem_cache *s, |
183 | gfp_t gfp, int order) | ||
197 | { | 184 | { |
198 | return true; | 185 | if (!memcg_kmem_enabled()) |
186 | return 0; | ||
187 | if (is_root_cache(s)) | ||
188 | return 0; | ||
189 | return __memcg_charge_slab(s, gfp, order); | ||
199 | } | 190 | } |
200 | 191 | ||
201 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | 192 | static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) |
202 | { | 193 | { |
194 | if (!memcg_kmem_enabled()) | ||
195 | return; | ||
196 | if (is_root_cache(s)) | ||
197 | return; | ||
198 | __memcg_uncharge_slab(s, order); | ||
203 | } | 199 | } |
204 | 200 | #else | |
205 | static inline void memcg_release_pages(struct kmem_cache *s, int order) | 201 | static inline bool is_root_cache(struct kmem_cache *s) |
206 | { | 202 | { |
203 | return true; | ||
207 | } | 204 | } |
208 | 205 | ||
209 | static inline bool slab_equal_or_root(struct kmem_cache *s, | 206 | static inline bool slab_equal_or_root(struct kmem_cache *s, |
@@ -227,6 +224,15 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | |||
227 | { | 224 | { |
228 | return s; | 225 | return s; |
229 | } | 226 | } |
227 | |||
228 | static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) | ||
229 | { | ||
230 | return 0; | ||
231 | } | ||
232 | |||
233 | static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | ||
234 | { | ||
235 | } | ||
230 | #endif | 236 | #endif |
231 | 237 | ||
232 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | 238 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) |
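memcg_charge_slab()/memcg_uncharge_slab() above give both slab allocators one accounting entry point: charge the owning memcg before allocating backing pages and undo the charge on allocation failure or slab teardown, while memcg_kmem_enabled() and is_root_cache() keep the common case free of overhead. A rough sketch of the calling convention (hypothetical function, not the mm/slab.c or mm/slub.c code):

	static struct page *sketch_alloc_slab_pages(struct kmem_cache *s,
						    gfp_t flags, int order)
	{
		struct page *page;

		if (memcg_charge_slab(s, flags, order))
			return NULL;			/* over the memcg limit */

		page = alloc_pages(flags, order);
		if (!page)
			memcg_uncharge_slab(s, order);	/* hand the charge back */
		return page;
	}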
diff --git a/mm/slab_common.c b/mm/slab_common.c index 102cc6fca3d3..735e01a0db6f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -160,7 +160,6 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, | |||
160 | 160 | ||
161 | s->refcount = 1; | 161 | s->refcount = 1; |
162 | list_add(&s->list, &slab_caches); | 162 | list_add(&s->list, &slab_caches); |
163 | memcg_register_cache(s); | ||
164 | out: | 163 | out: |
165 | if (err) | 164 | if (err) |
166 | return ERR_PTR(err); | 165 | return ERR_PTR(err); |
@@ -205,6 +204,8 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
205 | int err; | 204 | int err; |
206 | 205 | ||
207 | get_online_cpus(); | 206 | get_online_cpus(); |
207 | get_online_mems(); | ||
208 | |||
208 | mutex_lock(&slab_mutex); | 209 | mutex_lock(&slab_mutex); |
209 | 210 | ||
210 | err = kmem_cache_sanity_check(name, size); | 211 | err = kmem_cache_sanity_check(name, size); |
@@ -239,6 +240,8 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
239 | 240 | ||
240 | out_unlock: | 241 | out_unlock: |
241 | mutex_unlock(&slab_mutex); | 242 | mutex_unlock(&slab_mutex); |
243 | |||
244 | put_online_mems(); | ||
242 | put_online_cpus(); | 245 | put_online_cpus(); |
243 | 246 | ||
244 | if (err) { | 247 | if (err) { |
@@ -258,31 +261,29 @@ EXPORT_SYMBOL(kmem_cache_create); | |||
258 | 261 | ||
259 | #ifdef CONFIG_MEMCG_KMEM | 262 | #ifdef CONFIG_MEMCG_KMEM |
260 | /* | 263 | /* |
261 | * kmem_cache_create_memcg - Create a cache for a memory cgroup. | 264 | * memcg_create_kmem_cache - Create a cache for a memory cgroup. |
262 | * @memcg: The memory cgroup the new cache is for. | 265 | * @memcg: The memory cgroup the new cache is for. |
263 | * @root_cache: The parent of the new cache. | 266 | * @root_cache: The parent of the new cache. |
267 | * @memcg_name: The name of the memory cgroup (used for naming the new cache). | ||
264 | * | 268 | * |
265 | * This function attempts to create a kmem cache that will serve allocation | 269 | * This function attempts to create a kmem cache that will serve allocation |
266 | * requests going from @memcg to @root_cache. The new cache inherits properties | 270 | * requests going from @memcg to @root_cache. The new cache inherits properties |
267 | * from its parent. | 271 | * from its parent. |
268 | */ | 272 | */ |
269 | void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) | 273 | struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, |
274 | struct kmem_cache *root_cache, | ||
275 | const char *memcg_name) | ||
270 | { | 276 | { |
271 | struct kmem_cache *s; | 277 | struct kmem_cache *s = NULL; |
272 | char *cache_name; | 278 | char *cache_name; |
273 | 279 | ||
274 | get_online_cpus(); | 280 | get_online_cpus(); |
275 | mutex_lock(&slab_mutex); | 281 | get_online_mems(); |
276 | 282 | ||
277 | /* | 283 | mutex_lock(&slab_mutex); |
278 | * Since per-memcg caches are created asynchronously on first | ||
279 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
280 | * create the same cache, but only one of them may succeed. | ||
281 | */ | ||
282 | if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg))) | ||
283 | goto out_unlock; | ||
284 | 284 | ||
285 | cache_name = memcg_create_cache_name(memcg, root_cache); | 285 | cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, |
286 | memcg_cache_id(memcg), memcg_name); | ||
286 | if (!cache_name) | 287 | if (!cache_name) |
287 | goto out_unlock; | 288 | goto out_unlock; |
288 | 289 | ||
@@ -292,17 +293,19 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c | |||
292 | memcg, root_cache); | 293 | memcg, root_cache); |
293 | if (IS_ERR(s)) { | 294 | if (IS_ERR(s)) { |
294 | kfree(cache_name); | 295 | kfree(cache_name); |
295 | goto out_unlock; | 296 | s = NULL; |
296 | } | 297 | } |
297 | 298 | ||
298 | s->allocflags |= __GFP_KMEMCG; | ||
299 | |||
300 | out_unlock: | 299 | out_unlock: |
301 | mutex_unlock(&slab_mutex); | 300 | mutex_unlock(&slab_mutex); |
301 | |||
302 | put_online_mems(); | ||
302 | put_online_cpus(); | 303 | put_online_cpus(); |
304 | |||
305 | return s; | ||
303 | } | 306 | } |
304 | 307 | ||
305 | static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) | 308 | static int memcg_cleanup_cache_params(struct kmem_cache *s) |
306 | { | 309 | { |
307 | int rc; | 310 | int rc; |
308 | 311 | ||
@@ -311,13 +314,13 @@ static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) | |||
311 | return 0; | 314 | return 0; |
312 | 315 | ||
313 | mutex_unlock(&slab_mutex); | 316 | mutex_unlock(&slab_mutex); |
314 | rc = __kmem_cache_destroy_memcg_children(s); | 317 | rc = __memcg_cleanup_cache_params(s); |
315 | mutex_lock(&slab_mutex); | 318 | mutex_lock(&slab_mutex); |
316 | 319 | ||
317 | return rc; | 320 | return rc; |
318 | } | 321 | } |
319 | #else | 322 | #else |
320 | static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) | 323 | static int memcg_cleanup_cache_params(struct kmem_cache *s) |
321 | { | 324 | { |
322 | return 0; | 325 | return 0; |
323 | } | 326 | } |
@@ -332,27 +335,26 @@ void slab_kmem_cache_release(struct kmem_cache *s) | |||
332 | void kmem_cache_destroy(struct kmem_cache *s) | 335 | void kmem_cache_destroy(struct kmem_cache *s) |
333 | { | 336 | { |
334 | get_online_cpus(); | 337 | get_online_cpus(); |
338 | get_online_mems(); | ||
339 | |||
335 | mutex_lock(&slab_mutex); | 340 | mutex_lock(&slab_mutex); |
336 | 341 | ||
337 | s->refcount--; | 342 | s->refcount--; |
338 | if (s->refcount) | 343 | if (s->refcount) |
339 | goto out_unlock; | 344 | goto out_unlock; |
340 | 345 | ||
341 | if (kmem_cache_destroy_memcg_children(s) != 0) | 346 | if (memcg_cleanup_cache_params(s) != 0) |
342 | goto out_unlock; | 347 | goto out_unlock; |
343 | 348 | ||
344 | list_del(&s->list); | ||
345 | memcg_unregister_cache(s); | ||
346 | |||
347 | if (__kmem_cache_shutdown(s) != 0) { | 349 | if (__kmem_cache_shutdown(s) != 0) { |
348 | list_add(&s->list, &slab_caches); | ||
349 | memcg_register_cache(s); | ||
350 | printk(KERN_ERR "kmem_cache_destroy %s: " | 350 | printk(KERN_ERR "kmem_cache_destroy %s: " |
351 | "Slab cache still has objects\n", s->name); | 351 | "Slab cache still has objects\n", s->name); |
352 | dump_stack(); | 352 | dump_stack(); |
353 | goto out_unlock; | 353 | goto out_unlock; |
354 | } | 354 | } |
355 | 355 | ||
356 | list_del(&s->list); | ||
357 | |||
356 | mutex_unlock(&slab_mutex); | 358 | mutex_unlock(&slab_mutex); |
357 | if (s->flags & SLAB_DESTROY_BY_RCU) | 359 | if (s->flags & SLAB_DESTROY_BY_RCU) |
358 | rcu_barrier(); | 360 | rcu_barrier(); |
@@ -363,15 +365,36 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
363 | #else | 365 | #else |
364 | slab_kmem_cache_release(s); | 366 | slab_kmem_cache_release(s); |
365 | #endif | 367 | #endif |
366 | goto out_put_cpus; | 368 | goto out; |
367 | 369 | ||
368 | out_unlock: | 370 | out_unlock: |
369 | mutex_unlock(&slab_mutex); | 371 | mutex_unlock(&slab_mutex); |
370 | out_put_cpus: | 372 | out: |
373 | put_online_mems(); | ||
371 | put_online_cpus(); | 374 | put_online_cpus(); |
372 | } | 375 | } |
373 | EXPORT_SYMBOL(kmem_cache_destroy); | 376 | EXPORT_SYMBOL(kmem_cache_destroy); |
374 | 377 | ||
378 | /** | ||
379 | * kmem_cache_shrink - Shrink a cache. | ||
380 | * @cachep: The cache to shrink. | ||
381 | * | ||
382 | * Releases as many slabs as possible for a cache. | ||
383 | * To help debugging, a zero exit status indicates all slabs were released. | ||
384 | */ | ||
385 | int kmem_cache_shrink(struct kmem_cache *cachep) | ||
386 | { | ||
387 | int ret; | ||
388 | |||
389 | get_online_cpus(); | ||
390 | get_online_mems(); | ||
391 | ret = __kmem_cache_shrink(cachep); | ||
392 | put_online_mems(); | ||
393 | put_online_cpus(); | ||
394 | return ret; | ||
395 | } | ||
396 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
397 | |||
375 | int slab_is_available(void) | 398 | int slab_is_available(void) |
376 | { | 399 | { |
377 | return slab_state >= UP; | 400 | return slab_state >= UP; |
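kmem_cache_shrink() now lives in slab_common.c and, like kmem_cache_create() and kmem_cache_destroy() above, brackets the per-allocator __kmem_cache_shrink() with both CPU and memory hotplug pins. A sketch of the bracketing order used throughout this file (hypothetical walker, assuming the usual slab_mutex/slab_caches globals):

	static void sketch_for_each_cache(void (*fn)(struct kmem_cache *))
	{
		struct kmem_cache *s;

		get_online_cpus();		/* pin CPU hotplug first */
		get_online_mems();		/* then memory hotplug */
		mutex_lock(&slab_mutex);

		list_for_each_entry(s, &slab_caches, list)
			fn(s);

		mutex_unlock(&slab_mutex);
		put_online_mems();		/* release in reverse order */
		put_online_cpus();
	}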
@@ -586,6 +609,24 @@ void __init create_kmalloc_caches(unsigned long flags) | |||
586 | } | 609 | } |
587 | #endif /* !CONFIG_SLOB */ | 610 | #endif /* !CONFIG_SLOB */ |
588 | 611 | ||
612 | /* | ||
613 | * To avoid unnecessary overhead, we pass through large allocation requests | ||
614 | * directly to the page allocator. We use __GFP_COMP, because we will need to | ||
615 | * know the allocation order to free the pages properly in kfree. | ||
616 | */ | ||
617 | void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) | ||
618 | { | ||
619 | void *ret; | ||
620 | struct page *page; | ||
621 | |||
622 | flags |= __GFP_COMP; | ||
623 | page = alloc_kmem_pages(flags, order); | ||
624 | ret = page ? page_address(page) : NULL; | ||
625 | kmemleak_alloc(ret, size, 1, flags); | ||
626 | return ret; | ||
627 | } | ||
628 | EXPORT_SYMBOL(kmalloc_order); | ||
629 | |||
589 | #ifdef CONFIG_TRACING | 630 | #ifdef CONFIG_TRACING |
590 | void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) | 631 | void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) |
591 | { | 632 | { |
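kmalloc_order() above is the out-of-line path for requests bigger than the largest kmalloc cache: the pages come straight from the page allocator with __GFP_COMP so kfree() can later read the order off the compound head. Callers do not invoke it directly; kmalloc() routes oversized sizes to it (through kmalloc_large()), roughly like this simplified sketch (not the include/linux/slab.h source):

	static inline void *sketch_kmalloc(size_t size, gfp_t flags)
	{
		if (size > KMALLOC_MAX_CACHE_SIZE)
			return kmalloc_order(size, flags, get_order(size));

		return kmalloc(size, flags);	/* normal slab-cache path */
	}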
@@ -620,11 +620,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c) | |||
620 | return 0; | 620 | return 0; |
621 | } | 621 | } |
622 | 622 | ||
623 | int kmem_cache_shrink(struct kmem_cache *d) | 623 | int __kmem_cache_shrink(struct kmem_cache *d) |
624 | { | 624 | { |
625 | return 0; | 625 | return 0; |
626 | } | 626 | } |
627 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
628 | 627 | ||
629 | struct kmem_cache kmem_cache_boot = { | 628 | struct kmem_cache kmem_cache_boot = { |
630 | .name = "kmem_cache", | 629 | .name = "kmem_cache", |
@@ -403,7 +403,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
403 | stat(s, CMPXCHG_DOUBLE_FAIL); | 403 | stat(s, CMPXCHG_DOUBLE_FAIL); |
404 | 404 | ||
405 | #ifdef SLUB_DEBUG_CMPXCHG | 405 | #ifdef SLUB_DEBUG_CMPXCHG |
406 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | 406 | pr_info("%s %s: cmpxchg double redo ", n, s->name); |
407 | #endif | 407 | #endif |
408 | 408 | ||
409 | return 0; | 409 | return 0; |
@@ -444,7 +444,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
444 | stat(s, CMPXCHG_DOUBLE_FAIL); | 444 | stat(s, CMPXCHG_DOUBLE_FAIL); |
445 | 445 | ||
446 | #ifdef SLUB_DEBUG_CMPXCHG | 446 | #ifdef SLUB_DEBUG_CMPXCHG |
447 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | 447 | pr_info("%s %s: cmpxchg double redo ", n, s->name); |
448 | #endif | 448 | #endif |
449 | 449 | ||
450 | return 0; | 450 | return 0; |
@@ -546,14 +546,14 @@ static void print_track(const char *s, struct track *t) | |||
546 | if (!t->addr) | 546 | if (!t->addr) |
547 | return; | 547 | return; |
548 | 548 | ||
549 | printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", | 549 | pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", |
550 | s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); | 550 | s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); |
551 | #ifdef CONFIG_STACKTRACE | 551 | #ifdef CONFIG_STACKTRACE |
552 | { | 552 | { |
553 | int i; | 553 | int i; |
554 | for (i = 0; i < TRACK_ADDRS_COUNT; i++) | 554 | for (i = 0; i < TRACK_ADDRS_COUNT; i++) |
555 | if (t->addrs[i]) | 555 | if (t->addrs[i]) |
556 | printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); | 556 | pr_err("\t%pS\n", (void *)t->addrs[i]); |
557 | else | 557 | else |
558 | break; | 558 | break; |
559 | } | 559 | } |
@@ -571,38 +571,37 @@ static void print_tracking(struct kmem_cache *s, void *object) | |||
571 | 571 | ||
572 | static void print_page_info(struct page *page) | 572 | static void print_page_info(struct page *page) |
573 | { | 573 | { |
574 | printk(KERN_ERR | 574 | pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", |
575 | "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", | ||
576 | page, page->objects, page->inuse, page->freelist, page->flags); | 575 | page, page->objects, page->inuse, page->freelist, page->flags); |
577 | 576 | ||
578 | } | 577 | } |
579 | 578 | ||
580 | static void slab_bug(struct kmem_cache *s, char *fmt, ...) | 579 | static void slab_bug(struct kmem_cache *s, char *fmt, ...) |
581 | { | 580 | { |
581 | struct va_format vaf; | ||
582 | va_list args; | 582 | va_list args; |
583 | char buf[100]; | ||
584 | 583 | ||
585 | va_start(args, fmt); | 584 | va_start(args, fmt); |
586 | vsnprintf(buf, sizeof(buf), fmt, args); | 585 | vaf.fmt = fmt; |
587 | va_end(args); | 586 | vaf.va = &args; |
588 | printk(KERN_ERR "========================================" | 587 | pr_err("=============================================================================\n"); |
589 | "=====================================\n"); | 588 | pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); |
590 | printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); | 589 | pr_err("-----------------------------------------------------------------------------\n\n"); |
591 | printk(KERN_ERR "----------------------------------------" | ||
592 | "-------------------------------------\n\n"); | ||
593 | 590 | ||
594 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 591 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
592 | va_end(args); | ||
595 | } | 593 | } |
596 | 594 | ||
597 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) | 595 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) |
598 | { | 596 | { |
597 | struct va_format vaf; | ||
599 | va_list args; | 598 | va_list args; |
600 | char buf[100]; | ||
601 | 599 | ||
602 | va_start(args, fmt); | 600 | va_start(args, fmt); |
603 | vsnprintf(buf, sizeof(buf), fmt, args); | 601 | vaf.fmt = fmt; |
602 | vaf.va = &args; | ||
603 | pr_err("FIX %s: %pV\n", s->name, &vaf); | ||
604 | va_end(args); | 604 | va_end(args); |
605 | printk(KERN_ERR "FIX %s: %s\n", s->name, buf); | ||
606 | } | 605 | } |
607 | 606 | ||
608 | static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | 607 | static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) |
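slab_bug() and slab_fix() above drop the fixed 100-byte vsnprintf() buffer in favour of the kernel's %pV mechanism: a struct va_format bundles the format string with the va_list so printk formats the message once, with no risk of truncation. The same pattern in isolation (illustrative wrapper, not the mm/slub.c functions):

	static void sketch_report(const char *prefix, const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;			/* consumed by the %pV below */
		pr_err("%s: %pV\n", prefix, &vaf);
		va_end(args);
	}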
@@ -614,8 +613,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
614 | 613 | ||
615 | print_page_info(page); | 614 | print_page_info(page); |
616 | 615 | ||
617 | printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", | 616 | pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", |
618 | p, p - addr, get_freepointer(s, p)); | 617 | p, p - addr, get_freepointer(s, p)); |
619 | 618 | ||
620 | if (p > addr + 16) | 619 | if (p > addr + 16) |
621 | print_section("Bytes b4 ", p - 16, 16); | 620 | print_section("Bytes b4 ", p - 16, 16); |
@@ -698,7 +697,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
698 | end--; | 697 | end--; |
699 | 698 | ||
700 | slab_bug(s, "%s overwritten", what); | 699 | slab_bug(s, "%s overwritten", what); |
701 | printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", | 700 | pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", |
702 | fault, end - 1, fault[0], value); | 701 | fault, end - 1, fault[0], value); |
703 | print_trailer(s, page, object); | 702 | print_trailer(s, page, object); |
704 | 703 | ||
@@ -931,7 +930,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, | |||
931 | int alloc) | 930 | int alloc) |
932 | { | 931 | { |
933 | if (s->flags & SLAB_TRACE) { | 932 | if (s->flags & SLAB_TRACE) { |
934 | printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", | 933 | pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", |
935 | s->name, | 934 | s->name, |
936 | alloc ? "alloc" : "free", | 935 | alloc ? "alloc" : "free", |
937 | object, page->inuse, | 936 | object, page->inuse, |
@@ -1134,9 +1133,8 @@ static noinline struct kmem_cache_node *free_debug_processing( | |||
1134 | slab_err(s, page, "Attempt to free object(0x%p) " | 1133 | slab_err(s, page, "Attempt to free object(0x%p) " |
1135 | "outside of slab", object); | 1134 | "outside of slab", object); |
1136 | } else if (!page->slab_cache) { | 1135 | } else if (!page->slab_cache) { |
1137 | printk(KERN_ERR | 1136 | pr_err("SLUB <none>: no slab for object 0x%p.\n", |
1138 | "SLUB <none>: no slab for object 0x%p.\n", | 1137 | object); |
1139 | object); | ||
1140 | dump_stack(); | 1138 | dump_stack(); |
1141 | } else | 1139 | } else |
1142 | object_err(s, page, object, | 1140 | object_err(s, page, object, |
@@ -1219,8 +1217,8 @@ static int __init setup_slub_debug(char *str) | |||
1219 | slub_debug |= SLAB_FAILSLAB; | 1217 | slub_debug |= SLAB_FAILSLAB; |
1220 | break; | 1218 | break; |
1221 | default: | 1219 | default: |
1222 | printk(KERN_ERR "slub_debug option '%c' " | 1220 | pr_err("slub_debug option '%c' unknown. skipped\n", |
1223 | "unknown. skipped\n", *str); | 1221 | *str); |
1224 | } | 1222 | } |
1225 | } | 1223 | } |
1226 | 1224 | ||
@@ -1314,17 +1312,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
1314 | /* | 1312 | /* |
1315 | * Slab allocation and freeing | 1313 | * Slab allocation and freeing |
1316 | */ | 1314 | */ |
1317 | static inline struct page *alloc_slab_page(gfp_t flags, int node, | 1315 | static inline struct page *alloc_slab_page(struct kmem_cache *s, |
1318 | struct kmem_cache_order_objects oo) | 1316 | gfp_t flags, int node, struct kmem_cache_order_objects oo) |
1319 | { | 1317 | { |
1318 | struct page *page; | ||
1320 | int order = oo_order(oo); | 1319 | int order = oo_order(oo); |
1321 | 1320 | ||
1322 | flags |= __GFP_NOTRACK; | 1321 | flags |= __GFP_NOTRACK; |
1323 | 1322 | ||
1323 | if (memcg_charge_slab(s, flags, order)) | ||
1324 | return NULL; | ||
1325 | |||
1324 | if (node == NUMA_NO_NODE) | 1326 | if (node == NUMA_NO_NODE) |
1325 | return alloc_pages(flags, order); | 1327 | page = alloc_pages(flags, order); |
1326 | else | 1328 | else |
1327 | return alloc_pages_exact_node(node, flags, order); | 1329 | page = alloc_pages_exact_node(node, flags, order); |
1330 | |||
1331 | if (!page) | ||
1332 | memcg_uncharge_slab(s, order); | ||
1333 | |||
1334 | return page; | ||
1328 | } | 1335 | } |
1329 | 1336 | ||
1330 | static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | 1337 | static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) |
@@ -1346,7 +1353,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1346 | */ | 1353 | */ |
1347 | alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; | 1354 | alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; |
1348 | 1355 | ||
1349 | page = alloc_slab_page(alloc_gfp, node, oo); | 1356 | page = alloc_slab_page(s, alloc_gfp, node, oo); |
1350 | if (unlikely(!page)) { | 1357 | if (unlikely(!page)) { |
1351 | oo = s->min; | 1358 | oo = s->min; |
1352 | alloc_gfp = flags; | 1359 | alloc_gfp = flags; |
@@ -1354,7 +1361,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1354 | * Allocation may have failed due to fragmentation. | 1361 | * Allocation may have failed due to fragmentation. |
1355 | * Try a lower order alloc if possible | 1362 | * Try a lower order alloc if possible |
1356 | */ | 1363 | */ |
1357 | page = alloc_slab_page(alloc_gfp, node, oo); | 1364 | page = alloc_slab_page(s, alloc_gfp, node, oo); |
1358 | 1365 | ||
1359 | if (page) | 1366 | if (page) |
1360 | stat(s, ORDER_FALLBACK); | 1367 | stat(s, ORDER_FALLBACK); |
@@ -1415,7 +1422,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1415 | 1422 | ||
1416 | order = compound_order(page); | 1423 | order = compound_order(page); |
1417 | inc_slabs_node(s, page_to_nid(page), page->objects); | 1424 | inc_slabs_node(s, page_to_nid(page), page->objects); |
1418 | memcg_bind_pages(s, order); | ||
1419 | page->slab_cache = s; | 1425 | page->slab_cache = s; |
1420 | __SetPageSlab(page); | 1426 | __SetPageSlab(page); |
1421 | if (page->pfmemalloc) | 1427 | if (page->pfmemalloc) |
@@ -1466,11 +1472,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1466 | __ClearPageSlabPfmemalloc(page); | 1472 | __ClearPageSlabPfmemalloc(page); |
1467 | __ClearPageSlab(page); | 1473 | __ClearPageSlab(page); |
1468 | 1474 | ||
1469 | memcg_release_pages(s, order); | ||
1470 | page_mapcount_reset(page); | 1475 | page_mapcount_reset(page); |
1471 | if (current->reclaim_state) | 1476 | if (current->reclaim_state) |
1472 | current->reclaim_state->reclaimed_slab += pages; | 1477 | current->reclaim_state->reclaimed_slab += pages; |
1473 | __free_memcg_kmem_pages(page, order); | 1478 | __free_pages(page, order); |
1479 | memcg_uncharge_slab(s, order); | ||
1474 | } | 1480 | } |
1475 | 1481 | ||
1476 | #define need_reserve_slab_rcu \ | 1482 | #define need_reserve_slab_rcu \ |
@@ -1770,19 +1776,19 @@ static inline void note_cmpxchg_failure(const char *n, | |||
1770 | #ifdef SLUB_DEBUG_CMPXCHG | 1776 | #ifdef SLUB_DEBUG_CMPXCHG |
1771 | unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); | 1777 | unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); |
1772 | 1778 | ||
1773 | printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); | 1779 | pr_info("%s %s: cmpxchg redo ", n, s->name); |
1774 | 1780 | ||
1775 | #ifdef CONFIG_PREEMPT | 1781 | #ifdef CONFIG_PREEMPT |
1776 | if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) | 1782 | if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) |
1777 | printk("due to cpu change %d -> %d\n", | 1783 | pr_warn("due to cpu change %d -> %d\n", |
1778 | tid_to_cpu(tid), tid_to_cpu(actual_tid)); | 1784 | tid_to_cpu(tid), tid_to_cpu(actual_tid)); |
1779 | else | 1785 | else |
1780 | #endif | 1786 | #endif |
1781 | if (tid_to_event(tid) != tid_to_event(actual_tid)) | 1787 | if (tid_to_event(tid) != tid_to_event(actual_tid)) |
1782 | printk("due to cpu running other code. Event %ld->%ld\n", | 1788 | pr_warn("due to cpu running other code. Event %ld->%ld\n", |
1783 | tid_to_event(tid), tid_to_event(actual_tid)); | 1789 | tid_to_event(tid), tid_to_event(actual_tid)); |
1784 | else | 1790 | else |
1785 | printk("for unknown reason: actual=%lx was=%lx target=%lx\n", | 1791 | pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", |
1786 | actual_tid, tid, next_tid(tid)); | 1792 | actual_tid, tid, next_tid(tid)); |
1787 | #endif | 1793 | #endif |
1788 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); | 1794 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); |
@@ -2121,11 +2127,19 @@ static inline int node_match(struct page *page, int node) | |||
2121 | return 1; | 2127 | return 1; |
2122 | } | 2128 | } |
2123 | 2129 | ||
2130 | #ifdef CONFIG_SLUB_DEBUG | ||
2124 | static int count_free(struct page *page) | 2131 | static int count_free(struct page *page) |
2125 | { | 2132 | { |
2126 | return page->objects - page->inuse; | 2133 | return page->objects - page->inuse; |
2127 | } | 2134 | } |
2128 | 2135 | ||
2136 | static inline unsigned long node_nr_objs(struct kmem_cache_node *n) | ||
2137 | { | ||
2138 | return atomic_long_read(&n->total_objects); | ||
2139 | } | ||
2140 | #endif /* CONFIG_SLUB_DEBUG */ | ||
2141 | |||
2142 | #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) | ||
2129 | static unsigned long count_partial(struct kmem_cache_node *n, | 2143 | static unsigned long count_partial(struct kmem_cache_node *n, |
2130 | int (*get_count)(struct page *)) | 2144 | int (*get_count)(struct page *)) |
2131 | { | 2145 | { |
@@ -2139,31 +2153,28 @@ static unsigned long count_partial(struct kmem_cache_node *n, | |||
2139 | spin_unlock_irqrestore(&n->list_lock, flags); | 2153 | spin_unlock_irqrestore(&n->list_lock, flags); |
2140 | return x; | 2154 | return x; |
2141 | } | 2155 | } |
2142 | 2156 | #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ | |
2143 | static inline unsigned long node_nr_objs(struct kmem_cache_node *n) | ||
2144 | { | ||
2145 | #ifdef CONFIG_SLUB_DEBUG | ||
2146 | return atomic_long_read(&n->total_objects); | ||
2147 | #else | ||
2148 | return 0; | ||
2149 | #endif | ||
2150 | } | ||
2151 | 2157 | ||
2152 | static noinline void | 2158 | static noinline void |
2153 | slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | 2159 | slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) |
2154 | { | 2160 | { |
2161 | #ifdef CONFIG_SLUB_DEBUG | ||
2162 | static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, | ||
2163 | DEFAULT_RATELIMIT_BURST); | ||
2155 | int node; | 2164 | int node; |
2156 | 2165 | ||
2157 | printk(KERN_WARNING | 2166 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) |
2158 | "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 2167 | return; |
2168 | |||
2169 | pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", | ||
2159 | nid, gfpflags); | 2170 | nid, gfpflags); |
2160 | printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " | 2171 | pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", |
2161 | "default order: %d, min order: %d\n", s->name, s->object_size, | 2172 | s->name, s->object_size, s->size, oo_order(s->oo), |
2162 | s->size, oo_order(s->oo), oo_order(s->min)); | 2173 | oo_order(s->min)); |
2163 | 2174 | ||
2164 | if (oo_order(s->min) > get_order(s->object_size)) | 2175 | if (oo_order(s->min) > get_order(s->object_size)) |
2165 | printk(KERN_WARNING " %s debugging increased min order, use " | 2176 | pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", |
2166 | "slub_debug=O to disable.\n", s->name); | 2177 | s->name); |
2167 | 2178 | ||
2168 | for_each_online_node(node) { | 2179 | for_each_online_node(node) { |
2169 | struct kmem_cache_node *n = get_node(s, node); | 2180 | struct kmem_cache_node *n = get_node(s, node); |
@@ -2178,10 +2189,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
2178 | nr_slabs = node_nr_slabs(n); | 2189 | nr_slabs = node_nr_slabs(n); |
2179 | nr_objs = node_nr_objs(n); | 2190 | nr_objs = node_nr_objs(n); |
2180 | 2191 | ||
2181 | printk(KERN_WARNING | 2192 | pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", |
2182 | " node %d: slabs: %ld, objs: %ld, free: %ld\n", | ||
2183 | node, nr_slabs, nr_objs, nr_free); | 2193 | node, nr_slabs, nr_objs, nr_free); |
2184 | } | 2194 | } |
2195 | #endif | ||
2185 | } | 2196 | } |
2186 | 2197 | ||
2187 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | 2198 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, |
@@ -2198,7 +2209,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | |||
2198 | 2209 | ||
2199 | page = new_slab(s, flags, node); | 2210 | page = new_slab(s, flags, node); |
2200 | if (page) { | 2211 | if (page) { |
2201 | c = __this_cpu_ptr(s->cpu_slab); | 2212 | c = raw_cpu_ptr(s->cpu_slab); |
2202 | if (c->page) | 2213 | if (c->page) |
2203 | flush_slab(s, c); | 2214 | flush_slab(s, c); |
2204 | 2215 | ||
@@ -2323,8 +2334,6 @@ redo: | |||
2323 | if (freelist) | 2334 | if (freelist) |
2324 | goto load_freelist; | 2335 | goto load_freelist; |
2325 | 2336 | ||
2326 | stat(s, ALLOC_SLOWPATH); | ||
2327 | |||
2328 | freelist = get_freelist(s, page); | 2337 | freelist = get_freelist(s, page); |
2329 | 2338 | ||
2330 | if (!freelist) { | 2339 | if (!freelist) { |
@@ -2360,9 +2369,7 @@ new_slab: | |||
2360 | freelist = new_slab_objects(s, gfpflags, node, &c); | 2369 | freelist = new_slab_objects(s, gfpflags, node, &c); |
2361 | 2370 | ||
2362 | if (unlikely(!freelist)) { | 2371 | if (unlikely(!freelist)) { |
2363 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | 2372 | slab_out_of_memory(s, gfpflags, node); |
2364 | slab_out_of_memory(s, gfpflags, node); | ||
2365 | |||
2366 | local_irq_restore(flags); | 2373 | local_irq_restore(flags); |
2367 | return NULL; | 2374 | return NULL; |
2368 | } | 2375 | } |
@@ -2418,7 +2425,7 @@ redo: | |||
2418 | * and the retrieval of the tid. | 2425 | * and the retrieval of the tid. |
2419 | */ | 2426 | */ |
2420 | preempt_disable(); | 2427 | preempt_disable(); |
2421 | c = __this_cpu_ptr(s->cpu_slab); | 2428 | c = this_cpu_ptr(s->cpu_slab); |
2422 | 2429 | ||
2423 | /* | 2430 | /* |
2424 | * The transaction ids are globally unique per cpu and per operation on | 2431 | * The transaction ids are globally unique per cpu and per operation on |
@@ -2431,10 +2438,10 @@ redo: | |||
2431 | 2438 | ||
2432 | object = c->freelist; | 2439 | object = c->freelist; |
2433 | page = c->page; | 2440 | page = c->page; |
2434 | if (unlikely(!object || !node_match(page, node))) | 2441 | if (unlikely(!object || !node_match(page, node))) { |
2435 | object = __slab_alloc(s, gfpflags, node, addr, c); | 2442 | object = __slab_alloc(s, gfpflags, node, addr, c); |
2436 | 2443 | stat(s, ALLOC_SLOWPATH); | |
2437 | else { | 2444 | } else { |
2438 | void *next_object = get_freepointer_safe(s, object); | 2445 | void *next_object = get_freepointer_safe(s, object); |
2439 | 2446 | ||
2440 | /* | 2447 | /* |
@@ -2674,7 +2681,7 @@ redo: | |||
2674 | * during the cmpxchg then the free will succeed. | 2681 |
2675 | */ | 2682 | */ |
2676 | preempt_disable(); | 2683 | preempt_disable(); |
2677 | c = __this_cpu_ptr(s->cpu_slab); | 2684 | c = this_cpu_ptr(s->cpu_slab); |
2678 | 2685 | ||
2679 | tid = c->tid; | 2686 | tid = c->tid; |
2680 | preempt_enable(); | 2687 | preempt_enable(); |
@@ -2894,10 +2901,8 @@ static void early_kmem_cache_node_alloc(int node) | |||
2894 | 2901 | ||
2895 | BUG_ON(!page); | 2902 | BUG_ON(!page); |
2896 | if (page_to_nid(page) != node) { | 2903 | if (page_to_nid(page) != node) { |
2897 | printk(KERN_ERR "SLUB: Unable to allocate memory from " | 2904 | pr_err("SLUB: Unable to allocate memory from node %d\n", node); |
2898 | "node %d\n", node); | 2905 | pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); |
2899 | printk(KERN_ERR "SLUB: Allocating a useless per node structure " | ||
2900 | "in order to be able to continue\n"); | ||
2901 | } | 2906 | } |
2902 | 2907 | ||
2903 | n = page->freelist; | 2908 | n = page->freelist; |
@@ -3182,8 +3187,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, | |||
3182 | for_each_object(p, s, addr, page->objects) { | 3187 | for_each_object(p, s, addr, page->objects) { |
3183 | 3188 | ||
3184 | if (!test_bit(slab_index(p, s, addr), map)) { | 3189 | if (!test_bit(slab_index(p, s, addr), map)) { |
3185 | printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", | 3190 | pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); |
3186 | p, p - addr); | ||
3187 | print_tracking(s, p); | 3191 | print_tracking(s, p); |
3188 | } | 3192 | } |
3189 | } | 3193 | } |
@@ -3305,8 +3309,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | |||
3305 | struct page *page; | 3309 | struct page *page; |
3306 | void *ptr = NULL; | 3310 | void *ptr = NULL; |
3307 | 3311 | ||
3308 | flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; | 3312 | flags |= __GFP_COMP | __GFP_NOTRACK; |
3309 | page = alloc_pages_node(node, flags, get_order(size)); | 3313 | page = alloc_kmem_pages_node(node, flags, get_order(size)); |
3310 | if (page) | 3314 | if (page) |
3311 | ptr = page_address(page); | 3315 | ptr = page_address(page); |
3312 | 3316 | ||
@@ -3375,7 +3379,7 @@ void kfree(const void *x) | |||
3375 | if (unlikely(!PageSlab(page))) { | 3379 | if (unlikely(!PageSlab(page))) { |
3376 | BUG_ON(!PageCompound(page)); | 3380 | BUG_ON(!PageCompound(page)); |
3377 | kfree_hook(x); | 3381 | kfree_hook(x); |
3378 | __free_memcg_kmem_pages(page, compound_order(page)); | 3382 | __free_kmem_pages(page, compound_order(page)); |
3379 | return; | 3383 | return; |
3380 | } | 3384 | } |
3381 | slab_free(page->slab_cache, page, object, _RET_IP_); | 3385 | slab_free(page->slab_cache, page, object, _RET_IP_); |
@@ -3392,7 +3396,7 @@ EXPORT_SYMBOL(kfree); | |||
3392 | * being allocated from last increasing the chance that the last objects | 3396 | * being allocated from last increasing the chance that the last objects |
3393 | * are freed in them. | 3397 | * are freed in them. |
3394 | */ | 3398 | */ |
3395 | int kmem_cache_shrink(struct kmem_cache *s) | 3399 | int __kmem_cache_shrink(struct kmem_cache *s) |
3396 | { | 3400 | { |
3397 | int node; | 3401 | int node; |
3398 | int i; | 3402 | int i; |
@@ -3448,7 +3452,6 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
3448 | kfree(slabs_by_inuse); | 3452 | kfree(slabs_by_inuse); |
3449 | return 0; | 3453 | return 0; |
3450 | } | 3454 | } |
3451 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
3452 | 3455 | ||
3453 | static int slab_mem_going_offline_callback(void *arg) | 3456 | static int slab_mem_going_offline_callback(void *arg) |
3454 | { | 3457 | { |
@@ -3456,7 +3459,7 @@ static int slab_mem_going_offline_callback(void *arg) | |||
3456 | 3459 | ||
3457 | mutex_lock(&slab_mutex); | 3460 | mutex_lock(&slab_mutex); |
3458 | list_for_each_entry(s, &slab_caches, list) | 3461 | list_for_each_entry(s, &slab_caches, list) |
3459 | kmem_cache_shrink(s); | 3462 | __kmem_cache_shrink(s); |
3460 | mutex_unlock(&slab_mutex); | 3463 | mutex_unlock(&slab_mutex); |
3461 | 3464 | ||
3462 | return 0; | 3465 | return 0; |
@@ -3650,9 +3653,7 @@ void __init kmem_cache_init(void) | |||
3650 | register_cpu_notifier(&slab_notifier); | 3653 | register_cpu_notifier(&slab_notifier); |
3651 | #endif | 3654 | #endif |
3652 | 3655 | ||
3653 | printk(KERN_INFO | 3656 | pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", |
3654 | "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," | ||
3655 | " CPUs=%d, Nodes=%d\n", | ||
3656 | cache_line_size(), | 3657 | cache_line_size(), |
3657 | slub_min_order, slub_max_order, slub_min_objects, | 3658 | slub_min_order, slub_max_order, slub_min_objects, |
3658 | nr_cpu_ids, nr_node_ids); | 3659 | nr_cpu_ids, nr_node_ids); |
@@ -3934,8 +3935,8 @@ static int validate_slab_node(struct kmem_cache *s, | |||
3934 | count++; | 3935 | count++; |
3935 | } | 3936 | } |
3936 | if (count != n->nr_partial) | 3937 | if (count != n->nr_partial) |
3937 | printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " | 3938 | pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", |
3938 | "counter=%ld\n", s->name, count, n->nr_partial); | 3939 | s->name, count, n->nr_partial); |
3939 | 3940 | ||
3940 | if (!(s->flags & SLAB_STORE_USER)) | 3941 | if (!(s->flags & SLAB_STORE_USER)) |
3941 | goto out; | 3942 | goto out; |
@@ -3945,9 +3946,8 @@ static int validate_slab_node(struct kmem_cache *s, | |||
3945 | count++; | 3946 | count++; |
3946 | } | 3947 | } |
3947 | if (count != atomic_long_read(&n->nr_slabs)) | 3948 | if (count != atomic_long_read(&n->nr_slabs)) |
3948 | printk(KERN_ERR "SLUB: %s %ld slabs counted but " | 3949 | pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", |
3949 | "counter=%ld\n", s->name, count, | 3950 | s->name, count, atomic_long_read(&n->nr_slabs)); |
3950 | atomic_long_read(&n->nr_slabs)); | ||
3951 | 3951 | ||
3952 | out: | 3952 | out: |
3953 | spin_unlock_irqrestore(&n->list_lock, flags); | 3953 | spin_unlock_irqrestore(&n->list_lock, flags); |
@@ -4211,53 +4211,50 @@ static void resiliency_test(void) | |||
4211 | 4211 | ||
4212 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); | 4212 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); |
4213 | 4213 | ||
4214 | printk(KERN_ERR "SLUB resiliency testing\n"); | 4214 | pr_err("SLUB resiliency testing\n"); |
4215 | printk(KERN_ERR "-----------------------\n"); | 4215 | pr_err("-----------------------\n"); |
4216 | printk(KERN_ERR "A. Corruption after allocation\n"); | 4216 | pr_err("A. Corruption after allocation\n"); |
4217 | 4217 | ||
4218 | p = kzalloc(16, GFP_KERNEL); | 4218 | p = kzalloc(16, GFP_KERNEL); |
4219 | p[16] = 0x12; | 4219 | p[16] = 0x12; |
4220 | printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" | 4220 | pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", |
4221 | " 0x12->0x%p\n\n", p + 16); | 4221 | p + 16); |
4222 | 4222 | ||
4223 | validate_slab_cache(kmalloc_caches[4]); | 4223 | validate_slab_cache(kmalloc_caches[4]); |
4224 | 4224 | ||
4225 | /* Hmmm... The next two are dangerous */ | 4225 | /* Hmmm... The next two are dangerous */ |
4226 | p = kzalloc(32, GFP_KERNEL); | 4226 | p = kzalloc(32, GFP_KERNEL); |
4227 | p[32 + sizeof(void *)] = 0x34; | 4227 | p[32 + sizeof(void *)] = 0x34; |
4228 | printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" | 4228 | pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", |
4229 | " 0x34 -> -0x%p\n", p); | 4229 | p); |
4230 | printk(KERN_ERR | 4230 | pr_err("If allocated object is overwritten then not detectable\n\n"); |
4231 | "If allocated object is overwritten then not detectable\n\n"); | ||
4232 | 4231 | ||
4233 | validate_slab_cache(kmalloc_caches[5]); | 4232 | validate_slab_cache(kmalloc_caches[5]); |
4234 | p = kzalloc(64, GFP_KERNEL); | 4233 | p = kzalloc(64, GFP_KERNEL); |
4235 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); | 4234 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); |
4236 | *p = 0x56; | 4235 | *p = 0x56; |
4237 | printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", | 4236 | pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", |
4238 | p); | 4237 | p); |
4239 | printk(KERN_ERR | 4238 | pr_err("If allocated object is overwritten then not detectable\n\n"); |
4240 | "If allocated object is overwritten then not detectable\n\n"); | ||
4241 | validate_slab_cache(kmalloc_caches[6]); | 4239 | validate_slab_cache(kmalloc_caches[6]); |
4242 | 4240 | ||
4243 | printk(KERN_ERR "\nB. Corruption after free\n"); | 4241 | pr_err("\nB. Corruption after free\n"); |
4244 | p = kzalloc(128, GFP_KERNEL); | 4242 | p = kzalloc(128, GFP_KERNEL); |
4245 | kfree(p); | 4243 | kfree(p); |
4246 | *p = 0x78; | 4244 | *p = 0x78; |
4247 | printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); | 4245 | pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); |
4248 | validate_slab_cache(kmalloc_caches[7]); | 4246 | validate_slab_cache(kmalloc_caches[7]); |
4249 | 4247 | ||
4250 | p = kzalloc(256, GFP_KERNEL); | 4248 | p = kzalloc(256, GFP_KERNEL); |
4251 | kfree(p); | 4249 | kfree(p); |
4252 | p[50] = 0x9a; | 4250 | p[50] = 0x9a; |
4253 | printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", | 4251 | pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); |
4254 | p); | ||
4255 | validate_slab_cache(kmalloc_caches[8]); | 4252 | validate_slab_cache(kmalloc_caches[8]); |
4256 | 4253 | ||
4257 | p = kzalloc(512, GFP_KERNEL); | 4254 | p = kzalloc(512, GFP_KERNEL); |
4258 | kfree(p); | 4255 | kfree(p); |
4259 | p[512] = 0xab; | 4256 | p[512] = 0xab; |
4260 | printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); | 4257 | pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); |
4261 | validate_slab_cache(kmalloc_caches[9]); | 4258 | validate_slab_cache(kmalloc_caches[9]); |
4262 | } | 4259 | } |
4263 | #else | 4260 | #else |
@@ -4332,7 +4329,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4332 | } | 4329 | } |
4333 | } | 4330 | } |
4334 | 4331 | ||
4335 | lock_memory_hotplug(); | 4332 | get_online_mems(); |
4336 | #ifdef CONFIG_SLUB_DEBUG | 4333 | #ifdef CONFIG_SLUB_DEBUG |
4337 | if (flags & SO_ALL) { | 4334 | if (flags & SO_ALL) { |
4338 | for_each_node_state(node, N_NORMAL_MEMORY) { | 4335 | for_each_node_state(node, N_NORMAL_MEMORY) { |
@@ -4372,7 +4369,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4372 | x += sprintf(buf + x, " N%d=%lu", | 4369 | x += sprintf(buf + x, " N%d=%lu", |
4373 | node, nodes[node]); | 4370 | node, nodes[node]); |
4374 | #endif | 4371 | #endif |
4375 | unlock_memory_hotplug(); | 4372 | put_online_mems(); |
4376 | kfree(nodes); | 4373 | kfree(nodes); |
4377 | return x + sprintf(buf + x, "\n"); | 4374 | return x + sprintf(buf + x, "\n"); |
4378 | } | 4375 | } |
@@ -5303,7 +5300,7 @@ static int __init slab_sysfs_init(void) | |||
5303 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); | 5300 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); |
5304 | if (!slab_kset) { | 5301 | if (!slab_kset) { |
5305 | mutex_unlock(&slab_mutex); | 5302 | mutex_unlock(&slab_mutex); |
5306 | printk(KERN_ERR "Cannot register slab subsystem.\n"); | 5303 | pr_err("Cannot register slab subsystem.\n"); |
5307 | return -ENOSYS; | 5304 | return -ENOSYS; |
5308 | } | 5305 | } |
5309 | 5306 | ||
@@ -5312,8 +5309,8 @@ static int __init slab_sysfs_init(void) | |||
5312 | list_for_each_entry(s, &slab_caches, list) { | 5309 | list_for_each_entry(s, &slab_caches, list) { |
5313 | err = sysfs_slab_add(s); | 5310 | err = sysfs_slab_add(s); |
5314 | if (err) | 5311 | if (err) |
5315 | printk(KERN_ERR "SLUB: Unable to add boot slab %s" | 5312 | pr_err("SLUB: Unable to add boot slab %s to sysfs\n", |
5316 | " to sysfs\n", s->name); | 5313 | s->name); |
5317 | } | 5314 | } |
5318 | 5315 | ||
5319 | while (alias_list) { | 5316 | while (alias_list) { |
@@ -5322,8 +5319,8 @@ static int __init slab_sysfs_init(void) | |||
5322 | alias_list = alias_list->next; | 5319 | alias_list = alias_list->next; |
5323 | err = sysfs_slab_alias(al->s, al->name); | 5320 | err = sysfs_slab_alias(al->s, al->name); |
5324 | if (err) | 5321 | if (err) |
5325 | printk(KERN_ERR "SLUB: Unable to add boot slab alias" | 5322 | pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", |
5326 | " %s to sysfs\n", al->name); | 5323 | al->name); |
5327 | kfree(al); | 5324 | kfree(al); |
5328 | } | 5325 | } |
5329 | 5326 | ||
@@ -67,7 +67,7 @@ static void __page_cache_release(struct page *page) | |||
67 | static void __put_single_page(struct page *page) | 67 | static void __put_single_page(struct page *page) |
68 | { | 68 | { |
69 | __page_cache_release(page); | 69 | __page_cache_release(page); |
70 | free_hot_cold_page(page, 0); | 70 | free_hot_cold_page(page, false); |
71 | } | 71 | } |
72 | 72 | ||
73 | static void __put_compound_page(struct page *page) | 73 | static void __put_compound_page(struct page *page) |
@@ -79,95 +79,88 @@ static void __put_compound_page(struct page *page) | |||
79 | (*dtor)(page); | 79 | (*dtor)(page); |
80 | } | 80 | } |
81 | 81 | ||
82 | static void put_compound_page(struct page *page) | 82 | /** |
83 | * Two special cases here: we could avoid taking compound_lock_irqsave | ||
84 | * and could skip the tail refcounting (in _mapcount). | ||
85 | * | ||
86 | * 1. Hugetlbfs page: | ||
87 | * | ||
88 | * PageHeadHuge will remain true until the compound page | ||
89 | * is released and enters the buddy allocator, and it cannot | ||
90 | * be split by __split_huge_page_refcount(). | ||
91 | * | ||
92 | * So if we see PageHeadHuge set, and we have the tail page pin, | ||
93 | * then we can safely put the head page. | ||
94 | * | ||
95 | * 2. Slab THP page: | ||
96 | * | ||
97 | * PG_slab is cleared before the slab frees the head page, and | ||
98 | * tail pin cannot be the last reference left on the head page, | ||
99 | * because the slab code is free to reuse the compound page | ||
100 | * after a kfree/kmem_cache_free without having to check if | ||
101 | * there's any tail pin left. In turn all tail pins must always be | ||
102 | * released while the head is still pinned by the slab code | ||
103 | * and so we know PG_slab will still be set too. | ||
104 | * | ||
105 | * So if we see PageSlab set, and we have the tail page pin, | ||
106 | * then we can safely put the head page. | ||
107 | */ | ||
108 | static __always_inline | ||
109 | void put_unrefcounted_compound_page(struct page *page_head, struct page *page) | ||
83 | { | 110 | { |
84 | struct page *page_head; | ||
85 | |||
86 | if (likely(!PageTail(page))) { | ||
87 | if (put_page_testzero(page)) { | ||
88 | /* | ||
89 | * By the time all refcounts have been released | ||
90 | * split_huge_page cannot run anymore from under us. | ||
91 | */ | ||
92 | if (PageHead(page)) | ||
93 | __put_compound_page(page); | ||
94 | else | ||
95 | __put_single_page(page); | ||
96 | } | ||
97 | return; | ||
98 | } | ||
99 | |||
100 | /* __split_huge_page_refcount can run under us */ | ||
101 | page_head = compound_head(page); | ||
102 | |||
103 | /* | 111 | /* |
104 | * THP can not break up slab pages so avoid taking | 112 | * If @page is a THP tail, we must read the tail page |
105 | * compound_lock() and skip the tail page refcounting (in | 113 | * flags after the head page flags. The |
106 | * _mapcount) too. Slab performs non-atomic bit ops on | 114 | * __split_huge_page_refcount side enforces write memory barriers |
107 | * page->flags for better performance. In particular | 115 | * between clearing PageTail and before the head page |
108 | * slab_unlock() in slub used to be a hot path. It is still | 116 | * can be freed and reallocated. |
109 | * hot on arches that do not support | ||
110 | * this_cpu_cmpxchg_double(). | ||
111 | * | ||
112 | * If "page" is part of a slab or hugetlbfs page it cannot be | ||
113 | * split and the head page cannot change from under us. And | ||
114 | * if "page" is part of a THP page under splitting, if the | ||
115 | * head page pointed by the THP tail isn't a THP head anymore, | ||
116 | * we'll find PageTail clear after smp_rmb() and we'll treat | ||
117 | * it as a single page. | ||
118 | */ | 117 | */ |
119 | if (!__compound_tail_refcounted(page_head)) { | 118 | smp_rmb(); |
119 | if (likely(PageTail(page))) { | ||
120 | /* | 120 | /* |
121 | * If "page" is a THP tail, we must read the tail page | 121 | * __split_huge_page_refcount cannot race |
122 | * flags after the head page flags. The | 122 | * here, see the comment above this function. |
123 | * split_huge_page side enforces write memory barriers | ||
124 | * between clearing PageTail and before the head page | ||
125 | * can be freed and reallocated. | ||
126 | */ | 123 | */ |
127 | smp_rmb(); | 124 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); |
128 | if (likely(PageTail(page))) { | 125 | VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); |
129 | /* | 126 | if (put_page_testzero(page_head)) { |
130 | * __split_huge_page_refcount cannot race | ||
131 | * here. | ||
132 | */ | ||
133 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); | ||
134 | VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); | ||
135 | if (put_page_testzero(page_head)) { | ||
136 | /* | ||
137 | * If this is the tail of a slab | ||
138 | * compound page, the tail pin must | ||
139 | * not be the last reference held on | ||
140 | * the page, because the PG_slab | ||
141 | * cannot be cleared before all tail | ||
142 | * pins (which skips the _mapcount | ||
143 | * tail refcounting) have been | ||
144 | * released. For hugetlbfs the tail | ||
145 | * pin may be the last reference on | ||
146 | * the page instead, because | ||
147 | * PageHeadHuge will not go away until | ||
148 | * the compound page enters the buddy | ||
149 | * allocator. | ||
150 | */ | ||
151 | VM_BUG_ON_PAGE(PageSlab(page_head), page_head); | ||
152 | __put_compound_page(page_head); | ||
153 | } | ||
154 | return; | ||
155 | } else | ||
156 | /* | 127 | /* |
157 | * __split_huge_page_refcount run before us, | 128 | * If this is the tail of a slab THP page, |
158 | * "page" was a THP tail. The split page_head | 129 | * the tail pin must not be the last reference |
159 | * has been freed and reallocated as slab or | 130 | * held on the page, because the PG_slab cannot |
160 | * hugetlbfs page of smaller order (only | 131 | * be cleared before all tail pins (which skips |
161 | * possible if reallocated as slab on x86). | 132 | * the _mapcount tail refcounting) have been |
133 | * released. | ||
134 | * | ||
135 | * If this is the tail of a hugetlbfs page, | ||
136 | * the tail pin may be the last reference on | ||
137 | * the page instead, because PageHeadHuge will | ||
138 | * not go away until the compound page enters | ||
139 | * the buddy allocator. | ||
162 | */ | 140 | */ |
163 | goto out_put_single; | 141 | VM_BUG_ON_PAGE(PageSlab(page_head), page_head); |
164 | } | 142 | __put_compound_page(page_head); |
143 | } | ||
144 | } else | ||
145 | /* | ||
146 | * __split_huge_page_refcount run before us, | ||
147 | * @page was a THP tail. The split @page_head | ||
148 | * has been freed and reallocated as slab or | ||
149 | * hugetlbfs page of smaller order (only | ||
150 | * possible if reallocated as slab on x86). | ||
151 | */ | ||
152 | if (put_page_testzero(page)) | ||
153 | __put_single_page(page); | ||
154 | } | ||
165 | 155 | ||
156 | static __always_inline | ||
157 | void put_refcounted_compound_page(struct page *page_head, struct page *page) | ||
158 | { | ||
166 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | 159 | if (likely(page != page_head && get_page_unless_zero(page_head))) { |
167 | unsigned long flags; | 160 | unsigned long flags; |
168 | 161 | ||
169 | /* | 162 | /* |
170 | * page_head wasn't a dangling pointer but it may not | 163 | * @page_head wasn't a dangling pointer but it may not |
171 | * be a head page anymore by the time we obtain the | 164 | * be a head page anymore by the time we obtain the |
172 | * lock. That is ok as long as it can't be freed from | 165 | * lock. That is ok as long as it can't be freed from |
173 | * under us. | 166 | * under us. |
@@ -178,7 +171,7 @@ static void put_compound_page(struct page *page) | |||
178 | compound_unlock_irqrestore(page_head, flags); | 171 | compound_unlock_irqrestore(page_head, flags); |
179 | if (put_page_testzero(page_head)) { | 172 | if (put_page_testzero(page_head)) { |
180 | /* | 173 | /* |
181 | * The head page may have been freed | 174 | * The @page_head may have been freed |
182 | * and reallocated as a compound page | 175 | * and reallocated as a compound page |
183 | * of smaller order and then freed | 176 | * of smaller order and then freed |
184 | * again. All we know is that it | 177 | * again. All we know is that it |
@@ -222,12 +215,51 @@ out_put_single: | |||
222 | __put_single_page(page_head); | 215 | __put_single_page(page_head); |
223 | } | 216 | } |
224 | } else { | 217 | } else { |
225 | /* page_head is a dangling pointer */ | 218 | /* @page_head is a dangling pointer */ |
226 | VM_BUG_ON_PAGE(PageTail(page), page); | 219 | VM_BUG_ON_PAGE(PageTail(page), page); |
227 | goto out_put_single; | 220 | goto out_put_single; |
228 | } | 221 | } |
229 | } | 222 | } |
230 | 223 | ||
224 | static void put_compound_page(struct page *page) | ||
225 | { | ||
226 | struct page *page_head; | ||
227 | |||
228 | /* | ||
229 | * We see the PageCompound set and PageTail not set, so @page maybe: | ||
230 | * 1. hugetlbfs head page, or | ||
231 | * 2. THP head page. | ||
232 | */ | ||
233 | if (likely(!PageTail(page))) { | ||
234 | if (put_page_testzero(page)) { | ||
235 | /* | ||
236 | * By the time all refcounts have been released | ||
237 | * split_huge_page cannot run anymore from under us. | ||
238 | */ | ||
239 | if (PageHead(page)) | ||
240 | __put_compound_page(page); | ||
241 | else | ||
242 | __put_single_page(page); | ||
243 | } | ||
244 | return; | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * We see the PageCompound set and PageTail set, so @page maybe: | ||
249 | * 1. a tail hugetlbfs page, or | ||
250 | * 2. a tail THP page, or | ||
251 | * 3. a split THP page. | ||
252 | * | ||
253 | * Case 3 is possible, as we may race with | ||
254 | * __split_huge_page_refcount tearing down a THP page. | ||
255 | */ | ||
256 | page_head = compound_head_by_tail(page); | ||
257 | if (!__compound_tail_refcounted(page_head)) | ||
258 | put_unrefcounted_compound_page(page_head, page); | ||
259 | else | ||
260 | put_refcounted_compound_page(page_head, page); | ||
261 | } | ||
262 | |||
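The dispatch above hinges on __compound_tail_refcounted(). As a rough mental model only (a simplified sketch, not the exact include/linux/mm.h definition), the head-page test the two helpers key off looks like this:

/* Simplified sketch: slab and hugetlbfs compound pages do not use the
 * _mapcount tail refcounting; everything else (i.e. anon THP) does. */
static inline bool tail_refcounting_used(struct page *page_head)
{
	return !PageSlab(page_head) && !PageHeadHuge(page_head);
}

Only the refcounted path then needs compound_lock_irqsave(), which keeps tail puts on hugetlbfs and slab compound pages cheap.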
231 | void put_page(struct page *page) | 263 | void put_page(struct page *page) |
232 | { | 264 | { |
233 | if (unlikely(PageCompound(page))) | 265 | if (unlikely(PageCompound(page))) |
@@ -441,7 +473,7 @@ void rotate_reclaimable_page(struct page *page) | |||
441 | 473 | ||
442 | page_cache_get(page); | 474 | page_cache_get(page); |
443 | local_irq_save(flags); | 475 | local_irq_save(flags); |
444 | pvec = &__get_cpu_var(lru_rotate_pvecs); | 476 | pvec = this_cpu_ptr(&lru_rotate_pvecs); |
445 | if (!pagevec_add(pvec, page)) | 477 | if (!pagevec_add(pvec, page)) |
446 | pagevec_move_tail(pvec); | 478 | pagevec_move_tail(pvec); |
447 | local_irq_restore(flags); | 479 | local_irq_restore(flags); |
@@ -583,12 +615,17 @@ void mark_page_accessed(struct page *page) | |||
583 | EXPORT_SYMBOL(mark_page_accessed); | 615 | EXPORT_SYMBOL(mark_page_accessed); |
584 | 616 | ||
585 | /* | 617 | /* |
586 | * Queue the page for addition to the LRU via pagevec. The decision on whether | 618 | * Used to mark_page_accessed(page) on a page that is not yet visible and |
587 | * to add the page to the [in]active [file|anon] list is deferred until the | 619 | * while it is still safe to use non-atomic ops |
588 | * pagevec is drained. This gives a chance for the caller of __lru_cache_add() | ||
589 | * have the page added to the active list using mark_page_accessed(). | ||
590 | */ | 620 | */ |
591 | void __lru_cache_add(struct page *page) | 621 | void init_page_accessed(struct page *page) |
622 | { | ||
623 | if (!PageReferenced(page)) | ||
624 | __SetPageReferenced(page); | ||
625 | } | ||
626 | EXPORT_SYMBOL(init_page_accessed); | ||
627 | |||
628 | static void __lru_cache_add(struct page *page) | ||
592 | { | 629 | { |
593 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); | 630 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); |
594 | 631 | ||
@@ -598,11 +635,34 @@ void __lru_cache_add(struct page *page) | |||
598 | pagevec_add(pvec, page); | 635 | pagevec_add(pvec, page); |
599 | put_cpu_var(lru_add_pvec); | 636 | put_cpu_var(lru_add_pvec); |
600 | } | 637 | } |
601 | EXPORT_SYMBOL(__lru_cache_add); | 638 | |
639 | /** | ||
640 | * lru_cache_add_anon - add a page to the page lists | ||
641 | * @page: the page to add | ||
642 | */ | ||
643 | void lru_cache_add_anon(struct page *page) | ||
644 | { | ||
645 | if (PageActive(page)) | ||
646 | ClearPageActive(page); | ||
647 | __lru_cache_add(page); | ||
648 | } | ||
649 | |||
650 | void lru_cache_add_file(struct page *page) | ||
651 | { | ||
652 | if (PageActive(page)) | ||
653 | ClearPageActive(page); | ||
654 | __lru_cache_add(page); | ||
655 | } | ||
656 | EXPORT_SYMBOL(lru_cache_add_file); | ||
602 | 657 | ||
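A hypothetical caller of the new helpers might look like the sketch below. The function name and call site are illustrative only (the real users are page-cache allocation paths), but it shows the intended pairing of the non-atomic referenced bit with the pagevec-based LRU add:

/* Illustrative only: a freshly allocated page, not yet visible to other
 * CPUs, is marked referenced without atomics and then queued for the LRU;
 * the pagevec is drained to the real LRU lists later. */
static void example_add_new_page(struct page *page)
{
	init_page_accessed(page);	/* non-atomic __SetPageReferenced() */
	lru_cache_add_file(page);	/* deferred LRU insertion via pagevec */
}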
603 | /** | 658 | /** |
604 | * lru_cache_add - add a page to a page list | 659 | * lru_cache_add - add a page to a page list |
605 | * @page: the page to be added to the LRU. | 660 | * @page: the page to be added to the LRU. |
661 | * | ||
662 | * Queue the page for addition to the LRU via pagevec. The decision on whether | ||
663 | * to add the page to the [in]active [file|anon] list is deferred until the | ||
664 | * pagevec is drained. This gives a chance for the caller of lru_cache_add() | ||
665 | * to have the page added to the active list using mark_page_accessed(). | ||
606 | */ | 666 | */ |
607 | void lru_cache_add(struct page *page) | 667 | void lru_cache_add(struct page *page) |
608 | { | 668 | { |
@@ -813,7 +873,7 @@ void lru_add_drain_all(void) | |||
813 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() | 873 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() |
814 | * will free it. | 874 | * will free it. |
815 | */ | 875 | */ |
816 | void release_pages(struct page **pages, int nr, int cold) | 876 | void release_pages(struct page **pages, int nr, bool cold) |
817 | { | 877 | { |
818 | int i; | 878 | int i; |
819 | LIST_HEAD(pages_to_free); | 879 | LIST_HEAD(pages_to_free); |
@@ -854,7 +914,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
854 | } | 914 | } |
855 | 915 | ||
856 | /* Clear Active bit in case of parallel mark_page_accessed */ | 916 | /* Clear Active bit in case of parallel mark_page_accessed */ |
857 | ClearPageActive(page); | 917 | __ClearPageActive(page); |
858 | 918 | ||
859 | list_add(&page->lru, &pages_to_free); | 919 | list_add(&page->lru, &pages_to_free); |
860 | } | 920 | } |
diff --git a/mm/swap_state.c b/mm/swap_state.c index e76ace30d436..2972eee184a4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -270,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) | |||
270 | 270 | ||
271 | for (i = 0; i < todo; i++) | 271 | for (i = 0; i < todo; i++) |
272 | free_swap_cache(pagep[i]); | 272 | free_swap_cache(pagep[i]); |
273 | release_pages(pagep, todo, 0); | 273 | release_pages(pagep, todo, false); |
274 | pagep += todo; | 274 | pagep += todo; |
275 | nr -= todo; | 275 | nr -= todo; |
276 | } | 276 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 4a7f7e6992b6..4c524f7bd0bf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages; | |||
51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ | 51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ |
52 | long total_swap_pages; | 52 | long total_swap_pages; |
53 | static int least_priority; | 53 | static int least_priority; |
54 | static atomic_t highest_priority_index = ATOMIC_INIT(-1); | ||
55 | 54 | ||
56 | static const char Bad_file[] = "Bad swap file entry "; | 55 | static const char Bad_file[] = "Bad swap file entry "; |
57 | static const char Unused_file[] = "Unused swap file entry "; | 56 | static const char Unused_file[] = "Unused swap file entry "; |
58 | static const char Bad_offset[] = "Bad swap offset entry "; | 57 | static const char Bad_offset[] = "Bad swap offset entry "; |
59 | static const char Unused_offset[] = "Unused swap offset entry "; | 58 | static const char Unused_offset[] = "Unused swap offset entry "; |
60 | 59 | ||
61 | struct swap_list_t swap_list = {-1, -1}; | 60 | /* |
61 | * all active swap_info_structs | ||
62 | * protected with swap_lock, and ordered by priority. | ||
63 | */ | ||
64 | PLIST_HEAD(swap_active_head); | ||
65 | |||
66 | /* | ||
67 | * all available (active, not full) swap_info_structs | ||
68 | * protected with swap_avail_lock, ordered by priority. | ||
69 | * This is used by get_swap_page() instead of swap_active_head | ||
70 | * because swap_active_head includes all swap_info_structs, | ||
71 | * but get_swap_page() doesn't need to look at full ones. | ||
72 | * This uses its own lock instead of swap_lock because when a | ||
73 | * swap_info_struct changes between not-full/full, it needs to | ||
74 | * add/remove itself to/from this list, but the swap_info_struct->lock | ||
75 | * is held and the locking order requires swap_lock to be taken | ||
76 | * before any swap_info_struct->lock. | ||
77 | */ | ||
78 | static PLIST_HEAD(swap_avail_head); | ||
79 | static DEFINE_SPINLOCK(swap_avail_lock); | ||
62 | 80 | ||
63 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 81 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
64 | 82 | ||
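To make the intended life cycle of the two lists explicit, the state transitions (as assumed from the code in this patch) can be summarised in a short sketch:

/*
 * Assumed life cycle of a swap_info_struct on the two plists:
 *
 *	swapon			plist_add() to swap_active_head and to
 *				swap_avail_head (under swap_lock and
 *				swap_avail_lock respectively)
 *	device becomes full	plist_del() from swap_avail_head only
 *	entry freed on a	plist_add() back onto swap_avail_head
 *	previously full device
 *	swapoff			plist_del() from both lists
 */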
@@ -505,13 +523,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
505 | /* | 523 | /* |
506 | * If seek is expensive, start searching for new cluster from | 524 | * If seek is expensive, start searching for new cluster from |
507 | * start of partition, to minimize the span of allocated swap. | 525 | * start of partition, to minimize the span of allocated swap. |
508 | * But if seek is cheap, search from our current position, so | 526 | * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info |
509 | * that swap is allocated from all over the partition: if the | 527 | * case, just handled by scan_swap_map_try_ssd_cluster() above. |
510 | * Flash Translation Layer only remaps within limited zones, | ||
511 | * we don't want to wear out the first zone too quickly. | ||
512 | */ | 528 | */ |
513 | if (!(si->flags & SWP_SOLIDSTATE)) | 529 | scan_base = offset = si->lowest_bit; |
514 | scan_base = offset = si->lowest_bit; | ||
515 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | 530 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; |
516 | 531 | ||
517 | /* Locate the first empty (unaligned) cluster */ | 532 | /* Locate the first empty (unaligned) cluster */ |
@@ -531,26 +546,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
531 | } | 546 | } |
532 | } | 547 | } |
533 | 548 | ||
534 | offset = si->lowest_bit; | ||
535 | last_in_cluster = offset + SWAPFILE_CLUSTER - 1; | ||
536 | |||
537 | /* Locate the first empty (unaligned) cluster */ | ||
538 | for (; last_in_cluster < scan_base; offset++) { | ||
539 | if (si->swap_map[offset]) | ||
540 | last_in_cluster = offset + SWAPFILE_CLUSTER; | ||
541 | else if (offset == last_in_cluster) { | ||
542 | spin_lock(&si->lock); | ||
543 | offset -= SWAPFILE_CLUSTER - 1; | ||
544 | si->cluster_next = offset; | ||
545 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | ||
546 | goto checks; | ||
547 | } | ||
548 | if (unlikely(--latency_ration < 0)) { | ||
549 | cond_resched(); | ||
550 | latency_ration = LATENCY_LIMIT; | ||
551 | } | ||
552 | } | ||
553 | |||
554 | offset = scan_base; | 549 | offset = scan_base; |
555 | spin_lock(&si->lock); | 550 | spin_lock(&si->lock); |
556 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 551 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
@@ -591,6 +586,9 @@ checks: | |||
591 | if (si->inuse_pages == si->pages) { | 586 | if (si->inuse_pages == si->pages) { |
592 | si->lowest_bit = si->max; | 587 | si->lowest_bit = si->max; |
593 | si->highest_bit = 0; | 588 | si->highest_bit = 0; |
589 | spin_lock(&swap_avail_lock); | ||
590 | plist_del(&si->avail_list, &swap_avail_head); | ||
591 | spin_unlock(&swap_avail_lock); | ||
594 | } | 592 | } |
595 | si->swap_map[offset] = usage; | 593 | si->swap_map[offset] = usage; |
596 | inc_cluster_info_page(si, si->cluster_info, offset); | 594 | inc_cluster_info_page(si, si->cluster_info, offset); |
@@ -640,71 +638,65 @@ no_page: | |||
640 | 638 | ||
641 | swp_entry_t get_swap_page(void) | 639 | swp_entry_t get_swap_page(void) |
642 | { | 640 | { |
643 | struct swap_info_struct *si; | 641 | struct swap_info_struct *si, *next; |
644 | pgoff_t offset; | 642 | pgoff_t offset; |
645 | int type, next; | ||
646 | int wrapped = 0; | ||
647 | int hp_index; | ||
648 | 643 | ||
649 | spin_lock(&swap_lock); | ||
650 | if (atomic_long_read(&nr_swap_pages) <= 0) | 644 | if (atomic_long_read(&nr_swap_pages) <= 0) |
651 | goto noswap; | 645 | goto noswap; |
652 | atomic_long_dec(&nr_swap_pages); | 646 | atomic_long_dec(&nr_swap_pages); |
653 | 647 | ||
654 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 648 | spin_lock(&swap_avail_lock); |
655 | hp_index = atomic_xchg(&highest_priority_index, -1); | ||
656 | /* | ||
657 | * highest_priority_index records current highest priority swap | ||
658 | * type which just frees swap entries. If its priority is | ||
659 | * higher than that of swap_list.next swap type, we use it. It | ||
660 | * isn't protected by swap_lock, so it can be an invalid value | ||
661 | * if the corresponding swap type is swapoff. We double check | ||
662 | * the flags here. It's even possible the swap type is swapoff | ||
663 | * and swapon again and its priority is changed. In such rare | ||
664 | * case, a low priority swap type might be used, but eventually | ||
665 | * high priority swap will be used after several rounds of | ||
666 | * swap. | ||
667 | */ | ||
668 | if (hp_index != -1 && hp_index != type && | ||
669 | swap_info[type]->prio < swap_info[hp_index]->prio && | ||
670 | (swap_info[hp_index]->flags & SWP_WRITEOK)) { | ||
671 | type = hp_index; | ||
672 | swap_list.next = type; | ||
673 | } | ||
674 | |||
675 | si = swap_info[type]; | ||
676 | next = si->next; | ||
677 | if (next < 0 || | ||
678 | (!wrapped && si->prio != swap_info[next]->prio)) { | ||
679 | next = swap_list.head; | ||
680 | wrapped++; | ||
681 | } | ||
682 | 649 | ||
650 | start_over: | ||
651 | plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { | ||
652 | /* requeue si to after same-priority siblings */ | ||
653 | plist_requeue(&si->avail_list, &swap_avail_head); | ||
654 | spin_unlock(&swap_avail_lock); | ||
683 | spin_lock(&si->lock); | 655 | spin_lock(&si->lock); |
684 | if (!si->highest_bit) { | 656 | if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { |
685 | spin_unlock(&si->lock); | 657 | spin_lock(&swap_avail_lock); |
686 | continue; | 658 | if (plist_node_empty(&si->avail_list)) { |
687 | } | 659 | spin_unlock(&si->lock); |
688 | if (!(si->flags & SWP_WRITEOK)) { | 660 | goto nextsi; |
661 | } | ||
662 | WARN(!si->highest_bit, | ||
663 | "swap_info %d in list but !highest_bit\n", | ||
664 | si->type); | ||
665 | WARN(!(si->flags & SWP_WRITEOK), | ||
666 | "swap_info %d in list but !SWP_WRITEOK\n", | ||
667 | si->type); | ||
668 | plist_del(&si->avail_list, &swap_avail_head); | ||
689 | spin_unlock(&si->lock); | 669 | spin_unlock(&si->lock); |
690 | continue; | 670 | goto nextsi; |
691 | } | 671 | } |
692 | 672 | ||
693 | swap_list.next = next; | ||
694 | |||
695 | spin_unlock(&swap_lock); | ||
696 | /* This is called for allocating swap entry for cache */ | 673 | /* This is called for allocating swap entry for cache */ |
697 | offset = scan_swap_map(si, SWAP_HAS_CACHE); | 674 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
698 | spin_unlock(&si->lock); | 675 | spin_unlock(&si->lock); |
699 | if (offset) | 676 | if (offset) |
700 | return swp_entry(type, offset); | 677 | return swp_entry(si->type, offset); |
701 | spin_lock(&swap_lock); | 678 | pr_debug("scan_swap_map of si %d failed to find offset\n", |
702 | next = swap_list.next; | 679 | si->type); |
680 | spin_lock(&swap_avail_lock); | ||
681 | nextsi: | ||
682 | /* | ||
683 | * if we got here, it's likely that si was almost full before, | ||
684 | * and since scan_swap_map() can drop the si->lock, multiple | ||
685 | * callers probably all tried to get a page from the same si | ||
686 | * and it filled up before we could get one; or, the si filled | ||
687 | * up between us dropping swap_avail_lock and taking si->lock. | ||
688 | * Since we dropped the swap_avail_lock, the swap_avail_head | ||
689 | * list may have been modified; so if next is still in the | ||
690 | * swap_avail_head list then try it, otherwise start over. | ||
691 | */ | ||
692 | if (plist_node_empty(&next->avail_list)) | ||
693 | goto start_over; | ||
703 | } | 694 | } |
704 | 695 | ||
696 | spin_unlock(&swap_avail_lock); | ||
697 | |||
705 | atomic_long_inc(&nr_swap_pages); | 698 | atomic_long_inc(&nr_swap_pages); |
706 | noswap: | 699 | noswap: |
707 | spin_unlock(&swap_lock); | ||
708 | return (swp_entry_t) {0}; | 700 | return (swp_entry_t) {0}; |
709 | } | 701 | } |
710 | 702 | ||
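The plist_requeue() call gives round-robin behaviour among same-priority devices while still preferring higher priorities. A toy userspace model (not kernel code; the selection rule is an assumption distilled from the loop above) shows the resulting interleaving:

#include <stdio.h>

struct dev { const char *name; int prio; int used; };

int main(void)
{
	/* two same-priority devices plus a lower-priority fallback */
	struct dev devs[] = { {"sda2", 10, 0}, {"sdb2", 10, 0}, {"swapfile", 5, 0} };
	int i, pick, n = 3, round;

	for (round = 0; round < 6; round++) {
		pick = 0;
		for (i = 1; i < n; i++)
			if (devs[i].prio > devs[pick].prio ||
			    (devs[i].prio == devs[pick].prio &&
			     devs[i].used < devs[pick].used))
				pick = i;
		devs[pick].used++;	/* models plist_requeue() to the tail */
		printf("allocation %d -> %s\n", round, devs[pick].name);
	}
	return 0;	/* prints sda2, sdb2, sda2, sdb2, ...; swapfile stays unused */
}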
@@ -766,27 +758,6 @@ out: | |||
766 | return NULL; | 758 | return NULL; |
767 | } | 759 | } |
768 | 760 | ||
769 | /* | ||
770 | * This swap type frees swap entry, check if it is the highest priority swap | ||
771 | * type which just frees swap entry. get_swap_page() uses | ||
772 | * highest_priority_index to search highest priority swap type. The | ||
773 | * swap_info_struct.lock can't protect us if there are multiple swap types | ||
774 | * active, so we use atomic_cmpxchg. | ||
775 | */ | ||
776 | static void set_highest_priority_index(int type) | ||
777 | { | ||
778 | int old_hp_index, new_hp_index; | ||
779 | |||
780 | do { | ||
781 | old_hp_index = atomic_read(&highest_priority_index); | ||
782 | if (old_hp_index != -1 && | ||
783 | swap_info[old_hp_index]->prio >= swap_info[type]->prio) | ||
784 | break; | ||
785 | new_hp_index = type; | ||
786 | } while (atomic_cmpxchg(&highest_priority_index, | ||
787 | old_hp_index, new_hp_index) != old_hp_index); | ||
788 | } | ||
789 | |||
790 | static unsigned char swap_entry_free(struct swap_info_struct *p, | 761 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
791 | swp_entry_t entry, unsigned char usage) | 762 | swp_entry_t entry, unsigned char usage) |
792 | { | 763 | { |
@@ -828,9 +799,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
828 | dec_cluster_info_page(p, p->cluster_info, offset); | 799 | dec_cluster_info_page(p, p->cluster_info, offset); |
829 | if (offset < p->lowest_bit) | 800 | if (offset < p->lowest_bit) |
830 | p->lowest_bit = offset; | 801 | p->lowest_bit = offset; |
831 | if (offset > p->highest_bit) | 802 | if (offset > p->highest_bit) { |
803 | bool was_full = !p->highest_bit; | ||
832 | p->highest_bit = offset; | 804 | p->highest_bit = offset; |
833 | set_highest_priority_index(p->type); | 805 | if (was_full && (p->flags & SWP_WRITEOK)) { |
806 | spin_lock(&swap_avail_lock); | ||
807 | WARN_ON(!plist_node_empty(&p->avail_list)); | ||
808 | if (plist_node_empty(&p->avail_list)) | ||
809 | plist_add(&p->avail_list, | ||
810 | &swap_avail_head); | ||
811 | spin_unlock(&swap_avail_lock); | ||
812 | } | ||
813 | } | ||
834 | atomic_long_inc(&nr_swap_pages); | 814 | atomic_long_inc(&nr_swap_pages); |
835 | p->inuse_pages--; | 815 | p->inuse_pages--; |
836 | frontswap_invalidate_page(p->type, offset); | 816 | frontswap_invalidate_page(p->type, offset); |
@@ -1765,30 +1745,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1765 | unsigned char *swap_map, | 1745 | unsigned char *swap_map, |
1766 | struct swap_cluster_info *cluster_info) | 1746 | struct swap_cluster_info *cluster_info) |
1767 | { | 1747 | { |
1768 | int i, prev; | ||
1769 | |||
1770 | if (prio >= 0) | 1748 | if (prio >= 0) |
1771 | p->prio = prio; | 1749 | p->prio = prio; |
1772 | else | 1750 | else |
1773 | p->prio = --least_priority; | 1751 | p->prio = --least_priority; |
1752 | /* | ||
1753 | * the plist prio is negated because plist ordering is | ||
1754 | * low-to-high, while swap ordering is high-to-low | ||
1755 | */ | ||
1756 | p->list.prio = -p->prio; | ||
1757 | p->avail_list.prio = -p->prio; | ||
1774 | p->swap_map = swap_map; | 1758 | p->swap_map = swap_map; |
1775 | p->cluster_info = cluster_info; | 1759 | p->cluster_info = cluster_info; |
1776 | p->flags |= SWP_WRITEOK; | 1760 | p->flags |= SWP_WRITEOK; |
1777 | atomic_long_add(p->pages, &nr_swap_pages); | 1761 | atomic_long_add(p->pages, &nr_swap_pages); |
1778 | total_swap_pages += p->pages; | 1762 | total_swap_pages += p->pages; |
1779 | 1763 | ||
1780 | /* insert swap space into swap_list: */ | 1764 | assert_spin_locked(&swap_lock); |
1781 | prev = -1; | 1765 | /* |
1782 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { | 1766 | * both lists are plists, and thus priority ordered. |
1783 | if (p->prio >= swap_info[i]->prio) | 1767 | * swap_active_head needs to be priority ordered for swapoff(), |
1784 | break; | 1768 | * which on removal of any swap_info_struct with an auto-assigned |
1785 | prev = i; | 1769 | * (i.e. negative) priority increments the auto-assigned priority |
1786 | } | 1770 | * of any lower-priority swap_info_structs. |
1787 | p->next = i; | 1771 | * swap_avail_head needs to be priority ordered for get_swap_page(), |
1788 | if (prev < 0) | 1772 | * which allocates swap pages from the highest available priority |
1789 | swap_list.head = swap_list.next = p->type; | 1773 | * swap_info_struct. |
1790 | else | 1774 | */ |
1791 | swap_info[prev]->next = p->type; | 1775 | plist_add(&p->list, &swap_active_head); |
1776 | spin_lock(&swap_avail_lock); | ||
1777 | plist_add(&p->avail_list, &swap_avail_head); | ||
1778 | spin_unlock(&swap_avail_lock); | ||
1792 | } | 1779 | } |
1793 | 1780 | ||
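The sign flip on the plist priorities is worth a numeric illustration. A small standalone C example (a toy model, not kernel code) shows that negating the swap priority turns plist's ascending order into highest-swap-priority-first order, with auto-assigned negative priorities sorting last:

#include <stdio.h>
#include <stdlib.h>

static int cmp(const void *a, const void *b)
{
	return *(const int *)a - *(const int *)b;
}

int main(void)
{
	int swap_prio[] = { 5, -1, 10, -2 };	/* -1, -2: auto-assigned by swapon */
	int node_prio[4], i;

	for (i = 0; i < 4; i++)
		node_prio[i] = -swap_prio[i];	/* p->list.prio = -p->prio */
	qsort(node_prio, 4, sizeof(int), cmp);	/* a plist keeps ascending order */
	for (i = 0; i < 4; i++)
		printf("plist prio %3d -> swap priority %3d\n",
		       node_prio[i], -node_prio[i]);
	return 0;	/* walks 10, 5, -1, -2: highest swap priority first */
}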
1794 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1781 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
@@ -1823,8 +1810,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1823 | struct address_space *mapping; | 1810 | struct address_space *mapping; |
1824 | struct inode *inode; | 1811 | struct inode *inode; |
1825 | struct filename *pathname; | 1812 | struct filename *pathname; |
1826 | int i, type, prev; | 1813 | int err, found = 0; |
1827 | int err; | ||
1828 | unsigned int old_block_size; | 1814 | unsigned int old_block_size; |
1829 | 1815 | ||
1830 | if (!capable(CAP_SYS_ADMIN)) | 1816 | if (!capable(CAP_SYS_ADMIN)) |
@@ -1842,17 +1828,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1842 | goto out; | 1828 | goto out; |
1843 | 1829 | ||
1844 | mapping = victim->f_mapping; | 1830 | mapping = victim->f_mapping; |
1845 | prev = -1; | ||
1846 | spin_lock(&swap_lock); | 1831 | spin_lock(&swap_lock); |
1847 | for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { | 1832 | plist_for_each_entry(p, &swap_active_head, list) { |
1848 | p = swap_info[type]; | ||
1849 | if (p->flags & SWP_WRITEOK) { | 1833 | if (p->flags & SWP_WRITEOK) { |
1850 | if (p->swap_file->f_mapping == mapping) | 1834 | if (p->swap_file->f_mapping == mapping) { |
1835 | found = 1; | ||
1851 | break; | 1836 | break; |
1837 | } | ||
1852 | } | 1838 | } |
1853 | prev = type; | ||
1854 | } | 1839 | } |
1855 | if (type < 0) { | 1840 | if (!found) { |
1856 | err = -EINVAL; | 1841 | err = -EINVAL; |
1857 | spin_unlock(&swap_lock); | 1842 | spin_unlock(&swap_lock); |
1858 | goto out_dput; | 1843 | goto out_dput; |
@@ -1864,20 +1849,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1864 | spin_unlock(&swap_lock); | 1849 | spin_unlock(&swap_lock); |
1865 | goto out_dput; | 1850 | goto out_dput; |
1866 | } | 1851 | } |
1867 | if (prev < 0) | 1852 | spin_lock(&swap_avail_lock); |
1868 | swap_list.head = p->next; | 1853 | plist_del(&p->avail_list, &swap_avail_head); |
1869 | else | 1854 | spin_unlock(&swap_avail_lock); |
1870 | swap_info[prev]->next = p->next; | ||
1871 | if (type == swap_list.next) { | ||
1872 | /* just pick something that's safe... */ | ||
1873 | swap_list.next = swap_list.head; | ||
1874 | } | ||
1875 | spin_lock(&p->lock); | 1855 | spin_lock(&p->lock); |
1876 | if (p->prio < 0) { | 1856 | if (p->prio < 0) { |
1877 | for (i = p->next; i >= 0; i = swap_info[i]->next) | 1857 | struct swap_info_struct *si = p; |
1878 | swap_info[i]->prio = p->prio--; | 1858 | |
1859 | plist_for_each_entry_continue(si, &swap_active_head, list) { | ||
1860 | si->prio++; | ||
1861 | si->list.prio--; | ||
1862 | si->avail_list.prio--; | ||
1863 | } | ||
1879 | least_priority++; | 1864 | least_priority++; |
1880 | } | 1865 | } |
1866 | plist_del(&p->list, &swap_active_head); | ||
1881 | atomic_long_sub(p->pages, &nr_swap_pages); | 1867 | atomic_long_sub(p->pages, &nr_swap_pages); |
1882 | total_swap_pages -= p->pages; | 1868 | total_swap_pages -= p->pages; |
1883 | p->flags &= ~SWP_WRITEOK; | 1869 | p->flags &= ~SWP_WRITEOK; |
@@ -1885,7 +1871,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1885 | spin_unlock(&swap_lock); | 1871 | spin_unlock(&swap_lock); |
1886 | 1872 | ||
1887 | set_current_oom_origin(); | 1873 | set_current_oom_origin(); |
1888 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ | 1874 | err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ |
1889 | clear_current_oom_origin(); | 1875 | clear_current_oom_origin(); |
1890 | 1876 | ||
1891 | if (err) { | 1877 | if (err) { |
@@ -1926,7 +1912,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1926 | frontswap_map = frontswap_map_get(p); | 1912 | frontswap_map = frontswap_map_get(p); |
1927 | spin_unlock(&p->lock); | 1913 | spin_unlock(&p->lock); |
1928 | spin_unlock(&swap_lock); | 1914 | spin_unlock(&swap_lock); |
1929 | frontswap_invalidate_area(type); | 1915 | frontswap_invalidate_area(p->type); |
1930 | frontswap_map_set(p, NULL); | 1916 | frontswap_map_set(p, NULL); |
1931 | mutex_unlock(&swapon_mutex); | 1917 | mutex_unlock(&swapon_mutex); |
1932 | free_percpu(p->percpu_cluster); | 1918 | free_percpu(p->percpu_cluster); |
@@ -1935,7 +1921,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1935 | vfree(cluster_info); | 1921 | vfree(cluster_info); |
1936 | vfree(frontswap_map); | 1922 | vfree(frontswap_map); |
1937 | /* Destroy swap account information */ | 1923 | /* Destroy swap account information */ |
1938 | swap_cgroup_swapoff(type); | 1924 | swap_cgroup_swapoff(p->type); |
1939 | 1925 | ||
1940 | inode = mapping->host; | 1926 | inode = mapping->host; |
1941 | if (S_ISBLK(inode->i_mode)) { | 1927 | if (S_ISBLK(inode->i_mode)) { |
@@ -2142,8 +2128,9 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2142 | */ | 2128 | */ |
2143 | } | 2129 | } |
2144 | INIT_LIST_HEAD(&p->first_swap_extent.list); | 2130 | INIT_LIST_HEAD(&p->first_swap_extent.list); |
2131 | plist_node_init(&p->list, 0); | ||
2132 | plist_node_init(&p->avail_list, 0); | ||
2145 | p->flags = SWP_USED; | 2133 | p->flags = SWP_USED; |
2146 | p->next = -1; | ||
2147 | spin_unlock(&swap_lock); | 2134 | spin_unlock(&swap_lock); |
2148 | spin_lock_init(&p->lock); | 2135 | spin_lock_init(&p->lock); |
2149 | 2136 | ||
diff --git a/mm/vmacache.c b/mm/vmacache.c index 1037a3bab505..9f25af825dec 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c | |||
@@ -17,6 +17,16 @@ void vmacache_flush_all(struct mm_struct *mm) | |||
17 | { | 17 | { |
18 | struct task_struct *g, *p; | 18 | struct task_struct *g, *p; |
19 | 19 | ||
20 | /* | ||
21 | * Single threaded tasks need not iterate the entire | ||
22 | * list of processes. We can avoid the flushing as well | ||
23 | * since the mm's seqnum was increased and we don't have | ||
24 | * to worry about other threads' seqnums. Current's | ||
25 | * flush will occur upon the next lookup. | ||
26 | */ | ||
27 | if (atomic_read(&mm->mm_users) == 1) | ||
28 | return; | ||
29 | |||
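The shortcut relies on the lazy, seqnum-based invalidation that every lookup already performs. Conceptually (a simplified sketch assumed from the surrounding code, not the exact helper), the per-task cache is dropped on the next lookup anyway:

/* Simplified sketch: a lookup first compares the task's cached seqnum with
 * the mm's; on mismatch the per-task cache is wiped and the caller falls
 * back to the rb-tree walk. */
static bool vmacache_still_valid(struct mm_struct *mm)
{
	if (mm != current->mm)
		return false;
	if (current->vmacache_seqnum != mm->vmacache_seqnum) {
		current->vmacache_seqnum = mm->vmacache_seqnum;
		memset(current->vmacache, 0, sizeof(current->vmacache));
		return false;
	}
	return true;
}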
20 | rcu_read_lock(); | 30 | rcu_read_lock(); |
21 | for_each_process_thread(g, p) { | 31 | for_each_process_thread(g, p) { |
22 | /* | 32 | /* |
@@ -78,6 +88,8 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) | |||
78 | if (!vmacache_valid(mm)) | 88 | if (!vmacache_valid(mm)) |
79 | return NULL; | 89 | return NULL; |
80 | 90 | ||
91 | count_vm_vmacache_event(VMACACHE_FIND_CALLS); | ||
92 | |||
81 | for (i = 0; i < VMACACHE_SIZE; i++) { | 93 | for (i = 0; i < VMACACHE_SIZE; i++) { |
82 | struct vm_area_struct *vma = current->vmacache[i]; | 94 | struct vm_area_struct *vma = current->vmacache[i]; |
83 | 95 | ||
@@ -85,8 +97,10 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) | |||
85 | continue; | 97 | continue; |
86 | if (WARN_ON_ONCE(vma->vm_mm != mm)) | 98 | if (WARN_ON_ONCE(vma->vm_mm != mm)) |
87 | break; | 99 | break; |
88 | if (vma->vm_start <= addr && vma->vm_end > addr) | 100 | if (vma->vm_start <= addr && vma->vm_end > addr) { |
101 | count_vm_vmacache_event(VMACACHE_FIND_HITS); | ||
89 | return vma; | 102 | return vma; |
103 | } | ||
90 | } | 104 | } |
91 | 105 | ||
92 | return NULL; | 106 | return NULL; |
@@ -102,11 +116,15 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, | |||
102 | if (!vmacache_valid(mm)) | 116 | if (!vmacache_valid(mm)) |
103 | return NULL; | 117 | return NULL; |
104 | 118 | ||
119 | count_vm_vmacache_event(VMACACHE_FIND_CALLS); | ||
120 | |||
105 | for (i = 0; i < VMACACHE_SIZE; i++) { | 121 | for (i = 0; i < VMACACHE_SIZE; i++) { |
106 | struct vm_area_struct *vma = current->vmacache[i]; | 122 | struct vm_area_struct *vma = current->vmacache[i]; |
107 | 123 | ||
108 | if (vma && vma->vm_start == start && vma->vm_end == end) | 124 | if (vma && vma->vm_start == start && vma->vm_end == end) { |
125 | count_vm_vmacache_event(VMACACHE_FIND_HITS); | ||
109 | return vma; | 126 | return vma; |
127 | } | ||
110 | } | 128 | } |
111 | 129 | ||
112 | return NULL; | 130 | return NULL; |
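With the new VMACACHE_FIND_CALLS/VMACACHE_FIND_HITS events, the cache hit rate can be derived from /proc/vmstat. A small userspace reader (assuming the counters are exported under the names vmacache_find_calls and vmacache_find_hits) could look like:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char name[64];
	unsigned long long val, calls = 0, hits = 0;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		if (!strcmp(name, "vmacache_find_calls"))
			calls = val;
		else if (!strcmp(name, "vmacache_find_hits"))
			hits = val;
	}
	fclose(f);
	if (calls)
		printf("vmacache hit rate: %.1f%%\n", 100.0 * hits / calls);
	return 0;
}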
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bf233b283319..f64632b67196 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1268,6 +1268,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) | |||
1268 | vunmap_page_range(addr, end); | 1268 | vunmap_page_range(addr, end); |
1269 | flush_tlb_kernel_range(addr, end); | 1269 | flush_tlb_kernel_range(addr, end); |
1270 | } | 1270 | } |
1271 | EXPORT_SYMBOL_GPL(unmap_kernel_range); | ||
1271 | 1272 | ||
1272 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | 1273 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) |
1273 | { | 1274 | { |
@@ -1496,7 +1497,7 @@ void vfree(const void *addr) | |||
1496 | if (!addr) | 1497 | if (!addr) |
1497 | return; | 1498 | return; |
1498 | if (unlikely(in_interrupt())) { | 1499 | if (unlikely(in_interrupt())) { |
1499 | struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); | 1500 | struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); |
1500 | if (llist_add((struct llist_node *)addr, &p->list)) | 1501 | if (llist_add((struct llist_node *)addr, &p->list)) |
1501 | schedule_work(&p->wq); | 1502 | schedule_work(&p->wq); |
1502 | } else | 1503 | } else |
@@ -2619,19 +2620,19 @@ static int s_show(struct seq_file *m, void *p) | |||
2619 | seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); | 2620 | seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); |
2620 | 2621 | ||
2621 | if (v->flags & VM_IOREMAP) | 2622 | if (v->flags & VM_IOREMAP) |
2622 | seq_printf(m, " ioremap"); | 2623 | seq_puts(m, " ioremap"); |
2623 | 2624 | ||
2624 | if (v->flags & VM_ALLOC) | 2625 | if (v->flags & VM_ALLOC) |
2625 | seq_printf(m, " vmalloc"); | 2626 | seq_puts(m, " vmalloc"); |
2626 | 2627 | ||
2627 | if (v->flags & VM_MAP) | 2628 | if (v->flags & VM_MAP) |
2628 | seq_printf(m, " vmap"); | 2629 | seq_puts(m, " vmap"); |
2629 | 2630 | ||
2630 | if (v->flags & VM_USERMAP) | 2631 | if (v->flags & VM_USERMAP) |
2631 | seq_printf(m, " user"); | 2632 | seq_puts(m, " user"); |
2632 | 2633 | ||
2633 | if (v->flags & VM_VPAGES) | 2634 | if (v->flags & VM_VPAGES) |
2634 | seq_printf(m, " vpages"); | 2635 | seq_puts(m, " vpages"); |
2635 | 2636 | ||
2636 | show_numa_info(m, v); | 2637 | show_numa_info(m, v); |
2637 | seq_putc(m, '\n'); | 2638 | seq_putc(m, '\n'); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 32c661d66a45..9149444f947d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -324,7 +324,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
324 | else | 324 | else |
325 | new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); | 325 | new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); |
326 | 326 | ||
327 | trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); | 327 | trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); |
328 | return freed; | 328 | return freed; |
329 | } | 329 | } |
330 | 330 | ||
@@ -1121,7 +1121,7 @@ keep: | |||
1121 | VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); | 1121 | VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); |
1122 | } | 1122 | } |
1123 | 1123 | ||
1124 | free_hot_cold_page_list(&free_pages, 1); | 1124 | free_hot_cold_page_list(&free_pages, true); |
1125 | 1125 | ||
1126 | list_splice(&ret_pages, page_list); | 1126 | list_splice(&ret_pages, page_list); |
1127 | count_vm_events(PGACTIVATE, pgactivate); | 1127 | count_vm_events(PGACTIVATE, pgactivate); |
@@ -1439,6 +1439,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) | |||
1439 | } | 1439 | } |
1440 | 1440 | ||
1441 | /* | 1441 | /* |
1442 | * If a kernel thread (such as nfsd for loop-back mounts) services | ||
1443 | * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. | ||
1444 | * In that case we should only throttle if the backing device it is | ||
1445 | * writing to is congested. In other cases it is safe to throttle. | ||
1446 | */ | ||
1447 | static int current_may_throttle(void) | ||
1448 | { | ||
1449 | return !(current->flags & PF_LESS_THROTTLE) || | ||
1450 | current->backing_dev_info == NULL || | ||
1451 | bdi_write_congested(current->backing_dev_info); | ||
1452 | } | ||
1453 | |||
1454 | /* | ||
1442 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number | 1455 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
1443 | * of reclaimed pages | 1456 | * of reclaimed pages |
1444 | */ | 1457 | */ |
@@ -1519,7 +1532,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1519 | 1532 | ||
1520 | spin_unlock_irq(&zone->lru_lock); | 1533 | spin_unlock_irq(&zone->lru_lock); |
1521 | 1534 | ||
1522 | free_hot_cold_page_list(&page_list, 1); | 1535 | free_hot_cold_page_list(&page_list, true); |
1523 | 1536 | ||
1524 | /* | 1537 | /* |
1525 | * If reclaim is isolating dirty pages under writeback, it implies | 1538 | * If reclaim is isolating dirty pages under writeback, it implies |
@@ -1566,7 +1579,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1566 | * implies that pages are cycling through the LRU faster than | 1579 | * implies that pages are cycling through the LRU faster than |
1567 | * they are written so also forcibly stall. | 1580 | * they are written so also forcibly stall. |
1568 | */ | 1581 | */ |
1569 | if (nr_unqueued_dirty == nr_taken || nr_immediate) | 1582 | if ((nr_unqueued_dirty == nr_taken || nr_immediate) && |
1583 | current_may_throttle()) | ||
1570 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1584 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1571 | } | 1585 | } |
1572 | 1586 | ||
@@ -1575,7 +1589,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1575 | * is congested. Allow kswapd to continue until it starts encountering | 1589 | * is congested. Allow kswapd to continue until it starts encountering |
1576 | * unqueued dirty pages or cycling through the LRU too quickly. | 1590 | * unqueued dirty pages or cycling through the LRU too quickly. |
1577 | */ | 1591 | */ |
1578 | if (!sc->hibernation_mode && !current_is_kswapd()) | 1592 | if (!sc->hibernation_mode && !current_is_kswapd() && |
1593 | current_may_throttle()) | ||
1579 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); | 1594 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); |
1580 | 1595 | ||
1581 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, | 1596 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, |
@@ -1740,7 +1755,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1740 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); | 1755 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); |
1741 | spin_unlock_irq(&zone->lru_lock); | 1756 | spin_unlock_irq(&zone->lru_lock); |
1742 | 1757 | ||
1743 | free_hot_cold_page_list(&l_hold, 1); | 1758 | free_hot_cold_page_list(&l_hold, true); |
1744 | } | 1759 | } |
1745 | 1760 | ||
1746 | #ifdef CONFIG_SWAP | 1761 | #ifdef CONFIG_SWAP |
@@ -1866,6 +1881,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1866 | bool force_scan = false; | 1881 | bool force_scan = false; |
1867 | unsigned long ap, fp; | 1882 | unsigned long ap, fp; |
1868 | enum lru_list lru; | 1883 | enum lru_list lru; |
1884 | bool some_scanned; | ||
1885 | int pass; | ||
1869 | 1886 | ||
1870 | /* | 1887 | /* |
1871 | * If the zone or memcg is small, nr[l] can be 0. This | 1888 | * If the zone or memcg is small, nr[l] can be 0. This |
@@ -1989,39 +2006,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1989 | fraction[1] = fp; | 2006 | fraction[1] = fp; |
1990 | denominator = ap + fp + 1; | 2007 | denominator = ap + fp + 1; |
1991 | out: | 2008 | out: |
1992 | for_each_evictable_lru(lru) { | 2009 | some_scanned = false; |
1993 | int file = is_file_lru(lru); | 2010 | /* Only use force_scan on second pass. */ |
1994 | unsigned long size; | 2011 | for (pass = 0; !some_scanned && pass < 2; pass++) { |
1995 | unsigned long scan; | 2012 | for_each_evictable_lru(lru) { |
2013 | int file = is_file_lru(lru); | ||
2014 | unsigned long size; | ||
2015 | unsigned long scan; | ||
1996 | 2016 | ||
1997 | size = get_lru_size(lruvec, lru); | 2017 | size = get_lru_size(lruvec, lru); |
1998 | scan = size >> sc->priority; | 2018 | scan = size >> sc->priority; |
1999 | 2019 | ||
2000 | if (!scan && force_scan) | 2020 | if (!scan && pass && force_scan) |
2001 | scan = min(size, SWAP_CLUSTER_MAX); | 2021 | scan = min(size, SWAP_CLUSTER_MAX); |
2002 | 2022 | ||
2003 | switch (scan_balance) { | 2023 | switch (scan_balance) { |
2004 | case SCAN_EQUAL: | 2024 | case SCAN_EQUAL: |
2005 | /* Scan lists relative to size */ | 2025 | /* Scan lists relative to size */ |
2006 | break; | 2026 | break; |
2007 | case SCAN_FRACT: | 2027 | case SCAN_FRACT: |
2028 | /* | ||
2029 | * Scan types proportional to swappiness and | ||
2030 | * their relative recent reclaim efficiency. | ||
2031 | */ | ||
2032 | scan = div64_u64(scan * fraction[file], | ||
2033 | denominator); | ||
2034 | break; | ||
2035 | case SCAN_FILE: | ||
2036 | case SCAN_ANON: | ||
2037 | /* Scan one type exclusively */ | ||
2038 | if ((scan_balance == SCAN_FILE) != file) | ||
2039 | scan = 0; | ||
2040 | break; | ||
2041 | default: | ||
2042 | /* Look ma, no brain */ | ||
2043 | BUG(); | ||
2044 | } | ||
2045 | nr[lru] = scan; | ||
2008 | /* | 2046 | /* |
2009 | * Scan types proportional to swappiness and | 2047 | * Skip the second pass and don't force_scan, |
2010 | * their relative recent reclaim efficiency. | 2048 | * if we found something to scan. |
2011 | */ | 2049 | */ |
2012 | scan = div64_u64(scan * fraction[file], denominator); | 2050 | some_scanned |= !!scan; |
2013 | break; | ||
2014 | case SCAN_FILE: | ||
2015 | case SCAN_ANON: | ||
2016 | /* Scan one type exclusively */ | ||
2017 | if ((scan_balance == SCAN_FILE) != file) | ||
2018 | scan = 0; | ||
2019 | break; | ||
2020 | default: | ||
2021 | /* Look ma, no brain */ | ||
2022 | BUG(); | ||
2023 | } | 2051 | } |
2024 | nr[lru] = scan; | ||
2025 | } | 2052 | } |
2026 | } | 2053 | } |
2027 | 2054 | ||
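The effect of deferring force_scan to a second pass is easiest to see with numbers: a small LRU is only bumped up to SWAP_CLUSTER_MAX when the first pass found nothing at all to scan. A standalone illustration (SWAP_CLUSTER_MAX = 32 and DEF_PRIORITY = 12 are assumed upstream values):

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL	/* assumed upstream value */

int main(void)
{
	unsigned long size = 2000, scan;
	int priority = 12;		/* DEF_PRIORITY (assumed) */

	scan = size >> priority;	/* 0: too small to scan at this priority */
	if (!scan)			/* second pass with force_scan set */
		scan = size < SWAP_CLUSTER_MAX ? size : SWAP_CLUSTER_MAX;
	printf("scan = %lu\n", scan);	/* prints 32 */
	return 0;
}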
@@ -2037,13 +2064,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2037 | unsigned long nr_reclaimed = 0; | 2064 | unsigned long nr_reclaimed = 0; |
2038 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 2065 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
2039 | struct blk_plug plug; | 2066 | struct blk_plug plug; |
2040 | bool scan_adjusted = false; | 2067 | bool scan_adjusted; |
2041 | 2068 | ||
2042 | get_scan_count(lruvec, sc, nr); | 2069 | get_scan_count(lruvec, sc, nr); |
2043 | 2070 | ||
2044 | /* Record the original scan target for proportional adjustments later */ | 2071 | /* Record the original scan target for proportional adjustments later */ |
2045 | memcpy(targets, nr, sizeof(nr)); | 2072 | memcpy(targets, nr, sizeof(nr)); |
2046 | 2073 | ||
2074 | /* | ||
2075 | * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal | ||
2076 | * event that can occur when there is little memory pressure e.g. | ||
2077 | * multiple streaming readers/writers. Hence, we do not abort scanning | ||
2078 | * once the requested number of pages has been reclaimed when scanning at | ||
2079 | * DEF_PRIORITY on the assumption that the fact we are direct | ||
2080 | * reclaiming implies that kswapd is not keeping up and it is best to | ||
2081 | * do a batch of work at once. For memcg reclaim one check is made to | ||
2082 | * abort proportional reclaim if either the file or anon lru has already | ||
2083 | * dropped to zero at the first pass. | ||
2084 | */ | ||
2085 | scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && | ||
2086 | sc->priority == DEF_PRIORITY); | ||
2087 | |||
2047 | blk_start_plug(&plug); | 2088 | blk_start_plug(&plug); |
2048 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 2089 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
2049 | nr[LRU_INACTIVE_FILE]) { | 2090 | nr[LRU_INACTIVE_FILE]) { |
@@ -2064,17 +2105,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2064 | continue; | 2105 | continue; |
2065 | 2106 | ||
2066 | /* | 2107 | /* |
2067 | * For global direct reclaim, reclaim only the number of pages | ||
2068 | * requested. Less care is taken to scan proportionally as it | ||
2069 | * is more important to minimise direct reclaim stall latency | ||
2070 | * than it is to properly age the LRU lists. | ||
2071 | */ | ||
2072 | if (global_reclaim(sc) && !current_is_kswapd()) | ||
2073 | break; | ||
2074 | |||
2075 | /* | ||
2076 | * For kswapd and memcg, reclaim at least the number of pages | 2108 | * For kswapd and memcg, reclaim at least the number of pages |
2077 | * requested. Ensure that the anon and file LRUs shrink | 2109 | * requested. Ensure that the anon and file LRUs are scanned |
2078 | * proportionally to what was requested by get_scan_count(). We | 2110 | * proportionally to what was requested by get_scan_count(). We |
2079 | * stop reclaiming one LRU and reduce the amount of scanning | 2111 | * stop reclaiming one LRU and reduce the amount of scanning |
2080 | * proportional to the original scan target. | 2112 | * proportional to the original scan target. |
@@ -2082,6 +2114,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2082 | nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; | 2114 | nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; |
2083 | nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; | 2115 | nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; |
2084 | 2116 | ||
2117 | /* | ||
2118 | * It's just vindictive to attack the larger once the smaller | ||
2119 | * has gone to zero. And given the way we stop scanning the | ||
2120 | * smaller below, this makes sure that we only make one nudge | ||
2121 | * towards proportionality once we've got nr_to_reclaim. | ||
2122 | */ | ||
2123 | if (!nr_file || !nr_anon) | ||
2124 | break; | ||
2125 | |||
2085 | if (nr_file > nr_anon) { | 2126 | if (nr_file > nr_anon) { |
2086 | unsigned long scan_target = targets[LRU_INACTIVE_ANON] + | 2127 | unsigned long scan_target = targets[LRU_INACTIVE_ANON] + |
2087 | targets[LRU_ACTIVE_ANON] + 1; | 2128 | targets[LRU_ACTIVE_ANON] + 1; |
@@ -2268,9 +2309,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
2268 | * there is a buffer of free pages available to give compaction | 2309 | * there is a buffer of free pages available to give compaction |
2269 | * a reasonable chance of completing and allocating the page | 2310 | * a reasonable chance of completing and allocating the page |
2270 | */ | 2311 | */ |
2271 | balance_gap = min(low_wmark_pages(zone), | 2312 | balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( |
2272 | (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2313 | zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); |
2273 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2274 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); | 2314 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); |
2275 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); | 2315 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); |
2276 | 2316 | ||
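The DIV_ROUND_UP() conversion is purely cosmetic; both spellings round the per-zone gap up. A tiny standalone check (the ratio value of 100 is an assumption about KSWAPD_ZONE_BALANCE_GAP_RATIO):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long managed_pages = 1000003;	/* arbitrary example */
	unsigned long ratio = 100;		/* KSWAPD_ZONE_BALANCE_GAP_RATIO (assumed) */

	/* old open-coded form and the new macro compute the same value */
	printf("%lu %lu\n",
	       (managed_pages + ratio - 1) / ratio,
	       DIV_ROUND_UP(managed_pages, ratio));	/* both print 10001 */
	return 0;
}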
@@ -2525,10 +2565,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2525 | 2565 | ||
2526 | for (i = 0; i <= ZONE_NORMAL; i++) { | 2566 | for (i = 0; i <= ZONE_NORMAL; i++) { |
2527 | zone = &pgdat->node_zones[i]; | 2567 | zone = &pgdat->node_zones[i]; |
2568 | if (!populated_zone(zone)) | ||
2569 | continue; | ||
2570 | |||
2528 | pfmemalloc_reserve += min_wmark_pages(zone); | 2571 | pfmemalloc_reserve += min_wmark_pages(zone); |
2529 | free_pages += zone_page_state(zone, NR_FREE_PAGES); | 2572 | free_pages += zone_page_state(zone, NR_FREE_PAGES); |
2530 | } | 2573 | } |
2531 | 2574 | ||
2575 | /* If there are no reserves (unexpected config) then do not throttle */ | ||
2576 | if (!pfmemalloc_reserve) | ||
2577 | return true; | ||
2578 | |||
2532 | wmark_ok = free_pages > pfmemalloc_reserve / 2; | 2579 | wmark_ok = free_pages > pfmemalloc_reserve / 2; |
2533 | 2580 | ||
2534 | /* kswapd must be awake if processes are being throttled */ | 2581 | /* kswapd must be awake if processes are being throttled */ |
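The two additions above make pfmemalloc_watermark_ok() skip unpopulated zones and report success when the node carries no reserves at all, so callers are not throttled against a watermark that cannot exist. A compact sketch of the resulting check; struct toy_zone, watermark_ok() and the numbers are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
	unsigned long present_pages;	/* 0 means the zone is not populated */
	unsigned long min_wmark;
	unsigned long free_pages;
};

/* Skip empty zones; never throttle when there is no reserve to compare against. */
static bool watermark_ok(const struct toy_zone *zones, int nr)
{
	unsigned long reserve = 0, free = 0;
	int i;

	for (i = 0; i < nr; i++) {
		if (!zones[i].present_pages)
			continue;		/* unpopulated: contributes nothing */
		reserve += zones[i].min_wmark;
		free += zones[i].free_pages;
	}

	if (!reserve)				/* unexpected config: do not throttle */
		return true;

	return free > reserve / 2;
}

int main(void)
{
	struct toy_zone node[] = {
		{ .present_pages = 0 },		/* empty low zone */
		{ .present_pages = 1000, .min_wmark = 50, .free_pages = 40 },
	};

	printf("throttle? %s\n", watermark_ok(node, 2) ? "no" : "yes");
	return 0;
}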
@@ -2553,9 +2600,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2553 | static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | 2600 | static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, |
2554 | nodemask_t *nodemask) | 2601 | nodemask_t *nodemask) |
2555 | { | 2602 | { |
2603 | struct zoneref *z; | ||
2556 | struct zone *zone; | 2604 | struct zone *zone; |
2557 | int high_zoneidx = gfp_zone(gfp_mask); | 2605 | pg_data_t *pgdat = NULL; |
2558 | pg_data_t *pgdat; | ||
2559 | 2606 | ||
2560 | /* | 2607 | /* |
2561 | * Kernel threads should not be throttled as they may be indirectly | 2608 | * Kernel threads should not be throttled as they may be indirectly |
@@ -2574,10 +2621,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | |||
2574 | if (fatal_signal_pending(current)) | 2621 | if (fatal_signal_pending(current)) |
2575 | goto out; | 2622 | goto out; |
2576 | 2623 | ||
2577 | /* Check if the pfmemalloc reserves are ok */ | 2624 | /* |
2578 | first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); | 2625 | * Check if the pfmemalloc reserves are ok by finding the first node |
2579 | pgdat = zone->zone_pgdat; | 2626 | * with a usable ZONE_NORMAL or lower zone. The expectation is that |
2580 | if (pfmemalloc_watermark_ok(pgdat)) | 2627 | * GFP_KERNEL will be required for allocating network buffers when |
2628 | * swapping over the network so ZONE_HIGHMEM is unusable. | ||
2629 | * | ||
2630 | * Throttling is based on the first usable node and throttled processes | ||
2631 | * wait on a queue until kswapd makes progress and wakes them. There | ||
2632 | * is an affinity then between processes waking up and where reclaim | ||
2633 | * progress has been made assuming the process wakes on the same node. | ||
2634 | * More importantly, processes running on remote nodes will not compete | ||
2635 | * for remote pfmemalloc reserves and processes on different nodes | ||
2636 | * should make reasonable progress. | ||
2637 | */ | ||
2638 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | ||
2639 | gfp_mask, nodemask) { | ||
2640 | if (zone_idx(zone) > ZONE_NORMAL) | ||
2641 | continue; | ||
2642 | |||
2643 | /* Throttle based on the first usable node */ | ||
2644 | pgdat = zone->zone_pgdat; | ||
2645 | if (pfmemalloc_watermark_ok(pgdat)) | ||
2646 | goto out; | ||
2647 | break; | ||
2648 | } | ||
2649 | |||
2650 | /* If no zone was usable by the allocation flags then do not throttle */ | ||
2651 | if (!pgdat) | ||
2581 | goto out; | 2652 | goto out; |
2582 | 2653 | ||
2583 | /* Account for the throttling */ | 2654 | /* Account for the throttling */ |
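The rewritten throttle_direct_reclaim() check walks the allocation's zonelist instead of trusting its first zone: zones above ZONE_NORMAL are ignored (swap-over-network needs GFP_KERNEL memory), the first usable node decides whether to throttle, and with no usable zone the caller is not throttled at all. A toy rendering of that selection loop; struct toy_zone and its fields stand in for the real zone/pgdat structures and are not the kernel API:

#include <stdbool.h>
#include <stdio.h>

enum toy_zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

struct toy_zone {
	enum toy_zone_type idx;
	int node;		/* stand-in for zone->zone_pgdat */
	bool reserves_ok;	/* stand-in for pfmemalloc_watermark_ok() */
};

int main(void)
{
	/* A zonelist as the allocator would walk it, highest zones first. */
	struct toy_zone zonelist[] = {
		{ ZONE_HIGHMEM, 0, true  },	/* skipped: above ZONE_NORMAL */
		{ ZONE_NORMAL,  0, false },	/* first usable node decides  */
		{ ZONE_NORMAL,  1, true  },	/* never reached              */
	};
	int nr = sizeof(zonelist) / sizeof(zonelist[0]);
	int chosen_node = -1;
	bool throttle = false;
	int i;

	for (i = 0; i < nr; i++) {
		if (zonelist[i].idx > ZONE_NORMAL)
			continue;
		chosen_node = zonelist[i].node;
		throttle = !zonelist[i].reserves_ok;
		break;			/* throttle based on the first usable node */
	}

	if (chosen_node < 0)
		throttle = false;	/* no usable zone: do not throttle */

	printf("node %d, throttle=%d\n", chosen_node, throttle);
	return 0;
}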
@@ -2891,9 +2962,8 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2891 | * high wmark plus a "gap" where the gap is either the low | 2962 | * high wmark plus a "gap" where the gap is either the low |
2892 | * watermark or 1% of the zone, whichever is smaller. | 2963 | * watermark or 1% of the zone, whichever is smaller. |
2893 | */ | 2964 | */ |
2894 | balance_gap = min(low_wmark_pages(zone), | 2965 | balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( |
2895 | (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2966 | zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); |
2896 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2897 | 2967 | ||
2898 | /* | 2968 | /* |
2899 | * If there is no low memory pressure or the zone is balanced then no | 2969 | * If there is no low memory pressure or the zone is balanced then no |
@@ -3422,7 +3492,7 @@ int kswapd_run(int nid) | |||
3422 | 3492 | ||
3423 | /* | 3493 | /* |
3424 | * Called by memory hotplug when all memory in a node is offlined. Caller must | 3494 | * Called by memory hotplug when all memory in a node is offlined. Caller must |
3425 | * hold lock_memory_hotplug(). | 3495 | * hold mem_hotplug_begin/end(). |
3426 | */ | 3496 | */ |
3427 | void kswapd_stop(int nid) | 3497 | void kswapd_stop(int nid) |
3428 | { | 3498 | { |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 302dd076b8bf..b37bd49bfd55 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -207,7 +207,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, | |||
207 | } | 207 | } |
208 | 208 | ||
209 | /* | 209 | /* |
210 | * For use when we know that interrupts are disabled. | 210 | * For use when we know that interrupts are disabled, |
211 | * or when we know that preemption is disabled and that | ||
212 | * particular counter cannot be updated from interrupt context. | ||
211 | */ | 213 | */ |
212 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 214 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
213 | int delta) | 215 | int delta) |
@@ -489,7 +491,7 @@ static void refresh_cpu_vm_stats(void) | |||
489 | continue; | 491 | continue; |
490 | 492 | ||
491 | if (__this_cpu_read(p->pcp.count)) | 493 | if (__this_cpu_read(p->pcp.count)) |
492 | drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); | 494 | drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); |
493 | #endif | 495 | #endif |
494 | } | 496 | } |
495 | fold_diff(global_diff); | 497 | fold_diff(global_diff); |
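This hunk, like the vmstat_update() and zs_unmap_object() hunks later in the page, is part of the conversion from the deprecated __get_cpu_var()/__this_cpu_ptr() accessors to this_cpu_ptr(). A minimal kernel-module sketch of the new spelling; the module and its demo_counter variable are invented for illustration:

#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, demo_counter);

static int __init percpu_demo_init(void)
{
	int *p;

	preempt_disable();
	/* this_cpu_ptr(&var) replaces the old &__get_cpu_var(var) spelling */
	p = this_cpu_ptr(&demo_counter);
	(*p)++;
	pr_info("demo_counter on this CPU: %d\n", *p);
	preempt_enable();

	return 0;
}

static void __exit percpu_demo_exit(void)
{
}

module_init(percpu_demo_init);
module_exit(percpu_demo_exit);
MODULE_LICENSE("GPL");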
@@ -866,6 +868,10 @@ const char * const vmstat_text[] = { | |||
866 | "nr_tlb_local_flush_one", | 868 | "nr_tlb_local_flush_one", |
867 | #endif /* CONFIG_DEBUG_TLBFLUSH */ | 869 | #endif /* CONFIG_DEBUG_TLBFLUSH */ |
868 | 870 | ||
871 | #ifdef CONFIG_DEBUG_VM_VMACACHE | ||
872 | "vmacache_find_calls", | ||
873 | "vmacache_find_hits", | ||
874 | #endif | ||
869 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 875 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
870 | }; | 876 | }; |
871 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ | 877 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
@@ -1226,7 +1232,7 @@ int sysctl_stat_interval __read_mostly = HZ; | |||
1226 | static void vmstat_update(struct work_struct *w) | 1232 | static void vmstat_update(struct work_struct *w) |
1227 | { | 1233 | { |
1228 | refresh_cpu_vm_stats(); | 1234 | refresh_cpu_vm_stats(); |
1229 | schedule_delayed_work(&__get_cpu_var(vmstat_work), | 1235 | schedule_delayed_work(this_cpu_ptr(&vmstat_work), |
1230 | round_jiffies_relative(sysctl_stat_interval)); | 1236 | round_jiffies_relative(sysctl_stat_interval)); |
1231 | } | 1237 | } |
1232 | 1238 | ||
diff --git a/mm/zbud.c b/mm/zbud.c --- a/mm/zbud.c +++ b/mm/zbud.c | |||
@@ -247,7 +247,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) | |||
247 | * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate | 247 | * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate |
248 | * a new page. | 248 | * a new page. |
249 | */ | 249 | */ |
250 | int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, | 250 | int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, |
251 | unsigned long *handle) | 251 | unsigned long *handle) |
252 | { | 252 | { |
253 | int chunks, i, freechunks; | 253 | int chunks, i, freechunks; |
@@ -255,7 +255,7 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, | |||
255 | enum buddy bud; | 255 | enum buddy bud; |
256 | struct page *page; | 256 | struct page *page; |
257 | 257 | ||
258 | if (size <= 0 || gfp & __GFP_HIGHMEM) | 258 | if (!size || (gfp & __GFP_HIGHMEM)) |
259 | return -EINVAL; | 259 | return -EINVAL; |
260 | if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) | 260 | if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) |
261 | return -ENOSPC; | 261 | return -ENOSPC; |
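With size now unsigned, the only remaining "too small" case is zero, so the signed size <= 0 test collapses to !size while the upper bound still rejects oversized requests. A standalone sketch of the reworked argument check; the ZHDR_SIZE_ALIGNED and CHUNK_SIZE values here are placeholders, not the real zbud constants:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define ZHDR_SIZE_ALIGNED 64UL	/* placeholder; derived from struct zbud_header */
#define CHUNK_SIZE 64UL		/* placeholder; derived from CHUNK_SHIFT */

/* Reject empty requests and requests that cannot fit in a zbud page. */
static int check_size(unsigned int size)
{
	if (!size)
		return -1;	/* -EINVAL in the kernel */
	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
		return -2;	/* -ENOSPC in the kernel */
	return 0;
}

int main(void)
{
	printf("size 0      -> %d\n", check_size(0));
	printf("size 100    -> %d\n", check_size(100));
	printf("size 100000 -> %d\n", check_size(100000));
	return 0;
}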
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 36b4591a7a2d..fe78189624cf 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -141,7 +141,7 @@ | |||
141 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE | 141 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE |
142 | 142 | ||
143 | /* | 143 | /* |
144 | * On systems with 4K page size, this gives 254 size classes! There is a | 144 | * On systems with 4K page size, this gives 255 size classes! There is a |
145 | * trade-off here: | 145 | * trade-off here: |
146 | * - Large number of size classes is potentially wasteful as free pages are | 146 | * - Large number of size classes is potentially wasteful as free pages are |
147 | * spread across these classes | 147 | * spread across these classes |
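The comment fix matches the arithmetic: assuming zsmalloc's usual constants for 4K pages (32-byte minimum object, class step of PAGE_SIZE >> 8 = 16 bytes), (max - min) / step + 1 gives 255 classes, not 254. A one-off check:

#include <stdio.h>

int main(void)
{
	/* Assumed to match zsmalloc's defaults for 4K pages. */
	unsigned long page_size = 4096;
	unsigned long zs_min_alloc_size = 32;
	unsigned long zs_max_alloc_size = page_size;
	unsigned long zs_size_class_delta = page_size >> 8;	/* 16 bytes */

	unsigned long classes = (zs_max_alloc_size - zs_min_alloc_size) /
				zs_size_class_delta + 1;

	printf("%lu size classes\n", classes);	/* prints 255 */
	return 0;
}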
@@ -1082,7 +1082,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
1082 | class = &pool->size_class[class_idx]; | 1082 | class = &pool->size_class[class_idx]; |
1083 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1083 | off = obj_idx_to_offset(page, obj_idx, class->size); |
1084 | 1084 | ||
1085 | area = &__get_cpu_var(zs_map_area); | 1085 | area = this_cpu_ptr(&zs_map_area); |
1086 | if (off + class->size <= PAGE_SIZE) | 1086 | if (off + class->size <= PAGE_SIZE) |
1087 | kunmap_atomic(area->vm_addr); | 1087 | kunmap_atomic(area->vm_addr); |
1088 | else { | 1088 | else { |
diff --git a/mm/zswap.c b/mm/zswap.c index aeaef0fb5624..008388fe7b0f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -347,7 +347,7 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) | |||
347 | return NOTIFY_BAD; | 347 | return NOTIFY_BAD; |
348 | } | 348 | } |
349 | *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; | 349 | *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; |
350 | dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); | 350 | dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); |
351 | if (!dst) { | 351 | if (!dst) { |
352 | pr_err("can't allocate compressor buffer\n"); | 352 | pr_err("can't allocate compressor buffer\n"); |
353 | crypto_free_comp(tfm); | 353 | crypto_free_comp(tfm); |
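The zswap change allocates each CPU's compressor scratch buffer on that CPU's own NUMA node rather than on whichever node happens to run the notifier. A small kernel-module sketch of the same node-local allocation pattern; the module name and PAGE_SIZE-sized buffer are invented for illustration (zswap itself uses PAGE_SIZE * 2):

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/topology.h>

static int __init node_local_demo_init(void)
{
	int cpu;

	/* Give each CPU a scratch buffer on its own node, as the zswap
	 * notifier now does for its per-CPU compressor buffers. */
	for_each_possible_cpu(cpu) {
		void *buf = kmalloc_node(PAGE_SIZE, GFP_KERNEL,
					 cpu_to_node(cpu));

		if (!buf)
			return -ENOMEM;
		pr_info("cpu %d: buffer on node %d\n", cpu, cpu_to_node(cpu));
		kfree(buf);
	}
	return 0;
}

static void __exit node_local_demo_exit(void)
{
}

module_init(node_local_demo_init);
module_exit(node_local_demo_exit);
MODULE_LICENSE("GPL");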