Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  27
-rw-r--r--  mm/backing-dev.c      |  11
-rw-r--r--  mm/bootmem.c          |  32
-rw-r--r--  mm/bounce.c           |  48
-rw-r--r--  mm/cleancache.c       |   2
-rw-r--r--  mm/compaction.c       | 133
-rw-r--r--  mm/fadvise.c          |  20
-rw-r--r--  mm/filemap.c          |   7
-rw-r--r--  mm/fremap.c           |  51
-rw-r--r--  mm/huge_memory.c      | 113
-rw-r--r--  mm/hugetlb.c          |  39
-rw-r--r--  mm/internal.h         |   7
-rw-r--r--  mm/kmemleak.c         |  14
-rw-r--r--  mm/ksm.c              | 670
-rw-r--r--  mm/madvise.c          | 105
-rw-r--r--  mm/memblock.c         |  70
-rw-r--r--  mm/memcontrol.c       | 477
-rw-r--r--  mm/memory-failure.c   | 202
-rw-r--r--  mm/memory.c           | 127
-rw-r--r--  mm/memory_hotplug.c   | 553
-rw-r--r--  mm/mempolicy.c        |  59
-rw-r--r--  mm/migrate.c          | 154
-rw-r--r--  mm/mincore.c          |   5
-rw-r--r--  mm/mlock.c            | 137
-rw-r--r--  mm/mm_init.c          |  31
-rw-r--r--  mm/mmap.c             | 123
-rw-r--r--  mm/mmu_notifier.c     | 102
-rw-r--r--  mm/mmzone.c           |  20
-rw-r--r--  mm/mremap.c           |  28
-rw-r--r--  mm/nobootmem.c        |  23
-rw-r--r--  mm/nommu.c            |  41
-rw-r--r--  mm/oom_kill.c         |   6
-rw-r--r--  mm/page-writeback.c   |  28
-rw-r--r--  mm/page_alloc.c       | 498
-rw-r--r--  mm/rmap.c             |  30
-rw-r--r--  mm/shmem.c            | 102
-rw-r--r--  mm/slab.c             |   2
-rw-r--r--  mm/slob.c             |   2
-rw-r--r--  mm/slub.c             |   4
-rw-r--r--  mm/sparse.c           |  12
-rw-r--r--  mm/swap.c             |   9
-rw-r--r--  mm/swap_state.c       |  58
-rw-r--r--  mm/swapfile.c         | 176
-rw-r--r--  mm/util.c             |  26
-rw-r--r--  mm/vmalloc.c          |  33
-rw-r--r--  mm/vmscan.c           | 397
-rw-r--r--  mm/vmstat.c           |   7
47 files changed, 3230 insertions, 1591 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 278e3ab1f169..ae55c1e04d10 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1,6 +1,6 @@
1config SELECT_MEMORY_MODEL 1config SELECT_MEMORY_MODEL
2 def_bool y 2 def_bool y
3 depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL 3 depends on ARCH_SELECT_MEMORY_MODEL
4 4
5choice 5choice
6 prompt "Memory model" 6 prompt "Memory model"
@@ -162,10 +162,16 @@ config MOVABLE_NODE
162 Say Y here if you want to hotplug a whole node. 162 Say Y here if you want to hotplug a whole node.
163 Say N here if you want kernel to use memory on all nodes evenly. 163 Say N here if you want kernel to use memory on all nodes evenly.
164 164
165#
166# Only be set on architectures that have completely implemented memory hotplug
167# feature. If you are not sure, don't touch it.
168#
169config HAVE_BOOTMEM_INFO_NODE
170 def_bool n
171
165# eventually, we can have this option just 'select SPARSEMEM' 172# eventually, we can have this option just 'select SPARSEMEM'
166config MEMORY_HOTPLUG 173config MEMORY_HOTPLUG
167 bool "Allow for memory hot-add" 174 bool "Allow for memory hot-add"
168 select MEMORY_ISOLATION
169 depends on SPARSEMEM || X86_64_ACPI_NUMA 175 depends on SPARSEMEM || X86_64_ACPI_NUMA
170 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG 176 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
171 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) 177 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -176,6 +182,8 @@ config MEMORY_HOTPLUG_SPARSE
176 182
177config MEMORY_HOTREMOVE 183config MEMORY_HOTREMOVE
178 bool "Allow for memory hot remove" 184 bool "Allow for memory hot remove"
185 select MEMORY_ISOLATION
186 select HAVE_BOOTMEM_INFO_NODE if X86_64
179 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE 187 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
180 depends on MIGRATION 188 depends on MIGRATION
181 189
@@ -258,6 +266,19 @@ config BOUNCE
258 def_bool y 266 def_bool y
259 depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) 267 depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
260 268
269# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
270# have more than 4GB of memory, but we don't currently use the IOTLB to present
271# a 32-bit address to OHCI. So we need to use a bounce pool instead.
272#
273# We also use the bounce pool to provide stable page writes for jbd. jbd
274# initiates buffer writeback without locking the page or setting PG_writeback,
275# and fixing that behavior (a second time; jbd2 doesn't have this problem) is
276# a major rework effort. Instead, use the bounce buffer to snapshot pages
277# (until jbd goes away). The only jbd user is ext3.
278config NEED_BOUNCE_POOL
279 bool
280 default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD)
281
261config NR_QUICK 282config NR_QUICK
262 int 283 int
263 depends on QUICKLIST 284 depends on QUICKLIST
@@ -266,7 +287,7 @@ config NR_QUICK
266 287
267config VIRT_TO_BUS 288config VIRT_TO_BUS
268 def_bool y 289 def_bool y
269 depends on !ARCH_NO_VIRT_TO_BUS 290 depends on HAVE_VIRT_TO_BUS
270 291
271config MMU_NOTIFIER 292config MMU_NOTIFIER
272 bool 293 bool
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d3ca2b3ee176..41733c5dc820 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -221,12 +221,23 @@ static ssize_t max_ratio_store(struct device *dev,
221} 221}
222BDI_SHOW(max_ratio, bdi->max_ratio) 222BDI_SHOW(max_ratio, bdi->max_ratio)
223 223
224static ssize_t stable_pages_required_show(struct device *dev,
225 struct device_attribute *attr,
226 char *page)
227{
228 struct backing_dev_info *bdi = dev_get_drvdata(dev);
229
230 return snprintf(page, PAGE_SIZE-1, "%d\n",
231 bdi_cap_stable_pages_required(bdi) ? 1 : 0);
232}
233
224#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) 234#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
225 235
226static struct device_attribute bdi_dev_attrs[] = { 236static struct device_attribute bdi_dev_attrs[] = {
227 __ATTR_RW(read_ahead_kb), 237 __ATTR_RW(read_ahead_kb),
228 __ATTR_RW(min_ratio), 238 __ATTR_RW(min_ratio),
229 __ATTR_RW(max_ratio), 239 __ATTR_RW(max_ratio),
240 __ATTR_RO(stable_pages_required),
230 __ATTR_NULL, 241 __ATTR_NULL,
231}; 242};
232 243
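
The new stable_pages_required attribute is exported read-only through the bdi sysfs class alongside read_ahead_kb, min_ratio and max_ratio. As a rough userspace sketch (not part of this patch), it can be read like any other sysfs file; the bdi name "8:0" below is only an example and depends on the device being queried.

#include <stdio.h>

/*
 * Read a bdi's stable_pages_required flag from sysfs.  The bdi name
 * "8:0" is illustrative; real names depend on the block device.
 */
int main(void)
{
	const char *path = "/sys/class/bdi/8:0/stable_pages_required";
	FILE *f = fopen(path, "r");
	int val;

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("%s = %d\n", path, val); /* 1: writers must wait for writeback */
	fclose(f);
	return 0;
}
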
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 1324cd74faec..2b0bcb019ec2 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
185 185
186 while (start < end) { 186 while (start < end) {
187 unsigned long *map, idx, vec; 187 unsigned long *map, idx, vec;
188 unsigned shift;
188 189
189 map = bdata->node_bootmem_map; 190 map = bdata->node_bootmem_map;
190 idx = start - bdata->node_min_pfn; 191 idx = start - bdata->node_min_pfn;
192 shift = idx & (BITS_PER_LONG - 1);
193 /*
194 * vec holds at most BITS_PER_LONG map bits,
195 * bit 0 corresponds to start.
196 */
191 vec = ~map[idx / BITS_PER_LONG]; 197 vec = ~map[idx / BITS_PER_LONG];
198
199 if (shift) {
200 vec >>= shift;
201 if (end - start >= BITS_PER_LONG)
202 vec |= ~map[idx / BITS_PER_LONG + 1] <<
203 (BITS_PER_LONG - shift);
204 }
192 /* 205 /*
193 * If we have a properly aligned and fully unreserved 206 * If we have a properly aligned and fully unreserved
194 * BITS_PER_LONG block of pages in front of us, free 207 * BITS_PER_LONG block of pages in front of us, free
@@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
201 count += BITS_PER_LONG; 214 count += BITS_PER_LONG;
202 start += BITS_PER_LONG; 215 start += BITS_PER_LONG;
203 } else { 216 } else {
204 unsigned long off = 0; 217 unsigned long cur = start;
205 218
206 vec >>= start & (BITS_PER_LONG - 1); 219 start = ALIGN(start + 1, BITS_PER_LONG);
207 while (vec) { 220 while (vec && cur != start) {
208 if (vec & 1) { 221 if (vec & 1) {
209 page = pfn_to_page(start + off); 222 page = pfn_to_page(cur);
210 __free_pages_bootmem(page, 0); 223 __free_pages_bootmem(page, 0);
211 count++; 224 count++;
212 } 225 }
213 vec >>= 1; 226 vec >>= 1;
214 off++; 227 ++cur;
215 } 228 }
216 start = ALIGN(start + 1, BITS_PER_LONG);
217 } 229 }
218 } 230 }
219 231
@@ -821,6 +833,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
821 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); 833 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
822} 834}
823 835
836void * __init __alloc_bootmem_low_nopanic(unsigned long size,
837 unsigned long align,
838 unsigned long goal)
839{
840 return ___alloc_bootmem_nopanic(size, align, goal,
841 ARCH_LOW_ADDRESS_LIMIT);
842}
843
824/** 844/**
825 * __alloc_bootmem_low_node - allocate low boot memory from a specific node 845 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
826 * @pgdat: node to allocate from 846 * @pgdat: node to allocate from
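
The free_all_bootmem_core() hunk above splices the free-page vector from two adjacent bitmap words whenever start is not BITS_PER_LONG-aligned, so that bit 0 of vec always corresponds to start. A minimal userspace sketch of the same bit arithmetic, with made-up map[] contents, is shown here for illustration only.

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

int main(void)
{
	unsigned long map[2] = { ~0x0fUL, 0x3UL };      /* set bit == reserved page */
	unsigned long start = 4, end = 70;              /* pfn range, start unaligned */
	unsigned long idx = start;                      /* offset into the bitmap */
	unsigned shift = idx & (BITS_PER_LONG - 1);
	unsigned long vec = ~map[idx / BITS_PER_LONG];  /* set bit == free page */

	if (shift) {
		vec >>= shift;
		if (end - start >= BITS_PER_LONG)
			vec |= ~map[idx / BITS_PER_LONG + 1] <<
				(BITS_PER_LONG - shift);
	}
	/* bit 0 of vec now corresponds to pfn 'start' */
	printf("free-page vector starting at pfn %lu: %#lx\n", start, vec);
	return 0;
}
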
diff --git a/mm/bounce.c b/mm/bounce.c
index 042086775561..5f8901768602 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -178,8 +178,45 @@ static void bounce_end_io_read_isa(struct bio *bio, int err)
178 __bounce_end_io_read(bio, isa_page_pool, err); 178 __bounce_end_io_read(bio, isa_page_pool, err);
179} 179}
180 180
181#ifdef CONFIG_NEED_BOUNCE_POOL
182static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
183{
184 struct page *page;
185 struct backing_dev_info *bdi;
186 struct address_space *mapping;
187 struct bio_vec *from;
188 int i;
189
190 if (bio_data_dir(bio) != WRITE)
191 return 0;
192
193 if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
194 return 0;
195
196 /*
197 * Based on the first page that has a valid mapping, decide whether or
198 * not we have to employ bounce buffering to guarantee stable pages.
199 */
200 bio_for_each_segment(from, bio, i) {
201 page = from->bv_page;
202 mapping = page_mapping(page);
203 if (!mapping)
204 continue;
205 bdi = mapping->backing_dev_info;
206 return mapping->host->i_sb->s_flags & MS_SNAP_STABLE;
207 }
208
209 return 0;
210}
211#else
212static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
213{
214 return 0;
215}
216#endif /* CONFIG_NEED_BOUNCE_POOL */
217
181static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, 218static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
182 mempool_t *pool) 219 mempool_t *pool, int force)
183{ 220{
184 struct page *page; 221 struct page *page;
185 struct bio *bio = NULL; 222 struct bio *bio = NULL;
@@ -192,7 +229,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
192 /* 229 /*
193 * is destination page below bounce pfn? 230 * is destination page below bounce pfn?
194 */ 231 */
195 if (page_to_pfn(page) <= queue_bounce_pfn(q)) 232 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
196 continue; 233 continue;
197 234
198 /* 235 /*
@@ -270,6 +307,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
270 307
271void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) 308void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
272{ 309{
310 int must_bounce;
273 mempool_t *pool; 311 mempool_t *pool;
274 312
275 /* 313 /*
@@ -278,13 +316,15 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
278 if (!bio_has_data(*bio_orig)) 316 if (!bio_has_data(*bio_orig))
279 return; 317 return;
280 318
319 must_bounce = must_snapshot_stable_pages(q, *bio_orig);
320
281 /* 321 /*
282 * for non-isa bounce case, just check if the bounce pfn is equal 322 * for non-isa bounce case, just check if the bounce pfn is equal
283 * to or bigger than the highest pfn in the system -- in that case, 323 * to or bigger than the highest pfn in the system -- in that case,
284 * don't waste time iterating over bio segments 324 * don't waste time iterating over bio segments
285 */ 325 */
286 if (!(q->bounce_gfp & GFP_DMA)) { 326 if (!(q->bounce_gfp & GFP_DMA)) {
287 if (queue_bounce_pfn(q) >= blk_max_pfn) 327 if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce)
288 return; 328 return;
289 pool = page_pool; 329 pool = page_pool;
290 } else { 330 } else {
@@ -295,7 +335,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
295 /* 335 /*
296 * slow path 336 * slow path
297 */ 337 */
298 __blk_queue_bounce(q, bio_orig, pool); 338 __blk_queue_bounce(q, bio_orig, pool, must_bounce);
299} 339}
300 340
301EXPORT_SYMBOL(blk_queue_bounce); 341EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 32e6f4136fa2..d76ba74be2d0 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -89,7 +89,7 @@ static int cleancache_get_key(struct inode *inode,
89 fhfn = sb->s_export_op->encode_fh; 89 fhfn = sb->s_export_op->encode_fh;
90 if (fhfn) { 90 if (fhfn) {
91 len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); 91 len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
92 if (len <= 0 || len == 255) 92 if (len <= FILEID_ROOT || len == FILEID_INVALID)
93 return -1; 93 return -1;
94 if (maxlen > CLEANCACHE_KEY_MAX) 94 if (maxlen > CLEANCACHE_KEY_MAX)
95 return -1; 95 return -1;
diff --git a/mm/compaction.c b/mm/compaction.c
index 6b807e466497..05ccb4cc0bdb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -15,6 +15,7 @@
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include <linux/balloon_compaction.h> 17#include <linux/balloon_compaction.h>
18#include <linux/page-isolation.h>
18#include "internal.h" 19#include "internal.h"
19 20
20#ifdef CONFIG_COMPACTION 21#ifdef CONFIG_COMPACTION
@@ -85,7 +86,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
85static void __reset_isolation_suitable(struct zone *zone) 86static void __reset_isolation_suitable(struct zone *zone)
86{ 87{
87 unsigned long start_pfn = zone->zone_start_pfn; 88 unsigned long start_pfn = zone->zone_start_pfn;
88 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; 89 unsigned long end_pfn = zone_end_pfn(zone);
89 unsigned long pfn; 90 unsigned long pfn;
90 91
91 zone->compact_cached_migrate_pfn = start_pfn; 92 zone->compact_cached_migrate_pfn = start_pfn;
@@ -215,7 +216,10 @@ static bool suitable_migration_target(struct page *page)
215 int migratetype = get_pageblock_migratetype(page); 216 int migratetype = get_pageblock_migratetype(page);
216 217
217 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ 218 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
218 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) 219 if (migratetype == MIGRATE_RESERVE)
220 return false;
221
222 if (is_migrate_isolate(migratetype))
219 return false; 223 return false;
220 224
221 /* If the page is a large free page, then allow migration */ 225 /* If the page is a large free page, then allow migration */
@@ -611,8 +615,7 @@ check_compact_cluster:
611 continue; 615 continue;
612 616
613next_pageblock: 617next_pageblock:
614 low_pfn += pageblock_nr_pages; 618 low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
615 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
616 last_pageblock_nr = pageblock_nr; 619 last_pageblock_nr = pageblock_nr;
617 } 620 }
618 621
@@ -644,7 +647,7 @@ static void isolate_freepages(struct zone *zone,
644 struct compact_control *cc) 647 struct compact_control *cc)
645{ 648{
646 struct page *page; 649 struct page *page;
647 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; 650 unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn;
648 int nr_freepages = cc->nr_freepages; 651 int nr_freepages = cc->nr_freepages;
649 struct list_head *freelist = &cc->freepages; 652 struct list_head *freelist = &cc->freepages;
650 653
@@ -663,7 +666,7 @@ static void isolate_freepages(struct zone *zone,
663 */ 666 */
664 high_pfn = min(low_pfn, pfn); 667 high_pfn = min(low_pfn, pfn);
665 668
666 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 669 z_end_pfn = zone_end_pfn(zone);
667 670
668 /* 671 /*
669 * Isolate free pages until enough are available to migrate the 672 * Isolate free pages until enough are available to migrate the
@@ -706,7 +709,7 @@ static void isolate_freepages(struct zone *zone,
706 * only scans within a pageblock 709 * only scans within a pageblock
707 */ 710 */
708 end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 711 end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
709 end_pfn = min(end_pfn, zone_end_pfn); 712 end_pfn = min(end_pfn, z_end_pfn);
710 isolated = isolate_freepages_block(cc, pfn, end_pfn, 713 isolated = isolate_freepages_block(cc, pfn, end_pfn,
711 freelist, false); 714 freelist, false);
712 nr_freepages += isolated; 715 nr_freepages += isolated;
@@ -795,7 +798,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
795 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 798 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
796 799
797 /* Only scan within a pageblock boundary */ 800 /* Only scan within a pageblock boundary */
798 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); 801 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
799 802
800 /* Do not cross the free scanner or scan within a memory hole */ 803 /* Do not cross the free scanner or scan within a memory hole */
801 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 804 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
@@ -816,6 +819,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
816static int compact_finished(struct zone *zone, 819static int compact_finished(struct zone *zone,
817 struct compact_control *cc) 820 struct compact_control *cc)
818{ 821{
822 unsigned int order;
819 unsigned long watermark; 823 unsigned long watermark;
820 824
821 if (fatal_signal_pending(current)) 825 if (fatal_signal_pending(current))
@@ -850,22 +854,16 @@ static int compact_finished(struct zone *zone,
850 return COMPACT_CONTINUE; 854 return COMPACT_CONTINUE;
851 855
852 /* Direct compactor: Is a suitable page free? */ 856 /* Direct compactor: Is a suitable page free? */
853 if (cc->page) { 857 for (order = cc->order; order < MAX_ORDER; order++) {
854 /* Was a suitable page captured? */ 858 struct free_area *area = &zone->free_area[order];
855 if (*cc->page) 859
860 /* Job done if page is free of the right migratetype */
861 if (!list_empty(&area->free_list[cc->migratetype]))
862 return COMPACT_PARTIAL;
863
864 /* Job done if allocation would set block type */
865 if (cc->order >= pageblock_order && area->nr_free)
856 return COMPACT_PARTIAL; 866 return COMPACT_PARTIAL;
857 } else {
858 unsigned int order;
859 for (order = cc->order; order < MAX_ORDER; order++) {
860 struct free_area *area = &zone->free_area[cc->order];
861 /* Job done if page is free of the right migratetype */
862 if (!list_empty(&area->free_list[cc->migratetype]))
863 return COMPACT_PARTIAL;
864
865 /* Job done if allocation would set block type */
866 if (cc->order >= pageblock_order && area->nr_free)
867 return COMPACT_PARTIAL;
868 }
869 } 867 }
870 868
871 return COMPACT_CONTINUE; 869 return COMPACT_CONTINUE;
@@ -921,65 +919,11 @@ unsigned long compaction_suitable(struct zone *zone, int order)
921 return COMPACT_CONTINUE; 919 return COMPACT_CONTINUE;
922} 920}
923 921
924static void compact_capture_page(struct compact_control *cc)
925{
926 unsigned long flags;
927 int mtype, mtype_low, mtype_high;
928
929 if (!cc->page || *cc->page)
930 return;
931
932 /*
933 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
934 * regardless of the migratetype of the freelist is is captured from.
935 * This is fine because the order for a high-order MIGRATE_MOVABLE
936 * allocation is typically at least a pageblock size and overall
937 * fragmentation is not impaired. Other allocation types must
938 * capture pages from their own migratelist because otherwise they
939 * could pollute other pageblocks like MIGRATE_MOVABLE with
940 * difficult to move pages and making fragmentation worse overall.
941 */
942 if (cc->migratetype == MIGRATE_MOVABLE) {
943 mtype_low = 0;
944 mtype_high = MIGRATE_PCPTYPES;
945 } else {
946 mtype_low = cc->migratetype;
947 mtype_high = cc->migratetype + 1;
948 }
949
950 /* Speculatively examine the free lists without zone lock */
951 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
952 int order;
953 for (order = cc->order; order < MAX_ORDER; order++) {
954 struct page *page;
955 struct free_area *area;
956 area = &(cc->zone->free_area[order]);
957 if (list_empty(&area->free_list[mtype]))
958 continue;
959
960 /* Take the lock and attempt capture of the page */
961 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
962 return;
963 if (!list_empty(&area->free_list[mtype])) {
964 page = list_entry(area->free_list[mtype].next,
965 struct page, lru);
966 if (capture_free_page(page, cc->order, mtype)) {
967 spin_unlock_irqrestore(&cc->zone->lock,
968 flags);
969 *cc->page = page;
970 return;
971 }
972 }
973 spin_unlock_irqrestore(&cc->zone->lock, flags);
974 }
975 }
976}
977
978static int compact_zone(struct zone *zone, struct compact_control *cc) 922static int compact_zone(struct zone *zone, struct compact_control *cc)
979{ 923{
980 int ret; 924 int ret;
981 unsigned long start_pfn = zone->zone_start_pfn; 925 unsigned long start_pfn = zone->zone_start_pfn;
982 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; 926 unsigned long end_pfn = zone_end_pfn(zone);
983 927
984 ret = compaction_suitable(zone, cc->order); 928 ret = compaction_suitable(zone, cc->order);
985 switch (ret) { 929 switch (ret) {
@@ -1036,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1036 980
1037 nr_migrate = cc->nr_migratepages; 981 nr_migrate = cc->nr_migratepages;
1038 err = migrate_pages(&cc->migratepages, compaction_alloc, 982 err = migrate_pages(&cc->migratepages, compaction_alloc,
1039 (unsigned long)cc, false, 983 (unsigned long)cc,
1040 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, 984 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
1041 MR_COMPACTION); 985 MR_COMPACTION);
1042 update_nr_listpages(cc); 986 update_nr_listpages(cc);
@@ -1054,9 +998,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1054 goto out; 998 goto out;
1055 } 999 }
1056 } 1000 }
1057
1058 /* Capture a page now if it is a suitable size */
1059 compact_capture_page(cc);
1060 } 1001 }
1061 1002
1062out: 1003out:
@@ -1069,8 +1010,7 @@ out:
1069 1010
1070static unsigned long compact_zone_order(struct zone *zone, 1011static unsigned long compact_zone_order(struct zone *zone,
1071 int order, gfp_t gfp_mask, 1012 int order, gfp_t gfp_mask,
1072 bool sync, bool *contended, 1013 bool sync, bool *contended)
1073 struct page **page)
1074{ 1014{
1075 unsigned long ret; 1015 unsigned long ret;
1076 struct compact_control cc = { 1016 struct compact_control cc = {
@@ -1080,7 +1020,6 @@ static unsigned long compact_zone_order(struct zone *zone,
1080 .migratetype = allocflags_to_migratetype(gfp_mask), 1020 .migratetype = allocflags_to_migratetype(gfp_mask),
1081 .zone = zone, 1021 .zone = zone,
1082 .sync = sync, 1022 .sync = sync,
1083 .page = page,
1084 }; 1023 };
1085 INIT_LIST_HEAD(&cc.freepages); 1024 INIT_LIST_HEAD(&cc.freepages);
1086 INIT_LIST_HEAD(&cc.migratepages); 1025 INIT_LIST_HEAD(&cc.migratepages);
@@ -1110,7 +1049,7 @@ int sysctl_extfrag_threshold = 500;
1110 */ 1049 */
1111unsigned long try_to_compact_pages(struct zonelist *zonelist, 1050unsigned long try_to_compact_pages(struct zonelist *zonelist,
1112 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1051 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1113 bool sync, bool *contended, struct page **page) 1052 bool sync, bool *contended)
1114{ 1053{
1115 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1054 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1116 int may_enter_fs = gfp_mask & __GFP_FS; 1055 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1136,7 +1075,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1136 int status; 1075 int status;
1137 1076
1138 status = compact_zone_order(zone, order, gfp_mask, sync, 1077 status = compact_zone_order(zone, order, gfp_mask, sync,
1139 contended, page); 1078 contended);
1140 rc = max(status, rc); 1079 rc = max(status, rc);
1141 1080
1142 /* If a normal allocation would succeed, stop compacting */ 1081 /* If a normal allocation would succeed, stop compacting */
@@ -1150,7 +1089,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1150 1089
1151 1090
1152/* Compact all zones within a node */ 1091/* Compact all zones within a node */
1153static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) 1092static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1154{ 1093{
1155 int zoneid; 1094 int zoneid;
1156 struct zone *zone; 1095 struct zone *zone;
@@ -1183,34 +1122,30 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1183 VM_BUG_ON(!list_empty(&cc->freepages)); 1122 VM_BUG_ON(!list_empty(&cc->freepages));
1184 VM_BUG_ON(!list_empty(&cc->migratepages)); 1123 VM_BUG_ON(!list_empty(&cc->migratepages));
1185 } 1124 }
1186
1187 return 0;
1188} 1125}
1189 1126
1190int compact_pgdat(pg_data_t *pgdat, int order) 1127void compact_pgdat(pg_data_t *pgdat, int order)
1191{ 1128{
1192 struct compact_control cc = { 1129 struct compact_control cc = {
1193 .order = order, 1130 .order = order,
1194 .sync = false, 1131 .sync = false,
1195 .page = NULL,
1196 }; 1132 };
1197 1133
1198 return __compact_pgdat(pgdat, &cc); 1134 __compact_pgdat(pgdat, &cc);
1199} 1135}
1200 1136
1201static int compact_node(int nid) 1137static void compact_node(int nid)
1202{ 1138{
1203 struct compact_control cc = { 1139 struct compact_control cc = {
1204 .order = -1, 1140 .order = -1,
1205 .sync = true, 1141 .sync = true,
1206 .page = NULL,
1207 }; 1142 };
1208 1143
1209 return __compact_pgdat(NODE_DATA(nid), &cc); 1144 __compact_pgdat(NODE_DATA(nid), &cc);
1210} 1145}
1211 1146
1212/* Compact all nodes in the system */ 1147/* Compact all nodes in the system */
1213static int compact_nodes(void) 1148static void compact_nodes(void)
1214{ 1149{
1215 int nid; 1150 int nid;
1216 1151
@@ -1219,8 +1154,6 @@ static int compact_nodes(void)
1219 1154
1220 for_each_online_node(nid) 1155 for_each_online_node(nid)
1221 compact_node(nid); 1156 compact_node(nid);
1222
1223 return COMPACT_COMPLETE;
1224} 1157}
1225 1158
1226/* The written value is actually unused, all memory is compacted */ 1159/* The written value is actually unused, all memory is compacted */
@@ -1231,7 +1164,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
1231 void __user *buffer, size_t *length, loff_t *ppos) 1164 void __user *buffer, size_t *length, loff_t *ppos)
1232{ 1165{
1233 if (write) 1166 if (write)
1234 return compact_nodes(); 1167 compact_nodes();
1235 1168
1236 return 0; 1169 return 0;
1237} 1170}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index a47f0f50c89f..7e092689a12a 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -17,6 +17,7 @@
17#include <linux/fadvise.h> 17#include <linux/fadvise.h>
18#include <linux/writeback.h> 18#include <linux/writeback.h>
19#include <linux/syscalls.h> 19#include <linux/syscalls.h>
20#include <linux/swap.h>
20 21
21#include <asm/unistd.h> 22#include <asm/unistd.h>
22 23
@@ -38,7 +39,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
38 if (!f.file) 39 if (!f.file)
39 return -EBADF; 40 return -EBADF;
40 41
41 if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) { 42 if (S_ISFIFO(file_inode(f.file)->i_mode)) {
42 ret = -ESPIPE; 43 ret = -ESPIPE;
43 goto out; 44 goto out;
44 } 45 }
@@ -120,9 +121,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
120 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; 121 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
121 end_index = (endbyte >> PAGE_CACHE_SHIFT); 122 end_index = (endbyte >> PAGE_CACHE_SHIFT);
122 123
123 if (end_index >= start_index) 124 if (end_index >= start_index) {
124 invalidate_mapping_pages(mapping, start_index, 125 unsigned long count = invalidate_mapping_pages(mapping,
126 start_index, end_index);
127
128 /*
129 * If fewer pages were invalidated than expected then
130 * it is possible that some of the pages were on
131 * a per-cpu pagevec for a remote CPU. Drain all
132 * pagevecs and try again.
133 */
134 if (count < (end_index - start_index + 1)) {
135 lru_add_drain_all();
136 invalidate_mapping_pages(mapping, start_index,
125 end_index); 137 end_index);
138 }
139 }
126 break; 140 break;
127 default: 141 default:
128 ret = -EINVAL; 142 ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index 83efee76a5c0..e1979fdca805 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1711,7 +1711,7 @@ EXPORT_SYMBOL(filemap_fault);
1711int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1711int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1712{ 1712{
1713 struct page *page = vmf->page; 1713 struct page *page = vmf->page;
1714 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1714 struct inode *inode = file_inode(vma->vm_file);
1715 int ret = VM_FAULT_LOCKED; 1715 int ret = VM_FAULT_LOCKED;
1716 1716
1717 sb_start_pagefault(inode->i_sb); 1717 sb_start_pagefault(inode->i_sb);
@@ -1728,6 +1728,7 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1728 * see the dirty page and writeprotect it again. 1728 * see the dirty page and writeprotect it again.
1729 */ 1729 */
1730 set_page_dirty(page); 1730 set_page_dirty(page);
1731 wait_for_stable_page(page);
1731out: 1732out:
1732 sb_end_pagefault(inode->i_sb); 1733 sb_end_pagefault(inode->i_sb);
1733 return ret; 1734 return ret;
@@ -2056,7 +2057,7 @@ EXPORT_SYMBOL(iov_iter_fault_in_readable);
2056/* 2057/*
2057 * Return the count of just the current iov_iter segment. 2058 * Return the count of just the current iov_iter segment.
2058 */ 2059 */
2059size_t iov_iter_single_seg_count(struct iov_iter *i) 2060size_t iov_iter_single_seg_count(const struct iov_iter *i)
2060{ 2061{
2061 const struct iovec *iov = i->iov; 2062 const struct iovec *iov = i->iov;
2062 if (i->nr_segs == 1) 2063 if (i->nr_segs == 1)
@@ -2274,7 +2275,7 @@ repeat:
2274 return NULL; 2275 return NULL;
2275 } 2276 }
2276found: 2277found:
2277 wait_on_page_writeback(page); 2278 wait_for_stable_page(page);
2278 return page; 2279 return page;
2279} 2280}
2280EXPORT_SYMBOL(grab_cache_page_write_begin); 2281EXPORT_SYMBOL(grab_cache_page_write_begin);
diff --git a/mm/fremap.c b/mm/fremap.c
index a0aaf0e56800..0cd4c11488ed 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -129,6 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
129 struct vm_area_struct *vma; 129 struct vm_area_struct *vma;
130 int err = -EINVAL; 130 int err = -EINVAL;
131 int has_write_lock = 0; 131 int has_write_lock = 0;
132 vm_flags_t vm_flags;
132 133
133 if (prot) 134 if (prot)
134 return err; 135 return err;
@@ -160,15 +161,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
160 /* 161 /*
161 * Make sure the vma is shared, that it supports prefaulting, 162 * Make sure the vma is shared, that it supports prefaulting,
162 * and that the remapped range is valid and fully within 163 * and that the remapped range is valid and fully within
163 * the single existing vma. vm_private_data is used as a 164 * the single existing vma.
164 * swapout cursor in a VM_NONLINEAR vma.
165 */ 165 */
166 if (!vma || !(vma->vm_flags & VM_SHARED)) 166 if (!vma || !(vma->vm_flags & VM_SHARED))
167 goto out; 167 goto out;
168 168
169 if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
170 goto out;
171
172 if (!vma->vm_ops || !vma->vm_ops->remap_pages) 169 if (!vma->vm_ops || !vma->vm_ops->remap_pages)
173 goto out; 170 goto out;
174 171
@@ -177,6 +174,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
177 174
178 /* Must set VM_NONLINEAR before any pages are populated. */ 175 /* Must set VM_NONLINEAR before any pages are populated. */
179 if (!(vma->vm_flags & VM_NONLINEAR)) { 176 if (!(vma->vm_flags & VM_NONLINEAR)) {
177 /*
178 * vm_private_data is used as a swapout cursor
179 * in a VM_NONLINEAR vma.
180 */
181 if (vma->vm_private_data)
182 goto out;
183
180 /* Don't need a nonlinear mapping, exit success */ 184 /* Don't need a nonlinear mapping, exit success */
181 if (pgoff == linear_page_index(vma, start)) { 185 if (pgoff == linear_page_index(vma, start)) {
182 err = 0; 186 err = 0;
@@ -184,6 +188,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
184 } 188 }
185 189
186 if (!has_write_lock) { 190 if (!has_write_lock) {
191get_write_lock:
187 up_read(&mm->mmap_sem); 192 up_read(&mm->mmap_sem);
188 down_write(&mm->mmap_sem); 193 down_write(&mm->mmap_sem);
189 has_write_lock = 1; 194 has_write_lock = 1;
@@ -199,9 +204,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
199 unsigned long addr; 204 unsigned long addr;
200 struct file *file = get_file(vma->vm_file); 205 struct file *file = get_file(vma->vm_file);
201 206
202 flags &= MAP_NONBLOCK; 207 vm_flags = vma->vm_flags;
203 addr = mmap_region(file, start, size, 208 if (!(flags & MAP_NONBLOCK))
204 flags, vma->vm_flags, pgoff); 209 vm_flags |= VM_POPULATE;
210 addr = mmap_region(file, start, size, vm_flags, pgoff);
205 fput(file); 211 fput(file);
206 if (IS_ERR_VALUE(addr)) { 212 if (IS_ERR_VALUE(addr)) {
207 err = addr; 213 err = addr;
@@ -220,32 +226,26 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
220 mutex_unlock(&mapping->i_mmap_mutex); 226 mutex_unlock(&mapping->i_mmap_mutex);
221 } 227 }
222 228
229 if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
230 if (!has_write_lock)
231 goto get_write_lock;
232 vma->vm_flags |= VM_POPULATE;
233 }
234
223 if (vma->vm_flags & VM_LOCKED) { 235 if (vma->vm_flags & VM_LOCKED) {
224 /* 236 /*
225 * drop PG_Mlocked flag for over-mapped range 237 * drop PG_Mlocked flag for over-mapped range
226 */ 238 */
227 vm_flags_t saved_flags = vma->vm_flags; 239 if (!has_write_lock)
240 goto get_write_lock;
241 vm_flags = vma->vm_flags;
228 munlock_vma_pages_range(vma, start, start + size); 242 munlock_vma_pages_range(vma, start, start + size);
229 vma->vm_flags = saved_flags; 243 vma->vm_flags = vm_flags;
230 } 244 }
231 245
232 mmu_notifier_invalidate_range_start(mm, start, start + size); 246 mmu_notifier_invalidate_range_start(mm, start, start + size);
233 err = vma->vm_ops->remap_pages(vma, start, size, pgoff); 247 err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
234 mmu_notifier_invalidate_range_end(mm, start, start + size); 248 mmu_notifier_invalidate_range_end(mm, start, start + size);
235 if (!err && !(flags & MAP_NONBLOCK)) {
236 if (vma->vm_flags & VM_LOCKED) {
237 /*
238 * might be mapping previously unmapped range of file
239 */
240 mlock_vma_pages_range(vma, start, start + size);
241 } else {
242 if (unlikely(has_write_lock)) {
243 downgrade_write(&mm->mmap_sem);
244 has_write_lock = 0;
245 }
246 make_pages_present(start, start+size);
247 }
248 }
249 249
250 /* 250 /*
251 * We can't clear VM_NONLINEAR because we'd have to do 251 * We can't clear VM_NONLINEAR because we'd have to do
@@ -254,10 +254,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
254 */ 254 */
255 255
256out: 256out:
257 vm_flags = vma->vm_flags;
257 if (likely(!has_write_lock)) 258 if (likely(!has_write_lock))
258 up_read(&mm->mmap_sem); 259 up_read(&mm->mmap_sem);
259 else 260 else
260 up_write(&mm->mmap_sem); 261 up_write(&mm->mmap_sem);
262 if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
263 mm_populate(start, size);
261 264
262 return err; 265 return err;
263} 266}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9e894edc7811..e2f7f5aaaafb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -20,6 +20,7 @@
20#include <linux/mman.h> 20#include <linux/mman.h>
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/migrate.h> 22#include <linux/migrate.h>
23#include <linux/hashtable.h>
23 24
24#include <asm/tlb.h> 25#include <asm/tlb.h>
25#include <asm/pgalloc.h> 26#include <asm/pgalloc.h>
@@ -62,12 +63,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
62static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; 63static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
63 64
64static int khugepaged(void *none); 65static int khugepaged(void *none);
65static int mm_slots_hash_init(void);
66static int khugepaged_slab_init(void); 66static int khugepaged_slab_init(void);
67static void khugepaged_slab_free(void);
68 67
69#define MM_SLOTS_HASH_HEADS 1024 68#define MM_SLOTS_HASH_BITS 10
70static struct hlist_head *mm_slots_hash __read_mostly; 69static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
70
71static struct kmem_cache *mm_slot_cache __read_mostly; 71static struct kmem_cache *mm_slot_cache __read_mostly;
72 72
73/** 73/**
@@ -105,7 +105,6 @@ static int set_recommended_min_free_kbytes(void)
105 struct zone *zone; 105 struct zone *zone;
106 int nr_zones = 0; 106 int nr_zones = 0;
107 unsigned long recommended_min; 107 unsigned long recommended_min;
108 extern int min_free_kbytes;
109 108
110 if (!khugepaged_enabled()) 109 if (!khugepaged_enabled())
111 return 0; 110 return 0;
@@ -634,12 +633,6 @@ static int __init hugepage_init(void)
634 if (err) 633 if (err)
635 goto out; 634 goto out;
636 635
637 err = mm_slots_hash_init();
638 if (err) {
639 khugepaged_slab_free();
640 goto out;
641 }
642
643 register_shrinker(&huge_zero_page_shrinker); 636 register_shrinker(&huge_zero_page_shrinker);
644 637
645 /* 638 /*
@@ -1257,6 +1250,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1257 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 1250 if (flags & FOLL_WRITE && !pmd_write(*pmd))
1258 goto out; 1251 goto out;
1259 1252
1253 /* Avoid dumping huge zero page */
1254 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1255 return ERR_PTR(-EFAULT);
1256
1260 page = pmd_page(*pmd); 1257 page = pmd_page(*pmd);
1261 VM_BUG_ON(!PageHead(page)); 1258 VM_BUG_ON(!PageHead(page));
1262 if (flags & FOLL_TOUCH) { 1259 if (flags & FOLL_TOUCH) {
@@ -1298,7 +1295,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1298 int target_nid; 1295 int target_nid;
1299 int current_nid = -1; 1296 int current_nid = -1;
1300 bool migrated; 1297 bool migrated;
1301 bool page_locked = false;
1302 1298
1303 spin_lock(&mm->page_table_lock); 1299 spin_lock(&mm->page_table_lock);
1304 if (unlikely(!pmd_same(pmd, *pmdp))) 1300 if (unlikely(!pmd_same(pmd, *pmdp)))
@@ -1320,7 +1316,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1320 /* Acquire the page lock to serialise THP migrations */ 1316 /* Acquire the page lock to serialise THP migrations */
1321 spin_unlock(&mm->page_table_lock); 1317 spin_unlock(&mm->page_table_lock);
1322 lock_page(page); 1318 lock_page(page);
1323 page_locked = true;
1324 1319
1325 /* Confirm the PTE did not while locked */ 1320 /* Confirm the PTE did not while locked */
1326 spin_lock(&mm->page_table_lock); 1321 spin_lock(&mm->page_table_lock);
@@ -1333,34 +1328,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1333 1328
1334 /* Migrate the THP to the requested node */ 1329 /* Migrate the THP to the requested node */
1335 migrated = migrate_misplaced_transhuge_page(mm, vma, 1330 migrated = migrate_misplaced_transhuge_page(mm, vma,
1336 pmdp, pmd, addr, 1331 pmdp, pmd, addr, page, target_nid);
1337 page, target_nid); 1332 if (!migrated)
1338 if (migrated) 1333 goto check_same;
1339 current_nid = target_nid;
1340 else {
1341 spin_lock(&mm->page_table_lock);
1342 if (unlikely(!pmd_same(pmd, *pmdp))) {
1343 unlock_page(page);
1344 goto out_unlock;
1345 }
1346 goto clear_pmdnuma;
1347 }
1348 1334
1349 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); 1335 task_numa_fault(target_nid, HPAGE_PMD_NR, true);
1350 return 0; 1336 return 0;
1351 1337
1338check_same:
1339 spin_lock(&mm->page_table_lock);
1340 if (unlikely(!pmd_same(pmd, *pmdp)))
1341 goto out_unlock;
1352clear_pmdnuma: 1342clear_pmdnuma:
1353 pmd = pmd_mknonnuma(pmd); 1343 pmd = pmd_mknonnuma(pmd);
1354 set_pmd_at(mm, haddr, pmdp, pmd); 1344 set_pmd_at(mm, haddr, pmdp, pmd);
1355 VM_BUG_ON(pmd_numa(*pmdp)); 1345 VM_BUG_ON(pmd_numa(*pmdp));
1356 update_mmu_cache_pmd(vma, addr, pmdp); 1346 update_mmu_cache_pmd(vma, addr, pmdp);
1357 if (page_locked)
1358 unlock_page(page);
1359
1360out_unlock: 1347out_unlock:
1361 spin_unlock(&mm->page_table_lock); 1348 spin_unlock(&mm->page_table_lock);
1362 if (current_nid != -1) 1349 if (current_nid != -1)
1363 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); 1350 task_numa_fault(current_nid, HPAGE_PMD_NR, false);
1364 return 0; 1351 return 0;
1365} 1352}
1366 1353
@@ -1652,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page)
1652 page_tail->mapping = page->mapping; 1639 page_tail->mapping = page->mapping;
1653 1640
1654 page_tail->index = page->index + i; 1641 page_tail->index = page->index + i;
1655 page_xchg_last_nid(page_tail, page_last_nid(page)); 1642 page_nid_xchg_last(page_tail, page_nid_last(page));
1656 1643
1657 BUG_ON(!PageAnon(page_tail)); 1644 BUG_ON(!PageAnon(page_tail));
1658 BUG_ON(!PageUptodate(page_tail)); 1645 BUG_ON(!PageUptodate(page_tail));
@@ -1819,9 +1806,19 @@ int split_huge_page(struct page *page)
1819 1806
1820 BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); 1807 BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
1821 BUG_ON(!PageAnon(page)); 1808 BUG_ON(!PageAnon(page));
1822 anon_vma = page_lock_anon_vma_read(page); 1809
1810 /*
1811 * The caller does not necessarily hold an mmap_sem that would prevent
1812 * the anon_vma disappearing so we first we take a reference to it
1813 * and then lock the anon_vma for write. This is similar to
1814 * page_lock_anon_vma_read except the write lock is taken to serialise
1815 * against parallel split or collapse operations.
1816 */
1817 anon_vma = page_get_anon_vma(page);
1823 if (!anon_vma) 1818 if (!anon_vma)
1824 goto out; 1819 goto out;
1820 anon_vma_lock_write(anon_vma);
1821
1825 ret = 0; 1822 ret = 0;
1826 if (!PageCompound(page)) 1823 if (!PageCompound(page))
1827 goto out_unlock; 1824 goto out_unlock;
@@ -1832,7 +1829,8 @@ int split_huge_page(struct page *page)
1832 1829
1833 BUG_ON(PageCompound(page)); 1830 BUG_ON(PageCompound(page));
1834out_unlock: 1831out_unlock:
1835 page_unlock_anon_vma_read(anon_vma); 1832 anon_vma_unlock_write(anon_vma);
1833 put_anon_vma(anon_vma);
1836out: 1834out:
1837 return ret; 1835 return ret;
1838} 1836}
@@ -1893,12 +1891,6 @@ static int __init khugepaged_slab_init(void)
1893 return 0; 1891 return 0;
1894} 1892}
1895 1893
1896static void __init khugepaged_slab_free(void)
1897{
1898 kmem_cache_destroy(mm_slot_cache);
1899 mm_slot_cache = NULL;
1900}
1901
1902static inline struct mm_slot *alloc_mm_slot(void) 1894static inline struct mm_slot *alloc_mm_slot(void)
1903{ 1895{
1904 if (!mm_slot_cache) /* initialization failed */ 1896 if (!mm_slot_cache) /* initialization failed */
@@ -1911,47 +1903,22 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
1911 kmem_cache_free(mm_slot_cache, mm_slot); 1903 kmem_cache_free(mm_slot_cache, mm_slot);
1912} 1904}
1913 1905
1914static int __init mm_slots_hash_init(void)
1915{
1916 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1917 GFP_KERNEL);
1918 if (!mm_slots_hash)
1919 return -ENOMEM;
1920 return 0;
1921}
1922
1923#if 0
1924static void __init mm_slots_hash_free(void)
1925{
1926 kfree(mm_slots_hash);
1927 mm_slots_hash = NULL;
1928}
1929#endif
1930
1931static struct mm_slot *get_mm_slot(struct mm_struct *mm) 1906static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1932{ 1907{
1933 struct mm_slot *mm_slot; 1908 struct mm_slot *mm_slot;
1934 struct hlist_head *bucket;
1935 struct hlist_node *node;
1936 1909
1937 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 1910 hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
1938 % MM_SLOTS_HASH_HEADS];
1939 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1940 if (mm == mm_slot->mm) 1911 if (mm == mm_slot->mm)
1941 return mm_slot; 1912 return mm_slot;
1942 } 1913
1943 return NULL; 1914 return NULL;
1944} 1915}
1945 1916
1946static void insert_to_mm_slots_hash(struct mm_struct *mm, 1917static void insert_to_mm_slots_hash(struct mm_struct *mm,
1947 struct mm_slot *mm_slot) 1918 struct mm_slot *mm_slot)
1948{ 1919{
1949 struct hlist_head *bucket;
1950
1951 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1952 % MM_SLOTS_HASH_HEADS];
1953 mm_slot->mm = mm; 1920 mm_slot->mm = mm;
1954 hlist_add_head(&mm_slot->hash, bucket); 1921 hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
1955} 1922}
1956 1923
1957static inline int khugepaged_test_exit(struct mm_struct *mm) 1924static inline int khugepaged_test_exit(struct mm_struct *mm)
@@ -2020,7 +1987,7 @@ void __khugepaged_exit(struct mm_struct *mm)
2020 spin_lock(&khugepaged_mm_lock); 1987 spin_lock(&khugepaged_mm_lock);
2021 mm_slot = get_mm_slot(mm); 1988 mm_slot = get_mm_slot(mm);
2022 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { 1989 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
2023 hlist_del(&mm_slot->hash); 1990 hash_del(&mm_slot->hash);
2024 list_del(&mm_slot->mm_node); 1991 list_del(&mm_slot->mm_node);
2025 free = 1; 1992 free = 1;
2026 } 1993 }
@@ -2353,7 +2320,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2353 BUG_ON(!pmd_none(*pmd)); 2320 BUG_ON(!pmd_none(*pmd));
2354 set_pmd_at(mm, address, pmd, _pmd); 2321 set_pmd_at(mm, address, pmd, _pmd);
2355 spin_unlock(&mm->page_table_lock); 2322 spin_unlock(&mm->page_table_lock);
2356 anon_vma_unlock(vma->anon_vma); 2323 anon_vma_unlock_write(vma->anon_vma);
2357 goto out; 2324 goto out;
2358 } 2325 }
2359 2326
@@ -2361,7 +2328,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2361 * All pages are isolated and locked so anon_vma rmap 2328 * All pages are isolated and locked so anon_vma rmap
2362 * can't run anymore. 2329 * can't run anymore.
2363 */ 2330 */
2364 anon_vma_unlock(vma->anon_vma); 2331 anon_vma_unlock_write(vma->anon_vma);
2365 2332
2366 __collapse_huge_page_copy(pte, new_page, vma, address, ptl); 2333 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
2367 pte_unmap(pte); 2334 pte_unmap(pte);
@@ -2408,7 +2375,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2408 struct page *page; 2375 struct page *page;
2409 unsigned long _address; 2376 unsigned long _address;
2410 spinlock_t *ptl; 2377 spinlock_t *ptl;
2411 int node = -1; 2378 int node = NUMA_NO_NODE;
2412 2379
2413 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2380 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2414 2381
@@ -2438,7 +2405,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2438 * be more sophisticated and look at more pages, 2405 * be more sophisticated and look at more pages,
2439 * but isn't for now. 2406 * but isn't for now.
2440 */ 2407 */
2441 if (node == -1) 2408 if (node == NUMA_NO_NODE)
2442 node = page_to_nid(page); 2409 node = page_to_nid(page);
2443 VM_BUG_ON(PageCompound(page)); 2410 VM_BUG_ON(PageCompound(page));
2444 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2411 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
@@ -2469,7 +2436,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
2469 2436
2470 if (khugepaged_test_exit(mm)) { 2437 if (khugepaged_test_exit(mm)) {
2471 /* free mm_slot */ 2438 /* free mm_slot */
2472 hlist_del(&mm_slot->hash); 2439 hash_del(&mm_slot->hash);
2473 list_del(&mm_slot->mm_node); 2440 list_del(&mm_slot->mm_node);
2474 2441
2475 /* 2442 /*
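
The khugepaged mm_slot bookkeeping above drops its hand-rolled, kzalloc'ed bucket array in favour of the generic <linux/hashtable.h> helpers. A minimal sketch of that API pattern follows; the struct and function names are illustrative only, not the actual khugepaged code.

#include <linux/hashtable.h>
#include <linux/mm_types.h>

struct item {
	struct mm_struct *mm;
	struct hlist_node hash;
};

static DEFINE_HASHTABLE(item_hash, 10);	/* 2^10 buckets, statically sized */

static struct item *item_lookup(struct mm_struct *mm)
{
	struct item *it;

	/* walk only the bucket that 'mm' hashes to */
	hash_for_each_possible(item_hash, it, hash, (unsigned long)mm)
		if (it->mm == mm)
			return it;
	return NULL;
}

static void item_insert(struct item *it, struct mm_struct *mm)
{
	it->mm = mm;
	hash_add(item_hash, &it->hash, (unsigned long)mm);
}

static void item_erase(struct item *it)
{
	hash_del(&it->hash);
}
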
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4f3ea0b1e57c..0a0be33bb199 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -127,7 +127,7 @@ static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
127 127
128static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) 128static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
129{ 129{
130 return subpool_inode(vma->vm_file->f_dentry->d_inode); 130 return subpool_inode(file_inode(vma->vm_file));
131} 131}
132 132
133/* 133/*
@@ -1293,8 +1293,7 @@ static void __init report_hugepages(void)
1293 1293
1294 for_each_hstate(h) { 1294 for_each_hstate(h) {
1295 char buf[32]; 1295 char buf[32];
1296 printk(KERN_INFO "HugeTLB registered %s page size, " 1296 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
1297 "pre-allocated %ld pages\n",
1298 memfmt(buf, huge_page_size(h)), 1297 memfmt(buf, huge_page_size(h)),
1299 h->free_huge_pages); 1298 h->free_huge_pages);
1300 } 1299 }
@@ -1702,8 +1701,7 @@ static void __init hugetlb_sysfs_init(void)
1702 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 1701 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1703 hstate_kobjs, &hstate_attr_group); 1702 hstate_kobjs, &hstate_attr_group);
1704 if (err) 1703 if (err)
1705 printk(KERN_ERR "Hugetlb: Unable to add hstate %s", 1704 pr_err("Hugetlb: Unable to add hstate %s", h->name);
1706 h->name);
1707 } 1705 }
1708} 1706}
1709 1707
@@ -1826,9 +1824,8 @@ void hugetlb_register_node(struct node *node)
1826 nhs->hstate_kobjs, 1824 nhs->hstate_kobjs,
1827 &per_node_hstate_attr_group); 1825 &per_node_hstate_attr_group);
1828 if (err) { 1826 if (err) {
1829 printk(KERN_ERR "Hugetlb: Unable to add hstate %s" 1827 pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
1830 " for node %d\n", 1828 h->name, node->dev.id);
1831 h->name, node->dev.id);
1832 hugetlb_unregister_node(node); 1829 hugetlb_unregister_node(node);
1833 break; 1830 break;
1834 } 1831 }
@@ -1924,7 +1921,7 @@ void __init hugetlb_add_hstate(unsigned order)
1924 unsigned long i; 1921 unsigned long i;
1925 1922
1926 if (size_to_hstate(PAGE_SIZE << order)) { 1923 if (size_to_hstate(PAGE_SIZE << order)) {
1927 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); 1924 pr_warning("hugepagesz= specified twice, ignoring\n");
1928 return; 1925 return;
1929 } 1926 }
1930 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 1927 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -1960,8 +1957,8 @@ static int __init hugetlb_nrpages_setup(char *s)
1960 mhp = &parsed_hstate->max_huge_pages; 1957 mhp = &parsed_hstate->max_huge_pages;
1961 1958
1962 if (mhp == last_mhp) { 1959 if (mhp == last_mhp) {
1963 printk(KERN_WARNING "hugepages= specified twice without " 1960 pr_warning("hugepages= specified twice without "
1964 "interleaving hugepagesz=, ignoring\n"); 1961 "interleaving hugepagesz=, ignoring\n");
1965 return 1; 1962 return 1;
1966 } 1963 }
1967 1964
@@ -2482,7 +2479,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2482 address = address & huge_page_mask(h); 2479 address = address & huge_page_mask(h);
2483 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 2480 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
2484 vma->vm_pgoff; 2481 vma->vm_pgoff;
2485 mapping = vma->vm_file->f_dentry->d_inode->i_mapping; 2482 mapping = file_inode(vma->vm_file)->i_mapping;
2486 2483
2487 /* 2484 /*
2488 * Take the mapping lock for the duration of the table walk. As 2485 * Take the mapping lock for the duration of the table walk. As
@@ -2692,9 +2689,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2692 * COW. Warn that such a situation has occurred as it may not be obvious 2689 * COW. Warn that such a situation has occurred as it may not be obvious
2693 */ 2690 */
2694 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 2691 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2695 printk(KERN_WARNING 2692 pr_warning("PID %d killed due to inadequate hugepage pool\n",
2696 "PID %d killed due to inadequate hugepage pool\n", 2693 current->pid);
2697 current->pid);
2698 return ret; 2694 return ret;
2699 } 2695 }
2700 2696
@@ -2924,14 +2920,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
2924 return NULL; 2920 return NULL;
2925} 2921}
2926 2922
2927int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2923long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2928 struct page **pages, struct vm_area_struct **vmas, 2924 struct page **pages, struct vm_area_struct **vmas,
2929 unsigned long *position, int *length, int i, 2925 unsigned long *position, unsigned long *nr_pages,
2930 unsigned int flags) 2926 long i, unsigned int flags)
2931{ 2927{
2932 unsigned long pfn_offset; 2928 unsigned long pfn_offset;
2933 unsigned long vaddr = *position; 2929 unsigned long vaddr = *position;
2934 int remainder = *length; 2930 unsigned long remainder = *nr_pages;
2935 struct hstate *h = hstate_vma(vma); 2931 struct hstate *h = hstate_vma(vma);
2936 2932
2937 spin_lock(&mm->page_table_lock); 2933 spin_lock(&mm->page_table_lock);
@@ -3001,7 +2997,7 @@ same_page:
3001 } 2997 }
3002 } 2998 }
3003 spin_unlock(&mm->page_table_lock); 2999 spin_unlock(&mm->page_table_lock);
3004 *length = remainder; 3000 *nr_pages = remainder;
3005 *position = vaddr; 3001 *position = vaddr;
3006 3002
3007 return i ? i : -EFAULT; 3003 return i ? i : -EFAULT;
@@ -3033,6 +3029,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3033 if (!huge_pte_none(huge_ptep_get(ptep))) { 3029 if (!huge_pte_none(huge_ptep_get(ptep))) {
3034 pte = huge_ptep_get_and_clear(mm, address, ptep); 3030 pte = huge_ptep_get_and_clear(mm, address, ptep);
3035 pte = pte_mkhuge(pte_modify(pte, newprot)); 3031 pte = pte_mkhuge(pte_modify(pte, newprot));
3032 pte = arch_make_huge_pte(pte, vma, NULL, 0);
3036 set_huge_pte_at(mm, address, ptep, pte); 3033 set_huge_pte_at(mm, address, ptep, pte);
3037 pages++; 3034 pages++;
3038 } 3035 }
diff --git a/mm/internal.h b/mm/internal.h
index d597f94cc205..8562de0a5197 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -135,7 +135,6 @@ struct compact_control {
135 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 135 int migratetype; /* MOVABLE, RECLAIMABLE etc */
136 struct zone *zone; 136 struct zone *zone;
137 bool contended; /* True if a lock was contended */ 137 bool contended; /* True if a lock was contended */
138 struct page **page; /* Page captured of requested size */
139}; 138};
140 139
141unsigned long 140unsigned long
@@ -163,8 +162,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
163 struct vm_area_struct *prev, struct rb_node *rb_parent); 162 struct vm_area_struct *prev, struct rb_node *rb_parent);
164 163
165#ifdef CONFIG_MMU 164#ifdef CONFIG_MMU
166extern long mlock_vma_pages_range(struct vm_area_struct *vma, 165extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
167 unsigned long start, unsigned long end); 166 unsigned long start, unsigned long end, int *nonblocking);
168extern void munlock_vma_pages_range(struct vm_area_struct *vma, 167extern void munlock_vma_pages_range(struct vm_area_struct *vma,
169 unsigned long start, unsigned long end); 168 unsigned long start, unsigned long end);
170static inline void munlock_vma_pages_all(struct vm_area_struct *vma) 169static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
@@ -196,7 +195,7 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
196 * must be called with vma's mmap_sem held for read or write, and page locked. 195 * must be called with vma's mmap_sem held for read or write, and page locked.
197 */ 196 */
198extern void mlock_vma_page(struct page *page); 197extern void mlock_vma_page(struct page *page);
199extern void munlock_vma_page(struct page *page); 198extern unsigned int munlock_vma_page(struct page *page);
200 199
201/* 200/*
202 * Clear the page's PageMlocked(). This can be useful in a situation where 201 * Clear the page's PageMlocked(). This can be useful in a situation where
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 752a705c77c2..c8d7f3110fd0 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -436,7 +436,7 @@ static int get_object(struct kmemleak_object *object)
436 */ 436 */
437static void free_object_rcu(struct rcu_head *rcu) 437static void free_object_rcu(struct rcu_head *rcu)
438{ 438{
439 struct hlist_node *elem, *tmp; 439 struct hlist_node *tmp;
440 struct kmemleak_scan_area *area; 440 struct kmemleak_scan_area *area;
441 struct kmemleak_object *object = 441 struct kmemleak_object *object =
442 container_of(rcu, struct kmemleak_object, rcu); 442 container_of(rcu, struct kmemleak_object, rcu);
@@ -445,8 +445,8 @@ static void free_object_rcu(struct rcu_head *rcu)
445 * Once use_count is 0 (guaranteed by put_object), there is no other 445 * Once use_count is 0 (guaranteed by put_object), there is no other
446 * code accessing this object, hence no need for locking. 446 * code accessing this object, hence no need for locking.
447 */ 447 */
448 hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) { 448 hlist_for_each_entry_safe(area, tmp, &object->area_list, node) {
449 hlist_del(elem); 449 hlist_del(&area->node);
450 kmem_cache_free(scan_area_cache, area); 450 kmem_cache_free(scan_area_cache, area);
451 } 451 }
452 kmem_cache_free(object_cache, object); 452 kmem_cache_free(object_cache, object);
@@ -1177,7 +1177,6 @@ static void scan_block(void *_start, void *_end,
1177static void scan_object(struct kmemleak_object *object) 1177static void scan_object(struct kmemleak_object *object)
1178{ 1178{
1179 struct kmemleak_scan_area *area; 1179 struct kmemleak_scan_area *area;
1180 struct hlist_node *elem;
1181 unsigned long flags; 1180 unsigned long flags;
1182 1181
1183 /* 1182 /*
@@ -1205,7 +1204,7 @@ static void scan_object(struct kmemleak_object *object)
1205 spin_lock_irqsave(&object->lock, flags); 1204 spin_lock_irqsave(&object->lock, flags);
1206 } 1205 }
1207 } else 1206 } else
1208 hlist_for_each_entry(area, elem, &object->area_list, node) 1207 hlist_for_each_entry(area, &object->area_list, node)
1209 scan_block((void *)area->start, 1208 scan_block((void *)area->start,
1210 (void *)(area->start + area->size), 1209 (void *)(area->start + area->size),
1211 object, 0); 1210 object, 0);
@@ -1300,9 +1299,8 @@ static void kmemleak_scan(void)
1300 */ 1299 */
1301 lock_memory_hotplug(); 1300 lock_memory_hotplug();
1302 for_each_online_node(i) { 1301 for_each_online_node(i) {
1303 pg_data_t *pgdat = NODE_DATA(i); 1302 unsigned long start_pfn = node_start_pfn(i);
1304 unsigned long start_pfn = pgdat->node_start_pfn; 1303 unsigned long end_pfn = node_end_pfn(i);
1305 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1306 unsigned long pfn; 1304 unsigned long pfn;
1307 1305
1308 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1306 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
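The kmemleak hunks above, like several ksm.c hunks below, track this release's hlist iterator cleanup: hlist_for_each_entry() and hlist_for_each_entry_safe() no longer take a separate struct hlist_node cursor, because the typed entry pointer is now the cursor itself. A minimal sketch of the new calling convention, using a hypothetical struct item with an embedded hlist_node named link (illustrative only, not part of the patch):

	#include <linux/list.h>
	#include <linux/printk.h>
	#include <linux/slab.h>

	struct item {
		int value;
		struct hlist_node link;
	};

	static void drain_items(struct hlist_head *head)
	{
		struct item *it;	/* the entry pointer doubles as the cursor */
		struct hlist_node *tmp;	/* only the _safe variant needs scratch space */

		hlist_for_each_entry(it, head, link)
			pr_debug("item %d\n", it->value);

		hlist_for_each_entry_safe(it, tmp, head, link) {
			hlist_del(&it->link);	/* delete through the embedded node */
			kfree(it);
		}
	}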
diff --git a/mm/ksm.c b/mm/ksm.c
index 51573858938d..85bfd4c16346 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,13 +33,22 @@
33#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hashtable.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/oom.h> 38#include <linux/oom.h>
39#include <linux/numa.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
42 43
44#ifdef CONFIG_NUMA
45#define NUMA(x) (x)
46#define DO_NUMA(x) do { (x); } while (0)
47#else
48#define NUMA(x) (0)
49#define DO_NUMA(x) do { } while (0)
50#endif
51
43/* 52/*
44 * A few notes about the KSM scanning process, 53 * A few notes about the KSM scanning process,
45 * to make it easier to understand the data structures below: 54 * to make it easier to understand the data structures below:
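The NUMA() and DO_NUMA() wrappers added above let the per-node bookkeeping compile away on !CONFIG_NUMA builds: the non-NUMA definitions drop their argument during preprocessing, so expressions naming the nid fields never reach the compiler and the fields themselves can stay inside #ifdef CONFIG_NUMA. For example, two statements that appear later in this patch expand as follows (expansion shown for illustration, assuming the definitions above):

	/* CONFIG_NUMA=y: the wrappers are transparent */
	rb_erase(&rmap_item->node, root_unstable_tree + NUMA(rmap_item->nid));
		/* -> rb_erase(&rmap_item->node, root_unstable_tree + (rmap_item->nid)); */
	DO_NUMA(rmap_item->nid = nid);
		/* -> do { (rmap_item->nid = nid); } while (0); */

	/* CONFIG_NUMA=n: the argument is discarded, so rmap_item->nid
	 * need not exist in this configuration at all */
	rb_erase(&rmap_item->node, root_unstable_tree + NUMA(rmap_item->nid));
		/* -> rb_erase(&rmap_item->node, root_unstable_tree + (0)); */
	DO_NUMA(rmap_item->nid = nid);
		/* -> do { } while (0); */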
@@ -78,6 +87,9 @@
78 * take 10 attempts to find a page in the unstable tree, once it is found, 87 * take 10 attempts to find a page in the unstable tree, once it is found,
79 * it is secured in the stable tree. (When we scan a new page, we first 88 * it is secured in the stable tree. (When we scan a new page, we first
80 * compare it against the stable tree, and then against the unstable tree.) 89 * compare it against the stable tree, and then against the unstable tree.)
90 *
91 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
92 * stable trees and multiple unstable trees: one of each for each NUMA node.
81 */ 93 */
82 94
83/** 95/**
@@ -113,19 +125,32 @@ struct ksm_scan {
113/** 125/**
114 * struct stable_node - node of the stable rbtree 126 * struct stable_node - node of the stable rbtree
115 * @node: rb node of this ksm page in the stable tree 127 * @node: rb node of this ksm page in the stable tree
128 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
129 * @list: linked into migrate_nodes, pending placement in the proper node tree
116 * @hlist: hlist head of rmap_items using this ksm page 130 * @hlist: hlist head of rmap_items using this ksm page
117 * @kpfn: page frame number of this ksm page 131 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
132 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
118 */ 133 */
119struct stable_node { 134struct stable_node {
120 struct rb_node node; 135 union {
136 struct rb_node node; /* when node of stable tree */
137 struct { /* when listed for migration */
138 struct list_head *head;
139 struct list_head list;
140 };
141 };
121 struct hlist_head hlist; 142 struct hlist_head hlist;
122 unsigned long kpfn; 143 unsigned long kpfn;
144#ifdef CONFIG_NUMA
145 int nid;
146#endif
123}; 147};
124 148
125/** 149/**
126 * struct rmap_item - reverse mapping item for virtual addresses 150 * struct rmap_item - reverse mapping item for virtual addresses
127 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list 151 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
128 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree 152 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
153 * @nid: NUMA node id of unstable tree in which linked (may not match page)
129 * @mm: the memory structure this rmap_item is pointing into 154 * @mm: the memory structure this rmap_item is pointing into
130 * @address: the virtual address this rmap_item tracks (+ flags in low bits) 155 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
131 * @oldchecksum: previous checksum of the page at that virtual address 156 * @oldchecksum: previous checksum of the page at that virtual address
@@ -135,7 +160,12 @@ struct stable_node {
135 */ 160 */
136struct rmap_item { 161struct rmap_item {
137 struct rmap_item *rmap_list; 162 struct rmap_item *rmap_list;
138 struct anon_vma *anon_vma; /* when stable */ 163 union {
164 struct anon_vma *anon_vma; /* when stable */
165#ifdef CONFIG_NUMA
166 int nid; /* when node of unstable tree */
167#endif
168 };
139 struct mm_struct *mm; 169 struct mm_struct *mm;
140 unsigned long address; /* + low bits used for flags below */ 170 unsigned long address; /* + low bits used for flags below */
141 unsigned int oldchecksum; /* when unstable */ 171 unsigned int oldchecksum; /* when unstable */
@@ -153,12 +183,16 @@ struct rmap_item {
153#define STABLE_FLAG 0x200 /* is listed from the stable tree */ 183#define STABLE_FLAG 0x200 /* is listed from the stable tree */
154 184
155/* The stable and unstable tree heads */ 185/* The stable and unstable tree heads */
156static struct rb_root root_stable_tree = RB_ROOT; 186static struct rb_root one_stable_tree[1] = { RB_ROOT };
157static struct rb_root root_unstable_tree = RB_ROOT; 187static struct rb_root one_unstable_tree[1] = { RB_ROOT };
188static struct rb_root *root_stable_tree = one_stable_tree;
189static struct rb_root *root_unstable_tree = one_unstable_tree;
158 190
159#define MM_SLOTS_HASH_SHIFT 10 191/* Recently migrated nodes of stable tree, pending proper placement */
160#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) 192static LIST_HEAD(migrate_nodes);
161static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; 193
194#define MM_SLOTS_HASH_BITS 10
195static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
162 196
163static struct mm_slot ksm_mm_head = { 197static struct mm_slot ksm_mm_head = {
164 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), 198 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -189,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100;
189/* Milliseconds ksmd should sleep between batches */ 223/* Milliseconds ksmd should sleep between batches */
190static unsigned int ksm_thread_sleep_millisecs = 20; 224static unsigned int ksm_thread_sleep_millisecs = 20;
191 225
226#ifdef CONFIG_NUMA
227/* Zeroed when merging across nodes is not allowed */
228static unsigned int ksm_merge_across_nodes = 1;
229static int ksm_nr_node_ids = 1;
230#else
231#define ksm_merge_across_nodes 1U
232#define ksm_nr_node_ids 1
233#endif
234
192#define KSM_RUN_STOP 0 235#define KSM_RUN_STOP 0
193#define KSM_RUN_MERGE 1 236#define KSM_RUN_MERGE 1
194#define KSM_RUN_UNMERGE 2 237#define KSM_RUN_UNMERGE 2
195static unsigned int ksm_run = KSM_RUN_STOP; 238#define KSM_RUN_OFFLINE 4
239static unsigned long ksm_run = KSM_RUN_STOP;
240static void wait_while_offlining(void);
196 241
197static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); 242static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
198static DEFINE_MUTEX(ksm_thread_mutex); 243static DEFINE_MUTEX(ksm_thread_mutex);
@@ -275,31 +320,20 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
275 320
276static struct mm_slot *get_mm_slot(struct mm_struct *mm) 321static struct mm_slot *get_mm_slot(struct mm_struct *mm)
277{ 322{
278 struct mm_slot *mm_slot; 323 struct mm_slot *slot;
279 struct hlist_head *bucket; 324
280 struct hlist_node *node; 325 hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
326 if (slot->mm == mm)
327 return slot;
281 328
282 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
283 hlist_for_each_entry(mm_slot, node, bucket, link) {
284 if (mm == mm_slot->mm)
285 return mm_slot;
286 }
287 return NULL; 329 return NULL;
288} 330}
289 331
290static void insert_to_mm_slots_hash(struct mm_struct *mm, 332static void insert_to_mm_slots_hash(struct mm_struct *mm,
291 struct mm_slot *mm_slot) 333 struct mm_slot *mm_slot)
292{ 334{
293 struct hlist_head *bucket;
294
295 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
296 mm_slot->mm = mm; 335 mm_slot->mm = mm;
297 hlist_add_head(&mm_slot->link, bucket); 336 hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
298}
299
300static inline int in_stable_tree(struct rmap_item *rmap_item)
301{
302 return rmap_item->address & STABLE_FLAG;
303} 337}
304 338
305/* 339/*
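The mm_slots_hash conversion above swaps an open-coded bucket array (hash_ptr() plus hlist_add_head()) for the generic <linux/hashtable.h> helpers. A minimal sketch of that API, using a hypothetical struct foo keyed by a pointer; the names foo, foo_hash, add_foo, find_foo and del_foo are illustrative, not from the patch:

	#include <linux/hashtable.h>
	#include <linux/slab.h>

	struct foo {
		void *key;
		struct hlist_node link;
	};

	static DEFINE_HASHTABLE(foo_hash, 10);	/* 2^10 buckets, statically sized */

	static void add_foo(struct foo *f, void *key)
	{
		f->key = key;
		hash_add(foo_hash, &f->link, (unsigned long)key);
	}

	static struct foo *find_foo(void *key)
	{
		struct foo *f;

		/* walks only the bucket that the key hashes to */
		hash_for_each_possible(foo_hash, f, link, (unsigned long)key)
			if (f->key == key)
				return f;
		return NULL;
	}

	static void del_foo(struct foo *f)
	{
		hash_del(&f->link);
		kfree(f);
	}

get_mm_slot() and insert_to_mm_slots_hash() above follow exactly this shape, with the mm_struct pointer as the key.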
@@ -333,7 +367,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
333 367
334 do { 368 do {
335 cond_resched(); 369 cond_resched();
336 page = follow_page(vma, addr, FOLL_GET); 370 page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
337 if (IS_ERR_OR_NULL(page)) 371 if (IS_ERR_OR_NULL(page))
338 break; 372 break;
339 if (PageKsm(page)) 373 if (PageKsm(page))
@@ -447,12 +481,22 @@ out: page = NULL;
447 return page; 481 return page;
448} 482}
449 483
484/*
485 * This helper is used for getting the right index into the array of tree roots.
486 * When the merge_across_nodes knob is set to 1, there are only two rb-trees in
487 * total, for stable and unstable pages from all nodes, both at index 0. Otherwise,
488 * every node has its own stable and unstable tree.
489 */
490static inline int get_kpfn_nid(unsigned long kpfn)
491{
492 return ksm_merge_across_nodes ? 0 : pfn_to_nid(kpfn);
493}
494
450static void remove_node_from_stable_tree(struct stable_node *stable_node) 495static void remove_node_from_stable_tree(struct stable_node *stable_node)
451{ 496{
452 struct rmap_item *rmap_item; 497 struct rmap_item *rmap_item;
453 struct hlist_node *hlist;
454 498
455 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 499 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
456 if (rmap_item->hlist.next) 500 if (rmap_item->hlist.next)
457 ksm_pages_sharing--; 501 ksm_pages_sharing--;
458 else 502 else
@@ -462,7 +506,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
462 cond_resched(); 506 cond_resched();
463 } 507 }
464 508
465 rb_erase(&stable_node->node, &root_stable_tree); 509 if (stable_node->head == &migrate_nodes)
510 list_del(&stable_node->list);
511 else
512 rb_erase(&stable_node->node,
513 root_stable_tree + NUMA(stable_node->nid));
466 free_stable_node(stable_node); 514 free_stable_node(stable_node);
467} 515}
468 516
@@ -472,6 +520,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
472 * In which case we can trust the content of the page, and it 520 * In which case we can trust the content of the page, and it
473 * returns the gotten page; but if the page has now been zapped, 521 * returns the gotten page; but if the page has now been zapped,
474 * remove the stale node from the stable tree and return NULL. 522 * remove the stale node from the stable tree and return NULL.
523 * But beware, the stable node's page might be being migrated.
475 * 524 *
476 * You would expect the stable_node to hold a reference to the ksm page. 525 * You would expect the stable_node to hold a reference to the ksm page.
477 * But if it increments the page's count, swapping out has to wait for 526 * But if it increments the page's count, swapping out has to wait for
@@ -482,40 +531,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
482 * pointing back to this stable node. This relies on freeing a PageAnon 531 * pointing back to this stable node. This relies on freeing a PageAnon
483 * page to reset its page->mapping to NULL, and relies on no other use of 532 * page to reset its page->mapping to NULL, and relies on no other use of
484 * a page to put something that might look like our key in page->mapping. 533 * a page to put something that might look like our key in page->mapping.
485 *
486 * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
487 * but this is different - made simpler by ksm_thread_mutex being held, but
488 * interesting for assuming that no other use of the struct page could ever
489 * put our expected_mapping into page->mapping (or a field of the union which
490 * coincides with page->mapping). The RCU calls are not for KSM at all, but
491 * to keep the page_count protocol described with page_cache_get_speculative.
492 *
493 * Note: it is possible that get_ksm_page() will return NULL one moment,
494 * then page the next, if the page is in between page_freeze_refs() and
495 * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
496 * is on its way to being freed; but it is an anomaly to bear in mind. 534 * is on its way to being freed; but it is an anomaly to bear in mind.
497 */ 535 */
498static struct page *get_ksm_page(struct stable_node *stable_node) 536static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
499{ 537{
500 struct page *page; 538 struct page *page;
501 void *expected_mapping; 539 void *expected_mapping;
540 unsigned long kpfn;
502 541
503 page = pfn_to_page(stable_node->kpfn);
504 expected_mapping = (void *)stable_node + 542 expected_mapping = (void *)stable_node +
505 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 543 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
506 rcu_read_lock(); 544again:
507 if (page->mapping != expected_mapping) 545 kpfn = ACCESS_ONCE(stable_node->kpfn);
508 goto stale; 546 page = pfn_to_page(kpfn);
509 if (!get_page_unless_zero(page)) 547
548 /*
549 * page is computed from kpfn, so on most architectures reading
550 * page->mapping is naturally ordered after reading node->kpfn,
551 * but on Alpha we need to be more careful.
552 */
553 smp_read_barrier_depends();
554 if (ACCESS_ONCE(page->mapping) != expected_mapping)
510 goto stale; 555 goto stale;
511 if (page->mapping != expected_mapping) { 556
557 /*
558 * We cannot do anything with the page while its refcount is 0.
559 * Usually 0 means free, or tail of a higher-order page: in which
560 * case this node is no longer referenced, and should be freed;
561 * however, it might mean that the page is under page_freeze_refs().
562 * The __remove_mapping() case is easy, again the node is now stale;
563 * but if page is swapcache in migrate_page_move_mapping(), it might
564 * still be our page, in which case it's essential to keep the node.
565 */
566 while (!get_page_unless_zero(page)) {
567 /*
568 * Another check for page->mapping != expected_mapping would
569 * work here too. We have chosen the !PageSwapCache test to
570 * optimize the common case, when the page is or is about to
571 * be freed: PageSwapCache is cleared (under spin_lock_irq)
572 * in the freeze_refs section of __remove_mapping(); but Anon
573 * page->mapping reset to NULL later, in free_pages_prepare().
574 */
575 if (!PageSwapCache(page))
576 goto stale;
577 cpu_relax();
578 }
579
580 if (ACCESS_ONCE(page->mapping) != expected_mapping) {
512 put_page(page); 581 put_page(page);
513 goto stale; 582 goto stale;
514 } 583 }
515 rcu_read_unlock(); 584
585 if (lock_it) {
586 lock_page(page);
587 if (ACCESS_ONCE(page->mapping) != expected_mapping) {
588 unlock_page(page);
589 put_page(page);
590 goto stale;
591 }
592 }
516 return page; 593 return page;
594
517stale: 595stale:
518 rcu_read_unlock(); 596 /*
597 * We come here from above when page->mapping or !PageSwapCache
598 * suggests that the node is stale; but it might be under migration.
599 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
600 * before checking whether node->kpfn has been changed.
601 */
602 smp_rmb();
603 if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
604 goto again;
519 remove_node_from_stable_tree(stable_node); 605 remove_node_from_stable_tree(stable_node);
520 return NULL; 606 return NULL;
521} 607}
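Condensed, the rewritten get_ksm_page() above is a speculative-reference idiom: read the key (kpfn), validate page->mapping, take a reference only if the count is non-zero, then re-validate with the reference held; and a node that looks stale may merely be mid-migration, so kpfn is re-read after an smp_rmb() that pairs with the smp_wmb() added to ksm_migrate_page() later in this patch. A stripped-down sketch of that flow, written as if inside ksm.c (the PageSwapCache busy-wait for pages caught in swapcache migration and the optional page lock are omitted; get_ksm_page_sketch is an illustrative name):

	static struct page *get_ksm_page_sketch(struct stable_node *stable_node,
						void *expected_mapping)
	{
		struct page *page;
		unsigned long kpfn;
	again:
		kpfn = ACCESS_ONCE(stable_node->kpfn);
		page = pfn_to_page(kpfn);
		smp_read_barrier_depends();	/* order ->mapping read after ->kpfn read */

		if (ACCESS_ONCE(page->mapping) != expected_mapping)
			goto stale;
		if (!get_page_unless_zero(page))	/* refcount 0: free or frozen */
			goto stale;
		if (ACCESS_ONCE(page->mapping) != expected_mapping) {
			put_page(page);			/* lost a race: drop our reference */
			goto stale;
		}
		return page;
	stale:
		smp_rmb();		/* pairs with smp_wmb() in ksm_migrate_page() */
		if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
			goto again;	/* kpfn moved: the page was migrated, retry */
		remove_node_from_stable_tree(stable_node);
		return NULL;
	}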
@@ -531,11 +617,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
531 struct page *page; 617 struct page *page;
532 618
533 stable_node = rmap_item->head; 619 stable_node = rmap_item->head;
534 page = get_ksm_page(stable_node); 620 page = get_ksm_page(stable_node, true);
535 if (!page) 621 if (!page)
536 goto out; 622 goto out;
537 623
538 lock_page(page);
539 hlist_del(&rmap_item->hlist); 624 hlist_del(&rmap_item->hlist);
540 unlock_page(page); 625 unlock_page(page);
541 put_page(page); 626 put_page(page);
@@ -560,8 +645,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
560 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); 645 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
561 BUG_ON(age > 1); 646 BUG_ON(age > 1);
562 if (!age) 647 if (!age)
563 rb_erase(&rmap_item->node, &root_unstable_tree); 648 rb_erase(&rmap_item->node,
564 649 root_unstable_tree + NUMA(rmap_item->nid));
565 ksm_pages_unshared--; 650 ksm_pages_unshared--;
566 rmap_item->address &= PAGE_MASK; 651 rmap_item->address &= PAGE_MASK;
567 } 652 }
@@ -581,7 +666,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
581} 666}
582 667
583/* 668/*
584 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather 669 * Though it's very tempting to unmerge rmap_items from stable tree rather
585 * than check every pte of a given vma, the locking doesn't quite work for 670 * than check every pte of a given vma, the locking doesn't quite work for
586 * that - an rmap_item is assigned to the stable tree after inserting ksm 671 * that - an rmap_item is assigned to the stable tree after inserting ksm
587 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing 672 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
@@ -614,6 +699,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
614/* 699/*
615 * Only called through the sysfs control interface: 700 * Only called through the sysfs control interface:
616 */ 701 */
702static int remove_stable_node(struct stable_node *stable_node)
703{
704 struct page *page;
705 int err;
706
707 page = get_ksm_page(stable_node, true);
708 if (!page) {
709 /*
710 * get_ksm_page did remove_node_from_stable_tree itself.
711 */
712 return 0;
713 }
714
715 if (WARN_ON_ONCE(page_mapped(page))) {
716 /*
717 * This should not happen: but if it does, just refuse to let
718 * merge_across_nodes be switched - there is no need to panic.
719 */
720 err = -EBUSY;
721 } else {
722 /*
723 * The stable node did not yet appear stale to get_ksm_page(),
724 * since that allows for an unmapped ksm page to be recognized
725 * right up until it is freed; but the node is safe to remove.
726 * This page might be in a pagevec waiting to be freed,
727 * or it might be PageSwapCache (perhaps under writeback),
728 * or it might have been removed from swapcache a moment ago.
729 */
730 set_page_stable_node(page, NULL);
731 remove_node_from_stable_tree(stable_node);
732 err = 0;
733 }
734
735 unlock_page(page);
736 put_page(page);
737 return err;
738}
739
740static int remove_all_stable_nodes(void)
741{
742 struct stable_node *stable_node;
743 struct list_head *this, *next;
744 int nid;
745 int err = 0;
746
747 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
748 while (root_stable_tree[nid].rb_node) {
749 stable_node = rb_entry(root_stable_tree[nid].rb_node,
750 struct stable_node, node);
751 if (remove_stable_node(stable_node)) {
752 err = -EBUSY;
753 break; /* proceed to next nid */
754 }
755 cond_resched();
756 }
757 }
758 list_for_each_safe(this, next, &migrate_nodes) {
759 stable_node = list_entry(this, struct stable_node, list);
760 if (remove_stable_node(stable_node))
761 err = -EBUSY;
762 cond_resched();
763 }
764 return err;
765}
766
617static int unmerge_and_remove_all_rmap_items(void) 767static int unmerge_and_remove_all_rmap_items(void)
618{ 768{
619 struct mm_slot *mm_slot; 769 struct mm_slot *mm_slot;
@@ -647,7 +797,7 @@ static int unmerge_and_remove_all_rmap_items(void)
647 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, 797 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
648 struct mm_slot, mm_list); 798 struct mm_slot, mm_list);
649 if (ksm_test_exit(mm)) { 799 if (ksm_test_exit(mm)) {
650 hlist_del(&mm_slot->link); 800 hash_del(&mm_slot->link);
651 list_del(&mm_slot->mm_list); 801 list_del(&mm_slot->mm_list);
652 spin_unlock(&ksm_mmlist_lock); 802 spin_unlock(&ksm_mmlist_lock);
653 803
@@ -661,6 +811,8 @@ static int unmerge_and_remove_all_rmap_items(void)
661 } 811 }
662 } 812 }
663 813
814 /* Clean up stable nodes, but don't worry if some are still busy */
815 remove_all_stable_nodes();
664 ksm_scan.seqnr = 0; 816 ksm_scan.seqnr = 0;
665 return 0; 817 return 0;
666 818
@@ -946,6 +1098,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
946 if (err) 1098 if (err)
947 goto out; 1099 goto out;
948 1100
1101 /* Unstable nid is in union with stable anon_vma: remove first */
1102 remove_rmap_item_from_tree(rmap_item);
1103
949 /* Must get reference to anon_vma while still holding mmap_sem */ 1104 /* Must get reference to anon_vma while still holding mmap_sem */
950 rmap_item->anon_vma = vma->anon_vma; 1105 rmap_item->anon_vma = vma->anon_vma;
951 get_anon_vma(vma->anon_vma); 1106 get_anon_vma(vma->anon_vma);
@@ -996,42 +1151,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
996 */ 1151 */
997static struct page *stable_tree_search(struct page *page) 1152static struct page *stable_tree_search(struct page *page)
998{ 1153{
999 struct rb_node *node = root_stable_tree.rb_node; 1154 int nid;
1155 struct rb_root *root;
1156 struct rb_node **new;
1157 struct rb_node *parent;
1000 struct stable_node *stable_node; 1158 struct stable_node *stable_node;
1159 struct stable_node *page_node;
1001 1160
1002 stable_node = page_stable_node(page); 1161 page_node = page_stable_node(page);
1003 if (stable_node) { /* ksm page forked */ 1162 if (page_node && page_node->head != &migrate_nodes) {
1163 /* ksm page forked */
1004 get_page(page); 1164 get_page(page);
1005 return page; 1165 return page;
1006 } 1166 }
1007 1167
1008 while (node) { 1168 nid = get_kpfn_nid(page_to_pfn(page));
1169 root = root_stable_tree + nid;
1170again:
1171 new = &root->rb_node;
1172 parent = NULL;
1173
1174 while (*new) {
1009 struct page *tree_page; 1175 struct page *tree_page;
1010 int ret; 1176 int ret;
1011 1177
1012 cond_resched(); 1178 cond_resched();
1013 stable_node = rb_entry(node, struct stable_node, node); 1179 stable_node = rb_entry(*new, struct stable_node, node);
1014 tree_page = get_ksm_page(stable_node); 1180 tree_page = get_ksm_page(stable_node, false);
1015 if (!tree_page) 1181 if (!tree_page)
1016 return NULL; 1182 return NULL;
1017 1183
1018 ret = memcmp_pages(page, tree_page); 1184 ret = memcmp_pages(page, tree_page);
1185 put_page(tree_page);
1019 1186
1020 if (ret < 0) { 1187 parent = *new;
1021 put_page(tree_page); 1188 if (ret < 0)
1022 node = node->rb_left; 1189 new = &parent->rb_left;
1023 } else if (ret > 0) { 1190 else if (ret > 0)
1024 put_page(tree_page); 1191 new = &parent->rb_right;
1025 node = node->rb_right; 1192 else {
1026 } else 1193 /*
1027 return tree_page; 1194 * Lock and unlock the stable_node's page (which
1195 * might already have been migrated) so that page
1196 * migration is sure to notice its raised count.
1197 * It would be more elegant to return stable_node
1198 * than kpage, but that involves more changes.
1199 */
1200 tree_page = get_ksm_page(stable_node, true);
1201 if (tree_page) {
1202 unlock_page(tree_page);
1203 if (get_kpfn_nid(stable_node->kpfn) !=
1204 NUMA(stable_node->nid)) {
1205 put_page(tree_page);
1206 goto replace;
1207 }
1208 return tree_page;
1209 }
1210 /*
1211 * There is now a place for page_node, but the tree may
1212 * have been rebalanced, so re-evaluate parent and new.
1213 */
1214 if (page_node)
1215 goto again;
1216 return NULL;
1217 }
1028 } 1218 }
1029 1219
1030 return NULL; 1220 if (!page_node)
1221 return NULL;
1222
1223 list_del(&page_node->list);
1224 DO_NUMA(page_node->nid = nid);
1225 rb_link_node(&page_node->node, parent, new);
1226 rb_insert_color(&page_node->node, root);
1227 get_page(page);
1228 return page;
1229
1230replace:
1231 if (page_node) {
1232 list_del(&page_node->list);
1233 DO_NUMA(page_node->nid = nid);
1234 rb_replace_node(&stable_node->node, &page_node->node, root);
1235 get_page(page);
1236 } else {
1237 rb_erase(&stable_node->node, root);
1238 page = NULL;
1239 }
1240 stable_node->head = &migrate_nodes;
1241 list_add(&stable_node->list, stable_node->head);
1242 return page;
1031} 1243}
1032 1244
1033/* 1245/*
1034 * stable_tree_insert - insert rmap_item pointing to new ksm page 1246 * stable_tree_insert - insert stable tree node pointing to new ksm page
1035 * into the stable tree. 1247 * into the stable tree.
1036 * 1248 *
1037 * This function returns the stable tree node just allocated on success, 1249 * This function returns the stable tree node just allocated on success,
@@ -1039,17 +1251,25 @@ static struct page *stable_tree_search(struct page *page)
1039 */ 1251 */
1040static struct stable_node *stable_tree_insert(struct page *kpage) 1252static struct stable_node *stable_tree_insert(struct page *kpage)
1041{ 1253{
1042 struct rb_node **new = &root_stable_tree.rb_node; 1254 int nid;
1255 unsigned long kpfn;
1256 struct rb_root *root;
1257 struct rb_node **new;
1043 struct rb_node *parent = NULL; 1258 struct rb_node *parent = NULL;
1044 struct stable_node *stable_node; 1259 struct stable_node *stable_node;
1045 1260
1261 kpfn = page_to_pfn(kpage);
1262 nid = get_kpfn_nid(kpfn);
1263 root = root_stable_tree + nid;
1264 new = &root->rb_node;
1265
1046 while (*new) { 1266 while (*new) {
1047 struct page *tree_page; 1267 struct page *tree_page;
1048 int ret; 1268 int ret;
1049 1269
1050 cond_resched(); 1270 cond_resched();
1051 stable_node = rb_entry(*new, struct stable_node, node); 1271 stable_node = rb_entry(*new, struct stable_node, node);
1052 tree_page = get_ksm_page(stable_node); 1272 tree_page = get_ksm_page(stable_node, false);
1053 if (!tree_page) 1273 if (!tree_page)
1054 return NULL; 1274 return NULL;
1055 1275
@@ -1075,13 +1295,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
1075 if (!stable_node) 1295 if (!stable_node)
1076 return NULL; 1296 return NULL;
1077 1297
1078 rb_link_node(&stable_node->node, parent, new);
1079 rb_insert_color(&stable_node->node, &root_stable_tree);
1080
1081 INIT_HLIST_HEAD(&stable_node->hlist); 1298 INIT_HLIST_HEAD(&stable_node->hlist);
1082 1299 stable_node->kpfn = kpfn;
1083 stable_node->kpfn = page_to_pfn(kpage);
1084 set_page_stable_node(kpage, stable_node); 1300 set_page_stable_node(kpage, stable_node);
1301 DO_NUMA(stable_node->nid = nid);
1302 rb_link_node(&stable_node->node, parent, new);
1303 rb_insert_color(&stable_node->node, root);
1085 1304
1086 return stable_node; 1305 return stable_node;
1087} 1306}
@@ -1104,10 +1323,15 @@ static
1104struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, 1323struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1105 struct page *page, 1324 struct page *page,
1106 struct page **tree_pagep) 1325 struct page **tree_pagep)
1107
1108{ 1326{
1109 struct rb_node **new = &root_unstable_tree.rb_node; 1327 struct rb_node **new;
1328 struct rb_root *root;
1110 struct rb_node *parent = NULL; 1329 struct rb_node *parent = NULL;
1330 int nid;
1331
1332 nid = get_kpfn_nid(page_to_pfn(page));
1333 root = root_unstable_tree + nid;
1334 new = &root->rb_node;
1111 1335
1112 while (*new) { 1336 while (*new) {
1113 struct rmap_item *tree_rmap_item; 1337 struct rmap_item *tree_rmap_item;
@@ -1137,6 +1361,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1137 } else if (ret > 0) { 1361 } else if (ret > 0) {
1138 put_page(tree_page); 1362 put_page(tree_page);
1139 new = &parent->rb_right; 1363 new = &parent->rb_right;
1364 } else if (!ksm_merge_across_nodes &&
1365 page_to_nid(tree_page) != nid) {
1366 /*
1367 * If tree_page has been migrated to another NUMA node,
1368 * it will be flushed out and put in the right unstable
1369 * tree next time: only merge with it when across_nodes.
1370 */
1371 put_page(tree_page);
1372 return NULL;
1140 } else { 1373 } else {
1141 *tree_pagep = tree_page; 1374 *tree_pagep = tree_page;
1142 return tree_rmap_item; 1375 return tree_rmap_item;
@@ -1145,8 +1378,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1145 1378
1146 rmap_item->address |= UNSTABLE_FLAG; 1379 rmap_item->address |= UNSTABLE_FLAG;
1147 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); 1380 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1381 DO_NUMA(rmap_item->nid = nid);
1148 rb_link_node(&rmap_item->node, parent, new); 1382 rb_link_node(&rmap_item->node, parent, new);
1149 rb_insert_color(&rmap_item->node, &root_unstable_tree); 1383 rb_insert_color(&rmap_item->node, root);
1150 1384
1151 ksm_pages_unshared++; 1385 ksm_pages_unshared++;
1152 return NULL; 1386 return NULL;
@@ -1188,10 +1422,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1188 unsigned int checksum; 1422 unsigned int checksum;
1189 int err; 1423 int err;
1190 1424
1191 remove_rmap_item_from_tree(rmap_item); 1425 stable_node = page_stable_node(page);
1426 if (stable_node) {
1427 if (stable_node->head != &migrate_nodes &&
1428 get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
1429 rb_erase(&stable_node->node,
1430 root_stable_tree + NUMA(stable_node->nid));
1431 stable_node->head = &migrate_nodes;
1432 list_add(&stable_node->list, stable_node->head);
1433 }
1434 if (stable_node->head != &migrate_nodes &&
1435 rmap_item->head == stable_node)
1436 return;
1437 }
1192 1438
1193 /* We first start with searching the page inside the stable tree */ 1439 /* We first start with searching the page inside the stable tree */
1194 kpage = stable_tree_search(page); 1440 kpage = stable_tree_search(page);
1441 if (kpage == page && rmap_item->head == stable_node) {
1442 put_page(kpage);
1443 return;
1444 }
1445
1446 remove_rmap_item_from_tree(rmap_item);
1447
1195 if (kpage) { 1448 if (kpage) {
1196 err = try_to_merge_with_ksm_page(rmap_item, page, kpage); 1449 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
1197 if (!err) { 1450 if (!err) {
@@ -1225,14 +1478,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1225 kpage = try_to_merge_two_pages(rmap_item, page, 1478 kpage = try_to_merge_two_pages(rmap_item, page,
1226 tree_rmap_item, tree_page); 1479 tree_rmap_item, tree_page);
1227 put_page(tree_page); 1480 put_page(tree_page);
1228 /*
1229 * As soon as we merge this page, we want to remove the
1230 * rmap_item of the page we have merged with from the unstable
1231 * tree, and insert it instead as new node in the stable tree.
1232 */
1233 if (kpage) { 1481 if (kpage) {
1234 remove_rmap_item_from_tree(tree_rmap_item); 1482 /*
1235 1483 * The pages were successfully merged: insert new
1484 * node in the stable tree and add both rmap_items.
1485 */
1236 lock_page(kpage); 1486 lock_page(kpage);
1237 stable_node = stable_tree_insert(kpage); 1487 stable_node = stable_tree_insert(kpage);
1238 if (stable_node) { 1488 if (stable_node) {
@@ -1289,6 +1539,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1289 struct mm_slot *slot; 1539 struct mm_slot *slot;
1290 struct vm_area_struct *vma; 1540 struct vm_area_struct *vma;
1291 struct rmap_item *rmap_item; 1541 struct rmap_item *rmap_item;
1542 int nid;
1292 1543
1293 if (list_empty(&ksm_mm_head.mm_list)) 1544 if (list_empty(&ksm_mm_head.mm_list))
1294 return NULL; 1545 return NULL;
@@ -1307,7 +1558,29 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1307 */ 1558 */
1308 lru_add_drain_all(); 1559 lru_add_drain_all();
1309 1560
1310 root_unstable_tree = RB_ROOT; 1561 /*
1562 * Whereas stale stable_nodes on the stable_tree itself
1563 * get pruned in the regular course of stable_tree_search(),
1564 * those moved out to the migrate_nodes list can accumulate:
1565 * so prune them once before each full scan.
1566 */
1567 if (!ksm_merge_across_nodes) {
1568 struct stable_node *stable_node;
1569 struct list_head *this, *next;
1570 struct page *page;
1571
1572 list_for_each_safe(this, next, &migrate_nodes) {
1573 stable_node = list_entry(this,
1574 struct stable_node, list);
1575 page = get_ksm_page(stable_node, false);
1576 if (page)
1577 put_page(page);
1578 cond_resched();
1579 }
1580 }
1581
1582 for (nid = 0; nid < ksm_nr_node_ids; nid++)
1583 root_unstable_tree[nid] = RB_ROOT;
1311 1584
1312 spin_lock(&ksm_mmlist_lock); 1585 spin_lock(&ksm_mmlist_lock);
1313 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); 1586 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
@@ -1392,7 +1665,7 @@ next_mm:
1392 * or when all VM_MERGEABLE areas have been unmapped (and 1665 * or when all VM_MERGEABLE areas have been unmapped (and
1393 * mmap_sem then protects against race with MADV_MERGEABLE). 1666 * mmap_sem then protects against race with MADV_MERGEABLE).
1394 */ 1667 */
1395 hlist_del(&slot->link); 1668 hash_del(&slot->link);
1396 list_del(&slot->mm_list); 1669 list_del(&slot->mm_list);
1397 spin_unlock(&ksm_mmlist_lock); 1670 spin_unlock(&ksm_mmlist_lock);
1398 1671
@@ -1428,8 +1701,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1428 rmap_item = scan_get_next_rmap_item(&page); 1701 rmap_item = scan_get_next_rmap_item(&page);
1429 if (!rmap_item) 1702 if (!rmap_item)
1430 return; 1703 return;
1431 if (!PageKsm(page) || !in_stable_tree(rmap_item)) 1704 cmp_and_merge_page(page, rmap_item);
1432 cmp_and_merge_page(page, rmap_item);
1433 put_page(page); 1705 put_page(page);
1434 } 1706 }
1435} 1707}
@@ -1446,6 +1718,7 @@ static int ksm_scan_thread(void *nothing)
1446 1718
1447 while (!kthread_should_stop()) { 1719 while (!kthread_should_stop()) {
1448 mutex_lock(&ksm_thread_mutex); 1720 mutex_lock(&ksm_thread_mutex);
1721 wait_while_offlining();
1449 if (ksmd_should_run()) 1722 if (ksmd_should_run())
1450 ksm_do_scan(ksm_thread_pages_to_scan); 1723 ksm_do_scan(ksm_thread_pages_to_scan);
1451 mutex_unlock(&ksm_thread_mutex); 1724 mutex_unlock(&ksm_thread_mutex);
@@ -1525,11 +1798,19 @@ int __ksm_enter(struct mm_struct *mm)
1525 spin_lock(&ksm_mmlist_lock); 1798 spin_lock(&ksm_mmlist_lock);
1526 insert_to_mm_slots_hash(mm, mm_slot); 1799 insert_to_mm_slots_hash(mm, mm_slot);
1527 /* 1800 /*
1528 * Insert just behind the scanning cursor, to let the area settle 1801 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
1802 * insert just behind the scanning cursor, to let the area settle
1529 * down a little; when fork is followed by immediate exec, we don't 1803 * down a little; when fork is followed by immediate exec, we don't
1530 * want ksmd to waste time setting up and tearing down an rmap_list. 1804 * want ksmd to waste time setting up and tearing down an rmap_list.
1805 *
1806 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
1807 * scanning cursor, otherwise KSM pages in newly forked mms will be
1808 * missed: then we might as well insert at the end of the list.
1531 */ 1809 */
1532 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); 1810 if (ksm_run & KSM_RUN_UNMERGE)
1811 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
1812 else
1813 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
1533 spin_unlock(&ksm_mmlist_lock); 1814 spin_unlock(&ksm_mmlist_lock);
1534 1815
1535 set_bit(MMF_VM_MERGEABLE, &mm->flags); 1816 set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -1559,7 +1840,7 @@ void __ksm_exit(struct mm_struct *mm)
1559 mm_slot = get_mm_slot(mm); 1840 mm_slot = get_mm_slot(mm);
1560 if (mm_slot && ksm_scan.mm_slot != mm_slot) { 1841 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1561 if (!mm_slot->rmap_list) { 1842 if (!mm_slot->rmap_list) {
1562 hlist_del(&mm_slot->link); 1843 hash_del(&mm_slot->link);
1563 list_del(&mm_slot->mm_list); 1844 list_del(&mm_slot->mm_list);
1564 easy_to_free = 1; 1845 easy_to_free = 1;
1565 } else { 1846 } else {
@@ -1579,24 +1860,32 @@ void __ksm_exit(struct mm_struct *mm)
1579 } 1860 }
1580} 1861}
1581 1862
1582struct page *ksm_does_need_to_copy(struct page *page, 1863struct page *ksm_might_need_to_copy(struct page *page,
1583 struct vm_area_struct *vma, unsigned long address) 1864 struct vm_area_struct *vma, unsigned long address)
1584{ 1865{
1866 struct anon_vma *anon_vma = page_anon_vma(page);
1585 struct page *new_page; 1867 struct page *new_page;
1586 1868
1869 if (PageKsm(page)) {
1870 if (page_stable_node(page) &&
1871 !(ksm_run & KSM_RUN_UNMERGE))
1872 return page; /* no need to copy it */
1873 } else if (!anon_vma) {
1874 return page; /* no need to copy it */
1875 } else if (anon_vma->root == vma->anon_vma->root &&
1876 page->index == linear_page_index(vma, address)) {
1877 return page; /* still no need to copy it */
1878 }
1879 if (!PageUptodate(page))
1880 return page; /* let do_swap_page report the error */
1881
1587 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1882 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1588 if (new_page) { 1883 if (new_page) {
1589 copy_user_highpage(new_page, page, address, vma); 1884 copy_user_highpage(new_page, page, address, vma);
1590 1885
1591 SetPageDirty(new_page); 1886 SetPageDirty(new_page);
1592 __SetPageUptodate(new_page); 1887 __SetPageUptodate(new_page);
1593 SetPageSwapBacked(new_page);
1594 __set_page_locked(new_page); 1888 __set_page_locked(new_page);
1595
1596 if (!mlocked_vma_newpage(vma, new_page))
1597 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1598 else
1599 add_page_to_unevictable_list(new_page);
1600 } 1889 }
1601 1890
1602 return new_page; 1891 return new_page;
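With the rename from ksm_does_need_to_copy() to ksm_might_need_to_copy(), the decision about whether a private copy is required moves into the helper: it returns the original page when no copy is needed, a freshly allocated copy when one is, or NULL if that allocation fails. A hedged sketch of the calling contract (prepare_swapped_page is an illustrative wrapper, not the actual mm/memory.c hunk from this series):

	static struct page *prepare_swapped_page(struct page *page,
						 struct vm_area_struct *vma,
						 unsigned long address)
	{
		struct page *use = ksm_might_need_to_copy(page, vma, address);

		if (!use)		/* a copy was needed but allocation failed */
			return NULL;	/* caller treats this as VM_FAULT_OOM */
		if (use != page) {
			/* a fresh anon copy: map this one instead of the original */
		}
		return use;
	}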
@@ -1607,7 +1896,6 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
1607{ 1896{
1608 struct stable_node *stable_node; 1897 struct stable_node *stable_node;
1609 struct rmap_item *rmap_item; 1898 struct rmap_item *rmap_item;
1610 struct hlist_node *hlist;
1611 unsigned int mapcount = page_mapcount(page); 1899 unsigned int mapcount = page_mapcount(page);
1612 int referenced = 0; 1900 int referenced = 0;
1613 int search_new_forks = 0; 1901 int search_new_forks = 0;
@@ -1619,7 +1907,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
1619 if (!stable_node) 1907 if (!stable_node)
1620 return 0; 1908 return 0;
1621again: 1909again:
1622 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1910 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1623 struct anon_vma *anon_vma = rmap_item->anon_vma; 1911 struct anon_vma *anon_vma = rmap_item->anon_vma;
1624 struct anon_vma_chain *vmac; 1912 struct anon_vma_chain *vmac;
1625 struct vm_area_struct *vma; 1913 struct vm_area_struct *vma;
@@ -1661,7 +1949,6 @@ out:
1661int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) 1949int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1662{ 1950{
1663 struct stable_node *stable_node; 1951 struct stable_node *stable_node;
1664 struct hlist_node *hlist;
1665 struct rmap_item *rmap_item; 1952 struct rmap_item *rmap_item;
1666 int ret = SWAP_AGAIN; 1953 int ret = SWAP_AGAIN;
1667 int search_new_forks = 0; 1954 int search_new_forks = 0;
@@ -1673,7 +1960,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1673 if (!stable_node) 1960 if (!stable_node)
1674 return SWAP_FAIL; 1961 return SWAP_FAIL;
1675again: 1962again:
1676 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 1963 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1677 struct anon_vma *anon_vma = rmap_item->anon_vma; 1964 struct anon_vma *anon_vma = rmap_item->anon_vma;
1678 struct anon_vma_chain *vmac; 1965 struct anon_vma_chain *vmac;
1679 struct vm_area_struct *vma; 1966 struct vm_area_struct *vma;
@@ -1714,7 +2001,6 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
1714 struct vm_area_struct *, unsigned long, void *), void *arg) 2001 struct vm_area_struct *, unsigned long, void *), void *arg)
1715{ 2002{
1716 struct stable_node *stable_node; 2003 struct stable_node *stable_node;
1717 struct hlist_node *hlist;
1718 struct rmap_item *rmap_item; 2004 struct rmap_item *rmap_item;
1719 int ret = SWAP_AGAIN; 2005 int ret = SWAP_AGAIN;
1720 int search_new_forks = 0; 2006 int search_new_forks = 0;
@@ -1726,7 +2012,7 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
1726 if (!stable_node) 2012 if (!stable_node)
1727 return ret; 2013 return ret;
1728again: 2014again:
1729 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { 2015 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
1730 struct anon_vma *anon_vma = rmap_item->anon_vma; 2016 struct anon_vma *anon_vma = rmap_item->anon_vma;
1731 struct anon_vma_chain *vmac; 2017 struct anon_vma_chain *vmac;
1732 struct vm_area_struct *vma; 2018 struct vm_area_struct *vma;
@@ -1773,64 +2059,115 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage)
1773 if (stable_node) { 2059 if (stable_node) {
1774 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); 2060 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
1775 stable_node->kpfn = page_to_pfn(newpage); 2061 stable_node->kpfn = page_to_pfn(newpage);
2062 /*
2063 * newpage->mapping was set in advance; now we need smp_wmb()
2064 * to make sure that the new stable_node->kpfn is visible
2065 * to get_ksm_page() before it can see that oldpage->mapping
2066 * has gone stale (or that PageSwapCache has been cleared).
2067 */
2068 smp_wmb();
2069 set_page_stable_node(oldpage, NULL);
1776 } 2070 }
1777} 2071}
1778#endif /* CONFIG_MIGRATION */ 2072#endif /* CONFIG_MIGRATION */
1779 2073
1780#ifdef CONFIG_MEMORY_HOTREMOVE 2074#ifdef CONFIG_MEMORY_HOTREMOVE
1781static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, 2075static int just_wait(void *word)
1782 unsigned long end_pfn)
1783{ 2076{
1784 struct rb_node *node; 2077 schedule();
2078 return 0;
2079}
1785 2080
1786 for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { 2081static void wait_while_offlining(void)
1787 struct stable_node *stable_node; 2082{
2083 while (ksm_run & KSM_RUN_OFFLINE) {
2084 mutex_unlock(&ksm_thread_mutex);
2085 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
2086 just_wait, TASK_UNINTERRUPTIBLE);
2087 mutex_lock(&ksm_thread_mutex);
2088 }
2089}
1788 2090
1789 stable_node = rb_entry(node, struct stable_node, node); 2091static void ksm_check_stable_tree(unsigned long start_pfn,
2092 unsigned long end_pfn)
2093{
2094 struct stable_node *stable_node;
2095 struct list_head *this, *next;
2096 struct rb_node *node;
2097 int nid;
2098
2099 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
2100 node = rb_first(root_stable_tree + nid);
2101 while (node) {
2102 stable_node = rb_entry(node, struct stable_node, node);
2103 if (stable_node->kpfn >= start_pfn &&
2104 stable_node->kpfn < end_pfn) {
2105 /*
2106 * Don't get_ksm_page, page has already gone:
2107 * which is why we keep kpfn instead of page*
2108 */
2109 remove_node_from_stable_tree(stable_node);
2110 node = rb_first(root_stable_tree + nid);
2111 } else
2112 node = rb_next(node);
2113 cond_resched();
2114 }
2115 }
2116 list_for_each_safe(this, next, &migrate_nodes) {
2117 stable_node = list_entry(this, struct stable_node, list);
1790 if (stable_node->kpfn >= start_pfn && 2118 if (stable_node->kpfn >= start_pfn &&
1791 stable_node->kpfn < end_pfn) 2119 stable_node->kpfn < end_pfn)
1792 return stable_node; 2120 remove_node_from_stable_tree(stable_node);
2121 cond_resched();
1793 } 2122 }
1794 return NULL;
1795} 2123}
1796 2124
1797static int ksm_memory_callback(struct notifier_block *self, 2125static int ksm_memory_callback(struct notifier_block *self,
1798 unsigned long action, void *arg) 2126 unsigned long action, void *arg)
1799{ 2127{
1800 struct memory_notify *mn = arg; 2128 struct memory_notify *mn = arg;
1801 struct stable_node *stable_node;
1802 2129
1803 switch (action) { 2130 switch (action) {
1804 case MEM_GOING_OFFLINE: 2131 case MEM_GOING_OFFLINE:
1805 /* 2132 /*
1806 * Keep it very simple for now: just lock out ksmd and 2133 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
1807 * MADV_UNMERGEABLE while any memory is going offline. 2134 * and remove_all_stable_nodes() while memory is going offline:
1808 * mutex_lock_nested() is necessary because lockdep was alarmed 2135 * it is unsafe for them to touch the stable tree at this time.
1809 * that here we take ksm_thread_mutex inside notifier chain 2136 * But unmerge_ksm_pages(), rmap lookups and other entry points
1810 * mutex, and later take notifier chain mutex inside 2137 * which do not need the ksm_thread_mutex are all safe.
1811 * ksm_thread_mutex to unlock it. But that's safe because both
1812 * are inside mem_hotplug_mutex.
1813 */ 2138 */
1814 mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); 2139 mutex_lock(&ksm_thread_mutex);
2140 ksm_run |= KSM_RUN_OFFLINE;
2141 mutex_unlock(&ksm_thread_mutex);
1815 break; 2142 break;
1816 2143
1817 case MEM_OFFLINE: 2144 case MEM_OFFLINE:
1818 /* 2145 /*
1819 * Most of the work is done by page migration; but there might 2146 * Most of the work is done by page migration; but there might
1820 * be a few stable_nodes left over, still pointing to struct 2147 * be a few stable_nodes left over, still pointing to struct
1821 * pages which have been offlined: prune those from the tree. 2148 * pages which have been offlined: prune those from the tree,
2149 * otherwise get_ksm_page() might later try to access a
2150 * non-existent struct page.
1822 */ 2151 */
1823 while ((stable_node = ksm_check_stable_tree(mn->start_pfn, 2152 ksm_check_stable_tree(mn->start_pfn,
1824 mn->start_pfn + mn->nr_pages)) != NULL) 2153 mn->start_pfn + mn->nr_pages);
1825 remove_node_from_stable_tree(stable_node);
1826 /* fallthrough */ 2154 /* fallthrough */
1827 2155
1828 case MEM_CANCEL_OFFLINE: 2156 case MEM_CANCEL_OFFLINE:
2157 mutex_lock(&ksm_thread_mutex);
2158 ksm_run &= ~KSM_RUN_OFFLINE;
1829 mutex_unlock(&ksm_thread_mutex); 2159 mutex_unlock(&ksm_thread_mutex);
2160
2161 smp_mb(); /* wake_up_bit advises this */
2162 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
1830 break; 2163 break;
1831 } 2164 }
1832 return NOTIFY_OK; 2165 return NOTIFY_OK;
1833} 2166}
2167#else
2168static void wait_while_offlining(void)
2169{
2170}
1834#endif /* CONFIG_MEMORY_HOTREMOVE */ 2171#endif /* CONFIG_MEMORY_HOTREMOVE */
1835 2172
1836#ifdef CONFIG_SYSFS 2173#ifdef CONFIG_SYSFS
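The hotremove rework above drops the mutex_lock_nested() trick in favour of a flag bit: MEM_GOING_OFFLINE sets KSM_RUN_OFFLINE in ksm_run under ksm_thread_mutex, and ksmd plus the sysfs store handlers park in wait_while_offlining() until MEM_OFFLINE or MEM_CANCEL_OFFLINE clears the bit and wakes the bit-waiters. Reduced to its bones the gate looks like this; a sketch only, using the 3.x wait_on_bit() form with an action callback, and with flags, FLAG_OFFLINE, wait_while_gated and open_gate as illustrative stand-ins:

	#include <linux/log2.h>
	#include <linux/mutex.h>
	#include <linux/sched.h>
	#include <linux/wait.h>

	static unsigned long flags;		/* stand-in for ksm_run */
	#define FLAG_OFFLINE	4		/* stand-in for KSM_RUN_OFFLINE */

	static int just_sleep(void *word)
	{
		schedule();
		return 0;
	}

	/* waiter side: drop the mutex while the bit is set */
	static void wait_while_gated(struct mutex *lock)
	{
		while (flags & FLAG_OFFLINE) {
			mutex_unlock(lock);
			wait_on_bit(&flags, ilog2(FLAG_OFFLINE),
				    just_sleep, TASK_UNINTERRUPTIBLE);
			mutex_lock(lock);
		}
	}

	/* release side: clear the bit, then wake the bit-waiters */
	static void open_gate(struct mutex *lock)
	{
		mutex_lock(lock);
		flags &= ~FLAG_OFFLINE;
		mutex_unlock(lock);

		smp_mb();			/* as wake_up_bit() advises */
		wake_up_bit(&flags, ilog2(FLAG_OFFLINE));
	}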
@@ -1893,7 +2230,7 @@ KSM_ATTR(pages_to_scan);
1893static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, 2230static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
1894 char *buf) 2231 char *buf)
1895{ 2232{
1896 return sprintf(buf, "%u\n", ksm_run); 2233 return sprintf(buf, "%lu\n", ksm_run);
1897} 2234}
1898 2235
1899static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, 2236static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1916,6 +2253,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1916 */ 2253 */
1917 2254
1918 mutex_lock(&ksm_thread_mutex); 2255 mutex_lock(&ksm_thread_mutex);
2256 wait_while_offlining();
1919 if (ksm_run != flags) { 2257 if (ksm_run != flags) {
1920 ksm_run = flags; 2258 ksm_run = flags;
1921 if (flags & KSM_RUN_UNMERGE) { 2259 if (flags & KSM_RUN_UNMERGE) {
@@ -1937,6 +2275,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1937} 2275}
1938KSM_ATTR(run); 2276KSM_ATTR(run);
1939 2277
2278#ifdef CONFIG_NUMA
2279static ssize_t merge_across_nodes_show(struct kobject *kobj,
2280 struct kobj_attribute *attr, char *buf)
2281{
2282 return sprintf(buf, "%u\n", ksm_merge_across_nodes);
2283}
2284
2285static ssize_t merge_across_nodes_store(struct kobject *kobj,
2286 struct kobj_attribute *attr,
2287 const char *buf, size_t count)
2288{
2289 int err;
2290 unsigned long knob;
2291
2292 err = kstrtoul(buf, 10, &knob);
2293 if (err)
2294 return err;
2295 if (knob > 1)
2296 return -EINVAL;
2297
2298 mutex_lock(&ksm_thread_mutex);
2299 wait_while_offlining();
2300 if (ksm_merge_across_nodes != knob) {
2301 if (ksm_pages_shared || remove_all_stable_nodes())
2302 err = -EBUSY;
2303 else if (root_stable_tree == one_stable_tree) {
2304 struct rb_root *buf;
2305 /*
2306 * This is the first time that we switch away from the
2307 * default of merging across nodes: must now allocate
2308 * a buffer to hold as many roots as may be needed.
2309 * Allocate stable and unstable together:
2310 * MAXSMP NODES_SHIFT 10 will use 16kB.
2311 */
2312 buf = kcalloc(nr_node_ids + nr_node_ids,
2313 sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
2314 /* Assume that a zero-filled rb_root is equivalent to RB_ROOT */
2315 if (!buf)
2316 err = -ENOMEM;
2317 else {
2318 root_stable_tree = buf;
2319 root_unstable_tree = buf + nr_node_ids;
2320 /* Stable tree is empty but not the unstable */
2321 root_unstable_tree[0] = one_unstable_tree[0];
2322 }
2323 }
2324 if (!err) {
2325 ksm_merge_across_nodes = knob;
2326 ksm_nr_node_ids = knob ? 1 : nr_node_ids;
2327 }
2328 }
2329 mutex_unlock(&ksm_thread_mutex);
2330
2331 return err ? err : count;
2332}
2333KSM_ATTR(merge_across_nodes);
2334#endif
2335
1940static ssize_t pages_shared_show(struct kobject *kobj, 2336static ssize_t pages_shared_show(struct kobject *kobj,
1941 struct kobj_attribute *attr, char *buf) 2337 struct kobj_attribute *attr, char *buf)
1942{ 2338{
@@ -1991,6 +2387,9 @@ static struct attribute *ksm_attrs[] = {
1991 &pages_unshared_attr.attr, 2387 &pages_unshared_attr.attr,
1992 &pages_volatile_attr.attr, 2388 &pages_volatile_attr.attr,
1993 &full_scans_attr.attr, 2389 &full_scans_attr.attr,
2390#ifdef CONFIG_NUMA
2391 &merge_across_nodes_attr.attr,
2392#endif
1994 NULL, 2393 NULL,
1995}; 2394};
1996 2395
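From userspace the new knob appears alongside the existing KSM controls in /sys/kernel/mm/ksm/. A small example of driving it; this assumes root, a kernel with CONFIG_KSM and CONFIG_NUMA, and that the stable tree can actually be emptied first, since the store above refuses with -EBUSY while pages_shared is non-zero (write_knob is an illustrative helper):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static int write_knob(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0 || write(fd, val, strlen(val)) < 0) {
			perror(path);	/* e.g. EBUSY while pages are still shared */
			if (fd >= 0)
				close(fd);
			return -1;
		}
		close(fd);
		return 0;
	}

	int main(void)
	{
		/* unmerge everything first: the knob only flips on an empty stable tree */
		write_knob("/sys/kernel/mm/ksm/run", "2");
		write_knob("/sys/kernel/mm/ksm/merge_across_nodes", "0");
		write_knob("/sys/kernel/mm/ksm/run", "1");	/* resume merging per-node */
		return 0;
	}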
@@ -2029,10 +2428,7 @@ static int __init ksm_init(void)
2029#endif /* CONFIG_SYSFS */ 2428#endif /* CONFIG_SYSFS */
2030 2429
2031#ifdef CONFIG_MEMORY_HOTREMOVE 2430#ifdef CONFIG_MEMORY_HOTREMOVE
2032 /* 2431 /* There is no significance to this priority 100 */
2033 * Choose a high priority since the callback takes ksm_thread_mutex:
2034 * later callbacks could only be taking locks which nest within that.
2035 */
2036 hotplug_memory_notifier(ksm_memory_callback, 100); 2432 hotplug_memory_notifier(ksm_memory_callback, 100);
2037#endif 2433#endif
2038 return 0; 2434 return 0;
diff --git a/mm/madvise.c b/mm/madvise.c
index 03dfa5c7adb3..c58c94b56c3d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -16,6 +16,9 @@
16#include <linux/ksm.h> 16#include <linux/ksm.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/file.h> 18#include <linux/file.h>
19#include <linux/blkdev.h>
20#include <linux/swap.h>
21#include <linux/swapops.h>
19 22
20/* 23/*
21 * Any behaviour which results in changes to the vma->vm_flags needs to 24 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -131,6 +134,84 @@ out:
131 return error; 134 return error;
132} 135}
133 136
137#ifdef CONFIG_SWAP
138static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
139 unsigned long end, struct mm_walk *walk)
140{
141 pte_t *orig_pte;
142 struct vm_area_struct *vma = walk->private;
143 unsigned long index;
144
145 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
146 return 0;
147
148 for (index = start; index != end; index += PAGE_SIZE) {
149 pte_t pte;
150 swp_entry_t entry;
151 struct page *page;
152 spinlock_t *ptl;
153
154 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
155 pte = *(orig_pte + ((index - start) / PAGE_SIZE));
156 pte_unmap_unlock(orig_pte, ptl);
157
158 if (pte_present(pte) || pte_none(pte) || pte_file(pte))
159 continue;
160 entry = pte_to_swp_entry(pte);
161 if (unlikely(non_swap_entry(entry)))
162 continue;
163
164 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
165 vma, index);
166 if (page)
167 page_cache_release(page);
168 }
169
170 return 0;
171}
172
173static void force_swapin_readahead(struct vm_area_struct *vma,
174 unsigned long start, unsigned long end)
175{
176 struct mm_walk walk = {
177 .mm = vma->vm_mm,
178 .pmd_entry = swapin_walk_pmd_entry,
179 .private = vma,
180 };
181
182 walk_page_range(start, end, &walk);
183
184 lru_add_drain(); /* Push any new pages onto the LRU now */
185}
186
187static void force_shm_swapin_readahead(struct vm_area_struct *vma,
188 unsigned long start, unsigned long end,
189 struct address_space *mapping)
190{
191 pgoff_t index;
192 struct page *page;
193 swp_entry_t swap;
194
195 for (; start < end; start += PAGE_SIZE) {
196 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
197
198 page = find_get_page(mapping, index);
199 if (!radix_tree_exceptional_entry(page)) {
200 if (page)
201 page_cache_release(page);
202 continue;
203 }
204 swap = radix_to_swp_entry(page);
205 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
206 NULL, 0);
207 if (page)
208 page_cache_release(page);
209 }
210
211 lru_add_drain(); /* Push any new pages onto the LRU now */
212}
213#endif /* CONFIG_SWAP */
214
134/* 215/*
135 * Schedule all required I/O operations. Do not wait for completion. 216 * Schedule all required I/O operations. Do not wait for completion.
136 */ 217 */
@@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma,
140{ 221{
141 struct file *file = vma->vm_file; 222 struct file *file = vma->vm_file;
142 223
224#ifdef CONFIG_SWAP
225 if (!file || mapping_cap_swap_backed(file->f_mapping)) {
226 *prev = vma;
227 if (!file)
228 force_swapin_readahead(vma, start, end);
229 else
230 force_shm_swapin_readahead(vma, start, end,
231 file->f_mapping);
232 return 0;
233 }
234#endif
235
143 if (!file) 236 if (!file)
144 return -EBADF; 237 return -EBADF;
145 238
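With the hunk above, MADV_WILLNEED on an anonymous or tmpfs/shmem mapping no longer fails with -EBADF: it walks the range and starts asynchronous swap-in for any entries that are out on swap, so later faults are minor rather than major. A minimal userspace illustration of the intended use, assuming part of the buffer has been pushed out to swap by memory pressure:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/mman.h>

	#define LEN	(64UL << 20)		/* 64 MB of anonymous memory */

	int main(void)
	{
		char *buf = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED)
			return 1;
		memset(buf, 0x5a, LEN);		/* populate; may later be swapped out */

		/*
		 * Hint that the whole range is about to be used again: on this
		 * kernel the call queues swap readahead instead of returning
		 * EBADF for an anonymous vma.
		 */
		if (madvise(buf, LEN, MADV_WILLNEED))
			perror("madvise");

		for (size_t i = 0; i < LEN; i += 4096)
			(void)buf[i];		/* touch; ideally already resident */
		return 0;
	}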
@@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
371 int error = -EINVAL; 464 int error = -EINVAL;
372 int write; 465 int write;
373 size_t len; 466 size_t len;
467 struct blk_plug plug;
374 468
375#ifdef CONFIG_MEMORY_FAILURE 469#ifdef CONFIG_MEMORY_FAILURE
376 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) 470 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -410,18 +504,19 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
410 if (vma && start > vma->vm_start) 504 if (vma && start > vma->vm_start)
411 prev = vma; 505 prev = vma;
412 506
507 blk_start_plug(&plug);
413 for (;;) { 508 for (;;) {
414 /* Still start < end. */ 509 /* Still start < end. */
415 error = -ENOMEM; 510 error = -ENOMEM;
416 if (!vma) 511 if (!vma)
417 goto out; 512 goto out_plug;
418 513
419 /* Here start < (end|vma->vm_end). */ 514 /* Here start < (end|vma->vm_end). */
420 if (start < vma->vm_start) { 515 if (start < vma->vm_start) {
421 unmapped_error = -ENOMEM; 516 unmapped_error = -ENOMEM;
422 start = vma->vm_start; 517 start = vma->vm_start;
423 if (start >= end) 518 if (start >= end)
424 goto out; 519 goto out_plug;
425 } 520 }
426 521
427 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 522 /* Here vma->vm_start <= start < (end|vma->vm_end) */
@@ -432,18 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
432 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ 527 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
433 error = madvise_vma(vma, &prev, start, tmp, behavior); 528 error = madvise_vma(vma, &prev, start, tmp, behavior);
434 if (error) 529 if (error)
435 goto out; 530 goto out_plug;
436 start = tmp; 531 start = tmp;
437 if (prev && start < prev->vm_end) 532 if (prev && start < prev->vm_end)
438 start = prev->vm_end; 533 start = prev->vm_end;
439 error = unmapped_error; 534 error = unmapped_error;
440 if (start >= end) 535 if (start >= end)
441 goto out; 536 goto out_plug;
442 if (prev) 537 if (prev)
443 vma = prev->vm_next; 538 vma = prev->vm_next;
444 else /* madvise_remove dropped mmap_sem */ 539 else /* madvise_remove dropped mmap_sem */
445 vma = find_vma(current->mm, start); 540 vma = find_vma(current->mm, start);
446 } 541 }
542out_plug:
543 blk_finish_plug(&plug);
447out: 544out:
448 if (write) 545 if (write)
449 up_write(&current->mm->mmap_sem); 546 up_write(&current->mm->mmap_sem);
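
The madvise_willneed() hunk above extends MADV_WILLNEED to anonymous and shmem-backed mappings, so swapped-out pages are read back asynchronously, and the whole madvise() walk now runs under one block plug so the readahead I/O is submitted in a batch. A minimal userspace sketch of the caller side, using nothing beyond the standard madvise(2) interface:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64 << 20;	/* 64 MiB of anonymous memory */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0x5a, len);	/* touch it so it can later be swapped out */

	/* ... later, after memory pressure may have pushed it to swap ... */

	/* Hint that the range is needed soon: with the change above, the
	 * swap-backed pages are read back asynchronously instead of being
	 * faulted in one at a time. */
	if (madvise(buf, len, MADV_WILLNEED))
		perror("madvise");

	return 0;
}
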
diff --git a/mm/memblock.c b/mm/memblock.c
index 625905523c2a..1bcd9b970564 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -92,9 +92,58 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
92 * 92 *
93 * Find @size free area aligned to @align in the specified range and node. 93 * Find @size free area aligned to @align in the specified range and node.
94 * 94 *
 95 * If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check that
 96 * the memory we found is not in hotpluggable ranges.
97 *
95 * RETURNS: 98 * RETURNS:
96 * Found address on success, %0 on failure. 99 * Found address on success, %0 on failure.
97 */ 100 */
101#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
102phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
103 phys_addr_t end, phys_addr_t size,
104 phys_addr_t align, int nid)
105{
106 phys_addr_t this_start, this_end, cand;
107 u64 i;
108 int curr = movablemem_map.nr_map - 1;
109
110 /* pump up @end */
111 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
112 end = memblock.current_limit;
113
114 /* avoid allocating the first page */
115 start = max_t(phys_addr_t, start, PAGE_SIZE);
116 end = max(start, end);
117
118 for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
119 this_start = clamp(this_start, start, end);
120 this_end = clamp(this_end, start, end);
121
122restart:
123 if (this_end <= this_start || this_end < size)
124 continue;
125
126 for (; curr >= 0; curr--) {
127 if ((movablemem_map.map[curr].start_pfn << PAGE_SHIFT)
128 < this_end)
129 break;
130 }
131
132 cand = round_down(this_end - size, align);
133 if (curr >= 0 &&
134 cand < movablemem_map.map[curr].end_pfn << PAGE_SHIFT) {
135 this_end = movablemem_map.map[curr].start_pfn
136 << PAGE_SHIFT;
137 goto restart;
138 }
139
140 if (cand >= this_start)
141 return cand;
142 }
143
144 return 0;
145}
146#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
98phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 147phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
99 phys_addr_t end, phys_addr_t size, 148 phys_addr_t end, phys_addr_t size,
100 phys_addr_t align, int nid) 149 phys_addr_t align, int nid)
@@ -123,6 +172,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
123 } 172 }
124 return 0; 173 return 0;
125} 174}
175#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
126 176
127/** 177/**
128 * memblock_find_in_range - find free area in given range 178 * memblock_find_in_range - find free area in given range
@@ -314,7 +364,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
314 } 364 }
315 365
316 this->size += next->size; 366 this->size += next->size;
317 memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); 367 /* move the regions after 'next' forward; they start at index i + 2 */
368 memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
318 type->cnt--; 369 type->cnt--;
319 } 370 }
320} 371}
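
The memmove() count fix above is easiest to check with concrete indices: once regions[i] has absorbed regions[i + 1], only the cnt - (i + 2) entries starting at index i + 2 still have to slide down by one slot; the old cnt - (i + 1) count copied one stale element past the end of the live array. A standalone sketch of the same merge step (plain C, not kernel code):

#include <stdio.h>
#include <string.h>

struct region { unsigned long base, size; };

static void merge_at(struct region *r, int *cnt, int i)
{
	r[i].size += r[i + 1].size;
	/* slide down the regions that follow the one we just absorbed */
	memmove(&r[i + 1], &r[i + 2], (*cnt - (i + 2)) * sizeof(*r));
	(*cnt)--;
}

int main(void)
{
	struct region r[4] = { {0, 16}, {16, 16}, {32, 16}, {64, 16} };
	int cnt = 4;

	merge_at(r, &cnt, 0);	/* {0,16} + {16,16} -> {0,32} */
	for (int i = 0; i < cnt; i++)
		printf("region %d: base=%lu size=%lu\n", i, r[i].base, r[i].size);
	return 0;
}
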
@@ -827,6 +878,23 @@ phys_addr_t __init memblock_phys_mem_size(void)
827 return memblock.memory.total_size; 878 return memblock.memory.total_size;
828} 879}
829 880
881phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
882{
883 unsigned long pages = 0;
884 struct memblock_region *r;
885 unsigned long start_pfn, end_pfn;
886
887 for_each_memblock(memory, r) {
888 start_pfn = memblock_region_memory_base_pfn(r);
889 end_pfn = memblock_region_memory_end_pfn(r);
890 start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
891 end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
892 pages += end_pfn - start_pfn;
893 }
894
895 return (phys_addr_t)pages << PAGE_SHIFT;
896}
897
830/* lowest address */ 898/* lowest address */
831phys_addr_t __init_memblock memblock_start_of_DRAM(void) 899phys_addr_t __init_memblock memblock_start_of_DRAM(void)
832{ 900{
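
For reference, the new memblock_mem_size() above just clamps every memory region to limit_pfn and adds up whatever is left below the limit. A rough userspace model of that calculation, with made-up region values:

#include <stdio.h>

#define PAGE_SHIFT	12	/* assume 4 KiB pages */

struct range { unsigned long start_pfn, end_pfn; };

static unsigned long long mem_size_below(const struct range *r, int nr,
					 unsigned long limit_pfn)
{
	unsigned long long pages = 0;

	for (int i = 0; i < nr; i++) {
		unsigned long s = r[i].start_pfn < limit_pfn ? r[i].start_pfn : limit_pfn;
		unsigned long e = r[i].end_pfn < limit_pfn ? r[i].end_pfn : limit_pfn;

		pages += e - s;	/* regions entirely above the limit add 0 */
	}
	return pages << PAGE_SHIFT;	/* bytes */
}

int main(void)
{
	/* two regions: 0-1 GiB and 4-6 GiB, expressed as page frame numbers */
	struct range r[] = { { 0x0, 0x40000 }, { 0x100000, 0x180000 } };

	/* limit at 5 GiB (pfn 0x140000): 1 GiB + 1 GiB = 2 GiB reported */
	printf("%llu bytes below the limit\n", mem_size_below(r, 2, 0x140000));
	return 0;
}
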
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 09255ec8159c..53b8201b31eb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = {
120 "pgmajfault", 120 "pgmajfault",
121}; 121};
122 122
123static const char * const mem_cgroup_lru_names[] = {
124 "inactive_anon",
125 "active_anon",
126 "inactive_file",
127 "active_file",
128 "unevictable",
129};
130
123/* 131/*
124 * Per memcg event counter is incremented at every pagein/pageout. With THP, 132 * Per memcg event counter is incremented at every pagein/pageout. With THP,
125 * it will be incremated by the number of pages. This counter is used for 133 * it will be incremated by the number of pages. This counter is used for
@@ -172,7 +180,7 @@ struct mem_cgroup_per_node {
172}; 180};
173 181
174struct mem_cgroup_lru_info { 182struct mem_cgroup_lru_info {
175 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 183 struct mem_cgroup_per_node *nodeinfo[0];
176}; 184};
177 185
178/* 186/*
@@ -276,17 +284,6 @@ struct mem_cgroup {
276 */ 284 */
277 struct res_counter kmem; 285 struct res_counter kmem;
278 /* 286 /*
279 * Per cgroup active and inactive list, similar to the
280 * per zone LRU lists.
281 */
282 struct mem_cgroup_lru_info info;
283 int last_scanned_node;
284#if MAX_NUMNODES > 1
285 nodemask_t scan_nodes;
286 atomic_t numainfo_events;
287 atomic_t numainfo_updating;
288#endif
289 /*
290 * Should the accounting and control be hierarchical, per subtree? 287 * Should the accounting and control be hierarchical, per subtree?
291 */ 288 */
292 bool use_hierarchy; 289 bool use_hierarchy;
@@ -349,8 +346,29 @@ struct mem_cgroup {
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 346 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id; 347 int kmemcg_id;
351#endif 348#endif
349
350 int last_scanned_node;
351#if MAX_NUMNODES > 1
352 nodemask_t scan_nodes;
353 atomic_t numainfo_events;
354 atomic_t numainfo_updating;
355#endif
356 /*
357 * Per cgroup active and inactive list, similar to the
358 * per zone LRU lists.
359 *
360 * WARNING: This has to be the last element of the struct. Don't
361 * add new fields after this point.
362 */
363 struct mem_cgroup_lru_info info;
352}; 364};
353 365
366static size_t memcg_size(void)
367{
368 return sizeof(struct mem_cgroup) +
369 nr_node_ids * sizeof(struct mem_cgroup_per_node);
370}
371
354/* internal only representation about the status of kmem accounting. */ 372/* internal only representation about the status of kmem accounting. */
355enum { 373enum {
356 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ 374 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
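
The struct reshuffle above leans on a common C trick: the per-node array becomes a zero-length member at the very end of struct mem_cgroup, and memcg_size() sizes the allocation for the nr_node_ids nodes that actually exist rather than for MAX_NUMNODES. A standalone sketch of the pattern, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>

struct per_node { int stats; };

struct container {
	long fixed_state;
	/* must stay the last member, like the lru info above */
	struct per_node *nodeinfo[0];
};

static size_t container_size(int nr_node_ids)
{
	return sizeof(struct container) +
	       nr_node_ids * sizeof(struct per_node *);
}

int main(void)
{
	int nr_node_ids = 2;	/* e.g. a two-node machine */
	struct container *c = calloc(1, container_size(nr_node_ids));

	if (!c)
		return 1;
	for (int n = 0; n < nr_node_ids; n++)
		c->nodeinfo[n] = calloc(1, sizeof(struct per_node));

	printf("allocated %zu bytes instead of sizing for every possible node\n",
	       container_size(nr_node_ids));
	return 0;
}
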
@@ -398,8 +416,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
398 416
399/* Stuffs for move charges at task migration. */ 417/* Stuffs for move charges at task migration. */
400/* 418/*
401 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 419 * Types of charges to be moved. "move_charge_at_immitgrate" and
402 * left-shifted bitmap of these types. 420 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
403 */ 421 */
404enum move_type { 422enum move_type {
405 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 423 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
@@ -412,6 +430,7 @@ static struct move_charge_struct {
412 spinlock_t lock; /* for from, to */ 430 spinlock_t lock; /* for from, to */
413 struct mem_cgroup *from; 431 struct mem_cgroup *from;
414 struct mem_cgroup *to; 432 struct mem_cgroup *to;
433 unsigned long immigrate_flags;
415 unsigned long precharge; 434 unsigned long precharge;
416 unsigned long moved_charge; 435 unsigned long moved_charge;
417 unsigned long moved_swap; 436 unsigned long moved_swap;
@@ -424,14 +443,12 @@ static struct move_charge_struct {
424 443
425static bool move_anon(void) 444static bool move_anon(void)
426{ 445{
427 return test_bit(MOVE_CHARGE_TYPE_ANON, 446 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
428 &mc.to->move_charge_at_immigrate);
429} 447}
430 448
431static bool move_file(void) 449static bool move_file(void)
432{ 450{
433 return test_bit(MOVE_CHARGE_TYPE_FILE, 451 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
434 &mc.to->move_charge_at_immigrate);
435} 452}
436 453
437/* 454/*
@@ -471,6 +488,13 @@ enum res_type {
471#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 488#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
472#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 489#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
473 490
491/*
492 * The memcg_create_mutex will be held whenever a new cgroup is created.
493 * As a consequence, any change that needs to protect against new child cgroups
494 * appearing has to hold it as well.
495 */
496static DEFINE_MUTEX(memcg_create_mutex);
497
474static void mem_cgroup_get(struct mem_cgroup *memcg); 498static void mem_cgroup_get(struct mem_cgroup *memcg);
475static void mem_cgroup_put(struct mem_cgroup *memcg); 499static void mem_cgroup_put(struct mem_cgroup *memcg);
476 500
@@ -627,6 +651,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg);
627static struct mem_cgroup_per_zone * 651static struct mem_cgroup_per_zone *
628mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) 652mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
629{ 653{
654 VM_BUG_ON((unsigned)nid >= nr_node_ids);
630 return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; 655 return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
631} 656}
632 657
@@ -1371,17 +1396,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1371 return inactive * inactive_ratio < active; 1396 return inactive * inactive_ratio < active;
1372} 1397}
1373 1398
1374int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1375{
1376 unsigned long active;
1377 unsigned long inactive;
1378
1379 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
1380 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
1381
1382 return (active > inactive);
1383}
1384
1385#define mem_cgroup_from_res_counter(counter, member) \ 1399#define mem_cgroup_from_res_counter(counter, member) \
1386 container_of(counter, struct mem_cgroup, member) 1400 container_of(counter, struct mem_cgroup, member)
1387 1401
@@ -1524,8 +1538,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1524 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1538 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1525} 1539}
1526 1540
1541#define K(x) ((x) << (PAGE_SHIFT-10))
1527/** 1542/**
1528 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1543 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1529 * @memcg: The memory cgroup that went over limit 1544 * @memcg: The memory cgroup that went over limit
1530 * @p: Task that is going to be killed 1545 * @p: Task that is going to be killed
1531 * 1546 *
@@ -1543,8 +1558,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1543 */ 1558 */
1544 static char memcg_name[PATH_MAX]; 1559 static char memcg_name[PATH_MAX];
1545 int ret; 1560 int ret;
1561 struct mem_cgroup *iter;
1562 unsigned int i;
1546 1563
1547 if (!memcg || !p) 1564 if (!p)
1548 return; 1565 return;
1549 1566
1550 rcu_read_lock(); 1567 rcu_read_lock();
@@ -1563,7 +1580,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1563 } 1580 }
1564 rcu_read_unlock(); 1581 rcu_read_unlock();
1565 1582
1566 printk(KERN_INFO "Task in %s killed", memcg_name); 1583 pr_info("Task in %s killed", memcg_name);
1567 1584
1568 rcu_read_lock(); 1585 rcu_read_lock();
1569 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1586 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
@@ -1576,22 +1593,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1576 /* 1593 /*
1577 * Continues from above, so we don't need an KERN_ level 1594 * Continues from above, so we don't need an KERN_ level
1578 */ 1595 */
1579 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1596 pr_cont(" as a result of limit of %s\n", memcg_name);
1580done: 1597done:
1581 1598
1582 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1599 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1583 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1600 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1584 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1601 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1585 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1602 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1586 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1603 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1587 "failcnt %llu\n",
1588 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1604 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1589 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1605 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1590 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1606 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1591 printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1607 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1592 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, 1608 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1593 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, 1609 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1594 res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); 1610 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1611
1612 for_each_mem_cgroup_tree(iter, memcg) {
1613 pr_info("Memory cgroup stats");
1614
1615 rcu_read_lock();
1616 ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
1617 if (!ret)
1618 pr_cont(" for %s", memcg_name);
1619 rcu_read_unlock();
1620 pr_cont(":");
1621
1622 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1623 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1624 continue;
1625 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1626 K(mem_cgroup_read_stat(iter, i)));
1627 }
1628
1629 for (i = 0; i < NR_LRU_LISTS; i++)
1630 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1631 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1632
1633 pr_cont("\n");
1634 }
1595} 1635}
1596 1636
1597/* 1637/*
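
The K() helper added for the OOM dump converts a page count to kilobytes with a shift instead of a multiply. A quick arithmetic check, assuming 4 KiB pages (PAGE_SHIFT = 12, so K(x) == x * 4):

#include <stdio.h>

#define PAGE_SHIFT	12
#define K(x)		((x) << (PAGE_SHIFT - 10))

int main(void)
{
	unsigned long nr_pages = 2560;

	/* 2560 pages * 4 KB per page = 10240 KB */
	printf("%lu pages = %lu KB\n", nr_pages, K(nr_pages));
	return 0;
}
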
@@ -2256,6 +2296,17 @@ static void drain_local_stock(struct work_struct *dummy)
2256 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2296 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2257} 2297}
2258 2298
2299static void __init memcg_stock_init(void)
2300{
2301 int cpu;
2302
2303 for_each_possible_cpu(cpu) {
2304 struct memcg_stock_pcp *stock =
2305 &per_cpu(memcg_stock, cpu);
2306 INIT_WORK(&stock->work, drain_local_stock);
2307 }
2308}
2309
2259/* 2310/*
2260 * Cache charges(val) which is from res_counter, to local per_cpu area. 2311 * Cache charges(val) which is from res_counter, to local per_cpu area.
2261 * This will be consumed by consume_stock() function, later. 2312 * This will be consumed by consume_stock() function, later.
@@ -3030,7 +3081,9 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3030 if (memcg) { 3081 if (memcg) {
3031 s->memcg_params->memcg = memcg; 3082 s->memcg_params->memcg = memcg;
3032 s->memcg_params->root_cache = root_cache; 3083 s->memcg_params->root_cache = root_cache;
3033 } 3084 } else
3085 s->memcg_params->is_root_cache = true;
3086
3034 return 0; 3087 return 0;
3035} 3088}
3036 3089
@@ -4389,8 +4442,8 @@ void mem_cgroup_print_bad_page(struct page *page)
4389 4442
4390 pc = lookup_page_cgroup_used(page); 4443 pc = lookup_page_cgroup_used(page);
4391 if (pc) { 4444 if (pc) {
4392 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 4445 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4393 pc, pc->flags, pc->mem_cgroup); 4446 pc, pc->flags, pc->mem_cgroup);
4394 } 4447 }
4395} 4448}
4396#endif 4449#endif
@@ -4717,6 +4770,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4717} 4770}
4718 4771
4719/* 4772/*
4773 * This mainly exists for tests during the setting of use_hierarchy.
4774 * Since this is the very setting we are changing, the current hierarchy value
4775 * is meaningless.
4776 */
4777static inline bool __memcg_has_children(struct mem_cgroup *memcg)
4778{
4779 struct cgroup *pos;
4780
4781 /* bounce at first found */
4782 cgroup_for_each_child(pos, memcg->css.cgroup)
4783 return true;
4784 return false;
4785}
4786
4787/*
4788 * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
4789 * to be already dead (as in mem_cgroup_force_empty, for instance). This is
4790 * different from mem_cgroup_count_children(), in the sense that we don't really care how
4791 * many children we have; we only need to know if we have any. It also counts
4792 * any memcg without hierarchy as infertile.
4793 */
4794static inline bool memcg_has_children(struct mem_cgroup *memcg)
4795{
4796 return memcg->use_hierarchy && __memcg_has_children(memcg);
4797}
4798
4799/*
4720 * Reclaims as many pages from the given memcg as possible and moves 4800 * Reclaims as many pages from the given memcg as possible and moves
4721 * the rest to the parent. 4801 * the rest to the parent.
4722 * 4802 *
@@ -4786,7 +4866,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4786 if (parent) 4866 if (parent)
4787 parent_memcg = mem_cgroup_from_cont(parent); 4867 parent_memcg = mem_cgroup_from_cont(parent);
4788 4868
4789 cgroup_lock(); 4869 mutex_lock(&memcg_create_mutex);
4790 4870
4791 if (memcg->use_hierarchy == val) 4871 if (memcg->use_hierarchy == val)
4792 goto out; 4872 goto out;
@@ -4801,7 +4881,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4801 */ 4881 */
4802 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 4882 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
4803 (val == 1 || val == 0)) { 4883 (val == 1 || val == 0)) {
4804 if (list_empty(&cont->children)) 4884 if (!__memcg_has_children(memcg))
4805 memcg->use_hierarchy = val; 4885 memcg->use_hierarchy = val;
4806 else 4886 else
4807 retval = -EBUSY; 4887 retval = -EBUSY;
@@ -4809,7 +4889,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4809 retval = -EINVAL; 4889 retval = -EINVAL;
4810 4890
4811out: 4891out:
4812 cgroup_unlock(); 4892 mutex_unlock(&memcg_create_mutex);
4813 4893
4814 return retval; 4894 return retval;
4815} 4895}
@@ -4894,8 +4974,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4894{ 4974{
4895 int ret = -EINVAL; 4975 int ret = -EINVAL;
4896#ifdef CONFIG_MEMCG_KMEM 4976#ifdef CONFIG_MEMCG_KMEM
4897 bool must_inc_static_branch = false;
4898
4899 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4977 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4900 /* 4978 /*
4901 * For simplicity, we won't allow this to be disabled. It also can't 4979 * For simplicity, we won't allow this to be disabled. It also can't
@@ -4908,18 +4986,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4908 * 4986 *
4909 * After it first became limited, changes in the value of the limit are 4987 * After it first became limited, changes in the value of the limit are
4910 * of course permitted. 4988 * of course permitted.
4911 *
4912 * Taking the cgroup_lock is really offensive, but it is so far the only
4913 * way to guarantee that no children will appear. There are plenty of
4914 * other offenders, and they should all go away. Fine grained locking
4915 * is probably the way to go here. When we are fully hierarchical, we
4916 * can also get rid of the use_hierarchy check.
4917 */ 4989 */
4918 cgroup_lock(); 4990 mutex_lock(&memcg_create_mutex);
4919 mutex_lock(&set_limit_mutex); 4991 mutex_lock(&set_limit_mutex);
4920 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { 4992 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
4921 if (cgroup_task_count(cont) || (memcg->use_hierarchy && 4993 if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
4922 !list_empty(&cont->children))) {
4923 ret = -EBUSY; 4994 ret = -EBUSY;
4924 goto out; 4995 goto out;
4925 } 4996 }
@@ -4931,7 +5002,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4931 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); 5002 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
4932 goto out; 5003 goto out;
4933 } 5004 }
4934 must_inc_static_branch = true; 5005 static_key_slow_inc(&memcg_kmem_enabled_key);
5006 /*
5007 * setting the active bit after the inc will guarantee no one
5008 * starts accounting before all call sites are patched
5009 */
5010 memcg_kmem_set_active(memcg);
5011
4935 /* 5012 /*
4936 * kmem charges can outlive the cgroup. In the case of slab 5013 * kmem charges can outlive the cgroup. In the case of slab
4937 * pages, for instance, a page contain objects from various 5014 * pages, for instance, a page contain objects from various
@@ -4943,32 +5020,12 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4943 ret = res_counter_set_limit(&memcg->kmem, val); 5020 ret = res_counter_set_limit(&memcg->kmem, val);
4944out: 5021out:
4945 mutex_unlock(&set_limit_mutex); 5022 mutex_unlock(&set_limit_mutex);
4946 cgroup_unlock(); 5023 mutex_unlock(&memcg_create_mutex);
4947
4948 /*
4949 * We are by now familiar with the fact that we can't inc the static
4950 * branch inside cgroup_lock. See disarm functions for details. A
4951 * worker here is overkill, but also wrong: After the limit is set, we
4952 * must start accounting right away. Since this operation can't fail,
4953 * we can safely defer it to here - no rollback will be needed.
4954 *
4955 * The boolean used to control this is also safe, because
4956 * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
4957 * able to set it to true;
4958 */
4959 if (must_inc_static_branch) {
4960 static_key_slow_inc(&memcg_kmem_enabled_key);
4961 /*
4962 * setting the active bit after the inc will guarantee no one
4963 * starts accounting before all call sites are patched
4964 */
4965 memcg_kmem_set_active(memcg);
4966 }
4967
4968#endif 5024#endif
4969 return ret; 5025 return ret;
4970} 5026}
4971 5027
5028#ifdef CONFIG_MEMCG_KMEM
4972static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5029static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4973{ 5030{
4974 int ret = 0; 5031 int ret = 0;
@@ -4977,7 +5034,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4977 goto out; 5034 goto out;
4978 5035
4979 memcg->kmem_account_flags = parent->kmem_account_flags; 5036 memcg->kmem_account_flags = parent->kmem_account_flags;
4980#ifdef CONFIG_MEMCG_KMEM
4981 /* 5037 /*
4982 * When that happen, we need to disable the static branch only on those 5038 * When that happen, we need to disable the static branch only on those
4983 * memcgs that enabled it. To achieve this, we would be forced to 5039 * memcgs that enabled it. To achieve this, we would be forced to
@@ -5003,10 +5059,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5003 mutex_lock(&set_limit_mutex); 5059 mutex_lock(&set_limit_mutex);
5004 ret = memcg_update_cache_sizes(memcg); 5060 ret = memcg_update_cache_sizes(memcg);
5005 mutex_unlock(&set_limit_mutex); 5061 mutex_unlock(&set_limit_mutex);
5006#endif
5007out: 5062out:
5008 return ret; 5063 return ret;
5009} 5064}
5065#endif /* CONFIG_MEMCG_KMEM */
5010 5066
5011/* 5067/*
5012 * The user of this function is... 5068 * The user of this function is...
@@ -5146,15 +5202,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5146 5202
5147 if (val >= (1 << NR_MOVE_TYPE)) 5203 if (val >= (1 << NR_MOVE_TYPE))
5148 return -EINVAL; 5204 return -EINVAL;
5205
5149 /* 5206 /*
5150 * We check this value several times in both in can_attach() and 5207 * No kind of locking is needed in here, because ->can_attach() will
5151 * attach(), so we need cgroup lock to prevent this value from being 5208 * check this value once in the beginning of the process, and then carry
5152 * inconsistent. 5209 * on with stale data. This means that changes to this value will only
5210 * affect task migrations starting after the change.
5153 */ 5211 */
5154 cgroup_lock();
5155 memcg->move_charge_at_immigrate = val; 5212 memcg->move_charge_at_immigrate = val;
5156 cgroup_unlock();
5157
5158 return 0; 5213 return 0;
5159} 5214}
5160#else 5215#else
@@ -5212,14 +5267,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
5212} 5267}
5213#endif /* CONFIG_NUMA */ 5268#endif /* CONFIG_NUMA */
5214 5269
5215static const char * const mem_cgroup_lru_names[] = {
5216 "inactive_anon",
5217 "active_anon",
5218 "inactive_file",
5219 "active_file",
5220 "unevictable",
5221};
5222
5223static inline void mem_cgroup_lru_names_not_uptodate(void) 5270static inline void mem_cgroup_lru_names_not_uptodate(void)
5224{ 5271{
5225 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5272 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -5333,18 +5380,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
5333 5380
5334 parent = mem_cgroup_from_cont(cgrp->parent); 5381 parent = mem_cgroup_from_cont(cgrp->parent);
5335 5382
5336 cgroup_lock(); 5383 mutex_lock(&memcg_create_mutex);
5337 5384
5338 /* If under hierarchy, only empty-root can set this value */ 5385 /* If under hierarchy, only empty-root can set this value */
5339 if ((parent->use_hierarchy) || 5386 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5340 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 5387 mutex_unlock(&memcg_create_mutex);
5341 cgroup_unlock();
5342 return -EINVAL; 5388 return -EINVAL;
5343 } 5389 }
5344 5390
5345 memcg->swappiness = val; 5391 memcg->swappiness = val;
5346 5392
5347 cgroup_unlock(); 5393 mutex_unlock(&memcg_create_mutex);
5348 5394
5349 return 0; 5395 return 0;
5350} 5396}
@@ -5670,17 +5716,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
5670 5716
5671 parent = mem_cgroup_from_cont(cgrp->parent); 5717 parent = mem_cgroup_from_cont(cgrp->parent);
5672 5718
5673 cgroup_lock(); 5719 mutex_lock(&memcg_create_mutex);
5674 /* oom-kill-disable is a flag for subhierarchy. */ 5720 /* oom-kill-disable is a flag for subhierarchy. */
5675 if ((parent->use_hierarchy) || 5721 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5676 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 5722 mutex_unlock(&memcg_create_mutex);
5677 cgroup_unlock();
5678 return -EINVAL; 5723 return -EINVAL;
5679 } 5724 }
5680 memcg->oom_kill_disable = val; 5725 memcg->oom_kill_disable = val;
5681 if (!val) 5726 if (!val)
5682 memcg_oom_recover(memcg); 5727 memcg_oom_recover(memcg);
5683 cgroup_unlock(); 5728 mutex_unlock(&memcg_create_mutex);
5684 return 0; 5729 return 0;
5685} 5730}
5686 5731
@@ -5795,33 +5840,6 @@ static struct cftype mem_cgroup_files[] = {
5795 .read_seq_string = memcg_numa_stat_show, 5840 .read_seq_string = memcg_numa_stat_show,
5796 }, 5841 },
5797#endif 5842#endif
5798#ifdef CONFIG_MEMCG_SWAP
5799 {
5800 .name = "memsw.usage_in_bytes",
5801 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5802 .read = mem_cgroup_read,
5803 .register_event = mem_cgroup_usage_register_event,
5804 .unregister_event = mem_cgroup_usage_unregister_event,
5805 },
5806 {
5807 .name = "memsw.max_usage_in_bytes",
5808 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5809 .trigger = mem_cgroup_reset,
5810 .read = mem_cgroup_read,
5811 },
5812 {
5813 .name = "memsw.limit_in_bytes",
5814 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5815 .write_string = mem_cgroup_write,
5816 .read = mem_cgroup_read,
5817 },
5818 {
5819 .name = "memsw.failcnt",
5820 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5821 .trigger = mem_cgroup_reset,
5822 .read = mem_cgroup_read,
5823 },
5824#endif
5825#ifdef CONFIG_MEMCG_KMEM 5843#ifdef CONFIG_MEMCG_KMEM
5826 { 5844 {
5827 .name = "kmem.limit_in_bytes", 5845 .name = "kmem.limit_in_bytes",
@@ -5856,6 +5874,36 @@ static struct cftype mem_cgroup_files[] = {
5856 { }, /* terminate */ 5874 { }, /* terminate */
5857}; 5875};
5858 5876
5877#ifdef CONFIG_MEMCG_SWAP
5878static struct cftype memsw_cgroup_files[] = {
5879 {
5880 .name = "memsw.usage_in_bytes",
5881 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5882 .read = mem_cgroup_read,
5883 .register_event = mem_cgroup_usage_register_event,
5884 .unregister_event = mem_cgroup_usage_unregister_event,
5885 },
5886 {
5887 .name = "memsw.max_usage_in_bytes",
5888 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5889 .trigger = mem_cgroup_reset,
5890 .read = mem_cgroup_read,
5891 },
5892 {
5893 .name = "memsw.limit_in_bytes",
5894 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5895 .write_string = mem_cgroup_write,
5896 .read = mem_cgroup_read,
5897 },
5898 {
5899 .name = "memsw.failcnt",
5900 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5901 .trigger = mem_cgroup_reset,
5902 .read = mem_cgroup_read,
5903 },
5904 { }, /* terminate */
5905};
5906#endif
5859static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 5907static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5860{ 5908{
5861 struct mem_cgroup_per_node *pn; 5909 struct mem_cgroup_per_node *pn;
@@ -5894,9 +5942,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5894static struct mem_cgroup *mem_cgroup_alloc(void) 5942static struct mem_cgroup *mem_cgroup_alloc(void)
5895{ 5943{
5896 struct mem_cgroup *memcg; 5944 struct mem_cgroup *memcg;
5897 int size = sizeof(struct mem_cgroup); 5945 size_t size = memcg_size();
5898 5946
5899 /* Can be very big if MAX_NUMNODES is very big */ 5947 /* Can be very big if nr_node_ids is very big */
5900 if (size < PAGE_SIZE) 5948 if (size < PAGE_SIZE)
5901 memcg = kzalloc(size, GFP_KERNEL); 5949 memcg = kzalloc(size, GFP_KERNEL);
5902 else 5950 else
@@ -5933,7 +5981,7 @@ out_free:
5933static void __mem_cgroup_free(struct mem_cgroup *memcg) 5981static void __mem_cgroup_free(struct mem_cgroup *memcg)
5934{ 5982{
5935 int node; 5983 int node;
5936 int size = sizeof(struct mem_cgroup); 5984 size_t size = memcg_size();
5937 5985
5938 mem_cgroup_remove_from_trees(memcg); 5986 mem_cgroup_remove_from_trees(memcg);
5939 free_css_id(&mem_cgroup_subsys, &memcg->css); 5987 free_css_id(&mem_cgroup_subsys, &memcg->css);
@@ -6015,19 +6063,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6015} 6063}
6016EXPORT_SYMBOL(parent_mem_cgroup); 6064EXPORT_SYMBOL(parent_mem_cgroup);
6017 6065
6018#ifdef CONFIG_MEMCG_SWAP 6066static void __init mem_cgroup_soft_limit_tree_init(void)
6019static void __init enable_swap_cgroup(void)
6020{
6021 if (!mem_cgroup_disabled() && really_do_swap_account)
6022 do_swap_account = 1;
6023}
6024#else
6025static void __init enable_swap_cgroup(void)
6026{
6027}
6028#endif
6029
6030static int mem_cgroup_soft_limit_tree_init(void)
6031{ 6067{
6032 struct mem_cgroup_tree_per_node *rtpn; 6068 struct mem_cgroup_tree_per_node *rtpn;
6033 struct mem_cgroup_tree_per_zone *rtpz; 6069 struct mem_cgroup_tree_per_zone *rtpz;
@@ -6038,8 +6074,7 @@ static int mem_cgroup_soft_limit_tree_init(void)
6038 if (!node_state(node, N_NORMAL_MEMORY)) 6074 if (!node_state(node, N_NORMAL_MEMORY))
6039 tmp = -1; 6075 tmp = -1;
6040 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 6076 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6041 if (!rtpn) 6077 BUG_ON(!rtpn);
6042 goto err_cleanup;
6043 6078
6044 soft_limit_tree.rb_tree_per_node[node] = rtpn; 6079 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6045 6080
@@ -6049,23 +6084,12 @@ static int mem_cgroup_soft_limit_tree_init(void)
6049 spin_lock_init(&rtpz->lock); 6084 spin_lock_init(&rtpz->lock);
6050 } 6085 }
6051 } 6086 }
6052 return 0;
6053
6054err_cleanup:
6055 for_each_node(node) {
6056 if (!soft_limit_tree.rb_tree_per_node[node])
6057 break;
6058 kfree(soft_limit_tree.rb_tree_per_node[node]);
6059 soft_limit_tree.rb_tree_per_node[node] = NULL;
6060 }
6061 return 1;
6062
6063} 6087}
6064 6088
6065static struct cgroup_subsys_state * __ref 6089static struct cgroup_subsys_state * __ref
6066mem_cgroup_css_alloc(struct cgroup *cont) 6090mem_cgroup_css_alloc(struct cgroup *cont)
6067{ 6091{
6068 struct mem_cgroup *memcg, *parent; 6092 struct mem_cgroup *memcg;
6069 long error = -ENOMEM; 6093 long error = -ENOMEM;
6070 int node; 6094 int node;
6071 6095
@@ -6079,24 +6103,44 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6079 6103
6080 /* root ? */ 6104 /* root ? */
6081 if (cont->parent == NULL) { 6105 if (cont->parent == NULL) {
6082 int cpu;
6083 enable_swap_cgroup();
6084 parent = NULL;
6085 if (mem_cgroup_soft_limit_tree_init())
6086 goto free_out;
6087 root_mem_cgroup = memcg; 6106 root_mem_cgroup = memcg;
6088 for_each_possible_cpu(cpu) { 6107 res_counter_init(&memcg->res, NULL);
6089 struct memcg_stock_pcp *stock = 6108 res_counter_init(&memcg->memsw, NULL);
6090 &per_cpu(memcg_stock, cpu); 6109 res_counter_init(&memcg->kmem, NULL);
6091 INIT_WORK(&stock->work, drain_local_stock);
6092 }
6093 } else {
6094 parent = mem_cgroup_from_cont(cont->parent);
6095 memcg->use_hierarchy = parent->use_hierarchy;
6096 memcg->oom_kill_disable = parent->oom_kill_disable;
6097 } 6110 }
6098 6111
6099 if (parent && parent->use_hierarchy) { 6112 memcg->last_scanned_node = MAX_NUMNODES;
6113 INIT_LIST_HEAD(&memcg->oom_notify);
6114 atomic_set(&memcg->refcnt, 1);
6115 memcg->move_charge_at_immigrate = 0;
6116 mutex_init(&memcg->thresholds_lock);
6117 spin_lock_init(&memcg->move_lock);
6118
6119 return &memcg->css;
6120
6121free_out:
6122 __mem_cgroup_free(memcg);
6123 return ERR_PTR(error);
6124}
6125
6126static int
6127mem_cgroup_css_online(struct cgroup *cont)
6128{
6129 struct mem_cgroup *memcg, *parent;
6130 int error = 0;
6131
6132 if (!cont->parent)
6133 return 0;
6134
6135 mutex_lock(&memcg_create_mutex);
6136 memcg = mem_cgroup_from_cont(cont);
6137 parent = mem_cgroup_from_cont(cont->parent);
6138
6139 memcg->use_hierarchy = parent->use_hierarchy;
6140 memcg->oom_kill_disable = parent->oom_kill_disable;
6141 memcg->swappiness = mem_cgroup_swappiness(parent);
6142
6143 if (parent->use_hierarchy) {
6100 res_counter_init(&memcg->res, &parent->res); 6144 res_counter_init(&memcg->res, &parent->res);
6101 res_counter_init(&memcg->memsw, &parent->memsw); 6145 res_counter_init(&memcg->memsw, &parent->memsw);
6102 res_counter_init(&memcg->kmem, &parent->kmem); 6146 res_counter_init(&memcg->kmem, &parent->kmem);
@@ -6117,20 +6161,12 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6117 * much sense so let cgroup subsystem know about this 6161 * much sense so let cgroup subsystem know about this
6118 * unfortunate state in our controller. 6162 * unfortunate state in our controller.
6119 */ 6163 */
6120 if (parent && parent != root_mem_cgroup) 6164 if (parent != root_mem_cgroup)
6121 mem_cgroup_subsys.broken_hierarchy = true; 6165 mem_cgroup_subsys.broken_hierarchy = true;
6122 } 6166 }
6123 memcg->last_scanned_node = MAX_NUMNODES;
6124 INIT_LIST_HEAD(&memcg->oom_notify);
6125
6126 if (parent)
6127 memcg->swappiness = mem_cgroup_swappiness(parent);
6128 atomic_set(&memcg->refcnt, 1);
6129 memcg->move_charge_at_immigrate = 0;
6130 mutex_init(&memcg->thresholds_lock);
6131 spin_lock_init(&memcg->move_lock);
6132 6167
6133 error = memcg_init_kmem(memcg, &mem_cgroup_subsys); 6168 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6169 mutex_unlock(&memcg_create_mutex);
6134 if (error) { 6170 if (error) {
6135 /* 6171 /*
6136 * We call put now because our (and parent's) refcnts 6172 * We call put now because our (and parent's) refcnts
@@ -6138,12 +6174,10 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6138 * call __mem_cgroup_free, so return directly 6174 * call __mem_cgroup_free, so return directly
6139 */ 6175 */
6140 mem_cgroup_put(memcg); 6176 mem_cgroup_put(memcg);
6141 return ERR_PTR(error); 6177 if (parent->use_hierarchy)
6178 mem_cgroup_put(parent);
6142 } 6179 }
6143 return &memcg->css; 6180 return error;
6144free_out:
6145 __mem_cgroup_free(memcg);
6146 return ERR_PTR(error);
6147} 6181}
6148 6182
6149static void mem_cgroup_css_offline(struct cgroup *cont) 6183static void mem_cgroup_css_offline(struct cgroup *cont)
@@ -6279,7 +6313,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6279 * Because lookup_swap_cache() updates some statistics counter, 6313 * Because lookup_swap_cache() updates some statistics counter,
6280 * we call find_get_page() with swapper_space directly. 6314 * we call find_get_page() with swapper_space directly.
6281 */ 6315 */
6282 page = find_get_page(&swapper_space, ent.val); 6316 page = find_get_page(swap_address_space(ent), ent.val);
6283 if (do_swap_account) 6317 if (do_swap_account)
6284 entry->val = ent.val; 6318 entry->val = ent.val;
6285 6319
@@ -6320,7 +6354,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
6320 swp_entry_t swap = radix_to_swp_entry(page); 6354 swp_entry_t swap = radix_to_swp_entry(page);
6321 if (do_swap_account) 6355 if (do_swap_account)
6322 *entry = swap; 6356 *entry = swap;
6323 page = find_get_page(&swapper_space, swap.val); 6357 page = find_get_page(swap_address_space(swap), swap.val);
6324 } 6358 }
6325#endif 6359#endif
6326 return page; 6360 return page;
@@ -6530,8 +6564,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
6530 struct task_struct *p = cgroup_taskset_first(tset); 6564 struct task_struct *p = cgroup_taskset_first(tset);
6531 int ret = 0; 6565 int ret = 0;
6532 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); 6566 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
6567 unsigned long move_charge_at_immigrate;
6533 6568
6534 if (memcg->move_charge_at_immigrate) { 6569 /*
6570 * We are now committed to this value, whatever it is. Changes in this
6571 * tunable will only affect upcoming migrations, not the current one.
6572 * So we need to save it, and keep it going.
6573 */
6574 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
6575 if (move_charge_at_immigrate) {
6535 struct mm_struct *mm; 6576 struct mm_struct *mm;
6536 struct mem_cgroup *from = mem_cgroup_from_task(p); 6577 struct mem_cgroup *from = mem_cgroup_from_task(p);
6537 6578
@@ -6551,6 +6592,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
6551 spin_lock(&mc.lock); 6592 spin_lock(&mc.lock);
6552 mc.from = from; 6593 mc.from = from;
6553 mc.to = memcg; 6594 mc.to = memcg;
6595 mc.immigrate_flags = move_charge_at_immigrate;
6554 spin_unlock(&mc.lock); 6596 spin_unlock(&mc.lock);
6555 /* We set mc.moving_task later */ 6597 /* We set mc.moving_task later */
6556 6598
@@ -6745,6 +6787,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
6745 .name = "memory", 6787 .name = "memory",
6746 .subsys_id = mem_cgroup_subsys_id, 6788 .subsys_id = mem_cgroup_subsys_id,
6747 .css_alloc = mem_cgroup_css_alloc, 6789 .css_alloc = mem_cgroup_css_alloc,
6790 .css_online = mem_cgroup_css_online,
6748 .css_offline = mem_cgroup_css_offline, 6791 .css_offline = mem_cgroup_css_offline,
6749 .css_free = mem_cgroup_css_free, 6792 .css_free = mem_cgroup_css_free,
6750 .can_attach = mem_cgroup_can_attach, 6793 .can_attach = mem_cgroup_can_attach,
@@ -6755,19 +6798,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
6755 .use_id = 1, 6798 .use_id = 1,
6756}; 6799};
6757 6800
6758/*
6759 * The rest of init is performed during ->css_alloc() for root css which
6760 * happens before initcalls. hotcpu_notifier() can't be done together as
6761 * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
6762 * dependency. Do it from a subsys_initcall().
6763 */
6764static int __init mem_cgroup_init(void)
6765{
6766 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6767 return 0;
6768}
6769subsys_initcall(mem_cgroup_init);
6770
6771#ifdef CONFIG_MEMCG_SWAP 6801#ifdef CONFIG_MEMCG_SWAP
6772static int __init enable_swap_account(char *s) 6802static int __init enable_swap_account(char *s)
6773{ 6803{
@@ -6780,4 +6810,39 @@ static int __init enable_swap_account(char *s)
6780} 6810}
6781__setup("swapaccount=", enable_swap_account); 6811__setup("swapaccount=", enable_swap_account);
6782 6812
6813static void __init memsw_file_init(void)
6814{
6815 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
6816}
6817
6818static void __init enable_swap_cgroup(void)
6819{
6820 if (!mem_cgroup_disabled() && really_do_swap_account) {
6821 do_swap_account = 1;
6822 memsw_file_init();
6823 }
6824}
6825
6826#else
6827static void __init enable_swap_cgroup(void)
6828{
6829}
6783#endif 6830#endif
6831
6832/*
6833 * subsys_initcall() for memory controller.
6834 *
6835 * Some parts like hotcpu_notifier() have to be initialized from this context
6836 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
6837 * everything that doesn't depend on a specific mem_cgroup structure should
6838 * be initialized from here.
6839 */
6840static int __init mem_cgroup_init(void)
6841{
6842 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6843 enable_swap_cgroup();
6844 mem_cgroup_soft_limit_tree_init();
6845 memcg_stock_init();
6846 return 0;
6847}
6848subsys_initcall(mem_cgroup_init);
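
One user-visible consequence of moving the memsw.* entries into memsw_cgroup_files is that they are registered only when swap accounting is actually enabled, so the files may be legitimately absent. A small userspace sketch of probing for them; the mount point and group name below are assumptions about a typical cgroup v1 setup:

#include <errno.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical path: adjust to your memory controller mount and group */
	const char *path =
		"/sys/fs/cgroup/memory/mygroup/memory.memsw.usage_in_bytes";
	FILE *f = fopen(path, "r");
	unsigned long long usage;

	if (!f) {
		if (errno == ENOENT)
			printf("swap accounting disabled, no memsw files\n");
		else
			perror("fopen");
		return 0;
	}
	if (fscanf(f, "%llu", &usage) == 1)
		printf("mem+swap usage: %llu bytes\n", usage);
	fclose(f);
	return 0;
}
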
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c6e4dd3e1c08..df0694c6adef 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0;
61 61
62int sysctl_memory_failure_recovery __read_mostly = 1; 62int sysctl_memory_failure_recovery __read_mostly = 1;
63 63
64atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 64atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
65 65
66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) 66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
67 67
@@ -784,12 +784,12 @@ static struct page_state {
784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, 784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, 785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
786 786
787 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
788 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
789
790 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, 787 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
791 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, 788 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
792 789
790 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
791 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
792
793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, 793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
794 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 794 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
795 795
@@ -1021,6 +1021,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1021 struct page *hpage; 1021 struct page *hpage;
1022 int res; 1022 int res;
1023 unsigned int nr_pages; 1023 unsigned int nr_pages;
1024 unsigned long page_flags;
1024 1025
1025 if (!sysctl_memory_failure_recovery) 1026 if (!sysctl_memory_failure_recovery)
1026 panic("Memory failure from trap %d on page %lx", trapno, pfn); 1027 panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -1039,8 +1040,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1039 return 0; 1040 return 0;
1040 } 1041 }
1041 1042
1042 nr_pages = 1 << compound_trans_order(hpage); 1043 /*
1043 atomic_long_add(nr_pages, &mce_bad_pages); 1044 * Currently errors on hugetlbfs pages are measured in hugepage units,
1045 * so nr_pages should be 1 << compound_order. OTOH when errors are on
1046 * transparent hugepages, they are supposed to be split and error
1047 * measurement is done in normal page units. So nr_pages should be one
1048 * in this case.
1049 */
1050 if (PageHuge(p))
1051 nr_pages = 1 << compound_order(hpage);
1052 else /* normal page or thp */
1053 nr_pages = 1;
1054 atomic_long_add(nr_pages, &num_poisoned_pages);
1044 1055
1045 /* 1056 /*
1046 * We need/can do nothing about count=0 pages. 1057 * We need/can do nothing about count=0 pages.
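
The nr_pages distinction above is only about the accounting unit: errors on hugetlbfs pages are counted per hugepage, while errors on transparent hugepages are counted per base page because the THP is split before handling. With the usual 4 KiB base pages and 2 MiB hugepages that works out as:

#include <stdio.h>

int main(void)
{
	unsigned int compound_order = 9;			/* 2 MiB / 4 KiB */
	unsigned int hugetlb_nr_pages = 1u << compound_order;	/* 512 pages */
	unsigned int thp_nr_pages = 1;				/* split first */

	printf("hugetlbfs error: %u pages, thp error: %u page\n",
	       hugetlb_nr_pages, thp_nr_pages);
	return 0;
}
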
@@ -1070,7 +1081,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1070 if (!PageHWPoison(hpage) 1081 if (!PageHWPoison(hpage)
1071 || (hwpoison_filter(p) && TestClearPageHWPoison(p)) 1082 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1072 || (p != hpage && TestSetPageHWPoison(hpage))) { 1083 || (p != hpage && TestSetPageHWPoison(hpage))) {
1073 atomic_long_sub(nr_pages, &mce_bad_pages); 1084 atomic_long_sub(nr_pages, &num_poisoned_pages);
1074 return 0; 1085 return 0;
1075 } 1086 }
1076 set_page_hwpoison_huge_page(hpage); 1087 set_page_hwpoison_huge_page(hpage);
@@ -1119,6 +1130,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1119 lock_page(hpage); 1130 lock_page(hpage);
1120 1131
1121 /* 1132 /*
1133 * We use page flags to determine what action should be taken, but
1134 * the flags can be modified by the error containment action. One
1135 * example is an mlocked page, where PG_mlocked is cleared by
1136 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
1137 * correctly, we save a copy of the page flags at this time.
1138 */
1139 page_flags = p->flags;
1140
1141 /*
1122 * unpoison always clear PG_hwpoison inside page lock 1142 * unpoison always clear PG_hwpoison inside page lock
1123 */ 1143 */
1124 if (!PageHWPoison(p)) { 1144 if (!PageHWPoison(p)) {
@@ -1128,7 +1148,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1128 } 1148 }
1129 if (hwpoison_filter(p)) { 1149 if (hwpoison_filter(p)) {
1130 if (TestClearPageHWPoison(p)) 1150 if (TestClearPageHWPoison(p))
1131 atomic_long_sub(nr_pages, &mce_bad_pages); 1151 atomic_long_sub(nr_pages, &num_poisoned_pages);
1132 unlock_page(hpage); 1152 unlock_page(hpage);
1133 put_page(hpage); 1153 put_page(hpage);
1134 return 0; 1154 return 0;
@@ -1176,12 +1196,19 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1176 } 1196 }
1177 1197
1178 res = -EBUSY; 1198 res = -EBUSY;
1179 for (ps = error_states;; ps++) { 1199 /*
1180 if ((p->flags & ps->mask) == ps->res) { 1200 * The first check uses the current page flags which may not have any
1181 res = page_action(ps, p, pfn); 1201 * relevant information. The second check with the saved page flagss is
1202 * carried out only if the first check can't determine the page status.
1203 */
1204 for (ps = error_states;; ps++)
1205 if ((p->flags & ps->mask) == ps->res)
1182 break; 1206 break;
1183 } 1207 if (!ps->mask)
1184 } 1208 for (ps = error_states;; ps++)
1209 if ((page_flags & ps->mask) == ps->res)
1210 break;
1211 res = page_action(ps, p, pfn);
1185out: 1212out:
1186 unlock_page(hpage); 1213 unlock_page(hpage);
1187 return res; 1214 return res;
@@ -1323,7 +1350,7 @@ int unpoison_memory(unsigned long pfn)
1323 return 0; 1350 return 0;
1324 } 1351 }
1325 if (TestClearPageHWPoison(p)) 1352 if (TestClearPageHWPoison(p))
1326 atomic_long_sub(nr_pages, &mce_bad_pages); 1353 atomic_long_sub(nr_pages, &num_poisoned_pages);
1327 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); 1354 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1328 return 0; 1355 return 0;
1329 } 1356 }
@@ -1337,7 +1364,7 @@ int unpoison_memory(unsigned long pfn)
1337 */ 1364 */
1338 if (TestClearPageHWPoison(page)) { 1365 if (TestClearPageHWPoison(page)) {
1339 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); 1366 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1340 atomic_long_sub(nr_pages, &mce_bad_pages); 1367 atomic_long_sub(nr_pages, &num_poisoned_pages);
1341 freeit = 1; 1368 freeit = 1;
1342 if (PageHuge(page)) 1369 if (PageHuge(page))
1343 clear_page_hwpoison_huge_page(page); 1370 clear_page_hwpoison_huge_page(page);
@@ -1368,7 +1395,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
1368 * that is not free, and 1 for any other page type. 1395 * that is not free, and 1 for any other page type.
1369 * For 1 the page is returned with increased page count, otherwise not. 1396 * For 1 the page is returned with increased page count, otherwise not.
1370 */ 1397 */
1371static int get_any_page(struct page *p, unsigned long pfn, int flags) 1398static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1372{ 1399{
1373 int ret; 1400 int ret;
1374 1401
@@ -1393,11 +1420,9 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1393 if (!get_page_unless_zero(compound_head(p))) { 1420 if (!get_page_unless_zero(compound_head(p))) {
1394 if (PageHuge(p)) { 1421 if (PageHuge(p)) {
1395 pr_info("%s: %#lx free huge page\n", __func__, pfn); 1422 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1396 ret = dequeue_hwpoisoned_huge_page(compound_head(p)); 1423 ret = 0;
1397 } else if (is_free_buddy_page(p)) { 1424 } else if (is_free_buddy_page(p)) {
1398 pr_info("%s: %#lx free buddy page\n", __func__, pfn); 1425 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1399 /* Set hwpoison bit while page is still isolated */
1400 SetPageHWPoison(p);
1401 ret = 0; 1426 ret = 0;
1402 } else { 1427 } else {
1403 pr_info("%s: %#lx: unknown zero refcount page type %lx\n", 1428 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
@@ -1413,43 +1438,68 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1413 return ret; 1438 return ret;
1414} 1439}
1415 1440
1441static int get_any_page(struct page *page, unsigned long pfn, int flags)
1442{
1443 int ret = __get_any_page(page, pfn, flags);
1444
1445 if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
1446 /*
1447 * Try to free it.
1448 */
1449 put_page(page);
1450 shake_page(page, 1);
1451
1452 /*
1453 * Did it turn free?
1454 */
1455 ret = __get_any_page(page, pfn, 0);
1456 if (!PageLRU(page)) {
1457 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1458 pfn, page->flags);
1459 return -EIO;
1460 }
1461 }
1462 return ret;
1463}
1464
1416static int soft_offline_huge_page(struct page *page, int flags) 1465static int soft_offline_huge_page(struct page *page, int flags)
1417{ 1466{
1418 int ret; 1467 int ret;
1419 unsigned long pfn = page_to_pfn(page); 1468 unsigned long pfn = page_to_pfn(page);
1420 struct page *hpage = compound_head(page); 1469 struct page *hpage = compound_head(page);
1421 1470
1422 ret = get_any_page(page, pfn, flags); 1471 /*
1423 if (ret < 0) 1472 * This double-check of PageHWPoison is to avoid the race with
1424 return ret; 1473 * memory_failure(). See also comment in __soft_offline_page().
1425 if (ret == 0) 1474 */
1426 goto done; 1475 lock_page(hpage);
1427
1428 if (PageHWPoison(hpage)) { 1476 if (PageHWPoison(hpage)) {
1477 unlock_page(hpage);
1429 put_page(hpage); 1478 put_page(hpage);
1430 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); 1479 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1431 return -EBUSY; 1480 return -EBUSY;
1432 } 1481 }
1482 unlock_page(hpage);
1433 1483
1434 /* Keep page count to indicate a given hugepage is isolated. */ 1484 /* Keep page count to indicate a given hugepage is isolated. */
1435 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, 1485 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL,
1436 MIGRATE_SYNC); 1486 MIGRATE_SYNC);
1437 put_page(hpage); 1487 put_page(hpage);
1438 if (ret) { 1488 if (ret) {
1439 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1489 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1440 pfn, ret, page->flags); 1490 pfn, ret, page->flags);
1441 return ret; 1491 } else {
1442 } 1492 set_page_hwpoison_huge_page(hpage);
1443done: 1493 dequeue_hwpoisoned_huge_page(hpage);
1444 if (!PageHWPoison(hpage))
1445 atomic_long_add(1 << compound_trans_order(hpage), 1494 atomic_long_add(1 << compound_trans_order(hpage),
1446 &mce_bad_pages); 1495 &num_poisoned_pages);
1447 set_page_hwpoison_huge_page(hpage); 1496 }
1448 dequeue_hwpoisoned_huge_page(hpage);
1449 /* keep elevated page count for bad page */ 1497 /* keep elevated page count for bad page */
1450 return ret; 1498 return ret;
1451} 1499}
1452 1500
1501static int __soft_offline_page(struct page *page, int flags);
1502
1453/** 1503/**
1454 * soft_offline_page - Soft offline a page. 1504 * soft_offline_page - Soft offline a page.
1455 * @page: page to offline 1505 * @page: page to offline
@@ -1478,9 +1528,11 @@ int soft_offline_page(struct page *page, int flags)
1478 unsigned long pfn = page_to_pfn(page); 1528 unsigned long pfn = page_to_pfn(page);
1479 struct page *hpage = compound_trans_head(page); 1529 struct page *hpage = compound_trans_head(page);
1480 1530
1481 if (PageHuge(page)) 1531 if (PageHWPoison(page)) {
1482 return soft_offline_huge_page(page, flags); 1532 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1483 if (PageTransHuge(hpage)) { 1533 return -EBUSY;
1534 }
1535 if (!PageHuge(page) && PageTransHuge(hpage)) {
1484 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { 1536 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1485 pr_info("soft offline: %#lx: failed to split THP\n", 1537 pr_info("soft offline: %#lx: failed to split THP\n",
1486 pfn); 1538 pfn);
@@ -1491,47 +1543,45 @@ int soft_offline_page(struct page *page, int flags)
1491 ret = get_any_page(page, pfn, flags); 1543 ret = get_any_page(page, pfn, flags);
1492 if (ret < 0) 1544 if (ret < 0)
1493 return ret; 1545 return ret;
1494 if (ret == 0) 1546 if (ret) { /* for in-use pages */
1495 goto done; 1547 if (PageHuge(page))
1496 1548 ret = soft_offline_huge_page(page, flags);
1497 /* 1549 else
1498 * Page cache page we can handle? 1550 ret = __soft_offline_page(page, flags);
1499 */ 1551 } else { /* for free pages */
1500 if (!PageLRU(page)) { 1552 if (PageHuge(page)) {
1501 /* 1553 set_page_hwpoison_huge_page(hpage);
1502 * Try to free it. 1554 dequeue_hwpoisoned_huge_page(hpage);
1503 */ 1555 atomic_long_add(1 << compound_trans_order(hpage),
1504 put_page(page); 1556 &num_poisoned_pages);
1505 shake_page(page, 1); 1557 } else {
1506 1558 SetPageHWPoison(page);
1507 /* 1559 atomic_long_inc(&num_poisoned_pages);
1508 * Did it turn free? 1560 }
1509 */
1510 ret = get_any_page(page, pfn, 0);
1511 if (ret < 0)
1512 return ret;
1513 if (ret == 0)
1514 goto done;
1515 }
1516 if (!PageLRU(page)) {
1517 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1518 pfn, page->flags);
1519 return -EIO;
1520 } 1561 }
1562 /* keep elevated page count for bad page */
1563 return ret;
1564}
1521 1565
1522 lock_page(page); 1566static int __soft_offline_page(struct page *page, int flags)
1523 wait_on_page_writeback(page); 1567{
1568 int ret;
1569 unsigned long pfn = page_to_pfn(page);
1524 1570
1525 /* 1571 /*
1526 * Synchronized using the page lock with memory_failure() 1572 * Check PageHWPoison again inside page lock because PageHWPoison
1573 * is set by memory_failure() outside page lock. Note that
1574 * memory_failure() also double-checks PageHWPoison inside page lock,
1575 * so there's no race between soft_offline_page() and memory_failure().
1527 */ 1576 */
1577 lock_page(page);
1578 wait_on_page_writeback(page);
1528 if (PageHWPoison(page)) { 1579 if (PageHWPoison(page)) {
1529 unlock_page(page); 1580 unlock_page(page);
1530 put_page(page); 1581 put_page(page);
1531 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1582 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1532 return -EBUSY; 1583 return -EBUSY;
1533 } 1584 }
1534
1535 /* 1585 /*
1536 * Try to invalidate first. This should work for 1586 * Try to invalidate first. This should work for
1537 * non dirty unmapped page cache pages. 1587 * non dirty unmapped page cache pages.
@@ -1544,9 +1594,10 @@ int soft_offline_page(struct page *page, int flags)
1544 */ 1594 */
1545 if (ret == 1) { 1595 if (ret == 1) {
1546 put_page(page); 1596 put_page(page);
1547 ret = 0;
1548 pr_info("soft_offline: %#lx: invalidated\n", pfn); 1597 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1549 goto done; 1598 SetPageHWPoison(page);
1599 atomic_long_inc(&num_poisoned_pages);
1600 return 0;
1550 } 1601 }
1551 1602
1552 /* 1603 /*
@@ -1563,28 +1614,23 @@ int soft_offline_page(struct page *page, int flags)
1563 if (!ret) { 1614 if (!ret) {
1564 LIST_HEAD(pagelist); 1615 LIST_HEAD(pagelist);
1565 inc_zone_page_state(page, NR_ISOLATED_ANON + 1616 inc_zone_page_state(page, NR_ISOLATED_ANON +
1566 page_is_file_cache(page)); 1617 page_is_file_cache(page));
1567 list_add(&page->lru, &pagelist); 1618 list_add(&page->lru, &pagelist);
1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1619 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1569 false, MIGRATE_SYNC, 1620 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1570 MR_MEMORY_FAILURE);
1571 if (ret) { 1621 if (ret) {
1572 putback_lru_pages(&pagelist); 1622 putback_lru_pages(&pagelist);
1573 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1623 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1574 pfn, ret, page->flags); 1624 pfn, ret, page->flags);
1575 if (ret > 0) 1625 if (ret > 0)
1576 ret = -EIO; 1626 ret = -EIO;
1627 } else {
1628 SetPageHWPoison(page);
1629 atomic_long_inc(&num_poisoned_pages);
1577 } 1630 }
1578 } else { 1631 } else {
1579 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1632 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1580 pfn, ret, page_count(page), page->flags); 1633 pfn, ret, page_count(page), page->flags);
1581 } 1634 }
1582 if (ret)
1583 return ret;
1584
1585done:
1586 atomic_long_add(1, &mce_bad_pages);
1587 SetPageHWPoison(page);
1588 /* keep elevated page count for bad page */
1589 return ret; 1635 return ret;
1590} 1636}
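
The hunk above splits soft-offline handling in two: soft_offline_page() only classifies the target via get_any_page() -- a positive return means an in-use page that gets migrated (soft_offline_huge_page() or __soft_offline_page()), zero means an already-free page that is simply marked poisoned -- while the invalidate/migrate work moves into __soft_offline_page(). A minimal userspace sketch of that dispatch shape, with every kernel helper replaced by an invented stub (nothing below is kernel code):

#include <stdio.h>

/* toy stand-ins for the kernel helpers used by the hunk above */
static int get_any_page(unsigned long pfn)        { return pfn & 1; } /* 1 = in use, 0 = free */
static int page_is_huge(unsigned long pfn)        { (void)pfn; return 0; }
static int soft_offline_huge(unsigned long pfn)   { (void)pfn; return 0; }
static int soft_offline_in_use(unsigned long pfn) { printf("migrate %#lx\n", pfn); return 0; }
static void mark_poisoned(unsigned long pfn)      { printf("poison %#lx\n", pfn); }

static int soft_offline_sketch(unsigned long pfn)
{
	int ret = get_any_page(pfn);

	if (ret < 0)
		return ret;
	if (ret) {			/* in-use page: migrate it away */
		if (page_is_huge(pfn))
			ret = soft_offline_huge(pfn);
		else
			ret = soft_offline_in_use(pfn);
	} else {			/* free page: just mark it poisoned */
		mark_poisoned(pfn);
	}
	return ret;
}

int main(void)
{
	soft_offline_sketch(0x1000);	/* even pfn: "free" in this toy model */
	soft_offline_sketch(0x1001);	/* odd pfn: "in use" */
	return 0;
}
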
diff --git a/mm/memory.c b/mm/memory.c
index bb1369f7b9b4..494526ae024a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,6 +69,10 @@
69 69
70#include "internal.h" 70#include "internal.h"
71 71
72#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
74#endif
75
72#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
73/* use the per-pgdat data instead for discontigmem - mbligh */ 77/* use the per-pgdat data instead for discontigmem - mbligh */
74unsigned long max_mapnr; 78unsigned long max_mapnr;
@@ -716,7 +720,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
716 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", 720 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
717 (unsigned long)vma->vm_file->f_op->mmap); 721 (unsigned long)vma->vm_file->f_op->mmap);
718 dump_stack(); 722 dump_stack();
719 add_taint(TAINT_BAD_PAGE); 723 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
720} 724}
721 725
722static inline bool is_cow_mapping(vm_flags_t flags) 726static inline bool is_cow_mapping(vm_flags_t flags)
@@ -1458,10 +1462,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1458EXPORT_SYMBOL_GPL(zap_vma_ptes); 1462EXPORT_SYMBOL_GPL(zap_vma_ptes);
1459 1463
1460/** 1464/**
1461 * follow_page - look up a page descriptor from a user-virtual address 1465 * follow_page_mask - look up a page descriptor from a user-virtual address
1462 * @vma: vm_area_struct mapping @address 1466 * @vma: vm_area_struct mapping @address
1463 * @address: virtual address to look up 1467 * @address: virtual address to look up
1464 * @flags: flags modifying lookup behaviour 1468 * @flags: flags modifying lookup behaviour
1469 * @page_mask: on output, *page_mask is set according to the size of the page
1465 * 1470 *
1466 * @flags can have FOLL_ flags set, defined in <linux/mm.h> 1471 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
1467 * 1472 *
@@ -1469,8 +1474,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
1469 * an error pointer if there is a mapping to something not represented 1474 * an error pointer if there is a mapping to something not represented
1470 * by a page descriptor (see also vm_normal_page()). 1475 * by a page descriptor (see also vm_normal_page()).
1471 */ 1476 */
1472struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1477struct page *follow_page_mask(struct vm_area_struct *vma,
1473 unsigned int flags) 1478 unsigned long address, unsigned int flags,
1479 unsigned int *page_mask)
1474{ 1480{
1475 pgd_t *pgd; 1481 pgd_t *pgd;
1476 pud_t *pud; 1482 pud_t *pud;
@@ -1480,6 +1486,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1480 struct page *page; 1486 struct page *page;
1481 struct mm_struct *mm = vma->vm_mm; 1487 struct mm_struct *mm = vma->vm_mm;
1482 1488
1489 *page_mask = 0;
1490
1483 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 1491 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1484 if (!IS_ERR(page)) { 1492 if (!IS_ERR(page)) {
1485 BUG_ON(flags & FOLL_GET); 1493 BUG_ON(flags & FOLL_GET);
@@ -1526,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1526 page = follow_trans_huge_pmd(vma, address, 1534 page = follow_trans_huge_pmd(vma, address,
1527 pmd, flags); 1535 pmd, flags);
1528 spin_unlock(&mm->page_table_lock); 1536 spin_unlock(&mm->page_table_lock);
1537 *page_mask = HPAGE_PMD_NR - 1;
1529 goto out; 1538 goto out;
1530 } 1539 }
1531 } else 1540 } else
@@ -1539,8 +1548,24 @@ split_fallthrough:
1539 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 1548 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1540 1549
1541 pte = *ptep; 1550 pte = *ptep;
1542 if (!pte_present(pte)) 1551 if (!pte_present(pte)) {
1543 goto no_page; 1552 swp_entry_t entry;
1553 /*
1554 * KSM's break_ksm() relies upon recognizing a ksm page
1555 * even while it is being migrated, so for that case we
1556 * need migration_entry_wait().
1557 */
1558 if (likely(!(flags & FOLL_MIGRATION)))
1559 goto no_page;
1560 if (pte_none(pte) || pte_file(pte))
1561 goto no_page;
1562 entry = pte_to_swp_entry(pte);
1563 if (!is_migration_entry(entry))
1564 goto no_page;
1565 pte_unmap_unlock(ptep, ptl);
1566 migration_entry_wait(mm, pmd, address);
1567 goto split_fallthrough;
1568 }
1544 if ((flags & FOLL_NUMA) && pte_numa(pte)) 1569 if ((flags & FOLL_NUMA) && pte_numa(pte))
1545 goto no_page; 1570 goto no_page;
1546 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1571 if ((flags & FOLL_WRITE) && !pte_write(pte))
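
The new FOLL_MIGRATION handling above lets a follow_page_mask() caller wait out a migration entry instead of failing: a non-present pte is decoded, and if it is a migration entry the page table lock is dropped, migration_entry_wait() sleeps, and the lookup restarts at split_fallthrough. A toy model of that retry loop, with an invented struct toy_pte standing in for the pte/swap-entry decoding (the FOLL_MIGRATION value below is arbitrary, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define FOLL_MIGRATION 0x1	/* toy flag mirroring the new semantics */

struct toy_pte {
	bool present;
	bool migration_entry;	/* stands in for is_migration_entry(pte_to_swp_entry()) */
};

/* A not-present pte that is a migration entry is waited on (here: flipped to
 * present) and the lookup retried, instead of failing outright. */
static const char *toy_follow(struct toy_pte *pte, unsigned int flags)
{
retry:
	if (!pte->present) {
		if (!(flags & FOLL_MIGRATION))
			return "no page";
		if (!pte->migration_entry)
			return "no page";
		/* migration_entry_wait() would sleep here until migration finishes */
		pte->present = true;
		pte->migration_entry = false;
		goto retry;
	}
	return "page";
}

int main(void)
{
	struct toy_pte under_migration = { .present = false, .migration_entry = true };

	printf("without FOLL_MIGRATION: %s\n", toy_follow(&under_migration, 0));
	under_migration = (struct toy_pte){ false, true };
	printf("with FOLL_MIGRATION:    %s\n", toy_follow(&under_migration, FOLL_MIGRATION));
	return 0;
}
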
@@ -1673,15 +1698,16 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
1673 * instead of __get_user_pages. __get_user_pages should be used only if 1698 * instead of __get_user_pages. __get_user_pages should be used only if
1674 * you need some special @gup_flags. 1699 * you need some special @gup_flags.
1675 */ 1700 */
1676int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1701long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1677 unsigned long start, int nr_pages, unsigned int gup_flags, 1702 unsigned long start, unsigned long nr_pages,
1678 struct page **pages, struct vm_area_struct **vmas, 1703 unsigned int gup_flags, struct page **pages,
1679 int *nonblocking) 1704 struct vm_area_struct **vmas, int *nonblocking)
1680{ 1705{
1681 int i; 1706 long i;
1682 unsigned long vm_flags; 1707 unsigned long vm_flags;
1708 unsigned int page_mask;
1683 1709
1684 if (nr_pages <= 0) 1710 if (!nr_pages)
1685 return 0; 1711 return 0;
1686 1712
1687 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); 1713 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
@@ -1757,6 +1783,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1757 get_page(page); 1783 get_page(page);
1758 } 1784 }
1759 pte_unmap(pte); 1785 pte_unmap(pte);
1786 page_mask = 0;
1760 goto next_page; 1787 goto next_page;
1761 } 1788 }
1762 1789
@@ -1774,6 +1801,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1774 do { 1801 do {
1775 struct page *page; 1802 struct page *page;
1776 unsigned int foll_flags = gup_flags; 1803 unsigned int foll_flags = gup_flags;
1804 unsigned int page_increm;
1777 1805
1778 /* 1806 /*
1779 * If we have a pending SIGKILL, don't keep faulting 1807 * If we have a pending SIGKILL, don't keep faulting
@@ -1783,7 +1811,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1783 return i ? i : -ERESTARTSYS; 1811 return i ? i : -ERESTARTSYS;
1784 1812
1785 cond_resched(); 1813 cond_resched();
1786 while (!(page = follow_page(vma, start, foll_flags))) { 1814 while (!(page = follow_page_mask(vma, start,
1815 foll_flags, &page_mask))) {
1787 int ret; 1816 int ret;
1788 unsigned int fault_flags = 0; 1817 unsigned int fault_flags = 0;
1789 1818
@@ -1857,13 +1886,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1857 1886
1858 flush_anon_page(vma, page, start); 1887 flush_anon_page(vma, page, start);
1859 flush_dcache_page(page); 1888 flush_dcache_page(page);
1889 page_mask = 0;
1860 } 1890 }
1861next_page: 1891next_page:
1862 if (vmas) 1892 if (vmas) {
1863 vmas[i] = vma; 1893 vmas[i] = vma;
1864 i++; 1894 page_mask = 0;
1865 start += PAGE_SIZE; 1895 }
1866 nr_pages--; 1896 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
1897 if (page_increm > nr_pages)
1898 page_increm = nr_pages;
1899 i += page_increm;
1900 start += page_increm * PAGE_SIZE;
1901 nr_pages -= page_increm;
1867 } while (nr_pages && start < vma->vm_end); 1902 } while (nr_pages && start < vma->vm_end);
1868 } while (nr_pages); 1903 } while (nr_pages);
1869 return i; 1904 return i;
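
The page_increm arithmetic above advances the gup cursor over a whole huge page in one step: page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask) is the number of base pages from the current one to the end of the huge page, and collapses to 1 when page_mask is 0. A small standalone check of that formula, assuming 4K pages and a 512-page PMD-sized THP:

#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed 4K pages */
#define HPAGE_PMD_NR	512	/* assumed 2M transparent huge page */

/* Number of base pages the caller may skip in one go: everything from the
 * current page to the end of the huge page. */
static unsigned long page_increm(unsigned long start, unsigned long page_mask)
{
	return 1 + (~(start >> PAGE_SHIFT) & page_mask);
}

int main(void)
{
	unsigned long page_mask = HPAGE_PMD_NR - 1;	/* 511 for a PMD-sized THP */
	unsigned long base = 0x200000;			/* 2M-aligned start of the THP */

	/* at the head page: all 512 subpages are covered at once */
	printf("%lu\n", page_increm(base, page_mask));			/* 512 */
	/* starting 4 pages into the THP: 508 subpages remain */
	printf("%lu\n", page_increm(base + 4 * 4096, page_mask));	/* 508 */
	/* page_mask == 0 (normal page): advance one page, as before */
	printf("%lu\n", page_increm(base, 0));				/* 1 */
	return 0;
}
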
@@ -1977,9 +2012,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1977 * 2012 *
1978 * See also get_user_pages_fast, for performance critical applications. 2013 * See also get_user_pages_fast, for performance critical applications.
1979 */ 2014 */
1980int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 2015long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1981 unsigned long start, int nr_pages, int write, int force, 2016 unsigned long start, unsigned long nr_pages, int write,
1982 struct page **pages, struct vm_area_struct **vmas) 2017 int force, struct page **pages, struct vm_area_struct **vmas)
1983{ 2018{
1984 int flags = FOLL_TOUCH; 2019 int flags = FOLL_TOUCH;
1985 2020
@@ -2919,7 +2954,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2919 unsigned int flags, pte_t orig_pte) 2954 unsigned int flags, pte_t orig_pte)
2920{ 2955{
2921 spinlock_t *ptl; 2956 spinlock_t *ptl;
2922 struct page *page, *swapcache = NULL; 2957 struct page *page, *swapcache;
2923 swp_entry_t entry; 2958 swp_entry_t entry;
2924 pte_t pte; 2959 pte_t pte;
2925 int locked; 2960 int locked;
@@ -2970,9 +3005,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2970 */ 3005 */
2971 ret = VM_FAULT_HWPOISON; 3006 ret = VM_FAULT_HWPOISON;
2972 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3007 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3008 swapcache = page;
2973 goto out_release; 3009 goto out_release;
2974 } 3010 }
2975 3011
3012 swapcache = page;
2976 locked = lock_page_or_retry(page, mm, flags); 3013 locked = lock_page_or_retry(page, mm, flags);
2977 3014
2978 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3015 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2990,16 +3027,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2990 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 3027 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2991 goto out_page; 3028 goto out_page;
2992 3029
2993 if (ksm_might_need_to_copy(page, vma, address)) { 3030 page = ksm_might_need_to_copy(page, vma, address);
2994 swapcache = page; 3031 if (unlikely(!page)) {
2995 page = ksm_does_need_to_copy(page, vma, address); 3032 ret = VM_FAULT_OOM;
2996 3033 page = swapcache;
2997 if (unlikely(!page)) { 3034 goto out_page;
2998 ret = VM_FAULT_OOM;
2999 page = swapcache;
3000 swapcache = NULL;
3001 goto out_page;
3002 }
3003 } 3035 }
3004 3036
3005 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 3037 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -3044,7 +3076,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
3044 } 3076 }
3045 flush_icache_page(vma, page); 3077 flush_icache_page(vma, page);
3046 set_pte_at(mm, address, page_table, pte); 3078 set_pte_at(mm, address, page_table, pte);
3047 do_page_add_anon_rmap(page, vma, address, exclusive); 3079 if (page == swapcache)
3080 do_page_add_anon_rmap(page, vma, address, exclusive);
3081 else /* ksm created a completely new copy */
3082 page_add_new_anon_rmap(page, vma, address);
3048 /* It's better to call commit-charge after rmap is established */ 3083 /* It's better to call commit-charge after rmap is established */
3049 mem_cgroup_commit_charge_swapin(page, ptr); 3084 mem_cgroup_commit_charge_swapin(page, ptr);
3050 3085
@@ -3052,7 +3087,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
3052 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 3087 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3053 try_to_free_swap(page); 3088 try_to_free_swap(page);
3054 unlock_page(page); 3089 unlock_page(page);
3055 if (swapcache) { 3090 if (page != swapcache) {
3056 /* 3091 /*
3057 * Hold the lock to avoid the swap entry to be reused 3092 * Hold the lock to avoid the swap entry to be reused
3058 * until we take the PT lock for the pte_same() check 3093 * until we take the PT lock for the pte_same() check
@@ -3085,7 +3120,7 @@ out_page:
3085 unlock_page(page); 3120 unlock_page(page);
3086out_release: 3121out_release:
3087 page_cache_release(page); 3122 page_cache_release(page);
3088 if (swapcache) { 3123 if (page != swapcache) {
3089 unlock_page(swapcache); 3124 unlock_page(swapcache);
3090 page_cache_release(swapcache); 3125 page_cache_release(swapcache);
3091 } 3126 }
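
The do_swap_page() changes above replace the separate swapcache-tracking variable with a calling convention: ksm_might_need_to_copy() now returns either the page it was given or a freshly allocated copy (NULL on OOM), and the caller tells the two apart with a plain pointer comparison (page != swapcache). A toy userspace illustration of that convention; might_need_to_copy() below is an invented stand-in, not the kernel helper:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Either hands back the page it was given or allocates a private copy
 * (NULL on allocation failure). */
static char *might_need_to_copy(char *page, int needs_copy)
{
	char *copy;

	if (!needs_copy)
		return page;
	copy = malloc(strlen(page) + 1);
	if (!copy)
		return NULL;
	strcpy(copy, page);
	return copy;
}

int main(void)
{
	char swapcache_data[] = "swapcache page";
	char *swapcache = swapcache_data;
	char *page = might_need_to_copy(swapcache, 1);

	if (!page) {		/* the OOM path restores page = swapcache and bails out */
		page = swapcache;
		return 1;
	}
	/* no separate flag: a pointer comparison says whether a copy was made */
	if (page != swapcache) {
		printf("got a private copy, release the swapcache reference\n");
		free(page);
	} else {
		printf("still using the swapcache page\n");
	}
	return 0;
}
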
@@ -3821,30 +3856,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3821} 3856}
3822#endif /* __PAGETABLE_PMD_FOLDED */ 3857#endif /* __PAGETABLE_PMD_FOLDED */
3823 3858
3824int make_pages_present(unsigned long addr, unsigned long end)
3825{
3826 int ret, len, write;
3827 struct vm_area_struct * vma;
3828
3829 vma = find_vma(current->mm, addr);
3830 if (!vma)
3831 return -ENOMEM;
3832 /*
3833 * We want to touch writable mappings with a write fault in order
3834 * to break COW, except for shared mappings because these don't COW
3835 * and we would not want to dirty them for nothing.
3836 */
3837 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3838 BUG_ON(addr >= end);
3839 BUG_ON(end > vma->vm_end);
3840 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3841 ret = get_user_pages(current, current->mm, addr,
3842 len, write, 0, NULL, NULL);
3843 if (ret < 0)
3844 return ret;
3845 return ret == len ? 0 : -EFAULT;
3846}
3847
3848#if !defined(__HAVE_ARCH_GATE_AREA) 3859#if !defined(__HAVE_ARCH_GATE_AREA)
3849 3860
3850#if defined(AT_SYSINFO_EHDR) 3861#if defined(AT_SYSINFO_EHDR)
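
The removed make_pages_present() shows the page-count arithmetic its callers now have to do themselves before calling get_user_pages(): round the end address up to a page boundary and subtract the index of the start page. A small standalone check of that DIV_ROUND_UP expression, assuming 4K pages:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Page count the removed helper passed to get_user_pages() for [addr, end). */
static unsigned long pages_spanned(unsigned long addr, unsigned long end)
{
	return DIV_ROUND_UP(end, PAGE_SIZE) - addr / PAGE_SIZE;
}

int main(void)
{
	/* one byte past a page boundary still counts the extra page */
	printf("%lu\n", pages_spanned(0x1000, 0x3001));	/* 3 */
	/* exactly page-aligned range */
	printf("%lu\n", pages_spanned(0x1000, 0x3000));	/* 2 */
	return 0;
}
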
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d04ed87bfacb..b81a367b9f39 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,7 @@
29#include <linux/suspend.h> 29#include <linux/suspend.h>
30#include <linux/mm_inline.h> 30#include <linux/mm_inline.h>
31#include <linux/firmware-map.h> 31#include <linux/firmware-map.h>
32#include <linux/stop_machine.h>
32 33
33#include <asm/tlbflush.h> 34#include <asm/tlbflush.h>
34 35
@@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res)
91} 92}
92 93
93#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 94#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
94#ifndef CONFIG_SPARSEMEM_VMEMMAP 95void get_page_bootmem(unsigned long info, struct page *page,
95static void get_page_bootmem(unsigned long info, struct page *page, 96 unsigned long type)
96 unsigned long type)
97{ 97{
98 page->lru.next = (struct list_head *) type; 98 page->lru.next = (struct list_head *) type;
99 SetPagePrivate(page); 99 SetPagePrivate(page);
@@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page)
124 mutex_lock(&ppb_lock); 124 mutex_lock(&ppb_lock);
125 __free_pages_bootmem(page, 0); 125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock); 126 mutex_unlock(&ppb_lock);
127 totalram_pages++;
127 } 128 }
128 129
129} 130}
130 131
132#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
133#ifndef CONFIG_SPARSEMEM_VMEMMAP
131static void register_page_bootmem_info_section(unsigned long start_pfn) 134static void register_page_bootmem_info_section(unsigned long start_pfn)
132{ 135{
133 unsigned long *usemap, mapsize, section_nr, i; 136 unsigned long *usemap, mapsize, section_nr, i;
@@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
161 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 164 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
162 165
163} 166}
167#else /* CONFIG_SPARSEMEM_VMEMMAP */
168static void register_page_bootmem_info_section(unsigned long start_pfn)
169{
170 unsigned long *usemap, mapsize, section_nr, i;
171 struct mem_section *ms;
172 struct page *page, *memmap;
173
174 if (!pfn_valid(start_pfn))
175 return;
176
177 section_nr = pfn_to_section_nr(start_pfn);
178 ms = __nr_to_section(section_nr);
179
180 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
181
182 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
183
184 usemap = __nr_to_section(section_nr)->pageblock_flags;
185 page = virt_to_page(usemap);
186
187 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
188
189 for (i = 0; i < mapsize; i++, page++)
190 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
191}
192#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
164 193
165void register_page_bootmem_info_node(struct pglist_data *pgdat) 194void register_page_bootmem_info_node(struct pglist_data *pgdat)
166{ 195{
@@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
189 } 218 }
190 219
191 pfn = pgdat->node_start_pfn; 220 pfn = pgdat->node_start_pfn;
192 end_pfn = pfn + pgdat->node_spanned_pages; 221 end_pfn = pgdat_end_pfn(pgdat);
193 222
194 /* register_section info */ 223 /* register_section info */
195 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 224 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
@@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
203 register_page_bootmem_info_section(pfn); 232 register_page_bootmem_info_section(pfn);
204 } 233 }
205} 234}
206#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 235#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
207 236
208static void grow_zone_span(struct zone *zone, unsigned long start_pfn, 237static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
209 unsigned long end_pfn) 238 unsigned long end_pfn)
@@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
253 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 282 set_page_links(pfn_to_page(pfn), zid, nid, pfn);
254} 283}
255 284
285/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
286 * alloc_bootmem_node_nopanic() */
287static int __ref ensure_zone_is_initialized(struct zone *zone,
288 unsigned long start_pfn, unsigned long num_pages)
289{
290 if (!zone_is_initialized(zone))
291 return init_currently_empty_zone(zone, start_pfn, num_pages,
292 MEMMAP_HOTPLUG);
293 return 0;
294}
295
256static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 296static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
257 unsigned long start_pfn, unsigned long end_pfn) 297 unsigned long start_pfn, unsigned long end_pfn)
258{ 298{
@@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
260 unsigned long flags; 300 unsigned long flags;
261 unsigned long z1_start_pfn; 301 unsigned long z1_start_pfn;
262 302
263 if (!z1->wait_table) { 303 ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
264 ret = init_currently_empty_zone(z1, start_pfn, 304 if (ret)
265 end_pfn - start_pfn, MEMMAP_HOTPLUG); 305 return ret;
266 if (ret)
267 return ret;
268 }
269 306
270 pgdat_resize_lock(z1->zone_pgdat, &flags); 307 pgdat_resize_lock(z1->zone_pgdat, &flags);
271 308
272 /* can't move pfns which are higher than @z2 */ 309 /* can't move pfns which are higher than @z2 */
273 if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) 310 if (end_pfn > zone_end_pfn(z2))
274 goto out_fail; 311 goto out_fail;
275 /* the move out part mast at the left most of @z2 */ 312 /* the move out part mast at the left most of @z2 */
276 if (start_pfn > z2->zone_start_pfn) 313 if (start_pfn > z2->zone_start_pfn)
@@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
286 z1_start_pfn = start_pfn; 323 z1_start_pfn = start_pfn;
287 324
288 resize_zone(z1, z1_start_pfn, end_pfn); 325 resize_zone(z1, z1_start_pfn, end_pfn);
289 resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); 326 resize_zone(z2, end_pfn, zone_end_pfn(z2));
290 327
291 pgdat_resize_unlock(z1->zone_pgdat, &flags); 328 pgdat_resize_unlock(z1->zone_pgdat, &flags);
292 329
@@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
305 unsigned long flags; 342 unsigned long flags;
306 unsigned long z2_end_pfn; 343 unsigned long z2_end_pfn;
307 344
308 if (!z2->wait_table) { 345 ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
309 ret = init_currently_empty_zone(z2, start_pfn, 346 if (ret)
310 end_pfn - start_pfn, MEMMAP_HOTPLUG); 347 return ret;
311 if (ret)
312 return ret;
313 }
314 348
315 pgdat_resize_lock(z1->zone_pgdat, &flags); 349 pgdat_resize_lock(z1->zone_pgdat, &flags);
316 350
@@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
318 if (z1->zone_start_pfn > start_pfn) 352 if (z1->zone_start_pfn > start_pfn)
319 goto out_fail; 353 goto out_fail;
320 /* the move out part mast at the right most of @z1 */ 354 /* the move out part mast at the right most of @z1 */
321 if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) 355 if (zone_end_pfn(z1) > end_pfn)
322 goto out_fail; 356 goto out_fail;
323 /* must included/overlap */ 357 /* must included/overlap */
324 if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) 358 if (start_pfn >= zone_end_pfn(z1))
325 goto out_fail; 359 goto out_fail;
326 360
327 /* use end_pfn for z2's end_pfn if z2 is empty */ 361 /* use end_pfn for z2's end_pfn if z2 is empty */
328 if (z2->spanned_pages) 362 if (z2->spanned_pages)
329 z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; 363 z2_end_pfn = zone_end_pfn(z2);
330 else 364 else
331 z2_end_pfn = end_pfn; 365 z2_end_pfn = end_pfn;
332 366
@@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
363 int nid = pgdat->node_id; 397 int nid = pgdat->node_id;
364 int zone_type; 398 int zone_type;
365 unsigned long flags; 399 unsigned long flags;
400 int ret;
366 401
367 zone_type = zone - pgdat->node_zones; 402 zone_type = zone - pgdat->node_zones;
368 if (!zone->wait_table) { 403 ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
369 int ret; 404 if (ret)
405 return ret;
370 406
371 ret = init_currently_empty_zone(zone, phys_start_pfn,
372 nr_pages, MEMMAP_HOTPLUG);
373 if (ret)
374 return ret;
375 }
376 pgdat_resize_lock(zone->zone_pgdat, &flags); 407 pgdat_resize_lock(zone->zone_pgdat, &flags);
377 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 408 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
378 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 409 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
@@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone,
405 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 436 return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
406} 437}
407 438
408#ifdef CONFIG_SPARSEMEM_VMEMMAP 439/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
409static int __remove_section(struct zone *zone, struct mem_section *ms) 440static int find_smallest_section_pfn(int nid, struct zone *zone,
441 unsigned long start_pfn,
442 unsigned long end_pfn)
443{
444 struct mem_section *ms;
445
446 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
447 ms = __pfn_to_section(start_pfn);
448
449 if (unlikely(!valid_section(ms)))
450 continue;
451
452 if (unlikely(pfn_to_nid(start_pfn) != nid))
453 continue;
454
455 if (zone && zone != page_zone(pfn_to_page(start_pfn)))
456 continue;
457
458 return start_pfn;
459 }
460
461 return 0;
462}
463
464/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
465static int find_biggest_section_pfn(int nid, struct zone *zone,
466 unsigned long start_pfn,
467 unsigned long end_pfn)
468{
469 struct mem_section *ms;
470 unsigned long pfn;
471
472 /* pfn is the end pfn of a memory section. */
473 pfn = end_pfn - 1;
474 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
475 ms = __pfn_to_section(pfn);
476
477 if (unlikely(!valid_section(ms)))
478 continue;
479
480 if (unlikely(pfn_to_nid(pfn) != nid))
481 continue;
482
483 if (zone && zone != page_zone(pfn_to_page(pfn)))
484 continue;
485
486 return pfn;
487 }
488
489 return 0;
490}
491
492static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
493 unsigned long end_pfn)
410{ 494{
495 unsigned long zone_start_pfn = zone->zone_start_pfn;
496 unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
497 unsigned long pfn;
498 struct mem_section *ms;
499 int nid = zone_to_nid(zone);
500
501 zone_span_writelock(zone);
502 if (zone_start_pfn == start_pfn) {
503 /*
504 * If the section is the smallest section in the zone, we need to
505 * shrink zone->zone_start_pfn and zone->spanned_pages.
506 * In this case, we find the second smallest valid mem_section
507 * for shrinking the zone.
508 */
509 pfn = find_smallest_section_pfn(nid, zone, end_pfn,
510 zone_end_pfn);
511 if (pfn) {
512 zone->zone_start_pfn = pfn;
513 zone->spanned_pages = zone_end_pfn - pfn;
514 }
515 } else if (zone_end_pfn == end_pfn) {
516 /*
517 * If the section is the biggest section in the zone, we need to
518 * shrink zone->spanned_pages.
519 * In this case, we find the second biggest valid mem_section for
520 * shrinking the zone.
521 */
522 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
523 start_pfn);
524 if (pfn)
525 zone->spanned_pages = pfn - zone_start_pfn + 1;
526 }
527
411 /* 528 /*
412 * XXX: Freeing memmap with vmemmap is not implement yet. 529 * The section is not biggest or smallest mem_section in the zone, it
413 * This should be removed later. 530 * only creates a hole in the zone. So in this case, we need not
531 * change the zone. But perhaps, the zone has only hole data. Thus
532 * it checks whether the zone has only holes or not.
414 */ 533 */
415 return -EBUSY; 534 pfn = zone_start_pfn;
535 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
536 ms = __pfn_to_section(pfn);
537
538 if (unlikely(!valid_section(ms)))
539 continue;
540
541 if (page_zone(pfn_to_page(pfn)) != zone)
542 continue;
543
544 /* If the section is current section, it continues the loop */
545 if (start_pfn == pfn)
546 continue;
547
548 /* If we find valid section, we have nothing to do */
549 zone_span_writeunlock(zone);
550 return;
551 }
552
553 /* The zone has no valid section */
554 zone->zone_start_pfn = 0;
555 zone->spanned_pages = 0;
556 zone_span_writeunlock(zone);
416} 557}
417#else 558
418static int __remove_section(struct zone *zone, struct mem_section *ms) 559static void shrink_pgdat_span(struct pglist_data *pgdat,
560 unsigned long start_pfn, unsigned long end_pfn)
561{
562 unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
563 unsigned long pgdat_end_pfn =
564 pgdat->node_start_pfn + pgdat->node_spanned_pages;
565 unsigned long pfn;
566 struct mem_section *ms;
567 int nid = pgdat->node_id;
568
569 if (pgdat_start_pfn == start_pfn) {
570 /*
571 * If the section is the smallest section in the pgdat, we need to
572 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
573 * In this case, we find the second smallest valid mem_section
574 * for shrinking the pgdat.
575 */
576 pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
577 pgdat_end_pfn);
578 if (pfn) {
579 pgdat->node_start_pfn = pfn;
580 pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
581 }
582 } else if (pgdat_end_pfn == end_pfn) {
583 /*
584 * If the section is the biggest section in the pgdat, we need to
585 * shrink pgdat->node_spanned_pages.
586 * In this case, we find the second biggest valid mem_section for
587 * shrinking the pgdat.
588 */
589 pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
590 start_pfn);
591 if (pfn)
592 pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
593 }
594
595 /*
596 * If the section is neither the biggest nor the smallest mem_section
597 * in the pgdat, it only creates a hole in the pgdat, so we need not
598 * change the pgdat's span.
599 * But perhaps the pgdat consists of holes only, so check whether
600 * any valid section remains.
601 */
602 pfn = pgdat_start_pfn;
603 for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
604 ms = __pfn_to_section(pfn);
605
606 if (unlikely(!valid_section(ms)))
607 continue;
608
609 if (pfn_to_nid(pfn) != nid)
610 continue;
611
612 /* If the section is current section, it continues the loop */
613 if (start_pfn == pfn)
614 continue;
615
616 /* If we find valid section, we have nothing to do */
617 return;
618 }
619
620 /* The pgdat has no valid section */
621 pgdat->node_start_pfn = 0;
622 pgdat->node_spanned_pages = 0;
623}
624
625static void __remove_zone(struct zone *zone, unsigned long start_pfn)
419{ 626{
420 unsigned long flags;
421 struct pglist_data *pgdat = zone->zone_pgdat; 627 struct pglist_data *pgdat = zone->zone_pgdat;
628 int nr_pages = PAGES_PER_SECTION;
629 int zone_type;
630 unsigned long flags;
631
632 zone_type = zone - pgdat->node_zones;
633
634 pgdat_resize_lock(zone->zone_pgdat, &flags);
635 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
636 shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
637 pgdat_resize_unlock(zone->zone_pgdat, &flags);
638}
639
640static int __remove_section(struct zone *zone, struct mem_section *ms)
641{
642 unsigned long start_pfn;
643 int scn_nr;
422 int ret = -EINVAL; 644 int ret = -EINVAL;
423 645
424 if (!valid_section(ms)) 646 if (!valid_section(ms))
@@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
428 if (ret) 650 if (ret)
429 return ret; 651 return ret;
430 652
431 pgdat_resize_lock(pgdat, &flags); 653 scn_nr = __section_nr(ms);
654 start_pfn = section_nr_to_pfn(scn_nr);
655 __remove_zone(zone, start_pfn);
656
432 sparse_remove_one_section(zone, ms); 657 sparse_remove_one_section(zone, ms);
433 pgdat_resize_unlock(pgdat, &flags);
434 return 0; 658 return 0;
435} 659}
436#endif
437 660
438/* 661/*
439 * Reasonably generic function for adding memory. It is 662 * Reasonably generic function for adding memory. It is
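
The shrink_zone_span()/shrink_pgdat_span() helpers added above adjust a span in one of three ways when a section is removed: move the start up if it was the first section, trim the length if it was the last one, or leave the span alone (a hole) otherwise, falling back to an empty span when no valid section remains. A toy sketch of the first three cases, assuming every remaining section is valid so the next valid section is simply the adjacent one (the real code searches with find_smallest/biggest_section_pfn and may collapse the span entirely):

#include <stdio.h>

#define SECTION_PAGES 32768UL	/* assumed: 128M sections with 4K pages */

struct toy_zone {
	unsigned long start_pfn;
	unsigned long spanned;
};

/* Mimics the three shrink_zone_span() cases for a toy zone in which every
 * remaining section is valid. */
static void toy_shrink(struct toy_zone *z, unsigned long section_start_pfn)
{
	unsigned long end_pfn = z->start_pfn + z->spanned;

	if (section_start_pfn == z->start_pfn) {
		/* removing the first section: start moves up, span shrinks */
		z->start_pfn += SECTION_PAGES;
		z->spanned = end_pfn - z->start_pfn;
	} else if (section_start_pfn + SECTION_PAGES == end_pfn) {
		/* removing the last section: only the span shrinks */
		z->spanned -= SECTION_PAGES;
	}
	/* removing a middle section only punches a hole: start/span untouched */
}

int main(void)
{
	struct toy_zone z = { .start_pfn = 0, .spanned = 4 * SECTION_PAGES };

	toy_shrink(&z, 2 * SECTION_PAGES);	/* middle section */
	printf("start=%lu spanned=%lu\n", z.start_pfn, z.spanned);	/* 0, 131072 */
	toy_shrink(&z, 0);			/* first section */
	printf("start=%lu spanned=%lu\n", z.start_pfn, z.spanned);	/* 32768, 98304 */
	toy_shrink(&z, 3 * SECTION_PAGES);	/* last section */
	printf("start=%lu spanned=%lu\n", z.start_pfn, z.spanned);	/* 32768, 65536 */
	return 0;
}
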
@@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
797 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1020 unsigned long zholes_size[MAX_NR_ZONES] = {0};
798 unsigned long start_pfn = start >> PAGE_SHIFT; 1021 unsigned long start_pfn = start >> PAGE_SHIFT;
799 1022
800 pgdat = arch_alloc_nodedata(nid); 1023 pgdat = NODE_DATA(nid);
801 if (!pgdat) 1024 if (!pgdat) {
802 return NULL; 1025 pgdat = arch_alloc_nodedata(nid);
1026 if (!pgdat)
1027 return NULL;
803 1028
804 arch_refresh_nodedata(nid, pgdat); 1029 arch_refresh_nodedata(nid, pgdat);
1030 }
805 1031
806 /* we can use NODE_DATA(nid) from here */ 1032 /* we can use NODE_DATA(nid) from here */
807 1033
@@ -854,7 +1080,8 @@ out:
854int __ref add_memory(int nid, u64 start, u64 size) 1080int __ref add_memory(int nid, u64 start, u64 size)
855{ 1081{
856 pg_data_t *pgdat = NULL; 1082 pg_data_t *pgdat = NULL;
857 int new_pgdat = 0; 1083 bool new_pgdat;
1084 bool new_node;
858 struct resource *res; 1085 struct resource *res;
859 int ret; 1086 int ret;
860 1087
@@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size)
865 if (!res) 1092 if (!res)
866 goto out; 1093 goto out;
867 1094
868 if (!node_online(nid)) { 1095 { /* Stupid hack to suppress address-never-null warning */
1096 void *p = NODE_DATA(nid);
1097 new_pgdat = !p;
1098 }
1099 new_node = !node_online(nid);
1100 if (new_node) {
869 pgdat = hotadd_new_pgdat(nid, start); 1101 pgdat = hotadd_new_pgdat(nid, start);
870 ret = -ENOMEM; 1102 ret = -ENOMEM;
871 if (!pgdat) 1103 if (!pgdat)
872 goto error; 1104 goto error;
873 new_pgdat = 1;
874 } 1105 }
875 1106
876 /* call arch's memory hotadd */ 1107 /* call arch's memory hotadd */
@@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
882 /* we online node here. we can't roll back from here. */ 1113 /* we online node here. we can't roll back from here. */
883 node_set_online(nid); 1114 node_set_online(nid);
884 1115
885 if (new_pgdat) { 1116 if (new_node) {
886 ret = register_one_node(nid); 1117 ret = register_one_node(nid);
887 /* 1118 /*
888 * If sysfs file of new node can't create, cpu on the node 1119 * If sysfs file of new node can't create, cpu on the node
@@ -901,8 +1132,7 @@ error:
901 /* rollback pgdat allocation and others */ 1132 /* rollback pgdat allocation and others */
902 if (new_pgdat) 1133 if (new_pgdat)
903 rollback_node_hotadd(nid, pgdat); 1134 rollback_node_hotadd(nid, pgdat);
904 if (res) 1135 release_memory_resource(res);
905 release_memory_resource(res);
906 1136
907out: 1137out:
908 unlock_memory_hotplug(); 1138 unlock_memory_hotplug();
@@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1058 * migrate_pages returns # of failed pages. 1288 * migrate_pages returns # of failed pages.
1059 */ 1289 */
1060 ret = migrate_pages(&source, alloc_migrate_target, 0, 1290 ret = migrate_pages(&source, alloc_migrate_target, 0,
1061 true, MIGRATE_SYNC, 1291 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1062 MR_MEMORY_HOTPLUG);
1063 if (ret) 1292 if (ret)
1064 putback_lru_pages(&source); 1293 putback_lru_pages(&source);
1065 } 1294 }
@@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1381 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1610 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1382} 1611}
1383 1612
1384int remove_memory(u64 start, u64 size) 1613/**
1614 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1615 * @start_pfn: start pfn of the memory range
1616 * @end_pfn: end pfn of the memory range
1617 * @arg: argument passed to func
1618 * @func: callback for each memory section walked
1619 *
1620 * This function walks through all present mem sections in the range
1621 * [start_pfn, end_pfn) and calls func on each mem section.
1622 *
1623 * Returns the return value of func.
1624 */
1625static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1626 void *arg, int (*func)(struct memory_block *, void *))
1385{ 1627{
1386 struct memory_block *mem = NULL; 1628 struct memory_block *mem = NULL;
1387 struct mem_section *section; 1629 struct mem_section *section;
1388 unsigned long start_pfn, end_pfn;
1389 unsigned long pfn, section_nr; 1630 unsigned long pfn, section_nr;
1390 int ret; 1631 int ret;
1391 1632
1392 start_pfn = PFN_DOWN(start);
1393 end_pfn = start_pfn + PFN_DOWN(size);
1394
1395 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1633 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1396 section_nr = pfn_to_section_nr(pfn); 1634 section_nr = pfn_to_section_nr(pfn);
1397 if (!present_section_nr(section_nr)) 1635 if (!present_section_nr(section_nr))
@@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size)
1408 if (!mem) 1646 if (!mem)
1409 continue; 1647 continue;
1410 1648
1411 ret = offline_memory_block(mem); 1649 ret = func(mem, arg);
1412 if (ret) { 1650 if (ret) {
1413 kobject_put(&mem->dev.kobj); 1651 kobject_put(&mem->dev.kobj);
1414 return ret; 1652 return ret;
@@ -1420,12 +1658,209 @@ int remove_memory(u64 start, u64 size)
1420 1658
1421 return 0; 1659 return 0;
1422} 1660}
1661
1662/**
1663 * offline_memory_block_cb - callback function for offlining memory block
1664 * @mem: the memory block to be offlined
1665 * @arg: buffer to hold error msg
1666 *
1667 * Always returns 0; any error code is stored in @arg.
1668 */
1669static int offline_memory_block_cb(struct memory_block *mem, void *arg)
1670{
1671 int *ret = arg;
1672 int error = offline_memory_block(mem);
1673
1674 if (error != 0 && *ret == 0)
1675 *ret = error;
1676
1677 return 0;
1678}
1679
1680static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
1681{
1682 int ret = !is_memblock_offlined(mem);
1683
1684 if (unlikely(ret))
1685 pr_warn("removing memory fails because memory range "
1686 "[%#010llx-%#010llx] is still online\n",
1687 PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
1688 PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);
1689
1690 return ret;
1691}
1692
1693static int check_cpu_on_node(void *data)
1694{
1695 struct pglist_data *pgdat = data;
1696 int cpu;
1697
1698 for_each_present_cpu(cpu) {
1699 if (cpu_to_node(cpu) == pgdat->node_id)
1700 /*
1701 * the cpu on this node isn't removed, and we can't
1702 * offline this node.
1703 */
1704 return -EBUSY;
1705 }
1706
1707 return 0;
1708}
1709
1710static void unmap_cpu_on_node(void *data)
1711{
1712#ifdef CONFIG_ACPI_NUMA
1713 struct pglist_data *pgdat = data;
1714 int cpu;
1715
1716 for_each_possible_cpu(cpu)
1717 if (cpu_to_node(cpu) == pgdat->node_id)
1718 numa_clear_node(cpu);
1719#endif
1720}
1721
1722static int check_and_unmap_cpu_on_node(void *data)
1723{
1724 int ret = check_cpu_on_node(data);
1725
1726 if (ret)
1727 return ret;
1728
1729 /*
1730 * the node will be offlined when we come here, so we can clear
1731 * the cpu_to_node() now.
1732 */
1733
1734 unmap_cpu_on_node(data);
1735 return 0;
1736}
1737
1738/* offline the node if all memory sections of this node are removed */
1739void try_offline_node(int nid)
1740{
1741 pg_data_t *pgdat = NODE_DATA(nid);
1742 unsigned long start_pfn = pgdat->node_start_pfn;
1743 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1744 unsigned long pfn;
1745 struct page *pgdat_page = virt_to_page(pgdat);
1746 int i;
1747
1748 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1749 unsigned long section_nr = pfn_to_section_nr(pfn);
1750
1751 if (!present_section_nr(section_nr))
1752 continue;
1753
1754 if (pfn_to_nid(pfn) != nid)
1755 continue;
1756
1757 /*
1758 * some memory sections of this node are not removed, and we
1759 * can't offline node now.
1760 */
1761 return;
1762 }
1763
1764 if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
1765 return;
1766
1767 /*
1768 * all memory/cpu of this node are removed, we can offline this
1769 * node now.
1770 */
1771 node_set_offline(nid);
1772 unregister_one_node(nid);
1773
1774 if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
1775 /* node data is allocated from boot memory */
1776 return;
1777
1778 /* free waittable in each zone */
1779 for (i = 0; i < MAX_NR_ZONES; i++) {
1780 struct zone *zone = pgdat->node_zones + i;
1781
1782 if (zone->wait_table)
1783 vfree(zone->wait_table);
1784 }
1785
1786 /*
1787 * Since there is no way to guarantee the address of pgdat/zone is not
1788 * on the stack of any kernel thread or used by other kernel objects
1789 * without reference counting or another synchronizing method, do not
1790 * reset node_data and free pgdat here. Just reset it to 0 and reuse
1791 * the memory when the node is onlined again.
1792 */
1793 memset(pgdat, 0, sizeof(*pgdat));
1794}
1795EXPORT_SYMBOL(try_offline_node);
1796
1797int __ref remove_memory(int nid, u64 start, u64 size)
1798{
1799 unsigned long start_pfn, end_pfn;
1800 int ret = 0;
1801 int retry = 1;
1802
1803 start_pfn = PFN_DOWN(start);
1804 end_pfn = start_pfn + PFN_DOWN(size);
1805
1806 /*
1807 * When CONFIG_MEMCG is on, one memory block may be used by other
1808 * blocks to store page cgroup when onlining pages. But we don't know
1809 * in what order pages are onlined. So we iterate twice to offline
1810 * memory:
1811 * 1st pass: offline every non-primary memory block.
1812 * 2nd pass: offline the primary (i.e. first added) memory block.
1813 */
1814repeat:
1815 walk_memory_range(start_pfn, end_pfn, &ret,
1816 offline_memory_block_cb);
1817 if (ret) {
1818 if (!retry)
1819 return ret;
1820
1821 retry = 0;
1822 ret = 0;
1823 goto repeat;
1824 }
1825
1826 lock_memory_hotplug();
1827
1828 /*
1829 * we have offlined all memory blocks like this:
1830 * 1. lock memory hotplug
1831 * 2. offline a memory block
1832 * 3. unlock memory hotplug
1833 *
1834 * repeat steps 1-3 to offline every memory block. All memory blocks
1835 * must be offlined before removing memory. But we do not hold the
1836 * lock across the whole operation, so we should check whether all
1837 * memory blocks are offlined.
1838 */
1839
1840 ret = walk_memory_range(start_pfn, end_pfn, NULL,
1841 is_memblock_offlined_cb);
1842 if (ret) {
1843 unlock_memory_hotplug();
1844 return ret;
1845 }
1846
1847 /* remove memmap entry */
1848 firmware_map_remove(start, start + size, "System RAM");
1849
1850 arch_remove_memory(start, size);
1851
1852 try_offline_node(nid);
1853
1854 unlock_memory_hotplug();
1855
1856 return 0;
1857}
1423#else 1858#else
1424int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1859int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1425{ 1860{
1426 return -EINVAL; 1861 return -EINVAL;
1427} 1862}
1428int remove_memory(u64 start, u64 size) 1863int remove_memory(int nid, u64 start, u64 size)
1429{ 1864{
1430 return -EINVAL; 1865 return -EINVAL;
1431} 1866}
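
remove_memory() above is built on walk_memory_range(): one walk with offline_memory_block_cb(), which records the first error but keeps walking, a single retry of that walk, then a second walk with is_memblock_offlined_cb() under the hotplug lock to confirm every block really went offline. A toy userspace version of that walk-plus-callbacks pattern; struct toy_block and the error value below are invented for the example:

#include <stdio.h>

struct toy_block { int id; int online; };

/* Visit each "memory block" in a range and hand it to a callback, stopping on
 * the first non-zero return -- the shape of walk_memory_range(). */
static int toy_walk(struct toy_block *blocks, int nr,
		    void *arg, int (*func)(struct toy_block *, void *))
{
	int i, ret;

	for (i = 0; i < nr; i++) {
		ret = func(&blocks[i], arg);
		if (ret)
			return ret;
	}
	return 0;
}

/* First walk: try to offline every block, remembering the first error in *arg
 * but always returning 0 so the walk keeps going. */
static int offline_cb(struct toy_block *b, void *arg)
{
	int *err = arg;

	if (b->id == 1 && b->online == 2) {	/* pretend block 1 fails on the first pass */
		b->online--;
		if (*err == 0)
			*err = -16;		/* -EBUSY */
		return 0;
	}
	b->online = 0;
	return 0;
}

/* Second walk: verify everything really went offline. */
static int check_cb(struct toy_block *b, void *arg)
{
	(void)arg;
	return b->online != 0;
}

int main(void)
{
	struct toy_block blocks[] = { {0, 1}, {1, 2}, {2, 1} };
	int err, retry = 1;

repeat:
	err = 0;
	toy_walk(blocks, 3, &err, offline_cb);
	if (err) {
		if (!retry)
			return 1;
		retry = 0;
		goto repeat;	/* second pass, mirroring remove_memory() */
	}
	printf("all offline: %s\n", toy_walk(blocks, 3, NULL, check_cb) ? "no" : "yes");
	return 0;
}
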
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e2df1c1fb41f..31d26637b658 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -26,7 +26,7 @@
26 * the allocation to memory nodes instead 26 * the allocation to memory nodes instead
27 * 27 *
28 * preferred Try a specific node first before normal fallback. 28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation 29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default, 30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default 31 * but useful to set in a VMA when you have a non default
32 * process policy. 32 * process policy.
@@ -127,7 +127,7 @@ static struct mempolicy *get_task_policy(struct task_struct *p)
127 127
128 if (!pol) { 128 if (!pol) {
129 node = numa_node_id(); 129 node = numa_node_id();
130 if (node != -1) 130 if (node != NUMA_NO_NODE)
131 pol = &preferred_node_policy[node]; 131 pol = &preferred_node_policy[node];
132 132
133 /* preferred_node_policy is not initialised early in boot */ 133 /* preferred_node_policy is not initialised early in boot */
@@ -161,19 +161,7 @@ static const struct mempolicy_operations {
161/* Check that the nodemask contains at least one populated zone */ 161/* Check that the nodemask contains at least one populated zone */
162static int is_valid_nodemask(const nodemask_t *nodemask) 162static int is_valid_nodemask(const nodemask_t *nodemask)
163{ 163{
164 int nd, k; 164 return nodes_intersects(*nodemask, node_states[N_MEMORY]);
165
166 for_each_node_mask(nd, *nodemask) {
167 struct zone *z;
168
169 for (k = 0; k <= policy_zone; k++) {
170 z = &NODE_DATA(nd)->node_zones[k];
171 if (z->present_pages > 0)
172 return 1;
173 }
174 }
175
176 return 0;
177} 165}
178 166
179static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 167static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -270,7 +258,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
270 struct mempolicy *policy; 258 struct mempolicy *policy;
271 259
272 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 260 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
273 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); 261 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
274 262
275 if (mode == MPOL_DEFAULT) { 263 if (mode == MPOL_DEFAULT) {
276 if (nodes && !nodes_empty(*nodes)) 264 if (nodes && !nodes_empty(*nodes))
@@ -508,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
508 /* 496 /*
509 * vm_normal_page() filters out zero pages, but there might 497 * vm_normal_page() filters out zero pages, but there might
510 * still be PageReserved pages to skip, perhaps in a VDSO. 498 * still be PageReserved pages to skip, perhaps in a VDSO.
511 * And we cannot move PageKsm pages sensibly or safely yet.
512 */ 499 */
513 if (PageReserved(page) || PageKsm(page)) 500 if (PageReserved(page))
514 continue; 501 continue;
515 nid = page_to_nid(page); 502 nid = page_to_nid(page);
516 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 503 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -1027,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1027 1014
1028 if (!list_empty(&pagelist)) { 1015 if (!list_empty(&pagelist)) {
1029 err = migrate_pages(&pagelist, new_node_page, dest, 1016 err = migrate_pages(&pagelist, new_node_page, dest,
1030 false, MIGRATE_SYNC, 1017 MIGRATE_SYNC, MR_SYSCALL);
1031 MR_SYSCALL);
1032 if (err) 1018 if (err)
1033 putback_lru_pages(&pagelist); 1019 putback_lru_pages(&pagelist);
1034 } 1020 }
@@ -1235,7 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1235 1221
1236 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", 1222 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1237 start, start + len, mode, mode_flags, 1223 start, start + len, mode, mode_flags,
1238 nmask ? nodes_addr(*nmask)[0] : -1); 1224 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1239 1225
1240 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 1226 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1241 1227
@@ -1272,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len,
1272 if (!list_empty(&pagelist)) { 1258 if (!list_empty(&pagelist)) {
1273 WARN_ON_ONCE(flags & MPOL_MF_LAZY); 1259 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1274 nr_failed = migrate_pages(&pagelist, new_vma_page, 1260 nr_failed = migrate_pages(&pagelist, new_vma_page,
1275 (unsigned long)vma, 1261 (unsigned long)vma,
1276 false, MIGRATE_SYNC, 1262 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1277 MR_MEMPOLICY_MBIND);
1278 if (nr_failed) 1263 if (nr_failed)
1279 putback_lru_pages(&pagelist); 1264 putback_lru_pages(&pagelist);
1280 } 1265 }
@@ -1644,6 +1629,26 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1644 return pol; 1629 return pol;
1645} 1630}
1646 1631
1632static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1633{
1634 enum zone_type dynamic_policy_zone = policy_zone;
1635
1636 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1637
1638 /*
1639 * If policy->v.nodes has only movable memory, we apply the policy
1640 * only when gfp_zone(gfp) is ZONE_MOVABLE.
1641 *
1642 * policy->v.nodes is intersected with node_states[N_MEMORY] earlier,
1643 * so if the following test fails, it implies that
1644 * policy->v.nodes has movable memory only.
1645 */
1646 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1647 dynamic_policy_zone = ZONE_MOVABLE;
1648
1649 return zone >= dynamic_policy_zone;
1650}
1651
1647/* 1652/*
1648 * Return a nodemask representing a mempolicy for filtering nodes for 1653 * Return a nodemask representing a mempolicy for filtering nodes for
1649 * page allocation 1654 * page allocation
@@ -1652,7 +1657,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1652{ 1657{
1653 /* Lower zones don't get a nodemask applied for MPOL_BIND */ 1658 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1654 if (unlikely(policy->mode == MPOL_BIND) && 1659 if (unlikely(policy->mode == MPOL_BIND) &&
1655 gfp_zone(gfp) >= policy_zone && 1660 apply_policy_zone(policy, gfp_zone(gfp)) &&
1656 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) 1661 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1657 return &policy->v.nodes; 1662 return &policy->v.nodes;
1658 1663
@@ -2308,7 +2313,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2308 * it less likely we act on an unlikely task<->page 2313 * it less likely we act on an unlikely task<->page
2309 * relation. 2314 * relation.
2310 */ 2315 */
2311 last_nid = page_xchg_last_nid(page, polnid); 2316 last_nid = page_nid_xchg_last(page, polnid);
2312 if (last_nid != polnid) 2317 if (last_nid != polnid)
2313 goto out; 2318 goto out;
2314 } 2319 }
@@ -2483,7 +2488,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
2483 vma->vm_pgoff, 2488 vma->vm_pgoff,
2484 sz, npol ? npol->mode : -1, 2489 sz, npol ? npol->mode : -1,
2485 npol ? npol->flags : -1, 2490 npol ? npol->flags : -1,
2486 npol ? nodes_addr(npol->v.nodes)[0] : -1); 2491 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2487 2492
2488 if (npol) { 2493 if (npol) {
2489 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 2494 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
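
apply_policy_zone() above makes the MPOL_BIND nodemask conditional: normally it applies from policy_zone upwards, but if the policy's nodes intersect nothing in N_HIGH_MEMORY (i.e. they carry movable memory only) the threshold is raised to ZONE_MOVABLE, so only ZONE_MOVABLE allocations are constrained. A toy truth-table of that decision; the zone numbering below is invented for the example and only the ordering matters:

#include <stdbool.h>
#include <stdio.h>

/* toy zone ordering, mirroring enum zone_type just enough for the example */
enum { TOY_ZONE_NORMAL = 2, TOY_ZONE_MOVABLE = 3 };
static const int policy_zone = TOY_ZONE_NORMAL;	/* assumed: highest "regular" zone */

/* Returns true when the MPOL_BIND nodemask should constrain this allocation,
 * following the shape of apply_policy_zone() above. */
static bool toy_apply_policy_zone(bool nodes_have_regular_memory, int request_zone)
{
	int dynamic_policy_zone = policy_zone;

	/* a policy whose nodes carry only movable memory can only be honoured
	 * for ZONE_MOVABLE requests; anything lower must fall back */
	if (!nodes_have_regular_memory)
		dynamic_policy_zone = TOY_ZONE_MOVABLE;

	return request_zone >= dynamic_policy_zone;
}

int main(void)
{
	printf("regular nodes, NORMAL request:  %d\n",
	       toy_apply_policy_zone(true, TOY_ZONE_NORMAL));	/* 1: apply nodemask */
	printf("movable-only,  NORMAL request:  %d\n",
	       toy_apply_policy_zone(false, TOY_ZONE_NORMAL));	/* 0: ignore nodemask */
	printf("movable-only,  MOVABLE request: %d\n",
	       toy_apply_policy_zone(false, TOY_ZONE_MOVABLE));	/* 1: apply nodemask */
	return 0;
}
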
diff --git a/mm/migrate.c b/mm/migrate.c
index 3b676b0c5c3e..3bbaf5d230b0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -160,8 +160,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
160 if (is_write_migration_entry(entry)) 160 if (is_write_migration_entry(entry))
161 pte = pte_mkwrite(pte); 161 pte = pte_mkwrite(pte);
162#ifdef CONFIG_HUGETLB_PAGE 162#ifdef CONFIG_HUGETLB_PAGE
163 if (PageHuge(new)) 163 if (PageHuge(new)) {
164 pte = pte_mkhuge(pte); 164 pte = pte_mkhuge(pte);
165 pte = arch_make_huge_pte(pte, vma, new, 0);
166 }
165#endif 167#endif
166 flush_cache_page(vma, addr, pte_pfn(pte)); 168 flush_cache_page(vma, addr, pte_pfn(pte));
167 set_pte_at(mm, addr, ptep, pte); 169 set_pte_at(mm, addr, ptep, pte);
@@ -462,7 +464,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
462 464
463 mlock_migrate_page(newpage, page); 465 mlock_migrate_page(newpage, page);
464 ksm_migrate_page(newpage, page); 466 ksm_migrate_page(newpage, page);
465 467 /*
468 * Please do not reorder this without considering how mm/ksm.c's
469 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
470 */
466 ClearPageSwapCache(page); 471 ClearPageSwapCache(page);
467 ClearPagePrivate(page); 472 ClearPagePrivate(page);
468 set_page_private(page, 0); 473 set_page_private(page, 0);
@@ -696,7 +701,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
696} 701}
697 702
698static int __unmap_and_move(struct page *page, struct page *newpage, 703static int __unmap_and_move(struct page *page, struct page *newpage,
699 int force, bool offlining, enum migrate_mode mode) 704 int force, enum migrate_mode mode)
700{ 705{
701 int rc = -EAGAIN; 706 int rc = -EAGAIN;
702 int remap_swapcache = 1; 707 int remap_swapcache = 1;
@@ -726,20 +731,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
726 lock_page(page); 731 lock_page(page);
727 } 732 }
728 733
729 /*
730 * Only memory hotplug's offline_pages() caller has locked out KSM,
731 * and can safely migrate a KSM page. The other cases have skipped
732 * PageKsm along with PageReserved - but it is only now when we have
733 * the page lock that we can be certain it will not go KSM beneath us
734 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
735 * its pagecount raised, but only here do we take the page lock which
736 * serializes that).
737 */
738 if (PageKsm(page) && !offlining) {
739 rc = -EBUSY;
740 goto unlock;
741 }
742
743 /* charge against new page */ 734 /* charge against new page */
744 mem_cgroup_prepare_migration(page, newpage, &mem); 735 mem_cgroup_prepare_migration(page, newpage, &mem);
745 736
@@ -766,7 +757,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
766 * File Caches may use write_page() or lock_page() in migration, then, 757 * File Caches may use write_page() or lock_page() in migration, then,
767 * just care Anon page here. 758 * just care Anon page here.
768 */ 759 */
769 if (PageAnon(page)) { 760 if (PageAnon(page) && !PageKsm(page)) {
770 /* 761 /*
771 * Only page_lock_anon_vma_read() understands the subtleties of 762 * Only page_lock_anon_vma_read() understands the subtleties of
772 * getting a hold on an anon_vma from outside one of its mms. 763 * getting a hold on an anon_vma from outside one of its mms.
@@ -846,7 +837,6 @@ uncharge:
846 mem_cgroup_end_migration(mem, page, newpage, 837 mem_cgroup_end_migration(mem, page, newpage,
847 (rc == MIGRATEPAGE_SUCCESS || 838 (rc == MIGRATEPAGE_SUCCESS ||
848 rc == MIGRATEPAGE_BALLOON_SUCCESS)); 839 rc == MIGRATEPAGE_BALLOON_SUCCESS));
849unlock:
850 unlock_page(page); 840 unlock_page(page);
851out: 841out:
852 return rc; 842 return rc;
@@ -857,8 +847,7 @@ out:
857 * to the newly allocated page in newpage. 847 * to the newly allocated page in newpage.
858 */ 848 */
859static int unmap_and_move(new_page_t get_new_page, unsigned long private, 849static int unmap_and_move(new_page_t get_new_page, unsigned long private,
860 struct page *page, int force, bool offlining, 850 struct page *page, int force, enum migrate_mode mode)
861 enum migrate_mode mode)
862{ 851{
863 int rc = 0; 852 int rc = 0;
864 int *result = NULL; 853 int *result = NULL;
@@ -876,7 +865,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
876 if (unlikely(split_huge_page(page))) 865 if (unlikely(split_huge_page(page)))
877 goto out; 866 goto out;
878 867
879 rc = __unmap_and_move(page, newpage, force, offlining, mode); 868 rc = __unmap_and_move(page, newpage, force, mode);
880 869
881 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { 870 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
882 /* 871 /*
@@ -936,8 +925,7 @@ out:
936 */ 925 */
937static int unmap_and_move_huge_page(new_page_t get_new_page, 926static int unmap_and_move_huge_page(new_page_t get_new_page,
938 unsigned long private, struct page *hpage, 927 unsigned long private, struct page *hpage,
939 int force, bool offlining, 928 int force, enum migrate_mode mode)
940 enum migrate_mode mode)
941{ 929{
942 int rc = 0; 930 int rc = 0;
943 int *result = NULL; 931 int *result = NULL;
@@ -999,9 +987,8 @@ out:
999 * 987 *
1000 * Return: Number of pages not migrated or error code. 988 * Return: Number of pages not migrated or error code.
1001 */ 989 */
1002int migrate_pages(struct list_head *from, 990int migrate_pages(struct list_head *from, new_page_t get_new_page,
1003 new_page_t get_new_page, unsigned long private, bool offlining, 991 unsigned long private, enum migrate_mode mode, int reason)
1004 enum migrate_mode mode, int reason)
1005{ 992{
1006 int retry = 1; 993 int retry = 1;
1007 int nr_failed = 0; 994 int nr_failed = 0;
@@ -1022,8 +1009,7 @@ int migrate_pages(struct list_head *from,
1022 cond_resched(); 1009 cond_resched();
1023 1010
1024 rc = unmap_and_move(get_new_page, private, 1011 rc = unmap_and_move(get_new_page, private,
1025 page, pass > 2, offlining, 1012 page, pass > 2, mode);
1026 mode);
1027 1013
1028 switch(rc) { 1014 switch(rc) {
1029 case -ENOMEM: 1015 case -ENOMEM:
@@ -1056,15 +1042,13 @@ out:
1056} 1042}
1057 1043
1058int migrate_huge_page(struct page *hpage, new_page_t get_new_page, 1044int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1059 unsigned long private, bool offlining, 1045 unsigned long private, enum migrate_mode mode)
1060 enum migrate_mode mode)
1061{ 1046{
1062 int pass, rc; 1047 int pass, rc;
1063 1048
1064 for (pass = 0; pass < 10; pass++) { 1049 for (pass = 0; pass < 10; pass++) {
1065 rc = unmap_and_move_huge_page(get_new_page, 1050 rc = unmap_and_move_huge_page(get_new_page, private,
1066 private, hpage, pass > 2, offlining, 1051 hpage, pass > 2, mode);
1067 mode);
1068 switch (rc) { 1052 switch (rc) {
1069 case -ENOMEM: 1053 case -ENOMEM:
1070 goto out; 1054 goto out;
@@ -1150,7 +1134,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1150 goto set_status; 1134 goto set_status;
1151 1135
1152 /* Use PageReserved to check for zero page */ 1136 /* Use PageReserved to check for zero page */
1153 if (PageReserved(page) || PageKsm(page)) 1137 if (PageReserved(page))
1154 goto put_and_set; 1138 goto put_and_set;
1155 1139
1156 pp->page = page; 1140 pp->page = page;
@@ -1187,8 +1171,7 @@ set_status:
1187 err = 0; 1171 err = 0;
1188 if (!list_empty(&pagelist)) { 1172 if (!list_empty(&pagelist)) {
1189 err = migrate_pages(&pagelist, new_page_node, 1173 err = migrate_pages(&pagelist, new_page_node,
1190 (unsigned long)pm, 0, MIGRATE_SYNC, 1174 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1191 MR_SYSCALL);
1192 if (err) 1175 if (err)
1193 putback_lru_pages(&pagelist); 1176 putback_lru_pages(&pagelist);
1194 } 1177 }
@@ -1312,7 +1295,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1312 1295
1313 err = -ENOENT; 1296 err = -ENOENT;
1314 /* Use PageReserved to check for zero page */ 1297 /* Use PageReserved to check for zero page */
1315 if (!page || PageReserved(page) || PageKsm(page)) 1298 if (!page || PageReserved(page))
1316 goto set_status; 1299 goto set_status;
1317 1300
1318 err = page_to_nid(page); 1301 err = page_to_nid(page);
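
Note: the two hunks above back the move_pages(2) syscall; with the PageKsm() checks dropped, KSM pages are no longer silently skipped when userspace queries their node or asks for migration. A minimal userspace sketch of the syscall, assuming a NUMA-capable kernel and libnuma's <numaif.h> wrapper (build with -lnuma); with a NULL node array it only reports where each page currently lives.

#include <numaif.h>	/* move_pages(); link with -lnuma */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *buf;

	/* One page of anonymous memory, touched so it is actually allocated. */
	if (posix_memalign(&buf, page_size, page_size))
		return 1;
	memset(buf, 0xaa, page_size);

	void *pages[1] = { buf };
	int status[1] = { -1 };

	/*
	 * With a NULL node array, move_pages() moves nothing and instead
	 * fills status[] with the node each page resides on (or -errno).
	 */
	if (move_pages(0 /* self */, 1, pages, NULL, status, 0) == -1) {
		perror("move_pages");
		return 1;
	}
	printf("page at %p is on node %d\n", buf, status[0]);

	free(buf);
	return 0;
}
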
@@ -1459,7 +1442,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
 1459 * pages. Currently it only checks the watermarks, which is crude 1442 * pages. Currently it only checks the watermarks, which is crude
1460 */ 1443 */
1461static bool migrate_balanced_pgdat(struct pglist_data *pgdat, 1444static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1462 int nr_migrate_pages) 1445 unsigned long nr_migrate_pages)
1463{ 1446{
1464 int z; 1447 int z;
1465 for (z = pgdat->nr_zones - 1; z >= 0; z--) { 1448 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
@@ -1495,7 +1478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1495 __GFP_NOWARN) & 1478 __GFP_NOWARN) &
1496 ~GFP_IOFS, 0); 1479 ~GFP_IOFS, 0);
1497 if (newpage) 1480 if (newpage)
1498 page_xchg_last_nid(newpage, page_last_nid(page)); 1481 page_nid_xchg_last(newpage, page_nid_last(page));
1499 1482
1500 return newpage; 1483 return newpage;
1501} 1484}
@@ -1555,39 +1538,40 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
1555 1538
1556int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1539int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1557{ 1540{
1558 int ret = 0; 1541 int page_lru;
1542
1543 VM_BUG_ON(compound_order(page) && !PageTransHuge(page));
1559 1544
1560 /* Avoid migrating to a node that is nearly full */ 1545 /* Avoid migrating to a node that is nearly full */
1561 if (migrate_balanced_pgdat(pgdat, 1)) { 1546 if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
1562 int page_lru; 1547 return 0;
1563 1548
1564 if (isolate_lru_page(page)) { 1549 if (isolate_lru_page(page))
1565 put_page(page); 1550 return 0;
1566 return 0;
1567 }
1568 1551
1569 /* Page is isolated */ 1552 /*
1570 ret = 1; 1553 * migrate_misplaced_transhuge_page() skips page migration's usual
1571 page_lru = page_is_file_cache(page); 1554 * check on page_count(), so we must do it here, now that the page
1572 if (!PageTransHuge(page)) 1555 * has been isolated: a GUP pin, or any other pin, prevents migration.
1573 inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); 1556 * The expected page count is 3: 1 for page's mapcount and 1 for the
1574 else 1557 * caller's pin and 1 for the reference taken by isolate_lru_page().
1575 mod_zone_page_state(page_zone(page), 1558 */
1576 NR_ISOLATED_ANON + page_lru, 1559 if (PageTransHuge(page) && page_count(page) != 3) {
1577 HPAGE_PMD_NR); 1560 putback_lru_page(page);
1561 return 0;
1578 } 1562 }
1579 1563
1564 page_lru = page_is_file_cache(page);
1565 mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
1566 hpage_nr_pages(page));
1567
1580 /* 1568 /*
1581 * Page is either isolated or there is not enough space on the target 1569 * Isolating the page has taken another reference, so the
1582 * node. If isolated, then it has taken a reference count and the 1570 * caller's reference can be safely dropped without the page
1583 * callers reference can be safely dropped without the page 1571 * disappearing underneath us during migration.
1584 * disappearing underneath us during migration. Otherwise the page is
1585 * not to be migrated but the callers reference should still be
1586 * dropped so it does not leak.
1587 */ 1572 */
1588 put_page(page); 1573 put_page(page);
1589 1574 return 1;
1590 return ret;
1591} 1575}
1592 1576
1593/* 1577/*
@@ -1598,7 +1582,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1598int migrate_misplaced_page(struct page *page, int node) 1582int migrate_misplaced_page(struct page *page, int node)
1599{ 1583{
1600 pg_data_t *pgdat = NODE_DATA(node); 1584 pg_data_t *pgdat = NODE_DATA(node);
1601 int isolated = 0; 1585 int isolated;
1602 int nr_remaining; 1586 int nr_remaining;
1603 LIST_HEAD(migratepages); 1587 LIST_HEAD(migratepages);
1604 1588
@@ -1606,42 +1590,43 @@ int migrate_misplaced_page(struct page *page, int node)
1606 * Don't migrate pages that are mapped in multiple processes. 1590 * Don't migrate pages that are mapped in multiple processes.
1607 * TODO: Handle false sharing detection instead of this hammer 1591 * TODO: Handle false sharing detection instead of this hammer
1608 */ 1592 */
1609 if (page_mapcount(page) != 1) { 1593 if (page_mapcount(page) != 1)
1610 put_page(page);
1611 goto out; 1594 goto out;
1612 }
1613 1595
1614 /* 1596 /*
1615 * Rate-limit the amount of data that is being migrated to a node. 1597 * Rate-limit the amount of data that is being migrated to a node.
1616 * Optimal placement is no good if the memory bus is saturated and 1598 * Optimal placement is no good if the memory bus is saturated and
1617 * all the time is being spent migrating! 1599 * all the time is being spent migrating!
1618 */ 1600 */
1619 if (numamigrate_update_ratelimit(pgdat, 1)) { 1601 if (numamigrate_update_ratelimit(pgdat, 1))
1620 put_page(page);
1621 goto out; 1602 goto out;
1622 }
1623 1603
1624 isolated = numamigrate_isolate_page(pgdat, page); 1604 isolated = numamigrate_isolate_page(pgdat, page);
1625 if (!isolated) 1605 if (!isolated)
1626 goto out; 1606 goto out;
1627 1607
1628 list_add(&page->lru, &migratepages); 1608 list_add(&page->lru, &migratepages);
1629 nr_remaining = migrate_pages(&migratepages, 1609 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1630 alloc_misplaced_dst_page, 1610 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
1631 node, false, MIGRATE_ASYNC,
1632 MR_NUMA_MISPLACED);
1633 if (nr_remaining) { 1611 if (nr_remaining) {
1634 putback_lru_pages(&migratepages); 1612 putback_lru_pages(&migratepages);
1635 isolated = 0; 1613 isolated = 0;
1636 } else 1614 } else
1637 count_vm_numa_event(NUMA_PAGE_MIGRATE); 1615 count_vm_numa_event(NUMA_PAGE_MIGRATE);
1638 BUG_ON(!list_empty(&migratepages)); 1616 BUG_ON(!list_empty(&migratepages));
1639out:
1640 return isolated; 1617 return isolated;
1618
1619out:
1620 put_page(page);
1621 return 0;
1641} 1622}
1642#endif /* CONFIG_NUMA_BALANCING */ 1623#endif /* CONFIG_NUMA_BALANCING */
1643 1624
1644#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 1625#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1626/*
1627 * Migrates a THP to a given target node. page must be locked and is unlocked
1628 * before returning.
1629 */
1645int migrate_misplaced_transhuge_page(struct mm_struct *mm, 1630int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1646 struct vm_area_struct *vma, 1631 struct vm_area_struct *vma,
1647 pmd_t *pmd, pmd_t entry, 1632 pmd_t *pmd, pmd_t entry,
@@ -1672,17 +1657,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1672 1657
1673 new_page = alloc_pages_node(node, 1658 new_page = alloc_pages_node(node,
1674 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); 1659 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
1675 if (!new_page) { 1660 if (!new_page)
1676 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 1661 goto out_fail;
1677 goto out_dropref; 1662
1678 } 1663 page_nid_xchg_last(new_page, page_nid_last(page));
1679 page_xchg_last_nid(new_page, page_last_nid(page));
1680 1664
1681 isolated = numamigrate_isolate_page(pgdat, page); 1665 isolated = numamigrate_isolate_page(pgdat, page);
1682 if (!isolated) { 1666 if (!isolated) {
1683 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1684 put_page(new_page); 1667 put_page(new_page);
1685 goto out_keep_locked; 1668 goto out_fail;
1686 } 1669 }
1687 1670
1688 /* Prepare a page as a migration target */ 1671 /* Prepare a page as a migration target */
@@ -1714,6 +1697,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1714 putback_lru_page(page); 1697 putback_lru_page(page);
1715 1698
1716 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 1699 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1700 isolated = 0;
1717 goto out; 1701 goto out;
1718 } 1702 }
1719 1703
@@ -1758,9 +1742,11 @@ out:
1758 -HPAGE_PMD_NR); 1742 -HPAGE_PMD_NR);
1759 return isolated; 1743 return isolated;
1760 1744
1745out_fail:
1746 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1761out_dropref: 1747out_dropref:
1748 unlock_page(page);
1762 put_page(page); 1749 put_page(page);
1763out_keep_locked:
1764 return 0; 1750 return 0;
1765} 1751}
1766#endif /* CONFIG_NUMA_BALANCING */ 1752#endif /* CONFIG_NUMA_BALANCING */
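
Note: the PGMIGRATE_FAIL / NUMA_PAGE_MIGRATE accounting reshuffled above is visible from userspace through /proc/vmstat. A small sketch that dumps the migration-related counters; the counter names below (pgmigrate_success, pgmigrate_fail, numa_pages_migrated, ...) are assumed to match kernels built with CONFIG_NUMA_BALANCING and may be absent on other configurations.

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Counter names assumed from a CONFIG_NUMA_BALANCING kernel. */
	static const char *keys[] = {
		"pgmigrate_success", "pgmigrate_fail",
		"numa_pte_updates", "numa_hint_faults",
		"numa_pages_migrated",
	};
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
			size_t n = strlen(keys[i]);

			if (!strncmp(line, keys[i], n) && line[n] == ' ')
				fputs(line, stdout);
		}
	}
	fclose(f);
	return 0;
}
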
diff --git a/mm/mincore.c b/mm/mincore.c
index 936b4cee8cb1..da2be56a7b8f 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
75 /* shmem/tmpfs may return swap: account for swapcache page too. */ 75 /* shmem/tmpfs may return swap: account for swapcache page too. */
76 if (radix_tree_exceptional_entry(page)) { 76 if (radix_tree_exceptional_entry(page)) {
77 swp_entry_t swap = radix_to_swp_entry(page); 77 swp_entry_t swap = radix_to_swp_entry(page);
78 page = find_get_page(&swapper_space, swap.val); 78 page = find_get_page(swap_address_space(swap), swap.val);
79 } 79 }
80#endif 80#endif
81 if (page) { 81 if (page) {
@@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
135 } else { 135 } else {
136#ifdef CONFIG_SWAP 136#ifdef CONFIG_SWAP
137 pgoff = entry.val; 137 pgoff = entry.val;
138 *vec = mincore_page(&swapper_space, pgoff); 138 *vec = mincore_page(swap_address_space(entry),
139 pgoff);
139#else 140#else
140 WARN_ON(1); 141 WARN_ON(1);
141 *vec = 1; 142 *vec = 1;
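
Note: mincore(2) is the user-visible consumer of this path; with per-type swapper spaces, the swap-cache lookup simply moves from the single global swapper_space to swap_address_space(entry), and the syscall semantics are unchanged. A minimal sketch of the syscall itself:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t len = 8 * psz;
	unsigned char vec[8];

	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Touch only the first half so half the vector reads back as resident. */
	memset(p, 1, len / 2);

	if (mincore(p, len, vec) == -1) {
		perror("mincore");
		return 1;
	}
	for (size_t i = 0; i < sizeof(vec); i++)
		printf("page %zu: %s\n", i, (vec[i] & 1) ? "resident" : "not resident");

	munmap(p, len);
	return 0;
}
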
diff --git a/mm/mlock.c b/mm/mlock.c
index f0b9ce572fc7..1c5e33fce639 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -102,13 +102,16 @@ void mlock_vma_page(struct page *page)
102 * can't isolate the page, we leave it for putback_lru_page() and vmscan 102 * can't isolate the page, we leave it for putback_lru_page() and vmscan
103 * [page_referenced()/try_to_unmap()] to deal with. 103 * [page_referenced()/try_to_unmap()] to deal with.
104 */ 104 */
105void munlock_vma_page(struct page *page) 105unsigned int munlock_vma_page(struct page *page)
106{ 106{
107 unsigned int page_mask = 0;
108
107 BUG_ON(!PageLocked(page)); 109 BUG_ON(!PageLocked(page));
108 110
109 if (TestClearPageMlocked(page)) { 111 if (TestClearPageMlocked(page)) {
110 mod_zone_page_state(page_zone(page), NR_MLOCK, 112 unsigned int nr_pages = hpage_nr_pages(page);
111 -hpage_nr_pages(page)); 113 mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
114 page_mask = nr_pages - 1;
112 if (!isolate_lru_page(page)) { 115 if (!isolate_lru_page(page)) {
113 int ret = SWAP_AGAIN; 116 int ret = SWAP_AGAIN;
114 117
@@ -141,6 +144,8 @@ void munlock_vma_page(struct page *page)
141 count_vm_event(UNEVICTABLE_PGMUNLOCKED); 144 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
142 } 145 }
143 } 146 }
147
148 return page_mask;
144} 149}
145 150
146/** 151/**
@@ -155,13 +160,11 @@ void munlock_vma_page(struct page *page)
155 * 160 *
156 * vma->vm_mm->mmap_sem must be held for at least read. 161 * vma->vm_mm->mmap_sem must be held for at least read.
157 */ 162 */
158static long __mlock_vma_pages_range(struct vm_area_struct *vma, 163long __mlock_vma_pages_range(struct vm_area_struct *vma,
159 unsigned long start, unsigned long end, 164 unsigned long start, unsigned long end, int *nonblocking)
160 int *nonblocking)
161{ 165{
162 struct mm_struct *mm = vma->vm_mm; 166 struct mm_struct *mm = vma->vm_mm;
163 unsigned long addr = start; 167 unsigned long nr_pages = (end - start) / PAGE_SIZE;
164 int nr_pages = (end - start) / PAGE_SIZE;
165 int gup_flags; 168 int gup_flags;
166 169
167 VM_BUG_ON(start & ~PAGE_MASK); 170 VM_BUG_ON(start & ~PAGE_MASK);
@@ -186,7 +189,11 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
186 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) 189 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
187 gup_flags |= FOLL_FORCE; 190 gup_flags |= FOLL_FORCE;
188 191
189 return __get_user_pages(current, mm, addr, nr_pages, gup_flags, 192 /*
193 * We made sure addr is within a VMA, so the following will
194 * not result in a stack expansion that recurses back here.
195 */
196 return __get_user_pages(current, mm, start, nr_pages, gup_flags,
190 NULL, NULL, nonblocking); 197 NULL, NULL, nonblocking);
191} 198}
192 199
@@ -202,56 +209,6 @@ static int __mlock_posix_error_return(long retval)
202 return retval; 209 return retval;
203} 210}
204 211
205/**
206 * mlock_vma_pages_range() - mlock pages in specified vma range.
207 * @vma - the vma containing the specfied address range
208 * @start - starting address in @vma to mlock
209 * @end - end address [+1] in @vma to mlock
210 *
211 * For mmap()/mremap()/expansion of mlocked vma.
212 *
213 * return 0 on success for "normal" vmas.
214 *
215 * return number of pages [> 0] to be removed from locked_vm on success
216 * of "special" vmas.
217 */
218long mlock_vma_pages_range(struct vm_area_struct *vma,
219 unsigned long start, unsigned long end)
220{
221 int nr_pages = (end - start) / PAGE_SIZE;
222 BUG_ON(!(vma->vm_flags & VM_LOCKED));
223
224 /*
225 * filter unlockable vmas
226 */
227 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
228 goto no_mlock;
229
230 if (!((vma->vm_flags & VM_DONTEXPAND) ||
231 is_vm_hugetlb_page(vma) ||
232 vma == get_gate_vma(current->mm))) {
233
234 __mlock_vma_pages_range(vma, start, end, NULL);
235
236 /* Hide errors from mmap() and other callers */
237 return 0;
238 }
239
240 /*
241 * User mapped kernel pages or huge pages:
242 * make these pages present to populate the ptes, but
243 * fall thru' to reset VM_LOCKED--no need to unlock, and
244 * return nr_pages so these don't get counted against task's
245 * locked limit. huge pages are already counted against
246 * locked vm limit.
247 */
248 make_pages_present(start, end);
249
250no_mlock:
251 vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
252 return nr_pages; /* error or pages NOT mlocked */
253}
254
255/* 212/*
 256 * munlock_vma_pages_range() - munlock all pages in the vma range. 213 * munlock_vma_pages_range() - munlock all pages in the vma range.
257 * @vma - vma containing range to be munlock()ed. 214 * @vma - vma containing range to be munlock()ed.
@@ -273,13 +230,12 @@ no_mlock:
273void munlock_vma_pages_range(struct vm_area_struct *vma, 230void munlock_vma_pages_range(struct vm_area_struct *vma,
274 unsigned long start, unsigned long end) 231 unsigned long start, unsigned long end)
275{ 232{
276 unsigned long addr;
277
278 lru_add_drain();
279 vma->vm_flags &= ~VM_LOCKED; 233 vma->vm_flags &= ~VM_LOCKED;
280 234
281 for (addr = start; addr < end; addr += PAGE_SIZE) { 235 while (start < end) {
282 struct page *page; 236 struct page *page;
237 unsigned int page_mask, page_increm;
238
283 /* 239 /*
284 * Although FOLL_DUMP is intended for get_dump_page(), 240 * Although FOLL_DUMP is intended for get_dump_page(),
285 * it just so happens that its special treatment of the 241 * it just so happens that its special treatment of the
@@ -287,13 +243,22 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
287 * suits munlock very well (and if somehow an abnormal page 243 * suits munlock very well (and if somehow an abnormal page
288 * has sneaked into the range, we won't oops here: great). 244 * has sneaked into the range, we won't oops here: great).
289 */ 245 */
290 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); 246 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
247 &page_mask);
291 if (page && !IS_ERR(page)) { 248 if (page && !IS_ERR(page)) {
292 lock_page(page); 249 lock_page(page);
293 munlock_vma_page(page); 250 lru_add_drain();
251 /*
252 * Any THP page found by follow_page_mask() may have
253 * gotten split before reaching munlock_vma_page(),
254 * so we need to recompute the page_mask here.
255 */
256 page_mask = munlock_vma_page(page);
294 unlock_page(page); 257 unlock_page(page);
295 put_page(page); 258 put_page(page);
296 } 259 }
260 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
261 start += page_increm * PAGE_SIZE;
297 cond_resched(); 262 cond_resched();
298 } 263 }
299} 264}
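
Note: the stepping arithmetic above is worth unpacking. page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask) advances start to the first byte after the current (possibly huge) page, whatever offset inside that huge page the loop happened to land on. A standalone worked example of just that arithmetic; PAGE_SHIFT and the THP size are assumed x86-64 values, not read from any kernel.

#include <stdio.h>

#define PAGE_SHIFT	12		/* assumed: 4 KiB base pages */
#define HPAGE_PMD_NR	512		/* assumed: 2 MiB THP */

static unsigned long step(unsigned long start, unsigned int page_mask)
{
	unsigned long page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);

	return start + (page_increm << PAGE_SHIFT);
}

int main(void)
{
	unsigned int thp_mask = HPAGE_PMD_NR - 1;	/* what follow_page_mask() reports */

	/* Small page: always advances by exactly one page. */
	printf("%#lx -> %#lx\n", 0x700000003000UL, step(0x700000003000UL, 0));

	/* Start aligned to a THP: skips all 512 subpages at once. */
	printf("%#lx -> %#lx\n", 0x700000200000UL, step(0x700000200000UL, thp_mask));

	/* Start in the middle of a THP: advances to the end of that THP. */
	printf("%#lx -> %#lx\n", 0x700000212000UL, step(0x700000212000UL, thp_mask));

	return 0;
}
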
@@ -303,7 +268,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
303 * 268 *
304 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and 269 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
305 * munlock is a no-op. However, for some special vmas, we go ahead and 270 * munlock is a no-op. However, for some special vmas, we go ahead and
306 * populate the ptes via make_pages_present(). 271 * populate the ptes.
307 * 272 *
308 * For vmas that pass the filters, merge/split as appropriate. 273 * For vmas that pass the filters, merge/split as appropriate.
309 */ 274 */
@@ -391,9 +356,9 @@ static int do_mlock(unsigned long start, size_t len, int on)
391 356
392 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 357 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
393 358
394 newflags = vma->vm_flags | VM_LOCKED; 359 newflags = vma->vm_flags & ~VM_LOCKED;
395 if (!on) 360 if (on)
396 newflags &= ~VM_LOCKED; 361 newflags |= VM_LOCKED | VM_POPULATE;
397 362
398 tmp = vma->vm_end; 363 tmp = vma->vm_end;
399 if (tmp > end) 364 if (tmp > end)
@@ -416,13 +381,20 @@ static int do_mlock(unsigned long start, size_t len, int on)
416 return error; 381 return error;
417} 382}
418 383
419static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) 384/*
385 * __mm_populate - populate and/or mlock pages within a range of address space.
386 *
387 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
388 * flags. VMAs must be already marked with the desired vm_flags, and
389 * mmap_sem must not be held.
390 */
391int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
420{ 392{
421 struct mm_struct *mm = current->mm; 393 struct mm_struct *mm = current->mm;
422 unsigned long end, nstart, nend; 394 unsigned long end, nstart, nend;
423 struct vm_area_struct *vma = NULL; 395 struct vm_area_struct *vma = NULL;
424 int locked = 0; 396 int locked = 0;
425 int ret = 0; 397 long ret = 0;
426 398
427 VM_BUG_ON(start & ~PAGE_MASK); 399 VM_BUG_ON(start & ~PAGE_MASK);
428 VM_BUG_ON(len != PAGE_ALIGN(len)); 400 VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -446,7 +418,8 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
446 * range with the first VMA. Also, skip undesirable VMA types. 418 * range with the first VMA. Also, skip undesirable VMA types.
447 */ 419 */
448 nend = min(end, vma->vm_end); 420 nend = min(end, vma->vm_end);
449 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 421 if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) !=
422 VM_POPULATE)
450 continue; 423 continue;
451 if (nstart < vma->vm_start) 424 if (nstart < vma->vm_start)
452 nstart = vma->vm_start; 425 nstart = vma->vm_start;
@@ -498,7 +471,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
498 error = do_mlock(start, len, 1); 471 error = do_mlock(start, len, 1);
499 up_write(&current->mm->mmap_sem); 472 up_write(&current->mm->mmap_sem);
500 if (!error) 473 if (!error)
501 error = do_mlock_pages(start, len, 0); 474 error = __mm_populate(start, len, 0);
502 return error; 475 return error;
503} 476}
504 477
@@ -517,20 +490,20 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
517static int do_mlockall(int flags) 490static int do_mlockall(int flags)
518{ 491{
519 struct vm_area_struct * vma, * prev = NULL; 492 struct vm_area_struct * vma, * prev = NULL;
520 unsigned int def_flags = 0;
521 493
522 if (flags & MCL_FUTURE) 494 if (flags & MCL_FUTURE)
523 def_flags = VM_LOCKED; 495 current->mm->def_flags |= VM_LOCKED | VM_POPULATE;
524 current->mm->def_flags = def_flags; 496 else
497 current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE);
525 if (flags == MCL_FUTURE) 498 if (flags == MCL_FUTURE)
526 goto out; 499 goto out;
527 500
528 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { 501 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
529 vm_flags_t newflags; 502 vm_flags_t newflags;
530 503
531 newflags = vma->vm_flags | VM_LOCKED; 504 newflags = vma->vm_flags & ~VM_LOCKED;
532 if (!(flags & MCL_CURRENT)) 505 if (flags & MCL_CURRENT)
533 newflags &= ~VM_LOCKED; 506 newflags |= VM_LOCKED | VM_POPULATE;
534 507
535 /* Ignore errors */ 508 /* Ignore errors */
536 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); 509 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
@@ -564,10 +537,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
564 capable(CAP_IPC_LOCK)) 537 capable(CAP_IPC_LOCK))
565 ret = do_mlockall(flags); 538 ret = do_mlockall(flags);
566 up_write(&current->mm->mmap_sem); 539 up_write(&current->mm->mmap_sem);
567 if (!ret && (flags & MCL_CURRENT)) { 540 if (!ret && (flags & MCL_CURRENT))
568 /* Ignore errors */ 541 mm_populate(0, TASK_SIZE);
569 do_mlock_pages(0, TASK_SIZE, 1);
570 }
571out: 542out:
572 return ret; 543 return ret;
573} 544}
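
Note: with do_mlock_pages() replaced by __mm_populate(), mlock() only marks the VMAs (VM_LOCKED | VM_POPULATE) under mmap_sem and faults the pages in afterwards; the userspace contract is unchanged. A quick sketch that locks a buffer and reads the resulting accounting back out of /proc/self/status (the VmLck field name is assumed stable; mlock may need CAP_IPC_LOCK or a raised RLIMIT_MEMLOCK).

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static void show_vmlck(const char *tag)
{
	char line[128];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "VmLck:", 6))
			printf("%s: %s", tag, line);
	fclose(f);
}

int main(void)
{
	size_t len = 4 * 1024 * 1024;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	show_vmlck("before mlock");
	if (mlock(p, len)) {
		perror("mlock");
		return 1;
	}
	show_vmlck("after mlock");
	munlock(p, len);
	munmap(p, len);
	return 0;
}
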
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 1ffd97ae26d7..c280a02ea11e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -69,34 +69,41 @@ void __init mminit_verify_pageflags_layout(void)
69 unsigned long or_mask, add_mask; 69 unsigned long or_mask, add_mask;
70 70
71 shift = 8 * sizeof(unsigned long); 71 shift = 8 * sizeof(unsigned long);
72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; 72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", 73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
74 "Section %d Node %d Zone %d Flags %d\n", 74 "Section %d Node %d Zone %d Lastnid %d Flags %d\n",
75 SECTIONS_WIDTH, 75 SECTIONS_WIDTH,
76 NODES_WIDTH, 76 NODES_WIDTH,
77 ZONES_WIDTH, 77 ZONES_WIDTH,
78 LAST_NID_WIDTH,
78 NR_PAGEFLAGS); 79 NR_PAGEFLAGS);
79 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", 80 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
80 "Section %d Node %d Zone %d\n", 81 "Section %d Node %d Zone %d Lastnid %d\n",
81 SECTIONS_SHIFT, 82 SECTIONS_SHIFT,
82 NODES_SHIFT, 83 NODES_SHIFT,
83 ZONES_SHIFT); 84 ZONES_SHIFT,
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", 85 LAST_NID_SHIFT);
85 "Section %lu Node %lu Zone %lu\n", 86 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
87 "Section %lu Node %lu Zone %lu Lastnid %lu\n",
86 (unsigned long)SECTIONS_PGSHIFT, 88 (unsigned long)SECTIONS_PGSHIFT,
87 (unsigned long)NODES_PGSHIFT, 89 (unsigned long)NODES_PGSHIFT,
88 (unsigned long)ZONES_PGSHIFT); 90 (unsigned long)ZONES_PGSHIFT,
89 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", 91 (unsigned long)LAST_NID_PGSHIFT);
90 "Zone ID: %lu -> %lu\n", 92 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
91 (unsigned long)ZONEID_PGOFF, 93 "Node/Zone ID: %lu -> %lu\n",
92 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); 94 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
95 (unsigned long)ZONEID_PGOFF);
93 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", 96 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
94 "location: %d -> %d unused %d -> %d flags %d -> %d\n", 97 "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
95 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); 98 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
96#ifdef NODE_NOT_IN_PAGE_FLAGS 99#ifdef NODE_NOT_IN_PAGE_FLAGS
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", 100 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
98 "Node not in page flags"); 101 "Node not in page flags");
99#endif 102#endif
103#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
104 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
105 "Last nid not in page flags");
106#endif
100 107
101 if (SECTIONS_WIDTH) { 108 if (SECTIONS_WIDTH) {
102 shift -= SECTIONS_WIDTH; 109 shift -= SECTIONS_WIDTH;
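
Note: the widened debug output above reflects that a last-NUMA-node field now competes for page->flags bits alongside the section, node and zone fields. A back-of-the-envelope sketch of the packing; the field widths below are illustrative assumptions for a 64-bit SPARSEMEM_VMEMMAP configuration, not values taken from any kernel.

#include <stdio.h>

int main(void)
{
	/* Assumed example widths; a real kernel derives these from Kconfig. */
	const int bits_per_long  = 64;
	const int sections_width = 0;	/* vmemmap: no section number in flags */
	const int nodes_width    = 10;
	const int zones_width    = 2;
	const int last_nid_width = 10;
	const int nr_pageflags   = 25;

	int used = sections_width + nodes_width + zones_width +
		   last_nid_width + nr_pageflags;

	printf("Section %d Node %d Zone %d Lastnid %d Flags %d => %d/%d bits used\n",
	       sections_width, nodes_width, zones_width, last_nid_width,
	       nr_pageflags, used, bits_per_long);
	printf("last_nid %s in page flags\n",
	       used <= bits_per_long ? "fits" : "does NOT fit");
	return 0;
}
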
diff --git a/mm/mmap.c b/mm/mmap.c
index f54b235f29a9..2664a47cec93 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -32,6 +32,7 @@
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/rbtree_augmented.h> 34#include <linux/rbtree_augmented.h>
35#include <linux/sched/sysctl.h>
35 36
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37#include <asm/cacheflush.h> 38#include <asm/cacheflush.h>
@@ -143,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
143 */ 144 */
144 free -= global_page_state(NR_SHMEM); 145 free -= global_page_state(NR_SHMEM);
145 146
146 free += nr_swap_pages; 147 free += get_nr_swap_pages();
147 148
148 /* 149 /*
149 * Any slabs which are created with the 150 * Any slabs which are created with the
@@ -202,7 +203,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
202 struct file *file, struct address_space *mapping) 203 struct file *file, struct address_space *mapping)
203{ 204{
204 if (vma->vm_flags & VM_DENYWRITE) 205 if (vma->vm_flags & VM_DENYWRITE)
205 atomic_inc(&file->f_path.dentry->d_inode->i_writecount); 206 atomic_inc(&file_inode(file)->i_writecount);
206 if (vma->vm_flags & VM_SHARED) 207 if (vma->vm_flags & VM_SHARED)
207 mapping->i_mmap_writable--; 208 mapping->i_mmap_writable--;
208 209
@@ -255,6 +256,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
255 unsigned long newbrk, oldbrk; 256 unsigned long newbrk, oldbrk;
256 struct mm_struct *mm = current->mm; 257 struct mm_struct *mm = current->mm;
257 unsigned long min_brk; 258 unsigned long min_brk;
259 bool populate;
258 260
259 down_write(&mm->mmap_sem); 261 down_write(&mm->mmap_sem);
260 262
@@ -304,8 +306,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
304 /* Ok, looks good - let it rip. */ 306 /* Ok, looks good - let it rip. */
305 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) 307 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
306 goto out; 308 goto out;
309
307set_brk: 310set_brk:
308 mm->brk = brk; 311 mm->brk = brk;
312 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
313 up_write(&mm->mmap_sem);
314 if (populate)
315 mm_populate(oldbrk, newbrk - oldbrk);
316 return brk;
317
309out: 318out:
310 retval = mm->brk; 319 retval = mm->brk;
311 up_write(&mm->mmap_sem); 320 up_write(&mm->mmap_sem);
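
Note: brk() now releases mmap_sem before faulting in the newly granted range, and only does so when the process asked for future mappings to be locked. From userspace that corresponds to growing the heap after mlockall(MCL_FUTURE); a hedged sketch using the old but still available sbrk() wrapper (mlockall may need CAP_IPC_LOCK or a raised RLIMIT_MEMLOCK).

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* Lock every future mapping, including heap growth via brk(). */
	if (mlockall(MCL_FUTURE)) {
		perror("mlockall");
		return 1;
	}

	void *old_end = sbrk(0);
	if (sbrk(1 << 20) == (void *)-1) {	/* grow the heap by 1 MiB */
		perror("sbrk");
		return 1;
	}
	printf("heap grew from %p to %p; the new range is prefaulted and locked\n",
	       old_end, sbrk(0));

	munlockall();
	return 0;
}
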
@@ -567,7 +576,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
567 struct address_space *mapping = file->f_mapping; 576 struct address_space *mapping = file->f_mapping;
568 577
569 if (vma->vm_flags & VM_DENYWRITE) 578 if (vma->vm_flags & VM_DENYWRITE)
570 atomic_dec(&file->f_path.dentry->d_inode->i_writecount); 579 atomic_dec(&file_inode(file)->i_writecount);
571 if (vma->vm_flags & VM_SHARED) 580 if (vma->vm_flags & VM_SHARED)
572 mapping->i_mmap_writable++; 581 mapping->i_mmap_writable++;
573 582
@@ -800,7 +809,7 @@ again: remove_next = 1 + (end > next->vm_end);
800 anon_vma_interval_tree_post_update_vma(vma); 809 anon_vma_interval_tree_post_update_vma(vma);
801 if (adjust_next) 810 if (adjust_next)
802 anon_vma_interval_tree_post_update_vma(next); 811 anon_vma_interval_tree_post_update_vma(next);
803 anon_vma_unlock(anon_vma); 812 anon_vma_unlock_write(anon_vma);
804 } 813 }
805 if (mapping) 814 if (mapping)
806 mutex_unlock(&mapping->i_mmap_mutex); 815 mutex_unlock(&mapping->i_mmap_mutex);
@@ -1153,12 +1162,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
1153 1162
1154unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 1163unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1155 unsigned long len, unsigned long prot, 1164 unsigned long len, unsigned long prot,
1156 unsigned long flags, unsigned long pgoff) 1165 unsigned long flags, unsigned long pgoff,
1166 unsigned long *populate)
1157{ 1167{
1158 struct mm_struct * mm = current->mm; 1168 struct mm_struct * mm = current->mm;
1159 struct inode *inode; 1169 struct inode *inode;
1160 vm_flags_t vm_flags; 1170 vm_flags_t vm_flags;
1161 1171
1172 *populate = 0;
1173
1162 /* 1174 /*
1163 * Does the application expect PROT_READ to imply PROT_EXEC? 1175 * Does the application expect PROT_READ to imply PROT_EXEC?
1164 * 1176 *
@@ -1217,7 +1229,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1217 return -EAGAIN; 1229 return -EAGAIN;
1218 } 1230 }
1219 1231
1220 inode = file ? file->f_path.dentry->d_inode : NULL; 1232 inode = file ? file_inode(file) : NULL;
1221 1233
1222 if (file) { 1234 if (file) {
1223 switch (flags & MAP_TYPE) { 1235 switch (flags & MAP_TYPE) {
@@ -1279,7 +1291,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1279 } 1291 }
1280 } 1292 }
1281 1293
1282 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1294 /*
1295 * Set 'VM_NORESERVE' if we should not account for the
1296 * memory use of this mapping.
1297 */
1298 if (flags & MAP_NORESERVE) {
1299 /* We honor MAP_NORESERVE if allowed to overcommit */
1300 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1301 vm_flags |= VM_NORESERVE;
1302
1303 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1304 if (file && is_file_hugepages(file))
1305 vm_flags |= VM_NORESERVE;
1306 }
1307
1308 addr = mmap_region(file, addr, len, vm_flags, pgoff);
1309 if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE))
1310 *populate = len;
1311 return addr;
1283} 1312}
1284 1313
1285SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1314SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
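
Note: with population moved out of mmap_region(), MAP_POPULATE (and, in this series, MAP_LOCKED via VM_POPULATE) is reported back through *populate and serviced by mm_populate() after mmap_sem is dropped. The userspace semantics stay the same; a sketch that checks them with mincore():

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static size_t resident_pages(void *p, size_t len, long psz)
{
	unsigned char vec[1024];
	size_t n = len / psz, count = 0;

	if (n > sizeof(vec) || mincore(p, len, vec))
		return 0;
	for (size_t i = 0; i < n; i++)
		count += vec[i] & 1;
	return count;
}

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t len = 64 * psz;

	void *lazy = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *eager = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);

	if (lazy == MAP_FAILED || eager == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("without MAP_POPULATE: %zu/%zu pages resident\n",
	       resident_pages(lazy, len, psz), len / psz);
	printf("with    MAP_POPULATE: %zu/%zu pages resident\n",
	       resident_pages(eager, len, psz), len / psz);
	return 0;
}
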
@@ -1394,8 +1423,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1394} 1423}
1395 1424
1396unsigned long mmap_region(struct file *file, unsigned long addr, 1425unsigned long mmap_region(struct file *file, unsigned long addr,
1397 unsigned long len, unsigned long flags, 1426 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
1398 vm_flags_t vm_flags, unsigned long pgoff)
1399{ 1427{
1400 struct mm_struct *mm = current->mm; 1428 struct mm_struct *mm = current->mm;
1401 struct vm_area_struct *vma, *prev; 1429 struct vm_area_struct *vma, *prev;
@@ -1403,7 +1431,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1403 int error; 1431 int error;
1404 struct rb_node **rb_link, *rb_parent; 1432 struct rb_node **rb_link, *rb_parent;
1405 unsigned long charged = 0; 1433 unsigned long charged = 0;
1406 struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; 1434 struct inode *inode = file ? file_inode(file) : NULL;
1407 1435
1408 /* Clear old maps */ 1436 /* Clear old maps */
1409 error = -ENOMEM; 1437 error = -ENOMEM;
@@ -1419,20 +1447,6 @@ munmap_back:
1419 return -ENOMEM; 1447 return -ENOMEM;
1420 1448
1421 /* 1449 /*
1422 * Set 'VM_NORESERVE' if we should not account for the
1423 * memory use of this mapping.
1424 */
1425 if ((flags & MAP_NORESERVE)) {
1426 /* We honor MAP_NORESERVE if allowed to overcommit */
1427 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1428 vm_flags |= VM_NORESERVE;
1429
1430 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1431 if (file && is_file_hugepages(file))
1432 vm_flags |= VM_NORESERVE;
1433 }
1434
1435 /*
1436 * Private writable mapping: check memory availability 1450 * Private writable mapping: check memory availability
1437 */ 1451 */
1438 if (accountable_mapping(file, vm_flags)) { 1452 if (accountable_mapping(file, vm_flags)) {
@@ -1530,10 +1544,12 @@ out:
1530 1544
1531 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1545 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1532 if (vm_flags & VM_LOCKED) { 1546 if (vm_flags & VM_LOCKED) {
1533 if (!mlock_vma_pages_range(vma, addr, addr + len)) 1547 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
1548 vma == get_gate_vma(current->mm)))
1534 mm->locked_vm += (len >> PAGE_SHIFT); 1549 mm->locked_vm += (len >> PAGE_SHIFT);
1535 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1550 else
1536 make_pages_present(addr, addr + len); 1551 vma->vm_flags &= ~VM_LOCKED;
1552 }
1537 1553
1538 if (file) 1554 if (file)
1539 uprobe_mmap(vma); 1555 uprobe_mmap(vma);
@@ -2169,9 +2185,28 @@ int expand_downwards(struct vm_area_struct *vma,
2169 return error; 2185 return error;
2170} 2186}
2171 2187
2188/*
2189 * Note how expand_stack() refuses to expand the stack all the way to
2190 * abut the next virtual mapping, *unless* that mapping itself is also
2191 * a stack mapping. We want to leave room for a guard page, after all
2192 * (the guard page itself is not added here, that is done by the
2193 * actual page faulting logic)
2194 *
2195 * This matches the behavior of the guard page logic (see mm/memory.c:
2196 * check_stack_guard_page()), which only allows the guard page to be
2197 * removed under these circumstances.
2198 */
2172#ifdef CONFIG_STACK_GROWSUP 2199#ifdef CONFIG_STACK_GROWSUP
2173int expand_stack(struct vm_area_struct *vma, unsigned long address) 2200int expand_stack(struct vm_area_struct *vma, unsigned long address)
2174{ 2201{
2202 struct vm_area_struct *next;
2203
2204 address &= PAGE_MASK;
2205 next = vma->vm_next;
2206 if (next && next->vm_start == address + PAGE_SIZE) {
2207 if (!(next->vm_flags & VM_GROWSUP))
2208 return -ENOMEM;
2209 }
2175 return expand_upwards(vma, address); 2210 return expand_upwards(vma, address);
2176} 2211}
2177 2212
@@ -2186,14 +2221,21 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
2186 return vma; 2221 return vma;
2187 if (!prev || expand_stack(prev, addr)) 2222 if (!prev || expand_stack(prev, addr))
2188 return NULL; 2223 return NULL;
2189 if (prev->vm_flags & VM_LOCKED) { 2224 if (prev->vm_flags & VM_LOCKED)
2190 mlock_vma_pages_range(prev, addr, prev->vm_end); 2225 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
2191 }
2192 return prev; 2226 return prev;
2193} 2227}
2194#else 2228#else
2195int expand_stack(struct vm_area_struct *vma, unsigned long address) 2229int expand_stack(struct vm_area_struct *vma, unsigned long address)
2196{ 2230{
2231 struct vm_area_struct *prev;
2232
2233 address &= PAGE_MASK;
2234 prev = vma->vm_prev;
2235 if (prev && prev->vm_end == address) {
2236 if (!(prev->vm_flags & VM_GROWSDOWN))
2237 return -ENOMEM;
2238 }
2197 return expand_downwards(vma, address); 2239 return expand_downwards(vma, address);
2198} 2240}
2199 2241
@@ -2214,9 +2256,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
2214 start = vma->vm_start; 2256 start = vma->vm_start;
2215 if (expand_stack(vma, addr)) 2257 if (expand_stack(vma, addr))
2216 return NULL; 2258 return NULL;
2217 if (vma->vm_flags & VM_LOCKED) { 2259 if (vma->vm_flags & VM_LOCKED)
2218 mlock_vma_pages_range(vma, addr, start); 2260 __mlock_vma_pages_range(vma, addr, start, NULL);
2219 }
2220 return vma; 2261 return vma;
2221} 2262}
2222#endif 2263#endif
@@ -2589,10 +2630,8 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2589out: 2630out:
2590 perf_event_mmap(vma); 2631 perf_event_mmap(vma);
2591 mm->total_vm += len >> PAGE_SHIFT; 2632 mm->total_vm += len >> PAGE_SHIFT;
2592 if (flags & VM_LOCKED) { 2633 if (flags & VM_LOCKED)
2593 if (!mlock_vma_pages_range(vma, addr, addr + len)) 2634 mm->locked_vm += (len >> PAGE_SHIFT);
2594 mm->locked_vm += (len >> PAGE_SHIFT);
2595 }
2596 return addr; 2635 return addr;
2597} 2636}
2598 2637
@@ -2600,10 +2639,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
2600{ 2639{
2601 struct mm_struct *mm = current->mm; 2640 struct mm_struct *mm = current->mm;
2602 unsigned long ret; 2641 unsigned long ret;
2642 bool populate;
2603 2643
2604 down_write(&mm->mmap_sem); 2644 down_write(&mm->mmap_sem);
2605 ret = do_brk(addr, len); 2645 ret = do_brk(addr, len);
2646 populate = ((mm->def_flags & VM_LOCKED) != 0);
2606 up_write(&mm->mmap_sem); 2647 up_write(&mm->mmap_sem);
2648 if (populate)
2649 mm_populate(addr, len);
2607 return ret; 2650 return ret;
2608} 2651}
2609EXPORT_SYMBOL(vm_brk); 2652EXPORT_SYMBOL(vm_brk);
@@ -2886,7 +2929,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2886 * The LSB of head.next can't change from under us 2929 * The LSB of head.next can't change from under us
2887 * because we hold the mm_all_locks_mutex. 2930 * because we hold the mm_all_locks_mutex.
2888 */ 2931 */
2889 down_write(&anon_vma->root->rwsem); 2932 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
2890 /* 2933 /*
2891 * We can safely modify head.next after taking the 2934 * We can safely modify head.next after taking the
2892 * anon_vma->root->rwsem. If some other vma in this mm shares 2935 * anon_vma->root->rwsem. If some other vma in this mm shares
@@ -2943,7 +2986,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2943 * vma in this mm is backed by the same anon_vma or address_space. 2986 * vma in this mm is backed by the same anon_vma or address_space.
2944 * 2987 *
2945 * We can take all the locks in random order because the VM code 2988 * We can take all the locks in random order because the VM code
2946 * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never 2989 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
2947 * takes more than one of them in a row. Secondly we're protected 2990 * takes more than one of them in a row. Secondly we're protected
2948 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 2991 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2949 * 2992 *
@@ -3001,7 +3044,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3001 if (!__test_and_clear_bit(0, (unsigned long *) 3044 if (!__test_and_clear_bit(0, (unsigned long *)
3002 &anon_vma->root->rb_root.rb_node)) 3045 &anon_vma->root->rb_root.rb_node))
3003 BUG(); 3046 BUG();
3004 anon_vma_unlock(anon_vma); 3047 anon_vma_unlock_write(anon_vma);
3005 } 3048 }
3006} 3049}
3007 3050
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8a5ac8c686b0..be04122fb277 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -37,49 +37,51 @@ static struct srcu_struct srcu;
37void __mmu_notifier_release(struct mm_struct *mm) 37void __mmu_notifier_release(struct mm_struct *mm)
38{ 38{
39 struct mmu_notifier *mn; 39 struct mmu_notifier *mn;
40 struct hlist_node *n;
41 int id; 40 int id;
42 41
43 /* 42 /*
44 * SRCU here will block mmu_notifier_unregister until 43 * srcu_read_lock() here will block synchronize_srcu() in
45 * ->release returns. 44 * mmu_notifier_unregister() until all registered
45 * ->release() callouts this function makes have
46 * returned.
46 */ 47 */
47 id = srcu_read_lock(&srcu); 48 id = srcu_read_lock(&srcu);
48 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
49 /*
50 * if ->release runs before mmu_notifier_unregister it
51 * must be handled as it's the only way for the driver
52 * to flush all existing sptes and stop the driver
53 * from establishing any more sptes before all the
54 * pages in the mm are freed.
55 */
56 if (mn->ops->release)
57 mn->ops->release(mn, mm);
58 srcu_read_unlock(&srcu, id);
59
60 spin_lock(&mm->mmu_notifier_mm->lock); 49 spin_lock(&mm->mmu_notifier_mm->lock);
61 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 50 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
62 mn = hlist_entry(mm->mmu_notifier_mm->list.first, 51 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
63 struct mmu_notifier, 52 struct mmu_notifier,
64 hlist); 53 hlist);
54
65 /* 55 /*
66 * We arrived before mmu_notifier_unregister so 56 * Unlink. This will prevent mmu_notifier_unregister()
67 * mmu_notifier_unregister will do nothing other than 57 * from also making the ->release() callout.
68 * to wait ->release to finish and
69 * mmu_notifier_unregister to return.
70 */ 58 */
71 hlist_del_init_rcu(&mn->hlist); 59 hlist_del_init_rcu(&mn->hlist);
60 spin_unlock(&mm->mmu_notifier_mm->lock);
61
62 /*
63 * Clear sptes. (see 'release' description in mmu_notifier.h)
64 */
65 if (mn->ops->release)
66 mn->ops->release(mn, mm);
67
68 spin_lock(&mm->mmu_notifier_mm->lock);
72 } 69 }
73 spin_unlock(&mm->mmu_notifier_mm->lock); 70 spin_unlock(&mm->mmu_notifier_mm->lock);
74 71
75 /* 72 /*
76 * synchronize_srcu here prevents mmu_notifier_release to 73 * All callouts to ->release() which we have done are complete.
77 * return to exit_mmap (which would proceed freeing all pages 74 * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
78 * in the mm) until the ->release method returns, if it was 75 */
79 * invoked by mmu_notifier_unregister. 76 srcu_read_unlock(&srcu, id);
80 * 77
81 * The mmu_notifier_mm can't go away from under us because one 78 /*
82 * mm_count is hold by exit_mmap. 79 * mmu_notifier_unregister() may have unlinked a notifier and may
80 * still be calling out to it. Additionally, other notifiers
81 * may have been active via vmtruncate() et. al. Block here
82 * to ensure that all notifier callouts for this mm have been
83 * completed and the sptes are really cleaned up before returning
84 * to exit_mmap().
83 */ 85 */
84 synchronize_srcu(&srcu); 86 synchronize_srcu(&srcu);
85} 87}
@@ -93,11 +95,10 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
93 unsigned long address) 95 unsigned long address)
94{ 96{
95 struct mmu_notifier *mn; 97 struct mmu_notifier *mn;
96 struct hlist_node *n;
97 int young = 0, id; 98 int young = 0, id;
98 99
99 id = srcu_read_lock(&srcu); 100 id = srcu_read_lock(&srcu);
100 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 101 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
101 if (mn->ops->clear_flush_young) 102 if (mn->ops->clear_flush_young)
102 young |= mn->ops->clear_flush_young(mn, mm, address); 103 young |= mn->ops->clear_flush_young(mn, mm, address);
103 } 104 }
@@ -110,11 +111,10 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
110 unsigned long address) 111 unsigned long address)
111{ 112{
112 struct mmu_notifier *mn; 113 struct mmu_notifier *mn;
113 struct hlist_node *n;
114 int young = 0, id; 114 int young = 0, id;
115 115
116 id = srcu_read_lock(&srcu); 116 id = srcu_read_lock(&srcu);
117 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 117 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
118 if (mn->ops->test_young) { 118 if (mn->ops->test_young) {
119 young = mn->ops->test_young(mn, mm, address); 119 young = mn->ops->test_young(mn, mm, address);
120 if (young) 120 if (young)
@@ -130,11 +130,10 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
130 pte_t pte) 130 pte_t pte)
131{ 131{
132 struct mmu_notifier *mn; 132 struct mmu_notifier *mn;
133 struct hlist_node *n;
134 int id; 133 int id;
135 134
136 id = srcu_read_lock(&srcu); 135 id = srcu_read_lock(&srcu);
137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 136 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
138 if (mn->ops->change_pte) 137 if (mn->ops->change_pte)
139 mn->ops->change_pte(mn, mm, address, pte); 138 mn->ops->change_pte(mn, mm, address, pte);
140 } 139 }
@@ -145,11 +144,10 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
145 unsigned long address) 144 unsigned long address)
146{ 145{
147 struct mmu_notifier *mn; 146 struct mmu_notifier *mn;
148 struct hlist_node *n;
149 int id; 147 int id;
150 148
151 id = srcu_read_lock(&srcu); 149 id = srcu_read_lock(&srcu);
152 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 150 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
153 if (mn->ops->invalidate_page) 151 if (mn->ops->invalidate_page)
154 mn->ops->invalidate_page(mn, mm, address); 152 mn->ops->invalidate_page(mn, mm, address);
155 } 153 }
@@ -160,31 +158,31 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
160 unsigned long start, unsigned long end) 158 unsigned long start, unsigned long end)
161{ 159{
162 struct mmu_notifier *mn; 160 struct mmu_notifier *mn;
163 struct hlist_node *n;
164 int id; 161 int id;
165 162
166 id = srcu_read_lock(&srcu); 163 id = srcu_read_lock(&srcu);
167 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 164 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
168 if (mn->ops->invalidate_range_start) 165 if (mn->ops->invalidate_range_start)
169 mn->ops->invalidate_range_start(mn, mm, start, end); 166 mn->ops->invalidate_range_start(mn, mm, start, end);
170 } 167 }
171 srcu_read_unlock(&srcu, id); 168 srcu_read_unlock(&srcu, id);
172} 169}
170EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
173 171
174void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 172void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
175 unsigned long start, unsigned long end) 173 unsigned long start, unsigned long end)
176{ 174{
177 struct mmu_notifier *mn; 175 struct mmu_notifier *mn;
178 struct hlist_node *n;
179 int id; 176 int id;
180 177
181 id = srcu_read_lock(&srcu); 178 id = srcu_read_lock(&srcu);
182 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 179 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
183 if (mn->ops->invalidate_range_end) 180 if (mn->ops->invalidate_range_end)
184 mn->ops->invalidate_range_end(mn, mm, start, end); 181 mn->ops->invalidate_range_end(mn, mm, start, end);
185 } 182 }
186 srcu_read_unlock(&srcu, id); 183 srcu_read_unlock(&srcu, id);
187} 184}
185EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
188 186
189static int do_mmu_notifier_register(struct mmu_notifier *mn, 187static int do_mmu_notifier_register(struct mmu_notifier *mn,
190 struct mm_struct *mm, 188 struct mm_struct *mm,
@@ -294,31 +292,31 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
294{ 292{
295 BUG_ON(atomic_read(&mm->mm_count) <= 0); 293 BUG_ON(atomic_read(&mm->mm_count) <= 0);
296 294
295 spin_lock(&mm->mmu_notifier_mm->lock);
297 if (!hlist_unhashed(&mn->hlist)) { 296 if (!hlist_unhashed(&mn->hlist)) {
298 /*
299 * SRCU here will force exit_mmap to wait ->release to finish
300 * before freeing the pages.
301 */
302 int id; 297 int id;
303 298
304 id = srcu_read_lock(&srcu);
305 /* 299 /*
306 * exit_mmap will block in mmu_notifier_release to 300 * Ensure we synchronize up with __mmu_notifier_release().
307 * guarantee ->release is called before freeing the
308 * pages.
309 */ 301 */
302 id = srcu_read_lock(&srcu);
303
304 hlist_del_rcu(&mn->hlist);
305 spin_unlock(&mm->mmu_notifier_mm->lock);
306
310 if (mn->ops->release) 307 if (mn->ops->release)
311 mn->ops->release(mn, mm); 308 mn->ops->release(mn, mm);
312 srcu_read_unlock(&srcu, id);
313 309
314 spin_lock(&mm->mmu_notifier_mm->lock); 310 /*
315 hlist_del_rcu(&mn->hlist); 311 * Allow __mmu_notifier_release() to complete.
312 */
313 srcu_read_unlock(&srcu, id);
314 } else
316 spin_unlock(&mm->mmu_notifier_mm->lock); 315 spin_unlock(&mm->mmu_notifier_mm->lock);
317 }
318 316
319 /* 317 /*
320 * Wait any running method to finish, of course including 318 * Wait for any running method to finish, including ->release() if it
321 * ->release if it was run by mmu_notifier_relase instead of us. 319 * was run by __mmu_notifier_release() instead of us.
322 */ 320 */
323 synchronize_srcu(&srcu); 321 synchronize_srcu(&srcu);
324 322
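
Note: the reworked release/unregister paths converge on one pattern: unlink the notifier from the list while holding the spinlock, drop the lock before making the ->release() callout, and rely on SRCU read sections plus synchronize_srcu() so neither side returns while the other may still be calling out. A loose userspace analogue of the "unlink under the lock, call out without it" half, using pthreads; this shows only the locking shape, not the SRCU grace-period machinery (build with -pthread).

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct notifier {
	struct notifier *next;
	void (*release)(struct notifier *n);
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct notifier *head;

/* Analogue of __mmu_notifier_release(): pop entries one at a time. */
static void release_all(void)
{
	pthread_mutex_lock(&list_lock);
	while (head) {
		struct notifier *n = head;

		head = n->next;			/* unlink under the lock ... */
		pthread_mutex_unlock(&list_lock);

		if (n->release)			/* ... call out without it */
			n->release(n);

		pthread_mutex_lock(&list_lock);
	}
	pthread_mutex_unlock(&list_lock);
}

static void say_bye(struct notifier *n)
{
	printf("released %p\n", (void *)n);
	free(n);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct notifier *n = malloc(sizeof(*n));

		n->release = say_bye;
		pthread_mutex_lock(&list_lock);
		n->next = head;
		head = n;
		pthread_mutex_unlock(&list_lock);
	}
	release_all();
	return 0;
}
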
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4596d81b89b1..2ac0afbd68f3 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * linux/mm/mmzone.c 2 * linux/mm/mmzone.c
3 * 3 *
4 * management codes for pgdats and zones. 4 * management codes for pgdats, zones and page flags
5 */ 5 */
6 6
7 7
@@ -96,3 +96,21 @@ void lruvec_init(struct lruvec *lruvec)
96 for_each_lru(lru) 96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98} 98}
99
100#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
101int page_nid_xchg_last(struct page *page, int nid)
102{
103 unsigned long old_flags, flags;
104 int last_nid;
105
106 do {
107 old_flags = flags = page->flags;
108 last_nid = page_nid_last(page);
109
110 flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
111 flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
112 } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
113
114 return last_nid;
115}
116#endif
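
Note: page_nid_xchg_last() above is a classic lock-free read-modify-write on a shared flags word: rebuild the word from a snapshot and retry whenever cmpxchg() observes that someone else changed it in the meantime. A userspace analogue with the GCC/Clang atomic builtins; the shift and mask values are made up for illustration and are not the kernel's LAST_NID_* constants.

#include <stdio.h>

#define NID_SHIFT	8
#define NID_MASK	0x3ffUL		/* illustrative 10-bit field */

static unsigned long flags_word = (42UL & NID_MASK) << NID_SHIFT;

/* Atomically replace the nid field, returning the previous value. */
static int nid_xchg_last(unsigned long *word, int nid)
{
	unsigned long old_flags, new_flags;
	int last_nid;

	do {
		old_flags = __atomic_load_n(word, __ATOMIC_RELAXED);
		last_nid = (old_flags >> NID_SHIFT) & NID_MASK;

		new_flags = old_flags & ~(NID_MASK << NID_SHIFT);
		new_flags |= ((unsigned long)nid & NID_MASK) << NID_SHIFT;
	} while (!__atomic_compare_exchange_n(word, &old_flags, new_flags, 0,
					      __ATOMIC_RELAXED, __ATOMIC_RELAXED));

	return last_nid;
}

int main(void)
{
	printf("previous nid: %d\n", nid_xchg_last(&flags_word, 7));	/* 42 */
	printf("previous nid: %d\n", nid_xchg_last(&flags_word, 3));	/* 7 */
	return 0;
}
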
diff --git a/mm/mremap.c b/mm/mremap.c
index e1031e1f6a61..463a25705ac6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -19,6 +19,7 @@
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/mmu_notifier.h> 21#include <linux/mmu_notifier.h>
22#include <linux/sched/sysctl.h>
22 23
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24#include <asm/cacheflush.h> 25#include <asm/cacheflush.h>
@@ -134,7 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
134 pte_unmap(new_pte - 1); 135 pte_unmap(new_pte - 1);
135 pte_unmap_unlock(old_pte - 1, old_ptl); 136 pte_unmap_unlock(old_pte - 1, old_ptl);
136 if (anon_vma) 137 if (anon_vma)
137 anon_vma_unlock(anon_vma); 138 anon_vma_unlock_write(anon_vma);
138 if (mapping) 139 if (mapping)
139 mutex_unlock(&mapping->i_mmap_mutex); 140 mutex_unlock(&mapping->i_mmap_mutex);
140} 141}
@@ -208,7 +209,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
208 209
209static unsigned long move_vma(struct vm_area_struct *vma, 210static unsigned long move_vma(struct vm_area_struct *vma,
210 unsigned long old_addr, unsigned long old_len, 211 unsigned long old_addr, unsigned long old_len,
211 unsigned long new_len, unsigned long new_addr) 212 unsigned long new_len, unsigned long new_addr, bool *locked)
212{ 213{
213 struct mm_struct *mm = vma->vm_mm; 214 struct mm_struct *mm = vma->vm_mm;
214 struct vm_area_struct *new_vma; 215 struct vm_area_struct *new_vma;
@@ -299,9 +300,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
299 300
300 if (vm_flags & VM_LOCKED) { 301 if (vm_flags & VM_LOCKED) {
301 mm->locked_vm += new_len >> PAGE_SHIFT; 302 mm->locked_vm += new_len >> PAGE_SHIFT;
302 if (new_len > old_len) 303 *locked = true;
303 mlock_vma_pages_range(new_vma, new_addr + old_len,
304 new_addr + new_len);
305 } 304 }
306 305
307 return new_addr; 306 return new_addr;
@@ -366,9 +365,8 @@ Eagain:
366 return ERR_PTR(-EAGAIN); 365 return ERR_PTR(-EAGAIN);
367} 366}
368 367
369static unsigned long mremap_to(unsigned long addr, 368static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
370 unsigned long old_len, unsigned long new_addr, 369 unsigned long new_addr, unsigned long new_len, bool *locked)
371 unsigned long new_len)
372{ 370{
373 struct mm_struct *mm = current->mm; 371 struct mm_struct *mm = current->mm;
374 struct vm_area_struct *vma; 372 struct vm_area_struct *vma;
@@ -418,7 +416,7 @@ static unsigned long mremap_to(unsigned long addr,
418 if (ret & ~PAGE_MASK) 416 if (ret & ~PAGE_MASK)
419 goto out1; 417 goto out1;
420 418
421 ret = move_vma(vma, addr, old_len, new_len, new_addr); 419 ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
422 if (!(ret & ~PAGE_MASK)) 420 if (!(ret & ~PAGE_MASK))
423 goto out; 421 goto out;
424out1: 422out1:
@@ -456,6 +454,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
456 struct vm_area_struct *vma; 454 struct vm_area_struct *vma;
457 unsigned long ret = -EINVAL; 455 unsigned long ret = -EINVAL;
458 unsigned long charged = 0; 456 unsigned long charged = 0;
457 bool locked = false;
459 458
460 down_write(&current->mm->mmap_sem); 459 down_write(&current->mm->mmap_sem);
461 460
@@ -478,7 +477,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
478 477
479 if (flags & MREMAP_FIXED) { 478 if (flags & MREMAP_FIXED) {
480 if (flags & MREMAP_MAYMOVE) 479 if (flags & MREMAP_MAYMOVE)
481 ret = mremap_to(addr, old_len, new_addr, new_len); 480 ret = mremap_to(addr, old_len, new_addr, new_len,
481 &locked);
482 goto out; 482 goto out;
483 } 483 }
484 484
@@ -520,8 +520,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
520 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 520 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
521 if (vma->vm_flags & VM_LOCKED) { 521 if (vma->vm_flags & VM_LOCKED) {
522 mm->locked_vm += pages; 522 mm->locked_vm += pages;
523 mlock_vma_pages_range(vma, addr + old_len, 523 locked = true;
524 addr + new_len); 524 new_addr = addr;
525 } 525 }
526 ret = addr; 526 ret = addr;
527 goto out; 527 goto out;
@@ -547,11 +547,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
547 goto out; 547 goto out;
548 } 548 }
549 549
550 ret = move_vma(vma, addr, old_len, new_len, new_addr); 550 ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
551 } 551 }
552out: 552out:
553 if (ret & ~PAGE_MASK) 553 if (ret & ~PAGE_MASK)
554 vm_unacct_memory(charged); 554 vm_unacct_memory(charged);
555 up_write(&current->mm->mmap_sem); 555 up_write(&current->mm->mmap_sem);
556 if (locked && new_len > old_len)
557 mm_populate(new_addr + old_len, new_len - old_len);
556 return ret; 558 return ret;
557} 559}
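
Note: move_vma() now just reports *locked back to the mremap() syscall, which calls mm_populate() on the grown tail after releasing mmap_sem. From userspace this is the familiar behaviour that growing an mlocked mapping keeps the new part locked and prefaulted; a sketch (mlock may need RLIMIT_MEMLOCK raised):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t old_len = 16 * psz, new_len = 64 * psz;

	void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED || mlock(p, old_len)) {
		perror("mmap/mlock");
		return 1;
	}

	/* Grow the locked mapping; the kernel prefaults the new tail. */
	void *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	if (q == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("grew locked mapping from %zu to %zu bytes at %p\n",
	       old_len, new_len, q);

	munlock(q, new_len);
	munmap(q, new_len);
	return 0;
}
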
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index b8294fc03df8..5e07d36e381e 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -154,21 +154,6 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
154} 154}
155 155
156/** 156/**
157 * free_all_bootmem_node - release a node's free pages to the buddy allocator
158 * @pgdat: node to be released
159 *
160 * Returns the number of pages actually released.
161 */
162unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
163{
164 register_page_bootmem_info_node(pgdat);
165 reset_node_lowmem_managed_pages(pgdat);
166
167 /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
168 return 0;
169}
170
171/**
172 * free_all_bootmem - release free pages to the buddy allocator 157 * free_all_bootmem - release free pages to the buddy allocator
173 * 158 *
174 * Returns the number of pages actually released. 159 * Returns the number of pages actually released.
@@ -406,6 +391,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
406 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); 391 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
407} 392}
408 393
394void * __init __alloc_bootmem_low_nopanic(unsigned long size,
395 unsigned long align,
396 unsigned long goal)
397{
398 return ___alloc_bootmem_nopanic(size, align, goal,
399 ARCH_LOW_ADDRESS_LIMIT);
400}
401
409/** 402/**
410 * __alloc_bootmem_low_node - allocate low boot memory from a specific node 403 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
411 * @pgdat: node to allocate from 404 * @pgdat: node to allocate from
diff --git a/mm/nommu.c b/mm/nommu.c
index 79c3cac87afa..e19328087534 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -29,6 +29,7 @@
29#include <linux/security.h> 29#include <linux/security.h>
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/sched/sysctl.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/tlb.h> 35#include <asm/tlb.h>
@@ -139,10 +140,10 @@ unsigned int kobjsize(const void *objp)
139 return PAGE_SIZE << compound_order(page); 140 return PAGE_SIZE << compound_order(page);
140} 141}
141 142
142int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 143long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
143 unsigned long start, int nr_pages, unsigned int foll_flags, 144 unsigned long start, unsigned long nr_pages,
144 struct page **pages, struct vm_area_struct **vmas, 145 unsigned int foll_flags, struct page **pages,
145 int *retry) 146 struct vm_area_struct **vmas, int *nonblocking)
146{ 147{
147 struct vm_area_struct *vma; 148 struct vm_area_struct *vma;
148 unsigned long vm_flags; 149 unsigned long vm_flags;
@@ -189,9 +190,10 @@ finish_or_fault:
189 * slab page or a secondary page from a compound page 190 * slab page or a secondary page from a compound page
190 * - don't permit access to VMAs that don't support it, such as I/O mappings 191 * - don't permit access to VMAs that don't support it, such as I/O mappings
191 */ 192 */
192int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 193long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
193 unsigned long start, int nr_pages, int write, int force, 194 unsigned long start, unsigned long nr_pages,
194 struct page **pages, struct vm_area_struct **vmas) 195 int write, int force, struct page **pages,
196 struct vm_area_struct **vmas)
195{ 197{
196 int flags = 0; 198 int flags = 0;
197 199
@@ -941,7 +943,7 @@ static int validate_mmap_request(struct file *file,
941 */ 943 */
942 mapping = file->f_mapping; 944 mapping = file->f_mapping;
943 if (!mapping) 945 if (!mapping)
944 mapping = file->f_path.dentry->d_inode->i_mapping; 946 mapping = file_inode(file)->i_mapping;
945 947
946 capabilities = 0; 948 capabilities = 0;
947 if (mapping && mapping->backing_dev_info) 949 if (mapping && mapping->backing_dev_info)
@@ -950,7 +952,7 @@ static int validate_mmap_request(struct file *file,
950 if (!capabilities) { 952 if (!capabilities) {
951 /* no explicit capabilities set, so assume some 953 /* no explicit capabilities set, so assume some
952 * defaults */ 954 * defaults */
953 switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { 955 switch (file_inode(file)->i_mode & S_IFMT) {
954 case S_IFREG: 956 case S_IFREG:
955 case S_IFBLK: 957 case S_IFBLK:
956 capabilities = BDI_CAP_MAP_COPY; 958 capabilities = BDI_CAP_MAP_COPY;
@@ -985,11 +987,11 @@ static int validate_mmap_request(struct file *file,
985 !(file->f_mode & FMODE_WRITE)) 987 !(file->f_mode & FMODE_WRITE))
986 return -EACCES; 988 return -EACCES;
987 989
988 if (IS_APPEND(file->f_path.dentry->d_inode) && 990 if (IS_APPEND(file_inode(file)) &&
989 (file->f_mode & FMODE_WRITE)) 991 (file->f_mode & FMODE_WRITE))
990 return -EACCES; 992 return -EACCES;
991 993
992 if (locks_verify_locked(file->f_path.dentry->d_inode)) 994 if (locks_verify_locked(file_inode(file)))
993 return -EAGAIN; 995 return -EAGAIN;
994 996
995 if (!(capabilities & BDI_CAP_MAP_DIRECT)) 997 if (!(capabilities & BDI_CAP_MAP_DIRECT))
@@ -1249,7 +1251,8 @@ unsigned long do_mmap_pgoff(struct file *file,
1249 unsigned long len, 1251 unsigned long len,
1250 unsigned long prot, 1252 unsigned long prot,
1251 unsigned long flags, 1253 unsigned long flags,
1252 unsigned long pgoff) 1254 unsigned long pgoff,
1255 unsigned long *populate)
1253{ 1256{
1254 struct vm_area_struct *vma; 1257 struct vm_area_struct *vma;
1255 struct vm_region *region; 1258 struct vm_region *region;
@@ -1259,6 +1262,8 @@ unsigned long do_mmap_pgoff(struct file *file,
1259 1262
1260 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); 1263 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1261 1264
1265 *populate = 0;
1266
1262 /* decide whether we should attempt the mapping, and if so what sort of 1267 /* decide whether we should attempt the mapping, and if so what sort of
1263 * mapping */ 1268 * mapping */
1264 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1269 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1322,8 +1327,8 @@ unsigned long do_mmap_pgoff(struct file *file,
1322 continue; 1327 continue;
1323 1328
1324 /* search for overlapping mappings on the same file */ 1329 /* search for overlapping mappings on the same file */
1325 if (pregion->vm_file->f_path.dentry->d_inode != 1330 if (file_inode(pregion->vm_file) !=
1326 file->f_path.dentry->d_inode) 1331 file_inode(file))
1327 continue; 1332 continue;
1328 1333
1329 if (pregion->vm_pgoff >= pgend) 1334 if (pregion->vm_pgoff >= pgend)
@@ -1814,9 +1819,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1814 return ret; 1819 return ret;
1815} 1820}
1816 1821
1817struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1822struct page *follow_page_mask(struct vm_area_struct *vma,
1818 unsigned int foll_flags) 1823 unsigned long address, unsigned int flags,
1824 unsigned int *page_mask)
1819{ 1825{
1826 *page_mask = 0;
1820 return NULL; 1827 return NULL;
1821} 1828}
1822 1829
@@ -1903,7 +1910,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1903 */ 1910 */
1904 free -= global_page_state(NR_SHMEM); 1911 free -= global_page_state(NR_SHMEM);
1905 1912
1906 free += nr_swap_pages; 1913 free += get_nr_swap_pages();
1907 1914
1908 /* 1915 /*
1909 * Any slabs which are created with the 1916 * Any slabs which are created with the
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0399f146ae49..79e451a78c9e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -386,8 +386,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
386 cpuset_print_task_mems_allowed(current); 386 cpuset_print_task_mems_allowed(current);
387 task_unlock(current); 387 task_unlock(current);
388 dump_stack(); 388 dump_stack();
389 mem_cgroup_print_oom_info(memcg, p); 389 if (memcg)
390 show_mem(SHOW_MEM_FILTER_NODES); 390 mem_cgroup_print_oom_info(memcg, p);
391 else
392 show_mem(SHOW_MEM_FILTER_NODES);
391 if (sysctl_oom_dump_tasks) 393 if (sysctl_oom_dump_tasks)
392 dump_tasks(memcg, nodemask); 394 dump_tasks(memcg, nodemask);
393} 395}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3734cefd4de4..742c40583159 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -35,6 +35,7 @@
35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ 35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/timer.h> 37#include <linux/timer.h>
38#include <linux/sched/rt.h>
38#include <trace/events/writeback.h> 39#include <trace/events/writeback.h>
39 40
40/* 41/*
@@ -240,6 +241,9 @@ static unsigned long global_dirtyable_memory(void)
240 if (!vm_highmem_is_dirtyable) 241 if (!vm_highmem_is_dirtyable)
241 x -= highmem_dirtyable_memory(x); 242 x -= highmem_dirtyable_memory(x);
242 243
244 /* Subtract min_free_kbytes */
245 x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
246
243 return x + 1; /* Ensure that we never return 0 */ 247 return x + 1; /* Ensure that we never return 0 */
244} 248}
245 249
@@ -2291,3 +2295,27 @@ int mapping_tagged(struct address_space *mapping, int tag)
2291 return radix_tree_tagged(&mapping->page_tree, tag); 2295 return radix_tree_tagged(&mapping->page_tree, tag);
2292} 2296}
2293EXPORT_SYMBOL(mapping_tagged); 2297EXPORT_SYMBOL(mapping_tagged);
2298
2299/**
2300 * wait_for_stable_page() - wait for writeback to finish, if necessary.
2301 * @page: The page to wait on.
2302 *
2303 * This function determines if the given page is related to a backing device
2304 * that requires page contents to be held stable during writeback. If so, then
2305 * it will wait for any pending writeback to complete.
2306 */
2307void wait_for_stable_page(struct page *page)
2308{
2309 struct address_space *mapping = page_mapping(page);
2310 struct backing_dev_info *bdi = mapping->backing_dev_info;
2311
2312 if (!bdi_cap_stable_pages_required(bdi))
2313 return;
2314#ifdef CONFIG_NEED_BOUNCE_POOL
2315 if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE)
2316 return;
2317#endif /* CONFIG_NEED_BOUNCE_POOL */
2318
2319 wait_on_page_writeback(page);
2320}
2321EXPORT_SYMBOL_GPL(wait_for_stable_page);
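
A typical consumer of the new helper would be a page_mkwrite-style fault handler, which must not let userspace redirty a page that a stable-pages backing device is still writing out. The sketch below is illustrative only, assuming the 3.9-era ->page_mkwrite prototype; the handler name and the surrounding locking are assumptions, not code from this patch:

#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Illustrative only: a ->page_mkwrite handler that honours stable pages.
 * "example_page_mkwrite" is a hypothetical name, not part of this patch.
 */
static int example_page_mkwrite(struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	set_page_dirty(page);
	/*
	 * If the backing device needs stable page contents, this blocks
	 * until any in-flight writeback of the page has completed;
	 * otherwise it returns immediately.
	 */
	wait_for_stable_page(page);
	return VM_FAULT_LOCKED;
}
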
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bc6cc0e913bd..0dade3f18f7d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -58,6 +58,7 @@
58#include <linux/prefetch.h> 58#include <linux/prefetch.h>
59#include <linux/migrate.h> 59#include <linux/migrate.h>
60#include <linux/page-debug-flags.h> 60#include <linux/page-debug-flags.h>
61#include <linux/sched/rt.h>
61 62
62#include <asm/tlbflush.h> 63#include <asm/tlbflush.h>
63#include <asm/div64.h> 64#include <asm/div64.h>
@@ -201,11 +202,18 @@ static unsigned long __meminitdata nr_all_pages;
201static unsigned long __meminitdata dma_reserve; 202static unsigned long __meminitdata dma_reserve;
202 203
203#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 204#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
205/* Movable memory ranges, will also be used by memblock subsystem. */
206struct movablemem_map movablemem_map = {
207 .acpi = false,
208 .nr_map = 0,
209};
210
204static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 211static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
205static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 212static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
206static unsigned long __initdata required_kernelcore; 213static unsigned long __initdata required_kernelcore;
207static unsigned long __initdata required_movablecore; 214static unsigned long __initdata required_movablecore;
208static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 215static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
216static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
209 217
210/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 218/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
211int movable_zone; 219int movable_zone;
@@ -239,15 +247,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
239 int ret = 0; 247 int ret = 0;
240 unsigned seq; 248 unsigned seq;
241 unsigned long pfn = page_to_pfn(page); 249 unsigned long pfn = page_to_pfn(page);
250 unsigned long sp, start_pfn;
242 251
243 do { 252 do {
244 seq = zone_span_seqbegin(zone); 253 seq = zone_span_seqbegin(zone);
245 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 254 start_pfn = zone->zone_start_pfn;
246 ret = 1; 255 sp = zone->spanned_pages;
247 else if (pfn < zone->zone_start_pfn) 256 if (!zone_spans_pfn(zone, pfn))
248 ret = 1; 257 ret = 1;
249 } while (zone_span_seqretry(zone, seq)); 258 } while (zone_span_seqretry(zone, seq));
250 259
260 if (ret)
261 pr_err("page %lu outside zone [ %lu - %lu ]\n",
262 pfn, start_pfn, start_pfn + sp);
263
251 return ret; 264 return ret;
252} 265}
253 266
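
The open-coded start/span comparisons above are folded into zone_spans_pfn(). A minimal standalone model of the presumed helper semantics (the struct zone here is a mock with only the two relevant fields, not the kernel's definition):

#include <stdbool.h>
#include <stdio.h>

/* Mock of the two struct zone fields the check needs (illustration only). */
struct zone {
	unsigned long zone_start_pfn;
	unsigned long spanned_pages;
};

/* Presumed semantics: start_pfn <= pfn < start_pfn + spanned_pages. */
static bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
	return pfn >= zone->zone_start_pfn &&
	       pfn < zone->zone_start_pfn + zone->spanned_pages;
}

int main(void)
{
	struct zone z = { .zone_start_pfn = 0x1000, .spanned_pages = 0x800 };

	printf("%d %d %d\n",
	       zone_spans_pfn(&z, 0x0fff),	/* 0: just below the zone   */
	       zone_spans_pfn(&z, 0x1000),	/* 1: first pfn of the zone */
	       zone_spans_pfn(&z, 0x1800));	/* 0: one past the last pfn */
	return 0;
}
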
@@ -287,7 +300,7 @@ static void bad_page(struct page *page)
287 300
288 /* Don't complain about poisoned pages */ 301 /* Don't complain about poisoned pages */
289 if (PageHWPoison(page)) { 302 if (PageHWPoison(page)) {
290 reset_page_mapcount(page); /* remove PageBuddy */ 303 page_mapcount_reset(page); /* remove PageBuddy */
291 return; 304 return;
292 } 305 }
293 306
@@ -319,8 +332,8 @@ static void bad_page(struct page *page)
319 dump_stack(); 332 dump_stack();
320out: 333out:
321 /* Leave bad fields for debug, except PageBuddy could make trouble */ 334 /* Leave bad fields for debug, except PageBuddy could make trouble */
322 reset_page_mapcount(page); /* remove PageBuddy */ 335 page_mapcount_reset(page); /* remove PageBuddy */
323 add_taint(TAINT_BAD_PAGE); 336 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
324} 337}
325 338
326/* 339/*
@@ -532,6 +545,8 @@ static inline void __free_one_page(struct page *page,
532 unsigned long uninitialized_var(buddy_idx); 545 unsigned long uninitialized_var(buddy_idx);
533 struct page *buddy; 546 struct page *buddy;
534 547
548 VM_BUG_ON(!zone_is_initialized(zone));
549
535 if (unlikely(PageCompound(page))) 550 if (unlikely(PageCompound(page)))
536 if (unlikely(destroy_compound_page(page, order))) 551 if (unlikely(destroy_compound_page(page, order)))
537 return; 552 return;
@@ -605,7 +620,7 @@ static inline int free_pages_check(struct page *page)
605 bad_page(page); 620 bad_page(page);
606 return 1; 621 return 1;
607 } 622 }
608 reset_page_last_nid(page); 623 page_nid_reset_last(page);
609 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 624 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
610 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 625 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
611 return 0; 626 return 0;
@@ -665,7 +680,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
665 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 680 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
666 __free_one_page(page, zone, 0, mt); 681 __free_one_page(page, zone, 0, mt);
667 trace_mm_page_pcpu_drain(page, 0, mt); 682 trace_mm_page_pcpu_drain(page, 0, mt);
668 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { 683 if (likely(!is_migrate_isolate_page(page))) {
669 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 684 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
670 if (is_migrate_cma(mt)) 685 if (is_migrate_cma(mt))
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 686 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
@@ -683,7 +698,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
683 zone->pages_scanned = 0; 698 zone->pages_scanned = 0;
684 699
685 __free_one_page(page, zone, order, migratetype); 700 __free_one_page(page, zone, order, migratetype);
686 if (unlikely(migratetype != MIGRATE_ISOLATE)) 701 if (unlikely(!is_migrate_isolate(migratetype)))
687 __mod_zone_freepage_state(zone, 1 << order, migratetype); 702 __mod_zone_freepage_state(zone, 1 << order, migratetype);
688 spin_unlock(&zone->lock); 703 spin_unlock(&zone->lock);
689} 704}
@@ -773,6 +788,10 @@ void __init init_cma_reserved_pageblock(struct page *page)
773 set_pageblock_migratetype(page, MIGRATE_CMA); 788 set_pageblock_migratetype(page, MIGRATE_CMA);
774 __free_pages(page, pageblock_order); 789 __free_pages(page, pageblock_order);
775 totalram_pages += pageblock_nr_pages; 790 totalram_pages += pageblock_nr_pages;
791#ifdef CONFIG_HIGHMEM
792 if (PageHighMem(page))
793 totalhigh_pages += pageblock_nr_pages;
794#endif
776} 795}
777#endif 796#endif
778 797
@@ -911,7 +930,9 @@ static int fallbacks[MIGRATE_TYPES][4] = {
911 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 930 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
912#endif 931#endif
913 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 932 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
933#ifdef CONFIG_MEMORY_ISOLATION
914 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 934 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
935#endif
915}; 936};
916 937
917/* 938/*
@@ -976,9 +997,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
976 end_pfn = start_pfn + pageblock_nr_pages - 1; 997 end_pfn = start_pfn + pageblock_nr_pages - 1;
977 998
978 /* Do not cross zone boundaries */ 999 /* Do not cross zone boundaries */
979 if (start_pfn < zone->zone_start_pfn) 1000 if (!zone_spans_pfn(zone, start_pfn))
980 start_page = page; 1001 start_page = page;
981 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 1002 if (!zone_spans_pfn(zone, end_pfn))
982 return 0; 1003 return 0;
983 1004
984 return move_freepages(zone, start_page, end_page, migratetype); 1005 return move_freepages(zone, start_page, end_page, migratetype);
@@ -1137,7 +1158,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1137 list_add_tail(&page->lru, list); 1158 list_add_tail(&page->lru, list);
1138 if (IS_ENABLED(CONFIG_CMA)) { 1159 if (IS_ENABLED(CONFIG_CMA)) {
1139 mt = get_pageblock_migratetype(page); 1160 mt = get_pageblock_migratetype(page);
1140 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1161 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
1141 mt = migratetype; 1162 mt = migratetype;
1142 } 1163 }
1143 set_freepage_migratetype(page, mt); 1164 set_freepage_migratetype(page, mt);
@@ -1272,7 +1293,7 @@ void mark_free_pages(struct zone *zone)
1272 1293
1273 spin_lock_irqsave(&zone->lock, flags); 1294 spin_lock_irqsave(&zone->lock, flags);
1274 1295
1275 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1296 max_zone_pfn = zone_end_pfn(zone);
1276 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1297 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1277 if (pfn_valid(pfn)) { 1298 if (pfn_valid(pfn)) {
1278 struct page *page = pfn_to_page(pfn); 1299 struct page *page = pfn_to_page(pfn);
@@ -1321,7 +1342,7 @@ void free_hot_cold_page(struct page *page, int cold)
1321 * excessively into the page allocator 1342 * excessively into the page allocator
1322 */ 1343 */
1323 if (migratetype >= MIGRATE_PCPTYPES) { 1344 if (migratetype >= MIGRATE_PCPTYPES) {
1324 if (unlikely(migratetype == MIGRATE_ISOLATE)) { 1345 if (unlikely(is_migrate_isolate(migratetype))) {
1325 free_one_page(zone, page, 0, migratetype); 1346 free_one_page(zone, page, 0, migratetype);
1326 goto out; 1347 goto out;
1327 } 1348 }
@@ -1384,14 +1405,8 @@ void split_page(struct page *page, unsigned int order)
1384 set_page_refcounted(page + i); 1405 set_page_refcounted(page + i);
1385} 1406}
1386 1407
1387/* 1408static int __isolate_free_page(struct page *page, unsigned int order)
1388 * Similar to the split_page family of functions except that the page
1389 * required at the given order and being isolated now to prevent races
1390 * with parallel allocators
1391 */
1392int capture_free_page(struct page *page, int alloc_order, int migratetype)
1393{ 1409{
1394 unsigned int order;
1395 unsigned long watermark; 1410 unsigned long watermark;
1396 struct zone *zone; 1411 struct zone *zone;
1397 int mt; 1412 int mt;
@@ -1399,16 +1414,15 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1399 BUG_ON(!PageBuddy(page)); 1414 BUG_ON(!PageBuddy(page));
1400 1415
1401 zone = page_zone(page); 1416 zone = page_zone(page);
1402 order = page_order(page);
1403 mt = get_pageblock_migratetype(page); 1417 mt = get_pageblock_migratetype(page);
1404 1418
1405 if (mt != MIGRATE_ISOLATE) { 1419 if (!is_migrate_isolate(mt)) {
1406 /* Obey watermarks as if the page was being allocated */ 1420 /* Obey watermarks as if the page was being allocated */
1407 watermark = low_wmark_pages(zone) + (1 << order); 1421 watermark = low_wmark_pages(zone) + (1 << order);
1408 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1422 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1409 return 0; 1423 return 0;
1410 1424
1411 __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); 1425 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1412 } 1426 }
1413 1427
1414 /* Remove page from free list */ 1428 /* Remove page from free list */
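
The watermark test above only lets a free page be isolated if doing so would not dip the zone below its low watermark, treating the 2^order pages as if they were being allocated. A standalone sketch of that threshold arithmetic with made-up numbers (the real zone_watermark_ok() also inspects per-order free lists, which is omitted here):

#include <stdio.h>

int main(void)
{
	unsigned long low_wmark = 1000;		/* mock low watermark, pages  */
	unsigned long free_pages = 1005;	/* mock NR_FREE_PAGES in zone */
	unsigned int order = 3;			/* a 2^3 = 8 page buddy block */

	/* Obey the watermark as if the 2^order pages were being allocated. */
	unsigned long watermark = low_wmark + (1UL << order);

	if (free_pages < watermark)
		printf("refuse: only %lu free, need %lu\n",
		       free_pages, watermark);
	else
		printf("isolate %lu pages\n", 1UL << order);
	return 0;
}
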
@@ -1416,22 +1430,18 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1416 zone->free_area[order].nr_free--; 1430 zone->free_area[order].nr_free--;
1417 rmv_page_order(page); 1431 rmv_page_order(page);
1418 1432
1419 if (alloc_order != order) 1433 /* Set the pageblock if the isolated page is at least a pageblock */
1420 expand(zone, page, alloc_order, order,
1421 &zone->free_area[order], migratetype);
1422
1423 /* Set the pageblock if the captured page is at least a pageblock */
1424 if (order >= pageblock_order - 1) { 1434 if (order >= pageblock_order - 1) {
1425 struct page *endpage = page + (1 << order) - 1; 1435 struct page *endpage = page + (1 << order) - 1;
1426 for (; page < endpage; page += pageblock_nr_pages) { 1436 for (; page < endpage; page += pageblock_nr_pages) {
1427 int mt = get_pageblock_migratetype(page); 1437 int mt = get_pageblock_migratetype(page);
1428 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) 1438 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1429 set_pageblock_migratetype(page, 1439 set_pageblock_migratetype(page,
1430 MIGRATE_MOVABLE); 1440 MIGRATE_MOVABLE);
1431 } 1441 }
1432 } 1442 }
1433 1443
1434 return 1UL << alloc_order; 1444 return 1UL << order;
1435} 1445}
1436 1446
1437/* 1447/*
@@ -1449,10 +1459,9 @@ int split_free_page(struct page *page)
1449 unsigned int order; 1459 unsigned int order;
1450 int nr_pages; 1460 int nr_pages;
1451 1461
1452 BUG_ON(!PageBuddy(page));
1453 order = page_order(page); 1462 order = page_order(page);
1454 1463
1455 nr_pages = capture_free_page(page, order, 0); 1464 nr_pages = __isolate_free_page(page, order);
1456 if (!nr_pages) 1465 if (!nr_pages)
1457 return 0; 1466 return 0;
1458 1467
@@ -2136,8 +2145,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2136 bool *contended_compaction, bool *deferred_compaction, 2145 bool *contended_compaction, bool *deferred_compaction,
2137 unsigned long *did_some_progress) 2146 unsigned long *did_some_progress)
2138{ 2147{
2139 struct page *page = NULL;
2140
2141 if (!order) 2148 if (!order)
2142 return NULL; 2149 return NULL;
2143 2150
@@ -2149,16 +2156,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2149 current->flags |= PF_MEMALLOC; 2156 current->flags |= PF_MEMALLOC;
2150 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2157 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2151 nodemask, sync_migration, 2158 nodemask, sync_migration,
2152 contended_compaction, &page); 2159 contended_compaction);
2153 current->flags &= ~PF_MEMALLOC; 2160 current->flags &= ~PF_MEMALLOC;
2154 2161
2155 /* If compaction captured a page, prep and use it */
2156 if (page) {
2157 prep_new_page(page, order, gfp_mask);
2158 goto got_page;
2159 }
2160
2161 if (*did_some_progress != COMPACT_SKIPPED) { 2162 if (*did_some_progress != COMPACT_SKIPPED) {
2163 struct page *page;
2164
2162 /* Page migration frees to the PCP lists but we want merging */ 2165 /* Page migration frees to the PCP lists but we want merging */
2163 drain_pages(get_cpu()); 2166 drain_pages(get_cpu());
2164 put_cpu(); 2167 put_cpu();
@@ -2168,7 +2171,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2168 alloc_flags & ~ALLOC_NO_WATERMARKS, 2171 alloc_flags & ~ALLOC_NO_WATERMARKS,
2169 preferred_zone, migratetype); 2172 preferred_zone, migratetype);
2170 if (page) { 2173 if (page) {
2171got_page:
2172 preferred_zone->compact_blockskip_flush = false; 2174 preferred_zone->compact_blockskip_flush = false;
2173 preferred_zone->compact_considered = 0; 2175 preferred_zone->compact_considered = 0;
2174 preferred_zone->compact_defer_shift = 0; 2176 preferred_zone->compact_defer_shift = 0;
@@ -2629,10 +2631,17 @@ retry_cpuset:
2629 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2631 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2630 zonelist, high_zoneidx, alloc_flags, 2632 zonelist, high_zoneidx, alloc_flags,
2631 preferred_zone, migratetype); 2633 preferred_zone, migratetype);
2632 if (unlikely(!page)) 2634 if (unlikely(!page)) {
2635 /*
2636 * Runtime PM, block IO and its error handling path
2637 * can deadlock because I/O on the device might not
2638 * complete.
2639 */
2640 gfp_mask = memalloc_noio_flags(gfp_mask);
2633 page = __alloc_pages_slowpath(gfp_mask, order, 2641 page = __alloc_pages_slowpath(gfp_mask, order,
2634 zonelist, high_zoneidx, nodemask, 2642 zonelist, high_zoneidx, nodemask,
2635 preferred_zone, migratetype); 2643 preferred_zone, migratetype);
2644 }
2636 2645
2637 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2646 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2638 2647
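
The slow path now passes the gfp mask through memalloc_noio_flags() so that allocations issued from runtime-PM and block-I/O error paths cannot recurse into I/O and deadlock. A simplified standalone model of that idea follows; the bit values and the task-flag check are illustrative stand-ins, not the kernel's implementation:

#include <stdio.h>

/* Illustrative bit values and flag, not the kernel's definitions. */
#define __GFP_IO	0x40u
#define __GFP_FS	0x80u
#define GFP_KERNEL	(0x10u | __GFP_IO | __GFP_FS)

static int task_in_noio_context;	/* stand-in for a per-task flag */

/*
 * Presumed effect: when the current task has marked itself as running in
 * a context where I/O must not be issued, strip the bits that would let
 * reclaim enter the filesystem or block layer.
 */
static unsigned int memalloc_noio_flags(unsigned int gfp_mask)
{
	if (task_in_noio_context)
		gfp_mask &= ~(__GFP_IO | __GFP_FS);
	return gfp_mask;
}

int main(void)
{
	task_in_noio_context = 1;
	printf("%#x -> %#x\n", GFP_KERNEL, memalloc_noio_flags(GFP_KERNEL));
	return 0;
}
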
@@ -2804,18 +2813,27 @@ void free_pages_exact(void *virt, size_t size)
2804} 2813}
2805EXPORT_SYMBOL(free_pages_exact); 2814EXPORT_SYMBOL(free_pages_exact);
2806 2815
2807static unsigned int nr_free_zone_pages(int offset) 2816/**
2817 * nr_free_zone_pages - count number of pages beyond high watermark
2818 * @offset: The zone index of the highest zone
2819 *
 2820 * nr_free_zone_pages() counts the number of pages which are beyond the
2821 * high watermark within all zones at or below a given zone index. For each
2822 * zone, the number of pages is calculated as:
 2823 * managed_pages - high_pages
2824 */
2825static unsigned long nr_free_zone_pages(int offset)
2808{ 2826{
2809 struct zoneref *z; 2827 struct zoneref *z;
2810 struct zone *zone; 2828 struct zone *zone;
2811 2829
2812 /* Just pick one node, since fallback list is circular */ 2830 /* Just pick one node, since fallback list is circular */
2813 unsigned int sum = 0; 2831 unsigned long sum = 0;
2814 2832
2815 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2833 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2816 2834
2817 for_each_zone_zonelist(zone, z, zonelist, offset) { 2835 for_each_zone_zonelist(zone, z, zonelist, offset) {
2818 unsigned long size = zone->present_pages; 2836 unsigned long size = zone->managed_pages;
2819 unsigned long high = high_wmark_pages(zone); 2837 unsigned long high = high_wmark_pages(zone);
2820 if (size > high) 2838 if (size > high)
2821 sum += size - high; 2839 sum += size - high;
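
A standalone sketch of the per-zone sum documented above, using mock zone sizes and high watermarks and the same managed_pages-based accounting as the loop in nr_free_zone_pages():

#include <stdio.h>

struct zone_stat {			/* mock per-zone numbers, in pages */
	unsigned long managed_pages;
	unsigned long high_wmark;
};

int main(void)
{
	struct zone_stat zones[] = {
		{ .managed_pages = 4096,    .high_wmark = 128  },	/* DMA    */
		{ .managed_pages = 1048576, .high_wmark = 8192 },	/* Normal */
	};
	unsigned long sum = 0;
	unsigned int i;

	/* For each zone, count only the pages above its high watermark. */
	for (i = 0; i < sizeof(zones) / sizeof(zones[0]); i++)
		if (zones[i].managed_pages > zones[i].high_wmark)
			sum += zones[i].managed_pages - zones[i].high_wmark;

	printf("pages beyond the high watermark: %lu\n", sum);
	return 0;
}
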
@@ -2824,19 +2842,25 @@ static unsigned int nr_free_zone_pages(int offset)
2824 return sum; 2842 return sum;
2825} 2843}
2826 2844
2827/* 2845/**
2828 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 2846 * nr_free_buffer_pages - count number of pages beyond high watermark
2847 *
2848 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2849 * watermark within ZONE_DMA and ZONE_NORMAL.
2829 */ 2850 */
2830unsigned int nr_free_buffer_pages(void) 2851unsigned long nr_free_buffer_pages(void)
2831{ 2852{
2832 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2853 return nr_free_zone_pages(gfp_zone(GFP_USER));
2833} 2854}
2834EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2855EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2835 2856
2836/* 2857/**
2837 * Amount of free RAM allocatable within all zones 2858 * nr_free_pagecache_pages - count number of pages beyond high watermark
2859 *
2860 * nr_free_pagecache_pages() counts the number of pages which are beyond the
2861 * high watermark within all zones.
2838 */ 2862 */
2839unsigned int nr_free_pagecache_pages(void) 2863unsigned long nr_free_pagecache_pages(void)
2840{ 2864{
2841 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2865 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2842} 2866}
@@ -2868,7 +2892,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2868 val->totalram = pgdat->node_present_pages; 2892 val->totalram = pgdat->node_present_pages;
2869 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2893 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2870#ifdef CONFIG_HIGHMEM 2894#ifdef CONFIG_HIGHMEM
2871 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 2895 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
2872 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 2896 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2873 NR_FREE_PAGES); 2897 NR_FREE_PAGES);
2874#else 2898#else
@@ -2911,7 +2935,9 @@ static void show_migration_types(unsigned char type)
2911#ifdef CONFIG_CMA 2935#ifdef CONFIG_CMA
2912 [MIGRATE_CMA] = 'C', 2936 [MIGRATE_CMA] = 'C',
2913#endif 2937#endif
2938#ifdef CONFIG_MEMORY_ISOLATION
2914 [MIGRATE_ISOLATE] = 'I', 2939 [MIGRATE_ISOLATE] = 'I',
2940#endif
2915 }; 2941 };
2916 char tmp[MIGRATE_TYPES + 1]; 2942 char tmp[MIGRATE_TYPES + 1];
2917 char *p = tmp; 2943 char *p = tmp;
@@ -3250,7 +3276,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
3250{ 3276{
3251 int n, val; 3277 int n, val;
3252 int min_val = INT_MAX; 3278 int min_val = INT_MAX;
3253 int best_node = -1; 3279 int best_node = NUMA_NO_NODE;
3254 const struct cpumask *tmp = cpumask_of_node(0); 3280 const struct cpumask *tmp = cpumask_of_node(0);
3255 3281
3256 /* Use the local node if we haven't already */ 3282 /* Use the local node if we haven't already */
@@ -3794,7 +3820,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3794 * the block. 3820 * the block.
3795 */ 3821 */
3796 start_pfn = zone->zone_start_pfn; 3822 start_pfn = zone->zone_start_pfn;
3797 end_pfn = start_pfn + zone->spanned_pages; 3823 end_pfn = zone_end_pfn(zone);
3798 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3824 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3799 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3825 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3800 pageblock_order; 3826 pageblock_order;
@@ -3890,8 +3916,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3890 set_page_links(page, zone, nid, pfn); 3916 set_page_links(page, zone, nid, pfn);
3891 mminit_verify_page_links(page, zone, nid, pfn); 3917 mminit_verify_page_links(page, zone, nid, pfn);
3892 init_page_count(page); 3918 init_page_count(page);
3893 reset_page_mapcount(page); 3919 page_mapcount_reset(page);
3894 reset_page_last_nid(page); 3920 page_nid_reset_last(page);
3895 SetPageReserved(page); 3921 SetPageReserved(page);
3896 /* 3922 /*
3897 * Mark the block movable so that blocks are reserved for 3923 * Mark the block movable so that blocks are reserved for
@@ -3908,7 +3934,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3908 * pfn out of zone. 3934 * pfn out of zone.
3909 */ 3935 */
3910 if ((z->zone_start_pfn <= pfn) 3936 if ((z->zone_start_pfn <= pfn)
3911 && (pfn < z->zone_start_pfn + z->spanned_pages) 3937 && (pfn < zone_end_pfn(z))
3912 && !(pfn & (pageblock_nr_pages - 1))) 3938 && !(pfn & (pageblock_nr_pages - 1)))
3913 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3939 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3914 3940
@@ -3946,7 +3972,7 @@ static int __meminit zone_batchsize(struct zone *zone)
3946 * 3972 *
3947 * OK, so we don't know how big the cache is. So guess. 3973 * OK, so we don't know how big the cache is. So guess.
3948 */ 3974 */
3949 batch = zone->present_pages / 1024; 3975 batch = zone->managed_pages / 1024;
3950 if (batch * PAGE_SIZE > 512 * 1024) 3976 if (batch * PAGE_SIZE > 512 * 1024)
3951 batch = (512 * 1024) / PAGE_SIZE; 3977 batch = (512 * 1024) / PAGE_SIZE;
3952 batch /= 4; /* We effectively *= 4 below */ 3978 batch /= 4; /* We effectively *= 4 below */
@@ -4030,7 +4056,7 @@ static void __meminit setup_zone_pageset(struct zone *zone)
4030 4056
4031 if (percpu_pagelist_fraction) 4057 if (percpu_pagelist_fraction)
4032 setup_pagelist_highmark(pcp, 4058 setup_pagelist_highmark(pcp,
4033 (zone->present_pages / 4059 (zone->managed_pages /
4034 percpu_pagelist_fraction)); 4060 percpu_pagelist_fraction));
4035 } 4061 }
4036} 4062}
@@ -4386,6 +4412,77 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
4386 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4412 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4387} 4413}
4388 4414
4415/**
4416 * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array.
4417 *
 4418 * zone_movable_limit is initialized to 0. This function will try to get
 4419 * the first ZONE_MOVABLE pfn of each node from movablemem_map, and
 4420 * assign them to zone_movable_limit.
4421 * zone_movable_limit[nid] == 0 means no limit for the node.
4422 *
4423 * Note: Each range is represented as [start_pfn, end_pfn)
4424 */
4425static void __meminit sanitize_zone_movable_limit(void)
4426{
4427 int map_pos = 0, i, nid;
4428 unsigned long start_pfn, end_pfn;
4429
4430 if (!movablemem_map.nr_map)
4431 return;
4432
4433 /* Iterate all ranges from minimum to maximum */
4434 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4435 /*
 4436 * If we have already found the lowest ZONE_MOVABLE pfn of the node
 4437 * specified by the user, just go on to check the next range.
4438 */
4439 if (zone_movable_limit[nid])
4440 continue;
4441
4442#ifdef CONFIG_ZONE_DMA
4443 /* Skip DMA memory. */
4444 if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA])
4445 start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA];
4446#endif
4447
4448#ifdef CONFIG_ZONE_DMA32
4449 /* Skip DMA32 memory. */
4450 if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32])
4451 start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32];
4452#endif
4453
4454#ifdef CONFIG_HIGHMEM
4455 /* Skip lowmem if ZONE_MOVABLE is highmem. */
4456 if (zone_movable_is_highmem() &&
4457 start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])
4458 start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
4459#endif
4460
4461 if (start_pfn >= end_pfn)
4462 continue;
4463
4464 while (map_pos < movablemem_map.nr_map) {
4465 if (end_pfn <= movablemem_map.map[map_pos].start_pfn)
4466 break;
4467
4468 if (start_pfn >= movablemem_map.map[map_pos].end_pfn) {
4469 map_pos++;
4470 continue;
4471 }
4472
4473 /*
4474 * The start_pfn of ZONE_MOVABLE is either the minimum
4475 * pfn specified by movablemem_map, or 0, which means
4476 * the node has no ZONE_MOVABLE.
4477 */
4478 zone_movable_limit[nid] = max(start_pfn,
4479 movablemem_map.map[map_pos].start_pfn);
4480
4481 break;
4482 }
4483 }
4484}
4485
4389#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4486#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4390static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4487static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4391 unsigned long zone_type, 4488 unsigned long zone_type,
@@ -4403,7 +4500,6 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4403 4500
4404 return zholes_size[zone_type]; 4501 return zholes_size[zone_type];
4405} 4502}
4406
4407#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4503#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4408 4504
4409static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4505static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
@@ -4435,10 +4531,11 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4435 * round what is now in bits to nearest long in bits, then return it in 4531 * round what is now in bits to nearest long in bits, then return it in
4436 * bytes. 4532 * bytes.
4437 */ 4533 */
4438static unsigned long __init usemap_size(unsigned long zonesize) 4534static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4439{ 4535{
4440 unsigned long usemapsize; 4536 unsigned long usemapsize;
4441 4537
4538 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4442 usemapsize = roundup(zonesize, pageblock_nr_pages); 4539 usemapsize = roundup(zonesize, pageblock_nr_pages);
4443 usemapsize = usemapsize >> pageblock_order; 4540 usemapsize = usemapsize >> pageblock_order;
4444 usemapsize *= NR_PAGEBLOCK_BITS; 4541 usemapsize *= NR_PAGEBLOCK_BITS;
@@ -4448,17 +4545,19 @@ static unsigned long __init usemap_size(unsigned long zonesize)
4448} 4545}
4449 4546
4450static void __init setup_usemap(struct pglist_data *pgdat, 4547static void __init setup_usemap(struct pglist_data *pgdat,
4451 struct zone *zone, unsigned long zonesize) 4548 struct zone *zone,
4549 unsigned long zone_start_pfn,
4550 unsigned long zonesize)
4452{ 4551{
4453 unsigned long usemapsize = usemap_size(zonesize); 4552 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4454 zone->pageblock_flags = NULL; 4553 zone->pageblock_flags = NULL;
4455 if (usemapsize) 4554 if (usemapsize)
4456 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4555 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4457 usemapsize); 4556 usemapsize);
4458} 4557}
4459#else 4558#else
4460static inline void setup_usemap(struct pglist_data *pgdat, 4559static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4461 struct zone *zone, unsigned long zonesize) {} 4560 unsigned long zone_start_pfn, unsigned long zonesize) {}
4462#endif /* CONFIG_SPARSEMEM */ 4561#endif /* CONFIG_SPARSEMEM */
4463 4562
4464#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4563#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -4584,7 +4683,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4584 nr_all_pages += freesize; 4683 nr_all_pages += freesize;
4585 4684
4586 zone->spanned_pages = size; 4685 zone->spanned_pages = size;
4587 zone->present_pages = freesize; 4686 zone->present_pages = realsize;
4588 /* 4687 /*
4589 * Set an approximate value for lowmem here, it will be adjusted 4688 * Set an approximate value for lowmem here, it will be adjusted
4590 * when the bootmem allocator frees pages into the buddy system. 4689 * when the bootmem allocator frees pages into the buddy system.
@@ -4609,7 +4708,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4609 continue; 4708 continue;
4610 4709
4611 set_pageblock_order(); 4710 set_pageblock_order();
4612 setup_usemap(pgdat, zone, size); 4711 setup_usemap(pgdat, zone, zone_start_pfn, size);
4613 ret = init_currently_empty_zone(zone, zone_start_pfn, 4712 ret = init_currently_empty_zone(zone, zone_start_pfn,
4614 size, MEMMAP_EARLY); 4713 size, MEMMAP_EARLY);
4615 BUG_ON(ret); 4714 BUG_ON(ret);
@@ -4636,7 +4735,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4636 * for the buddy allocator to function correctly. 4735 * for the buddy allocator to function correctly.
4637 */ 4736 */
4638 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4737 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4639 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 4738 end = pgdat_end_pfn(pgdat);
4640 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4739 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4641 size = (end - start) * sizeof(struct page); 4740 size = (end - start) * sizeof(struct page);
4642 map = alloc_remap(pgdat->node_id, size); 4741 map = alloc_remap(pgdat->node_id, size);
@@ -4842,12 +4941,19 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4842 required_kernelcore = max(required_kernelcore, corepages); 4941 required_kernelcore = max(required_kernelcore, corepages);
4843 } 4942 }
4844 4943
4845 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4944 /*
4846 if (!required_kernelcore) 4945 * If neither kernelcore/movablecore nor movablemem_map is specified,
4946 * there is no ZONE_MOVABLE. But if movablemem_map is specified, the
4947 * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[].
4948 */
4949 if (!required_kernelcore) {
4950 if (movablemem_map.nr_map)
4951 memcpy(zone_movable_pfn, zone_movable_limit,
4952 sizeof(zone_movable_pfn));
4847 goto out; 4953 goto out;
4954 }
4848 4955
4849 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4956 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4850 find_usable_zone_for_movable();
4851 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 4957 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4852 4958
4853restart: 4959restart:
@@ -4875,10 +4981,24 @@ restart:
4875 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4981 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4876 unsigned long size_pages; 4982 unsigned long size_pages;
4877 4983
4984 /*
4985 * Find more memory for kernelcore in
4986 * [zone_movable_pfn[nid], zone_movable_limit[nid]).
4987 */
4878 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 4988 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4879 if (start_pfn >= end_pfn) 4989 if (start_pfn >= end_pfn)
4880 continue; 4990 continue;
4881 4991
4992 if (zone_movable_limit[nid]) {
4993 end_pfn = min(end_pfn, zone_movable_limit[nid]);
4994 /* No range left for kernelcore in this node */
4995 if (start_pfn >= end_pfn) {
4996 zone_movable_pfn[nid] =
4997 zone_movable_limit[nid];
4998 break;
4999 }
5000 }
5001
4882 /* Account for what is only usable for kernelcore */ 5002 /* Account for what is only usable for kernelcore */
4883 if (start_pfn < usable_startpfn) { 5003 if (start_pfn < usable_startpfn) {
4884 unsigned long kernel_pages; 5004 unsigned long kernel_pages;
@@ -4938,12 +5058,12 @@ restart:
4938 if (usable_nodes && required_kernelcore > usable_nodes) 5058 if (usable_nodes && required_kernelcore > usable_nodes)
4939 goto restart; 5059 goto restart;
4940 5060
5061out:
4941 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5062 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
4942 for (nid = 0; nid < MAX_NUMNODES; nid++) 5063 for (nid = 0; nid < MAX_NUMNODES; nid++)
4943 zone_movable_pfn[nid] = 5064 zone_movable_pfn[nid] =
4944 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5065 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4945 5066
4946out:
4947 /* restore the node_state */ 5067 /* restore the node_state */
4948 node_states[N_MEMORY] = saved_node_state; 5068 node_states[N_MEMORY] = saved_node_state;
4949} 5069}
@@ -5006,6 +5126,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5006 5126
5007 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5127 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5008 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5128 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5129 find_usable_zone_for_movable();
5130 sanitize_zone_movable_limit();
5009 find_zone_movable_pfns_for_nodes(); 5131 find_zone_movable_pfns_for_nodes();
5010 5132
5011 /* Print out the zone ranges */ 5133 /* Print out the zone ranges */
@@ -5089,6 +5211,181 @@ static int __init cmdline_parse_movablecore(char *p)
5089early_param("kernelcore", cmdline_parse_kernelcore); 5211early_param("kernelcore", cmdline_parse_kernelcore);
5090early_param("movablecore", cmdline_parse_movablecore); 5212early_param("movablecore", cmdline_parse_movablecore);
5091 5213
5214/**
5215 * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[].
5216 * @start_pfn: start pfn of the range to be checked
5217 * @end_pfn: end pfn of the range to be checked (exclusive)
5218 *
5219 * This function checks if a given memory range [start_pfn, end_pfn) overlaps
5220 * the movablemem_map.map[] array.
5221 *
 5222 * Return: index of the first overlapping element in movablemem_map.map[],
 5223 * or -1 if the range does not overlap any element.
5224 */
5225int __init movablemem_map_overlap(unsigned long start_pfn,
5226 unsigned long end_pfn)
5227{
5228 int overlap;
5229
5230 if (!movablemem_map.nr_map)
5231 return -1;
5232
5233 for (overlap = 0; overlap < movablemem_map.nr_map; overlap++)
5234 if (start_pfn < movablemem_map.map[overlap].end_pfn)
5235 break;
5236
5237 if (overlap == movablemem_map.nr_map ||
5238 end_pfn <= movablemem_map.map[overlap].start_pfn)
5239 return -1;
5240
5241 return overlap;
5242}
5243
5244/**
5245 * insert_movablemem_map - Insert a memory range in to movablemem_map.map.
5246 * @start_pfn: start pfn of the range
5247 * @end_pfn: end pfn of the range
5248 *
 5249 * This function also merges overlapping ranges, and keeps the array sorted
 5250 * by start_pfn in monotonically increasing order.
5251 */
5252void __init insert_movablemem_map(unsigned long start_pfn,
5253 unsigned long end_pfn)
5254{
5255 int pos, overlap;
5256
5257 /*
 5258 * pos ends up at the first overlapping range, or at the position
 5259 * where the new element should be inserted.
5260 */
5261 for (pos = 0; pos < movablemem_map.nr_map; pos++)
5262 if (start_pfn <= movablemem_map.map[pos].end_pfn)
5263 break;
5264
 5265 /* If there is no overlapping range, just insert the element. */
5266 if (pos == movablemem_map.nr_map ||
5267 end_pfn < movablemem_map.map[pos].start_pfn) {
5268 /*
 5269 * If pos is not at the end of the array, move all the
 5270 * remaining elements backward.
5271 */
5272 if (pos < movablemem_map.nr_map)
5273 memmove(&movablemem_map.map[pos+1],
5274 &movablemem_map.map[pos],
5275 sizeof(struct movablemem_entry) *
5276 (movablemem_map.nr_map - pos));
5277 movablemem_map.map[pos].start_pfn = start_pfn;
5278 movablemem_map.map[pos].end_pfn = end_pfn;
5279 movablemem_map.nr_map++;
5280 return;
5281 }
5282
5283 /* overlap will be at the last overlapped range */
5284 for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++)
5285 if (end_pfn < movablemem_map.map[overlap].start_pfn)
5286 break;
5287
5288 /*
 5289 * If more ranges overlap, we need to merge them and move
 5290 * the remaining elements forward.
5291 */
5292 overlap--;
5293 movablemem_map.map[pos].start_pfn = min(start_pfn,
5294 movablemem_map.map[pos].start_pfn);
5295 movablemem_map.map[pos].end_pfn = max(end_pfn,
5296 movablemem_map.map[overlap].end_pfn);
5297
5298 if (pos != overlap && overlap + 1 != movablemem_map.nr_map)
5299 memmove(&movablemem_map.map[pos+1],
5300 &movablemem_map.map[overlap+1],
5301 sizeof(struct movablemem_entry) *
5302 (movablemem_map.nr_map - overlap - 1));
5303
5304 movablemem_map.nr_map -= overlap - pos;
5305}
5306
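
Taken together, movablemem_map_overlap() and insert_movablemem_map() maintain a small array of pfn ranges that is kept sorted and merged. The standalone model below mirrors that behaviour with a fixed-size array; it is a simplified illustration, not the kernel code:

#include <stdio.h>
#include <string.h>

#define MAX_RANGES	32

struct range {				/* [start, end) in pfns */
	unsigned long start;
	unsigned long end;
};

static struct range map[MAX_RANGES];
static int nr_map;

/* Insert [start, end), merging with anything it touches, keeping order. */
static void insert_range(unsigned long start, unsigned long end)
{
	int pos, last;

	/* First entry whose end is not strictly below the new range. */
	for (pos = 0; pos < nr_map; pos++)
		if (start <= map[pos].end)
			break;

	if (pos == nr_map || end < map[pos].start) {
		/* No overlap: shift the tail and insert a fresh entry. */
		memmove(&map[pos + 1], &map[pos],
			(nr_map - pos) * sizeof(map[0]));
		map[pos].start = start;
		map[pos].end = end;
		nr_map++;
		return;
	}

	/* Find the last entry the new range reaches, then merge the run. */
	for (last = pos; last + 1 < nr_map; last++)
		if (end < map[last + 1].start)
			break;

	map[pos].start = start < map[pos].start ? start : map[pos].start;
	map[pos].end = end > map[last].end ? end : map[last].end;
	memmove(&map[pos + 1], &map[last + 1],
		(nr_map - last - 1) * sizeof(map[0]));
	nr_map -= last - pos;
}

/* Index of the first entry overlapping [start, end), or -1 if none. */
static int overlap(unsigned long start, unsigned long end)
{
	int i;

	for (i = 0; i < nr_map; i++)
		if (start < map[i].end)
			return end > map[i].start ? i : -1;
	return -1;
}

int main(void)
{
	int i;

	insert_range(0x1000, 0x2000);
	insert_range(0x3000, 0x4000);
	insert_range(0x1800, 0x3800);	/* bridges and merges both entries */

	for (i = 0; i < nr_map; i++)
		printf("[%#lx, %#lx)\n", map[i].start, map[i].end);
	printf("overlap(0x2000, 0x2100) = %d\n", overlap(0x2000, 0x2100));
	printf("overlap(0x5000, 0x6000) = %d\n", overlap(0x5000, 0x6000));
	return 0;
}
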
5307/**
5308 * movablemem_map_add_region - Add a memory range into movablemem_map.
5309 * @start: physical start address of range
 5310 * @size: size of the range in bytes
5311 *
 5312 * This function transforms the physical addresses into pfns, and then adds
 5313 * the range to movablemem_map by calling insert_movablemem_map().
5314 */
5315static void __init movablemem_map_add_region(u64 start, u64 size)
5316{
5317 unsigned long start_pfn, end_pfn;
5318
5319 /* In case size == 0 or start + size overflows */
5320 if (start + size <= start)
5321 return;
5322
5323 if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) {
5324 pr_err("movablemem_map: too many entries;"
5325 " ignoring [mem %#010llx-%#010llx]\n",
5326 (unsigned long long) start,
5327 (unsigned long long) (start + size - 1));
5328 return;
5329 }
5330
5331 start_pfn = PFN_DOWN(start);
5332 end_pfn = PFN_UP(start + size);
5333 insert_movablemem_map(start_pfn, end_pfn);
5334}
5335
5336/*
5337 * cmdline_parse_movablemem_map - Parse boot option movablemem_map.
5338 * @p: The boot option of the following format:
5339 * movablemem_map=nn[KMG]@ss[KMG]
5340 *
5341 * This option sets the memory range [ss, ss+nn) to be used as movable memory.
5342 *
5343 * Return: 0 on success or -EINVAL on failure.
5344 */
5345static int __init cmdline_parse_movablemem_map(char *p)
5346{
5347 char *oldp;
5348 u64 start_at, mem_size;
5349
5350 if (!p)
5351 goto err;
5352
5353 if (!strcmp(p, "acpi"))
5354 movablemem_map.acpi = true;
5355
5356 /*
 5357 * If the user decides to use info from the BIOS, all the other
 5358 * user-specified ranges will be ignored.
5359 */
5360 if (movablemem_map.acpi) {
5361 if (movablemem_map.nr_map) {
5362 memset(movablemem_map.map, 0,
5363 sizeof(struct movablemem_entry)
5364 * movablemem_map.nr_map);
5365 movablemem_map.nr_map = 0;
5366 }
5367 return 0;
5368 }
5369
5370 oldp = p;
5371 mem_size = memparse(p, &p);
5372 if (p == oldp)
5373 goto err;
5374
5375 if (*p == '@') {
5376 oldp = ++p;
5377 start_at = memparse(p, &p);
5378 if (p == oldp || *p != '\0')
5379 goto err;
5380
5381 movablemem_map_add_region(start_at, mem_size);
5382 return 0;
5383 }
5384err:
5385 return -EINVAL;
5386}
5387early_param("movablemem_map", cmdline_parse_movablemem_map);
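
As a worked example of the option format parsed above, movablemem_map=2G@4G asks for [4G, 6G) to be usable only as ZONE_MOVABLE; with 4 KiB pages, the PFN_DOWN/PFN_UP conversion in movablemem_map_add_region() yields the range computed below (PAGE_SHIFT=12 is an assumption for the example):

#include <stdio.h>

#define PAGE_SHIFT	12ULL		/* assuming 4 KiB pages */

int main(void)
{
	unsigned long long start = 4ULL << 30;	/* ...@4G */
	unsigned long long size  = 2ULL << 30;	/* 2G@... */

	/* PFN_DOWN(start) and PFN_UP(start + size), written out by hand. */
	unsigned long long start_pfn = start >> PAGE_SHIFT;
	unsigned long long end_pfn =
		(start + size + (1ULL << PAGE_SHIFT) - 1) >> PAGE_SHIFT;

	printf("movablemem_map=2G@4G -> pfns [%llu, %llu)\n",
	       start_pfn, end_pfn);
	return 0;
}
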
5388
5092#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5389#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5093 5390
5094/** 5391/**
@@ -5171,8 +5468,8 @@ static void calculate_totalreserve_pages(void)
5171 /* we treat the high watermark as reserved pages. */ 5468 /* we treat the high watermark as reserved pages. */
5172 max += high_wmark_pages(zone); 5469 max += high_wmark_pages(zone);
5173 5470
5174 if (max > zone->present_pages) 5471 if (max > zone->managed_pages)
5175 max = zone->present_pages; 5472 max = zone->managed_pages;
5176 reserve_pages += max; 5473 reserve_pages += max;
5177 /* 5474 /*
5178 * Lowmem reserves are not available to 5475 * Lowmem reserves are not available to
@@ -5204,7 +5501,7 @@ static void setup_per_zone_lowmem_reserve(void)
5204 for_each_online_pgdat(pgdat) { 5501 for_each_online_pgdat(pgdat) {
5205 for (j = 0; j < MAX_NR_ZONES; j++) { 5502 for (j = 0; j < MAX_NR_ZONES; j++) {
5206 struct zone *zone = pgdat->node_zones + j; 5503 struct zone *zone = pgdat->node_zones + j;
5207 unsigned long present_pages = zone->present_pages; 5504 unsigned long managed_pages = zone->managed_pages;
5208 5505
5209 zone->lowmem_reserve[j] = 0; 5506 zone->lowmem_reserve[j] = 0;
5210 5507
@@ -5218,9 +5515,9 @@ static void setup_per_zone_lowmem_reserve(void)
5218 sysctl_lowmem_reserve_ratio[idx] = 1; 5515 sysctl_lowmem_reserve_ratio[idx] = 1;
5219 5516
5220 lower_zone = pgdat->node_zones + idx; 5517 lower_zone = pgdat->node_zones + idx;
5221 lower_zone->lowmem_reserve[j] = present_pages / 5518 lower_zone->lowmem_reserve[j] = managed_pages /
5222 sysctl_lowmem_reserve_ratio[idx]; 5519 sysctl_lowmem_reserve_ratio[idx];
5223 present_pages += lower_zone->present_pages; 5520 managed_pages += lower_zone->managed_pages;
5224 } 5521 }
5225 } 5522 }
5226 } 5523 }
@@ -5239,14 +5536,14 @@ static void __setup_per_zone_wmarks(void)
5239 /* Calculate total number of !ZONE_HIGHMEM pages */ 5536 /* Calculate total number of !ZONE_HIGHMEM pages */
5240 for_each_zone(zone) { 5537 for_each_zone(zone) {
5241 if (!is_highmem(zone)) 5538 if (!is_highmem(zone))
5242 lowmem_pages += zone->present_pages; 5539 lowmem_pages += zone->managed_pages;
5243 } 5540 }
5244 5541
5245 for_each_zone(zone) { 5542 for_each_zone(zone) {
5246 u64 tmp; 5543 u64 tmp;
5247 5544
5248 spin_lock_irqsave(&zone->lock, flags); 5545 spin_lock_irqsave(&zone->lock, flags);
5249 tmp = (u64)pages_min * zone->present_pages; 5546 tmp = (u64)pages_min * zone->managed_pages;
5250 do_div(tmp, lowmem_pages); 5547 do_div(tmp, lowmem_pages);
5251 if (is_highmem(zone)) { 5548 if (is_highmem(zone)) {
5252 /* 5549 /*
@@ -5258,13 +5555,10 @@ static void __setup_per_zone_wmarks(void)
5258 * deltas controls asynch page reclaim, and so should 5555 * deltas controls asynch page reclaim, and so should
5259 * not be capped for highmem. 5556 * not be capped for highmem.
5260 */ 5557 */
5261 int min_pages; 5558 unsigned long min_pages;
5262 5559
5263 min_pages = zone->present_pages / 1024; 5560 min_pages = zone->managed_pages / 1024;
5264 if (min_pages < SWAP_CLUSTER_MAX) 5561 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5265 min_pages = SWAP_CLUSTER_MAX;
5266 if (min_pages > 128)
5267 min_pages = 128;
5268 zone->watermark[WMARK_MIN] = min_pages; 5562 zone->watermark[WMARK_MIN] = min_pages;
5269 } else { 5563 } else {
5270 /* 5564 /*
@@ -5325,7 +5619,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5325 unsigned int gb, ratio; 5619 unsigned int gb, ratio;
5326 5620
5327 /* Zone size in gigabytes */ 5621 /* Zone size in gigabytes */
5328 gb = zone->present_pages >> (30 - PAGE_SHIFT); 5622 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5329 if (gb) 5623 if (gb)
5330 ratio = int_sqrt(10 * gb); 5624 ratio = int_sqrt(10 * gb);
5331 else 5625 else
@@ -5411,7 +5705,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5411 return rc; 5705 return rc;
5412 5706
5413 for_each_zone(zone) 5707 for_each_zone(zone)
5414 zone->min_unmapped_pages = (zone->present_pages * 5708 zone->min_unmapped_pages = (zone->managed_pages *
5415 sysctl_min_unmapped_ratio) / 100; 5709 sysctl_min_unmapped_ratio) / 100;
5416 return 0; 5710 return 0;
5417} 5711}
@@ -5427,7 +5721,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5427 return rc; 5721 return rc;
5428 5722
5429 for_each_zone(zone) 5723 for_each_zone(zone)
5430 zone->min_slab_pages = (zone->present_pages * 5724 zone->min_slab_pages = (zone->managed_pages *
5431 sysctl_min_slab_ratio) / 100; 5725 sysctl_min_slab_ratio) / 100;
5432 return 0; 5726 return 0;
5433} 5727}
@@ -5469,7 +5763,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5469 for_each_populated_zone(zone) { 5763 for_each_populated_zone(zone) {
5470 for_each_possible_cpu(cpu) { 5764 for_each_possible_cpu(cpu) {
5471 unsigned long high; 5765 unsigned long high;
5472 high = zone->present_pages / percpu_pagelist_fraction; 5766 high = zone->managed_pages / percpu_pagelist_fraction;
5473 setup_pagelist_highmark( 5767 setup_pagelist_highmark(
5474 per_cpu_ptr(zone->pageset, cpu), high); 5768 per_cpu_ptr(zone->pageset, cpu), high);
5475 } 5769 }
@@ -5604,7 +5898,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5604 pfn &= (PAGES_PER_SECTION-1); 5898 pfn &= (PAGES_PER_SECTION-1);
5605 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5899 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5606#else 5900#else
5607 pfn = pfn - zone->zone_start_pfn; 5901 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5608 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5902 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5609#endif /* CONFIG_SPARSEMEM */ 5903#endif /* CONFIG_SPARSEMEM */
5610} 5904}
@@ -5656,8 +5950,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5656 pfn = page_to_pfn(page); 5950 pfn = page_to_pfn(page);
5657 bitmap = get_pageblock_bitmap(zone, pfn); 5951 bitmap = get_pageblock_bitmap(zone, pfn);
5658 bitidx = pfn_to_bitidx(zone, pfn); 5952 bitidx = pfn_to_bitidx(zone, pfn);
5659 VM_BUG_ON(pfn < zone->zone_start_pfn); 5953 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
5660 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5661 5954
5662 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5955 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5663 if (flags & value) 5956 if (flags & value)
@@ -5755,8 +6048,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5755 6048
5756 zone = page_zone(page); 6049 zone = page_zone(page);
5757 pfn = page_to_pfn(page); 6050 pfn = page_to_pfn(page);
5758 if (zone->zone_start_pfn > pfn || 6051 if (!zone_spans_pfn(zone, pfn))
5759 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5760 return false; 6052 return false;
5761 6053
5762 return !has_unmovable_pages(zone, page, 0, true); 6054 return !has_unmovable_pages(zone, page, 0, true);
@@ -5812,14 +6104,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5812 &cc->migratepages); 6104 &cc->migratepages);
5813 cc->nr_migratepages -= nr_reclaimed; 6105 cc->nr_migratepages -= nr_reclaimed;
5814 6106
5815 ret = migrate_pages(&cc->migratepages, 6107 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
5816 alloc_migrate_target, 6108 0, MIGRATE_SYNC, MR_CMA);
5817 0, false, MIGRATE_SYNC,
5818 MR_CMA);
5819 } 6109 }
5820 6110 if (ret < 0) {
5821 putback_movable_pages(&cc->migratepages); 6111 putback_movable_pages(&cc->migratepages);
5822 return ret > 0 ? 0 : ret; 6112 return ret;
6113 }
6114 return 0;
5823} 6115}
5824 6116
5825/** 6117/**
diff --git a/mm/rmap.c b/mm/rmap.c
index 2c78f8cadc95..807c96bf0dc6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -105,7 +105,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
105 */ 105 */
106 if (rwsem_is_locked(&anon_vma->root->rwsem)) { 106 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
107 anon_vma_lock_write(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 anon_vma_unlock(anon_vma); 108 anon_vma_unlock_write(anon_vma);
109 } 109 }
110 110
111 kmem_cache_free(anon_vma_cachep, anon_vma); 111 kmem_cache_free(anon_vma_cachep, anon_vma);
@@ -191,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
191 avc = NULL; 191 avc = NULL;
192 } 192 }
193 spin_unlock(&mm->page_table_lock); 193 spin_unlock(&mm->page_table_lock);
194 anon_vma_unlock(anon_vma); 194 anon_vma_unlock_write(anon_vma);
195 195
196 if (unlikely(allocated)) 196 if (unlikely(allocated))
197 put_anon_vma(allocated); 197 put_anon_vma(allocated);
@@ -308,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
308 vma->anon_vma = anon_vma; 308 vma->anon_vma = anon_vma;
309 anon_vma_lock_write(anon_vma); 309 anon_vma_lock_write(anon_vma);
310 anon_vma_chain_link(vma, avc, anon_vma); 310 anon_vma_chain_link(vma, avc, anon_vma);
311 anon_vma_unlock(anon_vma); 311 anon_vma_unlock_write(anon_vma);
312 312
313 return 0; 313 return 0;
314 314
@@ -1126,7 +1126,6 @@ void page_add_file_rmap(struct page *page)
1126 */ 1126 */
1127void page_remove_rmap(struct page *page) 1127void page_remove_rmap(struct page *page)
1128{ 1128{
1129 struct address_space *mapping = page_mapping(page);
1130 bool anon = PageAnon(page); 1129 bool anon = PageAnon(page);
1131 bool locked; 1130 bool locked;
1132 unsigned long flags; 1131 unsigned long flags;
@@ -1144,29 +1143,6 @@ void page_remove_rmap(struct page *page)
1144 goto out; 1143 goto out;
1145 1144
1146 /* 1145 /*
1147 * Now that the last pte has gone, s390 must transfer dirty
1148 * flag from storage key to struct page. We can usually skip
1149 * this if the page is anon, so about to be freed; but perhaps
1150 * not if it's in swapcache - there might be another pte slot
1151 * containing the swap entry, but page not yet written to swap.
1152 *
1153 * And we can skip it on file pages, so long as the filesystem
1154 * participates in dirty tracking (note that this is not only an
1155 * optimization but also solves problems caused by dirty flag in
1156 * storage key getting set by a write from inside kernel); but need to
1157 * catch shm and tmpfs and ramfs pages which have been modified since
1158 * creation by read fault.
1159 *
1160 * Note that mapping must be decided above, before decrementing
1161 * mapcount (which luckily provides a barrier): once page is unmapped,
1162 * it could be truncated and page->mapping reset to NULL at any moment.
1163 * Note also that we are relying on page_mapping(page) to set mapping
1164 * to &swapper_space when PageSwapCache(page).
1165 */
1166 if (mapping && !mapping_cap_account_dirty(mapping) &&
1167 page_test_and_clear_dirty(page_to_pfn(page), 1))
1168 set_page_dirty(page);
1169 /*
1170 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED 1146 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
1171 * and not charged by memcg for now. 1147 * and not charged by memcg for now.
1172 */ 1148 */
diff --git a/mm/shmem.c b/mm/shmem.c
index 5dd56f6efdbd..ed2befb4952e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -335,19 +335,19 @@ static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
335 pgoff_t start, unsigned int nr_pages, 335 pgoff_t start, unsigned int nr_pages,
336 struct page **pages, pgoff_t *indices) 336 struct page **pages, pgoff_t *indices)
337{ 337{
338 unsigned int i; 338 void **slot;
339 unsigned int ret; 339 unsigned int ret = 0;
340 unsigned int nr_found; 340 struct radix_tree_iter iter;
341
342 if (!nr_pages)
343 return 0;
341 344
342 rcu_read_lock(); 345 rcu_read_lock();
343restart: 346restart:
344 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 347 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
345 (void ***)pages, indices, start, nr_pages);
346 ret = 0;
347 for (i = 0; i < nr_found; i++) {
348 struct page *page; 348 struct page *page;
349repeat: 349repeat:
350 page = radix_tree_deref_slot((void **)pages[i]); 350 page = radix_tree_deref_slot(slot);
351 if (unlikely(!page)) 351 if (unlikely(!page))
352 continue; 352 continue;
353 if (radix_tree_exception(page)) { 353 if (radix_tree_exception(page)) {
@@ -364,17 +364,16 @@ repeat:
364 goto repeat; 364 goto repeat;
365 365
366 /* Has the page moved? */ 366 /* Has the page moved? */
367 if (unlikely(page != *((void **)pages[i]))) { 367 if (unlikely(page != *slot)) {
368 page_cache_release(page); 368 page_cache_release(page);
369 goto repeat; 369 goto repeat;
370 } 370 }
371export: 371export:
372 indices[ret] = indices[i]; 372 indices[ret] = iter.index;
373 pages[ret] = page; 373 pages[ret] = page;
374 ret++; 374 if (++ret == nr_pages)
375 break;
375 } 376 }
376 if (unlikely(!ret && nr_found))
377 goto restart;
378 rcu_read_unlock(); 377 rcu_read_unlock();
379 return ret; 378 return ret;
380} 379}
@@ -1295,7 +1294,7 @@ unlock:
1295 1294
1296static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1295static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1297{ 1296{
1298 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1297 struct inode *inode = file_inode(vma->vm_file);
1299 int error; 1298 int error;
1300 int ret = VM_FAULT_LOCKED; 1299 int ret = VM_FAULT_LOCKED;
1301 1300
@@ -1313,14 +1312,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1313#ifdef CONFIG_NUMA 1312#ifdef CONFIG_NUMA
1314static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 1313static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1315{ 1314{
1316 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1315 struct inode *inode = file_inode(vma->vm_file);
1317 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 1316 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1318} 1317}
1319 1318
1320static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1319static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1321 unsigned long addr) 1320 unsigned long addr)
1322{ 1321{
1323 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1322 struct inode *inode = file_inode(vma->vm_file);
1324 pgoff_t index; 1323 pgoff_t index;
1325 1324
1326 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1325 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -1330,7 +1329,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1330 1329
1331int shmem_lock(struct file *file, int lock, struct user_struct *user) 1330int shmem_lock(struct file *file, int lock, struct user_struct *user)
1332{ 1331{
1333 struct inode *inode = file->f_path.dentry->d_inode; 1332 struct inode *inode = file_inode(file);
1334 struct shmem_inode_info *info = SHMEM_I(inode); 1333 struct shmem_inode_info *info = SHMEM_I(inode);
1335 int retval = -ENOMEM; 1334 int retval = -ENOMEM;
1336 1335
@@ -1465,7 +1464,7 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1465 1464
1466static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) 1465static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1467{ 1466{
1468 struct inode *inode = filp->f_path.dentry->d_inode; 1467 struct inode *inode = file_inode(filp);
1469 struct address_space *mapping = inode->i_mapping; 1468 struct address_space *mapping = inode->i_mapping;
1470 pgoff_t index; 1469 pgoff_t index;
1471 unsigned long offset; 1470 unsigned long offset;
@@ -1808,7 +1807,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
1808static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1807static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1809 loff_t len) 1808 loff_t len)
1810{ 1809{
1811 struct inode *inode = file->f_path.dentry->d_inode; 1810 struct inode *inode = file_inode(file);
1812 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1811 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1813 struct shmem_falloc shmem_falloc; 1812 struct shmem_falloc shmem_falloc;
1814 pgoff_t start, index, end; 1813 pgoff_t start, index, end;
@@ -2351,7 +2350,7 @@ static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
2351{ 2350{
2352 if (*len < 3) { 2351 if (*len < 3) {
2353 *len = 3; 2352 *len = 3;
2354 return 255; 2353 return FILEID_INVALID;
2355 } 2354 }
2356 2355
2357 if (inode_unhashed(inode)) { 2356 if (inode_unhashed(inode)) {
@@ -2386,6 +2385,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2386 bool remount) 2385 bool remount)
2387{ 2386{
2388 char *this_char, *value, *rest; 2387 char *this_char, *value, *rest;
2388 struct mempolicy *mpol = NULL;
2389 uid_t uid; 2389 uid_t uid;
2390 gid_t gid; 2390 gid_t gid;
2391 2391
@@ -2414,7 +2414,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2414 printk(KERN_ERR 2414 printk(KERN_ERR
2415 "tmpfs: No value for mount option '%s'\n", 2415 "tmpfs: No value for mount option '%s'\n",
2416 this_char); 2416 this_char);
2417 return 1; 2417 goto error;
2418 } 2418 }
2419 2419
2420 if (!strcmp(this_char,"size")) { 2420 if (!strcmp(this_char,"size")) {
@@ -2463,19 +2463,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2463 if (!gid_valid(sbinfo->gid)) 2463 if (!gid_valid(sbinfo->gid))
2464 goto bad_val; 2464 goto bad_val;
2465 } else if (!strcmp(this_char,"mpol")) { 2465 } else if (!strcmp(this_char,"mpol")) {
2466 if (mpol_parse_str(value, &sbinfo->mpol)) 2466 mpol_put(mpol);
2467 mpol = NULL;
2468 if (mpol_parse_str(value, &mpol))
2467 goto bad_val; 2469 goto bad_val;
2468 } else { 2470 } else {
2469 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2471 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2470 this_char); 2472 this_char);
2471 return 1; 2473 goto error;
2472 } 2474 }
2473 } 2475 }
2476 sbinfo->mpol = mpol;
2474 return 0; 2477 return 0;
2475 2478
2476bad_val: 2479bad_val:
2477 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", 2480 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2478 value, this_char); 2481 value, this_char);
2482error:
2483 mpol_put(mpol);
2479 return 1; 2484 return 1;
2480 2485
2481} 2486}
@@ -2487,6 +2492,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2487 unsigned long inodes; 2492 unsigned long inodes;
2488 int error = -EINVAL; 2493 int error = -EINVAL;
2489 2494
2495 config.mpol = NULL;
2490 if (shmem_parse_options(data, &config, true)) 2496 if (shmem_parse_options(data, &config, true))
2491 return error; 2497 return error;
2492 2498
@@ -2511,8 +2517,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2511 sbinfo->max_inodes = config.max_inodes; 2517 sbinfo->max_inodes = config.max_inodes;
2512 sbinfo->free_inodes = config.max_inodes - inodes; 2518 sbinfo->free_inodes = config.max_inodes - inodes;
2513 2519
2514 mpol_put(sbinfo->mpol); 2520 /*
2515 sbinfo->mpol = config.mpol; /* transfers initial ref */ 2521 * Preserve previous mempolicy unless mpol remount option was specified.
2522 */
2523 if (config.mpol) {
2524 mpol_put(sbinfo->mpol);
2525 sbinfo->mpol = config.mpol; /* transfers initial ref */
2526 }
2516out: 2527out:
2517 spin_unlock(&sbinfo->stat_lock); 2528 spin_unlock(&sbinfo->stat_lock);
2518 return error; 2529 return error;
@@ -2545,6 +2556,7 @@ static void shmem_put_super(struct super_block *sb)
2545 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2556 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2546 2557
2547 percpu_counter_destroy(&sbinfo->used_blocks); 2558 percpu_counter_destroy(&sbinfo->used_blocks);
2559 mpol_put(sbinfo->mpol);
2548 kfree(sbinfo); 2560 kfree(sbinfo);
2549 sb->s_fs_info = NULL; 2561 sb->s_fs_info = NULL;
2550} 2562}
@@ -2766,6 +2778,7 @@ static struct file_system_type shmem_fs_type = {
2766 .name = "tmpfs", 2778 .name = "tmpfs",
2767 .mount = shmem_mount, 2779 .mount = shmem_mount,
2768 .kill_sb = kill_litter_super, 2780 .kill_sb = kill_litter_super,
2781 .fs_flags = FS_USERNS_MOUNT,
2769}; 2782};
2770 2783
2771int __init shmem_init(void) 2784int __init shmem_init(void)
@@ -2823,6 +2836,7 @@ static struct file_system_type shmem_fs_type = {
2823 .name = "tmpfs", 2836 .name = "tmpfs",
2824 .mount = ramfs_mount, 2837 .mount = ramfs_mount,
2825 .kill_sb = kill_litter_super, 2838 .kill_sb = kill_litter_super,
2839 .fs_flags = FS_USERNS_MOUNT,
2826}; 2840};
2827 2841
2828int __init shmem_init(void) 2842int __init shmem_init(void)
@@ -2865,6 +2879,16 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
2865 2879
2866/* common code */ 2880/* common code */
2867 2881
2882static char *shmem_dname(struct dentry *dentry, char *buffer, int buflen)
2883{
2884 return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)",
2885 dentry->d_name.name);
2886}
2887
2888static struct dentry_operations anon_ops = {
2889 .d_dname = shmem_dname
2890};
2891
2868/** 2892/**
2869 * shmem_file_setup - get an unlinked file living in tmpfs 2893 * shmem_file_setup - get an unlinked file living in tmpfs
2870 * @name: name for dentry (to be seen in /proc/<pid>/maps 2894 * @name: name for dentry (to be seen in /proc/<pid>/maps
@@ -2873,15 +2897,14 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
2873 */ 2897 */
2874struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) 2898struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2875{ 2899{
2876 int error; 2900 struct file *res;
2877 struct file *file;
2878 struct inode *inode; 2901 struct inode *inode;
2879 struct path path; 2902 struct path path;
2880 struct dentry *root; 2903 struct super_block *sb;
2881 struct qstr this; 2904 struct qstr this;
2882 2905
2883 if (IS_ERR(shm_mnt)) 2906 if (IS_ERR(shm_mnt))
2884 return (void *)shm_mnt; 2907 return ERR_CAST(shm_mnt);
2885 2908
2886 if (size < 0 || size > MAX_LFS_FILESIZE) 2909 if (size < 0 || size > MAX_LFS_FILESIZE)
2887 return ERR_PTR(-EINVAL); 2910 return ERR_PTR(-EINVAL);
@@ -2889,18 +2912,19 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2889 if (shmem_acct_size(flags, size)) 2912 if (shmem_acct_size(flags, size))
2890 return ERR_PTR(-ENOMEM); 2913 return ERR_PTR(-ENOMEM);
2891 2914
2892 error = -ENOMEM; 2915 res = ERR_PTR(-ENOMEM);
2893 this.name = name; 2916 this.name = name;
2894 this.len = strlen(name); 2917 this.len = strlen(name);
2895 this.hash = 0; /* will go */ 2918 this.hash = 0; /* will go */
2896 root = shm_mnt->mnt_root; 2919 sb = shm_mnt->mnt_sb;
2897 path.dentry = d_alloc(root, &this); 2920 path.dentry = d_alloc_pseudo(sb, &this);
2898 if (!path.dentry) 2921 if (!path.dentry)
2899 goto put_memory; 2922 goto put_memory;
2923 d_set_d_op(path.dentry, &anon_ops);
2900 path.mnt = mntget(shm_mnt); 2924 path.mnt = mntget(shm_mnt);
2901 2925
2902 error = -ENOSPC; 2926 res = ERR_PTR(-ENOSPC);
2903 inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); 2927 inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
2904 if (!inode) 2928 if (!inode)
2905 goto put_dentry; 2929 goto put_dentry;
2906 2930
@@ -2909,23 +2933,23 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2909 clear_nlink(inode); /* It is unlinked */ 2933 clear_nlink(inode); /* It is unlinked */
2910#ifndef CONFIG_MMU 2934#ifndef CONFIG_MMU
2911 error = ramfs_nommu_expand_for_mapping(inode, size); 2935 error = ramfs_nommu_expand_for_mapping(inode, size);
2936 res = ERR_PTR(error);
2912 if (error) 2937 if (error)
2913 goto put_dentry; 2938 goto put_dentry;
2914#endif 2939#endif
2915 2940
2916 error = -ENFILE; 2941 res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2917 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2918 &shmem_file_operations); 2942 &shmem_file_operations);
2919 if (!file) 2943 if (IS_ERR(res))
2920 goto put_dentry; 2944 goto put_dentry;
2921 2945
2922 return file; 2946 return res;
2923 2947
2924put_dentry: 2948put_dentry:
2925 path_put(&path); 2949 path_put(&path);
2926put_memory: 2950put_memory:
2927 shmem_unacct_size(flags, size); 2951 shmem_unacct_size(flags, size);
2928 return ERR_PTR(error); 2952 return res;
2929} 2953}
2930EXPORT_SYMBOL_GPL(shmem_file_setup); 2954EXPORT_SYMBOL_GPL(shmem_file_setup);
2931 2955
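
The shmem_file_setup() hunks above replace the separate `error` integer with a single `res` pointer threaded through every exit path, so failures return ERR_PTR-encoded errno values (and ERR_CAST() for a broken shm_mnt) while success returns the file itself. Below is a compilable userspace sketch of that error-pointer style, assuming made-up err_ptr()/is_err()/ptr_err() stand-ins for the kernel helpers and a hypothetical demo_setup(); it is an illustration of the pattern, not the kernel implementation.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Userspace stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() helpers:
 * a small negative errno is folded into the top page of the address range,
 * so one pointer return can carry either a valid object or an error code.
 */
#define MAX_ERRNO 4095UL

static void *err_ptr(long error)      { return (void *)error; }
static long ptr_err(const void *ptr)  { return (long)ptr; }
static int is_err(const void *ptr)
{
	return (unsigned long)ptr >= -MAX_ERRNO;
}

/* made-up setup function written in the same single-`res` style as the hunk */
static char *demo_setup(const char *name)
{
	char *buf;
	char *res;

	res = err_ptr(-EINVAL);
	if (!name || !*name)
		goto out;		/* every failure path just returns res */

	res = err_ptr(-ENOMEM);
	buf = strdup(name);
	if (!buf)
		goto out;

	res = buf;			/* success: hand back the real pointer */
out:
	return res;
}

int main(void)
{
	char *ok = demo_setup("tmpfile");
	char *bad = demo_setup("");

	if (!is_err(ok)) {
		printf("ok: \"%s\"\n", ok);
		free(ok);
	}
	if (is_err(bad))
		printf("bad: error %ld\n", ptr_err(bad));
	return 0;
}
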
diff --git a/mm/slab.c b/mm/slab.c
index e7667a3584bc..856e4a192d25 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -812,7 +812,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
812 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 812 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
813 function, cachep->name, msg); 813 function, cachep->name, msg);
814 dump_stack(); 814 dump_stack();
815 add_taint(TAINT_BAD_PAGE); 815 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
816} 816}
817#endif 817#endif
818 818
diff --git a/mm/slob.c b/mm/slob.c
index a99fdf7a0907..eeed4a05a2ef 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size)
360 clear_slob_page_free(sp); 360 clear_slob_page_free(sp);
361 spin_unlock_irqrestore(&slob_lock, flags); 361 spin_unlock_irqrestore(&slob_lock, flags);
362 __ClearPageSlab(sp); 362 __ClearPageSlab(sp);
363 reset_page_mapcount(sp); 363 page_mapcount_reset(sp);
364 slob_free_pages(b, 0); 364 slob_free_pages(b, 0);
365 return; 365 return;
366 } 366 }
diff --git a/mm/slub.c b/mm/slub.c
index ba2ca53f6c3a..4aec53705e4f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -562,7 +562,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
562 printk(KERN_ERR "----------------------------------------" 562 printk(KERN_ERR "----------------------------------------"
563 "-------------------------------------\n\n"); 563 "-------------------------------------\n\n");
564 564
565 add_taint(TAINT_BAD_PAGE); 565 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
566} 566}
567 567
568static void slab_fix(struct kmem_cache *s, char *fmt, ...) 568static void slab_fix(struct kmem_cache *s, char *fmt, ...)
@@ -1408,7 +1408,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1408 __ClearPageSlab(page); 1408 __ClearPageSlab(page);
1409 1409
1410 memcg_release_pages(s, order); 1410 memcg_release_pages(s, order);
1411 reset_page_mapcount(page); 1411 page_mapcount_reset(page);
1412 if (current->reclaim_state) 1412 if (current->reclaim_state)
1413 current->reclaim_state->reclaimed_slab += pages; 1413 current->reclaim_state->reclaimed_slab += pages;
1414 __free_memcg_kmem_pages(page, order); 1414 __free_memcg_kmem_pages(page, order);
diff --git a/mm/sparse.c b/mm/sparse.c
index 6b5fb762e2ca..7ca6dc847947 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
615} 615}
616static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 616static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
617{ 617{
618 return; /* XXX: Not implemented yet */ 618 vmemmap_free(memmap, nr_pages);
619} 619}
620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
621{ 621{
622 vmemmap_free(memmap, nr_pages);
622} 623}
623#else 624#else
624static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 625static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
@@ -697,7 +698,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
697 /* 698 /*
698 * Check to see if allocation came from hot-plug-add 699 * Check to see if allocation came from hot-plug-add
699 */ 700 */
700 if (PageSlab(usemap_page)) { 701 if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
701 kfree(usemap); 702 kfree(usemap);
702 if (memmap) 703 if (memmap)
703 __kfree_section_memmap(memmap, PAGES_PER_SECTION); 704 __kfree_section_memmap(memmap, PAGES_PER_SECTION);
@@ -782,7 +783,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
782 783
783 for (i = 0; i < PAGES_PER_SECTION; i++) { 784 for (i = 0; i < PAGES_PER_SECTION; i++) {
784 if (PageHWPoison(&memmap[i])) { 785 if (PageHWPoison(&memmap[i])) {
785 atomic_long_sub(1, &mce_bad_pages); 786 atomic_long_sub(1, &num_poisoned_pages);
786 ClearPageHWPoison(&memmap[i]); 787 ClearPageHWPoison(&memmap[i]);
787 } 788 }
788 } 789 }
@@ -796,8 +797,10 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
796void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 797void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
797{ 798{
798 struct page *memmap = NULL; 799 struct page *memmap = NULL;
799 unsigned long *usemap = NULL; 800 unsigned long *usemap = NULL, flags;
801 struct pglist_data *pgdat = zone->zone_pgdat;
800 802
803 pgdat_resize_lock(pgdat, &flags);
801 if (ms->section_mem_map) { 804 if (ms->section_mem_map) {
802 usemap = ms->pageblock_flags; 805 usemap = ms->pageblock_flags;
803 memmap = sparse_decode_mem_map(ms->section_mem_map, 806 memmap = sparse_decode_mem_map(ms->section_mem_map,
@@ -805,6 +808,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
805 ms->section_mem_map = 0; 808 ms->section_mem_map = 0;
806 ms->pageblock_flags = NULL; 809 ms->pageblock_flags = NULL;
807 } 810 }
811 pgdat_resize_unlock(pgdat, &flags);
808 812
809 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); 813 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
810 free_section_usemap(memmap, usemap); 814 free_section_usemap(memmap, usemap);
diff --git a/mm/swap.c b/mm/swap.c
index 6310dc2008ff..8a529a01e8fc 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
855void __init swap_setup(void) 855void __init swap_setup(void)
856{ 856{
857 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); 857 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
858
859#ifdef CONFIG_SWAP 858#ifdef CONFIG_SWAP
860 bdi_init(swapper_space.backing_dev_info); 859 int i;
860
861 bdi_init(swapper_spaces[0].backing_dev_info);
862 for (i = 0; i < MAX_SWAPFILES; i++) {
863 spin_lock_init(&swapper_spaces[i].tree_lock);
864 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
865 }
861#endif 866#endif
862 867
863 /* Use a smaller cluster for small-memory machines */ 868 /* Use a smaller cluster for small-memory machines */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0cb36fb1f61c..7efcf1525921 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -36,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = {
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
37}; 37};
38 38
39struct address_space swapper_space = { 39struct address_space swapper_spaces[MAX_SWAPFILES] = {
40 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 40 [0 ... MAX_SWAPFILES - 1] = {
41 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .a_ops = &swap_aops, 42 .a_ops = &swap_aops,
43 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 43 .backing_dev_info = &swap_backing_dev_info,
44 .backing_dev_info = &swap_backing_dev_info, 44 }
45}; 45};
46 46
47#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) 47#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -53,13 +53,24 @@ static struct {
53 unsigned long find_total; 53 unsigned long find_total;
54} swap_cache_info; 54} swap_cache_info;
55 55
56unsigned long total_swapcache_pages(void)
57{
58 int i;
59 unsigned long ret = 0;
60
61 for (i = 0; i < MAX_SWAPFILES; i++)
62 ret += swapper_spaces[i].nrpages;
63 return ret;
64}
65
56void show_swap_cache_info(void) 66void show_swap_cache_info(void)
57{ 67{
58 printk("%lu pages in swap cache\n", total_swapcache_pages); 68 printk("%lu pages in swap cache\n", total_swapcache_pages());
59 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", 69 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 70 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 71 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 72 printk("Free swap = %ldkB\n",
73 get_nr_swap_pages() << (PAGE_SHIFT - 10));
63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 74 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
64} 75}
65 76
@@ -70,6 +81,7 @@ void show_swap_cache_info(void)
70static int __add_to_swap_cache(struct page *page, swp_entry_t entry) 81static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
71{ 82{
72 int error; 83 int error;
84 struct address_space *address_space;
73 85
74 VM_BUG_ON(!PageLocked(page)); 86 VM_BUG_ON(!PageLocked(page));
75 VM_BUG_ON(PageSwapCache(page)); 87 VM_BUG_ON(PageSwapCache(page));
@@ -79,14 +91,16 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
79 SetPageSwapCache(page); 91 SetPageSwapCache(page);
80 set_page_private(page, entry.val); 92 set_page_private(page, entry.val);
81 93
82 spin_lock_irq(&swapper_space.tree_lock); 94 address_space = swap_address_space(entry);
83 error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); 95 spin_lock_irq(&address_space->tree_lock);
96 error = radix_tree_insert(&address_space->page_tree,
97 entry.val, page);
84 if (likely(!error)) { 98 if (likely(!error)) {
85 total_swapcache_pages++; 99 address_space->nrpages++;
86 __inc_zone_page_state(page, NR_FILE_PAGES); 100 __inc_zone_page_state(page, NR_FILE_PAGES);
87 INC_CACHE_INFO(add_total); 101 INC_CACHE_INFO(add_total);
88 } 102 }
89 spin_unlock_irq(&swapper_space.tree_lock); 103 spin_unlock_irq(&address_space->tree_lock);
90 104
91 if (unlikely(error)) { 105 if (unlikely(error)) {
92 /* 106 /*
@@ -122,14 +136,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
122 */ 136 */
123void __delete_from_swap_cache(struct page *page) 137void __delete_from_swap_cache(struct page *page)
124{ 138{
139 swp_entry_t entry;
140 struct address_space *address_space;
141
125 VM_BUG_ON(!PageLocked(page)); 142 VM_BUG_ON(!PageLocked(page));
126 VM_BUG_ON(!PageSwapCache(page)); 143 VM_BUG_ON(!PageSwapCache(page));
127 VM_BUG_ON(PageWriteback(page)); 144 VM_BUG_ON(PageWriteback(page));
128 145
129 radix_tree_delete(&swapper_space.page_tree, page_private(page)); 146 entry.val = page_private(page);
147 address_space = swap_address_space(entry);
148 radix_tree_delete(&address_space->page_tree, page_private(page));
130 set_page_private(page, 0); 149 set_page_private(page, 0);
131 ClearPageSwapCache(page); 150 ClearPageSwapCache(page);
132 total_swapcache_pages--; 151 address_space->nrpages--;
133 __dec_zone_page_state(page, NR_FILE_PAGES); 152 __dec_zone_page_state(page, NR_FILE_PAGES);
134 INC_CACHE_INFO(del_total); 153 INC_CACHE_INFO(del_total);
135} 154}
@@ -195,12 +214,14 @@ int add_to_swap(struct page *page)
195void delete_from_swap_cache(struct page *page) 214void delete_from_swap_cache(struct page *page)
196{ 215{
197 swp_entry_t entry; 216 swp_entry_t entry;
217 struct address_space *address_space;
198 218
199 entry.val = page_private(page); 219 entry.val = page_private(page);
200 220
201 spin_lock_irq(&swapper_space.tree_lock); 221 address_space = swap_address_space(entry);
222 spin_lock_irq(&address_space->tree_lock);
202 __delete_from_swap_cache(page); 223 __delete_from_swap_cache(page);
203 spin_unlock_irq(&swapper_space.tree_lock); 224 spin_unlock_irq(&address_space->tree_lock);
204 225
205 swapcache_free(entry, page); 226 swapcache_free(entry, page);
206 page_cache_release(page); 227 page_cache_release(page);
@@ -263,7 +284,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
263{ 284{
264 struct page *page; 285 struct page *page;
265 286
266 page = find_get_page(&swapper_space, entry.val); 287 page = find_get_page(swap_address_space(entry), entry.val);
267 288
268 if (page) 289 if (page)
269 INC_CACHE_INFO(find_success); 290 INC_CACHE_INFO(find_success);
@@ -290,7 +311,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
290 * called after lookup_swap_cache() failed, re-calling 311 * called after lookup_swap_cache() failed, re-calling
291 * that would confuse statistics. 312 * that would confuse statistics.
292 */ 313 */
293 found_page = find_get_page(&swapper_space, entry.val); 314 found_page = find_get_page(swap_address_space(entry),
315 entry.val);
294 if (found_page) 316 if (found_page)
295 break; 317 break;
296 318
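
The mm/swap_state.c hunks above split the single swapper_space into a swapper_spaces[MAX_SWAPFILES] array: each swap type gets its own radix tree, tree_lock and nrpages count, swap_address_space(entry) selects the address space for a given entry, and total_swapcache_pages() becomes a sum over the array. A minimal userspace sketch of that shard-by-type shape follows, assuming a made-up entry layout; demo_space, space_for(), demo_add() and demo_total() are illustrative names, not kernel APIs. Build with `cc -pthread`.

#include <pthread.h>
#include <stdio.h>

#define DEMO_TYPES	4	/* stands in for MAX_SWAPFILES */
#define TYPE_SHIFT	8	/* hypothetical: swap type kept above the offset bits */

struct demo_space {			/* one slot of a swapper_spaces[]-like array */
	pthread_mutex_t lock;		/* per-type lock, like address_space->tree_lock */
	unsigned long nrpages;		/* per-type swap cache page count */
};

static struct demo_space spaces[DEMO_TYPES] = {
	[0 ... DEMO_TYPES - 1] = { PTHREAD_MUTEX_INITIALIZER, 0 }
};

/* analogue of swap_address_space(entry): pick the space from the entry's type */
static struct demo_space *space_for(unsigned long entry)
{
	return &spaces[(entry >> TYPE_SHIFT) % DEMO_TYPES];
}

/* analogue of __add_to_swap_cache(): touch only the owning space, under its lock */
static void demo_add(unsigned long entry)
{
	struct demo_space *s = space_for(entry);

	pthread_mutex_lock(&s->lock);
	s->nrpages++;
	pthread_mutex_unlock(&s->lock);
}

/* analogue of total_swapcache_pages(): the global count is now a sum over types */
static unsigned long demo_total(void)
{
	unsigned long total = 0;
	int i;

	for (i = 0; i < DEMO_TYPES; i++)
		total += spaces[i].nrpages;
	return total;
}

int main(void)
{
	demo_add((1UL << TYPE_SHIFT) | 5);	/* type 1, offset 5 */
	demo_add((1UL << TYPE_SHIFT) | 9);	/* type 1, offset 9 */
	demo_add((3UL << TYPE_SHIFT) | 2);	/* type 3, offset 2 */
	printf("swap cache pages: %lu\n", demo_total());
	return 0;
}
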
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e97a0e5aea91..a1f7772a01fc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,9 +47,11 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**);
47 47
48DEFINE_SPINLOCK(swap_lock); 48DEFINE_SPINLOCK(swap_lock);
49static unsigned int nr_swapfiles; 49static unsigned int nr_swapfiles;
50long nr_swap_pages; 50atomic_long_t nr_swap_pages;
51/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
51long total_swap_pages; 52long total_swap_pages;
52static int least_priority; 53static int least_priority;
54static atomic_t highest_priority_index = ATOMIC_INIT(-1);
53 55
54static const char Bad_file[] = "Bad swap file entry "; 56static const char Bad_file[] = "Bad swap file entry ";
55static const char Unused_file[] = "Unused swap file entry "; 57static const char Unused_file[] = "Unused swap file entry ";
@@ -79,7 +81,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
79 struct page *page; 81 struct page *page;
80 int ret = 0; 82 int ret = 0;
81 83
82 page = find_get_page(&swapper_space, entry.val); 84 page = find_get_page(swap_address_space(entry), entry.val);
83 if (!page) 85 if (!page)
84 return 0; 86 return 0;
85 /* 87 /*
@@ -223,7 +225,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
223 si->lowest_alloc = si->max; 225 si->lowest_alloc = si->max;
224 si->highest_alloc = 0; 226 si->highest_alloc = 0;
225 } 227 }
226 spin_unlock(&swap_lock); 228 spin_unlock(&si->lock);
227 229
228 /* 230 /*
229 * If seek is expensive, start searching for new cluster from 231 * If seek is expensive, start searching for new cluster from
@@ -242,7 +244,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
242 if (si->swap_map[offset]) 244 if (si->swap_map[offset])
243 last_in_cluster = offset + SWAPFILE_CLUSTER; 245 last_in_cluster = offset + SWAPFILE_CLUSTER;
244 else if (offset == last_in_cluster) { 246 else if (offset == last_in_cluster) {
245 spin_lock(&swap_lock); 247 spin_lock(&si->lock);
246 offset -= SWAPFILE_CLUSTER - 1; 248 offset -= SWAPFILE_CLUSTER - 1;
247 si->cluster_next = offset; 249 si->cluster_next = offset;
248 si->cluster_nr = SWAPFILE_CLUSTER - 1; 250 si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -263,7 +265,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
263 if (si->swap_map[offset]) 265 if (si->swap_map[offset])
264 last_in_cluster = offset + SWAPFILE_CLUSTER; 266 last_in_cluster = offset + SWAPFILE_CLUSTER;
265 else if (offset == last_in_cluster) { 267 else if (offset == last_in_cluster) {
266 spin_lock(&swap_lock); 268 spin_lock(&si->lock);
267 offset -= SWAPFILE_CLUSTER - 1; 269 offset -= SWAPFILE_CLUSTER - 1;
268 si->cluster_next = offset; 270 si->cluster_next = offset;
269 si->cluster_nr = SWAPFILE_CLUSTER - 1; 271 si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -277,7 +279,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
277 } 279 }
278 280
279 offset = scan_base; 281 offset = scan_base;
280 spin_lock(&swap_lock); 282 spin_lock(&si->lock);
281 si->cluster_nr = SWAPFILE_CLUSTER - 1; 283 si->cluster_nr = SWAPFILE_CLUSTER - 1;
282 si->lowest_alloc = 0; 284 si->lowest_alloc = 0;
283 } 285 }
@@ -293,9 +295,9 @@ checks:
293 /* reuse swap entry of cache-only swap if not busy. */ 295 /* reuse swap entry of cache-only swap if not busy. */
294 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 296 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
295 int swap_was_freed; 297 int swap_was_freed;
296 spin_unlock(&swap_lock); 298 spin_unlock(&si->lock);
297 swap_was_freed = __try_to_reclaim_swap(si, offset); 299 swap_was_freed = __try_to_reclaim_swap(si, offset);
298 spin_lock(&swap_lock); 300 spin_lock(&si->lock);
299 /* entry was freed successfully, try to use this again */ 301 /* entry was freed successfully, try to use this again */
300 if (swap_was_freed) 302 if (swap_was_freed)
301 goto checks; 303 goto checks;
@@ -335,13 +337,13 @@ checks:
335 si->lowest_alloc <= last_in_cluster) 337 si->lowest_alloc <= last_in_cluster)
336 last_in_cluster = si->lowest_alloc - 1; 338 last_in_cluster = si->lowest_alloc - 1;
337 si->flags |= SWP_DISCARDING; 339 si->flags |= SWP_DISCARDING;
338 spin_unlock(&swap_lock); 340 spin_unlock(&si->lock);
339 341
340 if (offset < last_in_cluster) 342 if (offset < last_in_cluster)
341 discard_swap_cluster(si, offset, 343 discard_swap_cluster(si, offset,
342 last_in_cluster - offset + 1); 344 last_in_cluster - offset + 1);
343 345
344 spin_lock(&swap_lock); 346 spin_lock(&si->lock);
345 si->lowest_alloc = 0; 347 si->lowest_alloc = 0;
346 si->flags &= ~SWP_DISCARDING; 348 si->flags &= ~SWP_DISCARDING;
347 349
@@ -355,10 +357,10 @@ checks:
355 * could defer that delay until swap_writepage, 357 * could defer that delay until swap_writepage,
356 * but it's easier to keep this self-contained. 358 * but it's easier to keep this self-contained.
357 */ 359 */
358 spin_unlock(&swap_lock); 360 spin_unlock(&si->lock);
359 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), 361 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
360 wait_for_discard, TASK_UNINTERRUPTIBLE); 362 wait_for_discard, TASK_UNINTERRUPTIBLE);
361 spin_lock(&swap_lock); 363 spin_lock(&si->lock);
362 } else { 364 } else {
363 /* 365 /*
364 * Note pages allocated by racing tasks while 366 * Note pages allocated by racing tasks while
@@ -374,14 +376,14 @@ checks:
374 return offset; 376 return offset;
375 377
376scan: 378scan:
377 spin_unlock(&swap_lock); 379 spin_unlock(&si->lock);
378 while (++offset <= si->highest_bit) { 380 while (++offset <= si->highest_bit) {
379 if (!si->swap_map[offset]) { 381 if (!si->swap_map[offset]) {
380 spin_lock(&swap_lock); 382 spin_lock(&si->lock);
381 goto checks; 383 goto checks;
382 } 384 }
383 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 385 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
384 spin_lock(&swap_lock); 386 spin_lock(&si->lock);
385 goto checks; 387 goto checks;
386 } 388 }
387 if (unlikely(--latency_ration < 0)) { 389 if (unlikely(--latency_ration < 0)) {
@@ -392,11 +394,11 @@ scan:
392 offset = si->lowest_bit; 394 offset = si->lowest_bit;
393 while (++offset < scan_base) { 395 while (++offset < scan_base) {
394 if (!si->swap_map[offset]) { 396 if (!si->swap_map[offset]) {
395 spin_lock(&swap_lock); 397 spin_lock(&si->lock);
396 goto checks; 398 goto checks;
397 } 399 }
398 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 400 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
399 spin_lock(&swap_lock); 401 spin_lock(&si->lock);
400 goto checks; 402 goto checks;
401 } 403 }
402 if (unlikely(--latency_ration < 0)) { 404 if (unlikely(--latency_ration < 0)) {
@@ -404,7 +406,7 @@ scan:
404 latency_ration = LATENCY_LIMIT; 406 latency_ration = LATENCY_LIMIT;
405 } 407 }
406 } 408 }
407 spin_lock(&swap_lock); 409 spin_lock(&si->lock);
408 410
409no_page: 411no_page:
410 si->flags -= SWP_SCANNING; 412 si->flags -= SWP_SCANNING;
@@ -417,13 +419,34 @@ swp_entry_t get_swap_page(void)
417 pgoff_t offset; 419 pgoff_t offset;
418 int type, next; 420 int type, next;
419 int wrapped = 0; 421 int wrapped = 0;
422 int hp_index;
420 423
421 spin_lock(&swap_lock); 424 spin_lock(&swap_lock);
422 if (nr_swap_pages <= 0) 425 if (atomic_long_read(&nr_swap_pages) <= 0)
423 goto noswap; 426 goto noswap;
424 nr_swap_pages--; 427 atomic_long_dec(&nr_swap_pages);
425 428
426 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 429 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
430 hp_index = atomic_xchg(&highest_priority_index, -1);
431 /*
432 * highest_priority_index records current highest priority swap
433 * type which just frees swap entries. If its priority is
434 * higher than that of swap_list.next swap type, we use it. It
435 * isn't protected by swap_lock, so it can be an invalid value
436 * if the corresponding swap type is swapoff. We double check
437 * the flags here. It's even possible the swap type is swapoff
438 * and swapon again and its priority is changed. In such rare
 439 * case, low priority swap type might be used, but eventually
440 * high priority swap will be used after several rounds of
441 * swap.
442 */
443 if (hp_index != -1 && hp_index != type &&
444 swap_info[type]->prio < swap_info[hp_index]->prio &&
445 (swap_info[hp_index]->flags & SWP_WRITEOK)) {
446 type = hp_index;
447 swap_list.next = type;
448 }
449
427 si = swap_info[type]; 450 si = swap_info[type];
428 next = si->next; 451 next = si->next;
429 if (next < 0 || 452 if (next < 0 ||
@@ -432,22 +455,29 @@ swp_entry_t get_swap_page(void)
432 wrapped++; 455 wrapped++;
433 } 456 }
434 457
435 if (!si->highest_bit) 458 spin_lock(&si->lock);
459 if (!si->highest_bit) {
460 spin_unlock(&si->lock);
436 continue; 461 continue;
437 if (!(si->flags & SWP_WRITEOK)) 462 }
463 if (!(si->flags & SWP_WRITEOK)) {
464 spin_unlock(&si->lock);
438 continue; 465 continue;
466 }
439 467
440 swap_list.next = next; 468 swap_list.next = next;
469
470 spin_unlock(&swap_lock);
441 /* This is called for allocating swap entry for cache */ 471 /* This is called for allocating swap entry for cache */
442 offset = scan_swap_map(si, SWAP_HAS_CACHE); 472 offset = scan_swap_map(si, SWAP_HAS_CACHE);
443 if (offset) { 473 spin_unlock(&si->lock);
444 spin_unlock(&swap_lock); 474 if (offset)
445 return swp_entry(type, offset); 475 return swp_entry(type, offset);
446 } 476 spin_lock(&swap_lock);
447 next = swap_list.next; 477 next = swap_list.next;
448 } 478 }
449 479
450 nr_swap_pages++; 480 atomic_long_inc(&nr_swap_pages);
451noswap: 481noswap:
452 spin_unlock(&swap_lock); 482 spin_unlock(&swap_lock);
453 return (swp_entry_t) {0}; 483 return (swp_entry_t) {0};
@@ -459,19 +489,19 @@ swp_entry_t get_swap_page_of_type(int type)
459 struct swap_info_struct *si; 489 struct swap_info_struct *si;
460 pgoff_t offset; 490 pgoff_t offset;
461 491
462 spin_lock(&swap_lock);
463 si = swap_info[type]; 492 si = swap_info[type];
493 spin_lock(&si->lock);
464 if (si && (si->flags & SWP_WRITEOK)) { 494 if (si && (si->flags & SWP_WRITEOK)) {
465 nr_swap_pages--; 495 atomic_long_dec(&nr_swap_pages);
466 /* This is called for allocating swap entry, not cache */ 496 /* This is called for allocating swap entry, not cache */
467 offset = scan_swap_map(si, 1); 497 offset = scan_swap_map(si, 1);
468 if (offset) { 498 if (offset) {
469 spin_unlock(&swap_lock); 499 spin_unlock(&si->lock);
470 return swp_entry(type, offset); 500 return swp_entry(type, offset);
471 } 501 }
472 nr_swap_pages++; 502 atomic_long_inc(&nr_swap_pages);
473 } 503 }
474 spin_unlock(&swap_lock); 504 spin_unlock(&si->lock);
475 return (swp_entry_t) {0}; 505 return (swp_entry_t) {0};
476} 506}
477 507
@@ -493,7 +523,7 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
493 goto bad_offset; 523 goto bad_offset;
494 if (!p->swap_map[offset]) 524 if (!p->swap_map[offset])
495 goto bad_free; 525 goto bad_free;
496 spin_lock(&swap_lock); 526 spin_lock(&p->lock);
497 return p; 527 return p;
498 528
499bad_free: 529bad_free:
@@ -511,6 +541,27 @@ out:
511 return NULL; 541 return NULL;
512} 542}
513 543
544/*
545 * This swap type frees swap entry, check if it is the highest priority swap
546 * type which just frees swap entry. get_swap_page() uses
547 * highest_priority_index to search highest priority swap type. The
548 * swap_info_struct.lock can't protect us if there are multiple swap types
549 * active, so we use atomic_cmpxchg.
550 */
551static void set_highest_priority_index(int type)
552{
553 int old_hp_index, new_hp_index;
554
555 do {
556 old_hp_index = atomic_read(&highest_priority_index);
557 if (old_hp_index != -1 &&
558 swap_info[old_hp_index]->prio >= swap_info[type]->prio)
559 break;
560 new_hp_index = type;
561 } while (atomic_cmpxchg(&highest_priority_index,
562 old_hp_index, new_hp_index) != old_hp_index);
563}
564
514static unsigned char swap_entry_free(struct swap_info_struct *p, 565static unsigned char swap_entry_free(struct swap_info_struct *p,
515 swp_entry_t entry, unsigned char usage) 566 swp_entry_t entry, unsigned char usage)
516{ 567{
@@ -553,10 +604,8 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
553 p->lowest_bit = offset; 604 p->lowest_bit = offset;
554 if (offset > p->highest_bit) 605 if (offset > p->highest_bit)
555 p->highest_bit = offset; 606 p->highest_bit = offset;
556 if (swap_list.next >= 0 && 607 set_highest_priority_index(p->type);
557 p->prio > swap_info[swap_list.next]->prio) 608 atomic_long_inc(&nr_swap_pages);
558 swap_list.next = p->type;
559 nr_swap_pages++;
560 p->inuse_pages--; 609 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset); 610 frontswap_invalidate_page(p->type, offset);
562 if (p->flags & SWP_BLKDEV) { 611 if (p->flags & SWP_BLKDEV) {
@@ -581,7 +630,7 @@ void swap_free(swp_entry_t entry)
581 p = swap_info_get(entry); 630 p = swap_info_get(entry);
582 if (p) { 631 if (p) {
583 swap_entry_free(p, entry, 1); 632 swap_entry_free(p, entry, 1);
584 spin_unlock(&swap_lock); 633 spin_unlock(&p->lock);
585 } 634 }
586} 635}
587 636
@@ -598,7 +647,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
598 count = swap_entry_free(p, entry, SWAP_HAS_CACHE); 647 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
599 if (page) 648 if (page)
600 mem_cgroup_uncharge_swapcache(page, entry, count != 0); 649 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
601 spin_unlock(&swap_lock); 650 spin_unlock(&p->lock);
602 } 651 }
603} 652}
604 653
@@ -617,7 +666,7 @@ int page_swapcount(struct page *page)
617 p = swap_info_get(entry); 666 p = swap_info_get(entry);
618 if (p) { 667 if (p) {
619 count = swap_count(p->swap_map[swp_offset(entry)]); 668 count = swap_count(p->swap_map[swp_offset(entry)]);
620 spin_unlock(&swap_lock); 669 spin_unlock(&p->lock);
621 } 670 }
622 return count; 671 return count;
623} 672}
@@ -699,13 +748,14 @@ int free_swap_and_cache(swp_entry_t entry)
699 p = swap_info_get(entry); 748 p = swap_info_get(entry);
700 if (p) { 749 if (p) {
701 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { 750 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
702 page = find_get_page(&swapper_space, entry.val); 751 page = find_get_page(swap_address_space(entry),
752 entry.val);
703 if (page && !trylock_page(page)) { 753 if (page && !trylock_page(page)) {
704 page_cache_release(page); 754 page_cache_release(page);
705 page = NULL; 755 page = NULL;
706 } 756 }
707 } 757 }
708 spin_unlock(&swap_lock); 758 spin_unlock(&p->lock);
709 } 759 }
710 if (page) { 760 if (page) {
711 /* 761 /*
@@ -803,11 +853,13 @@ unsigned int count_swap_pages(int type, int free)
803 if ((unsigned int)type < nr_swapfiles) { 853 if ((unsigned int)type < nr_swapfiles) {
804 struct swap_info_struct *sis = swap_info[type]; 854 struct swap_info_struct *sis = swap_info[type];
805 855
856 spin_lock(&sis->lock);
806 if (sis->flags & SWP_WRITEOK) { 857 if (sis->flags & SWP_WRITEOK) {
807 n = sis->pages; 858 n = sis->pages;
808 if (free) 859 if (free)
809 n -= sis->inuse_pages; 860 n -= sis->inuse_pages;
810 } 861 }
862 spin_unlock(&sis->lock);
811 } 863 }
812 spin_unlock(&swap_lock); 864 spin_unlock(&swap_lock);
813 return n; 865 return n;
@@ -822,11 +874,17 @@ unsigned int count_swap_pages(int type, int free)
822static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 874static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
823 unsigned long addr, swp_entry_t entry, struct page *page) 875 unsigned long addr, swp_entry_t entry, struct page *page)
824{ 876{
877 struct page *swapcache;
825 struct mem_cgroup *memcg; 878 struct mem_cgroup *memcg;
826 spinlock_t *ptl; 879 spinlock_t *ptl;
827 pte_t *pte; 880 pte_t *pte;
828 int ret = 1; 881 int ret = 1;
829 882
883 swapcache = page;
884 page = ksm_might_need_to_copy(page, vma, addr);
885 if (unlikely(!page))
886 return -ENOMEM;
887
830 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, 888 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
831 GFP_KERNEL, &memcg)) { 889 GFP_KERNEL, &memcg)) {
832 ret = -ENOMEM; 890 ret = -ENOMEM;
@@ -845,7 +903,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
845 get_page(page); 903 get_page(page);
846 set_pte_at(vma->vm_mm, addr, pte, 904 set_pte_at(vma->vm_mm, addr, pte,
847 pte_mkold(mk_pte(page, vma->vm_page_prot))); 905 pte_mkold(mk_pte(page, vma->vm_page_prot)));
848 page_add_anon_rmap(page, vma, addr); 906 if (page == swapcache)
907 page_add_anon_rmap(page, vma, addr);
908 else /* ksm created a completely new copy */
909 page_add_new_anon_rmap(page, vma, addr);
849 mem_cgroup_commit_charge_swapin(page, memcg); 910 mem_cgroup_commit_charge_swapin(page, memcg);
850 swap_free(entry); 911 swap_free(entry);
851 /* 912 /*
@@ -856,6 +917,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
856out: 917out:
857 pte_unmap_unlock(pte, ptl); 918 pte_unmap_unlock(pte, ptl);
858out_nolock: 919out_nolock:
920 if (page != swapcache) {
921 unlock_page(page);
922 put_page(page);
923 }
859 return ret; 924 return ret;
860} 925}
861 926
@@ -1456,7 +1521,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1456 p->swap_map = swap_map; 1521 p->swap_map = swap_map;
1457 frontswap_map_set(p, frontswap_map); 1522 frontswap_map_set(p, frontswap_map);
1458 p->flags |= SWP_WRITEOK; 1523 p->flags |= SWP_WRITEOK;
1459 nr_swap_pages += p->pages; 1524 atomic_long_add(p->pages, &nr_swap_pages);
1460 total_swap_pages += p->pages; 1525 total_swap_pages += p->pages;
1461 1526
1462 /* insert swap space into swap_list: */ 1527 /* insert swap space into swap_list: */
@@ -1478,15 +1543,19 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1478 unsigned long *frontswap_map) 1543 unsigned long *frontswap_map)
1479{ 1544{
1480 spin_lock(&swap_lock); 1545 spin_lock(&swap_lock);
1546 spin_lock(&p->lock);
1481 _enable_swap_info(p, prio, swap_map, frontswap_map); 1547 _enable_swap_info(p, prio, swap_map, frontswap_map);
1482 frontswap_init(p->type); 1548 frontswap_init(p->type);
1549 spin_unlock(&p->lock);
1483 spin_unlock(&swap_lock); 1550 spin_unlock(&swap_lock);
1484} 1551}
1485 1552
1486static void reinsert_swap_info(struct swap_info_struct *p) 1553static void reinsert_swap_info(struct swap_info_struct *p)
1487{ 1554{
1488 spin_lock(&swap_lock); 1555 spin_lock(&swap_lock);
1556 spin_lock(&p->lock);
1489 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); 1557 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1558 spin_unlock(&p->lock);
1490 spin_unlock(&swap_lock); 1559 spin_unlock(&swap_lock);
1491} 1560}
1492 1561
@@ -1546,14 +1615,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1546 /* just pick something that's safe... */ 1615 /* just pick something that's safe... */
1547 swap_list.next = swap_list.head; 1616 swap_list.next = swap_list.head;
1548 } 1617 }
1618 spin_lock(&p->lock);
1549 if (p->prio < 0) { 1619 if (p->prio < 0) {
1550 for (i = p->next; i >= 0; i = swap_info[i]->next) 1620 for (i = p->next; i >= 0; i = swap_info[i]->next)
1551 swap_info[i]->prio = p->prio--; 1621 swap_info[i]->prio = p->prio--;
1552 least_priority++; 1622 least_priority++;
1553 } 1623 }
1554 nr_swap_pages -= p->pages; 1624 atomic_long_sub(p->pages, &nr_swap_pages);
1555 total_swap_pages -= p->pages; 1625 total_swap_pages -= p->pages;
1556 p->flags &= ~SWP_WRITEOK; 1626 p->flags &= ~SWP_WRITEOK;
1627 spin_unlock(&p->lock);
1557 spin_unlock(&swap_lock); 1628 spin_unlock(&swap_lock);
1558 1629
1559 set_current_oom_origin(); 1630 set_current_oom_origin();
@@ -1572,14 +1643,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1572 1643
1573 mutex_lock(&swapon_mutex); 1644 mutex_lock(&swapon_mutex);
1574 spin_lock(&swap_lock); 1645 spin_lock(&swap_lock);
1646 spin_lock(&p->lock);
1575 drain_mmlist(); 1647 drain_mmlist();
1576 1648
1577 /* wait for anyone still in scan_swap_map */ 1649 /* wait for anyone still in scan_swap_map */
1578 p->highest_bit = 0; /* cuts scans short */ 1650 p->highest_bit = 0; /* cuts scans short */
1579 while (p->flags >= SWP_SCANNING) { 1651 while (p->flags >= SWP_SCANNING) {
1652 spin_unlock(&p->lock);
1580 spin_unlock(&swap_lock); 1653 spin_unlock(&swap_lock);
1581 schedule_timeout_uninterruptible(1); 1654 schedule_timeout_uninterruptible(1);
1582 spin_lock(&swap_lock); 1655 spin_lock(&swap_lock);
1656 spin_lock(&p->lock);
1583 } 1657 }
1584 1658
1585 swap_file = p->swap_file; 1659 swap_file = p->swap_file;
@@ -1589,6 +1663,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1589 p->swap_map = NULL; 1663 p->swap_map = NULL;
1590 p->flags = 0; 1664 p->flags = 0;
1591 frontswap_invalidate_area(type); 1665 frontswap_invalidate_area(type);
1666 spin_unlock(&p->lock);
1592 spin_unlock(&swap_lock); 1667 spin_unlock(&swap_lock);
1593 mutex_unlock(&swapon_mutex); 1668 mutex_unlock(&swapon_mutex);
1594 vfree(swap_map); 1669 vfree(swap_map);
@@ -1699,7 +1774,7 @@ static int swap_show(struct seq_file *swap, void *v)
1699 len = seq_path(swap, &file->f_path, " \t\n\\"); 1774 len = seq_path(swap, &file->f_path, " \t\n\\");
1700 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1775 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1701 len < 40 ? 40 - len : 1, " ", 1776 len < 40 ? 40 - len : 1, " ",
1702 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1777 S_ISBLK(file_inode(file)->i_mode) ?
1703 "partition" : "file\t", 1778 "partition" : "file\t",
1704 si->pages << (PAGE_SHIFT - 10), 1779 si->pages << (PAGE_SHIFT - 10),
1705 si->inuse_pages << (PAGE_SHIFT - 10), 1780 si->inuse_pages << (PAGE_SHIFT - 10),
@@ -1794,6 +1869,7 @@ static struct swap_info_struct *alloc_swap_info(void)
1794 p->flags = SWP_USED; 1869 p->flags = SWP_USED;
1795 p->next = -1; 1870 p->next = -1;
1796 spin_unlock(&swap_lock); 1871 spin_unlock(&swap_lock);
1872 spin_lock_init(&p->lock);
1797 1873
1798 return p; 1874 return p;
1799} 1875}
@@ -2116,7 +2192,7 @@ void si_swapinfo(struct sysinfo *val)
2116 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 2192 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2117 nr_to_be_unused += si->inuse_pages; 2193 nr_to_be_unused += si->inuse_pages;
2118 } 2194 }
2119 val->freeswap = nr_swap_pages + nr_to_be_unused; 2195 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
2120 val->totalswap = total_swap_pages + nr_to_be_unused; 2196 val->totalswap = total_swap_pages + nr_to_be_unused;
2121 spin_unlock(&swap_lock); 2197 spin_unlock(&swap_lock);
2122} 2198}
@@ -2149,7 +2225,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2149 p = swap_info[type]; 2225 p = swap_info[type];
2150 offset = swp_offset(entry); 2226 offset = swp_offset(entry);
2151 2227
2152 spin_lock(&swap_lock); 2228 spin_lock(&p->lock);
2153 if (unlikely(offset >= p->max)) 2229 if (unlikely(offset >= p->max))
2154 goto unlock_out; 2230 goto unlock_out;
2155 2231
@@ -2184,7 +2260,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2184 p->swap_map[offset] = count | has_cache; 2260 p->swap_map[offset] = count | has_cache;
2185 2261
2186unlock_out: 2262unlock_out:
2187 spin_unlock(&swap_lock); 2263 spin_unlock(&p->lock);
2188out: 2264out:
2189 return err; 2265 return err;
2190 2266
@@ -2309,7 +2385,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2309 } 2385 }
2310 2386
2311 if (!page) { 2387 if (!page) {
2312 spin_unlock(&swap_lock); 2388 spin_unlock(&si->lock);
2313 return -ENOMEM; 2389 return -ENOMEM;
2314 } 2390 }
2315 2391
@@ -2357,7 +2433,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2357 list_add_tail(&page->lru, &head->lru); 2433 list_add_tail(&page->lru, &head->lru);
2358 page = NULL; /* now it's attached, don't free it */ 2434 page = NULL; /* now it's attached, don't free it */
2359out: 2435out:
2360 spin_unlock(&swap_lock); 2436 spin_unlock(&si->lock);
2361outer: 2437outer:
2362 if (page) 2438 if (page)
2363 __free_page(page); 2439 __free_page(page);
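
In the mm/swapfile.c hunks above, swap_entry_free() stops rewinding swap_list.next directly and instead publishes the freeing swap type through highest_priority_index with an atomic_cmpxchg() loop, which get_swap_page() later consumes via atomic_xchg(). The following compilable C11 sketch shows that lock-free publish/consume hint in isolation; the prio[] table, publish_free() and consume_hint() are hypothetical stand-ins for swap_info[]->prio and the kernel functions, not their actual implementation.

#include <stdatomic.h>
#include <stdio.h>

static int prio[] = { -2, 5, 3, 10 };		/* hypothetical per-type priorities */
static atomic_int highest_priority_index = -1;	/* -1 means "no hint published" */

/* analogue of set_highest_priority_index(type) */
static void publish_free(int type)
{
	int old;

	do {
		old = atomic_load(&highest_priority_index);
		/* keep the existing hint if it already has higher (or equal) priority */
		if (old != -1 && prio[old] >= prio[type])
			break;
	} while (!atomic_compare_exchange_weak(&highest_priority_index, &old, type));
}

/* analogue of the consumer in get_swap_page(): take the hint and clear it */
static int consume_hint(void)
{
	return atomic_exchange(&highest_priority_index, -1);
}

int main(void)
{
	publish_free(2);			/* prio 3 published */
	publish_free(1);			/* prio 5 replaces it */
	publish_free(2);			/* prio 3 loses, hint stays at type 1 */
	printf("hint = %d\n", consume_hint());	/* prints 1 */
	printf("hint = %d\n", consume_hint());	/* prints -1: already consumed */
	return 0;
}
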
diff --git a/mm/util.c b/mm/util.c
index c55e26b17d93..ab1424dbe2e6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,6 +5,8 @@
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/swap.h>
9#include <linux/swapops.h>
8#include <asm/uaccess.h> 10#include <asm/uaccess.h>
9 11
10#include "internal.h" 12#include "internal.h"
@@ -355,12 +357,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
355{ 357{
356 unsigned long ret; 358 unsigned long ret;
357 struct mm_struct *mm = current->mm; 359 struct mm_struct *mm = current->mm;
360 unsigned long populate;
358 361
359 ret = security_mmap_file(file, prot, flag); 362 ret = security_mmap_file(file, prot, flag);
360 if (!ret) { 363 if (!ret) {
361 down_write(&mm->mmap_sem); 364 down_write(&mm->mmap_sem);
362 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); 365 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
366 &populate);
363 up_write(&mm->mmap_sem); 367 up_write(&mm->mmap_sem);
368 if (populate)
369 mm_populate(ret, populate);
364 } 370 }
365 return ret; 371 return ret;
366} 372}
@@ -378,6 +384,24 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
378} 384}
379EXPORT_SYMBOL(vm_mmap); 385EXPORT_SYMBOL(vm_mmap);
380 386
387struct address_space *page_mapping(struct page *page)
388{
389 struct address_space *mapping = page->mapping;
390
391 VM_BUG_ON(PageSlab(page));
392#ifdef CONFIG_SWAP
393 if (unlikely(PageSwapCache(page))) {
394 swp_entry_t entry;
395
396 entry.val = page_private(page);
397 mapping = swap_address_space(entry);
398 } else
399#endif
400 if ((unsigned long)mapping & PAGE_MAPPING_ANON)
401 mapping = NULL;
402 return mapping;
403}
404
381/* Tracepoints definitions. */ 405/* Tracepoints definitions. */
382EXPORT_TRACEPOINT_SYMBOL(kmalloc); 406EXPORT_TRACEPOINT_SYMBOL(kmalloc);
383EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 407EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
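
vm_mmap_pgoff() above now receives the amount to pre-fault from do_mmap_pgoff() through a &populate out-parameter and calls mm_populate() only after up_write(&mm->mmap_sem), keeping the slow pre-faulting outside the write lock. A small pthread sketch of that decide-under-the-lock, work-after-unlock shape follows; do_map_locked() and prefault() are made-up stand-ins for the kernel functions. Build with `cc -pthread`.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for do_mmap_pgoff(): set up the mapping, report how much to pre-fault */
static unsigned long do_map_locked(unsigned long len, int want_populate,
				   unsigned long *populate)
{
	*populate = want_populate ? len : 0;
	return 0x100000;			/* pretend start address */
}

/* stand-in for mm_populate(): the slow part, run without the lock held */
static void prefault(unsigned long addr, unsigned long len)
{
	printf("pre-faulting %lu bytes at %#lx\n", len, addr);
}

static unsigned long demo_mmap(unsigned long len, int want_populate)
{
	unsigned long addr, populate;

	pthread_mutex_lock(&map_lock);		/* like down_write(&mm->mmap_sem) */
	addr = do_map_locked(len, want_populate, &populate);
	pthread_mutex_unlock(&map_lock);	/* drop the lock before the slow work */

	if (populate)
		prefault(addr, populate);
	return addr;
}

int main(void)
{
	demo_mmap(4096, 1);
	demo_mmap(8192, 0);
	return 0;
}
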
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5123a169ab7b..0f751f2068c3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1376,8 +1376,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1376struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1376struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1377 unsigned long start, unsigned long end) 1377 unsigned long start, unsigned long end)
1378{ 1378{
1379 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1379 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1380 __builtin_return_address(0)); 1380 GFP_KERNEL, __builtin_return_address(0));
1381} 1381}
1382EXPORT_SYMBOL_GPL(__get_vm_area); 1382EXPORT_SYMBOL_GPL(__get_vm_area);
1383 1383
@@ -1385,8 +1385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1385 unsigned long start, unsigned long end, 1385 unsigned long start, unsigned long end,
1386 const void *caller) 1386 const void *caller)
1387{ 1387{
1388 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1388 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1389 caller); 1389 GFP_KERNEL, caller);
1390} 1390}
1391 1391
1392/** 1392/**
@@ -1401,14 +1401,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1401struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1401struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1402{ 1402{
1403 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1403 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1404 -1, GFP_KERNEL, __builtin_return_address(0)); 1404 NUMA_NO_NODE, GFP_KERNEL,
1405 __builtin_return_address(0));
1405} 1406}
1406 1407
1407struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1408struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1408 const void *caller) 1409 const void *caller)
1409{ 1410{
1410 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1411 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1411 -1, GFP_KERNEL, caller); 1412 NUMA_NO_NODE, GFP_KERNEL, caller);
1412} 1413}
1413 1414
1414/** 1415/**
@@ -1650,7 +1651,7 @@ fail:
1650 * @end: vm area range end 1651 * @end: vm area range end
1651 * @gfp_mask: flags for the page level allocator 1652 * @gfp_mask: flags for the page level allocator
1652 * @prot: protection mask for the allocated pages 1653 * @prot: protection mask for the allocated pages
1653 * @node: node to use for allocation or -1 1654 * @node: node to use for allocation or NUMA_NO_NODE
1654 * @caller: caller's return address 1655 * @caller: caller's return address
1655 * 1656 *
1656 * Allocate enough pages to cover @size from the page level 1657 * Allocate enough pages to cover @size from the page level
@@ -1706,7 +1707,7 @@ fail:
1706 * @align: desired alignment 1707 * @align: desired alignment
1707 * @gfp_mask: flags for the page level allocator 1708 * @gfp_mask: flags for the page level allocator
1708 * @prot: protection mask for the allocated pages 1709 * @prot: protection mask for the allocated pages
1709 * @node: node to use for allocation or -1 1710 * @node: node to use for allocation or NUMA_NO_NODE
1710 * @caller: caller's return address 1711 * @caller: caller's return address
1711 * 1712 *
1712 * Allocate enough pages to cover @size from the page level 1713 * Allocate enough pages to cover @size from the page level
@@ -1723,7 +1724,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1723 1724
1724void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1725void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1725{ 1726{
1726 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1727 return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
1727 __builtin_return_address(0)); 1728 __builtin_return_address(0));
1728} 1729}
1729EXPORT_SYMBOL(__vmalloc); 1730EXPORT_SYMBOL(__vmalloc);
@@ -1746,7 +1747,8 @@ static inline void *__vmalloc_node_flags(unsigned long size,
1746 */ 1747 */
1747void *vmalloc(unsigned long size) 1748void *vmalloc(unsigned long size)
1748{ 1749{
1749 return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); 1750 return __vmalloc_node_flags(size, NUMA_NO_NODE,
1751 GFP_KERNEL | __GFP_HIGHMEM);
1750} 1752}
1751EXPORT_SYMBOL(vmalloc); 1753EXPORT_SYMBOL(vmalloc);
1752 1754
@@ -1762,7 +1764,7 @@ EXPORT_SYMBOL(vmalloc);
1762 */ 1764 */
1763void *vzalloc(unsigned long size) 1765void *vzalloc(unsigned long size)
1764{ 1766{
1765 return __vmalloc_node_flags(size, -1, 1767 return __vmalloc_node_flags(size, NUMA_NO_NODE,
1766 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); 1768 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1767} 1769}
1768EXPORT_SYMBOL(vzalloc); 1770EXPORT_SYMBOL(vzalloc);
@@ -1781,7 +1783,8 @@ void *vmalloc_user(unsigned long size)
1781 1783
1782 ret = __vmalloc_node(size, SHMLBA, 1784 ret = __vmalloc_node(size, SHMLBA,
1783 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 1785 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1784 PAGE_KERNEL, -1, __builtin_return_address(0)); 1786 PAGE_KERNEL, NUMA_NO_NODE,
1787 __builtin_return_address(0));
1785 if (ret) { 1788 if (ret) {
1786 area = find_vm_area(ret); 1789 area = find_vm_area(ret);
1787 area->flags |= VM_USERMAP; 1790 area->flags |= VM_USERMAP;
@@ -1846,7 +1849,7 @@ EXPORT_SYMBOL(vzalloc_node);
1846void *vmalloc_exec(unsigned long size) 1849void *vmalloc_exec(unsigned long size)
1847{ 1850{
1848 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 1851 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1849 -1, __builtin_return_address(0)); 1852 NUMA_NO_NODE, __builtin_return_address(0));
1850} 1853}
1851 1854
1852#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 1855#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1867,7 +1870,7 @@ void *vmalloc_exec(unsigned long size)
1867void *vmalloc_32(unsigned long size) 1870void *vmalloc_32(unsigned long size)
1868{ 1871{
1869 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, 1872 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1870 -1, __builtin_return_address(0)); 1873 NUMA_NO_NODE, __builtin_return_address(0));
1871} 1874}
1872EXPORT_SYMBOL(vmalloc_32); 1875EXPORT_SYMBOL(vmalloc_32);
1873 1876
@@ -1884,7 +1887,7 @@ void *vmalloc_32_user(unsigned long size)
1884 void *ret; 1887 void *ret;
1885 1888
1886 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 1889 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1887 -1, __builtin_return_address(0)); 1890 NUMA_NO_NODE, __builtin_return_address(0));
1888 if (ret) { 1891 if (ret) {
1889 area = find_vm_area(ret); 1892 area = find_vm_area(ret);
1890 area->flags |= VM_USERMAP; 1893 area->flags |= VM_USERMAP;
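
[Editor's note] The vmalloc.c hunks above are a mechanical conversion: every caller that passed a literal -1 as the preferred node now passes NUMA_NO_NODE, which has the same value but documents the intent. A minimal user-space sketch of the idea (the allocator stub and names below are illustrative assumptions, not the kernel API):

#include <stdio.h>
#include <stddef.h>

#define NUMA_NO_NODE    (-1)    /* named sentinel: "no node preference" */

/* Toy stand-in for a node-aware allocator: it resolves the sentinel to a
 * concrete node instead of comparing against a bare -1 at every call site. */
static void *toy_alloc_node(size_t size, int node)
{
        if (node == NUMA_NO_NODE)
                node = 0;               /* fall back to a default node */
        printf("allocating %zu bytes on node %d\n", size, node);
        return NULL;                    /* allocation elided in this sketch */
}

int main(void)
{
        toy_alloc_node(4096, NUMA_NO_NODE);     /* was: toy_alloc_node(4096, -1) */
        return 0;
}
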
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 196709f5ee58..88c5fed8b9a4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -128,7 +128,7 @@ struct scan_control {
128 * From 0 .. 100. Higher means more swappy. 128 * From 0 .. 100. Higher means more swappy.
129 */ 129 */
130int vm_swappiness = 60; 130int vm_swappiness = 60;
131long vm_total_pages; /* The total number of pages which the VM controls */ 131unsigned long vm_total_pages; /* The total number of pages which the VM controls */
132 132
133static LIST_HEAD(shrinker_list); 133static LIST_HEAD(shrinker_list);
134static DECLARE_RWSEM(shrinker_rwsem); 134static DECLARE_RWSEM(shrinker_rwsem);
@@ -1579,16 +1579,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
1579} 1579}
1580#endif 1580#endif
1581 1581
1582static int inactive_file_is_low_global(struct zone *zone)
1583{
1584 unsigned long active, inactive;
1585
1586 active = zone_page_state(zone, NR_ACTIVE_FILE);
1587 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1588
1589 return (active > inactive);
1590}
1591
1592/** 1582/**
1593 * inactive_file_is_low - check if file pages need to be deactivated 1583 * inactive_file_is_low - check if file pages need to be deactivated
1594 * @lruvec: LRU vector to check 1584 * @lruvec: LRU vector to check
@@ -1605,10 +1595,13 @@ static int inactive_file_is_low_global(struct zone *zone)
1605 */ 1595 */
1606static int inactive_file_is_low(struct lruvec *lruvec) 1596static int inactive_file_is_low(struct lruvec *lruvec)
1607{ 1597{
1608 if (!mem_cgroup_disabled()) 1598 unsigned long inactive;
1609 return mem_cgroup_inactive_file_is_low(lruvec); 1599 unsigned long active;
1600
1601 inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1602 active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
1610 1603
1611 return inactive_file_is_low_global(lruvec_zone(lruvec)); 1604 return active > inactive;
1612} 1605}
1613 1606
1614static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) 1607static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
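
[Editor's note] With inactive_file_is_low_global() removed above, a single comparison of the per-lruvec LRU sizes now serves both global and memcg reclaim. A rough user-space sketch of the check, with a simplified lruvec and get_lru_size() standing in for the kernel's:

#include <stdbool.h>
#include <stdio.h>

enum lru_list { LRU_INACTIVE_FILE, LRU_ACTIVE_FILE, NR_LRU_LISTS };

struct lruvec {
        unsigned long lru_size[NR_LRU_LISTS];   /* page counts per LRU list */
};

static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
        return lruvec->lru_size[lru];
}

/* Deactivate active file pages only while they outnumber inactive ones. */
static bool inactive_file_is_low(struct lruvec *lruvec)
{
        unsigned long inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
        unsigned long active = get_lru_size(lruvec, LRU_ACTIVE_FILE);

        return active > inactive;
}

int main(void)
{
        struct lruvec v = { .lru_size = { [LRU_INACTIVE_FILE] = 100,
                                          [LRU_ACTIVE_FILE]   = 300 } };
        printf("inactive file low: %d\n", inactive_file_is_low(&v));
        return 0;
}
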
@@ -1638,6 +1631,13 @@ static int vmscan_swappiness(struct scan_control *sc)
1638 return mem_cgroup_swappiness(sc->target_mem_cgroup); 1631 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1639} 1632}
1640 1633
1634enum scan_balance {
1635 SCAN_EQUAL,
1636 SCAN_FRACT,
1637 SCAN_ANON,
1638 SCAN_FILE,
1639};
1640
1641/* 1641/*
1642 * Determine how aggressively the anon and file LRU lists should be 1642 * Determine how aggressively the anon and file LRU lists should be
1643 * scanned. The relative value of each set of LRU lists is determined 1643 * scanned. The relative value of each set of LRU lists is determined
@@ -1650,15 +1650,16 @@ static int vmscan_swappiness(struct scan_control *sc)
1650static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 1650static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1651 unsigned long *nr) 1651 unsigned long *nr)
1652{ 1652{
1653 unsigned long anon, file, free; 1653 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1654 u64 fraction[2];
1655 u64 denominator = 0; /* gcc */
1656 struct zone *zone = lruvec_zone(lruvec);
1654 unsigned long anon_prio, file_prio; 1657 unsigned long anon_prio, file_prio;
1658 enum scan_balance scan_balance;
1659 unsigned long anon, file, free;
1660 bool force_scan = false;
1655 unsigned long ap, fp; 1661 unsigned long ap, fp;
1656 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1657 u64 fraction[2], denominator;
1658 enum lru_list lru; 1662 enum lru_list lru;
1659 int noswap = 0;
1660 bool force_scan = false;
1661 struct zone *zone = lruvec_zone(lruvec);
1662 1663
1663 /* 1664 /*
1664 * If the zone or memcg is small, nr[l] can be 0. This 1665 * If the zone or memcg is small, nr[l] can be 0. This
@@ -1676,11 +1677,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1676 force_scan = true; 1677 force_scan = true;
1677 1678
1678 /* If we have no swap space, do not bother scanning anon pages. */ 1679 /* If we have no swap space, do not bother scanning anon pages. */
1679 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1680 if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
1680 noswap = 1; 1681 scan_balance = SCAN_FILE;
1681 fraction[0] = 0; 1682 goto out;
1682 fraction[1] = 1; 1683 }
1683 denominator = 1; 1684
1685 /*
1686 * Global reclaim will swap to prevent OOM even with no
1687 * swappiness, but memcg users want to use this knob to
1688 * disable swapping for individual groups completely when
1689 * using the memory controller's swap limit feature would be
1690 * too expensive.
1691 */
1692 if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
1693 scan_balance = SCAN_FILE;
1694 goto out;
1695 }
1696
1697 /*
1698 * Do not apply any pressure balancing cleverness when the
1699 * system is close to OOM, scan both anon and file equally
1700 * (unless the swappiness setting disagrees with swapping).
1701 */
1702 if (!sc->priority && vmscan_swappiness(sc)) {
1703 scan_balance = SCAN_EQUAL;
1684 goto out; 1704 goto out;
1685 } 1705 }
1686 1706
@@ -1689,30 +1709,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1689 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + 1709 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1690 get_lru_size(lruvec, LRU_INACTIVE_FILE); 1710 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1691 1711
1712 /*
1713 * If it's foreseeable that reclaiming the file cache won't be
1714 * enough to get the zone back into a desirable shape, we have
1715 * to swap. Better start now and leave the - probably heavily
1716 * thrashing - remaining file pages alone.
1717 */
1692 if (global_reclaim(sc)) { 1718 if (global_reclaim(sc)) {
1693 free = zone_page_state(zone, NR_FREE_PAGES); 1719 free = zone_page_state(zone, NR_FREE_PAGES);
1694 if (unlikely(file + free <= high_wmark_pages(zone))) { 1720 if (unlikely(file + free <= high_wmark_pages(zone))) {
1695 /* 1721 scan_balance = SCAN_ANON;
1696 * If we have very few page cache pages, force-scan
1697 * anon pages.
1698 */
1699 fraction[0] = 1;
1700 fraction[1] = 0;
1701 denominator = 1;
1702 goto out;
1703 } else if (!inactive_file_is_low_global(zone)) {
1704 /*
1705 * There is enough inactive page cache, do not
1706 * reclaim anything from the working set right now.
1707 */
1708 fraction[0] = 0;
1709 fraction[1] = 1;
1710 denominator = 1;
1711 goto out; 1722 goto out;
1712 } 1723 }
1713 } 1724 }
1714 1725
1715 /* 1726 /*
1727 * There is enough inactive page cache, do not reclaim
1728 * anything from the anonymous working set right now.
1729 */
1730 if (!inactive_file_is_low(lruvec)) {
1731 scan_balance = SCAN_FILE;
1732 goto out;
1733 }
1734
1735 scan_balance = SCAN_FRACT;
1736
1737 /*
1716 * With swappiness at 100, anonymous and file have the same priority. 1738 * With swappiness at 100, anonymous and file have the same priority.
1717 * This scanning priority is essentially the inverse of IO cost. 1739 * This scanning priority is essentially the inverse of IO cost.
1718 */ 1740 */
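
[Editor's note] Taken together, the hunks above turn get_scan_count() into a two-step routine: first classify the situation into one scan_balance mode, then compute the per-LRU targets. A condensed user-space sketch of the classification ladder (the context struct and its fields are toy stand-ins for scan_control and the zone statistics):

#include <stdbool.h>
#include <stdio.h>

enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE };

struct toy_ctx {
        bool may_swap;                  /* caller allows swapping at all */
        long nr_swap_pages;             /* free swap slots */
        bool global_reclaim;            /* kswapd/direct vs. memcg limit reclaim */
        int swappiness;                 /* 0..100 */
        int priority;                   /* 0 == most desperate */
        unsigned long file_lru;         /* file pages on the LRUs */
        unsigned long free;             /* free pages in the zone */
        unsigned long high_wmark;       /* zone high watermark */
        bool inactive_file_low;         /* active file > inactive file */
};

static enum scan_balance classify(const struct toy_ctx *c)
{
        if (!c->may_swap || c->nr_swap_pages <= 0)
                return SCAN_FILE;       /* no swap: anon is not reclaimable */
        if (!c->global_reclaim && !c->swappiness)
                return SCAN_FILE;       /* memcg asked for no swapping at all */
        if (!c->priority && c->swappiness)
                return SCAN_EQUAL;      /* near-OOM: scan both lists flat out */
        if (c->global_reclaim && c->file_lru + c->free <= c->high_wmark)
                return SCAN_ANON;       /* file cache alone cannot fix the zone */
        if (!c->inactive_file_low)
                return SCAN_FILE;       /* plenty of inactive file cache left */
        return SCAN_FRACT;              /* otherwise balance by swappiness */
}

int main(void)
{
        struct toy_ctx c = { .may_swap = true, .nr_swap_pages = 1000,
                             .global_reclaim = true, .swappiness = 60,
                             .priority = 12, .file_lru = 5000, .free = 2000,
                             .high_wmark = 1000, .inactive_file_low = true };
        printf("scan_balance = %d\n", classify(&c));
        return 0;
}
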
@@ -1759,19 +1781,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1759out: 1781out:
1760 for_each_evictable_lru(lru) { 1782 for_each_evictable_lru(lru) {
1761 int file = is_file_lru(lru); 1783 int file = is_file_lru(lru);
1784 unsigned long size;
1762 unsigned long scan; 1785 unsigned long scan;
1763 1786
1764 scan = get_lru_size(lruvec, lru); 1787 size = get_lru_size(lruvec, lru);
1765 if (sc->priority || noswap || !vmscan_swappiness(sc)) { 1788 scan = size >> sc->priority;
1766 scan >>= sc->priority; 1789
1767 if (!scan && force_scan) 1790 if (!scan && force_scan)
1768 scan = SWAP_CLUSTER_MAX; 1791 scan = min(size, SWAP_CLUSTER_MAX);
1792
1793 switch (scan_balance) {
1794 case SCAN_EQUAL:
1795 /* Scan lists relative to size */
1796 break;
1797 case SCAN_FRACT:
1798 /*
1799 * Scan types proportional to swappiness and
1800 * their relative recent reclaim efficiency.
1801 */
1769 scan = div64_u64(scan * fraction[file], denominator); 1802 scan = div64_u64(scan * fraction[file], denominator);
1803 break;
1804 case SCAN_FILE:
1805 case SCAN_ANON:
1806 /* Scan one type exclusively */
1807 if ((scan_balance == SCAN_FILE) != file)
1808 scan = 0;
1809 break;
1810 default:
1811 /* Look ma, no brain */
1812 BUG();
1770 } 1813 }
1771 nr[lru] = scan; 1814 nr[lru] = scan;
1772 } 1815 }
1773} 1816}
1774 1817
1818/*
1819 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1820 */
1821static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1822{
1823 unsigned long nr[NR_LRU_LISTS];
1824 unsigned long nr_to_scan;
1825 enum lru_list lru;
1826 unsigned long nr_reclaimed = 0;
1827 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1828 struct blk_plug plug;
1829
1830 get_scan_count(lruvec, sc, nr);
1831
1832 blk_start_plug(&plug);
1833 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1834 nr[LRU_INACTIVE_FILE]) {
1835 for_each_evictable_lru(lru) {
1836 if (nr[lru]) {
1837 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
1838 nr[lru] -= nr_to_scan;
1839
1840 nr_reclaimed += shrink_list(lru, nr_to_scan,
1841 lruvec, sc);
1842 }
1843 }
1844 /*
1845 * On large memory systems, scan >> priority can become
1846 * really large. This is fine for the starting priority;
1847 * we want to put equal scanning pressure on each zone.
1848 * However, if the VM has a harder time of freeing pages,
1849 * with multiple processes reclaiming pages, the total
1850 * freeing target can get unreasonably large.
1851 */
1852 if (nr_reclaimed >= nr_to_reclaim &&
1853 sc->priority < DEF_PRIORITY)
1854 break;
1855 }
1856 blk_finish_plug(&plug);
1857 sc->nr_reclaimed += nr_reclaimed;
1858
1859 /*
1860 * Even if we did not try to evict anon pages at all, we want to
1861 * rebalance the anon lru active/inactive ratio.
1862 */
1863 if (inactive_anon_is_low(lruvec))
1864 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
1865 sc, LRU_ACTIVE_ANON);
1866
1867 throttle_vm_writeout(sc->gfp_mask);
1868}
1869
1775/* Use reclaim/compaction for costly allocs or under memory pressure */ 1870/* Use reclaim/compaction for costly allocs or under memory pressure */
1776static bool in_reclaim_compaction(struct scan_control *sc) 1871static bool in_reclaim_compaction(struct scan_control *sc)
1777{ 1872{
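
[Editor's note] Once the mode is fixed, the out: loop in the hunk above applies it per LRU list: shift the list size by the reclaim priority, then either keep it (SCAN_EQUAL), weight it by the swappiness-derived fraction (SCAN_FRACT), or zero out the type that is not being scanned. A small sketch of that switch; the fraction/denominator values are invented for the example and plain 64-bit division stands in for div64_u64():

#include <stdio.h>

enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE };

#define SWAP_CLUSTER_MAX 32UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

static unsigned long scan_target(unsigned long size, int priority, int is_file,
                                 enum scan_balance mode,
                                 const unsigned long long fraction[2],
                                 unsigned long long denominator, int force_scan)
{
        unsigned long scan = size >> priority;

        if (!scan && force_scan)
                scan = min_ul(size, SWAP_CLUSTER_MAX);

        switch (mode) {
        case SCAN_EQUAL:
                break;                          /* proportional to list size */
        case SCAN_FRACT:
                scan = scan * fraction[is_file] / denominator;
                break;                          /* swappiness-weighted split */
        case SCAN_FILE:
        case SCAN_ANON:
                if ((mode == SCAN_FILE) != is_file)
                        scan = 0;               /* scan one type exclusively */
                break;
        }
        return scan;
}

int main(void)
{
        unsigned long long fraction[2] = { 60, 140 };   /* anon, file weights */

        printf("file target: %lu\n",
               scan_target(1UL << 20, 12, 1, SCAN_FRACT, fraction, 200, 0));
        return 0;
}
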
@@ -1790,7 +1885,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
1790 * calls try_to_compact_zone() that it will have enough free pages to succeed. 1885 * calls try_to_compact_zone() that it will have enough free pages to succeed.
1791 * It will give up earlier than that if there is difficulty reclaiming pages. 1886 * It will give up earlier than that if there is difficulty reclaiming pages.
1792 */ 1887 */
1793static inline bool should_continue_reclaim(struct lruvec *lruvec, 1888static inline bool should_continue_reclaim(struct zone *zone,
1794 unsigned long nr_reclaimed, 1889 unsigned long nr_reclaimed,
1795 unsigned long nr_scanned, 1890 unsigned long nr_scanned,
1796 struct scan_control *sc) 1891 struct scan_control *sc)
@@ -1830,15 +1925,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
1830 * inactive lists are large enough, continue reclaiming 1925 * inactive lists are large enough, continue reclaiming
1831 */ 1926 */
1832 pages_for_compaction = (2UL << sc->order); 1927 pages_for_compaction = (2UL << sc->order);
1833 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1928 inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
1834 if (nr_swap_pages > 0) 1929 if (get_nr_swap_pages() > 0)
1835 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); 1930 inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
1836 if (sc->nr_reclaimed < pages_for_compaction && 1931 if (sc->nr_reclaimed < pages_for_compaction &&
1837 inactive_lru_pages > pages_for_compaction) 1932 inactive_lru_pages > pages_for_compaction)
1838 return true; 1933 return true;
1839 1934
1840 /* If compaction would go ahead or the allocation would succeed, stop */ 1935 /* If compaction would go ahead or the allocation would succeed, stop */
1841 switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) { 1936 switch (compaction_suitable(zone, sc->order)) {
1842 case COMPACT_PARTIAL: 1937 case COMPACT_PARTIAL:
1843 case COMPACT_CONTINUE: 1938 case COMPACT_CONTINUE:
1844 return false; 1939 return false;
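
[Editor's note] Since reclaim/compaction restarts now happen at shrink_zone() level, should_continue_reclaim() reads zone-wide inactive counters rather than a single lruvec. A compressed sketch of the "keep reclaiming so compaction can succeed" test, with the compaction verdict folded into one boolean parameter:

#include <stdbool.h>
#include <stdio.h>

/* Keep reclaiming while an order-`order` compaction attempt still lacks both
 * reclaimed pages and inactive pages to migrate into. */
static bool continue_for_compaction(int order, long nr_swap_pages,
                                    unsigned long nr_reclaimed,
                                    unsigned long inactive_file,
                                    unsigned long inactive_anon,
                                    bool compaction_would_succeed)
{
        unsigned long pages_for_compaction = 2UL << order;
        unsigned long inactive = inactive_file;

        if (nr_swap_pages > 0)          /* anon only helps if it can be swapped */
                inactive += inactive_anon;

        if (nr_reclaimed < pages_for_compaction &&
            inactive > pages_for_compaction)
                return true;            /* not enough freed yet, but room to try */

        return !compaction_would_succeed;       /* stop once compaction is viable */
}

int main(void)
{
        printf("continue: %d\n",
               continue_for_compaction(9, 1, 100, 4096, 4096, false));
        return 0;
}
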
@@ -1847,98 +1942,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
1847 } 1942 }
1848} 1943}
1849 1944
1850/* 1945static void shrink_zone(struct zone *zone, struct scan_control *sc)
1851 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1852 */
1853static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1854{ 1946{
1855 unsigned long nr[NR_LRU_LISTS];
1856 unsigned long nr_to_scan;
1857 enum lru_list lru;
1858 unsigned long nr_reclaimed, nr_scanned; 1947 unsigned long nr_reclaimed, nr_scanned;
1859 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1860 struct blk_plug plug;
1861
1862restart:
1863 nr_reclaimed = 0;
1864 nr_scanned = sc->nr_scanned;
1865 get_scan_count(lruvec, sc, nr);
1866
1867 blk_start_plug(&plug);
1868 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1869 nr[LRU_INACTIVE_FILE]) {
1870 for_each_evictable_lru(lru) {
1871 if (nr[lru]) {
1872 nr_to_scan = min_t(unsigned long,
1873 nr[lru], SWAP_CLUSTER_MAX);
1874 nr[lru] -= nr_to_scan;
1875
1876 nr_reclaimed += shrink_list(lru, nr_to_scan,
1877 lruvec, sc);
1878 }
1879 }
1880 /*
1881 * On large memory systems, scan >> priority can become
1882 * really large. This is fine for the starting priority;
1883 * we want to put equal scanning pressure on each zone.
1884 * However, if the VM has a harder time of freeing pages,
1885 * with multiple processes reclaiming pages, the total
1886 * freeing target can get unreasonably large.
1887 */
1888 if (nr_reclaimed >= nr_to_reclaim &&
1889 sc->priority < DEF_PRIORITY)
1890 break;
1891 }
1892 blk_finish_plug(&plug);
1893 sc->nr_reclaimed += nr_reclaimed;
1894 1948
1895 /* 1949 do {
1896 * Even if we did not try to evict anon pages at all, we want to 1950 struct mem_cgroup *root = sc->target_mem_cgroup;
1897 * rebalance the anon lru active/inactive ratio. 1951 struct mem_cgroup_reclaim_cookie reclaim = {
1898 */ 1952 .zone = zone,
1899 if (inactive_anon_is_low(lruvec)) 1953 .priority = sc->priority,
1900 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 1954 };
1901 sc, LRU_ACTIVE_ANON); 1955 struct mem_cgroup *memcg;
1902
1903 /* reclaim/compaction might need reclaim to continue */
1904 if (should_continue_reclaim(lruvec, nr_reclaimed,
1905 sc->nr_scanned - nr_scanned, sc))
1906 goto restart;
1907 1956
1908 throttle_vm_writeout(sc->gfp_mask); 1957 nr_reclaimed = sc->nr_reclaimed;
1909} 1958 nr_scanned = sc->nr_scanned;
1910 1959
1911static void shrink_zone(struct zone *zone, struct scan_control *sc) 1960 memcg = mem_cgroup_iter(root, NULL, &reclaim);
1912{ 1961 do {
1913 struct mem_cgroup *root = sc->target_mem_cgroup; 1962 struct lruvec *lruvec;
1914 struct mem_cgroup_reclaim_cookie reclaim = {
1915 .zone = zone,
1916 .priority = sc->priority,
1917 };
1918 struct mem_cgroup *memcg;
1919 1963
1920 memcg = mem_cgroup_iter(root, NULL, &reclaim); 1964 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
1921 do {
1922 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
1923 1965
1924 shrink_lruvec(lruvec, sc); 1966 shrink_lruvec(lruvec, sc);
1925 1967
1926 /* 1968 /*
1927 * Limit reclaim has historically picked one memcg and 1969 * Direct reclaim and kswapd have to scan all memory
1928 * scanned it with decreasing priority levels until 1970 * cgroups to fulfill the overall scan target for the
1929 * nr_to_reclaim had been reclaimed. This priority 1971 * zone.
1930 * cycle is thus over after a single memcg. 1972 *
1931 * 1973 * Limit reclaim, on the other hand, only cares about
1932 * Direct reclaim and kswapd, on the other hand, have 1974 * nr_to_reclaim pages to be reclaimed and it will
1933 * to scan all memory cgroups to fulfill the overall 1975 * retry with decreasing priority if one round over the
1934 * scan target for the zone. 1976 * whole hierarchy is not sufficient.
1935 */ 1977 */
1936 if (!global_reclaim(sc)) { 1978 if (!global_reclaim(sc) &&
1937 mem_cgroup_iter_break(root, memcg); 1979 sc->nr_reclaimed >= sc->nr_to_reclaim) {
1938 break; 1980 mem_cgroup_iter_break(root, memcg);
1939 } 1981 break;
1940 memcg = mem_cgroup_iter(root, memcg, &reclaim); 1982 }
1941 } while (memcg); 1983 memcg = mem_cgroup_iter(root, memcg, &reclaim);
1984 } while (memcg);
1985 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
1986 sc->nr_scanned - nr_scanned, sc));
1942} 1987}
1943 1988
1944/* Returns true if compaction should go ahead for a high-order request */ 1989/* Returns true if compaction should go ahead for a high-order request */
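
[Editor's note] The restructured shrink_zone() above nests the memcg walk inside a loop driven by should_continue_reclaim(), so restarts cover the whole zone, while limit reclaim still bails out as soon as its target is met. A structural sketch of the nesting; the iterator, the per-lruvec shrinking and the continuation test are all stubbed:

#include <stdbool.h>
#include <stdio.h>

struct sc {
        unsigned long nr_reclaimed, nr_to_reclaim;
        bool global;            /* global reclaim vs. memcg limit reclaim */
        int rounds_left;        /* stand-in for should_continue_reclaim() state */
};

/* Stubs standing in for shrink_lruvec(), the memcg hierarchy iterator and the
 * should_continue_reclaim() predicate. */
static void shrink_one_group(struct sc *sc)     { sc->nr_reclaimed += 16; }
static int  next_group(int memcg)               { return memcg < 3 ? memcg + 1 : 0; }
static bool should_continue(struct sc *sc)      { return --sc->rounds_left > 0; }

static void shrink_zone(struct sc *sc)
{
        do {
                int memcg = next_group(0);      /* first cgroup in the hierarchy */

                do {
                        shrink_one_group(sc);   /* shrink_lruvec() per cgroup */

                        /* Limit reclaim stops at its target; global reclaim
                         * must visit every group to keep pressure even. */
                        if (!sc->global &&
                            sc->nr_reclaimed >= sc->nr_to_reclaim)
                                break;

                        memcg = next_group(memcg);
                } while (memcg);
        } while (should_continue(sc));          /* reclaim/compaction restart */
}

int main(void)
{
        struct sc sc = { .nr_to_reclaim = 32, .global = false, .rounds_left = 2 };

        shrink_zone(&sc);
        printf("reclaimed %lu pages\n", sc.nr_reclaimed);
        return 0;
}
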
@@ -1958,7 +2003,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
1958 * a reasonable chance of completing and allocating the page 2003 * a reasonable chance of completing and allocating the page
1959 */ 2004 */
1960 balance_gap = min(low_wmark_pages(zone), 2005 balance_gap = min(low_wmark_pages(zone),
1961 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2006 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
1962 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2007 KSWAPD_ZONE_BALANCE_GAP_RATIO);
1963 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); 2008 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
1964 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); 2009 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
@@ -2150,6 +2195,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2150 goto out; 2195 goto out;
2151 2196
2152 /* 2197 /*
2198 * If we're getting trouble reclaiming, start doing
2199 * writepage even in laptop mode.
2200 */
2201 if (sc->priority < DEF_PRIORITY - 2)
2202 sc->may_writepage = 1;
2203
2204 /*
2153 * Try to write back as many pages as we just scanned. This 2205 * Try to write back as many pages as we just scanned. This
2154 * tends to cause slow streaming writers to write data to the 2206 * tends to cause slow streaming writers to write data to the
2155 * disk smoothly, at the dirtying rate, which is nice. But 2207 * disk smoothly, at the dirtying rate, which is nice. But
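
[Editor's note] Both here and in balance_pgdat() further down, the old scanned-vs-reclaimed heuristic for enabling writepage is replaced by a plain priority check. A trivial sketch of the new trigger (DEF_PRIORITY is 12 in this kernel; the laptop_mode handling is simplified):

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY 12         /* starting (gentlest) reclaim priority */

/* Once reclaim has had to drop the priority a couple of times, allow writing
 * dirty pages even in laptop mode. */
static bool allow_writepage(int priority, bool laptop_mode)
{
        if (priority < DEF_PRIORITY - 2)
                return true;
        return !laptop_mode;
}

int main(void)
{
        printf("prio 11, laptop: %d\n", allow_writepage(11, true));
        printf("prio  9, laptop: %d\n", allow_writepage(9, true));
        return 0;
}
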
@@ -2300,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2300{ 2352{
2301 unsigned long nr_reclaimed; 2353 unsigned long nr_reclaimed;
2302 struct scan_control sc = { 2354 struct scan_control sc = {
2303 .gfp_mask = gfp_mask, 2355 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2304 .may_writepage = !laptop_mode, 2356 .may_writepage = !laptop_mode,
2305 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2357 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2306 .may_unmap = 1, 2358 .may_unmap = 1,
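
[Editor's note] The .gfp_mask initializer above nests an assignment on purpose: the local gfp_mask used later in try_to_free_pages() and the scan_control field both end up holding the mask filtered by memalloc_noio_flags(). A small sketch of the idiom, with a toy flag set and filter standing in for the real GFP machinery:

#include <stdio.h>

#define __GFP_IO        0x40u
#define __GFP_FS        0x80u
#define GFP_KERNEL      (0x10u | __GFP_IO | __GFP_FS)   /* toy values */

struct toy_sc { unsigned int gfp_mask; };

/* Stand-in for memalloc_noio_flags(): strip I/O-triggering flags when the
 * current task is in a "no I/O" section. */
static unsigned int noio_filter(unsigned int gfp_mask)
{
        return gfp_mask & ~(__GFP_IO | __GFP_FS);
}

int main(void)
{
        unsigned int gfp_mask = GFP_KERNEL;

        /* One expression updates the local *and* initializes the struct. */
        struct toy_sc sc = {
                .gfp_mask = (gfp_mask = noio_filter(gfp_mask)),
        };

        printf("local %#x, sc %#x\n", gfp_mask, sc.gfp_mask);
        return 0;
}
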
@@ -2473,7 +2525,7 @@ static bool zone_balanced(struct zone *zone, int order,
2473 */ 2525 */
2474static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) 2526static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2475{ 2527{
2476 unsigned long present_pages = 0; 2528 unsigned long managed_pages = 0;
2477 unsigned long balanced_pages = 0; 2529 unsigned long balanced_pages = 0;
2478 int i; 2530 int i;
2479 2531
@@ -2484,7 +2536,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2484 if (!populated_zone(zone)) 2536 if (!populated_zone(zone))
2485 continue; 2537 continue;
2486 2538
2487 present_pages += zone->present_pages; 2539 managed_pages += zone->managed_pages;
2488 2540
2489 /* 2541 /*
2490 * A special case here: 2542 * A special case here:
@@ -2494,18 +2546,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2494 * they must be considered balanced here as well! 2546 * they must be considered balanced here as well!
2495 */ 2547 */
2496 if (zone->all_unreclaimable) { 2548 if (zone->all_unreclaimable) {
2497 balanced_pages += zone->present_pages; 2549 balanced_pages += zone->managed_pages;
2498 continue; 2550 continue;
2499 } 2551 }
2500 2552
2501 if (zone_balanced(zone, order, 0, i)) 2553 if (zone_balanced(zone, order, 0, i))
2502 balanced_pages += zone->present_pages; 2554 balanced_pages += zone->managed_pages;
2503 else if (!order) 2555 else if (!order)
2504 return false; 2556 return false;
2505 } 2557 }
2506 2558
2507 if (order) 2559 if (order)
2508 return balanced_pages >= (present_pages >> 2); 2560 return balanced_pages >= (managed_pages >> 2);
2509 else 2561 else
2510 return true; 2562 return true;
2511} 2563}
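
[Editor's note] Switching pgdat_balanced() to managed_pages keeps the 25% rule from being skewed by memory that is present but reserved and never handed to the buddy allocator. A sketch of the balance test over managed pages; the zone array and its fields are invented for the example:

#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
        unsigned long managed_pages;    /* pages handled by the buddy allocator */
        bool populated;
        bool all_unreclaimable;
        bool meets_watermark;           /* zone_balanced() for this order */
};

static bool pgdat_balanced(const struct toy_zone *zones, int nr, int order)
{
        unsigned long managed = 0, balanced = 0;

        for (int i = 0; i < nr; i++) {
                if (!zones[i].populated)
                        continue;
                managed += zones[i].managed_pages;

                if (zones[i].all_unreclaimable || zones[i].meets_watermark)
                        balanced += zones[i].managed_pages;
                else if (!order)
                        return false;   /* order-0: every zone must pass */
        }
        /* order > 0: a quarter of the node's managed memory must be balanced */
        return order ? balanced >= (managed >> 2) : true;
}

int main(void)
{
        struct toy_zone zones[] = {
                { .managed_pages = 1000, .populated = true, .meets_watermark = true },
                { .managed_pages = 6000, .populated = true, .meets_watermark = false },
        };
        printf("balanced: %d\n", pgdat_balanced(zones, 2, 3));
        return 0;
}
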
@@ -2564,7 +2616,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2564static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2616static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2565 int *classzone_idx) 2617 int *classzone_idx)
2566{ 2618{
2567 struct zone *unbalanced_zone; 2619 bool pgdat_is_balanced = false;
2568 int i; 2620 int i;
2569 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2621 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2570 unsigned long total_scanned; 2622 unsigned long total_scanned;
@@ -2595,9 +2647,6 @@ loop_again:
2595 2647
2596 do { 2648 do {
2597 unsigned long lru_pages = 0; 2649 unsigned long lru_pages = 0;
2598 int has_under_min_watermark_zone = 0;
2599
2600 unbalanced_zone = NULL;
2601 2650
2602 /* 2651 /*
2603 * Scan in the highmem->dma direction for the highest 2652 * Scan in the highmem->dma direction for the highest
@@ -2638,8 +2687,11 @@ loop_again:
2638 zone_clear_flag(zone, ZONE_CONGESTED); 2687 zone_clear_flag(zone, ZONE_CONGESTED);
2639 } 2688 }
2640 } 2689 }
2641 if (i < 0) 2690
2691 if (i < 0) {
2692 pgdat_is_balanced = true;
2642 goto out; 2693 goto out;
2694 }
2643 2695
2644 for (i = 0; i <= end_zone; i++) { 2696 for (i = 0; i <= end_zone; i++) {
2645 struct zone *zone = pgdat->node_zones + i; 2697 struct zone *zone = pgdat->node_zones + i;
@@ -2689,7 +2741,7 @@ loop_again:
2689 * of the zone, whichever is smaller. 2741 * of the zone, whichever is smaller.
2690 */ 2742 */
2691 balance_gap = min(low_wmark_pages(zone), 2743 balance_gap = min(low_wmark_pages(zone),
2692 (zone->present_pages + 2744 (zone->managed_pages +
2693 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2745 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2694 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2746 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2695 /* 2747 /*
@@ -2720,12 +2772,10 @@ loop_again:
2720 } 2772 }
2721 2773
2722 /* 2774 /*
2723 * If we've done a decent amount of scanning and 2775 * If we're getting trouble reclaiming, start doing
2724 * the reclaim ratio is low, start doing writepage 2776 * writepage even in laptop mode.
2725 * even in laptop mode
2726 */ 2777 */
2727 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 2778 if (sc.priority < DEF_PRIORITY - 2)
2728 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2729 sc.may_writepage = 1; 2779 sc.may_writepage = 1;
2730 2780
2731 if (zone->all_unreclaimable) { 2781 if (zone->all_unreclaimable) {
@@ -2734,17 +2784,7 @@ loop_again:
2734 continue; 2784 continue;
2735 } 2785 }
2736 2786
2737 if (!zone_balanced(zone, testorder, 0, end_zone)) { 2787 if (zone_balanced(zone, testorder, 0, end_zone))
2738 unbalanced_zone = zone;
2739 /*
2740 * We are still under min water mark. This
2741 * means that we have a GFP_ATOMIC allocation
2742 * failure risk. Hurry up!
2743 */
2744 if (!zone_watermark_ok_safe(zone, order,
2745 min_wmark_pages(zone), end_zone, 0))
2746 has_under_min_watermark_zone = 1;
2747 } else {
2748 /* 2788 /*
2749 * If a zone reaches its high watermark, 2789 * If a zone reaches its high watermark,
2750 * consider it to be no longer congested. It's 2790 * consider it to be no longer congested. It's
@@ -2753,8 +2793,6 @@ loop_again:
2753 * speculatively avoid congestion waits 2793 * speculatively avoid congestion waits
2754 */ 2794 */
2755 zone_clear_flag(zone, ZONE_CONGESTED); 2795 zone_clear_flag(zone, ZONE_CONGESTED);
2756 }
2757
2758 } 2796 }
2759 2797
2760 /* 2798 /*
@@ -2766,17 +2804,9 @@ loop_again:
2766 pfmemalloc_watermark_ok(pgdat)) 2804 pfmemalloc_watermark_ok(pgdat))
2767 wake_up(&pgdat->pfmemalloc_wait); 2805 wake_up(&pgdat->pfmemalloc_wait);
2768 2806
2769 if (pgdat_balanced(pgdat, order, *classzone_idx)) 2807 if (pgdat_balanced(pgdat, order, *classzone_idx)) {
2808 pgdat_is_balanced = true;
2770 break; /* kswapd: all done */ 2809 break; /* kswapd: all done */
2771 /*
2772 * OK, kswapd is getting into trouble. Take a nap, then take
2773 * another pass across the zones.
2774 */
2775 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
2776 if (has_under_min_watermark_zone)
2777 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2778 else if (unbalanced_zone)
2779 wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
2780 } 2810 }
2781 2811
2782 /* 2812 /*
@@ -2788,9 +2818,9 @@ loop_again:
2788 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 2818 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
2789 break; 2819 break;
2790 } while (--sc.priority >= 0); 2820 } while (--sc.priority >= 0);
2791out:
2792 2821
2793 if (!pgdat_balanced(pgdat, order, *classzone_idx)) { 2822out:
2823 if (!pgdat_is_balanced) {
2794 cond_resched(); 2824 cond_resched();
2795 2825
2796 try_to_freeze(); 2826 try_to_freeze();
@@ -3053,7 +3083,7 @@ unsigned long global_reclaimable_pages(void)
3053 nr = global_page_state(NR_ACTIVE_FILE) + 3083 nr = global_page_state(NR_ACTIVE_FILE) +
3054 global_page_state(NR_INACTIVE_FILE); 3084 global_page_state(NR_INACTIVE_FILE);
3055 3085
3056 if (nr_swap_pages > 0) 3086 if (get_nr_swap_pages() > 0)
3057 nr += global_page_state(NR_ACTIVE_ANON) + 3087 nr += global_page_state(NR_ACTIVE_ANON) +
3058 global_page_state(NR_INACTIVE_ANON); 3088 global_page_state(NR_INACTIVE_ANON);
3059 3089
@@ -3067,7 +3097,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
3067 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 3097 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
3068 zone_page_state(zone, NR_INACTIVE_FILE); 3098 zone_page_state(zone, NR_INACTIVE_FILE);
3069 3099
3070 if (nr_swap_pages > 0) 3100 if (get_nr_swap_pages() > 0)
3071 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 3101 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
3072 zone_page_state(zone, NR_INACTIVE_ANON); 3102 zone_page_state(zone, NR_INACTIVE_ANON);
3073 3103
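
[Editor's note] Both reclaimable-pages helpers now go through get_nr_swap_pages(): the anon LRUs only count as reclaimable when there is swap to move them to. A compact sketch of that accounting with example counters:

#include <stdio.h>

struct toy_counters {
        unsigned long active_file, inactive_file;
        unsigned long active_anon, inactive_anon;
};

static unsigned long reclaimable_pages(const struct toy_counters *c,
                                       long nr_swap_pages)
{
        unsigned long nr = c->active_file + c->inactive_file;

        if (nr_swap_pages > 0)          /* anon counts only if it can be swapped */
                nr += c->active_anon + c->inactive_anon;

        return nr;
}

int main(void)
{
        struct toy_counters c = { 100, 200, 300, 400 };

        printf("with swap:    %lu\n", reclaimable_pages(&c, 1L << 20));
        printf("without swap: %lu\n", reclaimable_pages(&c, 0));
        return 0;
}
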
@@ -3280,9 +3310,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3280 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3310 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3281 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3311 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3282 .may_swap = 1, 3312 .may_swap = 1,
3283 .nr_to_reclaim = max_t(unsigned long, nr_pages, 3313 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3284 SWAP_CLUSTER_MAX), 3314 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
3285 .gfp_mask = gfp_mask,
3286 .order = order, 3315 .order = order,
3287 .priority = ZONE_RECLAIM_PRIORITY, 3316 .priority = ZONE_RECLAIM_PRIORITY,
3288 }; 3317 };
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9800306c8195..e1d8ed172c42 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -142,7 +142,7 @@ int calculate_normal_threshold(struct zone *zone)
142 * 125 1024 10 16-32 GB 9 142 * 125 1024 10 16-32 GB 9
143 */ 143 */
144 144
145 mem = zone->present_pages >> (27 - PAGE_SHIFT); 145 mem = zone->managed_pages >> (27 - PAGE_SHIFT);
146 146
147 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); 147 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
148 148
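
[Editor's note] The stat threshold is derived from the zone size expressed in 128MB units (managed_pages >> (27 - PAGE_SHIFT), since 2^27 bytes is 128MB) and the online CPU count, now based on managed rather than present pages. A worked sketch of the arithmetic; fls_ul() is a portable stand-in for fls() and the inputs are examples:

#include <stdio.h>

#define PAGE_SHIFT 12                   /* 4 KB pages in this example */

static int fls_ul(unsigned long x)      /* highest set bit, 1-based; 0 for 0 */
{
        int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

static int normal_threshold(unsigned long managed_pages, int online_cpus)
{
        /* Zone size in units of 128 MB (2^27 bytes). */
        unsigned long mem = managed_pages >> (27 - PAGE_SHIFT);
        int threshold = 2 * fls_ul(online_cpus) * (1 + fls_ul(mem));

        return threshold > 125 ? 125 : threshold;       /* capped, per the table */
}

int main(void)
{
        /* 16 GB as 4 KB pages (4M pages), 8 CPUs: mem = 128, threshold = 2*4*9 = 72 */
        printf("threshold = %d\n", normal_threshold(4UL << 20, 8));
        return 0;
}
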
@@ -628,7 +628,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
628#ifdef CONFIG_CMA 628#ifdef CONFIG_CMA
629 "CMA", 629 "CMA",
630#endif 630#endif
631#ifdef CONFIG_MEMORY_ISOLATION
631 "Isolate", 632 "Isolate",
633#endif
632}; 634};
633 635
634static void *frag_start(struct seq_file *m, loff_t *pos) 636static void *frag_start(struct seq_file *m, loff_t *pos)
@@ -768,7 +770,6 @@ const char * const vmstat_text[] = {
768 "kswapd_inodesteal", 770 "kswapd_inodesteal",
769 "kswapd_low_wmark_hit_quickly", 771 "kswapd_low_wmark_hit_quickly",
770 "kswapd_high_wmark_hit_quickly", 772 "kswapd_high_wmark_hit_quickly",
771 "kswapd_skip_congestion_wait",
772 "pageoutrun", 773 "pageoutrun",
773 "allocstall", 774 "allocstall",
774 775
@@ -890,7 +891,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
890 int mtype; 891 int mtype;
891 unsigned long pfn; 892 unsigned long pfn;
892 unsigned long start_pfn = zone->zone_start_pfn; 893 unsigned long start_pfn = zone->zone_start_pfn;
893 unsigned long end_pfn = start_pfn + zone->spanned_pages; 894 unsigned long end_pfn = zone_end_pfn(zone);
894 unsigned long count[MIGRATE_TYPES] = { 0, }; 895 unsigned long count[MIGRATE_TYPES] = { 0, };
895 896
896 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 897 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
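
[Editor's note] The final hunk replaces the open-coded start_pfn + spanned_pages with the zone_end_pfn() helper while stepping through the zone one pageblock at a time. A minimal sketch of that walk with toy zone geometry:

#include <stdio.h>

#define pageblock_nr_pages 512UL        /* toy pageblock size */

struct toy_zone {
        unsigned long zone_start_pfn;
        unsigned long spanned_pages;
};

/* Helper equivalent to start + span, kept in one place. */
static unsigned long zone_end_pfn(const struct toy_zone *zone)
{
        return zone->zone_start_pfn + zone->spanned_pages;
}

int main(void)
{
        struct toy_zone zone = { .zone_start_pfn = 4096, .spanned_pages = 2048 };
        unsigned long pfn, blocks = 0;

        for (pfn = zone.zone_start_pfn; pfn < zone_end_pfn(&zone);
             pfn += pageblock_nr_pages)
                blocks++;               /* one iteration per pageblock */

        printf("%lu pageblocks\n", blocks);
        return 0;
}
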