path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                  6
-rw-r--r--  mm/Makefile                 4
-rw-r--r--  mm/cleancache.c           276
-rw-r--r--  mm/cma.c                   62
-rw-r--r--  mm/cma.h                   24
-rw-r--r--  mm/cma_debug.c            205
-rw-r--r--  mm/compaction.c            75
-rw-r--r--  mm/filemap.c              130
-rw-r--r--  mm/gup.c                  128
-rw-r--r--  mm/huge_memory.c          136
-rw-r--r--  mm/hugetlb.c              250
-rw-r--r--  mm/internal.h               8
-rw-r--r--  mm/iov_iter.c             753
-rw-r--r--  mm/kasan/kasan.c           27
-rw-r--r--  mm/ksm.c                   10
-rw-r--r--  mm/memblock.c              22
-rw-r--r--  mm/memcontrol.c           245
-rw-r--r--  mm/memory-failure.c       122
-rw-r--r--  mm/memory.c               436
-rw-r--r--  mm/memory_hotplug.c        50
-rw-r--r--  mm/mempolicy.c              6
-rw-r--r--  mm/mempool.c              127
-rw-r--r--  mm/memtest.c              118
-rw-r--r--  mm/migrate.c               40
-rw-r--r--  mm/mlock.c                135
-rw-r--r--  mm/mmap.c                  29
-rw-r--r--  mm/mprotect.c               3
-rw-r--r--  mm/mremap.c                35
-rw-r--r--  mm/nommu.c                  5
-rw-r--r--  mm/oom_kill.c               9
-rw-r--r--  mm/page-writeback.c        29
-rw-r--r--  mm/page_alloc.c           256
-rw-r--r--  mm/page_io.c                7
-rw-r--r--  mm/page_isolation.c         1
-rw-r--r--  mm/pagewalk.c               9
-rw-r--r--  mm/percpu.c                 4
-rw-r--r--  mm/process_vm_access.c     35
-rw-r--r--  mm/rmap.c                  13
-rw-r--r--  mm/shmem.c                 34
-rw-r--r--  mm/slab.c                  22
-rw-r--r--  mm/slob.c                   3
-rw-r--r--  mm/slub.c                  38
-rw-r--r--  mm/swap.c                  34
-rw-r--r--  mm/swap_state.c             2
-rw-r--r--  mm/swapfile.c               2
-rw-r--r--  mm/truncate.c              39
-rw-r--r--  mm/util.c                  41
-rw-r--r--  mm/vmalloc.c              104
-rw-r--r--  mm/zsmalloc.c             971
49 files changed, 2807 insertions, 2313 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a03131b6ba8e..390214da4546 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -517,6 +517,12 @@ config CMA_DEBUG
 	  processing calls such as dma_alloc_from_contiguous().
 	  This option does not affect warning and error messages.
 
+config CMA_DEBUGFS
+	bool "CMA debugfs interface"
+	depends on CMA && DEBUG_FS
+	help
+	  Turns on the DebugFS interface for CMA.
+
 config CMA_AREAS
 	int "Maximum count of the CMA areas"
 	depends on CMA
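
The new CMA_DEBUGFS option is only offered when both CMA and DEBUG_FS are already enabled. As an illustrative config fragment (not part of the patch), a kernel that wants the interface would carry something like:

	CONFIG_DEBUG_FS=y
	CONFIG_CMA=y
	CONFIG_CMA_DEBUGFS=y
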
diff --git a/mm/Makefile b/mm/Makefile
index 3c1caa2693bd..98c4eaeabdcb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,7 +21,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o vmacache.o \
 			   interval_tree.o list_lru.o workingset.o \
-			   iov_iter.o debug.o $(mmu-y)
+			   debug.o $(mmu-y)
 
 obj-y += init-mm.o
 
@@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK)	+= kmemcheck.o
 obj-$(CONFIG_KASAN)	+= kasan/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+obj-$(CONFIG_MEMTEST)		+= memtest.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE)	+= huge_memory.o
@@ -76,3 +77,4 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
+obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 053bcd8f12fb..8fc50811119b 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -19,7 +19,7 @@
19#include <linux/cleancache.h> 19#include <linux/cleancache.h>
20 20
21/* 21/*
22 * cleancache_ops is set by cleancache_ops_register to contain the pointers 22 * cleancache_ops is set by cleancache_register_ops to contain the pointers
23 * to the cleancache "backend" implementation functions. 23 * to the cleancache "backend" implementation functions.
24 */ 24 */
25static struct cleancache_ops *cleancache_ops __read_mostly; 25static struct cleancache_ops *cleancache_ops __read_mostly;
@@ -34,145 +34,107 @@ static u64 cleancache_failed_gets;
34static u64 cleancache_puts; 34static u64 cleancache_puts;
35static u64 cleancache_invalidates; 35static u64 cleancache_invalidates;
36 36
37/* 37static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
38 * When no backend is registered all calls to init_fs and init_shared_fs 38{
39 * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or 39 switch (sb->cleancache_poolid) {
40 * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array 40 case CLEANCACHE_NO_BACKEND:
41 * [shared_|]fs_poolid_map) are given to the respective super block 41 __cleancache_init_fs(sb);
42 * (sb->cleancache_poolid) and no tmem_pools are created. When a backend 42 break;
43 * registers with cleancache the previous calls to init_fs and init_shared_fs 43 case CLEANCACHE_NO_BACKEND_SHARED:
44 * are executed to create tmem_pools and set the respective poolids. While no 44 __cleancache_init_shared_fs(sb);
45 * backend is registered all "puts", "gets" and "flushes" are ignored or failed. 45 break;
46 */ 46 }
47#define MAX_INITIALIZABLE_FS 32 47}
48#define FAKE_FS_POOLID_OFFSET 1000
49#define FAKE_SHARED_FS_POOLID_OFFSET 2000
50
51#define FS_NO_BACKEND (-1)
52#define FS_UNKNOWN (-2)
53static int fs_poolid_map[MAX_INITIALIZABLE_FS];
54static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS];
55static char *uuids[MAX_INITIALIZABLE_FS];
56/*
57 * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads
58 * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple
59 * threads calling mount (and ending up in __cleancache_init_[shared|]fs).
60 */
61static DEFINE_MUTEX(poolid_mutex);
62/*
63 * When set to false (default) all calls to the cleancache functions, except
64 * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded
65 * by the if (!cleancache_ops) return. This means multiple threads (from
66 * different filesystems) will be checking cleancache_ops. The usage of a
67 * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are
68 * OK if the time between the backend's have been initialized (and
69 * cleancache_ops has been set to not NULL) and when the filesystems start
70 * actually calling the backends. The inverse (when unloading) is obviously
71 * not good - but this shim does not do that (yet).
72 */
73
74/*
75 * The backends and filesystems work all asynchronously. This is b/c the
76 * backends can be built as modules.
77 * The usual sequence of events is:
78 * a) mount / -> __cleancache_init_fs is called. We set the
79 * [shared_|]fs_poolid_map and uuids for.
80 *
81 * b). user does I/Os -> we call the rest of __cleancache_* functions
82 * which return immediately as cleancache_ops is false.
83 *
84 * c). modprobe zcache -> cleancache_register_ops. We init the backend
85 * and set cleancache_ops to true, and for any fs_poolid_map
86 * (which is set by __cleancache_init_fs) we initialize the poolid.
87 *
88 * d). user does I/Os -> now that cleancache_ops is true all the
89 * __cleancache_* functions can call the backend. They all check
90 * that fs_poolid_map is valid and if so invoke the backend.
91 *
92 * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is
93 * reset (which is the second check in the __cleancache_* ops
94 * to call the backend).
95 *
96 * The sequence of event could also be c), followed by a), and d). and e). The
97 * c) would not happen anymore. There is also the chance of c), and one thread
98 * doing a) + d), and another doing e). For that case we depend on the
99 * filesystem calling __cleancache_invalidate_fs in the proper sequence (so
100 * that it handles all I/Os before it invalidates the fs (which is last part
101 * of unmounting process).
102 *
103 * Note: The acute reader will notice that there is no "rmmod zcache" case.
104 * This is b/c the functionality for that is not yet implemented and when
105 * done, will require some extra locking not yet devised.
106 */
107 48
108/* 49/*
109 * Register operations for cleancache, returning previous thus allowing 50 * Register operations for cleancache. Returns 0 on success.
110 * detection of multiple backends and possible nesting.
111 */ 51 */
112struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) 52int cleancache_register_ops(struct cleancache_ops *ops)
113{ 53{
114 struct cleancache_ops *old = cleancache_ops; 54 if (cmpxchg(&cleancache_ops, NULL, ops))
115 int i; 55 return -EBUSY;
116 56
117 mutex_lock(&poolid_mutex);
118 for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
119 if (fs_poolid_map[i] == FS_NO_BACKEND)
120 fs_poolid_map[i] = ops->init_fs(PAGE_SIZE);
121 if (shared_fs_poolid_map[i] == FS_NO_BACKEND)
122 shared_fs_poolid_map[i] = ops->init_shared_fs
123 (uuids[i], PAGE_SIZE);
124 }
125 /* 57 /*
126 * We MUST set cleancache_ops _after_ we have called the backends 58 * A cleancache backend can be built as a module and hence loaded after
127 * init_fs or init_shared_fs functions. Otherwise the compiler might 59 * a cleancache enabled filesystem has called cleancache_init_fs. To
128 * re-order where cleancache_ops is set in this function. 60 * handle such a scenario, here we call ->init_fs or ->init_shared_fs
61 * for each active super block. To differentiate between local and
62 * shared filesystems, we temporarily initialize sb->cleancache_poolid
63 * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
64 * respectively in case there is no backend registered at the time
65 * cleancache_init_fs or cleancache_init_shared_fs is called.
66 *
67 * Since filesystems can be mounted concurrently with cleancache
68 * backend registration, we have to be careful to guarantee that all
69 * cleancache enabled filesystems that has been mounted by the time
70 * cleancache_register_ops is called has got and all mounted later will
71 * get cleancache_poolid. This is assured by the following statements
72 * tied together:
73 *
74 * a) iterate_supers skips only those super blocks that has started
75 * ->kill_sb
76 *
77 * b) if iterate_supers encounters a super block that has not finished
78 * ->mount yet, it waits until it is finished
79 *
80 * c) cleancache_init_fs is called from ->mount and
81 * cleancache_invalidate_fs is called from ->kill_sb
82 *
83 * d) we call iterate_supers after cleancache_ops has been set
84 *
85 * From a) it follows that if iterate_supers skips a super block, then
86 * either the super block is already dead, in which case we do not need
87 * to bother initializing cleancache for it, or it was mounted after we
88 * initiated iterate_supers. In the latter case, it must have seen
89 * cleancache_ops set according to d) and initialized cleancache from
90 * ->mount by itself according to c). This proves that we call
91 * ->init_fs at least once for each active super block.
92 *
93 * From b) and c) it follows that if iterate_supers encounters a super
94 * block that has already started ->init_fs, it will wait until ->mount
95 * and hence ->init_fs has finished, then check cleancache_poolid, see
96 * that it has already been set and therefore do nothing. This proves
97 * that we call ->init_fs no more than once for each super block.
98 *
99 * Combined together, the last two paragraphs prove the function
100 * correctness.
101 *
102 * Note that various cleancache callbacks may proceed before this
103 * function is called or even concurrently with it, but since
104 * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
105 * until the corresponding ->init_fs has been actually called and
106 * cleancache_ops has been set.
129 */ 107 */
130 barrier(); 108 iterate_supers(cleancache_register_ops_sb, NULL);
131 cleancache_ops = ops; 109 return 0;
132 mutex_unlock(&poolid_mutex);
133 return old;
134} 110}
135EXPORT_SYMBOL(cleancache_register_ops); 111EXPORT_SYMBOL(cleancache_register_ops);
136 112
137/* Called by a cleancache-enabled filesystem at time of mount */ 113/* Called by a cleancache-enabled filesystem at time of mount */
138void __cleancache_init_fs(struct super_block *sb) 114void __cleancache_init_fs(struct super_block *sb)
139{ 115{
140 int i; 116 int pool_id = CLEANCACHE_NO_BACKEND;
141 117
142 mutex_lock(&poolid_mutex); 118 if (cleancache_ops) {
143 for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { 119 pool_id = cleancache_ops->init_fs(PAGE_SIZE);
144 if (fs_poolid_map[i] == FS_UNKNOWN) { 120 if (pool_id < 0)
145 sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; 121 pool_id = CLEANCACHE_NO_POOL;
146 if (cleancache_ops)
147 fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE);
148 else
149 fs_poolid_map[i] = FS_NO_BACKEND;
150 break;
151 }
152 } 122 }
153 mutex_unlock(&poolid_mutex); 123 sb->cleancache_poolid = pool_id;
154} 124}
155EXPORT_SYMBOL(__cleancache_init_fs); 125EXPORT_SYMBOL(__cleancache_init_fs);
156 126
157/* Called by a cleancache-enabled clustered filesystem at time of mount */ 127/* Called by a cleancache-enabled clustered filesystem at time of mount */
158void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) 128void __cleancache_init_shared_fs(struct super_block *sb)
159{ 129{
160 int i; 130 int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
161 131
162 mutex_lock(&poolid_mutex); 132 if (cleancache_ops) {
163 for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { 133 pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
164 if (shared_fs_poolid_map[i] == FS_UNKNOWN) { 134 if (pool_id < 0)
165 sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; 135 pool_id = CLEANCACHE_NO_POOL;
166 uuids[i] = uuid;
167 if (cleancache_ops)
168 shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs
169 (uuid, PAGE_SIZE);
170 else
171 shared_fs_poolid_map[i] = FS_NO_BACKEND;
172 break;
173 }
174 } 136 }
175 mutex_unlock(&poolid_mutex); 137 sb->cleancache_poolid = pool_id;
176} 138}
177EXPORT_SYMBOL(__cleancache_init_shared_fs); 139EXPORT_SYMBOL(__cleancache_init_shared_fs);
178 140
@@ -202,19 +164,6 @@ static int cleancache_get_key(struct inode *inode,
202} 164}
203 165
204/* 166/*
205 * Returns a pool_id that is associated with a given fake poolid.
206 */
207static int get_poolid_from_fake(int fake_pool_id)
208{
209 if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET)
210 return shared_fs_poolid_map[fake_pool_id -
211 FAKE_SHARED_FS_POOLID_OFFSET];
212 else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET)
213 return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET];
214 return FS_NO_BACKEND;
215}
216
217/*
218 * "Get" data from cleancache associated with the poolid/inode/index 167 * "Get" data from cleancache associated with the poolid/inode/index
219 * that were specified when the data was put to cleanache and, if 168 * that were specified when the data was put to cleanache and, if
220 * successful, use it to fill the specified page with data and return 0. 169 * successful, use it to fill the specified page with data and return 0.
@@ -229,7 +178,6 @@ int __cleancache_get_page(struct page *page)
229{ 178{
230 int ret = -1; 179 int ret = -1;
231 int pool_id; 180 int pool_id;
232 int fake_pool_id;
233 struct cleancache_filekey key = { .u.key = { 0 } }; 181 struct cleancache_filekey key = { .u.key = { 0 } };
234 182
235 if (!cleancache_ops) { 183 if (!cleancache_ops) {
@@ -238,17 +186,14 @@ int __cleancache_get_page(struct page *page)
238 } 186 }
239 187
240 VM_BUG_ON_PAGE(!PageLocked(page), page); 188 VM_BUG_ON_PAGE(!PageLocked(page), page);
241 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; 189 pool_id = page->mapping->host->i_sb->cleancache_poolid;
242 if (fake_pool_id < 0) 190 if (pool_id < 0)
243 goto out; 191 goto out;
244 pool_id = get_poolid_from_fake(fake_pool_id);
245 192
246 if (cleancache_get_key(page->mapping->host, &key) < 0) 193 if (cleancache_get_key(page->mapping->host, &key) < 0)
247 goto out; 194 goto out;
248 195
249 if (pool_id >= 0) 196 ret = cleancache_ops->get_page(pool_id, key, page->index, page);
250 ret = cleancache_ops->get_page(pool_id,
251 key, page->index, page);
252 if (ret == 0) 197 if (ret == 0)
253 cleancache_succ_gets++; 198 cleancache_succ_gets++;
254 else 199 else
@@ -271,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page);
271void __cleancache_put_page(struct page *page) 216void __cleancache_put_page(struct page *page)
272{ 217{
273 int pool_id; 218 int pool_id;
274 int fake_pool_id;
275 struct cleancache_filekey key = { .u.key = { 0 } }; 219 struct cleancache_filekey key = { .u.key = { 0 } };
276 220
277 if (!cleancache_ops) { 221 if (!cleancache_ops) {
@@ -280,12 +224,7 @@ void __cleancache_put_page(struct page *page)
280 } 224 }
281 225
282 VM_BUG_ON_PAGE(!PageLocked(page), page); 226 VM_BUG_ON_PAGE(!PageLocked(page), page);
283 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; 227 pool_id = page->mapping->host->i_sb->cleancache_poolid;
284 if (fake_pool_id < 0)
285 return;
286
287 pool_id = get_poolid_from_fake(fake_pool_id);
288
289 if (pool_id >= 0 && 228 if (pool_id >= 0 &&
290 cleancache_get_key(page->mapping->host, &key) >= 0) { 229 cleancache_get_key(page->mapping->host, &key) >= 0) {
291 cleancache_ops->put_page(pool_id, key, page->index, page); 230 cleancache_ops->put_page(pool_id, key, page->index, page);
@@ -306,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping,
306 struct page *page) 245 struct page *page)
307{ 246{
308 /* careful... page->mapping is NULL sometimes when this is called */ 247 /* careful... page->mapping is NULL sometimes when this is called */
309 int pool_id; 248 int pool_id = mapping->host->i_sb->cleancache_poolid;
310 int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
311 struct cleancache_filekey key = { .u.key = { 0 } }; 249 struct cleancache_filekey key = { .u.key = { 0 } };
312 250
313 if (!cleancache_ops) 251 if (!cleancache_ops)
314 return; 252 return;
315 253
316 if (fake_pool_id >= 0) { 254 if (pool_id >= 0) {
317 pool_id = get_poolid_from_fake(fake_pool_id);
318 if (pool_id < 0)
319 return;
320
321 VM_BUG_ON_PAGE(!PageLocked(page), page); 255 VM_BUG_ON_PAGE(!PageLocked(page), page);
322 if (cleancache_get_key(mapping->host, &key) >= 0) { 256 if (cleancache_get_key(mapping->host, &key) >= 0) {
323 cleancache_ops->invalidate_page(pool_id, 257 cleancache_ops->invalidate_page(pool_id,
@@ -339,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page);
339 */ 273 */
340void __cleancache_invalidate_inode(struct address_space *mapping) 274void __cleancache_invalidate_inode(struct address_space *mapping)
341{ 275{
342 int pool_id; 276 int pool_id = mapping->host->i_sb->cleancache_poolid;
343 int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
344 struct cleancache_filekey key = { .u.key = { 0 } }; 277 struct cleancache_filekey key = { .u.key = { 0 } };
345 278
346 if (!cleancache_ops) 279 if (!cleancache_ops)
347 return; 280 return;
348 281
349 if (fake_pool_id < 0)
350 return;
351
352 pool_id = get_poolid_from_fake(fake_pool_id);
353
354 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) 282 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
355 cleancache_ops->invalidate_inode(pool_id, key); 283 cleancache_ops->invalidate_inode(pool_id, key);
356} 284}
@@ -363,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode);
363 */ 291 */
364void __cleancache_invalidate_fs(struct super_block *sb) 292void __cleancache_invalidate_fs(struct super_block *sb)
365{ 293{
366 int index; 294 int pool_id;
367 int fake_pool_id = sb->cleancache_poolid;
368 int old_poolid = fake_pool_id;
369 295
370 mutex_lock(&poolid_mutex); 296 pool_id = sb->cleancache_poolid;
371 if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { 297 sb->cleancache_poolid = CLEANCACHE_NO_POOL;
372 index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; 298
373 old_poolid = shared_fs_poolid_map[index]; 299 if (cleancache_ops && pool_id >= 0)
374 shared_fs_poolid_map[index] = FS_UNKNOWN; 300 cleancache_ops->invalidate_fs(pool_id);
375 uuids[index] = NULL;
376 } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) {
377 index = fake_pool_id - FAKE_FS_POOLID_OFFSET;
378 old_poolid = fs_poolid_map[index];
379 fs_poolid_map[index] = FS_UNKNOWN;
380 }
381 sb->cleancache_poolid = -1;
382 if (cleancache_ops)
383 cleancache_ops->invalidate_fs(old_poolid);
384 mutex_unlock(&poolid_mutex);
385} 301}
386EXPORT_SYMBOL(__cleancache_invalidate_fs); 302EXPORT_SYMBOL(__cleancache_invalidate_fs);
387 303
388static int __init init_cleancache(void) 304static int __init init_cleancache(void)
389{ 305{
390 int i;
391
392#ifdef CONFIG_DEBUG_FS 306#ifdef CONFIG_DEBUG_FS
393 struct dentry *root = debugfs_create_dir("cleancache", NULL); 307 struct dentry *root = debugfs_create_dir("cleancache", NULL);
394 if (root == NULL) 308 if (root == NULL)
@@ -400,10 +314,6 @@ static int __init init_cleancache(void)
400 debugfs_create_u64("invalidates", S_IRUGO, 314 debugfs_create_u64("invalidates", S_IRUGO,
401 root, &cleancache_invalidates); 315 root, &cleancache_invalidates);
402#endif 316#endif
403 for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
404 fs_poolid_map[i] = FS_UNKNOWN;
405 shared_fs_poolid_map[i] = FS_UNKNOWN;
406 }
407 return 0; 317 return 0;
408} 318}
409module_init(init_cleancache) 319module_init(init_cleancache)
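
With the rework above, cleancache_register_ops() no longer returns the previous ops pointer: it claims the single backend slot with cmpxchg(), returns -EBUSY if another backend is already registered, and then walks the existing super blocks via iterate_supers() to run the deferred ->init_fs()/->init_shared_fs() calls itself. A minimal, illustrative sketch of how a backend module might register under the new contract (my_backend_ops and my_backend_init are hypothetical names; the callback prototypes are the ones declared in include/linux/cleancache.h):

#include <linux/module.h>
#include <linux/cleancache.h>

static struct cleancache_ops my_backend_ops = {
	/*
	 * .init_fs, .init_shared_fs, .get_page, .put_page,
	 * .invalidate_page, .invalidate_inode and .invalidate_fs
	 * would be filled in with the backend's callbacks here.
	 */
};

static int __init my_backend_init(void)
{
	int ret;

	/* Claims the single cleancache backend slot; -EBUSY if taken. */
	ret = cleancache_register_ops(&my_backend_ops);
	if (ret)
		return ret;
	pr_info("my_backend: registered as cleancache backend\n");
	return 0;
}
module_init(my_backend_init);
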
diff --git a/mm/cma.c b/mm/cma.c
index 75016fd1de90..3a7a67b93394 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -23,6 +23,7 @@
23# define DEBUG 23# define DEBUG
24#endif 24#endif
25#endif 25#endif
26#define CREATE_TRACE_POINTS
26 27
27#include <linux/memblock.h> 28#include <linux/memblock.h>
28#include <linux/err.h> 29#include <linux/err.h>
@@ -34,59 +35,54 @@
34#include <linux/cma.h> 35#include <linux/cma.h>
35#include <linux/highmem.h> 36#include <linux/highmem.h>
36#include <linux/io.h> 37#include <linux/io.h>
38#include <trace/events/cma.h>
37 39
38struct cma { 40#include "cma.h"
39 unsigned long base_pfn;
40 unsigned long count;
41 unsigned long *bitmap;
42 unsigned int order_per_bit; /* Order of pages represented by one bit */
43 struct mutex lock;
44};
45 41
46static struct cma cma_areas[MAX_CMA_AREAS]; 42struct cma cma_areas[MAX_CMA_AREAS];
47static unsigned cma_area_count; 43unsigned cma_area_count;
48static DEFINE_MUTEX(cma_mutex); 44static DEFINE_MUTEX(cma_mutex);
49 45
50phys_addr_t cma_get_base(struct cma *cma) 46phys_addr_t cma_get_base(const struct cma *cma)
51{ 47{
52 return PFN_PHYS(cma->base_pfn); 48 return PFN_PHYS(cma->base_pfn);
53} 49}
54 50
55unsigned long cma_get_size(struct cma *cma) 51unsigned long cma_get_size(const struct cma *cma)
56{ 52{
57 return cma->count << PAGE_SHIFT; 53 return cma->count << PAGE_SHIFT;
58} 54}
59 55
60static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) 56static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
57 int align_order)
61{ 58{
62 if (align_order <= cma->order_per_bit) 59 if (align_order <= cma->order_per_bit)
63 return 0; 60 return 0;
64 return (1UL << (align_order - cma->order_per_bit)) - 1; 61 return (1UL << (align_order - cma->order_per_bit)) - 1;
65} 62}
66 63
67static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) 64/*
65 * Find a PFN aligned to the specified order and return an offset represented in
66 * order_per_bits.
67 */
68static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
69 int align_order)
68{ 70{
69 unsigned int alignment;
70
71 if (align_order <= cma->order_per_bit) 71 if (align_order <= cma->order_per_bit)
72 return 0; 72 return 0;
73 alignment = 1UL << (align_order - cma->order_per_bit);
74 return ALIGN(cma->base_pfn, alignment) -
75 (cma->base_pfn >> cma->order_per_bit);
76}
77 73
78static unsigned long cma_bitmap_maxno(struct cma *cma) 74 return (ALIGN(cma->base_pfn, (1UL << align_order))
79{ 75 - cma->base_pfn) >> cma->order_per_bit;
80 return cma->count >> cma->order_per_bit;
81} 76}
82 77
83static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, 78static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
84 unsigned long pages) 79 unsigned long pages)
85{ 80{
86 return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; 81 return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
87} 82}
88 83
89static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) 84static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
85 unsigned int count)
90{ 86{
91 unsigned long bitmap_no, bitmap_count; 87 unsigned long bitmap_no, bitmap_count;
92 88
@@ -132,6 +128,12 @@ static int __init cma_activate_area(struct cma *cma)
132 } while (--i); 128 } while (--i);
133 129
134 mutex_init(&cma->lock); 130 mutex_init(&cma->lock);
131
132#ifdef CONFIG_CMA_DEBUGFS
133 INIT_HLIST_HEAD(&cma->mem_head);
134 spin_lock_init(&cma->mem_head_lock);
135#endif
136
135 return 0; 137 return 0;
136 138
137err: 139err:
@@ -165,7 +167,8 @@ core_initcall(cma_init_reserved_areas);
165 * This function creates custom contiguous area from already reserved memory. 167 * This function creates custom contiguous area from already reserved memory.
166 */ 168 */
167int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, 169int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
168 int order_per_bit, struct cma **res_cma) 170 unsigned int order_per_bit,
171 struct cma **res_cma)
169{ 172{
170 struct cma *cma; 173 struct cma *cma;
171 phys_addr_t alignment; 174 phys_addr_t alignment;
@@ -356,7 +359,7 @@ err:
356 * This function allocates part of contiguous memory on specific 359 * This function allocates part of contiguous memory on specific
357 * contiguous memory area. 360 * contiguous memory area.
358 */ 361 */
359struct page *cma_alloc(struct cma *cma, int count, unsigned int align) 362struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
360{ 363{
361 unsigned long mask, offset, pfn, start = 0; 364 unsigned long mask, offset, pfn, start = 0;
362 unsigned long bitmap_maxno, bitmap_no, bitmap_count; 365 unsigned long bitmap_maxno, bitmap_no, bitmap_count;
@@ -413,6 +416,8 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
413 start = bitmap_no + mask + 1; 416 start = bitmap_no + mask + 1;
414 } 417 }
415 418
419 trace_cma_alloc(page ? pfn : -1UL, page, count, align);
420
416 pr_debug("%s(): returned %p\n", __func__, page); 421 pr_debug("%s(): returned %p\n", __func__, page);
417 return page; 422 return page;
418} 423}
@@ -427,7 +432,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
427 * It returns false when provided pages do not belong to contiguous area and 432 * It returns false when provided pages do not belong to contiguous area and
428 * true otherwise. 433 * true otherwise.
429 */ 434 */
430bool cma_release(struct cma *cma, struct page *pages, int count) 435bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
431{ 436{
432 unsigned long pfn; 437 unsigned long pfn;
433 438
@@ -445,6 +450,7 @@ bool cma_release(struct cma *cma, struct page *pages, int count)
445 450
446 free_contig_range(pfn, count); 451 free_contig_range(pfn, count);
447 cma_clear_bitmap(cma, pfn, count); 452 cma_clear_bitmap(cma, pfn, count);
453 trace_cma_release(pfn, pages, count);
448 454
449 return true; 455 return true;
450} 456}
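
Besides the new tracepoints, the cma_alloc()/cma_release() prototypes above now take unsigned counts and const-qualified arguments. An illustrative kernel-side sketch of a caller using the updated interface (my_cma_selftest is a hypothetical function; the struct cma pointer would come from wherever the driver reserved its area, for example cma_init_reserved_mem()):

#include <linux/cma.h>
#include <linux/mm.h>
#include <linux/printk.h>

static int my_cma_selftest(struct cma *cma)
{
	unsigned int count = 16;		/* sixteen order-0 pages */
	struct page *pages;

	pages = cma_alloc(cma, count, 0);	/* align = order 0 */
	if (!pages)
		return -ENOMEM;

	pr_info("cma selftest: got pfn %lu\n", page_to_pfn(pages));

	if (!cma_release(cma, pages, count))
		pr_warn("cma selftest: range not part of this area\n");
	return 0;
}

The new trace_cma_alloc()/trace_cma_release() events fire inside these two calls, so a sequence like this becomes visible to ftrace without any driver changes.
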
diff --git a/mm/cma.h b/mm/cma.h
new file mode 100644
index 000000000000..1132d733556d
--- /dev/null
+++ b/mm/cma.h
@@ -0,0 +1,24 @@
+#ifndef __MM_CMA_H__
+#define __MM_CMA_H__
+
+struct cma {
+	unsigned long	base_pfn;
+	unsigned long	count;
+	unsigned long	*bitmap;
+	unsigned int order_per_bit; /* Order of pages represented by one bit */
+	struct mutex	lock;
+#ifdef CONFIG_CMA_DEBUGFS
+	struct hlist_head mem_head;
+	spinlock_t mem_head_lock;
+#endif
+};
+
+extern struct cma cma_areas[MAX_CMA_AREAS];
+extern unsigned cma_area_count;
+
+static unsigned long cma_bitmap_maxno(struct cma *cma)
+{
+	return cma->count >> cma->order_per_bit;
+}
+
+#endif
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
new file mode 100644
index 000000000000..7621ee34daa0
--- /dev/null
+++ b/mm/cma_debug.c
@@ -0,0 +1,205 @@
+/*
+ * CMA DebugFS Interface
+ *
+ * Copyright (c) 2015 Sasha Levin <sasha.levin@oracle.com>
+ */
+
+
+#include <linux/debugfs.h>
+#include <linux/cma.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm_types.h>
+
+#include "cma.h"
+
+struct cma_mem {
+	struct hlist_node node;
+	struct page *p;
+	unsigned long n;
+};
+
+static struct dentry *cma_debugfs_root;
+
+static int cma_debugfs_get(void *data, u64 *val)
+{
+	unsigned long *p = data;
+
+	*val = *p;
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
+
+static int cma_used_get(void *data, u64 *val)
+{
+	struct cma *cma = data;
+	unsigned long used;
+
+	mutex_lock(&cma->lock);
+	/* pages counter is smaller than sizeof(int) */
+	used = bitmap_weight(cma->bitmap, (int)cma->count);
+	mutex_unlock(&cma->lock);
+	*val = (u64)used << cma->order_per_bit;
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
+
+static int cma_maxchunk_get(void *data, u64 *val)
+{
+	struct cma *cma = data;
+	unsigned long maxchunk = 0;
+	unsigned long start, end = 0;
+
+	mutex_lock(&cma->lock);
+	for (;;) {
+		start = find_next_zero_bit(cma->bitmap, cma->count, end);
+		if (start >= cma->count)
+			break;
+		end = find_next_bit(cma->bitmap, cma->count, start);
+		maxchunk = max(end - start, maxchunk);
+	}
+	mutex_unlock(&cma->lock);
+	*val = (u64)maxchunk << cma->order_per_bit;
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");
+
+static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
+{
+	spin_lock(&cma->mem_head_lock);
+	hlist_add_head(&mem->node, &cma->mem_head);
+	spin_unlock(&cma->mem_head_lock);
+}
+
+static struct cma_mem *cma_get_entry_from_list(struct cma *cma)
+{
+	struct cma_mem *mem = NULL;
+
+	spin_lock(&cma->mem_head_lock);
+	if (!hlist_empty(&cma->mem_head)) {
+		mem = hlist_entry(cma->mem_head.first, struct cma_mem, node);
+		hlist_del_init(&mem->node);
+	}
+	spin_unlock(&cma->mem_head_lock);
+
+	return mem;
+}
+
+static int cma_free_mem(struct cma *cma, int count)
+{
+	struct cma_mem *mem = NULL;
+
+	while (count) {
+		mem = cma_get_entry_from_list(cma);
+		if (mem == NULL)
+			return 0;
+
+		if (mem->n <= count) {
+			cma_release(cma, mem->p, mem->n);
+			count -= mem->n;
+			kfree(mem);
+		} else if (cma->order_per_bit == 0) {
+			cma_release(cma, mem->p, count);
+			mem->p += count;
+			mem->n -= count;
+			count = 0;
+			cma_add_to_cma_mem_list(cma, mem);
+		} else {
+			pr_debug("cma: cannot release partial block when order_per_bit != 0\n");
+			cma_add_to_cma_mem_list(cma, mem);
+			break;
+		}
+	}
+
+	return 0;
+
+}
+
+static int cma_free_write(void *data, u64 val)
+{
+	int pages = val;
+	struct cma *cma = data;
+
+	return cma_free_mem(cma, pages);
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
+
+static int cma_alloc_mem(struct cma *cma, int count)
+{
+	struct cma_mem *mem;
+	struct page *p;
+
+	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+	if (!mem)
+		return -ENOMEM;
+
+	p = cma_alloc(cma, count, 0);
+	if (!p) {
+		kfree(mem);
+		return -ENOMEM;
+	}
+
+	mem->p = p;
+	mem->n = count;
+
+	cma_add_to_cma_mem_list(cma, mem);
+
+	return 0;
+}
+
+static int cma_alloc_write(void *data, u64 val)
+{
+	int pages = val;
+	struct cma *cma = data;
+
+	return cma_alloc_mem(cma, pages);
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
+
+static void cma_debugfs_add_one(struct cma *cma, int idx)
+{
+	struct dentry *tmp;
+	char name[16];
+	int u32s;
+
+	sprintf(name, "cma-%d", idx);
+
+	tmp = debugfs_create_dir(name, cma_debugfs_root);
+
+	debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma,
+				&cma_alloc_fops);
+
+	debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma,
+				&cma_free_fops);
+
+	debugfs_create_file("base_pfn", S_IRUGO, tmp,
+				&cma->base_pfn, &cma_debugfs_fops);
+	debugfs_create_file("count", S_IRUGO, tmp,
+				&cma->count, &cma_debugfs_fops);
+	debugfs_create_file("order_per_bit", S_IRUGO, tmp,
+				&cma->order_per_bit, &cma_debugfs_fops);
+	debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops);
+	debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops);
+
+	u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
+	debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s);
+}
+
+static int __init cma_debugfs_init(void)
+{
+	int i;
+
+	cma_debugfs_root = debugfs_create_dir("cma", NULL);
+	if (!cma_debugfs_root)
+		return -ENOMEM;
+
+	for (i = 0; i < cma_area_count; i++)
+		cma_debugfs_add_one(&cma_areas[i], i);
+
+	return 0;
+}
+late_initcall(cma_debugfs_init);
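
Taken together with the Kconfig and Makefile hunks, this creates a debugfs tree rooted at cma/ (normally mounted under /sys/kernel/debug) with one cma-<idx> directory per area exposing base_pfn, count, order_per_bit, used, maxchunk and the raw bitmap. Note that in this version the write-only alloc and free trigger files are created in the top-level cma/ directory (cma_debugfs_root) rather than in the per-area directory. A small illustrative userspace reader, assuming debugfs is mounted at /sys/kernel/debug and at least one CMA area exists:

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/cma/cma-0/used";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	/* cma_used_get() reports the value in pages (bits << order_per_bit). */
	if (fgets(buf, sizeof(buf), f))
		printf("cma-0 pages in use: %s", buf);
	fclose(f);
	return 0;
}
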
diff --git a/mm/compaction.c b/mm/compaction.c
index 8c0d9459b54a..018f08da99a2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -391,28 +391,6 @@ static inline bool compact_should_abort(struct compact_control *cc)
391 return false; 391 return false;
392} 392}
393 393
394/* Returns true if the page is within a block suitable for migration to */
395static bool suitable_migration_target(struct page *page)
396{
397 /* If the page is a large free page, then disallow migration */
398 if (PageBuddy(page)) {
399 /*
400 * We are checking page_order without zone->lock taken. But
401 * the only small danger is that we skip a potentially suitable
402 * pageblock, so it's not worth to check order for valid range.
403 */
404 if (page_order_unsafe(page) >= pageblock_order)
405 return false;
406 }
407
408 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
409 if (migrate_async_suitable(get_pageblock_migratetype(page)))
410 return true;
411
412 /* Otherwise skip the block */
413 return false;
414}
415
416/* 394/*
417 * Isolate free pages onto a private freelist. If @strict is true, will abort 395 * Isolate free pages onto a private freelist. If @strict is true, will abort
418 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock 396 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
@@ -896,6 +874,29 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
896 874
897#endif /* CONFIG_COMPACTION || CONFIG_CMA */ 875#endif /* CONFIG_COMPACTION || CONFIG_CMA */
898#ifdef CONFIG_COMPACTION 876#ifdef CONFIG_COMPACTION
877
878/* Returns true if the page is within a block suitable for migration to */
879static bool suitable_migration_target(struct page *page)
880{
881 /* If the page is a large free page, then disallow migration */
882 if (PageBuddy(page)) {
883 /*
884 * We are checking page_order without zone->lock taken. But
885 * the only small danger is that we skip a potentially suitable
886 * pageblock, so it's not worth to check order for valid range.
887 */
888 if (page_order_unsafe(page) >= pageblock_order)
889 return false;
890 }
891
892 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
893 if (migrate_async_suitable(get_pageblock_migratetype(page)))
894 return true;
895
896 /* Otherwise skip the block */
897 return false;
898}
899
899/* 900/*
900 * Based on information in the current compact_control, find blocks 901 * Based on information in the current compact_control, find blocks
901 * suitable for isolating free pages from and then isolate them. 902 * suitable for isolating free pages from and then isolate them.
@@ -1047,6 +1048,12 @@ typedef enum {
1047} isolate_migrate_t; 1048} isolate_migrate_t;
1048 1049
1049/* 1050/*
1051 * Allow userspace to control policy on scanning the unevictable LRU for
1052 * compactable pages.
1053 */
1054int sysctl_compact_unevictable_allowed __read_mostly = 1;
1055
1056/*
1050 * Isolate all pages that can be migrated from the first suitable block, 1057 * Isolate all pages that can be migrated from the first suitable block,
1051 * starting at the block pointed to by the migrate scanner pfn within 1058 * starting at the block pointed to by the migrate scanner pfn within
1052 * compact_control. 1059 * compact_control.
@@ -1057,6 +1064,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1057 unsigned long low_pfn, end_pfn; 1064 unsigned long low_pfn, end_pfn;
1058 struct page *page; 1065 struct page *page;
1059 const isolate_mode_t isolate_mode = 1066 const isolate_mode_t isolate_mode =
1067 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
1060 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); 1068 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
1061 1069
1062 /* 1070 /*
@@ -1174,13 +1182,24 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
1174 /* Direct compactor: Is a suitable page free? */ 1182 /* Direct compactor: Is a suitable page free? */
1175 for (order = cc->order; order < MAX_ORDER; order++) { 1183 for (order = cc->order; order < MAX_ORDER; order++) {
1176 struct free_area *area = &zone->free_area[order]; 1184 struct free_area *area = &zone->free_area[order];
1185 bool can_steal;
1177 1186
1178 /* Job done if page is free of the right migratetype */ 1187 /* Job done if page is free of the right migratetype */
1179 if (!list_empty(&area->free_list[migratetype])) 1188 if (!list_empty(&area->free_list[migratetype]))
1180 return COMPACT_PARTIAL; 1189 return COMPACT_PARTIAL;
1181 1190
1182 /* Job done if allocation would set block type */ 1191#ifdef CONFIG_CMA
1183 if (order >= pageblock_order && area->nr_free) 1192 /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
1193 if (migratetype == MIGRATE_MOVABLE &&
1194 !list_empty(&area->free_list[MIGRATE_CMA]))
1195 return COMPACT_PARTIAL;
1196#endif
1197 /*
1198 * Job done if allocation would steal freepages from
1199 * other migratetype buddy lists.
1200 */
1201 if (find_suitable_fallback(area, order, migratetype,
1202 true, &can_steal) != -1)
1184 return COMPACT_PARTIAL; 1203 return COMPACT_PARTIAL;
1185 } 1204 }
1186 1205
@@ -1587,6 +1606,14 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1587 INIT_LIST_HEAD(&cc->freepages); 1606 INIT_LIST_HEAD(&cc->freepages);
1588 INIT_LIST_HEAD(&cc->migratepages); 1607 INIT_LIST_HEAD(&cc->migratepages);
1589 1608
1609 /*
1610 * When called via /proc/sys/vm/compact_memory
1611 * this makes sure we compact the whole zone regardless of
1612 * cached scanner positions.
1613 */
1614 if (cc->order == -1)
1615 __reset_isolation_suitable(zone);
1616
1590 if (cc->order == -1 || !compaction_deferred(zone, cc->order)) 1617 if (cc->order == -1 || !compaction_deferred(zone, cc->order))
1591 compact_zone(zone, cc); 1618 compact_zone(zone, cc);
1592 1619
diff --git a/mm/filemap.c b/mm/filemap.c
index ad7242043bdb..6bf5e42d560a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -13,7 +13,6 @@
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/aio.h>
17#include <linux/capability.h> 16#include <linux/capability.h>
18#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
19#include <linux/gfp.h> 18#include <linux/gfp.h>
@@ -203,16 +202,15 @@ void __delete_from_page_cache(struct page *page, void *shadow)
203 BUG_ON(page_mapped(page)); 202 BUG_ON(page_mapped(page));
204 203
205 /* 204 /*
206 * Some filesystems seem to re-dirty the page even after 205 * At this point page must be either written or cleaned by truncate.
207 * the VM has canceled the dirty bit (eg ext3 journaling). 206 * Dirty page here signals a bug and loss of unwritten data.
208 * 207 *
209 * Fix it up by doing a final dirty accounting check after 208 * This fixes dirty accounting after removing the page entirely but
210 * having removed the page entirely. 209 * leaves PageDirty set: it has no effect for truncated page and
210 * anyway will be cleared before returning page into buddy allocator.
211 */ 211 */
212 if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { 212 if (WARN_ON_ONCE(PageDirty(page)))
213 dec_zone_page_state(page, NR_FILE_DIRTY); 213 account_page_cleaned(page, mapping);
214 dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
215 }
216} 214}
217 215
218/** 216/**
@@ -1695,7 +1693,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
1695 loff_t *ppos = &iocb->ki_pos; 1693 loff_t *ppos = &iocb->ki_pos;
1696 loff_t pos = *ppos; 1694 loff_t pos = *ppos;
1697 1695
1698 if (io_is_direct(file)) { 1696 if (iocb->ki_flags & IOCB_DIRECT) {
1699 struct address_space *mapping = file->f_mapping; 1697 struct address_space *mapping = file->f_mapping;
1700 struct inode *inode = mapping->host; 1698 struct inode *inode = mapping->host;
1701 size_t count = iov_iter_count(iter); 1699 size_t count = iov_iter_count(iter);
@@ -1708,7 +1706,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
1708 pos + count - 1); 1706 pos + count - 1);
1709 if (!retval) { 1707 if (!retval) {
1710 struct iov_iter data = *iter; 1708 struct iov_iter data = *iter;
1711 retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos); 1709 retval = mapping->a_ops->direct_IO(iocb, &data, pos);
1712 } 1710 }
1713 1711
1714 if (retval > 0) { 1712 if (retval > 0) {
@@ -2261,41 +2259,38 @@ EXPORT_SYMBOL(read_cache_page_gfp);
2261 * Returns appropriate error code that caller should return or 2259 * Returns appropriate error code that caller should return or
2262 * zero in case that write should be allowed. 2260 * zero in case that write should be allowed.
2263 */ 2261 */
2264inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) 2262inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
2265{ 2263{
2264 struct file *file = iocb->ki_filp;
2266 struct inode *inode = file->f_mapping->host; 2265 struct inode *inode = file->f_mapping->host;
2267 unsigned long limit = rlimit(RLIMIT_FSIZE); 2266 unsigned long limit = rlimit(RLIMIT_FSIZE);
2267 loff_t pos;
2268 2268
2269 if (unlikely(*pos < 0)) 2269 if (!iov_iter_count(from))
2270 return -EINVAL; 2270 return 0;
2271 2271
2272 if (!isblk) { 2272 /* FIXME: this is for backwards compatibility with 2.4 */
2273 /* FIXME: this is for backwards compatibility with 2.4 */ 2273 if (iocb->ki_flags & IOCB_APPEND)
2274 if (file->f_flags & O_APPEND) 2274 iocb->ki_pos = i_size_read(inode);
2275 *pos = i_size_read(inode);
2276 2275
2277 if (limit != RLIM_INFINITY) { 2276 pos = iocb->ki_pos;
2278 if (*pos >= limit) { 2277
2279 send_sig(SIGXFSZ, current, 0); 2278 if (limit != RLIM_INFINITY) {
2280 return -EFBIG; 2279 if (iocb->ki_pos >= limit) {
2281 } 2280 send_sig(SIGXFSZ, current, 0);
2282 if (*count > limit - (typeof(limit))*pos) { 2281 return -EFBIG;
2283 *count = limit - (typeof(limit))*pos;
2284 }
2285 } 2282 }
2283 iov_iter_truncate(from, limit - (unsigned long)pos);
2286 } 2284 }
2287 2285
2288 /* 2286 /*
2289 * LFS rule 2287 * LFS rule
2290 */ 2288 */
2291 if (unlikely(*pos + *count > MAX_NON_LFS && 2289 if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
2292 !(file->f_flags & O_LARGEFILE))) { 2290 !(file->f_flags & O_LARGEFILE))) {
2293 if (*pos >= MAX_NON_LFS) { 2291 if (pos >= MAX_NON_LFS)
2294 return -EFBIG; 2292 return -EFBIG;
2295 } 2293 iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
2296 if (*count > MAX_NON_LFS - (unsigned long)*pos) {
2297 *count = MAX_NON_LFS - (unsigned long)*pos;
2298 }
2299 } 2294 }
2300 2295
2301 /* 2296 /*
@@ -2305,34 +2300,11 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
2305 * exceeded without writing data we send a signal and return EFBIG. 2300 * exceeded without writing data we send a signal and return EFBIG.
2306 * Linus frestrict idea will clean these up nicely.. 2301 * Linus frestrict idea will clean these up nicely..
2307 */ 2302 */
2308 if (likely(!isblk)) { 2303 if (unlikely(pos >= inode->i_sb->s_maxbytes))
2309 if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { 2304 return -EFBIG;
2310 if (*count || *pos > inode->i_sb->s_maxbytes) {
2311 return -EFBIG;
2312 }
2313 /* zero-length writes at ->s_maxbytes are OK */
2314 }
2315
2316 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
2317 *count = inode->i_sb->s_maxbytes - *pos;
2318 } else {
2319#ifdef CONFIG_BLOCK
2320 loff_t isize;
2321 if (bdev_read_only(I_BDEV(inode)))
2322 return -EPERM;
2323 isize = i_size_read(inode);
2324 if (*pos >= isize) {
2325 if (*count || *pos > isize)
2326 return -ENOSPC;
2327 }
2328 2305
2329 if (*pos + *count > isize) 2306 iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
2330 *count = isize - *pos; 2307 return iov_iter_count(from);
2331#else
2332 return -EPERM;
2333#endif
2334 }
2335 return 0;
2336} 2308}
2337EXPORT_SYMBOL(generic_write_checks); 2309EXPORT_SYMBOL(generic_write_checks);
2338 2310
@@ -2396,7 +2368,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
2396 } 2368 }
2397 2369
2398 data = *from; 2370 data = *from;
2399 written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos); 2371 written = mapping->a_ops->direct_IO(iocb, &data, pos);
2400 2372
2401 /* 2373 /*
2402 * Finally, try again to invalidate clean pages which might have been 2374 * Finally, try again to invalidate clean pages which might have been
@@ -2558,23 +2530,12 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2558 struct file *file = iocb->ki_filp; 2530 struct file *file = iocb->ki_filp;
2559 struct address_space * mapping = file->f_mapping; 2531 struct address_space * mapping = file->f_mapping;
2560 struct inode *inode = mapping->host; 2532 struct inode *inode = mapping->host;
2561 loff_t pos = iocb->ki_pos;
2562 ssize_t written = 0; 2533 ssize_t written = 0;
2563 ssize_t err; 2534 ssize_t err;
2564 ssize_t status; 2535 ssize_t status;
2565 size_t count = iov_iter_count(from);
2566 2536
2567 /* We can write back this queue in page reclaim */ 2537 /* We can write back this queue in page reclaim */
2568 current->backing_dev_info = inode_to_bdi(inode); 2538 current->backing_dev_info = inode_to_bdi(inode);
2569 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2570 if (err)
2571 goto out;
2572
2573 if (count == 0)
2574 goto out;
2575
2576 iov_iter_truncate(from, count);
2577
2578 err = file_remove_suid(file); 2539 err = file_remove_suid(file);
2579 if (err) 2540 if (err)
2580 goto out; 2541 goto out;
@@ -2583,10 +2544,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2583 if (err) 2544 if (err)
2584 goto out; 2545 goto out;
2585 2546
2586 if (io_is_direct(file)) { 2547 if (iocb->ki_flags & IOCB_DIRECT) {
2587 loff_t endbyte; 2548 loff_t pos, endbyte;
2588 2549
2589 written = generic_file_direct_write(iocb, from, pos); 2550 written = generic_file_direct_write(iocb, from, iocb->ki_pos);
2590 /* 2551 /*
2591 * If the write stopped short of completing, fall back to 2552 * If the write stopped short of completing, fall back to
2592 * buffered writes. Some filesystems do this for writes to 2553 * buffered writes. Some filesystems do this for writes to
@@ -2594,13 +2555,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2594 * not succeed (even if it did, DAX does not handle dirty 2555 * not succeed (even if it did, DAX does not handle dirty
2595 * page-cache pages correctly). 2556 * page-cache pages correctly).
2596 */ 2557 */
2597 if (written < 0 || written == count || IS_DAX(inode)) 2558 if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
2598 goto out; 2559 goto out;
2599 2560
2600 pos += written; 2561 status = generic_perform_write(file, from, pos = iocb->ki_pos);
2601 count -= written;
2602
2603 status = generic_perform_write(file, from, pos);
2604 /* 2562 /*
2605 * If generic_perform_write() returned a synchronous error 2563 * If generic_perform_write() returned a synchronous error
2606 * then we want to return the number of bytes which were 2564 * then we want to return the number of bytes which were
@@ -2612,15 +2570,15 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2612 err = status; 2570 err = status;
2613 goto out; 2571 goto out;
2614 } 2572 }
2615 iocb->ki_pos = pos + status;
2616 /* 2573 /*
2617 * We need to ensure that the page cache pages are written to 2574 * We need to ensure that the page cache pages are written to
2618 * disk and invalidated to preserve the expected O_DIRECT 2575 * disk and invalidated to preserve the expected O_DIRECT
2619 * semantics. 2576 * semantics.
2620 */ 2577 */
2621 endbyte = pos + status - 1; 2578 endbyte = pos + status - 1;
2622 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); 2579 err = filemap_write_and_wait_range(mapping, pos, endbyte);
2623 if (err == 0) { 2580 if (err == 0) {
2581 iocb->ki_pos = endbyte + 1;
2624 written += status; 2582 written += status;
2625 invalidate_mapping_pages(mapping, 2583 invalidate_mapping_pages(mapping,
2626 pos >> PAGE_CACHE_SHIFT, 2584 pos >> PAGE_CACHE_SHIFT,
@@ -2632,9 +2590,9 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2632 */ 2590 */
2633 } 2591 }
2634 } else { 2592 } else {
2635 written = generic_perform_write(file, from, pos); 2593 written = generic_perform_write(file, from, iocb->ki_pos);
2636 if (likely(written >= 0)) 2594 if (likely(written > 0))
2637 iocb->ki_pos = pos + written; 2595 iocb->ki_pos += written;
2638 } 2596 }
2639out: 2597out:
2640 current->backing_dev_info = NULL; 2598 current->backing_dev_info = NULL;
@@ -2658,7 +2616,9 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2658 ssize_t ret; 2616 ssize_t ret;
2659 2617
2660 mutex_lock(&inode->i_mutex); 2618 mutex_lock(&inode->i_mutex);
2661 ret = __generic_file_write_iter(iocb, from); 2619 ret = generic_write_checks(iocb, from);
2620 if (ret > 0)
2621 ret = __generic_file_write_iter(iocb, from);
2662 mutex_unlock(&inode->i_mutex); 2622 mutex_unlock(&inode->i_mutex);
2663 2623
2664 if (ret > 0) { 2624 if (ret > 0) {
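
The net effect of the filemap.c changes is that generic_write_checks() now works on the (kiocb, iov_iter) pair: it applies the O_APPEND, RLIMIT_FSIZE, LFS and s_maxbytes limits by truncating the iterator, and returns the number of bytes that may be written, 0 for an empty write, or a negative error, while the block-device special case moves out of this helper. An illustrative sketch of the calling convention a filesystem's ->write_iter follows after this change (myfs is a placeholder name; the pattern mirrors the generic_file_write_iter() hunk above):

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	mutex_lock(&inode->i_mutex);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)			/* something left to write after clamping */
		ret = __generic_file_write_iter(iocb, from);
	mutex_unlock(&inode->i_mutex);
	return ret;
}
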
diff --git a/mm/gup.c b/mm/gup.c
index a6e24e246f86..6297f6bccfb1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -92,7 +92,7 @@ retry:
92 */ 92 */
93 mark_page_accessed(page); 93 mark_page_accessed(page);
94 } 94 }
95 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 95 if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
96 /* 96 /*
97 * The preliminary mapping check is mainly to avoid the 97 * The preliminary mapping check is mainly to avoid the
98 * pointless overhead of lock_page on the ZERO_PAGE 98 * pointless overhead of lock_page on the ZERO_PAGE
@@ -265,8 +265,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
265 unsigned int fault_flags = 0; 265 unsigned int fault_flags = 0;
266 int ret; 266 int ret;
267 267
268 /* For mlock, just skip the stack guard page. */ 268 /* For mm_populate(), just skip the stack guard page. */
269 if ((*flags & FOLL_MLOCK) && 269 if ((*flags & FOLL_POPULATE) &&
270 (stack_guard_page_start(vma, address) || 270 (stack_guard_page_start(vma, address) ||
271 stack_guard_page_end(vma, address + PAGE_SIZE))) 271 stack_guard_page_end(vma, address + PAGE_SIZE)))
272 return -ENOENT; 272 return -ENOENT;
@@ -819,6 +819,124 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
819EXPORT_SYMBOL(get_user_pages); 819EXPORT_SYMBOL(get_user_pages);
820 820
821/** 821/**
822 * populate_vma_page_range() - populate a range of pages in the vma.
823 * @vma: target vma
824 * @start: start address
825 * @end: end address
826 * @nonblocking:
827 *
828 * This takes care of mlocking the pages too if VM_LOCKED is set.
829 *
830 * return 0 on success, negative error code on error.
831 *
832 * vma->vm_mm->mmap_sem must be held.
833 *
834 * If @nonblocking is NULL, it may be held for read or write and will
835 * be unperturbed.
836 *
837 * If @nonblocking is non-NULL, it must held for read only and may be
838 * released. If it's released, *@nonblocking will be set to 0.
839 */
840long populate_vma_page_range(struct vm_area_struct *vma,
841 unsigned long start, unsigned long end, int *nonblocking)
842{
843 struct mm_struct *mm = vma->vm_mm;
844 unsigned long nr_pages = (end - start) / PAGE_SIZE;
845 int gup_flags;
846
847 VM_BUG_ON(start & ~PAGE_MASK);
848 VM_BUG_ON(end & ~PAGE_MASK);
849 VM_BUG_ON_VMA(start < vma->vm_start, vma);
850 VM_BUG_ON_VMA(end > vma->vm_end, vma);
851 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
852
853 gup_flags = FOLL_TOUCH | FOLL_POPULATE;
854 /*
855 * We want to touch writable mappings with a write fault in order
856 * to break COW, except for shared mappings because these don't COW
857 * and we would not want to dirty them for nothing.
858 */
859 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
860 gup_flags |= FOLL_WRITE;
861
862 /*
863 * We want mlock to succeed for regions that have any permissions
864 * other than PROT_NONE.
865 */
866 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
867 gup_flags |= FOLL_FORCE;
868
869 /*
870 * We made sure addr is within a VMA, so the following will
871 * not result in a stack expansion that recurses back here.
872 */
873 return __get_user_pages(current, mm, start, nr_pages, gup_flags,
874 NULL, NULL, nonblocking);
875}
876
877/*
878 * __mm_populate - populate and/or mlock pages within a range of address space.
879 *
880 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
881 * flags. VMAs must be already marked with the desired vm_flags, and
882 * mmap_sem must not be held.
883 */
884int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
885{
886 struct mm_struct *mm = current->mm;
887 unsigned long end, nstart, nend;
888 struct vm_area_struct *vma = NULL;
889 int locked = 0;
890 long ret = 0;
891
892 VM_BUG_ON(start & ~PAGE_MASK);
893 VM_BUG_ON(len != PAGE_ALIGN(len));
894 end = start + len;
895
896 for (nstart = start; nstart < end; nstart = nend) {
897 /*
898 * We want to fault in pages for [nstart; end) address range.
899 * Find first corresponding VMA.
900 */
901 if (!locked) {
902 locked = 1;
903 down_read(&mm->mmap_sem);
904 vma = find_vma(mm, nstart);
905 } else if (nstart >= vma->vm_end)
906 vma = vma->vm_next;
907 if (!vma || vma->vm_start >= end)
908 break;
909 /*
910 * Set [nstart; nend) to intersection of desired address
911 * range with the first VMA. Also, skip undesirable VMA types.
912 */
913 nend = min(end, vma->vm_end);
914 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
915 continue;
916 if (nstart < vma->vm_start)
917 nstart = vma->vm_start;
918 /*
919 * Now fault in a range of pages. populate_vma_page_range()
920 * double checks the vma flags, so that it won't mlock pages
921 * if the vma was already munlocked.
922 */
923 ret = populate_vma_page_range(vma, nstart, nend, &locked);
924 if (ret < 0) {
925 if (ignore_errors) {
926 ret = 0;
927 continue; /* continue at next VMA */
928 }
929 break;
930 }
931 nend = nstart + ret * PAGE_SIZE;
932 ret = 0;
933 }
934 if (locked)
935 up_read(&mm->mmap_sem);
936 return ret; /* 0 or negative error code */
937}
938
939/**
822 * get_dump_page() - pin user page in memory while writing it to core dump 940 * get_dump_page() - pin user page in memory while writing it to core dump
823 * @addr: user address 941 * @addr: user address
824 * 942 *
@@ -901,7 +1019,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
901 * 1019 *
902 * for an example see gup_get_pte in arch/x86/mm/gup.c 1020 * for an example see gup_get_pte in arch/x86/mm/gup.c
903 */ 1021 */
904 pte_t pte = ACCESS_ONCE(*ptep); 1022 pte_t pte = READ_ONCE(*ptep);
905 struct page *page; 1023 struct page *page;
906 1024
907 /* 1025 /*
@@ -1191,7 +1309,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1191 local_irq_save(flags); 1309 local_irq_save(flags);
1192 pgdp = pgd_offset(mm, addr); 1310 pgdp = pgd_offset(mm, addr);
1193 do { 1311 do {
1194 pgd_t pgd = ACCESS_ONCE(*pgdp); 1312 pgd_t pgd = READ_ONCE(*pgdp);
1195 1313
1196 next = pgd_addr_end(addr, end); 1314 next = pgd_addr_end(addr, end);
1197 if (pgd_none(pgd)) 1315 if (pgd_none(pgd))
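
As the comments in the new gup.c code state, __mm_populate() backs mlock() and the MAP_POPULATE / MAP_LOCKED mmap flags, with populate_vma_page_range() doing the per-VMA faulting (and mlocking when VM_LOCKED is set). An illustrative userspace sketch of the two entry points that end up in this code:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	char *p;

	/* Fault the pages in up front: MAP_POPULATE goes through __mm_populate(). */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Pin the range as well: mlock() also ends up in __mm_populate(). */
	if (mlock(p, len))
		perror("mlock");

	memset(p, 0, len);
	munmap(p, len);
	return 0;
}
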
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fc00c8cb5a82..078832cf3636 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
67 67
68static int khugepaged(void *none); 68static int khugepaged(void *none);
69static int khugepaged_slab_init(void); 69static int khugepaged_slab_init(void);
70static void khugepaged_slab_exit(void);
70 71
71#define MM_SLOTS_HASH_BITS 10 72#define MM_SLOTS_HASH_BITS 10
72static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); 73static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void)
109 int nr_zones = 0; 110 int nr_zones = 0;
110 unsigned long recommended_min; 111 unsigned long recommended_min;
111 112
112 if (!khugepaged_enabled())
113 return 0;
114
115 for_each_populated_zone(zone) 113 for_each_populated_zone(zone)
116 nr_zones++; 114 nr_zones++;
117 115
@@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void)
143 setup_per_zone_wmarks(); 141 setup_per_zone_wmarks();
144 return 0; 142 return 0;
145} 143}
146late_initcall(set_recommended_min_free_kbytes);
147 144
148static int start_khugepaged(void) 145static int start_stop_khugepaged(void)
149{ 146{
150 int err = 0; 147 int err = 0;
151 if (khugepaged_enabled()) { 148 if (khugepaged_enabled()) {
@@ -156,6 +153,7 @@ static int start_khugepaged(void)
156 pr_err("khugepaged: kthread_run(khugepaged) failed\n"); 153 pr_err("khugepaged: kthread_run(khugepaged) failed\n");
157 err = PTR_ERR(khugepaged_thread); 154 err = PTR_ERR(khugepaged_thread);
158 khugepaged_thread = NULL; 155 khugepaged_thread = NULL;
156 goto fail;
159 } 157 }
160 158
161 if (!list_empty(&khugepaged_scan.mm_head)) 159 if (!list_empty(&khugepaged_scan.mm_head))
@@ -166,7 +164,7 @@ static int start_khugepaged(void)
166 kthread_stop(khugepaged_thread); 164 kthread_stop(khugepaged_thread);
167 khugepaged_thread = NULL; 165 khugepaged_thread = NULL;
168 } 166 }
169 167fail:
170 return err; 168 return err;
171} 169}
172 170
@@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void)
183 struct page *zero_page; 181 struct page *zero_page;
184retry: 182retry:
185 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 183 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
186 return ACCESS_ONCE(huge_zero_page); 184 return READ_ONCE(huge_zero_page);
187 185
188 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 186 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
189 HPAGE_PMD_ORDER); 187 HPAGE_PMD_ORDER);
@@ -202,7 +200,7 @@ retry:
202 /* We take additional reference here. It will be put back by shrinker */ 200 /* We take additional reference here. It will be put back by shrinker */
203 atomic_set(&huge_zero_refcount, 2); 201 atomic_set(&huge_zero_refcount, 2);
204 preempt_enable(); 202 preempt_enable();
205 return ACCESS_ONCE(huge_zero_page); 203 return READ_ONCE(huge_zero_page);
206} 204}
207 205
208static void put_huge_zero_page(void) 206static void put_huge_zero_page(void)
@@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj,
300 int err; 298 int err;
301 299
302 mutex_lock(&khugepaged_mutex); 300 mutex_lock(&khugepaged_mutex);
303 err = start_khugepaged(); 301 err = start_stop_khugepaged();
304 mutex_unlock(&khugepaged_mutex); 302 mutex_unlock(&khugepaged_mutex);
305 303
306 if (err) 304 if (err)
@@ -634,27 +632,38 @@ static int __init hugepage_init(void)
634 632
635 err = hugepage_init_sysfs(&hugepage_kobj); 633 err = hugepage_init_sysfs(&hugepage_kobj);
636 if (err) 634 if (err)
637 return err; 635 goto err_sysfs;
638 636
639 err = khugepaged_slab_init(); 637 err = khugepaged_slab_init();
640 if (err) 638 if (err)
641 goto out; 639 goto err_slab;
642 640
643 register_shrinker(&huge_zero_page_shrinker); 641 err = register_shrinker(&huge_zero_page_shrinker);
642 if (err)
643 goto err_hzp_shrinker;
644 644
645 /* 645 /*
646 * By default disable transparent hugepages on smaller systems, 646 * By default disable transparent hugepages on smaller systems,
647 * where the extra memory used could hurt more than TLB overhead 647 * where the extra memory used could hurt more than TLB overhead
648 * is likely to save. The admin can still enable it through /sys. 648 * is likely to save. The admin can still enable it through /sys.
649 */ 649 */
650 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) 650 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
651 transparent_hugepage_flags = 0; 651 transparent_hugepage_flags = 0;
652 return 0;
653 }
652 654
653 start_khugepaged(); 655 err = start_stop_khugepaged();
656 if (err)
657 goto err_khugepaged;
654 658
655 return 0; 659 return 0;
656out: 660err_khugepaged:
661 unregister_shrinker(&huge_zero_page_shrinker);
662err_hzp_shrinker:
663 khugepaged_slab_exit();
664err_slab:
657 hugepage_exit_sysfs(hugepage_kobj); 665 hugepage_exit_sysfs(hugepage_kobj);
666err_sysfs:
658 return err; 667 return err;
659} 668}
660subsys_initcall(hugepage_init); 669subsys_initcall(hugepage_init);
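hugepage_init() above now unwinds failures with a goto ladder: each label undoes only the steps that had already succeeded, in reverse order. A generic sketch of that shape; the step and teardown functions are placeholders standing in for the sysfs, slab-cache and shrinker steps, not kernel symbols:

#include <linux/init.h>

/* Placeholder steps (assumptions, not kernel symbols). */
static int setup_sysfs(void)     { return 0; }
static int setup_slab(void)      { return 0; }
static int setup_shrinker(void)  { return 0; }
static void teardown_slab(void)  { }
static void teardown_sysfs(void) { }

static int __init example_init(void)
{
	int err;

	err = setup_sysfs();
	if (err)
		goto err_sysfs;
	err = setup_slab();
	if (err)
		goto err_slab;
	err = setup_shrinker();
	if (err)
		goto err_shrinker;
	return 0;

err_shrinker:
	teardown_slab();	/* undo the step before the one that failed */
err_slab:
	teardown_sysfs();
err_sysfs:
	return err;
}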
@@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
708static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 717static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
709 struct vm_area_struct *vma, 718 struct vm_area_struct *vma,
710 unsigned long haddr, pmd_t *pmd, 719 unsigned long haddr, pmd_t *pmd,
711 struct page *page) 720 struct page *page, gfp_t gfp)
712{ 721{
713 struct mem_cgroup *memcg; 722 struct mem_cgroup *memcg;
714 pgtable_t pgtable; 723 pgtable_t pgtable;
@@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
716 725
717 VM_BUG_ON_PAGE(!PageCompound(page), page); 726 VM_BUG_ON_PAGE(!PageCompound(page), page);
718 727
719 if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) 728 if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
720 return VM_FAULT_OOM; 729 return VM_FAULT_OOM;
721 730
722 pgtable = pte_alloc_one(mm, haddr); 731 pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
822 count_vm_event(THP_FAULT_FALLBACK); 831 count_vm_event(THP_FAULT_FALLBACK);
823 return VM_FAULT_FALLBACK; 832 return VM_FAULT_FALLBACK;
824 } 833 }
825 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { 834 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
826 put_page(page); 835 put_page(page);
827 count_vm_event(THP_FAULT_FALLBACK); 836 count_vm_event(THP_FAULT_FALLBACK);
828 return VM_FAULT_FALLBACK; 837 return VM_FAULT_FALLBACK;
@@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1080 unsigned long haddr; 1089 unsigned long haddr;
1081 unsigned long mmun_start; /* For mmu_notifiers */ 1090 unsigned long mmun_start; /* For mmu_notifiers */
1082 unsigned long mmun_end; /* For mmu_notifiers */ 1091 unsigned long mmun_end; /* For mmu_notifiers */
1092 gfp_t huge_gfp; /* for allocation and charge */
1083 1093
1084 ptl = pmd_lockptr(mm, pmd); 1094 ptl = pmd_lockptr(mm, pmd);
1085 VM_BUG_ON_VMA(!vma->anon_vma, vma); 1095 VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1106alloc: 1116alloc:
1107 if (transparent_hugepage_enabled(vma) && 1117 if (transparent_hugepage_enabled(vma) &&
1108 !transparent_hugepage_debug_cow()) { 1118 !transparent_hugepage_debug_cow()) {
1109 gfp_t gfp; 1119 huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
1110 1120 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
1111 gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
1112 new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
1113 } else 1121 } else
1114 new_page = NULL; 1122 new_page = NULL;
1115 1123
@@ -1130,8 +1138,7 @@ alloc:
1130 goto out; 1138 goto out;
1131 } 1139 }
1132 1140
1133 if (unlikely(mem_cgroup_try_charge(new_page, mm, 1141 if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
1134 GFP_TRANSHUGE, &memcg))) {
1135 put_page(new_page); 1142 put_page(new_page);
1136 if (page) { 1143 if (page) {
1137 split_huge_page(page); 1144 split_huge_page(page);
@@ -1231,7 +1238,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1231 pmd, _pmd, 1)) 1238 pmd, _pmd, 1))
1232 update_mmu_cache_pmd(vma, addr, pmd); 1239 update_mmu_cache_pmd(vma, addr, pmd);
1233 } 1240 }
1234 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 1241 if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
1235 if (page->mapping && trylock_page(page)) { 1242 if (page->mapping && trylock_page(page)) {
1236 lru_add_drain(); 1243 lru_add_drain();
1237 if (page->mapping) 1244 if (page->mapping)
@@ -1260,6 +1267,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1260 int target_nid, last_cpupid = -1; 1267 int target_nid, last_cpupid = -1;
1261 bool page_locked; 1268 bool page_locked;
1262 bool migrated = false; 1269 bool migrated = false;
1270 bool was_writable;
1263 int flags = 0; 1271 int flags = 0;
1264 1272
1265 /* A PROT_NONE fault should not end up here */ 1273 /* A PROT_NONE fault should not end up here */
@@ -1291,12 +1299,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1291 flags |= TNF_FAULT_LOCAL; 1299 flags |= TNF_FAULT_LOCAL;
1292 } 1300 }
1293 1301
1294 /* 1302 /* See similar comment in do_numa_page for explanation */
1295 * Avoid grouping on DSO/COW pages in specific and RO pages 1303 if (!(vma->vm_flags & VM_WRITE))
1296 * in general, RO pages shouldn't hurt as much anyway since
1297 * they can be in shared cache state.
1298 */
1299 if (!pmd_write(pmd))
1300 flags |= TNF_NO_GROUP; 1304 flags |= TNF_NO_GROUP;
1301 1305
1302 /* 1306 /*
@@ -1353,12 +1357,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1353 if (migrated) { 1357 if (migrated) {
1354 flags |= TNF_MIGRATED; 1358 flags |= TNF_MIGRATED;
1355 page_nid = target_nid; 1359 page_nid = target_nid;
1356 } 1360 } else
1361 flags |= TNF_MIGRATE_FAIL;
1357 1362
1358 goto out; 1363 goto out;
1359clear_pmdnuma: 1364clear_pmdnuma:
1360 BUG_ON(!PageLocked(page)); 1365 BUG_ON(!PageLocked(page));
1366 was_writable = pmd_write(pmd);
1361 pmd = pmd_modify(pmd, vma->vm_page_prot); 1367 pmd = pmd_modify(pmd, vma->vm_page_prot);
1368 pmd = pmd_mkyoung(pmd);
1369 if (was_writable)
1370 pmd = pmd_mkwrite(pmd);
1362 set_pmd_at(mm, haddr, pmdp, pmd); 1371 set_pmd_at(mm, haddr, pmdp, pmd);
1363 update_mmu_cache_pmd(vma, addr, pmdp); 1372 update_mmu_cache_pmd(vma, addr, pmdp);
1364 unlock_page(page); 1373 unlock_page(page);
@@ -1482,6 +1491,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1482 1491
1483 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1492 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1484 pmd_t entry; 1493 pmd_t entry;
1494 bool preserve_write = prot_numa && pmd_write(*pmd);
1495 ret = 1;
1485 1496
1486 /* 1497 /*
1487 * Avoid trapping faults against the zero page. The read-only 1498 * Avoid trapping faults against the zero page. The read-only
@@ -1490,16 +1501,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1490 */ 1501 */
1491 if (prot_numa && is_huge_zero_pmd(*pmd)) { 1502 if (prot_numa && is_huge_zero_pmd(*pmd)) {
1492 spin_unlock(ptl); 1503 spin_unlock(ptl);
1493 return 0; 1504 return ret;
1494 } 1505 }
1495 1506
1496 if (!prot_numa || !pmd_protnone(*pmd)) { 1507 if (!prot_numa || !pmd_protnone(*pmd)) {
1497 ret = 1;
1498 entry = pmdp_get_and_clear_notify(mm, addr, pmd); 1508 entry = pmdp_get_and_clear_notify(mm, addr, pmd);
1499 entry = pmd_modify(entry, newprot); 1509 entry = pmd_modify(entry, newprot);
1510 if (preserve_write)
1511 entry = pmd_mkwrite(entry);
1500 ret = HPAGE_PMD_NR; 1512 ret = HPAGE_PMD_NR;
1501 set_pmd_at(mm, addr, pmd, entry); 1513 set_pmd_at(mm, addr, pmd, entry);
1502 BUG_ON(pmd_write(entry)); 1514 BUG_ON(!preserve_write && pmd_write(entry));
1503 } 1515 }
1504 spin_unlock(ptl); 1516 spin_unlock(ptl);
1505 } 1517 }
@@ -1971,6 +1983,11 @@ static int __init khugepaged_slab_init(void)
1971 return 0; 1983 return 0;
1972} 1984}
1973 1985
1986static void __init khugepaged_slab_exit(void)
1987{
1988 kmem_cache_destroy(mm_slot_cache);
1989}
1990
1974static inline struct mm_slot *alloc_mm_slot(void) 1991static inline struct mm_slot *alloc_mm_slot(void)
1975{ 1992{
1976 if (!mm_slot_cache) /* initialization failed */ 1993 if (!mm_slot_cache) /* initialization failed */
@@ -2104,7 +2121,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
2104{ 2121{
2105 while (--_pte >= pte) { 2122 while (--_pte >= pte) {
2106 pte_t pteval = *_pte; 2123 pte_t pteval = *_pte;
2107 if (!pte_none(pteval)) 2124 if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
2108 release_pte_page(pte_page(pteval)); 2125 release_pte_page(pte_page(pteval));
2109 } 2126 }
2110} 2127}
@@ -2115,13 +2132,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2115{ 2132{
2116 struct page *page; 2133 struct page *page;
2117 pte_t *_pte; 2134 pte_t *_pte;
2118 int none = 0; 2135 int none_or_zero = 0;
2119 bool referenced = false, writable = false; 2136 bool referenced = false, writable = false;
2120 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2137 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
2121 _pte++, address += PAGE_SIZE) { 2138 _pte++, address += PAGE_SIZE) {
2122 pte_t pteval = *_pte; 2139 pte_t pteval = *_pte;
2123 if (pte_none(pteval)) { 2140 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2124 if (++none <= khugepaged_max_ptes_none) 2141 if (++none_or_zero <= khugepaged_max_ptes_none)
2125 continue; 2142 continue;
2126 else 2143 else
2127 goto out; 2144 goto out;
@@ -2202,9 +2219,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2202 pte_t pteval = *_pte; 2219 pte_t pteval = *_pte;
2203 struct page *src_page; 2220 struct page *src_page;
2204 2221
2205 if (pte_none(pteval)) { 2222 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2206 clear_user_highpage(page, address); 2223 clear_user_highpage(page, address);
2207 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 2224 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
2225 if (is_zero_pfn(pte_pfn(pteval))) {
2226 /*
2227 * ptl mostly unnecessary.
2228 */
2229 spin_lock(ptl);
2230 /*
2231 * paravirt calls inside pte_clear here are
2232 * superfluous.
2233 */
2234 pte_clear(vma->vm_mm, address, _pte);
2235 spin_unlock(ptl);
2236 }
2208 } else { 2237 } else {
2209 src_page = pte_page(pteval); 2238 src_page = pte_page(pteval);
2210 copy_user_highpage(page, src_page, address, vma); 2239 copy_user_highpage(page, src_page, address, vma);
@@ -2306,8 +2335,8 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2306 return true; 2335 return true;
2307} 2336}
2308 2337
2309static struct page 2338static struct page *
2310*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2339khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2311 struct vm_area_struct *vma, unsigned long address, 2340 struct vm_area_struct *vma, unsigned long address,
2312 int node) 2341 int node)
2313{ 2342{
@@ -2321,8 +2350,7 @@ static struct page
2321 */ 2350 */
2322 up_read(&mm->mmap_sem); 2351 up_read(&mm->mmap_sem);
2323 2352
2324 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( 2353 *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
2325 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
2326 if (unlikely(!*hpage)) { 2354 if (unlikely(!*hpage)) {
2327 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2355 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2328 *hpage = ERR_PTR(-ENOMEM); 2356 *hpage = ERR_PTR(-ENOMEM);
@@ -2375,13 +2403,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2375 return true; 2403 return true;
2376} 2404}
2377 2405
2378static struct page 2406static struct page *
2379*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2407khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2380 struct vm_area_struct *vma, unsigned long address, 2408 struct vm_area_struct *vma, unsigned long address,
2381 int node) 2409 int node)
2382{ 2410{
2383 up_read(&mm->mmap_sem); 2411 up_read(&mm->mmap_sem);
2384 VM_BUG_ON(!*hpage); 2412 VM_BUG_ON(!*hpage);
2413
2385 return *hpage; 2414 return *hpage;
2386} 2415}
2387#endif 2416#endif
@@ -2416,16 +2445,21 @@ static void collapse_huge_page(struct mm_struct *mm,
2416 struct mem_cgroup *memcg; 2445 struct mem_cgroup *memcg;
2417 unsigned long mmun_start; /* For mmu_notifiers */ 2446 unsigned long mmun_start; /* For mmu_notifiers */
2418 unsigned long mmun_end; /* For mmu_notifiers */ 2447 unsigned long mmun_end; /* For mmu_notifiers */
2448 gfp_t gfp;
2419 2449
2420 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2450 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2421 2451
2452 /* Only allocate from the target node */
2453 gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
2454 __GFP_THISNODE;
2455
2422 /* release the mmap_sem read lock. */ 2456 /* release the mmap_sem read lock. */
2423 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); 2457 new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
2424 if (!new_page) 2458 if (!new_page)
2425 return; 2459 return;
2426 2460
2427 if (unlikely(mem_cgroup_try_charge(new_page, mm, 2461 if (unlikely(mem_cgroup_try_charge(new_page, mm,
2428 GFP_TRANSHUGE, &memcg))) 2462 gfp, &memcg)))
2429 return; 2463 return;
2430 2464
2431 /* 2465 /*
@@ -2538,7 +2572,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2538{ 2572{
2539 pmd_t *pmd; 2573 pmd_t *pmd;
2540 pte_t *pte, *_pte; 2574 pte_t *pte, *_pte;
2541 int ret = 0, none = 0; 2575 int ret = 0, none_or_zero = 0;
2542 struct page *page; 2576 struct page *page;
2543 unsigned long _address; 2577 unsigned long _address;
2544 spinlock_t *ptl; 2578 spinlock_t *ptl;
@@ -2556,8 +2590,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2556 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 2590 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
2557 _pte++, _address += PAGE_SIZE) { 2591 _pte++, _address += PAGE_SIZE) {
2558 pte_t pteval = *_pte; 2592 pte_t pteval = *_pte;
2559 if (pte_none(pteval)) { 2593 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2560 if (++none <= khugepaged_max_ptes_none) 2594 if (++none_or_zero <= khugepaged_max_ptes_none)
2561 continue; 2595 continue;
2562 else 2596 else
2563 goto out_unmap; 2597 goto out_unmap;
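The collapse_huge_page() hunk above builds the gfp mask once ("Only allocate from the target node") and passes the same mask to both khugepaged_alloc_page() and mem_cgroup_try_charge(). A sketch of that composition, written as if inside mm/huge_memory.c (alloc_hugepage_gfpmask() is static there) with defrag hard-coded to 1 for illustration:

/*
 * The usual khugepaged allocation mask, restricted to the node the scan
 * selected: __GFP_OTHER_NODE accounts the allocation as made on behalf of
 * another node, and __GFP_THISNODE forbids falling back to other nodes.
 */
static gfp_t example_collapse_gfp(void)
{
	return alloc_hugepage_gfpmask(1 /* defrag */, __GFP_OTHER_NODE) |
	       __GFP_THISNODE;
}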
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0a9ac6c26832..271e4432734c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
61static int num_fault_mutexes; 61static int num_fault_mutexes;
62static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; 62static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
63 63
64/* Forward declaration */
65static int hugetlb_acct_memory(struct hstate *h, long delta);
66
64static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 67static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
65{ 68{
66 bool free = (spool->count == 0) && (spool->used_hpages == 0); 69 bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
68 spin_unlock(&spool->lock); 71 spin_unlock(&spool->lock);
69 72
70 /* If no pages are used, and no other handles to the subpool 73 /* If no pages are used, and no other handles to the subpool
71 * remain, free the subpool the subpool remain */ 74 * remain, give up any reservations based on minimum size and
72 if (free) 75 * free the subpool */
76 if (free) {
77 if (spool->min_hpages != -1)
78 hugetlb_acct_memory(spool->hstate,
79 -spool->min_hpages);
73 kfree(spool); 80 kfree(spool);
81 }
74} 82}
75 83
76struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) 84struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
85 long min_hpages)
77{ 86{
78 struct hugepage_subpool *spool; 87 struct hugepage_subpool *spool;
79 88
80 spool = kmalloc(sizeof(*spool), GFP_KERNEL); 89 spool = kzalloc(sizeof(*spool), GFP_KERNEL);
81 if (!spool) 90 if (!spool)
82 return NULL; 91 return NULL;
83 92
84 spin_lock_init(&spool->lock); 93 spin_lock_init(&spool->lock);
85 spool->count = 1; 94 spool->count = 1;
86 spool->max_hpages = nr_blocks; 95 spool->max_hpages = max_hpages;
87 spool->used_hpages = 0; 96 spool->hstate = h;
97 spool->min_hpages = min_hpages;
98
99 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
100 kfree(spool);
101 return NULL;
102 }
103 spool->rsv_hpages = min_hpages;
88 104
89 return spool; 105 return spool;
90} 106}
@@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
97 unlock_or_release_subpool(spool); 113 unlock_or_release_subpool(spool);
98} 114}
99 115
100static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, 116/*
117 * Subpool accounting for allocating and reserving pages.
118 * Return -ENOMEM if there are not enough resources to satisfy the
119 * request. Otherwise, return the number of pages by which the
120 * global pools must be adjusted (upward). The returned value may
121 * only be different than the passed value (delta) in the case where
122 * a subpool minimum size must be maintained.
123 */
124static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
101 long delta) 125 long delta)
102{ 126{
103 int ret = 0; 127 long ret = delta;
104 128
105 if (!spool) 129 if (!spool)
106 return 0; 130 return ret;
107 131
108 spin_lock(&spool->lock); 132 spin_lock(&spool->lock);
109 if ((spool->used_hpages + delta) <= spool->max_hpages) { 133
110 spool->used_hpages += delta; 134 if (spool->max_hpages != -1) { /* maximum size accounting */
111 } else { 135 if ((spool->used_hpages + delta) <= spool->max_hpages)
112 ret = -ENOMEM; 136 spool->used_hpages += delta;
137 else {
138 ret = -ENOMEM;
139 goto unlock_ret;
140 }
113 } 141 }
114 spin_unlock(&spool->lock);
115 142
143 if (spool->min_hpages != -1) { /* minimum size accounting */
144 if (delta > spool->rsv_hpages) {
145 /*
146 * Asking for more reserves than those already taken on
147 * behalf of subpool. Return difference.
148 */
149 ret = delta - spool->rsv_hpages;
150 spool->rsv_hpages = 0;
151 } else {
152 ret = 0; /* reserves already accounted for */
153 spool->rsv_hpages -= delta;
154 }
155 }
156
157unlock_ret:
158 spin_unlock(&spool->lock);
116 return ret; 159 return ret;
117} 160}
118 161
119static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, 162/*
163 * Subpool accounting for freeing and unreserving pages.
164 * Return the number of global page reservations that must be dropped.
165 * The return value may only be different than the passed value (delta)
166 * in the case where a subpool minimum size must be maintained.
167 */
168static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
120 long delta) 169 long delta)
121{ 170{
171 long ret = delta;
172
122 if (!spool) 173 if (!spool)
123 return; 174 return delta;
124 175
125 spin_lock(&spool->lock); 176 spin_lock(&spool->lock);
126 spool->used_hpages -= delta; 177
127 /* If hugetlbfs_put_super couldn't free spool due to 178 if (spool->max_hpages != -1) /* maximum size accounting */
128 * an outstanding quota reference, free it now. */ 179 spool->used_hpages -= delta;
180
181 if (spool->min_hpages != -1) { /* minimum size accounting */
182 if (spool->rsv_hpages + delta <= spool->min_hpages)
183 ret = 0;
184 else
185 ret = spool->rsv_hpages + delta - spool->min_hpages;
186
187 spool->rsv_hpages += delta;
188 if (spool->rsv_hpages > spool->min_hpages)
189 spool->rsv_hpages = spool->min_hpages;
190 }
191
192 /*
193 * If hugetlbfs_put_super couldn't free spool due to an outstanding
194 * quota reference, free it now.
195 */
129 unlock_or_release_subpool(spool); 196 unlock_or_release_subpool(spool);
197
198 return ret;
130} 199}
131 200
132static inline struct hugepage_subpool *subpool_inode(struct inode *inode) 201static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
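As the comments above state, both subpool helpers now return the adjustment to apply to the global pool, which differs from the passed delta only when a minimum size is set. A worked sketch, written as if inside mm/hugetlb.c (the helpers are static), assuming a subpool created with min_hpages = 10 and no maximum:

/*
 * The values in the comments are what the helpers above return for this
 * particular sequence.
 */
static void example_subpool_accounting(struct hstate *h)
{
	struct hugepage_subpool *spool;
	long gbl;

	spool = hugepage_new_subpool(h, -1, 10);     /* reserves 10 pages globally */

	gbl = hugepage_subpool_get_pages(spool, 4);  /* 4 <= rsv_hpages(10): returns 0 */
	gbl = hugepage_subpool_put_pages(spool, 4);  /* refills the reserve: returns 0 */
	gbl = hugepage_subpool_get_pages(spool, 12); /* only 10 reserved: returns 2    */
	gbl = hugepage_subpool_put_pages(spool, 12); /* back above the min: returns 2  */

	hugepage_put_subpool(spool);                 /* last ref: the 10-page reserve goes back */
}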
@@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size)
855 return NULL; 924 return NULL;
856} 925}
857 926
927/*
928 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
929 * to hstate->hugepage_activelist.)
930 *
931 * This function can be called for tail pages, but never returns true for them.
932 */
933bool page_huge_active(struct page *page)
934{
935 VM_BUG_ON_PAGE(!PageHuge(page), page);
936 return PageHead(page) && PagePrivate(&page[1]);
937}
938
939/* never called for tail page */
940static void set_page_huge_active(struct page *page)
941{
942 VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
943 SetPagePrivate(&page[1]);
944}
945
946static void clear_page_huge_active(struct page *page)
947{
948 VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
949 ClearPagePrivate(&page[1]);
950}
951
858void free_huge_page(struct page *page) 952void free_huge_page(struct page *page)
859{ 953{
860 /* 954 /*
@@ -874,7 +968,16 @@ void free_huge_page(struct page *page)
874 restore_reserve = PagePrivate(page); 968 restore_reserve = PagePrivate(page);
875 ClearPagePrivate(page); 969 ClearPagePrivate(page);
876 970
971 /*
972 * A return code of zero implies that the subpool will be under its
973 * minimum size if the reservation is not restored after the page is freed.
974 * Therefore, force restore_reserve operation.
975 */
976 if (hugepage_subpool_put_pages(spool, 1) == 0)
977 restore_reserve = true;
978
877 spin_lock(&hugetlb_lock); 979 spin_lock(&hugetlb_lock);
980 clear_page_huge_active(page);
878 hugetlb_cgroup_uncharge_page(hstate_index(h), 981 hugetlb_cgroup_uncharge_page(hstate_index(h),
879 pages_per_huge_page(h), page); 982 pages_per_huge_page(h), page);
880 if (restore_reserve) 983 if (restore_reserve)
@@ -891,7 +994,6 @@ void free_huge_page(struct page *page)
891 enqueue_huge_page(h, page); 994 enqueue_huge_page(h, page);
892 } 995 }
893 spin_unlock(&hugetlb_lock); 996 spin_unlock(&hugetlb_lock);
894 hugepage_subpool_put_pages(spool, 1);
895} 997}
896 998
897static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 999static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -917,7 +1019,6 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
917 __SetPageHead(page); 1019 __SetPageHead(page);
918 __ClearPageReserved(page); 1020 __ClearPageReserved(page);
919 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1021 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
920 __SetPageTail(p);
921 /* 1022 /*
922 * For gigantic hugepages allocated through bootmem at 1023 * For gigantic hugepages allocated through bootmem at
923 * boot, it's safer to be consistent with the not-gigantic 1024 * boot, it's safer to be consistent with the not-gigantic
@@ -933,6 +1034,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
933 __ClearPageReserved(p); 1034 __ClearPageReserved(p);
934 set_page_count(p, 0); 1035 set_page_count(p, 0);
935 p->first_page = page; 1036 p->first_page = page;
1037 /* Make sure p->first_page is always valid for PageTail() */
1038 smp_wmb();
1039 __SetPageTail(p);
936 } 1040 }
937} 1041}
938 1042
@@ -1384,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1384 if (chg < 0) 1488 if (chg < 0)
1385 return ERR_PTR(-ENOMEM); 1489 return ERR_PTR(-ENOMEM);
1386 if (chg || avoid_reserve) 1490 if (chg || avoid_reserve)
1387 if (hugepage_subpool_get_pages(spool, 1)) 1491 if (hugepage_subpool_get_pages(spool, 1) < 0)
1388 return ERR_PTR(-ENOSPC); 1492 return ERR_PTR(-ENOSPC);
1389 1493
1390 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1494 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
@@ -2452,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2452 struct resv_map *resv = vma_resv_map(vma); 2556 struct resv_map *resv = vma_resv_map(vma);
2453 struct hugepage_subpool *spool = subpool_vma(vma); 2557 struct hugepage_subpool *spool = subpool_vma(vma);
2454 unsigned long reserve, start, end; 2558 unsigned long reserve, start, end;
2559 long gbl_reserve;
2455 2560
2456 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 2561 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
2457 return; 2562 return;
@@ -2464,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2464 kref_put(&resv->refs, resv_map_release); 2569 kref_put(&resv->refs, resv_map_release);
2465 2570
2466 if (reserve) { 2571 if (reserve) {
2467 hugetlb_acct_memory(h, -reserve); 2572 /*
2468 hugepage_subpool_put_pages(spool, reserve); 2573 * Decrement reserve counts. The global reserve count may be
2574 * adjusted if the subpool has a minimum size.
2575 */
2576 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
2577 hugetlb_acct_memory(h, -gbl_reserve);
2469 } 2578 }
2470} 2579}
2471 2580
@@ -2889,6 +2998,7 @@ retry_avoidcopy:
2889 copy_user_huge_page(new_page, old_page, address, vma, 2998 copy_user_huge_page(new_page, old_page, address, vma,
2890 pages_per_huge_page(h)); 2999 pages_per_huge_page(h));
2891 __SetPageUptodate(new_page); 3000 __SetPageUptodate(new_page);
3001 set_page_huge_active(new_page);
2892 3002
2893 mmun_start = address & huge_page_mask(h); 3003 mmun_start = address & huge_page_mask(h);
2894 mmun_end = mmun_start + huge_page_size(h); 3004 mmun_end = mmun_start + huge_page_size(h);
@@ -3001,6 +3111,7 @@ retry:
3001 } 3111 }
3002 clear_huge_page(page, address, pages_per_huge_page(h)); 3112 clear_huge_page(page, address, pages_per_huge_page(h));
3003 __SetPageUptodate(page); 3113 __SetPageUptodate(page);
3114 set_page_huge_active(page);
3004 3115
3005 if (vma->vm_flags & VM_MAYSHARE) { 3116 if (vma->vm_flags & VM_MAYSHARE) {
3006 int err; 3117 int err;
@@ -3276,6 +3387,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3276 struct page *page; 3387 struct page *page;
3277 3388
3278 /* 3389 /*
3390 * If we have a pending SIGKILL, don't keep faulting pages and
3391 * potentially allocating memory.
3392 */
3393 if (unlikely(fatal_signal_pending(current))) {
3394 remainder = 0;
3395 break;
3396 }
3397
3398 /*
3279 * Some archs (sparc64, sh*) have multiple pte_ts to 3399 * Some archs (sparc64, sh*) have multiple pte_ts to
3280 * each hugepage. We have to make sure we get the 3400 * each hugepage. We have to make sure we get the
3281 * first, for the page indexing below to work. 3401 * first, for the page indexing below to work.
@@ -3436,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode,
3436 struct hstate *h = hstate_inode(inode); 3556 struct hstate *h = hstate_inode(inode);
3437 struct hugepage_subpool *spool = subpool_inode(inode); 3557 struct hugepage_subpool *spool = subpool_inode(inode);
3438 struct resv_map *resv_map; 3558 struct resv_map *resv_map;
3559 long gbl_reserve;
3439 3560
3440 /* 3561 /*
3441 * Only apply hugepage reservation if asked. At fault time, an 3562 * Only apply hugepage reservation if asked. At fault time, an
@@ -3472,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode,
3472 goto out_err; 3593 goto out_err;
3473 } 3594 }
3474 3595
3475 /* There must be enough pages in the subpool for the mapping */ 3596 /*
3476 if (hugepage_subpool_get_pages(spool, chg)) { 3597 * There must be enough pages in the subpool for the mapping. If
3598 * the subpool has a minimum size, there may be some global
3599 * reservations already in place (gbl_reserve).
3600 */
3601 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
3602 if (gbl_reserve < 0) {
3477 ret = -ENOSPC; 3603 ret = -ENOSPC;
3478 goto out_err; 3604 goto out_err;
3479 } 3605 }
@@ -3482,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode,
3482 * Check enough hugepages are available for the reservation. 3608 * Check enough hugepages are available for the reservation.
3483 * Hand the pages back to the subpool if there are not 3609 * Hand the pages back to the subpool if there are not
3484 */ 3610 */
3485 ret = hugetlb_acct_memory(h, chg); 3611 ret = hugetlb_acct_memory(h, gbl_reserve);
3486 if (ret < 0) { 3612 if (ret < 0) {
3487 hugepage_subpool_put_pages(spool, chg); 3613 /* put back original number of pages, chg */
3614 (void)hugepage_subpool_put_pages(spool, chg);
3488 goto out_err; 3615 goto out_err;
3489 } 3616 }
3490 3617
@@ -3514,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3514 struct resv_map *resv_map = inode_resv_map(inode); 3641 struct resv_map *resv_map = inode_resv_map(inode);
3515 long chg = 0; 3642 long chg = 0;
3516 struct hugepage_subpool *spool = subpool_inode(inode); 3643 struct hugepage_subpool *spool = subpool_inode(inode);
3644 long gbl_reserve;
3517 3645
3518 if (resv_map) 3646 if (resv_map)
3519 chg = region_truncate(resv_map, offset); 3647 chg = region_truncate(resv_map, offset);
@@ -3521,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3521 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3649 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
3522 spin_unlock(&inode->i_lock); 3650 spin_unlock(&inode->i_lock);
3523 3651
3524 hugepage_subpool_put_pages(spool, (chg - freed)); 3652 /*
3525 hugetlb_acct_memory(h, -(chg - freed)); 3653 * If the subpool has a minimum size, the number of global
3654 * reservations to be released may be adjusted.
3655 */
3656 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
3657 hugetlb_acct_memory(h, -gbl_reserve);
3526} 3658}
3527 3659
3528#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 3660#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3733,8 +3865,7 @@ retry:
3733 if (!pmd_huge(*pmd)) 3865 if (!pmd_huge(*pmd))
3734 goto out; 3866 goto out;
3735 if (pmd_present(*pmd)) { 3867 if (pmd_present(*pmd)) {
3736 page = pte_page(*(pte_t *)pmd) + 3868 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
3737 ((address & ~PMD_MASK) >> PAGE_SHIFT);
3738 if (flags & FOLL_GET) 3869 if (flags & FOLL_GET)
3739 get_page(page); 3870 get_page(page);
3740 } else { 3871 } else {
@@ -3765,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
3765 3896
3766#ifdef CONFIG_MEMORY_FAILURE 3897#ifdef CONFIG_MEMORY_FAILURE
3767 3898
3768/* Should be called in hugetlb_lock */
3769static int is_hugepage_on_freelist(struct page *hpage)
3770{
3771 struct page *page;
3772 struct page *tmp;
3773 struct hstate *h = page_hstate(hpage);
3774 int nid = page_to_nid(hpage);
3775
3776 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
3777 if (page == hpage)
3778 return 1;
3779 return 0;
3780}
3781
3782/* 3899/*
3783 * This function is called from memory failure code. 3900 * This function is called from memory failure code.
3784 * Assume the caller holds page lock of the head page. 3901 * Assume the caller holds page lock of the head page.
@@ -3790,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3790 int ret = -EBUSY; 3907 int ret = -EBUSY;
3791 3908
3792 spin_lock(&hugetlb_lock); 3909 spin_lock(&hugetlb_lock);
3793 if (is_hugepage_on_freelist(hpage)) { 3910 /*
3911 * Just checking !page_huge_active is not enough, because that could be
3912 * an isolated/hwpoisoned hugepage (which have >0 refcount).
3913 */
3914 if (!page_huge_active(hpage) && !page_count(hpage)) {
3794 /* 3915 /*
3795 * Hwpoisoned hugepage isn't linked to activelist or freelist, 3916 * Hwpoisoned hugepage isn't linked to activelist or freelist,
3796 * but dangling hpage->lru can trigger list-debug warnings 3917 * but dangling hpage->lru can trigger list-debug warnings
@@ -3810,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3810 3931
3811bool isolate_huge_page(struct page *page, struct list_head *list) 3932bool isolate_huge_page(struct page *page, struct list_head *list)
3812{ 3933{
3934 bool ret = true;
3935
3813 VM_BUG_ON_PAGE(!PageHead(page), page); 3936 VM_BUG_ON_PAGE(!PageHead(page), page);
3814 if (!get_page_unless_zero(page))
3815 return false;
3816 spin_lock(&hugetlb_lock); 3937 spin_lock(&hugetlb_lock);
3938 if (!page_huge_active(page) || !get_page_unless_zero(page)) {
3939 ret = false;
3940 goto unlock;
3941 }
3942 clear_page_huge_active(page);
3817 list_move_tail(&page->lru, list); 3943 list_move_tail(&page->lru, list);
3944unlock:
3818 spin_unlock(&hugetlb_lock); 3945 spin_unlock(&hugetlb_lock);
3819 return true; 3946 return ret;
3820} 3947}
3821 3948
3822void putback_active_hugepage(struct page *page) 3949void putback_active_hugepage(struct page *page)
3823{ 3950{
3824 VM_BUG_ON_PAGE(!PageHead(page), page); 3951 VM_BUG_ON_PAGE(!PageHead(page), page);
3825 spin_lock(&hugetlb_lock); 3952 spin_lock(&hugetlb_lock);
3953 set_page_huge_active(page);
3826 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 3954 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
3827 spin_unlock(&hugetlb_lock); 3955 spin_unlock(&hugetlb_lock);
3828 put_page(page); 3956 put_page(page);
3829} 3957}
3830
3831bool is_hugepage_active(struct page *page)
3832{
3833 VM_BUG_ON_PAGE(!PageHuge(page), page);
3834 /*
3835 * This function can be called for a tail page because the caller,
3836 * scan_movable_pages, scans through a given pfn-range which typically
3837 * covers one memory block. In systems using gigantic hugepage (1GB
3838 * for x86_64,) a hugepage is larger than a memory block, and we don't
3839 * support migrating such large hugepages for now, so return false
3840 * when called for tail pages.
3841 */
3842 if (PageTail(page))
3843 return false;
3844 /*
3845 * Refcount of a hwpoisoned hugepages is 1, but they are not active,
3846 * so we should return false for them.
3847 */
3848 if (unlikely(PageHWPoison(page)))
3849 return false;
3850 return page_count(page) > 0;
3851}
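With the changes above, isolate_huge_page() refuses pages the fault paths have not yet marked active, and putback_active_hugepage() marks the page active again when it is returned. A minimal sketch of how a migration-style caller is expected to use the pair; the actual migrate_pages() call is elided:

#include <linux/hugetlb.h>
#include <linux/list.h>

/*
 * Pull one in-use hugepage off the active list.  If isolation fails, the
 * page was free, already isolated or hwpoisoned.  If migration is not
 * attempted (or fails), putting the page back re-marks it active and
 * drops the isolation reference.
 */
static int example_isolate_then_putback(struct page *hpage)
{
	LIST_HEAD(pagelist);

	if (!isolate_huge_page(hpage, &pagelist))
		return -EBUSY;

	/* a real caller would hand &pagelist to migrate_pages() here */

	putback_active_hugepage(hpage);
	return 0;
}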
diff --git a/mm/internal.h b/mm/internal.h
index a96da5b0029d..a25e359a4039 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -200,6 +200,8 @@ isolate_freepages_range(struct compact_control *cc,
200unsigned long 200unsigned long
201isolate_migratepages_range(struct compact_control *cc, 201isolate_migratepages_range(struct compact_control *cc,
202 unsigned long low_pfn, unsigned long end_pfn); 202 unsigned long low_pfn, unsigned long end_pfn);
203int find_suitable_fallback(struct free_area *area, unsigned int order,
204 int migratetype, bool only_stealable, bool *can_steal);
203 205
204#endif 206#endif
205 207
@@ -222,13 +224,13 @@ static inline unsigned long page_order(struct page *page)
222 * PageBuddy() should be checked first by the caller to minimize race window, 224 * PageBuddy() should be checked first by the caller to minimize race window,
223 * and invalid values must be handled gracefully. 225 * and invalid values must be handled gracefully.
224 * 226 *
225 * ACCESS_ONCE is used so that if the caller assigns the result into a local 227 * READ_ONCE is used so that if the caller assigns the result into a local
226 * variable and e.g. tests it for valid range before using, the compiler cannot 228 * variable and e.g. tests it for valid range before using, the compiler cannot
227 * decide to remove the variable and inline the page_private(page) multiple 229 * decide to remove the variable and inline the page_private(page) multiple
228 * times, potentially observing different values in the tests and the actual 230 * times, potentially observing different values in the tests and the actual
229 * use of the result. 231 * use of the result.
230 */ 232 */
231#define page_order_unsafe(page) ACCESS_ONCE(page_private(page)) 233#define page_order_unsafe(page) READ_ONCE(page_private(page))
232 234
233static inline bool is_cow_mapping(vm_flags_t flags) 235static inline bool is_cow_mapping(vm_flags_t flags)
234{ 236{
@@ -240,7 +242,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
240 struct vm_area_struct *prev, struct rb_node *rb_parent); 242 struct vm_area_struct *prev, struct rb_node *rb_parent);
241 243
242#ifdef CONFIG_MMU 244#ifdef CONFIG_MMU
243extern long __mlock_vma_pages_range(struct vm_area_struct *vma, 245extern long populate_vma_page_range(struct vm_area_struct *vma,
244 unsigned long start, unsigned long end, int *nonblocking); 246 unsigned long start, unsigned long end, int *nonblocking);
245extern void munlock_vma_pages_range(struct vm_area_struct *vma, 247extern void munlock_vma_pages_range(struct vm_area_struct *vma,
246 unsigned long start, unsigned long end); 248 unsigned long start, unsigned long end);
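The page_order_unsafe() comment above explains why READ_ONCE() matters: the snapshot must be taken once and then validated, without the compiler re-reading page_private(). A minimal usage sketch in the compaction style, written as if inside mm/ with mm/internal.h available, assuming the caller has already seen PageBuddy():

#include <linux/mm.h>
#include <linux/mmzone.h>
#include "internal.h"	/* for page_order_unsafe() */

/*
 * Snapshot the (possibly stale) buddy order once, validate the snapshot,
 * and only then use it; a racing free or allocation can make the value
 * nonsense, which is why the range check is mandatory.
 */
static unsigned long example_buddy_skip_hint(struct page *page)
{
	unsigned long order = page_order_unsafe(page);

	if (order > 0 && order < MAX_ORDER)
		return 1UL << order;	/* pages a scanner may skip */
	return 1;			/* fall back to a single page */
}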
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
deleted file mode 100644
index 827732047da1..000000000000
--- a/mm/iov_iter.c
+++ /dev/null
@@ -1,753 +0,0 @@
1#include <linux/export.h>
2#include <linux/uio.h>
3#include <linux/pagemap.h>
4#include <linux/slab.h>
5#include <linux/vmalloc.h>
6#include <net/checksum.h>
7
8#define iterate_iovec(i, n, __v, __p, skip, STEP) { \
9 size_t left; \
10 size_t wanted = n; \
11 __p = i->iov; \
12 __v.iov_len = min(n, __p->iov_len - skip); \
13 if (likely(__v.iov_len)) { \
14 __v.iov_base = __p->iov_base + skip; \
15 left = (STEP); \
16 __v.iov_len -= left; \
17 skip += __v.iov_len; \
18 n -= __v.iov_len; \
19 } else { \
20 left = 0; \
21 } \
22 while (unlikely(!left && n)) { \
23 __p++; \
24 __v.iov_len = min(n, __p->iov_len); \
25 if (unlikely(!__v.iov_len)) \
26 continue; \
27 __v.iov_base = __p->iov_base; \
28 left = (STEP); \
29 __v.iov_len -= left; \
30 skip = __v.iov_len; \
31 n -= __v.iov_len; \
32 } \
33 n = wanted - n; \
34}
35
36#define iterate_kvec(i, n, __v, __p, skip, STEP) { \
37 size_t wanted = n; \
38 __p = i->kvec; \
39 __v.iov_len = min(n, __p->iov_len - skip); \
40 if (likely(__v.iov_len)) { \
41 __v.iov_base = __p->iov_base + skip; \
42 (void)(STEP); \
43 skip += __v.iov_len; \
44 n -= __v.iov_len; \
45 } \
46 while (unlikely(n)) { \
47 __p++; \
48 __v.iov_len = min(n, __p->iov_len); \
49 if (unlikely(!__v.iov_len)) \
50 continue; \
51 __v.iov_base = __p->iov_base; \
52 (void)(STEP); \
53 skip = __v.iov_len; \
54 n -= __v.iov_len; \
55 } \
56 n = wanted; \
57}
58
59#define iterate_bvec(i, n, __v, __p, skip, STEP) { \
60 size_t wanted = n; \
61 __p = i->bvec; \
62 __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \
63 if (likely(__v.bv_len)) { \
64 __v.bv_page = __p->bv_page; \
65 __v.bv_offset = __p->bv_offset + skip; \
66 (void)(STEP); \
67 skip += __v.bv_len; \
68 n -= __v.bv_len; \
69 } \
70 while (unlikely(n)) { \
71 __p++; \
72 __v.bv_len = min_t(size_t, n, __p->bv_len); \
73 if (unlikely(!__v.bv_len)) \
74 continue; \
75 __v.bv_page = __p->bv_page; \
76 __v.bv_offset = __p->bv_offset; \
77 (void)(STEP); \
78 skip = __v.bv_len; \
79 n -= __v.bv_len; \
80 } \
81 n = wanted; \
82}
83
84#define iterate_all_kinds(i, n, v, I, B, K) { \
85 size_t skip = i->iov_offset; \
86 if (unlikely(i->type & ITER_BVEC)) { \
87 const struct bio_vec *bvec; \
88 struct bio_vec v; \
89 iterate_bvec(i, n, v, bvec, skip, (B)) \
90 } else if (unlikely(i->type & ITER_KVEC)) { \
91 const struct kvec *kvec; \
92 struct kvec v; \
93 iterate_kvec(i, n, v, kvec, skip, (K)) \
94 } else { \
95 const struct iovec *iov; \
96 struct iovec v; \
97 iterate_iovec(i, n, v, iov, skip, (I)) \
98 } \
99}
100
101#define iterate_and_advance(i, n, v, I, B, K) { \
102 size_t skip = i->iov_offset; \
103 if (unlikely(i->type & ITER_BVEC)) { \
104 const struct bio_vec *bvec; \
105 struct bio_vec v; \
106 iterate_bvec(i, n, v, bvec, skip, (B)) \
107 if (skip == bvec->bv_len) { \
108 bvec++; \
109 skip = 0; \
110 } \
111 i->nr_segs -= bvec - i->bvec; \
112 i->bvec = bvec; \
113 } else if (unlikely(i->type & ITER_KVEC)) { \
114 const struct kvec *kvec; \
115 struct kvec v; \
116 iterate_kvec(i, n, v, kvec, skip, (K)) \
117 if (skip == kvec->iov_len) { \
118 kvec++; \
119 skip = 0; \
120 } \
121 i->nr_segs -= kvec - i->kvec; \
122 i->kvec = kvec; \
123 } else { \
124 const struct iovec *iov; \
125 struct iovec v; \
126 iterate_iovec(i, n, v, iov, skip, (I)) \
127 if (skip == iov->iov_len) { \
128 iov++; \
129 skip = 0; \
130 } \
131 i->nr_segs -= iov - i->iov; \
132 i->iov = iov; \
133 } \
134 i->count -= n; \
135 i->iov_offset = skip; \
136}
137
138static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
139 struct iov_iter *i)
140{
141 size_t skip, copy, left, wanted;
142 const struct iovec *iov;
143 char __user *buf;
144 void *kaddr, *from;
145
146 if (unlikely(bytes > i->count))
147 bytes = i->count;
148
149 if (unlikely(!bytes))
150 return 0;
151
152 wanted = bytes;
153 iov = i->iov;
154 skip = i->iov_offset;
155 buf = iov->iov_base + skip;
156 copy = min(bytes, iov->iov_len - skip);
157
158 if (!fault_in_pages_writeable(buf, copy)) {
159 kaddr = kmap_atomic(page);
160 from = kaddr + offset;
161
162 /* first chunk, usually the only one */
163 left = __copy_to_user_inatomic(buf, from, copy);
164 copy -= left;
165 skip += copy;
166 from += copy;
167 bytes -= copy;
168
169 while (unlikely(!left && bytes)) {
170 iov++;
171 buf = iov->iov_base;
172 copy = min(bytes, iov->iov_len);
173 left = __copy_to_user_inatomic(buf, from, copy);
174 copy -= left;
175 skip = copy;
176 from += copy;
177 bytes -= copy;
178 }
179 if (likely(!bytes)) {
180 kunmap_atomic(kaddr);
181 goto done;
182 }
183 offset = from - kaddr;
184 buf += copy;
185 kunmap_atomic(kaddr);
186 copy = min(bytes, iov->iov_len - skip);
187 }
188 /* Too bad - revert to non-atomic kmap */
189 kaddr = kmap(page);
190 from = kaddr + offset;
191 left = __copy_to_user(buf, from, copy);
192 copy -= left;
193 skip += copy;
194 from += copy;
195 bytes -= copy;
196 while (unlikely(!left && bytes)) {
197 iov++;
198 buf = iov->iov_base;
199 copy = min(bytes, iov->iov_len);
200 left = __copy_to_user(buf, from, copy);
201 copy -= left;
202 skip = copy;
203 from += copy;
204 bytes -= copy;
205 }
206 kunmap(page);
207done:
208 if (skip == iov->iov_len) {
209 iov++;
210 skip = 0;
211 }
212 i->count -= wanted - bytes;
213 i->nr_segs -= iov - i->iov;
214 i->iov = iov;
215 i->iov_offset = skip;
216 return wanted - bytes;
217}
218
219static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
220 struct iov_iter *i)
221{
222 size_t skip, copy, left, wanted;
223 const struct iovec *iov;
224 char __user *buf;
225 void *kaddr, *to;
226
227 if (unlikely(bytes > i->count))
228 bytes = i->count;
229
230 if (unlikely(!bytes))
231 return 0;
232
233 wanted = bytes;
234 iov = i->iov;
235 skip = i->iov_offset;
236 buf = iov->iov_base + skip;
237 copy = min(bytes, iov->iov_len - skip);
238
239 if (!fault_in_pages_readable(buf, copy)) {
240 kaddr = kmap_atomic(page);
241 to = kaddr + offset;
242
243 /* first chunk, usually the only one */
244 left = __copy_from_user_inatomic(to, buf, copy);
245 copy -= left;
246 skip += copy;
247 to += copy;
248 bytes -= copy;
249
250 while (unlikely(!left && bytes)) {
251 iov++;
252 buf = iov->iov_base;
253 copy = min(bytes, iov->iov_len);
254 left = __copy_from_user_inatomic(to, buf, copy);
255 copy -= left;
256 skip = copy;
257 to += copy;
258 bytes -= copy;
259 }
260 if (likely(!bytes)) {
261 kunmap_atomic(kaddr);
262 goto done;
263 }
264 offset = to - kaddr;
265 buf += copy;
266 kunmap_atomic(kaddr);
267 copy = min(bytes, iov->iov_len - skip);
268 }
269 /* Too bad - revert to non-atomic kmap */
270 kaddr = kmap(page);
271 to = kaddr + offset;
272 left = __copy_from_user(to, buf, copy);
273 copy -= left;
274 skip += copy;
275 to += copy;
276 bytes -= copy;
277 while (unlikely(!left && bytes)) {
278 iov++;
279 buf = iov->iov_base;
280 copy = min(bytes, iov->iov_len);
281 left = __copy_from_user(to, buf, copy);
282 copy -= left;
283 skip = copy;
284 to += copy;
285 bytes -= copy;
286 }
287 kunmap(page);
288done:
289 if (skip == iov->iov_len) {
290 iov++;
291 skip = 0;
292 }
293 i->count -= wanted - bytes;
294 i->nr_segs -= iov - i->iov;
295 i->iov = iov;
296 i->iov_offset = skip;
297 return wanted - bytes;
298}
299
300/*
301 * Fault in the first iovec of the given iov_iter, to a maximum length
302 * of bytes. Returns 0 on success, or non-zero if the memory could not be
303 * accessed (ie. because it is an invalid address).
304 *
305 * writev-intensive code may want this to prefault several iovecs -- that
306 * would be possible (callers must not rely on the fact that _only_ the
307 * first iovec will be faulted with the current implementation).
308 */
309int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
310{
311 if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
312 char __user *buf = i->iov->iov_base + i->iov_offset;
313 bytes = min(bytes, i->iov->iov_len - i->iov_offset);
314 return fault_in_pages_readable(buf, bytes);
315 }
316 return 0;
317}
318EXPORT_SYMBOL(iov_iter_fault_in_readable);
319
320void iov_iter_init(struct iov_iter *i, int direction,
321 const struct iovec *iov, unsigned long nr_segs,
322 size_t count)
323{
324 /* It will get better. Eventually... */
325 if (segment_eq(get_fs(), KERNEL_DS)) {
326 direction |= ITER_KVEC;
327 i->type = direction;
328 i->kvec = (struct kvec *)iov;
329 } else {
330 i->type = direction;
331 i->iov = iov;
332 }
333 i->nr_segs = nr_segs;
334 i->iov_offset = 0;
335 i->count = count;
336}
337EXPORT_SYMBOL(iov_iter_init);
338
339static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
340{
341 char *from = kmap_atomic(page);
342 memcpy(to, from + offset, len);
343 kunmap_atomic(from);
344}
345
346static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len)
347{
348 char *to = kmap_atomic(page);
349 memcpy(to + offset, from, len);
350 kunmap_atomic(to);
351}
352
353static void memzero_page(struct page *page, size_t offset, size_t len)
354{
355 char *addr = kmap_atomic(page);
356 memset(addr + offset, 0, len);
357 kunmap_atomic(addr);
358}
359
360size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
361{
362 char *from = addr;
363 if (unlikely(bytes > i->count))
364 bytes = i->count;
365
366 if (unlikely(!bytes))
367 return 0;
368
369 iterate_and_advance(i, bytes, v,
370 __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
371 v.iov_len),
372 memcpy_to_page(v.bv_page, v.bv_offset,
373 (from += v.bv_len) - v.bv_len, v.bv_len),
374 memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
375 )
376
377 return bytes;
378}
379EXPORT_SYMBOL(copy_to_iter);
380
381size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
382{
383 char *to = addr;
384 if (unlikely(bytes > i->count))
385 bytes = i->count;
386
387 if (unlikely(!bytes))
388 return 0;
389
390 iterate_and_advance(i, bytes, v,
391 __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
392 v.iov_len),
393 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
394 v.bv_offset, v.bv_len),
395 memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
396 )
397
398 return bytes;
399}
400EXPORT_SYMBOL(copy_from_iter);
401
402size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
403{
404 char *to = addr;
405 if (unlikely(bytes > i->count))
406 bytes = i->count;
407
408 if (unlikely(!bytes))
409 return 0;
410
411 iterate_and_advance(i, bytes, v,
412 __copy_from_user_nocache((to += v.iov_len) - v.iov_len,
413 v.iov_base, v.iov_len),
414 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
415 v.bv_offset, v.bv_len),
416 memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
417 )
418
419 return bytes;
420}
421EXPORT_SYMBOL(copy_from_iter_nocache);
422
423size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
424 struct iov_iter *i)
425{
426 if (i->type & (ITER_BVEC|ITER_KVEC)) {
427 void *kaddr = kmap_atomic(page);
428 size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
429 kunmap_atomic(kaddr);
430 return wanted;
431 } else
432 return copy_page_to_iter_iovec(page, offset, bytes, i);
433}
434EXPORT_SYMBOL(copy_page_to_iter);
435
436size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
437 struct iov_iter *i)
438{
439 if (i->type & (ITER_BVEC|ITER_KVEC)) {
440 void *kaddr = kmap_atomic(page);
441 size_t wanted = copy_from_iter(kaddr + offset, bytes, i);
442 kunmap_atomic(kaddr);
443 return wanted;
444 } else
445 return copy_page_from_iter_iovec(page, offset, bytes, i);
446}
447EXPORT_SYMBOL(copy_page_from_iter);
448
449size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
450{
451 if (unlikely(bytes > i->count))
452 bytes = i->count;
453
454 if (unlikely(!bytes))
455 return 0;
456
457 iterate_and_advance(i, bytes, v,
458 __clear_user(v.iov_base, v.iov_len),
459 memzero_page(v.bv_page, v.bv_offset, v.bv_len),
460 memset(v.iov_base, 0, v.iov_len)
461 )
462
463 return bytes;
464}
465EXPORT_SYMBOL(iov_iter_zero);
466
467size_t iov_iter_copy_from_user_atomic(struct page *page,
468 struct iov_iter *i, unsigned long offset, size_t bytes)
469{
470 char *kaddr = kmap_atomic(page), *p = kaddr + offset;
471 iterate_all_kinds(i, bytes, v,
472 __copy_from_user_inatomic((p += v.iov_len) - v.iov_len,
473 v.iov_base, v.iov_len),
474 memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
475 v.bv_offset, v.bv_len),
476 memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
477 )
478 kunmap_atomic(kaddr);
479 return bytes;
480}
481EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
482
483void iov_iter_advance(struct iov_iter *i, size_t size)
484{
485 iterate_and_advance(i, size, v, 0, 0, 0)
486}
487EXPORT_SYMBOL(iov_iter_advance);
488
489/*
490 * Return the count of just the current iov_iter segment.
491 */
492size_t iov_iter_single_seg_count(const struct iov_iter *i)
493{
494 if (i->nr_segs == 1)
495 return i->count;
496 else if (i->type & ITER_BVEC)
497 return min(i->count, i->bvec->bv_len - i->iov_offset);
498 else
499 return min(i->count, i->iov->iov_len - i->iov_offset);
500}
501EXPORT_SYMBOL(iov_iter_single_seg_count);
502
503void iov_iter_kvec(struct iov_iter *i, int direction,
504 const struct kvec *kvec, unsigned long nr_segs,
505 size_t count)
506{
507 BUG_ON(!(direction & ITER_KVEC));
508 i->type = direction;
509 i->kvec = kvec;
510 i->nr_segs = nr_segs;
511 i->iov_offset = 0;
512 i->count = count;
513}
514EXPORT_SYMBOL(iov_iter_kvec);
515
516void iov_iter_bvec(struct iov_iter *i, int direction,
517 const struct bio_vec *bvec, unsigned long nr_segs,
518 size_t count)
519{
520 BUG_ON(!(direction & ITER_BVEC));
521 i->type = direction;
522 i->bvec = bvec;
523 i->nr_segs = nr_segs;
524 i->iov_offset = 0;
525 i->count = count;
526}
527EXPORT_SYMBOL(iov_iter_bvec);
528
529unsigned long iov_iter_alignment(const struct iov_iter *i)
530{
531 unsigned long res = 0;
532 size_t size = i->count;
533
534 if (!size)
535 return 0;
536
537 iterate_all_kinds(i, size, v,
538 (res |= (unsigned long)v.iov_base | v.iov_len, 0),
539 res |= v.bv_offset | v.bv_len,
540 res |= (unsigned long)v.iov_base | v.iov_len
541 )
542 return res;
543}
544EXPORT_SYMBOL(iov_iter_alignment);
545
546ssize_t iov_iter_get_pages(struct iov_iter *i,
547 struct page **pages, size_t maxsize, unsigned maxpages,
548 size_t *start)
549{
550 if (maxsize > i->count)
551 maxsize = i->count;
552
553 if (!maxsize)
554 return 0;
555
556 iterate_all_kinds(i, maxsize, v, ({
557 unsigned long addr = (unsigned long)v.iov_base;
558 size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
559 int n;
560 int res;
561
562 if (len > maxpages * PAGE_SIZE)
563 len = maxpages * PAGE_SIZE;
564 addr &= ~(PAGE_SIZE - 1);
565 n = DIV_ROUND_UP(len, PAGE_SIZE);
566 res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
567 if (unlikely(res < 0))
568 return res;
569 return (res == n ? len : res * PAGE_SIZE) - *start;
570 0;}),({
571 /* can't be more than PAGE_SIZE */
572 *start = v.bv_offset;
573 get_page(*pages = v.bv_page);
574 return v.bv_len;
575 }),({
576 return -EFAULT;
577 })
578 )
579 return 0;
580}
581EXPORT_SYMBOL(iov_iter_get_pages);
582
583static struct page **get_pages_array(size_t n)
584{
585 struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
586 if (!p)
587 p = vmalloc(n * sizeof(struct page *));
588 return p;
589}
590
591ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
592 struct page ***pages, size_t maxsize,
593 size_t *start)
594{
595 struct page **p;
596
597 if (maxsize > i->count)
598 maxsize = i->count;
599
600 if (!maxsize)
601 return 0;
602
603 iterate_all_kinds(i, maxsize, v, ({
604 unsigned long addr = (unsigned long)v.iov_base;
605 size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
606 int n;
607 int res;
608
609 addr &= ~(PAGE_SIZE - 1);
610 n = DIV_ROUND_UP(len, PAGE_SIZE);
611 p = get_pages_array(n);
612 if (!p)
613 return -ENOMEM;
614 res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
615 if (unlikely(res < 0)) {
616 kvfree(p);
617 return res;
618 }
619 *pages = p;
620 return (res == n ? len : res * PAGE_SIZE) - *start;
621 0;}),({
622 /* can't be more than PAGE_SIZE */
623 *start = v.bv_offset;
624 *pages = p = get_pages_array(1);
625 if (!p)
626 return -ENOMEM;
627 get_page(*p = v.bv_page);
628 return v.bv_len;
629 }),({
630 return -EFAULT;
631 })
632 )
633 return 0;
634}
635EXPORT_SYMBOL(iov_iter_get_pages_alloc);
636
637size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
638 struct iov_iter *i)
639{
640 char *to = addr;
641 __wsum sum, next;
642 size_t off = 0;
643 if (unlikely(bytes > i->count))
644 bytes = i->count;
645
646 if (unlikely(!bytes))
647 return 0;
648
649 sum = *csum;
650 iterate_and_advance(i, bytes, v, ({
651 int err = 0;
652 next = csum_and_copy_from_user(v.iov_base,
653 (to += v.iov_len) - v.iov_len,
654 v.iov_len, 0, &err);
655 if (!err) {
656 sum = csum_block_add(sum, next, off);
657 off += v.iov_len;
658 }
659 err ? v.iov_len : 0;
660 }), ({
661 char *p = kmap_atomic(v.bv_page);
662 next = csum_partial_copy_nocheck(p + v.bv_offset,
663 (to += v.bv_len) - v.bv_len,
664 v.bv_len, 0);
665 kunmap_atomic(p);
666 sum = csum_block_add(sum, next, off);
667 off += v.bv_len;
668 }),({
669 next = csum_partial_copy_nocheck(v.iov_base,
670 (to += v.iov_len) - v.iov_len,
671 v.iov_len, 0);
672 sum = csum_block_add(sum, next, off);
673 off += v.iov_len;
674 })
675 )
676 *csum = sum;
677 return bytes;
678}
679EXPORT_SYMBOL(csum_and_copy_from_iter);
680
681size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum,
682 struct iov_iter *i)
683{
684 char *from = addr;
685 __wsum sum, next;
686 size_t off = 0;
687 if (unlikely(bytes > i->count))
688 bytes = i->count;
689
690 if (unlikely(!bytes))
691 return 0;
692
693 sum = *csum;
694 iterate_and_advance(i, bytes, v, ({
695 int err = 0;
696 next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
697 v.iov_base,
698 v.iov_len, 0, &err);
699 if (!err) {
700 sum = csum_block_add(sum, next, off);
701 off += v.iov_len;
702 }
703 err ? v.iov_len : 0;
704 }), ({
705 char *p = kmap_atomic(v.bv_page);
706 next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len,
707 p + v.bv_offset,
708 v.bv_len, 0);
709 kunmap_atomic(p);
710 sum = csum_block_add(sum, next, off);
711 off += v.bv_len;
712 }),({
713 next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len,
714 v.iov_base,
715 v.iov_len, 0);
716 sum = csum_block_add(sum, next, off);
717 off += v.iov_len;
718 })
719 )
720 *csum = sum;
721 return bytes;
722}
723EXPORT_SYMBOL(csum_and_copy_to_iter);
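[note] Both checksum-copy routines above accumulate per-segment partial sums with csum_block_add(), passing the running byte offset so odd-offset segments fold in correctly. A hedged illustration of that folding pattern outside the iterator machinery (assumes <net/checksum.h>; not part of the patch):

	#include <net/checksum.h>

	/* Checksum two buffers as if they were one contiguous region. */
	static __wsum example_csum_two(const void *a, size_t alen,
				       const void *b, size_t blen)
	{
		__wsum sum = csum_partial(a, alen, 0);

		/* The second partial sum starts at byte offset alen of the whole. */
		return csum_block_add(sum, csum_partial(b, blen, 0), alen);
	}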
724
725int iov_iter_npages(const struct iov_iter *i, int maxpages)
726{
727 size_t size = i->count;
728 int npages = 0;
729
730 if (!size)
731 return 0;
732
733 iterate_all_kinds(i, size, v, ({
734 unsigned long p = (unsigned long)v.iov_base;
735 npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
736 - p / PAGE_SIZE;
737 if (npages >= maxpages)
738 return maxpages;
739 0;}),({
740 npages++;
741 if (npages >= maxpages)
742 return maxpages;
743 }),({
744 unsigned long p = (unsigned long)v.iov_base;
745 npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
746 - p / PAGE_SIZE;
747 if (npages >= maxpages)
748 return maxpages;
749 })
750 )
751 return npages;
752}
753EXPORT_SYMBOL(iov_iter_npages);
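[note] Taken together, the constructors above (iov_iter_kvec(), iov_iter_bvec()) plus iov_iter_npages() let a caller describe a buffer once and then ask how many pages it spans before copying through it. A hedged usage sketch with illustrative names (assumes <linux/uio.h>; direction flags as used elsewhere in this file):

	#include <linux/kernel.h>
	#include <linux/uio.h>

	static int example_describe_kbuf(void *buf, size_t len)
	{
		struct kvec kv = { .iov_base = buf, .iov_len = len };
		struct iov_iter iter;

		iov_iter_kvec(&iter, ITER_KVEC | READ, &kv, 1, len);

		/* Upper bound on the pages the buffer touches, capped at INT_MAX;
		 * &iter can then be handed to copy_to_iter()/copy_from_iter(). */
		return iov_iter_npages(&iter, INT_MAX);
	}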
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 78fee632a7ee..6c513a63ea84 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -29,6 +29,7 @@
29#include <linux/stacktrace.h> 29#include <linux/stacktrace.h>
30#include <linux/string.h> 30#include <linux/string.h>
31#include <linux/types.h> 31#include <linux/types.h>
32#include <linux/vmalloc.h>
32#include <linux/kasan.h> 33#include <linux/kasan.h>
33 34
34#include "kasan.h" 35#include "kasan.h"
@@ -388,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size)
388 kasan_kmalloc(page->slab_cache, object, size); 389 kasan_kmalloc(page->slab_cache, object, size);
389} 390}
390 391
392void kasan_kfree(void *ptr)
393{
394 struct page *page;
395
396 page = virt_to_head_page(ptr);
397
398 if (unlikely(!PageSlab(page)))
399 kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
400 KASAN_FREE_PAGE);
401 else
402 kasan_slab_free(page->slab_cache, ptr);
403}
404
391void kasan_kfree_large(const void *ptr) 405void kasan_kfree_large(const void *ptr)
392{ 406{
393 struct page *page = virt_to_page(ptr); 407 struct page *page = virt_to_page(ptr);
@@ -414,12 +428,19 @@ int kasan_module_alloc(void *addr, size_t size)
414 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 428 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
415 PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, 429 PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
416 __builtin_return_address(0)); 430 __builtin_return_address(0));
417 return ret ? 0 : -ENOMEM; 431
432 if (ret) {
433 find_vm_area(addr)->flags |= VM_KASAN;
434 return 0;
435 }
436
437 return -ENOMEM;
418} 438}
419 439
420void kasan_module_free(void *addr) 440void kasan_free_shadow(const struct vm_struct *vm)
421{ 441{
422 vfree(kasan_mem_to_shadow(addr)); 442 if (vm->flags & VM_KASAN)
443 vfree(kasan_mem_to_shadow(vm->addr));
423} 444}
424 445
425static void register_global(struct kasan_global *global) 446static void register_global(struct kasan_global *global)
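[note] The kasan hunks above add kasan_kfree(), which dispatches between slab and page-allocator poisoning by inspecting the head page, and replace kasan_module_free() with kasan_free_shadow(), which only acts on vmalloc areas tagged VM_KASAN when their shadow was set up in kasan_module_alloc(). The exact call site is not part of this diff; a hedged teardown-side sketch:

	/* Hypothetical vmalloc teardown step: release the shadow first. */
	static void example_release_area(const struct vm_struct *area)
	{
		kasan_free_shadow(area);	/* no-op unless area->flags has VM_KASAN */
		/* ... then free the area's own pages and the vm_struct itself ... */
	}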
diff --git a/mm/ksm.c b/mm/ksm.c
index 4162dce2eb44..7ee101eaacdf 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -542,7 +542,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
542 expected_mapping = (void *)stable_node + 542 expected_mapping = (void *)stable_node +
543 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 543 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
544again: 544again:
545 kpfn = ACCESS_ONCE(stable_node->kpfn); 545 kpfn = READ_ONCE(stable_node->kpfn);
546 page = pfn_to_page(kpfn); 546 page = pfn_to_page(kpfn);
547 547
548 /* 548 /*
@@ -551,7 +551,7 @@ again:
551 * but on Alpha we need to be more careful. 551 * but on Alpha we need to be more careful.
552 */ 552 */
553 smp_read_barrier_depends(); 553 smp_read_barrier_depends();
554 if (ACCESS_ONCE(page->mapping) != expected_mapping) 554 if (READ_ONCE(page->mapping) != expected_mapping)
555 goto stale; 555 goto stale;
556 556
557 /* 557 /*
@@ -577,14 +577,14 @@ again:
577 cpu_relax(); 577 cpu_relax();
578 } 578 }
579 579
580 if (ACCESS_ONCE(page->mapping) != expected_mapping) { 580 if (READ_ONCE(page->mapping) != expected_mapping) {
581 put_page(page); 581 put_page(page);
582 goto stale; 582 goto stale;
583 } 583 }
584 584
585 if (lock_it) { 585 if (lock_it) {
586 lock_page(page); 586 lock_page(page);
587 if (ACCESS_ONCE(page->mapping) != expected_mapping) { 587 if (READ_ONCE(page->mapping) != expected_mapping) {
588 unlock_page(page); 588 unlock_page(page);
589 put_page(page); 589 put_page(page);
590 goto stale; 590 goto stale;
@@ -600,7 +600,7 @@ stale:
600 * before checking whether node->kpfn has been changed. 600 * before checking whether node->kpfn has been changed.
601 */ 601 */
602 smp_rmb(); 602 smp_rmb();
603 if (ACCESS_ONCE(stable_node->kpfn) != kpfn) 603 if (READ_ONCE(stable_node->kpfn) != kpfn)
604 goto again; 604 goto again;
605 remove_node_from_stable_tree(stable_node); 605 remove_node_from_stable_tree(stable_node);
606 return NULL; 606 return NULL;
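[note] The ksm.c changes are a mechanical ACCESS_ONCE() -> READ_ONCE() conversion: both force a single, non-reordered load of the annotated location, but READ_ONCE() also handles non-scalar types and pairs naturally with WRITE_ONCE(). The before/after shape, lifted straight from the hunks above:

	/* Before: relies on a volatile cast that only works for scalars. */
	kpfn = ACCESS_ONCE(stable_node->kpfn);

	/* After: preferred single-load annotation, same semantics here. */
	kpfn = READ_ONCE(stable_node->kpfn);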
diff --git a/mm/memblock.c b/mm/memblock.c
index 252b77bdf65e..9318b567ed79 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -580,10 +580,24 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
580 return memblock_add_range(&memblock.memory, base, size, nid, 0); 580 return memblock_add_range(&memblock.memory, base, size, nid, 0);
581} 581}
582 582
583static int __init_memblock memblock_add_region(phys_addr_t base,
584 phys_addr_t size,
585 int nid,
586 unsigned long flags)
587{
588 struct memblock_type *_rgn = &memblock.memory;
589
590 memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
591 (unsigned long long)base,
592 (unsigned long long)base + size - 1,
593 flags, (void *)_RET_IP_);
594
595 return memblock_add_range(_rgn, base, size, nid, flags);
596}
597
583int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 598int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
584{ 599{
585 return memblock_add_range(&memblock.memory, base, size, 600 return memblock_add_region(base, size, MAX_NUMNODES, 0);
586 MAX_NUMNODES, 0);
587} 601}
588 602
589/** 603/**
@@ -699,14 +713,14 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
699 int nid, 713 int nid,
700 unsigned long flags) 714 unsigned long flags)
701{ 715{
702 struct memblock_type *_rgn = &memblock.reserved; 716 struct memblock_type *type = &memblock.reserved;
703 717
704 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", 718 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
705 (unsigned long long)base, 719 (unsigned long long)base,
706 (unsigned long long)base + size - 1, 720 (unsigned long long)base + size - 1,
707 flags, (void *)_RET_IP_); 721 flags, (void *)_RET_IP_);
708 722
709 return memblock_add_range(_rgn, base, size, nid, flags); 723 return memblock_add_range(type, base, size, nid, flags);
710} 724}
711 725
712int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 726int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
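[note] With the new memblock_add_region() wrapper, memblock_add() now emits the same memblock_dbg() trace that memblock_reserve() already did, so early-boot registrations can be audited symmetrically. A hedged arch-setup sketch (addresses and sizes are made up; assumes <linux/memblock.h> and <linux/sizes.h>):

	#include <linux/memblock.h>
	#include <linux/sizes.h>

	void __init example_arch_register_memory(void)
	{
		/* Both calls now log [base, end) plus flags under memblock=debug. */
		memblock_add(0x80000000, SZ_256M);	/* usable RAM */
		memblock_reserve(0x80000000, SZ_1M);	/* firmware/boot data */
	}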
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9fe07692eaad..14c2f2017e37 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -14,6 +14,12 @@
14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal 15 * Authors: Glauber Costa and Suleiman Souhlal
16 * 16 *
17 * Native page reclaim
18 * Charge lifetime sanitation
19 * Lockless page tracking & accounting
20 * Unified hierarchy configuration model
21 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
22 *
17 * This program is free software; you can redistribute it and/or modify 23 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by 24 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation; either version 2 of the License, or 25 * the Free Software Foundation; either version 2 of the License, or
@@ -253,11 +259,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
253 * page cache and RSS per cgroup. We would eventually like to provide 259 * page cache and RSS per cgroup. We would eventually like to provide
254 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 260 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
255 * to help the administrator determine what knobs to tune. 261 * to help the administrator determine what knobs to tune.
256 *
257 * TODO: Add a water mark for the memory controller. Reclaim will begin when
258 * we hit the water mark. May be even add a low water mark, such that
259 * no reclaim occurs from a cgroup at it's low water mark, this is
260 * a feature that will be implemented much later in the future.
261 */ 262 */
262struct mem_cgroup { 263struct mem_cgroup {
263 struct cgroup_subsys_state css; 264 struct cgroup_subsys_state css;
@@ -454,6 +455,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
454 return memcg->css.id; 455 return memcg->css.id;
455} 456}
456 457
458/*
459 * A helper function to get mem_cgroup from ID. must be called under
460 * rcu_read_lock(). The caller is responsible for calling
461 * css_tryget_online() if the mem_cgroup is used for charging. (dropping
462 * refcnt from swap can be called against removed memcg.)
463 */
457static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 464static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
458{ 465{
459 struct cgroup_subsys_state *css; 466 struct cgroup_subsys_state *css;
@@ -667,7 +674,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
667static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 674static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
668{ 675{
669 unsigned long nr_pages = page_counter_read(&memcg->memory); 676 unsigned long nr_pages = page_counter_read(&memcg->memory);
670 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); 677 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
671 unsigned long excess = 0; 678 unsigned long excess = 0;
672 679
673 if (nr_pages > soft_limit) 680 if (nr_pages > soft_limit)
@@ -1035,7 +1042,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1035 goto out_unlock; 1042 goto out_unlock;
1036 1043
1037 do { 1044 do {
1038 pos = ACCESS_ONCE(iter->position); 1045 pos = READ_ONCE(iter->position);
1039 /* 1046 /*
1040 * A racing update may change the position and 1047 * A racing update may change the position and
1041 * put the last reference, hence css_tryget(), 1048 * put the last reference, hence css_tryget(),
@@ -1352,13 +1359,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1352 unsigned long limit; 1359 unsigned long limit;
1353 1360
1354 count = page_counter_read(&memcg->memory); 1361 count = page_counter_read(&memcg->memory);
1355 limit = ACCESS_ONCE(memcg->memory.limit); 1362 limit = READ_ONCE(memcg->memory.limit);
1356 if (count < limit) 1363 if (count < limit)
1357 margin = limit - count; 1364 margin = limit - count;
1358 1365
1359 if (do_swap_account) { 1366 if (do_swap_account) {
1360 count = page_counter_read(&memcg->memsw); 1367 count = page_counter_read(&memcg->memsw);
1361 limit = ACCESS_ONCE(memcg->memsw.limit); 1368 limit = READ_ONCE(memcg->memsw.limit);
1362 if (count <= limit) 1369 if (count <= limit)
1363 margin = min(margin, limit - count); 1370 margin = min(margin, limit - count);
1364 } 1371 }
@@ -1436,15 +1443,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1436 struct mem_cgroup *iter; 1443 struct mem_cgroup *iter;
1437 unsigned int i; 1444 unsigned int i;
1438 1445
1439 if (!p)
1440 return;
1441
1442 mutex_lock(&oom_info_lock); 1446 mutex_lock(&oom_info_lock);
1443 rcu_read_lock(); 1447 rcu_read_lock();
1444 1448
1445 pr_info("Task in "); 1449 if (p) {
1446 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1450 pr_info("Task in ");
1447 pr_cont(" killed as a result of limit of "); 1451 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1452 pr_cont(" killed as a result of limit of ");
1453 } else {
1454 pr_info("Memory limit reached of cgroup ");
1455 }
1456
1448 pr_cont_cgroup_path(memcg->css.cgroup); 1457 pr_cont_cgroup_path(memcg->css.cgroup);
1449 pr_cont("\n"); 1458 pr_cont("\n");
1450 1459
@@ -1531,7 +1540,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1531 return; 1540 return;
1532 } 1541 }
1533 1542
1534 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1543 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
1535 totalpages = mem_cgroup_get_limit(memcg) ? : 1; 1544 totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1536 for_each_mem_cgroup_tree(iter, memcg) { 1545 for_each_mem_cgroup_tree(iter, memcg) {
1537 struct css_task_iter it; 1546 struct css_task_iter it;
@@ -2341,20 +2350,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2341} 2350}
2342 2351
2343/* 2352/*
2344 * A helper function to get mem_cgroup from ID. must be called under
2345 * rcu_read_lock(). The caller is responsible for calling
2346 * css_tryget_online() if the mem_cgroup is used for charging. (dropping
2347 * refcnt from swap can be called against removed memcg.)
2348 */
2349static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2350{
2351 /* ID 0 is unused ID */
2352 if (!id)
2353 return NULL;
2354 return mem_cgroup_from_id(id);
2355}
2356
2357/*
2358 * try_get_mem_cgroup_from_page - look up page's memcg association 2353 * try_get_mem_cgroup_from_page - look up page's memcg association
2359 * @page: the page 2354 * @page: the page
2360 * 2355 *
@@ -2380,7 +2375,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2380 ent.val = page_private(page); 2375 ent.val = page_private(page);
2381 id = lookup_swap_cgroup_id(ent); 2376 id = lookup_swap_cgroup_id(ent);
2382 rcu_read_lock(); 2377 rcu_read_lock();
2383 memcg = mem_cgroup_lookup(id); 2378 memcg = mem_cgroup_from_id(id);
2384 if (memcg && !css_tryget_online(&memcg->css)) 2379 if (memcg && !css_tryget_online(&memcg->css))
2385 memcg = NULL; 2380 memcg = NULL;
2386 rcu_read_unlock(); 2381 rcu_read_unlock();
@@ -2642,7 +2637,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2642 return cachep; 2637 return cachep;
2643 2638
2644 memcg = get_mem_cgroup_from_mm(current->mm); 2639 memcg = get_mem_cgroup_from_mm(current->mm);
2645 kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); 2640 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2646 if (kmemcg_id < 0) 2641 if (kmemcg_id < 0)
2647 goto out; 2642 goto out;
2648 2643
@@ -2779,92 +2774,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
2779} 2774}
2780#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2775#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2781 2776
2782/**
2783 * mem_cgroup_move_account - move account of the page
2784 * @page: the page
2785 * @nr_pages: number of regular pages (>1 for huge pages)
2786 * @from: mem_cgroup which the page is moved from.
2787 * @to: mem_cgroup which the page is moved to. @from != @to.
2788 *
2789 * The caller must confirm following.
2790 * - page is not on LRU (isolate_page() is useful.)
2791 * - compound_lock is held when nr_pages > 1
2792 *
2793 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
2794 * from old cgroup.
2795 */
2796static int mem_cgroup_move_account(struct page *page,
2797 unsigned int nr_pages,
2798 struct mem_cgroup *from,
2799 struct mem_cgroup *to)
2800{
2801 unsigned long flags;
2802 int ret;
2803
2804 VM_BUG_ON(from == to);
2805 VM_BUG_ON_PAGE(PageLRU(page), page);
2806 /*
2807 * The page is isolated from LRU. So, collapse function
2808 * will not handle this page. But page splitting can happen.
2809 * Do this check under compound_page_lock(). The caller should
2810 * hold it.
2811 */
2812 ret = -EBUSY;
2813 if (nr_pages > 1 && !PageTransHuge(page))
2814 goto out;
2815
2816 /*
2817 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
2818 * of its source page while we change it: page migration takes
2819 * both pages off the LRU, but page cache replacement doesn't.
2820 */
2821 if (!trylock_page(page))
2822 goto out;
2823
2824 ret = -EINVAL;
2825 if (page->mem_cgroup != from)
2826 goto out_unlock;
2827
2828 spin_lock_irqsave(&from->move_lock, flags);
2829
2830 if (!PageAnon(page) && page_mapped(page)) {
2831 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
2832 nr_pages);
2833 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
2834 nr_pages);
2835 }
2836
2837 if (PageWriteback(page)) {
2838 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
2839 nr_pages);
2840 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
2841 nr_pages);
2842 }
2843
2844 /*
2845 * It is safe to change page->mem_cgroup here because the page
2846 * is referenced, charged, and isolated - we can't race with
2847 * uncharging, charging, migration, or LRU putback.
2848 */
2849
2850 /* caller should have done css_get */
2851 page->mem_cgroup = to;
2852 spin_unlock_irqrestore(&from->move_lock, flags);
2853
2854 ret = 0;
2855
2856 local_irq_disable();
2857 mem_cgroup_charge_statistics(to, page, nr_pages);
2858 memcg_check_events(to, page);
2859 mem_cgroup_charge_statistics(from, page, -nr_pages);
2860 memcg_check_events(from, page);
2861 local_irq_enable();
2862out_unlock:
2863 unlock_page(page);
2864out:
2865 return ret;
2866}
2867
2868#ifdef CONFIG_MEMCG_SWAP 2777#ifdef CONFIG_MEMCG_SWAP
2869static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 2778static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
2870 bool charge) 2779 bool charge)
@@ -4816,6 +4725,92 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4816 return page; 4725 return page;
4817} 4726}
4818 4727
4728/**
4729 * mem_cgroup_move_account - move account of the page
4730 * @page: the page
4731 * @nr_pages: number of regular pages (>1 for huge pages)
4732 * @from: mem_cgroup which the page is moved from.
4733 * @to: mem_cgroup which the page is moved to. @from != @to.
4734 *
4735 * The caller must confirm following.
4736 * - page is not on LRU (isolate_page() is useful.)
4737 * - compound_lock is held when nr_pages > 1
4738 *
4739 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
4740 * from old cgroup.
4741 */
4742static int mem_cgroup_move_account(struct page *page,
4743 unsigned int nr_pages,
4744 struct mem_cgroup *from,
4745 struct mem_cgroup *to)
4746{
4747 unsigned long flags;
4748 int ret;
4749
4750 VM_BUG_ON(from == to);
4751 VM_BUG_ON_PAGE(PageLRU(page), page);
4752 /*
4753 * The page is isolated from LRU. So, collapse function
4754 * will not handle this page. But page splitting can happen.
4755 * Do this check under compound_page_lock(). The caller should
4756 * hold it.
4757 */
4758 ret = -EBUSY;
4759 if (nr_pages > 1 && !PageTransHuge(page))
4760 goto out;
4761
4762 /*
4763 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
4764 * of its source page while we change it: page migration takes
4765 * both pages off the LRU, but page cache replacement doesn't.
4766 */
4767 if (!trylock_page(page))
4768 goto out;
4769
4770 ret = -EINVAL;
4771 if (page->mem_cgroup != from)
4772 goto out_unlock;
4773
4774 spin_lock_irqsave(&from->move_lock, flags);
4775
4776 if (!PageAnon(page) && page_mapped(page)) {
4777 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4778 nr_pages);
4779 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4780 nr_pages);
4781 }
4782
4783 if (PageWriteback(page)) {
4784 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4785 nr_pages);
4786 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4787 nr_pages);
4788 }
4789
4790 /*
4791 * It is safe to change page->mem_cgroup here because the page
4792 * is referenced, charged, and isolated - we can't race with
4793 * uncharging, charging, migration, or LRU putback.
4794 */
4795
4796 /* caller should have done css_get */
4797 page->mem_cgroup = to;
4798 spin_unlock_irqrestore(&from->move_lock, flags);
4799
4800 ret = 0;
4801
4802 local_irq_disable();
4803 mem_cgroup_charge_statistics(to, page, nr_pages);
4804 memcg_check_events(to, page);
4805 mem_cgroup_charge_statistics(from, page, -nr_pages);
4806 memcg_check_events(from, page);
4807 local_irq_enable();
4808out_unlock:
4809 unlock_page(page);
4810out:
4811 return ret;
4812}
4813
4819static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 4814static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4820 unsigned long addr, pte_t ptent, union mc_target *target) 4815 unsigned long addr, pte_t ptent, union mc_target *target)
4821{ 4816{
@@ -5012,7 +5007,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5012 * tunable will only affect upcoming migrations, not the current one. 5007 * tunable will only affect upcoming migrations, not the current one.
5013 * So we need to save it, and keep it going. 5008 * So we need to save it, and keep it going.
5014 */ 5009 */
5015 move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); 5010 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5016 if (move_flags) { 5011 if (move_flags) {
5017 struct mm_struct *mm; 5012 struct mm_struct *mm;
5018 struct mem_cgroup *from = mem_cgroup_from_task(p); 5013 struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -5232,7 +5227,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5232 * on for the root memcg is enough. 5227 * on for the root memcg is enough.
5233 */ 5228 */
5234 if (cgroup_on_dfl(root_css->cgroup)) 5229 if (cgroup_on_dfl(root_css->cgroup))
5235 mem_cgroup_from_css(root_css)->use_hierarchy = true; 5230 root_mem_cgroup->use_hierarchy = true;
5231 else
5232 root_mem_cgroup->use_hierarchy = false;
5236} 5233}
5237 5234
5238static u64 memory_current_read(struct cgroup_subsys_state *css, 5235static u64 memory_current_read(struct cgroup_subsys_state *css,
@@ -5244,7 +5241,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
5244static int memory_low_show(struct seq_file *m, void *v) 5241static int memory_low_show(struct seq_file *m, void *v)
5245{ 5242{
5246 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5243 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5247 unsigned long low = ACCESS_ONCE(memcg->low); 5244 unsigned long low = READ_ONCE(memcg->low);
5248 5245
5249 if (low == PAGE_COUNTER_MAX) 5246 if (low == PAGE_COUNTER_MAX)
5250 seq_puts(m, "max\n"); 5247 seq_puts(m, "max\n");
@@ -5274,7 +5271,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
5274static int memory_high_show(struct seq_file *m, void *v) 5271static int memory_high_show(struct seq_file *m, void *v)
5275{ 5272{
5276 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5273 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5277 unsigned long high = ACCESS_ONCE(memcg->high); 5274 unsigned long high = READ_ONCE(memcg->high);
5278 5275
5279 if (high == PAGE_COUNTER_MAX) 5276 if (high == PAGE_COUNTER_MAX)
5280 seq_puts(m, "max\n"); 5277 seq_puts(m, "max\n");
@@ -5304,7 +5301,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
5304static int memory_max_show(struct seq_file *m, void *v) 5301static int memory_max_show(struct seq_file *m, void *v)
5305{ 5302{
5306 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5303 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5307 unsigned long max = ACCESS_ONCE(memcg->memory.limit); 5304 unsigned long max = READ_ONCE(memcg->memory.limit);
5308 5305
5309 if (max == PAGE_COUNTER_MAX) 5306 if (max == PAGE_COUNTER_MAX)
5310 seq_puts(m, "max\n"); 5307 seq_puts(m, "max\n");
@@ -5859,7 +5856,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
5859 5856
5860 id = swap_cgroup_record(entry, 0); 5857 id = swap_cgroup_record(entry, 0);
5861 rcu_read_lock(); 5858 rcu_read_lock();
5862 memcg = mem_cgroup_lookup(id); 5859 memcg = mem_cgroup_from_id(id);
5863 if (memcg) { 5860 if (memcg) {
5864 if (!mem_cgroup_is_root(memcg)) 5861 if (!mem_cgroup_is_root(memcg))
5865 page_counter_uncharge(&memcg->memsw, 1); 5862 page_counter_uncharge(&memcg->memsw, 1);
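[note] With mem_cgroup_lookup() folded into mem_cgroup_from_id(), every ID-based lookup in this file now follows the pattern documented in the comment added earlier in the diff: resolve the ID under rcu_read_lock() and take css_tryget_online() only if the memcg will actually be charged. A sketch of that pattern as it appears inside memcontrol.c (function name is illustrative):

	static struct mem_cgroup *example_get_memcg_by_id(unsigned short id)
	{
		struct mem_cgroup *memcg;

		rcu_read_lock();
		memcg = mem_cgroup_from_id(id);
		if (memcg && !css_tryget_online(&memcg->css))
			memcg = NULL;		/* cgroup went offline under us */
		rcu_read_unlock();

		return memcg;			/* caller drops the css ref when done */
	}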
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d487f8dc6d39..d9359b770cd9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -521,6 +521,52 @@ static const char *action_name[] = {
521 [RECOVERED] = "Recovered", 521 [RECOVERED] = "Recovered",
522}; 522};
523 523
524enum action_page_type {
525 MSG_KERNEL,
526 MSG_KERNEL_HIGH_ORDER,
527 MSG_SLAB,
528 MSG_DIFFERENT_COMPOUND,
529 MSG_POISONED_HUGE,
530 MSG_HUGE,
531 MSG_FREE_HUGE,
532 MSG_UNMAP_FAILED,
533 MSG_DIRTY_SWAPCACHE,
534 MSG_CLEAN_SWAPCACHE,
535 MSG_DIRTY_MLOCKED_LRU,
536 MSG_CLEAN_MLOCKED_LRU,
537 MSG_DIRTY_UNEVICTABLE_LRU,
538 MSG_CLEAN_UNEVICTABLE_LRU,
539 MSG_DIRTY_LRU,
540 MSG_CLEAN_LRU,
541 MSG_TRUNCATED_LRU,
542 MSG_BUDDY,
543 MSG_BUDDY_2ND,
544 MSG_UNKNOWN,
545};
546
547static const char * const action_page_types[] = {
548 [MSG_KERNEL] = "reserved kernel page",
549 [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
550 [MSG_SLAB] = "kernel slab page",
551 [MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
552 [MSG_POISONED_HUGE] = "huge page already hardware poisoned",
553 [MSG_HUGE] = "huge page",
554 [MSG_FREE_HUGE] = "free huge page",
555 [MSG_UNMAP_FAILED] = "unmapping failed page",
556 [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
557 [MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
558 [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
559 [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
560 [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
561 [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
562 [MSG_DIRTY_LRU] = "dirty LRU page",
563 [MSG_CLEAN_LRU] = "clean LRU page",
564 [MSG_TRUNCATED_LRU] = "already truncated LRU page",
565 [MSG_BUDDY] = "free buddy page",
566 [MSG_BUDDY_2ND] = "free buddy page (2nd try)",
567 [MSG_UNKNOWN] = "unknown page",
568};
569
524/* 570/*
525 * XXX: It is possible that a page is isolated from LRU cache, 571 * XXX: It is possible that a page is isolated from LRU cache,
526 * and then kept in swap cache or failed to remove from page cache. 572 * and then kept in swap cache or failed to remove from page cache.
@@ -777,10 +823,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
777static struct page_state { 823static struct page_state {
778 unsigned long mask; 824 unsigned long mask;
779 unsigned long res; 825 unsigned long res;
780 char *msg; 826 enum action_page_type type;
781 int (*action)(struct page *p, unsigned long pfn); 827 int (*action)(struct page *p, unsigned long pfn);
782} error_states[] = { 828} error_states[] = {
783 { reserved, reserved, "reserved kernel", me_kernel }, 829 { reserved, reserved, MSG_KERNEL, me_kernel },
784 /* 830 /*
785 * free pages are specially detected outside this table: 831 * free pages are specially detected outside this table:
786 * PG_buddy pages only make a small fraction of all free pages. 832 * PG_buddy pages only make a small fraction of all free pages.
@@ -791,31 +837,31 @@ static struct page_state {
791 * currently unused objects without touching them. But just 837 * currently unused objects without touching them. But just
792 * treat it as standard kernel for now. 838 * treat it as standard kernel for now.
793 */ 839 */
794 { slab, slab, "kernel slab", me_kernel }, 840 { slab, slab, MSG_SLAB, me_kernel },
795 841
796#ifdef CONFIG_PAGEFLAGS_EXTENDED 842#ifdef CONFIG_PAGEFLAGS_EXTENDED
797 { head, head, "huge", me_huge_page }, 843 { head, head, MSG_HUGE, me_huge_page },
798 { tail, tail, "huge", me_huge_page }, 844 { tail, tail, MSG_HUGE, me_huge_page },
799#else 845#else
800 { compound, compound, "huge", me_huge_page }, 846 { compound, compound, MSG_HUGE, me_huge_page },
801#endif 847#endif
802 848
803 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, 849 { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
804 { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, 850 { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
805 851
806 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, 852 { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
807 { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, 853 { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
808 854
809 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, 855 { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
810 { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, 856 { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
811 857
812 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, 858 { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty },
813 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 859 { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean },
814 860
815 /* 861 /*
816 * Catchall entry: must be at end. 862 * Catchall entry: must be at end.
817 */ 863 */
818 { 0, 0, "unknown page state", me_unknown }, 864 { 0, 0, MSG_UNKNOWN, me_unknown },
819}; 865};
820 866
821#undef dirty 867#undef dirty
@@ -835,10 +881,10 @@ static struct page_state {
835 * "Dirty/Clean" indication is not 100% accurate due to the possibility of 881 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
836 * setting PG_dirty outside page lock. See also comment above set_page_dirty(). 882 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
837 */ 883 */
838static void action_result(unsigned long pfn, char *msg, int result) 884static void action_result(unsigned long pfn, enum action_page_type type, int result)
839{ 885{
840 pr_err("MCE %#lx: %s page recovery: %s\n", 886 pr_err("MCE %#lx: recovery action for %s: %s\n",
841 pfn, msg, action_name[result]); 887 pfn, action_page_types[type], action_name[result]);
842} 888}
843 889
844static int page_action(struct page_state *ps, struct page *p, 890static int page_action(struct page_state *ps, struct page *p,
@@ -854,11 +900,11 @@ static int page_action(struct page_state *ps, struct page *p,
854 count--; 900 count--;
855 if (count != 0) { 901 if (count != 0) {
856 printk(KERN_ERR 902 printk(KERN_ERR
857 "MCE %#lx: %s page still referenced by %d users\n", 903 "MCE %#lx: %s still referenced by %d users\n",
858 pfn, ps->msg, count); 904 pfn, action_page_types[ps->type], count);
859 result = FAILED; 905 result = FAILED;
860 } 906 }
861 action_result(pfn, ps->msg, result); 907 action_result(pfn, ps->type, result);
862 908
863 /* Could do more checks here if page looks ok */ 909 /* Could do more checks here if page looks ok */
864 /* 910 /*
@@ -1106,7 +1152,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1106 if (!(flags & MF_COUNT_INCREASED) && 1152 if (!(flags & MF_COUNT_INCREASED) &&
1107 !get_page_unless_zero(hpage)) { 1153 !get_page_unless_zero(hpage)) {
1108 if (is_free_buddy_page(p)) { 1154 if (is_free_buddy_page(p)) {
1109 action_result(pfn, "free buddy", DELAYED); 1155 action_result(pfn, MSG_BUDDY, DELAYED);
1110 return 0; 1156 return 0;
1111 } else if (PageHuge(hpage)) { 1157 } else if (PageHuge(hpage)) {
1112 /* 1158 /*
@@ -1123,12 +1169,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1123 } 1169 }
1124 set_page_hwpoison_huge_page(hpage); 1170 set_page_hwpoison_huge_page(hpage);
1125 res = dequeue_hwpoisoned_huge_page(hpage); 1171 res = dequeue_hwpoisoned_huge_page(hpage);
1126 action_result(pfn, "free huge", 1172 action_result(pfn, MSG_FREE_HUGE,
1127 res ? IGNORED : DELAYED); 1173 res ? IGNORED : DELAYED);
1128 unlock_page(hpage); 1174 unlock_page(hpage);
1129 return res; 1175 return res;
1130 } else { 1176 } else {
1131 action_result(pfn, "high order kernel", IGNORED); 1177 action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED);
1132 return -EBUSY; 1178 return -EBUSY;
1133 } 1179 }
1134 } 1180 }
@@ -1150,9 +1196,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1150 */ 1196 */
1151 if (is_free_buddy_page(p)) { 1197 if (is_free_buddy_page(p)) {
1152 if (flags & MF_COUNT_INCREASED) 1198 if (flags & MF_COUNT_INCREASED)
1153 action_result(pfn, "free buddy", DELAYED); 1199 action_result(pfn, MSG_BUDDY, DELAYED);
1154 else 1200 else
1155 action_result(pfn, "free buddy, 2nd try", DELAYED); 1201 action_result(pfn, MSG_BUDDY_2ND,
1202 DELAYED);
1156 return 0; 1203 return 0;
1157 } 1204 }
1158 } 1205 }
@@ -1165,7 +1212,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1165 * If this happens just bail out. 1212 * If this happens just bail out.
1166 */ 1213 */
1167 if (compound_head(p) != hpage) { 1214 if (compound_head(p) != hpage) {
1168 action_result(pfn, "different compound page after locking", IGNORED); 1215 action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED);
1169 res = -EBUSY; 1216 res = -EBUSY;
1170 goto out; 1217 goto out;
1171 } 1218 }
@@ -1205,8 +1252,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1205 * on the head page to show that the hugepage is hwpoisoned 1252 * on the head page to show that the hugepage is hwpoisoned
1206 */ 1253 */
1207 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { 1254 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1208 action_result(pfn, "hugepage already hardware poisoned", 1255 action_result(pfn, MSG_POISONED_HUGE, IGNORED);
1209 IGNORED);
1210 unlock_page(hpage); 1256 unlock_page(hpage);
1211 put_page(hpage); 1257 put_page(hpage);
1212 return 0; 1258 return 0;
@@ -1235,7 +1281,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1235 */ 1281 */
1236 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) 1282 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
1237 != SWAP_SUCCESS) { 1283 != SWAP_SUCCESS) {
1238 action_result(pfn, "unmapping failed", IGNORED); 1284 action_result(pfn, MSG_UNMAP_FAILED, IGNORED);
1239 res = -EBUSY; 1285 res = -EBUSY;
1240 goto out; 1286 goto out;
1241 } 1287 }
@@ -1244,7 +1290,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1244 * Torn down by someone else? 1290 * Torn down by someone else?
1245 */ 1291 */
1246 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 1292 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1247 action_result(pfn, "already truncated LRU", IGNORED); 1293 action_result(pfn, MSG_TRUNCATED_LRU, IGNORED);
1248 res = -EBUSY; 1294 res = -EBUSY;
1249 goto out; 1295 goto out;
1250 } 1296 }
@@ -1540,8 +1586,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
1540 } 1586 }
1541 unlock_page(hpage); 1587 unlock_page(hpage);
1542 1588
1543 /* Keep page count to indicate a given hugepage is isolated. */ 1589 ret = isolate_huge_page(hpage, &pagelist);
1544 list_move(&hpage->lru, &pagelist); 1590 if (ret) {
1591 /*
1592 * get_any_page() and isolate_huge_page() takes a refcount each,
1593 * so need to drop one here.
1594 */
1595 put_page(hpage);
1596 } else {
1597 pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
1598 return -EBUSY;
1599 }
1600
1545 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, 1601 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1546 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1602 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1547 if (ret) { 1603 if (ret) {
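[note] The enum action_page_type plus the action_page_types[] string table replace the ad-hoc message strings, so reporting becomes a matter of passing a symbolic type to action_result(). An illustrative call, matching the converted sites above (the surrounding handler is hypothetical):

	/* Prints: "MCE %#lx: recovery action for dirty LRU page: <result name>" */
	static void example_report_result(unsigned long pfn, int result)
	{
		action_result(pfn, MSG_DIRTY_LRU, result);
	}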
diff --git a/mm/memory.c b/mm/memory.c
index 8068893697bb..22e037e3364e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
690 /* 690 /*
691 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y 691 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
692 */ 692 */
693 if (vma->vm_ops) 693 pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
694 printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", 694 vma->vm_file,
695 vma->vm_ops->fault); 695 vma->vm_ops ? vma->vm_ops->fault : NULL,
696 if (vma->vm_file) 696 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
697 printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", 697 mapping ? mapping->a_ops->readpage : NULL);
698 vma->vm_file->f_op->mmap);
699 dump_stack(); 698 dump_stack();
700 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 699 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
701} 700}
@@ -1983,167 +1982,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
1983} 1982}
1984 1983
1985/* 1984/*
1986 * This routine handles present pages, when users try to write 1985 * Handle write page faults for pages that can be reused in the current vma
1987 * to a shared page. It is done by copying the page to a new address
1988 * and decrementing the shared-page counter for the old page.
1989 * 1986 *
1990 * Note that this routine assumes that the protection checks have been 1987 * This can happen either due to the mapping being with the VM_SHARED flag,
1991 * done by the caller (the low-level page fault routine in most cases). 1988 * or due to us being the last reference standing to the page. In either
1992 * Thus we can safely just mark it writable once we've done any necessary 1989 * case, all we need to do here is to mark the page as writable and update
1993 * COW. 1990 * any related book-keeping.
1994 *
1995 * We also mark the page dirty at this point even though the page will
1996 * change only once the write actually happens. This avoids a few races,
1997 * and potentially makes it more efficient.
1998 *
1999 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2000 * but allow concurrent faults), with pte both mapped and locked.
2001 * We return with mmap_sem still held, but pte unmapped and unlocked.
2002 */ 1991 */
2003static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 1992static inline int wp_page_reuse(struct mm_struct *mm,
2004 unsigned long address, pte_t *page_table, pmd_t *pmd, 1993 struct vm_area_struct *vma, unsigned long address,
2005 spinlock_t *ptl, pte_t orig_pte) 1994 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
1995 struct page *page, int page_mkwrite,
1996 int dirty_shared)
2006 __releases(ptl) 1997 __releases(ptl)
2007{ 1998{
2008 struct page *old_page, *new_page = NULL;
2009 pte_t entry; 1999 pte_t entry;
2010 int ret = 0;
2011 int page_mkwrite = 0;
2012 bool dirty_shared = false;
2013 unsigned long mmun_start = 0; /* For mmu_notifiers */
2014 unsigned long mmun_end = 0; /* For mmu_notifiers */
2015 struct mem_cgroup *memcg;
2016
2017 old_page = vm_normal_page(vma, address, orig_pte);
2018 if (!old_page) {
2019 /*
2020 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
2021 * VM_PFNMAP VMA.
2022 *
2023 * We should not cow pages in a shared writeable mapping.
2024 * Just mark the pages writable as we can't do any dirty
2025 * accounting on raw pfn maps.
2026 */
2027 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2028 (VM_WRITE|VM_SHARED))
2029 goto reuse;
2030 goto gotten;
2031 }
2032
2033 /* 2000 /*
2034 * Take out anonymous pages first, anonymous shared vmas are 2001 * Clear the pages cpupid information as the existing
2035 * not dirty accountable. 2002 * information potentially belongs to a now completely
2003 * unrelated process.
2036 */ 2004 */
2037 if (PageAnon(old_page) && !PageKsm(old_page)) { 2005 if (page)
2038 if (!trylock_page(old_page)) { 2006 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2039 page_cache_get(old_page);
2040 pte_unmap_unlock(page_table, ptl);
2041 lock_page(old_page);
2042 page_table = pte_offset_map_lock(mm, pmd, address,
2043 &ptl);
2044 if (!pte_same(*page_table, orig_pte)) {
2045 unlock_page(old_page);
2046 goto unlock;
2047 }
2048 page_cache_release(old_page);
2049 }
2050 if (reuse_swap_page(old_page)) {
2051 /*
2052 * The page is all ours. Move it to our anon_vma so
2053 * the rmap code will not search our parent or siblings.
2054 * Protected against the rmap code by the page lock.
2055 */
2056 page_move_anon_rmap(old_page, vma, address);
2057 unlock_page(old_page);
2058 goto reuse;
2059 }
2060 unlock_page(old_page);
2061 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2062 (VM_WRITE|VM_SHARED))) {
2063 page_cache_get(old_page);
2064 /*
2065 * Only catch write-faults on shared writable pages,
2066 * read-only shared pages can get COWed by
2067 * get_user_pages(.write=1, .force=1).
2068 */
2069 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2070 int tmp;
2071
2072 pte_unmap_unlock(page_table, ptl);
2073 tmp = do_page_mkwrite(vma, old_page, address);
2074 if (unlikely(!tmp || (tmp &
2075 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2076 page_cache_release(old_page);
2077 return tmp;
2078 }
2079 /*
2080 * Since we dropped the lock we need to revalidate
2081 * the PTE as someone else may have changed it. If
2082 * they did, we just return, as we can count on the
2083 * MMU to tell us if they didn't also make it writable.
2084 */
2085 page_table = pte_offset_map_lock(mm, pmd, address,
2086 &ptl);
2087 if (!pte_same(*page_table, orig_pte)) {
2088 unlock_page(old_page);
2089 goto unlock;
2090 }
2091 page_mkwrite = 1;
2092 }
2093
2094 dirty_shared = true;
2095
2096reuse:
2097 /*
2098 * Clear the pages cpupid information as the existing
2099 * information potentially belongs to a now completely
2100 * unrelated process.
2101 */
2102 if (old_page)
2103 page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
2104
2105 flush_cache_page(vma, address, pte_pfn(orig_pte));
2106 entry = pte_mkyoung(orig_pte);
2107 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2108 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2109 update_mmu_cache(vma, address, page_table);
2110 pte_unmap_unlock(page_table, ptl);
2111 ret |= VM_FAULT_WRITE;
2112 2007
2113 if (dirty_shared) { 2008 flush_cache_page(vma, address, pte_pfn(orig_pte));
2114 struct address_space *mapping; 2009 entry = pte_mkyoung(orig_pte);
2115 int dirtied; 2010 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2011 if (ptep_set_access_flags(vma, address, page_table, entry, 1))
2012 update_mmu_cache(vma, address, page_table);
2013 pte_unmap_unlock(page_table, ptl);
2116 2014
2117 if (!page_mkwrite) 2015 if (dirty_shared) {
2118 lock_page(old_page); 2016 struct address_space *mapping;
2017 int dirtied;
2119 2018
2120 dirtied = set_page_dirty(old_page); 2019 if (!page_mkwrite)
2121 VM_BUG_ON_PAGE(PageAnon(old_page), old_page); 2020 lock_page(page);
2122 mapping = old_page->mapping;
2123 unlock_page(old_page);
2124 page_cache_release(old_page);
2125 2021
2126 if ((dirtied || page_mkwrite) && mapping) { 2022 dirtied = set_page_dirty(page);
2127 /* 2023 VM_BUG_ON_PAGE(PageAnon(page), page);
2128 * Some device drivers do not set page.mapping 2024 mapping = page->mapping;
2129 * but still dirty their pages 2025 unlock_page(page);
2130 */ 2026 page_cache_release(page);
2131 balance_dirty_pages_ratelimited(mapping);
2132 }
2133 2027
2134 if (!page_mkwrite) 2028 if ((dirtied || page_mkwrite) && mapping) {
2135 file_update_time(vma->vm_file); 2029 /*
2030 * Some device drivers do not set page.mapping
2031 * but still dirty their pages
2032 */
2033 balance_dirty_pages_ratelimited(mapping);
2136 } 2034 }
2137 2035
2138 return ret; 2036 if (!page_mkwrite)
2037 file_update_time(vma->vm_file);
2139 } 2038 }
2140 2039
2141 /* 2040 return VM_FAULT_WRITE;
2142 * Ok, we need to copy. Oh, well.. 2041}
2143 */ 2042
2144 page_cache_get(old_page); 2043/*
2145gotten: 2044 * Handle the case of a page which we actually need to copy to a new page.
2146 pte_unmap_unlock(page_table, ptl); 2045 *
2046 * Called with mmap_sem locked and the old page referenced, but
2047 * without the ptl held.
2048 *
2049 * High level logic flow:
2050 *
2051 * - Allocate a page, copy the content of the old page to the new one.
2052 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
2053 * - Take the PTL. If the pte changed, bail out and release the allocated page
2054 * - If the pte is still the way we remember it, update the page table and all
2055 * relevant references. This includes dropping the reference the page-table
2056 * held to the old page, as well as updating the rmap.
2057 * - In any case, unlock the PTL and drop the reference we took to the old page.
2058 */
2059static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2060 unsigned long address, pte_t *page_table, pmd_t *pmd,
2061 pte_t orig_pte, struct page *old_page)
2062{
2063 struct page *new_page = NULL;
2064 spinlock_t *ptl = NULL;
2065 pte_t entry;
2066 int page_copied = 0;
2067 const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */
2068 const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */
2069 struct mem_cgroup *memcg;
2147 2070
2148 if (unlikely(anon_vma_prepare(vma))) 2071 if (unlikely(anon_vma_prepare(vma)))
2149 goto oom; 2072 goto oom;
@@ -2163,8 +2086,6 @@ gotten:
2163 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) 2086 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
2164 goto oom_free_new; 2087 goto oom_free_new;
2165 2088
2166 mmun_start = address & PAGE_MASK;
2167 mmun_end = mmun_start + PAGE_SIZE;
2168 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2089 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2169 2090
2170 /* 2091 /*
@@ -2177,8 +2098,9 @@ gotten:
2177 dec_mm_counter_fast(mm, MM_FILEPAGES); 2098 dec_mm_counter_fast(mm, MM_FILEPAGES);
2178 inc_mm_counter_fast(mm, MM_ANONPAGES); 2099 inc_mm_counter_fast(mm, MM_ANONPAGES);
2179 } 2100 }
2180 } else 2101 } else {
2181 inc_mm_counter_fast(mm, MM_ANONPAGES); 2102 inc_mm_counter_fast(mm, MM_ANONPAGES);
2103 }
2182 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2104 flush_cache_page(vma, address, pte_pfn(orig_pte));
2183 entry = mk_pte(new_page, vma->vm_page_prot); 2105 entry = mk_pte(new_page, vma->vm_page_prot);
2184 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2106 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2227,29 +2149,29 @@ gotten:
2227 2149
2228 /* Free the old page.. */ 2150 /* Free the old page.. */
2229 new_page = old_page; 2151 new_page = old_page;
2230 ret |= VM_FAULT_WRITE; 2152 page_copied = 1;
2231 } else 2153 } else {
2232 mem_cgroup_cancel_charge(new_page, memcg); 2154 mem_cgroup_cancel_charge(new_page, memcg);
2155 }
2233 2156
2234 if (new_page) 2157 if (new_page)
2235 page_cache_release(new_page); 2158 page_cache_release(new_page);
2236unlock: 2159
2237 pte_unmap_unlock(page_table, ptl); 2160 pte_unmap_unlock(page_table, ptl);
2238 if (mmun_end > mmun_start) 2161 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2239 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2240 if (old_page) { 2162 if (old_page) {
2241 /* 2163 /*
2242 * Don't let another task, with possibly unlocked vma, 2164 * Don't let another task, with possibly unlocked vma,
2243 * keep the mlocked page. 2165 * keep the mlocked page.
2244 */ 2166 */
2245 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { 2167 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2246 lock_page(old_page); /* LRU manipulation */ 2168 lock_page(old_page); /* LRU manipulation */
2247 munlock_vma_page(old_page); 2169 munlock_vma_page(old_page);
2248 unlock_page(old_page); 2170 unlock_page(old_page);
2249 } 2171 }
2250 page_cache_release(old_page); 2172 page_cache_release(old_page);
2251 } 2173 }
2252 return ret; 2174 return page_copied ? VM_FAULT_WRITE : 0;
2253oom_free_new: 2175oom_free_new:
2254 page_cache_release(new_page); 2176 page_cache_release(new_page);
2255oom: 2177oom:
@@ -2258,6 +2180,179 @@ oom:
2258 return VM_FAULT_OOM; 2180 return VM_FAULT_OOM;
2259} 2181}
2260 2182
2183/*
2184 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
2185 * mapping
2186 */
2187static int wp_pfn_shared(struct mm_struct *mm,
2188 struct vm_area_struct *vma, unsigned long address,
2189 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
2190 pmd_t *pmd)
2191{
2192 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2193 struct vm_fault vmf = {
2194 .page = NULL,
2195 .pgoff = linear_page_index(vma, address),
2196 .virtual_address = (void __user *)(address & PAGE_MASK),
2197 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
2198 };
2199 int ret;
2200
2201 pte_unmap_unlock(page_table, ptl);
2202 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
2203 if (ret & VM_FAULT_ERROR)
2204 return ret;
2205 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2206 /*
2207 * We might have raced with another page fault while we
2208 * released the pte_offset_map_lock.
2209 */
2210 if (!pte_same(*page_table, orig_pte)) {
2211 pte_unmap_unlock(page_table, ptl);
2212 return 0;
2213 }
2214 }
2215 return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
2216 NULL, 0, 0);
2217}
2218
2219static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2220 unsigned long address, pte_t *page_table,
2221 pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
2222 struct page *old_page)
2223 __releases(ptl)
2224{
2225 int page_mkwrite = 0;
2226
2227 page_cache_get(old_page);
2228
2229 /*
2230 * Only catch write-faults on shared writable pages,
2231 * read-only shared pages can get COWed by
2232 * get_user_pages(.write=1, .force=1).
2233 */
2234 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2235 int tmp;
2236
2237 pte_unmap_unlock(page_table, ptl);
2238 tmp = do_page_mkwrite(vma, old_page, address);
2239 if (unlikely(!tmp || (tmp &
2240 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2241 page_cache_release(old_page);
2242 return tmp;
2243 }
2244 /*
2245 * Since we dropped the lock we need to revalidate
2246 * the PTE as someone else may have changed it. If
2247 * they did, we just return, as we can count on the
2248 * MMU to tell us if they didn't also make it writable.
2249 */
2250 page_table = pte_offset_map_lock(mm, pmd, address,
2251 &ptl);
2252 if (!pte_same(*page_table, orig_pte)) {
2253 unlock_page(old_page);
2254 pte_unmap_unlock(page_table, ptl);
2255 page_cache_release(old_page);
2256 return 0;
2257 }
2258 page_mkwrite = 1;
2259 }
2260
2261 return wp_page_reuse(mm, vma, address, page_table, ptl,
2262 orig_pte, old_page, page_mkwrite, 1);
2263}
2264
2265/*
2266 * This routine handles present pages, when users try to write
2267 * to a shared page. It is done by copying the page to a new address
2268 * and decrementing the shared-page counter for the old page.
2269 *
2270 * Note that this routine assumes that the protection checks have been
2271 * done by the caller (the low-level page fault routine in most cases).
2272 * Thus we can safely just mark it writable once we've done any necessary
2273 * COW.
2274 *
2275 * We also mark the page dirty at this point even though the page will
2276 * change only once the write actually happens. This avoids a few races,
2277 * and potentially makes it more efficient.
2278 *
2279 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2280 * but allow concurrent faults), with pte both mapped and locked.
2281 * We return with mmap_sem still held, but pte unmapped and unlocked.
2282 */
2283static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2284 unsigned long address, pte_t *page_table, pmd_t *pmd,
2285 spinlock_t *ptl, pte_t orig_pte)
2286 __releases(ptl)
2287{
2288 struct page *old_page;
2289
2290 old_page = vm_normal_page(vma, address, orig_pte);
2291 if (!old_page) {
2292 /*
2293 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
2294 * VM_PFNMAP VMA.
2295 *
2296 * We should not cow pages in a shared writeable mapping.
2297 * Just mark the pages writable and/or call ops->pfn_mkwrite.
2298 */
2299 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2300 (VM_WRITE|VM_SHARED))
2301 return wp_pfn_shared(mm, vma, address, page_table, ptl,
2302 orig_pte, pmd);
2303
2304 pte_unmap_unlock(page_table, ptl);
2305 return wp_page_copy(mm, vma, address, page_table, pmd,
2306 orig_pte, old_page);
2307 }
2308
2309 /*
2310 * Take out anonymous pages first, anonymous shared vmas are
2311 * not dirty accountable.
2312 */
2313 if (PageAnon(old_page) && !PageKsm(old_page)) {
2314 if (!trylock_page(old_page)) {
2315 page_cache_get(old_page);
2316 pte_unmap_unlock(page_table, ptl);
2317 lock_page(old_page);
2318 page_table = pte_offset_map_lock(mm, pmd, address,
2319 &ptl);
2320 if (!pte_same(*page_table, orig_pte)) {
2321 unlock_page(old_page);
2322 pte_unmap_unlock(page_table, ptl);
2323 page_cache_release(old_page);
2324 return 0;
2325 }
2326 page_cache_release(old_page);
2327 }
2328 if (reuse_swap_page(old_page)) {
2329 /*
2330 * The page is all ours. Move it to our anon_vma so
2331 * the rmap code will not search our parent or siblings.
2332 * Protected against the rmap code by the page lock.
2333 */
2334 page_move_anon_rmap(old_page, vma, address);
2335 unlock_page(old_page);
2336 return wp_page_reuse(mm, vma, address, page_table, ptl,
2337 orig_pte, old_page, 0, 0);
2338 }
2339 unlock_page(old_page);
2340 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2341 (VM_WRITE|VM_SHARED))) {
2342 return wp_page_shared(mm, vma, address, page_table, pmd,
2343 ptl, orig_pte, old_page);
2344 }
2345
2346 /*
2347 * Ok, we need to copy. Oh, well..
2348 */
2349 page_cache_get(old_page);
2350
2351 pte_unmap_unlock(page_table, ptl);
2352 return wp_page_copy(mm, vma, address, page_table, pmd,
2353 orig_pte, old_page);
2354}
2355
2261static void unmap_mapping_range_vma(struct vm_area_struct *vma, 2356static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2262 unsigned long start_addr, unsigned long end_addr, 2357 unsigned long start_addr, unsigned long end_addr,
2263 struct zap_details *details) 2358 struct zap_details *details)
@@ -2784,7 +2879,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
2784 struct vm_fault vmf; 2879 struct vm_fault vmf;
2785 int off; 2880 int off;
2786 2881
2787 nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; 2882 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
2788 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; 2883 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
2789 2884
2790 start_addr = max(address & mask, vma->vm_start); 2885 start_addr = max(address & mask, vma->vm_start);
@@ -3035,6 +3130,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3035 int last_cpupid; 3130 int last_cpupid;
3036 int target_nid; 3131 int target_nid;
3037 bool migrated = false; 3132 bool migrated = false;
3133 bool was_writable = pte_write(pte);
3038 int flags = 0; 3134 int flags = 0;
3039 3135
3040 /* A PROT_NONE fault should not end up here */ 3136 /* A PROT_NONE fault should not end up here */
@@ -3059,6 +3155,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3059 /* Make it present again */ 3155 /* Make it present again */
3060 pte = pte_modify(pte, vma->vm_page_prot); 3156 pte = pte_modify(pte, vma->vm_page_prot);
3061 pte = pte_mkyoung(pte); 3157 pte = pte_mkyoung(pte);
3158 if (was_writable)
3159 pte = pte_mkwrite(pte);
3062 set_pte_at(mm, addr, ptep, pte); 3160 set_pte_at(mm, addr, ptep, pte);
3063 update_mmu_cache(vma, addr, ptep); 3161 update_mmu_cache(vma, addr, ptep);
3064 3162
@@ -3069,11 +3167,14 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3069 } 3167 }
3070 3168
3071 /* 3169 /*
3072 * Avoid grouping on DSO/COW pages in specific and RO pages 3170 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
3073 * in general, RO pages shouldn't hurt as much anyway since 3171 * much anyway since they can be in shared cache state. This misses
3074 * they can be in shared cache state. 3172 * the case where a mapping is writable but the process never writes
3173 * to it but pte_write gets cleared during protection updates and
3174 * pte_dirty has unpredictable behaviour between PTE scan updates,
3175 * background writeback, dirty balancing and application behaviour.
3075 */ 3176 */
3076 if (!pte_write(pte)) 3177 if (!(vma->vm_flags & VM_WRITE))
3077 flags |= TNF_NO_GROUP; 3178 flags |= TNF_NO_GROUP;
3078 3179
3079 /* 3180 /*
@@ -3097,7 +3198,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3097 if (migrated) { 3198 if (migrated) {
3098 page_nid = target_nid; 3199 page_nid = target_nid;
3099 flags |= TNF_MIGRATED; 3200 flags |= TNF_MIGRATED;
3100 } 3201 } else
3202 flags |= TNF_MIGRATE_FAIL;
3101 3203
3102out: 3204out:
3103 if (page_nid != -1) 3205 if (page_nid != -1)
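Editor's note: the do_wp_page() hunk above orders write-protect fault handling as follows: pfn-mapped pages in shared writable mappings go to wp_pfn_shared(), anonymous non-KSM pages that reuse_swap_page() reports as exclusively owned are reused in place, shared writable file-backed pages go to wp_page_shared(), and everything else is copied. The following is a minimal user-space sketch of that decision order only; the struct, enum and function names are invented for illustration and are not kernel API.

/*
 * Toy model of the decision order visible in do_wp_page() above.
 * The real kernel works on struct page / pte_t, not this struct.
 */
#include <stdbool.h>
#include <stdio.h>

enum wp_action { WP_PFN_SHARED, WP_REUSE, WP_SHARED, WP_COPY };

struct fault_model {
	bool has_normal_page;	/* vm_normal_page() returned a page     */
	bool shared_writable;	/* (VM_WRITE|VM_SHARED) both set on vma */
	bool anon;		/* PageAnon() && !PageKsm()             */
	bool sole_swap_user;	/* reuse_swap_page() would say yes      */
};

static enum wp_action wp_decide(const struct fault_model *f)
{
	if (!f->has_normal_page)	/* pfn-mapped, no struct page */
		return f->shared_writable ? WP_PFN_SHARED : WP_COPY;
	if (f->anon)			/* anonymous, non-KSM page */
		return f->sole_swap_user ? WP_REUSE : WP_COPY;
	if (f->shared_writable)		/* shared writable file page */
		return WP_SHARED;
	return WP_COPY;			/* private file page: COW */
}

int main(void)
{
	struct fault_model anon_private = { true, false, true, true };
	struct fault_model file_shared  = { true, true, false, false };

	printf("anonymous sole user -> %d (expect WP_REUSE=%d)\n",
	       wp_decide(&anon_private), WP_REUSE);
	printf("shared file page    -> %d (expect WP_SHARED=%d)\n",
	       wp_decide(&file_shared), WP_SHARED);
	return 0;
}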
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fab10795bea..457bde530cbe 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -104,7 +104,7 @@ void put_online_mems(void)
104 104
105} 105}
106 106
107static void mem_hotplug_begin(void) 107void mem_hotplug_begin(void)
108{ 108{
109 mem_hotplug.active_writer = current; 109 mem_hotplug.active_writer = current;
110 110
@@ -119,7 +119,7 @@ static void mem_hotplug_begin(void)
119 } 119 }
120} 120}
121 121
122static void mem_hotplug_done(void) 122void mem_hotplug_done(void)
123{ 123{
124 mem_hotplug.active_writer = NULL; 124 mem_hotplug.active_writer = NULL;
125 mutex_unlock(&mem_hotplug.lock); 125 mutex_unlock(&mem_hotplug.lock);
@@ -502,7 +502,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
502 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 502 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
503 503
504 for (i = start_sec; i <= end_sec; i++) { 504 for (i = start_sec; i <= end_sec; i++) {
505 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); 505 err = __add_section(nid, zone, section_nr_to_pfn(i));
506 506
507 /* 507 /*
508 * EEXIST is finally dealt with by ioresource collision 508 * EEXIST is finally dealt with by ioresource collision
@@ -959,6 +959,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
959} 959}
960 960
961 961
962/* Must be protected by mem_hotplug_begin() */
962int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 963int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
963{ 964{
964 unsigned long flags; 965 unsigned long flags;
@@ -969,7 +970,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
969 int ret; 970 int ret;
970 struct memory_notify arg; 971 struct memory_notify arg;
971 972
972 mem_hotplug_begin();
973 /* 973 /*
974 * This doesn't need a lock to do pfn_to_page(). 974 * This doesn't need a lock to do pfn_to_page().
975 * The section can't be removed here because of the 975 * The section can't be removed here because of the
@@ -977,21 +977,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
977 */ 977 */
978 zone = page_zone(pfn_to_page(pfn)); 978 zone = page_zone(pfn_to_page(pfn));
979 979
980 ret = -EINVAL;
981 if ((zone_idx(zone) > ZONE_NORMAL || 980 if ((zone_idx(zone) > ZONE_NORMAL ||
982 online_type == MMOP_ONLINE_MOVABLE) && 981 online_type == MMOP_ONLINE_MOVABLE) &&
983 !can_online_high_movable(zone)) 982 !can_online_high_movable(zone))
984 goto out; 983 return -EINVAL;
985 984
986 if (online_type == MMOP_ONLINE_KERNEL && 985 if (online_type == MMOP_ONLINE_KERNEL &&
987 zone_idx(zone) == ZONE_MOVABLE) { 986 zone_idx(zone) == ZONE_MOVABLE) {
988 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) 987 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
989 goto out; 988 return -EINVAL;
990 } 989 }
991 if (online_type == MMOP_ONLINE_MOVABLE && 990 if (online_type == MMOP_ONLINE_MOVABLE &&
992 zone_idx(zone) == ZONE_MOVABLE - 1) { 991 zone_idx(zone) == ZONE_MOVABLE - 1) {
993 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) 992 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
994 goto out; 993 return -EINVAL;
995 } 994 }
996 995
997 /* Previous code may changed the zone of the pfn range */ 996 /* Previous code may changed the zone of the pfn range */
@@ -1007,7 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
1007 ret = notifier_to_errno(ret); 1006 ret = notifier_to_errno(ret);
1008 if (ret) { 1007 if (ret) {
1009 memory_notify(MEM_CANCEL_ONLINE, &arg); 1008 memory_notify(MEM_CANCEL_ONLINE, &arg);
1010 goto out; 1009 return ret;
1011 } 1010 }
1012 /* 1011 /*
1013 * If this zone is not populated, then it is not in zonelist. 1012 * If this zone is not populated, then it is not in zonelist.
@@ -1031,7 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
1031 (((unsigned long long) pfn + nr_pages) 1030 (((unsigned long long) pfn + nr_pages)
1032 << PAGE_SHIFT) - 1); 1031 << PAGE_SHIFT) - 1);
1033 memory_notify(MEM_CANCEL_ONLINE, &arg); 1032 memory_notify(MEM_CANCEL_ONLINE, &arg);
1034 goto out; 1033 return ret;
1035 } 1034 }
1036 1035
1037 zone->present_pages += onlined_pages; 1036 zone->present_pages += onlined_pages;
@@ -1061,9 +1060,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
1061 1060
1062 if (onlined_pages) 1061 if (onlined_pages)
1063 memory_notify(MEM_ONLINE, &arg); 1062 memory_notify(MEM_ONLINE, &arg);
1064out: 1063 return 0;
1065 mem_hotplug_done();
1066 return ret;
1067} 1064}
1068#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1065#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
1069 1066
@@ -1092,6 +1089,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
1092 return NULL; 1089 return NULL;
1093 1090
1094 arch_refresh_nodedata(nid, pgdat); 1091 arch_refresh_nodedata(nid, pgdat);
1092 } else {
1093 /* Reset the nr_zones and classzone_idx to 0 before reuse */
1094 pgdat->nr_zones = 0;
1095 pgdat->classzone_idx = 0;
1095 } 1096 }
1096 1097
1097 /* we can use NODE_DATA(nid) from here */ 1098 /* we can use NODE_DATA(nid) from here */
@@ -1372,7 +1373,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1372 if (PageLRU(page)) 1373 if (PageLRU(page))
1373 return pfn; 1374 return pfn;
1374 if (PageHuge(page)) { 1375 if (PageHuge(page)) {
1375 if (is_hugepage_active(page)) 1376 if (page_huge_active(page))
1376 return pfn; 1377 return pfn;
1377 else 1378 else
1378 pfn = round_up(pfn + 1, 1379 pfn = round_up(pfn + 1,
@@ -1684,21 +1685,18 @@ static int __ref __offline_pages(unsigned long start_pfn,
1684 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 1685 if (!test_pages_in_a_zone(start_pfn, end_pfn))
1685 return -EINVAL; 1686 return -EINVAL;
1686 1687
1687 mem_hotplug_begin();
1688
1689 zone = page_zone(pfn_to_page(start_pfn)); 1688 zone = page_zone(pfn_to_page(start_pfn));
1690 node = zone_to_nid(zone); 1689 node = zone_to_nid(zone);
1691 nr_pages = end_pfn - start_pfn; 1690 nr_pages = end_pfn - start_pfn;
1692 1691
1693 ret = -EINVAL;
1694 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 1692 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
1695 goto out; 1693 return -EINVAL;
1696 1694
1697 /* set above range as isolated */ 1695 /* set above range as isolated */
1698 ret = start_isolate_page_range(start_pfn, end_pfn, 1696 ret = start_isolate_page_range(start_pfn, end_pfn,
1699 MIGRATE_MOVABLE, true); 1697 MIGRATE_MOVABLE, true);
1700 if (ret) 1698 if (ret)
1701 goto out; 1699 return ret;
1702 1700
1703 arg.start_pfn = start_pfn; 1701 arg.start_pfn = start_pfn;
1704 arg.nr_pages = nr_pages; 1702 arg.nr_pages = nr_pages;
@@ -1791,7 +1789,6 @@ repeat:
1791 writeback_set_ratelimit(); 1789 writeback_set_ratelimit();
1792 1790
1793 memory_notify(MEM_OFFLINE, &arg); 1791 memory_notify(MEM_OFFLINE, &arg);
1794 mem_hotplug_done();
1795 return 0; 1792 return 0;
1796 1793
1797failed_removal: 1794failed_removal:
@@ -1801,12 +1798,10 @@ failed_removal:
1801 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1798 memory_notify(MEM_CANCEL_OFFLINE, &arg);
1802 /* pushback to free area */ 1799 /* pushback to free area */
1803 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1800 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1804
1805out:
1806 mem_hotplug_done();
1807 return ret; 1801 return ret;
1808} 1802}
1809 1803
1804/* Must be protected by mem_hotplug_begin() */
1810int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1805int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1811{ 1806{
1812 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1807 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
@@ -1977,15 +1972,6 @@ void try_offline_node(int nid)
1977 if (is_vmalloc_addr(zone->wait_table)) 1972 if (is_vmalloc_addr(zone->wait_table))
1978 vfree(zone->wait_table); 1973 vfree(zone->wait_table);
1979 } 1974 }
1980
1981 /*
1982 * Since there is no way to guarentee the address of pgdat/zone is not
1983 * on stack of any kernel threads or used by other kernel objects
1984 * without reference counting or other symchronizing method, do not
1985 * reset node_data and free pgdat here. Just reset it to 0 and reuse
1986 * the memory when the node is online again.
1987 */
1988 memset(pgdat, 0, sizeof(*pgdat));
1989} 1975}
1990EXPORT_SYMBOL(try_offline_node); 1976EXPORT_SYMBOL(try_offline_node);
1991 1977
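Editor's note: the hunks above move mem_hotplug_begin()/mem_hotplug_done() out of online_pages() and __offline_pages() and into their callers, which is why the "goto out" paths collapse into direct returns and both entry points gain a "Must be protected by mem_hotplug_begin()" comment. Below is a minimal user-space sketch of that caller-holds-the-lock pattern, using a pthread mutex as a stand-in for the hotplug lock; all names other than the begin/done pair are invented.

/*
 * Caller-side locking model: the callee assumes the lock is held and can
 * simply return early on error; the caller always releases the lock.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

static void mem_hotplug_begin(void) { pthread_mutex_lock(&hotplug_lock); }
static void mem_hotplug_done(void)  { pthread_mutex_unlock(&hotplug_lock); }

/* Must be called with the hotplug lock held, mirroring the new comment. */
static int online_pages_model(unsigned long pfn, unsigned long nr_pages)
{
	if (nr_pages == 0)
		return -EINVAL;	/* early return, no unlock needed here */
	printf("onlining %lu pages at pfn %lu\n", nr_pages, pfn);
	return 0;
}

int main(void)
{
	int ret;

	mem_hotplug_begin();		/* caller takes the lock ... */
	ret = online_pages_model(4096, 512);
	mem_hotplug_done();		/* ... and always releases it */

	return ret ? 1 : 0;
}

Presumably the point of moving the lock is to let callers cover additional work under the same critical section; the diff itself only shows the mechanical change.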
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4721046a134a..ede26291d4aa 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -945,7 +945,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
945 return alloc_huge_page_node(page_hstate(compound_head(page)), 945 return alloc_huge_page_node(page_hstate(compound_head(page)),
946 node); 946 node);
947 else 947 else
948 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); 948 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
949 __GFP_THISNODE, 0);
949} 950}
950 951
951/* 952/*
@@ -1985,7 +1986,8 @@ retry_cpuset:
1985 nmask = policy_nodemask(gfp, pol); 1986 nmask = policy_nodemask(gfp, pol);
1986 if (!nmask || node_isset(node, *nmask)) { 1987 if (!nmask || node_isset(node, *nmask)) {
1987 mpol_cond_put(pol); 1988 mpol_cond_put(pol);
1988 page = alloc_pages_exact_node(node, gfp, order); 1989 page = alloc_pages_exact_node(node,
1990 gfp | __GFP_THISNODE, order);
1989 goto out; 1991 goto out;
1990 } 1992 }
1991 } 1993 }
diff --git a/mm/mempool.c b/mm/mempool.c
index e209c98c7203..2cc08de8b1db 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -6,26 +6,138 @@
6 * extreme VM load. 6 * extreme VM load.
7 * 7 *
8 * started by Ingo Molnar, Copyright (C) 2001 8 * started by Ingo Molnar, Copyright (C) 2001
9 * debugging by David Rientjes, Copyright (C) 2015
9 */ 10 */
10 11
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/highmem.h>
15#include <linux/kasan.h>
13#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
14#include <linux/export.h> 17#include <linux/export.h>
15#include <linux/mempool.h> 18#include <linux/mempool.h>
16#include <linux/blkdev.h> 19#include <linux/blkdev.h>
17#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include "slab.h"
22
23#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
24static void poison_error(mempool_t *pool, void *element, size_t size,
25 size_t byte)
26{
27 const int nr = pool->curr_nr;
28 const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
29 const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
30 int i;
31
32 pr_err("BUG: mempool element poison mismatch\n");
33 pr_err("Mempool %p size %zu\n", pool, size);
34 pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
35 for (i = start; i < end; i++)
36 pr_cont("%x ", *(u8 *)(element + i));
37 pr_cont("%s\n", end < size ? "..." : "");
38 dump_stack();
39}
40
41static void __check_element(mempool_t *pool, void *element, size_t size)
42{
43 u8 *obj = element;
44 size_t i;
45
46 for (i = 0; i < size; i++) {
47 u8 exp = (i < size - 1) ? POISON_FREE : POISON_END;
48
49 if (obj[i] != exp) {
50 poison_error(pool, element, size, i);
51 return;
52 }
53 }
54 memset(obj, POISON_INUSE, size);
55}
56
57static void check_element(mempool_t *pool, void *element)
58{
59 /* Mempools backed by slab allocator */
60 if (pool->free == mempool_free_slab || pool->free == mempool_kfree)
61 __check_element(pool, element, ksize(element));
62
63 /* Mempools backed by page allocator */
64 if (pool->free == mempool_free_pages) {
65 int order = (int)(long)pool->pool_data;
66 void *addr = kmap_atomic((struct page *)element);
67
68 __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
69 kunmap_atomic(addr);
70 }
71}
72
73static void __poison_element(void *element, size_t size)
74{
75 u8 *obj = element;
76
77 memset(obj, POISON_FREE, size - 1);
78 obj[size - 1] = POISON_END;
79}
80
81static void poison_element(mempool_t *pool, void *element)
82{
83 /* Mempools backed by slab allocator */
84 if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
85 __poison_element(element, ksize(element));
86
87 /* Mempools backed by page allocator */
88 if (pool->alloc == mempool_alloc_pages) {
89 int order = (int)(long)pool->pool_data;
90 void *addr = kmap_atomic((struct page *)element);
91
92 __poison_element(addr, 1UL << (PAGE_SHIFT + order));
93 kunmap_atomic(addr);
94 }
95}
96#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
97static inline void check_element(mempool_t *pool, void *element)
98{
99}
100static inline void poison_element(mempool_t *pool, void *element)
101{
102}
103#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
104
105static void kasan_poison_element(mempool_t *pool, void *element)
106{
107 if (pool->alloc == mempool_alloc_slab)
108 kasan_slab_free(pool->pool_data, element);
109 if (pool->alloc == mempool_kmalloc)
110 kasan_kfree(element);
111 if (pool->alloc == mempool_alloc_pages)
112 kasan_free_pages(element, (unsigned long)pool->pool_data);
113}
114
115static void kasan_unpoison_element(mempool_t *pool, void *element)
116{
117 if (pool->alloc == mempool_alloc_slab)
118 kasan_slab_alloc(pool->pool_data, element);
119 if (pool->alloc == mempool_kmalloc)
120 kasan_krealloc(element, (size_t)pool->pool_data);
121 if (pool->alloc == mempool_alloc_pages)
122 kasan_alloc_pages(element, (unsigned long)pool->pool_data);
123}
18 124
19static void add_element(mempool_t *pool, void *element) 125static void add_element(mempool_t *pool, void *element)
20{ 126{
21 BUG_ON(pool->curr_nr >= pool->min_nr); 127 BUG_ON(pool->curr_nr >= pool->min_nr);
128 poison_element(pool, element);
129 kasan_poison_element(pool, element);
22 pool->elements[pool->curr_nr++] = element; 130 pool->elements[pool->curr_nr++] = element;
23} 131}
24 132
25static void *remove_element(mempool_t *pool) 133static void *remove_element(mempool_t *pool)
26{ 134{
27 BUG_ON(pool->curr_nr <= 0); 135 void *element = pool->elements[--pool->curr_nr];
28 return pool->elements[--pool->curr_nr]; 136
137 BUG_ON(pool->curr_nr < 0);
138 check_element(pool, element);
139 kasan_unpoison_element(pool, element);
140 return element;
29} 141}
30 142
31/** 143/**
@@ -113,23 +225,24 @@ EXPORT_SYMBOL(mempool_create_node);
113 * mempool_create(). 225 * mempool_create().
114 * @new_min_nr: the new minimum number of elements guaranteed to be 226 * @new_min_nr: the new minimum number of elements guaranteed to be
115 * allocated for this pool. 227 * allocated for this pool.
116 * @gfp_mask: the usual allocation bitmask.
117 * 228 *
118 * This function shrinks/grows the pool. In the case of growing, 229 * This function shrinks/grows the pool. In the case of growing,
119 * it cannot be guaranteed that the pool will be grown to the new 230 * it cannot be guaranteed that the pool will be grown to the new
120 * size immediately, but new mempool_free() calls will refill it. 231 * size immediately, but new mempool_free() calls will refill it.
232 * This function may sleep.
121 * 233 *
122 * Note, the caller must guarantee that no mempool_destroy is called 234 * Note, the caller must guarantee that no mempool_destroy is called
123 * while this function is running. mempool_alloc() & mempool_free() 235 * while this function is running. mempool_alloc() & mempool_free()
124 * might be called (eg. from IRQ contexts) while this function executes. 236 * might be called (eg. from IRQ contexts) while this function executes.
125 */ 237 */
126int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) 238int mempool_resize(mempool_t *pool, int new_min_nr)
127{ 239{
128 void *element; 240 void *element;
129 void **new_elements; 241 void **new_elements;
130 unsigned long flags; 242 unsigned long flags;
131 243
132 BUG_ON(new_min_nr <= 0); 244 BUG_ON(new_min_nr <= 0);
245 might_sleep();
133 246
134 spin_lock_irqsave(&pool->lock, flags); 247 spin_lock_irqsave(&pool->lock, flags);
135 if (new_min_nr <= pool->min_nr) { 248 if (new_min_nr <= pool->min_nr) {
@@ -145,7 +258,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
145 spin_unlock_irqrestore(&pool->lock, flags); 258 spin_unlock_irqrestore(&pool->lock, flags);
146 259
147 /* Grow the pool */ 260 /* Grow the pool */
148 new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); 261 new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
262 GFP_KERNEL);
149 if (!new_elements) 263 if (!new_elements)
150 return -ENOMEM; 264 return -ENOMEM;
151 265
@@ -164,7 +278,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
164 278
165 while (pool->curr_nr < pool->min_nr) { 279 while (pool->curr_nr < pool->min_nr) {
166 spin_unlock_irqrestore(&pool->lock, flags); 280 spin_unlock_irqrestore(&pool->lock, flags);
167 element = pool->alloc(gfp_mask, pool->pool_data); 281 element = pool->alloc(GFP_KERNEL, pool->pool_data);
168 if (!element) 282 if (!element)
169 goto out; 283 goto out;
170 spin_lock_irqsave(&pool->lock, flags); 284 spin_lock_irqsave(&pool->lock, flags);
@@ -332,6 +446,7 @@ EXPORT_SYMBOL(mempool_free);
332void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) 446void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
333{ 447{
334 struct kmem_cache *mem = pool_data; 448 struct kmem_cache *mem = pool_data;
449 VM_BUG_ON(mem->ctor);
335 return kmem_cache_alloc(mem, gfp_mask); 450 return kmem_cache_alloc(mem, gfp_mask);
336} 451}
337EXPORT_SYMBOL(mempool_alloc_slab); 452EXPORT_SYMBOL(mempool_alloc_slab);
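Editor's note: the mempool hunks above poison elements while they sit idle in the pool (POISON_FREE bytes terminated by POISON_END) and verify the poison when an element is handed back out, catching writes to supposedly free elements. A small user-space sketch of that poison-on-free / check-on-alloc idea, using local stand-in byte values rather than the kernel's POISON_* constants:

/*
 * Poison an idle element, then verify the poison before reuse.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define POISON_FREE  0x6b	/* element parked in the pool */
#define POISON_END   0xa5	/* last byte marks the end of the range */
#define POISON_INUSE 0x5a	/* element handed back to the caller */

static void poison_element(unsigned char *obj, size_t size)
{
	memset(obj, POISON_FREE, size - 1);
	obj[size - 1] = POISON_END;
}

/* Returns the offset of the first corrupted byte, or -1 if intact. */
static long check_element(unsigned char *obj, size_t size)
{
	for (size_t i = 0; i < size; i++) {
		unsigned char exp = (i < size - 1) ? POISON_FREE : POISON_END;

		if (obj[i] != exp)
			return (long)i;
	}
	memset(obj, POISON_INUSE, size);
	return -1;
}

int main(void)
{
	size_t size = 32;
	unsigned char *elem = malloc(size);

	if (!elem)
		return 1;
	poison_element(elem, size);		/* "add_element" path */
	elem[7] = 0x00;				/* simulate a use-after-free write */
	long bad = check_element(elem, size);	/* "remove_element" path */
	if (bad >= 0)
		fprintf(stderr, "poison mismatch at byte %ld\n", bad);
	free(elem);
	return 0;
}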
diff --git a/mm/memtest.c b/mm/memtest.c
new file mode 100644
index 000000000000..1997d934b13b
--- /dev/null
+++ b/mm/memtest.c
@@ -0,0 +1,118 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9#include <linux/memblock.h>
10
11static u64 patterns[] __initdata = {
12 /* The first entry has to be 0 to leave memtest with zeroed memory */
13 0,
14 0xffffffffffffffffULL,
15 0x5555555555555555ULL,
16 0xaaaaaaaaaaaaaaaaULL,
17 0x1111111111111111ULL,
18 0x2222222222222222ULL,
19 0x4444444444444444ULL,
20 0x8888888888888888ULL,
21 0x3333333333333333ULL,
22 0x6666666666666666ULL,
23 0x9999999999999999ULL,
24 0xccccccccccccccccULL,
25 0x7777777777777777ULL,
26 0xbbbbbbbbbbbbbbbbULL,
27 0xddddddddddddddddULL,
28 0xeeeeeeeeeeeeeeeeULL,
29 0x7a6c7258554e494cULL, /* yeah ;-) */
30};
31
32static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
33{
34 printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
35 (unsigned long long) pattern,
36 (unsigned long long) start_bad,
37 (unsigned long long) end_bad);
38 memblock_reserve(start_bad, end_bad - start_bad);
39}
40
41static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size)
42{
43 u64 *p, *start, *end;
44 phys_addr_t start_bad, last_bad;
45 phys_addr_t start_phys_aligned;
46 const size_t incr = sizeof(pattern);
47
48 start_phys_aligned = ALIGN(start_phys, incr);
49 start = __va(start_phys_aligned);
50 end = start + (size - (start_phys_aligned - start_phys)) / incr;
51 start_bad = 0;
52 last_bad = 0;
53
54 for (p = start; p < end; p++)
55 *p = pattern;
56
57 for (p = start; p < end; p++, start_phys_aligned += incr) {
58 if (*p == pattern)
59 continue;
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 continue;
63 }
64 if (start_bad)
65 reserve_bad_mem(pattern, start_bad, last_bad + incr);
66 start_bad = last_bad = start_phys_aligned;
67 }
68 if (start_bad)
69 reserve_bad_mem(pattern, start_bad, last_bad + incr);
70}
71
72static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
73{
74 u64 i;
75 phys_addr_t this_start, this_end;
76
77 for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
78 this_start = clamp(this_start, start, end);
79 this_end = clamp(this_end, start, end);
80 if (this_start < this_end) {
81 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
82 (unsigned long long)this_start,
83 (unsigned long long)this_end,
84 (unsigned long long)cpu_to_be64(pattern));
85 memtest(pattern, this_start, this_end - this_start);
86 }
87 }
88}
89
90/* default is disabled */
91static int memtest_pattern __initdata;
92
93static int __init parse_memtest(char *arg)
94{
95 if (arg)
96 memtest_pattern = simple_strtoul(arg, NULL, 0);
97 else
98 memtest_pattern = ARRAY_SIZE(patterns);
99
100 return 0;
101}
102
103early_param("memtest", parse_memtest);
104
105void __init early_memtest(phys_addr_t start, phys_addr_t end)
106{
107 unsigned int i;
108 unsigned int idx = 0;
109
110 if (!memtest_pattern)
111 return;
112
113 printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
114 for (i = memtest_pattern-1; i < UINT_MAX; --i) {
115 idx = i % ARRAY_SIZE(patterns);
116 do_one_pass(patterns[idx], start, end);
117 }
118}
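Editor's note: the new mm/memtest.c above writes a pattern over free memory, reads it back, and coalesces adjacent failing words into a single reserved range. The sketch below models the same write/verify/coalesce loop over an ordinary heap buffer, so a normal run only demonstrates the control flow; the helper names are invented.

/*
 * Write a pattern, verify it, and merge adjacent failures into one range.
 */
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

static void report_bad(uint64_t pattern, size_t start, size_t end)
{
	printf("  %016" PRIx64 " bad words [%zu, %zu)\n", pattern, start, end);
}

static void memtest_model(uint64_t pattern, uint64_t *buf, size_t nwords)
{
	size_t start_bad = 0, last_bad = 0;
	int in_bad = 0;

	for (size_t i = 0; i < nwords; i++)
		buf[i] = pattern;

	for (size_t i = 0; i < nwords; i++) {
		if (buf[i] == pattern)
			continue;
		if (in_bad && i == last_bad + 1) {	/* extend current range */
			last_bad = i;
			continue;
		}
		if (in_bad)				/* flush previous range */
			report_bad(pattern, start_bad, last_bad + 1);
		start_bad = last_bad = i;
		in_bad = 1;
	}
	if (in_bad)
		report_bad(pattern, start_bad, last_bad + 1);
}

int main(void)
{
	size_t nwords = 1024;
	uint64_t *buf = calloc(nwords, sizeof(*buf));
	const uint64_t patterns[] = { 0, ~0ULL, 0x5555555555555555ULL };

	if (!buf)
		return 1;
	for (size_t i = 0; i < sizeof(patterns) / sizeof(patterns[0]); i++)
		memtest_model(patterns[i], buf, nwords);
	free(buf);
	return 0;
}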
diff --git a/mm/migrate.c b/mm/migrate.c
index 85e042686031..f53838fe3dfe 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -537,7 +537,8 @@ void migrate_page_copy(struct page *newpage, struct page *page)
537 * Please do not reorder this without considering how mm/ksm.c's 537 * Please do not reorder this without considering how mm/ksm.c's
538 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). 538 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
539 */ 539 */
540 ClearPageSwapCache(page); 540 if (PageSwapCache(page))
541 ClearPageSwapCache(page);
541 ClearPagePrivate(page); 542 ClearPagePrivate(page);
542 set_page_private(page, 0); 543 set_page_private(page, 0);
543 544
@@ -901,12 +902,23 @@ out:
901} 902}
902 903
903/* 904/*
 905 * gcc 4.7 and 4.8 on arm get an ICE when inlining unmap_and_move(). Work
906 * around it.
907 */
908#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
909#define ICE_noinline noinline
910#else
911#define ICE_noinline
912#endif
913
914/*
904 * Obtain the lock on page, remove all ptes and migrate the page 915 * Obtain the lock on page, remove all ptes and migrate the page
905 * to the newly allocated page in newpage. 916 * to the newly allocated page in newpage.
906 */ 917 */
907static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, 918static ICE_noinline int unmap_and_move(new_page_t get_new_page,
908 unsigned long private, struct page *page, int force, 919 free_page_t put_new_page,
909 enum migrate_mode mode) 920 unsigned long private, struct page *page,
921 int force, enum migrate_mode mode)
910{ 922{
911 int rc = 0; 923 int rc = 0;
912 int *result = NULL; 924 int *result = NULL;
@@ -1554,30 +1566,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1554 * page migration rate limiting control. 1566 * page migration rate limiting control.
1555 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs 1567 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
1556 * window of time. Default here says do not migrate more than 1280M per second. 1568 * window of time. Default here says do not migrate more than 1280M per second.
1557 * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
1558 * as it is faults that reset the window, pte updates will happen unconditionally
1559 * if there has not been a fault since @pteupdate_interval_millisecs after the
1560 * throttle window closed.
1561 */ 1569 */
1562static unsigned int migrate_interval_millisecs __read_mostly = 100; 1570static unsigned int migrate_interval_millisecs __read_mostly = 100;
1563static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
1564static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); 1571static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
1565 1572
1566/* Returns true if NUMA migration is currently rate limited */
1567bool migrate_ratelimited(int node)
1568{
1569 pg_data_t *pgdat = NODE_DATA(node);
1570
1571 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
1572 msecs_to_jiffies(pteupdate_interval_millisecs)))
1573 return false;
1574
1575 if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
1576 return false;
1577
1578 return true;
1579}
1580
1581/* Returns true if the node is migrate rate-limited after the update */ 1573/* Returns true if the node is migrate rate-limited after the update */
1582static bool numamigrate_update_ratelimit(pg_data_t *pgdat, 1574static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
1583 unsigned long nr_pages) 1575 unsigned long nr_pages)
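Editor's note: the ICE_noinline hunk above gates a noinline attribute on the compiler version to dodge a gcc 4.7/4.8 ARM internal compiler error. The same pattern in self-contained user-space C is sketched below; MY_GCC_VERSION is a local stand-in for the kernel's GCC_VERSION and the version window is only illustrative.

/*
 * Conditionally force noinline only on the compiler range that miscompiles.
 */
#include <stdio.h>

#if defined(__GNUC__) && !defined(__clang__)
#define MY_GCC_VERSION \
	(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#else
#define MY_GCC_VERSION 0
#endif

#if (MY_GCC_VERSION >= 40700 && MY_GCC_VERSION < 40900)
#define ICE_noinline __attribute__((noinline))
#else
#define ICE_noinline
#endif

static ICE_noinline int add_one(int x)
{
	return x + 1;
}

int main(void)
{
	printf("gcc version code: %d, add_one(41) = %d\n",
	       MY_GCC_VERSION, add_one(41));
	return 0;
}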
diff --git a/mm/mlock.c b/mm/mlock.c
index 73cf0987088c..6fd2cf15e868 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -26,10 +26,10 @@
26 26
27int can_do_mlock(void) 27int can_do_mlock(void)
28{ 28{
29 if (capable(CAP_IPC_LOCK))
30 return 1;
31 if (rlimit(RLIMIT_MEMLOCK) != 0) 29 if (rlimit(RLIMIT_MEMLOCK) != 0)
32 return 1; 30 return 1;
31 if (capable(CAP_IPC_LOCK))
32 return 1;
33 return 0; 33 return 0;
34} 34}
35EXPORT_SYMBOL(can_do_mlock); 35EXPORT_SYMBOL(can_do_mlock);
@@ -205,62 +205,6 @@ out:
205 return nr_pages - 1; 205 return nr_pages - 1;
206} 206}
207 207
208/**
209 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
210 * @vma: target vma
211 * @start: start address
212 * @end: end address
213 * @nonblocking:
214 *
215 * This takes care of making the pages present too.
216 *
217 * return 0 on success, negative error code on error.
218 *
219 * vma->vm_mm->mmap_sem must be held.
220 *
221 * If @nonblocking is NULL, it may be held for read or write and will
222 * be unperturbed.
223 *
224 * If @nonblocking is non-NULL, it must held for read only and may be
225 * released. If it's released, *@nonblocking will be set to 0.
226 */
227long __mlock_vma_pages_range(struct vm_area_struct *vma,
228 unsigned long start, unsigned long end, int *nonblocking)
229{
230 struct mm_struct *mm = vma->vm_mm;
231 unsigned long nr_pages = (end - start) / PAGE_SIZE;
232 int gup_flags;
233
234 VM_BUG_ON(start & ~PAGE_MASK);
235 VM_BUG_ON(end & ~PAGE_MASK);
236 VM_BUG_ON_VMA(start < vma->vm_start, vma);
237 VM_BUG_ON_VMA(end > vma->vm_end, vma);
238 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
239
240 gup_flags = FOLL_TOUCH | FOLL_MLOCK;
241 /*
242 * We want to touch writable mappings with a write fault in order
243 * to break COW, except for shared mappings because these don't COW
244 * and we would not want to dirty them for nothing.
245 */
246 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
247 gup_flags |= FOLL_WRITE;
248
249 /*
250 * We want mlock to succeed for regions that have any permissions
251 * other than PROT_NONE.
252 */
253 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
254 gup_flags |= FOLL_FORCE;
255
256 /*
257 * We made sure addr is within a VMA, so the following will
258 * not result in a stack expansion that recurses back here.
259 */
260 return __get_user_pages(current, mm, start, nr_pages, gup_flags,
261 NULL, NULL, nonblocking);
262}
263
264/* 208/*
265 * convert get_user_pages() return value to posix mlock() error 209 * convert get_user_pages() return value to posix mlock() error
266 */ 210 */
@@ -596,7 +540,7 @@ success:
596 /* 540 /*
597 * vm_flags is protected by the mmap_sem held in write mode. 541 * vm_flags is protected by the mmap_sem held in write mode.
598 * It's okay if try_to_unmap_one unmaps a page just after we 542 * It's okay if try_to_unmap_one unmaps a page just after we
599 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 543 * set VM_LOCKED, populate_vma_page_range will bring it back.
600 */ 544 */
601 545
602 if (lock) 546 if (lock)
@@ -660,69 +604,6 @@ static int do_mlock(unsigned long start, size_t len, int on)
660 return error; 604 return error;
661} 605}
662 606
663/*
664 * __mm_populate - populate and/or mlock pages within a range of address space.
665 *
666 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
667 * flags. VMAs must be already marked with the desired vm_flags, and
668 * mmap_sem must not be held.
669 */
670int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
671{
672 struct mm_struct *mm = current->mm;
673 unsigned long end, nstart, nend;
674 struct vm_area_struct *vma = NULL;
675 int locked = 0;
676 long ret = 0;
677
678 VM_BUG_ON(start & ~PAGE_MASK);
679 VM_BUG_ON(len != PAGE_ALIGN(len));
680 end = start + len;
681
682 for (nstart = start; nstart < end; nstart = nend) {
683 /*
684 * We want to fault in pages for [nstart; end) address range.
685 * Find first corresponding VMA.
686 */
687 if (!locked) {
688 locked = 1;
689 down_read(&mm->mmap_sem);
690 vma = find_vma(mm, nstart);
691 } else if (nstart >= vma->vm_end)
692 vma = vma->vm_next;
693 if (!vma || vma->vm_start >= end)
694 break;
695 /*
696 * Set [nstart; nend) to intersection of desired address
697 * range with the first VMA. Also, skip undesirable VMA types.
698 */
699 nend = min(end, vma->vm_end);
700 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
701 continue;
702 if (nstart < vma->vm_start)
703 nstart = vma->vm_start;
704 /*
705 * Now fault in a range of pages. __mlock_vma_pages_range()
706 * double checks the vma flags, so that it won't mlock pages
707 * if the vma was already munlocked.
708 */
709 ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
710 if (ret < 0) {
711 if (ignore_errors) {
712 ret = 0;
713 continue; /* continue at next VMA */
714 }
715 ret = __mlock_posix_error_return(ret);
716 break;
717 }
718 nend = nstart + ret * PAGE_SIZE;
719 ret = 0;
720 }
721 if (locked)
722 up_read(&mm->mmap_sem);
723 return ret; /* 0 or negative error code */
724}
725
726SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) 607SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
727{ 608{
728 unsigned long locked; 609 unsigned long locked;
@@ -750,9 +631,13 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
750 error = do_mlock(start, len, 1); 631 error = do_mlock(start, len, 1);
751 632
752 up_write(&current->mm->mmap_sem); 633 up_write(&current->mm->mmap_sem);
753 if (!error) 634 if (error)
754 error = __mm_populate(start, len, 0); 635 return error;
755 return error; 636
637 error = __mm_populate(start, len, 0);
638 if (error)
639 return __mlock_posix_error_return(error);
640 return 0;
756} 641}
757 642
758SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) 643SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
diff --git a/mm/mmap.c b/mm/mmap.c
index da9990acc08b..bb50cacc3ea5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -774,10 +774,8 @@ again: remove_next = 1 + (end > next->vm_end);
774 774
775 importer->anon_vma = exporter->anon_vma; 775 importer->anon_vma = exporter->anon_vma;
776 error = anon_vma_clone(importer, exporter); 776 error = anon_vma_clone(importer, exporter);
777 if (error) { 777 if (error)
778 importer->anon_vma = NULL;
779 return error; 778 return error;
780 }
781 } 779 }
782 } 780 }
783 781
@@ -1135,7 +1133,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
1135 * by another page fault trying to merge _that_. But that's ok: if it 1133 * by another page fault trying to merge _that_. But that's ok: if it
1136 * is being set up, that automatically means that it will be a singleton 1134 * is being set up, that automatically means that it will be a singleton
1137 * acceptable for merging, so we can do all of this optimistically. But 1135 * acceptable for merging, so we can do all of this optimistically. But
1138 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. 1136 * we do that READ_ONCE() to make sure that we never re-load the pointer.
1139 * 1137 *
1140 * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1138 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1141 * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1139 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
@@ -1149,7 +1147,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
1149static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) 1147static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1150{ 1148{
1151 if (anon_vma_compatible(a, b)) { 1149 if (anon_vma_compatible(a, b)) {
1152 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); 1150 struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
1153 1151
1154 if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1152 if (anon_vma && list_is_singular(&old->anon_vma_chain))
1155 return anon_vma; 1153 return anon_vma;
@@ -1553,11 +1551,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1553 1551
1554 /* Clear old maps */ 1552 /* Clear old maps */
1555 error = -ENOMEM; 1553 error = -ENOMEM;
1556munmap_back: 1554 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
1557 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 1555 &rb_parent)) {
1558 if (do_munmap(mm, addr, len)) 1556 if (do_munmap(mm, addr, len))
1559 return -ENOMEM; 1557 return -ENOMEM;
1560 goto munmap_back;
1561 } 1558 }
1562 1559
1563 /* 1560 /*
@@ -1573,7 +1570,8 @@ munmap_back:
1573 /* 1570 /*
1574 * Can we just expand an old mapping? 1571 * Can we just expand an old mapping?
1575 */ 1572 */
1576 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); 1573 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
1574 NULL);
1577 if (vma) 1575 if (vma)
1578 goto out; 1576 goto out;
1579 1577
@@ -2102,7 +2100,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
2102 actual_size = size; 2100 actual_size = size;
2103 if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) 2101 if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
2104 actual_size -= PAGE_SIZE; 2102 actual_size -= PAGE_SIZE;
2105 if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) 2103 if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
2106 return -ENOMEM; 2104 return -ENOMEM;
2107 2105
2108 /* mlock limit tests */ 2106 /* mlock limit tests */
@@ -2110,7 +2108,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
2110 unsigned long locked; 2108 unsigned long locked;
2111 unsigned long limit; 2109 unsigned long limit;
2112 locked = mm->locked_vm + grow; 2110 locked = mm->locked_vm + grow;
2113 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); 2111 limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
2114 limit >>= PAGE_SHIFT; 2112 limit >>= PAGE_SHIFT;
2115 if (locked > limit && !capable(CAP_IPC_LOCK)) 2113 if (locked > limit && !capable(CAP_IPC_LOCK))
2116 return -ENOMEM; 2114 return -ENOMEM;
@@ -2318,7 +2316,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
2318 if (!prev || expand_stack(prev, addr)) 2316 if (!prev || expand_stack(prev, addr))
2319 return NULL; 2317 return NULL;
2320 if (prev->vm_flags & VM_LOCKED) 2318 if (prev->vm_flags & VM_LOCKED)
2321 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); 2319 populate_vma_page_range(prev, addr, prev->vm_end, NULL);
2322 return prev; 2320 return prev;
2323} 2321}
2324#else 2322#else
@@ -2353,7 +2351,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
2353 if (expand_stack(vma, addr)) 2351 if (expand_stack(vma, addr))
2354 return NULL; 2352 return NULL;
2355 if (vma->vm_flags & VM_LOCKED) 2353 if (vma->vm_flags & VM_LOCKED)
2356 __mlock_vma_pages_range(vma, addr, start, NULL); 2354 populate_vma_page_range(vma, addr, start, NULL);
2357 return vma; 2355 return vma;
2358} 2356}
2359#endif 2357#endif
@@ -2741,11 +2739,10 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2741 /* 2739 /*
2742 * Clear old maps. this also does some error checking for us 2740 * Clear old maps. this also does some error checking for us
2743 */ 2741 */
2744 munmap_back: 2742 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
2745 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 2743 &rb_parent)) {
2746 if (do_munmap(mm, addr, len)) 2744 if (do_munmap(mm, addr, len))
2747 return -ENOMEM; 2745 return -ENOMEM;
2748 goto munmap_back;
2749 } 2746 }
2750 2747
2751 /* Check against address space limits *after* clearing old maps... */ 2748 /* Check against address space limits *after* clearing old maps... */
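Editor's note: the mmap.c hunks above replace the "munmap_back:" goto with a while loop: keep unmapping whatever find_vma_links() reports as overlapping until the lookup comes back clean. A toy model of that retry loop over a small array of ranges follows; nothing here is kernel API.

/*
 * Loop until no existing range overlaps the requested one, instead of
 * jumping back to a retry label.
 */
#include <stdio.h>

struct range { unsigned long start, end; int live; };

static struct range maps[] = {
	{ 0x1000, 0x3000, 1 },
	{ 0x3000, 0x5000, 1 },
	{ 0x9000, 0xa000, 1 },
};

/* Return the index of a live range overlapping [start, end), or -1. */
static int find_overlap(unsigned long start, unsigned long end)
{
	for (int i = 0; i < (int)(sizeof(maps) / sizeof(maps[0])); i++)
		if (maps[i].live && maps[i].start < end && start < maps[i].end)
			return i;
	return -1;
}

int main(void)
{
	unsigned long addr = 0x2000, len = 0x2000;
	int idx;

	/* Clear old maps: a plain loop instead of a goto-retry label. */
	while ((idx = find_overlap(addr, addr + len)) != -1) {
		printf("unmapping [%#lx, %#lx)\n", maps[idx].start, maps[idx].end);
		maps[idx].live = 0;
	}
	printf("range [%#lx, %#lx) is now free\n", addr, addr + len);
	return 0;
}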
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 44727811bf4c..88584838e704 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -75,6 +75,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
75 oldpte = *pte; 75 oldpte = *pte;
76 if (pte_present(oldpte)) { 76 if (pte_present(oldpte)) {
77 pte_t ptent; 77 pte_t ptent;
78 bool preserve_write = prot_numa && pte_write(oldpte);
78 79
79 /* 80 /*
80 * Avoid trapping faults against the zero or KSM 81 * Avoid trapping faults against the zero or KSM
@@ -94,6 +95,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
94 95
95 ptent = ptep_modify_prot_start(mm, addr, pte); 96 ptent = ptep_modify_prot_start(mm, addr, pte);
96 ptent = pte_modify(ptent, newprot); 97 ptent = pte_modify(ptent, newprot);
98 if (preserve_write)
99 ptent = pte_mkwrite(ptent);
97 100
98 /* Avoid taking write faults for known dirty pages */ 101 /* Avoid taking write faults for known dirty pages */
99 if (dirty_accountable && pte_dirty(ptent) && 102 if (dirty_accountable && pte_dirty(ptent) &&
diff --git a/mm/mremap.c b/mm/mremap.c
index 57dadc025c64..034e2d360652 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -286,8 +286,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
286 old_len = new_len; 286 old_len = new_len;
287 old_addr = new_addr; 287 old_addr = new_addr;
288 new_addr = -ENOMEM; 288 new_addr = -ENOMEM;
289 } else if (vma->vm_file && vma->vm_file->f_op->mremap) 289 } else if (vma->vm_file && vma->vm_file->f_op->mremap) {
290 vma->vm_file->f_op->mremap(vma->vm_file, new_vma); 290 err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
291 if (err < 0) {
292 move_page_tables(new_vma, new_addr, vma, old_addr,
293 moved_len, true);
294 return err;
295 }
296 }
291 297
292 /* Conceal VM_ACCOUNT so old reservation is not undone */ 298 /* Conceal VM_ACCOUNT so old reservation is not undone */
293 if (vm_flags & VM_ACCOUNT) { 299 if (vm_flags & VM_ACCOUNT) {
@@ -339,25 +345,25 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
339 struct vm_area_struct *vma = find_vma(mm, addr); 345 struct vm_area_struct *vma = find_vma(mm, addr);
340 346
341 if (!vma || vma->vm_start > addr) 347 if (!vma || vma->vm_start > addr)
342 goto Efault; 348 return ERR_PTR(-EFAULT);
343 349
344 if (is_vm_hugetlb_page(vma)) 350 if (is_vm_hugetlb_page(vma))
345 goto Einval; 351 return ERR_PTR(-EINVAL);
346 352
347 /* We can't remap across vm area boundaries */ 353 /* We can't remap across vm area boundaries */
348 if (old_len > vma->vm_end - addr) 354 if (old_len > vma->vm_end - addr)
349 goto Efault; 355 return ERR_PTR(-EFAULT);
350 356
351 /* Need to be careful about a growing mapping */ 357 /* Need to be careful about a growing mapping */
352 if (new_len > old_len) { 358 if (new_len > old_len) {
353 unsigned long pgoff; 359 unsigned long pgoff;
354 360
355 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) 361 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
356 goto Efault; 362 return ERR_PTR(-EFAULT);
357 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; 363 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
358 pgoff += vma->vm_pgoff; 364 pgoff += vma->vm_pgoff;
359 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) 365 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
360 goto Einval; 366 return ERR_PTR(-EINVAL);
361 } 367 }
362 368
363 if (vma->vm_flags & VM_LOCKED) { 369 if (vma->vm_flags & VM_LOCKED) {
@@ -366,29 +372,20 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
366 lock_limit = rlimit(RLIMIT_MEMLOCK); 372 lock_limit = rlimit(RLIMIT_MEMLOCK);
367 locked += new_len - old_len; 373 locked += new_len - old_len;
368 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 374 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
369 goto Eagain; 375 return ERR_PTR(-EAGAIN);
370 } 376 }
371 377
372 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) 378 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
373 goto Enomem; 379 return ERR_PTR(-ENOMEM);
374 380
375 if (vma->vm_flags & VM_ACCOUNT) { 381 if (vma->vm_flags & VM_ACCOUNT) {
376 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; 382 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
377 if (security_vm_enough_memory_mm(mm, charged)) 383 if (security_vm_enough_memory_mm(mm, charged))
378 goto Efault; 384 return ERR_PTR(-ENOMEM);
379 *p = charged; 385 *p = charged;
380 } 386 }
381 387
382 return vma; 388 return vma;
383
384Efault: /* very odd choice for most of the cases, but... */
385 return ERR_PTR(-EFAULT);
386Einval:
387 return ERR_PTR(-EINVAL);
388Enomem:
389 return ERR_PTR(-ENOMEM);
390Eagain:
391 return ERR_PTR(-EAGAIN);
392} 389}
393 390
394static unsigned long mremap_to(unsigned long addr, unsigned long old_len, 391static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
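Editor's note: vma_to_resize() above drops its Efault/Einval/Enomem/Eagain labels in favour of returning ERR_PTR(-E...) directly at each failure site. The sketch below imitates the ERR_PTR/IS_ERR/PTR_ERR encoding in user space (simplified from the kernel idea) to show why the direct returns read cleanly; the vma_model struct and the lookup are invented.

/*
 * Encode a negative errno in a pointer return value and decode it in the
 * caller, so error paths can return directly without goto labels.
 */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct vma_model { unsigned long start, end; };

static struct vma_model global_vma = { 0x1000, 0x9000 };

/* Return the vma covering addr..addr+old_len, or an encoded error. */
static struct vma_model *vma_to_resize_model(unsigned long addr,
					     unsigned long old_len)
{
	struct vma_model *vma = &global_vma;

	if (addr < vma->start || addr >= vma->end)
		return ERR_PTR(-EFAULT);	/* direct return, no label */
	if (old_len > vma->end - addr)
		return ERR_PTR(-EFAULT);
	return vma;
}

int main(void)
{
	struct vma_model *vma = vma_to_resize_model(0x20000, 0x1000);

	if (IS_ERR(vma))
		printf("lookup failed: errno %ld\n", -PTR_ERR(vma));
	else
		printf("vma [%#lx, %#lx)\n", vma->start, vma->end);
	return 0;
}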
diff --git a/mm/nommu.c b/mm/nommu.c
index 3e67e7538ecf..e544508e2a4b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -62,6 +62,7 @@ void *high_memory;
62EXPORT_SYMBOL(high_memory); 62EXPORT_SYMBOL(high_memory);
63struct page *mem_map; 63struct page *mem_map;
64unsigned long max_mapnr; 64unsigned long max_mapnr;
65EXPORT_SYMBOL(max_mapnr);
65unsigned long highest_memmap_pfn; 66unsigned long highest_memmap_pfn;
66struct percpu_counter vm_committed_as; 67struct percpu_counter vm_committed_as;
67int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 68int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
@@ -1015,7 +1016,7 @@ static int validate_mmap_request(struct file *file,
1015 * device */ 1016 * device */
1016 if (!file->f_op->get_unmapped_area) 1017 if (!file->f_op->get_unmapped_area)
1017 capabilities &= ~NOMMU_MAP_DIRECT; 1018 capabilities &= ~NOMMU_MAP_DIRECT;
1018 if (!file->f_op->read) 1019 if (!(file->f_mode & FMODE_CAN_READ))
1019 capabilities &= ~NOMMU_MAP_COPY; 1020 capabilities &= ~NOMMU_MAP_COPY;
1020 1021
1021 /* The file shall have been opened with read permission. */ 1022 /* The file shall have been opened with read permission. */
@@ -1239,7 +1240,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1239 1240
1240 old_fs = get_fs(); 1241 old_fs = get_fs();
1241 set_fs(KERNEL_DS); 1242 set_fs(KERNEL_DS);
1242 ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); 1243 ret = __vfs_read(vma->vm_file, base, len, &fpos);
1243 set_fs(old_fs); 1244 set_fs(old_fs);
1244 1245
1245 if (ret < 0) 1246 if (ret < 0)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 642f38cb175a..2b665da1b3c9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -408,7 +408,7 @@ bool oom_killer_disabled __read_mostly;
408static DECLARE_RWSEM(oom_sem); 408static DECLARE_RWSEM(oom_sem);
409 409
410/** 410/**
411 * mark_tsk_oom_victim - marks the given taks as OOM victim. 411 * mark_tsk_oom_victim - marks the given task as OOM victim.
412 * @tsk: task to mark 412 * @tsk: task to mark
413 * 413 *
414 * Has to be called with oom_sem taken for read and never after 414 * Has to be called with oom_sem taken for read and never after
@@ -612,7 +612,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
612 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 612 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
613 */ 613 */
614void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 614void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
615 int order, const nodemask_t *nodemask) 615 int order, const nodemask_t *nodemask,
616 struct mem_cgroup *memcg)
616{ 617{
617 if (likely(!sysctl_panic_on_oom)) 618 if (likely(!sysctl_panic_on_oom))
618 return; 619 return;
@@ -625,7 +626,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
625 if (constraint != CONSTRAINT_NONE) 626 if (constraint != CONSTRAINT_NONE)
626 return; 627 return;
627 } 628 }
628 dump_header(NULL, gfp_mask, order, NULL, nodemask); 629 dump_header(NULL, gfp_mask, order, memcg, nodemask);
629 panic("Out of memory: %s panic_on_oom is enabled\n", 630 panic("Out of memory: %s panic_on_oom is enabled\n",
630 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 631 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
631} 632}
@@ -740,7 +741,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
740 constraint = constrained_alloc(zonelist, gfp_mask, nodemask, 741 constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
741 &totalpages); 742 &totalpages);
742 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 743 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
743 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); 744 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
744 745
745 if (sysctl_oom_kill_allocating_task && current->mm && 746 if (sysctl_oom_kill_allocating_task && current->mm &&
746 !oom_unkillable_task(current, NULL, nodemask) && 747 !oom_unkillable_task(current, NULL, nodemask) &&
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 45e187b2d971..5daf5568b9e1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -857,8 +857,11 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
857 * bw * elapsed + write_bandwidth * (period - elapsed) 857 * bw * elapsed + write_bandwidth * (period - elapsed)
858 * write_bandwidth = --------------------------------------------------- 858 * write_bandwidth = ---------------------------------------------------
859 * period 859 * period
860 *
861 * @written may have decreased due to account_page_redirty().
862 * Avoid underflowing @bw calculation.
860 */ 863 */
861 bw = written - bdi->written_stamp; 864 bw = written - min(written, bdi->written_stamp);
862 bw *= HZ; 865 bw *= HZ;
863 if (unlikely(elapsed > period)) { 866 if (unlikely(elapsed > period)) {
864 do_div(bw, elapsed); 867 do_div(bw, elapsed);
@@ -922,7 +925,7 @@ static void global_update_bandwidth(unsigned long thresh,
922 unsigned long now) 925 unsigned long now)
923{ 926{
924 static DEFINE_SPINLOCK(dirty_lock); 927 static DEFINE_SPINLOCK(dirty_lock);
925 static unsigned long update_time; 928 static unsigned long update_time = INITIAL_JIFFIES;
926 929
927 /* 930 /*
928 * check locklessly first to optimize away locking for the most time 931 * check locklessly first to optimize away locking for the most time
@@ -2108,6 +2111,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
2108EXPORT_SYMBOL(account_page_dirtied); 2111EXPORT_SYMBOL(account_page_dirtied);
2109 2112
2110/* 2113/*
 2114 * Helper function for deaccounting a dirty page without writeback.
2115 *
2116 * Doing this should *normally* only ever be done when a page
2117 * is truncated, and is not actually mapped anywhere at all. However,
2118 * fs/buffer.c does this when it notices that somebody has cleaned
2119 * out all the buffers on a page without actually doing it through
2120 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
2121 */
2122void account_page_cleaned(struct page *page, struct address_space *mapping)
2123{
2124 if (mapping_cap_account_dirty(mapping)) {
2125 dec_zone_page_state(page, NR_FILE_DIRTY);
2126 dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
2127 task_io_account_cancelled_write(PAGE_CACHE_SIZE);
2128 }
2129}
2130EXPORT_SYMBOL(account_page_cleaned);
2131
2132/*
2111 * For address_spaces which do not use buffers. Just tag the page as dirty in 2133 * For address_spaces which do not use buffers. Just tag the page as dirty in
2112 * its radix tree. 2134 * its radix tree.
2113 * 2135 *
@@ -2206,7 +2228,8 @@ int set_page_dirty(struct page *page)
2206 * it will confuse readahead and make it restart the size rampup 2228 * it will confuse readahead and make it restart the size rampup
2207 * process. But it's a trivial problem. 2229 * process. But it's a trivial problem.
2208 */ 2230 */
2209 ClearPageReclaim(page); 2231 if (PageReclaim(page))
2232 ClearPageReclaim(page);
2210#ifdef CONFIG_BLOCK 2233#ifdef CONFIG_BLOCK
2211 if (!spd) 2234 if (!spd)
2212 spd = __set_page_dirty_buffers; 2235 spd = __set_page_dirty_buffers;
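Editor's note: the bandwidth hunk above subtracts min(written, bdi->written_stamp) because account_page_redirty() can make the written counter appear to move backwards, and a plain unsigned subtraction would wrap to a huge bogus bandwidth. A minimal demonstration of that guard, with stand-in values:

/*
 * Show why the guarded subtraction is needed for unsigned counters.
 */
#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long written = 1000;		/* pages written so far */
	unsigned long written_stamp = 1010;	/* stamp taken before a redirty */

	unsigned long wrapped = written - written_stamp;	/* underflows */
	unsigned long guarded = written - min_ul(written, written_stamp);

	printf("plain subtraction:   %lu (nonsense)\n", wrapped);
	printf("guarded subtraction: %lu\n", guarded);		/* 0 */
	return 0;
}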
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7abfa70cdc1a..ebffa0e4a9c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1032,11 +1032,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1032static int fallbacks[MIGRATE_TYPES][4] = { 1032static int fallbacks[MIGRATE_TYPES][4] = {
1033 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 1033 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
1034 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 1034 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
1035 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
1035#ifdef CONFIG_CMA 1036#ifdef CONFIG_CMA
1036 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
1037 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 1037 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
1038#else
1039 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
1040#endif 1038#endif
1041 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 1039 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
1042#ifdef CONFIG_MEMORY_ISOLATION 1040#ifdef CONFIG_MEMORY_ISOLATION
@@ -1044,6 +1042,17 @@ static int fallbacks[MIGRATE_TYPES][4] = {
1044#endif 1042#endif
1045}; 1043};
1046 1044
1045#ifdef CONFIG_CMA
1046static struct page *__rmqueue_cma_fallback(struct zone *zone,
1047 unsigned int order)
1048{
1049 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1050}
1051#else
1052static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1053 unsigned int order) { return NULL; }
1054#endif
1055
1047/* 1056/*
1048 * Move the free pages in a range to the free lists of the requested type. 1057 * Move the free pages in a range to the free lists of the requested type.
1049 * Note that start_page and end_pages are not aligned on a pageblock 1058 * Note that start_page and end_pages are not aligned on a pageblock
@@ -1136,14 +1145,40 @@ static void change_pageblock_range(struct page *pageblock_page,
1136 * as fragmentation caused by those allocations polluting movable pageblocks 1145 * as fragmentation caused by those allocations polluting movable pageblocks
1137 * is worse than movable allocations stealing from unmovable and reclaimable 1146 * is worse than movable allocations stealing from unmovable and reclaimable
1138 * pageblocks. 1147 * pageblocks.
1139 *
1140 * If we claim more than half of the pageblock, change pageblock's migratetype
1141 * as well.
1142 */ 1148 */
1143static void try_to_steal_freepages(struct zone *zone, struct page *page, 1149static bool can_steal_fallback(unsigned int order, int start_mt)
1144 int start_type, int fallback_type) 1150{
1151 /*
 1153	 * Leaving this order check here is intentional, although there is a
 1154	 * more relaxed check below. The reason is that we can actually steal
 1155	 * the whole pageblock if this condition is met, but the check below
 1156	 * does not guarantee it; it is only a heuristic, so it could be
 1157	 * changed at any time.
1157 */
1158 if (order >= pageblock_order)
1159 return true;
1160
1161 if (order >= pageblock_order / 2 ||
1162 start_mt == MIGRATE_RECLAIMABLE ||
1163 start_mt == MIGRATE_UNMOVABLE ||
1164 page_group_by_mobility_disabled)
1165 return true;
1166
1167 return false;
1168}
1169
1170/*
 1171 * This function implements the actual steal behaviour. If the order is large
 1172 * enough, we can steal the whole pageblock. If not, we first move the free
 1173 * pages in this pageblock and check whether at least half of the pages were
 1174 * moved. If so, we can change the migratetype of the pageblock and permanently
 1175 * use its pages as the requested migratetype in the future.
1176 */
1177static void steal_suitable_fallback(struct zone *zone, struct page *page,
1178 int start_type)
1145{ 1179{
1146 int current_order = page_order(page); 1180 int current_order = page_order(page);
1181 int pages;
1147 1182
1148 /* Take ownership for orders >= pageblock_order */ 1183 /* Take ownership for orders >= pageblock_order */
1149 if (current_order >= pageblock_order) { 1184 if (current_order >= pageblock_order) {
@@ -1151,19 +1186,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page,
1151 return; 1186 return;
1152 } 1187 }
1153 1188
1154 if (current_order >= pageblock_order / 2 || 1189 pages = move_freepages_block(zone, page, start_type);
1155 start_type == MIGRATE_RECLAIMABLE || 1190
1156 start_type == MIGRATE_UNMOVABLE || 1191 /* Claim the whole block if over half of it is free */
1157 page_group_by_mobility_disabled) { 1192 if (pages >= (1 << (pageblock_order-1)) ||
1158 int pages; 1193 page_group_by_mobility_disabled)
1194 set_pageblock_migratetype(page, start_type);
1195}
1196
1197/*
 1198 * Check whether there is a suitable fallback freepage with the requested
 1199 * order. If only_stealable is true, this function returns fallback_mt only
 1200 * if we can steal the other freepages altogether. This helps to reduce
 1201 * fragmentation due to mixed migratetype pages in one pageblock.
1202 */
1203int find_suitable_fallback(struct free_area *area, unsigned int order,
1204 int migratetype, bool only_stealable, bool *can_steal)
1205{
1206 int i;
1207 int fallback_mt;
1208
1209 if (area->nr_free == 0)
1210 return -1;
1211
1212 *can_steal = false;
1213 for (i = 0;; i++) {
1214 fallback_mt = fallbacks[migratetype][i];
1215 if (fallback_mt == MIGRATE_RESERVE)
1216 break;
1217
1218 if (list_empty(&area->free_list[fallback_mt]))
1219 continue;
1159 1220
1160 pages = move_freepages_block(zone, page, start_type); 1221 if (can_steal_fallback(order, migratetype))
1222 *can_steal = true;
1161 1223
1162 /* Claim the whole block if over half of it is free */ 1224 if (!only_stealable)
1163 if (pages >= (1 << (pageblock_order-1)) || 1225 return fallback_mt;
1164 page_group_by_mobility_disabled) 1226
1165 set_pageblock_migratetype(page, start_type); 1227 if (*can_steal)
1228 return fallback_mt;
1166 } 1229 }
1230
1231 return -1;
1167} 1232}
1168 1233
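Editor's note: the refactor above splits fallback selection into find_suitable_fallback(), which scans the per-migratetype fallback table and skips empty free lists, and steal_suitable_fallback(), which claims the pageblock when more than half of it could be moved. Below is a user-space model of the selection half; the table is simplified (no CMA or isolation rows), the only_stealable parameter is omitted, and all scaffolding names are invented.

/*
 * Scan the fallback table for a migratetype with free pages and report
 * whether the whole pageblock may be stolen.
 */
#include <stdbool.h>
#include <stdio.h>

enum mt { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE, MT_TYPES };

/* Simplified fallback table, each row terminated by MT_RESERVE. */
static const enum mt fallbacks[MT_TYPES][3] = {
	[MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE },
	[MT_RECLAIMABLE] = { MT_UNMOVABLE, MT_MOVABLE, MT_RESERVE },
	[MT_MOVABLE]     = { MT_RECLAIMABLE, MT_UNMOVABLE, MT_RESERVE },
	[MT_RESERVE]     = { MT_RESERVE },
};

static bool can_steal_fallback(unsigned int order, enum mt start_mt,
			       unsigned int pageblock_order)
{
	if (order >= pageblock_order)
		return true;
	return order >= pageblock_order / 2 ||
	       start_mt == MT_RECLAIMABLE || start_mt == MT_UNMOVABLE;
}

/* free_count[mt] stands in for list_empty(&area->free_list[mt]). */
static int find_suitable_fallback(const unsigned int *free_count,
				  unsigned int order, enum mt start_mt,
				  unsigned int pageblock_order, bool *can_steal)
{
	*can_steal = false;
	for (int i = 0; ; i++) {
		enum mt fb = fallbacks[start_mt][i];

		if (fb == MT_RESERVE)
			break;
		if (free_count[fb] == 0)
			continue;
		if (can_steal_fallback(order, start_mt, pageblock_order))
			*can_steal = true;
		return fb;
	}
	return -1;
}

int main(void)
{
	unsigned int free_count[MT_TYPES] = { 0, 4, 12, 0 };
	bool can_steal;
	int fb = find_suitable_fallback(free_count, 5, MT_UNMOVABLE, 9,
					&can_steal);

	printf("fallback type %d, can_steal=%d\n", fb, can_steal);
	return 0;
}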
1169/* Remove an element from the buddy allocator from the fallback list */ 1234/* Remove an element from the buddy allocator from the fallback list */
@@ -1173,64 +1238,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1173 struct free_area *area; 1238 struct free_area *area;
1174 unsigned int current_order; 1239 unsigned int current_order;
1175 struct page *page; 1240 struct page *page;
1241 int fallback_mt;
1242 bool can_steal;
1176 1243
1177 /* Find the largest possible block of pages in the other list */ 1244 /* Find the largest possible block of pages in the other list */
1178 for (current_order = MAX_ORDER-1; 1245 for (current_order = MAX_ORDER-1;
1179 current_order >= order && current_order <= MAX_ORDER-1; 1246 current_order >= order && current_order <= MAX_ORDER-1;
1180 --current_order) { 1247 --current_order) {
1181 int i; 1248 area = &(zone->free_area[current_order]);
1182 for (i = 0;; i++) { 1249 fallback_mt = find_suitable_fallback(area, current_order,
1183 int migratetype = fallbacks[start_migratetype][i]; 1250 start_migratetype, false, &can_steal);
1184 int buddy_type = start_migratetype; 1251 if (fallback_mt == -1)
1185 1252 continue;
1186 /* MIGRATE_RESERVE handled later if necessary */
1187 if (migratetype == MIGRATE_RESERVE)
1188 break;
1189
1190 area = &(zone->free_area[current_order]);
1191 if (list_empty(&area->free_list[migratetype]))
1192 continue;
1193
1194 page = list_entry(area->free_list[migratetype].next,
1195 struct page, lru);
1196 area->nr_free--;
1197
1198 if (!is_migrate_cma(migratetype)) {
1199 try_to_steal_freepages(zone, page,
1200 start_migratetype,
1201 migratetype);
1202 } else {
1203 /*
1204 * When borrowing from MIGRATE_CMA, we need to
1205 * release the excess buddy pages to CMA
1206 * itself, and we do not try to steal extra
1207 * free pages.
1208 */
1209 buddy_type = migratetype;
1210 }
1211 1253
1212 /* Remove the page from the freelists */ 1254 page = list_entry(area->free_list[fallback_mt].next,
1213 list_del(&page->lru); 1255 struct page, lru);
1214 rmv_page_order(page); 1256 if (can_steal)
1257 steal_suitable_fallback(zone, page, start_migratetype);
1215 1258
1216 expand(zone, page, order, current_order, area, 1259 /* Remove the page from the freelists */
1217 buddy_type); 1260 area->nr_free--;
1261 list_del(&page->lru);
1262 rmv_page_order(page);
1218 1263
1219 /* 1264 expand(zone, page, order, current_order, area,
1220 * The freepage_migratetype may differ from pageblock's 1265 start_migratetype);
1221 * migratetype depending on the decisions in 1266 /*
1222 * try_to_steal_freepages(). This is OK as long as it 1267 * The freepage_migratetype may differ from pageblock's
1223 * does not differ for MIGRATE_CMA pageblocks. For CMA 1268 * migratetype depending on the decisions in
1224 * we need to make sure unallocated pages flushed from 1269 * try_to_steal_freepages(). This is OK as long as it
1225 * pcp lists are returned to the correct freelist. 1270 * does not differ for MIGRATE_CMA pageblocks. For CMA
1226 */ 1271 * we need to make sure unallocated pages flushed from
1227 set_freepage_migratetype(page, buddy_type); 1272 * pcp lists are returned to the correct freelist.
1273 */
1274 set_freepage_migratetype(page, start_migratetype);
1228 1275
1229 trace_mm_page_alloc_extfrag(page, order, current_order, 1276 trace_mm_page_alloc_extfrag(page, order, current_order,
1230 start_migratetype, migratetype); 1277 start_migratetype, fallback_mt);
1231 1278
1232 return page; 1279 return page;
1233 }
1234 } 1280 }
1235 1281
1236 return NULL; 1282 return NULL;
@@ -1249,7 +1295,11 @@ retry_reserve:
1249 page = __rmqueue_smallest(zone, order, migratetype); 1295 page = __rmqueue_smallest(zone, order, migratetype);
1250 1296
1251 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1297 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1252 page = __rmqueue_fallback(zone, order, migratetype); 1298 if (migratetype == MIGRATE_MOVABLE)
1299 page = __rmqueue_cma_fallback(zone, order);
1300
1301 if (!page)
1302 page = __rmqueue_fallback(zone, order, migratetype);
1253 1303
1254 /* 1304 /*
1255 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1305 * Use MIGRATE_RESERVE rather than fail an allocation. goto
@@ -1321,7 +1371,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1321 int to_drain, batch; 1371 int to_drain, batch;
1322 1372
1323 local_irq_save(flags); 1373 local_irq_save(flags);
1324 batch = ACCESS_ONCE(pcp->batch); 1374 batch = READ_ONCE(pcp->batch);
1325 to_drain = min(pcp->count, batch); 1375 to_drain = min(pcp->count, batch);
1326 if (to_drain > 0) { 1376 if (to_drain > 0) {
1327 free_pcppages_bulk(zone, to_drain, pcp); 1377 free_pcppages_bulk(zone, to_drain, pcp);
@@ -1520,7 +1570,7 @@ void free_hot_cold_page(struct page *page, bool cold)
1520 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1570 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1521 pcp->count++; 1571 pcp->count++;
1522 if (pcp->count >= pcp->high) { 1572 if (pcp->count >= pcp->high) {
1523 unsigned long batch = ACCESS_ONCE(pcp->batch); 1573 unsigned long batch = READ_ONCE(pcp->batch);
1524 free_pcppages_bulk(zone, batch, pcp); 1574 free_pcppages_bulk(zone, batch, pcp);
1525 pcp->count -= batch; 1575 pcp->count -= batch;
1526 } 1576 }
@@ -2362,18 +2412,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2362 *did_some_progress = 1; 2412 *did_some_progress = 1;
2363 goto out; 2413 goto out;
2364 } 2414 }
2365 /* 2415 /* The OOM killer may not free memory on a specific node */
2366 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2367 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2368 * The caller should handle page allocation failure by itself if
2369 * it specifies __GFP_THISNODE.
2370 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2371 */
2372 if (gfp_mask & __GFP_THISNODE) 2416 if (gfp_mask & __GFP_THISNODE)
2373 goto out; 2417 goto out;
2374 } 2418 }
2375 /* Exhausted what can be done so it's blamo time */ 2419 /* Exhausted what can be done so it's blamo time */
2376 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) 2420 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
2421 || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
2377 *did_some_progress = 1; 2422 *did_some_progress = 1;
2378out: 2423out:
2379 oom_zonelist_unlock(ac->zonelist, gfp_mask); 2424 oom_zonelist_unlock(ac->zonelist, gfp_mask);
@@ -2622,15 +2667,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2622 } 2667 }
2623 2668
2624 /* 2669 /*
2625 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2670 * If this allocation cannot block and it is for a specific node, then
2626 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2671 * fail early. There's no need to wakeup kswapd or retry for a
2627 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2672 * speculative node-specific allocation.
2628 * using a larger set of nodes after it has established that the
2629 * allowed per node queues are empty and that nodes are
2630 * over allocated.
2631 */ 2673 */
2632 if (IS_ENABLED(CONFIG_NUMA) && 2674 if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
2633 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2634 goto nopage; 2675 goto nopage;
2635 2676
2636retry: 2677retry:
@@ -2823,7 +2864,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2823 /* 2864 /*
2824 * Check the zones suitable for the gfp_mask contain at least one 2865 * Check the zones suitable for the gfp_mask contain at least one
2825 * valid zone. It's possible to have an empty zonelist as a result 2866 * valid zone. It's possible to have an empty zonelist as a result
2826 * of GFP_THISNODE and a memoryless node 2867 * of __GFP_THISNODE and a memoryless node
2827 */ 2868 */
2828 if (unlikely(!zonelist->_zonerefs->zone)) 2869 if (unlikely(!zonelist->_zonerefs->zone))
2829 return NULL; 2870 return NULL;
@@ -3200,38 +3241,31 @@ static void show_migration_types(unsigned char type)
3200 * Show free area list (used inside shift_scroll-lock stuff) 3241 * Show free area list (used inside shift_scroll-lock stuff)
3201 * We also calculate the percentage fragmentation. We do this by counting the 3242 * We also calculate the percentage fragmentation. We do this by counting the
3202 * memory on each free list with the exception of the first item on the list. 3243 * memory on each free list with the exception of the first item on the list.
3203 * Suppresses nodes that are not allowed by current's cpuset if 3244 *
3204 * SHOW_MEM_FILTER_NODES is passed. 3245 * Bits in @filter:
3246 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
3247 * cpuset.
3205 */ 3248 */
3206void show_free_areas(unsigned int filter) 3249void show_free_areas(unsigned int filter)
3207{ 3250{
3251 unsigned long free_pcp = 0;
3208 int cpu; 3252 int cpu;
3209 struct zone *zone; 3253 struct zone *zone;
3210 3254
3211 for_each_populated_zone(zone) { 3255 for_each_populated_zone(zone) {
3212 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3256 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3213 continue; 3257 continue;
3214 show_node(zone);
3215 printk("%s per-cpu:\n", zone->name);
3216 3258
3217 for_each_online_cpu(cpu) { 3259 for_each_online_cpu(cpu)
3218 struct per_cpu_pageset *pageset; 3260 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
3219
3220 pageset = per_cpu_ptr(zone->pageset, cpu);
3221
3222 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3223 cpu, pageset->pcp.high,
3224 pageset->pcp.batch, pageset->pcp.count);
3225 }
3226 } 3261 }
3227 3262
3228 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3263 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3229 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3264 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3230 " unevictable:%lu" 3265 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
3231 " dirty:%lu writeback:%lu unstable:%lu\n" 3266 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3232 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3233 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3267 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3234 " free_cma:%lu\n", 3268 " free:%lu free_pcp:%lu free_cma:%lu\n",
3235 global_page_state(NR_ACTIVE_ANON), 3269 global_page_state(NR_ACTIVE_ANON),
3236 global_page_state(NR_INACTIVE_ANON), 3270 global_page_state(NR_INACTIVE_ANON),
3237 global_page_state(NR_ISOLATED_ANON), 3271 global_page_state(NR_ISOLATED_ANON),
@@ -3242,13 +3276,14 @@ void show_free_areas(unsigned int filter)
3242 global_page_state(NR_FILE_DIRTY), 3276 global_page_state(NR_FILE_DIRTY),
3243 global_page_state(NR_WRITEBACK), 3277 global_page_state(NR_WRITEBACK),
3244 global_page_state(NR_UNSTABLE_NFS), 3278 global_page_state(NR_UNSTABLE_NFS),
3245 global_page_state(NR_FREE_PAGES),
3246 global_page_state(NR_SLAB_RECLAIMABLE), 3279 global_page_state(NR_SLAB_RECLAIMABLE),
3247 global_page_state(NR_SLAB_UNRECLAIMABLE), 3280 global_page_state(NR_SLAB_UNRECLAIMABLE),
3248 global_page_state(NR_FILE_MAPPED), 3281 global_page_state(NR_FILE_MAPPED),
3249 global_page_state(NR_SHMEM), 3282 global_page_state(NR_SHMEM),
3250 global_page_state(NR_PAGETABLE), 3283 global_page_state(NR_PAGETABLE),
3251 global_page_state(NR_BOUNCE), 3284 global_page_state(NR_BOUNCE),
3285 global_page_state(NR_FREE_PAGES),
3286 free_pcp,
3252 global_page_state(NR_FREE_CMA_PAGES)); 3287 global_page_state(NR_FREE_CMA_PAGES));
3253 3288
3254 for_each_populated_zone(zone) { 3289 for_each_populated_zone(zone) {
@@ -3256,6 +3291,11 @@ void show_free_areas(unsigned int filter)
3256 3291
3257 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3292 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3258 continue; 3293 continue;
3294
3295 free_pcp = 0;
3296 for_each_online_cpu(cpu)
3297 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
3298
3259 show_node(zone); 3299 show_node(zone);
3260 printk("%s" 3300 printk("%s"
3261 " free:%lukB" 3301 " free:%lukB"
@@ -3282,6 +3322,8 @@ void show_free_areas(unsigned int filter)
3282 " pagetables:%lukB" 3322 " pagetables:%lukB"
3283 " unstable:%lukB" 3323 " unstable:%lukB"
3284 " bounce:%lukB" 3324 " bounce:%lukB"
3325 " free_pcp:%lukB"
3326 " local_pcp:%ukB"
3285 " free_cma:%lukB" 3327 " free_cma:%lukB"
3286 " writeback_tmp:%lukB" 3328 " writeback_tmp:%lukB"
3287 " pages_scanned:%lu" 3329 " pages_scanned:%lu"
@@ -3313,6 +3355,8 @@ void show_free_areas(unsigned int filter)
3313 K(zone_page_state(zone, NR_PAGETABLE)), 3355 K(zone_page_state(zone, NR_PAGETABLE)),
3314 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3356 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3315 K(zone_page_state(zone, NR_BOUNCE)), 3357 K(zone_page_state(zone, NR_BOUNCE)),
3358 K(free_pcp),
3359 K(this_cpu_read(zone->pageset->pcp.count)),
3316 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3360 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3317 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3361 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3318 K(zone_page_state(zone, NR_PAGES_SCANNED)), 3362 K(zone_page_state(zone, NR_PAGES_SCANNED)),
@@ -5716,7 +5760,7 @@ static void __setup_per_zone_wmarks(void)
5716 * value here. 5760 * value here.
5717 * 5761 *
5718 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5762 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5719 * deltas controls asynch page reclaim, and so should 5763 * deltas control asynch page reclaim, and so should
5720 * not be capped for highmem. 5764 * not be capped for highmem.
5721 */ 5765 */
5722 unsigned long min_pages; 5766 unsigned long min_pages;
@@ -6163,7 +6207,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
6163 mask <<= (BITS_PER_LONG - bitidx - 1); 6207 mask <<= (BITS_PER_LONG - bitidx - 1);
6164 flags <<= (BITS_PER_LONG - bitidx - 1); 6208 flags <<= (BITS_PER_LONG - bitidx - 1);
6165 6209
6166 word = ACCESS_ONCE(bitmap[word_bitidx]); 6210 word = READ_ONCE(bitmap[word_bitidx]);
6167 for (;;) { 6211 for (;;) {
6168 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6212 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6169 if (word == old_word) 6213 if (word == old_word)
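
The page_alloc.c hunks above replace try_to_steal_freepages() with can_steal_fallback(), steal_suitable_fallback() and find_suitable_fallback(). A minimal userspace sketch of that decision logic follows; the constants, the per-migratetype free counts and main() are made up for illustration, and this is not the kernel implementation:

#include <stdbool.h>
#include <stdio.h>

enum { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE, MT_TYPES };

#define PAGEBLOCK_ORDER 9

static const int fallbacks[MT_TYPES][3] = {
        [MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE,   MT_RESERVE },
        [MT_RECLAIMABLE] = { MT_UNMOVABLE,   MT_MOVABLE,   MT_RESERVE },
        [MT_MOVABLE]     = { MT_RECLAIMABLE, MT_UNMOVABLE, MT_RESERVE },
        [MT_RESERVE]     = { MT_RESERVE,     MT_RESERVE,   MT_RESERVE },
};

/* Toy free area: one free count per migratetype instead of real free lists. */
struct free_area { int nr_free[MT_TYPES]; };

/* Same shape as can_steal_fallback() above: large requests, or requests from
 * the less movable types, may take over a foreign pageblock. */
static bool can_steal_fallback(unsigned int order, int start_mt)
{
        if (order >= PAGEBLOCK_ORDER)
                return true;
        return order >= PAGEBLOCK_ORDER / 2 ||
               start_mt == MT_RECLAIMABLE || start_mt == MT_UNMOVABLE;
}

/* Walk the fallback list until MT_RESERVE, like find_suitable_fallback(). */
static int find_suitable_fallback(const struct free_area *area,
                                  unsigned int order, int start_mt,
                                  bool only_stealable, bool *can_steal)
{
        *can_steal = false;
        for (int i = 0; ; i++) {
                int mt = fallbacks[start_mt][i];

                if (mt == MT_RESERVE)
                        break;
                if (!area->nr_free[mt])
                        continue;
                if (can_steal_fallback(order, start_mt))
                        *can_steal = true;
                if (!only_stealable || *can_steal)
                        return mt;
        }
        return -1;
}

int main(void)
{
        struct free_area area = { .nr_free = { [MT_MOVABLE] = 4 } };
        bool steal;
        int mt = find_suitable_fallback(&area, 3, MT_UNMOVABLE, false, &steal);

        printf("fallback mt=%d, can_steal=%d\n", mt, steal);
        return 0;
}

The "claim the whole block if over half of it is free" step corresponds to the pages >= (1 << (pageblock_order - 1)) test in steal_suitable_fallback() above.
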
diff --git a/mm/page_io.c b/mm/page_io.c
index e6045804c8d8..6424869e275e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -20,8 +20,8 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/frontswap.h> 22#include <linux/frontswap.h>
23#include <linux/aio.h>
24#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <linux/uio.h>
25#include <asm/pgtable.h> 25#include <asm/pgtable.h>
26 26
27static struct bio *get_swap_bio(gfp_t gfp_flags, 27static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -274,13 +274,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
274 iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); 274 iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
275 init_sync_kiocb(&kiocb, swap_file); 275 init_sync_kiocb(&kiocb, swap_file);
276 kiocb.ki_pos = page_file_offset(page); 276 kiocb.ki_pos = page_file_offset(page);
277 kiocb.ki_nbytes = PAGE_SIZE;
278 277
279 set_page_writeback(page); 278 set_page_writeback(page);
280 unlock_page(page); 279 unlock_page(page);
281 ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE, 280 ret = mapping->a_ops->direct_IO(&kiocb, &from, kiocb.ki_pos);
282 &kiocb, &from,
283 kiocb.ki_pos);
284 if (ret == PAGE_SIZE) { 281 if (ret == PAGE_SIZE) {
285 count_vm_event(PSWPOUT); 282 count_vm_event(PSWPOUT);
286 ret = 0; 283 ret = 0;
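
__swap_writepage() above now describes the page with a single bio_vec-backed iov_iter and hands it to ->direct_IO() together with the kiocb, instead of passing the rw flag and ki_nbytes separately. A loose userspace analogue, assuming only ordinary glibc interfaces (open, pwritev), of describing one page-sized buffer with an iovec and letting a vectored writer consume it at a given offset:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#define PAGE_SZ 4096

int main(void)
{
        char *page = malloc(PAGE_SZ);
        struct iovec bv;
        off_t pos = 3 * PAGE_SZ;        /* plays the role of kiocb.ki_pos */
        ssize_t ret;
        int fd;

        if (!page)
                return 1;
        memset(page, 'x', PAGE_SZ);

        fd = open("swapfile.img", O_CREAT | O_WRONLY, 0600);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* One iovec describes the whole page; the length travels with the
         * vector, so the writer needs no separate nbytes argument. */
        bv.iov_base = page;
        bv.iov_len = PAGE_SZ;

        ret = pwritev(fd, &bv, 1, pos);
        printf("wrote %zd bytes at offset %lld\n", ret, (long long)pos);

        close(fd);
        free(page);
        return ret == PAGE_SZ ? 0 : 1;
}
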
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 72f5ac381ab3..755a42c76eb4 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -103,6 +103,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
103 103
104 if (!is_migrate_isolate_page(buddy)) { 104 if (!is_migrate_isolate_page(buddy)) {
105 __isolate_free_page(page, order); 105 __isolate_free_page(page, order);
106 kernel_map_pages(page, (1 << order), 1);
106 set_page_refcounted(page); 107 set_page_refcounted(page);
107 isolated_page = page; 108 isolated_page = page;
108 } 109 }
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 75c1f2878519..29f2f8b853ae 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -265,8 +265,15 @@ int walk_page_range(unsigned long start, unsigned long end,
265 vma = vma->vm_next; 265 vma = vma->vm_next;
266 266
267 err = walk_page_test(start, next, walk); 267 err = walk_page_test(start, next, walk);
268 if (err > 0) 268 if (err > 0) {
269 /*
270 * positive return values are purely for
271 * controlling the pagewalk, so should never
272 * be passed to the callers.
273 */
274 err = 0;
269 continue; 275 continue;
276 }
270 if (err < 0) 277 if (err < 0)
271 break; 278 break;
272 } 279 }
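
The pagewalk.c hunk above clamps positive return values from walk_page_test() so they steer the walk (skip the VMA) without leaking to callers. A small userspace model of that convention, with a made-up region walker standing in for walk_page_range():

#include <stdio.h>

struct region { const char *name; int skip; };

/* Positive return: control-flow only, meaning "skip this region". */
static int test_region(const struct region *r)
{
        return r->skip ? 1 : 0;
}

static int walk_regions(const struct region *r, int n)
{
        int err = 0;

        for (int i = 0; i < n; i++) {
                err = test_region(&r[i]);
                if (err > 0) {
                        err = 0;        /* never pass control values upward */
                        continue;
                }
                if (err < 0)
                        break;
                printf("visited %s\n", r[i].name);
        }
        return err;
}

int main(void)
{
        struct region regions[] = {
                { "vma A", 0 }, { "vma B", 1 }, { "vma C", 0 },
        };
        return walk_regions(regions, 3);
}
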
diff --git a/mm/percpu.c b/mm/percpu.c
index 73c97a5f4495..dfd02484e8de 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1310,7 +1310,7 @@ bool is_kernel_percpu_address(unsigned long addr)
1310 * and, from the second one, the backing allocator (currently either vm or 1310 * and, from the second one, the backing allocator (currently either vm or
1311 * km) provides translation. 1311 * km) provides translation.
1312 * 1312 *
1313 * The addr can be tranlated simply without checking if it falls into the 1313 * The addr can be translated simply without checking if it falls into the
1314 * first chunk. But the current code reflects better how percpu allocator 1314 * first chunk. But the current code reflects better how percpu allocator
1315 * actually works, and the verification can discover both bugs in percpu 1315 * actually works, and the verification can discover both bugs in percpu
1316 * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current 1316 * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
@@ -1762,7 +1762,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
1762 * and other parameters considering needed percpu size, allocation 1762 * and other parameters considering needed percpu size, allocation
1763 * atom size and distances between CPUs. 1763 * atom size and distances between CPUs.
1764 * 1764 *
1765 * Groups are always mutliples of atom size and CPUs which are of 1765 * Groups are always multiples of atom size and CPUs which are of
1766 * LOCAL_DISTANCE both ways are grouped together and share space for 1766 * LOCAL_DISTANCE both ways are grouped together and share space for
1767 * units in the same group. The returned configuration is guaranteed 1767 * units in the same group. The returned configuration is guaranteed
1768 * to have CPUs on different nodes on different groups and >=75% usage 1768 * to have CPUs on different nodes on different groups and >=75% usage
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index b1597690530c..e88d071648c2 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -257,22 +257,18 @@ static ssize_t process_vm_rw(pid_t pid,
257 struct iovec *iov_r = iovstack_r; 257 struct iovec *iov_r = iovstack_r;
258 struct iov_iter iter; 258 struct iov_iter iter;
259 ssize_t rc; 259 ssize_t rc;
260 int dir = vm_write ? WRITE : READ;
260 261
261 if (flags != 0) 262 if (flags != 0)
262 return -EINVAL; 263 return -EINVAL;
263 264
264 /* Check iovecs */ 265 /* Check iovecs */
265 if (vm_write) 266 rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
266 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, 267 if (rc < 0)
267 iovstack_l, &iov_l); 268 return rc;
268 else 269 if (!iov_iter_count(&iter))
269 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
270 iovstack_l, &iov_l);
271 if (rc <= 0)
272 goto free_iovecs; 270 goto free_iovecs;
273 271
274 iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
275
276 rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, 272 rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
277 iovstack_r, &iov_r); 273 iovstack_r, &iov_r);
278 if (rc <= 0) 274 if (rc <= 0)
@@ -283,8 +279,7 @@ static ssize_t process_vm_rw(pid_t pid,
283free_iovecs: 279free_iovecs:
284 if (iov_r != iovstack_r) 280 if (iov_r != iovstack_r)
285 kfree(iov_r); 281 kfree(iov_r);
286 if (iov_l != iovstack_l) 282 kfree(iov_l);
287 kfree(iov_l);
288 283
289 return rc; 284 return rc;
290} 285}
@@ -320,21 +315,16 @@ compat_process_vm_rw(compat_pid_t pid,
320 struct iovec *iov_r = iovstack_r; 315 struct iovec *iov_r = iovstack_r;
321 struct iov_iter iter; 316 struct iov_iter iter;
322 ssize_t rc = -EFAULT; 317 ssize_t rc = -EFAULT;
318 int dir = vm_write ? WRITE : READ;
323 319
324 if (flags != 0) 320 if (flags != 0)
325 return -EINVAL; 321 return -EINVAL;
326 322
327 if (vm_write) 323 rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
328 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, 324 if (rc < 0)
329 UIO_FASTIOV, iovstack_l, 325 return rc;
330 &iov_l); 326 if (!iov_iter_count(&iter))
331 else
332 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
333 UIO_FASTIOV, iovstack_l,
334 &iov_l);
335 if (rc <= 0)
336 goto free_iovecs; 327 goto free_iovecs;
337 iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
338 rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, 328 rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
339 UIO_FASTIOV, iovstack_r, 329 UIO_FASTIOV, iovstack_r,
340 &iov_r); 330 &iov_r);
@@ -346,8 +336,7 @@ compat_process_vm_rw(compat_pid_t pid,
346free_iovecs: 336free_iovecs:
347 if (iov_r != iovstack_r) 337 if (iov_r != iovstack_r)
348 kfree(iov_r); 338 kfree(iov_r);
349 if (iov_l != iovstack_l) 339 kfree(iov_l);
350 kfree(iov_l);
351 return rc; 340 return rc;
352} 341}
353 342
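
The process_vm_access.c hunks switch the local iovec setup to import_iovec()/compat_import_iovec(), whose contract lets the caller free the iovec unconditionally: the returned pointer is either NULL (the on-stack fast array was enough, or the call failed) or a heap allocation. A userspace sketch of that ownership convention, with a made-up import_segments() helper in place of import_iovec():

#include <stdio.h>
#include <stdlib.h>

#define FAST_SEGS 8

/* Either fill the caller's fast array (and leave *heap NULL) or allocate a
 * bigger array (and return it via *heap), so the caller can always call
 * free(*heap) without checking which case happened. */
static int import_segments(int nr, int fast[FAST_SEGS], int **heap)
{
        int *p = fast;

        *heap = NULL;
        if (nr > FAST_SEGS) {
                p = calloc(nr, sizeof(*p));
                if (!p)
                        return -1;
                *heap = p;
        }
        for (int i = 0; i < nr; i++)
                p[i] = i;               /* pretend to copy/validate segments */
        return 0;
}

int main(void)
{
        int fast[FAST_SEGS];
        int *heap;

        if (import_segments(20, fast, &heap) < 0)
                return 1;
        printf("using %s array\n", heap ? "heap" : "fast");
        free(heap);                     /* safe: free(NULL) is a no-op */
        return 0;
}

This is why the hunks above keep the "if (iov_r != iovstack_r)" check for the side still using rw_copy_check_uvector(), but drop it for iov_l.
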
diff --git a/mm/rmap.c b/mm/rmap.c
index 5e3e09081164..24dd3f9fee27 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -287,6 +287,13 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
287 return 0; 287 return 0;
288 288
289 enomem_failure: 289 enomem_failure:
290 /*
 291 * dst->anon_vma is dropped here; otherwise its degree could be incorrectly
292 * decremented in unlink_anon_vmas().
293 * We can safely do this because callers of anon_vma_clone() don't care
294 * about dst->anon_vma if anon_vma_clone() failed.
295 */
296 dst->anon_vma = NULL;
290 unlink_anon_vmas(dst); 297 unlink_anon_vmas(dst);
291 return -ENOMEM; 298 return -ENOMEM;
292} 299}
@@ -449,7 +456,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
449 unsigned long anon_mapping; 456 unsigned long anon_mapping;
450 457
451 rcu_read_lock(); 458 rcu_read_lock();
452 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 459 anon_mapping = (unsigned long)READ_ONCE(page->mapping);
453 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 460 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
454 goto out; 461 goto out;
455 if (!page_mapped(page)) 462 if (!page_mapped(page))
@@ -493,14 +500,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
493 unsigned long anon_mapping; 500 unsigned long anon_mapping;
494 501
495 rcu_read_lock(); 502 rcu_read_lock();
496 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 503 anon_mapping = (unsigned long)READ_ONCE(page->mapping);
497 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 504 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
498 goto out; 505 goto out;
499 if (!page_mapped(page)) 506 if (!page_mapped(page))
500 goto out; 507 goto out;
501 508
502 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 509 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
503 root_anon_vma = ACCESS_ONCE(anon_vma->root); 510 root_anon_vma = READ_ONCE(anon_vma->root);
504 if (down_read_trylock(&root_anon_vma->rwsem)) { 511 if (down_read_trylock(&root_anon_vma->rwsem)) {
505 /* 512 /*
506 * If the page is still mapped, then this anon_vma is still 513 * If the page is still mapped, then this anon_vma is still
diff --git a/mm/shmem.c b/mm/shmem.c
index cf2d0ca010bc..de981370fbc5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -31,7 +31,7 @@
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/export.h> 32#include <linux/export.h>
33#include <linux/swap.h> 33#include <linux/swap.h>
34#include <linux/aio.h> 34#include <linux/uio.h>
35 35
36static struct vfsmount *shm_mnt; 36static struct vfsmount *shm_mnt;
37 37
@@ -544,7 +544,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
544 544
545static int shmem_setattr(struct dentry *dentry, struct iattr *attr) 545static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
546{ 546{
547 struct inode *inode = dentry->d_inode; 547 struct inode *inode = d_inode(dentry);
548 struct shmem_inode_info *info = SHMEM_I(inode); 548 struct shmem_inode_info *info = SHMEM_I(inode);
549 int error; 549 int error;
550 550
@@ -2274,7 +2274,7 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2274 */ 2274 */
2275static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 2275static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
2276{ 2276{
2277 struct inode *inode = old_dentry->d_inode; 2277 struct inode *inode = d_inode(old_dentry);
2278 int ret; 2278 int ret;
2279 2279
2280 /* 2280 /*
@@ -2298,7 +2298,7 @@ out:
2298 2298
2299static int shmem_unlink(struct inode *dir, struct dentry *dentry) 2299static int shmem_unlink(struct inode *dir, struct dentry *dentry)
2300{ 2300{
2301 struct inode *inode = dentry->d_inode; 2301 struct inode *inode = d_inode(dentry);
2302 2302
2303 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 2303 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
2304 shmem_free_inode(inode->i_sb); 2304 shmem_free_inode(inode->i_sb);
@@ -2315,7 +2315,7 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
2315 if (!simple_empty(dentry)) 2315 if (!simple_empty(dentry))
2316 return -ENOTEMPTY; 2316 return -ENOTEMPTY;
2317 2317
2318 drop_nlink(dentry->d_inode); 2318 drop_nlink(d_inode(dentry));
2319 drop_nlink(dir); 2319 drop_nlink(dir);
2320 return shmem_unlink(dir, dentry); 2320 return shmem_unlink(dir, dentry);
2321} 2321}
@@ -2336,8 +2336,8 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru
2336 } 2336 }
2337 old_dir->i_ctime = old_dir->i_mtime = 2337 old_dir->i_ctime = old_dir->i_mtime =
2338 new_dir->i_ctime = new_dir->i_mtime = 2338 new_dir->i_ctime = new_dir->i_mtime =
2339 old_dentry->d_inode->i_ctime = 2339 d_inode(old_dentry)->i_ctime =
2340 new_dentry->d_inode->i_ctime = CURRENT_TIME; 2340 d_inode(new_dentry)->i_ctime = CURRENT_TIME;
2341 2341
2342 return 0; 2342 return 0;
2343} 2343}
@@ -2376,7 +2376,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
2376 */ 2376 */
2377static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) 2377static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
2378{ 2378{
2379 struct inode *inode = old_dentry->d_inode; 2379 struct inode *inode = d_inode(old_dentry);
2380 int they_are_dirs = S_ISDIR(inode->i_mode); 2380 int they_are_dirs = S_ISDIR(inode->i_mode);
2381 2381
2382 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2382 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
@@ -2396,10 +2396,10 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
2396 return error; 2396 return error;
2397 } 2397 }
2398 2398
2399 if (new_dentry->d_inode) { 2399 if (d_really_is_positive(new_dentry)) {
2400 (void) shmem_unlink(new_dir, new_dentry); 2400 (void) shmem_unlink(new_dir, new_dentry);
2401 if (they_are_dirs) { 2401 if (they_are_dirs) {
2402 drop_nlink(new_dentry->d_inode); 2402 drop_nlink(d_inode(new_dentry));
2403 drop_nlink(old_dir); 2403 drop_nlink(old_dir);
2404 } 2404 }
2405 } else if (they_are_dirs) { 2405 } else if (they_are_dirs) {
@@ -2476,14 +2476,14 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2476 2476
2477static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) 2477static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2478{ 2478{
2479 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); 2479 nd_set_link(nd, SHMEM_I(d_inode(dentry))->symlink);
2480 return NULL; 2480 return NULL;
2481} 2481}
2482 2482
2483static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 2483static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2484{ 2484{
2485 struct page *page = NULL; 2485 struct page *page = NULL;
2486 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 2486 int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL);
2487 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); 2487 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2488 if (page) 2488 if (page)
2489 unlock_page(page); 2489 unlock_page(page);
@@ -2574,7 +2574,7 @@ static int shmem_xattr_validate(const char *name)
2574static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, 2574static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2575 void *buffer, size_t size) 2575 void *buffer, size_t size)
2576{ 2576{
2577 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2577 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
2578 int err; 2578 int err;
2579 2579
2580 /* 2580 /*
@@ -2595,7 +2595,7 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2595static int shmem_setxattr(struct dentry *dentry, const char *name, 2595static int shmem_setxattr(struct dentry *dentry, const char *name,
2596 const void *value, size_t size, int flags) 2596 const void *value, size_t size, int flags)
2597{ 2597{
2598 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2598 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
2599 int err; 2599 int err;
2600 2600
2601 /* 2601 /*
@@ -2615,7 +2615,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
2615 2615
2616static int shmem_removexattr(struct dentry *dentry, const char *name) 2616static int shmem_removexattr(struct dentry *dentry, const char *name)
2617{ 2617{
2618 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2618 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
2619 int err; 2619 int err;
2620 2620
2621 /* 2621 /*
@@ -2635,7 +2635,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
2635 2635
2636static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 2636static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2637{ 2637{
2638 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2638 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
2639 return simple_xattr_list(&info->xattrs, buffer, size); 2639 return simple_xattr_list(&info->xattrs, buffer, size);
2640} 2640}
2641#endif /* CONFIG_TMPFS_XATTR */ 2641#endif /* CONFIG_TMPFS_XATTR */
@@ -3118,8 +3118,6 @@ static const struct file_operations shmem_file_operations = {
3118 .mmap = shmem_mmap, 3118 .mmap = shmem_mmap,
3119#ifdef CONFIG_TMPFS 3119#ifdef CONFIG_TMPFS
3120 .llseek = shmem_file_llseek, 3120 .llseek = shmem_file_llseek,
3121 .read = new_sync_read,
3122 .write = new_sync_write,
3123 .read_iter = shmem_file_read_iter, 3121 .read_iter = shmem_file_read_iter,
3124 .write_iter = generic_file_write_iter, 3122 .write_iter = generic_file_write_iter,
3125 .fsync = noop_fsync, 3123 .fsync = noop_fsync,
diff --git a/mm/slab.c b/mm/slab.c
index c4b89eaf4c96..7eb38dd1cefa 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
857 return NULL; 857 return NULL;
858} 858}
859 859
860static inline gfp_t gfp_exact_node(gfp_t flags)
861{
862 return flags;
863}
864
860#else /* CONFIG_NUMA */ 865#else /* CONFIG_NUMA */
861 866
862static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 867static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
@@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1023 1028
1024 return __cache_free_alien(cachep, objp, node, page_node); 1029 return __cache_free_alien(cachep, objp, node, page_node);
1025} 1030}
1031
1032/*
1033 * Construct gfp mask to allocate from a specific node but do not invoke reclaim
1034 * or warn about failures.
1035 */
1036static inline gfp_t gfp_exact_node(gfp_t flags)
1037{
1038 return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT;
1039}
1026#endif 1040#endif
1027 1041
1028/* 1042/*
@@ -2825,7 +2839,7 @@ alloc_done:
2825 if (unlikely(!ac->avail)) { 2839 if (unlikely(!ac->avail)) {
2826 int x; 2840 int x;
2827force_grow: 2841force_grow:
2828 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 2842 x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
2829 2843
2830 /* cache_grow can reenable interrupts, then ac could change. */ 2844 /* cache_grow can reenable interrupts, then ac could change. */
2831 ac = cpu_cache_get(cachep); 2845 ac = cpu_cache_get(cachep);
@@ -3019,7 +3033,7 @@ retry:
3019 get_node(cache, nid) && 3033 get_node(cache, nid) &&
3020 get_node(cache, nid)->free_objects) { 3034 get_node(cache, nid)->free_objects) {
3021 obj = ____cache_alloc_node(cache, 3035 obj = ____cache_alloc_node(cache,
3022 flags | GFP_THISNODE, nid); 3036 gfp_exact_node(flags), nid);
3023 if (obj) 3037 if (obj)
3024 break; 3038 break;
3025 } 3039 }
@@ -3047,7 +3061,7 @@ retry:
3047 nid = page_to_nid(page); 3061 nid = page_to_nid(page);
3048 if (cache_grow(cache, flags, nid, page)) { 3062 if (cache_grow(cache, flags, nid, page)) {
3049 obj = ____cache_alloc_node(cache, 3063 obj = ____cache_alloc_node(cache,
3050 flags | GFP_THISNODE, nid); 3064 gfp_exact_node(flags), nid);
3051 if (!obj) 3065 if (!obj)
3052 /* 3066 /*
3053 * Another processor may allocate the 3067 * Another processor may allocate the
@@ -3118,7 +3132,7 @@ retry:
3118 3132
3119must_grow: 3133must_grow:
3120 spin_unlock(&n->list_lock); 3134 spin_unlock(&n->list_lock);
3121 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); 3135 x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL);
3122 if (x) 3136 if (x)
3123 goto retry; 3137 goto retry;
3124 3138
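
gfp_exact_node() above composes the allocation flags so the request is pinned to one node, stays quiet on failure and never enters reclaim. A standalone sketch of the same flag arithmetic; the bit values here are invented for illustration and are not the real GFP flag values:

#include <stdio.h>

#define GFP_WAIT      0x01u
#define GFP_IO        0x02u
#define GFP_FS        0x04u
#define GFP_THISNODE  0x10u
#define GFP_NOWARN    0x20u

/* Pin to the node, suppress warnings, strip the "may sleep/reclaim" bit. */
static unsigned int gfp_exact_node(unsigned int flags)
{
        return (flags | GFP_THISNODE | GFP_NOWARN) & ~GFP_WAIT;
}

int main(void)
{
        unsigned int kernel = GFP_WAIT | GFP_IO | GFP_FS;  /* roughly GFP_KERNEL */

        printf("in  0x%02x\n", kernel);
        printf("out 0x%02x\n", gfp_exact_node(kernel));    /* IO|FS|THISNODE|NOWARN */
        return 0;
}
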
diff --git a/mm/slob.c b/mm/slob.c
index 94a7fede6d48..4765f65019c7 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -532,7 +532,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
532 return 0; 532 return 0;
533} 533}
534 534
535void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) 535static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
536{ 536{
537 void *b; 537 void *b;
538 538
@@ -558,7 +558,6 @@ void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
558 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); 558 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
559 return b; 559 return b;
560} 560}
561EXPORT_SYMBOL(slob_alloc_node);
562 561
563void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 562void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
564{ 563{
diff --git a/mm/slub.c b/mm/slub.c
index 6832c4eab104..54c0876b43d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -374,7 +374,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
374 if (cmpxchg_double(&page->freelist, &page->counters, 374 if (cmpxchg_double(&page->freelist, &page->counters,
375 freelist_old, counters_old, 375 freelist_old, counters_old,
376 freelist_new, counters_new)) 376 freelist_new, counters_new))
377 return 1; 377 return true;
378 } else 378 } else
379#endif 379#endif
380 { 380 {
@@ -384,7 +384,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
384 page->freelist = freelist_new; 384 page->freelist = freelist_new;
385 set_page_slub_counters(page, counters_new); 385 set_page_slub_counters(page, counters_new);
386 slab_unlock(page); 386 slab_unlock(page);
387 return 1; 387 return true;
388 } 388 }
389 slab_unlock(page); 389 slab_unlock(page);
390 } 390 }
@@ -396,7 +396,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
396 pr_info("%s %s: cmpxchg double redo ", n, s->name); 396 pr_info("%s %s: cmpxchg double redo ", n, s->name);
397#endif 397#endif
398 398
399 return 0; 399 return false;
400} 400}
401 401
402static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 402static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
@@ -410,7 +410,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
410 if (cmpxchg_double(&page->freelist, &page->counters, 410 if (cmpxchg_double(&page->freelist, &page->counters,
411 freelist_old, counters_old, 411 freelist_old, counters_old,
412 freelist_new, counters_new)) 412 freelist_new, counters_new))
413 return 1; 413 return true;
414 } else 414 } else
415#endif 415#endif
416 { 416 {
@@ -424,7 +424,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
424 set_page_slub_counters(page, counters_new); 424 set_page_slub_counters(page, counters_new);
425 slab_unlock(page); 425 slab_unlock(page);
426 local_irq_restore(flags); 426 local_irq_restore(flags);
427 return 1; 427 return true;
428 } 428 }
429 slab_unlock(page); 429 slab_unlock(page);
430 local_irq_restore(flags); 430 local_irq_restore(flags);
@@ -437,7 +437,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
437 pr_info("%s %s: cmpxchg double redo ", n, s->name); 437 pr_info("%s %s: cmpxchg double redo ", n, s->name);
438#endif 438#endif
439 439
440 return 0; 440 return false;
441} 441}
442 442
443#ifdef CONFIG_SLUB_DEBUG 443#ifdef CONFIG_SLUB_DEBUG
@@ -1137,15 +1137,6 @@ static int __init setup_slub_debug(char *str)
1137 */ 1137 */
1138 goto check_slabs; 1138 goto check_slabs;
1139 1139
1140 if (tolower(*str) == 'o') {
1141 /*
1142 * Avoid enabling debugging on caches if its minimum order
1143 * would increase as a result.
1144 */
1145 disable_higher_order_debug = 1;
1146 goto out;
1147 }
1148
1149 slub_debug = 0; 1140 slub_debug = 0;
1150 if (*str == '-') 1141 if (*str == '-')
1151 /* 1142 /*
@@ -1176,6 +1167,13 @@ static int __init setup_slub_debug(char *str)
1176 case 'a': 1167 case 'a':
1177 slub_debug |= SLAB_FAILSLAB; 1168 slub_debug |= SLAB_FAILSLAB;
1178 break; 1169 break;
1170 case 'o':
1171 /*
1172 * Avoid enabling debugging on caches if its minimum
1173 * order would increase as a result.
1174 */
1175 disable_higher_order_debug = 1;
1176 break;
1179 default: 1177 default:
1180 pr_err("slub_debug option '%c' unknown. skipped\n", 1178 pr_err("slub_debug option '%c' unknown. skipped\n",
1181 *str); 1179 *str);
@@ -2449,7 +2447,8 @@ redo:
2449 do { 2447 do {
2450 tid = this_cpu_read(s->cpu_slab->tid); 2448 tid = this_cpu_read(s->cpu_slab->tid);
2451 c = raw_cpu_ptr(s->cpu_slab); 2449 c = raw_cpu_ptr(s->cpu_slab);
2452 } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); 2450 } while (IS_ENABLED(CONFIG_PREEMPT) &&
2451 unlikely(tid != READ_ONCE(c->tid)));
2453 2452
2454 /* 2453 /*
2455 * Irqless object alloc/free algorithm used here depends on sequence 2454 * Irqless object alloc/free algorithm used here depends on sequence
@@ -2718,7 +2717,8 @@ redo:
2718 do { 2717 do {
2719 tid = this_cpu_read(s->cpu_slab->tid); 2718 tid = this_cpu_read(s->cpu_slab->tid);
2720 c = raw_cpu_ptr(s->cpu_slab); 2719 c = raw_cpu_ptr(s->cpu_slab);
2721 } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); 2720 } while (IS_ENABLED(CONFIG_PREEMPT) &&
2721 unlikely(tid != READ_ONCE(c->tid)));
2722 2722
2723 /* Same with comment on barrier() in slab_alloc_node() */ 2723 /* Same with comment on barrier() in slab_alloc_node() */
2724 barrier(); 2724 barrier();
@@ -4277,7 +4277,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4277 int node; 4277 int node;
4278 struct page *page; 4278 struct page *page;
4279 4279
4280 page = ACCESS_ONCE(c->page); 4280 page = READ_ONCE(c->page);
4281 if (!page) 4281 if (!page)
4282 continue; 4282 continue;
4283 4283
@@ -4292,7 +4292,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4292 total += x; 4292 total += x;
4293 nodes[node] += x; 4293 nodes[node] += x;
4294 4294
4295 page = ACCESS_ONCE(c->partial); 4295 page = READ_ONCE(c->partial);
4296 if (page) { 4296 if (page) {
4297 node = page_to_nid(page); 4297 node = page_to_nid(page);
4298 if (flags & SO_TOTAL) 4298 if (flags & SO_TOTAL)
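
The slub.c hunks wrap the re-read of c->tid in READ_ONCE() so the cpu_slab/tid pair is re-fetched until it is consistent under preemption. A rough userspace model of that retry-until-consistent snapshot, using C11 atomics in place of READ_ONCE():

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long tid;       /* generation counter */
static _Atomic int cpu_slab_data;       /* the data it guards */

static int read_consistent_snapshot(void)
{
        unsigned long t;
        int data;

        do {
                t = atomic_load(&tid);
                data = atomic_load(&cpu_slab_data);
                /* If the generation changed while we were reading, a
                 * concurrent update may have swapped the data, so retry.
                 * The atomic load keeps the compiler from caching the
                 * re-read, which is what READ_ONCE() ensures above. */
        } while (t != atomic_load(&tid));

        return data;
}

int main(void)
{
        atomic_store(&cpu_slab_data, 42);
        atomic_store(&tid, 1);
        printf("snapshot: %d\n", read_consistent_snapshot());
        return 0;
}
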
diff --git a/mm/swap.c b/mm/swap.c
index cd3a5e64cea9..a7251a8ed532 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,7 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h>
34 35
35#include "internal.h" 36#include "internal.h"
36 37
@@ -42,7 +43,7 @@ int page_cluster;
42 43
43static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); 44static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
44static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 45static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
45static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); 46static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
46 47
47/* 48/*
48 * This path almost never happens for VM activity - pages are normally 49 * This path almost never happens for VM activity - pages are normally
@@ -75,7 +76,14 @@ static void __put_compound_page(struct page *page)
75{ 76{
76 compound_page_dtor *dtor; 77 compound_page_dtor *dtor;
77 78
78 __page_cache_release(page); 79 /*
80 * __page_cache_release() is supposed to be called for thp, not for
 81 * hugetlb. This is because a hugetlb page never has PageLRU set
 82 * (it is never put on any LRU list) and no memcg routines should
 83 * be called for hugetlb (it has a separate hugetlb_cgroup).
84 */
85 if (!PageHuge(page))
86 __page_cache_release(page);
79 dtor = get_compound_page_dtor(page); 87 dtor = get_compound_page_dtor(page);
80 (*dtor)(page); 88 (*dtor)(page);
81} 89}
@@ -743,7 +751,7 @@ void lru_cache_add_active_or_unevictable(struct page *page,
743 * be write it out by flusher threads as this is much more effective 751 * be write it out by flusher threads as this is much more effective
744 * than the single-page writeout from reclaim. 752 * than the single-page writeout from reclaim.
745 */ 753 */
746static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, 754static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
747 void *arg) 755 void *arg)
748{ 756{
749 int lru, file; 757 int lru, file;
@@ -811,36 +819,36 @@ void lru_add_drain_cpu(int cpu)
811 local_irq_restore(flags); 819 local_irq_restore(flags);
812 } 820 }
813 821
814 pvec = &per_cpu(lru_deactivate_pvecs, cpu); 822 pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
815 if (pagevec_count(pvec)) 823 if (pagevec_count(pvec))
816 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 824 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
817 825
818 activate_page_drain(cpu); 826 activate_page_drain(cpu);
819} 827}
820 828
821/** 829/**
822 * deactivate_page - forcefully deactivate a page 830 * deactivate_file_page - forcefully deactivate a file page
823 * @page: page to deactivate 831 * @page: page to deactivate
824 * 832 *
825 * This function hints the VM that @page is a good reclaim candidate, 833 * This function hints the VM that @page is a good reclaim candidate,
826 * for example if its invalidation fails due to the page being dirty 834 * for example if its invalidation fails due to the page being dirty
827 * or under writeback. 835 * or under writeback.
828 */ 836 */
829void deactivate_page(struct page *page) 837void deactivate_file_page(struct page *page)
830{ 838{
831 /* 839 /*
832 * In a workload with many unevictable page such as mprotect, unevictable 840 * In a workload with many unevictable pages such as mprotect,
833 * page deactivation for accelerating reclaim is pointless. 841 * unevictable page deactivation for accelerating reclaim is pointless.
834 */ 842 */
835 if (PageUnevictable(page)) 843 if (PageUnevictable(page))
836 return; 844 return;
837 845
838 if (likely(get_page_unless_zero(page))) { 846 if (likely(get_page_unless_zero(page))) {
839 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); 847 struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
840 848
841 if (!pagevec_add(pvec, page)) 849 if (!pagevec_add(pvec, page))
842 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 850 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
843 put_cpu_var(lru_deactivate_pvecs); 851 put_cpu_var(lru_deactivate_file_pvecs);
844 } 852 }
845} 853}
846 854
@@ -872,7 +880,7 @@ void lru_add_drain_all(void)
872 880
873 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || 881 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
874 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || 882 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
875 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || 883 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
876 need_activate_page_drain(cpu)) { 884 need_activate_page_drain(cpu)) {
877 INIT_WORK(work, lru_add_drain_per_cpu); 885 INIT_WORK(work, lru_add_drain_per_cpu);
878 schedule_work_on(cpu, work); 886 schedule_work_on(cpu, work);
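
deactivate_file_page() above keeps the per-cpu pagevec pattern: pages are queued in a small per-cpu vector and only moved on the LRU in bulk once the vector fills or is drained. A simplified userspace model of that batching (a fixed-size vector of 14 entries, no per-cpu or locking concerns):

#include <stdio.h>

#define PAGEVEC_SIZE 14

struct pagevec {
        int nr;
        int pages[PAGEVEC_SIZE];
};

static void move_batch(struct pagevec *pvec)
{
        printf("moving %d pages in one LRU pass\n", pvec->nr);
        pvec->nr = 0;
}

/* Returns the space left after adding; zero tells the caller to flush. */
static int pagevec_add(struct pagevec *pvec, int page)
{
        pvec->pages[pvec->nr++] = page;
        return PAGEVEC_SIZE - pvec->nr;
}

static void deactivate_file_page(struct pagevec *pvec, int page)
{
        if (!pagevec_add(pvec, page))
                move_batch(pvec);
}

int main(void)
{
        struct pagevec pvec = { 0 };

        for (int page = 0; page < 40; page++)
                deactivate_file_page(&pvec, page);
        if (pvec.nr)
                move_batch(&pvec);      /* drain the remainder */
        return 0;
}
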
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 405923f77334..8bc8e66138da 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -390,7 +390,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
390 unsigned int pages, max_pages, last_ra; 390 unsigned int pages, max_pages, last_ra;
391 static atomic_t last_readahead_pages; 391 static atomic_t last_readahead_pages;
392 392
393 max_pages = 1 << ACCESS_ONCE(page_cluster); 393 max_pages = 1 << READ_ONCE(page_cluster);
394 if (max_pages <= 1) 394 if (max_pages <= 1)
395 return 1; 395 return 1;
396 396
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 63f55ccb9b26..a7e72103f23b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1312,7 +1312,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1312 else 1312 else
1313 continue; 1313 continue;
1314 } 1314 }
1315 count = ACCESS_ONCE(si->swap_map[i]); 1315 count = READ_ONCE(si->swap_map[i]);
1316 if (count && swap_count(count) != SWAP_MAP_BAD) 1316 if (count && swap_count(count) != SWAP_MAP_BAD)
1317 break; 1317 break;
1318 } 1318 }
diff --git a/mm/truncate.c b/mm/truncate.c
index ddec5a5966d7..66af9031fae8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,35 +93,6 @@ void do_invalidatepage(struct page *page, unsigned int offset,
93} 93}
94 94
95/* 95/*
96 * This cancels just the dirty bit on the kernel page itself, it
97 * does NOT actually remove dirty bits on any mmap's that may be
98 * around. It also leaves the page tagged dirty, so any sync
99 * activity will still find it on the dirty lists, and in particular,
100 * clear_page_dirty_for_io() will still look at the dirty bits in
101 * the VM.
102 *
103 * Doing this should *normally* only ever be done when a page
104 * is truncated, and is not actually mapped anywhere at all. However,
105 * fs/buffer.c does this when it notices that somebody has cleaned
106 * out all the buffers on a page without actually doing it through
107 * the VM. Can you say "ext3 is horribly ugly"? Tought you could.
108 */
109void cancel_dirty_page(struct page *page, unsigned int account_size)
110{
111 if (TestClearPageDirty(page)) {
112 struct address_space *mapping = page->mapping;
113 if (mapping && mapping_cap_account_dirty(mapping)) {
114 dec_zone_page_state(page, NR_FILE_DIRTY);
115 dec_bdi_stat(inode_to_bdi(mapping->host),
116 BDI_RECLAIMABLE);
117 if (account_size)
118 task_io_account_cancelled_write(account_size);
119 }
120 }
121}
122EXPORT_SYMBOL(cancel_dirty_page);
123
124/*
125 * If truncate cannot remove the fs-private metadata from the page, the page 96 * If truncate cannot remove the fs-private metadata from the page, the page
126 * becomes orphaned. It will be left on the LRU and may even be mapped into 97 * becomes orphaned. It will be left on the LRU and may even be mapped into
127 * user pagetables if we're racing with filemap_fault(). 98 * user pagetables if we're racing with filemap_fault().
@@ -140,7 +111,13 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
140 if (page_has_private(page)) 111 if (page_has_private(page))
141 do_invalidatepage(page, 0, PAGE_CACHE_SIZE); 112 do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
142 113
143 cancel_dirty_page(page, PAGE_CACHE_SIZE); 114 /*
115 * Some filesystems seem to re-dirty the page even after
116 * the VM has canceled the dirty bit (eg ext3 journaling).
 117 * Hence the dirty accounting check is placed after invalidation.
118 */
119 if (TestClearPageDirty(page))
120 account_page_cleaned(page, mapping);
144 121
145 ClearPageMappedToDisk(page); 122 ClearPageMappedToDisk(page);
146 delete_from_page_cache(page); 123 delete_from_page_cache(page);
@@ -513,7 +490,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
513 * of interest and try to speed up its reclaim. 490 * of interest and try to speed up its reclaim.
514 */ 491 */
515 if (!ret) 492 if (!ret)
516 deactivate_page(page); 493 deactivate_file_page(page);
517 count += ret; 494 count += ret;
518 } 495 }
519 pagevec_remove_exceptionals(&pvec); 496 pagevec_remove_exceptionals(&pvec);
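
truncate_complete_page() above replaces cancel_dirty_page() with an open-coded TestClearPageDirty()/account_page_cleaned() pair placed after invalidation. A small sketch of that test-and-clear accounting pattern, with a fake page flag word instead of the real page flags:

#include <stdbool.h>
#include <stdio.h>

#define PG_DIRTY 0x1u

struct fake_page { unsigned int flags; };

/* Clear the dirty flag and report whether this caller observed it set. */
static bool test_clear_page_dirty(struct fake_page *page)
{
        bool was_dirty = page->flags & PG_DIRTY;

        page->flags &= ~PG_DIRTY;
        return was_dirty;
}

static void account_page_cleaned(struct fake_page *page)
{
        (void)page;
        printf("dirty page accounting decremented\n");
}

static void truncate_complete_page(struct fake_page *page)
{
        /* Invalidate fs-private state first (do_invalidatepage() above),
         * then cancel dirty accounting, so a re-dirty during invalidation
         * is still seen here and accounted exactly once. */
        if (test_clear_page_dirty(page))
                account_page_cleaned(page);
}

int main(void)
{
        struct fake_page page = { .flags = PG_DIRTY };

        truncate_complete_page(&page);
        truncate_complete_page(&page);  /* second call: nothing to account */
        return 0;
}
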
diff --git a/mm/util.c b/mm/util.c
index 3981ae9d1b15..68ff8a5361e7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -325,9 +325,37 @@ void kvfree(const void *addr)
325} 325}
326EXPORT_SYMBOL(kvfree); 326EXPORT_SYMBOL(kvfree);
327 327
328static inline void *__page_rmapping(struct page *page)
329{
330 unsigned long mapping;
331
332 mapping = (unsigned long)page->mapping;
333 mapping &= ~PAGE_MAPPING_FLAGS;
334
335 return (void *)mapping;
336}
337
338/* Neutral page->mapping pointer to address_space or anon_vma or other */
339void *page_rmapping(struct page *page)
340{
341 page = compound_head(page);
342 return __page_rmapping(page);
343}
344
345struct anon_vma *page_anon_vma(struct page *page)
346{
347 unsigned long mapping;
348
349 page = compound_head(page);
350 mapping = (unsigned long)page->mapping;
351 if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
352 return NULL;
353 return __page_rmapping(page);
354}
355
328struct address_space *page_mapping(struct page *page) 356struct address_space *page_mapping(struct page *page)
329{ 357{
330 struct address_space *mapping = page->mapping; 358 unsigned long mapping;
331 359
332 /* This happens if someone calls flush_dcache_page on slab page */ 360 /* This happens if someone calls flush_dcache_page on slab page */
333 if (unlikely(PageSlab(page))) 361 if (unlikely(PageSlab(page)))
@@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page)
337 swp_entry_t entry; 365 swp_entry_t entry;
338 366
339 entry.val = page_private(page); 367 entry.val = page_private(page);
340 mapping = swap_address_space(entry); 368 return swap_address_space(entry);
341 } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) 369 }
342 mapping = NULL; 370
343 return mapping; 371 mapping = (unsigned long)page->mapping;
372 if (mapping & PAGE_MAPPING_FLAGS)
373 return NULL;
374 return page->mapping;
344} 375}
345 376
346int overcommit_ratio_handler(struct ctl_table *table, int write, 377int overcommit_ratio_handler(struct ctl_table *table, int write,
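
page_rmapping(), page_anon_vma() and page_mapping() above all rely on the low bits of page->mapping acting as type tags that must be masked off before the pointer is used. A userspace model of that tagged-pointer scheme; the flag values only mirror PAGE_MAPPING_ANON/PAGE_MAPPING_FLAGS in spirit:

#include <stdint.h>
#include <stdio.h>

#define MAPPING_ANON    0x1u
#define MAPPING_FLAGS   0x3u

struct anon_vma { const char *name; };

/* Mask off the tag bits to recover the raw pointer, like __page_rmapping(). */
static void *untag(uintptr_t mapping)
{
        return (void *)(mapping & ~(uintptr_t)MAPPING_FLAGS);
}

static struct anon_vma *mapping_anon_vma(uintptr_t mapping)
{
        if ((mapping & MAPPING_FLAGS) != MAPPING_ANON)
                return NULL;            /* file-backed or special: no anon_vma */
        return untag(mapping);
}

int main(void)
{
        static struct anon_vma av = { "stack vma" };
        uintptr_t mapping = (uintptr_t)&av | MAPPING_ANON;
        struct anon_vma *got = mapping_anon_vma(mapping);

        printf("%s\n", got ? got->name : "not anon");
        return 0;
}
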
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 35b25e1340ca..2faaa2976447 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -29,6 +29,7 @@
29#include <linux/atomic.h> 29#include <linux/atomic.h>
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/llist.h> 31#include <linux/llist.h>
32#include <linux/bitops.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
@@ -74,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
74 pmd = pmd_offset(pud, addr); 75 pmd = pmd_offset(pud, addr);
75 do { 76 do {
76 next = pmd_addr_end(addr, end); 77 next = pmd_addr_end(addr, end);
78 if (pmd_clear_huge(pmd))
79 continue;
77 if (pmd_none_or_clear_bad(pmd)) 80 if (pmd_none_or_clear_bad(pmd))
78 continue; 81 continue;
79 vunmap_pte_range(pmd, addr, next); 82 vunmap_pte_range(pmd, addr, next);
@@ -88,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
88 pud = pud_offset(pgd, addr); 91 pud = pud_offset(pgd, addr);
89 do { 92 do {
90 next = pud_addr_end(addr, end); 93 next = pud_addr_end(addr, end);
94 if (pud_clear_huge(pud))
95 continue;
91 if (pud_none_or_clear_bad(pud)) 96 if (pud_none_or_clear_bad(pud))
92 continue; 97 continue;
93 vunmap_pmd_range(pud, addr, next); 98 vunmap_pmd_range(pud, addr, next);
@@ -760,7 +765,7 @@ struct vmap_block {
760 spinlock_t lock; 765 spinlock_t lock;
761 struct vmap_area *va; 766 struct vmap_area *va;
762 unsigned long free, dirty; 767 unsigned long free, dirty;
763 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 768 unsigned long dirty_min, dirty_max; /*< dirty range */
764 struct list_head free_list; 769 struct list_head free_list;
765 struct rcu_head rcu_head; 770 struct rcu_head rcu_head;
766 struct list_head purge; 771 struct list_head purge;
@@ -791,13 +796,31 @@ static unsigned long addr_to_vb_idx(unsigned long addr)
791 return addr; 796 return addr;
792} 797}
793 798
794static struct vmap_block *new_vmap_block(gfp_t gfp_mask) 799static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
800{
801 unsigned long addr;
802
803 addr = va_start + (pages_off << PAGE_SHIFT);
804 BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
805 return (void *)addr;
806}
807
808/**
 809 * new_vmap_block - allocate a new vmap_block and occupy 2^order pages in this
 810 * block. Of course the number of pages can't exceed VMAP_BBMAP_BITS.
 811 * @order: 2^order pages are occupied in the newly allocated block
812 * @gfp_mask: flags for the page level allocator
813 *
814 * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
815 */
816static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
795{ 817{
796 struct vmap_block_queue *vbq; 818 struct vmap_block_queue *vbq;
797 struct vmap_block *vb; 819 struct vmap_block *vb;
798 struct vmap_area *va; 820 struct vmap_area *va;
799 unsigned long vb_idx; 821 unsigned long vb_idx;
800 int node, err; 822 int node, err;
823 void *vaddr;
801 824
802 node = numa_node_id(); 825 node = numa_node_id();
803 826
@@ -821,11 +844,15 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
821 return ERR_PTR(err); 844 return ERR_PTR(err);
822 } 845 }
823 846
847 vaddr = vmap_block_vaddr(va->va_start, 0);
824 spin_lock_init(&vb->lock); 848 spin_lock_init(&vb->lock);
825 vb->va = va; 849 vb->va = va;
826 vb->free = VMAP_BBMAP_BITS; 850 /* At least something should be left free */
851 BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
852 vb->free = VMAP_BBMAP_BITS - (1UL << order);
827 vb->dirty = 0; 853 vb->dirty = 0;
828 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); 854 vb->dirty_min = VMAP_BBMAP_BITS;
855 vb->dirty_max = 0;
829 INIT_LIST_HEAD(&vb->free_list); 856 INIT_LIST_HEAD(&vb->free_list);
830 857
831 vb_idx = addr_to_vb_idx(va->va_start); 858 vb_idx = addr_to_vb_idx(va->va_start);
@@ -837,11 +864,11 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
837 864
838 vbq = &get_cpu_var(vmap_block_queue); 865 vbq = &get_cpu_var(vmap_block_queue);
839 spin_lock(&vbq->lock); 866 spin_lock(&vbq->lock);
840 list_add_rcu(&vb->free_list, &vbq->free); 867 list_add_tail_rcu(&vb->free_list, &vbq->free);
841 spin_unlock(&vbq->lock); 868 spin_unlock(&vbq->lock);
842 put_cpu_var(vmap_block_queue); 869 put_cpu_var(vmap_block_queue);
843 870
844 return vb; 871 return vaddr;
845} 872}
846 873
847static void free_vmap_block(struct vmap_block *vb) 874static void free_vmap_block(struct vmap_block *vb)
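
new_vmap_block() now takes the allocation order and returns a usable virtual address, with the first 2^order pages already charged to the block, and vb_alloc() below hands out further space as a simple offset computed from the remaining free count. A userspace sketch of that bump-style block allocator; new_block(), block_alloc() and the page size and capacity are illustrative, not the kernel code:

#include <stdio.h>

#define BLOCK_PAGES 1024        /* stands in for VMAP_BBMAP_BITS */

struct block {
        unsigned long base;     /* stands in for va->va_start */
        unsigned long free;     /* pages still available */
};

static unsigned long block_vaddr(const struct block *b, unsigned long pages_off)
{
        return b->base + pages_off * 4096UL;
}

/* Bump allocation: the offset is simply capacity minus remaining free pages. */
static unsigned long block_alloc(struct block *b, unsigned int order)
{
        unsigned long npages = 1UL << order;
        unsigned long off;

        if (b->free < npages)
                return 0;       /* caller would fall back to a new block */
        off = BLOCK_PAGES - b->free;
        b->free -= npages;
        return block_vaddr(b, off);
}

/* Like new_vmap_block(): created already minus the first request. */
static struct block new_block(unsigned int order, unsigned long base)
{
        struct block b = { .base = base, .free = BLOCK_PAGES - (1UL << order) };
        return b;
}

int main(void)
{
        struct block b = new_block(2, 0x100000UL);      /* first 4 pages pre-charged */

        printf("first extra alloc at %#lx\n", block_alloc(&b, 1));
        printf("free pages left: %lu\n", b.free);
        return 0;
}
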
@@ -876,7 +903,8 @@ static void purge_fragmented_blocks(int cpu)
876 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { 903 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
877 vb->free = 0; /* prevent further allocs after releasing lock */ 904 vb->free = 0; /* prevent further allocs after releasing lock */
878 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ 905 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
879 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); 906 vb->dirty_min = 0;
907 vb->dirty_max = VMAP_BBMAP_BITS;
880 spin_lock(&vbq->lock); 908 spin_lock(&vbq->lock);
881 list_del_rcu(&vb->free_list); 909 list_del_rcu(&vb->free_list);
882 spin_unlock(&vbq->lock); 910 spin_unlock(&vbq->lock);
@@ -905,7 +933,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
905{ 933{
906 struct vmap_block_queue *vbq; 934 struct vmap_block_queue *vbq;
907 struct vmap_block *vb; 935 struct vmap_block *vb;
908 unsigned long addr = 0; 936 void *vaddr = NULL;
909 unsigned int order; 937 unsigned int order;
910 938
911 BUG_ON(size & ~PAGE_MASK); 939 BUG_ON(size & ~PAGE_MASK);
@@ -920,43 +948,38 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
920 } 948 }
921 order = get_order(size); 949 order = get_order(size);
922 950
923again:
924 rcu_read_lock(); 951 rcu_read_lock();
925 vbq = &get_cpu_var(vmap_block_queue); 952 vbq = &get_cpu_var(vmap_block_queue);
926 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 953 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
927 int i; 954 unsigned long pages_off;
928 955
929 spin_lock(&vb->lock); 956 spin_lock(&vb->lock);
930 if (vb->free < 1UL << order) 957 if (vb->free < (1UL << order)) {
931 goto next; 958 spin_unlock(&vb->lock);
959 continue;
960 }
932 961
933 i = VMAP_BBMAP_BITS - vb->free; 962 pages_off = VMAP_BBMAP_BITS - vb->free;
934 addr = vb->va->va_start + (i << PAGE_SHIFT); 963 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
935 BUG_ON(addr_to_vb_idx(addr) !=
936 addr_to_vb_idx(vb->va->va_start));
937 vb->free -= 1UL << order; 964 vb->free -= 1UL << order;
938 if (vb->free == 0) { 965 if (vb->free == 0) {
939 spin_lock(&vbq->lock); 966 spin_lock(&vbq->lock);
940 list_del_rcu(&vb->free_list); 967 list_del_rcu(&vb->free_list);
941 spin_unlock(&vbq->lock); 968 spin_unlock(&vbq->lock);
942 } 969 }
970
943 spin_unlock(&vb->lock); 971 spin_unlock(&vb->lock);
944 break; 972 break;
945next:
946 spin_unlock(&vb->lock);
947 } 973 }
948 974
949 put_cpu_var(vmap_block_queue); 975 put_cpu_var(vmap_block_queue);
950 rcu_read_unlock(); 976 rcu_read_unlock();
951 977
952 if (!addr) { 978 /* Allocate new block if nothing was found */
953 vb = new_vmap_block(gfp_mask); 979 if (!vaddr)
954 if (IS_ERR(vb)) 980 vaddr = new_vmap_block(order, gfp_mask);
955 return vb;
956 goto again;
957 }
958 981
959 return (void *)addr; 982 return vaddr;
960} 983}
961 984
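
With the retry loop gone, vb_alloc() relies on a simple invariant: pages are only handed out from the front of a block, so the next free page offset is always VMAP_BBMAP_BITS - vb->free, and freed pages are never reused until the whole block is purged. A standalone toy model of that invariant (not kernel code; the struct and helper names below are made up for illustration):

    #include <stdio.h>

    #define VMAP_BBMAP_BITS 1024UL                  /* pages per block (example) */

    struct toy_block {
            unsigned long free;                     /* pages still available */
    };

    /* returns the page offset inside the block, or -1 if it does not fit */
    static long toy_alloc(struct toy_block *vb, unsigned int order)
    {
            unsigned long pages = 1UL << order;
            long pages_off;

            if (vb->free < pages)
                    return -1;

            pages_off = VMAP_BBMAP_BITS - vb->free; /* allocate from the front */
            vb->free -= pages;
            return pages_off;
    }

    int main(void)
    {
            struct toy_block vb = { .free = VMAP_BBMAP_BITS };

            printf("%ld\n", toy_alloc(&vb, 0));     /* 0 */
            printf("%ld\n", toy_alloc(&vb, 2));     /* 1 */
            printf("%ld\n", toy_alloc(&vb, 0));     /* 5 */
            return 0;
    }
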
962static void vb_free(const void *addr, unsigned long size) 985static void vb_free(const void *addr, unsigned long size)
@@ -974,6 +997,7 @@ static void vb_free(const void *addr, unsigned long size)
974 order = get_order(size); 997 order = get_order(size);
975 998
976 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); 999 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
1000 offset >>= PAGE_SHIFT;
977 1001
978 vb_idx = addr_to_vb_idx((unsigned long)addr); 1002 vb_idx = addr_to_vb_idx((unsigned long)addr);
979 rcu_read_lock(); 1003 rcu_read_lock();
@@ -984,7 +1008,10 @@ static void vb_free(const void *addr, unsigned long size)
984 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); 1008 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
985 1009
986 spin_lock(&vb->lock); 1010 spin_lock(&vb->lock);
987 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); 1011
1012 /* Expand dirty range */
1013 vb->dirty_min = min(vb->dirty_min, offset);
1014 vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
988 1015
989 vb->dirty += 1UL << order; 1016 vb->dirty += 1UL << order;
990 if (vb->dirty == VMAP_BBMAP_BITS) { 1017 if (vb->dirty == VMAP_BBMAP_BITS) {
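
Because freed pages are never re-allocated from the same block (see vb_alloc() above), tracking a single [dirty_min, dirty_max) window is enough to know what must be flushed later; the range may over-approximate, trading some extra flushing for not maintaining a bitmap. A standalone sketch of the bookkeeping, with made-up helper names:

    #include <stdio.h>

    #define VMAP_BBMAP_BITS 1024UL

    struct toy_dirty {
            unsigned long dirty_min;        /* first dirty page offset */
            unsigned long dirty_max;        /* one past the last dirty page */
    };

    static void toy_dirty_init(struct toy_dirty *d)
    {
            d->dirty_min = VMAP_BBMAP_BITS; /* "empty" range: min > max */
            d->dirty_max = 0;
    }

    static void toy_mark_dirty(struct toy_dirty *d, unsigned long offset,
                               unsigned int order)
    {
            unsigned long end = offset + (1UL << order);

            d->dirty_min = d->dirty_min < offset ? d->dirty_min : offset;
            d->dirty_max = d->dirty_max > end ? d->dirty_max : end;
    }

    int main(void)
    {
            struct toy_dirty d;

            toy_dirty_init(&d);
            toy_mark_dirty(&d, 8, 1);       /* frees pages 8..9 */
            toy_mark_dirty(&d, 2, 0);       /* frees page 2     */
            printf("[%lu, %lu)\n", d.dirty_min, d.dirty_max);   /* [2, 10) */
            return 0;
    }
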
@@ -1023,25 +1050,18 @@ void vm_unmap_aliases(void)
1023 1050
1024 rcu_read_lock(); 1051 rcu_read_lock();
1025 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 1052 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1026 int i, j;
1027
1028 spin_lock(&vb->lock); 1053 spin_lock(&vb->lock);
1029 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); 1054 if (vb->dirty) {
1030 if (i < VMAP_BBMAP_BITS) { 1055 unsigned long va_start = vb->va->va_start;
1031 unsigned long s, e; 1056 unsigned long s, e;
1032 1057
1033 j = find_last_bit(vb->dirty_map, 1058 s = va_start + (vb->dirty_min << PAGE_SHIFT);
1034 VMAP_BBMAP_BITS); 1059 e = va_start + (vb->dirty_max << PAGE_SHIFT);
1035 j = j + 1; /* need exclusive index */
1036 1060
1037 s = vb->va->va_start + (i << PAGE_SHIFT); 1061 start = min(s, start);
1038 e = vb->va->va_start + (j << PAGE_SHIFT); 1062 end = max(e, end);
1039 flush = 1;
1040 1063
1041 if (s < start) 1064 flush = 1;
1042 start = s;
1043 if (e > end)
1044 end = e;
1045 } 1065 }
1046 spin_unlock(&vb->lock); 1066 spin_unlock(&vb->lock);
1047 } 1067 }
@@ -1314,7 +1334,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1314 1334
1315 BUG_ON(in_interrupt()); 1335 BUG_ON(in_interrupt());
1316 if (flags & VM_IOREMAP) 1336 if (flags & VM_IOREMAP)
1317 align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); 1337 align = 1ul << clamp_t(int, fls_long(size),
1338 PAGE_SHIFT, IOREMAP_MAX_ORDER);
1318 1339
1319 size = PAGE_ALIGN(size); 1340 size = PAGE_ALIGN(size);
1320 if (unlikely(!size)) 1341 if (unlikely(!size))
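
The fls_long()/clamp_t() variant matters on 64-bit kernels: plain fls() only examines an int, so a very large VM_IOREMAP size would be truncated and yield a much smaller alignment than intended, and clamp_t() forces a consistent comparison type for the bounds. A worked userspace example of the resulting alignment, assuming a 64-bit build, PAGE_SHIFT = 12 and the default IOREMAP_MAX_ORDER of (7 + PAGE_SHIFT); both values are configuration dependent, and fls_long()/clamp below are local re-implementations for illustration only:

    #include <stdio.h>

    #define PAGE_SHIFT        12
    #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT)

    static int fls_long(unsigned long x)    /* 1-based index of highest set bit */
    {
            int r = 0;

            while (x) {
                    x >>= 1;
                    r++;
            }
            return r;
    }

    static int clamp_int(int val, int lo, int hi)
    {
            return val < lo ? lo : (val > hi ? hi : val);
    }

    int main(void)
    {
            unsigned long sizes[] = { 4096, 3UL << 20, 8UL << 30 };

            for (int i = 0; i < 3; i++) {
                    unsigned long align = 1UL << clamp_int(fls_long(sizes[i]),
                                                           PAGE_SHIFT,
                                                           IOREMAP_MAX_ORDER);
                    /* prints 8192, 524288, 524288 */
                    printf("size %lu -> align %lu\n", sizes[i], align);
            }
            return 0;
    }
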
@@ -1418,6 +1439,7 @@ struct vm_struct *remove_vm_area(const void *addr)
1418 spin_unlock(&vmap_area_lock); 1439 spin_unlock(&vmap_area_lock);
1419 1440
1420 vmap_debug_free_range(va->va_start, va->va_end); 1441 vmap_debug_free_range(va->va_start, va->va_end);
1442 kasan_free_shadow(vm);
1421 free_unmap_vmap_area(va); 1443 free_unmap_vmap_area(va);
1422 vm->size -= PAGE_SIZE; 1444 vm->size -= PAGE_SIZE;
1423 1445
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0dec1fa5f656..08bd7a3d464a 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -12,35 +12,6 @@
12 */ 12 */
13 13
14/* 14/*
15 * This allocator is designed for use with zram. Thus, the allocator is
16 * supposed to work well under low memory conditions. In particular, it
17 * never attempts higher order page allocation which is very likely to
18 * fail under memory pressure. On the other hand, if we just use single
19 * (0-order) pages, it would suffer from very high fragmentation --
20 * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
21 * This was one of the major issues with its predecessor (xvmalloc).
22 *
23 * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
24 * and links them together using various 'struct page' fields. These linked
25 * pages act as a single higher-order page i.e. an object can span 0-order
26 * page boundaries. The code refers to these linked pages as a single entity
27 * called zspage.
28 *
29 * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
30 * since this satisfies the requirements of all its current users (in the
31 * worst case, page is incompressible and is thus stored "as-is" i.e. in
32 * uncompressed form). For allocation requests larger than this size, failure
33 * is returned (see zs_malloc).
34 *
35 * Additionally, zs_malloc() does not return a dereferenceable pointer.
36 * Instead, it returns an opaque handle (unsigned long) which encodes actual
37 * location of the allocated object. The reason for this indirection is that
38 * zsmalloc does not keep zspages permanently mapped since that would cause
39 * issues on 32-bit systems where the VA region for kernel space mappings
40 * is very small. So, before using the allocating memory, the object has to
41 * be mapped using zs_map_object() to get a usable pointer and subsequently
42 * unmapped using zs_unmap_object().
43 *
44 * Following is how we use various fields and flags of underlying 15 * Following is how we use various fields and flags of underlying
45 * struct page(s) to form a zspage. 16 * struct page(s) to form a zspage.
46 * 17 *
@@ -57,6 +28,8 @@
57 * 28 *
58 * page->private (union with page->first_page): refers to the 29 * page->private (union with page->first_page): refers to the
59 * component page after the first page 30 * component page after the first page
 31 * If the page is the first_page of a huge object, it stores the handle
 32 * (see size_class->huge).
60 * page->freelist: points to the first free object in zspage. 33 * page->freelist: points to the first free object in zspage.
61 * Free objects are linked together using in-place 34 * Free objects are linked together using in-place
62 * metadata. 35 * metadata.
@@ -78,6 +51,7 @@
78 51
79#include <linux/module.h> 52#include <linux/module.h>
80#include <linux/kernel.h> 53#include <linux/kernel.h>
54#include <linux/sched.h>
81#include <linux/bitops.h> 55#include <linux/bitops.h>
82#include <linux/errno.h> 56#include <linux/errno.h>
83#include <linux/highmem.h> 57#include <linux/highmem.h>
@@ -110,6 +84,8 @@
110#define ZS_MAX_ZSPAGE_ORDER 2 84#define ZS_MAX_ZSPAGE_ORDER 2
111#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) 85#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
112 86
87#define ZS_HANDLE_SIZE (sizeof(unsigned long))
88
113/* 89/*
114 * Object location (<PFN>, <obj_idx>) is encoded as 90 * Object location (<PFN>, <obj_idx>) is encoded as
115 * as single (unsigned long) handle value. 91 * as single (unsigned long) handle value.
@@ -133,13 +109,33 @@
133#endif 109#endif
134#endif 110#endif
135#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) 111#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
136#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) 112
113/*
 114 * The memory allocated for a handle keeps the object position by
 115 * encoding <page, obj_idx>, and the encoded value leaves room in its
 116 * least significant bit (see obj_to_location).
 117 * We use that bit to synchronize object access between the user
 118 * and migration.
119 */
120#define HANDLE_PIN_BIT 0
121
122/*
 123 * The head of an allocated object carries OBJ_ALLOCATED_TAG so we can
 124 * tell whether the object is allocated.
 125 * It's okay to keep this status in the least significant bit because
 126 * the header holds a handle, which is a 4-byte-aligned address, so at
 127 * least two low bits are available.
128 */
129#define OBJ_ALLOCATED_TAG 1
130#define OBJ_TAG_BITS 1
131#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
137#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) 132#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
138 133
139#define MAX(a, b) ((a) >= (b) ? (a) : (b)) 134#define MAX(a, b) ((a) >= (b) ? (a) : (b))
140/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ 135/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
141#define ZS_MIN_ALLOC_SIZE \ 136#define ZS_MIN_ALLOC_SIZE \
142 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) 137 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
138/* each chunk includes extra space to keep handle */
143#define ZS_MAX_ALLOC_SIZE PAGE_SIZE 139#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
144 140
145/* 141/*
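
For a concrete sense of the new layout: on a 64-bit build with MAX_PHYSMEM_BITS = 46 and PAGE_SHIFT = 12 (both architecture and config dependent), _PFN_BITS = 34 and OBJ_INDEX_BITS = 64 - 34 - 1 = 29, so an encoded obj value looks like [ PFN:34 | obj_idx:29 | tag:1 ]. Note that the two tag-style bits live in different words: HANDLE_PIN_BIT is bit 0 of the obj value stored in the handle slot, while OBJ_ALLOCATED_TAG is bit 0 of the pointer-aligned handle stored in the object's header.
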
@@ -172,6 +168,8 @@ enum fullness_group {
172enum zs_stat_type { 168enum zs_stat_type {
173 OBJ_ALLOCATED, 169 OBJ_ALLOCATED,
174 OBJ_USED, 170 OBJ_USED,
171 CLASS_ALMOST_FULL,
172 CLASS_ALMOST_EMPTY,
175 NR_ZS_STAT_TYPE, 173 NR_ZS_STAT_TYPE,
176}; 174};
177 175
@@ -216,6 +214,8 @@ struct size_class {
216 214
217 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 215 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
218 int pages_per_zspage; 216 int pages_per_zspage;
217 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
218 bool huge;
219 219
220#ifdef CONFIG_ZSMALLOC_STAT 220#ifdef CONFIG_ZSMALLOC_STAT
221 struct zs_size_stat stats; 221 struct zs_size_stat stats;
@@ -233,14 +233,24 @@ struct size_class {
233 * This must be power of 2 and less than or equal to ZS_ALIGN 233 * This must be power of 2 and less than or equal to ZS_ALIGN
234 */ 234 */
235struct link_free { 235struct link_free {
236 /* Handle of next free chunk (encodes <PFN, obj_idx>) */ 236 union {
237 void *next; 237 /*
238 * Position of next free chunk (encodes <PFN, obj_idx>)
 239 * It is valid only for a free (non-allocated) object.
240 */
241 void *next;
242 /*
243 * Handle of allocated object.
244 */
245 unsigned long handle;
246 };
238}; 247};
239 248
240struct zs_pool { 249struct zs_pool {
241 char *name; 250 char *name;
242 251
243 struct size_class **size_class; 252 struct size_class **size_class;
253 struct kmem_cache *handle_cachep;
244 254
245 gfp_t flags; /* allocation flags used when growing pool */ 255 gfp_t flags; /* allocation flags used when growing pool */
246 atomic_long_t pages_allocated; 256 atomic_long_t pages_allocated;
@@ -267,8 +277,37 @@ struct mapping_area {
267#endif 277#endif
268 char *vm_addr; /* address of kmap_atomic()'ed pages */ 278 char *vm_addr; /* address of kmap_atomic()'ed pages */
269 enum zs_mapmode vm_mm; /* mapping mode */ 279 enum zs_mapmode vm_mm; /* mapping mode */
280 bool huge;
270}; 281};
271 282
283static int create_handle_cache(struct zs_pool *pool)
284{
285 pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
286 0, 0, NULL);
287 return pool->handle_cachep ? 0 : 1;
288}
289
290static void destroy_handle_cache(struct zs_pool *pool)
291{
292 kmem_cache_destroy(pool->handle_cachep);
293}
294
295static unsigned long alloc_handle(struct zs_pool *pool)
296{
297 return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
298 pool->flags & ~__GFP_HIGHMEM);
299}
300
301static void free_handle(struct zs_pool *pool, unsigned long handle)
302{
303 kmem_cache_free(pool->handle_cachep, (void *)handle);
304}
305
306static void record_obj(unsigned long handle, unsigned long obj)
307{
308 *(unsigned long *)handle = obj;
309}
310
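
The handle is now one level of indirection away from the object: it is just the address of a small slot (from the new zs_handle cache) holding the current obj location, so compaction can move an object and update the slot through record_obj() without the caller ever seeing its handle change. A userspace model of that indirection, with malloc() standing in for the kmem_cache and error handling omitted:

    #include <stdio.h>
    #include <stdlib.h>

    static unsigned long alloc_handle(void)
    {
            return (unsigned long)calloc(1, sizeof(unsigned long));
    }

    static void free_handle(unsigned long handle)
    {
            free((void *)handle);
    }

    static void record_obj(unsigned long handle, unsigned long obj)
    {
            *(unsigned long *)handle = obj;         /* point the handle at obj */
    }

    static unsigned long handle_to_obj(unsigned long handle)
    {
            return *(unsigned long *)handle;
    }

    int main(void)
    {
            unsigned long handle = alloc_handle();

            record_obj(handle, 0xABC0UL);                   /* initial location */
            record_obj(handle, 0xDEF0UL);                   /* object migrated  */
            printf("obj = %#lx\n", handle_to_obj(handle));  /* 0xdef0 */
            free_handle(handle);
            return 0;
    }
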
272/* zpool driver */ 311/* zpool driver */
273 312
274#ifdef CONFIG_ZPOOL 313#ifdef CONFIG_ZPOOL
@@ -346,6 +385,11 @@ static struct zpool_driver zs_zpool_driver = {
346MODULE_ALIAS("zpool-zsmalloc"); 385MODULE_ALIAS("zpool-zsmalloc");
347#endif /* CONFIG_ZPOOL */ 386#endif /* CONFIG_ZPOOL */
348 387
388static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
389{
390 return pages_per_zspage * PAGE_SIZE / size;
391}
392
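
As a worked example, assuming PAGE_SIZE = 4096: a class of size 1536 (the 3/8 * PAGE_SIZE case discussed further down) with pages_per_zspage = 3 gives get_maxobj_per_zspage() = 3 * 4096 / 1536 = 8 objects, with wastage Zp % size = 12288 % 1536 = 0. A class with pages_per_zspage = 1 and a size just over PAGE_SIZE / 2 yields only one object per zspage, which is exactly the size_class->huge case introduced above.
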
349/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 393/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
350static DEFINE_PER_CPU(struct mapping_area, zs_map_area); 394static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
351 395
@@ -396,9 +440,182 @@ static int get_size_class_index(int size)
396 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, 440 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
397 ZS_SIZE_CLASS_DELTA); 441 ZS_SIZE_CLASS_DELTA);
398 442
399 return idx; 443 return min(zs_size_classes - 1, idx);
444}
445
446#ifdef CONFIG_ZSMALLOC_STAT
447
448static inline void zs_stat_inc(struct size_class *class,
449 enum zs_stat_type type, unsigned long cnt)
450{
451 class->stats.objs[type] += cnt;
452}
453
454static inline void zs_stat_dec(struct size_class *class,
455 enum zs_stat_type type, unsigned long cnt)
456{
457 class->stats.objs[type] -= cnt;
458}
459
460static inline unsigned long zs_stat_get(struct size_class *class,
461 enum zs_stat_type type)
462{
463 return class->stats.objs[type];
464}
465
466static int __init zs_stat_init(void)
467{
468 if (!debugfs_initialized())
469 return -ENODEV;
470
471 zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
472 if (!zs_stat_root)
473 return -ENOMEM;
474
475 return 0;
476}
477
478static void __exit zs_stat_exit(void)
479{
480 debugfs_remove_recursive(zs_stat_root);
481}
482
483static int zs_stats_size_show(struct seq_file *s, void *v)
484{
485 int i;
486 struct zs_pool *pool = s->private;
487 struct size_class *class;
488 int objs_per_zspage;
489 unsigned long class_almost_full, class_almost_empty;
490 unsigned long obj_allocated, obj_used, pages_used;
491 unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
492 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
493
494 seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
495 "class", "size", "almost_full", "almost_empty",
496 "obj_allocated", "obj_used", "pages_used",
497 "pages_per_zspage");
498
499 for (i = 0; i < zs_size_classes; i++) {
500 class = pool->size_class[i];
501
502 if (class->index != i)
503 continue;
504
505 spin_lock(&class->lock);
506 class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
507 class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
508 obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
509 obj_used = zs_stat_get(class, OBJ_USED);
510 spin_unlock(&class->lock);
511
512 objs_per_zspage = get_maxobj_per_zspage(class->size,
513 class->pages_per_zspage);
514 pages_used = obj_allocated / objs_per_zspage *
515 class->pages_per_zspage;
516
517 seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
518 i, class->size, class_almost_full, class_almost_empty,
519 obj_allocated, obj_used, pages_used,
520 class->pages_per_zspage);
521
522 total_class_almost_full += class_almost_full;
523 total_class_almost_empty += class_almost_empty;
524 total_objs += obj_allocated;
525 total_used_objs += obj_used;
526 total_pages += pages_used;
527 }
528
529 seq_puts(s, "\n");
530 seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
531 "Total", "", total_class_almost_full,
532 total_class_almost_empty, total_objs,
533 total_used_objs, total_pages);
534
535 return 0;
536}
537
538static int zs_stats_size_open(struct inode *inode, struct file *file)
539{
540 return single_open(file, zs_stats_size_show, inode->i_private);
541}
542
543static const struct file_operations zs_stat_size_ops = {
544 .open = zs_stats_size_open,
545 .read = seq_read,
546 .llseek = seq_lseek,
547 .release = single_release,
548};
549
550static int zs_pool_stat_create(char *name, struct zs_pool *pool)
551{
552 struct dentry *entry;
553
554 if (!zs_stat_root)
555 return -ENODEV;
556
557 entry = debugfs_create_dir(name, zs_stat_root);
558 if (!entry) {
559 pr_warn("debugfs dir <%s> creation failed\n", name);
560 return -ENOMEM;
561 }
562 pool->stat_dentry = entry;
563
564 entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
565 pool->stat_dentry, pool, &zs_stat_size_ops);
566 if (!entry) {
567 pr_warn("%s: debugfs file entry <%s> creation failed\n",
568 name, "classes");
569 return -ENOMEM;
570 }
571
572 return 0;
573}
574
575static void zs_pool_stat_destroy(struct zs_pool *pool)
576{
577 debugfs_remove_recursive(pool->stat_dentry);
578}
579
580#else /* CONFIG_ZSMALLOC_STAT */
581
582static inline void zs_stat_inc(struct size_class *class,
583 enum zs_stat_type type, unsigned long cnt)
584{
585}
586
587static inline void zs_stat_dec(struct size_class *class,
588 enum zs_stat_type type, unsigned long cnt)
589{
590}
591
592static inline unsigned long zs_stat_get(struct size_class *class,
593 enum zs_stat_type type)
594{
595 return 0;
596}
597
598static int __init zs_stat_init(void)
599{
600 return 0;
601}
602
603static void __exit zs_stat_exit(void)
604{
605}
606
607static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
608{
609 return 0;
610}
611
612static inline void zs_pool_stat_destroy(struct zs_pool *pool)
613{
400} 614}
401 615
616#endif
617
618
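
With CONFIG_ZSMALLOC_STAT enabled, each named pool gets a per-pool directory under debugfs (typically mounted at /sys/kernel/debug), and reading zsmalloc/<pool name>/classes prints one row per size class with the columns class, size, almost_full, almost_empty, obj_allocated, obj_used, pages_used and pages_per_zspage, followed by a Total row; the pool directory name is whatever the caller passed to zs_create_pool().
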
402/* 619/*
403 * For each size class, zspages are divided into different groups 620 * For each size class, zspages are divided into different groups
404 * depending on how "full" they are. This was done so that we could 621 * depending on how "full" they are. This was done so that we could
@@ -419,7 +636,7 @@ static enum fullness_group get_fullness_group(struct page *page)
419 fg = ZS_EMPTY; 636 fg = ZS_EMPTY;
420 else if (inuse == max_objects) 637 else if (inuse == max_objects)
421 fg = ZS_FULL; 638 fg = ZS_FULL;
422 else if (inuse <= max_objects / fullness_threshold_frac) 639 else if (inuse <= 3 * max_objects / fullness_threshold_frac)
423 fg = ZS_ALMOST_EMPTY; 640 fg = ZS_ALMOST_EMPTY;
424 else 641 else
425 fg = ZS_ALMOST_FULL; 642 fg = ZS_ALMOST_FULL;
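
This widens the ZS_ALMOST_EMPTY band considerably: assuming fullness_threshold_frac is 4 (its value in mainline zsmalloc), a zspage with max_objects = 8 used to count as almost-empty only with inuse <= 2, whereas it now does with inuse <= 6. Since the compaction path below isolates its source pages from the ZS_ALMOST_EMPTY list, more partially used zspages become candidates for migration.
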
@@ -448,6 +665,8 @@ static void insert_zspage(struct page *page, struct size_class *class,
448 list_add_tail(&page->lru, &(*head)->lru); 665 list_add_tail(&page->lru, &(*head)->lru);
449 666
450 *head = page; 667 *head = page;
668 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
669 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
451} 670}
452 671
453/* 672/*
@@ -473,6 +692,8 @@ static void remove_zspage(struct page *page, struct size_class *class,
473 struct page, lru); 692 struct page, lru);
474 693
475 list_del_init(&page->lru); 694 list_del_init(&page->lru);
695 zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
696 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
476} 697}
477 698
478/* 699/*
@@ -484,11 +705,10 @@ static void remove_zspage(struct page *page, struct size_class *class,
484 * page from the freelist of the old fullness group to that of the new 705 * page from the freelist of the old fullness group to that of the new
485 * fullness group. 706 * fullness group.
486 */ 707 */
487static enum fullness_group fix_fullness_group(struct zs_pool *pool, 708static enum fullness_group fix_fullness_group(struct size_class *class,
488 struct page *page) 709 struct page *page)
489{ 710{
490 int class_idx; 711 int class_idx;
491 struct size_class *class;
492 enum fullness_group currfg, newfg; 712 enum fullness_group currfg, newfg;
493 713
494 BUG_ON(!is_first_page(page)); 714 BUG_ON(!is_first_page(page));
@@ -498,7 +718,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
498 if (newfg == currfg) 718 if (newfg == currfg)
499 goto out; 719 goto out;
500 720
501 class = pool->size_class[class_idx];
502 remove_zspage(page, class, currfg); 721 remove_zspage(page, class, currfg);
503 insert_zspage(page, class, newfg); 722 insert_zspage(page, class, newfg);
504 set_zspage_mapping(page, class_idx, newfg); 723 set_zspage_mapping(page, class_idx, newfg);
@@ -512,7 +731,8 @@ out:
512 * to form a zspage for each size class. This is important 731 * to form a zspage for each size class. This is important
513 * to reduce wastage due to unusable space left at end of 732 * to reduce wastage due to unusable space left at end of
514 * each zspage which is given as: 733 * each zspage which is given as:
515 * wastage = Zp - Zp % size_class 734 * wastage = Zp % class_size
735 * usage = Zp - wastage
516 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... 736 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
517 * 737 *
518 * For example, for size class of 3/8 * PAGE_SIZE, we should 738 * For example, for size class of 3/8 * PAGE_SIZE, we should
@@ -571,35 +791,50 @@ static struct page *get_next_page(struct page *page)
571 791
572/* 792/*
573 * Encode <page, obj_idx> as a single handle value. 793 * Encode <page, obj_idx> as a single handle value.
574 * On hardware platforms with physical memory starting at 0x0 the pfn 794 * We use the least bit of handle for tagging.
575 * could be 0 so we ensure that the handle will never be 0 by adjusting the
576 * encoded obj_idx value before encoding.
577 */ 795 */
578static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) 796static void *location_to_obj(struct page *page, unsigned long obj_idx)
579{ 797{
580 unsigned long handle; 798 unsigned long obj;
581 799
582 if (!page) { 800 if (!page) {
583 BUG_ON(obj_idx); 801 BUG_ON(obj_idx);
584 return NULL; 802 return NULL;
585 } 803 }
586 804
587 handle = page_to_pfn(page) << OBJ_INDEX_BITS; 805 obj = page_to_pfn(page) << OBJ_INDEX_BITS;
588 handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); 806 obj |= ((obj_idx) & OBJ_INDEX_MASK);
807 obj <<= OBJ_TAG_BITS;
589 808
590 return (void *)handle; 809 return (void *)obj;
591} 810}
592 811
593/* 812/*
594 * Decode <page, obj_idx> pair from the given object handle. We adjust the 813 * Decode <page, obj_idx> pair from the given object handle. We adjust the
595 * decoded obj_idx back to its original value since it was adjusted in 814 * decoded obj_idx back to its original value since it was adjusted in
596 * obj_location_to_handle(). 815 * location_to_obj().
597 */ 816 */
598static void obj_handle_to_location(unsigned long handle, struct page **page, 817static void obj_to_location(unsigned long obj, struct page **page,
599 unsigned long *obj_idx) 818 unsigned long *obj_idx)
600{ 819{
601 *page = pfn_to_page(handle >> OBJ_INDEX_BITS); 820 obj >>= OBJ_TAG_BITS;
602 *obj_idx = (handle & OBJ_INDEX_MASK) - 1; 821 *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
822 *obj_idx = (obj & OBJ_INDEX_MASK);
823}
824
825static unsigned long handle_to_obj(unsigned long handle)
826{
827 return *(unsigned long *)handle;
828}
829
830static unsigned long obj_to_head(struct size_class *class, struct page *page,
831 void *obj)
832{
833 if (class->huge) {
834 VM_BUG_ON(!is_first_page(page));
835 return *(unsigned long *)page_private(page);
836 } else
837 return *(unsigned long *)obj;
603} 838}
604 839
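
A standalone round-trip of the encode/decode pair above; the tag shift is the only structural change from the old scheme, freeing bit 0 of the obj word so it can later serve as HANDLE_PIN_BIT. The bit widths assume a 64-bit build with MAX_PHYSMEM_BITS = 46 and PAGE_SHIFT = 12 (configuration dependent):

    #include <assert.h>
    #include <stdio.h>

    #define OBJ_TAG_BITS   1
    #define OBJ_INDEX_BITS (64 - (46 - 12) - OBJ_TAG_BITS)      /* 29 */
    #define OBJ_INDEX_MASK ((1UL << OBJ_INDEX_BITS) - 1)

    static unsigned long location_to_obj(unsigned long pfn, unsigned long obj_idx)
    {
            unsigned long obj = (pfn << OBJ_INDEX_BITS) | (obj_idx & OBJ_INDEX_MASK);

            return obj << OBJ_TAG_BITS;         /* bit 0 left clear for the tag */
    }

    static void obj_to_location(unsigned long obj, unsigned long *pfn,
                                unsigned long *obj_idx)
    {
            obj >>= OBJ_TAG_BITS;
            *pfn = obj >> OBJ_INDEX_BITS;
            *obj_idx = obj & OBJ_INDEX_MASK;
    }

    int main(void)
    {
            unsigned long pfn, idx;

            obj_to_location(location_to_obj(0x12345, 7), &pfn, &idx);
            assert(pfn == 0x12345 && idx == 7);
            printf("pfn=%#lx idx=%lu\n", pfn, idx);
            return 0;
    }
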
605static unsigned long obj_idx_to_offset(struct page *page, 840static unsigned long obj_idx_to_offset(struct page *page,
@@ -613,6 +848,25 @@ static unsigned long obj_idx_to_offset(struct page *page,
613 return off + obj_idx * class_size; 848 return off + obj_idx * class_size;
614} 849}
615 850
851static inline int trypin_tag(unsigned long handle)
852{
853 unsigned long *ptr = (unsigned long *)handle;
854
855 return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
856}
857
858static void pin_tag(unsigned long handle)
859{
860 while (!trypin_tag(handle));
861}
862
863static void unpin_tag(unsigned long handle)
864{
865 unsigned long *ptr = (unsigned long *)handle;
866
867 clear_bit_unlock(HANDLE_PIN_BIT, ptr);
868}
869
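
These three helpers form a tiny bit lock on the value stored in the handle slot: zs_map_object() and zs_free() pin the handle for the duration of their access, while compaction only uses trypin_tag() (see find_alloced_obj() further down) and so simply skips objects that are currently mapped or being freed; migration updates the slot with record_obj() before unpinning.
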
616static void reset_page(struct page *page) 870static void reset_page(struct page *page)
617{ 871{
618 clear_bit(PG_private, &page->flags); 872 clear_bit(PG_private, &page->flags);
@@ -674,7 +928,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
674 link = (struct link_free *)vaddr + off / sizeof(*link); 928 link = (struct link_free *)vaddr + off / sizeof(*link);
675 929
676 while ((off += class->size) < PAGE_SIZE) { 930 while ((off += class->size) < PAGE_SIZE) {
677 link->next = obj_location_to_handle(page, i++); 931 link->next = location_to_obj(page, i++);
678 link += class->size / sizeof(*link); 932 link += class->size / sizeof(*link);
679 } 933 }
680 934
@@ -684,7 +938,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
684 * page (if present) 938 * page (if present)
685 */ 939 */
686 next_page = get_next_page(page); 940 next_page = get_next_page(page);
687 link->next = obj_location_to_handle(next_page, 0); 941 link->next = location_to_obj(next_page, 0);
688 kunmap_atomic(vaddr); 942 kunmap_atomic(vaddr);
689 page = next_page; 943 page = next_page;
690 off %= PAGE_SIZE; 944 off %= PAGE_SIZE;
@@ -738,7 +992,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
738 992
739 init_zspage(first_page, class); 993 init_zspage(first_page, class);
740 994
741 first_page->freelist = obj_location_to_handle(first_page, 0); 995 first_page->freelist = location_to_obj(first_page, 0);
742 /* Maximum number of objects we can store in this zspage */ 996 /* Maximum number of objects we can store in this zspage */
743 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; 997 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
744 998
@@ -860,12 +1114,19 @@ static void __zs_unmap_object(struct mapping_area *area,
860{ 1114{
861 int sizes[2]; 1115 int sizes[2];
862 void *addr; 1116 void *addr;
863 char *buf = area->vm_buf; 1117 char *buf;
864 1118
865 /* no write fastpath */ 1119 /* no write fastpath */
866 if (area->vm_mm == ZS_MM_RO) 1120 if (area->vm_mm == ZS_MM_RO)
867 goto out; 1121 goto out;
868 1122
1123 buf = area->vm_buf;
1124 if (!area->huge) {
1125 buf = buf + ZS_HANDLE_SIZE;
1126 size -= ZS_HANDLE_SIZE;
1127 off += ZS_HANDLE_SIZE;
1128 }
1129
869 sizes[0] = PAGE_SIZE - off; 1130 sizes[0] = PAGE_SIZE - off;
870 sizes[1] = size - sizes[0]; 1131 sizes[1] = size - sizes[0];
871 1132
@@ -952,11 +1213,6 @@ static void init_zs_size_classes(void)
952 zs_size_classes = nr; 1213 zs_size_classes = nr;
953} 1214}
954 1215
955static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
956{
957 return pages_per_zspage * PAGE_SIZE / size;
958}
959
960static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) 1216static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
961{ 1217{
962 if (prev->pages_per_zspage != pages_per_zspage) 1218 if (prev->pages_per_zspage != pages_per_zspage)
@@ -969,166 +1225,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
969 return true; 1225 return true;
970} 1226}
971 1227
972#ifdef CONFIG_ZSMALLOC_STAT 1228static bool zspage_full(struct page *page)
973
974static inline void zs_stat_inc(struct size_class *class,
975 enum zs_stat_type type, unsigned long cnt)
976{
977 class->stats.objs[type] += cnt;
978}
979
980static inline void zs_stat_dec(struct size_class *class,
981 enum zs_stat_type type, unsigned long cnt)
982{
983 class->stats.objs[type] -= cnt;
984}
985
986static inline unsigned long zs_stat_get(struct size_class *class,
987 enum zs_stat_type type)
988{
989 return class->stats.objs[type];
990}
991
992static int __init zs_stat_init(void)
993{
994 if (!debugfs_initialized())
995 return -ENODEV;
996
997 zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
998 if (!zs_stat_root)
999 return -ENOMEM;
1000
1001 return 0;
1002}
1003
1004static void __exit zs_stat_exit(void)
1005{
1006 debugfs_remove_recursive(zs_stat_root);
1007}
1008
1009static int zs_stats_size_show(struct seq_file *s, void *v)
1010{ 1229{
1011 int i; 1230 BUG_ON(!is_first_page(page));
1012 struct zs_pool *pool = s->private;
1013 struct size_class *class;
1014 int objs_per_zspage;
1015 unsigned long obj_allocated, obj_used, pages_used;
1016 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
1017
1018 seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
1019 "obj_allocated", "obj_used", "pages_used");
1020
1021 for (i = 0; i < zs_size_classes; i++) {
1022 class = pool->size_class[i];
1023
1024 if (class->index != i)
1025 continue;
1026
1027 spin_lock(&class->lock);
1028 obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
1029 obj_used = zs_stat_get(class, OBJ_USED);
1030 spin_unlock(&class->lock);
1031
1032 objs_per_zspage = get_maxobj_per_zspage(class->size,
1033 class->pages_per_zspage);
1034 pages_used = obj_allocated / objs_per_zspage *
1035 class->pages_per_zspage;
1036
1037 seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
1038 class->size, obj_allocated, obj_used, pages_used);
1039
1040 total_objs += obj_allocated;
1041 total_used_objs += obj_used;
1042 total_pages += pages_used;
1043 }
1044
1045 seq_puts(s, "\n");
1046 seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
1047 total_objs, total_used_objs, total_pages);
1048
1049 return 0;
1050}
1051
1052static int zs_stats_size_open(struct inode *inode, struct file *file)
1053{
1054 return single_open(file, zs_stats_size_show, inode->i_private);
1055}
1056
1057static const struct file_operations zs_stat_size_ops = {
1058 .open = zs_stats_size_open,
1059 .read = seq_read,
1060 .llseek = seq_lseek,
1061 .release = single_release,
1062};
1063
1064static int zs_pool_stat_create(char *name, struct zs_pool *pool)
1065{
1066 struct dentry *entry;
1067
1068 if (!zs_stat_root)
1069 return -ENODEV;
1070
1071 entry = debugfs_create_dir(name, zs_stat_root);
1072 if (!entry) {
1073 pr_warn("debugfs dir <%s> creation failed\n", name);
1074 return -ENOMEM;
1075 }
1076 pool->stat_dentry = entry;
1077
1078 entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
1079 pool->stat_dentry, pool, &zs_stat_size_ops);
1080 if (!entry) {
1081 pr_warn("%s: debugfs file entry <%s> creation failed\n",
1082 name, "obj_in_classes");
1083 return -ENOMEM;
1084 }
1085
1086 return 0;
1087}
1088
1089static void zs_pool_stat_destroy(struct zs_pool *pool)
1090{
1091 debugfs_remove_recursive(pool->stat_dentry);
1092}
1093
1094#else /* CONFIG_ZSMALLOC_STAT */
1095
1096static inline void zs_stat_inc(struct size_class *class,
1097 enum zs_stat_type type, unsigned long cnt)
1098{
1099}
1100
1101static inline void zs_stat_dec(struct size_class *class,
1102 enum zs_stat_type type, unsigned long cnt)
1103{
1104}
1105
1106static inline unsigned long zs_stat_get(struct size_class *class,
1107 enum zs_stat_type type)
1108{
1109 return 0;
1110}
1111
1112static int __init zs_stat_init(void)
1113{
1114 return 0;
1115}
1116
1117static void __exit zs_stat_exit(void)
1118{
1119}
1120
1121static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
1122{
1123 return 0;
1124}
1125 1231
1126static inline void zs_pool_stat_destroy(struct zs_pool *pool) 1232 return page->inuse == page->objects;
1127{
1128} 1233}
1129 1234
1130#endif
1131
1132unsigned long zs_get_total_pages(struct zs_pool *pool) 1235unsigned long zs_get_total_pages(struct zs_pool *pool)
1133{ 1236{
1134 return atomic_long_read(&pool->pages_allocated); 1237 return atomic_long_read(&pool->pages_allocated);
@@ -1153,13 +1256,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1153 enum zs_mapmode mm) 1256 enum zs_mapmode mm)
1154{ 1257{
1155 struct page *page; 1258 struct page *page;
1156 unsigned long obj_idx, off; 1259 unsigned long obj, obj_idx, off;
1157 1260
1158 unsigned int class_idx; 1261 unsigned int class_idx;
1159 enum fullness_group fg; 1262 enum fullness_group fg;
1160 struct size_class *class; 1263 struct size_class *class;
1161 struct mapping_area *area; 1264 struct mapping_area *area;
1162 struct page *pages[2]; 1265 struct page *pages[2];
1266 void *ret;
1163 1267
1164 BUG_ON(!handle); 1268 BUG_ON(!handle);
1165 1269
@@ -1170,7 +1274,11 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1170 */ 1274 */
1171 BUG_ON(in_interrupt()); 1275 BUG_ON(in_interrupt());
1172 1276
1173 obj_handle_to_location(handle, &page, &obj_idx); 1277 /* From now on, migration cannot move the object */
1278 pin_tag(handle);
1279
1280 obj = handle_to_obj(handle);
1281 obj_to_location(obj, &page, &obj_idx);
1174 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1282 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1175 class = pool->size_class[class_idx]; 1283 class = pool->size_class[class_idx];
1176 off = obj_idx_to_offset(page, obj_idx, class->size); 1284 off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1180,7 +1288,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1180 if (off + class->size <= PAGE_SIZE) { 1288 if (off + class->size <= PAGE_SIZE) {
1181 /* this object is contained entirely within a page */ 1289 /* this object is contained entirely within a page */
1182 area->vm_addr = kmap_atomic(page); 1290 area->vm_addr = kmap_atomic(page);
1183 return area->vm_addr + off; 1291 ret = area->vm_addr + off;
1292 goto out;
1184 } 1293 }
1185 1294
1186 /* this object spans two pages */ 1295 /* this object spans two pages */
@@ -1188,14 +1297,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1188 pages[1] = get_next_page(page); 1297 pages[1] = get_next_page(page);
1189 BUG_ON(!pages[1]); 1298 BUG_ON(!pages[1]);
1190 1299
1191 return __zs_map_object(area, pages, off, class->size); 1300 ret = __zs_map_object(area, pages, off, class->size);
1301out:
1302 if (!class->huge)
1303 ret += ZS_HANDLE_SIZE;
1304
1305 return ret;
1192} 1306}
1193EXPORT_SYMBOL_GPL(zs_map_object); 1307EXPORT_SYMBOL_GPL(zs_map_object);
1194 1308
1195void zs_unmap_object(struct zs_pool *pool, unsigned long handle) 1309void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1196{ 1310{
1197 struct page *page; 1311 struct page *page;
1198 unsigned long obj_idx, off; 1312 unsigned long obj, obj_idx, off;
1199 1313
1200 unsigned int class_idx; 1314 unsigned int class_idx;
1201 enum fullness_group fg; 1315 enum fullness_group fg;
@@ -1204,7 +1318,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1204 1318
1205 BUG_ON(!handle); 1319 BUG_ON(!handle);
1206 1320
1207 obj_handle_to_location(handle, &page, &obj_idx); 1321 obj = handle_to_obj(handle);
1322 obj_to_location(obj, &page, &obj_idx);
1208 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1323 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1209 class = pool->size_class[class_idx]; 1324 class = pool->size_class[class_idx];
1210 off = obj_idx_to_offset(page, obj_idx, class->size); 1325 off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1222,9 +1337,42 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1222 __zs_unmap_object(area, pages, off, class->size); 1337 __zs_unmap_object(area, pages, off, class->size);
1223 } 1338 }
1224 put_cpu_var(zs_map_area); 1339 put_cpu_var(zs_map_area);
1340 unpin_tag(handle);
1225} 1341}
1226EXPORT_SYMBOL_GPL(zs_unmap_object); 1342EXPORT_SYMBOL_GPL(zs_unmap_object);
1227 1343
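
Taken together, zs_malloc()/zs_map_object()/zs_unmap_object()/zs_free() now work purely in terms of the opaque handle, and the pointer returned by zs_map_object() already points past the embedded handle for non-huge classes, so callers only ever see their payload. A rough kernel-side sketch of how a zram-style user drives this API (the function name store_buf and the trimmed error handling are illustrative only, not code from this patch):

    #include <linux/errno.h>
    #include <linux/string.h>
    #include <linux/zsmalloc.h>

    static int store_buf(struct zs_pool *pool, const void *src, size_t len)
    {
            unsigned long handle;
            void *dst;

            handle = zs_malloc(pool, len);
            if (!handle)
                    return -ENOMEM;

            /* Pins the object: compaction cannot move it while it is mapped. */
            dst = zs_map_object(pool, handle, ZS_MM_WO);
            memcpy(dst, src, len);
            zs_unmap_object(pool, handle);

            /* ... keep @handle around; later: zs_free(pool, handle); ... */
            return 0;
    }
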
1344static unsigned long obj_malloc(struct page *first_page,
1345 struct size_class *class, unsigned long handle)
1346{
1347 unsigned long obj;
1348 struct link_free *link;
1349
1350 struct page *m_page;
1351 unsigned long m_objidx, m_offset;
1352 void *vaddr;
1353
1354 handle |= OBJ_ALLOCATED_TAG;
1355 obj = (unsigned long)first_page->freelist;
1356 obj_to_location(obj, &m_page, &m_objidx);
1357 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
1358
1359 vaddr = kmap_atomic(m_page);
1360 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1361 first_page->freelist = link->next;
1362 if (!class->huge)
1363 /* record handle in the header of allocated chunk */
1364 link->handle = handle;
1365 else
1366 /* record handle in first_page->private */
1367 set_page_private(first_page, handle);
1368 kunmap_atomic(vaddr);
1369 first_page->inuse++;
1370 zs_stat_inc(class, OBJ_USED, 1);
1371
1372 return obj;
1373}
1374
1375
1228/** 1376/**
1229 * zs_malloc - Allocate block of given size from pool. 1377 * zs_malloc - Allocate block of given size from pool.
1230 * @pool: pool to allocate from 1378 * @pool: pool to allocate from
@@ -1236,17 +1384,19 @@ EXPORT_SYMBOL_GPL(zs_unmap_object);
1236 */ 1384 */
1237unsigned long zs_malloc(struct zs_pool *pool, size_t size) 1385unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1238{ 1386{
1239 unsigned long obj; 1387 unsigned long handle, obj;
1240 struct link_free *link;
1241 struct size_class *class; 1388 struct size_class *class;
1242 void *vaddr; 1389 struct page *first_page;
1243
1244 struct page *first_page, *m_page;
1245 unsigned long m_objidx, m_offset;
1246 1390
1247 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) 1391 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
1248 return 0; 1392 return 0;
1249 1393
1394 handle = alloc_handle(pool);
1395 if (!handle)
1396 return 0;
1397
1398 /* extra space in chunk to keep the handle */
1399 size += ZS_HANDLE_SIZE;
1250 class = pool->size_class[get_size_class_index(size)]; 1400 class = pool->size_class[get_size_class_index(size)];
1251 1401
1252 spin_lock(&class->lock); 1402 spin_lock(&class->lock);
@@ -1255,8 +1405,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1255 if (!first_page) { 1405 if (!first_page) {
1256 spin_unlock(&class->lock); 1406 spin_unlock(&class->lock);
1257 first_page = alloc_zspage(class, pool->flags); 1407 first_page = alloc_zspage(class, pool->flags);
1258 if (unlikely(!first_page)) 1408 if (unlikely(!first_page)) {
1409 free_handle(pool, handle);
1259 return 0; 1410 return 0;
1411 }
1260 1412
1261 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1413 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
1262 atomic_long_add(class->pages_per_zspage, 1414 atomic_long_add(class->pages_per_zspage,
@@ -1267,73 +1419,360 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1267 class->size, class->pages_per_zspage)); 1419 class->size, class->pages_per_zspage));
1268 } 1420 }
1269 1421
1270 obj = (unsigned long)first_page->freelist; 1422 obj = obj_malloc(first_page, class, handle);
1271 obj_handle_to_location(obj, &m_page, &m_objidx);
1272 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
1273
1274 vaddr = kmap_atomic(m_page);
1275 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1276 first_page->freelist = link->next;
1277 memset(link, POISON_INUSE, sizeof(*link));
1278 kunmap_atomic(vaddr);
1279
1280 first_page->inuse++;
1281 zs_stat_inc(class, OBJ_USED, 1);
1282 /* Now move the zspage to another fullness group, if required */ 1423 /* Now move the zspage to another fullness group, if required */
1283 fix_fullness_group(pool, first_page); 1424 fix_fullness_group(class, first_page);
1425 record_obj(handle, obj);
1284 spin_unlock(&class->lock); 1426 spin_unlock(&class->lock);
1285 1427
1286 return obj; 1428 return handle;
1287} 1429}
1288EXPORT_SYMBOL_GPL(zs_malloc); 1430EXPORT_SYMBOL_GPL(zs_malloc);
1289 1431
1290void zs_free(struct zs_pool *pool, unsigned long obj) 1432static void obj_free(struct zs_pool *pool, struct size_class *class,
1433 unsigned long obj)
1291{ 1434{
1292 struct link_free *link; 1435 struct link_free *link;
1293 struct page *first_page, *f_page; 1436 struct page *first_page, *f_page;
1294 unsigned long f_objidx, f_offset; 1437 unsigned long f_objidx, f_offset;
1295 void *vaddr; 1438 void *vaddr;
1296
1297 int class_idx; 1439 int class_idx;
1298 struct size_class *class;
1299 enum fullness_group fullness; 1440 enum fullness_group fullness;
1300 1441
1301 if (unlikely(!obj)) 1442 BUG_ON(!obj);
1302 return;
1303 1443
1304 obj_handle_to_location(obj, &f_page, &f_objidx); 1444 obj &= ~OBJ_ALLOCATED_TAG;
1445 obj_to_location(obj, &f_page, &f_objidx);
1305 first_page = get_first_page(f_page); 1446 first_page = get_first_page(f_page);
1306 1447
1307 get_zspage_mapping(first_page, &class_idx, &fullness); 1448 get_zspage_mapping(first_page, &class_idx, &fullness);
1308 class = pool->size_class[class_idx];
1309 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); 1449 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
1310 1450
1311 spin_lock(&class->lock); 1451 vaddr = kmap_atomic(f_page);
1312 1452
1313 /* Insert this object in containing zspage's freelist */ 1453 /* Insert this object in containing zspage's freelist */
1314 vaddr = kmap_atomic(f_page);
1315 link = (struct link_free *)(vaddr + f_offset); 1454 link = (struct link_free *)(vaddr + f_offset);
1316 link->next = first_page->freelist; 1455 link->next = first_page->freelist;
1456 if (class->huge)
1457 set_page_private(first_page, 0);
1317 kunmap_atomic(vaddr); 1458 kunmap_atomic(vaddr);
1318 first_page->freelist = (void *)obj; 1459 first_page->freelist = (void *)obj;
1319
1320 first_page->inuse--; 1460 first_page->inuse--;
1321 fullness = fix_fullness_group(pool, first_page);
1322
1323 zs_stat_dec(class, OBJ_USED, 1); 1461 zs_stat_dec(class, OBJ_USED, 1);
1324 if (fullness == ZS_EMPTY) 1462}
1463
1464void zs_free(struct zs_pool *pool, unsigned long handle)
1465{
1466 struct page *first_page, *f_page;
1467 unsigned long obj, f_objidx;
1468 int class_idx;
1469 struct size_class *class;
1470 enum fullness_group fullness;
1471
1472 if (unlikely(!handle))
1473 return;
1474
1475 pin_tag(handle);
1476 obj = handle_to_obj(handle);
1477 obj_to_location(obj, &f_page, &f_objidx);
1478 first_page = get_first_page(f_page);
1479
1480 get_zspage_mapping(first_page, &class_idx, &fullness);
1481 class = pool->size_class[class_idx];
1482
1483 spin_lock(&class->lock);
1484 obj_free(pool, class, obj);
1485 fullness = fix_fullness_group(class, first_page);
1486 if (fullness == ZS_EMPTY) {
1325 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1487 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1326 class->size, class->pages_per_zspage)); 1488 class->size, class->pages_per_zspage));
1327 1489 atomic_long_sub(class->pages_per_zspage,
1490 &pool->pages_allocated);
1491 free_zspage(first_page);
1492 }
1328 spin_unlock(&class->lock); 1493 spin_unlock(&class->lock);
1494 unpin_tag(handle);
1495
1496 free_handle(pool, handle);
1497}
1498EXPORT_SYMBOL_GPL(zs_free);
1499
1500static void zs_object_copy(unsigned long src, unsigned long dst,
1501 struct size_class *class)
1502{
1503 struct page *s_page, *d_page;
1504 unsigned long s_objidx, d_objidx;
1505 unsigned long s_off, d_off;
1506 void *s_addr, *d_addr;
1507 int s_size, d_size, size;
1508 int written = 0;
1509
1510 s_size = d_size = class->size;
1511
1512 obj_to_location(src, &s_page, &s_objidx);
1513 obj_to_location(dst, &d_page, &d_objidx);
1514
1515 s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
1516 d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
1517
1518 if (s_off + class->size > PAGE_SIZE)
1519 s_size = PAGE_SIZE - s_off;
1520
1521 if (d_off + class->size > PAGE_SIZE)
1522 d_size = PAGE_SIZE - d_off;
1523
1524 s_addr = kmap_atomic(s_page);
1525 d_addr = kmap_atomic(d_page);
1526
1527 while (1) {
1528 size = min(s_size, d_size);
1529 memcpy(d_addr + d_off, s_addr + s_off, size);
1530 written += size;
1531
1532 if (written == class->size)
1533 break;
1534
1535 s_off += size;
1536 s_size -= size;
1537 d_off += size;
1538 d_size -= size;
1539
1540 if (s_off >= PAGE_SIZE) {
1541 kunmap_atomic(d_addr);
1542 kunmap_atomic(s_addr);
1543 s_page = get_next_page(s_page);
1544 BUG_ON(!s_page);
1545 s_addr = kmap_atomic(s_page);
1546 d_addr = kmap_atomic(d_page);
1547 s_size = class->size - written;
1548 s_off = 0;
1549 }
1550
1551 if (d_off >= PAGE_SIZE) {
1552 kunmap_atomic(d_addr);
1553 d_page = get_next_page(d_page);
1554 BUG_ON(!d_page);
1555 d_addr = kmap_atomic(d_page);
1556 d_size = class->size - written;
1557 d_off = 0;
1558 }
1559 }
1560
1561 kunmap_atomic(d_addr);
1562 kunmap_atomic(s_addr);
1563}
1564
1565/*
1566 * Find an allocated object in the zspage, starting the search at the
1567 * given object index, and return its handle.
1568 */
1569static unsigned long find_alloced_obj(struct page *page, int index,
1570 struct size_class *class)
1571{
1572 unsigned long head;
1573 int offset = 0;
1574 unsigned long handle = 0;
1575 void *addr = kmap_atomic(page);
1576
1577 if (!is_first_page(page))
1578 offset = page->index;
1579 offset += class->size * index;
1580
1581 while (offset < PAGE_SIZE) {
1582 head = obj_to_head(class, page, addr + offset);
1583 if (head & OBJ_ALLOCATED_TAG) {
1584 handle = head & ~OBJ_ALLOCATED_TAG;
1585 if (trypin_tag(handle))
1586 break;
1587 handle = 0;
1588 }
1589
1590 offset += class->size;
1591 index++;
1592 }
1593
1594 kunmap_atomic(addr);
1595 return handle;
1596}
1597
1598struct zs_compact_control {
1599 /* Source page for migration which could be a subpage of zspage. */
1600 struct page *s_page;
1601 /* Destination page for migration, which must be the first page
1602 * of a zspage. */
1603 struct page *d_page;
1604 /* Starting object index within @s_page from which to search for
1605 * live objects in the subpage. */
1606 int index;
1607 /* how many of objects are migrated */
1608 int nr_migrated;
1609};
1610
1611static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
1612 struct zs_compact_control *cc)
1613{
1614 unsigned long used_obj, free_obj;
1615 unsigned long handle;
1616 struct page *s_page = cc->s_page;
1617 struct page *d_page = cc->d_page;
1618 unsigned long index = cc->index;
1619 int nr_migrated = 0;
1620 int ret = 0;
1621
1622 while (1) {
1623 handle = find_alloced_obj(s_page, index, class);
1624 if (!handle) {
1625 s_page = get_next_page(s_page);
1626 if (!s_page)
1627 break;
1628 index = 0;
1629 continue;
1630 }
1631
1632 /* Stop if there is no more space */
1633 if (zspage_full(d_page)) {
1634 unpin_tag(handle);
1635 ret = -ENOMEM;
1636 break;
1637 }
1638
1639 used_obj = handle_to_obj(handle);
1640 free_obj = obj_malloc(d_page, class, handle);
1641 zs_object_copy(used_obj, free_obj, class);
1642 index++;
1643 record_obj(handle, free_obj);
1644 unpin_tag(handle);
1645 obj_free(pool, class, used_obj);
1646 nr_migrated++;
1647 }
1648
1649 /* Remember last position in this iteration */
1650 cc->s_page = s_page;
1651 cc->index = index;
1652 cc->nr_migrated = nr_migrated;
1653
1654 return ret;
1655}
1656
1657static struct page *alloc_target_page(struct size_class *class)
1658{
1659 int i;
1660 struct page *page;
1661
1662 for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
1663 page = class->fullness_list[i];
1664 if (page) {
1665 remove_zspage(page, class, i);
1666 break;
1667 }
1668 }
1669
1670 return page;
1671}
1672
1673static void putback_zspage(struct zs_pool *pool, struct size_class *class,
1674 struct page *first_page)
1675{
1676 enum fullness_group fullness;
1677
1678 BUG_ON(!is_first_page(first_page));
1679
1680 fullness = get_fullness_group(first_page);
1681 insert_zspage(first_page, class, fullness);
1682 set_zspage_mapping(first_page, class->index, fullness);
1329 1683
1330 if (fullness == ZS_EMPTY) { 1684 if (fullness == ZS_EMPTY) {
1685 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1686 class->size, class->pages_per_zspage));
1331 atomic_long_sub(class->pages_per_zspage, 1687 atomic_long_sub(class->pages_per_zspage,
1332 &pool->pages_allocated); 1688 &pool->pages_allocated);
1689
1333 free_zspage(first_page); 1690 free_zspage(first_page);
1334 } 1691 }
1335} 1692}
1336EXPORT_SYMBOL_GPL(zs_free); 1693
1694static struct page *isolate_source_page(struct size_class *class)
1695{
1696 struct page *page;
1697
1698 page = class->fullness_list[ZS_ALMOST_EMPTY];
1699 if (page)
1700 remove_zspage(page, class, ZS_ALMOST_EMPTY);
1701
1702 return page;
1703}
1704
1705static unsigned long __zs_compact(struct zs_pool *pool,
1706 struct size_class *class)
1707{
1708 int nr_to_migrate;
1709 struct zs_compact_control cc;
1710 struct page *src_page;
1711 struct page *dst_page = NULL;
1712 unsigned long nr_total_migrated = 0;
1713
1714 spin_lock(&class->lock);
1715 while ((src_page = isolate_source_page(class))) {
1716
1717 BUG_ON(!is_first_page(src_page));
1718
1719 /* The goal is to migrate all live objects in source page */
1720 nr_to_migrate = src_page->inuse;
1721 cc.index = 0;
1722 cc.s_page = src_page;
1723
1724 while ((dst_page = alloc_target_page(class))) {
1725 cc.d_page = dst_page;
1726 /*
1727 * If there is no more space in dst_page, try to
1728 * allocate another zspage.
1729 */
1730 if (!migrate_zspage(pool, class, &cc))
1731 break;
1732
1733 putback_zspage(pool, class, dst_page);
1734 nr_total_migrated += cc.nr_migrated;
1735 nr_to_migrate -= cc.nr_migrated;
1736 }
1737
1738 /* Stop if we couldn't find slot */
1739 if (dst_page == NULL)
1740 break;
1741
1742 putback_zspage(pool, class, dst_page);
1743 putback_zspage(pool, class, src_page);
1744 spin_unlock(&class->lock);
1745 nr_total_migrated += cc.nr_migrated;
1746 cond_resched();
1747 spin_lock(&class->lock);
1748 }
1749
1750 if (src_page)
1751 putback_zspage(pool, class, src_page);
1752
1753 spin_unlock(&class->lock);
1754
1755 return nr_total_migrated;
1756}
1757
1758unsigned long zs_compact(struct zs_pool *pool)
1759{
1760 int i;
1761 unsigned long nr_migrated = 0;
1762 struct size_class *class;
1763
1764 for (i = zs_size_classes - 1; i >= 0; i--) {
1765 class = pool->size_class[i];
1766 if (!class)
1767 continue;
1768 if (class->index != i)
1769 continue;
1770 nr_migrated += __zs_compact(pool, class);
1771 }
1772
1773 return nr_migrated;
1774}
1775EXPORT_SYMBOL_GPL(zs_compact);
1337 1776
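
zs_compact() is the new entry point that ties all of this together: it walks the size classes from largest to smallest, repeatedly isolating an almost-empty source zspage and migrating its live objects into pages drawn from the other fullness lists, freeing any source zspage that ends up empty. The return value is the number of objects migrated, so a caller simply invokes zs_compact(pool) and can report the returned count.
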
1338/** 1777/**
1339 * zs_create_pool - Creates an allocation pool to work from. 1778 * zs_create_pool - Creates an allocation pool to work from.
@@ -1355,20 +1794,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1355 if (!pool) 1794 if (!pool)
1356 return NULL; 1795 return NULL;
1357 1796
1358 pool->name = kstrdup(name, GFP_KERNEL);
1359 if (!pool->name) {
1360 kfree(pool);
1361 return NULL;
1362 }
1363
1364 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), 1797 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
1365 GFP_KERNEL); 1798 GFP_KERNEL);
1366 if (!pool->size_class) { 1799 if (!pool->size_class) {
1367 kfree(pool->name);
1368 kfree(pool); 1800 kfree(pool);
1369 return NULL; 1801 return NULL;
1370 } 1802 }
1371 1803
1804 pool->name = kstrdup(name, GFP_KERNEL);
1805 if (!pool->name)
1806 goto err;
1807
1808 if (create_handle_cache(pool))
1809 goto err;
1810
1372 /* 1811 /*
1373 * Iterate reversly, because, size of size_class that we want to use 1812 * Iterate reversly, because, size of size_class that we want to use
1374 * for merging should be larger or equal to current size. 1813 * for merging should be larger or equal to current size.
@@ -1406,6 +1845,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1406 class->size = size; 1845 class->size = size;
1407 class->index = i; 1846 class->index = i;
1408 class->pages_per_zspage = pages_per_zspage; 1847 class->pages_per_zspage = pages_per_zspage;
1848 if (pages_per_zspage == 1 &&
1849 get_maxobj_per_zspage(size, pages_per_zspage) == 1)
1850 class->huge = true;
1409 spin_lock_init(&class->lock); 1851 spin_lock_init(&class->lock);
1410 pool->size_class[i] = class; 1852 pool->size_class[i] = class;
1411 1853
@@ -1450,6 +1892,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1450 kfree(class); 1892 kfree(class);
1451 } 1893 }
1452 1894
1895 destroy_handle_cache(pool);
1453 kfree(pool->size_class); 1896 kfree(pool->size_class);
1454 kfree(pool->name); 1897 kfree(pool->name);
1455 kfree(pool); 1898 kfree(pool);