aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@linux.intel.com>2013-01-29 17:59:09 -0500
committerH. Peter Anvin <hpa@linux.intel.com>2013-01-29 18:10:15 -0500
commitde65d816aa44f9ddd79861ae21d75010cc1fd003 (patch)
tree04a637a43b2e52a733d0dcb7595a47057571e7da /mm
parent9710f581bb4c35589ac046b0cfc0deb7f369fc85 (diff)
parent5dcd14ecd41ea2b3ae3295a9b30d98769d52165f (diff)
Merge remote-tracking branch 'origin/x86/boot' into x86/mm2
Coming patches to x86/mm2 require the changes and advanced baseline in x86/boot. Resolved Conflicts: arch/x86/kernel/setup.c mm/nobootmem.c Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig34
-rw-r--r--mm/Makefile3
-rw-r--r--mm/balloon_compaction.c302
-rw-r--r--mm/bootmem.c103
-rw-r--r--mm/compaction.c166
-rw-r--r--mm/dmapool.c55
-rw-r--r--mm/highmem.c30
-rw-r--r--mm/huge_memory.c658
-rw-r--r--mm/hugetlb.c63
-rw-r--r--mm/hugetlb_cgroup.c42
-rw-r--r--mm/internal.h13
-rw-r--r--mm/kmemleak.c3
-rw-r--r--mm/ksm.c37
-rw-r--r--mm/memblock.c3
-rw-r--r--mm/memcontrol.c1483
-rw-r--r--mm/memory-failure.c43
-rw-r--r--mm/memory.c251
-rw-r--r--mm/memory_hotplug.c430
-rw-r--r--mm/mempolicy.c470
-rw-r--r--mm/migrate.c450
-rw-r--r--mm/mmap.c569
-rw-r--r--mm/mprotect.c151
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/nobootmem.c21
-rw-r--r--mm/nommu.c15
-rw-r--r--mm/oom_kill.c138
-rw-r--r--mm/page-writeback.c36
-rw-r--r--mm/page_alloc.c421
-rw-r--r--mm/page_cgroup.c5
-rw-r--r--mm/page_isolation.c53
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/percpu.c5
-rw-r--r--mm/pgtable-generic.c9
-rw-r--r--mm/rmap.c134
-rw-r--r--mm/shmem.c122
-rw-r--r--mm/slab.c383
-rw-r--r--mm/slab.h190
-rw-r--r--mm/slab_common.c292
-rw-r--r--mm/slob.c48
-rw-r--r--mm/slub.c451
-rw-r--r--mm/sparse.c35
-rw-r--r--mm/swapfile.c31
-rw-r--r--mm/truncate.c23
-rw-r--r--mm/util.c2
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c242
-rw-r--r--mm/vmstat.c28
47 files changed, 5954 insertions, 2099 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a3f8dddaaab3..278e3ab1f169 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,25 @@ config NO_BOOTMEM
143config MEMORY_ISOLATION 143config MEMORY_ISOLATION
144 boolean 144 boolean
145 145
146config MOVABLE_NODE
147 boolean "Enable to assign a node which has only movable memory"
148 depends on HAVE_MEMBLOCK
149 depends on NO_BOOTMEM
150 depends on X86_64
151 depends on NUMA
152 default n
153 help
154 Allow a node to have only movable memory. Pages used by the kernel,
155 such as direct mapping pages cannot be migrated. So the corresponding
156 memory device cannot be hotplugged. This option allows users to
157 online all the memory of a node as movable memory so that the whole
158 node can be hotplugged. Users who don't use the memory hotplug
159 feature are fine with this option on since they don't online memory
160 as movable.
161
162 Say Y here if you want to hotplug a whole node.
163 Say N here if you want kernel to use memory on all nodes evenly.
164
146# eventually, we can have this option just 'select SPARSEMEM' 165# eventually, we can have this option just 'select SPARSEMEM'
147config MEMORY_HOTPLUG 166config MEMORY_HOTPLUG
148 bool "Allow for memory hot-add" 167 bool "Allow for memory hot-add"
@@ -188,6 +207,21 @@ config SPLIT_PTLOCK_CPUS
188 default "4" 207 default "4"
189 208
190# 209#
210# support for memory balloon compaction
211config BALLOON_COMPACTION
212 bool "Allow for balloon memory compaction/migration"
213 def_bool y
214 depends on COMPACTION && VIRTIO_BALLOON
215 help
216 Memory fragmentation introduced by ballooning might reduce
217 significantly the number of 2MB contiguous memory blocks that can be
218 used within a guest, thus imposing performance penalties associated
219 with the reduced number of transparent huge pages that could be used
220 by the guest workload. Allowing the compaction & migration for memory
221 pages enlisted as being part of memory balloon devices avoids the
222 scenario aforementioned and helps improve memory defragmentation.
223
224#
191# support for memory compaction 225# support for memory compaction
192config COMPACTION 226config COMPACTION
193 bool "Allow for memory compaction" 227 bool "Allow for memory compaction"
diff --git a/mm/Makefile b/mm/Makefile
index 6b025f80af34..3a4628751f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 util.o mmzone.o vmstat.o backing-dev.o \ 17 util.o mmzone.o vmstat.o backing-dev.o \
18 mm_init.o mmu_context.o percpu.o slab_common.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o interval_tree.o $(mmu-y) 19 compaction.o balloon_compaction.o \
20 interval_tree.o $(mmu-y)
20 21
21obj-y += init-mm.o 22obj-y += init-mm.o
22 23
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644
index 000000000000..07dbc8ec46cf
--- /dev/null
+++ b/mm/balloon_compaction.c
@@ -0,0 +1,302 @@
1/*
2 * mm/balloon_compaction.c
3 *
4 * Common interface for making balloon pages movable by compaction.
5 *
6 * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
7 */
8#include <linux/mm.h>
9#include <linux/slab.h>
10#include <linux/export.h>
11#include <linux/balloon_compaction.h>
12
13/*
14 * balloon_devinfo_alloc - allocates a balloon device information descriptor.
15 * @balloon_dev_descriptor: pointer to reference the balloon device which
16 * this struct balloon_dev_info will be servicing.
17 *
18 * Driver must call it to properly allocate and initialize an instance of
19 * struct balloon_dev_info which will be used to reference a balloon device
20 * as well as to keep track of the balloon device page list.
21 */
22struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
23{
24 struct balloon_dev_info *b_dev_info;
25 b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
26 if (!b_dev_info)
27 return ERR_PTR(-ENOMEM);
28
29 b_dev_info->balloon_device = balloon_dev_descriptor;
30 b_dev_info->mapping = NULL;
31 b_dev_info->isolated_pages = 0;
32 spin_lock_init(&b_dev_info->pages_lock);
33 INIT_LIST_HEAD(&b_dev_info->pages);
34
35 return b_dev_info;
36}
37EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
38
39/*
40 * balloon_page_enqueue - allocates a new page and inserts it into the balloon
41 * page list.
 42 * @b_dev_info: balloon device descriptor where we will insert a new page to
43 *
44 * Driver must call it to properly allocate a new enlisted balloon page
 45 * before definitively removing it from the guest system.
46 * This function returns the page address for the recently enqueued page or
47 * NULL in the case we fail to allocate a new page this turn.
48 */
49struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
50{
51 unsigned long flags;
52 struct page *page = alloc_page(balloon_mapping_gfp_mask() |
53 __GFP_NOMEMALLOC | __GFP_NORETRY);
54 if (!page)
55 return NULL;
56
57 /*
58 * Block others from accessing the 'page' when we get around to
59 * establishing additional references. We should be the only one
60 * holding a reference to the 'page' at this point.
61 */
62 BUG_ON(!trylock_page(page));
63 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
64 balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
65 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
66 unlock_page(page);
67 return page;
68}
69EXPORT_SYMBOL_GPL(balloon_page_enqueue);
70
71/*
72 * balloon_page_dequeue - removes a page from balloon's page list and returns
73 * the its address to allow the driver release the page.
 74 * @b_dev_info: balloon device descriptor where we will grab a page from.
75 *
76 * Driver must call it to properly de-allocate a previous enlisted balloon page
 77 * before definitively releasing it back to the guest system.
78 * This function returns the page address for the recently dequeued page or
79 * NULL in the case we find balloon's page list temporarily empty due to
80 * compaction isolated pages.
81 */
82struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
83{
84 struct page *page, *tmp;
85 unsigned long flags;
86 bool dequeued_page;
87
88 dequeued_page = false;
89 list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
90 /*
91 * Block others from accessing the 'page' while we get around
92 * establishing additional references and preparing the 'page'
93 * to be released by the balloon driver.
94 */
95 if (trylock_page(page)) {
96 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
97 /*
98 * Raise the page refcount here to prevent any wrong
 99 * attempt to isolate this page, in case of colliding
100 * with balloon_page_isolate() just after we release
101 * the page lock.
102 *
103 * balloon_page_free() will take care of dropping
104 * this extra refcount later.
105 */
106 get_page(page);
107 balloon_page_delete(page);
108 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
109 unlock_page(page);
110 dequeued_page = true;
111 break;
112 }
113 }
114
115 if (!dequeued_page) {
116 /*
117 * If we are unable to dequeue a balloon page because the page
118 * list is empty and there is no isolated pages, then something
119 * went out of track and some balloon pages are lost.
120 * BUG() here, otherwise the balloon driver may get stuck into
121 * an infinite loop while attempting to release all its pages.
122 */
123 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
124 if (unlikely(list_empty(&b_dev_info->pages) &&
125 !b_dev_info->isolated_pages))
126 BUG();
127 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
128 page = NULL;
129 }
130 return page;
131}
132EXPORT_SYMBOL_GPL(balloon_page_dequeue);
133
134#ifdef CONFIG_BALLOON_COMPACTION
135/*
136 * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
137 * @b_dev_info: holds the balloon device information descriptor.
138 * @a_ops: balloon_mapping address_space_operations descriptor.
139 *
140 * Driver must call it to properly allocate and initialize an instance of
141 * struct address_space which will be used as the special page->mapping for
142 * balloon device enlisted page instances.
143 */
144struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
145 const struct address_space_operations *a_ops)
146{
147 struct address_space *mapping;
148
149 mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
150 if (!mapping)
151 return ERR_PTR(-ENOMEM);
152
153 /*
154 * Give a clean 'zeroed' status to all elements of this special
155 * balloon page->mapping struct address_space instance.
156 */
157 address_space_init_once(mapping);
158
159 /*
160 * Set mapping->flags appropriately, to allow balloon pages
161 * ->mapping identification.
162 */
163 mapping_set_balloon(mapping);
164 mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
165
166 /* balloon's page->mapping->a_ops callback descriptor */
167 mapping->a_ops = a_ops;
168
169 /*
170 * Establish a pointer reference back to the balloon device descriptor
171 * this particular page->mapping will be servicing.
172 * This is used by compaction / migration procedures to identify and
173 * access the balloon device pageset while isolating / migrating pages.
174 *
175 * As some balloon drivers can register multiple balloon devices
176 * for a single guest, this also helps compaction / migration to
177 * properly deal with multiple balloon pagesets, when required.
178 */
179 mapping->private_data = b_dev_info;
180 b_dev_info->mapping = mapping;
181
182 return mapping;
183}
184EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
185
186static inline void __isolate_balloon_page(struct page *page)
187{
188 struct balloon_dev_info *b_dev_info = page->mapping->private_data;
189 unsigned long flags;
190 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
191 list_del(&page->lru);
192 b_dev_info->isolated_pages++;
193 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
194}
195
196static inline void __putback_balloon_page(struct page *page)
197{
198 struct balloon_dev_info *b_dev_info = page->mapping->private_data;
199 unsigned long flags;
200 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
201 list_add(&page->lru, &b_dev_info->pages);
202 b_dev_info->isolated_pages--;
203 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
204}
205
206static inline int __migrate_balloon_page(struct address_space *mapping,
207 struct page *newpage, struct page *page, enum migrate_mode mode)
208{
209 return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
210}
211
212/* __isolate_lru_page() counterpart for a ballooned page */
213bool balloon_page_isolate(struct page *page)
214{
215 /*
216 * Avoid burning cycles with pages that are yet under __free_pages(),
217 * or just got freed under us.
218 *
219 * In case we 'win' a race for a balloon page being freed under us and
220 * raise its refcount preventing __free_pages() from doing its job
221 * the put_page() at the end of this block will take care of
222 * release this page, thus avoiding a nasty leakage.
223 */
224 if (likely(get_page_unless_zero(page))) {
225 /*
226 * As balloon pages are not isolated from LRU lists, concurrent
227 * compaction threads can race against page migration functions
228 * as well as race against the balloon driver releasing a page.
229 *
230 * In order to avoid having an already isolated balloon page
231 * being (wrongly) re-isolated while it is under migration,
232 * or to avoid attempting to isolate pages being released by
233 * the balloon driver, lets be sure we have the page lock
234 * before proceeding with the balloon page isolation steps.
235 */
236 if (likely(trylock_page(page))) {
237 /*
238 * A ballooned page, by default, has just one refcount.
239 * Prevent concurrent compaction threads from isolating
240 * an already isolated balloon page by refcount check.
241 */
242 if (__is_movable_balloon_page(page) &&
243 page_count(page) == 2) {
244 __isolate_balloon_page(page);
245 unlock_page(page);
246 return true;
247 }
248 unlock_page(page);
249 }
250 put_page(page);
251 }
252 return false;
253}
254
255/* putback_lru_page() counterpart for a ballooned page */
256void balloon_page_putback(struct page *page)
257{
258 /*
259 * 'lock_page()' stabilizes the page and prevents races against
260 * concurrent isolation threads attempting to re-isolate it.
261 */
262 lock_page(page);
263
264 if (__is_movable_balloon_page(page)) {
265 __putback_balloon_page(page);
266 /* drop the extra ref count taken for page isolation */
267 put_page(page);
268 } else {
269 WARN_ON(1);
270 dump_page(page);
271 }
272 unlock_page(page);
273}
274
275/* move_to_new_page() counterpart for a ballooned page */
276int balloon_page_migrate(struct page *newpage,
277 struct page *page, enum migrate_mode mode)
278{
279 struct address_space *mapping;
280 int rc = -EAGAIN;
281
282 /*
283 * Block others from accessing the 'newpage' when we get around to
284 * establishing additional references. We should be the only one
285 * holding a reference to the 'newpage' at this point.
286 */
287 BUG_ON(!trylock_page(newpage));
288
289 if (WARN_ON(!__is_movable_balloon_page(page))) {
290 dump_page(page);
291 unlock_page(newpage);
292 return rc;
293 }
294
295 mapping = page->mapping;
296 if (mapping)
297 rc = __migrate_balloon_page(mapping, newpage, page, mode);
298
299 unlock_page(newpage);
300 return rc;
301}
302#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..b93376c39b61 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
147 147
148/* 148/*
149 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
150 * @addr: starting address of the range 150 * @addr: starting physical address of the range
151 * @size: size of the range in bytes 151 * @size: size of the range in bytes
152 * 152 *
153 * This is only useful when the bootmem allocator has already been torn 153 * This is only useful when the bootmem allocator has already been torn
154 * down, but we are still initializing the system. Pages are given directly 154 * down, but we are still initializing the system. Pages are given directly
155 * to the page allocator, no bootmem metadata is updated because it is gone. 155 * to the page allocator, no bootmem metadata is updated because it is gone.
156 */ 156 */
157void __init free_bootmem_late(unsigned long addr, unsigned long size) 157void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
158{ 158{
159 unsigned long cursor, end; 159 unsigned long cursor, end;
160 160
161 kmemleak_free_part(__va(addr), size); 161 kmemleak_free_part(__va(physaddr), size);
162 162
163 cursor = PFN_UP(addr); 163 cursor = PFN_UP(physaddr);
164 end = PFN_DOWN(addr + size); 164 end = PFN_DOWN(physaddr + size);
165 165
166 for (; cursor < end; cursor++) { 166 for (; cursor < end; cursor++) {
167 __free_pages_bootmem(pfn_to_page(cursor), 0); 167 __free_pages_bootmem(pfn_to_page(cursor), 0);
@@ -185,10 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
185 185
186 while (start < end) { 186 while (start < end) {
187 unsigned long *map, idx, vec; 187 unsigned long *map, idx, vec;
188 unsigned shift;
188 189
189 map = bdata->node_bootmem_map; 190 map = bdata->node_bootmem_map;
190 idx = start - bdata->node_min_pfn; 191 idx = start - bdata->node_min_pfn;
192 shift = idx & (BITS_PER_LONG - 1);
193 /*
194 * vec holds at most BITS_PER_LONG map bits,
195 * bit 0 corresponds to start.
196 */
191 vec = ~map[idx / BITS_PER_LONG]; 197 vec = ~map[idx / BITS_PER_LONG];
198
199 if (shift) {
200 vec >>= shift;
201 if (end - start >= BITS_PER_LONG)
202 vec |= ~map[idx / BITS_PER_LONG + 1] <<
203 (BITS_PER_LONG - shift);
204 }
192 /* 205 /*
193 * If we have a properly aligned and fully unreserved 206 * If we have a properly aligned and fully unreserved
194 * BITS_PER_LONG block of pages in front of us, free 207 * BITS_PER_LONG block of pages in front of us, free
@@ -201,19 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
201 count += BITS_PER_LONG; 214 count += BITS_PER_LONG;
202 start += BITS_PER_LONG; 215 start += BITS_PER_LONG;
203 } else { 216 } else {
204 unsigned long off = 0; 217 unsigned long cur = start;
205 218
206 vec >>= start & (BITS_PER_LONG - 1); 219 start = ALIGN(start + 1, BITS_PER_LONG);
207 while (vec) { 220 while (vec && cur != start) {
208 if (vec & 1) { 221 if (vec & 1) {
209 page = pfn_to_page(start + off); 222 page = pfn_to_page(cur);
210 __free_pages_bootmem(page, 0); 223 __free_pages_bootmem(page, 0);
211 count++; 224 count++;
212 } 225 }
213 vec >>= 1; 226 vec >>= 1;
214 off++; 227 ++cur;
215 } 228 }
216 start = ALIGN(start + 1, BITS_PER_LONG);
217 } 229 }
218 } 230 }
219 231
@@ -229,6 +241,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
229 return count; 241 return count;
230} 242}
231 243
244static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
245{
246 struct zone *z;
247
248 /*
249 * In free_area_init_core(), highmem zone's managed_pages is set to
250 * present_pages, and bootmem allocator doesn't allocate from highmem
251 * zones. So there's no need to recalculate managed_pages because all
252 * highmem pages will be managed by the buddy system. Here highmem
253 * zone also includes highmem movable zone.
254 */
255 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
256 if (!is_highmem(z))
257 z->managed_pages = 0;
258}
259
232/** 260/**
233 * free_all_bootmem_node - release a node's free pages to the buddy allocator 261 * free_all_bootmem_node - release a node's free pages to the buddy allocator
234 * @pgdat: node to be released 262 * @pgdat: node to be released
@@ -238,6 +266,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
238unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 266unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
239{ 267{
240 register_page_bootmem_info_node(pgdat); 268 register_page_bootmem_info_node(pgdat);
269 reset_node_lowmem_managed_pages(pgdat);
241 return free_all_bootmem_core(pgdat->bdata); 270 return free_all_bootmem_core(pgdat->bdata);
242} 271}
243 272
@@ -250,6 +279,10 @@ unsigned long __init free_all_bootmem(void)
250{ 279{
251 unsigned long total_pages = 0; 280 unsigned long total_pages = 0;
252 bootmem_data_t *bdata; 281 bootmem_data_t *bdata;
282 struct pglist_data *pgdat;
283
284 for_each_online_pgdat(pgdat)
285 reset_node_lowmem_managed_pages(pgdat);
253 286
254 list_for_each_entry(bdata, &bdata_list, list) 287 list_for_each_entry(bdata, &bdata_list, list)
255 total_pages += free_all_bootmem_core(bdata); 288 total_pages += free_all_bootmem_core(bdata);
@@ -377,21 +410,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
377 410
378/** 411/**
379 * free_bootmem - mark a page range as usable 412 * free_bootmem - mark a page range as usable
380 * @addr: starting address of the range 413 * @addr: starting physical address of the range
381 * @size: size of the range in bytes 414 * @size: size of the range in bytes
382 * 415 *
383 * Partial pages will be considered reserved and left as they are. 416 * Partial pages will be considered reserved and left as they are.
384 * 417 *
385 * The range must be contiguous but may span node boundaries. 418 * The range must be contiguous but may span node boundaries.
386 */ 419 */
387void __init free_bootmem(unsigned long addr, unsigned long size) 420void __init free_bootmem(unsigned long physaddr, unsigned long size)
388{ 421{
389 unsigned long start, end; 422 unsigned long start, end;
390 423
391 kmemleak_free_part(__va(addr), size); 424 kmemleak_free_part(__va(physaddr), size);
392 425
393 start = PFN_UP(addr); 426 start = PFN_UP(physaddr);
394 end = PFN_DOWN(addr + size); 427 end = PFN_DOWN(physaddr + size);
395 428
396 mark_bootmem(start, end, 0, 0); 429 mark_bootmem(start, end, 0, 0);
397} 430}
@@ -439,12 +472,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
439 return mark_bootmem(start, end, 1, flags); 472 return mark_bootmem(start, end, 1, flags);
440} 473}
441 474
442int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
443 int flags)
444{
445 return reserve_bootmem(phys, len, flags);
446}
447
448static unsigned long __init align_idx(struct bootmem_data *bdata, 475static unsigned long __init align_idx(struct bootmem_data *bdata,
449 unsigned long idx, unsigned long step) 476 unsigned long idx, unsigned long step)
450{ 477{
@@ -575,27 +602,6 @@ find_block:
575 return NULL; 602 return NULL;
576} 603}
577 604
578static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
579 unsigned long size, unsigned long align,
580 unsigned long goal, unsigned long limit)
581{
582 if (WARN_ON_ONCE(slab_is_available()))
583 return kzalloc(size, GFP_NOWAIT);
584
585#ifdef CONFIG_HAVE_ARCH_BOOTMEM
586 {
587 bootmem_data_t *p_bdata;
588
589 p_bdata = bootmem_arch_preferred_node(bdata, size, align,
590 goal, limit);
591 if (p_bdata)
592 return alloc_bootmem_bdata(p_bdata, size, align,
593 goal, limit);
594 }
595#endif
596 return NULL;
597}
598
599static void * __init alloc_bootmem_core(unsigned long size, 605static void * __init alloc_bootmem_core(unsigned long size,
600 unsigned long align, 606 unsigned long align,
601 unsigned long goal, 607 unsigned long goal,
@@ -604,9 +610,8 @@ static void * __init alloc_bootmem_core(unsigned long size,
604 bootmem_data_t *bdata; 610 bootmem_data_t *bdata;
605 void *region; 611 void *region;
606 612
607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); 613 if (WARN_ON_ONCE(slab_is_available()))
608 if (region) 614 return kzalloc(size, GFP_NOWAIT);
609 return region;
610 615
611 list_for_each_entry(bdata, &bdata_list, list) { 616 list_for_each_entry(bdata, &bdata_list, list) {
612 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) 617 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -704,11 +709,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
704{ 709{
705 void *ptr; 710 void *ptr;
706 711
712 if (WARN_ON_ONCE(slab_is_available()))
713 return kzalloc(size, GFP_NOWAIT);
707again: 714again:
708 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
709 align, goal, limit);
710 if (ptr)
711 return ptr;
712 715
713 /* do not panic in alloc_bootmem_bdata() */ 716 /* do not panic in alloc_bootmem_bdata() */
714 if (limit && goal + size > limit) 717 if (limit && goal + size > limit)
diff --git a/mm/compaction.c b/mm/compaction.c
index 9eef55838fca..c62bd063d766 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -14,8 +14,24 @@
14#include <linux/backing-dev.h> 14#include <linux/backing-dev.h>
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include <linux/balloon_compaction.h>
17#include "internal.h" 18#include "internal.h"
18 19
20#ifdef CONFIG_COMPACTION
21static inline void count_compact_event(enum vm_event_item item)
22{
23 count_vm_event(item);
24}
25
26static inline void count_compact_events(enum vm_event_item item, long delta)
27{
28 count_vm_events(item, delta);
29}
30#else
31#define count_compact_event(item) do { } while (0)
32#define count_compact_events(item, delta) do { } while (0)
33#endif
34
19#if defined CONFIG_COMPACTION || defined CONFIG_CMA 35#if defined CONFIG_COMPACTION || defined CONFIG_CMA
20 36
21#define CREATE_TRACE_POINTS 37#define CREATE_TRACE_POINTS
@@ -214,60 +230,6 @@ static bool suitable_migration_target(struct page *page)
214 return false; 230 return false;
215} 231}
216 232
217static void compact_capture_page(struct compact_control *cc)
218{
219 unsigned long flags;
220 int mtype, mtype_low, mtype_high;
221
222 if (!cc->page || *cc->page)
223 return;
224
225 /*
226 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
227 * regardless of the migratetype of the freelist is is captured from.
228 * This is fine because the order for a high-order MIGRATE_MOVABLE
229 * allocation is typically at least a pageblock size and overall
230 * fragmentation is not impaired. Other allocation types must
231 * capture pages from their own migratelist because otherwise they
232 * could pollute other pageblocks like MIGRATE_MOVABLE with
233 * difficult to move pages and making fragmentation worse overall.
234 */
235 if (cc->migratetype == MIGRATE_MOVABLE) {
236 mtype_low = 0;
237 mtype_high = MIGRATE_PCPTYPES;
238 } else {
239 mtype_low = cc->migratetype;
240 mtype_high = cc->migratetype + 1;
241 }
242
243 /* Speculatively examine the free lists without zone lock */
244 for (mtype = mtype_low; mtype < mtype_high; mtype++) {
245 int order;
246 for (order = cc->order; order < MAX_ORDER; order++) {
247 struct page *page;
248 struct free_area *area;
249 area = &(cc->zone->free_area[order]);
250 if (list_empty(&area->free_list[mtype]))
251 continue;
252
253 /* Take the lock and attempt capture of the page */
254 if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
255 return;
256 if (!list_empty(&area->free_list[mtype])) {
257 page = list_entry(area->free_list[mtype].next,
258 struct page, lru);
259 if (capture_free_page(page, cc->order, mtype)) {
260 spin_unlock_irqrestore(&cc->zone->lock,
261 flags);
262 *cc->page = page;
263 return;
264 }
265 }
266 spin_unlock_irqrestore(&cc->zone->lock, flags);
267 }
268 }
269}
270
271/* 233/*
272 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 234 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
273 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 235 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
@@ -356,6 +318,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
356 if (blockpfn == end_pfn) 318 if (blockpfn == end_pfn)
357 update_pageblock_skip(cc, valid_page, total_isolated, false); 319 update_pageblock_skip(cc, valid_page, total_isolated, false);
358 320
321 count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
322 if (total_isolated)
323 count_compact_events(COMPACTISOLATED, total_isolated);
359 return total_isolated; 324 return total_isolated;
360} 325}
361 326
@@ -565,9 +530,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
565 goto next_pageblock; 530 goto next_pageblock;
566 } 531 }
567 532
568 /* Check may be lockless but that's ok as we recheck later */ 533 /*
569 if (!PageLRU(page)) 534 * Check may be lockless but that's ok as we recheck later.
535 * It's possible to migrate LRU pages and balloon pages
536 * Skip any other type of page
537 */
538 if (!PageLRU(page)) {
539 if (unlikely(balloon_page_movable(page))) {
540 if (locked && balloon_page_isolate(page)) {
541 /* Successfully isolated */
542 cc->finished_update_migrate = true;
543 list_add(&page->lru, migratelist);
544 cc->nr_migratepages++;
545 nr_isolated++;
546 goto check_compact_cluster;
547 }
548 }
570 continue; 549 continue;
550 }
571 551
572 /* 552 /*
573 * PageLRU is set. lru_lock normally excludes isolation 553 * PageLRU is set. lru_lock normally excludes isolation
@@ -621,6 +601,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
621 cc->nr_migratepages++; 601 cc->nr_migratepages++;
622 nr_isolated++; 602 nr_isolated++;
623 603
604check_compact_cluster:
624 /* Avoid isolating too much */ 605 /* Avoid isolating too much */
625 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { 606 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
626 ++low_pfn; 607 ++low_pfn;
@@ -646,6 +627,10 @@ next_pageblock:
646 627
647 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 628 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
648 629
630 count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
631 if (nr_isolated)
632 count_compact_events(COMPACTISOLATED, nr_isolated);
633
649 return low_pfn; 634 return low_pfn;
650} 635}
651 636
@@ -713,7 +698,15 @@ static void isolate_freepages(struct zone *zone,
713 698
714 /* Found a block suitable for isolating free pages from */ 699 /* Found a block suitable for isolating free pages from */
715 isolated = 0; 700 isolated = 0;
716 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); 701
702 /*
703 * As pfn may not start aligned, pfn+pageblock_nr_page
704 * may cross a MAX_ORDER_NR_PAGES boundary and miss
705 * a pfn_valid check. Ensure isolate_freepages_block()
706 * only scans within a pageblock
707 */
708 end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
709 end_pfn = min(end_pfn, zone_end_pfn);
717 isolated = isolate_freepages_block(cc, pfn, end_pfn, 710 isolated = isolate_freepages_block(cc, pfn, end_pfn,
718 freelist, false); 711 freelist, false);
719 nr_freepages += isolated; 712 nr_freepages += isolated;
@@ -823,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
823static int compact_finished(struct zone *zone, 816static int compact_finished(struct zone *zone,
824 struct compact_control *cc) 817 struct compact_control *cc)
825{ 818{
819 unsigned int order;
826 unsigned long watermark; 820 unsigned long watermark;
827 821
828 if (fatal_signal_pending(current)) 822 if (fatal_signal_pending(current))
@@ -857,22 +851,16 @@ static int compact_finished(struct zone *zone,
857 return COMPACT_CONTINUE; 851 return COMPACT_CONTINUE;
858 852
859 /* Direct compactor: Is a suitable page free? */ 853 /* Direct compactor: Is a suitable page free? */
860 if (cc->page) { 854 for (order = cc->order; order < MAX_ORDER; order++) {
861 /* Was a suitable page captured? */ 855 struct free_area *area = &zone->free_area[order];
862 if (*cc->page) 856
857 /* Job done if page is free of the right migratetype */
858 if (!list_empty(&area->free_list[cc->migratetype]))
859 return COMPACT_PARTIAL;
860
861 /* Job done if allocation would set block type */
862 if (cc->order >= pageblock_order && area->nr_free)
863 return COMPACT_PARTIAL; 863 return COMPACT_PARTIAL;
864 } else {
865 unsigned int order;
866 for (order = cc->order; order < MAX_ORDER; order++) {
867 struct free_area *area = &zone->free_area[cc->order];
868 /* Job done if page is free of the right migratetype */
869 if (!list_empty(&area->free_list[cc->migratetype]))
870 return COMPACT_PARTIAL;
871
872 /* Job done if allocation would set block type */
873 if (cc->order >= pageblock_order && area->nr_free)
874 return COMPACT_PARTIAL;
875 }
876 } 864 }
877 865
878 return COMPACT_CONTINUE; 866 return COMPACT_CONTINUE;
@@ -978,7 +966,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
978 switch (isolate_migratepages(zone, cc)) { 966 switch (isolate_migratepages(zone, cc)) {
979 case ISOLATE_ABORT: 967 case ISOLATE_ABORT:
980 ret = COMPACT_PARTIAL; 968 ret = COMPACT_PARTIAL;
981 putback_lru_pages(&cc->migratepages); 969 putback_movable_pages(&cc->migratepages);
982 cc->nr_migratepages = 0; 970 cc->nr_migratepages = 0;
983 goto out; 971 goto out;
984 case ISOLATE_NONE: 972 case ISOLATE_NONE:
@@ -990,29 +978,23 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
990 nr_migrate = cc->nr_migratepages; 978 nr_migrate = cc->nr_migratepages;
991 err = migrate_pages(&cc->migratepages, compaction_alloc, 979 err = migrate_pages(&cc->migratepages, compaction_alloc,
992 (unsigned long)cc, false, 980 (unsigned long)cc, false,
993 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); 981 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
982 MR_COMPACTION);
994 update_nr_listpages(cc); 983 update_nr_listpages(cc);
995 nr_remaining = cc->nr_migratepages; 984 nr_remaining = cc->nr_migratepages;
996 985
997 count_vm_event(COMPACTBLOCKS);
998 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
999 if (nr_remaining)
1000 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
1001 trace_mm_compaction_migratepages(nr_migrate - nr_remaining, 986 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
1002 nr_remaining); 987 nr_remaining);
1003 988
1004 /* Release LRU pages not migrated */ 989 /* Release isolated pages not migrated */
1005 if (err) { 990 if (err) {
1006 putback_lru_pages(&cc->migratepages); 991 putback_movable_pages(&cc->migratepages);
1007 cc->nr_migratepages = 0; 992 cc->nr_migratepages = 0;
1008 if (err == -ENOMEM) { 993 if (err == -ENOMEM) {
1009 ret = COMPACT_PARTIAL; 994 ret = COMPACT_PARTIAL;
1010 goto out; 995 goto out;
1011 } 996 }
1012 } 997 }
1013
1014 /* Capture a page now if it is a suitable size */
1015 compact_capture_page(cc);
1016 } 998 }
1017 999
1018out: 1000out:
@@ -1025,8 +1007,7 @@ out:
1025 1007
1026static unsigned long compact_zone_order(struct zone *zone, 1008static unsigned long compact_zone_order(struct zone *zone,
1027 int order, gfp_t gfp_mask, 1009 int order, gfp_t gfp_mask,
1028 bool sync, bool *contended, 1010 bool sync, bool *contended)
1029 struct page **page)
1030{ 1011{
1031 unsigned long ret; 1012 unsigned long ret;
1032 struct compact_control cc = { 1013 struct compact_control cc = {
@@ -1036,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone,
1036 .migratetype = allocflags_to_migratetype(gfp_mask), 1017 .migratetype = allocflags_to_migratetype(gfp_mask),
1037 .zone = zone, 1018 .zone = zone,
1038 .sync = sync, 1019 .sync = sync,
1039 .page = page,
1040 }; 1020 };
1041 INIT_LIST_HEAD(&cc.freepages); 1021 INIT_LIST_HEAD(&cc.freepages);
1042 INIT_LIST_HEAD(&cc.migratepages); 1022 INIT_LIST_HEAD(&cc.migratepages);
@@ -1066,7 +1046,7 @@ int sysctl_extfrag_threshold = 500;
1066 */ 1046 */
1067unsigned long try_to_compact_pages(struct zonelist *zonelist, 1047unsigned long try_to_compact_pages(struct zonelist *zonelist,
1068 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1048 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1069 bool sync, bool *contended, struct page **page) 1049 bool sync, bool *contended)
1070{ 1050{
1071 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1051 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1072 int may_enter_fs = gfp_mask & __GFP_FS; 1052 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1080,7 +1060,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1080 if (!order || !may_enter_fs || !may_perform_io) 1060 if (!order || !may_enter_fs || !may_perform_io)
1081 return rc; 1061 return rc;
1082 1062
1083 count_vm_event(COMPACTSTALL); 1063 count_compact_event(COMPACTSTALL);
1084 1064
1085#ifdef CONFIG_CMA 1065#ifdef CONFIG_CMA
1086 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 1066 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
@@ -1092,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1092 int status; 1072 int status;
1093 1073
1094 status = compact_zone_order(zone, order, gfp_mask, sync, 1074 status = compact_zone_order(zone, order, gfp_mask, sync,
1095 contended, page); 1075 contended);
1096 rc = max(status, rc); 1076 rc = max(status, rc);
1097 1077
1098 /* If a normal allocation would succeed, stop compacting */ 1078 /* If a normal allocation would succeed, stop compacting */
@@ -1148,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
1148 struct compact_control cc = { 1128 struct compact_control cc = {
1149 .order = order, 1129 .order = order,
1150 .sync = false, 1130 .sync = false,
1151 .page = NULL,
1152 }; 1131 };
1153 1132
1154 return __compact_pgdat(pgdat, &cc); 1133 return __compact_pgdat(pgdat, &cc);
@@ -1159,14 +1138,13 @@ static int compact_node(int nid)
1159 struct compact_control cc = { 1138 struct compact_control cc = {
1160 .order = -1, 1139 .order = -1,
1161 .sync = true, 1140 .sync = true,
1162 .page = NULL,
1163 }; 1141 };
1164 1142
1165 return __compact_pgdat(NODE_DATA(nid), &cc); 1143 return __compact_pgdat(NODE_DATA(nid), &cc);
1166} 1144}
1167 1145
1168/* Compact all nodes in the system */ 1146/* Compact all nodes in the system */
1169static int compact_nodes(void) 1147static void compact_nodes(void)
1170{ 1148{
1171 int nid; 1149 int nid;
1172 1150
@@ -1175,8 +1153,6 @@ static int compact_nodes(void)
1175 1153
1176 for_each_online_node(nid) 1154 for_each_online_node(nid)
1177 compact_node(nid); 1155 compact_node(nid);
1178
1179 return COMPACT_COMPLETE;
1180} 1156}
1181 1157
1182/* The written value is actually unused, all memory is compacted */ 1158/* The written value is actually unused, all memory is compacted */
@@ -1187,7 +1163,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
1187 void __user *buffer, size_t *length, loff_t *ppos) 1163 void __user *buffer, size_t *length, loff_t *ppos)
1188{ 1164{
1189 if (write) 1165 if (write)
1190 return compact_nodes(); 1166 compact_nodes();
1191 1167
1192 return 0; 1168 return 0;
1193} 1169}
diff --git a/mm/dmapool.c b/mm/dmapool.c
index c5ab33bca0a8..c69781e97cf9 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -50,7 +50,6 @@ struct dma_pool { /* the pool */
50 size_t allocation; 50 size_t allocation;
51 size_t boundary; 51 size_t boundary;
52 char name[32]; 52 char name[32];
53 wait_queue_head_t waitq;
54 struct list_head pools; 53 struct list_head pools;
55}; 54};
56 55
@@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
62 unsigned int offset; 61 unsigned int offset;
63}; 62};
64 63
65#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
66
67static DEFINE_MUTEX(pools_lock); 64static DEFINE_MUTEX(pools_lock);
68 65
69static ssize_t 66static ssize_t
@@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
172 retval->size = size; 169 retval->size = size;
173 retval->boundary = boundary; 170 retval->boundary = boundary;
174 retval->allocation = allocation; 171 retval->allocation = allocation;
175 init_waitqueue_head(&retval->waitq);
176 172
177 if (dev) { 173 if (dev) {
178 int ret; 174 int ret;
@@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
227 memset(page->vaddr, POOL_POISON_FREED, pool->allocation); 223 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
228#endif 224#endif
229 pool_initialise_page(pool, page); 225 pool_initialise_page(pool, page);
230 list_add(&page->page_list, &pool->page_list);
231 page->in_use = 0; 226 page->in_use = 0;
232 page->offset = 0; 227 page->offset = 0;
233 } else { 228 } else {
@@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
315 might_sleep_if(mem_flags & __GFP_WAIT); 310 might_sleep_if(mem_flags & __GFP_WAIT);
316 311
317 spin_lock_irqsave(&pool->lock, flags); 312 spin_lock_irqsave(&pool->lock, flags);
318 restart:
319 list_for_each_entry(page, &pool->page_list, page_list) { 313 list_for_each_entry(page, &pool->page_list, page_list) {
320 if (page->offset < pool->allocation) 314 if (page->offset < pool->allocation)
321 goto ready; 315 goto ready;
322 } 316 }
323 page = pool_alloc_page(pool, GFP_ATOMIC);
324 if (!page) {
325 if (mem_flags & __GFP_WAIT) {
326 DECLARE_WAITQUEUE(wait, current);
327 317
328 __set_current_state(TASK_UNINTERRUPTIBLE); 318 /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
329 __add_wait_queue(&pool->waitq, &wait); 319 spin_unlock_irqrestore(&pool->lock, flags);
330 spin_unlock_irqrestore(&pool->lock, flags);
331 320
332 schedule_timeout(POOL_TIMEOUT_JIFFIES); 321 page = pool_alloc_page(pool, mem_flags);
322 if (!page)
323 return NULL;
333 324
334 spin_lock_irqsave(&pool->lock, flags); 325 spin_lock_irqsave(&pool->lock, flags);
335 __remove_wait_queue(&pool->waitq, &wait);
336 goto restart;
337 }
338 retval = NULL;
339 goto done;
340 }
341 326
327 list_add(&page->page_list, &pool->page_list);
342 ready: 328 ready:
343 page->in_use++; 329 page->in_use++;
344 offset = page->offset; 330 offset = page->offset;
@@ -346,9 +332,32 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
346 retval = offset + page->vaddr; 332 retval = offset + page->vaddr;
347 *handle = offset + page->dma; 333 *handle = offset + page->dma;
348#ifdef DMAPOOL_DEBUG 334#ifdef DMAPOOL_DEBUG
335 {
336 int i;
337 u8 *data = retval;
338 /* page->offset is stored in first 4 bytes */
339 for (i = sizeof(page->offset); i < pool->size; i++) {
340 if (data[i] == POOL_POISON_FREED)
341 continue;
342 if (pool->dev)
343 dev_err(pool->dev,
344 "dma_pool_alloc %s, %p (corruped)\n",
345 pool->name, retval);
346 else
347 pr_err("dma_pool_alloc %s, %p (corruped)\n",
348 pool->name, retval);
349
350 /*
351 * Dump the first 4 bytes even if they are not
352 * POOL_POISON_FREED
353 */
354 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
355 data, pool->size, 1);
356 break;
357 }
358 }
349 memset(retval, POOL_POISON_ALLOCATED, pool->size); 359 memset(retval, POOL_POISON_ALLOCATED, pool->size);
350#endif 360#endif
351 done:
352 spin_unlock_irqrestore(&pool->lock, flags); 361 spin_unlock_irqrestore(&pool->lock, flags);
353 return retval; 362 return retval;
354} 363}
@@ -435,8 +444,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
435 page->in_use--; 444 page->in_use--;
436 *(int *)vaddr = page->offset; 445 *(int *)vaddr = page->offset;
437 page->offset = offset; 446 page->offset = offset;
438 if (waitqueue_active(&pool->waitq))
439 wake_up_locked(&pool->waitq);
440 /* 447 /*
441 * Resist a temptation to do 448 * Resist a temptation to do
442 * if (!is_page_busy(page)) pool_free_page(pool, page); 449 * if (!is_page_busy(page)) pool_free_page(pool, page);
diff --git a/mm/highmem.c b/mm/highmem.c
index 2da13a5c50e2..b32b70cdaed6 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -99,12 +99,13 @@ struct page *kmap_to_page(void *vaddr)
99 unsigned long addr = (unsigned long)vaddr; 99 unsigned long addr = (unsigned long)vaddr;
100 100
101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { 101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; 102 int i = PKMAP_NR(addr);
103 return pte_page(pkmap_page_table[i]); 103 return pte_page(pkmap_page_table[i]);
104 } 104 }
105 105
106 return virt_to_page(addr); 106 return virt_to_page(addr);
107} 107}
108EXPORT_SYMBOL(kmap_to_page);
108 109
109static void flush_all_zero_pkmaps(void) 110static void flush_all_zero_pkmaps(void)
110{ 111{
@@ -137,8 +138,7 @@ static void flush_all_zero_pkmaps(void)
137 * So no dangers, even with speculative execution. 138 * So no dangers, even with speculative execution.
138 */ 139 */
139 page = pte_page(pkmap_page_table[i]); 140 page = pte_page(pkmap_page_table[i]);
140 pte_clear(&init_mm, (unsigned long)page_address(page), 141 pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
141 &pkmap_page_table[i]);
142 142
143 set_page_address(page, NULL); 143 set_page_address(page, NULL);
144 need_flush = 1; 144 need_flush = 1;
@@ -324,11 +324,7 @@ struct page_address_map {
324 struct list_head list; 324 struct list_head list;
325}; 325};
326 326
327/* 327static struct page_address_map page_address_maps[LAST_PKMAP];
328 * page_address_map freelist, allocated from page_address_maps.
329 */
330static struct list_head page_address_pool; /* freelist */
331static spinlock_t pool_lock; /* protects page_address_pool */
332 328
333/* 329/*
334 * Hash table bucket 330 * Hash table bucket
@@ -393,14 +389,7 @@ void set_page_address(struct page *page, void *virtual)
393 389
394 pas = page_slot(page); 390 pas = page_slot(page);
395 if (virtual) { /* Add */ 391 if (virtual) { /* Add */
396 BUG_ON(list_empty(&page_address_pool)); 392 pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
397
398 spin_lock_irqsave(&pool_lock, flags);
399 pam = list_entry(page_address_pool.next,
400 struct page_address_map, list);
401 list_del(&pam->list);
402 spin_unlock_irqrestore(&pool_lock, flags);
403
404 pam->page = page; 393 pam->page = page;
405 pam->virtual = virtual; 394 pam->virtual = virtual;
406 395
@@ -413,9 +402,6 @@ void set_page_address(struct page *page, void *virtual)
413 if (pam->page == page) { 402 if (pam->page == page) {
414 list_del(&pam->list); 403 list_del(&pam->list);
415 spin_unlock_irqrestore(&pas->lock, flags); 404 spin_unlock_irqrestore(&pas->lock, flags);
416 spin_lock_irqsave(&pool_lock, flags);
417 list_add_tail(&pam->list, &page_address_pool);
418 spin_unlock_irqrestore(&pool_lock, flags);
419 goto done; 405 goto done;
420 } 406 }
421 } 407 }
@@ -425,20 +411,14 @@ done:
425 return; 411 return;
426} 412}
427 413
428static struct page_address_map page_address_maps[LAST_PKMAP];
429
430void __init page_address_init(void) 414void __init page_address_init(void)
431{ 415{
432 int i; 416 int i;
433 417
434 INIT_LIST_HEAD(&page_address_pool);
435 for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
436 list_add(&page_address_maps[i].list, &page_address_pool);
437 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { 418 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
438 INIT_LIST_HEAD(&page_address_htable[i].lh); 419 INIT_LIST_HEAD(&page_address_htable[i].lh);
439 spin_lock_init(&page_address_htable[i].lock); 420 spin_lock_init(&page_address_htable[i].lock);
440 } 421 }
441 spin_lock_init(&pool_lock);
442} 422}
443 423
444#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 424#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40f17c34b415..6001ee6347a9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,12 +12,15 @@
12#include <linux/mmu_notifier.h> 12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h> 13#include <linux/rmap.h>
14#include <linux/swap.h> 14#include <linux/swap.h>
15#include <linux/shrinker.h>
15#include <linux/mm_inline.h> 16#include <linux/mm_inline.h>
16#include <linux/kthread.h> 17#include <linux/kthread.h>
17#include <linux/khugepaged.h> 18#include <linux/khugepaged.h>
18#include <linux/freezer.h> 19#include <linux/freezer.h>
19#include <linux/mman.h> 20#include <linux/mman.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/migrate.h>
23
21#include <asm/tlb.h> 24#include <asm/tlb.h>
22#include <asm/pgalloc.h> 25#include <asm/pgalloc.h>
23#include "internal.h" 26#include "internal.h"
@@ -37,7 +40,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
37 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| 40 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
38#endif 41#endif
39 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| 42 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
40 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); 43 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
44 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
41 45
42/* default scan 8*512 pte (or vmas) every 30 second */ 46/* default scan 8*512 pte (or vmas) every 30 second */
43static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; 47static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -159,6 +163,77 @@ static int start_khugepaged(void)
159 return err; 163 return err;
160} 164}
161 165
166static atomic_t huge_zero_refcount;
167static unsigned long huge_zero_pfn __read_mostly;
168
169static inline bool is_huge_zero_pfn(unsigned long pfn)
170{
171 unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
172 return zero_pfn && pfn == zero_pfn;
173}
174
175static inline bool is_huge_zero_pmd(pmd_t pmd)
176{
177 return is_huge_zero_pfn(pmd_pfn(pmd));
178}
179
180static unsigned long get_huge_zero_page(void)
181{
182 struct page *zero_page;
183retry:
184 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
185 return ACCESS_ONCE(huge_zero_pfn);
186
187 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
188 HPAGE_PMD_ORDER);
189 if (!zero_page) {
190 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
191 return 0;
192 }
193 count_vm_event(THP_ZERO_PAGE_ALLOC);
194 preempt_disable();
195 if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
196 preempt_enable();
197 __free_page(zero_page);
198 goto retry;
199 }
200
201 /* We take additional reference here. It will be put back by shrinker */
202 atomic_set(&huge_zero_refcount, 2);
203 preempt_enable();
204 return ACCESS_ONCE(huge_zero_pfn);
205}
206
207static void put_huge_zero_page(void)
208{
209 /*
210 * Counter should never go to zero here. Only shrinker can put
211 * last reference.
212 */
213 BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
214}
215
216static int shrink_huge_zero_page(struct shrinker *shrink,
217 struct shrink_control *sc)
218{
219 if (!sc->nr_to_scan)
220 /* we can free zero page only if last reference remains */
221 return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
222
223 if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
224 unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
225 BUG_ON(zero_pfn == 0);
226 __free_page(__pfn_to_page(zero_pfn));
227 }
228
229 return 0;
230}
231
232static struct shrinker huge_zero_page_shrinker = {
233 .shrink = shrink_huge_zero_page,
234 .seeks = DEFAULT_SEEKS,
235};
236
162#ifdef CONFIG_SYSFS 237#ifdef CONFIG_SYSFS
163 238
164static ssize_t double_flag_show(struct kobject *kobj, 239static ssize_t double_flag_show(struct kobject *kobj,
@@ -284,6 +359,20 @@ static ssize_t defrag_store(struct kobject *kobj,
284static struct kobj_attribute defrag_attr = 359static struct kobj_attribute defrag_attr =
285 __ATTR(defrag, 0644, defrag_show, defrag_store); 360 __ATTR(defrag, 0644, defrag_show, defrag_store);
286 361
362static ssize_t use_zero_page_show(struct kobject *kobj,
363 struct kobj_attribute *attr, char *buf)
364{
365 return single_flag_show(kobj, attr, buf,
366 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
367}
368static ssize_t use_zero_page_store(struct kobject *kobj,
369 struct kobj_attribute *attr, const char *buf, size_t count)
370{
371 return single_flag_store(kobj, attr, buf, count,
372 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
373}
374static struct kobj_attribute use_zero_page_attr =
375 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
287#ifdef CONFIG_DEBUG_VM 376#ifdef CONFIG_DEBUG_VM
288static ssize_t debug_cow_show(struct kobject *kobj, 377static ssize_t debug_cow_show(struct kobject *kobj,
289 struct kobj_attribute *attr, char *buf) 378 struct kobj_attribute *attr, char *buf)
@@ -305,6 +394,7 @@ static struct kobj_attribute debug_cow_attr =
305static struct attribute *hugepage_attr[] = { 394static struct attribute *hugepage_attr[] = {
306 &enabled_attr.attr, 395 &enabled_attr.attr,
307 &defrag_attr.attr, 396 &defrag_attr.attr,
397 &use_zero_page_attr.attr,
308#ifdef CONFIG_DEBUG_VM 398#ifdef CONFIG_DEBUG_VM
309 &debug_cow_attr.attr, 399 &debug_cow_attr.attr,
310#endif 400#endif
@@ -484,19 +574,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
484 574
485 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); 575 *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
486 if (unlikely(!*hugepage_kobj)) { 576 if (unlikely(!*hugepage_kobj)) {
487 printk(KERN_ERR "hugepage: failed kobject create\n"); 577 printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n");
488 return -ENOMEM; 578 return -ENOMEM;
489 } 579 }
490 580
491 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); 581 err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
492 if (err) { 582 if (err) {
493 printk(KERN_ERR "hugepage: failed register hugeage group\n"); 583 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
494 goto delete_obj; 584 goto delete_obj;
495 } 585 }
496 586
497 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); 587 err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
498 if (err) { 588 if (err) {
499 printk(KERN_ERR "hugepage: failed register hugeage group\n"); 589 printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
500 goto remove_hp_group; 590 goto remove_hp_group;
501 } 591 }
502 592
@@ -550,6 +640,8 @@ static int __init hugepage_init(void)
550 goto out; 640 goto out;
551 } 641 }
552 642
643 register_shrinker(&huge_zero_page_shrinker);
644
553 /* 645 /*
554 * By default disable transparent hugepages on smaller systems, 646 * By default disable transparent hugepages on smaller systems,
555 * where the extra memory used could hurt more than TLB overhead 647 * where the extra memory used could hurt more than TLB overhead
@@ -599,13 +691,22 @@ out:
599} 691}
600__setup("transparent_hugepage=", setup_transparent_hugepage); 692__setup("transparent_hugepage=", setup_transparent_hugepage);
601 693
602static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 694pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
603{ 695{
604 if (likely(vma->vm_flags & VM_WRITE)) 696 if (likely(vma->vm_flags & VM_WRITE))
605 pmd = pmd_mkwrite(pmd); 697 pmd = pmd_mkwrite(pmd);
606 return pmd; 698 return pmd;
607} 699}
608 700
701static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
702{
703 pmd_t entry;
704 entry = mk_pmd(page, vma->vm_page_prot);
705 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
706 entry = pmd_mkhuge(entry);
707 return entry;
708}
709
609static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 710static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
610 struct vm_area_struct *vma, 711 struct vm_area_struct *vma,
611 unsigned long haddr, pmd_t *pmd, 712 unsigned long haddr, pmd_t *pmd,
@@ -629,9 +730,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
629 pte_free(mm, pgtable); 730 pte_free(mm, pgtable);
630 } else { 731 } else {
631 pmd_t entry; 732 pmd_t entry;
632 entry = mk_pmd(page, vma->vm_page_prot); 733 entry = mk_huge_pmd(page, vma);
633 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
634 entry = pmd_mkhuge(entry);
635 /* 734 /*
636 * The spinlocking to take the lru_lock inside 735 * The spinlocking to take the lru_lock inside
637 * page_add_new_anon_rmap() acts as a full memory 736 * page_add_new_anon_rmap() acts as a full memory
@@ -671,6 +770,22 @@ static inline struct page *alloc_hugepage(int defrag)
671} 770}
672#endif 771#endif
673 772
773static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
774 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
775 unsigned long zero_pfn)
776{
777 pmd_t entry;
778 if (!pmd_none(*pmd))
779 return false;
780 entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
781 entry = pmd_wrprotect(entry);
782 entry = pmd_mkhuge(entry);
783 set_pmd_at(mm, haddr, pmd, entry);
784 pgtable_trans_huge_deposit(mm, pgtable);
785 mm->nr_ptes++;
786 return true;
787}
788
674int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 789int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
675 unsigned long address, pmd_t *pmd, 790 unsigned long address, pmd_t *pmd,
676 unsigned int flags) 791 unsigned int flags)
@@ -684,6 +799,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
684 return VM_FAULT_OOM; 799 return VM_FAULT_OOM;
685 if (unlikely(khugepaged_enter(vma))) 800 if (unlikely(khugepaged_enter(vma)))
686 return VM_FAULT_OOM; 801 return VM_FAULT_OOM;
802 if (!(flags & FAULT_FLAG_WRITE) &&
803 transparent_hugepage_use_zero_page()) {
804 pgtable_t pgtable;
805 unsigned long zero_pfn;
806 bool set;
807 pgtable = pte_alloc_one(mm, haddr);
808 if (unlikely(!pgtable))
809 return VM_FAULT_OOM;
810 zero_pfn = get_huge_zero_page();
811 if (unlikely(!zero_pfn)) {
812 pte_free(mm, pgtable);
813 count_vm_event(THP_FAULT_FALLBACK);
814 goto out;
815 }
816 spin_lock(&mm->page_table_lock);
817 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
818 zero_pfn);
819 spin_unlock(&mm->page_table_lock);
820 if (!set) {
821 pte_free(mm, pgtable);
822 put_huge_zero_page();
823 }
824 return 0;
825 }
687 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 826 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
688 vma, haddr, numa_node_id(), 0); 827 vma, haddr, numa_node_id(), 0);
689 if (unlikely(!page)) { 828 if (unlikely(!page)) {
@@ -710,7 +849,8 @@ out:
710 * run pte_offset_map on the pmd, if an huge pmd could 849 * run pte_offset_map on the pmd, if an huge pmd could
711 * materialize from under us from a different thread. 850 * materialize from under us from a different thread.
712 */ 851 */
713 if (unlikely(__pte_alloc(mm, vma, pmd, address))) 852 if (unlikely(pmd_none(*pmd)) &&
853 unlikely(__pte_alloc(mm, vma, pmd, address)))
714 return VM_FAULT_OOM; 854 return VM_FAULT_OOM;
715 /* if an huge pmd materialized from under us just retry later */ 855 /* if an huge pmd materialized from under us just retry later */
716 if (unlikely(pmd_trans_huge(*pmd))) 856 if (unlikely(pmd_trans_huge(*pmd)))
@@ -748,6 +888,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
748 pte_free(dst_mm, pgtable); 888 pte_free(dst_mm, pgtable);
749 goto out_unlock; 889 goto out_unlock;
750 } 890 }
891 /*
892 * mm->page_table_lock is enough to be sure that huge zero pmd is not
893 * under splitting since we don't split the page itself, only pmd to
894 * a page table.
895 */
896 if (is_huge_zero_pmd(pmd)) {
897 unsigned long zero_pfn;
898 bool set;
899 /*
900 * get_huge_zero_page() will never allocate a new page here,
901 * since we already have a zero page to copy. It just takes a
902 * reference.
903 */
904 zero_pfn = get_huge_zero_page();
905 set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
906 zero_pfn);
907 BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
908 ret = 0;
909 goto out_unlock;
910 }
751 if (unlikely(pmd_trans_splitting(pmd))) { 911 if (unlikely(pmd_trans_splitting(pmd))) {
752 /* split huge page running from under us */ 912 /* split huge page running from under us */
753 spin_unlock(&src_mm->page_table_lock); 913 spin_unlock(&src_mm->page_table_lock);
@@ -777,6 +937,102 @@ out:
777 return ret; 937 return ret;
778} 938}
779 939
940void huge_pmd_set_accessed(struct mm_struct *mm,
941 struct vm_area_struct *vma,
942 unsigned long address,
943 pmd_t *pmd, pmd_t orig_pmd,
944 int dirty)
945{
946 pmd_t entry;
947 unsigned long haddr;
948
949 spin_lock(&mm->page_table_lock);
950 if (unlikely(!pmd_same(*pmd, orig_pmd)))
951 goto unlock;
952
953 entry = pmd_mkyoung(orig_pmd);
954 haddr = address & HPAGE_PMD_MASK;
955 if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
956 update_mmu_cache_pmd(vma, address, pmd);
957
958unlock:
959 spin_unlock(&mm->page_table_lock);
960}
961
962static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
963 struct vm_area_struct *vma, unsigned long address,
964 pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
965{
966 pgtable_t pgtable;
967 pmd_t _pmd;
968 struct page *page;
969 int i, ret = 0;
970 unsigned long mmun_start; /* For mmu_notifiers */
971 unsigned long mmun_end; /* For mmu_notifiers */
972
973 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
974 if (!page) {
975 ret |= VM_FAULT_OOM;
976 goto out;
977 }
978
979 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
980 put_page(page);
981 ret |= VM_FAULT_OOM;
982 goto out;
983 }
984
985 clear_user_highpage(page, address);
986 __SetPageUptodate(page);
987
988 mmun_start = haddr;
989 mmun_end = haddr + HPAGE_PMD_SIZE;
990 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
991
992 spin_lock(&mm->page_table_lock);
993 if (unlikely(!pmd_same(*pmd, orig_pmd)))
994 goto out_free_page;
995
996 pmdp_clear_flush(vma, haddr, pmd);
997 /* leave pmd empty until pte is filled */
998
999 pgtable = pgtable_trans_huge_withdraw(mm);
1000 pmd_populate(mm, &_pmd, pgtable);
1001
1002 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1003 pte_t *pte, entry;
1004 if (haddr == (address & PAGE_MASK)) {
1005 entry = mk_pte(page, vma->vm_page_prot);
1006 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1007 page_add_new_anon_rmap(page, vma, haddr);
1008 } else {
1009 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
1010 entry = pte_mkspecial(entry);
1011 }
1012 pte = pte_offset_map(&_pmd, haddr);
1013 VM_BUG_ON(!pte_none(*pte));
1014 set_pte_at(mm, haddr, pte, entry);
1015 pte_unmap(pte);
1016 }
1017 smp_wmb(); /* make pte visible before pmd */
1018 pmd_populate(mm, pmd, pgtable);
1019 spin_unlock(&mm->page_table_lock);
1020 put_huge_zero_page();
1021 inc_mm_counter(mm, MM_ANONPAGES);
1022
1023 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1024
1025 ret |= VM_FAULT_WRITE;
1026out:
1027 return ret;
1028out_free_page:
1029 spin_unlock(&mm->page_table_lock);
1030 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1031 mem_cgroup_uncharge_page(page);
1032 put_page(page);
1033 goto out;
1034}
1035
780static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 1036static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
781 struct vm_area_struct *vma, 1037 struct vm_area_struct *vma,
782 unsigned long address, 1038 unsigned long address,
@@ -883,19 +1139,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
883 unsigned long address, pmd_t *pmd, pmd_t orig_pmd) 1139 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
884{ 1140{
885 int ret = 0; 1141 int ret = 0;
886 struct page *page, *new_page; 1142 struct page *page = NULL, *new_page;
887 unsigned long haddr; 1143 unsigned long haddr;
888 unsigned long mmun_start; /* For mmu_notifiers */ 1144 unsigned long mmun_start; /* For mmu_notifiers */
889 unsigned long mmun_end; /* For mmu_notifiers */ 1145 unsigned long mmun_end; /* For mmu_notifiers */
890 1146
891 VM_BUG_ON(!vma->anon_vma); 1147 VM_BUG_ON(!vma->anon_vma);
1148 haddr = address & HPAGE_PMD_MASK;
1149 if (is_huge_zero_pmd(orig_pmd))
1150 goto alloc;
892 spin_lock(&mm->page_table_lock); 1151 spin_lock(&mm->page_table_lock);
893 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1152 if (unlikely(!pmd_same(*pmd, orig_pmd)))
894 goto out_unlock; 1153 goto out_unlock;
895 1154
896 page = pmd_page(orig_pmd); 1155 page = pmd_page(orig_pmd);
897 VM_BUG_ON(!PageCompound(page) || !PageHead(page)); 1156 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
898 haddr = address & HPAGE_PMD_MASK;
899 if (page_mapcount(page) == 1) { 1157 if (page_mapcount(page) == 1) {
900 pmd_t entry; 1158 pmd_t entry;
901 entry = pmd_mkyoung(orig_pmd); 1159 entry = pmd_mkyoung(orig_pmd);
@@ -907,7 +1165,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
907 } 1165 }
908 get_page(page); 1166 get_page(page);
909 spin_unlock(&mm->page_table_lock); 1167 spin_unlock(&mm->page_table_lock);
910 1168alloc:
911 if (transparent_hugepage_enabled(vma) && 1169 if (transparent_hugepage_enabled(vma) &&
912 !transparent_hugepage_debug_cow()) 1170 !transparent_hugepage_debug_cow())
913 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 1171 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -917,24 +1175,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
917 1175
918 if (unlikely(!new_page)) { 1176 if (unlikely(!new_page)) {
919 count_vm_event(THP_FAULT_FALLBACK); 1177 count_vm_event(THP_FAULT_FALLBACK);
920 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 1178 if (is_huge_zero_pmd(orig_pmd)) {
921 pmd, orig_pmd, page, haddr); 1179 ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
922 if (ret & VM_FAULT_OOM) 1180 address, pmd, orig_pmd, haddr);
923 split_huge_page(page); 1181 } else {
924 put_page(page); 1182 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
1183 pmd, orig_pmd, page, haddr);
1184 if (ret & VM_FAULT_OOM)
1185 split_huge_page(page);
1186 put_page(page);
1187 }
925 goto out; 1188 goto out;
926 } 1189 }
927 count_vm_event(THP_FAULT_ALLOC); 1190 count_vm_event(THP_FAULT_ALLOC);
928 1191
929 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1192 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
930 put_page(new_page); 1193 put_page(new_page);
931 split_huge_page(page); 1194 if (page) {
932 put_page(page); 1195 split_huge_page(page);
1196 put_page(page);
1197 }
933 ret |= VM_FAULT_OOM; 1198 ret |= VM_FAULT_OOM;
934 goto out; 1199 goto out;
935 } 1200 }
936 1201
937 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 1202 if (is_huge_zero_pmd(orig_pmd))
1203 clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
1204 else
1205 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
938 __SetPageUptodate(new_page); 1206 __SetPageUptodate(new_page);
939 1207
940 mmun_start = haddr; 1208 mmun_start = haddr;
@@ -942,7 +1210,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
942 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1210 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
943 1211
944 spin_lock(&mm->page_table_lock); 1212 spin_lock(&mm->page_table_lock);
945 put_page(page); 1213 if (page)
1214 put_page(page);
946 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1215 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
947 spin_unlock(&mm->page_table_lock); 1216 spin_unlock(&mm->page_table_lock);
948 mem_cgroup_uncharge_page(new_page); 1217 mem_cgroup_uncharge_page(new_page);
@@ -950,16 +1219,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
950 goto out_mn; 1219 goto out_mn;
951 } else { 1220 } else {
952 pmd_t entry; 1221 pmd_t entry;
953 VM_BUG_ON(!PageHead(page)); 1222 entry = mk_huge_pmd(new_page, vma);
954 entry = mk_pmd(new_page, vma->vm_page_prot);
955 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
956 entry = pmd_mkhuge(entry);
957 pmdp_clear_flush(vma, haddr, pmd); 1223 pmdp_clear_flush(vma, haddr, pmd);
958 page_add_new_anon_rmap(new_page, vma, haddr); 1224 page_add_new_anon_rmap(new_page, vma, haddr);
959 set_pmd_at(mm, haddr, pmd, entry); 1225 set_pmd_at(mm, haddr, pmd, entry);
960 update_mmu_cache_pmd(vma, address, pmd); 1226 update_mmu_cache_pmd(vma, address, pmd);
961 page_remove_rmap(page); 1227 if (is_huge_zero_pmd(orig_pmd)) {
962 put_page(page); 1228 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
1229 put_huge_zero_page();
1230 } else {
1231 VM_BUG_ON(!PageHead(page));
1232 page_remove_rmap(page);
1233 put_page(page);
1234 }
963 ret |= VM_FAULT_WRITE; 1235 ret |= VM_FAULT_WRITE;
964 } 1236 }
965 spin_unlock(&mm->page_table_lock); 1237 spin_unlock(&mm->page_table_lock);
@@ -1017,6 +1289,81 @@ out:
1017 return page; 1289 return page;
1018} 1290}
1019 1291
1292/* NUMA hinting page fault entry point for trans huge pmds */
1293int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1294 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1295{
1296 struct page *page;
1297 unsigned long haddr = addr & HPAGE_PMD_MASK;
1298 int target_nid;
1299 int current_nid = -1;
1300 bool migrated;
1301 bool page_locked = false;
1302
1303 spin_lock(&mm->page_table_lock);
1304 if (unlikely(!pmd_same(pmd, *pmdp)))
1305 goto out_unlock;
1306
1307 page = pmd_page(pmd);
1308 get_page(page);
1309 current_nid = page_to_nid(page);
1310 count_vm_numa_event(NUMA_HINT_FAULTS);
1311 if (current_nid == numa_node_id())
1312 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1313
1314 target_nid = mpol_misplaced(page, vma, haddr);
1315 if (target_nid == -1) {
1316 put_page(page);
1317 goto clear_pmdnuma;
1318 }
1319
1320 /* Acquire the page lock to serialise THP migrations */
1321 spin_unlock(&mm->page_table_lock);
1322 lock_page(page);
1323 page_locked = true;
1324
1325 /* Confirm the PTE did not while locked */
1326 spin_lock(&mm->page_table_lock);
1327 if (unlikely(!pmd_same(pmd, *pmdp))) {
1328 unlock_page(page);
1329 put_page(page);
1330 goto out_unlock;
1331 }
1332 spin_unlock(&mm->page_table_lock);
1333
1334 /* Migrate the THP to the requested node */
1335 migrated = migrate_misplaced_transhuge_page(mm, vma,
1336 pmdp, pmd, addr,
1337 page, target_nid);
1338 if (migrated)
1339 current_nid = target_nid;
1340 else {
1341 spin_lock(&mm->page_table_lock);
1342 if (unlikely(!pmd_same(pmd, *pmdp))) {
1343 unlock_page(page);
1344 goto out_unlock;
1345 }
1346 goto clear_pmdnuma;
1347 }
1348
1349 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
1350 return 0;
1351
1352clear_pmdnuma:
1353 pmd = pmd_mknonnuma(pmd);
1354 set_pmd_at(mm, haddr, pmdp, pmd);
1355 VM_BUG_ON(pmd_numa(*pmdp));
1356 update_mmu_cache_pmd(vma, addr, pmdp);
1357 if (page_locked)
1358 unlock_page(page);
1359
1360out_unlock:
1361 spin_unlock(&mm->page_table_lock);
1362 if (current_nid != -1)
1363 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
1364 return 0;
1365}
1366
1020int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1367int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1021 pmd_t *pmd, unsigned long addr) 1368 pmd_t *pmd, unsigned long addr)
1022{ 1369{
@@ -1028,15 +1375,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1028 pmd_t orig_pmd; 1375 pmd_t orig_pmd;
1029 pgtable = pgtable_trans_huge_withdraw(tlb->mm); 1376 pgtable = pgtable_trans_huge_withdraw(tlb->mm);
1030 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); 1377 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1031 page = pmd_page(orig_pmd);
1032 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1378 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1033 page_remove_rmap(page); 1379 if (is_huge_zero_pmd(orig_pmd)) {
1034 VM_BUG_ON(page_mapcount(page) < 0); 1380 tlb->mm->nr_ptes--;
1035 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1381 spin_unlock(&tlb->mm->page_table_lock);
1036 VM_BUG_ON(!PageHead(page)); 1382 put_huge_zero_page();
1037 tlb->mm->nr_ptes--; 1383 } else {
1038 spin_unlock(&tlb->mm->page_table_lock); 1384 page = pmd_page(orig_pmd);
1039 tlb_remove_page(tlb, page); 1385 page_remove_rmap(page);
1386 VM_BUG_ON(page_mapcount(page) < 0);
1387 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1388 VM_BUG_ON(!PageHead(page));
1389 tlb->mm->nr_ptes--;
1390 spin_unlock(&tlb->mm->page_table_lock);
1391 tlb_remove_page(tlb, page);
1392 }
1040 pte_free(tlb->mm, pgtable); 1393 pte_free(tlb->mm, pgtable);
1041 ret = 1; 1394 ret = 1;
1042 } 1395 }
@@ -1099,7 +1452,7 @@ out:
1099} 1452}
1100 1453
1101int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1454int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1102 unsigned long addr, pgprot_t newprot) 1455 unsigned long addr, pgprot_t newprot, int prot_numa)
1103{ 1456{
1104 struct mm_struct *mm = vma->vm_mm; 1457 struct mm_struct *mm = vma->vm_mm;
1105 int ret = 0; 1458 int ret = 0;
@@ -1107,7 +1460,18 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1107 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1460 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1108 pmd_t entry; 1461 pmd_t entry;
1109 entry = pmdp_get_and_clear(mm, addr, pmd); 1462 entry = pmdp_get_and_clear(mm, addr, pmd);
1110 entry = pmd_modify(entry, newprot); 1463 if (!prot_numa) {
1464 entry = pmd_modify(entry, newprot);
1465 BUG_ON(pmd_write(entry));
1466 } else {
1467 struct page *page = pmd_page(*pmd);
1468
1469 /* only check non-shared pages */
1470 if (page_mapcount(page) == 1 &&
1471 !pmd_numa(*pmd)) {
1472 entry = pmd_mknuma(entry);
1473 }
1474 }
1111 set_pmd_at(mm, addr, pmd, entry); 1475 set_pmd_at(mm, addr, pmd, entry);
1112 spin_unlock(&vma->vm_mm->page_table_lock); 1476 spin_unlock(&vma->vm_mm->page_table_lock);
1113 ret = 1; 1477 ret = 1;
@@ -1146,22 +1510,14 @@ pmd_t *page_check_address_pmd(struct page *page,
1146 unsigned long address, 1510 unsigned long address,
1147 enum page_check_address_pmd_flag flag) 1511 enum page_check_address_pmd_flag flag)
1148{ 1512{
1149 pgd_t *pgd;
1150 pud_t *pud;
1151 pmd_t *pmd, *ret = NULL; 1513 pmd_t *pmd, *ret = NULL;
1152 1514
1153 if (address & ~HPAGE_PMD_MASK) 1515 if (address & ~HPAGE_PMD_MASK)
1154 goto out; 1516 goto out;
1155 1517
1156 pgd = pgd_offset(mm, address); 1518 pmd = mm_find_pmd(mm, address);
1157 if (!pgd_present(*pgd)) 1519 if (!pmd)
1158 goto out; 1520 goto out;
1159
1160 pud = pud_offset(pgd, address);
1161 if (!pud_present(*pud))
1162 goto out;
1163
1164 pmd = pmd_offset(pud, address);
1165 if (pmd_none(*pmd)) 1521 if (pmd_none(*pmd))
1166 goto out; 1522 goto out;
1167 if (pmd_page(*pmd) != page) 1523 if (pmd_page(*pmd) != page)
@@ -1205,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page,
1205 * We can't temporarily set the pmd to null in order 1561 * We can't temporarily set the pmd to null in order
1206 * to split it, the pmd must remain marked huge at all 1562 * to split it, the pmd must remain marked huge at all
1207 * times or the VM won't take the pmd_trans_huge paths 1563 * times or the VM won't take the pmd_trans_huge paths
1208 * and it won't wait on the anon_vma->root->mutex to 1564 * and it won't wait on the anon_vma->root->rwsem to
1209 * serialize against split_huge_page*. 1565 * serialize against split_huge_page*.
1210 */ 1566 */
1211 pmdp_splitting_flush(vma, address, pmd); 1567 pmdp_splitting_flush(vma, address, pmd);
@@ -1296,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page)
1296 page_tail->mapping = page->mapping; 1652 page_tail->mapping = page->mapping;
1297 1653
1298 page_tail->index = page->index + i; 1654 page_tail->index = page->index + i;
1655 page_xchg_last_nid(page_tail, page_last_nid(page));
1299 1656
1300 BUG_ON(!PageAnon(page_tail)); 1657 BUG_ON(!PageAnon(page_tail));
1301 BUG_ON(!PageUptodate(page_tail)); 1658 BUG_ON(!PageUptodate(page_tail));
@@ -1363,6 +1720,8 @@ static int __split_huge_page_map(struct page *page,
1363 BUG_ON(page_mapcount(page) != 1); 1720 BUG_ON(page_mapcount(page) != 1);
1364 if (!pmd_young(*pmd)) 1721 if (!pmd_young(*pmd))
1365 entry = pte_mkold(entry); 1722 entry = pte_mkold(entry);
1723 if (pmd_numa(*pmd))
1724 entry = pte_mknuma(entry);
1366 pte = pte_offset_map(&_pmd, haddr); 1725 pte = pte_offset_map(&_pmd, haddr);
1367 BUG_ON(!pte_none(*pte)); 1726 BUG_ON(!pte_none(*pte));
1368 set_pte_at(mm, haddr, pte, entry); 1727 set_pte_at(mm, haddr, pte, entry);
@@ -1405,7 +1764,7 @@ static int __split_huge_page_map(struct page *page,
1405 return ret; 1764 return ret;
1406} 1765}
1407 1766
1408/* must be called with anon_vma->root->mutex hold */ 1767/* must be called with anon_vma->root->rwsem held */
1409static void __split_huge_page(struct page *page, 1768static void __split_huge_page(struct page *page,
1410 struct anon_vma *anon_vma) 1769 struct anon_vma *anon_vma)
1411{ 1770{
@@ -1458,10 +1817,21 @@ int split_huge_page(struct page *page)
1458 struct anon_vma *anon_vma; 1817 struct anon_vma *anon_vma;
1459 int ret = 1; 1818 int ret = 1;
1460 1819
1820 BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
1461 BUG_ON(!PageAnon(page)); 1821 BUG_ON(!PageAnon(page));
1462 anon_vma = page_lock_anon_vma(page); 1822
1823 /*
1824 * The caller does not necessarily hold an mmap_sem that would prevent
1825 * the anon_vma disappearing so we first we take a reference to it
1826 * and then lock the anon_vma for write. This is similar to
1827 * page_lock_anon_vma_read except the write lock is taken to serialise
1828 * against parallel split or collapse operations.
1829 */
1830 anon_vma = page_get_anon_vma(page);
1463 if (!anon_vma) 1831 if (!anon_vma)
1464 goto out; 1832 goto out;
1833 anon_vma_lock_write(anon_vma);
1834
1465 ret = 0; 1835 ret = 0;
1466 if (!PageCompound(page)) 1836 if (!PageCompound(page))
1467 goto out_unlock; 1837 goto out_unlock;
@@ -1472,7 +1842,8 @@ int split_huge_page(struct page *page)
1472 1842
1473 BUG_ON(PageCompound(page)); 1843 BUG_ON(PageCompound(page));
1474out_unlock: 1844out_unlock:
1475 page_unlock_anon_vma(anon_vma); 1845 anon_vma_unlock(anon_vma);
1846 put_anon_vma(anon_vma);
1476out: 1847out:
1477 return ret; 1848 return ret;
1478} 1849}
@@ -1701,64 +2072,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
1701 } 2072 }
1702} 2073}
1703 2074
1704static void release_all_pte_pages(pte_t *pte)
1705{
1706 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1707}
1708
1709static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 2075static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1710 unsigned long address, 2076 unsigned long address,
1711 pte_t *pte) 2077 pte_t *pte)
1712{ 2078{
1713 struct page *page; 2079 struct page *page;
1714 pte_t *_pte; 2080 pte_t *_pte;
1715 int referenced = 0, isolated = 0, none = 0; 2081 int referenced = 0, none = 0;
1716 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2082 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1717 _pte++, address += PAGE_SIZE) { 2083 _pte++, address += PAGE_SIZE) {
1718 pte_t pteval = *_pte; 2084 pte_t pteval = *_pte;
1719 if (pte_none(pteval)) { 2085 if (pte_none(pteval)) {
1720 if (++none <= khugepaged_max_ptes_none) 2086 if (++none <= khugepaged_max_ptes_none)
1721 continue; 2087 continue;
1722 else { 2088 else
1723 release_pte_pages(pte, _pte);
1724 goto out; 2089 goto out;
1725 }
1726 } 2090 }
1727 if (!pte_present(pteval) || !pte_write(pteval)) { 2091 if (!pte_present(pteval) || !pte_write(pteval))
1728 release_pte_pages(pte, _pte);
1729 goto out; 2092 goto out;
1730 }
1731 page = vm_normal_page(vma, address, pteval); 2093 page = vm_normal_page(vma, address, pteval);
1732 if (unlikely(!page)) { 2094 if (unlikely(!page))
1733 release_pte_pages(pte, _pte);
1734 goto out; 2095 goto out;
1735 } 2096
1736 VM_BUG_ON(PageCompound(page)); 2097 VM_BUG_ON(PageCompound(page));
1737 BUG_ON(!PageAnon(page)); 2098 BUG_ON(!PageAnon(page));
1738 VM_BUG_ON(!PageSwapBacked(page)); 2099 VM_BUG_ON(!PageSwapBacked(page));
1739 2100
1740 /* cannot use mapcount: can't collapse if there's a gup pin */ 2101 /* cannot use mapcount: can't collapse if there's a gup pin */
1741 if (page_count(page) != 1) { 2102 if (page_count(page) != 1)
1742 release_pte_pages(pte, _pte);
1743 goto out; 2103 goto out;
1744 }
1745 /* 2104 /*
1746 * We can do it before isolate_lru_page because the 2105 * We can do it before isolate_lru_page because the
1747 * page can't be freed from under us. NOTE: PG_lock 2106 * page can't be freed from under us. NOTE: PG_lock
1748 * is needed to serialize against split_huge_page 2107 * is needed to serialize against split_huge_page
1749 * when invoked from the VM. 2108 * when invoked from the VM.
1750 */ 2109 */
1751 if (!trylock_page(page)) { 2110 if (!trylock_page(page))
1752 release_pte_pages(pte, _pte);
1753 goto out; 2111 goto out;
1754 }
1755 /* 2112 /*
1756 * Isolate the page to avoid collapsing an hugepage 2113 * Isolate the page to avoid collapsing an hugepage
1757 * currently in use by the VM. 2114 * currently in use by the VM.
1758 */ 2115 */
1759 if (isolate_lru_page(page)) { 2116 if (isolate_lru_page(page)) {
1760 unlock_page(page); 2117 unlock_page(page);
1761 release_pte_pages(pte, _pte);
1762 goto out; 2118 goto out;
1763 } 2119 }
1764 /* 0 stands for page_is_file_cache(page) == false */ 2120 /* 0 stands for page_is_file_cache(page) == false */
@@ -1771,12 +2127,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1771 mmu_notifier_test_young(vma->vm_mm, address)) 2127 mmu_notifier_test_young(vma->vm_mm, address))
1772 referenced = 1; 2128 referenced = 1;
1773 } 2129 }
1774 if (unlikely(!referenced)) 2130 if (likely(referenced))
1775 release_all_pte_pages(pte); 2131 return 1;
1776 else
1777 isolated = 1;
1778out: 2132out:
1779 return isolated; 2133 release_pte_pages(pte, _pte);
2134 return 0;
1780} 2135}
1781 2136
1782static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 2137static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -1918,14 +2273,26 @@ static struct page
1918} 2273}
1919#endif 2274#endif
1920 2275
2276static bool hugepage_vma_check(struct vm_area_struct *vma)
2277{
2278 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
2279 (vma->vm_flags & VM_NOHUGEPAGE))
2280 return false;
2281
2282 if (!vma->anon_vma || vma->vm_ops)
2283 return false;
2284 if (is_vma_temporary_stack(vma))
2285 return false;
2286 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2287 return true;
2288}
2289
1921static void collapse_huge_page(struct mm_struct *mm, 2290static void collapse_huge_page(struct mm_struct *mm,
1922 unsigned long address, 2291 unsigned long address,
1923 struct page **hpage, 2292 struct page **hpage,
1924 struct vm_area_struct *vma, 2293 struct vm_area_struct *vma,
1925 int node) 2294 int node)
1926{ 2295{
1927 pgd_t *pgd;
1928 pud_t *pud;
1929 pmd_t *pmd, _pmd; 2296 pmd_t *pmd, _pmd;
1930 pte_t *pte; 2297 pte_t *pte;
1931 pgtable_t pgtable; 2298 pgtable_t pgtable;
@@ -1960,31 +2327,15 @@ static void collapse_huge_page(struct mm_struct *mm,
1960 hend = vma->vm_end & HPAGE_PMD_MASK; 2327 hend = vma->vm_end & HPAGE_PMD_MASK;
1961 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 2328 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1962 goto out; 2329 goto out;
1963 2330 if (!hugepage_vma_check(vma))
1964 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1965 (vma->vm_flags & VM_NOHUGEPAGE))
1966 goto out;
1967
1968 if (!vma->anon_vma || vma->vm_ops)
1969 goto out;
1970 if (is_vma_temporary_stack(vma))
1971 goto out; 2331 goto out;
1972 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 2332 pmd = mm_find_pmd(mm, address);
1973 2333 if (!pmd)
1974 pgd = pgd_offset(mm, address);
1975 if (!pgd_present(*pgd))
1976 goto out;
1977
1978 pud = pud_offset(pgd, address);
1979 if (!pud_present(*pud))
1980 goto out; 2334 goto out;
1981 2335 if (pmd_trans_huge(*pmd))
1982 pmd = pmd_offset(pud, address);
1983 /* pmd can't go away or become huge under us */
1984 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1985 goto out; 2336 goto out;
1986 2337
1987 anon_vma_lock(vma->anon_vma); 2338 anon_vma_lock_write(vma->anon_vma);
1988 2339
1989 pte = pte_offset_map(pmd, address); 2340 pte = pte_offset_map(pmd, address);
1990 ptl = pte_lockptr(mm, pmd); 2341 ptl = pte_lockptr(mm, pmd);
@@ -2028,9 +2379,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2028 __SetPageUptodate(new_page); 2379 __SetPageUptodate(new_page);
2029 pgtable = pmd_pgtable(_pmd); 2380 pgtable = pmd_pgtable(_pmd);
2030 2381
2031 _pmd = mk_pmd(new_page, vma->vm_page_prot); 2382 _pmd = mk_huge_pmd(new_page, vma);
2032 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
2033 _pmd = pmd_mkhuge(_pmd);
2034 2383
2035 /* 2384 /*
2036 * spin_lock() below is not the equivalent of smp_wmb(), so 2385 * spin_lock() below is not the equivalent of smp_wmb(), so
@@ -2064,8 +2413,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2064 unsigned long address, 2413 unsigned long address,
2065 struct page **hpage) 2414 struct page **hpage)
2066{ 2415{
2067 pgd_t *pgd;
2068 pud_t *pud;
2069 pmd_t *pmd; 2416 pmd_t *pmd;
2070 pte_t *pte, *_pte; 2417 pte_t *pte, *_pte;
2071 int ret = 0, referenced = 0, none = 0; 2418 int ret = 0, referenced = 0, none = 0;
@@ -2076,16 +2423,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2076 2423
2077 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2424 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2078 2425
2079 pgd = pgd_offset(mm, address); 2426 pmd = mm_find_pmd(mm, address);
2080 if (!pgd_present(*pgd)) 2427 if (!pmd)
2081 goto out; 2428 goto out;
2082 2429 if (pmd_trans_huge(*pmd))
2083 pud = pud_offset(pgd, address);
2084 if (!pud_present(*pud))
2085 goto out;
2086
2087 pmd = pmd_offset(pud, address);
2088 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
2089 goto out; 2430 goto out;
2090 2431
2091 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2432 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2193,20 +2534,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2193 progress++; 2534 progress++;
2194 break; 2535 break;
2195 } 2536 }
2196 2537 if (!hugepage_vma_check(vma)) {
2197 if ((!(vma->vm_flags & VM_HUGEPAGE) && 2538skip:
2198 !khugepaged_always()) ||
2199 (vma->vm_flags & VM_NOHUGEPAGE)) {
2200 skip:
2201 progress++; 2539 progress++;
2202 continue; 2540 continue;
2203 } 2541 }
2204 if (!vma->anon_vma || vma->vm_ops)
2205 goto skip;
2206 if (is_vma_temporary_stack(vma))
2207 goto skip;
2208 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2209
2210 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2542 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2211 hend = vma->vm_end & HPAGE_PMD_MASK; 2543 hend = vma->vm_end & HPAGE_PMD_MASK;
2212 if (hstart >= hend) 2544 if (hstart >= hend)
@@ -2356,19 +2688,65 @@ static int khugepaged(void *none)
2356 return 0; 2688 return 0;
2357} 2689}
2358 2690
2359void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) 2691static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2692 unsigned long haddr, pmd_t *pmd)
2693{
2694 struct mm_struct *mm = vma->vm_mm;
2695 pgtable_t pgtable;
2696 pmd_t _pmd;
2697 int i;
2698
2699 pmdp_clear_flush(vma, haddr, pmd);
2700 /* leave pmd empty until pte is filled */
2701
2702 pgtable = pgtable_trans_huge_withdraw(mm);
2703 pmd_populate(mm, &_pmd, pgtable);
2704
2705 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2706 pte_t *pte, entry;
2707 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2708 entry = pte_mkspecial(entry);
2709 pte = pte_offset_map(&_pmd, haddr);
2710 VM_BUG_ON(!pte_none(*pte));
2711 set_pte_at(mm, haddr, pte, entry);
2712 pte_unmap(pte);
2713 }
2714 smp_wmb(); /* make pte visible before pmd */
2715 pmd_populate(mm, pmd, pgtable);
2716 put_huge_zero_page();
2717}
2718
2719void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2720 pmd_t *pmd)
2360{ 2721{
2361 struct page *page; 2722 struct page *page;
2723 struct mm_struct *mm = vma->vm_mm;
2724 unsigned long haddr = address & HPAGE_PMD_MASK;
2725 unsigned long mmun_start; /* For mmu_notifiers */
2726 unsigned long mmun_end; /* For mmu_notifiers */
2727
2728 BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
2362 2729
2730 mmun_start = haddr;
2731 mmun_end = haddr + HPAGE_PMD_SIZE;
2732 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2363 spin_lock(&mm->page_table_lock); 2733 spin_lock(&mm->page_table_lock);
2364 if (unlikely(!pmd_trans_huge(*pmd))) { 2734 if (unlikely(!pmd_trans_huge(*pmd))) {
2365 spin_unlock(&mm->page_table_lock); 2735 spin_unlock(&mm->page_table_lock);
2736 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2737 return;
2738 }
2739 if (is_huge_zero_pmd(*pmd)) {
2740 __split_huge_zero_page_pmd(vma, haddr, pmd);
2741 spin_unlock(&mm->page_table_lock);
2742 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2366 return; 2743 return;
2367 } 2744 }
2368 page = pmd_page(*pmd); 2745 page = pmd_page(*pmd);
2369 VM_BUG_ON(!page_count(page)); 2746 VM_BUG_ON(!page_count(page));
2370 get_page(page); 2747 get_page(page);
2371 spin_unlock(&mm->page_table_lock); 2748 spin_unlock(&mm->page_table_lock);
2749 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2372 2750
2373 split_huge_page(page); 2751 split_huge_page(page);
2374 2752
@@ -2376,31 +2754,31 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2376 BUG_ON(pmd_trans_huge(*pmd)); 2754 BUG_ON(pmd_trans_huge(*pmd));
2377} 2755}
2378 2756
2757void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
2758 pmd_t *pmd)
2759{
2760 struct vm_area_struct *vma;
2761
2762 vma = find_vma(mm, address);
2763 BUG_ON(vma == NULL);
2764 split_huge_page_pmd(vma, address, pmd);
2765}
2766
2379static void split_huge_page_address(struct mm_struct *mm, 2767static void split_huge_page_address(struct mm_struct *mm,
2380 unsigned long address) 2768 unsigned long address)
2381{ 2769{
2382 pgd_t *pgd;
2383 pud_t *pud;
2384 pmd_t *pmd; 2770 pmd_t *pmd;
2385 2771
2386 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2772 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2387 2773
2388 pgd = pgd_offset(mm, address); 2774 pmd = mm_find_pmd(mm, address);
2389 if (!pgd_present(*pgd)) 2775 if (!pmd)
2390 return;
2391
2392 pud = pud_offset(pgd, address);
2393 if (!pud_present(*pud))
2394 return;
2395
2396 pmd = pmd_offset(pud, address);
2397 if (!pmd_present(*pmd))
2398 return; 2776 return;
2399 /* 2777 /*
2400 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2778 * Caller holds the mmap_sem write mode, so a huge pmd cannot
2401 * materialize from under us. 2779 * materialize from under us.
2402 */ 2780 */
2403 split_huge_page_pmd(mm, pmd); 2781 split_huge_page_pmd_mm(mm, address, pmd);
2404} 2782}
2405 2783
2406void __vma_adjust_trans_huge(struct vm_area_struct *vma, 2784void __vma_adjust_trans_huge(struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 59a0059b39e2..4f3ea0b1e57c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Generic hugetlb support. 2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004 3 * (C) Nadia Yvette Chambers, April 2004
4 */ 4 */
5#include <linux/list.h> 5#include <linux/list.h>
6#include <linux/init.h> 6#include <linux/init.h>
@@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h,
1057 * on-line nodes with memory and will handle the hstate accounting. 1057 * on-line nodes with memory and will handle the hstate accounting.
1058 */ 1058 */
1059 while (nr_pages--) { 1059 while (nr_pages--) {
1060 if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) 1060 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1061 break; 1061 break;
1062 } 1062 }
1063} 1063}
@@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1180int __weak alloc_bootmem_huge_page(struct hstate *h) 1180int __weak alloc_bootmem_huge_page(struct hstate *h)
1181{ 1181{
1182 struct huge_bootmem_page *m; 1182 struct huge_bootmem_page *m;
1183 int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 1183 int nr_nodes = nodes_weight(node_states[N_MEMORY]);
1184 1184
1185 while (nr_nodes) { 1185 while (nr_nodes) {
1186 void *addr; 1186 void *addr;
1187 1187
1188 addr = __alloc_bootmem_node_nopanic( 1188 addr = __alloc_bootmem_node_nopanic(
1189 NODE_DATA(hstate_next_node_to_alloc(h, 1189 NODE_DATA(hstate_next_node_to_alloc(h,
1190 &node_states[N_HIGH_MEMORY])), 1190 &node_states[N_MEMORY])),
1191 huge_page_size(h), huge_page_size(h), 0); 1191 huge_page_size(h), huge_page_size(h), 0);
1192 1192
1193 if (addr) { 1193 if (addr) {
@@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1259 if (!alloc_bootmem_huge_page(h)) 1259 if (!alloc_bootmem_huge_page(h))
1260 break; 1260 break;
1261 } else if (!alloc_fresh_huge_page(h, 1261 } else if (!alloc_fresh_huge_page(h,
1262 &node_states[N_HIGH_MEMORY])) 1262 &node_states[N_MEMORY]))
1263 break; 1263 break;
1264 } 1264 }
1265 h->max_huge_pages = i; 1265 h->max_huge_pages = i;
@@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1527 if (!(obey_mempolicy && 1527 if (!(obey_mempolicy &&
1528 init_nodemask_of_mempolicy(nodes_allowed))) { 1528 init_nodemask_of_mempolicy(nodes_allowed))) {
1529 NODEMASK_FREE(nodes_allowed); 1529 NODEMASK_FREE(nodes_allowed);
1530 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1530 nodes_allowed = &node_states[N_MEMORY];
1531 } 1531 }
1532 } else if (nodes_allowed) { 1532 } else if (nodes_allowed) {
1533 /* 1533 /*
@@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1537 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 1537 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1538 init_nodemask_of_node(nodes_allowed, nid); 1538 init_nodemask_of_node(nodes_allowed, nid);
1539 } else 1539 } else
1540 nodes_allowed = &node_states[N_HIGH_MEMORY]; 1540 nodes_allowed = &node_states[N_MEMORY];
1541 1541
1542 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); 1542 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1543 1543
1544 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1544 if (nodes_allowed != &node_states[N_MEMORY])
1545 NODEMASK_FREE(nodes_allowed); 1545 NODEMASK_FREE(nodes_allowed);
1546 1546
1547 return len; 1547 return len;
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void)
1800 * remove hstate attributes from any nodes that have them. 1800 * remove hstate attributes from any nodes that have them.
1801 */ 1801 */
1802 for (nid = 0; nid < nr_node_ids; nid++) 1802 for (nid = 0; nid < nr_node_ids; nid++)
1803 hugetlb_unregister_node(&node_devices[nid]); 1803 hugetlb_unregister_node(node_devices[nid]);
1804} 1804}
1805 1805
1806/* 1806/*
@@ -1844,8 +1844,8 @@ static void hugetlb_register_all_nodes(void)
1844{ 1844{
1845 int nid; 1845 int nid;
1846 1846
1847 for_each_node_state(nid, N_HIGH_MEMORY) { 1847 for_each_node_state(nid, N_MEMORY) {
1848 struct node *node = &node_devices[nid]; 1848 struct node *node = node_devices[nid];
1849 if (node->dev.id == nid) 1849 if (node->dev.id == nid)
1850 hugetlb_register_node(node); 1850 hugetlb_register_node(node);
1851 } 1851 }
@@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void)
1906 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1906 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1907 1907
1908 hugetlb_init_hstates(); 1908 hugetlb_init_hstates();
1909
1910 gather_bootmem_prealloc(); 1909 gather_bootmem_prealloc();
1911
1912 report_hugepages(); 1910 report_hugepages();
1913 1911
1914 hugetlb_sysfs_init(); 1912 hugetlb_sysfs_init();
1915
1916 hugetlb_register_all_nodes(); 1913 hugetlb_register_all_nodes();
1914 hugetlb_cgroup_file_init();
1917 1915
1918 return 0; 1916 return 0;
1919} 1917}
@@ -1939,17 +1937,10 @@ void __init hugetlb_add_hstate(unsigned order)
1939 for (i = 0; i < MAX_NUMNODES; ++i) 1937 for (i = 0; i < MAX_NUMNODES; ++i)
1940 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1938 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1941 INIT_LIST_HEAD(&h->hugepage_activelist); 1939 INIT_LIST_HEAD(&h->hugepage_activelist);
1942 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 1940 h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
1943 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1941 h->next_nid_to_free = first_node(node_states[N_MEMORY]);
1944 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1942 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1945 huge_page_size(h)/1024); 1943 huge_page_size(h)/1024);
1946 /*
1947 * Add cgroup control files only if the huge page consists
1948 * of more than two normal pages. This is because we use
1949 * page[2].lru.next for storing cgoup details.
1950 */
1951 if (order >= HUGETLB_CGROUP_MIN_ORDER)
1952 hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
1953 1944
1954 parsed_hstate = h; 1945 parsed_hstate = h;
1955} 1946}
@@ -2035,11 +2026,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2035 if (!(obey_mempolicy && 2026 if (!(obey_mempolicy &&
2036 init_nodemask_of_mempolicy(nodes_allowed))) { 2027 init_nodemask_of_mempolicy(nodes_allowed))) {
2037 NODEMASK_FREE(nodes_allowed); 2028 NODEMASK_FREE(nodes_allowed);
2038 nodes_allowed = &node_states[N_HIGH_MEMORY]; 2029 nodes_allowed = &node_states[N_MEMORY];
2039 } 2030 }
2040 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); 2031 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
2041 2032
2042 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 2033 if (nodes_allowed != &node_states[N_MEMORY])
2043 NODEMASK_FREE(nodes_allowed); 2034 NODEMASK_FREE(nodes_allowed);
2044 } 2035 }
2045out: 2036out:
@@ -2386,8 +2377,10 @@ again:
2386 /* 2377 /*
2387 * HWPoisoned hugepage is already unmapped and dropped reference 2378 * HWPoisoned hugepage is already unmapped and dropped reference
2388 */ 2379 */
2389 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) 2380 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
2381 pte_clear(mm, address, ptep);
2390 continue; 2382 continue;
2383 }
2391 2384
2392 page = pte_page(pte); 2385 page = pte_page(pte);
2393 /* 2386 /*
@@ -3014,7 +3007,7 @@ same_page:
3014 return i ? i : -EFAULT; 3007 return i ? i : -EFAULT;
3015} 3008}
3016 3009
3017void hugetlb_change_protection(struct vm_area_struct *vma, 3010unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3018 unsigned long address, unsigned long end, pgprot_t newprot) 3011 unsigned long address, unsigned long end, pgprot_t newprot)
3019{ 3012{
3020 struct mm_struct *mm = vma->vm_mm; 3013 struct mm_struct *mm = vma->vm_mm;
@@ -3022,6 +3015,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3022 pte_t *ptep; 3015 pte_t *ptep;
3023 pte_t pte; 3016 pte_t pte;
3024 struct hstate *h = hstate_vma(vma); 3017 struct hstate *h = hstate_vma(vma);
3018 unsigned long pages = 0;
3025 3019
3026 BUG_ON(address >= end); 3020 BUG_ON(address >= end);
3027 flush_cache_range(vma, address, end); 3021 flush_cache_range(vma, address, end);
@@ -3032,12 +3026,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3032 ptep = huge_pte_offset(mm, address); 3026 ptep = huge_pte_offset(mm, address);
3033 if (!ptep) 3027 if (!ptep)
3034 continue; 3028 continue;
3035 if (huge_pmd_unshare(mm, &address, ptep)) 3029 if (huge_pmd_unshare(mm, &address, ptep)) {
3030 pages++;
3036 continue; 3031 continue;
3032 }
3037 if (!huge_pte_none(huge_ptep_get(ptep))) { 3033 if (!huge_pte_none(huge_ptep_get(ptep))) {
3038 pte = huge_ptep_get_and_clear(mm, address, ptep); 3034 pte = huge_ptep_get_and_clear(mm, address, ptep);
3039 pte = pte_mkhuge(pte_modify(pte, newprot)); 3035 pte = pte_mkhuge(pte_modify(pte, newprot));
3040 set_huge_pte_at(mm, address, ptep, pte); 3036 set_huge_pte_at(mm, address, ptep, pte);
3037 pages++;
3041 } 3038 }
3042 } 3039 }
3043 spin_unlock(&mm->page_table_lock); 3040 spin_unlock(&mm->page_table_lock);
@@ -3049,6 +3046,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
3049 */ 3046 */
3050 flush_tlb_range(vma, start, end); 3047 flush_tlb_range(vma, start, end);
3051 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3048 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
3049
3050 return pages << h->order;
3052} 3051}
3053 3052
3054int hugetlb_reserve_pages(struct inode *inode, 3053int hugetlb_reserve_pages(struct inode *inode,
@@ -3170,7 +3169,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3170 3169
3171 spin_lock(&hugetlb_lock); 3170 spin_lock(&hugetlb_lock);
3172 if (is_hugepage_on_freelist(hpage)) { 3171 if (is_hugepage_on_freelist(hpage)) {
3173 list_del(&hpage->lru); 3172 /*
3173 * Hwpoisoned hugepage isn't linked to activelist or freelist,
3174 * but dangling hpage->lru can trigger list-debug warnings
3175 * (this happens when we call unpoison_memory() on it),
3176 * so let it point to itself with list_del_init().
3177 */
3178 list_del_init(&hpage->lru);
3174 set_page_refcounted(hpage); 3179 set_page_refcounted(hpage);
3175 h->free_huge_pages--; 3180 h->free_huge_pages--;
3176 h->free_huge_pages_node[nid]--; 3181 h->free_huge_pages_node[nid]--;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a3f358fb8a0c..9cea7de22ffb 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
77 return false; 77 return false;
78} 78}
79 79
80static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) 80static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup)
81{ 81{
82 int idx; 82 int idx;
83 struct cgroup *parent_cgroup; 83 struct cgroup *parent_cgroup;
@@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
101 return &h_cgroup->css; 101 return &h_cgroup->css;
102} 102}
103 103
104static void hugetlb_cgroup_destroy(struct cgroup *cgroup) 104static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
105{ 105{
106 struct hugetlb_cgroup *h_cgroup; 106 struct hugetlb_cgroup *h_cgroup;
107 107
@@ -155,18 +155,13 @@ out:
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to 155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup. 156 * the parent cgroup.
157 */ 157 */
158static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) 158static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
159{ 159{
160 struct hstate *h; 160 struct hstate *h;
161 struct page *page; 161 struct page *page;
162 int ret = 0, idx = 0; 162 int idx = 0;
163 163
164 do { 164 do {
165 if (cgroup_task_count(cgroup) ||
166 !list_empty(&cgroup->children)) {
167 ret = -EBUSY;
168 goto out;
169 }
170 for_each_hstate(h) { 165 for_each_hstate(h) {
171 spin_lock(&hugetlb_lock); 166 spin_lock(&hugetlb_lock);
172 list_for_each_entry(page, &h->hugepage_activelist, lru) 167 list_for_each_entry(page, &h->hugepage_activelist, lru)
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
177 } 172 }
178 cond_resched(); 173 cond_resched();
179 } while (hugetlb_cgroup_have_usage(cgroup)); 174 } while (hugetlb_cgroup_have_usage(cgroup));
180out:
181 return ret;
182} 175}
183 176
184int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 177int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
@@ -340,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize)
340 return buf; 333 return buf;
341} 334}
342 335
343int __init hugetlb_cgroup_file_init(int idx) 336static void __init __hugetlb_cgroup_file_init(int idx)
344{ 337{
345 char buf[32]; 338 char buf[32];
346 struct cftype *cft; 339 struct cftype *cft;
@@ -382,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx)
382 375
383 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); 376 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
384 377
385 return 0; 378 return;
379}
380
381void __init hugetlb_cgroup_file_init(void)
382{
383 struct hstate *h;
384
385 for_each_hstate(h) {
386 /*
387 * Add cgroup control files only if the huge page consists
388 * of more than two normal pages. This is because we use
389 * page[2].lru.next for storing cgroup details.
390 */
391 if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
392 __hugetlb_cgroup_file_init(hstate_index(h));
393 }
386} 394}
387 395
388/* 396/*
@@ -411,8 +419,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
411 419
412struct cgroup_subsys hugetlb_subsys = { 420struct cgroup_subsys hugetlb_subsys = {
413 .name = "hugetlb", 421 .name = "hugetlb",
414 .create = hugetlb_cgroup_create, 422 .css_alloc = hugetlb_cgroup_css_alloc,
415 .pre_destroy = hugetlb_cgroup_pre_destroy, 423 .css_offline = hugetlb_cgroup_css_offline,
416 .destroy = hugetlb_cgroup_destroy, 424 .css_free = hugetlb_cgroup_css_free,
417 .subsys_id = hugetlb_subsys_id, 425 .subsys_id = hugetlb_subsys_id,
418}; 426};
diff --git a/mm/internal.h b/mm/internal.h
index a4fa284f6bc2..9ba21100ebf3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page);
92extern void putback_lru_page(struct page *page); 92extern void putback_lru_page(struct page *page);
93 93
94/* 94/*
95 * in mm/rmap.c:
96 */
97extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
98
99/*
95 * in mm/page_alloc.c 100 * in mm/page_alloc.c
96 */ 101 */
97extern void __free_pages_bootmem(struct page *page, unsigned int order); 102extern void __free_pages_bootmem(struct page *page, unsigned int order);
@@ -130,7 +135,6 @@ struct compact_control {
130 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 135 int migratetype; /* MOVABLE, RECLAIMABLE etc */
131 struct zone *zone; 136 struct zone *zone;
132 bool contended; /* True if a lock was contended */ 137 bool contended; /* True if a lock was contended */
133 struct page **page; /* Page captured of requested size */
134}; 138};
135 139
136unsigned long 140unsigned long
@@ -212,15 +216,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
212{ 216{
213 if (TestClearPageMlocked(page)) { 217 if (TestClearPageMlocked(page)) {
214 unsigned long flags; 218 unsigned long flags;
219 int nr_pages = hpage_nr_pages(page);
215 220
216 local_irq_save(flags); 221 local_irq_save(flags);
217 __dec_zone_page_state(page, NR_MLOCK); 222 __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
218 SetPageMlocked(newpage); 223 SetPageMlocked(newpage);
219 __inc_zone_page_state(newpage, NR_MLOCK); 224 __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
220 local_irq_restore(flags); 225 local_irq_restore(flags);
221 } 226 }
222} 227}
223 228
229extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
230
224#ifdef CONFIG_TRANSPARENT_HUGEPAGE 231#ifdef CONFIG_TRANSPARENT_HUGEPAGE
225extern unsigned long vma_address(struct page *page, 232extern unsigned long vma_address(struct page *page,
226 struct vm_area_struct *vma); 233 struct vm_area_struct *vma);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a217cc544060..752a705c77c2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str)
1556 struct kmemleak_object *object; 1556 struct kmemleak_object *object;
1557 unsigned long addr; 1557 unsigned long addr;
1558 1558
1559 addr= simple_strtoul(str, NULL, 0); 1559 if (kstrtoul(str, 0, &addr))
1560 return -EINVAL;
1560 object = find_and_get_object(addr, 0); 1561 object = find_and_get_object(addr, 0);
1561 if (!object) { 1562 if (!object) {
1562 pr_info("Unknown object at 0x%08lx\n", addr); 1563 pr_info("Unknown object at 0x%08lx\n", addr);
diff --git a/mm/ksm.c b/mm/ksm.c
index ae539f0b8aa1..51573858938d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
778 struct page *kpage, pte_t orig_pte) 778 struct page *kpage, pte_t orig_pte)
779{ 779{
780 struct mm_struct *mm = vma->vm_mm; 780 struct mm_struct *mm = vma->vm_mm;
781 pgd_t *pgd;
782 pud_t *pud;
783 pmd_t *pmd; 781 pmd_t *pmd;
784 pte_t *ptep; 782 pte_t *ptep;
785 spinlock_t *ptl; 783 spinlock_t *ptl;
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
792 if (addr == -EFAULT) 790 if (addr == -EFAULT)
793 goto out; 791 goto out;
794 792
795 pgd = pgd_offset(mm, addr); 793 pmd = mm_find_pmd(mm, addr);
796 if (!pgd_present(*pgd)) 794 if (!pmd)
797 goto out; 795 goto out;
798
799 pud = pud_offset(pgd, addr);
800 if (!pud_present(*pud))
801 goto out;
802
803 pmd = pmd_offset(pud, addr);
804 BUG_ON(pmd_trans_huge(*pmd)); 796 BUG_ON(pmd_trans_huge(*pmd));
805 if (!pmd_present(*pmd))
806 goto out;
807 797
808 mmun_start = addr; 798 mmun_start = addr;
809 mmun_end = addr + PAGE_SIZE; 799 mmun_end = addr + PAGE_SIZE;
@@ -1634,7 +1624,7 @@ again:
1634 struct anon_vma_chain *vmac; 1624 struct anon_vma_chain *vmac;
1635 struct vm_area_struct *vma; 1625 struct vm_area_struct *vma;
1636 1626
1637 anon_vma_lock(anon_vma); 1627 anon_vma_lock_read(anon_vma);
1638 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1628 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1639 0, ULONG_MAX) { 1629 0, ULONG_MAX) {
1640 vma = vmac->vma; 1630 vma = vmac->vma;
@@ -1658,7 +1648,7 @@ again:
1658 if (!search_new_forks || !mapcount) 1648 if (!search_new_forks || !mapcount)
1659 break; 1649 break;
1660 } 1650 }
1661 anon_vma_unlock(anon_vma); 1651 anon_vma_unlock_read(anon_vma);
1662 if (!mapcount) 1652 if (!mapcount)
1663 goto out; 1653 goto out;
1664 } 1654 }
@@ -1688,7 +1678,7 @@ again:
1688 struct anon_vma_chain *vmac; 1678 struct anon_vma_chain *vmac;
1689 struct vm_area_struct *vma; 1679 struct vm_area_struct *vma;
1690 1680
1691 anon_vma_lock(anon_vma); 1681 anon_vma_lock_read(anon_vma);
1692 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1682 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1693 0, ULONG_MAX) { 1683 0, ULONG_MAX) {
1694 vma = vmac->vma; 1684 vma = vmac->vma;
@@ -1707,11 +1697,11 @@ again:
1707 ret = try_to_unmap_one(page, vma, 1697 ret = try_to_unmap_one(page, vma,
1708 rmap_item->address, flags); 1698 rmap_item->address, flags);
1709 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1699 if (ret != SWAP_AGAIN || !page_mapped(page)) {
1710 anon_vma_unlock(anon_vma); 1700 anon_vma_unlock_read(anon_vma);
1711 goto out; 1701 goto out;
1712 } 1702 }
1713 } 1703 }
1714 anon_vma_unlock(anon_vma); 1704 anon_vma_unlock_read(anon_vma);
1715 } 1705 }
1716 if (!search_new_forks++) 1706 if (!search_new_forks++)
1717 goto again; 1707 goto again;
@@ -1741,7 +1731,7 @@ again:
1741 struct anon_vma_chain *vmac; 1731 struct anon_vma_chain *vmac;
1742 struct vm_area_struct *vma; 1732 struct vm_area_struct *vma;
1743 1733
1744 anon_vma_lock(anon_vma); 1734 anon_vma_lock_read(anon_vma);
1745 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 1735 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1746 0, ULONG_MAX) { 1736 0, ULONG_MAX) {
1747 vma = vmac->vma; 1737 vma = vmac->vma;
@@ -1759,11 +1749,11 @@ again:
1759 1749
1760 ret = rmap_one(page, vma, rmap_item->address, arg); 1750 ret = rmap_one(page, vma, rmap_item->address, arg);
1761 if (ret != SWAP_AGAIN) { 1751 if (ret != SWAP_AGAIN) {
1762 anon_vma_unlock(anon_vma); 1752 anon_vma_unlock_read(anon_vma);
1763 goto out; 1753 goto out;
1764 } 1754 }
1765 } 1755 }
1766 anon_vma_unlock(anon_vma); 1756 anon_vma_unlock_read(anon_vma);
1767 } 1757 }
1768 if (!search_new_forks++) 1758 if (!search_new_forks++)
1769 goto again; 1759 goto again;
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1929 if (ksm_run != flags) { 1919 if (ksm_run != flags) {
1930 ksm_run = flags; 1920 ksm_run = flags;
1931 if (flags & KSM_RUN_UNMERGE) { 1921 if (flags & KSM_RUN_UNMERGE) {
1932 int oom_score_adj; 1922 set_current_oom_origin();
1933
1934 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1935 err = unmerge_and_remove_all_rmap_items(); 1923 err = unmerge_and_remove_all_rmap_items();
1936 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, 1924 clear_current_oom_origin();
1937 oom_score_adj);
1938 if (err) { 1925 if (err) {
1939 ksm_run = KSM_RUN_STOP; 1926 ksm_run = KSM_RUN_STOP;
1940 count = err; 1927 count = err;
diff --git a/mm/memblock.c b/mm/memblock.c
index 625905523c2a..88adc8afb610 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -314,7 +314,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
314 } 314 }
315 315
316 this->size += next->size; 316 this->size += next->size;
317 memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); 317 /* move forward from next + 1, index of which is i + 2 */
318 memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
318 type->cnt--; 319 type->cnt--;
319 } 320 }
320} 321}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dd39ba000b31..09255ec8159c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -10,6 +10,10 @@
10 * Copyright (C) 2009 Nokia Corporation 10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov 11 * Author: Kirill A. Shutemov
12 * 12 *
13 * Kernel Memory Controller
14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal
16 *
13 * This program is free software; you can redistribute it and/or modify 17 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by 18 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or 19 * the Free Software Foundation; either version 2 of the License, or
@@ -59,6 +63,8 @@
59#include <trace/events/vmscan.h> 63#include <trace/events/vmscan.h>
60 64
61struct cgroup_subsys mem_cgroup_subsys __read_mostly; 65struct cgroup_subsys mem_cgroup_subsys __read_mostly;
66EXPORT_SYMBOL(mem_cgroup_subsys);
67
62#define MEM_CGROUP_RECLAIM_RETRIES 5 68#define MEM_CGROUP_RECLAIM_RETRIES 5
63static struct mem_cgroup *root_mem_cgroup __read_mostly; 69static struct mem_cgroup *root_mem_cgroup __read_mostly;
64 70
@@ -266,6 +272,10 @@ struct mem_cgroup {
266 }; 272 };
267 273
268 /* 274 /*
275 * the counter to account for kernel memory usage.
276 */
277 struct res_counter kmem;
278 /*
269 * Per cgroup active and inactive list, similar to the 279 * Per cgroup active and inactive list, similar to the
270 * per zone LRU lists. 280 * per zone LRU lists.
271 */ 281 */
@@ -280,6 +290,7 @@ struct mem_cgroup {
280 * Should the accounting and control be hierarchical, per subtree? 290 * Should the accounting and control be hierarchical, per subtree?
281 */ 291 */
282 bool use_hierarchy; 292 bool use_hierarchy;
293 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
283 294
284 bool oom_lock; 295 bool oom_lock;
285 atomic_t under_oom; 296 atomic_t under_oom;
@@ -330,8 +341,61 @@ struct mem_cgroup {
330#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 341#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
331 struct tcp_memcontrol tcp_mem; 342 struct tcp_memcontrol tcp_mem;
332#endif 343#endif
344#if defined(CONFIG_MEMCG_KMEM)
345 /* analogous to slab_common's slab_caches list. per-memcg */
346 struct list_head memcg_slab_caches;
347 /* Not a spinlock, we can take a lot of time walking the list */
348 struct mutex slab_caches_mutex;
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id;
351#endif
333}; 352};
334 353
354/* internal only representation about the status of kmem accounting. */
355enum {
356 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
357 KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
358 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
359};
360
361/* We account when limit is on, but only after call sites are patched */
362#define KMEM_ACCOUNTED_MASK \
363 ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
364
365#ifdef CONFIG_MEMCG_KMEM
366static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
367{
368 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
369}
370
371static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
372{
373 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
374}
375
376static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
377{
378 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
379}
380
381static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
382{
383 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
384}
385
386static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
387{
388 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
389 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
390}
391
392static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
393{
394 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
395 &memcg->kmem_account_flags);
396}
397#endif
398
335/* Stuffs for move charges at task migration. */ 399/* Stuffs for move charges at task migration. */
336/* 400/*
337 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 401 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
@@ -386,9 +450,13 @@ enum charge_type {
386}; 450};
387 451
388/* for encoding cft->private value on file */ 452/* for encoding cft->private value on file */
389#define _MEM (0) 453enum res_type {
390#define _MEMSWAP (1) 454 _MEM,
391#define _OOM_TYPE (2) 455 _MEMSWAP,
456 _OOM_TYPE,
457 _KMEM,
458};
459
392#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 460#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
393#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 461#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
394#define MEMFILE_ATTR(val) ((val) & 0xffff) 462#define MEMFILE_ATTR(val) ((val) & 0xffff)
@@ -485,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
485} 553}
486#endif 554#endif
487 555
556#ifdef CONFIG_MEMCG_KMEM
557/*
558 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
559 * There are two main reasons for not using the css_id for this:
560 * 1) this works better in sparse environments, where we have a lot of memcgs,
561 * but only a few kmem-limited. Or also, if we have, for instance, 200
562 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
563 * 200 entry array for that.
564 *
565 * 2) In order not to violate the cgroup API, we would like to do all memory
566 * allocation in ->create(). At that point, we haven't yet allocated the
567 * css_id. Having a separate index prevents us from messing with the cgroup
568 * core for this
569 *
570 * The current size of the caches array is stored in
571 * memcg_limited_groups_array_size. It will double each time we have to
572 * increase it.
573 */
574static DEFINE_IDA(kmem_limited_groups);
575int memcg_limited_groups_array_size;
576
577/*
578 * MIN_SIZE is different than 1, because we would like to avoid going through
579 * the alloc/free process all the time. In a small machine, 4 kmem-limited
580 * cgroups is a reasonable guess. In the future, it could be a parameter or
581 * tunable, but that is strictly not necessary.
582 *
583 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
584 * this constant directly from cgroup, but it is understandable that this is
585 * better kept as an internal representation in cgroup.c. In any case, the
586 * css_id space is not getting any smaller, and we don't have to necessarily
587 * increase ours as well if it increases.
588 */
589#define MEMCG_CACHES_MIN_SIZE 4
590#define MEMCG_CACHES_MAX_SIZE 65535
591
592/*
593 * A lot of the calls to the cache allocation functions are expected to be
594 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
595 * conditional to this static branch, we'll have to allow modules that does
596 * kmem_cache_alloc and the such to see this symbol as well
597 */
598struct static_key memcg_kmem_enabled_key;
599EXPORT_SYMBOL(memcg_kmem_enabled_key);
600
601static void disarm_kmem_keys(struct mem_cgroup *memcg)
602{
603 if (memcg_kmem_is_active(memcg)) {
604 static_key_slow_dec(&memcg_kmem_enabled_key);
605 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
606 }
607 /*
608 * This check can't live in kmem destruction function,
609 * since the charges will outlive the cgroup
610 */
611 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
612}
613#else
614static void disarm_kmem_keys(struct mem_cgroup *memcg)
615{
616}
617#endif /* CONFIG_MEMCG_KMEM */
618
619static void disarm_static_keys(struct mem_cgroup *memcg)
620{
621 disarm_sock_keys(memcg);
622 disarm_kmem_keys(memcg);
623}
624
488static void drain_all_stock_async(struct mem_cgroup *memcg); 625static void drain_all_stock_async(struct mem_cgroup *memcg);
489 626
490static struct mem_cgroup_per_zone * 627static struct mem_cgroup_per_zone *
@@ -800,7 +937,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
800 int nid; 937 int nid;
801 u64 total = 0; 938 u64 total = 0;
802 939
803 for_each_node_state(nid, N_HIGH_MEMORY) 940 for_each_node_state(nid, N_MEMORY)
804 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 941 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
805 return total; 942 return total;
806} 943}
@@ -1015,13 +1152,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
1015 iter != NULL; \ 1152 iter != NULL; \
1016 iter = mem_cgroup_iter(NULL, iter, NULL)) 1153 iter = mem_cgroup_iter(NULL, iter, NULL))
1017 1154
1018void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1155void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1019{ 1156{
1020 struct mem_cgroup *memcg; 1157 struct mem_cgroup *memcg;
1021 1158
1022 if (!mm)
1023 return;
1024
1025 rcu_read_lock(); 1159 rcu_read_lock();
1026 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1160 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1027 if (unlikely(!memcg)) 1161 if (unlikely(!memcg))
@@ -1040,7 +1174,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1040out: 1174out:
1041 rcu_read_unlock(); 1175 rcu_read_unlock();
1042} 1176}
1043EXPORT_SYMBOL(mem_cgroup_count_vm_event); 1177EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1044 1178
1045/** 1179/**
1046 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1180 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
@@ -1454,6 +1588,10 @@ done:
1454 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1588 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1455 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1589 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1456 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1590 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1591 printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1592 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1593 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1594 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1457} 1595}
1458 1596
1459/* 1597/*
@@ -1498,8 +1636,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1498 return limit; 1636 return limit;
1499} 1637}
1500 1638
1501void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1639static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1502 int order) 1640 int order)
1503{ 1641{
1504 struct mem_cgroup *iter; 1642 struct mem_cgroup *iter;
1505 unsigned long chosen_points = 0; 1643 unsigned long chosen_points = 0;
@@ -1644,9 +1782,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1644 return; 1782 return;
1645 1783
1646 /* make a nodemask where this memcg uses memory from */ 1784 /* make a nodemask where this memcg uses memory from */
1647 memcg->scan_nodes = node_states[N_HIGH_MEMORY]; 1785 memcg->scan_nodes = node_states[N_MEMORY];
1648 1786
1649 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1787 for_each_node_mask(nid, node_states[N_MEMORY]) {
1650 1788
1651 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1789 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1652 node_clear(nid, memcg->scan_nodes); 1790 node_clear(nid, memcg->scan_nodes);
@@ -1717,7 +1855,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1717 /* 1855 /*
1718 * Check rest of nodes. 1856 * Check rest of nodes.
1719 */ 1857 */
1720 for_each_node_state(nid, N_HIGH_MEMORY) { 1858 for_each_node_state(nid, N_MEMORY) {
1721 if (node_isset(nid, memcg->scan_nodes)) 1859 if (node_isset(nid, memcg->scan_nodes))
1722 continue; 1860 continue;
1723 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1861 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
@@ -2061,20 +2199,28 @@ struct memcg_stock_pcp {
2061static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2199static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2062static DEFINE_MUTEX(percpu_charge_mutex); 2200static DEFINE_MUTEX(percpu_charge_mutex);
2063 2201
2064/* 2202/**
2065 * Try to consume stocked charge on this cpu. If success, one page is consumed 2203 * consume_stock: Try to consume stocked charge on this cpu.
2066 * from local stock and true is returned. If the stock is 0 or charges from a 2204 * @memcg: memcg to consume from.
2067 * cgroup which is not current target, returns false. This stock will be 2205 * @nr_pages: how many pages to charge.
2068 * refilled. 2206 *
2207 * The charges will only happen if @memcg matches the current cpu's memcg
2208 * stock, and at least @nr_pages are available in that stock. Failure to
2209 * service an allocation will refill the stock.
2210 *
2211 * returns true if successful, false otherwise.
2069 */ 2212 */
2070static bool consume_stock(struct mem_cgroup *memcg) 2213static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2071{ 2214{
2072 struct memcg_stock_pcp *stock; 2215 struct memcg_stock_pcp *stock;
2073 bool ret = true; 2216 bool ret = true;
2074 2217
2218 if (nr_pages > CHARGE_BATCH)
2219 return false;
2220
2075 stock = &get_cpu_var(memcg_stock); 2221 stock = &get_cpu_var(memcg_stock);
2076 if (memcg == stock->cached && stock->nr_pages) 2222 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2077 stock->nr_pages--; 2223 stock->nr_pages -= nr_pages;
2078 else /* need to call res_counter_charge */ 2224 else /* need to call res_counter_charge */
2079 ret = false; 2225 ret = false;
2080 put_cpu_var(memcg_stock); 2226 put_cpu_var(memcg_stock);
@@ -2251,7 +2397,8 @@ enum {
2251}; 2397};
2252 2398
2253static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2399static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2254 unsigned int nr_pages, bool oom_check) 2400 unsigned int nr_pages, unsigned int min_pages,
2401 bool oom_check)
2255{ 2402{
2256 unsigned long csize = nr_pages * PAGE_SIZE; 2403 unsigned long csize = nr_pages * PAGE_SIZE;
2257 struct mem_cgroup *mem_over_limit; 2404 struct mem_cgroup *mem_over_limit;
@@ -2274,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2274 } else 2421 } else
2275 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2422 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2276 /* 2423 /*
2277 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2278 * of regular pages (CHARGE_BATCH), or a single regular page (1).
2279 *
2280 * Never reclaim on behalf of optional batching, retry with a 2424 * Never reclaim on behalf of optional batching, retry with a
2281 * single page instead. 2425 * single page instead.
2282 */ 2426 */
2283 if (nr_pages == CHARGE_BATCH) 2427 if (nr_pages > min_pages)
2284 return CHARGE_RETRY; 2428 return CHARGE_RETRY;
2285 2429
2286 if (!(gfp_mask & __GFP_WAIT)) 2430 if (!(gfp_mask & __GFP_WAIT))
2287 return CHARGE_WOULDBLOCK; 2431 return CHARGE_WOULDBLOCK;
2288 2432
2433 if (gfp_mask & __GFP_NORETRY)
2434 return CHARGE_NOMEM;
2435
2289 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2436 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2290 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2437 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2291 return CHARGE_RETRY; 2438 return CHARGE_RETRY;
@@ -2298,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2298 * unlikely to succeed so close to the limit, and we fall back 2445 * unlikely to succeed so close to the limit, and we fall back
2299 * to regular pages anyway in case of failure. 2446 * to regular pages anyway in case of failure.
2300 */ 2447 */
2301 if (nr_pages == 1 && ret) 2448 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2302 return CHARGE_RETRY; 2449 return CHARGE_RETRY;
2303 2450
2304 /* 2451 /*
@@ -2370,10 +2517,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2370again: 2517again:
2371 if (*ptr) { /* css should be a valid one */ 2518 if (*ptr) { /* css should be a valid one */
2372 memcg = *ptr; 2519 memcg = *ptr;
2373 VM_BUG_ON(css_is_removed(&memcg->css));
2374 if (mem_cgroup_is_root(memcg)) 2520 if (mem_cgroup_is_root(memcg))
2375 goto done; 2521 goto done;
2376 if (nr_pages == 1 && consume_stock(memcg)) 2522 if (consume_stock(memcg, nr_pages))
2377 goto done; 2523 goto done;
2378 css_get(&memcg->css); 2524 css_get(&memcg->css);
2379 } else { 2525 } else {
@@ -2398,7 +2544,7 @@ again:
2398 rcu_read_unlock(); 2544 rcu_read_unlock();
2399 goto done; 2545 goto done;
2400 } 2546 }
2401 if (nr_pages == 1 && consume_stock(memcg)) { 2547 if (consume_stock(memcg, nr_pages)) {
2402 /* 2548 /*
2403 * It seems dagerous to access memcg without css_get(). 2549 * It seems dagerous to access memcg without css_get().
2404 * But considering how consume_stok works, it's not 2550 * But considering how consume_stok works, it's not
@@ -2433,7 +2579,8 @@ again:
2433 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2579 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2434 } 2580 }
2435 2581
2436 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); 2582 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
2583 oom_check);
2437 switch (ret) { 2584 switch (ret) {
2438 case CHARGE_OK: 2585 case CHARGE_OK:
2439 break; 2586 break;
@@ -2510,9 +2657,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2510 2657
2511/* 2658/*
2512 * A helper function to get mem_cgroup from ID. must be called under 2659 * A helper function to get mem_cgroup from ID. must be called under
2513 * rcu_read_lock(). The caller must check css_is_removed() or some if 2660 * rcu_read_lock(). The caller is responsible for calling css_tryget if
2514 * it's concern. (dropping refcnt from swap can be called against removed 2661 * the mem_cgroup is used for charging. (dropping refcnt from swap can be
2515 * memcg.) 2662 * called against removed memcg.)
2516 */ 2663 */
2517static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2664static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2518{ 2665{
@@ -2626,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2626 memcg_check_events(memcg, page); 2773 memcg_check_events(memcg, page);
2627} 2774}
2628 2775
2776static DEFINE_MUTEX(set_limit_mutex);
2777
2778#ifdef CONFIG_MEMCG_KMEM
/*
 * Returns true when kernel-memory allocations should be accounted to
 * @memcg: memcg is enabled, @memcg is not the root cgroup, and kmem
 * accounting has been activated for it (KMEM_ACCOUNTED_MASK set).
 */
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
	return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
		(memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
}
2784
/*
 * Map a per-memcg cache's params back to the kmem_cache it belongs to,
 * by indexing the root cache's memcg_caches[] array with the memcg's id.
 *
 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
 * in the memcg_cache_params struct.
 */
static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
{
	struct kmem_cache *cachep;

	/* only per-memcg params carry a root_cache backlink */
	VM_BUG_ON(p->is_root_cache);
	cachep = p->root_cache;
	return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
}
2797
2798#ifdef CONFIG_SLABINFO
/*
 * cgroupfs read handler: emit a /proc/slabinfo-style listing of this
 * memcg's slab caches. Returns -EIO for cgroups that are not
 * kmem-accounted (nothing meaningful to show).
 */
static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
				    struct seq_file *m)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	struct memcg_cache_params *params;

	if (!memcg_can_account_kmem(memcg))
		return -EIO;

	print_slabinfo_header(m);

	/* slab_caches_mutex guards the memcg_slab_caches list */
	mutex_lock(&memcg->slab_caches_mutex);
	list_for_each_entry(params, &memcg->memcg_slab_caches, list)
		cache_show(memcg_params_to_cache(params), m);
	mutex_unlock(&memcg->slab_caches_mutex);

	return 0;
}
2817#endif
2818
/*
 * Charge @size bytes of kernel memory to @memcg: first against the
 * dedicated kmem res_counter, then against the regular memory (and, with
 * swap accounting, memsw) counters via __mem_cgroup_try_charge().
 * Returns 0 on success or a negative errno; on failure the kmem charge
 * is rolled back.
 */
static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
{
	struct res_counter *fail_res;
	struct mem_cgroup *_memcg;
	int ret = 0;
	bool may_oom;

	ret = res_counter_charge(&memcg->kmem, size, &fail_res);
	if (ret)
		return ret;

	/*
	 * Conditions under which we can wait for the oom_killer. Those are
	 * the same conditions tested by the core page allocator
	 */
	may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);

	_memcg = memcg;
	ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
				      &_memcg, may_oom);

	if (ret == -EINTR)  {
		/*
		 * __mem_cgroup_try_charge() chose to bypass to root due to
		 * OOM kill or fatal signal. Since our only options are to
		 * either fail the allocation or charge it to this cgroup, do
		 * it as a temporary condition. But we can't fail. From a
		 * kmem/slab perspective, the cache has already been selected,
		 * by mem_cgroup_kmem_get_cache(), so it is too late to change
		 * our minds.
		 *
		 * This condition will only trigger if the task entered
		 * memcg_charge_kmem in a sane state, but was OOM-killed during
		 * __mem_cgroup_try_charge() above. Tasks that were already
		 * dying when the allocation triggers should have been already
		 * directed to the root cgroup in memcontrol.h
		 */
		res_counter_charge_nofail(&memcg->res, size, &fail_res);
		if (do_swap_account)
			res_counter_charge_nofail(&memcg->memsw, size,
						  &fail_res);
		ret = 0;
	} else if (ret)
		/* memory/memsw charge failed: undo the kmem charge */
		res_counter_uncharge(&memcg->kmem, size);

	return ret;
}
2866
/*
 * Undo a kmem charge of @size bytes against @memcg. When the kmem usage
 * of a dead (already removed) memcg drops to zero, also release the
 * reference that was keeping the memcg structure alive.
 */
static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
{
	res_counter_uncharge(&memcg->res, size);
	if (do_swap_account)
		res_counter_uncharge(&memcg->memsw, size);

	/* Not down to 0 */
	if (res_counter_uncharge(&memcg->kmem, size))
		return;

	/* last kmem byte gone from a dead memcg: drop its pinning ref */
	if (memcg_kmem_test_and_clear_dead(memcg))
		mem_cgroup_put(memcg);
}
2880
2881void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
2882{
2883 if (!memcg)
2884 return;
2885
2886 mutex_lock(&memcg->slab_caches_mutex);
2887 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2888 mutex_unlock(&memcg->slab_caches_mutex);
2889}
2890
2891/*
2892 * helper for acessing a memcg's index. It will be used as an index in the
2893 * child cache array in kmem_cache, and also to derive its name. This function
2894 * will return -1 when this is not a kmem-limited memcg.
2895 */
2896int memcg_cache_id(struct mem_cgroup *memcg)
2897{
2898 return memcg ? memcg->kmemcg_id : -1;
2899}
2900
/*
 * Activate kmem accounting for @memcg: allocate a kmemcg id, grow every
 * root cache's memcg_caches[] array to fit it, and initialize the
 * memcg's slab-cache bookkeeping. Returns 0 on success, negative errno
 * otherwise (with the id and activated flag rolled back).
 *
 * This ends up being protected by the set_limit mutex, during normal
 * operation, because that is its main call site.
 *
 * But when we create a new cache, we can call this as well if its parent
 * is kmem-limited. That will have to hold set_limit_mutex as well.
 */
int memcg_update_cache_sizes(struct mem_cgroup *memcg)
{
	int num, ret;

	num = ida_simple_get(&kmem_limited_groups,
			     0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
	if (num < 0)
		return num;
	/*
	 * After this point, kmem_accounted (that we test atomically in
	 * the beginning of this conditional), is no longer 0. This
	 * guarantees only one process will set the following boolean
	 * to true. We don't need test_and_set because we're protected
	 * by the set_limit_mutex anyway.
	 */
	memcg_kmem_set_activated(memcg);

	ret = memcg_update_all_caches(num+1);
	if (ret) {
		/* roll back: return the id and clear the activated flag */
		ida_simple_remove(&kmem_limited_groups, num);
		memcg_kmem_clear_activated(memcg);
		return ret;
	}

	memcg->kmemcg_id = num;
	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
	mutex_init(&memcg->slab_caches_mutex);
	return 0;
}
2937
2938static size_t memcg_caches_array_size(int num_groups)
2939{
2940 ssize_t size;
2941 if (num_groups <= 0)
2942 return 0;
2943
2944 size = 2 * num_groups;
2945 if (size < MEMCG_CACHES_MIN_SIZE)
2946 size = MEMCG_CACHES_MIN_SIZE;
2947 else if (size > MEMCG_CACHES_MAX_SIZE)
2948 size = MEMCG_CACHES_MAX_SIZE;
2949
2950 return size;
2951}
2952
/*
 * We should update the current array size iff all caches updates succeed. This
 * can only be done from the slab side. The slab mutex needs to be held when
 * calling this.
 */
void memcg_update_array_size(int num)
{
	/* the published array size only ever grows, never shrinks */
	if (num > memcg_limited_groups_array_size)
		memcg_limited_groups_array_size = memcg_caches_array_size(num);
}
2963
/*
 * Grow root cache @s's memcg_params (with its trailing memcg_caches[]
 * array) so it can hold at least @num_groups per-memcg caches. Existing
 * per-memcg cache pointers are carried over to the new array. Called
 * with the slab mutex held. Returns 0 or -ENOMEM.
 */
int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
	struct memcg_cache_params *cur_params = s->memcg_params;

	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);

	if (num_groups > memcg_limited_groups_array_size) {
		int i;
		ssize_t size = memcg_caches_array_size(num_groups);

		size *= sizeof(void *);
		size += sizeof(struct memcg_cache_params);

		s->memcg_params = kzalloc(size, GFP_KERNEL);
		if (!s->memcg_params) {
			/* keep the old (smaller but valid) array in place */
			s->memcg_params = cur_params;
			return -ENOMEM;
		}

		s->memcg_params->is_root_cache = true;

		/*
		 * There is the chance it will be bigger than
		 * memcg_limited_groups_array_size, if we failed an allocation
		 * in a cache, in which case all caches updated before it, will
		 * have a bigger array.
		 *
		 * But if that is the case, the data after
		 * memcg_limited_groups_array_size is certainly unused
		 */
		for (i = 0; i < memcg_limited_groups_array_size; i++) {
			if (!cur_params->memcg_caches[i])
				continue;
			s->memcg_params->memcg_caches[i] =
						cur_params->memcg_caches[i];
		}

		/*
		 * Ideally, we would wait until all caches succeed, and only
		 * then free the old one. But this is not worth the extra
		 * pointer per-cache we'd have to have for this.
		 *
		 * It is not a big deal if some caches are left with a size
		 * bigger than the others. And all updates will reset this
		 * anyway.
		 */
		kfree(cur_params);
	}
	return 0;
}
3014
/*
 * Allocate and initialize @s->memcg_params. For a root cache (@memcg is
 * NULL) the params also carry the memcg_caches[] pointer array; for a
 * per-memcg cache they record the owning memcg and the root cache it
 * clones. Returns 0 or -ENOMEM.
 */
int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
			 struct kmem_cache *root_cache)
{
	size_t size = sizeof(struct memcg_cache_params);

	if (!memcg_kmem_enabled())
		return 0;

	/* only root caches need room for the per-memcg cache array */
	if (!memcg)
		size += memcg_limited_groups_array_size * sizeof(void *);

	s->memcg_params = kzalloc(size, GFP_KERNEL);
	if (!s->memcg_params)
		return -ENOMEM;

	if (memcg) {
		s->memcg_params->memcg = memcg;
		s->memcg_params->root_cache = root_cache;
	}
	return 0;
}
3036
/*
 * Tear down @s's memcg bookkeeping when the cache is destroyed. For a
 * per-memcg cache this clears the root cache's pointer to it, drops the
 * memcg reference taken at creation and unlinks it from the memcg's
 * slab-cache list; in all cases the params struct is freed.
 */
void memcg_release_cache(struct kmem_cache *s)
{
	struct kmem_cache *root;
	struct mem_cgroup *memcg;
	int id;

	/*
	 * This happens, for instance, when a root cache goes away before we
	 * add any memcg.
	 */
	if (!s->memcg_params)
		return;

	if (s->memcg_params->is_root_cache)
		goto out;

	memcg = s->memcg_params->memcg;
	id = memcg_cache_id(memcg);

	root = s->memcg_params->root_cache;
	root->memcg_params->memcg_caches[id] = NULL;
	mem_cgroup_put(memcg);

	mutex_lock(&memcg->slab_caches_mutex);
	list_del(&s->memcg_params->list);
	mutex_unlock(&memcg->slab_caches_mutex);

out:
	kfree(s->memcg_params);
}
3067
/*
 * During the creation a new cache, we need to disable our accounting mechanism
 * altogether. This is true even if we are not creating, but rather just
 * enqueuing new caches to be created.
 *
 * This is because that process will trigger allocations; some visible, like
 * explicit kmallocs to auxiliary data structures, name strings and internal
 * cache structures; some well concealed, like INIT_WORK() that can allocate
 * objects during debug.
 *
 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
 * to it. This may not be a bounded recursion: since the first cache creation
 * failed to complete (waiting on the allocation), we'll just try to create the
 * cache again, failing at the same point.
 *
 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
 * inside the following two functions.
 */
static inline void memcg_stop_kmem_account(void)
{
	/* per-task counter, so only meaningful for tasks with an mm */
	VM_BUG_ON(!current->mm);
	current->memcg_kmem_skip_account++;
}

static inline void memcg_resume_kmem_account(void)
{
	VM_BUG_ON(!current->mm);
	current->memcg_kmem_skip_account--;
}
3098
/*
 * Deferred destruction of a per-memcg cache. If the cache still holds
 * pages, shrink it and rely on memcg_release_pages() requeueing this
 * work when the last page is released; only destroy the cache outright
 * when it is already empty.
 */
static void kmem_cache_destroy_work_func(struct work_struct *w)
{
	struct kmem_cache *cachep;
	struct memcg_cache_params *p;

	p = container_of(w, struct memcg_cache_params, destroy);

	cachep = memcg_params_to_cache(p);

	/*
	 * If we get down to 0 after shrink, we could delete right away.
	 * However, memcg_release_pages() already puts us back in the workqueue
	 * in that case. If we proceed deleting, we'll get a dangling
	 * reference, and removing the object from the workqueue in that case
	 * is unnecessary complication. We are not a fast path.
	 *
	 * Note that this case is fundamentally different from racing with
	 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
	 * kmem_cache_shrink, not only we would be reinserting a dead cache
	 * into the queue, but doing so from inside the worker racing to
	 * destroy it.
	 *
	 * So if we aren't down to zero, we'll just schedule a worker and try
	 * again
	 */
	if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
		kmem_cache_shrink(cachep);
		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
			return;
	} else
		kmem_cache_destroy(cachep);
}
3131
/*
 * Request destruction of a per-memcg cache whose memcg has gone away
 * (dead flag set). The actual work is always deferred to a workqueue;
 * see below for why we must not cancel pending work here.
 */
void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
{
	if (!cachep->memcg_params->dead)
		return;

	/*
	 * There are many ways in which we can get here.
	 *
	 * We can get to a memory-pressure situation while the delayed work is
	 * still pending to run. The vmscan shrinkers can then release all
	 * cache memory and get us to destruction. If this is the case, we'll
	 * be executed twice, which is a bug (the second time will execute over
	 * bogus data). In this case, cancelling the work should be fine.
	 *
	 * But we can also get here from the worker itself, if
	 * kmem_cache_shrink is enough to shake all the remaining objects and
	 * get the page count to 0. In this case, we'll deadlock if we try to
	 * cancel the work (the worker runs with an internal lock held, which
	 * is the same lock we would hold for cancel_work_sync().)
	 *
	 * Since we can't possibly know who got us here, just refrain from
	 * running if there is already work pending
	 */
	if (work_pending(&cachep->memcg_params->destroy))
		return;
	/*
	 * We have to defer the actual destroying to a workqueue, because
	 * we might currently be in a context that cannot sleep.
	 */
	schedule_work(&cachep->memcg_params->destroy);
}
3163
/*
 * Build the name for the per-memcg clone of cache @s:
 * "<root name>(<kmemcg id>:<cgroup directory name>)". Returns a
 * kasprintf()-allocated string the caller must kfree(), or NULL on
 * allocation failure.
 */
static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
{
	char *name;
	struct dentry *dentry;

	rcu_read_lock();
	dentry = rcu_dereference(memcg->css.cgroup->dentry);
	rcu_read_unlock();

	/* the cgroup is alive (caller holds a css ref), so it has a dentry */
	BUG_ON(dentry == NULL);

	name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
			 memcg_cache_id(memcg), dentry->d_name.name);

	return name;
}
3180
/*
 * Create the per-memcg clone of root cache @s: same object size,
 * alignment, flags (minus SLAB_PANIC) and constructor. The clone's page
 * allocations are tagged __GFP_KMEMCG so they get charged to the memcg.
 * Returns the new cache, or NULL on failure.
 */
static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
					 struct kmem_cache *s)
{
	char *name;
	struct kmem_cache *new;

	name = memcg_cache_name(memcg, s);
	if (!name)
		return NULL;

	new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
				      (s->flags & ~SLAB_PANIC), s->ctor, s);

	if (new)
		new->allocflags |= __GFP_KMEMCG;

	/* kmem_cache_create_memcg() duplicates the name */
	kfree(name);
	return new;
}
3200
/*
 * This lock protects updaters, not readers. We want readers to be as fast as
 * they can, and they will either see NULL or a valid cache value. Our model
 * allow them to see NULL, in which case the root memcg will be selected.
 *
 * We need this lock because multiple allocations to the same cache may span
 * more than one worker. Only one of them can create the cache.
 */
static DEFINE_MUTEX(memcg_cache_mutex);

/*
 * Create (or find, if already created by a racing worker) the @memcg
 * clone of root cache @cachep. On any failure the root @cachep itself is
 * returned, so callers always get a usable cache. Takes a memcg
 * reference on behalf of the new cache.
 */
static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
						  struct kmem_cache *cachep)
{
	struct kmem_cache *new_cachep;
	int idx;

	BUG_ON(!memcg_can_account_kmem(memcg));

	idx = memcg_cache_id(memcg);

	mutex_lock(&memcg_cache_mutex);
	new_cachep = cachep->memcg_params->memcg_caches[idx];
	if (new_cachep)
		goto out;

	new_cachep = kmem_cache_dup(memcg, cachep);
	if (new_cachep == NULL) {
		/* creation failed: fall back to the root cache */
		new_cachep = cachep;
		goto out;
	}

	mem_cgroup_get(memcg);
	atomic_set(&new_cachep->memcg_params->nr_pages , 0);

	cachep->memcg_params->memcg_caches[idx] = new_cachep;
	/*
	 * the readers won't lock, make sure everybody sees the updated value,
	 * so they won't put stuff in the queue again for no reason
	 */
	wmb();
out:
	mutex_unlock(&memcg_cache_mutex);
	return new_cachep;
}
3244
/*
 * Destroy every per-memcg clone of root cache @s, synchronously. Used
 * when the root cache itself is being destroyed.
 */
void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
{
	struct kmem_cache *c;
	int i;

	if (!s->memcg_params)
		return;
	if (!s->memcg_params->is_root_cache)
		return;

	/*
	 * If the cache is being destroyed, we trust that there is no one else
	 * requesting objects from it. Even if there are, the sanity checks in
	 * kmem_cache_destroy should catch this ill-case.
	 *
	 * Still, we don't want anyone else freeing memcg_caches under our
	 * noses, which can happen if a new memcg comes to life. As usual,
	 * we'll take the set_limit_mutex to protect ourselves against this.
	 */
	mutex_lock(&set_limit_mutex);
	for (i = 0; i < memcg_limited_groups_array_size; i++) {
		c = s->memcg_params->memcg_caches[i];
		if (!c)
			continue;

		/*
		 * We will now manually delete the caches, so to avoid races
		 * we need to cancel all pending destruction workers and
		 * proceed with destruction ourselves.
		 *
		 * kmem_cache_destroy() will call kmem_cache_shrink internally,
		 * and that could spawn the workers again: it is likely that
		 * the cache still have active pages until this very moment.
		 * This would lead us back to mem_cgroup_destroy_cache.
		 *
		 * But that will not execute at all if the "dead" flag is not
		 * set, so flip it down to guarantee we are in control.
		 */
		c->memcg_params->dead = false;
		cancel_work_sync(&c->memcg_params->destroy);
		kmem_cache_destroy(c);
	}
	mutex_unlock(&set_limit_mutex);
}
3289
/* Deferred request to create a per-memcg clone of a root kmem cache. */
struct create_work {
	struct mem_cgroup *memcg;	/* target memcg; enqueue took a css ref */
	struct kmem_cache *cachep;	/* root cache to clone */
	struct work_struct work;
};
3295
/*
 * Called when @memcg is being torn down: mark every one of its slab
 * caches dead and queue their (deferred) destruction.
 */
static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
	struct kmem_cache *cachep;
	struct memcg_cache_params *params;

	/* nothing to do if kmem accounting was never activated */
	if (!memcg_kmem_is_active(memcg))
		return;

	mutex_lock(&memcg->slab_caches_mutex);
	list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
		cachep = memcg_params_to_cache(params);
		cachep->memcg_params->dead = true;
		INIT_WORK(&cachep->memcg_params->destroy,
				  kmem_cache_destroy_work_func);
		schedule_work(&cachep->memcg_params->destroy);
	}
	mutex_unlock(&memcg->slab_caches_mutex);
}
3314
/* Worker: actually create the per-memcg cache requested via enqueue. */
static void memcg_create_cache_work_func(struct work_struct *w)
{
	struct create_work *cw;

	cw = container_of(w, struct create_work, work);
	memcg_create_kmem_cache(cw->memcg, cw->cachep);
	/* Drop the reference gotten when we enqueued. */
	css_put(&cw->memcg->css);
	kfree(cw);
}
3325
/*
 * Enqueue the creation of a per-memcg kmem_cache.
 * Called with rcu_read_lock.
 */
static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
					 struct kmem_cache *cachep)
{
	struct create_work *cw;

	/* GFP_NOWAIT: we hold rcu_read_lock and must not sleep */
	cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
	if (cw == NULL)
		return;

	/* The corresponding put will be done in the workqueue. */
	if (!css_tryget(&memcg->css)) {
		/* memcg is on its way out; drop the request */
		kfree(cw);
		return;
	}

	cw->memcg = memcg;
	cw->cachep = cachep;

	INIT_WORK(&cw->work, memcg_create_cache_work_func);
	schedule_work(&cw->work);
}
3351
/*
 * Wrapper around __memcg_create_cache_enqueue() that suppresses kmem
 * accounting for the duration, to avoid unbounded recursion (see the
 * comment above memcg_stop_kmem_account()).
 */
static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
				       struct kmem_cache *cachep)
{
	/*
	 * We need to stop accounting when we kmalloc, because if the
	 * corresponding kmalloc cache is not yet created, the first allocation
	 * in __memcg_create_cache_enqueue will recurse.
	 *
	 * However, it is better to enclose the whole function. Depending on
	 * the debugging options enabled, INIT_WORK(), for instance, can
	 * trigger an allocation. This too, will make us recurse. Because at
	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
	 * the safest choice is to do it like this, wrapping the whole function.
	 */
	memcg_stop_kmem_account();
	__memcg_create_cache_enqueue(memcg, cachep);
	memcg_resume_kmem_account();
}
/*
 * Return the kmem_cache we're supposed to use for a slab allocation.
 * We try to use the current memcg's version of the cache.
 *
 * If the cache does not exist yet, if we are the first user of it,
 * we either create it immediately, if possible, or create it asynchronously
 * in a workqueue.
 * In the latter case, we will let the current allocation go through with
 * the original cache.
 *
 * Can't be called in interrupt context or from kernel threads.
 * This function needs to be called with rcu_read_lock() held.
 */
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
					  gfp_t gfp)
{
	struct mem_cgroup *memcg;
	int idx;

	VM_BUG_ON(!cachep->memcg_params);
	VM_BUG_ON(!cachep->memcg_params->is_root_cache);

	/* kernel threads, and tasks inside cache creation, use the root cache */
	if (!current->mm || current->memcg_kmem_skip_account)
		return cachep;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
	rcu_read_unlock();

	if (!memcg_can_account_kmem(memcg))
		return cachep;

	idx = memcg_cache_id(memcg);

	/*
	 * barrier to make sure we're always seeing the up to date value. The
	 * code updating memcg_caches will issue a write barrier to match this.
	 */
	read_barrier_depends();
	if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
		/*
		 * If we are in a safe context (can wait, and not in interrupt
		 * context), we could be predictable and return right away.
		 * This would guarantee that the allocation being performed
		 * already belongs in the new cache.
		 *
		 * However, there are some clashes that can arrive from locking.
		 * For instance, because we acquire the slab_mutex while doing
		 * kmem_cache_dup, this means no further allocation could happen
		 * with the slab_mutex held.
		 *
		 * Also, because cache creation issue get_online_cpus(), this
		 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
		 * that ends up reversed during cpu hotplug. (cpuset allocates
		 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
		 * better to defer everything.
		 */
		memcg_create_cache_enqueue(memcg, cachep);
		return cachep;
	}

	return cachep->memcg_params->memcg_caches[idx];
}
EXPORT_SYMBOL(__memcg_kmem_get_cache);
3434
/*
 * We need to verify if the allocation against current->mm->owner's memcg is
 * possible for the given order. But the page is not allocated yet, so we'll
 * need a further commit step to do the final arrangements.
 *
 * It is possible for the task to switch cgroups in this mean time, so at
 * commit time, we can't rely on task conversion any longer. We'll then use
 * the handle argument to return to the caller which cgroup we should commit
 * against. We could also return the memcg directly and avoid the pointer
 * passing, but a boolean return value gives better semantics considering
 * the compiled-out case as well.
 *
 * Returning true means the allocation is possible.
 */
bool
__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
{
	struct mem_cgroup *memcg;
	int ret;

	*_memcg = NULL;
	memcg = try_get_mem_cgroup_from_mm(current->mm);

	/*
	 * very rare case described in mem_cgroup_from_task. Unfortunately there
	 * isn't much we can do without complicating this too much, and it would
	 * be gfp-dependent anyway. Just let it go
	 */
	if (unlikely(!memcg))
		return true;

	if (!memcg_can_account_kmem(memcg)) {
		/* no accounting for this memcg: allow, drop our ref */
		css_put(&memcg->css);
		return true;
	}

	ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
	if (!ret)
		*_memcg = memcg;

	css_put(&memcg->css);
	return (ret == 0);
}
3478
/*
 * Second half of a kmem page charge: bind the freshly-allocated @page to
 * @memcg (the cgroup returned by __memcg_kmem_newpage_charge). A NULL
 * @page means the allocation failed and the charge is reverted.
 */
void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
			      int order)
{
	struct page_cgroup *pc;

	VM_BUG_ON(mem_cgroup_is_root(memcg));

	/* The page allocation failed. Revert */
	if (!page) {
		memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
		return;
	}

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	pc->mem_cgroup = memcg;
	SetPageCgroupUsed(pc);
	unlock_page_cgroup(pc);
}
3498
/*
 * Uncharge a kmem page on free: detach @page from its memcg (if any) and
 * return the charge of 2^@order pages to that memcg's counters.
 */
void __memcg_kmem_uncharge_pages(struct page *page, int order)
{
	struct mem_cgroup *memcg = NULL;
	struct page_cgroup *pc;


	pc = lookup_page_cgroup(page);
	/*
	 * Fast unlocked return. Theoretically might have changed, have to
	 * check again after locking.
	 */
	if (!PageCgroupUsed(pc))
		return;

	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		memcg = pc->mem_cgroup;
		ClearPageCgroupUsed(pc);
	}
	unlock_page_cgroup(pc);

	/*
	 * We trust that only if there is a memcg associated with the page, it
	 * is a valid allocation
	 */
	if (!memcg)
		return;

	VM_BUG_ON(mem_cgroup_is_root(memcg));
	memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
}
3530#else
/* CONFIG_MEMCG_KMEM=n stub: no per-memcg slab caches exist to destroy. */
static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
}
3534#endif /* CONFIG_MEMCG_KMEM */
3535
2629#ifdef CONFIG_TRANSPARENT_HUGEPAGE 3536#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2630 3537
2631#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) 3538#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
@@ -2709,13 +3616,6 @@ static int mem_cgroup_move_account(struct page *page,
2709 /* caller should have done css_get */ 3616 /* caller should have done css_get */
2710 pc->mem_cgroup = to; 3617 pc->mem_cgroup = to;
2711 mem_cgroup_charge_statistics(to, anon, nr_pages); 3618 mem_cgroup_charge_statistics(to, anon, nr_pages);
2712 /*
2713 * We charges against "to" which may not have any tasks. Then, "to"
2714 * can be under rmdir(). But in current implementation, caller of
2715 * this function is just force_empty() and move charge, so it's
2716 * guaranteed that "to" is never removed. So, we don't check rmdir
2717 * status here.
2718 */
2719 move_unlock_mem_cgroup(from, &flags); 3619 move_unlock_mem_cgroup(from, &flags);
2720 ret = 0; 3620 ret = 0;
2721unlock: 3621unlock:
@@ -2729,10 +3629,27 @@ out:
2729 return ret; 3629 return ret;
2730} 3630}
2731 3631
2732/* 3632/**
2733 * move charges to its parent. 3633 * mem_cgroup_move_parent - moves page to the parent group
3634 * @page: the page to move
3635 * @pc: page_cgroup of the page
3636 * @child: page's cgroup
3637 *
3638 * move charges to its parent or the root cgroup if the group has no
3639 * parent (aka use_hierarchy==0).
3640 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3641 * mem_cgroup_move_account fails) the failure is always temporary and
3642 * it signals a race with a page removal/uncharge or migration. In the
3643 * first case the page is on the way out and it will vanish from the LRU
3644 * on the next attempt and the call should be retried later.
3645 * Isolation from the LRU fails only if page has been isolated from
3646 * the LRU since we looked at it and that usually means either global
3647 * reclaim or migration going on. The page will either get back to the
3648 * LRU or vanish.
3649 * Finaly mem_cgroup_move_account fails only if the page got uncharged
3650 * (!PageCgroupUsed) or moved to a different group. The page will
3651 * disappear in the next attempt.
2734 */ 3652 */
2735
2736static int mem_cgroup_move_parent(struct page *page, 3653static int mem_cgroup_move_parent(struct page *page,
2737 struct page_cgroup *pc, 3654 struct page_cgroup *pc,
2738 struct mem_cgroup *child) 3655 struct mem_cgroup *child)
@@ -2742,9 +3659,7 @@ static int mem_cgroup_move_parent(struct page *page,
2742 unsigned long uninitialized_var(flags); 3659 unsigned long uninitialized_var(flags);
2743 int ret; 3660 int ret;
2744 3661
2745 /* Is ROOT ? */ 3662 VM_BUG_ON(mem_cgroup_is_root(child));
2746 if (mem_cgroup_is_root(child))
2747 return -EINVAL;
2748 3663
2749 ret = -EBUSY; 3664 ret = -EBUSY;
2750 if (!get_page_unless_zero(page)) 3665 if (!get_page_unless_zero(page))
@@ -2761,8 +3676,10 @@ static int mem_cgroup_move_parent(struct page *page,
2761 if (!parent) 3676 if (!parent)
2762 parent = root_mem_cgroup; 3677 parent = root_mem_cgroup;
2763 3678
2764 if (nr_pages > 1) 3679 if (nr_pages > 1) {
3680 VM_BUG_ON(!PageTransHuge(page));
2765 flags = compound_lock_irqsave(page); 3681 flags = compound_lock_irqsave(page);
3682 }
2766 3683
2767 ret = mem_cgroup_move_account(page, nr_pages, 3684 ret = mem_cgroup_move_account(page, nr_pages,
2768 pc, child, parent); 3685 pc, child, parent);
@@ -2904,7 +3821,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2904 return; 3821 return;
2905 if (!memcg) 3822 if (!memcg)
2906 return; 3823 return;
2907 cgroup_exclude_rmdir(&memcg->css);
2908 3824
2909 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 3825 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
2910 /* 3826 /*
@@ -2918,12 +3834,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2918 swp_entry_t ent = {.val = page_private(page)}; 3834 swp_entry_t ent = {.val = page_private(page)};
2919 mem_cgroup_uncharge_swap(ent); 3835 mem_cgroup_uncharge_swap(ent);
2920 } 3836 }
2921 /*
2922 * At swapin, we may charge account against cgroup which has no tasks.
2923 * So, rmdir()->pre_destroy() can be called while we do this charge.
2924 * In that case, we need to call pre_destroy() again. check it here.
2925 */
2926 cgroup_release_and_wakeup_rmdir(&memcg->css);
2927} 3837}
2928 3838
2929void mem_cgroup_commit_charge_swapin(struct page *page, 3839void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3288,15 +4198,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3288 struct mem_cgroup **memcgp) 4198 struct mem_cgroup **memcgp)
3289{ 4199{
3290 struct mem_cgroup *memcg = NULL; 4200 struct mem_cgroup *memcg = NULL;
4201 unsigned int nr_pages = 1;
3291 struct page_cgroup *pc; 4202 struct page_cgroup *pc;
3292 enum charge_type ctype; 4203 enum charge_type ctype;
3293 4204
3294 *memcgp = NULL; 4205 *memcgp = NULL;
3295 4206
3296 VM_BUG_ON(PageTransHuge(page));
3297 if (mem_cgroup_disabled()) 4207 if (mem_cgroup_disabled())
3298 return; 4208 return;
3299 4209
4210 if (PageTransHuge(page))
4211 nr_pages <<= compound_order(page);
4212
3300 pc = lookup_page_cgroup(page); 4213 pc = lookup_page_cgroup(page);
3301 lock_page_cgroup(pc); 4214 lock_page_cgroup(pc);
3302 if (PageCgroupUsed(pc)) { 4215 if (PageCgroupUsed(pc)) {
@@ -3358,7 +4271,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3358 * charged to the res_counter since we plan on replacing the 4271 * charged to the res_counter since we plan on replacing the
3359 * old one and only one page is going to be left afterwards. 4272 * old one and only one page is going to be left afterwards.
3360 */ 4273 */
3361 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 4274 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
3362} 4275}
3363 4276
3364/* remove redundant charge if migration failed*/ 4277/* remove redundant charge if migration failed*/
@@ -3371,8 +4284,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3371 4284
3372 if (!memcg) 4285 if (!memcg)
3373 return; 4286 return;
3374 /* blocks rmdir() */ 4287
3375 cgroup_exclude_rmdir(&memcg->css);
3376 if (!migration_ok) { 4288 if (!migration_ok) {
3377 used = oldpage; 4289 used = oldpage;
3378 unused = newpage; 4290 unused = newpage;
@@ -3406,13 +4318,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3406 */ 4318 */
3407 if (anon) 4319 if (anon)
3408 mem_cgroup_uncharge_page(used); 4320 mem_cgroup_uncharge_page(used);
3409 /*
3410 * At migration, we may charge account against cgroup which has no
3411 * tasks.
3412 * So, rmdir()->pre_destroy() can be called while we do this charge.
3413 * In that case, we need to call pre_destroy() again. check it here.
3414 */
3415 cgroup_release_and_wakeup_rmdir(&memcg->css);
3416} 4321}
3417 4322
3418/* 4323/*
@@ -3490,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page)
3490} 4395}
3491#endif 4396#endif
3492 4397
3493static DEFINE_MUTEX(set_limit_mutex);
3494
3495static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 4398static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3496 unsigned long long val) 4399 unsigned long long val)
3497{ 4400{
@@ -3712,17 +4615,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3712 return nr_reclaimed; 4615 return nr_reclaimed;
3713} 4616}
3714 4617
3715/* 4618/**
4619 * mem_cgroup_force_empty_list - clears LRU of a group
4620 * @memcg: group to clear
4621 * @node: NUMA node
4622 * @zid: zone id
4623 * @lru: lru to to clear
4624 *
3716 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 4625 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3717 * reclaim the pages page themselves - it just removes the page_cgroups. 4626 * reclaim the pages page themselves - pages are moved to the parent (or root)
3718 * Returns true if some page_cgroups were not freed, indicating that the caller 4627 * group.
3719 * must retry this operation.
3720 */ 4628 */
3721static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 4629static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3722 int node, int zid, enum lru_list lru) 4630 int node, int zid, enum lru_list lru)
3723{ 4631{
3724 struct lruvec *lruvec; 4632 struct lruvec *lruvec;
3725 unsigned long flags, loop; 4633 unsigned long flags;
3726 struct list_head *list; 4634 struct list_head *list;
3727 struct page *busy; 4635 struct page *busy;
3728 struct zone *zone; 4636 struct zone *zone;
@@ -3731,11 +4639,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3731 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 4639 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3732 list = &lruvec->lists[lru]; 4640 list = &lruvec->lists[lru];
3733 4641
3734 loop = mem_cgroup_get_lru_size(lruvec, lru);
3735 /* give some margin against EBUSY etc...*/
3736 loop += 256;
3737 busy = NULL; 4642 busy = NULL;
3738 while (loop--) { 4643 do {
3739 struct page_cgroup *pc; 4644 struct page_cgroup *pc;
3740 struct page *page; 4645 struct page *page;
3741 4646
@@ -3761,76 +4666,80 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3761 cond_resched(); 4666 cond_resched();
3762 } else 4667 } else
3763 busy = NULL; 4668 busy = NULL;
3764 } 4669 } while (!list_empty(list));
3765 return !list_empty(list);
3766} 4670}
3767 4671
3768/* 4672/*
3769 * make mem_cgroup's charge to be 0 if there is no task. 4673 * make mem_cgroup's charge to be 0 if there is no task by moving
4674 * all the charges and pages to the parent.
3770 * This enables deleting this mem_cgroup. 4675 * This enables deleting this mem_cgroup.
4676 *
4677 * Caller is responsible for holding css reference on the memcg.
3771 */ 4678 */
3772static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) 4679static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3773{ 4680{
3774 int ret; 4681 int node, zid;
3775 int node, zid, shrink; 4682 u64 usage;
3776 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3777 struct cgroup *cgrp = memcg->css.cgroup;
3778
3779 css_get(&memcg->css);
3780 4683
3781 shrink = 0;
3782 /* should free all ? */
3783 if (free_all)
3784 goto try_to_free;
3785move_account:
3786 do { 4684 do {
3787 ret = -EBUSY;
3788 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3789 goto out;
3790 /* This is for making all *used* pages to be on LRU. */ 4685 /* This is for making all *used* pages to be on LRU. */
3791 lru_add_drain_all(); 4686 lru_add_drain_all();
3792 drain_all_stock_sync(memcg); 4687 drain_all_stock_sync(memcg);
3793 ret = 0;
3794 mem_cgroup_start_move(memcg); 4688 mem_cgroup_start_move(memcg);
3795 for_each_node_state(node, N_HIGH_MEMORY) { 4689 for_each_node_state(node, N_MEMORY) {
3796 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 4690 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3797 enum lru_list lru; 4691 enum lru_list lru;
3798 for_each_lru(lru) { 4692 for_each_lru(lru) {
3799 ret = mem_cgroup_force_empty_list(memcg, 4693 mem_cgroup_force_empty_list(memcg,
3800 node, zid, lru); 4694 node, zid, lru);
3801 if (ret)
3802 break;
3803 } 4695 }
3804 } 4696 }
3805 if (ret)
3806 break;
3807 } 4697 }
3808 mem_cgroup_end_move(memcg); 4698 mem_cgroup_end_move(memcg);
3809 memcg_oom_recover(memcg); 4699 memcg_oom_recover(memcg);
3810 cond_resched(); 4700 cond_resched();
3811 /* "ret" should also be checked to ensure all lists are empty. */
3812 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
3813out:
3814 css_put(&memcg->css);
3815 return ret;
3816 4701
3817try_to_free: 4702 /*
4703 * Kernel memory may not necessarily be trackable to a specific
4704 * process. So they are not migrated, and therefore we can't
4705 * expect their value to drop to 0 here.
4706 * Having res filled up with kmem only is enough.
4707 *
4708 * This is a safety check because mem_cgroup_force_empty_list
4709 * could have raced with mem_cgroup_replace_page_cache callers
4710 * so the lru seemed empty but the page could have been added
4711 * right after the check. RES_USAGE should be safe as we always
4712 * charge before adding to the LRU.
4713 */
4714 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4715 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4716 } while (usage > 0);
4717}
4718
4719/*
4720 * Reclaims as many pages from the given memcg as possible and moves
4721 * the rest to the parent.
4722 *
4723 * Caller is responsible for holding css reference for memcg.
4724 */
4725static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4726{
4727 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4728 struct cgroup *cgrp = memcg->css.cgroup;
4729
3818 /* returns EBUSY if there is a task or if we come here twice. */ 4730 /* returns EBUSY if there is a task or if we come here twice. */
3819 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 4731 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3820 ret = -EBUSY; 4732 return -EBUSY;
3821 goto out; 4733
3822 }
3823 /* we call try-to-free pages for make this cgroup empty */ 4734 /* we call try-to-free pages for make this cgroup empty */
3824 lru_add_drain_all(); 4735 lru_add_drain_all();
3825 /* try to free all pages in this cgroup */ 4736 /* try to free all pages in this cgroup */
3826 shrink = 1;
3827 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 4737 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
3828 int progress; 4738 int progress;
3829 4739
3830 if (signal_pending(current)) { 4740 if (signal_pending(current))
3831 ret = -EINTR; 4741 return -EINTR;
3832 goto out; 4742
3833 }
3834 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 4743 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3835 false); 4744 false);
3836 if (!progress) { 4745 if (!progress) {
@@ -3841,13 +4750,23 @@ try_to_free:
3841 4750
3842 } 4751 }
3843 lru_add_drain(); 4752 lru_add_drain();
3844 /* try move_account...there may be some *locked* pages. */ 4753 mem_cgroup_reparent_charges(memcg);
3845 goto move_account; 4754
4755 return 0;
3846} 4756}
3847 4757
3848static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 4758static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3849{ 4759{
3850 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 4760 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4761 int ret;
4762
4763 if (mem_cgroup_is_root(memcg))
4764 return -EINVAL;
4765 css_get(&memcg->css);
4766 ret = mem_cgroup_force_empty(memcg);
4767 css_put(&memcg->css);
4768
4769 return ret;
3851} 4770}
3852 4771
3853 4772
@@ -3938,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3938 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4857 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3939 char str[64]; 4858 char str[64];
3940 u64 val; 4859 u64 val;
3941 int type, name, len; 4860 int name, len;
4861 enum res_type type;
3942 4862
3943 type = MEMFILE_TYPE(cft->private); 4863 type = MEMFILE_TYPE(cft->private);
3944 name = MEMFILE_ATTR(cft->private); 4864 name = MEMFILE_ATTR(cft->private);
@@ -3959,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3959 else 4879 else
3960 val = res_counter_read_u64(&memcg->memsw, name); 4880 val = res_counter_read_u64(&memcg->memsw, name);
3961 break; 4881 break;
4882 case _KMEM:
4883 val = res_counter_read_u64(&memcg->kmem, name);
4884 break;
3962 default: 4885 default:
3963 BUG(); 4886 BUG();
3964 } 4887 }
@@ -3966,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3966 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 4889 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3967 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 4890 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3968} 4891}
4892
4893static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4894{
4895 int ret = -EINVAL;
4896#ifdef CONFIG_MEMCG_KMEM
4897 bool must_inc_static_branch = false;
4898
4899 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4900 /*
4901 * For simplicity, we won't allow this to be disabled. It also can't
4902 * be changed if the cgroup has children already, or if tasks had
4903 * already joined.
4904 *
4905 * If tasks join before we set the limit, a person looking at
4906 * kmem.usage_in_bytes will have no way to determine when it took
4907 * place, which makes the value quite meaningless.
4908 *
4909 * After it first became limited, changes in the value of the limit are
4910 * of course permitted.
4911 *
4912 * Taking the cgroup_lock is really offensive, but it is so far the only
4913 * way to guarantee that no children will appear. There are plenty of
4914 * other offenders, and they should all go away. Fine grained locking
4915 * is probably the way to go here. When we are fully hierarchical, we
4916 * can also get rid of the use_hierarchy check.
4917 */
4918 cgroup_lock();
4919 mutex_lock(&set_limit_mutex);
4920 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
4921 if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
4922 !list_empty(&cont->children))) {
4923 ret = -EBUSY;
4924 goto out;
4925 }
4926 ret = res_counter_set_limit(&memcg->kmem, val);
4927 VM_BUG_ON(ret);
4928
4929 ret = memcg_update_cache_sizes(memcg);
4930 if (ret) {
4931 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
4932 goto out;
4933 }
4934 must_inc_static_branch = true;
4935 /*
4936 * kmem charges can outlive the cgroup. In the case of slab
4937 * pages, for instance, a page contain objects from various
4938 * processes, so it is unfeasible to migrate them away. We
4939 * need to reference count the memcg because of that.
4940 */
4941 mem_cgroup_get(memcg);
4942 } else
4943 ret = res_counter_set_limit(&memcg->kmem, val);
4944out:
4945 mutex_unlock(&set_limit_mutex);
4946 cgroup_unlock();
4947
4948 /*
4949 * We are by now familiar with the fact that we can't inc the static
4950 * branch inside cgroup_lock. See disarm functions for details. A
4951 * worker here is overkill, but also wrong: After the limit is set, we
4952 * must start accounting right away. Since this operation can't fail,
4953 * we can safely defer it to here - no rollback will be needed.
4954 *
4955 * The boolean used to control this is also safe, because
4956 * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
4957 * able to set it to true;
4958 */
4959 if (must_inc_static_branch) {
4960 static_key_slow_inc(&memcg_kmem_enabled_key);
4961 /*
4962 * setting the active bit after the inc will guarantee no one
4963 * starts accounting before all call sites are patched
4964 */
4965 memcg_kmem_set_active(memcg);
4966 }
4967
4968#endif
4969 return ret;
4970}
4971
4972static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4973{
4974 int ret = 0;
4975 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4976 if (!parent)
4977 goto out;
4978
4979 memcg->kmem_account_flags = parent->kmem_account_flags;
4980#ifdef CONFIG_MEMCG_KMEM
4981 /*
4982 * When that happen, we need to disable the static branch only on those
4983 * memcgs that enabled it. To achieve this, we would be forced to
4984 * complicate the code by keeping track of which memcgs were the ones
4985 * that actually enabled limits, and which ones got it from its
4986 * parents.
4987 *
4988 * It is a lot simpler just to do static_key_slow_inc() on every child
4989 * that is accounted.
4990 */
4991 if (!memcg_kmem_is_active(memcg))
4992 goto out;
4993
4994 /*
4995 * destroy(), called if we fail, will issue static_key_slow_inc() and
4996 * mem_cgroup_put() if kmem is enabled. We have to either call them
4997 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
4998 * this more consistent, since it always leads to the same destroy path
4999 */
5000 mem_cgroup_get(memcg);
5001 static_key_slow_inc(&memcg_kmem_enabled_key);
5002
5003 mutex_lock(&set_limit_mutex);
5004 ret = memcg_update_cache_sizes(memcg);
5005 mutex_unlock(&set_limit_mutex);
5006#endif
5007out:
5008 return ret;
5009}
5010
3969/* 5011/*
3970 * The user of this function is... 5012 * The user of this function is...
3971 * RES_LIMIT. 5013 * RES_LIMIT.
@@ -3974,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3974 const char *buffer) 5016 const char *buffer)
3975{ 5017{
3976 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5018 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3977 int type, name; 5019 enum res_type type;
5020 int name;
3978 unsigned long long val; 5021 unsigned long long val;
3979 int ret; 5022 int ret;
3980 5023
@@ -3996,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3996 break; 5039 break;
3997 if (type == _MEM) 5040 if (type == _MEM)
3998 ret = mem_cgroup_resize_limit(memcg, val); 5041 ret = mem_cgroup_resize_limit(memcg, val);
3999 else 5042 else if (type == _MEMSWAP)
4000 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5043 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5044 else if (type == _KMEM)
5045 ret = memcg_update_kmem_limit(cont, val);
5046 else
5047 return -EINVAL;
4001 break; 5048 break;
4002 case RES_SOFT_LIMIT: 5049 case RES_SOFT_LIMIT:
4003 ret = res_counter_memparse_write_strategy(buffer, &val); 5050 ret = res_counter_memparse_write_strategy(buffer, &val);
@@ -4050,7 +5097,8 @@ out:
4050static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 5097static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4051{ 5098{
4052 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5099 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4053 int type, name; 5100 int name;
5101 enum res_type type;
4054 5102
4055 type = MEMFILE_TYPE(event); 5103 type = MEMFILE_TYPE(event);
4056 name = MEMFILE_ATTR(event); 5104 name = MEMFILE_ATTR(event);
@@ -4062,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4062 case RES_MAX_USAGE: 5110 case RES_MAX_USAGE:
4063 if (type == _MEM) 5111 if (type == _MEM)
4064 res_counter_reset_max(&memcg->res); 5112 res_counter_reset_max(&memcg->res);
4065 else 5113 else if (type == _MEMSWAP)
4066 res_counter_reset_max(&memcg->memsw); 5114 res_counter_reset_max(&memcg->memsw);
5115 else if (type == _KMEM)
5116 res_counter_reset_max(&memcg->kmem);
5117 else
5118 return -EINVAL;
4067 break; 5119 break;
4068 case RES_FAILCNT: 5120 case RES_FAILCNT:
4069 if (type == _MEM) 5121 if (type == _MEM)
4070 res_counter_reset_failcnt(&memcg->res); 5122 res_counter_reset_failcnt(&memcg->res);
4071 else 5123 else if (type == _MEMSWAP)
4072 res_counter_reset_failcnt(&memcg->memsw); 5124 res_counter_reset_failcnt(&memcg->memsw);
5125 else if (type == _KMEM)
5126 res_counter_reset_failcnt(&memcg->kmem);
5127 else
5128 return -EINVAL;
4073 break; 5129 break;
4074 } 5130 }
4075 5131
@@ -4120,7 +5176,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4120 5176
4121 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 5177 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4122 seq_printf(m, "total=%lu", total_nr); 5178 seq_printf(m, "total=%lu", total_nr);
4123 for_each_node_state(nid, N_HIGH_MEMORY) { 5179 for_each_node_state(nid, N_MEMORY) {
4124 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); 5180 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4125 seq_printf(m, " N%d=%lu", nid, node_nr); 5181 seq_printf(m, " N%d=%lu", nid, node_nr);
4126 } 5182 }
@@ -4128,7 +5184,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4128 5184
4129 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); 5185 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4130 seq_printf(m, "file=%lu", file_nr); 5186 seq_printf(m, "file=%lu", file_nr);
4131 for_each_node_state(nid, N_HIGH_MEMORY) { 5187 for_each_node_state(nid, N_MEMORY) {
4132 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5188 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4133 LRU_ALL_FILE); 5189 LRU_ALL_FILE);
4134 seq_printf(m, " N%d=%lu", nid, node_nr); 5190 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4137,7 +5193,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4137 5193
4138 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); 5194 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4139 seq_printf(m, "anon=%lu", anon_nr); 5195 seq_printf(m, "anon=%lu", anon_nr);
4140 for_each_node_state(nid, N_HIGH_MEMORY) { 5196 for_each_node_state(nid, N_MEMORY) {
4141 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5197 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4142 LRU_ALL_ANON); 5198 LRU_ALL_ANON);
4143 seq_printf(m, " N%d=%lu", nid, node_nr); 5199 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4146,7 +5202,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4146 5202
4147 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 5203 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4148 seq_printf(m, "unevictable=%lu", unevictable_nr); 5204 seq_printf(m, "unevictable=%lu", unevictable_nr);
4149 for_each_node_state(nid, N_HIGH_MEMORY) { 5205 for_each_node_state(nid, N_MEMORY) {
4150 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5206 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4151 BIT(LRU_UNEVICTABLE)); 5207 BIT(LRU_UNEVICTABLE));
4152 seq_printf(m, " N%d=%lu", nid, node_nr); 5208 seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4386,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4386 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5442 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4387 struct mem_cgroup_thresholds *thresholds; 5443 struct mem_cgroup_thresholds *thresholds;
4388 struct mem_cgroup_threshold_ary *new; 5444 struct mem_cgroup_threshold_ary *new;
4389 int type = MEMFILE_TYPE(cft->private); 5445 enum res_type type = MEMFILE_TYPE(cft->private);
4390 u64 threshold, usage; 5446 u64 threshold, usage;
4391 int i, size, ret; 5447 int i, size, ret;
4392 5448
@@ -4469,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4469 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5525 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4470 struct mem_cgroup_thresholds *thresholds; 5526 struct mem_cgroup_thresholds *thresholds;
4471 struct mem_cgroup_threshold_ary *new; 5527 struct mem_cgroup_threshold_ary *new;
4472 int type = MEMFILE_TYPE(cft->private); 5528 enum res_type type = MEMFILE_TYPE(cft->private);
4473 u64 usage; 5529 u64 usage;
4474 int i, j, size; 5530 int i, j, size;
4475 5531
@@ -4547,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4547{ 5603{
4548 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5604 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4549 struct mem_cgroup_eventfd_list *event; 5605 struct mem_cgroup_eventfd_list *event;
4550 int type = MEMFILE_TYPE(cft->private); 5606 enum res_type type = MEMFILE_TYPE(cft->private);
4551 5607
4552 BUG_ON(type != _OOM_TYPE); 5608 BUG_ON(type != _OOM_TYPE);
4553 event = kmalloc(sizeof(*event), GFP_KERNEL); 5609 event = kmalloc(sizeof(*event), GFP_KERNEL);
@@ -4572,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4572{ 5628{
4573 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5629 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4574 struct mem_cgroup_eventfd_list *ev, *tmp; 5630 struct mem_cgroup_eventfd_list *ev, *tmp;
4575 int type = MEMFILE_TYPE(cft->private); 5631 enum res_type type = MEMFILE_TYPE(cft->private);
4576 5632
4577 BUG_ON(type != _OOM_TYPE); 5633 BUG_ON(type != _OOM_TYPE);
4578 5634
@@ -4631,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4631#ifdef CONFIG_MEMCG_KMEM 5687#ifdef CONFIG_MEMCG_KMEM
4632static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5688static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4633{ 5689{
5690 int ret;
5691
5692 memcg->kmemcg_id = -1;
5693 ret = memcg_propagate_kmem(memcg);
5694 if (ret)
5695 return ret;
5696
4634 return mem_cgroup_sockets_init(memcg, ss); 5697 return mem_cgroup_sockets_init(memcg, ss);
4635}; 5698};
4636 5699
4637static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 5700static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4638{ 5701{
4639 mem_cgroup_sockets_destroy(memcg); 5702 mem_cgroup_sockets_destroy(memcg);
5703
5704 memcg_kmem_mark_dead(memcg);
5705
5706 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5707 return;
5708
5709 /*
5710 * Charges already down to 0, undo mem_cgroup_get() done in the charge
5711 * path here, being careful not to race with memcg_uncharge_kmem: it is
5712 * possible that the charges went down to 0 between mark_dead and the
5713 * res_counter read, so in that case, we don't need the put
5714 */
5715 if (memcg_kmem_test_and_clear_dead(memcg))
5716 mem_cgroup_put(memcg);
4640} 5717}
4641#else 5718#else
4642static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5719static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -4745,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = {
4745 .read = mem_cgroup_read, 5822 .read = mem_cgroup_read,
4746 }, 5823 },
4747#endif 5824#endif
5825#ifdef CONFIG_MEMCG_KMEM
5826 {
5827 .name = "kmem.limit_in_bytes",
5828 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5829 .write_string = mem_cgroup_write,
5830 .read = mem_cgroup_read,
5831 },
5832 {
5833 .name = "kmem.usage_in_bytes",
5834 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5835 .read = mem_cgroup_read,
5836 },
5837 {
5838 .name = "kmem.failcnt",
5839 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5840 .trigger = mem_cgroup_reset,
5841 .read = mem_cgroup_read,
5842 },
5843 {
5844 .name = "kmem.max_usage_in_bytes",
5845 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5846 .trigger = mem_cgroup_reset,
5847 .read = mem_cgroup_read,
5848 },
5849#ifdef CONFIG_SLABINFO
5850 {
5851 .name = "kmem.slabinfo",
5852 .read_seq_string = mem_cgroup_slabinfo_read,
5853 },
5854#endif
5855#endif
4748 { }, /* terminate */ 5856 { }, /* terminate */
4749}; 5857};
4750 5858
@@ -4812,16 +5920,29 @@ out_free:
4812} 5920}
4813 5921
4814/* 5922/*
4815 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, 5923 * At destroying mem_cgroup, references from swap_cgroup can remain.
4816 * but in process context. The work_freeing structure is overlaid 5924 * (scanning all at force_empty is too costly...)
4817 * on the rcu_freeing structure, which itself is overlaid on memsw. 5925 *
5926 * Instead of clearing all references at force_empty, we remember
5927 * the number of reference from swap_cgroup and free mem_cgroup when
5928 * it goes down to 0.
5929 *
5930 * Removal of cgroup itself succeeds regardless of refs from swap.
4818 */ 5931 */
4819static void free_work(struct work_struct *work) 5932
5933static void __mem_cgroup_free(struct mem_cgroup *memcg)
4820{ 5934{
4821 struct mem_cgroup *memcg; 5935 int node;
4822 int size = sizeof(struct mem_cgroup); 5936 int size = sizeof(struct mem_cgroup);
4823 5937
4824 memcg = container_of(work, struct mem_cgroup, work_freeing); 5938 mem_cgroup_remove_from_trees(memcg);
5939 free_css_id(&mem_cgroup_subsys, &memcg->css);
5940
5941 for_each_node(node)
5942 free_mem_cgroup_per_zone_info(memcg, node);
5943
5944 free_percpu(memcg->stat);
5945
4825 /* 5946 /*
4826 * We need to make sure that (at least for now), the jump label 5947 * We need to make sure that (at least for now), the jump label
4827 * destruction code runs outside of the cgroup lock. This is because 5948 * destruction code runs outside of the cgroup lock. This is because
@@ -4833,45 +5954,34 @@ static void free_work(struct work_struct *work)
4833 * to move this code around, and make sure it is outside 5954 * to move this code around, and make sure it is outside
4834 * the cgroup_lock. 5955 * the cgroup_lock.
4835 */ 5956 */
4836 disarm_sock_keys(memcg); 5957 disarm_static_keys(memcg);
4837 if (size < PAGE_SIZE) 5958 if (size < PAGE_SIZE)
4838 kfree(memcg); 5959 kfree(memcg);
4839 else 5960 else
4840 vfree(memcg); 5961 vfree(memcg);
4841} 5962}
4842 5963
4843static void free_rcu(struct rcu_head *rcu_head)
4844{
4845 struct mem_cgroup *memcg;
4846
4847 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4848 INIT_WORK(&memcg->work_freeing, free_work);
4849 schedule_work(&memcg->work_freeing);
4850}
4851 5964
4852/* 5965/*
4853 * At destroying mem_cgroup, references from swap_cgroup can remain. 5966 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
4854 * (scanning all at force_empty is too costly...) 5967 * but in process context. The work_freeing structure is overlaid
4855 * 5968 * on the rcu_freeing structure, which itself is overlaid on memsw.
4856 * Instead of clearing all references at force_empty, we remember
4857 * the number of reference from swap_cgroup and free mem_cgroup when
4858 * it goes down to 0.
4859 *
4860 * Removal of cgroup itself succeeds regardless of refs from swap.
4861 */ 5969 */
4862 5970static void free_work(struct work_struct *work)
4863static void __mem_cgroup_free(struct mem_cgroup *memcg)
4864{ 5971{
4865 int node; 5972 struct mem_cgroup *memcg;
4866 5973
4867 mem_cgroup_remove_from_trees(memcg); 5974 memcg = container_of(work, struct mem_cgroup, work_freeing);
4868 free_css_id(&mem_cgroup_subsys, &memcg->css); 5975 __mem_cgroup_free(memcg);
5976}
4869 5977
4870 for_each_node(node) 5978static void free_rcu(struct rcu_head *rcu_head)
4871 free_mem_cgroup_per_zone_info(memcg, node); 5979{
5980 struct mem_cgroup *memcg;
4872 5981
4873 free_percpu(memcg->stat); 5982 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4874 call_rcu(&memcg->rcu_freeing, free_rcu); 5983 INIT_WORK(&memcg->work_freeing, free_work);
5984 schedule_work(&memcg->work_freeing);
4875} 5985}
4876 5986
4877static void mem_cgroup_get(struct mem_cgroup *memcg) 5987static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -4883,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4883{ 5993{
4884 if (atomic_sub_and_test(count, &memcg->refcnt)) { 5994 if (atomic_sub_and_test(count, &memcg->refcnt)) {
4885 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5995 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4886 __mem_cgroup_free(memcg); 5996 call_rcu(&memcg->rcu_freeing, free_rcu);
4887 if (parent) 5997 if (parent)
4888 mem_cgroup_put(parent); 5998 mem_cgroup_put(parent);
4889 } 5999 }
@@ -4953,7 +6063,7 @@ err_cleanup:
4953} 6063}
4954 6064
4955static struct cgroup_subsys_state * __ref 6065static struct cgroup_subsys_state * __ref
4956mem_cgroup_create(struct cgroup *cont) 6066mem_cgroup_css_alloc(struct cgroup *cont)
4957{ 6067{
4958 struct mem_cgroup *memcg, *parent; 6068 struct mem_cgroup *memcg, *parent;
4959 long error = -ENOMEM; 6069 long error = -ENOMEM;
@@ -4980,7 +6090,6 @@ mem_cgroup_create(struct cgroup *cont)
4980 &per_cpu(memcg_stock, cpu); 6090 &per_cpu(memcg_stock, cpu);
4981 INIT_WORK(&stock->work, drain_local_stock); 6091 INIT_WORK(&stock->work, drain_local_stock);
4982 } 6092 }
4983 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4984 } else { 6093 } else {
4985 parent = mem_cgroup_from_cont(cont->parent); 6094 parent = mem_cgroup_from_cont(cont->parent);
4986 memcg->use_hierarchy = parent->use_hierarchy; 6095 memcg->use_hierarchy = parent->use_hierarchy;
@@ -4990,6 +6099,8 @@ mem_cgroup_create(struct cgroup *cont)
4990 if (parent && parent->use_hierarchy) { 6099 if (parent && parent->use_hierarchy) {
4991 res_counter_init(&memcg->res, &parent->res); 6100 res_counter_init(&memcg->res, &parent->res);
4992 res_counter_init(&memcg->memsw, &parent->memsw); 6101 res_counter_init(&memcg->memsw, &parent->memsw);
6102 res_counter_init(&memcg->kmem, &parent->kmem);
6103
4993 /* 6104 /*
4994 * We increment refcnt of the parent to ensure that we can 6105 * We increment refcnt of the parent to ensure that we can
4995 * safely access it on res_counter_charge/uncharge. 6106 * safely access it on res_counter_charge/uncharge.
@@ -5000,6 +6111,7 @@ mem_cgroup_create(struct cgroup *cont)
5000 } else { 6111 } else {
5001 res_counter_init(&memcg->res, NULL); 6112 res_counter_init(&memcg->res, NULL);
5002 res_counter_init(&memcg->memsw, NULL); 6113 res_counter_init(&memcg->memsw, NULL);
6114 res_counter_init(&memcg->kmem, NULL);
5003 /* 6115 /*
5004 * Deeper hierachy with use_hierarchy == false doesn't make 6116 * Deeper hierachy with use_hierarchy == false doesn't make
5005 * much sense so let cgroup subsystem know about this 6117 * much sense so let cgroup subsystem know about this
@@ -5034,14 +6146,15 @@ free_out:
5034 return ERR_PTR(error); 6146 return ERR_PTR(error);
5035} 6147}
5036 6148
5037static int mem_cgroup_pre_destroy(struct cgroup *cont) 6149static void mem_cgroup_css_offline(struct cgroup *cont)
5038{ 6150{
5039 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6151 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5040 6152
5041 return mem_cgroup_force_empty(memcg, false); 6153 mem_cgroup_reparent_charges(memcg);
6154 mem_cgroup_destroy_all_caches(memcg);
5042} 6155}
5043 6156
5044static void mem_cgroup_destroy(struct cgroup *cont) 6157static void mem_cgroup_css_free(struct cgroup *cont)
5045{ 6158{
5046 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6159 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5047 6160
@@ -5631,18 +6744,30 @@ static void mem_cgroup_move_task(struct cgroup *cont,
5631struct cgroup_subsys mem_cgroup_subsys = { 6744struct cgroup_subsys mem_cgroup_subsys = {
5632 .name = "memory", 6745 .name = "memory",
5633 .subsys_id = mem_cgroup_subsys_id, 6746 .subsys_id = mem_cgroup_subsys_id,
5634 .create = mem_cgroup_create, 6747 .css_alloc = mem_cgroup_css_alloc,
5635 .pre_destroy = mem_cgroup_pre_destroy, 6748 .css_offline = mem_cgroup_css_offline,
5636 .destroy = mem_cgroup_destroy, 6749 .css_free = mem_cgroup_css_free,
5637 .can_attach = mem_cgroup_can_attach, 6750 .can_attach = mem_cgroup_can_attach,
5638 .cancel_attach = mem_cgroup_cancel_attach, 6751 .cancel_attach = mem_cgroup_cancel_attach,
5639 .attach = mem_cgroup_move_task, 6752 .attach = mem_cgroup_move_task,
5640 .base_cftypes = mem_cgroup_files, 6753 .base_cftypes = mem_cgroup_files,
5641 .early_init = 0, 6754 .early_init = 0,
5642 .use_id = 1, 6755 .use_id = 1,
5643 .__DEPRECATED_clear_css_refs = true,
5644}; 6756};
5645 6757
6758/*
6759 * The rest of init is performed during ->css_alloc() for root css which
6760 * happens before initcalls. hotcpu_notifier() can't be done together as
6761 * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
6762 * dependency. Do it from a subsys_initcall().
6763 */
6764static int __init mem_cgroup_init(void)
6765{
6766 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6767 return 0;
6768}
6769subsys_initcall(mem_cgroup_init);
6770
5646#ifdef CONFIG_MEMCG_SWAP 6771#ifdef CONFIG_MEMCG_SWAP
5647static int __init enable_swap_account(char *s) 6772static int __init enable_swap_account(char *s)
5648{ 6773{
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6c5899b9034a..c6e4dd3e1c08 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
402 struct anon_vma *av; 402 struct anon_vma *av;
403 pgoff_t pgoff; 403 pgoff_t pgoff;
404 404
405 av = page_lock_anon_vma(page); 405 av = page_lock_anon_vma_read(page);
406 if (av == NULL) /* Not actually mapped anymore */ 406 if (av == NULL) /* Not actually mapped anymore */
407 return; 407 return;
408 408
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
423 } 423 }
424 } 424 }
425 read_unlock(&tasklist_lock); 425 read_unlock(&tasklist_lock);
426 page_unlock_anon_vma(av); 426 page_unlock_anon_vma_read(av);
427} 427}
428 428
429/* 429/*
@@ -781,16 +781,16 @@ static struct page_state {
781 { compound, compound, "huge", me_huge_page }, 781 { compound, compound, "huge", me_huge_page },
782#endif 782#endif
783 783
784 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, 784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
785 { sc|dirty, sc, "swapcache", me_swapcache_clean }, 785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
786 786
787 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, 787 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
788 { unevict, unevict, "unevictable LRU", me_pagecache_clean}, 788 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
789 789
790 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, 790 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
791 { mlock, mlock, "mlocked LRU", me_pagecache_clean }, 791 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
792 792
793 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
794 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 794 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
795 795
796 /* 796 /*
@@ -812,14 +812,14 @@ static struct page_state {
812#undef slab 812#undef slab
813#undef reserved 813#undef reserved
814 814
815/*
816 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
817 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
818 */
815static void action_result(unsigned long pfn, char *msg, int result) 819static void action_result(unsigned long pfn, char *msg, int result)
816{ 820{
817 struct page *page = pfn_to_page(pfn); 821 pr_err("MCE %#lx: %s page recovery: %s\n",
818 822 pfn, msg, action_name[result]);
819 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
820 pfn,
821 PageDirty(page) ? "dirty " : "",
822 msg, action_name[result]);
823} 823}
824 824
825static int page_action(struct page_state *ps, struct page *p, 825static int page_action(struct page_state *ps, struct page *p,
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1385 * Isolate the page, so that it doesn't get reallocated if it 1385 * Isolate the page, so that it doesn't get reallocated if it
1386 * was free. 1386 * was free.
1387 */ 1387 */
1388 set_migratetype_isolate(p); 1388 set_migratetype_isolate(p, true);
1389 /* 1389 /*
1390 * When the target page is a free hugepage, just remove it 1390 * When the target page is a free hugepage, just remove it
1391 * from free hugepage list. 1391 * from free hugepage list.
@@ -1476,9 +1476,17 @@ int soft_offline_page(struct page *page, int flags)
1476{ 1476{
1477 int ret; 1477 int ret;
1478 unsigned long pfn = page_to_pfn(page); 1478 unsigned long pfn = page_to_pfn(page);
1479 struct page *hpage = compound_trans_head(page);
1479 1480
1480 if (PageHuge(page)) 1481 if (PageHuge(page))
1481 return soft_offline_huge_page(page, flags); 1482 return soft_offline_huge_page(page, flags);
1483 if (PageTransHuge(hpage)) {
1484 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1485 pr_info("soft offline: %#lx: failed to split THP\n",
1486 pfn);
1487 return -EBUSY;
1488 }
1489 }
1482 1490
1483 ret = get_any_page(page, pfn, flags); 1491 ret = get_any_page(page, pfn, flags);
1484 if (ret < 0) 1492 if (ret < 0)
@@ -1558,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)
1558 page_is_file_cache(page)); 1566 page_is_file_cache(page));
1559 list_add(&page->lru, &pagelist); 1567 list_add(&page->lru, &pagelist);
1560 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1561 false, MIGRATE_SYNC); 1569 false, MIGRATE_SYNC,
1570 MR_MEMORY_FAILURE);
1562 if (ret) { 1571 if (ret) {
1563 putback_lru_pages(&pagelist); 1572 putback_lru_pages(&pagelist);
1564 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1573 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 221fc9ffcab1..bb1369f7b9b4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,8 @@
57#include <linux/swapops.h> 57#include <linux/swapops.h>
58#include <linux/elf.h> 58#include <linux/elf.h>
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60#include <linux/migrate.h>
61#include <linux/string.h>
60 62
61#include <asm/io.h> 63#include <asm/io.h>
62#include <asm/pgalloc.h> 64#include <asm/pgalloc.h>
@@ -182,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
182 return 1; 184 return 1;
183 } 185 }
184 186
187 if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
188 return 0;
189
185 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); 190 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
186 if (!batch) 191 if (!batch)
187 return 0; 192 return 0;
188 193
194 tlb->batch_count++;
189 batch->next = NULL; 195 batch->next = NULL;
190 batch->nr = 0; 196 batch->nr = 0;
191 batch->max = MAX_GATHER_BATCH; 197 batch->max = MAX_GATHER_BATCH;
@@ -214,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
214 tlb->local.nr = 0; 220 tlb->local.nr = 0;
215 tlb->local.max = ARRAY_SIZE(tlb->__pages); 221 tlb->local.max = ARRAY_SIZE(tlb->__pages);
216 tlb->active = &tlb->local; 222 tlb->active = &tlb->local;
223 tlb->batch_count = 0;
217 224
218#ifdef CONFIG_HAVE_RCU_TABLE_FREE 225#ifdef CONFIG_HAVE_RCU_TABLE_FREE
219 tlb->batch = NULL; 226 tlb->batch = NULL;
@@ -717,20 +724,6 @@ static inline bool is_cow_mapping(vm_flags_t flags)
717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 724 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
718} 725}
719 726
720#ifndef is_zero_pfn
721static inline int is_zero_pfn(unsigned long pfn)
722{
723 return pfn == zero_pfn;
724}
725#endif
726
727#ifndef my_zero_pfn
728static inline unsigned long my_zero_pfn(unsigned long addr)
729{
730 return zero_pfn;
731}
732#endif
733
734/* 727/*
735 * vm_normal_page -- This function gets the "struct page" associated with a pte. 728 * vm_normal_page -- This function gets the "struct page" associated with a pte.
736 * 729 *
@@ -1250,7 +1243,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1250 BUG(); 1243 BUG();
1251 } 1244 }
1252#endif 1245#endif
1253 split_huge_page_pmd(vma->vm_mm, pmd); 1246 split_huge_page_pmd(vma, addr, pmd);
1254 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1247 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1255 goto next; 1248 goto next;
1256 /* fall through */ 1249 /* fall through */
@@ -1517,9 +1510,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1517 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1510 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1518 goto out; 1511 goto out;
1519 } 1512 }
1513 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
1514 goto no_page_table;
1520 if (pmd_trans_huge(*pmd)) { 1515 if (pmd_trans_huge(*pmd)) {
1521 if (flags & FOLL_SPLIT) { 1516 if (flags & FOLL_SPLIT) {
1522 split_huge_page_pmd(mm, pmd); 1517 split_huge_page_pmd(vma, address, pmd);
1523 goto split_fallthrough; 1518 goto split_fallthrough;
1524 } 1519 }
1525 spin_lock(&mm->page_table_lock); 1520 spin_lock(&mm->page_table_lock);
@@ -1546,6 +1541,8 @@ split_fallthrough:
1546 pte = *ptep; 1541 pte = *ptep;
1547 if (!pte_present(pte)) 1542 if (!pte_present(pte))
1548 goto no_page; 1543 goto no_page;
1544 if ((flags & FOLL_NUMA) && pte_numa(pte))
1545 goto no_page;
1549 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1546 if ((flags & FOLL_WRITE) && !pte_write(pte))
1550 goto unlock; 1547 goto unlock;
1551 1548
@@ -1697,6 +1694,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1697 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1694 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1698 vm_flags &= (gup_flags & FOLL_FORCE) ? 1695 vm_flags &= (gup_flags & FOLL_FORCE) ?
1699 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1696 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1697
1698 /*
1699 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
1700 * would be called on PROT_NONE ranges. We must never invoke
1701 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
1702 * page faults would unprotect the PROT_NONE ranges if
1703 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
1704 * bitflag. So to avoid that, don't set FOLL_NUMA if
1705 * FOLL_FORCE is set.
1706 */
1707 if (!(gup_flags & FOLL_FORCE))
1708 gup_flags |= FOLL_NUMA;
1709
1700 i = 0; 1710 i = 0;
1701 1711
1702 do { 1712 do {
@@ -2794,13 +2804,8 @@ unlock:
2794oom_free_new: 2804oom_free_new:
2795 page_cache_release(new_page); 2805 page_cache_release(new_page);
2796oom: 2806oom:
2797 if (old_page) { 2807 if (old_page)
2798 if (page_mkwrite) {
2799 unlock_page(old_page);
2800 page_cache_release(old_page);
2801 }
2802 page_cache_release(old_page); 2808 page_cache_release(old_page);
2803 }
2804 return VM_FAULT_OOM; 2809 return VM_FAULT_OOM;
2805 2810
2806unwritable_page: 2811unwritable_page:
@@ -3431,6 +3436,170 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3431 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3436 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3432} 3437}
3433 3438
3439int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3440 unsigned long addr, int current_nid)
3441{
3442 get_page(page);
3443
3444 count_vm_numa_event(NUMA_HINT_FAULTS);
3445 if (current_nid == numa_node_id())
3446 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3447
3448 return mpol_misplaced(page, vma, addr);
3449}
3450
3451int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3452 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3453{
3454 struct page *page = NULL;
3455 spinlock_t *ptl;
3456 int current_nid = -1;
3457 int target_nid;
3458 bool migrated = false;
3459
3460 /*
3461 * The "pte" at this point cannot be used safely without
3462 * validation through pte_unmap_same(). It's of NUMA type but
3463 * the pfn may be screwed if the read is non atomic.
3464 *
3465 * ptep_modify_prot_start is not called as this is clearing
3466 * the _PAGE_NUMA bit and it is not really expected that there
3467 * would be concurrent hardware modifications to the PTE.
3468 */
3469 ptl = pte_lockptr(mm, pmd);
3470 spin_lock(ptl);
3471 if (unlikely(!pte_same(*ptep, pte))) {
3472 pte_unmap_unlock(ptep, ptl);
3473 goto out;
3474 }
3475
3476 pte = pte_mknonnuma(pte);
3477 set_pte_at(mm, addr, ptep, pte);
3478 update_mmu_cache(vma, addr, ptep);
3479
3480 page = vm_normal_page(vma, addr, pte);
3481 if (!page) {
3482 pte_unmap_unlock(ptep, ptl);
3483 return 0;
3484 }
3485
3486 current_nid = page_to_nid(page);
3487 target_nid = numa_migrate_prep(page, vma, addr, current_nid);
3488 pte_unmap_unlock(ptep, ptl);
3489 if (target_nid == -1) {
3490 /*
3491 * Account for the fault against the current node if it not
3492 * being replaced regardless of where the page is located.
3493 */
3494 current_nid = numa_node_id();
3495 put_page(page);
3496 goto out;
3497 }
3498
3499 /* Migrate to the requested node */
3500 migrated = migrate_misplaced_page(page, target_nid);
3501 if (migrated)
3502 current_nid = target_nid;
3503
3504out:
3505 if (current_nid != -1)
3506 task_numa_fault(current_nid, 1, migrated);
3507 return 0;
3508}
3509
3510/* NUMA hinting page fault entry point for regular pmds */
3511#ifdef CONFIG_NUMA_BALANCING
3512static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3513 unsigned long addr, pmd_t *pmdp)
3514{
3515 pmd_t pmd;
3516 pte_t *pte, *orig_pte;
3517 unsigned long _addr = addr & PMD_MASK;
3518 unsigned long offset;
3519 spinlock_t *ptl;
3520 bool numa = false;
3521 int local_nid = numa_node_id();
3522
3523 spin_lock(&mm->page_table_lock);
3524 pmd = *pmdp;
3525 if (pmd_numa(pmd)) {
3526 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3527 numa = true;
3528 }
3529 spin_unlock(&mm->page_table_lock);
3530
3531 if (!numa)
3532 return 0;
3533
3534 /* we're in a page fault so some vma must be in the range */
3535 BUG_ON(!vma);
3536 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3537 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3538 VM_BUG_ON(offset >= PMD_SIZE);
3539 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3540 pte += offset >> PAGE_SHIFT;
3541 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3542 pte_t pteval = *pte;
3543 struct page *page;
3544 int curr_nid = local_nid;
3545 int target_nid;
3546 bool migrated;
3547 if (!pte_present(pteval))
3548 continue;
3549 if (!pte_numa(pteval))
3550 continue;
3551 if (addr >= vma->vm_end) {
3552 vma = find_vma(mm, addr);
3553 /* there's a pte present so there must be a vma */
3554 BUG_ON(!vma);
3555 BUG_ON(addr < vma->vm_start);
3556 }
3557 if (pte_numa(pteval)) {
3558 pteval = pte_mknonnuma(pteval);
3559 set_pte_at(mm, addr, pte, pteval);
3560 }
3561 page = vm_normal_page(vma, addr, pteval);
3562 if (unlikely(!page))
3563 continue;
3564 /* only check non-shared pages */
3565 if (unlikely(page_mapcount(page) != 1))
3566 continue;
3567
3568 /*
3569 * Note that the NUMA fault is later accounted to either
3570 * the node that is currently running or where the page is
3571 * migrated to.
3572 */
3573 curr_nid = local_nid;
3574 target_nid = numa_migrate_prep(page, vma, addr,
3575 page_to_nid(page));
3576 if (target_nid == -1) {
3577 put_page(page);
3578 continue;
3579 }
3580
3581 /* Migrate to the requested node */
3582 pte_unmap_unlock(pte, ptl);
3583 migrated = migrate_misplaced_page(page, target_nid);
3584 if (migrated)
3585 curr_nid = target_nid;
3586 task_numa_fault(curr_nid, 1, migrated);
3587
3588 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3589 }
3590 pte_unmap_unlock(orig_pte, ptl);
3591
3592 return 0;
3593}
3594#else
3595static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3596 unsigned long addr, pmd_t *pmdp)
3597{
3598 BUG();
3599 return 0;
3600}
3601#endif /* CONFIG_NUMA_BALANCING */
3602
3434/* 3603/*
3435 * These routines also need to handle stuff like marking pages dirty 3604 * These routines also need to handle stuff like marking pages dirty
3436 * and/or accessed for architectures that don't do it in hardware (most 3605 * and/or accessed for architectures that don't do it in hardware (most
@@ -3469,6 +3638,9 @@ int handle_pte_fault(struct mm_struct *mm,
3469 pte, pmd, flags, entry); 3638 pte, pmd, flags, entry);
3470 } 3639 }
3471 3640
3641 if (pte_numa(entry))
3642 return do_numa_page(mm, vma, address, entry, pte, pmd);
3643
3472 ptl = pte_lockptr(mm, pmd); 3644 ptl = pte_lockptr(mm, pmd);
3473 spin_lock(ptl); 3645 spin_lock(ptl);
3474 if (unlikely(!pte_same(*pte, entry))) 3646 if (unlikely(!pte_same(*pte, entry)))
@@ -3537,9 +3709,21 @@ retry:
3537 3709
3538 barrier(); 3710 barrier();
3539 if (pmd_trans_huge(orig_pmd)) { 3711 if (pmd_trans_huge(orig_pmd)) {
3540 if (flags & FAULT_FLAG_WRITE && 3712 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3541 !pmd_write(orig_pmd) && 3713
3542 !pmd_trans_splitting(orig_pmd)) { 3714 /*
3715 * If the pmd is splitting, return and retry the
3716 * the fault. Alternative: wait until the split
3717 * is done, and goto retry.
3718 */
3719 if (pmd_trans_splitting(orig_pmd))
3720 return 0;
3721
3722 if (pmd_numa(orig_pmd))
3723 return do_huge_pmd_numa_page(mm, vma, address,
3724 orig_pmd, pmd);
3725
3726 if (dirty && !pmd_write(orig_pmd)) {
3543 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3727 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3544 orig_pmd); 3728 orig_pmd);
3545 /* 3729 /*
@@ -3550,17 +3734,25 @@ retry:
3550 if (unlikely(ret & VM_FAULT_OOM)) 3734 if (unlikely(ret & VM_FAULT_OOM))
3551 goto retry; 3735 goto retry;
3552 return ret; 3736 return ret;
3737 } else {
3738 huge_pmd_set_accessed(mm, vma, address, pmd,
3739 orig_pmd, dirty);
3553 } 3740 }
3741
3554 return 0; 3742 return 0;
3555 } 3743 }
3556 } 3744 }
3557 3745
3746 if (pmd_numa(*pmd))
3747 return do_pmd_numa_page(mm, vma, address, pmd);
3748
3558 /* 3749 /*
3559 * Use __pte_alloc instead of pte_alloc_map, because we can't 3750 * Use __pte_alloc instead of pte_alloc_map, because we can't
3560 * run pte_offset_map on the pmd, if an huge pmd could 3751 * run pte_offset_map on the pmd, if an huge pmd could
3561 * materialize from under us from a different thread. 3752 * materialize from under us from a different thread.
3562 */ 3753 */
3563 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) 3754 if (unlikely(pmd_none(*pmd)) &&
3755 unlikely(__pte_alloc(mm, vma, pmd, address)))
3564 return VM_FAULT_OOM; 3756 return VM_FAULT_OOM;
3565 /* if an huge pmd materialized from under us just retry later */ 3757 /* if an huge pmd materialized from under us just retry later */
3566 if (unlikely(pmd_trans_huge(*pmd))) 3758 if (unlikely(pmd_trans_huge(*pmd)))
@@ -3940,15 +4132,12 @@ void print_vma_addr(char *prefix, unsigned long ip)
3940 struct file *f = vma->vm_file; 4132 struct file *f = vma->vm_file;
3941 char *buf = (char *)__get_free_page(GFP_KERNEL); 4133 char *buf = (char *)__get_free_page(GFP_KERNEL);
3942 if (buf) { 4134 if (buf) {
3943 char *p, *s; 4135 char *p;
3944 4136
3945 p = d_path(&f->f_path, buf, PAGE_SIZE); 4137 p = d_path(&f->f_path, buf, PAGE_SIZE);
3946 if (IS_ERR(p)) 4138 if (IS_ERR(p))
3947 p = "?"; 4139 p = "?";
3948 s = strrchr(p, '/'); 4140 printk("%s%s[%lx+%lx]", prefix, kbasename(p),
3949 if (s)
3950 p = s+1;
3951 printk("%s%s[%lx+%lx]", prefix, p,
3952 vma->vm_start, 4141 vma->vm_start,
3953 vma->vm_end - vma->vm_start); 4142 vma->vm_end - vma->vm_start);
3954 free_page((unsigned long)buf); 4143 free_page((unsigned long)buf);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4eeacae2b91..d04ed87bfacb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
106void __ref put_page_bootmem(struct page *page) 106void __ref put_page_bootmem(struct page *page)
107{ 107{
108 unsigned long type; 108 unsigned long type;
109 static DEFINE_MUTEX(ppb_lock);
109 110
110 type = (unsigned long) page->lru.next; 111 type = (unsigned long) page->lru.next;
111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page)
115 ClearPagePrivate(page); 116 ClearPagePrivate(page);
116 set_page_private(page, 0); 117 set_page_private(page, 0);
117 INIT_LIST_HEAD(&page->lru); 118 INIT_LIST_HEAD(&page->lru);
119
120 /*
121 * Please refer to comment for __free_pages_bootmem()
122 * for why we serialize here.
123 */
124 mutex_lock(&ppb_lock);
118 __free_pages_bootmem(page, 0); 125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock);
119 } 127 }
120 128
121} 129}
@@ -205,7 +213,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
205 zone_span_writelock(zone); 213 zone_span_writelock(zone);
206 214
207 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 215 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
208 if (start_pfn < zone->zone_start_pfn) 216 if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
209 zone->zone_start_pfn = start_pfn; 217 zone->zone_start_pfn = start_pfn;
210 218
211 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 219 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -214,13 +222,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
214 zone_span_writeunlock(zone); 222 zone_span_writeunlock(zone);
215} 223}
216 224
225static void resize_zone(struct zone *zone, unsigned long start_pfn,
226 unsigned long end_pfn)
227{
228 zone_span_writelock(zone);
229
230 if (end_pfn - start_pfn) {
231 zone->zone_start_pfn = start_pfn;
232 zone->spanned_pages = end_pfn - start_pfn;
233 } else {
234 /*
235 * make it consist as free_area_init_core(),
236 * if spanned_pages = 0, then keep start_pfn = 0
237 */
238 zone->zone_start_pfn = 0;
239 zone->spanned_pages = 0;
240 }
241
242 zone_span_writeunlock(zone);
243}
244
245static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
246 unsigned long end_pfn)
247{
248 enum zone_type zid = zone_idx(zone);
249 int nid = zone->zone_pgdat->node_id;
250 unsigned long pfn;
251
252 for (pfn = start_pfn; pfn < end_pfn; pfn++)
253 set_page_links(pfn_to_page(pfn), zid, nid, pfn);
254}
255
256static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
257 unsigned long start_pfn, unsigned long end_pfn)
258{
259 int ret;
260 unsigned long flags;
261 unsigned long z1_start_pfn;
262
263 if (!z1->wait_table) {
264 ret = init_currently_empty_zone(z1, start_pfn,
265 end_pfn - start_pfn, MEMMAP_HOTPLUG);
266 if (ret)
267 return ret;
268 }
269
270 pgdat_resize_lock(z1->zone_pgdat, &flags);
271
272 /* can't move pfns which are higher than @z2 */
273 if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
274 goto out_fail;
275 /* the move out part mast at the left most of @z2 */
276 if (start_pfn > z2->zone_start_pfn)
277 goto out_fail;
278 /* must included/overlap */
279 if (end_pfn <= z2->zone_start_pfn)
280 goto out_fail;
281
282 /* use start_pfn for z1's start_pfn if z1 is empty */
283 if (z1->spanned_pages)
284 z1_start_pfn = z1->zone_start_pfn;
285 else
286 z1_start_pfn = start_pfn;
287
288 resize_zone(z1, z1_start_pfn, end_pfn);
289 resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
290
291 pgdat_resize_unlock(z1->zone_pgdat, &flags);
292
293 fix_zone_id(z1, start_pfn, end_pfn);
294
295 return 0;
296out_fail:
297 pgdat_resize_unlock(z1->zone_pgdat, &flags);
298 return -1;
299}
300
301static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
302 unsigned long start_pfn, unsigned long end_pfn)
303{
304 int ret;
305 unsigned long flags;
306 unsigned long z2_end_pfn;
307
308 if (!z2->wait_table) {
309 ret = init_currently_empty_zone(z2, start_pfn,
310 end_pfn - start_pfn, MEMMAP_HOTPLUG);
311 if (ret)
312 return ret;
313 }
314
315 pgdat_resize_lock(z1->zone_pgdat, &flags);
316
317 /* can't move pfns which are lower than @z1 */
318 if (z1->zone_start_pfn > start_pfn)
319 goto out_fail;
320 /* the move out part mast at the right most of @z1 */
321 if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
322 goto out_fail;
323 /* must included/overlap */
324 if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
325 goto out_fail;
326
327 /* use end_pfn for z2's end_pfn if z2 is empty */
328 if (z2->spanned_pages)
329 z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
330 else
331 z2_end_pfn = end_pfn;
332
333 resize_zone(z1, z1->zone_start_pfn, start_pfn);
334 resize_zone(z2, start_pfn, z2_end_pfn);
335
336 pgdat_resize_unlock(z1->zone_pgdat, &flags);
337
338 fix_zone_id(z2, start_pfn, end_pfn);
339
340 return 0;
341out_fail:
342 pgdat_resize_unlock(z1->zone_pgdat, &flags);
343 return -1;
344}
345
217static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 346static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
218 unsigned long end_pfn) 347 unsigned long end_pfn)
219{ 348{
220 unsigned long old_pgdat_end_pfn = 349 unsigned long old_pgdat_end_pfn =
221 pgdat->node_start_pfn + pgdat->node_spanned_pages; 350 pgdat->node_start_pfn + pgdat->node_spanned_pages;
222 351
223 if (start_pfn < pgdat->node_start_pfn) 352 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
224 pgdat->node_start_pfn = start_pfn; 353 pgdat->node_start_pfn = start_pfn;
225 354
226 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 355 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
@@ -460,8 +589,99 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
460 return 0; 589 return 0;
461} 590}
462 591
592#ifdef CONFIG_MOVABLE_NODE
593/*
594 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
595 * normal memory.
596 */
597static bool can_online_high_movable(struct zone *zone)
598{
599 return true;
600}
601#else /* CONFIG_MOVABLE_NODE */
602/* ensure every online node has NORMAL memory */
603static bool can_online_high_movable(struct zone *zone)
604{
605 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
606}
607#endif /* CONFIG_MOVABLE_NODE */
463 608
464int __ref online_pages(unsigned long pfn, unsigned long nr_pages) 609/* check which state of node_states will be changed when online memory */
610static void node_states_check_changes_online(unsigned long nr_pages,
611 struct zone *zone, struct memory_notify *arg)
612{
613 int nid = zone_to_nid(zone);
614 enum zone_type zone_last = ZONE_NORMAL;
615
616 /*
617 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
618 * contains nodes which have zones of 0...ZONE_NORMAL,
619 * set zone_last to ZONE_NORMAL.
620 *
621 * If we don't have HIGHMEM nor movable node,
622 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
623 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
624 */
625 if (N_MEMORY == N_NORMAL_MEMORY)
626 zone_last = ZONE_MOVABLE;
627
628 /*
629 * if the memory to be online is in a zone of 0...zone_last, and
630 * the zones of 0...zone_last don't have memory before online, we will
631 * need to set the node to node_states[N_NORMAL_MEMORY] after
632 * the memory is online.
633 */
634 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
635 arg->status_change_nid_normal = nid;
636 else
637 arg->status_change_nid_normal = -1;
638
639#ifdef CONFIG_HIGHMEM
640 /*
641 * If we have movable node, node_states[N_HIGH_MEMORY]
642 * contains nodes which have zones of 0...ZONE_HIGHMEM,
643 * set zone_last to ZONE_HIGHMEM.
644 *
645 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
646 * contains nodes which have zones of 0...ZONE_MOVABLE,
647 * set zone_last to ZONE_MOVABLE.
648 */
649 zone_last = ZONE_HIGHMEM;
650 if (N_MEMORY == N_HIGH_MEMORY)
651 zone_last = ZONE_MOVABLE;
652
653 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
654 arg->status_change_nid_high = nid;
655 else
656 arg->status_change_nid_high = -1;
657#else
658 arg->status_change_nid_high = arg->status_change_nid_normal;
659#endif
660
661 /*
662 * if the node don't have memory befor online, we will need to
663 * set the node to node_states[N_MEMORY] after the memory
664 * is online.
665 */
666 if (!node_state(nid, N_MEMORY))
667 arg->status_change_nid = nid;
668 else
669 arg->status_change_nid = -1;
670}
671
672static void node_states_set_node(int node, struct memory_notify *arg)
673{
674 if (arg->status_change_nid_normal >= 0)
675 node_set_state(node, N_NORMAL_MEMORY);
676
677 if (arg->status_change_nid_high >= 0)
678 node_set_state(node, N_HIGH_MEMORY);
679
680 node_set_state(node, N_MEMORY);
681}
682
683
684int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
465{ 685{
466 unsigned long onlined_pages = 0; 686 unsigned long onlined_pages = 0;
467 struct zone *zone; 687 struct zone *zone;
@@ -471,13 +691,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
471 struct memory_notify arg; 691 struct memory_notify arg;
472 692
473 lock_memory_hotplug(); 693 lock_memory_hotplug();
694 /*
695 * This doesn't need a lock to do pfn_to_page().
696 * The section can't be removed here because of the
697 * memory_block->state_mutex.
698 */
699 zone = page_zone(pfn_to_page(pfn));
700
701 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
702 !can_online_high_movable(zone)) {
703 unlock_memory_hotplug();
704 return -1;
705 }
706
707 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
708 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
709 unlock_memory_hotplug();
710 return -1;
711 }
712 }
713 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
714 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
715 unlock_memory_hotplug();
716 return -1;
717 }
718 }
719
720 /* Previous code may changed the zone of the pfn range */
721 zone = page_zone(pfn_to_page(pfn));
722
474 arg.start_pfn = pfn; 723 arg.start_pfn = pfn;
475 arg.nr_pages = nr_pages; 724 arg.nr_pages = nr_pages;
476 arg.status_change_nid = -1; 725 node_states_check_changes_online(nr_pages, zone, &arg);
477 726
478 nid = page_to_nid(pfn_to_page(pfn)); 727 nid = page_to_nid(pfn_to_page(pfn));
479 if (node_present_pages(nid) == 0)
480 arg.status_change_nid = nid;
481 728
482 ret = memory_notify(MEM_GOING_ONLINE, &arg); 729 ret = memory_notify(MEM_GOING_ONLINE, &arg);
483 ret = notifier_to_errno(ret); 730 ret = notifier_to_errno(ret);
@@ -487,23 +734,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
487 return ret; 734 return ret;
488 } 735 }
489 /* 736 /*
490 * This doesn't need a lock to do pfn_to_page().
491 * The section can't be removed here because of the
492 * memory_block->state_mutex.
493 */
494 zone = page_zone(pfn_to_page(pfn));
495 /*
496 * If this zone is not populated, then it is not in zonelist. 737 * If this zone is not populated, then it is not in zonelist.
497 * This means the page allocator ignores this zone. 738 * This means the page allocator ignores this zone.
498 * So, zonelist must be updated after online. 739 * So, zonelist must be updated after online.
499 */ 740 */
500 mutex_lock(&zonelists_mutex); 741 mutex_lock(&zonelists_mutex);
501 if (!populated_zone(zone)) 742 if (!populated_zone(zone)) {
502 need_zonelists_rebuild = 1; 743 need_zonelists_rebuild = 1;
744 build_all_zonelists(NULL, zone);
745 }
503 746
504 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 747 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
505 online_pages_range); 748 online_pages_range);
506 if (ret) { 749 if (ret) {
750 if (need_zonelists_rebuild)
751 zone_pcp_reset(zone);
507 mutex_unlock(&zonelists_mutex); 752 mutex_unlock(&zonelists_mutex);
508 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", 753 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
509 (unsigned long long) pfn << PAGE_SHIFT, 754 (unsigned long long) pfn << PAGE_SHIFT,
@@ -514,12 +759,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
514 return ret; 759 return ret;
515 } 760 }
516 761
762 zone->managed_pages += onlined_pages;
517 zone->present_pages += onlined_pages; 763 zone->present_pages += onlined_pages;
518 zone->zone_pgdat->node_present_pages += onlined_pages; 764 zone->zone_pgdat->node_present_pages += onlined_pages;
519 if (onlined_pages) { 765 if (onlined_pages) {
520 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 766 node_states_set_node(zone_to_nid(zone), &arg);
521 if (need_zonelists_rebuild) 767 if (need_zonelists_rebuild)
522 build_all_zonelists(NULL, zone); 768 build_all_zonelists(NULL, NULL);
523 else 769 else
524 zone_pcp_update(zone); 770 zone_pcp_update(zone);
525 } 771 }
@@ -812,7 +1058,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
812 * migrate_pages returns # of failed pages. 1058 * migrate_pages returns # of failed pages.
813 */ 1059 */
814 ret = migrate_pages(&source, alloc_migrate_target, 0, 1060 ret = migrate_pages(&source, alloc_migrate_target, 0,
815 true, MIGRATE_SYNC); 1061 true, MIGRATE_SYNC,
1062 MR_MEMORY_HOTPLUG);
816 if (ret) 1063 if (ret)
817 putback_lru_pages(&source); 1064 putback_lru_pages(&source);
818 } 1065 }
@@ -847,7 +1094,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
847{ 1094{
848 int ret; 1095 int ret;
849 long offlined = *(long *)data; 1096 long offlined = *(long *)data;
850 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); 1097 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
851 offlined = nr_pages; 1098 offlined = nr_pages;
852 if (!ret) 1099 if (!ret)
853 *(long *)data += offlined; 1100 *(long *)data += offlined;
@@ -867,6 +1114,132 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
867 return offlined; 1114 return offlined;
868} 1115}
869 1116
1117#ifdef CONFIG_MOVABLE_NODE
1118/*
1119 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
1120 * normal memory.
1121 */
1122static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1123{
1124 return true;
1125}
1126#else /* CONFIG_MOVABLE_NODE */
1127/* ensure the node has NORMAL memory if it is still online */
1128static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1129{
1130 struct pglist_data *pgdat = zone->zone_pgdat;
1131 unsigned long present_pages = 0;
1132 enum zone_type zt;
1133
1134 for (zt = 0; zt <= ZONE_NORMAL; zt++)
1135 present_pages += pgdat->node_zones[zt].present_pages;
1136
1137 if (present_pages > nr_pages)
1138 return true;
1139
1140 present_pages = 0;
1141 for (; zt <= ZONE_MOVABLE; zt++)
1142 present_pages += pgdat->node_zones[zt].present_pages;
1143
1144 /*
1145 * we can't offline the last normal memory until all
1146 * higher memory is offlined.
1147 */
1148 return present_pages == 0;
1149}
1150#endif /* CONFIG_MOVABLE_NODE */
1151
1152/* check which state of node_states will be changed when offline memory */
1153static void node_states_check_changes_offline(unsigned long nr_pages,
1154 struct zone *zone, struct memory_notify *arg)
1155{
1156 struct pglist_data *pgdat = zone->zone_pgdat;
1157 unsigned long present_pages = 0;
1158 enum zone_type zt, zone_last = ZONE_NORMAL;
1159
1160 /*
1161 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
1162 * contains nodes which have zones of 0...ZONE_NORMAL,
1163 * set zone_last to ZONE_NORMAL.
1164 *
1165 * If we don't have HIGHMEM nor movable node,
1166 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
1167 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1168 */
1169 if (N_MEMORY == N_NORMAL_MEMORY)
1170 zone_last = ZONE_MOVABLE;
1171
1172 /*
1173 * check whether node_states[N_NORMAL_MEMORY] will be changed.
1174 * If the memory to be offline is in a zone of 0...zone_last,
1175 * and it is the last present memory, 0...zone_last will
1176 * become empty after offline , thus we can determind we will
1177 * need to clear the node from node_states[N_NORMAL_MEMORY].
1178 */
1179 for (zt = 0; zt <= zone_last; zt++)
1180 present_pages += pgdat->node_zones[zt].present_pages;
1181 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1182 arg->status_change_nid_normal = zone_to_nid(zone);
1183 else
1184 arg->status_change_nid_normal = -1;
1185
1186#ifdef CONFIG_HIGHMEM
1187 /*
1188 * If we have movable node, node_states[N_HIGH_MEMORY]
1189 * contains nodes which have zones of 0...ZONE_HIGHMEM,
1190 * set zone_last to ZONE_HIGHMEM.
1191 *
1192 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
1193 * contains nodes which have zones of 0...ZONE_MOVABLE,
1194 * set zone_last to ZONE_MOVABLE.
1195 */
1196 zone_last = ZONE_HIGHMEM;
1197 if (N_MEMORY == N_HIGH_MEMORY)
1198 zone_last = ZONE_MOVABLE;
1199
1200 for (; zt <= zone_last; zt++)
1201 present_pages += pgdat->node_zones[zt].present_pages;
1202 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1203 arg->status_change_nid_high = zone_to_nid(zone);
1204 else
1205 arg->status_change_nid_high = -1;
1206#else
1207 arg->status_change_nid_high = arg->status_change_nid_normal;
1208#endif
1209
1210 /*
1211 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1212 */
1213 zone_last = ZONE_MOVABLE;
1214
1215 /*
1216 * check whether node_states[N_HIGH_MEMORY] will be changed
1217 * If we try to offline the last present @nr_pages from the node,
1218 * we can determind we will need to clear the node from
1219 * node_states[N_HIGH_MEMORY].
1220 */
1221 for (; zt <= zone_last; zt++)
1222 present_pages += pgdat->node_zones[zt].present_pages;
1223 if (nr_pages >= present_pages)
1224 arg->status_change_nid = zone_to_nid(zone);
1225 else
1226 arg->status_change_nid = -1;
1227}
1228
1229static void node_states_clear_node(int node, struct memory_notify *arg)
1230{
1231 if (arg->status_change_nid_normal >= 0)
1232 node_clear_state(node, N_NORMAL_MEMORY);
1233
1234 if ((N_MEMORY != N_NORMAL_MEMORY) &&
1235 (arg->status_change_nid_high >= 0))
1236 node_clear_state(node, N_HIGH_MEMORY);
1237
1238 if ((N_MEMORY != N_HIGH_MEMORY) &&
1239 (arg->status_change_nid >= 0))
1240 node_clear_state(node, N_MEMORY);
1241}
1242
870static int __ref __offline_pages(unsigned long start_pfn, 1243static int __ref __offline_pages(unsigned long start_pfn,
871 unsigned long end_pfn, unsigned long timeout) 1244 unsigned long end_pfn, unsigned long timeout)
872{ 1245{
@@ -893,16 +1266,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
893 node = zone_to_nid(zone); 1266 node = zone_to_nid(zone);
894 nr_pages = end_pfn - start_pfn; 1267 nr_pages = end_pfn - start_pfn;
895 1268
1269 ret = -EINVAL;
1270 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
1271 goto out;
1272
896 /* set above range as isolated */ 1273 /* set above range as isolated */
897 ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1274 ret = start_isolate_page_range(start_pfn, end_pfn,
1275 MIGRATE_MOVABLE, true);
898 if (ret) 1276 if (ret)
899 goto out; 1277 goto out;
900 1278
901 arg.start_pfn = start_pfn; 1279 arg.start_pfn = start_pfn;
902 arg.nr_pages = nr_pages; 1280 arg.nr_pages = nr_pages;
903 arg.status_change_nid = -1; 1281 node_states_check_changes_offline(nr_pages, zone, &arg);
904 if (nr_pages >= node_present_pages(node))
905 arg.status_change_nid = node;
906 1282
907 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1283 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
908 ret = notifier_to_errno(ret); 1284 ret = notifier_to_errno(ret);
@@ -943,10 +1319,10 @@ repeat:
943 goto repeat; 1319 goto repeat;
944 } 1320 }
945 } 1321 }
946 /* drain all zone's lru pagevec, this is asyncronous... */ 1322 /* drain all zone's lru pagevec, this is asynchronous... */
947 lru_add_drain_all(); 1323 lru_add_drain_all();
948 yield(); 1324 yield();
949 /* drain pcp pages , this is synchrouns. */ 1325 /* drain pcp pages, this is synchronous. */
950 drain_all_pages(); 1326 drain_all_pages();
951 /* check again */ 1327 /* check again */
952 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1328 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
@@ -955,12 +1331,13 @@ repeat:
955 goto failed_removal; 1331 goto failed_removal;
956 } 1332 }
957 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); 1333 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
958 /* Ok, all of our target is islaoted. 1334 /* Ok, all of our target is isolated.
959 We cannot do rollback at this point. */ 1335 We cannot do rollback at this point. */
960 offline_isolated_pages(start_pfn, end_pfn); 1336 offline_isolated_pages(start_pfn, end_pfn);
961 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1337 /* reset pagetype flags and makes migrate type to be MOVABLE */
962 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1338 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
963 /* removal success */ 1339 /* removal success */
1340 zone->managed_pages -= offlined_pages;
964 zone->present_pages -= offlined_pages; 1341 zone->present_pages -= offlined_pages;
965 zone->zone_pgdat->node_present_pages -= offlined_pages; 1342 zone->zone_pgdat->node_present_pages -= offlined_pages;
966 totalram_pages -= offlined_pages; 1343 totalram_pages -= offlined_pages;
@@ -975,10 +1352,9 @@ repeat:
975 } else 1352 } else
976 zone_pcp_update(zone); 1353 zone_pcp_update(zone);
977 1354
978 if (!node_present_pages(node)) { 1355 node_states_clear_node(node, &arg);
979 node_clear_state(node, N_HIGH_MEMORY); 1356 if (arg.status_change_nid >= 0)
980 kswapd_stop(node); 1357 kswapd_stop(node);
981 }
982 1358
983 vm_total_pages = nr_free_pagecache_pages(); 1359 vm_total_pages = nr_free_pagecache_pages();
984 writeback_set_ratelimit(); 1360 writeback_set_ratelimit();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d04a8a54c294..e2df1c1fb41f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
90#include <linux/syscalls.h> 90#include <linux/syscalls.h>
91#include <linux/ctype.h> 91#include <linux/ctype.h>
92#include <linux/mm_inline.h> 92#include <linux/mm_inline.h>
93#include <linux/mmu_notifier.h>
93 94
94#include <asm/tlbflush.h> 95#include <asm/tlbflush.h>
95#include <asm/uaccess.h> 96#include <asm/uaccess.h>
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
117 .flags = MPOL_F_LOCAL, 118 .flags = MPOL_F_LOCAL,
118}; 119};
119 120
121static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122
123static struct mempolicy *get_task_policy(struct task_struct *p)
124{
125 struct mempolicy *pol = p->mempolicy;
126 int node;
127
128 if (!pol) {
129 node = numa_node_id();
130 if (node != -1)
131 pol = &preferred_node_policy[node];
132
133 /* preferred_node_policy is not initialised early in boot */
134 if (!pol->mode)
135 pol = NULL;
136 }
137
138 return pol;
139}
140
120static const struct mempolicy_operations { 141static const struct mempolicy_operations {
121 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 142 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122 /* 143 /*
@@ -212,9 +233,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
212 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 233 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
213 if (pol == NULL) 234 if (pol == NULL)
214 return 0; 235 return 0;
215 /* Check N_HIGH_MEMORY */ 236 /* Check N_MEMORY */
216 nodes_and(nsc->mask1, 237 nodes_and(nsc->mask1,
217 cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); 238 cpuset_current_mems_allowed, node_states[N_MEMORY]);
218 239
219 VM_BUG_ON(!nodes); 240 VM_BUG_ON(!nodes);
220 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 241 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
254 if (mode == MPOL_DEFAULT) { 275 if (mode == MPOL_DEFAULT) {
255 if (nodes && !nodes_empty(*nodes)) 276 if (nodes && !nodes_empty(*nodes))
256 return ERR_PTR(-EINVAL); 277 return ERR_PTR(-EINVAL);
257 return NULL; /* simply delete any existing policy */ 278 return NULL;
258 } 279 }
259 VM_BUG_ON(!nodes); 280 VM_BUG_ON(!nodes);
260 281
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
269 (flags & MPOL_F_RELATIVE_NODES))) 290 (flags & MPOL_F_RELATIVE_NODES)))
270 return ERR_PTR(-EINVAL); 291 return ERR_PTR(-EINVAL);
271 } 292 }
293 } else if (mode == MPOL_LOCAL) {
294 if (!nodes_empty(*nodes))
295 return ERR_PTR(-EINVAL);
296 mode = MPOL_PREFERRED;
272 } else if (nodes_empty(*nodes)) 297 } else if (nodes_empty(*nodes))
273 return ERR_PTR(-EINVAL); 298 return ERR_PTR(-EINVAL);
274 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 299 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -511,7 +536,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
511 pmd = pmd_offset(pud, addr); 536 pmd = pmd_offset(pud, addr);
512 do { 537 do {
513 next = pmd_addr_end(addr, end); 538 next = pmd_addr_end(addr, end);
514 split_huge_page_pmd(vma->vm_mm, pmd); 539 split_huge_page_pmd(vma, addr, pmd);
515 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 540 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
516 continue; 541 continue;
517 if (check_pte_range(vma, pmd, addr, next, nodes, 542 if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
561 return 0; 586 return 0;
562} 587}
563 588
589#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
590/*
591 * This is used to mark a range of virtual addresses to be inaccessible.
592 * These are later cleared by a NUMA hinting fault. Depending on these
593 * faults, pages may be migrated for better NUMA placement.
594 *
595 * This is assuming that NUMA faults are handled using PROT_NONE. If
596 * an architecture makes a different choice, it will need further
597 * changes to the core.
598 */
599unsigned long change_prot_numa(struct vm_area_struct *vma,
600 unsigned long addr, unsigned long end)
601{
602 int nr_updated;
603 BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
604
605 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
606 if (nr_updated)
607 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
608
609 return nr_updated;
610}
611#else
612static unsigned long change_prot_numa(struct vm_area_struct *vma,
613 unsigned long addr, unsigned long end)
614{
615 return 0;
616}
617#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
618
564/* 619/*
565 * Check if all pages in a range are on a set of nodes. 620 * Check if all pages in a range are on a set of nodes.
566 * If pagelist != NULL then isolate pages from the LRU and 621 * If pagelist != NULL then isolate pages from the LRU and
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
579 return ERR_PTR(-EFAULT); 634 return ERR_PTR(-EFAULT);
580 prev = NULL; 635 prev = NULL;
581 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 636 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
637 unsigned long endvma = vma->vm_end;
638
639 if (endvma > end)
640 endvma = end;
641 if (vma->vm_start > start)
642 start = vma->vm_start;
643
582 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 644 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
583 if (!vma->vm_next && vma->vm_end < end) 645 if (!vma->vm_next && vma->vm_end < end)
584 return ERR_PTR(-EFAULT); 646 return ERR_PTR(-EFAULT);
585 if (prev && prev->vm_end < vma->vm_start) 647 if (prev && prev->vm_end < vma->vm_start)
586 return ERR_PTR(-EFAULT); 648 return ERR_PTR(-EFAULT);
587 } 649 }
588 if (!is_vm_hugetlb_page(vma) && 650
589 ((flags & MPOL_MF_STRICT) || 651 if (is_vm_hugetlb_page(vma))
652 goto next;
653
654 if (flags & MPOL_MF_LAZY) {
655 change_prot_numa(vma, start, endvma);
656 goto next;
657 }
658
659 if ((flags & MPOL_MF_STRICT) ||
590 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 660 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
591 vma_migratable(vma)))) { 661 vma_migratable(vma))) {
592 unsigned long endvma = vma->vm_end;
593 662
594 if (endvma > end)
595 endvma = end;
596 if (vma->vm_start > start)
597 start = vma->vm_start;
598 err = check_pgd_range(vma, start, endvma, nodes, 663 err = check_pgd_range(vma, start, endvma, nodes,
599 flags, private); 664 flags, private);
600 if (err) { 665 if (err) {
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
602 break; 667 break;
603 } 668 }
604 } 669 }
670next:
605 prev = vma; 671 prev = vma;
606 } 672 }
607 return first; 673 return first;
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
961 1027
962 if (!list_empty(&pagelist)) { 1028 if (!list_empty(&pagelist)) {
963 err = migrate_pages(&pagelist, new_node_page, dest, 1029 err = migrate_pages(&pagelist, new_node_page, dest,
964 false, MIGRATE_SYNC); 1030 false, MIGRATE_SYNC,
1031 MR_SYSCALL);
965 if (err) 1032 if (err)
966 putback_lru_pages(&pagelist); 1033 putback_lru_pages(&pagelist);
967 } 1034 }
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1133 int err; 1200 int err;
1134 LIST_HEAD(pagelist); 1201 LIST_HEAD(pagelist);
1135 1202
1136 if (flags & ~(unsigned long)(MPOL_MF_STRICT | 1203 if (flags & ~(unsigned long)MPOL_MF_VALID)
1137 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1138 return -EINVAL; 1204 return -EINVAL;
1139 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1205 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1140 return -EPERM; 1206 return -EPERM;
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
1157 if (IS_ERR(new)) 1223 if (IS_ERR(new))
1158 return PTR_ERR(new); 1224 return PTR_ERR(new);
1159 1225
1226 if (flags & MPOL_MF_LAZY)
1227 new->flags |= MPOL_F_MOF;
1228
1160 /* 1229 /*
1161 * If we are using the default policy then operation 1230 * If we are using the default policy then operation
1162 * on discontinuous address spaces is okay after all 1231 * on discontinuous address spaces is okay after all
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
1193 vma = check_range(mm, start, end, nmask, 1262 vma = check_range(mm, start, end, nmask,
1194 flags | MPOL_MF_INVERT, &pagelist); 1263 flags | MPOL_MF_INVERT, &pagelist);
1195 1264
1196 err = PTR_ERR(vma); 1265 err = PTR_ERR(vma); /* maybe ... */
1197 if (!IS_ERR(vma)) { 1266 if (!IS_ERR(vma))
1198 int nr_failed = 0;
1199
1200 err = mbind_range(mm, start, end, new); 1267 err = mbind_range(mm, start, end, new);
1201 1268
1269 if (!err) {
1270 int nr_failed = 0;
1271
1202 if (!list_empty(&pagelist)) { 1272 if (!list_empty(&pagelist)) {
1273 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1203 nr_failed = migrate_pages(&pagelist, new_vma_page, 1274 nr_failed = migrate_pages(&pagelist, new_vma_page,
1204 (unsigned long)vma, 1275 (unsigned long)vma,
1205 false, MIGRATE_SYNC); 1276 false, MIGRATE_SYNC,
1277 MR_MEMPOLICY_MBIND);
1206 if (nr_failed) 1278 if (nr_failed)
1207 putback_lru_pages(&pagelist); 1279 putback_lru_pages(&pagelist);
1208 } 1280 }
1209 1281
1210 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1282 if (nr_failed && (flags & MPOL_MF_STRICT))
1211 err = -EIO; 1283 err = -EIO;
1212 } else 1284 } else
1213 putback_lru_pages(&pagelist); 1285 putback_lru_pages(&pagelist);
@@ -1388,7 +1460,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1388 goto out_put; 1460 goto out_put;
1389 } 1461 }
1390 1462
1391 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { 1463 if (!nodes_subset(*new, node_states[N_MEMORY])) {
1392 err = -EINVAL; 1464 err = -EINVAL;
1393 goto out_put; 1465 goto out_put;
1394 } 1466 }
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1546struct mempolicy *get_vma_policy(struct task_struct *task, 1618struct mempolicy *get_vma_policy(struct task_struct *task,
1547 struct vm_area_struct *vma, unsigned long addr) 1619 struct vm_area_struct *vma, unsigned long addr)
1548{ 1620{
1549 struct mempolicy *pol = task->mempolicy; 1621 struct mempolicy *pol = get_task_policy(task);
1550 1622
1551 if (vma) { 1623 if (vma) {
1552 if (vma->vm_ops && vma->vm_ops->get_policy) { 1624 if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1907,7 +1979,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1907 unsigned long addr, int node) 1979 unsigned long addr, int node)
1908{ 1980{
1909 struct mempolicy *pol; 1981 struct mempolicy *pol;
1910 struct zonelist *zl;
1911 struct page *page; 1982 struct page *page;
1912 unsigned int cpuset_mems_cookie; 1983 unsigned int cpuset_mems_cookie;
1913 1984
@@ -1926,23 +1997,11 @@ retry_cpuset:
1926 1997
1927 return page; 1998 return page;
1928 } 1999 }
1929 zl = policy_zonelist(gfp, pol, node); 2000 page = __alloc_pages_nodemask(gfp, order,
1930 if (unlikely(mpol_needs_cond_ref(pol))) { 2001 policy_zonelist(gfp, pol, node),
1931 /*
1932 * slow path: ref counted shared policy
1933 */
1934 struct page *page = __alloc_pages_nodemask(gfp, order,
1935 zl, policy_nodemask(gfp, pol));
1936 __mpol_put(pol);
1937 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1938 goto retry_cpuset;
1939 return page;
1940 }
1941 /*
1942 * fast path: default or task policy
1943 */
1944 page = __alloc_pages_nodemask(gfp, order, zl,
1945 policy_nodemask(gfp, pol)); 2002 policy_nodemask(gfp, pol));
2003 if (unlikely(mpol_needs_cond_ref(pol)))
2004 __mpol_put(pol);
1946 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2005 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1947 goto retry_cpuset; 2006 goto retry_cpuset;
1948 return page; 2007 return page;
@@ -1969,7 +2028,7 @@ retry_cpuset:
1969 */ 2028 */
1970struct page *alloc_pages_current(gfp_t gfp, unsigned order) 2029struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1971{ 2030{
1972 struct mempolicy *pol = current->mempolicy; 2031 struct mempolicy *pol = get_task_policy(current);
1973 struct page *page; 2032 struct page *page;
1974 unsigned int cpuset_mems_cookie; 2033 unsigned int cpuset_mems_cookie;
1975 2034
@@ -2037,28 +2096,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
2037 return new; 2096 return new;
2038} 2097}
2039 2098
2040/*
2041 * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
2042 * eliminate the * MPOL_F_* flags that require conditional ref and
2043 * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
2044 * after return. Use the returned value.
2045 *
2046 * Allows use of a mempolicy for, e.g., multiple allocations with a single
2047 * policy lookup, even if the policy needs/has extra ref on lookup.
2048 * shmem_readahead needs this.
2049 */
2050struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
2051 struct mempolicy *frompol)
2052{
2053 if (!mpol_needs_cond_ref(frompol))
2054 return frompol;
2055
2056 *tompol = *frompol;
2057 tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
2058 __mpol_put(frompol);
2059 return tompol;
2060}
2061
2062/* Slow path of a mempolicy comparison */ 2099/* Slow path of a mempolicy comparison */
2063bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) 2100bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2064{ 2101{
@@ -2095,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2095 */ 2132 */
2096 2133
2097/* lookup first element intersecting start-end */ 2134/* lookup first element intersecting start-end */
2098/* Caller holds sp->mutex */ 2135/* Caller holds sp->lock */
2099static struct sp_node * 2136static struct sp_node *
2100sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2137sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2101{ 2138{
@@ -2159,13 +2196,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2159 2196
2160 if (!sp->root.rb_node) 2197 if (!sp->root.rb_node)
2161 return NULL; 2198 return NULL;
2162 mutex_lock(&sp->mutex); 2199 spin_lock(&sp->lock);
2163 sn = sp_lookup(sp, idx, idx+1); 2200 sn = sp_lookup(sp, idx, idx+1);
2164 if (sn) { 2201 if (sn) {
2165 mpol_get(sn->policy); 2202 mpol_get(sn->policy);
2166 pol = sn->policy; 2203 pol = sn->policy;
2167 } 2204 }
2168 mutex_unlock(&sp->mutex); 2205 spin_unlock(&sp->lock);
2169 return pol; 2206 return pol;
2170} 2207}
2171 2208
@@ -2175,6 +2212,115 @@ static void sp_free(struct sp_node *n)
2175 kmem_cache_free(sn_cache, n); 2212 kmem_cache_free(sn_cache, n);
2176} 2213}
2177 2214
2215/**
2216 * mpol_misplaced - check whether current page node is valid in policy
2217 *
2218 * @page - page to be checked
2219 * @vma - vm area where page mapped
2220 * @addr - virtual address where page mapped
2221 *
2222 * Lookup current policy node id for vma,addr and "compare to" page's
2223 * node id.
2224 *
2225 * Returns:
2226 * -1 - not misplaced, page is in the right node
2227 * node - node id where the page should be
2228 *
2229 * Policy determination "mimics" alloc_page_vma().
2230 * Called from fault path where we know the vma and faulting address.
2231 */
2232int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2233{
2234 struct mempolicy *pol;
2235 struct zone *zone;
2236 int curnid = page_to_nid(page);
2237 unsigned long pgoff;
2238 int polnid = -1;
2239 int ret = -1;
2240
2241 BUG_ON(!vma);
2242
2243 pol = get_vma_policy(current, vma, addr);
2244 if (!(pol->flags & MPOL_F_MOF))
2245 goto out;
2246
2247 switch (pol->mode) {
2248 case MPOL_INTERLEAVE:
2249 BUG_ON(addr >= vma->vm_end);
2250 BUG_ON(addr < vma->vm_start);
2251
2252 pgoff = vma->vm_pgoff;
2253 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2254 polnid = offset_il_node(pol, vma, pgoff);
2255 break;
2256
2257 case MPOL_PREFERRED:
2258 if (pol->flags & MPOL_F_LOCAL)
2259 polnid = numa_node_id();
2260 else
2261 polnid = pol->v.preferred_node;
2262 break;
2263
2264 case MPOL_BIND:
2265 /*
2266 * allows binding to multiple nodes.
2267 * use current page if in policy nodemask,
2268 * else select nearest allowed node, if any.
2269 * If no allowed nodes, use current [!misplaced].
2270 */
2271 if (node_isset(curnid, pol->v.nodes))
2272 goto out;
2273 (void)first_zones_zonelist(
2274 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2275 gfp_zone(GFP_HIGHUSER),
2276 &pol->v.nodes, &zone);
2277 polnid = zone->node;
2278 break;
2279
2280 default:
2281 BUG();
2282 }
2283
2284 /* Migrate the page towards the node whose CPU is referencing it */
2285 if (pol->flags & MPOL_F_MORON) {
2286 int last_nid;
2287
2288 polnid = numa_node_id();
2289
2290 /*
2291 * Multi-stage node selection is used in conjunction
2292 * with a periodic migration fault to build a temporal
2293 * task<->page relation. By using a two-stage filter we
2294 * remove short/unlikely relations.
2295 *
2296 * Using P(p) ~ n_p / n_t as per frequentist
2297 * probability, we can equate a task's usage of a
2298 * particular page (n_p) per total usage of this
2299 * page (n_t) (in a given time-span) to a probability.
2300 *
2301 * Our periodic faults will sample this probability and
2302 * getting the same result twice in a row, given these
2303 * samples are fully independent, is then given by
2304 * P(n)^2, provided our sample period is sufficiently
2305 * short compared to the usage pattern.
2306 *
2307 * This quadric squishes small probabilities, making
2308 * it less likely we act on an unlikely task<->page
2309 * relation.
2310 */
2311 last_nid = page_xchg_last_nid(page, polnid);
2312 if (last_nid != polnid)
2313 goto out;
2314 }
2315
2316 if (curnid != polnid)
2317 ret = polnid;
2318out:
2319 mpol_cond_put(pol);
2320
2321 return ret;
2322}
2323
2178static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2324static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2179{ 2325{
2180 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 2326 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
@@ -2182,6 +2328,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2182 sp_free(n); 2328 sp_free(n);
2183} 2329}
2184 2330
2331static void sp_node_init(struct sp_node *node, unsigned long start,
2332 unsigned long end, struct mempolicy *pol)
2333{
2334 node->start = start;
2335 node->end = end;
2336 node->policy = pol;
2337}
2338
2185static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2339static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2186 struct mempolicy *pol) 2340 struct mempolicy *pol)
2187{ 2341{
@@ -2198,10 +2352,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2198 return NULL; 2352 return NULL;
2199 } 2353 }
2200 newpol->flags |= MPOL_F_SHARED; 2354 newpol->flags |= MPOL_F_SHARED;
2201 2355 sp_node_init(n, start, end, newpol);
2202 n->start = start;
2203 n->end = end;
2204 n->policy = newpol;
2205 2356
2206 return n; 2357 return n;
2207} 2358}
@@ -2211,9 +2362,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2211 unsigned long end, struct sp_node *new) 2362 unsigned long end, struct sp_node *new)
2212{ 2363{
2213 struct sp_node *n; 2364 struct sp_node *n;
2365 struct sp_node *n_new = NULL;
2366 struct mempolicy *mpol_new = NULL;
2214 int ret = 0; 2367 int ret = 0;
2215 2368
2216 mutex_lock(&sp->mutex); 2369restart:
2370 spin_lock(&sp->lock);
2217 n = sp_lookup(sp, start, end); 2371 n = sp_lookup(sp, start, end);
2218 /* Take care of old policies in the same range. */ 2372 /* Take care of old policies in the same range. */
2219 while (n && n->start < end) { 2373 while (n && n->start < end) {
@@ -2226,14 +2380,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2226 } else { 2380 } else {
2227 /* Old policy spanning whole new range. */ 2381 /* Old policy spanning whole new range. */
2228 if (n->end > end) { 2382 if (n->end > end) {
2229 struct sp_node *new2; 2383 if (!n_new)
2230 new2 = sp_alloc(end, n->end, n->policy); 2384 goto alloc_new;
2231 if (!new2) { 2385
2232 ret = -ENOMEM; 2386 *mpol_new = *n->policy;
2233 goto out; 2387 atomic_set(&mpol_new->refcnt, 1);
2234 } 2388 sp_node_init(n_new, n->end, end, mpol_new);
2389 sp_insert(sp, n_new);
2235 n->end = start; 2390 n->end = start;
2236 sp_insert(sp, new2); 2391 n_new = NULL;
2392 mpol_new = NULL;
2237 break; 2393 break;
2238 } else 2394 } else
2239 n->end = start; 2395 n->end = start;
@@ -2244,9 +2400,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2244 } 2400 }
2245 if (new) 2401 if (new)
2246 sp_insert(sp, new); 2402 sp_insert(sp, new);
2247out: 2403 spin_unlock(&sp->lock);
2248 mutex_unlock(&sp->mutex); 2404 ret = 0;
2405
2406err_out:
2407 if (mpol_new)
2408 mpol_put(mpol_new);
2409 if (n_new)
2410 kmem_cache_free(sn_cache, n_new);
2411
2249 return ret; 2412 return ret;
2413
2414alloc_new:
2415 spin_unlock(&sp->lock);
2416 ret = -ENOMEM;
2417 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2418 if (!n_new)
2419 goto err_out;
2420 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2421 if (!mpol_new)
2422 goto err_out;
2423 goto restart;
2250} 2424}
2251 2425
2252/** 2426/**
@@ -2264,7 +2438,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2264 int ret; 2438 int ret;
2265 2439
2266 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2440 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2267 mutex_init(&sp->mutex); 2441 spin_lock_init(&sp->lock);
2268 2442
2269 if (mpol) { 2443 if (mpol) {
2270 struct vm_area_struct pvma; 2444 struct vm_area_struct pvma;
@@ -2330,16 +2504,60 @@ void mpol_free_shared_policy(struct shared_policy *p)
2330 2504
2331 if (!p->root.rb_node) 2505 if (!p->root.rb_node)
2332 return; 2506 return;
2333 mutex_lock(&p->mutex); 2507 spin_lock(&p->lock);
2334 next = rb_first(&p->root); 2508 next = rb_first(&p->root);
2335 while (next) { 2509 while (next) {
2336 n = rb_entry(next, struct sp_node, nd); 2510 n = rb_entry(next, struct sp_node, nd);
2337 next = rb_next(&n->nd); 2511 next = rb_next(&n->nd);
2338 sp_delete(p, n); 2512 sp_delete(p, n);
2339 } 2513 }
2340 mutex_unlock(&p->mutex); 2514 spin_unlock(&p->lock);
2515}
2516
2517#ifdef CONFIG_NUMA_BALANCING
2518static bool __initdata numabalancing_override;
2519
2520static void __init check_numabalancing_enable(void)
2521{
2522 bool numabalancing_default = false;
2523
2524 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2525 numabalancing_default = true;
2526
2527 if (nr_node_ids > 1 && !numabalancing_override) {
2528 printk(KERN_INFO "Enabling automatic NUMA balancing. "
2529 "Configure with numa_balancing= or sysctl");
2530 set_numabalancing_state(numabalancing_default);
2531 }
2341} 2532}
2342 2533
2534static int __init setup_numabalancing(char *str)
2535{
2536 int ret = 0;
2537 if (!str)
2538 goto out;
2539 numabalancing_override = true;
2540
2541 if (!strcmp(str, "enable")) {
2542 set_numabalancing_state(true);
2543 ret = 1;
2544 } else if (!strcmp(str, "disable")) {
2545 set_numabalancing_state(false);
2546 ret = 1;
2547 }
2548out:
2549 if (!ret)
2550 printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2551
2552 return ret;
2553}
2554__setup("numa_balancing=", setup_numabalancing);
2555#else
2556static inline void __init check_numabalancing_enable(void)
2557{
2558}
2559#endif /* CONFIG_NUMA_BALANCING */
2560
2343/* assumes fs == KERNEL_DS */ 2561/* assumes fs == KERNEL_DS */
2344void __init numa_policy_init(void) 2562void __init numa_policy_init(void)
2345{ 2563{
@@ -2355,13 +2573,22 @@ void __init numa_policy_init(void)
2355 sizeof(struct sp_node), 2573 sizeof(struct sp_node),
2356 0, SLAB_PANIC, NULL); 2574 0, SLAB_PANIC, NULL);
2357 2575
2576 for_each_node(nid) {
2577 preferred_node_policy[nid] = (struct mempolicy) {
2578 .refcnt = ATOMIC_INIT(1),
2579 .mode = MPOL_PREFERRED,
2580 .flags = MPOL_F_MOF | MPOL_F_MORON,
2581 .v = { .preferred_node = nid, },
2582 };
2583 }
2584
2358 /* 2585 /*
2359 * Set interleaving policy for system init. Interleaving is only 2586 * Set interleaving policy for system init. Interleaving is only
2360 * enabled across suitably sized nodes (default is >= 16MB), or 2587 * enabled across suitably sized nodes (default is >= 16MB), or
2361 * fall back to the largest node if they're all smaller. 2588 * fall back to the largest node if they're all smaller.
2362 */ 2589 */
2363 nodes_clear(interleave_nodes); 2590 nodes_clear(interleave_nodes);
2364 for_each_node_state(nid, N_HIGH_MEMORY) { 2591 for_each_node_state(nid, N_MEMORY) {
2365 unsigned long total_pages = node_present_pages(nid); 2592 unsigned long total_pages = node_present_pages(nid);
2366 2593
2367 /* Preserve the largest node */ 2594 /* Preserve the largest node */
@@ -2381,6 +2608,8 @@ void __init numa_policy_init(void)
2381 2608
2382 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 2609 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2383 printk("numa_policy_init: interleaving failed\n"); 2610 printk("numa_policy_init: interleaving failed\n");
2611
2612 check_numabalancing_enable();
2384} 2613}
2385 2614
2386/* Reset policy of current process to default */ 2615/* Reset policy of current process to default */
@@ -2394,44 +2623,34 @@ void numa_default_policy(void)
2394 */ 2623 */
2395 2624
2396/* 2625/*
2397 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag 2626 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2398 * Used only for mpol_parse_str() and mpol_to_str()
2399 */ 2627 */
2400#define MPOL_LOCAL MPOL_MAX
2401static const char * const policy_modes[] = 2628static const char * const policy_modes[] =
2402{ 2629{
2403 [MPOL_DEFAULT] = "default", 2630 [MPOL_DEFAULT] = "default",
2404 [MPOL_PREFERRED] = "prefer", 2631 [MPOL_PREFERRED] = "prefer",
2405 [MPOL_BIND] = "bind", 2632 [MPOL_BIND] = "bind",
2406 [MPOL_INTERLEAVE] = "interleave", 2633 [MPOL_INTERLEAVE] = "interleave",
2407 [MPOL_LOCAL] = "local" 2634 [MPOL_LOCAL] = "local",
2408}; 2635};
2409 2636
2410 2637
2411#ifdef CONFIG_TMPFS 2638#ifdef CONFIG_TMPFS
2412/** 2639/**
2413 * mpol_parse_str - parse string to mempolicy 2640 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2414 * @str: string containing mempolicy to parse 2641 * @str: string containing mempolicy to parse
2415 * @mpol: pointer to struct mempolicy pointer, returned on success. 2642 * @mpol: pointer to struct mempolicy pointer, returned on success.
2416 * @no_context: flag whether to "contextualize" the mempolicy
2417 * 2643 *
2418 * Format of input: 2644 * Format of input:
2419 * <mode>[=<flags>][:<nodelist>] 2645 * <mode>[=<flags>][:<nodelist>]
2420 * 2646 *
2421 * if @no_context is true, save the input nodemask in w.user_nodemask in
2422 * the returned mempolicy. This will be used to "clone" the mempolicy in
2423 * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
2424 * mount option. Note that if 'static' or 'relative' mode flags were
2425 * specified, the input nodemask will already have been saved. Saving
2426 * it again is redundant, but safe.
2427 *
2428 * On success, returns 0, else 1 2647 * On success, returns 0, else 1
2429 */ 2648 */
2430int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) 2649int mpol_parse_str(char *str, struct mempolicy **mpol)
2431{ 2650{
2432 struct mempolicy *new = NULL; 2651 struct mempolicy *new = NULL;
2433 unsigned short mode; 2652 unsigned short mode;
2434 unsigned short uninitialized_var(mode_flags); 2653 unsigned short mode_flags;
2435 nodemask_t nodes; 2654 nodemask_t nodes;
2436 char *nodelist = strchr(str, ':'); 2655 char *nodelist = strchr(str, ':');
2437 char *flags = strchr(str, '='); 2656 char *flags = strchr(str, '=');
@@ -2442,7 +2661,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2442 *nodelist++ = '\0'; 2661 *nodelist++ = '\0';
2443 if (nodelist_parse(nodelist, nodes)) 2662 if (nodelist_parse(nodelist, nodes))
2444 goto out; 2663 goto out;
2445 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) 2664 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2446 goto out; 2665 goto out;
2447 } else 2666 } else
2448 nodes_clear(nodes); 2667 nodes_clear(nodes);
@@ -2450,12 +2669,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2450 if (flags) 2669 if (flags)
2451 *flags++ = '\0'; /* terminate mode string */ 2670 *flags++ = '\0'; /* terminate mode string */
2452 2671
2453 for (mode = 0; mode <= MPOL_LOCAL; mode++) { 2672 for (mode = 0; mode < MPOL_MAX; mode++) {
2454 if (!strcmp(str, policy_modes[mode])) { 2673 if (!strcmp(str, policy_modes[mode])) {
2455 break; 2674 break;
2456 } 2675 }
2457 } 2676 }
2458 if (mode > MPOL_LOCAL) 2677 if (mode >= MPOL_MAX)
2459 goto out; 2678 goto out;
2460 2679
2461 switch (mode) { 2680 switch (mode) {
@@ -2476,7 +2695,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2476 * Default to online nodes with memory if no nodelist 2695 * Default to online nodes with memory if no nodelist
2477 */ 2696 */
2478 if (!nodelist) 2697 if (!nodelist)
2479 nodes = node_states[N_HIGH_MEMORY]; 2698 nodes = node_states[N_MEMORY];
2480 break; 2699 break;
2481 case MPOL_LOCAL: 2700 case MPOL_LOCAL:
2482 /* 2701 /*
@@ -2519,24 +2738,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2519 if (IS_ERR(new)) 2738 if (IS_ERR(new))
2520 goto out; 2739 goto out;
2521 2740
2522 if (no_context) { 2741 /*
2523 /* save for contextualization */ 2742 * Save nodes for mpol_to_str() to show the tmpfs mount options
2524 new->w.user_nodemask = nodes; 2743 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2525 } else { 2744 */
2526 int ret; 2745 if (mode != MPOL_PREFERRED)
2527 NODEMASK_SCRATCH(scratch); 2746 new->v.nodes = nodes;
2528 if (scratch) { 2747 else if (nodelist)
2529 task_lock(current); 2748 new->v.preferred_node = first_node(nodes);
2530 ret = mpol_set_nodemask(new, &nodes, scratch); 2749 else
2531 task_unlock(current); 2750 new->flags |= MPOL_F_LOCAL;
2532 } else 2751
2533 ret = -ENOMEM; 2752 /*
2534 NODEMASK_SCRATCH_FREE(scratch); 2753 * Save nodes for contextualization: this will be used to "clone"
2535 if (ret) { 2754 * the mempolicy in a specific context [cpuset] at a later time.
2536 mpol_put(new); 2755 */
2537 goto out; 2756 new->w.user_nodemask = nodes;
2538 } 2757
2539 }
2540 err = 0; 2758 err = 0;
2541 2759
2542out: 2760out:
@@ -2556,13 +2774,12 @@ out:
2556 * @buffer: to contain formatted mempolicy string 2774 * @buffer: to contain formatted mempolicy string
2557 * @maxlen: length of @buffer 2775 * @maxlen: length of @buffer
2558 * @pol: pointer to mempolicy to be formatted 2776 * @pol: pointer to mempolicy to be formatted
2559 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
2560 * 2777 *
2561 * Convert a mempolicy into a string. 2778 * Convert a mempolicy into a string.
2562 * Returns the number of characters in buffer (if positive) 2779 * Returns the number of characters in buffer (if positive)
2563 * or an error (negative) 2780 * or an error (negative)
2564 */ 2781 */
2565int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) 2782int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2566{ 2783{
2567 char *p = buffer; 2784 char *p = buffer;
2568 int l; 2785 int l;
@@ -2588,7 +2805,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2588 case MPOL_PREFERRED: 2805 case MPOL_PREFERRED:
2589 nodes_clear(nodes); 2806 nodes_clear(nodes);
2590 if (flags & MPOL_F_LOCAL) 2807 if (flags & MPOL_F_LOCAL)
2591 mode = MPOL_LOCAL; /* pseudo-policy */ 2808 mode = MPOL_LOCAL;
2592 else 2809 else
2593 node_set(pol->v.preferred_node, nodes); 2810 node_set(pol->v.preferred_node, nodes);
2594 break; 2811 break;
@@ -2596,10 +2813,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2596 case MPOL_BIND: 2813 case MPOL_BIND:
2597 /* Fall through */ 2814 /* Fall through */
2598 case MPOL_INTERLEAVE: 2815 case MPOL_INTERLEAVE:
2599 if (no_context) 2816 nodes = pol->v.nodes;
2600 nodes = pol->w.user_nodemask;
2601 else
2602 nodes = pol->v.nodes;
2603 break; 2817 break;
2604 2818
2605 default: 2819 default:
diff --git a/mm/migrate.c b/mm/migrate.c
index 77ed2d773705..c38778610aa8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,9 +35,13 @@
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h> 36#include <linux/hugetlb_cgroup.h>
37#include <linux/gfp.h> 37#include <linux/gfp.h>
38#include <linux/balloon_compaction.h>
38 39
39#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
40 41
42#define CREATE_TRACE_POINTS
43#include <trace/events/migrate.h>
44
41#include "internal.h" 45#include "internal.h"
42 46
43/* 47/*
@@ -79,7 +83,30 @@ void putback_lru_pages(struct list_head *l)
79 list_del(&page->lru); 83 list_del(&page->lru);
80 dec_zone_page_state(page, NR_ISOLATED_ANON + 84 dec_zone_page_state(page, NR_ISOLATED_ANON +
81 page_is_file_cache(page)); 85 page_is_file_cache(page));
82 putback_lru_page(page); 86 putback_lru_page(page);
87 }
88}
89
90/*
91 * Put previously isolated pages back onto the appropriate lists
92 * from where they were once taken off for compaction/migration.
93 *
94 * This function shall be used instead of putback_lru_pages(),
95 * whenever the isolated pageset has been built by isolate_migratepages_range()
96 */
97void putback_movable_pages(struct list_head *l)
98{
99 struct page *page;
100 struct page *page2;
101
102 list_for_each_entry_safe(page, page2, l, lru) {
103 list_del(&page->lru);
104 dec_zone_page_state(page, NR_ISOLATED_ANON +
105 page_is_file_cache(page));
106 if (unlikely(balloon_page_movable(page)))
107 balloon_page_putback(page);
108 else
109 putback_lru_page(page);
83 } 110 }
84} 111}
85 112
@@ -91,8 +118,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
91{ 118{
92 struct mm_struct *mm = vma->vm_mm; 119 struct mm_struct *mm = vma->vm_mm;
93 swp_entry_t entry; 120 swp_entry_t entry;
94 pgd_t *pgd;
95 pud_t *pud;
96 pmd_t *pmd; 121 pmd_t *pmd;
97 pte_t *ptep, pte; 122 pte_t *ptep, pte;
98 spinlock_t *ptl; 123 spinlock_t *ptl;
@@ -103,19 +128,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
103 goto out; 128 goto out;
104 ptl = &mm->page_table_lock; 129 ptl = &mm->page_table_lock;
105 } else { 130 } else {
106 pgd = pgd_offset(mm, addr); 131 pmd = mm_find_pmd(mm, addr);
107 if (!pgd_present(*pgd)) 132 if (!pmd)
108 goto out;
109
110 pud = pud_offset(pgd, addr);
111 if (!pud_present(*pud))
112 goto out; 133 goto out;
113
114 pmd = pmd_offset(pud, addr);
115 if (pmd_trans_huge(*pmd)) 134 if (pmd_trans_huge(*pmd))
116 goto out; 135 goto out;
117 if (!pmd_present(*pmd))
118 goto out;
119 136
120 ptep = pte_offset_map(pmd, addr); 137 ptep = pte_offset_map(pmd, addr);
121 138
@@ -279,14 +296,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
279 struct page *newpage, struct page *page, 296 struct page *newpage, struct page *page,
280 struct buffer_head *head, enum migrate_mode mode) 297 struct buffer_head *head, enum migrate_mode mode)
281{ 298{
282 int expected_count; 299 int expected_count = 0;
283 void **pslot; 300 void **pslot;
284 301
285 if (!mapping) { 302 if (!mapping) {
286 /* Anonymous page without mapping */ 303 /* Anonymous page without mapping */
287 if (page_count(page) != 1) 304 if (page_count(page) != 1)
288 return -EAGAIN; 305 return -EAGAIN;
289 return 0; 306 return MIGRATEPAGE_SUCCESS;
290 } 307 }
291 308
292 spin_lock_irq(&mapping->tree_lock); 309 spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +373,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
356 } 373 }
357 spin_unlock_irq(&mapping->tree_lock); 374 spin_unlock_irq(&mapping->tree_lock);
358 375
359 return 0; 376 return MIGRATEPAGE_SUCCESS;
360} 377}
361 378
362/* 379/*
@@ -372,7 +389,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
372 if (!mapping) { 389 if (!mapping) {
373 if (page_count(page) != 1) 390 if (page_count(page) != 1)
374 return -EAGAIN; 391 return -EAGAIN;
375 return 0; 392 return MIGRATEPAGE_SUCCESS;
376 } 393 }
377 394
378 spin_lock_irq(&mapping->tree_lock); 395 spin_lock_irq(&mapping->tree_lock);
@@ -399,7 +416,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
399 page_unfreeze_refs(page, expected_count - 1); 416 page_unfreeze_refs(page, expected_count - 1);
400 417
401 spin_unlock_irq(&mapping->tree_lock); 418 spin_unlock_irq(&mapping->tree_lock);
402 return 0; 419 return MIGRATEPAGE_SUCCESS;
403} 420}
404 421
405/* 422/*
@@ -407,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
407 */ 424 */
408void migrate_page_copy(struct page *newpage, struct page *page) 425void migrate_page_copy(struct page *newpage, struct page *page)
409{ 426{
410 if (PageHuge(page)) 427 if (PageHuge(page) || PageTransHuge(page))
411 copy_huge_page(newpage, page); 428 copy_huge_page(newpage, page);
412 else 429 else
413 copy_highpage(newpage, page); 430 copy_highpage(newpage, page);
@@ -486,11 +503,11 @@ int migrate_page(struct address_space *mapping,
486 503
487 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); 504 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
488 505
489 if (rc) 506 if (rc != MIGRATEPAGE_SUCCESS)
490 return rc; 507 return rc;
491 508
492 migrate_page_copy(newpage, page); 509 migrate_page_copy(newpage, page);
493 return 0; 510 return MIGRATEPAGE_SUCCESS;
494} 511}
495EXPORT_SYMBOL(migrate_page); 512EXPORT_SYMBOL(migrate_page);
496 513
@@ -513,7 +530,7 @@ int buffer_migrate_page(struct address_space *mapping,
513 530
514 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); 531 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
515 532
516 if (rc) 533 if (rc != MIGRATEPAGE_SUCCESS)
517 return rc; 534 return rc;
518 535
519 /* 536 /*
@@ -549,7 +566,7 @@ int buffer_migrate_page(struct address_space *mapping,
549 566
550 } while (bh != head); 567 } while (bh != head);
551 568
552 return 0; 569 return MIGRATEPAGE_SUCCESS;
553} 570}
554EXPORT_SYMBOL(buffer_migrate_page); 571EXPORT_SYMBOL(buffer_migrate_page);
555#endif 572#endif
@@ -628,7 +645,7 @@ static int fallback_migrate_page(struct address_space *mapping,
628 * 645 *
629 * Return value: 646 * Return value:
630 * < 0 - error code 647 * < 0 - error code
631 * == 0 - success 648 * MIGRATEPAGE_SUCCESS - success
632 */ 649 */
633static int move_to_new_page(struct page *newpage, struct page *page, 650static int move_to_new_page(struct page *newpage, struct page *page,
634 int remap_swapcache, enum migrate_mode mode) 651 int remap_swapcache, enum migrate_mode mode)
@@ -665,7 +682,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
665 else 682 else
666 rc = fallback_migrate_page(mapping, newpage, page, mode); 683 rc = fallback_migrate_page(mapping, newpage, page, mode);
667 684
668 if (rc) { 685 if (rc != MIGRATEPAGE_SUCCESS) {
669 newpage->mapping = NULL; 686 newpage->mapping = NULL;
670 } else { 687 } else {
671 if (remap_swapcache) 688 if (remap_swapcache)
@@ -751,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
751 */ 768 */
752 if (PageAnon(page)) { 769 if (PageAnon(page)) {
753 /* 770 /*
754 * Only page_lock_anon_vma() understands the subtleties of 771 * Only page_lock_anon_vma_read() understands the subtleties of
755 * getting a hold on an anon_vma from outside one of its mms. 772 * getting a hold on an anon_vma from outside one of its mms.
756 */ 773 */
757 anon_vma = page_get_anon_vma(page); 774 anon_vma = page_get_anon_vma(page);
@@ -778,6 +795,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
778 } 795 }
779 } 796 }
780 797
798 if (unlikely(balloon_page_movable(page))) {
799 /*
800 * A ballooned page does not need any special attention from
801 * physical to virtual reverse mapping procedures.
802 * Skip any attempt to unmap PTEs or to remap swap cache,
803 * in order to avoid burning cycles at rmap level, and perform
804 * the page migration right away (proteced by page lock).
805 */
806 rc = balloon_page_migrate(newpage, page, mode);
807 goto uncharge;
808 }
809
781 /* 810 /*
782 * Corner case handling: 811 * Corner case handling:
783 * 1. When a new swap-cache page is read into, it is added to the LRU 812 * 1. When a new swap-cache page is read into, it is added to the LRU
@@ -814,7 +843,9 @@ skip_unmap:
814 put_anon_vma(anon_vma); 843 put_anon_vma(anon_vma);
815 844
816uncharge: 845uncharge:
817 mem_cgroup_end_migration(mem, page, newpage, rc == 0); 846 mem_cgroup_end_migration(mem, page, newpage,
847 (rc == MIGRATEPAGE_SUCCESS ||
848 rc == MIGRATEPAGE_BALLOON_SUCCESS));
818unlock: 849unlock:
819 unlock_page(page); 850 unlock_page(page);
820out: 851out:
@@ -846,6 +877,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
846 goto out; 877 goto out;
847 878
848 rc = __unmap_and_move(page, newpage, force, offlining, mode); 879 rc = __unmap_and_move(page, newpage, force, offlining, mode);
880
881 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
882 /*
883 * A ballooned page has been migrated already.
884 * Now, it's the time to wrap-up counters,
885 * handle the page back to Buddy and return.
886 */
887 dec_zone_page_state(page, NR_ISOLATED_ANON +
888 page_is_file_cache(page));
889 balloon_page_free(page);
890 return MIGRATEPAGE_SUCCESS;
891 }
849out: 892out:
850 if (rc != -EAGAIN) { 893 if (rc != -EAGAIN) {
851 /* 894 /*
@@ -958,10 +1001,11 @@ out:
958 */ 1001 */
959int migrate_pages(struct list_head *from, 1002int migrate_pages(struct list_head *from,
960 new_page_t get_new_page, unsigned long private, bool offlining, 1003 new_page_t get_new_page, unsigned long private, bool offlining,
961 enum migrate_mode mode) 1004 enum migrate_mode mode, int reason)
962{ 1005{
963 int retry = 1; 1006 int retry = 1;
964 int nr_failed = 0; 1007 int nr_failed = 0;
1008 int nr_succeeded = 0;
965 int pass = 0; 1009 int pass = 0;
966 struct page *page; 1010 struct page *page;
967 struct page *page2; 1011 struct page *page2;
@@ -987,7 +1031,8 @@ int migrate_pages(struct list_head *from,
987 case -EAGAIN: 1031 case -EAGAIN:
988 retry++; 1032 retry++;
989 break; 1033 break;
990 case 0: 1034 case MIGRATEPAGE_SUCCESS:
1035 nr_succeeded++;
991 break; 1036 break;
992 default: 1037 default:
993 /* Permanent failure */ 1038 /* Permanent failure */
@@ -996,15 +1041,18 @@ int migrate_pages(struct list_head *from,
996 } 1041 }
997 } 1042 }
998 } 1043 }
999 rc = 0; 1044 rc = nr_failed + retry;
1000out: 1045out:
1046 if (nr_succeeded)
1047 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1048 if (nr_failed)
1049 count_vm_events(PGMIGRATE_FAIL, nr_failed);
1050 trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
1051
1001 if (!swapwrite) 1052 if (!swapwrite)
1002 current->flags &= ~PF_SWAPWRITE; 1053 current->flags &= ~PF_SWAPWRITE;
1003 1054
1004 if (rc) 1055 return rc;
1005 return rc;
1006
1007 return nr_failed + retry;
1008} 1056}
1009 1057
1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page, 1058int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
@@ -1024,7 +1072,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1024 /* try again */ 1072 /* try again */
1025 cond_resched(); 1073 cond_resched();
1026 break; 1074 break;
1027 case 0: 1075 case MIGRATEPAGE_SUCCESS:
1028 goto out; 1076 goto out;
1029 default: 1077 default:
1030 rc = -EIO; 1078 rc = -EIO;
@@ -1139,7 +1187,8 @@ set_status:
1139 err = 0; 1187 err = 0;
1140 if (!list_empty(&pagelist)) { 1188 if (!list_empty(&pagelist)) {
1141 err = migrate_pages(&pagelist, new_page_node, 1189 err = migrate_pages(&pagelist, new_page_node,
1142 (unsigned long)pm, 0, MIGRATE_SYNC); 1190 (unsigned long)pm, 0, MIGRATE_SYNC,
1191 MR_SYSCALL);
1143 if (err) 1192 if (err)
1144 putback_lru_pages(&pagelist); 1193 putback_lru_pages(&pagelist);
1145 } 1194 }
@@ -1201,7 +1250,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1201 if (node < 0 || node >= MAX_NUMNODES) 1250 if (node < 0 || node >= MAX_NUMNODES)
1202 goto out_pm; 1251 goto out_pm;
1203 1252
1204 if (!node_state(node, N_HIGH_MEMORY)) 1253 if (!node_state(node, N_MEMORY))
1205 goto out_pm; 1254 goto out_pm;
1206 1255
1207 err = -EACCES; 1256 err = -EACCES;
@@ -1403,4 +1452,329 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1403 } 1452 }
1404 return err; 1453 return err;
1405} 1454}
1406#endif 1455
1456#ifdef CONFIG_NUMA_BALANCING
1457/*
1458 * Returns true if this is a safe migration target node for misplaced NUMA
1459 * pages. Currently it only checks the watermarks which crude
1460 */
1461static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1462 int nr_migrate_pages)
1463{
1464 int z;
1465 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1466 struct zone *zone = pgdat->node_zones + z;
1467
1468 if (!populated_zone(zone))
1469 continue;
1470
1471 if (zone->all_unreclaimable)
1472 continue;
1473
1474 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
1475 if (!zone_watermark_ok(zone, 0,
1476 high_wmark_pages(zone) +
1477 nr_migrate_pages,
1478 0, 0))
1479 continue;
1480 return true;
1481 }
1482 return false;
1483}
1484
1485static struct page *alloc_misplaced_dst_page(struct page *page,
1486 unsigned long data,
1487 int **result)
1488{
1489 int nid = (int) data;
1490 struct page *newpage;
1491
1492 newpage = alloc_pages_exact_node(nid,
1493 (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
1494 __GFP_NOMEMALLOC | __GFP_NORETRY |
1495 __GFP_NOWARN) &
1496 ~GFP_IOFS, 0);
1497 if (newpage)
1498 page_xchg_last_nid(newpage, page_last_nid(page));
1499
1500 return newpage;
1501}
1502
1503/*
1504 * page migration rate limiting control.
1505 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
1506 * window of time. Default here says do not migrate more than 1280M per second.
1507 * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
1508 * as it is faults that reset the window, pte updates will happen unconditionally
1509 * if there has not been a fault since @pteupdate_interval_millisecs after the
1510 * throttle window closed.
1511 */
1512static unsigned int migrate_interval_millisecs __read_mostly = 100;
1513static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
1514static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
1515
1516/* Returns true if NUMA migration is currently rate limited */
1517bool migrate_ratelimited(int node)
1518{
1519 pg_data_t *pgdat = NODE_DATA(node);
1520
1521 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
1522 msecs_to_jiffies(pteupdate_interval_millisecs)))
1523 return false;
1524
1525 if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
1526 return false;
1527
1528 return true;
1529}
1530
1531/* Returns true if the node is migrate rate-limited after the update */
1532bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
1533{
1534 bool rate_limited = false;
1535
1536 /*
1537 * Rate-limit the amount of data that is being migrated to a node.
1538 * Optimal placement is no good if the memory bus is saturated and
1539 * all the time is being spent migrating!
1540 */
1541 spin_lock(&pgdat->numabalancing_migrate_lock);
1542 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
1543 pgdat->numabalancing_migrate_nr_pages = 0;
1544 pgdat->numabalancing_migrate_next_window = jiffies +
1545 msecs_to_jiffies(migrate_interval_millisecs);
1546 }
1547 if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
1548 rate_limited = true;
1549 else
1550 pgdat->numabalancing_migrate_nr_pages += nr_pages;
1551 spin_unlock(&pgdat->numabalancing_migrate_lock);
1552
1553 return rate_limited;
1554}
1555
1556int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1557{
1558 int ret = 0;
1559
1560 /* Avoid migrating to a node that is nearly full */
1561 if (migrate_balanced_pgdat(pgdat, 1)) {
1562 int page_lru;
1563
1564 if (isolate_lru_page(page)) {
1565 put_page(page);
1566 return 0;
1567 }
1568
1569 /* Page is isolated */
1570 ret = 1;
1571 page_lru = page_is_file_cache(page);
1572 if (!PageTransHuge(page))
1573 inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
1574 else
1575 mod_zone_page_state(page_zone(page),
1576 NR_ISOLATED_ANON + page_lru,
1577 HPAGE_PMD_NR);
1578 }
1579
1580 /*
1581 * Page is either isolated or there is not enough space on the target
1582 * node. If isolated, then it has taken a reference count and the
1583 * callers reference can be safely dropped without the page
1584 * disappearing underneath us during migration. Otherwise the page is
1585 * not to be migrated but the callers reference should still be
1586 * dropped so it does not leak.
1587 */
1588 put_page(page);
1589
1590 return ret;
1591}
1592
1593/*
1594 * Attempt to migrate a misplaced page to the specified destination
1595 * node. Caller is expected to have an elevated reference count on
1596 * the page that will be dropped by this function before returning.
1597 */
1598int migrate_misplaced_page(struct page *page, int node)
1599{
1600 pg_data_t *pgdat = NODE_DATA(node);
1601 int isolated = 0;
1602 int nr_remaining;
1603 LIST_HEAD(migratepages);
1604
1605 /*
1606 * Don't migrate pages that are mapped in multiple processes.
1607 * TODO: Handle false sharing detection instead of this hammer
1608 */
1609 if (page_mapcount(page) != 1) {
1610 put_page(page);
1611 goto out;
1612 }
1613
1614 /*
1615 * Rate-limit the amount of data that is being migrated to a node.
1616 * Optimal placement is no good if the memory bus is saturated and
1617 * all the time is being spent migrating!
1618 */
1619 if (numamigrate_update_ratelimit(pgdat, 1)) {
1620 put_page(page);
1621 goto out;
1622 }
1623
1624 isolated = numamigrate_isolate_page(pgdat, page);
1625 if (!isolated)
1626 goto out;
1627
1628 list_add(&page->lru, &migratepages);
1629 nr_remaining = migrate_pages(&migratepages,
1630 alloc_misplaced_dst_page,
1631 node, false, MIGRATE_ASYNC,
1632 MR_NUMA_MISPLACED);
1633 if (nr_remaining) {
1634 putback_lru_pages(&migratepages);
1635 isolated = 0;
1636 } else
1637 count_vm_numa_event(NUMA_PAGE_MIGRATE);
1638 BUG_ON(!list_empty(&migratepages));
1639out:
1640 return isolated;
1641}
1642#endif /* CONFIG_NUMA_BALANCING */
1643
1644#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * Migrate a misplaced transparent huge page to @node without splitting
 * it.  @entry is the pmd value the caller observed; it is re-checked
 * under mm->page_table_lock before the new page is wired in, and the
 * whole copy is rolled back if the pmd changed in the meantime.
 * Returns non-zero only on a fully successful replacement; all failure
 * paths return 0.
 *
 * NOTE(review): the unlock_page(page) calls on the success and rollback
 * paths, and the out_keep_locked label, suggest the caller holds the
 * page lock and a pin on @page and expects the lock kept only on the
 * out_keep_locked exit -- confirm against the caller.
 */
1645int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 1646 struct vm_area_struct *vma,
 1647 pmd_t *pmd, pmd_t entry,
 1648 unsigned long address,
 1649 struct page *page, int node)
 1650{
 1651 unsigned long haddr = address & HPAGE_PMD_MASK;
 1652 pg_data_t *pgdat = NODE_DATA(node);
 1653 int isolated = 0;
 1654 struct page *new_page = NULL;
 1655 struct mem_cgroup *memcg = NULL;
 1656 int page_lru = page_is_file_cache(page);
 1657
 1658 /*
 1659 * Don't migrate pages that are mapped in multiple processes.
 1660 * TODO: Handle false sharing detection instead of this hammer
 1661 */
 1662 if (page_mapcount(page) != 1)
 1663 goto out_dropref;
 1664
 1665 /*
 1666 * Rate-limit the amount of data that is being migrated to a node.
 1667 * Optimal placement is no good if the memory bus is saturated and
 1668 * all the time is being spent migrating!
 1669 */
 1670 if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
 1671 goto out_dropref;
 1672
 1673 new_page = alloc_pages_node(node,
 1674 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
 1675 if (!new_page) {
 1676 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 1677 goto out_dropref;
 1678 }
 /* Carry the NUMA hinting-fault "last nid" tracking over to the copy. */
 1679 page_xchg_last_nid(new_page, page_last_nid(page));
 1680
 1681 isolated = numamigrate_isolate_page(pgdat, page);
 1682
 1683 /*
 1684 * Failing to isolate or a GUP pin prevents migration. The expected
 1685 * page count is 2. 1 for anonymous pages without a mapping and 1
 1686 * for the callers pin. If the page was isolated, the page will
 1687 * need to be put back on the LRU.
 1688 */
 1689 if (!isolated || page_count(page) != 2) {
 1690 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 1691 put_page(new_page);
 1692 if (isolated) {
 1693 putback_lru_page(page);
 1694 isolated = 0;
 1695 goto out;
 1696 }
 1697 goto out_keep_locked;
 1698 }
 1699
 1700 /* Prepare a page as a migration target */
 1701 __set_page_locked(new_page);
 1702 SetPageSwapBacked(new_page);
 1703
 1704 /* anon mapping, we can simply copy page->mapping to the new page: */
 1705 new_page->mapping = page->mapping;
 1706 new_page->index = page->index;
 1707 migrate_page_copy(new_page, page);
 1708 WARN_ON(PageLRU(new_page));
 1709
 /*
  * Recheck the target PMD: if it changed since the caller sampled
  * @entry, someone else modified the mapping; undo the page-state
  * changes made by migrate_page_copy(), free the copy and put the
  * original page back on the LRU.
  */
 1710 /* Recheck the target PMD */
 1711 spin_lock(&mm->page_table_lock);
 1712 if (unlikely(!pmd_same(*pmd, entry))) {
 1713 spin_unlock(&mm->page_table_lock);
 1714
 1715 /* Reverse changes made by migrate_page_copy() */
 1716 if (TestClearPageActive(new_page))
 1717 SetPageActive(page);
 1718 if (TestClearPageUnevictable(new_page))
 1719 SetPageUnevictable(page);
 1720 mlock_migrate_page(page, new_page);
 1721
 1722 unlock_page(new_page);
 1723 put_page(new_page); /* Free it */
 1724
 1725 unlock_page(page);
 1726 putback_lru_page(page);
 1727
 1728 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 1729 goto out;
 1730 }
 1731
 1732 /*
 1733 * Traditional migration needs to prepare the memcg charge
 1734 * transaction early to prevent the old page from being
 1735 * uncharged when installing migration entries. Here we can
 1736 * save the potential rollback and start the charge transfer
 1737 * only when migration is already known to end successfully.
 1738 */
 1739 mem_cgroup_prepare_migration(page, new_page, &memcg);
 1740
 /*
  * Build the replacement huge pmd: clear the NUMA hinting bit and
  * restore write permission where the vma allows it.
  */
 1741 entry = mk_pmd(new_page, vma->vm_page_prot);
 1742 entry = pmd_mknonnuma(entry);
 1743 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 1744 entry = pmd_mkhuge(entry);
 1745
 1746 page_add_new_anon_rmap(new_page, vma, haddr);
 1747
 1748 set_pmd_at(mm, haddr, pmd, entry);
 1749 update_mmu_cache_pmd(vma, address, &entry);
 1750 page_remove_rmap(page);
 1751 /*
 1752 * Finish the charge transaction under the page table lock to
 1753 * prevent split_huge_page() from dividing up the charge
 1754 * before it's fully transferred to the new page.
 1755 */
 1756 mem_cgroup_end_migration(memcg, page, new_page, true);
 1757 spin_unlock(&mm->page_table_lock);
 1758
 1759 unlock_page(new_page);
 1760 unlock_page(page);
 1761 put_page(page); /* Drop the rmap reference */
 1762 put_page(page); /* Drop the LRU isolation reference */
 1763
 1764 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
 1765 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
 1766
 /*
  * out: the page was isolated at some point, so undo the
  * NR_ISOLATED accounting taken during isolation; LRU/refcount
  * handling already happened on each path that jumps here.
  */
 1767out:
 1768 mod_zone_page_state(page_zone(page),
 1769 NR_ISOLATED_ANON + page_lru,
 1770 -HPAGE_PMD_NR);
 1771 return isolated;
 1772
 /* out_dropref: page was never isolated; drop the caller's pin. */
 1773out_dropref:
 1774 put_page(page);
 /*
  * out_keep_locked: reached only when isolation failed after
  * numamigrate_isolate_page() ran.  No put_page() here --
  * NOTE(review): presumably the isolate helper already dropped the
  * caller's pin on failure; confirm against its definition.
  */
 1775out_keep_locked:
 1776 return 0;
 1777}
 1778#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE */
1779
1780#endif /* CONFIG_NUMA */
diff --git a/mm/mmap.c b/mm/mmap.c
index 9a796c41e7d9..35730ee9d515 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -31,6 +31,7 @@
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/rbtree_augmented.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
@@ -89,6 +90,20 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
89struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; 90struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
90 91
91/* 92/*
93 * The global memory commitment made in the system can be a metric
94 * that can be used to drive ballooning decisions when Linux is hosted
95 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
96 * balancing memory across competing virtual machines that are hosted.
97 * Several metrics drive this policy engine including the guest reported
98 * memory commitment.
99 */
100unsigned long vm_memory_committed(void)
101{
102 return percpu_counter_read_positive(&vm_committed_as);
103}
104EXPORT_SYMBOL_GPL(vm_memory_committed);
105
106/*
92 * Check that a process has enough memory to allocate a new virtual 107 * Check that a process has enough memory to allocate a new virtual
93 * mapping. 0 means there is enough memory for the allocation to 108 * mapping. 0 means there is enough memory for the allocation to
94 * succeed and -ENOMEM implies there is not. 109 * succeed and -ENOMEM implies there is not.
@@ -297,40 +312,88 @@ out:
297 return retval; 312 return retval;
298} 313}
299 314
315static long vma_compute_subtree_gap(struct vm_area_struct *vma)
316{
317 unsigned long max, subtree_gap;
318 max = vma->vm_start;
319 if (vma->vm_prev)
320 max -= vma->vm_prev->vm_end;
321 if (vma->vm_rb.rb_left) {
322 subtree_gap = rb_entry(vma->vm_rb.rb_left,
323 struct vm_area_struct, vm_rb)->rb_subtree_gap;
324 if (subtree_gap > max)
325 max = subtree_gap;
326 }
327 if (vma->vm_rb.rb_right) {
328 subtree_gap = rb_entry(vma->vm_rb.rb_right,
329 struct vm_area_struct, vm_rb)->rb_subtree_gap;
330 if (subtree_gap > max)
331 max = subtree_gap;
332 }
333 return max;
334}
335
300#ifdef CONFIG_DEBUG_VM_RB 336#ifdef CONFIG_DEBUG_VM_RB
301static int browse_rb(struct rb_root *root) 337static int browse_rb(struct rb_root *root)
302{ 338{
303 int i = 0, j; 339 int i = 0, j, bug = 0;
304 struct rb_node *nd, *pn = NULL; 340 struct rb_node *nd, *pn = NULL;
305 unsigned long prev = 0, pend = 0; 341 unsigned long prev = 0, pend = 0;
306 342
307 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 343 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
308 struct vm_area_struct *vma; 344 struct vm_area_struct *vma;
309 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 345 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
310 if (vma->vm_start < prev) 346 if (vma->vm_start < prev) {
311 printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; 347 printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
312 if (vma->vm_start < pend) 348 bug = 1;
349 }
350 if (vma->vm_start < pend) {
313 printk("vm_start %lx pend %lx\n", vma->vm_start, pend); 351 printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
314 if (vma->vm_start > vma->vm_end) 352 bug = 1;
315 printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); 353 }
354 if (vma->vm_start > vma->vm_end) {
355 printk("vm_end %lx < vm_start %lx\n",
356 vma->vm_end, vma->vm_start);
357 bug = 1;
358 }
359 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
360 printk("free gap %lx, correct %lx\n",
361 vma->rb_subtree_gap,
362 vma_compute_subtree_gap(vma));
363 bug = 1;
364 }
316 i++; 365 i++;
317 pn = nd; 366 pn = nd;
318 prev = vma->vm_start; 367 prev = vma->vm_start;
319 pend = vma->vm_end; 368 pend = vma->vm_end;
320 } 369 }
321 j = 0; 370 j = 0;
322 for (nd = pn; nd; nd = rb_prev(nd)) { 371 for (nd = pn; nd; nd = rb_prev(nd))
323 j++; 372 j++;
373 if (i != j) {
374 printk("backwards %d, forwards %d\n", j, i);
375 bug = 1;
376 }
377 return bug ? -1 : i;
378}
379
380static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
381{
382 struct rb_node *nd;
383
384 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
385 struct vm_area_struct *vma;
386 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
387 BUG_ON(vma != ignore &&
388 vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
324 } 389 }
325 if (i != j)
326 printk("backwards %d, forwards %d\n", j, i), i = 0;
327 return i;
328} 390}
329 391
330void validate_mm(struct mm_struct *mm) 392void validate_mm(struct mm_struct *mm)
331{ 393{
332 int bug = 0; 394 int bug = 0;
333 int i = 0; 395 int i = 0;
396 unsigned long highest_address = 0;
334 struct vm_area_struct *vma = mm->mmap; 397 struct vm_area_struct *vma = mm->mmap;
335 while (vma) { 398 while (vma) {
336 struct anon_vma_chain *avc; 399 struct anon_vma_chain *avc;
@@ -338,20 +401,73 @@ void validate_mm(struct mm_struct *mm)
338 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 401 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
339 anon_vma_interval_tree_verify(avc); 402 anon_vma_interval_tree_verify(avc);
340 vma_unlock_anon_vma(vma); 403 vma_unlock_anon_vma(vma);
404 highest_address = vma->vm_end;
341 vma = vma->vm_next; 405 vma = vma->vm_next;
342 i++; 406 i++;
343 } 407 }
344 if (i != mm->map_count) 408 if (i != mm->map_count) {
345 printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; 409 printk("map_count %d vm_next %d\n", mm->map_count, i);
410 bug = 1;
411 }
412 if (highest_address != mm->highest_vm_end) {
413 printk("mm->highest_vm_end %lx, found %lx\n",
414 mm->highest_vm_end, highest_address);
415 bug = 1;
416 }
346 i = browse_rb(&mm->mm_rb); 417 i = browse_rb(&mm->mm_rb);
347 if (i != mm->map_count) 418 if (i != mm->map_count) {
348 printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; 419 printk("map_count %d rb %d\n", mm->map_count, i);
420 bug = 1;
421 }
349 BUG_ON(bug); 422 BUG_ON(bug);
350} 423}
351#else 424#else
425#define validate_mm_rb(root, ignore) do { } while (0)
352#define validate_mm(mm) do { } while (0) 426#define validate_mm(mm) do { } while (0)
353#endif 427#endif
354 428
429RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
430 unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
431
432/*
433 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
434 * vma->vm_prev->vm_end values changed, without modifying the vma's position
435 * in the rbtree.
436 */
437static void vma_gap_update(struct vm_area_struct *vma)
438{
439 /*
440 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
441 * function that does exacltly what we want.
442 */
443 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
444}
445
446static inline void vma_rb_insert(struct vm_area_struct *vma,
447 struct rb_root *root)
448{
449 /* All rb_subtree_gap values must be consistent prior to insertion */
450 validate_mm_rb(root, NULL);
451
452 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
453}
454
455static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
456{
457 /*
458 * All rb_subtree_gap values must be consistent prior to erase,
459 * with the possible exception of the vma being erased.
460 */
461 validate_mm_rb(root, vma);
462
463 /*
464 * Note rb_erase_augmented is a fairly large inline function,
465 * so make sure we instantiate it only once with our desired
466 * augmented rbtree callbacks.
467 */
468 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
469}
470
355/* 471/*
356 * vma has some anon_vma assigned, and is already inserted on that 472 * vma has some anon_vma assigned, and is already inserted on that
357 * anon_vma's interval trees. 473 * anon_vma's interval trees.
@@ -421,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
421void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 537void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
422 struct rb_node **rb_link, struct rb_node *rb_parent) 538 struct rb_node **rb_link, struct rb_node *rb_parent)
423{ 539{
540 /* Update tracking information for the gap following the new vma. */
541 if (vma->vm_next)
542 vma_gap_update(vma->vm_next);
543 else
544 mm->highest_vm_end = vma->vm_end;
545
546 /*
547 * vma->vm_prev wasn't known when we followed the rbtree to find the
548 * correct insertion point for that vma. As a result, we could not
549 * update the vma vm_rb parents rb_subtree_gap values on the way down.
550 * So, we first insert the vma with a zero rb_subtree_gap value
551 * (to be consistent with what we did on the way down), and then
552 * immediately update the gap to the correct value. Finally we
553 * rebalance the rbtree after all augmented values have been set.
554 */
424 rb_link_node(&vma->vm_rb, rb_parent, rb_link); 555 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
425 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 556 vma->rb_subtree_gap = 0;
557 vma_gap_update(vma);
558 vma_rb_insert(vma, &mm->mm_rb);
426} 559}
427 560
428static void __vma_link_file(struct vm_area_struct *vma) 561static void __vma_link_file(struct vm_area_struct *vma)
@@ -498,12 +631,12 @@ static inline void
498__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 631__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
499 struct vm_area_struct *prev) 632 struct vm_area_struct *prev)
500{ 633{
501 struct vm_area_struct *next = vma->vm_next; 634 struct vm_area_struct *next;
502 635
503 prev->vm_next = next; 636 vma_rb_erase(vma, &mm->mm_rb);
637 prev->vm_next = next = vma->vm_next;
504 if (next) 638 if (next)
505 next->vm_prev = prev; 639 next->vm_prev = prev;
506 rb_erase(&vma->vm_rb, &mm->mm_rb);
507 if (mm->mmap_cache == vma) 640 if (mm->mmap_cache == vma)
508 mm->mmap_cache = prev; 641 mm->mmap_cache = prev;
509} 642}
@@ -525,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
525 struct rb_root *root = NULL; 658 struct rb_root *root = NULL;
526 struct anon_vma *anon_vma = NULL; 659 struct anon_vma *anon_vma = NULL;
527 struct file *file = vma->vm_file; 660 struct file *file = vma->vm_file;
661 bool start_changed = false, end_changed = false;
528 long adjust_next = 0; 662 long adjust_next = 0;
529 int remove_next = 0; 663 int remove_next = 0;
530 664
@@ -602,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end);
602 if (anon_vma) { 736 if (anon_vma) {
603 VM_BUG_ON(adjust_next && next->anon_vma && 737 VM_BUG_ON(adjust_next && next->anon_vma &&
604 anon_vma != next->anon_vma); 738 anon_vma != next->anon_vma);
605 anon_vma_lock(anon_vma); 739 anon_vma_lock_write(anon_vma);
606 anon_vma_interval_tree_pre_update_vma(vma); 740 anon_vma_interval_tree_pre_update_vma(vma);
607 if (adjust_next) 741 if (adjust_next)
608 anon_vma_interval_tree_pre_update_vma(next); 742 anon_vma_interval_tree_pre_update_vma(next);
@@ -615,8 +749,14 @@ again: remove_next = 1 + (end > next->vm_end);
615 vma_interval_tree_remove(next, root); 749 vma_interval_tree_remove(next, root);
616 } 750 }
617 751
618 vma->vm_start = start; 752 if (start != vma->vm_start) {
619 vma->vm_end = end; 753 vma->vm_start = start;
754 start_changed = true;
755 }
756 if (end != vma->vm_end) {
757 vma->vm_end = end;
758 end_changed = true;
759 }
620 vma->vm_pgoff = pgoff; 760 vma->vm_pgoff = pgoff;
621 if (adjust_next) { 761 if (adjust_next) {
622 next->vm_start += adjust_next << PAGE_SHIFT; 762 next->vm_start += adjust_next << PAGE_SHIFT;
@@ -645,6 +785,15 @@ again: remove_next = 1 + (end > next->vm_end);
645 * (it may either follow vma or precede it). 785 * (it may either follow vma or precede it).
646 */ 786 */
647 __insert_vm_struct(mm, insert); 787 __insert_vm_struct(mm, insert);
788 } else {
789 if (start_changed)
790 vma_gap_update(vma);
791 if (end_changed) {
792 if (!next)
793 mm->highest_vm_end = end;
794 else if (!adjust_next)
795 vma_gap_update(next);
796 }
648 } 797 }
649 798
650 if (anon_vma) { 799 if (anon_vma) {
@@ -678,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end);
678 * we must remove another next too. It would clutter 827 * we must remove another next too. It would clutter
679 * up the code too much to do both in one go. 828 * up the code too much to do both in one go.
680 */ 829 */
681 if (remove_next == 2) { 830 next = vma->vm_next;
682 next = vma->vm_next; 831 if (remove_next == 2)
683 goto again; 832 goto again;
684 } 833 else if (next)
834 vma_gap_update(next);
835 else
836 mm->highest_vm_end = end;
685 } 837 }
686 if (insert && file) 838 if (insert && file)
687 uprobe_mmap(insert); 839 uprobe_mmap(insert);
@@ -1153,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1153 * memory so no accounting is necessary 1305 * memory so no accounting is necessary
1154 */ 1306 */
1155 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, 1307 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1156 VM_NORESERVE, &user, 1308 VM_NORESERVE,
1157 HUGETLB_ANONHUGE_INODE); 1309 &user, HUGETLB_ANONHUGE_INODE,
1310 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1158 if (IS_ERR(file)) 1311 if (IS_ERR(file))
1159 return PTR_ERR(file); 1312 return PTR_ERR(file);
1160 } 1313 }
@@ -1335,7 +1488,11 @@ munmap_back:
1335 * 1488 *
1336 * Answer: Yes, several device drivers can do it in their 1489 * Answer: Yes, several device drivers can do it in their
1337 * f_op->mmap method. -DaveM 1490 * f_op->mmap method. -DaveM
1491 * Bug: If addr is changed, prev, rb_link, rb_parent should
1492 * be updated for vma_link()
1338 */ 1493 */
1494 WARN_ON_ONCE(addr != vma->vm_start);
1495
1339 addr = vma->vm_start; 1496 addr = vma->vm_start;
1340 pgoff = vma->vm_pgoff; 1497 pgoff = vma->vm_pgoff;
1341 vm_flags = vma->vm_flags; 1498 vm_flags = vma->vm_flags;
@@ -1400,6 +1557,206 @@ unacct_error:
1400 return error; 1557 return error;
1401} 1558}
1402 1559
1560unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1561{
1562 /*
1563 * We implement the search by looking for an rbtree node that
1564 * immediately follows a suitable gap. That is,
1565 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1566 * - gap_end = vma->vm_start >= info->low_limit + length;
1567 * - gap_end - gap_start >= length
1568 */
1569
1570 struct mm_struct *mm = current->mm;
1571 struct vm_area_struct *vma;
1572 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1573
1574 /* Adjust search length to account for worst case alignment overhead */
1575 length = info->length + info->align_mask;
1576 if (length < info->length)
1577 return -ENOMEM;
1578
1579 /* Adjust search limits by the desired length */
1580 if (info->high_limit < length)
1581 return -ENOMEM;
1582 high_limit = info->high_limit - length;
1583
1584 if (info->low_limit > high_limit)
1585 return -ENOMEM;
1586 low_limit = info->low_limit + length;
1587
1588 /* Check if rbtree root looks promising */
1589 if (RB_EMPTY_ROOT(&mm->mm_rb))
1590 goto check_highest;
1591 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1592 if (vma->rb_subtree_gap < length)
1593 goto check_highest;
1594
1595 while (true) {
1596 /* Visit left subtree if it looks promising */
1597 gap_end = vma->vm_start;
1598 if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1599 struct vm_area_struct *left =
1600 rb_entry(vma->vm_rb.rb_left,
1601 struct vm_area_struct, vm_rb);
1602 if (left->rb_subtree_gap >= length) {
1603 vma = left;
1604 continue;
1605 }
1606 }
1607
1608 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1609check_current:
1610 /* Check if current node has a suitable gap */
1611 if (gap_start > high_limit)
1612 return -ENOMEM;
1613 if (gap_end >= low_limit && gap_end - gap_start >= length)
1614 goto found;
1615
1616 /* Visit right subtree if it looks promising */
1617 if (vma->vm_rb.rb_right) {
1618 struct vm_area_struct *right =
1619 rb_entry(vma->vm_rb.rb_right,
1620 struct vm_area_struct, vm_rb);
1621 if (right->rb_subtree_gap >= length) {
1622 vma = right;
1623 continue;
1624 }
1625 }
1626
1627 /* Go back up the rbtree to find next candidate node */
1628 while (true) {
1629 struct rb_node *prev = &vma->vm_rb;
1630 if (!rb_parent(prev))
1631 goto check_highest;
1632 vma = rb_entry(rb_parent(prev),
1633 struct vm_area_struct, vm_rb);
1634 if (prev == vma->vm_rb.rb_left) {
1635 gap_start = vma->vm_prev->vm_end;
1636 gap_end = vma->vm_start;
1637 goto check_current;
1638 }
1639 }
1640 }
1641
1642check_highest:
1643 /* Check highest gap, which does not precede any rbtree node */
1644 gap_start = mm->highest_vm_end;
1645 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
1646 if (gap_start > high_limit)
1647 return -ENOMEM;
1648
1649found:
1650 /* We found a suitable gap. Clip it with the original low_limit. */
1651 if (gap_start < info->low_limit)
1652 gap_start = info->low_limit;
1653
1654 /* Adjust gap address to the desired alignment */
1655 gap_start += (info->align_offset - gap_start) & info->align_mask;
1656
1657 VM_BUG_ON(gap_start + info->length > info->high_limit);
1658 VM_BUG_ON(gap_start + info->length > gap_end);
1659 return gap_start;
1660}
1661
1662unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1663{
1664 struct mm_struct *mm = current->mm;
1665 struct vm_area_struct *vma;
1666 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1667
1668 /* Adjust search length to account for worst case alignment overhead */
1669 length = info->length + info->align_mask;
1670 if (length < info->length)
1671 return -ENOMEM;
1672
1673 /*
1674 * Adjust search limits by the desired length.
1675 * See implementation comment at top of unmapped_area().
1676 */
1677 gap_end = info->high_limit;
1678 if (gap_end < length)
1679 return -ENOMEM;
1680 high_limit = gap_end - length;
1681
1682 if (info->low_limit > high_limit)
1683 return -ENOMEM;
1684 low_limit = info->low_limit + length;
1685
1686 /* Check highest gap, which does not precede any rbtree node */
1687 gap_start = mm->highest_vm_end;
1688 if (gap_start <= high_limit)
1689 goto found_highest;
1690
1691 /* Check if rbtree root looks promising */
1692 if (RB_EMPTY_ROOT(&mm->mm_rb))
1693 return -ENOMEM;
1694 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1695 if (vma->rb_subtree_gap < length)
1696 return -ENOMEM;
1697
1698 while (true) {
1699 /* Visit right subtree if it looks promising */
1700 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1701 if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1702 struct vm_area_struct *right =
1703 rb_entry(vma->vm_rb.rb_right,
1704 struct vm_area_struct, vm_rb);
1705 if (right->rb_subtree_gap >= length) {
1706 vma = right;
1707 continue;
1708 }
1709 }
1710
1711check_current:
1712 /* Check if current node has a suitable gap */
1713 gap_end = vma->vm_start;
1714 if (gap_end < low_limit)
1715 return -ENOMEM;
1716 if (gap_start <= high_limit && gap_end - gap_start >= length)
1717 goto found;
1718
1719 /* Visit left subtree if it looks promising */
1720 if (vma->vm_rb.rb_left) {
1721 struct vm_area_struct *left =
1722 rb_entry(vma->vm_rb.rb_left,
1723 struct vm_area_struct, vm_rb);
1724 if (left->rb_subtree_gap >= length) {
1725 vma = left;
1726 continue;
1727 }
1728 }
1729
1730 /* Go back up the rbtree to find next candidate node */
1731 while (true) {
1732 struct rb_node *prev = &vma->vm_rb;
1733 if (!rb_parent(prev))
1734 return -ENOMEM;
1735 vma = rb_entry(rb_parent(prev),
1736 struct vm_area_struct, vm_rb);
1737 if (prev == vma->vm_rb.rb_right) {
1738 gap_start = vma->vm_prev ?
1739 vma->vm_prev->vm_end : 0;
1740 goto check_current;
1741 }
1742 }
1743 }
1744
1745found:
1746 /* We found a suitable gap. Clip it with the original high_limit. */
1747 if (gap_end > info->high_limit)
1748 gap_end = info->high_limit;
1749
1750found_highest:
1751 /* Compute highest gap address at the desired alignment */
1752 gap_end -= info->length;
1753 gap_end -= (gap_end - info->align_offset) & info->align_mask;
1754
1755 VM_BUG_ON(gap_end < info->low_limit);
1756 VM_BUG_ON(gap_end < gap_start);
1757 return gap_end;
1758}
1759
1403/* Get an address range which is currently unmapped. 1760/* Get an address range which is currently unmapped.
1404 * For shmat() with addr=0. 1761 * For shmat() with addr=0.
1405 * 1762 *
@@ -1418,7 +1775,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1418{ 1775{
1419 struct mm_struct *mm = current->mm; 1776 struct mm_struct *mm = current->mm;
1420 struct vm_area_struct *vma; 1777 struct vm_area_struct *vma;
1421 unsigned long start_addr; 1778 struct vm_unmapped_area_info info;
1422 1779
1423 if (len > TASK_SIZE) 1780 if (len > TASK_SIZE)
1424 return -ENOMEM; 1781 return -ENOMEM;
@@ -1433,40 +1790,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1433 (!vma || addr + len <= vma->vm_start)) 1790 (!vma || addr + len <= vma->vm_start))
1434 return addr; 1791 return addr;
1435 } 1792 }
1436 if (len > mm->cached_hole_size) {
1437 start_addr = addr = mm->free_area_cache;
1438 } else {
1439 start_addr = addr = TASK_UNMAPPED_BASE;
1440 mm->cached_hole_size = 0;
1441 }
1442 1793
1443full_search: 1794 info.flags = 0;
1444 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 1795 info.length = len;
1445 /* At this point: (!vma || addr < vma->vm_end). */ 1796 info.low_limit = TASK_UNMAPPED_BASE;
1446 if (TASK_SIZE - len < addr) { 1797 info.high_limit = TASK_SIZE;
1447 /* 1798 info.align_mask = 0;
1448 * Start a new search - just in case we missed 1799 return vm_unmapped_area(&info);
1449 * some holes.
1450 */
1451 if (start_addr != TASK_UNMAPPED_BASE) {
1452 addr = TASK_UNMAPPED_BASE;
1453 start_addr = addr;
1454 mm->cached_hole_size = 0;
1455 goto full_search;
1456 }
1457 return -ENOMEM;
1458 }
1459 if (!vma || addr + len <= vma->vm_start) {
1460 /*
1461 * Remember the place where we stopped the search:
1462 */
1463 mm->free_area_cache = addr + len;
1464 return addr;
1465 }
1466 if (addr + mm->cached_hole_size < vma->vm_start)
1467 mm->cached_hole_size = vma->vm_start - addr;
1468 addr = vma->vm_end;
1469 }
1470} 1800}
1471#endif 1801#endif
1472 1802
@@ -1491,7 +1821,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1491{ 1821{
1492 struct vm_area_struct *vma; 1822 struct vm_area_struct *vma;
1493 struct mm_struct *mm = current->mm; 1823 struct mm_struct *mm = current->mm;
1494 unsigned long addr = addr0, start_addr; 1824 unsigned long addr = addr0;
1825 struct vm_unmapped_area_info info;
1495 1826
1496 /* requested length too big for entire address space */ 1827 /* requested length too big for entire address space */
1497 if (len > TASK_SIZE) 1828 if (len > TASK_SIZE)
@@ -1509,53 +1840,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1509 return addr; 1840 return addr;
1510 } 1841 }
1511 1842
1512 /* check if free_area_cache is useful for us */ 1843 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1513 if (len <= mm->cached_hole_size) { 1844 info.length = len;
1514 mm->cached_hole_size = 0; 1845 info.low_limit = PAGE_SIZE;
1515 mm->free_area_cache = mm->mmap_base; 1846 info.high_limit = mm->mmap_base;
1516 } 1847 info.align_mask = 0;
1517 1848 addr = vm_unmapped_area(&info);
1518try_again:
1519 /* either no address requested or can't fit in requested address hole */
1520 start_addr = addr = mm->free_area_cache;
1521
1522 if (addr < len)
1523 goto fail;
1524
1525 addr -= len;
1526 do {
1527 /*
1528 * Lookup failure means no vma is above this address,
1529 * else if new region fits below vma->vm_start,
1530 * return with success:
1531 */
1532 vma = find_vma(mm, addr);
1533 if (!vma || addr+len <= vma->vm_start)
1534 /* remember the address as a hint for next time */
1535 return (mm->free_area_cache = addr);
1536
1537 /* remember the largest hole we saw so far */
1538 if (addr + mm->cached_hole_size < vma->vm_start)
1539 mm->cached_hole_size = vma->vm_start - addr;
1540
1541 /* try just below the current vma->vm_start */
1542 addr = vma->vm_start-len;
1543 } while (len < vma->vm_start);
1544
1545fail:
1546 /*
1547 * if hint left us with no space for the requested
1548 * mapping then try again:
1549 *
1550 * Note: this is different with the case of bottomup
1551 * which does the fully line-search, but we use find_vma
1552 * here that causes some holes skipped.
1553 */
1554 if (start_addr != mm->mmap_base) {
1555 mm->free_area_cache = mm->mmap_base;
1556 mm->cached_hole_size = 0;
1557 goto try_again;
1558 }
1559 1849
1560 /* 1850 /*
1561 * A failed mmap() very likely causes application failure, 1851 * A failed mmap() very likely causes application failure,
@@ -1563,14 +1853,13 @@ fail:
1563 * can happen with large stack limits and large mmap() 1853 * can happen with large stack limits and large mmap()
1564 * allocations. 1854 * allocations.
1565 */ 1855 */
1566 mm->cached_hole_size = ~0UL; 1856 if (addr & ~PAGE_MASK) {
1567 mm->free_area_cache = TASK_UNMAPPED_BASE; 1857 VM_BUG_ON(addr != -ENOMEM);
1568 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 1858 info.flags = 0;
1569 /* 1859 info.low_limit = TASK_UNMAPPED_BASE;
1570 * Restore the topdown base: 1860 info.high_limit = TASK_SIZE;
1571 */ 1861 addr = vm_unmapped_area(&info);
1572 mm->free_area_cache = mm->mmap_base; 1862 }
1573 mm->cached_hole_size = ~0UL;
1574 1863
1575 return addr; 1864 return addr;
1576} 1865}
@@ -1780,9 +2069,27 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1780 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 2069 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
1781 error = acct_stack_growth(vma, size, grow); 2070 error = acct_stack_growth(vma, size, grow);
1782 if (!error) { 2071 if (!error) {
2072 /*
2073 * vma_gap_update() doesn't support concurrent
2074 * updates, but we only hold a shared mmap_sem
2075 * lock here, so we need to protect against
2076 * concurrent vma expansions.
2077 * vma_lock_anon_vma() doesn't help here, as
2078 * we don't guarantee that all growable vmas
2079 * in a mm share the same root anon vma.
2080 * So, we reuse mm->page_table_lock to guard
2081 * against concurrent vma expansions.
2082 */
2083 spin_lock(&vma->vm_mm->page_table_lock);
1783 anon_vma_interval_tree_pre_update_vma(vma); 2084 anon_vma_interval_tree_pre_update_vma(vma);
1784 vma->vm_end = address; 2085 vma->vm_end = address;
1785 anon_vma_interval_tree_post_update_vma(vma); 2086 anon_vma_interval_tree_post_update_vma(vma);
2087 if (vma->vm_next)
2088 vma_gap_update(vma->vm_next);
2089 else
2090 vma->vm_mm->highest_vm_end = address;
2091 spin_unlock(&vma->vm_mm->page_table_lock);
2092
1786 perf_event_mmap(vma); 2093 perf_event_mmap(vma);
1787 } 2094 }
1788 } 2095 }
@@ -1833,10 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma,
1833 if (grow <= vma->vm_pgoff) { 2140 if (grow <= vma->vm_pgoff) {
1834 error = acct_stack_growth(vma, size, grow); 2141 error = acct_stack_growth(vma, size, grow);
1835 if (!error) { 2142 if (!error) {
2143 /*
2144 * vma_gap_update() doesn't support concurrent
2145 * updates, but we only hold a shared mmap_sem
2146 * lock here, so we need to protect against
2147 * concurrent vma expansions.
2148 * vma_lock_anon_vma() doesn't help here, as
2149 * we don't guarantee that all growable vmas
2150 * in a mm share the same root anon vma.
2151 * So, we reuse mm->page_table_lock to guard
2152 * against concurrent vma expansions.
2153 */
2154 spin_lock(&vma->vm_mm->page_table_lock);
1836 anon_vma_interval_tree_pre_update_vma(vma); 2155 anon_vma_interval_tree_pre_update_vma(vma);
1837 vma->vm_start = address; 2156 vma->vm_start = address;
1838 vma->vm_pgoff -= grow; 2157 vma->vm_pgoff -= grow;
1839 anon_vma_interval_tree_post_update_vma(vma); 2158 anon_vma_interval_tree_post_update_vma(vma);
2159 vma_gap_update(vma);
2160 spin_unlock(&vma->vm_mm->page_table_lock);
2161
1840 perf_event_mmap(vma); 2162 perf_event_mmap(vma);
1841 } 2163 }
1842 } 2164 }
@@ -1959,14 +2281,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1959 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2281 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1960 vma->vm_prev = NULL; 2282 vma->vm_prev = NULL;
1961 do { 2283 do {
1962 rb_erase(&vma->vm_rb, &mm->mm_rb); 2284 vma_rb_erase(vma, &mm->mm_rb);
1963 mm->map_count--; 2285 mm->map_count--;
1964 tail_vma = vma; 2286 tail_vma = vma;
1965 vma = vma->vm_next; 2287 vma = vma->vm_next;
1966 } while (vma && vma->vm_start < end); 2288 } while (vma && vma->vm_start < end);
1967 *insertion_point = vma; 2289 *insertion_point = vma;
1968 if (vma) 2290 if (vma) {
1969 vma->vm_prev = prev; 2291 vma->vm_prev = prev;
2292 vma_gap_update(vma);
2293 } else
2294 mm->highest_vm_end = prev ? prev->vm_end : 0;
1970 tail_vma->vm_next = NULL; 2295 tail_vma->vm_next = NULL;
1971 if (mm->unmap_area == arch_unmap_area) 2296 if (mm->unmap_area == arch_unmap_area)
1972 addr = prev ? prev->vm_end : mm->mmap_base; 2297 addr = prev ? prev->vm_end : mm->mmap_base;
@@ -2561,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2561 * The LSB of head.next can't change from under us 2886 * The LSB of head.next can't change from under us
2562 * because we hold the mm_all_locks_mutex. 2887 * because we hold the mm_all_locks_mutex.
2563 */ 2888 */
2564 mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); 2889 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
2565 /* 2890 /*
2566 * We can safely modify head.next after taking the 2891 * We can safely modify head.next after taking the
2567 * anon_vma->root->mutex. If some other vma in this mm shares 2892 * anon_vma->root->rwsem. If some other vma in this mm shares
2568 * the same anon_vma we won't take it again. 2893 * the same anon_vma we won't take it again.
2569 * 2894 *
2570 * No need of atomic instructions here, head.next 2895 * No need of atomic instructions here, head.next
2571 * can't change from under us thanks to the 2896 * can't change from under us thanks to the
2572 * anon_vma->root->mutex. 2897 * anon_vma->root->rwsem.
2573 */ 2898 */
2574 if (__test_and_set_bit(0, (unsigned long *) 2899 if (__test_and_set_bit(0, (unsigned long *)
2575 &anon_vma->root->rb_root.rb_node)) 2900 &anon_vma->root->rb_root.rb_node))
@@ -2671,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2671 * 2996 *
2672 * No need of atomic instructions here, head.next 2997 * No need of atomic instructions here, head.next
2673 * can't change from under us until we release the 2998 * can't change from under us until we release the
2674 * anon_vma->root->mutex. 2999 * anon_vma->root->rwsem.
2675 */ 3000 */
2676 if (!__test_and_clear_bit(0, (unsigned long *) 3001 if (!__test_and_clear_bit(0, (unsigned long *)
2677 &anon_vma->root->rb_root.rb_node)) 3002 &anon_vma->root->rb_root.rb_node))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a40992610ab6..94722a4d6b43 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
35} 35}
36#endif 36#endif
37 37
38static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, 38static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
39 unsigned long addr, unsigned long end, pgprot_t newprot, 39 unsigned long addr, unsigned long end, pgprot_t newprot,
40 int dirty_accountable) 40 int dirty_accountable, int prot_numa, bool *ret_all_same_node)
41{ 41{
42 struct mm_struct *mm = vma->vm_mm;
42 pte_t *pte, oldpte; 43 pte_t *pte, oldpte;
43 spinlock_t *ptl; 44 spinlock_t *ptl;
45 unsigned long pages = 0;
46 bool all_same_node = true;
47 int last_nid = -1;
44 48
45 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 49 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
46 arch_enter_lazy_mmu_mode(); 50 arch_enter_lazy_mmu_mode();
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
48 oldpte = *pte; 52 oldpte = *pte;
49 if (pte_present(oldpte)) { 53 if (pte_present(oldpte)) {
50 pte_t ptent; 54 pte_t ptent;
55 bool updated = false;
51 56
52 ptent = ptep_modify_prot_start(mm, addr, pte); 57 ptent = ptep_modify_prot_start(mm, addr, pte);
53 ptent = pte_modify(ptent, newprot); 58 if (!prot_numa) {
59 ptent = pte_modify(ptent, newprot);
60 updated = true;
61 } else {
62 struct page *page;
63
64 page = vm_normal_page(vma, addr, oldpte);
65 if (page) {
66 int this_nid = page_to_nid(page);
67 if (last_nid == -1)
68 last_nid = this_nid;
69 if (last_nid != this_nid)
70 all_same_node = false;
71
72 /* only check non-shared pages */
73 if (!pte_numa(oldpte) &&
74 page_mapcount(page) == 1) {
75 ptent = pte_mknuma(ptent);
76 updated = true;
77 }
78 }
79 }
54 80
55 /* 81 /*
56 * Avoid taking write faults for pages we know to be 82 * Avoid taking write faults for pages we know to be
57 * dirty. 83 * dirty.
58 */ 84 */
59 if (dirty_accountable && pte_dirty(ptent)) 85 if (dirty_accountable && pte_dirty(ptent)) {
60 ptent = pte_mkwrite(ptent); 86 ptent = pte_mkwrite(ptent);
87 updated = true;
88 }
61 89
90 if (updated)
91 pages++;
62 ptep_modify_prot_commit(mm, addr, pte, ptent); 92 ptep_modify_prot_commit(mm, addr, pte, ptent);
63 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { 93 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 94 swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -72,61 +102,101 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
72 set_pte_at(mm, addr, pte, 102 set_pte_at(mm, addr, pte,
73 swp_entry_to_pte(entry)); 103 swp_entry_to_pte(entry));
74 } 104 }
105 pages++;
75 } 106 }
76 } while (pte++, addr += PAGE_SIZE, addr != end); 107 } while (pte++, addr += PAGE_SIZE, addr != end);
77 arch_leave_lazy_mmu_mode(); 108 arch_leave_lazy_mmu_mode();
78 pte_unmap_unlock(pte - 1, ptl); 109 pte_unmap_unlock(pte - 1, ptl);
110
111 *ret_all_same_node = all_same_node;
112 return pages;
79} 113}
80 114
81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, 115#ifdef CONFIG_NUMA_BALANCING
82 unsigned long addr, unsigned long end, pgprot_t newprot, 116static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
83 int dirty_accountable) 117 pmd_t *pmd)
118{
119 spin_lock(&mm->page_table_lock);
120 set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
121 spin_unlock(&mm->page_table_lock);
122}
123#else
124static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
125 pmd_t *pmd)
126{
127 BUG();
128}
129#endif /* CONFIG_NUMA_BALANCING */
130
131static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
132 pud_t *pud, unsigned long addr, unsigned long end,
133 pgprot_t newprot, int dirty_accountable, int prot_numa)
84{ 134{
85 pmd_t *pmd; 135 pmd_t *pmd;
86 unsigned long next; 136 unsigned long next;
137 unsigned long pages = 0;
138 bool all_same_node;
87 139
88 pmd = pmd_offset(pud, addr); 140 pmd = pmd_offset(pud, addr);
89 do { 141 do {
90 next = pmd_addr_end(addr, end); 142 next = pmd_addr_end(addr, end);
91 if (pmd_trans_huge(*pmd)) { 143 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE) 144 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma->vm_mm, pmd); 145 split_huge_page_pmd(vma, addr, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot)) 146 else if (change_huge_pmd(vma, pmd, addr, newprot,
147 prot_numa)) {
148 pages += HPAGE_PMD_NR;
95 continue; 149 continue;
150 }
96 /* fall through */ 151 /* fall through */
97 } 152 }
98 if (pmd_none_or_clear_bad(pmd)) 153 if (pmd_none_or_clear_bad(pmd))
99 continue; 154 continue;
100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot, 155 pages += change_pte_range(vma, pmd, addr, next, newprot,
101 dirty_accountable); 156 dirty_accountable, prot_numa, &all_same_node);
157
158 /*
159 * If we are changing protections for NUMA hinting faults then
160 * set pmd_numa if the examined pages were all on the same
161 * node. This allows a regular PMD to be handled as one fault
162 * and effectively batches the taking of the PTL
163 */
164 if (prot_numa && all_same_node)
165 change_pmd_protnuma(vma->vm_mm, addr, pmd);
102 } while (pmd++, addr = next, addr != end); 166 } while (pmd++, addr = next, addr != end);
167
168 return pages;
103} 169}
104 170
105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 171static inline unsigned long change_pud_range(struct vm_area_struct *vma,
106 unsigned long addr, unsigned long end, pgprot_t newprot, 172 pgd_t *pgd, unsigned long addr, unsigned long end,
107 int dirty_accountable) 173 pgprot_t newprot, int dirty_accountable, int prot_numa)
108{ 174{
109 pud_t *pud; 175 pud_t *pud;
110 unsigned long next; 176 unsigned long next;
177 unsigned long pages = 0;
111 178
112 pud = pud_offset(pgd, addr); 179 pud = pud_offset(pgd, addr);
113 do { 180 do {
114 next = pud_addr_end(addr, end); 181 next = pud_addr_end(addr, end);
115 if (pud_none_or_clear_bad(pud)) 182 if (pud_none_or_clear_bad(pud))
116 continue; 183 continue;
117 change_pmd_range(vma, pud, addr, next, newprot, 184 pages += change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable); 185 dirty_accountable, prot_numa);
119 } while (pud++, addr = next, addr != end); 186 } while (pud++, addr = next, addr != end);
187
188 return pages;
120} 189}
121 190
122static void change_protection(struct vm_area_struct *vma, 191static unsigned long change_protection_range(struct vm_area_struct *vma,
123 unsigned long addr, unsigned long end, pgprot_t newprot, 192 unsigned long addr, unsigned long end, pgprot_t newprot,
124 int dirty_accountable) 193 int dirty_accountable, int prot_numa)
125{ 194{
126 struct mm_struct *mm = vma->vm_mm; 195 struct mm_struct *mm = vma->vm_mm;
127 pgd_t *pgd; 196 pgd_t *pgd;
128 unsigned long next; 197 unsigned long next;
129 unsigned long start = addr; 198 unsigned long start = addr;
199 unsigned long pages = 0;
130 200
131 BUG_ON(addr >= end); 201 BUG_ON(addr >= end);
132 pgd = pgd_offset(mm, addr); 202 pgd = pgd_offset(mm, addr);
@@ -135,10 +205,32 @@ static void change_protection(struct vm_area_struct *vma,
135 next = pgd_addr_end(addr, end); 205 next = pgd_addr_end(addr, end);
136 if (pgd_none_or_clear_bad(pgd)) 206 if (pgd_none_or_clear_bad(pgd))
137 continue; 207 continue;
138 change_pud_range(vma, pgd, addr, next, newprot, 208 pages += change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable); 209 dirty_accountable, prot_numa);
140 } while (pgd++, addr = next, addr != end); 210 } while (pgd++, addr = next, addr != end);
141 flush_tlb_range(vma, start, end); 211
212 /* Only flush the TLB if we actually modified any entries: */
213 if (pages)
214 flush_tlb_range(vma, start, end);
215
216 return pages;
217}
218
219unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
220 unsigned long end, pgprot_t newprot,
221 int dirty_accountable, int prot_numa)
222{
223 struct mm_struct *mm = vma->vm_mm;
224 unsigned long pages;
225
226 mmu_notifier_invalidate_range_start(mm, start, end);
227 if (is_vm_hugetlb_page(vma))
228 pages = hugetlb_change_protection(vma, start, end, newprot);
229 else
230 pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
231 mmu_notifier_invalidate_range_end(mm, start, end);
232
233 return pages;
142} 234}
143 235
144int 236int
@@ -213,12 +305,9 @@ success:
213 dirty_accountable = 1; 305 dirty_accountable = 1;
214 } 306 }
215 307
216 mmu_notifier_invalidate_range_start(mm, start, end); 308 change_protection(vma, start, end, vma->vm_page_prot,
217 if (is_vm_hugetlb_page(vma)) 309 dirty_accountable, 0);
218 hugetlb_change_protection(vma, start, end, vma->vm_page_prot); 310
219 else
220 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
221 mmu_notifier_invalidate_range_end(mm, start, end);
222 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 311 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
223 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 312 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
224 perf_event_mmap(vma); 313 perf_event_mmap(vma);
@@ -274,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
274 error = -EINVAL; 363 error = -EINVAL;
275 if (!(vma->vm_flags & VM_GROWSDOWN)) 364 if (!(vma->vm_flags & VM_GROWSDOWN))
276 goto out; 365 goto out;
277 } 366 } else {
278 else {
279 if (vma->vm_start > start) 367 if (vma->vm_start > start)
280 goto out; 368 goto out;
281 if (unlikely(grows & PROT_GROWSUP)) { 369 if (unlikely(grows & PROT_GROWSUP)) {
@@ -291,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
291 for (nstart = start ; ; ) { 379 for (nstart = start ; ; ) {
292 unsigned long newflags; 380 unsigned long newflags;
293 381
294 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 382 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
295 383
296 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); 384 newflags = vm_flags;
385 newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
297 386
298 /* newflags >> 4 shift VM_MAY% in place of VM_% */ 387 /* newflags >> 4 shift VM_MAY% in place of VM_% */
299 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { 388 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 1b61c2d3307a..e1031e1f6a61 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
104 } 104 }
105 if (vma->anon_vma) { 105 if (vma->anon_vma) {
106 anon_vma = vma->anon_vma; 106 anon_vma = vma->anon_vma;
107 anon_vma_lock(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 } 108 }
109 } 109 }
110 110
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
182 need_flush = true; 182 need_flush = true;
183 continue; 183 continue;
184 } else if (!err) { 184 } else if (!err) {
185 split_huge_page_pmd(vma->vm_mm, old_pmd); 185 split_huge_page_pmd(vma, old_addr, old_pmd);
186 } 186 }
187 VM_BUG_ON(pmd_trans_huge(*old_pmd)); 187 VM_BUG_ON(pmd_trans_huge(*old_pmd));
188 } 188 }
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index ecc2f13d557d..03d152a76acf 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid)
137 return count; 137 return count;
138} 138}
139 139
140static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
141{
142 struct zone *z;
143
144 /*
145 * In free_area_init_core(), highmem zone's managed_pages is set to
146 * present_pages, and bootmem allocator doesn't allocate from highmem
147 * zones. So there's no need to recalculate managed_pages because all
148 * highmem pages will be managed by the buddy system. Here highmem
149 * zone also includes highmem movable zone.
150 */
151 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
152 if (!is_highmem(z))
153 z->managed_pages = 0;
154}
155
140/** 156/**
141 * free_all_bootmem - release free pages to the buddy allocator 157 * free_all_bootmem - release free pages to the buddy allocator
142 * 158 *
@@ -144,6 +160,11 @@ unsigned long __init free_low_memory_core_early(int nodeid)
144 */ 160 */
145unsigned long __init free_all_bootmem(void) 161unsigned long __init free_all_bootmem(void)
146{ 162{
163 struct pglist_data *pgdat;
164
165 for_each_online_pgdat(pgdat)
166 reset_node_lowmem_managed_pages(pgdat);
167
147 /* 168 /*
148 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 169 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
149 * because in some case like Node0 doesn't have RAM installed 170 * because in some case like Node0 doesn't have RAM installed
diff --git a/mm/nommu.c b/mm/nommu.c
index 45131b41bcdb..79c3cac87afa 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -66,6 +66,21 @@ int heap_stack_gap = 0;
66 66
67atomic_long_t mmap_pages_allocated; 67atomic_long_t mmap_pages_allocated;
68 68
69/*
70 * The global memory commitment made in the system can be a metric
71 * that can be used to drive ballooning decisions when Linux is hosted
72 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
73 * balancing memory across competing virtual machines that are hosted.
74 * Several metrics drive this policy engine including the guest reported
75 * memory commitment.
76 */
77unsigned long vm_memory_committed(void)
78{
79 return percpu_counter_read_positive(&vm_committed_as);
80}
81
82EXPORT_SYMBOL_GPL(vm_memory_committed);
83
69EXPORT_SYMBOL(mem_map); 84EXPORT_SYMBOL(mem_map);
70EXPORT_SYMBOL(num_physpages); 85EXPORT_SYMBOL(num_physpages);
71 86
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 79e0f3e24831..0399f146ae49 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
44int sysctl_oom_dump_tasks = 1; 44int sysctl_oom_dump_tasks = 1;
45static DEFINE_SPINLOCK(zone_scan_lock); 45static DEFINE_SPINLOCK(zone_scan_lock);
46 46
47/*
48 * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
49 * @old_val: old oom_score_adj for compare
50 * @new_val: new oom_score_adj for swap
51 *
52 * Sets the oom_score_adj value for current to @new_val iff its present value is
53 * @old_val. Usually used to reinstate a previous value to prevent racing with
54 * userspacing tuning the value in the interim.
55 */
56void compare_swap_oom_score_adj(int old_val, int new_val)
57{
58 struct sighand_struct *sighand = current->sighand;
59
60 spin_lock_irq(&sighand->siglock);
61 if (current->signal->oom_score_adj == old_val)
62 current->signal->oom_score_adj = new_val;
63 trace_oom_score_adj_update(current);
64 spin_unlock_irq(&sighand->siglock);
65}
66
67/**
68 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
69 * @new_val: new oom_score_adj value
70 *
71 * Sets the oom_score_adj value for current to @new_val with proper
72 * synchronization and returns the old value. Usually used to temporarily
73 * set a value, save the old value in the caller, and then reinstate it later.
74 */
75int test_set_oom_score_adj(int new_val)
76{
77 struct sighand_struct *sighand = current->sighand;
78 int old_val;
79
80 spin_lock_irq(&sighand->siglock);
81 old_val = current->signal->oom_score_adj;
82 current->signal->oom_score_adj = new_val;
83 trace_oom_score_adj_update(current);
84 spin_unlock_irq(&sighand->siglock);
85
86 return old_val;
87}
88
89#ifdef CONFIG_NUMA 47#ifdef CONFIG_NUMA
90/** 48/**
91 * has_intersects_mems_allowed() - check task eligiblity for kill 49 * has_intersects_mems_allowed() - check task eligiblity for kill
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
193 if (!p) 151 if (!p)
194 return 0; 152 return 0;
195 153
196 adj = p->signal->oom_score_adj; 154 adj = (long)p->signal->oom_score_adj;
197 if (adj == OOM_SCORE_ADJ_MIN) { 155 if (adj == OOM_SCORE_ADJ_MIN) {
198 task_unlock(p); 156 task_unlock(p);
199 return 0; 157 return 0;
@@ -257,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
257 * the page allocator means a mempolicy is in effect. Cpuset policy 215 * the page allocator means a mempolicy is in effect. Cpuset policy
258 * is enforced in get_page_from_freelist(). 216 * is enforced in get_page_from_freelist().
259 */ 217 */
260 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { 218 if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
261 *totalpages = total_swap_pages; 219 *totalpages = total_swap_pages;
262 for_each_node_mask(nid, *nodemask) 220 for_each_node_mask(nid, *nodemask)
263 *totalpages += node_spanned_pages(nid); 221 *totalpages += node_spanned_pages(nid);
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
310 if (!task->mm) 268 if (!task->mm)
311 return OOM_SCAN_CONTINUE; 269 return OOM_SCAN_CONTINUE;
312 270
313 if (task->flags & PF_EXITING) { 271 /*
272 * If task is allocating a lot of memory and has been marked to be
273 * killed first if it triggers an oom, then select it.
274 */
275 if (oom_task_origin(task))
276 return OOM_SCAN_SELECT;
277
278 if (task->flags & PF_EXITING && !force_kill) {
314 /* 279 /*
315 * If task is current and is in the process of releasing memory, 280 * If this task is not being ptraced on exit, then wait for it
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to 281 * to finish before killing some other task unnecessarily.
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */ 282 */
322 if (task == current) 283 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
323 return OOM_SCAN_SELECT; 284 return OOM_SCAN_ABORT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 } 285 }
334 return OOM_SCAN_OK; 286 return OOM_SCAN_OK;
335} 287}
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
412 continue; 364 continue;
413 } 365 }
414 366
415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", 367 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
416 task->pid, from_kuid(&init_user_ns, task_uid(task)), 368 task->pid, from_kuid(&init_user_ns, task_uid(task)),
417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 369 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
418 task->mm->nr_ptes, 370 task->mm->nr_ptes,
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
428{ 380{
429 task_lock(current); 381 task_lock(current);
430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 382 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
431 "oom_score_adj=%d\n", 383 "oom_score_adj=%hd\n",
432 current->comm, gfp_mask, order, 384 current->comm, gfp_mask, order,
433 current->signal->oom_score_adj); 385 current->signal->oom_score_adj);
434 cpuset_print_task_mems_allowed(current); 386 cpuset_print_task_mems_allowed(current);
@@ -639,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
639 spin_unlock(&zone_scan_lock); 591 spin_unlock(&zone_scan_lock);
640} 592}
641 593
642/*
643 * Try to acquire the oom killer lock for all system zones. Returns zero if a
644 * parallel oom killing is taking place, otherwise locks all zones and returns
645 * non-zero.
646 */
647static int try_set_system_oom(void)
648{
649 struct zone *zone;
650 int ret = 1;
651
652 spin_lock(&zone_scan_lock);
653 for_each_populated_zone(zone)
654 if (zone_is_oom_locked(zone)) {
655 ret = 0;
656 goto out;
657 }
658 for_each_populated_zone(zone)
659 zone_set_flag(zone, ZONE_OOM_LOCKED);
660out:
661 spin_unlock(&zone_scan_lock);
662 return ret;
663}
664
665/*
666 * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
667 * attempts or page faults may now recall the oom killer, if necessary.
668 */
669static void clear_system_oom(void)
670{
671 struct zone *zone;
672
673 spin_lock(&zone_scan_lock);
674 for_each_populated_zone(zone)
675 zone_clear_flag(zone, ZONE_OOM_LOCKED);
676 spin_unlock(&zone_scan_lock);
677}
678
679/** 594/**
680 * out_of_memory - kill the "best" process when we run out of memory 595 * out_of_memory - kill the "best" process when we run out of memory
681 * @zonelist: zonelist pointer 596 * @zonelist: zonelist pointer
@@ -706,11 +621,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
706 return; 621 return;
707 622
708 /* 623 /*
709 * If current has a pending SIGKILL, then automatically select it. The 624 * If current has a pending SIGKILL or is exiting, then automatically
710 * goal is to allow it to allocate so that it may quickly exit and free 625 * select it. The goal is to allow it to allocate so that it may
711 * its memory. 626 * quickly exit and free its memory.
712 */ 627 */
713 if (fatal_signal_pending(current)) { 628 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
714 set_thread_flag(TIF_MEMDIE); 629 set_thread_flag(TIF_MEMDIE);
715 return; 630 return;
716 } 631 }
@@ -756,15 +671,16 @@ out:
756 671
757/* 672/*
758 * The pagefault handler calls here because it is out of memory, so kill a 673 * The pagefault handler calls here because it is out of memory, so kill a
759 * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel 674 * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
760 * oom killing is already in progress so do nothing. If a task is found with 675 * parallel oom killing is already in progress so do nothing.
761 * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
762 */ 676 */
763void pagefault_out_of_memory(void) 677void pagefault_out_of_memory(void)
764{ 678{
765 if (try_set_system_oom()) { 679 struct zonelist *zonelist = node_zonelist(first_online_node,
680 GFP_KERNEL);
681
682 if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
766 out_of_memory(NULL, 0, 0, NULL, false); 683 out_of_memory(NULL, 0, 0, NULL, false);
767 clear_system_oom(); 684 clear_zonelist_oom(zonelist, GFP_KERNEL);
768 } 685 }
769 schedule_timeout_killable(1);
770} 686}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 830893b2b3c7..0713bfbf0954 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -201,6 +201,18 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
201 zone_reclaimable_pages(z) - z->dirty_balance_reserve; 201 zone_reclaimable_pages(z) - z->dirty_balance_reserve;
202 } 202 }
203 /* 203 /*
204 * Unreclaimable memory (kernel memory or anonymous memory
205 * without swap) can bring down the dirtyable pages below
206 * the zone's dirty balance reserve and the above calculation
207 * will underflow. However we still want to add in nodes
208 * which are below threshold (negative values) to get a more
209 * accurate calculation but make sure that the total never
210 * underflows.
211 */
212 if ((long)x < 0)
213 x = 0;
214
215 /*
204 * Make sure that the number of highmem pages is never larger 216 * Make sure that the number of highmem pages is never larger
205 * than the number of the total dirtyable memory. This can only 217 * than the number of the total dirtyable memory. This can only
206 * occur in very strange VM situations but we want to make sure 218 * occur in very strange VM situations but we want to make sure
@@ -222,8 +234,8 @@ static unsigned long global_dirtyable_memory(void)
222{ 234{
223 unsigned long x; 235 unsigned long x;
224 236
225 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - 237 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
226 dirty_balance_reserve; 238 x -= min(x, dirty_balance_reserve);
227 239
228 if (!vm_highmem_is_dirtyable) 240 if (!vm_highmem_is_dirtyable)
229 x -= highmem_dirtyable_memory(x); 241 x -= highmem_dirtyable_memory(x);
@@ -290,9 +302,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
290 * highmem zone can hold its share of dirty pages, so we don't 302 * highmem zone can hold its share of dirty pages, so we don't
291 * care about vm_highmem_is_dirtyable here. 303 * care about vm_highmem_is_dirtyable here.
292 */ 304 */
293 return zone_page_state(zone, NR_FREE_PAGES) + 305 unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
294 zone_reclaimable_pages(zone) - 306 zone_reclaimable_pages(zone);
295 zone->dirty_balance_reserve; 307
308 /* don't allow this to underflow */
309 nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
310 return nr_pages;
296} 311}
297 312
298/** 313/**
@@ -1069,7 +1084,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
1069} 1084}
1070 1085
1071/* 1086/*
1072 * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() 1087 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
1073 * will look to see if it needs to start dirty throttling. 1088 * will look to see if it needs to start dirty throttling.
1074 * 1089 *
1075 * If dirty_poll_interval is too low, big NUMA machines will call the expensive 1090 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
@@ -1436,9 +1451,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
1436DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; 1451DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1437 1452
1438/** 1453/**
1439 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1454 * balance_dirty_pages_ratelimited - balance dirty memory state
1440 * @mapping: address_space which was dirtied 1455 * @mapping: address_space which was dirtied
1441 * @nr_pages_dirtied: number of pages which the caller has just dirtied
1442 * 1456 *
1443 * Processes which are dirtying memory should call in here once for each page 1457 * Processes which are dirtying memory should call in here once for each page
1444 * which was newly dirtied. The function will periodically check the system's 1458 * which was newly dirtied. The function will periodically check the system's
@@ -1449,8 +1463,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1449 * limit we decrease the ratelimiting by a lot, to prevent individual processes 1463 * limit we decrease the ratelimiting by a lot, to prevent individual processes
1450 * from overshooting the limit by (ratelimit_pages) each. 1464 * from overshooting the limit by (ratelimit_pages) each.
1451 */ 1465 */
1452void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 1466void balance_dirty_pages_ratelimited(struct address_space *mapping)
1453 unsigned long nr_pages_dirtied)
1454{ 1467{
1455 struct backing_dev_info *bdi = mapping->backing_dev_info; 1468 struct backing_dev_info *bdi = mapping->backing_dev_info;
1456 int ratelimit; 1469 int ratelimit;
@@ -1484,6 +1497,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1484 */ 1497 */
1485 p = &__get_cpu_var(dirty_throttle_leaks); 1498 p = &__get_cpu_var(dirty_throttle_leaks);
1486 if (*p > 0 && current->nr_dirtied < ratelimit) { 1499 if (*p > 0 && current->nr_dirtied < ratelimit) {
1500 unsigned long nr_pages_dirtied;
1487 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); 1501 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
1488 *p -= nr_pages_dirtied; 1502 *p -= nr_pages_dirtied;
1489 current->nr_dirtied += nr_pages_dirtied; 1503 current->nr_dirtied += nr_pages_dirtied;
@@ -1493,7 +1507,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1493 if (unlikely(current->nr_dirtied >= ratelimit)) 1507 if (unlikely(current->nr_dirtied >= ratelimit))
1494 balance_dirty_pages(mapping, current->nr_dirtied); 1508 balance_dirty_pages(mapping, current->nr_dirtied);
1495} 1509}
1496EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 1510EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
1497 1511
1498void throttle_vm_writeout(gfp_t gfp_mask) 1512void throttle_vm_writeout(gfp_t gfp_mask)
1499{ 1513{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7bb35ac0964a..df2022ff0c8a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
90#ifdef CONFIG_HIGHMEM 90#ifdef CONFIG_HIGHMEM
91 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 91 [N_HIGH_MEMORY] = { { [0] = 1UL } },
92#endif 92#endif
93#ifdef CONFIG_MOVABLE_NODE
94 [N_MEMORY] = { { [0] = 1UL } },
95#endif
93 [N_CPU] = { { [0] = 1UL } }, 96 [N_CPU] = { { [0] = 1UL } },
94#endif /* NUMA */ 97#endif /* NUMA */
95}; 98};
@@ -218,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes);
218 221
219int page_group_by_mobility_disabled __read_mostly; 222int page_group_by_mobility_disabled __read_mostly;
220 223
221/*
222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate.
225 */
226void set_pageblock_migratetype(struct page *page, int migratetype) 224void set_pageblock_migratetype(struct page *page, int migratetype)
227{ 225{
228 226
@@ -368,8 +366,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
368 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
369 int bad = 0; 367 int bad = 0;
370 368
371 if (unlikely(compound_order(page) != order) || 369 if (unlikely(compound_order(page) != order)) {
372 unlikely(!PageHead(page))) {
373 bad_page(page); 370 bad_page(page);
374 bad++; 371 bad++;
375 } 372 }
@@ -523,7 +520,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
523 * If a block is freed, and its buddy is also free, then this 520 * If a block is freed, and its buddy is also free, then this
524 * triggers coalescing into a block of larger size. 521 * triggers coalescing into a block of larger size.
525 * 522 *
526 * -- wli 523 * -- nyc
527 */ 524 */
528 525
529static inline void __free_one_page(struct page *page, 526static inline void __free_one_page(struct page *page,
@@ -608,6 +605,7 @@ static inline int free_pages_check(struct page *page)
608 bad_page(page); 605 bad_page(page);
609 return 1; 606 return 1;
610 } 607 }
608 reset_page_last_nid(page);
611 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 609 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
612 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 610 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
613 return 0; 611 return 0;
@@ -667,11 +665,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 665 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
668 __free_one_page(page, zone, 0, mt); 666 __free_one_page(page, zone, 0, mt);
669 trace_mm_page_pcpu_drain(page, 0, mt); 667 trace_mm_page_pcpu_drain(page, 0, mt);
670 if (is_migrate_cma(mt)) 668 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 669 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
670 if (is_migrate_cma(mt))
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
672 }
672 } while (--to_free && --batch_free && !list_empty(list)); 673 } while (--to_free && --batch_free && !list_empty(list));
673 } 674 }
674 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
675 spin_unlock(&zone->lock); 675 spin_unlock(&zone->lock);
676} 676}
677 677
@@ -730,6 +730,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
730 local_irq_restore(flags); 730 local_irq_restore(flags);
731} 731}
732 732
733/*
734 * Read access to zone->managed_pages is safe because it's unsigned long,
735 * but we still need to serialize writers. Currently all callers of
736 * __free_pages_bootmem() except put_page_bootmem() should only be used
737 * at boot time. So for shorter boot time, we shift the burden to
738 * put_page_bootmem() to serialize writers.
739 */
733void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 740void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
734{ 741{
735 unsigned int nr_pages = 1 << order; 742 unsigned int nr_pages = 1 << order;
@@ -745,6 +752,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
745 set_page_count(p, 0); 752 set_page_count(p, 0);
746 } 753 }
747 754
755 page_zone(page)->managed_pages += 1 << order;
748 set_page_refcounted(page); 756 set_page_refcounted(page);
749 __free_pages(page, order); 757 __free_pages(page, order);
750} 758}
@@ -780,7 +788,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
780 * large block of memory acted on by a series of small allocations. 788 * large block of memory acted on by a series of small allocations.
781 * This behavior is a critical factor in sglist merging's success. 789 * This behavior is a critical factor in sglist merging's success.
782 * 790 *
783 * -- wli 791 * -- nyc
784 */ 792 */
785static inline void expand(struct zone *zone, struct page *page, 793static inline void expand(struct zone *zone, struct page *page,
786 int low, int high, struct free_area *area, 794 int low, int high, struct free_area *area,
@@ -1376,14 +1384,8 @@ void split_page(struct page *page, unsigned int order)
1376 set_page_refcounted(page + i); 1384 set_page_refcounted(page + i);
1377} 1385}
1378 1386
1379/* 1387static int __isolate_free_page(struct page *page, unsigned int order)
1380 * Similar to the split_page family of functions except that the page
1381 * required at the given order and being isolated now to prevent races
1382 * with parallel allocators
1383 */
1384int capture_free_page(struct page *page, int alloc_order, int migratetype)
1385{ 1388{
1386 unsigned int order;
1387 unsigned long watermark; 1389 unsigned long watermark;
1388 struct zone *zone; 1390 struct zone *zone;
1389 int mt; 1391 int mt;
@@ -1391,27 +1393,23 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1391 BUG_ON(!PageBuddy(page)); 1393 BUG_ON(!PageBuddy(page));
1392 1394
1393 zone = page_zone(page); 1395 zone = page_zone(page);
1394 order = page_order(page); 1396 mt = get_pageblock_migratetype(page);
1395 1397
1396 /* Obey watermarks as if the page was being allocated */ 1398 if (mt != MIGRATE_ISOLATE) {
1397 watermark = low_wmark_pages(zone) + (1 << order); 1399 /* Obey watermarks as if the page was being allocated */
1398 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1400 watermark = low_wmark_pages(zone) + (1 << order);
1399 return 0; 1401 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1402 return 0;
1403
1404 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1405 }
1400 1406
1401 /* Remove page from free list */ 1407 /* Remove page from free list */
1402 list_del(&page->lru); 1408 list_del(&page->lru);
1403 zone->free_area[order].nr_free--; 1409 zone->free_area[order].nr_free--;
1404 rmv_page_order(page); 1410 rmv_page_order(page);
1405 1411
1406 mt = get_pageblock_migratetype(page); 1412 /* Set the pageblock if the isolated page is at least a pageblock */
1407 if (unlikely(mt != MIGRATE_ISOLATE))
1408 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1409
1410 if (alloc_order != order)
1411 expand(zone, page, alloc_order, order,
1412 &zone->free_area[order], migratetype);
1413
1414 /* Set the pageblock if the captured page is at least a pageblock */
1415 if (order >= pageblock_order - 1) { 1413 if (order >= pageblock_order - 1) {
1416 struct page *endpage = page + (1 << order) - 1; 1414 struct page *endpage = page + (1 << order) - 1;
1417 for (; page < endpage; page += pageblock_nr_pages) { 1415 for (; page < endpage; page += pageblock_nr_pages) {
@@ -1440,10 +1438,9 @@ int split_free_page(struct page *page)
1440 unsigned int order; 1438 unsigned int order;
1441 int nr_pages; 1439 int nr_pages;
1442 1440
1443 BUG_ON(!PageBuddy(page));
1444 order = page_order(page); 1441 order = page_order(page);
1445 1442
1446 nr_pages = capture_free_page(page, order, 0); 1443 nr_pages = __isolate_free_page(page, order);
1447 if (!nr_pages) 1444 if (!nr_pages)
1448 return 0; 1445 return 0;
1449 1446
@@ -1641,20 +1638,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1641 return true; 1638 return true;
1642} 1639}
1643 1640
1644#ifdef CONFIG_MEMORY_ISOLATION
1645static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1646{
1647 if (unlikely(zone->nr_pageblock_isolate))
1648 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1649 return 0;
1650}
1651#else
1652static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1653{
1654 return 0;
1655}
1656#endif
1657
1658bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1659 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1660{ 1643{
@@ -1670,14 +1653,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1670 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1671 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1672 1655
1673 /*
1674 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1675 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1676 * sleep although it could do so. But this is more desirable for memory
1677 * hotplug than sleeping which can cause a livelock in the direct
1678 * reclaim path.
1679 */
1680 free_pages -= nr_zone_isolate_freepages(z);
1681 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1656 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1682 free_pages); 1657 free_pages);
1683} 1658}
@@ -1692,7 +1667,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1692 * 1667 *
1693 * If the zonelist cache is present in the passed in zonelist, then 1668 * If the zonelist cache is present in the passed in zonelist, then
1694 * returns a pointer to the allowed node mask (either the current 1669 * returns a pointer to the allowed node mask (either the current
1695 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) 1670 * tasks mems_allowed, or node_states[N_MEMORY].)
1696 * 1671 *
1697 * If the zonelist cache is not available for this zonelist, does 1672 * If the zonelist cache is not available for this zonelist, does
1698 * nothing and returns NULL. 1673 * nothing and returns NULL.
@@ -1721,7 +1696,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1721 1696
1722 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1697 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1723 &cpuset_current_mems_allowed : 1698 &cpuset_current_mems_allowed :
1724 &node_states[N_HIGH_MEMORY]; 1699 &node_states[N_MEMORY];
1725 return allowednodes; 1700 return allowednodes;
1726} 1701}
1727 1702
@@ -1871,7 +1846,7 @@ zonelist_scan:
1871 */ 1846 */
1872 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1847 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1873 high_zoneidx, nodemask) { 1848 high_zoneidx, nodemask) {
1874 if (NUMA_BUILD && zlc_active && 1849 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1875 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1850 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1876 continue; 1851 continue;
1877 if ((alloc_flags & ALLOC_CPUSET) && 1852 if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1917,7 +1892,8 @@ zonelist_scan:
1917 classzone_idx, alloc_flags)) 1892 classzone_idx, alloc_flags))
1918 goto try_this_zone; 1893 goto try_this_zone;
1919 1894
1920 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { 1895 if (IS_ENABLED(CONFIG_NUMA) &&
1896 !did_zlc_setup && nr_online_nodes > 1) {
1921 /* 1897 /*
1922 * we do zlc_setup if there are multiple nodes 1898 * we do zlc_setup if there are multiple nodes
1923 * and before considering the first zone allowed 1899 * and before considering the first zone allowed
@@ -1936,7 +1912,7 @@ zonelist_scan:
1936 * As we may have just activated ZLC, check if the first 1912 * As we may have just activated ZLC, check if the first
1937 * eligible zone has failed zone_reclaim recently. 1913 * eligible zone has failed zone_reclaim recently.
1938 */ 1914 */
1939 if (NUMA_BUILD && zlc_active && 1915 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1940 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1916 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1941 continue; 1917 continue;
1942 1918
@@ -1962,11 +1938,11 @@ try_this_zone:
1962 if (page) 1938 if (page)
1963 break; 1939 break;
1964this_zone_full: 1940this_zone_full:
1965 if (NUMA_BUILD) 1941 if (IS_ENABLED(CONFIG_NUMA))
1966 zlc_mark_zone_full(zonelist, z); 1942 zlc_mark_zone_full(zonelist, z);
1967 } 1943 }
1968 1944
1969 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1945 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
1970 /* Disable zlc cache for second zonelist scan */ 1946 /* Disable zlc cache for second zonelist scan */
1971 zlc_active = 0; 1947 zlc_active = 0;
1972 goto zonelist_scan; 1948 goto zonelist_scan;
@@ -2148,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2148 bool *contended_compaction, bool *deferred_compaction, 2124 bool *contended_compaction, bool *deferred_compaction,
2149 unsigned long *did_some_progress) 2125 unsigned long *did_some_progress)
2150{ 2126{
2151 struct page *page = NULL;
2152
2153 if (!order) 2127 if (!order)
2154 return NULL; 2128 return NULL;
2155 2129
@@ -2161,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2161 current->flags |= PF_MEMALLOC; 2135 current->flags |= PF_MEMALLOC;
2162 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2136 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2163 nodemask, sync_migration, 2137 nodemask, sync_migration,
2164 contended_compaction, &page); 2138 contended_compaction);
2165 current->flags &= ~PF_MEMALLOC; 2139 current->flags &= ~PF_MEMALLOC;
2166 2140
2167 /* If compaction captured a page, prep and use it */
2168 if (page) {
2169 prep_new_page(page, order, gfp_mask);
2170 goto got_page;
2171 }
2172
2173 if (*did_some_progress != COMPACT_SKIPPED) { 2141 if (*did_some_progress != COMPACT_SKIPPED) {
2142 struct page *page;
2143
2174 /* Page migration frees to the PCP lists but we want merging */ 2144 /* Page migration frees to the PCP lists but we want merging */
2175 drain_pages(get_cpu()); 2145 drain_pages(get_cpu());
2176 put_cpu(); 2146 put_cpu();
@@ -2180,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2180 alloc_flags & ~ALLOC_NO_WATERMARKS, 2150 alloc_flags & ~ALLOC_NO_WATERMARKS,
2181 preferred_zone, migratetype); 2151 preferred_zone, migratetype);
2182 if (page) { 2152 if (page) {
2183got_page:
2184 preferred_zone->compact_blockskip_flush = false; 2153 preferred_zone->compact_blockskip_flush = false;
2185 preferred_zone->compact_considered = 0; 2154 preferred_zone->compact_considered = 0;
2186 preferred_zone->compact_defer_shift = 0; 2155 preferred_zone->compact_defer_shift = 0;
@@ -2266,7 +2235,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2266 return NULL; 2235 return NULL;
2267 2236
2268 /* After successful reclaim, reconsider all zones for allocation */ 2237 /* After successful reclaim, reconsider all zones for allocation */
2269 if (NUMA_BUILD) 2238 if (IS_ENABLED(CONFIG_NUMA))
2270 zlc_clear_zones_full(zonelist); 2239 zlc_clear_zones_full(zonelist);
2271 2240
2272retry: 2241retry:
@@ -2412,12 +2381,14 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2412 * allowed per node queues are empty and that nodes are 2381 * allowed per node queues are empty and that nodes are
2413 * over allocated. 2382 * over allocated.
2414 */ 2383 */
2415 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2384 if (IS_ENABLED(CONFIG_NUMA) &&
2385 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2416 goto nopage; 2386 goto nopage;
2417 2387
2418restart: 2388restart:
2419 wake_all_kswapd(order, zonelist, high_zoneidx, 2389 if (!(gfp_mask & __GFP_NO_KSWAPD))
2420 zone_idx(preferred_zone)); 2390 wake_all_kswapd(order, zonelist, high_zoneidx,
2391 zone_idx(preferred_zone));
2421 2392
2422 /* 2393 /*
2423 * OK, we're below the kswapd watermark and have kicked background 2394 * OK, we're below the kswapd watermark and have kicked background
@@ -2494,7 +2465,7 @@ rebalance:
2494 * system then fail the allocation instead of entering direct reclaim. 2465 * system then fail the allocation instead of entering direct reclaim.
2495 */ 2466 */
2496 if ((deferred_compaction || contended_compaction) && 2467 if ((deferred_compaction || contended_compaction) &&
2497 (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) 2468 (gfp_mask & __GFP_NO_KSWAPD))
2498 goto nopage; 2469 goto nopage;
2499 2470
2500 /* Try direct reclaim and then allocating */ 2471 /* Try direct reclaim and then allocating */
@@ -2595,6 +2566,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2595 int migratetype = allocflags_to_migratetype(gfp_mask); 2566 int migratetype = allocflags_to_migratetype(gfp_mask);
2596 unsigned int cpuset_mems_cookie; 2567 unsigned int cpuset_mems_cookie;
2597 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; 2568 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2569 struct mem_cgroup *memcg = NULL;
2598 2570
2599 gfp_mask &= gfp_allowed_mask; 2571 gfp_mask &= gfp_allowed_mask;
2600 2572
@@ -2613,6 +2585,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2613 if (unlikely(!zonelist->_zonerefs->zone)) 2585 if (unlikely(!zonelist->_zonerefs->zone))
2614 return NULL; 2586 return NULL;
2615 2587
2588 /*
2589 * Will only have any effect when __GFP_KMEMCG is set. This is
2590 * verified in the (always inline) callee
2591 */
2592 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2593 return NULL;
2594
2616retry_cpuset: 2595retry_cpuset:
2617 cpuset_mems_cookie = get_mems_allowed(); 2596 cpuset_mems_cookie = get_mems_allowed();
2618 2597
@@ -2648,6 +2627,8 @@ out:
2648 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2627 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2649 goto retry_cpuset; 2628 goto retry_cpuset;
2650 2629
2630 memcg_kmem_commit_charge(page, memcg, order);
2631
2651 return page; 2632 return page;
2652} 2633}
2653EXPORT_SYMBOL(__alloc_pages_nodemask); 2634EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2700,6 +2681,31 @@ void free_pages(unsigned long addr, unsigned int order)
2700 2681
2701EXPORT_SYMBOL(free_pages); 2682EXPORT_SYMBOL(free_pages);
2702 2683
2684/*
2685 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2686 * pages allocated with __GFP_KMEMCG.
2687 *
2688 * Those pages are accounted to a particular memcg, embedded in the
2689 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2690 * for that information only to find out that it is NULL for users who have no
2691 * interest in that whatsoever, we provide these functions.
2692 *
2693 * The caller knows better which flags it relies on.
2694 */
2695void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2696{
2697 memcg_kmem_uncharge_pages(page, order);
2698 __free_pages(page, order);
2699}
2700
2701void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2702{
2703 if (addr != 0) {
2704 VM_BUG_ON(!virt_addr_valid((void *)addr));
2705 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2706 }
2707}
2708
2703static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2709static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2704{ 2710{
2705 if (addr) { 2711 if (addr) {
@@ -2818,7 +2824,7 @@ unsigned int nr_free_pagecache_pages(void)
2818 2824
2819static inline void show_node(struct zone *zone) 2825static inline void show_node(struct zone *zone)
2820{ 2826{
2821 if (NUMA_BUILD) 2827 if (IS_ENABLED(CONFIG_NUMA))
2822 printk("Node %d ", zone_to_nid(zone)); 2828 printk("Node %d ", zone_to_nid(zone));
2823} 2829}
2824 2830
@@ -2876,6 +2882,31 @@ out:
2876 2882
2877#define K(x) ((x) << (PAGE_SHIFT-10)) 2883#define K(x) ((x) << (PAGE_SHIFT-10))
2878 2884
2885static void show_migration_types(unsigned char type)
2886{
2887 static const char types[MIGRATE_TYPES] = {
2888 [MIGRATE_UNMOVABLE] = 'U',
2889 [MIGRATE_RECLAIMABLE] = 'E',
2890 [MIGRATE_MOVABLE] = 'M',
2891 [MIGRATE_RESERVE] = 'R',
2892#ifdef CONFIG_CMA
2893 [MIGRATE_CMA] = 'C',
2894#endif
2895 [MIGRATE_ISOLATE] = 'I',
2896 };
2897 char tmp[MIGRATE_TYPES + 1];
2898 char *p = tmp;
2899 int i;
2900
2901 for (i = 0; i < MIGRATE_TYPES; i++) {
2902 if (type & (1 << i))
2903 *p++ = types[i];
2904 }
2905
2906 *p = '\0';
2907 printk("(%s) ", tmp);
2908}
2909
2879/* 2910/*
2880 * Show free area list (used inside shift_scroll-lock stuff) 2911 * Show free area list (used inside shift_scroll-lock stuff)
2881 * We also calculate the percentage fragmentation. We do this by counting the 2912 * We also calculate the percentage fragmentation. We do this by counting the
@@ -2950,6 +2981,7 @@ void show_free_areas(unsigned int filter)
2950 " isolated(anon):%lukB" 2981 " isolated(anon):%lukB"
2951 " isolated(file):%lukB" 2982 " isolated(file):%lukB"
2952 " present:%lukB" 2983 " present:%lukB"
2984 " managed:%lukB"
2953 " mlocked:%lukB" 2985 " mlocked:%lukB"
2954 " dirty:%lukB" 2986 " dirty:%lukB"
2955 " writeback:%lukB" 2987 " writeback:%lukB"
@@ -2979,6 +3011,7 @@ void show_free_areas(unsigned int filter)
2979 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3011 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2980 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3012 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2981 K(zone->present_pages), 3013 K(zone->present_pages),
3014 K(zone->managed_pages),
2982 K(zone_page_state(zone, NR_MLOCK)), 3015 K(zone_page_state(zone, NR_MLOCK)),
2983 K(zone_page_state(zone, NR_FILE_DIRTY)), 3016 K(zone_page_state(zone, NR_FILE_DIRTY)),
2984 K(zone_page_state(zone, NR_WRITEBACK)), 3017 K(zone_page_state(zone, NR_WRITEBACK)),
@@ -3004,6 +3037,7 @@ void show_free_areas(unsigned int filter)
3004 3037
3005 for_each_populated_zone(zone) { 3038 for_each_populated_zone(zone) {
3006 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3039 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3040 unsigned char types[MAX_ORDER];
3007 3041
3008 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3042 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3009 continue; 3043 continue;
@@ -3012,12 +3046,24 @@ void show_free_areas(unsigned int filter)
3012 3046
3013 spin_lock_irqsave(&zone->lock, flags); 3047 spin_lock_irqsave(&zone->lock, flags);
3014 for (order = 0; order < MAX_ORDER; order++) { 3048 for (order = 0; order < MAX_ORDER; order++) {
3015 nr[order] = zone->free_area[order].nr_free; 3049 struct free_area *area = &zone->free_area[order];
3050 int type;
3051
3052 nr[order] = area->nr_free;
3016 total += nr[order] << order; 3053 total += nr[order] << order;
3054
3055 types[order] = 0;
3056 for (type = 0; type < MIGRATE_TYPES; type++) {
3057 if (!list_empty(&area->free_list[type]))
3058 types[order] |= 1 << type;
3059 }
3017 } 3060 }
3018 spin_unlock_irqrestore(&zone->lock, flags); 3061 spin_unlock_irqrestore(&zone->lock, flags);
3019 for (order = 0; order < MAX_ORDER; order++) 3062 for (order = 0; order < MAX_ORDER; order++) {
3020 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3063 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3064 if (nr[order])
3065 show_migration_types(types[order]);
3066 }
3021 printk("= %lukB\n", K(total)); 3067 printk("= %lukB\n", K(total));
3022 } 3068 }
3023 3069
@@ -3194,7 +3240,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
3194 return node; 3240 return node;
3195 } 3241 }
3196 3242
3197 for_each_node_state(n, N_HIGH_MEMORY) { 3243 for_each_node_state(n, N_MEMORY) {
3198 3244
3199 /* Don't want a node to appear more than once */ 3245 /* Don't want a node to appear more than once */
3200 if (node_isset(n, *used_node_mask)) 3246 if (node_isset(n, *used_node_mask))
@@ -3336,7 +3382,7 @@ static int default_zonelist_order(void)
3336 * local memory, NODE_ORDER may be suitable. 3382 * local memory, NODE_ORDER may be suitable.
3337 */ 3383 */
3338 average_size = total_size / 3384 average_size = total_size /
3339 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 3385 (nodes_weight(node_states[N_MEMORY]) + 1);
3340 for_each_online_node(nid) { 3386 for_each_online_node(nid) {
3341 low_kmem_size = 0; 3387 low_kmem_size = 0;
3342 total_size = 0; 3388 total_size = 0;
@@ -3826,6 +3872,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3826 mminit_verify_page_links(page, zone, nid, pfn); 3872 mminit_verify_page_links(page, zone, nid, pfn);
3827 init_page_count(page); 3873 init_page_count(page);
3828 reset_page_mapcount(page); 3874 reset_page_mapcount(page);
3875 reset_page_last_nid(page);
3829 SetPageReserved(page); 3876 SetPageReserved(page);
3830 /* 3877 /*
3831 * Mark the block movable so that blocks are reserved for 3878 * Mark the block movable so that blocks are reserved for
@@ -4432,6 +4479,26 @@ void __init set_pageblock_order(void)
4432 4479
4433#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4480#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4434 4481
4482static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4483 unsigned long present_pages)
4484{
4485 unsigned long pages = spanned_pages;
4486
4487 /*
4488 * Provide a more accurate estimation if there are holes within
4489 * the zone and SPARSEMEM is in use. If there are holes within the
4490 * zone, each populated memory region may cost us one or two extra
4491 * memmap pages due to alignment because memmap pages for each
4492 * populated regions may not naturally algined on page boundary.
4493 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4494 */
4495 if (spanned_pages > present_pages + (present_pages >> 4) &&
4496 IS_ENABLED(CONFIG_SPARSEMEM))
4497 pages = present_pages;
4498
4499 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4500}
4501
4435/* 4502/*
4436 * Set up the zone data structures: 4503 * Set up the zone data structures:
4437 * - mark all pages reserved 4504 * - mark all pages reserved
@@ -4449,54 +4516,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4449 int ret; 4516 int ret;
4450 4517
4451 pgdat_resize_init(pgdat); 4518 pgdat_resize_init(pgdat);
4519#ifdef CONFIG_NUMA_BALANCING
4520 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4521 pgdat->numabalancing_migrate_nr_pages = 0;
4522 pgdat->numabalancing_migrate_next_window = jiffies;
4523#endif
4452 init_waitqueue_head(&pgdat->kswapd_wait); 4524 init_waitqueue_head(&pgdat->kswapd_wait);
4453 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4525 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4454 pgdat_page_cgroup_init(pgdat); 4526 pgdat_page_cgroup_init(pgdat);
4455 4527
4456 for (j = 0; j < MAX_NR_ZONES; j++) { 4528 for (j = 0; j < MAX_NR_ZONES; j++) {
4457 struct zone *zone = pgdat->node_zones + j; 4529 struct zone *zone = pgdat->node_zones + j;
4458 unsigned long size, realsize, memmap_pages; 4530 unsigned long size, realsize, freesize, memmap_pages;
4459 4531
4460 size = zone_spanned_pages_in_node(nid, j, zones_size); 4532 size = zone_spanned_pages_in_node(nid, j, zones_size);
4461 realsize = size - zone_absent_pages_in_node(nid, j, 4533 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4462 zholes_size); 4534 zholes_size);
4463 4535
4464 /* 4536 /*
4465 * Adjust realsize so that it accounts for how much memory 4537 * Adjust freesize so that it accounts for how much memory
4466 * is used by this zone for memmap. This affects the watermark 4538 * is used by this zone for memmap. This affects the watermark
4467 * and per-cpu initialisations 4539 * and per-cpu initialisations
4468 */ 4540 */
4469 memmap_pages = 4541 memmap_pages = calc_memmap_size(size, realsize);
4470 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 4542 if (freesize >= memmap_pages) {
4471 if (realsize >= memmap_pages) { 4543 freesize -= memmap_pages;
4472 realsize -= memmap_pages;
4473 if (memmap_pages) 4544 if (memmap_pages)
4474 printk(KERN_DEBUG 4545 printk(KERN_DEBUG
4475 " %s zone: %lu pages used for memmap\n", 4546 " %s zone: %lu pages used for memmap\n",
4476 zone_names[j], memmap_pages); 4547 zone_names[j], memmap_pages);
4477 } else 4548 } else
4478 printk(KERN_WARNING 4549 printk(KERN_WARNING
4479 " %s zone: %lu pages exceeds realsize %lu\n", 4550 " %s zone: %lu pages exceeds freesize %lu\n",
4480 zone_names[j], memmap_pages, realsize); 4551 zone_names[j], memmap_pages, freesize);
4481 4552
4482 /* Account for reserved pages */ 4553 /* Account for reserved pages */
4483 if (j == 0 && realsize > dma_reserve) { 4554 if (j == 0 && freesize > dma_reserve) {
4484 realsize -= dma_reserve; 4555 freesize -= dma_reserve;
4485 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4556 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4486 zone_names[0], dma_reserve); 4557 zone_names[0], dma_reserve);
4487 } 4558 }
4488 4559
4489 if (!is_highmem_idx(j)) 4560 if (!is_highmem_idx(j))
4490 nr_kernel_pages += realsize; 4561 nr_kernel_pages += freesize;
4491 nr_all_pages += realsize; 4562 /* Charge for highmem memmap if there are enough kernel pages */
4563 else if (nr_kernel_pages > memmap_pages * 2)
4564 nr_kernel_pages -= memmap_pages;
4565 nr_all_pages += freesize;
4492 4566
4493 zone->spanned_pages = size; 4567 zone->spanned_pages = size;
4494 zone->present_pages = realsize; 4568 zone->present_pages = freesize;
4569 /*
4570 * Set an approximate value for lowmem here, it will be adjusted
4571 * when the bootmem allocator frees pages into the buddy system.
4572 * And all highmem pages will be managed by the buddy system.
4573 */
4574 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4495#ifdef CONFIG_NUMA 4575#ifdef CONFIG_NUMA
4496 zone->node = nid; 4576 zone->node = nid;
4497 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4577 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4498 / 100; 4578 / 100;
4499 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 4579 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4500#endif 4580#endif
4501 zone->name = zone_names[j]; 4581 zone->name = zone_names[j];
4502 spin_lock_init(&zone->lock); 4582 spin_lock_init(&zone->lock);
@@ -4687,7 +4767,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
4687/* 4767/*
4688 * early_calculate_totalpages() 4768 * early_calculate_totalpages()
4689 * Sum pages in active regions for movable zone. 4769 * Sum pages in active regions for movable zone.
4690 * Populate N_HIGH_MEMORY for calculating usable_nodes. 4770 * Populate N_MEMORY for calculating usable_nodes.
4691 */ 4771 */
4692static unsigned long __init early_calculate_totalpages(void) 4772static unsigned long __init early_calculate_totalpages(void)
4693{ 4773{
@@ -4700,7 +4780,7 @@ static unsigned long __init early_calculate_totalpages(void)
4700 4780
4701 totalpages += pages; 4781 totalpages += pages;
4702 if (pages) 4782 if (pages)
4703 node_set_state(nid, N_HIGH_MEMORY); 4783 node_set_state(nid, N_MEMORY);
4704 } 4784 }
4705 return totalpages; 4785 return totalpages;
4706} 4786}
@@ -4717,9 +4797,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4717 unsigned long usable_startpfn; 4797 unsigned long usable_startpfn;
4718 unsigned long kernelcore_node, kernelcore_remaining; 4798 unsigned long kernelcore_node, kernelcore_remaining;
4719 /* save the state before borrow the nodemask */ 4799 /* save the state before borrow the nodemask */
4720 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; 4800 nodemask_t saved_node_state = node_states[N_MEMORY];
4721 unsigned long totalpages = early_calculate_totalpages(); 4801 unsigned long totalpages = early_calculate_totalpages();
4722 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4802 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
4723 4803
4724 /* 4804 /*
4725 * If movablecore was specified, calculate what size of 4805 * If movablecore was specified, calculate what size of
@@ -4754,7 +4834,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4754restart: 4834restart:
4755 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4835 /* Spread kernelcore memory as evenly as possible throughout nodes */
4756 kernelcore_node = required_kernelcore / usable_nodes; 4836 kernelcore_node = required_kernelcore / usable_nodes;
4757 for_each_node_state(nid, N_HIGH_MEMORY) { 4837 for_each_node_state(nid, N_MEMORY) {
4758 unsigned long start_pfn, end_pfn; 4838 unsigned long start_pfn, end_pfn;
4759 4839
4760 /* 4840 /*
@@ -4846,23 +4926,27 @@ restart:
4846 4926
4847out: 4927out:
4848 /* restore the node_state */ 4928 /* restore the node_state */
4849 node_states[N_HIGH_MEMORY] = saved_node_state; 4929 node_states[N_MEMORY] = saved_node_state;
4850} 4930}
4851 4931
4852/* Any regular memory on that node ? */ 4932/* Any regular or high memory on that node ? */
4853static void __init check_for_regular_memory(pg_data_t *pgdat) 4933static void check_for_memory(pg_data_t *pgdat, int nid)
4854{ 4934{
4855#ifdef CONFIG_HIGHMEM
4856 enum zone_type zone_type; 4935 enum zone_type zone_type;
4857 4936
4858 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 4937 if (N_MEMORY == N_NORMAL_MEMORY)
4938 return;
4939
4940 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
4859 struct zone *zone = &pgdat->node_zones[zone_type]; 4941 struct zone *zone = &pgdat->node_zones[zone_type];
4860 if (zone->present_pages) { 4942 if (zone->present_pages) {
4861 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 4943 node_set_state(nid, N_HIGH_MEMORY);
4944 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
4945 zone_type <= ZONE_NORMAL)
4946 node_set_state(nid, N_NORMAL_MEMORY);
4862 break; 4947 break;
4863 } 4948 }
4864 } 4949 }
4865#endif
4866} 4950}
4867 4951
4868/** 4952/**
@@ -4945,8 +5029,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4945 5029
4946 /* Any memory on that node */ 5030 /* Any memory on that node */
4947 if (pgdat->node_present_pages) 5031 if (pgdat->node_present_pages)
4948 node_set_state(nid, N_HIGH_MEMORY); 5032 node_set_state(nid, N_MEMORY);
4949 check_for_regular_memory(pgdat); 5033 check_for_memory(pgdat, nid);
4950 } 5034 }
4951} 5035}
4952 5036
@@ -5174,10 +5258,6 @@ static void __setup_per_zone_wmarks(void)
5174 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5258 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5175 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5259 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5176 5260
5177 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5178 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5179 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5180
5181 setup_zone_migrate_reserve(zone); 5261 setup_zone_migrate_reserve(zone);
5182 spin_unlock_irqrestore(&zone->lock, flags); 5262 spin_unlock_irqrestore(&zone->lock, flags);
5183 } 5263 }
@@ -5505,7 +5585,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5505 pfn &= (PAGES_PER_SECTION-1); 5585 pfn &= (PAGES_PER_SECTION-1);
5506 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5586 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5507#else 5587#else
5508 pfn = pfn - zone->zone_start_pfn; 5588 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5509 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5589 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5510#endif /* CONFIG_SPARSEMEM */ 5590#endif /* CONFIG_SPARSEMEM */
5511} 5591}
@@ -5575,7 +5655,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5575 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5655 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5576 * expect this function should be exact. 5656 * expect this function should be exact.
5577 */ 5657 */
5578bool has_unmovable_pages(struct zone *zone, struct page *page, int count) 5658bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5659 bool skip_hwpoisoned_pages)
5579{ 5660{
5580 unsigned long pfn, iter, found; 5661 unsigned long pfn, iter, found;
5581 int mt; 5662 int mt;
@@ -5610,6 +5691,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5610 continue; 5691 continue;
5611 } 5692 }
5612 5693
5694 /*
5695 * The HWPoisoned page may be not in buddy system, and
5696 * page_count() is not 0.
5697 */
5698 if (skip_hwpoisoned_pages && PageHWPoison(page))
5699 continue;
5700
5613 if (!PageLRU(page)) 5701 if (!PageLRU(page))
5614 found++; 5702 found++;
5615 /* 5703 /*
@@ -5652,7 +5740,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5652 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5740 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5653 return false; 5741 return false;
5654 5742
5655 return !has_unmovable_pages(zone, page, 0); 5743 return !has_unmovable_pages(zone, page, 0, true);
5656} 5744}
5657 5745
5658#ifdef CONFIG_CMA 5746#ifdef CONFIG_CMA
@@ -5679,7 +5767,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5679 unsigned int tries = 0; 5767 unsigned int tries = 0;
5680 int ret = 0; 5768 int ret = 0;
5681 5769
5682 migrate_prep_local(); 5770 migrate_prep();
5683 5771
5684 while (pfn < end || !list_empty(&cc->migratepages)) { 5772 while (pfn < end || !list_empty(&cc->migratepages)) {
5685 if (fatal_signal_pending(current)) { 5773 if (fatal_signal_pending(current)) {
@@ -5707,61 +5795,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5707 5795
5708 ret = migrate_pages(&cc->migratepages, 5796 ret = migrate_pages(&cc->migratepages,
5709 alloc_migrate_target, 5797 alloc_migrate_target,
5710 0, false, MIGRATE_SYNC); 5798 0, false, MIGRATE_SYNC,
5799 MR_CMA);
5711 } 5800 }
5712 5801
5713 putback_lru_pages(&cc->migratepages); 5802 putback_movable_pages(&cc->migratepages);
5714 return ret > 0 ? 0 : ret; 5803 return ret > 0 ? 0 : ret;
5715} 5804}
5716 5805
5717/*
5718 * Update zone's cma pages counter used for watermark level calculation.
5719 */
5720static inline void __update_cma_watermarks(struct zone *zone, int count)
5721{
5722 unsigned long flags;
5723 spin_lock_irqsave(&zone->lock, flags);
5724 zone->min_cma_pages += count;
5725 spin_unlock_irqrestore(&zone->lock, flags);
5726 setup_per_zone_wmarks();
5727}
5728
5729/*
5730 * Trigger memory pressure bump to reclaim some pages in order to be able to
5731 * allocate 'count' pages in single page units. Does similar work as
5732 *__alloc_pages_slowpath() function.
5733 */
5734static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5735{
5736 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5737 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5738 int did_some_progress = 0;
5739 int order = 1;
5740
5741 /*
5742 * Increase level of watermarks to force kswapd do his job
5743 * to stabilise at new watermark level.
5744 */
5745 __update_cma_watermarks(zone, count);
5746
5747 /* Obey watermarks as if the page was being allocated */
5748 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5749 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5750
5751 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5752 NULL);
5753 if (!did_some_progress) {
5754 /* Exhausted what can be done so it's blamo time */
5755 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5756 }
5757 }
5758
5759 /* Restore original watermark levels. */
5760 __update_cma_watermarks(zone, -count);
5761
5762 return count;
5763}
5764
5765/** 5806/**
5766 * alloc_contig_range() -- tries to allocate given range of pages 5807 * alloc_contig_range() -- tries to allocate given range of pages
5767 * @start: start PFN to allocate 5808 * @start: start PFN to allocate
@@ -5785,7 +5826,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5785int alloc_contig_range(unsigned long start, unsigned long end, 5826int alloc_contig_range(unsigned long start, unsigned long end,
5786 unsigned migratetype) 5827 unsigned migratetype)
5787{ 5828{
5788 struct zone *zone = page_zone(pfn_to_page(start));
5789 unsigned long outer_start, outer_end; 5829 unsigned long outer_start, outer_end;
5790 int ret = 0, order; 5830 int ret = 0, order;
5791 5831
@@ -5823,7 +5863,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5823 */ 5863 */
5824 5864
5825 ret = start_isolate_page_range(pfn_max_align_down(start), 5865 ret = start_isolate_page_range(pfn_max_align_down(start),
5826 pfn_max_align_up(end), migratetype); 5866 pfn_max_align_up(end), migratetype,
5867 false);
5827 if (ret) 5868 if (ret)
5828 return ret; 5869 return ret;
5829 5870
@@ -5862,18 +5903,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5862 } 5903 }
5863 5904
5864 /* Make sure the range is really isolated. */ 5905 /* Make sure the range is really isolated. */
5865 if (test_pages_isolated(outer_start, end)) { 5906 if (test_pages_isolated(outer_start, end, false)) {
5866 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 5907 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5867 outer_start, end); 5908 outer_start, end);
5868 ret = -EBUSY; 5909 ret = -EBUSY;
5869 goto done; 5910 goto done;
5870 } 5911 }
5871 5912
5872 /*
5873 * Reclaim enough pages to make sure that contiguous allocation
5874 * will not starve the system.
5875 */
5876 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5877 5913
5878 /* Grab isolated pages from freelists. */ 5914 /* Grab isolated pages from freelists. */
5879 outer_end = isolate_freepages_range(&cc, outer_start, end); 5915 outer_end = isolate_freepages_range(&cc, outer_start, end);
@@ -5896,8 +5932,15 @@ done:
5896 5932
5897void free_contig_range(unsigned long pfn, unsigned nr_pages) 5933void free_contig_range(unsigned long pfn, unsigned nr_pages)
5898{ 5934{
5899 for (; nr_pages--; ++pfn) 5935 unsigned int count = 0;
5900 __free_page(pfn_to_page(pfn)); 5936
5937 for (; nr_pages--; pfn++) {
5938 struct page *page = pfn_to_page(pfn);
5939
5940 count += page_count(page) != 1;
5941 __free_page(page);
5942 }
5943 WARN(count != 0, "%d pages are still in use!\n", count);
5901} 5944}
5902#endif 5945#endif
5903 5946
@@ -5931,7 +5974,6 @@ void __meminit zone_pcp_update(struct zone *zone)
5931} 5974}
5932#endif 5975#endif
5933 5976
5934#ifdef CONFIG_MEMORY_HOTREMOVE
5935void zone_pcp_reset(struct zone *zone) 5977void zone_pcp_reset(struct zone *zone)
5936{ 5978{
5937 unsigned long flags; 5979 unsigned long flags;
@@ -5951,6 +5993,7 @@ void zone_pcp_reset(struct zone *zone)
5951 local_irq_restore(flags); 5993 local_irq_restore(flags);
5952} 5994}
5953 5995
5996#ifdef CONFIG_MEMORY_HOTREMOVE
5954/* 5997/*
5955 * All pages in the range must be isolated before calling this. 5998 * All pages in the range must be isolated before calling this.
5956 */ 5999 */
@@ -5977,6 +6020,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5977 continue; 6020 continue;
5978 } 6021 }
5979 page = pfn_to_page(pfn); 6022 page = pfn_to_page(pfn);
6023 /*
6024 * The HWPoisoned page may be not in buddy system, and
6025 * page_count() is not 0.
6026 */
6027 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6028 pfn++;
6029 SetPageReserved(page);
6030 continue;
6031 }
6032
5980 BUG_ON(page_count(page)); 6033 BUG_ON(page_count(page));
5981 BUG_ON(!PageBuddy(page)); 6034 BUG_ON(!PageBuddy(page));
5982 order = page_order(page); 6035 order = page_order(page);
@@ -5987,8 +6040,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5987 list_del(&page->lru); 6040 list_del(&page->lru);
5988 rmv_page_order(page); 6041 rmv_page_order(page);
5989 zone->free_area[order].nr_free--; 6042 zone->free_area[order].nr_free--;
5990 __mod_zone_page_state(zone, NR_FREE_PAGES,
5991 - (1UL << order));
5992 for (i = 0; i < (1 << order); i++) 6043 for (i = 0; i < (1 << order); i++)
5993 SetPageReserved((page+i)); 6044 SetPageReserved((page+i));
5994 pfn += (1 << order); 6045 pfn += (1 << order);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c6daa6..6d757e3a872a 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
251 mn->nr_pages, mn->status_change_nid); 251 mn->nr_pages, mn->status_change_nid);
252 break; 252 break;
253 case MEM_CANCEL_ONLINE: 253 case MEM_CANCEL_ONLINE:
254 offline_page_cgroup(mn->start_pfn,
255 mn->nr_pages, mn->status_change_nid);
256 break;
254 case MEM_GOING_OFFLINE: 257 case MEM_GOING_OFFLINE:
255 break; 258 break;
256 case MEM_ONLINE: 259 case MEM_ONLINE:
@@ -271,7 +274,7 @@ void __init page_cgroup_init(void)
271 if (mem_cgroup_disabled()) 274 if (mem_cgroup_disabled())
272 return; 275 return;
273 276
274 for_each_node_state(nid, N_HIGH_MEMORY) { 277 for_each_node_state(nid, N_MEMORY) {
275 unsigned long start_pfn, end_pfn; 278 unsigned long start_pfn, end_pfn;
276 279
277 start_pfn = node_start_pfn(nid); 280 start_pfn = node_start_pfn(nid);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f2f5b4818e94..383bdbb98b04 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -8,29 +8,7 @@
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include "internal.h" 9#include "internal.h"
10 10
11/* called while holding zone->lock */ 11int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
33int set_migratetype_isolate(struct page *page)
34{ 12{
35 struct zone *zone; 13 struct zone *zone;
36 unsigned long flags, pfn; 14 unsigned long flags, pfn;
@@ -66,7 +44,8 @@ int set_migratetype_isolate(struct page *page)
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 44 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages. 45 * We just check MOVABLE pages.
68 */ 46 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found)) 47 if (!has_unmovable_pages(zone, page, arg.pages_found,
48 skip_hwpoisoned_pages))
70 ret = 0; 49 ret = 0;
71 50
72 /* 51 /*
@@ -79,7 +58,7 @@ out:
79 unsigned long nr_pages; 58 unsigned long nr_pages;
80 int migratetype = get_pageblock_migratetype(page); 59 int migratetype = get_pageblock_migratetype(page);
81 60
82 set_pageblock_isolate(page); 61 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
83 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); 62 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
84 63
85 __mod_zone_freepage_state(zone, -nr_pages, migratetype); 64 __mod_zone_freepage_state(zone, -nr_pages, migratetype);
@@ -102,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
102 goto out; 81 goto out;
103 nr_pages = move_freepages_block(zone, page, migratetype); 82 nr_pages = move_freepages_block(zone, page, migratetype);
104 __mod_zone_freepage_state(zone, nr_pages, migratetype); 83 __mod_zone_freepage_state(zone, nr_pages, migratetype);
105 restore_pageblock_isolate(page, migratetype); 84 set_pageblock_migratetype(page, migratetype);
106out: 85out:
107 spin_unlock_irqrestore(&zone->lock, flags); 86 spin_unlock_irqrestore(&zone->lock, flags);
108} 87}
@@ -134,7 +113,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
134 * Returns 0 on success and -EBUSY if any part of range cannot be isolated. 113 * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
135 */ 114 */
136int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, 115int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
137 unsigned migratetype) 116 unsigned migratetype, bool skip_hwpoisoned_pages)
138{ 117{
139 unsigned long pfn; 118 unsigned long pfn;
140 unsigned long undo_pfn; 119 unsigned long undo_pfn;
@@ -147,7 +126,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
147 pfn < end_pfn; 126 pfn < end_pfn;
148 pfn += pageblock_nr_pages) { 127 pfn += pageblock_nr_pages) {
149 page = __first_valid_page(pfn, pageblock_nr_pages); 128 page = __first_valid_page(pfn, pageblock_nr_pages);
150 if (page && set_migratetype_isolate(page)) { 129 if (page &&
130 set_migratetype_isolate(page, skip_hwpoisoned_pages)) {
151 undo_pfn = pfn; 131 undo_pfn = pfn;
152 goto undo; 132 goto undo;
153 } 133 }
@@ -190,7 +170,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
190 * Returns 1 if all pages in the range are isolated. 170 * Returns 1 if all pages in the range are isolated.
191 */ 171 */
192static int 172static int
193__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 173__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
174 bool skip_hwpoisoned_pages)
194{ 175{
195 struct page *page; 176 struct page *page;
196 177
@@ -220,6 +201,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
220 else if (page_count(page) == 0 && 201 else if (page_count(page) == 0 &&
221 get_freepage_migratetype(page) == MIGRATE_ISOLATE) 202 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
222 pfn += 1; 203 pfn += 1;
204 else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
205 /*
206 * The HWPoisoned page may be not in buddy
207 * system, and page_count() is not 0.
208 */
209 pfn++;
210 continue;
211 }
223 else 212 else
224 break; 213 break;
225 } 214 }
@@ -228,7 +217,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
228 return 1; 217 return 1;
229} 218}
230 219
231int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 220int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
221 bool skip_hwpoisoned_pages)
232{ 222{
233 unsigned long pfn, flags; 223 unsigned long pfn, flags;
234 struct page *page; 224 struct page *page;
@@ -251,7 +241,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
251 /* Check all pages are free or Marked as ISOLATED */ 241 /* Check all pages are free or Marked as ISOLATED */
252 zone = page_zone(page); 242 zone = page_zone(page);
253 spin_lock_irqsave(&zone->lock, flags); 243 spin_lock_irqsave(&zone->lock, flags);
254 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); 244 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
245 skip_hwpoisoned_pages);
255 spin_unlock_irqrestore(&zone->lock, flags); 246 spin_unlock_irqrestore(&zone->lock, flags);
256 return ret ? 0 : -EBUSY; 247 return ret ? 0 : -EBUSY;
257} 248}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 6c118d012bb5..35aa294656cd 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
58 if (!walk->pte_entry) 58 if (!walk->pte_entry)
59 continue; 59 continue;
60 60
61 split_huge_page_pmd(walk->mm, pmd); 61 split_huge_page_pmd_mm(walk->mm, addr, pmd);
62 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 62 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/percpu.c b/mm/percpu.c
index ddc5efb9c5bb..8c8e08f3a692 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -631,7 +631,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
631 if (!chunk) 631 if (!chunk)
632 return; 632 return;
633 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 633 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
634 kfree(chunk); 634 pcpu_mem_free(chunk, pcpu_chunk_struct_size);
635} 635}
636 636
637/* 637/*
@@ -1380,6 +1380,9 @@ enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1380 1380
1381static int __init percpu_alloc_setup(char *str) 1381static int __init percpu_alloc_setup(char *str)
1382{ 1382{
1383 if (!str)
1384 return -EINVAL;
1385
1383 if (0) 1386 if (0)
1384 /* nada */; 1387 /* nada */;
1385#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK 1388#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e642627da6b7..0c8323fe6c8f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -12,8 +12,8 @@
12 12
13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
14/* 14/*
15 * Only sets the access flags (dirty, accessed, and 15 * Only sets the access flags (dirty, accessed), as well as write
16 * writable). Furthermore, we know it always gets set to a "more 16 * permission. Furthermore, we know it always gets set to a "more
17 * permissive" setting, which allows most architectures to optimize 17 * permissive" setting, which allows most architectures to optimize
18 * this. We return whether the PTE actually changed, which in turn 18 * this. We return whether the PTE actually changed, which in turn
19 * instructs the caller to do things like update__mmu_cache. This 19 * instructs the caller to do things like update__mmu_cache. This
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
27 int changed = !pte_same(*ptep, entry); 27 int changed = !pte_same(*ptep, entry);
28 if (changed) { 28 if (changed) {
29 set_pte_at(vma->vm_mm, address, ptep, entry); 29 set_pte_at(vma->vm_mm, address, ptep, entry);
30 flush_tlb_page(vma, address); 30 flush_tlb_fix_spurious_fault(vma, address);
31 } 31 }
32 return changed; 32 return changed;
33} 33}
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
88{ 88{
89 pte_t pte; 89 pte_t pte;
90 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); 90 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
91 flush_tlb_page(vma, address); 91 if (pte_accessible(pte))
92 flush_tlb_page(vma, address);
92 return pte; 93 return pte;
93} 94}
94#endif 95#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 2ee1ef0f317b..2c78f8cadc95 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
24 * mm->mmap_sem 24 * mm->mmap_sem
25 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
26 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
27 * anon_vma->mutex 27 * anon_vma->rwsem
28 * mm->page_table_lock or pte_lock 28 * mm->page_table_lock or pte_lock
29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 29 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
30 * swap_lock (in swap_duplicate, swap_info_get) 30 * swap_lock (in swap_duplicate, swap_info_get)
@@ -37,7 +37,7 @@
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within bdi.wb->list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
39 * 39 *
40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
41 * ->tasklist_lock 41 * ->tasklist_lock
42 * pte map lock 42 * pte map lock
43 */ 43 */
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
87 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 87 VM_BUG_ON(atomic_read(&anon_vma->refcount));
88 88
89 /* 89 /*
90 * Synchronize against page_lock_anon_vma() such that 90 * Synchronize against page_lock_anon_vma_read() such that
91 * we can safely hold the lock without the anon_vma getting 91 * we can safely hold the lock without the anon_vma getting
92 * freed. 92 * freed.
93 * 93 *
94 * Relies on the full mb implied by the atomic_dec_and_test() from 94 * Relies on the full mb implied by the atomic_dec_and_test() from
95 * put_anon_vma() against the acquire barrier implied by 95 * put_anon_vma() against the acquire barrier implied by
96 * mutex_trylock() from page_lock_anon_vma(). This orders: 96 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
97 * 97 *
98 * page_lock_anon_vma() VS put_anon_vma() 98 * page_lock_anon_vma_read() VS put_anon_vma()
99 * mutex_trylock() atomic_dec_and_test() 99 * down_read_trylock() atomic_dec_and_test()
100 * LOCK MB 100 * LOCK MB
101 * atomic_read() mutex_is_locked() 101 * atomic_read() rwsem_is_locked()
102 * 102 *
103 * LOCK should suffice since the actual taking of the lock must 103 * LOCK should suffice since the actual taking of the lock must
104 * happen _before_ what follows. 104 * happen _before_ what follows.
105 */ 105 */
106 if (mutex_is_locked(&anon_vma->root->mutex)) { 106 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
107 anon_vma_lock(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 anon_vma_unlock(anon_vma); 108 anon_vma_unlock(anon_vma);
109 } 109 }
110 110
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
146 * allocate a new one. 146 * allocate a new one.
147 * 147 *
148 * Anon-vma allocations are very subtle, because we may have 148 * Anon-vma allocations are very subtle, because we may have
149 * optimistically looked up an anon_vma in page_lock_anon_vma() 149 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
150 * and that may actually touch the spinlock even in the newly 150 * and that may actually touch the spinlock even in the newly
151 * allocated vma (it depends on RCU to make sure that the 151 * allocated vma (it depends on RCU to make sure that the
152 * anon_vma isn't actually destroyed). 152 * anon_vma isn't actually destroyed).
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
181 allocated = anon_vma; 181 allocated = anon_vma;
182 } 182 }
183 183
184 anon_vma_lock(anon_vma); 184 anon_vma_lock_write(anon_vma);
185 /* page_table_lock to protect against threads */ 185 /* page_table_lock to protect against threads */
186 spin_lock(&mm->page_table_lock); 186 spin_lock(&mm->page_table_lock);
187 if (likely(!vma->anon_vma)) { 187 if (likely(!vma->anon_vma)) {
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
219 struct anon_vma *new_root = anon_vma->root; 219 struct anon_vma *new_root = anon_vma->root;
220 if (new_root != root) { 220 if (new_root != root) {
221 if (WARN_ON_ONCE(root)) 221 if (WARN_ON_ONCE(root))
222 mutex_unlock(&root->mutex); 222 up_write(&root->rwsem);
223 root = new_root; 223 root = new_root;
224 mutex_lock(&root->mutex); 224 down_write(&root->rwsem);
225 } 225 }
226 return root; 226 return root;
227} 227}
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
229static inline void unlock_anon_vma_root(struct anon_vma *root) 229static inline void unlock_anon_vma_root(struct anon_vma *root)
230{ 230{
231 if (root) 231 if (root)
232 mutex_unlock(&root->mutex); 232 up_write(&root->rwsem);
233} 233}
234 234
235/* 235/*
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
306 get_anon_vma(anon_vma->root); 306 get_anon_vma(anon_vma->root);
307 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 307 /* Mark this anon_vma as the one where our new (COWed) pages go. */
308 vma->anon_vma = anon_vma; 308 vma->anon_vma = anon_vma;
309 anon_vma_lock(anon_vma); 309 anon_vma_lock_write(anon_vma);
310 anon_vma_chain_link(vma, avc, anon_vma); 310 anon_vma_chain_link(vma, avc, anon_vma);
311 anon_vma_unlock(anon_vma); 311 anon_vma_unlock(anon_vma);
312 312
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
349 /* 349 /*
350 * Iterate the list once more, it now only contains empty and unlinked 350 * Iterate the list once more, it now only contains empty and unlinked
351 * anon_vmas, destroy them. Could not do before due to __put_anon_vma() 351 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
352 * needing to acquire the anon_vma->root->mutex. 352 * needing to write-acquire the anon_vma->root->rwsem.
353 */ 353 */
354 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 354 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
355 struct anon_vma *anon_vma = avc->anon_vma; 355 struct anon_vma *anon_vma = avc->anon_vma;
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)
365{ 365{
366 struct anon_vma *anon_vma = data; 366 struct anon_vma *anon_vma = data;
367 367
368 mutex_init(&anon_vma->mutex); 368 init_rwsem(&anon_vma->rwsem);
369 atomic_set(&anon_vma->refcount, 0); 369 atomic_set(&anon_vma->refcount, 0);
370 anon_vma->rb_root = RB_ROOT; 370 anon_vma->rb_root = RB_ROOT;
371} 371}
@@ -442,7 +442,7 @@ out:
442 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a 442 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
443 * reference like with page_get_anon_vma() and then block on the mutex. 443 * reference like with page_get_anon_vma() and then block on the mutex.
444 */ 444 */
445struct anon_vma *page_lock_anon_vma(struct page *page) 445struct anon_vma *page_lock_anon_vma_read(struct page *page)
446{ 446{
447 struct anon_vma *anon_vma = NULL; 447 struct anon_vma *anon_vma = NULL;
448 struct anon_vma *root_anon_vma; 448 struct anon_vma *root_anon_vma;
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
457 457
458 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 458 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
459 root_anon_vma = ACCESS_ONCE(anon_vma->root); 459 root_anon_vma = ACCESS_ONCE(anon_vma->root);
460 if (mutex_trylock(&root_anon_vma->mutex)) { 460 if (down_read_trylock(&root_anon_vma->rwsem)) {
461 /* 461 /*
462 * If the page is still mapped, then this anon_vma is still 462 * If the page is still mapped, then this anon_vma is still
463 * its anon_vma, and holding the mutex ensures that it will 463 * its anon_vma, and holding the mutex ensures that it will
464 * not go away, see anon_vma_free(). 464 * not go away, see anon_vma_free().
465 */ 465 */
466 if (!page_mapped(page)) { 466 if (!page_mapped(page)) {
467 mutex_unlock(&root_anon_vma->mutex); 467 up_read(&root_anon_vma->rwsem);
468 anon_vma = NULL; 468 anon_vma = NULL;
469 } 469 }
470 goto out; 470 goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
484 484
485 /* we pinned the anon_vma, its safe to sleep */ 485 /* we pinned the anon_vma, its safe to sleep */
486 rcu_read_unlock(); 486 rcu_read_unlock();
487 anon_vma_lock(anon_vma); 487 anon_vma_lock_read(anon_vma);
488 488
489 if (atomic_dec_and_test(&anon_vma->refcount)) { 489 if (atomic_dec_and_test(&anon_vma->refcount)) {
490 /* 490 /*
491 * Oops, we held the last refcount, release the lock 491 * Oops, we held the last refcount, release the lock
492 * and bail -- can't simply use put_anon_vma() because 492 * and bail -- can't simply use put_anon_vma() because
493 * we'll deadlock on the anon_vma_lock() recursion. 493 * we'll deadlock on the anon_vma_lock_write() recursion.
494 */ 494 */
495 anon_vma_unlock(anon_vma); 495 anon_vma_unlock_read(anon_vma);
496 __put_anon_vma(anon_vma); 496 __put_anon_vma(anon_vma);
497 anon_vma = NULL; 497 anon_vma = NULL;
498 } 498 }
@@ -504,9 +504,9 @@ out:
504 return anon_vma; 504 return anon_vma;
505} 505}
506 506
507void page_unlock_anon_vma(struct anon_vma *anon_vma) 507void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
508{ 508{
509 anon_vma_unlock(anon_vma); 509 anon_vma_unlock_read(anon_vma);
510} 510}
511 511
512/* 512/*
@@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
562 return address; 562 return address;
563} 563}
564 564
565pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
566{
567 pgd_t *pgd;
568 pud_t *pud;
569 pmd_t *pmd = NULL;
570
571 pgd = pgd_offset(mm, address);
572 if (!pgd_present(*pgd))
573 goto out;
574
575 pud = pud_offset(pgd, address);
576 if (!pud_present(*pud))
577 goto out;
578
579 pmd = pmd_offset(pud, address);
580 if (!pmd_present(*pmd))
581 pmd = NULL;
582out:
583 return pmd;
584}
585
565/* 586/*
566 * Check that @page is mapped at @address into @mm. 587 * Check that @page is mapped at @address into @mm.
567 * 588 *
@@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
574pte_t *__page_check_address(struct page *page, struct mm_struct *mm, 595pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
575 unsigned long address, spinlock_t **ptlp, int sync) 596 unsigned long address, spinlock_t **ptlp, int sync)
576{ 597{
577 pgd_t *pgd;
578 pud_t *pud;
579 pmd_t *pmd; 598 pmd_t *pmd;
580 pte_t *pte; 599 pte_t *pte;
581 spinlock_t *ptl; 600 spinlock_t *ptl;
@@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
586 goto check; 605 goto check;
587 } 606 }
588 607
589 pgd = pgd_offset(mm, address); 608 pmd = mm_find_pmd(mm, address);
590 if (!pgd_present(*pgd)) 609 if (!pmd)
591 return NULL;
592
593 pud = pud_offset(pgd, address);
594 if (!pud_present(*pud))
595 return NULL; 610 return NULL;
596 611
597 pmd = pmd_offset(pud, address);
598 if (!pmd_present(*pmd))
599 return NULL;
600 if (pmd_trans_huge(*pmd)) 612 if (pmd_trans_huge(*pmd))
601 return NULL; 613 return NULL;
602 614
@@ -732,7 +744,7 @@ static int page_referenced_anon(struct page *page,
732 struct anon_vma_chain *avc; 744 struct anon_vma_chain *avc;
733 int referenced = 0; 745 int referenced = 0;
734 746
735 anon_vma = page_lock_anon_vma(page); 747 anon_vma = page_lock_anon_vma_read(page);
736 if (!anon_vma) 748 if (!anon_vma)
737 return referenced; 749 return referenced;
738 750
@@ -754,7 +766,7 @@ static int page_referenced_anon(struct page *page,
754 break; 766 break;
755 } 767 }
756 768
757 page_unlock_anon_vma(anon_vma); 769 page_unlock_anon_vma_read(anon_vma);
758 return referenced; 770 return referenced;
759} 771}
760 772
@@ -1139,9 +1151,11 @@ void page_remove_rmap(struct page *page)
1139 * containing the swap entry, but page not yet written to swap. 1151 * containing the swap entry, but page not yet written to swap.
1140 * 1152 *
1141 * And we can skip it on file pages, so long as the filesystem 1153 * And we can skip it on file pages, so long as the filesystem
1142 * participates in dirty tracking; but need to catch shm and tmpfs 1154 * participates in dirty tracking (note that this is not only an
1143 * and ramfs pages which have been modified since creation by read 1155 * optimization but also solves problems caused by dirty flag in
1144 * fault. 1156 * storage key getting set by a write from inside kernel); but need to
1157 * catch shm and tmpfs and ramfs pages which have been modified since
1158 * creation by read fault.
1145 * 1159 *
1146 * Note that mapping must be decided above, before decrementing 1160 * Note that mapping must be decided above, before decrementing
1147 * mapcount (which luckily provides a barrier): once page is unmapped, 1161 * mapcount (which luckily provides a barrier): once page is unmapped,
@@ -1235,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1235 update_hiwater_rss(mm); 1249 update_hiwater_rss(mm);
1236 1250
1237 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 1251 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
1238 if (PageAnon(page)) 1252 if (!PageHuge(page)) {
1239 dec_mm_counter(mm, MM_ANONPAGES); 1253 if (PageAnon(page))
1240 else 1254 dec_mm_counter(mm, MM_ANONPAGES);
1241 dec_mm_counter(mm, MM_FILEPAGES); 1255 else
1256 dec_mm_counter(mm, MM_FILEPAGES);
1257 }
1242 set_pte_at(mm, address, pte, 1258 set_pte_at(mm, address, pte,
1243 swp_entry_to_pte(make_hwpoison_entry(page))); 1259 swp_entry_to_pte(make_hwpoison_entry(page)));
1244 } else if (PageAnon(page)) { 1260 } else if (PageAnon(page)) {
1245 swp_entry_t entry = { .val = page_private(page) }; 1261 swp_entry_t entry = { .val = page_private(page) };
1246 1262
@@ -1299,7 +1315,7 @@ out_mlock:
1299 /* 1315 /*
1300 * We need mmap_sem locking, Otherwise VM_LOCKED check makes 1316 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
1301 * unstable result and race. Plus, We can't wait here because 1317 * unstable result and race. Plus, We can't wait here because
1302 * we now hold anon_vma->mutex or mapping->i_mmap_mutex. 1318 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
1303 * if trylock failed, the page remain in evictable lru and later 1319 * if trylock failed, the page remain in evictable lru and later
1304 * vmscan could retry to move the page to unevictable lru if the 1320 * vmscan could retry to move the page to unevictable lru if the
1305 * page is actually mlocked. 1321 * page is actually mlocked.
@@ -1345,8 +1361,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1345 struct vm_area_struct *vma, struct page *check_page) 1361 struct vm_area_struct *vma, struct page *check_page)
1346{ 1362{
1347 struct mm_struct *mm = vma->vm_mm; 1363 struct mm_struct *mm = vma->vm_mm;
1348 pgd_t *pgd;
1349 pud_t *pud;
1350 pmd_t *pmd; 1364 pmd_t *pmd;
1351 pte_t *pte; 1365 pte_t *pte;
1352 pte_t pteval; 1366 pte_t pteval;
@@ -1366,16 +1380,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1366 if (end > vma->vm_end) 1380 if (end > vma->vm_end)
1367 end = vma->vm_end; 1381 end = vma->vm_end;
1368 1382
1369 pgd = pgd_offset(mm, address); 1383 pmd = mm_find_pmd(mm, address);
1370 if (!pgd_present(*pgd)) 1384 if (!pmd)
1371 return ret;
1372
1373 pud = pud_offset(pgd, address);
1374 if (!pud_present(*pud))
1375 return ret;
1376
1377 pmd = pmd_offset(pud, address);
1378 if (!pmd_present(*pmd))
1379 return ret; 1385 return ret;
1380 1386
1381 mmun_start = address; 1387 mmun_start = address;
@@ -1474,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1474 struct anon_vma_chain *avc; 1480 struct anon_vma_chain *avc;
1475 int ret = SWAP_AGAIN; 1481 int ret = SWAP_AGAIN;
1476 1482
1477 anon_vma = page_lock_anon_vma(page); 1483 anon_vma = page_lock_anon_vma_read(page);
1478 if (!anon_vma) 1484 if (!anon_vma)
1479 return ret; 1485 return ret;
1480 1486
@@ -1501,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1501 break; 1507 break;
1502 } 1508 }
1503 1509
1504 page_unlock_anon_vma(anon_vma); 1510 page_unlock_anon_vma_read(anon_vma);
1505 return ret; 1511 return ret;
1506} 1512}
1507 1513
@@ -1696,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1696 int ret = SWAP_AGAIN; 1702 int ret = SWAP_AGAIN;
1697 1703
1698 /* 1704 /*
1699 * Note: remove_migration_ptes() cannot use page_lock_anon_vma() 1705 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
1700 * because that depends on page_mapped(); but not all its usages 1706 * because that depends on page_mapped(); but not all its usages
1701 * are holding mmap_sem. Users without mmap_sem are required to 1707 * are holding mmap_sem. Users without mmap_sem are required to
1702 * take a reference count to prevent the anon_vma disappearing 1708 * take a reference count to prevent the anon_vma disappearing
@@ -1704,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1704 anon_vma = page_anon_vma(page); 1710 anon_vma = page_anon_vma(page);
1705 if (!anon_vma) 1711 if (!anon_vma)
1706 return ret; 1712 return ret;
1707 anon_vma_lock(anon_vma); 1713 anon_vma_lock_read(anon_vma);
1708 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { 1714 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1709 struct vm_area_struct *vma = avc->vma; 1715 struct vm_area_struct *vma = avc->vma;
1710 unsigned long address = vma_address(page, vma); 1716 unsigned long address = vma_address(page, vma);
@@ -1712,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1712 if (ret != SWAP_AGAIN) 1718 if (ret != SWAP_AGAIN)
1713 break; 1719 break;
1714 } 1720 }
1715 anon_vma_unlock(anon_vma); 1721 anon_vma_unlock_read(anon_vma);
1716 return ret; 1722 return ret;
1717} 1723}
1718 1724
diff --git a/mm/shmem.c b/mm/shmem.c
index 89341b658bd0..5dd56f6efdbd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -889,7 +889,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
889 if (!mpol || mpol->mode == MPOL_DEFAULT) 889 if (!mpol || mpol->mode == MPOL_DEFAULT)
890 return; /* show nothing */ 890 return; /* show nothing */
891 891
892 mpol_to_str(buffer, sizeof(buffer), mpol, 1); 892 mpol_to_str(buffer, sizeof(buffer), mpol);
893 893
894 seq_printf(seq, ",mpol=%s", buffer); 894 seq_printf(seq, ",mpol=%s", buffer);
895} 895}
@@ -910,25 +910,29 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
910static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, 910static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
911 struct shmem_inode_info *info, pgoff_t index) 911 struct shmem_inode_info *info, pgoff_t index)
912{ 912{
913 struct mempolicy mpol, *spol;
914 struct vm_area_struct pvma; 913 struct vm_area_struct pvma;
915 914 struct page *page;
916 spol = mpol_cond_copy(&mpol,
917 mpol_shared_policy_lookup(&info->policy, index));
918 915
919 /* Create a pseudo vma that just contains the policy */ 916 /* Create a pseudo vma that just contains the policy */
920 pvma.vm_start = 0; 917 pvma.vm_start = 0;
921 /* Bias interleave by inode number to distribute better across nodes */ 918 /* Bias interleave by inode number to distribute better across nodes */
922 pvma.vm_pgoff = index + info->vfs_inode.i_ino; 919 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
923 pvma.vm_ops = NULL; 920 pvma.vm_ops = NULL;
924 pvma.vm_policy = spol; 921 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
925 return swapin_readahead(swap, gfp, &pvma, 0); 922
923 page = swapin_readahead(swap, gfp, &pvma, 0);
924
925 /* Drop reference taken by mpol_shared_policy_lookup() */
926 mpol_cond_put(pvma.vm_policy);
927
928 return page;
926} 929}
927 930
928static struct page *shmem_alloc_page(gfp_t gfp, 931static struct page *shmem_alloc_page(gfp_t gfp,
929 struct shmem_inode_info *info, pgoff_t index) 932 struct shmem_inode_info *info, pgoff_t index)
930{ 933{
931 struct vm_area_struct pvma; 934 struct vm_area_struct pvma;
935 struct page *page;
932 936
933 /* Create a pseudo vma that just contains the policy */ 937 /* Create a pseudo vma that just contains the policy */
934 pvma.vm_start = 0; 938 pvma.vm_start = 0;
@@ -937,10 +941,12 @@ static struct page *shmem_alloc_page(gfp_t gfp,
937 pvma.vm_ops = NULL; 941 pvma.vm_ops = NULL;
938 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 942 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
939 943
940 /* 944 page = alloc_page_vma(gfp, &pvma, 0);
941 * alloc_page_vma() will drop the shared policy reference 945
942 */ 946 /* Drop reference taken by mpol_shared_policy_lookup() */
943 return alloc_page_vma(gfp, &pvma, 0); 947 mpol_cond_put(pvma.vm_policy);
948
949 return page;
944} 950}
945#else /* !CONFIG_NUMA */ 951#else /* !CONFIG_NUMA */
946#ifdef CONFIG_TMPFS 952#ifdef CONFIG_TMPFS
@@ -1709,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1709 return error; 1715 return error;
1710} 1716}
1711 1717
1718/*
1719 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
1720 */
1721static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1722 pgoff_t index, pgoff_t end, int whence)
1723{
1724 struct page *page;
1725 struct pagevec pvec;
1726 pgoff_t indices[PAGEVEC_SIZE];
1727 bool done = false;
1728 int i;
1729
1730 pagevec_init(&pvec, 0);
1731 pvec.nr = 1; /* start small: we may be there already */
1732 while (!done) {
1733 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
1734 pvec.nr, pvec.pages, indices);
1735 if (!pvec.nr) {
1736 if (whence == SEEK_DATA)
1737 index = end;
1738 break;
1739 }
1740 for (i = 0; i < pvec.nr; i++, index++) {
1741 if (index < indices[i]) {
1742 if (whence == SEEK_HOLE) {
1743 done = true;
1744 break;
1745 }
1746 index = indices[i];
1747 }
1748 page = pvec.pages[i];
1749 if (page && !radix_tree_exceptional_entry(page)) {
1750 if (!PageUptodate(page))
1751 page = NULL;
1752 }
1753 if (index >= end ||
1754 (page && whence == SEEK_DATA) ||
1755 (!page && whence == SEEK_HOLE)) {
1756 done = true;
1757 break;
1758 }
1759 }
1760 shmem_deswap_pagevec(&pvec);
1761 pagevec_release(&pvec);
1762 pvec.nr = PAGEVEC_SIZE;
1763 cond_resched();
1764 }
1765 return index;
1766}
1767
1768static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
1769{
1770 struct address_space *mapping = file->f_mapping;
1771 struct inode *inode = mapping->host;
1772 pgoff_t start, end;
1773 loff_t new_offset;
1774
1775 if (whence != SEEK_DATA && whence != SEEK_HOLE)
1776 return generic_file_llseek_size(file, offset, whence,
1777 MAX_LFS_FILESIZE, i_size_read(inode));
1778 mutex_lock(&inode->i_mutex);
1779 /* We're holding i_mutex so we can access i_size directly */
1780
1781 if (offset < 0)
1782 offset = -EINVAL;
1783 else if (offset >= inode->i_size)
1784 offset = -ENXIO;
1785 else {
1786 start = offset >> PAGE_CACHE_SHIFT;
1787 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1788 new_offset = shmem_seek_hole_data(mapping, start, end, whence);
1789 new_offset <<= PAGE_CACHE_SHIFT;
1790 if (new_offset > offset) {
1791 if (new_offset < inode->i_size)
1792 offset = new_offset;
1793 else if (whence == SEEK_DATA)
1794 offset = -ENXIO;
1795 else
1796 offset = inode->i_size;
1797 }
1798 }
1799
1800 if (offset >= 0 && offset != file->f_pos) {
1801 file->f_pos = offset;
1802 file->f_version = 0;
1803 }
1804 mutex_unlock(&inode->i_mutex);
1805 return offset;
1806}
1807
1712static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1808static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1713 loff_t len) 1809 loff_t len)
1714{ 1810{
@@ -2367,7 +2463,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2367 if (!gid_valid(sbinfo->gid)) 2463 if (!gid_valid(sbinfo->gid))
2368 goto bad_val; 2464 goto bad_val;
2369 } else if (!strcmp(this_char,"mpol")) { 2465 } else if (!strcmp(this_char,"mpol")) {
2370 if (mpol_parse_str(value, &sbinfo->mpol, 1)) 2466 if (mpol_parse_str(value, &sbinfo->mpol))
2371 goto bad_val; 2467 goto bad_val;
2372 } else { 2468 } else {
2373 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2469 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
@@ -2580,7 +2676,7 @@ static const struct address_space_operations shmem_aops = {
2580static const struct file_operations shmem_file_operations = { 2676static const struct file_operations shmem_file_operations = {
2581 .mmap = shmem_mmap, 2677 .mmap = shmem_mmap,
2582#ifdef CONFIG_TMPFS 2678#ifdef CONFIG_TMPFS
2583 .llseek = generic_file_llseek, 2679 .llseek = shmem_file_llseek,
2584 .read = do_sync_read, 2680 .read = do_sync_read,
2585 .write = do_sync_write, 2681 .write = do_sync_write,
2586 .aio_read = shmem_file_aio_read, 2682 .aio_read = shmem_file_aio_read,
diff --git a/mm/slab.c b/mm/slab.c
index 33d3363658df..e7667a3584bc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -87,7 +87,6 @@
87 */ 87 */
88 88
89#include <linux/slab.h> 89#include <linux/slab.h>
90#include "slab.h"
91#include <linux/mm.h> 90#include <linux/mm.h>
92#include <linux/poison.h> 91#include <linux/poison.h>
93#include <linux/swap.h> 92#include <linux/swap.h>
@@ -128,6 +127,8 @@
128 127
129#include "internal.h" 128#include "internal.h"
130 129
130#include "slab.h"
131
131/* 132/*
132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 133 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
133 * 0 for faster, smaller code (especially in the critical paths). 134 * 0 for faster, smaller code (especially in the critical paths).
@@ -162,23 +163,6 @@
162 */ 163 */
163static bool pfmemalloc_active __read_mostly; 164static bool pfmemalloc_active __read_mostly;
164 165
165/* Legal flag mask for kmem_cache_create(). */
166#if DEBUG
167# define CREATE_MASK (SLAB_RED_ZONE | \
168 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
169 SLAB_CACHE_DMA | \
170 SLAB_STORE_USER | \
171 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
172 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
173 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
174#else
175# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
176 SLAB_CACHE_DMA | \
177 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
178 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
179 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
180#endif
181
182/* 166/*
183 * kmem_bufctl_t: 167 * kmem_bufctl_t:
184 * 168 *
@@ -564,15 +548,11 @@ static struct cache_names __initdata cache_names[] = {
564#undef CACHE 548#undef CACHE
565}; 549};
566 550
567static struct arraycache_init initarray_cache __initdata =
568 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
569static struct arraycache_init initarray_generic = 551static struct arraycache_init initarray_generic =
570 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 552 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
571 553
572/* internal cache of cache description objs */ 554/* internal cache of cache description objs */
573static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
574static struct kmem_cache kmem_cache_boot = { 555static struct kmem_cache kmem_cache_boot = {
575 .nodelists = kmem_cache_nodelists,
576 .batchcount = 1, 556 .batchcount = 1,
577 .limit = BOOT_CPUCACHE_ENTRIES, 557 .limit = BOOT_CPUCACHE_ENTRIES,
578 .shared = 1, 558 .shared = 1,
@@ -662,6 +642,26 @@ static void init_node_lock_keys(int q)
662 } 642 }
663} 643}
664 644
645static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
646{
647 struct kmem_list3 *l3;
648 l3 = cachep->nodelists[q];
649 if (!l3)
650 return;
651
652 slab_set_lock_classes(cachep, &on_slab_l3_key,
653 &on_slab_alc_key, q);
654}
655
656static inline void on_slab_lock_classes(struct kmem_cache *cachep)
657{
658 int node;
659
660 VM_BUG_ON(OFF_SLAB(cachep));
661 for_each_node(node)
662 on_slab_lock_classes_node(cachep, node);
663}
664
665static inline void init_lock_keys(void) 665static inline void init_lock_keys(void)
666{ 666{
667 int node; 667 int node;
@@ -678,6 +678,14 @@ static inline void init_lock_keys(void)
678{ 678{
679} 679}
680 680
681static inline void on_slab_lock_classes(struct kmem_cache *cachep)
682{
683}
684
685static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
686{
687}
688
681static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) 689static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
682{ 690{
683} 691}
@@ -1406,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu)
1406 free_alien_cache(alien); 1414 free_alien_cache(alien);
1407 if (cachep->flags & SLAB_DEBUG_OBJECTS) 1415 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1408 slab_set_debugobj_lock_classes_node(cachep, node); 1416 slab_set_debugobj_lock_classes_node(cachep, node);
1417 else if (!OFF_SLAB(cachep) &&
1418 !(cachep->flags & SLAB_DESTROY_BY_RCU))
1419 on_slab_lock_classes_node(cachep, node);
1409 } 1420 }
1410 init_node_lock_keys(node); 1421 init_node_lock_keys(node);
1411 1422
@@ -1577,28 +1588,33 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1577} 1588}
1578 1589
1579/* 1590/*
1591 * The memory after the last cpu cache pointer is used for the
1592 * the nodelists pointer.
1593 */
1594static void setup_nodelists_pointer(struct kmem_cache *cachep)
1595{
1596 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
1597}
1598
1599/*
1580 * Initialisation. Called after the page allocator have been initialised and 1600 * Initialisation. Called after the page allocator have been initialised and
1581 * before smp_init(). 1601 * before smp_init().
1582 */ 1602 */
1583void __init kmem_cache_init(void) 1603void __init kmem_cache_init(void)
1584{ 1604{
1585 size_t left_over;
1586 struct cache_sizes *sizes; 1605 struct cache_sizes *sizes;
1587 struct cache_names *names; 1606 struct cache_names *names;
1588 int i; 1607 int i;
1589 int order;
1590 int node;
1591 1608
1592 kmem_cache = &kmem_cache_boot; 1609 kmem_cache = &kmem_cache_boot;
1610 setup_nodelists_pointer(kmem_cache);
1593 1611
1594 if (num_possible_nodes() == 1) 1612 if (num_possible_nodes() == 1)
1595 use_alien_caches = 0; 1613 use_alien_caches = 0;
1596 1614
1597 for (i = 0; i < NUM_INIT_LISTS; i++) { 1615 for (i = 0; i < NUM_INIT_LISTS; i++)
1598 kmem_list3_init(&initkmem_list3[i]); 1616 kmem_list3_init(&initkmem_list3[i]);
1599 if (i < MAX_NUMNODES) 1617
1600 kmem_cache->nodelists[i] = NULL;
1601 }
1602 set_up_list3s(kmem_cache, CACHE_CACHE); 1618 set_up_list3s(kmem_cache, CACHE_CACHE);
1603 1619
1604 /* 1620 /*
@@ -1629,37 +1645,16 @@ void __init kmem_cache_init(void)
1629 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1645 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1630 */ 1646 */
1631 1647
1632 node = numa_mem_id();
1633
1634 /* 1) create the kmem_cache */ 1648 /* 1) create the kmem_cache */
1635 INIT_LIST_HEAD(&slab_caches);
1636 list_add(&kmem_cache->list, &slab_caches);
1637 kmem_cache->colour_off = cache_line_size();
1638 kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
1639 kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1640 1649
1641 /* 1650 /*
1642 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1651 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1643 */ 1652 */
1644 kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1653 create_boot_cache(kmem_cache, "kmem_cache",
1645 nr_node_ids * sizeof(struct kmem_list3 *); 1654 offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1646 kmem_cache->object_size = kmem_cache->size; 1655 nr_node_ids * sizeof(struct kmem_list3 *),
1647 kmem_cache->size = ALIGN(kmem_cache->object_size, 1656 SLAB_HWCACHE_ALIGN);
1648 cache_line_size()); 1657 list_add(&kmem_cache->list, &slab_caches);
1649 kmem_cache->reciprocal_buffer_size =
1650 reciprocal_value(kmem_cache->size);
1651
1652 for (order = 0; order < MAX_ORDER; order++) {
1653 cache_estimate(order, kmem_cache->size,
1654 cache_line_size(), 0, &left_over, &kmem_cache->num);
1655 if (kmem_cache->num)
1656 break;
1657 }
1658 BUG_ON(!kmem_cache->num);
1659 kmem_cache->gfporder = order;
1660 kmem_cache->colour = left_over / kmem_cache->colour_off;
1661 kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
1662 sizeof(struct slab), cache_line_size());
1663 1658
1664 /* 2+3) create the kmalloc caches */ 1659 /* 2+3) create the kmalloc caches */
1665 sizes = malloc_sizes; 1660 sizes = malloc_sizes;
@@ -1671,23 +1666,13 @@ void __init kmem_cache_init(void)
1671 * bug. 1666 * bug.
1672 */ 1667 */
1673 1668
1674 sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 1669 sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
1675 sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; 1670 sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
1676 sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; 1671
1677 sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; 1672 if (INDEX_AC != INDEX_L3)
1678 sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; 1673 sizes[INDEX_L3].cs_cachep =
1679 __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); 1674 create_kmalloc_cache(names[INDEX_L3].name,
1680 list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); 1675 sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
1681
1682 if (INDEX_AC != INDEX_L3) {
1683 sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
1684 sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
1685 sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
1686 sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
1687 sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1688 __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1689 list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
1690 }
1691 1676
1692 slab_early_init = 0; 1677 slab_early_init = 0;
1693 1678
@@ -1699,24 +1684,14 @@ void __init kmem_cache_init(void)
1699 * Note for systems short on memory removing the alignment will 1684 * Note for systems short on memory removing the alignment will
1700 * allow tighter packing of the smaller caches. 1685 * allow tighter packing of the smaller caches.
1701 */ 1686 */
1702 if (!sizes->cs_cachep) { 1687 if (!sizes->cs_cachep)
1703 sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 1688 sizes->cs_cachep = create_kmalloc_cache(names->name,
1704 sizes->cs_cachep->name = names->name; 1689 sizes->cs_size, ARCH_KMALLOC_FLAGS);
1705 sizes->cs_cachep->size = sizes->cs_size; 1690
1706 sizes->cs_cachep->object_size = sizes->cs_size;
1707 sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
1708 __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
1709 list_add(&sizes->cs_cachep->list, &slab_caches);
1710 }
1711#ifdef CONFIG_ZONE_DMA 1691#ifdef CONFIG_ZONE_DMA
1712 sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 1692 sizes->cs_dmacachep = create_kmalloc_cache(
1713 sizes->cs_dmacachep->name = names->name_dma; 1693 names->name_dma, sizes->cs_size,
1714 sizes->cs_dmacachep->size = sizes->cs_size; 1694 SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
1715 sizes->cs_dmacachep->object_size = sizes->cs_size;
1716 sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
1717 __kmem_cache_create(sizes->cs_dmacachep,
1718 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC);
1719 list_add(&sizes->cs_dmacachep->list, &slab_caches);
1720#endif 1695#endif
1721 sizes++; 1696 sizes++;
1722 names++; 1697 names++;
@@ -1727,7 +1702,6 @@ void __init kmem_cache_init(void)
1727 1702
1728 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1703 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1729 1704
1730 BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
1731 memcpy(ptr, cpu_cache_get(kmem_cache), 1705 memcpy(ptr, cpu_cache_get(kmem_cache),
1732 sizeof(struct arraycache_init)); 1706 sizeof(struct arraycache_init));
1733 /* 1707 /*
@@ -1921,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1921 if (page->pfmemalloc) 1895 if (page->pfmemalloc)
1922 SetPageSlabPfmemalloc(page + i); 1896 SetPageSlabPfmemalloc(page + i);
1923 } 1897 }
1898 memcg_bind_pages(cachep, cachep->gfporder);
1924 1899
1925 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1900 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1926 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1901 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1957,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1957 __ClearPageSlab(page); 1932 __ClearPageSlab(page);
1958 page++; 1933 page++;
1959 } 1934 }
1935
1936 memcg_release_pages(cachep, cachep->gfporder);
1960 if (current->reclaim_state) 1937 if (current->reclaim_state)
1961 current->reclaim_state->reclaimed_slab += nr_freed; 1938 current->reclaim_state->reclaimed_slab += nr_freed;
1962 free_pages((unsigned long)addr, cachep->gfporder); 1939 free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
1963} 1940}
1964 1941
1965static void kmem_rcu_free(struct rcu_head *head) 1942static void kmem_rcu_free(struct rcu_head *head)
@@ -2282,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2282 2259
2283 if (slab_state == DOWN) { 2260 if (slab_state == DOWN) {
2284 /* 2261 /*
2285 * Note: the first kmem_cache_create must create the cache 2262 * Note: Creation of first cache (kmem_cache).
2263 * The setup_list3s is taken care
2264 * of by the caller of __kmem_cache_create
2265 */
2266 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2267 slab_state = PARTIAL;
2268 } else if (slab_state == PARTIAL) {
2269 /*
2270 * Note: the second kmem_cache_create must create the cache
2286 * that's used by kmalloc(24), otherwise the creation of 2271 * that's used by kmalloc(24), otherwise the creation of
2287 * further caches will BUG(). 2272 * further caches will BUG().
2288 */ 2273 */
@@ -2290,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2290 2275
2291 /* 2276 /*
2292 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is 2277 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2293 * the first cache, then we need to set up all its list3s, 2278 * the second cache, then we need to set up all its list3s,
2294 * otherwise the creation of further caches will BUG(). 2279 * otherwise the creation of further caches will BUG().
2295 */ 2280 */
2296 set_up_list3s(cachep, SIZE_AC); 2281 set_up_list3s(cachep, SIZE_AC);
@@ -2299,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2299 else 2284 else
2300 slab_state = PARTIAL_ARRAYCACHE; 2285 slab_state = PARTIAL_ARRAYCACHE;
2301 } else { 2286 } else {
2287 /* Remaining boot caches */
2302 cachep->array[smp_processor_id()] = 2288 cachep->array[smp_processor_id()] =
2303 kmalloc(sizeof(struct arraycache_init), gfp); 2289 kmalloc(sizeof(struct arraycache_init), gfp);
2304 2290
@@ -2331,11 +2317,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2331 2317
2332/** 2318/**
2333 * __kmem_cache_create - Create a cache. 2319 * __kmem_cache_create - Create a cache.
2334 * @name: A string which is used in /proc/slabinfo to identify this cache. 2320 * @cachep: cache management descriptor
2335 * @size: The size of objects to be created in this cache.
2336 * @align: The required alignment for the objects.
2337 * @flags: SLAB flags 2321 * @flags: SLAB flags
2338 * @ctor: A constructor for the objects.
2339 * 2322 *
2340 * Returns a ptr to the cache on success, NULL on failure. 2323 * Returns a ptr to the cache on success, NULL on failure.
2341 * Cannot be called within a int, but can be interrupted. 2324 * Cannot be called within a int, but can be interrupted.
@@ -2378,11 +2361,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2378 if (flags & SLAB_DESTROY_BY_RCU) 2361 if (flags & SLAB_DESTROY_BY_RCU)
2379 BUG_ON(flags & SLAB_POISON); 2362 BUG_ON(flags & SLAB_POISON);
2380#endif 2363#endif
2381 /*
2382 * Always checks flags, a caller might be expecting debug support which
2383 * isn't available.
2384 */
2385 BUG_ON(flags & ~CREATE_MASK);
2386 2364
2387 /* 2365 /*
2388 * Check that size is in terms of words. This is needed to avoid 2366 * Check that size is in terms of words. This is needed to avoid
@@ -2394,22 +2372,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2394 size &= ~(BYTES_PER_WORD - 1); 2372 size &= ~(BYTES_PER_WORD - 1);
2395 } 2373 }
2396 2374
2397 /* calculate the final buffer alignment: */
2398
2399 /* 1) arch recommendation: can be overridden for debug */
2400 if (flags & SLAB_HWCACHE_ALIGN) {
2401 /*
2402 * Default alignment: as specified by the arch code. Except if
2403 * an object is really small, then squeeze multiple objects into
2404 * one cacheline.
2405 */
2406 ralign = cache_line_size();
2407 while (size <= ralign / 2)
2408 ralign /= 2;
2409 } else {
2410 ralign = BYTES_PER_WORD;
2411 }
2412
2413 /* 2375 /*
2414 * Redzoning and user store require word alignment or possibly larger. 2376 * Redzoning and user store require word alignment or possibly larger.
2415 * Note this will be overridden by architecture or caller mandated 2377 * Note this will be overridden by architecture or caller mandated
@@ -2426,10 +2388,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2426 size &= ~(REDZONE_ALIGN - 1); 2388 size &= ~(REDZONE_ALIGN - 1);
2427 } 2389 }
2428 2390
2429 /* 2) arch mandated alignment */
2430 if (ralign < ARCH_SLAB_MINALIGN) {
2431 ralign = ARCH_SLAB_MINALIGN;
2432 }
2433 /* 3) caller mandated alignment */ 2391 /* 3) caller mandated alignment */
2434 if (ralign < cachep->align) { 2392 if (ralign < cachep->align) {
2435 ralign = cachep->align; 2393 ralign = cachep->align;
@@ -2447,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2447 else 2405 else
2448 gfp = GFP_NOWAIT; 2406 gfp = GFP_NOWAIT;
2449 2407
2450 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; 2408 setup_nodelists_pointer(cachep);
2451#if DEBUG 2409#if DEBUG
2452 2410
2453 /* 2411 /*
@@ -2566,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2566 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); 2524 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2567 2525
2568 slab_set_debugobj_lock_classes(cachep); 2526 slab_set_debugobj_lock_classes(cachep);
2569 } 2527 } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
2528 on_slab_lock_classes(cachep);
2570 2529
2571 return 0; 2530 return 0;
2572} 2531}
@@ -3530,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3530 if (slab_should_failslab(cachep, flags)) 3489 if (slab_should_failslab(cachep, flags))
3531 return NULL; 3490 return NULL;
3532 3491
3492 cachep = memcg_kmem_get_cache(cachep, flags);
3493
3533 cache_alloc_debugcheck_before(cachep, flags); 3494 cache_alloc_debugcheck_before(cachep, flags);
3534 local_irq_save(save_flags); 3495 local_irq_save(save_flags);
3535 3496
@@ -3615,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3615 if (slab_should_failslab(cachep, flags)) 3576 if (slab_should_failslab(cachep, flags))
3616 return NULL; 3577 return NULL;
3617 3578
3579 cachep = memcg_kmem_get_cache(cachep, flags);
3580
3618 cache_alloc_debugcheck_before(cachep, flags); 3581 cache_alloc_debugcheck_before(cachep, flags);
3619 local_irq_save(save_flags); 3582 local_irq_save(save_flags);
3620 objp = __do_cache_alloc(cachep, flags); 3583 objp = __do_cache_alloc(cachep, flags);
@@ -3928,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc);
3928void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3891void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3929{ 3892{
3930 unsigned long flags; 3893 unsigned long flags;
3894 cachep = cache_from_obj(cachep, objp);
3895 if (!cachep)
3896 return;
3931 3897
3932 local_irq_save(flags); 3898 local_irq_save(flags);
3933 debug_check_no_locks_freed(objp, cachep->object_size); 3899 debug_check_no_locks_freed(objp, cachep->object_size);
@@ -3969,12 +3935,6 @@ void kfree(const void *objp)
3969} 3935}
3970EXPORT_SYMBOL(kfree); 3936EXPORT_SYMBOL(kfree);
3971 3937
3972unsigned int kmem_cache_size(struct kmem_cache *cachep)
3973{
3974 return cachep->object_size;
3975}
3976EXPORT_SYMBOL(kmem_cache_size);
3977
3978/* 3938/*
3979 * This initializes kmem_list3 or resizes various caches for all nodes. 3939 * This initializes kmem_list3 or resizes various caches for all nodes.
3980 */ 3940 */
@@ -4081,7 +4041,7 @@ static void do_ccupdate_local(void *info)
4081} 4041}
4082 4042
4083/* Always called with the slab_mutex held */ 4043/* Always called with the slab_mutex held */
4084static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4044static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
4085 int batchcount, int shared, gfp_t gfp) 4045 int batchcount, int shared, gfp_t gfp)
4086{ 4046{
4087 struct ccupdate_struct *new; 4047 struct ccupdate_struct *new;
@@ -4124,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4124 return alloc_kmemlist(cachep, gfp); 4084 return alloc_kmemlist(cachep, gfp);
4125} 4085}
4126 4086
4087static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4088 int batchcount, int shared, gfp_t gfp)
4089{
4090 int ret;
4091 struct kmem_cache *c = NULL;
4092 int i = 0;
4093
4094 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
4095
4096 if (slab_state < FULL)
4097 return ret;
4098
4099 if ((ret < 0) || !is_root_cache(cachep))
4100 return ret;
4101
4102 VM_BUG_ON(!mutex_is_locked(&slab_mutex));
4103 for_each_memcg_cache_index(i) {
4104 c = cache_from_memcg(cachep, i);
4105 if (c)
4106 /* return value determined by the parent cache only */
4107 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
4108 }
4109
4110 return ret;
4111}
4112
4127/* Called with slab_mutex held always */ 4113/* Called with slab_mutex held always */
4128static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 4114static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4129{ 4115{
4130 int err; 4116 int err;
4131 int limit, shared; 4117 int limit = 0;
4118 int shared = 0;
4119 int batchcount = 0;
4120
4121 if (!is_root_cache(cachep)) {
4122 struct kmem_cache *root = memcg_root_cache(cachep);
4123 limit = root->limit;
4124 shared = root->shared;
4125 batchcount = root->batchcount;
4126 }
4132 4127
4128 if (limit && shared && batchcount)
4129 goto skip_setup;
4133 /* 4130 /*
4134 * The head array serves three purposes: 4131 * The head array serves three purposes:
4135 * - create a LIFO ordering, i.e. return objects that are cache-warm 4132 * - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -4171,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4171 if (limit > 32) 4168 if (limit > 32)
4172 limit = 32; 4169 limit = 32;
4173#endif 4170#endif
4174 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); 4171 batchcount = (limit + 1) / 2;
4172skip_setup:
4173 err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
4175 if (err) 4174 if (err)
4176 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 4175 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4177 cachep->name, -err); 4176 cachep->name, -err);
@@ -4276,54 +4275,8 @@ out:
4276} 4275}
4277 4276
4278#ifdef CONFIG_SLABINFO 4277#ifdef CONFIG_SLABINFO
4279 4278void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4280static void print_slabinfo_header(struct seq_file *m)
4281{
4282 /*
4283 * Output format version, so at least we can change it
4284 * without _too_ many complaints.
4285 */
4286#if STATS
4287 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4288#else
4289 seq_puts(m, "slabinfo - version: 2.1\n");
4290#endif
4291 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
4292 "<objperslab> <pagesperslab>");
4293 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4294 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4295#if STATS
4296 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4297 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4298 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4299#endif
4300 seq_putc(m, '\n');
4301}
4302
4303static void *s_start(struct seq_file *m, loff_t *pos)
4304{
4305 loff_t n = *pos;
4306
4307 mutex_lock(&slab_mutex);
4308 if (!n)
4309 print_slabinfo_header(m);
4310
4311 return seq_list_start(&slab_caches, *pos);
4312}
4313
4314static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4315{ 4279{
4316 return seq_list_next(p, &slab_caches, pos);
4317}
4318
4319static void s_stop(struct seq_file *m, void *p)
4320{
4321 mutex_unlock(&slab_mutex);
4322}
4323
4324static int s_show(struct seq_file *m, void *p)
4325{
4326 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4327 struct slab *slabp; 4280 struct slab *slabp;
4328 unsigned long active_objs; 4281 unsigned long active_objs;
4329 unsigned long num_objs; 4282 unsigned long num_objs;
@@ -4378,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p)
4378 if (error) 4331 if (error)
4379 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4332 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4380 4333
4381 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4334 sinfo->active_objs = active_objs;
4382 name, active_objs, num_objs, cachep->size, 4335 sinfo->num_objs = num_objs;
4383 cachep->num, (1 << cachep->gfporder)); 4336 sinfo->active_slabs = active_slabs;
4384 seq_printf(m, " : tunables %4u %4u %4u", 4337 sinfo->num_slabs = num_slabs;
4385 cachep->limit, cachep->batchcount, cachep->shared); 4338 sinfo->shared_avail = shared_avail;
4386 seq_printf(m, " : slabdata %6lu %6lu %6lu", 4339 sinfo->limit = cachep->limit;
4387 active_slabs, num_slabs, shared_avail); 4340 sinfo->batchcount = cachep->batchcount;
4341 sinfo->shared = cachep->shared;
4342 sinfo->objects_per_slab = cachep->num;
4343 sinfo->cache_order = cachep->gfporder;
4344}
4345
4346void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
4347{
4388#if STATS 4348#if STATS
4389 { /* list3 stats */ 4349 { /* list3 stats */
4390 unsigned long high = cachep->high_mark; 4350 unsigned long high = cachep->high_mark;
@@ -4414,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p)
4414 allochit, allocmiss, freehit, freemiss); 4374 allochit, allocmiss, freehit, freemiss);
4415 } 4375 }
4416#endif 4376#endif
4417 seq_putc(m, '\n');
4418 return 0;
4419} 4377}
4420 4378
4421/*
4422 * slabinfo_op - iterator that generates /proc/slabinfo
4423 *
4424 * Output layout:
4425 * cache-name
4426 * num-active-objs
4427 * total-objs
4428 * object size
4429 * num-active-slabs
4430 * total-slabs
4431 * num-pages-per-slab
4432 * + further values on SMP and with statistics enabled
4433 */
4434
4435static const struct seq_operations slabinfo_op = {
4436 .start = s_start,
4437 .next = s_next,
4438 .stop = s_stop,
4439 .show = s_show,
4440};
4441
4442#define MAX_SLABINFO_WRITE 128 4379#define MAX_SLABINFO_WRITE 128
4443/** 4380/**
4444 * slabinfo_write - Tuning for the slab allocator 4381 * slabinfo_write - Tuning for the slab allocator
@@ -4447,7 +4384,7 @@ static const struct seq_operations slabinfo_op = {
4447 * @count: data length 4384 * @count: data length
4448 * @ppos: unused 4385 * @ppos: unused
4449 */ 4386 */
4450static ssize_t slabinfo_write(struct file *file, const char __user *buffer, 4387ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4451 size_t count, loff_t *ppos) 4388 size_t count, loff_t *ppos)
4452{ 4389{
4453 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4390 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
@@ -4490,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4490 return res; 4427 return res;
4491} 4428}
4492 4429
4493static int slabinfo_open(struct inode *inode, struct file *file)
4494{
4495 return seq_open(file, &slabinfo_op);
4496}
4497
4498static const struct file_operations proc_slabinfo_operations = {
4499 .open = slabinfo_open,
4500 .read = seq_read,
4501 .write = slabinfo_write,
4502 .llseek = seq_lseek,
4503 .release = seq_release,
4504};
4505
4506#ifdef CONFIG_DEBUG_SLAB_LEAK 4430#ifdef CONFIG_DEBUG_SLAB_LEAK
4507 4431
4508static void *leaks_start(struct seq_file *m, loff_t *pos) 4432static void *leaks_start(struct seq_file *m, loff_t *pos)
@@ -4631,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p)
4631 return 0; 4555 return 0;
4632} 4556}
4633 4557
4558static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4559{
4560 return seq_list_next(p, &slab_caches, pos);
4561}
4562
4563static void s_stop(struct seq_file *m, void *p)
4564{
4565 mutex_unlock(&slab_mutex);
4566}
4567
4634static const struct seq_operations slabstats_op = { 4568static const struct seq_operations slabstats_op = {
4635 .start = leaks_start, 4569 .start = leaks_start,
4636 .next = s_next, 4570 .next = s_next,
@@ -4665,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = {
4665 4599
4666static int __init slab_proc_init(void) 4600static int __init slab_proc_init(void)
4667{ 4601{
4668 proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
4669#ifdef CONFIG_DEBUG_SLAB_LEAK 4602#ifdef CONFIG_DEBUG_SLAB_LEAK
4670 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); 4603 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4671#endif 4604#endif
diff --git a/mm/slab.h b/mm/slab.h
index 7deeb449a301..34a98d642196 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -32,19 +32,201 @@ extern struct list_head slab_caches;
32/* The slab cache that manages slab cache information */ 32/* The slab cache that manages slab cache information */
33extern struct kmem_cache *kmem_cache; 33extern struct kmem_cache *kmem_cache;
34 34
35unsigned long calculate_alignment(unsigned long flags,
36 unsigned long align, unsigned long size);
37
35/* Functions provided by the slab allocators */ 38/* Functions provided by the slab allocators */
36extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); 39extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
37 40
41extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
42 unsigned long flags);
43extern void create_boot_cache(struct kmem_cache *, const char *name,
44 size_t size, unsigned long flags);
45
46struct mem_cgroup;
38#ifdef CONFIG_SLUB 47#ifdef CONFIG_SLUB
39struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 48struct kmem_cache *
40 size_t align, unsigned long flags, void (*ctor)(void *)); 49__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
50 size_t align, unsigned long flags, void (*ctor)(void *));
41#else 51#else
42static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 52static inline struct kmem_cache *
43 size_t align, unsigned long flags, void (*ctor)(void *)) 53__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
54 size_t align, unsigned long flags, void (*ctor)(void *))
44{ return NULL; } 55{ return NULL; }
45#endif 56#endif
46 57
47 58
59/* Legal flag mask for kmem_cache_create(), for various configurations */
60#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
61 SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS )
62
63#if defined(CONFIG_DEBUG_SLAB)
64#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
65#elif defined(CONFIG_SLUB_DEBUG)
66#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
67 SLAB_TRACE | SLAB_DEBUG_FREE)
68#else
69#define SLAB_DEBUG_FLAGS (0)
70#endif
71
72#if defined(CONFIG_SLAB)
73#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
74 SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
75#elif defined(CONFIG_SLUB)
76#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
77 SLAB_TEMPORARY | SLAB_NOTRACK)
78#else
79#define SLAB_CACHE_FLAGS (0)
80#endif
81
82#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
83
48int __kmem_cache_shutdown(struct kmem_cache *); 84int __kmem_cache_shutdown(struct kmem_cache *);
49 85
86struct seq_file;
87struct file;
88
89struct slabinfo {
90 unsigned long active_objs;
91 unsigned long num_objs;
92 unsigned long active_slabs;
93 unsigned long num_slabs;
94 unsigned long shared_avail;
95 unsigned int limit;
96 unsigned int batchcount;
97 unsigned int shared;
98 unsigned int objects_per_slab;
99 unsigned int cache_order;
100};
101
102void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);
103void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
104ssize_t slabinfo_write(struct file *file, const char __user *buffer,
105 size_t count, loff_t *ppos);
106
107#ifdef CONFIG_MEMCG_KMEM
108static inline bool is_root_cache(struct kmem_cache *s)
109{
110 return !s->memcg_params || s->memcg_params->is_root_cache;
111}
112
113static inline bool cache_match_memcg(struct kmem_cache *cachep,
114 struct mem_cgroup *memcg)
115{
116 return (is_root_cache(cachep) && !memcg) ||
117 (cachep->memcg_params->memcg == memcg);
118}
119
120static inline void memcg_bind_pages(struct kmem_cache *s, int order)
121{
122 if (!is_root_cache(s))
123 atomic_add(1 << order, &s->memcg_params->nr_pages);
124}
125
126static inline void memcg_release_pages(struct kmem_cache *s, int order)
127{
128 if (is_root_cache(s))
129 return;
130
131 if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
132 mem_cgroup_destroy_cache(s);
133}
134
135static inline bool slab_equal_or_root(struct kmem_cache *s,
136 struct kmem_cache *p)
137{
138 return (p == s) ||
139 (s->memcg_params && (p == s->memcg_params->root_cache));
140}
141
142/*
143 * We use suffixes to the name in memcg because we can't have caches
144 * created in the system with the same name. But when we print them
145 * locally, better refer to them with the base name
146 */
147static inline const char *cache_name(struct kmem_cache *s)
148{
149 if (!is_root_cache(s))
150 return s->memcg_params->root_cache->name;
151 return s->name;
152}
153
154static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
155{
156 return s->memcg_params->memcg_caches[idx];
157}
158
159static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
160{
161 if (is_root_cache(s))
162 return s;
163 return s->memcg_params->root_cache;
164}
165#else
166static inline bool is_root_cache(struct kmem_cache *s)
167{
168 return true;
169}
170
171static inline bool cache_match_memcg(struct kmem_cache *cachep,
172 struct mem_cgroup *memcg)
173{
174 return true;
175}
176
177static inline void memcg_bind_pages(struct kmem_cache *s, int order)
178{
179}
180
181static inline void memcg_release_pages(struct kmem_cache *s, int order)
182{
183}
184
185static inline bool slab_equal_or_root(struct kmem_cache *s,
186 struct kmem_cache *p)
187{
188 return true;
189}
190
191static inline const char *cache_name(struct kmem_cache *s)
192{
193 return s->name;
194}
195
196static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
197{
198 return NULL;
199}
200
201static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
202{
203 return s;
204}
205#endif
206
207static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
208{
209 struct kmem_cache *cachep;
210 struct page *page;
211
212 /*
213 * When kmemcg is not being used, both assignments should return the
214 * same value. but we don't want to pay the assignment price in that
215 * case. If it is not compiled in, the compiler should be smart enough
216 * to not do even the assignment. In that case, slab_equal_or_root
217 * will also be a constant.
218 */
219 if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
220 return s;
221
222 page = virt_to_head_page(x);
223 cachep = page->slab_cache;
224 if (slab_equal_or_root(cachep, s))
225 return cachep;
226
227 pr_err("%s: Wrong slab cache. %s but object is from %s\n",
228 __FUNCTION__, cachep->name, s->name);
229 WARN_ON_ONCE(1);
230 return s;
231}
50#endif 232#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 069a24e64403..3f3cd97d3fdf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -13,9 +13,12 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/seq_file.h>
17#include <linux/proc_fs.h>
16#include <asm/cacheflush.h> 18#include <asm/cacheflush.h>
17#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
18#include <asm/page.h> 20#include <asm/page.h>
21#include <linux/memcontrol.h>
19 22
20#include "slab.h" 23#include "slab.h"
21 24
@@ -25,7 +28,8 @@ DEFINE_MUTEX(slab_mutex);
25struct kmem_cache *kmem_cache; 28struct kmem_cache *kmem_cache;
26 29
27#ifdef CONFIG_DEBUG_VM 30#ifdef CONFIG_DEBUG_VM
28static int kmem_cache_sanity_check(const char *name, size_t size) 31static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
32 size_t size)
29{ 33{
30 struct kmem_cache *s = NULL; 34 struct kmem_cache *s = NULL;
31 35
@@ -51,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
51 continue; 55 continue;
52 } 56 }
53 57
54 if (!strcmp(s->name, name)) { 58 /*
59 * For simplicity, we won't check this in the list of memcg
60 * caches. We have control over memcg naming, and if there
61 * aren't duplicates in the global list, there won't be any
62 * duplicates in the memcg lists as well.
63 */
64 if (!memcg && !strcmp(s->name, name)) {
55 pr_err("%s (%s): Cache name already exists.\n", 65 pr_err("%s (%s): Cache name already exists.\n",
56 __func__, name); 66 __func__, name);
57 dump_stack(); 67 dump_stack();
@@ -64,12 +74,69 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
64 return 0; 74 return 0;
65} 75}
66#else 76#else
67static inline int kmem_cache_sanity_check(const char *name, size_t size) 77static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
78 const char *name, size_t size)
68{ 79{
69 return 0; 80 return 0;
70} 81}
71#endif 82#endif
72 83
84#ifdef CONFIG_MEMCG_KMEM
85int memcg_update_all_caches(int num_memcgs)
86{
87 struct kmem_cache *s;
88 int ret = 0;
89 mutex_lock(&slab_mutex);
90
91 list_for_each_entry(s, &slab_caches, list) {
92 if (!is_root_cache(s))
93 continue;
94
95 ret = memcg_update_cache_size(s, num_memcgs);
96 /*
97 * See comment in memcontrol.c, memcg_update_cache_size:
98 * Instead of freeing the memory, we'll just leave the caches
99 * up to this point in an updated state.
100 */
101 if (ret)
102 goto out;
103 }
104
105 memcg_update_array_size(num_memcgs);
106out:
107 mutex_unlock(&slab_mutex);
108 return ret;
109}
110#endif
111
112/*
113 * Figure out what the alignment of the objects will be given a set of
114 * flags, a user specified alignment and the size of the objects.
115 */
116unsigned long calculate_alignment(unsigned long flags,
117 unsigned long align, unsigned long size)
118{
119 /*
120 * If the user wants hardware cache aligned objects then follow that
121 * suggestion if the object is sufficiently large.
122 *
123 * The hardware cache alignment cannot override the specified
124 * alignment though. If that is greater then use it.
125 */
126 if (flags & SLAB_HWCACHE_ALIGN) {
127 unsigned long ralign = cache_line_size();
128 while (size <= ralign / 2)
129 ralign /= 2;
130 align = max(align, ralign);
131 }
132
133 if (align < ARCH_SLAB_MINALIGN)
134 align = ARCH_SLAB_MINALIGN;
135
136 return ALIGN(align, sizeof(void *));
137}
138
139
73/* 140/*
74 * kmem_cache_create - Create a cache. 141 * kmem_cache_create - Create a cache.
75 * @name: A string which is used in /proc/slabinfo to identify this cache. 142 * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -95,8 +162,10 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
95 * as davem. 162 * as davem.
96 */ 163 */
97 164
98struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, 165struct kmem_cache *
99 unsigned long flags, void (*ctor)(void *)) 166kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
167 size_t align, unsigned long flags, void (*ctor)(void *),
168 struct kmem_cache *parent_cache)
100{ 169{
101 struct kmem_cache *s = NULL; 170 struct kmem_cache *s = NULL;
102 int err = 0; 171 int err = 0;
@@ -104,19 +173,33 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
104 get_online_cpus(); 173 get_online_cpus();
105 mutex_lock(&slab_mutex); 174 mutex_lock(&slab_mutex);
106 175
107 if (!kmem_cache_sanity_check(name, size) == 0) 176 if (!kmem_cache_sanity_check(memcg, name, size) == 0)
108 goto out_locked; 177 goto out_locked;
109 178
179 /*
180 * Some allocators will constraint the set of valid flags to a subset
181 * of all flags. We expect them to define CACHE_CREATE_MASK in this
182 * case, and we'll just provide them with a sanitized version of the
183 * passed flags.
184 */
185 flags &= CACHE_CREATE_MASK;
110 186
111 s = __kmem_cache_alias(name, size, align, flags, ctor); 187 s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
112 if (s) 188 if (s)
113 goto out_locked; 189 goto out_locked;
114 190
115 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); 191 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
116 if (s) { 192 if (s) {
117 s->object_size = s->size = size; 193 s->object_size = s->size = size;
118 s->align = align; 194 s->align = calculate_alignment(flags, align, size);
119 s->ctor = ctor; 195 s->ctor = ctor;
196
197 if (memcg_register_cache(memcg, s, parent_cache)) {
198 kmem_cache_free(kmem_cache, s);
199 err = -ENOMEM;
200 goto out_locked;
201 }
202
120 s->name = kstrdup(name, GFP_KERNEL); 203 s->name = kstrdup(name, GFP_KERNEL);
121 if (!s->name) { 204 if (!s->name) {
122 kmem_cache_free(kmem_cache, s); 205 kmem_cache_free(kmem_cache, s);
@@ -126,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
126 209
127 err = __kmem_cache_create(s, flags); 210 err = __kmem_cache_create(s, flags);
128 if (!err) { 211 if (!err) {
129
130 s->refcount = 1; 212 s->refcount = 1;
131 list_add(&s->list, &slab_caches); 213 list_add(&s->list, &slab_caches);
132 214 memcg_cache_list_add(memcg, s);
133 } else { 215 } else {
134 kfree(s->name); 216 kfree(s->name);
135 kmem_cache_free(kmem_cache, s); 217 kmem_cache_free(kmem_cache, s);
@@ -157,10 +239,20 @@ out_locked:
157 239
158 return s; 240 return s;
159} 241}
242
243struct kmem_cache *
244kmem_cache_create(const char *name, size_t size, size_t align,
245 unsigned long flags, void (*ctor)(void *))
246{
247 return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
248}
160EXPORT_SYMBOL(kmem_cache_create); 249EXPORT_SYMBOL(kmem_cache_create);
161 250
162void kmem_cache_destroy(struct kmem_cache *s) 251void kmem_cache_destroy(struct kmem_cache *s)
163{ 252{
253 /* Destroy all the children caches if we aren't a memcg cache */
254 kmem_cache_destroy_memcg_children(s);
255
164 get_online_cpus(); 256 get_online_cpus();
165 mutex_lock(&slab_mutex); 257 mutex_lock(&slab_mutex);
166 s->refcount--; 258 s->refcount--;
@@ -172,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
172 if (s->flags & SLAB_DESTROY_BY_RCU) 264 if (s->flags & SLAB_DESTROY_BY_RCU)
173 rcu_barrier(); 265 rcu_barrier();
174 266
267 memcg_release_cache(s);
175 kfree(s->name); 268 kfree(s->name);
176 kmem_cache_free(kmem_cache, s); 269 kmem_cache_free(kmem_cache, s);
177 } else { 270 } else {
@@ -192,3 +285,182 @@ int slab_is_available(void)
192{ 285{
193 return slab_state >= UP; 286 return slab_state >= UP;
194} 287}
288
289#ifndef CONFIG_SLOB
290/* Create a cache during boot when no slab services are available yet */
291void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
292 unsigned long flags)
293{
294 int err;
295
296 s->name = name;
297 s->size = s->object_size = size;
298 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
299 err = __kmem_cache_create(s, flags);
300
301 if (err)
302 panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
303 name, size, err);
304
305 s->refcount = -1; /* Exempt from merging for now */
306}
307
308struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
309 unsigned long flags)
310{
311 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
312
313 if (!s)
314 panic("Out of memory when creating slab %s\n", name);
315
316 create_boot_cache(s, name, size, flags);
317 list_add(&s->list, &slab_caches);
318 s->refcount = 1;
319 return s;
320}
321
322#endif /* !CONFIG_SLOB */
323
324
325#ifdef CONFIG_SLABINFO
326void print_slabinfo_header(struct seq_file *m)
327{
328 /*
329 * Output format version, so at least we can change it
330 * without _too_ many complaints.
331 */
332#ifdef CONFIG_DEBUG_SLAB
333 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
334#else
335 seq_puts(m, "slabinfo - version: 2.1\n");
336#endif
337 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
338 "<objperslab> <pagesperslab>");
339 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
340 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
341#ifdef CONFIG_DEBUG_SLAB
342 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
343 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
344 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
345#endif
346 seq_putc(m, '\n');
347}
348
349static void *s_start(struct seq_file *m, loff_t *pos)
350{
351 loff_t n = *pos;
352
353 mutex_lock(&slab_mutex);
354 if (!n)
355 print_slabinfo_header(m);
356
357 return seq_list_start(&slab_caches, *pos);
358}
359
360static void *s_next(struct seq_file *m, void *p, loff_t *pos)
361{
362 return seq_list_next(p, &slab_caches, pos);
363}
364
365static void s_stop(struct seq_file *m, void *p)
366{
367 mutex_unlock(&slab_mutex);
368}
369
370static void
371memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
372{
373 struct kmem_cache *c;
374 struct slabinfo sinfo;
375 int i;
376
377 if (!is_root_cache(s))
378 return;
379
380 for_each_memcg_cache_index(i) {
381 c = cache_from_memcg(s, i);
382 if (!c)
383 continue;
384
385 memset(&sinfo, 0, sizeof(sinfo));
386 get_slabinfo(c, &sinfo);
387
388 info->active_slabs += sinfo.active_slabs;
389 info->num_slabs += sinfo.num_slabs;
390 info->shared_avail += sinfo.shared_avail;
391 info->active_objs += sinfo.active_objs;
392 info->num_objs += sinfo.num_objs;
393 }
394}
395
396int cache_show(struct kmem_cache *s, struct seq_file *m)
397{
398 struct slabinfo sinfo;
399
400 memset(&sinfo, 0, sizeof(sinfo));
401 get_slabinfo(s, &sinfo);
402
403 memcg_accumulate_slabinfo(s, &sinfo);
404
405 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
406 cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
407 sinfo.objects_per_slab, (1 << sinfo.cache_order));
408
409 seq_printf(m, " : tunables %4u %4u %4u",
410 sinfo.limit, sinfo.batchcount, sinfo.shared);
411 seq_printf(m, " : slabdata %6lu %6lu %6lu",
412 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
413 slabinfo_show_stats(m, s);
414 seq_putc(m, '\n');
415 return 0;
416}
417
418static int s_show(struct seq_file *m, void *p)
419{
420 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
421
422 if (!is_root_cache(s))
423 return 0;
424 return cache_show(s, m);
425}
426
427/*
428 * slabinfo_op - iterator that generates /proc/slabinfo
429 *
430 * Output layout:
431 * cache-name
432 * num-active-objs
433 * total-objs
434 * object size
435 * num-active-slabs
436 * total-slabs
437 * num-pages-per-slab
438 * + further values on SMP and with statistics enabled
439 */
440static const struct seq_operations slabinfo_op = {
441 .start = s_start,
442 .next = s_next,
443 .stop = s_stop,
444 .show = s_show,
445};
446
447static int slabinfo_open(struct inode *inode, struct file *file)
448{
449 return seq_open(file, &slabinfo_op);
450}
451
452static const struct file_operations proc_slabinfo_operations = {
453 .open = slabinfo_open,
454 .read = seq_read,
455 .write = slabinfo_write,
456 .llseek = seq_lseek,
457 .release = seq_release,
458};
459
460static int __init slab_proc_init(void)
461{
462 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
463 return 0;
464}
465module_init(slab_proc_init);
466#endif /* CONFIG_SLABINFO */
diff --git a/mm/slob.c b/mm/slob.c
index 1e921c5e9576..a99fdf7a0907 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -28,9 +28,8 @@
28 * from kmalloc are prepended with a 4-byte header with the kmalloc size. 28 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
29 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls 29 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
30 * alloc_pages() directly, allocating compound pages so the page order 30 * alloc_pages() directly, allocating compound pages so the page order
31 * does not have to be separately tracked, and also stores the exact 31 * does not have to be separately tracked.
32 * allocation size in page->private so that it can be used to accurately 32 * These objects are detected in kfree() because PageSlab()
33 * provide ksize(). These objects are detected in kfree() because slob_page()
34 * is false for them. 33 * is false for them.
35 * 34 *
36 * SLAB is emulated on top of SLOB by simply calling constructors and 35 * SLAB is emulated on top of SLOB by simply calling constructors and
@@ -59,7 +58,6 @@
59 58
60#include <linux/kernel.h> 59#include <linux/kernel.h>
61#include <linux/slab.h> 60#include <linux/slab.h>
62#include "slab.h"
63 61
64#include <linux/mm.h> 62#include <linux/mm.h>
65#include <linux/swap.h> /* struct reclaim_state */ 63#include <linux/swap.h> /* struct reclaim_state */
@@ -74,6 +72,7 @@
74 72
75#include <linux/atomic.h> 73#include <linux/atomic.h>
76 74
75#include "slab.h"
77/* 76/*
78 * slob_block has a field 'units', which indicates size of block if +ve, 77 * slob_block has a field 'units', which indicates size of block if +ve,
79 * or offset of next block if -ve (in SLOB_UNITs). 78 * or offset of next block if -ve (in SLOB_UNITs).
@@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp)
124 123
125#define SLOB_UNIT sizeof(slob_t) 124#define SLOB_UNIT sizeof(slob_t)
126#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) 125#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
127#define SLOB_ALIGN L1_CACHE_BYTES
128 126
129/* 127/*
130 * struct slob_rcu is inserted at the tail of allocated slob blocks, which 128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
@@ -455,11 +453,6 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
455 if (likely(order)) 453 if (likely(order))
456 gfp |= __GFP_COMP; 454 gfp |= __GFP_COMP;
457 ret = slob_new_pages(gfp, order, node); 455 ret = slob_new_pages(gfp, order, node);
458 if (ret) {
459 struct page *page;
460 page = virt_to_page(ret);
461 page->private = size;
462 }
463 456
464 trace_kmalloc_node(caller, ret, 457 trace_kmalloc_node(caller, ret,
465 size, PAGE_SIZE << order, gfp, node); 458 size, PAGE_SIZE << order, gfp, node);
@@ -506,7 +499,7 @@ void kfree(const void *block)
506 unsigned int *m = (unsigned int *)(block - align); 499 unsigned int *m = (unsigned int *)(block - align);
507 slob_free(m, *m + align); 500 slob_free(m, *m + align);
508 } else 501 } else
509 put_page(sp); 502 __free_pages(sp, compound_order(sp));
510} 503}
511EXPORT_SYMBOL(kfree); 504EXPORT_SYMBOL(kfree);
512 505
@@ -514,37 +507,30 @@ EXPORT_SYMBOL(kfree);
514size_t ksize(const void *block) 507size_t ksize(const void *block)
515{ 508{
516 struct page *sp; 509 struct page *sp;
510 int align;
511 unsigned int *m;
517 512
518 BUG_ON(!block); 513 BUG_ON(!block);
519 if (unlikely(block == ZERO_SIZE_PTR)) 514 if (unlikely(block == ZERO_SIZE_PTR))
520 return 0; 515 return 0;
521 516
522 sp = virt_to_page(block); 517 sp = virt_to_page(block);
523 if (PageSlab(sp)) { 518 if (unlikely(!PageSlab(sp)))
524 int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 519 return PAGE_SIZE << compound_order(sp);
525 unsigned int *m = (unsigned int *)(block - align); 520
526 return SLOB_UNITS(*m) * SLOB_UNIT; 521 align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
527 } else 522 m = (unsigned int *)(block - align);
528 return sp->private; 523 return SLOB_UNITS(*m) * SLOB_UNIT;
529} 524}
530EXPORT_SYMBOL(ksize); 525EXPORT_SYMBOL(ksize);
531 526
532int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) 527int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
533{ 528{
534 size_t align = c->size;
535
536 if (flags & SLAB_DESTROY_BY_RCU) { 529 if (flags & SLAB_DESTROY_BY_RCU) {
537 /* leave room for rcu footer at the end of object */ 530 /* leave room for rcu footer at the end of object */
538 c->size += sizeof(struct slob_rcu); 531 c->size += sizeof(struct slob_rcu);
539 } 532 }
540 c->flags = flags; 533 c->flags = flags;
541 /* ignore alignment unless it's forced */
542 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
543 if (c->align < ARCH_SLAB_MINALIGN)
544 c->align = ARCH_SLAB_MINALIGN;
545 if (c->align < align)
546 c->align = align;
547
548 return 0; 534 return 0;
549} 535}
550 536
@@ -558,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
558 544
559 if (c->size < PAGE_SIZE) { 545 if (c->size < PAGE_SIZE) {
560 b = slob_alloc(c->size, flags, c->align, node); 546 b = slob_alloc(c->size, flags, c->align, node);
561 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 547 trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
562 SLOB_UNITS(c->size) * SLOB_UNIT, 548 SLOB_UNITS(c->size) * SLOB_UNIT,
563 flags, node); 549 flags, node);
564 } else { 550 } else {
565 b = slob_new_pages(flags, get_order(c->size), node); 551 b = slob_new_pages(flags, get_order(c->size), node);
566 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 552 trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
567 PAGE_SIZE << get_order(c->size), 553 PAGE_SIZE << get_order(c->size),
568 flags, node); 554 flags, node);
569 } 555 }
@@ -608,12 +594,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
608} 594}
609EXPORT_SYMBOL(kmem_cache_free); 595EXPORT_SYMBOL(kmem_cache_free);
610 596
611unsigned int kmem_cache_size(struct kmem_cache *c)
612{
613 return c->size;
614}
615EXPORT_SYMBOL(kmem_cache_size);
616
617int __kmem_cache_shutdown(struct kmem_cache *c) 597int __kmem_cache_shutdown(struct kmem_cache *c)
618{ 598{
619 /* No way to check for remaining objects */ 599 /* No way to check for remaining objects */
diff --git a/mm/slub.c b/mm/slub.c
index a0d698467f70..ba2ca53f6c3a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -31,6 +31,7 @@
31#include <linux/fault-inject.h> 31#include <linux/fault-inject.h>
32#include <linux/stacktrace.h> 32#include <linux/stacktrace.h>
33#include <linux/prefetch.h> 33#include <linux/prefetch.h>
34#include <linux/memcontrol.h>
34 35
35#include <trace/events/kmem.h> 36#include <trace/events/kmem.h>
36 37
@@ -112,9 +113,6 @@
112 * the fast path and disables lockless freelists. 113 * the fast path and disables lockless freelists.
113 */ 114 */
114 115
115#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
116 SLAB_TRACE | SLAB_DEBUG_FREE)
117
118static inline int kmem_cache_debug(struct kmem_cache *s) 116static inline int kmem_cache_debug(struct kmem_cache *s)
119{ 117{
120#ifdef CONFIG_SLUB_DEBUG 118#ifdef CONFIG_SLUB_DEBUG
@@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
179#define __OBJECT_POISON 0x80000000UL /* Poison object */ 177#define __OBJECT_POISON 0x80000000UL /* Poison object */
180#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ 178#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
181 179
182static int kmem_size = sizeof(struct kmem_cache);
183
184#ifdef CONFIG_SMP 180#ifdef CONFIG_SMP
185static struct notifier_block slab_notifier; 181static struct notifier_block slab_notifier;
186#endif 182#endif
@@ -205,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
205static int sysfs_slab_add(struct kmem_cache *); 201static int sysfs_slab_add(struct kmem_cache *);
206static int sysfs_slab_alias(struct kmem_cache *, const char *); 202static int sysfs_slab_alias(struct kmem_cache *, const char *);
207static void sysfs_slab_remove(struct kmem_cache *); 203static void sysfs_slab_remove(struct kmem_cache *);
208 204static void memcg_propagate_slab_attrs(struct kmem_cache *s);
209#else 205#else
210static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 206static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
211static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 207static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
212 { return 0; } 208 { return 0; }
213static inline void sysfs_slab_remove(struct kmem_cache *s) { } 209static inline void sysfs_slab_remove(struct kmem_cache *s) { }
214 210
211static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
215#endif 212#endif
216 213
217static inline void stat(const struct kmem_cache *s, enum stat_item si) 214static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -1092,11 +1089,11 @@ static noinline struct kmem_cache_node *free_debug_processing(
1092 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1089 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1093 goto out; 1090 goto out;
1094 1091
1095 if (unlikely(s != page->slab)) { 1092 if (unlikely(s != page->slab_cache)) {
1096 if (!PageSlab(page)) { 1093 if (!PageSlab(page)) {
1097 slab_err(s, page, "Attempt to free object(0x%p) " 1094 slab_err(s, page, "Attempt to free object(0x%p) "
1098 "outside of slab", object); 1095 "outside of slab", object);
1099 } else if (!page->slab) { 1096 } else if (!page->slab_cache) {
1100 printk(KERN_ERR 1097 printk(KERN_ERR
1101 "SLUB <none>: no slab for object 0x%p.\n", 1098 "SLUB <none>: no slab for object 0x%p.\n",
1102 object); 1099 object);
@@ -1348,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1348 void *start; 1345 void *start;
1349 void *last; 1346 void *last;
1350 void *p; 1347 void *p;
1348 int order;
1351 1349
1352 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1350 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1353 1351
@@ -1356,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1356 if (!page) 1354 if (!page)
1357 goto out; 1355 goto out;
1358 1356
1357 order = compound_order(page);
1359 inc_slabs_node(s, page_to_nid(page), page->objects); 1358 inc_slabs_node(s, page_to_nid(page), page->objects);
1360 page->slab = s; 1359 memcg_bind_pages(s, order);
1360 page->slab_cache = s;
1361 __SetPageSlab(page); 1361 __SetPageSlab(page);
1362 if (page->pfmemalloc) 1362 if (page->pfmemalloc)
1363 SetPageSlabPfmemalloc(page); 1363 SetPageSlabPfmemalloc(page);
@@ -1365,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1365 start = page_address(page); 1365 start = page_address(page);
1366 1366
1367 if (unlikely(s->flags & SLAB_POISON)) 1367 if (unlikely(s->flags & SLAB_POISON))
1368 memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); 1368 memset(start, POISON_INUSE, PAGE_SIZE << order);
1369 1369
1370 last = start; 1370 last = start;
1371 for_each_object(p, s, start, page->objects) { 1371 for_each_object(p, s, start, page->objects) {
@@ -1406,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1406 1406
1407 __ClearPageSlabPfmemalloc(page); 1407 __ClearPageSlabPfmemalloc(page);
1408 __ClearPageSlab(page); 1408 __ClearPageSlab(page);
1409
1410 memcg_release_pages(s, order);
1409 reset_page_mapcount(page); 1411 reset_page_mapcount(page);
1410 if (current->reclaim_state) 1412 if (current->reclaim_state)
1411 current->reclaim_state->reclaimed_slab += pages; 1413 current->reclaim_state->reclaimed_slab += pages;
1412 __free_pages(page, order); 1414 __free_memcg_kmem_pages(page, order);
1413} 1415}
1414 1416
1415#define need_reserve_slab_rcu \ 1417#define need_reserve_slab_rcu \
@@ -1424,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h)
1424 else 1426 else
1425 page = container_of((struct list_head *)h, struct page, lru); 1427 page = container_of((struct list_head *)h, struct page, lru);
1426 1428
1427 __free_slab(page->slab, page); 1429 __free_slab(page->slab_cache, page);
1428} 1430}
1429 1431
1430static void free_slab(struct kmem_cache *s, struct page *page) 1432static void free_slab(struct kmem_cache *s, struct page *page)
@@ -1872,12 +1874,14 @@ redo:
1872/* 1874/*
1873 * Unfreeze all the cpu partial slabs. 1875 * Unfreeze all the cpu partial slabs.
1874 * 1876 *
1875 * This function must be called with interrupt disabled. 1877 * This function must be called with interrupts disabled
1878 * for the cpu using c (or some other guarantee must be there
1879 * to guarantee no concurrent accesses).
1876 */ 1880 */
1877static void unfreeze_partials(struct kmem_cache *s) 1881static void unfreeze_partials(struct kmem_cache *s,
1882 struct kmem_cache_cpu *c)
1878{ 1883{
1879 struct kmem_cache_node *n = NULL, *n2 = NULL; 1884 struct kmem_cache_node *n = NULL, *n2 = NULL;
1880 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
1881 struct page *page, *discard_page = NULL; 1885 struct page *page, *discard_page = NULL;
1882 1886
1883 while ((page = c->partial)) { 1887 while ((page = c->partial)) {
@@ -1963,7 +1967,7 @@ static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1963 * set to the per node partial list. 1967 * set to the per node partial list.
1964 */ 1968 */
1965 local_irq_save(flags); 1969 local_irq_save(flags);
1966 unfreeze_partials(s); 1970 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
1967 local_irq_restore(flags); 1971 local_irq_restore(flags);
1968 oldpage = NULL; 1972 oldpage = NULL;
1969 pobjects = 0; 1973 pobjects = 0;
@@ -2006,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2006 if (c->page) 2010 if (c->page)
2007 flush_slab(s, c); 2011 flush_slab(s, c);
2008 2012
2009 unfreeze_partials(s); 2013 unfreeze_partials(s, c);
2010 } 2014 }
2011} 2015}
2012 2016
@@ -2325,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2325 if (slab_pre_alloc_hook(s, gfpflags)) 2329 if (slab_pre_alloc_hook(s, gfpflags))
2326 return NULL; 2330 return NULL;
2327 2331
2332 s = memcg_kmem_get_cache(s, gfpflags);
2328redo: 2333redo:
2329 2334
2330 /* 2335 /*
@@ -2459,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2459 void *prior; 2464 void *prior;
2460 void **object = (void *)x; 2465 void **object = (void *)x;
2461 int was_frozen; 2466 int was_frozen;
2462 int inuse;
2463 struct page new; 2467 struct page new;
2464 unsigned long counters; 2468 unsigned long counters;
2465 struct kmem_cache_node *n = NULL; 2469 struct kmem_cache_node *n = NULL;
@@ -2472,13 +2476,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2472 return; 2476 return;
2473 2477
2474 do { 2478 do {
2479 if (unlikely(n)) {
2480 spin_unlock_irqrestore(&n->list_lock, flags);
2481 n = NULL;
2482 }
2475 prior = page->freelist; 2483 prior = page->freelist;
2476 counters = page->counters; 2484 counters = page->counters;
2477 set_freepointer(s, object, prior); 2485 set_freepointer(s, object, prior);
2478 new.counters = counters; 2486 new.counters = counters;
2479 was_frozen = new.frozen; 2487 was_frozen = new.frozen;
2480 new.inuse--; 2488 new.inuse--;
2481 if ((!new.inuse || !prior) && !was_frozen && !n) { 2489 if ((!new.inuse || !prior) && !was_frozen) {
2482 2490
2483 if (!kmem_cache_debug(s) && !prior) 2491 if (!kmem_cache_debug(s) && !prior)
2484 2492
@@ -2503,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2503 2511
2504 } 2512 }
2505 } 2513 }
2506 inuse = new.inuse;
2507 2514
2508 } while (!cmpxchg_double_slab(s, page, 2515 } while (!cmpxchg_double_slab(s, page,
2509 prior, counters, 2516 prior, counters,
@@ -2529,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2529 return; 2536 return;
2530 } 2537 }
2531 2538
2539 if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
2540 goto slab_empty;
2541
2532 /* 2542 /*
2533 * was_frozen may have been set after we acquired the list_lock in 2543 * Objects left in the slab. If it was not on the partial list before
2534 * an earlier loop. So we need to check it here again. 2544 * then add it.
2535 */ 2545 */
2536 if (was_frozen) 2546 if (kmem_cache_debug(s) && unlikely(!prior)) {
2537 stat(s, FREE_FROZEN); 2547 remove_full(s, page);
2538 else { 2548 add_partial(n, page, DEACTIVATE_TO_TAIL);
2539 if (unlikely(!inuse && n->nr_partial > s->min_partial)) 2549 stat(s, FREE_ADD_PARTIAL);
2540 goto slab_empty;
2541
2542 /*
2543 * Objects left in the slab. If it was not on the partial list before
2544 * then add it.
2545 */
2546 if (unlikely(!prior)) {
2547 remove_full(s, page);
2548 add_partial(n, page, DEACTIVATE_TO_TAIL);
2549 stat(s, FREE_ADD_PARTIAL);
2550 }
2551 } 2550 }
2552 spin_unlock_irqrestore(&n->list_lock, flags); 2551 spin_unlock_irqrestore(&n->list_lock, flags);
2553 return; 2552 return;
@@ -2619,19 +2618,10 @@ redo:
2619 2618
2620void kmem_cache_free(struct kmem_cache *s, void *x) 2619void kmem_cache_free(struct kmem_cache *s, void *x)
2621{ 2620{
2622 struct page *page; 2621 s = cache_from_obj(s, x);
2623 2622 if (!s)
2624 page = virt_to_head_page(x);
2625
2626 if (kmem_cache_debug(s) && page->slab != s) {
2627 pr_err("kmem_cache_free: Wrong slab cache. %s but object"
2628 " is from %s\n", page->slab->name, s->name);
2629 WARN_ON_ONCE(1);
2630 return; 2623 return;
2631 } 2624 slab_free(s, virt_to_head_page(x), x, _RET_IP_);
2632
2633 slab_free(s, page, x, _RET_IP_);
2634
2635 trace_kmem_cache_free(_RET_IP_, x); 2625 trace_kmem_cache_free(_RET_IP_, x);
2636} 2626}
2637EXPORT_SYMBOL(kmem_cache_free); 2627EXPORT_SYMBOL(kmem_cache_free);
@@ -2769,32 +2759,6 @@ static inline int calculate_order(int size, int reserved)
2769 return -ENOSYS; 2759 return -ENOSYS;
2770} 2760}
2771 2761
2772/*
2773 * Figure out what the alignment of the objects will be.
2774 */
2775static unsigned long calculate_alignment(unsigned long flags,
2776 unsigned long align, unsigned long size)
2777{
2778 /*
2779 * If the user wants hardware cache aligned objects then follow that
2780 * suggestion if the object is sufficiently large.
2781 *
2782 * The hardware cache alignment cannot override the specified
2783 * alignment though. If that is greater then use it.
2784 */
2785 if (flags & SLAB_HWCACHE_ALIGN) {
2786 unsigned long ralign = cache_line_size();
2787 while (size <= ralign / 2)
2788 ralign /= 2;
2789 align = max(align, ralign);
2790 }
2791
2792 if (align < ARCH_SLAB_MINALIGN)
2793 align = ARCH_SLAB_MINALIGN;
2794
2795 return ALIGN(align, sizeof(void *));
2796}
2797
2798static void 2762static void
2799init_kmem_cache_node(struct kmem_cache_node *n) 2763init_kmem_cache_node(struct kmem_cache_node *n)
2800{ 2764{
@@ -2928,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2928{ 2892{
2929 unsigned long flags = s->flags; 2893 unsigned long flags = s->flags;
2930 unsigned long size = s->object_size; 2894 unsigned long size = s->object_size;
2931 unsigned long align = s->align;
2932 int order; 2895 int order;
2933 2896
2934 /* 2897 /*
@@ -3000,19 +2963,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3000#endif 2963#endif
3001 2964
3002 /* 2965 /*
3003 * Determine the alignment based on various parameters that the
3004 * user specified and the dynamic determination of cache line size
3005 * on bootup.
3006 */
3007 align = calculate_alignment(flags, align, s->object_size);
3008 s->align = align;
3009
3010 /*
3011 * SLUB stores one object immediately after another beginning from 2966 * SLUB stores one object immediately after another beginning from
3012 * offset 0. In order to align the objects we have to simply size 2967 * offset 0. In order to align the objects we have to simply size
3013 * each object to conform to the alignment. 2968 * each object to conform to the alignment.
3014 */ 2969 */
3015 size = ALIGN(size, align); 2970 size = ALIGN(size, s->align);
3016 s->size = size; 2971 s->size = size;
3017 if (forced_order >= 0) 2972 if (forced_order >= 0)
3018 order = forced_order; 2973 order = forced_order;
@@ -3041,7 +2996,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3041 s->max = s->oo; 2996 s->max = s->oo;
3042 2997
3043 return !!oo_objects(s->oo); 2998 return !!oo_objects(s->oo);
3044
3045} 2999}
3046 3000
3047static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) 3001static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
@@ -3127,15 +3081,6 @@ error:
3127 return -EINVAL; 3081 return -EINVAL;
3128} 3082}
3129 3083
3130/*
3131 * Determine the size of a slab object
3132 */
3133unsigned int kmem_cache_size(struct kmem_cache *s)
3134{
3135 return s->object_size;
3136}
3137EXPORT_SYMBOL(kmem_cache_size);
3138
3139static void list_slab_objects(struct kmem_cache *s, struct page *page, 3084static void list_slab_objects(struct kmem_cache *s, struct page *page,
3140 const char *text) 3085 const char *text)
3141{ 3086{
@@ -3208,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
3208{ 3153{
3209 int rc = kmem_cache_close(s); 3154 int rc = kmem_cache_close(s);
3210 3155
3211 if (!rc) 3156 if (!rc) {
3157 /*
3158 * We do the same lock strategy around sysfs_slab_add, see
3159 * __kmem_cache_create. Because this is pretty much the last
3160 * operation we do and the lock will be released shortly after
3161 * that in slab_common.c, we could just move sysfs_slab_remove
3162 * to a later point in common code. We should do that when we
3163 * have a common sysfs framework for all allocators.
3164 */
3165 mutex_unlock(&slab_mutex);
3212 sysfs_slab_remove(s); 3166 sysfs_slab_remove(s);
3167 mutex_lock(&slab_mutex);
3168 }
3213 3169
3214 return rc; 3170 return rc;
3215} 3171}
@@ -3261,32 +3217,6 @@ static int __init setup_slub_nomerge(char *str)
3261 3217
3262__setup("slub_nomerge", setup_slub_nomerge); 3218__setup("slub_nomerge", setup_slub_nomerge);
3263 3219
3264static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3265 int size, unsigned int flags)
3266{
3267 struct kmem_cache *s;
3268
3269 s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3270
3271 s->name = name;
3272 s->size = s->object_size = size;
3273 s->align = ARCH_KMALLOC_MINALIGN;
3274
3275 /*
3276 * This function is called with IRQs disabled during early-boot on
3277 * single CPU so there's no need to take slab_mutex here.
3278 */
3279 if (kmem_cache_open(s, flags))
3280 goto panic;
3281
3282 list_add(&s->list, &slab_caches);
3283 return s;
3284
3285panic:
3286 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
3287 return NULL;
3288}
3289
3290/* 3220/*
3291 * Conversion table for small slabs sizes / 8 to the index in the 3221 * Conversion table for small slabs sizes / 8 to the index in the
3292 * kmalloc array. This is necessary for slabs < 192 since we have non power 3222 * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -3372,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3372 struct page *page; 3302 struct page *page;
3373 void *ptr = NULL; 3303 void *ptr = NULL;
3374 3304
3375 flags |= __GFP_COMP | __GFP_NOTRACK; 3305 flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
3376 page = alloc_pages_node(node, flags, get_order(size)); 3306 page = alloc_pages_node(node, flags, get_order(size));
3377 if (page) 3307 if (page)
3378 ptr = page_address(page); 3308 ptr = page_address(page);
@@ -3424,7 +3354,7 @@ size_t ksize(const void *object)
3424 return PAGE_SIZE << compound_order(page); 3354 return PAGE_SIZE << compound_order(page);
3425 } 3355 }
3426 3356
3427 return slab_ksize(page->slab); 3357 return slab_ksize(page->slab_cache);
3428} 3358}
3429EXPORT_SYMBOL(ksize); 3359EXPORT_SYMBOL(ksize);
3430 3360
@@ -3449,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x)
3449 } 3379 }
3450 3380
3451 slab_lock(page); 3381 slab_lock(page);
3452 if (on_freelist(page->slab, page, object)) { 3382 if (on_freelist(page->slab_cache, page, object)) {
3453 object_err(page->slab, page, object, "Object is on free-list"); 3383 object_err(page->slab_cache, page, object, "Object is on free-list");
3454 rv = false; 3384 rv = false;
3455 } else { 3385 } else {
3456 rv = true; 3386 rv = true;
@@ -3478,10 +3408,10 @@ void kfree(const void *x)
3478 if (unlikely(!PageSlab(page))) { 3408 if (unlikely(!PageSlab(page))) {
3479 BUG_ON(!PageCompound(page)); 3409 BUG_ON(!PageCompound(page));
3480 kmemleak_free(x); 3410 kmemleak_free(x);
3481 __free_pages(page, compound_order(page)); 3411 __free_memcg_kmem_pages(page, compound_order(page));
3482 return; 3412 return;
3483 } 3413 }
3484 slab_free(page->slab, page, object, _RET_IP_); 3414 slab_free(page->slab_cache, page, object, _RET_IP_);
3485} 3415}
3486EXPORT_SYMBOL(kfree); 3416EXPORT_SYMBOL(kfree);
3487 3417
@@ -3573,7 +3503,7 @@ static void slab_mem_offline_callback(void *arg)
3573 struct memory_notify *marg = arg; 3503 struct memory_notify *marg = arg;
3574 int offline_node; 3504 int offline_node;
3575 3505
3576 offline_node = marg->status_change_nid; 3506 offline_node = marg->status_change_nid_normal;
3577 3507
3578 /* 3508 /*
3579 * If the node still has available memory. we need kmem_cache_node 3509 * If the node still has available memory. we need kmem_cache_node
@@ -3606,7 +3536,7 @@ static int slab_mem_going_online_callback(void *arg)
3606 struct kmem_cache_node *n; 3536 struct kmem_cache_node *n;
3607 struct kmem_cache *s; 3537 struct kmem_cache *s;
3608 struct memory_notify *marg = arg; 3538 struct memory_notify *marg = arg;
3609 int nid = marg->status_change_nid; 3539 int nid = marg->status_change_nid_normal;
3610 int ret = 0; 3540 int ret = 0;
3611 3541
3612 /* 3542 /*
@@ -3676,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self,
3676 3606
3677/* 3607/*
3678 * Used for early kmem_cache structures that were allocated using 3608 * Used for early kmem_cache structures that were allocated using
3679 * the page allocator 3609 * the page allocator. Allocate them properly then fix up the pointers
3610 * that may be pointing to the wrong kmem_cache structure.
3680 */ 3611 */
3681 3612
3682static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) 3613static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3683{ 3614{
3684 int node; 3615 int node;
3616 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3685 3617
3686 list_add(&s->list, &slab_caches); 3618 memcpy(s, static_cache, kmem_cache->object_size);
3687 s->refcount = -1;
3688 3619
3689 for_each_node_state(node, N_NORMAL_MEMORY) { 3620 for_each_node_state(node, N_NORMAL_MEMORY) {
3690 struct kmem_cache_node *n = get_node(s, node); 3621 struct kmem_cache_node *n = get_node(s, node);
@@ -3692,78 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
3692 3623
3693 if (n) { 3624 if (n) {
3694 list_for_each_entry(p, &n->partial, lru) 3625 list_for_each_entry(p, &n->partial, lru)
3695 p->slab = s; 3626 p->slab_cache = s;
3696 3627
3697#ifdef CONFIG_SLUB_DEBUG 3628#ifdef CONFIG_SLUB_DEBUG
3698 list_for_each_entry(p, &n->full, lru) 3629 list_for_each_entry(p, &n->full, lru)
3699 p->slab = s; 3630 p->slab_cache = s;
3700#endif 3631#endif
3701 } 3632 }
3702 } 3633 }
3634 list_add(&s->list, &slab_caches);
3635 return s;
3703} 3636}
3704 3637
3705void __init kmem_cache_init(void) 3638void __init kmem_cache_init(void)
3706{ 3639{
3640 static __initdata struct kmem_cache boot_kmem_cache,
3641 boot_kmem_cache_node;
3707 int i; 3642 int i;
3708 int caches = 0; 3643 int caches = 2;
3709 struct kmem_cache *temp_kmem_cache;
3710 int order;
3711 struct kmem_cache *temp_kmem_cache_node;
3712 unsigned long kmalloc_size;
3713 3644
3714 if (debug_guardpage_minorder()) 3645 if (debug_guardpage_minorder())
3715 slub_max_order = 0; 3646 slub_max_order = 0;
3716 3647
3717 kmem_size = offsetof(struct kmem_cache, node) + 3648 kmem_cache_node = &boot_kmem_cache_node;
3718 nr_node_ids * sizeof(struct kmem_cache_node *); 3649 kmem_cache = &boot_kmem_cache;
3719
3720 /* Allocate two kmem_caches from the page allocator */
3721 kmalloc_size = ALIGN(kmem_size, cache_line_size());
3722 order = get_order(2 * kmalloc_size);
3723 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order);
3724
3725 /*
3726 * Must first have the slab cache available for the allocations of the
3727 * struct kmem_cache_node's. There is special bootstrap code in
3728 * kmem_cache_open for slab_state == DOWN.
3729 */
3730 kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3731 3650
3732 kmem_cache_node->name = "kmem_cache_node"; 3651 create_boot_cache(kmem_cache_node, "kmem_cache_node",
3733 kmem_cache_node->size = kmem_cache_node->object_size = 3652 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
3734 sizeof(struct kmem_cache_node);
3735 kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3736 3653
3737 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3654 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3738 3655
3739 /* Able to allocate the per node structures */ 3656 /* Able to allocate the per node structures */
3740 slab_state = PARTIAL; 3657 slab_state = PARTIAL;
3741 3658
3742 temp_kmem_cache = kmem_cache; 3659 create_boot_cache(kmem_cache, "kmem_cache",
3743 kmem_cache->name = "kmem_cache"; 3660 offsetof(struct kmem_cache, node) +
3744 kmem_cache->size = kmem_cache->object_size = kmem_size; 3661 nr_node_ids * sizeof(struct kmem_cache_node *),
3745 kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); 3662 SLAB_HWCACHE_ALIGN);
3746 3663
3747 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); 3664 kmem_cache = bootstrap(&boot_kmem_cache);
3748 memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3749 3665
3750 /* 3666 /*
3751 * Allocate kmem_cache_node properly from the kmem_cache slab. 3667 * Allocate kmem_cache_node properly from the kmem_cache slab.
3752 * kmem_cache_node is separately allocated so no need to 3668 * kmem_cache_node is separately allocated so no need to
3753 * update any list pointers. 3669 * update any list pointers.
3754 */ 3670 */
3755 temp_kmem_cache_node = kmem_cache_node; 3671 kmem_cache_node = bootstrap(&boot_kmem_cache_node);
3756
3757 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3758 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
3759
3760 kmem_cache_bootstrap_fixup(kmem_cache_node);
3761
3762 caches++;
3763 kmem_cache_bootstrap_fixup(kmem_cache);
3764 caches++;
3765 /* Free temporary boot structure */
3766 free_pages((unsigned long)temp_kmem_cache, order);
3767 3672
3768 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3673 /* Now we can use the kmem_cache to allocate kmalloc slabs */
3769 3674
@@ -3891,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3891 return 0; 3796 return 0;
3892} 3797}
3893 3798
3894static struct kmem_cache *find_mergeable(size_t size, 3799static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
3895 size_t align, unsigned long flags, const char *name, 3800 size_t align, unsigned long flags, const char *name,
3896 void (*ctor)(void *)) 3801 void (*ctor)(void *))
3897{ 3802{
@@ -3927,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size,
3927 if (s->size - size >= sizeof(void *)) 3832 if (s->size - size >= sizeof(void *))
3928 continue; 3833 continue;
3929 3834
3835 if (!cache_match_memcg(s, memcg))
3836 continue;
3837
3930 return s; 3838 return s;
3931 } 3839 }
3932 return NULL; 3840 return NULL;
3933} 3841}
3934 3842
3935struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, 3843struct kmem_cache *
3936 size_t align, unsigned long flags, void (*ctor)(void *)) 3844__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
3845 size_t align, unsigned long flags, void (*ctor)(void *))
3937{ 3846{
3938 struct kmem_cache *s; 3847 struct kmem_cache *s;
3939 3848
3940 s = find_mergeable(size, align, flags, name, ctor); 3849 s = find_mergeable(memcg, size, align, flags, name, ctor);
3941 if (s) { 3850 if (s) {
3942 s->refcount++; 3851 s->refcount++;
3943 /* 3852 /*
@@ -3964,6 +3873,11 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
3964 if (err) 3873 if (err)
3965 return err; 3874 return err;
3966 3875
3876 /* Mutex is not taken during early boot */
3877 if (slab_state <= UP)
3878 return 0;
3879
3880 memcg_propagate_slab_attrs(s);
3967 mutex_unlock(&slab_mutex); 3881 mutex_unlock(&slab_mutex);
3968 err = sysfs_slab_add(s); 3882 err = sysfs_slab_add(s);
3969 mutex_lock(&slab_mutex); 3883 mutex_lock(&slab_mutex);
@@ -5197,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj,
5197 return -EIO; 5111 return -EIO;
5198 5112
5199 err = attribute->store(s, buf, len); 5113 err = attribute->store(s, buf, len);
5114#ifdef CONFIG_MEMCG_KMEM
5115 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
5116 int i;
5117
5118 mutex_lock(&slab_mutex);
5119 if (s->max_attr_size < len)
5120 s->max_attr_size = len;
5200 5121
5122 /*
5123 * This is a best effort propagation, so this function's return
5124 * value will be determined by the parent cache only. This is
5125 * basically because not all attributes will have a well
5126 * defined semantics for rollbacks - most of the actions will
5127 * have permanent effects.
5128 *
5129 * Returning the error value of any of the children that fail
5130 * is not 100 % defined, in the sense that users seeing the
5131 * error code won't be able to know anything about the state of
5132 * the cache.
5133 *
5134 * Only returning the error code for the parent cache at least
5135 * has well defined semantics. The cache being written to
5136 * directly either failed or succeeded, in which case we loop
5137 * through the descendants with best-effort propagation.
5138 */
5139 for_each_memcg_cache_index(i) {
5140 struct kmem_cache *c = cache_from_memcg(s, i);
5141 if (c)
5142 attribute->store(c, buf, len);
5143 }
5144 mutex_unlock(&slab_mutex);
5145 }
5146#endif
5201 return err; 5147 return err;
5202} 5148}
5203 5149
5150static void memcg_propagate_slab_attrs(struct kmem_cache *s)
5151{
5152#ifdef CONFIG_MEMCG_KMEM
5153 int i;
5154 char *buffer = NULL;
5155
5156 if (!is_root_cache(s))
5157 return;
5158
5159 /*
5160 * This mean this cache had no attribute written. Therefore, no point
5161 * in copying default values around
5162 */
5163 if (!s->max_attr_size)
5164 return;
5165
5166 for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
5167 char mbuf[64];
5168 char *buf;
5169 struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
5170
5171 if (!attr || !attr->store || !attr->show)
5172 continue;
5173
5174 /*
5175 * It is really bad that we have to allocate here, so we will
5176 * do it only as a fallback. If we actually allocate, though,
5177 * we can just use the allocated buffer until the end.
5178 *
5179 * Most of the slub attributes will tend to be very small in
5180 * size, but sysfs allows buffers up to a page, so they can
5181 * theoretically happen.
5182 */
5183 if (buffer)
5184 buf = buffer;
5185 else if (s->max_attr_size < ARRAY_SIZE(mbuf))
5186 buf = mbuf;
5187 else {
5188 buffer = (char *) get_zeroed_page(GFP_KERNEL);
5189 if (WARN_ON(!buffer))
5190 continue;
5191 buf = buffer;
5192 }
5193
5194 attr->show(s->memcg_params->root_cache, buf);
5195 attr->store(s, buf, strlen(buf));
5196 }
5197
5198 if (buffer)
5199 free_page((unsigned long)buffer);
5200#endif
5201}
5202
5204static const struct sysfs_ops slab_sysfs_ops = { 5203static const struct sysfs_ops slab_sysfs_ops = {
5205 .show = slab_attr_show, 5204 .show = slab_attr_show,
5206 .store = slab_attr_store, 5205 .store = slab_attr_store,
@@ -5257,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s)
5257 if (p != name + 1) 5256 if (p != name + 1)
5258 *p++ = '-'; 5257 *p++ = '-';
5259 p += sprintf(p, "%07d", s->size); 5258 p += sprintf(p, "%07d", s->size);
5259
5260#ifdef CONFIG_MEMCG_KMEM
5261 if (!is_root_cache(s))
5262 p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
5263#endif
5264
5260 BUG_ON(p > name + ID_STR_LENGTH - 1); 5265 BUG_ON(p > name + ID_STR_LENGTH - 1);
5261 return name; 5266 return name;
5262} 5267}
@@ -5265,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
5265{ 5270{
5266 int err; 5271 int err;
5267 const char *name; 5272 const char *name;
5268 int unmergeable; 5273 int unmergeable = slab_unmergeable(s);
5269
5270 if (slab_state < FULL)
5271 /* Defer until later */
5272 return 0;
5273 5274
5274 unmergeable = slab_unmergeable(s);
5275 if (unmergeable) { 5275 if (unmergeable) {
5276 /* 5276 /*
5277 * Slabcache can never be merged so we can use the name proper. 5277 * Slabcache can never be merged so we can use the name proper.
@@ -5405,49 +5405,14 @@ __initcall(slab_sysfs_init);
5405 * The /proc/slabinfo ABI 5405 * The /proc/slabinfo ABI
5406 */ 5406 */
5407#ifdef CONFIG_SLABINFO 5407#ifdef CONFIG_SLABINFO
5408static void print_slabinfo_header(struct seq_file *m) 5408void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5409{
5410 seq_puts(m, "slabinfo - version: 2.1\n");
5411 seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
5412 "<objperslab> <pagesperslab>");
5413 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
5414 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
5415 seq_putc(m, '\n');
5416}
5417
5418static void *s_start(struct seq_file *m, loff_t *pos)
5419{
5420 loff_t n = *pos;
5421
5422 mutex_lock(&slab_mutex);
5423 if (!n)
5424 print_slabinfo_header(m);
5425
5426 return seq_list_start(&slab_caches, *pos);
5427}
5428
5429static void *s_next(struct seq_file *m, void *p, loff_t *pos)
5430{
5431 return seq_list_next(p, &slab_caches, pos);
5432}
5433
5434static void s_stop(struct seq_file *m, void *p)
5435{
5436 mutex_unlock(&slab_mutex);
5437}
5438
5439static int s_show(struct seq_file *m, void *p)
5440{ 5409{
5441 unsigned long nr_partials = 0; 5410 unsigned long nr_partials = 0;
5442 unsigned long nr_slabs = 0; 5411 unsigned long nr_slabs = 0;
5443 unsigned long nr_inuse = 0;
5444 unsigned long nr_objs = 0; 5412 unsigned long nr_objs = 0;
5445 unsigned long nr_free = 0; 5413 unsigned long nr_free = 0;
5446 struct kmem_cache *s;
5447 int node; 5414 int node;
5448 5415
5449 s = list_entry(p, struct kmem_cache, list);
5450
5451 for_each_online_node(node) { 5416 for_each_online_node(node) {
5452 struct kmem_cache_node *n = get_node(s, node); 5417 struct kmem_cache_node *n = get_node(s, node);
5453 5418
@@ -5460,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p)
5460 nr_free += count_partial(n, count_free); 5425 nr_free += count_partial(n, count_free);
5461 } 5426 }
5462 5427
5463 nr_inuse = nr_objs - nr_free; 5428 sinfo->active_objs = nr_objs - nr_free;
5464 5429 sinfo->num_objs = nr_objs;
5465 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, 5430 sinfo->active_slabs = nr_slabs;
5466 nr_objs, s->size, oo_objects(s->oo), 5431 sinfo->num_slabs = nr_slabs;
5467 (1 << oo_order(s->oo))); 5432 sinfo->objects_per_slab = oo_objects(s->oo);
5468 seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); 5433 sinfo->cache_order = oo_order(s->oo);
5469 seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
5470 0UL);
5471 seq_putc(m, '\n');
5472 return 0;
5473} 5434}
5474 5435
5475static const struct seq_operations slabinfo_op = { 5436void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
5476 .start = s_start,
5477 .next = s_next,
5478 .stop = s_stop,
5479 .show = s_show,
5480};
5481
5482static int slabinfo_open(struct inode *inode, struct file *file)
5483{ 5437{
5484 return seq_open(file, &slabinfo_op);
5485} 5438}
5486 5439
5487static const struct file_operations proc_slabinfo_operations = { 5440ssize_t slabinfo_write(struct file *file, const char __user *buffer,
5488 .open = slabinfo_open, 5441 size_t count, loff_t *ppos)
5489 .read = seq_read,
5490 .llseek = seq_lseek,
5491 .release = seq_release,
5492};
5493
5494static int __init slab_proc_init(void)
5495{ 5442{
5496 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); 5443 return -EIO;
5497 return 0;
5498} 5444}
5499module_init(slab_proc_init);
5500#endif /* CONFIG_SLABINFO */ 5445#endif /* CONFIG_SLABINFO */
diff --git a/mm/sparse.c b/mm/sparse.c
index fac95f2888f2..6b5fb762e2ca 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -617,7 +617,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
617{ 617{
618 return; /* XXX: Not implemented yet */ 618 return; /* XXX: Not implemented yet */
619} 619}
620static void free_map_bootmem(struct page *page, unsigned long nr_pages) 620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
621{ 621{
622} 622}
623#else 623#else
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
638got_map_page: 638got_map_page:
639 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); 639 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
640got_map_ptr: 640got_map_ptr:
641 memset(ret, 0, memmap_size);
642 641
643 return ret; 642 return ret;
644} 643}
@@ -658,10 +657,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
658 get_order(sizeof(struct page) * nr_pages)); 657 get_order(sizeof(struct page) * nr_pages));
659} 658}
660 659
661static void free_map_bootmem(struct page *page, unsigned long nr_pages) 660static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
662{ 661{
663 unsigned long maps_section_nr, removing_section_nr, i; 662 unsigned long maps_section_nr, removing_section_nr, i;
664 unsigned long magic; 663 unsigned long magic;
664 struct page *page = virt_to_page(memmap);
665 665
666 for (i = 0; i < nr_pages; i++, page++) { 666 for (i = 0; i < nr_pages; i++, page++) {
667 magic = (unsigned long) page->lru.next; 667 magic = (unsigned long) page->lru.next;
@@ -710,13 +710,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
710 */ 710 */
711 711
712 if (memmap) { 712 if (memmap) {
713 struct page *memmap_page;
714 memmap_page = virt_to_page(memmap);
715
716 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) 713 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
717 >> PAGE_SHIFT; 714 >> PAGE_SHIFT;
718 715
719 free_map_bootmem(memmap_page, nr_pages); 716 free_map_bootmem(memmap, nr_pages);
720 } 717 }
721} 718}
722 719
@@ -760,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
760 goto out; 757 goto out;
761 } 758 }
762 759
760 memset(memmap, 0, sizeof(struct page) * nr_pages);
761
763 ms->section_mem_map |= SECTION_MARKED_PRESENT; 762 ms->section_mem_map |= SECTION_MARKED_PRESENT;
764 763
765 ret = sparse_init_one_section(ms, section_nr, memmap, usemap); 764 ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
@@ -773,6 +772,27 @@ out:
773 return ret; 772 return ret;
774} 773}
775 774
775#ifdef CONFIG_MEMORY_FAILURE
776static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
777{
778 int i;
779
780 if (!memmap)
781 return;
782
783 for (i = 0; i < PAGES_PER_SECTION; i++) {
784 if (PageHWPoison(&memmap[i])) {
785 atomic_long_sub(1, &mce_bad_pages);
786 ClearPageHWPoison(&memmap[i]);
787 }
788 }
789}
790#else
791static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
792{
793}
794#endif
795
776void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 796void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
777{ 797{
778 struct page *memmap = NULL; 798 struct page *memmap = NULL;
@@ -786,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
786 ms->pageblock_flags = NULL; 806 ms->pageblock_flags = NULL;
787 } 807 }
788 808
809 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
789 free_section_usemap(memmap, usemap); 810 free_section_usemap(memmap, usemap);
790} 811}
791#endif 812#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f91a25547ffe..e97a0e5aea91 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1443 return generic_swapfile_activate(sis, swap_file, span); 1443 return generic_swapfile_activate(sis, swap_file, span);
1444} 1444}
1445 1445
1446static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void _enable_swap_info(struct swap_info_struct *p, int prio,
1447 unsigned char *swap_map, 1447 unsigned char *swap_map,
1448 unsigned long *frontswap_map) 1448 unsigned long *frontswap_map)
1449{ 1449{
1450 int i, prev; 1450 int i, prev;
1451 1451
1452 spin_lock(&swap_lock);
1453 if (prio >= 0) 1452 if (prio >= 0)
1454 p->prio = prio; 1453 p->prio = prio;
1455 else 1454 else
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1472 swap_list.head = swap_list.next = p->type; 1471 swap_list.head = swap_list.next = p->type;
1473 else 1472 else
1474 swap_info[prev]->next = p->type; 1473 swap_info[prev]->next = p->type;
1474}
1475
1476static void enable_swap_info(struct swap_info_struct *p, int prio,
1477 unsigned char *swap_map,
1478 unsigned long *frontswap_map)
1479{
1480 spin_lock(&swap_lock);
1481 _enable_swap_info(p, prio, swap_map, frontswap_map);
1475 frontswap_init(p->type); 1482 frontswap_init(p->type);
1476 spin_unlock(&swap_lock); 1483 spin_unlock(&swap_lock);
1477} 1484}
1478 1485
1486static void reinsert_swap_info(struct swap_info_struct *p)
1487{
1488 spin_lock(&swap_lock);
1489 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1490 spin_unlock(&swap_lock);
1491}
1492
1479SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1493SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1480{ 1494{
1481 struct swap_info_struct *p = NULL; 1495 struct swap_info_struct *p = NULL;
@@ -1484,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1484 struct address_space *mapping; 1498 struct address_space *mapping;
1485 struct inode *inode; 1499 struct inode *inode;
1486 struct filename *pathname; 1500 struct filename *pathname;
1487 int oom_score_adj;
1488 int i, type, prev; 1501 int i, type, prev;
1489 int err; 1502 int err;
1490 1503
@@ -1543,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1543 p->flags &= ~SWP_WRITEOK; 1556 p->flags &= ~SWP_WRITEOK;
1544 spin_unlock(&swap_lock); 1557 spin_unlock(&swap_lock);
1545 1558
1546 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1559 set_current_oom_origin();
1547 err = try_to_unuse(type, false, 0); /* force all pages to be unused */ 1560 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1548 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1561 clear_current_oom_origin();
1549 1562
1550 if (err) { 1563 if (err) {
1551 /*
1552 * reading p->prio and p->swap_map outside the lock is
1553 * safe here because only sys_swapon and sys_swapoff
1554 * change them, and there can be no other sys_swapon or
1555 * sys_swapoff for this swap_info_struct at this point.
1556 */
1557 /* re-insert swap space back into swap_list */ 1564 /* re-insert swap space back into swap_list */
1558 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); 1565 reinsert_swap_info(p);
1559 goto out_dput; 1566 goto out_dput;
1560 } 1567 }
1561 1568
diff --git a/mm/truncate.c b/mm/truncate.c
index d51ce92d6e83..c75b736e54b7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -577,29 +577,6 @@ void truncate_setsize(struct inode *inode, loff_t newsize)
577EXPORT_SYMBOL(truncate_setsize); 577EXPORT_SYMBOL(truncate_setsize);
578 578
579/** 579/**
580 * vmtruncate - unmap mappings "freed" by truncate() syscall
581 * @inode: inode of the file used
582 * @newsize: file offset to start truncating
583 *
584 * This function is deprecated and truncate_setsize or truncate_pagecache
585 * should be used instead, together with filesystem specific block truncation.
586 */
587int vmtruncate(struct inode *inode, loff_t newsize)
588{
589 int error;
590
591 error = inode_newsize_ok(inode, newsize);
592 if (error)
593 return error;
594
595 truncate_setsize(inode, newsize);
596 if (inode->i_op->truncate)
597 inode->i_op->truncate(inode);
598 return 0;
599}
600EXPORT_SYMBOL(vmtruncate);
601
602/**
603 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched 580 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
604 * @inode: inode 581 * @inode: inode
605 * @lstart: offset of beginning of hole 582 * @lstart: offset of beginning of hole
diff --git a/mm/util.c b/mm/util.c
index dc3036cdcc6a..c55e26b17d93 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -152,7 +152,7 @@ EXPORT_SYMBOL(__krealloc);
152 * 152 *
153 * The contents of the object pointed to are preserved up to the 153 * The contents of the object pointed to are preserved up to the
154 * lesser of the new and old sizes. If @p is %NULL, krealloc() 154 * lesser of the new and old sizes. If @p is %NULL, krealloc()
155 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 155 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
156 * %NULL pointer, the object pointed to is freed. 156 * %NULL pointer, the object pointed to is freed.
157 */ 157 */
158void *krealloc(const void *p, size_t new_size, gfp_t flags) 158void *krealloc(const void *p, size_t new_size, gfp_t flags)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 78e08300db21..5123a169ab7b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p)
2550 2550
2551static void show_numa_info(struct seq_file *m, struct vm_struct *v) 2551static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2552{ 2552{
2553 if (NUMA_BUILD) { 2553 if (IS_ENABLED(CONFIG_NUMA)) {
2554 unsigned int nr, *counters = m->private; 2554 unsigned int nr, *counters = m->private;
2555 2555
2556 if (!counters) 2556 if (!counters)
@@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file)
2615 unsigned int *ptr = NULL; 2615 unsigned int *ptr = NULL;
2616 int ret; 2616 int ret;
2617 2617
2618 if (NUMA_BUILD) { 2618 if (IS_ENABLED(CONFIG_NUMA)) {
2619 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2619 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
2620 if (ptr == NULL) 2620 if (ptr == NULL)
2621 return -ENOMEM; 2621 return -ENOMEM;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 48550c66f1f2..196709f5ee58 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page)
1177} 1177}
1178 1178
1179/* 1179/*
1180 * Are there way too many processes in the direct reclaim path already? 1180 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1181 * then get resheduled. When there are massive number of tasks doing page
1182 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
1183 * the LRU list will go small and be scanned faster than necessary, leading to
1184 * unnecessary swapping, thrashing and OOM.
1181 */ 1185 */
1182static int too_many_isolated(struct zone *zone, int file, 1186static int too_many_isolated(struct zone *zone, int file,
1183 struct scan_control *sc) 1187 struct scan_control *sc)
@@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file,
1198 isolated = zone_page_state(zone, NR_ISOLATED_ANON); 1202 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1199 } 1203 }
1200 1204
1205 /*
1206 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1207 * won't get blocked by normal direct-reclaimers, forming a circular
1208 * deadlock.
1209 */
1210 if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
1211 inactive >>= 3;
1212
1201 return isolated > inactive; 1213 return isolated > inactive;
1202} 1214}
1203 1215
@@ -1679,13 +1691,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1679 1691
1680 if (global_reclaim(sc)) { 1692 if (global_reclaim(sc)) {
1681 free = zone_page_state(zone, NR_FREE_PAGES); 1693 free = zone_page_state(zone, NR_FREE_PAGES);
1682 /* If we have very few page cache pages,
1683 force-scan anon pages. */
1684 if (unlikely(file + free <= high_wmark_pages(zone))) { 1694 if (unlikely(file + free <= high_wmark_pages(zone))) {
1695 /*
1696 * If we have very few page cache pages, force-scan
1697 * anon pages.
1698 */
1685 fraction[0] = 1; 1699 fraction[0] = 1;
1686 fraction[1] = 0; 1700 fraction[1] = 0;
1687 denominator = 1; 1701 denominator = 1;
1688 goto out; 1702 goto out;
1703 } else if (!inactive_file_is_low_global(zone)) {
1704 /*
1705 * There is enough inactive page cache, do not
1706 * reclaim anything from the working set right now.
1707 */
1708 fraction[0] = 0;
1709 fraction[1] = 1;
1710 denominator = 1;
1711 goto out;
1689 } 1712 }
1690 } 1713 }
1691 1714
@@ -1752,7 +1775,7 @@ out:
1752/* Use reclaim/compaction for costly allocs or under memory pressure */ 1775/* Use reclaim/compaction for costly allocs or under memory pressure */
1753static bool in_reclaim_compaction(struct scan_control *sc) 1776static bool in_reclaim_compaction(struct scan_control *sc)
1754{ 1777{
1755 if (COMPACTION_BUILD && sc->order && 1778 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
1756 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 1779 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1757 sc->priority < DEF_PRIORITY - 2)) 1780 sc->priority < DEF_PRIORITY - 2))
1758 return true; 1781 return true;
@@ -2005,7 +2028,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2005 if (zone->all_unreclaimable && 2028 if (zone->all_unreclaimable &&
2006 sc->priority != DEF_PRIORITY) 2029 sc->priority != DEF_PRIORITY)
2007 continue; /* Let kswapd poll it */ 2030 continue; /* Let kswapd poll it */
2008 if (COMPACTION_BUILD) { 2031 if (IS_ENABLED(CONFIG_COMPACTION)) {
2009 /* 2032 /*
2010 * If we already have plenty of memory free for 2033 * If we already have plenty of memory free for
2011 * compaction in this zone, don't free any more. 2034 * compaction in this zone, don't free any more.
@@ -2207,9 +2230,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2207 * Throttle direct reclaimers if backing storage is backed by the network 2230 * Throttle direct reclaimers if backing storage is backed by the network
2208 * and the PFMEMALLOC reserve for the preferred node is getting dangerously 2231 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2209 * depleted. kswapd will continue to make progress and wake the processes 2232 * depleted. kswapd will continue to make progress and wake the processes
2210 * when the low watermark is reached 2233 * when the low watermark is reached.
2234 *
2235 * Returns true if a fatal signal was delivered during throttling. If this
2236 * happens, the page allocator should not consider triggering the OOM killer.
2211 */ 2237 */
2212static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, 2238static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2213 nodemask_t *nodemask) 2239 nodemask_t *nodemask)
2214{ 2240{
2215 struct zone *zone; 2241 struct zone *zone;
@@ -2224,13 +2250,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2224 * processes to block on log_wait_commit(). 2250 * processes to block on log_wait_commit().
2225 */ 2251 */
2226 if (current->flags & PF_KTHREAD) 2252 if (current->flags & PF_KTHREAD)
2227 return; 2253 goto out;
2254
2255 /*
2256 * If a fatal signal is pending, this process should not throttle.
2257 * It should return quickly so it can exit and free its memory
2258 */
2259 if (fatal_signal_pending(current))
2260 goto out;
2228 2261
2229 /* Check if the pfmemalloc reserves are ok */ 2262 /* Check if the pfmemalloc reserves are ok */
2230 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); 2263 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2231 pgdat = zone->zone_pgdat; 2264 pgdat = zone->zone_pgdat;
2232 if (pfmemalloc_watermark_ok(pgdat)) 2265 if (pfmemalloc_watermark_ok(pgdat))
2233 return; 2266 goto out;
2234 2267
2235 /* Account for the throttling */ 2268 /* Account for the throttling */
2236 count_vm_event(PGSCAN_DIRECT_THROTTLE); 2269 count_vm_event(PGSCAN_DIRECT_THROTTLE);
@@ -2246,12 +2279,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2246 if (!(gfp_mask & __GFP_FS)) { 2279 if (!(gfp_mask & __GFP_FS)) {
2247 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, 2280 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2248 pfmemalloc_watermark_ok(pgdat), HZ); 2281 pfmemalloc_watermark_ok(pgdat), HZ);
2249 return; 2282
2283 goto check_pending;
2250 } 2284 }
2251 2285
2252 /* Throttle until kswapd wakes the process */ 2286 /* Throttle until kswapd wakes the process */
2253 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, 2287 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2254 pfmemalloc_watermark_ok(pgdat)); 2288 pfmemalloc_watermark_ok(pgdat));
2289
2290check_pending:
2291 if (fatal_signal_pending(current))
2292 return true;
2293
2294out:
2295 return false;
2255} 2296}
2256 2297
2257unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2298unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
@@ -2273,13 +2314,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2273 .gfp_mask = sc.gfp_mask, 2314 .gfp_mask = sc.gfp_mask,
2274 }; 2315 };
2275 2316
2276 throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2277
2278 /* 2317 /*
2279 * Do not enter reclaim if fatal signal is pending. 1 is returned so 2318 * Do not enter reclaim if fatal signal was delivered while throttled.
2280 * that the page allocator does not consider triggering OOM 2319 * 1 is returned so that the page allocator does not OOM kill at this
2320 * point.
2281 */ 2321 */
2282 if (fatal_signal_pending(current)) 2322 if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
2283 return 1; 2323 return 1;
2284 2324
2285 trace_mm_vmscan_direct_reclaim_begin(order, 2325 trace_mm_vmscan_direct_reclaim_begin(order,
@@ -2397,13 +2437,31 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
2397 } while (memcg); 2437 } while (memcg);
2398} 2438}
2399 2439
2440static bool zone_balanced(struct zone *zone, int order,
2441 unsigned long balance_gap, int classzone_idx)
2442{
2443 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
2444 balance_gap, classzone_idx, 0))
2445 return false;
2446
2447 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2448 !compaction_suitable(zone, order))
2449 return false;
2450
2451 return true;
2452}
2453
2400/* 2454/*
2401 * pgdat_balanced is used when checking if a node is balanced for high-order 2455 * pgdat_balanced() is used when checking if a node is balanced.
2402 * allocations. Only zones that meet watermarks and are in a zone allowed 2456 *
2403 * by the callers classzone_idx are added to balanced_pages. The total of 2457 * For order-0, all zones must be balanced!
2404 * balanced pages must be at least 25% of the zones allowed by classzone_idx 2458 *
2405 * for the node to be considered balanced. Forcing all zones to be balanced 2459 * For high-order allocations only zones that meet watermarks and are in a
2406 * for high orders can cause excessive reclaim when there are imbalanced zones. 2460 * zone allowed by the callers classzone_idx are added to balanced_pages. The
2461 * total of balanced pages must be at least 25% of the zones allowed by
2462 * classzone_idx for the node to be considered balanced. Forcing all zones to
2463 * be balanced for high orders can cause excessive reclaim when there are
2464 * imbalanced zones.
2407 * The choice of 25% is due to 2465 * The choice of 25% is due to
2408 * o a 16M DMA zone that is balanced will not balance a zone on any 2466 * o a 16M DMA zone that is balanced will not balance a zone on any
2409 * reasonable sized machine 2467 * reasonable sized machine
@@ -2413,17 +2471,43 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
2413 * Similarly, on x86-64 the Normal zone would need to be at least 1G 2471 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2414 * to balance a node on its own. These seemed like reasonable ratios. 2472 * to balance a node on its own. These seemed like reasonable ratios.
2415 */ 2473 */
2416static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, 2474static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2417 int classzone_idx)
2418{ 2475{
2419 unsigned long present_pages = 0; 2476 unsigned long present_pages = 0;
2477 unsigned long balanced_pages = 0;
2420 int i; 2478 int i;
2421 2479
2422 for (i = 0; i <= classzone_idx; i++) 2480 /* Check the watermark levels */
2423 present_pages += pgdat->node_zones[i].present_pages; 2481 for (i = 0; i <= classzone_idx; i++) {
2482 struct zone *zone = pgdat->node_zones + i;
2483
2484 if (!populated_zone(zone))
2485 continue;
2486
2487 present_pages += zone->present_pages;
2424 2488
2425 /* A special case here: if zone has no page, we think it's balanced */ 2489 /*
2426 return balanced_pages >= (present_pages >> 2); 2490 * A special case here:
2491 *
2492 * balance_pgdat() skips over all_unreclaimable after
2493 * DEF_PRIORITY. Effectively, it considers them balanced so
2494 * they must be considered balanced here as well!
2495 */
2496 if (zone->all_unreclaimable) {
2497 balanced_pages += zone->present_pages;
2498 continue;
2499 }
2500
2501 if (zone_balanced(zone, order, 0, i))
2502 balanced_pages += zone->present_pages;
2503 else if (!order)
2504 return false;
2505 }
2506
2507 if (order)
2508 return balanced_pages >= (present_pages >> 2);
2509 else
2510 return true;
2427} 2511}
2428 2512
2429/* 2513/*
@@ -2435,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2435static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, 2519static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2436 int classzone_idx) 2520 int classzone_idx)
2437{ 2521{
2438 int i;
2439 unsigned long balanced = 0;
2440 bool all_zones_ok = true;
2441
2442 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2522 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2443 if (remaining) 2523 if (remaining)
2444 return false; 2524 return false;
@@ -2457,40 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2457 return false; 2537 return false;
2458 } 2538 }
2459 2539
2460 /* Check the watermark levels */ 2540 return pgdat_balanced(pgdat, order, classzone_idx);
2461 for (i = 0; i <= classzone_idx; i++) {
2462 struct zone *zone = pgdat->node_zones + i;
2463
2464 if (!populated_zone(zone))
2465 continue;
2466
2467 /*
2468 * balance_pgdat() skips over all_unreclaimable after
2469 * DEF_PRIORITY. Effectively, it considers them balanced so
2470 * they must be considered balanced here as well if kswapd
2471 * is to sleep
2472 */
2473 if (zone->all_unreclaimable) {
2474 balanced += zone->present_pages;
2475 continue;
2476 }
2477
2478 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2479 i, 0))
2480 all_zones_ok = false;
2481 else
2482 balanced += zone->present_pages;
2483 }
2484
2485 /*
2486 * For high-order requests, the balanced zones must contain at least
2487 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
2488 * must be balanced
2489 */
2490 if (order)
2491 return pgdat_balanced(pgdat, balanced, classzone_idx);
2492 else
2493 return all_zones_ok;
2494} 2541}
2495 2542
2496/* 2543/*
@@ -2517,8 +2564,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2517static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2564static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2518 int *classzone_idx) 2565 int *classzone_idx)
2519{ 2566{
2520 int all_zones_ok; 2567 struct zone *unbalanced_zone;
2521 unsigned long balanced;
2522 int i; 2568 int i;
2523 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2569 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2524 unsigned long total_scanned; 2570 unsigned long total_scanned;
@@ -2551,8 +2597,7 @@ loop_again:
2551 unsigned long lru_pages = 0; 2597 unsigned long lru_pages = 0;
2552 int has_under_min_watermark_zone = 0; 2598 int has_under_min_watermark_zone = 0;
2553 2599
2554 all_zones_ok = 1; 2600 unbalanced_zone = NULL;
2555 balanced = 0;
2556 2601
2557 /* 2602 /*
2558 * Scan in the highmem->dma direction for the highest 2603 * Scan in the highmem->dma direction for the highest
@@ -2585,8 +2630,7 @@ loop_again:
2585 break; 2630 break;
2586 } 2631 }
2587 2632
2588 if (!zone_watermark_ok_safe(zone, order, 2633 if (!zone_balanced(zone, order, 0, 0)) {
2589 high_wmark_pages(zone), 0, 0)) {
2590 end_zone = i; 2634 end_zone = i;
2591 break; 2635 break;
2592 } else { 2636 } else {
@@ -2656,15 +2700,14 @@ loop_again:
2656 * Do not reclaim more than needed for compaction. 2700 * Do not reclaim more than needed for compaction.
2657 */ 2701 */
2658 testorder = order; 2702 testorder = order;
2659 if (COMPACTION_BUILD && order && 2703 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2660 compaction_suitable(zone, order) != 2704 compaction_suitable(zone, order) !=
2661 COMPACT_SKIPPED) 2705 COMPACT_SKIPPED)
2662 testorder = 0; 2706 testorder = 0;
2663 2707
2664 if ((buffer_heads_over_limit && is_highmem_idx(i)) || 2708 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2665 !zone_watermark_ok_safe(zone, testorder, 2709 !zone_balanced(zone, testorder,
2666 high_wmark_pages(zone) + balance_gap, 2710 balance_gap, end_zone)) {
2667 end_zone, 0)) {
2668 shrink_zone(zone, &sc); 2711 shrink_zone(zone, &sc);
2669 2712
2670 reclaim_state->reclaimed_slab = 0; 2713 reclaim_state->reclaimed_slab = 0;
@@ -2691,9 +2734,8 @@ loop_again:
2691 continue; 2734 continue;
2692 } 2735 }
2693 2736
2694 if (!zone_watermark_ok_safe(zone, testorder, 2737 if (!zone_balanced(zone, testorder, 0, end_zone)) {
2695 high_wmark_pages(zone), end_zone, 0)) { 2738 unbalanced_zone = zone;
2696 all_zones_ok = 0;
2697 /* 2739 /*
2698 * We are still under min water mark. This 2740 * We are still under min water mark. This
2699 * means that we have a GFP_ATOMIC allocation 2741 * means that we have a GFP_ATOMIC allocation
@@ -2711,8 +2753,6 @@ loop_again:
2711 * speculatively avoid congestion waits 2753 * speculatively avoid congestion waits
2712 */ 2754 */
2713 zone_clear_flag(zone, ZONE_CONGESTED); 2755 zone_clear_flag(zone, ZONE_CONGESTED);
2714 if (i <= *classzone_idx)
2715 balanced += zone->present_pages;
2716 } 2756 }
2717 2757
2718 } 2758 }
@@ -2726,7 +2766,7 @@ loop_again:
2726 pfmemalloc_watermark_ok(pgdat)) 2766 pfmemalloc_watermark_ok(pgdat))
2727 wake_up(&pgdat->pfmemalloc_wait); 2767 wake_up(&pgdat->pfmemalloc_wait);
2728 2768
2729 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2769 if (pgdat_balanced(pgdat, order, *classzone_idx))
2730 break; /* kswapd: all done */ 2770 break; /* kswapd: all done */
2731 /* 2771 /*
2732 * OK, kswapd is getting into trouble. Take a nap, then take 2772 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2735,8 +2775,8 @@ loop_again:
2735 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { 2775 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
2736 if (has_under_min_watermark_zone) 2776 if (has_under_min_watermark_zone)
2737 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); 2777 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2738 else 2778 else if (unbalanced_zone)
2739 congestion_wait(BLK_RW_ASYNC, HZ/10); 2779 wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
2740 } 2780 }
2741 2781
2742 /* 2782 /*
@@ -2750,12 +2790,7 @@ loop_again:
2750 } while (--sc.priority >= 0); 2790 } while (--sc.priority >= 0);
2751out: 2791out:
2752 2792
2753 /* 2793 if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
2754 * order-0: All zones must meet high watermark for a balanced node
2755 * high-order: Balanced zones must make up at least 25% of the node
2756 * for the node to be balanced
2757 */
2758 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2759 cond_resched(); 2794 cond_resched();
2760 2795
2761 try_to_freeze(); 2796 try_to_freeze();
@@ -2797,29 +2832,10 @@ out:
2797 if (!populated_zone(zone)) 2832 if (!populated_zone(zone))
2798 continue; 2833 continue;
2799 2834
2800 if (zone->all_unreclaimable &&
2801 sc.priority != DEF_PRIORITY)
2802 continue;
2803
2804 /* Would compaction fail due to lack of free memory? */
2805 if (COMPACTION_BUILD &&
2806 compaction_suitable(zone, order) == COMPACT_SKIPPED)
2807 goto loop_again;
2808
2809 /* Confirm the zone is balanced for order-0 */
2810 if (!zone_watermark_ok(zone, 0,
2811 high_wmark_pages(zone), 0, 0)) {
2812 order = sc.order = 0;
2813 goto loop_again;
2814 }
2815
2816 /* Check if the memory needs to be defragmented. */ 2835 /* Check if the memory needs to be defragmented. */
2817 if (zone_watermark_ok(zone, order, 2836 if (zone_watermark_ok(zone, order,
2818 low_wmark_pages(zone), *classzone_idx, 0)) 2837 low_wmark_pages(zone), *classzone_idx, 0))
2819 zones_need_compaction = 0; 2838 zones_need_compaction = 0;
2820
2821 /* If balanced, clear the congested flag */
2822 zone_clear_flag(zone, ZONE_CONGESTED);
2823 } 2839 }
2824 2840
2825 if (zones_need_compaction) 2841 if (zones_need_compaction)
@@ -2944,7 +2960,7 @@ static int kswapd(void *p)
2944 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2960 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2945 balanced_classzone_idx = classzone_idx; 2961 balanced_classzone_idx = classzone_idx;
2946 for ( ; ; ) { 2962 for ( ; ; ) {
2947 int ret; 2963 bool ret;
2948 2964
2949 /* 2965 /*
2950 * If the last balance_pgdat was unsuccessful it's unlikely a 2966 * If the last balance_pgdat was unsuccessful it's unlikely a
@@ -3106,13 +3122,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3106 not required for correctness. So if the last cpu in a node goes 3122 not required for correctness. So if the last cpu in a node goes
3107 away, we get changed to run anywhere: as the first one comes back, 3123 away, we get changed to run anywhere: as the first one comes back,
3108 restore their cpu bindings. */ 3124 restore their cpu bindings. */
3109static int __devinit cpu_callback(struct notifier_block *nfb, 3125static int cpu_callback(struct notifier_block *nfb, unsigned long action,
3110 unsigned long action, void *hcpu) 3126 void *hcpu)
3111{ 3127{
3112 int nid; 3128 int nid;
3113 3129
3114 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 3130 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3115 for_each_node_state(nid, N_HIGH_MEMORY) { 3131 for_each_node_state(nid, N_MEMORY) {
3116 pg_data_t *pgdat = NODE_DATA(nid); 3132 pg_data_t *pgdat = NODE_DATA(nid);
3117 const struct cpumask *mask; 3133 const struct cpumask *mask;
3118 3134
@@ -3168,7 +3184,7 @@ static int __init kswapd_init(void)
3168 int nid; 3184 int nid;
3169 3185
3170 swap_setup(); 3186 swap_setup();
3171 for_each_node_state(nid, N_HIGH_MEMORY) 3187 for_each_node_state(nid, N_MEMORY)
3172 kswapd_run(nid); 3188 kswapd_run(nid);
3173 hotcpu_notifier(cpu_callback, 0); 3189 hotcpu_notifier(cpu_callback, 0);
3174 return 0; 3190 return 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7370579111b..9800306c8195 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = {
774 774
775 "pgrotated", 775 "pgrotated",
776 776
777#ifdef CONFIG_NUMA_BALANCING
778 "numa_pte_updates",
779 "numa_hint_faults",
780 "numa_hint_faults_local",
781 "numa_pages_migrated",
782#endif
783#ifdef CONFIG_MIGRATION
784 "pgmigrate_success",
785 "pgmigrate_fail",
786#endif
777#ifdef CONFIG_COMPACTION 787#ifdef CONFIG_COMPACTION
778 "compact_blocks_moved", 788 "compact_migrate_scanned",
779 "compact_pages_moved", 789 "compact_free_scanned",
780 "compact_pagemigrate_failed", 790 "compact_isolated",
781 "compact_stall", 791 "compact_stall",
782 "compact_fail", 792 "compact_fail",
783 "compact_success", 793 "compact_success",
@@ -801,6 +811,8 @@ const char * const vmstat_text[] = {
801 "thp_collapse_alloc", 811 "thp_collapse_alloc",
802 "thp_collapse_alloc_failed", 812 "thp_collapse_alloc_failed",
803 "thp_split", 813 "thp_split",
814 "thp_zero_page_alloc",
815 "thp_zero_page_alloc_failed",
804#endif 816#endif
805 817
806#endif /* CONFIG_VM_EVENTS_COUNTERS */ 818#endif /* CONFIG_VM_EVENTS_COUNTERS */
@@ -930,7 +942,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
930 pg_data_t *pgdat = (pg_data_t *)arg; 942 pg_data_t *pgdat = (pg_data_t *)arg;
931 943
932 /* check memoryless node */ 944 /* check memoryless node */
933 if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) 945 if (!node_state(pgdat->node_id, N_MEMORY))
934 return 0; 946 return 0;
935 947
936 seq_printf(m, "Page block order: %d\n", pageblock_order); 948 seq_printf(m, "Page block order: %d\n", pageblock_order);
@@ -992,14 +1004,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
992 "\n high %lu" 1004 "\n high %lu"
993 "\n scanned %lu" 1005 "\n scanned %lu"
994 "\n spanned %lu" 1006 "\n spanned %lu"
995 "\n present %lu", 1007 "\n present %lu"
1008 "\n managed %lu",
996 zone_page_state(zone, NR_FREE_PAGES), 1009 zone_page_state(zone, NR_FREE_PAGES),
997 min_wmark_pages(zone), 1010 min_wmark_pages(zone),
998 low_wmark_pages(zone), 1011 low_wmark_pages(zone),
999 high_wmark_pages(zone), 1012 high_wmark_pages(zone),
1000 zone->pages_scanned, 1013 zone->pages_scanned,
1001 zone->spanned_pages, 1014 zone->spanned_pages,
1002 zone->present_pages); 1015 zone->present_pages,
1016 zone->managed_pages);
1003 1017
1004 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1018 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1005 seq_printf(m, "\n %-12s %lu", vmstat_text[i], 1019 seq_printf(m, "\n %-12s %lu", vmstat_text[i],
@@ -1292,7 +1306,7 @@ static int unusable_show(struct seq_file *m, void *arg)
1292 pg_data_t *pgdat = (pg_data_t *)arg; 1306 pg_data_t *pgdat = (pg_data_t *)arg;
1293 1307
1294 /* check memoryless node */ 1308 /* check memoryless node */
1295 if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) 1309 if (!node_state(pgdat->node_id, N_MEMORY))
1296 return 0; 1310 return 0;
1297 1311
1298 walk_zones_in_node(m, pgdat, unusable_show_print); 1312 walk_zones_in_node(m, pgdat, unusable_show_print);